diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,111337 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 15901, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 12.560257448818305, + "learning_rate": 2.092050209205021e-08, + "loss": 1.3878, + "step": 1 + }, + { + "epoch": 0.0, + "grad_norm": 12.05368035791786, + "learning_rate": 4.184100418410042e-08, + "loss": 1.4129, + "step": 2 + }, + { + "epoch": 0.0, + "grad_norm": 12.118477403459769, + "learning_rate": 6.276150627615063e-08, + "loss": 1.4392, + "step": 3 + }, + { + "epoch": 0.0, + "grad_norm": 12.818196701992624, + "learning_rate": 8.368200836820084e-08, + "loss": 1.4417, + "step": 4 + }, + { + "epoch": 0.0, + "grad_norm": 12.52241286190301, + "learning_rate": 1.0460251046025104e-07, + "loss": 1.4274, + "step": 5 + }, + { + "epoch": 0.0, + "grad_norm": 12.166198039046842, + "learning_rate": 1.2552301255230126e-07, + "loss": 1.4287, + "step": 6 + }, + { + "epoch": 0.0, + "grad_norm": 12.152000828367362, + "learning_rate": 1.4644351464435148e-07, + "loss": 1.3883, + "step": 7 + }, + { + "epoch": 0.0, + "grad_norm": 12.282759445642426, + "learning_rate": 1.6736401673640168e-07, + "loss": 1.5214, + "step": 8 + }, + { + "epoch": 0.0, + "grad_norm": 12.187048651757747, + "learning_rate": 1.882845188284519e-07, + "loss": 1.3853, + "step": 9 + }, + { + "epoch": 0.0, + "grad_norm": 11.755813758937292, + "learning_rate": 2.092050209205021e-07, + "loss": 1.3802, + "step": 10 + }, + { + "epoch": 0.0, + "grad_norm": 12.471968662690486, + "learning_rate": 2.3012552301255234e-07, + "loss": 1.3958, + "step": 11 + }, + { + "epoch": 0.0, + "grad_norm": 12.023481948686502, + "learning_rate": 2.5104602510460253e-07, + "loss": 1.3625, + "step": 12 + }, + { + "epoch": 0.0, + "grad_norm": 11.407879675934606, + "learning_rate": 2.7196652719665275e-07, + "loss": 1.3987, + "step": 13 + }, + { + "epoch": 0.0, + "grad_norm": 11.002417532604325, + "learning_rate": 2.9288702928870297e-07, + "loss": 1.3685, + "step": 14 + }, + { + "epoch": 0.0, + "grad_norm": 12.046397935114733, + "learning_rate": 3.1380753138075313e-07, + "loss": 1.3761, + "step": 15 + }, + { + "epoch": 0.0, + "grad_norm": 9.74413617531794, + "learning_rate": 3.3472803347280335e-07, + "loss": 1.3515, + "step": 16 + }, + { + "epoch": 0.0, + "grad_norm": 9.668487707646594, + "learning_rate": 3.5564853556485363e-07, + "loss": 1.3325, + "step": 17 + }, + { + "epoch": 0.0, + "grad_norm": 9.55703442979226, + "learning_rate": 3.765690376569038e-07, + "loss": 1.3312, + "step": 18 + }, + { + "epoch": 0.0, + "grad_norm": 8.86102637540386, + "learning_rate": 3.97489539748954e-07, + "loss": 1.3139, + "step": 19 + }, + { + "epoch": 0.0, + "grad_norm": 8.885150017781749, + "learning_rate": 4.184100418410042e-07, + "loss": 1.313, + "step": 20 + }, + { + "epoch": 0.0, + "grad_norm": 6.508917024905127, + "learning_rate": 4.393305439330544e-07, + "loss": 1.182, + "step": 21 + }, + { + "epoch": 0.0, + "grad_norm": 12.155384192157996, + "learning_rate": 4.6025104602510467e-07, + "loss": 1.1837, + "step": 22 + }, + { + "epoch": 0.0, + "grad_norm": 6.159131473054676, + "learning_rate": 4.811715481171549e-07, + "loss": 1.2192, + "step": 23 + }, + { + "epoch": 0.0, + "grad_norm": 5.591628396174414, + "learning_rate": 5.020920502092051e-07, + "loss": 1.2123, + "step": 24 + }, + { + "epoch": 0.0, + "grad_norm": 5.2459320704076555, + "learning_rate": 5.230125523012552e-07, + "loss": 1.1723, + "step": 25 + }, + { + "epoch": 0.0, + "grad_norm": 4.686533379285876, + "learning_rate": 5.439330543933055e-07, + "loss": 1.2776, + "step": 26 + }, + { + "epoch": 0.0, + "grad_norm": 4.744258441539658, + "learning_rate": 5.648535564853557e-07, + "loss": 1.1493, + "step": 27 + }, + { + "epoch": 0.0, + "grad_norm": 4.774128304322188, + "learning_rate": 5.857740585774059e-07, + "loss": 1.1565, + "step": 28 + }, + { + "epoch": 0.0, + "grad_norm": 3.7154225724871135, + "learning_rate": 6.066945606694561e-07, + "loss": 1.0396, + "step": 29 + }, + { + "epoch": 0.0, + "grad_norm": 3.5116914798097727, + "learning_rate": 6.276150627615063e-07, + "loss": 1.0286, + "step": 30 + }, + { + "epoch": 0.0, + "grad_norm": 3.3769413257117877, + "learning_rate": 6.485355648535565e-07, + "loss": 1.0221, + "step": 31 + }, + { + "epoch": 0.0, + "grad_norm": 5.715927901242416, + "learning_rate": 6.694560669456067e-07, + "loss": 0.7152, + "step": 32 + }, + { + "epoch": 0.0, + "grad_norm": 3.0664296690038766, + "learning_rate": 6.90376569037657e-07, + "loss": 1.0258, + "step": 33 + }, + { + "epoch": 0.0, + "grad_norm": 3.8471000703188283, + "learning_rate": 7.112970711297073e-07, + "loss": 1.0315, + "step": 34 + }, + { + "epoch": 0.0, + "grad_norm": 3.04394481220455, + "learning_rate": 7.322175732217573e-07, + "loss": 1.0299, + "step": 35 + }, + { + "epoch": 0.0, + "grad_norm": 2.9290763986647264, + "learning_rate": 7.531380753138076e-07, + "loss": 1.0159, + "step": 36 + }, + { + "epoch": 0.0, + "grad_norm": 2.560701757054639, + "learning_rate": 7.740585774058578e-07, + "loss": 0.9823, + "step": 37 + }, + { + "epoch": 0.0, + "grad_norm": 2.4094589147970704, + "learning_rate": 7.94979079497908e-07, + "loss": 0.9859, + "step": 38 + }, + { + "epoch": 0.0, + "grad_norm": 2.304616304731331, + "learning_rate": 8.158995815899583e-07, + "loss": 0.9283, + "step": 39 + }, + { + "epoch": 0.0, + "grad_norm": 2.336308342883063, + "learning_rate": 8.368200836820084e-07, + "loss": 0.9183, + "step": 40 + }, + { + "epoch": 0.0, + "grad_norm": 2.6741125301434066, + "learning_rate": 8.577405857740586e-07, + "loss": 0.9356, + "step": 41 + }, + { + "epoch": 0.0, + "grad_norm": 2.5818907495124943, + "learning_rate": 8.786610878661088e-07, + "loss": 0.909, + "step": 42 + }, + { + "epoch": 0.0, + "grad_norm": 2.5428484367199093, + "learning_rate": 8.995815899581591e-07, + "loss": 0.9047, + "step": 43 + }, + { + "epoch": 0.0, + "grad_norm": 2.563139490584407, + "learning_rate": 9.205020920502093e-07, + "loss": 0.9273, + "step": 44 + }, + { + "epoch": 0.0, + "grad_norm": 2.559972040338691, + "learning_rate": 9.414225941422594e-07, + "loss": 0.9096, + "step": 45 + }, + { + "epoch": 0.0, + "grad_norm": 2.176763259124502, + "learning_rate": 9.623430962343098e-07, + "loss": 0.8882, + "step": 46 + }, + { + "epoch": 0.0, + "grad_norm": 2.147504876876301, + "learning_rate": 9.8326359832636e-07, + "loss": 0.8603, + "step": 47 + }, + { + "epoch": 0.0, + "grad_norm": 1.9553524513603393, + "learning_rate": 1.0041841004184101e-06, + "loss": 0.8538, + "step": 48 + }, + { + "epoch": 0.0, + "grad_norm": 1.7258382720756227, + "learning_rate": 1.0251046025104603e-06, + "loss": 0.8622, + "step": 49 + }, + { + "epoch": 0.0, + "grad_norm": 2.227266937437163, + "learning_rate": 1.0460251046025104e-06, + "loss": 0.9289, + "step": 50 + }, + { + "epoch": 0.0, + "grad_norm": 1.6795622373939423, + "learning_rate": 1.0669456066945608e-06, + "loss": 0.8651, + "step": 51 + }, + { + "epoch": 0.0, + "grad_norm": 3.254037734651701, + "learning_rate": 1.087866108786611e-06, + "loss": 0.8048, + "step": 52 + }, + { + "epoch": 0.0, + "grad_norm": 1.7583196392527818, + "learning_rate": 1.1087866108786612e-06, + "loss": 0.8187, + "step": 53 + }, + { + "epoch": 0.0, + "grad_norm": 1.6957034862495282, + "learning_rate": 1.1297071129707113e-06, + "loss": 0.8168, + "step": 54 + }, + { + "epoch": 0.0, + "grad_norm": 1.6473460490856577, + "learning_rate": 1.1506276150627615e-06, + "loss": 0.7906, + "step": 55 + }, + { + "epoch": 0.0, + "grad_norm": 1.614593057786042, + "learning_rate": 1.1715481171548119e-06, + "loss": 0.8337, + "step": 56 + }, + { + "epoch": 0.0, + "grad_norm": 1.572763056170728, + "learning_rate": 1.192468619246862e-06, + "loss": 0.8324, + "step": 57 + }, + { + "epoch": 0.0, + "grad_norm": 2.0357680203943334, + "learning_rate": 1.2133891213389122e-06, + "loss": 0.8232, + "step": 58 + }, + { + "epoch": 0.0, + "grad_norm": 1.814797864213227, + "learning_rate": 1.2343096234309624e-06, + "loss": 0.8144, + "step": 59 + }, + { + "epoch": 0.0, + "grad_norm": 1.6438167910451045, + "learning_rate": 1.2552301255230125e-06, + "loss": 0.7745, + "step": 60 + }, + { + "epoch": 0.0, + "grad_norm": 1.8748382016688478, + "learning_rate": 1.276150627615063e-06, + "loss": 0.8187, + "step": 61 + }, + { + "epoch": 0.0, + "grad_norm": 1.783334835407333, + "learning_rate": 1.297071129707113e-06, + "loss": 0.7347, + "step": 62 + }, + { + "epoch": 0.0, + "grad_norm": 1.6367872868851776, + "learning_rate": 1.3179916317991635e-06, + "loss": 0.7966, + "step": 63 + }, + { + "epoch": 0.0, + "grad_norm": 1.7926974408303888, + "learning_rate": 1.3389121338912134e-06, + "loss": 0.7534, + "step": 64 + }, + { + "epoch": 0.0, + "grad_norm": 1.6606628925876652, + "learning_rate": 1.3598326359832636e-06, + "loss": 0.7751, + "step": 65 + }, + { + "epoch": 0.0, + "grad_norm": 2.3262653732005614, + "learning_rate": 1.380753138075314e-06, + "loss": 0.7957, + "step": 66 + }, + { + "epoch": 0.0, + "grad_norm": 1.9727336258636625, + "learning_rate": 1.4016736401673641e-06, + "loss": 0.7589, + "step": 67 + }, + { + "epoch": 0.0, + "grad_norm": 1.5456704266811179, + "learning_rate": 1.4225941422594145e-06, + "loss": 0.7719, + "step": 68 + }, + { + "epoch": 0.0, + "grad_norm": 2.696764818970671, + "learning_rate": 1.4435146443514645e-06, + "loss": 0.7103, + "step": 69 + }, + { + "epoch": 0.0, + "grad_norm": 1.8835448556085705, + "learning_rate": 1.4644351464435146e-06, + "loss": 0.7208, + "step": 70 + }, + { + "epoch": 0.0, + "grad_norm": 1.5943338456870952, + "learning_rate": 1.485355648535565e-06, + "loss": 0.7296, + "step": 71 + }, + { + "epoch": 0.0, + "grad_norm": 2.142273603062082, + "learning_rate": 1.5062761506276152e-06, + "loss": 0.7343, + "step": 72 + }, + { + "epoch": 0.0, + "grad_norm": 1.9454596449518564, + "learning_rate": 1.5271966527196656e-06, + "loss": 0.7321, + "step": 73 + }, + { + "epoch": 0.0, + "grad_norm": 2.384812557384574, + "learning_rate": 1.5481171548117155e-06, + "loss": 0.76, + "step": 74 + }, + { + "epoch": 0.0, + "grad_norm": 1.7705882044117423, + "learning_rate": 1.5690376569037657e-06, + "loss": 0.7091, + "step": 75 + }, + { + "epoch": 0.0, + "grad_norm": 1.6371238362918907, + "learning_rate": 1.589958158995816e-06, + "loss": 0.709, + "step": 76 + }, + { + "epoch": 0.0, + "grad_norm": 2.931138171116971, + "learning_rate": 1.6108786610878662e-06, + "loss": 0.7118, + "step": 77 + }, + { + "epoch": 0.0, + "grad_norm": 1.790129601746314, + "learning_rate": 1.6317991631799166e-06, + "loss": 0.7066, + "step": 78 + }, + { + "epoch": 0.0, + "grad_norm": 1.561249405813863, + "learning_rate": 1.6527196652719666e-06, + "loss": 0.6805, + "step": 79 + }, + { + "epoch": 0.01, + "grad_norm": 2.136737674376449, + "learning_rate": 1.6736401673640167e-06, + "loss": 0.6977, + "step": 80 + }, + { + "epoch": 0.01, + "grad_norm": 1.7710100366758756, + "learning_rate": 1.694560669456067e-06, + "loss": 0.722, + "step": 81 + }, + { + "epoch": 0.01, + "grad_norm": 1.5677734826636633, + "learning_rate": 1.7154811715481173e-06, + "loss": 0.678, + "step": 82 + }, + { + "epoch": 0.01, + "grad_norm": 2.876098097240444, + "learning_rate": 1.7364016736401676e-06, + "loss": 0.6647, + "step": 83 + }, + { + "epoch": 0.01, + "grad_norm": 6.079061521700847, + "learning_rate": 1.7573221757322176e-06, + "loss": 0.6866, + "step": 84 + }, + { + "epoch": 0.01, + "grad_norm": 1.7513118734948716, + "learning_rate": 1.7782426778242678e-06, + "loss": 0.6748, + "step": 85 + }, + { + "epoch": 0.01, + "grad_norm": 1.5300678728195052, + "learning_rate": 1.7991631799163181e-06, + "loss": 0.6544, + "step": 86 + }, + { + "epoch": 0.01, + "grad_norm": 1.5018200347221655, + "learning_rate": 1.8200836820083683e-06, + "loss": 0.6691, + "step": 87 + }, + { + "epoch": 0.01, + "grad_norm": 1.6743180612769364, + "learning_rate": 1.8410041841004187e-06, + "loss": 0.6736, + "step": 88 + }, + { + "epoch": 0.01, + "grad_norm": 2.8169151334734304, + "learning_rate": 1.8619246861924686e-06, + "loss": 0.5151, + "step": 89 + }, + { + "epoch": 0.01, + "grad_norm": 2.103169025054942, + "learning_rate": 1.8828451882845188e-06, + "loss": 0.6861, + "step": 90 + }, + { + "epoch": 0.01, + "grad_norm": 2.7336268979816554, + "learning_rate": 1.9037656903765692e-06, + "loss": 0.6725, + "step": 91 + }, + { + "epoch": 0.01, + "grad_norm": 1.9197089606721394, + "learning_rate": 1.9246861924686196e-06, + "loss": 0.6709, + "step": 92 + }, + { + "epoch": 0.01, + "grad_norm": 1.6507475729497776, + "learning_rate": 1.9456066945606697e-06, + "loss": 0.6621, + "step": 93 + }, + { + "epoch": 0.01, + "grad_norm": 1.5919123304273, + "learning_rate": 1.96652719665272e-06, + "loss": 0.6644, + "step": 94 + }, + { + "epoch": 0.01, + "grad_norm": 1.6086503140886639, + "learning_rate": 1.98744769874477e-06, + "loss": 0.6779, + "step": 95 + }, + { + "epoch": 0.01, + "grad_norm": 1.6852890600303034, + "learning_rate": 2.0083682008368202e-06, + "loss": 0.6649, + "step": 96 + }, + { + "epoch": 0.01, + "grad_norm": 1.709169433281033, + "learning_rate": 2.0292887029288704e-06, + "loss": 0.6657, + "step": 97 + }, + { + "epoch": 0.01, + "grad_norm": 2.255857063906131, + "learning_rate": 2.0502092050209206e-06, + "loss": 0.6827, + "step": 98 + }, + { + "epoch": 0.01, + "grad_norm": 2.2727597545863, + "learning_rate": 2.071129707112971e-06, + "loss": 0.6371, + "step": 99 + }, + { + "epoch": 0.01, + "grad_norm": 1.7926032772173868, + "learning_rate": 2.092050209205021e-06, + "loss": 0.6583, + "step": 100 + }, + { + "epoch": 0.01, + "grad_norm": 2.099978743369382, + "learning_rate": 2.112970711297071e-06, + "loss": 0.6969, + "step": 101 + }, + { + "epoch": 0.01, + "grad_norm": 2.0217607651610785, + "learning_rate": 2.1338912133891217e-06, + "loss": 0.6691, + "step": 102 + }, + { + "epoch": 0.01, + "grad_norm": 1.8662523120478176, + "learning_rate": 2.154811715481172e-06, + "loss": 0.6395, + "step": 103 + }, + { + "epoch": 0.01, + "grad_norm": 1.4402038425145702, + "learning_rate": 2.175732217573222e-06, + "loss": 0.6115, + "step": 104 + }, + { + "epoch": 0.01, + "grad_norm": 1.7596873929734194, + "learning_rate": 2.196652719665272e-06, + "loss": 0.6733, + "step": 105 + }, + { + "epoch": 0.01, + "grad_norm": 1.6899417778676438, + "learning_rate": 2.2175732217573223e-06, + "loss": 0.6507, + "step": 106 + }, + { + "epoch": 0.01, + "grad_norm": 1.920649295828148, + "learning_rate": 2.2384937238493725e-06, + "loss": 0.6683, + "step": 107 + }, + { + "epoch": 0.01, + "grad_norm": 1.7786419616325029, + "learning_rate": 2.2594142259414227e-06, + "loss": 0.6756, + "step": 108 + }, + { + "epoch": 0.01, + "grad_norm": 2.516926447022803, + "learning_rate": 2.2803347280334732e-06, + "loss": 0.6141, + "step": 109 + }, + { + "epoch": 0.01, + "grad_norm": 1.8608800321743426, + "learning_rate": 2.301255230125523e-06, + "loss": 0.64, + "step": 110 + }, + { + "epoch": 0.01, + "grad_norm": 2.547921974374885, + "learning_rate": 2.322175732217573e-06, + "loss": 0.6765, + "step": 111 + }, + { + "epoch": 0.01, + "grad_norm": 1.5918671185078295, + "learning_rate": 2.3430962343096237e-06, + "loss": 0.6028, + "step": 112 + }, + { + "epoch": 0.01, + "grad_norm": 2.542704232862084, + "learning_rate": 2.364016736401674e-06, + "loss": 0.6367, + "step": 113 + }, + { + "epoch": 0.01, + "grad_norm": 1.9542813058768365, + "learning_rate": 2.384937238493724e-06, + "loss": 0.6052, + "step": 114 + }, + { + "epoch": 0.01, + "grad_norm": 2.1428940037238218, + "learning_rate": 2.4058577405857742e-06, + "loss": 0.6141, + "step": 115 + }, + { + "epoch": 0.01, + "grad_norm": 1.5871575608949053, + "learning_rate": 2.4267782426778244e-06, + "loss": 0.6421, + "step": 116 + }, + { + "epoch": 0.01, + "grad_norm": 1.6380030671795447, + "learning_rate": 2.4476987447698746e-06, + "loss": 0.6206, + "step": 117 + }, + { + "epoch": 0.01, + "grad_norm": 1.768553551606372, + "learning_rate": 2.4686192468619247e-06, + "loss": 0.6136, + "step": 118 + }, + { + "epoch": 0.01, + "grad_norm": 1.6324952719566632, + "learning_rate": 2.4895397489539753e-06, + "loss": 0.6165, + "step": 119 + }, + { + "epoch": 0.01, + "grad_norm": 1.6563571805125223, + "learning_rate": 2.510460251046025e-06, + "loss": 0.6085, + "step": 120 + }, + { + "epoch": 0.01, + "grad_norm": 1.7670514673682962, + "learning_rate": 2.5313807531380757e-06, + "loss": 0.6067, + "step": 121 + }, + { + "epoch": 0.01, + "grad_norm": 1.996143798308078, + "learning_rate": 2.552301255230126e-06, + "loss": 0.6103, + "step": 122 + }, + { + "epoch": 0.01, + "grad_norm": 2.245567993212247, + "learning_rate": 2.5732217573221756e-06, + "loss": 0.6164, + "step": 123 + }, + { + "epoch": 0.01, + "grad_norm": 1.7354389724223707, + "learning_rate": 2.594142259414226e-06, + "loss": 0.6175, + "step": 124 + }, + { + "epoch": 0.01, + "grad_norm": 1.7223077760804373, + "learning_rate": 2.6150627615062763e-06, + "loss": 0.6069, + "step": 125 + }, + { + "epoch": 0.01, + "grad_norm": 1.7790619431990171, + "learning_rate": 2.635983263598327e-06, + "loss": 0.6347, + "step": 126 + }, + { + "epoch": 0.01, + "grad_norm": 1.9358395244420905, + "learning_rate": 2.6569037656903767e-06, + "loss": 0.5972, + "step": 127 + }, + { + "epoch": 0.01, + "grad_norm": 2.876919263803925, + "learning_rate": 2.677824267782427e-06, + "loss": 0.6134, + "step": 128 + }, + { + "epoch": 0.01, + "grad_norm": 1.8284656253275764, + "learning_rate": 2.6987447698744774e-06, + "loss": 0.5963, + "step": 129 + }, + { + "epoch": 0.01, + "grad_norm": 2.0927052573703575, + "learning_rate": 2.719665271966527e-06, + "loss": 0.6374, + "step": 130 + }, + { + "epoch": 0.01, + "grad_norm": 2.043065973554129, + "learning_rate": 2.7405857740585778e-06, + "loss": 0.6321, + "step": 131 + }, + { + "epoch": 0.01, + "grad_norm": 1.8387676107474462, + "learning_rate": 2.761506276150628e-06, + "loss": 0.5928, + "step": 132 + }, + { + "epoch": 0.01, + "grad_norm": 1.6784610566868574, + "learning_rate": 2.7824267782426777e-06, + "loss": 0.6031, + "step": 133 + }, + { + "epoch": 0.01, + "grad_norm": 1.7524319607012582, + "learning_rate": 2.8033472803347283e-06, + "loss": 0.5967, + "step": 134 + }, + { + "epoch": 0.01, + "grad_norm": 1.6078432497800754, + "learning_rate": 2.8242677824267784e-06, + "loss": 0.5957, + "step": 135 + }, + { + "epoch": 0.01, + "grad_norm": 2.5260696980523862, + "learning_rate": 2.845188284518829e-06, + "loss": 0.6297, + "step": 136 + }, + { + "epoch": 0.01, + "grad_norm": 1.9377468490909344, + "learning_rate": 2.8661087866108788e-06, + "loss": 0.5889, + "step": 137 + }, + { + "epoch": 0.01, + "grad_norm": 4.5307926184565455, + "learning_rate": 2.887029288702929e-06, + "loss": 0.6061, + "step": 138 + }, + { + "epoch": 0.01, + "grad_norm": 1.9059167405758104, + "learning_rate": 2.9079497907949795e-06, + "loss": 0.6077, + "step": 139 + }, + { + "epoch": 0.01, + "grad_norm": 1.9100292543552058, + "learning_rate": 2.9288702928870293e-06, + "loss": 0.5915, + "step": 140 + }, + { + "epoch": 0.01, + "grad_norm": 1.732542946589111, + "learning_rate": 2.94979079497908e-06, + "loss": 0.5694, + "step": 141 + }, + { + "epoch": 0.01, + "grad_norm": 2.5538540167703245, + "learning_rate": 2.97071129707113e-06, + "loss": 0.6137, + "step": 142 + }, + { + "epoch": 0.01, + "grad_norm": 3.3343185435662486, + "learning_rate": 2.9916317991631798e-06, + "loss": 0.623, + "step": 143 + }, + { + "epoch": 0.01, + "grad_norm": 2.0435238439705445, + "learning_rate": 3.0125523012552303e-06, + "loss": 0.5857, + "step": 144 + }, + { + "epoch": 0.01, + "grad_norm": 1.9264488409367186, + "learning_rate": 3.0334728033472805e-06, + "loss": 0.5854, + "step": 145 + }, + { + "epoch": 0.01, + "grad_norm": 1.8358898581063288, + "learning_rate": 3.054393305439331e-06, + "loss": 0.573, + "step": 146 + }, + { + "epoch": 0.01, + "grad_norm": 2.0776269822206817, + "learning_rate": 3.075313807531381e-06, + "loss": 0.6185, + "step": 147 + }, + { + "epoch": 0.01, + "grad_norm": 1.758152422938119, + "learning_rate": 3.096234309623431e-06, + "loss": 0.5661, + "step": 148 + }, + { + "epoch": 0.01, + "grad_norm": 2.2387002347198237, + "learning_rate": 3.1171548117154816e-06, + "loss": 0.6174, + "step": 149 + }, + { + "epoch": 0.01, + "grad_norm": 1.8649137544286947, + "learning_rate": 3.1380753138075313e-06, + "loss": 0.5836, + "step": 150 + }, + { + "epoch": 0.01, + "grad_norm": 2.1614961662618675, + "learning_rate": 3.158995815899582e-06, + "loss": 0.6087, + "step": 151 + }, + { + "epoch": 0.01, + "grad_norm": 2.963025160385293, + "learning_rate": 3.179916317991632e-06, + "loss": 0.6015, + "step": 152 + }, + { + "epoch": 0.01, + "grad_norm": 2.127885928484131, + "learning_rate": 3.200836820083682e-06, + "loss": 0.5827, + "step": 153 + }, + { + "epoch": 0.01, + "grad_norm": 1.8926972991118225, + "learning_rate": 3.2217573221757324e-06, + "loss": 0.5639, + "step": 154 + }, + { + "epoch": 0.01, + "grad_norm": 1.9778972234978238, + "learning_rate": 3.2426778242677826e-06, + "loss": 0.5472, + "step": 155 + }, + { + "epoch": 0.01, + "grad_norm": 3.742073166608714, + "learning_rate": 3.263598326359833e-06, + "loss": 0.62, + "step": 156 + }, + { + "epoch": 0.01, + "grad_norm": 1.9624844320438002, + "learning_rate": 3.284518828451883e-06, + "loss": 0.576, + "step": 157 + }, + { + "epoch": 0.01, + "grad_norm": 2.0381244644824084, + "learning_rate": 3.305439330543933e-06, + "loss": 0.5507, + "step": 158 + }, + { + "epoch": 0.01, + "grad_norm": 1.2545084375913989, + "learning_rate": 3.3263598326359837e-06, + "loss": 0.4356, + "step": 159 + }, + { + "epoch": 0.01, + "grad_norm": 2.1873318854202206, + "learning_rate": 3.3472803347280334e-06, + "loss": 0.5827, + "step": 160 + }, + { + "epoch": 0.01, + "grad_norm": 2.2282400425869424, + "learning_rate": 3.368200836820084e-06, + "loss": 0.6281, + "step": 161 + }, + { + "epoch": 0.01, + "grad_norm": 1.897405087693351, + "learning_rate": 3.389121338912134e-06, + "loss": 0.5442, + "step": 162 + }, + { + "epoch": 0.01, + "grad_norm": 6.088965814031687, + "learning_rate": 3.410041841004184e-06, + "loss": 0.5998, + "step": 163 + }, + { + "epoch": 0.01, + "grad_norm": 2.026569481356135, + "learning_rate": 3.4309623430962345e-06, + "loss": 0.5726, + "step": 164 + }, + { + "epoch": 0.01, + "grad_norm": 6.103211691829716, + "learning_rate": 3.4518828451882847e-06, + "loss": 0.4828, + "step": 165 + }, + { + "epoch": 0.01, + "grad_norm": 2.9849847813201476, + "learning_rate": 3.4728033472803353e-06, + "loss": 0.5415, + "step": 166 + }, + { + "epoch": 0.01, + "grad_norm": 2.1630055011267317, + "learning_rate": 3.493723849372385e-06, + "loss": 0.5518, + "step": 167 + }, + { + "epoch": 0.01, + "grad_norm": 2.4194171052375233, + "learning_rate": 3.514644351464435e-06, + "loss": 0.5851, + "step": 168 + }, + { + "epoch": 0.01, + "grad_norm": 1.8093937261849506, + "learning_rate": 3.5355648535564858e-06, + "loss": 0.5697, + "step": 169 + }, + { + "epoch": 0.01, + "grad_norm": 1.9064649752754856, + "learning_rate": 3.5564853556485355e-06, + "loss": 0.6195, + "step": 170 + }, + { + "epoch": 0.01, + "grad_norm": 2.846338624409405, + "learning_rate": 3.577405857740586e-06, + "loss": 0.576, + "step": 171 + }, + { + "epoch": 0.01, + "grad_norm": 1.829929989511185, + "learning_rate": 3.5983263598326363e-06, + "loss": 0.5492, + "step": 172 + }, + { + "epoch": 0.01, + "grad_norm": 1.9034714933492618, + "learning_rate": 3.619246861924686e-06, + "loss": 0.5418, + "step": 173 + }, + { + "epoch": 0.01, + "grad_norm": 1.962239415108419, + "learning_rate": 3.6401673640167366e-06, + "loss": 0.5805, + "step": 174 + }, + { + "epoch": 0.01, + "grad_norm": 1.6926327780212296, + "learning_rate": 3.6610878661087868e-06, + "loss": 0.5272, + "step": 175 + }, + { + "epoch": 0.01, + "grad_norm": 0.8187430316592739, + "learning_rate": 3.6820083682008374e-06, + "loss": 0.4574, + "step": 176 + }, + { + "epoch": 0.01, + "grad_norm": 35.24986259368454, + "learning_rate": 3.702928870292887e-06, + "loss": 0.5984, + "step": 177 + }, + { + "epoch": 0.01, + "grad_norm": 2.2422838567432923, + "learning_rate": 3.7238493723849373e-06, + "loss": 0.5598, + "step": 178 + }, + { + "epoch": 0.01, + "grad_norm": 1.8202869130922215, + "learning_rate": 3.744769874476988e-06, + "loss": 0.5432, + "step": 179 + }, + { + "epoch": 0.01, + "grad_norm": 3.2684918363837006, + "learning_rate": 3.7656903765690376e-06, + "loss": 0.5954, + "step": 180 + }, + { + "epoch": 0.01, + "grad_norm": 1.840220183982628, + "learning_rate": 3.786610878661088e-06, + "loss": 0.5471, + "step": 181 + }, + { + "epoch": 0.01, + "grad_norm": 1.7375381450379523, + "learning_rate": 3.8075313807531384e-06, + "loss": 0.5398, + "step": 182 + }, + { + "epoch": 0.01, + "grad_norm": 1.5833188190590233, + "learning_rate": 3.8284518828451885e-06, + "loss": 0.5593, + "step": 183 + }, + { + "epoch": 0.01, + "grad_norm": 2.4203088015902696, + "learning_rate": 3.849372384937239e-06, + "loss": 0.5341, + "step": 184 + }, + { + "epoch": 0.01, + "grad_norm": 2.4037462352952135, + "learning_rate": 3.870292887029289e-06, + "loss": 0.5241, + "step": 185 + }, + { + "epoch": 0.01, + "grad_norm": 1.9018114196493612, + "learning_rate": 3.8912133891213395e-06, + "loss": 0.5541, + "step": 186 + }, + { + "epoch": 0.01, + "grad_norm": 1.8346681758313153, + "learning_rate": 3.912133891213389e-06, + "loss": 0.5884, + "step": 187 + }, + { + "epoch": 0.01, + "grad_norm": 6.6060256359553735, + "learning_rate": 3.93305439330544e-06, + "loss": 0.5501, + "step": 188 + }, + { + "epoch": 0.01, + "grad_norm": 2.1610451908072212, + "learning_rate": 3.9539748953974895e-06, + "loss": 0.5964, + "step": 189 + }, + { + "epoch": 0.01, + "grad_norm": 3.0619098539338365, + "learning_rate": 3.97489539748954e-06, + "loss": 0.5723, + "step": 190 + }, + { + "epoch": 0.01, + "grad_norm": 2.487926344923484, + "learning_rate": 3.995815899581591e-06, + "loss": 0.5874, + "step": 191 + }, + { + "epoch": 0.01, + "grad_norm": 1.8608181899237632, + "learning_rate": 4.0167364016736405e-06, + "loss": 0.5233, + "step": 192 + }, + { + "epoch": 0.01, + "grad_norm": 2.4239428611178764, + "learning_rate": 4.037656903765691e-06, + "loss": 0.5519, + "step": 193 + }, + { + "epoch": 0.01, + "grad_norm": 2.5520657816429004, + "learning_rate": 4.058577405857741e-06, + "loss": 0.6036, + "step": 194 + }, + { + "epoch": 0.01, + "grad_norm": 1.956756479512331, + "learning_rate": 4.0794979079497905e-06, + "loss": 0.5487, + "step": 195 + }, + { + "epoch": 0.01, + "grad_norm": 2.179677188583239, + "learning_rate": 4.100418410041841e-06, + "loss": 0.5362, + "step": 196 + }, + { + "epoch": 0.01, + "grad_norm": 2.58054581350178, + "learning_rate": 4.121338912133892e-06, + "loss": 0.5526, + "step": 197 + }, + { + "epoch": 0.01, + "grad_norm": 2.0882473223940354, + "learning_rate": 4.142259414225942e-06, + "loss": 0.5691, + "step": 198 + }, + { + "epoch": 0.01, + "grad_norm": 2.225018522453562, + "learning_rate": 4.163179916317992e-06, + "loss": 0.5446, + "step": 199 + }, + { + "epoch": 0.01, + "grad_norm": 2.2548977860892365, + "learning_rate": 4.184100418410042e-06, + "loss": 0.5493, + "step": 200 + }, + { + "epoch": 0.01, + "grad_norm": 2.0015897504392632, + "learning_rate": 4.205020920502092e-06, + "loss": 0.5727, + "step": 201 + }, + { + "epoch": 0.01, + "grad_norm": 0.8549759738478968, + "learning_rate": 4.225941422594142e-06, + "loss": 0.4758, + "step": 202 + }, + { + "epoch": 0.01, + "grad_norm": 1.7218688114187135, + "learning_rate": 4.246861924686193e-06, + "loss": 0.5437, + "step": 203 + }, + { + "epoch": 0.01, + "grad_norm": 3.4956100779292383, + "learning_rate": 4.267782426778243e-06, + "loss": 0.5583, + "step": 204 + }, + { + "epoch": 0.01, + "grad_norm": 2.3497272517836394, + "learning_rate": 4.288702928870293e-06, + "loss": 0.561, + "step": 205 + }, + { + "epoch": 0.01, + "grad_norm": 2.1347763158373505, + "learning_rate": 4.309623430962344e-06, + "loss": 0.5512, + "step": 206 + }, + { + "epoch": 0.01, + "grad_norm": 2.017505861291735, + "learning_rate": 4.330543933054393e-06, + "loss": 0.545, + "step": 207 + }, + { + "epoch": 0.01, + "grad_norm": 2.944054774884085, + "learning_rate": 4.351464435146444e-06, + "loss": 0.6029, + "step": 208 + }, + { + "epoch": 0.01, + "grad_norm": 2.05288008370838, + "learning_rate": 4.372384937238494e-06, + "loss": 0.5386, + "step": 209 + }, + { + "epoch": 0.01, + "grad_norm": 1.965382181850134, + "learning_rate": 4.393305439330544e-06, + "loss": 0.5262, + "step": 210 + }, + { + "epoch": 0.01, + "grad_norm": 2.5857770698218623, + "learning_rate": 4.414225941422595e-06, + "loss": 0.5269, + "step": 211 + }, + { + "epoch": 0.01, + "grad_norm": 2.834596608275258, + "learning_rate": 4.435146443514645e-06, + "loss": 0.5422, + "step": 212 + }, + { + "epoch": 0.01, + "grad_norm": 2.057630453079115, + "learning_rate": 4.456066945606695e-06, + "loss": 0.5334, + "step": 213 + }, + { + "epoch": 0.01, + "grad_norm": 2.6220490371524603, + "learning_rate": 4.476987447698745e-06, + "loss": 0.5374, + "step": 214 + }, + { + "epoch": 0.01, + "grad_norm": 2.286435524178221, + "learning_rate": 4.4979079497907956e-06, + "loss": 0.5227, + "step": 215 + }, + { + "epoch": 0.01, + "grad_norm": 2.227202842739513, + "learning_rate": 4.518828451882845e-06, + "loss": 0.5751, + "step": 216 + }, + { + "epoch": 0.01, + "grad_norm": 1.829600359946089, + "learning_rate": 4.539748953974896e-06, + "loss": 0.5109, + "step": 217 + }, + { + "epoch": 0.01, + "grad_norm": 2.1186628117992368, + "learning_rate": 4.5606694560669465e-06, + "loss": 0.5155, + "step": 218 + }, + { + "epoch": 0.01, + "grad_norm": 1.757460139156587, + "learning_rate": 4.581589958158996e-06, + "loss": 0.5528, + "step": 219 + }, + { + "epoch": 0.01, + "grad_norm": 1.913175574549681, + "learning_rate": 4.602510460251046e-06, + "loss": 0.5263, + "step": 220 + }, + { + "epoch": 0.01, + "grad_norm": 1.8302369226695772, + "learning_rate": 4.6234309623430966e-06, + "loss": 0.548, + "step": 221 + }, + { + "epoch": 0.01, + "grad_norm": 2.472263108281696, + "learning_rate": 4.644351464435146e-06, + "loss": 0.5217, + "step": 222 + }, + { + "epoch": 0.01, + "grad_norm": 2.901098226008417, + "learning_rate": 4.665271966527197e-06, + "loss": 0.5094, + "step": 223 + }, + { + "epoch": 0.01, + "grad_norm": 2.23530258377056, + "learning_rate": 4.6861924686192475e-06, + "loss": 0.4972, + "step": 224 + }, + { + "epoch": 0.01, + "grad_norm": 2.901061367814879, + "learning_rate": 4.707112970711297e-06, + "loss": 0.5148, + "step": 225 + }, + { + "epoch": 0.01, + "grad_norm": 2.5767890434567686, + "learning_rate": 4.728033472803348e-06, + "loss": 0.5601, + "step": 226 + }, + { + "epoch": 0.01, + "grad_norm": 2.314143929513127, + "learning_rate": 4.7489539748953976e-06, + "loss": 0.5433, + "step": 227 + }, + { + "epoch": 0.01, + "grad_norm": 2.1529240549180026, + "learning_rate": 4.769874476987448e-06, + "loss": 0.5143, + "step": 228 + }, + { + "epoch": 0.01, + "grad_norm": 3.1515149718204225, + "learning_rate": 4.790794979079498e-06, + "loss": 0.5434, + "step": 229 + }, + { + "epoch": 0.01, + "grad_norm": 2.20706496494061, + "learning_rate": 4.8117154811715485e-06, + "loss": 0.4952, + "step": 230 + }, + { + "epoch": 0.01, + "grad_norm": 2.154685233114359, + "learning_rate": 4.832635983263599e-06, + "loss": 0.5263, + "step": 231 + }, + { + "epoch": 0.01, + "grad_norm": 3.338673730530193, + "learning_rate": 4.853556485355649e-06, + "loss": 0.5515, + "step": 232 + }, + { + "epoch": 0.01, + "grad_norm": 2.332063111503053, + "learning_rate": 4.874476987447699e-06, + "loss": 0.5559, + "step": 233 + }, + { + "epoch": 0.01, + "grad_norm": 1.7427624508109318, + "learning_rate": 4.895397489539749e-06, + "loss": 0.5355, + "step": 234 + }, + { + "epoch": 0.01, + "grad_norm": 2.2863543898558905, + "learning_rate": 4.9163179916318e-06, + "loss": 0.5394, + "step": 235 + }, + { + "epoch": 0.01, + "grad_norm": 2.183148941750467, + "learning_rate": 4.9372384937238495e-06, + "loss": 0.493, + "step": 236 + }, + { + "epoch": 0.01, + "grad_norm": 1.8999604880172074, + "learning_rate": 4.9581589958159e-06, + "loss": 0.5232, + "step": 237 + }, + { + "epoch": 0.01, + "grad_norm": 2.024650317117212, + "learning_rate": 4.979079497907951e-06, + "loss": 0.5463, + "step": 238 + }, + { + "epoch": 0.02, + "grad_norm": 4.519833654324174, + "learning_rate": 5e-06, + "loss": 0.5053, + "step": 239 + }, + { + "epoch": 0.02, + "grad_norm": 2.1145062860634294, + "learning_rate": 5.02092050209205e-06, + "loss": 0.5421, + "step": 240 + }, + { + "epoch": 0.02, + "grad_norm": 1.8230791631428602, + "learning_rate": 5.041841004184101e-06, + "loss": 0.5175, + "step": 241 + }, + { + "epoch": 0.02, + "grad_norm": 2.2292071552421118, + "learning_rate": 5.062761506276151e-06, + "loss": 0.5188, + "step": 242 + }, + { + "epoch": 0.02, + "grad_norm": 2.060899197962938, + "learning_rate": 5.083682008368201e-06, + "loss": 0.5064, + "step": 243 + }, + { + "epoch": 0.02, + "grad_norm": 2.287968133304338, + "learning_rate": 5.104602510460252e-06, + "loss": 0.5456, + "step": 244 + }, + { + "epoch": 0.02, + "grad_norm": 0.8504400266281082, + "learning_rate": 5.125523012552301e-06, + "loss": 0.4325, + "step": 245 + }, + { + "epoch": 0.02, + "grad_norm": 2.1361266792185174, + "learning_rate": 5.146443514644351e-06, + "loss": 0.5422, + "step": 246 + }, + { + "epoch": 0.02, + "grad_norm": 2.4283629300625877, + "learning_rate": 5.167364016736403e-06, + "loss": 0.5613, + "step": 247 + }, + { + "epoch": 0.02, + "grad_norm": 2.004467872375992, + "learning_rate": 5.188284518828452e-06, + "loss": 0.5277, + "step": 248 + }, + { + "epoch": 0.02, + "grad_norm": 3.1897735043913804, + "learning_rate": 5.209205020920503e-06, + "loss": 0.5529, + "step": 249 + }, + { + "epoch": 0.02, + "grad_norm": 3.0391670505960278, + "learning_rate": 5.230125523012553e-06, + "loss": 0.5311, + "step": 250 + }, + { + "epoch": 0.02, + "grad_norm": 2.0097009295986283, + "learning_rate": 5.251046025104602e-06, + "loss": 0.5372, + "step": 251 + }, + { + "epoch": 0.02, + "grad_norm": 3.1829047992119124, + "learning_rate": 5.271966527196654e-06, + "loss": 0.5075, + "step": 252 + }, + { + "epoch": 0.02, + "grad_norm": 1.8215535134615874, + "learning_rate": 5.292887029288704e-06, + "loss": 0.5389, + "step": 253 + }, + { + "epoch": 0.02, + "grad_norm": 2.054601898394612, + "learning_rate": 5.313807531380753e-06, + "loss": 0.549, + "step": 254 + }, + { + "epoch": 0.02, + "grad_norm": 5.329629461954194, + "learning_rate": 5.334728033472804e-06, + "loss": 0.5311, + "step": 255 + }, + { + "epoch": 0.02, + "grad_norm": 1.8914673884674622, + "learning_rate": 5.355648535564854e-06, + "loss": 0.52, + "step": 256 + }, + { + "epoch": 0.02, + "grad_norm": 1.0859952944448994, + "learning_rate": 5.376569037656904e-06, + "loss": 0.4608, + "step": 257 + }, + { + "epoch": 0.02, + "grad_norm": 2.503935817868153, + "learning_rate": 5.397489539748955e-06, + "loss": 0.5126, + "step": 258 + }, + { + "epoch": 0.02, + "grad_norm": 3.6643267775576076, + "learning_rate": 5.418410041841005e-06, + "loss": 0.5213, + "step": 259 + }, + { + "epoch": 0.02, + "grad_norm": 2.65610581106084, + "learning_rate": 5.439330543933054e-06, + "loss": 0.5778, + "step": 260 + }, + { + "epoch": 0.02, + "grad_norm": 3.5025442905122772, + "learning_rate": 5.460251046025105e-06, + "loss": 0.5002, + "step": 261 + }, + { + "epoch": 0.02, + "grad_norm": 2.520819133078781, + "learning_rate": 5.4811715481171555e-06, + "loss": 0.5489, + "step": 262 + }, + { + "epoch": 0.02, + "grad_norm": 2.272951313335712, + "learning_rate": 5.502092050209205e-06, + "loss": 0.563, + "step": 263 + }, + { + "epoch": 0.02, + "grad_norm": 2.413045100161984, + "learning_rate": 5.523012552301256e-06, + "loss": 0.5402, + "step": 264 + }, + { + "epoch": 0.02, + "grad_norm": 0.8929401036698851, + "learning_rate": 5.543933054393306e-06, + "loss": 0.4515, + "step": 265 + }, + { + "epoch": 0.02, + "grad_norm": 3.325230474399367, + "learning_rate": 5.564853556485355e-06, + "loss": 0.55, + "step": 266 + }, + { + "epoch": 0.02, + "grad_norm": 5.049356261368777, + "learning_rate": 5.585774058577407e-06, + "loss": 0.5259, + "step": 267 + }, + { + "epoch": 0.02, + "grad_norm": 2.621157778314054, + "learning_rate": 5.6066945606694565e-06, + "loss": 0.5395, + "step": 268 + }, + { + "epoch": 0.02, + "grad_norm": 96.63248774085258, + "learning_rate": 5.627615062761507e-06, + "loss": 0.5568, + "step": 269 + }, + { + "epoch": 0.02, + "grad_norm": 6.91911523630248, + "learning_rate": 5.648535564853557e-06, + "loss": 0.5073, + "step": 270 + }, + { + "epoch": 0.02, + "grad_norm": 4.378666811200259, + "learning_rate": 5.669456066945607e-06, + "loss": 0.5402, + "step": 271 + }, + { + "epoch": 0.02, + "grad_norm": 5.561323053565177, + "learning_rate": 5.690376569037658e-06, + "loss": 0.5315, + "step": 272 + }, + { + "epoch": 0.02, + "grad_norm": 2.6279745240004773, + "learning_rate": 5.711297071129708e-06, + "loss": 0.4996, + "step": 273 + }, + { + "epoch": 0.02, + "grad_norm": 2.221717607952609, + "learning_rate": 5.7322175732217575e-06, + "loss": 0.5017, + "step": 274 + }, + { + "epoch": 0.02, + "grad_norm": 2.705689661615929, + "learning_rate": 5.753138075313808e-06, + "loss": 0.5159, + "step": 275 + }, + { + "epoch": 0.02, + "grad_norm": 1.9104386727427634, + "learning_rate": 5.774058577405858e-06, + "loss": 0.5142, + "step": 276 + }, + { + "epoch": 0.02, + "grad_norm": 2.8517039944287434, + "learning_rate": 5.7949790794979084e-06, + "loss": 0.5703, + "step": 277 + }, + { + "epoch": 0.02, + "grad_norm": 1.7616299815648704, + "learning_rate": 5.815899581589959e-06, + "loss": 0.5456, + "step": 278 + }, + { + "epoch": 0.02, + "grad_norm": 2.6609993986366334, + "learning_rate": 5.836820083682009e-06, + "loss": 0.5327, + "step": 279 + }, + { + "epoch": 0.02, + "grad_norm": 3.6076800181703126, + "learning_rate": 5.8577405857740585e-06, + "loss": 0.505, + "step": 280 + }, + { + "epoch": 0.02, + "grad_norm": 2.3287901339772996, + "learning_rate": 5.878661087866109e-06, + "loss": 0.4926, + "step": 281 + }, + { + "epoch": 0.02, + "grad_norm": 1.9176150332150999, + "learning_rate": 5.89958158995816e-06, + "loss": 0.4637, + "step": 282 + }, + { + "epoch": 0.02, + "grad_norm": 2.836208971568334, + "learning_rate": 5.92050209205021e-06, + "loss": 0.539, + "step": 283 + }, + { + "epoch": 0.02, + "grad_norm": 3.56516241457902, + "learning_rate": 5.94142259414226e-06, + "loss": 0.5162, + "step": 284 + }, + { + "epoch": 0.02, + "grad_norm": 2.234022868978037, + "learning_rate": 5.96234309623431e-06, + "loss": 0.5201, + "step": 285 + }, + { + "epoch": 0.02, + "grad_norm": 7.193157674838078, + "learning_rate": 5.9832635983263595e-06, + "loss": 0.5133, + "step": 286 + }, + { + "epoch": 0.02, + "grad_norm": 3.6235735117222805, + "learning_rate": 6.004184100418411e-06, + "loss": 0.5325, + "step": 287 + }, + { + "epoch": 0.02, + "grad_norm": 2.0685788179195264, + "learning_rate": 6.025104602510461e-06, + "loss": 0.5088, + "step": 288 + }, + { + "epoch": 0.02, + "grad_norm": 2.8676759066530564, + "learning_rate": 6.046025104602511e-06, + "loss": 0.5303, + "step": 289 + }, + { + "epoch": 0.02, + "grad_norm": 2.0962004596269437, + "learning_rate": 6.066945606694561e-06, + "loss": 0.4912, + "step": 290 + }, + { + "epoch": 0.02, + "grad_norm": 2.3877027702814257, + "learning_rate": 6.087866108786611e-06, + "loss": 0.5036, + "step": 291 + }, + { + "epoch": 0.02, + "grad_norm": 2.7071208206826958, + "learning_rate": 6.108786610878662e-06, + "loss": 0.4957, + "step": 292 + }, + { + "epoch": 0.02, + "grad_norm": 2.2847240073491752, + "learning_rate": 6.129707112970712e-06, + "loss": 0.5106, + "step": 293 + }, + { + "epoch": 0.02, + "grad_norm": 3.3774041487743665, + "learning_rate": 6.150627615062762e-06, + "loss": 0.4879, + "step": 294 + }, + { + "epoch": 0.02, + "grad_norm": 2.6449473691246475, + "learning_rate": 6.171548117154812e-06, + "loss": 0.5333, + "step": 295 + }, + { + "epoch": 0.02, + "grad_norm": 2.4353294447704243, + "learning_rate": 6.192468619246862e-06, + "loss": 0.4978, + "step": 296 + }, + { + "epoch": 0.02, + "grad_norm": 2.020778042253329, + "learning_rate": 6.213389121338913e-06, + "loss": 0.5214, + "step": 297 + }, + { + "epoch": 0.02, + "grad_norm": 3.0942726866993144, + "learning_rate": 6.234309623430963e-06, + "loss": 0.5359, + "step": 298 + }, + { + "epoch": 0.02, + "grad_norm": 3.6716584551967406, + "learning_rate": 6.255230125523013e-06, + "loss": 0.4982, + "step": 299 + }, + { + "epoch": 0.02, + "grad_norm": 2.0646153430845535, + "learning_rate": 6.276150627615063e-06, + "loss": 0.5266, + "step": 300 + }, + { + "epoch": 0.02, + "grad_norm": 1.94715980004556, + "learning_rate": 6.297071129707113e-06, + "loss": 0.5234, + "step": 301 + }, + { + "epoch": 0.02, + "grad_norm": 1.1027040732914546, + "learning_rate": 6.317991631799164e-06, + "loss": 0.4561, + "step": 302 + }, + { + "epoch": 0.02, + "grad_norm": 2.206545821119485, + "learning_rate": 6.3389121338912145e-06, + "loss": 0.4935, + "step": 303 + }, + { + "epoch": 0.02, + "grad_norm": 2.8473207773595686, + "learning_rate": 6.359832635983264e-06, + "loss": 0.5527, + "step": 304 + }, + { + "epoch": 0.02, + "grad_norm": 6.023532167096835, + "learning_rate": 6.380753138075314e-06, + "loss": 0.4862, + "step": 305 + }, + { + "epoch": 0.02, + "grad_norm": 4.292990098601755, + "learning_rate": 6.401673640167364e-06, + "loss": 0.4951, + "step": 306 + }, + { + "epoch": 0.02, + "grad_norm": 2.8132000573019575, + "learning_rate": 6.422594142259415e-06, + "loss": 0.5869, + "step": 307 + }, + { + "epoch": 0.02, + "grad_norm": 10.973838233332685, + "learning_rate": 6.443514644351465e-06, + "loss": 0.4958, + "step": 308 + }, + { + "epoch": 0.02, + "grad_norm": 3.539826559036644, + "learning_rate": 6.4644351464435155e-06, + "loss": 0.5604, + "step": 309 + }, + { + "epoch": 0.02, + "grad_norm": 3.0504923999789053, + "learning_rate": 6.485355648535565e-06, + "loss": 0.4794, + "step": 310 + }, + { + "epoch": 0.02, + "grad_norm": 3.0248026253469127, + "learning_rate": 6.506276150627615e-06, + "loss": 0.4841, + "step": 311 + }, + { + "epoch": 0.02, + "grad_norm": 4.060222864652489, + "learning_rate": 6.527196652719666e-06, + "loss": 0.5296, + "step": 312 + }, + { + "epoch": 0.02, + "grad_norm": 2.5536836715360005, + "learning_rate": 6.548117154811716e-06, + "loss": 0.5435, + "step": 313 + }, + { + "epoch": 0.02, + "grad_norm": 2.8216575901049348, + "learning_rate": 6.569037656903766e-06, + "loss": 0.4525, + "step": 314 + }, + { + "epoch": 0.02, + "grad_norm": 2.2872488197895846, + "learning_rate": 6.5899581589958165e-06, + "loss": 0.4808, + "step": 315 + }, + { + "epoch": 0.02, + "grad_norm": 2.0154227553774837, + "learning_rate": 6.610878661087866e-06, + "loss": 0.5075, + "step": 316 + }, + { + "epoch": 0.02, + "grad_norm": 14.689464889095058, + "learning_rate": 6.631799163179918e-06, + "loss": 0.5294, + "step": 317 + }, + { + "epoch": 0.02, + "grad_norm": 3.511514014884, + "learning_rate": 6.652719665271967e-06, + "loss": 0.5001, + "step": 318 + }, + { + "epoch": 0.02, + "grad_norm": 5.462426397042369, + "learning_rate": 6.673640167364017e-06, + "loss": 0.5201, + "step": 319 + }, + { + "epoch": 0.02, + "grad_norm": 2.309178481355211, + "learning_rate": 6.694560669456067e-06, + "loss": 0.4914, + "step": 320 + }, + { + "epoch": 0.02, + "grad_norm": 3.00046890267073, + "learning_rate": 6.7154811715481175e-06, + "loss": 0.4846, + "step": 321 + }, + { + "epoch": 0.02, + "grad_norm": 2.771069502433317, + "learning_rate": 6.736401673640168e-06, + "loss": 0.5383, + "step": 322 + }, + { + "epoch": 0.02, + "grad_norm": 2.7459182369145854, + "learning_rate": 6.757322175732219e-06, + "loss": 0.5196, + "step": 323 + }, + { + "epoch": 0.02, + "grad_norm": 3.0963134515292685, + "learning_rate": 6.778242677824268e-06, + "loss": 0.4883, + "step": 324 + }, + { + "epoch": 0.02, + "grad_norm": 21.49385232659443, + "learning_rate": 6.799163179916318e-06, + "loss": 0.522, + "step": 325 + }, + { + "epoch": 0.02, + "grad_norm": 1.6575515829395526, + "learning_rate": 6.820083682008368e-06, + "loss": 0.4965, + "step": 326 + }, + { + "epoch": 0.02, + "grad_norm": 2.7250280279432313, + "learning_rate": 6.841004184100419e-06, + "loss": 0.5186, + "step": 327 + }, + { + "epoch": 0.02, + "grad_norm": 2.0326723726262537, + "learning_rate": 6.861924686192469e-06, + "loss": 0.5194, + "step": 328 + }, + { + "epoch": 0.02, + "grad_norm": 2.874319558287596, + "learning_rate": 6.88284518828452e-06, + "loss": 0.4986, + "step": 329 + }, + { + "epoch": 0.02, + "grad_norm": 13.5858321447557, + "learning_rate": 6.903765690376569e-06, + "loss": 0.4747, + "step": 330 + }, + { + "epoch": 0.02, + "grad_norm": 2.85784679462047, + "learning_rate": 6.924686192468619e-06, + "loss": 0.5098, + "step": 331 + }, + { + "epoch": 0.02, + "grad_norm": 2.824972859212857, + "learning_rate": 6.9456066945606706e-06, + "loss": 0.5029, + "step": 332 + }, + { + "epoch": 0.02, + "grad_norm": 3.1117225820256342, + "learning_rate": 6.96652719665272e-06, + "loss": 0.5309, + "step": 333 + }, + { + "epoch": 0.02, + "grad_norm": 4.061982984691296, + "learning_rate": 6.98744769874477e-06, + "loss": 0.5131, + "step": 334 + }, + { + "epoch": 0.02, + "grad_norm": 4.536437490932801, + "learning_rate": 7.008368200836821e-06, + "loss": 0.5196, + "step": 335 + }, + { + "epoch": 0.02, + "grad_norm": 2.309975716765149, + "learning_rate": 7.02928870292887e-06, + "loss": 0.4742, + "step": 336 + }, + { + "epoch": 0.02, + "grad_norm": 3.570204434848717, + "learning_rate": 7.050209205020922e-06, + "loss": 0.5131, + "step": 337 + }, + { + "epoch": 0.02, + "grad_norm": 4.707895248879186, + "learning_rate": 7.0711297071129716e-06, + "loss": 0.5506, + "step": 338 + }, + { + "epoch": 0.02, + "grad_norm": 2.673949592438475, + "learning_rate": 7.092050209205021e-06, + "loss": 0.4867, + "step": 339 + }, + { + "epoch": 0.02, + "grad_norm": 1.7316392716115172, + "learning_rate": 7.112970711297071e-06, + "loss": 0.4904, + "step": 340 + }, + { + "epoch": 0.02, + "grad_norm": 2.3951559468838584, + "learning_rate": 7.133891213389122e-06, + "loss": 0.4647, + "step": 341 + }, + { + "epoch": 0.02, + "grad_norm": 2.448340965348166, + "learning_rate": 7.154811715481172e-06, + "loss": 0.4623, + "step": 342 + }, + { + "epoch": 0.02, + "grad_norm": 2.6649219852341175, + "learning_rate": 7.175732217573223e-06, + "loss": 0.5209, + "step": 343 + }, + { + "epoch": 0.02, + "grad_norm": 4.585114417633767, + "learning_rate": 7.1966527196652726e-06, + "loss": 0.5184, + "step": 344 + }, + { + "epoch": 0.02, + "grad_norm": 5.452898799199118, + "learning_rate": 7.217573221757322e-06, + "loss": 0.4877, + "step": 345 + }, + { + "epoch": 0.02, + "grad_norm": 2.833547011782983, + "learning_rate": 7.238493723849372e-06, + "loss": 0.478, + "step": 346 + }, + { + "epoch": 0.02, + "grad_norm": 2.7953763531070988, + "learning_rate": 7.2594142259414235e-06, + "loss": 0.5042, + "step": 347 + }, + { + "epoch": 0.02, + "grad_norm": 6.734182011955151, + "learning_rate": 7.280334728033473e-06, + "loss": 0.4758, + "step": 348 + }, + { + "epoch": 0.02, + "grad_norm": 1.9624647174911558, + "learning_rate": 7.301255230125524e-06, + "loss": 0.5058, + "step": 349 + }, + { + "epoch": 0.02, + "grad_norm": 2.3363783927689292, + "learning_rate": 7.3221757322175736e-06, + "loss": 0.479, + "step": 350 + }, + { + "epoch": 0.02, + "grad_norm": 1.9336687093258327, + "learning_rate": 7.343096234309623e-06, + "loss": 0.5096, + "step": 351 + }, + { + "epoch": 0.02, + "grad_norm": 2.266888841938558, + "learning_rate": 7.364016736401675e-06, + "loss": 0.5157, + "step": 352 + }, + { + "epoch": 0.02, + "grad_norm": 3.5073325128843336, + "learning_rate": 7.3849372384937245e-06, + "loss": 0.4959, + "step": 353 + }, + { + "epoch": 0.02, + "grad_norm": 1.8126613086801358, + "learning_rate": 7.405857740585774e-06, + "loss": 0.5085, + "step": 354 + }, + { + "epoch": 0.02, + "grad_norm": 3.60687001755244, + "learning_rate": 7.426778242677825e-06, + "loss": 0.5306, + "step": 355 + }, + { + "epoch": 0.02, + "grad_norm": 6.341686498326082, + "learning_rate": 7.4476987447698746e-06, + "loss": 0.4989, + "step": 356 + }, + { + "epoch": 0.02, + "grad_norm": 1.0904637210259118, + "learning_rate": 7.468619246861926e-06, + "loss": 0.4616, + "step": 357 + }, + { + "epoch": 0.02, + "grad_norm": 3.133117015555545, + "learning_rate": 7.489539748953976e-06, + "loss": 0.5255, + "step": 358 + }, + { + "epoch": 0.02, + "grad_norm": 5.006984132530126, + "learning_rate": 7.5104602510460255e-06, + "loss": 0.5218, + "step": 359 + }, + { + "epoch": 0.02, + "grad_norm": 3.3938720688264263, + "learning_rate": 7.531380753138075e-06, + "loss": 0.5064, + "step": 360 + }, + { + "epoch": 0.02, + "grad_norm": 2.870676793551916, + "learning_rate": 7.552301255230127e-06, + "loss": 0.4847, + "step": 361 + }, + { + "epoch": 0.02, + "grad_norm": 0.8390125025598012, + "learning_rate": 7.573221757322176e-06, + "loss": 0.4413, + "step": 362 + }, + { + "epoch": 0.02, + "grad_norm": 3.327764398915044, + "learning_rate": 7.594142259414227e-06, + "loss": 0.5116, + "step": 363 + }, + { + "epoch": 0.02, + "grad_norm": 2.1647685751562404, + "learning_rate": 7.615062761506277e-06, + "loss": 0.5016, + "step": 364 + }, + { + "epoch": 0.02, + "grad_norm": 2.3640020871193026, + "learning_rate": 7.635983263598326e-06, + "loss": 0.5247, + "step": 365 + }, + { + "epoch": 0.02, + "grad_norm": 3.700581385258537, + "learning_rate": 7.656903765690377e-06, + "loss": 0.4815, + "step": 366 + }, + { + "epoch": 0.02, + "grad_norm": 4.767697256841907, + "learning_rate": 7.677824267782428e-06, + "loss": 0.4991, + "step": 367 + }, + { + "epoch": 0.02, + "grad_norm": 2.4927873233606883, + "learning_rate": 7.698744769874478e-06, + "loss": 0.5097, + "step": 368 + }, + { + "epoch": 0.02, + "grad_norm": 4.828020437858698, + "learning_rate": 7.719665271966527e-06, + "loss": 0.4657, + "step": 369 + }, + { + "epoch": 0.02, + "grad_norm": 2.6332343549170316, + "learning_rate": 7.740585774058578e-06, + "loss": 0.502, + "step": 370 + }, + { + "epoch": 0.02, + "grad_norm": 2.4241080414330596, + "learning_rate": 7.761506276150628e-06, + "loss": 0.4871, + "step": 371 + }, + { + "epoch": 0.02, + "grad_norm": 2.4052948087097104, + "learning_rate": 7.782426778242679e-06, + "loss": 0.5261, + "step": 372 + }, + { + "epoch": 0.02, + "grad_norm": 2.4405844871804074, + "learning_rate": 7.80334728033473e-06, + "loss": 0.4534, + "step": 373 + }, + { + "epoch": 0.02, + "grad_norm": 4.648179101276863, + "learning_rate": 7.824267782426778e-06, + "loss": 0.4664, + "step": 374 + }, + { + "epoch": 0.02, + "grad_norm": 2.7911410791896376, + "learning_rate": 7.845188284518829e-06, + "loss": 0.5241, + "step": 375 + }, + { + "epoch": 0.02, + "grad_norm": 1.8902105884147422, + "learning_rate": 7.86610878661088e-06, + "loss": 0.4666, + "step": 376 + }, + { + "epoch": 0.02, + "grad_norm": 3.0296579259691927, + "learning_rate": 7.88702928870293e-06, + "loss": 0.4474, + "step": 377 + }, + { + "epoch": 0.02, + "grad_norm": 11.222181680520652, + "learning_rate": 7.907949790794979e-06, + "loss": 0.4888, + "step": 378 + }, + { + "epoch": 0.02, + "grad_norm": 3.0631162291398097, + "learning_rate": 7.92887029288703e-06, + "loss": 0.4995, + "step": 379 + }, + { + "epoch": 0.02, + "grad_norm": 6.209846140694516, + "learning_rate": 7.94979079497908e-06, + "loss": 0.537, + "step": 380 + }, + { + "epoch": 0.02, + "grad_norm": 2.463680463708773, + "learning_rate": 7.97071129707113e-06, + "loss": 0.5255, + "step": 381 + }, + { + "epoch": 0.02, + "grad_norm": 2.891988986891982, + "learning_rate": 7.991631799163181e-06, + "loss": 0.4869, + "step": 382 + }, + { + "epoch": 0.02, + "grad_norm": 2.867591904677801, + "learning_rate": 8.01255230125523e-06, + "loss": 0.4612, + "step": 383 + }, + { + "epoch": 0.02, + "grad_norm": 2.430308944036147, + "learning_rate": 8.033472803347281e-06, + "loss": 0.5192, + "step": 384 + }, + { + "epoch": 0.02, + "grad_norm": 2.747116918822297, + "learning_rate": 8.054393305439332e-06, + "loss": 0.5017, + "step": 385 + }, + { + "epoch": 0.02, + "grad_norm": 2.085915787920984, + "learning_rate": 8.075313807531382e-06, + "loss": 0.473, + "step": 386 + }, + { + "epoch": 0.02, + "grad_norm": 3.3728873767492407, + "learning_rate": 8.096234309623433e-06, + "loss": 0.4877, + "step": 387 + }, + { + "epoch": 0.02, + "grad_norm": 3.5845719704457917, + "learning_rate": 8.117154811715482e-06, + "loss": 0.5671, + "step": 388 + }, + { + "epoch": 0.02, + "grad_norm": 1.9173579598015733, + "learning_rate": 8.138075313807532e-06, + "loss": 0.4929, + "step": 389 + }, + { + "epoch": 0.02, + "grad_norm": 1.883561647904503, + "learning_rate": 8.158995815899581e-06, + "loss": 0.4941, + "step": 390 + }, + { + "epoch": 0.02, + "grad_norm": 2.940398886539253, + "learning_rate": 8.179916317991633e-06, + "loss": 0.4837, + "step": 391 + }, + { + "epoch": 0.02, + "grad_norm": 1.961712747262717, + "learning_rate": 8.200836820083682e-06, + "loss": 0.4857, + "step": 392 + }, + { + "epoch": 0.02, + "grad_norm": 2.7196777020309737, + "learning_rate": 8.221757322175733e-06, + "loss": 0.5571, + "step": 393 + }, + { + "epoch": 0.02, + "grad_norm": 2.3298620477337977, + "learning_rate": 8.242677824267783e-06, + "loss": 0.5027, + "step": 394 + }, + { + "epoch": 0.02, + "grad_norm": 2.6061366433259003, + "learning_rate": 8.263598326359832e-06, + "loss": 0.4978, + "step": 395 + }, + { + "epoch": 0.02, + "grad_norm": 2.0405769078662233, + "learning_rate": 8.284518828451885e-06, + "loss": 0.4555, + "step": 396 + }, + { + "epoch": 0.02, + "grad_norm": 2.8937000549595946, + "learning_rate": 8.305439330543934e-06, + "loss": 0.4902, + "step": 397 + }, + { + "epoch": 0.03, + "grad_norm": 1.972537027473449, + "learning_rate": 8.326359832635984e-06, + "loss": 0.4724, + "step": 398 + }, + { + "epoch": 0.03, + "grad_norm": 6.187539954145826, + "learning_rate": 8.347280334728035e-06, + "loss": 0.4782, + "step": 399 + }, + { + "epoch": 0.03, + "grad_norm": 3.5411039017670323, + "learning_rate": 8.368200836820084e-06, + "loss": 0.5113, + "step": 400 + }, + { + "epoch": 0.03, + "grad_norm": 2.5085262051947974, + "learning_rate": 8.389121338912136e-06, + "loss": 0.4803, + "step": 401 + }, + { + "epoch": 0.03, + "grad_norm": 2.848215888898236, + "learning_rate": 8.410041841004185e-06, + "loss": 0.4969, + "step": 402 + }, + { + "epoch": 0.03, + "grad_norm": 3.679526385946745, + "learning_rate": 8.430962343096235e-06, + "loss": 0.469, + "step": 403 + }, + { + "epoch": 0.03, + "grad_norm": 2.1049472628050294, + "learning_rate": 8.451882845188284e-06, + "loss": 0.4728, + "step": 404 + }, + { + "epoch": 0.03, + "grad_norm": 2.378765128456705, + "learning_rate": 8.472803347280335e-06, + "loss": 0.4759, + "step": 405 + }, + { + "epoch": 0.03, + "grad_norm": 2.8512740769910905, + "learning_rate": 8.493723849372385e-06, + "loss": 0.5282, + "step": 406 + }, + { + "epoch": 0.03, + "grad_norm": 2.277942641095943, + "learning_rate": 8.514644351464436e-06, + "loss": 0.4712, + "step": 407 + }, + { + "epoch": 0.03, + "grad_norm": 15.443730718018829, + "learning_rate": 8.535564853556487e-06, + "loss": 0.4821, + "step": 408 + }, + { + "epoch": 0.03, + "grad_norm": 2.321172925160859, + "learning_rate": 8.556485355648536e-06, + "loss": 0.4636, + "step": 409 + }, + { + "epoch": 0.03, + "grad_norm": 1.8545919242223279, + "learning_rate": 8.577405857740586e-06, + "loss": 0.4965, + "step": 410 + }, + { + "epoch": 0.03, + "grad_norm": 3.4878827595054522, + "learning_rate": 8.598326359832637e-06, + "loss": 0.4793, + "step": 411 + }, + { + "epoch": 0.03, + "grad_norm": 4.266346876657598, + "learning_rate": 8.619246861924687e-06, + "loss": 0.4922, + "step": 412 + }, + { + "epoch": 0.03, + "grad_norm": 2.5792097367050015, + "learning_rate": 8.640167364016738e-06, + "loss": 0.5004, + "step": 413 + }, + { + "epoch": 0.03, + "grad_norm": 2.009757320195426, + "learning_rate": 8.661087866108787e-06, + "loss": 0.5171, + "step": 414 + }, + { + "epoch": 0.03, + "grad_norm": 2.2238853932080604, + "learning_rate": 8.682008368200837e-06, + "loss": 0.5391, + "step": 415 + }, + { + "epoch": 0.03, + "grad_norm": 2.2712477846853827, + "learning_rate": 8.702928870292888e-06, + "loss": 0.5026, + "step": 416 + }, + { + "epoch": 0.03, + "grad_norm": 2.1988046779112986, + "learning_rate": 8.723849372384939e-06, + "loss": 0.4858, + "step": 417 + }, + { + "epoch": 0.03, + "grad_norm": 2.406534027788669, + "learning_rate": 8.744769874476987e-06, + "loss": 0.4656, + "step": 418 + }, + { + "epoch": 0.03, + "grad_norm": 2.163961435074915, + "learning_rate": 8.765690376569038e-06, + "loss": 0.5427, + "step": 419 + }, + { + "epoch": 0.03, + "grad_norm": 2.3597195024144497, + "learning_rate": 8.786610878661089e-06, + "loss": 0.4897, + "step": 420 + }, + { + "epoch": 0.03, + "grad_norm": 2.168965457878025, + "learning_rate": 8.80753138075314e-06, + "loss": 0.479, + "step": 421 + }, + { + "epoch": 0.03, + "grad_norm": 2.5988091819064887, + "learning_rate": 8.82845188284519e-06, + "loss": 0.5255, + "step": 422 + }, + { + "epoch": 0.03, + "grad_norm": 3.7920088032537924, + "learning_rate": 8.849372384937239e-06, + "loss": 0.4933, + "step": 423 + }, + { + "epoch": 0.03, + "grad_norm": 1.983311565053605, + "learning_rate": 8.87029288702929e-06, + "loss": 0.4882, + "step": 424 + }, + { + "epoch": 0.03, + "grad_norm": 2.6590108562139867, + "learning_rate": 8.89121338912134e-06, + "loss": 0.5219, + "step": 425 + }, + { + "epoch": 0.03, + "grad_norm": 2.1317040286033073, + "learning_rate": 8.91213389121339e-06, + "loss": 0.4815, + "step": 426 + }, + { + "epoch": 0.03, + "grad_norm": 4.274968257780614, + "learning_rate": 8.933054393305441e-06, + "loss": 0.4822, + "step": 427 + }, + { + "epoch": 0.03, + "grad_norm": 3.686968625635076, + "learning_rate": 8.95397489539749e-06, + "loss": 0.4893, + "step": 428 + }, + { + "epoch": 0.03, + "grad_norm": 2.5596408404374777, + "learning_rate": 8.97489539748954e-06, + "loss": 0.4605, + "step": 429 + }, + { + "epoch": 0.03, + "grad_norm": 17.2740122622564, + "learning_rate": 8.995815899581591e-06, + "loss": 0.5216, + "step": 430 + }, + { + "epoch": 0.03, + "grad_norm": 2.921884062267339, + "learning_rate": 9.016736401673642e-06, + "loss": 0.4743, + "step": 431 + }, + { + "epoch": 0.03, + "grad_norm": 2.4626916917319113, + "learning_rate": 9.03765690376569e-06, + "loss": 0.4867, + "step": 432 + }, + { + "epoch": 0.03, + "grad_norm": 2.073554583318843, + "learning_rate": 9.058577405857741e-06, + "loss": 0.4803, + "step": 433 + }, + { + "epoch": 0.03, + "grad_norm": 1.6827488402593347, + "learning_rate": 9.079497907949792e-06, + "loss": 0.4818, + "step": 434 + }, + { + "epoch": 0.03, + "grad_norm": 4.575804476795184, + "learning_rate": 9.10041841004184e-06, + "loss": 0.4579, + "step": 435 + }, + { + "epoch": 0.03, + "grad_norm": 1.9555600587994975, + "learning_rate": 9.121338912133893e-06, + "loss": 0.4641, + "step": 436 + }, + { + "epoch": 0.03, + "grad_norm": 1.9832609347832768, + "learning_rate": 9.142259414225942e-06, + "loss": 0.4641, + "step": 437 + }, + { + "epoch": 0.03, + "grad_norm": 2.5867908607748533, + "learning_rate": 9.163179916317992e-06, + "loss": 0.4515, + "step": 438 + }, + { + "epoch": 0.03, + "grad_norm": 2.206710679839185, + "learning_rate": 9.184100418410043e-06, + "loss": 0.5006, + "step": 439 + }, + { + "epoch": 0.03, + "grad_norm": 1.877142684214976, + "learning_rate": 9.205020920502092e-06, + "loss": 0.4721, + "step": 440 + }, + { + "epoch": 0.03, + "grad_norm": 1.8918168834279614, + "learning_rate": 9.225941422594144e-06, + "loss": 0.4877, + "step": 441 + }, + { + "epoch": 0.03, + "grad_norm": 2.2351645066296113, + "learning_rate": 9.246861924686193e-06, + "loss": 0.4819, + "step": 442 + }, + { + "epoch": 0.03, + "grad_norm": 24.542472415486223, + "learning_rate": 9.267782426778244e-06, + "loss": 0.5727, + "step": 443 + }, + { + "epoch": 0.03, + "grad_norm": 1.9593789970001019, + "learning_rate": 9.288702928870293e-06, + "loss": 0.4963, + "step": 444 + }, + { + "epoch": 0.03, + "grad_norm": 1.7693233306816372, + "learning_rate": 9.309623430962343e-06, + "loss": 0.5029, + "step": 445 + }, + { + "epoch": 0.03, + "grad_norm": 2.3267296066825596, + "learning_rate": 9.330543933054394e-06, + "loss": 0.476, + "step": 446 + }, + { + "epoch": 0.03, + "grad_norm": 1.9083774240017217, + "learning_rate": 9.351464435146444e-06, + "loss": 0.5121, + "step": 447 + }, + { + "epoch": 0.03, + "grad_norm": 5.886945427880566, + "learning_rate": 9.372384937238495e-06, + "loss": 0.4735, + "step": 448 + }, + { + "epoch": 0.03, + "grad_norm": 1.8086180482800336, + "learning_rate": 9.393305439330544e-06, + "loss": 0.4796, + "step": 449 + }, + { + "epoch": 0.03, + "grad_norm": 2.5469885536000434, + "learning_rate": 9.414225941422594e-06, + "loss": 0.5053, + "step": 450 + }, + { + "epoch": 0.03, + "grad_norm": 1.7621867233967463, + "learning_rate": 9.435146443514645e-06, + "loss": 0.4892, + "step": 451 + }, + { + "epoch": 0.03, + "grad_norm": 2.3418689487785005, + "learning_rate": 9.456066945606696e-06, + "loss": 0.4976, + "step": 452 + }, + { + "epoch": 0.03, + "grad_norm": 1.7565291667069471, + "learning_rate": 9.476987447698746e-06, + "loss": 0.4702, + "step": 453 + }, + { + "epoch": 0.03, + "grad_norm": 2.7055592954521366, + "learning_rate": 9.497907949790795e-06, + "loss": 0.4975, + "step": 454 + }, + { + "epoch": 0.03, + "grad_norm": 2.125122571133811, + "learning_rate": 9.518828451882846e-06, + "loss": 0.4854, + "step": 455 + }, + { + "epoch": 0.03, + "grad_norm": 1.916873860259975, + "learning_rate": 9.539748953974896e-06, + "loss": 0.4663, + "step": 456 + }, + { + "epoch": 0.03, + "grad_norm": 3.0397319347096645, + "learning_rate": 9.560669456066947e-06, + "loss": 0.4953, + "step": 457 + }, + { + "epoch": 0.03, + "grad_norm": 2.7578682358617486, + "learning_rate": 9.581589958158996e-06, + "loss": 0.501, + "step": 458 + }, + { + "epoch": 0.03, + "grad_norm": 2.93713540725994, + "learning_rate": 9.602510460251046e-06, + "loss": 0.5161, + "step": 459 + }, + { + "epoch": 0.03, + "grad_norm": 2.481061522409949, + "learning_rate": 9.623430962343097e-06, + "loss": 0.4816, + "step": 460 + }, + { + "epoch": 0.03, + "grad_norm": 3.1183860924267104, + "learning_rate": 9.644351464435148e-06, + "loss": 0.488, + "step": 461 + }, + { + "epoch": 0.03, + "grad_norm": 2.5309859152257093, + "learning_rate": 9.665271966527198e-06, + "loss": 0.4803, + "step": 462 + }, + { + "epoch": 0.03, + "grad_norm": 1.6623926328087257, + "learning_rate": 9.686192468619247e-06, + "loss": 0.4625, + "step": 463 + }, + { + "epoch": 0.03, + "grad_norm": 3.226731031134517, + "learning_rate": 9.707112970711298e-06, + "loss": 0.4777, + "step": 464 + }, + { + "epoch": 0.03, + "grad_norm": 1.9974017132717794, + "learning_rate": 9.728033472803348e-06, + "loss": 0.4999, + "step": 465 + }, + { + "epoch": 0.03, + "grad_norm": 1.9782826327473388, + "learning_rate": 9.748953974895399e-06, + "loss": 0.4746, + "step": 466 + }, + { + "epoch": 0.03, + "grad_norm": 4.280861114366562, + "learning_rate": 9.76987447698745e-06, + "loss": 0.4949, + "step": 467 + }, + { + "epoch": 0.03, + "grad_norm": 13.561532321958314, + "learning_rate": 9.790794979079498e-06, + "loss": 0.5523, + "step": 468 + }, + { + "epoch": 0.03, + "grad_norm": 3.6482834591286095, + "learning_rate": 9.811715481171549e-06, + "loss": 0.4785, + "step": 469 + }, + { + "epoch": 0.03, + "grad_norm": 1.788555807619979, + "learning_rate": 9.8326359832636e-06, + "loss": 0.4918, + "step": 470 + }, + { + "epoch": 0.03, + "grad_norm": 1.7239375482508028, + "learning_rate": 9.85355648535565e-06, + "loss": 0.4521, + "step": 471 + }, + { + "epoch": 0.03, + "grad_norm": 2.075375377188315, + "learning_rate": 9.874476987447699e-06, + "loss": 0.4852, + "step": 472 + }, + { + "epoch": 0.03, + "grad_norm": 2.5660880924744864, + "learning_rate": 9.89539748953975e-06, + "loss": 0.4766, + "step": 473 + }, + { + "epoch": 0.03, + "grad_norm": 1.0997494916527482, + "learning_rate": 9.9163179916318e-06, + "loss": 0.4651, + "step": 474 + }, + { + "epoch": 0.03, + "grad_norm": 3.197086697835969, + "learning_rate": 9.937238493723849e-06, + "loss": 0.5007, + "step": 475 + }, + { + "epoch": 0.03, + "grad_norm": 2.509517393333771, + "learning_rate": 9.958158995815901e-06, + "loss": 0.4993, + "step": 476 + }, + { + "epoch": 0.03, + "grad_norm": 1.7187968677337175, + "learning_rate": 9.97907949790795e-06, + "loss": 0.4827, + "step": 477 + }, + { + "epoch": 0.03, + "grad_norm": 2.282998909013323, + "learning_rate": 1e-05, + "loss": 0.5512, + "step": 478 + }, + { + "epoch": 0.03, + "grad_norm": 2.7084881852837643, + "learning_rate": 9.99999989627056e-06, + "loss": 0.4992, + "step": 479 + }, + { + "epoch": 0.03, + "grad_norm": 2.8868923475095896, + "learning_rate": 9.999999585082243e-06, + "loss": 0.5299, + "step": 480 + }, + { + "epoch": 0.03, + "grad_norm": 2.6316744119836013, + "learning_rate": 9.999999066435062e-06, + "loss": 0.5353, + "step": 481 + }, + { + "epoch": 0.03, + "grad_norm": 1.980509745621231, + "learning_rate": 9.99999834032904e-06, + "loss": 0.4819, + "step": 482 + }, + { + "epoch": 0.03, + "grad_norm": 2.7208825151127956, + "learning_rate": 9.999997406764204e-06, + "loss": 0.4674, + "step": 483 + }, + { + "epoch": 0.03, + "grad_norm": 2.930274641225573, + "learning_rate": 9.999996265740595e-06, + "loss": 0.474, + "step": 484 + }, + { + "epoch": 0.03, + "grad_norm": 3.46637256655527, + "learning_rate": 9.99999491725826e-06, + "loss": 0.4848, + "step": 485 + }, + { + "epoch": 0.03, + "grad_norm": 4.962705189780874, + "learning_rate": 9.999993361317255e-06, + "loss": 0.4832, + "step": 486 + }, + { + "epoch": 0.03, + "grad_norm": 3.9013581764155414, + "learning_rate": 9.999991597917645e-06, + "loss": 0.4701, + "step": 487 + }, + { + "epoch": 0.03, + "grad_norm": 3.392350399151913, + "learning_rate": 9.999989627059501e-06, + "loss": 0.4972, + "step": 488 + }, + { + "epoch": 0.03, + "grad_norm": 1.7267714061114867, + "learning_rate": 9.999987448742909e-06, + "loss": 0.4847, + "step": 489 + }, + { + "epoch": 0.03, + "grad_norm": 2.632008755771487, + "learning_rate": 9.999985062967955e-06, + "loss": 0.4746, + "step": 490 + }, + { + "epoch": 0.03, + "grad_norm": 1.6553610090016668, + "learning_rate": 9.99998246973474e-06, + "loss": 0.4669, + "step": 491 + }, + { + "epoch": 0.03, + "grad_norm": 1.8526190594946903, + "learning_rate": 9.999979669043371e-06, + "loss": 0.4777, + "step": 492 + }, + { + "epoch": 0.03, + "grad_norm": 1.9881684812188742, + "learning_rate": 9.999976660893964e-06, + "loss": 0.496, + "step": 493 + }, + { + "epoch": 0.03, + "grad_norm": 2.192218782535668, + "learning_rate": 9.999973445286645e-06, + "loss": 0.5122, + "step": 494 + }, + { + "epoch": 0.03, + "grad_norm": 1.8663884967123074, + "learning_rate": 9.999970022221547e-06, + "loss": 0.4729, + "step": 495 + }, + { + "epoch": 0.03, + "grad_norm": 1.7233098792878956, + "learning_rate": 9.999966391698814e-06, + "loss": 0.5244, + "step": 496 + }, + { + "epoch": 0.03, + "grad_norm": 3.1689089902666683, + "learning_rate": 9.999962553718592e-06, + "loss": 0.5363, + "step": 497 + }, + { + "epoch": 0.03, + "grad_norm": 2.2179232638230273, + "learning_rate": 9.999958508281042e-06, + "loss": 0.4806, + "step": 498 + }, + { + "epoch": 0.03, + "grad_norm": 7.830937218899892, + "learning_rate": 9.999954255386336e-06, + "loss": 0.5072, + "step": 499 + }, + { + "epoch": 0.03, + "grad_norm": 2.5490979182484326, + "learning_rate": 9.999949795034643e-06, + "loss": 0.4835, + "step": 500 + }, + { + "epoch": 0.03, + "grad_norm": 1.6043327569945123, + "learning_rate": 9.999945127226153e-06, + "loss": 0.473, + "step": 501 + }, + { + "epoch": 0.03, + "grad_norm": 2.0698345439739447, + "learning_rate": 9.999940251961062e-06, + "loss": 0.4923, + "step": 502 + }, + { + "epoch": 0.03, + "grad_norm": 3.404757938812203, + "learning_rate": 9.999935169239564e-06, + "loss": 0.4903, + "step": 503 + }, + { + "epoch": 0.03, + "grad_norm": 2.681536235612271, + "learning_rate": 9.999929879061879e-06, + "loss": 0.4953, + "step": 504 + }, + { + "epoch": 0.03, + "grad_norm": 2.832575757357449, + "learning_rate": 9.999924381428221e-06, + "loss": 0.4759, + "step": 505 + }, + { + "epoch": 0.03, + "grad_norm": 3.6505871660183593, + "learning_rate": 9.99991867633882e-06, + "loss": 0.4857, + "step": 506 + }, + { + "epoch": 0.03, + "grad_norm": 2.2055095409109557, + "learning_rate": 9.999912763793912e-06, + "loss": 0.5042, + "step": 507 + }, + { + "epoch": 0.03, + "grad_norm": 2.707154179903497, + "learning_rate": 9.999906643793741e-06, + "loss": 0.4825, + "step": 508 + }, + { + "epoch": 0.03, + "grad_norm": 2.032646923161125, + "learning_rate": 9.999900316338566e-06, + "loss": 0.4628, + "step": 509 + }, + { + "epoch": 0.03, + "grad_norm": 2.3106848272052103, + "learning_rate": 9.999893781428643e-06, + "loss": 0.4963, + "step": 510 + }, + { + "epoch": 0.03, + "grad_norm": 2.1321540785077566, + "learning_rate": 9.999887039064248e-06, + "loss": 0.4897, + "step": 511 + }, + { + "epoch": 0.03, + "grad_norm": 1.9729172626842642, + "learning_rate": 9.999880089245659e-06, + "loss": 0.5195, + "step": 512 + }, + { + "epoch": 0.03, + "grad_norm": 1.9921180065923847, + "learning_rate": 9.999872931973163e-06, + "loss": 0.4993, + "step": 513 + }, + { + "epoch": 0.03, + "grad_norm": 1.4222433489211916, + "learning_rate": 9.999865567247058e-06, + "loss": 0.4661, + "step": 514 + }, + { + "epoch": 0.03, + "grad_norm": 2.4729050680514097, + "learning_rate": 9.999857995067652e-06, + "loss": 0.4864, + "step": 515 + }, + { + "epoch": 0.03, + "grad_norm": 3.9891660124937793, + "learning_rate": 9.999850215435255e-06, + "loss": 0.4939, + "step": 516 + }, + { + "epoch": 0.03, + "grad_norm": 2.6145310260308054, + "learning_rate": 9.999842228350191e-06, + "loss": 0.5025, + "step": 517 + }, + { + "epoch": 0.03, + "grad_norm": 1.6840151848583955, + "learning_rate": 9.999834033812795e-06, + "loss": 0.4761, + "step": 518 + }, + { + "epoch": 0.03, + "grad_norm": 1.9479719370056958, + "learning_rate": 9.999825631823404e-06, + "loss": 0.445, + "step": 519 + }, + { + "epoch": 0.03, + "grad_norm": 1.9412623736620975, + "learning_rate": 9.999817022382365e-06, + "loss": 0.424, + "step": 520 + }, + { + "epoch": 0.03, + "grad_norm": 8.256004528716746, + "learning_rate": 9.999808205490038e-06, + "loss": 0.525, + "step": 521 + }, + { + "epoch": 0.03, + "grad_norm": 2.0329283510561034, + "learning_rate": 9.999799181146787e-06, + "loss": 0.4958, + "step": 522 + }, + { + "epoch": 0.03, + "grad_norm": 3.29614027965801, + "learning_rate": 9.999789949352988e-06, + "loss": 0.4886, + "step": 523 + }, + { + "epoch": 0.03, + "grad_norm": 2.8802866180224957, + "learning_rate": 9.999780510109023e-06, + "loss": 0.4997, + "step": 524 + }, + { + "epoch": 0.03, + "grad_norm": 2.958253765695981, + "learning_rate": 9.999770863415286e-06, + "loss": 0.5277, + "step": 525 + }, + { + "epoch": 0.03, + "grad_norm": 2.2046301239602006, + "learning_rate": 9.999761009272174e-06, + "loss": 0.4725, + "step": 526 + }, + { + "epoch": 0.03, + "grad_norm": 2.5465710295017514, + "learning_rate": 9.999750947680096e-06, + "loss": 0.4969, + "step": 527 + }, + { + "epoch": 0.03, + "grad_norm": 2.031761355371023, + "learning_rate": 9.999740678639471e-06, + "loss": 0.5108, + "step": 528 + }, + { + "epoch": 0.03, + "grad_norm": 2.0828099016664163, + "learning_rate": 9.999730202150726e-06, + "loss": 0.4825, + "step": 529 + }, + { + "epoch": 0.03, + "grad_norm": 1.9273603673172606, + "learning_rate": 9.999719518214293e-06, + "loss": 0.4704, + "step": 530 + }, + { + "epoch": 0.03, + "grad_norm": 2.5018699726099425, + "learning_rate": 9.999708626830617e-06, + "loss": 0.4905, + "step": 531 + }, + { + "epoch": 0.03, + "grad_norm": 4.504510016209695, + "learning_rate": 9.999697528000151e-06, + "loss": 0.5068, + "step": 532 + }, + { + "epoch": 0.03, + "grad_norm": 2.626186179716801, + "learning_rate": 9.999686221723353e-06, + "loss": 0.4726, + "step": 533 + }, + { + "epoch": 0.03, + "grad_norm": 1.9030201544718344, + "learning_rate": 9.999674708000692e-06, + "loss": 0.4925, + "step": 534 + }, + { + "epoch": 0.03, + "grad_norm": 2.297824608304791, + "learning_rate": 9.99966298683265e-06, + "loss": 0.5128, + "step": 535 + }, + { + "epoch": 0.03, + "grad_norm": 2.4063063427872162, + "learning_rate": 9.999651058219708e-06, + "loss": 0.4323, + "step": 536 + }, + { + "epoch": 0.03, + "grad_norm": 1.9257458540615224, + "learning_rate": 9.999638922162363e-06, + "loss": 0.463, + "step": 537 + }, + { + "epoch": 0.03, + "grad_norm": 3.9692464760787463, + "learning_rate": 9.99962657866112e-06, + "loss": 0.4524, + "step": 538 + }, + { + "epoch": 0.03, + "grad_norm": 2.5905931244414884, + "learning_rate": 9.999614027716488e-06, + "loss": 0.5167, + "step": 539 + }, + { + "epoch": 0.03, + "grad_norm": 6.410373295565918, + "learning_rate": 9.999601269328994e-06, + "loss": 0.4753, + "step": 540 + }, + { + "epoch": 0.03, + "grad_norm": 2.5532275741843966, + "learning_rate": 9.99958830349916e-06, + "loss": 0.463, + "step": 541 + }, + { + "epoch": 0.03, + "grad_norm": 2.8810677571179277, + "learning_rate": 9.999575130227526e-06, + "loss": 0.4694, + "step": 542 + }, + { + "epoch": 0.03, + "grad_norm": 2.9790666546297664, + "learning_rate": 9.999561749514642e-06, + "loss": 0.4966, + "step": 543 + }, + { + "epoch": 0.03, + "grad_norm": 2.9908036371688174, + "learning_rate": 9.999548161361058e-06, + "loss": 0.457, + "step": 544 + }, + { + "epoch": 0.03, + "grad_norm": 2.997095918538681, + "learning_rate": 9.999534365767342e-06, + "loss": 0.5354, + "step": 545 + }, + { + "epoch": 0.03, + "grad_norm": 2.08301974810039, + "learning_rate": 9.999520362734065e-06, + "loss": 0.4981, + "step": 546 + }, + { + "epoch": 0.03, + "grad_norm": 2.231020304541261, + "learning_rate": 9.999506152261809e-06, + "loss": 0.4632, + "step": 547 + }, + { + "epoch": 0.03, + "grad_norm": 3.551668641095965, + "learning_rate": 9.999491734351162e-06, + "loss": 0.4733, + "step": 548 + }, + { + "epoch": 0.03, + "grad_norm": 9.180686411917383, + "learning_rate": 9.999477109002722e-06, + "loss": 0.4842, + "step": 549 + }, + { + "epoch": 0.03, + "grad_norm": 6.72808462322627, + "learning_rate": 9.999462276217096e-06, + "loss": 0.4543, + "step": 550 + }, + { + "epoch": 0.03, + "grad_norm": 2.2164396078924913, + "learning_rate": 9.999447235994902e-06, + "loss": 0.4276, + "step": 551 + }, + { + "epoch": 0.03, + "grad_norm": 2.0792152173641463, + "learning_rate": 9.999431988336762e-06, + "loss": 0.4595, + "step": 552 + }, + { + "epoch": 0.03, + "grad_norm": 3.4026681979538793, + "learning_rate": 9.999416533243309e-06, + "loss": 0.465, + "step": 553 + }, + { + "epoch": 0.03, + "grad_norm": 4.006484644283491, + "learning_rate": 9.999400870715182e-06, + "loss": 0.4442, + "step": 554 + }, + { + "epoch": 0.03, + "grad_norm": 2.6070388377684557, + "learning_rate": 9.999385000753034e-06, + "loss": 0.4676, + "step": 555 + }, + { + "epoch": 0.03, + "grad_norm": 2.040094846791098, + "learning_rate": 9.999368923357525e-06, + "loss": 0.4711, + "step": 556 + }, + { + "epoch": 0.04, + "grad_norm": 3.0506551892026024, + "learning_rate": 9.999352638529316e-06, + "loss": 0.4602, + "step": 557 + }, + { + "epoch": 0.04, + "grad_norm": 4.770924960955531, + "learning_rate": 9.999336146269088e-06, + "loss": 0.4597, + "step": 558 + }, + { + "epoch": 0.04, + "grad_norm": 2.444707020009515, + "learning_rate": 9.999319446577523e-06, + "loss": 0.4944, + "step": 559 + }, + { + "epoch": 0.04, + "grad_norm": 2.0794848971406736, + "learning_rate": 9.999302539455314e-06, + "loss": 0.4328, + "step": 560 + }, + { + "epoch": 0.04, + "grad_norm": 1.968220333607118, + "learning_rate": 9.999285424903163e-06, + "loss": 0.4923, + "step": 561 + }, + { + "epoch": 0.04, + "grad_norm": 2.2070225116189857, + "learning_rate": 9.99926810292178e-06, + "loss": 0.4732, + "step": 562 + }, + { + "epoch": 0.04, + "grad_norm": 1.9672062941744224, + "learning_rate": 9.999250573511883e-06, + "loss": 0.4457, + "step": 563 + }, + { + "epoch": 0.04, + "grad_norm": 3.7397800191770814, + "learning_rate": 9.999232836674202e-06, + "loss": 0.42, + "step": 564 + }, + { + "epoch": 0.04, + "grad_norm": 2.955989009342422, + "learning_rate": 9.99921489240947e-06, + "loss": 0.4446, + "step": 565 + }, + { + "epoch": 0.04, + "grad_norm": 2.9205981841495485, + "learning_rate": 9.999196740718432e-06, + "loss": 0.438, + "step": 566 + }, + { + "epoch": 0.04, + "grad_norm": 10.522660114624852, + "learning_rate": 9.999178381601842e-06, + "loss": 0.4845, + "step": 567 + }, + { + "epoch": 0.04, + "grad_norm": 2.4620507763502344, + "learning_rate": 9.999159815060462e-06, + "loss": 0.4804, + "step": 568 + }, + { + "epoch": 0.04, + "grad_norm": 2.337410846560435, + "learning_rate": 9.99914104109506e-06, + "loss": 0.4626, + "step": 569 + }, + { + "epoch": 0.04, + "grad_norm": 3.8850082114866145, + "learning_rate": 9.999122059706418e-06, + "loss": 0.4671, + "step": 570 + }, + { + "epoch": 0.04, + "grad_norm": 1.8278181600093955, + "learning_rate": 9.999102870895323e-06, + "loss": 0.4578, + "step": 571 + }, + { + "epoch": 0.04, + "grad_norm": 3.489333267750027, + "learning_rate": 9.99908347466257e-06, + "loss": 0.4449, + "step": 572 + }, + { + "epoch": 0.04, + "grad_norm": 2.618328682479948, + "learning_rate": 9.999063871008963e-06, + "loss": 0.4658, + "step": 573 + }, + { + "epoch": 0.04, + "grad_norm": 2.9802982929547728, + "learning_rate": 9.999044059935319e-06, + "loss": 0.4391, + "step": 574 + }, + { + "epoch": 0.04, + "grad_norm": 1.8110250724553485, + "learning_rate": 9.999024041442455e-06, + "loss": 0.4685, + "step": 575 + }, + { + "epoch": 0.04, + "grad_norm": 3.2616464759803057, + "learning_rate": 9.999003815531206e-06, + "loss": 0.457, + "step": 576 + }, + { + "epoch": 0.04, + "grad_norm": 2.187140130650438, + "learning_rate": 9.998983382202408e-06, + "loss": 0.4411, + "step": 577 + }, + { + "epoch": 0.04, + "grad_norm": 2.3819418136366752, + "learning_rate": 9.998962741456912e-06, + "loss": 0.449, + "step": 578 + }, + { + "epoch": 0.04, + "grad_norm": 2.7313485182956, + "learning_rate": 9.998941893295572e-06, + "loss": 0.4712, + "step": 579 + }, + { + "epoch": 0.04, + "grad_norm": 2.063108026944435, + "learning_rate": 9.998920837719254e-06, + "loss": 0.46, + "step": 580 + }, + { + "epoch": 0.04, + "grad_norm": 2.088599257715193, + "learning_rate": 9.998899574728832e-06, + "loss": 0.4623, + "step": 581 + }, + { + "epoch": 0.04, + "grad_norm": 2.208467206475676, + "learning_rate": 9.998878104325186e-06, + "loss": 0.5056, + "step": 582 + }, + { + "epoch": 0.04, + "grad_norm": 3.566290177886027, + "learning_rate": 9.998856426509208e-06, + "loss": 0.4744, + "step": 583 + }, + { + "epoch": 0.04, + "grad_norm": 2.133215283066703, + "learning_rate": 9.998834541281798e-06, + "loss": 0.4992, + "step": 584 + }, + { + "epoch": 0.04, + "grad_norm": 3.530779373872894, + "learning_rate": 9.998812448643866e-06, + "loss": 0.4671, + "step": 585 + }, + { + "epoch": 0.04, + "grad_norm": 2.6625767048203386, + "learning_rate": 9.998790148596326e-06, + "loss": 0.4315, + "step": 586 + }, + { + "epoch": 0.04, + "grad_norm": 1.8671590408093837, + "learning_rate": 9.998767641140103e-06, + "loss": 0.4998, + "step": 587 + }, + { + "epoch": 0.04, + "grad_norm": 3.0928109248997626, + "learning_rate": 9.998744926276132e-06, + "loss": 0.4703, + "step": 588 + }, + { + "epoch": 0.04, + "grad_norm": 3.2506176259906834, + "learning_rate": 9.998722004005356e-06, + "loss": 0.4782, + "step": 589 + }, + { + "epoch": 0.04, + "grad_norm": 1.9063269538020393, + "learning_rate": 9.998698874328725e-06, + "loss": 0.4609, + "step": 590 + }, + { + "epoch": 0.04, + "grad_norm": 2.027797630615375, + "learning_rate": 9.998675537247199e-06, + "loss": 0.4696, + "step": 591 + }, + { + "epoch": 0.04, + "grad_norm": 3.0533268864376435, + "learning_rate": 9.998651992761746e-06, + "loss": 0.4694, + "step": 592 + }, + { + "epoch": 0.04, + "grad_norm": 2.4804998394103683, + "learning_rate": 9.998628240873342e-06, + "loss": 0.4419, + "step": 593 + }, + { + "epoch": 0.04, + "grad_norm": 2.795088503320507, + "learning_rate": 9.998604281582975e-06, + "loss": 0.4994, + "step": 594 + }, + { + "epoch": 0.04, + "grad_norm": 2.9220824194919697, + "learning_rate": 9.998580114891638e-06, + "loss": 0.4541, + "step": 595 + }, + { + "epoch": 0.04, + "grad_norm": 1.8091027763660537, + "learning_rate": 9.998555740800335e-06, + "loss": 0.4528, + "step": 596 + }, + { + "epoch": 0.04, + "grad_norm": 20.594109389576467, + "learning_rate": 9.998531159310074e-06, + "loss": 0.462, + "step": 597 + }, + { + "epoch": 0.04, + "grad_norm": 2.0457223813804037, + "learning_rate": 9.998506370421876e-06, + "loss": 0.4449, + "step": 598 + }, + { + "epoch": 0.04, + "grad_norm": 2.364388799628461, + "learning_rate": 9.998481374136773e-06, + "loss": 0.4438, + "step": 599 + }, + { + "epoch": 0.04, + "grad_norm": 2.225475904704627, + "learning_rate": 9.998456170455796e-06, + "loss": 0.4612, + "step": 600 + }, + { + "epoch": 0.04, + "grad_norm": 4.540161650441191, + "learning_rate": 9.998430759379999e-06, + "loss": 0.458, + "step": 601 + }, + { + "epoch": 0.04, + "grad_norm": 4.602362536891887, + "learning_rate": 9.998405140910427e-06, + "loss": 0.4778, + "step": 602 + }, + { + "epoch": 0.04, + "grad_norm": 2.5625773310302615, + "learning_rate": 9.99837931504815e-06, + "loss": 0.4648, + "step": 603 + }, + { + "epoch": 0.04, + "grad_norm": 1.3451024621950423, + "learning_rate": 9.998353281794235e-06, + "loss": 0.4826, + "step": 604 + }, + { + "epoch": 0.04, + "grad_norm": 2.599579513154246, + "learning_rate": 9.998327041149766e-06, + "loss": 0.4693, + "step": 605 + }, + { + "epoch": 0.04, + "grad_norm": 2.9164031153911307, + "learning_rate": 9.998300593115831e-06, + "loss": 0.4559, + "step": 606 + }, + { + "epoch": 0.04, + "grad_norm": 1.9919266447646287, + "learning_rate": 9.998273937693525e-06, + "loss": 0.4275, + "step": 607 + }, + { + "epoch": 0.04, + "grad_norm": 2.858787089836137, + "learning_rate": 9.998247074883956e-06, + "loss": 0.4338, + "step": 608 + }, + { + "epoch": 0.04, + "grad_norm": 2.952562833224354, + "learning_rate": 9.998220004688238e-06, + "loss": 0.5, + "step": 609 + }, + { + "epoch": 0.04, + "grad_norm": 1.9821066259376316, + "learning_rate": 9.998192727107492e-06, + "loss": 0.4838, + "step": 610 + }, + { + "epoch": 0.04, + "grad_norm": 1.8465287076621513, + "learning_rate": 9.998165242142855e-06, + "loss": 0.4511, + "step": 611 + }, + { + "epoch": 0.04, + "grad_norm": 1.8975261047111254, + "learning_rate": 9.998137549795462e-06, + "loss": 0.4217, + "step": 612 + }, + { + "epoch": 0.04, + "grad_norm": 4.664265661419895, + "learning_rate": 9.998109650066465e-06, + "loss": 0.4556, + "step": 613 + }, + { + "epoch": 0.04, + "grad_norm": 2.024994077208901, + "learning_rate": 9.998081542957021e-06, + "loss": 0.4598, + "step": 614 + }, + { + "epoch": 0.04, + "grad_norm": 2.332872265825148, + "learning_rate": 9.998053228468296e-06, + "loss": 0.4686, + "step": 615 + }, + { + "epoch": 0.04, + "grad_norm": 1.6914635597004524, + "learning_rate": 9.998024706601467e-06, + "loss": 0.4297, + "step": 616 + }, + { + "epoch": 0.04, + "grad_norm": 1.9640858067208848, + "learning_rate": 9.997995977357712e-06, + "loss": 0.4164, + "step": 617 + }, + { + "epoch": 0.04, + "grad_norm": 1.731815997672232, + "learning_rate": 9.99796704073823e-06, + "loss": 0.4402, + "step": 618 + }, + { + "epoch": 0.04, + "grad_norm": 1.4062689009583236, + "learning_rate": 9.997937896744216e-06, + "loss": 0.4367, + "step": 619 + }, + { + "epoch": 0.04, + "grad_norm": 1.8611460221301015, + "learning_rate": 9.997908545376883e-06, + "loss": 0.4446, + "step": 620 + }, + { + "epoch": 0.04, + "grad_norm": 1.9255996259506856, + "learning_rate": 9.997878986637446e-06, + "loss": 0.4364, + "step": 621 + }, + { + "epoch": 0.04, + "grad_norm": 1.832861131931658, + "learning_rate": 9.997849220527132e-06, + "loss": 0.461, + "step": 622 + }, + { + "epoch": 0.04, + "grad_norm": 2.2911431679187215, + "learning_rate": 9.997819247047177e-06, + "loss": 0.439, + "step": 623 + }, + { + "epoch": 0.04, + "grad_norm": 2.1394718060547646, + "learning_rate": 9.997789066198824e-06, + "loss": 0.459, + "step": 624 + }, + { + "epoch": 0.04, + "grad_norm": 1.768603478672137, + "learning_rate": 9.997758677983327e-06, + "loss": 0.5346, + "step": 625 + }, + { + "epoch": 0.04, + "grad_norm": 2.9595764953684403, + "learning_rate": 9.997728082401946e-06, + "loss": 0.4677, + "step": 626 + }, + { + "epoch": 0.04, + "grad_norm": 2.689702247590821, + "learning_rate": 9.997697279455947e-06, + "loss": 0.4479, + "step": 627 + }, + { + "epoch": 0.04, + "grad_norm": 1.8691749958551407, + "learning_rate": 9.997666269146612e-06, + "loss": 0.4684, + "step": 628 + }, + { + "epoch": 0.04, + "grad_norm": 1.588557160746207, + "learning_rate": 9.997635051475227e-06, + "loss": 0.4452, + "step": 629 + }, + { + "epoch": 0.04, + "grad_norm": 2.332900061870853, + "learning_rate": 9.997603626443088e-06, + "loss": 0.4617, + "step": 630 + }, + { + "epoch": 0.04, + "grad_norm": 1.7491579573049527, + "learning_rate": 9.997571994051497e-06, + "loss": 0.4861, + "step": 631 + }, + { + "epoch": 0.04, + "grad_norm": 5.600134242711322, + "learning_rate": 9.997540154301766e-06, + "loss": 0.4686, + "step": 632 + }, + { + "epoch": 0.04, + "grad_norm": 2.2919323119550588, + "learning_rate": 9.99750810719522e-06, + "loss": 0.4816, + "step": 633 + }, + { + "epoch": 0.04, + "grad_norm": 2.3933176511759915, + "learning_rate": 9.997475852733183e-06, + "loss": 0.4721, + "step": 634 + }, + { + "epoch": 0.04, + "grad_norm": 1.7704070087461385, + "learning_rate": 9.997443390916999e-06, + "loss": 0.426, + "step": 635 + }, + { + "epoch": 0.04, + "grad_norm": 1.6750412033533468, + "learning_rate": 9.99741072174801e-06, + "loss": 0.4595, + "step": 636 + }, + { + "epoch": 0.04, + "grad_norm": 2.7392006563073865, + "learning_rate": 9.997377845227577e-06, + "loss": 0.4473, + "step": 637 + }, + { + "epoch": 0.04, + "grad_norm": 1.982753584421421, + "learning_rate": 9.997344761357057e-06, + "loss": 0.4834, + "step": 638 + }, + { + "epoch": 0.04, + "grad_norm": 2.542893874456173, + "learning_rate": 9.997311470137828e-06, + "loss": 0.4372, + "step": 639 + }, + { + "epoch": 0.04, + "grad_norm": 2.3783093034128138, + "learning_rate": 9.99727797157127e-06, + "loss": 0.4505, + "step": 640 + }, + { + "epoch": 0.04, + "grad_norm": 1.863024260620193, + "learning_rate": 9.997244265658774e-06, + "loss": 0.4273, + "step": 641 + }, + { + "epoch": 0.04, + "grad_norm": 2.010156445925229, + "learning_rate": 9.997210352401736e-06, + "loss": 0.4213, + "step": 642 + }, + { + "epoch": 0.04, + "grad_norm": 1.9513288428258124, + "learning_rate": 9.997176231801565e-06, + "loss": 0.4659, + "step": 643 + }, + { + "epoch": 0.04, + "grad_norm": 1.5284652329277972, + "learning_rate": 9.997141903859675e-06, + "loss": 0.5051, + "step": 644 + }, + { + "epoch": 0.04, + "grad_norm": 2.767869147714192, + "learning_rate": 9.997107368577492e-06, + "loss": 0.4745, + "step": 645 + }, + { + "epoch": 0.04, + "grad_norm": 2.1508694750132054, + "learning_rate": 9.99707262595645e-06, + "loss": 0.4139, + "step": 646 + }, + { + "epoch": 0.04, + "grad_norm": 2.11583835038122, + "learning_rate": 9.997037675997987e-06, + "loss": 0.4698, + "step": 647 + }, + { + "epoch": 0.04, + "grad_norm": 2.515650691332356, + "learning_rate": 9.997002518703556e-06, + "loss": 0.455, + "step": 648 + }, + { + "epoch": 0.04, + "grad_norm": 1.9175224977998657, + "learning_rate": 9.996967154074615e-06, + "loss": 0.4673, + "step": 649 + }, + { + "epoch": 0.04, + "grad_norm": 1.9242834891154732, + "learning_rate": 9.99693158211263e-06, + "loss": 0.4762, + "step": 650 + }, + { + "epoch": 0.04, + "grad_norm": 1.9173214362306026, + "learning_rate": 9.99689580281908e-06, + "loss": 0.4902, + "step": 651 + }, + { + "epoch": 0.04, + "grad_norm": 2.2670351609030837, + "learning_rate": 9.996859816195446e-06, + "loss": 0.4644, + "step": 652 + }, + { + "epoch": 0.04, + "grad_norm": 1.8798348386074981, + "learning_rate": 9.996823622243224e-06, + "loss": 0.4319, + "step": 653 + }, + { + "epoch": 0.04, + "grad_norm": 2.8049037930911456, + "learning_rate": 9.996787220963915e-06, + "loss": 0.4488, + "step": 654 + }, + { + "epoch": 0.04, + "grad_norm": 2.365755338439417, + "learning_rate": 9.996750612359026e-06, + "loss": 0.4799, + "step": 655 + }, + { + "epoch": 0.04, + "grad_norm": 6.477361241354372, + "learning_rate": 9.99671379643008e-06, + "loss": 0.4389, + "step": 656 + }, + { + "epoch": 0.04, + "grad_norm": 1.7752975651097125, + "learning_rate": 9.996676773178604e-06, + "loss": 0.4557, + "step": 657 + }, + { + "epoch": 0.04, + "grad_norm": 1.6547541148359164, + "learning_rate": 9.996639542606135e-06, + "loss": 0.4828, + "step": 658 + }, + { + "epoch": 0.04, + "grad_norm": 2.2922398431103956, + "learning_rate": 9.996602104714215e-06, + "loss": 0.4689, + "step": 659 + }, + { + "epoch": 0.04, + "grad_norm": 1.6653628345815956, + "learning_rate": 9.996564459504398e-06, + "loss": 0.4443, + "step": 660 + }, + { + "epoch": 0.04, + "grad_norm": 3.5935263181365933, + "learning_rate": 9.996526606978248e-06, + "loss": 0.4537, + "step": 661 + }, + { + "epoch": 0.04, + "grad_norm": 2.212712442498005, + "learning_rate": 9.996488547137335e-06, + "loss": 0.4419, + "step": 662 + }, + { + "epoch": 0.04, + "grad_norm": 1.9478331010182452, + "learning_rate": 9.996450279983236e-06, + "loss": 0.4565, + "step": 663 + }, + { + "epoch": 0.04, + "grad_norm": 2.5278859392506945, + "learning_rate": 9.99641180551754e-06, + "loss": 0.4433, + "step": 664 + }, + { + "epoch": 0.04, + "grad_norm": 2.027332182723469, + "learning_rate": 9.996373123741843e-06, + "loss": 0.4834, + "step": 665 + }, + { + "epoch": 0.04, + "grad_norm": 2.073061441534379, + "learning_rate": 9.996334234657751e-06, + "loss": 0.4703, + "step": 666 + }, + { + "epoch": 0.04, + "grad_norm": 2.1932471783137433, + "learning_rate": 9.996295138266877e-06, + "loss": 0.4501, + "step": 667 + }, + { + "epoch": 0.04, + "grad_norm": 2.318954452112392, + "learning_rate": 9.996255834570844e-06, + "loss": 0.4785, + "step": 668 + }, + { + "epoch": 0.04, + "grad_norm": 8.867713605240658, + "learning_rate": 9.996216323571283e-06, + "loss": 0.446, + "step": 669 + }, + { + "epoch": 0.04, + "grad_norm": 1.7452091963528424, + "learning_rate": 9.996176605269832e-06, + "loss": 0.5069, + "step": 670 + }, + { + "epoch": 0.04, + "grad_norm": 2.681731236826786, + "learning_rate": 9.996136679668138e-06, + "loss": 0.4774, + "step": 671 + }, + { + "epoch": 0.04, + "grad_norm": 2.5679787343301403, + "learning_rate": 9.99609654676786e-06, + "loss": 0.4378, + "step": 672 + }, + { + "epoch": 0.04, + "grad_norm": 6.285230500434291, + "learning_rate": 9.996056206570662e-06, + "loss": 0.4563, + "step": 673 + }, + { + "epoch": 0.04, + "grad_norm": 2.549352866928325, + "learning_rate": 9.996015659078218e-06, + "loss": 0.4421, + "step": 674 + }, + { + "epoch": 0.04, + "grad_norm": 1.6743044639961289, + "learning_rate": 9.995974904292211e-06, + "loss": 0.4519, + "step": 675 + }, + { + "epoch": 0.04, + "grad_norm": 2.944103245358746, + "learning_rate": 9.995933942214331e-06, + "loss": 0.439, + "step": 676 + }, + { + "epoch": 0.04, + "grad_norm": 2.178578626812516, + "learning_rate": 9.995892772846276e-06, + "loss": 0.4657, + "step": 677 + }, + { + "epoch": 0.04, + "grad_norm": 1.8834378335862514, + "learning_rate": 9.995851396189759e-06, + "loss": 0.4766, + "step": 678 + }, + { + "epoch": 0.04, + "grad_norm": 2.3943764794326166, + "learning_rate": 9.995809812246493e-06, + "loss": 0.4434, + "step": 679 + }, + { + "epoch": 0.04, + "grad_norm": 1.0256693185920414, + "learning_rate": 9.995768021018204e-06, + "loss": 0.4597, + "step": 680 + }, + { + "epoch": 0.04, + "grad_norm": 3.066097959989994, + "learning_rate": 9.995726022506627e-06, + "loss": 0.4364, + "step": 681 + }, + { + "epoch": 0.04, + "grad_norm": 4.765308871003166, + "learning_rate": 9.995683816713503e-06, + "loss": 0.4723, + "step": 682 + }, + { + "epoch": 0.04, + "grad_norm": 2.847147381689037, + "learning_rate": 9.995641403640585e-06, + "loss": 0.4761, + "step": 683 + }, + { + "epoch": 0.04, + "grad_norm": 2.4559454985726097, + "learning_rate": 9.995598783289631e-06, + "loss": 0.4727, + "step": 684 + }, + { + "epoch": 0.04, + "grad_norm": 3.113195011765131, + "learning_rate": 9.99555595566241e-06, + "loss": 0.436, + "step": 685 + }, + { + "epoch": 0.04, + "grad_norm": 3.1883784100307166, + "learning_rate": 9.9955129207607e-06, + "loss": 0.4867, + "step": 686 + }, + { + "epoch": 0.04, + "grad_norm": 3.553270239468023, + "learning_rate": 9.995469678586286e-06, + "loss": 0.5049, + "step": 687 + }, + { + "epoch": 0.04, + "grad_norm": 2.5839827658726273, + "learning_rate": 9.995426229140963e-06, + "loss": 0.4729, + "step": 688 + }, + { + "epoch": 0.04, + "grad_norm": 3.6757771629648346, + "learning_rate": 9.995382572426531e-06, + "loss": 0.425, + "step": 689 + }, + { + "epoch": 0.04, + "grad_norm": 1.8114969729388422, + "learning_rate": 9.995338708444804e-06, + "loss": 0.4355, + "step": 690 + }, + { + "epoch": 0.04, + "grad_norm": 2.459675253701771, + "learning_rate": 9.995294637197602e-06, + "loss": 0.4854, + "step": 691 + }, + { + "epoch": 0.04, + "grad_norm": 2.7511074507439903, + "learning_rate": 9.995250358686753e-06, + "loss": 0.4154, + "step": 692 + }, + { + "epoch": 0.04, + "grad_norm": 3.316640599823098, + "learning_rate": 9.995205872914094e-06, + "loss": 0.4188, + "step": 693 + }, + { + "epoch": 0.04, + "grad_norm": 2.359760985100658, + "learning_rate": 9.995161179881469e-06, + "loss": 0.442, + "step": 694 + }, + { + "epoch": 0.04, + "grad_norm": 2.788610007610766, + "learning_rate": 9.995116279590735e-06, + "loss": 0.4302, + "step": 695 + }, + { + "epoch": 0.04, + "grad_norm": 2.0605553313415297, + "learning_rate": 9.995071172043755e-06, + "loss": 0.4065, + "step": 696 + }, + { + "epoch": 0.04, + "grad_norm": 3.709277814057277, + "learning_rate": 9.9950258572424e-06, + "loss": 0.4429, + "step": 697 + }, + { + "epoch": 0.04, + "grad_norm": 1.8173910399442992, + "learning_rate": 9.994980335188549e-06, + "loss": 0.4684, + "step": 698 + }, + { + "epoch": 0.04, + "grad_norm": 1.8807990544036073, + "learning_rate": 9.994934605884093e-06, + "loss": 0.4423, + "step": 699 + }, + { + "epoch": 0.04, + "grad_norm": 2.07708957760124, + "learning_rate": 9.994888669330927e-06, + "loss": 0.4473, + "step": 700 + }, + { + "epoch": 0.04, + "grad_norm": 3.5759907952586523, + "learning_rate": 9.99484252553096e-06, + "loss": 0.4489, + "step": 701 + }, + { + "epoch": 0.04, + "grad_norm": 2.401807864783039, + "learning_rate": 9.9947961744861e-06, + "loss": 0.458, + "step": 702 + }, + { + "epoch": 0.04, + "grad_norm": 0.8853034430839495, + "learning_rate": 9.99474961619828e-06, + "loss": 0.4362, + "step": 703 + }, + { + "epoch": 0.04, + "grad_norm": 2.4146421279508816, + "learning_rate": 9.994702850669426e-06, + "loss": 0.4408, + "step": 704 + }, + { + "epoch": 0.04, + "grad_norm": 1.6001923652749257, + "learning_rate": 9.994655877901479e-06, + "loss": 0.4403, + "step": 705 + }, + { + "epoch": 0.04, + "grad_norm": 2.7221897854324726, + "learning_rate": 9.994608697896386e-06, + "loss": 0.4425, + "step": 706 + }, + { + "epoch": 0.04, + "grad_norm": 1.9029907396395576, + "learning_rate": 9.994561310656107e-06, + "loss": 0.4155, + "step": 707 + }, + { + "epoch": 0.04, + "grad_norm": 2.219955758206292, + "learning_rate": 9.99451371618261e-06, + "loss": 0.4613, + "step": 708 + }, + { + "epoch": 0.04, + "grad_norm": 2.9205200134365303, + "learning_rate": 9.994465914477866e-06, + "loss": 0.4491, + "step": 709 + }, + { + "epoch": 0.04, + "grad_norm": 4.3594627345880355, + "learning_rate": 9.994417905543863e-06, + "loss": 0.4623, + "step": 710 + }, + { + "epoch": 0.04, + "grad_norm": 3.7961548354447747, + "learning_rate": 9.994369689382586e-06, + "loss": 0.4737, + "step": 711 + }, + { + "epoch": 0.04, + "grad_norm": 5.153950418069649, + "learning_rate": 9.994321265996043e-06, + "loss": 0.4557, + "step": 712 + }, + { + "epoch": 0.04, + "grad_norm": 1.7803043259836775, + "learning_rate": 9.994272635386238e-06, + "loss": 0.4391, + "step": 713 + }, + { + "epoch": 0.04, + "grad_norm": 2.105188709639193, + "learning_rate": 9.99422379755519e-06, + "loss": 0.4386, + "step": 714 + }, + { + "epoch": 0.04, + "grad_norm": 3.4916189281607455, + "learning_rate": 9.994174752504926e-06, + "loss": 0.4487, + "step": 715 + }, + { + "epoch": 0.05, + "grad_norm": 1.8734568379953702, + "learning_rate": 9.994125500237482e-06, + "loss": 0.4182, + "step": 716 + }, + { + "epoch": 0.05, + "grad_norm": 2.291847542999621, + "learning_rate": 9.994076040754898e-06, + "loss": 0.4724, + "step": 717 + }, + { + "epoch": 0.05, + "grad_norm": 4.647137404633726, + "learning_rate": 9.994026374059232e-06, + "loss": 0.4919, + "step": 718 + }, + { + "epoch": 0.05, + "grad_norm": 2.3424631174226507, + "learning_rate": 9.99397650015254e-06, + "loss": 0.4566, + "step": 719 + }, + { + "epoch": 0.05, + "grad_norm": 0.928470910158565, + "learning_rate": 9.993926419036893e-06, + "loss": 0.4845, + "step": 720 + }, + { + "epoch": 0.05, + "grad_norm": 3.3142700080597045, + "learning_rate": 9.993876130714367e-06, + "loss": 0.4495, + "step": 721 + }, + { + "epoch": 0.05, + "grad_norm": 4.046292210969857, + "learning_rate": 9.993825635187052e-06, + "loss": 0.4376, + "step": 722 + }, + { + "epoch": 0.05, + "grad_norm": 2.7066143872941533, + "learning_rate": 9.99377493245704e-06, + "loss": 0.442, + "step": 723 + }, + { + "epoch": 0.05, + "grad_norm": 3.0008424541582586, + "learning_rate": 9.993724022526436e-06, + "loss": 0.4551, + "step": 724 + }, + { + "epoch": 0.05, + "grad_norm": 0.7770143116553527, + "learning_rate": 9.993672905397353e-06, + "loss": 0.4866, + "step": 725 + }, + { + "epoch": 0.05, + "grad_norm": 2.3870371251726765, + "learning_rate": 9.993621581071913e-06, + "loss": 0.4225, + "step": 726 + }, + { + "epoch": 0.05, + "grad_norm": 2.239640924878757, + "learning_rate": 9.993570049552242e-06, + "loss": 0.4549, + "step": 727 + }, + { + "epoch": 0.05, + "grad_norm": 2.186189260968605, + "learning_rate": 9.993518310840481e-06, + "loss": 0.4115, + "step": 728 + }, + { + "epoch": 0.05, + "grad_norm": 1.8149784852264021, + "learning_rate": 9.993466364938776e-06, + "loss": 0.4411, + "step": 729 + }, + { + "epoch": 0.05, + "grad_norm": 3.4569245728822007, + "learning_rate": 9.993414211849281e-06, + "loss": 0.428, + "step": 730 + }, + { + "epoch": 0.05, + "grad_norm": 2.769733089295754, + "learning_rate": 9.993361851574162e-06, + "loss": 0.431, + "step": 731 + }, + { + "epoch": 0.05, + "grad_norm": 2.3420693054026698, + "learning_rate": 9.99330928411559e-06, + "loss": 0.4612, + "step": 732 + }, + { + "epoch": 0.05, + "grad_norm": 2.2476509824120647, + "learning_rate": 9.993256509475746e-06, + "loss": 0.4488, + "step": 733 + }, + { + "epoch": 0.05, + "grad_norm": 3.840328309573542, + "learning_rate": 9.993203527656822e-06, + "loss": 0.437, + "step": 734 + }, + { + "epoch": 0.05, + "grad_norm": 1.792612558054085, + "learning_rate": 9.993150338661014e-06, + "loss": 0.4652, + "step": 735 + }, + { + "epoch": 0.05, + "grad_norm": 6.49519143632042, + "learning_rate": 9.99309694249053e-06, + "loss": 0.4302, + "step": 736 + }, + { + "epoch": 0.05, + "grad_norm": 2.260931105451376, + "learning_rate": 9.993043339147584e-06, + "loss": 0.4485, + "step": 737 + }, + { + "epoch": 0.05, + "grad_norm": 1.9443299972263728, + "learning_rate": 9.992989528634403e-06, + "loss": 0.4193, + "step": 738 + }, + { + "epoch": 0.05, + "grad_norm": 0.8855198794919493, + "learning_rate": 9.992935510953216e-06, + "loss": 0.4948, + "step": 739 + }, + { + "epoch": 0.05, + "grad_norm": 1.852162438384942, + "learning_rate": 9.992881286106268e-06, + "loss": 0.4686, + "step": 740 + }, + { + "epoch": 0.05, + "grad_norm": 1.6678759582183533, + "learning_rate": 9.992826854095806e-06, + "loss": 0.4802, + "step": 741 + }, + { + "epoch": 0.05, + "grad_norm": 2.4981249056088424, + "learning_rate": 9.992772214924089e-06, + "loss": 0.4366, + "step": 742 + }, + { + "epoch": 0.05, + "grad_norm": 1.9979541013190596, + "learning_rate": 9.992717368593385e-06, + "loss": 0.4504, + "step": 743 + }, + { + "epoch": 0.05, + "grad_norm": 2.0834487058102034, + "learning_rate": 9.99266231510597e-06, + "loss": 0.4484, + "step": 744 + }, + { + "epoch": 0.05, + "grad_norm": 1.8185227599352154, + "learning_rate": 9.992607054464128e-06, + "loss": 0.4698, + "step": 745 + }, + { + "epoch": 0.05, + "grad_norm": 0.7366592197539074, + "learning_rate": 9.99255158667015e-06, + "loss": 0.4665, + "step": 746 + }, + { + "epoch": 0.05, + "grad_norm": 1.8722742353505182, + "learning_rate": 9.99249591172634e-06, + "loss": 0.4252, + "step": 747 + }, + { + "epoch": 0.05, + "grad_norm": 3.049576533626131, + "learning_rate": 9.992440029635007e-06, + "loss": 0.4645, + "step": 748 + }, + { + "epoch": 0.05, + "grad_norm": 1.5672077229256496, + "learning_rate": 9.99238394039847e-06, + "loss": 0.4056, + "step": 749 + }, + { + "epoch": 0.05, + "grad_norm": 1.5925498825462083, + "learning_rate": 9.992327644019053e-06, + "loss": 0.434, + "step": 750 + }, + { + "epoch": 0.05, + "grad_norm": 1.8303015713255473, + "learning_rate": 9.992271140499096e-06, + "loss": 0.4563, + "step": 751 + }, + { + "epoch": 0.05, + "grad_norm": 1.4236142884002383, + "learning_rate": 9.992214429840944e-06, + "loss": 0.459, + "step": 752 + }, + { + "epoch": 0.05, + "grad_norm": 1.752914195951192, + "learning_rate": 9.992157512046947e-06, + "loss": 0.4489, + "step": 753 + }, + { + "epoch": 0.05, + "grad_norm": 2.115485119650311, + "learning_rate": 9.992100387119468e-06, + "loss": 0.4452, + "step": 754 + }, + { + "epoch": 0.05, + "grad_norm": 1.8854156175407226, + "learning_rate": 9.992043055060876e-06, + "loss": 0.4402, + "step": 755 + }, + { + "epoch": 0.05, + "grad_norm": 3.7833261170781056, + "learning_rate": 9.991985515873552e-06, + "loss": 0.4629, + "step": 756 + }, + { + "epoch": 0.05, + "grad_norm": 2.9153728045052216, + "learning_rate": 9.991927769559882e-06, + "loss": 0.4416, + "step": 757 + }, + { + "epoch": 0.05, + "grad_norm": 4.783200603276375, + "learning_rate": 9.991869816122262e-06, + "loss": 0.4675, + "step": 758 + }, + { + "epoch": 0.05, + "grad_norm": 1.4475071637171941, + "learning_rate": 9.991811655563096e-06, + "loss": 0.4638, + "step": 759 + }, + { + "epoch": 0.05, + "grad_norm": 1.8403988649058063, + "learning_rate": 9.991753287884797e-06, + "loss": 0.4626, + "step": 760 + }, + { + "epoch": 0.05, + "grad_norm": 2.055641677306497, + "learning_rate": 9.99169471308979e-06, + "loss": 0.4386, + "step": 761 + }, + { + "epoch": 0.05, + "grad_norm": 2.063193682721805, + "learning_rate": 9.991635931180504e-06, + "loss": 0.4363, + "step": 762 + }, + { + "epoch": 0.05, + "grad_norm": 2.7133765269253347, + "learning_rate": 9.991576942159374e-06, + "loss": 0.4595, + "step": 763 + }, + { + "epoch": 0.05, + "grad_norm": 1.9417508822029572, + "learning_rate": 9.991517746028851e-06, + "loss": 0.5405, + "step": 764 + }, + { + "epoch": 0.05, + "grad_norm": 2.548362709307829, + "learning_rate": 9.991458342791394e-06, + "loss": 0.4753, + "step": 765 + }, + { + "epoch": 0.05, + "grad_norm": 1.7744718817859155, + "learning_rate": 9.991398732449461e-06, + "loss": 0.4488, + "step": 766 + }, + { + "epoch": 0.05, + "grad_norm": 2.506629010419364, + "learning_rate": 9.991338915005531e-06, + "loss": 0.4439, + "step": 767 + }, + { + "epoch": 0.05, + "grad_norm": 2.746150729627741, + "learning_rate": 9.991278890462083e-06, + "loss": 0.4498, + "step": 768 + }, + { + "epoch": 0.05, + "grad_norm": 2.1926730472137432, + "learning_rate": 9.991218658821609e-06, + "loss": 0.4984, + "step": 769 + }, + { + "epoch": 0.05, + "grad_norm": 1.846841983617649, + "learning_rate": 9.991158220086606e-06, + "loss": 0.4453, + "step": 770 + }, + { + "epoch": 0.05, + "grad_norm": 1.907955257438044, + "learning_rate": 9.991097574259583e-06, + "loss": 0.446, + "step": 771 + }, + { + "epoch": 0.05, + "grad_norm": 2.5396188166622897, + "learning_rate": 9.991036721343058e-06, + "loss": 0.4506, + "step": 772 + }, + { + "epoch": 0.05, + "grad_norm": 1.9119711306037535, + "learning_rate": 9.990975661339554e-06, + "loss": 0.4549, + "step": 773 + }, + { + "epoch": 0.05, + "grad_norm": 2.1540435742691395, + "learning_rate": 9.990914394251605e-06, + "loss": 0.434, + "step": 774 + }, + { + "epoch": 0.05, + "grad_norm": 2.2126268391679975, + "learning_rate": 9.990852920081753e-06, + "loss": 0.4532, + "step": 775 + }, + { + "epoch": 0.05, + "grad_norm": 3.0377088083931834, + "learning_rate": 9.990791238832547e-06, + "loss": 0.4291, + "step": 776 + }, + { + "epoch": 0.05, + "grad_norm": 1.8288013685619777, + "learning_rate": 9.99072935050655e-06, + "loss": 0.4408, + "step": 777 + }, + { + "epoch": 0.05, + "grad_norm": 1.781560694504767, + "learning_rate": 9.990667255106326e-06, + "loss": 0.4358, + "step": 778 + }, + { + "epoch": 0.05, + "grad_norm": 1.9714217622609607, + "learning_rate": 9.990604952634452e-06, + "loss": 0.4844, + "step": 779 + }, + { + "epoch": 0.05, + "grad_norm": 1.6237259170251106, + "learning_rate": 9.990542443093518e-06, + "loss": 0.4287, + "step": 780 + }, + { + "epoch": 0.05, + "grad_norm": 2.0796091992235115, + "learning_rate": 9.990479726486111e-06, + "loss": 0.4779, + "step": 781 + }, + { + "epoch": 0.05, + "grad_norm": 1.5816331887509902, + "learning_rate": 9.990416802814838e-06, + "loss": 0.4233, + "step": 782 + }, + { + "epoch": 0.05, + "grad_norm": 2.198578168807564, + "learning_rate": 9.990353672082307e-06, + "loss": 0.4779, + "step": 783 + }, + { + "epoch": 0.05, + "grad_norm": 2.31231238158569, + "learning_rate": 9.990290334291137e-06, + "loss": 0.4262, + "step": 784 + }, + { + "epoch": 0.05, + "grad_norm": 1.7891061349189314, + "learning_rate": 9.990226789443959e-06, + "loss": 0.4686, + "step": 785 + }, + { + "epoch": 0.05, + "grad_norm": 1.9574795554447961, + "learning_rate": 9.990163037543407e-06, + "loss": 0.451, + "step": 786 + }, + { + "epoch": 0.05, + "grad_norm": 1.9366174622724797, + "learning_rate": 9.990099078592128e-06, + "loss": 0.4428, + "step": 787 + }, + { + "epoch": 0.05, + "grad_norm": 3.047931041875674, + "learning_rate": 9.990034912592774e-06, + "loss": 0.4289, + "step": 788 + }, + { + "epoch": 0.05, + "grad_norm": 3.2617071894422747, + "learning_rate": 9.98997053954801e-06, + "loss": 0.514, + "step": 789 + }, + { + "epoch": 0.05, + "grad_norm": 1.6326084843093283, + "learning_rate": 9.989905959460502e-06, + "loss": 0.4517, + "step": 790 + }, + { + "epoch": 0.05, + "grad_norm": 1.8286255469673063, + "learning_rate": 9.989841172332932e-06, + "loss": 0.4562, + "step": 791 + }, + { + "epoch": 0.05, + "grad_norm": 1.398080419481011, + "learning_rate": 9.989776178167991e-06, + "loss": 0.422, + "step": 792 + }, + { + "epoch": 0.05, + "grad_norm": 6.451760998367041, + "learning_rate": 9.989710976968375e-06, + "loss": 0.4161, + "step": 793 + }, + { + "epoch": 0.05, + "grad_norm": 1.8979467286199225, + "learning_rate": 9.989645568736786e-06, + "loss": 0.4751, + "step": 794 + }, + { + "epoch": 0.05, + "grad_norm": 3.3096835223998924, + "learning_rate": 9.98957995347594e-06, + "loss": 0.4297, + "step": 795 + }, + { + "epoch": 0.05, + "grad_norm": 4.255040445356041, + "learning_rate": 9.98951413118856e-06, + "loss": 0.4463, + "step": 796 + }, + { + "epoch": 0.05, + "grad_norm": 2.3613276402916803, + "learning_rate": 9.989448101877375e-06, + "loss": 0.466, + "step": 797 + }, + { + "epoch": 0.05, + "grad_norm": 3.3231044170074253, + "learning_rate": 9.989381865545128e-06, + "loss": 0.4539, + "step": 798 + }, + { + "epoch": 0.05, + "grad_norm": 1.7764446748922416, + "learning_rate": 9.989315422194562e-06, + "loss": 0.4217, + "step": 799 + }, + { + "epoch": 0.05, + "grad_norm": 2.944426116231744, + "learning_rate": 9.98924877182844e-06, + "loss": 0.426, + "step": 800 + }, + { + "epoch": 0.05, + "grad_norm": 1.413016157089324, + "learning_rate": 9.989181914449523e-06, + "loss": 0.4499, + "step": 801 + }, + { + "epoch": 0.05, + "grad_norm": 1.8796354863110096, + "learning_rate": 9.989114850060586e-06, + "loss": 0.4537, + "step": 802 + }, + { + "epoch": 0.05, + "grad_norm": 2.30681700893175, + "learning_rate": 9.989047578664414e-06, + "loss": 0.4339, + "step": 803 + }, + { + "epoch": 0.05, + "grad_norm": 0.8686538396109308, + "learning_rate": 9.988980100263796e-06, + "loss": 0.468, + "step": 804 + }, + { + "epoch": 0.05, + "grad_norm": 2.3910489222265263, + "learning_rate": 9.988912414861531e-06, + "loss": 0.4618, + "step": 805 + }, + { + "epoch": 0.05, + "grad_norm": 2.0476596917988434, + "learning_rate": 9.98884452246043e-06, + "loss": 0.4335, + "step": 806 + }, + { + "epoch": 0.05, + "grad_norm": 1.9450486515086292, + "learning_rate": 9.988776423063309e-06, + "loss": 0.4389, + "step": 807 + }, + { + "epoch": 0.05, + "grad_norm": 2.002990635676293, + "learning_rate": 9.988708116672991e-06, + "loss": 0.4735, + "step": 808 + }, + { + "epoch": 0.05, + "grad_norm": 3.92066119516322, + "learning_rate": 9.988639603292315e-06, + "loss": 0.3943, + "step": 809 + }, + { + "epoch": 0.05, + "grad_norm": 1.7646101488911214, + "learning_rate": 9.988570882924118e-06, + "loss": 0.4199, + "step": 810 + }, + { + "epoch": 0.05, + "grad_norm": 1.6045767909297195, + "learning_rate": 9.988501955571257e-06, + "loss": 0.4504, + "step": 811 + }, + { + "epoch": 0.05, + "grad_norm": 1.7229917539663575, + "learning_rate": 9.988432821236588e-06, + "loss": 0.473, + "step": 812 + }, + { + "epoch": 0.05, + "grad_norm": 2.726949425061288, + "learning_rate": 9.98836347992298e-06, + "loss": 0.4415, + "step": 813 + }, + { + "epoch": 0.05, + "grad_norm": 2.338904531000487, + "learning_rate": 9.988293931633312e-06, + "loss": 0.4535, + "step": 814 + }, + { + "epoch": 0.05, + "grad_norm": 2.609909544928862, + "learning_rate": 9.988224176370468e-06, + "loss": 0.4924, + "step": 815 + }, + { + "epoch": 0.05, + "grad_norm": 2.219390158335007, + "learning_rate": 9.988154214137345e-06, + "loss": 0.4382, + "step": 816 + }, + { + "epoch": 0.05, + "grad_norm": 2.407382476139027, + "learning_rate": 9.98808404493684e-06, + "loss": 0.4437, + "step": 817 + }, + { + "epoch": 0.05, + "grad_norm": 1.7609613674124398, + "learning_rate": 9.988013668771872e-06, + "loss": 0.4226, + "step": 818 + }, + { + "epoch": 0.05, + "grad_norm": 0.8812049522958381, + "learning_rate": 9.987943085645355e-06, + "loss": 0.5146, + "step": 819 + }, + { + "epoch": 0.05, + "grad_norm": 1.9054287777952734, + "learning_rate": 9.98787229556022e-06, + "loss": 0.4275, + "step": 820 + }, + { + "epoch": 0.05, + "grad_norm": 2.3132004263071244, + "learning_rate": 9.987801298519404e-06, + "loss": 0.4744, + "step": 821 + }, + { + "epoch": 0.05, + "grad_norm": 2.808889943407105, + "learning_rate": 9.987730094525854e-06, + "loss": 0.4171, + "step": 822 + }, + { + "epoch": 0.05, + "grad_norm": 4.472432442961767, + "learning_rate": 9.987658683582522e-06, + "loss": 0.4471, + "step": 823 + }, + { + "epoch": 0.05, + "grad_norm": 1.818475863719265, + "learning_rate": 9.98758706569237e-06, + "loss": 0.4252, + "step": 824 + }, + { + "epoch": 0.05, + "grad_norm": 2.308317465855414, + "learning_rate": 9.987515240858375e-06, + "loss": 0.4322, + "step": 825 + }, + { + "epoch": 0.05, + "grad_norm": 1.9523186706404483, + "learning_rate": 9.987443209083513e-06, + "loss": 0.4434, + "step": 826 + }, + { + "epoch": 0.05, + "grad_norm": 1.841784401817416, + "learning_rate": 9.987370970370773e-06, + "loss": 0.4176, + "step": 827 + }, + { + "epoch": 0.05, + "grad_norm": 5.95961024478376, + "learning_rate": 9.987298524723153e-06, + "loss": 0.4287, + "step": 828 + }, + { + "epoch": 0.05, + "grad_norm": 2.4942654397408663, + "learning_rate": 9.98722587214366e-06, + "loss": 0.4205, + "step": 829 + }, + { + "epoch": 0.05, + "grad_norm": 2.350426410910016, + "learning_rate": 9.987153012635305e-06, + "loss": 0.4199, + "step": 830 + }, + { + "epoch": 0.05, + "grad_norm": 4.12802650657785, + "learning_rate": 9.987079946201114e-06, + "loss": 0.4457, + "step": 831 + }, + { + "epoch": 0.05, + "grad_norm": 1.8872701395081841, + "learning_rate": 9.987006672844119e-06, + "loss": 0.453, + "step": 832 + }, + { + "epoch": 0.05, + "grad_norm": 18.343687856141752, + "learning_rate": 9.98693319256736e-06, + "loss": 0.3863, + "step": 833 + }, + { + "epoch": 0.05, + "grad_norm": 2.1537149097587385, + "learning_rate": 9.986859505373882e-06, + "loss": 0.4922, + "step": 834 + }, + { + "epoch": 0.05, + "grad_norm": 1.8026116562799057, + "learning_rate": 9.986785611266749e-06, + "loss": 0.4246, + "step": 835 + }, + { + "epoch": 0.05, + "grad_norm": 1.8120638282725512, + "learning_rate": 9.986711510249021e-06, + "loss": 0.4441, + "step": 836 + }, + { + "epoch": 0.05, + "grad_norm": 2.0593819798683164, + "learning_rate": 9.986637202323777e-06, + "loss": 0.3827, + "step": 837 + }, + { + "epoch": 0.05, + "grad_norm": 3.4497291118023137, + "learning_rate": 9.986562687494096e-06, + "loss": 0.4348, + "step": 838 + }, + { + "epoch": 0.05, + "grad_norm": 2.510626177549304, + "learning_rate": 9.986487965763073e-06, + "loss": 0.4411, + "step": 839 + }, + { + "epoch": 0.05, + "grad_norm": 1.9839747214140684, + "learning_rate": 9.986413037133806e-06, + "loss": 0.4852, + "step": 840 + }, + { + "epoch": 0.05, + "grad_norm": 6.030492457333062, + "learning_rate": 9.986337901609407e-06, + "loss": 0.4245, + "step": 841 + }, + { + "epoch": 0.05, + "grad_norm": 0.9773631136619981, + "learning_rate": 9.986262559192992e-06, + "loss": 0.4883, + "step": 842 + }, + { + "epoch": 0.05, + "grad_norm": 3.7266203884732154, + "learning_rate": 9.986187009887685e-06, + "loss": 0.4491, + "step": 843 + }, + { + "epoch": 0.05, + "grad_norm": 2.7682562026424797, + "learning_rate": 9.986111253696625e-06, + "loss": 0.4583, + "step": 844 + }, + { + "epoch": 0.05, + "grad_norm": 2.088482057876728, + "learning_rate": 9.98603529062295e-06, + "loss": 0.4289, + "step": 845 + }, + { + "epoch": 0.05, + "grad_norm": 2.249553220163194, + "learning_rate": 9.985959120669816e-06, + "loss": 0.4257, + "step": 846 + }, + { + "epoch": 0.05, + "grad_norm": 2.1260195024993584, + "learning_rate": 9.985882743840383e-06, + "loss": 0.412, + "step": 847 + }, + { + "epoch": 0.05, + "grad_norm": 1.6891161433123363, + "learning_rate": 9.985806160137816e-06, + "loss": 0.4203, + "step": 848 + }, + { + "epoch": 0.05, + "grad_norm": 1.8324367024281996, + "learning_rate": 9.985729369565299e-06, + "loss": 0.4353, + "step": 849 + }, + { + "epoch": 0.05, + "grad_norm": 8.344083468436015, + "learning_rate": 9.985652372126013e-06, + "loss": 0.4629, + "step": 850 + }, + { + "epoch": 0.05, + "grad_norm": 1.7443403482437772, + "learning_rate": 9.985575167823154e-06, + "loss": 0.4648, + "step": 851 + }, + { + "epoch": 0.05, + "grad_norm": 2.2278136312942904, + "learning_rate": 9.985497756659927e-06, + "loss": 0.4308, + "step": 852 + }, + { + "epoch": 0.05, + "grad_norm": 2.054966974169479, + "learning_rate": 9.985420138639543e-06, + "loss": 0.4502, + "step": 853 + }, + { + "epoch": 0.05, + "grad_norm": 2.7331177037284515, + "learning_rate": 9.985342313765223e-06, + "loss": 0.4313, + "step": 854 + }, + { + "epoch": 0.05, + "grad_norm": 1.8738889044291418, + "learning_rate": 9.985264282040195e-06, + "loss": 0.4138, + "step": 855 + }, + { + "epoch": 0.05, + "grad_norm": 4.393667567747854, + "learning_rate": 9.985186043467697e-06, + "loss": 0.431, + "step": 856 + }, + { + "epoch": 0.05, + "grad_norm": 2.335268424978182, + "learning_rate": 9.985107598050973e-06, + "loss": 0.4578, + "step": 857 + }, + { + "epoch": 0.05, + "grad_norm": 2.31628435357004, + "learning_rate": 9.985028945793282e-06, + "loss": 0.4262, + "step": 858 + }, + { + "epoch": 0.05, + "grad_norm": 2.6258180779200866, + "learning_rate": 9.984950086697886e-06, + "loss": 0.4547, + "step": 859 + }, + { + "epoch": 0.05, + "grad_norm": 4.423719826873862, + "learning_rate": 9.984871020768056e-06, + "loss": 0.4663, + "step": 860 + }, + { + "epoch": 0.05, + "grad_norm": 1.8648510078726412, + "learning_rate": 9.984791748007074e-06, + "loss": 0.4288, + "step": 861 + }, + { + "epoch": 0.05, + "grad_norm": 1.9127701198918514, + "learning_rate": 9.984712268418228e-06, + "loss": 0.4648, + "step": 862 + }, + { + "epoch": 0.05, + "grad_norm": 7.992699198148132, + "learning_rate": 9.984632582004817e-06, + "loss": 0.4029, + "step": 863 + }, + { + "epoch": 0.05, + "grad_norm": 3.4562612420210286, + "learning_rate": 9.984552688770145e-06, + "loss": 0.4631, + "step": 864 + }, + { + "epoch": 0.05, + "grad_norm": 3.197226952886568, + "learning_rate": 9.984472588717528e-06, + "loss": 0.4876, + "step": 865 + }, + { + "epoch": 0.05, + "grad_norm": 2.444824121345561, + "learning_rate": 9.984392281850293e-06, + "loss": 0.4299, + "step": 866 + }, + { + "epoch": 0.05, + "grad_norm": 2.1793522233262013, + "learning_rate": 9.984311768171766e-06, + "loss": 0.4564, + "step": 867 + }, + { + "epoch": 0.05, + "grad_norm": 2.162687932448794, + "learning_rate": 9.984231047685292e-06, + "loss": 0.4551, + "step": 868 + }, + { + "epoch": 0.05, + "grad_norm": 2.997032886043869, + "learning_rate": 9.984150120394219e-06, + "loss": 0.4321, + "step": 869 + }, + { + "epoch": 0.05, + "grad_norm": 1.7485735587581321, + "learning_rate": 9.984068986301902e-06, + "loss": 0.4042, + "step": 870 + }, + { + "epoch": 0.05, + "grad_norm": 2.026706024737377, + "learning_rate": 9.983987645411712e-06, + "loss": 0.4477, + "step": 871 + }, + { + "epoch": 0.05, + "grad_norm": 2.288813397372106, + "learning_rate": 9.983906097727023e-06, + "loss": 0.4386, + "step": 872 + }, + { + "epoch": 0.05, + "grad_norm": 1.6025153100994587, + "learning_rate": 9.983824343251216e-06, + "loss": 0.4004, + "step": 873 + }, + { + "epoch": 0.05, + "grad_norm": 2.700468949812591, + "learning_rate": 9.983742381987685e-06, + "loss": 0.4663, + "step": 874 + }, + { + "epoch": 0.06, + "grad_norm": 3.6293938910155337, + "learning_rate": 9.983660213939832e-06, + "loss": 0.4464, + "step": 875 + }, + { + "epoch": 0.06, + "grad_norm": 2.527513601607223, + "learning_rate": 9.983577839111062e-06, + "loss": 0.4658, + "step": 876 + }, + { + "epoch": 0.06, + "grad_norm": 1.9417977450619628, + "learning_rate": 9.983495257504799e-06, + "loss": 0.4022, + "step": 877 + }, + { + "epoch": 0.06, + "grad_norm": 2.195997308949818, + "learning_rate": 9.983412469124462e-06, + "loss": 0.4557, + "step": 878 + }, + { + "epoch": 0.06, + "grad_norm": 1.3393453275261773, + "learning_rate": 9.983329473973494e-06, + "loss": 0.5276, + "step": 879 + }, + { + "epoch": 0.06, + "grad_norm": 2.1995698085811766, + "learning_rate": 9.983246272055333e-06, + "loss": 0.4331, + "step": 880 + }, + { + "epoch": 0.06, + "grad_norm": 1.8356954093128823, + "learning_rate": 9.983162863373433e-06, + "loss": 0.4788, + "step": 881 + }, + { + "epoch": 0.06, + "grad_norm": 2.297221437287468, + "learning_rate": 9.983079247931255e-06, + "loss": 0.4292, + "step": 882 + }, + { + "epoch": 0.06, + "grad_norm": 2.57184333886103, + "learning_rate": 9.982995425732267e-06, + "loss": 0.4302, + "step": 883 + }, + { + "epoch": 0.06, + "grad_norm": 26.614849061559507, + "learning_rate": 9.982911396779949e-06, + "loss": 0.4599, + "step": 884 + }, + { + "epoch": 0.06, + "grad_norm": 1.812496817272605, + "learning_rate": 9.982827161077787e-06, + "loss": 0.436, + "step": 885 + }, + { + "epoch": 0.06, + "grad_norm": 1.850573999627721, + "learning_rate": 9.982742718629275e-06, + "loss": 0.4297, + "step": 886 + }, + { + "epoch": 0.06, + "grad_norm": 2.1018236009745332, + "learning_rate": 9.982658069437916e-06, + "loss": 0.409, + "step": 887 + }, + { + "epoch": 0.06, + "grad_norm": 4.761668117737086, + "learning_rate": 9.982573213507225e-06, + "loss": 0.4194, + "step": 888 + }, + { + "epoch": 0.06, + "grad_norm": 2.1370168746072733, + "learning_rate": 9.982488150840722e-06, + "loss": 0.4014, + "step": 889 + }, + { + "epoch": 0.06, + "grad_norm": 1.9043261773199873, + "learning_rate": 9.982402881441933e-06, + "loss": 0.3951, + "step": 890 + }, + { + "epoch": 0.06, + "grad_norm": 1.6126271997581667, + "learning_rate": 9.9823174053144e-06, + "loss": 0.4129, + "step": 891 + }, + { + "epoch": 0.06, + "grad_norm": 1.9991985625527473, + "learning_rate": 9.982231722461669e-06, + "loss": 0.4392, + "step": 892 + }, + { + "epoch": 0.06, + "grad_norm": 2.043952618343807, + "learning_rate": 9.982145832887294e-06, + "loss": 0.5015, + "step": 893 + }, + { + "epoch": 0.06, + "grad_norm": 1.939395607295308, + "learning_rate": 9.98205973659484e-06, + "loss": 0.4329, + "step": 894 + }, + { + "epoch": 0.06, + "grad_norm": 1.6488668660224204, + "learning_rate": 9.981973433587876e-06, + "loss": 0.426, + "step": 895 + }, + { + "epoch": 0.06, + "grad_norm": 1.5073712420631002, + "learning_rate": 9.981886923869988e-06, + "loss": 0.3905, + "step": 896 + }, + { + "epoch": 0.06, + "grad_norm": 1.8621661400270306, + "learning_rate": 9.981800207444762e-06, + "loss": 0.4081, + "step": 897 + }, + { + "epoch": 0.06, + "grad_norm": 2.1146298803252797, + "learning_rate": 9.981713284315796e-06, + "loss": 0.4397, + "step": 898 + }, + { + "epoch": 0.06, + "grad_norm": 1.2228346828692689, + "learning_rate": 9.981626154486696e-06, + "loss": 0.5177, + "step": 899 + }, + { + "epoch": 0.06, + "grad_norm": 3.4933527790971017, + "learning_rate": 9.981538817961082e-06, + "loss": 0.4235, + "step": 900 + }, + { + "epoch": 0.06, + "grad_norm": 0.8595059330336282, + "learning_rate": 9.981451274742572e-06, + "loss": 0.4814, + "step": 901 + }, + { + "epoch": 0.06, + "grad_norm": 1.7674160092007893, + "learning_rate": 9.9813635248348e-06, + "loss": 0.4082, + "step": 902 + }, + { + "epoch": 0.06, + "grad_norm": 3.783556821691551, + "learning_rate": 9.98127556824141e-06, + "loss": 0.4332, + "step": 903 + }, + { + "epoch": 0.06, + "grad_norm": 1.6950369875321318, + "learning_rate": 9.981187404966047e-06, + "loss": 0.426, + "step": 904 + }, + { + "epoch": 0.06, + "grad_norm": 2.4850559204818143, + "learning_rate": 9.98109903501237e-06, + "loss": 0.4538, + "step": 905 + }, + { + "epoch": 0.06, + "grad_norm": 3.6481703676970185, + "learning_rate": 9.981010458384048e-06, + "loss": 0.4888, + "step": 906 + }, + { + "epoch": 0.06, + "grad_norm": 13.853290704029964, + "learning_rate": 9.980921675084755e-06, + "loss": 0.448, + "step": 907 + }, + { + "epoch": 0.06, + "grad_norm": 3.5318247712162347, + "learning_rate": 9.980832685118173e-06, + "loss": 0.4558, + "step": 908 + }, + { + "epoch": 0.06, + "grad_norm": 1.614367986623324, + "learning_rate": 9.980743488487999e-06, + "loss": 0.4599, + "step": 909 + }, + { + "epoch": 0.06, + "grad_norm": 4.1062925944638415, + "learning_rate": 9.980654085197928e-06, + "loss": 0.4321, + "step": 910 + }, + { + "epoch": 0.06, + "grad_norm": 1.7061385243800808, + "learning_rate": 9.980564475251673e-06, + "loss": 0.4315, + "step": 911 + }, + { + "epoch": 0.06, + "grad_norm": 2.262690557515717, + "learning_rate": 9.980474658652953e-06, + "loss": 0.4286, + "step": 912 + }, + { + "epoch": 0.06, + "grad_norm": 2.435930689025621, + "learning_rate": 9.98038463540549e-06, + "loss": 0.426, + "step": 913 + }, + { + "epoch": 0.06, + "grad_norm": 1.8404883744451876, + "learning_rate": 9.980294405513024e-06, + "loss": 0.4525, + "step": 914 + }, + { + "epoch": 0.06, + "grad_norm": 2.6347995801011797, + "learning_rate": 9.980203968979298e-06, + "loss": 0.4543, + "step": 915 + }, + { + "epoch": 0.06, + "grad_norm": 1.9011771571903395, + "learning_rate": 9.980113325808062e-06, + "loss": 0.4417, + "step": 916 + }, + { + "epoch": 0.06, + "grad_norm": 1.625756947977176, + "learning_rate": 9.980022476003081e-06, + "loss": 0.4301, + "step": 917 + }, + { + "epoch": 0.06, + "grad_norm": 2.4732928132745444, + "learning_rate": 9.97993141956812e-06, + "loss": 0.4579, + "step": 918 + }, + { + "epoch": 0.06, + "grad_norm": 3.1499894763709215, + "learning_rate": 9.979840156506959e-06, + "loss": 0.4579, + "step": 919 + }, + { + "epoch": 0.06, + "grad_norm": 1.8541051688817138, + "learning_rate": 9.979748686823386e-06, + "loss": 0.4396, + "step": 920 + }, + { + "epoch": 0.06, + "grad_norm": 1.4980284504653731, + "learning_rate": 9.979657010521194e-06, + "loss": 0.4325, + "step": 921 + }, + { + "epoch": 0.06, + "grad_norm": 2.016309885443545, + "learning_rate": 9.979565127604186e-06, + "loss": 0.4334, + "step": 922 + }, + { + "epoch": 0.06, + "grad_norm": 2.6505891850016337, + "learning_rate": 9.97947303807618e-06, + "loss": 0.4258, + "step": 923 + }, + { + "epoch": 0.06, + "grad_norm": 1.7367054042675574, + "learning_rate": 9.97938074194099e-06, + "loss": 0.5969, + "step": 924 + }, + { + "epoch": 0.06, + "grad_norm": 2.0760989197594975, + "learning_rate": 9.97928823920245e-06, + "loss": 0.4242, + "step": 925 + }, + { + "epoch": 0.06, + "grad_norm": 2.921672511976158, + "learning_rate": 9.979195529864397e-06, + "loss": 0.4731, + "step": 926 + }, + { + "epoch": 0.06, + "grad_norm": 4.157068536120661, + "learning_rate": 9.979102613930676e-06, + "loss": 0.469, + "step": 927 + }, + { + "epoch": 0.06, + "grad_norm": 1.628234387008205, + "learning_rate": 9.979009491405145e-06, + "loss": 0.4292, + "step": 928 + }, + { + "epoch": 0.06, + "grad_norm": 1.6049554572463292, + "learning_rate": 9.978916162291666e-06, + "loss": 0.4482, + "step": 929 + }, + { + "epoch": 0.06, + "grad_norm": 2.037473235289058, + "learning_rate": 9.978822626594112e-06, + "loss": 0.4495, + "step": 930 + }, + { + "epoch": 0.06, + "grad_norm": 10.896781965707907, + "learning_rate": 9.978728884316363e-06, + "loss": 0.4556, + "step": 931 + }, + { + "epoch": 0.06, + "grad_norm": 1.8908137079365919, + "learning_rate": 9.97863493546231e-06, + "loss": 0.4368, + "step": 932 + }, + { + "epoch": 0.06, + "grad_norm": 2.5531223267122365, + "learning_rate": 9.97854078003585e-06, + "loss": 0.429, + "step": 933 + }, + { + "epoch": 0.06, + "grad_norm": 7.149434927746202, + "learning_rate": 9.97844641804089e-06, + "loss": 0.4543, + "step": 934 + }, + { + "epoch": 0.06, + "grad_norm": 2.4435713127937873, + "learning_rate": 9.978351849481347e-06, + "loss": 0.4593, + "step": 935 + }, + { + "epoch": 0.06, + "grad_norm": 2.406142026912929, + "learning_rate": 9.978257074361142e-06, + "loss": 0.4534, + "step": 936 + }, + { + "epoch": 0.06, + "grad_norm": 2.09372384551579, + "learning_rate": 9.97816209268421e-06, + "loss": 0.4175, + "step": 937 + }, + { + "epoch": 0.06, + "grad_norm": 2.0932196752914467, + "learning_rate": 9.978066904454489e-06, + "loss": 0.4157, + "step": 938 + }, + { + "epoch": 0.06, + "grad_norm": 2.9765605268754234, + "learning_rate": 9.97797150967593e-06, + "loss": 0.4274, + "step": 939 + }, + { + "epoch": 0.06, + "grad_norm": 2.193357439145338, + "learning_rate": 9.977875908352493e-06, + "loss": 0.4479, + "step": 940 + }, + { + "epoch": 0.06, + "grad_norm": 2.1720783248148368, + "learning_rate": 9.977780100488142e-06, + "loss": 0.4759, + "step": 941 + }, + { + "epoch": 0.06, + "grad_norm": 10.947354107383864, + "learning_rate": 9.977684086086853e-06, + "loss": 0.4512, + "step": 942 + }, + { + "epoch": 0.06, + "grad_norm": 3.8346989509604015, + "learning_rate": 9.977587865152609e-06, + "loss": 0.4239, + "step": 943 + }, + { + "epoch": 0.06, + "grad_norm": 2.7483032110461862, + "learning_rate": 9.977491437689403e-06, + "loss": 0.4594, + "step": 944 + }, + { + "epoch": 0.06, + "grad_norm": 2.0645624753941836, + "learning_rate": 9.977394803701238e-06, + "loss": 0.4333, + "step": 945 + }, + { + "epoch": 0.06, + "grad_norm": 1.7286216204365414, + "learning_rate": 9.97729796319212e-06, + "loss": 0.4436, + "step": 946 + }, + { + "epoch": 0.06, + "grad_norm": 10.362368679846375, + "learning_rate": 9.97720091616607e-06, + "loss": 0.4532, + "step": 947 + }, + { + "epoch": 0.06, + "grad_norm": 2.3733509548490734, + "learning_rate": 9.977103662627112e-06, + "loss": 0.4085, + "step": 948 + }, + { + "epoch": 0.06, + "grad_norm": 7.704821619867478, + "learning_rate": 9.977006202579284e-06, + "loss": 0.4379, + "step": 949 + }, + { + "epoch": 0.06, + "grad_norm": 5.910476077056555, + "learning_rate": 9.976908536026628e-06, + "loss": 0.4538, + "step": 950 + }, + { + "epoch": 0.06, + "grad_norm": 2.498882365552425, + "learning_rate": 9.976810662973198e-06, + "loss": 0.4737, + "step": 951 + }, + { + "epoch": 0.06, + "grad_norm": 2.8904724739380527, + "learning_rate": 9.976712583423053e-06, + "loss": 0.4469, + "step": 952 + }, + { + "epoch": 0.06, + "grad_norm": 2.05992217119996, + "learning_rate": 9.976614297380264e-06, + "loss": 0.4107, + "step": 953 + }, + { + "epoch": 0.06, + "grad_norm": 2.9441060804384573, + "learning_rate": 9.976515804848907e-06, + "loss": 0.4377, + "step": 954 + }, + { + "epoch": 0.06, + "grad_norm": 2.1253849536855154, + "learning_rate": 9.97641710583307e-06, + "loss": 0.4028, + "step": 955 + }, + { + "epoch": 0.06, + "grad_norm": 2.6826962564161625, + "learning_rate": 9.97631820033685e-06, + "loss": 0.4501, + "step": 956 + }, + { + "epoch": 0.06, + "grad_norm": 1.9882186886883442, + "learning_rate": 9.976219088364347e-06, + "loss": 0.4102, + "step": 957 + }, + { + "epoch": 0.06, + "grad_norm": 2.3046200112231214, + "learning_rate": 9.976119769919677e-06, + "loss": 0.4329, + "step": 958 + }, + { + "epoch": 0.06, + "grad_norm": 2.461627477595595, + "learning_rate": 9.976020245006957e-06, + "loss": 0.4451, + "step": 959 + }, + { + "epoch": 0.06, + "grad_norm": 0.8276651557840978, + "learning_rate": 9.97592051363032e-06, + "loss": 0.5161, + "step": 960 + }, + { + "epoch": 0.06, + "grad_norm": 2.5219751753702964, + "learning_rate": 9.975820575793902e-06, + "loss": 0.4213, + "step": 961 + }, + { + "epoch": 0.06, + "grad_norm": 2.6107616287879676, + "learning_rate": 9.975720431501851e-06, + "loss": 0.427, + "step": 962 + }, + { + "epoch": 0.06, + "grad_norm": 1.973225956565384, + "learning_rate": 9.975620080758321e-06, + "loss": 0.431, + "step": 963 + }, + { + "epoch": 0.06, + "grad_norm": 3.6719909709611906, + "learning_rate": 9.975519523567477e-06, + "loss": 0.4253, + "step": 964 + }, + { + "epoch": 0.06, + "grad_norm": 2.450332923063095, + "learning_rate": 9.97541875993349e-06, + "loss": 0.4618, + "step": 965 + }, + { + "epoch": 0.06, + "grad_norm": 2.2104242610244804, + "learning_rate": 9.97531778986054e-06, + "loss": 0.4643, + "step": 966 + }, + { + "epoch": 0.06, + "grad_norm": 9.771495474982805, + "learning_rate": 9.975216613352818e-06, + "loss": 0.44, + "step": 967 + }, + { + "epoch": 0.06, + "grad_norm": 1.79462285334802, + "learning_rate": 9.975115230414524e-06, + "loss": 0.4188, + "step": 968 + }, + { + "epoch": 0.06, + "grad_norm": 4.115747433755278, + "learning_rate": 9.97501364104986e-06, + "loss": 0.4444, + "step": 969 + }, + { + "epoch": 0.06, + "grad_norm": 2.6015233518485714, + "learning_rate": 9.974911845263045e-06, + "loss": 0.4033, + "step": 970 + }, + { + "epoch": 0.06, + "grad_norm": 2.4811623886152963, + "learning_rate": 9.974809843058299e-06, + "loss": 0.4037, + "step": 971 + }, + { + "epoch": 0.06, + "grad_norm": 2.290643090711748, + "learning_rate": 9.974707634439858e-06, + "loss": 0.4017, + "step": 972 + }, + { + "epoch": 0.06, + "grad_norm": 1.8200089712297691, + "learning_rate": 9.974605219411962e-06, + "loss": 0.4622, + "step": 973 + }, + { + "epoch": 0.06, + "grad_norm": 1.3624872532897834, + "learning_rate": 9.974502597978858e-06, + "loss": 0.4331, + "step": 974 + }, + { + "epoch": 0.06, + "grad_norm": 5.27210189356096, + "learning_rate": 9.974399770144807e-06, + "loss": 0.4147, + "step": 975 + }, + { + "epoch": 0.06, + "grad_norm": 2.4383478828633045, + "learning_rate": 9.974296735914072e-06, + "loss": 0.4111, + "step": 976 + }, + { + "epoch": 0.06, + "grad_norm": 2.056648793814065, + "learning_rate": 9.974193495290931e-06, + "loss": 0.44, + "step": 977 + }, + { + "epoch": 0.06, + "grad_norm": 1.879102672129695, + "learning_rate": 9.974090048279666e-06, + "loss": 0.4472, + "step": 978 + }, + { + "epoch": 0.06, + "grad_norm": 1.8466662539655996, + "learning_rate": 9.973986394884571e-06, + "loss": 0.4116, + "step": 979 + }, + { + "epoch": 0.06, + "grad_norm": 2.037496370016901, + "learning_rate": 9.973882535109944e-06, + "loss": 0.4803, + "step": 980 + }, + { + "epoch": 0.06, + "grad_norm": 1.9611846114870142, + "learning_rate": 9.973778468960099e-06, + "loss": 0.4179, + "step": 981 + }, + { + "epoch": 0.06, + "grad_norm": 3.0710228622316924, + "learning_rate": 9.973674196439349e-06, + "loss": 0.4056, + "step": 982 + }, + { + "epoch": 0.06, + "grad_norm": 1.8868523869531129, + "learning_rate": 9.973569717552022e-06, + "loss": 0.4231, + "step": 983 + }, + { + "epoch": 0.06, + "grad_norm": 3.5643636961505556, + "learning_rate": 9.973465032302455e-06, + "loss": 0.4403, + "step": 984 + }, + { + "epoch": 0.06, + "grad_norm": 1.8418721685888957, + "learning_rate": 9.973360140694987e-06, + "loss": 0.4365, + "step": 985 + }, + { + "epoch": 0.06, + "grad_norm": 5.678896806142568, + "learning_rate": 9.973255042733976e-06, + "loss": 0.3718, + "step": 986 + }, + { + "epoch": 0.06, + "grad_norm": 2.2221814726497073, + "learning_rate": 9.973149738423779e-06, + "loss": 0.405, + "step": 987 + }, + { + "epoch": 0.06, + "grad_norm": 2.2032963475640974, + "learning_rate": 9.973044227768765e-06, + "loss": 0.4615, + "step": 988 + }, + { + "epoch": 0.06, + "grad_norm": 1.6657263537472482, + "learning_rate": 9.972938510773313e-06, + "loss": 0.4407, + "step": 989 + }, + { + "epoch": 0.06, + "grad_norm": 3.7025416284652275, + "learning_rate": 9.972832587441811e-06, + "loss": 0.4401, + "step": 990 + }, + { + "epoch": 0.06, + "grad_norm": 2.0383827337350766, + "learning_rate": 9.972726457778651e-06, + "loss": 0.4169, + "step": 991 + }, + { + "epoch": 0.06, + "grad_norm": 2.36485014621816, + "learning_rate": 9.972620121788238e-06, + "loss": 0.4141, + "step": 992 + }, + { + "epoch": 0.06, + "grad_norm": 1.4310161874528062, + "learning_rate": 9.972513579474982e-06, + "loss": 0.4076, + "step": 993 + }, + { + "epoch": 0.06, + "grad_norm": 4.305654451233364, + "learning_rate": 9.97240683084331e-06, + "loss": 0.431, + "step": 994 + }, + { + "epoch": 0.06, + "grad_norm": 2.1923634812750334, + "learning_rate": 9.972299875897641e-06, + "loss": 0.3946, + "step": 995 + }, + { + "epoch": 0.06, + "grad_norm": 1.7901803929292885, + "learning_rate": 9.972192714642422e-06, + "loss": 0.4345, + "step": 996 + }, + { + "epoch": 0.06, + "grad_norm": 1.465272511542452, + "learning_rate": 9.972085347082094e-06, + "loss": 0.4202, + "step": 997 + }, + { + "epoch": 0.06, + "grad_norm": 1.5262679006939772, + "learning_rate": 9.971977773221115e-06, + "loss": 0.4095, + "step": 998 + }, + { + "epoch": 0.06, + "grad_norm": 1.4850430263095706, + "learning_rate": 9.971869993063947e-06, + "loss": 0.4032, + "step": 999 + }, + { + "epoch": 0.06, + "grad_norm": 1.4255973979231296, + "learning_rate": 9.97176200661506e-06, + "loss": 0.4027, + "step": 1000 + }, + { + "epoch": 0.06, + "grad_norm": 0.9043882780359147, + "learning_rate": 9.971653813878937e-06, + "loss": 0.4484, + "step": 1001 + }, + { + "epoch": 0.06, + "grad_norm": 2.1930700474658518, + "learning_rate": 9.971545414860067e-06, + "loss": 0.4517, + "step": 1002 + }, + { + "epoch": 0.06, + "grad_norm": 2.1065434574616355, + "learning_rate": 9.971436809562948e-06, + "loss": 0.4328, + "step": 1003 + }, + { + "epoch": 0.06, + "grad_norm": 1.7849230866270132, + "learning_rate": 9.971327997992085e-06, + "loss": 0.4287, + "step": 1004 + }, + { + "epoch": 0.06, + "grad_norm": 2.079922648594418, + "learning_rate": 9.971218980151993e-06, + "loss": 0.4515, + "step": 1005 + }, + { + "epoch": 0.06, + "grad_norm": 2.026995958465551, + "learning_rate": 9.971109756047197e-06, + "loss": 0.4135, + "step": 1006 + }, + { + "epoch": 0.06, + "grad_norm": 2.3688983241560737, + "learning_rate": 9.971000325682225e-06, + "loss": 0.4248, + "step": 1007 + }, + { + "epoch": 0.06, + "grad_norm": 1.4013386275445587, + "learning_rate": 9.970890689061622e-06, + "loss": 0.4466, + "step": 1008 + }, + { + "epoch": 0.06, + "grad_norm": 1.4509037141099472, + "learning_rate": 9.970780846189934e-06, + "loss": 0.4171, + "step": 1009 + }, + { + "epoch": 0.06, + "grad_norm": 1.894729520330879, + "learning_rate": 9.970670797071719e-06, + "loss": 0.4144, + "step": 1010 + }, + { + "epoch": 0.06, + "grad_norm": 4.0500302607849745, + "learning_rate": 9.970560541711547e-06, + "loss": 0.4362, + "step": 1011 + }, + { + "epoch": 0.06, + "grad_norm": 1.8937281240678299, + "learning_rate": 9.970450080113988e-06, + "loss": 0.4361, + "step": 1012 + }, + { + "epoch": 0.06, + "grad_norm": 2.432129687251775, + "learning_rate": 9.970339412283624e-06, + "loss": 0.4471, + "step": 1013 + }, + { + "epoch": 0.06, + "grad_norm": 1.7322069181436954, + "learning_rate": 9.970228538225051e-06, + "loss": 0.4365, + "step": 1014 + }, + { + "epoch": 0.06, + "grad_norm": 1.5036771104025033, + "learning_rate": 9.97011745794287e-06, + "loss": 0.4034, + "step": 1015 + }, + { + "epoch": 0.06, + "grad_norm": 2.4627252751655098, + "learning_rate": 9.970006171441684e-06, + "loss": 0.4404, + "step": 1016 + }, + { + "epoch": 0.06, + "grad_norm": 2.5231895430952793, + "learning_rate": 9.969894678726118e-06, + "loss": 0.3948, + "step": 1017 + }, + { + "epoch": 0.06, + "grad_norm": 1.5009359351294047, + "learning_rate": 9.969782979800791e-06, + "loss": 0.428, + "step": 1018 + }, + { + "epoch": 0.06, + "grad_norm": 1.6515350137500342, + "learning_rate": 9.969671074670341e-06, + "loss": 0.4681, + "step": 1019 + }, + { + "epoch": 0.06, + "grad_norm": 2.1735423046277487, + "learning_rate": 9.969558963339414e-06, + "loss": 0.4073, + "step": 1020 + }, + { + "epoch": 0.06, + "grad_norm": 1.6301082636370194, + "learning_rate": 9.969446645812656e-06, + "loss": 0.4251, + "step": 1021 + }, + { + "epoch": 0.06, + "grad_norm": 3.353792866562552, + "learning_rate": 9.969334122094732e-06, + "loss": 0.4101, + "step": 1022 + }, + { + "epoch": 0.06, + "grad_norm": 2.4371540032356265, + "learning_rate": 9.969221392190307e-06, + "loss": 0.497, + "step": 1023 + }, + { + "epoch": 0.06, + "grad_norm": 1.8390282407206398, + "learning_rate": 9.96910845610406e-06, + "loss": 0.4329, + "step": 1024 + }, + { + "epoch": 0.06, + "grad_norm": 2.9311063342477017, + "learning_rate": 9.968995313840678e-06, + "loss": 0.4064, + "step": 1025 + }, + { + "epoch": 0.06, + "grad_norm": 2.249935201981566, + "learning_rate": 9.968881965404855e-06, + "loss": 0.3997, + "step": 1026 + }, + { + "epoch": 0.06, + "grad_norm": 1.5217798343803648, + "learning_rate": 9.968768410801292e-06, + "loss": 0.4276, + "step": 1027 + }, + { + "epoch": 0.06, + "grad_norm": 1.3442816268055013, + "learning_rate": 9.968654650034702e-06, + "loss": 0.4113, + "step": 1028 + }, + { + "epoch": 0.06, + "grad_norm": 1.8157652085876863, + "learning_rate": 9.968540683109806e-06, + "loss": 0.4587, + "step": 1029 + }, + { + "epoch": 0.06, + "grad_norm": 1.5606612589384359, + "learning_rate": 9.968426510031333e-06, + "loss": 0.397, + "step": 1030 + }, + { + "epoch": 0.06, + "grad_norm": 3.1053549705834635, + "learning_rate": 9.968312130804019e-06, + "loss": 0.395, + "step": 1031 + }, + { + "epoch": 0.06, + "grad_norm": 2.1756119353760828, + "learning_rate": 9.968197545432608e-06, + "loss": 0.4166, + "step": 1032 + }, + { + "epoch": 0.06, + "grad_norm": 2.0672867495576734, + "learning_rate": 9.968082753921857e-06, + "loss": 0.4351, + "step": 1033 + }, + { + "epoch": 0.07, + "grad_norm": 2.497903565945721, + "learning_rate": 9.967967756276528e-06, + "loss": 0.4164, + "step": 1034 + }, + { + "epoch": 0.07, + "grad_norm": 1.6953064491487724, + "learning_rate": 9.967852552501394e-06, + "loss": 0.453, + "step": 1035 + }, + { + "epoch": 0.07, + "grad_norm": 3.212831139459771, + "learning_rate": 9.967737142601233e-06, + "loss": 0.4428, + "step": 1036 + }, + { + "epoch": 0.07, + "grad_norm": 9.885711416467117, + "learning_rate": 9.967621526580836e-06, + "loss": 0.442, + "step": 1037 + }, + { + "epoch": 0.07, + "grad_norm": 4.419841719673243, + "learning_rate": 9.967505704444996e-06, + "loss": 0.4371, + "step": 1038 + }, + { + "epoch": 0.07, + "grad_norm": 2.1092049370758943, + "learning_rate": 9.967389676198522e-06, + "loss": 0.4586, + "step": 1039 + }, + { + "epoch": 0.07, + "grad_norm": 1.5785517336382928, + "learning_rate": 9.967273441846227e-06, + "loss": 0.4596, + "step": 1040 + }, + { + "epoch": 0.07, + "grad_norm": 2.1897340497462054, + "learning_rate": 9.967157001392933e-06, + "loss": 0.4071, + "step": 1041 + }, + { + "epoch": 0.07, + "grad_norm": 1.7182912408548263, + "learning_rate": 9.967040354843473e-06, + "loss": 0.4325, + "step": 1042 + }, + { + "epoch": 0.07, + "grad_norm": 1.9268781515312112, + "learning_rate": 9.966923502202688e-06, + "loss": 0.4316, + "step": 1043 + }, + { + "epoch": 0.07, + "grad_norm": 2.8581954392547293, + "learning_rate": 9.966806443475423e-06, + "loss": 0.4576, + "step": 1044 + }, + { + "epoch": 0.07, + "grad_norm": 2.2620286989720024, + "learning_rate": 9.966689178666537e-06, + "loss": 0.4151, + "step": 1045 + }, + { + "epoch": 0.07, + "grad_norm": 1.9879301501050124, + "learning_rate": 9.966571707780894e-06, + "loss": 0.3877, + "step": 1046 + }, + { + "epoch": 0.07, + "grad_norm": 1.5951202653168173, + "learning_rate": 9.96645403082337e-06, + "loss": 0.4672, + "step": 1047 + }, + { + "epoch": 0.07, + "grad_norm": 1.892616323634653, + "learning_rate": 9.966336147798848e-06, + "loss": 0.436, + "step": 1048 + }, + { + "epoch": 0.07, + "grad_norm": 3.5057467579778137, + "learning_rate": 9.966218058712218e-06, + "loss": 0.4144, + "step": 1049 + }, + { + "epoch": 0.07, + "grad_norm": 2.5211997236715327, + "learning_rate": 9.966099763568377e-06, + "loss": 0.3761, + "step": 1050 + }, + { + "epoch": 0.07, + "grad_norm": 2.036891383244653, + "learning_rate": 9.96598126237224e-06, + "loss": 0.4391, + "step": 1051 + }, + { + "epoch": 0.07, + "grad_norm": 3.3484219984213954, + "learning_rate": 9.965862555128717e-06, + "loss": 0.3985, + "step": 1052 + }, + { + "epoch": 0.07, + "grad_norm": 3.5805476026012735, + "learning_rate": 9.965743641842737e-06, + "loss": 0.4209, + "step": 1053 + }, + { + "epoch": 0.07, + "grad_norm": 8.843236713409933, + "learning_rate": 9.965624522519233e-06, + "loss": 0.4373, + "step": 1054 + }, + { + "epoch": 0.07, + "grad_norm": 2.0949125824287727, + "learning_rate": 9.965505197163148e-06, + "loss": 0.4292, + "step": 1055 + }, + { + "epoch": 0.07, + "grad_norm": 2.22939719849107, + "learning_rate": 9.965385665779432e-06, + "loss": 0.4244, + "step": 1056 + }, + { + "epoch": 0.07, + "grad_norm": 1.6796170096341314, + "learning_rate": 9.965265928373045e-06, + "loss": 0.3717, + "step": 1057 + }, + { + "epoch": 0.07, + "grad_norm": 1.8208949409810051, + "learning_rate": 9.965145984948956e-06, + "loss": 0.4295, + "step": 1058 + }, + { + "epoch": 0.07, + "grad_norm": 1.6048724862682364, + "learning_rate": 9.96502583551214e-06, + "loss": 0.4128, + "step": 1059 + }, + { + "epoch": 0.07, + "grad_norm": 2.182544860851816, + "learning_rate": 9.964905480067585e-06, + "loss": 0.4421, + "step": 1060 + }, + { + "epoch": 0.07, + "grad_norm": 2.8770526281876805, + "learning_rate": 9.964784918620284e-06, + "loss": 0.4144, + "step": 1061 + }, + { + "epoch": 0.07, + "grad_norm": 2.0502433368725024, + "learning_rate": 9.964664151175235e-06, + "loss": 0.4149, + "step": 1062 + }, + { + "epoch": 0.07, + "grad_norm": 1.6852493174725927, + "learning_rate": 9.964543177737453e-06, + "loss": 0.4279, + "step": 1063 + }, + { + "epoch": 0.07, + "grad_norm": 2.3863314076720705, + "learning_rate": 9.964421998311957e-06, + "loss": 0.4309, + "step": 1064 + }, + { + "epoch": 0.07, + "grad_norm": 1.9774735470449674, + "learning_rate": 9.964300612903775e-06, + "loss": 0.3857, + "step": 1065 + }, + { + "epoch": 0.07, + "grad_norm": 1.7865301839072318, + "learning_rate": 9.964179021517943e-06, + "loss": 0.4374, + "step": 1066 + }, + { + "epoch": 0.07, + "grad_norm": 1.9105237189090487, + "learning_rate": 9.964057224159505e-06, + "loss": 0.434, + "step": 1067 + }, + { + "epoch": 0.07, + "grad_norm": 1.0308961812961612, + "learning_rate": 9.963935220833516e-06, + "loss": 0.5032, + "step": 1068 + }, + { + "epoch": 0.07, + "grad_norm": 6.567470420085035, + "learning_rate": 9.963813011545039e-06, + "loss": 0.4449, + "step": 1069 + }, + { + "epoch": 0.07, + "grad_norm": 1.9238137775653517, + "learning_rate": 9.963690596299142e-06, + "loss": 0.4169, + "step": 1070 + }, + { + "epoch": 0.07, + "grad_norm": 2.224238352628365, + "learning_rate": 9.963567975100906e-06, + "loss": 0.4024, + "step": 1071 + }, + { + "epoch": 0.07, + "grad_norm": 1.8992630344406685, + "learning_rate": 9.963445147955417e-06, + "loss": 0.4147, + "step": 1072 + }, + { + "epoch": 0.07, + "grad_norm": 2.146992246234404, + "learning_rate": 9.963322114867775e-06, + "loss": 0.4716, + "step": 1073 + }, + { + "epoch": 0.07, + "grad_norm": 5.068810859692916, + "learning_rate": 9.963198875843082e-06, + "loss": 0.4104, + "step": 1074 + }, + { + "epoch": 0.07, + "grad_norm": 2.121934838520355, + "learning_rate": 9.963075430886451e-06, + "loss": 0.4189, + "step": 1075 + }, + { + "epoch": 0.07, + "grad_norm": 7.8409386562456875, + "learning_rate": 9.962951780003005e-06, + "loss": 0.4545, + "step": 1076 + }, + { + "epoch": 0.07, + "grad_norm": 2.44691412556006, + "learning_rate": 9.962827923197875e-06, + "loss": 0.4159, + "step": 1077 + }, + { + "epoch": 0.07, + "grad_norm": 1.8674712042238808, + "learning_rate": 9.9627038604762e-06, + "loss": 0.4232, + "step": 1078 + }, + { + "epoch": 0.07, + "grad_norm": 2.7475315454587137, + "learning_rate": 9.962579591843126e-06, + "loss": 0.4344, + "step": 1079 + }, + { + "epoch": 0.07, + "grad_norm": 1.9700184410379884, + "learning_rate": 9.962455117303813e-06, + "loss": 0.385, + "step": 1080 + }, + { + "epoch": 0.07, + "grad_norm": 3.734268009587939, + "learning_rate": 9.96233043686342e-06, + "loss": 0.4488, + "step": 1081 + }, + { + "epoch": 0.07, + "grad_norm": 2.3792889161990405, + "learning_rate": 9.962205550527124e-06, + "loss": 0.4301, + "step": 1082 + }, + { + "epoch": 0.07, + "grad_norm": 3.816078388293769, + "learning_rate": 9.962080458300105e-06, + "loss": 0.4468, + "step": 1083 + }, + { + "epoch": 0.07, + "grad_norm": 1.786762985149165, + "learning_rate": 9.961955160187555e-06, + "loss": 0.4022, + "step": 1084 + }, + { + "epoch": 0.07, + "grad_norm": 1.6825212039123876, + "learning_rate": 9.961829656194672e-06, + "loss": 0.4142, + "step": 1085 + }, + { + "epoch": 0.07, + "grad_norm": 3.4717471390365557, + "learning_rate": 9.961703946326664e-06, + "loss": 0.4195, + "step": 1086 + }, + { + "epoch": 0.07, + "grad_norm": 1.7048649803787788, + "learning_rate": 9.961578030588746e-06, + "loss": 0.3955, + "step": 1087 + }, + { + "epoch": 0.07, + "grad_norm": 3.1645420543212817, + "learning_rate": 9.961451908986142e-06, + "loss": 0.4688, + "step": 1088 + }, + { + "epoch": 0.07, + "grad_norm": 2.104394571412978, + "learning_rate": 9.961325581524086e-06, + "loss": 0.4157, + "step": 1089 + }, + { + "epoch": 0.07, + "grad_norm": 1.7553422306302857, + "learning_rate": 9.961199048207819e-06, + "loss": 0.4036, + "step": 1090 + }, + { + "epoch": 0.07, + "grad_norm": 1.3138483785122332, + "learning_rate": 9.961072309042592e-06, + "loss": 0.5112, + "step": 1091 + }, + { + "epoch": 0.07, + "grad_norm": 2.5564848942173555, + "learning_rate": 9.960945364033662e-06, + "loss": 0.4448, + "step": 1092 + }, + { + "epoch": 0.07, + "grad_norm": 0.9577214807056948, + "learning_rate": 9.9608182131863e-06, + "loss": 0.4981, + "step": 1093 + }, + { + "epoch": 0.07, + "grad_norm": 2.209846204917101, + "learning_rate": 9.960690856505774e-06, + "loss": 0.4315, + "step": 1094 + }, + { + "epoch": 0.07, + "grad_norm": 3.3829740417530734, + "learning_rate": 9.960563293997377e-06, + "loss": 0.4202, + "step": 1095 + }, + { + "epoch": 0.07, + "grad_norm": 3.6975487269010596, + "learning_rate": 9.960435525666397e-06, + "loss": 0.4224, + "step": 1096 + }, + { + "epoch": 0.07, + "grad_norm": 3.1979383948438187, + "learning_rate": 9.960307551518135e-06, + "loss": 0.4143, + "step": 1097 + }, + { + "epoch": 0.07, + "grad_norm": 1.6130717949337823, + "learning_rate": 9.960179371557905e-06, + "loss": 0.4302, + "step": 1098 + }, + { + "epoch": 0.07, + "grad_norm": 13.193657055310098, + "learning_rate": 9.960050985791021e-06, + "loss": 0.4207, + "step": 1099 + }, + { + "epoch": 0.07, + "grad_norm": 1.8108961622421267, + "learning_rate": 9.959922394222811e-06, + "loss": 0.4081, + "step": 1100 + }, + { + "epoch": 0.07, + "grad_norm": 1.8118247307835853, + "learning_rate": 9.959793596858614e-06, + "loss": 0.4129, + "step": 1101 + }, + { + "epoch": 0.07, + "grad_norm": 1.982502282567653, + "learning_rate": 9.959664593703769e-06, + "loss": 0.4397, + "step": 1102 + }, + { + "epoch": 0.07, + "grad_norm": 1.6872548116434403, + "learning_rate": 9.95953538476363e-06, + "loss": 0.4436, + "step": 1103 + }, + { + "epoch": 0.07, + "grad_norm": 1.8915972803886736, + "learning_rate": 9.959405970043558e-06, + "loss": 0.4035, + "step": 1104 + }, + { + "epoch": 0.07, + "grad_norm": 2.005443454230022, + "learning_rate": 9.959276349548926e-06, + "loss": 0.3938, + "step": 1105 + }, + { + "epoch": 0.07, + "grad_norm": 2.1111697114362995, + "learning_rate": 9.959146523285108e-06, + "loss": 0.4682, + "step": 1106 + }, + { + "epoch": 0.07, + "grad_norm": 4.906603648905359, + "learning_rate": 9.959016491257491e-06, + "loss": 0.4771, + "step": 1107 + }, + { + "epoch": 0.07, + "grad_norm": 1.8274519497356905, + "learning_rate": 9.958886253471474e-06, + "loss": 0.4155, + "step": 1108 + }, + { + "epoch": 0.07, + "grad_norm": 2.3098968910760096, + "learning_rate": 9.958755809932457e-06, + "loss": 0.4178, + "step": 1109 + }, + { + "epoch": 0.07, + "grad_norm": 1.9589235520258326, + "learning_rate": 9.958625160645855e-06, + "loss": 0.4624, + "step": 1110 + }, + { + "epoch": 0.07, + "grad_norm": 3.923709855815237, + "learning_rate": 9.958494305617087e-06, + "loss": 0.3995, + "step": 1111 + }, + { + "epoch": 0.07, + "grad_norm": 6.103399335480662, + "learning_rate": 9.958363244851584e-06, + "loss": 0.4158, + "step": 1112 + }, + { + "epoch": 0.07, + "grad_norm": 5.800277095827046, + "learning_rate": 9.95823197835478e-06, + "loss": 0.4377, + "step": 1113 + }, + { + "epoch": 0.07, + "grad_norm": 1.8306837780654002, + "learning_rate": 9.958100506132127e-06, + "loss": 0.4303, + "step": 1114 + }, + { + "epoch": 0.07, + "grad_norm": 4.907673716085669, + "learning_rate": 9.957968828189076e-06, + "loss": 0.4096, + "step": 1115 + }, + { + "epoch": 0.07, + "grad_norm": 2.076372980055298, + "learning_rate": 9.957836944531091e-06, + "loss": 0.4238, + "step": 1116 + }, + { + "epoch": 0.07, + "grad_norm": 1.5765367603892482, + "learning_rate": 9.957704855163648e-06, + "loss": 0.4335, + "step": 1117 + }, + { + "epoch": 0.07, + "grad_norm": 4.268987081114445, + "learning_rate": 9.957572560092223e-06, + "loss": 0.424, + "step": 1118 + }, + { + "epoch": 0.07, + "grad_norm": 2.5252904351103718, + "learning_rate": 9.957440059322308e-06, + "loss": 0.4293, + "step": 1119 + }, + { + "epoch": 0.07, + "grad_norm": 3.5133172714908736, + "learning_rate": 9.957307352859397e-06, + "loss": 0.4297, + "step": 1120 + }, + { + "epoch": 0.07, + "grad_norm": 1.894375399108354, + "learning_rate": 9.957174440709e-06, + "loss": 0.4009, + "step": 1121 + }, + { + "epoch": 0.07, + "grad_norm": 5.095415619491392, + "learning_rate": 9.957041322876632e-06, + "loss": 0.4252, + "step": 1122 + }, + { + "epoch": 0.07, + "grad_norm": 2.1931537983196945, + "learning_rate": 9.956907999367815e-06, + "loss": 0.4436, + "step": 1123 + }, + { + "epoch": 0.07, + "grad_norm": 2.1922907199920667, + "learning_rate": 9.956774470188079e-06, + "loss": 0.427, + "step": 1124 + }, + { + "epoch": 0.07, + "grad_norm": 3.1401379075208133, + "learning_rate": 9.956640735342966e-06, + "loss": 0.4364, + "step": 1125 + }, + { + "epoch": 0.07, + "grad_norm": 2.34329941398725, + "learning_rate": 9.956506794838025e-06, + "loss": 0.4483, + "step": 1126 + }, + { + "epoch": 0.07, + "grad_norm": 2.9979221997681402, + "learning_rate": 9.956372648678814e-06, + "loss": 0.4196, + "step": 1127 + }, + { + "epoch": 0.07, + "grad_norm": 2.8312909577519454, + "learning_rate": 9.956238296870898e-06, + "loss": 0.4273, + "step": 1128 + }, + { + "epoch": 0.07, + "grad_norm": 2.130057516734946, + "learning_rate": 9.956103739419852e-06, + "loss": 0.3799, + "step": 1129 + }, + { + "epoch": 0.07, + "grad_norm": 4.3755194095153955, + "learning_rate": 9.955968976331258e-06, + "loss": 0.4219, + "step": 1130 + }, + { + "epoch": 0.07, + "grad_norm": 2.2437667514805772, + "learning_rate": 9.95583400761071e-06, + "loss": 0.4259, + "step": 1131 + }, + { + "epoch": 0.07, + "grad_norm": 2.1191427593579175, + "learning_rate": 9.955698833263805e-06, + "loss": 0.4436, + "step": 1132 + }, + { + "epoch": 0.07, + "grad_norm": 2.3600928176122453, + "learning_rate": 9.955563453296154e-06, + "loss": 0.4326, + "step": 1133 + }, + { + "epoch": 0.07, + "grad_norm": 5.174961780325701, + "learning_rate": 9.955427867713372e-06, + "loss": 0.4094, + "step": 1134 + }, + { + "epoch": 0.07, + "grad_norm": 2.103957737047937, + "learning_rate": 9.955292076521088e-06, + "loss": 0.4169, + "step": 1135 + }, + { + "epoch": 0.07, + "grad_norm": 2.719030182825631, + "learning_rate": 9.955156079724932e-06, + "loss": 0.4353, + "step": 1136 + }, + { + "epoch": 0.07, + "grad_norm": 6.67841001777247, + "learning_rate": 9.955019877330549e-06, + "loss": 0.3822, + "step": 1137 + }, + { + "epoch": 0.07, + "grad_norm": 1.6936061193769136, + "learning_rate": 9.95488346934359e-06, + "loss": 0.4211, + "step": 1138 + }, + { + "epoch": 0.07, + "grad_norm": 1.8270868845291233, + "learning_rate": 9.954746855769717e-06, + "loss": 0.4136, + "step": 1139 + }, + { + "epoch": 0.07, + "grad_norm": 3.942249183739099, + "learning_rate": 9.954610036614595e-06, + "loss": 0.4024, + "step": 1140 + }, + { + "epoch": 0.07, + "grad_norm": 5.315704927234245, + "learning_rate": 9.954473011883903e-06, + "loss": 0.4155, + "step": 1141 + }, + { + "epoch": 0.07, + "grad_norm": 1.843726684819333, + "learning_rate": 9.954335781583326e-06, + "loss": 0.4286, + "step": 1142 + }, + { + "epoch": 0.07, + "grad_norm": 1.640572597785077, + "learning_rate": 9.954198345718556e-06, + "loss": 0.4089, + "step": 1143 + }, + { + "epoch": 0.07, + "grad_norm": 2.0486714901349705, + "learning_rate": 9.954060704295298e-06, + "loss": 0.3875, + "step": 1144 + }, + { + "epoch": 0.07, + "grad_norm": 2.6723275491556393, + "learning_rate": 9.953922857319261e-06, + "loss": 0.4276, + "step": 1145 + }, + { + "epoch": 0.07, + "grad_norm": 6.724395943504821, + "learning_rate": 9.953784804796167e-06, + "loss": 0.401, + "step": 1146 + }, + { + "epoch": 0.07, + "grad_norm": 2.2957490366865714, + "learning_rate": 9.95364654673174e-06, + "loss": 0.4555, + "step": 1147 + }, + { + "epoch": 0.07, + "grad_norm": 2.170708342495129, + "learning_rate": 9.953508083131722e-06, + "loss": 0.4553, + "step": 1148 + }, + { + "epoch": 0.07, + "grad_norm": 2.8269200120497504, + "learning_rate": 9.953369414001853e-06, + "loss": 0.4604, + "step": 1149 + }, + { + "epoch": 0.07, + "grad_norm": 1.901983238895689, + "learning_rate": 9.95323053934789e-06, + "loss": 0.4097, + "step": 1150 + }, + { + "epoch": 0.07, + "grad_norm": 1.4885513730789306, + "learning_rate": 9.953091459175595e-06, + "loss": 0.426, + "step": 1151 + }, + { + "epoch": 0.07, + "grad_norm": 12.654939040249722, + "learning_rate": 9.952952173490735e-06, + "loss": 0.3987, + "step": 1152 + }, + { + "epoch": 0.07, + "grad_norm": 2.620085677907411, + "learning_rate": 9.952812682299093e-06, + "loss": 0.4093, + "step": 1153 + }, + { + "epoch": 0.07, + "grad_norm": 2.807227735743264, + "learning_rate": 9.952672985606457e-06, + "loss": 0.3993, + "step": 1154 + }, + { + "epoch": 0.07, + "grad_norm": 2.077883064036308, + "learning_rate": 9.95253308341862e-06, + "loss": 0.4111, + "step": 1155 + }, + { + "epoch": 0.07, + "grad_norm": 1.7373548437655704, + "learning_rate": 9.952392975741389e-06, + "loss": 0.3972, + "step": 1156 + }, + { + "epoch": 0.07, + "grad_norm": 2.4599863742025554, + "learning_rate": 9.95225266258058e-06, + "loss": 0.4592, + "step": 1157 + }, + { + "epoch": 0.07, + "grad_norm": 2.8306714239615562, + "learning_rate": 9.952112143942008e-06, + "loss": 0.4112, + "step": 1158 + }, + { + "epoch": 0.07, + "grad_norm": 2.4581276301566324, + "learning_rate": 9.951971419831509e-06, + "loss": 0.4061, + "step": 1159 + }, + { + "epoch": 0.07, + "grad_norm": 2.24926755423996, + "learning_rate": 9.951830490254918e-06, + "loss": 0.4054, + "step": 1160 + }, + { + "epoch": 0.07, + "grad_norm": 3.8121588491986924, + "learning_rate": 9.951689355218088e-06, + "loss": 0.4246, + "step": 1161 + }, + { + "epoch": 0.07, + "grad_norm": 1.8822317861508915, + "learning_rate": 9.951548014726868e-06, + "loss": 0.4248, + "step": 1162 + }, + { + "epoch": 0.07, + "grad_norm": 3.457734983831385, + "learning_rate": 9.951406468787128e-06, + "loss": 0.4024, + "step": 1163 + }, + { + "epoch": 0.07, + "grad_norm": 1.6538322895061792, + "learning_rate": 9.95126471740474e-06, + "loss": 0.4229, + "step": 1164 + }, + { + "epoch": 0.07, + "grad_norm": 2.180062949779296, + "learning_rate": 9.951122760585582e-06, + "loss": 0.3761, + "step": 1165 + }, + { + "epoch": 0.07, + "grad_norm": 2.985139784360479, + "learning_rate": 9.950980598335548e-06, + "loss": 0.4176, + "step": 1166 + }, + { + "epoch": 0.07, + "grad_norm": 3.098600871684249, + "learning_rate": 9.950838230660535e-06, + "loss": 0.4318, + "step": 1167 + }, + { + "epoch": 0.07, + "grad_norm": 2.4070008307892183, + "learning_rate": 9.95069565756645e-06, + "loss": 0.4293, + "step": 1168 + }, + { + "epoch": 0.07, + "grad_norm": 2.2088169739092502, + "learning_rate": 9.950552879059208e-06, + "loss": 0.3926, + "step": 1169 + }, + { + "epoch": 0.07, + "grad_norm": 2.45918126709504, + "learning_rate": 9.950409895144732e-06, + "loss": 0.4406, + "step": 1170 + }, + { + "epoch": 0.07, + "grad_norm": 2.6032233842578774, + "learning_rate": 9.95026670582896e-06, + "loss": 0.3942, + "step": 1171 + }, + { + "epoch": 0.07, + "grad_norm": 2.1697066862160463, + "learning_rate": 9.950123311117828e-06, + "loss": 0.4011, + "step": 1172 + }, + { + "epoch": 0.07, + "grad_norm": 3.169470340729822, + "learning_rate": 9.949979711017288e-06, + "loss": 0.4097, + "step": 1173 + }, + { + "epoch": 0.07, + "grad_norm": 1.9054993124774207, + "learning_rate": 9.949835905533298e-06, + "loss": 0.4122, + "step": 1174 + }, + { + "epoch": 0.07, + "grad_norm": 4.157218184394324, + "learning_rate": 9.949691894671824e-06, + "loss": 0.4312, + "step": 1175 + }, + { + "epoch": 0.07, + "grad_norm": 2.020711252280423, + "learning_rate": 9.949547678438842e-06, + "loss": 0.4044, + "step": 1176 + }, + { + "epoch": 0.07, + "grad_norm": 2.324584809973775, + "learning_rate": 9.949403256840334e-06, + "loss": 0.3903, + "step": 1177 + }, + { + "epoch": 0.07, + "grad_norm": 2.0822794369792312, + "learning_rate": 9.949258629882295e-06, + "loss": 0.4084, + "step": 1178 + }, + { + "epoch": 0.07, + "grad_norm": 1.833722949501575, + "learning_rate": 9.949113797570724e-06, + "loss": 0.3842, + "step": 1179 + }, + { + "epoch": 0.07, + "grad_norm": 2.186114115992921, + "learning_rate": 9.948968759911633e-06, + "loss": 0.4584, + "step": 1180 + }, + { + "epoch": 0.07, + "grad_norm": 5.030559565827121, + "learning_rate": 9.948823516911034e-06, + "loss": 0.4066, + "step": 1181 + }, + { + "epoch": 0.07, + "grad_norm": 1.8127397266979124, + "learning_rate": 9.948678068574959e-06, + "loss": 0.4168, + "step": 1182 + }, + { + "epoch": 0.07, + "grad_norm": 1.9327679928843606, + "learning_rate": 9.948532414909442e-06, + "loss": 0.4453, + "step": 1183 + }, + { + "epoch": 0.07, + "grad_norm": 1.5909469409027934, + "learning_rate": 9.948386555920525e-06, + "loss": 0.4246, + "step": 1184 + }, + { + "epoch": 0.07, + "grad_norm": 1.654456156371072, + "learning_rate": 9.94824049161426e-06, + "loss": 0.3741, + "step": 1185 + }, + { + "epoch": 0.07, + "grad_norm": 3.50992240356473, + "learning_rate": 9.94809422199671e-06, + "loss": 0.432, + "step": 1186 + }, + { + "epoch": 0.07, + "grad_norm": 3.8768086792182466, + "learning_rate": 9.947947747073939e-06, + "loss": 0.3945, + "step": 1187 + }, + { + "epoch": 0.07, + "grad_norm": 4.157471389093322, + "learning_rate": 9.947801066852029e-06, + "loss": 0.4125, + "step": 1188 + }, + { + "epoch": 0.07, + "grad_norm": 3.194982551871048, + "learning_rate": 9.947654181337063e-06, + "loss": 0.4154, + "step": 1189 + }, + { + "epoch": 0.07, + "grad_norm": 2.6524354804752717, + "learning_rate": 9.947507090535139e-06, + "loss": 0.4158, + "step": 1190 + }, + { + "epoch": 0.07, + "grad_norm": 3.4113212327423694, + "learning_rate": 9.947359794452356e-06, + "loss": 0.4155, + "step": 1191 + }, + { + "epoch": 0.07, + "grad_norm": 3.1654870426499024, + "learning_rate": 9.94721229309483e-06, + "loss": 0.4089, + "step": 1192 + }, + { + "epoch": 0.08, + "grad_norm": 2.3239226693757096, + "learning_rate": 9.947064586468677e-06, + "loss": 0.4077, + "step": 1193 + }, + { + "epoch": 0.08, + "grad_norm": 3.045717960251605, + "learning_rate": 9.946916674580028e-06, + "loss": 0.4048, + "step": 1194 + }, + { + "epoch": 0.08, + "grad_norm": 3.94778507161363, + "learning_rate": 9.946768557435019e-06, + "loss": 0.3821, + "step": 1195 + }, + { + "epoch": 0.08, + "grad_norm": 4.863795468418417, + "learning_rate": 9.946620235039797e-06, + "loss": 0.4062, + "step": 1196 + }, + { + "epoch": 0.08, + "grad_norm": 2.1985856913958752, + "learning_rate": 9.946471707400514e-06, + "loss": 0.4419, + "step": 1197 + }, + { + "epoch": 0.08, + "grad_norm": 2.0466579467587525, + "learning_rate": 9.946322974523336e-06, + "loss": 0.4051, + "step": 1198 + }, + { + "epoch": 0.08, + "grad_norm": 1.9695428330223819, + "learning_rate": 9.94617403641443e-06, + "loss": 0.4178, + "step": 1199 + }, + { + "epoch": 0.08, + "grad_norm": 2.3333009429864418, + "learning_rate": 9.946024893079977e-06, + "loss": 0.4116, + "step": 1200 + }, + { + "epoch": 0.08, + "grad_norm": 2.266405677726087, + "learning_rate": 9.945875544526168e-06, + "loss": 0.3841, + "step": 1201 + }, + { + "epoch": 0.08, + "grad_norm": 1.924999768845305, + "learning_rate": 9.945725990759197e-06, + "loss": 0.4005, + "step": 1202 + }, + { + "epoch": 0.08, + "grad_norm": 3.431551423993694, + "learning_rate": 9.94557623178527e-06, + "loss": 0.3906, + "step": 1203 + }, + { + "epoch": 0.08, + "grad_norm": 2.394086168640765, + "learning_rate": 9.945426267610603e-06, + "loss": 0.4211, + "step": 1204 + }, + { + "epoch": 0.08, + "grad_norm": 3.124702489165753, + "learning_rate": 9.945276098241413e-06, + "loss": 0.4062, + "step": 1205 + }, + { + "epoch": 0.08, + "grad_norm": 2.0348838838892327, + "learning_rate": 9.945125723683934e-06, + "loss": 0.4045, + "step": 1206 + }, + { + "epoch": 0.08, + "grad_norm": 2.079576986386569, + "learning_rate": 9.944975143944407e-06, + "loss": 0.404, + "step": 1207 + }, + { + "epoch": 0.08, + "grad_norm": 3.741809985860796, + "learning_rate": 9.944824359029078e-06, + "loss": 0.4253, + "step": 1208 + }, + { + "epoch": 0.08, + "grad_norm": 3.7501764025367916, + "learning_rate": 9.944673368944202e-06, + "loss": 0.397, + "step": 1209 + }, + { + "epoch": 0.08, + "grad_norm": 2.159395742750831, + "learning_rate": 9.944522173696047e-06, + "loss": 0.4116, + "step": 1210 + }, + { + "epoch": 0.08, + "grad_norm": 2.466016323510857, + "learning_rate": 9.944370773290883e-06, + "loss": 0.3907, + "step": 1211 + }, + { + "epoch": 0.08, + "grad_norm": 3.971238187842781, + "learning_rate": 9.944219167734994e-06, + "loss": 0.4272, + "step": 1212 + }, + { + "epoch": 0.08, + "grad_norm": 2.0298059740919654, + "learning_rate": 9.94406735703467e-06, + "loss": 0.406, + "step": 1213 + }, + { + "epoch": 0.08, + "grad_norm": 2.4026259661066875, + "learning_rate": 9.943915341196209e-06, + "loss": 0.3856, + "step": 1214 + }, + { + "epoch": 0.08, + "grad_norm": 2.624219757935532, + "learning_rate": 9.94376312022592e-06, + "loss": 0.405, + "step": 1215 + }, + { + "epoch": 0.08, + "grad_norm": 2.4718592693709014, + "learning_rate": 9.943610694130117e-06, + "loss": 0.408, + "step": 1216 + }, + { + "epoch": 0.08, + "grad_norm": 6.6270700810101255, + "learning_rate": 9.943458062915126e-06, + "loss": 0.4193, + "step": 1217 + }, + { + "epoch": 0.08, + "grad_norm": 1.9171228652969508, + "learning_rate": 9.94330522658728e-06, + "loss": 0.3958, + "step": 1218 + }, + { + "epoch": 0.08, + "grad_norm": 2.02687721984281, + "learning_rate": 9.94315218515292e-06, + "loss": 0.3846, + "step": 1219 + }, + { + "epoch": 0.08, + "grad_norm": 2.270919265580942, + "learning_rate": 9.942998938618394e-06, + "loss": 0.4047, + "step": 1220 + }, + { + "epoch": 0.08, + "grad_norm": 1.6915723864732783, + "learning_rate": 9.942845486990064e-06, + "loss": 0.5804, + "step": 1221 + }, + { + "epoch": 0.08, + "grad_norm": 3.5053069801244665, + "learning_rate": 9.942691830274293e-06, + "loss": 0.3884, + "step": 1222 + }, + { + "epoch": 0.08, + "grad_norm": 2.195396899547578, + "learning_rate": 9.942537968477461e-06, + "loss": 0.3948, + "step": 1223 + }, + { + "epoch": 0.08, + "grad_norm": 1.8535971708343282, + "learning_rate": 9.94238390160595e-06, + "loss": 0.4287, + "step": 1224 + }, + { + "epoch": 0.08, + "grad_norm": 3.925765481633316, + "learning_rate": 9.942229629666152e-06, + "loss": 0.3974, + "step": 1225 + }, + { + "epoch": 0.08, + "grad_norm": 2.254763746431971, + "learning_rate": 9.942075152664467e-06, + "loss": 0.3776, + "step": 1226 + }, + { + "epoch": 0.08, + "grad_norm": 2.6502737838639416, + "learning_rate": 9.941920470607306e-06, + "loss": 0.4086, + "step": 1227 + }, + { + "epoch": 0.08, + "grad_norm": 1.8380067080178115, + "learning_rate": 9.941765583501088e-06, + "loss": 0.4707, + "step": 1228 + }, + { + "epoch": 0.08, + "grad_norm": 2.995217067426309, + "learning_rate": 9.941610491352238e-06, + "loss": 0.405, + "step": 1229 + }, + { + "epoch": 0.08, + "grad_norm": 1.9206367129209652, + "learning_rate": 9.94145519416719e-06, + "loss": 0.3878, + "step": 1230 + }, + { + "epoch": 0.08, + "grad_norm": 3.0937950878177984, + "learning_rate": 9.94129969195239e-06, + "loss": 0.3835, + "step": 1231 + }, + { + "epoch": 0.08, + "grad_norm": 1.681959798676605, + "learning_rate": 9.94114398471429e-06, + "loss": 0.3898, + "step": 1232 + }, + { + "epoch": 0.08, + "grad_norm": 2.38673455622357, + "learning_rate": 9.94098807245935e-06, + "loss": 0.4006, + "step": 1233 + }, + { + "epoch": 0.08, + "grad_norm": 1.7452893009158057, + "learning_rate": 9.940831955194036e-06, + "loss": 0.3908, + "step": 1234 + }, + { + "epoch": 0.08, + "grad_norm": 1.893389830279717, + "learning_rate": 9.94067563292483e-06, + "loss": 0.4397, + "step": 1235 + }, + { + "epoch": 0.08, + "grad_norm": 2.4763339006276053, + "learning_rate": 9.940519105658217e-06, + "loss": 0.4231, + "step": 1236 + }, + { + "epoch": 0.08, + "grad_norm": 1.9206715546328543, + "learning_rate": 9.94036237340069e-06, + "loss": 0.4188, + "step": 1237 + }, + { + "epoch": 0.08, + "grad_norm": 2.226804987608132, + "learning_rate": 9.940205436158753e-06, + "loss": 0.4057, + "step": 1238 + }, + { + "epoch": 0.08, + "grad_norm": 2.2201140495236245, + "learning_rate": 9.940048293938918e-06, + "loss": 0.398, + "step": 1239 + }, + { + "epoch": 0.08, + "grad_norm": 23.169631949745703, + "learning_rate": 9.939890946747703e-06, + "loss": 0.4126, + "step": 1240 + }, + { + "epoch": 0.08, + "grad_norm": 1.8901675996120868, + "learning_rate": 9.93973339459164e-06, + "loss": 0.4422, + "step": 1241 + }, + { + "epoch": 0.08, + "grad_norm": 2.0173839554868, + "learning_rate": 9.939575637477266e-06, + "loss": 0.3825, + "step": 1242 + }, + { + "epoch": 0.08, + "grad_norm": 2.3312057219372173, + "learning_rate": 9.939417675411123e-06, + "loss": 0.3937, + "step": 1243 + }, + { + "epoch": 0.08, + "grad_norm": 2.41994531044071, + "learning_rate": 9.939259508399767e-06, + "loss": 0.3645, + "step": 1244 + }, + { + "epoch": 0.08, + "grad_norm": 5.130732966256115, + "learning_rate": 9.939101136449763e-06, + "loss": 0.4355, + "step": 1245 + }, + { + "epoch": 0.08, + "grad_norm": 0.9407138873887051, + "learning_rate": 9.938942559567677e-06, + "loss": 0.5359, + "step": 1246 + }, + { + "epoch": 0.08, + "grad_norm": 2.1873362989230545, + "learning_rate": 9.938783777760095e-06, + "loss": 0.3971, + "step": 1247 + }, + { + "epoch": 0.08, + "grad_norm": 0.6647545068782758, + "learning_rate": 9.938624791033599e-06, + "loss": 0.4553, + "step": 1248 + }, + { + "epoch": 0.08, + "grad_norm": 3.726292817792635, + "learning_rate": 9.93846559939479e-06, + "loss": 0.4015, + "step": 1249 + }, + { + "epoch": 0.08, + "grad_norm": 0.6179117598872832, + "learning_rate": 9.938306202850272e-06, + "loss": 0.4852, + "step": 1250 + }, + { + "epoch": 0.08, + "grad_norm": 13.38919296381999, + "learning_rate": 9.938146601406657e-06, + "loss": 0.4089, + "step": 1251 + }, + { + "epoch": 0.08, + "grad_norm": 1.947093361026362, + "learning_rate": 9.937986795070568e-06, + "loss": 0.4459, + "step": 1252 + }, + { + "epoch": 0.08, + "grad_norm": 2.2415261426460065, + "learning_rate": 9.937826783848636e-06, + "loss": 0.4175, + "step": 1253 + }, + { + "epoch": 0.08, + "grad_norm": 3.0143115187190905, + "learning_rate": 9.9376665677475e-06, + "loss": 0.3941, + "step": 1254 + }, + { + "epoch": 0.08, + "grad_norm": 1.884912048352357, + "learning_rate": 9.93750614677381e-06, + "loss": 0.4685, + "step": 1255 + }, + { + "epoch": 0.08, + "grad_norm": 10.615011997154566, + "learning_rate": 9.93734552093422e-06, + "loss": 0.3747, + "step": 1256 + }, + { + "epoch": 0.08, + "grad_norm": 2.4576862985160397, + "learning_rate": 9.937184690235393e-06, + "loss": 0.3972, + "step": 1257 + }, + { + "epoch": 0.08, + "grad_norm": 2.301713426062173, + "learning_rate": 9.937023654684004e-06, + "loss": 0.3979, + "step": 1258 + }, + { + "epoch": 0.08, + "grad_norm": 2.3974970802635305, + "learning_rate": 9.936862414286734e-06, + "loss": 0.4505, + "step": 1259 + }, + { + "epoch": 0.08, + "grad_norm": 1.901122120553632, + "learning_rate": 9.936700969050275e-06, + "loss": 0.4301, + "step": 1260 + }, + { + "epoch": 0.08, + "grad_norm": 2.5712528297749246, + "learning_rate": 9.936539318981323e-06, + "loss": 0.4034, + "step": 1261 + }, + { + "epoch": 0.08, + "grad_norm": 2.5061825489424003, + "learning_rate": 9.936377464086586e-06, + "loss": 0.4692, + "step": 1262 + }, + { + "epoch": 0.08, + "grad_norm": 3.062917052282411, + "learning_rate": 9.936215404372783e-06, + "loss": 0.4135, + "step": 1263 + }, + { + "epoch": 0.08, + "grad_norm": 2.024460585702295, + "learning_rate": 9.936053139846631e-06, + "loss": 0.5626, + "step": 1264 + }, + { + "epoch": 0.08, + "grad_norm": 2.647596296601089, + "learning_rate": 9.93589067051487e-06, + "loss": 0.3908, + "step": 1265 + }, + { + "epoch": 0.08, + "grad_norm": 1.8476435586334785, + "learning_rate": 9.935727996384237e-06, + "loss": 0.4074, + "step": 1266 + }, + { + "epoch": 0.08, + "grad_norm": 6.1113709153265035, + "learning_rate": 9.93556511746148e-06, + "loss": 0.401, + "step": 1267 + }, + { + "epoch": 0.08, + "grad_norm": 2.0001077011543593, + "learning_rate": 9.935402033753364e-06, + "loss": 0.3933, + "step": 1268 + }, + { + "epoch": 0.08, + "grad_norm": 4.4100065995475255, + "learning_rate": 9.935238745266648e-06, + "loss": 0.408, + "step": 1269 + }, + { + "epoch": 0.08, + "grad_norm": 1.9698265874125425, + "learning_rate": 9.935075252008113e-06, + "loss": 0.4248, + "step": 1270 + }, + { + "epoch": 0.08, + "grad_norm": 1.6037970756036097, + "learning_rate": 9.934911553984539e-06, + "loss": 0.3888, + "step": 1271 + }, + { + "epoch": 0.08, + "grad_norm": 0.7803594986696203, + "learning_rate": 9.934747651202718e-06, + "loss": 0.484, + "step": 1272 + }, + { + "epoch": 0.08, + "grad_norm": 1.9670363109487332, + "learning_rate": 9.934583543669454e-06, + "loss": 0.4171, + "step": 1273 + }, + { + "epoch": 0.08, + "grad_norm": 2.392762481259209, + "learning_rate": 9.934419231391554e-06, + "loss": 0.3777, + "step": 1274 + }, + { + "epoch": 0.08, + "grad_norm": 2.6645278071234206, + "learning_rate": 9.934254714375834e-06, + "loss": 0.4242, + "step": 1275 + }, + { + "epoch": 0.08, + "grad_norm": 2.2565647136530593, + "learning_rate": 9.934089992629122e-06, + "loss": 0.3958, + "step": 1276 + }, + { + "epoch": 0.08, + "grad_norm": 2.5859524170815797, + "learning_rate": 9.933925066158254e-06, + "loss": 0.4164, + "step": 1277 + }, + { + "epoch": 0.08, + "grad_norm": 2.3127556176510717, + "learning_rate": 9.933759934970069e-06, + "loss": 0.4036, + "step": 1278 + }, + { + "epoch": 0.08, + "grad_norm": 1.8300530787503981, + "learning_rate": 9.93359459907142e-06, + "loss": 0.3761, + "step": 1279 + }, + { + "epoch": 0.08, + "grad_norm": 3.703908562364397, + "learning_rate": 9.933429058469171e-06, + "loss": 0.3914, + "step": 1280 + }, + { + "epoch": 0.08, + "grad_norm": 4.140590584560028, + "learning_rate": 9.933263313170187e-06, + "loss": 0.4299, + "step": 1281 + }, + { + "epoch": 0.08, + "grad_norm": 2.9763097183198512, + "learning_rate": 9.933097363181346e-06, + "loss": 0.4018, + "step": 1282 + }, + { + "epoch": 0.08, + "grad_norm": 2.0209916346738606, + "learning_rate": 9.932931208509533e-06, + "loss": 0.3795, + "step": 1283 + }, + { + "epoch": 0.08, + "grad_norm": 1.6783799119102611, + "learning_rate": 9.93276484916164e-06, + "loss": 0.4336, + "step": 1284 + }, + { + "epoch": 0.08, + "grad_norm": 2.7725227031042743, + "learning_rate": 9.932598285144575e-06, + "loss": 0.3956, + "step": 1285 + }, + { + "epoch": 0.08, + "grad_norm": 2.7385715317313024, + "learning_rate": 9.932431516465244e-06, + "loss": 0.4128, + "step": 1286 + }, + { + "epoch": 0.08, + "grad_norm": 1.744366053658705, + "learning_rate": 9.932264543130568e-06, + "loss": 0.4132, + "step": 1287 + }, + { + "epoch": 0.08, + "grad_norm": 3.683341016651291, + "learning_rate": 9.932097365147477e-06, + "loss": 0.3865, + "step": 1288 + }, + { + "epoch": 0.08, + "grad_norm": 2.334883256421227, + "learning_rate": 9.931929982522906e-06, + "loss": 0.4004, + "step": 1289 + }, + { + "epoch": 0.08, + "grad_norm": 2.0309873328446724, + "learning_rate": 9.931762395263798e-06, + "loss": 0.3918, + "step": 1290 + }, + { + "epoch": 0.08, + "grad_norm": 76.55529773301637, + "learning_rate": 9.93159460337711e-06, + "loss": 0.4079, + "step": 1291 + }, + { + "epoch": 0.08, + "grad_norm": 4.8661975404978115, + "learning_rate": 9.931426606869802e-06, + "loss": 0.4158, + "step": 1292 + }, + { + "epoch": 0.08, + "grad_norm": 2.954796512963682, + "learning_rate": 9.931258405748846e-06, + "loss": 0.3868, + "step": 1293 + }, + { + "epoch": 0.08, + "grad_norm": 3.3411782414968396, + "learning_rate": 9.931090000021218e-06, + "loss": 0.364, + "step": 1294 + }, + { + "epoch": 0.08, + "grad_norm": 2.117366431507436, + "learning_rate": 9.930921389693907e-06, + "loss": 0.4244, + "step": 1295 + }, + { + "epoch": 0.08, + "grad_norm": 1.6985165345579143, + "learning_rate": 9.93075257477391e-06, + "loss": 0.4044, + "step": 1296 + }, + { + "epoch": 0.08, + "grad_norm": 2.2261173854773073, + "learning_rate": 9.930583555268232e-06, + "loss": 0.3767, + "step": 1297 + }, + { + "epoch": 0.08, + "grad_norm": 0.8436602713913757, + "learning_rate": 9.930414331183883e-06, + "loss": 0.4865, + "step": 1298 + }, + { + "epoch": 0.08, + "grad_norm": 1.8361347906103542, + "learning_rate": 9.930244902527885e-06, + "loss": 0.4039, + "step": 1299 + }, + { + "epoch": 0.08, + "grad_norm": 2.714770980754513, + "learning_rate": 9.930075269307271e-06, + "loss": 0.4118, + "step": 1300 + }, + { + "epoch": 0.08, + "grad_norm": 3.5300074767503293, + "learning_rate": 9.929905431529077e-06, + "loss": 0.3933, + "step": 1301 + }, + { + "epoch": 0.08, + "grad_norm": 187.2722523416001, + "learning_rate": 9.92973538920035e-06, + "loss": 0.42, + "step": 1302 + }, + { + "epoch": 0.08, + "grad_norm": 5.621836919561685, + "learning_rate": 9.929565142328145e-06, + "loss": 0.419, + "step": 1303 + }, + { + "epoch": 0.08, + "grad_norm": 3.4384579060836136, + "learning_rate": 9.929394690919527e-06, + "loss": 0.3842, + "step": 1304 + }, + { + "epoch": 0.08, + "grad_norm": 4.729187611472591, + "learning_rate": 9.929224034981568e-06, + "loss": 0.3754, + "step": 1305 + }, + { + "epoch": 0.08, + "grad_norm": 1.9428666565866435, + "learning_rate": 9.929053174521348e-06, + "loss": 0.4653, + "step": 1306 + }, + { + "epoch": 0.08, + "grad_norm": 2.3237029777076876, + "learning_rate": 9.928882109545956e-06, + "loss": 0.389, + "step": 1307 + }, + { + "epoch": 0.08, + "grad_norm": 1.655425575128045, + "learning_rate": 9.928710840062492e-06, + "loss": 0.4253, + "step": 1308 + }, + { + "epoch": 0.08, + "grad_norm": 2.7001924494572114, + "learning_rate": 9.92853936607806e-06, + "loss": 0.4164, + "step": 1309 + }, + { + "epoch": 0.08, + "grad_norm": 1.9253618092665186, + "learning_rate": 9.928367687599775e-06, + "loss": 0.3999, + "step": 1310 + }, + { + "epoch": 0.08, + "grad_norm": 2.356637593843065, + "learning_rate": 9.928195804634761e-06, + "loss": 0.4199, + "step": 1311 + }, + { + "epoch": 0.08, + "grad_norm": 1.6567326503830144, + "learning_rate": 9.928023717190152e-06, + "loss": 0.4097, + "step": 1312 + }, + { + "epoch": 0.08, + "grad_norm": 1.8788831437638487, + "learning_rate": 9.927851425273082e-06, + "loss": 0.389, + "step": 1313 + }, + { + "epoch": 0.08, + "grad_norm": 3.371769965405906, + "learning_rate": 9.927678928890707e-06, + "loss": 0.3723, + "step": 1314 + }, + { + "epoch": 0.08, + "grad_norm": 2.353042406999752, + "learning_rate": 9.92750622805018e-06, + "loss": 0.3935, + "step": 1315 + }, + { + "epoch": 0.08, + "grad_norm": 2.760828452052688, + "learning_rate": 9.927333322758665e-06, + "loss": 0.4225, + "step": 1316 + }, + { + "epoch": 0.08, + "grad_norm": 2.7590947394628937, + "learning_rate": 9.92716021302334e-06, + "loss": 0.4391, + "step": 1317 + }, + { + "epoch": 0.08, + "grad_norm": 2.405071562879864, + "learning_rate": 9.926986898851387e-06, + "loss": 0.4159, + "step": 1318 + }, + { + "epoch": 0.08, + "grad_norm": 3.0184503357446797, + "learning_rate": 9.926813380249995e-06, + "loss": 0.3992, + "step": 1319 + }, + { + "epoch": 0.08, + "grad_norm": 3.0824959041518154, + "learning_rate": 9.926639657226366e-06, + "loss": 0.4168, + "step": 1320 + }, + { + "epoch": 0.08, + "grad_norm": 3.7458826976092214, + "learning_rate": 9.926465729787707e-06, + "loss": 0.378, + "step": 1321 + }, + { + "epoch": 0.08, + "grad_norm": 1.5773154796096445, + "learning_rate": 9.926291597941234e-06, + "loss": 0.4083, + "step": 1322 + }, + { + "epoch": 0.08, + "grad_norm": 3.11089775571572, + "learning_rate": 9.926117261694171e-06, + "loss": 0.375, + "step": 1323 + }, + { + "epoch": 0.08, + "grad_norm": 2.5481490222725935, + "learning_rate": 9.925942721053755e-06, + "loss": 0.3894, + "step": 1324 + }, + { + "epoch": 0.08, + "grad_norm": 3.283882783001254, + "learning_rate": 9.925767976027226e-06, + "loss": 0.4141, + "step": 1325 + }, + { + "epoch": 0.08, + "grad_norm": 1.7701549571254545, + "learning_rate": 9.925593026621833e-06, + "loss": 0.3949, + "step": 1326 + }, + { + "epoch": 0.08, + "grad_norm": 2.7391856293318613, + "learning_rate": 9.925417872844838e-06, + "loss": 0.3814, + "step": 1327 + }, + { + "epoch": 0.08, + "grad_norm": 3.7757977417711746, + "learning_rate": 9.925242514703505e-06, + "loss": 0.4038, + "step": 1328 + }, + { + "epoch": 0.08, + "grad_norm": 2.0179859633576873, + "learning_rate": 9.925066952205113e-06, + "loss": 0.4149, + "step": 1329 + }, + { + "epoch": 0.08, + "grad_norm": 2.27224864723054, + "learning_rate": 9.924891185356946e-06, + "loss": 0.3984, + "step": 1330 + }, + { + "epoch": 0.08, + "grad_norm": 31.31739209436363, + "learning_rate": 9.924715214166297e-06, + "loss": 0.401, + "step": 1331 + }, + { + "epoch": 0.08, + "grad_norm": 2.6021015902600118, + "learning_rate": 9.924539038640464e-06, + "loss": 0.426, + "step": 1332 + }, + { + "epoch": 0.08, + "grad_norm": 2.4224725330505392, + "learning_rate": 9.92436265878676e-06, + "loss": 0.4028, + "step": 1333 + }, + { + "epoch": 0.08, + "grad_norm": 1.5472933057935412, + "learning_rate": 9.924186074612502e-06, + "loss": 0.3756, + "step": 1334 + }, + { + "epoch": 0.08, + "grad_norm": 1.758829975204866, + "learning_rate": 9.92400928612502e-06, + "loss": 0.3987, + "step": 1335 + }, + { + "epoch": 0.08, + "grad_norm": 5.868957251687971, + "learning_rate": 9.923832293331645e-06, + "loss": 0.3917, + "step": 1336 + }, + { + "epoch": 0.08, + "grad_norm": 2.3265652216559642, + "learning_rate": 9.923655096239722e-06, + "loss": 0.4568, + "step": 1337 + }, + { + "epoch": 0.08, + "grad_norm": 1.8043481729106445, + "learning_rate": 9.923477694856605e-06, + "loss": 0.4653, + "step": 1338 + }, + { + "epoch": 0.08, + "grad_norm": 1.755369831885525, + "learning_rate": 9.923300089189653e-06, + "loss": 0.4351, + "step": 1339 + }, + { + "epoch": 0.08, + "grad_norm": 1.6445669739920281, + "learning_rate": 9.923122279246234e-06, + "loss": 0.3942, + "step": 1340 + }, + { + "epoch": 0.08, + "grad_norm": 6.275747062318478, + "learning_rate": 9.922944265033729e-06, + "loss": 0.3919, + "step": 1341 + }, + { + "epoch": 0.08, + "grad_norm": 2.812507379854062, + "learning_rate": 9.922766046559522e-06, + "loss": 0.3981, + "step": 1342 + }, + { + "epoch": 0.08, + "grad_norm": 2.6450415429396688, + "learning_rate": 9.922587623831007e-06, + "loss": 0.3976, + "step": 1343 + }, + { + "epoch": 0.08, + "grad_norm": 2.4341460590153163, + "learning_rate": 9.922408996855588e-06, + "loss": 0.4085, + "step": 1344 + }, + { + "epoch": 0.08, + "grad_norm": 1.877096783037955, + "learning_rate": 9.922230165640678e-06, + "loss": 0.4003, + "step": 1345 + }, + { + "epoch": 0.08, + "grad_norm": 1.8133136042326141, + "learning_rate": 9.922051130193694e-06, + "loss": 0.3784, + "step": 1346 + }, + { + "epoch": 0.08, + "grad_norm": 2.181148964621575, + "learning_rate": 9.921871890522066e-06, + "loss": 0.3794, + "step": 1347 + }, + { + "epoch": 0.08, + "grad_norm": 2.6226953846995316, + "learning_rate": 9.921692446633233e-06, + "loss": 0.3905, + "step": 1348 + }, + { + "epoch": 0.08, + "grad_norm": 2.190614017631786, + "learning_rate": 9.921512798534637e-06, + "loss": 0.4035, + "step": 1349 + }, + { + "epoch": 0.08, + "grad_norm": 0.8681966725964875, + "learning_rate": 9.921332946233733e-06, + "loss": 0.4883, + "step": 1350 + }, + { + "epoch": 0.08, + "grad_norm": 2.2761549349465735, + "learning_rate": 9.921152889737985e-06, + "loss": 0.4457, + "step": 1351 + }, + { + "epoch": 0.09, + "grad_norm": 6.682495836410096, + "learning_rate": 9.920972629054862e-06, + "loss": 0.4148, + "step": 1352 + }, + { + "epoch": 0.09, + "grad_norm": 1.799774426097016, + "learning_rate": 9.920792164191844e-06, + "loss": 0.4202, + "step": 1353 + }, + { + "epoch": 0.09, + "grad_norm": 2.7054108308219185, + "learning_rate": 9.920611495156418e-06, + "loss": 0.409, + "step": 1354 + }, + { + "epoch": 0.09, + "grad_norm": 0.7246505563609941, + "learning_rate": 9.920430621956082e-06, + "loss": 0.5196, + "step": 1355 + }, + { + "epoch": 0.09, + "grad_norm": 1.5568292174747451, + "learning_rate": 9.92024954459834e-06, + "loss": 0.4426, + "step": 1356 + }, + { + "epoch": 0.09, + "grad_norm": 1.8441637875609818, + "learning_rate": 9.920068263090706e-06, + "loss": 0.4272, + "step": 1357 + }, + { + "epoch": 0.09, + "grad_norm": 2.087098202640914, + "learning_rate": 9.9198867774407e-06, + "loss": 0.4004, + "step": 1358 + }, + { + "epoch": 0.09, + "grad_norm": 0.7063518045266224, + "learning_rate": 9.919705087655851e-06, + "loss": 0.4794, + "step": 1359 + }, + { + "epoch": 0.09, + "grad_norm": 3.3065403119255796, + "learning_rate": 9.919523193743701e-06, + "loss": 0.4401, + "step": 1360 + }, + { + "epoch": 0.09, + "grad_norm": 2.032868476159452, + "learning_rate": 9.919341095711796e-06, + "loss": 0.3968, + "step": 1361 + }, + { + "epoch": 0.09, + "grad_norm": 5.026238979965795, + "learning_rate": 9.91915879356769e-06, + "loss": 0.4, + "step": 1362 + }, + { + "epoch": 0.09, + "grad_norm": 3.0441710274220073, + "learning_rate": 9.918976287318948e-06, + "loss": 0.4487, + "step": 1363 + }, + { + "epoch": 0.09, + "grad_norm": 2.460569208947006, + "learning_rate": 9.918793576973145e-06, + "loss": 0.4442, + "step": 1364 + }, + { + "epoch": 0.09, + "grad_norm": 3.7484439196754873, + "learning_rate": 9.91861066253786e-06, + "loss": 0.3806, + "step": 1365 + }, + { + "epoch": 0.09, + "grad_norm": 3.4863483898466465, + "learning_rate": 9.918427544020678e-06, + "loss": 0.4098, + "step": 1366 + }, + { + "epoch": 0.09, + "grad_norm": 2.800766859707209, + "learning_rate": 9.918244221429205e-06, + "loss": 0.4336, + "step": 1367 + }, + { + "epoch": 0.09, + "grad_norm": 1.6647415061590345, + "learning_rate": 9.918060694771043e-06, + "loss": 0.3888, + "step": 1368 + }, + { + "epoch": 0.09, + "grad_norm": 4.901222269363619, + "learning_rate": 9.917876964053806e-06, + "loss": 0.4253, + "step": 1369 + }, + { + "epoch": 0.09, + "grad_norm": 2.2585778539588093, + "learning_rate": 9.91769302928512e-06, + "loss": 0.4311, + "step": 1370 + }, + { + "epoch": 0.09, + "grad_norm": 1.5765041214152822, + "learning_rate": 9.917508890472613e-06, + "loss": 0.4164, + "step": 1371 + }, + { + "epoch": 0.09, + "grad_norm": 2.11036074889742, + "learning_rate": 9.91732454762393e-06, + "loss": 0.3994, + "step": 1372 + }, + { + "epoch": 0.09, + "grad_norm": 1.676435633402841, + "learning_rate": 9.917140000746717e-06, + "loss": 0.4027, + "step": 1373 + }, + { + "epoch": 0.09, + "grad_norm": 1.4989157433821845, + "learning_rate": 9.916955249848631e-06, + "loss": 0.3891, + "step": 1374 + }, + { + "epoch": 0.09, + "grad_norm": 2.1544482520646615, + "learning_rate": 9.916770294937339e-06, + "loss": 0.3936, + "step": 1375 + }, + { + "epoch": 0.09, + "grad_norm": 1.5951856923610135, + "learning_rate": 9.916585136020513e-06, + "loss": 0.3946, + "step": 1376 + }, + { + "epoch": 0.09, + "grad_norm": 6.132462663210312, + "learning_rate": 9.916399773105839e-06, + "loss": 0.3855, + "step": 1377 + }, + { + "epoch": 0.09, + "grad_norm": 1.869095672777846, + "learning_rate": 9.916214206201003e-06, + "loss": 0.4398, + "step": 1378 + }, + { + "epoch": 0.09, + "grad_norm": 2.0817447810542613, + "learning_rate": 9.91602843531371e-06, + "loss": 0.418, + "step": 1379 + }, + { + "epoch": 0.09, + "grad_norm": 2.5619346912924112, + "learning_rate": 9.915842460451663e-06, + "loss": 0.4257, + "step": 1380 + }, + { + "epoch": 0.09, + "grad_norm": 1.6600325364141273, + "learning_rate": 9.915656281622584e-06, + "loss": 0.3744, + "step": 1381 + }, + { + "epoch": 0.09, + "grad_norm": 1.8280151388735506, + "learning_rate": 9.915469898834191e-06, + "loss": 0.4255, + "step": 1382 + }, + { + "epoch": 0.09, + "grad_norm": 2.4442944631790735, + "learning_rate": 9.915283312094222e-06, + "loss": 0.4083, + "step": 1383 + }, + { + "epoch": 0.09, + "grad_norm": 1.6272636478051172, + "learning_rate": 9.91509652141042e-06, + "loss": 0.3985, + "step": 1384 + }, + { + "epoch": 0.09, + "grad_norm": 2.187900171096838, + "learning_rate": 9.91490952679053e-06, + "loss": 0.4175, + "step": 1385 + }, + { + "epoch": 0.09, + "grad_norm": 2.751596329155221, + "learning_rate": 9.914722328242316e-06, + "loss": 0.395, + "step": 1386 + }, + { + "epoch": 0.09, + "grad_norm": 5.015177979893566, + "learning_rate": 9.914534925773543e-06, + "loss": 0.4644, + "step": 1387 + }, + { + "epoch": 0.09, + "grad_norm": 2.5039838659252927, + "learning_rate": 9.914347319391987e-06, + "loss": 0.4122, + "step": 1388 + }, + { + "epoch": 0.09, + "grad_norm": 1.7764630100827854, + "learning_rate": 9.914159509105431e-06, + "loss": 0.3779, + "step": 1389 + }, + { + "epoch": 0.09, + "grad_norm": 1.9194266369463542, + "learning_rate": 9.913971494921669e-06, + "loss": 0.3932, + "step": 1390 + }, + { + "epoch": 0.09, + "grad_norm": 1.6374283687756888, + "learning_rate": 9.9137832768485e-06, + "loss": 0.4016, + "step": 1391 + }, + { + "epoch": 0.09, + "grad_norm": 1.6075015121871845, + "learning_rate": 9.913594854893738e-06, + "loss": 0.42, + "step": 1392 + }, + { + "epoch": 0.09, + "grad_norm": 1.5945031378142487, + "learning_rate": 9.913406229065196e-06, + "loss": 0.4224, + "step": 1393 + }, + { + "epoch": 0.09, + "grad_norm": 3.3031052827672984, + "learning_rate": 9.913217399370702e-06, + "loss": 0.3812, + "step": 1394 + }, + { + "epoch": 0.09, + "grad_norm": 1.2678496994549624, + "learning_rate": 9.913028365818092e-06, + "loss": 0.3828, + "step": 1395 + }, + { + "epoch": 0.09, + "grad_norm": 2.4283198790855907, + "learning_rate": 9.912839128415209e-06, + "loss": 0.4556, + "step": 1396 + }, + { + "epoch": 0.09, + "grad_norm": 1.5556333121747936, + "learning_rate": 9.912649687169901e-06, + "loss": 0.4559, + "step": 1397 + }, + { + "epoch": 0.09, + "grad_norm": 1.8638758042854766, + "learning_rate": 9.912460042090035e-06, + "loss": 0.3946, + "step": 1398 + }, + { + "epoch": 0.09, + "grad_norm": 2.2184827489764256, + "learning_rate": 9.912270193183476e-06, + "loss": 0.3992, + "step": 1399 + }, + { + "epoch": 0.09, + "grad_norm": 2.2500568925914206, + "learning_rate": 9.912080140458102e-06, + "loss": 0.3921, + "step": 1400 + }, + { + "epoch": 0.09, + "grad_norm": 2.2813042713793945, + "learning_rate": 9.911889883921797e-06, + "loss": 0.4086, + "step": 1401 + }, + { + "epoch": 0.09, + "grad_norm": 1.940302937643413, + "learning_rate": 9.911699423582457e-06, + "loss": 0.3584, + "step": 1402 + }, + { + "epoch": 0.09, + "grad_norm": 1.8593565231209148, + "learning_rate": 9.911508759447984e-06, + "loss": 0.4025, + "step": 1403 + }, + { + "epoch": 0.09, + "grad_norm": 1.6361669972872965, + "learning_rate": 9.911317891526286e-06, + "loss": 0.4029, + "step": 1404 + }, + { + "epoch": 0.09, + "grad_norm": 2.0361185029938778, + "learning_rate": 9.911126819825287e-06, + "loss": 0.4166, + "step": 1405 + }, + { + "epoch": 0.09, + "grad_norm": 1.6620122593043212, + "learning_rate": 9.910935544352914e-06, + "loss": 0.4295, + "step": 1406 + }, + { + "epoch": 0.09, + "grad_norm": 1.9094808654016637, + "learning_rate": 9.910744065117101e-06, + "loss": 0.4207, + "step": 1407 + }, + { + "epoch": 0.09, + "grad_norm": 5.115139760675823, + "learning_rate": 9.910552382125797e-06, + "loss": 0.3801, + "step": 1408 + }, + { + "epoch": 0.09, + "grad_norm": 6.750216896874437, + "learning_rate": 9.91036049538695e-06, + "loss": 0.3902, + "step": 1409 + }, + { + "epoch": 0.09, + "grad_norm": 2.764468872418187, + "learning_rate": 9.910168404908525e-06, + "loss": 0.409, + "step": 1410 + }, + { + "epoch": 0.09, + "grad_norm": 1.8805666909647074, + "learning_rate": 9.909976110698491e-06, + "loss": 0.414, + "step": 1411 + }, + { + "epoch": 0.09, + "grad_norm": 1.8817569820293847, + "learning_rate": 9.909783612764827e-06, + "loss": 0.408, + "step": 1412 + }, + { + "epoch": 0.09, + "grad_norm": 1.8955912629798846, + "learning_rate": 9.909590911115521e-06, + "loss": 0.421, + "step": 1413 + }, + { + "epoch": 0.09, + "grad_norm": 0.8233016733210808, + "learning_rate": 9.909398005758567e-06, + "loss": 0.5172, + "step": 1414 + }, + { + "epoch": 0.09, + "grad_norm": 2.9509445029106045, + "learning_rate": 9.909204896701969e-06, + "loss": 0.399, + "step": 1415 + }, + { + "epoch": 0.09, + "grad_norm": 3.0487482740909604, + "learning_rate": 9.909011583953743e-06, + "loss": 0.4208, + "step": 1416 + }, + { + "epoch": 0.09, + "grad_norm": 1.9694569347932842, + "learning_rate": 9.908818067521904e-06, + "loss": 0.4085, + "step": 1417 + }, + { + "epoch": 0.09, + "grad_norm": 3.0491318225826527, + "learning_rate": 9.908624347414486e-06, + "loss": 0.423, + "step": 1418 + }, + { + "epoch": 0.09, + "grad_norm": 1.4949193002474532, + "learning_rate": 9.908430423639524e-06, + "loss": 0.4163, + "step": 1419 + }, + { + "epoch": 0.09, + "grad_norm": 4.605398834105979, + "learning_rate": 9.908236296205066e-06, + "loss": 0.4119, + "step": 1420 + }, + { + "epoch": 0.09, + "grad_norm": 1.4125172978253353, + "learning_rate": 9.908041965119167e-06, + "loss": 0.3951, + "step": 1421 + }, + { + "epoch": 0.09, + "grad_norm": 2.185038540239043, + "learning_rate": 9.907847430389887e-06, + "loss": 0.3815, + "step": 1422 + }, + { + "epoch": 0.09, + "grad_norm": 1.4387868815608758, + "learning_rate": 9.9076526920253e-06, + "loss": 0.3979, + "step": 1423 + }, + { + "epoch": 0.09, + "grad_norm": 1.4015483848228638, + "learning_rate": 9.907457750033487e-06, + "loss": 0.3604, + "step": 1424 + }, + { + "epoch": 0.09, + "grad_norm": 2.357191981985771, + "learning_rate": 9.907262604422537e-06, + "loss": 0.3918, + "step": 1425 + }, + { + "epoch": 0.09, + "grad_norm": 1.5894567076827006, + "learning_rate": 9.907067255200543e-06, + "loss": 0.404, + "step": 1426 + }, + { + "epoch": 0.09, + "grad_norm": 1.9072082104415833, + "learning_rate": 9.906871702375611e-06, + "loss": 0.3818, + "step": 1427 + }, + { + "epoch": 0.09, + "grad_norm": 3.2901148439758, + "learning_rate": 9.90667594595586e-06, + "loss": 0.4283, + "step": 1428 + }, + { + "epoch": 0.09, + "grad_norm": 2.327012966175589, + "learning_rate": 9.906479985949407e-06, + "loss": 0.3966, + "step": 1429 + }, + { + "epoch": 0.09, + "grad_norm": 3.459879041421819, + "learning_rate": 9.906283822364384e-06, + "loss": 0.4073, + "step": 1430 + }, + { + "epoch": 0.09, + "grad_norm": 1.7313958299477028, + "learning_rate": 9.90608745520893e-06, + "loss": 0.4236, + "step": 1431 + }, + { + "epoch": 0.09, + "grad_norm": 4.530695929689069, + "learning_rate": 9.905890884491196e-06, + "loss": 0.4107, + "step": 1432 + }, + { + "epoch": 0.09, + "grad_norm": 2.4345884043627475, + "learning_rate": 9.905694110219335e-06, + "loss": 0.4173, + "step": 1433 + }, + { + "epoch": 0.09, + "grad_norm": 3.250153863573317, + "learning_rate": 9.90549713240151e-06, + "loss": 0.4001, + "step": 1434 + }, + { + "epoch": 0.09, + "grad_norm": 1.9437826118184052, + "learning_rate": 9.905299951045897e-06, + "loss": 0.4219, + "step": 1435 + }, + { + "epoch": 0.09, + "grad_norm": 1.5513169384343328, + "learning_rate": 9.905102566160676e-06, + "loss": 0.4077, + "step": 1436 + }, + { + "epoch": 0.09, + "grad_norm": 2.1322290329463036, + "learning_rate": 9.904904977754038e-06, + "loss": 0.4017, + "step": 1437 + }, + { + "epoch": 0.09, + "grad_norm": 3.6814866941618773, + "learning_rate": 9.904707185834178e-06, + "loss": 0.4068, + "step": 1438 + }, + { + "epoch": 0.09, + "grad_norm": 15.386708142763933, + "learning_rate": 9.904509190409306e-06, + "loss": 0.4049, + "step": 1439 + }, + { + "epoch": 0.09, + "grad_norm": 1.9453388681624317, + "learning_rate": 9.904310991487638e-06, + "loss": 0.4025, + "step": 1440 + }, + { + "epoch": 0.09, + "grad_norm": 1.8745260712880345, + "learning_rate": 9.904112589077395e-06, + "loss": 0.4031, + "step": 1441 + }, + { + "epoch": 0.09, + "grad_norm": 1.5975200337702475, + "learning_rate": 9.90391398318681e-06, + "loss": 0.4017, + "step": 1442 + }, + { + "epoch": 0.09, + "grad_norm": 2.324353289238748, + "learning_rate": 9.903715173824123e-06, + "loss": 0.393, + "step": 1443 + }, + { + "epoch": 0.09, + "grad_norm": 3.1388241598048117, + "learning_rate": 9.903516160997583e-06, + "loss": 0.3989, + "step": 1444 + }, + { + "epoch": 0.09, + "grad_norm": 2.059743073559714, + "learning_rate": 9.903316944715449e-06, + "loss": 0.3701, + "step": 1445 + }, + { + "epoch": 0.09, + "grad_norm": 3.886145433535198, + "learning_rate": 9.903117524985986e-06, + "loss": 0.3989, + "step": 1446 + }, + { + "epoch": 0.09, + "grad_norm": 0.9183789478214723, + "learning_rate": 9.902917901817466e-06, + "loss": 0.519, + "step": 1447 + }, + { + "epoch": 0.09, + "grad_norm": 1.6682113945767112, + "learning_rate": 9.902718075218176e-06, + "loss": 0.4025, + "step": 1448 + }, + { + "epoch": 0.09, + "grad_norm": 2.043910702886714, + "learning_rate": 9.902518045196404e-06, + "loss": 0.4205, + "step": 1449 + }, + { + "epoch": 0.09, + "grad_norm": 1.6660949741835804, + "learning_rate": 9.902317811760449e-06, + "loss": 0.3856, + "step": 1450 + }, + { + "epoch": 0.09, + "grad_norm": 0.6677608029610768, + "learning_rate": 9.902117374918623e-06, + "loss": 0.4778, + "step": 1451 + }, + { + "epoch": 0.09, + "grad_norm": 2.945677472625155, + "learning_rate": 9.901916734679237e-06, + "loss": 0.4131, + "step": 1452 + }, + { + "epoch": 0.09, + "grad_norm": 2.673107488354125, + "learning_rate": 9.901715891050622e-06, + "loss": 0.4297, + "step": 1453 + }, + { + "epoch": 0.09, + "grad_norm": 3.023626774814282, + "learning_rate": 9.901514844041107e-06, + "loss": 0.3997, + "step": 1454 + }, + { + "epoch": 0.09, + "grad_norm": 1.7819456139503294, + "learning_rate": 9.901313593659035e-06, + "loss": 0.3899, + "step": 1455 + }, + { + "epoch": 0.09, + "grad_norm": 13.755152624488014, + "learning_rate": 9.901112139912757e-06, + "loss": 0.4234, + "step": 1456 + }, + { + "epoch": 0.09, + "grad_norm": 2.3820210120304215, + "learning_rate": 9.90091048281063e-06, + "loss": 0.3778, + "step": 1457 + }, + { + "epoch": 0.09, + "grad_norm": 3.201837689877226, + "learning_rate": 9.90070862236102e-06, + "loss": 0.3819, + "step": 1458 + }, + { + "epoch": 0.09, + "grad_norm": 14.46930200786034, + "learning_rate": 9.900506558572309e-06, + "loss": 0.4427, + "step": 1459 + }, + { + "epoch": 0.09, + "grad_norm": 3.4506886847437297, + "learning_rate": 9.900304291452873e-06, + "loss": 0.3964, + "step": 1460 + }, + { + "epoch": 0.09, + "grad_norm": 2.7782056866041125, + "learning_rate": 9.90010182101111e-06, + "loss": 0.38, + "step": 1461 + }, + { + "epoch": 0.09, + "grad_norm": 3.1578013332547443, + "learning_rate": 9.899899147255418e-06, + "loss": 0.3963, + "step": 1462 + }, + { + "epoch": 0.09, + "grad_norm": 1.6429588857787716, + "learning_rate": 9.899696270194208e-06, + "loss": 0.4039, + "step": 1463 + }, + { + "epoch": 0.09, + "grad_norm": 1.782755101220264, + "learning_rate": 9.899493189835896e-06, + "loss": 0.4021, + "step": 1464 + }, + { + "epoch": 0.09, + "grad_norm": 5.805700991513922, + "learning_rate": 9.899289906188909e-06, + "loss": 0.3774, + "step": 1465 + }, + { + "epoch": 0.09, + "grad_norm": 1.1592057386846453, + "learning_rate": 9.899086419261683e-06, + "loss": 0.4994, + "step": 1466 + }, + { + "epoch": 0.09, + "grad_norm": 2.2501526015494115, + "learning_rate": 9.89888272906266e-06, + "loss": 0.4112, + "step": 1467 + }, + { + "epoch": 0.09, + "grad_norm": 2.562735614642925, + "learning_rate": 9.89867883560029e-06, + "loss": 0.4091, + "step": 1468 + }, + { + "epoch": 0.09, + "grad_norm": 1.7883783901104329, + "learning_rate": 9.898474738883033e-06, + "loss": 0.4354, + "step": 1469 + }, + { + "epoch": 0.09, + "grad_norm": 1.86557666058772, + "learning_rate": 9.898270438919359e-06, + "loss": 0.4203, + "step": 1470 + }, + { + "epoch": 0.09, + "grad_norm": 1.3268077301449126, + "learning_rate": 9.898065935717746e-06, + "loss": 0.3679, + "step": 1471 + }, + { + "epoch": 0.09, + "grad_norm": 2.0982417650484964, + "learning_rate": 9.897861229286676e-06, + "loss": 0.3848, + "step": 1472 + }, + { + "epoch": 0.09, + "grad_norm": 9.484283339522934, + "learning_rate": 9.897656319634643e-06, + "loss": 0.412, + "step": 1473 + }, + { + "epoch": 0.09, + "grad_norm": 2.871856135984185, + "learning_rate": 9.897451206770152e-06, + "loss": 0.3826, + "step": 1474 + }, + { + "epoch": 0.09, + "grad_norm": 0.7713368662430186, + "learning_rate": 9.897245890701713e-06, + "loss": 0.4779, + "step": 1475 + }, + { + "epoch": 0.09, + "grad_norm": 2.1635938435196946, + "learning_rate": 9.89704037143784e-06, + "loss": 0.4546, + "step": 1476 + }, + { + "epoch": 0.09, + "grad_norm": 1.5875702231075441, + "learning_rate": 9.896834648987065e-06, + "loss": 0.4266, + "step": 1477 + }, + { + "epoch": 0.09, + "grad_norm": 1.968822406602112, + "learning_rate": 9.896628723357923e-06, + "loss": 0.3995, + "step": 1478 + }, + { + "epoch": 0.09, + "grad_norm": 4.271946906731635, + "learning_rate": 9.896422594558957e-06, + "loss": 0.4044, + "step": 1479 + }, + { + "epoch": 0.09, + "grad_norm": 1.7425755441872492, + "learning_rate": 9.896216262598722e-06, + "loss": 0.3874, + "step": 1480 + }, + { + "epoch": 0.09, + "grad_norm": 1.951237218746135, + "learning_rate": 9.896009727485778e-06, + "loss": 0.4111, + "step": 1481 + }, + { + "epoch": 0.09, + "grad_norm": 0.664163980488461, + "learning_rate": 9.895802989228691e-06, + "loss": 0.4666, + "step": 1482 + }, + { + "epoch": 0.09, + "grad_norm": 2.1918348287420946, + "learning_rate": 9.895596047836045e-06, + "loss": 0.416, + "step": 1483 + }, + { + "epoch": 0.09, + "grad_norm": 2.302188351677537, + "learning_rate": 9.895388903316424e-06, + "loss": 0.3995, + "step": 1484 + }, + { + "epoch": 0.09, + "grad_norm": 1.4967731946965595, + "learning_rate": 9.895181555678419e-06, + "loss": 0.429, + "step": 1485 + }, + { + "epoch": 0.09, + "grad_norm": 1.4109589221032037, + "learning_rate": 9.894974004930638e-06, + "loss": 0.3997, + "step": 1486 + }, + { + "epoch": 0.09, + "grad_norm": 1.6575259792296093, + "learning_rate": 9.894766251081691e-06, + "loss": 0.424, + "step": 1487 + }, + { + "epoch": 0.09, + "grad_norm": 1.621454676112057, + "learning_rate": 9.894558294140199e-06, + "loss": 0.4205, + "step": 1488 + }, + { + "epoch": 0.09, + "grad_norm": 1.4304440388356046, + "learning_rate": 9.894350134114788e-06, + "loss": 0.4239, + "step": 1489 + }, + { + "epoch": 0.09, + "grad_norm": 2.149051462032158, + "learning_rate": 9.894141771014098e-06, + "loss": 0.439, + "step": 1490 + }, + { + "epoch": 0.09, + "grad_norm": 2.675273168917547, + "learning_rate": 9.893933204846772e-06, + "loss": 0.395, + "step": 1491 + }, + { + "epoch": 0.09, + "grad_norm": 2.9996833286022, + "learning_rate": 9.893724435621466e-06, + "loss": 0.4184, + "step": 1492 + }, + { + "epoch": 0.09, + "grad_norm": 2.554857719359423, + "learning_rate": 9.893515463346841e-06, + "loss": 0.3705, + "step": 1493 + }, + { + "epoch": 0.09, + "grad_norm": 10.521627959182956, + "learning_rate": 9.893306288031565e-06, + "loss": 0.4201, + "step": 1494 + }, + { + "epoch": 0.09, + "grad_norm": 16.78095549023195, + "learning_rate": 9.893096909684323e-06, + "loss": 0.4135, + "step": 1495 + }, + { + "epoch": 0.09, + "grad_norm": 2.2207693140787756, + "learning_rate": 9.892887328313796e-06, + "loss": 0.4111, + "step": 1496 + }, + { + "epoch": 0.09, + "grad_norm": 1.956904065585402, + "learning_rate": 9.892677543928687e-06, + "loss": 0.4336, + "step": 1497 + }, + { + "epoch": 0.09, + "grad_norm": 9.407424917915186, + "learning_rate": 9.892467556537692e-06, + "loss": 0.4286, + "step": 1498 + }, + { + "epoch": 0.09, + "grad_norm": 1.5868108314301557, + "learning_rate": 9.89225736614953e-06, + "loss": 0.384, + "step": 1499 + }, + { + "epoch": 0.09, + "grad_norm": 1.7259959262127642, + "learning_rate": 9.89204697277292e-06, + "loss": 0.3981, + "step": 1500 + }, + { + "epoch": 0.09, + "grad_norm": 2.074368901182226, + "learning_rate": 9.891836376416593e-06, + "loss": 0.3943, + "step": 1501 + }, + { + "epoch": 0.09, + "grad_norm": 1.731554332407603, + "learning_rate": 9.891625577089285e-06, + "loss": 0.3714, + "step": 1502 + }, + { + "epoch": 0.09, + "grad_norm": 1.6119072743723795, + "learning_rate": 9.891414574799743e-06, + "loss": 0.413, + "step": 1503 + }, + { + "epoch": 0.09, + "grad_norm": 3.301487286293184, + "learning_rate": 9.891203369556722e-06, + "loss": 0.3408, + "step": 1504 + }, + { + "epoch": 0.09, + "grad_norm": 1.5721496853568406, + "learning_rate": 9.890991961368986e-06, + "loss": 0.3876, + "step": 1505 + }, + { + "epoch": 0.09, + "grad_norm": 1.4907831396596125, + "learning_rate": 9.890780350245305e-06, + "loss": 0.3909, + "step": 1506 + }, + { + "epoch": 0.09, + "grad_norm": 0.7123762050986583, + "learning_rate": 9.890568536194462e-06, + "loss": 0.4781, + "step": 1507 + }, + { + "epoch": 0.09, + "grad_norm": 6.167422005271638, + "learning_rate": 9.890356519225244e-06, + "loss": 0.3659, + "step": 1508 + }, + { + "epoch": 0.09, + "grad_norm": 4.645429350292824, + "learning_rate": 9.890144299346445e-06, + "loss": 0.3883, + "step": 1509 + }, + { + "epoch": 0.09, + "grad_norm": 2.52323903959301, + "learning_rate": 9.889931876566877e-06, + "loss": 0.4334, + "step": 1510 + }, + { + "epoch": 0.1, + "grad_norm": 2.3712718822586965, + "learning_rate": 9.889719250895347e-06, + "loss": 0.4106, + "step": 1511 + }, + { + "epoch": 0.1, + "grad_norm": 2.3503318056051734, + "learning_rate": 9.88950642234068e-06, + "loss": 0.3916, + "step": 1512 + }, + { + "epoch": 0.1, + "grad_norm": 0.6745853673430179, + "learning_rate": 9.889293390911708e-06, + "loss": 0.4582, + "step": 1513 + }, + { + "epoch": 0.1, + "grad_norm": 3.1754403974935093, + "learning_rate": 9.88908015661727e-06, + "loss": 0.4488, + "step": 1514 + }, + { + "epoch": 0.1, + "grad_norm": 1.9143321788962888, + "learning_rate": 9.88886671946621e-06, + "loss": 0.4121, + "step": 1515 + }, + { + "epoch": 0.1, + "grad_norm": 1.9017372274543405, + "learning_rate": 9.888653079467388e-06, + "loss": 0.3977, + "step": 1516 + }, + { + "epoch": 0.1, + "grad_norm": 1.5638253592781546, + "learning_rate": 9.888439236629665e-06, + "loss": 0.4016, + "step": 1517 + }, + { + "epoch": 0.1, + "grad_norm": 2.771085353001597, + "learning_rate": 9.888225190961916e-06, + "loss": 0.4062, + "step": 1518 + }, + { + "epoch": 0.1, + "grad_norm": 0.6580835384058848, + "learning_rate": 9.888010942473021e-06, + "loss": 0.4446, + "step": 1519 + }, + { + "epoch": 0.1, + "grad_norm": 3.725761692401521, + "learning_rate": 9.887796491171871e-06, + "loss": 0.3749, + "step": 1520 + }, + { + "epoch": 0.1, + "grad_norm": 2.135603827933371, + "learning_rate": 9.887581837067362e-06, + "loss": 0.4283, + "step": 1521 + }, + { + "epoch": 0.1, + "grad_norm": 2.1041515451167183, + "learning_rate": 9.8873669801684e-06, + "loss": 0.4041, + "step": 1522 + }, + { + "epoch": 0.1, + "grad_norm": 4.1887339764795035, + "learning_rate": 9.887151920483904e-06, + "loss": 0.4079, + "step": 1523 + }, + { + "epoch": 0.1, + "grad_norm": 3.1493065584900863, + "learning_rate": 9.886936658022792e-06, + "loss": 0.42, + "step": 1524 + }, + { + "epoch": 0.1, + "grad_norm": 2.0796373023459873, + "learning_rate": 9.886721192793998e-06, + "loss": 0.4442, + "step": 1525 + }, + { + "epoch": 0.1, + "grad_norm": 1.7252849265660066, + "learning_rate": 9.886505524806462e-06, + "loss": 0.4001, + "step": 1526 + }, + { + "epoch": 0.1, + "grad_norm": 2.115412855532158, + "learning_rate": 9.886289654069134e-06, + "loss": 0.3988, + "step": 1527 + }, + { + "epoch": 0.1, + "grad_norm": 2.0289058711966885, + "learning_rate": 9.886073580590968e-06, + "loss": 0.4022, + "step": 1528 + }, + { + "epoch": 0.1, + "grad_norm": 3.639479006211727, + "learning_rate": 9.88585730438093e-06, + "loss": 0.4213, + "step": 1529 + }, + { + "epoch": 0.1, + "grad_norm": 3.285274508998952, + "learning_rate": 9.885640825447995e-06, + "loss": 0.4253, + "step": 1530 + }, + { + "epoch": 0.1, + "grad_norm": 4.332633692302388, + "learning_rate": 9.885424143801144e-06, + "loss": 0.426, + "step": 1531 + }, + { + "epoch": 0.1, + "grad_norm": 2.067381084276767, + "learning_rate": 9.885207259449367e-06, + "loss": 0.3872, + "step": 1532 + }, + { + "epoch": 0.1, + "grad_norm": 2.012950955899202, + "learning_rate": 9.884990172401664e-06, + "loss": 0.3851, + "step": 1533 + }, + { + "epoch": 0.1, + "grad_norm": 2.509710170829564, + "learning_rate": 9.884772882667045e-06, + "loss": 0.4018, + "step": 1534 + }, + { + "epoch": 0.1, + "grad_norm": 2.557531037945831, + "learning_rate": 9.88455539025452e-06, + "loss": 0.4018, + "step": 1535 + }, + { + "epoch": 0.1, + "grad_norm": 1.8023998549226363, + "learning_rate": 9.884337695173115e-06, + "loss": 0.3882, + "step": 1536 + }, + { + "epoch": 0.1, + "grad_norm": 1.602399167302603, + "learning_rate": 9.884119797431864e-06, + "loss": 0.357, + "step": 1537 + }, + { + "epoch": 0.1, + "grad_norm": 1.3876713227208, + "learning_rate": 9.883901697039809e-06, + "loss": 0.3714, + "step": 1538 + }, + { + "epoch": 0.1, + "grad_norm": 2.6313126107514955, + "learning_rate": 9.883683394005997e-06, + "loss": 0.4047, + "step": 1539 + }, + { + "epoch": 0.1, + "grad_norm": 1.3148597499003523, + "learning_rate": 9.883464888339487e-06, + "loss": 0.3718, + "step": 1540 + }, + { + "epoch": 0.1, + "grad_norm": 1.8969185070682832, + "learning_rate": 9.883246180049345e-06, + "loss": 0.3797, + "step": 1541 + }, + { + "epoch": 0.1, + "grad_norm": 0.7229688177338228, + "learning_rate": 9.883027269144643e-06, + "loss": 0.4941, + "step": 1542 + }, + { + "epoch": 0.1, + "grad_norm": 0.7406262608377298, + "learning_rate": 9.882808155634469e-06, + "loss": 0.4786, + "step": 1543 + }, + { + "epoch": 0.1, + "grad_norm": 4.130647243420708, + "learning_rate": 9.88258883952791e-06, + "loss": 0.3824, + "step": 1544 + }, + { + "epoch": 0.1, + "grad_norm": 3.61938707034376, + "learning_rate": 9.882369320834068e-06, + "loss": 0.4437, + "step": 1545 + }, + { + "epoch": 0.1, + "grad_norm": 2.2788536790741447, + "learning_rate": 9.882149599562052e-06, + "loss": 0.4287, + "step": 1546 + }, + { + "epoch": 0.1, + "grad_norm": 11.4476349201232, + "learning_rate": 9.881929675720976e-06, + "loss": 0.4043, + "step": 1547 + }, + { + "epoch": 0.1, + "grad_norm": 2.2189743944428164, + "learning_rate": 9.881709549319967e-06, + "loss": 0.4272, + "step": 1548 + }, + { + "epoch": 0.1, + "grad_norm": 2.4537071417017278, + "learning_rate": 9.881489220368159e-06, + "loss": 0.424, + "step": 1549 + }, + { + "epoch": 0.1, + "grad_norm": 1.9156323739614187, + "learning_rate": 9.881268688874692e-06, + "loss": 0.4018, + "step": 1550 + }, + { + "epoch": 0.1, + "grad_norm": 1.8939609430497777, + "learning_rate": 9.881047954848716e-06, + "loss": 0.4109, + "step": 1551 + }, + { + "epoch": 0.1, + "grad_norm": 2.203243748382527, + "learning_rate": 9.880827018299392e-06, + "loss": 0.3821, + "step": 1552 + }, + { + "epoch": 0.1, + "grad_norm": 1.6113795149291312, + "learning_rate": 9.880605879235885e-06, + "loss": 0.4107, + "step": 1553 + }, + { + "epoch": 0.1, + "grad_norm": 1.55470526097955, + "learning_rate": 9.880384537667371e-06, + "loss": 0.3983, + "step": 1554 + }, + { + "epoch": 0.1, + "grad_norm": 3.5228408730582013, + "learning_rate": 9.880162993603032e-06, + "loss": 0.3949, + "step": 1555 + }, + { + "epoch": 0.1, + "grad_norm": 1.716271503272738, + "learning_rate": 9.879941247052066e-06, + "loss": 0.3875, + "step": 1556 + }, + { + "epoch": 0.1, + "grad_norm": 5.048444172986198, + "learning_rate": 9.879719298023669e-06, + "loss": 0.3819, + "step": 1557 + }, + { + "epoch": 0.1, + "grad_norm": 1.7746808164630226, + "learning_rate": 9.87949714652705e-06, + "loss": 0.3964, + "step": 1558 + }, + { + "epoch": 0.1, + "grad_norm": 2.0100644144483955, + "learning_rate": 9.879274792571427e-06, + "loss": 0.4053, + "step": 1559 + }, + { + "epoch": 0.1, + "grad_norm": 2.0803703390667034, + "learning_rate": 9.879052236166029e-06, + "loss": 0.4091, + "step": 1560 + }, + { + "epoch": 0.1, + "grad_norm": 2.0160432550163137, + "learning_rate": 9.878829477320085e-06, + "loss": 0.3961, + "step": 1561 + }, + { + "epoch": 0.1, + "grad_norm": 5.542126576885845, + "learning_rate": 9.87860651604284e-06, + "loss": 0.4289, + "step": 1562 + }, + { + "epoch": 0.1, + "grad_norm": 1.8848961311721117, + "learning_rate": 9.878383352343546e-06, + "loss": 0.4093, + "step": 1563 + }, + { + "epoch": 0.1, + "grad_norm": 1.6335960775558451, + "learning_rate": 9.878159986231461e-06, + "loss": 0.3961, + "step": 1564 + }, + { + "epoch": 0.1, + "grad_norm": 1.5331882824257386, + "learning_rate": 9.877936417715856e-06, + "loss": 0.3945, + "step": 1565 + }, + { + "epoch": 0.1, + "grad_norm": 1.216200431204067, + "learning_rate": 9.877712646806003e-06, + "loss": 0.39, + "step": 1566 + }, + { + "epoch": 0.1, + "grad_norm": 13.218505149285058, + "learning_rate": 9.87748867351119e-06, + "loss": 0.3896, + "step": 1567 + }, + { + "epoch": 0.1, + "grad_norm": 2.1156303943815904, + "learning_rate": 9.877264497840707e-06, + "loss": 0.4114, + "step": 1568 + }, + { + "epoch": 0.1, + "grad_norm": 1.875347790840662, + "learning_rate": 9.877040119803855e-06, + "loss": 0.403, + "step": 1569 + }, + { + "epoch": 0.1, + "grad_norm": 2.9594594653352964, + "learning_rate": 9.87681553940995e-06, + "loss": 0.4144, + "step": 1570 + }, + { + "epoch": 0.1, + "grad_norm": 1.7172301747489058, + "learning_rate": 9.876590756668303e-06, + "loss": 0.3885, + "step": 1571 + }, + { + "epoch": 0.1, + "grad_norm": 1.5643726014145547, + "learning_rate": 9.876365771588246e-06, + "loss": 0.3906, + "step": 1572 + }, + { + "epoch": 0.1, + "grad_norm": 1.379988230516699, + "learning_rate": 9.876140584179111e-06, + "loss": 0.4027, + "step": 1573 + }, + { + "epoch": 0.1, + "grad_norm": 1.4426545999875537, + "learning_rate": 9.87591519445024e-06, + "loss": 0.3911, + "step": 1574 + }, + { + "epoch": 0.1, + "grad_norm": 2.4506249836419176, + "learning_rate": 9.87568960241099e-06, + "loss": 0.3985, + "step": 1575 + }, + { + "epoch": 0.1, + "grad_norm": 1.3969053387991355, + "learning_rate": 9.875463808070715e-06, + "loss": 0.395, + "step": 1576 + }, + { + "epoch": 0.1, + "grad_norm": 3.8140566787652515, + "learning_rate": 9.87523781143879e-06, + "loss": 0.3764, + "step": 1577 + }, + { + "epoch": 0.1, + "grad_norm": 1.6049219044618324, + "learning_rate": 9.875011612524588e-06, + "loss": 0.4054, + "step": 1578 + }, + { + "epoch": 0.1, + "grad_norm": 1.6594386474651648, + "learning_rate": 9.874785211337495e-06, + "loss": 0.4294, + "step": 1579 + }, + { + "epoch": 0.1, + "grad_norm": 1.8489354266790934, + "learning_rate": 9.874558607886903e-06, + "loss": 0.4106, + "step": 1580 + }, + { + "epoch": 0.1, + "grad_norm": 1.9675224339666788, + "learning_rate": 9.874331802182218e-06, + "loss": 0.4027, + "step": 1581 + }, + { + "epoch": 0.1, + "grad_norm": 2.438333569282693, + "learning_rate": 9.874104794232849e-06, + "loss": 0.3833, + "step": 1582 + }, + { + "epoch": 0.1, + "grad_norm": 11.758263080262827, + "learning_rate": 9.873877584048215e-06, + "loss": 0.4013, + "step": 1583 + }, + { + "epoch": 0.1, + "grad_norm": 2.3092880221130323, + "learning_rate": 9.873650171637742e-06, + "loss": 0.4122, + "step": 1584 + }, + { + "epoch": 0.1, + "grad_norm": 2.797009473110623, + "learning_rate": 9.873422557010868e-06, + "loss": 0.3947, + "step": 1585 + }, + { + "epoch": 0.1, + "grad_norm": 33.2149911845115, + "learning_rate": 9.873194740177035e-06, + "loss": 0.399, + "step": 1586 + }, + { + "epoch": 0.1, + "grad_norm": 1.8303575239244754, + "learning_rate": 9.872966721145696e-06, + "loss": 0.4253, + "step": 1587 + }, + { + "epoch": 0.1, + "grad_norm": 1.6917698522229314, + "learning_rate": 9.872738499926313e-06, + "loss": 0.4217, + "step": 1588 + }, + { + "epoch": 0.1, + "grad_norm": 1.867497128558469, + "learning_rate": 9.872510076528354e-06, + "loss": 0.39, + "step": 1589 + }, + { + "epoch": 0.1, + "grad_norm": 1.8831302942748656, + "learning_rate": 9.872281450961298e-06, + "loss": 0.3866, + "step": 1590 + }, + { + "epoch": 0.1, + "grad_norm": 1.6060489849243156, + "learning_rate": 9.872052623234632e-06, + "loss": 0.4132, + "step": 1591 + }, + { + "epoch": 0.1, + "grad_norm": 2.1373426595867966, + "learning_rate": 9.871823593357847e-06, + "loss": 0.4262, + "step": 1592 + }, + { + "epoch": 0.1, + "grad_norm": 1.4616934618411233, + "learning_rate": 9.871594361340448e-06, + "loss": 0.377, + "step": 1593 + }, + { + "epoch": 0.1, + "grad_norm": 2.1246185516809977, + "learning_rate": 9.871364927191946e-06, + "loss": 0.4494, + "step": 1594 + }, + { + "epoch": 0.1, + "grad_norm": 1.6667229764513893, + "learning_rate": 9.87113529092186e-06, + "loss": 0.381, + "step": 1595 + }, + { + "epoch": 0.1, + "grad_norm": 1.9040267498341337, + "learning_rate": 9.870905452539721e-06, + "loss": 0.3983, + "step": 1596 + }, + { + "epoch": 0.1, + "grad_norm": 1.445000136769461, + "learning_rate": 9.870675412055061e-06, + "loss": 0.3836, + "step": 1597 + }, + { + "epoch": 0.1, + "grad_norm": 0.98771284005042, + "learning_rate": 9.870445169477428e-06, + "loss": 0.5325, + "step": 1598 + }, + { + "epoch": 0.1, + "grad_norm": 2.22609115830435, + "learning_rate": 9.870214724816373e-06, + "loss": 0.3934, + "step": 1599 + }, + { + "epoch": 0.1, + "grad_norm": 1.9701962859123494, + "learning_rate": 9.869984078081459e-06, + "loss": 0.4125, + "step": 1600 + }, + { + "epoch": 0.1, + "grad_norm": 2.816912168703317, + "learning_rate": 9.869753229282256e-06, + "loss": 0.3964, + "step": 1601 + }, + { + "epoch": 0.1, + "grad_norm": 1.8402158447295298, + "learning_rate": 9.869522178428342e-06, + "loss": 0.3973, + "step": 1602 + }, + { + "epoch": 0.1, + "grad_norm": 1.8551094047563308, + "learning_rate": 9.869290925529303e-06, + "loss": 0.4264, + "step": 1603 + }, + { + "epoch": 0.1, + "grad_norm": 4.053508950539129, + "learning_rate": 9.869059470594734e-06, + "loss": 0.3979, + "step": 1604 + }, + { + "epoch": 0.1, + "grad_norm": 1.370219472752349, + "learning_rate": 9.86882781363424e-06, + "loss": 0.3683, + "step": 1605 + }, + { + "epoch": 0.1, + "grad_norm": 2.420278752012178, + "learning_rate": 9.868595954657432e-06, + "loss": 0.3844, + "step": 1606 + }, + { + "epoch": 0.1, + "grad_norm": 7.094662872243442, + "learning_rate": 9.86836389367393e-06, + "loss": 0.4059, + "step": 1607 + }, + { + "epoch": 0.1, + "grad_norm": 3.17522914834035, + "learning_rate": 9.868131630693363e-06, + "loss": 0.3919, + "step": 1608 + }, + { + "epoch": 0.1, + "grad_norm": 2.0180806861260696, + "learning_rate": 9.867899165725367e-06, + "loss": 0.3979, + "step": 1609 + }, + { + "epoch": 0.1, + "grad_norm": 1.8407740171109623, + "learning_rate": 9.867666498779589e-06, + "loss": 0.3942, + "step": 1610 + }, + { + "epoch": 0.1, + "grad_norm": 1.7393380060798258, + "learning_rate": 9.867433629865682e-06, + "loss": 0.3783, + "step": 1611 + }, + { + "epoch": 0.1, + "grad_norm": 1.653835467011482, + "learning_rate": 9.867200558993308e-06, + "loss": 0.4261, + "step": 1612 + }, + { + "epoch": 0.1, + "grad_norm": 1.3885446718149328, + "learning_rate": 9.866967286172138e-06, + "loss": 0.3663, + "step": 1613 + }, + { + "epoch": 0.1, + "grad_norm": 4.768975395850437, + "learning_rate": 9.866733811411851e-06, + "loss": 0.3793, + "step": 1614 + }, + { + "epoch": 0.1, + "grad_norm": 1.6931236979606916, + "learning_rate": 9.866500134722135e-06, + "loss": 0.3825, + "step": 1615 + }, + { + "epoch": 0.1, + "grad_norm": 1.6536041558537826, + "learning_rate": 9.866266256112683e-06, + "loss": 0.3672, + "step": 1616 + }, + { + "epoch": 0.1, + "grad_norm": 2.2573399623354495, + "learning_rate": 9.8660321755932e-06, + "loss": 0.4195, + "step": 1617 + }, + { + "epoch": 0.1, + "grad_norm": 1.4421460492372233, + "learning_rate": 9.865797893173398e-06, + "loss": 0.3736, + "step": 1618 + }, + { + "epoch": 0.1, + "grad_norm": 2.634157856818414, + "learning_rate": 9.865563408863001e-06, + "loss": 0.439, + "step": 1619 + }, + { + "epoch": 0.1, + "grad_norm": 0.8969091112606098, + "learning_rate": 9.865328722671736e-06, + "loss": 0.4869, + "step": 1620 + }, + { + "epoch": 0.1, + "grad_norm": 3.780938074845408, + "learning_rate": 9.86509383460934e-06, + "loss": 0.3823, + "step": 1621 + }, + { + "epoch": 0.1, + "grad_norm": 3.2094474894144644, + "learning_rate": 9.86485874468556e-06, + "loss": 0.4181, + "step": 1622 + }, + { + "epoch": 0.1, + "grad_norm": 1.9213414327350735, + "learning_rate": 9.864623452910147e-06, + "loss": 0.3897, + "step": 1623 + }, + { + "epoch": 0.1, + "grad_norm": 1.595731521198233, + "learning_rate": 9.86438795929287e-06, + "loss": 0.4508, + "step": 1624 + }, + { + "epoch": 0.1, + "grad_norm": 1.2749829925861649, + "learning_rate": 9.864152263843494e-06, + "loss": 0.3683, + "step": 1625 + }, + { + "epoch": 0.1, + "grad_norm": 1.931634651898662, + "learning_rate": 9.863916366571801e-06, + "loss": 0.3681, + "step": 1626 + }, + { + "epoch": 0.1, + "grad_norm": 2.047504369413371, + "learning_rate": 9.863680267487579e-06, + "loss": 0.3851, + "step": 1627 + }, + { + "epoch": 0.1, + "grad_norm": 1.562617848727528, + "learning_rate": 9.863443966600625e-06, + "loss": 0.43, + "step": 1628 + }, + { + "epoch": 0.1, + "grad_norm": 2.3665948079790065, + "learning_rate": 9.863207463920741e-06, + "loss": 0.4091, + "step": 1629 + }, + { + "epoch": 0.1, + "grad_norm": 1.9577528694811124, + "learning_rate": 9.862970759457741e-06, + "loss": 0.3883, + "step": 1630 + }, + { + "epoch": 0.1, + "grad_norm": 1.7101952810443333, + "learning_rate": 9.86273385322145e-06, + "loss": 0.3971, + "step": 1631 + }, + { + "epoch": 0.1, + "grad_norm": 1.8747489452894384, + "learning_rate": 9.862496745221691e-06, + "loss": 0.3899, + "step": 1632 + }, + { + "epoch": 0.1, + "grad_norm": 4.399927821968412, + "learning_rate": 9.862259435468305e-06, + "loss": 0.4037, + "step": 1633 + }, + { + "epoch": 0.1, + "grad_norm": 3.434823447356248, + "learning_rate": 9.862021923971139e-06, + "loss": 0.4224, + "step": 1634 + }, + { + "epoch": 0.1, + "grad_norm": 1.7375833342698581, + "learning_rate": 9.861784210740048e-06, + "loss": 0.3817, + "step": 1635 + }, + { + "epoch": 0.1, + "grad_norm": 1.8843841630356726, + "learning_rate": 9.861546295784896e-06, + "loss": 0.3869, + "step": 1636 + }, + { + "epoch": 0.1, + "grad_norm": 1.8641709311826786, + "learning_rate": 9.86130817911555e-06, + "loss": 0.3636, + "step": 1637 + }, + { + "epoch": 0.1, + "grad_norm": 2.42880075483143, + "learning_rate": 9.861069860741896e-06, + "loss": 0.4223, + "step": 1638 + }, + { + "epoch": 0.1, + "grad_norm": 1.5093568068595866, + "learning_rate": 9.860831340673818e-06, + "loss": 0.3893, + "step": 1639 + }, + { + "epoch": 0.1, + "grad_norm": 1.479179518708439, + "learning_rate": 9.860592618921213e-06, + "loss": 0.3596, + "step": 1640 + }, + { + "epoch": 0.1, + "grad_norm": 1.8754934456139913, + "learning_rate": 9.860353695493987e-06, + "loss": 0.411, + "step": 1641 + }, + { + "epoch": 0.1, + "grad_norm": 2.0307806127913177, + "learning_rate": 9.860114570402055e-06, + "loss": 0.4007, + "step": 1642 + }, + { + "epoch": 0.1, + "grad_norm": 0.9901093263597351, + "learning_rate": 9.859875243655336e-06, + "loss": 0.5332, + "step": 1643 + }, + { + "epoch": 0.1, + "grad_norm": 1.6527672581452526, + "learning_rate": 9.85963571526376e-06, + "loss": 0.4029, + "step": 1644 + }, + { + "epoch": 0.1, + "grad_norm": 1.9962457409802663, + "learning_rate": 9.859395985237268e-06, + "loss": 0.4079, + "step": 1645 + }, + { + "epoch": 0.1, + "grad_norm": 1.8445980379975375, + "learning_rate": 9.859156053585805e-06, + "loss": 0.4168, + "step": 1646 + }, + { + "epoch": 0.1, + "grad_norm": 1.6673165701862798, + "learning_rate": 9.858915920319325e-06, + "loss": 0.3907, + "step": 1647 + }, + { + "epoch": 0.1, + "grad_norm": 2.1450276935779082, + "learning_rate": 9.858675585447795e-06, + "loss": 0.3825, + "step": 1648 + }, + { + "epoch": 0.1, + "grad_norm": 1.9224143513215968, + "learning_rate": 9.858435048981184e-06, + "loss": 0.4104, + "step": 1649 + }, + { + "epoch": 0.1, + "grad_norm": 11.099476366730014, + "learning_rate": 9.858194310929474e-06, + "loss": 0.4048, + "step": 1650 + }, + { + "epoch": 0.1, + "grad_norm": 1.7980374577923077, + "learning_rate": 9.857953371302651e-06, + "loss": 0.3801, + "step": 1651 + }, + { + "epoch": 0.1, + "grad_norm": 1.4202534694887032, + "learning_rate": 9.857712230110717e-06, + "loss": 0.37, + "step": 1652 + }, + { + "epoch": 0.1, + "grad_norm": 1.7026733049341634, + "learning_rate": 9.857470887363672e-06, + "loss": 0.3849, + "step": 1653 + }, + { + "epoch": 0.1, + "grad_norm": 1.9840497547579354, + "learning_rate": 9.857229343071532e-06, + "loss": 0.4376, + "step": 1654 + }, + { + "epoch": 0.1, + "grad_norm": 2.3932046299332677, + "learning_rate": 9.85698759724432e-06, + "loss": 0.3797, + "step": 1655 + }, + { + "epoch": 0.1, + "grad_norm": 1.6382322967133962, + "learning_rate": 9.856745649892066e-06, + "loss": 0.3596, + "step": 1656 + }, + { + "epoch": 0.1, + "grad_norm": 3.638171228039363, + "learning_rate": 9.856503501024807e-06, + "loss": 0.4371, + "step": 1657 + }, + { + "epoch": 0.1, + "grad_norm": 2.0245138275986685, + "learning_rate": 9.856261150652593e-06, + "loss": 0.3624, + "step": 1658 + }, + { + "epoch": 0.1, + "grad_norm": 1.5752020038104408, + "learning_rate": 9.856018598785477e-06, + "loss": 0.4202, + "step": 1659 + }, + { + "epoch": 0.1, + "grad_norm": 1.6923476571685003, + "learning_rate": 9.855775845433527e-06, + "loss": 0.4174, + "step": 1660 + }, + { + "epoch": 0.1, + "grad_norm": 1.9751350900252784, + "learning_rate": 9.855532890606809e-06, + "loss": 0.4053, + "step": 1661 + }, + { + "epoch": 0.1, + "grad_norm": 1.6875648058751105, + "learning_rate": 9.855289734315407e-06, + "loss": 0.4246, + "step": 1662 + }, + { + "epoch": 0.1, + "grad_norm": 1.7199870567275772, + "learning_rate": 9.855046376569412e-06, + "loss": 0.3807, + "step": 1663 + }, + { + "epoch": 0.1, + "grad_norm": 2.969451586330859, + "learning_rate": 9.854802817378918e-06, + "loss": 0.3796, + "step": 1664 + }, + { + "epoch": 0.1, + "grad_norm": 2.6592376380297793, + "learning_rate": 9.854559056754031e-06, + "loss": 0.4015, + "step": 1665 + }, + { + "epoch": 0.1, + "grad_norm": 1.5959033763493164, + "learning_rate": 9.85431509470487e-06, + "loss": 0.3982, + "step": 1666 + }, + { + "epoch": 0.1, + "grad_norm": 1.7046308592515858, + "learning_rate": 9.85407093124155e-06, + "loss": 0.3678, + "step": 1667 + }, + { + "epoch": 0.1, + "grad_norm": 2.102849780940891, + "learning_rate": 9.853826566374206e-06, + "loss": 0.4168, + "step": 1668 + }, + { + "epoch": 0.1, + "grad_norm": 1.734985635647761, + "learning_rate": 9.853582000112976e-06, + "loss": 0.3746, + "step": 1669 + }, + { + "epoch": 0.11, + "grad_norm": 1.045023648234381, + "learning_rate": 9.853337232468008e-06, + "loss": 0.5443, + "step": 1670 + }, + { + "epoch": 0.11, + "grad_norm": 2.4610084570737127, + "learning_rate": 9.85309226344946e-06, + "loss": 0.3926, + "step": 1671 + }, + { + "epoch": 0.11, + "grad_norm": 1.8481416710293914, + "learning_rate": 9.85284709306749e-06, + "loss": 0.4055, + "step": 1672 + }, + { + "epoch": 0.11, + "grad_norm": 5.965990410125677, + "learning_rate": 9.852601721332278e-06, + "loss": 0.3732, + "step": 1673 + }, + { + "epoch": 0.11, + "grad_norm": 11.52394801893013, + "learning_rate": 9.852356148253999e-06, + "loss": 0.4066, + "step": 1674 + }, + { + "epoch": 0.11, + "grad_norm": 5.366918402226468, + "learning_rate": 9.852110373842846e-06, + "loss": 0.3858, + "step": 1675 + }, + { + "epoch": 0.11, + "grad_norm": 1.9034868674091014, + "learning_rate": 9.851864398109015e-06, + "loss": 0.4044, + "step": 1676 + }, + { + "epoch": 0.11, + "grad_norm": 1.471418975171063, + "learning_rate": 9.851618221062712e-06, + "loss": 0.3832, + "step": 1677 + }, + { + "epoch": 0.11, + "grad_norm": 1.7960215718083363, + "learning_rate": 9.851371842714152e-06, + "loss": 0.3868, + "step": 1678 + }, + { + "epoch": 0.11, + "grad_norm": 2.1817685990868085, + "learning_rate": 9.851125263073556e-06, + "loss": 0.41, + "step": 1679 + }, + { + "epoch": 0.11, + "grad_norm": 1.971487075980344, + "learning_rate": 9.850878482151158e-06, + "loss": 0.4223, + "step": 1680 + }, + { + "epoch": 0.11, + "grad_norm": 1.410151505915163, + "learning_rate": 9.850631499957196e-06, + "loss": 0.3754, + "step": 1681 + }, + { + "epoch": 0.11, + "grad_norm": 2.067354857684912, + "learning_rate": 9.850384316501915e-06, + "loss": 0.3854, + "step": 1682 + }, + { + "epoch": 0.11, + "grad_norm": 1.8751145476407016, + "learning_rate": 9.850136931795576e-06, + "loss": 0.3807, + "step": 1683 + }, + { + "epoch": 0.11, + "grad_norm": 1.7303156103006214, + "learning_rate": 9.849889345848438e-06, + "loss": 0.4143, + "step": 1684 + }, + { + "epoch": 0.11, + "grad_norm": 1.4127764967959593, + "learning_rate": 9.849641558670779e-06, + "loss": 0.3912, + "step": 1685 + }, + { + "epoch": 0.11, + "grad_norm": 1.9093287900042089, + "learning_rate": 9.849393570272875e-06, + "loss": 0.4027, + "step": 1686 + }, + { + "epoch": 0.11, + "grad_norm": 1.5926673449856141, + "learning_rate": 9.849145380665021e-06, + "loss": 0.4059, + "step": 1687 + }, + { + "epoch": 0.11, + "grad_norm": 2.00550667338651, + "learning_rate": 9.848896989857512e-06, + "loss": 0.3613, + "step": 1688 + }, + { + "epoch": 0.11, + "grad_norm": 1.7627412396817441, + "learning_rate": 9.848648397860654e-06, + "loss": 0.3947, + "step": 1689 + }, + { + "epoch": 0.11, + "grad_norm": 0.8667088767380863, + "learning_rate": 9.848399604684762e-06, + "loss": 0.486, + "step": 1690 + }, + { + "epoch": 0.11, + "grad_norm": 3.3910336632697318, + "learning_rate": 9.848150610340158e-06, + "loss": 0.3763, + "step": 1691 + }, + { + "epoch": 0.11, + "grad_norm": 1.9004403833278962, + "learning_rate": 9.847901414837173e-06, + "loss": 0.4266, + "step": 1692 + }, + { + "epoch": 0.11, + "grad_norm": 2.020778797956488, + "learning_rate": 9.847652018186149e-06, + "loss": 0.3843, + "step": 1693 + }, + { + "epoch": 0.11, + "grad_norm": 1.7097985113804344, + "learning_rate": 9.847402420397431e-06, + "loss": 0.4501, + "step": 1694 + }, + { + "epoch": 0.11, + "grad_norm": 2.795067892387273, + "learning_rate": 9.847152621481378e-06, + "loss": 0.3937, + "step": 1695 + }, + { + "epoch": 0.11, + "grad_norm": 2.137775892968101, + "learning_rate": 9.846902621448354e-06, + "loss": 0.4002, + "step": 1696 + }, + { + "epoch": 0.11, + "grad_norm": 3.2106505072535425, + "learning_rate": 9.846652420308728e-06, + "loss": 0.4096, + "step": 1697 + }, + { + "epoch": 0.11, + "grad_norm": 2.252113147118114, + "learning_rate": 9.846402018072888e-06, + "loss": 0.4264, + "step": 1698 + }, + { + "epoch": 0.11, + "grad_norm": 1.8598361009513775, + "learning_rate": 9.846151414751217e-06, + "loss": 0.3912, + "step": 1699 + }, + { + "epoch": 0.11, + "grad_norm": 0.6481025561732026, + "learning_rate": 9.845900610354117e-06, + "loss": 0.4814, + "step": 1700 + }, + { + "epoch": 0.11, + "grad_norm": 1.8954598861269756, + "learning_rate": 9.845649604891996e-06, + "loss": 0.3972, + "step": 1701 + }, + { + "epoch": 0.11, + "grad_norm": 1.6747124277930796, + "learning_rate": 9.845398398375264e-06, + "loss": 0.386, + "step": 1702 + }, + { + "epoch": 0.11, + "grad_norm": 1.9833024773849135, + "learning_rate": 9.845146990814345e-06, + "loss": 0.3548, + "step": 1703 + }, + { + "epoch": 0.11, + "grad_norm": 2.7982434928052577, + "learning_rate": 9.844895382219673e-06, + "loss": 0.3917, + "step": 1704 + }, + { + "epoch": 0.11, + "grad_norm": 2.620190892161271, + "learning_rate": 9.844643572601685e-06, + "loss": 0.4271, + "step": 1705 + }, + { + "epoch": 0.11, + "grad_norm": 1.9066001799296057, + "learning_rate": 9.844391561970831e-06, + "loss": 0.379, + "step": 1706 + }, + { + "epoch": 0.11, + "grad_norm": 2.0798176899619896, + "learning_rate": 9.844139350337566e-06, + "loss": 0.3768, + "step": 1707 + }, + { + "epoch": 0.11, + "grad_norm": 4.166746859557011, + "learning_rate": 9.843886937712355e-06, + "loss": 0.3948, + "step": 1708 + }, + { + "epoch": 0.11, + "grad_norm": 2.285812184538659, + "learning_rate": 9.843634324105671e-06, + "loss": 0.4038, + "step": 1709 + }, + { + "epoch": 0.11, + "grad_norm": 2.289329268104687, + "learning_rate": 9.843381509527997e-06, + "loss": 0.3844, + "step": 1710 + }, + { + "epoch": 0.11, + "grad_norm": 1.8602726742533615, + "learning_rate": 9.84312849398982e-06, + "loss": 0.3818, + "step": 1711 + }, + { + "epoch": 0.11, + "grad_norm": 0.6256248754386069, + "learning_rate": 9.84287527750164e-06, + "loss": 0.4762, + "step": 1712 + }, + { + "epoch": 0.11, + "grad_norm": 1.8693918376721483, + "learning_rate": 9.842621860073963e-06, + "loss": 0.3712, + "step": 1713 + }, + { + "epoch": 0.11, + "grad_norm": 3.3307457721439757, + "learning_rate": 9.842368241717304e-06, + "loss": 0.3741, + "step": 1714 + }, + { + "epoch": 0.11, + "grad_norm": 2.146545990434858, + "learning_rate": 9.842114422442184e-06, + "loss": 0.3798, + "step": 1715 + }, + { + "epoch": 0.11, + "grad_norm": 0.6478512616698305, + "learning_rate": 9.841860402259139e-06, + "loss": 0.4784, + "step": 1716 + }, + { + "epoch": 0.11, + "grad_norm": 2.7282611243319868, + "learning_rate": 9.841606181178703e-06, + "loss": 0.4248, + "step": 1717 + }, + { + "epoch": 0.11, + "grad_norm": 2.2365112124158246, + "learning_rate": 9.841351759211426e-06, + "loss": 0.4063, + "step": 1718 + }, + { + "epoch": 0.11, + "grad_norm": 1.9877658777221152, + "learning_rate": 9.841097136367868e-06, + "loss": 0.3733, + "step": 1719 + }, + { + "epoch": 0.11, + "grad_norm": 1.70774960444382, + "learning_rate": 9.84084231265859e-06, + "loss": 0.3845, + "step": 1720 + }, + { + "epoch": 0.11, + "grad_norm": 1.967471005270002, + "learning_rate": 9.840587288094165e-06, + "loss": 0.3592, + "step": 1721 + }, + { + "epoch": 0.11, + "grad_norm": 4.7789218726089935, + "learning_rate": 9.840332062685179e-06, + "loss": 0.3853, + "step": 1722 + }, + { + "epoch": 0.11, + "grad_norm": 3.882678991935357, + "learning_rate": 9.840076636442215e-06, + "loss": 0.4165, + "step": 1723 + }, + { + "epoch": 0.11, + "grad_norm": 8.273843033979599, + "learning_rate": 9.839821009375876e-06, + "loss": 0.3865, + "step": 1724 + }, + { + "epoch": 0.11, + "grad_norm": 2.138827529030236, + "learning_rate": 9.839565181496766e-06, + "loss": 0.4044, + "step": 1725 + }, + { + "epoch": 0.11, + "grad_norm": 1.5108712898976502, + "learning_rate": 9.839309152815501e-06, + "loss": 0.4047, + "step": 1726 + }, + { + "epoch": 0.11, + "grad_norm": 2.570049755105157, + "learning_rate": 9.839052923342704e-06, + "loss": 0.3974, + "step": 1727 + }, + { + "epoch": 0.11, + "grad_norm": 2.2654150722194646, + "learning_rate": 9.838796493089004e-06, + "loss": 0.4003, + "step": 1728 + }, + { + "epoch": 0.11, + "grad_norm": 2.524340356891728, + "learning_rate": 9.838539862065047e-06, + "loss": 0.3893, + "step": 1729 + }, + { + "epoch": 0.11, + "grad_norm": 0.7232827143685543, + "learning_rate": 9.838283030281472e-06, + "loss": 0.4895, + "step": 1730 + }, + { + "epoch": 0.11, + "grad_norm": 1.608871570310679, + "learning_rate": 9.838025997748943e-06, + "loss": 0.3789, + "step": 1731 + }, + { + "epoch": 0.11, + "grad_norm": 2.332670753315059, + "learning_rate": 9.837768764478121e-06, + "loss": 0.3698, + "step": 1732 + }, + { + "epoch": 0.11, + "grad_norm": 1.937897982725912, + "learning_rate": 9.837511330479683e-06, + "loss": 0.4052, + "step": 1733 + }, + { + "epoch": 0.11, + "grad_norm": 2.0249010151356908, + "learning_rate": 9.837253695764304e-06, + "loss": 0.3733, + "step": 1734 + }, + { + "epoch": 0.11, + "grad_norm": 2.201770709371206, + "learning_rate": 9.83699586034268e-06, + "loss": 0.3763, + "step": 1735 + }, + { + "epoch": 0.11, + "grad_norm": 0.6129306947845914, + "learning_rate": 9.836737824225504e-06, + "loss": 0.4478, + "step": 1736 + }, + { + "epoch": 0.11, + "grad_norm": 2.872094004617166, + "learning_rate": 9.836479587423487e-06, + "loss": 0.4344, + "step": 1737 + }, + { + "epoch": 0.11, + "grad_norm": 3.1390342220198892, + "learning_rate": 9.836221149947339e-06, + "loss": 0.3946, + "step": 1738 + }, + { + "epoch": 0.11, + "grad_norm": 1.943947248325319, + "learning_rate": 9.835962511807786e-06, + "loss": 0.3732, + "step": 1739 + }, + { + "epoch": 0.11, + "grad_norm": 1.724561629835258, + "learning_rate": 9.835703673015559e-06, + "loss": 0.3871, + "step": 1740 + }, + { + "epoch": 0.11, + "grad_norm": 5.4942667460140875, + "learning_rate": 9.835444633581398e-06, + "loss": 0.3933, + "step": 1741 + }, + { + "epoch": 0.11, + "grad_norm": 2.564816540790656, + "learning_rate": 9.83518539351605e-06, + "loss": 0.4036, + "step": 1742 + }, + { + "epoch": 0.11, + "grad_norm": 2.6389411447725055, + "learning_rate": 9.834925952830272e-06, + "loss": 0.3921, + "step": 1743 + }, + { + "epoch": 0.11, + "grad_norm": 6.692429649224205, + "learning_rate": 9.834666311534828e-06, + "loss": 0.3958, + "step": 1744 + }, + { + "epoch": 0.11, + "grad_norm": 1.749031893229669, + "learning_rate": 9.834406469640492e-06, + "loss": 0.3612, + "step": 1745 + }, + { + "epoch": 0.11, + "grad_norm": 1.7740671276584536, + "learning_rate": 9.834146427158043e-06, + "loss": 0.4254, + "step": 1746 + }, + { + "epoch": 0.11, + "grad_norm": 0.7175704351884112, + "learning_rate": 9.833886184098273e-06, + "loss": 0.4693, + "step": 1747 + }, + { + "epoch": 0.11, + "grad_norm": 1.7262648669789762, + "learning_rate": 9.833625740471981e-06, + "loss": 0.3786, + "step": 1748 + }, + { + "epoch": 0.11, + "grad_norm": 1.563889080426523, + "learning_rate": 9.833365096289971e-06, + "loss": 0.3518, + "step": 1749 + }, + { + "epoch": 0.11, + "grad_norm": 3.300867437494952, + "learning_rate": 9.833104251563058e-06, + "loss": 0.411, + "step": 1750 + }, + { + "epoch": 0.11, + "grad_norm": 7.5285836738716725, + "learning_rate": 9.832843206302063e-06, + "loss": 0.373, + "step": 1751 + }, + { + "epoch": 0.11, + "grad_norm": 1.806762271498663, + "learning_rate": 9.832581960517821e-06, + "loss": 0.3953, + "step": 1752 + }, + { + "epoch": 0.11, + "grad_norm": 3.281968199466739, + "learning_rate": 9.83232051422117e-06, + "loss": 0.359, + "step": 1753 + }, + { + "epoch": 0.11, + "grad_norm": 1.617781802791413, + "learning_rate": 9.832058867422959e-06, + "loss": 0.3946, + "step": 1754 + }, + { + "epoch": 0.11, + "grad_norm": 2.94586607218607, + "learning_rate": 9.831797020134039e-06, + "loss": 0.4012, + "step": 1755 + }, + { + "epoch": 0.11, + "grad_norm": 2.3379790367971496, + "learning_rate": 9.831534972365282e-06, + "loss": 0.3921, + "step": 1756 + }, + { + "epoch": 0.11, + "grad_norm": 1.5255728322019193, + "learning_rate": 9.831272724127555e-06, + "loss": 0.3752, + "step": 1757 + }, + { + "epoch": 0.11, + "grad_norm": 2.2969952038498853, + "learning_rate": 9.831010275431743e-06, + "loss": 0.3881, + "step": 1758 + }, + { + "epoch": 0.11, + "grad_norm": 0.9045826114081688, + "learning_rate": 9.830747626288732e-06, + "loss": 0.4727, + "step": 1759 + }, + { + "epoch": 0.11, + "grad_norm": 1.9483183425740722, + "learning_rate": 9.830484776709424e-06, + "loss": 0.3546, + "step": 1760 + }, + { + "epoch": 0.11, + "grad_norm": 2.637340387906478, + "learning_rate": 9.830221726704721e-06, + "loss": 0.3681, + "step": 1761 + }, + { + "epoch": 0.11, + "grad_norm": 2.1048434018018094, + "learning_rate": 9.82995847628554e-06, + "loss": 0.3739, + "step": 1762 + }, + { + "epoch": 0.11, + "grad_norm": 2.3747090243167563, + "learning_rate": 9.829695025462803e-06, + "loss": 0.3902, + "step": 1763 + }, + { + "epoch": 0.11, + "grad_norm": 1.6126801855514128, + "learning_rate": 9.82943137424744e-06, + "loss": 0.3728, + "step": 1764 + }, + { + "epoch": 0.11, + "grad_norm": 1.7642456284124133, + "learning_rate": 9.82916752265039e-06, + "loss": 0.3927, + "step": 1765 + }, + { + "epoch": 0.11, + "grad_norm": 11.211968680808807, + "learning_rate": 9.828903470682604e-06, + "loss": 0.3728, + "step": 1766 + }, + { + "epoch": 0.11, + "grad_norm": 4.0141784081619045, + "learning_rate": 9.828639218355036e-06, + "loss": 0.3732, + "step": 1767 + }, + { + "epoch": 0.11, + "grad_norm": 2.972918918329857, + "learning_rate": 9.82837476567865e-06, + "loss": 0.3737, + "step": 1768 + }, + { + "epoch": 0.11, + "grad_norm": 2.4618233814111172, + "learning_rate": 9.828110112664417e-06, + "loss": 0.3696, + "step": 1769 + }, + { + "epoch": 0.11, + "grad_norm": 1.7775168603784308, + "learning_rate": 9.82784525932332e-06, + "loss": 0.3962, + "step": 1770 + }, + { + "epoch": 0.11, + "grad_norm": 2.6929858049923627, + "learning_rate": 9.82758020566635e-06, + "loss": 0.3979, + "step": 1771 + }, + { + "epoch": 0.11, + "grad_norm": 1.6325018358882113, + "learning_rate": 9.827314951704501e-06, + "loss": 0.3647, + "step": 1772 + }, + { + "epoch": 0.11, + "grad_norm": 1.457398218064355, + "learning_rate": 9.827049497448782e-06, + "loss": 0.3767, + "step": 1773 + }, + { + "epoch": 0.11, + "grad_norm": 3.421800879401619, + "learning_rate": 9.826783842910203e-06, + "loss": 0.3973, + "step": 1774 + }, + { + "epoch": 0.11, + "grad_norm": 2.3614445207751285, + "learning_rate": 9.826517988099793e-06, + "loss": 0.3866, + "step": 1775 + }, + { + "epoch": 0.11, + "grad_norm": 2.1527589195533974, + "learning_rate": 9.826251933028574e-06, + "loss": 0.3845, + "step": 1776 + }, + { + "epoch": 0.11, + "grad_norm": 2.9785993283026837, + "learning_rate": 9.825985677707593e-06, + "loss": 0.3833, + "step": 1777 + }, + { + "epoch": 0.11, + "grad_norm": 2.5470676531409917, + "learning_rate": 9.825719222147894e-06, + "loss": 0.3563, + "step": 1778 + }, + { + "epoch": 0.11, + "grad_norm": 1.671811416386511, + "learning_rate": 9.825452566360533e-06, + "loss": 0.5295, + "step": 1779 + }, + { + "epoch": 0.11, + "grad_norm": 1.9049135842674518, + "learning_rate": 9.825185710356573e-06, + "loss": 0.3834, + "step": 1780 + }, + { + "epoch": 0.11, + "grad_norm": 1.9408021248551481, + "learning_rate": 9.824918654147088e-06, + "loss": 0.3884, + "step": 1781 + }, + { + "epoch": 0.11, + "grad_norm": 5.364538884082852, + "learning_rate": 9.824651397743159e-06, + "loss": 0.3815, + "step": 1782 + }, + { + "epoch": 0.11, + "grad_norm": 2.100834262112139, + "learning_rate": 9.824383941155872e-06, + "loss": 0.3906, + "step": 1783 + }, + { + "epoch": 0.11, + "grad_norm": 3.679919936795614, + "learning_rate": 9.824116284396328e-06, + "loss": 0.3599, + "step": 1784 + }, + { + "epoch": 0.11, + "grad_norm": 3.890702532650259, + "learning_rate": 9.82384842747563e-06, + "loss": 0.4019, + "step": 1785 + }, + { + "epoch": 0.11, + "grad_norm": 2.3214655606383343, + "learning_rate": 9.823580370404893e-06, + "loss": 0.4022, + "step": 1786 + }, + { + "epoch": 0.11, + "grad_norm": 2.200897581694426, + "learning_rate": 9.823312113195238e-06, + "loss": 0.3882, + "step": 1787 + }, + { + "epoch": 0.11, + "grad_norm": 2.072186629286691, + "learning_rate": 9.823043655857796e-06, + "loss": 0.3611, + "step": 1788 + }, + { + "epoch": 0.11, + "grad_norm": 1.6297660956814268, + "learning_rate": 9.822774998403707e-06, + "loss": 0.3635, + "step": 1789 + }, + { + "epoch": 0.11, + "grad_norm": 2.292905578531731, + "learning_rate": 9.822506140844118e-06, + "loss": 0.3811, + "step": 1790 + }, + { + "epoch": 0.11, + "grad_norm": 1.7087918382651415, + "learning_rate": 9.82223708319018e-06, + "loss": 0.37, + "step": 1791 + }, + { + "epoch": 0.11, + "grad_norm": 2.146685421380077, + "learning_rate": 9.821967825453063e-06, + "loss": 0.38, + "step": 1792 + }, + { + "epoch": 0.11, + "grad_norm": 2.2668390265253864, + "learning_rate": 9.821698367643936e-06, + "loss": 0.3637, + "step": 1793 + }, + { + "epoch": 0.11, + "grad_norm": 1.5321677385444092, + "learning_rate": 9.82142870977398e-06, + "loss": 0.3579, + "step": 1794 + }, + { + "epoch": 0.11, + "grad_norm": 2.4785359192400516, + "learning_rate": 9.821158851854381e-06, + "loss": 0.3679, + "step": 1795 + }, + { + "epoch": 0.11, + "grad_norm": 2.7017927566259976, + "learning_rate": 9.82088879389634e-06, + "loss": 0.3792, + "step": 1796 + }, + { + "epoch": 0.11, + "grad_norm": 5.231011743947335, + "learning_rate": 9.820618535911057e-06, + "loss": 0.3807, + "step": 1797 + }, + { + "epoch": 0.11, + "grad_norm": 5.535052581022571, + "learning_rate": 9.820348077909751e-06, + "loss": 0.3752, + "step": 1798 + }, + { + "epoch": 0.11, + "grad_norm": 2.2642552869800614, + "learning_rate": 9.820077419903642e-06, + "loss": 0.3643, + "step": 1799 + }, + { + "epoch": 0.11, + "grad_norm": 1.856920118871183, + "learning_rate": 9.819806561903958e-06, + "loss": 0.3782, + "step": 1800 + }, + { + "epoch": 0.11, + "grad_norm": 1.4157727619762503, + "learning_rate": 9.819535503921939e-06, + "loss": 0.3795, + "step": 1801 + }, + { + "epoch": 0.11, + "grad_norm": 1.6028140208829351, + "learning_rate": 9.819264245968831e-06, + "loss": 0.36, + "step": 1802 + }, + { + "epoch": 0.11, + "grad_norm": 2.1175706556514085, + "learning_rate": 9.81899278805589e-06, + "loss": 0.379, + "step": 1803 + }, + { + "epoch": 0.11, + "grad_norm": 1.7873196904004678, + "learning_rate": 9.81872113019438e-06, + "loss": 0.3611, + "step": 1804 + }, + { + "epoch": 0.11, + "grad_norm": 3.47258866737647, + "learning_rate": 9.818449272395569e-06, + "loss": 0.4067, + "step": 1805 + }, + { + "epoch": 0.11, + "grad_norm": 1.6120758077434205, + "learning_rate": 9.818177214670742e-06, + "loss": 0.3958, + "step": 1806 + }, + { + "epoch": 0.11, + "grad_norm": 2.1997375866342472, + "learning_rate": 9.81790495703118e-06, + "loss": 0.3687, + "step": 1807 + }, + { + "epoch": 0.11, + "grad_norm": 1.8691936911868914, + "learning_rate": 9.817632499488188e-06, + "loss": 0.3963, + "step": 1808 + }, + { + "epoch": 0.11, + "grad_norm": 2.93149599238276, + "learning_rate": 9.817359842053064e-06, + "loss": 0.3514, + "step": 1809 + }, + { + "epoch": 0.11, + "grad_norm": 2.8647426600058616, + "learning_rate": 9.817086984737126e-06, + "loss": 0.378, + "step": 1810 + }, + { + "epoch": 0.11, + "grad_norm": 1.7732346592905521, + "learning_rate": 9.81681392755169e-06, + "loss": 0.3994, + "step": 1811 + }, + { + "epoch": 0.11, + "grad_norm": 0.7418135546211012, + "learning_rate": 9.81654067050809e-06, + "loss": 0.491, + "step": 1812 + }, + { + "epoch": 0.11, + "grad_norm": 1.9501015746712647, + "learning_rate": 9.816267213617663e-06, + "loss": 0.3689, + "step": 1813 + }, + { + "epoch": 0.11, + "grad_norm": 2.1195063900736173, + "learning_rate": 9.815993556891753e-06, + "loss": 0.3815, + "step": 1814 + }, + { + "epoch": 0.11, + "grad_norm": 2.2836222840638096, + "learning_rate": 9.815719700341717e-06, + "loss": 0.3979, + "step": 1815 + }, + { + "epoch": 0.11, + "grad_norm": 1.610703393567781, + "learning_rate": 9.815445643978918e-06, + "loss": 0.3927, + "step": 1816 + }, + { + "epoch": 0.11, + "grad_norm": 2.370038951133397, + "learning_rate": 9.815171387814726e-06, + "loss": 0.411, + "step": 1817 + }, + { + "epoch": 0.11, + "grad_norm": 2.3157353744370717, + "learning_rate": 9.814896931860518e-06, + "loss": 0.3883, + "step": 1818 + }, + { + "epoch": 0.11, + "grad_norm": 1.9855552743737201, + "learning_rate": 9.814622276127685e-06, + "loss": 0.3855, + "step": 1819 + }, + { + "epoch": 0.11, + "grad_norm": 2.4350674817463283, + "learning_rate": 9.814347420627624e-06, + "loss": 0.3797, + "step": 1820 + }, + { + "epoch": 0.11, + "grad_norm": 2.1328802884017537, + "learning_rate": 9.814072365371736e-06, + "loss": 0.4107, + "step": 1821 + }, + { + "epoch": 0.11, + "grad_norm": 1.8727191776776886, + "learning_rate": 9.813797110371435e-06, + "loss": 0.4059, + "step": 1822 + }, + { + "epoch": 0.11, + "grad_norm": 2.4758822076628575, + "learning_rate": 9.81352165563814e-06, + "loss": 0.3964, + "step": 1823 + }, + { + "epoch": 0.11, + "grad_norm": 1.7728905932904224, + "learning_rate": 9.813246001183283e-06, + "loss": 0.3744, + "step": 1824 + }, + { + "epoch": 0.11, + "grad_norm": 3.3548693281117927, + "learning_rate": 9.812970147018301e-06, + "loss": 0.387, + "step": 1825 + }, + { + "epoch": 0.11, + "grad_norm": 0.7098037795919612, + "learning_rate": 9.812694093154637e-06, + "loss": 0.4919, + "step": 1826 + }, + { + "epoch": 0.11, + "grad_norm": 1.8816550590224619, + "learning_rate": 9.812417839603748e-06, + "loss": 0.3631, + "step": 1827 + }, + { + "epoch": 0.11, + "grad_norm": 2.3025814611521094, + "learning_rate": 9.812141386377095e-06, + "loss": 0.3602, + "step": 1828 + }, + { + "epoch": 0.12, + "grad_norm": 3.8788408297975687, + "learning_rate": 9.811864733486148e-06, + "loss": 0.3771, + "step": 1829 + }, + { + "epoch": 0.12, + "grad_norm": 1.3724140881944669, + "learning_rate": 9.811587880942387e-06, + "loss": 0.3961, + "step": 1830 + }, + { + "epoch": 0.12, + "grad_norm": 2.4392367378303383, + "learning_rate": 9.8113108287573e-06, + "loss": 0.3919, + "step": 1831 + }, + { + "epoch": 0.12, + "grad_norm": 1.6599125668086123, + "learning_rate": 9.811033576942377e-06, + "loss": 0.3523, + "step": 1832 + }, + { + "epoch": 0.12, + "grad_norm": 3.176707774711695, + "learning_rate": 9.81075612550913e-06, + "loss": 0.3869, + "step": 1833 + }, + { + "epoch": 0.12, + "grad_norm": 2.0351763040173565, + "learning_rate": 9.810478474469063e-06, + "loss": 0.3822, + "step": 1834 + }, + { + "epoch": 0.12, + "grad_norm": 1.6228436799116237, + "learning_rate": 9.8102006238337e-06, + "loss": 0.3755, + "step": 1835 + }, + { + "epoch": 0.12, + "grad_norm": 1.5019206471209272, + "learning_rate": 9.80992257361457e-06, + "loss": 0.3835, + "step": 1836 + }, + { + "epoch": 0.12, + "grad_norm": 2.0701113391645256, + "learning_rate": 9.809644323823208e-06, + "loss": 0.3685, + "step": 1837 + }, + { + "epoch": 0.12, + "grad_norm": 1.8122415268301864, + "learning_rate": 9.809365874471162e-06, + "loss": 0.3677, + "step": 1838 + }, + { + "epoch": 0.12, + "grad_norm": 3.392152207416042, + "learning_rate": 9.809087225569982e-06, + "loss": 0.3762, + "step": 1839 + }, + { + "epoch": 0.12, + "grad_norm": 3.973822000004366, + "learning_rate": 9.808808377131232e-06, + "loss": 0.3985, + "step": 1840 + }, + { + "epoch": 0.12, + "grad_norm": 2.1141198723828953, + "learning_rate": 9.80852932916648e-06, + "loss": 0.3799, + "step": 1841 + }, + { + "epoch": 0.12, + "grad_norm": 1.536958406072693, + "learning_rate": 9.808250081687307e-06, + "loss": 0.3838, + "step": 1842 + }, + { + "epoch": 0.12, + "grad_norm": 1.6693594269266874, + "learning_rate": 9.807970634705297e-06, + "loss": 0.3677, + "step": 1843 + }, + { + "epoch": 0.12, + "grad_norm": 3.879146621583926, + "learning_rate": 9.807690988232046e-06, + "loss": 0.3748, + "step": 1844 + }, + { + "epoch": 0.12, + "grad_norm": 1.7360582201283752, + "learning_rate": 9.807411142279155e-06, + "loss": 0.3736, + "step": 1845 + }, + { + "epoch": 0.12, + "grad_norm": 0.7292268293298563, + "learning_rate": 9.807131096858237e-06, + "loss": 0.4865, + "step": 1846 + }, + { + "epoch": 0.12, + "grad_norm": 3.4239736621705856, + "learning_rate": 9.806850851980913e-06, + "loss": 0.4336, + "step": 1847 + }, + { + "epoch": 0.12, + "grad_norm": 1.6510268875876362, + "learning_rate": 9.806570407658807e-06, + "loss": 0.3613, + "step": 1848 + }, + { + "epoch": 0.12, + "grad_norm": 1.5781351338102192, + "learning_rate": 9.806289763903558e-06, + "loss": 0.4012, + "step": 1849 + }, + { + "epoch": 0.12, + "grad_norm": 1.6703472805886768, + "learning_rate": 9.80600892072681e-06, + "loss": 0.3647, + "step": 1850 + }, + { + "epoch": 0.12, + "grad_norm": 1.4835899672197248, + "learning_rate": 9.805727878140216e-06, + "loss": 0.3848, + "step": 1851 + }, + { + "epoch": 0.12, + "grad_norm": 1.3772710502274328, + "learning_rate": 9.805446636155435e-06, + "loss": 0.3511, + "step": 1852 + }, + { + "epoch": 0.12, + "grad_norm": 1.8725052006963179, + "learning_rate": 9.80516519478414e-06, + "loss": 0.3736, + "step": 1853 + }, + { + "epoch": 0.12, + "grad_norm": 1.783161350617083, + "learning_rate": 9.804883554038005e-06, + "loss": 0.3717, + "step": 1854 + }, + { + "epoch": 0.12, + "grad_norm": 5.443154753730953, + "learning_rate": 9.804601713928716e-06, + "loss": 0.3866, + "step": 1855 + }, + { + "epoch": 0.12, + "grad_norm": 1.4668533782426225, + "learning_rate": 9.804319674467969e-06, + "loss": 0.3919, + "step": 1856 + }, + { + "epoch": 0.12, + "grad_norm": 2.026507821088116, + "learning_rate": 9.804037435667465e-06, + "loss": 0.406, + "step": 1857 + }, + { + "epoch": 0.12, + "grad_norm": 1.9596348999579245, + "learning_rate": 9.803754997538915e-06, + "loss": 0.3937, + "step": 1858 + }, + { + "epoch": 0.12, + "grad_norm": 0.821964444594066, + "learning_rate": 9.803472360094037e-06, + "loss": 0.5198, + "step": 1859 + }, + { + "epoch": 0.12, + "grad_norm": 1.844427766970074, + "learning_rate": 9.80318952334456e-06, + "loss": 0.3886, + "step": 1860 + }, + { + "epoch": 0.12, + "grad_norm": 2.5945862940221964, + "learning_rate": 9.802906487302217e-06, + "loss": 0.4021, + "step": 1861 + }, + { + "epoch": 0.12, + "grad_norm": 0.7011221792938005, + "learning_rate": 9.802623251978754e-06, + "loss": 0.4896, + "step": 1862 + }, + { + "epoch": 0.12, + "grad_norm": 2.18323762157779, + "learning_rate": 9.802339817385921e-06, + "loss": 0.3756, + "step": 1863 + }, + { + "epoch": 0.12, + "grad_norm": 2.2646470276799664, + "learning_rate": 9.80205618353548e-06, + "loss": 0.3962, + "step": 1864 + }, + { + "epoch": 0.12, + "grad_norm": 1.8768123406953179, + "learning_rate": 9.801772350439197e-06, + "loss": 0.3497, + "step": 1865 + }, + { + "epoch": 0.12, + "grad_norm": 1.8224164058034686, + "learning_rate": 9.80148831810885e-06, + "loss": 0.3835, + "step": 1866 + }, + { + "epoch": 0.12, + "grad_norm": 1.4532793718077375, + "learning_rate": 9.801204086556226e-06, + "loss": 0.3818, + "step": 1867 + }, + { + "epoch": 0.12, + "grad_norm": 2.242903254828831, + "learning_rate": 9.800919655793118e-06, + "loss": 0.4071, + "step": 1868 + }, + { + "epoch": 0.12, + "grad_norm": 1.7055773310523081, + "learning_rate": 9.800635025831323e-06, + "loss": 0.3604, + "step": 1869 + }, + { + "epoch": 0.12, + "grad_norm": 4.368379984554867, + "learning_rate": 9.800350196682655e-06, + "loss": 0.3688, + "step": 1870 + }, + { + "epoch": 0.12, + "grad_norm": 0.8032910357834164, + "learning_rate": 9.800065168358932e-06, + "loss": 0.5017, + "step": 1871 + }, + { + "epoch": 0.12, + "grad_norm": 5.270233253613688, + "learning_rate": 9.799779940871978e-06, + "loss": 0.3551, + "step": 1872 + }, + { + "epoch": 0.12, + "grad_norm": 2.424770210701576, + "learning_rate": 9.79949451423363e-06, + "loss": 0.4159, + "step": 1873 + }, + { + "epoch": 0.12, + "grad_norm": 1.5062834576744304, + "learning_rate": 9.799208888455728e-06, + "loss": 0.3857, + "step": 1874 + }, + { + "epoch": 0.12, + "grad_norm": 1.9901631549421632, + "learning_rate": 9.798923063550126e-06, + "loss": 0.394, + "step": 1875 + }, + { + "epoch": 0.12, + "grad_norm": 1.6528928084315886, + "learning_rate": 9.798637039528682e-06, + "loss": 0.3741, + "step": 1876 + }, + { + "epoch": 0.12, + "grad_norm": 1.851195106538528, + "learning_rate": 9.798350816403264e-06, + "loss": 0.3903, + "step": 1877 + }, + { + "epoch": 0.12, + "grad_norm": 1.6243796938564128, + "learning_rate": 9.798064394185747e-06, + "loss": 0.3806, + "step": 1878 + }, + { + "epoch": 0.12, + "grad_norm": 2.0692094744188174, + "learning_rate": 9.797777772888018e-06, + "loss": 0.375, + "step": 1879 + }, + { + "epoch": 0.12, + "grad_norm": 2.9722223855442804, + "learning_rate": 9.797490952521965e-06, + "loss": 0.404, + "step": 1880 + }, + { + "epoch": 0.12, + "grad_norm": 4.323189381334899, + "learning_rate": 9.797203933099492e-06, + "loss": 0.4047, + "step": 1881 + }, + { + "epoch": 0.12, + "grad_norm": 1.8726616653905463, + "learning_rate": 9.796916714632507e-06, + "loss": 0.4161, + "step": 1882 + }, + { + "epoch": 0.12, + "grad_norm": 1.4112081624351538, + "learning_rate": 9.796629297132927e-06, + "loss": 0.3643, + "step": 1883 + }, + { + "epoch": 0.12, + "grad_norm": 2.1861272231532065, + "learning_rate": 9.79634168061268e-06, + "loss": 0.3979, + "step": 1884 + }, + { + "epoch": 0.12, + "grad_norm": 2.003887017501671, + "learning_rate": 9.796053865083694e-06, + "loss": 0.3589, + "step": 1885 + }, + { + "epoch": 0.12, + "grad_norm": 1.6608401463733822, + "learning_rate": 9.795765850557917e-06, + "loss": 0.3634, + "step": 1886 + }, + { + "epoch": 0.12, + "grad_norm": 3.719302499213828, + "learning_rate": 9.795477637047295e-06, + "loss": 0.3713, + "step": 1887 + }, + { + "epoch": 0.12, + "grad_norm": 2.5173584641802416, + "learning_rate": 9.795189224563788e-06, + "loss": 0.3754, + "step": 1888 + }, + { + "epoch": 0.12, + "grad_norm": 4.5218269577982015, + "learning_rate": 9.794900613119364e-06, + "loss": 0.3594, + "step": 1889 + }, + { + "epoch": 0.12, + "grad_norm": 2.8582866394907955, + "learning_rate": 9.794611802725997e-06, + "loss": 0.3571, + "step": 1890 + }, + { + "epoch": 0.12, + "grad_norm": 2.072569259131529, + "learning_rate": 9.794322793395669e-06, + "loss": 0.367, + "step": 1891 + }, + { + "epoch": 0.12, + "grad_norm": 0.812020311598828, + "learning_rate": 9.794033585140372e-06, + "loss": 0.524, + "step": 1892 + }, + { + "epoch": 0.12, + "grad_norm": 2.3552908356253797, + "learning_rate": 9.793744177972108e-06, + "loss": 0.3782, + "step": 1893 + }, + { + "epoch": 0.12, + "grad_norm": 13.422310012617706, + "learning_rate": 9.793454571902883e-06, + "loss": 0.3586, + "step": 1894 + }, + { + "epoch": 0.12, + "grad_norm": 3.1116590567977065, + "learning_rate": 9.793164766944714e-06, + "loss": 0.4087, + "step": 1895 + }, + { + "epoch": 0.12, + "grad_norm": 1.5817790936508511, + "learning_rate": 9.792874763109624e-06, + "loss": 0.401, + "step": 1896 + }, + { + "epoch": 0.12, + "grad_norm": 3.317936493490959, + "learning_rate": 9.792584560409649e-06, + "loss": 0.3802, + "step": 1897 + }, + { + "epoch": 0.12, + "grad_norm": 0.6673907285813097, + "learning_rate": 9.792294158856826e-06, + "loss": 0.5049, + "step": 1898 + }, + { + "epoch": 0.12, + "grad_norm": 1.5708418207204, + "learning_rate": 9.792003558463207e-06, + "loss": 0.3849, + "step": 1899 + }, + { + "epoch": 0.12, + "grad_norm": 1.8808633162336108, + "learning_rate": 9.79171275924085e-06, + "loss": 0.3635, + "step": 1900 + }, + { + "epoch": 0.12, + "grad_norm": 2.1897425596392917, + "learning_rate": 9.791421761201816e-06, + "loss": 0.3867, + "step": 1901 + }, + { + "epoch": 0.12, + "grad_norm": 1.4173800838434034, + "learning_rate": 9.791130564358187e-06, + "loss": 0.3594, + "step": 1902 + }, + { + "epoch": 0.12, + "grad_norm": 1.4725636900039665, + "learning_rate": 9.790839168722037e-06, + "loss": 0.3565, + "step": 1903 + }, + { + "epoch": 0.12, + "grad_norm": 2.1064058612467136, + "learning_rate": 9.790547574305463e-06, + "loss": 0.3769, + "step": 1904 + }, + { + "epoch": 0.12, + "grad_norm": 1.8853211503689868, + "learning_rate": 9.79025578112056e-06, + "loss": 0.3813, + "step": 1905 + }, + { + "epoch": 0.12, + "grad_norm": 2.227915359719842, + "learning_rate": 9.789963789179438e-06, + "loss": 0.3948, + "step": 1906 + }, + { + "epoch": 0.12, + "grad_norm": 4.595272909569428, + "learning_rate": 9.789671598494208e-06, + "loss": 0.372, + "step": 1907 + }, + { + "epoch": 0.12, + "grad_norm": 2.7529055558688227, + "learning_rate": 9.789379209076997e-06, + "loss": 0.4011, + "step": 1908 + }, + { + "epoch": 0.12, + "grad_norm": 3.2201098265594053, + "learning_rate": 9.789086620939936e-06, + "loss": 0.4142, + "step": 1909 + }, + { + "epoch": 0.12, + "grad_norm": 1.6230325565626995, + "learning_rate": 9.788793834095165e-06, + "loss": 0.3868, + "step": 1910 + }, + { + "epoch": 0.12, + "grad_norm": 1.8220788291104209, + "learning_rate": 9.788500848554831e-06, + "loss": 0.4005, + "step": 1911 + }, + { + "epoch": 0.12, + "grad_norm": 1.86975313357887, + "learning_rate": 9.788207664331093e-06, + "loss": 0.3862, + "step": 1912 + }, + { + "epoch": 0.12, + "grad_norm": 1.5087622500622635, + "learning_rate": 9.787914281436112e-06, + "loss": 0.4086, + "step": 1913 + }, + { + "epoch": 0.12, + "grad_norm": 2.7034975329118294, + "learning_rate": 9.787620699882064e-06, + "loss": 0.3916, + "step": 1914 + }, + { + "epoch": 0.12, + "grad_norm": 2.8567750650849524, + "learning_rate": 9.78732691968113e-06, + "loss": 0.3843, + "step": 1915 + }, + { + "epoch": 0.12, + "grad_norm": 1.9640602234460758, + "learning_rate": 9.787032940845499e-06, + "loss": 0.3707, + "step": 1916 + }, + { + "epoch": 0.12, + "grad_norm": 1.655332001580452, + "learning_rate": 9.786738763387368e-06, + "loss": 0.3766, + "step": 1917 + }, + { + "epoch": 0.12, + "grad_norm": 2.033882687241404, + "learning_rate": 9.786444387318943e-06, + "loss": 0.3894, + "step": 1918 + }, + { + "epoch": 0.12, + "grad_norm": 2.074227083942343, + "learning_rate": 9.78614981265244e-06, + "loss": 0.3678, + "step": 1919 + }, + { + "epoch": 0.12, + "grad_norm": 0.8630051286114978, + "learning_rate": 9.785855039400079e-06, + "loss": 0.5252, + "step": 1920 + }, + { + "epoch": 0.12, + "grad_norm": 2.519566927333756, + "learning_rate": 9.785560067574092e-06, + "loss": 0.3803, + "step": 1921 + }, + { + "epoch": 0.12, + "grad_norm": 1.6710749180215918, + "learning_rate": 9.785264897186718e-06, + "loss": 0.38, + "step": 1922 + }, + { + "epoch": 0.12, + "grad_norm": 2.0016881036651193, + "learning_rate": 9.784969528250204e-06, + "loss": 0.3609, + "step": 1923 + }, + { + "epoch": 0.12, + "grad_norm": 1.9523850550089148, + "learning_rate": 9.784673960776805e-06, + "loss": 0.39, + "step": 1924 + }, + { + "epoch": 0.12, + "grad_norm": 1.696981077145076, + "learning_rate": 9.784378194778786e-06, + "loss": 0.3593, + "step": 1925 + }, + { + "epoch": 0.12, + "grad_norm": 2.303193951213647, + "learning_rate": 9.784082230268415e-06, + "loss": 0.3688, + "step": 1926 + }, + { + "epoch": 0.12, + "grad_norm": 1.83381667490055, + "learning_rate": 9.783786067257976e-06, + "loss": 0.3837, + "step": 1927 + }, + { + "epoch": 0.12, + "grad_norm": 1.7024276635852529, + "learning_rate": 9.783489705759756e-06, + "loss": 0.3759, + "step": 1928 + }, + { + "epoch": 0.12, + "grad_norm": 3.0647040389760325, + "learning_rate": 9.783193145786053e-06, + "loss": 0.3857, + "step": 1929 + }, + { + "epoch": 0.12, + "grad_norm": 2.017366550215066, + "learning_rate": 9.78289638734917e-06, + "loss": 0.3885, + "step": 1930 + }, + { + "epoch": 0.12, + "grad_norm": 2.2335354084717443, + "learning_rate": 9.78259943046142e-06, + "loss": 0.358, + "step": 1931 + }, + { + "epoch": 0.12, + "grad_norm": 1.7159171008819794, + "learning_rate": 9.782302275135124e-06, + "loss": 0.4002, + "step": 1932 + }, + { + "epoch": 0.12, + "grad_norm": 2.280026934917581, + "learning_rate": 9.782004921382612e-06, + "loss": 0.3735, + "step": 1933 + }, + { + "epoch": 0.12, + "grad_norm": 2.1313823366662983, + "learning_rate": 9.781707369216224e-06, + "loss": 0.3797, + "step": 1934 + }, + { + "epoch": 0.12, + "grad_norm": 21.506648057275566, + "learning_rate": 9.781409618648303e-06, + "loss": 0.3678, + "step": 1935 + }, + { + "epoch": 0.12, + "grad_norm": 1.6622627212392396, + "learning_rate": 9.781111669691203e-06, + "loss": 0.4027, + "step": 1936 + }, + { + "epoch": 0.12, + "grad_norm": 2.1157157261458726, + "learning_rate": 9.78081352235729e-06, + "loss": 0.386, + "step": 1937 + }, + { + "epoch": 0.12, + "grad_norm": 1.771600485678825, + "learning_rate": 9.780515176658931e-06, + "loss": 0.3738, + "step": 1938 + }, + { + "epoch": 0.12, + "grad_norm": 2.0231255387124607, + "learning_rate": 9.780216632608505e-06, + "loss": 0.3615, + "step": 1939 + }, + { + "epoch": 0.12, + "grad_norm": 1.5574669321774992, + "learning_rate": 9.779917890218403e-06, + "loss": 0.3783, + "step": 1940 + }, + { + "epoch": 0.12, + "grad_norm": 5.11765915699987, + "learning_rate": 9.779618949501016e-06, + "loss": 0.3744, + "step": 1941 + }, + { + "epoch": 0.12, + "grad_norm": 1.200780852177338, + "learning_rate": 9.77931981046875e-06, + "loss": 0.3644, + "step": 1942 + }, + { + "epoch": 0.12, + "grad_norm": 2.2477496902459144, + "learning_rate": 9.779020473134016e-06, + "loss": 0.379, + "step": 1943 + }, + { + "epoch": 0.12, + "grad_norm": 2.546558164427262, + "learning_rate": 9.778720937509232e-06, + "loss": 0.3738, + "step": 1944 + }, + { + "epoch": 0.12, + "grad_norm": 1.606784032500254, + "learning_rate": 9.77842120360683e-06, + "loss": 0.3791, + "step": 1945 + }, + { + "epoch": 0.12, + "grad_norm": 1.544110328603507, + "learning_rate": 9.778121271439244e-06, + "loss": 0.4039, + "step": 1946 + }, + { + "epoch": 0.12, + "grad_norm": 1.8406293720086553, + "learning_rate": 9.777821141018921e-06, + "loss": 0.3507, + "step": 1947 + }, + { + "epoch": 0.12, + "grad_norm": 2.305839609769871, + "learning_rate": 9.777520812358312e-06, + "loss": 0.3889, + "step": 1948 + }, + { + "epoch": 0.12, + "grad_norm": 2.029526921175561, + "learning_rate": 9.77722028546988e-06, + "loss": 0.3825, + "step": 1949 + }, + { + "epoch": 0.12, + "grad_norm": 1.9220591347031635, + "learning_rate": 9.776919560366091e-06, + "loss": 0.3596, + "step": 1950 + }, + { + "epoch": 0.12, + "grad_norm": 1.4369895353485262, + "learning_rate": 9.776618637059426e-06, + "loss": 0.3793, + "step": 1951 + }, + { + "epoch": 0.12, + "grad_norm": 2.1288381073443885, + "learning_rate": 9.776317515562368e-06, + "loss": 0.3773, + "step": 1952 + }, + { + "epoch": 0.12, + "grad_norm": 0.907481904334835, + "learning_rate": 9.776016195887412e-06, + "loss": 0.51, + "step": 1953 + }, + { + "epoch": 0.12, + "grad_norm": 1.881392823035069, + "learning_rate": 9.775714678047062e-06, + "loss": 0.3707, + "step": 1954 + }, + { + "epoch": 0.12, + "grad_norm": 1.9875774410229394, + "learning_rate": 9.775412962053827e-06, + "loss": 0.3848, + "step": 1955 + }, + { + "epoch": 0.12, + "grad_norm": 1.6703353259398892, + "learning_rate": 9.775111047920227e-06, + "loss": 0.3707, + "step": 1956 + }, + { + "epoch": 0.12, + "grad_norm": 2.3450663523316644, + "learning_rate": 9.774808935658789e-06, + "loss": 0.3824, + "step": 1957 + }, + { + "epoch": 0.12, + "grad_norm": 3.2576297293049863, + "learning_rate": 9.774506625282045e-06, + "loss": 0.3924, + "step": 1958 + }, + { + "epoch": 0.12, + "grad_norm": 3.1682736851014184, + "learning_rate": 9.774204116802541e-06, + "loss": 0.3861, + "step": 1959 + }, + { + "epoch": 0.12, + "grad_norm": 2.176652831134348, + "learning_rate": 9.77390141023283e-06, + "loss": 0.3936, + "step": 1960 + }, + { + "epoch": 0.12, + "grad_norm": 2.462140381311946, + "learning_rate": 9.773598505585469e-06, + "loss": 0.3875, + "step": 1961 + }, + { + "epoch": 0.12, + "grad_norm": 2.948576032937403, + "learning_rate": 9.773295402873027e-06, + "loss": 0.3846, + "step": 1962 + }, + { + "epoch": 0.12, + "grad_norm": 2.40336693857439, + "learning_rate": 9.772992102108081e-06, + "loss": 0.372, + "step": 1963 + }, + { + "epoch": 0.12, + "grad_norm": 1.4358865188548455, + "learning_rate": 9.772688603303212e-06, + "loss": 0.3784, + "step": 1964 + }, + { + "epoch": 0.12, + "grad_norm": 1.1518668362808788, + "learning_rate": 9.772384906471019e-06, + "loss": 0.3786, + "step": 1965 + }, + { + "epoch": 0.12, + "grad_norm": 1.74123761591902, + "learning_rate": 9.7720810116241e-06, + "loss": 0.3787, + "step": 1966 + }, + { + "epoch": 0.12, + "grad_norm": 2.0715609386547476, + "learning_rate": 9.771776918775062e-06, + "loss": 0.3665, + "step": 1967 + }, + { + "epoch": 0.12, + "grad_norm": 3.0775998085029475, + "learning_rate": 9.771472627936523e-06, + "loss": 0.3756, + "step": 1968 + }, + { + "epoch": 0.12, + "grad_norm": 1.7415421794158161, + "learning_rate": 9.771168139121112e-06, + "loss": 0.3795, + "step": 1969 + }, + { + "epoch": 0.12, + "grad_norm": 1.561194303588185, + "learning_rate": 9.77086345234146e-06, + "loss": 0.3796, + "step": 1970 + }, + { + "epoch": 0.12, + "grad_norm": 1.8302956151937808, + "learning_rate": 9.770558567610206e-06, + "loss": 0.3749, + "step": 1971 + }, + { + "epoch": 0.12, + "grad_norm": 5.810312018015889, + "learning_rate": 9.770253484940006e-06, + "loss": 0.3764, + "step": 1972 + }, + { + "epoch": 0.12, + "grad_norm": 1.5425732988747862, + "learning_rate": 9.769948204343516e-06, + "loss": 0.3728, + "step": 1973 + }, + { + "epoch": 0.12, + "grad_norm": 3.647153725909506, + "learning_rate": 9.769642725833404e-06, + "loss": 0.383, + "step": 1974 + }, + { + "epoch": 0.12, + "grad_norm": 2.149574646060756, + "learning_rate": 9.769337049422342e-06, + "loss": 0.3739, + "step": 1975 + }, + { + "epoch": 0.12, + "grad_norm": 2.0619356098848436, + "learning_rate": 9.769031175123014e-06, + "loss": 0.3921, + "step": 1976 + }, + { + "epoch": 0.12, + "grad_norm": 4.1116561985412945, + "learning_rate": 9.768725102948114e-06, + "loss": 0.3822, + "step": 1977 + }, + { + "epoch": 0.12, + "grad_norm": 1.8387468500880593, + "learning_rate": 9.768418832910335e-06, + "loss": 0.3498, + "step": 1978 + }, + { + "epoch": 0.12, + "grad_norm": 1.466877314186575, + "learning_rate": 9.768112365022393e-06, + "loss": 0.3734, + "step": 1979 + }, + { + "epoch": 0.12, + "grad_norm": 1.6249384200281445, + "learning_rate": 9.767805699297e-06, + "loss": 0.3763, + "step": 1980 + }, + { + "epoch": 0.12, + "grad_norm": 2.317160429032891, + "learning_rate": 9.767498835746877e-06, + "loss": 0.3709, + "step": 1981 + }, + { + "epoch": 0.12, + "grad_norm": 1.546149542105723, + "learning_rate": 9.767191774384762e-06, + "loss": 0.3775, + "step": 1982 + }, + { + "epoch": 0.12, + "grad_norm": 2.392984475719745, + "learning_rate": 9.76688451522339e-06, + "loss": 0.3851, + "step": 1983 + }, + { + "epoch": 0.12, + "grad_norm": 2.3679476559327224, + "learning_rate": 9.766577058275515e-06, + "loss": 0.3711, + "step": 1984 + }, + { + "epoch": 0.12, + "grad_norm": 1.4697593364660515, + "learning_rate": 9.76626940355389e-06, + "loss": 0.3875, + "step": 1985 + }, + { + "epoch": 0.12, + "grad_norm": 1.6129796454028429, + "learning_rate": 9.765961551071281e-06, + "loss": 0.3509, + "step": 1986 + }, + { + "epoch": 0.12, + "grad_norm": 1.4820353944038447, + "learning_rate": 9.765653500840465e-06, + "loss": 0.3905, + "step": 1987 + }, + { + "epoch": 0.13, + "grad_norm": 2.7673014753661827, + "learning_rate": 9.765345252874218e-06, + "loss": 0.3686, + "step": 1988 + }, + { + "epoch": 0.13, + "grad_norm": 6.89545528951867, + "learning_rate": 9.765036807185333e-06, + "loss": 0.3747, + "step": 1989 + }, + { + "epoch": 0.13, + "grad_norm": 1.8510555016509191, + "learning_rate": 9.764728163786607e-06, + "loss": 0.3469, + "step": 1990 + }, + { + "epoch": 0.13, + "grad_norm": 2.5776007745789054, + "learning_rate": 9.764419322690846e-06, + "loss": 0.3916, + "step": 1991 + }, + { + "epoch": 0.13, + "grad_norm": 3.6629761168978403, + "learning_rate": 9.764110283910864e-06, + "loss": 0.3613, + "step": 1992 + }, + { + "epoch": 0.13, + "grad_norm": 1.8771437108275397, + "learning_rate": 9.763801047459487e-06, + "loss": 0.3824, + "step": 1993 + }, + { + "epoch": 0.13, + "grad_norm": 1.8611967018793132, + "learning_rate": 9.76349161334954e-06, + "loss": 0.383, + "step": 1994 + }, + { + "epoch": 0.13, + "grad_norm": 1.5387062747787261, + "learning_rate": 9.763181981593868e-06, + "loss": 0.3686, + "step": 1995 + }, + { + "epoch": 0.13, + "grad_norm": 1.5395254340364815, + "learning_rate": 9.762872152205313e-06, + "loss": 0.3639, + "step": 1996 + }, + { + "epoch": 0.13, + "grad_norm": 6.288502229213549, + "learning_rate": 9.762562125196734e-06, + "loss": 0.3671, + "step": 1997 + }, + { + "epoch": 0.13, + "grad_norm": 2.357514527059941, + "learning_rate": 9.762251900580992e-06, + "loss": 0.3594, + "step": 1998 + }, + { + "epoch": 0.13, + "grad_norm": 1.7716245546139546, + "learning_rate": 9.76194147837096e-06, + "loss": 0.3733, + "step": 1999 + }, + { + "epoch": 0.13, + "grad_norm": 1.893383549361695, + "learning_rate": 9.76163085857952e-06, + "loss": 0.3726, + "step": 2000 + }, + { + "epoch": 0.13, + "grad_norm": 0.8859889597204025, + "learning_rate": 9.761320041219555e-06, + "loss": 0.4947, + "step": 2001 + }, + { + "epoch": 0.13, + "grad_norm": 2.221491499199223, + "learning_rate": 9.761009026303968e-06, + "loss": 0.3807, + "step": 2002 + }, + { + "epoch": 0.13, + "grad_norm": 0.693554606557953, + "learning_rate": 9.760697813845656e-06, + "loss": 0.5143, + "step": 2003 + }, + { + "epoch": 0.13, + "grad_norm": 1.7756681160884638, + "learning_rate": 9.760386403857538e-06, + "loss": 0.3808, + "step": 2004 + }, + { + "epoch": 0.13, + "grad_norm": 2.765243284947136, + "learning_rate": 9.760074796352532e-06, + "loss": 0.3849, + "step": 2005 + }, + { + "epoch": 0.13, + "grad_norm": 1.9655707569516148, + "learning_rate": 9.759762991343569e-06, + "loss": 0.386, + "step": 2006 + }, + { + "epoch": 0.13, + "grad_norm": 1.769282624130448, + "learning_rate": 9.759450988843585e-06, + "loss": 0.3801, + "step": 2007 + }, + { + "epoch": 0.13, + "grad_norm": 1.712213566302072, + "learning_rate": 9.759138788865524e-06, + "loss": 0.3818, + "step": 2008 + }, + { + "epoch": 0.13, + "grad_norm": 1.82289568923332, + "learning_rate": 9.758826391422343e-06, + "loss": 0.3733, + "step": 2009 + }, + { + "epoch": 0.13, + "grad_norm": 2.032705911899892, + "learning_rate": 9.758513796527002e-06, + "loss": 0.3661, + "step": 2010 + }, + { + "epoch": 0.13, + "grad_norm": 1.6822890353301314, + "learning_rate": 9.758201004192471e-06, + "loss": 0.3604, + "step": 2011 + }, + { + "epoch": 0.13, + "grad_norm": 2.190475743965018, + "learning_rate": 9.75788801443173e-06, + "loss": 0.3906, + "step": 2012 + }, + { + "epoch": 0.13, + "grad_norm": 10.261698410710128, + "learning_rate": 9.757574827257764e-06, + "loss": 0.3629, + "step": 2013 + }, + { + "epoch": 0.13, + "grad_norm": 1.0359058427099048, + "learning_rate": 9.757261442683568e-06, + "loss": 0.5211, + "step": 2014 + }, + { + "epoch": 0.13, + "grad_norm": 1.6760454853543507, + "learning_rate": 9.756947860722143e-06, + "loss": 0.3784, + "step": 2015 + }, + { + "epoch": 0.13, + "grad_norm": 2.8343607510351445, + "learning_rate": 9.756634081386504e-06, + "loss": 0.3767, + "step": 2016 + }, + { + "epoch": 0.13, + "grad_norm": 5.226232322669994, + "learning_rate": 9.756320104689667e-06, + "loss": 0.3815, + "step": 2017 + }, + { + "epoch": 0.13, + "grad_norm": 2.906365266928955, + "learning_rate": 9.756005930644662e-06, + "loss": 0.3899, + "step": 2018 + }, + { + "epoch": 0.13, + "grad_norm": 6.127075075399167, + "learning_rate": 9.755691559264522e-06, + "loss": 0.4045, + "step": 2019 + }, + { + "epoch": 0.13, + "grad_norm": 3.6405042644879044, + "learning_rate": 9.755376990562295e-06, + "loss": 0.3806, + "step": 2020 + }, + { + "epoch": 0.13, + "grad_norm": 4.314911126990063, + "learning_rate": 9.755062224551026e-06, + "loss": 0.4185, + "step": 2021 + }, + { + "epoch": 0.13, + "grad_norm": 1.9300617164925287, + "learning_rate": 9.754747261243782e-06, + "loss": 0.389, + "step": 2022 + }, + { + "epoch": 0.13, + "grad_norm": 2.4834698528387698, + "learning_rate": 9.754432100653628e-06, + "loss": 0.3853, + "step": 2023 + }, + { + "epoch": 0.13, + "grad_norm": 3.236187734459461, + "learning_rate": 9.754116742793643e-06, + "loss": 0.3711, + "step": 2024 + }, + { + "epoch": 0.13, + "grad_norm": 3.210971605398184, + "learning_rate": 9.753801187676908e-06, + "loss": 0.3749, + "step": 2025 + }, + { + "epoch": 0.13, + "grad_norm": 2.9677015445758617, + "learning_rate": 9.753485435316518e-06, + "loss": 0.349, + "step": 2026 + }, + { + "epoch": 0.13, + "grad_norm": 1.8476003512204442, + "learning_rate": 9.753169485725575e-06, + "loss": 0.3663, + "step": 2027 + }, + { + "epoch": 0.13, + "grad_norm": 8.368513933261431, + "learning_rate": 9.752853338917187e-06, + "loss": 0.3689, + "step": 2028 + }, + { + "epoch": 0.13, + "grad_norm": 2.0499042881887637, + "learning_rate": 9.752536994904473e-06, + "loss": 0.3622, + "step": 2029 + }, + { + "epoch": 0.13, + "grad_norm": 2.0956243531214254, + "learning_rate": 9.752220453700556e-06, + "loss": 0.351, + "step": 2030 + }, + { + "epoch": 0.13, + "grad_norm": 3.314003420331242, + "learning_rate": 9.751903715318572e-06, + "loss": 0.3467, + "step": 2031 + }, + { + "epoch": 0.13, + "grad_norm": 3.785946372863839, + "learning_rate": 9.751586779771663e-06, + "loss": 0.3788, + "step": 2032 + }, + { + "epoch": 0.13, + "grad_norm": 2.4905383618900525, + "learning_rate": 9.751269647072978e-06, + "loss": 0.3885, + "step": 2033 + }, + { + "epoch": 0.13, + "grad_norm": 3.6776188208779383, + "learning_rate": 9.750952317235678e-06, + "loss": 0.389, + "step": 2034 + }, + { + "epoch": 0.13, + "grad_norm": 2.164191962494118, + "learning_rate": 9.750634790272926e-06, + "loss": 0.3705, + "step": 2035 + }, + { + "epoch": 0.13, + "grad_norm": 4.562466612448937, + "learning_rate": 9.750317066197899e-06, + "loss": 0.3635, + "step": 2036 + }, + { + "epoch": 0.13, + "grad_norm": 0.6978323981070979, + "learning_rate": 9.74999914502378e-06, + "loss": 0.4722, + "step": 2037 + }, + { + "epoch": 0.13, + "grad_norm": 1.7676229652145903, + "learning_rate": 9.749681026763758e-06, + "loss": 0.3735, + "step": 2038 + }, + { + "epoch": 0.13, + "grad_norm": 2.7321059484900543, + "learning_rate": 9.749362711431034e-06, + "loss": 0.3826, + "step": 2039 + }, + { + "epoch": 0.13, + "grad_norm": 2.5780720677869677, + "learning_rate": 9.749044199038817e-06, + "loss": 0.3683, + "step": 2040 + }, + { + "epoch": 0.13, + "grad_norm": 1.7770761131145016, + "learning_rate": 9.74872548960032e-06, + "loss": 0.3505, + "step": 2041 + }, + { + "epoch": 0.13, + "grad_norm": 1.9147841645237889, + "learning_rate": 9.748406583128766e-06, + "loss": 0.3468, + "step": 2042 + }, + { + "epoch": 0.13, + "grad_norm": 3.0539811371487953, + "learning_rate": 9.748087479637392e-06, + "loss": 0.3869, + "step": 2043 + }, + { + "epoch": 0.13, + "grad_norm": 2.1851613542010764, + "learning_rate": 9.747768179139433e-06, + "loss": 0.3993, + "step": 2044 + }, + { + "epoch": 0.13, + "grad_norm": 2.5271231616726504, + "learning_rate": 9.74744868164814e-06, + "loss": 0.3727, + "step": 2045 + }, + { + "epoch": 0.13, + "grad_norm": 1.7743759366833747, + "learning_rate": 9.747128987176768e-06, + "loss": 0.3638, + "step": 2046 + }, + { + "epoch": 0.13, + "grad_norm": 1.9534302961238634, + "learning_rate": 9.746809095738581e-06, + "loss": 0.3793, + "step": 2047 + }, + { + "epoch": 0.13, + "grad_norm": 2.2704836228978835, + "learning_rate": 9.746489007346856e-06, + "loss": 0.3682, + "step": 2048 + }, + { + "epoch": 0.13, + "grad_norm": 2.0785440079876905, + "learning_rate": 9.746168722014871e-06, + "loss": 0.3638, + "step": 2049 + }, + { + "epoch": 0.13, + "grad_norm": 8.380900743695415, + "learning_rate": 9.745848239755915e-06, + "loss": 0.3559, + "step": 2050 + }, + { + "epoch": 0.13, + "grad_norm": 2.1384483816761786, + "learning_rate": 9.745527560583285e-06, + "loss": 0.3768, + "step": 2051 + }, + { + "epoch": 0.13, + "grad_norm": 2.75043907337137, + "learning_rate": 9.745206684510289e-06, + "loss": 0.3926, + "step": 2052 + }, + { + "epoch": 0.13, + "grad_norm": 10.179888990727422, + "learning_rate": 9.744885611550239e-06, + "loss": 0.3968, + "step": 2053 + }, + { + "epoch": 0.13, + "grad_norm": 2.9498528642622612, + "learning_rate": 9.744564341716455e-06, + "loss": 0.3305, + "step": 2054 + }, + { + "epoch": 0.13, + "grad_norm": 2.414228701125571, + "learning_rate": 9.744242875022272e-06, + "loss": 0.3717, + "step": 2055 + }, + { + "epoch": 0.13, + "grad_norm": 3.4024879159291146, + "learning_rate": 9.743921211481024e-06, + "loss": 0.3693, + "step": 2056 + }, + { + "epoch": 0.13, + "grad_norm": 8.637135472120715, + "learning_rate": 9.74359935110606e-06, + "loss": 0.3602, + "step": 2057 + }, + { + "epoch": 0.13, + "grad_norm": 2.3519461879250865, + "learning_rate": 9.743277293910733e-06, + "loss": 0.3867, + "step": 2058 + }, + { + "epoch": 0.13, + "grad_norm": 2.7779436887641977, + "learning_rate": 9.742955039908406e-06, + "loss": 0.3596, + "step": 2059 + }, + { + "epoch": 0.13, + "grad_norm": 2.943526674621215, + "learning_rate": 9.742632589112448e-06, + "loss": 0.3486, + "step": 2060 + }, + { + "epoch": 0.13, + "grad_norm": 5.677983464515481, + "learning_rate": 9.742309941536243e-06, + "loss": 0.368, + "step": 2061 + }, + { + "epoch": 0.13, + "grad_norm": 3.1149395646695925, + "learning_rate": 9.741987097193174e-06, + "loss": 0.3807, + "step": 2062 + }, + { + "epoch": 0.13, + "grad_norm": 2.9735113730032214, + "learning_rate": 9.741664056096637e-06, + "loss": 0.3833, + "step": 2063 + }, + { + "epoch": 0.13, + "grad_norm": 1.7984818734853594, + "learning_rate": 9.741340818260036e-06, + "loss": 0.3558, + "step": 2064 + }, + { + "epoch": 0.13, + "grad_norm": 2.0996088945515368, + "learning_rate": 9.741017383696784e-06, + "loss": 0.3514, + "step": 2065 + }, + { + "epoch": 0.13, + "grad_norm": 5.512897907985271, + "learning_rate": 9.7406937524203e-06, + "loss": 0.3621, + "step": 2066 + }, + { + "epoch": 0.13, + "grad_norm": 1.969422018816204, + "learning_rate": 9.740369924444011e-06, + "loss": 0.3863, + "step": 2067 + }, + { + "epoch": 0.13, + "grad_norm": 2.242100778849942, + "learning_rate": 9.740045899781353e-06, + "loss": 0.3789, + "step": 2068 + }, + { + "epoch": 0.13, + "grad_norm": 3.046530406817976, + "learning_rate": 9.739721678445772e-06, + "loss": 0.3726, + "step": 2069 + }, + { + "epoch": 0.13, + "grad_norm": 2.2390276927521047, + "learning_rate": 9.73939726045072e-06, + "loss": 0.3672, + "step": 2070 + }, + { + "epoch": 0.13, + "grad_norm": 2.730884607148511, + "learning_rate": 9.739072645809659e-06, + "loss": 0.3649, + "step": 2071 + }, + { + "epoch": 0.13, + "grad_norm": 7.137534335295253, + "learning_rate": 9.738747834536056e-06, + "loss": 0.3827, + "step": 2072 + }, + { + "epoch": 0.13, + "grad_norm": 2.1657666343437287, + "learning_rate": 9.738422826643385e-06, + "loss": 0.3657, + "step": 2073 + }, + { + "epoch": 0.13, + "grad_norm": 2.3417265608812627, + "learning_rate": 9.738097622145138e-06, + "loss": 0.3587, + "step": 2074 + }, + { + "epoch": 0.13, + "grad_norm": 2.3546133096058433, + "learning_rate": 9.737772221054805e-06, + "loss": 0.3822, + "step": 2075 + }, + { + "epoch": 0.13, + "grad_norm": 2.162846204439659, + "learning_rate": 9.737446623385885e-06, + "loss": 0.3858, + "step": 2076 + }, + { + "epoch": 0.13, + "grad_norm": 1.91068662472612, + "learning_rate": 9.737120829151892e-06, + "loss": 0.369, + "step": 2077 + }, + { + "epoch": 0.13, + "grad_norm": 1.8461118959609997, + "learning_rate": 9.736794838366342e-06, + "loss": 0.3679, + "step": 2078 + }, + { + "epoch": 0.13, + "grad_norm": 2.5800370867292317, + "learning_rate": 9.736468651042759e-06, + "loss": 0.3669, + "step": 2079 + }, + { + "epoch": 0.13, + "grad_norm": 2.39789416614131, + "learning_rate": 9.73614226719468e-06, + "loss": 0.362, + "step": 2080 + }, + { + "epoch": 0.13, + "grad_norm": 3.6228173046928687, + "learning_rate": 9.735815686835644e-06, + "loss": 0.3846, + "step": 2081 + }, + { + "epoch": 0.13, + "grad_norm": 1.861306080305837, + "learning_rate": 9.735488909979205e-06, + "loss": 0.3842, + "step": 2082 + }, + { + "epoch": 0.13, + "grad_norm": 2.112625158009115, + "learning_rate": 9.735161936638919e-06, + "loss": 0.3508, + "step": 2083 + }, + { + "epoch": 0.13, + "grad_norm": 1.706042145528432, + "learning_rate": 9.734834766828355e-06, + "loss": 0.3668, + "step": 2084 + }, + { + "epoch": 0.13, + "grad_norm": 2.1558621862255363, + "learning_rate": 9.734507400561087e-06, + "loss": 0.3602, + "step": 2085 + }, + { + "epoch": 0.13, + "grad_norm": 4.1062051333568, + "learning_rate": 9.734179837850695e-06, + "loss": 0.3581, + "step": 2086 + }, + { + "epoch": 0.13, + "grad_norm": 1.6518722280133271, + "learning_rate": 9.733852078710774e-06, + "loss": 0.3534, + "step": 2087 + }, + { + "epoch": 0.13, + "grad_norm": 3.3927538505710038, + "learning_rate": 9.733524123154923e-06, + "loss": 0.3999, + "step": 2088 + }, + { + "epoch": 0.13, + "grad_norm": 4.83621811927603, + "learning_rate": 9.733195971196747e-06, + "loss": 0.3594, + "step": 2089 + }, + { + "epoch": 0.13, + "grad_norm": 2.0494235345449296, + "learning_rate": 9.732867622849863e-06, + "loss": 0.3865, + "step": 2090 + }, + { + "epoch": 0.13, + "grad_norm": 2.8664351300158333, + "learning_rate": 9.732539078127895e-06, + "loss": 0.3907, + "step": 2091 + }, + { + "epoch": 0.13, + "grad_norm": 8.274292271643615, + "learning_rate": 9.732210337044475e-06, + "loss": 0.3833, + "step": 2092 + }, + { + "epoch": 0.13, + "grad_norm": 3.3152591139212695, + "learning_rate": 9.731881399613243e-06, + "loss": 0.3815, + "step": 2093 + }, + { + "epoch": 0.13, + "grad_norm": 2.6219381423340673, + "learning_rate": 9.731552265847847e-06, + "loss": 0.3649, + "step": 2094 + }, + { + "epoch": 0.13, + "grad_norm": 1.9066997475563694, + "learning_rate": 9.731222935761943e-06, + "loss": 0.3697, + "step": 2095 + }, + { + "epoch": 0.13, + "grad_norm": 1.9432450165133641, + "learning_rate": 9.730893409369197e-06, + "loss": 0.3576, + "step": 2096 + }, + { + "epoch": 0.13, + "grad_norm": 2.014531214349162, + "learning_rate": 9.730563686683278e-06, + "loss": 0.3605, + "step": 2097 + }, + { + "epoch": 0.13, + "grad_norm": 2.4821851517309583, + "learning_rate": 9.730233767717872e-06, + "loss": 0.3792, + "step": 2098 + }, + { + "epoch": 0.13, + "grad_norm": 2.5019454300568817, + "learning_rate": 9.729903652486664e-06, + "loss": 0.352, + "step": 2099 + }, + { + "epoch": 0.13, + "grad_norm": 2.2452438071156124, + "learning_rate": 9.72957334100335e-06, + "loss": 0.3746, + "step": 2100 + }, + { + "epoch": 0.13, + "grad_norm": 2.347600971039136, + "learning_rate": 9.72924283328164e-06, + "loss": 0.3582, + "step": 2101 + }, + { + "epoch": 0.13, + "grad_norm": 3.5775369431503403, + "learning_rate": 9.728912129335243e-06, + "loss": 0.3446, + "step": 2102 + }, + { + "epoch": 0.13, + "grad_norm": 0.758077460841644, + "learning_rate": 9.728581229177884e-06, + "loss": 0.5081, + "step": 2103 + }, + { + "epoch": 0.13, + "grad_norm": 1.7767864893793226, + "learning_rate": 9.72825013282329e-06, + "loss": 0.3555, + "step": 2104 + }, + { + "epoch": 0.13, + "grad_norm": 3.174707671827372, + "learning_rate": 9.7279188402852e-06, + "loss": 0.349, + "step": 2105 + }, + { + "epoch": 0.13, + "grad_norm": 3.286915943993544, + "learning_rate": 9.72758735157736e-06, + "loss": 0.3704, + "step": 2106 + }, + { + "epoch": 0.13, + "grad_norm": 3.095936787943565, + "learning_rate": 9.727255666713524e-06, + "loss": 0.3737, + "step": 2107 + }, + { + "epoch": 0.13, + "grad_norm": 2.4735221401501195, + "learning_rate": 9.726923785707451e-06, + "loss": 0.366, + "step": 2108 + }, + { + "epoch": 0.13, + "grad_norm": 1.9593873396119232, + "learning_rate": 9.726591708572916e-06, + "loss": 0.3597, + "step": 2109 + }, + { + "epoch": 0.13, + "grad_norm": 2.164677544404085, + "learning_rate": 9.726259435323698e-06, + "loss": 0.3707, + "step": 2110 + }, + { + "epoch": 0.13, + "grad_norm": 3.9366566055794823, + "learning_rate": 9.725926965973579e-06, + "loss": 0.3761, + "step": 2111 + }, + { + "epoch": 0.13, + "grad_norm": 3.6178898052831814, + "learning_rate": 9.725594300536355e-06, + "loss": 0.3591, + "step": 2112 + }, + { + "epoch": 0.13, + "grad_norm": 1.830151832650409, + "learning_rate": 9.72526143902583e-06, + "loss": 0.3657, + "step": 2113 + }, + { + "epoch": 0.13, + "grad_norm": 3.153213668681711, + "learning_rate": 9.724928381455817e-06, + "loss": 0.3519, + "step": 2114 + }, + { + "epoch": 0.13, + "grad_norm": 2.0002205611991255, + "learning_rate": 9.724595127840131e-06, + "loss": 0.3581, + "step": 2115 + }, + { + "epoch": 0.13, + "grad_norm": 2.641998053257897, + "learning_rate": 9.724261678192602e-06, + "loss": 0.3505, + "step": 2116 + }, + { + "epoch": 0.13, + "grad_norm": 1.9981520047442376, + "learning_rate": 9.723928032527066e-06, + "loss": 0.3714, + "step": 2117 + }, + { + "epoch": 0.13, + "grad_norm": 2.5665521763553363, + "learning_rate": 9.723594190857363e-06, + "loss": 0.3701, + "step": 2118 + }, + { + "epoch": 0.13, + "grad_norm": 2.0869060682613885, + "learning_rate": 9.72326015319735e-06, + "loss": 0.353, + "step": 2119 + }, + { + "epoch": 0.13, + "grad_norm": 2.2564321768009163, + "learning_rate": 9.72292591956088e-06, + "loss": 0.3621, + "step": 2120 + }, + { + "epoch": 0.13, + "grad_norm": 2.8311405247028496, + "learning_rate": 9.722591489961829e-06, + "loss": 0.3469, + "step": 2121 + }, + { + "epoch": 0.13, + "grad_norm": 1.9637519786487785, + "learning_rate": 9.722256864414065e-06, + "loss": 0.3475, + "step": 2122 + }, + { + "epoch": 0.13, + "grad_norm": 3.885360005957735, + "learning_rate": 9.721922042931478e-06, + "loss": 0.3653, + "step": 2123 + }, + { + "epoch": 0.13, + "grad_norm": 5.228766380795061, + "learning_rate": 9.721587025527957e-06, + "loss": 0.3675, + "step": 2124 + }, + { + "epoch": 0.13, + "grad_norm": 1.8081692880168567, + "learning_rate": 9.721251812217405e-06, + "loss": 0.3461, + "step": 2125 + }, + { + "epoch": 0.13, + "grad_norm": 10.592096217280288, + "learning_rate": 9.720916403013729e-06, + "loss": 0.3507, + "step": 2126 + }, + { + "epoch": 0.13, + "grad_norm": 1.9566894768797642, + "learning_rate": 9.720580797930845e-06, + "loss": 0.3679, + "step": 2127 + }, + { + "epoch": 0.13, + "grad_norm": 5.810191299703153, + "learning_rate": 9.720244996982683e-06, + "loss": 0.3338, + "step": 2128 + }, + { + "epoch": 0.13, + "grad_norm": 2.345368455037903, + "learning_rate": 9.719909000183167e-06, + "loss": 0.3882, + "step": 2129 + }, + { + "epoch": 0.13, + "grad_norm": 1.8941777637996409, + "learning_rate": 9.719572807546246e-06, + "loss": 0.3543, + "step": 2130 + }, + { + "epoch": 0.13, + "grad_norm": 3.094741052343396, + "learning_rate": 9.719236419085866e-06, + "loss": 0.3667, + "step": 2131 + }, + { + "epoch": 0.13, + "grad_norm": 2.6390743301002137, + "learning_rate": 9.718899834815984e-06, + "loss": 0.3394, + "step": 2132 + }, + { + "epoch": 0.13, + "grad_norm": 3.913875356846735, + "learning_rate": 9.718563054750566e-06, + "loss": 0.3695, + "step": 2133 + }, + { + "epoch": 0.13, + "grad_norm": 2.482946254818429, + "learning_rate": 9.718226078903586e-06, + "loss": 0.3589, + "step": 2134 + }, + { + "epoch": 0.13, + "grad_norm": 1.7318896115244018, + "learning_rate": 9.717888907289026e-06, + "loss": 0.3977, + "step": 2135 + }, + { + "epoch": 0.13, + "grad_norm": 1.6801088251335015, + "learning_rate": 9.717551539920875e-06, + "loss": 0.3653, + "step": 2136 + }, + { + "epoch": 0.13, + "grad_norm": 6.584021393663589, + "learning_rate": 9.717213976813131e-06, + "loss": 0.386, + "step": 2137 + }, + { + "epoch": 0.13, + "grad_norm": 3.6385926170889076, + "learning_rate": 9.7168762179798e-06, + "loss": 0.3489, + "step": 2138 + }, + { + "epoch": 0.13, + "grad_norm": 1.9959221926480015, + "learning_rate": 9.716538263434899e-06, + "loss": 0.3412, + "step": 2139 + }, + { + "epoch": 0.13, + "grad_norm": 2.781919752728311, + "learning_rate": 9.716200113192445e-06, + "loss": 0.3427, + "step": 2140 + }, + { + "epoch": 0.13, + "grad_norm": 2.660943842445923, + "learning_rate": 9.715861767266472e-06, + "loss": 0.3367, + "step": 2141 + }, + { + "epoch": 0.13, + "grad_norm": 3.2532444999278076, + "learning_rate": 9.715523225671019e-06, + "loss": 0.3718, + "step": 2142 + }, + { + "epoch": 0.13, + "grad_norm": 2.5405682742241553, + "learning_rate": 9.715184488420132e-06, + "loss": 0.3913, + "step": 2143 + }, + { + "epoch": 0.13, + "grad_norm": 2.3165325107589307, + "learning_rate": 9.714845555527865e-06, + "loss": 0.3552, + "step": 2144 + }, + { + "epoch": 0.13, + "grad_norm": 4.514725721717478, + "learning_rate": 9.714506427008282e-06, + "loss": 0.3434, + "step": 2145 + }, + { + "epoch": 0.13, + "grad_norm": 1.9133066620117767, + "learning_rate": 9.714167102875452e-06, + "loss": 0.3465, + "step": 2146 + }, + { + "epoch": 0.14, + "grad_norm": 1.8019517890381582, + "learning_rate": 9.713827583143455e-06, + "loss": 0.3542, + "step": 2147 + }, + { + "epoch": 0.14, + "grad_norm": 6.231724150089947, + "learning_rate": 9.71348786782638e-06, + "loss": 0.3729, + "step": 2148 + }, + { + "epoch": 0.14, + "grad_norm": 3.2836644862699473, + "learning_rate": 9.713147956938322e-06, + "loss": 0.3694, + "step": 2149 + }, + { + "epoch": 0.14, + "grad_norm": 3.874461379345722, + "learning_rate": 9.712807850493382e-06, + "loss": 0.345, + "step": 2150 + }, + { + "epoch": 0.14, + "grad_norm": 3.0606832864004367, + "learning_rate": 9.712467548505675e-06, + "loss": 0.3876, + "step": 2151 + }, + { + "epoch": 0.14, + "grad_norm": 1.5345752718763965, + "learning_rate": 9.712127050989319e-06, + "loss": 0.3635, + "step": 2152 + }, + { + "epoch": 0.14, + "grad_norm": 1.781510323719983, + "learning_rate": 9.711786357958442e-06, + "loss": 0.3499, + "step": 2153 + }, + { + "epoch": 0.14, + "grad_norm": 3.2632539701353243, + "learning_rate": 9.711445469427179e-06, + "loss": 0.3659, + "step": 2154 + }, + { + "epoch": 0.14, + "grad_norm": 3.497717047183541, + "learning_rate": 9.711104385409676e-06, + "loss": 0.4012, + "step": 2155 + }, + { + "epoch": 0.14, + "grad_norm": 3.013333543854991, + "learning_rate": 9.710763105920082e-06, + "loss": 0.3674, + "step": 2156 + }, + { + "epoch": 0.14, + "grad_norm": 0.8311470053069792, + "learning_rate": 9.710421630972563e-06, + "loss": 0.4983, + "step": 2157 + }, + { + "epoch": 0.14, + "grad_norm": 3.534554032587132, + "learning_rate": 9.71007996058128e-06, + "loss": 0.368, + "step": 2158 + }, + { + "epoch": 0.14, + "grad_norm": 3.7369760274841606, + "learning_rate": 9.709738094760415e-06, + "loss": 0.3525, + "step": 2159 + }, + { + "epoch": 0.14, + "grad_norm": 2.8680377874275687, + "learning_rate": 9.709396033524153e-06, + "loss": 0.3587, + "step": 2160 + }, + { + "epoch": 0.14, + "grad_norm": 1.8702498262496199, + "learning_rate": 9.709053776886683e-06, + "loss": 0.3524, + "step": 2161 + }, + { + "epoch": 0.14, + "grad_norm": 4.874239426872966, + "learning_rate": 9.708711324862208e-06, + "loss": 0.3459, + "step": 2162 + }, + { + "epoch": 0.14, + "grad_norm": 1.7926988622182338, + "learning_rate": 9.708368677464936e-06, + "loss": 0.3364, + "step": 2163 + }, + { + "epoch": 0.14, + "grad_norm": 3.068595156773728, + "learning_rate": 9.708025834709085e-06, + "loss": 0.3882, + "step": 2164 + }, + { + "epoch": 0.14, + "grad_norm": 4.090259503267306, + "learning_rate": 9.707682796608879e-06, + "loss": 0.3667, + "step": 2165 + }, + { + "epoch": 0.14, + "grad_norm": 2.543607895466217, + "learning_rate": 9.707339563178554e-06, + "loss": 0.3809, + "step": 2166 + }, + { + "epoch": 0.14, + "grad_norm": 2.355073725762526, + "learning_rate": 9.706996134432346e-06, + "loss": 0.3698, + "step": 2167 + }, + { + "epoch": 0.14, + "grad_norm": 9.039427524367994, + "learning_rate": 9.70665251038451e-06, + "loss": 0.3572, + "step": 2168 + }, + { + "epoch": 0.14, + "grad_norm": 2.3888312239997838, + "learning_rate": 9.706308691049302e-06, + "loss": 0.3552, + "step": 2169 + }, + { + "epoch": 0.14, + "grad_norm": 5.188509015657696, + "learning_rate": 9.705964676440984e-06, + "loss": 0.3437, + "step": 2170 + }, + { + "epoch": 0.14, + "grad_norm": 4.681347300694205, + "learning_rate": 9.705620466573837e-06, + "loss": 0.3547, + "step": 2171 + }, + { + "epoch": 0.14, + "grad_norm": 2.4445920081855586, + "learning_rate": 9.705276061462135e-06, + "loss": 0.3979, + "step": 2172 + }, + { + "epoch": 0.14, + "grad_norm": 3.107750685951658, + "learning_rate": 9.704931461120173e-06, + "loss": 0.3716, + "step": 2173 + }, + { + "epoch": 0.14, + "grad_norm": 2.6290719881406557, + "learning_rate": 9.70458666556225e-06, + "loss": 0.371, + "step": 2174 + }, + { + "epoch": 0.14, + "grad_norm": 3.3135326378460883, + "learning_rate": 9.704241674802668e-06, + "loss": 0.3849, + "step": 2175 + }, + { + "epoch": 0.14, + "grad_norm": 1.9109486876018453, + "learning_rate": 9.703896488855742e-06, + "loss": 0.3744, + "step": 2176 + }, + { + "epoch": 0.14, + "grad_norm": 2.09958236067567, + "learning_rate": 9.703551107735798e-06, + "loss": 0.3471, + "step": 2177 + }, + { + "epoch": 0.14, + "grad_norm": 0.8349254683229255, + "learning_rate": 9.703205531457163e-06, + "loss": 0.4838, + "step": 2178 + }, + { + "epoch": 0.14, + "grad_norm": 4.459660001768668, + "learning_rate": 9.702859760034177e-06, + "loss": 0.3497, + "step": 2179 + }, + { + "epoch": 0.14, + "grad_norm": 3.090096725810043, + "learning_rate": 9.702513793481186e-06, + "loss": 0.3532, + "step": 2180 + }, + { + "epoch": 0.14, + "grad_norm": 6.560318138826427, + "learning_rate": 9.702167631812544e-06, + "loss": 0.3704, + "step": 2181 + }, + { + "epoch": 0.14, + "grad_norm": 4.5780875389561455, + "learning_rate": 9.701821275042618e-06, + "loss": 0.3443, + "step": 2182 + }, + { + "epoch": 0.14, + "grad_norm": 4.047438538653314, + "learning_rate": 9.701474723185774e-06, + "loss": 0.3573, + "step": 2183 + }, + { + "epoch": 0.14, + "grad_norm": 2.001446806845423, + "learning_rate": 9.701127976256392e-06, + "loss": 0.3436, + "step": 2184 + }, + { + "epoch": 0.14, + "grad_norm": 2.6806415294516435, + "learning_rate": 9.700781034268861e-06, + "loss": 0.3663, + "step": 2185 + }, + { + "epoch": 0.14, + "grad_norm": 2.992265753593225, + "learning_rate": 9.700433897237576e-06, + "loss": 0.3769, + "step": 2186 + }, + { + "epoch": 0.14, + "grad_norm": 2.383394810883486, + "learning_rate": 9.70008656517694e-06, + "loss": 0.3643, + "step": 2187 + }, + { + "epoch": 0.14, + "grad_norm": 9.130913273587383, + "learning_rate": 9.699739038101363e-06, + "loss": 0.3518, + "step": 2188 + }, + { + "epoch": 0.14, + "grad_norm": 2.142319729777803, + "learning_rate": 9.699391316025266e-06, + "loss": 0.3516, + "step": 2189 + }, + { + "epoch": 0.14, + "grad_norm": 1.9270841797880551, + "learning_rate": 9.699043398963075e-06, + "loss": 0.3514, + "step": 2190 + }, + { + "epoch": 0.14, + "grad_norm": 2.5876699053379104, + "learning_rate": 9.69869528692923e-06, + "loss": 0.37, + "step": 2191 + }, + { + "epoch": 0.14, + "grad_norm": 3.541893933981502, + "learning_rate": 9.698346979938169e-06, + "loss": 0.3654, + "step": 2192 + }, + { + "epoch": 0.14, + "grad_norm": 6.134315845942576, + "learning_rate": 9.697998478004347e-06, + "loss": 0.3506, + "step": 2193 + }, + { + "epoch": 0.14, + "grad_norm": 2.6481892197476746, + "learning_rate": 9.697649781142225e-06, + "loss": 0.3597, + "step": 2194 + }, + { + "epoch": 0.14, + "grad_norm": 2.612137458004614, + "learning_rate": 9.697300889366268e-06, + "loss": 0.3576, + "step": 2195 + }, + { + "epoch": 0.14, + "grad_norm": 0.9316800321828856, + "learning_rate": 9.696951802690955e-06, + "loss": 0.5251, + "step": 2196 + }, + { + "epoch": 0.14, + "grad_norm": 6.172566635186495, + "learning_rate": 9.696602521130768e-06, + "loss": 0.3546, + "step": 2197 + }, + { + "epoch": 0.14, + "grad_norm": 5.960716584993781, + "learning_rate": 9.6962530447002e-06, + "loss": 0.3566, + "step": 2198 + }, + { + "epoch": 0.14, + "grad_norm": 6.32410744425751, + "learning_rate": 9.695903373413753e-06, + "loss": 0.3782, + "step": 2199 + }, + { + "epoch": 0.14, + "grad_norm": 3.665659509685868, + "learning_rate": 9.695553507285934e-06, + "loss": 0.3784, + "step": 2200 + }, + { + "epoch": 0.14, + "grad_norm": 2.5038958624637533, + "learning_rate": 9.695203446331258e-06, + "loss": 0.3638, + "step": 2201 + }, + { + "epoch": 0.14, + "grad_norm": 2.5654988510252177, + "learning_rate": 9.694853190564253e-06, + "loss": 0.3836, + "step": 2202 + }, + { + "epoch": 0.14, + "grad_norm": 5.274045728265552, + "learning_rate": 9.694502739999449e-06, + "loss": 0.3323, + "step": 2203 + }, + { + "epoch": 0.14, + "grad_norm": 3.3439694332278393, + "learning_rate": 9.69415209465139e-06, + "loss": 0.3605, + "step": 2204 + }, + { + "epoch": 0.14, + "grad_norm": 3.560716690652859, + "learning_rate": 9.693801254534622e-06, + "loss": 0.3624, + "step": 2205 + }, + { + "epoch": 0.14, + "grad_norm": 4.409334241591523, + "learning_rate": 9.693450219663703e-06, + "loss": 0.3699, + "step": 2206 + }, + { + "epoch": 0.14, + "grad_norm": 1.9714270733155934, + "learning_rate": 9.693098990053197e-06, + "loss": 0.3708, + "step": 2207 + }, + { + "epoch": 0.14, + "grad_norm": 2.0923282592001042, + "learning_rate": 9.692747565717677e-06, + "loss": 0.3682, + "step": 2208 + }, + { + "epoch": 0.14, + "grad_norm": 3.3440626411450176, + "learning_rate": 9.692395946671727e-06, + "loss": 0.3488, + "step": 2209 + }, + { + "epoch": 0.14, + "grad_norm": 2.5333082990512574, + "learning_rate": 9.692044132929934e-06, + "loss": 0.3774, + "step": 2210 + }, + { + "epoch": 0.14, + "grad_norm": 3.0748073568895737, + "learning_rate": 9.691692124506896e-06, + "loss": 0.3603, + "step": 2211 + }, + { + "epoch": 0.14, + "grad_norm": 2.2534062520128604, + "learning_rate": 9.691339921417219e-06, + "loss": 0.3377, + "step": 2212 + }, + { + "epoch": 0.14, + "grad_norm": 2.5344521511625326, + "learning_rate": 9.690987523675514e-06, + "loss": 0.3578, + "step": 2213 + }, + { + "epoch": 0.14, + "grad_norm": 4.335457593172534, + "learning_rate": 9.690634931296408e-06, + "loss": 0.3637, + "step": 2214 + }, + { + "epoch": 0.14, + "grad_norm": 2.446251914770414, + "learning_rate": 9.690282144294524e-06, + "loss": 0.333, + "step": 2215 + }, + { + "epoch": 0.14, + "grad_norm": 3.2504853190219287, + "learning_rate": 9.689929162684503e-06, + "loss": 0.3596, + "step": 2216 + }, + { + "epoch": 0.14, + "grad_norm": 2.9979319485051117, + "learning_rate": 9.689575986480992e-06, + "loss": 0.3674, + "step": 2217 + }, + { + "epoch": 0.14, + "grad_norm": 3.147606422679075, + "learning_rate": 9.689222615698643e-06, + "loss": 0.3513, + "step": 2218 + }, + { + "epoch": 0.14, + "grad_norm": 3.0713409710626527, + "learning_rate": 9.688869050352119e-06, + "loss": 0.3708, + "step": 2219 + }, + { + "epoch": 0.14, + "grad_norm": 6.697857059745171, + "learning_rate": 9.688515290456092e-06, + "loss": 0.3462, + "step": 2220 + }, + { + "epoch": 0.14, + "grad_norm": 2.905978614169605, + "learning_rate": 9.688161336025234e-06, + "loss": 0.3429, + "step": 2221 + }, + { + "epoch": 0.14, + "grad_norm": 2.909014222345689, + "learning_rate": 9.687807187074238e-06, + "loss": 0.3435, + "step": 2222 + }, + { + "epoch": 0.14, + "grad_norm": 3.615511745018682, + "learning_rate": 9.687452843617792e-06, + "loss": 0.3758, + "step": 2223 + }, + { + "epoch": 0.14, + "grad_norm": 1.8141319252177568, + "learning_rate": 9.687098305670606e-06, + "loss": 0.3673, + "step": 2224 + }, + { + "epoch": 0.14, + "grad_norm": 2.3395920323086177, + "learning_rate": 9.686743573247383e-06, + "loss": 0.3605, + "step": 2225 + }, + { + "epoch": 0.14, + "grad_norm": 2.6540624458603483, + "learning_rate": 9.686388646362846e-06, + "loss": 0.3421, + "step": 2226 + }, + { + "epoch": 0.14, + "grad_norm": 4.120099167484428, + "learning_rate": 9.68603352503172e-06, + "loss": 0.3772, + "step": 2227 + }, + { + "epoch": 0.14, + "grad_norm": 2.8111315477744756, + "learning_rate": 9.685678209268738e-06, + "loss": 0.3679, + "step": 2228 + }, + { + "epoch": 0.14, + "grad_norm": 6.121181310744283, + "learning_rate": 9.685322699088647e-06, + "loss": 0.372, + "step": 2229 + }, + { + "epoch": 0.14, + "grad_norm": 3.3190016218223635, + "learning_rate": 9.684966994506193e-06, + "loss": 0.3582, + "step": 2230 + }, + { + "epoch": 0.14, + "grad_norm": 1.0008797873312196, + "learning_rate": 9.684611095536137e-06, + "loss": 0.513, + "step": 2231 + }, + { + "epoch": 0.14, + "grad_norm": 3.085802656987168, + "learning_rate": 9.684255002193246e-06, + "loss": 0.3413, + "step": 2232 + }, + { + "epoch": 0.14, + "grad_norm": 2.842011045685869, + "learning_rate": 9.683898714492296e-06, + "loss": 0.3636, + "step": 2233 + }, + { + "epoch": 0.14, + "grad_norm": 2.345497910221064, + "learning_rate": 9.683542232448068e-06, + "loss": 0.356, + "step": 2234 + }, + { + "epoch": 0.14, + "grad_norm": 3.3031539572997266, + "learning_rate": 9.683185556075354e-06, + "loss": 0.3674, + "step": 2235 + }, + { + "epoch": 0.14, + "grad_norm": 3.5441288379330023, + "learning_rate": 9.682828685388954e-06, + "loss": 0.3426, + "step": 2236 + }, + { + "epoch": 0.14, + "grad_norm": 6.07700230917055, + "learning_rate": 9.682471620403673e-06, + "loss": 0.3548, + "step": 2237 + }, + { + "epoch": 0.14, + "grad_norm": 2.4081556992150213, + "learning_rate": 9.682114361134327e-06, + "loss": 0.3308, + "step": 2238 + }, + { + "epoch": 0.14, + "grad_norm": 2.7716307869559373, + "learning_rate": 9.681756907595741e-06, + "loss": 0.3497, + "step": 2239 + }, + { + "epoch": 0.14, + "grad_norm": 2.1025799805934735, + "learning_rate": 9.681399259802744e-06, + "loss": 0.3453, + "step": 2240 + }, + { + "epoch": 0.14, + "grad_norm": 2.5764295638939267, + "learning_rate": 9.681041417770176e-06, + "loss": 0.3626, + "step": 2241 + }, + { + "epoch": 0.14, + "grad_norm": 4.239281158943012, + "learning_rate": 9.680683381512888e-06, + "loss": 0.3478, + "step": 2242 + }, + { + "epoch": 0.14, + "grad_norm": 2.2584329750144967, + "learning_rate": 9.68032515104573e-06, + "loss": 0.3585, + "step": 2243 + }, + { + "epoch": 0.14, + "grad_norm": 2.780756543442347, + "learning_rate": 9.679966726383569e-06, + "loss": 0.382, + "step": 2244 + }, + { + "epoch": 0.14, + "grad_norm": 12.641132909899936, + "learning_rate": 9.679608107541278e-06, + "loss": 0.376, + "step": 2245 + }, + { + "epoch": 0.14, + "grad_norm": 2.9373484888075962, + "learning_rate": 9.679249294533733e-06, + "loss": 0.3515, + "step": 2246 + }, + { + "epoch": 0.14, + "grad_norm": 2.7470182000533456, + "learning_rate": 9.678890287375823e-06, + "loss": 0.349, + "step": 2247 + }, + { + "epoch": 0.14, + "grad_norm": 4.075101042853636, + "learning_rate": 9.678531086082444e-06, + "loss": 0.3598, + "step": 2248 + }, + { + "epoch": 0.14, + "grad_norm": 4.545818294862518, + "learning_rate": 9.6781716906685e-06, + "loss": 0.3414, + "step": 2249 + }, + { + "epoch": 0.14, + "grad_norm": 1.72612675981877, + "learning_rate": 9.677812101148906e-06, + "loss": 0.3542, + "step": 2250 + }, + { + "epoch": 0.14, + "grad_norm": 11.599238197206029, + "learning_rate": 9.677452317538576e-06, + "loss": 0.3667, + "step": 2251 + }, + { + "epoch": 0.14, + "grad_norm": 6.552295386551471, + "learning_rate": 9.677092339852443e-06, + "loss": 0.3757, + "step": 2252 + }, + { + "epoch": 0.14, + "grad_norm": 3.2749637261026314, + "learning_rate": 9.676732168105443e-06, + "loss": 0.3502, + "step": 2253 + }, + { + "epoch": 0.14, + "grad_norm": 2.8524830075707692, + "learning_rate": 9.676371802312515e-06, + "loss": 0.361, + "step": 2254 + }, + { + "epoch": 0.14, + "grad_norm": 3.5399726269706826, + "learning_rate": 9.676011242488616e-06, + "loss": 0.3845, + "step": 2255 + }, + { + "epoch": 0.14, + "grad_norm": 2.354759199309399, + "learning_rate": 9.675650488648707e-06, + "loss": 0.3341, + "step": 2256 + }, + { + "epoch": 0.14, + "grad_norm": 3.603553884786818, + "learning_rate": 9.675289540807752e-06, + "loss": 0.3333, + "step": 2257 + }, + { + "epoch": 0.14, + "grad_norm": 3.773587318579438, + "learning_rate": 9.674928398980729e-06, + "loss": 0.3362, + "step": 2258 + }, + { + "epoch": 0.14, + "grad_norm": 2.67176691595643, + "learning_rate": 9.674567063182626e-06, + "loss": 0.3625, + "step": 2259 + }, + { + "epoch": 0.14, + "grad_norm": 4.908384262448215, + "learning_rate": 9.674205533428431e-06, + "loss": 0.3691, + "step": 2260 + }, + { + "epoch": 0.14, + "grad_norm": 2.307049662295108, + "learning_rate": 9.673843809733145e-06, + "loss": 0.3554, + "step": 2261 + }, + { + "epoch": 0.14, + "grad_norm": 2.1999270013192764, + "learning_rate": 9.67348189211178e-06, + "loss": 0.3623, + "step": 2262 + }, + { + "epoch": 0.14, + "grad_norm": 2.79505183529914, + "learning_rate": 9.67311978057935e-06, + "loss": 0.3797, + "step": 2263 + }, + { + "epoch": 0.14, + "grad_norm": 2.568944247674684, + "learning_rate": 9.672757475150878e-06, + "loss": 0.3605, + "step": 2264 + }, + { + "epoch": 0.14, + "grad_norm": 67.47337375924475, + "learning_rate": 9.6723949758414e-06, + "loss": 0.3194, + "step": 2265 + }, + { + "epoch": 0.14, + "grad_norm": 3.1499024496376413, + "learning_rate": 9.672032282665954e-06, + "loss": 0.3772, + "step": 2266 + }, + { + "epoch": 0.14, + "grad_norm": 2.4091814983918054, + "learning_rate": 9.67166939563959e-06, + "loss": 0.3843, + "step": 2267 + }, + { + "epoch": 0.14, + "grad_norm": 2.3335453182494446, + "learning_rate": 9.671306314777367e-06, + "loss": 0.345, + "step": 2268 + }, + { + "epoch": 0.14, + "grad_norm": 5.160702802736921, + "learning_rate": 9.670943040094347e-06, + "loss": 0.3515, + "step": 2269 + }, + { + "epoch": 0.14, + "grad_norm": 3.4360182720795938, + "learning_rate": 9.670579571605605e-06, + "loss": 0.3471, + "step": 2270 + }, + { + "epoch": 0.14, + "grad_norm": 2.653755317861475, + "learning_rate": 9.670215909326219e-06, + "loss": 0.3304, + "step": 2271 + }, + { + "epoch": 0.14, + "grad_norm": 3.1601480854090656, + "learning_rate": 9.669852053271278e-06, + "loss": 0.3534, + "step": 2272 + }, + { + "epoch": 0.14, + "grad_norm": 2.951268372737922, + "learning_rate": 9.669488003455884e-06, + "loss": 0.3548, + "step": 2273 + }, + { + "epoch": 0.14, + "grad_norm": 2.1361984311785482, + "learning_rate": 9.669123759895137e-06, + "loss": 0.372, + "step": 2274 + }, + { + "epoch": 0.14, + "grad_norm": 2.46221679531022, + "learning_rate": 9.668759322604154e-06, + "loss": 0.354, + "step": 2275 + }, + { + "epoch": 0.14, + "grad_norm": 1.9824662415726244, + "learning_rate": 9.66839469159805e-06, + "loss": 0.3596, + "step": 2276 + }, + { + "epoch": 0.14, + "grad_norm": 3.800468707625609, + "learning_rate": 9.668029866891962e-06, + "loss": 0.3483, + "step": 2277 + }, + { + "epoch": 0.14, + "grad_norm": 1.0650565663973084, + "learning_rate": 9.667664848501022e-06, + "loss": 0.5287, + "step": 2278 + }, + { + "epoch": 0.14, + "grad_norm": 2.8209529394633437, + "learning_rate": 9.667299636440377e-06, + "loss": 0.3604, + "step": 2279 + }, + { + "epoch": 0.14, + "grad_norm": 3.6744587582900383, + "learning_rate": 9.66693423072518e-06, + "loss": 0.3607, + "step": 2280 + }, + { + "epoch": 0.14, + "grad_norm": 3.5962459095432746, + "learning_rate": 9.666568631370592e-06, + "loss": 0.3614, + "step": 2281 + }, + { + "epoch": 0.14, + "grad_norm": 0.6639755639194774, + "learning_rate": 9.666202838391783e-06, + "loss": 0.4927, + "step": 2282 + }, + { + "epoch": 0.14, + "grad_norm": 3.2951901857763573, + "learning_rate": 9.66583685180393e-06, + "loss": 0.3445, + "step": 2283 + }, + { + "epoch": 0.14, + "grad_norm": 4.511161265817613, + "learning_rate": 9.66547067162222e-06, + "loss": 0.3773, + "step": 2284 + }, + { + "epoch": 0.14, + "grad_norm": 4.901182692439293, + "learning_rate": 9.665104297861842e-06, + "loss": 0.3601, + "step": 2285 + }, + { + "epoch": 0.14, + "grad_norm": 2.5004267910064106, + "learning_rate": 9.664737730538003e-06, + "loss": 0.3425, + "step": 2286 + }, + { + "epoch": 0.14, + "grad_norm": 2.699872788028884, + "learning_rate": 9.66437096966591e-06, + "loss": 0.3646, + "step": 2287 + }, + { + "epoch": 0.14, + "grad_norm": 3.466253276055321, + "learning_rate": 9.66400401526078e-06, + "loss": 0.3685, + "step": 2288 + }, + { + "epoch": 0.14, + "grad_norm": 2.974486155869181, + "learning_rate": 9.663636867337838e-06, + "loss": 0.348, + "step": 2289 + }, + { + "epoch": 0.14, + "grad_norm": 2.009561889838298, + "learning_rate": 9.66326952591232e-06, + "loss": 0.3456, + "step": 2290 + }, + { + "epoch": 0.14, + "grad_norm": 2.3282365927121345, + "learning_rate": 9.662901990999468e-06, + "loss": 0.3474, + "step": 2291 + }, + { + "epoch": 0.14, + "grad_norm": 3.7540195970671437, + "learning_rate": 9.662534262614528e-06, + "loss": 0.3526, + "step": 2292 + }, + { + "epoch": 0.14, + "grad_norm": 3.031315959580994, + "learning_rate": 9.66216634077276e-06, + "loss": 0.3614, + "step": 2293 + }, + { + "epoch": 0.14, + "grad_norm": 2.290872175706192, + "learning_rate": 9.66179822548943e-06, + "loss": 0.3535, + "step": 2294 + }, + { + "epoch": 0.14, + "grad_norm": 13.08464816871641, + "learning_rate": 9.661429916779812e-06, + "loss": 0.3481, + "step": 2295 + }, + { + "epoch": 0.14, + "grad_norm": 10.618922565460359, + "learning_rate": 9.661061414659185e-06, + "loss": 0.3652, + "step": 2296 + }, + { + "epoch": 0.14, + "grad_norm": 1.8224276494957194, + "learning_rate": 9.660692719142843e-06, + "loss": 0.3428, + "step": 2297 + }, + { + "epoch": 0.14, + "grad_norm": 2.68191708088931, + "learning_rate": 9.660323830246082e-06, + "loss": 0.3411, + "step": 2298 + }, + { + "epoch": 0.14, + "grad_norm": 4.328870414668892, + "learning_rate": 9.659954747984206e-06, + "loss": 0.3403, + "step": 2299 + }, + { + "epoch": 0.14, + "grad_norm": 2.38353695337482, + "learning_rate": 9.65958547237253e-06, + "loss": 0.3736, + "step": 2300 + }, + { + "epoch": 0.14, + "grad_norm": 6.749184432940049, + "learning_rate": 9.659216003426378e-06, + "loss": 0.3624, + "step": 2301 + }, + { + "epoch": 0.14, + "grad_norm": 4.169476479975519, + "learning_rate": 9.658846341161079e-06, + "loss": 0.3761, + "step": 2302 + }, + { + "epoch": 0.14, + "grad_norm": 2.365985932567745, + "learning_rate": 9.658476485591968e-06, + "loss": 0.3781, + "step": 2303 + }, + { + "epoch": 0.14, + "grad_norm": 2.8827023001781615, + "learning_rate": 9.658106436734395e-06, + "loss": 0.3669, + "step": 2304 + }, + { + "epoch": 0.14, + "grad_norm": 3.3501941336783827, + "learning_rate": 9.657736194603711e-06, + "loss": 0.3702, + "step": 2305 + }, + { + "epoch": 0.15, + "grad_norm": 2.8538506973369806, + "learning_rate": 9.657365759215281e-06, + "loss": 0.3449, + "step": 2306 + }, + { + "epoch": 0.15, + "grad_norm": 2.7980927854541333, + "learning_rate": 9.656995130584473e-06, + "loss": 0.3454, + "step": 2307 + }, + { + "epoch": 0.15, + "grad_norm": 11.245172410954305, + "learning_rate": 9.656624308726662e-06, + "loss": 0.3634, + "step": 2308 + }, + { + "epoch": 0.15, + "grad_norm": 2.389572450307798, + "learning_rate": 9.65625329365724e-06, + "loss": 0.3397, + "step": 2309 + }, + { + "epoch": 0.15, + "grad_norm": 2.3100204934540214, + "learning_rate": 9.6558820853916e-06, + "loss": 0.3358, + "step": 2310 + }, + { + "epoch": 0.15, + "grad_norm": 2.0980354364598153, + "learning_rate": 9.655510683945139e-06, + "loss": 0.3616, + "step": 2311 + }, + { + "epoch": 0.15, + "grad_norm": 5.034610519930271, + "learning_rate": 9.655139089333272e-06, + "loss": 0.3441, + "step": 2312 + }, + { + "epoch": 0.15, + "grad_norm": 2.5320154288031897, + "learning_rate": 9.654767301571418e-06, + "loss": 0.3595, + "step": 2313 + }, + { + "epoch": 0.15, + "grad_norm": 1.9739325212177743, + "learning_rate": 9.654395320674998e-06, + "loss": 0.3398, + "step": 2314 + }, + { + "epoch": 0.15, + "grad_norm": 2.2320474183680483, + "learning_rate": 9.65402314665945e-06, + "loss": 0.3235, + "step": 2315 + }, + { + "epoch": 0.15, + "grad_norm": 2.2347126569903, + "learning_rate": 9.653650779540214e-06, + "loss": 0.3631, + "step": 2316 + }, + { + "epoch": 0.15, + "grad_norm": 1.8442530371937012, + "learning_rate": 9.653278219332742e-06, + "loss": 0.3616, + "step": 2317 + }, + { + "epoch": 0.15, + "grad_norm": 5.60449634011267, + "learning_rate": 9.652905466052492e-06, + "loss": 0.3279, + "step": 2318 + }, + { + "epoch": 0.15, + "grad_norm": 2.8604555700909606, + "learning_rate": 9.65253251971493e-06, + "loss": 0.357, + "step": 2319 + }, + { + "epoch": 0.15, + "grad_norm": 2.248440473110048, + "learning_rate": 9.65215938033553e-06, + "loss": 0.4123, + "step": 2320 + }, + { + "epoch": 0.15, + "grad_norm": 3.2557147594249787, + "learning_rate": 9.651786047929772e-06, + "loss": 0.3519, + "step": 2321 + }, + { + "epoch": 0.15, + "grad_norm": 2.337233149359728, + "learning_rate": 9.651412522513151e-06, + "loss": 0.3498, + "step": 2322 + }, + { + "epoch": 0.15, + "grad_norm": 7.08644851302118, + "learning_rate": 9.651038804101162e-06, + "loss": 0.368, + "step": 2323 + }, + { + "epoch": 0.15, + "grad_norm": 2.201117362555605, + "learning_rate": 9.650664892709311e-06, + "loss": 0.3862, + "step": 2324 + }, + { + "epoch": 0.15, + "grad_norm": 2.785244373287499, + "learning_rate": 9.650290788353114e-06, + "loss": 0.3688, + "step": 2325 + }, + { + "epoch": 0.15, + "grad_norm": 2.887624913253619, + "learning_rate": 9.649916491048092e-06, + "loss": 0.3399, + "step": 2326 + }, + { + "epoch": 0.15, + "grad_norm": 3.0446341964073884, + "learning_rate": 9.649542000809775e-06, + "loss": 0.3469, + "step": 2327 + }, + { + "epoch": 0.15, + "grad_norm": 1.2234333392104157, + "learning_rate": 9.649167317653703e-06, + "loss": 0.5292, + "step": 2328 + }, + { + "epoch": 0.15, + "grad_norm": 2.0099881681755503, + "learning_rate": 9.64879244159542e-06, + "loss": 0.3513, + "step": 2329 + }, + { + "epoch": 0.15, + "grad_norm": 2.855983876377842, + "learning_rate": 9.648417372650482e-06, + "loss": 0.3457, + "step": 2330 + }, + { + "epoch": 0.15, + "grad_norm": 1.7475013771571395, + "learning_rate": 9.64804211083445e-06, + "loss": 0.3537, + "step": 2331 + }, + { + "epoch": 0.15, + "grad_norm": 3.1653810440796653, + "learning_rate": 9.647666656162898e-06, + "loss": 0.3517, + "step": 2332 + }, + { + "epoch": 0.15, + "grad_norm": 7.02294797819566, + "learning_rate": 9.647291008651398e-06, + "loss": 0.3651, + "step": 2333 + }, + { + "epoch": 0.15, + "grad_norm": 5.048615989165704, + "learning_rate": 9.64691516831554e-06, + "loss": 0.3682, + "step": 2334 + }, + { + "epoch": 0.15, + "grad_norm": 2.654696700868606, + "learning_rate": 9.646539135170919e-06, + "loss": 0.366, + "step": 2335 + }, + { + "epoch": 0.15, + "grad_norm": 1.9975112521803116, + "learning_rate": 9.646162909233135e-06, + "loss": 0.3726, + "step": 2336 + }, + { + "epoch": 0.15, + "grad_norm": 2.1020659732556655, + "learning_rate": 9.6457864905178e-06, + "loss": 0.3499, + "step": 2337 + }, + { + "epoch": 0.15, + "grad_norm": 2.4810057879316187, + "learning_rate": 9.64540987904053e-06, + "loss": 0.3301, + "step": 2338 + }, + { + "epoch": 0.15, + "grad_norm": 1.880311443001001, + "learning_rate": 9.645033074816955e-06, + "loss": 0.3508, + "step": 2339 + }, + { + "epoch": 0.15, + "grad_norm": 1.9626301859520199, + "learning_rate": 9.644656077862706e-06, + "loss": 0.3628, + "step": 2340 + }, + { + "epoch": 0.15, + "grad_norm": 2.6082585080628293, + "learning_rate": 9.644278888193427e-06, + "loss": 0.3801, + "step": 2341 + }, + { + "epoch": 0.15, + "grad_norm": 9.508594621300968, + "learning_rate": 9.64390150582477e-06, + "loss": 0.3827, + "step": 2342 + }, + { + "epoch": 0.15, + "grad_norm": 3.162779175873289, + "learning_rate": 9.643523930772388e-06, + "loss": 0.3826, + "step": 2343 + }, + { + "epoch": 0.15, + "grad_norm": 3.9706900900386937, + "learning_rate": 9.64314616305195e-06, + "loss": 0.372, + "step": 2344 + }, + { + "epoch": 0.15, + "grad_norm": 2.306415379539258, + "learning_rate": 9.64276820267913e-06, + "loss": 0.3443, + "step": 2345 + }, + { + "epoch": 0.15, + "grad_norm": 4.345837477259276, + "learning_rate": 9.642390049669614e-06, + "loss": 0.345, + "step": 2346 + }, + { + "epoch": 0.15, + "grad_norm": 3.0960833003109123, + "learning_rate": 9.642011704039087e-06, + "loss": 0.3802, + "step": 2347 + }, + { + "epoch": 0.15, + "grad_norm": 2.1174944807098237, + "learning_rate": 9.64163316580325e-06, + "loss": 0.3522, + "step": 2348 + }, + { + "epoch": 0.15, + "grad_norm": 2.5986618743569525, + "learning_rate": 9.64125443497781e-06, + "loss": 0.3747, + "step": 2349 + }, + { + "epoch": 0.15, + "grad_norm": 4.178705716855747, + "learning_rate": 9.640875511578475e-06, + "loss": 0.3789, + "step": 2350 + }, + { + "epoch": 0.15, + "grad_norm": 1.7108012864525584, + "learning_rate": 9.640496395620976e-06, + "loss": 0.3437, + "step": 2351 + }, + { + "epoch": 0.15, + "grad_norm": 1.9976088571999921, + "learning_rate": 9.640117087121038e-06, + "loss": 0.3555, + "step": 2352 + }, + { + "epoch": 0.15, + "grad_norm": 8.332575910936503, + "learning_rate": 9.6397375860944e-06, + "loss": 0.3602, + "step": 2353 + }, + { + "epoch": 0.15, + "grad_norm": 2.0413358711022536, + "learning_rate": 9.63935789255681e-06, + "loss": 0.3297, + "step": 2354 + }, + { + "epoch": 0.15, + "grad_norm": 2.9357488207033153, + "learning_rate": 9.638978006524017e-06, + "loss": 0.3331, + "step": 2355 + }, + { + "epoch": 0.15, + "grad_norm": 2.794251389142506, + "learning_rate": 9.63859792801179e-06, + "loss": 0.3618, + "step": 2356 + }, + { + "epoch": 0.15, + "grad_norm": 2.0938584713205217, + "learning_rate": 9.638217657035895e-06, + "loss": 0.3472, + "step": 2357 + }, + { + "epoch": 0.15, + "grad_norm": 2.1667692379618164, + "learning_rate": 9.637837193612112e-06, + "loss": 0.3462, + "step": 2358 + }, + { + "epoch": 0.15, + "grad_norm": 2.2927168392100215, + "learning_rate": 9.637456537756224e-06, + "loss": 0.353, + "step": 2359 + }, + { + "epoch": 0.15, + "grad_norm": 1.7197863118466372, + "learning_rate": 9.637075689484027e-06, + "loss": 0.3371, + "step": 2360 + }, + { + "epoch": 0.15, + "grad_norm": 9.817247900294978, + "learning_rate": 9.636694648811326e-06, + "loss": 0.3552, + "step": 2361 + }, + { + "epoch": 0.15, + "grad_norm": 2.543037062907734, + "learning_rate": 9.636313415753927e-06, + "loss": 0.3741, + "step": 2362 + }, + { + "epoch": 0.15, + "grad_norm": 3.536416547673591, + "learning_rate": 9.635931990327649e-06, + "loss": 0.3268, + "step": 2363 + }, + { + "epoch": 0.15, + "grad_norm": 3.6625163197933777, + "learning_rate": 9.635550372548317e-06, + "loss": 0.3682, + "step": 2364 + }, + { + "epoch": 0.15, + "grad_norm": 2.3342808485981523, + "learning_rate": 9.635168562431769e-06, + "loss": 0.3543, + "step": 2365 + }, + { + "epoch": 0.15, + "grad_norm": 2.7700670597111325, + "learning_rate": 9.634786559993842e-06, + "loss": 0.3591, + "step": 2366 + }, + { + "epoch": 0.15, + "grad_norm": 2.35645116878552, + "learning_rate": 9.634404365250391e-06, + "loss": 0.3506, + "step": 2367 + }, + { + "epoch": 0.15, + "grad_norm": 4.707460131914564, + "learning_rate": 9.63402197821727e-06, + "loss": 0.3716, + "step": 2368 + }, + { + "epoch": 0.15, + "grad_norm": 1.5427111309804353, + "learning_rate": 9.633639398910346e-06, + "loss": 0.3174, + "step": 2369 + }, + { + "epoch": 0.15, + "grad_norm": 3.715698775498241, + "learning_rate": 9.633256627345494e-06, + "loss": 0.366, + "step": 2370 + }, + { + "epoch": 0.15, + "grad_norm": 7.0358262155339855, + "learning_rate": 9.632873663538594e-06, + "loss": 0.3549, + "step": 2371 + }, + { + "epoch": 0.15, + "grad_norm": 2.52722156177181, + "learning_rate": 9.632490507505536e-06, + "loss": 0.3644, + "step": 2372 + }, + { + "epoch": 0.15, + "grad_norm": 2.0780316152770864, + "learning_rate": 9.632107159262218e-06, + "loss": 0.3586, + "step": 2373 + }, + { + "epoch": 0.15, + "grad_norm": 2.64635097432633, + "learning_rate": 9.631723618824549e-06, + "loss": 0.3482, + "step": 2374 + }, + { + "epoch": 0.15, + "grad_norm": 6.197373002378108, + "learning_rate": 9.63133988620844e-06, + "loss": 0.3509, + "step": 2375 + }, + { + "epoch": 0.15, + "grad_norm": 1.87685837698477, + "learning_rate": 9.63095596142981e-06, + "loss": 0.3944, + "step": 2376 + }, + { + "epoch": 0.15, + "grad_norm": 4.500786724373431, + "learning_rate": 9.630571844504594e-06, + "loss": 0.3502, + "step": 2377 + }, + { + "epoch": 0.15, + "grad_norm": 2.7455177113536733, + "learning_rate": 9.630187535448727e-06, + "loss": 0.3756, + "step": 2378 + }, + { + "epoch": 0.15, + "grad_norm": 5.23001573548013, + "learning_rate": 9.629803034278155e-06, + "loss": 0.4042, + "step": 2379 + }, + { + "epoch": 0.15, + "grad_norm": 1.94647314913899, + "learning_rate": 9.629418341008831e-06, + "loss": 0.3496, + "step": 2380 + }, + { + "epoch": 0.15, + "grad_norm": 2.537475468068543, + "learning_rate": 9.62903345565672e-06, + "loss": 0.3839, + "step": 2381 + }, + { + "epoch": 0.15, + "grad_norm": 2.4068648716576693, + "learning_rate": 9.628648378237786e-06, + "loss": 0.3421, + "step": 2382 + }, + { + "epoch": 0.15, + "grad_norm": 1.9439316442881507, + "learning_rate": 9.62826310876801e-06, + "loss": 0.3703, + "step": 2383 + }, + { + "epoch": 0.15, + "grad_norm": 2.9368852642831618, + "learning_rate": 9.627877647263378e-06, + "loss": 0.3556, + "step": 2384 + }, + { + "epoch": 0.15, + "grad_norm": 7.335718471294046, + "learning_rate": 9.62749199373988e-06, + "loss": 0.3522, + "step": 2385 + }, + { + "epoch": 0.15, + "grad_norm": 4.029347325183008, + "learning_rate": 9.627106148213521e-06, + "loss": 0.3527, + "step": 2386 + }, + { + "epoch": 0.15, + "grad_norm": 4.49920180217814, + "learning_rate": 9.62672011070031e-06, + "loss": 0.3331, + "step": 2387 + }, + { + "epoch": 0.15, + "grad_norm": 3.666386157382629, + "learning_rate": 9.626333881216263e-06, + "loss": 0.3868, + "step": 2388 + }, + { + "epoch": 0.15, + "grad_norm": 3.453130005653644, + "learning_rate": 9.625947459777408e-06, + "loss": 0.3699, + "step": 2389 + }, + { + "epoch": 0.15, + "grad_norm": 2.799579787938819, + "learning_rate": 9.625560846399774e-06, + "loss": 0.3519, + "step": 2390 + }, + { + "epoch": 0.15, + "grad_norm": 3.0096918035158344, + "learning_rate": 9.625174041099403e-06, + "loss": 0.3553, + "step": 2391 + }, + { + "epoch": 0.15, + "grad_norm": 3.00932756878222, + "learning_rate": 9.624787043892349e-06, + "loss": 0.381, + "step": 2392 + }, + { + "epoch": 0.15, + "grad_norm": 2.2037870386656717, + "learning_rate": 9.624399854794664e-06, + "loss": 0.3331, + "step": 2393 + }, + { + "epoch": 0.15, + "grad_norm": 5.667139834720969, + "learning_rate": 9.624012473822417e-06, + "loss": 0.3477, + "step": 2394 + }, + { + "epoch": 0.15, + "grad_norm": 2.955719936010595, + "learning_rate": 9.623624900991676e-06, + "loss": 0.3683, + "step": 2395 + }, + { + "epoch": 0.15, + "grad_norm": 3.5910109694969887, + "learning_rate": 9.623237136318529e-06, + "loss": 0.3476, + "step": 2396 + }, + { + "epoch": 0.15, + "grad_norm": 4.44330547353958, + "learning_rate": 9.62284917981906e-06, + "loss": 0.3623, + "step": 2397 + }, + { + "epoch": 0.15, + "grad_norm": 2.7897407289814233, + "learning_rate": 9.622461031509366e-06, + "loss": 0.3634, + "step": 2398 + }, + { + "epoch": 0.15, + "grad_norm": 2.556571649402105, + "learning_rate": 9.622072691405557e-06, + "loss": 0.3448, + "step": 2399 + }, + { + "epoch": 0.15, + "grad_norm": 2.9887895139227743, + "learning_rate": 9.621684159523739e-06, + "loss": 0.3528, + "step": 2400 + }, + { + "epoch": 0.15, + "grad_norm": 4.848350239825674, + "learning_rate": 9.621295435880038e-06, + "loss": 0.3416, + "step": 2401 + }, + { + "epoch": 0.15, + "grad_norm": 4.979998864781763, + "learning_rate": 9.62090652049058e-06, + "loss": 0.3527, + "step": 2402 + }, + { + "epoch": 0.15, + "grad_norm": 2.741250462625021, + "learning_rate": 9.620517413371503e-06, + "loss": 0.3682, + "step": 2403 + }, + { + "epoch": 0.15, + "grad_norm": 2.4301657781684147, + "learning_rate": 9.62012811453895e-06, + "loss": 0.3721, + "step": 2404 + }, + { + "epoch": 0.15, + "grad_norm": 2.741096371782502, + "learning_rate": 9.619738624009078e-06, + "loss": 0.359, + "step": 2405 + }, + { + "epoch": 0.15, + "grad_norm": 6.18560320803686, + "learning_rate": 9.619348941798044e-06, + "loss": 0.345, + "step": 2406 + }, + { + "epoch": 0.15, + "grad_norm": 3.1573392761835413, + "learning_rate": 9.618959067922019e-06, + "loss": 0.3554, + "step": 2407 + }, + { + "epoch": 0.15, + "grad_norm": 5.900530906012863, + "learning_rate": 9.618569002397176e-06, + "loss": 0.3414, + "step": 2408 + }, + { + "epoch": 0.15, + "grad_norm": 2.8469436713944822, + "learning_rate": 9.618178745239701e-06, + "loss": 0.3523, + "step": 2409 + }, + { + "epoch": 0.15, + "grad_norm": 8.61986651265615, + "learning_rate": 9.617788296465789e-06, + "loss": 0.3599, + "step": 2410 + }, + { + "epoch": 0.15, + "grad_norm": 4.172438301853682, + "learning_rate": 9.617397656091637e-06, + "loss": 0.3371, + "step": 2411 + }, + { + "epoch": 0.15, + "grad_norm": 2.6323418059838244, + "learning_rate": 9.617006824133455e-06, + "loss": 0.3428, + "step": 2412 + }, + { + "epoch": 0.15, + "grad_norm": 2.8478925125461014, + "learning_rate": 9.61661580060746e-06, + "loss": 0.366, + "step": 2413 + }, + { + "epoch": 0.15, + "grad_norm": 9.964172206535194, + "learning_rate": 9.616224585529873e-06, + "loss": 0.3687, + "step": 2414 + }, + { + "epoch": 0.15, + "grad_norm": 3.1357494876913283, + "learning_rate": 9.615833178916932e-06, + "loss": 0.3491, + "step": 2415 + }, + { + "epoch": 0.15, + "grad_norm": 7.510423121848369, + "learning_rate": 9.615441580784873e-06, + "loss": 0.3324, + "step": 2416 + }, + { + "epoch": 0.15, + "grad_norm": 3.4890764516931316, + "learning_rate": 9.615049791149944e-06, + "loss": 0.3411, + "step": 2417 + }, + { + "epoch": 0.15, + "grad_norm": 3.176409776466373, + "learning_rate": 9.614657810028402e-06, + "loss": 0.3902, + "step": 2418 + }, + { + "epoch": 0.15, + "grad_norm": 5.276996482228377, + "learning_rate": 9.614265637436511e-06, + "loss": 0.3526, + "step": 2419 + }, + { + "epoch": 0.15, + "grad_norm": 2.3215123930986823, + "learning_rate": 9.613873273390544e-06, + "loss": 0.3386, + "step": 2420 + }, + { + "epoch": 0.15, + "grad_norm": 3.603729331200219, + "learning_rate": 9.613480717906778e-06, + "loss": 0.3224, + "step": 2421 + }, + { + "epoch": 0.15, + "grad_norm": 2.5575212249276613, + "learning_rate": 9.613087971001502e-06, + "loss": 0.3416, + "step": 2422 + }, + { + "epoch": 0.15, + "grad_norm": 4.24107590249232, + "learning_rate": 9.612695032691013e-06, + "loss": 0.3392, + "step": 2423 + }, + { + "epoch": 0.15, + "grad_norm": 4.012508516067043, + "learning_rate": 9.612301902991615e-06, + "loss": 0.3495, + "step": 2424 + }, + { + "epoch": 0.15, + "grad_norm": 2.909970317601092, + "learning_rate": 9.611908581919618e-06, + "loss": 0.3626, + "step": 2425 + }, + { + "epoch": 0.15, + "grad_norm": 3.230590477505716, + "learning_rate": 9.611515069491342e-06, + "loss": 0.37, + "step": 2426 + }, + { + "epoch": 0.15, + "grad_norm": 3.9191284574500584, + "learning_rate": 9.611121365723115e-06, + "loss": 0.3696, + "step": 2427 + }, + { + "epoch": 0.15, + "grad_norm": 1.8037932763176585, + "learning_rate": 9.610727470631273e-06, + "loss": 0.3561, + "step": 2428 + }, + { + "epoch": 0.15, + "grad_norm": 2.8363081561004386, + "learning_rate": 9.610333384232158e-06, + "loss": 0.3554, + "step": 2429 + }, + { + "epoch": 0.15, + "grad_norm": 7.137207765860285, + "learning_rate": 9.609939106542123e-06, + "loss": 0.3614, + "step": 2430 + }, + { + "epoch": 0.15, + "grad_norm": 3.7826653475158007, + "learning_rate": 9.609544637577524e-06, + "loss": 0.3588, + "step": 2431 + }, + { + "epoch": 0.15, + "grad_norm": 3.958046642291602, + "learning_rate": 9.609149977354733e-06, + "loss": 0.3699, + "step": 2432 + }, + { + "epoch": 0.15, + "grad_norm": 3.2142013270687104, + "learning_rate": 9.608755125890121e-06, + "loss": 0.3577, + "step": 2433 + }, + { + "epoch": 0.15, + "grad_norm": 2.7252581614357747, + "learning_rate": 9.608360083200074e-06, + "loss": 0.3535, + "step": 2434 + }, + { + "epoch": 0.15, + "grad_norm": 4.9804888275379895, + "learning_rate": 9.607964849300981e-06, + "loss": 0.3517, + "step": 2435 + }, + { + "epoch": 0.15, + "grad_norm": 3.1916292446784316, + "learning_rate": 9.607569424209243e-06, + "loss": 0.3332, + "step": 2436 + }, + { + "epoch": 0.15, + "grad_norm": 5.127344557367979, + "learning_rate": 9.607173807941263e-06, + "loss": 0.3447, + "step": 2437 + }, + { + "epoch": 0.15, + "grad_norm": 2.290307544206036, + "learning_rate": 9.60677800051346e-06, + "loss": 0.3452, + "step": 2438 + }, + { + "epoch": 0.15, + "grad_norm": 2.7397610296303663, + "learning_rate": 9.606382001942256e-06, + "loss": 0.3292, + "step": 2439 + }, + { + "epoch": 0.15, + "grad_norm": 6.187356183486216, + "learning_rate": 9.605985812244079e-06, + "loss": 0.3511, + "step": 2440 + }, + { + "epoch": 0.15, + "grad_norm": 2.865261940931568, + "learning_rate": 9.605589431435371e-06, + "loss": 0.3339, + "step": 2441 + }, + { + "epoch": 0.15, + "grad_norm": 3.4530789950310563, + "learning_rate": 9.605192859532577e-06, + "loss": 0.3564, + "step": 2442 + }, + { + "epoch": 0.15, + "grad_norm": 3.7145727865835654, + "learning_rate": 9.604796096552151e-06, + "loss": 0.3763, + "step": 2443 + }, + { + "epoch": 0.15, + "grad_norm": 2.854123911001881, + "learning_rate": 9.604399142510557e-06, + "loss": 0.3717, + "step": 2444 + }, + { + "epoch": 0.15, + "grad_norm": 3.4863620391474894, + "learning_rate": 9.604001997424261e-06, + "loss": 0.3624, + "step": 2445 + }, + { + "epoch": 0.15, + "grad_norm": 1.8969110954008894, + "learning_rate": 9.603604661309747e-06, + "loss": 0.3415, + "step": 2446 + }, + { + "epoch": 0.15, + "grad_norm": 21.640684111149046, + "learning_rate": 9.6032071341835e-06, + "loss": 0.3714, + "step": 2447 + }, + { + "epoch": 0.15, + "grad_norm": 4.682823158355087, + "learning_rate": 9.602809416062011e-06, + "loss": 0.3616, + "step": 2448 + }, + { + "epoch": 0.15, + "grad_norm": 2.4358330308884466, + "learning_rate": 9.602411506961784e-06, + "loss": 0.3422, + "step": 2449 + }, + { + "epoch": 0.15, + "grad_norm": 4.193203650915292, + "learning_rate": 9.602013406899328e-06, + "loss": 0.364, + "step": 2450 + }, + { + "epoch": 0.15, + "grad_norm": 3.2529084116181983, + "learning_rate": 9.601615115891164e-06, + "loss": 0.3626, + "step": 2451 + }, + { + "epoch": 0.15, + "grad_norm": 6.456621999420369, + "learning_rate": 9.601216633953813e-06, + "loss": 0.3517, + "step": 2452 + }, + { + "epoch": 0.15, + "grad_norm": 2.8859910782301066, + "learning_rate": 9.600817961103812e-06, + "loss": 0.383, + "step": 2453 + }, + { + "epoch": 0.15, + "grad_norm": 2.699870473647478, + "learning_rate": 9.600419097357703e-06, + "loss": 0.3199, + "step": 2454 + }, + { + "epoch": 0.15, + "grad_norm": 3.5162567919215273, + "learning_rate": 9.600020042732032e-06, + "loss": 0.3282, + "step": 2455 + }, + { + "epoch": 0.15, + "grad_norm": 1.7765206890967875, + "learning_rate": 9.599620797243361e-06, + "loss": 0.3416, + "step": 2456 + }, + { + "epoch": 0.15, + "grad_norm": 3.122703075758099, + "learning_rate": 9.599221360908252e-06, + "loss": 0.3719, + "step": 2457 + }, + { + "epoch": 0.15, + "grad_norm": 2.4466795969818937, + "learning_rate": 9.59882173374328e-06, + "loss": 0.3518, + "step": 2458 + }, + { + "epoch": 0.15, + "grad_norm": 3.2239881683850915, + "learning_rate": 9.598421915765026e-06, + "loss": 0.3514, + "step": 2459 + }, + { + "epoch": 0.15, + "grad_norm": 2.8689568773882375, + "learning_rate": 9.598021906990079e-06, + "loss": 0.3669, + "step": 2460 + }, + { + "epoch": 0.15, + "grad_norm": 2.9236755291344516, + "learning_rate": 9.597621707435036e-06, + "loss": 0.3709, + "step": 2461 + }, + { + "epoch": 0.15, + "grad_norm": 2.3783941980244863, + "learning_rate": 9.5972213171165e-06, + "loss": 0.3667, + "step": 2462 + }, + { + "epoch": 0.15, + "grad_norm": 2.2096218671045524, + "learning_rate": 9.59682073605109e-06, + "loss": 0.3389, + "step": 2463 + }, + { + "epoch": 0.15, + "grad_norm": 2.333596723331247, + "learning_rate": 9.59641996425542e-06, + "loss": 0.3414, + "step": 2464 + }, + { + "epoch": 0.16, + "grad_norm": 4.812144092220335, + "learning_rate": 9.596019001746122e-06, + "loss": 0.3302, + "step": 2465 + }, + { + "epoch": 0.16, + "grad_norm": 4.86242633104935, + "learning_rate": 9.595617848539834e-06, + "loss": 0.3439, + "step": 2466 + }, + { + "epoch": 0.16, + "grad_norm": 2.066804945858839, + "learning_rate": 9.595216504653197e-06, + "loss": 0.347, + "step": 2467 + }, + { + "epoch": 0.16, + "grad_norm": 3.222549658608177, + "learning_rate": 9.594814970102865e-06, + "loss": 0.3337, + "step": 2468 + }, + { + "epoch": 0.16, + "grad_norm": 2.1083775393708573, + "learning_rate": 9.594413244905499e-06, + "loss": 0.3313, + "step": 2469 + }, + { + "epoch": 0.16, + "grad_norm": 2.299993580756389, + "learning_rate": 9.594011329077765e-06, + "loss": 0.343, + "step": 2470 + }, + { + "epoch": 0.16, + "grad_norm": 5.6414229614092255, + "learning_rate": 9.593609222636344e-06, + "loss": 0.3643, + "step": 2471 + }, + { + "epoch": 0.16, + "grad_norm": 2.23002238737333, + "learning_rate": 9.593206925597916e-06, + "loss": 0.3566, + "step": 2472 + }, + { + "epoch": 0.16, + "grad_norm": 2.3584540697125704, + "learning_rate": 9.592804437979175e-06, + "loss": 0.3536, + "step": 2473 + }, + { + "epoch": 0.16, + "grad_norm": 2.5745857917975177, + "learning_rate": 9.592401759796818e-06, + "loss": 0.3418, + "step": 2474 + }, + { + "epoch": 0.16, + "grad_norm": 3.021634240799391, + "learning_rate": 9.591998891067558e-06, + "loss": 0.3403, + "step": 2475 + }, + { + "epoch": 0.16, + "grad_norm": 3.69197132094815, + "learning_rate": 9.591595831808105e-06, + "loss": 0.334, + "step": 2476 + }, + { + "epoch": 0.16, + "grad_norm": 2.1549057972336096, + "learning_rate": 9.591192582035187e-06, + "loss": 0.346, + "step": 2477 + }, + { + "epoch": 0.16, + "grad_norm": 3.1599262429004344, + "learning_rate": 9.590789141765534e-06, + "loss": 0.3469, + "step": 2478 + }, + { + "epoch": 0.16, + "grad_norm": 3.8986599195289973, + "learning_rate": 9.590385511015885e-06, + "loss": 0.3643, + "step": 2479 + }, + { + "epoch": 0.16, + "grad_norm": 2.043228553739762, + "learning_rate": 9.589981689802988e-06, + "loss": 0.3316, + "step": 2480 + }, + { + "epoch": 0.16, + "grad_norm": 2.9831795768658242, + "learning_rate": 9.589577678143596e-06, + "loss": 0.3271, + "step": 2481 + }, + { + "epoch": 0.16, + "grad_norm": 2.4642250151960425, + "learning_rate": 9.589173476054476e-06, + "loss": 0.3533, + "step": 2482 + }, + { + "epoch": 0.16, + "grad_norm": 7.002826439586185, + "learning_rate": 9.588769083552396e-06, + "loss": 0.3752, + "step": 2483 + }, + { + "epoch": 0.16, + "grad_norm": 2.1579029014664104, + "learning_rate": 9.588364500654137e-06, + "loss": 0.3288, + "step": 2484 + }, + { + "epoch": 0.16, + "grad_norm": 0.905411452132041, + "learning_rate": 9.587959727376485e-06, + "loss": 0.4955, + "step": 2485 + }, + { + "epoch": 0.16, + "grad_norm": 3.367883538218573, + "learning_rate": 9.587554763736236e-06, + "loss": 0.3361, + "step": 2486 + }, + { + "epoch": 0.16, + "grad_norm": 2.728250185300285, + "learning_rate": 9.58714960975019e-06, + "loss": 0.3318, + "step": 2487 + }, + { + "epoch": 0.16, + "grad_norm": 1.701796077702341, + "learning_rate": 9.586744265435158e-06, + "loss": 0.3359, + "step": 2488 + }, + { + "epoch": 0.16, + "grad_norm": 3.930769574319089, + "learning_rate": 9.58633873080796e-06, + "loss": 0.3655, + "step": 2489 + }, + { + "epoch": 0.16, + "grad_norm": 1.8947631935626261, + "learning_rate": 9.585933005885423e-06, + "loss": 0.3234, + "step": 2490 + }, + { + "epoch": 0.16, + "grad_norm": 3.8732111665478097, + "learning_rate": 9.58552709068438e-06, + "loss": 0.3421, + "step": 2491 + }, + { + "epoch": 0.16, + "grad_norm": 2.5178262424855333, + "learning_rate": 9.585120985221672e-06, + "loss": 0.3552, + "step": 2492 + }, + { + "epoch": 0.16, + "grad_norm": 4.138368178767361, + "learning_rate": 9.58471468951415e-06, + "loss": 0.373, + "step": 2493 + }, + { + "epoch": 0.16, + "grad_norm": 2.180941829419746, + "learning_rate": 9.584308203578674e-06, + "loss": 0.3418, + "step": 2494 + }, + { + "epoch": 0.16, + "grad_norm": 1.64143411198683, + "learning_rate": 9.583901527432106e-06, + "loss": 0.3507, + "step": 2495 + }, + { + "epoch": 0.16, + "grad_norm": 3.0856897197685846, + "learning_rate": 9.583494661091324e-06, + "loss": 0.3384, + "step": 2496 + }, + { + "epoch": 0.16, + "grad_norm": 3.774938131234399, + "learning_rate": 9.583087604573206e-06, + "loss": 0.3668, + "step": 2497 + }, + { + "epoch": 0.16, + "grad_norm": 2.7117203023785232, + "learning_rate": 9.582680357894643e-06, + "loss": 0.3745, + "step": 2498 + }, + { + "epoch": 0.16, + "grad_norm": 1.8687654925308135, + "learning_rate": 9.58227292107253e-06, + "loss": 0.3309, + "step": 2499 + }, + { + "epoch": 0.16, + "grad_norm": 3.0499200104652617, + "learning_rate": 9.581865294123778e-06, + "loss": 0.331, + "step": 2500 + }, + { + "epoch": 0.16, + "grad_norm": 2.4279129559113004, + "learning_rate": 9.581457477065294e-06, + "loss": 0.3434, + "step": 2501 + }, + { + "epoch": 0.16, + "grad_norm": 1.934042091373923, + "learning_rate": 9.581049469914004e-06, + "loss": 0.3533, + "step": 2502 + }, + { + "epoch": 0.16, + "grad_norm": 1.9453778743274504, + "learning_rate": 9.580641272686833e-06, + "loss": 0.3558, + "step": 2503 + }, + { + "epoch": 0.16, + "grad_norm": 0.8543505270470636, + "learning_rate": 9.58023288540072e-06, + "loss": 0.5233, + "step": 2504 + }, + { + "epoch": 0.16, + "grad_norm": 2.2962050006236105, + "learning_rate": 9.57982430807261e-06, + "loss": 0.3444, + "step": 2505 + }, + { + "epoch": 0.16, + "grad_norm": 5.39168303113425, + "learning_rate": 9.579415540719453e-06, + "loss": 0.36, + "step": 2506 + }, + { + "epoch": 0.16, + "grad_norm": 1.6872424447087488, + "learning_rate": 9.579006583358212e-06, + "loss": 0.3264, + "step": 2507 + }, + { + "epoch": 0.16, + "grad_norm": 2.727876161396312, + "learning_rate": 9.578597436005854e-06, + "loss": 0.3498, + "step": 2508 + }, + { + "epoch": 0.16, + "grad_norm": 1.6006231337472312, + "learning_rate": 9.578188098679357e-06, + "loss": 0.3436, + "step": 2509 + }, + { + "epoch": 0.16, + "grad_norm": 2.2575156671701886, + "learning_rate": 9.577778571395704e-06, + "loss": 0.3469, + "step": 2510 + }, + { + "epoch": 0.16, + "grad_norm": 2.1869701324813695, + "learning_rate": 9.577368854171887e-06, + "loss": 0.365, + "step": 2511 + }, + { + "epoch": 0.16, + "grad_norm": 2.1311656418139058, + "learning_rate": 9.576958947024906e-06, + "loss": 0.347, + "step": 2512 + }, + { + "epoch": 0.16, + "grad_norm": 2.3141995316655706, + "learning_rate": 9.576548849971767e-06, + "loss": 0.3475, + "step": 2513 + }, + { + "epoch": 0.16, + "grad_norm": 2.3770366881392695, + "learning_rate": 9.576138563029489e-06, + "loss": 0.3466, + "step": 2514 + }, + { + "epoch": 0.16, + "grad_norm": 1.7840172310317712, + "learning_rate": 9.575728086215093e-06, + "loss": 0.3618, + "step": 2515 + }, + { + "epoch": 0.16, + "grad_norm": 4.222578822306946, + "learning_rate": 9.575317419545611e-06, + "loss": 0.3115, + "step": 2516 + }, + { + "epoch": 0.16, + "grad_norm": 2.216072415252696, + "learning_rate": 9.574906563038084e-06, + "loss": 0.3426, + "step": 2517 + }, + { + "epoch": 0.16, + "grad_norm": 1.4519382156869867, + "learning_rate": 9.574495516709557e-06, + "loss": 0.336, + "step": 2518 + }, + { + "epoch": 0.16, + "grad_norm": 3.205717967051008, + "learning_rate": 9.574084280577085e-06, + "loss": 0.34, + "step": 2519 + }, + { + "epoch": 0.16, + "grad_norm": 1.9249991143022704, + "learning_rate": 9.573672854657734e-06, + "loss": 0.3676, + "step": 2520 + }, + { + "epoch": 0.16, + "grad_norm": 1.6796598181338873, + "learning_rate": 9.57326123896857e-06, + "loss": 0.3558, + "step": 2521 + }, + { + "epoch": 0.16, + "grad_norm": 1.9809568005933895, + "learning_rate": 9.572849433526677e-06, + "loss": 0.3659, + "step": 2522 + }, + { + "epoch": 0.16, + "grad_norm": 3.563760658607657, + "learning_rate": 9.572437438349136e-06, + "loss": 0.3477, + "step": 2523 + }, + { + "epoch": 0.16, + "grad_norm": 1.9663139792349593, + "learning_rate": 9.572025253453045e-06, + "loss": 0.3672, + "step": 2524 + }, + { + "epoch": 0.16, + "grad_norm": 3.103341722871928, + "learning_rate": 9.571612878855505e-06, + "loss": 0.339, + "step": 2525 + }, + { + "epoch": 0.16, + "grad_norm": 3.9121819361216263, + "learning_rate": 9.571200314573628e-06, + "loss": 0.3688, + "step": 2526 + }, + { + "epoch": 0.16, + "grad_norm": 2.0710557211190554, + "learning_rate": 9.57078756062453e-06, + "loss": 0.3456, + "step": 2527 + }, + { + "epoch": 0.16, + "grad_norm": 3.092516506635837, + "learning_rate": 9.570374617025336e-06, + "loss": 0.3876, + "step": 2528 + }, + { + "epoch": 0.16, + "grad_norm": 2.5375152893594675, + "learning_rate": 9.569961483793183e-06, + "loss": 0.3618, + "step": 2529 + }, + { + "epoch": 0.16, + "grad_norm": 1.9619645359300384, + "learning_rate": 9.56954816094521e-06, + "loss": 0.3596, + "step": 2530 + }, + { + "epoch": 0.16, + "grad_norm": 3.156338035702917, + "learning_rate": 9.569134648498568e-06, + "loss": 0.3652, + "step": 2531 + }, + { + "epoch": 0.16, + "grad_norm": 2.1336542173655793, + "learning_rate": 9.568720946470414e-06, + "loss": 0.3316, + "step": 2532 + }, + { + "epoch": 0.16, + "grad_norm": 1.9472305196009703, + "learning_rate": 9.568307054877911e-06, + "loss": 0.3587, + "step": 2533 + }, + { + "epoch": 0.16, + "grad_norm": 1.5438049737756938, + "learning_rate": 9.567892973738236e-06, + "loss": 0.3428, + "step": 2534 + }, + { + "epoch": 0.16, + "grad_norm": 1.768489667835759, + "learning_rate": 9.567478703068567e-06, + "loss": 0.3454, + "step": 2535 + }, + { + "epoch": 0.16, + "grad_norm": 1.7685372092996738, + "learning_rate": 9.567064242886095e-06, + "loss": 0.3357, + "step": 2536 + }, + { + "epoch": 0.16, + "grad_norm": 2.305059977430131, + "learning_rate": 9.566649593208015e-06, + "loss": 0.3534, + "step": 2537 + }, + { + "epoch": 0.16, + "grad_norm": 2.666618246425068, + "learning_rate": 9.566234754051531e-06, + "loss": 0.3724, + "step": 2538 + }, + { + "epoch": 0.16, + "grad_norm": 3.009465063885724, + "learning_rate": 9.565819725433857e-06, + "loss": 0.3502, + "step": 2539 + }, + { + "epoch": 0.16, + "grad_norm": 1.48785389289798, + "learning_rate": 9.565404507372213e-06, + "loss": 0.33, + "step": 2540 + }, + { + "epoch": 0.16, + "grad_norm": 9.241466167236867, + "learning_rate": 9.564989099883828e-06, + "loss": 0.3404, + "step": 2541 + }, + { + "epoch": 0.16, + "grad_norm": 2.588467178710788, + "learning_rate": 9.564573502985936e-06, + "loss": 0.3305, + "step": 2542 + }, + { + "epoch": 0.16, + "grad_norm": 2.8390386030689703, + "learning_rate": 9.564157716695783e-06, + "loss": 0.3479, + "step": 2543 + }, + { + "epoch": 0.16, + "grad_norm": 1.7269386059475975, + "learning_rate": 9.563741741030616e-06, + "loss": 0.3628, + "step": 2544 + }, + { + "epoch": 0.16, + "grad_norm": 2.156759376096128, + "learning_rate": 9.563325576007702e-06, + "loss": 0.3594, + "step": 2545 + }, + { + "epoch": 0.16, + "grad_norm": 1.6332586990458544, + "learning_rate": 9.562909221644303e-06, + "loss": 0.3368, + "step": 2546 + }, + { + "epoch": 0.16, + "grad_norm": 3.44282817679179, + "learning_rate": 9.562492677957695e-06, + "loss": 0.3506, + "step": 2547 + }, + { + "epoch": 0.16, + "grad_norm": 2.377567667525139, + "learning_rate": 9.562075944965163e-06, + "loss": 0.3544, + "step": 2548 + }, + { + "epoch": 0.16, + "grad_norm": 5.488996642594074, + "learning_rate": 9.561659022683996e-06, + "loss": 0.3544, + "step": 2549 + }, + { + "epoch": 0.16, + "grad_norm": 2.429938374982071, + "learning_rate": 9.561241911131494e-06, + "loss": 0.3443, + "step": 2550 + }, + { + "epoch": 0.16, + "grad_norm": 1.8974550576283076, + "learning_rate": 9.560824610324964e-06, + "loss": 0.3361, + "step": 2551 + }, + { + "epoch": 0.16, + "grad_norm": 3.1326873919114324, + "learning_rate": 9.560407120281718e-06, + "loss": 0.359, + "step": 2552 + }, + { + "epoch": 0.16, + "grad_norm": 2.242227199761406, + "learning_rate": 9.559989441019081e-06, + "loss": 0.3543, + "step": 2553 + }, + { + "epoch": 0.16, + "grad_norm": 3.0913298144477195, + "learning_rate": 9.559571572554385e-06, + "loss": 0.3724, + "step": 2554 + }, + { + "epoch": 0.16, + "grad_norm": 1.890813144751611, + "learning_rate": 9.559153514904964e-06, + "loss": 0.3664, + "step": 2555 + }, + { + "epoch": 0.16, + "grad_norm": 3.425751552170047, + "learning_rate": 9.558735268088167e-06, + "loss": 0.3593, + "step": 2556 + }, + { + "epoch": 0.16, + "grad_norm": 2.7633058242306126, + "learning_rate": 9.558316832121346e-06, + "loss": 0.3535, + "step": 2557 + }, + { + "epoch": 0.16, + "grad_norm": 2.5866897233691692, + "learning_rate": 9.55789820702186e-06, + "loss": 0.3379, + "step": 2558 + }, + { + "epoch": 0.16, + "grad_norm": 1.733393093973504, + "learning_rate": 9.557479392807085e-06, + "loss": 0.3447, + "step": 2559 + }, + { + "epoch": 0.16, + "grad_norm": 2.073121911444511, + "learning_rate": 9.557060389494395e-06, + "loss": 0.3395, + "step": 2560 + }, + { + "epoch": 0.16, + "grad_norm": 13.08060753241703, + "learning_rate": 9.556641197101173e-06, + "loss": 0.3654, + "step": 2561 + }, + { + "epoch": 0.16, + "grad_norm": 2.8624715183558385, + "learning_rate": 9.556221815644818e-06, + "loss": 0.3428, + "step": 2562 + }, + { + "epoch": 0.16, + "grad_norm": 2.209289265708224, + "learning_rate": 9.555802245142724e-06, + "loss": 0.3388, + "step": 2563 + }, + { + "epoch": 0.16, + "grad_norm": 1.6628985536653489, + "learning_rate": 9.555382485612304e-06, + "loss": 0.3393, + "step": 2564 + }, + { + "epoch": 0.16, + "grad_norm": 2.5411508399725578, + "learning_rate": 9.554962537070973e-06, + "loss": 0.3606, + "step": 2565 + }, + { + "epoch": 0.16, + "grad_norm": 6.727935169955974, + "learning_rate": 9.554542399536156e-06, + "loss": 0.3413, + "step": 2566 + }, + { + "epoch": 0.16, + "grad_norm": 2.3995116338867413, + "learning_rate": 9.554122073025284e-06, + "loss": 0.338, + "step": 2567 + }, + { + "epoch": 0.16, + "grad_norm": 2.866370812255361, + "learning_rate": 9.5537015575558e-06, + "loss": 0.3582, + "step": 2568 + }, + { + "epoch": 0.16, + "grad_norm": 3.7772709331670047, + "learning_rate": 9.553280853145148e-06, + "loss": 0.3868, + "step": 2569 + }, + { + "epoch": 0.16, + "grad_norm": 7.133986095637687, + "learning_rate": 9.552859959810787e-06, + "loss": 0.3481, + "step": 2570 + }, + { + "epoch": 0.16, + "grad_norm": 2.649019599880264, + "learning_rate": 9.55243887757018e-06, + "loss": 0.3532, + "step": 2571 + }, + { + "epoch": 0.16, + "grad_norm": 3.4470511545477005, + "learning_rate": 9.552017606440798e-06, + "loss": 0.3649, + "step": 2572 + }, + { + "epoch": 0.16, + "grad_norm": 2.2246959892360683, + "learning_rate": 9.551596146440119e-06, + "loss": 0.3537, + "step": 2573 + }, + { + "epoch": 0.16, + "grad_norm": 3.2384490679719873, + "learning_rate": 9.551174497585632e-06, + "loss": 0.3638, + "step": 2574 + }, + { + "epoch": 0.16, + "grad_norm": 0.7628758447553016, + "learning_rate": 9.550752659894831e-06, + "loss": 0.4955, + "step": 2575 + }, + { + "epoch": 0.16, + "grad_norm": 1.6820409910663758, + "learning_rate": 9.550330633385218e-06, + "loss": 0.3521, + "step": 2576 + }, + { + "epoch": 0.16, + "grad_norm": 8.116288676880126, + "learning_rate": 9.549908418074307e-06, + "loss": 0.3675, + "step": 2577 + }, + { + "epoch": 0.16, + "grad_norm": 2.8147406939372055, + "learning_rate": 9.549486013979614e-06, + "loss": 0.355, + "step": 2578 + }, + { + "epoch": 0.16, + "grad_norm": 2.07027267824064, + "learning_rate": 9.549063421118664e-06, + "loss": 0.3554, + "step": 2579 + }, + { + "epoch": 0.16, + "grad_norm": 3.179566494282651, + "learning_rate": 9.548640639508994e-06, + "loss": 0.3263, + "step": 2580 + }, + { + "epoch": 0.16, + "grad_norm": 3.950898328802244, + "learning_rate": 9.548217669168144e-06, + "loss": 0.3655, + "step": 2581 + }, + { + "epoch": 0.16, + "grad_norm": 1.6431588628993141, + "learning_rate": 9.547794510113663e-06, + "loss": 0.3521, + "step": 2582 + }, + { + "epoch": 0.16, + "grad_norm": 3.463704793114517, + "learning_rate": 9.547371162363112e-06, + "loss": 0.3729, + "step": 2583 + }, + { + "epoch": 0.16, + "grad_norm": 2.530652400296561, + "learning_rate": 9.546947625934055e-06, + "loss": 0.3831, + "step": 2584 + }, + { + "epoch": 0.16, + "grad_norm": 2.4571569208762076, + "learning_rate": 9.546523900844063e-06, + "loss": 0.3703, + "step": 2585 + }, + { + "epoch": 0.16, + "grad_norm": 2.049199584590343, + "learning_rate": 9.54609998711072e-06, + "loss": 0.3654, + "step": 2586 + }, + { + "epoch": 0.16, + "grad_norm": 2.087287113882566, + "learning_rate": 9.545675884751611e-06, + "loss": 0.3658, + "step": 2587 + }, + { + "epoch": 0.16, + "grad_norm": 2.5926436517104823, + "learning_rate": 9.545251593784339e-06, + "loss": 0.3583, + "step": 2588 + }, + { + "epoch": 0.16, + "grad_norm": 2.2027156831411783, + "learning_rate": 9.544827114226502e-06, + "loss": 0.372, + "step": 2589 + }, + { + "epoch": 0.16, + "grad_norm": 0.8152647557611538, + "learning_rate": 9.544402446095718e-06, + "loss": 0.5014, + "step": 2590 + }, + { + "epoch": 0.16, + "grad_norm": 1.9393017128013679, + "learning_rate": 9.543977589409603e-06, + "loss": 0.3533, + "step": 2591 + }, + { + "epoch": 0.16, + "grad_norm": 2.3638423578958885, + "learning_rate": 9.543552544185788e-06, + "loss": 0.3458, + "step": 2592 + }, + { + "epoch": 0.16, + "grad_norm": 3.1573274962314226, + "learning_rate": 9.54312731044191e-06, + "loss": 0.3417, + "step": 2593 + }, + { + "epoch": 0.16, + "grad_norm": 5.6799178455487676, + "learning_rate": 9.542701888195606e-06, + "loss": 0.3437, + "step": 2594 + }, + { + "epoch": 0.16, + "grad_norm": 1.6581331359518916, + "learning_rate": 9.542276277464534e-06, + "loss": 0.3349, + "step": 2595 + }, + { + "epoch": 0.16, + "grad_norm": 6.012567633353498, + "learning_rate": 9.54185047826635e-06, + "loss": 0.3543, + "step": 2596 + }, + { + "epoch": 0.16, + "grad_norm": 1.7440730857683153, + "learning_rate": 9.541424490618724e-06, + "loss": 0.3475, + "step": 2597 + }, + { + "epoch": 0.16, + "grad_norm": 2.0455347129235357, + "learning_rate": 9.540998314539327e-06, + "loss": 0.3222, + "step": 2598 + }, + { + "epoch": 0.16, + "grad_norm": 3.5701933075545593, + "learning_rate": 9.540571950045847e-06, + "loss": 0.3627, + "step": 2599 + }, + { + "epoch": 0.16, + "grad_norm": 4.105918678269197, + "learning_rate": 9.540145397155972e-06, + "loss": 0.3538, + "step": 2600 + }, + { + "epoch": 0.16, + "grad_norm": 2.1305296537632055, + "learning_rate": 9.539718655887398e-06, + "loss": 0.3482, + "step": 2601 + }, + { + "epoch": 0.16, + "grad_norm": 4.3103267626157065, + "learning_rate": 9.539291726257835e-06, + "loss": 0.3614, + "step": 2602 + }, + { + "epoch": 0.16, + "grad_norm": 3.0582360901291676, + "learning_rate": 9.538864608284994e-06, + "loss": 0.3414, + "step": 2603 + }, + { + "epoch": 0.16, + "grad_norm": 1.8434645933427474, + "learning_rate": 9.5384373019866e-06, + "loss": 0.3451, + "step": 2604 + }, + { + "epoch": 0.16, + "grad_norm": 2.1976945757225415, + "learning_rate": 9.538009807380381e-06, + "loss": 0.3503, + "step": 2605 + }, + { + "epoch": 0.16, + "grad_norm": 3.7150175096102385, + "learning_rate": 9.537582124484074e-06, + "loss": 0.348, + "step": 2606 + }, + { + "epoch": 0.16, + "grad_norm": 1.6675833627117203, + "learning_rate": 9.537154253315426e-06, + "loss": 0.3355, + "step": 2607 + }, + { + "epoch": 0.16, + "grad_norm": 4.916166557382315, + "learning_rate": 9.53672619389219e-06, + "loss": 0.3631, + "step": 2608 + }, + { + "epoch": 0.16, + "grad_norm": 2.8562654869946944, + "learning_rate": 9.536297946232124e-06, + "loss": 0.3449, + "step": 2609 + }, + { + "epoch": 0.16, + "grad_norm": 2.148763267097097, + "learning_rate": 9.535869510353e-06, + "loss": 0.3324, + "step": 2610 + }, + { + "epoch": 0.16, + "grad_norm": 2.035107214695197, + "learning_rate": 9.535440886272592e-06, + "loss": 0.3491, + "step": 2611 + }, + { + "epoch": 0.16, + "grad_norm": 7.074690254764154, + "learning_rate": 9.535012074008688e-06, + "loss": 0.3475, + "step": 2612 + }, + { + "epoch": 0.16, + "grad_norm": 3.2345198467273626, + "learning_rate": 9.534583073579076e-06, + "loss": 0.3642, + "step": 2613 + }, + { + "epoch": 0.16, + "grad_norm": 5.0103357340379375, + "learning_rate": 9.534153885001557e-06, + "loss": 0.3656, + "step": 2614 + }, + { + "epoch": 0.16, + "grad_norm": 3.4933549283783316, + "learning_rate": 9.533724508293942e-06, + "loss": 0.3364, + "step": 2615 + }, + { + "epoch": 0.16, + "grad_norm": 5.846978499674889, + "learning_rate": 9.533294943474044e-06, + "loss": 0.3725, + "step": 2616 + }, + { + "epoch": 0.16, + "grad_norm": 4.751904603471427, + "learning_rate": 9.532865190559686e-06, + "loss": 0.3667, + "step": 2617 + }, + { + "epoch": 0.16, + "grad_norm": 2.5832888425694582, + "learning_rate": 9.532435249568701e-06, + "loss": 0.3582, + "step": 2618 + }, + { + "epoch": 0.16, + "grad_norm": 1.8667365545365433, + "learning_rate": 9.532005120518927e-06, + "loss": 0.3301, + "step": 2619 + }, + { + "epoch": 0.16, + "grad_norm": 3.050198970183284, + "learning_rate": 9.53157480342821e-06, + "loss": 0.328, + "step": 2620 + }, + { + "epoch": 0.16, + "grad_norm": 1.8338078372465996, + "learning_rate": 9.531144298314406e-06, + "loss": 0.3569, + "step": 2621 + }, + { + "epoch": 0.16, + "grad_norm": 2.0190805522209225, + "learning_rate": 9.530713605195377e-06, + "loss": 0.3611, + "step": 2622 + }, + { + "epoch": 0.16, + "grad_norm": 2.744709556009685, + "learning_rate": 9.530282724088991e-06, + "loss": 0.3411, + "step": 2623 + }, + { + "epoch": 0.17, + "grad_norm": 2.7538256351208994, + "learning_rate": 9.52985165501313e-06, + "loss": 0.3534, + "step": 2624 + }, + { + "epoch": 0.17, + "grad_norm": 5.853596636209183, + "learning_rate": 9.529420397985678e-06, + "loss": 0.3498, + "step": 2625 + }, + { + "epoch": 0.17, + "grad_norm": 0.8416926794641617, + "learning_rate": 9.528988953024528e-06, + "loss": 0.5177, + "step": 2626 + }, + { + "epoch": 0.17, + "grad_norm": 2.037586542004035, + "learning_rate": 9.528557320147583e-06, + "loss": 0.3797, + "step": 2627 + }, + { + "epoch": 0.17, + "grad_norm": 1.91942641742022, + "learning_rate": 9.52812549937275e-06, + "loss": 0.3596, + "step": 2628 + }, + { + "epoch": 0.17, + "grad_norm": 1.939099554594765, + "learning_rate": 9.527693490717946e-06, + "loss": 0.3371, + "step": 2629 + }, + { + "epoch": 0.17, + "grad_norm": 2.1548338000960006, + "learning_rate": 9.527261294201098e-06, + "loss": 0.3316, + "step": 2630 + }, + { + "epoch": 0.17, + "grad_norm": 3.740173607989743, + "learning_rate": 9.52682890984014e-06, + "loss": 0.3512, + "step": 2631 + }, + { + "epoch": 0.17, + "grad_norm": 3.152701064283289, + "learning_rate": 9.526396337653008e-06, + "loss": 0.3423, + "step": 2632 + }, + { + "epoch": 0.17, + "grad_norm": 3.988591564166526, + "learning_rate": 9.525963577657651e-06, + "loss": 0.3387, + "step": 2633 + }, + { + "epoch": 0.17, + "grad_norm": 2.5276796406272264, + "learning_rate": 9.525530629872027e-06, + "loss": 0.3593, + "step": 2634 + }, + { + "epoch": 0.17, + "grad_norm": 2.950624673873196, + "learning_rate": 9.5250974943141e-06, + "loss": 0.3651, + "step": 2635 + }, + { + "epoch": 0.17, + "grad_norm": 2.785631448046799, + "learning_rate": 9.524664171001839e-06, + "loss": 0.3434, + "step": 2636 + }, + { + "epoch": 0.17, + "grad_norm": 4.6251489728593995, + "learning_rate": 9.524230659953227e-06, + "loss": 0.34, + "step": 2637 + }, + { + "epoch": 0.17, + "grad_norm": 2.6580689955724477, + "learning_rate": 9.523796961186247e-06, + "loss": 0.3601, + "step": 2638 + }, + { + "epoch": 0.17, + "grad_norm": 2.3618623929564544, + "learning_rate": 9.523363074718895e-06, + "loss": 0.3278, + "step": 2639 + }, + { + "epoch": 0.17, + "grad_norm": 3.126048441695036, + "learning_rate": 9.522929000569177e-06, + "loss": 0.3733, + "step": 2640 + }, + { + "epoch": 0.17, + "grad_norm": 2.632442609393323, + "learning_rate": 9.522494738755099e-06, + "loss": 0.349, + "step": 2641 + }, + { + "epoch": 0.17, + "grad_norm": 2.849686572228049, + "learning_rate": 9.522060289294683e-06, + "loss": 0.3543, + "step": 2642 + }, + { + "epoch": 0.17, + "grad_norm": 1.8878210352938096, + "learning_rate": 9.521625652205954e-06, + "loss": 0.3485, + "step": 2643 + }, + { + "epoch": 0.17, + "grad_norm": 1.7771626486961767, + "learning_rate": 9.521190827506944e-06, + "loss": 0.3576, + "step": 2644 + }, + { + "epoch": 0.17, + "grad_norm": 1.8613438572064482, + "learning_rate": 9.520755815215697e-06, + "loss": 0.3407, + "step": 2645 + }, + { + "epoch": 0.17, + "grad_norm": 2.6765864240324504, + "learning_rate": 9.52032061535026e-06, + "loss": 0.3447, + "step": 2646 + }, + { + "epoch": 0.17, + "grad_norm": 2.5914867880010433, + "learning_rate": 9.519885227928693e-06, + "loss": 0.3449, + "step": 2647 + }, + { + "epoch": 0.17, + "grad_norm": 1.9024737081700733, + "learning_rate": 9.519449652969059e-06, + "loss": 0.3508, + "step": 2648 + }, + { + "epoch": 0.17, + "grad_norm": 1.8534869736610464, + "learning_rate": 9.519013890489433e-06, + "loss": 0.3361, + "step": 2649 + }, + { + "epoch": 0.17, + "grad_norm": 4.054828248159896, + "learning_rate": 9.518577940507893e-06, + "loss": 0.3442, + "step": 2650 + }, + { + "epoch": 0.17, + "grad_norm": 2.422687525071993, + "learning_rate": 9.518141803042528e-06, + "loss": 0.3325, + "step": 2651 + }, + { + "epoch": 0.17, + "grad_norm": 5.20343090953775, + "learning_rate": 9.517705478111434e-06, + "loss": 0.3451, + "step": 2652 + }, + { + "epoch": 0.17, + "grad_norm": 2.7751071814947603, + "learning_rate": 9.517268965732716e-06, + "loss": 0.3689, + "step": 2653 + }, + { + "epoch": 0.17, + "grad_norm": 3.3940516617122967, + "learning_rate": 9.516832265924485e-06, + "loss": 0.365, + "step": 2654 + }, + { + "epoch": 0.17, + "grad_norm": 2.354668871640598, + "learning_rate": 9.516395378704862e-06, + "loss": 0.3412, + "step": 2655 + }, + { + "epoch": 0.17, + "grad_norm": 2.353812747703441, + "learning_rate": 9.51595830409197e-06, + "loss": 0.3643, + "step": 2656 + }, + { + "epoch": 0.17, + "grad_norm": 2.076640095928003, + "learning_rate": 9.515521042103948e-06, + "loss": 0.3303, + "step": 2657 + }, + { + "epoch": 0.17, + "grad_norm": 4.880974707584823, + "learning_rate": 9.515083592758939e-06, + "loss": 0.3404, + "step": 2658 + }, + { + "epoch": 0.17, + "grad_norm": 12.592343016773818, + "learning_rate": 9.514645956075088e-06, + "loss": 0.3489, + "step": 2659 + }, + { + "epoch": 0.17, + "grad_norm": 3.3784402938981795, + "learning_rate": 9.514208132070561e-06, + "loss": 0.3714, + "step": 2660 + }, + { + "epoch": 0.17, + "grad_norm": 2.4593005782718267, + "learning_rate": 9.513770120763518e-06, + "loss": 0.3369, + "step": 2661 + }, + { + "epoch": 0.17, + "grad_norm": 2.361727502900006, + "learning_rate": 9.513331922172138e-06, + "loss": 0.3442, + "step": 2662 + }, + { + "epoch": 0.17, + "grad_norm": 3.366551211810116, + "learning_rate": 9.512893536314598e-06, + "loss": 0.3334, + "step": 2663 + }, + { + "epoch": 0.17, + "grad_norm": 2.0336864622878514, + "learning_rate": 9.51245496320909e-06, + "loss": 0.3557, + "step": 2664 + }, + { + "epoch": 0.17, + "grad_norm": 3.3405514413183313, + "learning_rate": 9.51201620287381e-06, + "loss": 0.3605, + "step": 2665 + }, + { + "epoch": 0.17, + "grad_norm": 1.6840921552846995, + "learning_rate": 9.511577255326965e-06, + "loss": 0.3413, + "step": 2666 + }, + { + "epoch": 0.17, + "grad_norm": 2.749522367654022, + "learning_rate": 9.511138120586765e-06, + "loss": 0.3463, + "step": 2667 + }, + { + "epoch": 0.17, + "grad_norm": 1.6912371977457925, + "learning_rate": 9.510698798671432e-06, + "loss": 0.3545, + "step": 2668 + }, + { + "epoch": 0.17, + "grad_norm": 2.436818083944917, + "learning_rate": 9.510259289599195e-06, + "loss": 0.3544, + "step": 2669 + }, + { + "epoch": 0.17, + "grad_norm": 2.82972619419692, + "learning_rate": 9.509819593388288e-06, + "loss": 0.3525, + "step": 2670 + }, + { + "epoch": 0.17, + "grad_norm": 7.082100565723151, + "learning_rate": 9.509379710056954e-06, + "loss": 0.3475, + "step": 2671 + }, + { + "epoch": 0.17, + "grad_norm": 0.864480199932987, + "learning_rate": 9.508939639623448e-06, + "loss": 0.5275, + "step": 2672 + }, + { + "epoch": 0.17, + "grad_norm": 2.9609948489310076, + "learning_rate": 9.508499382106028e-06, + "loss": 0.3582, + "step": 2673 + }, + { + "epoch": 0.17, + "grad_norm": 3.7104948444024886, + "learning_rate": 9.508058937522959e-06, + "loss": 0.3544, + "step": 2674 + }, + { + "epoch": 0.17, + "grad_norm": 2.8965084469653606, + "learning_rate": 9.50761830589252e-06, + "loss": 0.3704, + "step": 2675 + }, + { + "epoch": 0.17, + "grad_norm": 2.239225738722652, + "learning_rate": 9.507177487232989e-06, + "loss": 0.3597, + "step": 2676 + }, + { + "epoch": 0.17, + "grad_norm": 3.542830523225034, + "learning_rate": 9.506736481562659e-06, + "loss": 0.3436, + "step": 2677 + }, + { + "epoch": 0.17, + "grad_norm": 2.401912160955188, + "learning_rate": 9.506295288899827e-06, + "loss": 0.3635, + "step": 2678 + }, + { + "epoch": 0.17, + "grad_norm": 2.600686461753694, + "learning_rate": 9.5058539092628e-06, + "loss": 0.3507, + "step": 2679 + }, + { + "epoch": 0.17, + "grad_norm": 0.6601039617645533, + "learning_rate": 9.505412342669891e-06, + "loss": 0.4829, + "step": 2680 + }, + { + "epoch": 0.17, + "grad_norm": 2.66433923637712, + "learning_rate": 9.504970589139422e-06, + "loss": 0.3541, + "step": 2681 + }, + { + "epoch": 0.17, + "grad_norm": 3.709006116533518, + "learning_rate": 9.50452864868972e-06, + "loss": 0.3532, + "step": 2682 + }, + { + "epoch": 0.17, + "grad_norm": 4.122040693803943, + "learning_rate": 9.504086521339124e-06, + "loss": 0.342, + "step": 2683 + }, + { + "epoch": 0.17, + "grad_norm": 2.271705062000997, + "learning_rate": 9.50364420710598e-06, + "loss": 0.3539, + "step": 2684 + }, + { + "epoch": 0.17, + "grad_norm": 3.0816573387018766, + "learning_rate": 9.503201706008636e-06, + "loss": 0.3534, + "step": 2685 + }, + { + "epoch": 0.17, + "grad_norm": 4.8107531371915115, + "learning_rate": 9.502759018065455e-06, + "loss": 0.3736, + "step": 2686 + }, + { + "epoch": 0.17, + "grad_norm": 2.9865703063707727, + "learning_rate": 9.502316143294806e-06, + "loss": 0.3456, + "step": 2687 + }, + { + "epoch": 0.17, + "grad_norm": 7.389427019947758, + "learning_rate": 9.501873081715062e-06, + "loss": 0.3439, + "step": 2688 + }, + { + "epoch": 0.17, + "grad_norm": 3.81338473799617, + "learning_rate": 9.501429833344608e-06, + "loss": 0.3494, + "step": 2689 + }, + { + "epoch": 0.17, + "grad_norm": 5.352215648096498, + "learning_rate": 9.500986398201835e-06, + "loss": 0.3599, + "step": 2690 + }, + { + "epoch": 0.17, + "grad_norm": 5.099168036639196, + "learning_rate": 9.50054277630514e-06, + "loss": 0.3332, + "step": 2691 + }, + { + "epoch": 0.17, + "grad_norm": 2.577477281100561, + "learning_rate": 9.500098967672933e-06, + "loss": 0.3376, + "step": 2692 + }, + { + "epoch": 0.17, + "grad_norm": 2.00425205734045, + "learning_rate": 9.499654972323627e-06, + "loss": 0.3238, + "step": 2693 + }, + { + "epoch": 0.17, + "grad_norm": 2.4047158583404316, + "learning_rate": 9.499210790275642e-06, + "loss": 0.3317, + "step": 2694 + }, + { + "epoch": 0.17, + "grad_norm": 6.481222274156482, + "learning_rate": 9.498766421547412e-06, + "loss": 0.3344, + "step": 2695 + }, + { + "epoch": 0.17, + "grad_norm": 8.760444148294617, + "learning_rate": 9.498321866157372e-06, + "loss": 0.3257, + "step": 2696 + }, + { + "epoch": 0.17, + "grad_norm": 3.342022172020348, + "learning_rate": 9.497877124123967e-06, + "loss": 0.3182, + "step": 2697 + }, + { + "epoch": 0.17, + "grad_norm": 2.1425706342728876, + "learning_rate": 9.497432195465652e-06, + "loss": 0.3347, + "step": 2698 + }, + { + "epoch": 0.17, + "grad_norm": 4.284001068161913, + "learning_rate": 9.496987080200886e-06, + "loss": 0.3541, + "step": 2699 + }, + { + "epoch": 0.17, + "grad_norm": 2.1254002365975726, + "learning_rate": 9.496541778348138e-06, + "loss": 0.344, + "step": 2700 + }, + { + "epoch": 0.17, + "grad_norm": 3.9013143896783475, + "learning_rate": 9.496096289925886e-06, + "loss": 0.3475, + "step": 2701 + }, + { + "epoch": 0.17, + "grad_norm": 2.3387279893590986, + "learning_rate": 9.495650614952612e-06, + "loss": 0.332, + "step": 2702 + }, + { + "epoch": 0.17, + "grad_norm": 13.953452241927364, + "learning_rate": 9.495204753446809e-06, + "loss": 0.3452, + "step": 2703 + }, + { + "epoch": 0.17, + "grad_norm": 9.111188700003503, + "learning_rate": 9.494758705426978e-06, + "loss": 0.3236, + "step": 2704 + }, + { + "epoch": 0.17, + "grad_norm": 7.7961239694588595, + "learning_rate": 9.494312470911622e-06, + "loss": 0.3573, + "step": 2705 + }, + { + "epoch": 0.17, + "grad_norm": 2.9370592498352033, + "learning_rate": 9.493866049919261e-06, + "loss": 0.3477, + "step": 2706 + }, + { + "epoch": 0.17, + "grad_norm": 7.006058893233341, + "learning_rate": 9.493419442468414e-06, + "loss": 0.3378, + "step": 2707 + }, + { + "epoch": 0.17, + "grad_norm": 3.65946956367097, + "learning_rate": 9.492972648577616e-06, + "loss": 0.3736, + "step": 2708 + }, + { + "epoch": 0.17, + "grad_norm": 4.3844249311446575, + "learning_rate": 9.4925256682654e-06, + "loss": 0.3471, + "step": 2709 + }, + { + "epoch": 0.17, + "grad_norm": 4.856291878012339, + "learning_rate": 9.492078501550314e-06, + "loss": 0.3516, + "step": 2710 + }, + { + "epoch": 0.17, + "grad_norm": 4.264358389602798, + "learning_rate": 9.491631148450914e-06, + "loss": 0.345, + "step": 2711 + }, + { + "epoch": 0.17, + "grad_norm": 5.857814904464563, + "learning_rate": 9.491183608985757e-06, + "loss": 0.3792, + "step": 2712 + }, + { + "epoch": 0.17, + "grad_norm": 2.6986824116877774, + "learning_rate": 9.490735883173417e-06, + "loss": 0.3626, + "step": 2713 + }, + { + "epoch": 0.17, + "grad_norm": 0.7352081879720826, + "learning_rate": 9.490287971032468e-06, + "loss": 0.475, + "step": 2714 + }, + { + "epoch": 0.17, + "grad_norm": 3.1577051020112794, + "learning_rate": 9.489839872581497e-06, + "loss": 0.3453, + "step": 2715 + }, + { + "epoch": 0.17, + "grad_norm": 3.2301244478097497, + "learning_rate": 9.489391587839091e-06, + "loss": 0.3643, + "step": 2716 + }, + { + "epoch": 0.17, + "grad_norm": 5.677422097230705, + "learning_rate": 9.488943116823857e-06, + "loss": 0.3687, + "step": 2717 + }, + { + "epoch": 0.17, + "grad_norm": 5.126018716941779, + "learning_rate": 9.488494459554401e-06, + "loss": 0.3197, + "step": 2718 + }, + { + "epoch": 0.17, + "grad_norm": 4.7465849147696, + "learning_rate": 9.488045616049335e-06, + "loss": 0.3424, + "step": 2719 + }, + { + "epoch": 0.17, + "grad_norm": 5.6874308654596755, + "learning_rate": 9.487596586327286e-06, + "loss": 0.3431, + "step": 2720 + }, + { + "epoch": 0.17, + "grad_norm": 3.2281008504858373, + "learning_rate": 9.487147370406882e-06, + "loss": 0.3361, + "step": 2721 + }, + { + "epoch": 0.17, + "grad_norm": 0.6901945952652017, + "learning_rate": 9.486697968306764e-06, + "loss": 0.4635, + "step": 2722 + }, + { + "epoch": 0.17, + "grad_norm": 2.953759134002954, + "learning_rate": 9.48624838004558e-06, + "loss": 0.3449, + "step": 2723 + }, + { + "epoch": 0.17, + "grad_norm": 3.082152399780204, + "learning_rate": 9.48579860564198e-06, + "loss": 0.3783, + "step": 2724 + }, + { + "epoch": 0.17, + "grad_norm": 3.703853281066354, + "learning_rate": 9.48534864511463e-06, + "loss": 0.3493, + "step": 2725 + }, + { + "epoch": 0.17, + "grad_norm": 2.8544233592956116, + "learning_rate": 9.484898498482195e-06, + "loss": 0.3318, + "step": 2726 + }, + { + "epoch": 0.17, + "grad_norm": 3.9885102183229755, + "learning_rate": 9.484448165763358e-06, + "loss": 0.3502, + "step": 2727 + }, + { + "epoch": 0.17, + "grad_norm": 2.9964802042385448, + "learning_rate": 9.483997646976802e-06, + "loss": 0.342, + "step": 2728 + }, + { + "epoch": 0.17, + "grad_norm": 2.2126700481589343, + "learning_rate": 9.483546942141216e-06, + "loss": 0.3218, + "step": 2729 + }, + { + "epoch": 0.17, + "grad_norm": 3.412135230470621, + "learning_rate": 9.483096051275305e-06, + "loss": 0.3721, + "step": 2730 + }, + { + "epoch": 0.17, + "grad_norm": 2.8492547414066736, + "learning_rate": 9.482644974397776e-06, + "loss": 0.339, + "step": 2731 + }, + { + "epoch": 0.17, + "grad_norm": 3.569498592151541, + "learning_rate": 9.482193711527345e-06, + "loss": 0.3215, + "step": 2732 + }, + { + "epoch": 0.17, + "grad_norm": 3.6170889960445285, + "learning_rate": 9.481742262682738e-06, + "loss": 0.3352, + "step": 2733 + }, + { + "epoch": 0.17, + "grad_norm": 3.202934192988548, + "learning_rate": 9.48129062788268e-06, + "loss": 0.3705, + "step": 2734 + }, + { + "epoch": 0.17, + "grad_norm": 2.605696492639625, + "learning_rate": 9.480838807145916e-06, + "loss": 0.3302, + "step": 2735 + }, + { + "epoch": 0.17, + "grad_norm": 6.185066278772632, + "learning_rate": 9.480386800491192e-06, + "loss": 0.3633, + "step": 2736 + }, + { + "epoch": 0.17, + "grad_norm": 4.228770585131004, + "learning_rate": 9.47993460793726e-06, + "loss": 0.3279, + "step": 2737 + }, + { + "epoch": 0.17, + "grad_norm": 11.401999316003607, + "learning_rate": 9.479482229502886e-06, + "loss": 0.3476, + "step": 2738 + }, + { + "epoch": 0.17, + "grad_norm": 2.4559655797869313, + "learning_rate": 9.479029665206837e-06, + "loss": 0.3466, + "step": 2739 + }, + { + "epoch": 0.17, + "grad_norm": 2.3456006468757646, + "learning_rate": 9.47857691506789e-06, + "loss": 0.3439, + "step": 2740 + }, + { + "epoch": 0.17, + "grad_norm": 2.4428161643087463, + "learning_rate": 9.478123979104831e-06, + "loss": 0.3432, + "step": 2741 + }, + { + "epoch": 0.17, + "grad_norm": 2.9358941894391037, + "learning_rate": 9.477670857336458e-06, + "loss": 0.3302, + "step": 2742 + }, + { + "epoch": 0.17, + "grad_norm": 0.6408629330984094, + "learning_rate": 9.477217549781565e-06, + "loss": 0.512, + "step": 2743 + }, + { + "epoch": 0.17, + "grad_norm": 4.42865217078952, + "learning_rate": 9.476764056458964e-06, + "loss": 0.3511, + "step": 2744 + }, + { + "epoch": 0.17, + "grad_norm": 2.6952496177437375, + "learning_rate": 9.476310377387473e-06, + "loss": 0.3379, + "step": 2745 + }, + { + "epoch": 0.17, + "grad_norm": 2.2930089427411766, + "learning_rate": 9.47585651258591e-06, + "loss": 0.3298, + "step": 2746 + }, + { + "epoch": 0.17, + "grad_norm": 14.833766862752183, + "learning_rate": 9.475402462073111e-06, + "loss": 0.3575, + "step": 2747 + }, + { + "epoch": 0.17, + "grad_norm": 5.956849764735481, + "learning_rate": 9.474948225867916e-06, + "loss": 0.3262, + "step": 2748 + }, + { + "epoch": 0.17, + "grad_norm": 2.730707992760973, + "learning_rate": 9.47449380398917e-06, + "loss": 0.3422, + "step": 2749 + }, + { + "epoch": 0.17, + "grad_norm": 3.2083692841088185, + "learning_rate": 9.474039196455729e-06, + "loss": 0.3314, + "step": 2750 + }, + { + "epoch": 0.17, + "grad_norm": 0.6240525352921156, + "learning_rate": 9.473584403286454e-06, + "loss": 0.4988, + "step": 2751 + }, + { + "epoch": 0.17, + "grad_norm": 3.187405850798595, + "learning_rate": 9.473129424500218e-06, + "loss": 0.3522, + "step": 2752 + }, + { + "epoch": 0.17, + "grad_norm": 2.7106370564114695, + "learning_rate": 9.472674260115895e-06, + "loss": 0.3372, + "step": 2753 + }, + { + "epoch": 0.17, + "grad_norm": 4.443339382493905, + "learning_rate": 9.472218910152374e-06, + "loss": 0.3364, + "step": 2754 + }, + { + "epoch": 0.17, + "grad_norm": 3.4018084146984333, + "learning_rate": 9.471763374628546e-06, + "loss": 0.3498, + "step": 2755 + }, + { + "epoch": 0.17, + "grad_norm": 1.7353046132565118, + "learning_rate": 9.471307653563313e-06, + "loss": 0.3442, + "step": 2756 + }, + { + "epoch": 0.17, + "grad_norm": 4.083106695960372, + "learning_rate": 9.470851746975582e-06, + "loss": 0.341, + "step": 2757 + }, + { + "epoch": 0.17, + "grad_norm": 5.237154015534022, + "learning_rate": 9.470395654884273e-06, + "loss": 0.3456, + "step": 2758 + }, + { + "epoch": 0.17, + "grad_norm": 3.690322370540809, + "learning_rate": 9.469939377308308e-06, + "loss": 0.3349, + "step": 2759 + }, + { + "epoch": 0.17, + "grad_norm": 3.4584919329566106, + "learning_rate": 9.469482914266618e-06, + "loss": 0.3625, + "step": 2760 + }, + { + "epoch": 0.17, + "grad_norm": 2.093674191456435, + "learning_rate": 9.469026265778142e-06, + "loss": 0.3294, + "step": 2761 + }, + { + "epoch": 0.17, + "grad_norm": 2.7750403189206274, + "learning_rate": 9.468569431861831e-06, + "loss": 0.3288, + "step": 2762 + }, + { + "epoch": 0.17, + "grad_norm": 3.7542207612319314, + "learning_rate": 9.468112412536636e-06, + "loss": 0.3534, + "step": 2763 + }, + { + "epoch": 0.17, + "grad_norm": 3.9043950895979895, + "learning_rate": 9.46765520782152e-06, + "loss": 0.3499, + "step": 2764 + }, + { + "epoch": 0.17, + "grad_norm": 2.7052852220722436, + "learning_rate": 9.467197817735455e-06, + "loss": 0.3219, + "step": 2765 + }, + { + "epoch": 0.17, + "grad_norm": 2.767424423941762, + "learning_rate": 9.466740242297418e-06, + "loss": 0.3533, + "step": 2766 + }, + { + "epoch": 0.17, + "grad_norm": 3.7762048598563815, + "learning_rate": 9.466282481526393e-06, + "loss": 0.3347, + "step": 2767 + }, + { + "epoch": 0.17, + "grad_norm": 2.2167111051395056, + "learning_rate": 9.465824535441377e-06, + "loss": 0.3383, + "step": 2768 + }, + { + "epoch": 0.17, + "grad_norm": 4.126082207266447, + "learning_rate": 9.465366404061366e-06, + "loss": 0.3593, + "step": 2769 + }, + { + "epoch": 0.17, + "grad_norm": 7.933674012815756, + "learning_rate": 9.464908087405374e-06, + "loss": 0.3248, + "step": 2770 + }, + { + "epoch": 0.17, + "grad_norm": 3.3307450476425386, + "learning_rate": 9.464449585492415e-06, + "loss": 0.3469, + "step": 2771 + }, + { + "epoch": 0.17, + "grad_norm": 2.5991292608208285, + "learning_rate": 9.463990898341511e-06, + "loss": 0.3588, + "step": 2772 + }, + { + "epoch": 0.17, + "grad_norm": 2.0305170395619663, + "learning_rate": 9.463532025971696e-06, + "loss": 0.3189, + "step": 2773 + }, + { + "epoch": 0.17, + "grad_norm": 2.243510270280099, + "learning_rate": 9.46307296840201e-06, + "loss": 0.319, + "step": 2774 + }, + { + "epoch": 0.17, + "grad_norm": 3.0991391084486075, + "learning_rate": 9.4626137256515e-06, + "loss": 0.3376, + "step": 2775 + }, + { + "epoch": 0.17, + "grad_norm": 6.942842649520549, + "learning_rate": 9.46215429773922e-06, + "loss": 0.3388, + "step": 2776 + }, + { + "epoch": 0.17, + "grad_norm": 4.977117474931626, + "learning_rate": 9.46169468468423e-06, + "loss": 0.3439, + "step": 2777 + }, + { + "epoch": 0.17, + "grad_norm": 2.224291263022979, + "learning_rate": 9.461234886505604e-06, + "loss": 0.3355, + "step": 2778 + }, + { + "epoch": 0.17, + "grad_norm": 3.0031455465503547, + "learning_rate": 9.46077490322242e-06, + "loss": 0.3531, + "step": 2779 + }, + { + "epoch": 0.17, + "grad_norm": 3.874284615010175, + "learning_rate": 9.46031473485376e-06, + "loss": 0.3409, + "step": 2780 + }, + { + "epoch": 0.17, + "grad_norm": 2.3548662817068347, + "learning_rate": 9.45985438141872e-06, + "loss": 0.3325, + "step": 2781 + }, + { + "epoch": 0.17, + "grad_norm": 4.455369618580227, + "learning_rate": 9.4593938429364e-06, + "loss": 0.3349, + "step": 2782 + }, + { + "epoch": 0.18, + "grad_norm": 2.6616707836133435, + "learning_rate": 9.45893311942591e-06, + "loss": 0.3372, + "step": 2783 + }, + { + "epoch": 0.18, + "grad_norm": 3.265891540437684, + "learning_rate": 9.458472210906363e-06, + "loss": 0.3339, + "step": 2784 + }, + { + "epoch": 0.18, + "grad_norm": 2.51312814291431, + "learning_rate": 9.458011117396886e-06, + "loss": 0.3346, + "step": 2785 + }, + { + "epoch": 0.18, + "grad_norm": 2.6527253674339057, + "learning_rate": 9.45754983891661e-06, + "loss": 0.332, + "step": 2786 + }, + { + "epoch": 0.18, + "grad_norm": 4.38085799438359, + "learning_rate": 9.457088375484671e-06, + "loss": 0.3386, + "step": 2787 + }, + { + "epoch": 0.18, + "grad_norm": 3.1564390974614422, + "learning_rate": 9.45662672712022e-06, + "loss": 0.3395, + "step": 2788 + }, + { + "epoch": 0.18, + "grad_norm": 6.613545889420841, + "learning_rate": 9.456164893842411e-06, + "loss": 0.3485, + "step": 2789 + }, + { + "epoch": 0.18, + "grad_norm": 3.3711434342955076, + "learning_rate": 9.455702875670405e-06, + "loss": 0.3349, + "step": 2790 + }, + { + "epoch": 0.18, + "grad_norm": 3.663328043729727, + "learning_rate": 9.455240672623373e-06, + "loss": 0.3498, + "step": 2791 + }, + { + "epoch": 0.18, + "grad_norm": 4.981069481642985, + "learning_rate": 9.454778284720492e-06, + "loss": 0.3459, + "step": 2792 + }, + { + "epoch": 0.18, + "grad_norm": 2.414719483762963, + "learning_rate": 9.454315711980947e-06, + "loss": 0.342, + "step": 2793 + }, + { + "epoch": 0.18, + "grad_norm": 2.8841543131580942, + "learning_rate": 9.453852954423931e-06, + "loss": 0.3418, + "step": 2794 + }, + { + "epoch": 0.18, + "grad_norm": 2.0040390069744674, + "learning_rate": 9.453390012068645e-06, + "loss": 0.3222, + "step": 2795 + }, + { + "epoch": 0.18, + "grad_norm": 3.13952994394859, + "learning_rate": 9.452926884934299e-06, + "loss": 0.3434, + "step": 2796 + }, + { + "epoch": 0.18, + "grad_norm": 3.353655141473368, + "learning_rate": 9.452463573040105e-06, + "loss": 0.3338, + "step": 2797 + }, + { + "epoch": 0.18, + "grad_norm": 1.9606795539717619, + "learning_rate": 9.45200007640529e-06, + "loss": 0.3514, + "step": 2798 + }, + { + "epoch": 0.18, + "grad_norm": 2.886733353083376, + "learning_rate": 9.451536395049083e-06, + "loss": 0.3471, + "step": 2799 + }, + { + "epoch": 0.18, + "grad_norm": 2.4776331690200872, + "learning_rate": 9.451072528990726e-06, + "loss": 0.3449, + "step": 2800 + }, + { + "epoch": 0.18, + "grad_norm": 4.722283122940009, + "learning_rate": 9.450608478249463e-06, + "loss": 0.3412, + "step": 2801 + }, + { + "epoch": 0.18, + "grad_norm": 2.666023026044664, + "learning_rate": 9.45014424284455e-06, + "loss": 0.3582, + "step": 2802 + }, + { + "epoch": 0.18, + "grad_norm": 3.2828030174161635, + "learning_rate": 9.449679822795246e-06, + "loss": 0.3409, + "step": 2803 + }, + { + "epoch": 0.18, + "grad_norm": 2.1537365589222173, + "learning_rate": 9.449215218120823e-06, + "loss": 0.3452, + "step": 2804 + }, + { + "epoch": 0.18, + "grad_norm": 2.648680999644945, + "learning_rate": 9.448750428840558e-06, + "loss": 0.3366, + "step": 2805 + }, + { + "epoch": 0.18, + "grad_norm": 2.2211160690738936, + "learning_rate": 9.448285454973739e-06, + "loss": 0.3302, + "step": 2806 + }, + { + "epoch": 0.18, + "grad_norm": 3.5726546881622903, + "learning_rate": 9.447820296539651e-06, + "loss": 0.3401, + "step": 2807 + }, + { + "epoch": 0.18, + "grad_norm": 3.4645098836059254, + "learning_rate": 9.447354953557601e-06, + "loss": 0.3373, + "step": 2808 + }, + { + "epoch": 0.18, + "grad_norm": 3.460336111103413, + "learning_rate": 9.446889426046894e-06, + "loss": 0.3601, + "step": 2809 + }, + { + "epoch": 0.18, + "grad_norm": 3.4755813323162292, + "learning_rate": 9.446423714026846e-06, + "loss": 0.3774, + "step": 2810 + }, + { + "epoch": 0.18, + "grad_norm": 3.121433864366657, + "learning_rate": 9.44595781751678e-06, + "loss": 0.3458, + "step": 2811 + }, + { + "epoch": 0.18, + "grad_norm": 3.359935062868799, + "learning_rate": 9.445491736536027e-06, + "loss": 0.3517, + "step": 2812 + }, + { + "epoch": 0.18, + "grad_norm": 2.8225412692039424, + "learning_rate": 9.445025471103928e-06, + "loss": 0.3444, + "step": 2813 + }, + { + "epoch": 0.18, + "grad_norm": 3.3172362395571358, + "learning_rate": 9.444559021239826e-06, + "loss": 0.3294, + "step": 2814 + }, + { + "epoch": 0.18, + "grad_norm": 3.2337656213902073, + "learning_rate": 9.444092386963075e-06, + "loss": 0.3488, + "step": 2815 + }, + { + "epoch": 0.18, + "grad_norm": 3.6685777243671267, + "learning_rate": 9.443625568293038e-06, + "loss": 0.3562, + "step": 2816 + }, + { + "epoch": 0.18, + "grad_norm": 2.0430253762704766, + "learning_rate": 9.443158565249082e-06, + "loss": 0.3217, + "step": 2817 + }, + { + "epoch": 0.18, + "grad_norm": 7.4861147230698055, + "learning_rate": 9.442691377850585e-06, + "loss": 0.3563, + "step": 2818 + }, + { + "epoch": 0.18, + "grad_norm": 3.2111236880069094, + "learning_rate": 9.442224006116935e-06, + "loss": 0.3338, + "step": 2819 + }, + { + "epoch": 0.18, + "grad_norm": 2.163038200171697, + "learning_rate": 9.441756450067519e-06, + "loss": 0.3481, + "step": 2820 + }, + { + "epoch": 0.18, + "grad_norm": 2.91268043030134, + "learning_rate": 9.441288709721737e-06, + "loss": 0.3443, + "step": 2821 + }, + { + "epoch": 0.18, + "grad_norm": 6.726556643663211, + "learning_rate": 9.440820785098998e-06, + "loss": 0.3482, + "step": 2822 + }, + { + "epoch": 0.18, + "grad_norm": 2.2242300931295373, + "learning_rate": 9.440352676218718e-06, + "loss": 0.3519, + "step": 2823 + }, + { + "epoch": 0.18, + "grad_norm": 6.962656578743003, + "learning_rate": 9.439884383100319e-06, + "loss": 0.3454, + "step": 2824 + }, + { + "epoch": 0.18, + "grad_norm": 4.4737964293811485, + "learning_rate": 9.43941590576323e-06, + "loss": 0.3574, + "step": 2825 + }, + { + "epoch": 0.18, + "grad_norm": 3.1128971723349954, + "learning_rate": 9.438947244226889e-06, + "loss": 0.3269, + "step": 2826 + }, + { + "epoch": 0.18, + "grad_norm": 6.592958046916075, + "learning_rate": 9.438478398510744e-06, + "loss": 0.3734, + "step": 2827 + }, + { + "epoch": 0.18, + "grad_norm": 2.5672723601376353, + "learning_rate": 9.438009368634244e-06, + "loss": 0.3451, + "step": 2828 + }, + { + "epoch": 0.18, + "grad_norm": 4.442651213825842, + "learning_rate": 9.437540154616856e-06, + "loss": 0.3383, + "step": 2829 + }, + { + "epoch": 0.18, + "grad_norm": 2.6106401136534685, + "learning_rate": 9.437070756478043e-06, + "loss": 0.3195, + "step": 2830 + }, + { + "epoch": 0.18, + "grad_norm": 7.27165960013486, + "learning_rate": 9.436601174237283e-06, + "loss": 0.3486, + "step": 2831 + }, + { + "epoch": 0.18, + "grad_norm": 3.241876553490455, + "learning_rate": 9.43613140791406e-06, + "loss": 0.3774, + "step": 2832 + }, + { + "epoch": 0.18, + "grad_norm": 3.192553518065512, + "learning_rate": 9.435661457527867e-06, + "loss": 0.3465, + "step": 2833 + }, + { + "epoch": 0.18, + "grad_norm": 4.755475426359888, + "learning_rate": 9.435191323098201e-06, + "loss": 0.3265, + "step": 2834 + }, + { + "epoch": 0.18, + "grad_norm": 2.520657051379427, + "learning_rate": 9.434721004644567e-06, + "loss": 0.3434, + "step": 2835 + }, + { + "epoch": 0.18, + "grad_norm": 6.363693194886535, + "learning_rate": 9.434250502186483e-06, + "loss": 0.3301, + "step": 2836 + }, + { + "epoch": 0.18, + "grad_norm": 15.654449150747247, + "learning_rate": 9.43377981574347e-06, + "loss": 0.3455, + "step": 2837 + }, + { + "epoch": 0.18, + "grad_norm": 3.0885392601630985, + "learning_rate": 9.433308945335058e-06, + "loss": 0.3423, + "step": 2838 + }, + { + "epoch": 0.18, + "grad_norm": 5.274065645708655, + "learning_rate": 9.432837890980781e-06, + "loss": 0.3465, + "step": 2839 + }, + { + "epoch": 0.18, + "grad_norm": 4.544574334105781, + "learning_rate": 9.432366652700189e-06, + "loss": 0.3329, + "step": 2840 + }, + { + "epoch": 0.18, + "grad_norm": 2.8224492961804457, + "learning_rate": 9.43189523051283e-06, + "loss": 0.3466, + "step": 2841 + }, + { + "epoch": 0.18, + "grad_norm": 5.262078457758155, + "learning_rate": 9.431423624438264e-06, + "loss": 0.3265, + "step": 2842 + }, + { + "epoch": 0.18, + "grad_norm": 5.0143890497233015, + "learning_rate": 9.430951834496064e-06, + "loss": 0.3302, + "step": 2843 + }, + { + "epoch": 0.18, + "grad_norm": 7.49727120577982, + "learning_rate": 9.430479860705802e-06, + "loss": 0.3324, + "step": 2844 + }, + { + "epoch": 0.18, + "grad_norm": 3.1745162216049105, + "learning_rate": 9.43000770308706e-06, + "loss": 0.3195, + "step": 2845 + }, + { + "epoch": 0.18, + "grad_norm": 7.5148866182805305, + "learning_rate": 9.42953536165943e-06, + "loss": 0.3487, + "step": 2846 + }, + { + "epoch": 0.18, + "grad_norm": 3.0211553893290013, + "learning_rate": 9.429062836442512e-06, + "loss": 0.3315, + "step": 2847 + }, + { + "epoch": 0.18, + "grad_norm": 2.42692257157526, + "learning_rate": 9.428590127455908e-06, + "loss": 0.3252, + "step": 2848 + }, + { + "epoch": 0.18, + "grad_norm": 3.06852227211664, + "learning_rate": 9.428117234719234e-06, + "loss": 0.3433, + "step": 2849 + }, + { + "epoch": 0.18, + "grad_norm": 2.9072314545965625, + "learning_rate": 9.427644158252109e-06, + "loss": 0.3688, + "step": 2850 + }, + { + "epoch": 0.18, + "grad_norm": 2.852432088608627, + "learning_rate": 9.427170898074166e-06, + "loss": 0.3392, + "step": 2851 + }, + { + "epoch": 0.18, + "grad_norm": 2.7173015240382314, + "learning_rate": 9.426697454205039e-06, + "loss": 0.3377, + "step": 2852 + }, + { + "epoch": 0.18, + "grad_norm": 31.45710822411603, + "learning_rate": 9.42622382666437e-06, + "loss": 0.3548, + "step": 2853 + }, + { + "epoch": 0.18, + "grad_norm": 3.633232496009611, + "learning_rate": 9.425750015471813e-06, + "loss": 0.3437, + "step": 2854 + }, + { + "epoch": 0.18, + "grad_norm": 13.542315376695413, + "learning_rate": 9.425276020647026e-06, + "loss": 0.3293, + "step": 2855 + }, + { + "epoch": 0.18, + "grad_norm": 3.719098152199475, + "learning_rate": 9.42480184220968e-06, + "loss": 0.3383, + "step": 2856 + }, + { + "epoch": 0.18, + "grad_norm": 4.007903046452314, + "learning_rate": 9.424327480179443e-06, + "loss": 0.3383, + "step": 2857 + }, + { + "epoch": 0.18, + "grad_norm": 0.6998075779836991, + "learning_rate": 9.423852934576003e-06, + "loss": 0.5152, + "step": 2858 + }, + { + "epoch": 0.18, + "grad_norm": 5.315523560050483, + "learning_rate": 9.423378205419044e-06, + "loss": 0.3383, + "step": 2859 + }, + { + "epoch": 0.18, + "grad_norm": 5.80912279664292, + "learning_rate": 9.42290329272827e-06, + "loss": 0.351, + "step": 2860 + }, + { + "epoch": 0.18, + "grad_norm": 3.657376800209197, + "learning_rate": 9.422428196523382e-06, + "loss": 0.3621, + "step": 2861 + }, + { + "epoch": 0.18, + "grad_norm": 3.3861887157105075, + "learning_rate": 9.42195291682409e-06, + "loss": 0.3439, + "step": 2862 + }, + { + "epoch": 0.18, + "grad_norm": 5.104540718554345, + "learning_rate": 9.421477453650118e-06, + "loss": 0.3406, + "step": 2863 + }, + { + "epoch": 0.18, + "grad_norm": 2.9449944011930014, + "learning_rate": 9.421001807021196e-06, + "loss": 0.3325, + "step": 2864 + }, + { + "epoch": 0.18, + "grad_norm": 3.203217123864811, + "learning_rate": 9.420525976957053e-06, + "loss": 0.3271, + "step": 2865 + }, + { + "epoch": 0.18, + "grad_norm": 0.5999976239510811, + "learning_rate": 9.420049963477437e-06, + "loss": 0.4892, + "step": 2866 + }, + { + "epoch": 0.18, + "grad_norm": 6.444723589831954, + "learning_rate": 9.419573766602097e-06, + "loss": 0.3451, + "step": 2867 + }, + { + "epoch": 0.18, + "grad_norm": 12.796242662466096, + "learning_rate": 9.41909738635079e-06, + "loss": 0.3507, + "step": 2868 + }, + { + "epoch": 0.18, + "grad_norm": 2.75746100509902, + "learning_rate": 9.418620822743284e-06, + "loss": 0.3328, + "step": 2869 + }, + { + "epoch": 0.18, + "grad_norm": 5.923284656401906, + "learning_rate": 9.418144075799353e-06, + "loss": 0.345, + "step": 2870 + }, + { + "epoch": 0.18, + "grad_norm": 5.192724605693848, + "learning_rate": 9.417667145538778e-06, + "loss": 0.3493, + "step": 2871 + }, + { + "epoch": 0.18, + "grad_norm": 15.947398314030883, + "learning_rate": 9.417190031981343e-06, + "loss": 0.3264, + "step": 2872 + }, + { + "epoch": 0.18, + "grad_norm": 4.4545985955900775, + "learning_rate": 9.41671273514685e-06, + "loss": 0.3547, + "step": 2873 + }, + { + "epoch": 0.18, + "grad_norm": 8.281984482714142, + "learning_rate": 9.416235255055099e-06, + "loss": 0.3592, + "step": 2874 + }, + { + "epoch": 0.18, + "grad_norm": 6.176301761464211, + "learning_rate": 9.415757591725905e-06, + "loss": 0.366, + "step": 2875 + }, + { + "epoch": 0.18, + "grad_norm": 8.554313991084236, + "learning_rate": 9.415279745179084e-06, + "loss": 0.3332, + "step": 2876 + }, + { + "epoch": 0.18, + "grad_norm": 4.742543879468014, + "learning_rate": 9.414801715434464e-06, + "loss": 0.3368, + "step": 2877 + }, + { + "epoch": 0.18, + "grad_norm": 5.879691455524878, + "learning_rate": 9.41432350251188e-06, + "loss": 0.3397, + "step": 2878 + }, + { + "epoch": 0.18, + "grad_norm": 4.792333491669887, + "learning_rate": 9.413845106431171e-06, + "loss": 0.3477, + "step": 2879 + }, + { + "epoch": 0.18, + "grad_norm": 2.6793008193667434, + "learning_rate": 9.41336652721219e-06, + "loss": 0.3394, + "step": 2880 + }, + { + "epoch": 0.18, + "grad_norm": 7.5621857549984695, + "learning_rate": 9.412887764874794e-06, + "loss": 0.3129, + "step": 2881 + }, + { + "epoch": 0.18, + "grad_norm": 2.3834765786758307, + "learning_rate": 9.412408819438847e-06, + "loss": 0.3301, + "step": 2882 + }, + { + "epoch": 0.18, + "grad_norm": 4.512591564980338, + "learning_rate": 9.41192969092422e-06, + "loss": 0.3561, + "step": 2883 + }, + { + "epoch": 0.18, + "grad_norm": 3.4747121695984333, + "learning_rate": 9.411450379350791e-06, + "loss": 0.3456, + "step": 2884 + }, + { + "epoch": 0.18, + "grad_norm": 2.5570404248386294, + "learning_rate": 9.410970884738453e-06, + "loss": 0.3432, + "step": 2885 + }, + { + "epoch": 0.18, + "grad_norm": 4.039301780760711, + "learning_rate": 9.410491207107097e-06, + "loss": 0.3325, + "step": 2886 + }, + { + "epoch": 0.18, + "grad_norm": 3.0699921469434797, + "learning_rate": 9.410011346476628e-06, + "loss": 0.3136, + "step": 2887 + }, + { + "epoch": 0.18, + "grad_norm": 3.257155959011966, + "learning_rate": 9.409531302866954e-06, + "loss": 0.3161, + "step": 2888 + }, + { + "epoch": 0.18, + "grad_norm": 2.4186503244613218, + "learning_rate": 9.409051076297994e-06, + "loss": 0.3277, + "step": 2889 + }, + { + "epoch": 0.18, + "grad_norm": 6.690839806498641, + "learning_rate": 9.408570666789674e-06, + "loss": 0.3273, + "step": 2890 + }, + { + "epoch": 0.18, + "grad_norm": 2.5094390086689167, + "learning_rate": 9.408090074361927e-06, + "loss": 0.3208, + "step": 2891 + }, + { + "epoch": 0.18, + "grad_norm": 26.004381051412743, + "learning_rate": 9.407609299034693e-06, + "loss": 0.3309, + "step": 2892 + }, + { + "epoch": 0.18, + "grad_norm": 3.498975777808764, + "learning_rate": 9.40712834082792e-06, + "loss": 0.3423, + "step": 2893 + }, + { + "epoch": 0.18, + "grad_norm": 20.55535006750138, + "learning_rate": 9.406647199761564e-06, + "loss": 0.3042, + "step": 2894 + }, + { + "epoch": 0.18, + "grad_norm": 6.251041822218146, + "learning_rate": 9.406165875855589e-06, + "loss": 0.3514, + "step": 2895 + }, + { + "epoch": 0.18, + "grad_norm": 0.6971310108682597, + "learning_rate": 9.405684369129965e-06, + "loss": 0.5189, + "step": 2896 + }, + { + "epoch": 0.18, + "grad_norm": 5.029326151582228, + "learning_rate": 9.40520267960467e-06, + "loss": 0.3469, + "step": 2897 + }, + { + "epoch": 0.18, + "grad_norm": 3.0755822619381115, + "learning_rate": 9.404720807299694e-06, + "loss": 0.3337, + "step": 2898 + }, + { + "epoch": 0.18, + "grad_norm": 4.829548529739491, + "learning_rate": 9.404238752235028e-06, + "loss": 0.3392, + "step": 2899 + }, + { + "epoch": 0.18, + "grad_norm": 6.697211143258505, + "learning_rate": 9.40375651443067e-06, + "loss": 0.3278, + "step": 2900 + }, + { + "epoch": 0.18, + "grad_norm": 2.828282563810434, + "learning_rate": 9.403274093906635e-06, + "loss": 0.3295, + "step": 2901 + }, + { + "epoch": 0.18, + "grad_norm": 3.471500784682395, + "learning_rate": 9.402791490682938e-06, + "loss": 0.3103, + "step": 2902 + }, + { + "epoch": 0.18, + "grad_norm": 2.8394804349192406, + "learning_rate": 9.4023087047796e-06, + "loss": 0.3445, + "step": 2903 + }, + { + "epoch": 0.18, + "grad_norm": 3.1678344497176716, + "learning_rate": 9.401825736216654e-06, + "loss": 0.3342, + "step": 2904 + }, + { + "epoch": 0.18, + "grad_norm": 4.012326991399161, + "learning_rate": 9.401342585014141e-06, + "loss": 0.319, + "step": 2905 + }, + { + "epoch": 0.18, + "grad_norm": 3.0374766040889085, + "learning_rate": 9.400859251192104e-06, + "loss": 0.3354, + "step": 2906 + }, + { + "epoch": 0.18, + "grad_norm": 4.121481576580878, + "learning_rate": 9.400375734770603e-06, + "loss": 0.3411, + "step": 2907 + }, + { + "epoch": 0.18, + "grad_norm": 4.287520775902825, + "learning_rate": 9.399892035769696e-06, + "loss": 0.3343, + "step": 2908 + }, + { + "epoch": 0.18, + "grad_norm": 6.992660820053152, + "learning_rate": 9.399408154209452e-06, + "loss": 0.3436, + "step": 2909 + }, + { + "epoch": 0.18, + "grad_norm": 4.734115944816152, + "learning_rate": 9.39892409010995e-06, + "loss": 0.3756, + "step": 2910 + }, + { + "epoch": 0.18, + "grad_norm": 7.892929914048489, + "learning_rate": 9.398439843491273e-06, + "loss": 0.3462, + "step": 2911 + }, + { + "epoch": 0.18, + "grad_norm": 8.99508133042363, + "learning_rate": 9.397955414373518e-06, + "loss": 0.331, + "step": 2912 + }, + { + "epoch": 0.18, + "grad_norm": 8.587144228356296, + "learning_rate": 9.397470802776777e-06, + "loss": 0.3445, + "step": 2913 + }, + { + "epoch": 0.18, + "grad_norm": 5.323607839701405, + "learning_rate": 9.396986008721165e-06, + "loss": 0.3257, + "step": 2914 + }, + { + "epoch": 0.18, + "grad_norm": 16.71452420951083, + "learning_rate": 9.396501032226793e-06, + "loss": 0.3573, + "step": 2915 + }, + { + "epoch": 0.18, + "grad_norm": 3.412411037982219, + "learning_rate": 9.396015873313781e-06, + "loss": 0.361, + "step": 2916 + }, + { + "epoch": 0.18, + "grad_norm": 3.392461801012857, + "learning_rate": 9.395530532002265e-06, + "loss": 0.3419, + "step": 2917 + }, + { + "epoch": 0.18, + "grad_norm": 2.1243772104218155, + "learning_rate": 9.39504500831238e-06, + "loss": 0.333, + "step": 2918 + }, + { + "epoch": 0.18, + "grad_norm": 4.900944207336713, + "learning_rate": 9.39455930226427e-06, + "loss": 0.3153, + "step": 2919 + }, + { + "epoch": 0.18, + "grad_norm": 2.4305900779326794, + "learning_rate": 9.394073413878089e-06, + "loss": 0.3236, + "step": 2920 + }, + { + "epoch": 0.18, + "grad_norm": 6.136392600562325, + "learning_rate": 9.393587343173998e-06, + "loss": 0.3565, + "step": 2921 + }, + { + "epoch": 0.18, + "grad_norm": 2.7350195142850207, + "learning_rate": 9.393101090172164e-06, + "loss": 0.327, + "step": 2922 + }, + { + "epoch": 0.18, + "grad_norm": 4.63063528563335, + "learning_rate": 9.392614654892761e-06, + "loss": 0.3185, + "step": 2923 + }, + { + "epoch": 0.18, + "grad_norm": 6.539628583735849, + "learning_rate": 9.392128037355977e-06, + "loss": 0.317, + "step": 2924 + }, + { + "epoch": 0.18, + "grad_norm": 4.74064779015083, + "learning_rate": 9.391641237581998e-06, + "loss": 0.3446, + "step": 2925 + }, + { + "epoch": 0.18, + "grad_norm": 3.6063991643633355, + "learning_rate": 9.391154255591025e-06, + "loss": 0.3148, + "step": 2926 + }, + { + "epoch": 0.18, + "grad_norm": 6.9856885299036335, + "learning_rate": 9.39066709140326e-06, + "loss": 0.3384, + "step": 2927 + }, + { + "epoch": 0.18, + "grad_norm": 1.995242282639124, + "learning_rate": 9.39017974503892e-06, + "loss": 0.3352, + "step": 2928 + }, + { + "epoch": 0.18, + "grad_norm": 3.4789716544967804, + "learning_rate": 9.389692216518224e-06, + "loss": 0.3461, + "step": 2929 + }, + { + "epoch": 0.18, + "grad_norm": 3.6322536561450733, + "learning_rate": 9.3892045058614e-06, + "loss": 0.3515, + "step": 2930 + }, + { + "epoch": 0.18, + "grad_norm": 3.069745363414444, + "learning_rate": 9.388716613088686e-06, + "loss": 0.3188, + "step": 2931 + }, + { + "epoch": 0.18, + "grad_norm": 1.8862009541003968, + "learning_rate": 9.388228538220326e-06, + "loss": 0.3243, + "step": 2932 + }, + { + "epoch": 0.18, + "grad_norm": 2.801954251400872, + "learning_rate": 9.387740281276568e-06, + "loss": 0.3479, + "step": 2933 + }, + { + "epoch": 0.18, + "grad_norm": 2.189852651221901, + "learning_rate": 9.387251842277672e-06, + "loss": 0.3114, + "step": 2934 + }, + { + "epoch": 0.18, + "grad_norm": 16.32992466654144, + "learning_rate": 9.386763221243905e-06, + "loss": 0.346, + "step": 2935 + }, + { + "epoch": 0.18, + "grad_norm": 3.4940048590995523, + "learning_rate": 9.38627441819554e-06, + "loss": 0.3467, + "step": 2936 + }, + { + "epoch": 0.18, + "grad_norm": 0.9183728794530075, + "learning_rate": 9.385785433152857e-06, + "loss": 0.5135, + "step": 2937 + }, + { + "epoch": 0.18, + "grad_norm": 2.947386359204182, + "learning_rate": 9.385296266136148e-06, + "loss": 0.3439, + "step": 2938 + }, + { + "epoch": 0.18, + "grad_norm": 3.008728640681746, + "learning_rate": 9.384806917165707e-06, + "loss": 0.3548, + "step": 2939 + }, + { + "epoch": 0.18, + "grad_norm": 2.5869169313035396, + "learning_rate": 9.384317386261841e-06, + "loss": 0.3093, + "step": 2940 + }, + { + "epoch": 0.18, + "grad_norm": 2.7542506859998492, + "learning_rate": 9.383827673444856e-06, + "loss": 0.3358, + "step": 2941 + }, + { + "epoch": 0.19, + "grad_norm": 1.8796394186241903, + "learning_rate": 9.383337778735076e-06, + "loss": 0.3122, + "step": 2942 + }, + { + "epoch": 0.19, + "grad_norm": 2.122534408531552, + "learning_rate": 9.382847702152827e-06, + "loss": 0.3351, + "step": 2943 + }, + { + "epoch": 0.19, + "grad_norm": 2.567870346776088, + "learning_rate": 9.382357443718439e-06, + "loss": 0.3158, + "step": 2944 + }, + { + "epoch": 0.19, + "grad_norm": 7.072062774682705, + "learning_rate": 9.381867003452258e-06, + "loss": 0.3413, + "step": 2945 + }, + { + "epoch": 0.19, + "grad_norm": 6.79819236834924, + "learning_rate": 9.381376381374634e-06, + "loss": 0.3347, + "step": 2946 + }, + { + "epoch": 0.19, + "grad_norm": 2.8536223395008538, + "learning_rate": 9.380885577505919e-06, + "loss": 0.3458, + "step": 2947 + }, + { + "epoch": 0.19, + "grad_norm": 3.572644934308234, + "learning_rate": 9.380394591866482e-06, + "loss": 0.3623, + "step": 2948 + }, + { + "epoch": 0.19, + "grad_norm": 3.4068019338376945, + "learning_rate": 9.379903424476692e-06, + "loss": 0.3634, + "step": 2949 + }, + { + "epoch": 0.19, + "grad_norm": 3.3813495653493635, + "learning_rate": 9.379412075356928e-06, + "loss": 0.3271, + "step": 2950 + }, + { + "epoch": 0.19, + "grad_norm": 5.677791592204469, + "learning_rate": 9.37892054452758e-06, + "loss": 0.3262, + "step": 2951 + }, + { + "epoch": 0.19, + "grad_norm": 3.9352185064751115, + "learning_rate": 9.37842883200904e-06, + "loss": 0.3411, + "step": 2952 + }, + { + "epoch": 0.19, + "grad_norm": 2.0548663831312037, + "learning_rate": 9.377936937821712e-06, + "loss": 0.336, + "step": 2953 + }, + { + "epoch": 0.19, + "grad_norm": 2.5100520512670617, + "learning_rate": 9.377444861986004e-06, + "loss": 0.3397, + "step": 2954 + }, + { + "epoch": 0.19, + "grad_norm": 4.136633990219711, + "learning_rate": 9.376952604522333e-06, + "loss": 0.3238, + "step": 2955 + }, + { + "epoch": 0.19, + "grad_norm": 18.523231953088995, + "learning_rate": 9.376460165451125e-06, + "loss": 0.3339, + "step": 2956 + }, + { + "epoch": 0.19, + "grad_norm": 2.7411311506306433, + "learning_rate": 9.37596754479281e-06, + "loss": 0.3264, + "step": 2957 + }, + { + "epoch": 0.19, + "grad_norm": 2.8916559043431933, + "learning_rate": 9.37547474256783e-06, + "loss": 0.3646, + "step": 2958 + }, + { + "epoch": 0.19, + "grad_norm": 7.2563683169537105, + "learning_rate": 9.37498175879663e-06, + "loss": 0.3352, + "step": 2959 + }, + { + "epoch": 0.19, + "grad_norm": 2.762369507271014, + "learning_rate": 9.374488593499666e-06, + "loss": 0.3786, + "step": 2960 + }, + { + "epoch": 0.19, + "grad_norm": 12.92035920480478, + "learning_rate": 9.3739952466974e-06, + "loss": 0.3291, + "step": 2961 + }, + { + "epoch": 0.19, + "grad_norm": 2.137139730565293, + "learning_rate": 9.373501718410303e-06, + "loss": 0.3303, + "step": 2962 + }, + { + "epoch": 0.19, + "grad_norm": 3.367799803671223, + "learning_rate": 9.373008008658852e-06, + "loss": 0.3405, + "step": 2963 + }, + { + "epoch": 0.19, + "grad_norm": 5.0506847971138855, + "learning_rate": 9.372514117463531e-06, + "loss": 0.3706, + "step": 2964 + }, + { + "epoch": 0.19, + "grad_norm": 3.00407829290628, + "learning_rate": 9.372020044844832e-06, + "loss": 0.3437, + "step": 2965 + }, + { + "epoch": 0.19, + "grad_norm": 2.7504908868335254, + "learning_rate": 9.371525790823256e-06, + "loss": 0.3101, + "step": 2966 + }, + { + "epoch": 0.19, + "grad_norm": 4.506023327800912, + "learning_rate": 9.371031355419311e-06, + "loss": 0.3482, + "step": 2967 + }, + { + "epoch": 0.19, + "grad_norm": 4.564355648983167, + "learning_rate": 9.370536738653513e-06, + "loss": 0.3386, + "step": 2968 + }, + { + "epoch": 0.19, + "grad_norm": 38.763433171693926, + "learning_rate": 9.37004194054638e-06, + "loss": 0.3493, + "step": 2969 + }, + { + "epoch": 0.19, + "grad_norm": 2.712320349004185, + "learning_rate": 9.369546961118446e-06, + "loss": 0.3124, + "step": 2970 + }, + { + "epoch": 0.19, + "grad_norm": 3.9352975918037343, + "learning_rate": 9.369051800390245e-06, + "loss": 0.345, + "step": 2971 + }, + { + "epoch": 0.19, + "grad_norm": 3.836155204661295, + "learning_rate": 9.368556458382327e-06, + "loss": 0.3134, + "step": 2972 + }, + { + "epoch": 0.19, + "grad_norm": 5.151021719818088, + "learning_rate": 9.368060935115243e-06, + "loss": 0.3484, + "step": 2973 + }, + { + "epoch": 0.19, + "grad_norm": 2.6961994500940625, + "learning_rate": 9.36756523060955e-06, + "loss": 0.3389, + "step": 2974 + }, + { + "epoch": 0.19, + "grad_norm": 3.963762113080279, + "learning_rate": 9.367069344885818e-06, + "loss": 0.3335, + "step": 2975 + }, + { + "epoch": 0.19, + "grad_norm": 6.356519212038357, + "learning_rate": 9.366573277964623e-06, + "loss": 0.3423, + "step": 2976 + }, + { + "epoch": 0.19, + "grad_norm": 4.219726325252524, + "learning_rate": 9.366077029866546e-06, + "loss": 0.3258, + "step": 2977 + }, + { + "epoch": 0.19, + "grad_norm": 3.647013140102176, + "learning_rate": 9.365580600612178e-06, + "loss": 0.3241, + "step": 2978 + }, + { + "epoch": 0.19, + "grad_norm": 2.7452563532522922, + "learning_rate": 9.365083990222118e-06, + "loss": 0.3339, + "step": 2979 + }, + { + "epoch": 0.19, + "grad_norm": 3.204300651698719, + "learning_rate": 9.36458719871697e-06, + "loss": 0.3233, + "step": 2980 + }, + { + "epoch": 0.19, + "grad_norm": 4.127717263896908, + "learning_rate": 9.364090226117346e-06, + "loss": 0.3514, + "step": 2981 + }, + { + "epoch": 0.19, + "grad_norm": 5.179443951254958, + "learning_rate": 9.363593072443865e-06, + "loss": 0.353, + "step": 2982 + }, + { + "epoch": 0.19, + "grad_norm": 4.417908955575161, + "learning_rate": 9.36309573771716e-06, + "loss": 0.3516, + "step": 2983 + }, + { + "epoch": 0.19, + "grad_norm": 0.9608663473809073, + "learning_rate": 9.362598221957862e-06, + "loss": 0.49, + "step": 2984 + }, + { + "epoch": 0.19, + "grad_norm": 5.707532724763154, + "learning_rate": 9.362100525186616e-06, + "loss": 0.3401, + "step": 2985 + }, + { + "epoch": 0.19, + "grad_norm": 4.223660909851831, + "learning_rate": 9.361602647424069e-06, + "loss": 0.3273, + "step": 2986 + }, + { + "epoch": 0.19, + "grad_norm": 3.732123712149332, + "learning_rate": 9.361104588690882e-06, + "loss": 0.3412, + "step": 2987 + }, + { + "epoch": 0.19, + "grad_norm": 3.3554936274931797, + "learning_rate": 9.36060634900772e-06, + "loss": 0.3259, + "step": 2988 + }, + { + "epoch": 0.19, + "grad_norm": 6.429438026735335, + "learning_rate": 9.360107928395254e-06, + "loss": 0.3459, + "step": 2989 + }, + { + "epoch": 0.19, + "grad_norm": 2.4984931577952993, + "learning_rate": 9.359609326874169e-06, + "loss": 0.3276, + "step": 2990 + }, + { + "epoch": 0.19, + "grad_norm": 5.173541636929663, + "learning_rate": 9.359110544465146e-06, + "loss": 0.3234, + "step": 2991 + }, + { + "epoch": 0.19, + "grad_norm": 2.5013455005168916, + "learning_rate": 9.358611581188884e-06, + "loss": 0.3576, + "step": 2992 + }, + { + "epoch": 0.19, + "grad_norm": 4.090817040137613, + "learning_rate": 9.358112437066088e-06, + "loss": 0.3734, + "step": 2993 + }, + { + "epoch": 0.19, + "grad_norm": 2.7682740799018313, + "learning_rate": 9.357613112117464e-06, + "loss": 0.3278, + "step": 2994 + }, + { + "epoch": 0.19, + "grad_norm": 0.9360354284112907, + "learning_rate": 9.357113606363732e-06, + "loss": 0.487, + "step": 2995 + }, + { + "epoch": 0.19, + "grad_norm": 2.3217461974291056, + "learning_rate": 9.356613919825619e-06, + "loss": 0.3453, + "step": 2996 + }, + { + "epoch": 0.19, + "grad_norm": 1.8950696661048922, + "learning_rate": 9.356114052523854e-06, + "loss": 0.3449, + "step": 2997 + }, + { + "epoch": 0.19, + "grad_norm": 2.0136953082508855, + "learning_rate": 9.355614004479182e-06, + "loss": 0.3282, + "step": 2998 + }, + { + "epoch": 0.19, + "grad_norm": 5.2662429306810665, + "learning_rate": 9.355113775712348e-06, + "loss": 0.3294, + "step": 2999 + }, + { + "epoch": 0.19, + "grad_norm": 3.899287811153719, + "learning_rate": 9.354613366244108e-06, + "loss": 0.3294, + "step": 3000 + }, + { + "epoch": 0.19, + "grad_norm": 2.660311033876325, + "learning_rate": 9.354112776095224e-06, + "loss": 0.3208, + "step": 3001 + }, + { + "epoch": 0.19, + "grad_norm": 6.027784651214224, + "learning_rate": 9.353612005286467e-06, + "loss": 0.332, + "step": 3002 + }, + { + "epoch": 0.19, + "grad_norm": 7.437953303219593, + "learning_rate": 9.353111053838616e-06, + "loss": 0.3424, + "step": 3003 + }, + { + "epoch": 0.19, + "grad_norm": 4.37422599106699, + "learning_rate": 9.352609921772453e-06, + "loss": 0.3381, + "step": 3004 + }, + { + "epoch": 0.19, + "grad_norm": 4.086888289084342, + "learning_rate": 9.352108609108775e-06, + "loss": 0.3681, + "step": 3005 + }, + { + "epoch": 0.19, + "grad_norm": 2.817461296972661, + "learning_rate": 9.351607115868379e-06, + "loss": 0.3162, + "step": 3006 + }, + { + "epoch": 0.19, + "grad_norm": 2.6842867164610493, + "learning_rate": 9.351105442072077e-06, + "loss": 0.3308, + "step": 3007 + }, + { + "epoch": 0.19, + "grad_norm": 7.074557487684647, + "learning_rate": 9.350603587740681e-06, + "loss": 0.3518, + "step": 3008 + }, + { + "epoch": 0.19, + "grad_norm": 9.494142419307336, + "learning_rate": 9.350101552895015e-06, + "loss": 0.336, + "step": 3009 + }, + { + "epoch": 0.19, + "grad_norm": 3.646759380098731, + "learning_rate": 9.349599337555908e-06, + "loss": 0.3509, + "step": 3010 + }, + { + "epoch": 0.19, + "grad_norm": 4.209083844362035, + "learning_rate": 9.3490969417442e-06, + "loss": 0.3559, + "step": 3011 + }, + { + "epoch": 0.19, + "grad_norm": 0.8798719763971381, + "learning_rate": 9.348594365480731e-06, + "loss": 0.5078, + "step": 3012 + }, + { + "epoch": 0.19, + "grad_norm": 2.850350505727774, + "learning_rate": 9.348091608786362e-06, + "loss": 0.3135, + "step": 3013 + }, + { + "epoch": 0.19, + "grad_norm": 2.3426099933361617, + "learning_rate": 9.347588671681949e-06, + "loss": 0.3241, + "step": 3014 + }, + { + "epoch": 0.19, + "grad_norm": 4.129191082772908, + "learning_rate": 9.347085554188358e-06, + "loss": 0.3431, + "step": 3015 + }, + { + "epoch": 0.19, + "grad_norm": 3.3763864590358255, + "learning_rate": 9.346582256326466e-06, + "loss": 0.3512, + "step": 3016 + }, + { + "epoch": 0.19, + "grad_norm": 5.5564846425568994, + "learning_rate": 9.346078778117157e-06, + "loss": 0.3552, + "step": 3017 + }, + { + "epoch": 0.19, + "grad_norm": 2.6375082356956074, + "learning_rate": 9.34557511958132e-06, + "loss": 0.3562, + "step": 3018 + }, + { + "epoch": 0.19, + "grad_norm": 6.675383635989742, + "learning_rate": 9.345071280739853e-06, + "loss": 0.3266, + "step": 3019 + }, + { + "epoch": 0.19, + "grad_norm": 3.8312994945653984, + "learning_rate": 9.34456726161366e-06, + "loss": 0.3541, + "step": 3020 + }, + { + "epoch": 0.19, + "grad_norm": 2.981447370141572, + "learning_rate": 9.344063062223653e-06, + "loss": 0.3316, + "step": 3021 + }, + { + "epoch": 0.19, + "grad_norm": 3.3336506962385037, + "learning_rate": 9.343558682590757e-06, + "loss": 0.3407, + "step": 3022 + }, + { + "epoch": 0.19, + "grad_norm": 3.3114342319151735, + "learning_rate": 9.343054122735894e-06, + "loss": 0.3312, + "step": 3023 + }, + { + "epoch": 0.19, + "grad_norm": 3.9229177179769605, + "learning_rate": 9.342549382680002e-06, + "loss": 0.3311, + "step": 3024 + }, + { + "epoch": 0.19, + "grad_norm": 5.629641199016785, + "learning_rate": 9.342044462444023e-06, + "loss": 0.3182, + "step": 3025 + }, + { + "epoch": 0.19, + "grad_norm": 4.004324983753988, + "learning_rate": 9.341539362048906e-06, + "loss": 0.3424, + "step": 3026 + }, + { + "epoch": 0.19, + "grad_norm": 4.57432765267654, + "learning_rate": 9.34103408151561e-06, + "loss": 0.3384, + "step": 3027 + }, + { + "epoch": 0.19, + "grad_norm": 3.6515450597736896, + "learning_rate": 9.340528620865099e-06, + "loss": 0.3425, + "step": 3028 + }, + { + "epoch": 0.19, + "grad_norm": 7.024417049623289, + "learning_rate": 9.340022980118346e-06, + "loss": 0.3551, + "step": 3029 + }, + { + "epoch": 0.19, + "grad_norm": 2.823128998136369, + "learning_rate": 9.33951715929633e-06, + "loss": 0.307, + "step": 3030 + }, + { + "epoch": 0.19, + "grad_norm": 2.302761290109694, + "learning_rate": 9.339011158420042e-06, + "loss": 0.321, + "step": 3031 + }, + { + "epoch": 0.19, + "grad_norm": 2.7861581439453387, + "learning_rate": 9.338504977510471e-06, + "loss": 0.3333, + "step": 3032 + }, + { + "epoch": 0.19, + "grad_norm": 6.255436023938514, + "learning_rate": 9.337998616588624e-06, + "loss": 0.3197, + "step": 3033 + }, + { + "epoch": 0.19, + "grad_norm": 3.7986968576143894, + "learning_rate": 9.337492075675509e-06, + "loss": 0.354, + "step": 3034 + }, + { + "epoch": 0.19, + "grad_norm": 3.427175316812047, + "learning_rate": 9.336985354792143e-06, + "loss": 0.3422, + "step": 3035 + }, + { + "epoch": 0.19, + "grad_norm": 6.262349587614101, + "learning_rate": 9.336478453959552e-06, + "loss": 0.3455, + "step": 3036 + }, + { + "epoch": 0.19, + "grad_norm": 5.892854706867287, + "learning_rate": 9.335971373198768e-06, + "loss": 0.345, + "step": 3037 + }, + { + "epoch": 0.19, + "grad_norm": 7.66963503363804, + "learning_rate": 9.33546411253083e-06, + "loss": 0.333, + "step": 3038 + }, + { + "epoch": 0.19, + "grad_norm": 2.724888961257912, + "learning_rate": 9.334956671976784e-06, + "loss": 0.3304, + "step": 3039 + }, + { + "epoch": 0.19, + "grad_norm": 3.5438924357371304, + "learning_rate": 9.334449051557687e-06, + "loss": 0.3718, + "step": 3040 + }, + { + "epoch": 0.19, + "grad_norm": 5.697884305935806, + "learning_rate": 9.3339412512946e-06, + "loss": 0.3305, + "step": 3041 + }, + { + "epoch": 0.19, + "grad_norm": 9.46906054185119, + "learning_rate": 9.333433271208592e-06, + "loss": 0.3239, + "step": 3042 + }, + { + "epoch": 0.19, + "grad_norm": 3.9475025359327924, + "learning_rate": 9.332925111320741e-06, + "loss": 0.3392, + "step": 3043 + }, + { + "epoch": 0.19, + "grad_norm": 3.746327794218007, + "learning_rate": 9.33241677165213e-06, + "loss": 0.3452, + "step": 3044 + }, + { + "epoch": 0.19, + "grad_norm": 3.7648031217775895, + "learning_rate": 9.331908252223853e-06, + "loss": 0.3245, + "step": 3045 + }, + { + "epoch": 0.19, + "grad_norm": 3.079432984670926, + "learning_rate": 9.331399553057008e-06, + "loss": 0.3239, + "step": 3046 + }, + { + "epoch": 0.19, + "grad_norm": 4.08175775219486, + "learning_rate": 9.330890674172703e-06, + "loss": 0.3445, + "step": 3047 + }, + { + "epoch": 0.19, + "grad_norm": 0.8930370994396265, + "learning_rate": 9.33038161559205e-06, + "loss": 0.5207, + "step": 3048 + }, + { + "epoch": 0.19, + "grad_norm": 4.035597248186252, + "learning_rate": 9.329872377336173e-06, + "loss": 0.3343, + "step": 3049 + }, + { + "epoch": 0.19, + "grad_norm": 7.783789110657642, + "learning_rate": 9.3293629594262e-06, + "loss": 0.3643, + "step": 3050 + }, + { + "epoch": 0.19, + "grad_norm": 4.452415922961428, + "learning_rate": 9.328853361883268e-06, + "loss": 0.3556, + "step": 3051 + }, + { + "epoch": 0.19, + "grad_norm": 3.4411137902450526, + "learning_rate": 9.32834358472852e-06, + "loss": 0.3363, + "step": 3052 + }, + { + "epoch": 0.19, + "grad_norm": 4.742630188402976, + "learning_rate": 9.327833627983111e-06, + "loss": 0.3372, + "step": 3053 + }, + { + "epoch": 0.19, + "grad_norm": 3.1818245706896975, + "learning_rate": 9.327323491668197e-06, + "loss": 0.3173, + "step": 3054 + }, + { + "epoch": 0.19, + "grad_norm": 4.266553037326733, + "learning_rate": 9.326813175804943e-06, + "loss": 0.3431, + "step": 3055 + }, + { + "epoch": 0.19, + "grad_norm": 3.3925988723286973, + "learning_rate": 9.326302680414527e-06, + "loss": 0.3535, + "step": 3056 + }, + { + "epoch": 0.19, + "grad_norm": 2.8704980127405775, + "learning_rate": 9.325792005518128e-06, + "loss": 0.3413, + "step": 3057 + }, + { + "epoch": 0.19, + "grad_norm": 3.718238107609266, + "learning_rate": 9.325281151136936e-06, + "loss": 0.3143, + "step": 3058 + }, + { + "epoch": 0.19, + "grad_norm": 3.0086727732829504, + "learning_rate": 9.324770117292146e-06, + "loss": 0.3355, + "step": 3059 + }, + { + "epoch": 0.19, + "grad_norm": 3.2553592507183766, + "learning_rate": 9.324258904004961e-06, + "loss": 0.3232, + "step": 3060 + }, + { + "epoch": 0.19, + "grad_norm": 3.2727486642764254, + "learning_rate": 9.323747511296594e-06, + "loss": 0.3667, + "step": 3061 + }, + { + "epoch": 0.19, + "grad_norm": 3.191618268891413, + "learning_rate": 9.323235939188265e-06, + "loss": 0.3226, + "step": 3062 + }, + { + "epoch": 0.19, + "grad_norm": 2.8422933989115813, + "learning_rate": 9.322724187701195e-06, + "loss": 0.3238, + "step": 3063 + }, + { + "epoch": 0.19, + "grad_norm": 2.052104265411854, + "learning_rate": 9.322212256856622e-06, + "loss": 0.3272, + "step": 3064 + }, + { + "epoch": 0.19, + "grad_norm": 1.0366610133851966, + "learning_rate": 9.321700146675785e-06, + "loss": 0.5136, + "step": 3065 + }, + { + "epoch": 0.19, + "grad_norm": 23.899692312448913, + "learning_rate": 9.321187857179932e-06, + "loss": 0.3569, + "step": 3066 + }, + { + "epoch": 0.19, + "grad_norm": 4.692567049351729, + "learning_rate": 9.320675388390321e-06, + "loss": 0.3685, + "step": 3067 + }, + { + "epoch": 0.19, + "grad_norm": 4.750752151923033, + "learning_rate": 9.320162740328211e-06, + "loss": 0.3474, + "step": 3068 + }, + { + "epoch": 0.19, + "grad_norm": 3.9846018748256262, + "learning_rate": 9.319649913014878e-06, + "loss": 0.3264, + "step": 3069 + }, + { + "epoch": 0.19, + "grad_norm": 4.545901753362208, + "learning_rate": 9.319136906471598e-06, + "loss": 0.3371, + "step": 3070 + }, + { + "epoch": 0.19, + "grad_norm": 2.407767762102129, + "learning_rate": 9.318623720719654e-06, + "loss": 0.3256, + "step": 3071 + }, + { + "epoch": 0.19, + "grad_norm": 2.6079027392657275, + "learning_rate": 9.318110355780342e-06, + "loss": 0.3315, + "step": 3072 + }, + { + "epoch": 0.19, + "grad_norm": 4.657985241783449, + "learning_rate": 9.317596811674962e-06, + "loss": 0.3201, + "step": 3073 + }, + { + "epoch": 0.19, + "grad_norm": 4.4748325443645935, + "learning_rate": 9.317083088424822e-06, + "loss": 0.3421, + "step": 3074 + }, + { + "epoch": 0.19, + "grad_norm": 2.736417483683428, + "learning_rate": 9.316569186051234e-06, + "loss": 0.316, + "step": 3075 + }, + { + "epoch": 0.19, + "grad_norm": 4.767032320678666, + "learning_rate": 9.316055104575525e-06, + "loss": 0.353, + "step": 3076 + }, + { + "epoch": 0.19, + "grad_norm": 16.42877438132356, + "learning_rate": 9.315540844019025e-06, + "loss": 0.3299, + "step": 3077 + }, + { + "epoch": 0.19, + "grad_norm": 3.1262551084630785, + "learning_rate": 9.315026404403068e-06, + "loss": 0.3619, + "step": 3078 + }, + { + "epoch": 0.19, + "grad_norm": 2.8639683136116036, + "learning_rate": 9.314511785749001e-06, + "loss": 0.3223, + "step": 3079 + }, + { + "epoch": 0.19, + "grad_norm": 3.035077716440803, + "learning_rate": 9.313996988078178e-06, + "loss": 0.3354, + "step": 3080 + }, + { + "epoch": 0.19, + "grad_norm": 4.395584925167177, + "learning_rate": 9.313482011411957e-06, + "loss": 0.332, + "step": 3081 + }, + { + "epoch": 0.19, + "grad_norm": 2.851017873781992, + "learning_rate": 9.312966855771704e-06, + "loss": 0.3354, + "step": 3082 + }, + { + "epoch": 0.19, + "grad_norm": 4.727301729924524, + "learning_rate": 9.312451521178798e-06, + "loss": 0.3383, + "step": 3083 + }, + { + "epoch": 0.19, + "grad_norm": 2.5206288984760876, + "learning_rate": 9.311936007654618e-06, + "loss": 0.3465, + "step": 3084 + }, + { + "epoch": 0.19, + "grad_norm": 2.5301215803483834, + "learning_rate": 9.311420315220553e-06, + "loss": 0.3325, + "step": 3085 + }, + { + "epoch": 0.19, + "grad_norm": 2.107286661096052, + "learning_rate": 9.310904443898e-06, + "loss": 0.3394, + "step": 3086 + }, + { + "epoch": 0.19, + "grad_norm": 2.430047693361223, + "learning_rate": 9.310388393708368e-06, + "loss": 0.3134, + "step": 3087 + }, + { + "epoch": 0.19, + "grad_norm": 3.0517604275704704, + "learning_rate": 9.309872164673063e-06, + "loss": 0.3392, + "step": 3088 + }, + { + "epoch": 0.19, + "grad_norm": 0.8432433407508314, + "learning_rate": 9.30935575681351e-06, + "loss": 0.4864, + "step": 3089 + }, + { + "epoch": 0.19, + "grad_norm": 3.094975924775508, + "learning_rate": 9.30883917015113e-06, + "loss": 0.3317, + "step": 3090 + }, + { + "epoch": 0.19, + "grad_norm": 2.5260535063997995, + "learning_rate": 9.308322404707359e-06, + "loss": 0.3371, + "step": 3091 + }, + { + "epoch": 0.19, + "grad_norm": 3.746233518227412, + "learning_rate": 9.307805460503639e-06, + "loss": 0.343, + "step": 3092 + }, + { + "epoch": 0.19, + "grad_norm": 3.940903599970333, + "learning_rate": 9.30728833756142e-06, + "loss": 0.3205, + "step": 3093 + }, + { + "epoch": 0.19, + "grad_norm": 6.4481602653343, + "learning_rate": 9.306771035902155e-06, + "loss": 0.315, + "step": 3094 + }, + { + "epoch": 0.19, + "grad_norm": 0.6580255121552712, + "learning_rate": 9.306253555547313e-06, + "loss": 0.4848, + "step": 3095 + }, + { + "epoch": 0.19, + "grad_norm": 2.9432766208942893, + "learning_rate": 9.30573589651836e-06, + "loss": 0.3398, + "step": 3096 + }, + { + "epoch": 0.19, + "grad_norm": 2.5327925577779693, + "learning_rate": 9.305218058836778e-06, + "loss": 0.341, + "step": 3097 + }, + { + "epoch": 0.19, + "grad_norm": 0.6333683879285672, + "learning_rate": 9.304700042524052e-06, + "loss": 0.4821, + "step": 3098 + }, + { + "epoch": 0.19, + "grad_norm": 0.6309725759518167, + "learning_rate": 9.304181847601675e-06, + "loss": 0.5041, + "step": 3099 + }, + { + "epoch": 0.19, + "grad_norm": 4.026104691477374, + "learning_rate": 9.303663474091146e-06, + "loss": 0.3304, + "step": 3100 + }, + { + "epoch": 0.2, + "grad_norm": 2.704467245912126, + "learning_rate": 9.303144922013979e-06, + "loss": 0.3496, + "step": 3101 + }, + { + "epoch": 0.2, + "grad_norm": 2.9587455227385404, + "learning_rate": 9.302626191391684e-06, + "loss": 0.3582, + "step": 3102 + }, + { + "epoch": 0.2, + "grad_norm": 2.1080186806649497, + "learning_rate": 9.302107282245785e-06, + "loss": 0.3252, + "step": 3103 + }, + { + "epoch": 0.2, + "grad_norm": 3.911883259041764, + "learning_rate": 9.301588194597815e-06, + "loss": 0.3305, + "step": 3104 + }, + { + "epoch": 0.2, + "grad_norm": 2.475237511109905, + "learning_rate": 9.30106892846931e-06, + "loss": 0.3291, + "step": 3105 + }, + { + "epoch": 0.2, + "grad_norm": 6.438939667144132, + "learning_rate": 9.300549483881816e-06, + "loss": 0.3584, + "step": 3106 + }, + { + "epoch": 0.2, + "grad_norm": 5.738246580062574, + "learning_rate": 9.300029860856886e-06, + "loss": 0.3348, + "step": 3107 + }, + { + "epoch": 0.2, + "grad_norm": 43.85318841066162, + "learning_rate": 9.299510059416077e-06, + "loss": 0.3466, + "step": 3108 + }, + { + "epoch": 0.2, + "grad_norm": 4.173025583594784, + "learning_rate": 9.298990079580959e-06, + "loss": 0.3364, + "step": 3109 + }, + { + "epoch": 0.2, + "grad_norm": 8.399942685315112, + "learning_rate": 9.298469921373108e-06, + "loss": 0.3367, + "step": 3110 + }, + { + "epoch": 0.2, + "grad_norm": 4.2835717012918515, + "learning_rate": 9.297949584814105e-06, + "loss": 0.3471, + "step": 3111 + }, + { + "epoch": 0.2, + "grad_norm": 2.326600887265767, + "learning_rate": 9.297429069925539e-06, + "loss": 0.3504, + "step": 3112 + }, + { + "epoch": 0.2, + "grad_norm": 5.999845651609561, + "learning_rate": 9.296908376729009e-06, + "loss": 0.352, + "step": 3113 + }, + { + "epoch": 0.2, + "grad_norm": 3.3929357335956967, + "learning_rate": 9.296387505246116e-06, + "loss": 0.3213, + "step": 3114 + }, + { + "epoch": 0.2, + "grad_norm": 3.3982066984923813, + "learning_rate": 9.295866455498477e-06, + "loss": 0.3705, + "step": 3115 + }, + { + "epoch": 0.2, + "grad_norm": 5.6702218223034135, + "learning_rate": 9.295345227507707e-06, + "loss": 0.3333, + "step": 3116 + }, + { + "epoch": 0.2, + "grad_norm": 5.803485576760056, + "learning_rate": 9.294823821295433e-06, + "loss": 0.3195, + "step": 3117 + }, + { + "epoch": 0.2, + "grad_norm": 1.9781165440263238, + "learning_rate": 9.294302236883293e-06, + "loss": 0.3333, + "step": 3118 + }, + { + "epoch": 0.2, + "grad_norm": 0.9132682386226283, + "learning_rate": 9.293780474292923e-06, + "loss": 0.5514, + "step": 3119 + }, + { + "epoch": 0.2, + "grad_norm": 2.9893112971431743, + "learning_rate": 9.293258533545975e-06, + "loss": 0.3381, + "step": 3120 + }, + { + "epoch": 0.2, + "grad_norm": 2.947785338266833, + "learning_rate": 9.292736414664105e-06, + "loss": 0.3055, + "step": 3121 + }, + { + "epoch": 0.2, + "grad_norm": 2.423557293362017, + "learning_rate": 9.292214117668979e-06, + "loss": 0.3434, + "step": 3122 + }, + { + "epoch": 0.2, + "grad_norm": 3.0290706973690344, + "learning_rate": 9.291691642582262e-06, + "loss": 0.3491, + "step": 3123 + }, + { + "epoch": 0.2, + "grad_norm": 22.13754401247412, + "learning_rate": 9.291168989425636e-06, + "loss": 0.3557, + "step": 3124 + }, + { + "epoch": 0.2, + "grad_norm": 2.111758513326185, + "learning_rate": 9.290646158220789e-06, + "loss": 0.3426, + "step": 3125 + }, + { + "epoch": 0.2, + "grad_norm": 3.1929741948548553, + "learning_rate": 9.29012314898941e-06, + "loss": 0.3313, + "step": 3126 + }, + { + "epoch": 0.2, + "grad_norm": 4.386553889091693, + "learning_rate": 9.289599961753201e-06, + "loss": 0.33, + "step": 3127 + }, + { + "epoch": 0.2, + "grad_norm": 5.792576308994678, + "learning_rate": 9.289076596533873e-06, + "loss": 0.3516, + "step": 3128 + }, + { + "epoch": 0.2, + "grad_norm": 2.7360230492857958, + "learning_rate": 9.288553053353136e-06, + "loss": 0.3513, + "step": 3129 + }, + { + "epoch": 0.2, + "grad_norm": 4.696616546068349, + "learning_rate": 9.288029332232718e-06, + "loss": 0.3338, + "step": 3130 + }, + { + "epoch": 0.2, + "grad_norm": 2.458912832502881, + "learning_rate": 9.287505433194344e-06, + "loss": 0.3319, + "step": 3131 + }, + { + "epoch": 0.2, + "grad_norm": 3.5319191980373486, + "learning_rate": 9.286981356259756e-06, + "loss": 0.3549, + "step": 3132 + }, + { + "epoch": 0.2, + "grad_norm": 2.5404540188621425, + "learning_rate": 9.286457101450695e-06, + "loss": 0.333, + "step": 3133 + }, + { + "epoch": 0.2, + "grad_norm": 0.8531424140218881, + "learning_rate": 9.285932668788917e-06, + "loss": 0.507, + "step": 3134 + }, + { + "epoch": 0.2, + "grad_norm": 2.499563968202511, + "learning_rate": 9.28540805829618e-06, + "loss": 0.3394, + "step": 3135 + }, + { + "epoch": 0.2, + "grad_norm": 3.601357275161658, + "learning_rate": 9.284883269994249e-06, + "loss": 0.3343, + "step": 3136 + }, + { + "epoch": 0.2, + "grad_norm": 3.4033310464057345, + "learning_rate": 9.284358303904902e-06, + "loss": 0.3346, + "step": 3137 + }, + { + "epoch": 0.2, + "grad_norm": 3.5510444402774155, + "learning_rate": 9.28383316004992e-06, + "loss": 0.3308, + "step": 3138 + }, + { + "epoch": 0.2, + "grad_norm": 0.6581716057280045, + "learning_rate": 9.28330783845109e-06, + "loss": 0.5153, + "step": 3139 + }, + { + "epoch": 0.2, + "grad_norm": 5.766367521467626, + "learning_rate": 9.28278233913021e-06, + "loss": 0.3639, + "step": 3140 + }, + { + "epoch": 0.2, + "grad_norm": 2.648176503325645, + "learning_rate": 9.282256662109082e-06, + "loss": 0.3282, + "step": 3141 + }, + { + "epoch": 0.2, + "grad_norm": 3.252889421194652, + "learning_rate": 9.281730807409522e-06, + "loss": 0.345, + "step": 3142 + }, + { + "epoch": 0.2, + "grad_norm": 2.567440367216447, + "learning_rate": 9.281204775053342e-06, + "loss": 0.3296, + "step": 3143 + }, + { + "epoch": 0.2, + "grad_norm": 3.164396048956933, + "learning_rate": 9.280678565062375e-06, + "loss": 0.3431, + "step": 3144 + }, + { + "epoch": 0.2, + "grad_norm": 1.8443810127063882, + "learning_rate": 9.280152177458449e-06, + "loss": 0.3378, + "step": 3145 + }, + { + "epoch": 0.2, + "grad_norm": 3.347160350923574, + "learning_rate": 9.279625612263407e-06, + "loss": 0.3405, + "step": 3146 + }, + { + "epoch": 0.2, + "grad_norm": 4.193585439101035, + "learning_rate": 9.279098869499097e-06, + "loss": 0.3184, + "step": 3147 + }, + { + "epoch": 0.2, + "grad_norm": 3.1915934188963817, + "learning_rate": 9.278571949187376e-06, + "loss": 0.3387, + "step": 3148 + }, + { + "epoch": 0.2, + "grad_norm": 1.981672700854984, + "learning_rate": 9.278044851350103e-06, + "loss": 0.3314, + "step": 3149 + }, + { + "epoch": 0.2, + "grad_norm": 0.8244792300051936, + "learning_rate": 9.277517576009152e-06, + "loss": 0.5116, + "step": 3150 + }, + { + "epoch": 0.2, + "grad_norm": 3.0340519588206303, + "learning_rate": 9.276990123186397e-06, + "loss": 0.3521, + "step": 3151 + }, + { + "epoch": 0.2, + "grad_norm": 0.6617316786693868, + "learning_rate": 9.276462492903726e-06, + "loss": 0.4951, + "step": 3152 + }, + { + "epoch": 0.2, + "grad_norm": 2.3169855087654603, + "learning_rate": 9.275934685183032e-06, + "loss": 0.3192, + "step": 3153 + }, + { + "epoch": 0.2, + "grad_norm": 3.0251337233818814, + "learning_rate": 9.275406700046211e-06, + "loss": 0.3288, + "step": 3154 + }, + { + "epoch": 0.2, + "grad_norm": 3.1191920647426374, + "learning_rate": 9.274878537515173e-06, + "loss": 0.358, + "step": 3155 + }, + { + "epoch": 0.2, + "grad_norm": 3.0121696385723533, + "learning_rate": 9.274350197611832e-06, + "loss": 0.3382, + "step": 3156 + }, + { + "epoch": 0.2, + "grad_norm": 2.0404721997473247, + "learning_rate": 9.273821680358107e-06, + "loss": 0.3168, + "step": 3157 + }, + { + "epoch": 0.2, + "grad_norm": 3.273886983149378, + "learning_rate": 9.27329298577593e-06, + "loss": 0.3313, + "step": 3158 + }, + { + "epoch": 0.2, + "grad_norm": 3.057467819196444, + "learning_rate": 9.272764113887237e-06, + "loss": 0.3255, + "step": 3159 + }, + { + "epoch": 0.2, + "grad_norm": 2.2025889870920823, + "learning_rate": 9.272235064713974e-06, + "loss": 0.3302, + "step": 3160 + }, + { + "epoch": 0.2, + "grad_norm": 9.935982182351442, + "learning_rate": 9.271705838278086e-06, + "loss": 0.3437, + "step": 3161 + }, + { + "epoch": 0.2, + "grad_norm": 1.7691037434104837, + "learning_rate": 9.27117643460154e-06, + "loss": 0.3425, + "step": 3162 + }, + { + "epoch": 0.2, + "grad_norm": 2.309836296686852, + "learning_rate": 9.270646853706293e-06, + "loss": 0.3575, + "step": 3163 + }, + { + "epoch": 0.2, + "grad_norm": 2.9099254762908546, + "learning_rate": 9.270117095614324e-06, + "loss": 0.3441, + "step": 3164 + }, + { + "epoch": 0.2, + "grad_norm": 3.3344908505501163, + "learning_rate": 9.269587160347612e-06, + "loss": 0.3233, + "step": 3165 + }, + { + "epoch": 0.2, + "grad_norm": 1.9357850985698077, + "learning_rate": 9.269057047928144e-06, + "loss": 0.3472, + "step": 3166 + }, + { + "epoch": 0.2, + "grad_norm": 3.8801315563559973, + "learning_rate": 9.268526758377919e-06, + "loss": 0.3638, + "step": 3167 + }, + { + "epoch": 0.2, + "grad_norm": 2.7230291838491256, + "learning_rate": 9.267996291718936e-06, + "loss": 0.3573, + "step": 3168 + }, + { + "epoch": 0.2, + "grad_norm": 9.07079599260868, + "learning_rate": 9.267465647973206e-06, + "loss": 0.3545, + "step": 3169 + }, + { + "epoch": 0.2, + "grad_norm": 2.1648255080740424, + "learning_rate": 9.266934827162746e-06, + "loss": 0.34, + "step": 3170 + }, + { + "epoch": 0.2, + "grad_norm": 2.1379391469547526, + "learning_rate": 9.26640382930958e-06, + "loss": 0.3466, + "step": 3171 + }, + { + "epoch": 0.2, + "grad_norm": 1.8543325603940297, + "learning_rate": 9.265872654435743e-06, + "loss": 0.3387, + "step": 3172 + }, + { + "epoch": 0.2, + "grad_norm": 1.8881740805926435, + "learning_rate": 9.26534130256327e-06, + "loss": 0.3239, + "step": 3173 + }, + { + "epoch": 0.2, + "grad_norm": 3.49069838949244, + "learning_rate": 9.264809773714214e-06, + "loss": 0.3218, + "step": 3174 + }, + { + "epoch": 0.2, + "grad_norm": 2.6883041794621128, + "learning_rate": 9.264278067910625e-06, + "loss": 0.3517, + "step": 3175 + }, + { + "epoch": 0.2, + "grad_norm": 4.842922373876776, + "learning_rate": 9.263746185174562e-06, + "loss": 0.3417, + "step": 3176 + }, + { + "epoch": 0.2, + "grad_norm": 2.839745658010722, + "learning_rate": 9.263214125528097e-06, + "loss": 0.3349, + "step": 3177 + }, + { + "epoch": 0.2, + "grad_norm": 3.713975034051698, + "learning_rate": 9.262681888993306e-06, + "loss": 0.3444, + "step": 3178 + }, + { + "epoch": 0.2, + "grad_norm": 2.4326683224951275, + "learning_rate": 9.262149475592272e-06, + "loss": 0.3554, + "step": 3179 + }, + { + "epoch": 0.2, + "grad_norm": 3.1903551994351953, + "learning_rate": 9.261616885347087e-06, + "loss": 0.3523, + "step": 3180 + }, + { + "epoch": 0.2, + "grad_norm": 11.988480030316767, + "learning_rate": 9.261084118279846e-06, + "loss": 0.3268, + "step": 3181 + }, + { + "epoch": 0.2, + "grad_norm": 3.1729441910723652, + "learning_rate": 9.26055117441266e-06, + "loss": 0.3306, + "step": 3182 + }, + { + "epoch": 0.2, + "grad_norm": 1.8335747055425395, + "learning_rate": 9.260018053767634e-06, + "loss": 0.3294, + "step": 3183 + }, + { + "epoch": 0.2, + "grad_norm": 2.483853626654878, + "learning_rate": 9.259484756366894e-06, + "loss": 0.3334, + "step": 3184 + }, + { + "epoch": 0.2, + "grad_norm": 1.8398749503857479, + "learning_rate": 9.258951282232567e-06, + "loss": 0.3409, + "step": 3185 + }, + { + "epoch": 0.2, + "grad_norm": 3.3869859868712218, + "learning_rate": 9.258417631386784e-06, + "loss": 0.3438, + "step": 3186 + }, + { + "epoch": 0.2, + "grad_norm": 5.067702151113323, + "learning_rate": 9.257883803851692e-06, + "loss": 0.3668, + "step": 3187 + }, + { + "epoch": 0.2, + "grad_norm": 3.1551763649208757, + "learning_rate": 9.257349799649437e-06, + "loss": 0.3519, + "step": 3188 + }, + { + "epoch": 0.2, + "grad_norm": 2.8092883233789556, + "learning_rate": 9.256815618802178e-06, + "loss": 0.3489, + "step": 3189 + }, + { + "epoch": 0.2, + "grad_norm": 2.8746975925359735, + "learning_rate": 9.256281261332076e-06, + "loss": 0.3201, + "step": 3190 + }, + { + "epoch": 0.2, + "grad_norm": 3.940141382151419, + "learning_rate": 9.255746727261305e-06, + "loss": 0.3537, + "step": 3191 + }, + { + "epoch": 0.2, + "grad_norm": 3.6974583371840613, + "learning_rate": 9.255212016612044e-06, + "loss": 0.3213, + "step": 3192 + }, + { + "epoch": 0.2, + "grad_norm": 8.844741808366749, + "learning_rate": 9.254677129406477e-06, + "loss": 0.3473, + "step": 3193 + }, + { + "epoch": 0.2, + "grad_norm": 3.2017702725927175, + "learning_rate": 9.254142065666802e-06, + "loss": 0.3399, + "step": 3194 + }, + { + "epoch": 0.2, + "grad_norm": 2.8349854964968766, + "learning_rate": 9.253606825415213e-06, + "loss": 0.3329, + "step": 3195 + }, + { + "epoch": 0.2, + "grad_norm": 4.461347680543548, + "learning_rate": 9.253071408673924e-06, + "loss": 0.3349, + "step": 3196 + }, + { + "epoch": 0.2, + "grad_norm": 2.728796034177238, + "learning_rate": 9.252535815465146e-06, + "loss": 0.3277, + "step": 3197 + }, + { + "epoch": 0.2, + "grad_norm": 2.1476735913302596, + "learning_rate": 9.252000045811105e-06, + "loss": 0.3488, + "step": 3198 + }, + { + "epoch": 0.2, + "grad_norm": 2.572846878344725, + "learning_rate": 9.25146409973403e-06, + "loss": 0.3329, + "step": 3199 + }, + { + "epoch": 0.2, + "grad_norm": 47.92774792019559, + "learning_rate": 9.250927977256155e-06, + "loss": 0.3573, + "step": 3200 + }, + { + "epoch": 0.2, + "grad_norm": 6.197579392795628, + "learning_rate": 9.25039167839973e-06, + "loss": 0.3226, + "step": 3201 + }, + { + "epoch": 0.2, + "grad_norm": 2.9942998432191135, + "learning_rate": 9.249855203187007e-06, + "loss": 0.3523, + "step": 3202 + }, + { + "epoch": 0.2, + "grad_norm": 5.009663714699753, + "learning_rate": 9.249318551640238e-06, + "loss": 0.3222, + "step": 3203 + }, + { + "epoch": 0.2, + "grad_norm": 5.592879992005864, + "learning_rate": 9.2487817237817e-06, + "loss": 0.3509, + "step": 3204 + }, + { + "epoch": 0.2, + "grad_norm": 2.7727777628097505, + "learning_rate": 9.248244719633659e-06, + "loss": 0.3264, + "step": 3205 + }, + { + "epoch": 0.2, + "grad_norm": 6.90503630217772, + "learning_rate": 9.247707539218398e-06, + "loss": 0.3486, + "step": 3206 + }, + { + "epoch": 0.2, + "grad_norm": 6.495555896714801, + "learning_rate": 9.247170182558207e-06, + "loss": 0.3121, + "step": 3207 + }, + { + "epoch": 0.2, + "grad_norm": 3.186933314109556, + "learning_rate": 9.246632649675382e-06, + "loss": 0.3618, + "step": 3208 + }, + { + "epoch": 0.2, + "grad_norm": 3.0725383590343203, + "learning_rate": 9.246094940592224e-06, + "loss": 0.2995, + "step": 3209 + }, + { + "epoch": 0.2, + "grad_norm": 2.7157495868341974, + "learning_rate": 9.245557055331046e-06, + "loss": 0.346, + "step": 3210 + }, + { + "epoch": 0.2, + "grad_norm": 2.1595383518209115, + "learning_rate": 9.245018993914166e-06, + "loss": 0.32, + "step": 3211 + }, + { + "epoch": 0.2, + "grad_norm": 14.779679331049103, + "learning_rate": 9.244480756363904e-06, + "loss": 0.3803, + "step": 3212 + }, + { + "epoch": 0.2, + "grad_norm": 6.372475177234385, + "learning_rate": 9.243942342702601e-06, + "loss": 0.3582, + "step": 3213 + }, + { + "epoch": 0.2, + "grad_norm": 2.2102588382653683, + "learning_rate": 9.24340375295259e-06, + "loss": 0.3208, + "step": 3214 + }, + { + "epoch": 0.2, + "grad_norm": 3.1949738710810096, + "learning_rate": 9.24286498713622e-06, + "loss": 0.3499, + "step": 3215 + }, + { + "epoch": 0.2, + "grad_norm": 4.516471386721431, + "learning_rate": 9.242326045275846e-06, + "loss": 0.3384, + "step": 3216 + }, + { + "epoch": 0.2, + "grad_norm": 3.246970065640399, + "learning_rate": 9.24178692739383e-06, + "loss": 0.32, + "step": 3217 + }, + { + "epoch": 0.2, + "grad_norm": 2.8097854465160834, + "learning_rate": 9.241247633512539e-06, + "loss": 0.3363, + "step": 3218 + }, + { + "epoch": 0.2, + "grad_norm": 3.9042000808275117, + "learning_rate": 9.240708163654351e-06, + "loss": 0.3487, + "step": 3219 + }, + { + "epoch": 0.2, + "grad_norm": 2.6299503296182896, + "learning_rate": 9.240168517841648e-06, + "loss": 0.3395, + "step": 3220 + }, + { + "epoch": 0.2, + "grad_norm": 2.5934262865465665, + "learning_rate": 9.239628696096823e-06, + "loss": 0.3383, + "step": 3221 + }, + { + "epoch": 0.2, + "grad_norm": 2.4141524686623534, + "learning_rate": 9.239088698442272e-06, + "loss": 0.3247, + "step": 3222 + }, + { + "epoch": 0.2, + "grad_norm": 2.7875869828206374, + "learning_rate": 9.238548524900401e-06, + "loss": 0.3478, + "step": 3223 + }, + { + "epoch": 0.2, + "grad_norm": 5.2898484715310765, + "learning_rate": 9.238008175493625e-06, + "loss": 0.3232, + "step": 3224 + }, + { + "epoch": 0.2, + "grad_norm": 4.4207614453299655, + "learning_rate": 9.23746765024436e-06, + "loss": 0.3275, + "step": 3225 + }, + { + "epoch": 0.2, + "grad_norm": 4.932672407280655, + "learning_rate": 9.236926949175037e-06, + "loss": 0.3167, + "step": 3226 + }, + { + "epoch": 0.2, + "grad_norm": 2.8429470125541183, + "learning_rate": 9.23638607230809e-06, + "loss": 0.3219, + "step": 3227 + }, + { + "epoch": 0.2, + "grad_norm": 4.862949427711048, + "learning_rate": 9.235845019665959e-06, + "loss": 0.3273, + "step": 3228 + }, + { + "epoch": 0.2, + "grad_norm": 3.320392874054271, + "learning_rate": 9.235303791271094e-06, + "loss": 0.3384, + "step": 3229 + }, + { + "epoch": 0.2, + "grad_norm": 7.5602851974905425, + "learning_rate": 9.234762387145952e-06, + "loss": 0.3367, + "step": 3230 + }, + { + "epoch": 0.2, + "grad_norm": 3.0776500161744, + "learning_rate": 9.234220807312998e-06, + "loss": 0.3207, + "step": 3231 + }, + { + "epoch": 0.2, + "grad_norm": 2.8843792434976, + "learning_rate": 9.233679051794701e-06, + "loss": 0.36, + "step": 3232 + }, + { + "epoch": 0.2, + "grad_norm": 2.4680316916760128, + "learning_rate": 9.23313712061354e-06, + "loss": 0.3544, + "step": 3233 + }, + { + "epoch": 0.2, + "grad_norm": 2.0833265338170346, + "learning_rate": 9.232595013792004e-06, + "loss": 0.3251, + "step": 3234 + }, + { + "epoch": 0.2, + "grad_norm": 2.291851370050809, + "learning_rate": 9.232052731352578e-06, + "loss": 0.3223, + "step": 3235 + }, + { + "epoch": 0.2, + "grad_norm": 2.4115139121529108, + "learning_rate": 9.23151027331777e-06, + "loss": 0.3164, + "step": 3236 + }, + { + "epoch": 0.2, + "grad_norm": 2.2953978123927525, + "learning_rate": 9.230967639710085e-06, + "loss": 0.3322, + "step": 3237 + }, + { + "epoch": 0.2, + "grad_norm": 2.742944667903736, + "learning_rate": 9.230424830552035e-06, + "loss": 0.3339, + "step": 3238 + }, + { + "epoch": 0.2, + "grad_norm": 8.666546241009362, + "learning_rate": 9.229881845866148e-06, + "loss": 0.3319, + "step": 3239 + }, + { + "epoch": 0.2, + "grad_norm": 5.7040286816864745, + "learning_rate": 9.229338685674948e-06, + "loss": 0.341, + "step": 3240 + }, + { + "epoch": 0.2, + "grad_norm": 4.363329064621435, + "learning_rate": 9.228795350000977e-06, + "loss": 0.3384, + "step": 3241 + }, + { + "epoch": 0.2, + "grad_norm": 2.861605510467439, + "learning_rate": 9.228251838866772e-06, + "loss": 0.3223, + "step": 3242 + }, + { + "epoch": 0.2, + "grad_norm": 4.940652153111325, + "learning_rate": 9.227708152294889e-06, + "loss": 0.3407, + "step": 3243 + }, + { + "epoch": 0.2, + "grad_norm": 5.22711923987941, + "learning_rate": 9.227164290307887e-06, + "loss": 0.3381, + "step": 3244 + }, + { + "epoch": 0.2, + "grad_norm": 3.673798403637111, + "learning_rate": 9.22662025292833e-06, + "loss": 0.3444, + "step": 3245 + }, + { + "epoch": 0.2, + "grad_norm": 13.555285733453422, + "learning_rate": 9.226076040178788e-06, + "loss": 0.3213, + "step": 3246 + }, + { + "epoch": 0.2, + "grad_norm": 4.820065874417691, + "learning_rate": 9.225531652081848e-06, + "loss": 0.3215, + "step": 3247 + }, + { + "epoch": 0.2, + "grad_norm": 6.12254801547705, + "learning_rate": 9.224987088660094e-06, + "loss": 0.3258, + "step": 3248 + }, + { + "epoch": 0.2, + "grad_norm": 3.7767350258221444, + "learning_rate": 9.22444234993612e-06, + "loss": 0.353, + "step": 3249 + }, + { + "epoch": 0.2, + "grad_norm": 5.323233219968319, + "learning_rate": 9.223897435932532e-06, + "loss": 0.3495, + "step": 3250 + }, + { + "epoch": 0.2, + "grad_norm": 2.0002888207855123, + "learning_rate": 9.223352346671935e-06, + "loss": 0.3204, + "step": 3251 + }, + { + "epoch": 0.2, + "grad_norm": 2.4286053134398724, + "learning_rate": 9.222807082176948e-06, + "loss": 0.3241, + "step": 3252 + }, + { + "epoch": 0.2, + "grad_norm": 2.0042722335846075, + "learning_rate": 9.222261642470194e-06, + "loss": 0.3569, + "step": 3253 + }, + { + "epoch": 0.2, + "grad_norm": 2.923639477056454, + "learning_rate": 9.221716027574306e-06, + "loss": 0.3627, + "step": 3254 + }, + { + "epoch": 0.2, + "grad_norm": 3.0732575508603124, + "learning_rate": 9.22117023751192e-06, + "loss": 0.3112, + "step": 3255 + }, + { + "epoch": 0.2, + "grad_norm": 2.941078040252632, + "learning_rate": 9.220624272305683e-06, + "loss": 0.3574, + "step": 3256 + }, + { + "epoch": 0.2, + "grad_norm": 3.6451251005631082, + "learning_rate": 9.22007813197825e-06, + "loss": 0.3392, + "step": 3257 + }, + { + "epoch": 0.2, + "grad_norm": 14.28083926606095, + "learning_rate": 9.219531816552279e-06, + "loss": 0.3649, + "step": 3258 + }, + { + "epoch": 0.2, + "grad_norm": 7.9430400688118254, + "learning_rate": 9.218985326050439e-06, + "loss": 0.3463, + "step": 3259 + }, + { + "epoch": 0.21, + "grad_norm": 26.166018466888865, + "learning_rate": 9.218438660495401e-06, + "loss": 0.3411, + "step": 3260 + }, + { + "epoch": 0.21, + "grad_norm": 2.6872472537936742, + "learning_rate": 9.217891819909854e-06, + "loss": 0.3653, + "step": 3261 + }, + { + "epoch": 0.21, + "grad_norm": 4.436469040818058, + "learning_rate": 9.217344804316482e-06, + "loss": 0.3364, + "step": 3262 + }, + { + "epoch": 0.21, + "grad_norm": 5.1852007080247144, + "learning_rate": 9.216797613737983e-06, + "loss": 0.3385, + "step": 3263 + }, + { + "epoch": 0.21, + "grad_norm": 3.3268107476088984, + "learning_rate": 9.21625024819706e-06, + "loss": 0.3381, + "step": 3264 + }, + { + "epoch": 0.21, + "grad_norm": 9.110258063796547, + "learning_rate": 9.215702707716427e-06, + "loss": 0.3365, + "step": 3265 + }, + { + "epoch": 0.21, + "grad_norm": 2.7578030950227537, + "learning_rate": 9.2151549923188e-06, + "loss": 0.3405, + "step": 3266 + }, + { + "epoch": 0.21, + "grad_norm": 3.871857191905456, + "learning_rate": 9.214607102026905e-06, + "loss": 0.3576, + "step": 3267 + }, + { + "epoch": 0.21, + "grad_norm": 2.68667289240282, + "learning_rate": 9.214059036863477e-06, + "loss": 0.3373, + "step": 3268 + }, + { + "epoch": 0.21, + "grad_norm": 2.376630280488469, + "learning_rate": 9.213510796851253e-06, + "loss": 0.3487, + "step": 3269 + }, + { + "epoch": 0.21, + "grad_norm": 2.6182677959069554, + "learning_rate": 9.212962382012981e-06, + "loss": 0.3455, + "step": 3270 + }, + { + "epoch": 0.21, + "grad_norm": 6.791047510562209, + "learning_rate": 9.21241379237142e-06, + "loss": 0.3583, + "step": 3271 + }, + { + "epoch": 0.21, + "grad_norm": 8.172415132107194, + "learning_rate": 9.211865027949328e-06, + "loss": 0.3175, + "step": 3272 + }, + { + "epoch": 0.21, + "grad_norm": 2.5181523258848157, + "learning_rate": 9.211316088769473e-06, + "loss": 0.3381, + "step": 3273 + }, + { + "epoch": 0.21, + "grad_norm": 4.622769907928998, + "learning_rate": 9.210766974854634e-06, + "loss": 0.3399, + "step": 3274 + }, + { + "epoch": 0.21, + "grad_norm": 3.3280229125343714, + "learning_rate": 9.210217686227593e-06, + "loss": 0.3402, + "step": 3275 + }, + { + "epoch": 0.21, + "grad_norm": 1.9442166229888342, + "learning_rate": 9.209668222911143e-06, + "loss": 0.341, + "step": 3276 + }, + { + "epoch": 0.21, + "grad_norm": 3.0759456934942504, + "learning_rate": 9.209118584928082e-06, + "loss": 0.3301, + "step": 3277 + }, + { + "epoch": 0.21, + "grad_norm": 3.1493635590756597, + "learning_rate": 9.208568772301213e-06, + "loss": 0.3369, + "step": 3278 + }, + { + "epoch": 0.21, + "grad_norm": 2.8006361457377222, + "learning_rate": 9.208018785053353e-06, + "loss": 0.3372, + "step": 3279 + }, + { + "epoch": 0.21, + "grad_norm": 3.5474933450527315, + "learning_rate": 9.207468623207317e-06, + "loss": 0.3464, + "step": 3280 + }, + { + "epoch": 0.21, + "grad_norm": 2.7352724708460774, + "learning_rate": 9.206918286785936e-06, + "loss": 0.3218, + "step": 3281 + }, + { + "epoch": 0.21, + "grad_norm": 2.254842152358999, + "learning_rate": 9.206367775812042e-06, + "loss": 0.3443, + "step": 3282 + }, + { + "epoch": 0.21, + "grad_norm": 2.4955303744267425, + "learning_rate": 9.205817090308478e-06, + "loss": 0.3364, + "step": 3283 + }, + { + "epoch": 0.21, + "grad_norm": 2.0817601413132634, + "learning_rate": 9.205266230298092e-06, + "loss": 0.3414, + "step": 3284 + }, + { + "epoch": 0.21, + "grad_norm": 3.109554213401208, + "learning_rate": 9.204715195803742e-06, + "loss": 0.3199, + "step": 3285 + }, + { + "epoch": 0.21, + "grad_norm": 2.138672698305992, + "learning_rate": 9.20416398684829e-06, + "loss": 0.3211, + "step": 3286 + }, + { + "epoch": 0.21, + "grad_norm": 2.1371287351578165, + "learning_rate": 9.203612603454605e-06, + "loss": 0.3382, + "step": 3287 + }, + { + "epoch": 0.21, + "grad_norm": 3.2386771843454425, + "learning_rate": 9.203061045645567e-06, + "loss": 0.3231, + "step": 3288 + }, + { + "epoch": 0.21, + "grad_norm": 5.523871963082305, + "learning_rate": 9.202509313444061e-06, + "loss": 0.3278, + "step": 3289 + }, + { + "epoch": 0.21, + "grad_norm": 2.824293203598094, + "learning_rate": 9.20195740687298e-06, + "loss": 0.3449, + "step": 3290 + }, + { + "epoch": 0.21, + "grad_norm": 8.593839876042038, + "learning_rate": 9.201405325955222e-06, + "loss": 0.3364, + "step": 3291 + }, + { + "epoch": 0.21, + "grad_norm": 2.112760574710892, + "learning_rate": 9.200853070713695e-06, + "loss": 0.3203, + "step": 3292 + }, + { + "epoch": 0.21, + "grad_norm": 2.540605316940034, + "learning_rate": 9.20030064117131e-06, + "loss": 0.3193, + "step": 3293 + }, + { + "epoch": 0.21, + "grad_norm": 2.32568423100732, + "learning_rate": 9.199748037350996e-06, + "loss": 0.3432, + "step": 3294 + }, + { + "epoch": 0.21, + "grad_norm": 4.773215132307938, + "learning_rate": 9.199195259275673e-06, + "loss": 0.3374, + "step": 3295 + }, + { + "epoch": 0.21, + "grad_norm": 6.438571219906153, + "learning_rate": 9.19864230696828e-06, + "loss": 0.3266, + "step": 3296 + }, + { + "epoch": 0.21, + "grad_norm": 13.87962527297526, + "learning_rate": 9.198089180451761e-06, + "loss": 0.3335, + "step": 3297 + }, + { + "epoch": 0.21, + "grad_norm": 7.452510535287162, + "learning_rate": 9.197535879749065e-06, + "loss": 0.3382, + "step": 3298 + }, + { + "epoch": 0.21, + "grad_norm": 2.7645794132909876, + "learning_rate": 9.196982404883147e-06, + "loss": 0.3291, + "step": 3299 + }, + { + "epoch": 0.21, + "grad_norm": 3.1315230713321487, + "learning_rate": 9.196428755876978e-06, + "loss": 0.3582, + "step": 3300 + }, + { + "epoch": 0.21, + "grad_norm": 2.4407918219457003, + "learning_rate": 9.195874932753525e-06, + "loss": 0.3521, + "step": 3301 + }, + { + "epoch": 0.21, + "grad_norm": 6.401129747125721, + "learning_rate": 9.195320935535769e-06, + "loss": 0.3602, + "step": 3302 + }, + { + "epoch": 0.21, + "grad_norm": 2.532072527604524, + "learning_rate": 9.194766764246694e-06, + "loss": 0.3534, + "step": 3303 + }, + { + "epoch": 0.21, + "grad_norm": 5.412806962110028, + "learning_rate": 9.194212418909296e-06, + "loss": 0.3307, + "step": 3304 + }, + { + "epoch": 0.21, + "grad_norm": 2.5199638819278407, + "learning_rate": 9.193657899546575e-06, + "loss": 0.3348, + "step": 3305 + }, + { + "epoch": 0.21, + "grad_norm": 2.5887591176661275, + "learning_rate": 9.19310320618154e-06, + "loss": 0.3424, + "step": 3306 + }, + { + "epoch": 0.21, + "grad_norm": 12.593860703339445, + "learning_rate": 9.192548338837204e-06, + "loss": 0.3307, + "step": 3307 + }, + { + "epoch": 0.21, + "grad_norm": 1.7497839312493255, + "learning_rate": 9.19199329753659e-06, + "loss": 0.3141, + "step": 3308 + }, + { + "epoch": 0.21, + "grad_norm": 1.4448666963415697, + "learning_rate": 9.191438082302731e-06, + "loss": 0.3187, + "step": 3309 + }, + { + "epoch": 0.21, + "grad_norm": 2.2215136403363744, + "learning_rate": 9.190882693158658e-06, + "loss": 0.3298, + "step": 3310 + }, + { + "epoch": 0.21, + "grad_norm": 2.554905590613441, + "learning_rate": 9.19032713012742e-06, + "loss": 0.3416, + "step": 3311 + }, + { + "epoch": 0.21, + "grad_norm": 1.8985134435188125, + "learning_rate": 9.189771393232065e-06, + "loss": 0.3219, + "step": 3312 + }, + { + "epoch": 0.21, + "grad_norm": 2.8571913602771155, + "learning_rate": 9.189215482495655e-06, + "loss": 0.3188, + "step": 3313 + }, + { + "epoch": 0.21, + "grad_norm": 2.0283925225733554, + "learning_rate": 9.188659397941252e-06, + "loss": 0.3381, + "step": 3314 + }, + { + "epoch": 0.21, + "grad_norm": 3.5622267881856042, + "learning_rate": 9.188103139591934e-06, + "loss": 0.3366, + "step": 3315 + }, + { + "epoch": 0.21, + "grad_norm": 1.6204691471689936, + "learning_rate": 9.187546707470773e-06, + "loss": 0.3257, + "step": 3316 + }, + { + "epoch": 0.21, + "grad_norm": 2.479359532002313, + "learning_rate": 9.186990101600865e-06, + "loss": 0.3587, + "step": 3317 + }, + { + "epoch": 0.21, + "grad_norm": 2.5156339811433504, + "learning_rate": 9.186433322005298e-06, + "loss": 0.3252, + "step": 3318 + }, + { + "epoch": 0.21, + "grad_norm": 2.0677644046278445, + "learning_rate": 9.185876368707178e-06, + "loss": 0.3484, + "step": 3319 + }, + { + "epoch": 0.21, + "grad_norm": 1.8363778250072302, + "learning_rate": 9.185319241729614e-06, + "loss": 0.3283, + "step": 3320 + }, + { + "epoch": 0.21, + "grad_norm": 1.5325217068777581, + "learning_rate": 9.184761941095716e-06, + "loss": 0.3328, + "step": 3321 + }, + { + "epoch": 0.21, + "grad_norm": 3.1078622392122868, + "learning_rate": 9.184204466828618e-06, + "loss": 0.3491, + "step": 3322 + }, + { + "epoch": 0.21, + "grad_norm": 3.7331955765680322, + "learning_rate": 9.18364681895144e-06, + "loss": 0.3464, + "step": 3323 + }, + { + "epoch": 0.21, + "grad_norm": 2.1285879808933634, + "learning_rate": 9.183088997487326e-06, + "loss": 0.3405, + "step": 3324 + }, + { + "epoch": 0.21, + "grad_norm": 2.188071343440662, + "learning_rate": 9.18253100245942e-06, + "loss": 0.3391, + "step": 3325 + }, + { + "epoch": 0.21, + "grad_norm": 2.56313134255, + "learning_rate": 9.181972833890875e-06, + "loss": 0.3324, + "step": 3326 + }, + { + "epoch": 0.21, + "grad_norm": 2.0520794722060756, + "learning_rate": 9.181414491804846e-06, + "loss": 0.3396, + "step": 3327 + }, + { + "epoch": 0.21, + "grad_norm": 3.4612046989301044, + "learning_rate": 9.180855976224505e-06, + "loss": 0.3285, + "step": 3328 + }, + { + "epoch": 0.21, + "grad_norm": 2.2254198126880214, + "learning_rate": 9.180297287173022e-06, + "loss": 0.3317, + "step": 3329 + }, + { + "epoch": 0.21, + "grad_norm": 5.358969469056055, + "learning_rate": 9.179738424673582e-06, + "loss": 0.3415, + "step": 3330 + }, + { + "epoch": 0.21, + "grad_norm": 2.3580050157462966, + "learning_rate": 9.179179388749369e-06, + "loss": 0.3421, + "step": 3331 + }, + { + "epoch": 0.21, + "grad_norm": 6.7782296233398425, + "learning_rate": 9.17862017942358e-06, + "loss": 0.3424, + "step": 3332 + }, + { + "epoch": 0.21, + "grad_norm": 3.562563165541226, + "learning_rate": 9.178060796719417e-06, + "loss": 0.3355, + "step": 3333 + }, + { + "epoch": 0.21, + "grad_norm": 2.1177647447941483, + "learning_rate": 9.177501240660091e-06, + "loss": 0.3427, + "step": 3334 + }, + { + "epoch": 0.21, + "grad_norm": 2.871763419278553, + "learning_rate": 9.176941511268818e-06, + "loss": 0.3622, + "step": 3335 + }, + { + "epoch": 0.21, + "grad_norm": 2.036188243176669, + "learning_rate": 9.176381608568824e-06, + "loss": 0.3325, + "step": 3336 + }, + { + "epoch": 0.21, + "grad_norm": 3.487483995921712, + "learning_rate": 9.175821532583338e-06, + "loss": 0.3647, + "step": 3337 + }, + { + "epoch": 0.21, + "grad_norm": 1.7326747344651108, + "learning_rate": 9.1752612833356e-06, + "loss": 0.3313, + "step": 3338 + }, + { + "epoch": 0.21, + "grad_norm": 3.40782656536387, + "learning_rate": 9.174700860848855e-06, + "loss": 0.3335, + "step": 3339 + }, + { + "epoch": 0.21, + "grad_norm": 2.9838042980440616, + "learning_rate": 9.174140265146356e-06, + "loss": 0.3451, + "step": 3340 + }, + { + "epoch": 0.21, + "grad_norm": 2.7016867447401482, + "learning_rate": 9.173579496251363e-06, + "loss": 0.3469, + "step": 3341 + }, + { + "epoch": 0.21, + "grad_norm": 2.090011723773526, + "learning_rate": 9.173018554187145e-06, + "loss": 0.3482, + "step": 3342 + }, + { + "epoch": 0.21, + "grad_norm": 1.8980960308910082, + "learning_rate": 9.172457438976974e-06, + "loss": 0.3282, + "step": 3343 + }, + { + "epoch": 0.21, + "grad_norm": 2.4699340183754797, + "learning_rate": 9.171896150644132e-06, + "loss": 0.3615, + "step": 3344 + }, + { + "epoch": 0.21, + "grad_norm": 8.797645045459058, + "learning_rate": 9.17133468921191e-06, + "loss": 0.3462, + "step": 3345 + }, + { + "epoch": 0.21, + "grad_norm": 1.4314848202489756, + "learning_rate": 9.170773054703603e-06, + "loss": 0.3203, + "step": 3346 + }, + { + "epoch": 0.21, + "grad_norm": 3.872175357632315, + "learning_rate": 9.170211247142514e-06, + "loss": 0.3468, + "step": 3347 + }, + { + "epoch": 0.21, + "grad_norm": 3.2198033083294213, + "learning_rate": 9.169649266551951e-06, + "loss": 0.3393, + "step": 3348 + }, + { + "epoch": 0.21, + "grad_norm": 2.512652006046951, + "learning_rate": 9.169087112955234e-06, + "loss": 0.3391, + "step": 3349 + }, + { + "epoch": 0.21, + "grad_norm": 2.6463453697273636, + "learning_rate": 9.168524786375689e-06, + "loss": 0.3312, + "step": 3350 + }, + { + "epoch": 0.21, + "grad_norm": 3.908131989900112, + "learning_rate": 9.167962286836647e-06, + "loss": 0.3357, + "step": 3351 + }, + { + "epoch": 0.21, + "grad_norm": 1.8293700890744862, + "learning_rate": 9.167399614361445e-06, + "loss": 0.3225, + "step": 3352 + }, + { + "epoch": 0.21, + "grad_norm": 2.0266428828740053, + "learning_rate": 9.166836768973434e-06, + "loss": 0.3274, + "step": 3353 + }, + { + "epoch": 0.21, + "grad_norm": 1.943899273866206, + "learning_rate": 9.16627375069596e-06, + "loss": 0.3369, + "step": 3354 + }, + { + "epoch": 0.21, + "grad_norm": 3.884931678526371, + "learning_rate": 9.16571055955239e-06, + "loss": 0.3465, + "step": 3355 + }, + { + "epoch": 0.21, + "grad_norm": 5.400698594126364, + "learning_rate": 9.165147195566089e-06, + "loss": 0.3294, + "step": 3356 + }, + { + "epoch": 0.21, + "grad_norm": 2.1125453260908458, + "learning_rate": 9.164583658760432e-06, + "loss": 0.3434, + "step": 3357 + }, + { + "epoch": 0.21, + "grad_norm": 3.468269633182379, + "learning_rate": 9.164019949158804e-06, + "loss": 0.3235, + "step": 3358 + }, + { + "epoch": 0.21, + "grad_norm": 3.0730537988756113, + "learning_rate": 9.163456066784591e-06, + "loss": 0.3275, + "step": 3359 + }, + { + "epoch": 0.21, + "grad_norm": 18.284549859365693, + "learning_rate": 9.162892011661192e-06, + "loss": 0.3543, + "step": 3360 + }, + { + "epoch": 0.21, + "grad_norm": 2.5003264599554074, + "learning_rate": 9.162327783812008e-06, + "loss": 0.389, + "step": 3361 + }, + { + "epoch": 0.21, + "grad_norm": 1.9787320970232836, + "learning_rate": 9.161763383260452e-06, + "loss": 0.3177, + "step": 3362 + }, + { + "epoch": 0.21, + "grad_norm": 1.1673964662883662, + "learning_rate": 9.16119881002994e-06, + "loss": 0.5297, + "step": 3363 + }, + { + "epoch": 0.21, + "grad_norm": 2.815700730125225, + "learning_rate": 9.160634064143899e-06, + "loss": 0.3383, + "step": 3364 + }, + { + "epoch": 0.21, + "grad_norm": 1.7103598931713948, + "learning_rate": 9.16006914562576e-06, + "loss": 0.3523, + "step": 3365 + }, + { + "epoch": 0.21, + "grad_norm": 2.024844366786342, + "learning_rate": 9.159504054498964e-06, + "loss": 0.37, + "step": 3366 + }, + { + "epoch": 0.21, + "grad_norm": 2.676963080534276, + "learning_rate": 9.158938790786955e-06, + "loss": 0.3721, + "step": 3367 + }, + { + "epoch": 0.21, + "grad_norm": 3.2917037062206203, + "learning_rate": 9.15837335451319e-06, + "loss": 0.3256, + "step": 3368 + }, + { + "epoch": 0.21, + "grad_norm": 2.657558902634187, + "learning_rate": 9.157807745701128e-06, + "loss": 0.3164, + "step": 3369 + }, + { + "epoch": 0.21, + "grad_norm": 2.6724387004788803, + "learning_rate": 9.157241964374237e-06, + "loss": 0.3387, + "step": 3370 + }, + { + "epoch": 0.21, + "grad_norm": 4.503820732107395, + "learning_rate": 9.156676010555993e-06, + "loss": 0.3328, + "step": 3371 + }, + { + "epoch": 0.21, + "grad_norm": 3.3457756669202174, + "learning_rate": 9.15610988426988e-06, + "loss": 0.3407, + "step": 3372 + }, + { + "epoch": 0.21, + "grad_norm": 2.548715836200099, + "learning_rate": 9.155543585539384e-06, + "loss": 0.3289, + "step": 3373 + }, + { + "epoch": 0.21, + "grad_norm": 3.0279051529499923, + "learning_rate": 9.154977114388003e-06, + "loss": 0.328, + "step": 3374 + }, + { + "epoch": 0.21, + "grad_norm": 3.4596567994902307, + "learning_rate": 9.154410470839243e-06, + "loss": 0.3438, + "step": 3375 + }, + { + "epoch": 0.21, + "grad_norm": 2.859275170217395, + "learning_rate": 9.153843654916611e-06, + "loss": 0.3806, + "step": 3376 + }, + { + "epoch": 0.21, + "grad_norm": 4.131478239727446, + "learning_rate": 9.15327666664363e-06, + "loss": 0.3191, + "step": 3377 + }, + { + "epoch": 0.21, + "grad_norm": 2.7706683433185906, + "learning_rate": 9.152709506043823e-06, + "loss": 0.3402, + "step": 3378 + }, + { + "epoch": 0.21, + "grad_norm": 3.8437933644698665, + "learning_rate": 9.152142173140722e-06, + "loss": 0.3376, + "step": 3379 + }, + { + "epoch": 0.21, + "grad_norm": 5.7778771841816985, + "learning_rate": 9.151574667957868e-06, + "loss": 0.3266, + "step": 3380 + }, + { + "epoch": 0.21, + "grad_norm": 2.5977096609640595, + "learning_rate": 9.151006990518806e-06, + "loss": 0.3291, + "step": 3381 + }, + { + "epoch": 0.21, + "grad_norm": 2.266092286796118, + "learning_rate": 9.150439140847091e-06, + "loss": 0.3347, + "step": 3382 + }, + { + "epoch": 0.21, + "grad_norm": 2.818289659645148, + "learning_rate": 9.149871118966285e-06, + "loss": 0.3379, + "step": 3383 + }, + { + "epoch": 0.21, + "grad_norm": 2.215748932020619, + "learning_rate": 9.149302924899954e-06, + "loss": 0.3164, + "step": 3384 + }, + { + "epoch": 0.21, + "grad_norm": 5.570108372727355, + "learning_rate": 9.148734558671675e-06, + "loss": 0.3327, + "step": 3385 + }, + { + "epoch": 0.21, + "grad_norm": 5.71923744775369, + "learning_rate": 9.14816602030503e-06, + "loss": 0.3755, + "step": 3386 + }, + { + "epoch": 0.21, + "grad_norm": 2.4901020425627403, + "learning_rate": 9.14759730982361e-06, + "loss": 0.3154, + "step": 3387 + }, + { + "epoch": 0.21, + "grad_norm": 2.661112022598833, + "learning_rate": 9.14702842725101e-06, + "loss": 0.3379, + "step": 3388 + }, + { + "epoch": 0.21, + "grad_norm": 3.9211450406357264, + "learning_rate": 9.146459372610834e-06, + "loss": 0.3458, + "step": 3389 + }, + { + "epoch": 0.21, + "grad_norm": 4.7073832687348816, + "learning_rate": 9.145890145926695e-06, + "loss": 0.3221, + "step": 3390 + }, + { + "epoch": 0.21, + "grad_norm": 2.9851121593898338, + "learning_rate": 9.145320747222208e-06, + "loss": 0.3396, + "step": 3391 + }, + { + "epoch": 0.21, + "grad_norm": 3.8435272556335773, + "learning_rate": 9.144751176521002e-06, + "loss": 0.3525, + "step": 3392 + }, + { + "epoch": 0.21, + "grad_norm": 2.7941329282709573, + "learning_rate": 9.144181433846707e-06, + "loss": 0.3337, + "step": 3393 + }, + { + "epoch": 0.21, + "grad_norm": 8.659776119343045, + "learning_rate": 9.143611519222964e-06, + "loss": 0.3274, + "step": 3394 + }, + { + "epoch": 0.21, + "grad_norm": 7.398221496478616, + "learning_rate": 9.143041432673419e-06, + "loss": 0.336, + "step": 3395 + }, + { + "epoch": 0.21, + "grad_norm": 5.213436868912288, + "learning_rate": 9.142471174221726e-06, + "loss": 0.3309, + "step": 3396 + }, + { + "epoch": 0.21, + "grad_norm": 0.9721671511681025, + "learning_rate": 9.141900743891546e-06, + "loss": 0.5126, + "step": 3397 + }, + { + "epoch": 0.21, + "grad_norm": 3.8562501935600255, + "learning_rate": 9.141330141706546e-06, + "loss": 0.3318, + "step": 3398 + }, + { + "epoch": 0.21, + "grad_norm": 2.0973830541994762, + "learning_rate": 9.140759367690404e-06, + "loss": 0.3279, + "step": 3399 + }, + { + "epoch": 0.21, + "grad_norm": 0.7071700560578132, + "learning_rate": 9.1401884218668e-06, + "loss": 0.5134, + "step": 3400 + }, + { + "epoch": 0.21, + "grad_norm": 6.110391149165387, + "learning_rate": 9.139617304259427e-06, + "loss": 0.3299, + "step": 3401 + }, + { + "epoch": 0.21, + "grad_norm": 5.5283307902572165, + "learning_rate": 9.139046014891977e-06, + "loss": 0.3416, + "step": 3402 + }, + { + "epoch": 0.21, + "grad_norm": 4.180471716055632, + "learning_rate": 9.138474553788157e-06, + "loss": 0.3394, + "step": 3403 + }, + { + "epoch": 0.21, + "grad_norm": 0.6996761485645479, + "learning_rate": 9.137902920971678e-06, + "loss": 0.5086, + "step": 3404 + }, + { + "epoch": 0.21, + "grad_norm": 3.551640849621617, + "learning_rate": 9.137331116466256e-06, + "loss": 0.3267, + "step": 3405 + }, + { + "epoch": 0.21, + "grad_norm": 3.8730634452546617, + "learning_rate": 9.136759140295615e-06, + "loss": 0.3462, + "step": 3406 + }, + { + "epoch": 0.21, + "grad_norm": 3.467497394945697, + "learning_rate": 9.136186992483492e-06, + "loss": 0.3318, + "step": 3407 + }, + { + "epoch": 0.21, + "grad_norm": 2.8178472384582345, + "learning_rate": 9.135614673053624e-06, + "loss": 0.3283, + "step": 3408 + }, + { + "epoch": 0.21, + "grad_norm": 3.2951332013735337, + "learning_rate": 9.135042182029757e-06, + "loss": 0.3313, + "step": 3409 + }, + { + "epoch": 0.21, + "grad_norm": 4.26418890154553, + "learning_rate": 9.134469519435646e-06, + "loss": 0.3418, + "step": 3410 + }, + { + "epoch": 0.21, + "grad_norm": 4.818220638789112, + "learning_rate": 9.13389668529505e-06, + "loss": 0.3337, + "step": 3411 + }, + { + "epoch": 0.21, + "grad_norm": 13.86822808388928, + "learning_rate": 9.133323679631738e-06, + "loss": 0.3174, + "step": 3412 + }, + { + "epoch": 0.21, + "grad_norm": 3.109364057932612, + "learning_rate": 9.132750502469485e-06, + "loss": 0.3165, + "step": 3413 + }, + { + "epoch": 0.21, + "grad_norm": 3.672558504126033, + "learning_rate": 9.132177153832074e-06, + "loss": 0.311, + "step": 3414 + }, + { + "epoch": 0.21, + "grad_norm": 2.832940662614624, + "learning_rate": 9.131603633743292e-06, + "loss": 0.3322, + "step": 3415 + }, + { + "epoch": 0.21, + "grad_norm": 2.204792192716843, + "learning_rate": 9.131029942226937e-06, + "loss": 0.3317, + "step": 3416 + }, + { + "epoch": 0.21, + "grad_norm": 2.832418154868519, + "learning_rate": 9.130456079306814e-06, + "loss": 0.3822, + "step": 3417 + }, + { + "epoch": 0.21, + "grad_norm": 2.5062226053206063, + "learning_rate": 9.12988204500673e-06, + "loss": 0.3308, + "step": 3418 + }, + { + "epoch": 0.22, + "grad_norm": 3.1590126938251, + "learning_rate": 9.129307839350504e-06, + "loss": 0.3274, + "step": 3419 + }, + { + "epoch": 0.22, + "grad_norm": 3.2142266209046824, + "learning_rate": 9.128733462361963e-06, + "loss": 0.3387, + "step": 3420 + }, + { + "epoch": 0.22, + "grad_norm": 1.6472285423146584, + "learning_rate": 9.128158914064934e-06, + "loss": 0.317, + "step": 3421 + }, + { + "epoch": 0.22, + "grad_norm": 3.5113775531399516, + "learning_rate": 9.127584194483262e-06, + "loss": 0.3489, + "step": 3422 + }, + { + "epoch": 0.22, + "grad_norm": 4.943572123418341, + "learning_rate": 9.12700930364079e-06, + "loss": 0.319, + "step": 3423 + }, + { + "epoch": 0.22, + "grad_norm": 2.8812612659283072, + "learning_rate": 9.12643424156137e-06, + "loss": 0.3208, + "step": 3424 + }, + { + "epoch": 0.22, + "grad_norm": 2.9013054774185956, + "learning_rate": 9.125859008268867e-06, + "loss": 0.3423, + "step": 3425 + }, + { + "epoch": 0.22, + "grad_norm": 2.544791406628796, + "learning_rate": 9.125283603787142e-06, + "loss": 0.3342, + "step": 3426 + }, + { + "epoch": 0.22, + "grad_norm": 2.394809981961453, + "learning_rate": 9.124708028140075e-06, + "loss": 0.3356, + "step": 3427 + }, + { + "epoch": 0.22, + "grad_norm": 3.0129142018826496, + "learning_rate": 9.124132281351545e-06, + "loss": 0.3326, + "step": 3428 + }, + { + "epoch": 0.22, + "grad_norm": 4.7907079098252705, + "learning_rate": 9.123556363445442e-06, + "loss": 0.3419, + "step": 3429 + }, + { + "epoch": 0.22, + "grad_norm": 2.423517417702465, + "learning_rate": 9.12298027444566e-06, + "loss": 0.3238, + "step": 3430 + }, + { + "epoch": 0.22, + "grad_norm": 2.2696674535860373, + "learning_rate": 9.122404014376104e-06, + "loss": 0.341, + "step": 3431 + }, + { + "epoch": 0.22, + "grad_norm": 1.8037064937903338, + "learning_rate": 9.121827583260686e-06, + "loss": 0.3079, + "step": 3432 + }, + { + "epoch": 0.22, + "grad_norm": 2.7462740294945083, + "learning_rate": 9.121250981123315e-06, + "loss": 0.3332, + "step": 3433 + }, + { + "epoch": 0.22, + "grad_norm": 4.9452868801654155, + "learning_rate": 9.120674207987923e-06, + "loss": 0.3197, + "step": 3434 + }, + { + "epoch": 0.22, + "grad_norm": 2.483170471661308, + "learning_rate": 9.12009726387844e-06, + "loss": 0.3365, + "step": 3435 + }, + { + "epoch": 0.22, + "grad_norm": 3.7605779973586477, + "learning_rate": 9.119520148818804e-06, + "loss": 0.311, + "step": 3436 + }, + { + "epoch": 0.22, + "grad_norm": 4.403593592969136, + "learning_rate": 9.118942862832958e-06, + "loss": 0.3307, + "step": 3437 + }, + { + "epoch": 0.22, + "grad_norm": 2.670852267455348, + "learning_rate": 9.118365405944856e-06, + "loss": 0.3207, + "step": 3438 + }, + { + "epoch": 0.22, + "grad_norm": 9.939563130698033, + "learning_rate": 9.11778777817846e-06, + "loss": 0.3319, + "step": 3439 + }, + { + "epoch": 0.22, + "grad_norm": 3.3762713484256435, + "learning_rate": 9.117209979557734e-06, + "loss": 0.3312, + "step": 3440 + }, + { + "epoch": 0.22, + "grad_norm": 2.9976177286620933, + "learning_rate": 9.116632010106654e-06, + "loss": 0.3313, + "step": 3441 + }, + { + "epoch": 0.22, + "grad_norm": 2.2074607433805937, + "learning_rate": 9.116053869849198e-06, + "loss": 0.3562, + "step": 3442 + }, + { + "epoch": 0.22, + "grad_norm": 3.0660503344787537, + "learning_rate": 9.115475558809358e-06, + "loss": 0.3273, + "step": 3443 + }, + { + "epoch": 0.22, + "grad_norm": 10.01375776371647, + "learning_rate": 9.114897077011128e-06, + "loss": 0.3725, + "step": 3444 + }, + { + "epoch": 0.22, + "grad_norm": 2.6250311791907452, + "learning_rate": 9.114318424478506e-06, + "loss": 0.3419, + "step": 3445 + }, + { + "epoch": 0.22, + "grad_norm": 3.186364353903956, + "learning_rate": 9.113739601235508e-06, + "loss": 0.3446, + "step": 3446 + }, + { + "epoch": 0.22, + "grad_norm": 3.3887747665661734, + "learning_rate": 9.113160607306143e-06, + "loss": 0.3601, + "step": 3447 + }, + { + "epoch": 0.22, + "grad_norm": 2.2462392757443417, + "learning_rate": 9.112581442714443e-06, + "loss": 0.3427, + "step": 3448 + }, + { + "epoch": 0.22, + "grad_norm": 2.2132050586897387, + "learning_rate": 9.112002107484433e-06, + "loss": 0.2929, + "step": 3449 + }, + { + "epoch": 0.22, + "grad_norm": 5.778137670186096, + "learning_rate": 9.111422601640151e-06, + "loss": 0.3448, + "step": 3450 + }, + { + "epoch": 0.22, + "grad_norm": 0.9218415945345637, + "learning_rate": 9.110842925205642e-06, + "loss": 0.5113, + "step": 3451 + }, + { + "epoch": 0.22, + "grad_norm": 3.3952053390492414, + "learning_rate": 9.11026307820496e-06, + "loss": 0.3367, + "step": 3452 + }, + { + "epoch": 0.22, + "grad_norm": 8.006458909704897, + "learning_rate": 9.10968306066216e-06, + "loss": 0.3262, + "step": 3453 + }, + { + "epoch": 0.22, + "grad_norm": 2.5686362626453807, + "learning_rate": 9.109102872601312e-06, + "loss": 0.3307, + "step": 3454 + }, + { + "epoch": 0.22, + "grad_norm": 2.2922075069105805, + "learning_rate": 9.108522514046487e-06, + "loss": 0.3395, + "step": 3455 + }, + { + "epoch": 0.22, + "grad_norm": 4.548510923432122, + "learning_rate": 9.107941985021765e-06, + "loss": 0.3599, + "step": 3456 + }, + { + "epoch": 0.22, + "grad_norm": 3.0702866979160897, + "learning_rate": 9.107361285551236e-06, + "loss": 0.3282, + "step": 3457 + }, + { + "epoch": 0.22, + "grad_norm": 3.821491197807495, + "learning_rate": 9.106780415658989e-06, + "loss": 0.3466, + "step": 3458 + }, + { + "epoch": 0.22, + "grad_norm": 2.2321418725937776, + "learning_rate": 9.106199375369127e-06, + "loss": 0.3603, + "step": 3459 + }, + { + "epoch": 0.22, + "grad_norm": 0.6049074226834605, + "learning_rate": 9.10561816470576e-06, + "loss": 0.4935, + "step": 3460 + }, + { + "epoch": 0.22, + "grad_norm": 5.659252169270551, + "learning_rate": 9.105036783693006e-06, + "loss": 0.3379, + "step": 3461 + }, + { + "epoch": 0.22, + "grad_norm": 3.256271392693812, + "learning_rate": 9.104455232354982e-06, + "loss": 0.3505, + "step": 3462 + }, + { + "epoch": 0.22, + "grad_norm": 3.5900481103157382, + "learning_rate": 9.10387351071582e-06, + "loss": 0.334, + "step": 3463 + }, + { + "epoch": 0.22, + "grad_norm": 2.7640226305940168, + "learning_rate": 9.103291618799657e-06, + "loss": 0.3398, + "step": 3464 + }, + { + "epoch": 0.22, + "grad_norm": 7.739483356634692, + "learning_rate": 9.102709556630639e-06, + "loss": 0.3349, + "step": 3465 + }, + { + "epoch": 0.22, + "grad_norm": 2.902883040974735, + "learning_rate": 9.10212732423291e-06, + "loss": 0.342, + "step": 3466 + }, + { + "epoch": 0.22, + "grad_norm": 5.323800145239479, + "learning_rate": 9.101544921630634e-06, + "loss": 0.3215, + "step": 3467 + }, + { + "epoch": 0.22, + "grad_norm": 16.354977132274946, + "learning_rate": 9.100962348847974e-06, + "loss": 0.3277, + "step": 3468 + }, + { + "epoch": 0.22, + "grad_norm": 15.436653894832212, + "learning_rate": 9.100379605909102e-06, + "loss": 0.3378, + "step": 3469 + }, + { + "epoch": 0.22, + "grad_norm": 2.5648961244173885, + "learning_rate": 9.099796692838198e-06, + "loss": 0.3047, + "step": 3470 + }, + { + "epoch": 0.22, + "grad_norm": 102.54707431872112, + "learning_rate": 9.099213609659448e-06, + "loss": 0.3161, + "step": 3471 + }, + { + "epoch": 0.22, + "grad_norm": 10.952786479893375, + "learning_rate": 9.09863035639704e-06, + "loss": 0.3645, + "step": 3472 + }, + { + "epoch": 0.22, + "grad_norm": 6.18717403910133, + "learning_rate": 9.098046933075182e-06, + "loss": 0.3426, + "step": 3473 + }, + { + "epoch": 0.22, + "grad_norm": 3.055273260646196, + "learning_rate": 9.097463339718077e-06, + "loss": 0.3182, + "step": 3474 + }, + { + "epoch": 0.22, + "grad_norm": 3.137222206697158, + "learning_rate": 9.09687957634994e-06, + "loss": 0.3281, + "step": 3475 + }, + { + "epoch": 0.22, + "grad_norm": 2.610557892693063, + "learning_rate": 9.096295642994993e-06, + "loss": 0.3319, + "step": 3476 + }, + { + "epoch": 0.22, + "grad_norm": 7.246057627288268, + "learning_rate": 9.095711539677464e-06, + "loss": 0.3434, + "step": 3477 + }, + { + "epoch": 0.22, + "grad_norm": 2.653807832279, + "learning_rate": 9.095127266421589e-06, + "loss": 0.3567, + "step": 3478 + }, + { + "epoch": 0.22, + "grad_norm": 2.5642670646223147, + "learning_rate": 9.094542823251609e-06, + "loss": 0.3241, + "step": 3479 + }, + { + "epoch": 0.22, + "grad_norm": 2.681253858790611, + "learning_rate": 9.093958210191773e-06, + "loss": 0.3339, + "step": 3480 + }, + { + "epoch": 0.22, + "grad_norm": 10.795870686069218, + "learning_rate": 9.09337342726634e-06, + "loss": 0.3071, + "step": 3481 + }, + { + "epoch": 0.22, + "grad_norm": 0.649140128270586, + "learning_rate": 9.092788474499574e-06, + "loss": 0.5054, + "step": 3482 + }, + { + "epoch": 0.22, + "grad_norm": 5.564914198833503, + "learning_rate": 9.092203351915744e-06, + "loss": 0.3149, + "step": 3483 + }, + { + "epoch": 0.22, + "grad_norm": 4.054874636347387, + "learning_rate": 9.091618059539129e-06, + "loss": 0.3367, + "step": 3484 + }, + { + "epoch": 0.22, + "grad_norm": 4.561306503066033, + "learning_rate": 9.091032597394012e-06, + "loss": 0.333, + "step": 3485 + }, + { + "epoch": 0.22, + "grad_norm": 4.352116762755251, + "learning_rate": 9.090446965504687e-06, + "loss": 0.3292, + "step": 3486 + }, + { + "epoch": 0.22, + "grad_norm": 3.6444201086478323, + "learning_rate": 9.089861163895453e-06, + "loss": 0.3343, + "step": 3487 + }, + { + "epoch": 0.22, + "grad_norm": 11.831919652606544, + "learning_rate": 9.089275192590613e-06, + "loss": 0.3517, + "step": 3488 + }, + { + "epoch": 0.22, + "grad_norm": 3.5929553608429483, + "learning_rate": 9.088689051614483e-06, + "loss": 0.3397, + "step": 3489 + }, + { + "epoch": 0.22, + "grad_norm": 3.6375140527785934, + "learning_rate": 9.08810274099138e-06, + "loss": 0.3196, + "step": 3490 + }, + { + "epoch": 0.22, + "grad_norm": 2.36569335735749, + "learning_rate": 9.087516260745635e-06, + "loss": 0.3104, + "step": 3491 + }, + { + "epoch": 0.22, + "grad_norm": 3.002990115600427, + "learning_rate": 9.08692961090158e-06, + "loss": 0.3313, + "step": 3492 + }, + { + "epoch": 0.22, + "grad_norm": 3.842323738404551, + "learning_rate": 9.086342791483555e-06, + "loss": 0.3316, + "step": 3493 + }, + { + "epoch": 0.22, + "grad_norm": 3.5964259177477587, + "learning_rate": 9.08575580251591e-06, + "loss": 0.3253, + "step": 3494 + }, + { + "epoch": 0.22, + "grad_norm": 3.552320116874668, + "learning_rate": 9.085168644022999e-06, + "loss": 0.326, + "step": 3495 + }, + { + "epoch": 0.22, + "grad_norm": 2.7945823372035328, + "learning_rate": 9.084581316029186e-06, + "loss": 0.3226, + "step": 3496 + }, + { + "epoch": 0.22, + "grad_norm": 8.597765238121262, + "learning_rate": 9.08399381855884e-06, + "loss": 0.3434, + "step": 3497 + }, + { + "epoch": 0.22, + "grad_norm": 3.3106496510742263, + "learning_rate": 9.083406151636334e-06, + "loss": 0.3027, + "step": 3498 + }, + { + "epoch": 0.22, + "grad_norm": 2.31908087309126, + "learning_rate": 9.082818315286054e-06, + "loss": 0.34, + "step": 3499 + }, + { + "epoch": 0.22, + "grad_norm": 3.601157452757643, + "learning_rate": 9.082230309532393e-06, + "loss": 0.3334, + "step": 3500 + }, + { + "epoch": 0.22, + "grad_norm": 4.223736593445799, + "learning_rate": 9.081642134399744e-06, + "loss": 0.338, + "step": 3501 + }, + { + "epoch": 0.22, + "grad_norm": 6.344115079313927, + "learning_rate": 9.081053789912513e-06, + "loss": 0.3377, + "step": 3502 + }, + { + "epoch": 0.22, + "grad_norm": 4.746027179157278, + "learning_rate": 9.080465276095112e-06, + "loss": 0.3448, + "step": 3503 + }, + { + "epoch": 0.22, + "grad_norm": 4.61044875039412, + "learning_rate": 9.079876592971957e-06, + "loss": 0.357, + "step": 3504 + }, + { + "epoch": 0.22, + "grad_norm": 2.74052503939075, + "learning_rate": 9.079287740567478e-06, + "loss": 0.3373, + "step": 3505 + }, + { + "epoch": 0.22, + "grad_norm": 3.1027324826146008, + "learning_rate": 9.078698718906103e-06, + "loss": 0.3181, + "step": 3506 + }, + { + "epoch": 0.22, + "grad_norm": 0.6315058772643531, + "learning_rate": 9.078109528012274e-06, + "loss": 0.5071, + "step": 3507 + }, + { + "epoch": 0.22, + "grad_norm": 3.2257699242738753, + "learning_rate": 9.077520167910438e-06, + "loss": 0.3336, + "step": 3508 + }, + { + "epoch": 0.22, + "grad_norm": 2.594941072026912, + "learning_rate": 9.076930638625047e-06, + "loss": 0.3089, + "step": 3509 + }, + { + "epoch": 0.22, + "grad_norm": 5.551073750864412, + "learning_rate": 9.076340940180563e-06, + "loss": 0.3648, + "step": 3510 + }, + { + "epoch": 0.22, + "grad_norm": 9.045605593319252, + "learning_rate": 9.075751072601453e-06, + "loss": 0.3124, + "step": 3511 + }, + { + "epoch": 0.22, + "grad_norm": 5.974039112999473, + "learning_rate": 9.07516103591219e-06, + "loss": 0.3474, + "step": 3512 + }, + { + "epoch": 0.22, + "grad_norm": 3.7088514065386926, + "learning_rate": 9.07457083013726e-06, + "loss": 0.3248, + "step": 3513 + }, + { + "epoch": 0.22, + "grad_norm": 4.516938561201477, + "learning_rate": 9.073980455301148e-06, + "loss": 0.3181, + "step": 3514 + }, + { + "epoch": 0.22, + "grad_norm": 5.129209581638368, + "learning_rate": 9.07338991142835e-06, + "loss": 0.3183, + "step": 3515 + }, + { + "epoch": 0.22, + "grad_norm": 2.8965342970940835, + "learning_rate": 9.072799198543369e-06, + "loss": 0.3116, + "step": 3516 + }, + { + "epoch": 0.22, + "grad_norm": 9.422203672342523, + "learning_rate": 9.072208316670716e-06, + "loss": 0.3219, + "step": 3517 + }, + { + "epoch": 0.22, + "grad_norm": 3.006018969271595, + "learning_rate": 9.071617265834907e-06, + "loss": 0.3378, + "step": 3518 + }, + { + "epoch": 0.22, + "grad_norm": 3.4732222221762674, + "learning_rate": 9.071026046060465e-06, + "loss": 0.3313, + "step": 3519 + }, + { + "epoch": 0.22, + "grad_norm": 3.5821227700492675, + "learning_rate": 9.070434657371923e-06, + "loss": 0.3191, + "step": 3520 + }, + { + "epoch": 0.22, + "grad_norm": 3.1388317281603095, + "learning_rate": 9.069843099793815e-06, + "loss": 0.3446, + "step": 3521 + }, + { + "epoch": 0.22, + "grad_norm": 3.4189128178981054, + "learning_rate": 9.069251373350689e-06, + "loss": 0.3223, + "step": 3522 + }, + { + "epoch": 0.22, + "grad_norm": 5.516764703725452, + "learning_rate": 9.068659478067096e-06, + "loss": 0.3122, + "step": 3523 + }, + { + "epoch": 0.22, + "grad_norm": 5.839666827460726, + "learning_rate": 9.068067413967594e-06, + "loss": 0.3333, + "step": 3524 + }, + { + "epoch": 0.22, + "grad_norm": 5.475461679296307, + "learning_rate": 9.067475181076751e-06, + "loss": 0.3266, + "step": 3525 + }, + { + "epoch": 0.22, + "grad_norm": 6.280856693851178, + "learning_rate": 9.066882779419135e-06, + "loss": 0.3116, + "step": 3526 + }, + { + "epoch": 0.22, + "grad_norm": 4.340628060280135, + "learning_rate": 9.066290209019331e-06, + "loss": 0.3141, + "step": 3527 + }, + { + "epoch": 0.22, + "grad_norm": 14.84375309229679, + "learning_rate": 9.065697469901923e-06, + "loss": 0.3334, + "step": 3528 + }, + { + "epoch": 0.22, + "grad_norm": 3.915250938557479, + "learning_rate": 9.065104562091506e-06, + "loss": 0.3349, + "step": 3529 + }, + { + "epoch": 0.22, + "grad_norm": 3.2244600292329144, + "learning_rate": 9.064511485612679e-06, + "loss": 0.3215, + "step": 3530 + }, + { + "epoch": 0.22, + "grad_norm": 4.027082296129365, + "learning_rate": 9.063918240490052e-06, + "loss": 0.321, + "step": 3531 + }, + { + "epoch": 0.22, + "grad_norm": 2.5432314854030387, + "learning_rate": 9.063324826748239e-06, + "loss": 0.3452, + "step": 3532 + }, + { + "epoch": 0.22, + "grad_norm": 7.511747029055402, + "learning_rate": 9.062731244411862e-06, + "loss": 0.3438, + "step": 3533 + }, + { + "epoch": 0.22, + "grad_norm": 8.897502735314697, + "learning_rate": 9.062137493505548e-06, + "loss": 0.3331, + "step": 3534 + }, + { + "epoch": 0.22, + "grad_norm": 2.7969701159775067, + "learning_rate": 9.061543574053936e-06, + "loss": 0.3351, + "step": 3535 + }, + { + "epoch": 0.22, + "grad_norm": 3.9887297670828126, + "learning_rate": 9.060949486081665e-06, + "loss": 0.3247, + "step": 3536 + }, + { + "epoch": 0.22, + "grad_norm": 3.526286054079545, + "learning_rate": 9.060355229613389e-06, + "loss": 0.3153, + "step": 3537 + }, + { + "epoch": 0.22, + "grad_norm": 3.94897289070156, + "learning_rate": 9.059760804673761e-06, + "loss": 0.355, + "step": 3538 + }, + { + "epoch": 0.22, + "grad_norm": 12.96042756630533, + "learning_rate": 9.059166211287447e-06, + "loss": 0.328, + "step": 3539 + }, + { + "epoch": 0.22, + "grad_norm": 2.2121761861262934, + "learning_rate": 9.058571449479117e-06, + "loss": 0.3284, + "step": 3540 + }, + { + "epoch": 0.22, + "grad_norm": 8.092016249111747, + "learning_rate": 9.057976519273448e-06, + "loss": 0.3637, + "step": 3541 + }, + { + "epoch": 0.22, + "grad_norm": 3.3464536840370003, + "learning_rate": 9.057381420695126e-06, + "loss": 0.3295, + "step": 3542 + }, + { + "epoch": 0.22, + "grad_norm": 2.5024058788918464, + "learning_rate": 9.056786153768841e-06, + "loss": 0.3358, + "step": 3543 + }, + { + "epoch": 0.22, + "grad_norm": 4.243955832177578, + "learning_rate": 9.056190718519295e-06, + "loss": 0.3442, + "step": 3544 + }, + { + "epoch": 0.22, + "grad_norm": 3.5049918892965763, + "learning_rate": 9.05559511497119e-06, + "loss": 0.3402, + "step": 3545 + }, + { + "epoch": 0.22, + "grad_norm": 3.46510198512261, + "learning_rate": 9.05499934314924e-06, + "loss": 0.3071, + "step": 3546 + }, + { + "epoch": 0.22, + "grad_norm": 13.90580788485331, + "learning_rate": 9.054403403078164e-06, + "loss": 0.3401, + "step": 3547 + }, + { + "epoch": 0.22, + "grad_norm": 10.320993086896735, + "learning_rate": 9.053807294782692e-06, + "loss": 0.3142, + "step": 3548 + }, + { + "epoch": 0.22, + "grad_norm": 2.7345491157967383, + "learning_rate": 9.053211018287553e-06, + "loss": 0.3236, + "step": 3549 + }, + { + "epoch": 0.22, + "grad_norm": 3.879192789824122, + "learning_rate": 9.05261457361749e-06, + "loss": 0.3221, + "step": 3550 + }, + { + "epoch": 0.22, + "grad_norm": 2.852462718096176, + "learning_rate": 9.05201796079725e-06, + "loss": 0.338, + "step": 3551 + }, + { + "epoch": 0.22, + "grad_norm": 7.012162404812537, + "learning_rate": 9.051421179851588e-06, + "loss": 0.3126, + "step": 3552 + }, + { + "epoch": 0.22, + "grad_norm": 3.3622073026504316, + "learning_rate": 9.050824230805266e-06, + "loss": 0.3185, + "step": 3553 + }, + { + "epoch": 0.22, + "grad_norm": 4.122895476222348, + "learning_rate": 9.05022711368305e-06, + "loss": 0.3386, + "step": 3554 + }, + { + "epoch": 0.22, + "grad_norm": 4.988696057976078, + "learning_rate": 9.049629828509719e-06, + "loss": 0.3559, + "step": 3555 + }, + { + "epoch": 0.22, + "grad_norm": 2.4607113623648877, + "learning_rate": 9.04903237531005e-06, + "loss": 0.3178, + "step": 3556 + }, + { + "epoch": 0.22, + "grad_norm": 10.988982176083395, + "learning_rate": 9.04843475410884e-06, + "loss": 0.3264, + "step": 3557 + }, + { + "epoch": 0.22, + "grad_norm": 4.371250697174492, + "learning_rate": 9.047836964930877e-06, + "loss": 0.3374, + "step": 3558 + }, + { + "epoch": 0.22, + "grad_norm": 3.8809763407763955, + "learning_rate": 9.047239007800972e-06, + "loss": 0.3463, + "step": 3559 + }, + { + "epoch": 0.22, + "grad_norm": 6.393894186510673, + "learning_rate": 9.04664088274393e-06, + "loss": 0.3185, + "step": 3560 + }, + { + "epoch": 0.22, + "grad_norm": 2.970437004367653, + "learning_rate": 9.04604258978457e-06, + "loss": 0.3444, + "step": 3561 + }, + { + "epoch": 0.22, + "grad_norm": 4.156822946079777, + "learning_rate": 9.045444128947719e-06, + "loss": 0.3195, + "step": 3562 + }, + { + "epoch": 0.22, + "grad_norm": 5.363840755745807, + "learning_rate": 9.0448455002582e-06, + "loss": 0.3487, + "step": 3563 + }, + { + "epoch": 0.22, + "grad_norm": 3.9172663491165216, + "learning_rate": 9.044246703740863e-06, + "loss": 0.3386, + "step": 3564 + }, + { + "epoch": 0.22, + "grad_norm": 7.385849456951373, + "learning_rate": 9.043647739420543e-06, + "loss": 0.3249, + "step": 3565 + }, + { + "epoch": 0.22, + "grad_norm": 5.663101178950992, + "learning_rate": 9.043048607322097e-06, + "loss": 0.3222, + "step": 3566 + }, + { + "epoch": 0.22, + "grad_norm": 6.854934296067239, + "learning_rate": 9.042449307470384e-06, + "loss": 0.3484, + "step": 3567 + }, + { + "epoch": 0.22, + "grad_norm": 4.6198768133298485, + "learning_rate": 9.041849839890267e-06, + "loss": 0.34, + "step": 3568 + }, + { + "epoch": 0.22, + "grad_norm": 4.411133402728843, + "learning_rate": 9.041250204606623e-06, + "loss": 0.3426, + "step": 3569 + }, + { + "epoch": 0.22, + "grad_norm": 8.885057626932522, + "learning_rate": 9.040650401644329e-06, + "loss": 0.3252, + "step": 3570 + }, + { + "epoch": 0.22, + "grad_norm": 4.09092358706227, + "learning_rate": 9.040050431028273e-06, + "loss": 0.3179, + "step": 3571 + }, + { + "epoch": 0.22, + "grad_norm": 5.794735501343648, + "learning_rate": 9.039450292783349e-06, + "loss": 0.3531, + "step": 3572 + }, + { + "epoch": 0.22, + "grad_norm": 7.384471046150093, + "learning_rate": 9.038849986934457e-06, + "loss": 0.3264, + "step": 3573 + }, + { + "epoch": 0.22, + "grad_norm": 2.685652594496571, + "learning_rate": 9.038249513506506e-06, + "loss": 0.3161, + "step": 3574 + }, + { + "epoch": 0.22, + "grad_norm": 2.6532738353491947, + "learning_rate": 9.03764887252441e-06, + "loss": 0.3322, + "step": 3575 + }, + { + "epoch": 0.22, + "grad_norm": 6.452544866358713, + "learning_rate": 9.037048064013088e-06, + "loss": 0.3356, + "step": 3576 + }, + { + "epoch": 0.22, + "grad_norm": 24.05446308649078, + "learning_rate": 9.036447087997473e-06, + "loss": 0.3459, + "step": 3577 + }, + { + "epoch": 0.23, + "grad_norm": 16.84714724386355, + "learning_rate": 9.0358459445025e-06, + "loss": 0.3222, + "step": 3578 + }, + { + "epoch": 0.23, + "grad_norm": 7.087490678264344, + "learning_rate": 9.035244633553109e-06, + "loss": 0.3316, + "step": 3579 + }, + { + "epoch": 0.23, + "grad_norm": 3.6993876837380792, + "learning_rate": 9.034643155174251e-06, + "loss": 0.3244, + "step": 3580 + }, + { + "epoch": 0.23, + "grad_norm": 3.8286101056562893, + "learning_rate": 9.034041509390884e-06, + "loss": 0.3337, + "step": 3581 + }, + { + "epoch": 0.23, + "grad_norm": 7.402972408534468, + "learning_rate": 9.033439696227966e-06, + "loss": 0.3333, + "step": 3582 + }, + { + "epoch": 0.23, + "grad_norm": 5.200789197971586, + "learning_rate": 9.032837715710472e-06, + "loss": 0.3221, + "step": 3583 + }, + { + "epoch": 0.23, + "grad_norm": 3.571183544346358, + "learning_rate": 9.032235567863379e-06, + "loss": 0.328, + "step": 3584 + }, + { + "epoch": 0.23, + "grad_norm": 0.6601985659672188, + "learning_rate": 9.031633252711669e-06, + "loss": 0.5005, + "step": 3585 + }, + { + "epoch": 0.23, + "grad_norm": 3.1690557004051634, + "learning_rate": 9.031030770280335e-06, + "loss": 0.3284, + "step": 3586 + }, + { + "epoch": 0.23, + "grad_norm": 10.490221960830418, + "learning_rate": 9.030428120594375e-06, + "loss": 0.3033, + "step": 3587 + }, + { + "epoch": 0.23, + "grad_norm": 4.145002496929343, + "learning_rate": 9.029825303678794e-06, + "loss": 0.325, + "step": 3588 + }, + { + "epoch": 0.23, + "grad_norm": 3.538829016865508, + "learning_rate": 9.029222319558602e-06, + "loss": 0.3326, + "step": 3589 + }, + { + "epoch": 0.23, + "grad_norm": 0.6067011415421141, + "learning_rate": 9.028619168258818e-06, + "loss": 0.5025, + "step": 3590 + }, + { + "epoch": 0.23, + "grad_norm": 4.701706051279161, + "learning_rate": 9.028015849804473e-06, + "loss": 0.334, + "step": 3591 + }, + { + "epoch": 0.23, + "grad_norm": 4.2478422724832585, + "learning_rate": 9.027412364220592e-06, + "loss": 0.3281, + "step": 3592 + }, + { + "epoch": 0.23, + "grad_norm": 8.079626237912594, + "learning_rate": 9.02680871153222e-06, + "loss": 0.3066, + "step": 3593 + }, + { + "epoch": 0.23, + "grad_norm": 0.6004598064573669, + "learning_rate": 9.026204891764402e-06, + "loss": 0.5165, + "step": 3594 + }, + { + "epoch": 0.23, + "grad_norm": 5.2952808171753825, + "learning_rate": 9.025600904942192e-06, + "loss": 0.3151, + "step": 3595 + }, + { + "epoch": 0.23, + "grad_norm": 3.4729453103217804, + "learning_rate": 9.024996751090652e-06, + "loss": 0.2955, + "step": 3596 + }, + { + "epoch": 0.23, + "grad_norm": 4.162442025468631, + "learning_rate": 9.024392430234843e-06, + "loss": 0.3127, + "step": 3597 + }, + { + "epoch": 0.23, + "grad_norm": 5.857648722786493, + "learning_rate": 9.023787942399847e-06, + "loss": 0.3066, + "step": 3598 + }, + { + "epoch": 0.23, + "grad_norm": 3.573597314277269, + "learning_rate": 9.023183287610742e-06, + "loss": 0.3216, + "step": 3599 + }, + { + "epoch": 0.23, + "grad_norm": 2.7757878378595167, + "learning_rate": 9.022578465892616e-06, + "loss": 0.3224, + "step": 3600 + }, + { + "epoch": 0.23, + "grad_norm": 8.496563184280213, + "learning_rate": 9.021973477270564e-06, + "loss": 0.3266, + "step": 3601 + }, + { + "epoch": 0.23, + "grad_norm": 3.343982204643135, + "learning_rate": 9.02136832176969e-06, + "loss": 0.3165, + "step": 3602 + }, + { + "epoch": 0.23, + "grad_norm": 2.9868406128698726, + "learning_rate": 9.0207629994151e-06, + "loss": 0.3195, + "step": 3603 + }, + { + "epoch": 0.23, + "grad_norm": 3.8329817214190003, + "learning_rate": 9.020157510231913e-06, + "loss": 0.3146, + "step": 3604 + }, + { + "epoch": 0.23, + "grad_norm": 4.514187088615563, + "learning_rate": 9.019551854245252e-06, + "loss": 0.3298, + "step": 3605 + }, + { + "epoch": 0.23, + "grad_norm": 8.818872750892389, + "learning_rate": 9.018946031480242e-06, + "loss": 0.3017, + "step": 3606 + }, + { + "epoch": 0.23, + "grad_norm": 8.50794779055897, + "learning_rate": 9.018340041962023e-06, + "loss": 0.3221, + "step": 3607 + }, + { + "epoch": 0.23, + "grad_norm": 0.6856148215037243, + "learning_rate": 9.017733885715738e-06, + "loss": 0.5393, + "step": 3608 + }, + { + "epoch": 0.23, + "grad_norm": 3.187059545194669, + "learning_rate": 9.01712756276654e-06, + "loss": 0.3584, + "step": 3609 + }, + { + "epoch": 0.23, + "grad_norm": 2.237515072571624, + "learning_rate": 9.01652107313958e-06, + "loss": 0.3429, + "step": 3610 + }, + { + "epoch": 0.23, + "grad_norm": 7.8302884012032035, + "learning_rate": 9.01591441686003e-06, + "loss": 0.3252, + "step": 3611 + }, + { + "epoch": 0.23, + "grad_norm": 3.8054598641119983, + "learning_rate": 9.015307593953058e-06, + "loss": 0.3379, + "step": 3612 + }, + { + "epoch": 0.23, + "grad_norm": 2.8162869027273643, + "learning_rate": 9.014700604443841e-06, + "loss": 0.3201, + "step": 3613 + }, + { + "epoch": 0.23, + "grad_norm": 3.565828862736785, + "learning_rate": 9.014093448357565e-06, + "loss": 0.3054, + "step": 3614 + }, + { + "epoch": 0.23, + "grad_norm": 4.235441939588098, + "learning_rate": 9.013486125719421e-06, + "loss": 0.3294, + "step": 3615 + }, + { + "epoch": 0.23, + "grad_norm": 4.50756348779826, + "learning_rate": 9.012878636554612e-06, + "loss": 0.3289, + "step": 3616 + }, + { + "epoch": 0.23, + "grad_norm": 4.150400408841957, + "learning_rate": 9.012270980888339e-06, + "loss": 0.324, + "step": 3617 + }, + { + "epoch": 0.23, + "grad_norm": 5.8499623137831405, + "learning_rate": 9.011663158745815e-06, + "loss": 0.3419, + "step": 3618 + }, + { + "epoch": 0.23, + "grad_norm": 7.099733398191777, + "learning_rate": 9.011055170152262e-06, + "loss": 0.318, + "step": 3619 + }, + { + "epoch": 0.23, + "grad_norm": 3.0350593646917723, + "learning_rate": 9.010447015132906e-06, + "loss": 0.3322, + "step": 3620 + }, + { + "epoch": 0.23, + "grad_norm": 6.38263128775083, + "learning_rate": 9.009838693712977e-06, + "loss": 0.3161, + "step": 3621 + }, + { + "epoch": 0.23, + "grad_norm": 4.93563306779475, + "learning_rate": 9.009230205917723e-06, + "loss": 0.347, + "step": 3622 + }, + { + "epoch": 0.23, + "grad_norm": 2.199331672094639, + "learning_rate": 9.008621551772384e-06, + "loss": 0.2994, + "step": 3623 + }, + { + "epoch": 0.23, + "grad_norm": 4.090170592346973, + "learning_rate": 9.008012731302218e-06, + "loss": 0.3171, + "step": 3624 + }, + { + "epoch": 0.23, + "grad_norm": 0.6784359594335939, + "learning_rate": 9.007403744532483e-06, + "loss": 0.507, + "step": 3625 + }, + { + "epoch": 0.23, + "grad_norm": 14.347774684910878, + "learning_rate": 9.00679459148845e-06, + "loss": 0.3225, + "step": 3626 + }, + { + "epoch": 0.23, + "grad_norm": 5.125561745303137, + "learning_rate": 9.006185272195392e-06, + "loss": 0.3219, + "step": 3627 + }, + { + "epoch": 0.23, + "grad_norm": 0.6468054495308965, + "learning_rate": 9.005575786678591e-06, + "loss": 0.5253, + "step": 3628 + }, + { + "epoch": 0.23, + "grad_norm": 12.800250621248189, + "learning_rate": 9.004966134963336e-06, + "loss": 0.3316, + "step": 3629 + }, + { + "epoch": 0.23, + "grad_norm": 4.103353805935026, + "learning_rate": 9.004356317074923e-06, + "loss": 0.3657, + "step": 3630 + }, + { + "epoch": 0.23, + "grad_norm": 4.928793129887084, + "learning_rate": 9.003746333038654e-06, + "loss": 0.3305, + "step": 3631 + }, + { + "epoch": 0.23, + "grad_norm": 3.8442002566843962, + "learning_rate": 9.003136182879836e-06, + "loss": 0.3017, + "step": 3632 + }, + { + "epoch": 0.23, + "grad_norm": 5.138988018592437, + "learning_rate": 9.00252586662379e-06, + "loss": 0.3098, + "step": 3633 + }, + { + "epoch": 0.23, + "grad_norm": 3.17121237081852, + "learning_rate": 9.001915384295836e-06, + "loss": 0.3109, + "step": 3634 + }, + { + "epoch": 0.23, + "grad_norm": 6.181123988311583, + "learning_rate": 9.001304735921304e-06, + "loss": 0.3384, + "step": 3635 + }, + { + "epoch": 0.23, + "grad_norm": 3.976924379649157, + "learning_rate": 9.000693921525532e-06, + "loss": 0.3378, + "step": 3636 + }, + { + "epoch": 0.23, + "grad_norm": 23.55450110106337, + "learning_rate": 9.000082941133864e-06, + "loss": 0.3183, + "step": 3637 + }, + { + "epoch": 0.23, + "grad_norm": 5.263468258046588, + "learning_rate": 8.999471794771648e-06, + "loss": 0.3129, + "step": 3638 + }, + { + "epoch": 0.23, + "grad_norm": 3.2530543059965544, + "learning_rate": 8.998860482464243e-06, + "loss": 0.3411, + "step": 3639 + }, + { + "epoch": 0.23, + "grad_norm": 3.108798511236559, + "learning_rate": 8.998249004237015e-06, + "loss": 0.3337, + "step": 3640 + }, + { + "epoch": 0.23, + "grad_norm": 4.657061460978529, + "learning_rate": 8.997637360115334e-06, + "loss": 0.3208, + "step": 3641 + }, + { + "epoch": 0.23, + "grad_norm": 4.294467745471402, + "learning_rate": 8.99702555012458e-06, + "loss": 0.3128, + "step": 3642 + }, + { + "epoch": 0.23, + "grad_norm": 12.162573474260642, + "learning_rate": 8.996413574290132e-06, + "loss": 0.3148, + "step": 3643 + }, + { + "epoch": 0.23, + "grad_norm": 3.229211969470677, + "learning_rate": 8.99580143263739e-06, + "loss": 0.3322, + "step": 3644 + }, + { + "epoch": 0.23, + "grad_norm": 3.3248378481328436, + "learning_rate": 8.995189125191747e-06, + "loss": 0.3311, + "step": 3645 + }, + { + "epoch": 0.23, + "grad_norm": 5.46492374513933, + "learning_rate": 8.994576651978613e-06, + "loss": 0.3179, + "step": 3646 + }, + { + "epoch": 0.23, + "grad_norm": 3.584422035067047, + "learning_rate": 8.993964013023398e-06, + "loss": 0.3129, + "step": 3647 + }, + { + "epoch": 0.23, + "grad_norm": 0.8053558546913938, + "learning_rate": 8.993351208351521e-06, + "loss": 0.4946, + "step": 3648 + }, + { + "epoch": 0.23, + "grad_norm": 2.749542234067599, + "learning_rate": 8.99273823798841e-06, + "loss": 0.3234, + "step": 3649 + }, + { + "epoch": 0.23, + "grad_norm": 4.943498660557402, + "learning_rate": 8.992125101959499e-06, + "loss": 0.3296, + "step": 3650 + }, + { + "epoch": 0.23, + "grad_norm": 5.275118711234532, + "learning_rate": 8.991511800290224e-06, + "loss": 0.351, + "step": 3651 + }, + { + "epoch": 0.23, + "grad_norm": 3.236552152549496, + "learning_rate": 8.990898333006038e-06, + "loss": 0.3288, + "step": 3652 + }, + { + "epoch": 0.23, + "grad_norm": 2.566987854465521, + "learning_rate": 8.990284700132388e-06, + "loss": 0.3435, + "step": 3653 + }, + { + "epoch": 0.23, + "grad_norm": 4.753969343465564, + "learning_rate": 8.98967090169474e-06, + "loss": 0.3323, + "step": 3654 + }, + { + "epoch": 0.23, + "grad_norm": 4.09232575087877, + "learning_rate": 8.98905693771856e-06, + "loss": 0.3177, + "step": 3655 + }, + { + "epoch": 0.23, + "grad_norm": 13.73294460088611, + "learning_rate": 8.988442808229321e-06, + "loss": 0.3312, + "step": 3656 + }, + { + "epoch": 0.23, + "grad_norm": 3.124225989766791, + "learning_rate": 8.987828513252508e-06, + "loss": 0.3328, + "step": 3657 + }, + { + "epoch": 0.23, + "grad_norm": 2.6993878960059834, + "learning_rate": 8.987214052813605e-06, + "loss": 0.321, + "step": 3658 + }, + { + "epoch": 0.23, + "grad_norm": 3.571417924358675, + "learning_rate": 8.98659942693811e-06, + "loss": 0.3178, + "step": 3659 + }, + { + "epoch": 0.23, + "grad_norm": 4.062422683062402, + "learning_rate": 8.985984635651523e-06, + "loss": 0.3215, + "step": 3660 + }, + { + "epoch": 0.23, + "grad_norm": 8.53089332379965, + "learning_rate": 8.985369678979351e-06, + "loss": 0.3241, + "step": 3661 + }, + { + "epoch": 0.23, + "grad_norm": 2.9476753349579776, + "learning_rate": 8.984754556947116e-06, + "loss": 0.3145, + "step": 3662 + }, + { + "epoch": 0.23, + "grad_norm": 3.5988612497606813, + "learning_rate": 8.984139269580337e-06, + "loss": 0.3628, + "step": 3663 + }, + { + "epoch": 0.23, + "grad_norm": 2.463845682359028, + "learning_rate": 8.98352381690454e-06, + "loss": 0.328, + "step": 3664 + }, + { + "epoch": 0.23, + "grad_norm": 1.9701219402967818, + "learning_rate": 8.982908198945266e-06, + "loss": 0.3132, + "step": 3665 + }, + { + "epoch": 0.23, + "grad_norm": 6.189989455839819, + "learning_rate": 8.982292415728057e-06, + "loss": 0.3458, + "step": 3666 + }, + { + "epoch": 0.23, + "grad_norm": 2.1810603676710274, + "learning_rate": 8.981676467278461e-06, + "loss": 0.309, + "step": 3667 + }, + { + "epoch": 0.23, + "grad_norm": 4.188915244416057, + "learning_rate": 8.981060353622037e-06, + "loss": 0.3099, + "step": 3668 + }, + { + "epoch": 0.23, + "grad_norm": 0.8310137010128612, + "learning_rate": 8.980444074784347e-06, + "loss": 0.5004, + "step": 3669 + }, + { + "epoch": 0.23, + "grad_norm": 2.259419586692642, + "learning_rate": 8.979827630790962e-06, + "loss": 0.3066, + "step": 3670 + }, + { + "epoch": 0.23, + "grad_norm": 0.6841209023908836, + "learning_rate": 8.979211021667462e-06, + "loss": 0.4819, + "step": 3671 + }, + { + "epoch": 0.23, + "grad_norm": 2.6469296979890933, + "learning_rate": 8.978594247439428e-06, + "loss": 0.3148, + "step": 3672 + }, + { + "epoch": 0.23, + "grad_norm": 94.27647612541318, + "learning_rate": 8.977977308132451e-06, + "loss": 0.3161, + "step": 3673 + }, + { + "epoch": 0.23, + "grad_norm": 3.418441586277652, + "learning_rate": 8.97736020377213e-06, + "loss": 0.3237, + "step": 3674 + }, + { + "epoch": 0.23, + "grad_norm": 2.2618596391224317, + "learning_rate": 8.976742934384069e-06, + "loss": 0.3235, + "step": 3675 + }, + { + "epoch": 0.23, + "grad_norm": 9.700137943136939, + "learning_rate": 8.97612549999388e-06, + "loss": 0.328, + "step": 3676 + }, + { + "epoch": 0.23, + "grad_norm": 12.443272284273608, + "learning_rate": 8.975507900627183e-06, + "loss": 0.3398, + "step": 3677 + }, + { + "epoch": 0.23, + "grad_norm": 3.320011875026783, + "learning_rate": 8.9748901363096e-06, + "loss": 0.3062, + "step": 3678 + }, + { + "epoch": 0.23, + "grad_norm": 3.802621666852393, + "learning_rate": 8.974272207066767e-06, + "loss": 0.3297, + "step": 3679 + }, + { + "epoch": 0.23, + "grad_norm": 6.487813321902356, + "learning_rate": 8.973654112924321e-06, + "loss": 0.3389, + "step": 3680 + }, + { + "epoch": 0.23, + "grad_norm": 3.811909074531975, + "learning_rate": 8.973035853907906e-06, + "loss": 0.3192, + "step": 3681 + }, + { + "epoch": 0.23, + "grad_norm": 2.903283211290379, + "learning_rate": 8.972417430043178e-06, + "loss": 0.3363, + "step": 3682 + }, + { + "epoch": 0.23, + "grad_norm": 15.701398877082495, + "learning_rate": 8.971798841355794e-06, + "loss": 0.3427, + "step": 3683 + }, + { + "epoch": 0.23, + "grad_norm": 2.833770578560621, + "learning_rate": 8.971180087871423e-06, + "loss": 0.3268, + "step": 3684 + }, + { + "epoch": 0.23, + "grad_norm": 4.305994630544555, + "learning_rate": 8.970561169615734e-06, + "loss": 0.3375, + "step": 3685 + }, + { + "epoch": 0.23, + "grad_norm": 26.902374538022954, + "learning_rate": 8.969942086614413e-06, + "loss": 0.3362, + "step": 3686 + }, + { + "epoch": 0.23, + "grad_norm": 5.1060702742579815, + "learning_rate": 8.969322838893141e-06, + "loss": 0.3261, + "step": 3687 + }, + { + "epoch": 0.23, + "grad_norm": 3.6775137361151065, + "learning_rate": 8.968703426477614e-06, + "loss": 0.3114, + "step": 3688 + }, + { + "epoch": 0.23, + "grad_norm": 2.3155985014505402, + "learning_rate": 8.968083849393535e-06, + "loss": 0.3194, + "step": 3689 + }, + { + "epoch": 0.23, + "grad_norm": 1.1287358994957526, + "learning_rate": 8.967464107666605e-06, + "loss": 0.5333, + "step": 3690 + }, + { + "epoch": 0.23, + "grad_norm": 3.3079687843565235, + "learning_rate": 8.966844201322546e-06, + "loss": 0.3094, + "step": 3691 + }, + { + "epoch": 0.23, + "grad_norm": 2.262970277809321, + "learning_rate": 8.966224130387073e-06, + "loss": 0.3257, + "step": 3692 + }, + { + "epoch": 0.23, + "grad_norm": 2.5092325938675017, + "learning_rate": 8.965603894885917e-06, + "loss": 0.3101, + "step": 3693 + }, + { + "epoch": 0.23, + "grad_norm": 2.107537115365177, + "learning_rate": 8.96498349484481e-06, + "loss": 0.3248, + "step": 3694 + }, + { + "epoch": 0.23, + "grad_norm": 2.978604594760016, + "learning_rate": 8.964362930289497e-06, + "loss": 0.3308, + "step": 3695 + }, + { + "epoch": 0.23, + "grad_norm": 2.581691378008034, + "learning_rate": 8.963742201245725e-06, + "loss": 0.3307, + "step": 3696 + }, + { + "epoch": 0.23, + "grad_norm": 6.270998623950874, + "learning_rate": 8.963121307739246e-06, + "loss": 0.3145, + "step": 3697 + }, + { + "epoch": 0.23, + "grad_norm": 3.328146910350842, + "learning_rate": 8.962500249795826e-06, + "loss": 0.3495, + "step": 3698 + }, + { + "epoch": 0.23, + "grad_norm": 2.4659047660069, + "learning_rate": 8.961879027441234e-06, + "loss": 0.3082, + "step": 3699 + }, + { + "epoch": 0.23, + "grad_norm": 1.6379598399266184, + "learning_rate": 8.961257640701243e-06, + "loss": 0.3328, + "step": 3700 + }, + { + "epoch": 0.23, + "grad_norm": 4.803712893457548, + "learning_rate": 8.960636089601636e-06, + "loss": 0.309, + "step": 3701 + }, + { + "epoch": 0.23, + "grad_norm": 1.910081057150349, + "learning_rate": 8.960014374168203e-06, + "loss": 0.3428, + "step": 3702 + }, + { + "epoch": 0.23, + "grad_norm": 4.2453245711636765, + "learning_rate": 8.959392494426739e-06, + "loss": 0.324, + "step": 3703 + }, + { + "epoch": 0.23, + "grad_norm": 2.6834998604875886, + "learning_rate": 8.958770450403049e-06, + "loss": 0.3078, + "step": 3704 + }, + { + "epoch": 0.23, + "grad_norm": 3.0581316057051184, + "learning_rate": 8.958148242122941e-06, + "loss": 0.3152, + "step": 3705 + }, + { + "epoch": 0.23, + "grad_norm": 3.5118843044790875, + "learning_rate": 8.957525869612234e-06, + "loss": 0.3574, + "step": 3706 + }, + { + "epoch": 0.23, + "grad_norm": 3.8920246205566102, + "learning_rate": 8.956903332896747e-06, + "loss": 0.3389, + "step": 3707 + }, + { + "epoch": 0.23, + "grad_norm": 2.730116226629794, + "learning_rate": 8.956280632002312e-06, + "loss": 0.312, + "step": 3708 + }, + { + "epoch": 0.23, + "grad_norm": 4.230495785228844, + "learning_rate": 8.955657766954768e-06, + "loss": 0.3301, + "step": 3709 + }, + { + "epoch": 0.23, + "grad_norm": 4.865760992518893, + "learning_rate": 8.955034737779955e-06, + "loss": 0.3301, + "step": 3710 + }, + { + "epoch": 0.23, + "grad_norm": 5.083156251365915, + "learning_rate": 8.95441154450373e-06, + "loss": 0.3169, + "step": 3711 + }, + { + "epoch": 0.23, + "grad_norm": 2.8244629017007736, + "learning_rate": 8.953788187151941e-06, + "loss": 0.3368, + "step": 3712 + }, + { + "epoch": 0.23, + "grad_norm": 2.459738031389702, + "learning_rate": 8.953164665750462e-06, + "loss": 0.3271, + "step": 3713 + }, + { + "epoch": 0.23, + "grad_norm": 1.7308846485377833, + "learning_rate": 8.952540980325158e-06, + "loss": 0.327, + "step": 3714 + }, + { + "epoch": 0.23, + "grad_norm": 2.181800187862495, + "learning_rate": 8.951917130901906e-06, + "loss": 0.3441, + "step": 3715 + }, + { + "epoch": 0.23, + "grad_norm": 3.1652979984756446, + "learning_rate": 8.951293117506595e-06, + "loss": 0.3309, + "step": 3716 + }, + { + "epoch": 0.23, + "grad_norm": 5.781485351811903, + "learning_rate": 8.950668940165112e-06, + "loss": 0.3367, + "step": 3717 + }, + { + "epoch": 0.23, + "grad_norm": 6.629968117730134, + "learning_rate": 8.95004459890336e-06, + "loss": 0.3476, + "step": 3718 + }, + { + "epoch": 0.23, + "grad_norm": 3.1850867382302868, + "learning_rate": 8.94942009374724e-06, + "loss": 0.3261, + "step": 3719 + }, + { + "epoch": 0.23, + "grad_norm": 2.3544394777063706, + "learning_rate": 8.948795424722667e-06, + "loss": 0.3375, + "step": 3720 + }, + { + "epoch": 0.23, + "grad_norm": 6.028530381368752, + "learning_rate": 8.948170591855556e-06, + "loss": 0.3539, + "step": 3721 + }, + { + "epoch": 0.23, + "grad_norm": 3.4090725307856715, + "learning_rate": 8.947545595171836e-06, + "loss": 0.3172, + "step": 3722 + }, + { + "epoch": 0.23, + "grad_norm": 2.185929210270183, + "learning_rate": 8.946920434697438e-06, + "loss": 0.3359, + "step": 3723 + }, + { + "epoch": 0.23, + "grad_norm": 2.6740142148931216, + "learning_rate": 8.946295110458298e-06, + "loss": 0.3222, + "step": 3724 + }, + { + "epoch": 0.23, + "grad_norm": 1.9662346786765055, + "learning_rate": 8.945669622480366e-06, + "loss": 0.3276, + "step": 3725 + }, + { + "epoch": 0.23, + "grad_norm": 3.0089739848021453, + "learning_rate": 8.945043970789595e-06, + "loss": 0.3392, + "step": 3726 + }, + { + "epoch": 0.23, + "grad_norm": 2.270920344430561, + "learning_rate": 8.94441815541194e-06, + "loss": 0.3474, + "step": 3727 + }, + { + "epoch": 0.23, + "grad_norm": 2.2934456477702465, + "learning_rate": 8.94379217637337e-06, + "loss": 0.3351, + "step": 3728 + }, + { + "epoch": 0.23, + "grad_norm": 12.182996384810266, + "learning_rate": 8.943166033699858e-06, + "loss": 0.3303, + "step": 3729 + }, + { + "epoch": 0.23, + "grad_norm": 2.6661865474886857, + "learning_rate": 8.942539727417383e-06, + "loss": 0.3328, + "step": 3730 + }, + { + "epoch": 0.23, + "grad_norm": 0.8438046535209291, + "learning_rate": 8.941913257551933e-06, + "loss": 0.5067, + "step": 3731 + }, + { + "epoch": 0.23, + "grad_norm": 2.816218547431086, + "learning_rate": 8.941286624129499e-06, + "loss": 0.3015, + "step": 3732 + }, + { + "epoch": 0.23, + "grad_norm": 2.781419759224102, + "learning_rate": 8.940659827176083e-06, + "loss": 0.3221, + "step": 3733 + }, + { + "epoch": 0.23, + "grad_norm": 5.139232330368408, + "learning_rate": 8.940032866717691e-06, + "loss": 0.3431, + "step": 3734 + }, + { + "epoch": 0.23, + "grad_norm": 4.040457050157037, + "learning_rate": 8.939405742780339e-06, + "loss": 0.3329, + "step": 3735 + }, + { + "epoch": 0.23, + "grad_norm": 4.425416944842065, + "learning_rate": 8.938778455390044e-06, + "loss": 0.3168, + "step": 3736 + }, + { + "epoch": 0.24, + "grad_norm": 2.55777911347417, + "learning_rate": 8.938151004572836e-06, + "loss": 0.3279, + "step": 3737 + }, + { + "epoch": 0.24, + "grad_norm": 3.3790259567541567, + "learning_rate": 8.937523390354746e-06, + "loss": 0.3341, + "step": 3738 + }, + { + "epoch": 0.24, + "grad_norm": 2.4216656743961242, + "learning_rate": 8.936895612761817e-06, + "loss": 0.3277, + "step": 3739 + }, + { + "epoch": 0.24, + "grad_norm": 3.976394989991298, + "learning_rate": 8.936267671820097e-06, + "loss": 0.3328, + "step": 3740 + }, + { + "epoch": 0.24, + "grad_norm": 1.830408235172937, + "learning_rate": 8.935639567555639e-06, + "loss": 0.3122, + "step": 3741 + }, + { + "epoch": 0.24, + "grad_norm": 2.3137290923268266, + "learning_rate": 8.935011299994506e-06, + "loss": 0.3483, + "step": 3742 + }, + { + "epoch": 0.24, + "grad_norm": 0.6892055719287272, + "learning_rate": 8.934382869162763e-06, + "loss": 0.4885, + "step": 3743 + }, + { + "epoch": 0.24, + "grad_norm": 3.3636777971339535, + "learning_rate": 8.933754275086487e-06, + "loss": 0.3242, + "step": 3744 + }, + { + "epoch": 0.24, + "grad_norm": 2.5710501860164143, + "learning_rate": 8.93312551779176e-06, + "loss": 0.3491, + "step": 3745 + }, + { + "epoch": 0.24, + "grad_norm": 2.2249085380746747, + "learning_rate": 8.93249659730467e-06, + "loss": 0.3356, + "step": 3746 + }, + { + "epoch": 0.24, + "grad_norm": 2.5807819832528485, + "learning_rate": 8.931867513651311e-06, + "loss": 0.3285, + "step": 3747 + }, + { + "epoch": 0.24, + "grad_norm": 2.8940150187139984, + "learning_rate": 8.931238266857783e-06, + "loss": 0.3306, + "step": 3748 + }, + { + "epoch": 0.24, + "grad_norm": 19.766635051508676, + "learning_rate": 8.930608856950198e-06, + "loss": 0.313, + "step": 3749 + }, + { + "epoch": 0.24, + "grad_norm": 4.104603320745064, + "learning_rate": 8.92997928395467e-06, + "loss": 0.3398, + "step": 3750 + }, + { + "epoch": 0.24, + "grad_norm": 4.647511585459074, + "learning_rate": 8.929349547897321e-06, + "loss": 0.3167, + "step": 3751 + }, + { + "epoch": 0.24, + "grad_norm": 3.7658197810386245, + "learning_rate": 8.92871964880428e-06, + "loss": 0.3431, + "step": 3752 + }, + { + "epoch": 0.24, + "grad_norm": 2.5170280748470266, + "learning_rate": 8.928089586701682e-06, + "loss": 0.3142, + "step": 3753 + }, + { + "epoch": 0.24, + "grad_norm": 3.8799127059400216, + "learning_rate": 8.927459361615672e-06, + "loss": 0.3149, + "step": 3754 + }, + { + "epoch": 0.24, + "grad_norm": 2.1001750807614474, + "learning_rate": 8.926828973572396e-06, + "loss": 0.3292, + "step": 3755 + }, + { + "epoch": 0.24, + "grad_norm": 5.834904750639392, + "learning_rate": 8.92619842259801e-06, + "loss": 0.3167, + "step": 3756 + }, + { + "epoch": 0.24, + "grad_norm": 3.6708523043172825, + "learning_rate": 8.925567708718679e-06, + "loss": 0.3283, + "step": 3757 + }, + { + "epoch": 0.24, + "grad_norm": 4.647499695013446, + "learning_rate": 8.92493683196057e-06, + "loss": 0.3223, + "step": 3758 + }, + { + "epoch": 0.24, + "grad_norm": 3.0279041702713605, + "learning_rate": 8.924305792349861e-06, + "loss": 0.333, + "step": 3759 + }, + { + "epoch": 0.24, + "grad_norm": 4.7332960540866695, + "learning_rate": 8.923674589912735e-06, + "loss": 0.3194, + "step": 3760 + }, + { + "epoch": 0.24, + "grad_norm": 3.432950887421614, + "learning_rate": 8.92304322467538e-06, + "loss": 0.341, + "step": 3761 + }, + { + "epoch": 0.24, + "grad_norm": 4.082062228111898, + "learning_rate": 8.922411696663992e-06, + "loss": 0.3478, + "step": 3762 + }, + { + "epoch": 0.24, + "grad_norm": 2.9000167492809568, + "learning_rate": 8.921780005904779e-06, + "loss": 0.3429, + "step": 3763 + }, + { + "epoch": 0.24, + "grad_norm": 3.2337721272758104, + "learning_rate": 8.921148152423946e-06, + "loss": 0.3257, + "step": 3764 + }, + { + "epoch": 0.24, + "grad_norm": 2.471092327159333, + "learning_rate": 8.920516136247712e-06, + "loss": 0.3332, + "step": 3765 + }, + { + "epoch": 0.24, + "grad_norm": 2.743527301229908, + "learning_rate": 8.9198839574023e-06, + "loss": 0.3247, + "step": 3766 + }, + { + "epoch": 0.24, + "grad_norm": 1.899637854207825, + "learning_rate": 8.91925161591394e-06, + "loss": 0.3148, + "step": 3767 + }, + { + "epoch": 0.24, + "grad_norm": 0.6936980063567434, + "learning_rate": 8.918619111808869e-06, + "loss": 0.5216, + "step": 3768 + }, + { + "epoch": 0.24, + "grad_norm": 4.055760857112053, + "learning_rate": 8.917986445113329e-06, + "loss": 0.317, + "step": 3769 + }, + { + "epoch": 0.24, + "grad_norm": 2.2410123659660615, + "learning_rate": 8.917353615853575e-06, + "loss": 0.3434, + "step": 3770 + }, + { + "epoch": 0.24, + "grad_norm": 2.233610095788776, + "learning_rate": 8.91672062405586e-06, + "loss": 0.3511, + "step": 3771 + }, + { + "epoch": 0.24, + "grad_norm": 3.05950930626397, + "learning_rate": 8.916087469746448e-06, + "loss": 0.3216, + "step": 3772 + }, + { + "epoch": 0.24, + "grad_norm": 3.964420344525788, + "learning_rate": 8.915454152951613e-06, + "loss": 0.3484, + "step": 3773 + }, + { + "epoch": 0.24, + "grad_norm": 2.502269758118046, + "learning_rate": 8.91482067369763e-06, + "loss": 0.3218, + "step": 3774 + }, + { + "epoch": 0.24, + "grad_norm": 4.546544722679174, + "learning_rate": 8.914187032010786e-06, + "loss": 0.3439, + "step": 3775 + }, + { + "epoch": 0.24, + "grad_norm": 2.9001461345661297, + "learning_rate": 8.913553227917366e-06, + "loss": 0.3146, + "step": 3776 + }, + { + "epoch": 0.24, + "grad_norm": 2.895905726844477, + "learning_rate": 8.912919261443674e-06, + "loss": 0.341, + "step": 3777 + }, + { + "epoch": 0.24, + "grad_norm": 3.256103512767372, + "learning_rate": 8.912285132616012e-06, + "loss": 0.3249, + "step": 3778 + }, + { + "epoch": 0.24, + "grad_norm": 1.7257512595344169, + "learning_rate": 8.911650841460688e-06, + "loss": 0.3137, + "step": 3779 + }, + { + "epoch": 0.24, + "grad_norm": 3.555245107250307, + "learning_rate": 8.911016388004026e-06, + "loss": 0.3293, + "step": 3780 + }, + { + "epoch": 0.24, + "grad_norm": 5.247794459675173, + "learning_rate": 8.910381772272345e-06, + "loss": 0.3238, + "step": 3781 + }, + { + "epoch": 0.24, + "grad_norm": 5.205686012722772, + "learning_rate": 8.90974699429198e-06, + "loss": 0.3413, + "step": 3782 + }, + { + "epoch": 0.24, + "grad_norm": 4.756368157426908, + "learning_rate": 8.909112054089266e-06, + "loss": 0.3684, + "step": 3783 + }, + { + "epoch": 0.24, + "grad_norm": 1.8866906639618424, + "learning_rate": 8.90847695169055e-06, + "loss": 0.3259, + "step": 3784 + }, + { + "epoch": 0.24, + "grad_norm": 4.379658212804618, + "learning_rate": 8.907841687122185e-06, + "loss": 0.3373, + "step": 3785 + }, + { + "epoch": 0.24, + "grad_norm": 3.1938073812990284, + "learning_rate": 8.907206260410527e-06, + "loss": 0.3446, + "step": 3786 + }, + { + "epoch": 0.24, + "grad_norm": 3.0231368009515407, + "learning_rate": 8.906570671581937e-06, + "loss": 0.3385, + "step": 3787 + }, + { + "epoch": 0.24, + "grad_norm": 3.854565265644447, + "learning_rate": 8.905934920662796e-06, + "loss": 0.3425, + "step": 3788 + }, + { + "epoch": 0.24, + "grad_norm": 2.6267245134019084, + "learning_rate": 8.905299007679475e-06, + "loss": 0.3313, + "step": 3789 + }, + { + "epoch": 0.24, + "grad_norm": 3.18832562818506, + "learning_rate": 8.904662932658363e-06, + "loss": 0.3223, + "step": 3790 + }, + { + "epoch": 0.24, + "grad_norm": 3.2337071872214906, + "learning_rate": 8.90402669562585e-06, + "loss": 0.3307, + "step": 3791 + }, + { + "epoch": 0.24, + "grad_norm": 3.152753565770728, + "learning_rate": 8.903390296608334e-06, + "loss": 0.3188, + "step": 3792 + }, + { + "epoch": 0.24, + "grad_norm": 2.4122914444398, + "learning_rate": 8.902753735632221e-06, + "loss": 0.3407, + "step": 3793 + }, + { + "epoch": 0.24, + "grad_norm": 8.65155176540431, + "learning_rate": 8.902117012723926e-06, + "loss": 0.3396, + "step": 3794 + }, + { + "epoch": 0.24, + "grad_norm": 2.2503548434494247, + "learning_rate": 8.901480127909862e-06, + "loss": 0.3302, + "step": 3795 + }, + { + "epoch": 0.24, + "grad_norm": 7.316943240218811, + "learning_rate": 8.900843081216462e-06, + "loss": 0.3291, + "step": 3796 + }, + { + "epoch": 0.24, + "grad_norm": 2.2810976801956016, + "learning_rate": 8.90020587267015e-06, + "loss": 0.3135, + "step": 3797 + }, + { + "epoch": 0.24, + "grad_norm": 2.7027236895256603, + "learning_rate": 8.89956850229737e-06, + "loss": 0.318, + "step": 3798 + }, + { + "epoch": 0.24, + "grad_norm": 1.9369146122029532, + "learning_rate": 8.898930970124567e-06, + "loss": 0.3121, + "step": 3799 + }, + { + "epoch": 0.24, + "grad_norm": 7.284425396522207, + "learning_rate": 8.898293276178191e-06, + "loss": 0.3415, + "step": 3800 + }, + { + "epoch": 0.24, + "grad_norm": 7.813046511380066, + "learning_rate": 8.897655420484705e-06, + "loss": 0.3289, + "step": 3801 + }, + { + "epoch": 0.24, + "grad_norm": 2.7255512268285473, + "learning_rate": 8.897017403070572e-06, + "loss": 0.3326, + "step": 3802 + }, + { + "epoch": 0.24, + "grad_norm": 2.884150913605871, + "learning_rate": 8.896379223962265e-06, + "loss": 0.3316, + "step": 3803 + }, + { + "epoch": 0.24, + "grad_norm": 2.4795682828721763, + "learning_rate": 8.895740883186263e-06, + "loss": 0.3576, + "step": 3804 + }, + { + "epoch": 0.24, + "grad_norm": 3.6271732721104906, + "learning_rate": 8.895102380769051e-06, + "loss": 0.3586, + "step": 3805 + }, + { + "epoch": 0.24, + "grad_norm": 4.08513026610504, + "learning_rate": 8.894463716737125e-06, + "loss": 0.337, + "step": 3806 + }, + { + "epoch": 0.24, + "grad_norm": 2.4815764338416195, + "learning_rate": 8.89382489111698e-06, + "loss": 0.3277, + "step": 3807 + }, + { + "epoch": 0.24, + "grad_norm": 2.491311962319985, + "learning_rate": 8.893185903935125e-06, + "loss": 0.3257, + "step": 3808 + }, + { + "epoch": 0.24, + "grad_norm": 2.3811724499469147, + "learning_rate": 8.892546755218071e-06, + "loss": 0.3281, + "step": 3809 + }, + { + "epoch": 0.24, + "grad_norm": 6.7815819883152955, + "learning_rate": 8.89190744499234e-06, + "loss": 0.3304, + "step": 3810 + }, + { + "epoch": 0.24, + "grad_norm": 3.149020440309964, + "learning_rate": 8.891267973284457e-06, + "loss": 0.3349, + "step": 3811 + }, + { + "epoch": 0.24, + "grad_norm": 1.9390393546693092, + "learning_rate": 8.890628340120951e-06, + "loss": 0.3374, + "step": 3812 + }, + { + "epoch": 0.24, + "grad_norm": 1.97863421238819, + "learning_rate": 8.889988545528366e-06, + "loss": 0.3251, + "step": 3813 + }, + { + "epoch": 0.24, + "grad_norm": 4.164845331444898, + "learning_rate": 8.889348589533247e-06, + "loss": 0.3195, + "step": 3814 + }, + { + "epoch": 0.24, + "grad_norm": 2.3626702090731966, + "learning_rate": 8.888708472162147e-06, + "loss": 0.3169, + "step": 3815 + }, + { + "epoch": 0.24, + "grad_norm": 1.7538277936932547, + "learning_rate": 8.888068193441625e-06, + "loss": 0.3222, + "step": 3816 + }, + { + "epoch": 0.24, + "grad_norm": 1.8797303039884392, + "learning_rate": 8.887427753398249e-06, + "loss": 0.3289, + "step": 3817 + }, + { + "epoch": 0.24, + "grad_norm": 1.6560216591031136, + "learning_rate": 8.886787152058586e-06, + "loss": 0.3268, + "step": 3818 + }, + { + "epoch": 0.24, + "grad_norm": 1.9276721444233782, + "learning_rate": 8.886146389449226e-06, + "loss": 0.3232, + "step": 3819 + }, + { + "epoch": 0.24, + "grad_norm": 4.730994382511737, + "learning_rate": 8.885505465596747e-06, + "loss": 0.3362, + "step": 3820 + }, + { + "epoch": 0.24, + "grad_norm": 2.2764880557056943, + "learning_rate": 8.884864380527744e-06, + "loss": 0.3371, + "step": 3821 + }, + { + "epoch": 0.24, + "grad_norm": 2.450776452083643, + "learning_rate": 8.884223134268818e-06, + "loss": 0.3482, + "step": 3822 + }, + { + "epoch": 0.24, + "grad_norm": 2.20623220073423, + "learning_rate": 8.883581726846577e-06, + "loss": 0.3249, + "step": 3823 + }, + { + "epoch": 0.24, + "grad_norm": 2.187844803081438, + "learning_rate": 8.882940158287632e-06, + "loss": 0.3042, + "step": 3824 + }, + { + "epoch": 0.24, + "grad_norm": 2.7681831053019352, + "learning_rate": 8.882298428618601e-06, + "loss": 0.3337, + "step": 3825 + }, + { + "epoch": 0.24, + "grad_norm": 2.651769413743395, + "learning_rate": 8.881656537866114e-06, + "loss": 0.33, + "step": 3826 + }, + { + "epoch": 0.24, + "grad_norm": 1.684097477318379, + "learning_rate": 8.8810144860568e-06, + "loss": 0.3258, + "step": 3827 + }, + { + "epoch": 0.24, + "grad_norm": 2.8215029317151425, + "learning_rate": 8.880372273217307e-06, + "loss": 0.3073, + "step": 3828 + }, + { + "epoch": 0.24, + "grad_norm": 2.582169091594926, + "learning_rate": 8.879729899374269e-06, + "loss": 0.3372, + "step": 3829 + }, + { + "epoch": 0.24, + "grad_norm": 2.6347330332007806, + "learning_rate": 8.879087364554351e-06, + "loss": 0.3361, + "step": 3830 + }, + { + "epoch": 0.24, + "grad_norm": 2.2103303600187734, + "learning_rate": 8.878444668784208e-06, + "loss": 0.3396, + "step": 3831 + }, + { + "epoch": 0.24, + "grad_norm": 1.8936190746169632, + "learning_rate": 8.877801812090505e-06, + "loss": 0.3371, + "step": 3832 + }, + { + "epoch": 0.24, + "grad_norm": 6.024179134237282, + "learning_rate": 8.877158794499919e-06, + "loss": 0.3271, + "step": 3833 + }, + { + "epoch": 0.24, + "grad_norm": 2.577105719299173, + "learning_rate": 8.876515616039126e-06, + "loss": 0.3583, + "step": 3834 + }, + { + "epoch": 0.24, + "grad_norm": 1.7980166390455867, + "learning_rate": 8.875872276734816e-06, + "loss": 0.3328, + "step": 3835 + }, + { + "epoch": 0.24, + "grad_norm": 1.718324015467852, + "learning_rate": 8.87522877661368e-06, + "loss": 0.3248, + "step": 3836 + }, + { + "epoch": 0.24, + "grad_norm": 3.6030182399426876, + "learning_rate": 8.874585115702419e-06, + "loss": 0.3337, + "step": 3837 + }, + { + "epoch": 0.24, + "grad_norm": 3.13675666787467, + "learning_rate": 8.873941294027742e-06, + "loss": 0.3441, + "step": 3838 + }, + { + "epoch": 0.24, + "grad_norm": 3.92834574895222, + "learning_rate": 8.873297311616356e-06, + "loss": 0.3284, + "step": 3839 + }, + { + "epoch": 0.24, + "grad_norm": 3.09250773906718, + "learning_rate": 8.872653168494988e-06, + "loss": 0.3543, + "step": 3840 + }, + { + "epoch": 0.24, + "grad_norm": 3.0568890325978706, + "learning_rate": 8.872008864690358e-06, + "loss": 0.3658, + "step": 3841 + }, + { + "epoch": 0.24, + "grad_norm": 11.629916235669576, + "learning_rate": 8.871364400229206e-06, + "loss": 0.3304, + "step": 3842 + }, + { + "epoch": 0.24, + "grad_norm": 1.4120061975792972, + "learning_rate": 8.870719775138266e-06, + "loss": 0.3273, + "step": 3843 + }, + { + "epoch": 0.24, + "grad_norm": 2.4307263050769583, + "learning_rate": 8.870074989444289e-06, + "loss": 0.3571, + "step": 3844 + }, + { + "epoch": 0.24, + "grad_norm": 1.631956928757783, + "learning_rate": 8.869430043174027e-06, + "loss": 0.3187, + "step": 3845 + }, + { + "epoch": 0.24, + "grad_norm": 1.9659698865753197, + "learning_rate": 8.868784936354239e-06, + "loss": 0.3204, + "step": 3846 + }, + { + "epoch": 0.24, + "grad_norm": 2.3322765690960945, + "learning_rate": 8.868139669011693e-06, + "loss": 0.3383, + "step": 3847 + }, + { + "epoch": 0.24, + "grad_norm": 2.497011087121863, + "learning_rate": 8.86749424117316e-06, + "loss": 0.3261, + "step": 3848 + }, + { + "epoch": 0.24, + "grad_norm": 2.2073216602945696, + "learning_rate": 8.866848652865422e-06, + "loss": 0.3473, + "step": 3849 + }, + { + "epoch": 0.24, + "grad_norm": 2.733412826783298, + "learning_rate": 8.866202904115265e-06, + "loss": 0.3377, + "step": 3850 + }, + { + "epoch": 0.24, + "grad_norm": 4.996459300023647, + "learning_rate": 8.865556994949484e-06, + "loss": 0.344, + "step": 3851 + }, + { + "epoch": 0.24, + "grad_norm": 0.7861627929439978, + "learning_rate": 8.864910925394875e-06, + "loss": 0.5338, + "step": 3852 + }, + { + "epoch": 0.24, + "grad_norm": 1.8191845219998062, + "learning_rate": 8.864264695478249e-06, + "loss": 0.3341, + "step": 3853 + }, + { + "epoch": 0.24, + "grad_norm": 2.6348075695485664, + "learning_rate": 8.863618305226415e-06, + "loss": 0.3345, + "step": 3854 + }, + { + "epoch": 0.24, + "grad_norm": 2.3203016489538917, + "learning_rate": 8.862971754666197e-06, + "loss": 0.3778, + "step": 3855 + }, + { + "epoch": 0.24, + "grad_norm": 2.023482839826505, + "learning_rate": 8.862325043824417e-06, + "loss": 0.3353, + "step": 3856 + }, + { + "epoch": 0.24, + "grad_norm": 2.0528876842471475, + "learning_rate": 8.861678172727912e-06, + "loss": 0.3191, + "step": 3857 + }, + { + "epoch": 0.24, + "grad_norm": 2.890524971239111, + "learning_rate": 8.861031141403521e-06, + "loss": 0.3361, + "step": 3858 + }, + { + "epoch": 0.24, + "grad_norm": 3.9241296618676054, + "learning_rate": 8.86038394987809e-06, + "loss": 0.3414, + "step": 3859 + }, + { + "epoch": 0.24, + "grad_norm": 2.095547267068001, + "learning_rate": 8.859736598178472e-06, + "loss": 0.3347, + "step": 3860 + }, + { + "epoch": 0.24, + "grad_norm": 3.4156849155693427, + "learning_rate": 8.859089086331525e-06, + "loss": 0.3499, + "step": 3861 + }, + { + "epoch": 0.24, + "grad_norm": 2.41831697144385, + "learning_rate": 8.85844141436412e-06, + "loss": 0.3265, + "step": 3862 + }, + { + "epoch": 0.24, + "grad_norm": 3.005906095641753, + "learning_rate": 8.857793582303126e-06, + "loss": 0.3405, + "step": 3863 + }, + { + "epoch": 0.24, + "grad_norm": 1.968545434221177, + "learning_rate": 8.857145590175424e-06, + "loss": 0.3602, + "step": 3864 + }, + { + "epoch": 0.24, + "grad_norm": 0.7047470613912522, + "learning_rate": 8.8564974380079e-06, + "loss": 0.4768, + "step": 3865 + }, + { + "epoch": 0.24, + "grad_norm": 1.9544057181880778, + "learning_rate": 8.855849125827449e-06, + "loss": 0.331, + "step": 3866 + }, + { + "epoch": 0.24, + "grad_norm": 26.940024139412586, + "learning_rate": 8.855200653660968e-06, + "loss": 0.3507, + "step": 3867 + }, + { + "epoch": 0.24, + "grad_norm": 5.0242492038546995, + "learning_rate": 8.854552021535364e-06, + "loss": 0.349, + "step": 3868 + }, + { + "epoch": 0.24, + "grad_norm": 2.741353559421358, + "learning_rate": 8.853903229477551e-06, + "loss": 0.3585, + "step": 3869 + }, + { + "epoch": 0.24, + "grad_norm": 0.6038561947185022, + "learning_rate": 8.853254277514448e-06, + "loss": 0.5322, + "step": 3870 + }, + { + "epoch": 0.24, + "grad_norm": 2.1505945163444187, + "learning_rate": 8.852605165672978e-06, + "loss": 0.325, + "step": 3871 + }, + { + "epoch": 0.24, + "grad_norm": 12.921557449567707, + "learning_rate": 8.85195589398008e-06, + "loss": 0.3531, + "step": 3872 + }, + { + "epoch": 0.24, + "grad_norm": 1.8701218681116438, + "learning_rate": 8.851306462462689e-06, + "loss": 0.3349, + "step": 3873 + }, + { + "epoch": 0.24, + "grad_norm": 5.742681666858432, + "learning_rate": 8.850656871147751e-06, + "loss": 0.3391, + "step": 3874 + }, + { + "epoch": 0.24, + "grad_norm": 2.2987565301394373, + "learning_rate": 8.850007120062222e-06, + "loss": 0.328, + "step": 3875 + }, + { + "epoch": 0.24, + "grad_norm": 1.915574876506993, + "learning_rate": 8.849357209233058e-06, + "loss": 0.3304, + "step": 3876 + }, + { + "epoch": 0.24, + "grad_norm": 2.592669304463846, + "learning_rate": 8.848707138687227e-06, + "loss": 0.3223, + "step": 3877 + }, + { + "epoch": 0.24, + "grad_norm": 7.59090898074836, + "learning_rate": 8.848056908451698e-06, + "loss": 0.3329, + "step": 3878 + }, + { + "epoch": 0.24, + "grad_norm": 1.986947695461738, + "learning_rate": 8.847406518553456e-06, + "loss": 0.3285, + "step": 3879 + }, + { + "epoch": 0.24, + "grad_norm": 2.0903116730798996, + "learning_rate": 8.846755969019483e-06, + "loss": 0.3604, + "step": 3880 + }, + { + "epoch": 0.24, + "grad_norm": 2.156612702728645, + "learning_rate": 8.846105259876771e-06, + "loss": 0.3292, + "step": 3881 + }, + { + "epoch": 0.24, + "grad_norm": 3.6536210372053626, + "learning_rate": 8.845454391152322e-06, + "loss": 0.3352, + "step": 3882 + }, + { + "epoch": 0.24, + "grad_norm": 4.264987913288185, + "learning_rate": 8.84480336287314e-06, + "loss": 0.3223, + "step": 3883 + }, + { + "epoch": 0.24, + "grad_norm": 4.925664752201105, + "learning_rate": 8.844152175066236e-06, + "loss": 0.3229, + "step": 3884 + }, + { + "epoch": 0.24, + "grad_norm": 3.2169062660615215, + "learning_rate": 8.84350082775863e-06, + "loss": 0.3523, + "step": 3885 + }, + { + "epoch": 0.24, + "grad_norm": 1.766125838423153, + "learning_rate": 8.84284932097735e-06, + "loss": 0.3239, + "step": 3886 + }, + { + "epoch": 0.24, + "grad_norm": 1.894999079590576, + "learning_rate": 8.842197654749424e-06, + "loss": 0.3214, + "step": 3887 + }, + { + "epoch": 0.24, + "grad_norm": 2.061528927370333, + "learning_rate": 8.841545829101895e-06, + "loss": 0.3276, + "step": 3888 + }, + { + "epoch": 0.24, + "grad_norm": 3.705359907076887, + "learning_rate": 8.840893844061804e-06, + "loss": 0.3557, + "step": 3889 + }, + { + "epoch": 0.24, + "grad_norm": 2.401023642036005, + "learning_rate": 8.840241699656207e-06, + "loss": 0.3414, + "step": 3890 + }, + { + "epoch": 0.24, + "grad_norm": 3.5443030138097336, + "learning_rate": 8.839589395912159e-06, + "loss": 0.3158, + "step": 3891 + }, + { + "epoch": 0.24, + "grad_norm": 3.336497941852103, + "learning_rate": 8.838936932856727e-06, + "loss": 0.3442, + "step": 3892 + }, + { + "epoch": 0.24, + "grad_norm": 24.628950073411175, + "learning_rate": 8.838284310516985e-06, + "loss": 0.3462, + "step": 3893 + }, + { + "epoch": 0.24, + "grad_norm": 1.9694358642533767, + "learning_rate": 8.837631528920008e-06, + "loss": 0.3383, + "step": 3894 + }, + { + "epoch": 0.24, + "grad_norm": 1.9458972961783314, + "learning_rate": 8.836978588092883e-06, + "loss": 0.3158, + "step": 3895 + }, + { + "epoch": 0.25, + "grad_norm": 1.8284781128041392, + "learning_rate": 8.8363254880627e-06, + "loss": 0.315, + "step": 3896 + }, + { + "epoch": 0.25, + "grad_norm": 2.378205330626439, + "learning_rate": 8.83567222885656e-06, + "loss": 0.3201, + "step": 3897 + }, + { + "epoch": 0.25, + "grad_norm": 5.530432467524255, + "learning_rate": 8.835018810501565e-06, + "loss": 0.3473, + "step": 3898 + }, + { + "epoch": 0.25, + "grad_norm": 2.0424361820949244, + "learning_rate": 8.834365233024829e-06, + "loss": 0.3436, + "step": 3899 + }, + { + "epoch": 0.25, + "grad_norm": 2.1196827075666516, + "learning_rate": 8.833711496453468e-06, + "loss": 0.3393, + "step": 3900 + }, + { + "epoch": 0.25, + "grad_norm": 4.952618656269322, + "learning_rate": 8.833057600814607e-06, + "loss": 0.3624, + "step": 3901 + }, + { + "epoch": 0.25, + "grad_norm": 1.440008519259911, + "learning_rate": 8.832403546135379e-06, + "loss": 0.3372, + "step": 3902 + }, + { + "epoch": 0.25, + "grad_norm": 13.41174185418652, + "learning_rate": 8.831749332442921e-06, + "loss": 0.3237, + "step": 3903 + }, + { + "epoch": 0.25, + "grad_norm": 3.2010395801925373, + "learning_rate": 8.831094959764377e-06, + "loss": 0.3481, + "step": 3904 + }, + { + "epoch": 0.25, + "grad_norm": 7.583312141968117, + "learning_rate": 8.830440428126898e-06, + "loss": 0.3238, + "step": 3905 + }, + { + "epoch": 0.25, + "grad_norm": 0.6722910462704026, + "learning_rate": 8.829785737557642e-06, + "loss": 0.4975, + "step": 3906 + }, + { + "epoch": 0.25, + "grad_norm": 1.8359111684459553, + "learning_rate": 8.829130888083774e-06, + "loss": 0.3305, + "step": 3907 + }, + { + "epoch": 0.25, + "grad_norm": 2.0463744849386925, + "learning_rate": 8.828475879732463e-06, + "loss": 0.3269, + "step": 3908 + }, + { + "epoch": 0.25, + "grad_norm": 1.855492911832764, + "learning_rate": 8.827820712530888e-06, + "loss": 0.3249, + "step": 3909 + }, + { + "epoch": 0.25, + "grad_norm": 1.4209041408111671, + "learning_rate": 8.827165386506233e-06, + "loss": 0.3198, + "step": 3910 + }, + { + "epoch": 0.25, + "grad_norm": 3.925380030191963, + "learning_rate": 8.826509901685689e-06, + "loss": 0.3197, + "step": 3911 + }, + { + "epoch": 0.25, + "grad_norm": 2.2511658113980753, + "learning_rate": 8.82585425809645e-06, + "loss": 0.3525, + "step": 3912 + }, + { + "epoch": 0.25, + "grad_norm": 2.460079825444177, + "learning_rate": 8.825198455765724e-06, + "loss": 0.3389, + "step": 3913 + }, + { + "epoch": 0.25, + "grad_norm": 2.9491557044021177, + "learning_rate": 8.824542494720721e-06, + "loss": 0.3192, + "step": 3914 + }, + { + "epoch": 0.25, + "grad_norm": 3.9619002074501823, + "learning_rate": 8.823886374988655e-06, + "loss": 0.349, + "step": 3915 + }, + { + "epoch": 0.25, + "grad_norm": 1.5782371474866366, + "learning_rate": 8.823230096596751e-06, + "loss": 0.3208, + "step": 3916 + }, + { + "epoch": 0.25, + "grad_norm": 1.7775910976015772, + "learning_rate": 8.82257365957224e-06, + "loss": 0.341, + "step": 3917 + }, + { + "epoch": 0.25, + "grad_norm": 2.419382613452219, + "learning_rate": 8.821917063942359e-06, + "loss": 0.3168, + "step": 3918 + }, + { + "epoch": 0.25, + "grad_norm": 2.2680541421652127, + "learning_rate": 8.82126030973435e-06, + "loss": 0.3577, + "step": 3919 + }, + { + "epoch": 0.25, + "grad_norm": 1.3986296824244648, + "learning_rate": 8.820603396975463e-06, + "loss": 0.3231, + "step": 3920 + }, + { + "epoch": 0.25, + "grad_norm": 1.792165517815148, + "learning_rate": 8.819946325692955e-06, + "loss": 0.3388, + "step": 3921 + }, + { + "epoch": 0.25, + "grad_norm": 1.9036276897324675, + "learning_rate": 8.81928909591409e-06, + "loss": 0.3421, + "step": 3922 + }, + { + "epoch": 0.25, + "grad_norm": 2.5755263412413703, + "learning_rate": 8.818631707666136e-06, + "loss": 0.3314, + "step": 3923 + }, + { + "epoch": 0.25, + "grad_norm": 2.905918617029896, + "learning_rate": 8.817974160976368e-06, + "loss": 0.3293, + "step": 3924 + }, + { + "epoch": 0.25, + "grad_norm": 2.074687240108824, + "learning_rate": 8.817316455872073e-06, + "loss": 0.3397, + "step": 3925 + }, + { + "epoch": 0.25, + "grad_norm": 1.6556156451209951, + "learning_rate": 8.816658592380538e-06, + "loss": 0.3581, + "step": 3926 + }, + { + "epoch": 0.25, + "grad_norm": 2.5225123949991586, + "learning_rate": 8.816000570529057e-06, + "loss": 0.3395, + "step": 3927 + }, + { + "epoch": 0.25, + "grad_norm": 1.8796581244586301, + "learning_rate": 8.815342390344937e-06, + "loss": 0.3456, + "step": 3928 + }, + { + "epoch": 0.25, + "grad_norm": 3.6006706301784175, + "learning_rate": 8.814684051855482e-06, + "loss": 0.339, + "step": 3929 + }, + { + "epoch": 0.25, + "grad_norm": 2.756539450428205, + "learning_rate": 8.814025555088011e-06, + "loss": 0.3544, + "step": 3930 + }, + { + "epoch": 0.25, + "grad_norm": 2.0715619432674304, + "learning_rate": 8.813366900069844e-06, + "loss": 0.3456, + "step": 3931 + }, + { + "epoch": 0.25, + "grad_norm": 2.210278623420081, + "learning_rate": 8.812708086828312e-06, + "loss": 0.3466, + "step": 3932 + }, + { + "epoch": 0.25, + "grad_norm": 3.6949532304163624, + "learning_rate": 8.81204911539075e-06, + "loss": 0.337, + "step": 3933 + }, + { + "epoch": 0.25, + "grad_norm": 3.4190388446630333, + "learning_rate": 8.8113899857845e-06, + "loss": 0.3304, + "step": 3934 + }, + { + "epoch": 0.25, + "grad_norm": 2.012773969913957, + "learning_rate": 8.810730698036906e-06, + "loss": 0.3242, + "step": 3935 + }, + { + "epoch": 0.25, + "grad_norm": 3.0562977370739435, + "learning_rate": 8.81007125217533e-06, + "loss": 0.3797, + "step": 3936 + }, + { + "epoch": 0.25, + "grad_norm": 4.130834954945286, + "learning_rate": 8.809411648227129e-06, + "loss": 0.3616, + "step": 3937 + }, + { + "epoch": 0.25, + "grad_norm": 2.227304738078117, + "learning_rate": 8.808751886219673e-06, + "loss": 0.349, + "step": 3938 + }, + { + "epoch": 0.25, + "grad_norm": 2.147724538836497, + "learning_rate": 8.808091966180337e-06, + "loss": 0.3456, + "step": 3939 + }, + { + "epoch": 0.25, + "grad_norm": 1.554994194092664, + "learning_rate": 8.8074318881365e-06, + "loss": 0.3311, + "step": 3940 + }, + { + "epoch": 0.25, + "grad_norm": 1.822323489634922, + "learning_rate": 8.80677165211555e-06, + "loss": 0.3178, + "step": 3941 + }, + { + "epoch": 0.25, + "grad_norm": 3.60007168743747, + "learning_rate": 8.806111258144885e-06, + "loss": 0.3416, + "step": 3942 + }, + { + "epoch": 0.25, + "grad_norm": 3.2015769905986544, + "learning_rate": 8.805450706251903e-06, + "loss": 0.3459, + "step": 3943 + }, + { + "epoch": 0.25, + "grad_norm": 2.2836085210108785, + "learning_rate": 8.804789996464012e-06, + "loss": 0.3469, + "step": 3944 + }, + { + "epoch": 0.25, + "grad_norm": 4.745469579523758, + "learning_rate": 8.804129128808626e-06, + "loss": 0.3458, + "step": 3945 + }, + { + "epoch": 0.25, + "grad_norm": 1.5482381798668035, + "learning_rate": 8.803468103313165e-06, + "loss": 0.3205, + "step": 3946 + }, + { + "epoch": 0.25, + "grad_norm": 1.9689575422993166, + "learning_rate": 8.802806920005055e-06, + "loss": 0.3585, + "step": 3947 + }, + { + "epoch": 0.25, + "grad_norm": 2.139807856050084, + "learning_rate": 8.802145578911733e-06, + "loss": 0.3171, + "step": 3948 + }, + { + "epoch": 0.25, + "grad_norm": 1.5883947555878855, + "learning_rate": 8.801484080060638e-06, + "loss": 0.3435, + "step": 3949 + }, + { + "epoch": 0.25, + "grad_norm": 2.9604793929364766, + "learning_rate": 8.800822423479217e-06, + "loss": 0.3577, + "step": 3950 + }, + { + "epoch": 0.25, + "grad_norm": 3.927768363210243, + "learning_rate": 8.800160609194922e-06, + "loss": 0.3428, + "step": 3951 + }, + { + "epoch": 0.25, + "grad_norm": 1.8353010270690588, + "learning_rate": 8.799498637235213e-06, + "loss": 0.3635, + "step": 3952 + }, + { + "epoch": 0.25, + "grad_norm": 2.047035962116984, + "learning_rate": 8.798836507627556e-06, + "loss": 0.3295, + "step": 3953 + }, + { + "epoch": 0.25, + "grad_norm": 2.977250737729621, + "learning_rate": 8.798174220399427e-06, + "loss": 0.361, + "step": 3954 + }, + { + "epoch": 0.25, + "grad_norm": 1.8191323300886468, + "learning_rate": 8.7975117755783e-06, + "loss": 0.322, + "step": 3955 + }, + { + "epoch": 0.25, + "grad_norm": 2.2833073536674253, + "learning_rate": 8.796849173191669e-06, + "loss": 0.3257, + "step": 3956 + }, + { + "epoch": 0.25, + "grad_norm": 2.653580360973125, + "learning_rate": 8.796186413267018e-06, + "loss": 0.3586, + "step": 3957 + }, + { + "epoch": 0.25, + "grad_norm": 1.5959191441071698, + "learning_rate": 8.79552349583185e-06, + "loss": 0.3352, + "step": 3958 + }, + { + "epoch": 0.25, + "grad_norm": 2.68755513942267, + "learning_rate": 8.794860420913672e-06, + "loss": 0.3353, + "step": 3959 + }, + { + "epoch": 0.25, + "grad_norm": 2.290039827976815, + "learning_rate": 8.794197188539994e-06, + "loss": 0.3505, + "step": 3960 + }, + { + "epoch": 0.25, + "grad_norm": 2.6982893771975944, + "learning_rate": 8.793533798738335e-06, + "loss": 0.3631, + "step": 3961 + }, + { + "epoch": 0.25, + "grad_norm": 2.368724845872284, + "learning_rate": 8.792870251536223e-06, + "loss": 0.3364, + "step": 3962 + }, + { + "epoch": 0.25, + "grad_norm": 2.3437852662567056, + "learning_rate": 8.792206546961187e-06, + "loss": 0.3339, + "step": 3963 + }, + { + "epoch": 0.25, + "grad_norm": 2.379021111034225, + "learning_rate": 8.791542685040764e-06, + "loss": 0.331, + "step": 3964 + }, + { + "epoch": 0.25, + "grad_norm": 3.468420002441763, + "learning_rate": 8.7908786658025e-06, + "loss": 0.3697, + "step": 3965 + }, + { + "epoch": 0.25, + "grad_norm": 4.40442348480613, + "learning_rate": 8.790214489273948e-06, + "loss": 0.3239, + "step": 3966 + }, + { + "epoch": 0.25, + "grad_norm": 2.443752448014104, + "learning_rate": 8.789550155482665e-06, + "loss": 0.336, + "step": 3967 + }, + { + "epoch": 0.25, + "grad_norm": 5.423170706382684, + "learning_rate": 8.788885664456212e-06, + "loss": 0.3522, + "step": 3968 + }, + { + "epoch": 0.25, + "grad_norm": 2.0159703741815638, + "learning_rate": 8.788221016222167e-06, + "loss": 0.327, + "step": 3969 + }, + { + "epoch": 0.25, + "grad_norm": 1.413832410196219, + "learning_rate": 8.787556210808101e-06, + "loss": 0.3159, + "step": 3970 + }, + { + "epoch": 0.25, + "grad_norm": 2.229140339814073, + "learning_rate": 8.786891248241602e-06, + "loss": 0.3182, + "step": 3971 + }, + { + "epoch": 0.25, + "grad_norm": 1.8559889221850456, + "learning_rate": 8.786226128550257e-06, + "loss": 0.3232, + "step": 3972 + }, + { + "epoch": 0.25, + "grad_norm": 1.7735259003246646, + "learning_rate": 8.785560851761666e-06, + "loss": 0.3398, + "step": 3973 + }, + { + "epoch": 0.25, + "grad_norm": 3.1257374329945917, + "learning_rate": 8.784895417903431e-06, + "loss": 0.318, + "step": 3974 + }, + { + "epoch": 0.25, + "grad_norm": 3.20304958642847, + "learning_rate": 8.784229827003163e-06, + "loss": 0.3598, + "step": 3975 + }, + { + "epoch": 0.25, + "grad_norm": 2.8481307769447963, + "learning_rate": 8.783564079088478e-06, + "loss": 0.3366, + "step": 3976 + }, + { + "epoch": 0.25, + "grad_norm": 2.4349804126816066, + "learning_rate": 8.782898174186998e-06, + "loss": 0.3247, + "step": 3977 + }, + { + "epoch": 0.25, + "grad_norm": 1.6125703437228829, + "learning_rate": 8.782232112326353e-06, + "loss": 0.3405, + "step": 3978 + }, + { + "epoch": 0.25, + "grad_norm": 2.7563070234189557, + "learning_rate": 8.78156589353418e-06, + "loss": 0.3517, + "step": 3979 + }, + { + "epoch": 0.25, + "grad_norm": 3.158906279170565, + "learning_rate": 8.780899517838122e-06, + "loss": 0.3371, + "step": 3980 + }, + { + "epoch": 0.25, + "grad_norm": 3.023815966204676, + "learning_rate": 8.78023298526583e-06, + "loss": 0.3413, + "step": 3981 + }, + { + "epoch": 0.25, + "grad_norm": 1.7695617599975126, + "learning_rate": 8.779566295844953e-06, + "loss": 0.3358, + "step": 3982 + }, + { + "epoch": 0.25, + "grad_norm": 2.8670330030267603, + "learning_rate": 8.778899449603158e-06, + "loss": 0.345, + "step": 3983 + }, + { + "epoch": 0.25, + "grad_norm": 2.7119260387214954, + "learning_rate": 8.778232446568114e-06, + "loss": 0.3423, + "step": 3984 + }, + { + "epoch": 0.25, + "grad_norm": 1.6557380312202785, + "learning_rate": 8.777565286767493e-06, + "loss": 0.3149, + "step": 3985 + }, + { + "epoch": 0.25, + "grad_norm": 2.031829735179775, + "learning_rate": 8.776897970228979e-06, + "loss": 0.3229, + "step": 3986 + }, + { + "epoch": 0.25, + "grad_norm": 1.6405112659555778, + "learning_rate": 8.77623049698026e-06, + "loss": 0.3264, + "step": 3987 + }, + { + "epoch": 0.25, + "grad_norm": 5.650907873991567, + "learning_rate": 8.775562867049033e-06, + "loss": 0.3352, + "step": 3988 + }, + { + "epoch": 0.25, + "grad_norm": 3.377611140084689, + "learning_rate": 8.774895080462993e-06, + "loss": 0.3464, + "step": 3989 + }, + { + "epoch": 0.25, + "grad_norm": 2.0190881235297744, + "learning_rate": 8.774227137249851e-06, + "loss": 0.3336, + "step": 3990 + }, + { + "epoch": 0.25, + "grad_norm": 1.8384397193429323, + "learning_rate": 8.773559037437323e-06, + "loss": 0.3244, + "step": 3991 + }, + { + "epoch": 0.25, + "grad_norm": 3.938548809288756, + "learning_rate": 8.772890781053127e-06, + "loss": 0.3227, + "step": 3992 + }, + { + "epoch": 0.25, + "grad_norm": 1.931703895266037, + "learning_rate": 8.772222368124992e-06, + "loss": 0.3223, + "step": 3993 + }, + { + "epoch": 0.25, + "grad_norm": 1.6433144475496635, + "learning_rate": 8.77155379868065e-06, + "loss": 0.3265, + "step": 3994 + }, + { + "epoch": 0.25, + "grad_norm": 2.0306364090676436, + "learning_rate": 8.770885072747843e-06, + "loss": 0.3393, + "step": 3995 + }, + { + "epoch": 0.25, + "grad_norm": 0.7653982096852402, + "learning_rate": 8.770216190354316e-06, + "loss": 0.515, + "step": 3996 + }, + { + "epoch": 0.25, + "grad_norm": 1.755430314122129, + "learning_rate": 8.769547151527821e-06, + "loss": 0.3149, + "step": 3997 + }, + { + "epoch": 0.25, + "grad_norm": 2.1910043438408513, + "learning_rate": 8.76887795629612e-06, + "loss": 0.3399, + "step": 3998 + }, + { + "epoch": 0.25, + "grad_norm": 2.1466922075478108, + "learning_rate": 8.768208604686979e-06, + "loss": 0.3409, + "step": 3999 + }, + { + "epoch": 0.25, + "grad_norm": 2.321892658536272, + "learning_rate": 8.767539096728171e-06, + "loss": 0.3238, + "step": 4000 + }, + { + "epoch": 0.25, + "grad_norm": 3.00755075469908, + "learning_rate": 8.766869432447473e-06, + "loss": 0.3231, + "step": 4001 + }, + { + "epoch": 0.25, + "grad_norm": 1.4972781413131402, + "learning_rate": 8.76619961187267e-06, + "loss": 0.3367, + "step": 4002 + }, + { + "epoch": 0.25, + "grad_norm": 5.700095470221592, + "learning_rate": 8.765529635031556e-06, + "loss": 0.3504, + "step": 4003 + }, + { + "epoch": 0.25, + "grad_norm": 2.124431337435667, + "learning_rate": 8.76485950195193e-06, + "loss": 0.321, + "step": 4004 + }, + { + "epoch": 0.25, + "grad_norm": 4.177660901477263, + "learning_rate": 8.764189212661597e-06, + "loss": 0.3545, + "step": 4005 + }, + { + "epoch": 0.25, + "grad_norm": 1.9543874288575571, + "learning_rate": 8.763518767188367e-06, + "loss": 0.3304, + "step": 4006 + }, + { + "epoch": 0.25, + "grad_norm": 2.040240741145905, + "learning_rate": 8.762848165560057e-06, + "loss": 0.3569, + "step": 4007 + }, + { + "epoch": 0.25, + "grad_norm": 3.1462098143685076, + "learning_rate": 8.762177407804495e-06, + "loss": 0.3365, + "step": 4008 + }, + { + "epoch": 0.25, + "grad_norm": 1.3557725363827984, + "learning_rate": 8.761506493949508e-06, + "loss": 0.3272, + "step": 4009 + }, + { + "epoch": 0.25, + "grad_norm": 2.049154870315858, + "learning_rate": 8.760835424022938e-06, + "loss": 0.3453, + "step": 4010 + }, + { + "epoch": 0.25, + "grad_norm": 1.909236488879323, + "learning_rate": 8.760164198052625e-06, + "loss": 0.3437, + "step": 4011 + }, + { + "epoch": 0.25, + "grad_norm": 1.986681260659995, + "learning_rate": 8.75949281606642e-06, + "loss": 0.3473, + "step": 4012 + }, + { + "epoch": 0.25, + "grad_norm": 2.4724205262807475, + "learning_rate": 8.75882127809218e-06, + "loss": 0.3649, + "step": 4013 + }, + { + "epoch": 0.25, + "grad_norm": 2.3375703560766286, + "learning_rate": 8.758149584157772e-06, + "loss": 0.3439, + "step": 4014 + }, + { + "epoch": 0.25, + "grad_norm": 0.7335786317428861, + "learning_rate": 8.757477734291059e-06, + "loss": 0.5261, + "step": 4015 + }, + { + "epoch": 0.25, + "grad_norm": 1.69490251316695, + "learning_rate": 8.756805728519922e-06, + "loss": 0.3438, + "step": 4016 + }, + { + "epoch": 0.25, + "grad_norm": 3.163874147734496, + "learning_rate": 8.756133566872242e-06, + "loss": 0.3544, + "step": 4017 + }, + { + "epoch": 0.25, + "grad_norm": 1.7208233723521968, + "learning_rate": 8.75546124937591e-06, + "loss": 0.3189, + "step": 4018 + }, + { + "epoch": 0.25, + "grad_norm": 2.8147174898790515, + "learning_rate": 8.754788776058818e-06, + "loss": 0.3648, + "step": 4019 + }, + { + "epoch": 0.25, + "grad_norm": 1.965910224778552, + "learning_rate": 8.754116146948873e-06, + "loss": 0.31, + "step": 4020 + }, + { + "epoch": 0.25, + "grad_norm": 3.515679447901728, + "learning_rate": 8.753443362073978e-06, + "loss": 0.3281, + "step": 4021 + }, + { + "epoch": 0.25, + "grad_norm": 3.3864591791292553, + "learning_rate": 8.752770421462053e-06, + "loss": 0.3678, + "step": 4022 + }, + { + "epoch": 0.25, + "grad_norm": 2.511879566043592, + "learning_rate": 8.752097325141017e-06, + "loss": 0.3656, + "step": 4023 + }, + { + "epoch": 0.25, + "grad_norm": 2.1900113369588854, + "learning_rate": 8.751424073138801e-06, + "loss": 0.3317, + "step": 4024 + }, + { + "epoch": 0.25, + "grad_norm": 2.9244560198108775, + "learning_rate": 8.750750665483333e-06, + "loss": 0.3381, + "step": 4025 + }, + { + "epoch": 0.25, + "grad_norm": 1.6220897907902792, + "learning_rate": 8.750077102202562e-06, + "loss": 0.3326, + "step": 4026 + }, + { + "epoch": 0.25, + "grad_norm": 3.205144572196654, + "learning_rate": 8.749403383324428e-06, + "loss": 0.3267, + "step": 4027 + }, + { + "epoch": 0.25, + "grad_norm": 4.67607270989675, + "learning_rate": 8.748729508876889e-06, + "loss": 0.3331, + "step": 4028 + }, + { + "epoch": 0.25, + "grad_norm": 0.6842191927654142, + "learning_rate": 8.748055478887905e-06, + "loss": 0.5013, + "step": 4029 + }, + { + "epoch": 0.25, + "grad_norm": 1.4214965308143919, + "learning_rate": 8.74738129338544e-06, + "loss": 0.347, + "step": 4030 + }, + { + "epoch": 0.25, + "grad_norm": 1.8626450857396741, + "learning_rate": 8.74670695239747e-06, + "loss": 0.3354, + "step": 4031 + }, + { + "epoch": 0.25, + "grad_norm": 2.7129575257256846, + "learning_rate": 8.746032455951972e-06, + "loss": 0.3427, + "step": 4032 + }, + { + "epoch": 0.25, + "grad_norm": 1.894176120050417, + "learning_rate": 8.745357804076937e-06, + "loss": 0.3323, + "step": 4033 + }, + { + "epoch": 0.25, + "grad_norm": 1.4032652273676638, + "learning_rate": 8.744682996800351e-06, + "loss": 0.3469, + "step": 4034 + }, + { + "epoch": 0.25, + "grad_norm": 1.8921606911624178, + "learning_rate": 8.744008034150217e-06, + "loss": 0.3188, + "step": 4035 + }, + { + "epoch": 0.25, + "grad_norm": 1.4596220130329085, + "learning_rate": 8.743332916154541e-06, + "loss": 0.3198, + "step": 4036 + }, + { + "epoch": 0.25, + "grad_norm": 2.6352768292601403, + "learning_rate": 8.742657642841333e-06, + "loss": 0.3366, + "step": 4037 + }, + { + "epoch": 0.25, + "grad_norm": 1.3039448632668107, + "learning_rate": 8.741982214238611e-06, + "loss": 0.3437, + "step": 4038 + }, + { + "epoch": 0.25, + "grad_norm": 2.0927802688428327, + "learning_rate": 8.7413066303744e-06, + "loss": 0.3442, + "step": 4039 + }, + { + "epoch": 0.25, + "grad_norm": 1.535874207421677, + "learning_rate": 8.740630891276733e-06, + "loss": 0.3189, + "step": 4040 + }, + { + "epoch": 0.25, + "grad_norm": 2.388702468093148, + "learning_rate": 8.739954996973646e-06, + "loss": 0.3353, + "step": 4041 + }, + { + "epoch": 0.25, + "grad_norm": 1.6144431932867787, + "learning_rate": 8.739278947493182e-06, + "loss": 0.3352, + "step": 4042 + }, + { + "epoch": 0.25, + "grad_norm": 6.568850958839546, + "learning_rate": 8.738602742863394e-06, + "loss": 0.3593, + "step": 4043 + }, + { + "epoch": 0.25, + "grad_norm": 1.5634723197923737, + "learning_rate": 8.737926383112336e-06, + "loss": 0.3541, + "step": 4044 + }, + { + "epoch": 0.25, + "grad_norm": 1.9645900527362579, + "learning_rate": 8.737249868268076e-06, + "loss": 0.3212, + "step": 4045 + }, + { + "epoch": 0.25, + "grad_norm": 1.8098296651161077, + "learning_rate": 8.736573198358677e-06, + "loss": 0.3475, + "step": 4046 + }, + { + "epoch": 0.25, + "grad_norm": 6.7698301785311354, + "learning_rate": 8.735896373412223e-06, + "loss": 0.333, + "step": 4047 + }, + { + "epoch": 0.25, + "grad_norm": 2.849821522211834, + "learning_rate": 8.735219393456791e-06, + "loss": 0.3535, + "step": 4048 + }, + { + "epoch": 0.25, + "grad_norm": 3.854147709811618, + "learning_rate": 8.73454225852047e-06, + "loss": 0.3459, + "step": 4049 + }, + { + "epoch": 0.25, + "grad_norm": 2.4135048040720535, + "learning_rate": 8.73386496863136e-06, + "loss": 0.3489, + "step": 4050 + }, + { + "epoch": 0.25, + "grad_norm": 2.2122085838660452, + "learning_rate": 8.73318752381756e-06, + "loss": 0.3629, + "step": 4051 + }, + { + "epoch": 0.25, + "grad_norm": 1.5692931435847661, + "learning_rate": 8.732509924107178e-06, + "loss": 0.3483, + "step": 4052 + }, + { + "epoch": 0.25, + "grad_norm": 4.278172906750455, + "learning_rate": 8.731832169528331e-06, + "loss": 0.3405, + "step": 4053 + }, + { + "epoch": 0.25, + "grad_norm": 1.6080014153388025, + "learning_rate": 8.731154260109137e-06, + "loss": 0.3407, + "step": 4054 + }, + { + "epoch": 0.26, + "grad_norm": 2.348106601953248, + "learning_rate": 8.730476195877726e-06, + "loss": 0.3567, + "step": 4055 + }, + { + "epoch": 0.26, + "grad_norm": 3.0510569170967754, + "learning_rate": 8.729797976862231e-06, + "loss": 0.3328, + "step": 4056 + }, + { + "epoch": 0.26, + "grad_norm": 1.5894392806231072, + "learning_rate": 8.729119603090794e-06, + "loss": 0.3264, + "step": 4057 + }, + { + "epoch": 0.26, + "grad_norm": 2.073645775213012, + "learning_rate": 8.728441074591561e-06, + "loss": 0.3173, + "step": 4058 + }, + { + "epoch": 0.26, + "grad_norm": 1.54799909527397, + "learning_rate": 8.727762391392685e-06, + "loss": 0.3499, + "step": 4059 + }, + { + "epoch": 0.26, + "grad_norm": 1.4898635989281481, + "learning_rate": 8.727083553522326e-06, + "loss": 0.3271, + "step": 4060 + }, + { + "epoch": 0.26, + "grad_norm": 1.6195957426803977, + "learning_rate": 8.726404561008652e-06, + "loss": 0.34, + "step": 4061 + }, + { + "epoch": 0.26, + "grad_norm": 2.4569605547413564, + "learning_rate": 8.725725413879833e-06, + "loss": 0.3474, + "step": 4062 + }, + { + "epoch": 0.26, + "grad_norm": 1.63672464437792, + "learning_rate": 8.725046112164048e-06, + "loss": 0.3409, + "step": 4063 + }, + { + "epoch": 0.26, + "grad_norm": 2.07783384634387, + "learning_rate": 8.724366655889484e-06, + "loss": 0.339, + "step": 4064 + }, + { + "epoch": 0.26, + "grad_norm": 2.5052048789526022, + "learning_rate": 8.723687045084332e-06, + "loss": 0.3447, + "step": 4065 + }, + { + "epoch": 0.26, + "grad_norm": 2.0756908514939045, + "learning_rate": 8.72300727977679e-06, + "loss": 0.3434, + "step": 4066 + }, + { + "epoch": 0.26, + "grad_norm": 1.5933030953941294, + "learning_rate": 8.722327359995064e-06, + "loss": 0.3338, + "step": 4067 + }, + { + "epoch": 0.26, + "grad_norm": 2.953815711101553, + "learning_rate": 8.721647285767364e-06, + "loss": 0.3369, + "step": 4068 + }, + { + "epoch": 0.26, + "grad_norm": 2.0628212915236666, + "learning_rate": 8.720967057121908e-06, + "loss": 0.3524, + "step": 4069 + }, + { + "epoch": 0.26, + "grad_norm": 3.1094032642905916, + "learning_rate": 8.720286674086919e-06, + "loss": 0.3364, + "step": 4070 + }, + { + "epoch": 0.26, + "grad_norm": 16.29791883779948, + "learning_rate": 8.719606136690628e-06, + "loss": 0.335, + "step": 4071 + }, + { + "epoch": 0.26, + "grad_norm": 1.9666908533427165, + "learning_rate": 8.718925444961273e-06, + "loss": 0.3611, + "step": 4072 + }, + { + "epoch": 0.26, + "grad_norm": 3.5340981209677524, + "learning_rate": 8.718244598927095e-06, + "loss": 0.361, + "step": 4073 + }, + { + "epoch": 0.26, + "grad_norm": 1.5327835452053369, + "learning_rate": 8.717563598616342e-06, + "loss": 0.3152, + "step": 4074 + }, + { + "epoch": 0.26, + "grad_norm": 1.8868073105187637, + "learning_rate": 8.716882444057275e-06, + "loss": 0.356, + "step": 4075 + }, + { + "epoch": 0.26, + "grad_norm": 3.0230935669880217, + "learning_rate": 8.716201135278152e-06, + "loss": 0.3268, + "step": 4076 + }, + { + "epoch": 0.26, + "grad_norm": 2.0560875190386207, + "learning_rate": 8.715519672307245e-06, + "loss": 0.3375, + "step": 4077 + }, + { + "epoch": 0.26, + "grad_norm": 2.0900475278362407, + "learning_rate": 8.714838055172825e-06, + "loss": 0.3439, + "step": 4078 + }, + { + "epoch": 0.26, + "grad_norm": 2.0781631564196195, + "learning_rate": 8.714156283903177e-06, + "loss": 0.3391, + "step": 4079 + }, + { + "epoch": 0.26, + "grad_norm": 2.270042035954629, + "learning_rate": 8.713474358526588e-06, + "loss": 0.332, + "step": 4080 + }, + { + "epoch": 0.26, + "grad_norm": 1.7474073994956407, + "learning_rate": 8.712792279071351e-06, + "loss": 0.3173, + "step": 4081 + }, + { + "epoch": 0.26, + "grad_norm": 1.6378876500682436, + "learning_rate": 8.712110045565768e-06, + "loss": 0.3154, + "step": 4082 + }, + { + "epoch": 0.26, + "grad_norm": 2.4541175416986114, + "learning_rate": 8.711427658038146e-06, + "loss": 0.3295, + "step": 4083 + }, + { + "epoch": 0.26, + "grad_norm": 1.7196100230078464, + "learning_rate": 8.710745116516798e-06, + "loss": 0.351, + "step": 4084 + }, + { + "epoch": 0.26, + "grad_norm": 2.3541906105401913, + "learning_rate": 8.710062421030043e-06, + "loss": 0.3602, + "step": 4085 + }, + { + "epoch": 0.26, + "grad_norm": 2.1103349288170503, + "learning_rate": 8.709379571606211e-06, + "loss": 0.3392, + "step": 4086 + }, + { + "epoch": 0.26, + "grad_norm": 2.676024096888773, + "learning_rate": 8.708696568273631e-06, + "loss": 0.3126, + "step": 4087 + }, + { + "epoch": 0.26, + "grad_norm": 6.769694033779992, + "learning_rate": 8.708013411060642e-06, + "loss": 0.3628, + "step": 4088 + }, + { + "epoch": 0.26, + "grad_norm": 5.932547398377358, + "learning_rate": 8.70733009999559e-06, + "loss": 0.3451, + "step": 4089 + }, + { + "epoch": 0.26, + "grad_norm": 4.156018069717261, + "learning_rate": 8.706646635106828e-06, + "loss": 0.3339, + "step": 4090 + }, + { + "epoch": 0.26, + "grad_norm": 1.480207759245538, + "learning_rate": 8.705963016422713e-06, + "loss": 0.3145, + "step": 4091 + }, + { + "epoch": 0.26, + "grad_norm": 1.9827084793169916, + "learning_rate": 8.705279243971612e-06, + "loss": 0.3582, + "step": 4092 + }, + { + "epoch": 0.26, + "grad_norm": 1.8554133354964357, + "learning_rate": 8.704595317781892e-06, + "loss": 0.3173, + "step": 4093 + }, + { + "epoch": 0.26, + "grad_norm": 1.399401452220164, + "learning_rate": 8.703911237881932e-06, + "loss": 0.3295, + "step": 4094 + }, + { + "epoch": 0.26, + "grad_norm": 1.775914267033628, + "learning_rate": 8.703227004300117e-06, + "loss": 0.3348, + "step": 4095 + }, + { + "epoch": 0.26, + "grad_norm": 1.7040139261096963, + "learning_rate": 8.702542617064835e-06, + "loss": 0.3288, + "step": 4096 + }, + { + "epoch": 0.26, + "grad_norm": 2.0923667917542046, + "learning_rate": 8.701858076204484e-06, + "loss": 0.3464, + "step": 4097 + }, + { + "epoch": 0.26, + "grad_norm": 1.7426022535152126, + "learning_rate": 8.701173381747465e-06, + "loss": 0.3281, + "step": 4098 + }, + { + "epoch": 0.26, + "grad_norm": 1.7515229375498405, + "learning_rate": 8.700488533722189e-06, + "loss": 0.3328, + "step": 4099 + }, + { + "epoch": 0.26, + "grad_norm": 3.783669089310867, + "learning_rate": 8.699803532157071e-06, + "loss": 0.3502, + "step": 4100 + }, + { + "epoch": 0.26, + "grad_norm": 2.3042027517051666, + "learning_rate": 8.699118377080534e-06, + "loss": 0.3361, + "step": 4101 + }, + { + "epoch": 0.26, + "grad_norm": 2.1278033478288436, + "learning_rate": 8.698433068521005e-06, + "loss": 0.3432, + "step": 4102 + }, + { + "epoch": 0.26, + "grad_norm": 1.7118264209442484, + "learning_rate": 8.697747606506917e-06, + "loss": 0.355, + "step": 4103 + }, + { + "epoch": 0.26, + "grad_norm": 1.8498136601044612, + "learning_rate": 8.697061991066712e-06, + "loss": 0.3486, + "step": 4104 + }, + { + "epoch": 0.26, + "grad_norm": 6.732509917874962, + "learning_rate": 8.696376222228841e-06, + "loss": 0.3322, + "step": 4105 + }, + { + "epoch": 0.26, + "grad_norm": 2.3786558315606947, + "learning_rate": 8.695690300021755e-06, + "loss": 0.3463, + "step": 4106 + }, + { + "epoch": 0.26, + "grad_norm": 1.6395221922436516, + "learning_rate": 8.695004224473912e-06, + "loss": 0.3354, + "step": 4107 + }, + { + "epoch": 0.26, + "grad_norm": 1.2973049883443966, + "learning_rate": 8.69431799561378e-06, + "loss": 0.3417, + "step": 4108 + }, + { + "epoch": 0.26, + "grad_norm": 1.8760467049793943, + "learning_rate": 8.693631613469837e-06, + "loss": 0.325, + "step": 4109 + }, + { + "epoch": 0.26, + "grad_norm": 2.086697428287031, + "learning_rate": 8.692945078070555e-06, + "loss": 0.3413, + "step": 4110 + }, + { + "epoch": 0.26, + "grad_norm": 2.7362402259724723, + "learning_rate": 8.692258389444422e-06, + "loss": 0.3261, + "step": 4111 + }, + { + "epoch": 0.26, + "grad_norm": 1.7640477171220377, + "learning_rate": 8.69157154761993e-06, + "loss": 0.3454, + "step": 4112 + }, + { + "epoch": 0.26, + "grad_norm": 2.701332166606028, + "learning_rate": 8.690884552625579e-06, + "loss": 0.3194, + "step": 4113 + }, + { + "epoch": 0.26, + "grad_norm": 2.1595312033463734, + "learning_rate": 8.690197404489871e-06, + "loss": 0.3383, + "step": 4114 + }, + { + "epoch": 0.26, + "grad_norm": 7.520544858801723, + "learning_rate": 8.689510103241318e-06, + "loss": 0.3359, + "step": 4115 + }, + { + "epoch": 0.26, + "grad_norm": 1.5493797661696136, + "learning_rate": 8.68882264890844e-06, + "loss": 0.3332, + "step": 4116 + }, + { + "epoch": 0.26, + "grad_norm": 3.3290404208472024, + "learning_rate": 8.688135041519756e-06, + "loss": 0.358, + "step": 4117 + }, + { + "epoch": 0.26, + "grad_norm": 2.1260846658752963, + "learning_rate": 8.687447281103799e-06, + "loss": 0.3466, + "step": 4118 + }, + { + "epoch": 0.26, + "grad_norm": 2.3654767099177816, + "learning_rate": 8.686759367689102e-06, + "loss": 0.3299, + "step": 4119 + }, + { + "epoch": 0.26, + "grad_norm": 1.6931618242551096, + "learning_rate": 8.686071301304212e-06, + "loss": 0.3394, + "step": 4120 + }, + { + "epoch": 0.26, + "grad_norm": 0.6070622475886372, + "learning_rate": 8.685383081977678e-06, + "loss": 0.5224, + "step": 4121 + }, + { + "epoch": 0.26, + "grad_norm": 1.9085627802921634, + "learning_rate": 8.684694709738053e-06, + "loss": 0.3409, + "step": 4122 + }, + { + "epoch": 0.26, + "grad_norm": 1.823691904252609, + "learning_rate": 8.6840061846139e-06, + "loss": 0.331, + "step": 4123 + }, + { + "epoch": 0.26, + "grad_norm": 1.55624305014329, + "learning_rate": 8.683317506633787e-06, + "loss": 0.3309, + "step": 4124 + }, + { + "epoch": 0.26, + "grad_norm": 2.531029991410397, + "learning_rate": 8.682628675826288e-06, + "loss": 0.3762, + "step": 4125 + }, + { + "epoch": 0.26, + "grad_norm": 1.6466895504634198, + "learning_rate": 8.681939692219984e-06, + "loss": 0.314, + "step": 4126 + }, + { + "epoch": 0.26, + "grad_norm": 4.205852777188346, + "learning_rate": 8.681250555843462e-06, + "loss": 0.3475, + "step": 4127 + }, + { + "epoch": 0.26, + "grad_norm": 5.253520849435823, + "learning_rate": 8.680561266725316e-06, + "loss": 0.3345, + "step": 4128 + }, + { + "epoch": 0.26, + "grad_norm": 2.7651739332864027, + "learning_rate": 8.679871824894146e-06, + "loss": 0.3313, + "step": 4129 + }, + { + "epoch": 0.26, + "grad_norm": 3.2964360891628264, + "learning_rate": 8.679182230378558e-06, + "loss": 0.3585, + "step": 4130 + }, + { + "epoch": 0.26, + "grad_norm": 0.6070789575432157, + "learning_rate": 8.678492483207163e-06, + "loss": 0.4925, + "step": 4131 + }, + { + "epoch": 0.26, + "grad_norm": 3.0468637162024867, + "learning_rate": 8.677802583408583e-06, + "loss": 0.366, + "step": 4132 + }, + { + "epoch": 0.26, + "grad_norm": 3.2241862210350836, + "learning_rate": 8.67711253101144e-06, + "loss": 0.3831, + "step": 4133 + }, + { + "epoch": 0.26, + "grad_norm": 3.990513516603685, + "learning_rate": 8.676422326044366e-06, + "loss": 0.353, + "step": 4134 + }, + { + "epoch": 0.26, + "grad_norm": 2.21598960991638, + "learning_rate": 8.675731968536004e-06, + "loss": 0.3336, + "step": 4135 + }, + { + "epoch": 0.26, + "grad_norm": 1.687426845091582, + "learning_rate": 8.67504145851499e-06, + "loss": 0.3341, + "step": 4136 + }, + { + "epoch": 0.26, + "grad_norm": 2.5886330691574577, + "learning_rate": 8.674350796009979e-06, + "loss": 0.3206, + "step": 4137 + }, + { + "epoch": 0.26, + "grad_norm": 2.1784018677232813, + "learning_rate": 8.673659981049627e-06, + "loss": 0.341, + "step": 4138 + }, + { + "epoch": 0.26, + "grad_norm": 1.8005510641457785, + "learning_rate": 8.672969013662597e-06, + "loss": 0.3098, + "step": 4139 + }, + { + "epoch": 0.26, + "grad_norm": 1.9983621755265903, + "learning_rate": 8.67227789387756e-06, + "loss": 0.3355, + "step": 4140 + }, + { + "epoch": 0.26, + "grad_norm": 0.6094853938636869, + "learning_rate": 8.671586621723186e-06, + "loss": 0.4963, + "step": 4141 + }, + { + "epoch": 0.26, + "grad_norm": 1.8654529172301353, + "learning_rate": 8.670895197228166e-06, + "loss": 0.3246, + "step": 4142 + }, + { + "epoch": 0.26, + "grad_norm": 3.5184671360323554, + "learning_rate": 8.670203620421183e-06, + "loss": 0.3675, + "step": 4143 + }, + { + "epoch": 0.26, + "grad_norm": 1.8400839084450171, + "learning_rate": 8.669511891330935e-06, + "loss": 0.3382, + "step": 4144 + }, + { + "epoch": 0.26, + "grad_norm": 2.6928610919616545, + "learning_rate": 8.668820009986116e-06, + "loss": 0.3207, + "step": 4145 + }, + { + "epoch": 0.26, + "grad_norm": 1.825620037479355, + "learning_rate": 8.668127976415442e-06, + "loss": 0.3346, + "step": 4146 + }, + { + "epoch": 0.26, + "grad_norm": 1.4993675507647217, + "learning_rate": 8.667435790647623e-06, + "loss": 0.3233, + "step": 4147 + }, + { + "epoch": 0.26, + "grad_norm": 2.683817712185597, + "learning_rate": 8.666743452711377e-06, + "loss": 0.3343, + "step": 4148 + }, + { + "epoch": 0.26, + "grad_norm": 2.2707252503978688, + "learning_rate": 8.666050962635433e-06, + "loss": 0.3503, + "step": 4149 + }, + { + "epoch": 0.26, + "grad_norm": 2.765477733405839, + "learning_rate": 8.665358320448522e-06, + "loss": 0.3297, + "step": 4150 + }, + { + "epoch": 0.26, + "grad_norm": 3.764676704257325, + "learning_rate": 8.664665526179385e-06, + "loss": 0.321, + "step": 4151 + }, + { + "epoch": 0.26, + "grad_norm": 1.4444340401680011, + "learning_rate": 8.663972579856767e-06, + "loss": 0.3356, + "step": 4152 + }, + { + "epoch": 0.26, + "grad_norm": 2.2214300610166555, + "learning_rate": 8.663279481509418e-06, + "loss": 0.3376, + "step": 4153 + }, + { + "epoch": 0.26, + "grad_norm": 2.5631225824079764, + "learning_rate": 8.662586231166095e-06, + "loss": 0.3251, + "step": 4154 + }, + { + "epoch": 0.26, + "grad_norm": 1.829923085490244, + "learning_rate": 8.661892828855565e-06, + "loss": 0.3238, + "step": 4155 + }, + { + "epoch": 0.26, + "grad_norm": 2.807054781236766, + "learning_rate": 8.661199274606597e-06, + "loss": 0.32, + "step": 4156 + }, + { + "epoch": 0.26, + "grad_norm": 2.2096343244508474, + "learning_rate": 8.66050556844797e-06, + "loss": 0.3481, + "step": 4157 + }, + { + "epoch": 0.26, + "grad_norm": 1.4568628902542375, + "learning_rate": 8.659811710408464e-06, + "loss": 0.3326, + "step": 4158 + }, + { + "epoch": 0.26, + "grad_norm": 2.070347369135558, + "learning_rate": 8.659117700516869e-06, + "loss": 0.3417, + "step": 4159 + }, + { + "epoch": 0.26, + "grad_norm": 1.7880604823176935, + "learning_rate": 8.65842353880198e-06, + "loss": 0.3596, + "step": 4160 + }, + { + "epoch": 0.26, + "grad_norm": 1.8212989764591963, + "learning_rate": 8.657729225292601e-06, + "loss": 0.3374, + "step": 4161 + }, + { + "epoch": 0.26, + "grad_norm": 2.3296034565064767, + "learning_rate": 8.657034760017542e-06, + "loss": 0.3242, + "step": 4162 + }, + { + "epoch": 0.26, + "grad_norm": 2.671318243491196, + "learning_rate": 8.656340143005613e-06, + "loss": 0.343, + "step": 4163 + }, + { + "epoch": 0.26, + "grad_norm": 1.5460127523977807, + "learning_rate": 8.655645374285637e-06, + "loss": 0.3277, + "step": 4164 + }, + { + "epoch": 0.26, + "grad_norm": 2.5970223843268796, + "learning_rate": 8.654950453886443e-06, + "loss": 0.3515, + "step": 4165 + }, + { + "epoch": 0.26, + "grad_norm": 1.4468730029500596, + "learning_rate": 8.65425538183686e-06, + "loss": 0.3288, + "step": 4166 + }, + { + "epoch": 0.26, + "grad_norm": 2.317300326240315, + "learning_rate": 8.653560158165732e-06, + "loss": 0.3548, + "step": 4167 + }, + { + "epoch": 0.26, + "grad_norm": 1.5643651382379886, + "learning_rate": 8.652864782901904e-06, + "loss": 0.3272, + "step": 4168 + }, + { + "epoch": 0.26, + "grad_norm": 2.4015717564371584, + "learning_rate": 8.652169256074228e-06, + "loss": 0.3353, + "step": 4169 + }, + { + "epoch": 0.26, + "grad_norm": 2.0544115789224655, + "learning_rate": 8.651473577711562e-06, + "loss": 0.3255, + "step": 4170 + }, + { + "epoch": 0.26, + "grad_norm": 1.794751494229252, + "learning_rate": 8.650777747842772e-06, + "loss": 0.3206, + "step": 4171 + }, + { + "epoch": 0.26, + "grad_norm": 2.483213570874632, + "learning_rate": 8.65008176649673e-06, + "loss": 0.3298, + "step": 4172 + }, + { + "epoch": 0.26, + "grad_norm": 2.244015847694425, + "learning_rate": 8.64938563370231e-06, + "loss": 0.3627, + "step": 4173 + }, + { + "epoch": 0.26, + "grad_norm": 1.5315204706072374, + "learning_rate": 8.648689349488398e-06, + "loss": 0.3274, + "step": 4174 + }, + { + "epoch": 0.26, + "grad_norm": 2.0745904701481184, + "learning_rate": 8.647992913883885e-06, + "loss": 0.3253, + "step": 4175 + }, + { + "epoch": 0.26, + "grad_norm": 1.9800660707088678, + "learning_rate": 8.647296326917667e-06, + "loss": 0.3457, + "step": 4176 + }, + { + "epoch": 0.26, + "grad_norm": 2.9515521489351753, + "learning_rate": 8.646599588618645e-06, + "loss": 0.3346, + "step": 4177 + }, + { + "epoch": 0.26, + "grad_norm": 1.9156005196757473, + "learning_rate": 8.645902699015729e-06, + "loss": 0.3266, + "step": 4178 + }, + { + "epoch": 0.26, + "grad_norm": 1.4545223038385888, + "learning_rate": 8.645205658137834e-06, + "loss": 0.3307, + "step": 4179 + }, + { + "epoch": 0.26, + "grad_norm": 2.0370436764504043, + "learning_rate": 8.644508466013884e-06, + "loss": 0.3282, + "step": 4180 + }, + { + "epoch": 0.26, + "grad_norm": 3.0859785489733227, + "learning_rate": 8.6438111226728e-06, + "loss": 0.3272, + "step": 4181 + }, + { + "epoch": 0.26, + "grad_norm": 2.0441706463861964, + "learning_rate": 8.643113628143524e-06, + "loss": 0.3334, + "step": 4182 + }, + { + "epoch": 0.26, + "grad_norm": 0.6773270507292503, + "learning_rate": 8.64241598245499e-06, + "loss": 0.4833, + "step": 4183 + }, + { + "epoch": 0.26, + "grad_norm": 2.2394590502165745, + "learning_rate": 8.64171818563615e-06, + "loss": 0.3534, + "step": 4184 + }, + { + "epoch": 0.26, + "grad_norm": 2.302595319260231, + "learning_rate": 8.641020237715953e-06, + "loss": 0.3266, + "step": 4185 + }, + { + "epoch": 0.26, + "grad_norm": 1.8034574015766647, + "learning_rate": 8.640322138723357e-06, + "loss": 0.3289, + "step": 4186 + }, + { + "epoch": 0.26, + "grad_norm": 2.699338701431057, + "learning_rate": 8.639623888687335e-06, + "loss": 0.3412, + "step": 4187 + }, + { + "epoch": 0.26, + "grad_norm": 3.8050538947985593, + "learning_rate": 8.638925487636847e-06, + "loss": 0.3431, + "step": 4188 + }, + { + "epoch": 0.26, + "grad_norm": 2.361753746662144, + "learning_rate": 8.638226935600881e-06, + "loss": 0.3219, + "step": 4189 + }, + { + "epoch": 0.26, + "grad_norm": 0.6258431357704101, + "learning_rate": 8.637528232608415e-06, + "loss": 0.5307, + "step": 4190 + }, + { + "epoch": 0.26, + "grad_norm": 1.9084641772079904, + "learning_rate": 8.636829378688443e-06, + "loss": 0.3455, + "step": 4191 + }, + { + "epoch": 0.26, + "grad_norm": 2.213081098267743, + "learning_rate": 8.63613037386996e-06, + "loss": 0.3346, + "step": 4192 + }, + { + "epoch": 0.26, + "grad_norm": 1.9435176600534825, + "learning_rate": 8.635431218181969e-06, + "loss": 0.3315, + "step": 4193 + }, + { + "epoch": 0.26, + "grad_norm": 2.360284991032729, + "learning_rate": 8.63473191165348e-06, + "loss": 0.3636, + "step": 4194 + }, + { + "epoch": 0.26, + "grad_norm": 1.9172780629234418, + "learning_rate": 8.634032454313507e-06, + "loss": 0.3324, + "step": 4195 + }, + { + "epoch": 0.26, + "grad_norm": 1.4100594044399137, + "learning_rate": 8.633332846191074e-06, + "loss": 0.3392, + "step": 4196 + }, + { + "epoch": 0.26, + "grad_norm": 3.5594277567110817, + "learning_rate": 8.632633087315207e-06, + "loss": 0.3202, + "step": 4197 + }, + { + "epoch": 0.26, + "grad_norm": 2.2788181000457977, + "learning_rate": 8.631933177714942e-06, + "loss": 0.3098, + "step": 4198 + }, + { + "epoch": 0.26, + "grad_norm": 3.1170957465696976, + "learning_rate": 8.631233117419317e-06, + "loss": 0.3259, + "step": 4199 + }, + { + "epoch": 0.26, + "grad_norm": 3.3692229325064487, + "learning_rate": 8.630532906457381e-06, + "loss": 0.3363, + "step": 4200 + }, + { + "epoch": 0.26, + "grad_norm": 1.6538040463328463, + "learning_rate": 8.629832544858186e-06, + "loss": 0.3235, + "step": 4201 + }, + { + "epoch": 0.26, + "grad_norm": 2.110178494042075, + "learning_rate": 8.62913203265079e-06, + "loss": 0.338, + "step": 4202 + }, + { + "epoch": 0.26, + "grad_norm": 2.633671884984901, + "learning_rate": 8.628431369864262e-06, + "loss": 0.3267, + "step": 4203 + }, + { + "epoch": 0.26, + "grad_norm": 1.4854503434641324, + "learning_rate": 8.62773055652767e-06, + "loss": 0.3124, + "step": 4204 + }, + { + "epoch": 0.26, + "grad_norm": 1.7229283162260696, + "learning_rate": 8.627029592670093e-06, + "loss": 0.325, + "step": 4205 + }, + { + "epoch": 0.26, + "grad_norm": 1.5823103223056096, + "learning_rate": 8.626328478320617e-06, + "loss": 0.3279, + "step": 4206 + }, + { + "epoch": 0.26, + "grad_norm": 2.4211654887395215, + "learning_rate": 8.625627213508332e-06, + "loss": 0.333, + "step": 4207 + }, + { + "epoch": 0.26, + "grad_norm": 2.332135911111867, + "learning_rate": 8.624925798262335e-06, + "loss": 0.371, + "step": 4208 + }, + { + "epoch": 0.26, + "grad_norm": 1.8388218575572008, + "learning_rate": 8.624224232611726e-06, + "loss": 0.332, + "step": 4209 + }, + { + "epoch": 0.26, + "grad_norm": 1.8924836693249167, + "learning_rate": 8.623522516585618e-06, + "loss": 0.324, + "step": 4210 + }, + { + "epoch": 0.26, + "grad_norm": 2.458712333451677, + "learning_rate": 8.622820650213122e-06, + "loss": 0.3298, + "step": 4211 + }, + { + "epoch": 0.26, + "grad_norm": 1.7460136933060542, + "learning_rate": 8.622118633523364e-06, + "loss": 0.3249, + "step": 4212 + }, + { + "epoch": 0.26, + "grad_norm": 1.5571048992183727, + "learning_rate": 8.621416466545472e-06, + "loss": 0.3261, + "step": 4213 + }, + { + "epoch": 0.27, + "grad_norm": 2.219814818474756, + "learning_rate": 8.620714149308575e-06, + "loss": 0.3262, + "step": 4214 + }, + { + "epoch": 0.27, + "grad_norm": 2.5310169645285376, + "learning_rate": 8.62001168184182e-06, + "loss": 0.3346, + "step": 4215 + }, + { + "epoch": 0.27, + "grad_norm": 3.0374915820909085, + "learning_rate": 8.619309064174349e-06, + "loss": 0.3604, + "step": 4216 + }, + { + "epoch": 0.27, + "grad_norm": 3.4260085703202003, + "learning_rate": 8.618606296335314e-06, + "loss": 0.3424, + "step": 4217 + }, + { + "epoch": 0.27, + "grad_norm": 2.2833963435369493, + "learning_rate": 8.61790337835388e-06, + "loss": 0.3269, + "step": 4218 + }, + { + "epoch": 0.27, + "grad_norm": 1.4732389178520906, + "learning_rate": 8.617200310259206e-06, + "loss": 0.3445, + "step": 4219 + }, + { + "epoch": 0.27, + "grad_norm": 1.580656104579461, + "learning_rate": 8.616497092080469e-06, + "loss": 0.3287, + "step": 4220 + }, + { + "epoch": 0.27, + "grad_norm": 1.5432302970182048, + "learning_rate": 8.615793723846842e-06, + "loss": 0.3295, + "step": 4221 + }, + { + "epoch": 0.27, + "grad_norm": 2.9865712153793176, + "learning_rate": 8.615090205587513e-06, + "loss": 0.353, + "step": 4222 + }, + { + "epoch": 0.27, + "grad_norm": 0.682272812913298, + "learning_rate": 8.614386537331667e-06, + "loss": 0.4891, + "step": 4223 + }, + { + "epoch": 0.27, + "grad_norm": 2.4364191794279937, + "learning_rate": 8.613682719108507e-06, + "loss": 0.3552, + "step": 4224 + }, + { + "epoch": 0.27, + "grad_norm": 1.985457930299239, + "learning_rate": 8.612978750947232e-06, + "loss": 0.3408, + "step": 4225 + }, + { + "epoch": 0.27, + "grad_norm": 1.666392022632377, + "learning_rate": 8.61227463287705e-06, + "loss": 0.3176, + "step": 4226 + }, + { + "epoch": 0.27, + "grad_norm": 2.0680902210910026, + "learning_rate": 8.611570364927178e-06, + "loss": 0.31, + "step": 4227 + }, + { + "epoch": 0.27, + "grad_norm": 2.3050551553530574, + "learning_rate": 8.610865947126836e-06, + "loss": 0.3189, + "step": 4228 + }, + { + "epoch": 0.27, + "grad_norm": 1.90337570792768, + "learning_rate": 8.610161379505253e-06, + "loss": 0.3434, + "step": 4229 + }, + { + "epoch": 0.27, + "grad_norm": 1.771438139668405, + "learning_rate": 8.60945666209166e-06, + "loss": 0.34, + "step": 4230 + }, + { + "epoch": 0.27, + "grad_norm": 1.3298283604768903, + "learning_rate": 8.6087517949153e-06, + "loss": 0.3202, + "step": 4231 + }, + { + "epoch": 0.27, + "grad_norm": 2.916247704002772, + "learning_rate": 8.608046778005419e-06, + "loss": 0.3261, + "step": 4232 + }, + { + "epoch": 0.27, + "grad_norm": 2.030769209932224, + "learning_rate": 8.607341611391268e-06, + "loss": 0.3622, + "step": 4233 + }, + { + "epoch": 0.27, + "grad_norm": 2.1697410164179995, + "learning_rate": 8.606636295102106e-06, + "loss": 0.3252, + "step": 4234 + }, + { + "epoch": 0.27, + "grad_norm": 5.605232603316513, + "learning_rate": 8.6059308291672e-06, + "loss": 0.3363, + "step": 4235 + }, + { + "epoch": 0.27, + "grad_norm": 1.6585366486403808, + "learning_rate": 8.605225213615818e-06, + "loss": 0.3221, + "step": 4236 + }, + { + "epoch": 0.27, + "grad_norm": 2.2632132517410937, + "learning_rate": 8.604519448477236e-06, + "loss": 0.3261, + "step": 4237 + }, + { + "epoch": 0.27, + "grad_norm": 1.7575344780553146, + "learning_rate": 8.603813533780742e-06, + "loss": 0.3425, + "step": 4238 + }, + { + "epoch": 0.27, + "grad_norm": 1.8349045762388945, + "learning_rate": 8.603107469555623e-06, + "loss": 0.3225, + "step": 4239 + }, + { + "epoch": 0.27, + "grad_norm": 2.4451097932181565, + "learning_rate": 8.602401255831176e-06, + "loss": 0.3448, + "step": 4240 + }, + { + "epoch": 0.27, + "grad_norm": 2.2508351057350167, + "learning_rate": 8.601694892636701e-06, + "loss": 0.3168, + "step": 4241 + }, + { + "epoch": 0.27, + "grad_norm": 1.6972953706707048, + "learning_rate": 8.600988380001508e-06, + "loss": 0.3391, + "step": 4242 + }, + { + "epoch": 0.27, + "grad_norm": 4.1794147541088265, + "learning_rate": 8.60028171795491e-06, + "loss": 0.3366, + "step": 4243 + }, + { + "epoch": 0.27, + "grad_norm": 1.5023694633565843, + "learning_rate": 8.599574906526231e-06, + "loss": 0.3464, + "step": 4244 + }, + { + "epoch": 0.27, + "grad_norm": 1.6034641172287127, + "learning_rate": 8.598867945744794e-06, + "loss": 0.3569, + "step": 4245 + }, + { + "epoch": 0.27, + "grad_norm": 1.6657183866249199, + "learning_rate": 8.598160835639935e-06, + "loss": 0.3222, + "step": 4246 + }, + { + "epoch": 0.27, + "grad_norm": 1.5283025124717415, + "learning_rate": 8.59745357624099e-06, + "loss": 0.3339, + "step": 4247 + }, + { + "epoch": 0.27, + "grad_norm": 1.2966176158873703, + "learning_rate": 8.596746167577308e-06, + "loss": 0.3289, + "step": 4248 + }, + { + "epoch": 0.27, + "grad_norm": 2.5372664240922145, + "learning_rate": 8.596038609678236e-06, + "loss": 0.3139, + "step": 4249 + }, + { + "epoch": 0.27, + "grad_norm": 2.3840010217867644, + "learning_rate": 8.595330902573137e-06, + "loss": 0.305, + "step": 4250 + }, + { + "epoch": 0.27, + "grad_norm": 1.6570800328309143, + "learning_rate": 8.594623046291374e-06, + "loss": 0.3161, + "step": 4251 + }, + { + "epoch": 0.27, + "grad_norm": 2.392855303032146, + "learning_rate": 8.593915040862315e-06, + "loss": 0.3046, + "step": 4252 + }, + { + "epoch": 0.27, + "grad_norm": 4.553084091586555, + "learning_rate": 8.593206886315338e-06, + "loss": 0.3393, + "step": 4253 + }, + { + "epoch": 0.27, + "grad_norm": 1.7687996769929684, + "learning_rate": 8.592498582679823e-06, + "loss": 0.347, + "step": 4254 + }, + { + "epoch": 0.27, + "grad_norm": 1.7340705972518944, + "learning_rate": 8.591790129985162e-06, + "loss": 0.3371, + "step": 4255 + }, + { + "epoch": 0.27, + "grad_norm": 1.3751645187296844, + "learning_rate": 8.59108152826075e-06, + "loss": 0.3306, + "step": 4256 + }, + { + "epoch": 0.27, + "grad_norm": 2.558167259182883, + "learning_rate": 8.590372777535984e-06, + "loss": 0.3582, + "step": 4257 + }, + { + "epoch": 0.27, + "grad_norm": 1.9325602192645501, + "learning_rate": 8.589663877840278e-06, + "loss": 0.3659, + "step": 4258 + }, + { + "epoch": 0.27, + "grad_norm": 0.7099919053117987, + "learning_rate": 8.588954829203039e-06, + "loss": 0.4824, + "step": 4259 + }, + { + "epoch": 0.27, + "grad_norm": 1.894137370774947, + "learning_rate": 8.58824563165369e-06, + "loss": 0.3661, + "step": 4260 + }, + { + "epoch": 0.27, + "grad_norm": 1.7094474668222037, + "learning_rate": 8.587536285221656e-06, + "loss": 0.3231, + "step": 4261 + }, + { + "epoch": 0.27, + "grad_norm": 1.6262659477981198, + "learning_rate": 8.58682678993637e-06, + "loss": 0.3357, + "step": 4262 + }, + { + "epoch": 0.27, + "grad_norm": 2.1584387794679487, + "learning_rate": 8.58611714582727e-06, + "loss": 0.3448, + "step": 4263 + }, + { + "epoch": 0.27, + "grad_norm": 6.5630222924968535, + "learning_rate": 8.585407352923799e-06, + "loss": 0.3483, + "step": 4264 + }, + { + "epoch": 0.27, + "grad_norm": 1.9131127117402358, + "learning_rate": 8.584697411255409e-06, + "loss": 0.3447, + "step": 4265 + }, + { + "epoch": 0.27, + "grad_norm": 7.8341610832471895, + "learning_rate": 8.583987320851556e-06, + "loss": 0.3329, + "step": 4266 + }, + { + "epoch": 0.27, + "grad_norm": 3.3496653343170317, + "learning_rate": 8.583277081741703e-06, + "loss": 0.3336, + "step": 4267 + }, + { + "epoch": 0.27, + "grad_norm": 1.607165837648559, + "learning_rate": 8.58256669395532e-06, + "loss": 0.3509, + "step": 4268 + }, + { + "epoch": 0.27, + "grad_norm": 1.9243460639183332, + "learning_rate": 8.581856157521882e-06, + "loss": 0.3428, + "step": 4269 + }, + { + "epoch": 0.27, + "grad_norm": 1.80555944204013, + "learning_rate": 8.58114547247087e-06, + "loss": 0.3495, + "step": 4270 + }, + { + "epoch": 0.27, + "grad_norm": 1.2423985483323439, + "learning_rate": 8.580434638831769e-06, + "loss": 0.3277, + "step": 4271 + }, + { + "epoch": 0.27, + "grad_norm": 1.5004579684570956, + "learning_rate": 8.579723656634077e-06, + "loss": 0.3033, + "step": 4272 + }, + { + "epoch": 0.27, + "grad_norm": 1.5256845265497303, + "learning_rate": 8.579012525907292e-06, + "loss": 0.3342, + "step": 4273 + }, + { + "epoch": 0.27, + "grad_norm": 1.6350629540576957, + "learning_rate": 8.57830124668092e-06, + "loss": 0.3616, + "step": 4274 + }, + { + "epoch": 0.27, + "grad_norm": 2.07337339387284, + "learning_rate": 8.577589818984473e-06, + "loss": 0.3241, + "step": 4275 + }, + { + "epoch": 0.27, + "grad_norm": 1.4436272941552504, + "learning_rate": 8.576878242847472e-06, + "loss": 0.327, + "step": 4276 + }, + { + "epoch": 0.27, + "grad_norm": 1.5680174375865998, + "learning_rate": 8.576166518299438e-06, + "loss": 0.3427, + "step": 4277 + }, + { + "epoch": 0.27, + "grad_norm": 1.9773442544469557, + "learning_rate": 8.575454645369904e-06, + "loss": 0.3532, + "step": 4278 + }, + { + "epoch": 0.27, + "grad_norm": 1.9291989336041662, + "learning_rate": 8.574742624088403e-06, + "loss": 0.3336, + "step": 4279 + }, + { + "epoch": 0.27, + "grad_norm": 2.1602198116166584, + "learning_rate": 8.574030454484486e-06, + "loss": 0.3252, + "step": 4280 + }, + { + "epoch": 0.27, + "grad_norm": 1.8307352894774007, + "learning_rate": 8.573318136587694e-06, + "loss": 0.3255, + "step": 4281 + }, + { + "epoch": 0.27, + "grad_norm": 1.9561684405910387, + "learning_rate": 8.572605670427584e-06, + "loss": 0.3391, + "step": 4282 + }, + { + "epoch": 0.27, + "grad_norm": 1.6164287395728145, + "learning_rate": 8.571893056033722e-06, + "loss": 0.3404, + "step": 4283 + }, + { + "epoch": 0.27, + "grad_norm": 1.8667852039769386, + "learning_rate": 8.571180293435673e-06, + "loss": 0.3273, + "step": 4284 + }, + { + "epoch": 0.27, + "grad_norm": 1.3175344965314466, + "learning_rate": 8.570467382663009e-06, + "loss": 0.3239, + "step": 4285 + }, + { + "epoch": 0.27, + "grad_norm": 7.789391909186617, + "learning_rate": 8.569754323745311e-06, + "loss": 0.3325, + "step": 4286 + }, + { + "epoch": 0.27, + "grad_norm": 1.674364036570318, + "learning_rate": 8.569041116712168e-06, + "loss": 0.3414, + "step": 4287 + }, + { + "epoch": 0.27, + "grad_norm": 1.8334698952624648, + "learning_rate": 8.568327761593169e-06, + "loss": 0.3276, + "step": 4288 + }, + { + "epoch": 0.27, + "grad_norm": 1.4038681936410702, + "learning_rate": 8.567614258417911e-06, + "loss": 0.3397, + "step": 4289 + }, + { + "epoch": 0.27, + "grad_norm": 1.2040393515936567, + "learning_rate": 8.566900607216001e-06, + "loss": 0.3368, + "step": 4290 + }, + { + "epoch": 0.27, + "grad_norm": 1.4904611649416002, + "learning_rate": 8.566186808017052e-06, + "loss": 0.33, + "step": 4291 + }, + { + "epoch": 0.27, + "grad_norm": 2.1832334506308713, + "learning_rate": 8.565472860850675e-06, + "loss": 0.3358, + "step": 4292 + }, + { + "epoch": 0.27, + "grad_norm": 1.5032988899297923, + "learning_rate": 8.564758765746499e-06, + "loss": 0.3702, + "step": 4293 + }, + { + "epoch": 0.27, + "grad_norm": 3.8313876251831345, + "learning_rate": 8.564044522734147e-06, + "loss": 0.3223, + "step": 4294 + }, + { + "epoch": 0.27, + "grad_norm": 1.8462600345655764, + "learning_rate": 8.563330131843259e-06, + "loss": 0.3358, + "step": 4295 + }, + { + "epoch": 0.27, + "grad_norm": 3.173645721342238, + "learning_rate": 8.562615593103474e-06, + "loss": 0.3745, + "step": 4296 + }, + { + "epoch": 0.27, + "grad_norm": 1.6391342795380124, + "learning_rate": 8.56190090654444e-06, + "loss": 0.3325, + "step": 4297 + }, + { + "epoch": 0.27, + "grad_norm": 1.4289099612583376, + "learning_rate": 8.561186072195812e-06, + "loss": 0.3388, + "step": 4298 + }, + { + "epoch": 0.27, + "grad_norm": 1.2782260343317648, + "learning_rate": 8.560471090087247e-06, + "loss": 0.3629, + "step": 4299 + }, + { + "epoch": 0.27, + "grad_norm": 1.204730944478129, + "learning_rate": 8.559755960248414e-06, + "loss": 0.3572, + "step": 4300 + }, + { + "epoch": 0.27, + "grad_norm": 1.948083818207941, + "learning_rate": 8.559040682708982e-06, + "loss": 0.3281, + "step": 4301 + }, + { + "epoch": 0.27, + "grad_norm": 2.274866949173534, + "learning_rate": 8.558325257498632e-06, + "loss": 0.3469, + "step": 4302 + }, + { + "epoch": 0.27, + "grad_norm": 1.2081337728099282, + "learning_rate": 8.557609684647045e-06, + "loss": 0.3526, + "step": 4303 + }, + { + "epoch": 0.27, + "grad_norm": 1.9668900755299428, + "learning_rate": 8.556893964183913e-06, + "loss": 0.3348, + "step": 4304 + }, + { + "epoch": 0.27, + "grad_norm": 1.424996006234006, + "learning_rate": 8.556178096138933e-06, + "loss": 0.3227, + "step": 4305 + }, + { + "epoch": 0.27, + "grad_norm": 1.4852727013421994, + "learning_rate": 8.555462080541809e-06, + "loss": 0.3303, + "step": 4306 + }, + { + "epoch": 0.27, + "grad_norm": 1.5745261843264677, + "learning_rate": 8.554745917422247e-06, + "loss": 0.3625, + "step": 4307 + }, + { + "epoch": 0.27, + "grad_norm": 1.7543657370706096, + "learning_rate": 8.554029606809962e-06, + "loss": 0.3403, + "step": 4308 + }, + { + "epoch": 0.27, + "grad_norm": 1.6253439350883727, + "learning_rate": 8.553313148734676e-06, + "loss": 0.3302, + "step": 4309 + }, + { + "epoch": 0.27, + "grad_norm": 1.8840392671545938, + "learning_rate": 8.552596543226118e-06, + "loss": 0.3388, + "step": 4310 + }, + { + "epoch": 0.27, + "grad_norm": 3.295478319147859, + "learning_rate": 8.551879790314018e-06, + "loss": 0.3543, + "step": 4311 + }, + { + "epoch": 0.27, + "grad_norm": 1.2614115819624998, + "learning_rate": 8.551162890028118e-06, + "loss": 0.3293, + "step": 4312 + }, + { + "epoch": 0.27, + "grad_norm": 9.88470133706015, + "learning_rate": 8.55044584239816e-06, + "loss": 0.3529, + "step": 4313 + }, + { + "epoch": 0.27, + "grad_norm": 1.4376100337365636, + "learning_rate": 8.5497286474539e-06, + "loss": 0.3379, + "step": 4314 + }, + { + "epoch": 0.27, + "grad_norm": 2.1295369507884203, + "learning_rate": 8.54901130522509e-06, + "loss": 0.354, + "step": 4315 + }, + { + "epoch": 0.27, + "grad_norm": 1.5413742819786713, + "learning_rate": 8.5482938157415e-06, + "loss": 0.3353, + "step": 4316 + }, + { + "epoch": 0.27, + "grad_norm": 2.1450275082854895, + "learning_rate": 8.547576179032896e-06, + "loss": 0.3509, + "step": 4317 + }, + { + "epoch": 0.27, + "grad_norm": 4.768370691279072, + "learning_rate": 8.546858395129055e-06, + "loss": 0.3514, + "step": 4318 + }, + { + "epoch": 0.27, + "grad_norm": 1.2362246613404333, + "learning_rate": 8.54614046405976e-06, + "loss": 0.3244, + "step": 4319 + }, + { + "epoch": 0.27, + "grad_norm": 1.1640030660592486, + "learning_rate": 8.545422385854798e-06, + "loss": 0.3587, + "step": 4320 + }, + { + "epoch": 0.27, + "grad_norm": 1.5859332241452266, + "learning_rate": 8.544704160543964e-06, + "loss": 0.3406, + "step": 4321 + }, + { + "epoch": 0.27, + "grad_norm": 2.7424619456056605, + "learning_rate": 8.543985788157058e-06, + "loss": 0.3434, + "step": 4322 + }, + { + "epoch": 0.27, + "grad_norm": 2.3040625766961984, + "learning_rate": 8.543267268723887e-06, + "loss": 0.3342, + "step": 4323 + }, + { + "epoch": 0.27, + "grad_norm": 1.7915454490679616, + "learning_rate": 8.542548602274262e-06, + "loss": 0.3421, + "step": 4324 + }, + { + "epoch": 0.27, + "grad_norm": 2.5245026450484587, + "learning_rate": 8.541829788838007e-06, + "loss": 0.3603, + "step": 4325 + }, + { + "epoch": 0.27, + "grad_norm": 1.8410130517323098, + "learning_rate": 8.54111082844494e-06, + "loss": 0.3339, + "step": 4326 + }, + { + "epoch": 0.27, + "grad_norm": 1.4665300091391573, + "learning_rate": 8.540391721124897e-06, + "loss": 0.3287, + "step": 4327 + }, + { + "epoch": 0.27, + "grad_norm": 1.9614137067521844, + "learning_rate": 8.53967246690771e-06, + "loss": 0.3522, + "step": 4328 + }, + { + "epoch": 0.27, + "grad_norm": 1.169946975148502, + "learning_rate": 8.538953065823229e-06, + "loss": 0.3423, + "step": 4329 + }, + { + "epoch": 0.27, + "grad_norm": 1.4254970564272986, + "learning_rate": 8.538233517901298e-06, + "loss": 0.335, + "step": 4330 + }, + { + "epoch": 0.27, + "grad_norm": 1.8760149928771839, + "learning_rate": 8.537513823171773e-06, + "loss": 0.3336, + "step": 4331 + }, + { + "epoch": 0.27, + "grad_norm": 1.527033180686907, + "learning_rate": 8.536793981664515e-06, + "loss": 0.3316, + "step": 4332 + }, + { + "epoch": 0.27, + "grad_norm": 1.9056575167826721, + "learning_rate": 8.536073993409394e-06, + "loss": 0.3512, + "step": 4333 + }, + { + "epoch": 0.27, + "grad_norm": 1.7523770080774033, + "learning_rate": 8.535353858436284e-06, + "loss": 0.3435, + "step": 4334 + }, + { + "epoch": 0.27, + "grad_norm": 1.4986535924131184, + "learning_rate": 8.534633576775059e-06, + "loss": 0.326, + "step": 4335 + }, + { + "epoch": 0.27, + "grad_norm": 2.1885721986240636, + "learning_rate": 8.533913148455613e-06, + "loss": 0.3488, + "step": 4336 + }, + { + "epoch": 0.27, + "grad_norm": 1.4865190194148143, + "learning_rate": 8.533192573507831e-06, + "loss": 0.3345, + "step": 4337 + }, + { + "epoch": 0.27, + "grad_norm": 1.7807621071293849, + "learning_rate": 8.532471851961612e-06, + "loss": 0.3287, + "step": 4338 + }, + { + "epoch": 0.27, + "grad_norm": 1.7371039780349717, + "learning_rate": 8.531750983846864e-06, + "loss": 0.347, + "step": 4339 + }, + { + "epoch": 0.27, + "grad_norm": 0.6273966515263565, + "learning_rate": 8.531029969193494e-06, + "loss": 0.5233, + "step": 4340 + }, + { + "epoch": 0.27, + "grad_norm": 1.9254934974362559, + "learning_rate": 8.53030880803142e-06, + "loss": 0.3349, + "step": 4341 + }, + { + "epoch": 0.27, + "grad_norm": 1.726887305529337, + "learning_rate": 8.529587500390561e-06, + "loss": 0.358, + "step": 4342 + }, + { + "epoch": 0.27, + "grad_norm": 1.7184910973631984, + "learning_rate": 8.528866046300848e-06, + "loss": 0.3321, + "step": 4343 + }, + { + "epoch": 0.27, + "grad_norm": 1.7275129845163628, + "learning_rate": 8.528144445792215e-06, + "loss": 0.3351, + "step": 4344 + }, + { + "epoch": 0.27, + "grad_norm": 1.5879895951632277, + "learning_rate": 8.527422698894602e-06, + "loss": 0.3278, + "step": 4345 + }, + { + "epoch": 0.27, + "grad_norm": 3.0130951180578815, + "learning_rate": 8.526700805637955e-06, + "loss": 0.327, + "step": 4346 + }, + { + "epoch": 0.27, + "grad_norm": 2.2594256716898915, + "learning_rate": 8.52597876605223e-06, + "loss": 0.3464, + "step": 4347 + }, + { + "epoch": 0.27, + "grad_norm": 1.8173665610318812, + "learning_rate": 8.52525658016738e-06, + "loss": 0.3337, + "step": 4348 + }, + { + "epoch": 0.27, + "grad_norm": 1.1933980716705372, + "learning_rate": 8.524534248013374e-06, + "loss": 0.3506, + "step": 4349 + }, + { + "epoch": 0.27, + "grad_norm": 1.4328666112013297, + "learning_rate": 8.523811769620183e-06, + "loss": 0.3288, + "step": 4350 + }, + { + "epoch": 0.27, + "grad_norm": 1.3862717693388513, + "learning_rate": 8.523089145017781e-06, + "loss": 0.3382, + "step": 4351 + }, + { + "epoch": 0.27, + "grad_norm": 2.963060191769809, + "learning_rate": 8.522366374236155e-06, + "loss": 0.3407, + "step": 4352 + }, + { + "epoch": 0.27, + "grad_norm": 1.375648340144036, + "learning_rate": 8.52164345730529e-06, + "loss": 0.3235, + "step": 4353 + }, + { + "epoch": 0.27, + "grad_norm": 1.759064297095795, + "learning_rate": 8.520920394255183e-06, + "loss": 0.3283, + "step": 4354 + }, + { + "epoch": 0.27, + "grad_norm": 2.989751854975891, + "learning_rate": 8.520197185115836e-06, + "loss": 0.3171, + "step": 4355 + }, + { + "epoch": 0.27, + "grad_norm": 1.5708895020208051, + "learning_rate": 8.519473829917254e-06, + "loss": 0.3305, + "step": 4356 + }, + { + "epoch": 0.27, + "grad_norm": 1.4383389833553923, + "learning_rate": 8.518750328689452e-06, + "loss": 0.3189, + "step": 4357 + }, + { + "epoch": 0.27, + "grad_norm": 1.702322153472481, + "learning_rate": 8.518026681462448e-06, + "loss": 0.3633, + "step": 4358 + }, + { + "epoch": 0.27, + "grad_norm": 1.1957765031845529, + "learning_rate": 8.517302888266268e-06, + "loss": 0.3427, + "step": 4359 + }, + { + "epoch": 0.27, + "grad_norm": 1.7601932345222768, + "learning_rate": 8.516578949130945e-06, + "loss": 0.3347, + "step": 4360 + }, + { + "epoch": 0.27, + "grad_norm": 3.01378365039657, + "learning_rate": 8.515854864086514e-06, + "loss": 0.346, + "step": 4361 + }, + { + "epoch": 0.27, + "grad_norm": 2.3238848278812276, + "learning_rate": 8.515130633163023e-06, + "loss": 0.3405, + "step": 4362 + }, + { + "epoch": 0.27, + "grad_norm": 1.4612797367053143, + "learning_rate": 8.514406256390514e-06, + "loss": 0.365, + "step": 4363 + }, + { + "epoch": 0.27, + "grad_norm": 1.3676256160402094, + "learning_rate": 8.513681733799051e-06, + "loss": 0.3427, + "step": 4364 + }, + { + "epoch": 0.27, + "grad_norm": 1.7017726229022991, + "learning_rate": 8.51295706541869e-06, + "loss": 0.3312, + "step": 4365 + }, + { + "epoch": 0.27, + "grad_norm": 1.8504650548271053, + "learning_rate": 8.5122322512795e-06, + "loss": 0.3181, + "step": 4366 + }, + { + "epoch": 0.27, + "grad_norm": 1.6175004487616216, + "learning_rate": 8.511507291411559e-06, + "loss": 0.3372, + "step": 4367 + }, + { + "epoch": 0.27, + "grad_norm": 1.6717481403154533, + "learning_rate": 8.51078218584494e-06, + "loss": 0.3453, + "step": 4368 + }, + { + "epoch": 0.27, + "grad_norm": 1.6367362300887454, + "learning_rate": 8.510056934609731e-06, + "loss": 0.3267, + "step": 4369 + }, + { + "epoch": 0.27, + "grad_norm": 1.6889658034505874, + "learning_rate": 8.509331537736027e-06, + "loss": 0.3549, + "step": 4370 + }, + { + "epoch": 0.27, + "grad_norm": 1.563582268903867, + "learning_rate": 8.508605995253925e-06, + "loss": 0.3461, + "step": 4371 + }, + { + "epoch": 0.27, + "grad_norm": 1.6291706586706887, + "learning_rate": 8.507880307193528e-06, + "loss": 0.3227, + "step": 4372 + }, + { + "epoch": 0.28, + "grad_norm": 5.3164096190998, + "learning_rate": 8.507154473584947e-06, + "loss": 0.3685, + "step": 4373 + }, + { + "epoch": 0.28, + "grad_norm": 1.4672278234014753, + "learning_rate": 8.506428494458295e-06, + "loss": 0.3194, + "step": 4374 + }, + { + "epoch": 0.28, + "grad_norm": 1.4037526339260669, + "learning_rate": 8.505702369843698e-06, + "loss": 0.3379, + "step": 4375 + }, + { + "epoch": 0.28, + "grad_norm": 2.6108254330532286, + "learning_rate": 8.504976099771282e-06, + "loss": 0.3407, + "step": 4376 + }, + { + "epoch": 0.28, + "grad_norm": 3.0522815552892464, + "learning_rate": 8.504249684271184e-06, + "loss": 0.3404, + "step": 4377 + }, + { + "epoch": 0.28, + "grad_norm": 1.5535046442865097, + "learning_rate": 8.503523123373542e-06, + "loss": 0.3355, + "step": 4378 + }, + { + "epoch": 0.28, + "grad_norm": 1.3438725864520216, + "learning_rate": 8.502796417108502e-06, + "loss": 0.329, + "step": 4379 + }, + { + "epoch": 0.28, + "grad_norm": 2.2135373884053635, + "learning_rate": 8.502069565506217e-06, + "loss": 0.3317, + "step": 4380 + }, + { + "epoch": 0.28, + "grad_norm": 1.7996234124505242, + "learning_rate": 8.501342568596845e-06, + "loss": 0.3336, + "step": 4381 + }, + { + "epoch": 0.28, + "grad_norm": 2.8120541851369305, + "learning_rate": 8.500615426410552e-06, + "loss": 0.379, + "step": 4382 + }, + { + "epoch": 0.28, + "grad_norm": 1.9061961758448398, + "learning_rate": 8.499888138977505e-06, + "loss": 0.3278, + "step": 4383 + }, + { + "epoch": 0.28, + "grad_norm": 1.535295646925185, + "learning_rate": 8.499160706327885e-06, + "loss": 0.3321, + "step": 4384 + }, + { + "epoch": 0.28, + "grad_norm": 1.6314075826180614, + "learning_rate": 8.498433128491871e-06, + "loss": 0.3036, + "step": 4385 + }, + { + "epoch": 0.28, + "grad_norm": 1.3899585689416163, + "learning_rate": 8.497705405499653e-06, + "loss": 0.3316, + "step": 4386 + }, + { + "epoch": 0.28, + "grad_norm": 1.8082256460455335, + "learning_rate": 8.496977537381423e-06, + "loss": 0.3213, + "step": 4387 + }, + { + "epoch": 0.28, + "grad_norm": 1.6111633913066594, + "learning_rate": 8.496249524167385e-06, + "loss": 0.3156, + "step": 4388 + }, + { + "epoch": 0.28, + "grad_norm": 2.178875953257289, + "learning_rate": 8.495521365887746e-06, + "loss": 0.3249, + "step": 4389 + }, + { + "epoch": 0.28, + "grad_norm": 1.5600057353396926, + "learning_rate": 8.494793062572715e-06, + "loss": 0.3251, + "step": 4390 + }, + { + "epoch": 0.28, + "grad_norm": 1.9363086132080793, + "learning_rate": 8.494064614252514e-06, + "loss": 0.3483, + "step": 4391 + }, + { + "epoch": 0.28, + "grad_norm": 2.3219061346519227, + "learning_rate": 8.493336020957363e-06, + "loss": 0.3385, + "step": 4392 + }, + { + "epoch": 0.28, + "grad_norm": 2.0627756771022723, + "learning_rate": 8.492607282717498e-06, + "loss": 0.3138, + "step": 4393 + }, + { + "epoch": 0.28, + "grad_norm": 1.969644771274343, + "learning_rate": 8.491878399563152e-06, + "loss": 0.3509, + "step": 4394 + }, + { + "epoch": 0.28, + "grad_norm": 1.7271320488528492, + "learning_rate": 8.49114937152457e-06, + "loss": 0.3435, + "step": 4395 + }, + { + "epoch": 0.28, + "grad_norm": 2.002131941452337, + "learning_rate": 8.490420198631998e-06, + "loss": 0.3385, + "step": 4396 + }, + { + "epoch": 0.28, + "grad_norm": 1.077766615899552, + "learning_rate": 8.489690880915695e-06, + "loss": 0.3174, + "step": 4397 + }, + { + "epoch": 0.28, + "grad_norm": 1.4175821939526254, + "learning_rate": 8.488961418405917e-06, + "loss": 0.3143, + "step": 4398 + }, + { + "epoch": 0.28, + "grad_norm": 1.373658890823445, + "learning_rate": 8.488231811132932e-06, + "loss": 0.357, + "step": 4399 + }, + { + "epoch": 0.28, + "grad_norm": 2.7707450262173765, + "learning_rate": 8.487502059127015e-06, + "loss": 0.3439, + "step": 4400 + }, + { + "epoch": 0.28, + "grad_norm": 1.3140756892535392, + "learning_rate": 8.486772162418442e-06, + "loss": 0.3364, + "step": 4401 + }, + { + "epoch": 0.28, + "grad_norm": 2.2274575926710707, + "learning_rate": 8.4860421210375e-06, + "loss": 0.3485, + "step": 4402 + }, + { + "epoch": 0.28, + "grad_norm": 1.1946148108466015, + "learning_rate": 8.485311935014478e-06, + "loss": 0.3448, + "step": 4403 + }, + { + "epoch": 0.28, + "grad_norm": 1.2386776861371314, + "learning_rate": 8.484581604379673e-06, + "loss": 0.3226, + "step": 4404 + }, + { + "epoch": 0.28, + "grad_norm": 2.1940307130584027, + "learning_rate": 8.483851129163388e-06, + "loss": 0.3419, + "step": 4405 + }, + { + "epoch": 0.28, + "grad_norm": 1.4218911382822264, + "learning_rate": 8.48312050939593e-06, + "loss": 0.3458, + "step": 4406 + }, + { + "epoch": 0.28, + "grad_norm": 1.4026977993571956, + "learning_rate": 8.482389745107618e-06, + "loss": 0.3425, + "step": 4407 + }, + { + "epoch": 0.28, + "grad_norm": 2.7591858574730552, + "learning_rate": 8.481658836328767e-06, + "loss": 0.333, + "step": 4408 + }, + { + "epoch": 0.28, + "grad_norm": 1.3768237191404413, + "learning_rate": 8.48092778308971e-06, + "loss": 0.3452, + "step": 4409 + }, + { + "epoch": 0.28, + "grad_norm": 1.1607715160729817, + "learning_rate": 8.480196585420775e-06, + "loss": 0.3401, + "step": 4410 + }, + { + "epoch": 0.28, + "grad_norm": 1.7690587304651568, + "learning_rate": 8.479465243352303e-06, + "loss": 0.3353, + "step": 4411 + }, + { + "epoch": 0.28, + "grad_norm": 1.166355849091709, + "learning_rate": 8.478733756914636e-06, + "loss": 0.3216, + "step": 4412 + }, + { + "epoch": 0.28, + "grad_norm": 1.2161506734271486, + "learning_rate": 8.478002126138127e-06, + "loss": 0.3247, + "step": 4413 + }, + { + "epoch": 0.28, + "grad_norm": 1.3162179304770918, + "learning_rate": 8.477270351053133e-06, + "loss": 0.3306, + "step": 4414 + }, + { + "epoch": 0.28, + "grad_norm": 1.4929854712675306, + "learning_rate": 8.476538431690016e-06, + "loss": 0.3497, + "step": 4415 + }, + { + "epoch": 0.28, + "grad_norm": 19.624411338106835, + "learning_rate": 8.475806368079144e-06, + "loss": 0.3609, + "step": 4416 + }, + { + "epoch": 0.28, + "grad_norm": 2.0073075734938985, + "learning_rate": 8.475074160250892e-06, + "loss": 0.3313, + "step": 4417 + }, + { + "epoch": 0.28, + "grad_norm": 2.807099711074169, + "learning_rate": 8.47434180823564e-06, + "loss": 0.3398, + "step": 4418 + }, + { + "epoch": 0.28, + "grad_norm": 2.677998276150886, + "learning_rate": 8.473609312063778e-06, + "loss": 0.3462, + "step": 4419 + }, + { + "epoch": 0.28, + "grad_norm": 0.6659747390409879, + "learning_rate": 8.472876671765692e-06, + "loss": 0.489, + "step": 4420 + }, + { + "epoch": 0.28, + "grad_norm": 2.037584135778137, + "learning_rate": 8.472143887371786e-06, + "loss": 0.3379, + "step": 4421 + }, + { + "epoch": 0.28, + "grad_norm": 1.3806910811268274, + "learning_rate": 8.471410958912463e-06, + "loss": 0.3233, + "step": 4422 + }, + { + "epoch": 0.28, + "grad_norm": 2.0217306989118162, + "learning_rate": 8.470677886418135e-06, + "loss": 0.3474, + "step": 4423 + }, + { + "epoch": 0.28, + "grad_norm": 1.211865032210044, + "learning_rate": 8.469944669919214e-06, + "loss": 0.3365, + "step": 4424 + }, + { + "epoch": 0.28, + "grad_norm": 2.0957457196553895, + "learning_rate": 8.469211309446127e-06, + "loss": 0.3423, + "step": 4425 + }, + { + "epoch": 0.28, + "grad_norm": 1.9589516548469754, + "learning_rate": 8.4684778050293e-06, + "loss": 0.3364, + "step": 4426 + }, + { + "epoch": 0.28, + "grad_norm": 2.0751073588242526, + "learning_rate": 8.467744156699168e-06, + "loss": 0.3377, + "step": 4427 + }, + { + "epoch": 0.28, + "grad_norm": 2.164582665735496, + "learning_rate": 8.467010364486173e-06, + "loss": 0.3701, + "step": 4428 + }, + { + "epoch": 0.28, + "grad_norm": 11.773615112194289, + "learning_rate": 8.466276428420759e-06, + "loss": 0.3499, + "step": 4429 + }, + { + "epoch": 0.28, + "grad_norm": 1.7825839864747932, + "learning_rate": 8.465542348533379e-06, + "loss": 0.3574, + "step": 4430 + }, + { + "epoch": 0.28, + "grad_norm": 3.2084379741261175, + "learning_rate": 8.46480812485449e-06, + "loss": 0.35, + "step": 4431 + }, + { + "epoch": 0.28, + "grad_norm": 1.4643439280425021, + "learning_rate": 8.464073757414561e-06, + "loss": 0.3522, + "step": 4432 + }, + { + "epoch": 0.28, + "grad_norm": 1.6935562458211901, + "learning_rate": 8.463339246244058e-06, + "loss": 0.3346, + "step": 4433 + }, + { + "epoch": 0.28, + "grad_norm": 1.6989172639998948, + "learning_rate": 8.462604591373457e-06, + "loss": 0.3388, + "step": 4434 + }, + { + "epoch": 0.28, + "grad_norm": 1.6921131120928505, + "learning_rate": 8.46186979283324e-06, + "loss": 0.3521, + "step": 4435 + }, + { + "epoch": 0.28, + "grad_norm": 2.552014840242945, + "learning_rate": 8.461134850653899e-06, + "loss": 0.3172, + "step": 4436 + }, + { + "epoch": 0.28, + "grad_norm": 1.5128879381537257, + "learning_rate": 8.460399764865923e-06, + "loss": 0.353, + "step": 4437 + }, + { + "epoch": 0.28, + "grad_norm": 2.2140103876396258, + "learning_rate": 8.459664535499816e-06, + "loss": 0.3549, + "step": 4438 + }, + { + "epoch": 0.28, + "grad_norm": 2.9802234604045736, + "learning_rate": 8.458929162586083e-06, + "loss": 0.3498, + "step": 4439 + }, + { + "epoch": 0.28, + "grad_norm": 5.301930046058223, + "learning_rate": 8.458193646155234e-06, + "loss": 0.329, + "step": 4440 + }, + { + "epoch": 0.28, + "grad_norm": 2.6136415581442636, + "learning_rate": 8.457457986237788e-06, + "loss": 0.3502, + "step": 4441 + }, + { + "epoch": 0.28, + "grad_norm": 2.7302452323933113, + "learning_rate": 8.45672218286427e-06, + "loss": 0.3241, + "step": 4442 + }, + { + "epoch": 0.28, + "grad_norm": 1.1475670132876745, + "learning_rate": 8.455986236065208e-06, + "loss": 0.3252, + "step": 4443 + }, + { + "epoch": 0.28, + "grad_norm": 1.6465254320379765, + "learning_rate": 8.455250145871139e-06, + "loss": 0.3287, + "step": 4444 + }, + { + "epoch": 0.28, + "grad_norm": 2.2861135255516976, + "learning_rate": 8.454513912312605e-06, + "loss": 0.3169, + "step": 4445 + }, + { + "epoch": 0.28, + "grad_norm": 0.697120502963351, + "learning_rate": 8.45377753542015e-06, + "loss": 0.5319, + "step": 4446 + }, + { + "epoch": 0.28, + "grad_norm": 1.733200793446351, + "learning_rate": 8.453041015224332e-06, + "loss": 0.3337, + "step": 4447 + }, + { + "epoch": 0.28, + "grad_norm": 2.060444362366062, + "learning_rate": 8.452304351755712e-06, + "loss": 0.3254, + "step": 4448 + }, + { + "epoch": 0.28, + "grad_norm": 2.8920764037027373, + "learning_rate": 8.451567545044848e-06, + "loss": 0.3392, + "step": 4449 + }, + { + "epoch": 0.28, + "grad_norm": 1.7831363366263777, + "learning_rate": 8.450830595122317e-06, + "loss": 0.3381, + "step": 4450 + }, + { + "epoch": 0.28, + "grad_norm": 1.7005389347084314, + "learning_rate": 8.450093502018698e-06, + "loss": 0.3388, + "step": 4451 + }, + { + "epoch": 0.28, + "grad_norm": 1.8978674480258144, + "learning_rate": 8.449356265764568e-06, + "loss": 0.3497, + "step": 4452 + }, + { + "epoch": 0.28, + "grad_norm": 0.6128982561238903, + "learning_rate": 8.448618886390523e-06, + "loss": 0.4854, + "step": 4453 + }, + { + "epoch": 0.28, + "grad_norm": 2.225443077703998, + "learning_rate": 8.447881363927152e-06, + "loss": 0.3379, + "step": 4454 + }, + { + "epoch": 0.28, + "grad_norm": 2.170752349368643, + "learning_rate": 8.44714369840506e-06, + "loss": 0.3485, + "step": 4455 + }, + { + "epoch": 0.28, + "grad_norm": 2.2547032767916018, + "learning_rate": 8.446405889854853e-06, + "loss": 0.3316, + "step": 4456 + }, + { + "epoch": 0.28, + "grad_norm": 1.396362027733872, + "learning_rate": 8.445667938307145e-06, + "loss": 0.355, + "step": 4457 + }, + { + "epoch": 0.28, + "grad_norm": 1.902622725375012, + "learning_rate": 8.444929843792554e-06, + "loss": 0.3245, + "step": 4458 + }, + { + "epoch": 0.28, + "grad_norm": 2.0892517371336243, + "learning_rate": 8.444191606341705e-06, + "loss": 0.3295, + "step": 4459 + }, + { + "epoch": 0.28, + "grad_norm": 1.5573086843510329, + "learning_rate": 8.443453225985227e-06, + "loss": 0.3189, + "step": 4460 + }, + { + "epoch": 0.28, + "grad_norm": 2.786192565503805, + "learning_rate": 8.442714702753761e-06, + "loss": 0.3289, + "step": 4461 + }, + { + "epoch": 0.28, + "grad_norm": 2.9314382992937276, + "learning_rate": 8.441976036677945e-06, + "loss": 0.3311, + "step": 4462 + }, + { + "epoch": 0.28, + "grad_norm": 2.4805191730640384, + "learning_rate": 8.44123722778843e-06, + "loss": 0.3388, + "step": 4463 + }, + { + "epoch": 0.28, + "grad_norm": 1.4171308586400473, + "learning_rate": 8.440498276115872e-06, + "loss": 0.3672, + "step": 4464 + }, + { + "epoch": 0.28, + "grad_norm": 2.083731638138706, + "learning_rate": 8.439759181690927e-06, + "loss": 0.3624, + "step": 4465 + }, + { + "epoch": 0.28, + "grad_norm": 1.6896070284168334, + "learning_rate": 8.439019944544265e-06, + "loss": 0.338, + "step": 4466 + }, + { + "epoch": 0.28, + "grad_norm": 1.5516054145198155, + "learning_rate": 8.438280564706556e-06, + "loss": 0.3368, + "step": 4467 + }, + { + "epoch": 0.28, + "grad_norm": 1.3380291500414585, + "learning_rate": 8.43754104220848e-06, + "loss": 0.3423, + "step": 4468 + }, + { + "epoch": 0.28, + "grad_norm": 1.5818858495087924, + "learning_rate": 8.436801377080721e-06, + "loss": 0.3434, + "step": 4469 + }, + { + "epoch": 0.28, + "grad_norm": 2.4457063362733797, + "learning_rate": 8.436061569353969e-06, + "loss": 0.3159, + "step": 4470 + }, + { + "epoch": 0.28, + "grad_norm": 2.4635786151827817, + "learning_rate": 8.435321619058918e-06, + "loss": 0.3433, + "step": 4471 + }, + { + "epoch": 0.28, + "grad_norm": 1.6632148533614326, + "learning_rate": 8.434581526226271e-06, + "loss": 0.3433, + "step": 4472 + }, + { + "epoch": 0.28, + "grad_norm": 1.4451986013557694, + "learning_rate": 8.433841290886736e-06, + "loss": 0.3227, + "step": 4473 + }, + { + "epoch": 0.28, + "grad_norm": 3.4790957748402054, + "learning_rate": 8.433100913071028e-06, + "loss": 0.3395, + "step": 4474 + }, + { + "epoch": 0.28, + "grad_norm": 3.3330617974772014, + "learning_rate": 8.432360392809864e-06, + "loss": 0.3293, + "step": 4475 + }, + { + "epoch": 0.28, + "grad_norm": 1.413761591379426, + "learning_rate": 8.43161973013397e-06, + "loss": 0.3207, + "step": 4476 + }, + { + "epoch": 0.28, + "grad_norm": 1.3253103720450454, + "learning_rate": 8.43087892507408e-06, + "loss": 0.3223, + "step": 4477 + }, + { + "epoch": 0.28, + "grad_norm": 2.140728020517338, + "learning_rate": 8.430137977660929e-06, + "loss": 0.3436, + "step": 4478 + }, + { + "epoch": 0.28, + "grad_norm": 1.7925152385170382, + "learning_rate": 8.42939688792526e-06, + "loss": 0.3324, + "step": 4479 + }, + { + "epoch": 0.28, + "grad_norm": 1.8727619116462806, + "learning_rate": 8.428655655897824e-06, + "loss": 0.3391, + "step": 4480 + }, + { + "epoch": 0.28, + "grad_norm": 1.9297848501336063, + "learning_rate": 8.427914281609374e-06, + "loss": 0.3289, + "step": 4481 + }, + { + "epoch": 0.28, + "grad_norm": 2.4249350042801914, + "learning_rate": 8.427172765090671e-06, + "loss": 0.3563, + "step": 4482 + }, + { + "epoch": 0.28, + "grad_norm": 1.4973898360340145, + "learning_rate": 8.426431106372484e-06, + "loss": 0.3346, + "step": 4483 + }, + { + "epoch": 0.28, + "grad_norm": 1.7919917592138357, + "learning_rate": 8.425689305485583e-06, + "loss": 0.3486, + "step": 4484 + }, + { + "epoch": 0.28, + "grad_norm": 2.976785259701706, + "learning_rate": 8.42494736246075e-06, + "loss": 0.3702, + "step": 4485 + }, + { + "epoch": 0.28, + "grad_norm": 1.6446864876745837, + "learning_rate": 8.424205277328767e-06, + "loss": 0.3395, + "step": 4486 + }, + { + "epoch": 0.28, + "grad_norm": 1.5223096088696004, + "learning_rate": 8.423463050120424e-06, + "loss": 0.338, + "step": 4487 + }, + { + "epoch": 0.28, + "grad_norm": 0.6865376594309599, + "learning_rate": 8.42272068086652e-06, + "loss": 0.4876, + "step": 4488 + }, + { + "epoch": 0.28, + "grad_norm": 1.3421517139266632, + "learning_rate": 8.421978169597853e-06, + "loss": 0.3248, + "step": 4489 + }, + { + "epoch": 0.28, + "grad_norm": 1.213398892885125, + "learning_rate": 8.421235516345236e-06, + "loss": 0.3262, + "step": 4490 + }, + { + "epoch": 0.28, + "grad_norm": 2.924517519833465, + "learning_rate": 8.420492721139479e-06, + "loss": 0.3204, + "step": 4491 + }, + { + "epoch": 0.28, + "grad_norm": 1.6575116442764894, + "learning_rate": 8.419749784011405e-06, + "loss": 0.3299, + "step": 4492 + }, + { + "epoch": 0.28, + "grad_norm": 1.7710530905246775, + "learning_rate": 8.419006704991837e-06, + "loss": 0.3431, + "step": 4493 + }, + { + "epoch": 0.28, + "grad_norm": 2.0894328764514416, + "learning_rate": 8.418263484111607e-06, + "loss": 0.325, + "step": 4494 + }, + { + "epoch": 0.28, + "grad_norm": 1.4823357952923764, + "learning_rate": 8.417520121401555e-06, + "loss": 0.3408, + "step": 4495 + }, + { + "epoch": 0.28, + "grad_norm": 0.6770118194333938, + "learning_rate": 8.416776616892523e-06, + "loss": 0.529, + "step": 4496 + }, + { + "epoch": 0.28, + "grad_norm": 9.061292293556901, + "learning_rate": 8.41603297061536e-06, + "loss": 0.3535, + "step": 4497 + }, + { + "epoch": 0.28, + "grad_norm": 1.3675176565008405, + "learning_rate": 8.41528918260092e-06, + "loss": 0.3406, + "step": 4498 + }, + { + "epoch": 0.28, + "grad_norm": 1.8532972918923218, + "learning_rate": 8.414545252880069e-06, + "loss": 0.3355, + "step": 4499 + }, + { + "epoch": 0.28, + "grad_norm": 1.5945330744361292, + "learning_rate": 8.413801181483668e-06, + "loss": 0.3247, + "step": 4500 + }, + { + "epoch": 0.28, + "grad_norm": 1.7164676116388484, + "learning_rate": 8.413056968442593e-06, + "loss": 0.3302, + "step": 4501 + }, + { + "epoch": 0.28, + "grad_norm": 0.5914714367032117, + "learning_rate": 8.412312613787723e-06, + "loss": 0.5142, + "step": 4502 + }, + { + "epoch": 0.28, + "grad_norm": 1.4003420472063788, + "learning_rate": 8.41156811754994e-06, + "loss": 0.3173, + "step": 4503 + }, + { + "epoch": 0.28, + "grad_norm": 1.867307296426739, + "learning_rate": 8.410823479760138e-06, + "loss": 0.3393, + "step": 4504 + }, + { + "epoch": 0.28, + "grad_norm": 2.0443264810913835, + "learning_rate": 8.41007870044921e-06, + "loss": 0.3329, + "step": 4505 + }, + { + "epoch": 0.28, + "grad_norm": 1.9652394315038801, + "learning_rate": 8.40933377964806e-06, + "loss": 0.3539, + "step": 4506 + }, + { + "epoch": 0.28, + "grad_norm": 1.792371914748029, + "learning_rate": 8.408588717387595e-06, + "loss": 0.3289, + "step": 4507 + }, + { + "epoch": 0.28, + "grad_norm": 1.5403269130661204, + "learning_rate": 8.407843513698732e-06, + "loss": 0.3216, + "step": 4508 + }, + { + "epoch": 0.28, + "grad_norm": 1.8494122686288157, + "learning_rate": 8.407098168612388e-06, + "loss": 0.3396, + "step": 4509 + }, + { + "epoch": 0.28, + "grad_norm": 2.0371687097009716, + "learning_rate": 8.406352682159488e-06, + "loss": 0.3519, + "step": 4510 + }, + { + "epoch": 0.28, + "grad_norm": 1.395283305635252, + "learning_rate": 8.405607054370967e-06, + "loss": 0.3355, + "step": 4511 + }, + { + "epoch": 0.28, + "grad_norm": 1.936582424310372, + "learning_rate": 8.404861285277759e-06, + "loss": 0.3329, + "step": 4512 + }, + { + "epoch": 0.28, + "grad_norm": 2.1771448147845263, + "learning_rate": 8.404115374910808e-06, + "loss": 0.3288, + "step": 4513 + }, + { + "epoch": 0.28, + "grad_norm": 2.045448180983775, + "learning_rate": 8.403369323301065e-06, + "loss": 0.3564, + "step": 4514 + }, + { + "epoch": 0.28, + "grad_norm": 1.7095027800576776, + "learning_rate": 8.402623130479482e-06, + "loss": 0.3402, + "step": 4515 + }, + { + "epoch": 0.28, + "grad_norm": 1.582270828514155, + "learning_rate": 8.401876796477023e-06, + "loss": 0.348, + "step": 4516 + }, + { + "epoch": 0.28, + "grad_norm": 2.1942599791844697, + "learning_rate": 8.401130321324653e-06, + "loss": 0.3301, + "step": 4517 + }, + { + "epoch": 0.28, + "grad_norm": 1.6912184946799662, + "learning_rate": 8.400383705053344e-06, + "loss": 0.3518, + "step": 4518 + }, + { + "epoch": 0.28, + "grad_norm": 3.1691049921985823, + "learning_rate": 8.399636947694077e-06, + "loss": 0.3298, + "step": 4519 + }, + { + "epoch": 0.28, + "grad_norm": 2.4106364768480293, + "learning_rate": 8.398890049277834e-06, + "loss": 0.35, + "step": 4520 + }, + { + "epoch": 0.28, + "grad_norm": 2.3155101896571524, + "learning_rate": 8.398143009835604e-06, + "loss": 0.3349, + "step": 4521 + }, + { + "epoch": 0.28, + "grad_norm": 2.6146989377182868, + "learning_rate": 8.397395829398387e-06, + "loss": 0.3234, + "step": 4522 + }, + { + "epoch": 0.28, + "grad_norm": 3.8812126832136733, + "learning_rate": 8.396648507997181e-06, + "loss": 0.3219, + "step": 4523 + }, + { + "epoch": 0.28, + "grad_norm": 3.6515107372210474, + "learning_rate": 8.395901045662995e-06, + "loss": 0.334, + "step": 4524 + }, + { + "epoch": 0.28, + "grad_norm": 2.862319841649697, + "learning_rate": 8.395153442426844e-06, + "loss": 0.3393, + "step": 4525 + }, + { + "epoch": 0.28, + "grad_norm": 2.00465234726739, + "learning_rate": 8.394405698319748e-06, + "loss": 0.3186, + "step": 4526 + }, + { + "epoch": 0.28, + "grad_norm": 3.657619312354189, + "learning_rate": 8.393657813372728e-06, + "loss": 0.336, + "step": 4527 + }, + { + "epoch": 0.28, + "grad_norm": 0.7031161656919498, + "learning_rate": 8.392909787616817e-06, + "loss": 0.5135, + "step": 4528 + }, + { + "epoch": 0.28, + "grad_norm": 1.9786773654926382, + "learning_rate": 8.392161621083054e-06, + "loss": 0.3445, + "step": 4529 + }, + { + "epoch": 0.28, + "grad_norm": 1.810524317888919, + "learning_rate": 8.39141331380248e-06, + "loss": 0.3348, + "step": 4530 + }, + { + "epoch": 0.28, + "grad_norm": 2.2980793145349176, + "learning_rate": 8.390664865806142e-06, + "loss": 0.3429, + "step": 4531 + }, + { + "epoch": 0.29, + "grad_norm": 2.032304141132441, + "learning_rate": 8.389916277125099e-06, + "loss": 0.3271, + "step": 4532 + }, + { + "epoch": 0.29, + "grad_norm": 1.605666291927062, + "learning_rate": 8.389167547790407e-06, + "loss": 0.3182, + "step": 4533 + }, + { + "epoch": 0.29, + "grad_norm": 1.7435515785285134, + "learning_rate": 8.388418677833132e-06, + "loss": 0.3269, + "step": 4534 + }, + { + "epoch": 0.29, + "grad_norm": 1.3544372758044134, + "learning_rate": 8.387669667284351e-06, + "loss": 0.3291, + "step": 4535 + }, + { + "epoch": 0.29, + "grad_norm": 2.319882159195161, + "learning_rate": 8.386920516175135e-06, + "loss": 0.32, + "step": 4536 + }, + { + "epoch": 0.29, + "grad_norm": 2.8330191414718793, + "learning_rate": 8.386171224536573e-06, + "loss": 0.3397, + "step": 4537 + }, + { + "epoch": 0.29, + "grad_norm": 1.6620619363299594, + "learning_rate": 8.385421792399753e-06, + "loss": 0.3347, + "step": 4538 + }, + { + "epoch": 0.29, + "grad_norm": 2.28094788853945, + "learning_rate": 8.384672219795766e-06, + "loss": 0.3278, + "step": 4539 + }, + { + "epoch": 0.29, + "grad_norm": 2.745254652276401, + "learning_rate": 8.38392250675572e-06, + "loss": 0.3362, + "step": 4540 + }, + { + "epoch": 0.29, + "grad_norm": 1.7561920695282103, + "learning_rate": 8.383172653310718e-06, + "loss": 0.3304, + "step": 4541 + }, + { + "epoch": 0.29, + "grad_norm": 1.8161784410716515, + "learning_rate": 8.382422659491873e-06, + "loss": 0.3289, + "step": 4542 + }, + { + "epoch": 0.29, + "grad_norm": 1.7828874551969827, + "learning_rate": 8.381672525330305e-06, + "loss": 0.3089, + "step": 4543 + }, + { + "epoch": 0.29, + "grad_norm": 1.8769878242003888, + "learning_rate": 8.380922250857137e-06, + "loss": 0.3293, + "step": 4544 + }, + { + "epoch": 0.29, + "grad_norm": 1.4945991479535552, + "learning_rate": 8.380171836103499e-06, + "loss": 0.3183, + "step": 4545 + }, + { + "epoch": 0.29, + "grad_norm": 4.5309737210634635, + "learning_rate": 8.379421281100529e-06, + "loss": 0.3357, + "step": 4546 + }, + { + "epoch": 0.29, + "grad_norm": 2.9862667447555324, + "learning_rate": 8.378670585879365e-06, + "loss": 0.314, + "step": 4547 + }, + { + "epoch": 0.29, + "grad_norm": 2.1606467474212208, + "learning_rate": 8.37791975047116e-06, + "loss": 0.3195, + "step": 4548 + }, + { + "epoch": 0.29, + "grad_norm": 2.4590132781243565, + "learning_rate": 8.377168774907063e-06, + "loss": 0.3186, + "step": 4549 + }, + { + "epoch": 0.29, + "grad_norm": 2.1345334959308437, + "learning_rate": 8.376417659218234e-06, + "loss": 0.3297, + "step": 4550 + }, + { + "epoch": 0.29, + "grad_norm": 2.2319609286801514, + "learning_rate": 8.375666403435842e-06, + "loss": 0.3329, + "step": 4551 + }, + { + "epoch": 0.29, + "grad_norm": 2.181362994384634, + "learning_rate": 8.374915007591053e-06, + "loss": 0.3396, + "step": 4552 + }, + { + "epoch": 0.29, + "grad_norm": 1.959476969109861, + "learning_rate": 8.374163471715046e-06, + "loss": 0.3473, + "step": 4553 + }, + { + "epoch": 0.29, + "grad_norm": 1.787575127099224, + "learning_rate": 8.373411795839003e-06, + "loss": 0.3293, + "step": 4554 + }, + { + "epoch": 0.29, + "grad_norm": 2.6641799563675863, + "learning_rate": 8.372659979994116e-06, + "loss": 0.3301, + "step": 4555 + }, + { + "epoch": 0.29, + "grad_norm": 4.621177104530586, + "learning_rate": 8.371908024211572e-06, + "loss": 0.3157, + "step": 4556 + }, + { + "epoch": 0.29, + "grad_norm": 2.2845599080276258, + "learning_rate": 8.371155928522578e-06, + "loss": 0.3256, + "step": 4557 + }, + { + "epoch": 0.29, + "grad_norm": 1.9115502875416628, + "learning_rate": 8.370403692958334e-06, + "loss": 0.3152, + "step": 4558 + }, + { + "epoch": 0.29, + "grad_norm": 2.9359034905984105, + "learning_rate": 8.369651317550055e-06, + "loss": 0.326, + "step": 4559 + }, + { + "epoch": 0.29, + "grad_norm": 1.47395122757619, + "learning_rate": 8.368898802328957e-06, + "loss": 0.3353, + "step": 4560 + }, + { + "epoch": 0.29, + "grad_norm": 3.112484592043131, + "learning_rate": 8.368146147326266e-06, + "loss": 0.3227, + "step": 4561 + }, + { + "epoch": 0.29, + "grad_norm": 1.837742348531059, + "learning_rate": 8.367393352573207e-06, + "loss": 0.3228, + "step": 4562 + }, + { + "epoch": 0.29, + "grad_norm": 1.8198363011071297, + "learning_rate": 8.366640418101018e-06, + "loss": 0.3044, + "step": 4563 + }, + { + "epoch": 0.29, + "grad_norm": 1.4792177581157653, + "learning_rate": 8.365887343940937e-06, + "loss": 0.3083, + "step": 4564 + }, + { + "epoch": 0.29, + "grad_norm": 2.0749427789794592, + "learning_rate": 8.365134130124214e-06, + "loss": 0.3141, + "step": 4565 + }, + { + "epoch": 0.29, + "grad_norm": 1.4680932736556904, + "learning_rate": 8.364380776682095e-06, + "loss": 0.3257, + "step": 4566 + }, + { + "epoch": 0.29, + "grad_norm": 1.5283672458531308, + "learning_rate": 8.363627283645843e-06, + "loss": 0.3326, + "step": 4567 + }, + { + "epoch": 0.29, + "grad_norm": 1.393840568899809, + "learning_rate": 8.362873651046722e-06, + "loss": 0.3201, + "step": 4568 + }, + { + "epoch": 0.29, + "grad_norm": 2.5101834231715774, + "learning_rate": 8.362119878916e-06, + "loss": 0.3444, + "step": 4569 + }, + { + "epoch": 0.29, + "grad_norm": 1.8470073253909227, + "learning_rate": 8.361365967284951e-06, + "loss": 0.326, + "step": 4570 + }, + { + "epoch": 0.29, + "grad_norm": 2.531618393204744, + "learning_rate": 8.360611916184859e-06, + "loss": 0.3273, + "step": 4571 + }, + { + "epoch": 0.29, + "grad_norm": 1.8945311703301655, + "learning_rate": 8.359857725647009e-06, + "loss": 0.3266, + "step": 4572 + }, + { + "epoch": 0.29, + "grad_norm": 2.212260774091573, + "learning_rate": 8.359103395702692e-06, + "loss": 0.3195, + "step": 4573 + }, + { + "epoch": 0.29, + "grad_norm": 3.352783258966287, + "learning_rate": 8.358348926383211e-06, + "loss": 0.3288, + "step": 4574 + }, + { + "epoch": 0.29, + "grad_norm": 3.5333192301589995, + "learning_rate": 8.357594317719868e-06, + "loss": 0.322, + "step": 4575 + }, + { + "epoch": 0.29, + "grad_norm": 1.4610462430917213, + "learning_rate": 8.356839569743974e-06, + "loss": 0.3299, + "step": 4576 + }, + { + "epoch": 0.29, + "grad_norm": 2.887917428489183, + "learning_rate": 8.35608468248684e-06, + "loss": 0.3276, + "step": 4577 + }, + { + "epoch": 0.29, + "grad_norm": 2.56190176490763, + "learning_rate": 8.355329655979794e-06, + "loss": 0.3346, + "step": 4578 + }, + { + "epoch": 0.29, + "grad_norm": 2.1840627550859883, + "learning_rate": 8.35457449025416e-06, + "loss": 0.3176, + "step": 4579 + }, + { + "epoch": 0.29, + "grad_norm": 1.8388061367703301, + "learning_rate": 8.353819185341275e-06, + "loss": 0.3181, + "step": 4580 + }, + { + "epoch": 0.29, + "grad_norm": 1.8328163528404189, + "learning_rate": 8.353063741272471e-06, + "loss": 0.3113, + "step": 4581 + }, + { + "epoch": 0.29, + "grad_norm": 1.4526576376010834, + "learning_rate": 8.352308158079099e-06, + "loss": 0.3248, + "step": 4582 + }, + { + "epoch": 0.29, + "grad_norm": 2.7219056871357408, + "learning_rate": 8.351552435792507e-06, + "loss": 0.316, + "step": 4583 + }, + { + "epoch": 0.29, + "grad_norm": 0.6905315900768709, + "learning_rate": 8.35079657444405e-06, + "loss": 0.5059, + "step": 4584 + }, + { + "epoch": 0.29, + "grad_norm": 0.641832623872991, + "learning_rate": 8.350040574065093e-06, + "loss": 0.4916, + "step": 4585 + }, + { + "epoch": 0.29, + "grad_norm": 1.6668764451013662, + "learning_rate": 8.349284434687001e-06, + "loss": 0.3373, + "step": 4586 + }, + { + "epoch": 0.29, + "grad_norm": 0.5941483496192381, + "learning_rate": 8.348528156341148e-06, + "loss": 0.4646, + "step": 4587 + }, + { + "epoch": 0.29, + "grad_norm": 2.479937255342335, + "learning_rate": 8.347771739058917e-06, + "loss": 0.3096, + "step": 4588 + }, + { + "epoch": 0.29, + "grad_norm": 2.488931151716721, + "learning_rate": 8.34701518287169e-06, + "loss": 0.3407, + "step": 4589 + }, + { + "epoch": 0.29, + "grad_norm": 1.957149558452516, + "learning_rate": 8.346258487810855e-06, + "loss": 0.3442, + "step": 4590 + }, + { + "epoch": 0.29, + "grad_norm": 2.175477160163789, + "learning_rate": 8.345501653907813e-06, + "loss": 0.3293, + "step": 4591 + }, + { + "epoch": 0.29, + "grad_norm": 0.7340860703640896, + "learning_rate": 8.344744681193966e-06, + "loss": 0.5371, + "step": 4592 + }, + { + "epoch": 0.29, + "grad_norm": 1.4367219093086712, + "learning_rate": 8.343987569700723e-06, + "loss": 0.3157, + "step": 4593 + }, + { + "epoch": 0.29, + "grad_norm": 4.348886619276724, + "learning_rate": 8.343230319459493e-06, + "loss": 0.3172, + "step": 4594 + }, + { + "epoch": 0.29, + "grad_norm": 1.85393782325142, + "learning_rate": 8.3424729305017e-06, + "loss": 0.3538, + "step": 4595 + }, + { + "epoch": 0.29, + "grad_norm": 1.5583309786518522, + "learning_rate": 8.341715402858768e-06, + "loss": 0.3281, + "step": 4596 + }, + { + "epoch": 0.29, + "grad_norm": 1.494209692042095, + "learning_rate": 8.340957736562127e-06, + "loss": 0.3219, + "step": 4597 + }, + { + "epoch": 0.29, + "grad_norm": 2.722372494117495, + "learning_rate": 8.340199931643218e-06, + "loss": 0.3217, + "step": 4598 + }, + { + "epoch": 0.29, + "grad_norm": 1.2868989733023135, + "learning_rate": 8.339441988133478e-06, + "loss": 0.3336, + "step": 4599 + }, + { + "epoch": 0.29, + "grad_norm": 2.7866223372150567, + "learning_rate": 8.338683906064361e-06, + "loss": 0.318, + "step": 4600 + }, + { + "epoch": 0.29, + "grad_norm": 2.2134149476295644, + "learning_rate": 8.337925685467318e-06, + "loss": 0.3155, + "step": 4601 + }, + { + "epoch": 0.29, + "grad_norm": 1.5640175712540683, + "learning_rate": 8.337167326373809e-06, + "loss": 0.3295, + "step": 4602 + }, + { + "epoch": 0.29, + "grad_norm": 1.7998947475940565, + "learning_rate": 8.3364088288153e-06, + "loss": 0.3357, + "step": 4603 + }, + { + "epoch": 0.29, + "grad_norm": 1.6051205017039099, + "learning_rate": 8.335650192823263e-06, + "loss": 0.3163, + "step": 4604 + }, + { + "epoch": 0.29, + "grad_norm": 2.161571352742066, + "learning_rate": 8.334891418429174e-06, + "loss": 0.3105, + "step": 4605 + }, + { + "epoch": 0.29, + "grad_norm": 5.988979064688914, + "learning_rate": 8.334132505664519e-06, + "loss": 0.3223, + "step": 4606 + }, + { + "epoch": 0.29, + "grad_norm": 3.736417459213877, + "learning_rate": 8.333373454560782e-06, + "loss": 0.3307, + "step": 4607 + }, + { + "epoch": 0.29, + "grad_norm": 1.6014179630032326, + "learning_rate": 8.33261426514946e-06, + "loss": 0.3223, + "step": 4608 + }, + { + "epoch": 0.29, + "grad_norm": 0.657381201666818, + "learning_rate": 8.331854937462052e-06, + "loss": 0.5172, + "step": 4609 + }, + { + "epoch": 0.29, + "grad_norm": 1.3674659918524494, + "learning_rate": 8.331095471530066e-06, + "loss": 0.3325, + "step": 4610 + }, + { + "epoch": 0.29, + "grad_norm": 3.3468979278569857, + "learning_rate": 8.33033586738501e-06, + "loss": 0.3156, + "step": 4611 + }, + { + "epoch": 0.29, + "grad_norm": 1.7585464608941155, + "learning_rate": 8.329576125058406e-06, + "loss": 0.3245, + "step": 4612 + }, + { + "epoch": 0.29, + "grad_norm": 13.77020996096185, + "learning_rate": 8.328816244581774e-06, + "loss": 0.3179, + "step": 4613 + }, + { + "epoch": 0.29, + "grad_norm": 2.35598120381295, + "learning_rate": 8.328056225986642e-06, + "loss": 0.3384, + "step": 4614 + }, + { + "epoch": 0.29, + "grad_norm": 1.6658406559141374, + "learning_rate": 8.327296069304547e-06, + "loss": 0.3289, + "step": 4615 + }, + { + "epoch": 0.29, + "grad_norm": 1.467570226639747, + "learning_rate": 8.326535774567029e-06, + "loss": 0.3234, + "step": 4616 + }, + { + "epoch": 0.29, + "grad_norm": 1.6365120975622698, + "learning_rate": 8.325775341805634e-06, + "loss": 0.3279, + "step": 4617 + }, + { + "epoch": 0.29, + "grad_norm": 1.7478469575446611, + "learning_rate": 8.325014771051909e-06, + "loss": 0.3288, + "step": 4618 + }, + { + "epoch": 0.29, + "grad_norm": 1.997977303998103, + "learning_rate": 8.324254062337419e-06, + "loss": 0.3227, + "step": 4619 + }, + { + "epoch": 0.29, + "grad_norm": 1.9471147861575473, + "learning_rate": 8.323493215693721e-06, + "loss": 0.3211, + "step": 4620 + }, + { + "epoch": 0.29, + "grad_norm": 1.7943934292722723, + "learning_rate": 8.322732231152386e-06, + "loss": 0.3436, + "step": 4621 + }, + { + "epoch": 0.29, + "grad_norm": 3.04573953156783, + "learning_rate": 8.32197110874499e-06, + "loss": 0.3142, + "step": 4622 + }, + { + "epoch": 0.29, + "grad_norm": 1.8827940454540946, + "learning_rate": 8.321209848503112e-06, + "loss": 0.3174, + "step": 4623 + }, + { + "epoch": 0.29, + "grad_norm": 1.3611606771710143, + "learning_rate": 8.320448450458339e-06, + "loss": 0.3154, + "step": 4624 + }, + { + "epoch": 0.29, + "grad_norm": 1.3159068448606388, + "learning_rate": 8.319686914642261e-06, + "loss": 0.3214, + "step": 4625 + }, + { + "epoch": 0.29, + "grad_norm": 0.6221485082863762, + "learning_rate": 8.318925241086477e-06, + "loss": 0.5184, + "step": 4626 + }, + { + "epoch": 0.29, + "grad_norm": 2.030321585008596, + "learning_rate": 8.31816342982259e-06, + "loss": 0.3225, + "step": 4627 + }, + { + "epoch": 0.29, + "grad_norm": 1.3398635464810131, + "learning_rate": 8.317401480882207e-06, + "loss": 0.3279, + "step": 4628 + }, + { + "epoch": 0.29, + "grad_norm": 1.528329638237705, + "learning_rate": 8.316639394296947e-06, + "loss": 0.3319, + "step": 4629 + }, + { + "epoch": 0.29, + "grad_norm": 2.3018221375463925, + "learning_rate": 8.315877170098425e-06, + "loss": 0.3286, + "step": 4630 + }, + { + "epoch": 0.29, + "grad_norm": 3.8875170625119404, + "learning_rate": 8.315114808318269e-06, + "loss": 0.3098, + "step": 4631 + }, + { + "epoch": 0.29, + "grad_norm": 2.28899150095528, + "learning_rate": 8.314352308988114e-06, + "loss": 0.3377, + "step": 4632 + }, + { + "epoch": 0.29, + "grad_norm": 1.8717690001076601, + "learning_rate": 8.313589672139593e-06, + "loss": 0.3251, + "step": 4633 + }, + { + "epoch": 0.29, + "grad_norm": 0.5992361475426525, + "learning_rate": 8.312826897804352e-06, + "loss": 0.495, + "step": 4634 + }, + { + "epoch": 0.29, + "grad_norm": 1.815845661611426, + "learning_rate": 8.312063986014039e-06, + "loss": 0.3403, + "step": 4635 + }, + { + "epoch": 0.29, + "grad_norm": 1.9168548744208262, + "learning_rate": 8.311300936800306e-06, + "loss": 0.3238, + "step": 4636 + }, + { + "epoch": 0.29, + "grad_norm": 2.235787112400163, + "learning_rate": 8.310537750194819e-06, + "loss": 0.3287, + "step": 4637 + }, + { + "epoch": 0.29, + "grad_norm": 2.6813044200360205, + "learning_rate": 8.309774426229237e-06, + "loss": 0.3224, + "step": 4638 + }, + { + "epoch": 0.29, + "grad_norm": 3.4011164727267102, + "learning_rate": 8.309010964935236e-06, + "loss": 0.3136, + "step": 4639 + }, + { + "epoch": 0.29, + "grad_norm": 1.6158207209231936, + "learning_rate": 8.308247366344494e-06, + "loss": 0.3341, + "step": 4640 + }, + { + "epoch": 0.29, + "grad_norm": 1.733443758580431, + "learning_rate": 8.307483630488693e-06, + "loss": 0.3332, + "step": 4641 + }, + { + "epoch": 0.29, + "grad_norm": 9.319928529755119, + "learning_rate": 8.30671975739952e-06, + "loss": 0.3152, + "step": 4642 + }, + { + "epoch": 0.29, + "grad_norm": 3.8969761869791246, + "learning_rate": 8.305955747108672e-06, + "loss": 0.329, + "step": 4643 + }, + { + "epoch": 0.29, + "grad_norm": 1.934293984723378, + "learning_rate": 8.305191599647846e-06, + "loss": 0.3244, + "step": 4644 + }, + { + "epoch": 0.29, + "grad_norm": 2.1444513735312634, + "learning_rate": 8.304427315048752e-06, + "loss": 0.328, + "step": 4645 + }, + { + "epoch": 0.29, + "grad_norm": 2.0981589553165523, + "learning_rate": 8.303662893343099e-06, + "loss": 0.3279, + "step": 4646 + }, + { + "epoch": 0.29, + "grad_norm": 1.4599554538301995, + "learning_rate": 8.302898334562604e-06, + "loss": 0.3039, + "step": 4647 + }, + { + "epoch": 0.29, + "grad_norm": 2.818042291414395, + "learning_rate": 8.30213363873899e-06, + "loss": 0.3419, + "step": 4648 + }, + { + "epoch": 0.29, + "grad_norm": 1.6640616622599838, + "learning_rate": 8.301368805903988e-06, + "loss": 0.322, + "step": 4649 + }, + { + "epoch": 0.29, + "grad_norm": 1.3387610727095836, + "learning_rate": 8.300603836089329e-06, + "loss": 0.3032, + "step": 4650 + }, + { + "epoch": 0.29, + "grad_norm": 2.0141298870195135, + "learning_rate": 8.299838729326754e-06, + "loss": 0.3531, + "step": 4651 + }, + { + "epoch": 0.29, + "grad_norm": 1.9261902144520084, + "learning_rate": 8.29907348564801e-06, + "loss": 0.3197, + "step": 4652 + }, + { + "epoch": 0.29, + "grad_norm": 2.8634593622131974, + "learning_rate": 8.298308105084845e-06, + "loss": 0.3114, + "step": 4653 + }, + { + "epoch": 0.29, + "grad_norm": 2.5803961812105665, + "learning_rate": 8.29754258766902e-06, + "loss": 0.3529, + "step": 4654 + }, + { + "epoch": 0.29, + "grad_norm": 2.7374020598179145, + "learning_rate": 8.296776933432296e-06, + "loss": 0.348, + "step": 4655 + }, + { + "epoch": 0.29, + "grad_norm": 2.7131753006160757, + "learning_rate": 8.296011142406442e-06, + "loss": 0.331, + "step": 4656 + }, + { + "epoch": 0.29, + "grad_norm": 1.3262733550127155, + "learning_rate": 8.29524521462323e-06, + "loss": 0.3108, + "step": 4657 + }, + { + "epoch": 0.29, + "grad_norm": 2.006111614964186, + "learning_rate": 8.294479150114443e-06, + "loss": 0.346, + "step": 4658 + }, + { + "epoch": 0.29, + "grad_norm": 2.5931057728483875, + "learning_rate": 8.293712948911862e-06, + "loss": 0.3248, + "step": 4659 + }, + { + "epoch": 0.29, + "grad_norm": 1.5811684522172256, + "learning_rate": 8.292946611047282e-06, + "loss": 0.3283, + "step": 4660 + }, + { + "epoch": 0.29, + "grad_norm": 2.5822400594471997, + "learning_rate": 8.292180136552498e-06, + "loss": 0.3131, + "step": 4661 + }, + { + "epoch": 0.29, + "grad_norm": 3.3242294608492324, + "learning_rate": 8.29141352545931e-06, + "loss": 0.3421, + "step": 4662 + }, + { + "epoch": 0.29, + "grad_norm": 2.2683721125412544, + "learning_rate": 8.290646777799531e-06, + "loss": 0.3164, + "step": 4663 + }, + { + "epoch": 0.29, + "grad_norm": 1.7722620972330796, + "learning_rate": 8.289879893604973e-06, + "loss": 0.3224, + "step": 4664 + }, + { + "epoch": 0.29, + "grad_norm": 3.155502333050992, + "learning_rate": 8.289112872907454e-06, + "loss": 0.3285, + "step": 4665 + }, + { + "epoch": 0.29, + "grad_norm": 2.9188493048321282, + "learning_rate": 8.2883457157388e-06, + "loss": 0.3478, + "step": 4666 + }, + { + "epoch": 0.29, + "grad_norm": 2.03795725118682, + "learning_rate": 8.28757842213084e-06, + "loss": 0.339, + "step": 4667 + }, + { + "epoch": 0.29, + "grad_norm": 3.084669198820233, + "learning_rate": 8.286810992115413e-06, + "loss": 0.3402, + "step": 4668 + }, + { + "epoch": 0.29, + "grad_norm": 1.9010436454227169, + "learning_rate": 8.28604342572436e-06, + "loss": 0.2943, + "step": 4669 + }, + { + "epoch": 0.29, + "grad_norm": 1.8805846955981191, + "learning_rate": 8.28527572298953e-06, + "loss": 0.3494, + "step": 4670 + }, + { + "epoch": 0.29, + "grad_norm": 1.5105951866836607, + "learning_rate": 8.284507883942771e-06, + "loss": 0.3298, + "step": 4671 + }, + { + "epoch": 0.29, + "grad_norm": 1.8701312657424063, + "learning_rate": 8.283739908615949e-06, + "loss": 0.3401, + "step": 4672 + }, + { + "epoch": 0.29, + "grad_norm": 1.9356588574831817, + "learning_rate": 8.282971797040925e-06, + "loss": 0.326, + "step": 4673 + }, + { + "epoch": 0.29, + "grad_norm": 1.5187844902542402, + "learning_rate": 8.28220354924957e-06, + "loss": 0.3107, + "step": 4674 + }, + { + "epoch": 0.29, + "grad_norm": 2.1035198049693054, + "learning_rate": 8.28143516527376e-06, + "loss": 0.3319, + "step": 4675 + }, + { + "epoch": 0.29, + "grad_norm": 22.636282393178313, + "learning_rate": 8.280666645145377e-06, + "loss": 0.3254, + "step": 4676 + }, + { + "epoch": 0.29, + "grad_norm": 9.341425307095218, + "learning_rate": 8.279897988896306e-06, + "loss": 0.3503, + "step": 4677 + }, + { + "epoch": 0.29, + "grad_norm": 3.2542353033770426, + "learning_rate": 8.279129196558443e-06, + "loss": 0.3463, + "step": 4678 + }, + { + "epoch": 0.29, + "grad_norm": 2.656661930080295, + "learning_rate": 8.278360268163686e-06, + "loss": 0.3345, + "step": 4679 + }, + { + "epoch": 0.29, + "grad_norm": 3.2763264469002285, + "learning_rate": 8.277591203743937e-06, + "loss": 0.3364, + "step": 4680 + }, + { + "epoch": 0.29, + "grad_norm": 1.5888613647386105, + "learning_rate": 8.276822003331108e-06, + "loss": 0.3112, + "step": 4681 + }, + { + "epoch": 0.29, + "grad_norm": 2.301783583669015, + "learning_rate": 8.276052666957113e-06, + "loss": 0.3297, + "step": 4682 + }, + { + "epoch": 0.29, + "grad_norm": 1.6590518295557342, + "learning_rate": 8.275283194653876e-06, + "loss": 0.3207, + "step": 4683 + }, + { + "epoch": 0.29, + "grad_norm": 3.573784037298755, + "learning_rate": 8.27451358645332e-06, + "loss": 0.3242, + "step": 4684 + }, + { + "epoch": 0.29, + "grad_norm": 0.6889412945575218, + "learning_rate": 8.27374384238738e-06, + "loss": 0.5102, + "step": 4685 + }, + { + "epoch": 0.29, + "grad_norm": 2.0184875092933416, + "learning_rate": 8.272973962487991e-06, + "loss": 0.3179, + "step": 4686 + }, + { + "epoch": 0.29, + "grad_norm": 2.7308184830125093, + "learning_rate": 8.272203946787101e-06, + "loss": 0.3371, + "step": 4687 + }, + { + "epoch": 0.29, + "grad_norm": 9.621088868982925, + "learning_rate": 8.271433795316655e-06, + "loss": 0.319, + "step": 4688 + }, + { + "epoch": 0.29, + "grad_norm": 2.0747084076055846, + "learning_rate": 8.270663508108611e-06, + "loss": 0.3242, + "step": 4689 + }, + { + "epoch": 0.29, + "grad_norm": 0.6603367027911864, + "learning_rate": 8.26989308519493e-06, + "loss": 0.5008, + "step": 4690 + }, + { + "epoch": 0.3, + "grad_norm": 2.85065133230469, + "learning_rate": 8.269122526607577e-06, + "loss": 0.3447, + "step": 4691 + }, + { + "epoch": 0.3, + "grad_norm": 1.8972568965185854, + "learning_rate": 8.268351832378521e-06, + "loss": 0.3276, + "step": 4692 + }, + { + "epoch": 0.3, + "grad_norm": 3.379889960167494, + "learning_rate": 8.267581002539745e-06, + "loss": 0.3097, + "step": 4693 + }, + { + "epoch": 0.3, + "grad_norm": 5.813434996376796, + "learning_rate": 8.266810037123229e-06, + "loss": 0.331, + "step": 4694 + }, + { + "epoch": 0.3, + "grad_norm": 5.022182380703507, + "learning_rate": 8.26603893616096e-06, + "loss": 0.3046, + "step": 4695 + }, + { + "epoch": 0.3, + "grad_norm": 3.313427973205733, + "learning_rate": 8.265267699684937e-06, + "loss": 0.3517, + "step": 4696 + }, + { + "epoch": 0.3, + "grad_norm": 2.2638290308411952, + "learning_rate": 8.264496327727157e-06, + "loss": 0.3138, + "step": 4697 + }, + { + "epoch": 0.3, + "grad_norm": 2.1232531724630594, + "learning_rate": 8.263724820319624e-06, + "loss": 0.3038, + "step": 4698 + }, + { + "epoch": 0.3, + "grad_norm": 1.553147675273588, + "learning_rate": 8.262953177494353e-06, + "loss": 0.314, + "step": 4699 + }, + { + "epoch": 0.3, + "grad_norm": 3.351131033381673, + "learning_rate": 8.262181399283359e-06, + "loss": 0.3085, + "step": 4700 + }, + { + "epoch": 0.3, + "grad_norm": 1.609495727538814, + "learning_rate": 8.261409485718663e-06, + "loss": 0.3145, + "step": 4701 + }, + { + "epoch": 0.3, + "grad_norm": 1.9284030865348525, + "learning_rate": 8.260637436832295e-06, + "loss": 0.3384, + "step": 4702 + }, + { + "epoch": 0.3, + "grad_norm": 1.7319450065480584, + "learning_rate": 8.259865252656289e-06, + "loss": 0.319, + "step": 4703 + }, + { + "epoch": 0.3, + "grad_norm": 0.6200426032426879, + "learning_rate": 8.259092933222683e-06, + "loss": 0.4603, + "step": 4704 + }, + { + "epoch": 0.3, + "grad_norm": 9.693101313184172, + "learning_rate": 8.258320478563524e-06, + "loss": 0.3158, + "step": 4705 + }, + { + "epoch": 0.3, + "grad_norm": 2.889716943238856, + "learning_rate": 8.25754788871086e-06, + "loss": 0.3037, + "step": 4706 + }, + { + "epoch": 0.3, + "grad_norm": 1.4358539033489492, + "learning_rate": 8.256775163696746e-06, + "loss": 0.3106, + "step": 4707 + }, + { + "epoch": 0.3, + "grad_norm": 9.238543110556, + "learning_rate": 8.256002303553248e-06, + "loss": 0.337, + "step": 4708 + }, + { + "epoch": 0.3, + "grad_norm": 1.5476751159653763, + "learning_rate": 8.25522930831243e-06, + "loss": 0.3175, + "step": 4709 + }, + { + "epoch": 0.3, + "grad_norm": 2.3005169638419916, + "learning_rate": 8.254456178006368e-06, + "loss": 0.337, + "step": 4710 + }, + { + "epoch": 0.3, + "grad_norm": 0.599553246533363, + "learning_rate": 8.253682912667136e-06, + "loss": 0.4831, + "step": 4711 + }, + { + "epoch": 0.3, + "grad_norm": 0.5740293443919963, + "learning_rate": 8.252909512326821e-06, + "loss": 0.4913, + "step": 4712 + }, + { + "epoch": 0.3, + "grad_norm": 2.546963579802148, + "learning_rate": 8.252135977017513e-06, + "loss": 0.3189, + "step": 4713 + }, + { + "epoch": 0.3, + "grad_norm": 4.429125795642956, + "learning_rate": 8.251362306771306e-06, + "loss": 0.3568, + "step": 4714 + }, + { + "epoch": 0.3, + "grad_norm": 5.189305238838425, + "learning_rate": 8.250588501620305e-06, + "loss": 0.3221, + "step": 4715 + }, + { + "epoch": 0.3, + "grad_norm": 3.5843130933913105, + "learning_rate": 8.249814561596612e-06, + "loss": 0.3119, + "step": 4716 + }, + { + "epoch": 0.3, + "grad_norm": 3.071138163688954, + "learning_rate": 8.24904048673234e-06, + "loss": 0.3593, + "step": 4717 + }, + { + "epoch": 0.3, + "grad_norm": 2.5238599435112223, + "learning_rate": 8.248266277059607e-06, + "loss": 0.3147, + "step": 4718 + }, + { + "epoch": 0.3, + "grad_norm": 3.1740048627002473, + "learning_rate": 8.247491932610536e-06, + "loss": 0.3211, + "step": 4719 + }, + { + "epoch": 0.3, + "grad_norm": 4.140818851619252, + "learning_rate": 8.246717453417257e-06, + "loss": 0.3097, + "step": 4720 + }, + { + "epoch": 0.3, + "grad_norm": 1.4118365828492045, + "learning_rate": 8.245942839511905e-06, + "loss": 0.3084, + "step": 4721 + }, + { + "epoch": 0.3, + "grad_norm": 1.8780104654279364, + "learning_rate": 8.245168090926618e-06, + "loss": 0.3362, + "step": 4722 + }, + { + "epoch": 0.3, + "grad_norm": 3.509275582536811, + "learning_rate": 8.244393207693544e-06, + "loss": 0.3217, + "step": 4723 + }, + { + "epoch": 0.3, + "grad_norm": 3.307156329842178, + "learning_rate": 8.243618189844831e-06, + "loss": 0.3173, + "step": 4724 + }, + { + "epoch": 0.3, + "grad_norm": 1.6175065624426581, + "learning_rate": 8.24284303741264e-06, + "loss": 0.3281, + "step": 4725 + }, + { + "epoch": 0.3, + "grad_norm": 2.091261383505988, + "learning_rate": 8.242067750429131e-06, + "loss": 0.3256, + "step": 4726 + }, + { + "epoch": 0.3, + "grad_norm": 2.6042560866381548, + "learning_rate": 8.241292328926473e-06, + "loss": 0.3124, + "step": 4727 + }, + { + "epoch": 0.3, + "grad_norm": 1.4559835196693374, + "learning_rate": 8.240516772936837e-06, + "loss": 0.3455, + "step": 4728 + }, + { + "epoch": 0.3, + "grad_norm": 1.4597397892968809, + "learning_rate": 8.239741082492405e-06, + "loss": 0.329, + "step": 4729 + }, + { + "epoch": 0.3, + "grad_norm": 0.714973562946189, + "learning_rate": 8.238965257625363e-06, + "loss": 0.5296, + "step": 4730 + }, + { + "epoch": 0.3, + "grad_norm": 2.8917801488245396, + "learning_rate": 8.238189298367898e-06, + "loss": 0.3334, + "step": 4731 + }, + { + "epoch": 0.3, + "grad_norm": 2.188231882240646, + "learning_rate": 8.237413204752208e-06, + "loss": 0.3409, + "step": 4732 + }, + { + "epoch": 0.3, + "grad_norm": 2.7461597182270046, + "learning_rate": 8.236636976810492e-06, + "loss": 0.3443, + "step": 4733 + }, + { + "epoch": 0.3, + "grad_norm": 3.224238394081794, + "learning_rate": 8.235860614574962e-06, + "loss": 0.3288, + "step": 4734 + }, + { + "epoch": 0.3, + "grad_norm": 1.68912360675444, + "learning_rate": 8.235084118077826e-06, + "loss": 0.2999, + "step": 4735 + }, + { + "epoch": 0.3, + "grad_norm": 3.6644920691114202, + "learning_rate": 8.234307487351303e-06, + "loss": 0.3414, + "step": 4736 + }, + { + "epoch": 0.3, + "grad_norm": 1.9254952544282533, + "learning_rate": 8.233530722427618e-06, + "loss": 0.3194, + "step": 4737 + }, + { + "epoch": 0.3, + "grad_norm": 1.7836344216234346, + "learning_rate": 8.232753823339001e-06, + "loss": 0.3346, + "step": 4738 + }, + { + "epoch": 0.3, + "grad_norm": 1.9185557546294354, + "learning_rate": 8.231976790117684e-06, + "loss": 0.336, + "step": 4739 + }, + { + "epoch": 0.3, + "grad_norm": 1.7885959894956402, + "learning_rate": 8.231199622795912e-06, + "loss": 0.3007, + "step": 4740 + }, + { + "epoch": 0.3, + "grad_norm": 1.401987209489957, + "learning_rate": 8.230422321405925e-06, + "loss": 0.3291, + "step": 4741 + }, + { + "epoch": 0.3, + "grad_norm": 2.816715072691335, + "learning_rate": 8.22964488597998e-06, + "loss": 0.3445, + "step": 4742 + }, + { + "epoch": 0.3, + "grad_norm": 1.8542396643594432, + "learning_rate": 8.228867316550332e-06, + "loss": 0.3322, + "step": 4743 + }, + { + "epoch": 0.3, + "grad_norm": 1.5081081724294076, + "learning_rate": 8.228089613149244e-06, + "loss": 0.3352, + "step": 4744 + }, + { + "epoch": 0.3, + "grad_norm": 1.6356742044834944, + "learning_rate": 8.227311775808984e-06, + "loss": 0.3042, + "step": 4745 + }, + { + "epoch": 0.3, + "grad_norm": 2.520840142695837, + "learning_rate": 8.226533804561828e-06, + "loss": 0.3259, + "step": 4746 + }, + { + "epoch": 0.3, + "grad_norm": 1.768885534148715, + "learning_rate": 8.22575569944005e-06, + "loss": 0.3233, + "step": 4747 + }, + { + "epoch": 0.3, + "grad_norm": 2.3583230985499926, + "learning_rate": 8.22497746047594e-06, + "loss": 0.3386, + "step": 4748 + }, + { + "epoch": 0.3, + "grad_norm": 1.5577941188261932, + "learning_rate": 8.224199087701787e-06, + "loss": 0.3178, + "step": 4749 + }, + { + "epoch": 0.3, + "grad_norm": 1.7525710060671866, + "learning_rate": 8.223420581149887e-06, + "loss": 0.3219, + "step": 4750 + }, + { + "epoch": 0.3, + "grad_norm": 13.948686355608059, + "learning_rate": 8.222641940852543e-06, + "loss": 0.3244, + "step": 4751 + }, + { + "epoch": 0.3, + "grad_norm": 1.5969601204197872, + "learning_rate": 8.221863166842057e-06, + "loss": 0.3273, + "step": 4752 + }, + { + "epoch": 0.3, + "grad_norm": 1.6072918833551268, + "learning_rate": 8.22108425915075e-06, + "loss": 0.32, + "step": 4753 + }, + { + "epoch": 0.3, + "grad_norm": 1.7374397474057224, + "learning_rate": 8.220305217810933e-06, + "loss": 0.3058, + "step": 4754 + }, + { + "epoch": 0.3, + "grad_norm": 1.5285658154607755, + "learning_rate": 8.219526042854933e-06, + "loss": 0.314, + "step": 4755 + }, + { + "epoch": 0.3, + "grad_norm": 3.6978959663770667, + "learning_rate": 8.21874673431508e-06, + "loss": 0.3296, + "step": 4756 + }, + { + "epoch": 0.3, + "grad_norm": 6.685246099049766, + "learning_rate": 8.217967292223707e-06, + "loss": 0.3229, + "step": 4757 + }, + { + "epoch": 0.3, + "grad_norm": 4.516889116784165, + "learning_rate": 8.217187716613156e-06, + "loss": 0.3214, + "step": 4758 + }, + { + "epoch": 0.3, + "grad_norm": 3.5307710719855767, + "learning_rate": 8.21640800751577e-06, + "loss": 0.3346, + "step": 4759 + }, + { + "epoch": 0.3, + "grad_norm": 2.339994045329839, + "learning_rate": 8.215628164963906e-06, + "loss": 0.3115, + "step": 4760 + }, + { + "epoch": 0.3, + "grad_norm": 1.643101351187922, + "learning_rate": 8.214848188989916e-06, + "loss": 0.3225, + "step": 4761 + }, + { + "epoch": 0.3, + "grad_norm": 0.7033291289125447, + "learning_rate": 8.214068079626166e-06, + "loss": 0.491, + "step": 4762 + }, + { + "epoch": 0.3, + "grad_norm": 3.1133177261523097, + "learning_rate": 8.213287836905021e-06, + "loss": 0.3404, + "step": 4763 + }, + { + "epoch": 0.3, + "grad_norm": 2.45630998250199, + "learning_rate": 8.212507460858857e-06, + "loss": 0.3314, + "step": 4764 + }, + { + "epoch": 0.3, + "grad_norm": 2.4375229160064453, + "learning_rate": 8.21172695152005e-06, + "loss": 0.3505, + "step": 4765 + }, + { + "epoch": 0.3, + "grad_norm": 2.0368390595621215, + "learning_rate": 8.21094630892099e-06, + "loss": 0.3092, + "step": 4766 + }, + { + "epoch": 0.3, + "grad_norm": 1.8435608026026553, + "learning_rate": 8.210165533094064e-06, + "loss": 0.3264, + "step": 4767 + }, + { + "epoch": 0.3, + "grad_norm": 17.623385415676598, + "learning_rate": 8.209384624071667e-06, + "loss": 0.3433, + "step": 4768 + }, + { + "epoch": 0.3, + "grad_norm": 2.2817442670181367, + "learning_rate": 8.208603581886203e-06, + "loss": 0.3367, + "step": 4769 + }, + { + "epoch": 0.3, + "grad_norm": 3.6621890877153036, + "learning_rate": 8.207822406570077e-06, + "loss": 0.3169, + "step": 4770 + }, + { + "epoch": 0.3, + "grad_norm": 3.3938859499120446, + "learning_rate": 8.207041098155701e-06, + "loss": 0.3193, + "step": 4771 + }, + { + "epoch": 0.3, + "grad_norm": 1.8851538537508348, + "learning_rate": 8.206259656675493e-06, + "loss": 0.3333, + "step": 4772 + }, + { + "epoch": 0.3, + "grad_norm": 3.531857536210539, + "learning_rate": 8.205478082161877e-06, + "loss": 0.3247, + "step": 4773 + }, + { + "epoch": 0.3, + "grad_norm": 2.44634320497755, + "learning_rate": 8.204696374647282e-06, + "loss": 0.3224, + "step": 4774 + }, + { + "epoch": 0.3, + "grad_norm": 4.280888599599943, + "learning_rate": 8.203914534164143e-06, + "loss": 0.3238, + "step": 4775 + }, + { + "epoch": 0.3, + "grad_norm": 1.6805330915645307, + "learning_rate": 8.203132560744898e-06, + "loss": 0.3106, + "step": 4776 + }, + { + "epoch": 0.3, + "grad_norm": 1.865548206504745, + "learning_rate": 8.202350454421995e-06, + "loss": 0.3333, + "step": 4777 + }, + { + "epoch": 0.3, + "grad_norm": 2.318842918556991, + "learning_rate": 8.201568215227882e-06, + "loss": 0.3232, + "step": 4778 + }, + { + "epoch": 0.3, + "grad_norm": 5.388261525992749, + "learning_rate": 8.200785843195019e-06, + "loss": 0.3273, + "step": 4779 + }, + { + "epoch": 0.3, + "grad_norm": 2.3297509743118145, + "learning_rate": 8.200003338355863e-06, + "loss": 0.3306, + "step": 4780 + }, + { + "epoch": 0.3, + "grad_norm": 2.393080028369493, + "learning_rate": 8.199220700742889e-06, + "loss": 0.3222, + "step": 4781 + }, + { + "epoch": 0.3, + "grad_norm": 2.6313111032338217, + "learning_rate": 8.198437930388562e-06, + "loss": 0.3494, + "step": 4782 + }, + { + "epoch": 0.3, + "grad_norm": 1.4352270482371632, + "learning_rate": 8.197655027325366e-06, + "loss": 0.2982, + "step": 4783 + }, + { + "epoch": 0.3, + "grad_norm": 2.206024029455773, + "learning_rate": 8.196871991585784e-06, + "loss": 0.3443, + "step": 4784 + }, + { + "epoch": 0.3, + "grad_norm": 1.653110086631432, + "learning_rate": 8.196088823202302e-06, + "loss": 0.3143, + "step": 4785 + }, + { + "epoch": 0.3, + "grad_norm": 3.2011751161782405, + "learning_rate": 8.19530552220742e-06, + "loss": 0.3529, + "step": 4786 + }, + { + "epoch": 0.3, + "grad_norm": 17.924828819173896, + "learning_rate": 8.194522088633637e-06, + "loss": 0.3324, + "step": 4787 + }, + { + "epoch": 0.3, + "grad_norm": 2.1398642044622593, + "learning_rate": 8.193738522513458e-06, + "loss": 0.3341, + "step": 4788 + }, + { + "epoch": 0.3, + "grad_norm": 2.7157670836447547, + "learning_rate": 8.192954823879392e-06, + "loss": 0.3054, + "step": 4789 + }, + { + "epoch": 0.3, + "grad_norm": 1.8737679459565855, + "learning_rate": 8.192170992763962e-06, + "loss": 0.2988, + "step": 4790 + }, + { + "epoch": 0.3, + "grad_norm": 1.999504728006175, + "learning_rate": 8.191387029199687e-06, + "loss": 0.3295, + "step": 4791 + }, + { + "epoch": 0.3, + "grad_norm": 0.7098577395923357, + "learning_rate": 8.190602933219096e-06, + "loss": 0.5288, + "step": 4792 + }, + { + "epoch": 0.3, + "grad_norm": 2.072568427445285, + "learning_rate": 8.189818704854721e-06, + "loss": 0.3392, + "step": 4793 + }, + { + "epoch": 0.3, + "grad_norm": 1.9416515731766864, + "learning_rate": 8.189034344139102e-06, + "loss": 0.3369, + "step": 4794 + }, + { + "epoch": 0.3, + "grad_norm": 2.5445706462693876, + "learning_rate": 8.188249851104785e-06, + "loss": 0.3238, + "step": 4795 + }, + { + "epoch": 0.3, + "grad_norm": 1.6009115325262264, + "learning_rate": 8.18746522578432e-06, + "loss": 0.3071, + "step": 4796 + }, + { + "epoch": 0.3, + "grad_norm": 1.8634447593031986, + "learning_rate": 8.186680468210258e-06, + "loss": 0.3042, + "step": 4797 + }, + { + "epoch": 0.3, + "grad_norm": 2.7643107661326325, + "learning_rate": 8.185895578415164e-06, + "loss": 0.3352, + "step": 4798 + }, + { + "epoch": 0.3, + "grad_norm": 1.7129930040941912, + "learning_rate": 8.185110556431606e-06, + "loss": 0.3261, + "step": 4799 + }, + { + "epoch": 0.3, + "grad_norm": 1.8987423140500093, + "learning_rate": 8.184325402292151e-06, + "loss": 0.3098, + "step": 4800 + }, + { + "epoch": 0.3, + "grad_norm": 3.5328024607127184, + "learning_rate": 8.18354011602938e-06, + "loss": 0.3157, + "step": 4801 + }, + { + "epoch": 0.3, + "grad_norm": 4.333752570392361, + "learning_rate": 8.182754697675875e-06, + "loss": 0.3052, + "step": 4802 + }, + { + "epoch": 0.3, + "grad_norm": 2.3306054321306156, + "learning_rate": 8.181969147264224e-06, + "loss": 0.3358, + "step": 4803 + }, + { + "epoch": 0.3, + "grad_norm": 2.1146384907583307, + "learning_rate": 8.181183464827022e-06, + "loss": 0.306, + "step": 4804 + }, + { + "epoch": 0.3, + "grad_norm": 2.0758202525059914, + "learning_rate": 8.180397650396868e-06, + "loss": 0.3044, + "step": 4805 + }, + { + "epoch": 0.3, + "grad_norm": 1.6975350539339977, + "learning_rate": 8.179611704006364e-06, + "loss": 0.3127, + "step": 4806 + }, + { + "epoch": 0.3, + "grad_norm": 2.4776622404553157, + "learning_rate": 8.178825625688126e-06, + "loss": 0.3298, + "step": 4807 + }, + { + "epoch": 0.3, + "grad_norm": 1.8269808749535046, + "learning_rate": 8.178039415474765e-06, + "loss": 0.3403, + "step": 4808 + }, + { + "epoch": 0.3, + "grad_norm": 2.145335388209343, + "learning_rate": 8.177253073398903e-06, + "loss": 0.3016, + "step": 4809 + }, + { + "epoch": 0.3, + "grad_norm": 2.8458166161902985, + "learning_rate": 8.17646659949317e-06, + "loss": 0.3338, + "step": 4810 + }, + { + "epoch": 0.3, + "grad_norm": 3.144466171744352, + "learning_rate": 8.175679993790194e-06, + "loss": 0.3207, + "step": 4811 + }, + { + "epoch": 0.3, + "grad_norm": 1.6336326139927528, + "learning_rate": 8.174893256322613e-06, + "loss": 0.3513, + "step": 4812 + }, + { + "epoch": 0.3, + "grad_norm": 2.51376091258254, + "learning_rate": 8.174106387123073e-06, + "loss": 0.3496, + "step": 4813 + }, + { + "epoch": 0.3, + "grad_norm": 1.7210806637465388, + "learning_rate": 8.173319386224222e-06, + "loss": 0.3484, + "step": 4814 + }, + { + "epoch": 0.3, + "grad_norm": 4.65241854735433, + "learning_rate": 8.172532253658711e-06, + "loss": 0.3292, + "step": 4815 + }, + { + "epoch": 0.3, + "grad_norm": 1.9905391000063817, + "learning_rate": 8.171744989459205e-06, + "loss": 0.3308, + "step": 4816 + }, + { + "epoch": 0.3, + "grad_norm": 1.7518605995680492, + "learning_rate": 8.170957593658365e-06, + "loss": 0.3068, + "step": 4817 + }, + { + "epoch": 0.3, + "grad_norm": 3.311984135292733, + "learning_rate": 8.170170066288861e-06, + "loss": 0.3356, + "step": 4818 + }, + { + "epoch": 0.3, + "grad_norm": 2.122085113950779, + "learning_rate": 8.169382407383368e-06, + "loss": 0.3172, + "step": 4819 + }, + { + "epoch": 0.3, + "grad_norm": 3.995065735215276, + "learning_rate": 8.168594616974572e-06, + "loss": 0.319, + "step": 4820 + }, + { + "epoch": 0.3, + "grad_norm": 3.2246262626361033, + "learning_rate": 8.167806695095157e-06, + "loss": 0.2997, + "step": 4821 + }, + { + "epoch": 0.3, + "grad_norm": 1.7612564023005002, + "learning_rate": 8.167018641777817e-06, + "loss": 0.3288, + "step": 4822 + }, + { + "epoch": 0.3, + "grad_norm": 1.8208704590702947, + "learning_rate": 8.166230457055247e-06, + "loss": 0.3219, + "step": 4823 + }, + { + "epoch": 0.3, + "grad_norm": 1.8920000445494063, + "learning_rate": 8.16544214096015e-06, + "loss": 0.3308, + "step": 4824 + }, + { + "epoch": 0.3, + "grad_norm": 1.997619539738235, + "learning_rate": 8.164653693525239e-06, + "loss": 0.3073, + "step": 4825 + }, + { + "epoch": 0.3, + "grad_norm": 1.3368032693286307, + "learning_rate": 8.163865114783223e-06, + "loss": 0.3083, + "step": 4826 + }, + { + "epoch": 0.3, + "grad_norm": 1.614647051404713, + "learning_rate": 8.163076404766823e-06, + "loss": 0.3022, + "step": 4827 + }, + { + "epoch": 0.3, + "grad_norm": 2.7694390217696063, + "learning_rate": 8.162287563508767e-06, + "loss": 0.3218, + "step": 4828 + }, + { + "epoch": 0.3, + "grad_norm": 1.3419304443359232, + "learning_rate": 8.161498591041783e-06, + "loss": 0.3246, + "step": 4829 + }, + { + "epoch": 0.3, + "grad_norm": 2.935574256862518, + "learning_rate": 8.160709487398605e-06, + "loss": 0.3258, + "step": 4830 + }, + { + "epoch": 0.3, + "grad_norm": 2.4843662681417458, + "learning_rate": 8.159920252611979e-06, + "loss": 0.3159, + "step": 4831 + }, + { + "epoch": 0.3, + "grad_norm": 1.47709063905116, + "learning_rate": 8.159130886714647e-06, + "loss": 0.3029, + "step": 4832 + }, + { + "epoch": 0.3, + "grad_norm": 5.059276053569302, + "learning_rate": 8.158341389739364e-06, + "loss": 0.3372, + "step": 4833 + }, + { + "epoch": 0.3, + "grad_norm": 0.7453200918215377, + "learning_rate": 8.157551761718886e-06, + "loss": 0.4911, + "step": 4834 + }, + { + "epoch": 0.3, + "grad_norm": 5.658487968733307, + "learning_rate": 8.156762002685978e-06, + "loss": 0.316, + "step": 4835 + }, + { + "epoch": 0.3, + "grad_norm": 4.804281861595954, + "learning_rate": 8.155972112673406e-06, + "loss": 0.3176, + "step": 4836 + }, + { + "epoch": 0.3, + "grad_norm": 2.20719583393348, + "learning_rate": 8.155182091713946e-06, + "loss": 0.3131, + "step": 4837 + }, + { + "epoch": 0.3, + "grad_norm": 1.3365859821620707, + "learning_rate": 8.154391939840379e-06, + "loss": 0.3064, + "step": 4838 + }, + { + "epoch": 0.3, + "grad_norm": 1.7261160213559534, + "learning_rate": 8.153601657085486e-06, + "loss": 0.3408, + "step": 4839 + }, + { + "epoch": 0.3, + "grad_norm": 1.563921809444925, + "learning_rate": 8.15281124348206e-06, + "loss": 0.3439, + "step": 4840 + }, + { + "epoch": 0.3, + "grad_norm": 1.779037035691217, + "learning_rate": 8.152020699062892e-06, + "loss": 0.3169, + "step": 4841 + }, + { + "epoch": 0.3, + "grad_norm": 1.6113109661634013, + "learning_rate": 8.15123002386079e-06, + "loss": 0.3282, + "step": 4842 + }, + { + "epoch": 0.3, + "grad_norm": 1.6786433123007816, + "learning_rate": 8.150439217908557e-06, + "loss": 0.2942, + "step": 4843 + }, + { + "epoch": 0.3, + "grad_norm": 1.3948528641322693, + "learning_rate": 8.149648281239005e-06, + "loss": 0.3266, + "step": 4844 + }, + { + "epoch": 0.3, + "grad_norm": 2.406817990679698, + "learning_rate": 8.148857213884948e-06, + "loss": 0.3394, + "step": 4845 + }, + { + "epoch": 0.3, + "grad_norm": 5.435589378225438, + "learning_rate": 8.148066015879214e-06, + "loss": 0.338, + "step": 4846 + }, + { + "epoch": 0.3, + "grad_norm": 73.95252745390995, + "learning_rate": 8.147274687254632e-06, + "loss": 0.3143, + "step": 4847 + }, + { + "epoch": 0.3, + "grad_norm": 1.933090324683608, + "learning_rate": 8.146483228044032e-06, + "loss": 0.3226, + "step": 4848 + }, + { + "epoch": 0.3, + "grad_norm": 3.083182320648684, + "learning_rate": 8.14569163828025e-06, + "loss": 0.3297, + "step": 4849 + }, + { + "epoch": 0.31, + "grad_norm": 3.81068733173486, + "learning_rate": 8.144899917996139e-06, + "loss": 0.3232, + "step": 4850 + }, + { + "epoch": 0.31, + "grad_norm": 2.5115646565506933, + "learning_rate": 8.144108067224544e-06, + "loss": 0.3594, + "step": 4851 + }, + { + "epoch": 0.31, + "grad_norm": 2.3704651812605, + "learning_rate": 8.14331608599832e-06, + "loss": 0.3486, + "step": 4852 + }, + { + "epoch": 0.31, + "grad_norm": 4.750737390975413, + "learning_rate": 8.142523974350327e-06, + "loss": 0.3344, + "step": 4853 + }, + { + "epoch": 0.31, + "grad_norm": 2.641300177158323, + "learning_rate": 8.141731732313434e-06, + "loss": 0.321, + "step": 4854 + }, + { + "epoch": 0.31, + "grad_norm": 1.3377095274666442, + "learning_rate": 8.140939359920512e-06, + "loss": 0.3266, + "step": 4855 + }, + { + "epoch": 0.31, + "grad_norm": 2.2454839829427358, + "learning_rate": 8.140146857204433e-06, + "loss": 0.345, + "step": 4856 + }, + { + "epoch": 0.31, + "grad_norm": 1.6052923028643362, + "learning_rate": 8.139354224198087e-06, + "loss": 0.3172, + "step": 4857 + }, + { + "epoch": 0.31, + "grad_norm": 3.2004381869875216, + "learning_rate": 8.13856146093436e-06, + "loss": 0.3211, + "step": 4858 + }, + { + "epoch": 0.31, + "grad_norm": 1.8407810825110196, + "learning_rate": 8.137768567446141e-06, + "loss": 0.3172, + "step": 4859 + }, + { + "epoch": 0.31, + "grad_norm": 1.8974771862862094, + "learning_rate": 8.136975543766328e-06, + "loss": 0.3347, + "step": 4860 + }, + { + "epoch": 0.31, + "grad_norm": 1.834938564558332, + "learning_rate": 8.136182389927832e-06, + "loss": 0.3347, + "step": 4861 + }, + { + "epoch": 0.31, + "grad_norm": 2.9249037333511576, + "learning_rate": 8.135389105963556e-06, + "loss": 0.3306, + "step": 4862 + }, + { + "epoch": 0.31, + "grad_norm": 1.869441650437946, + "learning_rate": 8.134595691906416e-06, + "loss": 0.3262, + "step": 4863 + }, + { + "epoch": 0.31, + "grad_norm": 2.4694686351112236, + "learning_rate": 8.133802147789337e-06, + "loss": 0.3013, + "step": 4864 + }, + { + "epoch": 0.31, + "grad_norm": 2.6259746653299456, + "learning_rate": 8.133008473645237e-06, + "loss": 0.3365, + "step": 4865 + }, + { + "epoch": 0.31, + "grad_norm": 1.679726853444609, + "learning_rate": 8.132214669507052e-06, + "loss": 0.3034, + "step": 4866 + }, + { + "epoch": 0.31, + "grad_norm": 2.8052295131154517, + "learning_rate": 8.131420735407716e-06, + "loss": 0.3467, + "step": 4867 + }, + { + "epoch": 0.31, + "grad_norm": 1.5368211521749648, + "learning_rate": 8.130626671380172e-06, + "loss": 0.3316, + "step": 4868 + }, + { + "epoch": 0.31, + "grad_norm": 3.1732339031379313, + "learning_rate": 8.129832477457367e-06, + "loss": 0.3287, + "step": 4869 + }, + { + "epoch": 0.31, + "grad_norm": 1.860292595498335, + "learning_rate": 8.129038153672255e-06, + "loss": 0.3396, + "step": 4870 + }, + { + "epoch": 0.31, + "grad_norm": 5.155789115355333, + "learning_rate": 8.128243700057788e-06, + "loss": 0.3324, + "step": 4871 + }, + { + "epoch": 0.31, + "grad_norm": 1.6100435575597394, + "learning_rate": 8.127449116646937e-06, + "loss": 0.311, + "step": 4872 + }, + { + "epoch": 0.31, + "grad_norm": 1.7138489497901312, + "learning_rate": 8.126654403472668e-06, + "loss": 0.3104, + "step": 4873 + }, + { + "epoch": 0.31, + "grad_norm": 1.7018536243420075, + "learning_rate": 8.125859560567952e-06, + "loss": 0.3157, + "step": 4874 + }, + { + "epoch": 0.31, + "grad_norm": 1.75272407707653, + "learning_rate": 8.12506458796577e-06, + "loss": 0.3296, + "step": 4875 + }, + { + "epoch": 0.31, + "grad_norm": 2.877593274176602, + "learning_rate": 8.124269485699108e-06, + "loss": 0.3159, + "step": 4876 + }, + { + "epoch": 0.31, + "grad_norm": 2.099054102035657, + "learning_rate": 8.123474253800958e-06, + "loss": 0.305, + "step": 4877 + }, + { + "epoch": 0.31, + "grad_norm": 1.6060449639620602, + "learning_rate": 8.12267889230431e-06, + "loss": 0.3247, + "step": 4878 + }, + { + "epoch": 0.31, + "grad_norm": 2.09337815632032, + "learning_rate": 8.12188340124217e-06, + "loss": 0.3221, + "step": 4879 + }, + { + "epoch": 0.31, + "grad_norm": 2.4440770016397826, + "learning_rate": 8.121087780647543e-06, + "loss": 0.3189, + "step": 4880 + }, + { + "epoch": 0.31, + "grad_norm": 11.290636335528498, + "learning_rate": 8.120292030553441e-06, + "loss": 0.3163, + "step": 4881 + }, + { + "epoch": 0.31, + "grad_norm": 2.955720730453499, + "learning_rate": 8.119496150992879e-06, + "loss": 0.3247, + "step": 4882 + }, + { + "epoch": 0.31, + "grad_norm": 2.585402146371944, + "learning_rate": 8.118700141998879e-06, + "loss": 0.328, + "step": 4883 + }, + { + "epoch": 0.31, + "grad_norm": 1.8193494686149803, + "learning_rate": 8.117904003604472e-06, + "loss": 0.3118, + "step": 4884 + }, + { + "epoch": 0.31, + "grad_norm": 3.5046889722588577, + "learning_rate": 8.117107735842693e-06, + "loss": 0.3476, + "step": 4885 + }, + { + "epoch": 0.31, + "grad_norm": 2.0278179379997585, + "learning_rate": 8.116311338746574e-06, + "loss": 0.3097, + "step": 4886 + }, + { + "epoch": 0.31, + "grad_norm": 2.3797793022230813, + "learning_rate": 8.115514812349163e-06, + "loss": 0.3331, + "step": 4887 + }, + { + "epoch": 0.31, + "grad_norm": 2.9909270540906427, + "learning_rate": 8.11471815668351e-06, + "loss": 0.3227, + "step": 4888 + }, + { + "epoch": 0.31, + "grad_norm": 2.5316997453624013, + "learning_rate": 8.113921371782667e-06, + "loss": 0.3279, + "step": 4889 + }, + { + "epoch": 0.31, + "grad_norm": 2.2260613296304994, + "learning_rate": 8.113124457679695e-06, + "loss": 0.2957, + "step": 4890 + }, + { + "epoch": 0.31, + "grad_norm": 3.1211395293278983, + "learning_rate": 8.112327414407662e-06, + "loss": 0.3133, + "step": 4891 + }, + { + "epoch": 0.31, + "grad_norm": 1.7679163209400715, + "learning_rate": 8.111530241999634e-06, + "loss": 0.3431, + "step": 4892 + }, + { + "epoch": 0.31, + "grad_norm": 3.741665979087409, + "learning_rate": 8.11073294048869e-06, + "loss": 0.3054, + "step": 4893 + }, + { + "epoch": 0.31, + "grad_norm": 1.2713386087154355, + "learning_rate": 8.109935509907911e-06, + "loss": 0.3154, + "step": 4894 + }, + { + "epoch": 0.31, + "grad_norm": 1.5499479964956717, + "learning_rate": 8.109137950290385e-06, + "loss": 0.2943, + "step": 4895 + }, + { + "epoch": 0.31, + "grad_norm": 1.6241753185389316, + "learning_rate": 8.108340261669204e-06, + "loss": 0.3314, + "step": 4896 + }, + { + "epoch": 0.31, + "grad_norm": 2.065475597908749, + "learning_rate": 8.107542444077461e-06, + "loss": 0.3514, + "step": 4897 + }, + { + "epoch": 0.31, + "grad_norm": 3.3892492590291545, + "learning_rate": 8.106744497548265e-06, + "loss": 0.3229, + "step": 4898 + }, + { + "epoch": 0.31, + "grad_norm": 1.7329539118650756, + "learning_rate": 8.105946422114721e-06, + "loss": 0.2981, + "step": 4899 + }, + { + "epoch": 0.31, + "grad_norm": 1.7567142614783802, + "learning_rate": 8.105148217809941e-06, + "loss": 0.3045, + "step": 4900 + }, + { + "epoch": 0.31, + "grad_norm": 1.6199260900646555, + "learning_rate": 8.104349884667048e-06, + "loss": 0.3203, + "step": 4901 + }, + { + "epoch": 0.31, + "grad_norm": 2.185619067488261, + "learning_rate": 8.103551422719164e-06, + "loss": 0.3026, + "step": 4902 + }, + { + "epoch": 0.31, + "grad_norm": 2.0196664867388665, + "learning_rate": 8.10275283199942e-06, + "loss": 0.3277, + "step": 4903 + }, + { + "epoch": 0.31, + "grad_norm": 1.4596505677728213, + "learning_rate": 8.10195411254095e-06, + "loss": 0.3148, + "step": 4904 + }, + { + "epoch": 0.31, + "grad_norm": 2.0219263492276407, + "learning_rate": 8.101155264376892e-06, + "loss": 0.3206, + "step": 4905 + }, + { + "epoch": 0.31, + "grad_norm": 2.787627867209791, + "learning_rate": 8.100356287540397e-06, + "loss": 0.3254, + "step": 4906 + }, + { + "epoch": 0.31, + "grad_norm": 2.410642521194272, + "learning_rate": 8.09955718206461e-06, + "loss": 0.3093, + "step": 4907 + }, + { + "epoch": 0.31, + "grad_norm": 2.0444840723299285, + "learning_rate": 8.098757947982692e-06, + "loss": 0.321, + "step": 4908 + }, + { + "epoch": 0.31, + "grad_norm": 2.4640748718609533, + "learning_rate": 8.097958585327801e-06, + "loss": 0.3154, + "step": 4909 + }, + { + "epoch": 0.31, + "grad_norm": 3.42997998563266, + "learning_rate": 8.097159094133106e-06, + "loss": 0.2988, + "step": 4910 + }, + { + "epoch": 0.31, + "grad_norm": 0.8019002532735567, + "learning_rate": 8.09635947443178e-06, + "loss": 0.5156, + "step": 4911 + }, + { + "epoch": 0.31, + "grad_norm": 2.2354178381952123, + "learning_rate": 8.095559726256998e-06, + "loss": 0.3281, + "step": 4912 + }, + { + "epoch": 0.31, + "grad_norm": 4.041393490542275, + "learning_rate": 8.094759849641946e-06, + "loss": 0.3238, + "step": 4913 + }, + { + "epoch": 0.31, + "grad_norm": 1.7435732778585893, + "learning_rate": 8.093959844619812e-06, + "loss": 0.3223, + "step": 4914 + }, + { + "epoch": 0.31, + "grad_norm": 2.9563665748240093, + "learning_rate": 8.093159711223788e-06, + "loss": 0.3196, + "step": 4915 + }, + { + "epoch": 0.31, + "grad_norm": 16.896494403766674, + "learning_rate": 8.09235944948707e-06, + "loss": 0.3184, + "step": 4916 + }, + { + "epoch": 0.31, + "grad_norm": 1.5403427987104072, + "learning_rate": 8.09155905944287e-06, + "loss": 0.3232, + "step": 4917 + }, + { + "epoch": 0.31, + "grad_norm": 2.278523899803821, + "learning_rate": 8.090758541124394e-06, + "loss": 0.313, + "step": 4918 + }, + { + "epoch": 0.31, + "grad_norm": 2.0731620283807266, + "learning_rate": 8.089957894564851e-06, + "loss": 0.3244, + "step": 4919 + }, + { + "epoch": 0.31, + "grad_norm": 3.9387298702383564, + "learning_rate": 8.08915711979747e-06, + "loss": 0.3258, + "step": 4920 + }, + { + "epoch": 0.31, + "grad_norm": 2.0861885799415876, + "learning_rate": 8.088356216855474e-06, + "loss": 0.3176, + "step": 4921 + }, + { + "epoch": 0.31, + "grad_norm": 3.3201997461856845, + "learning_rate": 8.087555185772093e-06, + "loss": 0.3651, + "step": 4922 + }, + { + "epoch": 0.31, + "grad_norm": 2.122575679762488, + "learning_rate": 8.08675402658056e-06, + "loss": 0.3112, + "step": 4923 + }, + { + "epoch": 0.31, + "grad_norm": 1.7670045849754261, + "learning_rate": 8.085952739314123e-06, + "loss": 0.3215, + "step": 4924 + }, + { + "epoch": 0.31, + "grad_norm": 1.9158837604562065, + "learning_rate": 8.085151324006024e-06, + "loss": 0.3022, + "step": 4925 + }, + { + "epoch": 0.31, + "grad_norm": 9.144228081612862, + "learning_rate": 8.084349780689517e-06, + "loss": 0.3358, + "step": 4926 + }, + { + "epoch": 0.31, + "grad_norm": 2.215449578261158, + "learning_rate": 8.08354810939786e-06, + "loss": 0.3011, + "step": 4927 + }, + { + "epoch": 0.31, + "grad_norm": 1.8327874943896953, + "learning_rate": 8.082746310164316e-06, + "loss": 0.3094, + "step": 4928 + }, + { + "epoch": 0.31, + "grad_norm": 2.907844742173548, + "learning_rate": 8.08194438302215e-06, + "loss": 0.3482, + "step": 4929 + }, + { + "epoch": 0.31, + "grad_norm": 3.2764122764388346, + "learning_rate": 8.081142328004638e-06, + "loss": 0.3163, + "step": 4930 + }, + { + "epoch": 0.31, + "grad_norm": 1.6559404359038419, + "learning_rate": 8.080340145145058e-06, + "loss": 0.3305, + "step": 4931 + }, + { + "epoch": 0.31, + "grad_norm": 3.8500527172802412, + "learning_rate": 8.079537834476692e-06, + "loss": 0.3137, + "step": 4932 + }, + { + "epoch": 0.31, + "grad_norm": 2.6798642355980546, + "learning_rate": 8.078735396032835e-06, + "loss": 0.3312, + "step": 4933 + }, + { + "epoch": 0.31, + "grad_norm": 1.9532205992175309, + "learning_rate": 8.077932829846774e-06, + "loss": 0.308, + "step": 4934 + }, + { + "epoch": 0.31, + "grad_norm": 2.587313719105938, + "learning_rate": 8.077130135951817e-06, + "loss": 0.3021, + "step": 4935 + }, + { + "epoch": 0.31, + "grad_norm": 0.7471957552749761, + "learning_rate": 8.076327314381262e-06, + "loss": 0.5175, + "step": 4936 + }, + { + "epoch": 0.31, + "grad_norm": 2.477394257055449, + "learning_rate": 8.075524365168426e-06, + "loss": 0.3153, + "step": 4937 + }, + { + "epoch": 0.31, + "grad_norm": 1.6740954872672904, + "learning_rate": 8.074721288346617e-06, + "loss": 0.3046, + "step": 4938 + }, + { + "epoch": 0.31, + "grad_norm": 3.0022588763356945, + "learning_rate": 8.073918083949164e-06, + "loss": 0.3143, + "step": 4939 + }, + { + "epoch": 0.31, + "grad_norm": 2.107769896618522, + "learning_rate": 8.073114752009388e-06, + "loss": 0.3132, + "step": 4940 + }, + { + "epoch": 0.31, + "grad_norm": 9.767220171014733, + "learning_rate": 8.072311292560624e-06, + "loss": 0.308, + "step": 4941 + }, + { + "epoch": 0.31, + "grad_norm": 6.565922059987354, + "learning_rate": 8.071507705636204e-06, + "loss": 0.3194, + "step": 4942 + }, + { + "epoch": 0.31, + "grad_norm": 2.574200775674833, + "learning_rate": 8.070703991269477e-06, + "loss": 0.308, + "step": 4943 + }, + { + "epoch": 0.31, + "grad_norm": 1.9724278178689933, + "learning_rate": 8.069900149493786e-06, + "loss": 0.3212, + "step": 4944 + }, + { + "epoch": 0.31, + "grad_norm": 1.8255427530008028, + "learning_rate": 8.069096180342486e-06, + "loss": 0.3095, + "step": 4945 + }, + { + "epoch": 0.31, + "grad_norm": 1.60634504768918, + "learning_rate": 8.068292083848932e-06, + "loss": 0.3256, + "step": 4946 + }, + { + "epoch": 0.31, + "grad_norm": 4.373028519392421, + "learning_rate": 8.067487860046492e-06, + "loss": 0.3094, + "step": 4947 + }, + { + "epoch": 0.31, + "grad_norm": 1.664247309678174, + "learning_rate": 8.06668350896853e-06, + "loss": 0.3132, + "step": 4948 + }, + { + "epoch": 0.31, + "grad_norm": 1.753065592450937, + "learning_rate": 8.065879030648423e-06, + "loss": 0.3261, + "step": 4949 + }, + { + "epoch": 0.31, + "grad_norm": 2.2169032541546447, + "learning_rate": 8.06507442511955e-06, + "loss": 0.318, + "step": 4950 + }, + { + "epoch": 0.31, + "grad_norm": 4.545690984191229, + "learning_rate": 8.064269692415296e-06, + "loss": 0.3255, + "step": 4951 + }, + { + "epoch": 0.31, + "grad_norm": 5.970656893769879, + "learning_rate": 8.063464832569048e-06, + "loss": 0.3155, + "step": 4952 + }, + { + "epoch": 0.31, + "grad_norm": 2.2575909231858047, + "learning_rate": 8.062659845614202e-06, + "loss": 0.3308, + "step": 4953 + }, + { + "epoch": 0.31, + "grad_norm": 1.801862396995543, + "learning_rate": 8.06185473158416e-06, + "loss": 0.33, + "step": 4954 + }, + { + "epoch": 0.31, + "grad_norm": 2.160705406548701, + "learning_rate": 8.061049490512326e-06, + "loss": 0.3401, + "step": 4955 + }, + { + "epoch": 0.31, + "grad_norm": 2.0949235628577947, + "learning_rate": 8.060244122432116e-06, + "loss": 0.3147, + "step": 4956 + }, + { + "epoch": 0.31, + "grad_norm": 2.941835161034645, + "learning_rate": 8.059438627376936e-06, + "loss": 0.3074, + "step": 4957 + }, + { + "epoch": 0.31, + "grad_norm": 5.744498126916404, + "learning_rate": 8.058633005380216e-06, + "loss": 0.3168, + "step": 4958 + }, + { + "epoch": 0.31, + "grad_norm": 0.7371791076787418, + "learning_rate": 8.05782725647538e-06, + "loss": 0.4851, + "step": 4959 + }, + { + "epoch": 0.31, + "grad_norm": 2.648512559719333, + "learning_rate": 8.057021380695858e-06, + "loss": 0.3249, + "step": 4960 + }, + { + "epoch": 0.31, + "grad_norm": 1.8496177680693084, + "learning_rate": 8.05621537807509e-06, + "loss": 0.3165, + "step": 4961 + }, + { + "epoch": 0.31, + "grad_norm": 5.874420013697325, + "learning_rate": 8.055409248646517e-06, + "loss": 0.308, + "step": 4962 + }, + { + "epoch": 0.31, + "grad_norm": 1.9362228664429233, + "learning_rate": 8.05460299244359e-06, + "loss": 0.3011, + "step": 4963 + }, + { + "epoch": 0.31, + "grad_norm": 1.8530717830919292, + "learning_rate": 8.053796609499755e-06, + "loss": 0.3365, + "step": 4964 + }, + { + "epoch": 0.31, + "grad_norm": 2.188405320612547, + "learning_rate": 8.052990099848478e-06, + "loss": 0.3054, + "step": 4965 + }, + { + "epoch": 0.31, + "grad_norm": 2.9289555558700955, + "learning_rate": 8.052183463523218e-06, + "loss": 0.3065, + "step": 4966 + }, + { + "epoch": 0.31, + "grad_norm": 5.985572830928523, + "learning_rate": 8.051376700557445e-06, + "loss": 0.3128, + "step": 4967 + }, + { + "epoch": 0.31, + "grad_norm": 1.9974222898370826, + "learning_rate": 8.050569810984632e-06, + "loss": 0.3191, + "step": 4968 + }, + { + "epoch": 0.31, + "grad_norm": 2.7153716759739575, + "learning_rate": 8.049762794838258e-06, + "loss": 0.3156, + "step": 4969 + }, + { + "epoch": 0.31, + "grad_norm": 1.9904526917215217, + "learning_rate": 8.04895565215181e-06, + "loss": 0.3171, + "step": 4970 + }, + { + "epoch": 0.31, + "grad_norm": 2.6899332359955075, + "learning_rate": 8.048148382958777e-06, + "loss": 0.3442, + "step": 4971 + }, + { + "epoch": 0.31, + "grad_norm": 3.683926435589481, + "learning_rate": 8.047340987292653e-06, + "loss": 0.3386, + "step": 4972 + }, + { + "epoch": 0.31, + "grad_norm": 1.9911786050649964, + "learning_rate": 8.046533465186939e-06, + "loss": 0.3041, + "step": 4973 + }, + { + "epoch": 0.31, + "grad_norm": 2.073778338160897, + "learning_rate": 8.04572581667514e-06, + "loss": 0.312, + "step": 4974 + }, + { + "epoch": 0.31, + "grad_norm": 1.57490476913578, + "learning_rate": 8.044918041790768e-06, + "loss": 0.3215, + "step": 4975 + }, + { + "epoch": 0.31, + "grad_norm": 1.919997197183369, + "learning_rate": 8.044110140567337e-06, + "loss": 0.3131, + "step": 4976 + }, + { + "epoch": 0.31, + "grad_norm": 0.6494139966985812, + "learning_rate": 8.04330211303837e-06, + "loss": 0.4924, + "step": 4977 + }, + { + "epoch": 0.31, + "grad_norm": 0.7070700658497352, + "learning_rate": 8.042493959237391e-06, + "loss": 0.5016, + "step": 4978 + }, + { + "epoch": 0.31, + "grad_norm": 2.0483476573986326, + "learning_rate": 8.041685679197936e-06, + "loss": 0.3127, + "step": 4979 + }, + { + "epoch": 0.31, + "grad_norm": 2.9444596376193637, + "learning_rate": 8.040877272953538e-06, + "loss": 0.3319, + "step": 4980 + }, + { + "epoch": 0.31, + "grad_norm": 3.477915831462931, + "learning_rate": 8.040068740537741e-06, + "loss": 0.3317, + "step": 4981 + }, + { + "epoch": 0.31, + "grad_norm": 2.230217332345066, + "learning_rate": 8.039260081984092e-06, + "loss": 0.3055, + "step": 4982 + }, + { + "epoch": 0.31, + "grad_norm": 1.7654025561999291, + "learning_rate": 8.038451297326146e-06, + "loss": 0.3122, + "step": 4983 + }, + { + "epoch": 0.31, + "grad_norm": 1.9280784523657415, + "learning_rate": 8.037642386597456e-06, + "loss": 0.308, + "step": 4984 + }, + { + "epoch": 0.31, + "grad_norm": 2.166152756269824, + "learning_rate": 8.03683334983159e-06, + "loss": 0.333, + "step": 4985 + }, + { + "epoch": 0.31, + "grad_norm": 5.922670774962704, + "learning_rate": 8.036024187062113e-06, + "loss": 0.3172, + "step": 4986 + }, + { + "epoch": 0.31, + "grad_norm": 3.9193641648410344, + "learning_rate": 8.0352148983226e-06, + "loss": 0.3169, + "step": 4987 + }, + { + "epoch": 0.31, + "grad_norm": 6.648087946869346, + "learning_rate": 8.03440548364663e-06, + "loss": 0.3145, + "step": 4988 + }, + { + "epoch": 0.31, + "grad_norm": 1.9904013501905227, + "learning_rate": 8.033595943067786e-06, + "loss": 0.3083, + "step": 4989 + }, + { + "epoch": 0.31, + "grad_norm": 6.056318935392141, + "learning_rate": 8.032786276619658e-06, + "loss": 0.349, + "step": 4990 + }, + { + "epoch": 0.31, + "grad_norm": 2.7745935112318016, + "learning_rate": 8.031976484335841e-06, + "loss": 0.3216, + "step": 4991 + }, + { + "epoch": 0.31, + "grad_norm": 2.2124179343633745, + "learning_rate": 8.031166566249935e-06, + "loss": 0.3111, + "step": 4992 + }, + { + "epoch": 0.31, + "grad_norm": 3.971833357269287, + "learning_rate": 8.030356522395544e-06, + "loss": 0.3174, + "step": 4993 + }, + { + "epoch": 0.31, + "grad_norm": 4.053995436223221, + "learning_rate": 8.029546352806277e-06, + "loss": 0.3173, + "step": 4994 + }, + { + "epoch": 0.31, + "grad_norm": 12.351064506842642, + "learning_rate": 8.028736057515753e-06, + "loss": 0.3318, + "step": 4995 + }, + { + "epoch": 0.31, + "grad_norm": 2.1149409158563297, + "learning_rate": 8.02792563655759e-06, + "loss": 0.3247, + "step": 4996 + }, + { + "epoch": 0.31, + "grad_norm": 1.9858625097368985, + "learning_rate": 8.027115089965413e-06, + "loss": 0.3302, + "step": 4997 + }, + { + "epoch": 0.31, + "grad_norm": 4.136126573258584, + "learning_rate": 8.026304417772854e-06, + "loss": 0.326, + "step": 4998 + }, + { + "epoch": 0.31, + "grad_norm": 2.0309427426539384, + "learning_rate": 8.025493620013549e-06, + "loss": 0.3437, + "step": 4999 + }, + { + "epoch": 0.31, + "grad_norm": 3.14193524045625, + "learning_rate": 8.02468269672114e-06, + "loss": 0.3162, + "step": 5000 + }, + { + "epoch": 0.31, + "grad_norm": 2.995483997129608, + "learning_rate": 8.023871647929276e-06, + "loss": 0.3087, + "step": 5001 + }, + { + "epoch": 0.31, + "grad_norm": 2.3277628667057098, + "learning_rate": 8.023060473671605e-06, + "loss": 0.3102, + "step": 5002 + }, + { + "epoch": 0.31, + "grad_norm": 1.9788008089983757, + "learning_rate": 8.022249173981784e-06, + "loss": 0.3104, + "step": 5003 + }, + { + "epoch": 0.31, + "grad_norm": 0.6996974523522912, + "learning_rate": 8.021437748893478e-06, + "loss": 0.4972, + "step": 5004 + }, + { + "epoch": 0.31, + "grad_norm": 3.2174180211368424, + "learning_rate": 8.020626198440352e-06, + "loss": 0.3132, + "step": 5005 + }, + { + "epoch": 0.31, + "grad_norm": 1.640476656829514, + "learning_rate": 8.01981452265608e-06, + "loss": 0.3143, + "step": 5006 + }, + { + "epoch": 0.31, + "grad_norm": 4.334402462138009, + "learning_rate": 8.01900272157434e-06, + "loss": 0.3069, + "step": 5007 + }, + { + "epoch": 0.31, + "grad_norm": 11.900448913831037, + "learning_rate": 8.018190795228815e-06, + "loss": 0.3441, + "step": 5008 + }, + { + "epoch": 0.32, + "grad_norm": 4.063654509610975, + "learning_rate": 8.017378743653193e-06, + "loss": 0.3195, + "step": 5009 + }, + { + "epoch": 0.32, + "grad_norm": 3.3909167925522223, + "learning_rate": 8.016566566881166e-06, + "loss": 0.3042, + "step": 5010 + }, + { + "epoch": 0.32, + "grad_norm": 5.532290570296978, + "learning_rate": 8.015754264946435e-06, + "loss": 0.3315, + "step": 5011 + }, + { + "epoch": 0.32, + "grad_norm": 2.7373760127048294, + "learning_rate": 8.014941837882703e-06, + "loss": 0.3432, + "step": 5012 + }, + { + "epoch": 0.32, + "grad_norm": 2.0618836638509697, + "learning_rate": 8.01412928572368e-06, + "loss": 0.3232, + "step": 5013 + }, + { + "epoch": 0.32, + "grad_norm": 1.876704991000476, + "learning_rate": 8.01331660850308e-06, + "loss": 0.3114, + "step": 5014 + }, + { + "epoch": 0.32, + "grad_norm": 5.145002344501987, + "learning_rate": 8.01250380625462e-06, + "loss": 0.302, + "step": 5015 + }, + { + "epoch": 0.32, + "grad_norm": 4.40383478343123, + "learning_rate": 8.011690879012027e-06, + "loss": 0.3279, + "step": 5016 + }, + { + "epoch": 0.32, + "grad_norm": 2.8666662275168235, + "learning_rate": 8.010877826809028e-06, + "loss": 0.3141, + "step": 5017 + }, + { + "epoch": 0.32, + "grad_norm": 5.595550835367412, + "learning_rate": 8.01006464967936e-06, + "loss": 0.3118, + "step": 5018 + }, + { + "epoch": 0.32, + "grad_norm": 2.383487226000707, + "learning_rate": 8.009251347656766e-06, + "loss": 0.3132, + "step": 5019 + }, + { + "epoch": 0.32, + "grad_norm": 4.675162959329228, + "learning_rate": 8.008437920774987e-06, + "loss": 0.3088, + "step": 5020 + }, + { + "epoch": 0.32, + "grad_norm": 1.6265349900072492, + "learning_rate": 8.007624369067776e-06, + "loss": 0.2884, + "step": 5021 + }, + { + "epoch": 0.32, + "grad_norm": 7.6831646227136945, + "learning_rate": 8.006810692568886e-06, + "loss": 0.3141, + "step": 5022 + }, + { + "epoch": 0.32, + "grad_norm": 3.556303551718319, + "learning_rate": 8.005996891312082e-06, + "loss": 0.3039, + "step": 5023 + }, + { + "epoch": 0.32, + "grad_norm": 2.3412784436245517, + "learning_rate": 8.005182965331127e-06, + "loss": 0.3417, + "step": 5024 + }, + { + "epoch": 0.32, + "grad_norm": 4.284655036838846, + "learning_rate": 8.004368914659792e-06, + "loss": 0.3232, + "step": 5025 + }, + { + "epoch": 0.32, + "grad_norm": 13.063304621855258, + "learning_rate": 8.003554739331856e-06, + "loss": 0.3083, + "step": 5026 + }, + { + "epoch": 0.32, + "grad_norm": 3.4726216877214298, + "learning_rate": 8.002740439381097e-06, + "loss": 0.3258, + "step": 5027 + }, + { + "epoch": 0.32, + "grad_norm": 2.2303048752751287, + "learning_rate": 8.001926014841306e-06, + "loss": 0.3162, + "step": 5028 + }, + { + "epoch": 0.32, + "grad_norm": 3.301325872784528, + "learning_rate": 8.001111465746269e-06, + "loss": 0.3141, + "step": 5029 + }, + { + "epoch": 0.32, + "grad_norm": 1.9678152985026538, + "learning_rate": 8.00029679212979e-06, + "loss": 0.3597, + "step": 5030 + }, + { + "epoch": 0.32, + "grad_norm": 3.746665528896893, + "learning_rate": 7.999481994025666e-06, + "loss": 0.3292, + "step": 5031 + }, + { + "epoch": 0.32, + "grad_norm": 2.8384568544072035, + "learning_rate": 7.998667071467709e-06, + "loss": 0.3117, + "step": 5032 + }, + { + "epoch": 0.32, + "grad_norm": 3.151200798535191, + "learning_rate": 7.997852024489727e-06, + "loss": 0.3197, + "step": 5033 + }, + { + "epoch": 0.32, + "grad_norm": 1.4186715153542795, + "learning_rate": 7.99703685312554e-06, + "loss": 0.2909, + "step": 5034 + }, + { + "epoch": 0.32, + "grad_norm": 2.76518638238281, + "learning_rate": 7.99622155740897e-06, + "loss": 0.3145, + "step": 5035 + }, + { + "epoch": 0.32, + "grad_norm": 2.2394895644670703, + "learning_rate": 7.995406137373848e-06, + "loss": 0.3109, + "step": 5036 + }, + { + "epoch": 0.32, + "grad_norm": 2.5883403986052236, + "learning_rate": 7.994590593054001e-06, + "loss": 0.3132, + "step": 5037 + }, + { + "epoch": 0.32, + "grad_norm": 3.0702547423913122, + "learning_rate": 7.993774924483275e-06, + "loss": 0.3181, + "step": 5038 + }, + { + "epoch": 0.32, + "grad_norm": 2.0816594562824275, + "learning_rate": 7.992959131695508e-06, + "loss": 0.3235, + "step": 5039 + }, + { + "epoch": 0.32, + "grad_norm": 3.63469659261388, + "learning_rate": 7.99214321472455e-06, + "loss": 0.3176, + "step": 5040 + }, + { + "epoch": 0.32, + "grad_norm": 2.0387904745798937, + "learning_rate": 7.991327173604257e-06, + "loss": 0.3086, + "step": 5041 + }, + { + "epoch": 0.32, + "grad_norm": 6.001197198470353, + "learning_rate": 7.990511008368485e-06, + "loss": 0.314, + "step": 5042 + }, + { + "epoch": 0.32, + "grad_norm": 3.091889925430689, + "learning_rate": 7.989694719051101e-06, + "loss": 0.3303, + "step": 5043 + }, + { + "epoch": 0.32, + "grad_norm": 16.46003381239394, + "learning_rate": 7.988878305685972e-06, + "loss": 0.2925, + "step": 5044 + }, + { + "epoch": 0.32, + "grad_norm": 3.684788091771633, + "learning_rate": 7.988061768306973e-06, + "loss": 0.3333, + "step": 5045 + }, + { + "epoch": 0.32, + "grad_norm": 7.578808138336421, + "learning_rate": 7.987245106947984e-06, + "loss": 0.3042, + "step": 5046 + }, + { + "epoch": 0.32, + "grad_norm": 2.158788842711406, + "learning_rate": 7.98642832164289e-06, + "loss": 0.3235, + "step": 5047 + }, + { + "epoch": 0.32, + "grad_norm": 11.113235636521349, + "learning_rate": 7.98561141242558e-06, + "loss": 0.3354, + "step": 5048 + }, + { + "epoch": 0.32, + "grad_norm": 2.889934836684406, + "learning_rate": 7.98479437932995e-06, + "loss": 0.3025, + "step": 5049 + }, + { + "epoch": 0.32, + "grad_norm": 2.9492874215275315, + "learning_rate": 7.983977222389898e-06, + "loss": 0.3066, + "step": 5050 + }, + { + "epoch": 0.32, + "grad_norm": 2.1226520046678004, + "learning_rate": 7.983159941639334e-06, + "loss": 0.3231, + "step": 5051 + }, + { + "epoch": 0.32, + "grad_norm": 2.3378657451530405, + "learning_rate": 7.982342537112162e-06, + "loss": 0.3139, + "step": 5052 + }, + { + "epoch": 0.32, + "grad_norm": 2.6246577298481006, + "learning_rate": 7.981525008842302e-06, + "loss": 0.3338, + "step": 5053 + }, + { + "epoch": 0.32, + "grad_norm": 2.021490134537214, + "learning_rate": 7.980707356863673e-06, + "loss": 0.2936, + "step": 5054 + }, + { + "epoch": 0.32, + "grad_norm": 1.7435643705464416, + "learning_rate": 7.979889581210203e-06, + "loss": 0.3131, + "step": 5055 + }, + { + "epoch": 0.32, + "grad_norm": 2.5048846923382837, + "learning_rate": 7.979071681915821e-06, + "loss": 0.3009, + "step": 5056 + }, + { + "epoch": 0.32, + "grad_norm": 2.3088242899896563, + "learning_rate": 7.978253659014463e-06, + "loss": 0.3153, + "step": 5057 + }, + { + "epoch": 0.32, + "grad_norm": 4.168719405448897, + "learning_rate": 7.977435512540072e-06, + "loss": 0.3066, + "step": 5058 + }, + { + "epoch": 0.32, + "grad_norm": 1.7285131836876428, + "learning_rate": 7.976617242526592e-06, + "loss": 0.31, + "step": 5059 + }, + { + "epoch": 0.32, + "grad_norm": 0.7048803919940461, + "learning_rate": 7.975798849007974e-06, + "loss": 0.5259, + "step": 5060 + }, + { + "epoch": 0.32, + "grad_norm": 1.7197787822123591, + "learning_rate": 7.974980332018178e-06, + "loss": 0.3088, + "step": 5061 + }, + { + "epoch": 0.32, + "grad_norm": 2.799530498120901, + "learning_rate": 7.974161691591165e-06, + "loss": 0.3328, + "step": 5062 + }, + { + "epoch": 0.32, + "grad_norm": 2.5861386311184247, + "learning_rate": 7.973342927760898e-06, + "loss": 0.3019, + "step": 5063 + }, + { + "epoch": 0.32, + "grad_norm": 2.2818011562125737, + "learning_rate": 7.972524040561353e-06, + "loss": 0.3123, + "step": 5064 + }, + { + "epoch": 0.32, + "grad_norm": 2.008332525866915, + "learning_rate": 7.971705030026507e-06, + "loss": 0.3118, + "step": 5065 + }, + { + "epoch": 0.32, + "grad_norm": 2.256715118907958, + "learning_rate": 7.97088589619034e-06, + "loss": 0.2927, + "step": 5066 + }, + { + "epoch": 0.32, + "grad_norm": 6.674991938528282, + "learning_rate": 7.970066639086839e-06, + "loss": 0.301, + "step": 5067 + }, + { + "epoch": 0.32, + "grad_norm": 2.1804556456397246, + "learning_rate": 7.969247258749999e-06, + "loss": 0.302, + "step": 5068 + }, + { + "epoch": 0.32, + "grad_norm": 3.9860280400051207, + "learning_rate": 7.968427755213814e-06, + "loss": 0.3068, + "step": 5069 + }, + { + "epoch": 0.32, + "grad_norm": 3.831230541476977, + "learning_rate": 7.967608128512293e-06, + "loss": 0.3122, + "step": 5070 + }, + { + "epoch": 0.32, + "grad_norm": 2.5164274142461585, + "learning_rate": 7.966788378679435e-06, + "loss": 0.3592, + "step": 5071 + }, + { + "epoch": 0.32, + "grad_norm": 1.837298725090378, + "learning_rate": 7.965968505749262e-06, + "loss": 0.291, + "step": 5072 + }, + { + "epoch": 0.32, + "grad_norm": 1.5549773023611877, + "learning_rate": 7.965148509755785e-06, + "loss": 0.2925, + "step": 5073 + }, + { + "epoch": 0.32, + "grad_norm": 3.2986968139774238, + "learning_rate": 7.96432839073303e-06, + "loss": 0.322, + "step": 5074 + }, + { + "epoch": 0.32, + "grad_norm": 2.409482418167394, + "learning_rate": 7.963508148715024e-06, + "loss": 0.3006, + "step": 5075 + }, + { + "epoch": 0.32, + "grad_norm": 1.6148311753650015, + "learning_rate": 7.962687783735803e-06, + "loss": 0.2995, + "step": 5076 + }, + { + "epoch": 0.32, + "grad_norm": 2.440228470832698, + "learning_rate": 7.961867295829403e-06, + "loss": 0.31, + "step": 5077 + }, + { + "epoch": 0.32, + "grad_norm": 6.806841277108355, + "learning_rate": 7.961046685029868e-06, + "loss": 0.3409, + "step": 5078 + }, + { + "epoch": 0.32, + "grad_norm": 1.4445096722768909, + "learning_rate": 7.960225951371245e-06, + "loss": 0.3029, + "step": 5079 + }, + { + "epoch": 0.32, + "grad_norm": 1.4989431264792445, + "learning_rate": 7.959405094887591e-06, + "loss": 0.2938, + "step": 5080 + }, + { + "epoch": 0.32, + "grad_norm": 0.6791499788006095, + "learning_rate": 7.958584115612963e-06, + "loss": 0.5283, + "step": 5081 + }, + { + "epoch": 0.32, + "grad_norm": 2.0749415291052133, + "learning_rate": 7.957763013581425e-06, + "loss": 0.3239, + "step": 5082 + }, + { + "epoch": 0.32, + "grad_norm": 2.0466423740691315, + "learning_rate": 7.956941788827046e-06, + "loss": 0.3171, + "step": 5083 + }, + { + "epoch": 0.32, + "grad_norm": 18.875177393278847, + "learning_rate": 7.9561204413839e-06, + "loss": 0.3002, + "step": 5084 + }, + { + "epoch": 0.32, + "grad_norm": 1.7414609372383736, + "learning_rate": 7.955298971286066e-06, + "loss": 0.3201, + "step": 5085 + }, + { + "epoch": 0.32, + "grad_norm": 1.8438288218830583, + "learning_rate": 7.954477378567629e-06, + "loss": 0.3316, + "step": 5086 + }, + { + "epoch": 0.32, + "grad_norm": 3.979266579438885, + "learning_rate": 7.953655663262676e-06, + "loss": 0.3338, + "step": 5087 + }, + { + "epoch": 0.32, + "grad_norm": 4.596344116499169, + "learning_rate": 7.952833825405306e-06, + "loss": 0.3172, + "step": 5088 + }, + { + "epoch": 0.32, + "grad_norm": 2.374949279742272, + "learning_rate": 7.952011865029614e-06, + "loss": 0.3357, + "step": 5089 + }, + { + "epoch": 0.32, + "grad_norm": 1.76099965096983, + "learning_rate": 7.951189782169706e-06, + "loss": 0.311, + "step": 5090 + }, + { + "epoch": 0.32, + "grad_norm": 7.791065430818122, + "learning_rate": 7.950367576859694e-06, + "loss": 0.3222, + "step": 5091 + }, + { + "epoch": 0.32, + "grad_norm": 2.064459872491958, + "learning_rate": 7.94954524913369e-06, + "loss": 0.3111, + "step": 5092 + }, + { + "epoch": 0.32, + "grad_norm": 0.6358258563186658, + "learning_rate": 7.948722799025814e-06, + "loss": 0.5034, + "step": 5093 + }, + { + "epoch": 0.32, + "grad_norm": 2.0212872046446795, + "learning_rate": 7.94790022657019e-06, + "loss": 0.3342, + "step": 5094 + }, + { + "epoch": 0.32, + "grad_norm": 1.9751574512457168, + "learning_rate": 7.947077531800953e-06, + "loss": 0.3167, + "step": 5095 + }, + { + "epoch": 0.32, + "grad_norm": 1.3474367355703467, + "learning_rate": 7.946254714752233e-06, + "loss": 0.3204, + "step": 5096 + }, + { + "epoch": 0.32, + "grad_norm": 4.2114763334420475, + "learning_rate": 7.945431775458172e-06, + "loss": 0.3321, + "step": 5097 + }, + { + "epoch": 0.32, + "grad_norm": 1.5620017746977082, + "learning_rate": 7.944608713952913e-06, + "loss": 0.3223, + "step": 5098 + }, + { + "epoch": 0.32, + "grad_norm": 0.5993428243811117, + "learning_rate": 7.94378553027061e-06, + "loss": 0.4901, + "step": 5099 + }, + { + "epoch": 0.32, + "grad_norm": 1.7364528052657522, + "learning_rate": 7.942962224445416e-06, + "loss": 0.3038, + "step": 5100 + }, + { + "epoch": 0.32, + "grad_norm": 3.5768580631474247, + "learning_rate": 7.942138796511493e-06, + "loss": 0.3406, + "step": 5101 + }, + { + "epoch": 0.32, + "grad_norm": 1.7684532732243523, + "learning_rate": 7.941315246503006e-06, + "loss": 0.281, + "step": 5102 + }, + { + "epoch": 0.32, + "grad_norm": 1.8710782619801423, + "learning_rate": 7.940491574454123e-06, + "loss": 0.3424, + "step": 5103 + }, + { + "epoch": 0.32, + "grad_norm": 1.3112896392186466, + "learning_rate": 7.939667780399023e-06, + "loss": 0.309, + "step": 5104 + }, + { + "epoch": 0.32, + "grad_norm": 1.7026879528276058, + "learning_rate": 7.938843864371886e-06, + "loss": 0.3511, + "step": 5105 + }, + { + "epoch": 0.32, + "grad_norm": 1.9839296373220543, + "learning_rate": 7.938019826406895e-06, + "loss": 0.308, + "step": 5106 + }, + { + "epoch": 0.32, + "grad_norm": 1.2861315904284585, + "learning_rate": 7.937195666538245e-06, + "loss": 0.2957, + "step": 5107 + }, + { + "epoch": 0.32, + "grad_norm": 1.9380431598967383, + "learning_rate": 7.936371384800127e-06, + "loss": 0.3362, + "step": 5108 + }, + { + "epoch": 0.32, + "grad_norm": 2.770307461234666, + "learning_rate": 7.935546981226747e-06, + "loss": 0.3473, + "step": 5109 + }, + { + "epoch": 0.32, + "grad_norm": 0.6061355860759877, + "learning_rate": 7.934722455852308e-06, + "loss": 0.4995, + "step": 5110 + }, + { + "epoch": 0.32, + "grad_norm": 1.6253222845390156, + "learning_rate": 7.933897808711022e-06, + "loss": 0.2948, + "step": 5111 + }, + { + "epoch": 0.32, + "grad_norm": 13.468429389333703, + "learning_rate": 7.933073039837104e-06, + "loss": 0.3258, + "step": 5112 + }, + { + "epoch": 0.32, + "grad_norm": 2.1328610215687336, + "learning_rate": 7.932248149264778e-06, + "loss": 0.3124, + "step": 5113 + }, + { + "epoch": 0.32, + "grad_norm": 3.1885687747250673, + "learning_rate": 7.931423137028265e-06, + "loss": 0.3138, + "step": 5114 + }, + { + "epoch": 0.32, + "grad_norm": 1.6322587392956598, + "learning_rate": 7.9305980031618e-06, + "loss": 0.3143, + "step": 5115 + }, + { + "epoch": 0.32, + "grad_norm": 2.691617172288375, + "learning_rate": 7.929772747699621e-06, + "loss": 0.3284, + "step": 5116 + }, + { + "epoch": 0.32, + "grad_norm": 1.4915008100558418, + "learning_rate": 7.928947370675963e-06, + "loss": 0.3271, + "step": 5117 + }, + { + "epoch": 0.32, + "grad_norm": 1.874462099288567, + "learning_rate": 7.928121872125079e-06, + "loss": 0.328, + "step": 5118 + }, + { + "epoch": 0.32, + "grad_norm": 1.8486747584394894, + "learning_rate": 7.927296252081218e-06, + "loss": 0.3323, + "step": 5119 + }, + { + "epoch": 0.32, + "grad_norm": 1.6815775240784545, + "learning_rate": 7.926470510578634e-06, + "loss": 0.3315, + "step": 5120 + }, + { + "epoch": 0.32, + "grad_norm": 3.587468779319142, + "learning_rate": 7.925644647651591e-06, + "loss": 0.2997, + "step": 5121 + }, + { + "epoch": 0.32, + "grad_norm": 3.234666949519142, + "learning_rate": 7.924818663334356e-06, + "loss": 0.3351, + "step": 5122 + }, + { + "epoch": 0.32, + "grad_norm": 1.6422725811821304, + "learning_rate": 7.9239925576612e-06, + "loss": 0.3039, + "step": 5123 + }, + { + "epoch": 0.32, + "grad_norm": 2.7219740756730872, + "learning_rate": 7.923166330666397e-06, + "loss": 0.3198, + "step": 5124 + }, + { + "epoch": 0.32, + "grad_norm": 1.5385086878779366, + "learning_rate": 7.922339982384232e-06, + "loss": 0.3387, + "step": 5125 + }, + { + "epoch": 0.32, + "grad_norm": 1.2084383566082457, + "learning_rate": 7.92151351284899e-06, + "loss": 0.3314, + "step": 5126 + }, + { + "epoch": 0.32, + "grad_norm": 1.4743707311509031, + "learning_rate": 7.920686922094964e-06, + "loss": 0.3374, + "step": 5127 + }, + { + "epoch": 0.32, + "grad_norm": 2.915323215254238, + "learning_rate": 7.91986021015645e-06, + "loss": 0.3095, + "step": 5128 + }, + { + "epoch": 0.32, + "grad_norm": 3.2837458177313508, + "learning_rate": 7.919033377067748e-06, + "loss": 0.3379, + "step": 5129 + }, + { + "epoch": 0.32, + "grad_norm": 1.2373511139087423, + "learning_rate": 7.918206422863169e-06, + "loss": 0.3304, + "step": 5130 + }, + { + "epoch": 0.32, + "grad_norm": 1.5184108331320194, + "learning_rate": 7.91737934757702e-06, + "loss": 0.3183, + "step": 5131 + }, + { + "epoch": 0.32, + "grad_norm": 2.169044407235978, + "learning_rate": 7.91655215124362e-06, + "loss": 0.3342, + "step": 5132 + }, + { + "epoch": 0.32, + "grad_norm": 1.5965243231458044, + "learning_rate": 7.915724833897291e-06, + "loss": 0.318, + "step": 5133 + }, + { + "epoch": 0.32, + "grad_norm": 1.310497701992734, + "learning_rate": 7.914897395572362e-06, + "loss": 0.3191, + "step": 5134 + }, + { + "epoch": 0.32, + "grad_norm": 1.5435031686691405, + "learning_rate": 7.91406983630316e-06, + "loss": 0.3314, + "step": 5135 + }, + { + "epoch": 0.32, + "grad_norm": 3.4206004188264725, + "learning_rate": 7.913242156124026e-06, + "loss": 0.3411, + "step": 5136 + }, + { + "epoch": 0.32, + "grad_norm": 2.0573240464480085, + "learning_rate": 7.9124143550693e-06, + "loss": 0.3216, + "step": 5137 + }, + { + "epoch": 0.32, + "grad_norm": 1.2794630248249228, + "learning_rate": 7.911586433173328e-06, + "loss": 0.305, + "step": 5138 + }, + { + "epoch": 0.32, + "grad_norm": 3.3966530422442425, + "learning_rate": 7.910758390470465e-06, + "loss": 0.3013, + "step": 5139 + }, + { + "epoch": 0.32, + "grad_norm": 2.358532216350067, + "learning_rate": 7.909930226995066e-06, + "loss": 0.3184, + "step": 5140 + }, + { + "epoch": 0.32, + "grad_norm": 2.2842048344007564, + "learning_rate": 7.909101942781494e-06, + "loss": 0.3089, + "step": 5141 + }, + { + "epoch": 0.32, + "grad_norm": 1.3209506365425492, + "learning_rate": 7.908273537864114e-06, + "loss": 0.3221, + "step": 5142 + }, + { + "epoch": 0.32, + "grad_norm": 5.289693722947336, + "learning_rate": 7.9074450122773e-06, + "loss": 0.314, + "step": 5143 + }, + { + "epoch": 0.32, + "grad_norm": 1.7378343070889373, + "learning_rate": 7.906616366055427e-06, + "loss": 0.3512, + "step": 5144 + }, + { + "epoch": 0.32, + "grad_norm": 2.4920375500741807, + "learning_rate": 7.905787599232878e-06, + "loss": 0.3159, + "step": 5145 + }, + { + "epoch": 0.32, + "grad_norm": 1.78577601822971, + "learning_rate": 7.904958711844042e-06, + "loss": 0.2949, + "step": 5146 + }, + { + "epoch": 0.32, + "grad_norm": 2.6478832983625598, + "learning_rate": 7.90412970392331e-06, + "loss": 0.3033, + "step": 5147 + }, + { + "epoch": 0.32, + "grad_norm": 1.7544514373362592, + "learning_rate": 7.903300575505077e-06, + "loss": 0.357, + "step": 5148 + }, + { + "epoch": 0.32, + "grad_norm": 1.6290573379952231, + "learning_rate": 7.902471326623746e-06, + "loss": 0.3407, + "step": 5149 + }, + { + "epoch": 0.32, + "grad_norm": 2.0217958670928295, + "learning_rate": 7.901641957313724e-06, + "loss": 0.3255, + "step": 5150 + }, + { + "epoch": 0.32, + "grad_norm": 1.3195554411977217, + "learning_rate": 7.900812467609423e-06, + "loss": 0.3208, + "step": 5151 + }, + { + "epoch": 0.32, + "grad_norm": 1.635799297520632, + "learning_rate": 7.899982857545263e-06, + "loss": 0.3306, + "step": 5152 + }, + { + "epoch": 0.32, + "grad_norm": 1.8028536636867845, + "learning_rate": 7.899153127155661e-06, + "loss": 0.3093, + "step": 5153 + }, + { + "epoch": 0.32, + "grad_norm": 1.760933628232171, + "learning_rate": 7.898323276475045e-06, + "loss": 0.3177, + "step": 5154 + }, + { + "epoch": 0.32, + "grad_norm": 1.3091170778311194, + "learning_rate": 7.897493305537851e-06, + "loss": 0.3024, + "step": 5155 + }, + { + "epoch": 0.32, + "grad_norm": 1.4170006007915341, + "learning_rate": 7.896663214378512e-06, + "loss": 0.3335, + "step": 5156 + }, + { + "epoch": 0.32, + "grad_norm": 1.5269484964884819, + "learning_rate": 7.895833003031472e-06, + "loss": 0.3463, + "step": 5157 + }, + { + "epoch": 0.32, + "grad_norm": 1.5275599386026182, + "learning_rate": 7.895002671531175e-06, + "loss": 0.3159, + "step": 5158 + }, + { + "epoch": 0.32, + "grad_norm": 1.3944402239154978, + "learning_rate": 7.894172219912077e-06, + "loss": 0.3123, + "step": 5159 + }, + { + "epoch": 0.32, + "grad_norm": 1.3081950411158063, + "learning_rate": 7.893341648208632e-06, + "loss": 0.3119, + "step": 5160 + }, + { + "epoch": 0.32, + "grad_norm": 3.1249826879700624, + "learning_rate": 7.892510956455305e-06, + "loss": 0.3248, + "step": 5161 + }, + { + "epoch": 0.32, + "grad_norm": 2.247131020693949, + "learning_rate": 7.891680144686558e-06, + "loss": 0.3134, + "step": 5162 + }, + { + "epoch": 0.32, + "grad_norm": 0.6270633376295832, + "learning_rate": 7.890849212936866e-06, + "loss": 0.4912, + "step": 5163 + }, + { + "epoch": 0.32, + "grad_norm": 1.3888517215084442, + "learning_rate": 7.890018161240707e-06, + "loss": 0.3212, + "step": 5164 + }, + { + "epoch": 0.32, + "grad_norm": 1.4951206362262988, + "learning_rate": 7.889186989632559e-06, + "loss": 0.32, + "step": 5165 + }, + { + "epoch": 0.32, + "grad_norm": 3.4390563164043506, + "learning_rate": 7.888355698146913e-06, + "loss": 0.3042, + "step": 5166 + }, + { + "epoch": 0.32, + "grad_norm": 2.4237042466124175, + "learning_rate": 7.887524286818257e-06, + "loss": 0.3224, + "step": 5167 + }, + { + "epoch": 0.33, + "grad_norm": 2.0496892155503756, + "learning_rate": 7.886692755681091e-06, + "loss": 0.323, + "step": 5168 + }, + { + "epoch": 0.33, + "grad_norm": 2.3230823370618103, + "learning_rate": 7.885861104769915e-06, + "loss": 0.326, + "step": 5169 + }, + { + "epoch": 0.33, + "grad_norm": 2.3049544437398652, + "learning_rate": 7.885029334119237e-06, + "loss": 0.3466, + "step": 5170 + }, + { + "epoch": 0.33, + "grad_norm": 2.1769160680954185, + "learning_rate": 7.884197443763566e-06, + "loss": 0.3108, + "step": 5171 + }, + { + "epoch": 0.33, + "grad_norm": 2.48672535677508, + "learning_rate": 7.88336543373742e-06, + "loss": 0.3526, + "step": 5172 + }, + { + "epoch": 0.33, + "grad_norm": 2.1648446508690258, + "learning_rate": 7.88253330407532e-06, + "loss": 0.3276, + "step": 5173 + }, + { + "epoch": 0.33, + "grad_norm": 1.340396240680619, + "learning_rate": 7.881701054811796e-06, + "loss": 0.3056, + "step": 5174 + }, + { + "epoch": 0.33, + "grad_norm": 1.728428883608017, + "learning_rate": 7.880868685981376e-06, + "loss": 0.3191, + "step": 5175 + }, + { + "epoch": 0.33, + "grad_norm": 1.4732255951886062, + "learning_rate": 7.880036197618596e-06, + "loss": 0.3213, + "step": 5176 + }, + { + "epoch": 0.33, + "grad_norm": 1.623338077004673, + "learning_rate": 7.879203589757998e-06, + "loss": 0.3124, + "step": 5177 + }, + { + "epoch": 0.33, + "grad_norm": 2.1853307873795726, + "learning_rate": 7.878370862434131e-06, + "loss": 0.3251, + "step": 5178 + }, + { + "epoch": 0.33, + "grad_norm": 2.9052155557771617, + "learning_rate": 7.877538015681542e-06, + "loss": 0.3197, + "step": 5179 + }, + { + "epoch": 0.33, + "grad_norm": 1.389894072532732, + "learning_rate": 7.876705049534792e-06, + "loss": 0.317, + "step": 5180 + }, + { + "epoch": 0.33, + "grad_norm": 1.5412997149227012, + "learning_rate": 7.875871964028439e-06, + "loss": 0.3191, + "step": 5181 + }, + { + "epoch": 0.33, + "grad_norm": 2.485679210130768, + "learning_rate": 7.87503875919705e-06, + "loss": 0.3386, + "step": 5182 + }, + { + "epoch": 0.33, + "grad_norm": 1.6576984661492888, + "learning_rate": 7.874205435075196e-06, + "loss": 0.3207, + "step": 5183 + }, + { + "epoch": 0.33, + "grad_norm": 2.8776673260890777, + "learning_rate": 7.873371991697454e-06, + "loss": 0.3164, + "step": 5184 + }, + { + "epoch": 0.33, + "grad_norm": 1.8228351970614167, + "learning_rate": 7.872538429098404e-06, + "loss": 0.3204, + "step": 5185 + }, + { + "epoch": 0.33, + "grad_norm": 2.01354442212947, + "learning_rate": 7.871704747312631e-06, + "loss": 0.3079, + "step": 5186 + }, + { + "epoch": 0.33, + "grad_norm": 2.6381023841770297, + "learning_rate": 7.87087094637473e-06, + "loss": 0.3119, + "step": 5187 + }, + { + "epoch": 0.33, + "grad_norm": 2.230613493458902, + "learning_rate": 7.870037026319293e-06, + "loss": 0.3333, + "step": 5188 + }, + { + "epoch": 0.33, + "grad_norm": 1.271359396118646, + "learning_rate": 7.869202987180921e-06, + "loss": 0.3418, + "step": 5189 + }, + { + "epoch": 0.33, + "grad_norm": 1.813108543229716, + "learning_rate": 7.868368828994222e-06, + "loss": 0.3196, + "step": 5190 + }, + { + "epoch": 0.33, + "grad_norm": 4.8761358812895885, + "learning_rate": 7.867534551793805e-06, + "loss": 0.3143, + "step": 5191 + }, + { + "epoch": 0.33, + "grad_norm": 1.808748874398439, + "learning_rate": 7.866700155614286e-06, + "loss": 0.3301, + "step": 5192 + }, + { + "epoch": 0.33, + "grad_norm": 1.6232777439998736, + "learning_rate": 7.865865640490287e-06, + "loss": 0.3238, + "step": 5193 + }, + { + "epoch": 0.33, + "grad_norm": 3.1706212707037715, + "learning_rate": 7.865031006456432e-06, + "loss": 0.3351, + "step": 5194 + }, + { + "epoch": 0.33, + "grad_norm": 53.29319560873858, + "learning_rate": 7.86419625354735e-06, + "loss": 0.3162, + "step": 5195 + }, + { + "epoch": 0.33, + "grad_norm": 1.7152615452368334, + "learning_rate": 7.863361381797678e-06, + "loss": 0.3194, + "step": 5196 + }, + { + "epoch": 0.33, + "grad_norm": 1.723064779092635, + "learning_rate": 7.862526391242058e-06, + "loss": 0.3283, + "step": 5197 + }, + { + "epoch": 0.33, + "grad_norm": 1.5969489563372483, + "learning_rate": 7.861691281915133e-06, + "loss": 0.3139, + "step": 5198 + }, + { + "epoch": 0.33, + "grad_norm": 1.4729317597380558, + "learning_rate": 7.860856053851554e-06, + "loss": 0.3064, + "step": 5199 + }, + { + "epoch": 0.33, + "grad_norm": 1.7380473371823648, + "learning_rate": 7.860020707085976e-06, + "loss": 0.3284, + "step": 5200 + }, + { + "epoch": 0.33, + "grad_norm": 32.738389296582035, + "learning_rate": 7.859185241653058e-06, + "loss": 0.3313, + "step": 5201 + }, + { + "epoch": 0.33, + "grad_norm": 1.8081041996036473, + "learning_rate": 7.858349657587465e-06, + "loss": 0.312, + "step": 5202 + }, + { + "epoch": 0.33, + "grad_norm": 1.7493535284366597, + "learning_rate": 7.85751395492387e-06, + "loss": 0.3087, + "step": 5203 + }, + { + "epoch": 0.33, + "grad_norm": 1.7270254734427284, + "learning_rate": 7.856678133696945e-06, + "loss": 0.3228, + "step": 5204 + }, + { + "epoch": 0.33, + "grad_norm": 3.577619806778296, + "learning_rate": 7.85584219394137e-06, + "loss": 0.3132, + "step": 5205 + }, + { + "epoch": 0.33, + "grad_norm": 1.7439505639616169, + "learning_rate": 7.855006135691829e-06, + "loss": 0.3037, + "step": 5206 + }, + { + "epoch": 0.33, + "grad_norm": 1.7274519998258857, + "learning_rate": 7.854169958983014e-06, + "loss": 0.326, + "step": 5207 + }, + { + "epoch": 0.33, + "grad_norm": 2.939987603120739, + "learning_rate": 7.853333663849616e-06, + "loss": 0.3271, + "step": 5208 + }, + { + "epoch": 0.33, + "grad_norm": 2.389205281536424, + "learning_rate": 7.852497250326337e-06, + "loss": 0.3267, + "step": 5209 + }, + { + "epoch": 0.33, + "grad_norm": 1.7981692340837032, + "learning_rate": 7.85166071844788e-06, + "loss": 0.3054, + "step": 5210 + }, + { + "epoch": 0.33, + "grad_norm": 2.775791113021543, + "learning_rate": 7.850824068248955e-06, + "loss": 0.3166, + "step": 5211 + }, + { + "epoch": 0.33, + "grad_norm": 4.622538023807633, + "learning_rate": 7.849987299764276e-06, + "loss": 0.3206, + "step": 5212 + }, + { + "epoch": 0.33, + "grad_norm": 2.576171702100024, + "learning_rate": 7.849150413028562e-06, + "loss": 0.3217, + "step": 5213 + }, + { + "epoch": 0.33, + "grad_norm": 3.2095791500536244, + "learning_rate": 7.848313408076535e-06, + "loss": 0.3331, + "step": 5214 + }, + { + "epoch": 0.33, + "grad_norm": 2.4865642336849416, + "learning_rate": 7.847476284942927e-06, + "loss": 0.3322, + "step": 5215 + }, + { + "epoch": 0.33, + "grad_norm": 1.634545356371704, + "learning_rate": 7.846639043662472e-06, + "loss": 0.3243, + "step": 5216 + }, + { + "epoch": 0.33, + "grad_norm": 1.2618720493918716, + "learning_rate": 7.845801684269905e-06, + "loss": 0.3018, + "step": 5217 + }, + { + "epoch": 0.33, + "grad_norm": 6.498254707575457, + "learning_rate": 7.84496420679997e-06, + "loss": 0.343, + "step": 5218 + }, + { + "epoch": 0.33, + "grad_norm": 2.0083962241665834, + "learning_rate": 7.84412661128742e-06, + "loss": 0.3385, + "step": 5219 + }, + { + "epoch": 0.33, + "grad_norm": 3.171225818129115, + "learning_rate": 7.843288897767003e-06, + "loss": 0.3316, + "step": 5220 + }, + { + "epoch": 0.33, + "grad_norm": 1.813160418605828, + "learning_rate": 7.842451066273478e-06, + "loss": 0.3266, + "step": 5221 + }, + { + "epoch": 0.33, + "grad_norm": 1.4333856933473443, + "learning_rate": 7.841613116841612e-06, + "loss": 0.3052, + "step": 5222 + }, + { + "epoch": 0.33, + "grad_norm": 2.5155183948816697, + "learning_rate": 7.84077504950617e-06, + "loss": 0.3201, + "step": 5223 + }, + { + "epoch": 0.33, + "grad_norm": 1.9748036782521914, + "learning_rate": 7.839936864301924e-06, + "loss": 0.3265, + "step": 5224 + }, + { + "epoch": 0.33, + "grad_norm": 1.9203394613494438, + "learning_rate": 7.839098561263655e-06, + "loss": 0.3036, + "step": 5225 + }, + { + "epoch": 0.33, + "grad_norm": 1.8496166679425865, + "learning_rate": 7.838260140426145e-06, + "loss": 0.325, + "step": 5226 + }, + { + "epoch": 0.33, + "grad_norm": 2.574634293112357, + "learning_rate": 7.837421601824178e-06, + "loss": 0.3214, + "step": 5227 + }, + { + "epoch": 0.33, + "grad_norm": 2.281911113871297, + "learning_rate": 7.83658294549255e-06, + "loss": 0.3317, + "step": 5228 + }, + { + "epoch": 0.33, + "grad_norm": 1.792772421014619, + "learning_rate": 7.835744171466056e-06, + "loss": 0.3063, + "step": 5229 + }, + { + "epoch": 0.33, + "grad_norm": 1.6417754999989007, + "learning_rate": 7.834905279779501e-06, + "loss": 0.3132, + "step": 5230 + }, + { + "epoch": 0.33, + "grad_norm": 5.9212116571629325, + "learning_rate": 7.83406627046769e-06, + "loss": 0.3224, + "step": 5231 + }, + { + "epoch": 0.33, + "grad_norm": 0.6016643983710616, + "learning_rate": 7.833227143565436e-06, + "loss": 0.5195, + "step": 5232 + }, + { + "epoch": 0.33, + "grad_norm": 1.9190803509802752, + "learning_rate": 7.832387899107555e-06, + "loss": 0.3076, + "step": 5233 + }, + { + "epoch": 0.33, + "grad_norm": 2.6528497273908247, + "learning_rate": 7.831548537128868e-06, + "loss": 0.327, + "step": 5234 + }, + { + "epoch": 0.33, + "grad_norm": 1.203993813077144, + "learning_rate": 7.830709057664205e-06, + "loss": 0.3281, + "step": 5235 + }, + { + "epoch": 0.33, + "grad_norm": 1.885793144761983, + "learning_rate": 7.829869460748394e-06, + "loss": 0.3215, + "step": 5236 + }, + { + "epoch": 0.33, + "grad_norm": 1.5274124736436414, + "learning_rate": 7.829029746416272e-06, + "loss": 0.3208, + "step": 5237 + }, + { + "epoch": 0.33, + "grad_norm": 0.570056472955061, + "learning_rate": 7.828189914702681e-06, + "loss": 0.4846, + "step": 5238 + }, + { + "epoch": 0.33, + "grad_norm": 2.3544901211877676, + "learning_rate": 7.82734996564247e-06, + "loss": 0.3348, + "step": 5239 + }, + { + "epoch": 0.33, + "grad_norm": 2.4647389721696236, + "learning_rate": 7.826509899270484e-06, + "loss": 0.3369, + "step": 5240 + }, + { + "epoch": 0.33, + "grad_norm": 1.73010096681408, + "learning_rate": 7.825669715621581e-06, + "loss": 0.3229, + "step": 5241 + }, + { + "epoch": 0.33, + "grad_norm": 1.9056460904486618, + "learning_rate": 7.824829414730625e-06, + "loss": 0.3263, + "step": 5242 + }, + { + "epoch": 0.33, + "grad_norm": 1.3501727826015766, + "learning_rate": 7.82398899663248e-06, + "loss": 0.3186, + "step": 5243 + }, + { + "epoch": 0.33, + "grad_norm": 3.88366306271441, + "learning_rate": 7.823148461362013e-06, + "loss": 0.3225, + "step": 5244 + }, + { + "epoch": 0.33, + "grad_norm": 1.6255590264741315, + "learning_rate": 7.822307808954102e-06, + "loss": 0.3296, + "step": 5245 + }, + { + "epoch": 0.33, + "grad_norm": 2.0484755965364156, + "learning_rate": 7.821467039443628e-06, + "loss": 0.3136, + "step": 5246 + }, + { + "epoch": 0.33, + "grad_norm": 1.480892802698961, + "learning_rate": 7.820626152865476e-06, + "loss": 0.3027, + "step": 5247 + }, + { + "epoch": 0.33, + "grad_norm": 2.1877077552477995, + "learning_rate": 7.819785149254534e-06, + "loss": 0.3071, + "step": 5248 + }, + { + "epoch": 0.33, + "grad_norm": 3.65753819552042, + "learning_rate": 7.818944028645696e-06, + "loss": 0.3295, + "step": 5249 + }, + { + "epoch": 0.33, + "grad_norm": 1.7443418408508877, + "learning_rate": 7.818102791073866e-06, + "loss": 0.3104, + "step": 5250 + }, + { + "epoch": 0.33, + "grad_norm": 2.3554998717394255, + "learning_rate": 7.817261436573944e-06, + "loss": 0.3153, + "step": 5251 + }, + { + "epoch": 0.33, + "grad_norm": 0.6195504060972558, + "learning_rate": 7.816419965180842e-06, + "loss": 0.4923, + "step": 5252 + }, + { + "epoch": 0.33, + "grad_norm": 1.6823505897589057, + "learning_rate": 7.815578376929472e-06, + "loss": 0.3216, + "step": 5253 + }, + { + "epoch": 0.33, + "grad_norm": 1.8260645558341941, + "learning_rate": 7.814736671854755e-06, + "loss": 0.3281, + "step": 5254 + }, + { + "epoch": 0.33, + "grad_norm": 2.262671085875684, + "learning_rate": 7.813894849991613e-06, + "loss": 0.3362, + "step": 5255 + }, + { + "epoch": 0.33, + "grad_norm": 2.189242209242595, + "learning_rate": 7.813052911374976e-06, + "loss": 0.3016, + "step": 5256 + }, + { + "epoch": 0.33, + "grad_norm": 2.031791925024394, + "learning_rate": 7.812210856039777e-06, + "loss": 0.3191, + "step": 5257 + }, + { + "epoch": 0.33, + "grad_norm": 2.8736459076904737, + "learning_rate": 7.811368684020955e-06, + "loss": 0.3345, + "step": 5258 + }, + { + "epoch": 0.33, + "grad_norm": 2.377015528104474, + "learning_rate": 7.810526395353453e-06, + "loss": 0.3238, + "step": 5259 + }, + { + "epoch": 0.33, + "grad_norm": 6.798601382094193, + "learning_rate": 7.809683990072217e-06, + "loss": 0.2939, + "step": 5260 + }, + { + "epoch": 0.33, + "grad_norm": 2.363927610327008, + "learning_rate": 7.808841468212204e-06, + "loss": 0.3274, + "step": 5261 + }, + { + "epoch": 0.33, + "grad_norm": 2.6227323400475644, + "learning_rate": 7.807998829808368e-06, + "loss": 0.3111, + "step": 5262 + }, + { + "epoch": 0.33, + "grad_norm": 1.5637446224382276, + "learning_rate": 7.807156074895673e-06, + "loss": 0.328, + "step": 5263 + }, + { + "epoch": 0.33, + "grad_norm": 1.846290688888176, + "learning_rate": 7.806313203509087e-06, + "loss": 0.3353, + "step": 5264 + }, + { + "epoch": 0.33, + "grad_norm": 2.078383612207726, + "learning_rate": 7.805470215683582e-06, + "loss": 0.3266, + "step": 5265 + }, + { + "epoch": 0.33, + "grad_norm": 1.979762433478405, + "learning_rate": 7.804627111454132e-06, + "loss": 0.3076, + "step": 5266 + }, + { + "epoch": 0.33, + "grad_norm": 2.660159241950354, + "learning_rate": 7.803783890855725e-06, + "loss": 0.3431, + "step": 5267 + }, + { + "epoch": 0.33, + "grad_norm": 1.6204503421514473, + "learning_rate": 7.802940553923344e-06, + "loss": 0.2973, + "step": 5268 + }, + { + "epoch": 0.33, + "grad_norm": 1.6614170714011247, + "learning_rate": 7.80209710069198e-06, + "loss": 0.3181, + "step": 5269 + }, + { + "epoch": 0.33, + "grad_norm": 2.763459680820304, + "learning_rate": 7.801253531196629e-06, + "loss": 0.3141, + "step": 5270 + }, + { + "epoch": 0.33, + "grad_norm": 1.4819628461307792, + "learning_rate": 7.800409845472294e-06, + "loss": 0.3238, + "step": 5271 + }, + { + "epoch": 0.33, + "grad_norm": 1.747752388557921, + "learning_rate": 7.799566043553983e-06, + "loss": 0.3047, + "step": 5272 + }, + { + "epoch": 0.33, + "grad_norm": 2.1211767852187933, + "learning_rate": 7.798722125476702e-06, + "loss": 0.3111, + "step": 5273 + }, + { + "epoch": 0.33, + "grad_norm": 1.5505624779238907, + "learning_rate": 7.797878091275468e-06, + "loss": 0.3108, + "step": 5274 + }, + { + "epoch": 0.33, + "grad_norm": 1.5776271307834984, + "learning_rate": 7.797033940985303e-06, + "loss": 0.3152, + "step": 5275 + }, + { + "epoch": 0.33, + "grad_norm": 1.7226888476530333, + "learning_rate": 7.796189674641232e-06, + "loss": 0.3272, + "step": 5276 + }, + { + "epoch": 0.33, + "grad_norm": 1.308974819580043, + "learning_rate": 7.795345292278284e-06, + "loss": 0.299, + "step": 5277 + }, + { + "epoch": 0.33, + "grad_norm": 2.3214760572985496, + "learning_rate": 7.794500793931496e-06, + "loss": 0.3226, + "step": 5278 + }, + { + "epoch": 0.33, + "grad_norm": 7.811168133254081, + "learning_rate": 7.793656179635907e-06, + "loss": 0.3115, + "step": 5279 + }, + { + "epoch": 0.33, + "grad_norm": 1.4014193914832929, + "learning_rate": 7.79281144942656e-06, + "loss": 0.3258, + "step": 5280 + }, + { + "epoch": 0.33, + "grad_norm": 1.3525619311131196, + "learning_rate": 7.791966603338505e-06, + "loss": 0.2961, + "step": 5281 + }, + { + "epoch": 0.33, + "grad_norm": 1.386191233696376, + "learning_rate": 7.791121641406798e-06, + "loss": 0.3316, + "step": 5282 + }, + { + "epoch": 0.33, + "grad_norm": 1.7465533301761018, + "learning_rate": 7.790276563666496e-06, + "loss": 0.3113, + "step": 5283 + }, + { + "epoch": 0.33, + "grad_norm": 1.848497159369416, + "learning_rate": 7.789431370152663e-06, + "loss": 0.3033, + "step": 5284 + }, + { + "epoch": 0.33, + "grad_norm": 2.1902861996520175, + "learning_rate": 7.788586060900366e-06, + "loss": 0.3257, + "step": 5285 + }, + { + "epoch": 0.33, + "grad_norm": 1.6632439520845537, + "learning_rate": 7.787740635944682e-06, + "loss": 0.3112, + "step": 5286 + }, + { + "epoch": 0.33, + "grad_norm": 2.109090444914576, + "learning_rate": 7.78689509532069e-06, + "loss": 0.2934, + "step": 5287 + }, + { + "epoch": 0.33, + "grad_norm": 1.5185527042333267, + "learning_rate": 7.786049439063466e-06, + "loss": 0.2903, + "step": 5288 + }, + { + "epoch": 0.33, + "grad_norm": 1.3430976374311951, + "learning_rate": 7.785203667208104e-06, + "loss": 0.3141, + "step": 5289 + }, + { + "epoch": 0.33, + "grad_norm": 1.615437410967679, + "learning_rate": 7.784357779789695e-06, + "loss": 0.3289, + "step": 5290 + }, + { + "epoch": 0.33, + "grad_norm": 1.73339053657673, + "learning_rate": 7.783511776843336e-06, + "loss": 0.3611, + "step": 5291 + }, + { + "epoch": 0.33, + "grad_norm": 2.078945198035065, + "learning_rate": 7.78266565840413e-06, + "loss": 0.325, + "step": 5292 + }, + { + "epoch": 0.33, + "grad_norm": 1.8732262119614633, + "learning_rate": 7.781819424507183e-06, + "loss": 0.3265, + "step": 5293 + }, + { + "epoch": 0.33, + "grad_norm": 1.7033490086311325, + "learning_rate": 7.780973075187606e-06, + "loss": 0.3422, + "step": 5294 + }, + { + "epoch": 0.33, + "grad_norm": 1.8274348365490694, + "learning_rate": 7.780126610480516e-06, + "loss": 0.3221, + "step": 5295 + }, + { + "epoch": 0.33, + "grad_norm": 1.921197079915513, + "learning_rate": 7.779280030421036e-06, + "loss": 0.3124, + "step": 5296 + }, + { + "epoch": 0.33, + "grad_norm": 1.3620246030895387, + "learning_rate": 7.77843333504429e-06, + "loss": 0.3154, + "step": 5297 + }, + { + "epoch": 0.33, + "grad_norm": 6.135086271585444, + "learning_rate": 7.777586524385412e-06, + "loss": 0.324, + "step": 5298 + }, + { + "epoch": 0.33, + "grad_norm": 3.0187237282935357, + "learning_rate": 7.776739598479534e-06, + "loss": 0.3084, + "step": 5299 + }, + { + "epoch": 0.33, + "grad_norm": 4.132250643424089, + "learning_rate": 7.775892557361799e-06, + "loss": 0.3285, + "step": 5300 + }, + { + "epoch": 0.33, + "grad_norm": 1.2241173953747528, + "learning_rate": 7.77504540106735e-06, + "loss": 0.329, + "step": 5301 + }, + { + "epoch": 0.33, + "grad_norm": 1.4780742936978613, + "learning_rate": 7.774198129631339e-06, + "loss": 0.32, + "step": 5302 + }, + { + "epoch": 0.33, + "grad_norm": 1.6916449616547573, + "learning_rate": 7.77335074308892e-06, + "loss": 0.3068, + "step": 5303 + }, + { + "epoch": 0.33, + "grad_norm": 3.1138490450859275, + "learning_rate": 7.772503241475253e-06, + "loss": 0.3271, + "step": 5304 + }, + { + "epoch": 0.33, + "grad_norm": 2.5593199988123474, + "learning_rate": 7.771655624825501e-06, + "loss": 0.3124, + "step": 5305 + }, + { + "epoch": 0.33, + "grad_norm": 2.1304137145738844, + "learning_rate": 7.770807893174836e-06, + "loss": 0.3112, + "step": 5306 + }, + { + "epoch": 0.33, + "grad_norm": 1.545415392976424, + "learning_rate": 7.769960046558427e-06, + "loss": 0.3149, + "step": 5307 + }, + { + "epoch": 0.33, + "grad_norm": 1.6742429253967805, + "learning_rate": 7.769112085011458e-06, + "loss": 0.3154, + "step": 5308 + }, + { + "epoch": 0.33, + "grad_norm": 2.8038429385142214, + "learning_rate": 7.76826400856911e-06, + "loss": 0.3309, + "step": 5309 + }, + { + "epoch": 0.33, + "grad_norm": 27.067237088615375, + "learning_rate": 7.76741581726657e-06, + "loss": 0.3173, + "step": 5310 + }, + { + "epoch": 0.33, + "grad_norm": 1.9502668050879342, + "learning_rate": 7.766567511139034e-06, + "loss": 0.305, + "step": 5311 + }, + { + "epoch": 0.33, + "grad_norm": 1.4164528935586955, + "learning_rate": 7.765719090221697e-06, + "loss": 0.3214, + "step": 5312 + }, + { + "epoch": 0.33, + "grad_norm": 1.6561867214571613, + "learning_rate": 7.764870554549762e-06, + "loss": 0.3193, + "step": 5313 + }, + { + "epoch": 0.33, + "grad_norm": 2.10729431809092, + "learning_rate": 7.764021904158436e-06, + "loss": 0.3334, + "step": 5314 + }, + { + "epoch": 0.33, + "grad_norm": 1.4436624384800332, + "learning_rate": 7.763173139082934e-06, + "loss": 0.3215, + "step": 5315 + }, + { + "epoch": 0.33, + "grad_norm": 1.4107756570196157, + "learning_rate": 7.762324259358469e-06, + "loss": 0.3056, + "step": 5316 + }, + { + "epoch": 0.33, + "grad_norm": 2.486026851265413, + "learning_rate": 7.761475265020265e-06, + "loss": 0.3283, + "step": 5317 + }, + { + "epoch": 0.33, + "grad_norm": 1.8007758072365392, + "learning_rate": 7.760626156103545e-06, + "loss": 0.3022, + "step": 5318 + }, + { + "epoch": 0.33, + "grad_norm": 1.3263991040429182, + "learning_rate": 7.759776932643546e-06, + "loss": 0.3285, + "step": 5319 + }, + { + "epoch": 0.33, + "grad_norm": 1.808546364182805, + "learning_rate": 7.758927594675498e-06, + "loss": 0.305, + "step": 5320 + }, + { + "epoch": 0.33, + "grad_norm": 1.7624843777258097, + "learning_rate": 7.758078142234644e-06, + "loss": 0.3198, + "step": 5321 + }, + { + "epoch": 0.33, + "grad_norm": 1.3564778264776514, + "learning_rate": 7.757228575356227e-06, + "loss": 0.3052, + "step": 5322 + }, + { + "epoch": 0.33, + "grad_norm": 1.4638086532198022, + "learning_rate": 7.756378894075503e-06, + "loss": 0.3362, + "step": 5323 + }, + { + "epoch": 0.33, + "grad_norm": 2.2779272416213177, + "learning_rate": 7.75552909842772e-06, + "loss": 0.2917, + "step": 5324 + }, + { + "epoch": 0.33, + "grad_norm": 1.3558217933412742, + "learning_rate": 7.754679188448142e-06, + "loss": 0.3017, + "step": 5325 + }, + { + "epoch": 0.33, + "grad_norm": 2.537657372578936, + "learning_rate": 7.75382916417203e-06, + "loss": 0.326, + "step": 5326 + }, + { + "epoch": 0.34, + "grad_norm": 1.2663583805097514, + "learning_rate": 7.752979025634657e-06, + "loss": 0.3155, + "step": 5327 + }, + { + "epoch": 0.34, + "grad_norm": 2.2802139646917126, + "learning_rate": 7.752128772871292e-06, + "loss": 0.3323, + "step": 5328 + }, + { + "epoch": 0.34, + "grad_norm": 3.355152512161311, + "learning_rate": 7.751278405917216e-06, + "loss": 0.3211, + "step": 5329 + }, + { + "epoch": 0.34, + "grad_norm": 1.245421577558753, + "learning_rate": 7.750427924807712e-06, + "loss": 0.3127, + "step": 5330 + }, + { + "epoch": 0.34, + "grad_norm": 1.4510547679293457, + "learning_rate": 7.74957732957807e-06, + "loss": 0.3106, + "step": 5331 + }, + { + "epoch": 0.34, + "grad_norm": 2.0675998325860987, + "learning_rate": 7.748726620263581e-06, + "loss": 0.3312, + "step": 5332 + }, + { + "epoch": 0.34, + "grad_norm": 1.368644564235117, + "learning_rate": 7.74787579689954e-06, + "loss": 0.3126, + "step": 5333 + }, + { + "epoch": 0.34, + "grad_norm": 2.8347779240013087, + "learning_rate": 7.747024859521253e-06, + "loss": 0.3666, + "step": 5334 + }, + { + "epoch": 0.34, + "grad_norm": 1.782028464072049, + "learning_rate": 7.746173808164023e-06, + "loss": 0.3212, + "step": 5335 + }, + { + "epoch": 0.34, + "grad_norm": 1.6486331249127584, + "learning_rate": 7.745322642863167e-06, + "loss": 0.3083, + "step": 5336 + }, + { + "epoch": 0.34, + "grad_norm": 1.7510337222465016, + "learning_rate": 7.744471363653996e-06, + "loss": 0.3236, + "step": 5337 + }, + { + "epoch": 0.34, + "grad_norm": 1.808508462901859, + "learning_rate": 7.743619970571834e-06, + "loss": 0.3005, + "step": 5338 + }, + { + "epoch": 0.34, + "grad_norm": 1.957146390700833, + "learning_rate": 7.742768463652007e-06, + "loss": 0.3306, + "step": 5339 + }, + { + "epoch": 0.34, + "grad_norm": 1.8095262232863547, + "learning_rate": 7.741916842929843e-06, + "loss": 0.3229, + "step": 5340 + }, + { + "epoch": 0.34, + "grad_norm": 1.8734776294503417, + "learning_rate": 7.741065108440682e-06, + "loss": 0.3192, + "step": 5341 + }, + { + "epoch": 0.34, + "grad_norm": 1.9952200392603823, + "learning_rate": 7.74021326021986e-06, + "loss": 0.3217, + "step": 5342 + }, + { + "epoch": 0.34, + "grad_norm": 1.8703840335891198, + "learning_rate": 7.739361298302721e-06, + "loss": 0.3231, + "step": 5343 + }, + { + "epoch": 0.34, + "grad_norm": 1.960761681104746, + "learning_rate": 7.738509222724617e-06, + "loss": 0.3142, + "step": 5344 + }, + { + "epoch": 0.34, + "grad_norm": 1.620839815612159, + "learning_rate": 7.7376570335209e-06, + "loss": 0.3136, + "step": 5345 + }, + { + "epoch": 0.34, + "grad_norm": 1.8388352318752739, + "learning_rate": 7.736804730726932e-06, + "loss": 0.3101, + "step": 5346 + }, + { + "epoch": 0.34, + "grad_norm": 1.5383323686602937, + "learning_rate": 7.735952314378075e-06, + "loss": 0.3097, + "step": 5347 + }, + { + "epoch": 0.34, + "grad_norm": 7.649249721364612, + "learning_rate": 7.735099784509694e-06, + "loss": 0.329, + "step": 5348 + }, + { + "epoch": 0.34, + "grad_norm": 2.7810964237810616, + "learning_rate": 7.734247141157168e-06, + "loss": 0.3559, + "step": 5349 + }, + { + "epoch": 0.34, + "grad_norm": 1.3182104693968337, + "learning_rate": 7.733394384355869e-06, + "loss": 0.3044, + "step": 5350 + }, + { + "epoch": 0.34, + "grad_norm": 1.5859054146228193, + "learning_rate": 7.732541514141182e-06, + "loss": 0.297, + "step": 5351 + }, + { + "epoch": 0.34, + "grad_norm": 3.34680833618101, + "learning_rate": 7.731688530548495e-06, + "loss": 0.2968, + "step": 5352 + }, + { + "epoch": 0.34, + "grad_norm": 1.4483123754731415, + "learning_rate": 7.7308354336132e-06, + "loss": 0.3031, + "step": 5353 + }, + { + "epoch": 0.34, + "grad_norm": 2.0288664027543004, + "learning_rate": 7.72998222337069e-06, + "loss": 0.3147, + "step": 5354 + }, + { + "epoch": 0.34, + "grad_norm": 1.8317005133311832, + "learning_rate": 7.72912889985637e-06, + "loss": 0.306, + "step": 5355 + }, + { + "epoch": 0.34, + "grad_norm": 1.4893061078897964, + "learning_rate": 7.728275463105645e-06, + "loss": 0.304, + "step": 5356 + }, + { + "epoch": 0.34, + "grad_norm": 2.4565648018620156, + "learning_rate": 7.727421913153925e-06, + "loss": 0.3262, + "step": 5357 + }, + { + "epoch": 0.34, + "grad_norm": 2.115236869711557, + "learning_rate": 7.726568250036625e-06, + "loss": 0.3133, + "step": 5358 + }, + { + "epoch": 0.34, + "grad_norm": 1.6771999997855551, + "learning_rate": 7.725714473789166e-06, + "loss": 0.336, + "step": 5359 + }, + { + "epoch": 0.34, + "grad_norm": 1.5117515210362769, + "learning_rate": 7.724860584446969e-06, + "loss": 0.2993, + "step": 5360 + }, + { + "epoch": 0.34, + "grad_norm": 1.5205162452653131, + "learning_rate": 7.724006582045468e-06, + "loss": 0.3185, + "step": 5361 + }, + { + "epoch": 0.34, + "grad_norm": 1.3642739193931954, + "learning_rate": 7.723152466620098e-06, + "loss": 0.3335, + "step": 5362 + }, + { + "epoch": 0.34, + "grad_norm": 1.9031473059944004, + "learning_rate": 7.722298238206294e-06, + "loss": 0.3343, + "step": 5363 + }, + { + "epoch": 0.34, + "grad_norm": 1.611505317540388, + "learning_rate": 7.721443896839499e-06, + "loss": 0.321, + "step": 5364 + }, + { + "epoch": 0.34, + "grad_norm": 3.1382766240312923, + "learning_rate": 7.720589442555164e-06, + "loss": 0.3062, + "step": 5365 + }, + { + "epoch": 0.34, + "grad_norm": 1.6297875963547095, + "learning_rate": 7.719734875388742e-06, + "loss": 0.3134, + "step": 5366 + }, + { + "epoch": 0.34, + "grad_norm": 1.3373581391367027, + "learning_rate": 7.718880195375686e-06, + "loss": 0.3046, + "step": 5367 + }, + { + "epoch": 0.34, + "grad_norm": 2.488188711680331, + "learning_rate": 7.718025402551464e-06, + "loss": 0.3134, + "step": 5368 + }, + { + "epoch": 0.34, + "grad_norm": 9.349829263295176, + "learning_rate": 7.71717049695154e-06, + "loss": 0.3255, + "step": 5369 + }, + { + "epoch": 0.34, + "grad_norm": 1.4576402250034397, + "learning_rate": 7.716315478611386e-06, + "loss": 0.3396, + "step": 5370 + }, + { + "epoch": 0.34, + "grad_norm": 1.3413342808191373, + "learning_rate": 7.715460347566476e-06, + "loss": 0.3198, + "step": 5371 + }, + { + "epoch": 0.34, + "grad_norm": 2.1504449548549363, + "learning_rate": 7.714605103852297e-06, + "loss": 0.335, + "step": 5372 + }, + { + "epoch": 0.34, + "grad_norm": 1.3610529802936822, + "learning_rate": 7.713749747504327e-06, + "loss": 0.3352, + "step": 5373 + }, + { + "epoch": 0.34, + "grad_norm": 1.7288968360147436, + "learning_rate": 7.712894278558063e-06, + "loss": 0.3354, + "step": 5374 + }, + { + "epoch": 0.34, + "grad_norm": 1.5781156822755555, + "learning_rate": 7.712038697048993e-06, + "loss": 0.332, + "step": 5375 + }, + { + "epoch": 0.34, + "grad_norm": 2.7727885194311908, + "learning_rate": 7.711183003012624e-06, + "loss": 0.3212, + "step": 5376 + }, + { + "epoch": 0.34, + "grad_norm": 1.8413907164823824, + "learning_rate": 7.710327196484453e-06, + "loss": 0.3069, + "step": 5377 + }, + { + "epoch": 0.34, + "grad_norm": 1.6852659524346563, + "learning_rate": 7.709471277499995e-06, + "loss": 0.317, + "step": 5378 + }, + { + "epoch": 0.34, + "grad_norm": 10.089573112753548, + "learning_rate": 7.708615246094759e-06, + "loss": 0.2965, + "step": 5379 + }, + { + "epoch": 0.34, + "grad_norm": 2.573012052285078, + "learning_rate": 7.707759102304267e-06, + "loss": 0.328, + "step": 5380 + }, + { + "epoch": 0.34, + "grad_norm": 2.1998696507379023, + "learning_rate": 7.706902846164042e-06, + "loss": 0.312, + "step": 5381 + }, + { + "epoch": 0.34, + "grad_norm": 2.3072517004354953, + "learning_rate": 7.706046477709607e-06, + "loss": 0.3219, + "step": 5382 + }, + { + "epoch": 0.34, + "grad_norm": 1.829271017570389, + "learning_rate": 7.705189996976497e-06, + "loss": 0.3117, + "step": 5383 + }, + { + "epoch": 0.34, + "grad_norm": 1.7827362078826985, + "learning_rate": 7.704333404000252e-06, + "loss": 0.3062, + "step": 5384 + }, + { + "epoch": 0.34, + "grad_norm": 2.907364631886873, + "learning_rate": 7.703476698816408e-06, + "loss": 0.3182, + "step": 5385 + }, + { + "epoch": 0.34, + "grad_norm": 1.68795382812034, + "learning_rate": 7.702619881460515e-06, + "loss": 0.3042, + "step": 5386 + }, + { + "epoch": 0.34, + "grad_norm": 0.6616793374426807, + "learning_rate": 7.701762951968121e-06, + "loss": 0.4993, + "step": 5387 + }, + { + "epoch": 0.34, + "grad_norm": 4.252617426188242, + "learning_rate": 7.700905910374786e-06, + "loss": 0.3106, + "step": 5388 + }, + { + "epoch": 0.34, + "grad_norm": 3.460615221403972, + "learning_rate": 7.700048756716066e-06, + "loss": 0.3081, + "step": 5389 + }, + { + "epoch": 0.34, + "grad_norm": 1.79653436274685, + "learning_rate": 7.699191491027527e-06, + "loss": 0.3311, + "step": 5390 + }, + { + "epoch": 0.34, + "grad_norm": 1.7241779184814705, + "learning_rate": 7.69833411334474e-06, + "loss": 0.301, + "step": 5391 + }, + { + "epoch": 0.34, + "grad_norm": 1.4955979687830763, + "learning_rate": 7.697476623703278e-06, + "loss": 0.3124, + "step": 5392 + }, + { + "epoch": 0.34, + "grad_norm": 1.6196195767461121, + "learning_rate": 7.696619022138718e-06, + "loss": 0.3263, + "step": 5393 + }, + { + "epoch": 0.34, + "grad_norm": 2.594175641596578, + "learning_rate": 7.695761308686645e-06, + "loss": 0.318, + "step": 5394 + }, + { + "epoch": 0.34, + "grad_norm": 2.267398055036615, + "learning_rate": 7.694903483382648e-06, + "loss": 0.3158, + "step": 5395 + }, + { + "epoch": 0.34, + "grad_norm": 2.85803065073527, + "learning_rate": 7.694045546262318e-06, + "loss": 0.3264, + "step": 5396 + }, + { + "epoch": 0.34, + "grad_norm": 2.8948503634483607, + "learning_rate": 7.693187497361254e-06, + "loss": 0.3358, + "step": 5397 + }, + { + "epoch": 0.34, + "grad_norm": 2.251156620044382, + "learning_rate": 7.692329336715056e-06, + "loss": 0.3063, + "step": 5398 + }, + { + "epoch": 0.34, + "grad_norm": 1.7417624228202535, + "learning_rate": 7.691471064359333e-06, + "loss": 0.3052, + "step": 5399 + }, + { + "epoch": 0.34, + "grad_norm": 1.1760982421251651, + "learning_rate": 7.690612680329694e-06, + "loss": 0.3022, + "step": 5400 + }, + { + "epoch": 0.34, + "grad_norm": 1.883751960610199, + "learning_rate": 7.689754184661757e-06, + "loss": 0.3308, + "step": 5401 + }, + { + "epoch": 0.34, + "grad_norm": 2.4214720740442326, + "learning_rate": 7.68889557739114e-06, + "loss": 0.3177, + "step": 5402 + }, + { + "epoch": 0.34, + "grad_norm": 2.165026328108929, + "learning_rate": 7.68803685855347e-06, + "loss": 0.3196, + "step": 5403 + }, + { + "epoch": 0.34, + "grad_norm": 1.4245136398666445, + "learning_rate": 7.687178028184376e-06, + "loss": 0.3203, + "step": 5404 + }, + { + "epoch": 0.34, + "grad_norm": 1.8519914167079905, + "learning_rate": 7.686319086319494e-06, + "loss": 0.3122, + "step": 5405 + }, + { + "epoch": 0.34, + "grad_norm": 9.359881063028313, + "learning_rate": 7.68546003299446e-06, + "loss": 0.32, + "step": 5406 + }, + { + "epoch": 0.34, + "grad_norm": 2.026191006986111, + "learning_rate": 7.68460086824492e-06, + "loss": 0.3046, + "step": 5407 + }, + { + "epoch": 0.34, + "grad_norm": 3.10177207340718, + "learning_rate": 7.683741592106521e-06, + "loss": 0.3249, + "step": 5408 + }, + { + "epoch": 0.34, + "grad_norm": 4.159585352314094, + "learning_rate": 7.682882204614918e-06, + "loss": 0.3246, + "step": 5409 + }, + { + "epoch": 0.34, + "grad_norm": 3.4964786417229705, + "learning_rate": 7.682022705805765e-06, + "loss": 0.3102, + "step": 5410 + }, + { + "epoch": 0.34, + "grad_norm": 6.013546214111861, + "learning_rate": 7.681163095714727e-06, + "loss": 0.309, + "step": 5411 + }, + { + "epoch": 0.34, + "grad_norm": 1.3057323594457988, + "learning_rate": 7.68030337437747e-06, + "loss": 0.3052, + "step": 5412 + }, + { + "epoch": 0.34, + "grad_norm": 1.7469025350698477, + "learning_rate": 7.679443541829665e-06, + "loss": 0.3161, + "step": 5413 + }, + { + "epoch": 0.34, + "grad_norm": 2.5623748611092583, + "learning_rate": 7.67858359810699e-06, + "loss": 0.3116, + "step": 5414 + }, + { + "epoch": 0.34, + "grad_norm": 4.571531505164328, + "learning_rate": 7.677723543245122e-06, + "loss": 0.3091, + "step": 5415 + }, + { + "epoch": 0.34, + "grad_norm": 2.9415793968538617, + "learning_rate": 7.676863377279747e-06, + "loss": 0.316, + "step": 5416 + }, + { + "epoch": 0.34, + "grad_norm": 1.6608659713305773, + "learning_rate": 7.676003100246558e-06, + "loss": 0.3273, + "step": 5417 + }, + { + "epoch": 0.34, + "grad_norm": 2.873022642099225, + "learning_rate": 7.675142712181247e-06, + "loss": 0.315, + "step": 5418 + }, + { + "epoch": 0.34, + "grad_norm": 1.545331336651048, + "learning_rate": 7.674282213119511e-06, + "loss": 0.327, + "step": 5419 + }, + { + "epoch": 0.34, + "grad_norm": 1.8345110872299504, + "learning_rate": 7.673421603097058e-06, + "loss": 0.3159, + "step": 5420 + }, + { + "epoch": 0.34, + "grad_norm": 1.6698593550026084, + "learning_rate": 7.672560882149594e-06, + "loss": 0.3215, + "step": 5421 + }, + { + "epoch": 0.34, + "grad_norm": 1.270607186862521, + "learning_rate": 7.671700050312832e-06, + "loss": 0.2988, + "step": 5422 + }, + { + "epoch": 0.34, + "grad_norm": 2.852817023142275, + "learning_rate": 7.670839107622488e-06, + "loss": 0.309, + "step": 5423 + }, + { + "epoch": 0.34, + "grad_norm": 2.437957536653853, + "learning_rate": 7.669978054114286e-06, + "loss": 0.3238, + "step": 5424 + }, + { + "epoch": 0.34, + "grad_norm": 1.6842640113997158, + "learning_rate": 7.669116889823955e-06, + "loss": 0.2997, + "step": 5425 + }, + { + "epoch": 0.34, + "grad_norm": 4.206636580338994, + "learning_rate": 7.668255614787219e-06, + "loss": 0.3039, + "step": 5426 + }, + { + "epoch": 0.34, + "grad_norm": 1.9246387885589153, + "learning_rate": 7.667394229039822e-06, + "loss": 0.3201, + "step": 5427 + }, + { + "epoch": 0.34, + "grad_norm": 1.708216086376897, + "learning_rate": 7.666532732617498e-06, + "loss": 0.3288, + "step": 5428 + }, + { + "epoch": 0.34, + "grad_norm": 6.0998039251897485, + "learning_rate": 7.665671125555996e-06, + "loss": 0.3177, + "step": 5429 + }, + { + "epoch": 0.34, + "grad_norm": 3.8879890695701977, + "learning_rate": 7.664809407891063e-06, + "loss": 0.3019, + "step": 5430 + }, + { + "epoch": 0.34, + "grad_norm": 2.5072264638777915, + "learning_rate": 7.663947579658456e-06, + "loss": 0.3139, + "step": 5431 + }, + { + "epoch": 0.34, + "grad_norm": 2.8296488345571333, + "learning_rate": 7.66308564089393e-06, + "loss": 0.3299, + "step": 5432 + }, + { + "epoch": 0.34, + "grad_norm": 3.0746004340729414, + "learning_rate": 7.662223591633254e-06, + "loss": 0.3015, + "step": 5433 + }, + { + "epoch": 0.34, + "grad_norm": 1.5580206769405773, + "learning_rate": 7.66136143191219e-06, + "loss": 0.308, + "step": 5434 + }, + { + "epoch": 0.34, + "grad_norm": 1.4659077118525965, + "learning_rate": 7.660499161766514e-06, + "loss": 0.3092, + "step": 5435 + }, + { + "epoch": 0.34, + "grad_norm": 1.6012483303825125, + "learning_rate": 7.659636781232001e-06, + "loss": 0.3118, + "step": 5436 + }, + { + "epoch": 0.34, + "grad_norm": 1.9973094192329948, + "learning_rate": 7.658774290344435e-06, + "loss": 0.316, + "step": 5437 + }, + { + "epoch": 0.34, + "grad_norm": 1.555775572637292, + "learning_rate": 7.6579116891396e-06, + "loss": 0.3129, + "step": 5438 + }, + { + "epoch": 0.34, + "grad_norm": 1.538207398097279, + "learning_rate": 7.65704897765329e-06, + "loss": 0.3021, + "step": 5439 + }, + { + "epoch": 0.34, + "grad_norm": 0.673180852549415, + "learning_rate": 7.656186155921296e-06, + "loss": 0.5251, + "step": 5440 + }, + { + "epoch": 0.34, + "grad_norm": 1.902886639712222, + "learning_rate": 7.655323223979424e-06, + "loss": 0.3129, + "step": 5441 + }, + { + "epoch": 0.34, + "grad_norm": 3.4532476838787907, + "learning_rate": 7.654460181863473e-06, + "loss": 0.2983, + "step": 5442 + }, + { + "epoch": 0.34, + "grad_norm": 2.1444827784271028, + "learning_rate": 7.653597029609254e-06, + "loss": 0.3406, + "step": 5443 + }, + { + "epoch": 0.34, + "grad_norm": 2.5583675835258517, + "learning_rate": 7.65273376725258e-06, + "loss": 0.3173, + "step": 5444 + }, + { + "epoch": 0.34, + "grad_norm": 1.3527596301442983, + "learning_rate": 7.651870394829272e-06, + "loss": 0.3132, + "step": 5445 + }, + { + "epoch": 0.34, + "grad_norm": 2.2149606730870763, + "learning_rate": 7.651006912375149e-06, + "loss": 0.3034, + "step": 5446 + }, + { + "epoch": 0.34, + "grad_norm": 1.5382464713059398, + "learning_rate": 7.650143319926044e-06, + "loss": 0.3045, + "step": 5447 + }, + { + "epoch": 0.34, + "grad_norm": 1.665601896616514, + "learning_rate": 7.649279617517782e-06, + "loss": 0.3081, + "step": 5448 + }, + { + "epoch": 0.34, + "grad_norm": 1.5132442279557612, + "learning_rate": 7.648415805186205e-06, + "loss": 0.3163, + "step": 5449 + }, + { + "epoch": 0.34, + "grad_norm": 2.677185939394891, + "learning_rate": 7.64755188296715e-06, + "loss": 0.3287, + "step": 5450 + }, + { + "epoch": 0.34, + "grad_norm": 3.492380227783077, + "learning_rate": 7.646687850896468e-06, + "loss": 0.3185, + "step": 5451 + }, + { + "epoch": 0.34, + "grad_norm": 1.8895882032413054, + "learning_rate": 7.645823709010003e-06, + "loss": 0.3086, + "step": 5452 + }, + { + "epoch": 0.34, + "grad_norm": 1.4730331216348589, + "learning_rate": 7.644959457343615e-06, + "loss": 0.3091, + "step": 5453 + }, + { + "epoch": 0.34, + "grad_norm": 1.6305290041659515, + "learning_rate": 7.644095095933159e-06, + "loss": 0.3231, + "step": 5454 + }, + { + "epoch": 0.34, + "grad_norm": 1.9441251975660379, + "learning_rate": 7.643230624814504e-06, + "loss": 0.2992, + "step": 5455 + }, + { + "epoch": 0.34, + "grad_norm": 1.8346439660799356, + "learning_rate": 7.642366044023513e-06, + "loss": 0.3424, + "step": 5456 + }, + { + "epoch": 0.34, + "grad_norm": 2.5807228655220897, + "learning_rate": 7.641501353596063e-06, + "loss": 0.3012, + "step": 5457 + }, + { + "epoch": 0.34, + "grad_norm": 0.646570700792608, + "learning_rate": 7.640636553568028e-06, + "loss": 0.4817, + "step": 5458 + }, + { + "epoch": 0.34, + "grad_norm": 1.4828639952335128, + "learning_rate": 7.639771643975294e-06, + "loss": 0.2872, + "step": 5459 + }, + { + "epoch": 0.34, + "grad_norm": 1.608629996755941, + "learning_rate": 7.638906624853744e-06, + "loss": 0.3362, + "step": 5460 + }, + { + "epoch": 0.34, + "grad_norm": 1.3528149154625222, + "learning_rate": 7.638041496239273e-06, + "loss": 0.3083, + "step": 5461 + }, + { + "epoch": 0.34, + "grad_norm": 1.3575733824670244, + "learning_rate": 7.637176258167773e-06, + "loss": 0.3014, + "step": 5462 + }, + { + "epoch": 0.34, + "grad_norm": 1.8014531298328489, + "learning_rate": 7.636310910675146e-06, + "loss": 0.3332, + "step": 5463 + }, + { + "epoch": 0.34, + "grad_norm": 6.9861518497379675, + "learning_rate": 7.635445453797296e-06, + "loss": 0.3078, + "step": 5464 + }, + { + "epoch": 0.34, + "grad_norm": 1.6201889026965566, + "learning_rate": 7.634579887570135e-06, + "loss": 0.3286, + "step": 5465 + }, + { + "epoch": 0.34, + "grad_norm": 1.8175134249410336, + "learning_rate": 7.633714212029575e-06, + "loss": 0.308, + "step": 5466 + }, + { + "epoch": 0.34, + "grad_norm": 1.5878357263015503, + "learning_rate": 7.632848427211533e-06, + "loss": 0.3027, + "step": 5467 + }, + { + "epoch": 0.34, + "grad_norm": 2.030875584750693, + "learning_rate": 7.631982533151934e-06, + "loss": 0.3293, + "step": 5468 + }, + { + "epoch": 0.34, + "grad_norm": 1.93191629823843, + "learning_rate": 7.631116529886704e-06, + "loss": 0.3043, + "step": 5469 + }, + { + "epoch": 0.34, + "grad_norm": 2.165738276276089, + "learning_rate": 7.630250417451777e-06, + "loss": 0.3533, + "step": 5470 + }, + { + "epoch": 0.34, + "grad_norm": 2.0068217250358273, + "learning_rate": 7.6293841958830874e-06, + "loss": 0.3192, + "step": 5471 + }, + { + "epoch": 0.34, + "grad_norm": 55.71888785457274, + "learning_rate": 7.6285178652165785e-06, + "loss": 0.2905, + "step": 5472 + }, + { + "epoch": 0.34, + "grad_norm": 1.2722623000258324, + "learning_rate": 7.627651425488193e-06, + "loss": 0.3256, + "step": 5473 + }, + { + "epoch": 0.34, + "grad_norm": 3.124205261884713, + "learning_rate": 7.626784876733884e-06, + "loss": 0.307, + "step": 5474 + }, + { + "epoch": 0.34, + "grad_norm": 1.2024345777044216, + "learning_rate": 7.625918218989603e-06, + "loss": 0.3101, + "step": 5475 + }, + { + "epoch": 0.34, + "grad_norm": 1.5904398430538602, + "learning_rate": 7.625051452291312e-06, + "loss": 0.2959, + "step": 5476 + }, + { + "epoch": 0.34, + "grad_norm": 1.9110644815770113, + "learning_rate": 7.624184576674973e-06, + "loss": 0.304, + "step": 5477 + }, + { + "epoch": 0.34, + "grad_norm": 2.974369373027941, + "learning_rate": 7.6233175921765545e-06, + "loss": 0.3172, + "step": 5478 + }, + { + "epoch": 0.34, + "grad_norm": 1.5451315928813225, + "learning_rate": 7.622450498832031e-06, + "loss": 0.3024, + "step": 5479 + }, + { + "epoch": 0.34, + "grad_norm": 2.039741423737045, + "learning_rate": 7.621583296677377e-06, + "loss": 0.3058, + "step": 5480 + }, + { + "epoch": 0.34, + "grad_norm": 3.4226758284193703, + "learning_rate": 7.620715985748578e-06, + "loss": 0.3276, + "step": 5481 + }, + { + "epoch": 0.34, + "grad_norm": 2.6398475391365492, + "learning_rate": 7.619848566081615e-06, + "loss": 0.3309, + "step": 5482 + }, + { + "epoch": 0.34, + "grad_norm": 2.706618561917322, + "learning_rate": 7.6189810377124826e-06, + "loss": 0.3284, + "step": 5483 + }, + { + "epoch": 0.34, + "grad_norm": 2.5754415149307848, + "learning_rate": 7.618113400677176e-06, + "loss": 0.3229, + "step": 5484 + }, + { + "epoch": 0.34, + "grad_norm": 2.194172050701368, + "learning_rate": 7.617245655011694e-06, + "loss": 0.3234, + "step": 5485 + }, + { + "epoch": 0.35, + "grad_norm": 2.050477147220965, + "learning_rate": 7.6163778007520395e-06, + "loss": 0.3344, + "step": 5486 + }, + { + "epoch": 0.35, + "grad_norm": 0.6299172795638901, + "learning_rate": 7.615509837934224e-06, + "loss": 0.5109, + "step": 5487 + }, + { + "epoch": 0.35, + "grad_norm": 2.237847129493219, + "learning_rate": 7.61464176659426e-06, + "loss": 0.318, + "step": 5488 + }, + { + "epoch": 0.35, + "grad_norm": 14.729576606015852, + "learning_rate": 7.6137735867681635e-06, + "loss": 0.3284, + "step": 5489 + }, + { + "epoch": 0.35, + "grad_norm": 1.9396008089136665, + "learning_rate": 7.612905298491958e-06, + "loss": 0.308, + "step": 5490 + }, + { + "epoch": 0.35, + "grad_norm": 2.0854074033250587, + "learning_rate": 7.6120369018016735e-06, + "loss": 0.3206, + "step": 5491 + }, + { + "epoch": 0.35, + "grad_norm": 1.697844250934911, + "learning_rate": 7.611168396733336e-06, + "loss": 0.3042, + "step": 5492 + }, + { + "epoch": 0.35, + "grad_norm": 2.2836817591059266, + "learning_rate": 7.610299783322984e-06, + "loss": 0.3369, + "step": 5493 + }, + { + "epoch": 0.35, + "grad_norm": 1.9248168667966188, + "learning_rate": 7.609431061606658e-06, + "loss": 0.3085, + "step": 5494 + }, + { + "epoch": 0.35, + "grad_norm": 1.5950846935985723, + "learning_rate": 7.6085622316204036e-06, + "loss": 0.3099, + "step": 5495 + }, + { + "epoch": 0.35, + "grad_norm": 1.7311641935530067, + "learning_rate": 7.607693293400266e-06, + "loss": 0.3181, + "step": 5496 + }, + { + "epoch": 0.35, + "grad_norm": 2.14941056256365, + "learning_rate": 7.606824246982305e-06, + "loss": 0.3021, + "step": 5497 + }, + { + "epoch": 0.35, + "grad_norm": 1.4293199970961823, + "learning_rate": 7.6059550924025726e-06, + "loss": 0.3105, + "step": 5498 + }, + { + "epoch": 0.35, + "grad_norm": 2.303225757039219, + "learning_rate": 7.605085829697139e-06, + "loss": 0.2994, + "step": 5499 + }, + { + "epoch": 0.35, + "grad_norm": 1.6735844612911548, + "learning_rate": 7.604216458902063e-06, + "loss": 0.3008, + "step": 5500 + }, + { + "epoch": 0.35, + "grad_norm": 2.401747720583864, + "learning_rate": 7.6033469800534235e-06, + "loss": 0.3203, + "step": 5501 + }, + { + "epoch": 0.35, + "grad_norm": 2.0383037151347123, + "learning_rate": 7.602477393187291e-06, + "loss": 0.3148, + "step": 5502 + }, + { + "epoch": 0.35, + "grad_norm": 1.922985791964369, + "learning_rate": 7.601607698339752e-06, + "loss": 0.3433, + "step": 5503 + }, + { + "epoch": 0.35, + "grad_norm": 3.9108365082688668, + "learning_rate": 7.600737895546886e-06, + "loss": 0.3222, + "step": 5504 + }, + { + "epoch": 0.35, + "grad_norm": 3.1092126191259695, + "learning_rate": 7.599867984844786e-06, + "loss": 0.3178, + "step": 5505 + }, + { + "epoch": 0.35, + "grad_norm": 1.5546278203591173, + "learning_rate": 7.598997966269545e-06, + "loss": 0.316, + "step": 5506 + }, + { + "epoch": 0.35, + "grad_norm": 1.8597113984338176, + "learning_rate": 7.5981278398572634e-06, + "loss": 0.3008, + "step": 5507 + }, + { + "epoch": 0.35, + "grad_norm": 1.3142683648737583, + "learning_rate": 7.597257605644042e-06, + "loss": 0.3095, + "step": 5508 + }, + { + "epoch": 0.35, + "grad_norm": 1.85963895739769, + "learning_rate": 7.59638726366599e-06, + "loss": 0.3422, + "step": 5509 + }, + { + "epoch": 0.35, + "grad_norm": 0.724941260177425, + "learning_rate": 7.595516813959218e-06, + "loss": 0.4937, + "step": 5510 + }, + { + "epoch": 0.35, + "grad_norm": 1.7242903690271727, + "learning_rate": 7.594646256559843e-06, + "loss": 0.3016, + "step": 5511 + }, + { + "epoch": 0.35, + "grad_norm": 3.0813544568681692, + "learning_rate": 7.593775591503986e-06, + "loss": 0.3284, + "step": 5512 + }, + { + "epoch": 0.35, + "grad_norm": 1.8868391929230388, + "learning_rate": 7.592904818827774e-06, + "loss": 0.3198, + "step": 5513 + }, + { + "epoch": 0.35, + "grad_norm": 2.37117999579112, + "learning_rate": 7.592033938567335e-06, + "loss": 0.3097, + "step": 5514 + }, + { + "epoch": 0.35, + "grad_norm": 1.5172549935797623, + "learning_rate": 7.591162950758803e-06, + "loss": 0.304, + "step": 5515 + }, + { + "epoch": 0.35, + "grad_norm": 4.1622814448229715, + "learning_rate": 7.590291855438319e-06, + "loss": 0.3181, + "step": 5516 + }, + { + "epoch": 0.35, + "grad_norm": 1.976591741928131, + "learning_rate": 7.589420652642025e-06, + "loss": 0.2925, + "step": 5517 + }, + { + "epoch": 0.35, + "grad_norm": 1.8019794737536576, + "learning_rate": 7.588549342406068e-06, + "loss": 0.3089, + "step": 5518 + }, + { + "epoch": 0.35, + "grad_norm": 1.9974672229337505, + "learning_rate": 7.587677924766601e-06, + "loss": 0.313, + "step": 5519 + }, + { + "epoch": 0.35, + "grad_norm": 1.6339012014344183, + "learning_rate": 7.586806399759781e-06, + "loss": 0.3142, + "step": 5520 + }, + { + "epoch": 0.35, + "grad_norm": 3.2350736606527395, + "learning_rate": 7.58593476742177e-06, + "loss": 0.3176, + "step": 5521 + }, + { + "epoch": 0.35, + "grad_norm": 5.936643097261422, + "learning_rate": 7.58506302778873e-06, + "loss": 0.3138, + "step": 5522 + }, + { + "epoch": 0.35, + "grad_norm": 1.3658792722133504, + "learning_rate": 7.584191180896835e-06, + "loss": 0.3223, + "step": 5523 + }, + { + "epoch": 0.35, + "grad_norm": 1.8727397687703295, + "learning_rate": 7.583319226782258e-06, + "loss": 0.3138, + "step": 5524 + }, + { + "epoch": 0.35, + "grad_norm": 1.7414055833013713, + "learning_rate": 7.582447165481177e-06, + "loss": 0.2938, + "step": 5525 + }, + { + "epoch": 0.35, + "grad_norm": 2.6510634094253187, + "learning_rate": 7.581574997029777e-06, + "loss": 0.3133, + "step": 5526 + }, + { + "epoch": 0.35, + "grad_norm": 1.4435014538369866, + "learning_rate": 7.580702721464244e-06, + "loss": 0.321, + "step": 5527 + }, + { + "epoch": 0.35, + "grad_norm": 2.090746458260443, + "learning_rate": 7.57983033882077e-06, + "loss": 0.3228, + "step": 5528 + }, + { + "epoch": 0.35, + "grad_norm": 1.309977355524843, + "learning_rate": 7.578957849135555e-06, + "loss": 0.3054, + "step": 5529 + }, + { + "epoch": 0.35, + "grad_norm": 0.6793297113210444, + "learning_rate": 7.578085252444798e-06, + "loss": 0.5371, + "step": 5530 + }, + { + "epoch": 0.35, + "grad_norm": 2.109829442603735, + "learning_rate": 7.577212548784705e-06, + "loss": 0.3139, + "step": 5531 + }, + { + "epoch": 0.35, + "grad_norm": 4.731572911362363, + "learning_rate": 7.576339738191485e-06, + "loss": 0.3142, + "step": 5532 + }, + { + "epoch": 0.35, + "grad_norm": 1.9886255444932028, + "learning_rate": 7.5754668207013535e-06, + "loss": 0.3152, + "step": 5533 + }, + { + "epoch": 0.35, + "grad_norm": 3.0076796278792806, + "learning_rate": 7.574593796350527e-06, + "loss": 0.2962, + "step": 5534 + }, + { + "epoch": 0.35, + "grad_norm": 1.511482315572893, + "learning_rate": 7.573720665175234e-06, + "loss": 0.3052, + "step": 5535 + }, + { + "epoch": 0.35, + "grad_norm": 2.816292089019836, + "learning_rate": 7.572847427211698e-06, + "loss": 0.335, + "step": 5536 + }, + { + "epoch": 0.35, + "grad_norm": 1.9239117791983056, + "learning_rate": 7.5719740824961516e-06, + "loss": 0.3081, + "step": 5537 + }, + { + "epoch": 0.35, + "grad_norm": 1.679967083566104, + "learning_rate": 7.5711006310648316e-06, + "loss": 0.3028, + "step": 5538 + }, + { + "epoch": 0.35, + "grad_norm": 2.576149825720993, + "learning_rate": 7.570227072953982e-06, + "loss": 0.306, + "step": 5539 + }, + { + "epoch": 0.35, + "grad_norm": 1.592116405588844, + "learning_rate": 7.569353408199844e-06, + "loss": 0.3108, + "step": 5540 + }, + { + "epoch": 0.35, + "grad_norm": 1.5437412534820862, + "learning_rate": 7.568479636838672e-06, + "loss": 0.2998, + "step": 5541 + }, + { + "epoch": 0.35, + "grad_norm": 1.7713488575365337, + "learning_rate": 7.567605758906714e-06, + "loss": 0.3167, + "step": 5542 + }, + { + "epoch": 0.35, + "grad_norm": 3.2016171579570147, + "learning_rate": 7.5667317744402345e-06, + "loss": 0.3175, + "step": 5543 + }, + { + "epoch": 0.35, + "grad_norm": 2.0686350989662605, + "learning_rate": 7.565857683475495e-06, + "loss": 0.3285, + "step": 5544 + }, + { + "epoch": 0.35, + "grad_norm": 1.7541404676501888, + "learning_rate": 7.564983486048763e-06, + "loss": 0.3258, + "step": 5545 + }, + { + "epoch": 0.35, + "grad_norm": 2.637809120824183, + "learning_rate": 7.564109182196309e-06, + "loss": 0.304, + "step": 5546 + }, + { + "epoch": 0.35, + "grad_norm": 2.065564160982038, + "learning_rate": 7.563234771954411e-06, + "loss": 0.2936, + "step": 5547 + }, + { + "epoch": 0.35, + "grad_norm": 5.124177000960691, + "learning_rate": 7.562360255359349e-06, + "loss": 0.3253, + "step": 5548 + }, + { + "epoch": 0.35, + "grad_norm": 1.836370184152582, + "learning_rate": 7.5614856324474095e-06, + "loss": 0.3187, + "step": 5549 + }, + { + "epoch": 0.35, + "grad_norm": 3.2304473261043967, + "learning_rate": 7.560610903254881e-06, + "loss": 0.3165, + "step": 5550 + }, + { + "epoch": 0.35, + "grad_norm": 2.211099580150378, + "learning_rate": 7.55973606781806e-06, + "loss": 0.2984, + "step": 5551 + }, + { + "epoch": 0.35, + "grad_norm": 10.106253574896783, + "learning_rate": 7.558861126173241e-06, + "loss": 0.3222, + "step": 5552 + }, + { + "epoch": 0.35, + "grad_norm": 4.43029548593907, + "learning_rate": 7.557986078356727e-06, + "loss": 0.3309, + "step": 5553 + }, + { + "epoch": 0.35, + "grad_norm": 2.4655876812264395, + "learning_rate": 7.557110924404829e-06, + "loss": 0.3112, + "step": 5554 + }, + { + "epoch": 0.35, + "grad_norm": 2.5698564984498824, + "learning_rate": 7.5562356643538586e-06, + "loss": 0.3449, + "step": 5555 + }, + { + "epoch": 0.35, + "grad_norm": 1.8906463936232092, + "learning_rate": 7.555360298240128e-06, + "loss": 0.3077, + "step": 5556 + }, + { + "epoch": 0.35, + "grad_norm": 1.5490763519009287, + "learning_rate": 7.554484826099959e-06, + "loss": 0.3296, + "step": 5557 + }, + { + "epoch": 0.35, + "grad_norm": 0.6689562673367422, + "learning_rate": 7.55360924796968e-06, + "loss": 0.5084, + "step": 5558 + }, + { + "epoch": 0.35, + "grad_norm": 1.4674652561581336, + "learning_rate": 7.552733563885616e-06, + "loss": 0.3063, + "step": 5559 + }, + { + "epoch": 0.35, + "grad_norm": 2.1978635073512747, + "learning_rate": 7.551857773884101e-06, + "loss": 0.3116, + "step": 5560 + }, + { + "epoch": 0.35, + "grad_norm": 1.4731960314675723, + "learning_rate": 7.550981878001476e-06, + "loss": 0.3118, + "step": 5561 + }, + { + "epoch": 0.35, + "grad_norm": 2.4934639668928678, + "learning_rate": 7.550105876274082e-06, + "loss": 0.299, + "step": 5562 + }, + { + "epoch": 0.35, + "grad_norm": 2.4704119077333533, + "learning_rate": 7.549229768738266e-06, + "loss": 0.3229, + "step": 5563 + }, + { + "epoch": 0.35, + "grad_norm": 1.678290582969975, + "learning_rate": 7.548353555430378e-06, + "loss": 0.3035, + "step": 5564 + }, + { + "epoch": 0.35, + "grad_norm": 2.050749009636137, + "learning_rate": 7.547477236386774e-06, + "loss": 0.3078, + "step": 5565 + }, + { + "epoch": 0.35, + "grad_norm": 2.146993060143408, + "learning_rate": 7.546600811643816e-06, + "loss": 0.3065, + "step": 5566 + }, + { + "epoch": 0.35, + "grad_norm": 0.5933902958493147, + "learning_rate": 7.545724281237867e-06, + "loss": 0.5215, + "step": 5567 + }, + { + "epoch": 0.35, + "grad_norm": 6.452429749055142, + "learning_rate": 7.5448476452052955e-06, + "loss": 0.3335, + "step": 5568 + }, + { + "epoch": 0.35, + "grad_norm": 2.1746540608976783, + "learning_rate": 7.543970903582473e-06, + "loss": 0.3232, + "step": 5569 + }, + { + "epoch": 0.35, + "grad_norm": 2.5632182117070235, + "learning_rate": 7.543094056405782e-06, + "loss": 0.3209, + "step": 5570 + }, + { + "epoch": 0.35, + "grad_norm": 1.9638717159951005, + "learning_rate": 7.542217103711601e-06, + "loss": 0.3121, + "step": 5571 + }, + { + "epoch": 0.35, + "grad_norm": 2.144578477203159, + "learning_rate": 7.5413400455363154e-06, + "loss": 0.3242, + "step": 5572 + }, + { + "epoch": 0.35, + "grad_norm": 3.298585420131417, + "learning_rate": 7.54046288191632e-06, + "loss": 0.3155, + "step": 5573 + }, + { + "epoch": 0.35, + "grad_norm": 1.6620275982754027, + "learning_rate": 7.5395856128880045e-06, + "loss": 0.3326, + "step": 5574 + }, + { + "epoch": 0.35, + "grad_norm": 4.614520449225384, + "learning_rate": 7.5387082384877715e-06, + "loss": 0.3193, + "step": 5575 + }, + { + "epoch": 0.35, + "grad_norm": 0.6048965548854468, + "learning_rate": 7.537830758752025e-06, + "loss": 0.5238, + "step": 5576 + }, + { + "epoch": 0.35, + "grad_norm": 3.22001852217234, + "learning_rate": 7.536953173717174e-06, + "loss": 0.3176, + "step": 5577 + }, + { + "epoch": 0.35, + "grad_norm": 1.3787820806039615, + "learning_rate": 7.5360754834196284e-06, + "loss": 0.301, + "step": 5578 + }, + { + "epoch": 0.35, + "grad_norm": 2.234464955645964, + "learning_rate": 7.535197687895805e-06, + "loss": 0.3451, + "step": 5579 + }, + { + "epoch": 0.35, + "grad_norm": 1.5321716541892436, + "learning_rate": 7.53431978718213e-06, + "loss": 0.3164, + "step": 5580 + }, + { + "epoch": 0.35, + "grad_norm": 2.646921846462847, + "learning_rate": 7.533441781315023e-06, + "loss": 0.3161, + "step": 5581 + }, + { + "epoch": 0.35, + "grad_norm": 1.8979577692014298, + "learning_rate": 7.5325636703309165e-06, + "loss": 0.3135, + "step": 5582 + }, + { + "epoch": 0.35, + "grad_norm": 2.1137255471128475, + "learning_rate": 7.531685454266246e-06, + "loss": 0.3035, + "step": 5583 + }, + { + "epoch": 0.35, + "grad_norm": 1.5792122713511922, + "learning_rate": 7.5308071331574485e-06, + "loss": 0.2999, + "step": 5584 + }, + { + "epoch": 0.35, + "grad_norm": 3.5997296727761863, + "learning_rate": 7.529928707040969e-06, + "loss": 0.3255, + "step": 5585 + }, + { + "epoch": 0.35, + "grad_norm": 1.950600940282619, + "learning_rate": 7.529050175953253e-06, + "loss": 0.3175, + "step": 5586 + }, + { + "epoch": 0.35, + "grad_norm": 1.6416121319325168, + "learning_rate": 7.528171539930753e-06, + "loss": 0.3453, + "step": 5587 + }, + { + "epoch": 0.35, + "grad_norm": 0.6169929566759427, + "learning_rate": 7.527292799009924e-06, + "loss": 0.5057, + "step": 5588 + }, + { + "epoch": 0.35, + "grad_norm": 3.5772616025881065, + "learning_rate": 7.526413953227231e-06, + "loss": 0.3108, + "step": 5589 + }, + { + "epoch": 0.35, + "grad_norm": 1.3937024867886079, + "learning_rate": 7.5255350026191336e-06, + "loss": 0.3059, + "step": 5590 + }, + { + "epoch": 0.35, + "grad_norm": 2.7036393897636453, + "learning_rate": 7.524655947222104e-06, + "loss": 0.2912, + "step": 5591 + }, + { + "epoch": 0.35, + "grad_norm": 5.303718523195396, + "learning_rate": 7.523776787072613e-06, + "loss": 0.3108, + "step": 5592 + }, + { + "epoch": 0.35, + "grad_norm": 1.4651604061668437, + "learning_rate": 7.5228975222071425e-06, + "loss": 0.2991, + "step": 5593 + }, + { + "epoch": 0.35, + "grad_norm": 1.744966095349305, + "learning_rate": 7.522018152662171e-06, + "loss": 0.3381, + "step": 5594 + }, + { + "epoch": 0.35, + "grad_norm": 1.9847177841081725, + "learning_rate": 7.521138678474187e-06, + "loss": 0.3055, + "step": 5595 + }, + { + "epoch": 0.35, + "grad_norm": 2.1188786865678044, + "learning_rate": 7.520259099679681e-06, + "loss": 0.3004, + "step": 5596 + }, + { + "epoch": 0.35, + "grad_norm": 1.8088717725026118, + "learning_rate": 7.5193794163151516e-06, + "loss": 0.3149, + "step": 5597 + }, + { + "epoch": 0.35, + "grad_norm": 2.0462386040246234, + "learning_rate": 7.518499628417091e-06, + "loss": 0.2974, + "step": 5598 + }, + { + "epoch": 0.35, + "grad_norm": 1.5040063755350557, + "learning_rate": 7.5176197360220096e-06, + "loss": 0.3011, + "step": 5599 + }, + { + "epoch": 0.35, + "grad_norm": 3.3183981886848737, + "learning_rate": 7.516739739166412e-06, + "loss": 0.3122, + "step": 5600 + }, + { + "epoch": 0.35, + "grad_norm": 1.9674896556573818, + "learning_rate": 7.515859637886814e-06, + "loss": 0.3014, + "step": 5601 + }, + { + "epoch": 0.35, + "grad_norm": 1.7152203946403568, + "learning_rate": 7.514979432219731e-06, + "loss": 0.3145, + "step": 5602 + }, + { + "epoch": 0.35, + "grad_norm": 3.403720004356182, + "learning_rate": 7.514099122201683e-06, + "loss": 0.3127, + "step": 5603 + }, + { + "epoch": 0.35, + "grad_norm": 1.7211492577570522, + "learning_rate": 7.513218707869199e-06, + "loss": 0.3112, + "step": 5604 + }, + { + "epoch": 0.35, + "grad_norm": 0.6104140208393277, + "learning_rate": 7.512338189258805e-06, + "loss": 0.4887, + "step": 5605 + }, + { + "epoch": 0.35, + "grad_norm": 1.5343344536735042, + "learning_rate": 7.511457566407038e-06, + "loss": 0.3113, + "step": 5606 + }, + { + "epoch": 0.35, + "grad_norm": 1.8576184789734569, + "learning_rate": 7.510576839350435e-06, + "loss": 0.3208, + "step": 5607 + }, + { + "epoch": 0.35, + "grad_norm": 2.7988619285075296, + "learning_rate": 7.50969600812554e-06, + "loss": 0.312, + "step": 5608 + }, + { + "epoch": 0.35, + "grad_norm": 2.1524713125801425, + "learning_rate": 7.5088150727689006e-06, + "loss": 0.2999, + "step": 5609 + }, + { + "epoch": 0.35, + "grad_norm": 1.8278447817509023, + "learning_rate": 7.5079340333170654e-06, + "loss": 0.3255, + "step": 5610 + }, + { + "epoch": 0.35, + "grad_norm": 1.4991055701569922, + "learning_rate": 7.507052889806595e-06, + "loss": 0.3143, + "step": 5611 + }, + { + "epoch": 0.35, + "grad_norm": 1.6456788524314352, + "learning_rate": 7.506171642274046e-06, + "loss": 0.3322, + "step": 5612 + }, + { + "epoch": 0.35, + "grad_norm": 1.9083932346937371, + "learning_rate": 7.505290290755985e-06, + "loss": 0.2952, + "step": 5613 + }, + { + "epoch": 0.35, + "grad_norm": 1.6959065724865343, + "learning_rate": 7.5044088352889785e-06, + "loss": 0.3051, + "step": 5614 + }, + { + "epoch": 0.35, + "grad_norm": 1.5812591366602096, + "learning_rate": 7.503527275909603e-06, + "loss": 0.3112, + "step": 5615 + }, + { + "epoch": 0.35, + "grad_norm": 1.4145626855077336, + "learning_rate": 7.502645612654432e-06, + "loss": 0.2984, + "step": 5616 + }, + { + "epoch": 0.35, + "grad_norm": 1.5959014876694964, + "learning_rate": 7.5017638455600505e-06, + "loss": 0.3104, + "step": 5617 + }, + { + "epoch": 0.35, + "grad_norm": 1.9324527414534696, + "learning_rate": 7.5008819746630435e-06, + "loss": 0.2996, + "step": 5618 + }, + { + "epoch": 0.35, + "grad_norm": 1.672622208377859, + "learning_rate": 7.500000000000001e-06, + "loss": 0.2966, + "step": 5619 + }, + { + "epoch": 0.35, + "grad_norm": 1.5514313095791672, + "learning_rate": 7.499117921607518e-06, + "loss": 0.3099, + "step": 5620 + }, + { + "epoch": 0.35, + "grad_norm": 1.8434493314240692, + "learning_rate": 7.498235739522193e-06, + "loss": 0.3271, + "step": 5621 + }, + { + "epoch": 0.35, + "grad_norm": 1.674892198816176, + "learning_rate": 7.497353453780631e-06, + "loss": 0.2908, + "step": 5622 + }, + { + "epoch": 0.35, + "grad_norm": 3.470732541528881, + "learning_rate": 7.496471064419437e-06, + "loss": 0.3165, + "step": 5623 + }, + { + "epoch": 0.35, + "grad_norm": 1.469813071897201, + "learning_rate": 7.495588571475225e-06, + "loss": 0.3097, + "step": 5624 + }, + { + "epoch": 0.35, + "grad_norm": 2.559372444624539, + "learning_rate": 7.494705974984611e-06, + "loss": 0.3126, + "step": 5625 + }, + { + "epoch": 0.35, + "grad_norm": 1.4119806716099852, + "learning_rate": 7.493823274984215e-06, + "loss": 0.3063, + "step": 5626 + }, + { + "epoch": 0.35, + "grad_norm": 6.228598150709018, + "learning_rate": 7.492940471510662e-06, + "loss": 0.2916, + "step": 5627 + }, + { + "epoch": 0.35, + "grad_norm": 1.987139959495272, + "learning_rate": 7.492057564600579e-06, + "loss": 0.3042, + "step": 5628 + }, + { + "epoch": 0.35, + "grad_norm": 1.9055052025674284, + "learning_rate": 7.491174554290602e-06, + "loss": 0.3166, + "step": 5629 + }, + { + "epoch": 0.35, + "grad_norm": 1.3521912386910904, + "learning_rate": 7.490291440617369e-06, + "loss": 0.3072, + "step": 5630 + }, + { + "epoch": 0.35, + "grad_norm": 1.9339713899719875, + "learning_rate": 7.489408223617521e-06, + "loss": 0.3212, + "step": 5631 + }, + { + "epoch": 0.35, + "grad_norm": 1.2991312285805725, + "learning_rate": 7.488524903327703e-06, + "loss": 0.3222, + "step": 5632 + }, + { + "epoch": 0.35, + "grad_norm": 1.5746372787736225, + "learning_rate": 7.487641479784566e-06, + "loss": 0.3023, + "step": 5633 + }, + { + "epoch": 0.35, + "grad_norm": 2.046394299527198, + "learning_rate": 7.486757953024767e-06, + "loss": 0.3154, + "step": 5634 + }, + { + "epoch": 0.35, + "grad_norm": 2.5720029238898356, + "learning_rate": 7.485874323084963e-06, + "loss": 0.3192, + "step": 5635 + }, + { + "epoch": 0.35, + "grad_norm": 2.1660435220907193, + "learning_rate": 7.484990590001818e-06, + "loss": 0.3327, + "step": 5636 + }, + { + "epoch": 0.35, + "grad_norm": 1.6374788207877375, + "learning_rate": 7.484106753811999e-06, + "loss": 0.3035, + "step": 5637 + }, + { + "epoch": 0.35, + "grad_norm": 2.6121513654484185, + "learning_rate": 7.4832228145521805e-06, + "loss": 0.3076, + "step": 5638 + }, + { + "epoch": 0.35, + "grad_norm": 1.380066003424248, + "learning_rate": 7.4823387722590345e-06, + "loss": 0.3108, + "step": 5639 + }, + { + "epoch": 0.35, + "grad_norm": 4.200349463868485, + "learning_rate": 7.481454626969244e-06, + "loss": 0.3084, + "step": 5640 + }, + { + "epoch": 0.35, + "grad_norm": 13.807919801223866, + "learning_rate": 7.480570378719494e-06, + "loss": 0.3304, + "step": 5641 + }, + { + "epoch": 0.35, + "grad_norm": 1.7027573751488092, + "learning_rate": 7.4796860275464734e-06, + "loss": 0.3271, + "step": 5642 + }, + { + "epoch": 0.35, + "grad_norm": 1.9935426564123864, + "learning_rate": 7.4788015734868745e-06, + "loss": 0.3291, + "step": 5643 + }, + { + "epoch": 0.35, + "grad_norm": 3.1901492399519804, + "learning_rate": 7.477917016577396e-06, + "loss": 0.3183, + "step": 5644 + }, + { + "epoch": 0.36, + "grad_norm": 1.5598227356088434, + "learning_rate": 7.477032356854739e-06, + "loss": 0.3284, + "step": 5645 + }, + { + "epoch": 0.36, + "grad_norm": 1.5607753249772587, + "learning_rate": 7.47614759435561e-06, + "loss": 0.3168, + "step": 5646 + }, + { + "epoch": 0.36, + "grad_norm": 3.2946975217902708, + "learning_rate": 7.475262729116718e-06, + "loss": 0.3079, + "step": 5647 + }, + { + "epoch": 0.36, + "grad_norm": 2.1161331653257256, + "learning_rate": 7.47437776117478e-06, + "loss": 0.3147, + "step": 5648 + }, + { + "epoch": 0.36, + "grad_norm": 1.567136292200706, + "learning_rate": 7.473492690566513e-06, + "loss": 0.3097, + "step": 5649 + }, + { + "epoch": 0.36, + "grad_norm": 1.6040788197915585, + "learning_rate": 7.472607517328641e-06, + "loss": 0.3063, + "step": 5650 + }, + { + "epoch": 0.36, + "grad_norm": 7.251851878767218, + "learning_rate": 7.4717222414978915e-06, + "loss": 0.319, + "step": 5651 + }, + { + "epoch": 0.36, + "grad_norm": 2.1348626804101776, + "learning_rate": 7.470836863110996e-06, + "loss": 0.3091, + "step": 5652 + }, + { + "epoch": 0.36, + "grad_norm": 2.685702944543895, + "learning_rate": 7.46995138220469e-06, + "loss": 0.3122, + "step": 5653 + }, + { + "epoch": 0.36, + "grad_norm": 1.3611225750638158, + "learning_rate": 7.469065798815715e-06, + "loss": 0.3128, + "step": 5654 + }, + { + "epoch": 0.36, + "grad_norm": 58.515560938211244, + "learning_rate": 7.468180112980813e-06, + "loss": 0.2983, + "step": 5655 + }, + { + "epoch": 0.36, + "grad_norm": 1.780049182137719, + "learning_rate": 7.467294324736735e-06, + "loss": 0.3055, + "step": 5656 + }, + { + "epoch": 0.36, + "grad_norm": 1.674422368242577, + "learning_rate": 7.466408434120231e-06, + "loss": 0.33, + "step": 5657 + }, + { + "epoch": 0.36, + "grad_norm": 3.670010019365439, + "learning_rate": 7.4655224411680635e-06, + "loss": 0.3054, + "step": 5658 + }, + { + "epoch": 0.36, + "grad_norm": 1.3692648213304666, + "learning_rate": 7.464636345916989e-06, + "loss": 0.3146, + "step": 5659 + }, + { + "epoch": 0.36, + "grad_norm": 1.428873933394962, + "learning_rate": 7.463750148403776e-06, + "loss": 0.3024, + "step": 5660 + }, + { + "epoch": 0.36, + "grad_norm": 4.018528477444935, + "learning_rate": 7.462863848665191e-06, + "loss": 0.3379, + "step": 5661 + }, + { + "epoch": 0.36, + "grad_norm": 1.79207294349736, + "learning_rate": 7.4619774467380135e-06, + "loss": 0.3152, + "step": 5662 + }, + { + "epoch": 0.36, + "grad_norm": 3.8969671134218307, + "learning_rate": 7.461090942659016e-06, + "loss": 0.305, + "step": 5663 + }, + { + "epoch": 0.36, + "grad_norm": 2.1867364046379683, + "learning_rate": 7.4602043364649845e-06, + "loss": 0.3102, + "step": 5664 + }, + { + "epoch": 0.36, + "grad_norm": 1.9064886275259811, + "learning_rate": 7.459317628192706e-06, + "loss": 0.3092, + "step": 5665 + }, + { + "epoch": 0.36, + "grad_norm": 3.51862220711062, + "learning_rate": 7.458430817878971e-06, + "loss": 0.3202, + "step": 5666 + }, + { + "epoch": 0.36, + "grad_norm": 1.3127277383490619, + "learning_rate": 7.457543905560574e-06, + "loss": 0.3002, + "step": 5667 + }, + { + "epoch": 0.36, + "grad_norm": 1.4085002021476818, + "learning_rate": 7.4566568912743156e-06, + "loss": 0.2915, + "step": 5668 + }, + { + "epoch": 0.36, + "grad_norm": 2.169750998987912, + "learning_rate": 7.455769775056998e-06, + "loss": 0.3137, + "step": 5669 + }, + { + "epoch": 0.36, + "grad_norm": 1.4067814726613208, + "learning_rate": 7.454882556945433e-06, + "loss": 0.2997, + "step": 5670 + }, + { + "epoch": 0.36, + "grad_norm": 1.7267074013942263, + "learning_rate": 7.453995236976428e-06, + "loss": 0.3116, + "step": 5671 + }, + { + "epoch": 0.36, + "grad_norm": 1.585783007967785, + "learning_rate": 7.453107815186803e-06, + "loss": 0.3141, + "step": 5672 + }, + { + "epoch": 0.36, + "grad_norm": 1.4987161190589928, + "learning_rate": 7.452220291613377e-06, + "loss": 0.307, + "step": 5673 + }, + { + "epoch": 0.36, + "grad_norm": 1.2510466920102021, + "learning_rate": 7.451332666292977e-06, + "loss": 0.3022, + "step": 5674 + }, + { + "epoch": 0.36, + "grad_norm": 2.0184887848923205, + "learning_rate": 7.450444939262429e-06, + "loss": 0.3041, + "step": 5675 + }, + { + "epoch": 0.36, + "grad_norm": 1.6660699120502802, + "learning_rate": 7.4495571105585685e-06, + "loss": 0.3086, + "step": 5676 + }, + { + "epoch": 0.36, + "grad_norm": 2.8567088182384004, + "learning_rate": 7.448669180218232e-06, + "loss": 0.3147, + "step": 5677 + }, + { + "epoch": 0.36, + "grad_norm": 2.0114870151083903, + "learning_rate": 7.447781148278264e-06, + "loss": 0.306, + "step": 5678 + }, + { + "epoch": 0.36, + "grad_norm": 3.152110807995157, + "learning_rate": 7.446893014775506e-06, + "loss": 0.3235, + "step": 5679 + }, + { + "epoch": 0.36, + "grad_norm": 1.3991897600041783, + "learning_rate": 7.446004779746811e-06, + "loss": 0.3234, + "step": 5680 + }, + { + "epoch": 0.36, + "grad_norm": 1.6360703079205299, + "learning_rate": 7.445116443229033e-06, + "loss": 0.3147, + "step": 5681 + }, + { + "epoch": 0.36, + "grad_norm": 0.6753473981753595, + "learning_rate": 7.4442280052590325e-06, + "loss": 0.4836, + "step": 5682 + }, + { + "epoch": 0.36, + "grad_norm": 1.3987344153282701, + "learning_rate": 7.443339465873668e-06, + "loss": 0.3206, + "step": 5683 + }, + { + "epoch": 0.36, + "grad_norm": 2.1578346647493634, + "learning_rate": 7.442450825109811e-06, + "loss": 0.3032, + "step": 5684 + }, + { + "epoch": 0.36, + "grad_norm": 2.3961765871672758, + "learning_rate": 7.4415620830043315e-06, + "loss": 0.3422, + "step": 5685 + }, + { + "epoch": 0.36, + "grad_norm": 3.3999347252216263, + "learning_rate": 7.440673239594104e-06, + "loss": 0.338, + "step": 5686 + }, + { + "epoch": 0.36, + "grad_norm": 3.458042249081322, + "learning_rate": 7.439784294916006e-06, + "loss": 0.3071, + "step": 5687 + }, + { + "epoch": 0.36, + "grad_norm": 1.442999583186028, + "learning_rate": 7.438895249006928e-06, + "loss": 0.3237, + "step": 5688 + }, + { + "epoch": 0.36, + "grad_norm": 2.2220124555028837, + "learning_rate": 7.438006101903752e-06, + "loss": 0.3352, + "step": 5689 + }, + { + "epoch": 0.36, + "grad_norm": 1.5793718312701053, + "learning_rate": 7.4371168536433736e-06, + "loss": 0.3303, + "step": 5690 + }, + { + "epoch": 0.36, + "grad_norm": 1.402348641953203, + "learning_rate": 7.436227504262686e-06, + "loss": 0.3095, + "step": 5691 + }, + { + "epoch": 0.36, + "grad_norm": 2.4817249184171337, + "learning_rate": 7.435338053798594e-06, + "loss": 0.3038, + "step": 5692 + }, + { + "epoch": 0.36, + "grad_norm": 1.3749261366728374, + "learning_rate": 7.434448502288e-06, + "loss": 0.3013, + "step": 5693 + }, + { + "epoch": 0.36, + "grad_norm": 1.353155418341018, + "learning_rate": 7.433558849767814e-06, + "loss": 0.3008, + "step": 5694 + }, + { + "epoch": 0.36, + "grad_norm": 1.7818034013591229, + "learning_rate": 7.4326690962749475e-06, + "loss": 0.3158, + "step": 5695 + }, + { + "epoch": 0.36, + "grad_norm": 1.7948200058142285, + "learning_rate": 7.431779241846321e-06, + "loss": 0.3059, + "step": 5696 + }, + { + "epoch": 0.36, + "grad_norm": 2.411385096486676, + "learning_rate": 7.430889286518853e-06, + "loss": 0.35, + "step": 5697 + }, + { + "epoch": 0.36, + "grad_norm": 1.6324917356146929, + "learning_rate": 7.429999230329472e-06, + "loss": 0.3016, + "step": 5698 + }, + { + "epoch": 0.36, + "grad_norm": 1.281234103921979, + "learning_rate": 7.429109073315105e-06, + "loss": 0.3344, + "step": 5699 + }, + { + "epoch": 0.36, + "grad_norm": 1.4392219907285286, + "learning_rate": 7.42821881551269e-06, + "loss": 0.298, + "step": 5700 + }, + { + "epoch": 0.36, + "grad_norm": 3.384668267321661, + "learning_rate": 7.427328456959162e-06, + "loss": 0.3195, + "step": 5701 + }, + { + "epoch": 0.36, + "grad_norm": 1.1959807559594506, + "learning_rate": 7.4264379976914654e-06, + "loss": 0.3035, + "step": 5702 + }, + { + "epoch": 0.36, + "grad_norm": 2.690632201664417, + "learning_rate": 7.425547437746546e-06, + "loss": 0.3098, + "step": 5703 + }, + { + "epoch": 0.36, + "grad_norm": 2.6763078633531703, + "learning_rate": 7.424656777161357e-06, + "loss": 0.3167, + "step": 5704 + }, + { + "epoch": 0.36, + "grad_norm": 2.3469438471630495, + "learning_rate": 7.4237660159728496e-06, + "loss": 0.3122, + "step": 5705 + }, + { + "epoch": 0.36, + "grad_norm": 1.6327036984516017, + "learning_rate": 7.422875154217986e-06, + "loss": 0.3062, + "step": 5706 + }, + { + "epoch": 0.36, + "grad_norm": 1.3215204754316474, + "learning_rate": 7.421984191933728e-06, + "loss": 0.3152, + "step": 5707 + }, + { + "epoch": 0.36, + "grad_norm": 0.6332467955778492, + "learning_rate": 7.421093129157044e-06, + "loss": 0.5212, + "step": 5708 + }, + { + "epoch": 0.36, + "grad_norm": 1.6653563696691982, + "learning_rate": 7.4202019659249066e-06, + "loss": 0.3062, + "step": 5709 + }, + { + "epoch": 0.36, + "grad_norm": 1.4541565492272985, + "learning_rate": 7.419310702274289e-06, + "loss": 0.3029, + "step": 5710 + }, + { + "epoch": 0.36, + "grad_norm": 1.8504941946587374, + "learning_rate": 7.418419338242176e-06, + "loss": 0.2965, + "step": 5711 + }, + { + "epoch": 0.36, + "grad_norm": 1.7463594995990253, + "learning_rate": 7.417527873865548e-06, + "loss": 0.3294, + "step": 5712 + }, + { + "epoch": 0.36, + "grad_norm": 1.557788398969939, + "learning_rate": 7.416636309181393e-06, + "loss": 0.3411, + "step": 5713 + }, + { + "epoch": 0.36, + "grad_norm": 0.6168204406296904, + "learning_rate": 7.415744644226706e-06, + "loss": 0.5094, + "step": 5714 + }, + { + "epoch": 0.36, + "grad_norm": 0.6182301560704233, + "learning_rate": 7.414852879038483e-06, + "loss": 0.4927, + "step": 5715 + }, + { + "epoch": 0.36, + "grad_norm": 1.4139939106748585, + "learning_rate": 7.413961013653725e-06, + "loss": 0.3065, + "step": 5716 + }, + { + "epoch": 0.36, + "grad_norm": 1.8082851084173472, + "learning_rate": 7.4130690481094356e-06, + "loss": 0.3026, + "step": 5717 + }, + { + "epoch": 0.36, + "grad_norm": 1.6075580421704037, + "learning_rate": 7.412176982442629e-06, + "loss": 0.3098, + "step": 5718 + }, + { + "epoch": 0.36, + "grad_norm": 1.6559692571334896, + "learning_rate": 7.41128481669031e-06, + "loss": 0.3189, + "step": 5719 + }, + { + "epoch": 0.36, + "grad_norm": 1.604115556433833, + "learning_rate": 7.4103925508895046e-06, + "loss": 0.2997, + "step": 5720 + }, + { + "epoch": 0.36, + "grad_norm": 2.2416715029221943, + "learning_rate": 7.40950018507723e-06, + "loss": 0.3014, + "step": 5721 + }, + { + "epoch": 0.36, + "grad_norm": 1.3092676267296375, + "learning_rate": 7.408607719290512e-06, + "loss": 0.3169, + "step": 5722 + }, + { + "epoch": 0.36, + "grad_norm": 1.535729355769897, + "learning_rate": 7.407715153566383e-06, + "loss": 0.3111, + "step": 5723 + }, + { + "epoch": 0.36, + "grad_norm": 1.9132874418537968, + "learning_rate": 7.4068224879418734e-06, + "loss": 0.3238, + "step": 5724 + }, + { + "epoch": 0.36, + "grad_norm": 1.296794143617355, + "learning_rate": 7.405929722454026e-06, + "loss": 0.2883, + "step": 5725 + }, + { + "epoch": 0.36, + "grad_norm": 0.6492150870470486, + "learning_rate": 7.40503685713988e-06, + "loss": 0.4716, + "step": 5726 + }, + { + "epoch": 0.36, + "grad_norm": 2.3639429959010156, + "learning_rate": 7.404143892036484e-06, + "loss": 0.3275, + "step": 5727 + }, + { + "epoch": 0.36, + "grad_norm": 4.2386137948775255, + "learning_rate": 7.403250827180887e-06, + "loss": 0.3349, + "step": 5728 + }, + { + "epoch": 0.36, + "grad_norm": 1.2563994628426658, + "learning_rate": 7.402357662610144e-06, + "loss": 0.3175, + "step": 5729 + }, + { + "epoch": 0.36, + "grad_norm": 1.7851120512806526, + "learning_rate": 7.4014643983613155e-06, + "loss": 0.3157, + "step": 5730 + }, + { + "epoch": 0.36, + "grad_norm": 1.7893059924225179, + "learning_rate": 7.4005710344714624e-06, + "loss": 0.3052, + "step": 5731 + }, + { + "epoch": 0.36, + "grad_norm": 1.4542712392411767, + "learning_rate": 7.399677570977653e-06, + "loss": 0.3406, + "step": 5732 + }, + { + "epoch": 0.36, + "grad_norm": 1.9076157660029271, + "learning_rate": 7.39878400791696e-06, + "loss": 0.3087, + "step": 5733 + }, + { + "epoch": 0.36, + "grad_norm": 1.3486840979132322, + "learning_rate": 7.397890345326458e-06, + "loss": 0.3052, + "step": 5734 + }, + { + "epoch": 0.36, + "grad_norm": 1.6639528761915583, + "learning_rate": 7.396996583243227e-06, + "loss": 0.3196, + "step": 5735 + }, + { + "epoch": 0.36, + "grad_norm": 1.3565853131676544, + "learning_rate": 7.396102721704348e-06, + "loss": 0.3005, + "step": 5736 + }, + { + "epoch": 0.36, + "grad_norm": 1.2639933629837592, + "learning_rate": 7.395208760746912e-06, + "loss": 0.3085, + "step": 5737 + }, + { + "epoch": 0.36, + "grad_norm": 1.657557845945615, + "learning_rate": 7.394314700408012e-06, + "loss": 0.3254, + "step": 5738 + }, + { + "epoch": 0.36, + "grad_norm": 1.6632622764957616, + "learning_rate": 7.39342054072474e-06, + "loss": 0.3337, + "step": 5739 + }, + { + "epoch": 0.36, + "grad_norm": 1.487677377720476, + "learning_rate": 7.3925262817341996e-06, + "loss": 0.2991, + "step": 5740 + }, + { + "epoch": 0.36, + "grad_norm": 1.701000162151653, + "learning_rate": 7.3916319234734935e-06, + "loss": 0.3186, + "step": 5741 + }, + { + "epoch": 0.36, + "grad_norm": 2.6058788602510203, + "learning_rate": 7.390737465979732e-06, + "loss": 0.287, + "step": 5742 + }, + { + "epoch": 0.36, + "grad_norm": 1.710631298606624, + "learning_rate": 7.389842909290025e-06, + "loss": 0.3284, + "step": 5743 + }, + { + "epoch": 0.36, + "grad_norm": 1.7821217348464067, + "learning_rate": 7.388948253441492e-06, + "loss": 0.3274, + "step": 5744 + }, + { + "epoch": 0.36, + "grad_norm": 1.5799089572036988, + "learning_rate": 7.388053498471253e-06, + "loss": 0.3133, + "step": 5745 + }, + { + "epoch": 0.36, + "grad_norm": 0.6500341324063666, + "learning_rate": 7.387158644416432e-06, + "loss": 0.4969, + "step": 5746 + }, + { + "epoch": 0.36, + "grad_norm": 1.2714644253683998, + "learning_rate": 7.386263691314157e-06, + "loss": 0.3199, + "step": 5747 + }, + { + "epoch": 0.36, + "grad_norm": 1.9249387926907506, + "learning_rate": 7.385368639201567e-06, + "loss": 0.3239, + "step": 5748 + }, + { + "epoch": 0.36, + "grad_norm": 2.0455498736956876, + "learning_rate": 7.384473488115792e-06, + "loss": 0.3217, + "step": 5749 + }, + { + "epoch": 0.36, + "grad_norm": 1.6326238233539525, + "learning_rate": 7.383578238093979e-06, + "loss": 0.3071, + "step": 5750 + }, + { + "epoch": 0.36, + "grad_norm": 2.0793840173506237, + "learning_rate": 7.38268288917327e-06, + "loss": 0.3343, + "step": 5751 + }, + { + "epoch": 0.36, + "grad_norm": 1.3578410524260112, + "learning_rate": 7.381787441390815e-06, + "loss": 0.311, + "step": 5752 + }, + { + "epoch": 0.36, + "grad_norm": 2.504546431557493, + "learning_rate": 7.380891894783769e-06, + "loss": 0.3332, + "step": 5753 + }, + { + "epoch": 0.36, + "grad_norm": 2.478990359257859, + "learning_rate": 7.3799962493892895e-06, + "loss": 0.3078, + "step": 5754 + }, + { + "epoch": 0.36, + "grad_norm": 1.5863958876970328, + "learning_rate": 7.379100505244538e-06, + "loss": 0.3193, + "step": 5755 + }, + { + "epoch": 0.36, + "grad_norm": 1.7685270421751085, + "learning_rate": 7.378204662386683e-06, + "loss": 0.2973, + "step": 5756 + }, + { + "epoch": 0.36, + "grad_norm": 1.5372639700328596, + "learning_rate": 7.37730872085289e-06, + "loss": 0.3192, + "step": 5757 + }, + { + "epoch": 0.36, + "grad_norm": 1.6127723564068863, + "learning_rate": 7.376412680680336e-06, + "loss": 0.3131, + "step": 5758 + }, + { + "epoch": 0.36, + "grad_norm": 3.4735496079413415, + "learning_rate": 7.375516541906199e-06, + "loss": 0.2981, + "step": 5759 + }, + { + "epoch": 0.36, + "grad_norm": 1.877987572363265, + "learning_rate": 7.3746203045676625e-06, + "loss": 0.3228, + "step": 5760 + }, + { + "epoch": 0.36, + "grad_norm": 1.6400665714471598, + "learning_rate": 7.37372396870191e-06, + "loss": 0.3026, + "step": 5761 + }, + { + "epoch": 0.36, + "grad_norm": 1.5285887762229906, + "learning_rate": 7.372827534346134e-06, + "loss": 0.322, + "step": 5762 + }, + { + "epoch": 0.36, + "grad_norm": 1.5689585017596384, + "learning_rate": 7.371931001537529e-06, + "loss": 0.3102, + "step": 5763 + }, + { + "epoch": 0.36, + "grad_norm": 2.342024338730037, + "learning_rate": 7.371034370313296e-06, + "loss": 0.3575, + "step": 5764 + }, + { + "epoch": 0.36, + "grad_norm": 2.468124222734872, + "learning_rate": 7.370137640710632e-06, + "loss": 0.3145, + "step": 5765 + }, + { + "epoch": 0.36, + "grad_norm": 2.320101177618037, + "learning_rate": 7.36924081276675e-06, + "loss": 0.3196, + "step": 5766 + }, + { + "epoch": 0.36, + "grad_norm": 1.4339543507967045, + "learning_rate": 7.368343886518857e-06, + "loss": 0.2987, + "step": 5767 + }, + { + "epoch": 0.36, + "grad_norm": 3.8696148075773147, + "learning_rate": 7.3674468620041716e-06, + "loss": 0.313, + "step": 5768 + }, + { + "epoch": 0.36, + "grad_norm": 2.5474706499442066, + "learning_rate": 7.366549739259908e-06, + "loss": 0.3221, + "step": 5769 + }, + { + "epoch": 0.36, + "grad_norm": 1.2964753833558096, + "learning_rate": 7.365652518323294e-06, + "loss": 0.3021, + "step": 5770 + }, + { + "epoch": 0.36, + "grad_norm": 1.4044127931489798, + "learning_rate": 7.364755199231555e-06, + "loss": 0.3229, + "step": 5771 + }, + { + "epoch": 0.36, + "grad_norm": 1.7601533043645516, + "learning_rate": 7.363857782021922e-06, + "loss": 0.3362, + "step": 5772 + }, + { + "epoch": 0.36, + "grad_norm": 0.6087853838590078, + "learning_rate": 7.362960266731632e-06, + "loss": 0.4977, + "step": 5773 + }, + { + "epoch": 0.36, + "grad_norm": 2.3781158828767315, + "learning_rate": 7.362062653397923e-06, + "loss": 0.32, + "step": 5774 + }, + { + "epoch": 0.36, + "grad_norm": 8.658366266497918, + "learning_rate": 7.361164942058038e-06, + "loss": 0.3192, + "step": 5775 + }, + { + "epoch": 0.36, + "grad_norm": 1.8875124548609907, + "learning_rate": 7.360267132749227e-06, + "loss": 0.3087, + "step": 5776 + }, + { + "epoch": 0.36, + "grad_norm": 2.2727850664024363, + "learning_rate": 7.359369225508738e-06, + "loss": 0.3185, + "step": 5777 + }, + { + "epoch": 0.36, + "grad_norm": 2.450642338466432, + "learning_rate": 7.358471220373831e-06, + "loss": 0.3204, + "step": 5778 + }, + { + "epoch": 0.36, + "grad_norm": 3.6278554362256004, + "learning_rate": 7.357573117381764e-06, + "loss": 0.3008, + "step": 5779 + }, + { + "epoch": 0.36, + "grad_norm": 1.9078310390802198, + "learning_rate": 7.3566749165698e-06, + "loss": 0.3023, + "step": 5780 + }, + { + "epoch": 0.36, + "grad_norm": 1.4518003132283595, + "learning_rate": 7.355776617975209e-06, + "loss": 0.3324, + "step": 5781 + }, + { + "epoch": 0.36, + "grad_norm": 1.9831437966496162, + "learning_rate": 7.354878221635262e-06, + "loss": 0.3067, + "step": 5782 + }, + { + "epoch": 0.36, + "grad_norm": 2.9558412798166955, + "learning_rate": 7.353979727587234e-06, + "loss": 0.284, + "step": 5783 + }, + { + "epoch": 0.36, + "grad_norm": 2.75995278297336, + "learning_rate": 7.353081135868405e-06, + "loss": 0.3316, + "step": 5784 + }, + { + "epoch": 0.36, + "grad_norm": 2.4438940009092813, + "learning_rate": 7.3521824465160605e-06, + "loss": 0.3071, + "step": 5785 + }, + { + "epoch": 0.36, + "grad_norm": 1.4964626549739581, + "learning_rate": 7.3512836595674896e-06, + "loss": 0.3031, + "step": 5786 + }, + { + "epoch": 0.36, + "grad_norm": 4.195188164153965, + "learning_rate": 7.350384775059983e-06, + "loss": 0.3179, + "step": 5787 + }, + { + "epoch": 0.36, + "grad_norm": 1.4793362289035, + "learning_rate": 7.349485793030837e-06, + "loss": 0.3084, + "step": 5788 + }, + { + "epoch": 0.36, + "grad_norm": 1.7286359867807102, + "learning_rate": 7.3485867135173514e-06, + "loss": 0.3196, + "step": 5789 + }, + { + "epoch": 0.36, + "grad_norm": 1.5745322911322044, + "learning_rate": 7.347687536556833e-06, + "loss": 0.3144, + "step": 5790 + }, + { + "epoch": 0.36, + "grad_norm": 2.4459368475648984, + "learning_rate": 7.346788262186588e-06, + "loss": 0.3082, + "step": 5791 + }, + { + "epoch": 0.36, + "grad_norm": 1.5842852315498037, + "learning_rate": 7.34588889044393e-06, + "loss": 0.3118, + "step": 5792 + }, + { + "epoch": 0.36, + "grad_norm": 0.6431567181335914, + "learning_rate": 7.344989421366175e-06, + "loss": 0.5115, + "step": 5793 + }, + { + "epoch": 0.36, + "grad_norm": 1.7974130873253658, + "learning_rate": 7.3440898549906435e-06, + "loss": 0.308, + "step": 5794 + }, + { + "epoch": 0.36, + "grad_norm": 4.149415807747741, + "learning_rate": 7.34319019135466e-06, + "loss": 0.3145, + "step": 5795 + }, + { + "epoch": 0.36, + "grad_norm": 2.0781212669866096, + "learning_rate": 7.342290430495554e-06, + "loss": 0.3185, + "step": 5796 + }, + { + "epoch": 0.36, + "grad_norm": 1.6616416544173185, + "learning_rate": 7.341390572450659e-06, + "loss": 0.3071, + "step": 5797 + }, + { + "epoch": 0.36, + "grad_norm": 5.433830528273421, + "learning_rate": 7.340490617257309e-06, + "loss": 0.3169, + "step": 5798 + }, + { + "epoch": 0.36, + "grad_norm": 2.9030889016357984, + "learning_rate": 7.339590564952845e-06, + "loss": 0.3178, + "step": 5799 + }, + { + "epoch": 0.36, + "grad_norm": 3.646943345805651, + "learning_rate": 7.338690415574614e-06, + "loss": 0.3067, + "step": 5800 + }, + { + "epoch": 0.36, + "grad_norm": 1.6316014584446898, + "learning_rate": 7.337790169159964e-06, + "loss": 0.3009, + "step": 5801 + }, + { + "epoch": 0.36, + "grad_norm": 3.144307940388868, + "learning_rate": 7.3368898257462486e-06, + "loss": 0.3031, + "step": 5802 + }, + { + "epoch": 0.36, + "grad_norm": 2.4620217959917476, + "learning_rate": 7.3359893853708205e-06, + "loss": 0.3002, + "step": 5803 + }, + { + "epoch": 0.37, + "grad_norm": 1.9917308700716931, + "learning_rate": 7.335088848071046e-06, + "loss": 0.3085, + "step": 5804 + }, + { + "epoch": 0.37, + "grad_norm": 1.515370739560277, + "learning_rate": 7.334188213884287e-06, + "loss": 0.3089, + "step": 5805 + }, + { + "epoch": 0.37, + "grad_norm": 1.8956378841874428, + "learning_rate": 7.333287482847913e-06, + "loss": 0.2921, + "step": 5806 + }, + { + "epoch": 0.37, + "grad_norm": 2.0689430657937513, + "learning_rate": 7.332386654999296e-06, + "loss": 0.3124, + "step": 5807 + }, + { + "epoch": 0.37, + "grad_norm": 2.1196299524360107, + "learning_rate": 7.3314857303758155e-06, + "loss": 0.3207, + "step": 5808 + }, + { + "epoch": 0.37, + "grad_norm": 1.8664441193295482, + "learning_rate": 7.330584709014849e-06, + "loss": 0.3125, + "step": 5809 + }, + { + "epoch": 0.37, + "grad_norm": 1.5672815518063077, + "learning_rate": 7.329683590953785e-06, + "loss": 0.3214, + "step": 5810 + }, + { + "epoch": 0.37, + "grad_norm": 1.9459149264097486, + "learning_rate": 7.328782376230011e-06, + "loss": 0.3032, + "step": 5811 + }, + { + "epoch": 0.37, + "grad_norm": 2.8956083387799723, + "learning_rate": 7.32788106488092e-06, + "loss": 0.3189, + "step": 5812 + }, + { + "epoch": 0.37, + "grad_norm": 2.724382629263787, + "learning_rate": 7.326979656943907e-06, + "loss": 0.301, + "step": 5813 + }, + { + "epoch": 0.37, + "grad_norm": 2.040882141972441, + "learning_rate": 7.326078152456375e-06, + "loss": 0.3041, + "step": 5814 + }, + { + "epoch": 0.37, + "grad_norm": 1.9496157807918788, + "learning_rate": 7.325176551455729e-06, + "loss": 0.334, + "step": 5815 + }, + { + "epoch": 0.37, + "grad_norm": 4.224184541491048, + "learning_rate": 7.324274853979381e-06, + "loss": 0.3067, + "step": 5816 + }, + { + "epoch": 0.37, + "grad_norm": 5.00410639397539, + "learning_rate": 7.323373060064738e-06, + "loss": 0.2864, + "step": 5817 + }, + { + "epoch": 0.37, + "grad_norm": 1.3888165581844634, + "learning_rate": 7.322471169749219e-06, + "loss": 0.3062, + "step": 5818 + }, + { + "epoch": 0.37, + "grad_norm": 1.829558357402964, + "learning_rate": 7.321569183070247e-06, + "loss": 0.2967, + "step": 5819 + }, + { + "epoch": 0.37, + "grad_norm": 2.2188317970095683, + "learning_rate": 7.320667100065248e-06, + "loss": 0.309, + "step": 5820 + }, + { + "epoch": 0.37, + "grad_norm": 1.7218714069460432, + "learning_rate": 7.319764920771646e-06, + "loss": 0.3075, + "step": 5821 + }, + { + "epoch": 0.37, + "grad_norm": 2.48354112910123, + "learning_rate": 7.31886264522688e-06, + "loss": 0.289, + "step": 5822 + }, + { + "epoch": 0.37, + "grad_norm": 0.687544450349418, + "learning_rate": 7.3179602734683815e-06, + "loss": 0.5144, + "step": 5823 + }, + { + "epoch": 0.37, + "grad_norm": 1.6247972565050295, + "learning_rate": 7.317057805533596e-06, + "loss": 0.305, + "step": 5824 + }, + { + "epoch": 0.37, + "grad_norm": 2.133622568335391, + "learning_rate": 7.316155241459966e-06, + "loss": 0.3133, + "step": 5825 + }, + { + "epoch": 0.37, + "grad_norm": 2.28950572443818, + "learning_rate": 7.315252581284942e-06, + "loss": 0.3122, + "step": 5826 + }, + { + "epoch": 0.37, + "grad_norm": 1.472810479886116, + "learning_rate": 7.314349825045975e-06, + "loss": 0.3088, + "step": 5827 + }, + { + "epoch": 0.37, + "grad_norm": 0.6783569563645876, + "learning_rate": 7.313446972780522e-06, + "loss": 0.5448, + "step": 5828 + }, + { + "epoch": 0.37, + "grad_norm": 4.821842629114185, + "learning_rate": 7.312544024526045e-06, + "loss": 0.3261, + "step": 5829 + }, + { + "epoch": 0.37, + "grad_norm": 3.204742629300778, + "learning_rate": 7.311640980320012e-06, + "loss": 0.3178, + "step": 5830 + }, + { + "epoch": 0.37, + "grad_norm": 2.2984830118945174, + "learning_rate": 7.310737840199886e-06, + "loss": 0.312, + "step": 5831 + }, + { + "epoch": 0.37, + "grad_norm": 1.2660732407209636, + "learning_rate": 7.3098346042031435e-06, + "loss": 0.3022, + "step": 5832 + }, + { + "epoch": 0.37, + "grad_norm": 2.0585727921304535, + "learning_rate": 7.308931272367259e-06, + "loss": 0.3282, + "step": 5833 + }, + { + "epoch": 0.37, + "grad_norm": 1.814486531397894, + "learning_rate": 7.308027844729717e-06, + "loss": 0.3001, + "step": 5834 + }, + { + "epoch": 0.37, + "grad_norm": 2.139194598575215, + "learning_rate": 7.3071243213279994e-06, + "loss": 0.2996, + "step": 5835 + }, + { + "epoch": 0.37, + "grad_norm": 1.854070756875088, + "learning_rate": 7.306220702199596e-06, + "loss": 0.3038, + "step": 5836 + }, + { + "epoch": 0.37, + "grad_norm": 2.2854665347947902, + "learning_rate": 7.305316987381998e-06, + "loss": 0.3136, + "step": 5837 + }, + { + "epoch": 0.37, + "grad_norm": 1.451365980106976, + "learning_rate": 7.304413176912706e-06, + "loss": 0.2991, + "step": 5838 + }, + { + "epoch": 0.37, + "grad_norm": 1.5527165313661646, + "learning_rate": 7.303509270829217e-06, + "loss": 0.2992, + "step": 5839 + }, + { + "epoch": 0.37, + "grad_norm": 2.184094753030994, + "learning_rate": 7.302605269169036e-06, + "loss": 0.3067, + "step": 5840 + }, + { + "epoch": 0.37, + "grad_norm": 5.009560153174841, + "learning_rate": 7.301701171969673e-06, + "loss": 0.3107, + "step": 5841 + }, + { + "epoch": 0.37, + "grad_norm": 1.8787207694283758, + "learning_rate": 7.300796979268641e-06, + "loss": 0.3032, + "step": 5842 + }, + { + "epoch": 0.37, + "grad_norm": 1.867082892121984, + "learning_rate": 7.299892691103455e-06, + "loss": 0.3259, + "step": 5843 + }, + { + "epoch": 0.37, + "grad_norm": 1.9656140868400405, + "learning_rate": 7.298988307511637e-06, + "loss": 0.3353, + "step": 5844 + }, + { + "epoch": 0.37, + "grad_norm": 2.3475489149999587, + "learning_rate": 7.298083828530708e-06, + "loss": 0.3119, + "step": 5845 + }, + { + "epoch": 0.37, + "grad_norm": 2.625330661974828, + "learning_rate": 7.297179254198202e-06, + "loss": 0.3035, + "step": 5846 + }, + { + "epoch": 0.37, + "grad_norm": 1.5264940611890767, + "learning_rate": 7.296274584551647e-06, + "loss": 0.3146, + "step": 5847 + }, + { + "epoch": 0.37, + "grad_norm": 2.45465879872521, + "learning_rate": 7.295369819628582e-06, + "loss": 0.317, + "step": 5848 + }, + { + "epoch": 0.37, + "grad_norm": 1.891544266720139, + "learning_rate": 7.294464959466545e-06, + "loss": 0.289, + "step": 5849 + }, + { + "epoch": 0.37, + "grad_norm": 3.0262458195071824, + "learning_rate": 7.2935600041030815e-06, + "loss": 0.3021, + "step": 5850 + }, + { + "epoch": 0.37, + "grad_norm": 1.5013778140899698, + "learning_rate": 7.292654953575739e-06, + "loss": 0.2926, + "step": 5851 + }, + { + "epoch": 0.37, + "grad_norm": 2.0427202461352336, + "learning_rate": 7.291749807922072e-06, + "loss": 0.308, + "step": 5852 + }, + { + "epoch": 0.37, + "grad_norm": 1.7416220802159856, + "learning_rate": 7.290844567179635e-06, + "loss": 0.2939, + "step": 5853 + }, + { + "epoch": 0.37, + "grad_norm": 1.8268191816210502, + "learning_rate": 7.289939231385987e-06, + "loss": 0.3055, + "step": 5854 + }, + { + "epoch": 0.37, + "grad_norm": 2.5882337922854566, + "learning_rate": 7.289033800578692e-06, + "loss": 0.3254, + "step": 5855 + }, + { + "epoch": 0.37, + "grad_norm": 1.622749871598468, + "learning_rate": 7.2881282747953195e-06, + "loss": 0.3143, + "step": 5856 + }, + { + "epoch": 0.37, + "grad_norm": 2.408186670504864, + "learning_rate": 7.287222654073442e-06, + "loss": 0.2995, + "step": 5857 + }, + { + "epoch": 0.37, + "grad_norm": 1.733951039006919, + "learning_rate": 7.286316938450633e-06, + "loss": 0.3032, + "step": 5858 + }, + { + "epoch": 0.37, + "grad_norm": 1.96746684889688, + "learning_rate": 7.285411127964472e-06, + "loss": 0.2968, + "step": 5859 + }, + { + "epoch": 0.37, + "grad_norm": 1.7309144654104038, + "learning_rate": 7.284505222652546e-06, + "loss": 0.2935, + "step": 5860 + }, + { + "epoch": 0.37, + "grad_norm": 1.3396226499138895, + "learning_rate": 7.28359922255244e-06, + "loss": 0.2992, + "step": 5861 + }, + { + "epoch": 0.37, + "grad_norm": 1.3488703823264145, + "learning_rate": 7.2826931277017455e-06, + "loss": 0.3092, + "step": 5862 + }, + { + "epoch": 0.37, + "grad_norm": 1.7107288739922248, + "learning_rate": 7.281786938138058e-06, + "loss": 0.2875, + "step": 5863 + }, + { + "epoch": 0.37, + "grad_norm": 1.714898213298103, + "learning_rate": 7.28088065389898e-06, + "loss": 0.311, + "step": 5864 + }, + { + "epoch": 0.37, + "grad_norm": 4.156603177681332, + "learning_rate": 7.279974275022111e-06, + "loss": 0.3169, + "step": 5865 + }, + { + "epoch": 0.37, + "grad_norm": 2.0778148806959416, + "learning_rate": 7.279067801545059e-06, + "loss": 0.292, + "step": 5866 + }, + { + "epoch": 0.37, + "grad_norm": 2.1673113087277693, + "learning_rate": 7.278161233505435e-06, + "loss": 0.2949, + "step": 5867 + }, + { + "epoch": 0.37, + "grad_norm": 1.5382648247895252, + "learning_rate": 7.277254570940857e-06, + "loss": 0.3342, + "step": 5868 + }, + { + "epoch": 0.37, + "grad_norm": 1.723274882034758, + "learning_rate": 7.276347813888942e-06, + "loss": 0.3053, + "step": 5869 + }, + { + "epoch": 0.37, + "grad_norm": 1.4908694377216396, + "learning_rate": 7.275440962387311e-06, + "loss": 0.294, + "step": 5870 + }, + { + "epoch": 0.37, + "grad_norm": 2.122961585526543, + "learning_rate": 7.274534016473595e-06, + "loss": 0.2908, + "step": 5871 + }, + { + "epoch": 0.37, + "grad_norm": 2.5943014249013667, + "learning_rate": 7.273626976185422e-06, + "loss": 0.3053, + "step": 5872 + }, + { + "epoch": 0.37, + "grad_norm": 1.8921446662808121, + "learning_rate": 7.272719841560426e-06, + "loss": 0.3093, + "step": 5873 + }, + { + "epoch": 0.37, + "grad_norm": 2.442207526439503, + "learning_rate": 7.271812612636249e-06, + "loss": 0.3018, + "step": 5874 + }, + { + "epoch": 0.37, + "grad_norm": 3.687574785421414, + "learning_rate": 7.270905289450529e-06, + "loss": 0.3031, + "step": 5875 + }, + { + "epoch": 0.37, + "grad_norm": 1.3049954404147388, + "learning_rate": 7.269997872040918e-06, + "loss": 0.2979, + "step": 5876 + }, + { + "epoch": 0.37, + "grad_norm": 1.494379088008921, + "learning_rate": 7.26909036044506e-06, + "loss": 0.3144, + "step": 5877 + }, + { + "epoch": 0.37, + "grad_norm": 1.4272037481827549, + "learning_rate": 7.268182754700616e-06, + "loss": 0.2785, + "step": 5878 + }, + { + "epoch": 0.37, + "grad_norm": 1.5092362448586174, + "learning_rate": 7.267275054845238e-06, + "loss": 0.3021, + "step": 5879 + }, + { + "epoch": 0.37, + "grad_norm": 2.204995359379711, + "learning_rate": 7.266367260916594e-06, + "loss": 0.3205, + "step": 5880 + }, + { + "epoch": 0.37, + "grad_norm": 1.67255212483285, + "learning_rate": 7.265459372952343e-06, + "loss": 0.3152, + "step": 5881 + }, + { + "epoch": 0.37, + "grad_norm": 1.5525485891146222, + "learning_rate": 7.264551390990161e-06, + "loss": 0.2957, + "step": 5882 + }, + { + "epoch": 0.37, + "grad_norm": 1.7766000510995803, + "learning_rate": 7.2636433150677185e-06, + "loss": 0.3206, + "step": 5883 + }, + { + "epoch": 0.37, + "grad_norm": 2.3475064172629985, + "learning_rate": 7.262735145222696e-06, + "loss": 0.316, + "step": 5884 + }, + { + "epoch": 0.37, + "grad_norm": 2.233693133714042, + "learning_rate": 7.261826881492771e-06, + "loss": 0.3069, + "step": 5885 + }, + { + "epoch": 0.37, + "grad_norm": 3.422949658024172, + "learning_rate": 7.260918523915632e-06, + "loss": 0.3156, + "step": 5886 + }, + { + "epoch": 0.37, + "grad_norm": 1.485762226091767, + "learning_rate": 7.260010072528968e-06, + "loss": 0.2902, + "step": 5887 + }, + { + "epoch": 0.37, + "grad_norm": 7.882394089851754, + "learning_rate": 7.259101527370471e-06, + "loss": 0.3042, + "step": 5888 + }, + { + "epoch": 0.37, + "grad_norm": 2.5545202236983964, + "learning_rate": 7.25819288847784e-06, + "loss": 0.3175, + "step": 5889 + }, + { + "epoch": 0.37, + "grad_norm": 2.1366015587414267, + "learning_rate": 7.257284155888775e-06, + "loss": 0.328, + "step": 5890 + }, + { + "epoch": 0.37, + "grad_norm": 2.1565696032774317, + "learning_rate": 7.25637532964098e-06, + "loss": 0.3377, + "step": 5891 + }, + { + "epoch": 0.37, + "grad_norm": 3.005323489987182, + "learning_rate": 7.255466409772165e-06, + "loss": 0.3136, + "step": 5892 + }, + { + "epoch": 0.37, + "grad_norm": 5.50084749243097, + "learning_rate": 7.254557396320043e-06, + "loss": 0.3229, + "step": 5893 + }, + { + "epoch": 0.37, + "grad_norm": 2.0641706062751233, + "learning_rate": 7.253648289322331e-06, + "loss": 0.3279, + "step": 5894 + }, + { + "epoch": 0.37, + "grad_norm": 1.578493390770236, + "learning_rate": 7.252739088816747e-06, + "loss": 0.2983, + "step": 5895 + }, + { + "epoch": 0.37, + "grad_norm": 2.0699252430865474, + "learning_rate": 7.251829794841017e-06, + "loss": 0.2977, + "step": 5896 + }, + { + "epoch": 0.37, + "grad_norm": 1.4448905027082404, + "learning_rate": 7.25092040743287e-06, + "loss": 0.3252, + "step": 5897 + }, + { + "epoch": 0.37, + "grad_norm": 1.6582162666588753, + "learning_rate": 7.250010926630038e-06, + "loss": 0.3047, + "step": 5898 + }, + { + "epoch": 0.37, + "grad_norm": 3.029785768214059, + "learning_rate": 7.2491013524702545e-06, + "loss": 0.3072, + "step": 5899 + }, + { + "epoch": 0.37, + "grad_norm": 1.9210438445093962, + "learning_rate": 7.248191684991262e-06, + "loss": 0.3059, + "step": 5900 + }, + { + "epoch": 0.37, + "grad_norm": 1.5119819916754642, + "learning_rate": 7.247281924230802e-06, + "loss": 0.3119, + "step": 5901 + }, + { + "epoch": 0.37, + "grad_norm": 3.348791307836979, + "learning_rate": 7.246372070226625e-06, + "loss": 0.3045, + "step": 5902 + }, + { + "epoch": 0.37, + "grad_norm": 3.462685294050548, + "learning_rate": 7.245462123016478e-06, + "loss": 0.3037, + "step": 5903 + }, + { + "epoch": 0.37, + "grad_norm": 5.657400086378503, + "learning_rate": 7.244552082638122e-06, + "loss": 0.3045, + "step": 5904 + }, + { + "epoch": 0.37, + "grad_norm": 2.3256327447015273, + "learning_rate": 7.243641949129312e-06, + "loss": 0.3082, + "step": 5905 + }, + { + "epoch": 0.37, + "grad_norm": 0.6509889104987481, + "learning_rate": 7.242731722527814e-06, + "loss": 0.4944, + "step": 5906 + }, + { + "epoch": 0.37, + "grad_norm": 1.5261226749791494, + "learning_rate": 7.24182140287139e-06, + "loss": 0.3184, + "step": 5907 + }, + { + "epoch": 0.37, + "grad_norm": 2.2445305914520484, + "learning_rate": 7.2409109901978185e-06, + "loss": 0.2872, + "step": 5908 + }, + { + "epoch": 0.37, + "grad_norm": 1.7015555458004097, + "learning_rate": 7.240000484544866e-06, + "loss": 0.3241, + "step": 5909 + }, + { + "epoch": 0.37, + "grad_norm": 2.3164805378662296, + "learning_rate": 7.239089885950317e-06, + "loss": 0.3069, + "step": 5910 + }, + { + "epoch": 0.37, + "grad_norm": 2.7882030170435197, + "learning_rate": 7.238179194451949e-06, + "loss": 0.3103, + "step": 5911 + }, + { + "epoch": 0.37, + "grad_norm": 3.2328517241237735, + "learning_rate": 7.237268410087553e-06, + "loss": 0.3195, + "step": 5912 + }, + { + "epoch": 0.37, + "grad_norm": 0.5985679519054314, + "learning_rate": 7.236357532894916e-06, + "loss": 0.5348, + "step": 5913 + }, + { + "epoch": 0.37, + "grad_norm": 2.7307987927378585, + "learning_rate": 7.235446562911834e-06, + "loss": 0.3005, + "step": 5914 + }, + { + "epoch": 0.37, + "grad_norm": 1.8246946174040426, + "learning_rate": 7.234535500176101e-06, + "loss": 0.3063, + "step": 5915 + }, + { + "epoch": 0.37, + "grad_norm": 2.6524543979794832, + "learning_rate": 7.233624344725524e-06, + "loss": 0.3209, + "step": 5916 + }, + { + "epoch": 0.37, + "grad_norm": 1.3908195391034117, + "learning_rate": 7.232713096597903e-06, + "loss": 0.2883, + "step": 5917 + }, + { + "epoch": 0.37, + "grad_norm": 2.2579837209121814, + "learning_rate": 7.231801755831052e-06, + "loss": 0.3053, + "step": 5918 + }, + { + "epoch": 0.37, + "grad_norm": 1.698560050406029, + "learning_rate": 7.230890322462781e-06, + "loss": 0.2831, + "step": 5919 + }, + { + "epoch": 0.37, + "grad_norm": 3.1326315798504343, + "learning_rate": 7.2299787965309075e-06, + "loss": 0.3413, + "step": 5920 + }, + { + "epoch": 0.37, + "grad_norm": 1.6627139754391318, + "learning_rate": 7.229067178073253e-06, + "loss": 0.2878, + "step": 5921 + }, + { + "epoch": 0.37, + "grad_norm": 1.7135080849189828, + "learning_rate": 7.228155467127642e-06, + "loss": 0.3277, + "step": 5922 + }, + { + "epoch": 0.37, + "grad_norm": 1.844747474669683, + "learning_rate": 7.227243663731904e-06, + "loss": 0.3259, + "step": 5923 + }, + { + "epoch": 0.37, + "grad_norm": 2.8893996693073825, + "learning_rate": 7.226331767923871e-06, + "loss": 0.3204, + "step": 5924 + }, + { + "epoch": 0.37, + "grad_norm": 2.9262178045103964, + "learning_rate": 7.225419779741376e-06, + "loss": 0.3036, + "step": 5925 + }, + { + "epoch": 0.37, + "grad_norm": 2.0445039946056185, + "learning_rate": 7.224507699222263e-06, + "loss": 0.3105, + "step": 5926 + }, + { + "epoch": 0.37, + "grad_norm": 1.6887161344967128, + "learning_rate": 7.223595526404374e-06, + "loss": 0.3332, + "step": 5927 + }, + { + "epoch": 0.37, + "grad_norm": 2.096963681172321, + "learning_rate": 7.2226832613255584e-06, + "loss": 0.3196, + "step": 5928 + }, + { + "epoch": 0.37, + "grad_norm": 2.771536720348542, + "learning_rate": 7.221770904023664e-06, + "loss": 0.3128, + "step": 5929 + }, + { + "epoch": 0.37, + "grad_norm": 3.399124860506791, + "learning_rate": 7.2208584545365505e-06, + "loss": 0.3207, + "step": 5930 + }, + { + "epoch": 0.37, + "grad_norm": 1.545844740184038, + "learning_rate": 7.219945912902073e-06, + "loss": 0.2978, + "step": 5931 + }, + { + "epoch": 0.37, + "grad_norm": 3.4043635595207227, + "learning_rate": 7.2190332791580995e-06, + "loss": 0.325, + "step": 5932 + }, + { + "epoch": 0.37, + "grad_norm": 2.540370324210669, + "learning_rate": 7.218120553342492e-06, + "loss": 0.3149, + "step": 5933 + }, + { + "epoch": 0.37, + "grad_norm": 3.080938864836028, + "learning_rate": 7.217207735493122e-06, + "loss": 0.2886, + "step": 5934 + }, + { + "epoch": 0.37, + "grad_norm": 2.3301992556399873, + "learning_rate": 7.216294825647866e-06, + "loss": 0.3128, + "step": 5935 + }, + { + "epoch": 0.37, + "grad_norm": 1.4832123564266622, + "learning_rate": 7.215381823844601e-06, + "loss": 0.2998, + "step": 5936 + }, + { + "epoch": 0.37, + "grad_norm": 1.960706285117124, + "learning_rate": 7.214468730121209e-06, + "loss": 0.301, + "step": 5937 + }, + { + "epoch": 0.37, + "grad_norm": 2.255026371704293, + "learning_rate": 7.213555544515577e-06, + "loss": 0.3036, + "step": 5938 + }, + { + "epoch": 0.37, + "grad_norm": 2.4455784418932116, + "learning_rate": 7.212642267065593e-06, + "loss": 0.3153, + "step": 5939 + }, + { + "epoch": 0.37, + "grad_norm": 0.6126928210379949, + "learning_rate": 7.211728897809151e-06, + "loss": 0.4656, + "step": 5940 + }, + { + "epoch": 0.37, + "grad_norm": 1.9074410032665503, + "learning_rate": 7.210815436784148e-06, + "loss": 0.2978, + "step": 5941 + }, + { + "epoch": 0.37, + "grad_norm": 2.1222972203089223, + "learning_rate": 7.209901884028487e-06, + "loss": 0.2926, + "step": 5942 + }, + { + "epoch": 0.37, + "grad_norm": 2.0324798593508495, + "learning_rate": 7.20898823958007e-06, + "loss": 0.2851, + "step": 5943 + }, + { + "epoch": 0.37, + "grad_norm": 1.8127984813845401, + "learning_rate": 7.208074503476808e-06, + "loss": 0.3103, + "step": 5944 + }, + { + "epoch": 0.37, + "grad_norm": 2.6075085901736177, + "learning_rate": 7.207160675756614e-06, + "loss": 0.2974, + "step": 5945 + }, + { + "epoch": 0.37, + "grad_norm": 2.2821636272598704, + "learning_rate": 7.206246756457402e-06, + "loss": 0.3086, + "step": 5946 + }, + { + "epoch": 0.37, + "grad_norm": 2.726983264962258, + "learning_rate": 7.205332745617095e-06, + "loss": 0.3128, + "step": 5947 + }, + { + "epoch": 0.37, + "grad_norm": 1.9714385902606477, + "learning_rate": 7.204418643273613e-06, + "loss": 0.3057, + "step": 5948 + }, + { + "epoch": 0.37, + "grad_norm": 2.710025784138471, + "learning_rate": 7.2035044494648865e-06, + "loss": 0.3012, + "step": 5949 + }, + { + "epoch": 0.37, + "grad_norm": 1.5061638534445656, + "learning_rate": 7.202590164228849e-06, + "loss": 0.3223, + "step": 5950 + }, + { + "epoch": 0.37, + "grad_norm": 1.99989521694326, + "learning_rate": 7.20167578760343e-06, + "loss": 0.3166, + "step": 5951 + }, + { + "epoch": 0.37, + "grad_norm": 11.156413550352374, + "learning_rate": 7.200761319626574e-06, + "loss": 0.3094, + "step": 5952 + }, + { + "epoch": 0.37, + "grad_norm": 2.4174844838680327, + "learning_rate": 7.199846760336221e-06, + "loss": 0.2987, + "step": 5953 + }, + { + "epoch": 0.37, + "grad_norm": 3.6831474095576104, + "learning_rate": 7.198932109770319e-06, + "loss": 0.2978, + "step": 5954 + }, + { + "epoch": 0.37, + "grad_norm": 2.494215493871055, + "learning_rate": 7.198017367966817e-06, + "loss": 0.3126, + "step": 5955 + }, + { + "epoch": 0.37, + "grad_norm": 4.3713637925407065, + "learning_rate": 7.197102534963671e-06, + "loss": 0.3164, + "step": 5956 + }, + { + "epoch": 0.37, + "grad_norm": 2.1745249570043237, + "learning_rate": 7.19618761079884e-06, + "loss": 0.3085, + "step": 5957 + }, + { + "epoch": 0.37, + "grad_norm": 0.6721736852571504, + "learning_rate": 7.195272595510282e-06, + "loss": 0.5019, + "step": 5958 + }, + { + "epoch": 0.37, + "grad_norm": 3.759645629800595, + "learning_rate": 7.194357489135966e-06, + "loss": 0.3258, + "step": 5959 + }, + { + "epoch": 0.37, + "grad_norm": 2.1010763393118905, + "learning_rate": 7.193442291713858e-06, + "loss": 0.3065, + "step": 5960 + }, + { + "epoch": 0.37, + "grad_norm": 2.523171481090262, + "learning_rate": 7.192527003281935e-06, + "loss": 0.3185, + "step": 5961 + }, + { + "epoch": 0.37, + "grad_norm": 1.8455391431495372, + "learning_rate": 7.191611623878173e-06, + "loss": 0.2899, + "step": 5962 + }, + { + "epoch": 0.38, + "grad_norm": 2.526983148103827, + "learning_rate": 7.1906961535405505e-06, + "loss": 0.3059, + "step": 5963 + }, + { + "epoch": 0.38, + "grad_norm": 4.225402680218895, + "learning_rate": 7.189780592307054e-06, + "loss": 0.3093, + "step": 5964 + }, + { + "epoch": 0.38, + "grad_norm": 1.5167138065359553, + "learning_rate": 7.188864940215671e-06, + "loss": 0.2865, + "step": 5965 + }, + { + "epoch": 0.38, + "grad_norm": 3.356918859670861, + "learning_rate": 7.187949197304395e-06, + "loss": 0.3218, + "step": 5966 + }, + { + "epoch": 0.38, + "grad_norm": 16.89189404408381, + "learning_rate": 7.187033363611219e-06, + "loss": 0.318, + "step": 5967 + }, + { + "epoch": 0.38, + "grad_norm": 2.11700703467767, + "learning_rate": 7.186117439174145e-06, + "loss": 0.3133, + "step": 5968 + }, + { + "epoch": 0.38, + "grad_norm": 11.005995948321859, + "learning_rate": 7.185201424031174e-06, + "loss": 0.3221, + "step": 5969 + }, + { + "epoch": 0.38, + "grad_norm": 1.470314521155802, + "learning_rate": 7.184285318220316e-06, + "loss": 0.2872, + "step": 5970 + }, + { + "epoch": 0.38, + "grad_norm": 2.751373484900817, + "learning_rate": 7.18336912177958e-06, + "loss": 0.3091, + "step": 5971 + }, + { + "epoch": 0.38, + "grad_norm": 2.0781543745306235, + "learning_rate": 7.182452834746982e-06, + "loss": 0.3037, + "step": 5972 + }, + { + "epoch": 0.38, + "grad_norm": 1.7377097311702716, + "learning_rate": 7.181536457160538e-06, + "loss": 0.2901, + "step": 5973 + }, + { + "epoch": 0.38, + "grad_norm": 1.736701016666469, + "learning_rate": 7.180619989058273e-06, + "loss": 0.3156, + "step": 5974 + }, + { + "epoch": 0.38, + "grad_norm": 1.3994772603359749, + "learning_rate": 7.17970343047821e-06, + "loss": 0.2894, + "step": 5975 + }, + { + "epoch": 0.38, + "grad_norm": 1.8318057658099611, + "learning_rate": 7.178786781458381e-06, + "loss": 0.3041, + "step": 5976 + }, + { + "epoch": 0.38, + "grad_norm": 3.227806643026452, + "learning_rate": 7.177870042036819e-06, + "loss": 0.2914, + "step": 5977 + }, + { + "epoch": 0.38, + "grad_norm": 8.697526425726963, + "learning_rate": 7.176953212251559e-06, + "loss": 0.3074, + "step": 5978 + }, + { + "epoch": 0.38, + "grad_norm": 2.038781239013182, + "learning_rate": 7.176036292140644e-06, + "loss": 0.3121, + "step": 5979 + }, + { + "epoch": 0.38, + "grad_norm": 2.1796377758250554, + "learning_rate": 7.175119281742119e-06, + "loss": 0.3029, + "step": 5980 + }, + { + "epoch": 0.38, + "grad_norm": 5.080976565060239, + "learning_rate": 7.174202181094031e-06, + "loss": 0.3286, + "step": 5981 + }, + { + "epoch": 0.38, + "grad_norm": 16.309657195172605, + "learning_rate": 7.173284990234433e-06, + "loss": 0.2914, + "step": 5982 + }, + { + "epoch": 0.38, + "grad_norm": 0.6046106125210732, + "learning_rate": 7.17236770920138e-06, + "loss": 0.4895, + "step": 5983 + }, + { + "epoch": 0.38, + "grad_norm": 3.87831532572056, + "learning_rate": 7.1714503380329326e-06, + "loss": 0.3074, + "step": 5984 + }, + { + "epoch": 0.38, + "grad_norm": 1.794917558533384, + "learning_rate": 7.170532876767153e-06, + "loss": 0.3168, + "step": 5985 + }, + { + "epoch": 0.38, + "grad_norm": 11.461320850332969, + "learning_rate": 7.16961532544211e-06, + "loss": 0.3167, + "step": 5986 + }, + { + "epoch": 0.38, + "grad_norm": 1.3543175351221206, + "learning_rate": 7.168697684095873e-06, + "loss": 0.3126, + "step": 5987 + }, + { + "epoch": 0.38, + "grad_norm": 2.4638785596026485, + "learning_rate": 7.1677799527665186e-06, + "loss": 0.3172, + "step": 5988 + }, + { + "epoch": 0.38, + "grad_norm": 2.10295359690129, + "learning_rate": 7.166862131492122e-06, + "loss": 0.3179, + "step": 5989 + }, + { + "epoch": 0.38, + "grad_norm": 2.904067089162369, + "learning_rate": 7.165944220310766e-06, + "loss": 0.2811, + "step": 5990 + }, + { + "epoch": 0.38, + "grad_norm": 2.2090260665197143, + "learning_rate": 7.165026219260538e-06, + "loss": 0.3006, + "step": 5991 + }, + { + "epoch": 0.38, + "grad_norm": 0.5909880695299142, + "learning_rate": 7.164108128379528e-06, + "loss": 0.5293, + "step": 5992 + }, + { + "epoch": 0.38, + "grad_norm": 1.5514547504538685, + "learning_rate": 7.163189947705826e-06, + "loss": 0.3085, + "step": 5993 + }, + { + "epoch": 0.38, + "grad_norm": 3.5225524480726405, + "learning_rate": 7.162271677277532e-06, + "loss": 0.2955, + "step": 5994 + }, + { + "epoch": 0.38, + "grad_norm": 2.2985759849943497, + "learning_rate": 7.161353317132744e-06, + "loss": 0.3193, + "step": 5995 + }, + { + "epoch": 0.38, + "grad_norm": 1.9162294349107698, + "learning_rate": 7.160434867309569e-06, + "loss": 0.3037, + "step": 5996 + }, + { + "epoch": 0.38, + "grad_norm": 1.4891738981892237, + "learning_rate": 7.159516327846114e-06, + "loss": 0.3105, + "step": 5997 + }, + { + "epoch": 0.38, + "grad_norm": 2.0724693220140935, + "learning_rate": 7.1585976987804895e-06, + "loss": 0.3097, + "step": 5998 + }, + { + "epoch": 0.38, + "grad_norm": 2.259566667810883, + "learning_rate": 7.157678980150814e-06, + "loss": 0.3068, + "step": 5999 + }, + { + "epoch": 0.38, + "grad_norm": 0.6444077881578327, + "learning_rate": 7.156760171995204e-06, + "loss": 0.5023, + "step": 6000 + }, + { + "epoch": 0.38, + "grad_norm": 1.7271831934927013, + "learning_rate": 7.155841274351784e-06, + "loss": 0.3108, + "step": 6001 + }, + { + "epoch": 0.38, + "grad_norm": 4.6636631154283315, + "learning_rate": 7.154922287258681e-06, + "loss": 0.2844, + "step": 6002 + }, + { + "epoch": 0.38, + "grad_norm": 3.8343864224279804, + "learning_rate": 7.1540032107540245e-06, + "loss": 0.3197, + "step": 6003 + }, + { + "epoch": 0.38, + "grad_norm": 1.848427973087907, + "learning_rate": 7.1530840448759484e-06, + "loss": 0.3049, + "step": 6004 + }, + { + "epoch": 0.38, + "grad_norm": 1.8418480083532276, + "learning_rate": 7.152164789662592e-06, + "loss": 0.3111, + "step": 6005 + }, + { + "epoch": 0.38, + "grad_norm": 1.7151396141888278, + "learning_rate": 7.151245445152096e-06, + "loss": 0.2947, + "step": 6006 + }, + { + "epoch": 0.38, + "grad_norm": 0.6177711619734679, + "learning_rate": 7.1503260113826035e-06, + "loss": 0.484, + "step": 6007 + }, + { + "epoch": 0.38, + "grad_norm": 4.707515386143622, + "learning_rate": 7.1494064883922655e-06, + "loss": 0.2996, + "step": 6008 + }, + { + "epoch": 0.38, + "grad_norm": 2.4864084445145718, + "learning_rate": 7.148486876219235e-06, + "loss": 0.2861, + "step": 6009 + }, + { + "epoch": 0.38, + "grad_norm": 1.7944702242957387, + "learning_rate": 7.14756717490167e-06, + "loss": 0.3191, + "step": 6010 + }, + { + "epoch": 0.38, + "grad_norm": 1.9664301090511955, + "learning_rate": 7.146647384477725e-06, + "loss": 0.3012, + "step": 6011 + }, + { + "epoch": 0.38, + "grad_norm": 3.66737973962642, + "learning_rate": 7.145727504985569e-06, + "loss": 0.2933, + "step": 6012 + }, + { + "epoch": 0.38, + "grad_norm": 3.647342014756224, + "learning_rate": 7.144807536463368e-06, + "loss": 0.2938, + "step": 6013 + }, + { + "epoch": 0.38, + "grad_norm": 2.518720756555653, + "learning_rate": 7.143887478949292e-06, + "loss": 0.3153, + "step": 6014 + }, + { + "epoch": 0.38, + "grad_norm": 1.7140269567116875, + "learning_rate": 7.142967332481516e-06, + "loss": 0.3074, + "step": 6015 + }, + { + "epoch": 0.38, + "grad_norm": 1.9129138897222344, + "learning_rate": 7.142047097098219e-06, + "loss": 0.3017, + "step": 6016 + }, + { + "epoch": 0.38, + "grad_norm": 1.826972517723574, + "learning_rate": 7.1411267728375845e-06, + "loss": 0.3117, + "step": 6017 + }, + { + "epoch": 0.38, + "grad_norm": 7.980931301259122, + "learning_rate": 7.140206359737797e-06, + "loss": 0.2969, + "step": 6018 + }, + { + "epoch": 0.38, + "grad_norm": 2.2998712903592073, + "learning_rate": 7.139285857837046e-06, + "loss": 0.3169, + "step": 6019 + }, + { + "epoch": 0.38, + "grad_norm": 1.756500055755841, + "learning_rate": 7.138365267173524e-06, + "loss": 0.31, + "step": 6020 + }, + { + "epoch": 0.38, + "grad_norm": 2.2435415782378745, + "learning_rate": 7.13744458778543e-06, + "loss": 0.2979, + "step": 6021 + }, + { + "epoch": 0.38, + "grad_norm": 2.228875636301696, + "learning_rate": 7.136523819710963e-06, + "loss": 0.3237, + "step": 6022 + }, + { + "epoch": 0.38, + "grad_norm": 1.9203626706075438, + "learning_rate": 7.135602962988327e-06, + "loss": 0.3089, + "step": 6023 + }, + { + "epoch": 0.38, + "grad_norm": 3.2807773580502912, + "learning_rate": 7.134682017655732e-06, + "loss": 0.2801, + "step": 6024 + }, + { + "epoch": 0.38, + "grad_norm": 1.883383447407521, + "learning_rate": 7.1337609837513875e-06, + "loss": 0.3036, + "step": 6025 + }, + { + "epoch": 0.38, + "grad_norm": 1.9989008456799338, + "learning_rate": 7.132839861313511e-06, + "loss": 0.2943, + "step": 6026 + }, + { + "epoch": 0.38, + "grad_norm": 2.7139691608818124, + "learning_rate": 7.13191865038032e-06, + "loss": 0.3214, + "step": 6027 + }, + { + "epoch": 0.38, + "grad_norm": 4.205294008880047, + "learning_rate": 7.130997350990037e-06, + "loss": 0.3241, + "step": 6028 + }, + { + "epoch": 0.38, + "grad_norm": 1.8248584968970416, + "learning_rate": 7.130075963180889e-06, + "loss": 0.3089, + "step": 6029 + }, + { + "epoch": 0.38, + "grad_norm": 1.7235284044945178, + "learning_rate": 7.129154486991105e-06, + "loss": 0.2949, + "step": 6030 + }, + { + "epoch": 0.38, + "grad_norm": 1.491676776637185, + "learning_rate": 7.128232922458922e-06, + "loss": 0.2964, + "step": 6031 + }, + { + "epoch": 0.38, + "grad_norm": 2.141666058168301, + "learning_rate": 7.127311269622573e-06, + "loss": 0.2879, + "step": 6032 + }, + { + "epoch": 0.38, + "grad_norm": 2.352111095265622, + "learning_rate": 7.126389528520301e-06, + "loss": 0.3072, + "step": 6033 + }, + { + "epoch": 0.38, + "grad_norm": 1.7030797976818755, + "learning_rate": 7.125467699190351e-06, + "loss": 0.3125, + "step": 6034 + }, + { + "epoch": 0.38, + "grad_norm": 1.6068189899876875, + "learning_rate": 7.1245457816709705e-06, + "loss": 0.2837, + "step": 6035 + }, + { + "epoch": 0.38, + "grad_norm": 0.6235180671792958, + "learning_rate": 7.123623776000412e-06, + "loss": 0.4872, + "step": 6036 + }, + { + "epoch": 0.38, + "grad_norm": 2.5349126519788254, + "learning_rate": 7.1227016822169315e-06, + "loss": 0.3251, + "step": 6037 + }, + { + "epoch": 0.38, + "grad_norm": 1.72208190723723, + "learning_rate": 7.121779500358788e-06, + "loss": 0.2987, + "step": 6038 + }, + { + "epoch": 0.38, + "grad_norm": 2.946054516048329, + "learning_rate": 7.120857230464244e-06, + "loss": 0.3218, + "step": 6039 + }, + { + "epoch": 0.38, + "grad_norm": 1.9674156888459624, + "learning_rate": 7.119934872571566e-06, + "loss": 0.2882, + "step": 6040 + }, + { + "epoch": 0.38, + "grad_norm": 2.4279407274139766, + "learning_rate": 7.119012426719024e-06, + "loss": 0.3161, + "step": 6041 + }, + { + "epoch": 0.38, + "grad_norm": 2.351650021155563, + "learning_rate": 7.118089892944894e-06, + "loss": 0.3239, + "step": 6042 + }, + { + "epoch": 0.38, + "grad_norm": 3.0681820473331536, + "learning_rate": 7.117167271287453e-06, + "loss": 0.2923, + "step": 6043 + }, + { + "epoch": 0.38, + "grad_norm": 3.118400686194755, + "learning_rate": 7.116244561784979e-06, + "loss": 0.3165, + "step": 6044 + }, + { + "epoch": 0.38, + "grad_norm": 2.6274338441549956, + "learning_rate": 7.11532176447576e-06, + "loss": 0.3317, + "step": 6045 + }, + { + "epoch": 0.38, + "grad_norm": 1.8604762163536679, + "learning_rate": 7.114398879398084e-06, + "loss": 0.296, + "step": 6046 + }, + { + "epoch": 0.38, + "grad_norm": 2.4985839502161586, + "learning_rate": 7.113475906590243e-06, + "loss": 0.3202, + "step": 6047 + }, + { + "epoch": 0.38, + "grad_norm": 2.256378121651349, + "learning_rate": 7.112552846090533e-06, + "loss": 0.3179, + "step": 6048 + }, + { + "epoch": 0.38, + "grad_norm": 4.719310780296274, + "learning_rate": 7.111629697937253e-06, + "loss": 0.3108, + "step": 6049 + }, + { + "epoch": 0.38, + "grad_norm": 2.0849549900381477, + "learning_rate": 7.110706462168706e-06, + "loss": 0.3093, + "step": 6050 + }, + { + "epoch": 0.38, + "grad_norm": 2.0654583039100354, + "learning_rate": 7.109783138823199e-06, + "loss": 0.3036, + "step": 6051 + }, + { + "epoch": 0.38, + "grad_norm": 3.3209794508592054, + "learning_rate": 7.108859727939042e-06, + "loss": 0.3176, + "step": 6052 + }, + { + "epoch": 0.38, + "grad_norm": 2.340267151334955, + "learning_rate": 7.107936229554549e-06, + "loss": 0.297, + "step": 6053 + }, + { + "epoch": 0.38, + "grad_norm": 1.9390822601459914, + "learning_rate": 7.107012643708039e-06, + "loss": 0.2924, + "step": 6054 + }, + { + "epoch": 0.38, + "grad_norm": 4.77185725744881, + "learning_rate": 7.10608897043783e-06, + "loss": 0.2911, + "step": 6055 + }, + { + "epoch": 0.38, + "grad_norm": 1.7767442966723028, + "learning_rate": 7.10516520978225e-06, + "loss": 0.3039, + "step": 6056 + }, + { + "epoch": 0.38, + "grad_norm": 6.794659391202409, + "learning_rate": 7.104241361779627e-06, + "loss": 0.2888, + "step": 6057 + }, + { + "epoch": 0.38, + "grad_norm": 2.4730653608893003, + "learning_rate": 7.10331742646829e-06, + "loss": 0.3009, + "step": 6058 + }, + { + "epoch": 0.38, + "grad_norm": 3.28545990046778, + "learning_rate": 7.102393403886578e-06, + "loss": 0.3212, + "step": 6059 + }, + { + "epoch": 0.38, + "grad_norm": 2.170668734452948, + "learning_rate": 7.101469294072829e-06, + "loss": 0.3019, + "step": 6060 + }, + { + "epoch": 0.38, + "grad_norm": 2.303197692180006, + "learning_rate": 7.100545097065389e-06, + "loss": 0.2969, + "step": 6061 + }, + { + "epoch": 0.38, + "grad_norm": 4.766199437062035, + "learning_rate": 7.099620812902599e-06, + "loss": 0.2956, + "step": 6062 + }, + { + "epoch": 0.38, + "grad_norm": 2.234164688795876, + "learning_rate": 7.098696441622814e-06, + "loss": 0.2914, + "step": 6063 + }, + { + "epoch": 0.38, + "grad_norm": 2.962299086533189, + "learning_rate": 7.097771983264384e-06, + "loss": 0.3295, + "step": 6064 + }, + { + "epoch": 0.38, + "grad_norm": 1.501685559116226, + "learning_rate": 7.096847437865671e-06, + "loss": 0.2999, + "step": 6065 + }, + { + "epoch": 0.38, + "grad_norm": 2.0932316992331916, + "learning_rate": 7.095922805465031e-06, + "loss": 0.3018, + "step": 6066 + }, + { + "epoch": 0.38, + "grad_norm": 4.123320078935363, + "learning_rate": 7.0949980861008315e-06, + "loss": 0.2807, + "step": 6067 + }, + { + "epoch": 0.38, + "grad_norm": 3.0151895169381566, + "learning_rate": 7.0940732798114395e-06, + "loss": 0.3129, + "step": 6068 + }, + { + "epoch": 0.38, + "grad_norm": 1.5562491480090626, + "learning_rate": 7.0931483866352305e-06, + "loss": 0.3018, + "step": 6069 + }, + { + "epoch": 0.38, + "grad_norm": 2.2349881269372105, + "learning_rate": 7.092223406610574e-06, + "loss": 0.3057, + "step": 6070 + }, + { + "epoch": 0.38, + "grad_norm": 2.8705404189940347, + "learning_rate": 7.091298339775854e-06, + "loss": 0.3209, + "step": 6071 + }, + { + "epoch": 0.38, + "grad_norm": 1.9862349323270718, + "learning_rate": 7.0903731861694505e-06, + "loss": 0.304, + "step": 6072 + }, + { + "epoch": 0.38, + "grad_norm": 3.685636385013489, + "learning_rate": 7.089447945829752e-06, + "loss": 0.2902, + "step": 6073 + }, + { + "epoch": 0.38, + "grad_norm": 2.142857306447189, + "learning_rate": 7.088522618795145e-06, + "loss": 0.2931, + "step": 6074 + }, + { + "epoch": 0.38, + "grad_norm": 1.3857036789661261, + "learning_rate": 7.087597205104026e-06, + "loss": 0.3033, + "step": 6075 + }, + { + "epoch": 0.38, + "grad_norm": 4.160089716025319, + "learning_rate": 7.08667170479479e-06, + "loss": 0.3069, + "step": 6076 + }, + { + "epoch": 0.38, + "grad_norm": 2.1547547172595234, + "learning_rate": 7.085746117905841e-06, + "loss": 0.2862, + "step": 6077 + }, + { + "epoch": 0.38, + "grad_norm": 1.707743010491154, + "learning_rate": 7.084820444475579e-06, + "loss": 0.3067, + "step": 6078 + }, + { + "epoch": 0.38, + "grad_norm": 2.786143828091818, + "learning_rate": 7.083894684542413e-06, + "loss": 0.3, + "step": 6079 + }, + { + "epoch": 0.38, + "grad_norm": 4.907577231597122, + "learning_rate": 7.082968838144756e-06, + "loss": 0.3341, + "step": 6080 + }, + { + "epoch": 0.38, + "grad_norm": 2.2263893472766974, + "learning_rate": 7.082042905321022e-06, + "loss": 0.3041, + "step": 6081 + }, + { + "epoch": 0.38, + "grad_norm": 3.33554888014138, + "learning_rate": 7.081116886109629e-06, + "loss": 0.3103, + "step": 6082 + }, + { + "epoch": 0.38, + "grad_norm": 1.8114787057696915, + "learning_rate": 7.080190780549002e-06, + "loss": 0.3229, + "step": 6083 + }, + { + "epoch": 0.38, + "grad_norm": 1.9649870410684962, + "learning_rate": 7.079264588677564e-06, + "loss": 0.3382, + "step": 6084 + }, + { + "epoch": 0.38, + "grad_norm": 1.799504074440034, + "learning_rate": 7.078338310533744e-06, + "loss": 0.3073, + "step": 6085 + }, + { + "epoch": 0.38, + "grad_norm": 2.6390799041427644, + "learning_rate": 7.077411946155975e-06, + "loss": 0.3015, + "step": 6086 + }, + { + "epoch": 0.38, + "grad_norm": 3.308849381739574, + "learning_rate": 7.076485495582696e-06, + "loss": 0.2826, + "step": 6087 + }, + { + "epoch": 0.38, + "grad_norm": 1.7807209331700014, + "learning_rate": 7.0755589588523464e-06, + "loss": 0.3127, + "step": 6088 + }, + { + "epoch": 0.38, + "grad_norm": 2.062526210664507, + "learning_rate": 7.074632336003368e-06, + "loss": 0.3222, + "step": 6089 + }, + { + "epoch": 0.38, + "grad_norm": 1.8211249363935218, + "learning_rate": 7.0737056270742085e-06, + "loss": 0.2989, + "step": 6090 + }, + { + "epoch": 0.38, + "grad_norm": 3.8539341258880238, + "learning_rate": 7.072778832103321e-06, + "loss": 0.3089, + "step": 6091 + }, + { + "epoch": 0.38, + "grad_norm": 1.903834652848273, + "learning_rate": 7.071851951129156e-06, + "loss": 0.3173, + "step": 6092 + }, + { + "epoch": 0.38, + "grad_norm": 2.2189344812108156, + "learning_rate": 7.070924984190175e-06, + "loss": 0.2947, + "step": 6093 + }, + { + "epoch": 0.38, + "grad_norm": 0.6056192596870574, + "learning_rate": 7.069997931324837e-06, + "loss": 0.5081, + "step": 6094 + }, + { + "epoch": 0.38, + "grad_norm": 2.701728321342455, + "learning_rate": 7.069070792571608e-06, + "loss": 0.3021, + "step": 6095 + }, + { + "epoch": 0.38, + "grad_norm": 2.3949983094981673, + "learning_rate": 7.068143567968958e-06, + "loss": 0.3015, + "step": 6096 + }, + { + "epoch": 0.38, + "grad_norm": 2.2379630527545697, + "learning_rate": 7.067216257555357e-06, + "loss": 0.3115, + "step": 6097 + }, + { + "epoch": 0.38, + "grad_norm": 0.5775514592063432, + "learning_rate": 7.0662888613692815e-06, + "loss": 0.4892, + "step": 6098 + }, + { + "epoch": 0.38, + "grad_norm": 3.39311315109939, + "learning_rate": 7.065361379449213e-06, + "loss": 0.3138, + "step": 6099 + }, + { + "epoch": 0.38, + "grad_norm": 3.2324433257890504, + "learning_rate": 7.06443381183363e-06, + "loss": 0.2861, + "step": 6100 + }, + { + "epoch": 0.38, + "grad_norm": 1.8595551351518653, + "learning_rate": 7.063506158561022e-06, + "loss": 0.2958, + "step": 6101 + }, + { + "epoch": 0.38, + "grad_norm": 2.989934325758585, + "learning_rate": 7.062578419669877e-06, + "loss": 0.2935, + "step": 6102 + }, + { + "epoch": 0.38, + "grad_norm": 1.4691404106150727, + "learning_rate": 7.061650595198692e-06, + "loss": 0.3079, + "step": 6103 + }, + { + "epoch": 0.38, + "grad_norm": 1.7616358926720772, + "learning_rate": 7.060722685185961e-06, + "loss": 0.2921, + "step": 6104 + }, + { + "epoch": 0.38, + "grad_norm": 4.8235942004169186, + "learning_rate": 7.0597946896701854e-06, + "loss": 0.2923, + "step": 6105 + }, + { + "epoch": 0.38, + "grad_norm": 1.6410184610210388, + "learning_rate": 7.05886660868987e-06, + "loss": 0.2969, + "step": 6106 + }, + { + "epoch": 0.38, + "grad_norm": 1.6815154076557426, + "learning_rate": 7.057938442283523e-06, + "loss": 0.2984, + "step": 6107 + }, + { + "epoch": 0.38, + "grad_norm": 1.720821468920267, + "learning_rate": 7.057010190489651e-06, + "loss": 0.289, + "step": 6108 + }, + { + "epoch": 0.38, + "grad_norm": 1.6209093699482704, + "learning_rate": 7.056081853346776e-06, + "loss": 0.2944, + "step": 6109 + }, + { + "epoch": 0.38, + "grad_norm": 0.6758360820245468, + "learning_rate": 7.055153430893412e-06, + "loss": 0.4849, + "step": 6110 + }, + { + "epoch": 0.38, + "grad_norm": 2.8280460061261037, + "learning_rate": 7.054224923168083e-06, + "loss": 0.3002, + "step": 6111 + }, + { + "epoch": 0.38, + "grad_norm": 1.9719266559597206, + "learning_rate": 7.053296330209309e-06, + "loss": 0.3061, + "step": 6112 + }, + { + "epoch": 0.38, + "grad_norm": 3.6649698942428275, + "learning_rate": 7.052367652055628e-06, + "loss": 0.3133, + "step": 6113 + }, + { + "epoch": 0.38, + "grad_norm": 5.329141100603012, + "learning_rate": 7.051438888745566e-06, + "loss": 0.322, + "step": 6114 + }, + { + "epoch": 0.38, + "grad_norm": 1.6134970792257957, + "learning_rate": 7.050510040317661e-06, + "loss": 0.3054, + "step": 6115 + }, + { + "epoch": 0.38, + "grad_norm": 5.600146444124667, + "learning_rate": 7.0495811068104505e-06, + "loss": 0.3033, + "step": 6116 + }, + { + "epoch": 0.38, + "grad_norm": 4.484537435259375, + "learning_rate": 7.048652088262481e-06, + "loss": 0.3137, + "step": 6117 + }, + { + "epoch": 0.38, + "grad_norm": 2.037218912035509, + "learning_rate": 7.047722984712298e-06, + "loss": 0.3306, + "step": 6118 + }, + { + "epoch": 0.38, + "grad_norm": 2.391179915946461, + "learning_rate": 7.0467937961984505e-06, + "loss": 0.2994, + "step": 6119 + }, + { + "epoch": 0.38, + "grad_norm": 2.731809271949276, + "learning_rate": 7.045864522759492e-06, + "loss": 0.3172, + "step": 6120 + }, + { + "epoch": 0.38, + "grad_norm": 1.8965117256482238, + "learning_rate": 7.044935164433982e-06, + "loss": 0.2978, + "step": 6121 + }, + { + "epoch": 0.39, + "grad_norm": 2.130368883395997, + "learning_rate": 7.0440057212604784e-06, + "loss": 0.3091, + "step": 6122 + }, + { + "epoch": 0.39, + "grad_norm": 1.6828950671125487, + "learning_rate": 7.043076193277548e-06, + "loss": 0.2963, + "step": 6123 + }, + { + "epoch": 0.39, + "grad_norm": 2.812860545122394, + "learning_rate": 7.042146580523757e-06, + "loss": 0.2947, + "step": 6124 + }, + { + "epoch": 0.39, + "grad_norm": 3.2461511729264543, + "learning_rate": 7.0412168830376785e-06, + "loss": 0.2904, + "step": 6125 + }, + { + "epoch": 0.39, + "grad_norm": 5.780148370156523, + "learning_rate": 7.040287100857885e-06, + "loss": 0.3078, + "step": 6126 + }, + { + "epoch": 0.39, + "grad_norm": 2.443994902112538, + "learning_rate": 7.039357234022954e-06, + "loss": 0.2968, + "step": 6127 + }, + { + "epoch": 0.39, + "grad_norm": 1.738888871812275, + "learning_rate": 7.03842728257147e-06, + "loss": 0.2973, + "step": 6128 + }, + { + "epoch": 0.39, + "grad_norm": 1.7900939277729429, + "learning_rate": 7.037497246542018e-06, + "loss": 0.3074, + "step": 6129 + }, + { + "epoch": 0.39, + "grad_norm": 4.45456775526933, + "learning_rate": 7.036567125973187e-06, + "loss": 0.2964, + "step": 6130 + }, + { + "epoch": 0.39, + "grad_norm": 2.7554449753870167, + "learning_rate": 7.035636920903568e-06, + "loss": 0.304, + "step": 6131 + }, + { + "epoch": 0.39, + "grad_norm": 1.5027604018520393, + "learning_rate": 7.034706631371756e-06, + "loss": 0.3056, + "step": 6132 + }, + { + "epoch": 0.39, + "grad_norm": 3.6697743466379795, + "learning_rate": 7.033776257416354e-06, + "loss": 0.3144, + "step": 6133 + }, + { + "epoch": 0.39, + "grad_norm": 2.565734714733023, + "learning_rate": 7.03284579907596e-06, + "loss": 0.2951, + "step": 6134 + }, + { + "epoch": 0.39, + "grad_norm": 2.1532350004877565, + "learning_rate": 7.031915256389186e-06, + "loss": 0.3155, + "step": 6135 + }, + { + "epoch": 0.39, + "grad_norm": 2.8870110224742196, + "learning_rate": 7.030984629394637e-06, + "loss": 0.3057, + "step": 6136 + }, + { + "epoch": 0.39, + "grad_norm": 2.501300650823364, + "learning_rate": 7.03005391813093e-06, + "loss": 0.3081, + "step": 6137 + }, + { + "epoch": 0.39, + "grad_norm": 3.4477855092138006, + "learning_rate": 7.029123122636678e-06, + "loss": 0.3133, + "step": 6138 + }, + { + "epoch": 0.39, + "grad_norm": 1.3984298105624784, + "learning_rate": 7.028192242950506e-06, + "loss": 0.3016, + "step": 6139 + }, + { + "epoch": 0.39, + "grad_norm": 5.2803001292669896, + "learning_rate": 7.027261279111033e-06, + "loss": 0.3061, + "step": 6140 + }, + { + "epoch": 0.39, + "grad_norm": 42.62180019802184, + "learning_rate": 7.02633023115689e-06, + "loss": 0.3123, + "step": 6141 + }, + { + "epoch": 0.39, + "grad_norm": 1.8786333964794473, + "learning_rate": 7.025399099126705e-06, + "loss": 0.2947, + "step": 6142 + }, + { + "epoch": 0.39, + "grad_norm": 0.6413535434928973, + "learning_rate": 7.024467883059116e-06, + "loss": 0.5041, + "step": 6143 + }, + { + "epoch": 0.39, + "grad_norm": 1.591790878851951, + "learning_rate": 7.023536582992758e-06, + "loss": 0.2987, + "step": 6144 + }, + { + "epoch": 0.39, + "grad_norm": 2.416299671075495, + "learning_rate": 7.022605198966274e-06, + "loss": 0.2956, + "step": 6145 + }, + { + "epoch": 0.39, + "grad_norm": 2.183675660620761, + "learning_rate": 7.021673731018305e-06, + "loss": 0.2978, + "step": 6146 + }, + { + "epoch": 0.39, + "grad_norm": 2.2303023644855005, + "learning_rate": 7.0207421791875045e-06, + "loss": 0.3042, + "step": 6147 + }, + { + "epoch": 0.39, + "grad_norm": 2.1889082267322033, + "learning_rate": 7.01981054351252e-06, + "loss": 0.2879, + "step": 6148 + }, + { + "epoch": 0.39, + "grad_norm": 5.339385146404882, + "learning_rate": 7.0188788240320095e-06, + "loss": 0.3128, + "step": 6149 + }, + { + "epoch": 0.39, + "grad_norm": 1.9486207992351334, + "learning_rate": 7.017947020784629e-06, + "loss": 0.307, + "step": 6150 + }, + { + "epoch": 0.39, + "grad_norm": 7.696794029026286, + "learning_rate": 7.017015133809044e-06, + "loss": 0.2968, + "step": 6151 + }, + { + "epoch": 0.39, + "grad_norm": 2.0364165147095457, + "learning_rate": 7.016083163143918e-06, + "loss": 0.2984, + "step": 6152 + }, + { + "epoch": 0.39, + "grad_norm": 1.7201399926681409, + "learning_rate": 7.015151108827921e-06, + "loss": 0.2891, + "step": 6153 + }, + { + "epoch": 0.39, + "grad_norm": 2.2876844560550476, + "learning_rate": 7.014218970899724e-06, + "loss": 0.3287, + "step": 6154 + }, + { + "epoch": 0.39, + "grad_norm": 1.5222707925705408, + "learning_rate": 7.013286749398008e-06, + "loss": 0.3477, + "step": 6155 + }, + { + "epoch": 0.39, + "grad_norm": 3.754050839924, + "learning_rate": 7.0123544443614445e-06, + "loss": 0.2759, + "step": 6156 + }, + { + "epoch": 0.39, + "grad_norm": 2.2849969363113685, + "learning_rate": 7.011422055828721e-06, + "loss": 0.3232, + "step": 6157 + }, + { + "epoch": 0.39, + "grad_norm": 1.9947993971964004, + "learning_rate": 7.010489583838525e-06, + "loss": 0.3348, + "step": 6158 + }, + { + "epoch": 0.39, + "grad_norm": 2.0603287475019276, + "learning_rate": 7.009557028429547e-06, + "loss": 0.2989, + "step": 6159 + }, + { + "epoch": 0.39, + "grad_norm": 1.601370430986458, + "learning_rate": 7.008624389640476e-06, + "loss": 0.3173, + "step": 6160 + }, + { + "epoch": 0.39, + "grad_norm": 3.2302915464410913, + "learning_rate": 7.0076916675100115e-06, + "loss": 0.3141, + "step": 6161 + }, + { + "epoch": 0.39, + "grad_norm": 3.503792068663737, + "learning_rate": 7.0067588620768535e-06, + "loss": 0.3128, + "step": 6162 + }, + { + "epoch": 0.39, + "grad_norm": 1.704506418702534, + "learning_rate": 7.005825973379707e-06, + "loss": 0.2945, + "step": 6163 + }, + { + "epoch": 0.39, + "grad_norm": 2.332277329306159, + "learning_rate": 7.004893001457277e-06, + "loss": 0.3014, + "step": 6164 + }, + { + "epoch": 0.39, + "grad_norm": 1.6437128335933913, + "learning_rate": 7.003959946348277e-06, + "loss": 0.3038, + "step": 6165 + }, + { + "epoch": 0.39, + "grad_norm": 2.875263815796361, + "learning_rate": 7.003026808091417e-06, + "loss": 0.308, + "step": 6166 + }, + { + "epoch": 0.39, + "grad_norm": 1.3694084605402639, + "learning_rate": 7.002093586725419e-06, + "loss": 0.3067, + "step": 6167 + }, + { + "epoch": 0.39, + "grad_norm": 1.8694930266490688, + "learning_rate": 7.001160282289e-06, + "loss": 0.305, + "step": 6168 + }, + { + "epoch": 0.39, + "grad_norm": 1.7609396688890446, + "learning_rate": 7.000226894820888e-06, + "loss": 0.2942, + "step": 6169 + }, + { + "epoch": 0.39, + "grad_norm": 2.0930306386116806, + "learning_rate": 6.99929342435981e-06, + "loss": 0.323, + "step": 6170 + }, + { + "epoch": 0.39, + "grad_norm": 2.8113095138643645, + "learning_rate": 6.998359870944495e-06, + "loss": 0.2783, + "step": 6171 + }, + { + "epoch": 0.39, + "grad_norm": 2.371474334736473, + "learning_rate": 6.99742623461368e-06, + "loss": 0.3029, + "step": 6172 + }, + { + "epoch": 0.39, + "grad_norm": 1.530787928579816, + "learning_rate": 6.996492515406104e-06, + "loss": 0.3019, + "step": 6173 + }, + { + "epoch": 0.39, + "grad_norm": 2.1724544346209704, + "learning_rate": 6.995558713360505e-06, + "loss": 0.3042, + "step": 6174 + }, + { + "epoch": 0.39, + "grad_norm": 2.301487508566316, + "learning_rate": 6.994624828515632e-06, + "loss": 0.2969, + "step": 6175 + }, + { + "epoch": 0.39, + "grad_norm": 1.9765221901474412, + "learning_rate": 6.993690860910232e-06, + "loss": 0.3137, + "step": 6176 + }, + { + "epoch": 0.39, + "grad_norm": 2.297429021572898, + "learning_rate": 6.992756810583057e-06, + "loss": 0.3203, + "step": 6177 + }, + { + "epoch": 0.39, + "grad_norm": 1.4405690815449252, + "learning_rate": 6.991822677572862e-06, + "loss": 0.2862, + "step": 6178 + }, + { + "epoch": 0.39, + "grad_norm": 1.9009653441134091, + "learning_rate": 6.9908884619184054e-06, + "loss": 0.2977, + "step": 6179 + }, + { + "epoch": 0.39, + "grad_norm": 4.049451385108946, + "learning_rate": 6.98995416365845e-06, + "loss": 0.302, + "step": 6180 + }, + { + "epoch": 0.39, + "grad_norm": 2.8342916199501595, + "learning_rate": 6.989019782831764e-06, + "loss": 0.3101, + "step": 6181 + }, + { + "epoch": 0.39, + "grad_norm": 3.0547641381279673, + "learning_rate": 6.988085319477114e-06, + "loss": 0.3038, + "step": 6182 + }, + { + "epoch": 0.39, + "grad_norm": 2.3825054175343645, + "learning_rate": 6.987150773633271e-06, + "loss": 0.3111, + "step": 6183 + }, + { + "epoch": 0.39, + "grad_norm": 1.8135322516377184, + "learning_rate": 6.9862161453390145e-06, + "loss": 0.2849, + "step": 6184 + }, + { + "epoch": 0.39, + "grad_norm": 1.4229262153991158, + "learning_rate": 6.9852814346331225e-06, + "loss": 0.3119, + "step": 6185 + }, + { + "epoch": 0.39, + "grad_norm": 4.675703339366878, + "learning_rate": 6.984346641554376e-06, + "loss": 0.2941, + "step": 6186 + }, + { + "epoch": 0.39, + "grad_norm": 2.055667374737458, + "learning_rate": 6.983411766141563e-06, + "loss": 0.3274, + "step": 6187 + }, + { + "epoch": 0.39, + "grad_norm": 2.6502831156998723, + "learning_rate": 6.9824768084334736e-06, + "loss": 0.2932, + "step": 6188 + }, + { + "epoch": 0.39, + "grad_norm": 1.7299002277457307, + "learning_rate": 6.9815417684689e-06, + "loss": 0.3117, + "step": 6189 + }, + { + "epoch": 0.39, + "grad_norm": 1.6634212748601143, + "learning_rate": 6.980606646286637e-06, + "loss": 0.3262, + "step": 6190 + }, + { + "epoch": 0.39, + "grad_norm": 2.774921914155989, + "learning_rate": 6.97967144192549e-06, + "loss": 0.3047, + "step": 6191 + }, + { + "epoch": 0.39, + "grad_norm": 2.1742148632581366, + "learning_rate": 6.978736155424255e-06, + "loss": 0.2885, + "step": 6192 + }, + { + "epoch": 0.39, + "grad_norm": 2.024499807094335, + "learning_rate": 6.977800786821744e-06, + "loss": 0.2962, + "step": 6193 + }, + { + "epoch": 0.39, + "grad_norm": 2.601096611957067, + "learning_rate": 6.976865336156765e-06, + "loss": 0.3191, + "step": 6194 + }, + { + "epoch": 0.39, + "grad_norm": 1.990699948167919, + "learning_rate": 6.975929803468133e-06, + "loss": 0.3044, + "step": 6195 + }, + { + "epoch": 0.39, + "grad_norm": 1.4728847852061975, + "learning_rate": 6.974994188794662e-06, + "loss": 0.2947, + "step": 6196 + }, + { + "epoch": 0.39, + "grad_norm": 1.4848283441348078, + "learning_rate": 6.974058492175176e-06, + "loss": 0.2987, + "step": 6197 + }, + { + "epoch": 0.39, + "grad_norm": 1.5987230287646015, + "learning_rate": 6.973122713648495e-06, + "loss": 0.2955, + "step": 6198 + }, + { + "epoch": 0.39, + "grad_norm": 3.8083417251027383, + "learning_rate": 6.97218685325345e-06, + "loss": 0.293, + "step": 6199 + }, + { + "epoch": 0.39, + "grad_norm": 1.970791728847927, + "learning_rate": 6.9712509110288686e-06, + "loss": 0.2979, + "step": 6200 + }, + { + "epoch": 0.39, + "grad_norm": 2.2241409000408114, + "learning_rate": 6.970314887013585e-06, + "loss": 0.3084, + "step": 6201 + }, + { + "epoch": 0.39, + "grad_norm": 1.776637136199845, + "learning_rate": 6.969378781246436e-06, + "loss": 0.3005, + "step": 6202 + }, + { + "epoch": 0.39, + "grad_norm": 2.3347987153060936, + "learning_rate": 6.968442593766266e-06, + "loss": 0.319, + "step": 6203 + }, + { + "epoch": 0.39, + "grad_norm": 2.2445610065185355, + "learning_rate": 6.967506324611915e-06, + "loss": 0.314, + "step": 6204 + }, + { + "epoch": 0.39, + "grad_norm": 1.6001916367689244, + "learning_rate": 6.9665699738222316e-06, + "loss": 0.2999, + "step": 6205 + }, + { + "epoch": 0.39, + "grad_norm": 4.2452831174311605, + "learning_rate": 6.965633541436066e-06, + "loss": 0.3092, + "step": 6206 + }, + { + "epoch": 0.39, + "grad_norm": 2.9719513079696007, + "learning_rate": 6.964697027492277e-06, + "loss": 0.3064, + "step": 6207 + }, + { + "epoch": 0.39, + "grad_norm": 3.3324282619334165, + "learning_rate": 6.963760432029716e-06, + "loss": 0.3328, + "step": 6208 + }, + { + "epoch": 0.39, + "grad_norm": 3.3561641089060426, + "learning_rate": 6.9628237550872465e-06, + "loss": 0.3092, + "step": 6209 + }, + { + "epoch": 0.39, + "grad_norm": 2.5942604909686575, + "learning_rate": 6.961886996703733e-06, + "loss": 0.3166, + "step": 6210 + }, + { + "epoch": 0.39, + "grad_norm": 152.38072679946387, + "learning_rate": 6.960950156918045e-06, + "loss": 0.2966, + "step": 6211 + }, + { + "epoch": 0.39, + "grad_norm": 1.6104351817313225, + "learning_rate": 6.960013235769051e-06, + "loss": 0.287, + "step": 6212 + }, + { + "epoch": 0.39, + "grad_norm": 1.9535967986263134, + "learning_rate": 6.959076233295625e-06, + "loss": 0.3308, + "step": 6213 + }, + { + "epoch": 0.39, + "grad_norm": 27.233761279908748, + "learning_rate": 6.958139149536648e-06, + "loss": 0.3323, + "step": 6214 + }, + { + "epoch": 0.39, + "grad_norm": 2.0368906440057692, + "learning_rate": 6.957201984531e-06, + "loss": 0.3064, + "step": 6215 + }, + { + "epoch": 0.39, + "grad_norm": 0.6859032984801705, + "learning_rate": 6.956264738317564e-06, + "loss": 0.5166, + "step": 6216 + }, + { + "epoch": 0.39, + "grad_norm": 7.352088624571709, + "learning_rate": 6.9553274109352305e-06, + "loss": 0.3143, + "step": 6217 + }, + { + "epoch": 0.39, + "grad_norm": 3.8277867727762485, + "learning_rate": 6.954390002422889e-06, + "loss": 0.2988, + "step": 6218 + }, + { + "epoch": 0.39, + "grad_norm": 7.602359366787512, + "learning_rate": 6.953452512819435e-06, + "loss": 0.3167, + "step": 6219 + }, + { + "epoch": 0.39, + "grad_norm": 1.5034782055592268, + "learning_rate": 6.952514942163766e-06, + "loss": 0.2939, + "step": 6220 + }, + { + "epoch": 0.39, + "grad_norm": 1.8499237300295603, + "learning_rate": 6.951577290494784e-06, + "loss": 0.3092, + "step": 6221 + }, + { + "epoch": 0.39, + "grad_norm": 2.5586430421078536, + "learning_rate": 6.950639557851395e-06, + "loss": 0.3071, + "step": 6222 + }, + { + "epoch": 0.39, + "grad_norm": 31.27574456336476, + "learning_rate": 6.949701744272506e-06, + "loss": 0.3065, + "step": 6223 + }, + { + "epoch": 0.39, + "grad_norm": 2.0730910107267553, + "learning_rate": 6.9487638497970266e-06, + "loss": 0.3084, + "step": 6224 + }, + { + "epoch": 0.39, + "grad_norm": 2.949287808216297, + "learning_rate": 6.947825874463876e-06, + "loss": 0.3057, + "step": 6225 + }, + { + "epoch": 0.39, + "grad_norm": 1.9460132840550055, + "learning_rate": 6.946887818311969e-06, + "loss": 0.292, + "step": 6226 + }, + { + "epoch": 0.39, + "grad_norm": 1.5116987370461523, + "learning_rate": 6.945949681380229e-06, + "loss": 0.3014, + "step": 6227 + }, + { + "epoch": 0.39, + "grad_norm": 1.727836623745217, + "learning_rate": 6.9450114637075785e-06, + "loss": 0.3037, + "step": 6228 + }, + { + "epoch": 0.39, + "grad_norm": 2.5963086534531157, + "learning_rate": 6.944073165332949e-06, + "loss": 0.303, + "step": 6229 + }, + { + "epoch": 0.39, + "grad_norm": 2.5931571148043875, + "learning_rate": 6.943134786295272e-06, + "loss": 0.3196, + "step": 6230 + }, + { + "epoch": 0.39, + "grad_norm": 1.749088870863373, + "learning_rate": 6.942196326633479e-06, + "loss": 0.298, + "step": 6231 + }, + { + "epoch": 0.39, + "grad_norm": 2.0658775691579407, + "learning_rate": 6.941257786386511e-06, + "loss": 0.3008, + "step": 6232 + }, + { + "epoch": 0.39, + "grad_norm": 1.504533425052485, + "learning_rate": 6.940319165593312e-06, + "loss": 0.2871, + "step": 6233 + }, + { + "epoch": 0.39, + "grad_norm": 1.921881588484143, + "learning_rate": 6.939380464292822e-06, + "loss": 0.3153, + "step": 6234 + }, + { + "epoch": 0.39, + "grad_norm": 1.8339638466264867, + "learning_rate": 6.938441682523992e-06, + "loss": 0.3271, + "step": 6235 + }, + { + "epoch": 0.39, + "grad_norm": 3.858992607009354, + "learning_rate": 6.9375028203257745e-06, + "loss": 0.3066, + "step": 6236 + }, + { + "epoch": 0.39, + "grad_norm": 2.5885599558565247, + "learning_rate": 6.936563877737124e-06, + "loss": 0.3032, + "step": 6237 + }, + { + "epoch": 0.39, + "grad_norm": 1.4347952306799847, + "learning_rate": 6.935624854796996e-06, + "loss": 0.2824, + "step": 6238 + }, + { + "epoch": 0.39, + "grad_norm": 2.0816583249155474, + "learning_rate": 6.934685751544356e-06, + "loss": 0.2944, + "step": 6239 + }, + { + "epoch": 0.39, + "grad_norm": 1.806656978109954, + "learning_rate": 6.933746568018168e-06, + "loss": 0.2914, + "step": 6240 + }, + { + "epoch": 0.39, + "grad_norm": 2.12612682284159, + "learning_rate": 6.932807304257401e-06, + "loss": 0.2904, + "step": 6241 + }, + { + "epoch": 0.39, + "grad_norm": 2.2979071889701084, + "learning_rate": 6.931867960301024e-06, + "loss": 0.3017, + "step": 6242 + }, + { + "epoch": 0.39, + "grad_norm": 2.028160544617963, + "learning_rate": 6.930928536188015e-06, + "loss": 0.3065, + "step": 6243 + }, + { + "epoch": 0.39, + "grad_norm": 0.7094229070482596, + "learning_rate": 6.929989031957352e-06, + "loss": 0.4919, + "step": 6244 + }, + { + "epoch": 0.39, + "grad_norm": 2.496110455846103, + "learning_rate": 6.929049447648015e-06, + "loss": 0.3175, + "step": 6245 + }, + { + "epoch": 0.39, + "grad_norm": 2.453174483302288, + "learning_rate": 6.928109783298989e-06, + "loss": 0.3055, + "step": 6246 + }, + { + "epoch": 0.39, + "grad_norm": 4.99618496581895, + "learning_rate": 6.927170038949267e-06, + "loss": 0.3059, + "step": 6247 + }, + { + "epoch": 0.39, + "grad_norm": 3.6402182504686644, + "learning_rate": 6.926230214637833e-06, + "loss": 0.3034, + "step": 6248 + }, + { + "epoch": 0.39, + "grad_norm": 1.5233747917849692, + "learning_rate": 6.925290310403689e-06, + "loss": 0.3063, + "step": 6249 + }, + { + "epoch": 0.39, + "grad_norm": 1.6409800461397064, + "learning_rate": 6.9243503262858285e-06, + "loss": 0.3051, + "step": 6250 + }, + { + "epoch": 0.39, + "grad_norm": 1.977322549412839, + "learning_rate": 6.9234102623232555e-06, + "loss": 0.2981, + "step": 6251 + }, + { + "epoch": 0.39, + "grad_norm": 0.656496815216476, + "learning_rate": 6.922470118554975e-06, + "loss": 0.5009, + "step": 6252 + }, + { + "epoch": 0.39, + "grad_norm": 1.431708885245252, + "learning_rate": 6.921529895019995e-06, + "loss": 0.2936, + "step": 6253 + }, + { + "epoch": 0.39, + "grad_norm": 2.556348654702341, + "learning_rate": 6.920589591757324e-06, + "loss": 0.3245, + "step": 6254 + }, + { + "epoch": 0.39, + "grad_norm": 4.667316568236721, + "learning_rate": 6.919649208805982e-06, + "loss": 0.3048, + "step": 6255 + }, + { + "epoch": 0.39, + "grad_norm": 7.935196027183069, + "learning_rate": 6.9187087462049825e-06, + "loss": 0.2968, + "step": 6256 + }, + { + "epoch": 0.39, + "grad_norm": 5.454611931272937, + "learning_rate": 6.917768203993351e-06, + "loss": 0.3067, + "step": 6257 + }, + { + "epoch": 0.39, + "grad_norm": 3.5713638831818426, + "learning_rate": 6.91682758221011e-06, + "loss": 0.3024, + "step": 6258 + }, + { + "epoch": 0.39, + "grad_norm": 1.5883838395154235, + "learning_rate": 6.915886880894288e-06, + "loss": 0.3081, + "step": 6259 + }, + { + "epoch": 0.39, + "grad_norm": 4.262431695447782, + "learning_rate": 6.914946100084916e-06, + "loss": 0.3142, + "step": 6260 + }, + { + "epoch": 0.39, + "grad_norm": 1.8432977159515525, + "learning_rate": 6.914005239821029e-06, + "loss": 0.295, + "step": 6261 + }, + { + "epoch": 0.39, + "grad_norm": 1.4358082624150301, + "learning_rate": 6.913064300141664e-06, + "loss": 0.3194, + "step": 6262 + }, + { + "epoch": 0.39, + "grad_norm": 1.5763974867524773, + "learning_rate": 6.912123281085865e-06, + "loss": 0.2939, + "step": 6263 + }, + { + "epoch": 0.39, + "grad_norm": 1.9178292226801221, + "learning_rate": 6.911182182692674e-06, + "loss": 0.3076, + "step": 6264 + }, + { + "epoch": 0.39, + "grad_norm": 2.1300446005851024, + "learning_rate": 6.910241005001139e-06, + "loss": 0.3028, + "step": 6265 + }, + { + "epoch": 0.39, + "grad_norm": 2.342598594435115, + "learning_rate": 6.9092997480503125e-06, + "loss": 0.2987, + "step": 6266 + }, + { + "epoch": 0.39, + "grad_norm": 2.8683884499375756, + "learning_rate": 6.908358411879249e-06, + "loss": 0.3218, + "step": 6267 + }, + { + "epoch": 0.39, + "grad_norm": 1.8350418262789883, + "learning_rate": 6.907416996527003e-06, + "loss": 0.2927, + "step": 6268 + }, + { + "epoch": 0.39, + "grad_norm": 1.4211472264303675, + "learning_rate": 6.906475502032639e-06, + "loss": 0.2996, + "step": 6269 + }, + { + "epoch": 0.39, + "grad_norm": 1.7646794548493567, + "learning_rate": 6.90553392843522e-06, + "loss": 0.2933, + "step": 6270 + }, + { + "epoch": 0.39, + "grad_norm": 3.2783824298141386, + "learning_rate": 6.904592275773816e-06, + "loss": 0.3227, + "step": 6271 + }, + { + "epoch": 0.39, + "grad_norm": 1.576439189656656, + "learning_rate": 6.9036505440874915e-06, + "loss": 0.3012, + "step": 6272 + }, + { + "epoch": 0.39, + "grad_norm": 2.6992834889994706, + "learning_rate": 6.902708733415328e-06, + "loss": 0.3077, + "step": 6273 + }, + { + "epoch": 0.39, + "grad_norm": 2.317652696958572, + "learning_rate": 6.901766843796398e-06, + "loss": 0.3153, + "step": 6274 + }, + { + "epoch": 0.39, + "grad_norm": 2.432923743401709, + "learning_rate": 6.900824875269785e-06, + "loss": 0.3336, + "step": 6275 + }, + { + "epoch": 0.39, + "grad_norm": 1.4839330873472332, + "learning_rate": 6.8998828278745686e-06, + "loss": 0.3213, + "step": 6276 + }, + { + "epoch": 0.39, + "grad_norm": 1.5807956176225497, + "learning_rate": 6.898940701649842e-06, + "loss": 0.2924, + "step": 6277 + }, + { + "epoch": 0.39, + "grad_norm": 3.3568996076431077, + "learning_rate": 6.8979984966346914e-06, + "loss": 0.3095, + "step": 6278 + }, + { + "epoch": 0.39, + "grad_norm": 1.43490768163794, + "learning_rate": 6.897056212868214e-06, + "loss": 0.3039, + "step": 6279 + }, + { + "epoch": 0.39, + "grad_norm": 1.9956618792777565, + "learning_rate": 6.8961138503895005e-06, + "loss": 0.3061, + "step": 6280 + }, + { + "epoch": 0.4, + "grad_norm": 1.9596725519419582, + "learning_rate": 6.89517140923766e-06, + "loss": 0.3204, + "step": 6281 + }, + { + "epoch": 0.4, + "grad_norm": 1.7387812431055278, + "learning_rate": 6.89422888945179e-06, + "loss": 0.2911, + "step": 6282 + }, + { + "epoch": 0.4, + "grad_norm": 2.4441244823517567, + "learning_rate": 6.893286291071e-06, + "loss": 0.3176, + "step": 6283 + }, + { + "epoch": 0.4, + "grad_norm": 1.0913105776521217, + "learning_rate": 6.892343614134395e-06, + "loss": 0.2994, + "step": 6284 + }, + { + "epoch": 0.4, + "grad_norm": 1.550371241432521, + "learning_rate": 6.891400858681097e-06, + "loss": 0.3072, + "step": 6285 + }, + { + "epoch": 0.4, + "grad_norm": 1.4329921081678139, + "learning_rate": 6.890458024750214e-06, + "loss": 0.3028, + "step": 6286 + }, + { + "epoch": 0.4, + "grad_norm": 1.6129288678403952, + "learning_rate": 6.889515112380871e-06, + "loss": 0.3189, + "step": 6287 + }, + { + "epoch": 0.4, + "grad_norm": 1.6080109905841038, + "learning_rate": 6.88857212161219e-06, + "loss": 0.3228, + "step": 6288 + }, + { + "epoch": 0.4, + "grad_norm": 1.3992892619014397, + "learning_rate": 6.887629052483299e-06, + "loss": 0.2798, + "step": 6289 + }, + { + "epoch": 0.4, + "grad_norm": 4.384952561702454, + "learning_rate": 6.886685905033324e-06, + "loss": 0.3005, + "step": 6290 + }, + { + "epoch": 0.4, + "grad_norm": 1.456135948922505, + "learning_rate": 6.885742679301399e-06, + "loss": 0.3127, + "step": 6291 + }, + { + "epoch": 0.4, + "grad_norm": 1.424520455355706, + "learning_rate": 6.884799375326662e-06, + "loss": 0.3069, + "step": 6292 + }, + { + "epoch": 0.4, + "grad_norm": 1.9529667834212303, + "learning_rate": 6.883855993148252e-06, + "loss": 0.3136, + "step": 6293 + }, + { + "epoch": 0.4, + "grad_norm": 1.515609439982961, + "learning_rate": 6.882912532805308e-06, + "loss": 0.3064, + "step": 6294 + }, + { + "epoch": 0.4, + "grad_norm": 1.9749881320005336, + "learning_rate": 6.8819689943369805e-06, + "loss": 0.3119, + "step": 6295 + }, + { + "epoch": 0.4, + "grad_norm": 1.952786377641116, + "learning_rate": 6.881025377782415e-06, + "loss": 0.318, + "step": 6296 + }, + { + "epoch": 0.4, + "grad_norm": 1.8379560886548978, + "learning_rate": 6.880081683180768e-06, + "loss": 0.313, + "step": 6297 + }, + { + "epoch": 0.4, + "grad_norm": 1.1068557721472867, + "learning_rate": 6.879137910571191e-06, + "loss": 0.2853, + "step": 6298 + }, + { + "epoch": 0.4, + "grad_norm": 1.9633950582282589, + "learning_rate": 6.878194059992846e-06, + "loss": 0.2957, + "step": 6299 + }, + { + "epoch": 0.4, + "grad_norm": 1.4827048174594124, + "learning_rate": 6.8772501314848915e-06, + "loss": 0.3068, + "step": 6300 + }, + { + "epoch": 0.4, + "grad_norm": 1.4277887708746932, + "learning_rate": 6.876306125086496e-06, + "loss": 0.2938, + "step": 6301 + }, + { + "epoch": 0.4, + "grad_norm": 1.445439710481941, + "learning_rate": 6.8753620408368235e-06, + "loss": 0.2964, + "step": 6302 + }, + { + "epoch": 0.4, + "grad_norm": 2.037116657515197, + "learning_rate": 6.8744178787750526e-06, + "loss": 0.3028, + "step": 6303 + }, + { + "epoch": 0.4, + "grad_norm": 2.177952274362125, + "learning_rate": 6.873473638940354e-06, + "loss": 0.2987, + "step": 6304 + }, + { + "epoch": 0.4, + "grad_norm": 1.9791992705023835, + "learning_rate": 6.872529321371906e-06, + "loss": 0.3175, + "step": 6305 + }, + { + "epoch": 0.4, + "grad_norm": 1.6868036208965926, + "learning_rate": 6.87158492610889e-06, + "loss": 0.3118, + "step": 6306 + }, + { + "epoch": 0.4, + "grad_norm": 3.198675008942469, + "learning_rate": 6.870640453190491e-06, + "loss": 0.3157, + "step": 6307 + }, + { + "epoch": 0.4, + "grad_norm": 2.433621451491889, + "learning_rate": 6.869695902655898e-06, + "loss": 0.304, + "step": 6308 + }, + { + "epoch": 0.4, + "grad_norm": 1.9460073702454201, + "learning_rate": 6.868751274544301e-06, + "loss": 0.294, + "step": 6309 + }, + { + "epoch": 0.4, + "grad_norm": 1.9670261253683712, + "learning_rate": 6.867806568894893e-06, + "loss": 0.311, + "step": 6310 + }, + { + "epoch": 0.4, + "grad_norm": 0.6591038286039345, + "learning_rate": 6.866861785746873e-06, + "loss": 0.5118, + "step": 6311 + }, + { + "epoch": 0.4, + "grad_norm": 1.4779175952239088, + "learning_rate": 6.865916925139442e-06, + "loss": 0.3067, + "step": 6312 + }, + { + "epoch": 0.4, + "grad_norm": 2.511401538514108, + "learning_rate": 6.864971987111804e-06, + "loss": 0.3111, + "step": 6313 + }, + { + "epoch": 0.4, + "grad_norm": 1.692742950586186, + "learning_rate": 6.864026971703166e-06, + "loss": 0.3071, + "step": 6314 + }, + { + "epoch": 0.4, + "grad_norm": 5.481765192599177, + "learning_rate": 6.863081878952738e-06, + "loss": 0.3295, + "step": 6315 + }, + { + "epoch": 0.4, + "grad_norm": 1.8603319326306396, + "learning_rate": 6.8621367088997325e-06, + "loss": 0.3238, + "step": 6316 + }, + { + "epoch": 0.4, + "grad_norm": 1.715309388077017, + "learning_rate": 6.8611914615833676e-06, + "loss": 0.2939, + "step": 6317 + }, + { + "epoch": 0.4, + "grad_norm": 2.9872358204036504, + "learning_rate": 6.860246137042863e-06, + "loss": 0.2956, + "step": 6318 + }, + { + "epoch": 0.4, + "grad_norm": 1.8065665490318836, + "learning_rate": 6.859300735317444e-06, + "loss": 0.3002, + "step": 6319 + }, + { + "epoch": 0.4, + "grad_norm": 1.342606612220326, + "learning_rate": 6.858355256446333e-06, + "loss": 0.3155, + "step": 6320 + }, + { + "epoch": 0.4, + "grad_norm": 7.590118021556689, + "learning_rate": 6.857409700468762e-06, + "loss": 0.2907, + "step": 6321 + }, + { + "epoch": 0.4, + "grad_norm": 1.921239443935161, + "learning_rate": 6.856464067423963e-06, + "loss": 0.3159, + "step": 6322 + }, + { + "epoch": 0.4, + "grad_norm": 2.574425643570183, + "learning_rate": 6.855518357351174e-06, + "loss": 0.3008, + "step": 6323 + }, + { + "epoch": 0.4, + "grad_norm": 1.5398285287025202, + "learning_rate": 6.854572570289632e-06, + "loss": 0.3065, + "step": 6324 + }, + { + "epoch": 0.4, + "grad_norm": 2.8041753943042287, + "learning_rate": 6.853626706278579e-06, + "loss": 0.3078, + "step": 6325 + }, + { + "epoch": 0.4, + "grad_norm": 17.057675770228045, + "learning_rate": 6.852680765357262e-06, + "loss": 0.3064, + "step": 6326 + }, + { + "epoch": 0.4, + "grad_norm": 1.4892062230634326, + "learning_rate": 6.85173474756493e-06, + "loss": 0.31, + "step": 6327 + }, + { + "epoch": 0.4, + "grad_norm": 2.0511590689152306, + "learning_rate": 6.850788652940832e-06, + "loss": 0.3012, + "step": 6328 + }, + { + "epoch": 0.4, + "grad_norm": 2.0263552318565505, + "learning_rate": 6.849842481524228e-06, + "loss": 0.3301, + "step": 6329 + }, + { + "epoch": 0.4, + "grad_norm": 1.8728865182573091, + "learning_rate": 6.8488962333543715e-06, + "loss": 0.3122, + "step": 6330 + }, + { + "epoch": 0.4, + "grad_norm": 3.517592855766456, + "learning_rate": 6.847949908470529e-06, + "loss": 0.3074, + "step": 6331 + }, + { + "epoch": 0.4, + "grad_norm": 4.178893106391159, + "learning_rate": 6.84700350691196e-06, + "loss": 0.307, + "step": 6332 + }, + { + "epoch": 0.4, + "grad_norm": 3.6021767766807424, + "learning_rate": 6.846057028717937e-06, + "loss": 0.3197, + "step": 6333 + }, + { + "epoch": 0.4, + "grad_norm": 4.501002131768941, + "learning_rate": 6.845110473927727e-06, + "loss": 0.3286, + "step": 6334 + }, + { + "epoch": 0.4, + "grad_norm": 2.2132623633872157, + "learning_rate": 6.844163842580608e-06, + "loss": 0.3226, + "step": 6335 + }, + { + "epoch": 0.4, + "grad_norm": 1.5350315206393872, + "learning_rate": 6.8432171347158535e-06, + "loss": 0.3115, + "step": 6336 + }, + { + "epoch": 0.4, + "grad_norm": 1.1798307847448, + "learning_rate": 6.842270350372749e-06, + "loss": 0.2947, + "step": 6337 + }, + { + "epoch": 0.4, + "grad_norm": 2.6360090486603727, + "learning_rate": 6.8413234895905726e-06, + "loss": 0.2967, + "step": 6338 + }, + { + "epoch": 0.4, + "grad_norm": 2.9848580915620224, + "learning_rate": 6.840376552408614e-06, + "loss": 0.3115, + "step": 6339 + }, + { + "epoch": 0.4, + "grad_norm": 1.3405026564912617, + "learning_rate": 6.839429538866164e-06, + "loss": 0.2822, + "step": 6340 + }, + { + "epoch": 0.4, + "grad_norm": 1.5362126550497548, + "learning_rate": 6.838482449002517e-06, + "loss": 0.286, + "step": 6341 + }, + { + "epoch": 0.4, + "grad_norm": 1.3069866190457384, + "learning_rate": 6.837535282856966e-06, + "loss": 0.2885, + "step": 6342 + }, + { + "epoch": 0.4, + "grad_norm": 1.4765326992977221, + "learning_rate": 6.836588040468812e-06, + "loss": 0.3005, + "step": 6343 + }, + { + "epoch": 0.4, + "grad_norm": 9.565457286700779, + "learning_rate": 6.835640721877359e-06, + "loss": 0.2854, + "step": 6344 + }, + { + "epoch": 0.4, + "grad_norm": 2.559378590319128, + "learning_rate": 6.834693327121913e-06, + "loss": 0.3041, + "step": 6345 + }, + { + "epoch": 0.4, + "grad_norm": 2.0677104007652565, + "learning_rate": 6.83374585624178e-06, + "loss": 0.3031, + "step": 6346 + }, + { + "epoch": 0.4, + "grad_norm": 2.1149379020446104, + "learning_rate": 6.832798309276275e-06, + "loss": 0.3142, + "step": 6347 + }, + { + "epoch": 0.4, + "grad_norm": 3.225979158875827, + "learning_rate": 6.831850686264712e-06, + "loss": 0.2883, + "step": 6348 + }, + { + "epoch": 0.4, + "grad_norm": 1.4380784270508573, + "learning_rate": 6.830902987246413e-06, + "loss": 0.3088, + "step": 6349 + }, + { + "epoch": 0.4, + "grad_norm": 2.1902049420419636, + "learning_rate": 6.8299552122606934e-06, + "loss": 0.3209, + "step": 6350 + }, + { + "epoch": 0.4, + "grad_norm": 5.4983139239455925, + "learning_rate": 6.829007361346885e-06, + "loss": 0.3236, + "step": 6351 + }, + { + "epoch": 0.4, + "grad_norm": 4.38038349341543, + "learning_rate": 6.828059434544309e-06, + "loss": 0.3019, + "step": 6352 + }, + { + "epoch": 0.4, + "grad_norm": 1.5095839511716451, + "learning_rate": 6.827111431892303e-06, + "loss": 0.2996, + "step": 6353 + }, + { + "epoch": 0.4, + "grad_norm": 1.3002850599763964, + "learning_rate": 6.826163353430197e-06, + "loss": 0.2835, + "step": 6354 + }, + { + "epoch": 0.4, + "grad_norm": 1.782981876014829, + "learning_rate": 6.82521519919733e-06, + "loss": 0.3044, + "step": 6355 + }, + { + "epoch": 0.4, + "grad_norm": 1.4652631665027267, + "learning_rate": 6.8242669692330424e-06, + "loss": 0.3209, + "step": 6356 + }, + { + "epoch": 0.4, + "grad_norm": 2.2667979646321514, + "learning_rate": 6.823318663576679e-06, + "loss": 0.3175, + "step": 6357 + }, + { + "epoch": 0.4, + "grad_norm": 1.3577067350887904, + "learning_rate": 6.822370282267585e-06, + "loss": 0.2862, + "step": 6358 + }, + { + "epoch": 0.4, + "grad_norm": 1.5255784700649988, + "learning_rate": 6.82142182534511e-06, + "loss": 0.2999, + "step": 6359 + }, + { + "epoch": 0.4, + "grad_norm": 2.7601001630013533, + "learning_rate": 6.8204732928486096e-06, + "loss": 0.2916, + "step": 6360 + }, + { + "epoch": 0.4, + "grad_norm": 1.6091656405623176, + "learning_rate": 6.819524684817439e-06, + "loss": 0.312, + "step": 6361 + }, + { + "epoch": 0.4, + "grad_norm": 1.7177599903981735, + "learning_rate": 6.8185760012909566e-06, + "loss": 0.2972, + "step": 6362 + }, + { + "epoch": 0.4, + "grad_norm": 1.9267907314481778, + "learning_rate": 6.817627242308525e-06, + "loss": 0.3185, + "step": 6363 + }, + { + "epoch": 0.4, + "grad_norm": 1.8570406736156893, + "learning_rate": 6.816678407909511e-06, + "loss": 0.316, + "step": 6364 + }, + { + "epoch": 0.4, + "grad_norm": 2.0796908376535903, + "learning_rate": 6.815729498133286e-06, + "loss": 0.302, + "step": 6365 + }, + { + "epoch": 0.4, + "grad_norm": 1.4157855196807505, + "learning_rate": 6.814780513019214e-06, + "loss": 0.3037, + "step": 6366 + }, + { + "epoch": 0.4, + "grad_norm": 1.6743395572870021, + "learning_rate": 6.813831452606678e-06, + "loss": 0.2874, + "step": 6367 + }, + { + "epoch": 0.4, + "grad_norm": 5.692394679105457, + "learning_rate": 6.8128823169350535e-06, + "loss": 0.3051, + "step": 6368 + }, + { + "epoch": 0.4, + "grad_norm": 2.763263681427266, + "learning_rate": 6.811933106043721e-06, + "loss": 0.2882, + "step": 6369 + }, + { + "epoch": 0.4, + "grad_norm": 2.9985345857072736, + "learning_rate": 6.8109838199720655e-06, + "loss": 0.3012, + "step": 6370 + }, + { + "epoch": 0.4, + "grad_norm": 1.2706514808811527, + "learning_rate": 6.8100344587594754e-06, + "loss": 0.307, + "step": 6371 + }, + { + "epoch": 0.4, + "grad_norm": 1.3051438527903696, + "learning_rate": 6.809085022445341e-06, + "loss": 0.3037, + "step": 6372 + }, + { + "epoch": 0.4, + "grad_norm": 5.297934797149001, + "learning_rate": 6.808135511069054e-06, + "loss": 0.3042, + "step": 6373 + }, + { + "epoch": 0.4, + "grad_norm": 5.325539927654011, + "learning_rate": 6.807185924670013e-06, + "loss": 0.2952, + "step": 6374 + }, + { + "epoch": 0.4, + "grad_norm": 1.4080357034994866, + "learning_rate": 6.80623626328762e-06, + "loss": 0.2911, + "step": 6375 + }, + { + "epoch": 0.4, + "grad_norm": 1.9720138672113132, + "learning_rate": 6.805286526961274e-06, + "loss": 0.3278, + "step": 6376 + }, + { + "epoch": 0.4, + "grad_norm": 1.8250325174334556, + "learning_rate": 6.804336715730385e-06, + "loss": 0.3113, + "step": 6377 + }, + { + "epoch": 0.4, + "grad_norm": 2.6273334819042335, + "learning_rate": 6.803386829634361e-06, + "loss": 0.3016, + "step": 6378 + }, + { + "epoch": 0.4, + "grad_norm": 1.9125793157768267, + "learning_rate": 6.8024368687126145e-06, + "loss": 0.3044, + "step": 6379 + }, + { + "epoch": 0.4, + "grad_norm": 2.037940233798932, + "learning_rate": 6.801486833004559e-06, + "loss": 0.3044, + "step": 6380 + }, + { + "epoch": 0.4, + "grad_norm": 3.7520585156058943, + "learning_rate": 6.8005367225496155e-06, + "loss": 0.2997, + "step": 6381 + }, + { + "epoch": 0.4, + "grad_norm": 2.6297250564109693, + "learning_rate": 6.799586537387206e-06, + "loss": 0.3251, + "step": 6382 + }, + { + "epoch": 0.4, + "grad_norm": 1.6972051848994267, + "learning_rate": 6.7986362775567545e-06, + "loss": 0.3171, + "step": 6383 + }, + { + "epoch": 0.4, + "grad_norm": 0.5832104903379132, + "learning_rate": 6.79768594309769e-06, + "loss": 0.4883, + "step": 6384 + }, + { + "epoch": 0.4, + "grad_norm": 1.5790471983614522, + "learning_rate": 6.796735534049441e-06, + "loss": 0.3225, + "step": 6385 + }, + { + "epoch": 0.4, + "grad_norm": 5.330783457874948, + "learning_rate": 6.795785050451443e-06, + "loss": 0.3118, + "step": 6386 + }, + { + "epoch": 0.4, + "grad_norm": 5.835327429306243, + "learning_rate": 6.7948344923431355e-06, + "loss": 0.3374, + "step": 6387 + }, + { + "epoch": 0.4, + "grad_norm": 1.617774730322946, + "learning_rate": 6.793883859763955e-06, + "loss": 0.3133, + "step": 6388 + }, + { + "epoch": 0.4, + "grad_norm": 1.7923144480516422, + "learning_rate": 6.792933152753348e-06, + "loss": 0.3036, + "step": 6389 + }, + { + "epoch": 0.4, + "grad_norm": 2.0125838015849853, + "learning_rate": 6.791982371350761e-06, + "loss": 0.2968, + "step": 6390 + }, + { + "epoch": 0.4, + "grad_norm": 2.5522094024516724, + "learning_rate": 6.791031515595641e-06, + "loss": 0.3029, + "step": 6391 + }, + { + "epoch": 0.4, + "grad_norm": 2.270122226573881, + "learning_rate": 6.790080585527442e-06, + "loss": 0.3262, + "step": 6392 + }, + { + "epoch": 0.4, + "grad_norm": 1.4299924927621477, + "learning_rate": 6.789129581185621e-06, + "loss": 0.2933, + "step": 6393 + }, + { + "epoch": 0.4, + "grad_norm": 1.428656152912421, + "learning_rate": 6.788178502609635e-06, + "loss": 0.2936, + "step": 6394 + }, + { + "epoch": 0.4, + "grad_norm": 1.5712739983277206, + "learning_rate": 6.787227349838946e-06, + "loss": 0.3108, + "step": 6395 + }, + { + "epoch": 0.4, + "grad_norm": 1.7508718567894106, + "learning_rate": 6.786276122913021e-06, + "loss": 0.3109, + "step": 6396 + }, + { + "epoch": 0.4, + "grad_norm": 1.6948998904039196, + "learning_rate": 6.785324821871326e-06, + "loss": 0.2911, + "step": 6397 + }, + { + "epoch": 0.4, + "grad_norm": 1.8042237657566123, + "learning_rate": 6.784373446753334e-06, + "loss": 0.3153, + "step": 6398 + }, + { + "epoch": 0.4, + "grad_norm": 2.203912760157126, + "learning_rate": 6.783421997598518e-06, + "loss": 0.3216, + "step": 6399 + }, + { + "epoch": 0.4, + "grad_norm": 2.128446226595074, + "learning_rate": 6.782470474446357e-06, + "loss": 0.2887, + "step": 6400 + }, + { + "epoch": 0.4, + "grad_norm": 2.2689239331071493, + "learning_rate": 6.781518877336328e-06, + "loss": 0.3201, + "step": 6401 + }, + { + "epoch": 0.4, + "grad_norm": 1.9469366874019725, + "learning_rate": 6.7805672063079166e-06, + "loss": 0.2989, + "step": 6402 + }, + { + "epoch": 0.4, + "grad_norm": 1.8764163133376295, + "learning_rate": 6.77961546140061e-06, + "loss": 0.3321, + "step": 6403 + }, + { + "epoch": 0.4, + "grad_norm": 2.9326197487964842, + "learning_rate": 6.778663642653897e-06, + "loss": 0.2928, + "step": 6404 + }, + { + "epoch": 0.4, + "grad_norm": 2.683167891228867, + "learning_rate": 6.77771175010727e-06, + "loss": 0.2963, + "step": 6405 + }, + { + "epoch": 0.4, + "grad_norm": 4.184407438462736, + "learning_rate": 6.776759783800224e-06, + "loss": 0.3145, + "step": 6406 + }, + { + "epoch": 0.4, + "grad_norm": 2.2446580362794384, + "learning_rate": 6.775807743772258e-06, + "loss": 0.3081, + "step": 6407 + }, + { + "epoch": 0.4, + "grad_norm": 1.926659571692815, + "learning_rate": 6.7748556300628764e-06, + "loss": 0.3009, + "step": 6408 + }, + { + "epoch": 0.4, + "grad_norm": 1.6655192578806834, + "learning_rate": 6.773903442711582e-06, + "loss": 0.3182, + "step": 6409 + }, + { + "epoch": 0.4, + "grad_norm": 1.3488201062460332, + "learning_rate": 6.772951181757883e-06, + "loss": 0.3062, + "step": 6410 + }, + { + "epoch": 0.4, + "grad_norm": 3.2345830430784606, + "learning_rate": 6.77199884724129e-06, + "loss": 0.302, + "step": 6411 + }, + { + "epoch": 0.4, + "grad_norm": 2.440144329245238, + "learning_rate": 6.7710464392013165e-06, + "loss": 0.3013, + "step": 6412 + }, + { + "epoch": 0.4, + "grad_norm": 1.4499405252676987, + "learning_rate": 6.770093957677483e-06, + "loss": 0.2919, + "step": 6413 + }, + { + "epoch": 0.4, + "grad_norm": 1.4967558730981563, + "learning_rate": 6.769141402709305e-06, + "loss": 0.2891, + "step": 6414 + }, + { + "epoch": 0.4, + "grad_norm": 1.5492769404609563, + "learning_rate": 6.7681887743363085e-06, + "loss": 0.3092, + "step": 6415 + }, + { + "epoch": 0.4, + "grad_norm": 2.085038937768041, + "learning_rate": 6.767236072598018e-06, + "loss": 0.3176, + "step": 6416 + }, + { + "epoch": 0.4, + "grad_norm": 4.315609889684815, + "learning_rate": 6.766283297533965e-06, + "loss": 0.2823, + "step": 6417 + }, + { + "epoch": 0.4, + "grad_norm": 2.0800677507978316, + "learning_rate": 6.765330449183682e-06, + "loss": 0.3076, + "step": 6418 + }, + { + "epoch": 0.4, + "grad_norm": 1.8613492189867349, + "learning_rate": 6.764377527586701e-06, + "loss": 0.3399, + "step": 6419 + }, + { + "epoch": 0.4, + "grad_norm": 9.716488026652963, + "learning_rate": 6.763424532782562e-06, + "loss": 0.3122, + "step": 6420 + }, + { + "epoch": 0.4, + "grad_norm": 1.537163245755776, + "learning_rate": 6.762471464810808e-06, + "loss": 0.2919, + "step": 6421 + }, + { + "epoch": 0.4, + "grad_norm": 3.444778544636523, + "learning_rate": 6.761518323710983e-06, + "loss": 0.289, + "step": 6422 + }, + { + "epoch": 0.4, + "grad_norm": 1.4808424561732993, + "learning_rate": 6.760565109522634e-06, + "loss": 0.3142, + "step": 6423 + }, + { + "epoch": 0.4, + "grad_norm": 1.5107185897637505, + "learning_rate": 6.75961182228531e-06, + "loss": 0.2986, + "step": 6424 + }, + { + "epoch": 0.4, + "grad_norm": 1.6168280353607318, + "learning_rate": 6.758658462038568e-06, + "loss": 0.2971, + "step": 6425 + }, + { + "epoch": 0.4, + "grad_norm": 2.0215373404663097, + "learning_rate": 6.757705028821961e-06, + "loss": 0.3061, + "step": 6426 + }, + { + "epoch": 0.4, + "grad_norm": 2.0190968985247033, + "learning_rate": 6.756751522675051e-06, + "loss": 0.3095, + "step": 6427 + }, + { + "epoch": 0.4, + "grad_norm": 1.5502530079717356, + "learning_rate": 6.755797943637401e-06, + "loss": 0.2924, + "step": 6428 + }, + { + "epoch": 0.4, + "grad_norm": 1.7631284176320388, + "learning_rate": 6.754844291748575e-06, + "loss": 0.2864, + "step": 6429 + }, + { + "epoch": 0.4, + "grad_norm": 3.1932181990125468, + "learning_rate": 6.753890567048141e-06, + "loss": 0.2986, + "step": 6430 + }, + { + "epoch": 0.4, + "grad_norm": 3.0542045306519885, + "learning_rate": 6.752936769575673e-06, + "loss": 0.3014, + "step": 6431 + }, + { + "epoch": 0.4, + "grad_norm": 1.8273598593855904, + "learning_rate": 6.751982899370746e-06, + "loss": 0.2978, + "step": 6432 + }, + { + "epoch": 0.4, + "grad_norm": 6.733642977777779, + "learning_rate": 6.751028956472935e-06, + "loss": 0.314, + "step": 6433 + }, + { + "epoch": 0.4, + "grad_norm": 2.397989501728536, + "learning_rate": 6.7500749409218235e-06, + "loss": 0.3007, + "step": 6434 + }, + { + "epoch": 0.4, + "grad_norm": 3.7413082411923697, + "learning_rate": 6.749120852756994e-06, + "loss": 0.2918, + "step": 6435 + }, + { + "epoch": 0.4, + "grad_norm": 1.7194939410932075, + "learning_rate": 6.748166692018033e-06, + "loss": 0.2819, + "step": 6436 + }, + { + "epoch": 0.4, + "grad_norm": 1.772032781328613, + "learning_rate": 6.7472124587445306e-06, + "loss": 0.3071, + "step": 6437 + }, + { + "epoch": 0.4, + "grad_norm": 1.731602382722578, + "learning_rate": 6.746258152976082e-06, + "loss": 0.2945, + "step": 6438 + }, + { + "epoch": 0.4, + "grad_norm": 2.0333223022746494, + "learning_rate": 6.745303774752279e-06, + "loss": 0.3064, + "step": 6439 + }, + { + "epoch": 0.41, + "grad_norm": 1.76952663531311, + "learning_rate": 6.744349324112722e-06, + "loss": 0.3132, + "step": 6440 + }, + { + "epoch": 0.41, + "grad_norm": 3.6825494996694594, + "learning_rate": 6.743394801097014e-06, + "loss": 0.3132, + "step": 6441 + }, + { + "epoch": 0.41, + "grad_norm": 2.51728955276763, + "learning_rate": 6.7424402057447606e-06, + "loss": 0.2873, + "step": 6442 + }, + { + "epoch": 0.41, + "grad_norm": 1.7091262720438556, + "learning_rate": 6.741485538095566e-06, + "loss": 0.3023, + "step": 6443 + }, + { + "epoch": 0.41, + "grad_norm": 3.8394371015675834, + "learning_rate": 6.7405307981890436e-06, + "loss": 0.3047, + "step": 6444 + }, + { + "epoch": 0.41, + "grad_norm": 2.0800861005997704, + "learning_rate": 6.739575986064807e-06, + "loss": 0.3005, + "step": 6445 + }, + { + "epoch": 0.41, + "grad_norm": 2.598482420410477, + "learning_rate": 6.738621101762472e-06, + "loss": 0.2811, + "step": 6446 + }, + { + "epoch": 0.41, + "grad_norm": 1.7975966513433286, + "learning_rate": 6.737666145321662e-06, + "loss": 0.2843, + "step": 6447 + }, + { + "epoch": 0.41, + "grad_norm": 2.9658994276713817, + "learning_rate": 6.7367111167819955e-06, + "loss": 0.2969, + "step": 6448 + }, + { + "epoch": 0.41, + "grad_norm": 1.4202457333872267, + "learning_rate": 6.735756016183099e-06, + "loss": 0.2972, + "step": 6449 + }, + { + "epoch": 0.41, + "grad_norm": 1.4356628148175767, + "learning_rate": 6.734800843564604e-06, + "loss": 0.2804, + "step": 6450 + }, + { + "epoch": 0.41, + "grad_norm": 4.434828003590157, + "learning_rate": 6.73384559896614e-06, + "loss": 0.3023, + "step": 6451 + }, + { + "epoch": 0.41, + "grad_norm": 1.9753955136574015, + "learning_rate": 6.732890282427342e-06, + "loss": 0.2929, + "step": 6452 + }, + { + "epoch": 0.41, + "grad_norm": 1.7334199063473066, + "learning_rate": 6.731934893987849e-06, + "loss": 0.2801, + "step": 6453 + }, + { + "epoch": 0.41, + "grad_norm": 3.458364112155257, + "learning_rate": 6.7309794336873e-06, + "loss": 0.2966, + "step": 6454 + }, + { + "epoch": 0.41, + "grad_norm": 2.3031052151800613, + "learning_rate": 6.730023901565341e-06, + "loss": 0.304, + "step": 6455 + }, + { + "epoch": 0.41, + "grad_norm": 1.6402178108076182, + "learning_rate": 6.729068297661618e-06, + "loss": 0.3019, + "step": 6456 + }, + { + "epoch": 0.41, + "grad_norm": 1.6320293107294643, + "learning_rate": 6.728112622015779e-06, + "loss": 0.2945, + "step": 6457 + }, + { + "epoch": 0.41, + "grad_norm": 2.5309669224890703, + "learning_rate": 6.727156874667478e-06, + "loss": 0.3075, + "step": 6458 + }, + { + "epoch": 0.41, + "grad_norm": 3.432981301693717, + "learning_rate": 6.726201055656369e-06, + "loss": 0.2818, + "step": 6459 + }, + { + "epoch": 0.41, + "grad_norm": 0.6973018453399739, + "learning_rate": 6.725245165022114e-06, + "loss": 0.4881, + "step": 6460 + }, + { + "epoch": 0.41, + "grad_norm": 3.446417171142823, + "learning_rate": 6.724289202804373e-06, + "loss": 0.3192, + "step": 6461 + }, + { + "epoch": 0.41, + "grad_norm": 1.3604008127576046, + "learning_rate": 6.723333169042808e-06, + "loss": 0.3047, + "step": 6462 + }, + { + "epoch": 0.41, + "grad_norm": 1.807463817597169, + "learning_rate": 6.722377063777091e-06, + "loss": 0.3137, + "step": 6463 + }, + { + "epoch": 0.41, + "grad_norm": 1.8294947866474718, + "learning_rate": 6.72142088704689e-06, + "loss": 0.3068, + "step": 6464 + }, + { + "epoch": 0.41, + "grad_norm": 1.3060169917182747, + "learning_rate": 6.720464638891878e-06, + "loss": 0.2916, + "step": 6465 + }, + { + "epoch": 0.41, + "grad_norm": 2.008175265254663, + "learning_rate": 6.719508319351733e-06, + "loss": 0.3022, + "step": 6466 + }, + { + "epoch": 0.41, + "grad_norm": 2.124354686119315, + "learning_rate": 6.718551928466133e-06, + "loss": 0.3146, + "step": 6467 + }, + { + "epoch": 0.41, + "grad_norm": 1.6295233948044467, + "learning_rate": 6.717595466274762e-06, + "loss": 0.2968, + "step": 6468 + }, + { + "epoch": 0.41, + "grad_norm": 1.4702849046667679, + "learning_rate": 6.716638932817303e-06, + "loss": 0.2925, + "step": 6469 + }, + { + "epoch": 0.41, + "grad_norm": 1.6444903445591657, + "learning_rate": 6.715682328133447e-06, + "loss": 0.2863, + "step": 6470 + }, + { + "epoch": 0.41, + "grad_norm": 1.8408699849276253, + "learning_rate": 6.714725652262882e-06, + "loss": 0.2907, + "step": 6471 + }, + { + "epoch": 0.41, + "grad_norm": 1.6301294973434417, + "learning_rate": 6.713768905245306e-06, + "loss": 0.2855, + "step": 6472 + }, + { + "epoch": 0.41, + "grad_norm": 1.939913293202927, + "learning_rate": 6.712812087120413e-06, + "loss": 0.3038, + "step": 6473 + }, + { + "epoch": 0.41, + "grad_norm": 2.3121552370294105, + "learning_rate": 6.711855197927904e-06, + "loss": 0.2905, + "step": 6474 + }, + { + "epoch": 0.41, + "grad_norm": 1.5014649916385596, + "learning_rate": 6.710898237707482e-06, + "loss": 0.2995, + "step": 6475 + }, + { + "epoch": 0.41, + "grad_norm": 1.4460069249305667, + "learning_rate": 6.7099412064988555e-06, + "loss": 0.3094, + "step": 6476 + }, + { + "epoch": 0.41, + "grad_norm": 1.9799930823188674, + "learning_rate": 6.708984104341728e-06, + "loss": 0.3129, + "step": 6477 + }, + { + "epoch": 0.41, + "grad_norm": 1.6505256145852305, + "learning_rate": 6.708026931275817e-06, + "loss": 0.3025, + "step": 6478 + }, + { + "epoch": 0.41, + "grad_norm": 3.0605434419766584, + "learning_rate": 6.707069687340834e-06, + "loss": 0.3033, + "step": 6479 + }, + { + "epoch": 0.41, + "grad_norm": 2.0207596390285953, + "learning_rate": 6.706112372576499e-06, + "loss": 0.2911, + "step": 6480 + }, + { + "epoch": 0.41, + "grad_norm": 4.144713949978419, + "learning_rate": 6.705154987022528e-06, + "loss": 0.3052, + "step": 6481 + }, + { + "epoch": 0.41, + "grad_norm": 1.6216085321664697, + "learning_rate": 6.70419753071865e-06, + "loss": 0.3001, + "step": 6482 + }, + { + "epoch": 0.41, + "grad_norm": 1.7893907759835832, + "learning_rate": 6.703240003704588e-06, + "loss": 0.3005, + "step": 6483 + }, + { + "epoch": 0.41, + "grad_norm": 2.026551985538254, + "learning_rate": 6.702282406020076e-06, + "loss": 0.3295, + "step": 6484 + }, + { + "epoch": 0.41, + "grad_norm": 4.656580054248685, + "learning_rate": 6.70132473770484e-06, + "loss": 0.3237, + "step": 6485 + }, + { + "epoch": 0.41, + "grad_norm": 2.3342468805975165, + "learning_rate": 6.700366998798621e-06, + "loss": 0.2965, + "step": 6486 + }, + { + "epoch": 0.41, + "grad_norm": 2.3114027899184495, + "learning_rate": 6.699409189341153e-06, + "loss": 0.2865, + "step": 6487 + }, + { + "epoch": 0.41, + "grad_norm": 1.660419345588762, + "learning_rate": 6.69845130937218e-06, + "loss": 0.2929, + "step": 6488 + }, + { + "epoch": 0.41, + "grad_norm": 1.3563661499613435, + "learning_rate": 6.697493358931446e-06, + "loss": 0.3178, + "step": 6489 + }, + { + "epoch": 0.41, + "grad_norm": 1.5526025441779743, + "learning_rate": 6.696535338058699e-06, + "loss": 0.2829, + "step": 6490 + }, + { + "epoch": 0.41, + "grad_norm": 1.6259505361326623, + "learning_rate": 6.695577246793684e-06, + "loss": 0.2969, + "step": 6491 + }, + { + "epoch": 0.41, + "grad_norm": 0.6633457265011925, + "learning_rate": 6.694619085176159e-06, + "loss": 0.5003, + "step": 6492 + }, + { + "epoch": 0.41, + "grad_norm": 2.104362074544626, + "learning_rate": 6.693660853245878e-06, + "loss": 0.2913, + "step": 6493 + }, + { + "epoch": 0.41, + "grad_norm": 3.0840101305702516, + "learning_rate": 6.6927025510426015e-06, + "loss": 0.2877, + "step": 6494 + }, + { + "epoch": 0.41, + "grad_norm": 2.5817337343184006, + "learning_rate": 6.691744178606087e-06, + "loss": 0.3022, + "step": 6495 + }, + { + "epoch": 0.41, + "grad_norm": 1.5247565946653607, + "learning_rate": 6.690785735976103e-06, + "loss": 0.2988, + "step": 6496 + }, + { + "epoch": 0.41, + "grad_norm": 1.5498495605021445, + "learning_rate": 6.6898272231924155e-06, + "loss": 0.3035, + "step": 6497 + }, + { + "epoch": 0.41, + "grad_norm": 2.986600088902626, + "learning_rate": 6.688868640294796e-06, + "loss": 0.2984, + "step": 6498 + }, + { + "epoch": 0.41, + "grad_norm": 3.0870828973579, + "learning_rate": 6.687909987323016e-06, + "loss": 0.3022, + "step": 6499 + }, + { + "epoch": 0.41, + "grad_norm": 1.8103354335933153, + "learning_rate": 6.686951264316852e-06, + "loss": 0.3161, + "step": 6500 + }, + { + "epoch": 0.41, + "grad_norm": 1.5621955735177566, + "learning_rate": 6.6859924713160825e-06, + "loss": 0.2957, + "step": 6501 + }, + { + "epoch": 0.41, + "grad_norm": 2.0167450198214447, + "learning_rate": 6.685033608360494e-06, + "loss": 0.306, + "step": 6502 + }, + { + "epoch": 0.41, + "grad_norm": 2.373936432373496, + "learning_rate": 6.684074675489864e-06, + "loss": 0.3407, + "step": 6503 + }, + { + "epoch": 0.41, + "grad_norm": 1.8784137649234551, + "learning_rate": 6.683115672743989e-06, + "loss": 0.2972, + "step": 6504 + }, + { + "epoch": 0.41, + "grad_norm": 2.643761595008346, + "learning_rate": 6.682156600162653e-06, + "loss": 0.3029, + "step": 6505 + }, + { + "epoch": 0.41, + "grad_norm": 1.7342194894575267, + "learning_rate": 6.681197457785652e-06, + "loss": 0.2768, + "step": 6506 + }, + { + "epoch": 0.41, + "grad_norm": 2.846046235908457, + "learning_rate": 6.680238245652782e-06, + "loss": 0.2983, + "step": 6507 + }, + { + "epoch": 0.41, + "grad_norm": 2.248225598354093, + "learning_rate": 6.679278963803843e-06, + "loss": 0.2868, + "step": 6508 + }, + { + "epoch": 0.41, + "grad_norm": 0.6656411086377948, + "learning_rate": 6.678319612278636e-06, + "loss": 0.4823, + "step": 6509 + }, + { + "epoch": 0.41, + "grad_norm": 2.085584357878793, + "learning_rate": 6.677360191116971e-06, + "loss": 0.2954, + "step": 6510 + }, + { + "epoch": 0.41, + "grad_norm": 2.875879156190651, + "learning_rate": 6.676400700358647e-06, + "loss": 0.3085, + "step": 6511 + }, + { + "epoch": 0.41, + "grad_norm": 3.1069616976492127, + "learning_rate": 6.6754411400434835e-06, + "loss": 0.2905, + "step": 6512 + }, + { + "epoch": 0.41, + "grad_norm": 1.6032407445338612, + "learning_rate": 6.674481510211292e-06, + "loss": 0.2784, + "step": 6513 + }, + { + "epoch": 0.41, + "grad_norm": 3.0661652484314277, + "learning_rate": 6.673521810901886e-06, + "loss": 0.3457, + "step": 6514 + }, + { + "epoch": 0.41, + "grad_norm": 3.013500104714371, + "learning_rate": 6.672562042155089e-06, + "loss": 0.2974, + "step": 6515 + }, + { + "epoch": 0.41, + "grad_norm": 4.027355388246828, + "learning_rate": 6.671602204010722e-06, + "loss": 0.28, + "step": 6516 + }, + { + "epoch": 0.41, + "grad_norm": 2.221326735148984, + "learning_rate": 6.67064229650861e-06, + "loss": 0.3294, + "step": 6517 + }, + { + "epoch": 0.41, + "grad_norm": 2.5814221531969004, + "learning_rate": 6.669682319688582e-06, + "loss": 0.2906, + "step": 6518 + }, + { + "epoch": 0.41, + "grad_norm": 26.916348187608442, + "learning_rate": 6.6687222735904675e-06, + "loss": 0.3017, + "step": 6519 + }, + { + "epoch": 0.41, + "grad_norm": 2.0790376071172463, + "learning_rate": 6.667762158254104e-06, + "loss": 0.2981, + "step": 6520 + }, + { + "epoch": 0.41, + "grad_norm": 1.9109341389632886, + "learning_rate": 6.6668019737193255e-06, + "loss": 0.2845, + "step": 6521 + }, + { + "epoch": 0.41, + "grad_norm": 6.032732941934832, + "learning_rate": 6.665841720025972e-06, + "loss": 0.2976, + "step": 6522 + }, + { + "epoch": 0.41, + "grad_norm": 1.5256747676019773, + "learning_rate": 6.664881397213887e-06, + "loss": 0.3113, + "step": 6523 + }, + { + "epoch": 0.41, + "grad_norm": 2.0470466405784453, + "learning_rate": 6.663921005322917e-06, + "loss": 0.3078, + "step": 6524 + }, + { + "epoch": 0.41, + "grad_norm": 1.4551442549662454, + "learning_rate": 6.662960544392907e-06, + "loss": 0.3108, + "step": 6525 + }, + { + "epoch": 0.41, + "grad_norm": 2.492917944019753, + "learning_rate": 6.662000014463711e-06, + "loss": 0.3122, + "step": 6526 + }, + { + "epoch": 0.41, + "grad_norm": 1.261631175411658, + "learning_rate": 6.661039415575183e-06, + "loss": 0.2973, + "step": 6527 + }, + { + "epoch": 0.41, + "grad_norm": 1.866713432003546, + "learning_rate": 6.660078747767178e-06, + "loss": 0.309, + "step": 6528 + }, + { + "epoch": 0.41, + "grad_norm": 1.8077549956648529, + "learning_rate": 6.659118011079558e-06, + "loss": 0.3034, + "step": 6529 + }, + { + "epoch": 0.41, + "grad_norm": 2.8211784951489682, + "learning_rate": 6.658157205552185e-06, + "loss": 0.2911, + "step": 6530 + }, + { + "epoch": 0.41, + "grad_norm": 3.25057674642045, + "learning_rate": 6.6571963312249236e-06, + "loss": 0.3295, + "step": 6531 + }, + { + "epoch": 0.41, + "grad_norm": 4.04367286593556, + "learning_rate": 6.656235388137644e-06, + "loss": 0.322, + "step": 6532 + }, + { + "epoch": 0.41, + "grad_norm": 1.2583043092157906, + "learning_rate": 6.655274376330214e-06, + "loss": 0.2728, + "step": 6533 + }, + { + "epoch": 0.41, + "grad_norm": 2.162039432449415, + "learning_rate": 6.654313295842513e-06, + "loss": 0.3119, + "step": 6534 + }, + { + "epoch": 0.41, + "grad_norm": 2.1090665833326745, + "learning_rate": 6.653352146714413e-06, + "loss": 0.3204, + "step": 6535 + }, + { + "epoch": 0.41, + "grad_norm": 1.2228168014288647, + "learning_rate": 6.652390928985797e-06, + "loss": 0.2973, + "step": 6536 + }, + { + "epoch": 0.41, + "grad_norm": 2.771665543505001, + "learning_rate": 6.651429642696545e-06, + "loss": 0.2982, + "step": 6537 + }, + { + "epoch": 0.41, + "grad_norm": 2.1660760953101317, + "learning_rate": 6.6504682878865444e-06, + "loss": 0.2934, + "step": 6538 + }, + { + "epoch": 0.41, + "grad_norm": 7.0469822850552575, + "learning_rate": 6.649506864595683e-06, + "loss": 0.2928, + "step": 6539 + }, + { + "epoch": 0.41, + "grad_norm": 2.6919902299540963, + "learning_rate": 6.648545372863853e-06, + "loss": 0.3067, + "step": 6540 + }, + { + "epoch": 0.41, + "grad_norm": 2.131151185345146, + "learning_rate": 6.647583812730945e-06, + "loss": 0.3182, + "step": 6541 + }, + { + "epoch": 0.41, + "grad_norm": 3.5973916018871677, + "learning_rate": 6.646622184236861e-06, + "loss": 0.3322, + "step": 6542 + }, + { + "epoch": 0.41, + "grad_norm": 2.8035752923220416, + "learning_rate": 6.6456604874214955e-06, + "loss": 0.3168, + "step": 6543 + }, + { + "epoch": 0.41, + "grad_norm": 2.097977463715777, + "learning_rate": 6.644698722324755e-06, + "loss": 0.2925, + "step": 6544 + }, + { + "epoch": 0.41, + "grad_norm": 5.890742047703468, + "learning_rate": 6.643736888986541e-06, + "loss": 0.2945, + "step": 6545 + }, + { + "epoch": 0.41, + "grad_norm": 2.8278897102287393, + "learning_rate": 6.642774987446768e-06, + "loss": 0.3043, + "step": 6546 + }, + { + "epoch": 0.41, + "grad_norm": 1.8485565712722765, + "learning_rate": 6.641813017745339e-06, + "loss": 0.293, + "step": 6547 + }, + { + "epoch": 0.41, + "grad_norm": 2.766376519928722, + "learning_rate": 6.640850979922173e-06, + "loss": 0.2878, + "step": 6548 + }, + { + "epoch": 0.41, + "grad_norm": 2.8180337634210066, + "learning_rate": 6.639888874017185e-06, + "loss": 0.3108, + "step": 6549 + }, + { + "epoch": 0.41, + "grad_norm": 1.8359271171744111, + "learning_rate": 6.638926700070296e-06, + "loss": 0.3123, + "step": 6550 + }, + { + "epoch": 0.41, + "grad_norm": 3.1252229166077155, + "learning_rate": 6.637964458121427e-06, + "loss": 0.2957, + "step": 6551 + }, + { + "epoch": 0.41, + "grad_norm": 2.7650068696064065, + "learning_rate": 6.637002148210502e-06, + "loss": 0.2968, + "step": 6552 + }, + { + "epoch": 0.41, + "grad_norm": 1.545614146749676, + "learning_rate": 6.63603977037745e-06, + "loss": 0.2915, + "step": 6553 + }, + { + "epoch": 0.41, + "grad_norm": 2.0902969731352163, + "learning_rate": 6.635077324662203e-06, + "loss": 0.281, + "step": 6554 + }, + { + "epoch": 0.41, + "grad_norm": 1.898235766333002, + "learning_rate": 6.6341148111046935e-06, + "loss": 0.2912, + "step": 6555 + }, + { + "epoch": 0.41, + "grad_norm": 4.0526073815626145, + "learning_rate": 6.6331522297448584e-06, + "loss": 0.3282, + "step": 6556 + }, + { + "epoch": 0.41, + "grad_norm": 2.580336371799474, + "learning_rate": 6.632189580622636e-06, + "loss": 0.32, + "step": 6557 + }, + { + "epoch": 0.41, + "grad_norm": 2.3217855246364456, + "learning_rate": 6.631226863777968e-06, + "loss": 0.3033, + "step": 6558 + }, + { + "epoch": 0.41, + "grad_norm": 3.3520595471580505, + "learning_rate": 6.6302640792508e-06, + "loss": 0.3188, + "step": 6559 + }, + { + "epoch": 0.41, + "grad_norm": 2.4813151010945194, + "learning_rate": 6.629301227081082e-06, + "loss": 0.2948, + "step": 6560 + }, + { + "epoch": 0.41, + "grad_norm": 2.940935134432427, + "learning_rate": 6.6283383073087595e-06, + "loss": 0.2935, + "step": 6561 + }, + { + "epoch": 0.41, + "grad_norm": 1.5880193534118219, + "learning_rate": 6.62737531997379e-06, + "loss": 0.2863, + "step": 6562 + }, + { + "epoch": 0.41, + "grad_norm": 1.841185018704397, + "learning_rate": 6.626412265116127e-06, + "loss": 0.2871, + "step": 6563 + }, + { + "epoch": 0.41, + "grad_norm": 4.447388579476549, + "learning_rate": 6.625449142775731e-06, + "loss": 0.2895, + "step": 6564 + }, + { + "epoch": 0.41, + "grad_norm": 3.6735093007773516, + "learning_rate": 6.624485952992563e-06, + "loss": 0.3117, + "step": 6565 + }, + { + "epoch": 0.41, + "grad_norm": 4.537175043517064, + "learning_rate": 6.623522695806588e-06, + "loss": 0.3177, + "step": 6566 + }, + { + "epoch": 0.41, + "grad_norm": 1.6200215922676457, + "learning_rate": 6.62255937125777e-06, + "loss": 0.3127, + "step": 6567 + }, + { + "epoch": 0.41, + "grad_norm": 10.564596987242483, + "learning_rate": 6.621595979386084e-06, + "loss": 0.295, + "step": 6568 + }, + { + "epoch": 0.41, + "grad_norm": 8.986314193108306, + "learning_rate": 6.6206325202315e-06, + "loss": 0.3048, + "step": 6569 + }, + { + "epoch": 0.41, + "grad_norm": 0.622307023271134, + "learning_rate": 6.6196689938339946e-06, + "loss": 0.5279, + "step": 6570 + }, + { + "epoch": 0.41, + "grad_norm": 1.342715204296712, + "learning_rate": 6.618705400233544e-06, + "loss": 0.2875, + "step": 6571 + }, + { + "epoch": 0.41, + "grad_norm": 2.057952809297888, + "learning_rate": 6.617741739470134e-06, + "loss": 0.3058, + "step": 6572 + }, + { + "epoch": 0.41, + "grad_norm": 5.340990691708877, + "learning_rate": 6.616778011583744e-06, + "loss": 0.315, + "step": 6573 + }, + { + "epoch": 0.41, + "grad_norm": 2.1280708954523515, + "learning_rate": 6.6158142166143625e-06, + "loss": 0.3019, + "step": 6574 + }, + { + "epoch": 0.41, + "grad_norm": 2.9229894990239464, + "learning_rate": 6.614850354601978e-06, + "loss": 0.2781, + "step": 6575 + }, + { + "epoch": 0.41, + "grad_norm": 1.5369966196890847, + "learning_rate": 6.613886425586586e-06, + "loss": 0.288, + "step": 6576 + }, + { + "epoch": 0.41, + "grad_norm": 1.641657513587774, + "learning_rate": 6.61292242960818e-06, + "loss": 0.2982, + "step": 6577 + }, + { + "epoch": 0.41, + "grad_norm": 1.7241730992220534, + "learning_rate": 6.611958366706757e-06, + "loss": 0.3175, + "step": 6578 + }, + { + "epoch": 0.41, + "grad_norm": 1.8285734215480198, + "learning_rate": 6.610994236922317e-06, + "loss": 0.296, + "step": 6579 + }, + { + "epoch": 0.41, + "grad_norm": 3.3209263340897497, + "learning_rate": 6.610030040294866e-06, + "loss": 0.3147, + "step": 6580 + }, + { + "epoch": 0.41, + "grad_norm": 2.068601570984096, + "learning_rate": 6.6090657768644085e-06, + "loss": 0.2886, + "step": 6581 + }, + { + "epoch": 0.41, + "grad_norm": 2.3594994164369227, + "learning_rate": 6.608101446670953e-06, + "loss": 0.312, + "step": 6582 + }, + { + "epoch": 0.41, + "grad_norm": 1.6020994895289615, + "learning_rate": 6.607137049754513e-06, + "loss": 0.293, + "step": 6583 + }, + { + "epoch": 0.41, + "grad_norm": 1.850261649027184, + "learning_rate": 6.6061725861551026e-06, + "loss": 0.2782, + "step": 6584 + }, + { + "epoch": 0.41, + "grad_norm": 1.5573929837511624, + "learning_rate": 6.605208055912737e-06, + "loss": 0.3199, + "step": 6585 + }, + { + "epoch": 0.41, + "grad_norm": 1.7566236394019696, + "learning_rate": 6.604243459067441e-06, + "loss": 0.2998, + "step": 6586 + }, + { + "epoch": 0.41, + "grad_norm": 1.8706880225366673, + "learning_rate": 6.6032787956592316e-06, + "loss": 0.2885, + "step": 6587 + }, + { + "epoch": 0.41, + "grad_norm": 1.8610202506770097, + "learning_rate": 6.602314065728139e-06, + "loss": 0.3132, + "step": 6588 + }, + { + "epoch": 0.41, + "grad_norm": 1.7072450253287421, + "learning_rate": 6.601349269314188e-06, + "loss": 0.2845, + "step": 6589 + }, + { + "epoch": 0.41, + "grad_norm": 1.6233152007016336, + "learning_rate": 6.600384406457414e-06, + "loss": 0.2859, + "step": 6590 + }, + { + "epoch": 0.41, + "grad_norm": 1.702646182713347, + "learning_rate": 6.599419477197846e-06, + "loss": 0.3026, + "step": 6591 + }, + { + "epoch": 0.41, + "grad_norm": 1.8484181919293035, + "learning_rate": 6.598454481575525e-06, + "loss": 0.3033, + "step": 6592 + }, + { + "epoch": 0.41, + "grad_norm": 1.4374414106581042, + "learning_rate": 6.597489419630486e-06, + "loss": 0.3008, + "step": 6593 + }, + { + "epoch": 0.41, + "grad_norm": 2.7344994247032326, + "learning_rate": 6.5965242914027764e-06, + "loss": 0.3069, + "step": 6594 + }, + { + "epoch": 0.41, + "grad_norm": 2.349672451682584, + "learning_rate": 6.595559096932436e-06, + "loss": 0.3086, + "step": 6595 + }, + { + "epoch": 0.41, + "grad_norm": 2.0954466858856042, + "learning_rate": 6.594593836259516e-06, + "loss": 0.2854, + "step": 6596 + }, + { + "epoch": 0.41, + "grad_norm": 2.4831756558014324, + "learning_rate": 6.5936285094240635e-06, + "loss": 0.3074, + "step": 6597 + }, + { + "epoch": 0.41, + "grad_norm": 2.5382543642269466, + "learning_rate": 6.592663116466136e-06, + "loss": 0.2879, + "step": 6598 + }, + { + "epoch": 0.42, + "grad_norm": 2.0291490477998524, + "learning_rate": 6.591697657425785e-06, + "loss": 0.3152, + "step": 6599 + }, + { + "epoch": 0.42, + "grad_norm": 6.254477060601201, + "learning_rate": 6.590732132343072e-06, + "loss": 0.2717, + "step": 6600 + }, + { + "epoch": 0.42, + "grad_norm": 1.8654577208286884, + "learning_rate": 6.589766541258056e-06, + "loss": 0.3088, + "step": 6601 + }, + { + "epoch": 0.42, + "grad_norm": 1.7547277008358781, + "learning_rate": 6.588800884210804e-06, + "loss": 0.2805, + "step": 6602 + }, + { + "epoch": 0.42, + "grad_norm": 2.2004128043591042, + "learning_rate": 6.587835161241381e-06, + "loss": 0.3029, + "step": 6603 + }, + { + "epoch": 0.42, + "grad_norm": 16.76608811992652, + "learning_rate": 6.586869372389857e-06, + "loss": 0.2944, + "step": 6604 + }, + { + "epoch": 0.42, + "grad_norm": 1.8336780119686658, + "learning_rate": 6.585903517696304e-06, + "loss": 0.3146, + "step": 6605 + }, + { + "epoch": 0.42, + "grad_norm": 1.3608484381333867, + "learning_rate": 6.584937597200797e-06, + "loss": 0.2952, + "step": 6606 + }, + { + "epoch": 0.42, + "grad_norm": 1.7509240104914126, + "learning_rate": 6.5839716109434136e-06, + "loss": 0.3201, + "step": 6607 + }, + { + "epoch": 0.42, + "grad_norm": 1.9604685934523443, + "learning_rate": 6.583005558964235e-06, + "loss": 0.2982, + "step": 6608 + }, + { + "epoch": 0.42, + "grad_norm": 1.3077049783515002, + "learning_rate": 6.582039441303344e-06, + "loss": 0.2804, + "step": 6609 + }, + { + "epoch": 0.42, + "grad_norm": 1.6072040591235521, + "learning_rate": 6.581073258000827e-06, + "loss": 0.3106, + "step": 6610 + }, + { + "epoch": 0.42, + "grad_norm": 2.1569202949236703, + "learning_rate": 6.580107009096771e-06, + "loss": 0.2857, + "step": 6611 + }, + { + "epoch": 0.42, + "grad_norm": 1.7634977147567996, + "learning_rate": 6.57914069463127e-06, + "loss": 0.3002, + "step": 6612 + }, + { + "epoch": 0.42, + "grad_norm": 1.9597734943808822, + "learning_rate": 6.578174314644416e-06, + "loss": 0.3044, + "step": 6613 + }, + { + "epoch": 0.42, + "grad_norm": 2.544044361929901, + "learning_rate": 6.5772078691763065e-06, + "loss": 0.2972, + "step": 6614 + }, + { + "epoch": 0.42, + "grad_norm": 2.3856537063237093, + "learning_rate": 6.57624135826704e-06, + "loss": 0.29, + "step": 6615 + }, + { + "epoch": 0.42, + "grad_norm": 1.7474925133654111, + "learning_rate": 6.575274781956722e-06, + "loss": 0.2909, + "step": 6616 + }, + { + "epoch": 0.42, + "grad_norm": 1.7235652080990869, + "learning_rate": 6.574308140285454e-06, + "loss": 0.2926, + "step": 6617 + }, + { + "epoch": 0.42, + "grad_norm": 2.6267696018741113, + "learning_rate": 6.573341433293345e-06, + "loss": 0.3091, + "step": 6618 + }, + { + "epoch": 0.42, + "grad_norm": 2.279834635014297, + "learning_rate": 6.572374661020505e-06, + "loss": 0.3002, + "step": 6619 + }, + { + "epoch": 0.42, + "grad_norm": 1.7558010712909244, + "learning_rate": 6.571407823507049e-06, + "loss": 0.2795, + "step": 6620 + }, + { + "epoch": 0.42, + "grad_norm": 1.6294631332275649, + "learning_rate": 6.5704409207930905e-06, + "loss": 0.2777, + "step": 6621 + }, + { + "epoch": 0.42, + "grad_norm": 2.4934477254417544, + "learning_rate": 6.569473952918749e-06, + "loss": 0.3045, + "step": 6622 + }, + { + "epoch": 0.42, + "grad_norm": 2.202872314621412, + "learning_rate": 6.5685069199241435e-06, + "loss": 0.3073, + "step": 6623 + }, + { + "epoch": 0.42, + "grad_norm": 2.6364301520786926, + "learning_rate": 6.567539821849403e-06, + "loss": 0.302, + "step": 6624 + }, + { + "epoch": 0.42, + "grad_norm": 1.4578145608951738, + "learning_rate": 6.566572658734649e-06, + "loss": 0.2833, + "step": 6625 + }, + { + "epoch": 0.42, + "grad_norm": 2.5653620306932607, + "learning_rate": 6.565605430620014e-06, + "loss": 0.3066, + "step": 6626 + }, + { + "epoch": 0.42, + "grad_norm": 1.7116355398366587, + "learning_rate": 6.564638137545627e-06, + "loss": 0.3007, + "step": 6627 + }, + { + "epoch": 0.42, + "grad_norm": 1.3005990082073562, + "learning_rate": 6.563670779551627e-06, + "loss": 0.2968, + "step": 6628 + }, + { + "epoch": 0.42, + "grad_norm": 2.512059306827473, + "learning_rate": 6.562703356678147e-06, + "loss": 0.2852, + "step": 6629 + }, + { + "epoch": 0.42, + "grad_norm": 0.8029879121206913, + "learning_rate": 6.56173586896533e-06, + "loss": 0.5057, + "step": 6630 + }, + { + "epoch": 0.42, + "grad_norm": 2.51892830915327, + "learning_rate": 6.560768316453317e-06, + "loss": 0.3069, + "step": 6631 + }, + { + "epoch": 0.42, + "grad_norm": 3.4714159838244183, + "learning_rate": 6.5598006991822565e-06, + "loss": 0.3107, + "step": 6632 + }, + { + "epoch": 0.42, + "grad_norm": 3.17130969302371, + "learning_rate": 6.558833017192292e-06, + "loss": 0.3247, + "step": 6633 + }, + { + "epoch": 0.42, + "grad_norm": 2.0429954347034123, + "learning_rate": 6.557865270523578e-06, + "loss": 0.2966, + "step": 6634 + }, + { + "epoch": 0.42, + "grad_norm": 2.9615465707278945, + "learning_rate": 6.556897459216266e-06, + "loss": 0.2923, + "step": 6635 + }, + { + "epoch": 0.42, + "grad_norm": 2.0565315431508737, + "learning_rate": 6.555929583310515e-06, + "loss": 0.2982, + "step": 6636 + }, + { + "epoch": 0.42, + "grad_norm": 4.637913546854928, + "learning_rate": 6.55496164284648e-06, + "loss": 0.3019, + "step": 6637 + }, + { + "epoch": 0.42, + "grad_norm": 3.1177319613133587, + "learning_rate": 6.553993637864325e-06, + "loss": 0.2935, + "step": 6638 + }, + { + "epoch": 0.42, + "grad_norm": 1.5943474259786545, + "learning_rate": 6.5530255684042145e-06, + "loss": 0.2905, + "step": 6639 + }, + { + "epoch": 0.42, + "grad_norm": 1.5647836227289802, + "learning_rate": 6.552057434506314e-06, + "loss": 0.2964, + "step": 6640 + }, + { + "epoch": 0.42, + "grad_norm": 3.300130802169015, + "learning_rate": 6.551089236210793e-06, + "loss": 0.305, + "step": 6641 + }, + { + "epoch": 0.42, + "grad_norm": 3.525592303022024, + "learning_rate": 6.550120973557825e-06, + "loss": 0.3034, + "step": 6642 + }, + { + "epoch": 0.42, + "grad_norm": 1.5047122486921707, + "learning_rate": 6.549152646587585e-06, + "loss": 0.2963, + "step": 6643 + }, + { + "epoch": 0.42, + "grad_norm": 2.5618472647096326, + "learning_rate": 6.548184255340251e-06, + "loss": 0.3237, + "step": 6644 + }, + { + "epoch": 0.42, + "grad_norm": 1.5165527372393564, + "learning_rate": 6.5472157998560005e-06, + "loss": 0.3144, + "step": 6645 + }, + { + "epoch": 0.42, + "grad_norm": 1.8126356620836233, + "learning_rate": 6.5462472801750195e-06, + "loss": 0.2912, + "step": 6646 + }, + { + "epoch": 0.42, + "grad_norm": 2.288393285526579, + "learning_rate": 6.545278696337492e-06, + "loss": 0.2845, + "step": 6647 + }, + { + "epoch": 0.42, + "grad_norm": 4.232613800542668, + "learning_rate": 6.544310048383608e-06, + "loss": 0.3276, + "step": 6648 + }, + { + "epoch": 0.42, + "grad_norm": 1.9751667329786748, + "learning_rate": 6.543341336353555e-06, + "loss": 0.31, + "step": 6649 + }, + { + "epoch": 0.42, + "grad_norm": 1.6580555248410977, + "learning_rate": 6.54237256028753e-06, + "loss": 0.2896, + "step": 6650 + }, + { + "epoch": 0.42, + "grad_norm": 2.140158115245598, + "learning_rate": 6.541403720225726e-06, + "loss": 0.2798, + "step": 6651 + }, + { + "epoch": 0.42, + "grad_norm": 3.087865297320279, + "learning_rate": 6.5404348162083466e-06, + "loss": 0.3037, + "step": 6652 + }, + { + "epoch": 0.42, + "grad_norm": 2.5568908169910936, + "learning_rate": 6.539465848275588e-06, + "loss": 0.2964, + "step": 6653 + }, + { + "epoch": 0.42, + "grad_norm": 0.7141778926779454, + "learning_rate": 6.538496816467658e-06, + "loss": 0.501, + "step": 6654 + }, + { + "epoch": 0.42, + "grad_norm": 3.0369483023198427, + "learning_rate": 6.537527720824763e-06, + "loss": 0.3049, + "step": 6655 + }, + { + "epoch": 0.42, + "grad_norm": 1.7767126454898903, + "learning_rate": 6.536558561387112e-06, + "loss": 0.3242, + "step": 6656 + }, + { + "epoch": 0.42, + "grad_norm": 2.387913680174168, + "learning_rate": 6.535589338194916e-06, + "loss": 0.2958, + "step": 6657 + }, + { + "epoch": 0.42, + "grad_norm": 1.7439653011474678, + "learning_rate": 6.534620051288392e-06, + "loss": 0.2962, + "step": 6658 + }, + { + "epoch": 0.42, + "grad_norm": 4.1073700321500155, + "learning_rate": 6.533650700707755e-06, + "loss": 0.2867, + "step": 6659 + }, + { + "epoch": 0.42, + "grad_norm": 4.72888325269297, + "learning_rate": 6.532681286493227e-06, + "loss": 0.3015, + "step": 6660 + }, + { + "epoch": 0.42, + "grad_norm": 2.754324138536733, + "learning_rate": 6.5317118086850296e-06, + "loss": 0.3389, + "step": 6661 + }, + { + "epoch": 0.42, + "grad_norm": 1.5798089948703729, + "learning_rate": 6.53074226732339e-06, + "loss": 0.2954, + "step": 6662 + }, + { + "epoch": 0.42, + "grad_norm": 7.357269872813877, + "learning_rate": 6.5297726624485335e-06, + "loss": 0.2935, + "step": 6663 + }, + { + "epoch": 0.42, + "grad_norm": 2.4157827770860054, + "learning_rate": 6.5288029941006924e-06, + "loss": 0.2849, + "step": 6664 + }, + { + "epoch": 0.42, + "grad_norm": 2.483880541105889, + "learning_rate": 6.527833262320099e-06, + "loss": 0.3013, + "step": 6665 + }, + { + "epoch": 0.42, + "grad_norm": 1.8280625928331715, + "learning_rate": 6.526863467146991e-06, + "loss": 0.311, + "step": 6666 + }, + { + "epoch": 0.42, + "grad_norm": 16.137689554284446, + "learning_rate": 6.525893608621604e-06, + "loss": 0.3009, + "step": 6667 + }, + { + "epoch": 0.42, + "grad_norm": 1.4052662791986477, + "learning_rate": 6.524923686784184e-06, + "loss": 0.3071, + "step": 6668 + }, + { + "epoch": 0.42, + "grad_norm": 1.9190988175049173, + "learning_rate": 6.523953701674969e-06, + "loss": 0.3225, + "step": 6669 + }, + { + "epoch": 0.42, + "grad_norm": 3.4271075268165996, + "learning_rate": 6.5229836533342095e-06, + "loss": 0.3028, + "step": 6670 + }, + { + "epoch": 0.42, + "grad_norm": 1.9156249575803732, + "learning_rate": 6.5220135418021515e-06, + "loss": 0.3241, + "step": 6671 + }, + { + "epoch": 0.42, + "grad_norm": 3.1031247527236507, + "learning_rate": 6.521043367119049e-06, + "loss": 0.3017, + "step": 6672 + }, + { + "epoch": 0.42, + "grad_norm": 1.4463256394660986, + "learning_rate": 6.520073129325156e-06, + "loss": 0.2923, + "step": 6673 + }, + { + "epoch": 0.42, + "grad_norm": 2.0932528750837225, + "learning_rate": 6.51910282846073e-06, + "loss": 0.2976, + "step": 6674 + }, + { + "epoch": 0.42, + "grad_norm": 3.325389754089071, + "learning_rate": 6.518132464566026e-06, + "loss": 0.2991, + "step": 6675 + }, + { + "epoch": 0.42, + "grad_norm": 1.4153017818677673, + "learning_rate": 6.517162037681313e-06, + "loss": 0.2982, + "step": 6676 + }, + { + "epoch": 0.42, + "grad_norm": 1.4919394863188316, + "learning_rate": 6.5161915478468495e-06, + "loss": 0.2834, + "step": 6677 + }, + { + "epoch": 0.42, + "grad_norm": 1.7836137050152663, + "learning_rate": 6.5152209951029085e-06, + "loss": 0.2939, + "step": 6678 + }, + { + "epoch": 0.42, + "grad_norm": 3.0392726353543766, + "learning_rate": 6.514250379489754e-06, + "loss": 0.317, + "step": 6679 + }, + { + "epoch": 0.42, + "grad_norm": 8.336572065414183, + "learning_rate": 6.513279701047663e-06, + "loss": 0.2825, + "step": 6680 + }, + { + "epoch": 0.42, + "grad_norm": 1.5357636682880287, + "learning_rate": 6.51230895981691e-06, + "loss": 0.3017, + "step": 6681 + }, + { + "epoch": 0.42, + "grad_norm": 3.056914598784362, + "learning_rate": 6.511338155837772e-06, + "loss": 0.3285, + "step": 6682 + }, + { + "epoch": 0.42, + "grad_norm": 1.9321872914816856, + "learning_rate": 6.510367289150528e-06, + "loss": 0.2908, + "step": 6683 + }, + { + "epoch": 0.42, + "grad_norm": 1.9659965305423102, + "learning_rate": 6.509396359795465e-06, + "loss": 0.3044, + "step": 6684 + }, + { + "epoch": 0.42, + "grad_norm": 1.701003316316974, + "learning_rate": 6.508425367812864e-06, + "loss": 0.3084, + "step": 6685 + }, + { + "epoch": 0.42, + "grad_norm": 1.3292562347262862, + "learning_rate": 6.507454313243016e-06, + "loss": 0.2932, + "step": 6686 + }, + { + "epoch": 0.42, + "grad_norm": 1.8085632941667336, + "learning_rate": 6.5064831961262095e-06, + "loss": 0.2924, + "step": 6687 + }, + { + "epoch": 0.42, + "grad_norm": 5.099391928724341, + "learning_rate": 6.505512016502742e-06, + "loss": 0.293, + "step": 6688 + }, + { + "epoch": 0.42, + "grad_norm": 2.8982831618294798, + "learning_rate": 6.504540774412905e-06, + "loss": 0.3195, + "step": 6689 + }, + { + "epoch": 0.42, + "grad_norm": 1.8581115161281003, + "learning_rate": 6.503569469896999e-06, + "loss": 0.2985, + "step": 6690 + }, + { + "epoch": 0.42, + "grad_norm": 2.0070324488975997, + "learning_rate": 6.502598102995326e-06, + "loss": 0.3109, + "step": 6691 + }, + { + "epoch": 0.42, + "grad_norm": 2.5611289647158277, + "learning_rate": 6.5016266737481895e-06, + "loss": 0.3013, + "step": 6692 + }, + { + "epoch": 0.42, + "grad_norm": 2.1415445046374235, + "learning_rate": 6.500655182195893e-06, + "loss": 0.2832, + "step": 6693 + }, + { + "epoch": 0.42, + "grad_norm": 1.6257866022093999, + "learning_rate": 6.4996836283787475e-06, + "loss": 0.2883, + "step": 6694 + }, + { + "epoch": 0.42, + "grad_norm": 2.4598524460289037, + "learning_rate": 6.498712012337065e-06, + "loss": 0.2936, + "step": 6695 + }, + { + "epoch": 0.42, + "grad_norm": 1.7563371977421394, + "learning_rate": 6.497740334111161e-06, + "loss": 0.3083, + "step": 6696 + }, + { + "epoch": 0.42, + "grad_norm": 3.528684558497098, + "learning_rate": 6.496768593741347e-06, + "loss": 0.3136, + "step": 6697 + }, + { + "epoch": 0.42, + "grad_norm": 2.0870146974393315, + "learning_rate": 6.495796791267948e-06, + "loss": 0.3032, + "step": 6698 + }, + { + "epoch": 0.42, + "grad_norm": 2.1498320571843594, + "learning_rate": 6.4948249267312826e-06, + "loss": 0.3065, + "step": 6699 + }, + { + "epoch": 0.42, + "grad_norm": 3.7312396090469386, + "learning_rate": 6.493853000171677e-06, + "loss": 0.3167, + "step": 6700 + }, + { + "epoch": 0.42, + "grad_norm": 10.953251234131047, + "learning_rate": 6.492881011629455e-06, + "loss": 0.2992, + "step": 6701 + }, + { + "epoch": 0.42, + "grad_norm": 2.4939549599591375, + "learning_rate": 6.49190896114495e-06, + "loss": 0.31, + "step": 6702 + }, + { + "epoch": 0.42, + "grad_norm": 2.727483209643662, + "learning_rate": 6.4909368487584916e-06, + "loss": 0.3, + "step": 6703 + }, + { + "epoch": 0.42, + "grad_norm": 4.580324751770327, + "learning_rate": 6.489964674510415e-06, + "loss": 0.3069, + "step": 6704 + }, + { + "epoch": 0.42, + "grad_norm": 1.682575206260421, + "learning_rate": 6.488992438441055e-06, + "loss": 0.2941, + "step": 6705 + }, + { + "epoch": 0.42, + "grad_norm": 1.6084755212680026, + "learning_rate": 6.488020140590758e-06, + "loss": 0.333, + "step": 6706 + }, + { + "epoch": 0.42, + "grad_norm": 1.414128282407443, + "learning_rate": 6.48704778099986e-06, + "loss": 0.3224, + "step": 6707 + }, + { + "epoch": 0.42, + "grad_norm": 2.3166081613666436, + "learning_rate": 6.486075359708709e-06, + "loss": 0.319, + "step": 6708 + }, + { + "epoch": 0.42, + "grad_norm": 1.6840612137367519, + "learning_rate": 6.485102876757652e-06, + "loss": 0.3168, + "step": 6709 + }, + { + "epoch": 0.42, + "grad_norm": 1.9088392499215119, + "learning_rate": 6.48413033218704e-06, + "loss": 0.2967, + "step": 6710 + }, + { + "epoch": 0.42, + "grad_norm": 1.4688670212016326, + "learning_rate": 6.483157726037222e-06, + "loss": 0.2856, + "step": 6711 + }, + { + "epoch": 0.42, + "grad_norm": 1.407240901376062, + "learning_rate": 6.482185058348556e-06, + "loss": 0.3038, + "step": 6712 + }, + { + "epoch": 0.42, + "grad_norm": 1.7259059175476872, + "learning_rate": 6.4812123291614005e-06, + "loss": 0.2901, + "step": 6713 + }, + { + "epoch": 0.42, + "grad_norm": 2.816169264353281, + "learning_rate": 6.480239538516114e-06, + "loss": 0.2976, + "step": 6714 + }, + { + "epoch": 0.42, + "grad_norm": 1.8517276184680693, + "learning_rate": 6.47926668645306e-06, + "loss": 0.2997, + "step": 6715 + }, + { + "epoch": 0.42, + "grad_norm": 2.155593367474794, + "learning_rate": 6.478293773012603e-06, + "loss": 0.305, + "step": 6716 + }, + { + "epoch": 0.42, + "grad_norm": 1.7211834035333056, + "learning_rate": 6.477320798235112e-06, + "loss": 0.2883, + "step": 6717 + }, + { + "epoch": 0.42, + "grad_norm": 2.5938584569789214, + "learning_rate": 6.476347762160957e-06, + "loss": 0.3108, + "step": 6718 + }, + { + "epoch": 0.42, + "grad_norm": 2.8052742745153534, + "learning_rate": 6.475374664830512e-06, + "loss": 0.2925, + "step": 6719 + }, + { + "epoch": 0.42, + "grad_norm": 3.2814627037738258, + "learning_rate": 6.47440150628415e-06, + "loss": 0.297, + "step": 6720 + }, + { + "epoch": 0.42, + "grad_norm": 2.5331285415764486, + "learning_rate": 6.473428286562251e-06, + "loss": 0.2936, + "step": 6721 + }, + { + "epoch": 0.42, + "grad_norm": 1.916705578993107, + "learning_rate": 6.472455005705197e-06, + "loss": 0.2896, + "step": 6722 + }, + { + "epoch": 0.42, + "grad_norm": 15.634617436509288, + "learning_rate": 6.471481663753367e-06, + "loss": 0.288, + "step": 6723 + }, + { + "epoch": 0.42, + "grad_norm": 3.1405479001930976, + "learning_rate": 6.47050826074715e-06, + "loss": 0.3104, + "step": 6724 + }, + { + "epoch": 0.42, + "grad_norm": 1.829690751521165, + "learning_rate": 6.469534796726934e-06, + "loss": 0.2899, + "step": 6725 + }, + { + "epoch": 0.42, + "grad_norm": 4.493429523614867, + "learning_rate": 6.4685612717331096e-06, + "loss": 0.2882, + "step": 6726 + }, + { + "epoch": 0.42, + "grad_norm": 1.476047469509268, + "learning_rate": 6.467587685806067e-06, + "loss": 0.2847, + "step": 6727 + }, + { + "epoch": 0.42, + "grad_norm": 2.330666995233044, + "learning_rate": 6.466614038986208e-06, + "loss": 0.2914, + "step": 6728 + }, + { + "epoch": 0.42, + "grad_norm": 3.179439580181806, + "learning_rate": 6.465640331313925e-06, + "loss": 0.3025, + "step": 6729 + }, + { + "epoch": 0.42, + "grad_norm": 1.7500517706162204, + "learning_rate": 6.464666562829624e-06, + "loss": 0.2953, + "step": 6730 + }, + { + "epoch": 0.42, + "grad_norm": 7.3990291736926075, + "learning_rate": 6.4636927335737025e-06, + "loss": 0.2996, + "step": 6731 + }, + { + "epoch": 0.42, + "grad_norm": 2.036953456502084, + "learning_rate": 6.462718843586572e-06, + "loss": 0.3033, + "step": 6732 + }, + { + "epoch": 0.42, + "grad_norm": 0.6268336755904357, + "learning_rate": 6.461744892908637e-06, + "loss": 0.4836, + "step": 6733 + }, + { + "epoch": 0.42, + "grad_norm": 1.7624383530294316, + "learning_rate": 6.460770881580311e-06, + "loss": 0.2899, + "step": 6734 + }, + { + "epoch": 0.42, + "grad_norm": 2.6027756428985964, + "learning_rate": 6.4597968096420045e-06, + "loss": 0.2991, + "step": 6735 + }, + { + "epoch": 0.42, + "grad_norm": 2.1245425071428894, + "learning_rate": 6.4588226771341386e-06, + "loss": 0.3101, + "step": 6736 + }, + { + "epoch": 0.42, + "grad_norm": 1.5774516435729113, + "learning_rate": 6.457848484097128e-06, + "loss": 0.2938, + "step": 6737 + }, + { + "epoch": 0.42, + "grad_norm": 2.307916791625838, + "learning_rate": 6.456874230571393e-06, + "loss": 0.3357, + "step": 6738 + }, + { + "epoch": 0.42, + "grad_norm": 1.5779917268889785, + "learning_rate": 6.455899916597359e-06, + "loss": 0.2867, + "step": 6739 + }, + { + "epoch": 0.42, + "grad_norm": 1.688366142488379, + "learning_rate": 6.4549255422154525e-06, + "loss": 0.2887, + "step": 6740 + }, + { + "epoch": 0.42, + "grad_norm": 1.9617952338691722, + "learning_rate": 6.4539511074660995e-06, + "loss": 0.3096, + "step": 6741 + }, + { + "epoch": 0.42, + "grad_norm": 3.260581858861846, + "learning_rate": 6.452976612389733e-06, + "loss": 0.3243, + "step": 6742 + }, + { + "epoch": 0.42, + "grad_norm": 2.004827513471801, + "learning_rate": 6.452002057026786e-06, + "loss": 0.2999, + "step": 6743 + }, + { + "epoch": 0.42, + "grad_norm": 1.6405049139436518, + "learning_rate": 6.451027441417696e-06, + "loss": 0.2742, + "step": 6744 + }, + { + "epoch": 0.42, + "grad_norm": 2.6131278877084996, + "learning_rate": 6.4500527656028985e-06, + "loss": 0.2981, + "step": 6745 + }, + { + "epoch": 0.42, + "grad_norm": 1.9106438014988696, + "learning_rate": 6.449078029622837e-06, + "loss": 0.2896, + "step": 6746 + }, + { + "epoch": 0.42, + "grad_norm": 1.8401447823120518, + "learning_rate": 6.448103233517954e-06, + "loss": 0.3143, + "step": 6747 + }, + { + "epoch": 0.42, + "grad_norm": 1.879439296180402, + "learning_rate": 6.447128377328695e-06, + "loss": 0.2922, + "step": 6748 + }, + { + "epoch": 0.42, + "grad_norm": 2.279784983777409, + "learning_rate": 6.4461534610955104e-06, + "loss": 0.2867, + "step": 6749 + }, + { + "epoch": 0.42, + "grad_norm": 2.746662266839023, + "learning_rate": 6.445178484858849e-06, + "loss": 0.2913, + "step": 6750 + }, + { + "epoch": 0.42, + "grad_norm": 2.671399715811076, + "learning_rate": 6.444203448659165e-06, + "loss": 0.3099, + "step": 6751 + }, + { + "epoch": 0.42, + "grad_norm": 2.0312387518092354, + "learning_rate": 6.443228352536918e-06, + "loss": 0.2873, + "step": 6752 + }, + { + "epoch": 0.42, + "grad_norm": 1.9592013263028545, + "learning_rate": 6.4422531965325594e-06, + "loss": 0.2914, + "step": 6753 + }, + { + "epoch": 0.42, + "grad_norm": 2.6378974765143877, + "learning_rate": 6.441277980686556e-06, + "loss": 0.3196, + "step": 6754 + }, + { + "epoch": 0.42, + "grad_norm": 19.90402098774003, + "learning_rate": 6.4403027050393675e-06, + "loss": 0.3007, + "step": 6755 + }, + { + "epoch": 0.42, + "grad_norm": 2.739745636095288, + "learning_rate": 6.439327369631464e-06, + "loss": 0.2911, + "step": 6756 + }, + { + "epoch": 0.42, + "grad_norm": 1.7499854498849223, + "learning_rate": 6.438351974503309e-06, + "loss": 0.3105, + "step": 6757 + }, + { + "epoch": 0.43, + "grad_norm": 2.36591451892812, + "learning_rate": 6.437376519695376e-06, + "loss": 0.3302, + "step": 6758 + }, + { + "epoch": 0.43, + "grad_norm": 2.7624095080741506, + "learning_rate": 6.436401005248139e-06, + "loss": 0.2895, + "step": 6759 + }, + { + "epoch": 0.43, + "grad_norm": 3.714746600818347, + "learning_rate": 6.435425431202074e-06, + "loss": 0.2996, + "step": 6760 + }, + { + "epoch": 0.43, + "grad_norm": 2.379943254084911, + "learning_rate": 6.434449797597657e-06, + "loss": 0.2836, + "step": 6761 + }, + { + "epoch": 0.43, + "grad_norm": 0.6241433111758001, + "learning_rate": 6.433474104475369e-06, + "loss": 0.4965, + "step": 6762 + }, + { + "epoch": 0.43, + "grad_norm": 1.5604552983852458, + "learning_rate": 6.432498351875696e-06, + "loss": 0.2715, + "step": 6763 + }, + { + "epoch": 0.43, + "grad_norm": 4.704246886300958, + "learning_rate": 6.43152253983912e-06, + "loss": 0.2903, + "step": 6764 + }, + { + "epoch": 0.43, + "grad_norm": 1.7847322408734418, + "learning_rate": 6.430546668406133e-06, + "loss": 0.2985, + "step": 6765 + }, + { + "epoch": 0.43, + "grad_norm": 3.539045795134168, + "learning_rate": 6.429570737617223e-06, + "loss": 0.2816, + "step": 6766 + }, + { + "epoch": 0.43, + "grad_norm": 2.8569052860052397, + "learning_rate": 6.428594747512884e-06, + "loss": 0.2994, + "step": 6767 + }, + { + "epoch": 0.43, + "grad_norm": 1.9623127045136122, + "learning_rate": 6.427618698133612e-06, + "loss": 0.2873, + "step": 6768 + }, + { + "epoch": 0.43, + "grad_norm": 3.505181360047184, + "learning_rate": 6.4266425895199036e-06, + "loss": 0.2754, + "step": 6769 + }, + { + "epoch": 0.43, + "grad_norm": 8.008662784013174, + "learning_rate": 6.425666421712263e-06, + "loss": 0.3048, + "step": 6770 + }, + { + "epoch": 0.43, + "grad_norm": 1.7449333988077806, + "learning_rate": 6.424690194751186e-06, + "loss": 0.3042, + "step": 6771 + }, + { + "epoch": 0.43, + "grad_norm": 5.11664783106136, + "learning_rate": 6.423713908677185e-06, + "loss": 0.309, + "step": 6772 + }, + { + "epoch": 0.43, + "grad_norm": 2.3902479893888047, + "learning_rate": 6.422737563530763e-06, + "loss": 0.3006, + "step": 6773 + }, + { + "epoch": 0.43, + "grad_norm": 1.6467051612233414, + "learning_rate": 6.4217611593524355e-06, + "loss": 0.2875, + "step": 6774 + }, + { + "epoch": 0.43, + "grad_norm": 2.8336430361345313, + "learning_rate": 6.420784696182709e-06, + "loss": 0.3061, + "step": 6775 + }, + { + "epoch": 0.43, + "grad_norm": 3.7093595735481975, + "learning_rate": 6.419808174062103e-06, + "loss": 0.2974, + "step": 6776 + }, + { + "epoch": 0.43, + "grad_norm": 2.0017537665334055, + "learning_rate": 6.418831593031134e-06, + "loss": 0.3129, + "step": 6777 + }, + { + "epoch": 0.43, + "grad_norm": 3.0562628003301255, + "learning_rate": 6.417854953130323e-06, + "loss": 0.2976, + "step": 6778 + }, + { + "epoch": 0.43, + "grad_norm": 8.636640367023912, + "learning_rate": 6.416878254400191e-06, + "loss": 0.2958, + "step": 6779 + }, + { + "epoch": 0.43, + "grad_norm": 1.6510810730970376, + "learning_rate": 6.4159014968812634e-06, + "loss": 0.2977, + "step": 6780 + }, + { + "epoch": 0.43, + "grad_norm": 1.8031167414175482, + "learning_rate": 6.4149246806140675e-06, + "loss": 0.3017, + "step": 6781 + }, + { + "epoch": 0.43, + "grad_norm": 1.480695841073619, + "learning_rate": 6.413947805639136e-06, + "loss": 0.2872, + "step": 6782 + }, + { + "epoch": 0.43, + "grad_norm": 2.5016659168010187, + "learning_rate": 6.412970871996995e-06, + "loss": 0.2858, + "step": 6783 + }, + { + "epoch": 0.43, + "grad_norm": 1.9446304107640044, + "learning_rate": 6.411993879728184e-06, + "loss": 0.2937, + "step": 6784 + }, + { + "epoch": 0.43, + "grad_norm": 2.314690855240622, + "learning_rate": 6.411016828873239e-06, + "loss": 0.2796, + "step": 6785 + }, + { + "epoch": 0.43, + "grad_norm": 0.5776542150995976, + "learning_rate": 6.4100397194727005e-06, + "loss": 0.4739, + "step": 6786 + }, + { + "epoch": 0.43, + "grad_norm": 10.40466950989219, + "learning_rate": 6.409062551567109e-06, + "loss": 0.3025, + "step": 6787 + }, + { + "epoch": 0.43, + "grad_norm": 3.6651469506478778, + "learning_rate": 6.4080853251970086e-06, + "loss": 0.2947, + "step": 6788 + }, + { + "epoch": 0.43, + "grad_norm": 2.1833000065185026, + "learning_rate": 6.4071080404029475e-06, + "loss": 0.2975, + "step": 6789 + }, + { + "epoch": 0.43, + "grad_norm": 3.8783876707876814, + "learning_rate": 6.4061306972254745e-06, + "loss": 0.3256, + "step": 6790 + }, + { + "epoch": 0.43, + "grad_norm": 1.6929911454243527, + "learning_rate": 6.405153295705142e-06, + "loss": 0.3157, + "step": 6791 + }, + { + "epoch": 0.43, + "grad_norm": 2.0992830830375038, + "learning_rate": 6.404175835882503e-06, + "loss": 0.3031, + "step": 6792 + }, + { + "epoch": 0.43, + "grad_norm": 1.6012424203390276, + "learning_rate": 6.403198317798113e-06, + "loss": 0.2832, + "step": 6793 + }, + { + "epoch": 0.43, + "grad_norm": 2.1412361975886807, + "learning_rate": 6.402220741492533e-06, + "loss": 0.3084, + "step": 6794 + }, + { + "epoch": 0.43, + "grad_norm": 1.8586675520090896, + "learning_rate": 6.401243107006325e-06, + "loss": 0.2929, + "step": 6795 + }, + { + "epoch": 0.43, + "grad_norm": 1.5001244380557788, + "learning_rate": 6.4002654143800515e-06, + "loss": 0.2797, + "step": 6796 + }, + { + "epoch": 0.43, + "grad_norm": 3.8021882489270977, + "learning_rate": 6.399287663654279e-06, + "loss": 0.3144, + "step": 6797 + }, + { + "epoch": 0.43, + "grad_norm": 1.561296588065728, + "learning_rate": 6.398309854869574e-06, + "loss": 0.2905, + "step": 6798 + }, + { + "epoch": 0.43, + "grad_norm": 2.6822116196263686, + "learning_rate": 6.397331988066512e-06, + "loss": 0.3022, + "step": 6799 + }, + { + "epoch": 0.43, + "grad_norm": 2.59133152600341, + "learning_rate": 6.396354063285662e-06, + "loss": 0.321, + "step": 6800 + }, + { + "epoch": 0.43, + "grad_norm": 3.7238356491332714, + "learning_rate": 6.395376080567602e-06, + "loss": 0.2834, + "step": 6801 + }, + { + "epoch": 0.43, + "grad_norm": 2.8237679990774085, + "learning_rate": 6.394398039952911e-06, + "loss": 0.2856, + "step": 6802 + }, + { + "epoch": 0.43, + "grad_norm": 3.0978506014726537, + "learning_rate": 6.3934199414821674e-06, + "loss": 0.2922, + "step": 6803 + }, + { + "epoch": 0.43, + "grad_norm": 2.6291710724231745, + "learning_rate": 6.392441785195956e-06, + "loss": 0.2884, + "step": 6804 + }, + { + "epoch": 0.43, + "grad_norm": 2.6154631905494288, + "learning_rate": 6.391463571134862e-06, + "loss": 0.3206, + "step": 6805 + }, + { + "epoch": 0.43, + "grad_norm": 2.8472115736003967, + "learning_rate": 6.390485299339473e-06, + "loss": 0.3124, + "step": 6806 + }, + { + "epoch": 0.43, + "grad_norm": 2.049227778719141, + "learning_rate": 6.389506969850378e-06, + "loss": 0.3071, + "step": 6807 + }, + { + "epoch": 0.43, + "grad_norm": 6.852184541794767, + "learning_rate": 6.3885285827081725e-06, + "loss": 0.2909, + "step": 6808 + }, + { + "epoch": 0.43, + "grad_norm": 4.210604253205481, + "learning_rate": 6.3875501379534486e-06, + "loss": 0.2947, + "step": 6809 + }, + { + "epoch": 0.43, + "grad_norm": 4.195088755565158, + "learning_rate": 6.386571635626804e-06, + "loss": 0.3426, + "step": 6810 + }, + { + "epoch": 0.43, + "grad_norm": 2.254386928581936, + "learning_rate": 6.3855930757688415e-06, + "loss": 0.2929, + "step": 6811 + }, + { + "epoch": 0.43, + "grad_norm": 8.427413066891106, + "learning_rate": 6.38461445842016e-06, + "loss": 0.3237, + "step": 6812 + }, + { + "epoch": 0.43, + "grad_norm": 1.479301294544089, + "learning_rate": 6.383635783621365e-06, + "loss": 0.3242, + "step": 6813 + }, + { + "epoch": 0.43, + "grad_norm": 4.4322326984415525, + "learning_rate": 6.382657051413063e-06, + "loss": 0.3136, + "step": 6814 + }, + { + "epoch": 0.43, + "grad_norm": 2.673907078170811, + "learning_rate": 6.3816782618358666e-06, + "loss": 0.2763, + "step": 6815 + }, + { + "epoch": 0.43, + "grad_norm": 1.5224796805783334, + "learning_rate": 6.380699414930385e-06, + "loss": 0.2943, + "step": 6816 + }, + { + "epoch": 0.43, + "grad_norm": 1.9402496872929773, + "learning_rate": 6.37972051073723e-06, + "loss": 0.2899, + "step": 6817 + }, + { + "epoch": 0.43, + "grad_norm": 1.8635711574000435, + "learning_rate": 6.378741549297021e-06, + "loss": 0.2881, + "step": 6818 + }, + { + "epoch": 0.43, + "grad_norm": 1.8063934042696637, + "learning_rate": 6.377762530650375e-06, + "loss": 0.3006, + "step": 6819 + }, + { + "epoch": 0.43, + "grad_norm": 3.094089157008058, + "learning_rate": 6.376783454837916e-06, + "loss": 0.3159, + "step": 6820 + }, + { + "epoch": 0.43, + "grad_norm": 1.7428949729006693, + "learning_rate": 6.375804321900267e-06, + "loss": 0.2843, + "step": 6821 + }, + { + "epoch": 0.43, + "grad_norm": 1.7019580540891321, + "learning_rate": 6.3748251318780514e-06, + "loss": 0.2909, + "step": 6822 + }, + { + "epoch": 0.43, + "grad_norm": 1.6504712380753666, + "learning_rate": 6.3738458848119e-06, + "loss": 0.2845, + "step": 6823 + }, + { + "epoch": 0.43, + "grad_norm": 1.885256138973961, + "learning_rate": 6.372866580742442e-06, + "loss": 0.2956, + "step": 6824 + }, + { + "epoch": 0.43, + "grad_norm": 2.5142776791261525, + "learning_rate": 6.371887219710312e-06, + "loss": 0.2929, + "step": 6825 + }, + { + "epoch": 0.43, + "grad_norm": 1.5168769328775624, + "learning_rate": 6.370907801756143e-06, + "loss": 0.2959, + "step": 6826 + }, + { + "epoch": 0.43, + "grad_norm": 1.6316484169195389, + "learning_rate": 6.369928326920575e-06, + "loss": 0.2942, + "step": 6827 + }, + { + "epoch": 0.43, + "grad_norm": 1.365074744515247, + "learning_rate": 6.368948795244247e-06, + "loss": 0.2868, + "step": 6828 + }, + { + "epoch": 0.43, + "grad_norm": 3.8919705280839447, + "learning_rate": 6.367969206767803e-06, + "loss": 0.3115, + "step": 6829 + }, + { + "epoch": 0.43, + "grad_norm": 1.8577146692649886, + "learning_rate": 6.366989561531887e-06, + "loss": 0.28, + "step": 6830 + }, + { + "epoch": 0.43, + "grad_norm": 1.27404396869779, + "learning_rate": 6.3660098595771445e-06, + "loss": 0.3005, + "step": 6831 + }, + { + "epoch": 0.43, + "grad_norm": 1.7231158169295044, + "learning_rate": 6.365030100944227e-06, + "loss": 0.2921, + "step": 6832 + }, + { + "epoch": 0.43, + "grad_norm": 2.8452785989556784, + "learning_rate": 6.364050285673788e-06, + "loss": 0.3021, + "step": 6833 + }, + { + "epoch": 0.43, + "grad_norm": 2.034796840341878, + "learning_rate": 6.363070413806478e-06, + "loss": 0.3077, + "step": 6834 + }, + { + "epoch": 0.43, + "grad_norm": 2.0652641545771186, + "learning_rate": 6.362090485382956e-06, + "loss": 0.29, + "step": 6835 + }, + { + "epoch": 0.43, + "grad_norm": 1.953695334345325, + "learning_rate": 6.361110500443879e-06, + "loss": 0.2861, + "step": 6836 + }, + { + "epoch": 0.43, + "grad_norm": 4.583631036849745, + "learning_rate": 6.360130459029912e-06, + "loss": 0.3059, + "step": 6837 + }, + { + "epoch": 0.43, + "grad_norm": 1.4901758759164343, + "learning_rate": 6.3591503611817155e-06, + "loss": 0.3115, + "step": 6838 + }, + { + "epoch": 0.43, + "grad_norm": 2.829612896761817, + "learning_rate": 6.358170206939955e-06, + "loss": 0.3155, + "step": 6839 + }, + { + "epoch": 0.43, + "grad_norm": 1.8677532522239744, + "learning_rate": 6.357189996345302e-06, + "loss": 0.2933, + "step": 6840 + }, + { + "epoch": 0.43, + "grad_norm": 4.431188282024258, + "learning_rate": 6.356209729438425e-06, + "loss": 0.3105, + "step": 6841 + }, + { + "epoch": 0.43, + "grad_norm": 3.2546802077530046, + "learning_rate": 6.3552294062599975e-06, + "loss": 0.3002, + "step": 6842 + }, + { + "epoch": 0.43, + "grad_norm": 2.2200722143856417, + "learning_rate": 6.354249026850694e-06, + "loss": 0.3055, + "step": 6843 + }, + { + "epoch": 0.43, + "grad_norm": 2.5011573402567864, + "learning_rate": 6.3532685912511934e-06, + "loss": 0.2855, + "step": 6844 + }, + { + "epoch": 0.43, + "grad_norm": 2.2676207427517316, + "learning_rate": 6.352288099502175e-06, + "loss": 0.299, + "step": 6845 + }, + { + "epoch": 0.43, + "grad_norm": 2.963298980139748, + "learning_rate": 6.351307551644322e-06, + "loss": 0.3154, + "step": 6846 + }, + { + "epoch": 0.43, + "grad_norm": 2.4566492371647595, + "learning_rate": 6.350326947718319e-06, + "loss": 0.3119, + "step": 6847 + }, + { + "epoch": 0.43, + "grad_norm": 1.859494361893289, + "learning_rate": 6.3493462877648515e-06, + "loss": 0.3009, + "step": 6848 + }, + { + "epoch": 0.43, + "grad_norm": 8.36861430609139, + "learning_rate": 6.348365571824611e-06, + "loss": 0.2866, + "step": 6849 + }, + { + "epoch": 0.43, + "grad_norm": 1.8902335631050846, + "learning_rate": 6.3473847999382855e-06, + "loss": 0.3038, + "step": 6850 + }, + { + "epoch": 0.43, + "grad_norm": 2.148905293926362, + "learning_rate": 6.346403972146574e-06, + "loss": 0.3024, + "step": 6851 + }, + { + "epoch": 0.43, + "grad_norm": 0.6120394598504009, + "learning_rate": 6.345423088490169e-06, + "loss": 0.5123, + "step": 6852 + }, + { + "epoch": 0.43, + "grad_norm": 2.936424933189742, + "learning_rate": 6.344442149009771e-06, + "loss": 0.2978, + "step": 6853 + }, + { + "epoch": 0.43, + "grad_norm": 1.4006229347917698, + "learning_rate": 6.343461153746079e-06, + "loss": 0.2899, + "step": 6854 + }, + { + "epoch": 0.43, + "grad_norm": 3.043274006634977, + "learning_rate": 6.3424801027397984e-06, + "loss": 0.3094, + "step": 6855 + }, + { + "epoch": 0.43, + "grad_norm": 1.3576158863718646, + "learning_rate": 6.3414989960316345e-06, + "loss": 0.2981, + "step": 6856 + }, + { + "epoch": 0.43, + "grad_norm": 7.41163007617088, + "learning_rate": 6.340517833662293e-06, + "loss": 0.2937, + "step": 6857 + }, + { + "epoch": 0.43, + "grad_norm": 1.530216208151628, + "learning_rate": 6.339536615672486e-06, + "loss": 0.2888, + "step": 6858 + }, + { + "epoch": 0.43, + "grad_norm": 2.315749199744174, + "learning_rate": 6.338555342102927e-06, + "loss": 0.2903, + "step": 6859 + }, + { + "epoch": 0.43, + "grad_norm": 3.2305350401404063, + "learning_rate": 6.337574012994327e-06, + "loss": 0.3106, + "step": 6860 + }, + { + "epoch": 0.43, + "grad_norm": 2.6948606536470603, + "learning_rate": 6.336592628387407e-06, + "loss": 0.3062, + "step": 6861 + }, + { + "epoch": 0.43, + "grad_norm": 2.6648834760085593, + "learning_rate": 6.335611188322883e-06, + "loss": 0.3025, + "step": 6862 + }, + { + "epoch": 0.43, + "grad_norm": 2.526308257063972, + "learning_rate": 6.334629692841481e-06, + "loss": 0.2867, + "step": 6863 + }, + { + "epoch": 0.43, + "grad_norm": 2.4595612981056614, + "learning_rate": 6.333648141983921e-06, + "loss": 0.3038, + "step": 6864 + }, + { + "epoch": 0.43, + "grad_norm": 2.71043145840467, + "learning_rate": 6.33266653579093e-06, + "loss": 0.2847, + "step": 6865 + }, + { + "epoch": 0.43, + "grad_norm": 1.5690542517115196, + "learning_rate": 6.3316848743032385e-06, + "loss": 0.3124, + "step": 6866 + }, + { + "epoch": 0.43, + "grad_norm": 2.705659968142546, + "learning_rate": 6.3307031575615775e-06, + "loss": 0.292, + "step": 6867 + }, + { + "epoch": 0.43, + "grad_norm": 1.960419911846101, + "learning_rate": 6.329721385606676e-06, + "loss": 0.2902, + "step": 6868 + }, + { + "epoch": 0.43, + "grad_norm": 1.4939959391311788, + "learning_rate": 6.328739558479275e-06, + "loss": 0.3122, + "step": 6869 + }, + { + "epoch": 0.43, + "grad_norm": 1.9057049387463765, + "learning_rate": 6.3277576762201074e-06, + "loss": 0.332, + "step": 6870 + }, + { + "epoch": 0.43, + "grad_norm": 2.010189182878927, + "learning_rate": 6.326775738869917e-06, + "loss": 0.2978, + "step": 6871 + }, + { + "epoch": 0.43, + "grad_norm": 3.6184330075214888, + "learning_rate": 6.325793746469443e-06, + "loss": 0.3035, + "step": 6872 + }, + { + "epoch": 0.43, + "grad_norm": 1.5469203734541837, + "learning_rate": 6.324811699059433e-06, + "loss": 0.2852, + "step": 6873 + }, + { + "epoch": 0.43, + "grad_norm": 1.6770697859282753, + "learning_rate": 6.323829596680633e-06, + "loss": 0.2777, + "step": 6874 + }, + { + "epoch": 0.43, + "grad_norm": 1.5636152530996292, + "learning_rate": 6.322847439373792e-06, + "loss": 0.2983, + "step": 6875 + }, + { + "epoch": 0.43, + "grad_norm": 2.6705672976398467, + "learning_rate": 6.321865227179658e-06, + "loss": 0.2815, + "step": 6876 + }, + { + "epoch": 0.43, + "grad_norm": 1.5980499810557056, + "learning_rate": 6.3208829601389896e-06, + "loss": 0.2957, + "step": 6877 + }, + { + "epoch": 0.43, + "grad_norm": 2.736204496034844, + "learning_rate": 6.319900638292541e-06, + "loss": 0.2895, + "step": 6878 + }, + { + "epoch": 0.43, + "grad_norm": 1.7046260857418942, + "learning_rate": 6.318918261681072e-06, + "loss": 0.2889, + "step": 6879 + }, + { + "epoch": 0.43, + "grad_norm": 1.848262816668194, + "learning_rate": 6.3179358303453386e-06, + "loss": 0.2919, + "step": 6880 + }, + { + "epoch": 0.43, + "grad_norm": 28.675346970086203, + "learning_rate": 6.3169533443261085e-06, + "loss": 0.3188, + "step": 6881 + }, + { + "epoch": 0.43, + "grad_norm": 1.5106934754378571, + "learning_rate": 6.315970803664145e-06, + "loss": 0.2992, + "step": 6882 + }, + { + "epoch": 0.43, + "grad_norm": 1.4680564361738777, + "learning_rate": 6.314988208400215e-06, + "loss": 0.2846, + "step": 6883 + }, + { + "epoch": 0.43, + "grad_norm": 3.740275830738221, + "learning_rate": 6.314005558575089e-06, + "loss": 0.3061, + "step": 6884 + }, + { + "epoch": 0.43, + "grad_norm": 2.59199399670399, + "learning_rate": 6.313022854229539e-06, + "loss": 0.2955, + "step": 6885 + }, + { + "epoch": 0.43, + "grad_norm": 0.6104358404222554, + "learning_rate": 6.312040095404337e-06, + "loss": 0.4985, + "step": 6886 + }, + { + "epoch": 0.43, + "grad_norm": 2.0916533426754236, + "learning_rate": 6.311057282140261e-06, + "loss": 0.2856, + "step": 6887 + }, + { + "epoch": 0.43, + "grad_norm": 1.7000841777319762, + "learning_rate": 6.310074414478091e-06, + "loss": 0.3012, + "step": 6888 + }, + { + "epoch": 0.43, + "grad_norm": 1.6799428434908734, + "learning_rate": 6.309091492458608e-06, + "loss": 0.2927, + "step": 6889 + }, + { + "epoch": 0.43, + "grad_norm": 5.172910443727807, + "learning_rate": 6.308108516122591e-06, + "loss": 0.3175, + "step": 6890 + }, + { + "epoch": 0.43, + "grad_norm": 2.277465820846939, + "learning_rate": 6.307125485510829e-06, + "loss": 0.2921, + "step": 6891 + }, + { + "epoch": 0.43, + "grad_norm": 2.569782192125306, + "learning_rate": 6.306142400664108e-06, + "loss": 0.2996, + "step": 6892 + }, + { + "epoch": 0.43, + "grad_norm": 2.142878579832889, + "learning_rate": 6.30515926162322e-06, + "loss": 0.2881, + "step": 6893 + }, + { + "epoch": 0.43, + "grad_norm": 1.5730768397650798, + "learning_rate": 6.304176068428957e-06, + "loss": 0.2816, + "step": 6894 + }, + { + "epoch": 0.43, + "grad_norm": 2.7981238689247325, + "learning_rate": 6.30319282112211e-06, + "loss": 0.2968, + "step": 6895 + }, + { + "epoch": 0.43, + "grad_norm": 1.5272437486330543, + "learning_rate": 6.30220951974348e-06, + "loss": 0.2975, + "step": 6896 + }, + { + "epoch": 0.43, + "grad_norm": 3.5952759735211863, + "learning_rate": 6.3012261643338635e-06, + "loss": 0.2968, + "step": 6897 + }, + { + "epoch": 0.43, + "grad_norm": 1.7209101307714316, + "learning_rate": 6.3002427549340615e-06, + "loss": 0.285, + "step": 6898 + }, + { + "epoch": 0.43, + "grad_norm": 1.5355351233576975, + "learning_rate": 6.299259291584879e-06, + "loss": 0.3108, + "step": 6899 + }, + { + "epoch": 0.43, + "grad_norm": 2.328437310208531, + "learning_rate": 6.298275774327121e-06, + "loss": 0.3472, + "step": 6900 + }, + { + "epoch": 0.43, + "grad_norm": 2.6083503246623474, + "learning_rate": 6.297292203201595e-06, + "loss": 0.2988, + "step": 6901 + }, + { + "epoch": 0.43, + "grad_norm": 2.9392954330231182, + "learning_rate": 6.296308578249109e-06, + "loss": 0.3219, + "step": 6902 + }, + { + "epoch": 0.43, + "grad_norm": 0.639787763238278, + "learning_rate": 6.29532489951048e-06, + "loss": 0.5236, + "step": 6903 + }, + { + "epoch": 0.43, + "grad_norm": 5.932409822949063, + "learning_rate": 6.29434116702652e-06, + "loss": 0.3096, + "step": 6904 + }, + { + "epoch": 0.43, + "grad_norm": 4.038587484729636, + "learning_rate": 6.293357380838046e-06, + "loss": 0.3124, + "step": 6905 + }, + { + "epoch": 0.43, + "grad_norm": 0.6437779728204444, + "learning_rate": 6.292373540985875e-06, + "loss": 0.544, + "step": 6906 + }, + { + "epoch": 0.43, + "grad_norm": 1.4183458562866678, + "learning_rate": 6.291389647510833e-06, + "loss": 0.2946, + "step": 6907 + }, + { + "epoch": 0.43, + "grad_norm": 3.5418600018117234, + "learning_rate": 6.2904057004537385e-06, + "loss": 0.3101, + "step": 6908 + }, + { + "epoch": 0.43, + "grad_norm": 1.7109504249161431, + "learning_rate": 6.2894216998554215e-06, + "loss": 0.2858, + "step": 6909 + }, + { + "epoch": 0.43, + "grad_norm": 2.8419552482693984, + "learning_rate": 6.288437645756706e-06, + "loss": 0.2864, + "step": 6910 + }, + { + "epoch": 0.43, + "grad_norm": 1.85911169554198, + "learning_rate": 6.287453538198426e-06, + "loss": 0.311, + "step": 6911 + }, + { + "epoch": 0.43, + "grad_norm": 2.5054704968118577, + "learning_rate": 6.28646937722141e-06, + "loss": 0.2945, + "step": 6912 + }, + { + "epoch": 0.43, + "grad_norm": 3.157171929352471, + "learning_rate": 6.285485162866496e-06, + "loss": 0.2975, + "step": 6913 + }, + { + "epoch": 0.43, + "grad_norm": 3.093106304030806, + "learning_rate": 6.284500895174518e-06, + "loss": 0.2969, + "step": 6914 + }, + { + "epoch": 0.43, + "grad_norm": 2.951128097908572, + "learning_rate": 6.283516574186318e-06, + "loss": 0.3002, + "step": 6915 + }, + { + "epoch": 0.43, + "grad_norm": 2.7873416091672234, + "learning_rate": 6.282532199942735e-06, + "loss": 0.308, + "step": 6916 + }, + { + "epoch": 0.44, + "grad_norm": 3.5250801150628495, + "learning_rate": 6.281547772484612e-06, + "loss": 0.3029, + "step": 6917 + }, + { + "epoch": 0.44, + "grad_norm": 2.3239861226448197, + "learning_rate": 6.280563291852796e-06, + "loss": 0.2981, + "step": 6918 + }, + { + "epoch": 0.44, + "grad_norm": 1.896650528267158, + "learning_rate": 6.279578758088135e-06, + "loss": 0.2865, + "step": 6919 + }, + { + "epoch": 0.44, + "grad_norm": 6.046720401907298, + "learning_rate": 6.278594171231478e-06, + "loss": 0.2971, + "step": 6920 + }, + { + "epoch": 0.44, + "grad_norm": 4.107336850805136, + "learning_rate": 6.277609531323678e-06, + "loss": 0.3215, + "step": 6921 + }, + { + "epoch": 0.44, + "grad_norm": 2.557701835265859, + "learning_rate": 6.27662483840559e-06, + "loss": 0.3184, + "step": 6922 + }, + { + "epoch": 0.44, + "grad_norm": 1.8742361321090764, + "learning_rate": 6.27564009251807e-06, + "loss": 0.2952, + "step": 6923 + }, + { + "epoch": 0.44, + "grad_norm": 2.2743838864681423, + "learning_rate": 6.274655293701974e-06, + "loss": 0.285, + "step": 6924 + }, + { + "epoch": 0.44, + "grad_norm": 1.9320725595818318, + "learning_rate": 6.273670441998169e-06, + "loss": 0.2882, + "step": 6925 + }, + { + "epoch": 0.44, + "grad_norm": 1.645404943097612, + "learning_rate": 6.272685537447513e-06, + "loss": 0.2664, + "step": 6926 + }, + { + "epoch": 0.44, + "grad_norm": 2.2758878763486856, + "learning_rate": 6.271700580090876e-06, + "loss": 0.311, + "step": 6927 + }, + { + "epoch": 0.44, + "grad_norm": 1.9482575278035108, + "learning_rate": 6.270715569969119e-06, + "loss": 0.3052, + "step": 6928 + }, + { + "epoch": 0.44, + "grad_norm": 2.112387212977741, + "learning_rate": 6.26973050712312e-06, + "loss": 0.3033, + "step": 6929 + }, + { + "epoch": 0.44, + "grad_norm": 2.7241744926684435, + "learning_rate": 6.2687453915937445e-06, + "loss": 0.3023, + "step": 6930 + }, + { + "epoch": 0.44, + "grad_norm": 2.0555602708856138, + "learning_rate": 6.267760223421871e-06, + "loss": 0.2969, + "step": 6931 + }, + { + "epoch": 0.44, + "grad_norm": 0.6248297912514691, + "learning_rate": 6.266775002648373e-06, + "loss": 0.5389, + "step": 6932 + }, + { + "epoch": 0.44, + "grad_norm": 2.037528832629987, + "learning_rate": 6.265789729314131e-06, + "loss": 0.2989, + "step": 6933 + }, + { + "epoch": 0.44, + "grad_norm": 3.6910461415496076, + "learning_rate": 6.264804403460023e-06, + "loss": 0.3171, + "step": 6934 + }, + { + "epoch": 0.44, + "grad_norm": 3.1443824498795303, + "learning_rate": 6.263819025126936e-06, + "loss": 0.2882, + "step": 6935 + }, + { + "epoch": 0.44, + "grad_norm": 2.0070005688932357, + "learning_rate": 6.2628335943557505e-06, + "loss": 0.2792, + "step": 6936 + }, + { + "epoch": 0.44, + "grad_norm": 0.6577178825907559, + "learning_rate": 6.261848111187358e-06, + "loss": 0.5032, + "step": 6937 + }, + { + "epoch": 0.44, + "grad_norm": 2.2880460372140314, + "learning_rate": 6.260862575662645e-06, + "loss": 0.311, + "step": 6938 + }, + { + "epoch": 0.44, + "grad_norm": 3.2988816930377083, + "learning_rate": 6.259876987822506e-06, + "loss": 0.2874, + "step": 6939 + }, + { + "epoch": 0.44, + "grad_norm": 6.303903898880498, + "learning_rate": 6.258891347707829e-06, + "loss": 0.2868, + "step": 6940 + }, + { + "epoch": 0.44, + "grad_norm": 1.4554178697573767, + "learning_rate": 6.257905655359519e-06, + "loss": 0.2776, + "step": 6941 + }, + { + "epoch": 0.44, + "grad_norm": 1.3989185717964019, + "learning_rate": 6.256919910818466e-06, + "loss": 0.3024, + "step": 6942 + }, + { + "epoch": 0.44, + "grad_norm": 2.4713631313250355, + "learning_rate": 6.255934114125574e-06, + "loss": 0.2867, + "step": 6943 + }, + { + "epoch": 0.44, + "grad_norm": 14.477810837095802, + "learning_rate": 6.254948265321744e-06, + "loss": 0.2911, + "step": 6944 + }, + { + "epoch": 0.44, + "grad_norm": 1.9950190859446826, + "learning_rate": 6.253962364447883e-06, + "loss": 0.2826, + "step": 6945 + }, + { + "epoch": 0.44, + "grad_norm": 3.5048232846671787, + "learning_rate": 6.252976411544896e-06, + "loss": 0.2893, + "step": 6946 + }, + { + "epoch": 0.44, + "grad_norm": 2.81103356279454, + "learning_rate": 6.251990406653691e-06, + "loss": 0.3192, + "step": 6947 + }, + { + "epoch": 0.44, + "grad_norm": 4.050149692739653, + "learning_rate": 6.25100434981518e-06, + "loss": 0.3046, + "step": 6948 + }, + { + "epoch": 0.44, + "grad_norm": 2.911724694471854, + "learning_rate": 6.250018241070278e-06, + "loss": 0.2898, + "step": 6949 + }, + { + "epoch": 0.44, + "grad_norm": 1.8625527564946367, + "learning_rate": 6.249032080459898e-06, + "loss": 0.296, + "step": 6950 + }, + { + "epoch": 0.44, + "grad_norm": 12.71810994009719, + "learning_rate": 6.248045868024958e-06, + "loss": 0.2963, + "step": 6951 + }, + { + "epoch": 0.44, + "grad_norm": 1.6184633602532053, + "learning_rate": 6.247059603806379e-06, + "loss": 0.287, + "step": 6952 + }, + { + "epoch": 0.44, + "grad_norm": 2.176729576462319, + "learning_rate": 6.246073287845083e-06, + "loss": 0.2843, + "step": 6953 + }, + { + "epoch": 0.44, + "grad_norm": 3.2516712664615466, + "learning_rate": 6.24508692018199e-06, + "loss": 0.2998, + "step": 6954 + }, + { + "epoch": 0.44, + "grad_norm": 3.0138997086144292, + "learning_rate": 6.2441005008580314e-06, + "loss": 0.3118, + "step": 6955 + }, + { + "epoch": 0.44, + "grad_norm": 1.3472168870737558, + "learning_rate": 6.243114029914133e-06, + "loss": 0.284, + "step": 6956 + }, + { + "epoch": 0.44, + "grad_norm": 4.171887894936276, + "learning_rate": 6.242127507391226e-06, + "loss": 0.2945, + "step": 6957 + }, + { + "epoch": 0.44, + "grad_norm": 1.359229503674128, + "learning_rate": 6.241140933330241e-06, + "loss": 0.2897, + "step": 6958 + }, + { + "epoch": 0.44, + "grad_norm": 1.8023195721854788, + "learning_rate": 6.240154307772115e-06, + "loss": 0.2934, + "step": 6959 + }, + { + "epoch": 0.44, + "grad_norm": 1.7584205824226065, + "learning_rate": 6.2391676307577845e-06, + "loss": 0.2928, + "step": 6960 + }, + { + "epoch": 0.44, + "grad_norm": 2.3600575699249644, + "learning_rate": 6.238180902328188e-06, + "loss": 0.2819, + "step": 6961 + }, + { + "epoch": 0.44, + "grad_norm": 1.928848767948197, + "learning_rate": 6.237194122524264e-06, + "loss": 0.2941, + "step": 6962 + }, + { + "epoch": 0.44, + "grad_norm": 2.866835490502667, + "learning_rate": 6.236207291386962e-06, + "loss": 0.3024, + "step": 6963 + }, + { + "epoch": 0.44, + "grad_norm": 2.4273734466129238, + "learning_rate": 6.235220408957221e-06, + "loss": 0.3057, + "step": 6964 + }, + { + "epoch": 0.44, + "grad_norm": 2.862038667793041, + "learning_rate": 6.234233475275994e-06, + "loss": 0.2852, + "step": 6965 + }, + { + "epoch": 0.44, + "grad_norm": 2.8983752565715197, + "learning_rate": 6.233246490384224e-06, + "loss": 0.2887, + "step": 6966 + }, + { + "epoch": 0.44, + "grad_norm": 1.7021755181041338, + "learning_rate": 6.23225945432287e-06, + "loss": 0.2842, + "step": 6967 + }, + { + "epoch": 0.44, + "grad_norm": 2.287035243422365, + "learning_rate": 6.231272367132881e-06, + "loss": 0.3123, + "step": 6968 + }, + { + "epoch": 0.44, + "grad_norm": 2.945247621903697, + "learning_rate": 6.230285228855215e-06, + "loss": 0.2919, + "step": 6969 + }, + { + "epoch": 0.44, + "grad_norm": 2.020562013384556, + "learning_rate": 6.229298039530829e-06, + "loss": 0.2861, + "step": 6970 + }, + { + "epoch": 0.44, + "grad_norm": 2.0372056156931793, + "learning_rate": 6.228310799200685e-06, + "loss": 0.2948, + "step": 6971 + }, + { + "epoch": 0.44, + "grad_norm": 1.912464584260742, + "learning_rate": 6.227323507905743e-06, + "loss": 0.2911, + "step": 6972 + }, + { + "epoch": 0.44, + "grad_norm": 2.374210068437158, + "learning_rate": 6.226336165686969e-06, + "loss": 0.2929, + "step": 6973 + }, + { + "epoch": 0.44, + "grad_norm": 4.135244812766038, + "learning_rate": 6.225348772585329e-06, + "loss": 0.3076, + "step": 6974 + }, + { + "epoch": 0.44, + "grad_norm": 1.6406959607573017, + "learning_rate": 6.224361328641794e-06, + "loss": 0.2881, + "step": 6975 + }, + { + "epoch": 0.44, + "grad_norm": 2.1404839011362387, + "learning_rate": 6.2233738338973304e-06, + "loss": 0.2953, + "step": 6976 + }, + { + "epoch": 0.44, + "grad_norm": 1.649553312960678, + "learning_rate": 6.222386288392914e-06, + "loss": 0.307, + "step": 6977 + }, + { + "epoch": 0.44, + "grad_norm": 2.513140057397445, + "learning_rate": 6.2213986921695194e-06, + "loss": 0.3009, + "step": 6978 + }, + { + "epoch": 0.44, + "grad_norm": 2.273120614566552, + "learning_rate": 6.220411045268124e-06, + "loss": 0.2908, + "step": 6979 + }, + { + "epoch": 0.44, + "grad_norm": 2.083398440398737, + "learning_rate": 6.219423347729707e-06, + "loss": 0.2883, + "step": 6980 + }, + { + "epoch": 0.44, + "grad_norm": 2.8022818181447064, + "learning_rate": 6.218435599595249e-06, + "loss": 0.3016, + "step": 6981 + }, + { + "epoch": 0.44, + "grad_norm": 2.715627016313357, + "learning_rate": 6.217447800905733e-06, + "loss": 0.2806, + "step": 6982 + }, + { + "epoch": 0.44, + "grad_norm": 1.9147989051437198, + "learning_rate": 6.216459951702146e-06, + "loss": 0.2956, + "step": 6983 + }, + { + "epoch": 0.44, + "grad_norm": 2.57260093166668, + "learning_rate": 6.215472052025474e-06, + "loss": 0.2859, + "step": 6984 + }, + { + "epoch": 0.44, + "grad_norm": 2.6807125697290495, + "learning_rate": 6.214484101916709e-06, + "loss": 0.2853, + "step": 6985 + }, + { + "epoch": 0.44, + "grad_norm": 1.712395827031748, + "learning_rate": 6.21349610141684e-06, + "loss": 0.3045, + "step": 6986 + }, + { + "epoch": 0.44, + "grad_norm": 58.18177499566314, + "learning_rate": 6.2125080505668645e-06, + "loss": 0.3021, + "step": 6987 + }, + { + "epoch": 0.44, + "grad_norm": 3.5855958975202866, + "learning_rate": 6.2115199494077735e-06, + "loss": 0.2961, + "step": 6988 + }, + { + "epoch": 0.44, + "grad_norm": 2.754485949036334, + "learning_rate": 6.210531797980571e-06, + "loss": 0.3101, + "step": 6989 + }, + { + "epoch": 0.44, + "grad_norm": 1.8998780353349038, + "learning_rate": 6.2095435963262514e-06, + "loss": 0.3012, + "step": 6990 + }, + { + "epoch": 0.44, + "grad_norm": 2.6426158708593297, + "learning_rate": 6.2085553444858225e-06, + "loss": 0.2826, + "step": 6991 + }, + { + "epoch": 0.44, + "grad_norm": 4.342512165812579, + "learning_rate": 6.207567042500282e-06, + "loss": 0.2853, + "step": 6992 + }, + { + "epoch": 0.44, + "grad_norm": 2.042777282932608, + "learning_rate": 6.206578690410643e-06, + "loss": 0.277, + "step": 6993 + }, + { + "epoch": 0.44, + "grad_norm": 1.6647666484805896, + "learning_rate": 6.20559028825791e-06, + "loss": 0.2834, + "step": 6994 + }, + { + "epoch": 0.44, + "grad_norm": 1.5568308132031212, + "learning_rate": 6.204601836083094e-06, + "loss": 0.2961, + "step": 6995 + }, + { + "epoch": 0.44, + "grad_norm": 3.0035566784926324, + "learning_rate": 6.203613333927209e-06, + "loss": 0.2787, + "step": 6996 + }, + { + "epoch": 0.44, + "grad_norm": 1.3794714496850886, + "learning_rate": 6.202624781831269e-06, + "loss": 0.2754, + "step": 6997 + }, + { + "epoch": 0.44, + "grad_norm": 2.683218158441574, + "learning_rate": 6.20163617983629e-06, + "loss": 0.303, + "step": 6998 + }, + { + "epoch": 0.44, + "grad_norm": 2.316001094462051, + "learning_rate": 6.200647527983292e-06, + "loss": 0.2835, + "step": 6999 + }, + { + "epoch": 0.44, + "grad_norm": 2.7957678371881527, + "learning_rate": 6.199658826313295e-06, + "loss": 0.3096, + "step": 7000 + }, + { + "epoch": 0.44, + "grad_norm": 2.139733121461754, + "learning_rate": 6.198670074867324e-06, + "loss": 0.3103, + "step": 7001 + }, + { + "epoch": 0.44, + "grad_norm": 1.4758325644328265, + "learning_rate": 6.197681273686401e-06, + "loss": 0.2833, + "step": 7002 + }, + { + "epoch": 0.44, + "grad_norm": 5.786111703777581, + "learning_rate": 6.196692422811554e-06, + "loss": 0.3245, + "step": 7003 + }, + { + "epoch": 0.44, + "grad_norm": 3.4195571136952108, + "learning_rate": 6.195703522283813e-06, + "loss": 0.304, + "step": 7004 + }, + { + "epoch": 0.44, + "grad_norm": 2.368769143360143, + "learning_rate": 6.194714572144212e-06, + "loss": 0.3047, + "step": 7005 + }, + { + "epoch": 0.44, + "grad_norm": 2.4142774733542174, + "learning_rate": 6.193725572433779e-06, + "loss": 0.3021, + "step": 7006 + }, + { + "epoch": 0.44, + "grad_norm": 2.3621891793787175, + "learning_rate": 6.192736523193551e-06, + "loss": 0.3102, + "step": 7007 + }, + { + "epoch": 0.44, + "grad_norm": 2.34851259692707, + "learning_rate": 6.191747424464567e-06, + "loss": 0.306, + "step": 7008 + }, + { + "epoch": 0.44, + "grad_norm": 2.2927614161375742, + "learning_rate": 6.1907582762878675e-06, + "loss": 0.3111, + "step": 7009 + }, + { + "epoch": 0.44, + "grad_norm": 2.2750354088498064, + "learning_rate": 6.189769078704489e-06, + "loss": 0.3084, + "step": 7010 + }, + { + "epoch": 0.44, + "grad_norm": 10.013965366878603, + "learning_rate": 6.188779831755479e-06, + "loss": 0.3017, + "step": 7011 + }, + { + "epoch": 0.44, + "grad_norm": 2.013557669244629, + "learning_rate": 6.1877905354818825e-06, + "loss": 0.3053, + "step": 7012 + }, + { + "epoch": 0.44, + "grad_norm": 2.3582681305222066, + "learning_rate": 6.186801189924748e-06, + "loss": 0.2781, + "step": 7013 + }, + { + "epoch": 0.44, + "grad_norm": 3.750226125509539, + "learning_rate": 6.185811795125122e-06, + "loss": 0.309, + "step": 7014 + }, + { + "epoch": 0.44, + "grad_norm": 1.786131507164553, + "learning_rate": 6.18482235112406e-06, + "loss": 0.2924, + "step": 7015 + }, + { + "epoch": 0.44, + "grad_norm": 1.8849568031186652, + "learning_rate": 6.183832857962614e-06, + "loss": 0.2992, + "step": 7016 + }, + { + "epoch": 0.44, + "grad_norm": 2.856593453702831, + "learning_rate": 6.18284331568184e-06, + "loss": 0.3275, + "step": 7017 + }, + { + "epoch": 0.44, + "grad_norm": 2.2274692600465977, + "learning_rate": 6.181853724322795e-06, + "loss": 0.2941, + "step": 7018 + }, + { + "epoch": 0.44, + "grad_norm": 2.120222048101711, + "learning_rate": 6.180864083926541e-06, + "loss": 0.281, + "step": 7019 + }, + { + "epoch": 0.44, + "grad_norm": 1.9364150542022918, + "learning_rate": 6.179874394534138e-06, + "loss": 0.3085, + "step": 7020 + }, + { + "epoch": 0.44, + "grad_norm": 7.128182292913789, + "learning_rate": 6.178884656186651e-06, + "loss": 0.2761, + "step": 7021 + }, + { + "epoch": 0.44, + "grad_norm": 2.6459217754761926, + "learning_rate": 6.177894868925144e-06, + "loss": 0.2888, + "step": 7022 + }, + { + "epoch": 0.44, + "grad_norm": 5.379192067164824, + "learning_rate": 6.17690503279069e-06, + "loss": 0.306, + "step": 7023 + }, + { + "epoch": 0.44, + "grad_norm": 2.3443047043338927, + "learning_rate": 6.175915147824353e-06, + "loss": 0.2965, + "step": 7024 + }, + { + "epoch": 0.44, + "grad_norm": 2.9122964560256457, + "learning_rate": 6.174925214067209e-06, + "loss": 0.3075, + "step": 7025 + }, + { + "epoch": 0.44, + "grad_norm": 1.7078695101143218, + "learning_rate": 6.17393523156033e-06, + "loss": 0.2822, + "step": 7026 + }, + { + "epoch": 0.44, + "grad_norm": 3.4347276812654037, + "learning_rate": 6.172945200344794e-06, + "loss": 0.2816, + "step": 7027 + }, + { + "epoch": 0.44, + "grad_norm": 2.052861955587367, + "learning_rate": 6.171955120461679e-06, + "loss": 0.2809, + "step": 7028 + }, + { + "epoch": 0.44, + "grad_norm": 3.6833519806050177, + "learning_rate": 6.170964991952063e-06, + "loss": 0.3065, + "step": 7029 + }, + { + "epoch": 0.44, + "grad_norm": 2.4351329321545907, + "learning_rate": 6.169974814857029e-06, + "loss": 0.2893, + "step": 7030 + }, + { + "epoch": 0.44, + "grad_norm": 0.6541129622906818, + "learning_rate": 6.168984589217665e-06, + "loss": 0.4677, + "step": 7031 + }, + { + "epoch": 0.44, + "grad_norm": 1.7955176642312072, + "learning_rate": 6.167994315075051e-06, + "loss": 0.3035, + "step": 7032 + }, + { + "epoch": 0.44, + "grad_norm": 2.068203637383184, + "learning_rate": 6.16700399247028e-06, + "loss": 0.3021, + "step": 7033 + }, + { + "epoch": 0.44, + "grad_norm": 1.5691572599151804, + "learning_rate": 6.166013621444439e-06, + "loss": 0.293, + "step": 7034 + }, + { + "epoch": 0.44, + "grad_norm": 2.7879507597793993, + "learning_rate": 6.165023202038623e-06, + "loss": 0.2894, + "step": 7035 + }, + { + "epoch": 0.44, + "grad_norm": 2.1276244932199386, + "learning_rate": 6.164032734293923e-06, + "loss": 0.2785, + "step": 7036 + }, + { + "epoch": 0.44, + "grad_norm": 4.676875938315613, + "learning_rate": 6.163042218251441e-06, + "loss": 0.313, + "step": 7037 + }, + { + "epoch": 0.44, + "grad_norm": 1.8636923280359605, + "learning_rate": 6.162051653952268e-06, + "loss": 0.2848, + "step": 7038 + }, + { + "epoch": 0.44, + "grad_norm": 1.8956087264028727, + "learning_rate": 6.16106104143751e-06, + "loss": 0.3253, + "step": 7039 + }, + { + "epoch": 0.44, + "grad_norm": 6.0370163004082045, + "learning_rate": 6.160070380748266e-06, + "loss": 0.3339, + "step": 7040 + }, + { + "epoch": 0.44, + "grad_norm": 3.3509695033589586, + "learning_rate": 6.159079671925643e-06, + "loss": 0.2997, + "step": 7041 + }, + { + "epoch": 0.44, + "grad_norm": 4.296569875515288, + "learning_rate": 6.158088915010744e-06, + "loss": 0.2879, + "step": 7042 + }, + { + "epoch": 0.44, + "grad_norm": 0.6044115897038609, + "learning_rate": 6.15709811004468e-06, + "loss": 0.4974, + "step": 7043 + }, + { + "epoch": 0.44, + "grad_norm": 1.8354352805137628, + "learning_rate": 6.156107257068558e-06, + "loss": 0.2906, + "step": 7044 + }, + { + "epoch": 0.44, + "grad_norm": 3.2308941184507693, + "learning_rate": 6.155116356123495e-06, + "loss": 0.2823, + "step": 7045 + }, + { + "epoch": 0.44, + "grad_norm": 1.911403847403431, + "learning_rate": 6.154125407250602e-06, + "loss": 0.29, + "step": 7046 + }, + { + "epoch": 0.44, + "grad_norm": 2.47474944203617, + "learning_rate": 6.153134410490995e-06, + "loss": 0.2883, + "step": 7047 + }, + { + "epoch": 0.44, + "grad_norm": 1.3271239921105271, + "learning_rate": 6.152143365885794e-06, + "loss": 0.2917, + "step": 7048 + }, + { + "epoch": 0.44, + "grad_norm": 1.9133580788293076, + "learning_rate": 6.151152273476118e-06, + "loss": 0.2858, + "step": 7049 + }, + { + "epoch": 0.44, + "grad_norm": 19.410731303828012, + "learning_rate": 6.150161133303088e-06, + "loss": 0.3041, + "step": 7050 + }, + { + "epoch": 0.44, + "grad_norm": 3.48746589224747, + "learning_rate": 6.149169945407832e-06, + "loss": 0.2927, + "step": 7051 + }, + { + "epoch": 0.44, + "grad_norm": 2.2059173948238775, + "learning_rate": 6.1481787098314725e-06, + "loss": 0.3055, + "step": 7052 + }, + { + "epoch": 0.44, + "grad_norm": 2.0858129348336796, + "learning_rate": 6.14718742661514e-06, + "loss": 0.2933, + "step": 7053 + }, + { + "epoch": 0.44, + "grad_norm": 2.4960797857699237, + "learning_rate": 6.146196095799963e-06, + "loss": 0.2749, + "step": 7054 + }, + { + "epoch": 0.44, + "grad_norm": 4.904959603471647, + "learning_rate": 6.145204717427073e-06, + "loss": 0.291, + "step": 7055 + }, + { + "epoch": 0.44, + "grad_norm": 2.2726555804621618, + "learning_rate": 6.144213291537606e-06, + "loss": 0.2991, + "step": 7056 + }, + { + "epoch": 0.44, + "grad_norm": 1.5150798268392562, + "learning_rate": 6.143221818172699e-06, + "loss": 0.2961, + "step": 7057 + }, + { + "epoch": 0.44, + "grad_norm": 5.619053693609467, + "learning_rate": 6.142230297373486e-06, + "loss": 0.299, + "step": 7058 + }, + { + "epoch": 0.44, + "grad_norm": 3.5399819273112656, + "learning_rate": 6.141238729181109e-06, + "loss": 0.2867, + "step": 7059 + }, + { + "epoch": 0.44, + "grad_norm": 2.4249270772240146, + "learning_rate": 6.1402471136367115e-06, + "loss": 0.2939, + "step": 7060 + }, + { + "epoch": 0.44, + "grad_norm": 2.078587070086647, + "learning_rate": 6.139255450781436e-06, + "loss": 0.2876, + "step": 7061 + }, + { + "epoch": 0.44, + "grad_norm": 1.5371946901276299, + "learning_rate": 6.138263740656427e-06, + "loss": 0.2646, + "step": 7062 + }, + { + "epoch": 0.44, + "grad_norm": 2.154862380993804, + "learning_rate": 6.137271983302834e-06, + "loss": 0.312, + "step": 7063 + }, + { + "epoch": 0.44, + "grad_norm": 1.2056076224871501, + "learning_rate": 6.136280178761806e-06, + "loss": 0.2954, + "step": 7064 + }, + { + "epoch": 0.44, + "grad_norm": 3.7749927945554256, + "learning_rate": 6.135288327074497e-06, + "loss": 0.3172, + "step": 7065 + }, + { + "epoch": 0.44, + "grad_norm": 1.8188161739814512, + "learning_rate": 6.134296428282056e-06, + "loss": 0.3134, + "step": 7066 + }, + { + "epoch": 0.44, + "grad_norm": 3.233217469619812, + "learning_rate": 6.1333044824256435e-06, + "loss": 0.2885, + "step": 7067 + }, + { + "epoch": 0.44, + "grad_norm": 1.975546975137299, + "learning_rate": 6.132312489546414e-06, + "loss": 0.2984, + "step": 7068 + }, + { + "epoch": 0.44, + "grad_norm": 3.8913759033808324, + "learning_rate": 6.131320449685529e-06, + "loss": 0.2788, + "step": 7069 + }, + { + "epoch": 0.44, + "grad_norm": 4.477800206370894, + "learning_rate": 6.130328362884148e-06, + "loss": 0.2956, + "step": 7070 + }, + { + "epoch": 0.44, + "grad_norm": 0.6984344336266934, + "learning_rate": 6.129336229183437e-06, + "loss": 0.518, + "step": 7071 + }, + { + "epoch": 0.44, + "grad_norm": 3.8583193101830235, + "learning_rate": 6.128344048624558e-06, + "loss": 0.3035, + "step": 7072 + }, + { + "epoch": 0.44, + "grad_norm": 2.2840903028232784, + "learning_rate": 6.1273518212486825e-06, + "loss": 0.2788, + "step": 7073 + }, + { + "epoch": 0.44, + "grad_norm": 4.548828807577619, + "learning_rate": 6.126359547096975e-06, + "loss": 0.2797, + "step": 7074 + }, + { + "epoch": 0.44, + "grad_norm": 5.519022640767992, + "learning_rate": 6.125367226210612e-06, + "loss": 0.299, + "step": 7075 + }, + { + "epoch": 0.45, + "grad_norm": 1.4147106372018525, + "learning_rate": 6.124374858630762e-06, + "loss": 0.2934, + "step": 7076 + }, + { + "epoch": 0.45, + "grad_norm": 1.6580370895773484, + "learning_rate": 6.123382444398603e-06, + "loss": 0.2755, + "step": 7077 + }, + { + "epoch": 0.45, + "grad_norm": 1.3587927852990556, + "learning_rate": 6.12238998355531e-06, + "loss": 0.2892, + "step": 7078 + }, + { + "epoch": 0.45, + "grad_norm": 3.578125520614798, + "learning_rate": 6.121397476142064e-06, + "loss": 0.2821, + "step": 7079 + }, + { + "epoch": 0.45, + "grad_norm": 2.038925348742282, + "learning_rate": 6.1204049222000435e-06, + "loss": 0.2789, + "step": 7080 + }, + { + "epoch": 0.45, + "grad_norm": 3.3745016399428254, + "learning_rate": 6.1194123217704336e-06, + "loss": 0.2845, + "step": 7081 + }, + { + "epoch": 0.45, + "grad_norm": 1.7100527980503495, + "learning_rate": 6.118419674894418e-06, + "loss": 0.2932, + "step": 7082 + }, + { + "epoch": 0.45, + "grad_norm": 2.379610724171847, + "learning_rate": 6.1174269816131845e-06, + "loss": 0.3309, + "step": 7083 + }, + { + "epoch": 0.45, + "grad_norm": 1.2657517778028917, + "learning_rate": 6.11643424196792e-06, + "loss": 0.295, + "step": 7084 + }, + { + "epoch": 0.45, + "grad_norm": 5.733993237194969, + "learning_rate": 6.115441455999816e-06, + "loss": 0.2892, + "step": 7085 + }, + { + "epoch": 0.45, + "grad_norm": 5.657008334448093, + "learning_rate": 6.114448623750065e-06, + "loss": 0.279, + "step": 7086 + }, + { + "epoch": 0.45, + "grad_norm": 2.3213655311496555, + "learning_rate": 6.113455745259861e-06, + "loss": 0.3175, + "step": 7087 + }, + { + "epoch": 0.45, + "grad_norm": 2.5280282867737696, + "learning_rate": 6.112462820570399e-06, + "loss": 0.3049, + "step": 7088 + }, + { + "epoch": 0.45, + "grad_norm": 2.972235837101983, + "learning_rate": 6.11146984972288e-06, + "loss": 0.3146, + "step": 7089 + }, + { + "epoch": 0.45, + "grad_norm": 0.5848359505232334, + "learning_rate": 6.110476832758503e-06, + "loss": 0.4673, + "step": 7090 + }, + { + "epoch": 0.45, + "grad_norm": 2.6857764963936, + "learning_rate": 6.10948376971847e-06, + "loss": 0.2994, + "step": 7091 + }, + { + "epoch": 0.45, + "grad_norm": 3.8886980918386227, + "learning_rate": 6.108490660643982e-06, + "loss": 0.2876, + "step": 7092 + }, + { + "epoch": 0.45, + "grad_norm": 4.040459490022842, + "learning_rate": 6.107497505576251e-06, + "loss": 0.2765, + "step": 7093 + }, + { + "epoch": 0.45, + "grad_norm": 6.509208148055233, + "learning_rate": 6.106504304556479e-06, + "loss": 0.2864, + "step": 7094 + }, + { + "epoch": 0.45, + "grad_norm": 1.6197656977857953, + "learning_rate": 6.10551105762588e-06, + "loss": 0.3046, + "step": 7095 + }, + { + "epoch": 0.45, + "grad_norm": 1.9027807338901899, + "learning_rate": 6.104517764825662e-06, + "loss": 0.2777, + "step": 7096 + }, + { + "epoch": 0.45, + "grad_norm": 1.766224200303339, + "learning_rate": 6.103524426197041e-06, + "loss": 0.2775, + "step": 7097 + }, + { + "epoch": 0.45, + "grad_norm": 2.965032226299235, + "learning_rate": 6.10253104178123e-06, + "loss": 0.2851, + "step": 7098 + }, + { + "epoch": 0.45, + "grad_norm": 1.9785809208109495, + "learning_rate": 6.1015376116194506e-06, + "loss": 0.3002, + "step": 7099 + }, + { + "epoch": 0.45, + "grad_norm": 1.3618764576495743, + "learning_rate": 6.100544135752916e-06, + "loss": 0.2931, + "step": 7100 + }, + { + "epoch": 0.45, + "grad_norm": 2.8428649776226034, + "learning_rate": 6.0995506142228525e-06, + "loss": 0.2921, + "step": 7101 + }, + { + "epoch": 0.45, + "grad_norm": 3.0358845575392865, + "learning_rate": 6.09855704707048e-06, + "loss": 0.2974, + "step": 7102 + }, + { + "epoch": 0.45, + "grad_norm": 6.832942096883757, + "learning_rate": 6.097563434337026e-06, + "loss": 0.2927, + "step": 7103 + }, + { + "epoch": 0.45, + "grad_norm": 2.005828899782704, + "learning_rate": 6.096569776063712e-06, + "loss": 0.2955, + "step": 7104 + }, + { + "epoch": 0.45, + "grad_norm": 2.1884957411659056, + "learning_rate": 6.095576072291774e-06, + "loss": 0.3011, + "step": 7105 + }, + { + "epoch": 0.45, + "grad_norm": 1.7264328630247696, + "learning_rate": 6.094582323062437e-06, + "loss": 0.2942, + "step": 7106 + }, + { + "epoch": 0.45, + "grad_norm": 6.5422946025736, + "learning_rate": 6.093588528416936e-06, + "loss": 0.2919, + "step": 7107 + }, + { + "epoch": 0.45, + "grad_norm": 4.149446507980943, + "learning_rate": 6.092594688396504e-06, + "loss": 0.3032, + "step": 7108 + }, + { + "epoch": 0.45, + "grad_norm": 1.714015273782947, + "learning_rate": 6.091600803042378e-06, + "loss": 0.2971, + "step": 7109 + }, + { + "epoch": 0.45, + "grad_norm": 1.6346469554531453, + "learning_rate": 6.090606872395796e-06, + "loss": 0.2971, + "step": 7110 + }, + { + "epoch": 0.45, + "grad_norm": 2.6839766515658936, + "learning_rate": 6.089612896497996e-06, + "loss": 0.311, + "step": 7111 + }, + { + "epoch": 0.45, + "grad_norm": 2.251165125405875, + "learning_rate": 6.088618875390223e-06, + "loss": 0.3107, + "step": 7112 + }, + { + "epoch": 0.45, + "grad_norm": 2.228872572673163, + "learning_rate": 6.087624809113721e-06, + "loss": 0.2842, + "step": 7113 + }, + { + "epoch": 0.45, + "grad_norm": 2.5442661757751215, + "learning_rate": 6.086630697709731e-06, + "loss": 0.2978, + "step": 7114 + }, + { + "epoch": 0.45, + "grad_norm": 2.2380844371099577, + "learning_rate": 6.085636541219504e-06, + "loss": 0.2898, + "step": 7115 + }, + { + "epoch": 0.45, + "grad_norm": 2.403116211780939, + "learning_rate": 6.084642339684289e-06, + "loss": 0.3015, + "step": 7116 + }, + { + "epoch": 0.45, + "grad_norm": 1.8748136352845692, + "learning_rate": 6.083648093145337e-06, + "loss": 0.3157, + "step": 7117 + }, + { + "epoch": 0.45, + "grad_norm": 4.264386049161767, + "learning_rate": 6.0826538016439e-06, + "loss": 0.3019, + "step": 7118 + }, + { + "epoch": 0.45, + "grad_norm": 1.424593499904079, + "learning_rate": 6.081659465221234e-06, + "loss": 0.2966, + "step": 7119 + }, + { + "epoch": 0.45, + "grad_norm": 3.2664871301678624, + "learning_rate": 6.080665083918595e-06, + "loss": 0.2923, + "step": 7120 + }, + { + "epoch": 0.45, + "grad_norm": 0.6016217598693909, + "learning_rate": 6.079670657777244e-06, + "loss": 0.5003, + "step": 7121 + }, + { + "epoch": 0.45, + "grad_norm": 1.599686531993162, + "learning_rate": 6.078676186838438e-06, + "loss": 0.2986, + "step": 7122 + }, + { + "epoch": 0.45, + "grad_norm": 2.0419308709735158, + "learning_rate": 6.077681671143443e-06, + "loss": 0.3194, + "step": 7123 + }, + { + "epoch": 0.45, + "grad_norm": 5.3146080493616905, + "learning_rate": 6.076687110733519e-06, + "loss": 0.2942, + "step": 7124 + }, + { + "epoch": 0.45, + "grad_norm": 1.3781187233316055, + "learning_rate": 6.075692505649937e-06, + "loss": 0.2602, + "step": 7125 + }, + { + "epoch": 0.45, + "grad_norm": 2.3047037457388875, + "learning_rate": 6.074697855933959e-06, + "loss": 0.2953, + "step": 7126 + }, + { + "epoch": 0.45, + "grad_norm": 1.7933948377616522, + "learning_rate": 6.07370316162686e-06, + "loss": 0.2808, + "step": 7127 + }, + { + "epoch": 0.45, + "grad_norm": 1.8672118045593933, + "learning_rate": 6.0727084227699095e-06, + "loss": 0.2843, + "step": 7128 + }, + { + "epoch": 0.45, + "grad_norm": 1.9693002900450496, + "learning_rate": 6.071713639404382e-06, + "loss": 0.2911, + "step": 7129 + }, + { + "epoch": 0.45, + "grad_norm": 2.8233998369605118, + "learning_rate": 6.07071881157155e-06, + "loss": 0.2838, + "step": 7130 + }, + { + "epoch": 0.45, + "grad_norm": 2.053406986259955, + "learning_rate": 6.069723939312695e-06, + "loss": 0.3018, + "step": 7131 + }, + { + "epoch": 0.45, + "grad_norm": 1.7077429588707012, + "learning_rate": 6.068729022669092e-06, + "loss": 0.2779, + "step": 7132 + }, + { + "epoch": 0.45, + "grad_norm": 1.6434443393824358, + "learning_rate": 6.067734061682024e-06, + "loss": 0.2968, + "step": 7133 + }, + { + "epoch": 0.45, + "grad_norm": 2.0316381344492025, + "learning_rate": 6.066739056392774e-06, + "loss": 0.2954, + "step": 7134 + }, + { + "epoch": 0.45, + "grad_norm": 1.5362265329115818, + "learning_rate": 6.065744006842626e-06, + "loss": 0.2978, + "step": 7135 + }, + { + "epoch": 0.45, + "grad_norm": 1.415171399932464, + "learning_rate": 6.064748913072864e-06, + "loss": 0.2853, + "step": 7136 + }, + { + "epoch": 0.45, + "grad_norm": 1.6488797212567636, + "learning_rate": 6.063753775124781e-06, + "loss": 0.291, + "step": 7137 + }, + { + "epoch": 0.45, + "grad_norm": 2.6571025766959937, + "learning_rate": 6.062758593039663e-06, + "loss": 0.2965, + "step": 7138 + }, + { + "epoch": 0.45, + "grad_norm": 2.4006475600524397, + "learning_rate": 6.061763366858804e-06, + "loss": 0.2868, + "step": 7139 + }, + { + "epoch": 0.45, + "grad_norm": 1.4657827633756655, + "learning_rate": 6.060768096623496e-06, + "loss": 0.3023, + "step": 7140 + }, + { + "epoch": 0.45, + "grad_norm": 1.710688928026125, + "learning_rate": 6.059772782375036e-06, + "loss": 0.2826, + "step": 7141 + }, + { + "epoch": 0.45, + "grad_norm": 3.5995991529604687, + "learning_rate": 6.0587774241547205e-06, + "loss": 0.3118, + "step": 7142 + }, + { + "epoch": 0.45, + "grad_norm": 3.041110035251263, + "learning_rate": 6.057782022003851e-06, + "loss": 0.2813, + "step": 7143 + }, + { + "epoch": 0.45, + "grad_norm": 2.0509229914424894, + "learning_rate": 6.056786575963725e-06, + "loss": 0.2976, + "step": 7144 + }, + { + "epoch": 0.45, + "grad_norm": 2.1445565628745213, + "learning_rate": 6.0557910860756466e-06, + "loss": 0.2924, + "step": 7145 + }, + { + "epoch": 0.45, + "grad_norm": 2.20498732398269, + "learning_rate": 6.054795552380921e-06, + "loss": 0.2961, + "step": 7146 + }, + { + "epoch": 0.45, + "grad_norm": 3.8094318107666756, + "learning_rate": 6.053799974920856e-06, + "loss": 0.2857, + "step": 7147 + }, + { + "epoch": 0.45, + "grad_norm": 1.4924062943715422, + "learning_rate": 6.052804353736757e-06, + "loss": 0.2792, + "step": 7148 + }, + { + "epoch": 0.45, + "grad_norm": 12.540322723520255, + "learning_rate": 6.051808688869934e-06, + "loss": 0.3007, + "step": 7149 + }, + { + "epoch": 0.45, + "grad_norm": 2.777149268579538, + "learning_rate": 6.050812980361701e-06, + "loss": 0.3074, + "step": 7150 + }, + { + "epoch": 0.45, + "grad_norm": 1.3407254052984399, + "learning_rate": 6.049817228253373e-06, + "loss": 0.277, + "step": 7151 + }, + { + "epoch": 0.45, + "grad_norm": 5.7574284251759975, + "learning_rate": 6.048821432586261e-06, + "loss": 0.3029, + "step": 7152 + }, + { + "epoch": 0.45, + "grad_norm": 1.4172601326843792, + "learning_rate": 6.047825593401686e-06, + "loss": 0.2824, + "step": 7153 + }, + { + "epoch": 0.45, + "grad_norm": 1.7614750685381748, + "learning_rate": 6.046829710740966e-06, + "loss": 0.302, + "step": 7154 + }, + { + "epoch": 0.45, + "grad_norm": 1.925876730667548, + "learning_rate": 6.045833784645422e-06, + "loss": 0.304, + "step": 7155 + }, + { + "epoch": 0.45, + "grad_norm": 1.8993360446893568, + "learning_rate": 6.044837815156377e-06, + "loss": 0.2811, + "step": 7156 + }, + { + "epoch": 0.45, + "grad_norm": 2.5832044248453947, + "learning_rate": 6.043841802315153e-06, + "loss": 0.3002, + "step": 7157 + }, + { + "epoch": 0.45, + "grad_norm": 5.597899906254344, + "learning_rate": 6.042845746163081e-06, + "loss": 0.3098, + "step": 7158 + }, + { + "epoch": 0.45, + "grad_norm": 2.2392190828824825, + "learning_rate": 6.041849646741485e-06, + "loss": 0.3133, + "step": 7159 + }, + { + "epoch": 0.45, + "grad_norm": 2.049087140610685, + "learning_rate": 6.040853504091698e-06, + "loss": 0.2963, + "step": 7160 + }, + { + "epoch": 0.45, + "grad_norm": 1.7356790660117334, + "learning_rate": 6.03985731825505e-06, + "loss": 0.2759, + "step": 7161 + }, + { + "epoch": 0.45, + "grad_norm": 3.5465860338521598, + "learning_rate": 6.038861089272875e-06, + "loss": 0.2939, + "step": 7162 + }, + { + "epoch": 0.45, + "grad_norm": 1.560534782086251, + "learning_rate": 6.037864817186507e-06, + "loss": 0.2859, + "step": 7163 + }, + { + "epoch": 0.45, + "grad_norm": 4.841044845563695, + "learning_rate": 6.036868502037286e-06, + "loss": 0.2788, + "step": 7164 + }, + { + "epoch": 0.45, + "grad_norm": 3.0138936133192953, + "learning_rate": 6.035872143866549e-06, + "loss": 0.296, + "step": 7165 + }, + { + "epoch": 0.45, + "grad_norm": 1.4743239005657713, + "learning_rate": 6.034875742715636e-06, + "loss": 0.296, + "step": 7166 + }, + { + "epoch": 0.45, + "grad_norm": 1.88237633275076, + "learning_rate": 6.03387929862589e-06, + "loss": 0.3117, + "step": 7167 + }, + { + "epoch": 0.45, + "grad_norm": 4.762908556869072, + "learning_rate": 6.032882811638656e-06, + "loss": 0.3031, + "step": 7168 + }, + { + "epoch": 0.45, + "grad_norm": 1.416071537999233, + "learning_rate": 6.03188628179528e-06, + "loss": 0.2716, + "step": 7169 + }, + { + "epoch": 0.45, + "grad_norm": 2.3807843710024055, + "learning_rate": 6.030889709137109e-06, + "loss": 0.2826, + "step": 7170 + }, + { + "epoch": 0.45, + "grad_norm": 2.425970117898785, + "learning_rate": 6.029893093705492e-06, + "loss": 0.2963, + "step": 7171 + }, + { + "epoch": 0.45, + "grad_norm": 4.606691631289159, + "learning_rate": 6.0288964355417825e-06, + "loss": 0.317, + "step": 7172 + }, + { + "epoch": 0.45, + "grad_norm": 1.7422822195154766, + "learning_rate": 6.027899734687332e-06, + "loss": 0.2906, + "step": 7173 + }, + { + "epoch": 0.45, + "grad_norm": 1.6710875179664204, + "learning_rate": 6.026902991183496e-06, + "loss": 0.3053, + "step": 7174 + }, + { + "epoch": 0.45, + "grad_norm": 2.757845693594874, + "learning_rate": 6.02590620507163e-06, + "loss": 0.3094, + "step": 7175 + }, + { + "epoch": 0.45, + "grad_norm": 4.299915257306338, + "learning_rate": 6.024909376393093e-06, + "loss": 0.2785, + "step": 7176 + }, + { + "epoch": 0.45, + "grad_norm": 3.3634373999122538, + "learning_rate": 6.0239125051892475e-06, + "loss": 0.2949, + "step": 7177 + }, + { + "epoch": 0.45, + "grad_norm": 2.150206470673079, + "learning_rate": 6.022915591501453e-06, + "loss": 0.2912, + "step": 7178 + }, + { + "epoch": 0.45, + "grad_norm": 2.8093004842615352, + "learning_rate": 6.021918635371072e-06, + "loss": 0.2975, + "step": 7179 + }, + { + "epoch": 0.45, + "grad_norm": 2.29026863303202, + "learning_rate": 6.020921636839473e-06, + "loss": 0.2986, + "step": 7180 + }, + { + "epoch": 0.45, + "grad_norm": 1.6955111512391063, + "learning_rate": 6.019924595948022e-06, + "loss": 0.289, + "step": 7181 + }, + { + "epoch": 0.45, + "grad_norm": 1.773420507313563, + "learning_rate": 6.018927512738088e-06, + "loss": 0.2971, + "step": 7182 + }, + { + "epoch": 0.45, + "grad_norm": 1.4514447964474035, + "learning_rate": 6.017930387251041e-06, + "loss": 0.2911, + "step": 7183 + }, + { + "epoch": 0.45, + "grad_norm": 1.4629645084668805, + "learning_rate": 6.016933219528255e-06, + "loss": 0.3011, + "step": 7184 + }, + { + "epoch": 0.45, + "grad_norm": 2.3663085418713634, + "learning_rate": 6.015936009611103e-06, + "loss": 0.2928, + "step": 7185 + }, + { + "epoch": 0.45, + "grad_norm": 2.1752263871073754, + "learning_rate": 6.0149387575409615e-06, + "loss": 0.3098, + "step": 7186 + }, + { + "epoch": 0.45, + "grad_norm": 1.6604531010115278, + "learning_rate": 6.0139414633592075e-06, + "loss": 0.286, + "step": 7187 + }, + { + "epoch": 0.45, + "grad_norm": 1.3047216009249556, + "learning_rate": 6.012944127107222e-06, + "loss": 0.2945, + "step": 7188 + }, + { + "epoch": 0.45, + "grad_norm": 1.6102446940460937, + "learning_rate": 6.0119467488263846e-06, + "loss": 0.3039, + "step": 7189 + }, + { + "epoch": 0.45, + "grad_norm": 1.6873860243434702, + "learning_rate": 6.010949328558081e-06, + "loss": 0.2813, + "step": 7190 + }, + { + "epoch": 0.45, + "grad_norm": 3.65250739275473, + "learning_rate": 6.009951866343693e-06, + "loss": 0.2974, + "step": 7191 + }, + { + "epoch": 0.45, + "grad_norm": 5.136638892242125, + "learning_rate": 6.008954362224608e-06, + "loss": 0.3219, + "step": 7192 + }, + { + "epoch": 0.45, + "grad_norm": 2.76969378823849, + "learning_rate": 6.007956816242214e-06, + "loss": 0.3051, + "step": 7193 + }, + { + "epoch": 0.45, + "grad_norm": 2.0477884946942178, + "learning_rate": 6.006959228437903e-06, + "loss": 0.279, + "step": 7194 + }, + { + "epoch": 0.45, + "grad_norm": 2.955981559417683, + "learning_rate": 6.005961598853064e-06, + "loss": 0.289, + "step": 7195 + }, + { + "epoch": 0.45, + "grad_norm": 1.8725487337804585, + "learning_rate": 6.004963927529092e-06, + "loss": 0.2863, + "step": 7196 + }, + { + "epoch": 0.45, + "grad_norm": 2.5115666860324817, + "learning_rate": 6.00396621450738e-06, + "loss": 0.2904, + "step": 7197 + }, + { + "epoch": 0.45, + "grad_norm": 1.806800120613164, + "learning_rate": 6.002968459829328e-06, + "loss": 0.3106, + "step": 7198 + }, + { + "epoch": 0.45, + "grad_norm": 4.963758732342158, + "learning_rate": 6.001970663536333e-06, + "loss": 0.2889, + "step": 7199 + }, + { + "epoch": 0.45, + "grad_norm": 194.9734329297264, + "learning_rate": 6.000972825669795e-06, + "loss": 0.3226, + "step": 7200 + }, + { + "epoch": 0.45, + "grad_norm": 1.5076472658835243, + "learning_rate": 5.999974946271116e-06, + "loss": 0.2848, + "step": 7201 + }, + { + "epoch": 0.45, + "grad_norm": 1.935928815908976, + "learning_rate": 5.9989770253817015e-06, + "loss": 0.2931, + "step": 7202 + }, + { + "epoch": 0.45, + "grad_norm": 4.462734348506588, + "learning_rate": 5.9979790630429556e-06, + "loss": 0.2916, + "step": 7203 + }, + { + "epoch": 0.45, + "grad_norm": 2.087409253017042, + "learning_rate": 5.9969810592962866e-06, + "loss": 0.3014, + "step": 7204 + }, + { + "epoch": 0.45, + "grad_norm": 2.3780574766695843, + "learning_rate": 5.995983014183101e-06, + "loss": 0.3041, + "step": 7205 + }, + { + "epoch": 0.45, + "grad_norm": 5.939480393315704, + "learning_rate": 5.994984927744812e-06, + "loss": 0.2984, + "step": 7206 + }, + { + "epoch": 0.45, + "grad_norm": 1.6145280272522786, + "learning_rate": 5.993986800022831e-06, + "loss": 0.2807, + "step": 7207 + }, + { + "epoch": 0.45, + "grad_norm": 3.8932343800479923, + "learning_rate": 5.9929886310585715e-06, + "loss": 0.2725, + "step": 7208 + }, + { + "epoch": 0.45, + "grad_norm": 1.6408730271203036, + "learning_rate": 5.9919904208934495e-06, + "loss": 0.2866, + "step": 7209 + }, + { + "epoch": 0.45, + "grad_norm": 2.057829410640411, + "learning_rate": 5.990992169568884e-06, + "loss": 0.3055, + "step": 7210 + }, + { + "epoch": 0.45, + "grad_norm": 2.0850932435888914, + "learning_rate": 5.989993877126294e-06, + "loss": 0.2901, + "step": 7211 + }, + { + "epoch": 0.45, + "grad_norm": 1.826839017585909, + "learning_rate": 5.988995543607099e-06, + "loss": 0.2915, + "step": 7212 + }, + { + "epoch": 0.45, + "grad_norm": 2.125635241380182, + "learning_rate": 5.9879971690527205e-06, + "loss": 0.3033, + "step": 7213 + }, + { + "epoch": 0.45, + "grad_norm": 1.8525267201779592, + "learning_rate": 5.986998753504586e-06, + "loss": 0.2862, + "step": 7214 + }, + { + "epoch": 0.45, + "grad_norm": 4.721278197258601, + "learning_rate": 5.98600029700412e-06, + "loss": 0.3181, + "step": 7215 + }, + { + "epoch": 0.45, + "grad_norm": 2.8176466356005263, + "learning_rate": 5.985001799592751e-06, + "loss": 0.2982, + "step": 7216 + }, + { + "epoch": 0.45, + "grad_norm": 1.6227108712118903, + "learning_rate": 5.984003261311907e-06, + "loss": 0.2962, + "step": 7217 + }, + { + "epoch": 0.45, + "grad_norm": 2.6973316147915964, + "learning_rate": 5.983004682203019e-06, + "loss": 0.2946, + "step": 7218 + }, + { + "epoch": 0.45, + "grad_norm": 1.8593563400881514, + "learning_rate": 5.982006062307523e-06, + "loss": 0.2914, + "step": 7219 + }, + { + "epoch": 0.45, + "grad_norm": 1.6367525378794925, + "learning_rate": 5.9810074016668505e-06, + "loss": 0.3104, + "step": 7220 + }, + { + "epoch": 0.45, + "grad_norm": 2.2345639766843854, + "learning_rate": 5.980008700322437e-06, + "loss": 0.2808, + "step": 7221 + }, + { + "epoch": 0.45, + "grad_norm": 1.1959260099266358, + "learning_rate": 5.979009958315724e-06, + "loss": 0.2763, + "step": 7222 + }, + { + "epoch": 0.45, + "grad_norm": 2.869002819082661, + "learning_rate": 5.978011175688146e-06, + "loss": 0.3071, + "step": 7223 + }, + { + "epoch": 0.45, + "grad_norm": 5.07459957347487, + "learning_rate": 5.977012352481151e-06, + "loss": 0.2802, + "step": 7224 + }, + { + "epoch": 0.45, + "grad_norm": 2.8806389119383606, + "learning_rate": 5.976013488736176e-06, + "loss": 0.3045, + "step": 7225 + }, + { + "epoch": 0.45, + "grad_norm": 1.9032865003983352, + "learning_rate": 5.975014584494666e-06, + "loss": 0.2754, + "step": 7226 + }, + { + "epoch": 0.45, + "grad_norm": 1.6699520016809288, + "learning_rate": 5.974015639798071e-06, + "loss": 0.3022, + "step": 7227 + }, + { + "epoch": 0.45, + "grad_norm": 1.76294421370008, + "learning_rate": 5.973016654687838e-06, + "loss": 0.3242, + "step": 7228 + }, + { + "epoch": 0.45, + "grad_norm": 3.2832268842075085, + "learning_rate": 5.972017629205414e-06, + "loss": 0.2946, + "step": 7229 + }, + { + "epoch": 0.45, + "grad_norm": 2.221390776982483, + "learning_rate": 5.971018563392252e-06, + "loss": 0.2823, + "step": 7230 + }, + { + "epoch": 0.45, + "grad_norm": 6.806604932737318, + "learning_rate": 5.970019457289807e-06, + "loss": 0.2818, + "step": 7231 + }, + { + "epoch": 0.45, + "grad_norm": 3.002898366321213, + "learning_rate": 5.969020310939531e-06, + "loss": 0.2873, + "step": 7232 + }, + { + "epoch": 0.45, + "grad_norm": 1.7389739878325832, + "learning_rate": 5.96802112438288e-06, + "loss": 0.2862, + "step": 7233 + }, + { + "epoch": 0.45, + "grad_norm": 1.7332056198816865, + "learning_rate": 5.967021897661313e-06, + "loss": 0.3007, + "step": 7234 + }, + { + "epoch": 0.46, + "grad_norm": 2.8028012010013845, + "learning_rate": 5.96602263081629e-06, + "loss": 0.3179, + "step": 7235 + }, + { + "epoch": 0.46, + "grad_norm": 3.0725242141711324, + "learning_rate": 5.965023323889274e-06, + "loss": 0.3128, + "step": 7236 + }, + { + "epoch": 0.46, + "grad_norm": 1.6802336999132215, + "learning_rate": 5.964023976921726e-06, + "loss": 0.2844, + "step": 7237 + }, + { + "epoch": 0.46, + "grad_norm": 2.1904381635886274, + "learning_rate": 5.963024589955109e-06, + "loss": 0.3195, + "step": 7238 + }, + { + "epoch": 0.46, + "grad_norm": 3.3641856927142584, + "learning_rate": 5.962025163030892e-06, + "loss": 0.2782, + "step": 7239 + }, + { + "epoch": 0.46, + "grad_norm": 1.806119631520209, + "learning_rate": 5.961025696190542e-06, + "loss": 0.293, + "step": 7240 + }, + { + "epoch": 0.46, + "grad_norm": 7.6191839085668445, + "learning_rate": 5.96002618947553e-06, + "loss": 0.2976, + "step": 7241 + }, + { + "epoch": 0.46, + "grad_norm": 5.4019028021268545, + "learning_rate": 5.959026642927326e-06, + "loss": 0.2995, + "step": 7242 + }, + { + "epoch": 0.46, + "grad_norm": 6.688824631444094, + "learning_rate": 5.958027056587402e-06, + "loss": 0.2929, + "step": 7243 + }, + { + "epoch": 0.46, + "grad_norm": 3.6488963726063965, + "learning_rate": 5.957027430497234e-06, + "loss": 0.303, + "step": 7244 + }, + { + "epoch": 0.46, + "grad_norm": 1.7708326852048644, + "learning_rate": 5.956027764698299e-06, + "loss": 0.2841, + "step": 7245 + }, + { + "epoch": 0.46, + "grad_norm": 1.5246818748926652, + "learning_rate": 5.955028059232074e-06, + "loss": 0.2819, + "step": 7246 + }, + { + "epoch": 0.46, + "grad_norm": 4.3463251772205735, + "learning_rate": 5.9540283141400375e-06, + "loss": 0.312, + "step": 7247 + }, + { + "epoch": 0.46, + "grad_norm": 4.583674965530944, + "learning_rate": 5.953028529463671e-06, + "loss": 0.2778, + "step": 7248 + }, + { + "epoch": 0.46, + "grad_norm": 3.2052136113771246, + "learning_rate": 5.952028705244461e-06, + "loss": 0.2943, + "step": 7249 + }, + { + "epoch": 0.46, + "grad_norm": 1.6632695069557133, + "learning_rate": 5.951028841523887e-06, + "loss": 0.2919, + "step": 7250 + }, + { + "epoch": 0.46, + "grad_norm": 1.9484240863350015, + "learning_rate": 5.9500289383434375e-06, + "loss": 0.2834, + "step": 7251 + }, + { + "epoch": 0.46, + "grad_norm": 3.637149342076611, + "learning_rate": 5.949028995744599e-06, + "loss": 0.2985, + "step": 7252 + }, + { + "epoch": 0.46, + "grad_norm": 5.3914995576076095, + "learning_rate": 5.9480290137688626e-06, + "loss": 0.287, + "step": 7253 + }, + { + "epoch": 0.46, + "grad_norm": 1.931397503932855, + "learning_rate": 5.94702899245772e-06, + "loss": 0.3006, + "step": 7254 + }, + { + "epoch": 0.46, + "grad_norm": 1.8063217585051436, + "learning_rate": 5.94602893185266e-06, + "loss": 0.2959, + "step": 7255 + }, + { + "epoch": 0.46, + "grad_norm": 4.289267573423351, + "learning_rate": 5.94502883199518e-06, + "loss": 0.2917, + "step": 7256 + }, + { + "epoch": 0.46, + "grad_norm": 1.8022680327182692, + "learning_rate": 5.944028692926776e-06, + "loss": 0.3005, + "step": 7257 + }, + { + "epoch": 0.46, + "grad_norm": 29.957254472344218, + "learning_rate": 5.943028514688944e-06, + "loss": 0.3, + "step": 7258 + }, + { + "epoch": 0.46, + "grad_norm": 2.832096501221565, + "learning_rate": 5.942028297323183e-06, + "loss": 0.2954, + "step": 7259 + }, + { + "epoch": 0.46, + "grad_norm": 1.930825432132346, + "learning_rate": 5.941028040870997e-06, + "loss": 0.297, + "step": 7260 + }, + { + "epoch": 0.46, + "grad_norm": 3.0185155568750943, + "learning_rate": 5.940027745373884e-06, + "loss": 0.3066, + "step": 7261 + }, + { + "epoch": 0.46, + "grad_norm": 3.288879995901175, + "learning_rate": 5.9390274108733515e-06, + "loss": 0.2902, + "step": 7262 + }, + { + "epoch": 0.46, + "grad_norm": 2.686703416547137, + "learning_rate": 5.938027037410903e-06, + "loss": 0.2868, + "step": 7263 + }, + { + "epoch": 0.46, + "grad_norm": 2.1451843873584933, + "learning_rate": 5.937026625028047e-06, + "loss": 0.2829, + "step": 7264 + }, + { + "epoch": 0.46, + "grad_norm": 2.1755340959470746, + "learning_rate": 5.936026173766292e-06, + "loss": 0.318, + "step": 7265 + }, + { + "epoch": 0.46, + "grad_norm": 55.750355936623194, + "learning_rate": 5.9350256836671494e-06, + "loss": 0.29, + "step": 7266 + }, + { + "epoch": 0.46, + "grad_norm": 1.765183249115334, + "learning_rate": 5.934025154772128e-06, + "loss": 0.2789, + "step": 7267 + }, + { + "epoch": 0.46, + "grad_norm": 2.9681713972256576, + "learning_rate": 5.933024587122745e-06, + "loss": 0.3115, + "step": 7268 + }, + { + "epoch": 0.46, + "grad_norm": 2.1816787712382673, + "learning_rate": 5.932023980760515e-06, + "loss": 0.292, + "step": 7269 + }, + { + "epoch": 0.46, + "grad_norm": 1.6475029963947472, + "learning_rate": 5.931023335726957e-06, + "loss": 0.2901, + "step": 7270 + }, + { + "epoch": 0.46, + "grad_norm": 0.6838641190179063, + "learning_rate": 5.930022652063583e-06, + "loss": 0.5215, + "step": 7271 + }, + { + "epoch": 0.46, + "grad_norm": 2.5108281733733957, + "learning_rate": 5.929021929811919e-06, + "loss": 0.2849, + "step": 7272 + }, + { + "epoch": 0.46, + "grad_norm": 1.712095615158612, + "learning_rate": 5.928021169013485e-06, + "loss": 0.2846, + "step": 7273 + }, + { + "epoch": 0.46, + "grad_norm": 14.674838096644239, + "learning_rate": 5.927020369709805e-06, + "loss": 0.2866, + "step": 7274 + }, + { + "epoch": 0.46, + "grad_norm": 5.024535918173516, + "learning_rate": 5.926019531942401e-06, + "loss": 0.2937, + "step": 7275 + }, + { + "epoch": 0.46, + "grad_norm": 2.4882969756059152, + "learning_rate": 5.925018655752804e-06, + "loss": 0.3078, + "step": 7276 + }, + { + "epoch": 0.46, + "grad_norm": 1.561645532711883, + "learning_rate": 5.924017741182539e-06, + "loss": 0.3147, + "step": 7277 + }, + { + "epoch": 0.46, + "grad_norm": 3.6572663485292467, + "learning_rate": 5.923016788273138e-06, + "loss": 0.2933, + "step": 7278 + }, + { + "epoch": 0.46, + "grad_norm": 2.308643302273684, + "learning_rate": 5.922015797066129e-06, + "loss": 0.2912, + "step": 7279 + }, + { + "epoch": 0.46, + "grad_norm": 4.030645572286068, + "learning_rate": 5.921014767603048e-06, + "loss": 0.3169, + "step": 7280 + }, + { + "epoch": 0.46, + "grad_norm": 1.39587674260693, + "learning_rate": 5.920013699925429e-06, + "loss": 0.295, + "step": 7281 + }, + { + "epoch": 0.46, + "grad_norm": 112.0111833183104, + "learning_rate": 5.919012594074807e-06, + "loss": 0.2982, + "step": 7282 + }, + { + "epoch": 0.46, + "grad_norm": 2.6589209320407328, + "learning_rate": 5.9180114500927185e-06, + "loss": 0.2857, + "step": 7283 + }, + { + "epoch": 0.46, + "grad_norm": 2.2727747773501745, + "learning_rate": 5.917010268020707e-06, + "loss": 0.2886, + "step": 7284 + }, + { + "epoch": 0.46, + "grad_norm": 1.849798585055369, + "learning_rate": 5.91600904790031e-06, + "loss": 0.2879, + "step": 7285 + }, + { + "epoch": 0.46, + "grad_norm": 2.9573511836423125, + "learning_rate": 5.915007789773069e-06, + "loss": 0.3066, + "step": 7286 + }, + { + "epoch": 0.46, + "grad_norm": 1.8373904254532667, + "learning_rate": 5.914006493680531e-06, + "loss": 0.2846, + "step": 7287 + }, + { + "epoch": 0.46, + "grad_norm": 1.30294072982048, + "learning_rate": 5.9130051596642404e-06, + "loss": 0.2938, + "step": 7288 + }, + { + "epoch": 0.46, + "grad_norm": 3.657875267381038, + "learning_rate": 5.912003787765743e-06, + "loss": 0.2852, + "step": 7289 + }, + { + "epoch": 0.46, + "grad_norm": 3.433293948338587, + "learning_rate": 5.911002378026588e-06, + "loss": 0.2856, + "step": 7290 + }, + { + "epoch": 0.46, + "grad_norm": 3.630769748538676, + "learning_rate": 5.910000930488326e-06, + "loss": 0.2885, + "step": 7291 + }, + { + "epoch": 0.46, + "grad_norm": 1.8859035035346345, + "learning_rate": 5.908999445192513e-06, + "loss": 0.2901, + "step": 7292 + }, + { + "epoch": 0.46, + "grad_norm": 2.250733228068876, + "learning_rate": 5.907997922180695e-06, + "loss": 0.2854, + "step": 7293 + }, + { + "epoch": 0.46, + "grad_norm": 2.0362654451623787, + "learning_rate": 5.90699636149443e-06, + "loss": 0.3085, + "step": 7294 + }, + { + "epoch": 0.46, + "grad_norm": 2.9214597344762963, + "learning_rate": 5.905994763175275e-06, + "loss": 0.2974, + "step": 7295 + }, + { + "epoch": 0.46, + "grad_norm": 1.5592251145918676, + "learning_rate": 5.9049931272647905e-06, + "loss": 0.2989, + "step": 7296 + }, + { + "epoch": 0.46, + "grad_norm": 2.145934769889822, + "learning_rate": 5.903991453804532e-06, + "loss": 0.304, + "step": 7297 + }, + { + "epoch": 0.46, + "grad_norm": 0.7282478379612267, + "learning_rate": 5.9029897428360625e-06, + "loss": 0.4977, + "step": 7298 + }, + { + "epoch": 0.46, + "grad_norm": 3.846397462762834, + "learning_rate": 5.9019879944009454e-06, + "loss": 0.2994, + "step": 7299 + }, + { + "epoch": 0.46, + "grad_norm": 2.584589394237719, + "learning_rate": 5.900986208540744e-06, + "loss": 0.3091, + "step": 7300 + }, + { + "epoch": 0.46, + "grad_norm": 1.9359017420102511, + "learning_rate": 5.899984385297023e-06, + "loss": 0.2829, + "step": 7301 + }, + { + "epoch": 0.46, + "grad_norm": 3.3744193182835036, + "learning_rate": 5.898982524711353e-06, + "loss": 0.2976, + "step": 7302 + }, + { + "epoch": 0.46, + "grad_norm": 1.8582866800424578, + "learning_rate": 5.897980626825301e-06, + "loss": 0.2817, + "step": 7303 + }, + { + "epoch": 0.46, + "grad_norm": 1.2194878079083267, + "learning_rate": 5.896978691680438e-06, + "loss": 0.2885, + "step": 7304 + }, + { + "epoch": 0.46, + "grad_norm": 2.6776804598703174, + "learning_rate": 5.895976719318335e-06, + "loss": 0.2838, + "step": 7305 + }, + { + "epoch": 0.46, + "grad_norm": 0.5756283658154336, + "learning_rate": 5.894974709780568e-06, + "loss": 0.4982, + "step": 7306 + }, + { + "epoch": 0.46, + "grad_norm": 2.1569390316349253, + "learning_rate": 5.893972663108709e-06, + "loss": 0.286, + "step": 7307 + }, + { + "epoch": 0.46, + "grad_norm": 1.669584606805467, + "learning_rate": 5.892970579344337e-06, + "loss": 0.2945, + "step": 7308 + }, + { + "epoch": 0.46, + "grad_norm": 1.7325601149234242, + "learning_rate": 5.891968458529028e-06, + "loss": 0.3276, + "step": 7309 + }, + { + "epoch": 0.46, + "grad_norm": 1.3263148341652418, + "learning_rate": 5.890966300704366e-06, + "loss": 0.293, + "step": 7310 + }, + { + "epoch": 0.46, + "grad_norm": 2.6667154132344058, + "learning_rate": 5.8899641059119284e-06, + "loss": 0.2831, + "step": 7311 + }, + { + "epoch": 0.46, + "grad_norm": 1.8787594973105606, + "learning_rate": 5.8889618741933e-06, + "loss": 0.3063, + "step": 7312 + }, + { + "epoch": 0.46, + "grad_norm": 2.9704987007244803, + "learning_rate": 5.887959605590062e-06, + "loss": 0.274, + "step": 7313 + }, + { + "epoch": 0.46, + "grad_norm": 2.0981803359192868, + "learning_rate": 5.886957300143806e-06, + "loss": 0.2906, + "step": 7314 + }, + { + "epoch": 0.46, + "grad_norm": 6.1888955643199, + "learning_rate": 5.885954957896115e-06, + "loss": 0.2862, + "step": 7315 + }, + { + "epoch": 0.46, + "grad_norm": 2.018121329207774, + "learning_rate": 5.884952578888578e-06, + "loss": 0.2933, + "step": 7316 + }, + { + "epoch": 0.46, + "grad_norm": 1.7170643080728223, + "learning_rate": 5.883950163162788e-06, + "loss": 0.3004, + "step": 7317 + }, + { + "epoch": 0.46, + "grad_norm": 18.44992481760076, + "learning_rate": 5.882947710760336e-06, + "loss": 0.3015, + "step": 7318 + }, + { + "epoch": 0.46, + "grad_norm": 1.7227972049219455, + "learning_rate": 5.881945221722815e-06, + "loss": 0.2998, + "step": 7319 + }, + { + "epoch": 0.46, + "grad_norm": 1.4908067710610826, + "learning_rate": 5.880942696091818e-06, + "loss": 0.2789, + "step": 7320 + }, + { + "epoch": 0.46, + "grad_norm": 1.8873695098000953, + "learning_rate": 5.879940133908946e-06, + "loss": 0.3, + "step": 7321 + }, + { + "epoch": 0.46, + "grad_norm": 2.2268818679339066, + "learning_rate": 5.878937535215795e-06, + "loss": 0.3084, + "step": 7322 + }, + { + "epoch": 0.46, + "grad_norm": 2.069477399588808, + "learning_rate": 5.877934900053963e-06, + "loss": 0.2885, + "step": 7323 + }, + { + "epoch": 0.46, + "grad_norm": 3.766883386039501, + "learning_rate": 5.876932228465054e-06, + "loss": 0.2943, + "step": 7324 + }, + { + "epoch": 0.46, + "grad_norm": 1.5011298653926668, + "learning_rate": 5.875929520490669e-06, + "loss": 0.2968, + "step": 7325 + }, + { + "epoch": 0.46, + "grad_norm": 1.8427409423855354, + "learning_rate": 5.874926776172413e-06, + "loss": 0.2957, + "step": 7326 + }, + { + "epoch": 0.46, + "grad_norm": 1.78595019446095, + "learning_rate": 5.873923995551888e-06, + "loss": 0.2875, + "step": 7327 + }, + { + "epoch": 0.46, + "grad_norm": 1.9306178224589718, + "learning_rate": 5.8729211786707075e-06, + "loss": 0.3156, + "step": 7328 + }, + { + "epoch": 0.46, + "grad_norm": 1.985264088394038, + "learning_rate": 5.871918325570475e-06, + "loss": 0.3139, + "step": 7329 + }, + { + "epoch": 0.46, + "grad_norm": 2.2983958776720996, + "learning_rate": 5.8709154362928045e-06, + "loss": 0.2781, + "step": 7330 + }, + { + "epoch": 0.46, + "grad_norm": 3.5459953800347166, + "learning_rate": 5.869912510879303e-06, + "loss": 0.3131, + "step": 7331 + }, + { + "epoch": 0.46, + "grad_norm": 12.871828278846024, + "learning_rate": 5.868909549371588e-06, + "loss": 0.298, + "step": 7332 + }, + { + "epoch": 0.46, + "grad_norm": 2.6170950853305737, + "learning_rate": 5.867906551811271e-06, + "loss": 0.2908, + "step": 7333 + }, + { + "epoch": 0.46, + "grad_norm": 6.107788983459267, + "learning_rate": 5.866903518239973e-06, + "loss": 0.284, + "step": 7334 + }, + { + "epoch": 0.46, + "grad_norm": 4.4097658497869885, + "learning_rate": 5.865900448699304e-06, + "loss": 0.299, + "step": 7335 + }, + { + "epoch": 0.46, + "grad_norm": 1.8048049716625916, + "learning_rate": 5.864897343230889e-06, + "loss": 0.2967, + "step": 7336 + }, + { + "epoch": 0.46, + "grad_norm": 1.9191555674538232, + "learning_rate": 5.8638942018763466e-06, + "loss": 0.2897, + "step": 7337 + }, + { + "epoch": 0.46, + "grad_norm": 1.9404756393540705, + "learning_rate": 5.862891024677299e-06, + "loss": 0.2794, + "step": 7338 + }, + { + "epoch": 0.46, + "grad_norm": 2.765853922751242, + "learning_rate": 5.861887811675372e-06, + "loss": 0.3067, + "step": 7339 + }, + { + "epoch": 0.46, + "grad_norm": 2.167100232172597, + "learning_rate": 5.860884562912188e-06, + "loss": 0.3097, + "step": 7340 + }, + { + "epoch": 0.46, + "grad_norm": 1.9071164601550794, + "learning_rate": 5.859881278429374e-06, + "loss": 0.2868, + "step": 7341 + }, + { + "epoch": 0.46, + "grad_norm": 1.6268019484936718, + "learning_rate": 5.858877958268558e-06, + "loss": 0.2861, + "step": 7342 + }, + { + "epoch": 0.46, + "grad_norm": 2.7383118914966724, + "learning_rate": 5.857874602471371e-06, + "loss": 0.2839, + "step": 7343 + }, + { + "epoch": 0.46, + "grad_norm": 2.3895124538247345, + "learning_rate": 5.856871211079444e-06, + "loss": 0.2999, + "step": 7344 + }, + { + "epoch": 0.46, + "grad_norm": 2.9410957812527907, + "learning_rate": 5.855867784134406e-06, + "loss": 0.3021, + "step": 7345 + }, + { + "epoch": 0.46, + "grad_norm": 1.4242961577547777, + "learning_rate": 5.854864321677894e-06, + "loss": 0.2935, + "step": 7346 + }, + { + "epoch": 0.46, + "grad_norm": 0.6225016487997318, + "learning_rate": 5.853860823751543e-06, + "loss": 0.4978, + "step": 7347 + }, + { + "epoch": 0.46, + "grad_norm": 1.6650591672455808, + "learning_rate": 5.852857290396992e-06, + "loss": 0.2858, + "step": 7348 + }, + { + "epoch": 0.46, + "grad_norm": 2.6397770623242103, + "learning_rate": 5.8518537216558745e-06, + "loss": 0.2893, + "step": 7349 + }, + { + "epoch": 0.46, + "grad_norm": 1.9201914622817458, + "learning_rate": 5.850850117569834e-06, + "loss": 0.2778, + "step": 7350 + }, + { + "epoch": 0.46, + "grad_norm": 2.6469677312924365, + "learning_rate": 5.84984647818051e-06, + "loss": 0.3017, + "step": 7351 + }, + { + "epoch": 0.46, + "grad_norm": 3.3510291544324198, + "learning_rate": 5.848842803529547e-06, + "loss": 0.2821, + "step": 7352 + }, + { + "epoch": 0.46, + "grad_norm": 2.9665345362927438, + "learning_rate": 5.847839093658587e-06, + "loss": 0.2919, + "step": 7353 + }, + { + "epoch": 0.46, + "grad_norm": 1.954562981846482, + "learning_rate": 5.84683534860928e-06, + "loss": 0.2845, + "step": 7354 + }, + { + "epoch": 0.46, + "grad_norm": 1.8956564872328672, + "learning_rate": 5.8458315684232685e-06, + "loss": 0.2717, + "step": 7355 + }, + { + "epoch": 0.46, + "grad_norm": 1.248810262332383, + "learning_rate": 5.844827753142203e-06, + "loss": 0.2868, + "step": 7356 + }, + { + "epoch": 0.46, + "grad_norm": 3.113520115498332, + "learning_rate": 5.843823902807733e-06, + "loss": 0.2999, + "step": 7357 + }, + { + "epoch": 0.46, + "grad_norm": 1.6703145513935096, + "learning_rate": 5.84282001746151e-06, + "loss": 0.297, + "step": 7358 + }, + { + "epoch": 0.46, + "grad_norm": 1.8700202881329098, + "learning_rate": 5.841816097145189e-06, + "loss": 0.3041, + "step": 7359 + }, + { + "epoch": 0.46, + "grad_norm": 2.0327005899982455, + "learning_rate": 5.840812141900423e-06, + "loss": 0.3033, + "step": 7360 + }, + { + "epoch": 0.46, + "grad_norm": 1.9215461324762908, + "learning_rate": 5.839808151768865e-06, + "loss": 0.2884, + "step": 7361 + }, + { + "epoch": 0.46, + "grad_norm": 1.4390662266099223, + "learning_rate": 5.838804126792178e-06, + "loss": 0.2844, + "step": 7362 + }, + { + "epoch": 0.46, + "grad_norm": 4.799526167333, + "learning_rate": 5.837800067012016e-06, + "loss": 0.2845, + "step": 7363 + }, + { + "epoch": 0.46, + "grad_norm": 2.447246239952149, + "learning_rate": 5.836795972470041e-06, + "loss": 0.3081, + "step": 7364 + }, + { + "epoch": 0.46, + "grad_norm": 4.7774009755870805, + "learning_rate": 5.835791843207916e-06, + "loss": 0.3062, + "step": 7365 + }, + { + "epoch": 0.46, + "grad_norm": 1.6124144871826007, + "learning_rate": 5.8347876792673044e-06, + "loss": 0.2904, + "step": 7366 + }, + { + "epoch": 0.46, + "grad_norm": 4.649169209059497, + "learning_rate": 5.833783480689868e-06, + "loss": 0.3021, + "step": 7367 + }, + { + "epoch": 0.46, + "grad_norm": 1.4987322099032652, + "learning_rate": 5.832779247517273e-06, + "loss": 0.2963, + "step": 7368 + }, + { + "epoch": 0.46, + "grad_norm": 3.440929233885759, + "learning_rate": 5.831774979791188e-06, + "loss": 0.3098, + "step": 7369 + }, + { + "epoch": 0.46, + "grad_norm": 4.051398084478484, + "learning_rate": 5.830770677553282e-06, + "loss": 0.2834, + "step": 7370 + }, + { + "epoch": 0.46, + "grad_norm": 3.708495911664979, + "learning_rate": 5.829766340845225e-06, + "loss": 0.2849, + "step": 7371 + }, + { + "epoch": 0.46, + "grad_norm": 4.400937708458666, + "learning_rate": 5.828761969708689e-06, + "loss": 0.3028, + "step": 7372 + }, + { + "epoch": 0.46, + "grad_norm": 3.098191143314644, + "learning_rate": 5.827757564185347e-06, + "loss": 0.3081, + "step": 7373 + }, + { + "epoch": 0.46, + "grad_norm": 1.5746115675118102, + "learning_rate": 5.826753124316873e-06, + "loss": 0.2972, + "step": 7374 + }, + { + "epoch": 0.46, + "grad_norm": 3.5442040364507443, + "learning_rate": 5.8257486501449435e-06, + "loss": 0.2807, + "step": 7375 + }, + { + "epoch": 0.46, + "grad_norm": 1.991918654063862, + "learning_rate": 5.824744141711235e-06, + "loss": 0.315, + "step": 7376 + }, + { + "epoch": 0.46, + "grad_norm": 1.6218451131737361, + "learning_rate": 5.823739599057427e-06, + "loss": 0.2823, + "step": 7377 + }, + { + "epoch": 0.46, + "grad_norm": 6.2641020700786445, + "learning_rate": 5.822735022225202e-06, + "loss": 0.3095, + "step": 7378 + }, + { + "epoch": 0.46, + "grad_norm": 2.2021640666825215, + "learning_rate": 5.821730411256238e-06, + "loss": 0.2959, + "step": 7379 + }, + { + "epoch": 0.46, + "grad_norm": 2.7549027559276418, + "learning_rate": 5.82072576619222e-06, + "loss": 0.3049, + "step": 7380 + }, + { + "epoch": 0.46, + "grad_norm": 2.3850711295652784, + "learning_rate": 5.819721087074832e-06, + "loss": 0.3245, + "step": 7381 + }, + { + "epoch": 0.46, + "grad_norm": 3.6691598483890555, + "learning_rate": 5.818716373945762e-06, + "loss": 0.3157, + "step": 7382 + }, + { + "epoch": 0.46, + "grad_norm": 4.657245966556539, + "learning_rate": 5.817711626846691e-06, + "loss": 0.3141, + "step": 7383 + }, + { + "epoch": 0.46, + "grad_norm": 1.68096296240861, + "learning_rate": 5.816706845819316e-06, + "loss": 0.2855, + "step": 7384 + }, + { + "epoch": 0.46, + "grad_norm": 1.8480071058746694, + "learning_rate": 5.815702030905322e-06, + "loss": 0.2913, + "step": 7385 + }, + { + "epoch": 0.46, + "grad_norm": 2.3705701435846134, + "learning_rate": 5.8146971821464034e-06, + "loss": 0.2957, + "step": 7386 + }, + { + "epoch": 0.46, + "grad_norm": 3.711514997249587, + "learning_rate": 5.8136922995842495e-06, + "loss": 0.2887, + "step": 7387 + }, + { + "epoch": 0.46, + "grad_norm": 1.5839163728817727, + "learning_rate": 5.812687383260558e-06, + "loss": 0.2914, + "step": 7388 + }, + { + "epoch": 0.46, + "grad_norm": 1.7639738555541817, + "learning_rate": 5.811682433217023e-06, + "loss": 0.3118, + "step": 7389 + }, + { + "epoch": 0.46, + "grad_norm": 2.8906218401729564, + "learning_rate": 5.810677449495343e-06, + "loss": 0.3015, + "step": 7390 + }, + { + "epoch": 0.46, + "grad_norm": 2.83842369457081, + "learning_rate": 5.809672432137215e-06, + "loss": 0.2835, + "step": 7391 + }, + { + "epoch": 0.46, + "grad_norm": 1.47335619090344, + "learning_rate": 5.80866738118434e-06, + "loss": 0.2795, + "step": 7392 + }, + { + "epoch": 0.46, + "grad_norm": 3.863035727645143, + "learning_rate": 5.807662296678418e-06, + "loss": 0.3085, + "step": 7393 + }, + { + "epoch": 0.47, + "grad_norm": 1.350936176999657, + "learning_rate": 5.806657178661153e-06, + "loss": 0.2814, + "step": 7394 + }, + { + "epoch": 0.47, + "grad_norm": 2.674197133155234, + "learning_rate": 5.805652027174249e-06, + "loss": 0.3206, + "step": 7395 + }, + { + "epoch": 0.47, + "grad_norm": 1.9593121527421304, + "learning_rate": 5.804646842259413e-06, + "loss": 0.3084, + "step": 7396 + }, + { + "epoch": 0.47, + "grad_norm": 3.0677949674119387, + "learning_rate": 5.803641623958348e-06, + "loss": 0.2954, + "step": 7397 + }, + { + "epoch": 0.47, + "grad_norm": 2.52424879535754, + "learning_rate": 5.802636372312766e-06, + "loss": 0.2749, + "step": 7398 + }, + { + "epoch": 0.47, + "grad_norm": 2.1236913832671624, + "learning_rate": 5.801631087364374e-06, + "loss": 0.3015, + "step": 7399 + }, + { + "epoch": 0.47, + "grad_norm": 1.5553322808566252, + "learning_rate": 5.8006257691548865e-06, + "loss": 0.2987, + "step": 7400 + }, + { + "epoch": 0.47, + "grad_norm": 0.5785389856886474, + "learning_rate": 5.799620417726012e-06, + "loss": 0.463, + "step": 7401 + }, + { + "epoch": 0.47, + "grad_norm": 1.4850186335183841, + "learning_rate": 5.798615033119466e-06, + "loss": 0.2934, + "step": 7402 + }, + { + "epoch": 0.47, + "grad_norm": 25.044942457349375, + "learning_rate": 5.797609615376964e-06, + "loss": 0.2855, + "step": 7403 + }, + { + "epoch": 0.47, + "grad_norm": 3.0788518723451155, + "learning_rate": 5.796604164540224e-06, + "loss": 0.2762, + "step": 7404 + }, + { + "epoch": 0.47, + "grad_norm": 1.9639242140463364, + "learning_rate": 5.795598680650959e-06, + "loss": 0.2846, + "step": 7405 + }, + { + "epoch": 0.47, + "grad_norm": 4.202881432684013, + "learning_rate": 5.794593163750894e-06, + "loss": 0.2703, + "step": 7406 + }, + { + "epoch": 0.47, + "grad_norm": 7.61043453216115, + "learning_rate": 5.793587613881747e-06, + "loss": 0.3026, + "step": 7407 + }, + { + "epoch": 0.47, + "grad_norm": 2.821645440211319, + "learning_rate": 5.792582031085241e-06, + "loss": 0.2941, + "step": 7408 + }, + { + "epoch": 0.47, + "grad_norm": 3.1025830781435055, + "learning_rate": 5.791576415403097e-06, + "loss": 0.3294, + "step": 7409 + }, + { + "epoch": 0.47, + "grad_norm": 2.43317684151675, + "learning_rate": 5.790570766877043e-06, + "loss": 0.296, + "step": 7410 + }, + { + "epoch": 0.47, + "grad_norm": 2.0351764553562695, + "learning_rate": 5.789565085548803e-06, + "loss": 0.2845, + "step": 7411 + }, + { + "epoch": 0.47, + "grad_norm": 1.6329261443402958, + "learning_rate": 5.788559371460107e-06, + "loss": 0.2876, + "step": 7412 + }, + { + "epoch": 0.47, + "grad_norm": 2.1815197784932674, + "learning_rate": 5.787553624652678e-06, + "loss": 0.2648, + "step": 7413 + }, + { + "epoch": 0.47, + "grad_norm": 1.672178902243905, + "learning_rate": 5.786547845168253e-06, + "loss": 0.2965, + "step": 7414 + }, + { + "epoch": 0.47, + "grad_norm": 2.781489413288844, + "learning_rate": 5.78554203304856e-06, + "loss": 0.2973, + "step": 7415 + }, + { + "epoch": 0.47, + "grad_norm": 1.906575158794279, + "learning_rate": 5.784536188335334e-06, + "loss": 0.2977, + "step": 7416 + }, + { + "epoch": 0.47, + "grad_norm": 27.952144720770548, + "learning_rate": 5.783530311070306e-06, + "loss": 0.3185, + "step": 7417 + }, + { + "epoch": 0.47, + "grad_norm": 3.3791989634924233, + "learning_rate": 5.782524401295216e-06, + "loss": 0.2817, + "step": 7418 + }, + { + "epoch": 0.47, + "grad_norm": 1.6515979396728617, + "learning_rate": 5.781518459051797e-06, + "loss": 0.2805, + "step": 7419 + }, + { + "epoch": 0.47, + "grad_norm": 1.8358959595852473, + "learning_rate": 5.78051248438179e-06, + "loss": 0.2967, + "step": 7420 + }, + { + "epoch": 0.47, + "grad_norm": 2.4987211405224334, + "learning_rate": 5.779506477326933e-06, + "loss": 0.3037, + "step": 7421 + }, + { + "epoch": 0.47, + "grad_norm": 1.9785200233494902, + "learning_rate": 5.778500437928969e-06, + "loss": 0.3012, + "step": 7422 + }, + { + "epoch": 0.47, + "grad_norm": 2.1999170666941543, + "learning_rate": 5.777494366229637e-06, + "loss": 0.3118, + "step": 7423 + }, + { + "epoch": 0.47, + "grad_norm": 2.253242248500906, + "learning_rate": 5.7764882622706834e-06, + "loss": 0.296, + "step": 7424 + }, + { + "epoch": 0.47, + "grad_norm": 2.1496977476439847, + "learning_rate": 5.7754821260938536e-06, + "loss": 0.2802, + "step": 7425 + }, + { + "epoch": 0.47, + "grad_norm": 2.083243302988889, + "learning_rate": 5.774475957740892e-06, + "loss": 0.293, + "step": 7426 + }, + { + "epoch": 0.47, + "grad_norm": 2.395467071501667, + "learning_rate": 5.7734697572535485e-06, + "loss": 0.2744, + "step": 7427 + }, + { + "epoch": 0.47, + "grad_norm": 2.17167924037797, + "learning_rate": 5.7724635246735695e-06, + "loss": 0.2966, + "step": 7428 + }, + { + "epoch": 0.47, + "grad_norm": 1.6577735965038014, + "learning_rate": 5.771457260042707e-06, + "loss": 0.2826, + "step": 7429 + }, + { + "epoch": 0.47, + "grad_norm": 2.0480434049578315, + "learning_rate": 5.770450963402714e-06, + "loss": 0.2939, + "step": 7430 + }, + { + "epoch": 0.47, + "grad_norm": 1.797695230901797, + "learning_rate": 5.76944463479534e-06, + "loss": 0.2842, + "step": 7431 + }, + { + "epoch": 0.47, + "grad_norm": 4.2435853244238055, + "learning_rate": 5.7684382742623425e-06, + "loss": 0.3024, + "step": 7432 + }, + { + "epoch": 0.47, + "grad_norm": 1.480875688589397, + "learning_rate": 5.767431881845475e-06, + "loss": 0.2863, + "step": 7433 + }, + { + "epoch": 0.47, + "grad_norm": 2.460237068746604, + "learning_rate": 5.766425457586497e-06, + "loss": 0.2841, + "step": 7434 + }, + { + "epoch": 0.47, + "grad_norm": 5.2983250323644855, + "learning_rate": 5.765419001527165e-06, + "loss": 0.3179, + "step": 7435 + }, + { + "epoch": 0.47, + "grad_norm": 1.2555286612370506, + "learning_rate": 5.7644125137092395e-06, + "loss": 0.2768, + "step": 7436 + }, + { + "epoch": 0.47, + "grad_norm": 1.7879728690741918, + "learning_rate": 5.76340599417448e-06, + "loss": 0.3091, + "step": 7437 + }, + { + "epoch": 0.47, + "grad_norm": 1.794846458366639, + "learning_rate": 5.762399442964652e-06, + "loss": 0.299, + "step": 7438 + }, + { + "epoch": 0.47, + "grad_norm": 4.466717185262108, + "learning_rate": 5.761392860121514e-06, + "loss": 0.3052, + "step": 7439 + }, + { + "epoch": 0.47, + "grad_norm": 13.508990408891954, + "learning_rate": 5.760386245686836e-06, + "loss": 0.3105, + "step": 7440 + }, + { + "epoch": 0.47, + "grad_norm": 2.176518988435943, + "learning_rate": 5.759379599702381e-06, + "loss": 0.2905, + "step": 7441 + }, + { + "epoch": 0.47, + "grad_norm": 5.571342908592038, + "learning_rate": 5.758372922209918e-06, + "loss": 0.2919, + "step": 7442 + }, + { + "epoch": 0.47, + "grad_norm": 4.12035874006493, + "learning_rate": 5.757366213251215e-06, + "loss": 0.2881, + "step": 7443 + }, + { + "epoch": 0.47, + "grad_norm": 2.872116166511122, + "learning_rate": 5.756359472868044e-06, + "loss": 0.3009, + "step": 7444 + }, + { + "epoch": 0.47, + "grad_norm": 2.7007860292818515, + "learning_rate": 5.755352701102174e-06, + "loss": 0.2931, + "step": 7445 + }, + { + "epoch": 0.47, + "grad_norm": 1.6324341594720326, + "learning_rate": 5.754345897995378e-06, + "loss": 0.2915, + "step": 7446 + }, + { + "epoch": 0.47, + "grad_norm": 0.6107260486933718, + "learning_rate": 5.753339063589431e-06, + "loss": 0.4902, + "step": 7447 + }, + { + "epoch": 0.47, + "grad_norm": 3.7491841544424926, + "learning_rate": 5.752332197926109e-06, + "loss": 0.2907, + "step": 7448 + }, + { + "epoch": 0.47, + "grad_norm": 1.8626797410832066, + "learning_rate": 5.751325301047188e-06, + "loss": 0.3146, + "step": 7449 + }, + { + "epoch": 0.47, + "grad_norm": 2.509480948330613, + "learning_rate": 5.7503183729944454e-06, + "loss": 0.2746, + "step": 7450 + }, + { + "epoch": 0.47, + "grad_norm": 2.049359590867218, + "learning_rate": 5.749311413809661e-06, + "loss": 0.2858, + "step": 7451 + }, + { + "epoch": 0.47, + "grad_norm": 2.04012364496551, + "learning_rate": 5.748304423534615e-06, + "loss": 0.2673, + "step": 7452 + }, + { + "epoch": 0.47, + "grad_norm": 5.413054241860235, + "learning_rate": 5.74729740221109e-06, + "loss": 0.296, + "step": 7453 + }, + { + "epoch": 0.47, + "grad_norm": 1.3595975797556992, + "learning_rate": 5.746290349880867e-06, + "loss": 0.2916, + "step": 7454 + }, + { + "epoch": 0.47, + "grad_norm": 2.6144008908891876, + "learning_rate": 5.745283266585732e-06, + "loss": 0.2895, + "step": 7455 + }, + { + "epoch": 0.47, + "grad_norm": 3.030825353977351, + "learning_rate": 5.744276152367472e-06, + "loss": 0.2749, + "step": 7456 + }, + { + "epoch": 0.47, + "grad_norm": 2.3108544995994253, + "learning_rate": 5.7432690072678696e-06, + "loss": 0.2952, + "step": 7457 + }, + { + "epoch": 0.47, + "grad_norm": 3.0277926170135334, + "learning_rate": 5.7422618313287184e-06, + "loss": 0.3021, + "step": 7458 + }, + { + "epoch": 0.47, + "grad_norm": 2.9805852476289116, + "learning_rate": 5.741254624591804e-06, + "loss": 0.2806, + "step": 7459 + }, + { + "epoch": 0.47, + "grad_norm": 2.9802090010372773, + "learning_rate": 5.7402473870989205e-06, + "loss": 0.3016, + "step": 7460 + }, + { + "epoch": 0.47, + "grad_norm": 2.6529213476321996, + "learning_rate": 5.7392401188918555e-06, + "loss": 0.3039, + "step": 7461 + }, + { + "epoch": 0.47, + "grad_norm": 1.7974486995779502, + "learning_rate": 5.738232820012407e-06, + "loss": 0.3013, + "step": 7462 + }, + { + "epoch": 0.47, + "grad_norm": 11.920592923579262, + "learning_rate": 5.737225490502366e-06, + "loss": 0.3029, + "step": 7463 + }, + { + "epoch": 0.47, + "grad_norm": 1.8315720248690885, + "learning_rate": 5.736218130403532e-06, + "loss": 0.298, + "step": 7464 + }, + { + "epoch": 0.47, + "grad_norm": 2.032439390079224, + "learning_rate": 5.735210739757697e-06, + "loss": 0.2819, + "step": 7465 + }, + { + "epoch": 0.47, + "grad_norm": 1.4071823116303686, + "learning_rate": 5.734203318606666e-06, + "loss": 0.2905, + "step": 7466 + }, + { + "epoch": 0.47, + "grad_norm": 5.486510636408435, + "learning_rate": 5.7331958669922335e-06, + "loss": 0.3097, + "step": 7467 + }, + { + "epoch": 0.47, + "grad_norm": 5.174635017024112, + "learning_rate": 5.732188384956203e-06, + "loss": 0.2864, + "step": 7468 + }, + { + "epoch": 0.47, + "grad_norm": 2.0793500591352876, + "learning_rate": 5.731180872540374e-06, + "loss": 0.2764, + "step": 7469 + }, + { + "epoch": 0.47, + "grad_norm": 3.267484630972985, + "learning_rate": 5.730173329786554e-06, + "loss": 0.2977, + "step": 7470 + }, + { + "epoch": 0.47, + "grad_norm": 2.2034813592776934, + "learning_rate": 5.729165756736544e-06, + "loss": 0.3035, + "step": 7471 + }, + { + "epoch": 0.47, + "grad_norm": 1.4909879137858109, + "learning_rate": 5.728158153432153e-06, + "loss": 0.2937, + "step": 7472 + }, + { + "epoch": 0.47, + "grad_norm": 1.9755301438766115, + "learning_rate": 5.727150519915186e-06, + "loss": 0.2899, + "step": 7473 + }, + { + "epoch": 0.47, + "grad_norm": 2.884894116487073, + "learning_rate": 5.726142856227453e-06, + "loss": 0.2662, + "step": 7474 + }, + { + "epoch": 0.47, + "grad_norm": 2.4461228179609695, + "learning_rate": 5.725135162410762e-06, + "loss": 0.2871, + "step": 7475 + }, + { + "epoch": 0.47, + "grad_norm": 12.404019744402751, + "learning_rate": 5.724127438506925e-06, + "loss": 0.2991, + "step": 7476 + }, + { + "epoch": 0.47, + "grad_norm": 2.1658715308810628, + "learning_rate": 5.723119684557755e-06, + "loss": 0.2907, + "step": 7477 + }, + { + "epoch": 0.47, + "grad_norm": 1.5756943612655252, + "learning_rate": 5.722111900605066e-06, + "loss": 0.2918, + "step": 7478 + }, + { + "epoch": 0.47, + "grad_norm": 3.070764156037807, + "learning_rate": 5.721104086690671e-06, + "loss": 0.2856, + "step": 7479 + }, + { + "epoch": 0.47, + "grad_norm": 2.3053552661157948, + "learning_rate": 5.720096242856385e-06, + "loss": 0.2934, + "step": 7480 + }, + { + "epoch": 0.47, + "grad_norm": 2.739945420154742, + "learning_rate": 5.719088369144028e-06, + "loss": 0.2909, + "step": 7481 + }, + { + "epoch": 0.47, + "grad_norm": 2.5469861022930633, + "learning_rate": 5.718080465595419e-06, + "loss": 0.2824, + "step": 7482 + }, + { + "epoch": 0.47, + "grad_norm": 4.480193219958061, + "learning_rate": 5.717072532252373e-06, + "loss": 0.2886, + "step": 7483 + }, + { + "epoch": 0.47, + "grad_norm": 3.8616266062916327, + "learning_rate": 5.716064569156716e-06, + "loss": 0.2935, + "step": 7484 + }, + { + "epoch": 0.47, + "grad_norm": 2.953623144323701, + "learning_rate": 5.715056576350267e-06, + "loss": 0.2921, + "step": 7485 + }, + { + "epoch": 0.47, + "grad_norm": 3.594419332724903, + "learning_rate": 5.714048553874852e-06, + "loss": 0.2933, + "step": 7486 + }, + { + "epoch": 0.47, + "grad_norm": 3.2998054298609465, + "learning_rate": 5.713040501772292e-06, + "loss": 0.3098, + "step": 7487 + }, + { + "epoch": 0.47, + "grad_norm": 5.61312532244727, + "learning_rate": 5.712032420084418e-06, + "loss": 0.3033, + "step": 7488 + }, + { + "epoch": 0.47, + "grad_norm": 5.946469730706228, + "learning_rate": 5.7110243088530525e-06, + "loss": 0.3049, + "step": 7489 + }, + { + "epoch": 0.47, + "grad_norm": 1.989729102428159, + "learning_rate": 5.710016168120026e-06, + "loss": 0.2912, + "step": 7490 + }, + { + "epoch": 0.47, + "grad_norm": 2.582846243896762, + "learning_rate": 5.7090079979271665e-06, + "loss": 0.291, + "step": 7491 + }, + { + "epoch": 0.47, + "grad_norm": 55.800817128984455, + "learning_rate": 5.707999798316307e-06, + "loss": 0.284, + "step": 7492 + }, + { + "epoch": 0.47, + "grad_norm": 5.815348963611848, + "learning_rate": 5.706991569329277e-06, + "loss": 0.2922, + "step": 7493 + }, + { + "epoch": 0.47, + "grad_norm": 3.4766139781374577, + "learning_rate": 5.705983311007913e-06, + "loss": 0.2965, + "step": 7494 + }, + { + "epoch": 0.47, + "grad_norm": 2.246352461340009, + "learning_rate": 5.704975023394045e-06, + "loss": 0.2933, + "step": 7495 + }, + { + "epoch": 0.47, + "grad_norm": 4.385660925283995, + "learning_rate": 5.703966706529513e-06, + "loss": 0.2883, + "step": 7496 + }, + { + "epoch": 0.47, + "grad_norm": 2.116793870423358, + "learning_rate": 5.702958360456151e-06, + "loss": 0.3052, + "step": 7497 + }, + { + "epoch": 0.47, + "grad_norm": 3.2875175653246824, + "learning_rate": 5.7019499852158e-06, + "loss": 0.2993, + "step": 7498 + }, + { + "epoch": 0.47, + "grad_norm": 2.3467825549517634, + "learning_rate": 5.700941580850294e-06, + "loss": 0.2891, + "step": 7499 + }, + { + "epoch": 0.47, + "grad_norm": 2.0220070722338837, + "learning_rate": 5.6999331474014795e-06, + "loss": 0.2853, + "step": 7500 + }, + { + "epoch": 0.47, + "grad_norm": 1.889623982996533, + "learning_rate": 5.698924684911195e-06, + "loss": 0.2908, + "step": 7501 + }, + { + "epoch": 0.47, + "grad_norm": 7.640199975628539, + "learning_rate": 5.697916193421283e-06, + "loss": 0.2958, + "step": 7502 + }, + { + "epoch": 0.47, + "grad_norm": 2.216291310154457, + "learning_rate": 5.6969076729735885e-06, + "loss": 0.2744, + "step": 7503 + }, + { + "epoch": 0.47, + "grad_norm": 1.9070709643064623, + "learning_rate": 5.695899123609957e-06, + "loss": 0.2845, + "step": 7504 + }, + { + "epoch": 0.47, + "grad_norm": 2.766277469410023, + "learning_rate": 5.694890545372235e-06, + "loss": 0.3101, + "step": 7505 + }, + { + "epoch": 0.47, + "grad_norm": 0.6244973568012134, + "learning_rate": 5.693881938302271e-06, + "loss": 0.4962, + "step": 7506 + }, + { + "epoch": 0.47, + "grad_norm": 2.4504037855239074, + "learning_rate": 5.69287330244191e-06, + "loss": 0.2893, + "step": 7507 + }, + { + "epoch": 0.47, + "grad_norm": 2.361867015535825, + "learning_rate": 5.691864637833009e-06, + "loss": 0.3067, + "step": 7508 + }, + { + "epoch": 0.47, + "grad_norm": 2.129035141662319, + "learning_rate": 5.690855944517413e-06, + "loss": 0.3131, + "step": 7509 + }, + { + "epoch": 0.47, + "grad_norm": 2.733406459487125, + "learning_rate": 5.689847222536976e-06, + "loss": 0.3003, + "step": 7510 + }, + { + "epoch": 0.47, + "grad_norm": 2.0861672973751695, + "learning_rate": 5.688838471933553e-06, + "loss": 0.291, + "step": 7511 + }, + { + "epoch": 0.47, + "grad_norm": 1.5806383363046554, + "learning_rate": 5.687829692749e-06, + "loss": 0.3019, + "step": 7512 + }, + { + "epoch": 0.47, + "grad_norm": 3.7720927581792263, + "learning_rate": 5.686820885025169e-06, + "loss": 0.2696, + "step": 7513 + }, + { + "epoch": 0.47, + "grad_norm": 1.4731309256315446, + "learning_rate": 5.68581204880392e-06, + "loss": 0.2839, + "step": 7514 + }, + { + "epoch": 0.47, + "grad_norm": 1.3839875728516073, + "learning_rate": 5.684803184127112e-06, + "loss": 0.2824, + "step": 7515 + }, + { + "epoch": 0.47, + "grad_norm": 4.729233174779593, + "learning_rate": 5.683794291036604e-06, + "loss": 0.2875, + "step": 7516 + }, + { + "epoch": 0.47, + "grad_norm": 3.4638741674560807, + "learning_rate": 5.682785369574254e-06, + "loss": 0.2753, + "step": 7517 + }, + { + "epoch": 0.47, + "grad_norm": 7.180851821766722, + "learning_rate": 5.681776419781928e-06, + "loss": 0.2909, + "step": 7518 + }, + { + "epoch": 0.47, + "grad_norm": 2.1562037127667515, + "learning_rate": 5.680767441701487e-06, + "loss": 0.296, + "step": 7519 + }, + { + "epoch": 0.47, + "grad_norm": 2.203281795230169, + "learning_rate": 5.679758435374797e-06, + "loss": 0.2793, + "step": 7520 + }, + { + "epoch": 0.47, + "grad_norm": 4.809742360489713, + "learning_rate": 5.678749400843719e-06, + "loss": 0.3229, + "step": 7521 + }, + { + "epoch": 0.47, + "grad_norm": 3.5053153872792064, + "learning_rate": 5.677740338150126e-06, + "loss": 0.3021, + "step": 7522 + }, + { + "epoch": 0.47, + "grad_norm": 0.5869112896278917, + "learning_rate": 5.6767312473358805e-06, + "loss": 0.4689, + "step": 7523 + }, + { + "epoch": 0.47, + "grad_norm": 1.8921322492543409, + "learning_rate": 5.6757221284428554e-06, + "loss": 0.2785, + "step": 7524 + }, + { + "epoch": 0.47, + "grad_norm": 3.511930460075979, + "learning_rate": 5.674712981512915e-06, + "loss": 0.3081, + "step": 7525 + }, + { + "epoch": 0.47, + "grad_norm": 9.569747007160839, + "learning_rate": 5.673703806587939e-06, + "loss": 0.2707, + "step": 7526 + }, + { + "epoch": 0.47, + "grad_norm": 1.4256604965425432, + "learning_rate": 5.672694603709794e-06, + "loss": 0.2847, + "step": 7527 + }, + { + "epoch": 0.47, + "grad_norm": 2.0290628140660143, + "learning_rate": 5.671685372920355e-06, + "loss": 0.2814, + "step": 7528 + }, + { + "epoch": 0.47, + "grad_norm": 1.446966074724786, + "learning_rate": 5.670676114261495e-06, + "loss": 0.2957, + "step": 7529 + }, + { + "epoch": 0.47, + "grad_norm": 1.6950616372965643, + "learning_rate": 5.669666827775095e-06, + "loss": 0.2872, + "step": 7530 + }, + { + "epoch": 0.47, + "grad_norm": 2.926014532229951, + "learning_rate": 5.668657513503027e-06, + "loss": 0.3254, + "step": 7531 + }, + { + "epoch": 0.47, + "grad_norm": 0.600040825964026, + "learning_rate": 5.667648171487171e-06, + "loss": 0.5185, + "step": 7532 + }, + { + "epoch": 0.47, + "grad_norm": 0.6331571157681681, + "learning_rate": 5.666638801769406e-06, + "loss": 0.489, + "step": 7533 + }, + { + "epoch": 0.47, + "grad_norm": 2.9454720619669392, + "learning_rate": 5.665629404391615e-06, + "loss": 0.2997, + "step": 7534 + }, + { + "epoch": 0.47, + "grad_norm": 3.650075794061309, + "learning_rate": 5.664619979395676e-06, + "loss": 0.3163, + "step": 7535 + }, + { + "epoch": 0.47, + "grad_norm": 2.1008226354398274, + "learning_rate": 5.663610526823474e-06, + "loss": 0.2842, + "step": 7536 + }, + { + "epoch": 0.47, + "grad_norm": 2.0083496967367487, + "learning_rate": 5.662601046716893e-06, + "loss": 0.3236, + "step": 7537 + }, + { + "epoch": 0.47, + "grad_norm": 1.540398965207893, + "learning_rate": 5.661591539117818e-06, + "loss": 0.2709, + "step": 7538 + }, + { + "epoch": 0.47, + "grad_norm": 2.263167581705944, + "learning_rate": 5.660582004068134e-06, + "loss": 0.285, + "step": 7539 + }, + { + "epoch": 0.47, + "grad_norm": 2.470621667087632, + "learning_rate": 5.6595724416097285e-06, + "loss": 0.2997, + "step": 7540 + }, + { + "epoch": 0.47, + "grad_norm": 2.2905877109285977, + "learning_rate": 5.658562851784493e-06, + "loss": 0.3073, + "step": 7541 + }, + { + "epoch": 0.47, + "grad_norm": 2.614383038345281, + "learning_rate": 5.657553234634314e-06, + "loss": 0.2739, + "step": 7542 + }, + { + "epoch": 0.47, + "grad_norm": 1.544974988609688, + "learning_rate": 5.656543590201084e-06, + "loss": 0.2823, + "step": 7543 + }, + { + "epoch": 0.47, + "grad_norm": 1.6832459754722957, + "learning_rate": 5.655533918526693e-06, + "loss": 0.292, + "step": 7544 + }, + { + "epoch": 0.47, + "grad_norm": 2.17433286086938, + "learning_rate": 5.654524219653035e-06, + "loss": 0.3132, + "step": 7545 + }, + { + "epoch": 0.47, + "grad_norm": 1.8187781261421623, + "learning_rate": 5.653514493622007e-06, + "loss": 0.2918, + "step": 7546 + }, + { + "epoch": 0.47, + "grad_norm": 6.373290493754437, + "learning_rate": 5.652504740475499e-06, + "loss": 0.2919, + "step": 7547 + }, + { + "epoch": 0.47, + "grad_norm": 3.2056108905279466, + "learning_rate": 5.651494960255412e-06, + "loss": 0.3059, + "step": 7548 + }, + { + "epoch": 0.47, + "grad_norm": 1.6706705669316926, + "learning_rate": 5.650485153003642e-06, + "loss": 0.2879, + "step": 7549 + }, + { + "epoch": 0.47, + "grad_norm": 2.029525412874832, + "learning_rate": 5.649475318762088e-06, + "loss": 0.2935, + "step": 7550 + }, + { + "epoch": 0.47, + "grad_norm": 2.2076103327721412, + "learning_rate": 5.648465457572648e-06, + "loss": 0.3061, + "step": 7551 + }, + { + "epoch": 0.47, + "grad_norm": 2.077242752112191, + "learning_rate": 5.647455569477225e-06, + "loss": 0.2989, + "step": 7552 + }, + { + "epoch": 0.48, + "grad_norm": 1.812558801833626, + "learning_rate": 5.646445654517721e-06, + "loss": 0.3027, + "step": 7553 + }, + { + "epoch": 0.48, + "grad_norm": 1.2839181061111227, + "learning_rate": 5.645435712736038e-06, + "loss": 0.2699, + "step": 7554 + }, + { + "epoch": 0.48, + "grad_norm": 2.2677247153083626, + "learning_rate": 5.6444257441740804e-06, + "loss": 0.2771, + "step": 7555 + }, + { + "epoch": 0.48, + "grad_norm": 2.4156651211418265, + "learning_rate": 5.643415748873755e-06, + "loss": 0.284, + "step": 7556 + }, + { + "epoch": 0.48, + "grad_norm": 3.546944978902009, + "learning_rate": 5.642405726876967e-06, + "loss": 0.2665, + "step": 7557 + }, + { + "epoch": 0.48, + "grad_norm": 2.834067952436137, + "learning_rate": 5.641395678225624e-06, + "loss": 0.2789, + "step": 7558 + }, + { + "epoch": 0.48, + "grad_norm": 1.7530318367081685, + "learning_rate": 5.640385602961634e-06, + "loss": 0.2797, + "step": 7559 + }, + { + "epoch": 0.48, + "grad_norm": 2.3080553025611903, + "learning_rate": 5.6393755011269115e-06, + "loss": 0.2973, + "step": 7560 + }, + { + "epoch": 0.48, + "grad_norm": 1.8055236618951296, + "learning_rate": 5.63836537276336e-06, + "loss": 0.2944, + "step": 7561 + }, + { + "epoch": 0.48, + "grad_norm": 2.7602991910046204, + "learning_rate": 5.6373552179128975e-06, + "loss": 0.2921, + "step": 7562 + }, + { + "epoch": 0.48, + "grad_norm": 1.8516542757364982, + "learning_rate": 5.636345036617433e-06, + "loss": 0.2922, + "step": 7563 + }, + { + "epoch": 0.48, + "grad_norm": 2.8027538131341654, + "learning_rate": 5.635334828918886e-06, + "loss": 0.3032, + "step": 7564 + }, + { + "epoch": 0.48, + "grad_norm": 2.391280447097233, + "learning_rate": 5.6343245948591655e-06, + "loss": 0.3, + "step": 7565 + }, + { + "epoch": 0.48, + "grad_norm": 1.7098293431121996, + "learning_rate": 5.633314334480191e-06, + "loss": 0.2855, + "step": 7566 + }, + { + "epoch": 0.48, + "grad_norm": 1.8748959742737594, + "learning_rate": 5.632304047823881e-06, + "loss": 0.2792, + "step": 7567 + }, + { + "epoch": 0.48, + "grad_norm": 2.337783856025002, + "learning_rate": 5.631293734932153e-06, + "loss": 0.3168, + "step": 7568 + }, + { + "epoch": 0.48, + "grad_norm": 2.1377957947763133, + "learning_rate": 5.630283395846926e-06, + "loss": 0.2996, + "step": 7569 + }, + { + "epoch": 0.48, + "grad_norm": 2.395129393465031, + "learning_rate": 5.6292730306101215e-06, + "loss": 0.2916, + "step": 7570 + }, + { + "epoch": 0.48, + "grad_norm": 2.5285132336855396, + "learning_rate": 5.6282626392636615e-06, + "loss": 0.2921, + "step": 7571 + }, + { + "epoch": 0.48, + "grad_norm": 59.335283073129325, + "learning_rate": 5.6272522218494695e-06, + "loss": 0.2852, + "step": 7572 + }, + { + "epoch": 0.48, + "grad_norm": 2.699520023762649, + "learning_rate": 5.626241778409467e-06, + "loss": 0.2873, + "step": 7573 + }, + { + "epoch": 0.48, + "grad_norm": 2.392732779304917, + "learning_rate": 5.625231308985582e-06, + "loss": 0.294, + "step": 7574 + }, + { + "epoch": 0.48, + "grad_norm": 3.6432966342313273, + "learning_rate": 5.624220813619739e-06, + "loss": 0.3014, + "step": 7575 + }, + { + "epoch": 0.48, + "grad_norm": 4.380568531075516, + "learning_rate": 5.623210292353865e-06, + "loss": 0.3115, + "step": 7576 + }, + { + "epoch": 0.48, + "grad_norm": 1.951581765742306, + "learning_rate": 5.62219974522989e-06, + "loss": 0.2775, + "step": 7577 + }, + { + "epoch": 0.48, + "grad_norm": 2.2032960059647033, + "learning_rate": 5.621189172289742e-06, + "loss": 0.3008, + "step": 7578 + }, + { + "epoch": 0.48, + "grad_norm": 2.386585173453586, + "learning_rate": 5.620178573575352e-06, + "loss": 0.31, + "step": 7579 + }, + { + "epoch": 0.48, + "grad_norm": 3.233562515286271, + "learning_rate": 5.6191679491286525e-06, + "loss": 0.2862, + "step": 7580 + }, + { + "epoch": 0.48, + "grad_norm": 1.9552337487300702, + "learning_rate": 5.618157298991574e-06, + "loss": 0.2896, + "step": 7581 + }, + { + "epoch": 0.48, + "grad_norm": 1.7236303988944257, + "learning_rate": 5.61714662320605e-06, + "loss": 0.2865, + "step": 7582 + }, + { + "epoch": 0.48, + "grad_norm": 6.463293909716265, + "learning_rate": 5.616135921814018e-06, + "loss": 0.2837, + "step": 7583 + }, + { + "epoch": 0.48, + "grad_norm": 2.9170548306979804, + "learning_rate": 5.615125194857411e-06, + "loss": 0.2936, + "step": 7584 + }, + { + "epoch": 0.48, + "grad_norm": 0.6618161709336832, + "learning_rate": 5.614114442378169e-06, + "loss": 0.5309, + "step": 7585 + }, + { + "epoch": 0.48, + "grad_norm": 4.161281708041211, + "learning_rate": 5.613103664418227e-06, + "loss": 0.2977, + "step": 7586 + }, + { + "epoch": 0.48, + "grad_norm": 1.8520552299589774, + "learning_rate": 5.6120928610195235e-06, + "loss": 0.2714, + "step": 7587 + }, + { + "epoch": 0.48, + "grad_norm": 3.3206685316379283, + "learning_rate": 5.611082032224001e-06, + "loss": 0.293, + "step": 7588 + }, + { + "epoch": 0.48, + "grad_norm": 1.8522249948067022, + "learning_rate": 5.610071178073601e-06, + "loss": 0.2773, + "step": 7589 + }, + { + "epoch": 0.48, + "grad_norm": 1.6560173966684166, + "learning_rate": 5.609060298610263e-06, + "loss": 0.2934, + "step": 7590 + }, + { + "epoch": 0.48, + "grad_norm": 2.457576734413159, + "learning_rate": 5.6080493938759314e-06, + "loss": 0.3222, + "step": 7591 + }, + { + "epoch": 0.48, + "grad_norm": 1.7121792610360973, + "learning_rate": 5.607038463912551e-06, + "loss": 0.292, + "step": 7592 + }, + { + "epoch": 0.48, + "grad_norm": 1.7733976375907252, + "learning_rate": 5.606027508762067e-06, + "loss": 0.3028, + "step": 7593 + }, + { + "epoch": 0.48, + "grad_norm": 3.6817503215393352, + "learning_rate": 5.605016528466424e-06, + "loss": 0.2795, + "step": 7594 + }, + { + "epoch": 0.48, + "grad_norm": 2.016365833923353, + "learning_rate": 5.604005523067569e-06, + "loss": 0.2804, + "step": 7595 + }, + { + "epoch": 0.48, + "grad_norm": 3.2220054674335756, + "learning_rate": 5.602994492607454e-06, + "loss": 0.2928, + "step": 7596 + }, + { + "epoch": 0.48, + "grad_norm": 1.2291516796281352, + "learning_rate": 5.601983437128027e-06, + "loss": 0.2753, + "step": 7597 + }, + { + "epoch": 0.48, + "grad_norm": 1.7494213461557602, + "learning_rate": 5.600972356671238e-06, + "loss": 0.2812, + "step": 7598 + }, + { + "epoch": 0.48, + "grad_norm": 2.3269791290799406, + "learning_rate": 5.599961251279037e-06, + "loss": 0.2952, + "step": 7599 + }, + { + "epoch": 0.48, + "grad_norm": 1.2304916030670954, + "learning_rate": 5.5989501209933795e-06, + "loss": 0.2765, + "step": 7600 + }, + { + "epoch": 0.48, + "grad_norm": 1.459507001809702, + "learning_rate": 5.5979389658562165e-06, + "loss": 0.2799, + "step": 7601 + }, + { + "epoch": 0.48, + "grad_norm": 3.4381696557232515, + "learning_rate": 5.596927785909505e-06, + "loss": 0.2888, + "step": 7602 + }, + { + "epoch": 0.48, + "grad_norm": 1.7657934821560524, + "learning_rate": 5.595916581195198e-06, + "loss": 0.2875, + "step": 7603 + }, + { + "epoch": 0.48, + "grad_norm": 1.6843243807157469, + "learning_rate": 5.594905351755254e-06, + "loss": 0.2724, + "step": 7604 + }, + { + "epoch": 0.48, + "grad_norm": 1.9240401785154129, + "learning_rate": 5.59389409763163e-06, + "loss": 0.2747, + "step": 7605 + }, + { + "epoch": 0.48, + "grad_norm": 1.8506574965474858, + "learning_rate": 5.592882818866286e-06, + "loss": 0.2846, + "step": 7606 + }, + { + "epoch": 0.48, + "grad_norm": 2.044141352460662, + "learning_rate": 5.591871515501181e-06, + "loss": 0.2951, + "step": 7607 + }, + { + "epoch": 0.48, + "grad_norm": 3.4142261519026587, + "learning_rate": 5.590860187578274e-06, + "loss": 0.2961, + "step": 7608 + }, + { + "epoch": 0.48, + "grad_norm": 1.8414898976085707, + "learning_rate": 5.589848835139529e-06, + "loss": 0.2907, + "step": 7609 + }, + { + "epoch": 0.48, + "grad_norm": 3.8158250187790252, + "learning_rate": 5.588837458226908e-06, + "loss": 0.2802, + "step": 7610 + }, + { + "epoch": 0.48, + "grad_norm": 1.1173824203542102, + "learning_rate": 5.587826056882376e-06, + "loss": 0.2924, + "step": 7611 + }, + { + "epoch": 0.48, + "grad_norm": 1.8089366617244502, + "learning_rate": 5.586814631147895e-06, + "loss": 0.2819, + "step": 7612 + }, + { + "epoch": 0.48, + "grad_norm": 1.6528462451797643, + "learning_rate": 5.5858031810654345e-06, + "loss": 0.2996, + "step": 7613 + }, + { + "epoch": 0.48, + "grad_norm": 1.735345799156136, + "learning_rate": 5.5847917066769585e-06, + "loss": 0.2984, + "step": 7614 + }, + { + "epoch": 0.48, + "grad_norm": 1.8993270059871377, + "learning_rate": 5.583780208024436e-06, + "loss": 0.2917, + "step": 7615 + }, + { + "epoch": 0.48, + "grad_norm": 1.3079231039346049, + "learning_rate": 5.582768685149837e-06, + "loss": 0.2794, + "step": 7616 + }, + { + "epoch": 0.48, + "grad_norm": 2.6259454534568314, + "learning_rate": 5.581757138095128e-06, + "loss": 0.329, + "step": 7617 + }, + { + "epoch": 0.48, + "grad_norm": 1.8301455628536605, + "learning_rate": 5.5807455669022825e-06, + "loss": 0.2785, + "step": 7618 + }, + { + "epoch": 0.48, + "grad_norm": 1.789044571160198, + "learning_rate": 5.579733971613274e-06, + "loss": 0.3055, + "step": 7619 + }, + { + "epoch": 0.48, + "grad_norm": 5.880487546744792, + "learning_rate": 5.578722352270071e-06, + "loss": 0.2963, + "step": 7620 + }, + { + "epoch": 0.48, + "grad_norm": 1.533953734467629, + "learning_rate": 5.57771070891465e-06, + "loss": 0.2783, + "step": 7621 + }, + { + "epoch": 0.48, + "grad_norm": 1.443776325070946, + "learning_rate": 5.576699041588986e-06, + "loss": 0.293, + "step": 7622 + }, + { + "epoch": 0.48, + "grad_norm": 1.8706229546458053, + "learning_rate": 5.575687350335055e-06, + "loss": 0.2737, + "step": 7623 + }, + { + "epoch": 0.48, + "grad_norm": 2.16178471422946, + "learning_rate": 5.574675635194833e-06, + "loss": 0.3048, + "step": 7624 + }, + { + "epoch": 0.48, + "grad_norm": 0.6636299220898205, + "learning_rate": 5.573663896210297e-06, + "loss": 0.5273, + "step": 7625 + }, + { + "epoch": 0.48, + "grad_norm": 2.486550452320844, + "learning_rate": 5.5726521334234275e-06, + "loss": 0.3054, + "step": 7626 + }, + { + "epoch": 0.48, + "grad_norm": 5.261091326497108, + "learning_rate": 5.5716403468762045e-06, + "loss": 0.2917, + "step": 7627 + }, + { + "epoch": 0.48, + "grad_norm": 2.1698329739370568, + "learning_rate": 5.570628536610608e-06, + "loss": 0.2911, + "step": 7628 + }, + { + "epoch": 0.48, + "grad_norm": 3.6663475686674074, + "learning_rate": 5.56961670266862e-06, + "loss": 0.2871, + "step": 7629 + }, + { + "epoch": 0.48, + "grad_norm": 2.4871906189983948, + "learning_rate": 5.5686048450922224e-06, + "loss": 0.2918, + "step": 7630 + }, + { + "epoch": 0.48, + "grad_norm": 1.6193746845400931, + "learning_rate": 5.567592963923401e-06, + "loss": 0.2884, + "step": 7631 + }, + { + "epoch": 0.48, + "grad_norm": 2.180697079712196, + "learning_rate": 5.566581059204139e-06, + "loss": 0.3088, + "step": 7632 + }, + { + "epoch": 0.48, + "grad_norm": 1.8345635015984232, + "learning_rate": 5.5655691309764225e-06, + "loss": 0.2951, + "step": 7633 + }, + { + "epoch": 0.48, + "grad_norm": 1.6623792226280405, + "learning_rate": 5.564557179282237e-06, + "loss": 0.2883, + "step": 7634 + }, + { + "epoch": 0.48, + "grad_norm": 3.1105137740109474, + "learning_rate": 5.563545204163574e-06, + "loss": 0.3031, + "step": 7635 + }, + { + "epoch": 0.48, + "grad_norm": 2.000333524241662, + "learning_rate": 5.562533205662417e-06, + "loss": 0.284, + "step": 7636 + }, + { + "epoch": 0.48, + "grad_norm": 2.499717981356932, + "learning_rate": 5.561521183820759e-06, + "loss": 0.2777, + "step": 7637 + }, + { + "epoch": 0.48, + "grad_norm": 1.8619320855670536, + "learning_rate": 5.5605091386805896e-06, + "loss": 0.2966, + "step": 7638 + }, + { + "epoch": 0.48, + "grad_norm": 3.0606967986000067, + "learning_rate": 5.5594970702839005e-06, + "loss": 0.2829, + "step": 7639 + }, + { + "epoch": 0.48, + "grad_norm": 1.561519112722402, + "learning_rate": 5.558484978672684e-06, + "loss": 0.3106, + "step": 7640 + }, + { + "epoch": 0.48, + "grad_norm": 1.9140444866929631, + "learning_rate": 5.557472863888935e-06, + "loss": 0.2895, + "step": 7641 + }, + { + "epoch": 0.48, + "grad_norm": 2.599444497655152, + "learning_rate": 5.556460725974645e-06, + "loss": 0.292, + "step": 7642 + }, + { + "epoch": 0.48, + "grad_norm": 2.613480966642762, + "learning_rate": 5.555448564971812e-06, + "loss": 0.3196, + "step": 7643 + }, + { + "epoch": 0.48, + "grad_norm": 3.5954900095024396, + "learning_rate": 5.554436380922431e-06, + "loss": 0.2959, + "step": 7644 + }, + { + "epoch": 0.48, + "grad_norm": 2.1933243969816356, + "learning_rate": 5.553424173868501e-06, + "loss": 0.2781, + "step": 7645 + }, + { + "epoch": 0.48, + "grad_norm": 3.497756212293826, + "learning_rate": 5.552411943852017e-06, + "loss": 0.2954, + "step": 7646 + }, + { + "epoch": 0.48, + "grad_norm": 11.920059588064577, + "learning_rate": 5.551399690914981e-06, + "loss": 0.3116, + "step": 7647 + }, + { + "epoch": 0.48, + "grad_norm": 2.044223232505679, + "learning_rate": 5.550387415099393e-06, + "loss": 0.2804, + "step": 7648 + }, + { + "epoch": 0.48, + "grad_norm": 2.742875490724477, + "learning_rate": 5.549375116447254e-06, + "loss": 0.3274, + "step": 7649 + }, + { + "epoch": 0.48, + "grad_norm": 1.6956356747388157, + "learning_rate": 5.548362795000565e-06, + "loss": 0.2997, + "step": 7650 + }, + { + "epoch": 0.48, + "grad_norm": 1.303764207487624, + "learning_rate": 5.54735045080133e-06, + "loss": 0.2811, + "step": 7651 + }, + { + "epoch": 0.48, + "grad_norm": 3.4929475893767643, + "learning_rate": 5.546338083891551e-06, + "loss": 0.2798, + "step": 7652 + }, + { + "epoch": 0.48, + "grad_norm": 1.4465382185303735, + "learning_rate": 5.545325694313237e-06, + "loss": 0.3021, + "step": 7653 + }, + { + "epoch": 0.48, + "grad_norm": 1.9279209253137302, + "learning_rate": 5.544313282108389e-06, + "loss": 0.2811, + "step": 7654 + }, + { + "epoch": 0.48, + "grad_norm": 1.7286870769978662, + "learning_rate": 5.543300847319017e-06, + "loss": 0.2946, + "step": 7655 + }, + { + "epoch": 0.48, + "grad_norm": 0.6097230773486723, + "learning_rate": 5.542288389987128e-06, + "loss": 0.4863, + "step": 7656 + }, + { + "epoch": 0.48, + "grad_norm": 3.278823836160525, + "learning_rate": 5.541275910154731e-06, + "loss": 0.293, + "step": 7657 + }, + { + "epoch": 0.48, + "grad_norm": 1.8592303195067155, + "learning_rate": 5.540263407863834e-06, + "loss": 0.2974, + "step": 7658 + }, + { + "epoch": 0.48, + "grad_norm": 1.7508571877058219, + "learning_rate": 5.53925088315645e-06, + "loss": 0.2959, + "step": 7659 + }, + { + "epoch": 0.48, + "grad_norm": 2.384552100276538, + "learning_rate": 5.538238336074587e-06, + "loss": 0.2918, + "step": 7660 + }, + { + "epoch": 0.48, + "grad_norm": 2.0424155998200018, + "learning_rate": 5.537225766660261e-06, + "loss": 0.2951, + "step": 7661 + }, + { + "epoch": 0.48, + "grad_norm": 3.887461789212646, + "learning_rate": 5.536213174955484e-06, + "loss": 0.2815, + "step": 7662 + }, + { + "epoch": 0.48, + "grad_norm": 4.936148551169702, + "learning_rate": 5.535200561002268e-06, + "loss": 0.2975, + "step": 7663 + }, + { + "epoch": 0.48, + "grad_norm": 1.8999436939087193, + "learning_rate": 5.534187924842631e-06, + "loss": 0.2789, + "step": 7664 + }, + { + "epoch": 0.48, + "grad_norm": 2.618104287259037, + "learning_rate": 5.53317526651859e-06, + "loss": 0.2986, + "step": 7665 + }, + { + "epoch": 0.48, + "grad_norm": 4.0458312152352685, + "learning_rate": 5.532162586072158e-06, + "loss": 0.323, + "step": 7666 + }, + { + "epoch": 0.48, + "grad_norm": 2.0072672077035305, + "learning_rate": 5.5311498835453555e-06, + "loss": 0.2891, + "step": 7667 + }, + { + "epoch": 0.48, + "grad_norm": 2.0729173082436962, + "learning_rate": 5.530137158980201e-06, + "loss": 0.2811, + "step": 7668 + }, + { + "epoch": 0.48, + "grad_norm": 2.6521791172526483, + "learning_rate": 5.529124412418715e-06, + "loss": 0.3063, + "step": 7669 + }, + { + "epoch": 0.48, + "grad_norm": 1.6462108771835293, + "learning_rate": 5.5281116439029156e-06, + "loss": 0.2912, + "step": 7670 + }, + { + "epoch": 0.48, + "grad_norm": 3.316829178533151, + "learning_rate": 5.527098853474828e-06, + "loss": 0.3091, + "step": 7671 + }, + { + "epoch": 0.48, + "grad_norm": 1.5556510477501275, + "learning_rate": 5.526086041176472e-06, + "loss": 0.2787, + "step": 7672 + }, + { + "epoch": 0.48, + "grad_norm": 1.6613192769447411, + "learning_rate": 5.525073207049871e-06, + "loss": 0.2797, + "step": 7673 + }, + { + "epoch": 0.48, + "grad_norm": 0.6538928837380741, + "learning_rate": 5.524060351137049e-06, + "loss": 0.5012, + "step": 7674 + }, + { + "epoch": 0.48, + "grad_norm": 2.299107483738187, + "learning_rate": 5.523047473480036e-06, + "loss": 0.2783, + "step": 7675 + }, + { + "epoch": 0.48, + "grad_norm": 1.3144949143742457, + "learning_rate": 5.522034574120851e-06, + "loss": 0.2781, + "step": 7676 + }, + { + "epoch": 0.48, + "grad_norm": 1.5728670283895205, + "learning_rate": 5.521021653101525e-06, + "loss": 0.2924, + "step": 7677 + }, + { + "epoch": 0.48, + "grad_norm": 1.291260118859905, + "learning_rate": 5.520008710464085e-06, + "loss": 0.2724, + "step": 7678 + }, + { + "epoch": 0.48, + "grad_norm": 1.790638424702302, + "learning_rate": 5.518995746250561e-06, + "loss": 0.2688, + "step": 7679 + }, + { + "epoch": 0.48, + "grad_norm": 2.192349802244243, + "learning_rate": 5.517982760502981e-06, + "loss": 0.3109, + "step": 7680 + }, + { + "epoch": 0.48, + "grad_norm": 1.6548324309082583, + "learning_rate": 5.516969753263376e-06, + "loss": 0.2932, + "step": 7681 + }, + { + "epoch": 0.48, + "grad_norm": 2.1810413925839023, + "learning_rate": 5.515956724573778e-06, + "loss": 0.3011, + "step": 7682 + }, + { + "epoch": 0.48, + "grad_norm": 2.95441875468757, + "learning_rate": 5.51494367447622e-06, + "loss": 0.2775, + "step": 7683 + }, + { + "epoch": 0.48, + "grad_norm": 2.9777505854560355, + "learning_rate": 5.513930603012733e-06, + "loss": 0.2919, + "step": 7684 + }, + { + "epoch": 0.48, + "grad_norm": 3.322860532123437, + "learning_rate": 5.512917510225353e-06, + "loss": 0.2773, + "step": 7685 + }, + { + "epoch": 0.48, + "grad_norm": 3.8650869664910847, + "learning_rate": 5.511904396156113e-06, + "loss": 0.2916, + "step": 7686 + }, + { + "epoch": 0.48, + "grad_norm": 6.095873520856129, + "learning_rate": 5.510891260847053e-06, + "loss": 0.2906, + "step": 7687 + }, + { + "epoch": 0.48, + "grad_norm": 2.1493326173476626, + "learning_rate": 5.509878104340205e-06, + "loss": 0.2932, + "step": 7688 + }, + { + "epoch": 0.48, + "grad_norm": 4.960223046675856, + "learning_rate": 5.508864926677609e-06, + "loss": 0.3028, + "step": 7689 + }, + { + "epoch": 0.48, + "grad_norm": 1.3690556805584755, + "learning_rate": 5.507851727901304e-06, + "loss": 0.2919, + "step": 7690 + }, + { + "epoch": 0.48, + "grad_norm": 1.6671958339056985, + "learning_rate": 5.506838508053329e-06, + "loss": 0.2885, + "step": 7691 + }, + { + "epoch": 0.48, + "grad_norm": 1.9193056688864807, + "learning_rate": 5.505825267175723e-06, + "loss": 0.2721, + "step": 7692 + }, + { + "epoch": 0.48, + "grad_norm": 2.4266198792138582, + "learning_rate": 5.50481200531053e-06, + "loss": 0.2947, + "step": 7693 + }, + { + "epoch": 0.48, + "grad_norm": 2.7212335690916, + "learning_rate": 5.503798722499789e-06, + "loss": 0.2802, + "step": 7694 + }, + { + "epoch": 0.48, + "grad_norm": 2.9068507374041386, + "learning_rate": 5.502785418785545e-06, + "loss": 0.2757, + "step": 7695 + }, + { + "epoch": 0.48, + "grad_norm": 2.520798957009045, + "learning_rate": 5.501772094209838e-06, + "loss": 0.303, + "step": 7696 + }, + { + "epoch": 0.48, + "grad_norm": 2.073194997785977, + "learning_rate": 5.500758748814719e-06, + "loss": 0.2753, + "step": 7697 + }, + { + "epoch": 0.48, + "grad_norm": 1.5058041779306213, + "learning_rate": 5.4997453826422286e-06, + "loss": 0.2885, + "step": 7698 + }, + { + "epoch": 0.48, + "grad_norm": 2.8423842920647373, + "learning_rate": 5.498731995734416e-06, + "loss": 0.2876, + "step": 7699 + }, + { + "epoch": 0.48, + "grad_norm": 1.992622462410933, + "learning_rate": 5.497718588133325e-06, + "loss": 0.2942, + "step": 7700 + }, + { + "epoch": 0.48, + "grad_norm": 4.8564163935973115, + "learning_rate": 5.4967051598810074e-06, + "loss": 0.3164, + "step": 7701 + }, + { + "epoch": 0.48, + "grad_norm": 7.905517536132355, + "learning_rate": 5.4956917110195096e-06, + "loss": 0.2868, + "step": 7702 + }, + { + "epoch": 0.48, + "grad_norm": 1.7386476152167354, + "learning_rate": 5.494678241590883e-06, + "loss": 0.3107, + "step": 7703 + }, + { + "epoch": 0.48, + "grad_norm": 1.7342282734045082, + "learning_rate": 5.493664751637176e-06, + "loss": 0.3173, + "step": 7704 + }, + { + "epoch": 0.48, + "grad_norm": 3.0065624921530523, + "learning_rate": 5.492651241200443e-06, + "loss": 0.3109, + "step": 7705 + }, + { + "epoch": 0.48, + "grad_norm": 2.7653686719302852, + "learning_rate": 5.491637710322735e-06, + "loss": 0.3246, + "step": 7706 + }, + { + "epoch": 0.48, + "grad_norm": 5.287865288384318, + "learning_rate": 5.490624159046104e-06, + "loss": 0.2763, + "step": 7707 + }, + { + "epoch": 0.48, + "grad_norm": 1.5601821995705238, + "learning_rate": 5.4896105874126045e-06, + "loss": 0.2883, + "step": 7708 + }, + { + "epoch": 0.48, + "grad_norm": 1.5696044566668663, + "learning_rate": 5.488596995464295e-06, + "loss": 0.281, + "step": 7709 + }, + { + "epoch": 0.48, + "grad_norm": 5.08586468425905, + "learning_rate": 5.4875833832432265e-06, + "loss": 0.3066, + "step": 7710 + }, + { + "epoch": 0.48, + "grad_norm": 2.815212508735295, + "learning_rate": 5.486569750791457e-06, + "loss": 0.2832, + "step": 7711 + }, + { + "epoch": 0.49, + "grad_norm": 15.05406033481365, + "learning_rate": 5.485556098151045e-06, + "loss": 0.2888, + "step": 7712 + }, + { + "epoch": 0.49, + "grad_norm": 2.7899025505151624, + "learning_rate": 5.484542425364049e-06, + "loss": 0.2945, + "step": 7713 + }, + { + "epoch": 0.49, + "grad_norm": 3.4809174048149893, + "learning_rate": 5.483528732472525e-06, + "loss": 0.2767, + "step": 7714 + }, + { + "epoch": 0.49, + "grad_norm": 0.6367459434759566, + "learning_rate": 5.482515019518536e-06, + "loss": 0.5068, + "step": 7715 + }, + { + "epoch": 0.49, + "grad_norm": 0.6513782255389672, + "learning_rate": 5.481501286544142e-06, + "loss": 0.4778, + "step": 7716 + }, + { + "epoch": 0.49, + "grad_norm": 2.362815848020362, + "learning_rate": 5.480487533591405e-06, + "loss": 0.319, + "step": 7717 + }, + { + "epoch": 0.49, + "grad_norm": 2.171551057908031, + "learning_rate": 5.479473760702386e-06, + "loss": 0.2669, + "step": 7718 + }, + { + "epoch": 0.49, + "grad_norm": 3.258315300024818, + "learning_rate": 5.478459967919149e-06, + "loss": 0.3362, + "step": 7719 + }, + { + "epoch": 0.49, + "grad_norm": 1.6593417003266453, + "learning_rate": 5.477446155283758e-06, + "loss": 0.303, + "step": 7720 + }, + { + "epoch": 0.49, + "grad_norm": 3.5581072388702726, + "learning_rate": 5.476432322838279e-06, + "loss": 0.3513, + "step": 7721 + }, + { + "epoch": 0.49, + "grad_norm": 4.589422087143522, + "learning_rate": 5.4754184706247745e-06, + "loss": 0.2976, + "step": 7722 + }, + { + "epoch": 0.49, + "grad_norm": 2.0525801997020117, + "learning_rate": 5.474404598685315e-06, + "loss": 0.2846, + "step": 7723 + }, + { + "epoch": 0.49, + "grad_norm": 11.565055781579796, + "learning_rate": 5.473390707061965e-06, + "loss": 0.2857, + "step": 7724 + }, + { + "epoch": 0.49, + "grad_norm": 1.6844960046250694, + "learning_rate": 5.4723767957967955e-06, + "loss": 0.2706, + "step": 7725 + }, + { + "epoch": 0.49, + "grad_norm": 2.364943055973655, + "learning_rate": 5.4713628649318716e-06, + "loss": 0.3081, + "step": 7726 + }, + { + "epoch": 0.49, + "grad_norm": 1.6268223581545005, + "learning_rate": 5.470348914509267e-06, + "loss": 0.2675, + "step": 7727 + }, + { + "epoch": 0.49, + "grad_norm": 1.5371517334755775, + "learning_rate": 5.46933494457105e-06, + "loss": 0.2983, + "step": 7728 + }, + { + "epoch": 0.49, + "grad_norm": 3.4963837345356237, + "learning_rate": 5.468320955159293e-06, + "loss": 0.281, + "step": 7729 + }, + { + "epoch": 0.49, + "grad_norm": 1.9170074890030921, + "learning_rate": 5.467306946316066e-06, + "loss": 0.2804, + "step": 7730 + }, + { + "epoch": 0.49, + "grad_norm": 5.316003764567229, + "learning_rate": 5.466292918083444e-06, + "loss": 0.2852, + "step": 7731 + }, + { + "epoch": 0.49, + "grad_norm": 5.30807583938851, + "learning_rate": 5.4652788705035024e-06, + "loss": 0.308, + "step": 7732 + }, + { + "epoch": 0.49, + "grad_norm": 1.7788902829699256, + "learning_rate": 5.464264803618312e-06, + "loss": 0.304, + "step": 7733 + }, + { + "epoch": 0.49, + "grad_norm": 2.0855516531344285, + "learning_rate": 5.463250717469951e-06, + "loss": 0.3121, + "step": 7734 + }, + { + "epoch": 0.49, + "grad_norm": 2.734483862790188, + "learning_rate": 5.462236612100496e-06, + "loss": 0.2936, + "step": 7735 + }, + { + "epoch": 0.49, + "grad_norm": 2.071891275691287, + "learning_rate": 5.461222487552022e-06, + "loss": 0.3151, + "step": 7736 + }, + { + "epoch": 0.49, + "grad_norm": 2.735982423941206, + "learning_rate": 5.460208343866607e-06, + "loss": 0.2907, + "step": 7737 + }, + { + "epoch": 0.49, + "grad_norm": 2.4393485166060214, + "learning_rate": 5.4591941810863314e-06, + "loss": 0.2968, + "step": 7738 + }, + { + "epoch": 0.49, + "grad_norm": 0.8097763811921834, + "learning_rate": 5.458179999253274e-06, + "loss": 0.4989, + "step": 7739 + }, + { + "epoch": 0.49, + "grad_norm": 2.011088761778192, + "learning_rate": 5.457165798409514e-06, + "loss": 0.2902, + "step": 7740 + }, + { + "epoch": 0.49, + "grad_norm": 2.871560217624289, + "learning_rate": 5.456151578597133e-06, + "loss": 0.3, + "step": 7741 + }, + { + "epoch": 0.49, + "grad_norm": 1.724933861169226, + "learning_rate": 5.455137339858212e-06, + "loss": 0.3002, + "step": 7742 + }, + { + "epoch": 0.49, + "grad_norm": 2.233794883202092, + "learning_rate": 5.454123082234837e-06, + "loss": 0.2932, + "step": 7743 + }, + { + "epoch": 0.49, + "grad_norm": 1.911994075382329, + "learning_rate": 5.4531088057690864e-06, + "loss": 0.2949, + "step": 7744 + }, + { + "epoch": 0.49, + "grad_norm": 2.8799183901741925, + "learning_rate": 5.4520945105030466e-06, + "loss": 0.2649, + "step": 7745 + }, + { + "epoch": 0.49, + "grad_norm": 2.617560859902063, + "learning_rate": 5.451080196478803e-06, + "loss": 0.2842, + "step": 7746 + }, + { + "epoch": 0.49, + "grad_norm": 3.1354696114542797, + "learning_rate": 5.450065863738442e-06, + "loss": 0.283, + "step": 7747 + }, + { + "epoch": 0.49, + "grad_norm": 3.2134631563214096, + "learning_rate": 5.449051512324046e-06, + "loss": 0.3138, + "step": 7748 + }, + { + "epoch": 0.49, + "grad_norm": 0.6089507927171013, + "learning_rate": 5.448037142277708e-06, + "loss": 0.49, + "step": 7749 + }, + { + "epoch": 0.49, + "grad_norm": 2.031014695423584, + "learning_rate": 5.447022753641511e-06, + "loss": 0.2648, + "step": 7750 + }, + { + "epoch": 0.49, + "grad_norm": 15.389528252136984, + "learning_rate": 5.446008346457549e-06, + "loss": 0.2884, + "step": 7751 + }, + { + "epoch": 0.49, + "grad_norm": 2.8959402309699893, + "learning_rate": 5.444993920767905e-06, + "loss": 0.2959, + "step": 7752 + }, + { + "epoch": 0.49, + "grad_norm": 1.9432775176127008, + "learning_rate": 5.443979476614674e-06, + "loss": 0.2958, + "step": 7753 + }, + { + "epoch": 0.49, + "grad_norm": 1.6567397525583398, + "learning_rate": 5.442965014039947e-06, + "loss": 0.2832, + "step": 7754 + }, + { + "epoch": 0.49, + "grad_norm": 1.751743086406248, + "learning_rate": 5.441950533085814e-06, + "loss": 0.2994, + "step": 7755 + }, + { + "epoch": 0.49, + "grad_norm": 2.3200325140976727, + "learning_rate": 5.440936033794368e-06, + "loss": 0.2856, + "step": 7756 + }, + { + "epoch": 0.49, + "grad_norm": 3.5140588085294735, + "learning_rate": 5.439921516207704e-06, + "loss": 0.2817, + "step": 7757 + }, + { + "epoch": 0.49, + "grad_norm": 1.233163376378304, + "learning_rate": 5.438906980367914e-06, + "loss": 0.2879, + "step": 7758 + }, + { + "epoch": 0.49, + "grad_norm": 2.199435167670893, + "learning_rate": 5.437892426317095e-06, + "loss": 0.2917, + "step": 7759 + }, + { + "epoch": 0.49, + "grad_norm": 3.0216974147693554, + "learning_rate": 5.436877854097338e-06, + "loss": 0.2944, + "step": 7760 + }, + { + "epoch": 0.49, + "grad_norm": 1.7542178720395472, + "learning_rate": 5.435863263750747e-06, + "loss": 0.2779, + "step": 7761 + }, + { + "epoch": 0.49, + "grad_norm": 3.0811060036110973, + "learning_rate": 5.434848655319414e-06, + "loss": 0.2844, + "step": 7762 + }, + { + "epoch": 0.49, + "grad_norm": 2.383498166954667, + "learning_rate": 5.433834028845436e-06, + "loss": 0.2921, + "step": 7763 + }, + { + "epoch": 0.49, + "grad_norm": 2.3910709502821, + "learning_rate": 5.432819384370914e-06, + "loss": 0.2864, + "step": 7764 + }, + { + "epoch": 0.49, + "grad_norm": 3.151322561434658, + "learning_rate": 5.431804721937949e-06, + "loss": 0.2692, + "step": 7765 + }, + { + "epoch": 0.49, + "grad_norm": 9.33066431819288, + "learning_rate": 5.4307900415886374e-06, + "loss": 0.2765, + "step": 7766 + }, + { + "epoch": 0.49, + "grad_norm": 2.1432211433391815, + "learning_rate": 5.429775343365082e-06, + "loss": 0.3102, + "step": 7767 + }, + { + "epoch": 0.49, + "grad_norm": 1.9742557435916317, + "learning_rate": 5.428760627309384e-06, + "loss": 0.2803, + "step": 7768 + }, + { + "epoch": 0.49, + "grad_norm": 1.2674436633968107, + "learning_rate": 5.427745893463647e-06, + "loss": 0.2865, + "step": 7769 + }, + { + "epoch": 0.49, + "grad_norm": 0.6088395490169526, + "learning_rate": 5.426731141869973e-06, + "loss": 0.4981, + "step": 7770 + }, + { + "epoch": 0.49, + "grad_norm": 4.399780861060085, + "learning_rate": 5.425716372570466e-06, + "loss": 0.295, + "step": 7771 + }, + { + "epoch": 0.49, + "grad_norm": 2.097867324104314, + "learning_rate": 5.4247015856072295e-06, + "loss": 0.2791, + "step": 7772 + }, + { + "epoch": 0.49, + "grad_norm": 1.747160477436948, + "learning_rate": 5.4236867810223715e-06, + "loss": 0.2953, + "step": 7773 + }, + { + "epoch": 0.49, + "grad_norm": 2.007433151237102, + "learning_rate": 5.4226719588579935e-06, + "loss": 0.3012, + "step": 7774 + }, + { + "epoch": 0.49, + "grad_norm": 2.0237241169132663, + "learning_rate": 5.421657119156208e-06, + "loss": 0.285, + "step": 7775 + }, + { + "epoch": 0.49, + "grad_norm": 2.1575521191540536, + "learning_rate": 5.420642261959118e-06, + "loss": 0.2872, + "step": 7776 + }, + { + "epoch": 0.49, + "grad_norm": 3.5160786933243062, + "learning_rate": 5.419627387308836e-06, + "loss": 0.2766, + "step": 7777 + }, + { + "epoch": 0.49, + "grad_norm": 1.6131710196763749, + "learning_rate": 5.418612495247465e-06, + "loss": 0.2878, + "step": 7778 + }, + { + "epoch": 0.49, + "grad_norm": 3.3376476265177493, + "learning_rate": 5.4175975858171204e-06, + "loss": 0.2933, + "step": 7779 + }, + { + "epoch": 0.49, + "grad_norm": 1.4680007350926543, + "learning_rate": 5.416582659059909e-06, + "loss": 0.2956, + "step": 7780 + }, + { + "epoch": 0.49, + "grad_norm": 1.6099706267873062, + "learning_rate": 5.4155677150179446e-06, + "loss": 0.2824, + "step": 7781 + }, + { + "epoch": 0.49, + "grad_norm": 2.052321759581494, + "learning_rate": 5.414552753733334e-06, + "loss": 0.2837, + "step": 7782 + }, + { + "epoch": 0.49, + "grad_norm": 0.6167219981940524, + "learning_rate": 5.413537775248198e-06, + "loss": 0.4771, + "step": 7783 + }, + { + "epoch": 0.49, + "grad_norm": 15.557764588811983, + "learning_rate": 5.412522779604642e-06, + "loss": 0.2922, + "step": 7784 + }, + { + "epoch": 0.49, + "grad_norm": 6.8335568018077275, + "learning_rate": 5.411507766844784e-06, + "loss": 0.3058, + "step": 7785 + }, + { + "epoch": 0.49, + "grad_norm": 5.289772327945619, + "learning_rate": 5.410492737010737e-06, + "loss": 0.2712, + "step": 7786 + }, + { + "epoch": 0.49, + "grad_norm": 1.7982921294117131, + "learning_rate": 5.40947769014462e-06, + "loss": 0.2914, + "step": 7787 + }, + { + "epoch": 0.49, + "grad_norm": 2.441009830512988, + "learning_rate": 5.408462626288544e-06, + "loss": 0.2992, + "step": 7788 + }, + { + "epoch": 0.49, + "grad_norm": 2.525357207988294, + "learning_rate": 5.4074475454846275e-06, + "loss": 0.3066, + "step": 7789 + }, + { + "epoch": 0.49, + "grad_norm": 18.45928730511721, + "learning_rate": 5.4064324477749895e-06, + "loss": 0.291, + "step": 7790 + }, + { + "epoch": 0.49, + "grad_norm": 3.0957578722529746, + "learning_rate": 5.405417333201749e-06, + "loss": 0.2946, + "step": 7791 + }, + { + "epoch": 0.49, + "grad_norm": 2.1496594428860574, + "learning_rate": 5.404402201807022e-06, + "loss": 0.2933, + "step": 7792 + }, + { + "epoch": 0.49, + "grad_norm": 4.366107406800677, + "learning_rate": 5.403387053632928e-06, + "loss": 0.2781, + "step": 7793 + }, + { + "epoch": 0.49, + "grad_norm": 2.017692727243486, + "learning_rate": 5.4023718887215906e-06, + "loss": 0.2955, + "step": 7794 + }, + { + "epoch": 0.49, + "grad_norm": 1.5367372740132124, + "learning_rate": 5.401356707115128e-06, + "loss": 0.2757, + "step": 7795 + }, + { + "epoch": 0.49, + "grad_norm": 1.4266617915544213, + "learning_rate": 5.400341508855663e-06, + "loss": 0.2735, + "step": 7796 + }, + { + "epoch": 0.49, + "grad_norm": 3.657927667472044, + "learning_rate": 5.3993262939853175e-06, + "loss": 0.2901, + "step": 7797 + }, + { + "epoch": 0.49, + "grad_norm": 2.038135513034856, + "learning_rate": 5.3983110625462144e-06, + "loss": 0.3043, + "step": 7798 + }, + { + "epoch": 0.49, + "grad_norm": 2.4296953850693885, + "learning_rate": 5.397295814580479e-06, + "loss": 0.3101, + "step": 7799 + }, + { + "epoch": 0.49, + "grad_norm": 2.029014493023995, + "learning_rate": 5.396280550130234e-06, + "loss": 0.2989, + "step": 7800 + }, + { + "epoch": 0.49, + "grad_norm": 1.6424893600592279, + "learning_rate": 5.395265269237604e-06, + "loss": 0.2816, + "step": 7801 + }, + { + "epoch": 0.49, + "grad_norm": 2.602559189504729, + "learning_rate": 5.394249971944717e-06, + "loss": 0.3001, + "step": 7802 + }, + { + "epoch": 0.49, + "grad_norm": 2.380024316459885, + "learning_rate": 5.393234658293699e-06, + "loss": 0.2773, + "step": 7803 + }, + { + "epoch": 0.49, + "grad_norm": 3.1857487631942005, + "learning_rate": 5.392219328326674e-06, + "loss": 0.317, + "step": 7804 + }, + { + "epoch": 0.49, + "grad_norm": 5.544890840486099, + "learning_rate": 5.391203982085775e-06, + "loss": 0.2928, + "step": 7805 + }, + { + "epoch": 0.49, + "grad_norm": 0.6648914466207075, + "learning_rate": 5.390188619613127e-06, + "loss": 0.4924, + "step": 7806 + }, + { + "epoch": 0.49, + "grad_norm": 1.8318319675761625, + "learning_rate": 5.389173240950861e-06, + "loss": 0.2878, + "step": 7807 + }, + { + "epoch": 0.49, + "grad_norm": 2.2661003354214695, + "learning_rate": 5.388157846141105e-06, + "loss": 0.3093, + "step": 7808 + }, + { + "epoch": 0.49, + "grad_norm": 3.3939666974106566, + "learning_rate": 5.3871424352259904e-06, + "loss": 0.2809, + "step": 7809 + }, + { + "epoch": 0.49, + "grad_norm": 4.132518406526393, + "learning_rate": 5.386127008247649e-06, + "loss": 0.3026, + "step": 7810 + }, + { + "epoch": 0.49, + "grad_norm": 1.422275574293052, + "learning_rate": 5.385111565248212e-06, + "loss": 0.2769, + "step": 7811 + }, + { + "epoch": 0.49, + "grad_norm": 2.377147047080068, + "learning_rate": 5.384096106269811e-06, + "loss": 0.2952, + "step": 7812 + }, + { + "epoch": 0.49, + "grad_norm": 1.5309259557800277, + "learning_rate": 5.383080631354582e-06, + "loss": 0.2883, + "step": 7813 + }, + { + "epoch": 0.49, + "grad_norm": 3.260658727800435, + "learning_rate": 5.3820651405446564e-06, + "loss": 0.2987, + "step": 7814 + }, + { + "epoch": 0.49, + "grad_norm": 2.189576227553726, + "learning_rate": 5.38104963388217e-06, + "loss": 0.3113, + "step": 7815 + }, + { + "epoch": 0.49, + "grad_norm": 6.321911564146936, + "learning_rate": 5.380034111409257e-06, + "loss": 0.2805, + "step": 7816 + }, + { + "epoch": 0.49, + "grad_norm": 4.417295041113762, + "learning_rate": 5.379018573168056e-06, + "loss": 0.3089, + "step": 7817 + }, + { + "epoch": 0.49, + "grad_norm": 4.347253533304037, + "learning_rate": 5.378003019200699e-06, + "loss": 0.2785, + "step": 7818 + }, + { + "epoch": 0.49, + "grad_norm": 2.114979822151295, + "learning_rate": 5.376987449549325e-06, + "loss": 0.3094, + "step": 7819 + }, + { + "epoch": 0.49, + "grad_norm": 1.763573269754107, + "learning_rate": 5.375971864256071e-06, + "loss": 0.2907, + "step": 7820 + }, + { + "epoch": 0.49, + "grad_norm": 3.257328131567649, + "learning_rate": 5.3749562633630795e-06, + "loss": 0.2863, + "step": 7821 + }, + { + "epoch": 0.49, + "grad_norm": 2.837231319199659, + "learning_rate": 5.373940646912485e-06, + "loss": 0.2845, + "step": 7822 + }, + { + "epoch": 0.49, + "grad_norm": 2.0511031390317953, + "learning_rate": 5.372925014946428e-06, + "loss": 0.2895, + "step": 7823 + }, + { + "epoch": 0.49, + "grad_norm": 1.6441202753609079, + "learning_rate": 5.371909367507051e-06, + "loss": 0.2964, + "step": 7824 + }, + { + "epoch": 0.49, + "grad_norm": 4.657212904994267, + "learning_rate": 5.370893704636495e-06, + "loss": 0.3129, + "step": 7825 + }, + { + "epoch": 0.49, + "grad_norm": 3.289250358737912, + "learning_rate": 5.369878026376899e-06, + "loss": 0.2884, + "step": 7826 + }, + { + "epoch": 0.49, + "grad_norm": 4.438647161835852, + "learning_rate": 5.368862332770406e-06, + "loss": 0.2834, + "step": 7827 + }, + { + "epoch": 0.49, + "grad_norm": 2.208852229409675, + "learning_rate": 5.36784662385916e-06, + "loss": 0.2716, + "step": 7828 + }, + { + "epoch": 0.49, + "grad_norm": 1.9287582100892882, + "learning_rate": 5.366830899685306e-06, + "loss": 0.277, + "step": 7829 + }, + { + "epoch": 0.49, + "grad_norm": 2.903851899455363, + "learning_rate": 5.365815160290983e-06, + "loss": 0.2959, + "step": 7830 + }, + { + "epoch": 0.49, + "grad_norm": 2.1247015783700065, + "learning_rate": 5.364799405718342e-06, + "loss": 0.3171, + "step": 7831 + }, + { + "epoch": 0.49, + "grad_norm": 1.6676871413798984, + "learning_rate": 5.3637836360095255e-06, + "loss": 0.2803, + "step": 7832 + }, + { + "epoch": 0.49, + "grad_norm": 1.7662499131575617, + "learning_rate": 5.3627678512066795e-06, + "loss": 0.2876, + "step": 7833 + }, + { + "epoch": 0.49, + "grad_norm": 0.6006633156553667, + "learning_rate": 5.36175205135195e-06, + "loss": 0.4691, + "step": 7834 + }, + { + "epoch": 0.49, + "grad_norm": 3.4657365438642187, + "learning_rate": 5.360736236487486e-06, + "loss": 0.2788, + "step": 7835 + }, + { + "epoch": 0.49, + "grad_norm": 4.961876106210694, + "learning_rate": 5.359720406655435e-06, + "loss": 0.2735, + "step": 7836 + }, + { + "epoch": 0.49, + "grad_norm": 2.2152937648310767, + "learning_rate": 5.358704561897946e-06, + "loss": 0.2892, + "step": 7837 + }, + { + "epoch": 0.49, + "grad_norm": 2.053247394787333, + "learning_rate": 5.357688702257165e-06, + "loss": 0.2795, + "step": 7838 + }, + { + "epoch": 0.49, + "grad_norm": 26.465164130014912, + "learning_rate": 5.3566728277752474e-06, + "loss": 0.2893, + "step": 7839 + }, + { + "epoch": 0.49, + "grad_norm": 1.5914615193943744, + "learning_rate": 5.355656938494339e-06, + "loss": 0.2733, + "step": 7840 + }, + { + "epoch": 0.49, + "grad_norm": 6.148678242380159, + "learning_rate": 5.354641034456595e-06, + "loss": 0.3147, + "step": 7841 + }, + { + "epoch": 0.49, + "grad_norm": 1.5725284290527721, + "learning_rate": 5.353625115704161e-06, + "loss": 0.2893, + "step": 7842 + }, + { + "epoch": 0.49, + "grad_norm": 1.870482497199345, + "learning_rate": 5.352609182279195e-06, + "loss": 0.297, + "step": 7843 + }, + { + "epoch": 0.49, + "grad_norm": 1.8492267958161406, + "learning_rate": 5.351593234223847e-06, + "loss": 0.2903, + "step": 7844 + }, + { + "epoch": 0.49, + "grad_norm": 1.461672599556936, + "learning_rate": 5.35057727158027e-06, + "loss": 0.2575, + "step": 7845 + }, + { + "epoch": 0.49, + "grad_norm": 1.2814202977521445, + "learning_rate": 5.349561294390622e-06, + "loss": 0.2828, + "step": 7846 + }, + { + "epoch": 0.49, + "grad_norm": 1.742395186833296, + "learning_rate": 5.348545302697054e-06, + "loss": 0.3104, + "step": 7847 + }, + { + "epoch": 0.49, + "grad_norm": 1.5456809314140165, + "learning_rate": 5.347529296541721e-06, + "loss": 0.29, + "step": 7848 + }, + { + "epoch": 0.49, + "grad_norm": 26.66109870909653, + "learning_rate": 5.346513275966782e-06, + "loss": 0.278, + "step": 7849 + }, + { + "epoch": 0.49, + "grad_norm": 3.1719069188333964, + "learning_rate": 5.34549724101439e-06, + "loss": 0.2842, + "step": 7850 + }, + { + "epoch": 0.49, + "grad_norm": 1.2886971048340783, + "learning_rate": 5.344481191726706e-06, + "loss": 0.2795, + "step": 7851 + }, + { + "epoch": 0.49, + "grad_norm": 2.3802020504268517, + "learning_rate": 5.343465128145884e-06, + "loss": 0.2882, + "step": 7852 + }, + { + "epoch": 0.49, + "grad_norm": 1.4542347798396273, + "learning_rate": 5.342449050314084e-06, + "loss": 0.2714, + "step": 7853 + }, + { + "epoch": 0.49, + "grad_norm": 2.7576394213758326, + "learning_rate": 5.3414329582734635e-06, + "loss": 0.3048, + "step": 7854 + }, + { + "epoch": 0.49, + "grad_norm": 1.9112309111662238, + "learning_rate": 5.340416852066185e-06, + "loss": 0.2774, + "step": 7855 + }, + { + "epoch": 0.49, + "grad_norm": 5.112117802862802, + "learning_rate": 5.339400731734404e-06, + "loss": 0.281, + "step": 7856 + }, + { + "epoch": 0.49, + "grad_norm": 1.6640068522887566, + "learning_rate": 5.338384597320287e-06, + "loss": 0.281, + "step": 7857 + }, + { + "epoch": 0.49, + "grad_norm": 1.9509680169432204, + "learning_rate": 5.33736844886599e-06, + "loss": 0.2896, + "step": 7858 + }, + { + "epoch": 0.49, + "grad_norm": 1.3020465188496206, + "learning_rate": 5.336352286413678e-06, + "loss": 0.2813, + "step": 7859 + }, + { + "epoch": 0.49, + "grad_norm": 3.7139299680622218, + "learning_rate": 5.335336110005511e-06, + "loss": 0.2889, + "step": 7860 + }, + { + "epoch": 0.49, + "grad_norm": 1.7794580784858343, + "learning_rate": 5.3343199196836545e-06, + "loss": 0.2877, + "step": 7861 + }, + { + "epoch": 0.49, + "grad_norm": 1.419535508943437, + "learning_rate": 5.33330371549027e-06, + "loss": 0.2707, + "step": 7862 + }, + { + "epoch": 0.49, + "grad_norm": 2.62458308913267, + "learning_rate": 5.332287497467523e-06, + "loss": 0.3135, + "step": 7863 + }, + { + "epoch": 0.49, + "grad_norm": 2.1344687018315707, + "learning_rate": 5.331271265657576e-06, + "loss": 0.3049, + "step": 7864 + }, + { + "epoch": 0.49, + "grad_norm": 1.423922270242874, + "learning_rate": 5.330255020102598e-06, + "loss": 0.2822, + "step": 7865 + }, + { + "epoch": 0.49, + "grad_norm": 2.138042924156716, + "learning_rate": 5.329238760844751e-06, + "loss": 0.3126, + "step": 7866 + }, + { + "epoch": 0.49, + "grad_norm": 3.442546742227438, + "learning_rate": 5.328222487926204e-06, + "loss": 0.3147, + "step": 7867 + }, + { + "epoch": 0.49, + "grad_norm": 11.370689501851116, + "learning_rate": 5.327206201389121e-06, + "loss": 0.2864, + "step": 7868 + }, + { + "epoch": 0.49, + "grad_norm": 6.397388091452828, + "learning_rate": 5.326189901275673e-06, + "loss": 0.2871, + "step": 7869 + }, + { + "epoch": 0.49, + "grad_norm": 2.2757360750300504, + "learning_rate": 5.325173587628028e-06, + "loss": 0.2865, + "step": 7870 + }, + { + "epoch": 0.5, + "grad_norm": 1.8233082627858472, + "learning_rate": 5.324157260488351e-06, + "loss": 0.3177, + "step": 7871 + }, + { + "epoch": 0.5, + "grad_norm": 2.0797174636973628, + "learning_rate": 5.3231409198988136e-06, + "loss": 0.2977, + "step": 7872 + }, + { + "epoch": 0.5, + "grad_norm": 1.4364150530437172, + "learning_rate": 5.322124565901587e-06, + "loss": 0.2831, + "step": 7873 + }, + { + "epoch": 0.5, + "grad_norm": 1.5394900767597413, + "learning_rate": 5.321108198538839e-06, + "loss": 0.3183, + "step": 7874 + }, + { + "epoch": 0.5, + "grad_norm": 2.1928337204093706, + "learning_rate": 5.3200918178527415e-06, + "loss": 0.2794, + "step": 7875 + }, + { + "epoch": 0.5, + "grad_norm": 9.42904582272032, + "learning_rate": 5.319075423885466e-06, + "loss": 0.2946, + "step": 7876 + }, + { + "epoch": 0.5, + "grad_norm": 1.3753258728152593, + "learning_rate": 5.318059016679184e-06, + "loss": 0.2875, + "step": 7877 + }, + { + "epoch": 0.5, + "grad_norm": 1.5241915561094757, + "learning_rate": 5.31704259627607e-06, + "loss": 0.2903, + "step": 7878 + }, + { + "epoch": 0.5, + "grad_norm": 2.3199319689602134, + "learning_rate": 5.316026162718294e-06, + "loss": 0.3044, + "step": 7879 + }, + { + "epoch": 0.5, + "grad_norm": 1.9476814347952272, + "learning_rate": 5.315009716048031e-06, + "loss": 0.3045, + "step": 7880 + }, + { + "epoch": 0.5, + "grad_norm": 1.7439968772929455, + "learning_rate": 5.313993256307457e-06, + "loss": 0.2993, + "step": 7881 + }, + { + "epoch": 0.5, + "grad_norm": 1.9012851842038336, + "learning_rate": 5.312976783538743e-06, + "loss": 0.2832, + "step": 7882 + }, + { + "epoch": 0.5, + "grad_norm": 1.7364297307752168, + "learning_rate": 5.3119602977840675e-06, + "loss": 0.274, + "step": 7883 + }, + { + "epoch": 0.5, + "grad_norm": 2.1144750913095374, + "learning_rate": 5.310943799085605e-06, + "loss": 0.285, + "step": 7884 + }, + { + "epoch": 0.5, + "grad_norm": 1.7648450352078844, + "learning_rate": 5.309927287485533e-06, + "loss": 0.2915, + "step": 7885 + }, + { + "epoch": 0.5, + "grad_norm": 1.7324473464984051, + "learning_rate": 5.308910763026025e-06, + "loss": 0.277, + "step": 7886 + }, + { + "epoch": 0.5, + "grad_norm": 3.7847826428085893, + "learning_rate": 5.3078942257492635e-06, + "loss": 0.29, + "step": 7887 + }, + { + "epoch": 0.5, + "grad_norm": 2.407069782983817, + "learning_rate": 5.306877675697422e-06, + "loss": 0.299, + "step": 7888 + }, + { + "epoch": 0.5, + "grad_norm": 2.0505374188782954, + "learning_rate": 5.305861112912682e-06, + "loss": 0.3191, + "step": 7889 + }, + { + "epoch": 0.5, + "grad_norm": 2.4128359405053703, + "learning_rate": 5.3048445374372195e-06, + "loss": 0.2896, + "step": 7890 + }, + { + "epoch": 0.5, + "grad_norm": 4.525059847611393, + "learning_rate": 5.303827949313216e-06, + "loss": 0.2844, + "step": 7891 + }, + { + "epoch": 0.5, + "grad_norm": 2.0767857423777576, + "learning_rate": 5.302811348582851e-06, + "loss": 0.2865, + "step": 7892 + }, + { + "epoch": 0.5, + "grad_norm": 1.8358489094171377, + "learning_rate": 5.301794735288307e-06, + "loss": 0.3444, + "step": 7893 + }, + { + "epoch": 0.5, + "grad_norm": 1.868254505609128, + "learning_rate": 5.300778109471761e-06, + "loss": 0.2877, + "step": 7894 + }, + { + "epoch": 0.5, + "grad_norm": 0.5981025833410399, + "learning_rate": 5.2997614711753995e-06, + "loss": 0.4733, + "step": 7895 + }, + { + "epoch": 0.5, + "grad_norm": 1.6909587767237433, + "learning_rate": 5.298744820441401e-06, + "loss": 0.2701, + "step": 7896 + }, + { + "epoch": 0.5, + "grad_norm": 5.89792792416731, + "learning_rate": 5.297728157311949e-06, + "loss": 0.2771, + "step": 7897 + }, + { + "epoch": 0.5, + "grad_norm": 1.3995807621527219, + "learning_rate": 5.296711481829227e-06, + "loss": 0.2811, + "step": 7898 + }, + { + "epoch": 0.5, + "grad_norm": 3.174499932223166, + "learning_rate": 5.295694794035419e-06, + "loss": 0.2876, + "step": 7899 + }, + { + "epoch": 0.5, + "grad_norm": 3.0959527284495594, + "learning_rate": 5.2946780939727084e-06, + "loss": 0.2969, + "step": 7900 + }, + { + "epoch": 0.5, + "grad_norm": 5.323792377120759, + "learning_rate": 5.29366138168328e-06, + "loss": 0.3005, + "step": 7901 + }, + { + "epoch": 0.5, + "grad_norm": 1.8694081748452838, + "learning_rate": 5.292644657209319e-06, + "loss": 0.2853, + "step": 7902 + }, + { + "epoch": 0.5, + "grad_norm": 2.0469038900242316, + "learning_rate": 5.291627920593014e-06, + "loss": 0.2886, + "step": 7903 + }, + { + "epoch": 0.5, + "grad_norm": 0.5926515851845977, + "learning_rate": 5.290611171876545e-06, + "loss": 0.4816, + "step": 7904 + }, + { + "epoch": 0.5, + "grad_norm": 1.5227893147930167, + "learning_rate": 5.289594411102103e-06, + "loss": 0.2952, + "step": 7905 + }, + { + "epoch": 0.5, + "grad_norm": 2.4201675984421223, + "learning_rate": 5.288577638311876e-06, + "loss": 0.2912, + "step": 7906 + }, + { + "epoch": 0.5, + "grad_norm": 1.7328693828558057, + "learning_rate": 5.287560853548051e-06, + "loss": 0.2797, + "step": 7907 + }, + { + "epoch": 0.5, + "grad_norm": 0.5812456041002827, + "learning_rate": 5.286544056852814e-06, + "loss": 0.489, + "step": 7908 + }, + { + "epoch": 0.5, + "grad_norm": 2.4004950807853747, + "learning_rate": 5.285527248268354e-06, + "loss": 0.2968, + "step": 7909 + }, + { + "epoch": 0.5, + "grad_norm": 2.3279271592857764, + "learning_rate": 5.2845104278368616e-06, + "loss": 0.2952, + "step": 7910 + }, + { + "epoch": 0.5, + "grad_norm": 2.069382801279133, + "learning_rate": 5.283493595600529e-06, + "loss": 0.2866, + "step": 7911 + }, + { + "epoch": 0.5, + "grad_norm": 1.8005879057685519, + "learning_rate": 5.28247675160154e-06, + "loss": 0.2826, + "step": 7912 + }, + { + "epoch": 0.5, + "grad_norm": 1.8240246921078522, + "learning_rate": 5.281459895882091e-06, + "loss": 0.2896, + "step": 7913 + }, + { + "epoch": 0.5, + "grad_norm": 0.5857586805162879, + "learning_rate": 5.28044302848437e-06, + "loss": 0.4779, + "step": 7914 + }, + { + "epoch": 0.5, + "grad_norm": 1.883077599717525, + "learning_rate": 5.279426149450571e-06, + "loss": 0.2655, + "step": 7915 + }, + { + "epoch": 0.5, + "grad_norm": 3.706124916748602, + "learning_rate": 5.278409258822883e-06, + "loss": 0.2961, + "step": 7916 + }, + { + "epoch": 0.5, + "grad_norm": 1.3641752380519125, + "learning_rate": 5.277392356643501e-06, + "loss": 0.2874, + "step": 7917 + }, + { + "epoch": 0.5, + "grad_norm": 2.7134409761385343, + "learning_rate": 5.276375442954618e-06, + "loss": 0.2846, + "step": 7918 + }, + { + "epoch": 0.5, + "grad_norm": 1.7934162210103406, + "learning_rate": 5.275358517798428e-06, + "loss": 0.304, + "step": 7919 + }, + { + "epoch": 0.5, + "grad_norm": 1.784469004248253, + "learning_rate": 5.27434158121712e-06, + "loss": 0.2854, + "step": 7920 + }, + { + "epoch": 0.5, + "grad_norm": 3.640741575724123, + "learning_rate": 5.273324633252897e-06, + "loss": 0.2837, + "step": 7921 + }, + { + "epoch": 0.5, + "grad_norm": 1.6976472680018913, + "learning_rate": 5.272307673947947e-06, + "loss": 0.2747, + "step": 7922 + }, + { + "epoch": 0.5, + "grad_norm": 1.7838076336567272, + "learning_rate": 5.271290703344469e-06, + "loss": 0.2779, + "step": 7923 + }, + { + "epoch": 0.5, + "grad_norm": 1.283616882704457, + "learning_rate": 5.270273721484657e-06, + "loss": 0.2924, + "step": 7924 + }, + { + "epoch": 0.5, + "grad_norm": 1.6596932629621144, + "learning_rate": 5.269256728410709e-06, + "loss": 0.2836, + "step": 7925 + }, + { + "epoch": 0.5, + "grad_norm": 1.3390694006704675, + "learning_rate": 5.268239724164819e-06, + "loss": 0.2877, + "step": 7926 + }, + { + "epoch": 0.5, + "grad_norm": 2.4626165226004626, + "learning_rate": 5.267222708789189e-06, + "loss": 0.2914, + "step": 7927 + }, + { + "epoch": 0.5, + "grad_norm": 2.5017678493186915, + "learning_rate": 5.266205682326013e-06, + "loss": 0.3004, + "step": 7928 + }, + { + "epoch": 0.5, + "grad_norm": 1.4501573454757062, + "learning_rate": 5.265188644817492e-06, + "loss": 0.2717, + "step": 7929 + }, + { + "epoch": 0.5, + "grad_norm": 2.9411583587568177, + "learning_rate": 5.264171596305821e-06, + "loss": 0.3036, + "step": 7930 + }, + { + "epoch": 0.5, + "grad_norm": 1.5920735045894463, + "learning_rate": 5.263154536833202e-06, + "loss": 0.3052, + "step": 7931 + }, + { + "epoch": 0.5, + "grad_norm": 1.8451966896150458, + "learning_rate": 5.262137466441834e-06, + "loss": 0.3133, + "step": 7932 + }, + { + "epoch": 0.5, + "grad_norm": 2.4059217498548913, + "learning_rate": 5.261120385173917e-06, + "loss": 0.2871, + "step": 7933 + }, + { + "epoch": 0.5, + "grad_norm": 2.705879722655072, + "learning_rate": 5.260103293071651e-06, + "loss": 0.2989, + "step": 7934 + }, + { + "epoch": 0.5, + "grad_norm": 2.1456088508382196, + "learning_rate": 5.259086190177237e-06, + "loss": 0.3024, + "step": 7935 + }, + { + "epoch": 0.5, + "grad_norm": 2.79279948609646, + "learning_rate": 5.258069076532877e-06, + "loss": 0.2995, + "step": 7936 + }, + { + "epoch": 0.5, + "grad_norm": 2.050047551143473, + "learning_rate": 5.257051952180774e-06, + "loss": 0.3132, + "step": 7937 + }, + { + "epoch": 0.5, + "grad_norm": 5.05594381130633, + "learning_rate": 5.256034817163127e-06, + "loss": 0.3071, + "step": 7938 + }, + { + "epoch": 0.5, + "grad_norm": 2.0684131487006554, + "learning_rate": 5.255017671522142e-06, + "loss": 0.2834, + "step": 7939 + }, + { + "epoch": 0.5, + "grad_norm": 2.241061238094606, + "learning_rate": 5.254000515300019e-06, + "loss": 0.286, + "step": 7940 + }, + { + "epoch": 0.5, + "grad_norm": 2.7180458792869078, + "learning_rate": 5.252983348538967e-06, + "loss": 0.2931, + "step": 7941 + }, + { + "epoch": 0.5, + "grad_norm": 3.548697770337576, + "learning_rate": 5.2519661712811845e-06, + "loss": 0.2959, + "step": 7942 + }, + { + "epoch": 0.5, + "grad_norm": 2.3352332012162123, + "learning_rate": 5.250948983568876e-06, + "loss": 0.3082, + "step": 7943 + }, + { + "epoch": 0.5, + "grad_norm": 1.7361041982240581, + "learning_rate": 5.249931785444251e-06, + "loss": 0.293, + "step": 7944 + }, + { + "epoch": 0.5, + "grad_norm": 1.7055614049768366, + "learning_rate": 5.248914576949512e-06, + "loss": 0.2985, + "step": 7945 + }, + { + "epoch": 0.5, + "grad_norm": 1.4417005900917659, + "learning_rate": 5.2478973581268645e-06, + "loss": 0.29, + "step": 7946 + }, + { + "epoch": 0.5, + "grad_norm": 1.9337301910545015, + "learning_rate": 5.246880129018515e-06, + "loss": 0.2833, + "step": 7947 + }, + { + "epoch": 0.5, + "grad_norm": 2.5940934482588602, + "learning_rate": 5.24586288966667e-06, + "loss": 0.2961, + "step": 7948 + }, + { + "epoch": 0.5, + "grad_norm": 1.8112097807465304, + "learning_rate": 5.24484564011354e-06, + "loss": 0.3054, + "step": 7949 + }, + { + "epoch": 0.5, + "grad_norm": 3.205560532580205, + "learning_rate": 5.2438283804013265e-06, + "loss": 0.2837, + "step": 7950 + }, + { + "epoch": 0.5, + "grad_norm": 4.236503306564229, + "learning_rate": 5.242811110572243e-06, + "loss": 0.2845, + "step": 7951 + }, + { + "epoch": 0.5, + "grad_norm": 3.203666350475942, + "learning_rate": 5.241793830668492e-06, + "loss": 0.2985, + "step": 7952 + }, + { + "epoch": 0.5, + "grad_norm": 2.0611771440486573, + "learning_rate": 5.240776540732288e-06, + "loss": 0.2676, + "step": 7953 + }, + { + "epoch": 0.5, + "grad_norm": 1.653082607696945, + "learning_rate": 5.239759240805835e-06, + "loss": 0.2984, + "step": 7954 + }, + { + "epoch": 0.5, + "grad_norm": 4.7539020266152905, + "learning_rate": 5.238741930931348e-06, + "loss": 0.2772, + "step": 7955 + }, + { + "epoch": 0.5, + "grad_norm": 2.283071163067402, + "learning_rate": 5.237724611151034e-06, + "loss": 0.2782, + "step": 7956 + }, + { + "epoch": 0.5, + "grad_norm": 0.6218217562479014, + "learning_rate": 5.2367072815071015e-06, + "loss": 0.5029, + "step": 7957 + }, + { + "epoch": 0.5, + "grad_norm": 6.276229364350079, + "learning_rate": 5.235689942041765e-06, + "loss": 0.3007, + "step": 7958 + }, + { + "epoch": 0.5, + "grad_norm": 1.387998062722005, + "learning_rate": 5.234672592797236e-06, + "loss": 0.2815, + "step": 7959 + }, + { + "epoch": 0.5, + "grad_norm": 1.6021124702158773, + "learning_rate": 5.233655233815721e-06, + "loss": 0.2809, + "step": 7960 + }, + { + "epoch": 0.5, + "grad_norm": 5.611070341895981, + "learning_rate": 5.232637865139436e-06, + "loss": 0.2809, + "step": 7961 + }, + { + "epoch": 0.5, + "grad_norm": 1.496121167524374, + "learning_rate": 5.231620486810594e-06, + "loss": 0.3149, + "step": 7962 + }, + { + "epoch": 0.5, + "grad_norm": 1.3004514797850226, + "learning_rate": 5.230603098871406e-06, + "loss": 0.2733, + "step": 7963 + }, + { + "epoch": 0.5, + "grad_norm": 3.787446885363309, + "learning_rate": 5.229585701364086e-06, + "loss": 0.2961, + "step": 7964 + }, + { + "epoch": 0.5, + "grad_norm": 1.5105309186295437, + "learning_rate": 5.228568294330847e-06, + "loss": 0.2878, + "step": 7965 + }, + { + "epoch": 0.5, + "grad_norm": 1.9272804431534816, + "learning_rate": 5.227550877813903e-06, + "loss": 0.299, + "step": 7966 + }, + { + "epoch": 0.5, + "grad_norm": 2.332172028792701, + "learning_rate": 5.226533451855471e-06, + "loss": 0.29, + "step": 7967 + }, + { + "epoch": 0.5, + "grad_norm": 1.8385924181373738, + "learning_rate": 5.225516016497761e-06, + "loss": 0.2868, + "step": 7968 + }, + { + "epoch": 0.5, + "grad_norm": 3.238441197023188, + "learning_rate": 5.224498571782992e-06, + "loss": 0.2826, + "step": 7969 + }, + { + "epoch": 0.5, + "grad_norm": 2.7610732894069905, + "learning_rate": 5.223481117753379e-06, + "loss": 0.2874, + "step": 7970 + }, + { + "epoch": 0.5, + "grad_norm": 0.6148411709942464, + "learning_rate": 5.222463654451138e-06, + "loss": 0.5345, + "step": 7971 + }, + { + "epoch": 0.5, + "grad_norm": 1.850150281025383, + "learning_rate": 5.221446181918484e-06, + "loss": 0.3108, + "step": 7972 + }, + { + "epoch": 0.5, + "grad_norm": 1.8929554689762693, + "learning_rate": 5.220428700197635e-06, + "loss": 0.2887, + "step": 7973 + }, + { + "epoch": 0.5, + "grad_norm": 2.2302322929908756, + "learning_rate": 5.219411209330807e-06, + "loss": 0.3011, + "step": 7974 + }, + { + "epoch": 0.5, + "grad_norm": 2.6597011141365123, + "learning_rate": 5.218393709360219e-06, + "loss": 0.2634, + "step": 7975 + }, + { + "epoch": 0.5, + "grad_norm": 1.8544295318833768, + "learning_rate": 5.217376200328087e-06, + "loss": 0.2901, + "step": 7976 + }, + { + "epoch": 0.5, + "grad_norm": 2.6148198615159677, + "learning_rate": 5.216358682276631e-06, + "loss": 0.2785, + "step": 7977 + }, + { + "epoch": 0.5, + "grad_norm": 1.8760216823100602, + "learning_rate": 5.215341155248069e-06, + "loss": 0.2778, + "step": 7978 + }, + { + "epoch": 0.5, + "grad_norm": 2.19102729214334, + "learning_rate": 5.214323619284619e-06, + "loss": 0.2741, + "step": 7979 + }, + { + "epoch": 0.5, + "grad_norm": 1.5999705552751469, + "learning_rate": 5.213306074428503e-06, + "loss": 0.2764, + "step": 7980 + }, + { + "epoch": 0.5, + "grad_norm": 1.5029067945288253, + "learning_rate": 5.212288520721939e-06, + "loss": 0.2786, + "step": 7981 + }, + { + "epoch": 0.5, + "grad_norm": 4.002508771973122, + "learning_rate": 5.2112709582071464e-06, + "loss": 0.2876, + "step": 7982 + }, + { + "epoch": 0.5, + "grad_norm": 20.874911072768043, + "learning_rate": 5.210253386926346e-06, + "loss": 0.2807, + "step": 7983 + }, + { + "epoch": 0.5, + "grad_norm": 2.293807677620967, + "learning_rate": 5.20923580692176e-06, + "loss": 0.2857, + "step": 7984 + }, + { + "epoch": 0.5, + "grad_norm": 1.896818899562079, + "learning_rate": 5.208218218235609e-06, + "loss": 0.2899, + "step": 7985 + }, + { + "epoch": 0.5, + "grad_norm": 3.450485937256754, + "learning_rate": 5.207200620910114e-06, + "loss": 0.2894, + "step": 7986 + }, + { + "epoch": 0.5, + "grad_norm": 2.0716587200662073, + "learning_rate": 5.206183014987497e-06, + "loss": 0.2711, + "step": 7987 + }, + { + "epoch": 0.5, + "grad_norm": 1.8071708646158895, + "learning_rate": 5.205165400509982e-06, + "loss": 0.2742, + "step": 7988 + }, + { + "epoch": 0.5, + "grad_norm": 1.2390266213904055, + "learning_rate": 5.2041477775197875e-06, + "loss": 0.2896, + "step": 7989 + }, + { + "epoch": 0.5, + "grad_norm": 2.839432018744269, + "learning_rate": 5.20313014605914e-06, + "loss": 0.2774, + "step": 7990 + }, + { + "epoch": 0.5, + "grad_norm": 2.098592260273801, + "learning_rate": 5.202112506170263e-06, + "loss": 0.2836, + "step": 7991 + }, + { + "epoch": 0.5, + "grad_norm": 3.037684427485203, + "learning_rate": 5.201094857895377e-06, + "loss": 0.271, + "step": 7992 + }, + { + "epoch": 0.5, + "grad_norm": 2.0779139106441136, + "learning_rate": 5.200077201276711e-06, + "loss": 0.2819, + "step": 7993 + }, + { + "epoch": 0.5, + "grad_norm": 3.0179452472155095, + "learning_rate": 5.1990595363564845e-06, + "loss": 0.3158, + "step": 7994 + }, + { + "epoch": 0.5, + "grad_norm": 2.2662962933022985, + "learning_rate": 5.198041863176925e-06, + "loss": 0.293, + "step": 7995 + }, + { + "epoch": 0.5, + "grad_norm": 1.694266813304795, + "learning_rate": 5.197024181780256e-06, + "loss": 0.2718, + "step": 7996 + }, + { + "epoch": 0.5, + "grad_norm": 1.5220341333856093, + "learning_rate": 5.196006492208705e-06, + "loss": 0.2784, + "step": 7997 + }, + { + "epoch": 0.5, + "grad_norm": 2.6307678542495885, + "learning_rate": 5.194988794504495e-06, + "loss": 0.3027, + "step": 7998 + }, + { + "epoch": 0.5, + "grad_norm": 1.6771723970344932, + "learning_rate": 5.193971088709855e-06, + "loss": 0.2854, + "step": 7999 + }, + { + "epoch": 0.5, + "grad_norm": 1.6996155697591504, + "learning_rate": 5.192953374867009e-06, + "loss": 0.2816, + "step": 8000 + }, + { + "epoch": 0.5, + "grad_norm": 2.342947985914777, + "learning_rate": 5.1919356530181865e-06, + "loss": 0.277, + "step": 8001 + }, + { + "epoch": 0.5, + "grad_norm": 3.1335009395378743, + "learning_rate": 5.190917923205611e-06, + "loss": 0.2752, + "step": 8002 + }, + { + "epoch": 0.5, + "grad_norm": 1.8525786953817451, + "learning_rate": 5.189900185471511e-06, + "loss": 0.2832, + "step": 8003 + }, + { + "epoch": 0.5, + "grad_norm": 4.18225132389599, + "learning_rate": 5.188882439858117e-06, + "loss": 0.2876, + "step": 8004 + }, + { + "epoch": 0.5, + "grad_norm": 2.368988241224818, + "learning_rate": 5.187864686407656e-06, + "loss": 0.2769, + "step": 8005 + }, + { + "epoch": 0.5, + "grad_norm": 2.2003221974676173, + "learning_rate": 5.186846925162353e-06, + "loss": 0.2889, + "step": 8006 + }, + { + "epoch": 0.5, + "grad_norm": 1.4983689874048514, + "learning_rate": 5.1858291561644394e-06, + "loss": 0.2831, + "step": 8007 + }, + { + "epoch": 0.5, + "grad_norm": 1.8058272796197763, + "learning_rate": 5.184811379456145e-06, + "loss": 0.2903, + "step": 8008 + }, + { + "epoch": 0.5, + "grad_norm": 1.7720071199535712, + "learning_rate": 5.183793595079697e-06, + "loss": 0.2825, + "step": 8009 + }, + { + "epoch": 0.5, + "grad_norm": 1.181826200879759, + "learning_rate": 5.1827758030773275e-06, + "loss": 0.2599, + "step": 8010 + }, + { + "epoch": 0.5, + "grad_norm": 3.567614006042909, + "learning_rate": 5.181758003491265e-06, + "loss": 0.2941, + "step": 8011 + }, + { + "epoch": 0.5, + "grad_norm": 2.1988998909507154, + "learning_rate": 5.1807401963637404e-06, + "loss": 0.29, + "step": 8012 + }, + { + "epoch": 0.5, + "grad_norm": 2.083542254299491, + "learning_rate": 5.179722381736983e-06, + "loss": 0.2762, + "step": 8013 + }, + { + "epoch": 0.5, + "grad_norm": 1.2464794092514846, + "learning_rate": 5.178704559653227e-06, + "loss": 0.271, + "step": 8014 + }, + { + "epoch": 0.5, + "grad_norm": 0.59916161382316, + "learning_rate": 5.1776867301547e-06, + "loss": 0.5044, + "step": 8015 + }, + { + "epoch": 0.5, + "grad_norm": 2.2064660580278344, + "learning_rate": 5.176668893283634e-06, + "loss": 0.2926, + "step": 8016 + }, + { + "epoch": 0.5, + "grad_norm": 1.6907536057633934, + "learning_rate": 5.175651049082262e-06, + "loss": 0.2787, + "step": 8017 + }, + { + "epoch": 0.5, + "grad_norm": 2.519586990692234, + "learning_rate": 5.174633197592818e-06, + "loss": 0.283, + "step": 8018 + }, + { + "epoch": 0.5, + "grad_norm": 2.131038688080356, + "learning_rate": 5.1736153388575305e-06, + "loss": 0.2877, + "step": 8019 + }, + { + "epoch": 0.5, + "grad_norm": 1.3906065783995365, + "learning_rate": 5.172597472918635e-06, + "loss": 0.2841, + "step": 8020 + }, + { + "epoch": 0.5, + "grad_norm": 1.3181186580179853, + "learning_rate": 5.1715795998183625e-06, + "loss": 0.2885, + "step": 8021 + }, + { + "epoch": 0.5, + "grad_norm": 2.0147444254565037, + "learning_rate": 5.1705617195989495e-06, + "loss": 0.2849, + "step": 8022 + }, + { + "epoch": 0.5, + "grad_norm": 3.284732829682858, + "learning_rate": 5.169543832302627e-06, + "loss": 0.2959, + "step": 8023 + }, + { + "epoch": 0.5, + "grad_norm": 2.7715806483230008, + "learning_rate": 5.168525937971629e-06, + "loss": 0.2925, + "step": 8024 + }, + { + "epoch": 0.5, + "grad_norm": 1.5507911583763665, + "learning_rate": 5.167508036648191e-06, + "loss": 0.3008, + "step": 8025 + }, + { + "epoch": 0.5, + "grad_norm": 1.8122030484431046, + "learning_rate": 5.166490128374548e-06, + "loss": 0.2793, + "step": 8026 + }, + { + "epoch": 0.5, + "grad_norm": 3.2404819603162123, + "learning_rate": 5.165472213192934e-06, + "loss": 0.3042, + "step": 8027 + }, + { + "epoch": 0.5, + "grad_norm": 1.9126359663228494, + "learning_rate": 5.164454291145582e-06, + "loss": 0.2706, + "step": 8028 + }, + { + "epoch": 0.5, + "grad_norm": 1.6862084248984586, + "learning_rate": 5.163436362274731e-06, + "loss": 0.2823, + "step": 8029 + }, + { + "epoch": 0.5, + "grad_norm": 1.8249442111586103, + "learning_rate": 5.162418426622615e-06, + "loss": 0.2798, + "step": 8030 + }, + { + "epoch": 0.51, + "grad_norm": 1.659652593548811, + "learning_rate": 5.1614004842314694e-06, + "loss": 0.2699, + "step": 8031 + }, + { + "epoch": 0.51, + "grad_norm": 1.9987159271790569, + "learning_rate": 5.160382535143531e-06, + "loss": 0.2833, + "step": 8032 + }, + { + "epoch": 0.51, + "grad_norm": 1.9910294536543716, + "learning_rate": 5.159364579401036e-06, + "loss": 0.2881, + "step": 8033 + }, + { + "epoch": 0.51, + "grad_norm": 10.151206820163265, + "learning_rate": 5.158346617046221e-06, + "loss": 0.2704, + "step": 8034 + }, + { + "epoch": 0.51, + "grad_norm": 1.932043974434462, + "learning_rate": 5.157328648121325e-06, + "loss": 0.2865, + "step": 8035 + }, + { + "epoch": 0.51, + "grad_norm": 3.8742209859969425, + "learning_rate": 5.156310672668584e-06, + "loss": 0.2841, + "step": 8036 + }, + { + "epoch": 0.51, + "grad_norm": 1.6486940944055828, + "learning_rate": 5.155292690730235e-06, + "loss": 0.2925, + "step": 8037 + }, + { + "epoch": 0.51, + "grad_norm": 1.6656818332786414, + "learning_rate": 5.154274702348517e-06, + "loss": 0.2841, + "step": 8038 + }, + { + "epoch": 0.51, + "grad_norm": 1.736002438024291, + "learning_rate": 5.153256707565666e-06, + "loss": 0.2774, + "step": 8039 + }, + { + "epoch": 0.51, + "grad_norm": 1.7506932901812695, + "learning_rate": 5.152238706423925e-06, + "loss": 0.2733, + "step": 8040 + }, + { + "epoch": 0.51, + "grad_norm": 1.6446735246618587, + "learning_rate": 5.151220698965526e-06, + "loss": 0.2824, + "step": 8041 + }, + { + "epoch": 0.51, + "grad_norm": 3.3495018044799867, + "learning_rate": 5.1502026852327136e-06, + "loss": 0.293, + "step": 8042 + }, + { + "epoch": 0.51, + "grad_norm": 1.9900894311739992, + "learning_rate": 5.149184665267725e-06, + "loss": 0.2782, + "step": 8043 + }, + { + "epoch": 0.51, + "grad_norm": 2.4473681503604716, + "learning_rate": 5.148166639112799e-06, + "loss": 0.2822, + "step": 8044 + }, + { + "epoch": 0.51, + "grad_norm": 2.715485009361348, + "learning_rate": 5.1471486068101774e-06, + "loss": 0.3001, + "step": 8045 + }, + { + "epoch": 0.51, + "grad_norm": 1.3454430352257016, + "learning_rate": 5.146130568402097e-06, + "loss": 0.2769, + "step": 8046 + }, + { + "epoch": 0.51, + "grad_norm": 2.3760925397182366, + "learning_rate": 5.1451125239308e-06, + "loss": 0.2995, + "step": 8047 + }, + { + "epoch": 0.51, + "grad_norm": 1.4208275815335933, + "learning_rate": 5.144094473438528e-06, + "loss": 0.2735, + "step": 8048 + }, + { + "epoch": 0.51, + "grad_norm": 1.4885921654964185, + "learning_rate": 5.14307641696752e-06, + "loss": 0.2719, + "step": 8049 + }, + { + "epoch": 0.51, + "grad_norm": 1.845783800197305, + "learning_rate": 5.142058354560016e-06, + "loss": 0.2892, + "step": 8050 + }, + { + "epoch": 0.51, + "grad_norm": 1.552123748741543, + "learning_rate": 5.141040286258259e-06, + "loss": 0.2753, + "step": 8051 + }, + { + "epoch": 0.51, + "grad_norm": 2.1764173994410405, + "learning_rate": 5.140022212104492e-06, + "loss": 0.3153, + "step": 8052 + }, + { + "epoch": 0.51, + "grad_norm": 1.3296462212815934, + "learning_rate": 5.139004132140953e-06, + "loss": 0.2999, + "step": 8053 + }, + { + "epoch": 0.51, + "grad_norm": 1.8875624634374253, + "learning_rate": 5.137986046409884e-06, + "loss": 0.2735, + "step": 8054 + }, + { + "epoch": 0.51, + "grad_norm": 1.5061306577739693, + "learning_rate": 5.136967954953531e-06, + "loss": 0.309, + "step": 8055 + }, + { + "epoch": 0.51, + "grad_norm": 1.5540241913902926, + "learning_rate": 5.135949857814134e-06, + "loss": 0.2859, + "step": 8056 + }, + { + "epoch": 0.51, + "grad_norm": 2.2405392597573455, + "learning_rate": 5.134931755033936e-06, + "loss": 0.2963, + "step": 8057 + }, + { + "epoch": 0.51, + "grad_norm": 4.134265261415188, + "learning_rate": 5.13391364665518e-06, + "loss": 0.3058, + "step": 8058 + }, + { + "epoch": 0.51, + "grad_norm": 1.6670677602740802, + "learning_rate": 5.132895532720108e-06, + "loss": 0.2937, + "step": 8059 + }, + { + "epoch": 0.51, + "grad_norm": 1.2613904123723603, + "learning_rate": 5.131877413270965e-06, + "loss": 0.2802, + "step": 8060 + }, + { + "epoch": 0.51, + "grad_norm": 1.5338154753482607, + "learning_rate": 5.130859288349993e-06, + "loss": 0.2936, + "step": 8061 + }, + { + "epoch": 0.51, + "grad_norm": 1.9411781150630898, + "learning_rate": 5.129841157999438e-06, + "loss": 0.2738, + "step": 8062 + }, + { + "epoch": 0.51, + "grad_norm": 2.2707669150827714, + "learning_rate": 5.128823022261542e-06, + "loss": 0.2865, + "step": 8063 + }, + { + "epoch": 0.51, + "grad_norm": 11.183724499773126, + "learning_rate": 5.127804881178551e-06, + "loss": 0.3193, + "step": 8064 + }, + { + "epoch": 0.51, + "grad_norm": 1.8548627104452602, + "learning_rate": 5.126786734792706e-06, + "loss": 0.2981, + "step": 8065 + }, + { + "epoch": 0.51, + "grad_norm": 2.4332607932074577, + "learning_rate": 5.1257685831462565e-06, + "loss": 0.3022, + "step": 8066 + }, + { + "epoch": 0.51, + "grad_norm": 2.5992390097914027, + "learning_rate": 5.124750426281444e-06, + "loss": 0.3095, + "step": 8067 + }, + { + "epoch": 0.51, + "grad_norm": 12.319122411374217, + "learning_rate": 5.123732264240517e-06, + "loss": 0.3037, + "step": 8068 + }, + { + "epoch": 0.51, + "grad_norm": 2.5635183548913294, + "learning_rate": 5.122714097065714e-06, + "loss": 0.2961, + "step": 8069 + }, + { + "epoch": 0.51, + "grad_norm": 2.354103677632639, + "learning_rate": 5.1216959247992896e-06, + "loss": 0.3098, + "step": 8070 + }, + { + "epoch": 0.51, + "grad_norm": 0.6294266845144957, + "learning_rate": 5.120677747483482e-06, + "loss": 0.4819, + "step": 8071 + }, + { + "epoch": 0.51, + "grad_norm": 1.5833706118593205, + "learning_rate": 5.119659565160542e-06, + "loss": 0.3056, + "step": 8072 + }, + { + "epoch": 0.51, + "grad_norm": 3.2022867462856706, + "learning_rate": 5.1186413778727125e-06, + "loss": 0.2961, + "step": 8073 + }, + { + "epoch": 0.51, + "grad_norm": 1.2339430255592672, + "learning_rate": 5.117623185662245e-06, + "loss": 0.2904, + "step": 8074 + }, + { + "epoch": 0.51, + "grad_norm": 2.4727351428623927, + "learning_rate": 5.11660498857138e-06, + "loss": 0.2811, + "step": 8075 + }, + { + "epoch": 0.51, + "grad_norm": 1.4549063477747954, + "learning_rate": 5.115586786642367e-06, + "loss": 0.3009, + "step": 8076 + }, + { + "epoch": 0.51, + "grad_norm": 2.154997404078864, + "learning_rate": 5.1145685799174525e-06, + "loss": 0.2995, + "step": 8077 + }, + { + "epoch": 0.51, + "grad_norm": 3.298505816209827, + "learning_rate": 5.113550368438885e-06, + "loss": 0.2827, + "step": 8078 + }, + { + "epoch": 0.51, + "grad_norm": 1.4602174794329403, + "learning_rate": 5.112532152248911e-06, + "loss": 0.2928, + "step": 8079 + }, + { + "epoch": 0.51, + "grad_norm": 1.9825143170171933, + "learning_rate": 5.111513931389777e-06, + "loss": 0.299, + "step": 8080 + }, + { + "epoch": 0.51, + "grad_norm": 2.405679316456453, + "learning_rate": 5.110495705903734e-06, + "loss": 0.3207, + "step": 8081 + }, + { + "epoch": 0.51, + "grad_norm": 3.0238107680183663, + "learning_rate": 5.109477475833027e-06, + "loss": 0.2906, + "step": 8082 + }, + { + "epoch": 0.51, + "grad_norm": 1.418260652052426, + "learning_rate": 5.108459241219905e-06, + "loss": 0.2823, + "step": 8083 + }, + { + "epoch": 0.51, + "grad_norm": 0.5686712902216726, + "learning_rate": 5.107441002106616e-06, + "loss": 0.5054, + "step": 8084 + }, + { + "epoch": 0.51, + "grad_norm": 3.1012274202739682, + "learning_rate": 5.106422758535408e-06, + "loss": 0.2993, + "step": 8085 + }, + { + "epoch": 0.51, + "grad_norm": 1.3269839688973062, + "learning_rate": 5.105404510548534e-06, + "loss": 0.2973, + "step": 8086 + }, + { + "epoch": 0.51, + "grad_norm": 2.3121186570343597, + "learning_rate": 5.1043862581882375e-06, + "loss": 0.2844, + "step": 8087 + }, + { + "epoch": 0.51, + "grad_norm": 1.5471815690013278, + "learning_rate": 5.103368001496769e-06, + "loss": 0.2817, + "step": 8088 + }, + { + "epoch": 0.51, + "grad_norm": 2.5163684845845093, + "learning_rate": 5.102349740516379e-06, + "loss": 0.2978, + "step": 8089 + }, + { + "epoch": 0.51, + "grad_norm": 1.8501363901100054, + "learning_rate": 5.101331475289318e-06, + "loss": 0.2905, + "step": 8090 + }, + { + "epoch": 0.51, + "grad_norm": 1.9429442626330335, + "learning_rate": 5.100313205857832e-06, + "loss": 0.3, + "step": 8091 + }, + { + "epoch": 0.51, + "grad_norm": 2.242598751930439, + "learning_rate": 5.099294932264174e-06, + "loss": 0.2949, + "step": 8092 + }, + { + "epoch": 0.51, + "grad_norm": 8.958946009346706, + "learning_rate": 5.098276654550593e-06, + "loss": 0.2762, + "step": 8093 + }, + { + "epoch": 0.51, + "grad_norm": 2.0006594531298725, + "learning_rate": 5.097258372759339e-06, + "loss": 0.2834, + "step": 8094 + }, + { + "epoch": 0.51, + "grad_norm": 1.4309018910323494, + "learning_rate": 5.096240086932661e-06, + "loss": 0.2853, + "step": 8095 + }, + { + "epoch": 0.51, + "grad_norm": 1.5859847155762092, + "learning_rate": 5.095221797112814e-06, + "loss": 0.2821, + "step": 8096 + }, + { + "epoch": 0.51, + "grad_norm": 1.3161250674204685, + "learning_rate": 5.094203503342042e-06, + "loss": 0.2821, + "step": 8097 + }, + { + "epoch": 0.51, + "grad_norm": 0.6229600351794886, + "learning_rate": 5.093185205662602e-06, + "loss": 0.5152, + "step": 8098 + }, + { + "epoch": 0.51, + "grad_norm": 2.575849273510752, + "learning_rate": 5.09216690411674e-06, + "loss": 0.2875, + "step": 8099 + }, + { + "epoch": 0.51, + "grad_norm": 1.3971618086227922, + "learning_rate": 5.091148598746711e-06, + "loss": 0.2902, + "step": 8100 + }, + { + "epoch": 0.51, + "grad_norm": 3.0548110743047925, + "learning_rate": 5.090130289594764e-06, + "loss": 0.2877, + "step": 8101 + }, + { + "epoch": 0.51, + "grad_norm": 2.410056373113431, + "learning_rate": 5.089111976703151e-06, + "loss": 0.2888, + "step": 8102 + }, + { + "epoch": 0.51, + "grad_norm": 1.2334959627674285, + "learning_rate": 5.088093660114125e-06, + "loss": 0.2724, + "step": 8103 + }, + { + "epoch": 0.51, + "grad_norm": 2.1024339018243983, + "learning_rate": 5.087075339869937e-06, + "loss": 0.2774, + "step": 8104 + }, + { + "epoch": 0.51, + "grad_norm": 1.7350354360047069, + "learning_rate": 5.086057016012836e-06, + "loss": 0.2965, + "step": 8105 + }, + { + "epoch": 0.51, + "grad_norm": 11.461945190557998, + "learning_rate": 5.085038688585079e-06, + "loss": 0.2894, + "step": 8106 + }, + { + "epoch": 0.51, + "grad_norm": 2.1600860846477663, + "learning_rate": 5.0840203576289135e-06, + "loss": 0.2837, + "step": 8107 + }, + { + "epoch": 0.51, + "grad_norm": 1.1997379931564873, + "learning_rate": 5.083002023186596e-06, + "loss": 0.2927, + "step": 8108 + }, + { + "epoch": 0.51, + "grad_norm": 1.3115319555158724, + "learning_rate": 5.081983685300377e-06, + "loss": 0.2974, + "step": 8109 + }, + { + "epoch": 0.51, + "grad_norm": 3.0692406318852896, + "learning_rate": 5.080965344012509e-06, + "loss": 0.2764, + "step": 8110 + }, + { + "epoch": 0.51, + "grad_norm": 2.957828967322293, + "learning_rate": 5.079946999365244e-06, + "loss": 0.2705, + "step": 8111 + }, + { + "epoch": 0.51, + "grad_norm": 2.403710494563031, + "learning_rate": 5.078928651400838e-06, + "loss": 0.2998, + "step": 8112 + }, + { + "epoch": 0.51, + "grad_norm": 2.3844751932063026, + "learning_rate": 5.07791030016154e-06, + "loss": 0.2831, + "step": 8113 + }, + { + "epoch": 0.51, + "grad_norm": 1.8789455986311219, + "learning_rate": 5.076891945689606e-06, + "loss": 0.2876, + "step": 8114 + }, + { + "epoch": 0.51, + "grad_norm": 2.2053585238370856, + "learning_rate": 5.075873588027288e-06, + "loss": 0.3017, + "step": 8115 + }, + { + "epoch": 0.51, + "grad_norm": 1.981578176439994, + "learning_rate": 5.074855227216842e-06, + "loss": 0.2856, + "step": 8116 + }, + { + "epoch": 0.51, + "grad_norm": 1.2393565715017612, + "learning_rate": 5.073836863300517e-06, + "loss": 0.2858, + "step": 8117 + }, + { + "epoch": 0.51, + "grad_norm": 2.124483252278634, + "learning_rate": 5.072818496320572e-06, + "loss": 0.3109, + "step": 8118 + }, + { + "epoch": 0.51, + "grad_norm": 2.7475757024501903, + "learning_rate": 5.071800126319256e-06, + "loss": 0.3005, + "step": 8119 + }, + { + "epoch": 0.51, + "grad_norm": 1.9141583768087556, + "learning_rate": 5.070781753338828e-06, + "loss": 0.2769, + "step": 8120 + }, + { + "epoch": 0.51, + "grad_norm": 1.451323318961555, + "learning_rate": 5.069763377421536e-06, + "loss": 0.3067, + "step": 8121 + }, + { + "epoch": 0.51, + "grad_norm": 1.5071921951160125, + "learning_rate": 5.06874499860964e-06, + "loss": 0.3003, + "step": 8122 + }, + { + "epoch": 0.51, + "grad_norm": 3.0897711700626984, + "learning_rate": 5.067726616945391e-06, + "loss": 0.2949, + "step": 8123 + }, + { + "epoch": 0.51, + "grad_norm": 2.4642319003419675, + "learning_rate": 5.066708232471045e-06, + "loss": 0.2986, + "step": 8124 + }, + { + "epoch": 0.51, + "grad_norm": 2.7422941442622264, + "learning_rate": 5.065689845228854e-06, + "loss": 0.2899, + "step": 8125 + }, + { + "epoch": 0.51, + "grad_norm": 1.8900938353400445, + "learning_rate": 5.064671455261078e-06, + "loss": 0.3086, + "step": 8126 + }, + { + "epoch": 0.51, + "grad_norm": 2.1285408519171547, + "learning_rate": 5.063653062609966e-06, + "loss": 0.298, + "step": 8127 + }, + { + "epoch": 0.51, + "grad_norm": 1.8505723361191253, + "learning_rate": 5.062634667317776e-06, + "loss": 0.3228, + "step": 8128 + }, + { + "epoch": 0.51, + "grad_norm": 10.772859833035717, + "learning_rate": 5.061616269426761e-06, + "loss": 0.2894, + "step": 8129 + }, + { + "epoch": 0.51, + "grad_norm": 1.9855041274032852, + "learning_rate": 5.06059786897918e-06, + "loss": 0.2919, + "step": 8130 + }, + { + "epoch": 0.51, + "grad_norm": 1.6736157068345445, + "learning_rate": 5.059579466017285e-06, + "loss": 0.2881, + "step": 8131 + }, + { + "epoch": 0.51, + "grad_norm": 1.8385217449012525, + "learning_rate": 5.058561060583331e-06, + "loss": 0.2595, + "step": 8132 + }, + { + "epoch": 0.51, + "grad_norm": 1.528387329673985, + "learning_rate": 5.057542652719574e-06, + "loss": 0.2834, + "step": 8133 + }, + { + "epoch": 0.51, + "grad_norm": 1.7429707142645707, + "learning_rate": 5.056524242468274e-06, + "loss": 0.2893, + "step": 8134 + }, + { + "epoch": 0.51, + "grad_norm": 2.7041324964610656, + "learning_rate": 5.055505829871678e-06, + "loss": 0.2857, + "step": 8135 + }, + { + "epoch": 0.51, + "grad_norm": 1.4335164106800034, + "learning_rate": 5.054487414972049e-06, + "loss": 0.2865, + "step": 8136 + }, + { + "epoch": 0.51, + "grad_norm": 1.9924978004561862, + "learning_rate": 5.053468997811641e-06, + "loss": 0.2778, + "step": 8137 + }, + { + "epoch": 0.51, + "grad_norm": 40.865144961517984, + "learning_rate": 5.052450578432708e-06, + "loss": 0.2702, + "step": 8138 + }, + { + "epoch": 0.51, + "grad_norm": 1.3016727652926572, + "learning_rate": 5.051432156877508e-06, + "loss": 0.2788, + "step": 8139 + }, + { + "epoch": 0.51, + "grad_norm": 1.9413116312589866, + "learning_rate": 5.050413733188296e-06, + "loss": 0.298, + "step": 8140 + }, + { + "epoch": 0.51, + "grad_norm": 1.7847582140692475, + "learning_rate": 5.049395307407329e-06, + "loss": 0.2858, + "step": 8141 + }, + { + "epoch": 0.51, + "grad_norm": 1.529547832735798, + "learning_rate": 5.048376879576864e-06, + "loss": 0.2785, + "step": 8142 + }, + { + "epoch": 0.51, + "grad_norm": 2.6373970021235005, + "learning_rate": 5.047358449739154e-06, + "loss": 0.3086, + "step": 8143 + }, + { + "epoch": 0.51, + "grad_norm": 2.66063556871953, + "learning_rate": 5.046340017936459e-06, + "loss": 0.2812, + "step": 8144 + }, + { + "epoch": 0.51, + "grad_norm": 2.173410028038673, + "learning_rate": 5.045321584211035e-06, + "loss": 0.2834, + "step": 8145 + }, + { + "epoch": 0.51, + "grad_norm": 8.189909801659308, + "learning_rate": 5.044303148605137e-06, + "loss": 0.3044, + "step": 8146 + }, + { + "epoch": 0.51, + "grad_norm": 2.449596123719175, + "learning_rate": 5.043284711161022e-06, + "loss": 0.3042, + "step": 8147 + }, + { + "epoch": 0.51, + "grad_norm": 2.1180777194657785, + "learning_rate": 5.042266271920949e-06, + "loss": 0.2785, + "step": 8148 + }, + { + "epoch": 0.51, + "grad_norm": 2.8784394804409024, + "learning_rate": 5.041247830927173e-06, + "loss": 0.3005, + "step": 8149 + }, + { + "epoch": 0.51, + "grad_norm": 1.6236132036555047, + "learning_rate": 5.040229388221952e-06, + "loss": 0.2856, + "step": 8150 + }, + { + "epoch": 0.51, + "grad_norm": 2.4877568330618107, + "learning_rate": 5.03921094384754e-06, + "loss": 0.2725, + "step": 8151 + }, + { + "epoch": 0.51, + "grad_norm": 6.779632116067725, + "learning_rate": 5.038192497846198e-06, + "loss": 0.2945, + "step": 8152 + }, + { + "epoch": 0.51, + "grad_norm": 1.8263603784506386, + "learning_rate": 5.037174050260181e-06, + "loss": 0.2847, + "step": 8153 + }, + { + "epoch": 0.51, + "grad_norm": 2.1698830201554764, + "learning_rate": 5.036155601131747e-06, + "loss": 0.2729, + "step": 8154 + }, + { + "epoch": 0.51, + "grad_norm": 3.4927890770584638, + "learning_rate": 5.035137150503151e-06, + "loss": 0.3065, + "step": 8155 + }, + { + "epoch": 0.51, + "grad_norm": 1.5545318227740046, + "learning_rate": 5.034118698416654e-06, + "loss": 0.2747, + "step": 8156 + }, + { + "epoch": 0.51, + "grad_norm": 1.4572955541828734, + "learning_rate": 5.033100244914512e-06, + "loss": 0.2763, + "step": 8157 + }, + { + "epoch": 0.51, + "grad_norm": 5.746315447390973, + "learning_rate": 5.03208179003898e-06, + "loss": 0.3129, + "step": 8158 + }, + { + "epoch": 0.51, + "grad_norm": 1.4664765942120763, + "learning_rate": 5.03106333383232e-06, + "loss": 0.271, + "step": 8159 + }, + { + "epoch": 0.51, + "grad_norm": 14.394632767386923, + "learning_rate": 5.030044876336786e-06, + "loss": 0.3045, + "step": 8160 + }, + { + "epoch": 0.51, + "grad_norm": 1.9123375926286352, + "learning_rate": 5.029026417594637e-06, + "loss": 0.3159, + "step": 8161 + }, + { + "epoch": 0.51, + "grad_norm": 3.5269617215825892, + "learning_rate": 5.02800795764813e-06, + "loss": 0.2955, + "step": 8162 + }, + { + "epoch": 0.51, + "grad_norm": 16.546546453695523, + "learning_rate": 5.026989496539523e-06, + "loss": 0.3047, + "step": 8163 + }, + { + "epoch": 0.51, + "grad_norm": 1.9162045761280904, + "learning_rate": 5.025971034311075e-06, + "loss": 0.2804, + "step": 8164 + }, + { + "epoch": 0.51, + "grad_norm": 2.3073722605086497, + "learning_rate": 5.024952571005041e-06, + "loss": 0.3066, + "step": 8165 + }, + { + "epoch": 0.51, + "grad_norm": 1.9830524460973746, + "learning_rate": 5.02393410666368e-06, + "loss": 0.2835, + "step": 8166 + }, + { + "epoch": 0.51, + "grad_norm": 2.961502319580719, + "learning_rate": 5.022915641329252e-06, + "loss": 0.2923, + "step": 8167 + }, + { + "epoch": 0.51, + "grad_norm": 2.189041131058403, + "learning_rate": 5.021897175044014e-06, + "loss": 0.2874, + "step": 8168 + }, + { + "epoch": 0.51, + "grad_norm": 1.6629475421529214, + "learning_rate": 5.020878707850222e-06, + "loss": 0.2778, + "step": 8169 + }, + { + "epoch": 0.51, + "grad_norm": 1.6436906042157808, + "learning_rate": 5.0198602397901355e-06, + "loss": 0.2887, + "step": 8170 + }, + { + "epoch": 0.51, + "grad_norm": 2.706239287634321, + "learning_rate": 5.018841770906011e-06, + "loss": 0.2839, + "step": 8171 + }, + { + "epoch": 0.51, + "grad_norm": 3.700147076277438, + "learning_rate": 5.017823301240111e-06, + "loss": 0.306, + "step": 8172 + }, + { + "epoch": 0.51, + "grad_norm": 2.644524884182939, + "learning_rate": 5.016804830834687e-06, + "loss": 0.2715, + "step": 8173 + }, + { + "epoch": 0.51, + "grad_norm": 1.574413984967847, + "learning_rate": 5.015786359732003e-06, + "loss": 0.2824, + "step": 8174 + }, + { + "epoch": 0.51, + "grad_norm": 2.6463092278210696, + "learning_rate": 5.014767887974316e-06, + "loss": 0.2892, + "step": 8175 + }, + { + "epoch": 0.51, + "grad_norm": 2.8302860169498425, + "learning_rate": 5.013749415603881e-06, + "loss": 0.2733, + "step": 8176 + }, + { + "epoch": 0.51, + "grad_norm": 1.6922566209092376, + "learning_rate": 5.0127309426629575e-06, + "loss": 0.2665, + "step": 8177 + }, + { + "epoch": 0.51, + "grad_norm": 2.767475220858458, + "learning_rate": 5.011712469193808e-06, + "loss": 0.285, + "step": 8178 + }, + { + "epoch": 0.51, + "grad_norm": 2.821881597810828, + "learning_rate": 5.010693995238684e-06, + "loss": 0.2857, + "step": 8179 + }, + { + "epoch": 0.51, + "grad_norm": 1.4949420014722998, + "learning_rate": 5.009675520839851e-06, + "loss": 0.2976, + "step": 8180 + }, + { + "epoch": 0.51, + "grad_norm": 1.9029231329131013, + "learning_rate": 5.008657046039559e-06, + "loss": 0.27, + "step": 8181 + }, + { + "epoch": 0.51, + "grad_norm": 1.8667607852679868, + "learning_rate": 5.007638570880073e-06, + "loss": 0.2902, + "step": 8182 + }, + { + "epoch": 0.51, + "grad_norm": 0.572467918809941, + "learning_rate": 5.0066200954036495e-06, + "loss": 0.4681, + "step": 8183 + }, + { + "epoch": 0.51, + "grad_norm": 3.2587774934395273, + "learning_rate": 5.005601619652546e-06, + "loss": 0.303, + "step": 8184 + }, + { + "epoch": 0.51, + "grad_norm": 1.9537089598222606, + "learning_rate": 5.00458314366902e-06, + "loss": 0.2837, + "step": 8185 + }, + { + "epoch": 0.51, + "grad_norm": 1.4045046250409439, + "learning_rate": 5.003564667495334e-06, + "loss": 0.2768, + "step": 8186 + }, + { + "epoch": 0.51, + "grad_norm": 1.526752482442207, + "learning_rate": 5.002546191173742e-06, + "loss": 0.2804, + "step": 8187 + }, + { + "epoch": 0.51, + "grad_norm": 1.8022644753475676, + "learning_rate": 5.0015277147465035e-06, + "loss": 0.2986, + "step": 8188 + }, + { + "epoch": 0.51, + "grad_norm": 1.6767843667420423, + "learning_rate": 5.000509238255877e-06, + "loss": 0.2828, + "step": 8189 + }, + { + "epoch": 0.52, + "grad_norm": 2.3425617857572867, + "learning_rate": 4.999490761744123e-06, + "loss": 0.2862, + "step": 8190 + }, + { + "epoch": 0.52, + "grad_norm": 5.9245674360968374, + "learning_rate": 4.9984722852534964e-06, + "loss": 0.2871, + "step": 8191 + }, + { + "epoch": 0.52, + "grad_norm": 2.766118190616532, + "learning_rate": 4.99745380882626e-06, + "loss": 0.2969, + "step": 8192 + }, + { + "epoch": 0.52, + "grad_norm": 1.9747232663183154, + "learning_rate": 4.996435332504668e-06, + "loss": 0.2745, + "step": 8193 + }, + { + "epoch": 0.52, + "grad_norm": 2.8748079083011437, + "learning_rate": 4.995416856330981e-06, + "loss": 0.2835, + "step": 8194 + }, + { + "epoch": 0.52, + "grad_norm": 2.8798777101177233, + "learning_rate": 4.994398380347456e-06, + "loss": 0.2848, + "step": 8195 + }, + { + "epoch": 0.52, + "grad_norm": 2.639808692388689, + "learning_rate": 4.993379904596353e-06, + "loss": 0.2872, + "step": 8196 + }, + { + "epoch": 0.52, + "grad_norm": 2.1435094676835655, + "learning_rate": 4.992361429119927e-06, + "loss": 0.2858, + "step": 8197 + }, + { + "epoch": 0.52, + "grad_norm": 1.9851074493971386, + "learning_rate": 4.991342953960442e-06, + "loss": 0.2826, + "step": 8198 + }, + { + "epoch": 0.52, + "grad_norm": 1.499481015971225, + "learning_rate": 4.990324479160151e-06, + "loss": 0.3049, + "step": 8199 + }, + { + "epoch": 0.52, + "grad_norm": 1.4346083663205933, + "learning_rate": 4.989306004761317e-06, + "loss": 0.2792, + "step": 8200 + }, + { + "epoch": 0.52, + "grad_norm": 5.699142284733824, + "learning_rate": 4.988287530806194e-06, + "loss": 0.3179, + "step": 8201 + }, + { + "epoch": 0.52, + "grad_norm": 1.651240072834334, + "learning_rate": 4.987269057337043e-06, + "loss": 0.2788, + "step": 8202 + }, + { + "epoch": 0.52, + "grad_norm": 1.6379106598045439, + "learning_rate": 4.98625058439612e-06, + "loss": 0.2961, + "step": 8203 + }, + { + "epoch": 0.52, + "grad_norm": 2.0033425123299136, + "learning_rate": 4.985232112025688e-06, + "loss": 0.3004, + "step": 8204 + }, + { + "epoch": 0.52, + "grad_norm": 1.4606393446522026, + "learning_rate": 4.984213640267996e-06, + "loss": 0.2692, + "step": 8205 + }, + { + "epoch": 0.52, + "grad_norm": 2.1594238111119237, + "learning_rate": 4.983195169165313e-06, + "loss": 0.3018, + "step": 8206 + }, + { + "epoch": 0.52, + "grad_norm": 4.073785619809118, + "learning_rate": 4.9821766987598905e-06, + "loss": 0.2803, + "step": 8207 + }, + { + "epoch": 0.52, + "grad_norm": 0.6358225243717622, + "learning_rate": 4.98115822909399e-06, + "loss": 0.487, + "step": 8208 + }, + { + "epoch": 0.52, + "grad_norm": 2.245613572813672, + "learning_rate": 4.980139760209867e-06, + "loss": 0.2846, + "step": 8209 + }, + { + "epoch": 0.52, + "grad_norm": 3.402414324187724, + "learning_rate": 4.979121292149781e-06, + "loss": 0.2835, + "step": 8210 + }, + { + "epoch": 0.52, + "grad_norm": 1.7324346028894333, + "learning_rate": 4.978102824955988e-06, + "loss": 0.2897, + "step": 8211 + }, + { + "epoch": 0.52, + "grad_norm": 1.938265565068531, + "learning_rate": 4.977084358670749e-06, + "loss": 0.3019, + "step": 8212 + }, + { + "epoch": 0.52, + "grad_norm": 2.07939685069403, + "learning_rate": 4.97606589333632e-06, + "loss": 0.2738, + "step": 8213 + }, + { + "epoch": 0.52, + "grad_norm": 2.361960190281248, + "learning_rate": 4.975047428994961e-06, + "loss": 0.2816, + "step": 8214 + }, + { + "epoch": 0.52, + "grad_norm": 2.18567816403474, + "learning_rate": 4.9740289656889276e-06, + "loss": 0.2914, + "step": 8215 + }, + { + "epoch": 0.52, + "grad_norm": 25.224564304578116, + "learning_rate": 4.9730105034604795e-06, + "loss": 0.3092, + "step": 8216 + }, + { + "epoch": 0.52, + "grad_norm": 2.58551899278767, + "learning_rate": 4.971992042351872e-06, + "loss": 0.2799, + "step": 8217 + }, + { + "epoch": 0.52, + "grad_norm": 2.323951453494458, + "learning_rate": 4.970973582405366e-06, + "loss": 0.3097, + "step": 8218 + }, + { + "epoch": 0.52, + "grad_norm": 2.253398014201736, + "learning_rate": 4.969955123663216e-06, + "loss": 0.2934, + "step": 8219 + }, + { + "epoch": 0.52, + "grad_norm": 1.9966836673695352, + "learning_rate": 4.968936666167681e-06, + "loss": 0.2935, + "step": 8220 + }, + { + "epoch": 0.52, + "grad_norm": 1.897449863210299, + "learning_rate": 4.96791820996102e-06, + "loss": 0.2923, + "step": 8221 + }, + { + "epoch": 0.52, + "grad_norm": 1.8632552958564557, + "learning_rate": 4.96689975508549e-06, + "loss": 0.2731, + "step": 8222 + }, + { + "epoch": 0.52, + "grad_norm": 1.7206196721758091, + "learning_rate": 4.9658813015833465e-06, + "loss": 0.2853, + "step": 8223 + }, + { + "epoch": 0.52, + "grad_norm": 2.3362008394752314, + "learning_rate": 4.964862849496851e-06, + "loss": 0.3139, + "step": 8224 + }, + { + "epoch": 0.52, + "grad_norm": 1.8345989807494922, + "learning_rate": 4.9638443988682555e-06, + "loss": 0.3108, + "step": 8225 + }, + { + "epoch": 0.52, + "grad_norm": 1.7005985005836821, + "learning_rate": 4.962825949739822e-06, + "loss": 0.2738, + "step": 8226 + }, + { + "epoch": 0.52, + "grad_norm": 6.040241574651639, + "learning_rate": 4.961807502153802e-06, + "loss": 0.2914, + "step": 8227 + }, + { + "epoch": 0.52, + "grad_norm": 3.763435577672746, + "learning_rate": 4.960789056152461e-06, + "loss": 0.2603, + "step": 8228 + }, + { + "epoch": 0.52, + "grad_norm": 4.959686260414063, + "learning_rate": 4.9597706117780495e-06, + "loss": 0.3001, + "step": 8229 + }, + { + "epoch": 0.52, + "grad_norm": 2.130706889609274, + "learning_rate": 4.958752169072828e-06, + "loss": 0.2816, + "step": 8230 + }, + { + "epoch": 0.52, + "grad_norm": 2.4536967288211615, + "learning_rate": 4.957733728079051e-06, + "loss": 0.2915, + "step": 8231 + }, + { + "epoch": 0.52, + "grad_norm": 2.5668313154641167, + "learning_rate": 4.956715288838979e-06, + "loss": 0.3039, + "step": 8232 + }, + { + "epoch": 0.52, + "grad_norm": 4.61844755821173, + "learning_rate": 4.955696851394865e-06, + "loss": 0.2838, + "step": 8233 + }, + { + "epoch": 0.52, + "grad_norm": 2.768573779849712, + "learning_rate": 4.954678415788968e-06, + "loss": 0.2896, + "step": 8234 + }, + { + "epoch": 0.52, + "grad_norm": 0.6050610772215912, + "learning_rate": 4.9536599820635414e-06, + "loss": 0.4678, + "step": 8235 + }, + { + "epoch": 0.52, + "grad_norm": 2.3711887432190775, + "learning_rate": 4.952641550260846e-06, + "loss": 0.3181, + "step": 8236 + }, + { + "epoch": 0.52, + "grad_norm": 2.2925269815694542, + "learning_rate": 4.951623120423138e-06, + "loss": 0.2657, + "step": 8237 + }, + { + "epoch": 0.52, + "grad_norm": 1.6303030514441423, + "learning_rate": 4.9506046925926725e-06, + "loss": 0.2648, + "step": 8238 + }, + { + "epoch": 0.52, + "grad_norm": 2.3464748567621223, + "learning_rate": 4.949586266811705e-06, + "loss": 0.2856, + "step": 8239 + }, + { + "epoch": 0.52, + "grad_norm": 1.6626060977836936, + "learning_rate": 4.948567843122494e-06, + "loss": 0.277, + "step": 8240 + }, + { + "epoch": 0.52, + "grad_norm": 2.3347243208653583, + "learning_rate": 4.947549421567294e-06, + "loss": 0.2907, + "step": 8241 + }, + { + "epoch": 0.52, + "grad_norm": 1.8140112395828367, + "learning_rate": 4.94653100218836e-06, + "loss": 0.3006, + "step": 8242 + }, + { + "epoch": 0.52, + "grad_norm": 0.6213584045470085, + "learning_rate": 4.945512585027951e-06, + "loss": 0.4728, + "step": 8243 + }, + { + "epoch": 0.52, + "grad_norm": 1.5861131829846289, + "learning_rate": 4.944494170128323e-06, + "loss": 0.2835, + "step": 8244 + }, + { + "epoch": 0.52, + "grad_norm": 1.6417152034342402, + "learning_rate": 4.9434757575317286e-06, + "loss": 0.289, + "step": 8245 + }, + { + "epoch": 0.52, + "grad_norm": 1.6850436567976557, + "learning_rate": 4.9424573472804264e-06, + "loss": 0.2732, + "step": 8246 + }, + { + "epoch": 0.52, + "grad_norm": 13.001742424332056, + "learning_rate": 4.9414389394166705e-06, + "loss": 0.2648, + "step": 8247 + }, + { + "epoch": 0.52, + "grad_norm": 1.693485053493527, + "learning_rate": 4.940420533982718e-06, + "loss": 0.2872, + "step": 8248 + }, + { + "epoch": 0.52, + "grad_norm": 2.6446310687576586, + "learning_rate": 4.9394021310208225e-06, + "loss": 0.3017, + "step": 8249 + }, + { + "epoch": 0.52, + "grad_norm": 1.886786817674654, + "learning_rate": 4.938383730573239e-06, + "loss": 0.2839, + "step": 8250 + }, + { + "epoch": 0.52, + "grad_norm": 1.9207533099181677, + "learning_rate": 4.937365332682225e-06, + "loss": 0.2939, + "step": 8251 + }, + { + "epoch": 0.52, + "grad_norm": 2.0088184978962906, + "learning_rate": 4.936346937390036e-06, + "loss": 0.3113, + "step": 8252 + }, + { + "epoch": 0.52, + "grad_norm": 2.0325608995373363, + "learning_rate": 4.935328544738925e-06, + "loss": 0.2965, + "step": 8253 + }, + { + "epoch": 0.52, + "grad_norm": 0.5485844470082665, + "learning_rate": 4.934310154771147e-06, + "loss": 0.482, + "step": 8254 + }, + { + "epoch": 0.52, + "grad_norm": 3.1495215469116555, + "learning_rate": 4.933291767528957e-06, + "loss": 0.2817, + "step": 8255 + }, + { + "epoch": 0.52, + "grad_norm": 1.5681454838604005, + "learning_rate": 4.9322733830546115e-06, + "loss": 0.2775, + "step": 8256 + }, + { + "epoch": 0.52, + "grad_norm": 1.6840039722539915, + "learning_rate": 4.931255001390361e-06, + "loss": 0.2898, + "step": 8257 + }, + { + "epoch": 0.52, + "grad_norm": 0.5878128307571517, + "learning_rate": 4.930236622578464e-06, + "loss": 0.5024, + "step": 8258 + }, + { + "epoch": 0.52, + "grad_norm": 2.5355625212747435, + "learning_rate": 4.929218246661174e-06, + "loss": 0.3032, + "step": 8259 + }, + { + "epoch": 0.52, + "grad_norm": 1.6850041002117744, + "learning_rate": 4.928199873680745e-06, + "loss": 0.2735, + "step": 8260 + }, + { + "epoch": 0.52, + "grad_norm": 1.4972453730764776, + "learning_rate": 4.927181503679429e-06, + "loss": 0.2757, + "step": 8261 + }, + { + "epoch": 0.52, + "grad_norm": 1.5194446073500507, + "learning_rate": 4.9261631366994845e-06, + "loss": 0.2926, + "step": 8262 + }, + { + "epoch": 0.52, + "grad_norm": 1.8590668669251484, + "learning_rate": 4.925144772783161e-06, + "loss": 0.2843, + "step": 8263 + }, + { + "epoch": 0.52, + "grad_norm": 1.651182949328135, + "learning_rate": 4.924126411972714e-06, + "loss": 0.2989, + "step": 8264 + }, + { + "epoch": 0.52, + "grad_norm": 1.482248813457576, + "learning_rate": 4.923108054310395e-06, + "loss": 0.268, + "step": 8265 + }, + { + "epoch": 0.52, + "grad_norm": 1.3343332975702737, + "learning_rate": 4.922089699838462e-06, + "loss": 0.2852, + "step": 8266 + }, + { + "epoch": 0.52, + "grad_norm": 2.48254594985263, + "learning_rate": 4.921071348599164e-06, + "loss": 0.2729, + "step": 8267 + }, + { + "epoch": 0.52, + "grad_norm": 2.603544230043783, + "learning_rate": 4.9200530006347575e-06, + "loss": 0.2889, + "step": 8268 + }, + { + "epoch": 0.52, + "grad_norm": 2.537772216536732, + "learning_rate": 4.919034655987493e-06, + "loss": 0.306, + "step": 8269 + }, + { + "epoch": 0.52, + "grad_norm": 7.232419358392029, + "learning_rate": 4.918016314699625e-06, + "loss": 0.2935, + "step": 8270 + }, + { + "epoch": 0.52, + "grad_norm": 2.022585956446659, + "learning_rate": 4.916997976813406e-06, + "loss": 0.2878, + "step": 8271 + }, + { + "epoch": 0.52, + "grad_norm": 5.722205604554446, + "learning_rate": 4.9159796423710865e-06, + "loss": 0.2765, + "step": 8272 + }, + { + "epoch": 0.52, + "grad_norm": 2.387381502409368, + "learning_rate": 4.914961311414922e-06, + "loss": 0.2707, + "step": 8273 + }, + { + "epoch": 0.52, + "grad_norm": 1.4932612601342625, + "learning_rate": 4.913942983987165e-06, + "loss": 0.2898, + "step": 8274 + }, + { + "epoch": 0.52, + "grad_norm": 1.4789325587072022, + "learning_rate": 4.912924660130065e-06, + "loss": 0.2836, + "step": 8275 + }, + { + "epoch": 0.52, + "grad_norm": 11.592684636643195, + "learning_rate": 4.911906339885877e-06, + "loss": 0.3092, + "step": 8276 + }, + { + "epoch": 0.52, + "grad_norm": 1.7816285396426617, + "learning_rate": 4.91088802329685e-06, + "loss": 0.2907, + "step": 8277 + }, + { + "epoch": 0.52, + "grad_norm": 2.490854989742586, + "learning_rate": 4.909869710405238e-06, + "loss": 0.2886, + "step": 8278 + }, + { + "epoch": 0.52, + "grad_norm": 2.5912026344454357, + "learning_rate": 4.90885140125329e-06, + "loss": 0.2676, + "step": 8279 + }, + { + "epoch": 0.52, + "grad_norm": 2.210525238436266, + "learning_rate": 4.907833095883261e-06, + "loss": 0.3003, + "step": 8280 + }, + { + "epoch": 0.52, + "grad_norm": 2.1620126673627773, + "learning_rate": 4.9068147943374e-06, + "loss": 0.278, + "step": 8281 + }, + { + "epoch": 0.52, + "grad_norm": 2.1962018503609535, + "learning_rate": 4.905796496657959e-06, + "loss": 0.3072, + "step": 8282 + }, + { + "epoch": 0.52, + "grad_norm": 1.5142125020454695, + "learning_rate": 4.904778202887189e-06, + "loss": 0.3122, + "step": 8283 + }, + { + "epoch": 0.52, + "grad_norm": 1.3758775597468256, + "learning_rate": 4.90375991306734e-06, + "loss": 0.2883, + "step": 8284 + }, + { + "epoch": 0.52, + "grad_norm": 2.068861653821894, + "learning_rate": 4.902741627240663e-06, + "loss": 0.2706, + "step": 8285 + }, + { + "epoch": 0.52, + "grad_norm": 1.5981640795469874, + "learning_rate": 4.90172334544941e-06, + "loss": 0.2863, + "step": 8286 + }, + { + "epoch": 0.52, + "grad_norm": 2.46047673929236, + "learning_rate": 4.900705067735827e-06, + "loss": 0.2893, + "step": 8287 + }, + { + "epoch": 0.52, + "grad_norm": 2.188271235936475, + "learning_rate": 4.899686794142169e-06, + "loss": 0.2984, + "step": 8288 + }, + { + "epoch": 0.52, + "grad_norm": 1.5869922792937985, + "learning_rate": 4.8986685247106835e-06, + "loss": 0.2747, + "step": 8289 + }, + { + "epoch": 0.52, + "grad_norm": 5.901737525071232, + "learning_rate": 4.897650259483623e-06, + "loss": 0.284, + "step": 8290 + }, + { + "epoch": 0.52, + "grad_norm": 1.7613463201423094, + "learning_rate": 4.8966319985032325e-06, + "loss": 0.2656, + "step": 8291 + }, + { + "epoch": 0.52, + "grad_norm": 1.7949183307816083, + "learning_rate": 4.895613741811766e-06, + "loss": 0.2663, + "step": 8292 + }, + { + "epoch": 0.52, + "grad_norm": 1.8372796177519661, + "learning_rate": 4.894595489451468e-06, + "loss": 0.2813, + "step": 8293 + }, + { + "epoch": 0.52, + "grad_norm": 2.9594151641872095, + "learning_rate": 4.893577241464592e-06, + "loss": 0.2773, + "step": 8294 + }, + { + "epoch": 0.52, + "grad_norm": 1.5707005784419934, + "learning_rate": 4.892558997893386e-06, + "loss": 0.2879, + "step": 8295 + }, + { + "epoch": 0.52, + "grad_norm": 2.994405121263485, + "learning_rate": 4.8915407587800965e-06, + "loss": 0.3043, + "step": 8296 + }, + { + "epoch": 0.52, + "grad_norm": 1.889226416275823, + "learning_rate": 4.890522524166974e-06, + "loss": 0.2962, + "step": 8297 + }, + { + "epoch": 0.52, + "grad_norm": 1.236552999968069, + "learning_rate": 4.889504294096268e-06, + "loss": 0.2787, + "step": 8298 + }, + { + "epoch": 0.52, + "grad_norm": 1.773305178868573, + "learning_rate": 4.8884860686102234e-06, + "loss": 0.2971, + "step": 8299 + }, + { + "epoch": 0.52, + "grad_norm": 2.168430941044765, + "learning_rate": 4.887467847751091e-06, + "loss": 0.2717, + "step": 8300 + }, + { + "epoch": 0.52, + "grad_norm": 2.2006097595084317, + "learning_rate": 4.8864496315611175e-06, + "loss": 0.2749, + "step": 8301 + }, + { + "epoch": 0.52, + "grad_norm": 1.9690783700100913, + "learning_rate": 4.8854314200825475e-06, + "loss": 0.2873, + "step": 8302 + }, + { + "epoch": 0.52, + "grad_norm": 3.004249797363292, + "learning_rate": 4.884413213357635e-06, + "loss": 0.2803, + "step": 8303 + }, + { + "epoch": 0.52, + "grad_norm": 1.8958393299053462, + "learning_rate": 4.883395011428622e-06, + "loss": 0.2962, + "step": 8304 + }, + { + "epoch": 0.52, + "grad_norm": 1.800553602780511, + "learning_rate": 4.882376814337757e-06, + "loss": 0.2843, + "step": 8305 + }, + { + "epoch": 0.52, + "grad_norm": 1.4315211587650654, + "learning_rate": 4.881358622127288e-06, + "loss": 0.2698, + "step": 8306 + }, + { + "epoch": 0.52, + "grad_norm": 1.3705326963569648, + "learning_rate": 4.880340434839459e-06, + "loss": 0.2917, + "step": 8307 + }, + { + "epoch": 0.52, + "grad_norm": 2.2516337919657996, + "learning_rate": 4.8793222525165205e-06, + "loss": 0.302, + "step": 8308 + }, + { + "epoch": 0.52, + "grad_norm": 1.4027960878908383, + "learning_rate": 4.878304075200712e-06, + "loss": 0.2815, + "step": 8309 + }, + { + "epoch": 0.52, + "grad_norm": 3.1470103947421344, + "learning_rate": 4.8772859029342864e-06, + "loss": 0.2772, + "step": 8310 + }, + { + "epoch": 0.52, + "grad_norm": 1.6994330713125576, + "learning_rate": 4.876267735759486e-06, + "loss": 0.279, + "step": 8311 + }, + { + "epoch": 0.52, + "grad_norm": 2.2505510396157145, + "learning_rate": 4.875249573718557e-06, + "loss": 0.2926, + "step": 8312 + }, + { + "epoch": 0.52, + "grad_norm": 1.5588934345512109, + "learning_rate": 4.874231416853744e-06, + "loss": 0.3097, + "step": 8313 + }, + { + "epoch": 0.52, + "grad_norm": 1.8267615338004741, + "learning_rate": 4.8732132652072956e-06, + "loss": 0.2931, + "step": 8314 + }, + { + "epoch": 0.52, + "grad_norm": 4.78098828533289, + "learning_rate": 4.872195118821452e-06, + "loss": 0.2902, + "step": 8315 + }, + { + "epoch": 0.52, + "grad_norm": 2.375384396299737, + "learning_rate": 4.871176977738461e-06, + "loss": 0.3106, + "step": 8316 + }, + { + "epoch": 0.52, + "grad_norm": 1.7625365282938825, + "learning_rate": 4.870158842000563e-06, + "loss": 0.2898, + "step": 8317 + }, + { + "epoch": 0.52, + "grad_norm": 2.473743047081472, + "learning_rate": 4.869140711650008e-06, + "loss": 0.2962, + "step": 8318 + }, + { + "epoch": 0.52, + "grad_norm": 4.630733487511131, + "learning_rate": 4.868122586729036e-06, + "loss": 0.2954, + "step": 8319 + }, + { + "epoch": 0.52, + "grad_norm": 2.0925891721373913, + "learning_rate": 4.867104467279894e-06, + "loss": 0.29, + "step": 8320 + }, + { + "epoch": 0.52, + "grad_norm": 2.0939923448910562, + "learning_rate": 4.866086353344822e-06, + "loss": 0.2712, + "step": 8321 + }, + { + "epoch": 0.52, + "grad_norm": 6.41877420552967, + "learning_rate": 4.865068244966066e-06, + "loss": 0.2929, + "step": 8322 + }, + { + "epoch": 0.52, + "grad_norm": 2.270981754482077, + "learning_rate": 4.864050142185868e-06, + "loss": 0.296, + "step": 8323 + }, + { + "epoch": 0.52, + "grad_norm": 1.9316351673686134, + "learning_rate": 4.86303204504647e-06, + "loss": 0.2827, + "step": 8324 + }, + { + "epoch": 0.52, + "grad_norm": 1.5418993099495082, + "learning_rate": 4.862013953590117e-06, + "loss": 0.2727, + "step": 8325 + }, + { + "epoch": 0.52, + "grad_norm": 2.5715061253466334, + "learning_rate": 4.860995867859049e-06, + "loss": 0.2824, + "step": 8326 + }, + { + "epoch": 0.52, + "grad_norm": 1.6413300874583487, + "learning_rate": 4.85997778789551e-06, + "loss": 0.3233, + "step": 8327 + }, + { + "epoch": 0.52, + "grad_norm": 1.7151059470009289, + "learning_rate": 4.858959713741742e-06, + "loss": 0.2717, + "step": 8328 + }, + { + "epoch": 0.52, + "grad_norm": 2.5936853821012376, + "learning_rate": 4.857941645439985e-06, + "loss": 0.2895, + "step": 8329 + }, + { + "epoch": 0.52, + "grad_norm": 3.4025304259215643, + "learning_rate": 4.856923583032483e-06, + "loss": 0.2677, + "step": 8330 + }, + { + "epoch": 0.52, + "grad_norm": 1.5654484206878563, + "learning_rate": 4.855905526561474e-06, + "loss": 0.2879, + "step": 8331 + }, + { + "epoch": 0.52, + "grad_norm": 2.695336616262636, + "learning_rate": 4.8548874760692e-06, + "loss": 0.3043, + "step": 8332 + }, + { + "epoch": 0.52, + "grad_norm": 1.7832227001250294, + "learning_rate": 4.8538694315979045e-06, + "loss": 0.2927, + "step": 8333 + }, + { + "epoch": 0.52, + "grad_norm": 2.1140181232744344, + "learning_rate": 4.852851393189825e-06, + "loss": 0.2861, + "step": 8334 + }, + { + "epoch": 0.52, + "grad_norm": 1.4286677077307632, + "learning_rate": 4.8518333608872015e-06, + "loss": 0.2594, + "step": 8335 + }, + { + "epoch": 0.52, + "grad_norm": 1.9602561494411748, + "learning_rate": 4.850815334732277e-06, + "loss": 0.2874, + "step": 8336 + }, + { + "epoch": 0.52, + "grad_norm": 1.7235780246174375, + "learning_rate": 4.849797314767288e-06, + "loss": 0.2986, + "step": 8337 + }, + { + "epoch": 0.52, + "grad_norm": 3.9333024538245205, + "learning_rate": 4.848779301034476e-06, + "loss": 0.2795, + "step": 8338 + }, + { + "epoch": 0.52, + "grad_norm": 19.95784289340205, + "learning_rate": 4.847761293576078e-06, + "loss": 0.2863, + "step": 8339 + }, + { + "epoch": 0.52, + "grad_norm": 2.346495865872955, + "learning_rate": 4.846743292434334e-06, + "loss": 0.2605, + "step": 8340 + }, + { + "epoch": 0.52, + "grad_norm": 1.8440387401706564, + "learning_rate": 4.845725297651485e-06, + "loss": 0.2689, + "step": 8341 + }, + { + "epoch": 0.52, + "grad_norm": 1.971613200240072, + "learning_rate": 4.844707309269767e-06, + "loss": 0.2854, + "step": 8342 + }, + { + "epoch": 0.52, + "grad_norm": 2.0397420833369697, + "learning_rate": 4.8436893273314176e-06, + "loss": 0.2885, + "step": 8343 + }, + { + "epoch": 0.52, + "grad_norm": 2.888534178756261, + "learning_rate": 4.842671351878677e-06, + "loss": 0.2862, + "step": 8344 + }, + { + "epoch": 0.52, + "grad_norm": 1.8523021751627184, + "learning_rate": 4.84165338295378e-06, + "loss": 0.289, + "step": 8345 + }, + { + "epoch": 0.52, + "grad_norm": 6.75321363063784, + "learning_rate": 4.840635420598967e-06, + "loss": 0.2845, + "step": 8346 + }, + { + "epoch": 0.52, + "grad_norm": 1.9647520906752194, + "learning_rate": 4.839617464856471e-06, + "loss": 0.2811, + "step": 8347 + }, + { + "epoch": 0.52, + "grad_norm": 1.413440971219273, + "learning_rate": 4.838599515768532e-06, + "loss": 0.2945, + "step": 8348 + }, + { + "epoch": 0.53, + "grad_norm": 3.0070939153612004, + "learning_rate": 4.837581573377387e-06, + "loss": 0.2803, + "step": 8349 + }, + { + "epoch": 0.53, + "grad_norm": 2.0024772625960554, + "learning_rate": 4.836563637725271e-06, + "loss": 0.2965, + "step": 8350 + }, + { + "epoch": 0.53, + "grad_norm": 1.783291008563796, + "learning_rate": 4.835545708854419e-06, + "loss": 0.2852, + "step": 8351 + }, + { + "epoch": 0.53, + "grad_norm": 2.226463281537102, + "learning_rate": 4.834527786807069e-06, + "loss": 0.2825, + "step": 8352 + }, + { + "epoch": 0.53, + "grad_norm": 1.3102782421763481, + "learning_rate": 4.8335098716254545e-06, + "loss": 0.2644, + "step": 8353 + }, + { + "epoch": 0.53, + "grad_norm": 2.7464204611587495, + "learning_rate": 4.832491963351809e-06, + "loss": 0.2997, + "step": 8354 + }, + { + "epoch": 0.53, + "grad_norm": 4.88987909463794, + "learning_rate": 4.831474062028372e-06, + "loss": 0.2811, + "step": 8355 + }, + { + "epoch": 0.53, + "grad_norm": 3.314611497551799, + "learning_rate": 4.830456167697375e-06, + "loss": 0.2821, + "step": 8356 + }, + { + "epoch": 0.53, + "grad_norm": 1.34470913155694, + "learning_rate": 4.829438280401052e-06, + "loss": 0.3104, + "step": 8357 + }, + { + "epoch": 0.53, + "grad_norm": 2.271400913559649, + "learning_rate": 4.828420400181639e-06, + "loss": 0.2796, + "step": 8358 + }, + { + "epoch": 0.53, + "grad_norm": 2.0068432578888458, + "learning_rate": 4.827402527081368e-06, + "loss": 0.2887, + "step": 8359 + }, + { + "epoch": 0.53, + "grad_norm": 2.600485994703669, + "learning_rate": 4.826384661142472e-06, + "loss": 0.2832, + "step": 8360 + }, + { + "epoch": 0.53, + "grad_norm": 2.3312780804242537, + "learning_rate": 4.825366802407184e-06, + "loss": 0.2862, + "step": 8361 + }, + { + "epoch": 0.53, + "grad_norm": 7.674285070392237, + "learning_rate": 4.824348950917739e-06, + "loss": 0.279, + "step": 8362 + }, + { + "epoch": 0.53, + "grad_norm": 3.066978445862203, + "learning_rate": 4.823331106716368e-06, + "loss": 0.2793, + "step": 8363 + }, + { + "epoch": 0.53, + "grad_norm": 1.894374156222564, + "learning_rate": 4.822313269845303e-06, + "loss": 0.3039, + "step": 8364 + }, + { + "epoch": 0.53, + "grad_norm": 1.510245593542162, + "learning_rate": 4.821295440346775e-06, + "loss": 0.2798, + "step": 8365 + }, + { + "epoch": 0.53, + "grad_norm": 3.0874445027030997, + "learning_rate": 4.820277618263018e-06, + "loss": 0.2804, + "step": 8366 + }, + { + "epoch": 0.53, + "grad_norm": 1.8498454825062158, + "learning_rate": 4.819259803636261e-06, + "loss": 0.2761, + "step": 8367 + }, + { + "epoch": 0.53, + "grad_norm": 1.6144161860062747, + "learning_rate": 4.8182419965087375e-06, + "loss": 0.295, + "step": 8368 + }, + { + "epoch": 0.53, + "grad_norm": 3.6204533056451464, + "learning_rate": 4.817224196922673e-06, + "loss": 0.2924, + "step": 8369 + }, + { + "epoch": 0.53, + "grad_norm": 2.140072947789576, + "learning_rate": 4.8162064049203035e-06, + "loss": 0.2773, + "step": 8370 + }, + { + "epoch": 0.53, + "grad_norm": 2.3893163555588957, + "learning_rate": 4.815188620543857e-06, + "loss": 0.2981, + "step": 8371 + }, + { + "epoch": 0.53, + "grad_norm": 0.5803892602864232, + "learning_rate": 4.814170843835561e-06, + "loss": 0.5214, + "step": 8372 + }, + { + "epoch": 0.53, + "grad_norm": 1.850281135022143, + "learning_rate": 4.813153074837648e-06, + "loss": 0.2747, + "step": 8373 + }, + { + "epoch": 0.53, + "grad_norm": 2.7567157375102114, + "learning_rate": 4.812135313592348e-06, + "loss": 0.28, + "step": 8374 + }, + { + "epoch": 0.53, + "grad_norm": 1.8665012917513455, + "learning_rate": 4.8111175601418845e-06, + "loss": 0.2767, + "step": 8375 + }, + { + "epoch": 0.53, + "grad_norm": 1.6674636307333104, + "learning_rate": 4.810099814528489e-06, + "loss": 0.2818, + "step": 8376 + }, + { + "epoch": 0.53, + "grad_norm": 14.574159705529876, + "learning_rate": 4.809082076794391e-06, + "loss": 0.2801, + "step": 8377 + }, + { + "epoch": 0.53, + "grad_norm": 0.6994142917636991, + "learning_rate": 4.808064346981815e-06, + "loss": 0.5043, + "step": 8378 + }, + { + "epoch": 0.53, + "grad_norm": 2.043551212387841, + "learning_rate": 4.8070466251329926e-06, + "loss": 0.2809, + "step": 8379 + }, + { + "epoch": 0.53, + "grad_norm": 2.3474862659250526, + "learning_rate": 4.806028911290147e-06, + "loss": 0.3091, + "step": 8380 + }, + { + "epoch": 0.53, + "grad_norm": 1.5123096367847348, + "learning_rate": 4.805011205495505e-06, + "loss": 0.2909, + "step": 8381 + }, + { + "epoch": 0.53, + "grad_norm": 2.5323443497815594, + "learning_rate": 4.803993507791298e-06, + "loss": 0.29, + "step": 8382 + }, + { + "epoch": 0.53, + "grad_norm": 2.514766547547126, + "learning_rate": 4.8029758182197455e-06, + "loss": 0.2773, + "step": 8383 + }, + { + "epoch": 0.53, + "grad_norm": 2.778383922553858, + "learning_rate": 4.801958136823076e-06, + "loss": 0.2793, + "step": 8384 + }, + { + "epoch": 0.53, + "grad_norm": 0.5753223131355168, + "learning_rate": 4.800940463643517e-06, + "loss": 0.4815, + "step": 8385 + }, + { + "epoch": 0.53, + "grad_norm": 1.5322787209475415, + "learning_rate": 4.799922798723291e-06, + "loss": 0.2826, + "step": 8386 + }, + { + "epoch": 0.53, + "grad_norm": 1.590813516641714, + "learning_rate": 4.798905142104624e-06, + "loss": 0.2833, + "step": 8387 + }, + { + "epoch": 0.53, + "grad_norm": 3.406993004851999, + "learning_rate": 4.79788749382974e-06, + "loss": 0.2867, + "step": 8388 + }, + { + "epoch": 0.53, + "grad_norm": 2.7434548670177867, + "learning_rate": 4.796869853940862e-06, + "loss": 0.2695, + "step": 8389 + }, + { + "epoch": 0.53, + "grad_norm": 1.8224793567413622, + "learning_rate": 4.795852222480215e-06, + "loss": 0.2792, + "step": 8390 + }, + { + "epoch": 0.53, + "grad_norm": 1.9544250845793802, + "learning_rate": 4.7948345994900205e-06, + "loss": 0.2935, + "step": 8391 + }, + { + "epoch": 0.53, + "grad_norm": 3.295874486943766, + "learning_rate": 4.793816985012503e-06, + "loss": 0.2921, + "step": 8392 + }, + { + "epoch": 0.53, + "grad_norm": 4.128313276949677, + "learning_rate": 4.792799379089888e-06, + "loss": 0.302, + "step": 8393 + }, + { + "epoch": 0.53, + "grad_norm": 3.058216889694372, + "learning_rate": 4.791781781764392e-06, + "loss": 0.2642, + "step": 8394 + }, + { + "epoch": 0.53, + "grad_norm": 2.1364291315479154, + "learning_rate": 4.79076419307824e-06, + "loss": 0.283, + "step": 8395 + }, + { + "epoch": 0.53, + "grad_norm": 1.8379552323039976, + "learning_rate": 4.789746613073655e-06, + "loss": 0.2699, + "step": 8396 + }, + { + "epoch": 0.53, + "grad_norm": 2.510025828512651, + "learning_rate": 4.788729041792855e-06, + "loss": 0.2553, + "step": 8397 + }, + { + "epoch": 0.53, + "grad_norm": 2.339536887154089, + "learning_rate": 4.787711479278063e-06, + "loss": 0.2747, + "step": 8398 + }, + { + "epoch": 0.53, + "grad_norm": 2.253674938618319, + "learning_rate": 4.7866939255714975e-06, + "loss": 0.2681, + "step": 8399 + }, + { + "epoch": 0.53, + "grad_norm": 2.5930586442139374, + "learning_rate": 4.78567638071538e-06, + "loss": 0.2904, + "step": 8400 + }, + { + "epoch": 0.53, + "grad_norm": 3.60352921632056, + "learning_rate": 4.784658844751933e-06, + "loss": 0.2972, + "step": 8401 + }, + { + "epoch": 0.53, + "grad_norm": 4.056599153816232, + "learning_rate": 4.7836413177233705e-06, + "loss": 0.3112, + "step": 8402 + }, + { + "epoch": 0.53, + "grad_norm": 1.321717028384552, + "learning_rate": 4.782623799671914e-06, + "loss": 0.2693, + "step": 8403 + }, + { + "epoch": 0.53, + "grad_norm": 1.7516074821540624, + "learning_rate": 4.781606290639783e-06, + "loss": 0.2831, + "step": 8404 + }, + { + "epoch": 0.53, + "grad_norm": 1.5426926052473047, + "learning_rate": 4.780588790669195e-06, + "loss": 0.2751, + "step": 8405 + }, + { + "epoch": 0.53, + "grad_norm": 1.4934233565380441, + "learning_rate": 4.779571299802366e-06, + "loss": 0.2705, + "step": 8406 + }, + { + "epoch": 0.53, + "grad_norm": 1.5869935741782923, + "learning_rate": 4.778553818081517e-06, + "loss": 0.2717, + "step": 8407 + }, + { + "epoch": 0.53, + "grad_norm": 2.6472550756050794, + "learning_rate": 4.777536345548863e-06, + "loss": 0.2861, + "step": 8408 + }, + { + "epoch": 0.53, + "grad_norm": 2.0903636854619405, + "learning_rate": 4.7765188822466226e-06, + "loss": 0.2822, + "step": 8409 + }, + { + "epoch": 0.53, + "grad_norm": 1.5875475778358343, + "learning_rate": 4.775501428217009e-06, + "loss": 0.3064, + "step": 8410 + }, + { + "epoch": 0.53, + "grad_norm": 0.6261372605388646, + "learning_rate": 4.77448398350224e-06, + "loss": 0.5134, + "step": 8411 + }, + { + "epoch": 0.53, + "grad_norm": 1.6717839644134513, + "learning_rate": 4.773466548144532e-06, + "loss": 0.2782, + "step": 8412 + }, + { + "epoch": 0.53, + "grad_norm": 0.5573152097917939, + "learning_rate": 4.7724491221860986e-06, + "loss": 0.5053, + "step": 8413 + }, + { + "epoch": 0.53, + "grad_norm": 7.065201075386083, + "learning_rate": 4.771431705669154e-06, + "loss": 0.3012, + "step": 8414 + }, + { + "epoch": 0.53, + "grad_norm": 2.008813283520479, + "learning_rate": 4.770414298635916e-06, + "loss": 0.2862, + "step": 8415 + }, + { + "epoch": 0.53, + "grad_norm": 17.200652662346773, + "learning_rate": 4.769396901128595e-06, + "loss": 0.2675, + "step": 8416 + }, + { + "epoch": 0.53, + "grad_norm": 6.614085080775041, + "learning_rate": 4.768379513189408e-06, + "loss": 0.2782, + "step": 8417 + }, + { + "epoch": 0.53, + "grad_norm": 1.5337764137942766, + "learning_rate": 4.7673621348605655e-06, + "loss": 0.2881, + "step": 8418 + }, + { + "epoch": 0.53, + "grad_norm": 1.9528811152936985, + "learning_rate": 4.76634476618428e-06, + "loss": 0.2912, + "step": 8419 + }, + { + "epoch": 0.53, + "grad_norm": 1.8610145314625597, + "learning_rate": 4.765327407202768e-06, + "loss": 0.2822, + "step": 8420 + }, + { + "epoch": 0.53, + "grad_norm": 1.7819661999567937, + "learning_rate": 4.7643100579582355e-06, + "loss": 0.2901, + "step": 8421 + }, + { + "epoch": 0.53, + "grad_norm": 1.4974709682042993, + "learning_rate": 4.7632927184928985e-06, + "loss": 0.2886, + "step": 8422 + }, + { + "epoch": 0.53, + "grad_norm": 2.346654853336016, + "learning_rate": 4.762275388848968e-06, + "loss": 0.2819, + "step": 8423 + }, + { + "epoch": 0.53, + "grad_norm": 2.5313475112359463, + "learning_rate": 4.761258069068654e-06, + "loss": 0.3097, + "step": 8424 + }, + { + "epoch": 0.53, + "grad_norm": 0.5794129127802287, + "learning_rate": 4.760240759194166e-06, + "loss": 0.4798, + "step": 8425 + }, + { + "epoch": 0.53, + "grad_norm": 2.2478992436172196, + "learning_rate": 4.759223459267715e-06, + "loss": 0.296, + "step": 8426 + }, + { + "epoch": 0.53, + "grad_norm": 2.3140798998622945, + "learning_rate": 4.758206169331509e-06, + "loss": 0.2999, + "step": 8427 + }, + { + "epoch": 0.53, + "grad_norm": 10.209286981225102, + "learning_rate": 4.757188889427761e-06, + "loss": 0.3071, + "step": 8428 + }, + { + "epoch": 0.53, + "grad_norm": 4.151286698593463, + "learning_rate": 4.756171619598674e-06, + "loss": 0.3238, + "step": 8429 + }, + { + "epoch": 0.53, + "grad_norm": 2.89099243932232, + "learning_rate": 4.7551543598864614e-06, + "loss": 0.3135, + "step": 8430 + }, + { + "epoch": 0.53, + "grad_norm": 1.817848693334988, + "learning_rate": 4.75413711033333e-06, + "loss": 0.2839, + "step": 8431 + }, + { + "epoch": 0.53, + "grad_norm": 2.5286534124216655, + "learning_rate": 4.753119870981486e-06, + "loss": 0.2844, + "step": 8432 + }, + { + "epoch": 0.53, + "grad_norm": 2.4574432505536095, + "learning_rate": 4.752102641873136e-06, + "loss": 0.2969, + "step": 8433 + }, + { + "epoch": 0.53, + "grad_norm": 1.6359299468810733, + "learning_rate": 4.75108542305049e-06, + "loss": 0.2988, + "step": 8434 + }, + { + "epoch": 0.53, + "grad_norm": 2.6438591581902067, + "learning_rate": 4.75006821455575e-06, + "loss": 0.3049, + "step": 8435 + }, + { + "epoch": 0.53, + "grad_norm": 2.105956843821535, + "learning_rate": 4.749051016431123e-06, + "loss": 0.2815, + "step": 8436 + }, + { + "epoch": 0.53, + "grad_norm": 7.813638413684079, + "learning_rate": 4.748033828718818e-06, + "loss": 0.2864, + "step": 8437 + }, + { + "epoch": 0.53, + "grad_norm": 2.9256826268681375, + "learning_rate": 4.747016651461035e-06, + "loss": 0.2766, + "step": 8438 + }, + { + "epoch": 0.53, + "grad_norm": 1.9729885408359846, + "learning_rate": 4.7459994846999815e-06, + "loss": 0.2787, + "step": 8439 + }, + { + "epoch": 0.53, + "grad_norm": 1.5754963023637483, + "learning_rate": 4.744982328477859e-06, + "loss": 0.2613, + "step": 8440 + }, + { + "epoch": 0.53, + "grad_norm": 1.5413783623211113, + "learning_rate": 4.7439651828368736e-06, + "loss": 0.2698, + "step": 8441 + }, + { + "epoch": 0.53, + "grad_norm": 1.826139696164723, + "learning_rate": 4.7429480478192285e-06, + "loss": 0.2829, + "step": 8442 + }, + { + "epoch": 0.53, + "grad_norm": 2.77022466616211, + "learning_rate": 4.741930923467123e-06, + "loss": 0.2852, + "step": 8443 + }, + { + "epoch": 0.53, + "grad_norm": 1.6323025777956142, + "learning_rate": 4.740913809822763e-06, + "loss": 0.2783, + "step": 8444 + }, + { + "epoch": 0.53, + "grad_norm": 1.6126840257945725, + "learning_rate": 4.7398967069283505e-06, + "loss": 0.2754, + "step": 8445 + }, + { + "epoch": 0.53, + "grad_norm": 1.675572458730645, + "learning_rate": 4.7388796148260846e-06, + "loss": 0.3032, + "step": 8446 + }, + { + "epoch": 0.53, + "grad_norm": 1.918201435085034, + "learning_rate": 4.737862533558168e-06, + "loss": 0.2745, + "step": 8447 + }, + { + "epoch": 0.53, + "grad_norm": 1.7615596289358781, + "learning_rate": 4.7368454631668e-06, + "loss": 0.2914, + "step": 8448 + }, + { + "epoch": 0.53, + "grad_norm": 1.924231687650136, + "learning_rate": 4.73582840369418e-06, + "loss": 0.2871, + "step": 8449 + }, + { + "epoch": 0.53, + "grad_norm": 3.0118428570949543, + "learning_rate": 4.734811355182511e-06, + "loss": 0.2607, + "step": 8450 + }, + { + "epoch": 0.53, + "grad_norm": 1.965448863629376, + "learning_rate": 4.7337943176739874e-06, + "loss": 0.2744, + "step": 8451 + }, + { + "epoch": 0.53, + "grad_norm": 1.291794914480757, + "learning_rate": 4.732777291210812e-06, + "loss": 0.2702, + "step": 8452 + }, + { + "epoch": 0.53, + "grad_norm": 1.8498930986453157, + "learning_rate": 4.7317602758351814e-06, + "loss": 0.2725, + "step": 8453 + }, + { + "epoch": 0.53, + "grad_norm": 1.5519068068456685, + "learning_rate": 4.730743271589293e-06, + "loss": 0.2793, + "step": 8454 + }, + { + "epoch": 0.53, + "grad_norm": 1.7848523060138315, + "learning_rate": 4.7297262785153455e-06, + "loss": 0.3003, + "step": 8455 + }, + { + "epoch": 0.53, + "grad_norm": 1.7848219075019645, + "learning_rate": 4.728709296655533e-06, + "loss": 0.3335, + "step": 8456 + }, + { + "epoch": 0.53, + "grad_norm": 1.9582090084489585, + "learning_rate": 4.727692326052054e-06, + "loss": 0.281, + "step": 8457 + }, + { + "epoch": 0.53, + "grad_norm": 2.322363457982876, + "learning_rate": 4.726675366747104e-06, + "loss": 0.2695, + "step": 8458 + }, + { + "epoch": 0.53, + "grad_norm": 6.359884895054364, + "learning_rate": 4.72565841878288e-06, + "loss": 0.2899, + "step": 8459 + }, + { + "epoch": 0.53, + "grad_norm": 1.60113332580965, + "learning_rate": 4.724641482201574e-06, + "loss": 0.2953, + "step": 8460 + }, + { + "epoch": 0.53, + "grad_norm": 1.6231979197784872, + "learning_rate": 4.723624557045383e-06, + "loss": 0.268, + "step": 8461 + }, + { + "epoch": 0.53, + "grad_norm": 2.517094361127118, + "learning_rate": 4.7226076433564994e-06, + "loss": 0.287, + "step": 8462 + }, + { + "epoch": 0.53, + "grad_norm": 1.7999813408429162, + "learning_rate": 4.721590741177119e-06, + "loss": 0.2705, + "step": 8463 + }, + { + "epoch": 0.53, + "grad_norm": 0.5857471206262909, + "learning_rate": 4.720573850549431e-06, + "loss": 0.4761, + "step": 8464 + }, + { + "epoch": 0.53, + "grad_norm": 1.8948979812232025, + "learning_rate": 4.719556971515631e-06, + "loss": 0.2996, + "step": 8465 + }, + { + "epoch": 0.53, + "grad_norm": 2.329984744611168, + "learning_rate": 4.718540104117909e-06, + "loss": 0.2931, + "step": 8466 + }, + { + "epoch": 0.53, + "grad_norm": 1.8623617760742492, + "learning_rate": 4.717523248398461e-06, + "loss": 0.2758, + "step": 8467 + }, + { + "epoch": 0.53, + "grad_norm": 1.5228386275642694, + "learning_rate": 4.716506404399473e-06, + "loss": 0.279, + "step": 8468 + }, + { + "epoch": 0.53, + "grad_norm": 2.033646923485184, + "learning_rate": 4.715489572163139e-06, + "loss": 0.27, + "step": 8469 + }, + { + "epoch": 0.53, + "grad_norm": 4.077267248613091, + "learning_rate": 4.714472751731647e-06, + "loss": 0.3069, + "step": 8470 + }, + { + "epoch": 0.53, + "grad_norm": 2.5367434960848136, + "learning_rate": 4.713455943147188e-06, + "loss": 0.266, + "step": 8471 + }, + { + "epoch": 0.53, + "grad_norm": 2.2880178154407433, + "learning_rate": 4.712439146451952e-06, + "loss": 0.2747, + "step": 8472 + }, + { + "epoch": 0.53, + "grad_norm": 1.8439083862790828, + "learning_rate": 4.711422361688124e-06, + "loss": 0.2847, + "step": 8473 + }, + { + "epoch": 0.53, + "grad_norm": 1.403644545133131, + "learning_rate": 4.710405588897896e-06, + "loss": 0.2765, + "step": 8474 + }, + { + "epoch": 0.53, + "grad_norm": 1.467542722163241, + "learning_rate": 4.709388828123457e-06, + "loss": 0.2785, + "step": 8475 + }, + { + "epoch": 0.53, + "grad_norm": 1.4478503120744513, + "learning_rate": 4.708372079406989e-06, + "loss": 0.28, + "step": 8476 + }, + { + "epoch": 0.53, + "grad_norm": 5.252758370568109, + "learning_rate": 4.707355342790682e-06, + "loss": 0.2976, + "step": 8477 + }, + { + "epoch": 0.53, + "grad_norm": 3.5136777425886425, + "learning_rate": 4.706338618316721e-06, + "loss": 0.292, + "step": 8478 + }, + { + "epoch": 0.53, + "grad_norm": 2.106395076529786, + "learning_rate": 4.705321906027293e-06, + "loss": 0.2699, + "step": 8479 + }, + { + "epoch": 0.53, + "grad_norm": 1.79012790847718, + "learning_rate": 4.7043052059645835e-06, + "loss": 0.2797, + "step": 8480 + }, + { + "epoch": 0.53, + "grad_norm": 1.7093815457865003, + "learning_rate": 4.703288518170774e-06, + "loss": 0.2861, + "step": 8481 + }, + { + "epoch": 0.53, + "grad_norm": 1.464103243951695, + "learning_rate": 4.702271842688052e-06, + "loss": 0.3002, + "step": 8482 + }, + { + "epoch": 0.53, + "grad_norm": 1.665304705321683, + "learning_rate": 4.701255179558601e-06, + "loss": 0.2728, + "step": 8483 + }, + { + "epoch": 0.53, + "grad_norm": 1.6848328109589452, + "learning_rate": 4.700238528824602e-06, + "loss": 0.273, + "step": 8484 + }, + { + "epoch": 0.53, + "grad_norm": 1.8086438140132717, + "learning_rate": 4.69922189052824e-06, + "loss": 0.2679, + "step": 8485 + }, + { + "epoch": 0.53, + "grad_norm": 1.892734079629832, + "learning_rate": 4.698205264711695e-06, + "loss": 0.3, + "step": 8486 + }, + { + "epoch": 0.53, + "grad_norm": 1.5813826797534165, + "learning_rate": 4.6971886514171495e-06, + "loss": 0.2855, + "step": 8487 + }, + { + "epoch": 0.53, + "grad_norm": 2.783067685410438, + "learning_rate": 4.696172050686784e-06, + "loss": 0.2845, + "step": 8488 + }, + { + "epoch": 0.53, + "grad_norm": 1.1740685978314063, + "learning_rate": 4.695155462562781e-06, + "loss": 0.2731, + "step": 8489 + }, + { + "epoch": 0.53, + "grad_norm": 1.9210350324320693, + "learning_rate": 4.69413888708732e-06, + "loss": 0.291, + "step": 8490 + }, + { + "epoch": 0.53, + "grad_norm": 2.6339544719116774, + "learning_rate": 4.693122324302579e-06, + "loss": 0.2918, + "step": 8491 + }, + { + "epoch": 0.53, + "grad_norm": 2.660077704029485, + "learning_rate": 4.692105774250738e-06, + "loss": 0.2829, + "step": 8492 + }, + { + "epoch": 0.53, + "grad_norm": 1.51817554738402, + "learning_rate": 4.6910892369739754e-06, + "loss": 0.2641, + "step": 8493 + }, + { + "epoch": 0.53, + "grad_norm": 2.570369080557774, + "learning_rate": 4.690072712514469e-06, + "loss": 0.2775, + "step": 8494 + }, + { + "epoch": 0.53, + "grad_norm": 2.190856266048138, + "learning_rate": 4.689056200914396e-06, + "loss": 0.2969, + "step": 8495 + }, + { + "epoch": 0.53, + "grad_norm": 3.5318523971378313, + "learning_rate": 4.6880397022159325e-06, + "loss": 0.2933, + "step": 8496 + }, + { + "epoch": 0.53, + "grad_norm": 1.398498445276512, + "learning_rate": 4.687023216461257e-06, + "loss": 0.2805, + "step": 8497 + }, + { + "epoch": 0.53, + "grad_norm": 1.4772995611057593, + "learning_rate": 4.686006743692544e-06, + "loss": 0.2819, + "step": 8498 + }, + { + "epoch": 0.53, + "grad_norm": 1.7652774759334076, + "learning_rate": 4.68499028395197e-06, + "loss": 0.2809, + "step": 8499 + }, + { + "epoch": 0.53, + "grad_norm": 1.805621268942833, + "learning_rate": 4.683973837281708e-06, + "loss": 0.2728, + "step": 8500 + }, + { + "epoch": 0.53, + "grad_norm": 2.9497376117748084, + "learning_rate": 4.682957403723933e-06, + "loss": 0.2872, + "step": 8501 + }, + { + "epoch": 0.53, + "grad_norm": 2.7000536329925047, + "learning_rate": 4.681940983320818e-06, + "loss": 0.2906, + "step": 8502 + }, + { + "epoch": 0.53, + "grad_norm": 4.1535056506735675, + "learning_rate": 4.680924576114535e-06, + "loss": 0.282, + "step": 8503 + }, + { + "epoch": 0.53, + "grad_norm": 3.450309175314244, + "learning_rate": 4.679908182147259e-06, + "loss": 0.2887, + "step": 8504 + }, + { + "epoch": 0.53, + "grad_norm": 1.8818322464896178, + "learning_rate": 4.6788918014611624e-06, + "loss": 0.2849, + "step": 8505 + }, + { + "epoch": 0.53, + "grad_norm": 2.470227527852286, + "learning_rate": 4.677875434098414e-06, + "loss": 0.318, + "step": 8506 + }, + { + "epoch": 0.53, + "grad_norm": 3.937588863534305, + "learning_rate": 4.676859080101188e-06, + "loss": 0.2813, + "step": 8507 + }, + { + "epoch": 0.54, + "grad_norm": 1.3713852251171992, + "learning_rate": 4.675842739511651e-06, + "loss": 0.2837, + "step": 8508 + }, + { + "epoch": 0.54, + "grad_norm": 1.4273832632733572, + "learning_rate": 4.674826412371975e-06, + "loss": 0.2943, + "step": 8509 + }, + { + "epoch": 0.54, + "grad_norm": 2.864045484347699, + "learning_rate": 4.673810098724329e-06, + "loss": 0.2909, + "step": 8510 + }, + { + "epoch": 0.54, + "grad_norm": 3.155734353607268, + "learning_rate": 4.6727937986108794e-06, + "loss": 0.2982, + "step": 8511 + }, + { + "epoch": 0.54, + "grad_norm": 1.6988717509513624, + "learning_rate": 4.671777512073797e-06, + "loss": 0.2829, + "step": 8512 + }, + { + "epoch": 0.54, + "grad_norm": 1.9494067114925544, + "learning_rate": 4.67076123915525e-06, + "loss": 0.2889, + "step": 8513 + }, + { + "epoch": 0.54, + "grad_norm": 1.6166359579235476, + "learning_rate": 4.669744979897404e-06, + "loss": 0.2766, + "step": 8514 + }, + { + "epoch": 0.54, + "grad_norm": 1.32629855727391, + "learning_rate": 4.668728734342425e-06, + "loss": 0.2683, + "step": 8515 + }, + { + "epoch": 0.54, + "grad_norm": 1.9156123254111672, + "learning_rate": 4.667712502532479e-06, + "loss": 0.2899, + "step": 8516 + }, + { + "epoch": 0.54, + "grad_norm": 3.110190444078173, + "learning_rate": 4.666696284509731e-06, + "loss": 0.279, + "step": 8517 + }, + { + "epoch": 0.54, + "grad_norm": 1.9956441228668982, + "learning_rate": 4.6656800803163454e-06, + "loss": 0.2739, + "step": 8518 + }, + { + "epoch": 0.54, + "grad_norm": 2.25414985881702, + "learning_rate": 4.664663889994489e-06, + "loss": 0.2828, + "step": 8519 + }, + { + "epoch": 0.54, + "grad_norm": 1.939016030513565, + "learning_rate": 4.6636477135863224e-06, + "loss": 0.2696, + "step": 8520 + }, + { + "epoch": 0.54, + "grad_norm": 1.9488688939407426, + "learning_rate": 4.662631551134011e-06, + "loss": 0.303, + "step": 8521 + }, + { + "epoch": 0.54, + "grad_norm": 5.78436442800251, + "learning_rate": 4.6616154026797144e-06, + "loss": 0.3002, + "step": 8522 + }, + { + "epoch": 0.54, + "grad_norm": 5.224794063484492, + "learning_rate": 4.660599268265597e-06, + "loss": 0.278, + "step": 8523 + }, + { + "epoch": 0.54, + "grad_norm": 4.548502201923166, + "learning_rate": 4.659583147933817e-06, + "loss": 0.2826, + "step": 8524 + }, + { + "epoch": 0.54, + "grad_norm": 41.696948042233466, + "learning_rate": 4.6585670417265365e-06, + "loss": 0.2909, + "step": 8525 + }, + { + "epoch": 0.54, + "grad_norm": 2.2166487367959773, + "learning_rate": 4.657550949685917e-06, + "loss": 0.2808, + "step": 8526 + }, + { + "epoch": 0.54, + "grad_norm": 1.3152271681091539, + "learning_rate": 4.656534871854118e-06, + "loss": 0.2922, + "step": 8527 + }, + { + "epoch": 0.54, + "grad_norm": 13.357387346133596, + "learning_rate": 4.655518808273296e-06, + "loss": 0.2826, + "step": 8528 + }, + { + "epoch": 0.54, + "grad_norm": 1.937390647985258, + "learning_rate": 4.654502758985611e-06, + "loss": 0.2791, + "step": 8529 + }, + { + "epoch": 0.54, + "grad_norm": 5.403687022326223, + "learning_rate": 4.65348672403322e-06, + "loss": 0.3071, + "step": 8530 + }, + { + "epoch": 0.54, + "grad_norm": 1.448216136206693, + "learning_rate": 4.652470703458281e-06, + "loss": 0.2832, + "step": 8531 + }, + { + "epoch": 0.54, + "grad_norm": 2.3510141476195585, + "learning_rate": 4.651454697302949e-06, + "loss": 0.2922, + "step": 8532 + }, + { + "epoch": 0.54, + "grad_norm": 1.2209394625057297, + "learning_rate": 4.650438705609379e-06, + "loss": 0.2792, + "step": 8533 + }, + { + "epoch": 0.54, + "grad_norm": 0.6012992133253265, + "learning_rate": 4.6494227284197296e-06, + "loss": 0.4771, + "step": 8534 + }, + { + "epoch": 0.54, + "grad_norm": 1.5696914695097803, + "learning_rate": 4.648406765776154e-06, + "loss": 0.2923, + "step": 8535 + }, + { + "epoch": 0.54, + "grad_norm": 1.9863923207164877, + "learning_rate": 4.647390817720806e-06, + "loss": 0.2801, + "step": 8536 + }, + { + "epoch": 0.54, + "grad_norm": 2.9554050504720806, + "learning_rate": 4.6463748842958416e-06, + "loss": 0.2992, + "step": 8537 + }, + { + "epoch": 0.54, + "grad_norm": 2.619655729795758, + "learning_rate": 4.645358965543409e-06, + "loss": 0.2754, + "step": 8538 + }, + { + "epoch": 0.54, + "grad_norm": 1.2776661114982213, + "learning_rate": 4.644343061505663e-06, + "loss": 0.2914, + "step": 8539 + }, + { + "epoch": 0.54, + "grad_norm": 3.154509107004463, + "learning_rate": 4.643327172224753e-06, + "loss": 0.271, + "step": 8540 + }, + { + "epoch": 0.54, + "grad_norm": 1.8772663486449142, + "learning_rate": 4.6423112977428346e-06, + "loss": 0.3015, + "step": 8541 + }, + { + "epoch": 0.54, + "grad_norm": 1.500139646785539, + "learning_rate": 4.641295438102056e-06, + "loss": 0.2648, + "step": 8542 + }, + { + "epoch": 0.54, + "grad_norm": 2.1114096276150285, + "learning_rate": 4.6402795933445665e-06, + "loss": 0.2748, + "step": 8543 + }, + { + "epoch": 0.54, + "grad_norm": 1.1839224960513643, + "learning_rate": 4.639263763512516e-06, + "loss": 0.2672, + "step": 8544 + }, + { + "epoch": 0.54, + "grad_norm": 2.8680031295894155, + "learning_rate": 4.638247948648052e-06, + "loss": 0.2775, + "step": 8545 + }, + { + "epoch": 0.54, + "grad_norm": 9.729256112303656, + "learning_rate": 4.637232148793323e-06, + "loss": 0.2943, + "step": 8546 + }, + { + "epoch": 0.54, + "grad_norm": 1.6163734714470552, + "learning_rate": 4.636216363990478e-06, + "loss": 0.2851, + "step": 8547 + }, + { + "epoch": 0.54, + "grad_norm": 3.069947633542292, + "learning_rate": 4.635200594281658e-06, + "loss": 0.2773, + "step": 8548 + }, + { + "epoch": 0.54, + "grad_norm": 1.2162562662701955, + "learning_rate": 4.634184839709018e-06, + "loss": 0.277, + "step": 8549 + }, + { + "epoch": 0.54, + "grad_norm": 2.268725482968977, + "learning_rate": 4.6331691003146966e-06, + "loss": 0.2577, + "step": 8550 + }, + { + "epoch": 0.54, + "grad_norm": 2.2527727220316223, + "learning_rate": 4.632153376140841e-06, + "loss": 0.269, + "step": 8551 + }, + { + "epoch": 0.54, + "grad_norm": 4.20018551401685, + "learning_rate": 4.631137667229596e-06, + "loss": 0.3038, + "step": 8552 + }, + { + "epoch": 0.54, + "grad_norm": 1.7370076469343152, + "learning_rate": 4.630121973623104e-06, + "loss": 0.3108, + "step": 8553 + }, + { + "epoch": 0.54, + "grad_norm": 1.4379113774010683, + "learning_rate": 4.629106295363508e-06, + "loss": 0.2833, + "step": 8554 + }, + { + "epoch": 0.54, + "grad_norm": 1.6625491943937982, + "learning_rate": 4.628090632492949e-06, + "loss": 0.28, + "step": 8555 + }, + { + "epoch": 0.54, + "grad_norm": 2.0031481828937756, + "learning_rate": 4.627074985053572e-06, + "loss": 0.2916, + "step": 8556 + }, + { + "epoch": 0.54, + "grad_norm": 2.3079447897823093, + "learning_rate": 4.626059353087516e-06, + "loss": 0.2753, + "step": 8557 + }, + { + "epoch": 0.54, + "grad_norm": 1.3593400223302623, + "learning_rate": 4.625043736636922e-06, + "loss": 0.2905, + "step": 8558 + }, + { + "epoch": 0.54, + "grad_norm": 1.9397299322249433, + "learning_rate": 4.62402813574393e-06, + "loss": 0.2906, + "step": 8559 + }, + { + "epoch": 0.54, + "grad_norm": 3.4514322496734615, + "learning_rate": 4.623012550450677e-06, + "loss": 0.278, + "step": 8560 + }, + { + "epoch": 0.54, + "grad_norm": 1.9483003155099303, + "learning_rate": 4.621996980799305e-06, + "loss": 0.2826, + "step": 8561 + }, + { + "epoch": 0.54, + "grad_norm": 1.9815518749981667, + "learning_rate": 4.620981426831948e-06, + "loss": 0.287, + "step": 8562 + }, + { + "epoch": 0.54, + "grad_norm": 2.2575559472488433, + "learning_rate": 4.6199658885907425e-06, + "loss": 0.293, + "step": 8563 + }, + { + "epoch": 0.54, + "grad_norm": 4.102273372198595, + "learning_rate": 4.61895036611783e-06, + "loss": 0.2909, + "step": 8564 + }, + { + "epoch": 0.54, + "grad_norm": 2.1884664624238077, + "learning_rate": 4.617934859455344e-06, + "loss": 0.2628, + "step": 8565 + }, + { + "epoch": 0.54, + "grad_norm": 1.9237841388999217, + "learning_rate": 4.6169193686454185e-06, + "loss": 0.2933, + "step": 8566 + }, + { + "epoch": 0.54, + "grad_norm": 1.712097895034254, + "learning_rate": 4.61590389373019e-06, + "loss": 0.2899, + "step": 8567 + }, + { + "epoch": 0.54, + "grad_norm": 1.9398340986304743, + "learning_rate": 4.61488843475179e-06, + "loss": 0.2946, + "step": 8568 + }, + { + "epoch": 0.54, + "grad_norm": 1.362201687229322, + "learning_rate": 4.613872991752354e-06, + "loss": 0.2809, + "step": 8569 + }, + { + "epoch": 0.54, + "grad_norm": 1.9868417860586314, + "learning_rate": 4.6128575647740095e-06, + "loss": 0.2718, + "step": 8570 + }, + { + "epoch": 0.54, + "grad_norm": 2.8410643381187213, + "learning_rate": 4.611842153858896e-06, + "loss": 0.2704, + "step": 8571 + }, + { + "epoch": 0.54, + "grad_norm": 2.913388566325249, + "learning_rate": 4.61082675904914e-06, + "loss": 0.283, + "step": 8572 + }, + { + "epoch": 0.54, + "grad_norm": 4.162397071141212, + "learning_rate": 4.609811380386874e-06, + "loss": 0.2895, + "step": 8573 + }, + { + "epoch": 0.54, + "grad_norm": 1.719229953542822, + "learning_rate": 4.608796017914226e-06, + "loss": 0.2624, + "step": 8574 + }, + { + "epoch": 0.54, + "grad_norm": 4.228916571775909, + "learning_rate": 4.607780671673327e-06, + "loss": 0.2829, + "step": 8575 + }, + { + "epoch": 0.54, + "grad_norm": 1.7571849859213218, + "learning_rate": 4.6067653417063035e-06, + "loss": 0.3053, + "step": 8576 + }, + { + "epoch": 0.54, + "grad_norm": 4.437977724296435, + "learning_rate": 4.6057500280552854e-06, + "loss": 0.2789, + "step": 8577 + }, + { + "epoch": 0.54, + "grad_norm": 3.9557853163063963, + "learning_rate": 4.604734730762396e-06, + "loss": 0.2942, + "step": 8578 + }, + { + "epoch": 0.54, + "grad_norm": 4.121689168314926, + "learning_rate": 4.6037194498697675e-06, + "loss": 0.2935, + "step": 8579 + }, + { + "epoch": 0.54, + "grad_norm": 2.756789307169671, + "learning_rate": 4.602704185419523e-06, + "loss": 0.2775, + "step": 8580 + }, + { + "epoch": 0.54, + "grad_norm": 4.070867974225797, + "learning_rate": 4.601688937453787e-06, + "loss": 0.2829, + "step": 8581 + }, + { + "epoch": 0.54, + "grad_norm": 0.5464122002278814, + "learning_rate": 4.600673706014684e-06, + "loss": 0.4769, + "step": 8582 + }, + { + "epoch": 0.54, + "grad_norm": 1.4424936918505127, + "learning_rate": 4.59965849114434e-06, + "loss": 0.3037, + "step": 8583 + }, + { + "epoch": 0.54, + "grad_norm": 1.458360579085679, + "learning_rate": 4.598643292884874e-06, + "loss": 0.2863, + "step": 8584 + }, + { + "epoch": 0.54, + "grad_norm": 1.6193533008497647, + "learning_rate": 4.59762811127841e-06, + "loss": 0.2804, + "step": 8585 + }, + { + "epoch": 0.54, + "grad_norm": 1.4594848985223432, + "learning_rate": 4.596612946367071e-06, + "loss": 0.2713, + "step": 8586 + }, + { + "epoch": 0.54, + "grad_norm": 1.6832068829104752, + "learning_rate": 4.59559779819298e-06, + "loss": 0.2624, + "step": 8587 + }, + { + "epoch": 0.54, + "grad_norm": 1.8235682189158133, + "learning_rate": 4.594582666798253e-06, + "loss": 0.2949, + "step": 8588 + }, + { + "epoch": 0.54, + "grad_norm": 3.7972655430515903, + "learning_rate": 4.593567552225011e-06, + "loss": 0.2872, + "step": 8589 + }, + { + "epoch": 0.54, + "grad_norm": 3.1422914693986663, + "learning_rate": 4.592552454515373e-06, + "loss": 0.2717, + "step": 8590 + }, + { + "epoch": 0.54, + "grad_norm": 2.070577279731365, + "learning_rate": 4.5915373737114585e-06, + "loss": 0.2867, + "step": 8591 + }, + { + "epoch": 0.54, + "grad_norm": 2.1193049675765288, + "learning_rate": 4.590522309855383e-06, + "loss": 0.2759, + "step": 8592 + }, + { + "epoch": 0.54, + "grad_norm": 2.2639417689430745, + "learning_rate": 4.589507262989263e-06, + "loss": 0.2853, + "step": 8593 + }, + { + "epoch": 0.54, + "grad_norm": 1.916129817849262, + "learning_rate": 4.5884922331552165e-06, + "loss": 0.28, + "step": 8594 + }, + { + "epoch": 0.54, + "grad_norm": 1.9396402468979705, + "learning_rate": 4.587477220395359e-06, + "loss": 0.2951, + "step": 8595 + }, + { + "epoch": 0.54, + "grad_norm": 1.5468369879825565, + "learning_rate": 4.586462224751805e-06, + "loss": 0.2859, + "step": 8596 + }, + { + "epoch": 0.54, + "grad_norm": 3.114272870505431, + "learning_rate": 4.585447246266667e-06, + "loss": 0.2912, + "step": 8597 + }, + { + "epoch": 0.54, + "grad_norm": 2.880106452110881, + "learning_rate": 4.584432284982058e-06, + "loss": 0.2962, + "step": 8598 + }, + { + "epoch": 0.54, + "grad_norm": 1.9331625466461235, + "learning_rate": 4.583417340940094e-06, + "loss": 0.2841, + "step": 8599 + }, + { + "epoch": 0.54, + "grad_norm": 2.71786524563575, + "learning_rate": 4.58240241418288e-06, + "loss": 0.2853, + "step": 8600 + }, + { + "epoch": 0.54, + "grad_norm": 2.1148639066727943, + "learning_rate": 4.581387504752536e-06, + "loss": 0.2777, + "step": 8601 + }, + { + "epoch": 0.54, + "grad_norm": 2.0081180780315173, + "learning_rate": 4.580372612691166e-06, + "loss": 0.2582, + "step": 8602 + }, + { + "epoch": 0.54, + "grad_norm": 6.70801635144531, + "learning_rate": 4.579357738040882e-06, + "loss": 0.2769, + "step": 8603 + }, + { + "epoch": 0.54, + "grad_norm": 1.9732370667667531, + "learning_rate": 4.5783428808437934e-06, + "loss": 0.2713, + "step": 8604 + }, + { + "epoch": 0.54, + "grad_norm": 3.900251136523736, + "learning_rate": 4.577328041142008e-06, + "loss": 0.2843, + "step": 8605 + }, + { + "epoch": 0.54, + "grad_norm": 2.325128911545758, + "learning_rate": 4.576313218977632e-06, + "loss": 0.2766, + "step": 8606 + }, + { + "epoch": 0.54, + "grad_norm": 4.347966589831407, + "learning_rate": 4.575298414392771e-06, + "loss": 0.2701, + "step": 8607 + }, + { + "epoch": 0.54, + "grad_norm": 3.127965411152177, + "learning_rate": 4.574283627429535e-06, + "loss": 0.2671, + "step": 8608 + }, + { + "epoch": 0.54, + "grad_norm": 1.9182318503140041, + "learning_rate": 4.573268858130028e-06, + "loss": 0.2719, + "step": 8609 + }, + { + "epoch": 0.54, + "grad_norm": 2.347908355016946, + "learning_rate": 4.5722541065363535e-06, + "loss": 0.2667, + "step": 8610 + }, + { + "epoch": 0.54, + "grad_norm": 1.8857029220855384, + "learning_rate": 4.571239372690617e-06, + "loss": 0.2738, + "step": 8611 + }, + { + "epoch": 0.54, + "grad_norm": 1.7631583278078047, + "learning_rate": 4.570224656634919e-06, + "loss": 0.2803, + "step": 8612 + }, + { + "epoch": 0.54, + "grad_norm": 1.9975087766146427, + "learning_rate": 4.569209958411365e-06, + "loss": 0.277, + "step": 8613 + }, + { + "epoch": 0.54, + "grad_norm": 0.6197380296160668, + "learning_rate": 4.568195278062053e-06, + "loss": 0.4757, + "step": 8614 + }, + { + "epoch": 0.54, + "grad_norm": 2.8655645199727724, + "learning_rate": 4.567180615629086e-06, + "loss": 0.3062, + "step": 8615 + }, + { + "epoch": 0.54, + "grad_norm": 4.266578233175302, + "learning_rate": 4.566165971154564e-06, + "loss": 0.2906, + "step": 8616 + }, + { + "epoch": 0.54, + "grad_norm": 1.562224386288989, + "learning_rate": 4.565151344680588e-06, + "loss": 0.2779, + "step": 8617 + }, + { + "epoch": 0.54, + "grad_norm": 1.4590352152499, + "learning_rate": 4.564136736249254e-06, + "loss": 0.2596, + "step": 8618 + }, + { + "epoch": 0.54, + "grad_norm": 1.6523264225530647, + "learning_rate": 4.5631221459026624e-06, + "loss": 0.2881, + "step": 8619 + }, + { + "epoch": 0.54, + "grad_norm": 1.93516897016039, + "learning_rate": 4.562107573682908e-06, + "loss": 0.314, + "step": 8620 + }, + { + "epoch": 0.54, + "grad_norm": 3.2328858142846673, + "learning_rate": 4.561093019632089e-06, + "loss": 0.2709, + "step": 8621 + }, + { + "epoch": 0.54, + "grad_norm": 0.551435138906967, + "learning_rate": 4.560078483792297e-06, + "loss": 0.4583, + "step": 8622 + }, + { + "epoch": 0.54, + "grad_norm": 2.183199839452602, + "learning_rate": 4.5590639662056325e-06, + "loss": 0.2828, + "step": 8623 + }, + { + "epoch": 0.54, + "grad_norm": 1.9929203831223479, + "learning_rate": 4.5580494669141865e-06, + "loss": 0.2738, + "step": 8624 + }, + { + "epoch": 0.54, + "grad_norm": 2.0960387672284257, + "learning_rate": 4.557034985960054e-06, + "loss": 0.3031, + "step": 8625 + }, + { + "epoch": 0.54, + "grad_norm": 2.5282488495606374, + "learning_rate": 4.556020523385326e-06, + "loss": 0.2897, + "step": 8626 + }, + { + "epoch": 0.54, + "grad_norm": 1.578240727402863, + "learning_rate": 4.5550060792320965e-06, + "loss": 0.2672, + "step": 8627 + }, + { + "epoch": 0.54, + "grad_norm": 1.5666210547579098, + "learning_rate": 4.5539916535424535e-06, + "loss": 0.2606, + "step": 8628 + }, + { + "epoch": 0.54, + "grad_norm": 3.456100286133945, + "learning_rate": 4.552977246358491e-06, + "loss": 0.2853, + "step": 8629 + }, + { + "epoch": 0.54, + "grad_norm": 0.6108797312051854, + "learning_rate": 4.551962857722293e-06, + "loss": 0.4985, + "step": 8630 + }, + { + "epoch": 0.54, + "grad_norm": 2.6290358899769215, + "learning_rate": 4.5509484876759545e-06, + "loss": 0.2777, + "step": 8631 + }, + { + "epoch": 0.54, + "grad_norm": 1.7857611995949587, + "learning_rate": 4.54993413626156e-06, + "loss": 0.2962, + "step": 8632 + }, + { + "epoch": 0.54, + "grad_norm": 1.5804072172844503, + "learning_rate": 4.548919803521199e-06, + "loss": 0.2651, + "step": 8633 + }, + { + "epoch": 0.54, + "grad_norm": 1.814613053214455, + "learning_rate": 4.547905489496955e-06, + "loss": 0.2728, + "step": 8634 + }, + { + "epoch": 0.54, + "grad_norm": 2.3240438288757073, + "learning_rate": 4.546891194230917e-06, + "loss": 0.265, + "step": 8635 + }, + { + "epoch": 0.54, + "grad_norm": 2.843873010889801, + "learning_rate": 4.545876917765166e-06, + "loss": 0.2854, + "step": 8636 + }, + { + "epoch": 0.54, + "grad_norm": 2.5127610501912625, + "learning_rate": 4.544862660141788e-06, + "loss": 0.3024, + "step": 8637 + }, + { + "epoch": 0.54, + "grad_norm": 1.8378733164015084, + "learning_rate": 4.543848421402868e-06, + "loss": 0.2815, + "step": 8638 + }, + { + "epoch": 0.54, + "grad_norm": 2.3257112494158623, + "learning_rate": 4.542834201590488e-06, + "loss": 0.3038, + "step": 8639 + }, + { + "epoch": 0.54, + "grad_norm": 2.0907842299810633, + "learning_rate": 4.541820000746727e-06, + "loss": 0.2982, + "step": 8640 + }, + { + "epoch": 0.54, + "grad_norm": 3.189431225467172, + "learning_rate": 4.54080581891367e-06, + "loss": 0.2771, + "step": 8641 + }, + { + "epoch": 0.54, + "grad_norm": 2.114439143240404, + "learning_rate": 4.5397916561333945e-06, + "loss": 0.2768, + "step": 8642 + }, + { + "epoch": 0.54, + "grad_norm": 2.3184204342694437, + "learning_rate": 4.538777512447981e-06, + "loss": 0.2953, + "step": 8643 + }, + { + "epoch": 0.54, + "grad_norm": 2.8734650242845112, + "learning_rate": 4.537763387899506e-06, + "loss": 0.2931, + "step": 8644 + }, + { + "epoch": 0.54, + "grad_norm": 1.7485818078402564, + "learning_rate": 4.5367492825300495e-06, + "loss": 0.2725, + "step": 8645 + }, + { + "epoch": 0.54, + "grad_norm": 2.3865100544345346, + "learning_rate": 4.535735196381688e-06, + "loss": 0.2777, + "step": 8646 + }, + { + "epoch": 0.54, + "grad_norm": 2.404794384400355, + "learning_rate": 4.5347211294965e-06, + "loss": 0.2811, + "step": 8647 + }, + { + "epoch": 0.54, + "grad_norm": 1.6905924900884721, + "learning_rate": 4.533707081916557e-06, + "loss": 0.3042, + "step": 8648 + }, + { + "epoch": 0.54, + "grad_norm": 2.273842475985826, + "learning_rate": 4.532693053683937e-06, + "loss": 0.2846, + "step": 8649 + }, + { + "epoch": 0.54, + "grad_norm": 3.868565034561557, + "learning_rate": 4.53167904484071e-06, + "loss": 0.2742, + "step": 8650 + }, + { + "epoch": 0.54, + "grad_norm": 2.248031544343279, + "learning_rate": 4.530665055428953e-06, + "loss": 0.2874, + "step": 8651 + }, + { + "epoch": 0.54, + "grad_norm": 1.511332066634212, + "learning_rate": 4.529651085490735e-06, + "loss": 0.2816, + "step": 8652 + }, + { + "epoch": 0.54, + "grad_norm": 2.139553668217882, + "learning_rate": 4.528637135068128e-06, + "loss": 0.2804, + "step": 8653 + }, + { + "epoch": 0.54, + "grad_norm": 1.8447021782049204, + "learning_rate": 4.527623204203205e-06, + "loss": 0.2768, + "step": 8654 + }, + { + "epoch": 0.54, + "grad_norm": 5.548620663837192, + "learning_rate": 4.526609292938036e-06, + "loss": 0.32, + "step": 8655 + }, + { + "epoch": 0.54, + "grad_norm": 1.9007032332417215, + "learning_rate": 4.525595401314686e-06, + "loss": 0.2812, + "step": 8656 + }, + { + "epoch": 0.54, + "grad_norm": 1.7562137069187282, + "learning_rate": 4.524581529375227e-06, + "loss": 0.2723, + "step": 8657 + }, + { + "epoch": 0.54, + "grad_norm": 5.464324448956869, + "learning_rate": 4.523567677161724e-06, + "loss": 0.2802, + "step": 8658 + }, + { + "epoch": 0.54, + "grad_norm": 1.9539615556465253, + "learning_rate": 4.522553844716245e-06, + "loss": 0.2672, + "step": 8659 + }, + { + "epoch": 0.54, + "grad_norm": 1.6783096749471333, + "learning_rate": 4.521540032080852e-06, + "loss": 0.2769, + "step": 8660 + }, + { + "epoch": 0.54, + "grad_norm": 5.525224386916247, + "learning_rate": 4.520526239297615e-06, + "loss": 0.2747, + "step": 8661 + }, + { + "epoch": 0.54, + "grad_norm": 2.4201583037172347, + "learning_rate": 4.519512466408596e-06, + "loss": 0.2856, + "step": 8662 + }, + { + "epoch": 0.54, + "grad_norm": 2.2608409999456804, + "learning_rate": 4.5184987134558595e-06, + "loss": 0.2971, + "step": 8663 + }, + { + "epoch": 0.54, + "grad_norm": 1.8139226028308588, + "learning_rate": 4.517484980481465e-06, + "loss": 0.2911, + "step": 8664 + }, + { + "epoch": 0.54, + "grad_norm": 1.8104233666573968, + "learning_rate": 4.516471267527477e-06, + "loss": 0.2688, + "step": 8665 + }, + { + "epoch": 0.54, + "grad_norm": 2.159298667844816, + "learning_rate": 4.515457574635954e-06, + "loss": 0.2754, + "step": 8666 + }, + { + "epoch": 0.55, + "grad_norm": 2.8085537988998825, + "learning_rate": 4.514443901848955e-06, + "loss": 0.2794, + "step": 8667 + }, + { + "epoch": 0.55, + "grad_norm": 1.823481615533778, + "learning_rate": 4.513430249208543e-06, + "loss": 0.2566, + "step": 8668 + }, + { + "epoch": 0.55, + "grad_norm": 2.183569794940634, + "learning_rate": 4.512416616756775e-06, + "loss": 0.2781, + "step": 8669 + }, + { + "epoch": 0.55, + "grad_norm": 1.4572901475172415, + "learning_rate": 4.511403004535707e-06, + "loss": 0.2845, + "step": 8670 + }, + { + "epoch": 0.55, + "grad_norm": 2.1257860807033913, + "learning_rate": 4.510389412587396e-06, + "loss": 0.2743, + "step": 8671 + }, + { + "epoch": 0.55, + "grad_norm": 2.586562221477153, + "learning_rate": 4.509375840953898e-06, + "loss": 0.2644, + "step": 8672 + }, + { + "epoch": 0.55, + "grad_norm": 1.8402364882793383, + "learning_rate": 4.508362289677269e-06, + "loss": 0.283, + "step": 8673 + }, + { + "epoch": 0.55, + "grad_norm": 4.456371909412483, + "learning_rate": 4.507348758799559e-06, + "loss": 0.2768, + "step": 8674 + }, + { + "epoch": 0.55, + "grad_norm": 1.8813355872225273, + "learning_rate": 4.506335248362824e-06, + "loss": 0.2696, + "step": 8675 + }, + { + "epoch": 0.55, + "grad_norm": 3.8061554021897575, + "learning_rate": 4.505321758409119e-06, + "loss": 0.2823, + "step": 8676 + }, + { + "epoch": 0.55, + "grad_norm": 1.3721196467245311, + "learning_rate": 4.504308288980492e-06, + "loss": 0.2671, + "step": 8677 + }, + { + "epoch": 0.55, + "grad_norm": 2.2092868208521197, + "learning_rate": 4.503294840118994e-06, + "loss": 0.2712, + "step": 8678 + }, + { + "epoch": 0.55, + "grad_norm": 1.8938703630067009, + "learning_rate": 4.502281411866677e-06, + "loss": 0.2878, + "step": 8679 + }, + { + "epoch": 0.55, + "grad_norm": 3.299576991819874, + "learning_rate": 4.501268004265586e-06, + "loss": 0.2706, + "step": 8680 + }, + { + "epoch": 0.55, + "grad_norm": 4.936327453854064, + "learning_rate": 4.500254617357773e-06, + "loss": 0.2829, + "step": 8681 + }, + { + "epoch": 0.55, + "grad_norm": 2.0996755835709644, + "learning_rate": 4.4992412511852816e-06, + "loss": 0.3129, + "step": 8682 + }, + { + "epoch": 0.55, + "grad_norm": 4.186719977759636, + "learning_rate": 4.498227905790162e-06, + "loss": 0.2874, + "step": 8683 + }, + { + "epoch": 0.55, + "grad_norm": 2.1423388689125566, + "learning_rate": 4.497214581214456e-06, + "loss": 0.2692, + "step": 8684 + }, + { + "epoch": 0.55, + "grad_norm": 1.7107853309303507, + "learning_rate": 4.4962012775002125e-06, + "loss": 0.2653, + "step": 8685 + }, + { + "epoch": 0.55, + "grad_norm": 1.6882321677791206, + "learning_rate": 4.495187994689471e-06, + "loss": 0.2743, + "step": 8686 + }, + { + "epoch": 0.55, + "grad_norm": 4.5269752474096725, + "learning_rate": 4.494174732824279e-06, + "loss": 0.2619, + "step": 8687 + }, + { + "epoch": 0.55, + "grad_norm": 2.587270876376558, + "learning_rate": 4.493161491946673e-06, + "loss": 0.2748, + "step": 8688 + }, + { + "epoch": 0.55, + "grad_norm": 1.4339995786329136, + "learning_rate": 4.492148272098696e-06, + "loss": 0.2663, + "step": 8689 + }, + { + "epoch": 0.55, + "grad_norm": 3.0538978301953326, + "learning_rate": 4.4911350733223915e-06, + "loss": 0.2745, + "step": 8690 + }, + { + "epoch": 0.55, + "grad_norm": 3.0367939560783195, + "learning_rate": 4.490121895659796e-06, + "loss": 0.2801, + "step": 8691 + }, + { + "epoch": 0.55, + "grad_norm": 2.317636489632323, + "learning_rate": 4.489108739152948e-06, + "loss": 0.2782, + "step": 8692 + }, + { + "epoch": 0.55, + "grad_norm": 5.0480680365327215, + "learning_rate": 4.488095603843888e-06, + "loss": 0.2802, + "step": 8693 + }, + { + "epoch": 0.55, + "grad_norm": 1.730880327492681, + "learning_rate": 4.487082489774649e-06, + "loss": 0.3069, + "step": 8694 + }, + { + "epoch": 0.55, + "grad_norm": 20.811687343272325, + "learning_rate": 4.486069396987269e-06, + "loss": 0.2839, + "step": 8695 + }, + { + "epoch": 0.55, + "grad_norm": 1.6116323535684252, + "learning_rate": 4.485056325523783e-06, + "loss": 0.2679, + "step": 8696 + }, + { + "epoch": 0.55, + "grad_norm": 2.489179353127172, + "learning_rate": 4.484043275426223e-06, + "loss": 0.286, + "step": 8697 + }, + { + "epoch": 0.55, + "grad_norm": 1.955007266093121, + "learning_rate": 4.483030246736625e-06, + "loss": 0.2889, + "step": 8698 + }, + { + "epoch": 0.55, + "grad_norm": 3.05642462748354, + "learning_rate": 4.48201723949702e-06, + "loss": 0.2564, + "step": 8699 + }, + { + "epoch": 0.55, + "grad_norm": 2.624203864039519, + "learning_rate": 4.4810042537494395e-06, + "loss": 0.2807, + "step": 8700 + }, + { + "epoch": 0.55, + "grad_norm": 2.792966610432871, + "learning_rate": 4.479991289535916e-06, + "loss": 0.2711, + "step": 8701 + }, + { + "epoch": 0.55, + "grad_norm": 2.1525156680549467, + "learning_rate": 4.4789783468984765e-06, + "loss": 0.2988, + "step": 8702 + }, + { + "epoch": 0.55, + "grad_norm": 19.965924416228063, + "learning_rate": 4.477965425879152e-06, + "loss": 0.2902, + "step": 8703 + }, + { + "epoch": 0.55, + "grad_norm": 4.090974749486585, + "learning_rate": 4.476952526519966e-06, + "loss": 0.263, + "step": 8704 + }, + { + "epoch": 0.55, + "grad_norm": 1.9306939845629139, + "learning_rate": 4.47593964886295e-06, + "loss": 0.2849, + "step": 8705 + }, + { + "epoch": 0.55, + "grad_norm": 10.230335887825449, + "learning_rate": 4.47492679295013e-06, + "loss": 0.2798, + "step": 8706 + }, + { + "epoch": 0.55, + "grad_norm": 1.8523561843967347, + "learning_rate": 4.47391395882353e-06, + "loss": 0.2971, + "step": 8707 + }, + { + "epoch": 0.55, + "grad_norm": 2.903418956748041, + "learning_rate": 4.472901146525174e-06, + "loss": 0.2805, + "step": 8708 + }, + { + "epoch": 0.55, + "grad_norm": 2.2876029265816054, + "learning_rate": 4.471888356097086e-06, + "loss": 0.2949, + "step": 8709 + }, + { + "epoch": 0.55, + "grad_norm": 1.753283396738876, + "learning_rate": 4.470875587581287e-06, + "loss": 0.2638, + "step": 8710 + }, + { + "epoch": 0.55, + "grad_norm": 6.397146123534283, + "learning_rate": 4.4698628410198015e-06, + "loss": 0.2911, + "step": 8711 + }, + { + "epoch": 0.55, + "grad_norm": 1.6834451016165375, + "learning_rate": 4.468850116454645e-06, + "loss": 0.2852, + "step": 8712 + }, + { + "epoch": 0.55, + "grad_norm": 1.5940769876826701, + "learning_rate": 4.467837413927843e-06, + "loss": 0.2776, + "step": 8713 + }, + { + "epoch": 0.55, + "grad_norm": 2.068817197256334, + "learning_rate": 4.466824733481411e-06, + "loss": 0.284, + "step": 8714 + }, + { + "epoch": 0.55, + "grad_norm": 2.733355617713016, + "learning_rate": 4.46581207515737e-06, + "loss": 0.2746, + "step": 8715 + }, + { + "epoch": 0.55, + "grad_norm": 2.4566253155911313, + "learning_rate": 4.464799438997733e-06, + "loss": 0.3043, + "step": 8716 + }, + { + "epoch": 0.55, + "grad_norm": 0.6146027774224536, + "learning_rate": 4.463786825044519e-06, + "loss": 0.5033, + "step": 8717 + }, + { + "epoch": 0.55, + "grad_norm": 1.60328592098261, + "learning_rate": 4.462774233339741e-06, + "loss": 0.2617, + "step": 8718 + }, + { + "epoch": 0.55, + "grad_norm": 1.744840156791795, + "learning_rate": 4.461761663925413e-06, + "loss": 0.2869, + "step": 8719 + }, + { + "epoch": 0.55, + "grad_norm": 2.196057371140877, + "learning_rate": 4.460749116843553e-06, + "loss": 0.2602, + "step": 8720 + }, + { + "epoch": 0.55, + "grad_norm": 3.9060712691560484, + "learning_rate": 4.459736592136167e-06, + "loss": 0.2886, + "step": 8721 + }, + { + "epoch": 0.55, + "grad_norm": 1.8881644284223496, + "learning_rate": 4.45872408984527e-06, + "loss": 0.275, + "step": 8722 + }, + { + "epoch": 0.55, + "grad_norm": 1.7432649745383522, + "learning_rate": 4.457711610012873e-06, + "loss": 0.2992, + "step": 8723 + }, + { + "epoch": 0.55, + "grad_norm": 2.916697148134185, + "learning_rate": 4.456699152680984e-06, + "loss": 0.2965, + "step": 8724 + }, + { + "epoch": 0.55, + "grad_norm": 0.5853925670611175, + "learning_rate": 4.455686717891613e-06, + "loss": 0.4762, + "step": 8725 + }, + { + "epoch": 0.55, + "grad_norm": 1.7172402368895858, + "learning_rate": 4.454674305686766e-06, + "loss": 0.2704, + "step": 8726 + }, + { + "epoch": 0.55, + "grad_norm": 1.559030240890836, + "learning_rate": 4.45366191610845e-06, + "loss": 0.2662, + "step": 8727 + }, + { + "epoch": 0.55, + "grad_norm": 2.1223966740884235, + "learning_rate": 4.452649549198672e-06, + "loss": 0.2764, + "step": 8728 + }, + { + "epoch": 0.55, + "grad_norm": 3.6184665383907286, + "learning_rate": 4.451637204999436e-06, + "loss": 0.277, + "step": 8729 + }, + { + "epoch": 0.55, + "grad_norm": 2.869302526381741, + "learning_rate": 4.450624883552747e-06, + "loss": 0.3031, + "step": 8730 + }, + { + "epoch": 0.55, + "grad_norm": 7.422092786800748, + "learning_rate": 4.449612584900608e-06, + "loss": 0.2922, + "step": 8731 + }, + { + "epoch": 0.55, + "grad_norm": 1.5763176131167882, + "learning_rate": 4.4486003090850195e-06, + "loss": 0.2778, + "step": 8732 + }, + { + "epoch": 0.55, + "grad_norm": 1.9535099900192123, + "learning_rate": 4.447588056147985e-06, + "loss": 0.2831, + "step": 8733 + }, + { + "epoch": 0.55, + "grad_norm": 2.1141862189935123, + "learning_rate": 4.4465758261315e-06, + "loss": 0.2727, + "step": 8734 + }, + { + "epoch": 0.55, + "grad_norm": 2.1301601391388307, + "learning_rate": 4.445563619077569e-06, + "loss": 0.2897, + "step": 8735 + }, + { + "epoch": 0.55, + "grad_norm": 6.302629049170874, + "learning_rate": 4.444551435028189e-06, + "loss": 0.2682, + "step": 8736 + }, + { + "epoch": 0.55, + "grad_norm": 2.269731032538597, + "learning_rate": 4.443539274025356e-06, + "loss": 0.2707, + "step": 8737 + }, + { + "epoch": 0.55, + "grad_norm": 2.3597455198382655, + "learning_rate": 4.442527136111067e-06, + "loss": 0.2982, + "step": 8738 + }, + { + "epoch": 0.55, + "grad_norm": 2.2546564141912344, + "learning_rate": 4.441515021327317e-06, + "loss": 0.2788, + "step": 8739 + }, + { + "epoch": 0.55, + "grad_norm": 2.8816917240269984, + "learning_rate": 4.4405029297161e-06, + "loss": 0.297, + "step": 8740 + }, + { + "epoch": 0.55, + "grad_norm": 4.307853514275158, + "learning_rate": 4.439490861319413e-06, + "loss": 0.2669, + "step": 8741 + }, + { + "epoch": 0.55, + "grad_norm": 1.4396761296655378, + "learning_rate": 4.4384788161792425e-06, + "loss": 0.2729, + "step": 8742 + }, + { + "epoch": 0.55, + "grad_norm": 3.150460184328523, + "learning_rate": 4.437466794337584e-06, + "loss": 0.2988, + "step": 8743 + }, + { + "epoch": 0.55, + "grad_norm": 1.8233458776763678, + "learning_rate": 4.4364547958364275e-06, + "loss": 0.2809, + "step": 8744 + }, + { + "epoch": 0.55, + "grad_norm": 3.339442174771447, + "learning_rate": 4.4354428207177645e-06, + "loss": 0.3095, + "step": 8745 + }, + { + "epoch": 0.55, + "grad_norm": 3.346649406151598, + "learning_rate": 4.434430869023579e-06, + "loss": 0.2827, + "step": 8746 + }, + { + "epoch": 0.55, + "grad_norm": 2.5648129658430627, + "learning_rate": 4.433418940795863e-06, + "loss": 0.2657, + "step": 8747 + }, + { + "epoch": 0.55, + "grad_norm": 2.066333816262808, + "learning_rate": 4.432407036076601e-06, + "loss": 0.2871, + "step": 8748 + }, + { + "epoch": 0.55, + "grad_norm": 3.1392619710472855, + "learning_rate": 4.4313951549077775e-06, + "loss": 0.2633, + "step": 8749 + }, + { + "epoch": 0.55, + "grad_norm": 5.482440548432782, + "learning_rate": 4.430383297331381e-06, + "loss": 0.2965, + "step": 8750 + }, + { + "epoch": 0.55, + "grad_norm": 2.566228056856757, + "learning_rate": 4.429371463389393e-06, + "loss": 0.2913, + "step": 8751 + }, + { + "epoch": 0.55, + "grad_norm": 5.598308007124853, + "learning_rate": 4.428359653123796e-06, + "loss": 0.2866, + "step": 8752 + }, + { + "epoch": 0.55, + "grad_norm": 2.1237908125520835, + "learning_rate": 4.427347866576573e-06, + "loss": 0.2841, + "step": 8753 + }, + { + "epoch": 0.55, + "grad_norm": 1.917361363393779, + "learning_rate": 4.4263361037897046e-06, + "loss": 0.2552, + "step": 8754 + }, + { + "epoch": 0.55, + "grad_norm": 3.9872444843215287, + "learning_rate": 4.4253243648051705e-06, + "loss": 0.2882, + "step": 8755 + }, + { + "epoch": 0.55, + "grad_norm": 2.092550239934306, + "learning_rate": 4.424312649664948e-06, + "loss": 0.2674, + "step": 8756 + }, + { + "epoch": 0.55, + "grad_norm": 1.619481056358797, + "learning_rate": 4.423300958411014e-06, + "loss": 0.2811, + "step": 8757 + }, + { + "epoch": 0.55, + "grad_norm": 2.3155184754507414, + "learning_rate": 4.422289291085351e-06, + "loss": 0.2899, + "step": 8758 + }, + { + "epoch": 0.55, + "grad_norm": 0.6327497334036932, + "learning_rate": 4.42127764772993e-06, + "loss": 0.4672, + "step": 8759 + }, + { + "epoch": 0.55, + "grad_norm": 3.089554601225153, + "learning_rate": 4.420266028386728e-06, + "loss": 0.2847, + "step": 8760 + }, + { + "epoch": 0.55, + "grad_norm": 2.346104769242135, + "learning_rate": 4.419254433097719e-06, + "loss": 0.2635, + "step": 8761 + }, + { + "epoch": 0.55, + "grad_norm": 1.890118108497835, + "learning_rate": 4.418242861904873e-06, + "loss": 0.2842, + "step": 8762 + }, + { + "epoch": 0.55, + "grad_norm": 4.0512350898967515, + "learning_rate": 4.417231314850166e-06, + "loss": 0.279, + "step": 8763 + }, + { + "epoch": 0.55, + "grad_norm": 2.2090497839069054, + "learning_rate": 4.416219791975565e-06, + "loss": 0.2671, + "step": 8764 + }, + { + "epoch": 0.55, + "grad_norm": 3.9640017675332144, + "learning_rate": 4.415208293323042e-06, + "loss": 0.3007, + "step": 8765 + }, + { + "epoch": 0.55, + "grad_norm": 2.5219195963378986, + "learning_rate": 4.414196818934567e-06, + "loss": 0.2573, + "step": 8766 + }, + { + "epoch": 0.55, + "grad_norm": 1.841370004026359, + "learning_rate": 4.413185368852106e-06, + "loss": 0.2699, + "step": 8767 + }, + { + "epoch": 0.55, + "grad_norm": 2.8130023330384395, + "learning_rate": 4.412173943117626e-06, + "loss": 0.2773, + "step": 8768 + }, + { + "epoch": 0.55, + "grad_norm": 2.7112533878540717, + "learning_rate": 4.4111625417730935e-06, + "loss": 0.2785, + "step": 8769 + }, + { + "epoch": 0.55, + "grad_norm": 3.498945737985453, + "learning_rate": 4.410151164860472e-06, + "loss": 0.2918, + "step": 8770 + }, + { + "epoch": 0.55, + "grad_norm": 12.3917418633594, + "learning_rate": 4.409139812421726e-06, + "loss": 0.2571, + "step": 8771 + }, + { + "epoch": 0.55, + "grad_norm": 3.9056012879771713, + "learning_rate": 4.408128484498821e-06, + "loss": 0.3047, + "step": 8772 + }, + { + "epoch": 0.55, + "grad_norm": 3.110824954788999, + "learning_rate": 4.407117181133715e-06, + "loss": 0.2934, + "step": 8773 + }, + { + "epoch": 0.55, + "grad_norm": 4.58552081856733, + "learning_rate": 4.406105902368371e-06, + "loss": 0.2676, + "step": 8774 + }, + { + "epoch": 0.55, + "grad_norm": 1.6280054243402897, + "learning_rate": 4.405094648244747e-06, + "loss": 0.2647, + "step": 8775 + }, + { + "epoch": 0.55, + "grad_norm": 1.9021394426162868, + "learning_rate": 4.404083418804803e-06, + "loss": 0.2641, + "step": 8776 + }, + { + "epoch": 0.55, + "grad_norm": 2.3452257948415998, + "learning_rate": 4.403072214090498e-06, + "loss": 0.2935, + "step": 8777 + }, + { + "epoch": 0.55, + "grad_norm": 4.024300497890209, + "learning_rate": 4.402061034143785e-06, + "loss": 0.2634, + "step": 8778 + }, + { + "epoch": 0.55, + "grad_norm": 3.0979430973519455, + "learning_rate": 4.401049879006621e-06, + "loss": 0.2896, + "step": 8779 + }, + { + "epoch": 0.55, + "grad_norm": 3.0786174457969815, + "learning_rate": 4.400038748720964e-06, + "loss": 0.2682, + "step": 8780 + }, + { + "epoch": 0.55, + "grad_norm": 5.778758056238694, + "learning_rate": 4.399027643328764e-06, + "loss": 0.2667, + "step": 8781 + }, + { + "epoch": 0.55, + "grad_norm": 7.407984165891581, + "learning_rate": 4.3980165628719745e-06, + "loss": 0.3092, + "step": 8782 + }, + { + "epoch": 0.55, + "grad_norm": 1.9337897479889632, + "learning_rate": 4.397005507392547e-06, + "loss": 0.2704, + "step": 8783 + }, + { + "epoch": 0.55, + "grad_norm": 2.0966101736019462, + "learning_rate": 4.3959944769324314e-06, + "loss": 0.2716, + "step": 8784 + }, + { + "epoch": 0.55, + "grad_norm": 1.5932500690387283, + "learning_rate": 4.394983471533579e-06, + "loss": 0.2712, + "step": 8785 + }, + { + "epoch": 0.55, + "grad_norm": 1.7693085950654157, + "learning_rate": 4.393972491237935e-06, + "loss": 0.2749, + "step": 8786 + }, + { + "epoch": 0.55, + "grad_norm": 2.6338923319208116, + "learning_rate": 4.39296153608745e-06, + "loss": 0.2894, + "step": 8787 + }, + { + "epoch": 0.55, + "grad_norm": 3.283903438910121, + "learning_rate": 4.39195060612407e-06, + "loss": 0.2868, + "step": 8788 + }, + { + "epoch": 0.55, + "grad_norm": 2.030213942018656, + "learning_rate": 4.390939701389738e-06, + "loss": 0.2699, + "step": 8789 + }, + { + "epoch": 0.55, + "grad_norm": 2.5366393441986386, + "learning_rate": 4.3899288219264e-06, + "loss": 0.2844, + "step": 8790 + }, + { + "epoch": 0.55, + "grad_norm": 3.4356533143359775, + "learning_rate": 4.388917967776e-06, + "loss": 0.2797, + "step": 8791 + }, + { + "epoch": 0.55, + "grad_norm": 2.782377717338023, + "learning_rate": 4.387907138980477e-06, + "loss": 0.287, + "step": 8792 + }, + { + "epoch": 0.55, + "grad_norm": 2.123170864027156, + "learning_rate": 4.386896335581776e-06, + "loss": 0.2809, + "step": 8793 + }, + { + "epoch": 0.55, + "grad_norm": 2.6215816364133904, + "learning_rate": 4.385885557621833e-06, + "loss": 0.2708, + "step": 8794 + }, + { + "epoch": 0.55, + "grad_norm": 3.036613786061605, + "learning_rate": 4.38487480514259e-06, + "loss": 0.2908, + "step": 8795 + }, + { + "epoch": 0.55, + "grad_norm": 1.3803151073275663, + "learning_rate": 4.383864078185984e-06, + "loss": 0.2701, + "step": 8796 + }, + { + "epoch": 0.55, + "grad_norm": 2.813758549658407, + "learning_rate": 4.382853376793951e-06, + "loss": 0.2751, + "step": 8797 + }, + { + "epoch": 0.55, + "grad_norm": 2.309428422198222, + "learning_rate": 4.381842701008428e-06, + "loss": 0.2689, + "step": 8798 + }, + { + "epoch": 0.55, + "grad_norm": 2.4856878017942488, + "learning_rate": 4.38083205087135e-06, + "loss": 0.2734, + "step": 8799 + }, + { + "epoch": 0.55, + "grad_norm": 2.587352160696565, + "learning_rate": 4.379821426424649e-06, + "loss": 0.2836, + "step": 8800 + }, + { + "epoch": 0.55, + "grad_norm": 1.6788180503058727, + "learning_rate": 4.378810827710258e-06, + "loss": 0.2842, + "step": 8801 + }, + { + "epoch": 0.55, + "grad_norm": 1.7877170572081649, + "learning_rate": 4.3778002547701115e-06, + "loss": 0.2987, + "step": 8802 + }, + { + "epoch": 0.55, + "grad_norm": 1.8127992907502553, + "learning_rate": 4.376789707646136e-06, + "loss": 0.2597, + "step": 8803 + }, + { + "epoch": 0.55, + "grad_norm": 1.6606962822774591, + "learning_rate": 4.375779186380264e-06, + "loss": 0.2674, + "step": 8804 + }, + { + "epoch": 0.55, + "grad_norm": 1.6529512964717805, + "learning_rate": 4.37476869101442e-06, + "loss": 0.2755, + "step": 8805 + }, + { + "epoch": 0.55, + "grad_norm": 2.1804116028790723, + "learning_rate": 4.373758221590534e-06, + "loss": 0.2697, + "step": 8806 + }, + { + "epoch": 0.55, + "grad_norm": 0.659638487284497, + "learning_rate": 4.372747778150534e-06, + "loss": 0.4995, + "step": 8807 + }, + { + "epoch": 0.55, + "grad_norm": 2.5290014777933707, + "learning_rate": 4.37173736073634e-06, + "loss": 0.303, + "step": 8808 + }, + { + "epoch": 0.55, + "grad_norm": 7.305633194643116, + "learning_rate": 4.3707269693898785e-06, + "loss": 0.2826, + "step": 8809 + }, + { + "epoch": 0.55, + "grad_norm": 2.0251981831796755, + "learning_rate": 4.3697166041530745e-06, + "loss": 0.3124, + "step": 8810 + }, + { + "epoch": 0.55, + "grad_norm": 5.292296163971953, + "learning_rate": 4.368706265067848e-06, + "loss": 0.296, + "step": 8811 + }, + { + "epoch": 0.55, + "grad_norm": 2.16034555572653, + "learning_rate": 4.3676959521761204e-06, + "loss": 0.3045, + "step": 8812 + }, + { + "epoch": 0.55, + "grad_norm": 3.2779825250555747, + "learning_rate": 4.36668566551981e-06, + "loss": 0.2639, + "step": 8813 + }, + { + "epoch": 0.55, + "grad_norm": 3.272932253024524, + "learning_rate": 4.365675405140835e-06, + "loss": 0.2909, + "step": 8814 + }, + { + "epoch": 0.55, + "grad_norm": 2.0181629381687625, + "learning_rate": 4.364665171081117e-06, + "loss": 0.319, + "step": 8815 + }, + { + "epoch": 0.55, + "grad_norm": 1.6001049146500168, + "learning_rate": 4.363654963382566e-06, + "loss": 0.2653, + "step": 8816 + }, + { + "epoch": 0.55, + "grad_norm": 4.862646015552912, + "learning_rate": 4.362644782087103e-06, + "loss": 0.2967, + "step": 8817 + }, + { + "epoch": 0.55, + "grad_norm": 1.7613298739018228, + "learning_rate": 4.361634627236641e-06, + "loss": 0.2885, + "step": 8818 + }, + { + "epoch": 0.55, + "grad_norm": 3.3554023763187777, + "learning_rate": 4.36062449887309e-06, + "loss": 0.2969, + "step": 8819 + }, + { + "epoch": 0.55, + "grad_norm": 2.3492156090735654, + "learning_rate": 4.3596143970383665e-06, + "loss": 0.2937, + "step": 8820 + }, + { + "epoch": 0.55, + "grad_norm": 2.0430566626109665, + "learning_rate": 4.358604321774378e-06, + "loss": 0.2853, + "step": 8821 + }, + { + "epoch": 0.55, + "grad_norm": 4.331127963768257, + "learning_rate": 4.357594273123034e-06, + "loss": 0.2727, + "step": 8822 + }, + { + "epoch": 0.55, + "grad_norm": 2.7288435665347155, + "learning_rate": 4.356584251126247e-06, + "loss": 0.2853, + "step": 8823 + }, + { + "epoch": 0.55, + "grad_norm": 3.4801098710765923, + "learning_rate": 4.35557425582592e-06, + "loss": 0.2593, + "step": 8824 + }, + { + "epoch": 0.55, + "grad_norm": 1.534546151544299, + "learning_rate": 4.354564287263963e-06, + "loss": 0.289, + "step": 8825 + }, + { + "epoch": 0.56, + "grad_norm": 2.500854742667218, + "learning_rate": 4.353554345482281e-06, + "loss": 0.3003, + "step": 8826 + }, + { + "epoch": 0.56, + "grad_norm": 2.0811475203848486, + "learning_rate": 4.352544430522776e-06, + "loss": 0.2637, + "step": 8827 + }, + { + "epoch": 0.56, + "grad_norm": 1.736678915037675, + "learning_rate": 4.3515345424273534e-06, + "loss": 0.2727, + "step": 8828 + }, + { + "epoch": 0.56, + "grad_norm": 2.779011418387543, + "learning_rate": 4.350524681237914e-06, + "loss": 0.2915, + "step": 8829 + }, + { + "epoch": 0.56, + "grad_norm": 2.3772966415982157, + "learning_rate": 4.349514846996359e-06, + "loss": 0.3121, + "step": 8830 + }, + { + "epoch": 0.56, + "grad_norm": 2.9069514685270286, + "learning_rate": 4.348505039744588e-06, + "loss": 0.264, + "step": 8831 + }, + { + "epoch": 0.56, + "grad_norm": 1.8079596990685507, + "learning_rate": 4.347495259524502e-06, + "loss": 0.2663, + "step": 8832 + }, + { + "epoch": 0.56, + "grad_norm": 3.0552600442669373, + "learning_rate": 4.346485506377995e-06, + "loss": 0.29, + "step": 8833 + }, + { + "epoch": 0.56, + "grad_norm": 4.443785518207296, + "learning_rate": 4.345475780346966e-06, + "loss": 0.2826, + "step": 8834 + }, + { + "epoch": 0.56, + "grad_norm": 1.4957452462973535, + "learning_rate": 4.344466081473308e-06, + "loss": 0.2745, + "step": 8835 + }, + { + "epoch": 0.56, + "grad_norm": 2.79804967424767, + "learning_rate": 4.343456409798919e-06, + "loss": 0.2953, + "step": 8836 + }, + { + "epoch": 0.56, + "grad_norm": 2.847971924603, + "learning_rate": 4.342446765365688e-06, + "loss": 0.2739, + "step": 8837 + }, + { + "epoch": 0.56, + "grad_norm": 2.6603864587335897, + "learning_rate": 4.341437148215509e-06, + "loss": 0.2813, + "step": 8838 + }, + { + "epoch": 0.56, + "grad_norm": 1.9345727328991653, + "learning_rate": 4.3404275583902715e-06, + "loss": 0.2838, + "step": 8839 + }, + { + "epoch": 0.56, + "grad_norm": 1.8273485211933544, + "learning_rate": 4.339417995931868e-06, + "loss": 0.2805, + "step": 8840 + }, + { + "epoch": 0.56, + "grad_norm": 1.9221728253766586, + "learning_rate": 4.3384084608821836e-06, + "loss": 0.2709, + "step": 8841 + }, + { + "epoch": 0.56, + "grad_norm": 2.3001131084931226, + "learning_rate": 4.337398953283108e-06, + "loss": 0.2908, + "step": 8842 + }, + { + "epoch": 0.56, + "grad_norm": 5.327751492623303, + "learning_rate": 4.3363894731765275e-06, + "loss": 0.2832, + "step": 8843 + }, + { + "epoch": 0.56, + "grad_norm": 2.286516408866927, + "learning_rate": 4.335380020604325e-06, + "loss": 0.2774, + "step": 8844 + }, + { + "epoch": 0.56, + "grad_norm": 3.4334218433812853, + "learning_rate": 4.334370595608388e-06, + "loss": 0.2909, + "step": 8845 + }, + { + "epoch": 0.56, + "grad_norm": 2.558465768562449, + "learning_rate": 4.333361198230594e-06, + "loss": 0.2876, + "step": 8846 + }, + { + "epoch": 0.56, + "grad_norm": 3.5918837871139515, + "learning_rate": 4.33235182851283e-06, + "loss": 0.2767, + "step": 8847 + }, + { + "epoch": 0.56, + "grad_norm": 1.7756730051623848, + "learning_rate": 4.331342486496974e-06, + "loss": 0.2787, + "step": 8848 + }, + { + "epoch": 0.56, + "grad_norm": 4.72648946975082, + "learning_rate": 4.330333172224906e-06, + "loss": 0.2697, + "step": 8849 + }, + { + "epoch": 0.56, + "grad_norm": 1.9080727141380536, + "learning_rate": 4.3293238857385055e-06, + "loss": 0.2665, + "step": 8850 + }, + { + "epoch": 0.56, + "grad_norm": 1.5495870138713592, + "learning_rate": 4.328314627079647e-06, + "loss": 0.2699, + "step": 8851 + }, + { + "epoch": 0.56, + "grad_norm": 2.228494027164655, + "learning_rate": 4.327305396290208e-06, + "loss": 0.2666, + "step": 8852 + }, + { + "epoch": 0.56, + "grad_norm": 4.40738067116132, + "learning_rate": 4.326296193412061e-06, + "loss": 0.2687, + "step": 8853 + }, + { + "epoch": 0.56, + "grad_norm": 3.4099924326658404, + "learning_rate": 4.325287018487085e-06, + "loss": 0.2666, + "step": 8854 + }, + { + "epoch": 0.56, + "grad_norm": 2.716089079601977, + "learning_rate": 4.324277871557146e-06, + "loss": 0.2682, + "step": 8855 + }, + { + "epoch": 0.56, + "grad_norm": 2.406387412827769, + "learning_rate": 4.323268752664121e-06, + "loss": 0.2712, + "step": 8856 + }, + { + "epoch": 0.56, + "grad_norm": 2.047449486050684, + "learning_rate": 4.322259661849876e-06, + "loss": 0.263, + "step": 8857 + }, + { + "epoch": 0.56, + "grad_norm": 8.474439262709888, + "learning_rate": 4.321250599156282e-06, + "loss": 0.2917, + "step": 8858 + }, + { + "epoch": 0.56, + "grad_norm": 2.1979414166924056, + "learning_rate": 4.320241564625206e-06, + "loss": 0.2574, + "step": 8859 + }, + { + "epoch": 0.56, + "grad_norm": 1.9528934305107515, + "learning_rate": 4.319232558298514e-06, + "loss": 0.2685, + "step": 8860 + }, + { + "epoch": 0.56, + "grad_norm": 4.164371315133676, + "learning_rate": 4.318223580218072e-06, + "loss": 0.298, + "step": 8861 + }, + { + "epoch": 0.56, + "grad_norm": 2.1918782637115277, + "learning_rate": 4.3172146304257466e-06, + "loss": 0.2712, + "step": 8862 + }, + { + "epoch": 0.56, + "grad_norm": 5.402434207344038, + "learning_rate": 4.316205708963398e-06, + "loss": 0.2665, + "step": 8863 + }, + { + "epoch": 0.56, + "grad_norm": 1.443865349213514, + "learning_rate": 4.315196815872889e-06, + "loss": 0.2689, + "step": 8864 + }, + { + "epoch": 0.56, + "grad_norm": 2.0935529911608937, + "learning_rate": 4.314187951196081e-06, + "loss": 0.2977, + "step": 8865 + }, + { + "epoch": 0.56, + "grad_norm": 2.4900474685702436, + "learning_rate": 4.313179114974832e-06, + "loss": 0.2671, + "step": 8866 + }, + { + "epoch": 0.56, + "grad_norm": 1.5291984547588007, + "learning_rate": 4.312170307251003e-06, + "loss": 0.2703, + "step": 8867 + }, + { + "epoch": 0.56, + "grad_norm": 2.4134436391843623, + "learning_rate": 4.311161528066447e-06, + "loss": 0.2716, + "step": 8868 + }, + { + "epoch": 0.56, + "grad_norm": 2.422658943864349, + "learning_rate": 4.310152777463025e-06, + "loss": 0.2814, + "step": 8869 + }, + { + "epoch": 0.56, + "grad_norm": 1.5609013701217902, + "learning_rate": 4.309144055482589e-06, + "loss": 0.273, + "step": 8870 + }, + { + "epoch": 0.56, + "grad_norm": 3.8882123327606752, + "learning_rate": 4.308135362166993e-06, + "loss": 0.2802, + "step": 8871 + }, + { + "epoch": 0.56, + "grad_norm": 1.5452605078974324, + "learning_rate": 4.307126697558091e-06, + "loss": 0.2589, + "step": 8872 + }, + { + "epoch": 0.56, + "grad_norm": 2.0462801740054424, + "learning_rate": 4.306118061697732e-06, + "loss": 0.2643, + "step": 8873 + }, + { + "epoch": 0.56, + "grad_norm": 1.617369301677123, + "learning_rate": 4.305109454627766e-06, + "loss": 0.2598, + "step": 8874 + }, + { + "epoch": 0.56, + "grad_norm": 3.2837312884593506, + "learning_rate": 4.304100876390045e-06, + "loss": 0.2671, + "step": 8875 + }, + { + "epoch": 0.56, + "grad_norm": 2.102618332814836, + "learning_rate": 4.303092327026412e-06, + "loss": 0.2781, + "step": 8876 + }, + { + "epoch": 0.56, + "grad_norm": 5.642870964127251, + "learning_rate": 4.302083806578718e-06, + "loss": 0.2976, + "step": 8877 + }, + { + "epoch": 0.56, + "grad_norm": 3.2466194788050067, + "learning_rate": 4.301075315088808e-06, + "loss": 0.2836, + "step": 8878 + }, + { + "epoch": 0.56, + "grad_norm": 1.6094421454054106, + "learning_rate": 4.300066852598522e-06, + "loss": 0.2735, + "step": 8879 + }, + { + "epoch": 0.56, + "grad_norm": 2.7641210259148083, + "learning_rate": 4.299058419149707e-06, + "loss": 0.304, + "step": 8880 + }, + { + "epoch": 0.56, + "grad_norm": 2.497019961234568, + "learning_rate": 4.298050014784203e-06, + "loss": 0.2897, + "step": 8881 + }, + { + "epoch": 0.56, + "grad_norm": 2.2130726264740694, + "learning_rate": 4.29704163954385e-06, + "loss": 0.2581, + "step": 8882 + }, + { + "epoch": 0.56, + "grad_norm": 1.7827414494481095, + "learning_rate": 4.296033293470487e-06, + "loss": 0.2575, + "step": 8883 + }, + { + "epoch": 0.56, + "grad_norm": 1.5952871810474307, + "learning_rate": 4.295024976605955e-06, + "loss": 0.2678, + "step": 8884 + }, + { + "epoch": 0.56, + "grad_norm": 5.048186536648519, + "learning_rate": 4.2940166889920885e-06, + "loss": 0.305, + "step": 8885 + }, + { + "epoch": 0.56, + "grad_norm": 2.468594996183448, + "learning_rate": 4.293008430670724e-06, + "loss": 0.2753, + "step": 8886 + }, + { + "epoch": 0.56, + "grad_norm": 2.04924145427867, + "learning_rate": 4.2920002016836944e-06, + "loss": 0.2693, + "step": 8887 + }, + { + "epoch": 0.56, + "grad_norm": 1.8709365030415537, + "learning_rate": 4.290992002072836e-06, + "loss": 0.2706, + "step": 8888 + }, + { + "epoch": 0.56, + "grad_norm": 1.874629366645945, + "learning_rate": 4.2899838318799765e-06, + "loss": 0.2766, + "step": 8889 + }, + { + "epoch": 0.56, + "grad_norm": 1.879841030053625, + "learning_rate": 4.288975691146949e-06, + "loss": 0.271, + "step": 8890 + }, + { + "epoch": 0.56, + "grad_norm": 2.1328135006619013, + "learning_rate": 4.287967579915583e-06, + "loss": 0.2801, + "step": 8891 + }, + { + "epoch": 0.56, + "grad_norm": 1.8558413617193286, + "learning_rate": 4.286959498227708e-06, + "loss": 0.2657, + "step": 8892 + }, + { + "epoch": 0.56, + "grad_norm": 3.7925751896906728, + "learning_rate": 4.285951446125149e-06, + "loss": 0.26, + "step": 8893 + }, + { + "epoch": 0.56, + "grad_norm": 2.128898122164901, + "learning_rate": 4.284943423649734e-06, + "loss": 0.3001, + "step": 8894 + }, + { + "epoch": 0.56, + "grad_norm": 2.1364360496462225, + "learning_rate": 4.283935430843285e-06, + "loss": 0.2593, + "step": 8895 + }, + { + "epoch": 0.56, + "grad_norm": 1.7862988839976337, + "learning_rate": 4.2829274677476284e-06, + "loss": 0.261, + "step": 8896 + }, + { + "epoch": 0.56, + "grad_norm": 2.728330977776241, + "learning_rate": 4.281919534404584e-06, + "loss": 0.2665, + "step": 8897 + }, + { + "epoch": 0.56, + "grad_norm": 1.9728779970840908, + "learning_rate": 4.280911630855972e-06, + "loss": 0.2592, + "step": 8898 + }, + { + "epoch": 0.56, + "grad_norm": 2.7561037680395684, + "learning_rate": 4.279903757143615e-06, + "loss": 0.2888, + "step": 8899 + }, + { + "epoch": 0.56, + "grad_norm": 1.9675289154909317, + "learning_rate": 4.278895913309331e-06, + "loss": 0.2886, + "step": 8900 + }, + { + "epoch": 0.56, + "grad_norm": 2.267841875167877, + "learning_rate": 4.277888099394935e-06, + "loss": 0.2786, + "step": 8901 + }, + { + "epoch": 0.56, + "grad_norm": 5.322292873054597, + "learning_rate": 4.276880315442246e-06, + "loss": 0.2762, + "step": 8902 + }, + { + "epoch": 0.56, + "grad_norm": 2.0458923668148548, + "learning_rate": 4.275872561493076e-06, + "loss": 0.2754, + "step": 8903 + }, + { + "epoch": 0.56, + "grad_norm": 3.526948251766585, + "learning_rate": 4.274864837589241e-06, + "loss": 0.2759, + "step": 8904 + }, + { + "epoch": 0.56, + "grad_norm": 3.3858840883045325, + "learning_rate": 4.27385714377255e-06, + "loss": 0.2814, + "step": 8905 + }, + { + "epoch": 0.56, + "grad_norm": 7.315365768078168, + "learning_rate": 4.272849480084815e-06, + "loss": 0.2815, + "step": 8906 + }, + { + "epoch": 0.56, + "grad_norm": 4.38894179386223, + "learning_rate": 4.271841846567848e-06, + "loss": 0.2845, + "step": 8907 + }, + { + "epoch": 0.56, + "grad_norm": 2.2669536190395756, + "learning_rate": 4.270834243263457e-06, + "loss": 0.2651, + "step": 8908 + }, + { + "epoch": 0.56, + "grad_norm": 6.1972565635394075, + "learning_rate": 4.269826670213448e-06, + "loss": 0.2761, + "step": 8909 + }, + { + "epoch": 0.56, + "grad_norm": 1.9157004083104388, + "learning_rate": 4.268819127459628e-06, + "loss": 0.2791, + "step": 8910 + }, + { + "epoch": 0.56, + "grad_norm": 2.1022674094391616, + "learning_rate": 4.2678116150437996e-06, + "loss": 0.2913, + "step": 8911 + }, + { + "epoch": 0.56, + "grad_norm": 1.9327625254882868, + "learning_rate": 4.266804133007768e-06, + "loss": 0.2667, + "step": 8912 + }, + { + "epoch": 0.56, + "grad_norm": 1.9456884074858392, + "learning_rate": 4.265796681393334e-06, + "loss": 0.2734, + "step": 8913 + }, + { + "epoch": 0.56, + "grad_norm": 4.111866457250431, + "learning_rate": 4.264789260242302e-06, + "loss": 0.2823, + "step": 8914 + }, + { + "epoch": 0.56, + "grad_norm": 1.7224210336776686, + "learning_rate": 4.2637818695964695e-06, + "loss": 0.2656, + "step": 8915 + }, + { + "epoch": 0.56, + "grad_norm": 2.3786303189828897, + "learning_rate": 4.262774509497635e-06, + "loss": 0.2583, + "step": 8916 + }, + { + "epoch": 0.56, + "grad_norm": 0.6003560851926935, + "learning_rate": 4.261767179987595e-06, + "loss": 0.4711, + "step": 8917 + }, + { + "epoch": 0.56, + "grad_norm": 2.0008897705973374, + "learning_rate": 4.260759881108146e-06, + "loss": 0.2676, + "step": 8918 + }, + { + "epoch": 0.56, + "grad_norm": 1.5084236603405603, + "learning_rate": 4.259752612901082e-06, + "loss": 0.2782, + "step": 8919 + }, + { + "epoch": 0.56, + "grad_norm": 2.2484175022206485, + "learning_rate": 4.258745375408197e-06, + "loss": 0.2773, + "step": 8920 + }, + { + "epoch": 0.56, + "grad_norm": 1.9786812173023716, + "learning_rate": 4.257738168671282e-06, + "loss": 0.2699, + "step": 8921 + }, + { + "epoch": 0.56, + "grad_norm": 0.5888677555861132, + "learning_rate": 4.256730992732131e-06, + "loss": 0.4822, + "step": 8922 + }, + { + "epoch": 0.56, + "grad_norm": 2.9832092959576713, + "learning_rate": 4.25572384763253e-06, + "loss": 0.267, + "step": 8923 + }, + { + "epoch": 0.56, + "grad_norm": 1.6721588709757083, + "learning_rate": 4.25471673341427e-06, + "loss": 0.277, + "step": 8924 + }, + { + "epoch": 0.56, + "grad_norm": 2.28239510560659, + "learning_rate": 4.253709650119136e-06, + "loss": 0.286, + "step": 8925 + }, + { + "epoch": 0.56, + "grad_norm": 2.500595624838231, + "learning_rate": 4.252702597788914e-06, + "loss": 0.2811, + "step": 8926 + }, + { + "epoch": 0.56, + "grad_norm": 3.236622128288961, + "learning_rate": 4.251695576465387e-06, + "loss": 0.2617, + "step": 8927 + }, + { + "epoch": 0.56, + "grad_norm": 2.2044057139659348, + "learning_rate": 4.25068858619034e-06, + "loss": 0.2667, + "step": 8928 + }, + { + "epoch": 0.56, + "grad_norm": 10.69057199713099, + "learning_rate": 4.249681627005555e-06, + "loss": 0.2886, + "step": 8929 + }, + { + "epoch": 0.56, + "grad_norm": 2.8128634196629156, + "learning_rate": 4.248674698952813e-06, + "loss": 0.2816, + "step": 8930 + }, + { + "epoch": 0.56, + "grad_norm": 4.9092676289630655, + "learning_rate": 4.247667802073892e-06, + "loss": 0.271, + "step": 8931 + }, + { + "epoch": 0.56, + "grad_norm": 1.7101813743100585, + "learning_rate": 4.2466609364105705e-06, + "loss": 0.2688, + "step": 8932 + }, + { + "epoch": 0.56, + "grad_norm": 2.0294173890705043, + "learning_rate": 4.2456541020046235e-06, + "loss": 0.2712, + "step": 8933 + }, + { + "epoch": 0.56, + "grad_norm": 2.3269548514176295, + "learning_rate": 4.24464729889783e-06, + "loss": 0.2714, + "step": 8934 + }, + { + "epoch": 0.56, + "grad_norm": 3.065650619612148, + "learning_rate": 4.243640527131956e-06, + "loss": 0.2695, + "step": 8935 + }, + { + "epoch": 0.56, + "grad_norm": 2.756013136161544, + "learning_rate": 4.242633786748786e-06, + "loss": 0.2737, + "step": 8936 + }, + { + "epoch": 0.56, + "grad_norm": 1.9746356836729337, + "learning_rate": 4.241627077790083e-06, + "loss": 0.2925, + "step": 8937 + }, + { + "epoch": 0.56, + "grad_norm": 2.188654632913122, + "learning_rate": 4.24062040029762e-06, + "loss": 0.288, + "step": 8938 + }, + { + "epoch": 0.56, + "grad_norm": 2.3826716843561906, + "learning_rate": 4.239613754313166e-06, + "loss": 0.2909, + "step": 8939 + }, + { + "epoch": 0.56, + "grad_norm": 2.8464110193219216, + "learning_rate": 4.238607139878488e-06, + "loss": 0.2727, + "step": 8940 + }, + { + "epoch": 0.56, + "grad_norm": 1.8897168708999552, + "learning_rate": 4.237600557035351e-06, + "loss": 0.2829, + "step": 8941 + }, + { + "epoch": 0.56, + "grad_norm": 5.811244011148771, + "learning_rate": 4.236594005825522e-06, + "loss": 0.2942, + "step": 8942 + }, + { + "epoch": 0.56, + "grad_norm": 2.3708395973271745, + "learning_rate": 4.235587486290761e-06, + "loss": 0.2816, + "step": 8943 + }, + { + "epoch": 0.56, + "grad_norm": 1.848707589575137, + "learning_rate": 4.234580998472836e-06, + "loss": 0.2735, + "step": 8944 + }, + { + "epoch": 0.56, + "grad_norm": 2.0818197647066383, + "learning_rate": 4.2335745424135035e-06, + "loss": 0.2895, + "step": 8945 + }, + { + "epoch": 0.56, + "grad_norm": 1.8105190386405015, + "learning_rate": 4.232568118154526e-06, + "loss": 0.2802, + "step": 8946 + }, + { + "epoch": 0.56, + "grad_norm": 2.6744721580363158, + "learning_rate": 4.231561725737659e-06, + "loss": 0.303, + "step": 8947 + }, + { + "epoch": 0.56, + "grad_norm": 2.7062637016033686, + "learning_rate": 4.230555365204662e-06, + "loss": 0.2674, + "step": 8948 + }, + { + "epoch": 0.56, + "grad_norm": 2.014579609195482, + "learning_rate": 4.229549036597289e-06, + "loss": 0.2693, + "step": 8949 + }, + { + "epoch": 0.56, + "grad_norm": 1.8115931725341627, + "learning_rate": 4.228542739957293e-06, + "loss": 0.2751, + "step": 8950 + }, + { + "epoch": 0.56, + "grad_norm": 3.1619926271356884, + "learning_rate": 4.227536475326431e-06, + "loss": 0.3196, + "step": 8951 + }, + { + "epoch": 0.56, + "grad_norm": 4.148882830322034, + "learning_rate": 4.226530242746454e-06, + "loss": 0.2791, + "step": 8952 + }, + { + "epoch": 0.56, + "grad_norm": 2.0251760402930987, + "learning_rate": 4.2255240422591084e-06, + "loss": 0.2858, + "step": 8953 + }, + { + "epoch": 0.56, + "grad_norm": 2.6610577453796638, + "learning_rate": 4.224517873906149e-06, + "loss": 0.2654, + "step": 8954 + }, + { + "epoch": 0.56, + "grad_norm": 2.2941427565915737, + "learning_rate": 4.223511737729317e-06, + "loss": 0.2887, + "step": 8955 + }, + { + "epoch": 0.56, + "grad_norm": 2.6778714793567895, + "learning_rate": 4.222505633770365e-06, + "loss": 0.3064, + "step": 8956 + }, + { + "epoch": 0.56, + "grad_norm": 2.4397769365284927, + "learning_rate": 4.221499562071034e-06, + "loss": 0.2769, + "step": 8957 + }, + { + "epoch": 0.56, + "grad_norm": 2.5081654100224524, + "learning_rate": 4.220493522673067e-06, + "loss": 0.2824, + "step": 8958 + }, + { + "epoch": 0.56, + "grad_norm": 1.6314409939815784, + "learning_rate": 4.219487515618211e-06, + "loss": 0.28, + "step": 8959 + }, + { + "epoch": 0.56, + "grad_norm": 5.368558570660478, + "learning_rate": 4.218481540948204e-06, + "loss": 0.2683, + "step": 8960 + }, + { + "epoch": 0.56, + "grad_norm": 2.9065182706097263, + "learning_rate": 4.217475598704785e-06, + "loss": 0.2981, + "step": 8961 + }, + { + "epoch": 0.56, + "grad_norm": 2.1749750023851813, + "learning_rate": 4.216469688929695e-06, + "loss": 0.2622, + "step": 8962 + }, + { + "epoch": 0.56, + "grad_norm": 2.770772405111811, + "learning_rate": 4.215463811664668e-06, + "loss": 0.3086, + "step": 8963 + }, + { + "epoch": 0.56, + "grad_norm": 2.412456722464413, + "learning_rate": 4.214457966951442e-06, + "loss": 0.2791, + "step": 8964 + }, + { + "epoch": 0.56, + "grad_norm": 1.9763561165106647, + "learning_rate": 4.213452154831747e-06, + "loss": 0.256, + "step": 8965 + }, + { + "epoch": 0.56, + "grad_norm": 2.39744250713294, + "learning_rate": 4.212446375347322e-06, + "loss": 0.2825, + "step": 8966 + }, + { + "epoch": 0.56, + "grad_norm": 2.4996357824472555, + "learning_rate": 4.211440628539896e-06, + "loss": 0.2927, + "step": 8967 + }, + { + "epoch": 0.56, + "grad_norm": 3.857622541435515, + "learning_rate": 4.210434914451199e-06, + "loss": 0.2789, + "step": 8968 + }, + { + "epoch": 0.56, + "grad_norm": 2.0531193896832303, + "learning_rate": 4.209429233122958e-06, + "loss": 0.2887, + "step": 8969 + }, + { + "epoch": 0.56, + "grad_norm": 2.7214215778903053, + "learning_rate": 4.208423584596904e-06, + "loss": 0.2895, + "step": 8970 + }, + { + "epoch": 0.56, + "grad_norm": 3.1945404384958525, + "learning_rate": 4.2074179689147605e-06, + "loss": 0.2825, + "step": 8971 + }, + { + "epoch": 0.56, + "grad_norm": 2.990575182429611, + "learning_rate": 4.206412386118255e-06, + "loss": 0.2554, + "step": 8972 + }, + { + "epoch": 0.56, + "grad_norm": 3.0597585020530214, + "learning_rate": 4.205406836249106e-06, + "loss": 0.2827, + "step": 8973 + }, + { + "epoch": 0.56, + "grad_norm": 2.3306231040741285, + "learning_rate": 4.20440131934904e-06, + "loss": 0.2652, + "step": 8974 + }, + { + "epoch": 0.56, + "grad_norm": 2.1147726140867236, + "learning_rate": 4.203395835459778e-06, + "loss": 0.2653, + "step": 8975 + }, + { + "epoch": 0.56, + "grad_norm": 1.7720739081897614, + "learning_rate": 4.202390384623037e-06, + "loss": 0.2642, + "step": 8976 + }, + { + "epoch": 0.56, + "grad_norm": 3.163967357881253, + "learning_rate": 4.201384966880535e-06, + "loss": 0.2689, + "step": 8977 + }, + { + "epoch": 0.56, + "grad_norm": 2.0608159511849995, + "learning_rate": 4.200379582273991e-06, + "loss": 0.2697, + "step": 8978 + }, + { + "epoch": 0.56, + "grad_norm": 1.8158680199937984, + "learning_rate": 4.199374230845116e-06, + "loss": 0.2654, + "step": 8979 + }, + { + "epoch": 0.56, + "grad_norm": 1.9732864716133736, + "learning_rate": 4.198368912635626e-06, + "loss": 0.2733, + "step": 8980 + }, + { + "epoch": 0.56, + "grad_norm": 4.561783066553702, + "learning_rate": 4.197363627687234e-06, + "loss": 0.2807, + "step": 8981 + }, + { + "epoch": 0.56, + "grad_norm": 8.314744656264878, + "learning_rate": 4.196358376041653e-06, + "loss": 0.273, + "step": 8982 + }, + { + "epoch": 0.56, + "grad_norm": 4.240795509974399, + "learning_rate": 4.195353157740589e-06, + "loss": 0.2938, + "step": 8983 + }, + { + "epoch": 0.56, + "grad_norm": 2.195059031410783, + "learning_rate": 4.194347972825752e-06, + "loss": 0.3006, + "step": 8984 + }, + { + "epoch": 0.57, + "grad_norm": 2.17132524491701, + "learning_rate": 4.193342821338848e-06, + "loss": 0.2773, + "step": 8985 + }, + { + "epoch": 0.57, + "grad_norm": 6.7391202719302985, + "learning_rate": 4.192337703321584e-06, + "loss": 0.2733, + "step": 8986 + }, + { + "epoch": 0.57, + "grad_norm": 2.0077722985303383, + "learning_rate": 4.191332618815663e-06, + "loss": 0.2546, + "step": 8987 + }, + { + "epoch": 0.57, + "grad_norm": 2.1119538811705834, + "learning_rate": 4.190327567862786e-06, + "loss": 0.2733, + "step": 8988 + }, + { + "epoch": 0.57, + "grad_norm": 2.54282550770554, + "learning_rate": 4.189322550504658e-06, + "loss": 0.2759, + "step": 8989 + }, + { + "epoch": 0.57, + "grad_norm": 2.1324939689176095, + "learning_rate": 4.1883175667829775e-06, + "loss": 0.2736, + "step": 8990 + }, + { + "epoch": 0.57, + "grad_norm": 1.9892797161565217, + "learning_rate": 4.1873126167394425e-06, + "loss": 0.2995, + "step": 8991 + }, + { + "epoch": 0.57, + "grad_norm": 2.969412900225151, + "learning_rate": 4.186307700415752e-06, + "loss": 0.2571, + "step": 8992 + }, + { + "epoch": 0.57, + "grad_norm": 1.8208932198518593, + "learning_rate": 4.185302817853599e-06, + "loss": 0.2683, + "step": 8993 + }, + { + "epoch": 0.57, + "grad_norm": 11.344225976821004, + "learning_rate": 4.18429796909468e-06, + "loss": 0.2904, + "step": 8994 + }, + { + "epoch": 0.57, + "grad_norm": 3.1287523203770173, + "learning_rate": 4.183293154180684e-06, + "loss": 0.2871, + "step": 8995 + }, + { + "epoch": 0.57, + "grad_norm": 2.5447129794893724, + "learning_rate": 4.1822883731533085e-06, + "loss": 0.2702, + "step": 8996 + }, + { + "epoch": 0.57, + "grad_norm": 2.6363423048591406, + "learning_rate": 4.18128362605424e-06, + "loss": 0.2783, + "step": 8997 + }, + { + "epoch": 0.57, + "grad_norm": 3.5618377162236095, + "learning_rate": 4.180278912925169e-06, + "loss": 0.29, + "step": 8998 + }, + { + "epoch": 0.57, + "grad_norm": 1.874233588117033, + "learning_rate": 4.1792742338077806e-06, + "loss": 0.2697, + "step": 8999 + }, + { + "epoch": 0.57, + "grad_norm": 2.1873105727580326, + "learning_rate": 4.178269588743764e-06, + "loss": 0.2781, + "step": 9000 + }, + { + "epoch": 0.57, + "grad_norm": 1.5649838974752368, + "learning_rate": 4.1772649777748e-06, + "loss": 0.2714, + "step": 9001 + }, + { + "epoch": 0.57, + "grad_norm": 3.177294078163326, + "learning_rate": 4.1762604009425745e-06, + "loss": 0.258, + "step": 9002 + }, + { + "epoch": 0.57, + "grad_norm": 2.1328497363526147, + "learning_rate": 4.175255858288765e-06, + "loss": 0.2743, + "step": 9003 + }, + { + "epoch": 0.57, + "grad_norm": 2.170079281837565, + "learning_rate": 4.174251349855058e-06, + "loss": 0.2749, + "step": 9004 + }, + { + "epoch": 0.57, + "grad_norm": 1.715033219634509, + "learning_rate": 4.173246875683128e-06, + "loss": 0.2712, + "step": 9005 + }, + { + "epoch": 0.57, + "grad_norm": 3.0388177094530606, + "learning_rate": 4.1722424358146555e-06, + "loss": 0.2861, + "step": 9006 + }, + { + "epoch": 0.57, + "grad_norm": 3.4158034859742052, + "learning_rate": 4.1712380302913125e-06, + "loss": 0.2651, + "step": 9007 + }, + { + "epoch": 0.57, + "grad_norm": 5.455353466151032, + "learning_rate": 4.170233659154777e-06, + "loss": 0.2708, + "step": 9008 + }, + { + "epoch": 0.57, + "grad_norm": 2.0803466282254988, + "learning_rate": 4.169229322446719e-06, + "loss": 0.2818, + "step": 9009 + }, + { + "epoch": 0.57, + "grad_norm": 7.300872631067211, + "learning_rate": 4.168225020208813e-06, + "loss": 0.2863, + "step": 9010 + }, + { + "epoch": 0.57, + "grad_norm": 1.9927010428964396, + "learning_rate": 4.167220752482728e-06, + "loss": 0.2686, + "step": 9011 + }, + { + "epoch": 0.57, + "grad_norm": 1.8583405483567006, + "learning_rate": 4.166216519310134e-06, + "loss": 0.2753, + "step": 9012 + }, + { + "epoch": 0.57, + "grad_norm": 2.1441962938758565, + "learning_rate": 4.165212320732696e-06, + "loss": 0.2592, + "step": 9013 + }, + { + "epoch": 0.57, + "grad_norm": 2.7073128293033326, + "learning_rate": 4.1642081567920845e-06, + "loss": 0.2812, + "step": 9014 + }, + { + "epoch": 0.57, + "grad_norm": 0.6457581470100427, + "learning_rate": 4.163204027529959e-06, + "loss": 0.4653, + "step": 9015 + }, + { + "epoch": 0.57, + "grad_norm": 2.6065210477113765, + "learning_rate": 4.162199932987986e-06, + "loss": 0.2681, + "step": 9016 + }, + { + "epoch": 0.57, + "grad_norm": 2.4496132064690843, + "learning_rate": 4.161195873207824e-06, + "loss": 0.2724, + "step": 9017 + }, + { + "epoch": 0.57, + "grad_norm": 1.954544716150455, + "learning_rate": 4.1601918482311355e-06, + "loss": 0.2716, + "step": 9018 + }, + { + "epoch": 0.57, + "grad_norm": 1.8725483274683463, + "learning_rate": 4.159187858099579e-06, + "loss": 0.276, + "step": 9019 + }, + { + "epoch": 0.57, + "grad_norm": 1.2842182803767546, + "learning_rate": 4.158183902854813e-06, + "loss": 0.2446, + "step": 9020 + }, + { + "epoch": 0.57, + "grad_norm": 1.8445995790033862, + "learning_rate": 4.15717998253849e-06, + "loss": 0.2841, + "step": 9021 + }, + { + "epoch": 0.57, + "grad_norm": 1.8235877124634015, + "learning_rate": 4.156176097192269e-06, + "loss": 0.2764, + "step": 9022 + }, + { + "epoch": 0.57, + "grad_norm": 2.319956144085122, + "learning_rate": 4.1551722468577995e-06, + "loss": 0.2723, + "step": 9023 + }, + { + "epoch": 0.57, + "grad_norm": 1.9463835209956808, + "learning_rate": 4.154168431576734e-06, + "loss": 0.2898, + "step": 9024 + }, + { + "epoch": 0.57, + "grad_norm": 2.209764728812881, + "learning_rate": 4.153164651390721e-06, + "loss": 0.2845, + "step": 9025 + }, + { + "epoch": 0.57, + "grad_norm": 5.926766489150122, + "learning_rate": 4.152160906341413e-06, + "loss": 0.2861, + "step": 9026 + }, + { + "epoch": 0.57, + "grad_norm": 2.843391699472927, + "learning_rate": 4.151157196470454e-06, + "loss": 0.2769, + "step": 9027 + }, + { + "epoch": 0.57, + "grad_norm": 1.6687440898243957, + "learning_rate": 4.150153521819491e-06, + "loss": 0.2732, + "step": 9028 + }, + { + "epoch": 0.57, + "grad_norm": 2.4487680916028647, + "learning_rate": 4.149149882430168e-06, + "loss": 0.3057, + "step": 9029 + }, + { + "epoch": 0.57, + "grad_norm": 1.5382904606927403, + "learning_rate": 4.148146278344128e-06, + "loss": 0.2713, + "step": 9030 + }, + { + "epoch": 0.57, + "grad_norm": 1.5606882320486197, + "learning_rate": 4.147142709603011e-06, + "loss": 0.27, + "step": 9031 + }, + { + "epoch": 0.57, + "grad_norm": 3.189897193920214, + "learning_rate": 4.1461391762484574e-06, + "loss": 0.2991, + "step": 9032 + }, + { + "epoch": 0.57, + "grad_norm": 2.827595602428548, + "learning_rate": 4.145135678322106e-06, + "loss": 0.2787, + "step": 9033 + }, + { + "epoch": 0.57, + "grad_norm": 4.112619212290249, + "learning_rate": 4.144132215865595e-06, + "loss": 0.2929, + "step": 9034 + }, + { + "epoch": 0.57, + "grad_norm": 6.160940061450189, + "learning_rate": 4.143128788920558e-06, + "loss": 0.2894, + "step": 9035 + }, + { + "epoch": 0.57, + "grad_norm": 4.106577037100918, + "learning_rate": 4.14212539752863e-06, + "loss": 0.2768, + "step": 9036 + }, + { + "epoch": 0.57, + "grad_norm": 1.7121411615911037, + "learning_rate": 4.141122041731443e-06, + "loss": 0.3053, + "step": 9037 + }, + { + "epoch": 0.57, + "grad_norm": 6.595180974080103, + "learning_rate": 4.140118721570628e-06, + "loss": 0.2968, + "step": 9038 + }, + { + "epoch": 0.57, + "grad_norm": 2.7088594743653682, + "learning_rate": 4.139115437087814e-06, + "loss": 0.2978, + "step": 9039 + }, + { + "epoch": 0.57, + "grad_norm": 2.6442278626820217, + "learning_rate": 4.138112188324629e-06, + "loss": 0.2948, + "step": 9040 + }, + { + "epoch": 0.57, + "grad_norm": 2.3567313026743775, + "learning_rate": 4.1371089753227e-06, + "loss": 0.2952, + "step": 9041 + }, + { + "epoch": 0.57, + "grad_norm": 1.5945756403391027, + "learning_rate": 4.136105798123654e-06, + "loss": 0.2747, + "step": 9042 + }, + { + "epoch": 0.57, + "grad_norm": 2.150254403840772, + "learning_rate": 4.135102656769112e-06, + "loss": 0.2576, + "step": 9043 + }, + { + "epoch": 0.57, + "grad_norm": 2.429389980405473, + "learning_rate": 4.134099551300698e-06, + "loss": 0.2616, + "step": 9044 + }, + { + "epoch": 0.57, + "grad_norm": 2.0269162189147427, + "learning_rate": 4.1330964817600305e-06, + "loss": 0.285, + "step": 9045 + }, + { + "epoch": 0.57, + "grad_norm": 5.05733529819904, + "learning_rate": 4.1320934481887305e-06, + "loss": 0.3027, + "step": 9046 + }, + { + "epoch": 0.57, + "grad_norm": 2.051632157570671, + "learning_rate": 4.131090450628413e-06, + "loss": 0.3013, + "step": 9047 + }, + { + "epoch": 0.57, + "grad_norm": 2.121106128712773, + "learning_rate": 4.1300874891206974e-06, + "loss": 0.2817, + "step": 9048 + }, + { + "epoch": 0.57, + "grad_norm": 4.394807891657507, + "learning_rate": 4.129084563707197e-06, + "loss": 0.293, + "step": 9049 + }, + { + "epoch": 0.57, + "grad_norm": 4.530018270277474, + "learning_rate": 4.128081674429526e-06, + "loss": 0.2618, + "step": 9050 + }, + { + "epoch": 0.57, + "grad_norm": 2.044056314133268, + "learning_rate": 4.127078821329294e-06, + "loss": 0.2705, + "step": 9051 + }, + { + "epoch": 0.57, + "grad_norm": 6.244917727649454, + "learning_rate": 4.126076004448113e-06, + "loss": 0.2619, + "step": 9052 + }, + { + "epoch": 0.57, + "grad_norm": 5.312164056316612, + "learning_rate": 4.125073223827591e-06, + "loss": 0.2734, + "step": 9053 + }, + { + "epoch": 0.57, + "grad_norm": 3.7450869136011407, + "learning_rate": 4.124070479509334e-06, + "loss": 0.2841, + "step": 9054 + }, + { + "epoch": 0.57, + "grad_norm": 1.820645042683543, + "learning_rate": 4.1230677715349475e-06, + "loss": 0.2722, + "step": 9055 + }, + { + "epoch": 0.57, + "grad_norm": 2.8573115699327634, + "learning_rate": 4.122065099946038e-06, + "loss": 0.2865, + "step": 9056 + }, + { + "epoch": 0.57, + "grad_norm": 3.7987228881782893, + "learning_rate": 4.121062464784206e-06, + "loss": 0.2854, + "step": 9057 + }, + { + "epoch": 0.57, + "grad_norm": 2.2720463921823826, + "learning_rate": 4.120059866091056e-06, + "loss": 0.2716, + "step": 9058 + }, + { + "epoch": 0.57, + "grad_norm": 2.864948724792566, + "learning_rate": 4.119057303908183e-06, + "loss": 0.2764, + "step": 9059 + }, + { + "epoch": 0.57, + "grad_norm": 2.9189110251778523, + "learning_rate": 4.118054778277189e-06, + "loss": 0.2754, + "step": 9060 + }, + { + "epoch": 0.57, + "grad_norm": 4.1701046556276715, + "learning_rate": 4.117052289239667e-06, + "loss": 0.2539, + "step": 9061 + }, + { + "epoch": 0.57, + "grad_norm": 2.3569895924012774, + "learning_rate": 4.1160498368372124e-06, + "loss": 0.279, + "step": 9062 + }, + { + "epoch": 0.57, + "grad_norm": 2.6604809832646694, + "learning_rate": 4.1150474211114225e-06, + "loss": 0.2853, + "step": 9063 + }, + { + "epoch": 0.57, + "grad_norm": 3.0861832642577056, + "learning_rate": 4.1140450421038865e-06, + "loss": 0.2757, + "step": 9064 + }, + { + "epoch": 0.57, + "grad_norm": 13.451331767900175, + "learning_rate": 4.113042699856195e-06, + "loss": 0.2689, + "step": 9065 + }, + { + "epoch": 0.57, + "grad_norm": 2.133384723763996, + "learning_rate": 4.112040394409939e-06, + "loss": 0.2833, + "step": 9066 + }, + { + "epoch": 0.57, + "grad_norm": 2.403540064432925, + "learning_rate": 4.1110381258067026e-06, + "loss": 0.2886, + "step": 9067 + }, + { + "epoch": 0.57, + "grad_norm": 2.025493214522345, + "learning_rate": 4.110035894088074e-06, + "loss": 0.2818, + "step": 9068 + }, + { + "epoch": 0.57, + "grad_norm": 2.3290467463766698, + "learning_rate": 4.109033699295637e-06, + "loss": 0.2886, + "step": 9069 + }, + { + "epoch": 0.57, + "grad_norm": 4.0095072490111425, + "learning_rate": 4.108031541470972e-06, + "loss": 0.2735, + "step": 9070 + }, + { + "epoch": 0.57, + "grad_norm": 2.362202204453788, + "learning_rate": 4.107029420655664e-06, + "loss": 0.2803, + "step": 9071 + }, + { + "epoch": 0.57, + "grad_norm": 2.0550589270341977, + "learning_rate": 4.106027336891293e-06, + "loss": 0.2646, + "step": 9072 + }, + { + "epoch": 0.57, + "grad_norm": 3.8612014518138986, + "learning_rate": 4.105025290219435e-06, + "loss": 0.2814, + "step": 9073 + }, + { + "epoch": 0.57, + "grad_norm": 2.388373069164007, + "learning_rate": 4.104023280681667e-06, + "loss": 0.2834, + "step": 9074 + }, + { + "epoch": 0.57, + "grad_norm": 8.504939264725376, + "learning_rate": 4.1030213083195645e-06, + "loss": 0.3156, + "step": 9075 + }, + { + "epoch": 0.57, + "grad_norm": 2.2076874841158785, + "learning_rate": 4.102019373174702e-06, + "loss": 0.2874, + "step": 9076 + }, + { + "epoch": 0.57, + "grad_norm": 1.9629721043413078, + "learning_rate": 4.101017475288648e-06, + "loss": 0.2761, + "step": 9077 + }, + { + "epoch": 0.57, + "grad_norm": 3.2157956782011876, + "learning_rate": 4.100015614702977e-06, + "loss": 0.2773, + "step": 9078 + }, + { + "epoch": 0.57, + "grad_norm": 3.229573127715457, + "learning_rate": 4.099013791459258e-06, + "loss": 0.2923, + "step": 9079 + }, + { + "epoch": 0.57, + "grad_norm": 9.148341882612355, + "learning_rate": 4.098012005599056e-06, + "loss": 0.2728, + "step": 9080 + }, + { + "epoch": 0.57, + "grad_norm": 3.6428228018447255, + "learning_rate": 4.097010257163938e-06, + "loss": 0.2649, + "step": 9081 + }, + { + "epoch": 0.57, + "grad_norm": 2.395846389958749, + "learning_rate": 4.09600854619547e-06, + "loss": 0.2696, + "step": 9082 + }, + { + "epoch": 0.57, + "grad_norm": 4.852686782804362, + "learning_rate": 4.095006872735211e-06, + "loss": 0.2809, + "step": 9083 + }, + { + "epoch": 0.57, + "grad_norm": 13.909755305140324, + "learning_rate": 4.094005236824726e-06, + "loss": 0.2701, + "step": 9084 + }, + { + "epoch": 0.57, + "grad_norm": 3.5149630657118824, + "learning_rate": 4.093003638505571e-06, + "loss": 0.2874, + "step": 9085 + }, + { + "epoch": 0.57, + "grad_norm": 1.9605236579980447, + "learning_rate": 4.092002077819307e-06, + "loss": 0.2682, + "step": 9086 + }, + { + "epoch": 0.57, + "grad_norm": 2.133333854294839, + "learning_rate": 4.09100055480749e-06, + "loss": 0.2695, + "step": 9087 + }, + { + "epoch": 0.57, + "grad_norm": 2.173200404535774, + "learning_rate": 4.0899990695116745e-06, + "loss": 0.2763, + "step": 9088 + }, + { + "epoch": 0.57, + "grad_norm": 0.6469218776810701, + "learning_rate": 4.088997621973413e-06, + "loss": 0.5043, + "step": 9089 + }, + { + "epoch": 0.57, + "grad_norm": 1.6861730144194709, + "learning_rate": 4.0879962122342596e-06, + "loss": 0.2667, + "step": 9090 + }, + { + "epoch": 0.57, + "grad_norm": 3.750872360073634, + "learning_rate": 4.086994840335763e-06, + "loss": 0.2781, + "step": 9091 + }, + { + "epoch": 0.57, + "grad_norm": 0.5900024528158422, + "learning_rate": 4.08599350631947e-06, + "loss": 0.4712, + "step": 9092 + }, + { + "epoch": 0.57, + "grad_norm": 1.9922181100792156, + "learning_rate": 4.084992210226932e-06, + "loss": 0.2779, + "step": 9093 + }, + { + "epoch": 0.57, + "grad_norm": 3.8149982471392105, + "learning_rate": 4.083990952099692e-06, + "loss": 0.2883, + "step": 9094 + }, + { + "epoch": 0.57, + "grad_norm": 3.60490040605363, + "learning_rate": 4.0829897319792944e-06, + "loss": 0.2722, + "step": 9095 + }, + { + "epoch": 0.57, + "grad_norm": 8.097376451613655, + "learning_rate": 4.081988549907282e-06, + "loss": 0.2839, + "step": 9096 + }, + { + "epoch": 0.57, + "grad_norm": 2.5063950607141985, + "learning_rate": 4.080987405925195e-06, + "loss": 0.2883, + "step": 9097 + }, + { + "epoch": 0.57, + "grad_norm": 2.3265190953944352, + "learning_rate": 4.079986300074573e-06, + "loss": 0.2712, + "step": 9098 + }, + { + "epoch": 0.57, + "grad_norm": 3.057986856956302, + "learning_rate": 4.078985232396953e-06, + "loss": 0.2662, + "step": 9099 + }, + { + "epoch": 0.57, + "grad_norm": 2.2865199414983657, + "learning_rate": 4.0779842029338714e-06, + "loss": 0.3091, + "step": 9100 + }, + { + "epoch": 0.57, + "grad_norm": 3.199288830026781, + "learning_rate": 4.076983211726863e-06, + "loss": 0.2693, + "step": 9101 + }, + { + "epoch": 0.57, + "grad_norm": 3.625226987848647, + "learning_rate": 4.075982258817462e-06, + "loss": 0.2732, + "step": 9102 + }, + { + "epoch": 0.57, + "grad_norm": 2.3269084514484146, + "learning_rate": 4.074981344247197e-06, + "loss": 0.2831, + "step": 9103 + }, + { + "epoch": 0.57, + "grad_norm": 2.2655176838129396, + "learning_rate": 4.0739804680576e-06, + "loss": 0.2731, + "step": 9104 + }, + { + "epoch": 0.57, + "grad_norm": 2.8790560167473216, + "learning_rate": 4.0729796302901975e-06, + "loss": 0.2747, + "step": 9105 + }, + { + "epoch": 0.57, + "grad_norm": 1.7285640356748337, + "learning_rate": 4.071978830986518e-06, + "loss": 0.2942, + "step": 9106 + }, + { + "epoch": 0.57, + "grad_norm": 2.2212964584503663, + "learning_rate": 4.070978070188083e-06, + "loss": 0.3006, + "step": 9107 + }, + { + "epoch": 0.57, + "grad_norm": 2.8213856857007094, + "learning_rate": 4.069977347936418e-06, + "loss": 0.285, + "step": 9108 + }, + { + "epoch": 0.57, + "grad_norm": 2.794564427043924, + "learning_rate": 4.068976664273046e-06, + "loss": 0.309, + "step": 9109 + }, + { + "epoch": 0.57, + "grad_norm": 0.6854725343272413, + "learning_rate": 4.067976019239486e-06, + "loss": 0.4747, + "step": 9110 + }, + { + "epoch": 0.57, + "grad_norm": 2.2039729196478537, + "learning_rate": 4.0669754128772554e-06, + "loss": 0.2753, + "step": 9111 + }, + { + "epoch": 0.57, + "grad_norm": 4.479916467364978, + "learning_rate": 4.065974845227874e-06, + "loss": 0.2869, + "step": 9112 + }, + { + "epoch": 0.57, + "grad_norm": 1.750741494660911, + "learning_rate": 4.064974316332854e-06, + "loss": 0.2864, + "step": 9113 + }, + { + "epoch": 0.57, + "grad_norm": 1.8052909941077098, + "learning_rate": 4.063973826233708e-06, + "loss": 0.2689, + "step": 9114 + }, + { + "epoch": 0.57, + "grad_norm": 1.745078312464633, + "learning_rate": 4.062973374971954e-06, + "loss": 0.2915, + "step": 9115 + }, + { + "epoch": 0.57, + "grad_norm": 3.8686242934885526, + "learning_rate": 4.061972962589098e-06, + "loss": 0.2714, + "step": 9116 + }, + { + "epoch": 0.57, + "grad_norm": 2.269804397596038, + "learning_rate": 4.06097258912665e-06, + "loss": 0.281, + "step": 9117 + }, + { + "epoch": 0.57, + "grad_norm": 2.988428007549461, + "learning_rate": 4.059972254626118e-06, + "loss": 0.2704, + "step": 9118 + }, + { + "epoch": 0.57, + "grad_norm": 2.45296716345786, + "learning_rate": 4.058971959129006e-06, + "loss": 0.2776, + "step": 9119 + }, + { + "epoch": 0.57, + "grad_norm": 2.297886432980261, + "learning_rate": 4.057971702676819e-06, + "loss": 0.2805, + "step": 9120 + }, + { + "epoch": 0.57, + "grad_norm": 3.579428080254274, + "learning_rate": 4.0569714853110585e-06, + "loss": 0.275, + "step": 9121 + }, + { + "epoch": 0.57, + "grad_norm": 19.50183963770546, + "learning_rate": 4.055971307073225e-06, + "loss": 0.2918, + "step": 9122 + }, + { + "epoch": 0.57, + "grad_norm": 2.102321279641808, + "learning_rate": 4.054971168004822e-06, + "loss": 0.2697, + "step": 9123 + }, + { + "epoch": 0.57, + "grad_norm": 2.8339897951859103, + "learning_rate": 4.0539710681473415e-06, + "loss": 0.2822, + "step": 9124 + }, + { + "epoch": 0.57, + "grad_norm": 6.964936061859733, + "learning_rate": 4.052971007542283e-06, + "loss": 0.2614, + "step": 9125 + }, + { + "epoch": 0.57, + "grad_norm": 1.9912289138458814, + "learning_rate": 4.051970986231139e-06, + "loss": 0.2736, + "step": 9126 + }, + { + "epoch": 0.57, + "grad_norm": 1.8981902565682565, + "learning_rate": 4.0509710042554026e-06, + "loss": 0.285, + "step": 9127 + }, + { + "epoch": 0.57, + "grad_norm": 0.654667301972239, + "learning_rate": 4.049971061656566e-06, + "loss": 0.4594, + "step": 9128 + }, + { + "epoch": 0.57, + "grad_norm": 3.034644345762902, + "learning_rate": 4.048971158476114e-06, + "loss": 0.2729, + "step": 9129 + }, + { + "epoch": 0.57, + "grad_norm": 3.933527205490952, + "learning_rate": 4.04797129475554e-06, + "loss": 0.2709, + "step": 9130 + }, + { + "epoch": 0.57, + "grad_norm": 2.2155708788882147, + "learning_rate": 4.04697147053633e-06, + "loss": 0.2634, + "step": 9131 + }, + { + "epoch": 0.57, + "grad_norm": 2.733420587729311, + "learning_rate": 4.045971685859964e-06, + "loss": 0.2842, + "step": 9132 + }, + { + "epoch": 0.57, + "grad_norm": 2.216164533715891, + "learning_rate": 4.044971940767927e-06, + "loss": 0.2582, + "step": 9133 + }, + { + "epoch": 0.57, + "grad_norm": 11.635077684144465, + "learning_rate": 4.043972235301703e-06, + "loss": 0.2646, + "step": 9134 + }, + { + "epoch": 0.57, + "grad_norm": 2.5178197638888022, + "learning_rate": 4.042972569502767e-06, + "loss": 0.2963, + "step": 9135 + }, + { + "epoch": 0.57, + "grad_norm": 1.9129191601519608, + "learning_rate": 4.0419729434126e-06, + "loss": 0.2809, + "step": 9136 + }, + { + "epoch": 0.57, + "grad_norm": 4.08700108767703, + "learning_rate": 4.040973357072676e-06, + "loss": 0.286, + "step": 9137 + }, + { + "epoch": 0.57, + "grad_norm": 3.037377534653133, + "learning_rate": 4.039973810524471e-06, + "loss": 0.2813, + "step": 9138 + }, + { + "epoch": 0.57, + "grad_norm": 1.9294021769241192, + "learning_rate": 4.038974303809459e-06, + "loss": 0.2758, + "step": 9139 + }, + { + "epoch": 0.57, + "grad_norm": 38.69368353543663, + "learning_rate": 4.03797483696911e-06, + "loss": 0.2897, + "step": 9140 + }, + { + "epoch": 0.57, + "grad_norm": 4.955541783425685, + "learning_rate": 4.036975410044892e-06, + "loss": 0.2692, + "step": 9141 + }, + { + "epoch": 0.57, + "grad_norm": 2.7956800707173444, + "learning_rate": 4.035976023078278e-06, + "loss": 0.2683, + "step": 9142 + }, + { + "epoch": 0.57, + "grad_norm": 2.4126732889509968, + "learning_rate": 4.0349766761107275e-06, + "loss": 0.2791, + "step": 9143 + }, + { + "epoch": 0.58, + "grad_norm": 1.8318435302789828, + "learning_rate": 4.033977369183709e-06, + "loss": 0.2669, + "step": 9144 + }, + { + "epoch": 0.58, + "grad_norm": 2.4843744083775534, + "learning_rate": 4.032978102338688e-06, + "loss": 0.2585, + "step": 9145 + }, + { + "epoch": 0.58, + "grad_norm": 10.755507177658057, + "learning_rate": 4.031978875617122e-06, + "loss": 0.2679, + "step": 9146 + }, + { + "epoch": 0.58, + "grad_norm": 2.047013529731842, + "learning_rate": 4.030979689060471e-06, + "loss": 0.2694, + "step": 9147 + }, + { + "epoch": 0.58, + "grad_norm": 6.280461486830635, + "learning_rate": 4.029980542710196e-06, + "loss": 0.2728, + "step": 9148 + }, + { + "epoch": 0.58, + "grad_norm": 3.7130861819098677, + "learning_rate": 4.028981436607749e-06, + "loss": 0.2845, + "step": 9149 + }, + { + "epoch": 0.58, + "grad_norm": 2.5905654057925287, + "learning_rate": 4.0279823707945885e-06, + "loss": 0.259, + "step": 9150 + }, + { + "epoch": 0.58, + "grad_norm": 4.786063709105976, + "learning_rate": 4.0269833453121644e-06, + "loss": 0.2669, + "step": 9151 + }, + { + "epoch": 0.58, + "grad_norm": 2.093826153277464, + "learning_rate": 4.025984360201929e-06, + "loss": 0.2681, + "step": 9152 + }, + { + "epoch": 0.58, + "grad_norm": 3.129957657546236, + "learning_rate": 4.0249854155053345e-06, + "loss": 0.2589, + "step": 9153 + }, + { + "epoch": 0.58, + "grad_norm": 1.694219205540417, + "learning_rate": 4.023986511263827e-06, + "loss": 0.2757, + "step": 9154 + }, + { + "epoch": 0.58, + "grad_norm": 5.1687010584432, + "learning_rate": 4.022987647518851e-06, + "loss": 0.2802, + "step": 9155 + }, + { + "epoch": 0.58, + "grad_norm": 2.6205551709536556, + "learning_rate": 4.0219888243118546e-06, + "loss": 0.282, + "step": 9156 + }, + { + "epoch": 0.58, + "grad_norm": 4.042505089343204, + "learning_rate": 4.020990041684278e-06, + "loss": 0.2599, + "step": 9157 + }, + { + "epoch": 0.58, + "grad_norm": 3.9101776779715314, + "learning_rate": 4.019991299677565e-06, + "loss": 0.2828, + "step": 9158 + }, + { + "epoch": 0.58, + "grad_norm": 2.17251905671189, + "learning_rate": 4.018992598333151e-06, + "loss": 0.2864, + "step": 9159 + }, + { + "epoch": 0.58, + "grad_norm": 1.9085476506756567, + "learning_rate": 4.017993937692478e-06, + "loss": 0.2713, + "step": 9160 + }, + { + "epoch": 0.58, + "grad_norm": 3.4533500041412646, + "learning_rate": 4.0169953177969814e-06, + "loss": 0.28, + "step": 9161 + }, + { + "epoch": 0.58, + "grad_norm": 2.684952679191809, + "learning_rate": 4.015996738688094e-06, + "loss": 0.2866, + "step": 9162 + }, + { + "epoch": 0.58, + "grad_norm": 1.9267543954730049, + "learning_rate": 4.01499820040725e-06, + "loss": 0.2744, + "step": 9163 + }, + { + "epoch": 0.58, + "grad_norm": 2.418685357774127, + "learning_rate": 4.013999702995881e-06, + "loss": 0.2793, + "step": 9164 + }, + { + "epoch": 0.58, + "grad_norm": 3.914806246349283, + "learning_rate": 4.013001246495415e-06, + "loss": 0.2716, + "step": 9165 + }, + { + "epoch": 0.58, + "grad_norm": 2.0052424753735694, + "learning_rate": 4.012002830947281e-06, + "loss": 0.2658, + "step": 9166 + }, + { + "epoch": 0.58, + "grad_norm": 2.88961432657455, + "learning_rate": 4.011004456392903e-06, + "loss": 0.2607, + "step": 9167 + }, + { + "epoch": 0.58, + "grad_norm": 2.295553631017359, + "learning_rate": 4.010006122873707e-06, + "loss": 0.2821, + "step": 9168 + }, + { + "epoch": 0.58, + "grad_norm": 4.235881630326167, + "learning_rate": 4.009007830431118e-06, + "loss": 0.2823, + "step": 9169 + }, + { + "epoch": 0.58, + "grad_norm": 0.672443743563525, + "learning_rate": 4.008009579106551e-06, + "loss": 0.487, + "step": 9170 + }, + { + "epoch": 0.58, + "grad_norm": 5.006177691577419, + "learning_rate": 4.007011368941429e-06, + "loss": 0.27, + "step": 9171 + }, + { + "epoch": 0.58, + "grad_norm": 9.546157449135116, + "learning_rate": 4.0060131999771715e-06, + "loss": 0.3116, + "step": 9172 + }, + { + "epoch": 0.58, + "grad_norm": 2.827833423456838, + "learning_rate": 4.00501507225519e-06, + "loss": 0.2665, + "step": 9173 + }, + { + "epoch": 0.58, + "grad_norm": 3.168560296788205, + "learning_rate": 4.0040169858169e-06, + "loss": 0.2701, + "step": 9174 + }, + { + "epoch": 0.58, + "grad_norm": 4.7020082264023735, + "learning_rate": 4.003018940703716e-06, + "loss": 0.2705, + "step": 9175 + }, + { + "epoch": 0.58, + "grad_norm": 5.167975200895409, + "learning_rate": 4.002020936957045e-06, + "loss": 0.2627, + "step": 9176 + }, + { + "epoch": 0.58, + "grad_norm": 2.5360734183114646, + "learning_rate": 4.0010229746183e-06, + "loss": 0.2587, + "step": 9177 + }, + { + "epoch": 0.58, + "grad_norm": 3.23888902376058, + "learning_rate": 4.0000250537288845e-06, + "loss": 0.2696, + "step": 9178 + }, + { + "epoch": 0.58, + "grad_norm": 4.088208684446045, + "learning_rate": 3.999027174330206e-06, + "loss": 0.2776, + "step": 9179 + }, + { + "epoch": 0.58, + "grad_norm": 1.856449926681893, + "learning_rate": 3.9980293364636694e-06, + "loss": 0.2619, + "step": 9180 + }, + { + "epoch": 0.58, + "grad_norm": 5.48506973500211, + "learning_rate": 3.9970315401706726e-06, + "loss": 0.2817, + "step": 9181 + }, + { + "epoch": 0.58, + "grad_norm": 2.0070379975397, + "learning_rate": 3.99603378549262e-06, + "loss": 0.2701, + "step": 9182 + }, + { + "epoch": 0.58, + "grad_norm": 5.975709514802917, + "learning_rate": 3.99503607247091e-06, + "loss": 0.3074, + "step": 9183 + }, + { + "epoch": 0.58, + "grad_norm": 2.600647208977965, + "learning_rate": 3.994038401146937e-06, + "loss": 0.2801, + "step": 9184 + }, + { + "epoch": 0.58, + "grad_norm": 2.6339462822329724, + "learning_rate": 3.993040771562098e-06, + "loss": 0.2858, + "step": 9185 + }, + { + "epoch": 0.58, + "grad_norm": 3.7211227449044615, + "learning_rate": 3.9920431837577876e-06, + "loss": 0.2728, + "step": 9186 + }, + { + "epoch": 0.58, + "grad_norm": 2.1177502246771835, + "learning_rate": 3.991045637775393e-06, + "loss": 0.2637, + "step": 9187 + }, + { + "epoch": 0.58, + "grad_norm": 0.6273683184959713, + "learning_rate": 3.99004813365631e-06, + "loss": 0.5095, + "step": 9188 + }, + { + "epoch": 0.58, + "grad_norm": 4.10203118302687, + "learning_rate": 3.98905067144192e-06, + "loss": 0.2816, + "step": 9189 + }, + { + "epoch": 0.58, + "grad_norm": 8.98073069918242, + "learning_rate": 3.988053251173615e-06, + "loss": 0.267, + "step": 9190 + }, + { + "epoch": 0.58, + "grad_norm": 2.354093564094669, + "learning_rate": 3.987055872892779e-06, + "loss": 0.2778, + "step": 9191 + }, + { + "epoch": 0.58, + "grad_norm": 2.5464327724788087, + "learning_rate": 3.986058536640793e-06, + "loss": 0.256, + "step": 9192 + }, + { + "epoch": 0.58, + "grad_norm": 2.2718249165470352, + "learning_rate": 3.985061242459039e-06, + "loss": 0.284, + "step": 9193 + }, + { + "epoch": 0.58, + "grad_norm": 5.600607028959871, + "learning_rate": 3.984063990388899e-06, + "loss": 0.2738, + "step": 9194 + }, + { + "epoch": 0.58, + "grad_norm": 2.969615068838298, + "learning_rate": 3.983066780471747e-06, + "loss": 0.2857, + "step": 9195 + }, + { + "epoch": 0.58, + "grad_norm": 2.4830170588299847, + "learning_rate": 3.982069612748959e-06, + "loss": 0.2752, + "step": 9196 + }, + { + "epoch": 0.58, + "grad_norm": 2.40807304756567, + "learning_rate": 3.981072487261913e-06, + "loss": 0.2702, + "step": 9197 + }, + { + "epoch": 0.58, + "grad_norm": 5.8419269627637975, + "learning_rate": 3.9800754040519785e-06, + "loss": 0.2587, + "step": 9198 + }, + { + "epoch": 0.58, + "grad_norm": 3.357213022948169, + "learning_rate": 3.979078363160528e-06, + "loss": 0.2775, + "step": 9199 + }, + { + "epoch": 0.58, + "grad_norm": 3.630534764748263, + "learning_rate": 3.9780813646289286e-06, + "loss": 0.2736, + "step": 9200 + }, + { + "epoch": 0.58, + "grad_norm": 2.526560502368286, + "learning_rate": 3.977084408498549e-06, + "loss": 0.271, + "step": 9201 + }, + { + "epoch": 0.58, + "grad_norm": 0.5819987109373376, + "learning_rate": 3.976087494810754e-06, + "loss": 0.4673, + "step": 9202 + }, + { + "epoch": 0.58, + "grad_norm": 4.79083350753169, + "learning_rate": 3.975090623606907e-06, + "loss": 0.2933, + "step": 9203 + }, + { + "epoch": 0.58, + "grad_norm": 2.6885841543703464, + "learning_rate": 3.97409379492837e-06, + "loss": 0.2806, + "step": 9204 + }, + { + "epoch": 0.58, + "grad_norm": 7.983305670568188, + "learning_rate": 3.973097008816505e-06, + "loss": 0.2739, + "step": 9205 + }, + { + "epoch": 0.58, + "grad_norm": 0.6457106968133617, + "learning_rate": 3.972100265312669e-06, + "loss": 0.5008, + "step": 9206 + }, + { + "epoch": 0.58, + "grad_norm": 2.7651344050525344, + "learning_rate": 3.971103564458219e-06, + "loss": 0.2857, + "step": 9207 + }, + { + "epoch": 0.58, + "grad_norm": 3.8530805703622715, + "learning_rate": 3.970106906294509e-06, + "loss": 0.2693, + "step": 9208 + }, + { + "epoch": 0.58, + "grad_norm": 4.628739316517356, + "learning_rate": 3.9691102908628925e-06, + "loss": 0.2895, + "step": 9209 + }, + { + "epoch": 0.58, + "grad_norm": 2.30876149085146, + "learning_rate": 3.968113718204722e-06, + "loss": 0.2686, + "step": 9210 + }, + { + "epoch": 0.58, + "grad_norm": 3.652158089528851, + "learning_rate": 3.967117188361345e-06, + "loss": 0.2991, + "step": 9211 + }, + { + "epoch": 0.58, + "grad_norm": 14.236986397916008, + "learning_rate": 3.96612070137411e-06, + "loss": 0.2507, + "step": 9212 + }, + { + "epoch": 0.58, + "grad_norm": 3.993317824882817, + "learning_rate": 3.965124257284366e-06, + "loss": 0.2902, + "step": 9213 + }, + { + "epoch": 0.58, + "grad_norm": 2.9551932726352708, + "learning_rate": 3.964127856133453e-06, + "loss": 0.2962, + "step": 9214 + }, + { + "epoch": 0.58, + "grad_norm": 2.4245063545873173, + "learning_rate": 3.963131497962715e-06, + "loss": 0.2711, + "step": 9215 + }, + { + "epoch": 0.58, + "grad_norm": 4.503509197929178, + "learning_rate": 3.9621351828134935e-06, + "loss": 0.2891, + "step": 9216 + }, + { + "epoch": 0.58, + "grad_norm": 3.3175082169082923, + "learning_rate": 3.961138910727126e-06, + "loss": 0.2628, + "step": 9217 + }, + { + "epoch": 0.58, + "grad_norm": 4.206590509360579, + "learning_rate": 3.960142681744952e-06, + "loss": 0.3006, + "step": 9218 + }, + { + "epoch": 0.58, + "grad_norm": 1.976329787919778, + "learning_rate": 3.959146495908303e-06, + "loss": 0.2751, + "step": 9219 + }, + { + "epoch": 0.58, + "grad_norm": 3.659175816800913, + "learning_rate": 3.958150353258515e-06, + "loss": 0.2952, + "step": 9220 + }, + { + "epoch": 0.58, + "grad_norm": 1.8089002270216088, + "learning_rate": 3.957154253836921e-06, + "loss": 0.2695, + "step": 9221 + }, + { + "epoch": 0.58, + "grad_norm": 2.261169418230604, + "learning_rate": 3.9561581976848475e-06, + "loss": 0.2722, + "step": 9222 + }, + { + "epoch": 0.58, + "grad_norm": 2.8436599195476457, + "learning_rate": 3.955162184843625e-06, + "loss": 0.2913, + "step": 9223 + }, + { + "epoch": 0.58, + "grad_norm": 1.7849197509321244, + "learning_rate": 3.95416621535458e-06, + "loss": 0.2746, + "step": 9224 + }, + { + "epoch": 0.58, + "grad_norm": 4.4996914745383165, + "learning_rate": 3.953170289259036e-06, + "loss": 0.275, + "step": 9225 + }, + { + "epoch": 0.58, + "grad_norm": 2.2381976452551235, + "learning_rate": 3.952174406598314e-06, + "loss": 0.2761, + "step": 9226 + }, + { + "epoch": 0.58, + "grad_norm": 4.158901164094756, + "learning_rate": 3.951178567413739e-06, + "loss": 0.2662, + "step": 9227 + }, + { + "epoch": 0.58, + "grad_norm": 3.369904440501148, + "learning_rate": 3.950182771746629e-06, + "loss": 0.2514, + "step": 9228 + }, + { + "epoch": 0.58, + "grad_norm": 3.123030064883653, + "learning_rate": 3.9491870196383e-06, + "loss": 0.2937, + "step": 9229 + }, + { + "epoch": 0.58, + "grad_norm": 2.6122154605217522, + "learning_rate": 3.948191311130067e-06, + "loss": 0.2537, + "step": 9230 + }, + { + "epoch": 0.58, + "grad_norm": 3.5841996804823477, + "learning_rate": 3.947195646263246e-06, + "loss": 0.2647, + "step": 9231 + }, + { + "epoch": 0.58, + "grad_norm": 4.3264531672810795, + "learning_rate": 3.946200025079147e-06, + "loss": 0.2782, + "step": 9232 + }, + { + "epoch": 0.58, + "grad_norm": 2.549684787107598, + "learning_rate": 3.945204447619081e-06, + "loss": 0.2824, + "step": 9233 + }, + { + "epoch": 0.58, + "grad_norm": 4.459502477500608, + "learning_rate": 3.944208913924354e-06, + "loss": 0.2699, + "step": 9234 + }, + { + "epoch": 0.58, + "grad_norm": 0.6854850466208346, + "learning_rate": 3.943213424036277e-06, + "loss": 0.4983, + "step": 9235 + }, + { + "epoch": 0.58, + "grad_norm": 3.7813734130762415, + "learning_rate": 3.942217977996151e-06, + "loss": 0.2781, + "step": 9236 + }, + { + "epoch": 0.58, + "grad_norm": 21.838722174618145, + "learning_rate": 3.94122257584528e-06, + "loss": 0.2824, + "step": 9237 + }, + { + "epoch": 0.58, + "grad_norm": 1.6116462264193507, + "learning_rate": 3.940227217624965e-06, + "loss": 0.2923, + "step": 9238 + }, + { + "epoch": 0.58, + "grad_norm": 1.8344541824735696, + "learning_rate": 3.939231903376505e-06, + "loss": 0.2806, + "step": 9239 + }, + { + "epoch": 0.58, + "grad_norm": 3.9031682065374627, + "learning_rate": 3.938236633141199e-06, + "loss": 0.265, + "step": 9240 + }, + { + "epoch": 0.58, + "grad_norm": 3.1751783097786794, + "learning_rate": 3.937241406960338e-06, + "loss": 0.2911, + "step": 9241 + }, + { + "epoch": 0.58, + "grad_norm": 10.212821549681234, + "learning_rate": 3.93624622487522e-06, + "loss": 0.2713, + "step": 9242 + }, + { + "epoch": 0.58, + "grad_norm": 4.7039019833786515, + "learning_rate": 3.935251086927137e-06, + "loss": 0.2872, + "step": 9243 + }, + { + "epoch": 0.58, + "grad_norm": 4.078187430760622, + "learning_rate": 3.934255993157375e-06, + "loss": 0.2821, + "step": 9244 + }, + { + "epoch": 0.58, + "grad_norm": 3.336290319319591, + "learning_rate": 3.933260943607228e-06, + "loss": 0.2729, + "step": 9245 + }, + { + "epoch": 0.58, + "grad_norm": 2.4912092543409914, + "learning_rate": 3.932265938317977e-06, + "loss": 0.2828, + "step": 9246 + }, + { + "epoch": 0.58, + "grad_norm": 0.6318907624468713, + "learning_rate": 3.931270977330909e-06, + "loss": 0.4828, + "step": 9247 + }, + { + "epoch": 0.58, + "grad_norm": 1.7379207239142243, + "learning_rate": 3.930276060687307e-06, + "loss": 0.2641, + "step": 9248 + }, + { + "epoch": 0.58, + "grad_norm": 1.6218894435067417, + "learning_rate": 3.92928118842845e-06, + "loss": 0.2833, + "step": 9249 + }, + { + "epoch": 0.58, + "grad_norm": 0.601040136110174, + "learning_rate": 3.928286360595619e-06, + "loss": 0.5165, + "step": 9250 + }, + { + "epoch": 0.58, + "grad_norm": 2.1536563858684312, + "learning_rate": 3.927291577230091e-06, + "loss": 0.2702, + "step": 9251 + }, + { + "epoch": 0.58, + "grad_norm": 2.8247434711763884, + "learning_rate": 3.926296838373141e-06, + "loss": 0.2659, + "step": 9252 + }, + { + "epoch": 0.58, + "grad_norm": 2.1103878092146062, + "learning_rate": 3.925302144066042e-06, + "loss": 0.2779, + "step": 9253 + }, + { + "epoch": 0.58, + "grad_norm": 8.466824691038825, + "learning_rate": 3.924307494350066e-06, + "loss": 0.2753, + "step": 9254 + }, + { + "epoch": 0.58, + "grad_norm": 1.8396376075892753, + "learning_rate": 3.9233128892664815e-06, + "loss": 0.2634, + "step": 9255 + }, + { + "epoch": 0.58, + "grad_norm": 5.258611616343802, + "learning_rate": 3.922318328856557e-06, + "loss": 0.2826, + "step": 9256 + }, + { + "epoch": 0.58, + "grad_norm": 1.947238415015567, + "learning_rate": 3.921323813161562e-06, + "loss": 0.2762, + "step": 9257 + }, + { + "epoch": 0.58, + "grad_norm": 3.0129693418965022, + "learning_rate": 3.9203293422227564e-06, + "loss": 0.2792, + "step": 9258 + }, + { + "epoch": 0.58, + "grad_norm": 1.8433568163588185, + "learning_rate": 3.919334916081406e-06, + "loss": 0.2579, + "step": 9259 + }, + { + "epoch": 0.58, + "grad_norm": 2.605504807618717, + "learning_rate": 3.918340534778767e-06, + "loss": 0.3022, + "step": 9260 + }, + { + "epoch": 0.58, + "grad_norm": 1.9318041140657272, + "learning_rate": 3.917346198356103e-06, + "loss": 0.2713, + "step": 9261 + }, + { + "epoch": 0.58, + "grad_norm": 3.643841981750636, + "learning_rate": 3.916351906854665e-06, + "loss": 0.2656, + "step": 9262 + }, + { + "epoch": 0.58, + "grad_norm": 1.9256949886880956, + "learning_rate": 3.915357660315712e-06, + "loss": 0.2691, + "step": 9263 + }, + { + "epoch": 0.58, + "grad_norm": 1.9563109630509625, + "learning_rate": 3.914363458780496e-06, + "loss": 0.2651, + "step": 9264 + }, + { + "epoch": 0.58, + "grad_norm": 2.4508005363252714, + "learning_rate": 3.913369302290271e-06, + "loss": 0.2775, + "step": 9265 + }, + { + "epoch": 0.58, + "grad_norm": 3.5971326424201586, + "learning_rate": 3.912375190886281e-06, + "loss": 0.2648, + "step": 9266 + }, + { + "epoch": 0.58, + "grad_norm": 1.675208419701591, + "learning_rate": 3.911381124609778e-06, + "loss": 0.2537, + "step": 9267 + }, + { + "epoch": 0.58, + "grad_norm": 3.2601122342457307, + "learning_rate": 3.9103871035020044e-06, + "loss": 0.29, + "step": 9268 + }, + { + "epoch": 0.58, + "grad_norm": 2.2851296786488087, + "learning_rate": 3.909393127604206e-06, + "loss": 0.2794, + "step": 9269 + }, + { + "epoch": 0.58, + "grad_norm": 2.941750832709662, + "learning_rate": 3.908399196957625e-06, + "loss": 0.2805, + "step": 9270 + }, + { + "epoch": 0.58, + "grad_norm": 2.407309898368578, + "learning_rate": 3.907405311603497e-06, + "loss": 0.2876, + "step": 9271 + }, + { + "epoch": 0.58, + "grad_norm": 2.078145251398732, + "learning_rate": 3.906411471583065e-06, + "loss": 0.2695, + "step": 9272 + }, + { + "epoch": 0.58, + "grad_norm": 2.8576362429051776, + "learning_rate": 3.905417676937564e-06, + "loss": 0.2726, + "step": 9273 + }, + { + "epoch": 0.58, + "grad_norm": 3.2375411146914583, + "learning_rate": 3.9044239277082275e-06, + "loss": 0.2649, + "step": 9274 + }, + { + "epoch": 0.58, + "grad_norm": 1.5901883542414676, + "learning_rate": 3.903430223936289e-06, + "loss": 0.2771, + "step": 9275 + }, + { + "epoch": 0.58, + "grad_norm": 2.386079394630121, + "learning_rate": 3.9024365656629774e-06, + "loss": 0.2847, + "step": 9276 + }, + { + "epoch": 0.58, + "grad_norm": 2.1223109386297265, + "learning_rate": 3.901442952929522e-06, + "loss": 0.2957, + "step": 9277 + }, + { + "epoch": 0.58, + "grad_norm": 1.6301339216888115, + "learning_rate": 3.900449385777148e-06, + "loss": 0.2757, + "step": 9278 + }, + { + "epoch": 0.58, + "grad_norm": 1.331999589760647, + "learning_rate": 3.899455864247085e-06, + "loss": 0.2602, + "step": 9279 + }, + { + "epoch": 0.58, + "grad_norm": 2.0459651310011617, + "learning_rate": 3.898462388380551e-06, + "loss": 0.2835, + "step": 9280 + }, + { + "epoch": 0.58, + "grad_norm": 1.6533536651957998, + "learning_rate": 3.897468958218771e-06, + "loss": 0.2727, + "step": 9281 + }, + { + "epoch": 0.58, + "grad_norm": 2.462847176750664, + "learning_rate": 3.896475573802961e-06, + "loss": 0.2798, + "step": 9282 + }, + { + "epoch": 0.58, + "grad_norm": 2.5149493904701377, + "learning_rate": 3.895482235174341e-06, + "loss": 0.2849, + "step": 9283 + }, + { + "epoch": 0.58, + "grad_norm": 6.296923975090486, + "learning_rate": 3.894488942374123e-06, + "loss": 0.257, + "step": 9284 + }, + { + "epoch": 0.58, + "grad_norm": 2.256151768772217, + "learning_rate": 3.893495695443522e-06, + "loss": 0.2639, + "step": 9285 + }, + { + "epoch": 0.58, + "grad_norm": 7.313014087149545, + "learning_rate": 3.89250249442375e-06, + "loss": 0.278, + "step": 9286 + }, + { + "epoch": 0.58, + "grad_norm": 1.540986611801791, + "learning_rate": 3.891509339356018e-06, + "loss": 0.2769, + "step": 9287 + }, + { + "epoch": 0.58, + "grad_norm": 1.7000881897142213, + "learning_rate": 3.890516230281532e-06, + "loss": 0.2718, + "step": 9288 + }, + { + "epoch": 0.58, + "grad_norm": 1.8600218578769911, + "learning_rate": 3.889523167241499e-06, + "loss": 0.2876, + "step": 9289 + }, + { + "epoch": 0.58, + "grad_norm": 1.6955977080804243, + "learning_rate": 3.888530150277121e-06, + "loss": 0.2753, + "step": 9290 + }, + { + "epoch": 0.58, + "grad_norm": 1.8290332204823654, + "learning_rate": 3.887537179429603e-06, + "loss": 0.2724, + "step": 9291 + }, + { + "epoch": 0.58, + "grad_norm": 2.2165372570674795, + "learning_rate": 3.886544254740141e-06, + "loss": 0.2516, + "step": 9292 + }, + { + "epoch": 0.58, + "grad_norm": 1.6392012056535392, + "learning_rate": 3.885551376249936e-06, + "loss": 0.2578, + "step": 9293 + }, + { + "epoch": 0.58, + "grad_norm": 4.667023314668314, + "learning_rate": 3.884558544000184e-06, + "loss": 0.2903, + "step": 9294 + }, + { + "epoch": 0.58, + "grad_norm": 7.238751676907104, + "learning_rate": 3.883565758032081e-06, + "loss": 0.2625, + "step": 9295 + }, + { + "epoch": 0.58, + "grad_norm": 2.480637919262386, + "learning_rate": 3.882573018386816e-06, + "loss": 0.2831, + "step": 9296 + }, + { + "epoch": 0.58, + "grad_norm": 4.551646106426363, + "learning_rate": 3.8815803251055826e-06, + "loss": 0.274, + "step": 9297 + }, + { + "epoch": 0.58, + "grad_norm": 1.5964922109046245, + "learning_rate": 3.880587678229567e-06, + "loss": 0.2902, + "step": 9298 + }, + { + "epoch": 0.58, + "grad_norm": 2.608368448008926, + "learning_rate": 3.879595077799958e-06, + "loss": 0.2754, + "step": 9299 + }, + { + "epoch": 0.58, + "grad_norm": 1.9146461555348528, + "learning_rate": 3.878602523857938e-06, + "loss": 0.27, + "step": 9300 + }, + { + "epoch": 0.58, + "grad_norm": 1.861027720405857, + "learning_rate": 3.8776100164446905e-06, + "loss": 0.2795, + "step": 9301 + }, + { + "epoch": 0.58, + "grad_norm": 2.5705217704095804, + "learning_rate": 3.876617555601398e-06, + "loss": 0.2834, + "step": 9302 + }, + { + "epoch": 0.59, + "grad_norm": 1.8548233970260224, + "learning_rate": 3.875625141369239e-06, + "loss": 0.2783, + "step": 9303 + }, + { + "epoch": 0.59, + "grad_norm": 3.3720798077685035, + "learning_rate": 3.874632773789389e-06, + "loss": 0.2749, + "step": 9304 + }, + { + "epoch": 0.59, + "grad_norm": 2.3129696441353427, + "learning_rate": 3.8736404529030255e-06, + "loss": 0.2564, + "step": 9305 + }, + { + "epoch": 0.59, + "grad_norm": 2.6769039219375834, + "learning_rate": 3.87264817875132e-06, + "loss": 0.2874, + "step": 9306 + }, + { + "epoch": 0.59, + "grad_norm": 0.600291436389585, + "learning_rate": 3.871655951375443e-06, + "loss": 0.5002, + "step": 9307 + }, + { + "epoch": 0.59, + "grad_norm": 3.9061199034288974, + "learning_rate": 3.870663770816563e-06, + "loss": 0.2943, + "step": 9308 + }, + { + "epoch": 0.59, + "grad_norm": 2.855016061376826, + "learning_rate": 3.869671637115853e-06, + "loss": 0.2887, + "step": 9309 + }, + { + "epoch": 0.59, + "grad_norm": 1.925621228706962, + "learning_rate": 3.868679550314472e-06, + "loss": 0.2769, + "step": 9310 + }, + { + "epoch": 0.59, + "grad_norm": 3.4470475541665326, + "learning_rate": 3.867687510453587e-06, + "loss": 0.2895, + "step": 9311 + }, + { + "epoch": 0.59, + "grad_norm": 2.4803229483520712, + "learning_rate": 3.866695517574358e-06, + "loss": 0.2755, + "step": 9312 + }, + { + "epoch": 0.59, + "grad_norm": 2.0185927478834387, + "learning_rate": 3.865703571717946e-06, + "loss": 0.3111, + "step": 9313 + }, + { + "epoch": 0.59, + "grad_norm": 1.425685119607202, + "learning_rate": 3.864711672925506e-06, + "loss": 0.2812, + "step": 9314 + }, + { + "epoch": 0.59, + "grad_norm": 1.6278036288086817, + "learning_rate": 3.863719821238196e-06, + "loss": 0.2859, + "step": 9315 + }, + { + "epoch": 0.59, + "grad_norm": 6.178543980447864, + "learning_rate": 3.862728016697167e-06, + "loss": 0.2752, + "step": 9316 + }, + { + "epoch": 0.59, + "grad_norm": 2.0612224909353554, + "learning_rate": 3.8617362593435745e-06, + "loss": 0.2922, + "step": 9317 + }, + { + "epoch": 0.59, + "grad_norm": 1.5189909760652602, + "learning_rate": 3.860744549218566e-06, + "loss": 0.2694, + "step": 9318 + }, + { + "epoch": 0.59, + "grad_norm": 4.038894841179102, + "learning_rate": 3.85975288636329e-06, + "loss": 0.2928, + "step": 9319 + }, + { + "epoch": 0.59, + "grad_norm": 2.6361349039783613, + "learning_rate": 3.858761270818892e-06, + "loss": 0.2979, + "step": 9320 + }, + { + "epoch": 0.59, + "grad_norm": 2.7192020723611874, + "learning_rate": 3.857769702626516e-06, + "loss": 0.3063, + "step": 9321 + }, + { + "epoch": 0.59, + "grad_norm": 4.736338572412326, + "learning_rate": 3.8567781818273034e-06, + "loss": 0.285, + "step": 9322 + }, + { + "epoch": 0.59, + "grad_norm": 1.5403393409699893, + "learning_rate": 3.855786708462394e-06, + "loss": 0.2597, + "step": 9323 + }, + { + "epoch": 0.59, + "grad_norm": 5.53307827225536, + "learning_rate": 3.854795282572926e-06, + "loss": 0.2653, + "step": 9324 + }, + { + "epoch": 0.59, + "grad_norm": 1.7944944551201483, + "learning_rate": 3.853803904200039e-06, + "loss": 0.253, + "step": 9325 + }, + { + "epoch": 0.59, + "grad_norm": 1.586030407716853, + "learning_rate": 3.852812573384861e-06, + "loss": 0.2652, + "step": 9326 + }, + { + "epoch": 0.59, + "grad_norm": 2.6646869134092412, + "learning_rate": 3.851821290168528e-06, + "loss": 0.2666, + "step": 9327 + }, + { + "epoch": 0.59, + "grad_norm": 2.4260893578703877, + "learning_rate": 3.85083005459217e-06, + "loss": 0.2783, + "step": 9328 + }, + { + "epoch": 0.59, + "grad_norm": 3.8494398011020112, + "learning_rate": 3.8498388666969134e-06, + "loss": 0.2729, + "step": 9329 + }, + { + "epoch": 0.59, + "grad_norm": 2.1302486781491954, + "learning_rate": 3.848847726523885e-06, + "loss": 0.2606, + "step": 9330 + }, + { + "epoch": 0.59, + "grad_norm": 3.996773571331155, + "learning_rate": 3.847856634114207e-06, + "loss": 0.2671, + "step": 9331 + }, + { + "epoch": 0.59, + "grad_norm": 1.847318593175028, + "learning_rate": 3.846865589509006e-06, + "loss": 0.2897, + "step": 9332 + }, + { + "epoch": 0.59, + "grad_norm": 2.1254200614702987, + "learning_rate": 3.8458745927494e-06, + "loss": 0.275, + "step": 9333 + }, + { + "epoch": 0.59, + "grad_norm": 4.608875024556574, + "learning_rate": 3.844883643876507e-06, + "loss": 0.2774, + "step": 9334 + }, + { + "epoch": 0.59, + "grad_norm": 1.9053440485058792, + "learning_rate": 3.843892742931443e-06, + "loss": 0.2788, + "step": 9335 + }, + { + "epoch": 0.59, + "grad_norm": 3.9981418936468147, + "learning_rate": 3.842901889955322e-06, + "loss": 0.2995, + "step": 9336 + }, + { + "epoch": 0.59, + "grad_norm": 2.1693773742322167, + "learning_rate": 3.841911084989259e-06, + "loss": 0.2745, + "step": 9337 + }, + { + "epoch": 0.59, + "grad_norm": 2.9074893159153605, + "learning_rate": 3.840920328074358e-06, + "loss": 0.2681, + "step": 9338 + }, + { + "epoch": 0.59, + "grad_norm": 3.941526961208954, + "learning_rate": 3.839929619251734e-06, + "loss": 0.2647, + "step": 9339 + }, + { + "epoch": 0.59, + "grad_norm": 1.8735502187969488, + "learning_rate": 3.838938958562491e-06, + "loss": 0.2893, + "step": 9340 + }, + { + "epoch": 0.59, + "grad_norm": 1.9307053582966782, + "learning_rate": 3.837948346047733e-06, + "loss": 0.2689, + "step": 9341 + }, + { + "epoch": 0.59, + "grad_norm": 1.8708470553193515, + "learning_rate": 3.836957781748562e-06, + "loss": 0.2788, + "step": 9342 + }, + { + "epoch": 0.59, + "grad_norm": 3.1371577226420304, + "learning_rate": 3.835967265706078e-06, + "loss": 0.2703, + "step": 9343 + }, + { + "epoch": 0.59, + "grad_norm": 1.7927673673217885, + "learning_rate": 3.834976797961379e-06, + "loss": 0.2856, + "step": 9344 + }, + { + "epoch": 0.59, + "grad_norm": 1.538581634819629, + "learning_rate": 3.833986378555562e-06, + "loss": 0.2729, + "step": 9345 + }, + { + "epoch": 0.59, + "grad_norm": 5.25168465298363, + "learning_rate": 3.832996007529721e-06, + "loss": 0.2933, + "step": 9346 + }, + { + "epoch": 0.59, + "grad_norm": 3.675468828378678, + "learning_rate": 3.83200568492495e-06, + "loss": 0.2854, + "step": 9347 + }, + { + "epoch": 0.59, + "grad_norm": 3.805766266755283, + "learning_rate": 3.8310154107823375e-06, + "loss": 0.2628, + "step": 9348 + }, + { + "epoch": 0.59, + "grad_norm": 2.571832877111927, + "learning_rate": 3.8300251851429715e-06, + "loss": 0.2714, + "step": 9349 + }, + { + "epoch": 0.59, + "grad_norm": 2.098782017288707, + "learning_rate": 3.829035008047939e-06, + "loss": 0.2877, + "step": 9350 + }, + { + "epoch": 0.59, + "grad_norm": 2.2974646345997423, + "learning_rate": 3.8280448795383245e-06, + "loss": 0.267, + "step": 9351 + }, + { + "epoch": 0.59, + "grad_norm": 4.85193585986451, + "learning_rate": 3.827054799655207e-06, + "loss": 0.2624, + "step": 9352 + }, + { + "epoch": 0.59, + "grad_norm": 1.7888315975033644, + "learning_rate": 3.82606476843967e-06, + "loss": 0.2521, + "step": 9353 + }, + { + "epoch": 0.59, + "grad_norm": 1.4060659652934158, + "learning_rate": 3.825074785932792e-06, + "loss": 0.2576, + "step": 9354 + }, + { + "epoch": 0.59, + "grad_norm": 15.16639271451097, + "learning_rate": 3.8240848521756484e-06, + "loss": 0.2634, + "step": 9355 + }, + { + "epoch": 0.59, + "grad_norm": 2.4092035208211344, + "learning_rate": 3.823094967209312e-06, + "loss": 0.2721, + "step": 9356 + }, + { + "epoch": 0.59, + "grad_norm": 2.46751038974512, + "learning_rate": 3.822105131074857e-06, + "loss": 0.2893, + "step": 9357 + }, + { + "epoch": 0.59, + "grad_norm": 2.020572096130606, + "learning_rate": 3.8211153438133515e-06, + "loss": 0.2713, + "step": 9358 + }, + { + "epoch": 0.59, + "grad_norm": 1.5594628901051883, + "learning_rate": 3.820125605465864e-06, + "loss": 0.2877, + "step": 9359 + }, + { + "epoch": 0.59, + "grad_norm": 1.5046444553647635, + "learning_rate": 3.81913591607346e-06, + "loss": 0.2523, + "step": 9360 + }, + { + "epoch": 0.59, + "grad_norm": 1.8256666327324984, + "learning_rate": 3.8181462756772056e-06, + "loss": 0.2721, + "step": 9361 + }, + { + "epoch": 0.59, + "grad_norm": 8.853344318626228, + "learning_rate": 3.817156684318161e-06, + "loss": 0.2699, + "step": 9362 + }, + { + "epoch": 0.59, + "grad_norm": 1.6700246115584205, + "learning_rate": 3.816167142037388e-06, + "loss": 0.2871, + "step": 9363 + }, + { + "epoch": 0.59, + "grad_norm": 1.9158309584807425, + "learning_rate": 3.815177648875941e-06, + "loss": 0.2523, + "step": 9364 + }, + { + "epoch": 0.59, + "grad_norm": 12.271281314895718, + "learning_rate": 3.81418820487488e-06, + "loss": 0.2864, + "step": 9365 + }, + { + "epoch": 0.59, + "grad_norm": 0.6268130753348529, + "learning_rate": 3.813198810075255e-06, + "loss": 0.4973, + "step": 9366 + }, + { + "epoch": 0.59, + "grad_norm": 2.335047972564448, + "learning_rate": 3.8122094645181196e-06, + "loss": 0.2704, + "step": 9367 + }, + { + "epoch": 0.59, + "grad_norm": 9.566688826713257, + "learning_rate": 3.811220168244521e-06, + "loss": 0.2794, + "step": 9368 + }, + { + "epoch": 0.59, + "grad_norm": 3.3316070491854632, + "learning_rate": 3.8102309212955122e-06, + "loss": 0.271, + "step": 9369 + }, + { + "epoch": 0.59, + "grad_norm": 4.087907916226869, + "learning_rate": 3.809241723712135e-06, + "loss": 0.2845, + "step": 9370 + }, + { + "epoch": 0.59, + "grad_norm": 2.2158212205162338, + "learning_rate": 3.8082525755354346e-06, + "loss": 0.2862, + "step": 9371 + }, + { + "epoch": 0.59, + "grad_norm": 2.623133242692415, + "learning_rate": 3.80726347680645e-06, + "loss": 0.2795, + "step": 9372 + }, + { + "epoch": 0.59, + "grad_norm": 2.954505107845916, + "learning_rate": 3.8062744275662237e-06, + "loss": 0.2834, + "step": 9373 + }, + { + "epoch": 0.59, + "grad_norm": 2.1406731505733547, + "learning_rate": 3.8052854278557904e-06, + "loss": 0.2801, + "step": 9374 + }, + { + "epoch": 0.59, + "grad_norm": 3.781870275502023, + "learning_rate": 3.8042964777161862e-06, + "loss": 0.279, + "step": 9375 + }, + { + "epoch": 0.59, + "grad_norm": 3.0254021780680995, + "learning_rate": 3.8033075771884457e-06, + "loss": 0.2877, + "step": 9376 + }, + { + "epoch": 0.59, + "grad_norm": 2.2269111364961764, + "learning_rate": 3.8023187263136008e-06, + "loss": 0.2835, + "step": 9377 + }, + { + "epoch": 0.59, + "grad_norm": 0.5921458140843855, + "learning_rate": 3.8013299251326775e-06, + "loss": 0.4999, + "step": 9378 + }, + { + "epoch": 0.59, + "grad_norm": 3.241316463523188, + "learning_rate": 3.800341173686706e-06, + "loss": 0.2681, + "step": 9379 + }, + { + "epoch": 0.59, + "grad_norm": 2.1789631352634182, + "learning_rate": 3.79935247201671e-06, + "loss": 0.2764, + "step": 9380 + }, + { + "epoch": 0.59, + "grad_norm": 1.931271919058685, + "learning_rate": 3.7983638201637124e-06, + "loss": 0.2557, + "step": 9381 + }, + { + "epoch": 0.59, + "grad_norm": 7.297751134830079, + "learning_rate": 3.7973752181687336e-06, + "loss": 0.2736, + "step": 9382 + }, + { + "epoch": 0.59, + "grad_norm": 1.4650614200000651, + "learning_rate": 3.7963866660727918e-06, + "loss": 0.2677, + "step": 9383 + }, + { + "epoch": 0.59, + "grad_norm": 1.5135719897702828, + "learning_rate": 3.7953981639169063e-06, + "loss": 0.2601, + "step": 9384 + }, + { + "epoch": 0.59, + "grad_norm": 2.4514162525073404, + "learning_rate": 3.7944097117420913e-06, + "loss": 0.2797, + "step": 9385 + }, + { + "epoch": 0.59, + "grad_norm": 4.396225022896346, + "learning_rate": 3.793421309589358e-06, + "loss": 0.2727, + "step": 9386 + }, + { + "epoch": 0.59, + "grad_norm": 1.50773517879765, + "learning_rate": 3.792432957499719e-06, + "loss": 0.2707, + "step": 9387 + }, + { + "epoch": 0.59, + "grad_norm": 1.8732056382137225, + "learning_rate": 3.7914446555141805e-06, + "loss": 0.2551, + "step": 9388 + }, + { + "epoch": 0.59, + "grad_norm": 0.6133073243689083, + "learning_rate": 3.7904564036737502e-06, + "loss": 0.5252, + "step": 9389 + }, + { + "epoch": 0.59, + "grad_norm": 1.7486217173572725, + "learning_rate": 3.7894682020194296e-06, + "loss": 0.2639, + "step": 9390 + }, + { + "epoch": 0.59, + "grad_norm": 2.292660267779196, + "learning_rate": 3.788480050592226e-06, + "loss": 0.2578, + "step": 9391 + }, + { + "epoch": 0.59, + "grad_norm": 1.8630531458317638, + "learning_rate": 3.7874919494331363e-06, + "loss": 0.268, + "step": 9392 + }, + { + "epoch": 0.59, + "grad_norm": 2.87811553970909, + "learning_rate": 3.78650389858316e-06, + "loss": 0.2967, + "step": 9393 + }, + { + "epoch": 0.59, + "grad_norm": 1.5650994855817189, + "learning_rate": 3.7855158980832924e-06, + "loss": 0.2624, + "step": 9394 + }, + { + "epoch": 0.59, + "grad_norm": 1.4322746388884324, + "learning_rate": 3.7845279479745277e-06, + "loss": 0.2668, + "step": 9395 + }, + { + "epoch": 0.59, + "grad_norm": 2.359742918628945, + "learning_rate": 3.7835400482978565e-06, + "loss": 0.2761, + "step": 9396 + }, + { + "epoch": 0.59, + "grad_norm": 1.5529438969965241, + "learning_rate": 3.78255219909427e-06, + "loss": 0.2994, + "step": 9397 + }, + { + "epoch": 0.59, + "grad_norm": 5.203903238239566, + "learning_rate": 3.781564400404752e-06, + "loss": 0.2731, + "step": 9398 + }, + { + "epoch": 0.59, + "grad_norm": 3.66217503373457, + "learning_rate": 3.780576652270295e-06, + "loss": 0.2969, + "step": 9399 + }, + { + "epoch": 0.59, + "grad_norm": 2.2899445082648175, + "learning_rate": 3.7795889547318764e-06, + "loss": 0.2672, + "step": 9400 + }, + { + "epoch": 0.59, + "grad_norm": 21.069135078016973, + "learning_rate": 3.778601307830482e-06, + "loss": 0.2834, + "step": 9401 + }, + { + "epoch": 0.59, + "grad_norm": 1.6697055773900062, + "learning_rate": 3.777613711607087e-06, + "loss": 0.2886, + "step": 9402 + }, + { + "epoch": 0.59, + "grad_norm": 1.8849297878403781, + "learning_rate": 3.776626166102672e-06, + "loss": 0.2937, + "step": 9403 + }, + { + "epoch": 0.59, + "grad_norm": 4.233006061379948, + "learning_rate": 3.7756386713582086e-06, + "loss": 0.2789, + "step": 9404 + }, + { + "epoch": 0.59, + "grad_norm": 1.848638240209906, + "learning_rate": 3.7746512274146707e-06, + "loss": 0.2647, + "step": 9405 + }, + { + "epoch": 0.59, + "grad_norm": 1.6240572167892289, + "learning_rate": 3.773663834313031e-06, + "loss": 0.2654, + "step": 9406 + }, + { + "epoch": 0.59, + "grad_norm": 1.6588593622644934, + "learning_rate": 3.7726764920942584e-06, + "loss": 0.2684, + "step": 9407 + }, + { + "epoch": 0.59, + "grad_norm": 2.4118707001780626, + "learning_rate": 3.7716892007993166e-06, + "loss": 0.2892, + "step": 9408 + }, + { + "epoch": 0.59, + "grad_norm": 2.497149809190493, + "learning_rate": 3.7707019604691725e-06, + "loss": 0.254, + "step": 9409 + }, + { + "epoch": 0.59, + "grad_norm": 8.524310881558819, + "learning_rate": 3.7697147711447867e-06, + "loss": 0.2731, + "step": 9410 + }, + { + "epoch": 0.59, + "grad_norm": 1.8771781720957033, + "learning_rate": 3.7687276328671215e-06, + "loss": 0.2852, + "step": 9411 + }, + { + "epoch": 0.59, + "grad_norm": 6.266244121038863, + "learning_rate": 3.7677405456771325e-06, + "loss": 0.2612, + "step": 9412 + }, + { + "epoch": 0.59, + "grad_norm": 3.647449996299619, + "learning_rate": 3.766753509615776e-06, + "loss": 0.2778, + "step": 9413 + }, + { + "epoch": 0.59, + "grad_norm": 2.6143241562048347, + "learning_rate": 3.7657665247240083e-06, + "loss": 0.2749, + "step": 9414 + }, + { + "epoch": 0.59, + "grad_norm": 6.357830174106946, + "learning_rate": 3.76477959104278e-06, + "loss": 0.2874, + "step": 9415 + }, + { + "epoch": 0.59, + "grad_norm": 2.6185836244562273, + "learning_rate": 3.7637927086130398e-06, + "loss": 0.3081, + "step": 9416 + }, + { + "epoch": 0.59, + "grad_norm": 2.9563248475437076, + "learning_rate": 3.762805877475737e-06, + "loss": 0.2863, + "step": 9417 + }, + { + "epoch": 0.59, + "grad_norm": 1.730649850736178, + "learning_rate": 3.761819097671815e-06, + "loss": 0.2695, + "step": 9418 + }, + { + "epoch": 0.59, + "grad_norm": 2.1894437178596085, + "learning_rate": 3.760832369242219e-06, + "loss": 0.2765, + "step": 9419 + }, + { + "epoch": 0.59, + "grad_norm": 2.5623841114549686, + "learning_rate": 3.7598456922278855e-06, + "loss": 0.2895, + "step": 9420 + }, + { + "epoch": 0.59, + "grad_norm": 1.4984246170610744, + "learning_rate": 3.75885906666976e-06, + "loss": 0.2524, + "step": 9421 + }, + { + "epoch": 0.59, + "grad_norm": 1.983242143886795, + "learning_rate": 3.757872492608775e-06, + "loss": 0.2677, + "step": 9422 + }, + { + "epoch": 0.59, + "grad_norm": 1.8807899664735235, + "learning_rate": 3.7568859700858685e-06, + "loss": 0.292, + "step": 9423 + }, + { + "epoch": 0.59, + "grad_norm": 1.7306002817121755, + "learning_rate": 3.7558994991419694e-06, + "loss": 0.2591, + "step": 9424 + }, + { + "epoch": 0.59, + "grad_norm": 2.767129311082147, + "learning_rate": 3.754913079818012e-06, + "loss": 0.3087, + "step": 9425 + }, + { + "epoch": 0.59, + "grad_norm": 3.1372657968123443, + "learning_rate": 3.75392671215492e-06, + "loss": 0.2889, + "step": 9426 + }, + { + "epoch": 0.59, + "grad_norm": 1.8084331556717348, + "learning_rate": 3.7529403961936217e-06, + "loss": 0.2693, + "step": 9427 + }, + { + "epoch": 0.59, + "grad_norm": 2.3941833346339667, + "learning_rate": 3.751954131975042e-06, + "loss": 0.2737, + "step": 9428 + }, + { + "epoch": 0.59, + "grad_norm": 1.6733878253655239, + "learning_rate": 3.7509679195401033e-06, + "loss": 0.2736, + "step": 9429 + }, + { + "epoch": 0.59, + "grad_norm": 3.475572726616073, + "learning_rate": 3.749981758929724e-06, + "loss": 0.2622, + "step": 9430 + }, + { + "epoch": 0.59, + "grad_norm": 0.5976562643264937, + "learning_rate": 3.7489956501848214e-06, + "loss": 0.4603, + "step": 9431 + }, + { + "epoch": 0.59, + "grad_norm": 2.140521257382837, + "learning_rate": 3.748009593346311e-06, + "loss": 0.2665, + "step": 9432 + }, + { + "epoch": 0.59, + "grad_norm": 2.2425698579874407, + "learning_rate": 3.7470235884551077e-06, + "loss": 0.2951, + "step": 9433 + }, + { + "epoch": 0.59, + "grad_norm": 2.6330897447653205, + "learning_rate": 3.74603763555212e-06, + "loss": 0.2741, + "step": 9434 + }, + { + "epoch": 0.59, + "grad_norm": 1.7797622857168383, + "learning_rate": 3.7450517346782565e-06, + "loss": 0.2763, + "step": 9435 + }, + { + "epoch": 0.59, + "grad_norm": 2.9628850449714346, + "learning_rate": 3.7440658858744274e-06, + "loss": 0.2518, + "step": 9436 + }, + { + "epoch": 0.59, + "grad_norm": 5.0060470636898975, + "learning_rate": 3.7430800891815356e-06, + "loss": 0.2906, + "step": 9437 + }, + { + "epoch": 0.59, + "grad_norm": 1.486718453980096, + "learning_rate": 3.7420943446404835e-06, + "loss": 0.2733, + "step": 9438 + }, + { + "epoch": 0.59, + "grad_norm": 3.0856812294426126, + "learning_rate": 3.7411086522921714e-06, + "loss": 0.2784, + "step": 9439 + }, + { + "epoch": 0.59, + "grad_norm": 1.869901731710971, + "learning_rate": 3.7401230121774966e-06, + "loss": 0.2764, + "step": 9440 + }, + { + "epoch": 0.59, + "grad_norm": 4.112262487038523, + "learning_rate": 3.739137424337357e-06, + "loss": 0.2543, + "step": 9441 + }, + { + "epoch": 0.59, + "grad_norm": 8.635283562506698, + "learning_rate": 3.7381518888126434e-06, + "loss": 0.2939, + "step": 9442 + }, + { + "epoch": 0.59, + "grad_norm": 10.366542969840427, + "learning_rate": 3.73716640564425e-06, + "loss": 0.3017, + "step": 9443 + }, + { + "epoch": 0.59, + "grad_norm": 1.4567349436988268, + "learning_rate": 3.736180974873065e-06, + "loss": 0.2892, + "step": 9444 + }, + { + "epoch": 0.59, + "grad_norm": 7.916334200846063, + "learning_rate": 3.735195596539978e-06, + "loss": 0.2974, + "step": 9445 + }, + { + "epoch": 0.59, + "grad_norm": 1.306518722634645, + "learning_rate": 3.734210270685871e-06, + "loss": 0.2655, + "step": 9446 + }, + { + "epoch": 0.59, + "grad_norm": 2.316169047949291, + "learning_rate": 3.733224997351629e-06, + "loss": 0.2803, + "step": 9447 + }, + { + "epoch": 0.59, + "grad_norm": 4.0936376291264605, + "learning_rate": 3.7322397765781304e-06, + "loss": 0.2902, + "step": 9448 + }, + { + "epoch": 0.59, + "grad_norm": 5.853870498108596, + "learning_rate": 3.731254608406257e-06, + "loss": 0.2482, + "step": 9449 + }, + { + "epoch": 0.59, + "grad_norm": 5.200638253883915, + "learning_rate": 3.730269492876881e-06, + "loss": 0.2712, + "step": 9450 + }, + { + "epoch": 0.59, + "grad_norm": 1.4939312279906427, + "learning_rate": 3.7292844300308808e-06, + "loss": 0.2799, + "step": 9451 + }, + { + "epoch": 0.59, + "grad_norm": 2.0311724594938396, + "learning_rate": 3.728299419909126e-06, + "loss": 0.281, + "step": 9452 + }, + { + "epoch": 0.59, + "grad_norm": 2.0709075397930206, + "learning_rate": 3.7273144625524883e-06, + "loss": 0.2861, + "step": 9453 + }, + { + "epoch": 0.59, + "grad_norm": 1.3837120508819816, + "learning_rate": 3.7263295580018326e-06, + "loss": 0.2674, + "step": 9454 + }, + { + "epoch": 0.59, + "grad_norm": 2.591818620406059, + "learning_rate": 3.7253447062980276e-06, + "loss": 0.2651, + "step": 9455 + }, + { + "epoch": 0.59, + "grad_norm": 2.938159348620676, + "learning_rate": 3.724359907481933e-06, + "loss": 0.2679, + "step": 9456 + }, + { + "epoch": 0.59, + "grad_norm": 4.725483164376001, + "learning_rate": 3.7233751615944115e-06, + "loss": 0.2777, + "step": 9457 + }, + { + "epoch": 0.59, + "grad_norm": 3.838041008716286, + "learning_rate": 3.7223904686763222e-06, + "loss": 0.2736, + "step": 9458 + }, + { + "epoch": 0.59, + "grad_norm": 1.664735300992799, + "learning_rate": 3.7214058287685228e-06, + "loss": 0.2697, + "step": 9459 + }, + { + "epoch": 0.59, + "grad_norm": 6.254194483518903, + "learning_rate": 3.7204212419118656e-06, + "loss": 0.2801, + "step": 9460 + }, + { + "epoch": 0.59, + "grad_norm": 1.681588347728504, + "learning_rate": 3.719436708147206e-06, + "loss": 0.2703, + "step": 9461 + }, + { + "epoch": 0.6, + "grad_norm": 1.4553635322149021, + "learning_rate": 3.71845222751539e-06, + "loss": 0.2541, + "step": 9462 + }, + { + "epoch": 0.6, + "grad_norm": 1.7166927046328586, + "learning_rate": 3.7174678000572684e-06, + "loss": 0.2831, + "step": 9463 + }, + { + "epoch": 0.6, + "grad_norm": 3.008685742866287, + "learning_rate": 3.7164834258136847e-06, + "loss": 0.2794, + "step": 9464 + }, + { + "epoch": 0.6, + "grad_norm": 2.4774342221177545, + "learning_rate": 3.7154991048254823e-06, + "loss": 0.2678, + "step": 9465 + }, + { + "epoch": 0.6, + "grad_norm": 1.443197475780415, + "learning_rate": 3.7145148371335048e-06, + "loss": 0.2886, + "step": 9466 + }, + { + "epoch": 0.6, + "grad_norm": 2.467522895981137, + "learning_rate": 3.713530622778591e-06, + "loss": 0.2793, + "step": 9467 + }, + { + "epoch": 0.6, + "grad_norm": 1.8569011603306602, + "learning_rate": 3.712546461801576e-06, + "loss": 0.301, + "step": 9468 + }, + { + "epoch": 0.6, + "grad_norm": 2.245168060416518, + "learning_rate": 3.7115623542432956e-06, + "loss": 0.2734, + "step": 9469 + }, + { + "epoch": 0.6, + "grad_norm": 4.025178619022494, + "learning_rate": 3.7105783001445806e-06, + "loss": 0.2636, + "step": 9470 + }, + { + "epoch": 0.6, + "grad_norm": 1.2278134931293527, + "learning_rate": 3.7095942995462636e-06, + "loss": 0.2647, + "step": 9471 + }, + { + "epoch": 0.6, + "grad_norm": 2.1190219662361325, + "learning_rate": 3.7086103524891685e-06, + "loss": 0.2782, + "step": 9472 + }, + { + "epoch": 0.6, + "grad_norm": 2.2340393525549143, + "learning_rate": 3.7076264590141254e-06, + "loss": 0.2719, + "step": 9473 + }, + { + "epoch": 0.6, + "grad_norm": 1.9674372597394898, + "learning_rate": 3.706642619161955e-06, + "loss": 0.2818, + "step": 9474 + }, + { + "epoch": 0.6, + "grad_norm": 2.1223506864169996, + "learning_rate": 3.7056588329734824e-06, + "loss": 0.285, + "step": 9475 + }, + { + "epoch": 0.6, + "grad_norm": 2.1125003844713035, + "learning_rate": 3.704675100489521e-06, + "loss": 0.272, + "step": 9476 + }, + { + "epoch": 0.6, + "grad_norm": 4.984208235662652, + "learning_rate": 3.7036914217508925e-06, + "loss": 0.2669, + "step": 9477 + }, + { + "epoch": 0.6, + "grad_norm": 2.0854133214183492, + "learning_rate": 3.7027077967984082e-06, + "loss": 0.293, + "step": 9478 + }, + { + "epoch": 0.6, + "grad_norm": 1.723829858868617, + "learning_rate": 3.7017242256728823e-06, + "loss": 0.2886, + "step": 9479 + }, + { + "epoch": 0.6, + "grad_norm": 3.370612351249497, + "learning_rate": 3.7007407084151227e-06, + "loss": 0.2492, + "step": 9480 + }, + { + "epoch": 0.6, + "grad_norm": 1.740248805408432, + "learning_rate": 3.6997572450659393e-06, + "loss": 0.2811, + "step": 9481 + }, + { + "epoch": 0.6, + "grad_norm": 0.5811140238654917, + "learning_rate": 3.6987738356661377e-06, + "loss": 0.4914, + "step": 9482 + }, + { + "epoch": 0.6, + "grad_norm": 5.374049072434476, + "learning_rate": 3.6977904802565224e-06, + "loss": 0.2781, + "step": 9483 + }, + { + "epoch": 0.6, + "grad_norm": 1.8660932853686598, + "learning_rate": 3.6968071788778915e-06, + "loss": 0.2557, + "step": 9484 + }, + { + "epoch": 0.6, + "grad_norm": 1.9796606495350515, + "learning_rate": 3.6958239315710467e-06, + "loss": 0.2699, + "step": 9485 + }, + { + "epoch": 0.6, + "grad_norm": 4.372045014189948, + "learning_rate": 3.6948407383767814e-06, + "loss": 0.2952, + "step": 9486 + }, + { + "epoch": 0.6, + "grad_norm": 2.1563314601829284, + "learning_rate": 3.693857599335892e-06, + "loss": 0.2675, + "step": 9487 + }, + { + "epoch": 0.6, + "grad_norm": 2.778049127970359, + "learning_rate": 3.6928745144891733e-06, + "loss": 0.2735, + "step": 9488 + }, + { + "epoch": 0.6, + "grad_norm": 2.650237607707643, + "learning_rate": 3.6918914838774112e-06, + "loss": 0.2921, + "step": 9489 + }, + { + "epoch": 0.6, + "grad_norm": 3.5079599585202224, + "learning_rate": 3.6909085075413944e-06, + "loss": 0.2912, + "step": 9490 + }, + { + "epoch": 0.6, + "grad_norm": 2.219577893499021, + "learning_rate": 3.6899255855219103e-06, + "loss": 0.2749, + "step": 9491 + }, + { + "epoch": 0.6, + "grad_norm": 2.6408530900837763, + "learning_rate": 3.688942717859739e-06, + "loss": 0.2782, + "step": 9492 + }, + { + "epoch": 0.6, + "grad_norm": 1.8833484663984135, + "learning_rate": 3.687959904595665e-06, + "loss": 0.2658, + "step": 9493 + }, + { + "epoch": 0.6, + "grad_norm": 1.4750824970868708, + "learning_rate": 3.686977145770464e-06, + "loss": 0.2938, + "step": 9494 + }, + { + "epoch": 0.6, + "grad_norm": 1.5744418776824995, + "learning_rate": 3.6859944414249115e-06, + "loss": 0.2761, + "step": 9495 + }, + { + "epoch": 0.6, + "grad_norm": 3.3620626512794933, + "learning_rate": 3.685011791599786e-06, + "loss": 0.2546, + "step": 9496 + }, + { + "epoch": 0.6, + "grad_norm": 2.194208138257682, + "learning_rate": 3.6840291963358564e-06, + "loss": 0.2784, + "step": 9497 + }, + { + "epoch": 0.6, + "grad_norm": 3.2014091159418414, + "learning_rate": 3.683046655673892e-06, + "loss": 0.2624, + "step": 9498 + }, + { + "epoch": 0.6, + "grad_norm": 2.0876981272408086, + "learning_rate": 3.682064169654663e-06, + "loss": 0.2714, + "step": 9499 + }, + { + "epoch": 0.6, + "grad_norm": 1.9262082312810136, + "learning_rate": 3.681081738318931e-06, + "loss": 0.2787, + "step": 9500 + }, + { + "epoch": 0.6, + "grad_norm": 1.8365381110416983, + "learning_rate": 3.680099361707461e-06, + "loss": 0.2538, + "step": 9501 + }, + { + "epoch": 0.6, + "grad_norm": 1.6147746977786728, + "learning_rate": 3.6791170398610117e-06, + "loss": 0.2692, + "step": 9502 + }, + { + "epoch": 0.6, + "grad_norm": 1.5471520912199355, + "learning_rate": 3.6781347728203433e-06, + "loss": 0.3003, + "step": 9503 + }, + { + "epoch": 0.6, + "grad_norm": 1.770211887379422, + "learning_rate": 3.6771525606262106e-06, + "loss": 0.2785, + "step": 9504 + }, + { + "epoch": 0.6, + "grad_norm": 3.170504245820591, + "learning_rate": 3.676170403319369e-06, + "loss": 0.2679, + "step": 9505 + }, + { + "epoch": 0.6, + "grad_norm": 1.6214248961645321, + "learning_rate": 3.6751883009405677e-06, + "loss": 0.2667, + "step": 9506 + }, + { + "epoch": 0.6, + "grad_norm": 1.5515567298608008, + "learning_rate": 3.674206253530558e-06, + "loss": 0.2733, + "step": 9507 + }, + { + "epoch": 0.6, + "grad_norm": 3.007672022318736, + "learning_rate": 3.673224261130085e-06, + "loss": 0.259, + "step": 9508 + }, + { + "epoch": 0.6, + "grad_norm": 2.1697933187202643, + "learning_rate": 3.6722423237798934e-06, + "loss": 0.2869, + "step": 9509 + }, + { + "epoch": 0.6, + "grad_norm": 4.79572793943694, + "learning_rate": 3.671260441520727e-06, + "loss": 0.2651, + "step": 9510 + }, + { + "epoch": 0.6, + "grad_norm": 1.8426392257354633, + "learning_rate": 3.6702786143933244e-06, + "loss": 0.2844, + "step": 9511 + }, + { + "epoch": 0.6, + "grad_norm": 1.6975805241094903, + "learning_rate": 3.6692968424384246e-06, + "loss": 0.2553, + "step": 9512 + }, + { + "epoch": 0.6, + "grad_norm": 2.6287716843869617, + "learning_rate": 3.668315125696763e-06, + "loss": 0.2726, + "step": 9513 + }, + { + "epoch": 0.6, + "grad_norm": 2.325283612108341, + "learning_rate": 3.667333464209071e-06, + "loss": 0.2788, + "step": 9514 + }, + { + "epoch": 0.6, + "grad_norm": 4.702951496301354, + "learning_rate": 3.6663518580160816e-06, + "loss": 0.2747, + "step": 9515 + }, + { + "epoch": 0.6, + "grad_norm": 2.004179882663363, + "learning_rate": 3.665370307158521e-06, + "loss": 0.2756, + "step": 9516 + }, + { + "epoch": 0.6, + "grad_norm": 1.5485402154075512, + "learning_rate": 3.6643888116771166e-06, + "loss": 0.2737, + "step": 9517 + }, + { + "epoch": 0.6, + "grad_norm": 1.4038471225670404, + "learning_rate": 3.663407371612595e-06, + "loss": 0.2727, + "step": 9518 + }, + { + "epoch": 0.6, + "grad_norm": 1.7067413961004168, + "learning_rate": 3.6624259870056745e-06, + "loss": 0.2655, + "step": 9519 + }, + { + "epoch": 0.6, + "grad_norm": 5.277148410551273, + "learning_rate": 3.661444657897075e-06, + "loss": 0.3208, + "step": 9520 + }, + { + "epoch": 0.6, + "grad_norm": 1.7869677855628887, + "learning_rate": 3.6604633843275155e-06, + "loss": 0.2756, + "step": 9521 + }, + { + "epoch": 0.6, + "grad_norm": 2.4524726043992415, + "learning_rate": 3.6594821663377084e-06, + "loss": 0.2738, + "step": 9522 + }, + { + "epoch": 0.6, + "grad_norm": 4.060214208521723, + "learning_rate": 3.6585010039683684e-06, + "loss": 0.2724, + "step": 9523 + }, + { + "epoch": 0.6, + "grad_norm": 1.8525163279935382, + "learning_rate": 3.6575198972602024e-06, + "loss": 0.2647, + "step": 9524 + }, + { + "epoch": 0.6, + "grad_norm": 2.025691398342604, + "learning_rate": 3.6565388462539216e-06, + "loss": 0.2776, + "step": 9525 + }, + { + "epoch": 0.6, + "grad_norm": 1.6410247161687948, + "learning_rate": 3.655557850990231e-06, + "loss": 0.2577, + "step": 9526 + }, + { + "epoch": 0.6, + "grad_norm": 3.0813297162239737, + "learning_rate": 3.6545769115098324e-06, + "loss": 0.2805, + "step": 9527 + }, + { + "epoch": 0.6, + "grad_norm": 4.382361030620752, + "learning_rate": 3.6535960278534273e-06, + "loss": 0.2588, + "step": 9528 + }, + { + "epoch": 0.6, + "grad_norm": 1.6712409287031182, + "learning_rate": 3.6526152000617153e-06, + "loss": 0.2617, + "step": 9529 + }, + { + "epoch": 0.6, + "grad_norm": 0.6444792416426703, + "learning_rate": 3.651634428175391e-06, + "loss": 0.5259, + "step": 9530 + }, + { + "epoch": 0.6, + "grad_norm": 4.142673954328926, + "learning_rate": 3.6506537122351506e-06, + "loss": 0.2795, + "step": 9531 + }, + { + "epoch": 0.6, + "grad_norm": 1.4073711576279704, + "learning_rate": 3.6496730522816826e-06, + "loss": 0.2696, + "step": 9532 + }, + { + "epoch": 0.6, + "grad_norm": 1.810236169890428, + "learning_rate": 3.6486924483556785e-06, + "loss": 0.288, + "step": 9533 + }, + { + "epoch": 0.6, + "grad_norm": 2.2463740861290313, + "learning_rate": 3.6477119004978256e-06, + "loss": 0.2631, + "step": 9534 + }, + { + "epoch": 0.6, + "grad_norm": 0.6096661233310655, + "learning_rate": 3.646731408748808e-06, + "loss": 0.485, + "step": 9535 + }, + { + "epoch": 0.6, + "grad_norm": 1.5564148241602025, + "learning_rate": 3.6457509731493067e-06, + "loss": 0.2662, + "step": 9536 + }, + { + "epoch": 0.6, + "grad_norm": 2.2205695796794913, + "learning_rate": 3.6447705937400046e-06, + "loss": 0.2825, + "step": 9537 + }, + { + "epoch": 0.6, + "grad_norm": 2.3763773775770396, + "learning_rate": 3.6437902705615767e-06, + "loss": 0.272, + "step": 9538 + }, + { + "epoch": 0.6, + "grad_norm": 1.6252637185109093, + "learning_rate": 3.642810003654699e-06, + "loss": 0.285, + "step": 9539 + }, + { + "epoch": 0.6, + "grad_norm": 4.600352836256971, + "learning_rate": 3.6418297930600453e-06, + "loss": 0.2809, + "step": 9540 + }, + { + "epoch": 0.6, + "grad_norm": 1.8982121176160733, + "learning_rate": 3.6408496388182857e-06, + "loss": 0.2713, + "step": 9541 + }, + { + "epoch": 0.6, + "grad_norm": 1.8034384724322994, + "learning_rate": 3.639869540970089e-06, + "loss": 0.261, + "step": 9542 + }, + { + "epoch": 0.6, + "grad_norm": 2.097463555484217, + "learning_rate": 3.6388894995561217e-06, + "loss": 0.2792, + "step": 9543 + }, + { + "epoch": 0.6, + "grad_norm": 1.704476350937524, + "learning_rate": 3.637909514617046e-06, + "loss": 0.2811, + "step": 9544 + }, + { + "epoch": 0.6, + "grad_norm": 2.3409169881743384, + "learning_rate": 3.6369295861935244e-06, + "loss": 0.2808, + "step": 9545 + }, + { + "epoch": 0.6, + "grad_norm": 3.630134912623273, + "learning_rate": 3.6359497143262147e-06, + "loss": 0.2964, + "step": 9546 + }, + { + "epoch": 0.6, + "grad_norm": 2.659050686141471, + "learning_rate": 3.6349698990557726e-06, + "loss": 0.2805, + "step": 9547 + }, + { + "epoch": 0.6, + "grad_norm": 3.6016395129545904, + "learning_rate": 3.6339901404228563e-06, + "loss": 0.2605, + "step": 9548 + }, + { + "epoch": 0.6, + "grad_norm": 17.524636413876273, + "learning_rate": 3.6330104384681146e-06, + "loss": 0.2828, + "step": 9549 + }, + { + "epoch": 0.6, + "grad_norm": 2.1901263874629353, + "learning_rate": 3.6320307932321975e-06, + "loss": 0.2774, + "step": 9550 + }, + { + "epoch": 0.6, + "grad_norm": 2.596512830833323, + "learning_rate": 3.631051204755754e-06, + "loss": 0.2891, + "step": 9551 + }, + { + "epoch": 0.6, + "grad_norm": 1.7342481675109205, + "learning_rate": 3.630071673079426e-06, + "loss": 0.278, + "step": 9552 + }, + { + "epoch": 0.6, + "grad_norm": 2.0497940911905683, + "learning_rate": 3.629092198243859e-06, + "loss": 0.277, + "step": 9553 + }, + { + "epoch": 0.6, + "grad_norm": 1.957816654030364, + "learning_rate": 3.6281127802896897e-06, + "loss": 0.2831, + "step": 9554 + }, + { + "epoch": 0.6, + "grad_norm": 6.179435272257459, + "learning_rate": 3.6271334192575588e-06, + "loss": 0.2623, + "step": 9555 + }, + { + "epoch": 0.6, + "grad_norm": 1.9349456326869183, + "learning_rate": 3.626154115188102e-06, + "loss": 0.272, + "step": 9556 + }, + { + "epoch": 0.6, + "grad_norm": 1.9523069537094317, + "learning_rate": 3.6251748681219494e-06, + "loss": 0.2677, + "step": 9557 + }, + { + "epoch": 0.6, + "grad_norm": 2.4384347855510473, + "learning_rate": 3.6241956780997345e-06, + "loss": 0.2821, + "step": 9558 + }, + { + "epoch": 0.6, + "grad_norm": 1.8262044086859577, + "learning_rate": 3.623216545162085e-06, + "loss": 0.2821, + "step": 9559 + }, + { + "epoch": 0.6, + "grad_norm": 1.9774129701399263, + "learning_rate": 3.6222374693496252e-06, + "loss": 0.2746, + "step": 9560 + }, + { + "epoch": 0.6, + "grad_norm": 3.2819152854959217, + "learning_rate": 3.621258450702982e-06, + "loss": 0.3035, + "step": 9561 + }, + { + "epoch": 0.6, + "grad_norm": 1.9882917236724367, + "learning_rate": 3.620279489262772e-06, + "loss": 0.2748, + "step": 9562 + }, + { + "epoch": 0.6, + "grad_norm": 2.247525828893059, + "learning_rate": 3.6193005850696173e-06, + "loss": 0.2816, + "step": 9563 + }, + { + "epoch": 0.6, + "grad_norm": 1.8202638992073676, + "learning_rate": 3.6183217381641355e-06, + "loss": 0.2862, + "step": 9564 + }, + { + "epoch": 0.6, + "grad_norm": 7.940772928729769, + "learning_rate": 3.617342948586937e-06, + "loss": 0.2683, + "step": 9565 + }, + { + "epoch": 0.6, + "grad_norm": 1.7387636580466237, + "learning_rate": 3.616364216378636e-06, + "loss": 0.2791, + "step": 9566 + }, + { + "epoch": 0.6, + "grad_norm": 2.1988999742472717, + "learning_rate": 3.615385541579842e-06, + "loss": 0.2757, + "step": 9567 + }, + { + "epoch": 0.6, + "grad_norm": 3.2291481414072614, + "learning_rate": 3.614406924231161e-06, + "loss": 0.2854, + "step": 9568 + }, + { + "epoch": 0.6, + "grad_norm": 1.9529787782012953, + "learning_rate": 3.613428364373196e-06, + "loss": 0.2823, + "step": 9569 + }, + { + "epoch": 0.6, + "grad_norm": 1.7818943939925946, + "learning_rate": 3.612449862046553e-06, + "loss": 0.2595, + "step": 9570 + }, + { + "epoch": 0.6, + "grad_norm": 3.4601553066560093, + "learning_rate": 3.611471417291829e-06, + "loss": 0.2745, + "step": 9571 + }, + { + "epoch": 0.6, + "grad_norm": 1.6440577796244955, + "learning_rate": 3.610493030149623e-06, + "loss": 0.2721, + "step": 9572 + }, + { + "epoch": 0.6, + "grad_norm": 2.678286651123387, + "learning_rate": 3.6095147006605293e-06, + "loss": 0.2681, + "step": 9573 + }, + { + "epoch": 0.6, + "grad_norm": 1.7834784881495682, + "learning_rate": 3.6085364288651394e-06, + "loss": 0.2722, + "step": 9574 + }, + { + "epoch": 0.6, + "grad_norm": 2.814325177259081, + "learning_rate": 3.6075582148040454e-06, + "loss": 0.2871, + "step": 9575 + }, + { + "epoch": 0.6, + "grad_norm": 0.6202310688168078, + "learning_rate": 3.606580058517834e-06, + "loss": 0.5071, + "step": 9576 + }, + { + "epoch": 0.6, + "grad_norm": 1.7258981426967315, + "learning_rate": 3.6056019600470893e-06, + "loss": 0.2909, + "step": 9577 + }, + { + "epoch": 0.6, + "grad_norm": 1.7696629825994823, + "learning_rate": 3.6046239194323983e-06, + "loss": 0.264, + "step": 9578 + }, + { + "epoch": 0.6, + "grad_norm": 1.8065173962302052, + "learning_rate": 3.6036459367143388e-06, + "loss": 0.2726, + "step": 9579 + }, + { + "epoch": 0.6, + "grad_norm": 1.2862529438157462, + "learning_rate": 3.602668011933489e-06, + "loss": 0.276, + "step": 9580 + }, + { + "epoch": 0.6, + "grad_norm": 2.418036462727418, + "learning_rate": 3.6016901451304265e-06, + "loss": 0.3053, + "step": 9581 + }, + { + "epoch": 0.6, + "grad_norm": 2.4270508975282663, + "learning_rate": 3.6007123363457232e-06, + "loss": 0.2808, + "step": 9582 + }, + { + "epoch": 0.6, + "grad_norm": 7.536831210253516, + "learning_rate": 3.5997345856199506e-06, + "loss": 0.2839, + "step": 9583 + }, + { + "epoch": 0.6, + "grad_norm": 2.9428651752459847, + "learning_rate": 3.5987568929936756e-06, + "loss": 0.2851, + "step": 9584 + }, + { + "epoch": 0.6, + "grad_norm": 1.5331140393216478, + "learning_rate": 3.5977792585074668e-06, + "loss": 0.2546, + "step": 9585 + }, + { + "epoch": 0.6, + "grad_norm": 3.3586872468600992, + "learning_rate": 3.596801682201888e-06, + "loss": 0.284, + "step": 9586 + }, + { + "epoch": 0.6, + "grad_norm": 2.404316042555868, + "learning_rate": 3.5958241641174995e-06, + "loss": 0.2785, + "step": 9587 + }, + { + "epoch": 0.6, + "grad_norm": 2.1202416960554027, + "learning_rate": 3.5948467042948597e-06, + "loss": 0.2809, + "step": 9588 + }, + { + "epoch": 0.6, + "grad_norm": 2.6431241123399385, + "learning_rate": 3.5938693027745276e-06, + "loss": 0.2767, + "step": 9589 + }, + { + "epoch": 0.6, + "grad_norm": 1.3141477584046433, + "learning_rate": 3.5928919595970546e-06, + "loss": 0.2599, + "step": 9590 + }, + { + "epoch": 0.6, + "grad_norm": 1.770921993973036, + "learning_rate": 3.5919146748029923e-06, + "loss": 0.2676, + "step": 9591 + }, + { + "epoch": 0.6, + "grad_norm": 1.6714197358579923, + "learning_rate": 3.590937448432893e-06, + "loss": 0.2825, + "step": 9592 + }, + { + "epoch": 0.6, + "grad_norm": 1.5364436102142878, + "learning_rate": 3.589960280527301e-06, + "loss": 0.278, + "step": 9593 + }, + { + "epoch": 0.6, + "grad_norm": 3.229902802414021, + "learning_rate": 3.5889831711267618e-06, + "loss": 0.2728, + "step": 9594 + }, + { + "epoch": 0.6, + "grad_norm": 1.7823816207742216, + "learning_rate": 3.5880061202718175e-06, + "loss": 0.2687, + "step": 9595 + }, + { + "epoch": 0.6, + "grad_norm": 2.947545949653769, + "learning_rate": 3.587029128003006e-06, + "loss": 0.2854, + "step": 9596 + }, + { + "epoch": 0.6, + "grad_norm": 1.9485674708226834, + "learning_rate": 3.5860521943608672e-06, + "loss": 0.2777, + "step": 9597 + }, + { + "epoch": 0.6, + "grad_norm": 1.4068436711392398, + "learning_rate": 3.5850753193859333e-06, + "loss": 0.2802, + "step": 9598 + }, + { + "epoch": 0.6, + "grad_norm": 2.1895938060414317, + "learning_rate": 3.584098503118737e-06, + "loss": 0.2553, + "step": 9599 + }, + { + "epoch": 0.6, + "grad_norm": 2.440780159511413, + "learning_rate": 3.5831217455998103e-06, + "loss": 0.2854, + "step": 9600 + }, + { + "epoch": 0.6, + "grad_norm": 2.7029271831529336, + "learning_rate": 3.5821450468696772e-06, + "loss": 0.2564, + "step": 9601 + }, + { + "epoch": 0.6, + "grad_norm": 6.233549206608332, + "learning_rate": 3.581168406968867e-06, + "loss": 0.2702, + "step": 9602 + }, + { + "epoch": 0.6, + "grad_norm": 2.5303859237555177, + "learning_rate": 3.5801918259378975e-06, + "loss": 0.2618, + "step": 9603 + }, + { + "epoch": 0.6, + "grad_norm": 2.476267417080039, + "learning_rate": 3.5792153038172916e-06, + "loss": 0.2846, + "step": 9604 + }, + { + "epoch": 0.6, + "grad_norm": 1.717802952989295, + "learning_rate": 3.5782388406475675e-06, + "loss": 0.2742, + "step": 9605 + }, + { + "epoch": 0.6, + "grad_norm": 1.9311225811939217, + "learning_rate": 3.577262436469237e-06, + "loss": 0.2551, + "step": 9606 + }, + { + "epoch": 0.6, + "grad_norm": 1.5687961319052377, + "learning_rate": 3.5762860913228157e-06, + "loss": 0.2778, + "step": 9607 + }, + { + "epoch": 0.6, + "grad_norm": 1.662154331815429, + "learning_rate": 3.575309805248815e-06, + "loss": 0.2594, + "step": 9608 + }, + { + "epoch": 0.6, + "grad_norm": 2.770985071755837, + "learning_rate": 3.57433357828774e-06, + "loss": 0.276, + "step": 9609 + }, + { + "epoch": 0.6, + "grad_norm": 1.9560992930775103, + "learning_rate": 3.5733574104800973e-06, + "loss": 0.2923, + "step": 9610 + }, + { + "epoch": 0.6, + "grad_norm": 2.889595933118841, + "learning_rate": 3.5723813018663895e-06, + "loss": 0.2866, + "step": 9611 + }, + { + "epoch": 0.6, + "grad_norm": 1.8200372092854165, + "learning_rate": 3.5714052524871166e-06, + "loss": 0.2617, + "step": 9612 + }, + { + "epoch": 0.6, + "grad_norm": 1.854558588932161, + "learning_rate": 3.5704292623827786e-06, + "loss": 0.2647, + "step": 9613 + }, + { + "epoch": 0.6, + "grad_norm": 8.259319595869737, + "learning_rate": 3.569453331593867e-06, + "loss": 0.294, + "step": 9614 + }, + { + "epoch": 0.6, + "grad_norm": 2.326140397985674, + "learning_rate": 3.5684774601608797e-06, + "loss": 0.2748, + "step": 9615 + }, + { + "epoch": 0.6, + "grad_norm": 1.686550438990365, + "learning_rate": 3.5675016481243065e-06, + "loss": 0.273, + "step": 9616 + }, + { + "epoch": 0.6, + "grad_norm": 1.8149565447399794, + "learning_rate": 3.566525895524632e-06, + "loss": 0.2666, + "step": 9617 + }, + { + "epoch": 0.6, + "grad_norm": 1.734911669419096, + "learning_rate": 3.5655502024023447e-06, + "loss": 0.2823, + "step": 9618 + }, + { + "epoch": 0.6, + "grad_norm": 6.464908315509748, + "learning_rate": 3.564574568797928e-06, + "loss": 0.267, + "step": 9619 + }, + { + "epoch": 0.6, + "grad_norm": 2.6175812496529116, + "learning_rate": 3.5635989947518625e-06, + "loss": 0.2653, + "step": 9620 + }, + { + "epoch": 0.61, + "grad_norm": 2.6728828239123588, + "learning_rate": 3.562623480304623e-06, + "loss": 0.2904, + "step": 9621 + }, + { + "epoch": 0.61, + "grad_norm": 4.739819967402756, + "learning_rate": 3.5616480254966925e-06, + "loss": 0.2685, + "step": 9622 + }, + { + "epoch": 0.61, + "grad_norm": 3.673540114471451, + "learning_rate": 3.5606726303685378e-06, + "loss": 0.2773, + "step": 9623 + }, + { + "epoch": 0.61, + "grad_norm": 1.8132328684780588, + "learning_rate": 3.5596972949606333e-06, + "loss": 0.2837, + "step": 9624 + }, + { + "epoch": 0.61, + "grad_norm": 1.9380952332327748, + "learning_rate": 3.5587220193134457e-06, + "loss": 0.2864, + "step": 9625 + }, + { + "epoch": 0.61, + "grad_norm": 2.7651821095245928, + "learning_rate": 3.5577468034674414e-06, + "loss": 0.2706, + "step": 9626 + }, + { + "epoch": 0.61, + "grad_norm": 2.231627056763443, + "learning_rate": 3.5567716474630853e-06, + "loss": 0.2685, + "step": 9627 + }, + { + "epoch": 0.61, + "grad_norm": 1.6114014750004175, + "learning_rate": 3.5557965513408353e-06, + "loss": 0.2595, + "step": 9628 + }, + { + "epoch": 0.61, + "grad_norm": 2.01174626091983, + "learning_rate": 3.554821515141151e-06, + "loss": 0.2746, + "step": 9629 + }, + { + "epoch": 0.61, + "grad_norm": 2.084811627206056, + "learning_rate": 3.553846538904491e-06, + "loss": 0.2602, + "step": 9630 + }, + { + "epoch": 0.61, + "grad_norm": 2.415620312522918, + "learning_rate": 3.5528716226713055e-06, + "loss": 0.2632, + "step": 9631 + }, + { + "epoch": 0.61, + "grad_norm": 2.1303684915519185, + "learning_rate": 3.551896766482048e-06, + "loss": 0.2802, + "step": 9632 + }, + { + "epoch": 0.61, + "grad_norm": 1.593530392158075, + "learning_rate": 3.5509219703771647e-06, + "loss": 0.2547, + "step": 9633 + }, + { + "epoch": 0.61, + "grad_norm": 3.328479410567316, + "learning_rate": 3.5499472343971027e-06, + "loss": 0.2509, + "step": 9634 + }, + { + "epoch": 0.61, + "grad_norm": 13.704078756234964, + "learning_rate": 3.5489725585823064e-06, + "loss": 0.2883, + "step": 9635 + }, + { + "epoch": 0.61, + "grad_norm": 2.634038761388328, + "learning_rate": 3.5479979429732134e-06, + "loss": 0.2686, + "step": 9636 + }, + { + "epoch": 0.61, + "grad_norm": 3.1474182986777586, + "learning_rate": 3.5470233876102672e-06, + "loss": 0.2815, + "step": 9637 + }, + { + "epoch": 0.61, + "grad_norm": 1.6120178257820572, + "learning_rate": 3.5460488925339013e-06, + "loss": 0.2823, + "step": 9638 + }, + { + "epoch": 0.61, + "grad_norm": 2.6475636322091867, + "learning_rate": 3.545074457784549e-06, + "loss": 0.2595, + "step": 9639 + }, + { + "epoch": 0.61, + "grad_norm": 1.5034596261966902, + "learning_rate": 3.5441000834026427e-06, + "loss": 0.2702, + "step": 9640 + }, + { + "epoch": 0.61, + "grad_norm": 1.7355213612859415, + "learning_rate": 3.543125769428608e-06, + "loss": 0.2643, + "step": 9641 + }, + { + "epoch": 0.61, + "grad_norm": 1.7340420068099016, + "learning_rate": 3.542151515902874e-06, + "loss": 0.2674, + "step": 9642 + }, + { + "epoch": 0.61, + "grad_norm": 1.8024617141733321, + "learning_rate": 3.5411773228658635e-06, + "loss": 0.2716, + "step": 9643 + }, + { + "epoch": 0.61, + "grad_norm": 2.428585854706941, + "learning_rate": 3.5402031903579946e-06, + "loss": 0.2707, + "step": 9644 + }, + { + "epoch": 0.61, + "grad_norm": 3.9927899800573674, + "learning_rate": 3.5392291184196903e-06, + "loss": 0.2814, + "step": 9645 + }, + { + "epoch": 0.61, + "grad_norm": 1.8924978471352056, + "learning_rate": 3.5382551070913647e-06, + "loss": 0.2616, + "step": 9646 + }, + { + "epoch": 0.61, + "grad_norm": 1.9397480744608648, + "learning_rate": 3.5372811564134303e-06, + "loss": 0.2586, + "step": 9647 + }, + { + "epoch": 0.61, + "grad_norm": 2.2187808600142427, + "learning_rate": 3.5363072664263e-06, + "loss": 0.2868, + "step": 9648 + }, + { + "epoch": 0.61, + "grad_norm": 5.965577050260435, + "learning_rate": 3.5353334371703797e-06, + "loss": 0.2712, + "step": 9649 + }, + { + "epoch": 0.61, + "grad_norm": 2.48875613701296, + "learning_rate": 3.5343596686860757e-06, + "loss": 0.274, + "step": 9650 + }, + { + "epoch": 0.61, + "grad_norm": 4.033883317329177, + "learning_rate": 3.533385961013793e-06, + "loss": 0.2754, + "step": 9651 + }, + { + "epoch": 0.61, + "grad_norm": 2.7437274630426374, + "learning_rate": 3.5324123141939327e-06, + "loss": 0.2636, + "step": 9652 + }, + { + "epoch": 0.61, + "grad_norm": 2.274467485172925, + "learning_rate": 3.5314387282668917e-06, + "loss": 0.2636, + "step": 9653 + }, + { + "epoch": 0.61, + "grad_norm": 2.2679854576901377, + "learning_rate": 3.530465203273067e-06, + "loss": 0.2601, + "step": 9654 + }, + { + "epoch": 0.61, + "grad_norm": 1.7316515382461843, + "learning_rate": 3.5294917392528504e-06, + "loss": 0.2671, + "step": 9655 + }, + { + "epoch": 0.61, + "grad_norm": 1.906097991358715, + "learning_rate": 3.5285183362466346e-06, + "loss": 0.2932, + "step": 9656 + }, + { + "epoch": 0.61, + "grad_norm": 2.4206860044962317, + "learning_rate": 3.527544994294806e-06, + "loss": 0.2917, + "step": 9657 + }, + { + "epoch": 0.61, + "grad_norm": 1.8644289231604958, + "learning_rate": 3.5265717134377496e-06, + "loss": 0.3008, + "step": 9658 + }, + { + "epoch": 0.61, + "grad_norm": 2.173226564855597, + "learning_rate": 3.5255984937158505e-06, + "loss": 0.2708, + "step": 9659 + }, + { + "epoch": 0.61, + "grad_norm": 2.0491998347676033, + "learning_rate": 3.52462533516949e-06, + "loss": 0.2832, + "step": 9660 + }, + { + "epoch": 0.61, + "grad_norm": 3.1631596140764504, + "learning_rate": 3.523652237839044e-06, + "loss": 0.2863, + "step": 9661 + }, + { + "epoch": 0.61, + "grad_norm": 4.586519950366805, + "learning_rate": 3.52267920176489e-06, + "loss": 0.2811, + "step": 9662 + }, + { + "epoch": 0.61, + "grad_norm": 2.7611328724501245, + "learning_rate": 3.5217062269873986e-06, + "loss": 0.2673, + "step": 9663 + }, + { + "epoch": 0.61, + "grad_norm": 2.0337344588209123, + "learning_rate": 3.520733313546942e-06, + "loss": 0.2796, + "step": 9664 + }, + { + "epoch": 0.61, + "grad_norm": 2.3318295373856515, + "learning_rate": 3.519760461483888e-06, + "loss": 0.2879, + "step": 9665 + }, + { + "epoch": 0.61, + "grad_norm": 2.1105205949953616, + "learning_rate": 3.5187876708386004e-06, + "loss": 0.2802, + "step": 9666 + }, + { + "epoch": 0.61, + "grad_norm": 1.4433968414646754, + "learning_rate": 3.517814941651444e-06, + "loss": 0.2717, + "step": 9667 + }, + { + "epoch": 0.61, + "grad_norm": 1.3791071439894624, + "learning_rate": 3.5168422739627794e-06, + "loss": 0.2735, + "step": 9668 + }, + { + "epoch": 0.61, + "grad_norm": 2.4088311824071043, + "learning_rate": 3.515869667812962e-06, + "loss": 0.2816, + "step": 9669 + }, + { + "epoch": 0.61, + "grad_norm": 2.6487354639853633, + "learning_rate": 3.5148971232423495e-06, + "loss": 0.2695, + "step": 9670 + }, + { + "epoch": 0.61, + "grad_norm": 13.471460897138366, + "learning_rate": 3.513924640291292e-06, + "loss": 0.2724, + "step": 9671 + }, + { + "epoch": 0.61, + "grad_norm": 1.8466820640785455, + "learning_rate": 3.5129522190001407e-06, + "loss": 0.2717, + "step": 9672 + }, + { + "epoch": 0.61, + "grad_norm": 1.8338084218164097, + "learning_rate": 3.5119798594092426e-06, + "loss": 0.2607, + "step": 9673 + }, + { + "epoch": 0.61, + "grad_norm": 3.141658710673713, + "learning_rate": 3.5110075615589445e-06, + "loss": 0.2931, + "step": 9674 + }, + { + "epoch": 0.61, + "grad_norm": 5.422081082535986, + "learning_rate": 3.5100353254895867e-06, + "loss": 0.2635, + "step": 9675 + }, + { + "epoch": 0.61, + "grad_norm": 1.8453361788018223, + "learning_rate": 3.509063151241511e-06, + "loss": 0.2673, + "step": 9676 + }, + { + "epoch": 0.61, + "grad_norm": 3.836042848355021, + "learning_rate": 3.508091038855052e-06, + "loss": 0.2799, + "step": 9677 + }, + { + "epoch": 0.61, + "grad_norm": 1.6202018431544734, + "learning_rate": 3.507118988370547e-06, + "loss": 0.2726, + "step": 9678 + }, + { + "epoch": 0.61, + "grad_norm": 0.5788904459655787, + "learning_rate": 3.5061469998283255e-06, + "loss": 0.4571, + "step": 9679 + }, + { + "epoch": 0.61, + "grad_norm": 1.5411428286339852, + "learning_rate": 3.5051750732687183e-06, + "loss": 0.2745, + "step": 9680 + }, + { + "epoch": 0.61, + "grad_norm": 1.6994969809747569, + "learning_rate": 3.504203208732052e-06, + "loss": 0.2718, + "step": 9681 + }, + { + "epoch": 0.61, + "grad_norm": 3.6459733915683215, + "learning_rate": 3.5032314062586525e-06, + "loss": 0.2885, + "step": 9682 + }, + { + "epoch": 0.61, + "grad_norm": 1.7305200495893007, + "learning_rate": 3.5022596658888404e-06, + "loss": 0.2721, + "step": 9683 + }, + { + "epoch": 0.61, + "grad_norm": 2.9519146830725447, + "learning_rate": 3.501287987662936e-06, + "loss": 0.2606, + "step": 9684 + }, + { + "epoch": 0.61, + "grad_norm": 3.86565487359643, + "learning_rate": 3.500316371621253e-06, + "loss": 0.2788, + "step": 9685 + }, + { + "epoch": 0.61, + "grad_norm": 0.6391418581472857, + "learning_rate": 3.4993448178041095e-06, + "loss": 0.4783, + "step": 9686 + }, + { + "epoch": 0.61, + "grad_norm": 2.99616721580779, + "learning_rate": 3.498373326251814e-06, + "loss": 0.2875, + "step": 9687 + }, + { + "epoch": 0.61, + "grad_norm": 2.8719292714633617, + "learning_rate": 3.4974018970046745e-06, + "loss": 0.283, + "step": 9688 + }, + { + "epoch": 0.61, + "grad_norm": 1.9315024307023858, + "learning_rate": 3.496430530103001e-06, + "loss": 0.274, + "step": 9689 + }, + { + "epoch": 0.61, + "grad_norm": 1.725138300741341, + "learning_rate": 3.4954592255870964e-06, + "loss": 0.2675, + "step": 9690 + }, + { + "epoch": 0.61, + "grad_norm": 1.989168976277709, + "learning_rate": 3.4944879834972595e-06, + "loss": 0.2755, + "step": 9691 + }, + { + "epoch": 0.61, + "grad_norm": 0.5705142104128255, + "learning_rate": 3.493516803873791e-06, + "loss": 0.4812, + "step": 9692 + }, + { + "epoch": 0.61, + "grad_norm": 2.558959175112957, + "learning_rate": 3.492545686756986e-06, + "loss": 0.2681, + "step": 9693 + }, + { + "epoch": 0.61, + "grad_norm": 1.7260435908193597, + "learning_rate": 3.4915746321871384e-06, + "loss": 0.2738, + "step": 9694 + }, + { + "epoch": 0.61, + "grad_norm": 2.9122895850254102, + "learning_rate": 3.490603640204538e-06, + "loss": 0.2647, + "step": 9695 + }, + { + "epoch": 0.61, + "grad_norm": 1.7505762332473782, + "learning_rate": 3.4896327108494723e-06, + "loss": 0.2547, + "step": 9696 + }, + { + "epoch": 0.61, + "grad_norm": 3.4773745456442398, + "learning_rate": 3.488661844162229e-06, + "loss": 0.2824, + "step": 9697 + }, + { + "epoch": 0.61, + "grad_norm": 1.4595874996958724, + "learning_rate": 3.487691040183091e-06, + "loss": 0.2674, + "step": 9698 + }, + { + "epoch": 0.61, + "grad_norm": 3.7705851454020305, + "learning_rate": 3.4867202989523375e-06, + "loss": 0.2934, + "step": 9699 + }, + { + "epoch": 0.61, + "grad_norm": 3.9124997415924883, + "learning_rate": 3.4857496205102475e-06, + "loss": 0.281, + "step": 9700 + }, + { + "epoch": 0.61, + "grad_norm": 5.965828772178735, + "learning_rate": 3.484779004897094e-06, + "loss": 0.2715, + "step": 9701 + }, + { + "epoch": 0.61, + "grad_norm": 2.1927954447603812, + "learning_rate": 3.4838084521531513e-06, + "loss": 0.268, + "step": 9702 + }, + { + "epoch": 0.61, + "grad_norm": 13.349569070821062, + "learning_rate": 3.4828379623186883e-06, + "loss": 0.2792, + "step": 9703 + }, + { + "epoch": 0.61, + "grad_norm": 2.1068968748909445, + "learning_rate": 3.481867535433974e-06, + "loss": 0.2952, + "step": 9704 + }, + { + "epoch": 0.61, + "grad_norm": 2.076498174164943, + "learning_rate": 3.4808971715392724e-06, + "loss": 0.2959, + "step": 9705 + }, + { + "epoch": 0.61, + "grad_norm": 2.3310854290572403, + "learning_rate": 3.4799268706748457e-06, + "loss": 0.2836, + "step": 9706 + }, + { + "epoch": 0.61, + "grad_norm": 4.888579570534031, + "learning_rate": 3.4789566328809523e-06, + "loss": 0.263, + "step": 9707 + }, + { + "epoch": 0.61, + "grad_norm": 2.048864209009855, + "learning_rate": 3.4779864581978506e-06, + "loss": 0.2666, + "step": 9708 + }, + { + "epoch": 0.61, + "grad_norm": 1.963110769760852, + "learning_rate": 3.477016346665793e-06, + "loss": 0.2819, + "step": 9709 + }, + { + "epoch": 0.61, + "grad_norm": 3.2173594046331218, + "learning_rate": 3.4760462983250327e-06, + "loss": 0.2682, + "step": 9710 + }, + { + "epoch": 0.61, + "grad_norm": 1.6566223404374978, + "learning_rate": 3.475076313215817e-06, + "loss": 0.2577, + "step": 9711 + }, + { + "epoch": 0.61, + "grad_norm": 2.672514898123288, + "learning_rate": 3.4741063913783958e-06, + "loss": 0.2884, + "step": 9712 + }, + { + "epoch": 0.61, + "grad_norm": 1.78725750672085, + "learning_rate": 3.47313653285301e-06, + "loss": 0.2536, + "step": 9713 + }, + { + "epoch": 0.61, + "grad_norm": 2.9106984866986485, + "learning_rate": 3.4721667376799022e-06, + "loss": 0.2626, + "step": 9714 + }, + { + "epoch": 0.61, + "grad_norm": 1.8114622629790127, + "learning_rate": 3.4711970058993092e-06, + "loss": 0.2593, + "step": 9715 + }, + { + "epoch": 0.61, + "grad_norm": 2.947573439174941, + "learning_rate": 3.4702273375514695e-06, + "loss": 0.3067, + "step": 9716 + }, + { + "epoch": 0.61, + "grad_norm": 3.0173391992265914, + "learning_rate": 3.469257732676613e-06, + "loss": 0.2655, + "step": 9717 + }, + { + "epoch": 0.61, + "grad_norm": 2.6964881703085593, + "learning_rate": 3.468288191314971e-06, + "loss": 0.2756, + "step": 9718 + }, + { + "epoch": 0.61, + "grad_norm": 6.87439735988396, + "learning_rate": 3.4673187135067737e-06, + "loss": 0.2887, + "step": 9719 + }, + { + "epoch": 0.61, + "grad_norm": 3.6457496674905365, + "learning_rate": 3.4663492992922467e-06, + "loss": 0.2737, + "step": 9720 + }, + { + "epoch": 0.61, + "grad_norm": 5.40900693369108, + "learning_rate": 3.4653799487116097e-06, + "loss": 0.3005, + "step": 9721 + }, + { + "epoch": 0.61, + "grad_norm": 1.9989324760989478, + "learning_rate": 3.464410661805086e-06, + "loss": 0.2814, + "step": 9722 + }, + { + "epoch": 0.61, + "grad_norm": 2.24778928134871, + "learning_rate": 3.46344143861289e-06, + "loss": 0.2707, + "step": 9723 + }, + { + "epoch": 0.61, + "grad_norm": 2.873259969543035, + "learning_rate": 3.4624722791752395e-06, + "loss": 0.2812, + "step": 9724 + }, + { + "epoch": 0.61, + "grad_norm": 3.9071851397361037, + "learning_rate": 3.4615031835323437e-06, + "loss": 0.2688, + "step": 9725 + }, + { + "epoch": 0.61, + "grad_norm": 4.747612671781315, + "learning_rate": 3.4605341517244123e-06, + "loss": 0.2671, + "step": 9726 + }, + { + "epoch": 0.61, + "grad_norm": 4.549920079770489, + "learning_rate": 3.4595651837916542e-06, + "loss": 0.2821, + "step": 9727 + }, + { + "epoch": 0.61, + "grad_norm": 2.629429335854162, + "learning_rate": 3.4585962797742745e-06, + "loss": 0.2571, + "step": 9728 + }, + { + "epoch": 0.61, + "grad_norm": 3.3913865108676804, + "learning_rate": 3.457627439712472e-06, + "loss": 0.2669, + "step": 9729 + }, + { + "epoch": 0.61, + "grad_norm": 2.973275614830469, + "learning_rate": 3.456658663646447e-06, + "loss": 0.2744, + "step": 9730 + }, + { + "epoch": 0.61, + "grad_norm": 2.1536431811532277, + "learning_rate": 3.4556899516163942e-06, + "loss": 0.2558, + "step": 9731 + }, + { + "epoch": 0.61, + "grad_norm": 2.173750887599849, + "learning_rate": 3.4547213036625107e-06, + "loss": 0.2679, + "step": 9732 + }, + { + "epoch": 0.61, + "grad_norm": 3.444293839107353, + "learning_rate": 3.4537527198249804e-06, + "loss": 0.2827, + "step": 9733 + }, + { + "epoch": 0.61, + "grad_norm": 3.1970971103190347, + "learning_rate": 3.452784200144e-06, + "loss": 0.2825, + "step": 9734 + }, + { + "epoch": 0.61, + "grad_norm": 2.0197608298174905, + "learning_rate": 3.45181574465975e-06, + "loss": 0.2738, + "step": 9735 + }, + { + "epoch": 0.61, + "grad_norm": 7.586690916293952, + "learning_rate": 3.4508473534124155e-06, + "loss": 0.2553, + "step": 9736 + }, + { + "epoch": 0.61, + "grad_norm": 2.221204728485687, + "learning_rate": 3.4498790264421755e-06, + "loss": 0.2999, + "step": 9737 + }, + { + "epoch": 0.61, + "grad_norm": 15.562557774905292, + "learning_rate": 3.4489107637892084e-06, + "loss": 0.272, + "step": 9738 + }, + { + "epoch": 0.61, + "grad_norm": 8.060300648749852, + "learning_rate": 3.4479425654936883e-06, + "loss": 0.2861, + "step": 9739 + }, + { + "epoch": 0.61, + "grad_norm": 3.095010236330334, + "learning_rate": 3.4469744315957863e-06, + "loss": 0.2723, + "step": 9740 + }, + { + "epoch": 0.61, + "grad_norm": 2.1483076401567973, + "learning_rate": 3.4460063621356753e-06, + "loss": 0.2575, + "step": 9741 + }, + { + "epoch": 0.61, + "grad_norm": 1.8839660452427271, + "learning_rate": 3.445038357153521e-06, + "loss": 0.2767, + "step": 9742 + }, + { + "epoch": 0.61, + "grad_norm": 1.5896504501694708, + "learning_rate": 3.4440704166894865e-06, + "loss": 0.2677, + "step": 9743 + }, + { + "epoch": 0.61, + "grad_norm": 1.948613565615765, + "learning_rate": 3.443102540783735e-06, + "loss": 0.2889, + "step": 9744 + }, + { + "epoch": 0.61, + "grad_norm": 1.870423470052233, + "learning_rate": 3.4421347294764236e-06, + "loss": 0.2557, + "step": 9745 + }, + { + "epoch": 0.61, + "grad_norm": 8.399463870416707, + "learning_rate": 3.44116698280771e-06, + "loss": 0.2863, + "step": 9746 + }, + { + "epoch": 0.61, + "grad_norm": 3.383572031556528, + "learning_rate": 3.440199300817746e-06, + "loss": 0.2531, + "step": 9747 + }, + { + "epoch": 0.61, + "grad_norm": 2.3395363548219366, + "learning_rate": 3.4392316835466834e-06, + "loss": 0.2646, + "step": 9748 + }, + { + "epoch": 0.61, + "grad_norm": 1.4901782868002784, + "learning_rate": 3.4382641310346705e-06, + "loss": 0.2628, + "step": 9749 + }, + { + "epoch": 0.61, + "grad_norm": 0.6094170931302001, + "learning_rate": 3.437296643321854e-06, + "loss": 0.4974, + "step": 9750 + }, + { + "epoch": 0.61, + "grad_norm": 1.5547683876758882, + "learning_rate": 3.4363292204483745e-06, + "loss": 0.2522, + "step": 9751 + }, + { + "epoch": 0.61, + "grad_norm": 13.152238272698963, + "learning_rate": 3.435361862454374e-06, + "loss": 0.2768, + "step": 9752 + }, + { + "epoch": 0.61, + "grad_norm": 3.340303677330552, + "learning_rate": 3.4343945693799885e-06, + "loss": 0.313, + "step": 9753 + }, + { + "epoch": 0.61, + "grad_norm": 3.167579734046241, + "learning_rate": 3.4334273412653534e-06, + "loss": 0.2723, + "step": 9754 + }, + { + "epoch": 0.61, + "grad_norm": 1.9885965037663498, + "learning_rate": 3.4324601781505973e-06, + "loss": 0.2588, + "step": 9755 + }, + { + "epoch": 0.61, + "grad_norm": 1.8703308316835587, + "learning_rate": 3.431493080075856e-06, + "loss": 0.2655, + "step": 9756 + }, + { + "epoch": 0.61, + "grad_norm": 2.507613676412212, + "learning_rate": 3.4305260470812522e-06, + "loss": 0.2658, + "step": 9757 + }, + { + "epoch": 0.61, + "grad_norm": 2.202107476049587, + "learning_rate": 3.429559079206911e-06, + "loss": 0.2546, + "step": 9758 + }, + { + "epoch": 0.61, + "grad_norm": 2.5256971890429445, + "learning_rate": 3.428592176492952e-06, + "loss": 0.2929, + "step": 9759 + }, + { + "epoch": 0.61, + "grad_norm": 0.6158143000365329, + "learning_rate": 3.427625338979496e-06, + "loss": 0.4779, + "step": 9760 + }, + { + "epoch": 0.61, + "grad_norm": 1.7972835015970774, + "learning_rate": 3.4266585667066567e-06, + "loss": 0.2903, + "step": 9761 + }, + { + "epoch": 0.61, + "grad_norm": 3.0395101630351227, + "learning_rate": 3.4256918597145484e-06, + "loss": 0.2757, + "step": 9762 + }, + { + "epoch": 0.61, + "grad_norm": 2.310466766664242, + "learning_rate": 3.424725218043279e-06, + "loss": 0.274, + "step": 9763 + }, + { + "epoch": 0.61, + "grad_norm": 1.3011208309822484, + "learning_rate": 3.4237586417329604e-06, + "loss": 0.2813, + "step": 9764 + }, + { + "epoch": 0.61, + "grad_norm": 2.5851888721427545, + "learning_rate": 3.4227921308236943e-06, + "loss": 0.2658, + "step": 9765 + }, + { + "epoch": 0.61, + "grad_norm": 2.4594726188202216, + "learning_rate": 3.4218256853555864e-06, + "loss": 0.2666, + "step": 9766 + }, + { + "epoch": 0.61, + "grad_norm": 2.282973661879234, + "learning_rate": 3.4208593053687323e-06, + "loss": 0.263, + "step": 9767 + }, + { + "epoch": 0.61, + "grad_norm": 2.229247173957106, + "learning_rate": 3.419892990903231e-06, + "loss": 0.2613, + "step": 9768 + }, + { + "epoch": 0.61, + "grad_norm": 1.8265981069370543, + "learning_rate": 3.4189267419991756e-06, + "loss": 0.2859, + "step": 9769 + }, + { + "epoch": 0.61, + "grad_norm": 1.6153322422561114, + "learning_rate": 3.417960558696657e-06, + "loss": 0.2768, + "step": 9770 + }, + { + "epoch": 0.61, + "grad_norm": 2.0136087141017507, + "learning_rate": 3.4169944410357657e-06, + "loss": 0.285, + "step": 9771 + }, + { + "epoch": 0.61, + "grad_norm": 2.0237742417858455, + "learning_rate": 3.4160283890565877e-06, + "loss": 0.2685, + "step": 9772 + }, + { + "epoch": 0.61, + "grad_norm": 1.7441269897520957, + "learning_rate": 3.4150624027992043e-06, + "loss": 0.2813, + "step": 9773 + }, + { + "epoch": 0.61, + "grad_norm": 0.6276477488264848, + "learning_rate": 3.4140964823036984e-06, + "loss": 0.5328, + "step": 9774 + }, + { + "epoch": 0.61, + "grad_norm": 2.180546889951866, + "learning_rate": 3.413130627610145e-06, + "loss": 0.2735, + "step": 9775 + }, + { + "epoch": 0.61, + "grad_norm": 2.752192174421867, + "learning_rate": 3.4121648387586216e-06, + "loss": 0.2816, + "step": 9776 + }, + { + "epoch": 0.61, + "grad_norm": 2.0246969590769095, + "learning_rate": 3.411199115789198e-06, + "loss": 0.2931, + "step": 9777 + }, + { + "epoch": 0.61, + "grad_norm": 1.73087687941042, + "learning_rate": 3.4102334587419437e-06, + "loss": 0.2611, + "step": 9778 + }, + { + "epoch": 0.61, + "grad_norm": 1.5906763656306007, + "learning_rate": 3.4092678676569292e-06, + "loss": 0.2669, + "step": 9779 + }, + { + "epoch": 0.62, + "grad_norm": 2.366577653114579, + "learning_rate": 3.408302342574216e-06, + "loss": 0.2923, + "step": 9780 + }, + { + "epoch": 0.62, + "grad_norm": 4.000430447468467, + "learning_rate": 3.407336883533866e-06, + "loss": 0.282, + "step": 9781 + }, + { + "epoch": 0.62, + "grad_norm": 2.4360295371800174, + "learning_rate": 3.406371490575938e-06, + "loss": 0.2631, + "step": 9782 + }, + { + "epoch": 0.62, + "grad_norm": 0.5623069510926189, + "learning_rate": 3.405406163740487e-06, + "loss": 0.4546, + "step": 9783 + }, + { + "epoch": 0.62, + "grad_norm": 1.6064021606481163, + "learning_rate": 3.4044409030675663e-06, + "loss": 0.2704, + "step": 9784 + }, + { + "epoch": 0.62, + "grad_norm": 2.2011488028748056, + "learning_rate": 3.403475708597225e-06, + "loss": 0.2574, + "step": 9785 + }, + { + "epoch": 0.62, + "grad_norm": 3.1070165081930075, + "learning_rate": 3.402510580369514e-06, + "loss": 0.2781, + "step": 9786 + }, + { + "epoch": 0.62, + "grad_norm": 2.156579431733777, + "learning_rate": 3.4015455184244763e-06, + "loss": 0.2719, + "step": 9787 + }, + { + "epoch": 0.62, + "grad_norm": 4.668771498571321, + "learning_rate": 3.400580522802155e-06, + "loss": 0.2711, + "step": 9788 + }, + { + "epoch": 0.62, + "grad_norm": 1.9583571969718583, + "learning_rate": 3.399615593542588e-06, + "loss": 0.2656, + "step": 9789 + }, + { + "epoch": 0.62, + "grad_norm": 14.246863501708958, + "learning_rate": 3.398650730685813e-06, + "loss": 0.2775, + "step": 9790 + }, + { + "epoch": 0.62, + "grad_norm": 1.3448309348479337, + "learning_rate": 3.3976859342718625e-06, + "loss": 0.2746, + "step": 9791 + }, + { + "epoch": 0.62, + "grad_norm": 1.9250642633814026, + "learning_rate": 3.3967212043407705e-06, + "loss": 0.2709, + "step": 9792 + }, + { + "epoch": 0.62, + "grad_norm": 3.1228204801377535, + "learning_rate": 3.39575654093256e-06, + "loss": 0.2697, + "step": 9793 + }, + { + "epoch": 0.62, + "grad_norm": 5.445133103556168, + "learning_rate": 3.3947919440872627e-06, + "loss": 0.289, + "step": 9794 + }, + { + "epoch": 0.62, + "grad_norm": 1.874824749896127, + "learning_rate": 3.3938274138448982e-06, + "loss": 0.2524, + "step": 9795 + }, + { + "epoch": 0.62, + "grad_norm": 2.5459995597759115, + "learning_rate": 3.392862950245488e-06, + "loss": 0.2621, + "step": 9796 + }, + { + "epoch": 0.62, + "grad_norm": 2.6170387526796945, + "learning_rate": 3.3918985533290484e-06, + "loss": 0.2795, + "step": 9797 + }, + { + "epoch": 0.62, + "grad_norm": 3.712002999552004, + "learning_rate": 3.390934223135594e-06, + "loss": 0.278, + "step": 9798 + }, + { + "epoch": 0.62, + "grad_norm": 2.586512750861614, + "learning_rate": 3.389969959705136e-06, + "loss": 0.2934, + "step": 9799 + }, + { + "epoch": 0.62, + "grad_norm": 2.2440423120988036, + "learning_rate": 3.3890057630776834e-06, + "loss": 0.2631, + "step": 9800 + }, + { + "epoch": 0.62, + "grad_norm": 2.127058916272636, + "learning_rate": 3.3880416332932442e-06, + "loss": 0.2785, + "step": 9801 + }, + { + "epoch": 0.62, + "grad_norm": 2.004798853258625, + "learning_rate": 3.3870775703918212e-06, + "loss": 0.2769, + "step": 9802 + }, + { + "epoch": 0.62, + "grad_norm": 3.5113578287649134, + "learning_rate": 3.3861135744134143e-06, + "loss": 0.2554, + "step": 9803 + }, + { + "epoch": 0.62, + "grad_norm": 1.3826156861765981, + "learning_rate": 3.3851496453980225e-06, + "loss": 0.2639, + "step": 9804 + }, + { + "epoch": 0.62, + "grad_norm": 6.885710051223053, + "learning_rate": 3.3841857833856396e-06, + "loss": 0.2786, + "step": 9805 + }, + { + "epoch": 0.62, + "grad_norm": 3.0955458839006647, + "learning_rate": 3.383221988416259e-06, + "loss": 0.2697, + "step": 9806 + }, + { + "epoch": 0.62, + "grad_norm": 3.7450305018461862, + "learning_rate": 3.382258260529869e-06, + "loss": 0.2817, + "step": 9807 + }, + { + "epoch": 0.62, + "grad_norm": 3.645270739428072, + "learning_rate": 3.381294599766456e-06, + "loss": 0.2641, + "step": 9808 + }, + { + "epoch": 0.62, + "grad_norm": 1.6665386253635939, + "learning_rate": 3.3803310061660067e-06, + "loss": 0.2734, + "step": 9809 + }, + { + "epoch": 0.62, + "grad_norm": 2.422786529716522, + "learning_rate": 3.3793674797685016e-06, + "loss": 0.2616, + "step": 9810 + }, + { + "epoch": 0.62, + "grad_norm": 2.273669920485398, + "learning_rate": 3.3784040206139175e-06, + "loss": 0.2683, + "step": 9811 + }, + { + "epoch": 0.62, + "grad_norm": 2.0069537499235546, + "learning_rate": 3.3774406287422313e-06, + "loss": 0.2736, + "step": 9812 + }, + { + "epoch": 0.62, + "grad_norm": 3.5082467590142463, + "learning_rate": 3.376477304193415e-06, + "loss": 0.2646, + "step": 9813 + }, + { + "epoch": 0.62, + "grad_norm": 4.558642561368121, + "learning_rate": 3.37551404700744e-06, + "loss": 0.2674, + "step": 9814 + }, + { + "epoch": 0.62, + "grad_norm": 5.327870894682449, + "learning_rate": 3.374550857224269e-06, + "loss": 0.265, + "step": 9815 + }, + { + "epoch": 0.62, + "grad_norm": 1.69322909734235, + "learning_rate": 3.3735877348838738e-06, + "loss": 0.2586, + "step": 9816 + }, + { + "epoch": 0.62, + "grad_norm": 1.9039186857196588, + "learning_rate": 3.372624680026211e-06, + "loss": 0.2621, + "step": 9817 + }, + { + "epoch": 0.62, + "grad_norm": 1.720366894920928, + "learning_rate": 3.3716616926912414e-06, + "loss": 0.2874, + "step": 9818 + }, + { + "epoch": 0.62, + "grad_norm": 1.9363342186241619, + "learning_rate": 3.37069877291892e-06, + "loss": 0.2884, + "step": 9819 + }, + { + "epoch": 0.62, + "grad_norm": 1.9685218744187376, + "learning_rate": 3.369735920749201e-06, + "loss": 0.2739, + "step": 9820 + }, + { + "epoch": 0.62, + "grad_norm": 2.8789735450524714, + "learning_rate": 3.3687731362220334e-06, + "loss": 0.2632, + "step": 9821 + }, + { + "epoch": 0.62, + "grad_norm": 1.96679302862232, + "learning_rate": 3.3678104193773654e-06, + "loss": 0.2771, + "step": 9822 + }, + { + "epoch": 0.62, + "grad_norm": 1.6051793016702407, + "learning_rate": 3.366847770255143e-06, + "loss": 0.268, + "step": 9823 + }, + { + "epoch": 0.62, + "grad_norm": 2.357721907351094, + "learning_rate": 3.3658851888953078e-06, + "loss": 0.2678, + "step": 9824 + }, + { + "epoch": 0.62, + "grad_norm": 1.5222149364058855, + "learning_rate": 3.364922675337798e-06, + "loss": 0.2628, + "step": 9825 + }, + { + "epoch": 0.62, + "grad_norm": 10.32973395785985, + "learning_rate": 3.3639602296225514e-06, + "loss": 0.2764, + "step": 9826 + }, + { + "epoch": 0.62, + "grad_norm": 2.6085685513684953, + "learning_rate": 3.3629978517895e-06, + "loss": 0.2667, + "step": 9827 + }, + { + "epoch": 0.62, + "grad_norm": 1.9163977237106014, + "learning_rate": 3.3620355418785767e-06, + "loss": 0.2632, + "step": 9828 + }, + { + "epoch": 0.62, + "grad_norm": 1.5173020236245705, + "learning_rate": 3.361073299929707e-06, + "loss": 0.2714, + "step": 9829 + }, + { + "epoch": 0.62, + "grad_norm": 1.3939246286157458, + "learning_rate": 3.3601111259828155e-06, + "loss": 0.2841, + "step": 9830 + }, + { + "epoch": 0.62, + "grad_norm": 0.6786020369777017, + "learning_rate": 3.3591490200778276e-06, + "loss": 0.4974, + "step": 9831 + }, + { + "epoch": 0.62, + "grad_norm": 3.85122913637047, + "learning_rate": 3.358186982254662e-06, + "loss": 0.27, + "step": 9832 + }, + { + "epoch": 0.62, + "grad_norm": 5.2280408651590315, + "learning_rate": 3.357225012553234e-06, + "loss": 0.2727, + "step": 9833 + }, + { + "epoch": 0.62, + "grad_norm": 2.391669393244888, + "learning_rate": 3.356263111013459e-06, + "loss": 0.2653, + "step": 9834 + }, + { + "epoch": 0.62, + "grad_norm": 2.045810554509706, + "learning_rate": 3.3553012776752468e-06, + "loss": 0.2593, + "step": 9835 + }, + { + "epoch": 0.62, + "grad_norm": 2.2256173942111057, + "learning_rate": 3.354339512578506e-06, + "loss": 0.2686, + "step": 9836 + }, + { + "epoch": 0.62, + "grad_norm": 1.6763608588122259, + "learning_rate": 3.3533778157631403e-06, + "loss": 0.28, + "step": 9837 + }, + { + "epoch": 0.62, + "grad_norm": 2.97164642342639, + "learning_rate": 3.3524161872690548e-06, + "loss": 0.2568, + "step": 9838 + }, + { + "epoch": 0.62, + "grad_norm": 6.3938486688541465, + "learning_rate": 3.351454627136148e-06, + "loss": 0.2994, + "step": 9839 + }, + { + "epoch": 0.62, + "grad_norm": 2.146615352814978, + "learning_rate": 3.350493135404318e-06, + "loss": 0.2748, + "step": 9840 + }, + { + "epoch": 0.62, + "grad_norm": 2.1725227018700743, + "learning_rate": 3.3495317121134564e-06, + "loss": 0.2771, + "step": 9841 + }, + { + "epoch": 0.62, + "grad_norm": 2.92116124176794, + "learning_rate": 3.3485703573034567e-06, + "loss": 0.2912, + "step": 9842 + }, + { + "epoch": 0.62, + "grad_norm": 4.178429621380662, + "learning_rate": 3.347609071014205e-06, + "loss": 0.2731, + "step": 9843 + }, + { + "epoch": 0.62, + "grad_norm": 1.61580037997173, + "learning_rate": 3.3466478532855896e-06, + "loss": 0.2633, + "step": 9844 + }, + { + "epoch": 0.62, + "grad_norm": 2.3181438531650134, + "learning_rate": 3.3456867041574893e-06, + "loss": 0.2793, + "step": 9845 + }, + { + "epoch": 0.62, + "grad_norm": 1.6170399567764744, + "learning_rate": 3.344725623669786e-06, + "loss": 0.2638, + "step": 9846 + }, + { + "epoch": 0.62, + "grad_norm": 1.7100224589147104, + "learning_rate": 3.343764611862358e-06, + "loss": 0.281, + "step": 9847 + }, + { + "epoch": 0.62, + "grad_norm": 2.716361876494082, + "learning_rate": 3.342803668775078e-06, + "loss": 0.2834, + "step": 9848 + }, + { + "epoch": 0.62, + "grad_norm": 3.649937776220787, + "learning_rate": 3.341842794447817e-06, + "loss": 0.2747, + "step": 9849 + }, + { + "epoch": 0.62, + "grad_norm": 3.6239701754106592, + "learning_rate": 3.340881988920445e-06, + "loss": 0.2655, + "step": 9850 + }, + { + "epoch": 0.62, + "grad_norm": 1.6416178897046088, + "learning_rate": 3.339921252232824e-06, + "loss": 0.2581, + "step": 9851 + }, + { + "epoch": 0.62, + "grad_norm": 3.0418941412002503, + "learning_rate": 3.3389605844248187e-06, + "loss": 0.2735, + "step": 9852 + }, + { + "epoch": 0.62, + "grad_norm": 3.894837418888004, + "learning_rate": 3.33799998553629e-06, + "loss": 0.296, + "step": 9853 + }, + { + "epoch": 0.62, + "grad_norm": 4.081975530778152, + "learning_rate": 3.337039455607094e-06, + "loss": 0.2669, + "step": 9854 + }, + { + "epoch": 0.62, + "grad_norm": 2.358622137777047, + "learning_rate": 3.336078994677085e-06, + "loss": 0.2616, + "step": 9855 + }, + { + "epoch": 0.62, + "grad_norm": 2.0191263319974504, + "learning_rate": 3.3351186027861147e-06, + "loss": 0.2834, + "step": 9856 + }, + { + "epoch": 0.62, + "grad_norm": 4.739503391061953, + "learning_rate": 3.334158279974029e-06, + "loss": 0.2865, + "step": 9857 + }, + { + "epoch": 0.62, + "grad_norm": 18.83563435733747, + "learning_rate": 3.3331980262806774e-06, + "loss": 0.268, + "step": 9858 + }, + { + "epoch": 0.62, + "grad_norm": 3.471785915960166, + "learning_rate": 3.3322378417458985e-06, + "loss": 0.2512, + "step": 9859 + }, + { + "epoch": 0.62, + "grad_norm": 2.12319852461157, + "learning_rate": 3.3312777264095325e-06, + "loss": 0.2789, + "step": 9860 + }, + { + "epoch": 0.62, + "grad_norm": 0.603471809159579, + "learning_rate": 3.330317680311419e-06, + "loss": 0.4489, + "step": 9861 + }, + { + "epoch": 0.62, + "grad_norm": 0.621121359786358, + "learning_rate": 3.3293577034913916e-06, + "loss": 0.5016, + "step": 9862 + }, + { + "epoch": 0.62, + "grad_norm": 1.979352789753031, + "learning_rate": 3.3283977959892798e-06, + "loss": 0.2651, + "step": 9863 + }, + { + "epoch": 0.62, + "grad_norm": 1.6987291839572347, + "learning_rate": 3.3274379578449133e-06, + "loss": 0.2607, + "step": 9864 + }, + { + "epoch": 0.62, + "grad_norm": 1.9402783902988476, + "learning_rate": 3.326478189098116e-06, + "loss": 0.2928, + "step": 9865 + }, + { + "epoch": 0.62, + "grad_norm": 1.6762661068517062, + "learning_rate": 3.3255184897887116e-06, + "loss": 0.266, + "step": 9866 + }, + { + "epoch": 0.62, + "grad_norm": 2.020978401809176, + "learning_rate": 3.324558859956517e-06, + "loss": 0.2607, + "step": 9867 + }, + { + "epoch": 0.62, + "grad_norm": 1.4115015915816638, + "learning_rate": 3.323599299641353e-06, + "loss": 0.2877, + "step": 9868 + }, + { + "epoch": 0.62, + "grad_norm": 4.478799202552657, + "learning_rate": 3.3226398088830316e-06, + "loss": 0.268, + "step": 9869 + }, + { + "epoch": 0.62, + "grad_norm": 5.805467424554952, + "learning_rate": 3.321680387721364e-06, + "loss": 0.2557, + "step": 9870 + }, + { + "epoch": 0.62, + "grad_norm": 2.4345762879435267, + "learning_rate": 3.3207210361961583e-06, + "loss": 0.2633, + "step": 9871 + }, + { + "epoch": 0.62, + "grad_norm": 1.8915575667164246, + "learning_rate": 3.3197617543472203e-06, + "loss": 0.2677, + "step": 9872 + }, + { + "epoch": 0.62, + "grad_norm": 3.935309816684142, + "learning_rate": 3.3188025422143497e-06, + "loss": 0.2802, + "step": 9873 + }, + { + "epoch": 0.62, + "grad_norm": 4.896504435668146, + "learning_rate": 3.3178433998373495e-06, + "loss": 0.2717, + "step": 9874 + }, + { + "epoch": 0.62, + "grad_norm": 1.5406812601018596, + "learning_rate": 3.3168843272560127e-06, + "loss": 0.257, + "step": 9875 + }, + { + "epoch": 0.62, + "grad_norm": 2.32919562705976, + "learning_rate": 3.315925324510135e-06, + "loss": 0.2696, + "step": 9876 + }, + { + "epoch": 0.62, + "grad_norm": 2.8104804398343184, + "learning_rate": 3.3149663916395075e-06, + "loss": 0.2703, + "step": 9877 + }, + { + "epoch": 0.62, + "grad_norm": 2.282822067080632, + "learning_rate": 3.314007528683918e-06, + "loss": 0.2709, + "step": 9878 + }, + { + "epoch": 0.62, + "grad_norm": 1.894601814571518, + "learning_rate": 3.3130487356831496e-06, + "loss": 0.2748, + "step": 9879 + }, + { + "epoch": 0.62, + "grad_norm": 3.1047171441470285, + "learning_rate": 3.312090012676987e-06, + "loss": 0.2716, + "step": 9880 + }, + { + "epoch": 0.62, + "grad_norm": 4.249803689698617, + "learning_rate": 3.3111313597052065e-06, + "loss": 0.2742, + "step": 9881 + }, + { + "epoch": 0.62, + "grad_norm": 1.659785378661244, + "learning_rate": 3.3101727768075854e-06, + "loss": 0.2567, + "step": 9882 + }, + { + "epoch": 0.62, + "grad_norm": 2.3689414848001045, + "learning_rate": 3.3092142640238983e-06, + "loss": 0.2664, + "step": 9883 + }, + { + "epoch": 0.62, + "grad_norm": 1.8799493568687033, + "learning_rate": 3.3082558213939142e-06, + "loss": 0.2563, + "step": 9884 + }, + { + "epoch": 0.62, + "grad_norm": 2.0347806542432405, + "learning_rate": 3.3072974489574006e-06, + "loss": 0.2704, + "step": 9885 + }, + { + "epoch": 0.62, + "grad_norm": 2.54993934788399, + "learning_rate": 3.3063391467541227e-06, + "loss": 0.2857, + "step": 9886 + }, + { + "epoch": 0.62, + "grad_norm": 4.424712384660554, + "learning_rate": 3.3053809148238426e-06, + "loss": 0.2804, + "step": 9887 + }, + { + "epoch": 0.62, + "grad_norm": 2.3711495443193664, + "learning_rate": 3.3044227532063177e-06, + "loss": 0.2611, + "step": 9888 + }, + { + "epoch": 0.62, + "grad_norm": 7.119978494852936, + "learning_rate": 3.3034646619413046e-06, + "loss": 0.269, + "step": 9889 + }, + { + "epoch": 0.62, + "grad_norm": 2.85465927054288, + "learning_rate": 3.3025066410685546e-06, + "loss": 0.2636, + "step": 9890 + }, + { + "epoch": 0.62, + "grad_norm": 2.2333958252491697, + "learning_rate": 3.3015486906278203e-06, + "loss": 0.2562, + "step": 9891 + }, + { + "epoch": 0.62, + "grad_norm": 2.332761445734929, + "learning_rate": 3.300590810658848e-06, + "loss": 0.2783, + "step": 9892 + }, + { + "epoch": 0.62, + "grad_norm": 1.5840899591393893, + "learning_rate": 3.299633001201381e-06, + "loss": 0.2738, + "step": 9893 + }, + { + "epoch": 0.62, + "grad_norm": 6.311613716909245, + "learning_rate": 3.298675262295162e-06, + "loss": 0.2682, + "step": 9894 + }, + { + "epoch": 0.62, + "grad_norm": 2.4500980765916665, + "learning_rate": 3.2977175939799265e-06, + "loss": 0.2909, + "step": 9895 + }, + { + "epoch": 0.62, + "grad_norm": 1.5356380795765818, + "learning_rate": 3.2967599962954134e-06, + "loss": 0.2674, + "step": 9896 + }, + { + "epoch": 0.62, + "grad_norm": 2.255907166982026, + "learning_rate": 3.2958024692813515e-06, + "loss": 0.2817, + "step": 9897 + }, + { + "epoch": 0.62, + "grad_norm": 2.8803975011197585, + "learning_rate": 3.2948450129774726e-06, + "loss": 0.2743, + "step": 9898 + }, + { + "epoch": 0.62, + "grad_norm": 2.180125193183308, + "learning_rate": 3.2938876274235034e-06, + "loss": 0.2553, + "step": 9899 + }, + { + "epoch": 0.62, + "grad_norm": 2.068154661473635, + "learning_rate": 3.292930312659167e-06, + "loss": 0.2903, + "step": 9900 + }, + { + "epoch": 0.62, + "grad_norm": 2.4971141147919904, + "learning_rate": 3.291973068724184e-06, + "loss": 0.2838, + "step": 9901 + }, + { + "epoch": 0.62, + "grad_norm": 2.245842240400228, + "learning_rate": 3.2910158956582726e-06, + "loss": 0.2833, + "step": 9902 + }, + { + "epoch": 0.62, + "grad_norm": 1.7865857003738088, + "learning_rate": 3.290058793501147e-06, + "loss": 0.2667, + "step": 9903 + }, + { + "epoch": 0.62, + "grad_norm": 0.6155417737874286, + "learning_rate": 3.289101762292517e-06, + "loss": 0.4693, + "step": 9904 + }, + { + "epoch": 0.62, + "grad_norm": 2.5305638923653797, + "learning_rate": 3.288144802072097e-06, + "loss": 0.2593, + "step": 9905 + }, + { + "epoch": 0.62, + "grad_norm": 2.105242003805574, + "learning_rate": 3.287187912879588e-06, + "loss": 0.252, + "step": 9906 + }, + { + "epoch": 0.62, + "grad_norm": 2.4182474802379645, + "learning_rate": 3.286231094754695e-06, + "loss": 0.2677, + "step": 9907 + }, + { + "epoch": 0.62, + "grad_norm": 2.8304821396172453, + "learning_rate": 3.2852743477371185e-06, + "loss": 0.2703, + "step": 9908 + }, + { + "epoch": 0.62, + "grad_norm": 2.877794321716428, + "learning_rate": 3.284317671866555e-06, + "loss": 0.2678, + "step": 9909 + }, + { + "epoch": 0.62, + "grad_norm": 5.412188533589843, + "learning_rate": 3.2833610671826988e-06, + "loss": 0.2593, + "step": 9910 + }, + { + "epoch": 0.62, + "grad_norm": 2.953949652599423, + "learning_rate": 3.28240453372524e-06, + "loss": 0.2764, + "step": 9911 + }, + { + "epoch": 0.62, + "grad_norm": 7.846073759409722, + "learning_rate": 3.281448071533867e-06, + "loss": 0.2697, + "step": 9912 + }, + { + "epoch": 0.62, + "grad_norm": 1.9134779397970272, + "learning_rate": 3.2804916806482683e-06, + "loss": 0.2639, + "step": 9913 + }, + { + "epoch": 0.62, + "grad_norm": 4.747388588514957, + "learning_rate": 3.279535361108123e-06, + "loss": 0.2822, + "step": 9914 + }, + { + "epoch": 0.62, + "grad_norm": 3.358002438438269, + "learning_rate": 3.278579112953111e-06, + "loss": 0.2751, + "step": 9915 + }, + { + "epoch": 0.62, + "grad_norm": 2.095930422185279, + "learning_rate": 3.2776229362229106e-06, + "loss": 0.2527, + "step": 9916 + }, + { + "epoch": 0.62, + "grad_norm": 2.3417385104267594, + "learning_rate": 3.2766668309571926e-06, + "loss": 0.265, + "step": 9917 + }, + { + "epoch": 0.62, + "grad_norm": 2.0250674225646965, + "learning_rate": 3.2757107971956302e-06, + "loss": 0.2903, + "step": 9918 + }, + { + "epoch": 0.62, + "grad_norm": 0.6154484535404571, + "learning_rate": 3.2747548349778867e-06, + "loss": 0.5037, + "step": 9919 + }, + { + "epoch": 0.62, + "grad_norm": 1.5693137529839287, + "learning_rate": 3.2737989443436314e-06, + "loss": 0.2629, + "step": 9920 + }, + { + "epoch": 0.62, + "grad_norm": 3.7442982164101064, + "learning_rate": 3.2728431253325243e-06, + "loss": 0.2834, + "step": 9921 + }, + { + "epoch": 0.62, + "grad_norm": 2.7455489369445782, + "learning_rate": 3.271887377984223e-06, + "loss": 0.2588, + "step": 9922 + }, + { + "epoch": 0.62, + "grad_norm": 1.5076871106230614, + "learning_rate": 3.270931702338384e-06, + "loss": 0.2717, + "step": 9923 + }, + { + "epoch": 0.62, + "grad_norm": 2.382204334404688, + "learning_rate": 3.269976098434661e-06, + "loss": 0.2707, + "step": 9924 + }, + { + "epoch": 0.62, + "grad_norm": 2.0311926029643566, + "learning_rate": 3.2690205663127007e-06, + "loss": 0.2585, + "step": 9925 + }, + { + "epoch": 0.62, + "grad_norm": 1.8506969859923204, + "learning_rate": 3.268065106012153e-06, + "loss": 0.2643, + "step": 9926 + }, + { + "epoch": 0.62, + "grad_norm": 1.80756019697676, + "learning_rate": 3.2671097175726587e-06, + "loss": 0.2786, + "step": 9927 + }, + { + "epoch": 0.62, + "grad_norm": 1.8991757724178666, + "learning_rate": 3.266154401033861e-06, + "loss": 0.2704, + "step": 9928 + }, + { + "epoch": 0.62, + "grad_norm": 3.9288666279158564, + "learning_rate": 3.265199156435398e-06, + "loss": 0.2869, + "step": 9929 + }, + { + "epoch": 0.62, + "grad_norm": 32.79802203209757, + "learning_rate": 3.2642439838169023e-06, + "loss": 0.2579, + "step": 9930 + }, + { + "epoch": 0.62, + "grad_norm": 6.356750385280537, + "learning_rate": 3.2632888832180066e-06, + "loss": 0.2794, + "step": 9931 + }, + { + "epoch": 0.62, + "grad_norm": 2.4159543439827265, + "learning_rate": 3.2623338546783407e-06, + "loss": 0.2729, + "step": 9932 + }, + { + "epoch": 0.62, + "grad_norm": 2.611233782833403, + "learning_rate": 3.261378898237529e-06, + "loss": 0.2532, + "step": 9933 + }, + { + "epoch": 0.62, + "grad_norm": 2.83625966464755, + "learning_rate": 3.2604240139351937e-06, + "loss": 0.2778, + "step": 9934 + }, + { + "epoch": 0.62, + "grad_norm": 2.0249454942762197, + "learning_rate": 3.2594692018109577e-06, + "loss": 0.2567, + "step": 9935 + }, + { + "epoch": 0.62, + "grad_norm": 4.095720306438147, + "learning_rate": 3.258514461904435e-06, + "loss": 0.2827, + "step": 9936 + }, + { + "epoch": 0.62, + "grad_norm": 1.8356246857067402, + "learning_rate": 3.257559794255241e-06, + "loss": 0.2582, + "step": 9937 + }, + { + "epoch": 0.62, + "grad_norm": 15.879396767792844, + "learning_rate": 3.2566051989029866e-06, + "loss": 0.2895, + "step": 9938 + }, + { + "epoch": 0.63, + "grad_norm": 3.4436324697446716, + "learning_rate": 3.2556506758872784e-06, + "loss": 0.2581, + "step": 9939 + }, + { + "epoch": 0.63, + "grad_norm": 6.428921470844976, + "learning_rate": 3.2546962252477234e-06, + "loss": 0.2653, + "step": 9940 + }, + { + "epoch": 0.63, + "grad_norm": 3.023948149755126, + "learning_rate": 3.2537418470239208e-06, + "loss": 0.2679, + "step": 9941 + }, + { + "epoch": 0.63, + "grad_norm": 3.2480300983769723, + "learning_rate": 3.2527875412554694e-06, + "loss": 0.2454, + "step": 9942 + }, + { + "epoch": 0.63, + "grad_norm": 2.1378039251153815, + "learning_rate": 3.2518333079819684e-06, + "loss": 0.2695, + "step": 9943 + }, + { + "epoch": 0.63, + "grad_norm": 1.3900987952765609, + "learning_rate": 3.2508791472430073e-06, + "loss": 0.2566, + "step": 9944 + }, + { + "epoch": 0.63, + "grad_norm": 3.7525763418817006, + "learning_rate": 3.2499250590781773e-06, + "loss": 0.2575, + "step": 9945 + }, + { + "epoch": 0.63, + "grad_norm": 2.6967761651448225, + "learning_rate": 3.248971043527066e-06, + "loss": 0.2747, + "step": 9946 + }, + { + "epoch": 0.63, + "grad_norm": 2.40864027668934, + "learning_rate": 3.248017100629256e-06, + "loss": 0.2552, + "step": 9947 + }, + { + "epoch": 0.63, + "grad_norm": 7.492652892048151, + "learning_rate": 3.247063230424329e-06, + "loss": 0.2662, + "step": 9948 + }, + { + "epoch": 0.63, + "grad_norm": 38.11552204506903, + "learning_rate": 3.2461094329518593e-06, + "loss": 0.2578, + "step": 9949 + }, + { + "epoch": 0.63, + "grad_norm": 1.9645221821642918, + "learning_rate": 3.245155708251426e-06, + "loss": 0.2744, + "step": 9950 + }, + { + "epoch": 0.63, + "grad_norm": 2.1398337374814704, + "learning_rate": 3.244202056362601e-06, + "loss": 0.2595, + "step": 9951 + }, + { + "epoch": 0.63, + "grad_norm": 2.3540382346625854, + "learning_rate": 3.2432484773249495e-06, + "loss": 0.272, + "step": 9952 + }, + { + "epoch": 0.63, + "grad_norm": 2.011525360598053, + "learning_rate": 3.2422949711780395e-06, + "loss": 0.26, + "step": 9953 + }, + { + "epoch": 0.63, + "grad_norm": 3.7813564347347532, + "learning_rate": 3.2413415379614345e-06, + "loss": 0.2591, + "step": 9954 + }, + { + "epoch": 0.63, + "grad_norm": 7.539751554887144, + "learning_rate": 3.2403881777146905e-06, + "loss": 0.3004, + "step": 9955 + }, + { + "epoch": 0.63, + "grad_norm": 5.311087492806082, + "learning_rate": 3.2394348904773687e-06, + "loss": 0.2835, + "step": 9956 + }, + { + "epoch": 0.63, + "grad_norm": 2.4307233930329737, + "learning_rate": 3.2384816762890182e-06, + "loss": 0.2661, + "step": 9957 + }, + { + "epoch": 0.63, + "grad_norm": 5.040809683042068, + "learning_rate": 3.2375285351891918e-06, + "loss": 0.2787, + "step": 9958 + }, + { + "epoch": 0.63, + "grad_norm": 4.0616870068222015, + "learning_rate": 3.2365754672174386e-06, + "loss": 0.2668, + "step": 9959 + }, + { + "epoch": 0.63, + "grad_norm": 7.763535396368723, + "learning_rate": 3.235622472413301e-06, + "loss": 0.2627, + "step": 9960 + }, + { + "epoch": 0.63, + "grad_norm": 1.6023289537315193, + "learning_rate": 3.234669550816321e-06, + "loss": 0.2675, + "step": 9961 + }, + { + "epoch": 0.63, + "grad_norm": 13.374333265303514, + "learning_rate": 3.2337167024660366e-06, + "loss": 0.2709, + "step": 9962 + }, + { + "epoch": 0.63, + "grad_norm": 1.3761195103993602, + "learning_rate": 3.2327639274019835e-06, + "loss": 0.254, + "step": 9963 + }, + { + "epoch": 0.63, + "grad_norm": 2.732504353626915, + "learning_rate": 3.2318112256636923e-06, + "loss": 0.2654, + "step": 9964 + }, + { + "epoch": 0.63, + "grad_norm": 1.6397055592623355, + "learning_rate": 3.230858597290697e-06, + "loss": 0.2584, + "step": 9965 + }, + { + "epoch": 0.63, + "grad_norm": 2.623168773235153, + "learning_rate": 3.229906042322519e-06, + "loss": 0.269, + "step": 9966 + }, + { + "epoch": 0.63, + "grad_norm": 6.852660925017179, + "learning_rate": 3.2289535607986843e-06, + "loss": 0.2669, + "step": 9967 + }, + { + "epoch": 0.63, + "grad_norm": 2.3472769934172164, + "learning_rate": 3.2280011527587118e-06, + "loss": 0.2578, + "step": 9968 + }, + { + "epoch": 0.63, + "grad_norm": 2.6888069980344453, + "learning_rate": 3.227048818242119e-06, + "loss": 0.2539, + "step": 9969 + }, + { + "epoch": 0.63, + "grad_norm": 1.9437871595464753, + "learning_rate": 3.2260965572884202e-06, + "loss": 0.2621, + "step": 9970 + }, + { + "epoch": 0.63, + "grad_norm": 4.803572970593614, + "learning_rate": 3.2251443699371252e-06, + "loss": 0.2575, + "step": 9971 + }, + { + "epoch": 0.63, + "grad_norm": 2.299282413598395, + "learning_rate": 3.2241922562277416e-06, + "loss": 0.3004, + "step": 9972 + }, + { + "epoch": 0.63, + "grad_norm": 1.9689419717334427, + "learning_rate": 3.223240216199778e-06, + "loss": 0.2797, + "step": 9973 + }, + { + "epoch": 0.63, + "grad_norm": 3.7677124015119414, + "learning_rate": 3.222288249892732e-06, + "loss": 0.2732, + "step": 9974 + }, + { + "epoch": 0.63, + "grad_norm": 5.337022225180417, + "learning_rate": 3.221336357346105e-06, + "loss": 0.2664, + "step": 9975 + }, + { + "epoch": 0.63, + "grad_norm": 2.3705375609607087, + "learning_rate": 3.220384538599392e-06, + "loss": 0.2635, + "step": 9976 + }, + { + "epoch": 0.63, + "grad_norm": 7.426477112735168, + "learning_rate": 3.2194327936920842e-06, + "loss": 0.2733, + "step": 9977 + }, + { + "epoch": 0.63, + "grad_norm": 2.2674385705041455, + "learning_rate": 3.2184811226636746e-06, + "loss": 0.2802, + "step": 9978 + }, + { + "epoch": 0.63, + "grad_norm": 1.6950749532968423, + "learning_rate": 3.217529525553645e-06, + "loss": 0.2738, + "step": 9979 + }, + { + "epoch": 0.63, + "grad_norm": 2.2637541576120266, + "learning_rate": 3.2165780024014825e-06, + "loss": 0.269, + "step": 9980 + }, + { + "epoch": 0.63, + "grad_norm": 2.958006121460725, + "learning_rate": 3.215626553246667e-06, + "loss": 0.2787, + "step": 9981 + }, + { + "epoch": 0.63, + "grad_norm": 2.8480662328736113, + "learning_rate": 3.2146751781286743e-06, + "loss": 0.2801, + "step": 9982 + }, + { + "epoch": 0.63, + "grad_norm": 2.2434885960103785, + "learning_rate": 3.213723877086979e-06, + "loss": 0.2725, + "step": 9983 + }, + { + "epoch": 0.63, + "grad_norm": 1.5809920577684438, + "learning_rate": 3.2127726501610558e-06, + "loss": 0.2435, + "step": 9984 + }, + { + "epoch": 0.63, + "grad_norm": 2.75565235219549, + "learning_rate": 3.2118214973903673e-06, + "loss": 0.2713, + "step": 9985 + }, + { + "epoch": 0.63, + "grad_norm": 3.3000391861556224, + "learning_rate": 3.2108704188143803e-06, + "loss": 0.2737, + "step": 9986 + }, + { + "epoch": 0.63, + "grad_norm": 1.742276002723779, + "learning_rate": 3.209919414472559e-06, + "loss": 0.2541, + "step": 9987 + }, + { + "epoch": 0.63, + "grad_norm": 6.87638450087813, + "learning_rate": 3.20896848440436e-06, + "loss": 0.2627, + "step": 9988 + }, + { + "epoch": 0.63, + "grad_norm": 24.26284996646524, + "learning_rate": 3.2080176286492415e-06, + "loss": 0.2722, + "step": 9989 + }, + { + "epoch": 0.63, + "grad_norm": 1.5086493109460943, + "learning_rate": 3.2070668472466525e-06, + "loss": 0.2743, + "step": 9990 + }, + { + "epoch": 0.63, + "grad_norm": 2.9097411949715766, + "learning_rate": 3.2061161402360454e-06, + "loss": 0.2787, + "step": 9991 + }, + { + "epoch": 0.63, + "grad_norm": 1.9428254453100957, + "learning_rate": 3.2051655076568666e-06, + "loss": 0.2745, + "step": 9992 + }, + { + "epoch": 0.63, + "grad_norm": 2.1766278325768242, + "learning_rate": 3.204214949548558e-06, + "loss": 0.2748, + "step": 9993 + }, + { + "epoch": 0.63, + "grad_norm": 1.5148475629807059, + "learning_rate": 3.2032644659505595e-06, + "loss": 0.2597, + "step": 9994 + }, + { + "epoch": 0.63, + "grad_norm": 1.8351187003270963, + "learning_rate": 3.2023140569023124e-06, + "loss": 0.2713, + "step": 9995 + }, + { + "epoch": 0.63, + "grad_norm": 2.3119515675998468, + "learning_rate": 3.2013637224432463e-06, + "loss": 0.2553, + "step": 9996 + }, + { + "epoch": 0.63, + "grad_norm": 4.5777577907954905, + "learning_rate": 3.200413462612796e-06, + "loss": 0.2628, + "step": 9997 + }, + { + "epoch": 0.63, + "grad_norm": 3.4677048711679106, + "learning_rate": 3.1994632774503853e-06, + "loss": 0.2832, + "step": 9998 + }, + { + "epoch": 0.63, + "grad_norm": 2.5972451498659193, + "learning_rate": 3.198513166995442e-06, + "loss": 0.2736, + "step": 9999 + }, + { + "epoch": 0.63, + "grad_norm": 1.927327376868954, + "learning_rate": 3.197563131287389e-06, + "loss": 0.2639, + "step": 10000 + }, + { + "epoch": 0.63, + "grad_norm": 2.18965047883231, + "learning_rate": 3.19661317036564e-06, + "loss": 0.266, + "step": 10001 + }, + { + "epoch": 0.63, + "grad_norm": 2.341591965281301, + "learning_rate": 3.195663284269615e-06, + "loss": 0.2678, + "step": 10002 + }, + { + "epoch": 0.63, + "grad_norm": 2.1173636223746146, + "learning_rate": 3.1947134730387265e-06, + "loss": 0.2764, + "step": 10003 + }, + { + "epoch": 0.63, + "grad_norm": 1.9686828247975843, + "learning_rate": 3.1937637367123814e-06, + "loss": 0.2741, + "step": 10004 + }, + { + "epoch": 0.63, + "grad_norm": 4.627160398881354, + "learning_rate": 3.192814075329988e-06, + "loss": 0.2683, + "step": 10005 + }, + { + "epoch": 0.63, + "grad_norm": 2.0770602123616824, + "learning_rate": 3.191864488930948e-06, + "loss": 0.2644, + "step": 10006 + }, + { + "epoch": 0.63, + "grad_norm": 3.2461568601882127, + "learning_rate": 3.190914977554661e-06, + "loss": 0.2699, + "step": 10007 + }, + { + "epoch": 0.63, + "grad_norm": 2.6242753466890067, + "learning_rate": 3.1899655412405266e-06, + "loss": 0.2572, + "step": 10008 + }, + { + "epoch": 0.63, + "grad_norm": 2.364714501622136, + "learning_rate": 3.1890161800279353e-06, + "loss": 0.2779, + "step": 10009 + }, + { + "epoch": 0.63, + "grad_norm": 1.3796187874313297, + "learning_rate": 3.188066893956279e-06, + "loss": 0.2623, + "step": 10010 + }, + { + "epoch": 0.63, + "grad_norm": 1.772680539942646, + "learning_rate": 3.1871176830649473e-06, + "loss": 0.2629, + "step": 10011 + }, + { + "epoch": 0.63, + "grad_norm": 1.728648012424328, + "learning_rate": 3.1861685473933223e-06, + "loss": 0.263, + "step": 10012 + }, + { + "epoch": 0.63, + "grad_norm": 2.5028407688194734, + "learning_rate": 3.1852194869807873e-06, + "loss": 0.2677, + "step": 10013 + }, + { + "epoch": 0.63, + "grad_norm": 1.4799106491738157, + "learning_rate": 3.1842705018667173e-06, + "loss": 0.2721, + "step": 10014 + }, + { + "epoch": 0.63, + "grad_norm": 1.7620365634749426, + "learning_rate": 3.18332159209049e-06, + "loss": 0.273, + "step": 10015 + }, + { + "epoch": 0.63, + "grad_norm": 1.433121238508442, + "learning_rate": 3.1823727576914753e-06, + "loss": 0.2537, + "step": 10016 + }, + { + "epoch": 0.63, + "grad_norm": 1.7569294426756594, + "learning_rate": 3.181423998709045e-06, + "loss": 0.2812, + "step": 10017 + }, + { + "epoch": 0.63, + "grad_norm": 7.7976168662829215, + "learning_rate": 3.180475315182563e-06, + "loss": 0.2777, + "step": 10018 + }, + { + "epoch": 0.63, + "grad_norm": 2.626302240605091, + "learning_rate": 3.1795267071513925e-06, + "loss": 0.2528, + "step": 10019 + }, + { + "epoch": 0.63, + "grad_norm": 2.014450872227873, + "learning_rate": 3.178578174654891e-06, + "loss": 0.2717, + "step": 10020 + }, + { + "epoch": 0.63, + "grad_norm": 1.9183255383889546, + "learning_rate": 3.1776297177324167e-06, + "loss": 0.2754, + "step": 10021 + }, + { + "epoch": 0.63, + "grad_norm": 1.8944432411742664, + "learning_rate": 3.1766813364233234e-06, + "loss": 0.2729, + "step": 10022 + }, + { + "epoch": 0.63, + "grad_norm": 2.5160483787146775, + "learning_rate": 3.175733030766959e-06, + "loss": 0.2736, + "step": 10023 + }, + { + "epoch": 0.63, + "grad_norm": 1.8786096699155248, + "learning_rate": 3.1747848008026704e-06, + "loss": 0.2707, + "step": 10024 + }, + { + "epoch": 0.63, + "grad_norm": 2.078054043201778, + "learning_rate": 3.1738366465698043e-06, + "loss": 0.2744, + "step": 10025 + }, + { + "epoch": 0.63, + "grad_norm": 4.514279063968887, + "learning_rate": 3.1728885681076983e-06, + "loss": 0.2889, + "step": 10026 + }, + { + "epoch": 0.63, + "grad_norm": 3.123097377846155, + "learning_rate": 3.1719405654556924e-06, + "loss": 0.2667, + "step": 10027 + }, + { + "epoch": 0.63, + "grad_norm": 1.8728657758142397, + "learning_rate": 3.1709926386531174e-06, + "loss": 0.2802, + "step": 10028 + }, + { + "epoch": 0.63, + "grad_norm": 2.9623826940788787, + "learning_rate": 3.170044787739307e-06, + "loss": 0.2829, + "step": 10029 + }, + { + "epoch": 0.63, + "grad_norm": 1.9078598998142404, + "learning_rate": 3.1690970127535904e-06, + "loss": 0.2637, + "step": 10030 + }, + { + "epoch": 0.63, + "grad_norm": 1.6845025177566337, + "learning_rate": 3.168149313735288e-06, + "loss": 0.2784, + "step": 10031 + }, + { + "epoch": 0.63, + "grad_norm": 3.983978912469738, + "learning_rate": 3.1672016907237256e-06, + "loss": 0.2751, + "step": 10032 + }, + { + "epoch": 0.63, + "grad_norm": 2.2511549794876786, + "learning_rate": 3.166254143758222e-06, + "loss": 0.2652, + "step": 10033 + }, + { + "epoch": 0.63, + "grad_norm": 2.1075240554115413, + "learning_rate": 3.165306672878089e-06, + "loss": 0.2564, + "step": 10034 + }, + { + "epoch": 0.63, + "grad_norm": 1.7632322212063483, + "learning_rate": 3.1643592781226428e-06, + "loss": 0.2728, + "step": 10035 + }, + { + "epoch": 0.63, + "grad_norm": 7.46186032204869, + "learning_rate": 3.163411959531189e-06, + "loss": 0.2616, + "step": 10036 + }, + { + "epoch": 0.63, + "grad_norm": 7.881101303832084, + "learning_rate": 3.1624647171430355e-06, + "loss": 0.2651, + "step": 10037 + }, + { + "epoch": 0.63, + "grad_norm": 3.2289831230123385, + "learning_rate": 3.1615175509974855e-06, + "loss": 0.2933, + "step": 10038 + }, + { + "epoch": 0.63, + "grad_norm": 1.8019027339394116, + "learning_rate": 3.160570461133836e-06, + "loss": 0.29, + "step": 10039 + }, + { + "epoch": 0.63, + "grad_norm": 5.696416257606188, + "learning_rate": 3.1596234475913855e-06, + "loss": 0.2814, + "step": 10040 + }, + { + "epoch": 0.63, + "grad_norm": 0.6196570926025736, + "learning_rate": 3.1586765104094295e-06, + "loss": 0.4681, + "step": 10041 + }, + { + "epoch": 0.63, + "grad_norm": 1.579448029063411, + "learning_rate": 3.1577296496272535e-06, + "loss": 0.2805, + "step": 10042 + }, + { + "epoch": 0.63, + "grad_norm": 1.3746432894832377, + "learning_rate": 3.156782865284148e-06, + "loss": 0.2717, + "step": 10043 + }, + { + "epoch": 0.63, + "grad_norm": 1.8715667524654522, + "learning_rate": 3.155836157419394e-06, + "loss": 0.2784, + "step": 10044 + }, + { + "epoch": 0.63, + "grad_norm": 3.518868977270992, + "learning_rate": 3.1548895260722743e-06, + "loss": 0.2655, + "step": 10045 + }, + { + "epoch": 0.63, + "grad_norm": 2.2942224365251014, + "learning_rate": 3.1539429712820634e-06, + "loss": 0.2673, + "step": 10046 + }, + { + "epoch": 0.63, + "grad_norm": 1.6836469079689118, + "learning_rate": 3.15299649308804e-06, + "loss": 0.2619, + "step": 10047 + }, + { + "epoch": 0.63, + "grad_norm": 2.3316655067254635, + "learning_rate": 3.152050091529472e-06, + "loss": 0.2682, + "step": 10048 + }, + { + "epoch": 0.63, + "grad_norm": 6.145213553660683, + "learning_rate": 3.151103766645629e-06, + "loss": 0.2715, + "step": 10049 + }, + { + "epoch": 0.63, + "grad_norm": 2.251853760793825, + "learning_rate": 3.150157518475774e-06, + "loss": 0.2514, + "step": 10050 + }, + { + "epoch": 0.63, + "grad_norm": 2.5113708883773502, + "learning_rate": 3.149211347059169e-06, + "loss": 0.282, + "step": 10051 + }, + { + "epoch": 0.63, + "grad_norm": 1.7868447197110866, + "learning_rate": 3.1482652524350727e-06, + "loss": 0.2689, + "step": 10052 + }, + { + "epoch": 0.63, + "grad_norm": 4.510469220806779, + "learning_rate": 3.1473192346427396e-06, + "loss": 0.2811, + "step": 10053 + }, + { + "epoch": 0.63, + "grad_norm": 2.4004931673113967, + "learning_rate": 3.146373293721422e-06, + "loss": 0.2644, + "step": 10054 + }, + { + "epoch": 0.63, + "grad_norm": 1.8329117459328055, + "learning_rate": 3.14542742971037e-06, + "loss": 0.2899, + "step": 10055 + }, + { + "epoch": 0.63, + "grad_norm": 2.7556121061371597, + "learning_rate": 3.1444816426488275e-06, + "loss": 0.2678, + "step": 10056 + }, + { + "epoch": 0.63, + "grad_norm": 1.8984575322681536, + "learning_rate": 3.143535932576038e-06, + "loss": 0.2619, + "step": 10057 + }, + { + "epoch": 0.63, + "grad_norm": 2.4185456732644264, + "learning_rate": 3.1425902995312394e-06, + "loss": 0.2663, + "step": 10058 + }, + { + "epoch": 0.63, + "grad_norm": 2.4057435765175152, + "learning_rate": 3.141644743553668e-06, + "loss": 0.2918, + "step": 10059 + }, + { + "epoch": 0.63, + "grad_norm": 1.9093835713711051, + "learning_rate": 3.1406992646825583e-06, + "loss": 0.2776, + "step": 10060 + }, + { + "epoch": 0.63, + "grad_norm": 1.8243505802516449, + "learning_rate": 3.1397538629571368e-06, + "loss": 0.2678, + "step": 10061 + }, + { + "epoch": 0.63, + "grad_norm": 2.95825607455039, + "learning_rate": 3.1388085384166324e-06, + "loss": 0.2818, + "step": 10062 + }, + { + "epoch": 0.63, + "grad_norm": 3.369785162060367, + "learning_rate": 3.1378632911002688e-06, + "loss": 0.2591, + "step": 10063 + }, + { + "epoch": 0.63, + "grad_norm": 1.8408484348366791, + "learning_rate": 3.136918121047264e-06, + "loss": 0.2596, + "step": 10064 + }, + { + "epoch": 0.63, + "grad_norm": 3.530616936246068, + "learning_rate": 3.135973028296836e-06, + "loss": 0.2759, + "step": 10065 + }, + { + "epoch": 0.63, + "grad_norm": 1.8773389145388941, + "learning_rate": 3.135028012888197e-06, + "loss": 0.2538, + "step": 10066 + }, + { + "epoch": 0.63, + "grad_norm": 3.29984652337607, + "learning_rate": 3.134083074860559e-06, + "loss": 0.2716, + "step": 10067 + }, + { + "epoch": 0.63, + "grad_norm": 3.1437430964536395, + "learning_rate": 3.1331382142531265e-06, + "loss": 0.2825, + "step": 10068 + }, + { + "epoch": 0.63, + "grad_norm": 2.2273024299155457, + "learning_rate": 3.132193431105108e-06, + "loss": 0.274, + "step": 10069 + }, + { + "epoch": 0.63, + "grad_norm": 1.4882246604550748, + "learning_rate": 3.1312487254557006e-06, + "loss": 0.2665, + "step": 10070 + }, + { + "epoch": 0.63, + "grad_norm": 1.3844836479646911, + "learning_rate": 3.1303040973441036e-06, + "loss": 0.2646, + "step": 10071 + }, + { + "epoch": 0.63, + "grad_norm": 1.9536467788912197, + "learning_rate": 3.1293595468095094e-06, + "loss": 0.2821, + "step": 10072 + }, + { + "epoch": 0.63, + "grad_norm": 1.6201119669717978, + "learning_rate": 3.1284150738911125e-06, + "loss": 0.2499, + "step": 10073 + }, + { + "epoch": 0.63, + "grad_norm": 2.5418702408945038, + "learning_rate": 3.127470678628096e-06, + "loss": 0.2761, + "step": 10074 + }, + { + "epoch": 0.63, + "grad_norm": 3.3283382132612167, + "learning_rate": 3.1265263610596475e-06, + "loss": 0.2641, + "step": 10075 + }, + { + "epoch": 0.63, + "grad_norm": 1.4433540304588321, + "learning_rate": 3.125582121224947e-06, + "loss": 0.263, + "step": 10076 + }, + { + "epoch": 0.63, + "grad_norm": 20.278285307033244, + "learning_rate": 3.124637959163176e-06, + "loss": 0.2679, + "step": 10077 + }, + { + "epoch": 0.63, + "grad_norm": 4.388136569112997, + "learning_rate": 3.123693874913506e-06, + "loss": 0.2773, + "step": 10078 + }, + { + "epoch": 0.63, + "grad_norm": 1.8994297306023664, + "learning_rate": 3.12274986851511e-06, + "loss": 0.2762, + "step": 10079 + }, + { + "epoch": 0.63, + "grad_norm": 2.3895787241115194, + "learning_rate": 3.1218059400071564e-06, + "loss": 0.2609, + "step": 10080 + }, + { + "epoch": 0.63, + "grad_norm": 2.6774317358193516, + "learning_rate": 3.1208620894288105e-06, + "loss": 0.2852, + "step": 10081 + }, + { + "epoch": 0.63, + "grad_norm": 1.7341905061504503, + "learning_rate": 3.1199183168192337e-06, + "loss": 0.2881, + "step": 10082 + }, + { + "epoch": 0.63, + "grad_norm": 2.3286729637431853, + "learning_rate": 3.1189746222175843e-06, + "loss": 0.2579, + "step": 10083 + }, + { + "epoch": 0.63, + "grad_norm": 2.110021476189369, + "learning_rate": 3.11803100566302e-06, + "loss": 0.2665, + "step": 10084 + }, + { + "epoch": 0.63, + "grad_norm": 2.706249935834069, + "learning_rate": 3.117087467194693e-06, + "loss": 0.2639, + "step": 10085 + }, + { + "epoch": 0.63, + "grad_norm": 1.9493757887438823, + "learning_rate": 3.11614400685175e-06, + "loss": 0.2732, + "step": 10086 + }, + { + "epoch": 0.63, + "grad_norm": 2.2076273682628904, + "learning_rate": 3.1152006246733395e-06, + "loss": 0.2708, + "step": 10087 + }, + { + "epoch": 0.63, + "grad_norm": 2.313402505219228, + "learning_rate": 3.114257320698602e-06, + "loss": 0.2534, + "step": 10088 + }, + { + "epoch": 0.63, + "grad_norm": 3.113284925556673, + "learning_rate": 3.1133140949666785e-06, + "loss": 0.2744, + "step": 10089 + }, + { + "epoch": 0.63, + "grad_norm": 2.1397030668441617, + "learning_rate": 3.1123709475167043e-06, + "loss": 0.2732, + "step": 10090 + }, + { + "epoch": 0.63, + "grad_norm": 1.6414653821745988, + "learning_rate": 3.11142787838781e-06, + "loss": 0.264, + "step": 10091 + }, + { + "epoch": 0.63, + "grad_norm": 4.693477360207317, + "learning_rate": 3.110484887619129e-06, + "loss": 0.2702, + "step": 10092 + }, + { + "epoch": 0.63, + "grad_norm": 1.6470543237118083, + "learning_rate": 3.109541975249787e-06, + "loss": 0.2557, + "step": 10093 + }, + { + "epoch": 0.63, + "grad_norm": 2.5648910258663804, + "learning_rate": 3.1085991413189053e-06, + "loss": 0.2962, + "step": 10094 + }, + { + "epoch": 0.63, + "grad_norm": 2.0426294366035807, + "learning_rate": 3.1076563858656062e-06, + "loss": 0.2506, + "step": 10095 + }, + { + "epoch": 0.63, + "grad_norm": 2.4564743363958335, + "learning_rate": 3.1067137089290033e-06, + "loss": 0.2732, + "step": 10096 + }, + { + "epoch": 0.63, + "grad_norm": 2.19124921830546, + "learning_rate": 3.105771110548212e-06, + "loss": 0.2663, + "step": 10097 + }, + { + "epoch": 0.64, + "grad_norm": 13.372263704027683, + "learning_rate": 3.10482859076234e-06, + "loss": 0.278, + "step": 10098 + }, + { + "epoch": 0.64, + "grad_norm": 3.0814643219929176, + "learning_rate": 3.1038861496104987e-06, + "loss": 0.2662, + "step": 10099 + }, + { + "epoch": 0.64, + "grad_norm": 2.246160079019944, + "learning_rate": 3.102943787131788e-06, + "loss": 0.2625, + "step": 10100 + }, + { + "epoch": 0.64, + "grad_norm": 3.450483413831347, + "learning_rate": 3.1020015033653094e-06, + "loss": 0.2438, + "step": 10101 + }, + { + "epoch": 0.64, + "grad_norm": 3.261575768600086, + "learning_rate": 3.10105929835016e-06, + "loss": 0.2801, + "step": 10102 + }, + { + "epoch": 0.64, + "grad_norm": 1.8687858229308059, + "learning_rate": 3.100117172125433e-06, + "loss": 0.2806, + "step": 10103 + }, + { + "epoch": 0.64, + "grad_norm": 1.880372771729046, + "learning_rate": 3.099175124730218e-06, + "loss": 0.2568, + "step": 10104 + }, + { + "epoch": 0.64, + "grad_norm": 18.921810094277788, + "learning_rate": 3.0982331562036037e-06, + "loss": 0.3059, + "step": 10105 + }, + { + "epoch": 0.64, + "grad_norm": 17.71574008247158, + "learning_rate": 3.097291266584673e-06, + "loss": 0.2753, + "step": 10106 + }, + { + "epoch": 0.64, + "grad_norm": 2.5085525891596148, + "learning_rate": 3.0963494559125084e-06, + "loss": 0.2687, + "step": 10107 + }, + { + "epoch": 0.64, + "grad_norm": 3.174098534749825, + "learning_rate": 3.095407724226186e-06, + "loss": 0.2516, + "step": 10108 + }, + { + "epoch": 0.64, + "grad_norm": 1.9535102854308344, + "learning_rate": 3.0944660715647813e-06, + "loss": 0.2641, + "step": 10109 + }, + { + "epoch": 0.64, + "grad_norm": 4.172982818492106, + "learning_rate": 3.093524497967362e-06, + "loss": 0.3047, + "step": 10110 + }, + { + "epoch": 0.64, + "grad_norm": 2.7053760550966017, + "learning_rate": 3.0925830034729986e-06, + "loss": 0.2637, + "step": 10111 + }, + { + "epoch": 0.64, + "grad_norm": 2.1150917499177964, + "learning_rate": 3.091641588120754e-06, + "loss": 0.2561, + "step": 10112 + }, + { + "epoch": 0.64, + "grad_norm": 2.8556553949819037, + "learning_rate": 3.0907002519496875e-06, + "loss": 0.2812, + "step": 10113 + }, + { + "epoch": 0.64, + "grad_norm": 2.596939683625185, + "learning_rate": 3.089758994998861e-06, + "loss": 0.2707, + "step": 10114 + }, + { + "epoch": 0.64, + "grad_norm": 2.900356263255975, + "learning_rate": 3.088817817307327e-06, + "loss": 0.2668, + "step": 10115 + }, + { + "epoch": 0.64, + "grad_norm": 1.7163415576738235, + "learning_rate": 3.0878767189141355e-06, + "loss": 0.2665, + "step": 10116 + }, + { + "epoch": 0.64, + "grad_norm": 2.344543836072783, + "learning_rate": 3.0869356998583366e-06, + "loss": 0.2718, + "step": 10117 + }, + { + "epoch": 0.64, + "grad_norm": 2.340726681295385, + "learning_rate": 3.0859947601789724e-06, + "loss": 0.2615, + "step": 10118 + }, + { + "epoch": 0.64, + "grad_norm": 2.515503426889501, + "learning_rate": 3.0850538999150866e-06, + "loss": 0.2706, + "step": 10119 + }, + { + "epoch": 0.64, + "grad_norm": 1.876976324052386, + "learning_rate": 3.084113119105714e-06, + "loss": 0.2899, + "step": 10120 + }, + { + "epoch": 0.64, + "grad_norm": 2.083568949066206, + "learning_rate": 3.08317241778989e-06, + "loss": 0.2715, + "step": 10121 + }, + { + "epoch": 0.64, + "grad_norm": 2.544694145277682, + "learning_rate": 3.0822317960066493e-06, + "loss": 0.2864, + "step": 10122 + }, + { + "epoch": 0.64, + "grad_norm": 1.934966786984859, + "learning_rate": 3.081291253795018e-06, + "loss": 0.2742, + "step": 10123 + }, + { + "epoch": 0.64, + "grad_norm": 2.1233943000294593, + "learning_rate": 3.0803507911940193e-06, + "loss": 0.3055, + "step": 10124 + }, + { + "epoch": 0.64, + "grad_norm": 1.6936264444753946, + "learning_rate": 3.0794104082426772e-06, + "loss": 0.2738, + "step": 10125 + }, + { + "epoch": 0.64, + "grad_norm": 3.3895582488985383, + "learning_rate": 3.078470104980008e-06, + "loss": 0.2519, + "step": 10126 + }, + { + "epoch": 0.64, + "grad_norm": 1.712330595434032, + "learning_rate": 3.0775298814450273e-06, + "loss": 0.2664, + "step": 10127 + }, + { + "epoch": 0.64, + "grad_norm": 7.127102241181531, + "learning_rate": 3.076589737676744e-06, + "loss": 0.2912, + "step": 10128 + }, + { + "epoch": 0.64, + "grad_norm": 2.7741587075314817, + "learning_rate": 3.0756496737141715e-06, + "loss": 0.2667, + "step": 10129 + }, + { + "epoch": 0.64, + "grad_norm": 4.889646570103967, + "learning_rate": 3.074709689596312e-06, + "loss": 0.2549, + "step": 10130 + }, + { + "epoch": 0.64, + "grad_norm": 1.6729520432348077, + "learning_rate": 3.0737697853621672e-06, + "loss": 0.2533, + "step": 10131 + }, + { + "epoch": 0.64, + "grad_norm": 1.5657536258905018, + "learning_rate": 3.072829961050735e-06, + "loss": 0.292, + "step": 10132 + }, + { + "epoch": 0.64, + "grad_norm": 2.539708974291937, + "learning_rate": 3.071890216701012e-06, + "loss": 0.263, + "step": 10133 + }, + { + "epoch": 0.64, + "grad_norm": 1.88300859835801, + "learning_rate": 3.0709505523519868e-06, + "loss": 0.287, + "step": 10134 + }, + { + "epoch": 0.64, + "grad_norm": 2.5642082145514746, + "learning_rate": 3.07001096804265e-06, + "loss": 0.2758, + "step": 10135 + }, + { + "epoch": 0.64, + "grad_norm": 2.100345082396697, + "learning_rate": 3.0690714638119843e-06, + "loss": 0.2756, + "step": 10136 + }, + { + "epoch": 0.64, + "grad_norm": 3.8571735285393314, + "learning_rate": 3.068132039698977e-06, + "loss": 0.2894, + "step": 10137 + }, + { + "epoch": 0.64, + "grad_norm": 1.8375961100095908, + "learning_rate": 3.0671926957426e-06, + "loss": 0.2916, + "step": 10138 + }, + { + "epoch": 0.64, + "grad_norm": 2.91512447998109, + "learning_rate": 3.066253431981834e-06, + "loss": 0.2778, + "step": 10139 + }, + { + "epoch": 0.64, + "grad_norm": 1.965821722495347, + "learning_rate": 3.0653142484556454e-06, + "loss": 0.2604, + "step": 10140 + }, + { + "epoch": 0.64, + "grad_norm": 2.48749806441734, + "learning_rate": 3.0643751452030065e-06, + "loss": 0.2872, + "step": 10141 + }, + { + "epoch": 0.64, + "grad_norm": 1.7852139975220183, + "learning_rate": 3.0634361222628794e-06, + "loss": 0.2716, + "step": 10142 + }, + { + "epoch": 0.64, + "grad_norm": 16.413355486247813, + "learning_rate": 3.0624971796742263e-06, + "loss": 0.2601, + "step": 10143 + }, + { + "epoch": 0.64, + "grad_norm": 1.7839414408876089, + "learning_rate": 3.0615583174760083e-06, + "loss": 0.2815, + "step": 10144 + }, + { + "epoch": 0.64, + "grad_norm": 1.9117422090997844, + "learning_rate": 3.0606195357071795e-06, + "loss": 0.2891, + "step": 10145 + }, + { + "epoch": 0.64, + "grad_norm": 2.6819593658141683, + "learning_rate": 3.0596808344066896e-06, + "loss": 0.265, + "step": 10146 + }, + { + "epoch": 0.64, + "grad_norm": 3.459310506845838, + "learning_rate": 3.0587422136134896e-06, + "loss": 0.269, + "step": 10147 + }, + { + "epoch": 0.64, + "grad_norm": 1.7168692223380229, + "learning_rate": 3.0578036733665224e-06, + "loss": 0.2603, + "step": 10148 + }, + { + "epoch": 0.64, + "grad_norm": 29.877550580355468, + "learning_rate": 3.0568652137047312e-06, + "loss": 0.2778, + "step": 10149 + }, + { + "epoch": 0.64, + "grad_norm": 9.178501763688367, + "learning_rate": 3.0559268346670507e-06, + "loss": 0.2973, + "step": 10150 + }, + { + "epoch": 0.64, + "grad_norm": 1.7895490436953738, + "learning_rate": 3.0549885362924215e-06, + "loss": 0.2616, + "step": 10151 + }, + { + "epoch": 0.64, + "grad_norm": 5.7808985512461115, + "learning_rate": 3.0540503186197724e-06, + "loss": 0.2733, + "step": 10152 + }, + { + "epoch": 0.64, + "grad_norm": 2.800684782977907, + "learning_rate": 3.0531121816880327e-06, + "loss": 0.3003, + "step": 10153 + }, + { + "epoch": 0.64, + "grad_norm": 0.6093911880438868, + "learning_rate": 3.052174125536126e-06, + "loss": 0.492, + "step": 10154 + }, + { + "epoch": 0.64, + "grad_norm": 5.932611203987603, + "learning_rate": 3.0512361502029747e-06, + "loss": 0.2607, + "step": 10155 + }, + { + "epoch": 0.64, + "grad_norm": 2.5259995957077686, + "learning_rate": 3.0502982557274962e-06, + "loss": 0.2785, + "step": 10156 + }, + { + "epoch": 0.64, + "grad_norm": 5.030242197573419, + "learning_rate": 3.049360442148608e-06, + "loss": 0.2988, + "step": 10157 + }, + { + "epoch": 0.64, + "grad_norm": 0.6263975983204364, + "learning_rate": 3.0484227095052154e-06, + "loss": 0.5001, + "step": 10158 + }, + { + "epoch": 0.64, + "grad_norm": 1.7789997055425444, + "learning_rate": 3.047485057836235e-06, + "loss": 0.2448, + "step": 10159 + }, + { + "epoch": 0.64, + "grad_norm": 2.440988523108026, + "learning_rate": 3.046547487180566e-06, + "loss": 0.2703, + "step": 10160 + }, + { + "epoch": 0.64, + "grad_norm": 2.2293867136504932, + "learning_rate": 3.0456099975771126e-06, + "loss": 0.2588, + "step": 10161 + }, + { + "epoch": 0.64, + "grad_norm": 2.1273934230670917, + "learning_rate": 3.044672589064771e-06, + "loss": 0.2485, + "step": 10162 + }, + { + "epoch": 0.64, + "grad_norm": 4.213251687312522, + "learning_rate": 3.043735261682438e-06, + "loss": 0.2592, + "step": 10163 + }, + { + "epoch": 0.64, + "grad_norm": 10.799080709975172, + "learning_rate": 3.0427980154690017e-06, + "loss": 0.277, + "step": 10164 + }, + { + "epoch": 0.64, + "grad_norm": 1.932450694159075, + "learning_rate": 3.041860850463352e-06, + "loss": 0.2712, + "step": 10165 + }, + { + "epoch": 0.64, + "grad_norm": 3.63425435114625, + "learning_rate": 3.0409237667043744e-06, + "loss": 0.2686, + "step": 10166 + }, + { + "epoch": 0.64, + "grad_norm": 2.872411582669073, + "learning_rate": 3.039986764230951e-06, + "loss": 0.3041, + "step": 10167 + }, + { + "epoch": 0.64, + "grad_norm": 2.0505403463626175, + "learning_rate": 3.0390498430819565e-06, + "loss": 0.2716, + "step": 10168 + }, + { + "epoch": 0.64, + "grad_norm": 4.092470412115965, + "learning_rate": 3.038113003296268e-06, + "loss": 0.2618, + "step": 10169 + }, + { + "epoch": 0.64, + "grad_norm": 2.8945188839773484, + "learning_rate": 3.037176244912755e-06, + "loss": 0.2612, + "step": 10170 + }, + { + "epoch": 0.64, + "grad_norm": 4.428301612150675, + "learning_rate": 3.036239567970287e-06, + "loss": 0.2788, + "step": 10171 + }, + { + "epoch": 0.64, + "grad_norm": 5.3656500157664695, + "learning_rate": 3.035302972507726e-06, + "loss": 0.2892, + "step": 10172 + }, + { + "epoch": 0.64, + "grad_norm": 3.3679817876270866, + "learning_rate": 3.034366458563933e-06, + "loss": 0.2709, + "step": 10173 + }, + { + "epoch": 0.64, + "grad_norm": 4.598483411415566, + "learning_rate": 3.0334300261777693e-06, + "loss": 0.2705, + "step": 10174 + }, + { + "epoch": 0.64, + "grad_norm": 3.075141900099125, + "learning_rate": 3.032493675388087e-06, + "loss": 0.287, + "step": 10175 + }, + { + "epoch": 0.64, + "grad_norm": 3.685290551967383, + "learning_rate": 3.031557406233736e-06, + "loss": 0.286, + "step": 10176 + }, + { + "epoch": 0.64, + "grad_norm": 2.6572913861913072, + "learning_rate": 3.0306212187535653e-06, + "loss": 0.2689, + "step": 10177 + }, + { + "epoch": 0.64, + "grad_norm": 5.208896982634066, + "learning_rate": 3.029685112986417e-06, + "loss": 0.2475, + "step": 10178 + }, + { + "epoch": 0.64, + "grad_norm": 2.6373364895627884, + "learning_rate": 3.028749088971135e-06, + "loss": 0.2988, + "step": 10179 + }, + { + "epoch": 0.64, + "grad_norm": 5.907750624306396, + "learning_rate": 3.027813146746551e-06, + "loss": 0.2695, + "step": 10180 + }, + { + "epoch": 0.64, + "grad_norm": 1.7924498823603048, + "learning_rate": 3.026877286351505e-06, + "loss": 0.2633, + "step": 10181 + }, + { + "epoch": 0.64, + "grad_norm": 4.356377903405441, + "learning_rate": 3.0259415078248246e-06, + "loss": 0.2504, + "step": 10182 + }, + { + "epoch": 0.64, + "grad_norm": 3.2792728631645054, + "learning_rate": 3.025005811205339e-06, + "loss": 0.2803, + "step": 10183 + }, + { + "epoch": 0.64, + "grad_norm": 5.673231353724311, + "learning_rate": 3.024070196531869e-06, + "loss": 0.2573, + "step": 10184 + }, + { + "epoch": 0.64, + "grad_norm": 1.8496966766462282, + "learning_rate": 3.023134663843236e-06, + "loss": 0.2724, + "step": 10185 + }, + { + "epoch": 0.64, + "grad_norm": 2.0429540934790427, + "learning_rate": 3.0221992131782573e-06, + "loss": 0.292, + "step": 10186 + }, + { + "epoch": 0.64, + "grad_norm": 1.5778665551464879, + "learning_rate": 3.021263844575747e-06, + "loss": 0.2662, + "step": 10187 + }, + { + "epoch": 0.64, + "grad_norm": 1.54086520150844, + "learning_rate": 3.020328558074511e-06, + "loss": 0.2547, + "step": 10188 + }, + { + "epoch": 0.64, + "grad_norm": 1.527184549260713, + "learning_rate": 3.0193933537133624e-06, + "loss": 0.2532, + "step": 10189 + }, + { + "epoch": 0.64, + "grad_norm": 4.666722457301461, + "learning_rate": 3.0184582315311013e-06, + "loss": 0.2761, + "step": 10190 + }, + { + "epoch": 0.64, + "grad_norm": 1.7200665926247563, + "learning_rate": 3.017523191566528e-06, + "loss": 0.2668, + "step": 10191 + }, + { + "epoch": 0.64, + "grad_norm": 1.727789154045117, + "learning_rate": 3.0165882338584383e-06, + "loss": 0.2527, + "step": 10192 + }, + { + "epoch": 0.64, + "grad_norm": 3.2410172379505586, + "learning_rate": 3.0156533584456268e-06, + "loss": 0.2678, + "step": 10193 + }, + { + "epoch": 0.64, + "grad_norm": 2.33067398381951, + "learning_rate": 3.0147185653668805e-06, + "loss": 0.2725, + "step": 10194 + }, + { + "epoch": 0.64, + "grad_norm": 2.12044012495356, + "learning_rate": 3.0137838546609867e-06, + "loss": 0.2834, + "step": 10195 + }, + { + "epoch": 0.64, + "grad_norm": 9.15527127240461, + "learning_rate": 3.0128492263667287e-06, + "loss": 0.2804, + "step": 10196 + }, + { + "epoch": 0.64, + "grad_norm": 2.012823032618822, + "learning_rate": 3.011914680522888e-06, + "loss": 0.2732, + "step": 10197 + }, + { + "epoch": 0.64, + "grad_norm": 23.83203618924269, + "learning_rate": 3.0109802171682366e-06, + "loss": 0.2702, + "step": 10198 + }, + { + "epoch": 0.64, + "grad_norm": 2.096726270747896, + "learning_rate": 3.0100458363415506e-06, + "loss": 0.2744, + "step": 10199 + }, + { + "epoch": 0.64, + "grad_norm": 2.472775888169146, + "learning_rate": 3.0091115380815962e-06, + "loss": 0.277, + "step": 10200 + }, + { + "epoch": 0.64, + "grad_norm": 2.624312152490665, + "learning_rate": 3.0081773224271417e-06, + "loss": 0.2604, + "step": 10201 + }, + { + "epoch": 0.64, + "grad_norm": 3.142779529305478, + "learning_rate": 3.007243189416946e-06, + "loss": 0.2689, + "step": 10202 + }, + { + "epoch": 0.64, + "grad_norm": 3.450999900769122, + "learning_rate": 3.006309139089769e-06, + "loss": 0.2885, + "step": 10203 + }, + { + "epoch": 0.64, + "grad_norm": 2.0392459597264607, + "learning_rate": 3.005375171484369e-06, + "loss": 0.2577, + "step": 10204 + }, + { + "epoch": 0.64, + "grad_norm": 1.6237970373006214, + "learning_rate": 3.0044412866394964e-06, + "loss": 0.2672, + "step": 10205 + }, + { + "epoch": 0.64, + "grad_norm": 2.0768314942856674, + "learning_rate": 3.003507484593898e-06, + "loss": 0.2729, + "step": 10206 + }, + { + "epoch": 0.64, + "grad_norm": 4.674237992765155, + "learning_rate": 3.002573765386322e-06, + "loss": 0.268, + "step": 10207 + }, + { + "epoch": 0.64, + "grad_norm": 3.139310557894765, + "learning_rate": 3.0016401290555065e-06, + "loss": 0.261, + "step": 10208 + }, + { + "epoch": 0.64, + "grad_norm": 2.8527313217119192, + "learning_rate": 3.000706575640193e-06, + "loss": 0.2751, + "step": 10209 + }, + { + "epoch": 0.64, + "grad_norm": 2.017620907037851, + "learning_rate": 2.9997731051791113e-06, + "loss": 0.2442, + "step": 10210 + }, + { + "epoch": 0.64, + "grad_norm": 2.694812998113918, + "learning_rate": 2.9988397177109994e-06, + "loss": 0.2822, + "step": 10211 + }, + { + "epoch": 0.64, + "grad_norm": 2.196133921080475, + "learning_rate": 2.997906413274582e-06, + "loss": 0.2685, + "step": 10212 + }, + { + "epoch": 0.64, + "grad_norm": 2.6917330466268106, + "learning_rate": 2.9969731919085844e-06, + "loss": 0.2771, + "step": 10213 + }, + { + "epoch": 0.64, + "grad_norm": 1.8836634430406083, + "learning_rate": 2.9960400536517246e-06, + "loss": 0.2705, + "step": 10214 + }, + { + "epoch": 0.64, + "grad_norm": 5.13448276858894, + "learning_rate": 2.9951069985427246e-06, + "loss": 0.2855, + "step": 10215 + }, + { + "epoch": 0.64, + "grad_norm": 2.815158030942339, + "learning_rate": 2.9941740266202946e-06, + "loss": 0.2834, + "step": 10216 + }, + { + "epoch": 0.64, + "grad_norm": 0.6404612673691276, + "learning_rate": 2.9932411379231486e-06, + "loss": 0.4989, + "step": 10217 + }, + { + "epoch": 0.64, + "grad_norm": 1.9222654985393097, + "learning_rate": 2.9923083324899894e-06, + "loss": 0.2677, + "step": 10218 + }, + { + "epoch": 0.64, + "grad_norm": 3.2963999108599005, + "learning_rate": 2.9913756103595258e-06, + "loss": 0.2842, + "step": 10219 + }, + { + "epoch": 0.64, + "grad_norm": 2.0794310772310274, + "learning_rate": 2.9904429715704554e-06, + "loss": 0.2727, + "step": 10220 + }, + { + "epoch": 0.64, + "grad_norm": 2.543411504214379, + "learning_rate": 2.989510416161476e-06, + "loss": 0.28, + "step": 10221 + }, + { + "epoch": 0.64, + "grad_norm": 1.8187753782238385, + "learning_rate": 2.988577944171279e-06, + "loss": 0.2506, + "step": 10222 + }, + { + "epoch": 0.64, + "grad_norm": 4.950717599692712, + "learning_rate": 2.9876455556385576e-06, + "loss": 0.2727, + "step": 10223 + }, + { + "epoch": 0.64, + "grad_norm": 2.298750541110782, + "learning_rate": 2.9867132506019958e-06, + "loss": 0.274, + "step": 10224 + }, + { + "epoch": 0.64, + "grad_norm": 2.0069221080400816, + "learning_rate": 2.9857810291002755e-06, + "loss": 0.2578, + "step": 10225 + }, + { + "epoch": 0.64, + "grad_norm": 2.8974810685327945, + "learning_rate": 2.984848891172079e-06, + "loss": 0.2718, + "step": 10226 + }, + { + "epoch": 0.64, + "grad_norm": 2.0549144684644705, + "learning_rate": 2.9839168368560827e-06, + "loss": 0.2783, + "step": 10227 + }, + { + "epoch": 0.64, + "grad_norm": 4.234453526518632, + "learning_rate": 2.982984866190957e-06, + "loss": 0.2693, + "step": 10228 + }, + { + "epoch": 0.64, + "grad_norm": 1.7125253929904005, + "learning_rate": 2.9820529792153717e-06, + "loss": 0.278, + "step": 10229 + }, + { + "epoch": 0.64, + "grad_norm": 1.8623729075608615, + "learning_rate": 2.9811211759679926e-06, + "loss": 0.2622, + "step": 10230 + }, + { + "epoch": 0.64, + "grad_norm": 2.154812623973168, + "learning_rate": 2.9801894564874824e-06, + "loss": 0.2763, + "step": 10231 + }, + { + "epoch": 0.64, + "grad_norm": 1.864391409126263, + "learning_rate": 2.9792578208124976e-06, + "loss": 0.2888, + "step": 10232 + }, + { + "epoch": 0.64, + "grad_norm": 1.5593740056610697, + "learning_rate": 2.9783262689816963e-06, + "loss": 0.2681, + "step": 10233 + }, + { + "epoch": 0.64, + "grad_norm": 2.492521212032518, + "learning_rate": 2.977394801033728e-06, + "loss": 0.2676, + "step": 10234 + }, + { + "epoch": 0.64, + "grad_norm": 0.6088182633682745, + "learning_rate": 2.976463417007244e-06, + "loss": 0.4982, + "step": 10235 + }, + { + "epoch": 0.64, + "grad_norm": 3.6838069910757487, + "learning_rate": 2.975532116940885e-06, + "loss": 0.2583, + "step": 10236 + }, + { + "epoch": 0.64, + "grad_norm": 4.151719183837695, + "learning_rate": 2.974600900873296e-06, + "loss": 0.2623, + "step": 10237 + }, + { + "epoch": 0.64, + "grad_norm": 2.7806131137092898, + "learning_rate": 2.9736697688431116e-06, + "loss": 0.2849, + "step": 10238 + }, + { + "epoch": 0.64, + "grad_norm": 2.0241893268833118, + "learning_rate": 2.972738720888969e-06, + "loss": 0.2668, + "step": 10239 + }, + { + "epoch": 0.64, + "grad_norm": 3.57971094866745, + "learning_rate": 2.971807757049496e-06, + "loss": 0.2606, + "step": 10240 + }, + { + "epoch": 0.64, + "grad_norm": 4.060102433166916, + "learning_rate": 2.970876877363322e-06, + "loss": 0.2874, + "step": 10241 + }, + { + "epoch": 0.64, + "grad_norm": 2.50574208213328, + "learning_rate": 2.9699460818690714e-06, + "loss": 0.2685, + "step": 10242 + }, + { + "epoch": 0.64, + "grad_norm": 16.948747642390405, + "learning_rate": 2.9690153706053638e-06, + "loss": 0.2691, + "step": 10243 + }, + { + "epoch": 0.64, + "grad_norm": 1.7094953346386037, + "learning_rate": 2.968084743610815e-06, + "loss": 0.274, + "step": 10244 + }, + { + "epoch": 0.64, + "grad_norm": 3.663651213945327, + "learning_rate": 2.9671542009240406e-06, + "loss": 0.2665, + "step": 10245 + }, + { + "epoch": 0.64, + "grad_norm": 4.41579868964434, + "learning_rate": 2.966223742583648e-06, + "loss": 0.282, + "step": 10246 + }, + { + "epoch": 0.64, + "grad_norm": 1.6469141833717196, + "learning_rate": 2.965293368628244e-06, + "loss": 0.2562, + "step": 10247 + }, + { + "epoch": 0.64, + "grad_norm": 1.5138190901598279, + "learning_rate": 2.964363079096434e-06, + "loss": 0.2613, + "step": 10248 + }, + { + "epoch": 0.64, + "grad_norm": 2.275875266250925, + "learning_rate": 2.963432874026815e-06, + "loss": 0.2759, + "step": 10249 + }, + { + "epoch": 0.64, + "grad_norm": 2.69531648687304, + "learning_rate": 2.962502753457982e-06, + "loss": 0.2606, + "step": 10250 + }, + { + "epoch": 0.64, + "grad_norm": 1.7325910610110424, + "learning_rate": 2.9615727174285307e-06, + "loss": 0.3142, + "step": 10251 + }, + { + "epoch": 0.64, + "grad_norm": 0.6427437877053559, + "learning_rate": 2.960642765977047e-06, + "loss": 0.4688, + "step": 10252 + }, + { + "epoch": 0.64, + "grad_norm": 3.863669200416984, + "learning_rate": 2.9597128991421187e-06, + "loss": 0.263, + "step": 10253 + }, + { + "epoch": 0.64, + "grad_norm": 5.272733400023962, + "learning_rate": 2.9587831169623244e-06, + "loss": 0.2607, + "step": 10254 + }, + { + "epoch": 0.64, + "grad_norm": 2.0399852713986415, + "learning_rate": 2.957853419476243e-06, + "loss": 0.2753, + "step": 10255 + }, + { + "epoch": 0.64, + "grad_norm": 1.9569226797701358, + "learning_rate": 2.956923806722453e-06, + "loss": 0.2727, + "step": 10256 + }, + { + "epoch": 0.65, + "grad_norm": 1.8961836722298089, + "learning_rate": 2.9559942787395224e-06, + "loss": 0.2579, + "step": 10257 + }, + { + "epoch": 0.65, + "grad_norm": 7.6855425757466405, + "learning_rate": 2.9550648355660195e-06, + "loss": 0.2668, + "step": 10258 + }, + { + "epoch": 0.65, + "grad_norm": 1.574673243926472, + "learning_rate": 2.9541354772405096e-06, + "loss": 0.2774, + "step": 10259 + }, + { + "epoch": 0.65, + "grad_norm": 2.6174823727962924, + "learning_rate": 2.953206203801552e-06, + "loss": 0.2526, + "step": 10260 + }, + { + "epoch": 0.65, + "grad_norm": 1.7780847535332385, + "learning_rate": 2.952277015287705e-06, + "loss": 0.2639, + "step": 10261 + }, + { + "epoch": 0.65, + "grad_norm": 1.7532879482171553, + "learning_rate": 2.95134791173752e-06, + "loss": 0.2646, + "step": 10262 + }, + { + "epoch": 0.65, + "grad_norm": 1.7286493146123572, + "learning_rate": 2.9504188931895507e-06, + "loss": 0.2757, + "step": 10263 + }, + { + "epoch": 0.65, + "grad_norm": 2.061230434689712, + "learning_rate": 2.9494899596823405e-06, + "loss": 0.2696, + "step": 10264 + }, + { + "epoch": 0.65, + "grad_norm": 3.080154476416327, + "learning_rate": 2.948561111254436e-06, + "loss": 0.2778, + "step": 10265 + }, + { + "epoch": 0.65, + "grad_norm": 1.5940276876912574, + "learning_rate": 2.9476323479443736e-06, + "loss": 0.272, + "step": 10266 + }, + { + "epoch": 0.65, + "grad_norm": 4.435622371099773, + "learning_rate": 2.9467036697906914e-06, + "loss": 0.2613, + "step": 10267 + }, + { + "epoch": 0.65, + "grad_norm": 1.9486384831096513, + "learning_rate": 2.9457750768319202e-06, + "loss": 0.2561, + "step": 10268 + }, + { + "epoch": 0.65, + "grad_norm": 2.032735319772164, + "learning_rate": 2.9448465691065906e-06, + "loss": 0.2667, + "step": 10269 + }, + { + "epoch": 0.65, + "grad_norm": 4.890455038224558, + "learning_rate": 2.9439181466532253e-06, + "loss": 0.279, + "step": 10270 + }, + { + "epoch": 0.65, + "grad_norm": 1.8318093530757913, + "learning_rate": 2.9429898095103494e-06, + "loss": 0.2814, + "step": 10271 + }, + { + "epoch": 0.65, + "grad_norm": 1.9060370669230478, + "learning_rate": 2.942061557716479e-06, + "loss": 0.257, + "step": 10272 + }, + { + "epoch": 0.65, + "grad_norm": 1.847142304524054, + "learning_rate": 2.9411333913101316e-06, + "loss": 0.2783, + "step": 10273 + }, + { + "epoch": 0.65, + "grad_norm": 2.158199419371395, + "learning_rate": 2.940205310329816e-06, + "loss": 0.2849, + "step": 10274 + }, + { + "epoch": 0.65, + "grad_norm": 1.6207273913013245, + "learning_rate": 2.9392773148140406e-06, + "loss": 0.2559, + "step": 10275 + }, + { + "epoch": 0.65, + "grad_norm": 1.6615952738715276, + "learning_rate": 2.9383494048013096e-06, + "loss": 0.2675, + "step": 10276 + }, + { + "epoch": 0.65, + "grad_norm": 2.3387500118835, + "learning_rate": 2.937421580330123e-06, + "loss": 0.2515, + "step": 10277 + }, + { + "epoch": 0.65, + "grad_norm": 3.1542056823758857, + "learning_rate": 2.9364938414389797e-06, + "loss": 0.2688, + "step": 10278 + }, + { + "epoch": 0.65, + "grad_norm": 2.2811552349845754, + "learning_rate": 2.9355661881663717e-06, + "loss": 0.251, + "step": 10279 + }, + { + "epoch": 0.65, + "grad_norm": 2.309169932842242, + "learning_rate": 2.9346386205507893e-06, + "loss": 0.268, + "step": 10280 + }, + { + "epoch": 0.65, + "grad_norm": 2.630879173555384, + "learning_rate": 2.9337111386307197e-06, + "loss": 0.2721, + "step": 10281 + }, + { + "epoch": 0.65, + "grad_norm": 1.522778125874977, + "learning_rate": 2.9327837424446442e-06, + "loss": 0.2688, + "step": 10282 + }, + { + "epoch": 0.65, + "grad_norm": 1.7498924542565566, + "learning_rate": 2.9318564320310444e-06, + "loss": 0.2521, + "step": 10283 + }, + { + "epoch": 0.65, + "grad_norm": 1.3253732961167728, + "learning_rate": 2.9309292074283936e-06, + "loss": 0.2515, + "step": 10284 + }, + { + "epoch": 0.65, + "grad_norm": 1.9211988222354697, + "learning_rate": 2.930002068675164e-06, + "loss": 0.2653, + "step": 10285 + }, + { + "epoch": 0.65, + "grad_norm": 2.0755383269524335, + "learning_rate": 2.9290750158098268e-06, + "loss": 0.2695, + "step": 10286 + }, + { + "epoch": 0.65, + "grad_norm": 2.0960936078231605, + "learning_rate": 2.9281480488708445e-06, + "loss": 0.2752, + "step": 10287 + }, + { + "epoch": 0.65, + "grad_norm": 2.2600792081025394, + "learning_rate": 2.9272211678966804e-06, + "loss": 0.2849, + "step": 10288 + }, + { + "epoch": 0.65, + "grad_norm": 1.3490394661035454, + "learning_rate": 2.9262943729257924e-06, + "loss": 0.2601, + "step": 10289 + }, + { + "epoch": 0.65, + "grad_norm": 1.773377022832766, + "learning_rate": 2.9253676639966335e-06, + "loss": 0.2684, + "step": 10290 + }, + { + "epoch": 0.65, + "grad_norm": 0.578923030565268, + "learning_rate": 2.924441041147656e-06, + "loss": 0.4941, + "step": 10291 + }, + { + "epoch": 0.65, + "grad_norm": 2.8919456581718603, + "learning_rate": 2.923514504417304e-06, + "loss": 0.2621, + "step": 10292 + }, + { + "epoch": 0.65, + "grad_norm": 17.785325821684957, + "learning_rate": 2.9225880538440242e-06, + "loss": 0.282, + "step": 10293 + }, + { + "epoch": 0.65, + "grad_norm": 1.8583411826395766, + "learning_rate": 2.921661689466257e-06, + "loss": 0.2702, + "step": 10294 + }, + { + "epoch": 0.65, + "grad_norm": 3.3448475890108256, + "learning_rate": 2.9207354113224384e-06, + "loss": 0.2938, + "step": 10295 + }, + { + "epoch": 0.65, + "grad_norm": 1.936621432202642, + "learning_rate": 2.919809219451e-06, + "loss": 0.2579, + "step": 10296 + }, + { + "epoch": 0.65, + "grad_norm": 3.199092871572625, + "learning_rate": 2.918883113890371e-06, + "loss": 0.2857, + "step": 10297 + }, + { + "epoch": 0.65, + "grad_norm": 2.7950546614614975, + "learning_rate": 2.9179570946789798e-06, + "loss": 0.2776, + "step": 10298 + }, + { + "epoch": 0.65, + "grad_norm": 1.6884177362490773, + "learning_rate": 2.9170311618552467e-06, + "loss": 0.2708, + "step": 10299 + }, + { + "epoch": 0.65, + "grad_norm": 4.239822309094006, + "learning_rate": 2.916105315457588e-06, + "loss": 0.2713, + "step": 10300 + }, + { + "epoch": 0.65, + "grad_norm": 2.8058828654033277, + "learning_rate": 2.9151795555244245e-06, + "loss": 0.2629, + "step": 10301 + }, + { + "epoch": 0.65, + "grad_norm": 1.50441884306904, + "learning_rate": 2.9142538820941613e-06, + "loss": 0.2506, + "step": 10302 + }, + { + "epoch": 0.65, + "grad_norm": 1.8329288039954224, + "learning_rate": 2.913328295205211e-06, + "loss": 0.2677, + "step": 10303 + }, + { + "epoch": 0.65, + "grad_norm": 1.591947523918817, + "learning_rate": 2.9124027948959767e-06, + "loss": 0.2601, + "step": 10304 + }, + { + "epoch": 0.65, + "grad_norm": 2.9945975029575576, + "learning_rate": 2.9114773812048558e-06, + "loss": 0.2811, + "step": 10305 + }, + { + "epoch": 0.65, + "grad_norm": 2.414079663222745, + "learning_rate": 2.9105520541702503e-06, + "loss": 0.2735, + "step": 10306 + }, + { + "epoch": 0.65, + "grad_norm": 2.046007457779282, + "learning_rate": 2.9096268138305495e-06, + "loss": 0.2627, + "step": 10307 + }, + { + "epoch": 0.65, + "grad_norm": 1.7599522828206562, + "learning_rate": 2.908701660224147e-06, + "loss": 0.2625, + "step": 10308 + }, + { + "epoch": 0.65, + "grad_norm": 2.567266171204114, + "learning_rate": 2.9077765933894277e-06, + "loss": 0.2823, + "step": 10309 + }, + { + "epoch": 0.65, + "grad_norm": 1.4579231919226723, + "learning_rate": 2.906851613364771e-06, + "loss": 0.2655, + "step": 10310 + }, + { + "epoch": 0.65, + "grad_norm": 2.240191366744254, + "learning_rate": 2.905926720188561e-06, + "loss": 0.2702, + "step": 10311 + }, + { + "epoch": 0.65, + "grad_norm": 1.9275614105475722, + "learning_rate": 2.905001913899171e-06, + "loss": 0.2667, + "step": 10312 + }, + { + "epoch": 0.65, + "grad_norm": 2.144183315313289, + "learning_rate": 2.9040771945349707e-06, + "loss": 0.2659, + "step": 10313 + }, + { + "epoch": 0.65, + "grad_norm": 2.1892419343701666, + "learning_rate": 2.90315256213433e-06, + "loss": 0.2584, + "step": 10314 + }, + { + "epoch": 0.65, + "grad_norm": 1.6502800388151524, + "learning_rate": 2.9022280167356167e-06, + "loss": 0.2601, + "step": 10315 + }, + { + "epoch": 0.65, + "grad_norm": 1.7334182686629287, + "learning_rate": 2.901303558377188e-06, + "loss": 0.2769, + "step": 10316 + }, + { + "epoch": 0.65, + "grad_norm": 2.2887259233707633, + "learning_rate": 2.9003791870974005e-06, + "loss": 0.2708, + "step": 10317 + }, + { + "epoch": 0.65, + "grad_norm": 1.9577276044161858, + "learning_rate": 2.8994549029346132e-06, + "loss": 0.2741, + "step": 10318 + }, + { + "epoch": 0.65, + "grad_norm": 2.1935604083815123, + "learning_rate": 2.8985307059271718e-06, + "loss": 0.2652, + "step": 10319 + }, + { + "epoch": 0.65, + "grad_norm": 1.4808680686436608, + "learning_rate": 2.897606596113424e-06, + "loss": 0.2761, + "step": 10320 + }, + { + "epoch": 0.65, + "grad_norm": 1.8842912010224175, + "learning_rate": 2.8966825735317113e-06, + "loss": 0.255, + "step": 10321 + }, + { + "epoch": 0.65, + "grad_norm": 1.4303413346360598, + "learning_rate": 2.895758638220374e-06, + "loss": 0.2603, + "step": 10322 + }, + { + "epoch": 0.65, + "grad_norm": 1.8270048984190406, + "learning_rate": 2.894834790217751e-06, + "loss": 0.2594, + "step": 10323 + }, + { + "epoch": 0.65, + "grad_norm": 1.8392163756216793, + "learning_rate": 2.8939110295621707e-06, + "loss": 0.2733, + "step": 10324 + }, + { + "epoch": 0.65, + "grad_norm": 3.4869095340523093, + "learning_rate": 2.892987356291962e-06, + "loss": 0.2696, + "step": 10325 + }, + { + "epoch": 0.65, + "grad_norm": 2.135158149137114, + "learning_rate": 2.892063770445451e-06, + "loss": 0.2605, + "step": 10326 + }, + { + "epoch": 0.65, + "grad_norm": 1.7634824739944561, + "learning_rate": 2.8911402720609594e-06, + "loss": 0.282, + "step": 10327 + }, + { + "epoch": 0.65, + "grad_norm": 1.7813903204132482, + "learning_rate": 2.8902168611768032e-06, + "loss": 0.2767, + "step": 10328 + }, + { + "epoch": 0.65, + "grad_norm": 2.219030505984542, + "learning_rate": 2.889293537831295e-06, + "loss": 0.2818, + "step": 10329 + }, + { + "epoch": 0.65, + "grad_norm": 3.3709590337236626, + "learning_rate": 2.8883703020627467e-06, + "loss": 0.256, + "step": 10330 + }, + { + "epoch": 0.65, + "grad_norm": 2.326527557524211, + "learning_rate": 2.8874471539094672e-06, + "loss": 0.2878, + "step": 10331 + }, + { + "epoch": 0.65, + "grad_norm": 2.1707468394485088, + "learning_rate": 2.8865240934097584e-06, + "loss": 0.3013, + "step": 10332 + }, + { + "epoch": 0.65, + "grad_norm": 3.6931270236050118, + "learning_rate": 2.885601120601916e-06, + "loss": 0.2657, + "step": 10333 + }, + { + "epoch": 0.65, + "grad_norm": 2.010592130910668, + "learning_rate": 2.884678235524241e-06, + "loss": 0.2706, + "step": 10334 + }, + { + "epoch": 0.65, + "grad_norm": 2.6936869219379527, + "learning_rate": 2.8837554382150233e-06, + "loss": 0.2831, + "step": 10335 + }, + { + "epoch": 0.65, + "grad_norm": 1.5181472268339018, + "learning_rate": 2.882832728712551e-06, + "loss": 0.2701, + "step": 10336 + }, + { + "epoch": 0.65, + "grad_norm": 2.255743161270764, + "learning_rate": 2.8819101070551073e-06, + "loss": 0.2889, + "step": 10337 + }, + { + "epoch": 0.65, + "grad_norm": 1.8339844827187541, + "learning_rate": 2.8809875732809762e-06, + "loss": 0.2728, + "step": 10338 + }, + { + "epoch": 0.65, + "grad_norm": 6.079500585688239, + "learning_rate": 2.8800651274284356e-06, + "loss": 0.2915, + "step": 10339 + }, + { + "epoch": 0.65, + "grad_norm": 1.8361709025280701, + "learning_rate": 2.8791427695357586e-06, + "loss": 0.2862, + "step": 10340 + }, + { + "epoch": 0.65, + "grad_norm": 1.6004733164058902, + "learning_rate": 2.8782204996412134e-06, + "loss": 0.2578, + "step": 10341 + }, + { + "epoch": 0.65, + "grad_norm": 1.7316254866285132, + "learning_rate": 2.8772983177830706e-06, + "loss": 0.2716, + "step": 10342 + }, + { + "epoch": 0.65, + "grad_norm": 2.9777529165571632, + "learning_rate": 2.8763762239995903e-06, + "loss": 0.2781, + "step": 10343 + }, + { + "epoch": 0.65, + "grad_norm": 1.7553244499987246, + "learning_rate": 2.8754542183290304e-06, + "loss": 0.2734, + "step": 10344 + }, + { + "epoch": 0.65, + "grad_norm": 1.9787242758944577, + "learning_rate": 2.874532300809651e-06, + "loss": 0.271, + "step": 10345 + }, + { + "epoch": 0.65, + "grad_norm": 1.7816249472016847, + "learning_rate": 2.8736104714796996e-06, + "loss": 0.258, + "step": 10346 + }, + { + "epoch": 0.65, + "grad_norm": 2.844539267323949, + "learning_rate": 2.8726887303774286e-06, + "loss": 0.2856, + "step": 10347 + }, + { + "epoch": 0.65, + "grad_norm": 3.110903491931399, + "learning_rate": 2.8717670775410805e-06, + "loss": 0.2573, + "step": 10348 + }, + { + "epoch": 0.65, + "grad_norm": 3.1961716277508603, + "learning_rate": 2.8708455130088946e-06, + "loss": 0.2925, + "step": 10349 + }, + { + "epoch": 0.65, + "grad_norm": 2.840889344725491, + "learning_rate": 2.8699240368191124e-06, + "loss": 0.2822, + "step": 10350 + }, + { + "epoch": 0.65, + "grad_norm": 2.1900770769306392, + "learning_rate": 2.8690026490099655e-06, + "loss": 0.281, + "step": 10351 + }, + { + "epoch": 0.65, + "grad_norm": 2.032055040907349, + "learning_rate": 2.8680813496196814e-06, + "loss": 0.2678, + "step": 10352 + }, + { + "epoch": 0.65, + "grad_norm": 1.83485329391566, + "learning_rate": 2.8671601386864913e-06, + "loss": 0.2698, + "step": 10353 + }, + { + "epoch": 0.65, + "grad_norm": 2.4372661675444647, + "learning_rate": 2.8662390162486125e-06, + "loss": 0.2813, + "step": 10354 + }, + { + "epoch": 0.65, + "grad_norm": 2.8711937602281714, + "learning_rate": 2.865317982344269e-06, + "loss": 0.274, + "step": 10355 + }, + { + "epoch": 0.65, + "grad_norm": 5.647553746292898, + "learning_rate": 2.864397037011675e-06, + "loss": 0.2646, + "step": 10356 + }, + { + "epoch": 0.65, + "grad_norm": 2.0758115605281473, + "learning_rate": 2.8634761802890387e-06, + "loss": 0.2634, + "step": 10357 + }, + { + "epoch": 0.65, + "grad_norm": 1.8131575878304973, + "learning_rate": 2.862555412214572e-06, + "loss": 0.2681, + "step": 10358 + }, + { + "epoch": 0.65, + "grad_norm": 1.9124368285371574, + "learning_rate": 2.8616347328264764e-06, + "loss": 0.2602, + "step": 10359 + }, + { + "epoch": 0.65, + "grad_norm": 0.6047603012030696, + "learning_rate": 2.860714142162956e-06, + "loss": 0.4803, + "step": 10360 + }, + { + "epoch": 0.65, + "grad_norm": 1.6202814522088258, + "learning_rate": 2.859793640262205e-06, + "loss": 0.2636, + "step": 10361 + }, + { + "epoch": 0.65, + "grad_norm": 2.4631381294144927, + "learning_rate": 2.8588732271624163e-06, + "loss": 0.2689, + "step": 10362 + }, + { + "epoch": 0.65, + "grad_norm": 1.6366857709413358, + "learning_rate": 2.857952902901782e-06, + "loss": 0.2693, + "step": 10363 + }, + { + "epoch": 0.65, + "grad_norm": 3.974490977238434, + "learning_rate": 2.857032667518486e-06, + "loss": 0.2676, + "step": 10364 + }, + { + "epoch": 0.65, + "grad_norm": 2.598651836509557, + "learning_rate": 2.856112521050709e-06, + "loss": 0.2498, + "step": 10365 + }, + { + "epoch": 0.65, + "grad_norm": 2.3267187774940883, + "learning_rate": 2.8551924635366344e-06, + "loss": 0.2692, + "step": 10366 + }, + { + "epoch": 0.65, + "grad_norm": 1.980678960588676, + "learning_rate": 2.854272495014431e-06, + "loss": 0.2727, + "step": 10367 + }, + { + "epoch": 0.65, + "grad_norm": 5.860173799441823, + "learning_rate": 2.8533526155222757e-06, + "loss": 0.2635, + "step": 10368 + }, + { + "epoch": 0.65, + "grad_norm": 3.0060463510588296, + "learning_rate": 2.8524328250983337e-06, + "loss": 0.2659, + "step": 10369 + }, + { + "epoch": 0.65, + "grad_norm": 2.350605986020169, + "learning_rate": 2.8515131237807653e-06, + "loss": 0.2752, + "step": 10370 + }, + { + "epoch": 0.65, + "grad_norm": 2.3201057957361955, + "learning_rate": 2.8505935116077353e-06, + "loss": 0.2608, + "step": 10371 + }, + { + "epoch": 0.65, + "grad_norm": 1.5239889432924736, + "learning_rate": 2.8496739886173994e-06, + "loss": 0.2675, + "step": 10372 + }, + { + "epoch": 0.65, + "grad_norm": 0.6003174799229049, + "learning_rate": 2.848754554847907e-06, + "loss": 0.4657, + "step": 10373 + }, + { + "epoch": 0.65, + "grad_norm": 8.931957096457086, + "learning_rate": 2.8478352103374085e-06, + "loss": 0.2639, + "step": 10374 + }, + { + "epoch": 0.65, + "grad_norm": 2.554794346000949, + "learning_rate": 2.846915955124052e-06, + "loss": 0.2705, + "step": 10375 + }, + { + "epoch": 0.65, + "grad_norm": 7.1017749279224756, + "learning_rate": 2.8459967892459767e-06, + "loss": 0.278, + "step": 10376 + }, + { + "epoch": 0.65, + "grad_norm": 2.2290570477432845, + "learning_rate": 2.845077712741321e-06, + "loss": 0.2522, + "step": 10377 + }, + { + "epoch": 0.65, + "grad_norm": 1.7791977780449229, + "learning_rate": 2.844158725648216e-06, + "loss": 0.2546, + "step": 10378 + }, + { + "epoch": 0.65, + "grad_norm": 3.3351526868247428, + "learning_rate": 2.843239828004797e-06, + "loss": 0.2738, + "step": 10379 + }, + { + "epoch": 0.65, + "grad_norm": 2.3032516202419586, + "learning_rate": 2.8423210198491886e-06, + "loss": 0.2727, + "step": 10380 + }, + { + "epoch": 0.65, + "grad_norm": 1.9390686255543936, + "learning_rate": 2.8414023012195113e-06, + "loss": 0.2549, + "step": 10381 + }, + { + "epoch": 0.65, + "grad_norm": 3.0219803962271286, + "learning_rate": 2.8404836721538866e-06, + "loss": 0.2501, + "step": 10382 + }, + { + "epoch": 0.65, + "grad_norm": 1.8175216309949775, + "learning_rate": 2.8395651326904323e-06, + "loss": 0.2698, + "step": 10383 + }, + { + "epoch": 0.65, + "grad_norm": 1.7904621967344234, + "learning_rate": 2.8386466828672575e-06, + "loss": 0.2797, + "step": 10384 + }, + { + "epoch": 0.65, + "grad_norm": 9.842134789346314, + "learning_rate": 2.8377283227224717e-06, + "loss": 0.2784, + "step": 10385 + }, + { + "epoch": 0.65, + "grad_norm": 6.694564305938711, + "learning_rate": 2.8368100522941755e-06, + "loss": 0.2632, + "step": 10386 + }, + { + "epoch": 0.65, + "grad_norm": 1.7988356164491621, + "learning_rate": 2.8358918716204746e-06, + "loss": 0.2983, + "step": 10387 + }, + { + "epoch": 0.65, + "grad_norm": 1.9138366241854534, + "learning_rate": 2.8349737807394646e-06, + "loss": 0.2747, + "step": 10388 + }, + { + "epoch": 0.65, + "grad_norm": 1.9613620859420138, + "learning_rate": 2.8340557796892353e-06, + "loss": 0.2824, + "step": 10389 + }, + { + "epoch": 0.65, + "grad_norm": 2.8392574585700325, + "learning_rate": 2.833137868507879e-06, + "loss": 0.264, + "step": 10390 + }, + { + "epoch": 0.65, + "grad_norm": 1.4377198742833721, + "learning_rate": 2.832220047233483e-06, + "loss": 0.2531, + "step": 10391 + }, + { + "epoch": 0.65, + "grad_norm": 2.3127795153622928, + "learning_rate": 2.831302315904128e-06, + "loss": 0.2677, + "step": 10392 + }, + { + "epoch": 0.65, + "grad_norm": 3.99991517431462, + "learning_rate": 2.83038467455789e-06, + "loss": 0.2699, + "step": 10393 + }, + { + "epoch": 0.65, + "grad_norm": 2.820705929289855, + "learning_rate": 2.8294671232328473e-06, + "loss": 0.2634, + "step": 10394 + }, + { + "epoch": 0.65, + "grad_norm": 1.518450643734968, + "learning_rate": 2.8285496619670695e-06, + "loss": 0.2773, + "step": 10395 + }, + { + "epoch": 0.65, + "grad_norm": 3.074314846461634, + "learning_rate": 2.827632290798621e-06, + "loss": 0.2569, + "step": 10396 + }, + { + "epoch": 0.65, + "grad_norm": 1.5583800976647444, + "learning_rate": 2.826715009765569e-06, + "loss": 0.2846, + "step": 10397 + }, + { + "epoch": 0.65, + "grad_norm": 2.108231671123516, + "learning_rate": 2.825797818905969e-06, + "loss": 0.2596, + "step": 10398 + }, + { + "epoch": 0.65, + "grad_norm": 3.240758915529665, + "learning_rate": 2.8248807182578817e-06, + "loss": 0.2717, + "step": 10399 + }, + { + "epoch": 0.65, + "grad_norm": 1.8199852189859267, + "learning_rate": 2.8239637078593574e-06, + "loss": 0.2542, + "step": 10400 + }, + { + "epoch": 0.65, + "grad_norm": 0.6174296789920071, + "learning_rate": 2.823046787748441e-06, + "loss": 0.4429, + "step": 10401 + }, + { + "epoch": 0.65, + "grad_norm": 2.456548481553083, + "learning_rate": 2.8221299579631834e-06, + "loss": 0.2578, + "step": 10402 + }, + { + "epoch": 0.65, + "grad_norm": 2.3983486438540016, + "learning_rate": 2.821213218541621e-06, + "loss": 0.2788, + "step": 10403 + }, + { + "epoch": 0.65, + "grad_norm": 1.8787150019399343, + "learning_rate": 2.8202965695217906e-06, + "loss": 0.2491, + "step": 10404 + }, + { + "epoch": 0.65, + "grad_norm": 1.8893076754105873, + "learning_rate": 2.8193800109417293e-06, + "loss": 0.2709, + "step": 10405 + }, + { + "epoch": 0.65, + "grad_norm": 2.1820426995449576, + "learning_rate": 2.818463542839462e-06, + "loss": 0.274, + "step": 10406 + }, + { + "epoch": 0.65, + "grad_norm": 1.5720702494820933, + "learning_rate": 2.8175471652530193e-06, + "loss": 0.2558, + "step": 10407 + }, + { + "epoch": 0.65, + "grad_norm": 1.9245355640229331, + "learning_rate": 2.816630878220421e-06, + "loss": 0.2593, + "step": 10408 + }, + { + "epoch": 0.65, + "grad_norm": 5.193600663839504, + "learning_rate": 2.8157146817796843e-06, + "loss": 0.2668, + "step": 10409 + }, + { + "epoch": 0.65, + "grad_norm": 1.6029318065473126, + "learning_rate": 2.8147985759688267e-06, + "loss": 0.2584, + "step": 10410 + }, + { + "epoch": 0.65, + "grad_norm": 1.7862824366550654, + "learning_rate": 2.8138825608258556e-06, + "loss": 0.2584, + "step": 10411 + }, + { + "epoch": 0.65, + "grad_norm": 8.073517752624644, + "learning_rate": 2.812966636388782e-06, + "loss": 0.2871, + "step": 10412 + }, + { + "epoch": 0.65, + "grad_norm": 1.7103306706033008, + "learning_rate": 2.8120508026956074e-06, + "loss": 0.2525, + "step": 10413 + }, + { + "epoch": 0.65, + "grad_norm": 2.1626339198226368, + "learning_rate": 2.811135059784329e-06, + "loss": 0.273, + "step": 10414 + }, + { + "epoch": 0.65, + "grad_norm": 4.238689838195442, + "learning_rate": 2.8102194076929475e-06, + "loss": 0.2761, + "step": 10415 + }, + { + "epoch": 0.66, + "grad_norm": 2.3273465781146676, + "learning_rate": 2.809303846459452e-06, + "loss": 0.2752, + "step": 10416 + }, + { + "epoch": 0.66, + "grad_norm": 1.5742835740189338, + "learning_rate": 2.808388376121829e-06, + "loss": 0.2618, + "step": 10417 + }, + { + "epoch": 0.66, + "grad_norm": 6.530856597631842, + "learning_rate": 2.8074729967180664e-06, + "loss": 0.2651, + "step": 10418 + }, + { + "epoch": 0.66, + "grad_norm": 4.828599931339612, + "learning_rate": 2.8065577082861416e-06, + "loss": 0.2969, + "step": 10419 + }, + { + "epoch": 0.66, + "grad_norm": 1.966342558520523, + "learning_rate": 2.805642510864036e-06, + "loss": 0.2579, + "step": 10420 + }, + { + "epoch": 0.66, + "grad_norm": 4.479634855617904, + "learning_rate": 2.80472740448972e-06, + "loss": 0.2649, + "step": 10421 + }, + { + "epoch": 0.66, + "grad_norm": 4.368448757489006, + "learning_rate": 2.8038123892011615e-06, + "loss": 0.2586, + "step": 10422 + }, + { + "epoch": 0.66, + "grad_norm": 2.217167700332758, + "learning_rate": 2.8028974650363296e-06, + "loss": 0.2658, + "step": 10423 + }, + { + "epoch": 0.66, + "grad_norm": 1.7521901095895094, + "learning_rate": 2.8019826320331843e-06, + "loss": 0.2817, + "step": 10424 + }, + { + "epoch": 0.66, + "grad_norm": 1.2625006956315874, + "learning_rate": 2.8010678902296822e-06, + "loss": 0.2641, + "step": 10425 + }, + { + "epoch": 0.66, + "grad_norm": 17.121993836236154, + "learning_rate": 2.800153239663779e-06, + "loss": 0.2673, + "step": 10426 + }, + { + "epoch": 0.66, + "grad_norm": 2.14308314556635, + "learning_rate": 2.7992386803734267e-06, + "loss": 0.2674, + "step": 10427 + }, + { + "epoch": 0.66, + "grad_norm": 4.809321431847288, + "learning_rate": 2.7983242123965708e-06, + "loss": 0.2715, + "step": 10428 + }, + { + "epoch": 0.66, + "grad_norm": 3.809139352899553, + "learning_rate": 2.7974098357711544e-06, + "loss": 0.2771, + "step": 10429 + }, + { + "epoch": 0.66, + "grad_norm": 1.5875698629766402, + "learning_rate": 2.796495550535113e-06, + "loss": 0.2859, + "step": 10430 + }, + { + "epoch": 0.66, + "grad_norm": 2.5018999272604305, + "learning_rate": 2.795581356726388e-06, + "loss": 0.2542, + "step": 10431 + }, + { + "epoch": 0.66, + "grad_norm": 7.149900104850301, + "learning_rate": 2.7946672543829077e-06, + "loss": 0.2958, + "step": 10432 + }, + { + "epoch": 0.66, + "grad_norm": 2.2645565055309302, + "learning_rate": 2.7937532435425985e-06, + "loss": 0.2633, + "step": 10433 + }, + { + "epoch": 0.66, + "grad_norm": 2.1242294696964263, + "learning_rate": 2.792839324243386e-06, + "loss": 0.2756, + "step": 10434 + }, + { + "epoch": 0.66, + "grad_norm": 2.6295723493836345, + "learning_rate": 2.791925496523191e-06, + "loss": 0.2699, + "step": 10435 + }, + { + "epoch": 0.66, + "grad_norm": 2.182514401554479, + "learning_rate": 2.7910117604199305e-06, + "loss": 0.2775, + "step": 10436 + }, + { + "epoch": 0.66, + "grad_norm": 3.013516551641431, + "learning_rate": 2.7900981159715157e-06, + "loss": 0.25, + "step": 10437 + }, + { + "epoch": 0.66, + "grad_norm": 2.1994852426873885, + "learning_rate": 2.7891845632158527e-06, + "loss": 0.2791, + "step": 10438 + }, + { + "epoch": 0.66, + "grad_norm": 2.345011476749204, + "learning_rate": 2.788271102190851e-06, + "loss": 0.262, + "step": 10439 + }, + { + "epoch": 0.66, + "grad_norm": 2.1211271212366243, + "learning_rate": 2.7873577329344105e-06, + "loss": 0.2467, + "step": 10440 + }, + { + "epoch": 0.66, + "grad_norm": 1.490889428901454, + "learning_rate": 2.786444455484425e-06, + "loss": 0.2579, + "step": 10441 + }, + { + "epoch": 0.66, + "grad_norm": 3.436044397374049, + "learning_rate": 2.785531269878791e-06, + "loss": 0.271, + "step": 10442 + }, + { + "epoch": 0.66, + "grad_norm": 4.416322267236971, + "learning_rate": 2.784618176155399e-06, + "loss": 0.2669, + "step": 10443 + }, + { + "epoch": 0.66, + "grad_norm": 2.2415655374759598, + "learning_rate": 2.783705174352135e-06, + "loss": 0.2725, + "step": 10444 + }, + { + "epoch": 0.66, + "grad_norm": 2.3137567313680365, + "learning_rate": 2.78279226450688e-06, + "loss": 0.2933, + "step": 10445 + }, + { + "epoch": 0.66, + "grad_norm": 2.787664114700681, + "learning_rate": 2.7818794466575095e-06, + "loss": 0.2632, + "step": 10446 + }, + { + "epoch": 0.66, + "grad_norm": 2.5499048197975385, + "learning_rate": 2.7809667208419034e-06, + "loss": 0.2829, + "step": 10447 + }, + { + "epoch": 0.66, + "grad_norm": 4.293831891826393, + "learning_rate": 2.7800540870979287e-06, + "loss": 0.2886, + "step": 10448 + }, + { + "epoch": 0.66, + "grad_norm": 1.2140332473576787, + "learning_rate": 2.7791415454634507e-06, + "loss": 0.2527, + "step": 10449 + }, + { + "epoch": 0.66, + "grad_norm": 1.3097334048601796, + "learning_rate": 2.778229095976336e-06, + "loss": 0.2625, + "step": 10450 + }, + { + "epoch": 0.66, + "grad_norm": 3.0738810231232616, + "learning_rate": 2.7773167386744432e-06, + "loss": 0.2872, + "step": 10451 + }, + { + "epoch": 0.66, + "grad_norm": 2.570713387112575, + "learning_rate": 2.7764044735956275e-06, + "loss": 0.265, + "step": 10452 + }, + { + "epoch": 0.66, + "grad_norm": 1.7532889903374367, + "learning_rate": 2.775492300777739e-06, + "loss": 0.2567, + "step": 10453 + }, + { + "epoch": 0.66, + "grad_norm": 2.0799293435877386, + "learning_rate": 2.774580220258625e-06, + "loss": 0.2694, + "step": 10454 + }, + { + "epoch": 0.66, + "grad_norm": 2.0581896280321694, + "learning_rate": 2.773668232076132e-06, + "loss": 0.2703, + "step": 10455 + }, + { + "epoch": 0.66, + "grad_norm": 1.9368825258630105, + "learning_rate": 2.7727563362680965e-06, + "loss": 0.2723, + "step": 10456 + }, + { + "epoch": 0.66, + "grad_norm": 2.7194825863986827, + "learning_rate": 2.771844532872359e-06, + "loss": 0.2633, + "step": 10457 + }, + { + "epoch": 0.66, + "grad_norm": 1.6965752359933366, + "learning_rate": 2.770932821926747e-06, + "loss": 0.2712, + "step": 10458 + }, + { + "epoch": 0.66, + "grad_norm": 2.9523782752312764, + "learning_rate": 2.7700212034690933e-06, + "loss": 0.3004, + "step": 10459 + }, + { + "epoch": 0.66, + "grad_norm": 0.6194571393296509, + "learning_rate": 2.769109677537222e-06, + "loss": 0.475, + "step": 10460 + }, + { + "epoch": 0.66, + "grad_norm": 7.743386645198735, + "learning_rate": 2.7681982441689513e-06, + "loss": 0.2706, + "step": 10461 + }, + { + "epoch": 0.66, + "grad_norm": 2.4932330766273485, + "learning_rate": 2.7672869034020978e-06, + "loss": 0.2814, + "step": 10462 + }, + { + "epoch": 0.66, + "grad_norm": 4.197278076937047, + "learning_rate": 2.766375655274479e-06, + "loss": 0.2517, + "step": 10463 + }, + { + "epoch": 0.66, + "grad_norm": 1.9752372692344045, + "learning_rate": 2.765464499823899e-06, + "loss": 0.2739, + "step": 10464 + }, + { + "epoch": 0.66, + "grad_norm": 1.5880826352631157, + "learning_rate": 2.7645534370881682e-06, + "loss": 0.2636, + "step": 10465 + }, + { + "epoch": 0.66, + "grad_norm": 2.308944477965517, + "learning_rate": 2.7636424671050843e-06, + "loss": 0.2867, + "step": 10466 + }, + { + "epoch": 0.66, + "grad_norm": 2.810625758286272, + "learning_rate": 2.762731589912448e-06, + "loss": 0.2854, + "step": 10467 + }, + { + "epoch": 0.66, + "grad_norm": 2.024849569191932, + "learning_rate": 2.7618208055480523e-06, + "loss": 0.2541, + "step": 10468 + }, + { + "epoch": 0.66, + "grad_norm": 1.2611604161150793, + "learning_rate": 2.7609101140496863e-06, + "loss": 0.5138, + "step": 10469 + }, + { + "epoch": 0.66, + "grad_norm": 2.8194719512935786, + "learning_rate": 2.7599995154551352e-06, + "loss": 0.2866, + "step": 10470 + }, + { + "epoch": 0.66, + "grad_norm": 2.858857505565335, + "learning_rate": 2.7590890098021828e-06, + "loss": 0.2742, + "step": 10471 + }, + { + "epoch": 0.66, + "grad_norm": 2.101557608251054, + "learning_rate": 2.75817859712861e-06, + "loss": 0.2592, + "step": 10472 + }, + { + "epoch": 0.66, + "grad_norm": 2.183406893174561, + "learning_rate": 2.757268277472188e-06, + "loss": 0.2591, + "step": 10473 + }, + { + "epoch": 0.66, + "grad_norm": 2.952509958724208, + "learning_rate": 2.7563580508706877e-06, + "loss": 0.2596, + "step": 10474 + }, + { + "epoch": 0.66, + "grad_norm": 2.351708371100462, + "learning_rate": 2.755447917361879e-06, + "loss": 0.2709, + "step": 10475 + }, + { + "epoch": 0.66, + "grad_norm": 4.05542319249982, + "learning_rate": 2.754537876983523e-06, + "loss": 0.2849, + "step": 10476 + }, + { + "epoch": 0.66, + "grad_norm": 3.067770469205034, + "learning_rate": 2.753627929773377e-06, + "loss": 0.2716, + "step": 10477 + }, + { + "epoch": 0.66, + "grad_norm": 1.784719241808555, + "learning_rate": 2.7527180757691973e-06, + "loss": 0.2702, + "step": 10478 + }, + { + "epoch": 0.66, + "grad_norm": 2.4259988908413455, + "learning_rate": 2.7518083150087395e-06, + "loss": 0.2831, + "step": 10479 + }, + { + "epoch": 0.66, + "grad_norm": 1.6593522983902174, + "learning_rate": 2.750898647529747e-06, + "loss": 0.2487, + "step": 10480 + }, + { + "epoch": 0.66, + "grad_norm": 1.7200176513562162, + "learning_rate": 2.7499890733699645e-06, + "loss": 0.2582, + "step": 10481 + }, + { + "epoch": 0.66, + "grad_norm": 4.286478809297931, + "learning_rate": 2.74907959256713e-06, + "loss": 0.258, + "step": 10482 + }, + { + "epoch": 0.66, + "grad_norm": 1.8739935265522478, + "learning_rate": 2.748170205158984e-06, + "loss": 0.2632, + "step": 10483 + }, + { + "epoch": 0.66, + "grad_norm": 2.6781717455937804, + "learning_rate": 2.747260911183255e-06, + "loss": 0.2756, + "step": 10484 + }, + { + "epoch": 0.66, + "grad_norm": 1.75517122915535, + "learning_rate": 2.7463517106776704e-06, + "loss": 0.256, + "step": 10485 + }, + { + "epoch": 0.66, + "grad_norm": 2.2610652072859203, + "learning_rate": 2.7454426036799566e-06, + "loss": 0.2699, + "step": 10486 + }, + { + "epoch": 0.66, + "grad_norm": 2.1226346728576484, + "learning_rate": 2.7445335902278347e-06, + "loss": 0.2606, + "step": 10487 + }, + { + "epoch": 0.66, + "grad_norm": 3.2975074133000453, + "learning_rate": 2.7436246703590206e-06, + "loss": 0.2949, + "step": 10488 + }, + { + "epoch": 0.66, + "grad_norm": 35.896181409660656, + "learning_rate": 2.742715844111228e-06, + "loss": 0.2694, + "step": 10489 + }, + { + "epoch": 0.66, + "grad_norm": 2.3142204998937435, + "learning_rate": 2.7418071115221613e-06, + "loss": 0.2709, + "step": 10490 + }, + { + "epoch": 0.66, + "grad_norm": 1.391682267713271, + "learning_rate": 2.740898472629531e-06, + "loss": 0.2739, + "step": 10491 + }, + { + "epoch": 0.66, + "grad_norm": 1.6642260926130596, + "learning_rate": 2.7399899274710346e-06, + "loss": 0.2461, + "step": 10492 + }, + { + "epoch": 0.66, + "grad_norm": 2.789876276801303, + "learning_rate": 2.7390814760843695e-06, + "loss": 0.2763, + "step": 10493 + }, + { + "epoch": 0.66, + "grad_norm": 1.5921129456610974, + "learning_rate": 2.738173118507229e-06, + "loss": 0.2837, + "step": 10494 + }, + { + "epoch": 0.66, + "grad_norm": 2.2845068057407825, + "learning_rate": 2.7372648547773063e-06, + "loss": 0.263, + "step": 10495 + }, + { + "epoch": 0.66, + "grad_norm": 3.989157271413783, + "learning_rate": 2.736356684932283e-06, + "loss": 0.2817, + "step": 10496 + }, + { + "epoch": 0.66, + "grad_norm": 4.220640561908065, + "learning_rate": 2.7354486090098414e-06, + "loss": 0.254, + "step": 10497 + }, + { + "epoch": 0.66, + "grad_norm": 1.5036794369636048, + "learning_rate": 2.734540627047658e-06, + "loss": 0.2701, + "step": 10498 + }, + { + "epoch": 0.66, + "grad_norm": 2.7119482713201197, + "learning_rate": 2.7336327390834093e-06, + "loss": 0.2751, + "step": 10499 + }, + { + "epoch": 0.66, + "grad_norm": 2.766943979785282, + "learning_rate": 2.7327249451547642e-06, + "loss": 0.2656, + "step": 10500 + }, + { + "epoch": 0.66, + "grad_norm": 2.3265290673315775, + "learning_rate": 2.7318172452993864e-06, + "loss": 0.2688, + "step": 10501 + }, + { + "epoch": 0.66, + "grad_norm": 3.4995671438417246, + "learning_rate": 2.7309096395549395e-06, + "loss": 0.2785, + "step": 10502 + }, + { + "epoch": 0.66, + "grad_norm": 1.7718723163471015, + "learning_rate": 2.730002127959084e-06, + "loss": 0.2642, + "step": 10503 + }, + { + "epoch": 0.66, + "grad_norm": 1.7948091013136696, + "learning_rate": 2.729094710549472e-06, + "loss": 0.2661, + "step": 10504 + }, + { + "epoch": 0.66, + "grad_norm": 2.3992510558784863, + "learning_rate": 2.728187387363754e-06, + "loss": 0.266, + "step": 10505 + }, + { + "epoch": 0.66, + "grad_norm": 2.4472813449806115, + "learning_rate": 2.727280158439575e-06, + "loss": 0.2615, + "step": 10506 + }, + { + "epoch": 0.66, + "grad_norm": 3.843309802113135, + "learning_rate": 2.726373023814581e-06, + "loss": 0.2763, + "step": 10507 + }, + { + "epoch": 0.66, + "grad_norm": 1.76893973000872, + "learning_rate": 2.7254659835264064e-06, + "loss": 0.2833, + "step": 10508 + }, + { + "epoch": 0.66, + "grad_norm": 2.059903131021249, + "learning_rate": 2.7245590376126895e-06, + "loss": 0.2599, + "step": 10509 + }, + { + "epoch": 0.66, + "grad_norm": 2.791626822790956, + "learning_rate": 2.7236521861110586e-06, + "loss": 0.2861, + "step": 10510 + }, + { + "epoch": 0.66, + "grad_norm": 5.1410902440432285, + "learning_rate": 2.722745429059144e-06, + "loss": 0.2682, + "step": 10511 + }, + { + "epoch": 0.66, + "grad_norm": 1.5038795765002517, + "learning_rate": 2.721838766494566e-06, + "loss": 0.2631, + "step": 10512 + }, + { + "epoch": 0.66, + "grad_norm": 2.661065644468989, + "learning_rate": 2.720932198454944e-06, + "loss": 0.2746, + "step": 10513 + }, + { + "epoch": 0.66, + "grad_norm": 7.049903290024286, + "learning_rate": 2.720025724977892e-06, + "loss": 0.2755, + "step": 10514 + }, + { + "epoch": 0.66, + "grad_norm": 2.268990141101887, + "learning_rate": 2.719119346101023e-06, + "loss": 0.2765, + "step": 10515 + }, + { + "epoch": 0.66, + "grad_norm": 2.02605309083175, + "learning_rate": 2.7182130618619423e-06, + "loss": 0.2684, + "step": 10516 + }, + { + "epoch": 0.66, + "grad_norm": 1.6260057395368743, + "learning_rate": 2.7173068722982566e-06, + "loss": 0.2691, + "step": 10517 + }, + { + "epoch": 0.66, + "grad_norm": 2.3232572660315918, + "learning_rate": 2.716400777447561e-06, + "loss": 0.274, + "step": 10518 + }, + { + "epoch": 0.66, + "grad_norm": 5.1385260684203, + "learning_rate": 2.7154947773474556e-06, + "loss": 0.2659, + "step": 10519 + }, + { + "epoch": 0.66, + "grad_norm": 1.6609671404572608, + "learning_rate": 2.7145888720355297e-06, + "loss": 0.2648, + "step": 10520 + }, + { + "epoch": 0.66, + "grad_norm": 2.8010479623478184, + "learning_rate": 2.71368306154937e-06, + "loss": 0.2567, + "step": 10521 + }, + { + "epoch": 0.66, + "grad_norm": 1.8557498651632571, + "learning_rate": 2.7127773459265604e-06, + "loss": 0.2804, + "step": 10522 + }, + { + "epoch": 0.66, + "grad_norm": 1.6119371441864176, + "learning_rate": 2.71187172520468e-06, + "loss": 0.2511, + "step": 10523 + }, + { + "epoch": 0.66, + "grad_norm": 3.1180144127404255, + "learning_rate": 2.710966199421309e-06, + "loss": 0.2618, + "step": 10524 + }, + { + "epoch": 0.66, + "grad_norm": 2.1346526102771137, + "learning_rate": 2.7100607686140155e-06, + "loss": 0.2676, + "step": 10525 + }, + { + "epoch": 0.66, + "grad_norm": 3.39223189442905, + "learning_rate": 2.709155432820366e-06, + "loss": 0.2747, + "step": 10526 + }, + { + "epoch": 0.66, + "grad_norm": 0.5559264437866219, + "learning_rate": 2.7082501920779293e-06, + "loss": 0.4723, + "step": 10527 + }, + { + "epoch": 0.66, + "grad_norm": 1.5950884134719407, + "learning_rate": 2.707345046424262e-06, + "loss": 0.2625, + "step": 10528 + }, + { + "epoch": 0.66, + "grad_norm": 2.627215776788749, + "learning_rate": 2.706439995896921e-06, + "loss": 0.2647, + "step": 10529 + }, + { + "epoch": 0.66, + "grad_norm": 27.149907227318852, + "learning_rate": 2.705535040533457e-06, + "loss": 0.275, + "step": 10530 + }, + { + "epoch": 0.66, + "grad_norm": 2.0678446646933226, + "learning_rate": 2.7046301803714194e-06, + "loss": 0.2686, + "step": 10531 + }, + { + "epoch": 0.66, + "grad_norm": 3.4813309748440227, + "learning_rate": 2.703725415448354e-06, + "loss": 0.2779, + "step": 10532 + }, + { + "epoch": 0.66, + "grad_norm": 2.2494720936144463, + "learning_rate": 2.7028207458017996e-06, + "loss": 0.2647, + "step": 10533 + }, + { + "epoch": 0.66, + "grad_norm": 3.0571871680881295, + "learning_rate": 2.701916171469292e-06, + "loss": 0.2637, + "step": 10534 + }, + { + "epoch": 0.66, + "grad_norm": 2.8114706590401224, + "learning_rate": 2.7010116924883654e-06, + "loss": 0.2637, + "step": 10535 + }, + { + "epoch": 0.66, + "grad_norm": 2.136596359366606, + "learning_rate": 2.7001073088965467e-06, + "loss": 0.2782, + "step": 10536 + }, + { + "epoch": 0.66, + "grad_norm": 1.7784525920721663, + "learning_rate": 2.699203020731362e-06, + "loss": 0.2686, + "step": 10537 + }, + { + "epoch": 0.66, + "grad_norm": 1.7372657869244117, + "learning_rate": 2.6982988280303255e-06, + "loss": 0.2945, + "step": 10538 + }, + { + "epoch": 0.66, + "grad_norm": 2.6775509141565044, + "learning_rate": 2.6973947308309647e-06, + "loss": 0.2817, + "step": 10539 + }, + { + "epoch": 0.66, + "grad_norm": 3.6063032286178625, + "learning_rate": 2.6964907291707844e-06, + "loss": 0.2749, + "step": 10540 + }, + { + "epoch": 0.66, + "grad_norm": 3.037502628470929, + "learning_rate": 2.6955868230872963e-06, + "loss": 0.2749, + "step": 10541 + }, + { + "epoch": 0.66, + "grad_norm": 2.3071206627776366, + "learning_rate": 2.6946830126180016e-06, + "loss": 0.2795, + "step": 10542 + }, + { + "epoch": 0.66, + "grad_norm": 3.5233088589992856, + "learning_rate": 2.6937792978004056e-06, + "loss": 0.2677, + "step": 10543 + }, + { + "epoch": 0.66, + "grad_norm": 2.1354782356786717, + "learning_rate": 2.6928756786720026e-06, + "loss": 0.2873, + "step": 10544 + }, + { + "epoch": 0.66, + "grad_norm": 1.4270493297206324, + "learning_rate": 2.691972155270286e-06, + "loss": 0.2619, + "step": 10545 + }, + { + "epoch": 0.66, + "grad_norm": 2.020419575907655, + "learning_rate": 2.69106872763274e-06, + "loss": 0.2749, + "step": 10546 + }, + { + "epoch": 0.66, + "grad_norm": 1.7673720866108846, + "learning_rate": 2.6901653957968577e-06, + "loss": 0.2802, + "step": 10547 + }, + { + "epoch": 0.66, + "grad_norm": 2.578574726522088, + "learning_rate": 2.6892621598001157e-06, + "loss": 0.2695, + "step": 10548 + }, + { + "epoch": 0.66, + "grad_norm": 1.9129988722106914, + "learning_rate": 2.6883590196799913e-06, + "loss": 0.2724, + "step": 10549 + }, + { + "epoch": 0.66, + "grad_norm": 2.9851458303620775, + "learning_rate": 2.687455975473955e-06, + "loss": 0.275, + "step": 10550 + }, + { + "epoch": 0.66, + "grad_norm": 3.2185346204376484, + "learning_rate": 2.6865530272194796e-06, + "loss": 0.2657, + "step": 10551 + }, + { + "epoch": 0.66, + "grad_norm": 2.017259300764955, + "learning_rate": 2.6856501749540287e-06, + "loss": 0.2625, + "step": 10552 + }, + { + "epoch": 0.66, + "grad_norm": 1.8912869264547008, + "learning_rate": 2.6847474187150603e-06, + "loss": 0.256, + "step": 10553 + }, + { + "epoch": 0.66, + "grad_norm": 1.946934685722033, + "learning_rate": 2.683844758540034e-06, + "loss": 0.2483, + "step": 10554 + }, + { + "epoch": 0.66, + "grad_norm": 2.2920732445097354, + "learning_rate": 2.682942194466405e-06, + "loss": 0.2595, + "step": 10555 + }, + { + "epoch": 0.66, + "grad_norm": 0.5986369436181105, + "learning_rate": 2.682039726531619e-06, + "loss": 0.482, + "step": 10556 + }, + { + "epoch": 0.66, + "grad_norm": 4.0339234045457175, + "learning_rate": 2.6811373547731224e-06, + "loss": 0.2664, + "step": 10557 + }, + { + "epoch": 0.66, + "grad_norm": 1.6464528118677992, + "learning_rate": 2.680235079228354e-06, + "loss": 0.2931, + "step": 10558 + }, + { + "epoch": 0.66, + "grad_norm": 2.344006149002412, + "learning_rate": 2.6793328999347546e-06, + "loss": 0.2614, + "step": 10559 + }, + { + "epoch": 0.66, + "grad_norm": 0.5891767476746761, + "learning_rate": 2.6784308169297525e-06, + "loss": 0.4672, + "step": 10560 + }, + { + "epoch": 0.66, + "grad_norm": 1.2521914885194227, + "learning_rate": 2.677528830250782e-06, + "loss": 0.2643, + "step": 10561 + }, + { + "epoch": 0.66, + "grad_norm": 0.5716718943515395, + "learning_rate": 2.6766269399352628e-06, + "loss": 0.4468, + "step": 10562 + }, + { + "epoch": 0.66, + "grad_norm": 1.8879084440995968, + "learning_rate": 2.6757251460206215e-06, + "loss": 0.2757, + "step": 10563 + }, + { + "epoch": 0.66, + "grad_norm": 1.6270430444033495, + "learning_rate": 2.6748234485442713e-06, + "loss": 0.2806, + "step": 10564 + }, + { + "epoch": 0.66, + "grad_norm": 2.943469820556087, + "learning_rate": 2.6739218475436267e-06, + "loss": 0.2607, + "step": 10565 + }, + { + "epoch": 0.66, + "grad_norm": 1.8286178782546847, + "learning_rate": 2.6730203430560946e-06, + "loss": 0.2743, + "step": 10566 + }, + { + "epoch": 0.66, + "grad_norm": 1.9813317388532967, + "learning_rate": 2.6721189351190835e-06, + "loss": 0.2587, + "step": 10567 + }, + { + "epoch": 0.66, + "grad_norm": 1.6332584518626676, + "learning_rate": 2.6712176237699907e-06, + "loss": 0.2704, + "step": 10568 + }, + { + "epoch": 0.66, + "grad_norm": 2.8949438163141363, + "learning_rate": 2.6703164090462164e-06, + "loss": 0.2613, + "step": 10569 + }, + { + "epoch": 0.66, + "grad_norm": 1.4791171219959713, + "learning_rate": 2.66941529098515e-06, + "loss": 0.2728, + "step": 10570 + }, + { + "epoch": 0.66, + "grad_norm": 2.0272497452550495, + "learning_rate": 2.668514269624186e-06, + "loss": 0.253, + "step": 10571 + }, + { + "epoch": 0.66, + "grad_norm": 2.5323856618951797, + "learning_rate": 2.6676133450007053e-06, + "loss": 0.267, + "step": 10572 + }, + { + "epoch": 0.66, + "grad_norm": 2.572834718520556, + "learning_rate": 2.66671251715209e-06, + "loss": 0.2722, + "step": 10573 + }, + { + "epoch": 0.66, + "grad_norm": 1.872051124777162, + "learning_rate": 2.6658117861157146e-06, + "loss": 0.2553, + "step": 10574 + }, + { + "epoch": 0.67, + "grad_norm": 2.555297755025289, + "learning_rate": 2.6649111519289537e-06, + "loss": 0.281, + "step": 10575 + }, + { + "epoch": 0.67, + "grad_norm": 1.6974026491636993, + "learning_rate": 2.66401061462918e-06, + "loss": 0.2612, + "step": 10576 + }, + { + "epoch": 0.67, + "grad_norm": 2.7852817265602523, + "learning_rate": 2.663110174253754e-06, + "loss": 0.295, + "step": 10577 + }, + { + "epoch": 0.67, + "grad_norm": 1.4251423398342096, + "learning_rate": 2.6622098308400364e-06, + "loss": 0.286, + "step": 10578 + }, + { + "epoch": 0.67, + "grad_norm": 1.9080344692995987, + "learning_rate": 2.6613095844253866e-06, + "loss": 0.2485, + "step": 10579 + }, + { + "epoch": 0.67, + "grad_norm": 1.5804396059148693, + "learning_rate": 2.6604094350471564e-06, + "loss": 0.2516, + "step": 10580 + }, + { + "epoch": 0.67, + "grad_norm": 2.6728663237700423, + "learning_rate": 2.6595093827426942e-06, + "loss": 0.2954, + "step": 10581 + }, + { + "epoch": 0.67, + "grad_norm": 2.7142380925622875, + "learning_rate": 2.6586094275493435e-06, + "loss": 0.2792, + "step": 10582 + }, + { + "epoch": 0.67, + "grad_norm": 1.6355585114588722, + "learning_rate": 2.6577095695044452e-06, + "loss": 0.254, + "step": 10583 + }, + { + "epoch": 0.67, + "grad_norm": 2.617590949599532, + "learning_rate": 2.65680980864534e-06, + "loss": 0.2618, + "step": 10584 + }, + { + "epoch": 0.67, + "grad_norm": 4.231354730375861, + "learning_rate": 2.655910145009358e-06, + "loss": 0.2841, + "step": 10585 + }, + { + "epoch": 0.67, + "grad_norm": 3.0582960429211345, + "learning_rate": 2.6550105786338255e-06, + "loss": 0.2628, + "step": 10586 + }, + { + "epoch": 0.67, + "grad_norm": 2.722263420641252, + "learning_rate": 2.6541111095560713e-06, + "loss": 0.2611, + "step": 10587 + }, + { + "epoch": 0.67, + "grad_norm": 1.7153358268931276, + "learning_rate": 2.6532117378134138e-06, + "loss": 0.2621, + "step": 10588 + }, + { + "epoch": 0.67, + "grad_norm": 1.917108995509193, + "learning_rate": 2.6523124634431698e-06, + "loss": 0.2591, + "step": 10589 + }, + { + "epoch": 0.67, + "grad_norm": 1.4969706127629552, + "learning_rate": 2.6514132864826477e-06, + "loss": 0.2656, + "step": 10590 + }, + { + "epoch": 0.67, + "grad_norm": 4.9993401405330005, + "learning_rate": 2.6505142069691636e-06, + "loss": 0.2675, + "step": 10591 + }, + { + "epoch": 0.67, + "grad_norm": 2.058974857078737, + "learning_rate": 2.6496152249400187e-06, + "loss": 0.2601, + "step": 10592 + }, + { + "epoch": 0.67, + "grad_norm": 1.701421722075308, + "learning_rate": 2.648716340432512e-06, + "loss": 0.2703, + "step": 10593 + }, + { + "epoch": 0.67, + "grad_norm": 1.6556566591839421, + "learning_rate": 2.647817553483939e-06, + "loss": 0.2735, + "step": 10594 + }, + { + "epoch": 0.67, + "grad_norm": 1.7339461124670978, + "learning_rate": 2.646918864131596e-06, + "loss": 0.261, + "step": 10595 + }, + { + "epoch": 0.67, + "grad_norm": 2.31932599983908, + "learning_rate": 2.6460202724127693e-06, + "loss": 0.2598, + "step": 10596 + }, + { + "epoch": 0.67, + "grad_norm": 1.7623526986928681, + "learning_rate": 2.645121778364742e-06, + "loss": 0.2889, + "step": 10597 + }, + { + "epoch": 0.67, + "grad_norm": 2.1739857003218686, + "learning_rate": 2.644223382024791e-06, + "loss": 0.2743, + "step": 10598 + }, + { + "epoch": 0.67, + "grad_norm": 2.187989754663518, + "learning_rate": 2.6433250834301998e-06, + "loss": 0.2596, + "step": 10599 + }, + { + "epoch": 0.67, + "grad_norm": 3.4144606729819666, + "learning_rate": 2.6424268826182377e-06, + "loss": 0.2605, + "step": 10600 + }, + { + "epoch": 0.67, + "grad_norm": 2.590019006914779, + "learning_rate": 2.6415287796261707e-06, + "loss": 0.2806, + "step": 10601 + }, + { + "epoch": 0.67, + "grad_norm": 2.5424578813493213, + "learning_rate": 2.640630774491262e-06, + "loss": 0.2729, + "step": 10602 + }, + { + "epoch": 0.67, + "grad_norm": 2.7529690132675753, + "learning_rate": 2.639732867250776e-06, + "loss": 0.2659, + "step": 10603 + }, + { + "epoch": 0.67, + "grad_norm": 1.8640262209601803, + "learning_rate": 2.6388350579419646e-06, + "loss": 0.2669, + "step": 10604 + }, + { + "epoch": 0.67, + "grad_norm": 3.213846652562735, + "learning_rate": 2.637937346602079e-06, + "loss": 0.2823, + "step": 10605 + }, + { + "epoch": 0.67, + "grad_norm": 3.6387929469524227, + "learning_rate": 2.6370397332683684e-06, + "loss": 0.2788, + "step": 10606 + }, + { + "epoch": 0.67, + "grad_norm": 1.876297500016865, + "learning_rate": 2.636142217978078e-06, + "loss": 0.2602, + "step": 10607 + }, + { + "epoch": 0.67, + "grad_norm": 3.678408696886539, + "learning_rate": 2.6352448007684466e-06, + "loss": 0.2888, + "step": 10608 + }, + { + "epoch": 0.67, + "grad_norm": 1.9014697504973876, + "learning_rate": 2.634347481676708e-06, + "loss": 0.2664, + "step": 10609 + }, + { + "epoch": 0.67, + "grad_norm": 2.0913355328851155, + "learning_rate": 2.6334502607400923e-06, + "loss": 0.2727, + "step": 10610 + }, + { + "epoch": 0.67, + "grad_norm": 4.120909900148851, + "learning_rate": 2.6325531379958314e-06, + "loss": 0.2548, + "step": 10611 + }, + { + "epoch": 0.67, + "grad_norm": 1.507661159249386, + "learning_rate": 2.631656113481145e-06, + "loss": 0.2621, + "step": 10612 + }, + { + "epoch": 0.67, + "grad_norm": 2.049524855023579, + "learning_rate": 2.6307591872332514e-06, + "loss": 0.2565, + "step": 10613 + }, + { + "epoch": 0.67, + "grad_norm": 1.9332949503970627, + "learning_rate": 2.6298623592893676e-06, + "loss": 0.2923, + "step": 10614 + }, + { + "epoch": 0.67, + "grad_norm": 1.6203202242253394, + "learning_rate": 2.628965629686706e-06, + "loss": 0.2765, + "step": 10615 + }, + { + "epoch": 0.67, + "grad_norm": 1.835225537225523, + "learning_rate": 2.628068998462472e-06, + "loss": 0.2629, + "step": 10616 + }, + { + "epoch": 0.67, + "grad_norm": 2.322081098431167, + "learning_rate": 2.627172465653868e-06, + "loss": 0.2647, + "step": 10617 + }, + { + "epoch": 0.67, + "grad_norm": 2.072891592974846, + "learning_rate": 2.6262760312980914e-06, + "loss": 0.2528, + "step": 10618 + }, + { + "epoch": 0.67, + "grad_norm": 3.4635736856140795, + "learning_rate": 2.62537969543234e-06, + "loss": 0.2692, + "step": 10619 + }, + { + "epoch": 0.67, + "grad_norm": 1.6192277971923672, + "learning_rate": 2.6244834580938016e-06, + "loss": 0.2586, + "step": 10620 + }, + { + "epoch": 0.67, + "grad_norm": 6.180052056443318, + "learning_rate": 2.623587319319665e-06, + "loss": 0.2631, + "step": 10621 + }, + { + "epoch": 0.67, + "grad_norm": 2.1416244976145387, + "learning_rate": 2.6226912791471103e-06, + "loss": 0.2495, + "step": 10622 + }, + { + "epoch": 0.67, + "grad_norm": 1.4886024493190455, + "learning_rate": 2.6217953376133187e-06, + "loss": 0.2619, + "step": 10623 + }, + { + "epoch": 0.67, + "grad_norm": 2.510835148698963, + "learning_rate": 2.6208994947554626e-06, + "loss": 0.2615, + "step": 10624 + }, + { + "epoch": 0.67, + "grad_norm": 2.400618012820244, + "learning_rate": 2.620003750610712e-06, + "loss": 0.2911, + "step": 10625 + }, + { + "epoch": 0.67, + "grad_norm": 2.696640786017668, + "learning_rate": 2.6191081052162315e-06, + "loss": 0.2587, + "step": 10626 + }, + { + "epoch": 0.67, + "grad_norm": 1.2871950304100317, + "learning_rate": 2.6182125586091867e-06, + "loss": 0.3058, + "step": 10627 + }, + { + "epoch": 0.67, + "grad_norm": 1.6835677362491857, + "learning_rate": 2.6173171108267316e-06, + "loss": 0.2666, + "step": 10628 + }, + { + "epoch": 0.67, + "grad_norm": 10.260840460070348, + "learning_rate": 2.6164217619060234e-06, + "loss": 0.2759, + "step": 10629 + }, + { + "epoch": 0.67, + "grad_norm": 1.3818412361050458, + "learning_rate": 2.615526511884208e-06, + "loss": 0.2622, + "step": 10630 + }, + { + "epoch": 0.67, + "grad_norm": 1.523975425219532, + "learning_rate": 2.6146313607984355e-06, + "loss": 0.2578, + "step": 10631 + }, + { + "epoch": 0.67, + "grad_norm": 1.8168268645072094, + "learning_rate": 2.6137363086858435e-06, + "loss": 0.2821, + "step": 10632 + }, + { + "epoch": 0.67, + "grad_norm": 1.7028858622010878, + "learning_rate": 2.612841355583571e-06, + "loss": 0.2742, + "step": 10633 + }, + { + "epoch": 0.67, + "grad_norm": 2.2438395892141583, + "learning_rate": 2.6119465015287493e-06, + "loss": 0.2598, + "step": 10634 + }, + { + "epoch": 0.67, + "grad_norm": 1.5397188637521444, + "learning_rate": 2.6110517465585085e-06, + "loss": 0.2648, + "step": 10635 + }, + { + "epoch": 0.67, + "grad_norm": 1.8103106211290358, + "learning_rate": 2.610157090709976e-06, + "loss": 0.2634, + "step": 10636 + }, + { + "epoch": 0.67, + "grad_norm": 2.546450720595291, + "learning_rate": 2.6092625340202703e-06, + "loss": 0.2728, + "step": 10637 + }, + { + "epoch": 0.67, + "grad_norm": 2.8677470316918625, + "learning_rate": 2.6083680765265073e-06, + "loss": 0.266, + "step": 10638 + }, + { + "epoch": 0.67, + "grad_norm": 1.6099796704289258, + "learning_rate": 2.607473718265802e-06, + "loss": 0.2597, + "step": 10639 + }, + { + "epoch": 0.67, + "grad_norm": 1.8982228396210488, + "learning_rate": 2.6065794592752623e-06, + "loss": 0.2763, + "step": 10640 + }, + { + "epoch": 0.67, + "grad_norm": 1.7486909185828226, + "learning_rate": 2.6056852995919918e-06, + "loss": 0.268, + "step": 10641 + }, + { + "epoch": 0.67, + "grad_norm": 2.8147658964351727, + "learning_rate": 2.604791239253089e-06, + "loss": 0.2676, + "step": 10642 + }, + { + "epoch": 0.67, + "grad_norm": 16.94077740106074, + "learning_rate": 2.6038972782956516e-06, + "loss": 0.2558, + "step": 10643 + }, + { + "epoch": 0.67, + "grad_norm": 2.0246544360765015, + "learning_rate": 2.6030034167567752e-06, + "loss": 0.2569, + "step": 10644 + }, + { + "epoch": 0.67, + "grad_norm": 1.750501290100431, + "learning_rate": 2.6021096546735438e-06, + "loss": 0.2758, + "step": 10645 + }, + { + "epoch": 0.67, + "grad_norm": 1.559194212820961, + "learning_rate": 2.60121599208304e-06, + "loss": 0.2756, + "step": 10646 + }, + { + "epoch": 0.67, + "grad_norm": 1.4951093445217885, + "learning_rate": 2.600322429022347e-06, + "loss": 0.2699, + "step": 10647 + }, + { + "epoch": 0.67, + "grad_norm": 2.6597456654934737, + "learning_rate": 2.5994289655285396e-06, + "loss": 0.2889, + "step": 10648 + }, + { + "epoch": 0.67, + "grad_norm": 1.5161201312991648, + "learning_rate": 2.5985356016386883e-06, + "loss": 0.2552, + "step": 10649 + }, + { + "epoch": 0.67, + "grad_norm": 2.0867449553152335, + "learning_rate": 2.597642337389858e-06, + "loss": 0.2597, + "step": 10650 + }, + { + "epoch": 0.67, + "grad_norm": 2.2875975532603436, + "learning_rate": 2.596749172819114e-06, + "loss": 0.2604, + "step": 10651 + }, + { + "epoch": 0.67, + "grad_norm": 1.5626438460521015, + "learning_rate": 2.595856107963518e-06, + "loss": 0.2703, + "step": 10652 + }, + { + "epoch": 0.67, + "grad_norm": 3.6844249062005776, + "learning_rate": 2.5949631428601218e-06, + "loss": 0.2707, + "step": 10653 + }, + { + "epoch": 0.67, + "grad_norm": 2.084937417918076, + "learning_rate": 2.594070277545975e-06, + "loss": 0.2796, + "step": 10654 + }, + { + "epoch": 0.67, + "grad_norm": 1.587092909145393, + "learning_rate": 2.593177512058127e-06, + "loss": 0.2475, + "step": 10655 + }, + { + "epoch": 0.67, + "grad_norm": 0.6498549567788552, + "learning_rate": 2.5922848464336203e-06, + "loss": 0.4876, + "step": 10656 + }, + { + "epoch": 0.67, + "grad_norm": 3.9631345189313008, + "learning_rate": 2.5913922807094894e-06, + "loss": 0.2555, + "step": 10657 + }, + { + "epoch": 0.67, + "grad_norm": 3.348123254239563, + "learning_rate": 2.590499814922772e-06, + "loss": 0.2619, + "step": 10658 + }, + { + "epoch": 0.67, + "grad_norm": 1.464586238642134, + "learning_rate": 2.5896074491104963e-06, + "loss": 0.2535, + "step": 10659 + }, + { + "epoch": 0.67, + "grad_norm": 1.929509701228109, + "learning_rate": 2.58871518330969e-06, + "loss": 0.2537, + "step": 10660 + }, + { + "epoch": 0.67, + "grad_norm": 0.5794823207730463, + "learning_rate": 2.5878230175573743e-06, + "loss": 0.49, + "step": 10661 + }, + { + "epoch": 0.67, + "grad_norm": 2.523906905390259, + "learning_rate": 2.586930951890564e-06, + "loss": 0.2549, + "step": 10662 + }, + { + "epoch": 0.67, + "grad_norm": 2.0591089722327425, + "learning_rate": 2.5860389863462765e-06, + "loss": 0.2622, + "step": 10663 + }, + { + "epoch": 0.67, + "grad_norm": 4.3123396773365945, + "learning_rate": 2.5851471209615186e-06, + "loss": 0.2466, + "step": 10664 + }, + { + "epoch": 0.67, + "grad_norm": 1.9443522141802896, + "learning_rate": 2.5842553557732953e-06, + "loss": 0.2563, + "step": 10665 + }, + { + "epoch": 0.67, + "grad_norm": 1.588902182894885, + "learning_rate": 2.5833636908186064e-06, + "loss": 0.269, + "step": 10666 + }, + { + "epoch": 0.67, + "grad_norm": 5.672968990815488, + "learning_rate": 2.582472126134454e-06, + "loss": 0.2626, + "step": 10667 + }, + { + "epoch": 0.67, + "grad_norm": 2.8489354443635064, + "learning_rate": 2.581580661757826e-06, + "loss": 0.2627, + "step": 10668 + }, + { + "epoch": 0.67, + "grad_norm": 2.4598953094511153, + "learning_rate": 2.5806892977257126e-06, + "loss": 0.2863, + "step": 10669 + }, + { + "epoch": 0.67, + "grad_norm": 1.6698336809258087, + "learning_rate": 2.579798034075095e-06, + "loss": 0.2629, + "step": 10670 + }, + { + "epoch": 0.67, + "grad_norm": 2.629764415185163, + "learning_rate": 2.5789068708429576e-06, + "loss": 0.2706, + "step": 10671 + }, + { + "epoch": 0.67, + "grad_norm": 2.0127877473725233, + "learning_rate": 2.578015808066273e-06, + "loss": 0.2604, + "step": 10672 + }, + { + "epoch": 0.67, + "grad_norm": 2.507531804997822, + "learning_rate": 2.5771248457820165e-06, + "loss": 0.2522, + "step": 10673 + }, + { + "epoch": 0.67, + "grad_norm": 1.7626399370202608, + "learning_rate": 2.5762339840271513e-06, + "loss": 0.255, + "step": 10674 + }, + { + "epoch": 0.67, + "grad_norm": 1.5260960041805964, + "learning_rate": 2.575343222838645e-06, + "loss": 0.2523, + "step": 10675 + }, + { + "epoch": 0.67, + "grad_norm": 1.6774592924681375, + "learning_rate": 2.574452562253455e-06, + "loss": 0.2879, + "step": 10676 + }, + { + "epoch": 0.67, + "grad_norm": 2.077359334144614, + "learning_rate": 2.5735620023085367e-06, + "loss": 0.2625, + "step": 10677 + }, + { + "epoch": 0.67, + "grad_norm": 2.2258704980151838, + "learning_rate": 2.572671543040839e-06, + "loss": 0.2721, + "step": 10678 + }, + { + "epoch": 0.67, + "grad_norm": 4.357607266475697, + "learning_rate": 2.571781184487312e-06, + "loss": 0.2816, + "step": 10679 + }, + { + "epoch": 0.67, + "grad_norm": 1.4169846712490002, + "learning_rate": 2.570890926684895e-06, + "loss": 0.2798, + "step": 10680 + }, + { + "epoch": 0.67, + "grad_norm": 2.784672021914526, + "learning_rate": 2.57000076967053e-06, + "loss": 0.2625, + "step": 10681 + }, + { + "epoch": 0.67, + "grad_norm": 2.240635996638068, + "learning_rate": 2.569110713481147e-06, + "loss": 0.2633, + "step": 10682 + }, + { + "epoch": 0.67, + "grad_norm": 1.615192306625452, + "learning_rate": 2.56822075815368e-06, + "loss": 0.2609, + "step": 10683 + }, + { + "epoch": 0.67, + "grad_norm": 1.6451409067071499, + "learning_rate": 2.567330903725054e-06, + "loss": 0.276, + "step": 10684 + }, + { + "epoch": 0.67, + "grad_norm": 1.3289556990694675, + "learning_rate": 2.566441150232189e-06, + "loss": 0.276, + "step": 10685 + }, + { + "epoch": 0.67, + "grad_norm": 1.7213448987786697, + "learning_rate": 2.5655514977120013e-06, + "loss": 0.2786, + "step": 10686 + }, + { + "epoch": 0.67, + "grad_norm": 2.4652025240297863, + "learning_rate": 2.5646619462014062e-06, + "loss": 0.2752, + "step": 10687 + }, + { + "epoch": 0.67, + "grad_norm": 2.09302677785505, + "learning_rate": 2.5637724957373144e-06, + "loss": 0.2726, + "step": 10688 + }, + { + "epoch": 0.67, + "grad_norm": 2.7267881270737377, + "learning_rate": 2.5628831463566285e-06, + "loss": 0.2611, + "step": 10689 + }, + { + "epoch": 0.67, + "grad_norm": 7.744178529246718, + "learning_rate": 2.561993898096249e-06, + "loss": 0.2617, + "step": 10690 + }, + { + "epoch": 0.67, + "grad_norm": 1.8202378836545405, + "learning_rate": 2.5611047509930737e-06, + "loss": 0.259, + "step": 10691 + }, + { + "epoch": 0.67, + "grad_norm": 1.862782784274091, + "learning_rate": 2.560215705083995e-06, + "loss": 0.2537, + "step": 10692 + }, + { + "epoch": 0.67, + "grad_norm": 2.8719111448227204, + "learning_rate": 2.5593267604058998e-06, + "loss": 0.2604, + "step": 10693 + }, + { + "epoch": 0.67, + "grad_norm": 4.139683591121575, + "learning_rate": 2.558437916995671e-06, + "loss": 0.2696, + "step": 10694 + }, + { + "epoch": 0.67, + "grad_norm": 3.59879788854556, + "learning_rate": 2.5575491748901892e-06, + "loss": 0.2744, + "step": 10695 + }, + { + "epoch": 0.67, + "grad_norm": 1.9827703484918457, + "learning_rate": 2.5566605341263322e-06, + "loss": 0.2725, + "step": 10696 + }, + { + "epoch": 0.67, + "grad_norm": 1.7400888471913067, + "learning_rate": 2.55577199474097e-06, + "loss": 0.2694, + "step": 10697 + }, + { + "epoch": 0.67, + "grad_norm": 0.6260521221584945, + "learning_rate": 2.5548835567709672e-06, + "loss": 0.5226, + "step": 10698 + }, + { + "epoch": 0.67, + "grad_norm": 1.9467036804749311, + "learning_rate": 2.5539952202531905e-06, + "loss": 0.2886, + "step": 10699 + }, + { + "epoch": 0.67, + "grad_norm": 1.751481328040724, + "learning_rate": 2.5531069852244968e-06, + "loss": 0.2718, + "step": 10700 + }, + { + "epoch": 0.67, + "grad_norm": 0.5923047119665359, + "learning_rate": 2.5522188517217404e-06, + "loss": 0.4505, + "step": 10701 + }, + { + "epoch": 0.67, + "grad_norm": 1.921487192719859, + "learning_rate": 2.551330819781769e-06, + "loss": 0.261, + "step": 10702 + }, + { + "epoch": 0.67, + "grad_norm": 0.5990451619774673, + "learning_rate": 2.5504428894414323e-06, + "loss": 0.4841, + "step": 10703 + }, + { + "epoch": 0.67, + "grad_norm": 2.3902303604684936, + "learning_rate": 2.5495550607375723e-06, + "loss": 0.2637, + "step": 10704 + }, + { + "epoch": 0.67, + "grad_norm": 1.8548515651860293, + "learning_rate": 2.5486673337070254e-06, + "loss": 0.2663, + "step": 10705 + }, + { + "epoch": 0.67, + "grad_norm": 5.238012333062435, + "learning_rate": 2.5477797083866227e-06, + "loss": 0.2698, + "step": 10706 + }, + { + "epoch": 0.67, + "grad_norm": 1.7506722369020096, + "learning_rate": 2.5468921848131984e-06, + "loss": 0.2593, + "step": 10707 + }, + { + "epoch": 0.67, + "grad_norm": 1.8529942145505167, + "learning_rate": 2.546004763023574e-06, + "loss": 0.2758, + "step": 10708 + }, + { + "epoch": 0.67, + "grad_norm": 2.5379960918388584, + "learning_rate": 2.5451174430545704e-06, + "loss": 0.2713, + "step": 10709 + }, + { + "epoch": 0.67, + "grad_norm": 1.5288099794809744, + "learning_rate": 2.5442302249430027e-06, + "loss": 0.2624, + "step": 10710 + }, + { + "epoch": 0.67, + "grad_norm": 7.197753413877993, + "learning_rate": 2.5433431087256853e-06, + "loss": 0.2876, + "step": 10711 + }, + { + "epoch": 0.67, + "grad_norm": 1.7506673837411795, + "learning_rate": 2.542456094439427e-06, + "loss": 0.247, + "step": 10712 + }, + { + "epoch": 0.67, + "grad_norm": 1.4603022809404356, + "learning_rate": 2.5415691821210314e-06, + "loss": 0.2709, + "step": 10713 + }, + { + "epoch": 0.67, + "grad_norm": 1.8888431331421702, + "learning_rate": 2.540682371807295e-06, + "loss": 0.2782, + "step": 10714 + }, + { + "epoch": 0.67, + "grad_norm": 2.0378610460366953, + "learning_rate": 2.5397956635350164e-06, + "loss": 0.257, + "step": 10715 + }, + { + "epoch": 0.67, + "grad_norm": 2.4467611374502765, + "learning_rate": 2.5389090573409863e-06, + "loss": 0.2741, + "step": 10716 + }, + { + "epoch": 0.67, + "grad_norm": 1.68515593439781, + "learning_rate": 2.5380225532619886e-06, + "loss": 0.2544, + "step": 10717 + }, + { + "epoch": 0.67, + "grad_norm": 2.8409282143245345, + "learning_rate": 2.5371361513348103e-06, + "loss": 0.2771, + "step": 10718 + }, + { + "epoch": 0.67, + "grad_norm": 2.6940132298961257, + "learning_rate": 2.5362498515962253e-06, + "loss": 0.2586, + "step": 10719 + }, + { + "epoch": 0.67, + "grad_norm": 2.183619629229947, + "learning_rate": 2.5353636540830117e-06, + "loss": 0.2792, + "step": 10720 + }, + { + "epoch": 0.67, + "grad_norm": 1.510278437598951, + "learning_rate": 2.534477558831938e-06, + "loss": 0.259, + "step": 10721 + }, + { + "epoch": 0.67, + "grad_norm": 1.6693127198703324, + "learning_rate": 2.533591565879768e-06, + "loss": 0.265, + "step": 10722 + }, + { + "epoch": 0.67, + "grad_norm": 1.8263344850511583, + "learning_rate": 2.5327056752632674e-06, + "loss": 0.2586, + "step": 10723 + }, + { + "epoch": 0.67, + "grad_norm": 2.214349382760798, + "learning_rate": 2.5318198870191877e-06, + "loss": 0.2747, + "step": 10724 + }, + { + "epoch": 0.67, + "grad_norm": 1.6365207180987742, + "learning_rate": 2.530934201184287e-06, + "loss": 0.2497, + "step": 10725 + }, + { + "epoch": 0.67, + "grad_norm": 2.2915169082627025, + "learning_rate": 2.5300486177953123e-06, + "loss": 0.2671, + "step": 10726 + }, + { + "epoch": 0.67, + "grad_norm": 3.352748126876919, + "learning_rate": 2.5291631368890047e-06, + "loss": 0.2549, + "step": 10727 + }, + { + "epoch": 0.67, + "grad_norm": 2.764105496513473, + "learning_rate": 2.52827775850211e-06, + "loss": 0.2692, + "step": 10728 + }, + { + "epoch": 0.67, + "grad_norm": 1.7484368178982617, + "learning_rate": 2.5273924826713615e-06, + "loss": 0.2528, + "step": 10729 + }, + { + "epoch": 0.67, + "grad_norm": 1.6640790963285514, + "learning_rate": 2.526507309433488e-06, + "loss": 0.2734, + "step": 10730 + }, + { + "epoch": 0.67, + "grad_norm": 4.623934747375392, + "learning_rate": 2.5256222388252223e-06, + "loss": 0.2813, + "step": 10731 + }, + { + "epoch": 0.67, + "grad_norm": 1.731381905726239, + "learning_rate": 2.524737270883283e-06, + "loss": 0.2752, + "step": 10732 + }, + { + "epoch": 0.67, + "grad_norm": 3.9969171361101767, + "learning_rate": 2.5238524056443923e-06, + "loss": 0.2743, + "step": 10733 + }, + { + "epoch": 0.68, + "grad_norm": 0.6007300200548918, + "learning_rate": 2.522967643145263e-06, + "loss": 0.493, + "step": 10734 + }, + { + "epoch": 0.68, + "grad_norm": 3.0248368372994143, + "learning_rate": 2.5220829834226052e-06, + "loss": 0.2668, + "step": 10735 + }, + { + "epoch": 0.68, + "grad_norm": 2.9164470473306445, + "learning_rate": 2.521198426513127e-06, + "loss": 0.2797, + "step": 10736 + }, + { + "epoch": 0.68, + "grad_norm": 3.4311060110477762, + "learning_rate": 2.520313972453529e-06, + "loss": 0.2667, + "step": 10737 + }, + { + "epoch": 0.68, + "grad_norm": 5.356554789475532, + "learning_rate": 2.519429621280507e-06, + "loss": 0.2399, + "step": 10738 + }, + { + "epoch": 0.68, + "grad_norm": 2.921065855938856, + "learning_rate": 2.518545373030755e-06, + "loss": 0.2678, + "step": 10739 + }, + { + "epoch": 0.68, + "grad_norm": 2.897986968370867, + "learning_rate": 2.5176612277409663e-06, + "loss": 0.2874, + "step": 10740 + }, + { + "epoch": 0.68, + "grad_norm": 1.5972060760578377, + "learning_rate": 2.516777185447822e-06, + "loss": 0.2643, + "step": 10741 + }, + { + "epoch": 0.68, + "grad_norm": 1.8534673337783578, + "learning_rate": 2.5158932461880025e-06, + "loss": 0.2643, + "step": 10742 + }, + { + "epoch": 0.68, + "grad_norm": 1.9849075715174838, + "learning_rate": 2.5150094099981824e-06, + "loss": 0.25, + "step": 10743 + }, + { + "epoch": 0.68, + "grad_norm": 1.9097060867123994, + "learning_rate": 2.5141256769150384e-06, + "loss": 0.2774, + "step": 10744 + }, + { + "epoch": 0.68, + "grad_norm": 1.6541412249210272, + "learning_rate": 2.513242046975235e-06, + "loss": 0.2625, + "step": 10745 + }, + { + "epoch": 0.68, + "grad_norm": 1.9101433073194343, + "learning_rate": 2.5123585202154345e-06, + "loss": 0.2517, + "step": 10746 + }, + { + "epoch": 0.68, + "grad_norm": 4.61749849088729, + "learning_rate": 2.511475096672298e-06, + "loss": 0.2729, + "step": 10747 + }, + { + "epoch": 0.68, + "grad_norm": 1.8666139577702474, + "learning_rate": 2.5105917763824807e-06, + "loss": 0.2888, + "step": 10748 + }, + { + "epoch": 0.68, + "grad_norm": 3.5604030068521526, + "learning_rate": 2.5097085593826324e-06, + "loss": 0.2559, + "step": 10749 + }, + { + "epoch": 0.68, + "grad_norm": 1.6913263652043975, + "learning_rate": 2.5088254457093976e-06, + "loss": 0.2734, + "step": 10750 + }, + { + "epoch": 0.68, + "grad_norm": 1.887452588989263, + "learning_rate": 2.507942435399422e-06, + "loss": 0.2699, + "step": 10751 + }, + { + "epoch": 0.68, + "grad_norm": 1.9885788573119678, + "learning_rate": 2.5070595284893405e-06, + "loss": 0.2784, + "step": 10752 + }, + { + "epoch": 0.68, + "grad_norm": 2.321763075651108, + "learning_rate": 2.5061767250157883e-06, + "loss": 0.2862, + "step": 10753 + }, + { + "epoch": 0.68, + "grad_norm": 2.4527588060279713, + "learning_rate": 2.50529402501539e-06, + "loss": 0.2875, + "step": 10754 + }, + { + "epoch": 0.68, + "grad_norm": 2.9871044033792176, + "learning_rate": 2.5044114285247747e-06, + "loss": 0.2728, + "step": 10755 + }, + { + "epoch": 0.68, + "grad_norm": 4.427113811441325, + "learning_rate": 2.5035289355805634e-06, + "loss": 0.27, + "step": 10756 + }, + { + "epoch": 0.68, + "grad_norm": 2.5494336135191533, + "learning_rate": 2.502646546219371e-06, + "loss": 0.26, + "step": 10757 + }, + { + "epoch": 0.68, + "grad_norm": 1.7655865115779152, + "learning_rate": 2.501764260477807e-06, + "loss": 0.2568, + "step": 10758 + }, + { + "epoch": 0.68, + "grad_norm": 2.0906854762511573, + "learning_rate": 2.5008820783924838e-06, + "loss": 0.2565, + "step": 10759 + }, + { + "epoch": 0.68, + "grad_norm": 2.466375728740451, + "learning_rate": 2.5000000000000015e-06, + "loss": 0.2751, + "step": 10760 + }, + { + "epoch": 0.68, + "grad_norm": 1.912316393244981, + "learning_rate": 2.4991180253369594e-06, + "loss": 0.2599, + "step": 10761 + }, + { + "epoch": 0.68, + "grad_norm": 4.0501135923341485, + "learning_rate": 2.4982361544399507e-06, + "loss": 0.2634, + "step": 10762 + }, + { + "epoch": 0.68, + "grad_norm": 2.086258207425374, + "learning_rate": 2.497354387345568e-06, + "loss": 0.2649, + "step": 10763 + }, + { + "epoch": 0.68, + "grad_norm": 1.8297431359179088, + "learning_rate": 2.496472724090399e-06, + "loss": 0.2745, + "step": 10764 + }, + { + "epoch": 0.68, + "grad_norm": 1.8082842533895456, + "learning_rate": 2.495591164711023e-06, + "loss": 0.2486, + "step": 10765 + }, + { + "epoch": 0.68, + "grad_norm": 2.9832680845163195, + "learning_rate": 2.4947097092440158e-06, + "loss": 0.2703, + "step": 10766 + }, + { + "epoch": 0.68, + "grad_norm": 1.5536316284314864, + "learning_rate": 2.4938283577259555e-06, + "loss": 0.2626, + "step": 10767 + }, + { + "epoch": 0.68, + "grad_norm": 1.7983514948701425, + "learning_rate": 2.492947110193407e-06, + "loss": 0.2522, + "step": 10768 + }, + { + "epoch": 0.68, + "grad_norm": 1.964056558789052, + "learning_rate": 2.492065966682935e-06, + "loss": 0.2577, + "step": 10769 + }, + { + "epoch": 0.68, + "grad_norm": 0.6199639346245456, + "learning_rate": 2.4911849272311015e-06, + "loss": 0.4848, + "step": 10770 + }, + { + "epoch": 0.68, + "grad_norm": 2.102287851861108, + "learning_rate": 2.49030399187446e-06, + "loss": 0.2702, + "step": 10771 + }, + { + "epoch": 0.68, + "grad_norm": 1.610894320541659, + "learning_rate": 2.4894231606495663e-06, + "loss": 0.2717, + "step": 10772 + }, + { + "epoch": 0.68, + "grad_norm": 2.1510178553218715, + "learning_rate": 2.488542433592964e-06, + "loss": 0.2612, + "step": 10773 + }, + { + "epoch": 0.68, + "grad_norm": 2.5313128153843305, + "learning_rate": 2.487661810741196e-06, + "loss": 0.275, + "step": 10774 + }, + { + "epoch": 0.68, + "grad_norm": 1.9076731489355447, + "learning_rate": 2.486781292130803e-06, + "loss": 0.2698, + "step": 10775 + }, + { + "epoch": 0.68, + "grad_norm": 2.0592369070006264, + "learning_rate": 2.4859008777983183e-06, + "loss": 0.2503, + "step": 10776 + }, + { + "epoch": 0.68, + "grad_norm": 1.5654897475717329, + "learning_rate": 2.4850205677802703e-06, + "loss": 0.2477, + "step": 10777 + }, + { + "epoch": 0.68, + "grad_norm": 2.043550501093226, + "learning_rate": 2.484140362113187e-06, + "loss": 0.2689, + "step": 10778 + }, + { + "epoch": 0.68, + "grad_norm": 2.541976162692309, + "learning_rate": 2.4832602608335874e-06, + "loss": 0.2726, + "step": 10779 + }, + { + "epoch": 0.68, + "grad_norm": 3.6565962060903403, + "learning_rate": 2.482380263977992e-06, + "loss": 0.2785, + "step": 10780 + }, + { + "epoch": 0.68, + "grad_norm": 2.551375349168563, + "learning_rate": 2.481500371582911e-06, + "loss": 0.2544, + "step": 10781 + }, + { + "epoch": 0.68, + "grad_norm": 2.989536025504863, + "learning_rate": 2.4806205836848505e-06, + "loss": 0.2705, + "step": 10782 + }, + { + "epoch": 0.68, + "grad_norm": 3.8500592155614894, + "learning_rate": 2.4797409003203198e-06, + "loss": 0.2571, + "step": 10783 + }, + { + "epoch": 0.68, + "grad_norm": 2.7112224868963137, + "learning_rate": 2.4788613215258134e-06, + "loss": 0.2846, + "step": 10784 + }, + { + "epoch": 0.68, + "grad_norm": 1.6933863777923017, + "learning_rate": 2.4779818473378303e-06, + "loss": 0.266, + "step": 10785 + }, + { + "epoch": 0.68, + "grad_norm": 5.859174589384238, + "learning_rate": 2.4771024777928605e-06, + "loss": 0.2869, + "step": 10786 + }, + { + "epoch": 0.68, + "grad_norm": 1.708089033096962, + "learning_rate": 2.4762232129273876e-06, + "loss": 0.2466, + "step": 10787 + }, + { + "epoch": 0.68, + "grad_norm": 2.518585611091397, + "learning_rate": 2.475344052777898e-06, + "loss": 0.2499, + "step": 10788 + }, + { + "epoch": 0.68, + "grad_norm": 0.6049145839246542, + "learning_rate": 2.474464997380869e-06, + "loss": 0.4662, + "step": 10789 + }, + { + "epoch": 0.68, + "grad_norm": 2.2675838406970885, + "learning_rate": 2.4735860467727705e-06, + "loss": 0.267, + "step": 10790 + }, + { + "epoch": 0.68, + "grad_norm": 4.825313867014563, + "learning_rate": 2.4727072009900764e-06, + "loss": 0.2668, + "step": 10791 + }, + { + "epoch": 0.68, + "grad_norm": 3.6622286202561916, + "learning_rate": 2.4718284600692476e-06, + "loss": 0.2778, + "step": 10792 + }, + { + "epoch": 0.68, + "grad_norm": 1.9039837610390817, + "learning_rate": 2.4709498240467484e-06, + "loss": 0.2611, + "step": 10793 + }, + { + "epoch": 0.68, + "grad_norm": 6.9378279982357665, + "learning_rate": 2.470071292959033e-06, + "loss": 0.2796, + "step": 10794 + }, + { + "epoch": 0.68, + "grad_norm": 1.913454823643306, + "learning_rate": 2.469192866842552e-06, + "loss": 0.2654, + "step": 10795 + }, + { + "epoch": 0.68, + "grad_norm": 1.4094865781866206, + "learning_rate": 2.4683145457337553e-06, + "loss": 0.2606, + "step": 10796 + }, + { + "epoch": 0.68, + "grad_norm": 4.386602186222407, + "learning_rate": 2.467436329669085e-06, + "loss": 0.2776, + "step": 10797 + }, + { + "epoch": 0.68, + "grad_norm": 2.55838764175605, + "learning_rate": 2.4665582186849783e-06, + "loss": 0.2859, + "step": 10798 + }, + { + "epoch": 0.68, + "grad_norm": 2.9091561101391172, + "learning_rate": 2.465680212817871e-06, + "loss": 0.2752, + "step": 10799 + }, + { + "epoch": 0.68, + "grad_norm": 2.9170688023309563, + "learning_rate": 2.464802312104195e-06, + "loss": 0.2616, + "step": 10800 + }, + { + "epoch": 0.68, + "grad_norm": 1.857867731191885, + "learning_rate": 2.4639245165803732e-06, + "loss": 0.2678, + "step": 10801 + }, + { + "epoch": 0.68, + "grad_norm": 1.8543701442588305, + "learning_rate": 2.4630468262828283e-06, + "loss": 0.2689, + "step": 10802 + }, + { + "epoch": 0.68, + "grad_norm": 0.586511193132362, + "learning_rate": 2.4621692412479753e-06, + "loss": 0.455, + "step": 10803 + }, + { + "epoch": 0.68, + "grad_norm": 1.9110652553644543, + "learning_rate": 2.4612917615122293e-06, + "loss": 0.2768, + "step": 10804 + }, + { + "epoch": 0.68, + "grad_norm": 6.711422682568751, + "learning_rate": 2.460414387111998e-06, + "loss": 0.2612, + "step": 10805 + }, + { + "epoch": 0.68, + "grad_norm": 3.575483264792797, + "learning_rate": 2.4595371180836823e-06, + "loss": 0.272, + "step": 10806 + }, + { + "epoch": 0.68, + "grad_norm": 0.5967939552732896, + "learning_rate": 2.458659954463684e-06, + "loss": 0.4742, + "step": 10807 + }, + { + "epoch": 0.68, + "grad_norm": 1.608502248002645, + "learning_rate": 2.4577828962884005e-06, + "loss": 0.2655, + "step": 10808 + }, + { + "epoch": 0.68, + "grad_norm": 4.121806103608374, + "learning_rate": 2.45690594359422e-06, + "loss": 0.2745, + "step": 10809 + }, + { + "epoch": 0.68, + "grad_norm": 1.782328293865478, + "learning_rate": 2.4560290964175288e-06, + "loss": 0.2619, + "step": 10810 + }, + { + "epoch": 0.68, + "grad_norm": 2.3729383561164705, + "learning_rate": 2.4551523547947066e-06, + "loss": 0.2568, + "step": 10811 + }, + { + "epoch": 0.68, + "grad_norm": 1.8005132301573186, + "learning_rate": 2.454275718762136e-06, + "loss": 0.268, + "step": 10812 + }, + { + "epoch": 0.68, + "grad_norm": 2.2151613944442685, + "learning_rate": 2.4533991883561868e-06, + "loss": 0.2606, + "step": 10813 + }, + { + "epoch": 0.68, + "grad_norm": 2.80747473819906, + "learning_rate": 2.452522763613227e-06, + "loss": 0.2608, + "step": 10814 + }, + { + "epoch": 0.68, + "grad_norm": 1.666388877098076, + "learning_rate": 2.4516464445696223e-06, + "loss": 0.2583, + "step": 10815 + }, + { + "epoch": 0.68, + "grad_norm": 2.739491756727603, + "learning_rate": 2.4507702312617353e-06, + "loss": 0.2749, + "step": 10816 + }, + { + "epoch": 0.68, + "grad_norm": 8.52263683453807, + "learning_rate": 2.4498941237259195e-06, + "loss": 0.2863, + "step": 10817 + }, + { + "epoch": 0.68, + "grad_norm": 2.2901257801936823, + "learning_rate": 2.449018121998526e-06, + "loss": 0.2567, + "step": 10818 + }, + { + "epoch": 0.68, + "grad_norm": 2.4581399080578046, + "learning_rate": 2.4481422261158995e-06, + "loss": 0.269, + "step": 10819 + }, + { + "epoch": 0.68, + "grad_norm": 5.349425912986184, + "learning_rate": 2.4472664361143868e-06, + "loss": 0.2516, + "step": 10820 + }, + { + "epoch": 0.68, + "grad_norm": 1.2633289576747733, + "learning_rate": 2.4463907520303216e-06, + "loss": 0.2705, + "step": 10821 + }, + { + "epoch": 0.68, + "grad_norm": 2.1854781840997743, + "learning_rate": 2.4455151739000415e-06, + "loss": 0.2529, + "step": 10822 + }, + { + "epoch": 0.68, + "grad_norm": 1.9687167937030654, + "learning_rate": 2.4446397017598726e-06, + "loss": 0.2765, + "step": 10823 + }, + { + "epoch": 0.68, + "grad_norm": 1.8201049864582572, + "learning_rate": 2.443764335646143e-06, + "loss": 0.2695, + "step": 10824 + }, + { + "epoch": 0.68, + "grad_norm": 2.0189675172167174, + "learning_rate": 2.4428890755951717e-06, + "loss": 0.2641, + "step": 10825 + }, + { + "epoch": 0.68, + "grad_norm": 2.149857973922792, + "learning_rate": 2.4420139216432746e-06, + "loss": 0.2684, + "step": 10826 + }, + { + "epoch": 0.68, + "grad_norm": 5.691615332807052, + "learning_rate": 2.441138873826761e-06, + "loss": 0.286, + "step": 10827 + }, + { + "epoch": 0.68, + "grad_norm": 1.4865271893224243, + "learning_rate": 2.4402639321819433e-06, + "loss": 0.2649, + "step": 10828 + }, + { + "epoch": 0.68, + "grad_norm": 1.659360497673168, + "learning_rate": 2.4393890967451194e-06, + "loss": 0.2525, + "step": 10829 + }, + { + "epoch": 0.68, + "grad_norm": 3.738767329789025, + "learning_rate": 2.4385143675525917e-06, + "loss": 0.2873, + "step": 10830 + }, + { + "epoch": 0.68, + "grad_norm": 1.3378421747300495, + "learning_rate": 2.4376397446406508e-06, + "loss": 0.2767, + "step": 10831 + }, + { + "epoch": 0.68, + "grad_norm": 3.4810079656240105, + "learning_rate": 2.43676522804559e-06, + "loss": 0.2536, + "step": 10832 + }, + { + "epoch": 0.68, + "grad_norm": 1.7610103965553026, + "learning_rate": 2.435890817803693e-06, + "loss": 0.2519, + "step": 10833 + }, + { + "epoch": 0.68, + "grad_norm": 1.9028691179983896, + "learning_rate": 2.4350165139512384e-06, + "loss": 0.2553, + "step": 10834 + }, + { + "epoch": 0.68, + "grad_norm": 1.611323828889131, + "learning_rate": 2.4341423165245064e-06, + "loss": 0.2689, + "step": 10835 + }, + { + "epoch": 0.68, + "grad_norm": 3.1389346807689904, + "learning_rate": 2.433268225559765e-06, + "loss": 0.2633, + "step": 10836 + }, + { + "epoch": 0.68, + "grad_norm": 0.6035650523608391, + "learning_rate": 2.4323942410932862e-06, + "loss": 0.488, + "step": 10837 + }, + { + "epoch": 0.68, + "grad_norm": 1.7200242146295561, + "learning_rate": 2.4315203631613303e-06, + "loss": 0.2671, + "step": 10838 + }, + { + "epoch": 0.68, + "grad_norm": 3.601566766359058, + "learning_rate": 2.4306465918001554e-06, + "loss": 0.2707, + "step": 10839 + }, + { + "epoch": 0.68, + "grad_norm": 1.9848085958619472, + "learning_rate": 2.429772927046019e-06, + "loss": 0.2733, + "step": 10840 + }, + { + "epoch": 0.68, + "grad_norm": 24.69827138673236, + "learning_rate": 2.4288993689351692e-06, + "loss": 0.2514, + "step": 10841 + }, + { + "epoch": 0.68, + "grad_norm": 2.054443038449273, + "learning_rate": 2.428025917503849e-06, + "loss": 0.2792, + "step": 10842 + }, + { + "epoch": 0.68, + "grad_norm": 2.841957651692076, + "learning_rate": 2.427152572788304e-06, + "loss": 0.2766, + "step": 10843 + }, + { + "epoch": 0.68, + "grad_norm": 1.8254332768160941, + "learning_rate": 2.4262793348247663e-06, + "loss": 0.2604, + "step": 10844 + }, + { + "epoch": 0.68, + "grad_norm": 1.8950096525597646, + "learning_rate": 2.425406203649473e-06, + "loss": 0.25, + "step": 10845 + }, + { + "epoch": 0.68, + "grad_norm": 1.593341431234992, + "learning_rate": 2.4245331792986486e-06, + "loss": 0.2486, + "step": 10846 + }, + { + "epoch": 0.68, + "grad_norm": 1.5497930112050453, + "learning_rate": 2.4236602618085157e-06, + "loss": 0.26, + "step": 10847 + }, + { + "epoch": 0.68, + "grad_norm": 1.6949615459239775, + "learning_rate": 2.4227874512152965e-06, + "loss": 0.2577, + "step": 10848 + }, + { + "epoch": 0.68, + "grad_norm": 3.7377251655209793, + "learning_rate": 2.421914747555204e-06, + "loss": 0.2725, + "step": 10849 + }, + { + "epoch": 0.68, + "grad_norm": 2.3957716170763947, + "learning_rate": 2.4210421508644454e-06, + "loss": 0.2528, + "step": 10850 + }, + { + "epoch": 0.68, + "grad_norm": 1.5906823880435084, + "learning_rate": 2.4201696611792285e-06, + "loss": 0.25, + "step": 10851 + }, + { + "epoch": 0.68, + "grad_norm": 1.3668835120315774, + "learning_rate": 2.419297278535757e-06, + "loss": 0.2646, + "step": 10852 + }, + { + "epoch": 0.68, + "grad_norm": 5.0248823520360695, + "learning_rate": 2.418425002970225e-06, + "loss": 0.2751, + "step": 10853 + }, + { + "epoch": 0.68, + "grad_norm": 1.9113828394692174, + "learning_rate": 2.417552834518825e-06, + "loss": 0.2627, + "step": 10854 + }, + { + "epoch": 0.68, + "grad_norm": 2.6343467579385385, + "learning_rate": 2.4166807732177435e-06, + "loss": 0.27, + "step": 10855 + }, + { + "epoch": 0.68, + "grad_norm": 3.444913547484067, + "learning_rate": 2.4158088191031666e-06, + "loss": 0.2646, + "step": 10856 + }, + { + "epoch": 0.68, + "grad_norm": 2.23435673794038, + "learning_rate": 2.414936972211272e-06, + "loss": 0.2681, + "step": 10857 + }, + { + "epoch": 0.68, + "grad_norm": 8.170829744020603, + "learning_rate": 2.4140652325782317e-06, + "loss": 0.2614, + "step": 10858 + }, + { + "epoch": 0.68, + "grad_norm": 2.3178830668271937, + "learning_rate": 2.4131936002402182e-06, + "loss": 0.2608, + "step": 10859 + }, + { + "epoch": 0.68, + "grad_norm": 3.276373835655226, + "learning_rate": 2.4123220752333994e-06, + "loss": 0.2635, + "step": 10860 + }, + { + "epoch": 0.68, + "grad_norm": 3.4365126148774165, + "learning_rate": 2.4114506575939333e-06, + "loss": 0.2683, + "step": 10861 + }, + { + "epoch": 0.68, + "grad_norm": 2.012939648087876, + "learning_rate": 2.4105793473579773e-06, + "loss": 0.2638, + "step": 10862 + }, + { + "epoch": 0.68, + "grad_norm": 3.619364225017148, + "learning_rate": 2.4097081445616814e-06, + "loss": 0.2718, + "step": 10863 + }, + { + "epoch": 0.68, + "grad_norm": 4.465368191838411, + "learning_rate": 2.408837049241198e-06, + "loss": 0.2668, + "step": 10864 + }, + { + "epoch": 0.68, + "grad_norm": 2.533203366468756, + "learning_rate": 2.407966061432668e-06, + "loss": 0.289, + "step": 10865 + }, + { + "epoch": 0.68, + "grad_norm": 1.4687487469774676, + "learning_rate": 2.407095181172227e-06, + "loss": 0.2565, + "step": 10866 + }, + { + "epoch": 0.68, + "grad_norm": 3.344562654570713, + "learning_rate": 2.4062244084960136e-06, + "loss": 0.2687, + "step": 10867 + }, + { + "epoch": 0.68, + "grad_norm": 12.930673067198498, + "learning_rate": 2.4053537434401576e-06, + "loss": 0.2762, + "step": 10868 + }, + { + "epoch": 0.68, + "grad_norm": 1.551485306675878, + "learning_rate": 2.404483186040784e-06, + "loss": 0.2492, + "step": 10869 + }, + { + "epoch": 0.68, + "grad_norm": 2.0665610186258525, + "learning_rate": 2.403612736334013e-06, + "loss": 0.2472, + "step": 10870 + }, + { + "epoch": 0.68, + "grad_norm": 2.064619466844578, + "learning_rate": 2.402742394355959e-06, + "loss": 0.2585, + "step": 10871 + }, + { + "epoch": 0.68, + "grad_norm": 3.7820775829933004, + "learning_rate": 2.4018721601427387e-06, + "loss": 0.2896, + "step": 10872 + }, + { + "epoch": 0.68, + "grad_norm": 1.8421601210465457, + "learning_rate": 2.4010020337304575e-06, + "loss": 0.2704, + "step": 10873 + }, + { + "epoch": 0.68, + "grad_norm": 4.444179906428219, + "learning_rate": 2.4001320151552153e-06, + "loss": 0.2871, + "step": 10874 + }, + { + "epoch": 0.68, + "grad_norm": 2.0461616569532914, + "learning_rate": 2.3992621044531146e-06, + "loss": 0.2598, + "step": 10875 + }, + { + "epoch": 0.68, + "grad_norm": 0.6281500200261377, + "learning_rate": 2.39839230166025e-06, + "loss": 0.4726, + "step": 10876 + }, + { + "epoch": 0.68, + "grad_norm": 2.289931686135603, + "learning_rate": 2.3975226068127107e-06, + "loss": 0.2634, + "step": 10877 + }, + { + "epoch": 0.68, + "grad_norm": 2.166019674554041, + "learning_rate": 2.39665301994658e-06, + "loss": 0.2687, + "step": 10878 + }, + { + "epoch": 0.68, + "grad_norm": 2.9610345374597213, + "learning_rate": 2.395783541097938e-06, + "loss": 0.2669, + "step": 10879 + }, + { + "epoch": 0.68, + "grad_norm": 2.7195799517151467, + "learning_rate": 2.3949141703028643e-06, + "loss": 0.2563, + "step": 10880 + }, + { + "epoch": 0.68, + "grad_norm": 2.4205161530393116, + "learning_rate": 2.394044907597427e-06, + "loss": 0.272, + "step": 10881 + }, + { + "epoch": 0.68, + "grad_norm": 0.6188588342391146, + "learning_rate": 2.393175753017697e-06, + "loss": 0.4873, + "step": 10882 + }, + { + "epoch": 0.68, + "grad_norm": 1.7269059264314834, + "learning_rate": 2.3923067065997336e-06, + "loss": 0.2583, + "step": 10883 + }, + { + "epoch": 0.68, + "grad_norm": 2.129786075912959, + "learning_rate": 2.391437768379598e-06, + "loss": 0.2668, + "step": 10884 + }, + { + "epoch": 0.68, + "grad_norm": 1.879601366199349, + "learning_rate": 2.3905689383933433e-06, + "loss": 0.2583, + "step": 10885 + }, + { + "epoch": 0.68, + "grad_norm": 2.90273285050398, + "learning_rate": 2.3897002166770183e-06, + "loss": 0.2636, + "step": 10886 + }, + { + "epoch": 0.68, + "grad_norm": 2.7457911686373264, + "learning_rate": 2.3888316032666653e-06, + "loss": 0.264, + "step": 10887 + }, + { + "epoch": 0.68, + "grad_norm": 3.245829896736995, + "learning_rate": 2.3879630981983277e-06, + "loss": 0.2754, + "step": 10888 + }, + { + "epoch": 0.68, + "grad_norm": 3.3258908249547043, + "learning_rate": 2.387094701508042e-06, + "loss": 0.2521, + "step": 10889 + }, + { + "epoch": 0.68, + "grad_norm": 1.8457514254007812, + "learning_rate": 2.3862264132318385e-06, + "loss": 0.2533, + "step": 10890 + }, + { + "epoch": 0.68, + "grad_norm": 1.3115242101589804, + "learning_rate": 2.385358233405741e-06, + "loss": 0.2682, + "step": 10891 + }, + { + "epoch": 0.68, + "grad_norm": 2.3842661648783077, + "learning_rate": 2.3844901620657775e-06, + "loss": 0.2691, + "step": 10892 + }, + { + "epoch": 0.69, + "grad_norm": 2.3848533479403673, + "learning_rate": 2.3836221992479626e-06, + "loss": 0.2574, + "step": 10893 + }, + { + "epoch": 0.69, + "grad_norm": 1.646166234318717, + "learning_rate": 2.3827543449883094e-06, + "loss": 0.2657, + "step": 10894 + }, + { + "epoch": 0.69, + "grad_norm": 2.3394498095130616, + "learning_rate": 2.3818865993228256e-06, + "loss": 0.2471, + "step": 10895 + }, + { + "epoch": 0.69, + "grad_norm": 2.3183147321099677, + "learning_rate": 2.3810189622875174e-06, + "loss": 0.2886, + "step": 10896 + }, + { + "epoch": 0.69, + "grad_norm": 1.4799604803379114, + "learning_rate": 2.3801514339183857e-06, + "loss": 0.2763, + "step": 10897 + }, + { + "epoch": 0.69, + "grad_norm": 1.559558408132068, + "learning_rate": 2.3792840142514244e-06, + "loss": 0.2563, + "step": 10898 + }, + { + "epoch": 0.69, + "grad_norm": 1.9476723320016283, + "learning_rate": 2.3784167033226222e-06, + "loss": 0.259, + "step": 10899 + }, + { + "epoch": 0.69, + "grad_norm": 3.2389654261430936, + "learning_rate": 2.37754950116797e-06, + "loss": 0.262, + "step": 10900 + }, + { + "epoch": 0.69, + "grad_norm": 2.656437300482076, + "learning_rate": 2.3766824078234468e-06, + "loss": 0.2593, + "step": 10901 + }, + { + "epoch": 0.69, + "grad_norm": 2.7259779210563946, + "learning_rate": 2.3758154233250298e-06, + "loss": 0.2618, + "step": 10902 + }, + { + "epoch": 0.69, + "grad_norm": 10.099447602178023, + "learning_rate": 2.3749485477086883e-06, + "loss": 0.2494, + "step": 10903 + }, + { + "epoch": 0.69, + "grad_norm": 1.8399474411979575, + "learning_rate": 2.374081781010398e-06, + "loss": 0.2732, + "step": 10904 + }, + { + "epoch": 0.69, + "grad_norm": 5.3599037997487375, + "learning_rate": 2.373215123266118e-06, + "loss": 0.2799, + "step": 10905 + }, + { + "epoch": 0.69, + "grad_norm": 2.9549760089744432, + "learning_rate": 2.372348574511809e-06, + "loss": 0.2547, + "step": 10906 + }, + { + "epoch": 0.69, + "grad_norm": 1.8140650942537564, + "learning_rate": 2.371482134783423e-06, + "loss": 0.2832, + "step": 10907 + }, + { + "epoch": 0.69, + "grad_norm": 2.3040300334623325, + "learning_rate": 2.370615804116914e-06, + "loss": 0.2495, + "step": 10908 + }, + { + "epoch": 0.69, + "grad_norm": 2.3787249135748456, + "learning_rate": 2.369749582548225e-06, + "loss": 0.2658, + "step": 10909 + }, + { + "epoch": 0.69, + "grad_norm": 3.7002413232831315, + "learning_rate": 2.3688834701132964e-06, + "loss": 0.2777, + "step": 10910 + }, + { + "epoch": 0.69, + "grad_norm": 1.6570233627211397, + "learning_rate": 2.3680174668480656e-06, + "loss": 0.2641, + "step": 10911 + }, + { + "epoch": 0.69, + "grad_norm": 2.9881638843813065, + "learning_rate": 2.367151572788467e-06, + "loss": 0.2632, + "step": 10912 + }, + { + "epoch": 0.69, + "grad_norm": 1.8864654449008889, + "learning_rate": 2.3662857879704265e-06, + "loss": 0.2591, + "step": 10913 + }, + { + "epoch": 0.69, + "grad_norm": 2.382164067425556, + "learning_rate": 2.3654201124298658e-06, + "loss": 0.252, + "step": 10914 + }, + { + "epoch": 0.69, + "grad_norm": 0.6536734335078284, + "learning_rate": 2.364554546202703e-06, + "loss": 0.4739, + "step": 10915 + }, + { + "epoch": 0.69, + "grad_norm": 0.6142496272903245, + "learning_rate": 2.363689089324855e-06, + "loss": 0.4544, + "step": 10916 + }, + { + "epoch": 0.69, + "grad_norm": 1.5179567144839843, + "learning_rate": 2.362823741832229e-06, + "loss": 0.274, + "step": 10917 + }, + { + "epoch": 0.69, + "grad_norm": 3.103286936456892, + "learning_rate": 2.3619585037607283e-06, + "loss": 0.2823, + "step": 10918 + }, + { + "epoch": 0.69, + "grad_norm": 3.6817210396512827, + "learning_rate": 2.3610933751462554e-06, + "loss": 0.2719, + "step": 10919 + }, + { + "epoch": 0.69, + "grad_norm": 3.667014841922441, + "learning_rate": 2.360228356024707e-06, + "loss": 0.2576, + "step": 10920 + }, + { + "epoch": 0.69, + "grad_norm": 2.08097641517747, + "learning_rate": 2.3593634464319732e-06, + "loss": 0.2874, + "step": 10921 + }, + { + "epoch": 0.69, + "grad_norm": 1.3777251939030866, + "learning_rate": 2.35849864640394e-06, + "loss": 0.2536, + "step": 10922 + }, + { + "epoch": 0.69, + "grad_norm": 2.0453044671274974, + "learning_rate": 2.357633955976488e-06, + "loss": 0.2549, + "step": 10923 + }, + { + "epoch": 0.69, + "grad_norm": 1.7899662206002802, + "learning_rate": 2.3567693751854985e-06, + "loss": 0.2622, + "step": 10924 + }, + { + "epoch": 0.69, + "grad_norm": 1.7915825281831113, + "learning_rate": 2.3559049040668425e-06, + "loss": 0.2678, + "step": 10925 + }, + { + "epoch": 0.69, + "grad_norm": 1.9438606617371923, + "learning_rate": 2.355040542656387e-06, + "loss": 0.2499, + "step": 10926 + }, + { + "epoch": 0.69, + "grad_norm": 4.531104619929896, + "learning_rate": 2.354176290989996e-06, + "loss": 0.2725, + "step": 10927 + }, + { + "epoch": 0.69, + "grad_norm": 2.2581310911745267, + "learning_rate": 2.353312149103533e-06, + "loss": 0.2563, + "step": 10928 + }, + { + "epoch": 0.69, + "grad_norm": 2.91521569417984, + "learning_rate": 2.3524481170328506e-06, + "loss": 0.2713, + "step": 10929 + }, + { + "epoch": 0.69, + "grad_norm": 2.413535705432135, + "learning_rate": 2.3515841948137976e-06, + "loss": 0.272, + "step": 10930 + }, + { + "epoch": 0.69, + "grad_norm": 1.8006589061091516, + "learning_rate": 2.350720382482219e-06, + "loss": 0.2605, + "step": 10931 + }, + { + "epoch": 0.69, + "grad_norm": 1.5129055315177493, + "learning_rate": 2.349856680073959e-06, + "loss": 0.2665, + "step": 10932 + }, + { + "epoch": 0.69, + "grad_norm": 0.6849296253493475, + "learning_rate": 2.34899308762485e-06, + "loss": 0.5027, + "step": 10933 + }, + { + "epoch": 0.69, + "grad_norm": 5.583730405393349, + "learning_rate": 2.34812960517073e-06, + "loss": 0.2675, + "step": 10934 + }, + { + "epoch": 0.69, + "grad_norm": 1.4774859424883449, + "learning_rate": 2.347266232747419e-06, + "loss": 0.2637, + "step": 10935 + }, + { + "epoch": 0.69, + "grad_norm": 2.05368579524338, + "learning_rate": 2.346402970390748e-06, + "loss": 0.2647, + "step": 10936 + }, + { + "epoch": 0.69, + "grad_norm": 2.0026683642084535, + "learning_rate": 2.345539818136529e-06, + "loss": 0.2465, + "step": 10937 + }, + { + "epoch": 0.69, + "grad_norm": 2.2468664922163395, + "learning_rate": 2.344676776020579e-06, + "loss": 0.25, + "step": 10938 + }, + { + "epoch": 0.69, + "grad_norm": 2.3456737338948885, + "learning_rate": 2.343813844078704e-06, + "loss": 0.2531, + "step": 10939 + }, + { + "epoch": 0.69, + "grad_norm": 2.2275206115756694, + "learning_rate": 2.3429510223467116e-06, + "loss": 0.262, + "step": 10940 + }, + { + "epoch": 0.69, + "grad_norm": 1.8450804172787154, + "learning_rate": 2.3420883108603997e-06, + "loss": 0.2921, + "step": 10941 + }, + { + "epoch": 0.69, + "grad_norm": 7.667519709873651, + "learning_rate": 2.3412257096555663e-06, + "loss": 0.2575, + "step": 10942 + }, + { + "epoch": 0.69, + "grad_norm": 2.0751679166352885, + "learning_rate": 2.3403632187679987e-06, + "loss": 0.2628, + "step": 10943 + }, + { + "epoch": 0.69, + "grad_norm": 3.46009451010248, + "learning_rate": 2.3395008382334873e-06, + "loss": 0.2603, + "step": 10944 + }, + { + "epoch": 0.69, + "grad_norm": 2.63868910424025, + "learning_rate": 2.3386385680878116e-06, + "loss": 0.2651, + "step": 10945 + }, + { + "epoch": 0.69, + "grad_norm": 3.0428530019975164, + "learning_rate": 2.3377764083667494e-06, + "loss": 0.2817, + "step": 10946 + }, + { + "epoch": 0.69, + "grad_norm": 8.260584311841463, + "learning_rate": 2.33691435910607e-06, + "loss": 0.2916, + "step": 10947 + }, + { + "epoch": 0.69, + "grad_norm": 1.435284089693954, + "learning_rate": 2.336052420341544e-06, + "loss": 0.2585, + "step": 10948 + }, + { + "epoch": 0.69, + "grad_norm": 4.1945625299153075, + "learning_rate": 2.335190592108937e-06, + "loss": 0.2737, + "step": 10949 + }, + { + "epoch": 0.69, + "grad_norm": 1.6983782016735072, + "learning_rate": 2.3343288744440055e-06, + "loss": 0.2687, + "step": 10950 + }, + { + "epoch": 0.69, + "grad_norm": 1.9708341111944603, + "learning_rate": 2.333467267382502e-06, + "loss": 0.2581, + "step": 10951 + }, + { + "epoch": 0.69, + "grad_norm": 6.119651187058837, + "learning_rate": 2.3326057709601795e-06, + "loss": 0.2765, + "step": 10952 + }, + { + "epoch": 0.69, + "grad_norm": 2.8961854399545466, + "learning_rate": 2.331744385212782e-06, + "loss": 0.2429, + "step": 10953 + }, + { + "epoch": 0.69, + "grad_norm": 2.2148688811033104, + "learning_rate": 2.330883110176049e-06, + "loss": 0.268, + "step": 10954 + }, + { + "epoch": 0.69, + "grad_norm": 2.9929853204084975, + "learning_rate": 2.3300219458857144e-06, + "loss": 0.2792, + "step": 10955 + }, + { + "epoch": 0.69, + "grad_norm": 2.621514330675251, + "learning_rate": 2.3291608923775117e-06, + "loss": 0.2754, + "step": 10956 + }, + { + "epoch": 0.69, + "grad_norm": 6.704714025645811, + "learning_rate": 2.3282999496871694e-06, + "loss": 0.2779, + "step": 10957 + }, + { + "epoch": 0.69, + "grad_norm": 1.5268878406254864, + "learning_rate": 2.327439117850408e-06, + "loss": 0.2488, + "step": 10958 + }, + { + "epoch": 0.69, + "grad_norm": 1.6423044441394652, + "learning_rate": 2.3265783969029425e-06, + "loss": 0.2549, + "step": 10959 + }, + { + "epoch": 0.69, + "grad_norm": 1.4383915248263024, + "learning_rate": 2.32571778688049e-06, + "loss": 0.243, + "step": 10960 + }, + { + "epoch": 0.69, + "grad_norm": 1.7243666016070005, + "learning_rate": 2.3248572878187566e-06, + "loss": 0.2502, + "step": 10961 + }, + { + "epoch": 0.69, + "grad_norm": 4.910469847432548, + "learning_rate": 2.323996899753445e-06, + "loss": 0.2767, + "step": 10962 + }, + { + "epoch": 0.69, + "grad_norm": 2.2472062476080032, + "learning_rate": 2.3231366227202523e-06, + "loss": 0.2494, + "step": 10963 + }, + { + "epoch": 0.69, + "grad_norm": 2.5338759061150538, + "learning_rate": 2.3222764567548795e-06, + "loss": 0.2517, + "step": 10964 + }, + { + "epoch": 0.69, + "grad_norm": 2.02135852969192, + "learning_rate": 2.3214164018930124e-06, + "loss": 0.2633, + "step": 10965 + }, + { + "epoch": 0.69, + "grad_norm": 2.0287612662433467, + "learning_rate": 2.3205564581703367e-06, + "loss": 0.2604, + "step": 10966 + }, + { + "epoch": 0.69, + "grad_norm": 1.7936296361297288, + "learning_rate": 2.3196966256225306e-06, + "loss": 0.2523, + "step": 10967 + }, + { + "epoch": 0.69, + "grad_norm": 3.7106428506218263, + "learning_rate": 2.318836904285274e-06, + "loss": 0.3034, + "step": 10968 + }, + { + "epoch": 0.69, + "grad_norm": 2.1569842069850975, + "learning_rate": 2.317977294194237e-06, + "loss": 0.2735, + "step": 10969 + }, + { + "epoch": 0.69, + "grad_norm": 1.4835992143932462, + "learning_rate": 2.3171177953850837e-06, + "loss": 0.2524, + "step": 10970 + }, + { + "epoch": 0.69, + "grad_norm": 2.3607794822239407, + "learning_rate": 2.3162584078934785e-06, + "loss": 0.2833, + "step": 10971 + }, + { + "epoch": 0.69, + "grad_norm": 1.6030701736115138, + "learning_rate": 2.315399131755081e-06, + "loss": 0.2819, + "step": 10972 + }, + { + "epoch": 0.69, + "grad_norm": 2.975997946367097, + "learning_rate": 2.314539967005541e-06, + "loss": 0.2594, + "step": 10973 + }, + { + "epoch": 0.69, + "grad_norm": 2.213239373020768, + "learning_rate": 2.313680913680509e-06, + "loss": 0.2581, + "step": 10974 + }, + { + "epoch": 0.69, + "grad_norm": 2.0669890418402885, + "learning_rate": 2.312821971815625e-06, + "loss": 0.2738, + "step": 10975 + }, + { + "epoch": 0.69, + "grad_norm": 10.436783865483608, + "learning_rate": 2.3119631414465317e-06, + "loss": 0.2694, + "step": 10976 + }, + { + "epoch": 0.69, + "grad_norm": 10.107960424948507, + "learning_rate": 2.3111044226088625e-06, + "loss": 0.2808, + "step": 10977 + }, + { + "epoch": 0.69, + "grad_norm": 1.5052658011789797, + "learning_rate": 2.3102458153382446e-06, + "loss": 0.2551, + "step": 10978 + }, + { + "epoch": 0.69, + "grad_norm": 0.5972256617506898, + "learning_rate": 2.309387319670306e-06, + "loss": 0.4581, + "step": 10979 + }, + { + "epoch": 0.69, + "grad_norm": 2.3041711980146915, + "learning_rate": 2.3085289356406682e-06, + "loss": 0.2531, + "step": 10980 + }, + { + "epoch": 0.69, + "grad_norm": 2.740272779062172, + "learning_rate": 2.3076706632849448e-06, + "loss": 0.2681, + "step": 10981 + }, + { + "epoch": 0.69, + "grad_norm": 5.0874210368604125, + "learning_rate": 2.3068125026387483e-06, + "loss": 0.2864, + "step": 10982 + }, + { + "epoch": 0.69, + "grad_norm": 2.2536115526514644, + "learning_rate": 2.3059544537376834e-06, + "loss": 0.2549, + "step": 10983 + }, + { + "epoch": 0.69, + "grad_norm": 3.1361431338645454, + "learning_rate": 2.3050965166173543e-06, + "loss": 0.2565, + "step": 10984 + }, + { + "epoch": 0.69, + "grad_norm": 2.2496520356866707, + "learning_rate": 2.3042386913133558e-06, + "loss": 0.2632, + "step": 10985 + }, + { + "epoch": 0.69, + "grad_norm": 1.9677520799895307, + "learning_rate": 2.303380977861284e-06, + "loss": 0.26, + "step": 10986 + }, + { + "epoch": 0.69, + "grad_norm": 1.5742895822778318, + "learning_rate": 2.3025233762967238e-06, + "loss": 0.2553, + "step": 10987 + }, + { + "epoch": 0.69, + "grad_norm": 1.8123616300708694, + "learning_rate": 2.3016658866552618e-06, + "loss": 0.2454, + "step": 10988 + }, + { + "epoch": 0.69, + "grad_norm": 1.888777549190069, + "learning_rate": 2.3008085089724743e-06, + "loss": 0.251, + "step": 10989 + }, + { + "epoch": 0.69, + "grad_norm": 1.8074781999311407, + "learning_rate": 2.299951243283936e-06, + "loss": 0.2707, + "step": 10990 + }, + { + "epoch": 0.69, + "grad_norm": 2.4374729526319707, + "learning_rate": 2.299094089625215e-06, + "loss": 0.2717, + "step": 10991 + }, + { + "epoch": 0.69, + "grad_norm": 2.2798070048658396, + "learning_rate": 2.2982370480318794e-06, + "loss": 0.2607, + "step": 10992 + }, + { + "epoch": 0.69, + "grad_norm": 4.9999077928147075, + "learning_rate": 2.297380118539486e-06, + "loss": 0.2686, + "step": 10993 + }, + { + "epoch": 0.69, + "grad_norm": 1.64007053011502, + "learning_rate": 2.2965233011835936e-06, + "loss": 0.2602, + "step": 10994 + }, + { + "epoch": 0.69, + "grad_norm": 2.910541496731146, + "learning_rate": 2.295666595999749e-06, + "loss": 0.2669, + "step": 10995 + }, + { + "epoch": 0.69, + "grad_norm": 1.788300761654256, + "learning_rate": 2.294810003023503e-06, + "loss": 0.275, + "step": 10996 + }, + { + "epoch": 0.69, + "grad_norm": 1.442347356213636, + "learning_rate": 2.293953522290395e-06, + "loss": 0.261, + "step": 10997 + }, + { + "epoch": 0.69, + "grad_norm": 1.8529495014929396, + "learning_rate": 2.2930971538359612e-06, + "loss": 0.2622, + "step": 10998 + }, + { + "epoch": 0.69, + "grad_norm": 1.1671811736080213, + "learning_rate": 2.292240897695734e-06, + "loss": 0.264, + "step": 10999 + }, + { + "epoch": 0.69, + "grad_norm": 1.9863585472536043, + "learning_rate": 2.2913847539052405e-06, + "loss": 0.2704, + "step": 11000 + }, + { + "epoch": 0.69, + "grad_norm": 1.4190139119943348, + "learning_rate": 2.2905287225000062e-06, + "loss": 0.2787, + "step": 11001 + }, + { + "epoch": 0.69, + "grad_norm": 2.967742712789526, + "learning_rate": 2.2896728035155487e-06, + "loss": 0.2645, + "step": 11002 + }, + { + "epoch": 0.69, + "grad_norm": 1.5972908958120042, + "learning_rate": 2.288816996987378e-06, + "loss": 0.2608, + "step": 11003 + }, + { + "epoch": 0.69, + "grad_norm": 1.698590185358748, + "learning_rate": 2.287961302951008e-06, + "loss": 0.2595, + "step": 11004 + }, + { + "epoch": 0.69, + "grad_norm": 2.511746720975442, + "learning_rate": 2.2871057214419403e-06, + "loss": 0.2618, + "step": 11005 + }, + { + "epoch": 0.69, + "grad_norm": 3.0104081680942985, + "learning_rate": 2.2862502524956748e-06, + "loss": 0.2788, + "step": 11006 + }, + { + "epoch": 0.69, + "grad_norm": 4.437342448768188, + "learning_rate": 2.285394896147705e-06, + "loss": 0.2695, + "step": 11007 + }, + { + "epoch": 0.69, + "grad_norm": 6.567028789019159, + "learning_rate": 2.284539652433522e-06, + "loss": 0.2776, + "step": 11008 + }, + { + "epoch": 0.69, + "grad_norm": 2.3600236114539146, + "learning_rate": 2.283684521388615e-06, + "loss": 0.2813, + "step": 11009 + }, + { + "epoch": 0.69, + "grad_norm": 1.4460102376280342, + "learning_rate": 2.282829503048461e-06, + "loss": 0.2683, + "step": 11010 + }, + { + "epoch": 0.69, + "grad_norm": 2.765288002276459, + "learning_rate": 2.2819745974485357e-06, + "loss": 0.2613, + "step": 11011 + }, + { + "epoch": 0.69, + "grad_norm": 3.0838044190318, + "learning_rate": 2.281119804624314e-06, + "loss": 0.2598, + "step": 11012 + }, + { + "epoch": 0.69, + "grad_norm": 2.331842545145046, + "learning_rate": 2.2802651246112606e-06, + "loss": 0.2604, + "step": 11013 + }, + { + "epoch": 0.69, + "grad_norm": 2.082322749369407, + "learning_rate": 2.279410557444838e-06, + "loss": 0.2504, + "step": 11014 + }, + { + "epoch": 0.69, + "grad_norm": 3.6204593116768944, + "learning_rate": 2.278556103160502e-06, + "loss": 0.2498, + "step": 11015 + }, + { + "epoch": 0.69, + "grad_norm": 1.3905597247715427, + "learning_rate": 2.2777017617937073e-06, + "loss": 0.2799, + "step": 11016 + }, + { + "epoch": 0.69, + "grad_norm": 2.472427955573566, + "learning_rate": 2.2768475333799035e-06, + "loss": 0.247, + "step": 11017 + }, + { + "epoch": 0.69, + "grad_norm": 1.6162241556398924, + "learning_rate": 2.2759934179545325e-06, + "loss": 0.2591, + "step": 11018 + }, + { + "epoch": 0.69, + "grad_norm": 1.9127677604285067, + "learning_rate": 2.275139415553031e-06, + "loss": 0.2582, + "step": 11019 + }, + { + "epoch": 0.69, + "grad_norm": 1.7527933414260848, + "learning_rate": 2.2742855262108364e-06, + "loss": 0.2616, + "step": 11020 + }, + { + "epoch": 0.69, + "grad_norm": 3.7751755045145465, + "learning_rate": 2.273431749963377e-06, + "loss": 0.2736, + "step": 11021 + }, + { + "epoch": 0.69, + "grad_norm": 1.5290407432488182, + "learning_rate": 2.272578086846078e-06, + "loss": 0.2625, + "step": 11022 + }, + { + "epoch": 0.69, + "grad_norm": 2.5421788814637516, + "learning_rate": 2.271724536894354e-06, + "loss": 0.2705, + "step": 11023 + }, + { + "epoch": 0.69, + "grad_norm": 2.8246065823332165, + "learning_rate": 2.270871100143629e-06, + "loss": 0.2605, + "step": 11024 + }, + { + "epoch": 0.69, + "grad_norm": 4.860304452842204, + "learning_rate": 2.27001777662931e-06, + "loss": 0.2649, + "step": 11025 + }, + { + "epoch": 0.69, + "grad_norm": 2.445004033177439, + "learning_rate": 2.2691645663868016e-06, + "loss": 0.2386, + "step": 11026 + }, + { + "epoch": 0.69, + "grad_norm": 2.2881442401364325, + "learning_rate": 2.2683114694515045e-06, + "loss": 0.2618, + "step": 11027 + }, + { + "epoch": 0.69, + "grad_norm": 2.64561475847575, + "learning_rate": 2.2674584858588185e-06, + "loss": 0.2607, + "step": 11028 + }, + { + "epoch": 0.69, + "grad_norm": 1.3218454980779675, + "learning_rate": 2.266605615644133e-06, + "loss": 0.2588, + "step": 11029 + }, + { + "epoch": 0.69, + "grad_norm": 1.9211058016416054, + "learning_rate": 2.265752858842834e-06, + "loss": 0.2549, + "step": 11030 + }, + { + "epoch": 0.69, + "grad_norm": 1.5657039121450247, + "learning_rate": 2.2649002154903056e-06, + "loss": 0.2798, + "step": 11031 + }, + { + "epoch": 0.69, + "grad_norm": 2.7234951650213524, + "learning_rate": 2.2640476856219267e-06, + "loss": 0.2708, + "step": 11032 + }, + { + "epoch": 0.69, + "grad_norm": 1.779077471648995, + "learning_rate": 2.263195269273069e-06, + "loss": 0.2506, + "step": 11033 + }, + { + "epoch": 0.69, + "grad_norm": 4.195402374947855, + "learning_rate": 2.2623429664791015e-06, + "loss": 0.2627, + "step": 11034 + }, + { + "epoch": 0.69, + "grad_norm": 2.4381753877699035, + "learning_rate": 2.261490777275384e-06, + "loss": 0.2743, + "step": 11035 + }, + { + "epoch": 0.69, + "grad_norm": 1.5370562146420013, + "learning_rate": 2.260638701697281e-06, + "loss": 0.2655, + "step": 11036 + }, + { + "epoch": 0.69, + "grad_norm": 4.194549114731265, + "learning_rate": 2.259786739780143e-06, + "loss": 0.2479, + "step": 11037 + }, + { + "epoch": 0.69, + "grad_norm": 3.5973098267399335, + "learning_rate": 2.2589348915593197e-06, + "loss": 0.2622, + "step": 11038 + }, + { + "epoch": 0.69, + "grad_norm": 1.3700132494673471, + "learning_rate": 2.2580831570701557e-06, + "loss": 0.2622, + "step": 11039 + }, + { + "epoch": 0.69, + "grad_norm": 2.890398019948207, + "learning_rate": 2.2572315363479935e-06, + "loss": 0.2653, + "step": 11040 + }, + { + "epoch": 0.69, + "grad_norm": 1.7266076924332996, + "learning_rate": 2.2563800294281667e-06, + "loss": 0.2753, + "step": 11041 + }, + { + "epoch": 0.69, + "grad_norm": 1.7583690345518133, + "learning_rate": 2.2555286363460053e-06, + "loss": 0.2725, + "step": 11042 + }, + { + "epoch": 0.69, + "grad_norm": 1.7996428411083296, + "learning_rate": 2.254677357136834e-06, + "loss": 0.2598, + "step": 11043 + }, + { + "epoch": 0.69, + "grad_norm": 4.019957369916979, + "learning_rate": 2.2538261918359776e-06, + "loss": 0.2731, + "step": 11044 + }, + { + "epoch": 0.69, + "grad_norm": 1.4689716646829647, + "learning_rate": 2.252975140478748e-06, + "loss": 0.2626, + "step": 11045 + }, + { + "epoch": 0.69, + "grad_norm": 1.377654027126183, + "learning_rate": 2.2521242031004613e-06, + "loss": 0.2428, + "step": 11046 + }, + { + "epoch": 0.69, + "grad_norm": 2.0002552348555516, + "learning_rate": 2.2512733797364202e-06, + "loss": 0.2699, + "step": 11047 + }, + { + "epoch": 0.69, + "grad_norm": 1.6584052503371383, + "learning_rate": 2.25042267042193e-06, + "loss": 0.2887, + "step": 11048 + }, + { + "epoch": 0.69, + "grad_norm": 1.39934382994924, + "learning_rate": 2.249572075192288e-06, + "loss": 0.2663, + "step": 11049 + }, + { + "epoch": 0.69, + "grad_norm": 0.5849785628171148, + "learning_rate": 2.2487215940827856e-06, + "loss": 0.4702, + "step": 11050 + }, + { + "epoch": 0.69, + "grad_norm": 1.7096365971610517, + "learning_rate": 2.247871227128709e-06, + "loss": 0.2526, + "step": 11051 + }, + { + "epoch": 0.7, + "grad_norm": 1.5156501599784997, + "learning_rate": 2.247020974365344e-06, + "loss": 0.2687, + "step": 11052 + }, + { + "epoch": 0.7, + "grad_norm": 2.368127450149746, + "learning_rate": 2.24617083582797e-06, + "loss": 0.2711, + "step": 11053 + }, + { + "epoch": 0.7, + "grad_norm": 1.861099343005788, + "learning_rate": 2.2453208115518595e-06, + "loss": 0.2611, + "step": 11054 + }, + { + "epoch": 0.7, + "grad_norm": 2.69917155539132, + "learning_rate": 2.2444709015722792e-06, + "loss": 0.2694, + "step": 11055 + }, + { + "epoch": 0.7, + "grad_norm": 1.808405165629587, + "learning_rate": 2.2436211059244977e-06, + "loss": 0.255, + "step": 11056 + }, + { + "epoch": 0.7, + "grad_norm": 1.9282026490070858, + "learning_rate": 2.2427714246437726e-06, + "loss": 0.2549, + "step": 11057 + }, + { + "epoch": 0.7, + "grad_norm": 2.073363292368241, + "learning_rate": 2.241921857765358e-06, + "loss": 0.2526, + "step": 11058 + }, + { + "epoch": 0.7, + "grad_norm": 2.1626567118325903, + "learning_rate": 2.241072405324503e-06, + "loss": 0.2602, + "step": 11059 + }, + { + "epoch": 0.7, + "grad_norm": 2.5080723008362154, + "learning_rate": 2.240223067356454e-06, + "loss": 0.269, + "step": 11060 + }, + { + "epoch": 0.7, + "grad_norm": 1.632891593904277, + "learning_rate": 2.239373843896454e-06, + "loss": 0.2847, + "step": 11061 + }, + { + "epoch": 0.7, + "grad_norm": 2.089561431675379, + "learning_rate": 2.2385247349797367e-06, + "loss": 0.2714, + "step": 11062 + }, + { + "epoch": 0.7, + "grad_norm": 1.538838110790415, + "learning_rate": 2.237675740641531e-06, + "loss": 0.2515, + "step": 11063 + }, + { + "epoch": 0.7, + "grad_norm": 1.681589298357147, + "learning_rate": 2.2368268609170673e-06, + "loss": 0.2563, + "step": 11064 + }, + { + "epoch": 0.7, + "grad_norm": 1.4931947400211565, + "learning_rate": 2.235978095841565e-06, + "loss": 0.2504, + "step": 11065 + }, + { + "epoch": 0.7, + "grad_norm": 3.0144539321232036, + "learning_rate": 2.2351294454502408e-06, + "loss": 0.2758, + "step": 11066 + }, + { + "epoch": 0.7, + "grad_norm": 2.223518759245911, + "learning_rate": 2.234280909778305e-06, + "loss": 0.2767, + "step": 11067 + }, + { + "epoch": 0.7, + "grad_norm": 1.3950419129427172, + "learning_rate": 2.2334324888609673e-06, + "loss": 0.2549, + "step": 11068 + }, + { + "epoch": 0.7, + "grad_norm": 5.276218192016725, + "learning_rate": 2.2325841827334306e-06, + "loss": 0.2688, + "step": 11069 + }, + { + "epoch": 0.7, + "grad_norm": 7.855309539941102, + "learning_rate": 2.2317359914308924e-06, + "loss": 0.279, + "step": 11070 + }, + { + "epoch": 0.7, + "grad_norm": 3.116590015760823, + "learning_rate": 2.2308879149885426e-06, + "loss": 0.2692, + "step": 11071 + }, + { + "epoch": 0.7, + "grad_norm": 1.923673125791136, + "learning_rate": 2.2300399534415733e-06, + "loss": 0.2587, + "step": 11072 + }, + { + "epoch": 0.7, + "grad_norm": 8.903104121230704, + "learning_rate": 2.229192106825167e-06, + "loss": 0.2644, + "step": 11073 + }, + { + "epoch": 0.7, + "grad_norm": 1.8260474646408806, + "learning_rate": 2.2283443751745015e-06, + "loss": 0.2615, + "step": 11074 + }, + { + "epoch": 0.7, + "grad_norm": 1.9848969494847946, + "learning_rate": 2.2274967585247485e-06, + "loss": 0.2506, + "step": 11075 + }, + { + "epoch": 0.7, + "grad_norm": 1.6350166096238177, + "learning_rate": 2.2266492569110804e-06, + "loss": 0.254, + "step": 11076 + }, + { + "epoch": 0.7, + "grad_norm": 2.573425443333891, + "learning_rate": 2.225801870368662e-06, + "loss": 0.2526, + "step": 11077 + }, + { + "epoch": 0.7, + "grad_norm": 1.4488845512042205, + "learning_rate": 2.2249545989326516e-06, + "loss": 0.2705, + "step": 11078 + }, + { + "epoch": 0.7, + "grad_norm": 2.009221734149319, + "learning_rate": 2.224107442638202e-06, + "loss": 0.2724, + "step": 11079 + }, + { + "epoch": 0.7, + "grad_norm": 2.10424937516931, + "learning_rate": 2.223260401520467e-06, + "loss": 0.2525, + "step": 11080 + }, + { + "epoch": 0.7, + "grad_norm": 1.6278741200920794, + "learning_rate": 2.2224134756145897e-06, + "loss": 0.2737, + "step": 11081 + }, + { + "epoch": 0.7, + "grad_norm": 2.0663359360044953, + "learning_rate": 2.221566664955709e-06, + "loss": 0.2653, + "step": 11082 + }, + { + "epoch": 0.7, + "grad_norm": 3.455547465712494, + "learning_rate": 2.2207199695789653e-06, + "loss": 0.2879, + "step": 11083 + }, + { + "epoch": 0.7, + "grad_norm": 1.793989986136283, + "learning_rate": 2.219873389519484e-06, + "loss": 0.2757, + "step": 11084 + }, + { + "epoch": 0.7, + "grad_norm": 1.8067461841811359, + "learning_rate": 2.2190269248123958e-06, + "loss": 0.2644, + "step": 11085 + }, + { + "epoch": 0.7, + "grad_norm": 2.9519397446729565, + "learning_rate": 2.21818057549282e-06, + "loss": 0.286, + "step": 11086 + }, + { + "epoch": 0.7, + "grad_norm": 1.6867546979026484, + "learning_rate": 2.2173343415958715e-06, + "loss": 0.2628, + "step": 11087 + }, + { + "epoch": 0.7, + "grad_norm": 4.901729711202853, + "learning_rate": 2.2164882231566655e-06, + "loss": 0.259, + "step": 11088 + }, + { + "epoch": 0.7, + "grad_norm": 1.6494661718749033, + "learning_rate": 2.2156422202103067e-06, + "loss": 0.2563, + "step": 11089 + }, + { + "epoch": 0.7, + "grad_norm": 2.048678844580366, + "learning_rate": 2.2147963327918964e-06, + "loss": 0.2569, + "step": 11090 + }, + { + "epoch": 0.7, + "grad_norm": 1.4641382326084147, + "learning_rate": 2.2139505609365352e-06, + "loss": 0.2874, + "step": 11091 + }, + { + "epoch": 0.7, + "grad_norm": 2.515578197114846, + "learning_rate": 2.213104904679312e-06, + "loss": 0.2614, + "step": 11092 + }, + { + "epoch": 0.7, + "grad_norm": 1.5400364181574417, + "learning_rate": 2.212259364055318e-06, + "loss": 0.2852, + "step": 11093 + }, + { + "epoch": 0.7, + "grad_norm": 2.0606239270475943, + "learning_rate": 2.2114139390996348e-06, + "loss": 0.2686, + "step": 11094 + }, + { + "epoch": 0.7, + "grad_norm": 1.3630679878668983, + "learning_rate": 2.2105686298473384e-06, + "loss": 0.2529, + "step": 11095 + }, + { + "epoch": 0.7, + "grad_norm": 2.2767080461181015, + "learning_rate": 2.2097234363335058e-06, + "loss": 0.256, + "step": 11096 + }, + { + "epoch": 0.7, + "grad_norm": 1.5891959395349309, + "learning_rate": 2.2088783585932024e-06, + "loss": 0.2578, + "step": 11097 + }, + { + "epoch": 0.7, + "grad_norm": 2.074386376433146, + "learning_rate": 2.2080333966614953e-06, + "loss": 0.2602, + "step": 11098 + }, + { + "epoch": 0.7, + "grad_norm": 1.8349753863808815, + "learning_rate": 2.2071885505734413e-06, + "loss": 0.2693, + "step": 11099 + }, + { + "epoch": 0.7, + "grad_norm": 1.8212975785039227, + "learning_rate": 2.2063438203640934e-06, + "loss": 0.2584, + "step": 11100 + }, + { + "epoch": 0.7, + "grad_norm": 4.166344451571758, + "learning_rate": 2.2054992060685044e-06, + "loss": 0.2836, + "step": 11101 + }, + { + "epoch": 0.7, + "grad_norm": 2.9066922810415483, + "learning_rate": 2.204654707721717e-06, + "loss": 0.2511, + "step": 11102 + }, + { + "epoch": 0.7, + "grad_norm": 2.638776594110408, + "learning_rate": 2.2038103253587685e-06, + "loss": 0.2519, + "step": 11103 + }, + { + "epoch": 0.7, + "grad_norm": 2.9514373854405695, + "learning_rate": 2.2029660590146983e-06, + "loss": 0.2416, + "step": 11104 + }, + { + "epoch": 0.7, + "grad_norm": 1.6263658028104124, + "learning_rate": 2.202121908724533e-06, + "loss": 0.2704, + "step": 11105 + }, + { + "epoch": 0.7, + "grad_norm": 2.3227303615510393, + "learning_rate": 2.2012778745233003e-06, + "loss": 0.2711, + "step": 11106 + }, + { + "epoch": 0.7, + "grad_norm": 2.7892407403209454, + "learning_rate": 2.200433956446019e-06, + "loss": 0.2743, + "step": 11107 + }, + { + "epoch": 0.7, + "grad_norm": 1.6171984573005869, + "learning_rate": 2.199590154527706e-06, + "loss": 0.2783, + "step": 11108 + }, + { + "epoch": 0.7, + "grad_norm": 1.6675790926600702, + "learning_rate": 2.198746468803372e-06, + "loss": 0.2618, + "step": 11109 + }, + { + "epoch": 0.7, + "grad_norm": 2.107905547457524, + "learning_rate": 2.1979028993080233e-06, + "loss": 0.265, + "step": 11110 + }, + { + "epoch": 0.7, + "grad_norm": 1.9079694562089151, + "learning_rate": 2.1970594460766575e-06, + "loss": 0.2471, + "step": 11111 + }, + { + "epoch": 0.7, + "grad_norm": 1.3998216764906461, + "learning_rate": 2.196216109144275e-06, + "loss": 0.2611, + "step": 11112 + }, + { + "epoch": 0.7, + "grad_norm": 4.071212472082706, + "learning_rate": 2.195372888545867e-06, + "loss": 0.2594, + "step": 11113 + }, + { + "epoch": 0.7, + "grad_norm": 5.185912615809283, + "learning_rate": 2.1945297843164197e-06, + "loss": 0.2772, + "step": 11114 + }, + { + "epoch": 0.7, + "grad_norm": 2.572312217957228, + "learning_rate": 2.193686796490913e-06, + "loss": 0.2617, + "step": 11115 + }, + { + "epoch": 0.7, + "grad_norm": 1.7676879298770152, + "learning_rate": 2.1928439251043278e-06, + "loss": 0.2563, + "step": 11116 + }, + { + "epoch": 0.7, + "grad_norm": 2.734980177494873, + "learning_rate": 2.1920011701916337e-06, + "loss": 0.2686, + "step": 11117 + }, + { + "epoch": 0.7, + "grad_norm": 4.030139485054264, + "learning_rate": 2.1911585317877986e-06, + "loss": 0.2384, + "step": 11118 + }, + { + "epoch": 0.7, + "grad_norm": 2.232825620521009, + "learning_rate": 2.1903160099277835e-06, + "loss": 0.2612, + "step": 11119 + }, + { + "epoch": 0.7, + "grad_norm": 1.6215912547677702, + "learning_rate": 2.1894736046465482e-06, + "loss": 0.2608, + "step": 11120 + }, + { + "epoch": 0.7, + "grad_norm": 1.8951279261507745, + "learning_rate": 2.1886313159790463e-06, + "loss": 0.2862, + "step": 11121 + }, + { + "epoch": 0.7, + "grad_norm": 2.1648138596848687, + "learning_rate": 2.187789143960225e-06, + "loss": 0.2668, + "step": 11122 + }, + { + "epoch": 0.7, + "grad_norm": 3.05366253846743, + "learning_rate": 2.186947088625025e-06, + "loss": 0.2593, + "step": 11123 + }, + { + "epoch": 0.7, + "grad_norm": 0.621587619191963, + "learning_rate": 2.1861051500083886e-06, + "loss": 0.4774, + "step": 11124 + }, + { + "epoch": 0.7, + "grad_norm": 7.862334799297239, + "learning_rate": 2.185263328145248e-06, + "loss": 0.267, + "step": 11125 + }, + { + "epoch": 0.7, + "grad_norm": 0.6372825072994112, + "learning_rate": 2.184421623070531e-06, + "loss": 0.4961, + "step": 11126 + }, + { + "epoch": 0.7, + "grad_norm": 5.111011552477857, + "learning_rate": 2.1835800348191604e-06, + "loss": 0.2583, + "step": 11127 + }, + { + "epoch": 0.7, + "grad_norm": 0.5497573113746697, + "learning_rate": 2.182738563426057e-06, + "loss": 0.4468, + "step": 11128 + }, + { + "epoch": 0.7, + "grad_norm": 1.9332307363591024, + "learning_rate": 2.181897208926136e-06, + "loss": 0.2603, + "step": 11129 + }, + { + "epoch": 0.7, + "grad_norm": 1.3140671813940632, + "learning_rate": 2.1810559713543052e-06, + "loss": 0.2479, + "step": 11130 + }, + { + "epoch": 0.7, + "grad_norm": 1.5273597655369189, + "learning_rate": 2.1802148507454675e-06, + "loss": 0.2669, + "step": 11131 + }, + { + "epoch": 0.7, + "grad_norm": 1.6823541823304715, + "learning_rate": 2.1793738471345253e-06, + "loss": 0.2507, + "step": 11132 + }, + { + "epoch": 0.7, + "grad_norm": 1.5571655208137851, + "learning_rate": 2.1785329605563733e-06, + "loss": 0.2642, + "step": 11133 + }, + { + "epoch": 0.7, + "grad_norm": 3.2515328013997116, + "learning_rate": 2.1776921910458982e-06, + "loss": 0.2714, + "step": 11134 + }, + { + "epoch": 0.7, + "grad_norm": 2.2716633686626753, + "learning_rate": 2.1768515386379886e-06, + "loss": 0.2675, + "step": 11135 + }, + { + "epoch": 0.7, + "grad_norm": 2.7139707947096543, + "learning_rate": 2.1760110033675215e-06, + "loss": 0.2651, + "step": 11136 + }, + { + "epoch": 0.7, + "grad_norm": 3.818952731486872, + "learning_rate": 2.175170585269375e-06, + "loss": 0.2651, + "step": 11137 + }, + { + "epoch": 0.7, + "grad_norm": 2.7930437580722884, + "learning_rate": 2.174330284378419e-06, + "loss": 0.2592, + "step": 11138 + }, + { + "epoch": 0.7, + "grad_norm": 1.488512840589353, + "learning_rate": 2.1734901007295166e-06, + "loss": 0.263, + "step": 11139 + }, + { + "epoch": 0.7, + "grad_norm": 3.3384207012645537, + "learning_rate": 2.172650034357532e-06, + "loss": 0.3014, + "step": 11140 + }, + { + "epoch": 0.7, + "grad_norm": 1.7797887466403626, + "learning_rate": 2.1718100852973193e-06, + "loss": 0.2636, + "step": 11141 + }, + { + "epoch": 0.7, + "grad_norm": 2.2366899928859194, + "learning_rate": 2.1709702535837286e-06, + "loss": 0.263, + "step": 11142 + }, + { + "epoch": 0.7, + "grad_norm": 1.8869791297124512, + "learning_rate": 2.1701305392516083e-06, + "loss": 0.2718, + "step": 11143 + }, + { + "epoch": 0.7, + "grad_norm": 1.627419730919339, + "learning_rate": 2.169290942335796e-06, + "loss": 0.2685, + "step": 11144 + }, + { + "epoch": 0.7, + "grad_norm": 3.9933536815401744, + "learning_rate": 2.1684514628711327e-06, + "loss": 0.2591, + "step": 11145 + }, + { + "epoch": 0.7, + "grad_norm": 0.585672142831425, + "learning_rate": 2.167612100892448e-06, + "loss": 0.4837, + "step": 11146 + }, + { + "epoch": 0.7, + "grad_norm": 1.9656785182238308, + "learning_rate": 2.166772856434566e-06, + "loss": 0.2734, + "step": 11147 + }, + { + "epoch": 0.7, + "grad_norm": 1.689638391273336, + "learning_rate": 2.1659337295323117e-06, + "loss": 0.2667, + "step": 11148 + }, + { + "epoch": 0.7, + "grad_norm": 1.6796494381982516, + "learning_rate": 2.1650947202204998e-06, + "loss": 0.2477, + "step": 11149 + }, + { + "epoch": 0.7, + "grad_norm": 1.486513243309083, + "learning_rate": 2.164255828533945e-06, + "loss": 0.2616, + "step": 11150 + }, + { + "epoch": 0.7, + "grad_norm": 3.1551226352039694, + "learning_rate": 2.1634170545074525e-06, + "loss": 0.2679, + "step": 11151 + }, + { + "epoch": 0.7, + "grad_norm": 2.074976292114402, + "learning_rate": 2.162578398175823e-06, + "loss": 0.2644, + "step": 11152 + }, + { + "epoch": 0.7, + "grad_norm": 2.784326610502005, + "learning_rate": 2.1617398595738575e-06, + "loss": 0.2662, + "step": 11153 + }, + { + "epoch": 0.7, + "grad_norm": 2.0804451457232425, + "learning_rate": 2.1609014387363463e-06, + "loss": 0.253, + "step": 11154 + }, + { + "epoch": 0.7, + "grad_norm": 1.8393783950324805, + "learning_rate": 2.1600631356980755e-06, + "loss": 0.2741, + "step": 11155 + }, + { + "epoch": 0.7, + "grad_norm": 3.8965425607078283, + "learning_rate": 2.1592249504938317e-06, + "loss": 0.2648, + "step": 11156 + }, + { + "epoch": 0.7, + "grad_norm": 3.13709796683566, + "learning_rate": 2.1583868831583883e-06, + "loss": 0.3057, + "step": 11157 + }, + { + "epoch": 0.7, + "grad_norm": 1.521775100929573, + "learning_rate": 2.1575489337265226e-06, + "loss": 0.2547, + "step": 11158 + }, + { + "epoch": 0.7, + "grad_norm": 2.2896605781529495, + "learning_rate": 2.156711102233e-06, + "loss": 0.2598, + "step": 11159 + }, + { + "epoch": 0.7, + "grad_norm": 3.0154422851143377, + "learning_rate": 2.155873388712582e-06, + "loss": 0.2795, + "step": 11160 + }, + { + "epoch": 0.7, + "grad_norm": 2.2864212875100796, + "learning_rate": 2.155035793200031e-06, + "loss": 0.2646, + "step": 11161 + }, + { + "epoch": 0.7, + "grad_norm": 1.6538919218949895, + "learning_rate": 2.1541983157300983e-06, + "loss": 0.2664, + "step": 11162 + }, + { + "epoch": 0.7, + "grad_norm": 4.097220743349051, + "learning_rate": 2.15336095633753e-06, + "loss": 0.2788, + "step": 11163 + }, + { + "epoch": 0.7, + "grad_norm": 1.620666279732925, + "learning_rate": 2.152523715057072e-06, + "loss": 0.2747, + "step": 11164 + }, + { + "epoch": 0.7, + "grad_norm": 3.1825001412007596, + "learning_rate": 2.151686591923465e-06, + "loss": 0.2903, + "step": 11165 + }, + { + "epoch": 0.7, + "grad_norm": 2.634309417557884, + "learning_rate": 2.1508495869714395e-06, + "loss": 0.2575, + "step": 11166 + }, + { + "epoch": 0.7, + "grad_norm": 1.7076290668926548, + "learning_rate": 2.1500127002357264e-06, + "loss": 0.2708, + "step": 11167 + }, + { + "epoch": 0.7, + "grad_norm": 1.545814349863931, + "learning_rate": 2.1491759317510456e-06, + "loss": 0.2598, + "step": 11168 + }, + { + "epoch": 0.7, + "grad_norm": 1.8192730287147911, + "learning_rate": 2.148339281552122e-06, + "loss": 0.2671, + "step": 11169 + }, + { + "epoch": 0.7, + "grad_norm": 0.5983696864805788, + "learning_rate": 2.147502749673666e-06, + "loss": 0.4713, + "step": 11170 + }, + { + "epoch": 0.7, + "grad_norm": 2.4567805993697562, + "learning_rate": 2.146666336150386e-06, + "loss": 0.2584, + "step": 11171 + }, + { + "epoch": 0.7, + "grad_norm": 2.7143646039687903, + "learning_rate": 2.1458300410169875e-06, + "loss": 0.2721, + "step": 11172 + }, + { + "epoch": 0.7, + "grad_norm": 1.5423343491809198, + "learning_rate": 2.1449938643081717e-06, + "loss": 0.27, + "step": 11173 + }, + { + "epoch": 0.7, + "grad_norm": 5.500103176962945, + "learning_rate": 2.1441578060586322e-06, + "loss": 0.2648, + "step": 11174 + }, + { + "epoch": 0.7, + "grad_norm": 1.8847348174787146, + "learning_rate": 2.1433218663030576e-06, + "loss": 0.2666, + "step": 11175 + }, + { + "epoch": 0.7, + "grad_norm": 1.5373001390888033, + "learning_rate": 2.142486045076131e-06, + "loss": 0.2703, + "step": 11176 + }, + { + "epoch": 0.7, + "grad_norm": 1.4707709473272812, + "learning_rate": 2.141650342412536e-06, + "loss": 0.2726, + "step": 11177 + }, + { + "epoch": 0.7, + "grad_norm": 1.477765128720051, + "learning_rate": 2.1408147583469446e-06, + "loss": 0.2521, + "step": 11178 + }, + { + "epoch": 0.7, + "grad_norm": 2.38179731331009, + "learning_rate": 2.139979292914026e-06, + "loss": 0.2706, + "step": 11179 + }, + { + "epoch": 0.7, + "grad_norm": 2.201913602413681, + "learning_rate": 2.1391439461484463e-06, + "loss": 0.2605, + "step": 11180 + }, + { + "epoch": 0.7, + "grad_norm": 13.54492519426392, + "learning_rate": 2.1383087180848685e-06, + "loss": 0.2594, + "step": 11181 + }, + { + "epoch": 0.7, + "grad_norm": 1.6837664152401797, + "learning_rate": 2.137473608757944e-06, + "loss": 0.2779, + "step": 11182 + }, + { + "epoch": 0.7, + "grad_norm": 5.5342625439670545, + "learning_rate": 2.1366386182023237e-06, + "loss": 0.2605, + "step": 11183 + }, + { + "epoch": 0.7, + "grad_norm": 2.238471559858704, + "learning_rate": 2.1358037464526516e-06, + "loss": 0.2699, + "step": 11184 + }, + { + "epoch": 0.7, + "grad_norm": 4.21358807675687, + "learning_rate": 2.1349689935435715e-06, + "loss": 0.2727, + "step": 11185 + }, + { + "epoch": 0.7, + "grad_norm": 3.2865253666288954, + "learning_rate": 2.1341343595097163e-06, + "loss": 0.3045, + "step": 11186 + }, + { + "epoch": 0.7, + "grad_norm": 2.5527068162802222, + "learning_rate": 2.133299844385715e-06, + "loss": 0.2666, + "step": 11187 + }, + { + "epoch": 0.7, + "grad_norm": 2.3260381470819143, + "learning_rate": 2.1324654482061953e-06, + "loss": 0.2555, + "step": 11188 + }, + { + "epoch": 0.7, + "grad_norm": 0.6032118521629376, + "learning_rate": 2.131631171005779e-06, + "loss": 0.4699, + "step": 11189 + }, + { + "epoch": 0.7, + "grad_norm": 2.7199801215884585, + "learning_rate": 2.13079701281908e-06, + "loss": 0.2771, + "step": 11190 + }, + { + "epoch": 0.7, + "grad_norm": 1.7997946886436644, + "learning_rate": 2.129962973680708e-06, + "loss": 0.2753, + "step": 11191 + }, + { + "epoch": 0.7, + "grad_norm": 3.3796796507655373, + "learning_rate": 2.1291290536252716e-06, + "loss": 0.2567, + "step": 11192 + }, + { + "epoch": 0.7, + "grad_norm": 4.750675715309363, + "learning_rate": 2.12829525268737e-06, + "loss": 0.2799, + "step": 11193 + }, + { + "epoch": 0.7, + "grad_norm": 2.40052063887953, + "learning_rate": 2.1274615709015972e-06, + "loss": 0.2622, + "step": 11194 + }, + { + "epoch": 0.7, + "grad_norm": 3.4484046836847706, + "learning_rate": 2.126628008302548e-06, + "loss": 0.2883, + "step": 11195 + }, + { + "epoch": 0.7, + "grad_norm": 2.1333827805240704, + "learning_rate": 2.1257945649248048e-06, + "loss": 0.2729, + "step": 11196 + }, + { + "epoch": 0.7, + "grad_norm": 1.5699763365801473, + "learning_rate": 2.1249612408029516e-06, + "loss": 0.246, + "step": 11197 + }, + { + "epoch": 0.7, + "grad_norm": 2.9683026273894892, + "learning_rate": 2.124128035971563e-06, + "loss": 0.2814, + "step": 11198 + }, + { + "epoch": 0.7, + "grad_norm": 1.8845964088070704, + "learning_rate": 2.123294950465209e-06, + "loss": 0.268, + "step": 11199 + }, + { + "epoch": 0.7, + "grad_norm": 1.4046556680794728, + "learning_rate": 2.122461984318459e-06, + "loss": 0.2496, + "step": 11200 + }, + { + "epoch": 0.7, + "grad_norm": 2.362065342879254, + "learning_rate": 2.121629137565872e-06, + "loss": 0.2524, + "step": 11201 + }, + { + "epoch": 0.7, + "grad_norm": 1.8259556858118575, + "learning_rate": 2.1207964102420032e-06, + "loss": 0.2635, + "step": 11202 + }, + { + "epoch": 0.7, + "grad_norm": 2.946762179583013, + "learning_rate": 2.119963802381407e-06, + "loss": 0.2573, + "step": 11203 + }, + { + "epoch": 0.7, + "grad_norm": 1.479033609249847, + "learning_rate": 2.119131314018626e-06, + "loss": 0.2742, + "step": 11204 + }, + { + "epoch": 0.7, + "grad_norm": 0.5973115487074803, + "learning_rate": 2.1182989451882056e-06, + "loss": 0.4903, + "step": 11205 + }, + { + "epoch": 0.7, + "grad_norm": 1.5642157796981417, + "learning_rate": 2.117466695924681e-06, + "loss": 0.2651, + "step": 11206 + }, + { + "epoch": 0.7, + "grad_norm": 1.257603880333518, + "learning_rate": 2.116634566262581e-06, + "loss": 0.242, + "step": 11207 + }, + { + "epoch": 0.7, + "grad_norm": 2.3102517941324026, + "learning_rate": 2.115802556236436e-06, + "loss": 0.2617, + "step": 11208 + }, + { + "epoch": 0.7, + "grad_norm": 2.6392256961189067, + "learning_rate": 2.1149706658807646e-06, + "loss": 0.2691, + "step": 11209 + }, + { + "epoch": 0.7, + "grad_norm": 1.976684480344322, + "learning_rate": 2.1141388952300855e-06, + "loss": 0.2957, + "step": 11210 + }, + { + "epoch": 0.71, + "grad_norm": 2.8422656535467308, + "learning_rate": 2.1133072443189106e-06, + "loss": 0.259, + "step": 11211 + }, + { + "epoch": 0.71, + "grad_norm": 1.367002481481189, + "learning_rate": 2.112475713181743e-06, + "loss": 0.2685, + "step": 11212 + }, + { + "epoch": 0.71, + "grad_norm": 2.5502750509184016, + "learning_rate": 2.1116443018530887e-06, + "loss": 0.2928, + "step": 11213 + }, + { + "epoch": 0.71, + "grad_norm": 2.695723953025711, + "learning_rate": 2.110813010367443e-06, + "loss": 0.2697, + "step": 11214 + }, + { + "epoch": 0.71, + "grad_norm": 2.120399936015536, + "learning_rate": 2.109981838759295e-06, + "loss": 0.2632, + "step": 11215 + }, + { + "epoch": 0.71, + "grad_norm": 0.6510816232567276, + "learning_rate": 2.109150787063134e-06, + "loss": 0.4772, + "step": 11216 + }, + { + "epoch": 0.71, + "grad_norm": 1.6061274158898775, + "learning_rate": 2.108319855313443e-06, + "loss": 0.2485, + "step": 11217 + }, + { + "epoch": 0.71, + "grad_norm": 2.1842445092804588, + "learning_rate": 2.1074890435446976e-06, + "loss": 0.2538, + "step": 11218 + }, + { + "epoch": 0.71, + "grad_norm": 1.5670237583121522, + "learning_rate": 2.1066583517913696e-06, + "loss": 0.2578, + "step": 11219 + }, + { + "epoch": 0.71, + "grad_norm": 2.618919175699726, + "learning_rate": 2.105827780087924e-06, + "loss": 0.2719, + "step": 11220 + }, + { + "epoch": 0.71, + "grad_norm": 8.536684002560847, + "learning_rate": 2.104997328468826e-06, + "loss": 0.2759, + "step": 11221 + }, + { + "epoch": 0.71, + "grad_norm": 0.5864349389049004, + "learning_rate": 2.104166996968531e-06, + "loss": 0.4442, + "step": 11222 + }, + { + "epoch": 0.71, + "grad_norm": 25.57374727550384, + "learning_rate": 2.10333678562149e-06, + "loss": 0.2453, + "step": 11223 + }, + { + "epoch": 0.71, + "grad_norm": 2.1942965238801406, + "learning_rate": 2.1025066944621498e-06, + "loss": 0.2526, + "step": 11224 + }, + { + "epoch": 0.71, + "grad_norm": 0.5604876554963547, + "learning_rate": 2.101676723524955e-06, + "loss": 0.4653, + "step": 11225 + }, + { + "epoch": 0.71, + "grad_norm": 1.538286455308112, + "learning_rate": 2.1008468728443416e-06, + "loss": 0.2652, + "step": 11226 + }, + { + "epoch": 0.71, + "grad_norm": 4.305232031863097, + "learning_rate": 2.1000171424547406e-06, + "loss": 0.2627, + "step": 11227 + }, + { + "epoch": 0.71, + "grad_norm": 1.5268653904148923, + "learning_rate": 2.0991875323905773e-06, + "loss": 0.2597, + "step": 11228 + }, + { + "epoch": 0.71, + "grad_norm": 2.4579551686758565, + "learning_rate": 2.0983580426862777e-06, + "loss": 0.2527, + "step": 11229 + }, + { + "epoch": 0.71, + "grad_norm": 1.6436004300885614, + "learning_rate": 2.0975286733762575e-06, + "loss": 0.2676, + "step": 11230 + }, + { + "epoch": 0.71, + "grad_norm": 2.0157942786307443, + "learning_rate": 2.096699424494925e-06, + "loss": 0.2687, + "step": 11231 + }, + { + "epoch": 0.71, + "grad_norm": 2.089680836571682, + "learning_rate": 2.0958702960766907e-06, + "loss": 0.275, + "step": 11232 + }, + { + "epoch": 0.71, + "grad_norm": 2.4347347039415697, + "learning_rate": 2.0950412881559584e-06, + "loss": 0.2839, + "step": 11233 + }, + { + "epoch": 0.71, + "grad_norm": 1.8412385516769503, + "learning_rate": 2.094212400767122e-06, + "loss": 0.2751, + "step": 11234 + }, + { + "epoch": 0.71, + "grad_norm": 2.072658839230748, + "learning_rate": 2.093383633944575e-06, + "loss": 0.2532, + "step": 11235 + }, + { + "epoch": 0.71, + "grad_norm": 2.642079530520852, + "learning_rate": 2.092554987722702e-06, + "loss": 0.2385, + "step": 11236 + }, + { + "epoch": 0.71, + "grad_norm": 1.8786982703045854, + "learning_rate": 2.091726462135888e-06, + "loss": 0.278, + "step": 11237 + }, + { + "epoch": 0.71, + "grad_norm": 1.8000350325657724, + "learning_rate": 2.0908980572185093e-06, + "loss": 0.2574, + "step": 11238 + }, + { + "epoch": 0.71, + "grad_norm": 2.059791199192832, + "learning_rate": 2.0900697730049353e-06, + "loss": 0.2777, + "step": 11239 + }, + { + "epoch": 0.71, + "grad_norm": 2.4467078106599707, + "learning_rate": 2.089241609529535e-06, + "loss": 0.2698, + "step": 11240 + }, + { + "epoch": 0.71, + "grad_norm": 2.5454674748671775, + "learning_rate": 2.0884135668266725e-06, + "loss": 0.2583, + "step": 11241 + }, + { + "epoch": 0.71, + "grad_norm": 1.5129349252160442, + "learning_rate": 2.0875856449307026e-06, + "loss": 0.2698, + "step": 11242 + }, + { + "epoch": 0.71, + "grad_norm": 2.078105517035438, + "learning_rate": 2.0867578438759766e-06, + "loss": 0.2515, + "step": 11243 + }, + { + "epoch": 0.71, + "grad_norm": 1.8883553602994059, + "learning_rate": 2.085930163696841e-06, + "loss": 0.2811, + "step": 11244 + }, + { + "epoch": 0.71, + "grad_norm": 1.6460025422701143, + "learning_rate": 2.0851026044276405e-06, + "loss": 0.2616, + "step": 11245 + }, + { + "epoch": 0.71, + "grad_norm": 1.6729825269022631, + "learning_rate": 2.0842751661027087e-06, + "loss": 0.2549, + "step": 11246 + }, + { + "epoch": 0.71, + "grad_norm": 2.219001016374269, + "learning_rate": 2.0834478487563815e-06, + "loss": 0.2702, + "step": 11247 + }, + { + "epoch": 0.71, + "grad_norm": 1.6866832792573418, + "learning_rate": 2.082620652422981e-06, + "loss": 0.2761, + "step": 11248 + }, + { + "epoch": 0.71, + "grad_norm": 2.055320779195588, + "learning_rate": 2.081793577136833e-06, + "loss": 0.2475, + "step": 11249 + }, + { + "epoch": 0.71, + "grad_norm": 1.9123174739920292, + "learning_rate": 2.0809666229322533e-06, + "loss": 0.2486, + "step": 11250 + }, + { + "epoch": 0.71, + "grad_norm": 1.997780556288267, + "learning_rate": 2.0801397898435534e-06, + "loss": 0.2686, + "step": 11251 + }, + { + "epoch": 0.71, + "grad_norm": 2.1026974105828473, + "learning_rate": 2.0793130779050374e-06, + "loss": 0.2579, + "step": 11252 + }, + { + "epoch": 0.71, + "grad_norm": 1.8078411799573328, + "learning_rate": 2.078486487151012e-06, + "loss": 0.26, + "step": 11253 + }, + { + "epoch": 0.71, + "grad_norm": 2.882044324375287, + "learning_rate": 2.077660017615769e-06, + "loss": 0.2412, + "step": 11254 + }, + { + "epoch": 0.71, + "grad_norm": 2.2613193331472154, + "learning_rate": 2.076833669333605e-06, + "loss": 0.2832, + "step": 11255 + }, + { + "epoch": 0.71, + "grad_norm": 4.408733614715769, + "learning_rate": 2.0760074423388015e-06, + "loss": 0.2847, + "step": 11256 + }, + { + "epoch": 0.71, + "grad_norm": 2.70585575241767, + "learning_rate": 2.075181336665645e-06, + "loss": 0.2561, + "step": 11257 + }, + { + "epoch": 0.71, + "grad_norm": 2.8369894987381366, + "learning_rate": 2.0743553523484107e-06, + "loss": 0.2864, + "step": 11258 + }, + { + "epoch": 0.71, + "grad_norm": 1.9047531515855274, + "learning_rate": 2.0735294894213687e-06, + "loss": 0.2486, + "step": 11259 + }, + { + "epoch": 0.71, + "grad_norm": 1.8312429988424974, + "learning_rate": 2.072703747918784e-06, + "loss": 0.277, + "step": 11260 + }, + { + "epoch": 0.71, + "grad_norm": 1.84859456575309, + "learning_rate": 2.0718781278749206e-06, + "loss": 0.2704, + "step": 11261 + }, + { + "epoch": 0.71, + "grad_norm": 1.4552664135300197, + "learning_rate": 2.0710526293240367e-06, + "loss": 0.2571, + "step": 11262 + }, + { + "epoch": 0.71, + "grad_norm": 6.2689442857165725, + "learning_rate": 2.0702272523003815e-06, + "loss": 0.2815, + "step": 11263 + }, + { + "epoch": 0.71, + "grad_norm": 3.4942252443609294, + "learning_rate": 2.0694019968381993e-06, + "loss": 0.2756, + "step": 11264 + }, + { + "epoch": 0.71, + "grad_norm": 2.261708605292174, + "learning_rate": 2.068576862971736e-06, + "loss": 0.2721, + "step": 11265 + }, + { + "epoch": 0.71, + "grad_norm": 3.1416618633960653, + "learning_rate": 2.0677518507352246e-06, + "loss": 0.2548, + "step": 11266 + }, + { + "epoch": 0.71, + "grad_norm": 3.741672270038633, + "learning_rate": 2.066926960162896e-06, + "loss": 0.248, + "step": 11267 + }, + { + "epoch": 0.71, + "grad_norm": 3.3673046110835774, + "learning_rate": 2.066102191288979e-06, + "loss": 0.2728, + "step": 11268 + }, + { + "epoch": 0.71, + "grad_norm": 1.8066562817407386, + "learning_rate": 2.065277544147692e-06, + "loss": 0.2591, + "step": 11269 + }, + { + "epoch": 0.71, + "grad_norm": 1.6442037120335296, + "learning_rate": 2.0644530187732538e-06, + "loss": 0.2759, + "step": 11270 + }, + { + "epoch": 0.71, + "grad_norm": 2.7962337544630333, + "learning_rate": 2.0636286151998737e-06, + "loss": 0.2592, + "step": 11271 + }, + { + "epoch": 0.71, + "grad_norm": 3.7439235779197984, + "learning_rate": 2.0628043334617565e-06, + "loss": 0.2651, + "step": 11272 + }, + { + "epoch": 0.71, + "grad_norm": 1.802187952148651, + "learning_rate": 2.061980173593106e-06, + "loss": 0.2611, + "step": 11273 + }, + { + "epoch": 0.71, + "grad_norm": 2.046964530052051, + "learning_rate": 2.061156135628117e-06, + "loss": 0.2458, + "step": 11274 + }, + { + "epoch": 0.71, + "grad_norm": 1.7023506125150913, + "learning_rate": 2.060332219600978e-06, + "loss": 0.2495, + "step": 11275 + }, + { + "epoch": 0.71, + "grad_norm": 1.4510190663030633, + "learning_rate": 2.059508425545876e-06, + "loss": 0.2583, + "step": 11276 + }, + { + "epoch": 0.71, + "grad_norm": 0.5984112980284597, + "learning_rate": 2.058684753496995e-06, + "loss": 0.4544, + "step": 11277 + }, + { + "epoch": 0.71, + "grad_norm": 1.718038916639918, + "learning_rate": 2.0578612034885083e-06, + "loss": 0.2435, + "step": 11278 + }, + { + "epoch": 0.71, + "grad_norm": 2.1972264636950234, + "learning_rate": 2.057037775554585e-06, + "loss": 0.2464, + "step": 11279 + }, + { + "epoch": 0.71, + "grad_norm": 1.3567895580061946, + "learning_rate": 2.056214469729391e-06, + "loss": 0.2606, + "step": 11280 + }, + { + "epoch": 0.71, + "grad_norm": 1.6567632067021063, + "learning_rate": 2.055391286047088e-06, + "loss": 0.2594, + "step": 11281 + }, + { + "epoch": 0.71, + "grad_norm": 2.132715962089489, + "learning_rate": 2.0545682245418314e-06, + "loss": 0.2676, + "step": 11282 + }, + { + "epoch": 0.71, + "grad_norm": 1.8983599977686207, + "learning_rate": 2.053745285247769e-06, + "loss": 0.2495, + "step": 11283 + }, + { + "epoch": 0.71, + "grad_norm": 2.2749586217527535, + "learning_rate": 2.0529224681990477e-06, + "loss": 0.2441, + "step": 11284 + }, + { + "epoch": 0.71, + "grad_norm": 2.194913080985443, + "learning_rate": 2.0520997734298096e-06, + "loss": 0.2555, + "step": 11285 + }, + { + "epoch": 0.71, + "grad_norm": 1.6124897014250361, + "learning_rate": 2.0512772009741883e-06, + "loss": 0.2616, + "step": 11286 + }, + { + "epoch": 0.71, + "grad_norm": 3.539949427933494, + "learning_rate": 2.050454750866313e-06, + "loss": 0.2718, + "step": 11287 + }, + { + "epoch": 0.71, + "grad_norm": 2.7385771706836386, + "learning_rate": 2.0496324231403074e-06, + "loss": 0.2653, + "step": 11288 + }, + { + "epoch": 0.71, + "grad_norm": 1.6378637675240364, + "learning_rate": 2.048810217830295e-06, + "loss": 0.2604, + "step": 11289 + }, + { + "epoch": 0.71, + "grad_norm": 2.0440401002533757, + "learning_rate": 2.0479881349703885e-06, + "loss": 0.2511, + "step": 11290 + }, + { + "epoch": 0.71, + "grad_norm": 2.6730871924817556, + "learning_rate": 2.047166174594696e-06, + "loss": 0.2533, + "step": 11291 + }, + { + "epoch": 0.71, + "grad_norm": 1.8096736846885213, + "learning_rate": 2.0463443367373232e-06, + "loss": 0.2552, + "step": 11292 + }, + { + "epoch": 0.71, + "grad_norm": 1.8010067054137464, + "learning_rate": 2.045522621432372e-06, + "loss": 0.2632, + "step": 11293 + }, + { + "epoch": 0.71, + "grad_norm": 4.317427290002973, + "learning_rate": 2.0447010287139357e-06, + "loss": 0.2924, + "step": 11294 + }, + { + "epoch": 0.71, + "grad_norm": 1.7938139433949192, + "learning_rate": 2.0438795586161027e-06, + "loss": 0.2562, + "step": 11295 + }, + { + "epoch": 0.71, + "grad_norm": 1.696871152362575, + "learning_rate": 2.0430582111729553e-06, + "loss": 0.2522, + "step": 11296 + }, + { + "epoch": 0.71, + "grad_norm": 3.303049992480189, + "learning_rate": 2.0422369864185764e-06, + "loss": 0.2691, + "step": 11297 + }, + { + "epoch": 0.71, + "grad_norm": 1.7809763358651913, + "learning_rate": 2.0414158843870374e-06, + "loss": 0.2572, + "step": 11298 + }, + { + "epoch": 0.71, + "grad_norm": 1.5836344539530396, + "learning_rate": 2.0405949051124098e-06, + "loss": 0.2917, + "step": 11299 + }, + { + "epoch": 0.71, + "grad_norm": 6.499452163795848, + "learning_rate": 2.039774048628754e-06, + "loss": 0.3023, + "step": 11300 + }, + { + "epoch": 0.71, + "grad_norm": 2.742859695149067, + "learning_rate": 2.0389533149701337e-06, + "loss": 0.2683, + "step": 11301 + }, + { + "epoch": 0.71, + "grad_norm": 1.7488158903285846, + "learning_rate": 2.038132704170599e-06, + "loss": 0.2764, + "step": 11302 + }, + { + "epoch": 0.71, + "grad_norm": 1.8710981474400807, + "learning_rate": 2.037312216264199e-06, + "loss": 0.2616, + "step": 11303 + }, + { + "epoch": 0.71, + "grad_norm": 1.7558263431775918, + "learning_rate": 2.0364918512849763e-06, + "loss": 0.2754, + "step": 11304 + }, + { + "epoch": 0.71, + "grad_norm": 1.3723448692660163, + "learning_rate": 2.035671609266972e-06, + "loss": 0.2581, + "step": 11305 + }, + { + "epoch": 0.71, + "grad_norm": 2.655202570536015, + "learning_rate": 2.0348514902442163e-06, + "loss": 0.2574, + "step": 11306 + }, + { + "epoch": 0.71, + "grad_norm": 1.4593716364778655, + "learning_rate": 2.0340314942507404e-06, + "loss": 0.2409, + "step": 11307 + }, + { + "epoch": 0.71, + "grad_norm": 1.4099536551064702, + "learning_rate": 2.0332116213205643e-06, + "loss": 0.2351, + "step": 11308 + }, + { + "epoch": 0.71, + "grad_norm": 1.631861540894847, + "learning_rate": 2.0323918714877094e-06, + "loss": 0.2514, + "step": 11309 + }, + { + "epoch": 0.71, + "grad_norm": 1.8491413432317427, + "learning_rate": 2.031572244786187e-06, + "loss": 0.2756, + "step": 11310 + }, + { + "epoch": 0.71, + "grad_norm": 3.418752229745969, + "learning_rate": 2.0307527412500044e-06, + "loss": 0.2906, + "step": 11311 + }, + { + "epoch": 0.71, + "grad_norm": 3.559290461037333, + "learning_rate": 2.0299333609131635e-06, + "loss": 0.2817, + "step": 11312 + }, + { + "epoch": 0.71, + "grad_norm": 4.14333564023226, + "learning_rate": 2.0291141038096616e-06, + "loss": 0.2633, + "step": 11313 + }, + { + "epoch": 0.71, + "grad_norm": 1.988662410341561, + "learning_rate": 2.028294969973495e-06, + "loss": 0.2566, + "step": 11314 + }, + { + "epoch": 0.71, + "grad_norm": 2.276707696834116, + "learning_rate": 2.0274759594386485e-06, + "loss": 0.2485, + "step": 11315 + }, + { + "epoch": 0.71, + "grad_norm": 0.6027524435209456, + "learning_rate": 2.0266570722391023e-06, + "loss": 0.4605, + "step": 11316 + }, + { + "epoch": 0.71, + "grad_norm": 1.5611061983899408, + "learning_rate": 2.025838308408837e-06, + "loss": 0.2797, + "step": 11317 + }, + { + "epoch": 0.71, + "grad_norm": 1.7898125265811011, + "learning_rate": 2.0250196679818234e-06, + "loss": 0.2612, + "step": 11318 + }, + { + "epoch": 0.71, + "grad_norm": 1.8054608946122124, + "learning_rate": 2.024201150992028e-06, + "loss": 0.274, + "step": 11319 + }, + { + "epoch": 0.71, + "grad_norm": 1.8249182693775046, + "learning_rate": 2.02338275747341e-06, + "loss": 0.2681, + "step": 11320 + }, + { + "epoch": 0.71, + "grad_norm": 4.2086545351873355, + "learning_rate": 2.022564487459929e-06, + "loss": 0.2592, + "step": 11321 + }, + { + "epoch": 0.71, + "grad_norm": 2.263025405654816, + "learning_rate": 2.0217463409855377e-06, + "loss": 0.2577, + "step": 11322 + }, + { + "epoch": 0.71, + "grad_norm": 1.6013874667329715, + "learning_rate": 2.0209283180841803e-06, + "loss": 0.2683, + "step": 11323 + }, + { + "epoch": 0.71, + "grad_norm": 2.716110993432087, + "learning_rate": 2.0201104187897975e-06, + "loss": 0.2551, + "step": 11324 + }, + { + "epoch": 0.71, + "grad_norm": 2.1477287728078567, + "learning_rate": 2.0192926431363274e-06, + "loss": 0.2529, + "step": 11325 + }, + { + "epoch": 0.71, + "grad_norm": 2.315932900324255, + "learning_rate": 2.0184749911576996e-06, + "loss": 0.2609, + "step": 11326 + }, + { + "epoch": 0.71, + "grad_norm": 1.5035421602010106, + "learning_rate": 2.017657462887841e-06, + "loss": 0.2578, + "step": 11327 + }, + { + "epoch": 0.71, + "grad_norm": 5.905785262196667, + "learning_rate": 2.016840058360667e-06, + "loss": 0.2752, + "step": 11328 + }, + { + "epoch": 0.71, + "grad_norm": 2.764265828420965, + "learning_rate": 2.016022777610102e-06, + "loss": 0.2609, + "step": 11329 + }, + { + "epoch": 0.71, + "grad_norm": 2.0082251367652777, + "learning_rate": 2.0152056206700516e-06, + "loss": 0.2576, + "step": 11330 + }, + { + "epoch": 0.71, + "grad_norm": 1.9534691166959397, + "learning_rate": 2.0143885875744217e-06, + "loss": 0.2507, + "step": 11331 + }, + { + "epoch": 0.71, + "grad_norm": 1.9746059108719314, + "learning_rate": 2.013571678357111e-06, + "loss": 0.2694, + "step": 11332 + }, + { + "epoch": 0.71, + "grad_norm": 1.6165579992429837, + "learning_rate": 2.0127548930520173e-06, + "loss": 0.2777, + "step": 11333 + }, + { + "epoch": 0.71, + "grad_norm": 7.443260674923377, + "learning_rate": 2.0119382316930293e-06, + "loss": 0.265, + "step": 11334 + }, + { + "epoch": 0.71, + "grad_norm": 1.9760462315592937, + "learning_rate": 2.011121694314031e-06, + "loss": 0.2701, + "step": 11335 + }, + { + "epoch": 0.71, + "grad_norm": 4.320175705161023, + "learning_rate": 2.0103052809488994e-06, + "loss": 0.2614, + "step": 11336 + }, + { + "epoch": 0.71, + "grad_norm": 2.020051859425412, + "learning_rate": 2.009488991631515e-06, + "loss": 0.2622, + "step": 11337 + }, + { + "epoch": 0.71, + "grad_norm": 2.721175284752356, + "learning_rate": 2.0086728263957446e-06, + "loss": 0.2686, + "step": 11338 + }, + { + "epoch": 0.71, + "grad_norm": 1.6424330441567097, + "learning_rate": 2.007856785275451e-06, + "loss": 0.2632, + "step": 11339 + }, + { + "epoch": 0.71, + "grad_norm": 2.101973955010961, + "learning_rate": 2.0070408683044927e-06, + "loss": 0.2554, + "step": 11340 + }, + { + "epoch": 0.71, + "grad_norm": 1.7957450911433073, + "learning_rate": 2.0062250755167273e-06, + "loss": 0.2608, + "step": 11341 + }, + { + "epoch": 0.71, + "grad_norm": 1.434905798012273, + "learning_rate": 2.005409406946e-06, + "loss": 0.2681, + "step": 11342 + }, + { + "epoch": 0.71, + "grad_norm": 4.055028020870884, + "learning_rate": 2.0045938626261545e-06, + "loss": 0.2695, + "step": 11343 + }, + { + "epoch": 0.71, + "grad_norm": 1.4087870325594685, + "learning_rate": 2.0037784425910297e-06, + "loss": 0.2586, + "step": 11344 + }, + { + "epoch": 0.71, + "grad_norm": 2.3772419911175398, + "learning_rate": 2.0029631468744608e-06, + "loss": 0.2676, + "step": 11345 + }, + { + "epoch": 0.71, + "grad_norm": 1.628980658008442, + "learning_rate": 2.0021479755102746e-06, + "loss": 0.2662, + "step": 11346 + }, + { + "epoch": 0.71, + "grad_norm": 2.73750429787218, + "learning_rate": 2.0013329285322935e-06, + "loss": 0.2606, + "step": 11347 + }, + { + "epoch": 0.71, + "grad_norm": 2.216705420367553, + "learning_rate": 2.000518005974334e-06, + "loss": 0.2732, + "step": 11348 + }, + { + "epoch": 0.71, + "grad_norm": 1.5501045833967493, + "learning_rate": 1.999703207870211e-06, + "loss": 0.2518, + "step": 11349 + }, + { + "epoch": 0.71, + "grad_norm": 2.345120807856687, + "learning_rate": 1.998888534253732e-06, + "loss": 0.2445, + "step": 11350 + }, + { + "epoch": 0.71, + "grad_norm": 0.5801532472428638, + "learning_rate": 1.9980739851586962e-06, + "loss": 0.467, + "step": 11351 + }, + { + "epoch": 0.71, + "grad_norm": 1.4128805356799254, + "learning_rate": 1.9972595606189026e-06, + "loss": 0.2514, + "step": 11352 + }, + { + "epoch": 0.71, + "grad_norm": 1.8872851415395644, + "learning_rate": 1.996445260668145e-06, + "loss": 0.2667, + "step": 11353 + }, + { + "epoch": 0.71, + "grad_norm": 1.51318997529435, + "learning_rate": 1.995631085340209e-06, + "loss": 0.2538, + "step": 11354 + }, + { + "epoch": 0.71, + "grad_norm": 1.9785116508051475, + "learning_rate": 1.994817034668875e-06, + "loss": 0.2547, + "step": 11355 + }, + { + "epoch": 0.71, + "grad_norm": 2.1575406168232623, + "learning_rate": 1.9940031086879184e-06, + "loss": 0.2498, + "step": 11356 + }, + { + "epoch": 0.71, + "grad_norm": 2.4897315642909783, + "learning_rate": 1.9931893074311145e-06, + "loss": 0.272, + "step": 11357 + }, + { + "epoch": 0.71, + "grad_norm": 1.7826147316133816, + "learning_rate": 1.9923756309322246e-06, + "loss": 0.2541, + "step": 11358 + }, + { + "epoch": 0.71, + "grad_norm": 2.3455543589097223, + "learning_rate": 1.9915620792250133e-06, + "loss": 0.2675, + "step": 11359 + }, + { + "epoch": 0.71, + "grad_norm": 0.5774023758944333, + "learning_rate": 1.9907486523432336e-06, + "loss": 0.4495, + "step": 11360 + }, + { + "epoch": 0.71, + "grad_norm": 1.6034605147044834, + "learning_rate": 1.989935350320639e-06, + "loss": 0.265, + "step": 11361 + }, + { + "epoch": 0.71, + "grad_norm": 2.1300232647843877, + "learning_rate": 1.9891221731909733e-06, + "loss": 0.268, + "step": 11362 + }, + { + "epoch": 0.71, + "grad_norm": 1.669427878905994, + "learning_rate": 1.988309120987977e-06, + "loss": 0.2664, + "step": 11363 + }, + { + "epoch": 0.71, + "grad_norm": 1.704242222687707, + "learning_rate": 1.987496193745382e-06, + "loss": 0.2603, + "step": 11364 + }, + { + "epoch": 0.71, + "grad_norm": 1.579530652329033, + "learning_rate": 1.986683391496923e-06, + "loss": 0.2534, + "step": 11365 + }, + { + "epoch": 0.71, + "grad_norm": 1.7661306057677901, + "learning_rate": 1.9858707142763205e-06, + "loss": 0.2562, + "step": 11366 + }, + { + "epoch": 0.71, + "grad_norm": 1.7420720500346207, + "learning_rate": 1.985058162117297e-06, + "loss": 0.2624, + "step": 11367 + }, + { + "epoch": 0.71, + "grad_norm": 0.5833631407263254, + "learning_rate": 1.9842457350535642e-06, + "loss": 0.5086, + "step": 11368 + }, + { + "epoch": 0.71, + "grad_norm": 0.5620309514403976, + "learning_rate": 1.9834334331188345e-06, + "loss": 0.4662, + "step": 11369 + }, + { + "epoch": 0.72, + "grad_norm": 1.294759900570698, + "learning_rate": 1.9826212563468095e-06, + "loss": 0.2521, + "step": 11370 + }, + { + "epoch": 0.72, + "grad_norm": 2.5657930904808723, + "learning_rate": 1.981809204771188e-06, + "loss": 0.2622, + "step": 11371 + }, + { + "epoch": 0.72, + "grad_norm": 3.3102104453249055, + "learning_rate": 1.9809972784256614e-06, + "loss": 0.2722, + "step": 11372 + }, + { + "epoch": 0.72, + "grad_norm": 2.113928762991423, + "learning_rate": 1.9801854773439206e-06, + "loss": 0.2465, + "step": 11373 + }, + { + "epoch": 0.72, + "grad_norm": 2.9706746452481925, + "learning_rate": 1.9793738015596497e-06, + "loss": 0.2603, + "step": 11374 + }, + { + "epoch": 0.72, + "grad_norm": 2.2277375849547645, + "learning_rate": 1.978562251106525e-06, + "loss": 0.2622, + "step": 11375 + }, + { + "epoch": 0.72, + "grad_norm": 2.907641102352029, + "learning_rate": 1.977750826018217e-06, + "loss": 0.2648, + "step": 11376 + }, + { + "epoch": 0.72, + "grad_norm": 3.592571779688514, + "learning_rate": 1.9769395263283973e-06, + "loss": 0.2776, + "step": 11377 + }, + { + "epoch": 0.72, + "grad_norm": 2.2381136243548942, + "learning_rate": 1.9761283520707268e-06, + "loss": 0.2761, + "step": 11378 + }, + { + "epoch": 0.72, + "grad_norm": 1.9940143447905285, + "learning_rate": 1.9753173032788616e-06, + "loss": 0.2401, + "step": 11379 + }, + { + "epoch": 0.72, + "grad_norm": 2.5298983352048547, + "learning_rate": 1.9745063799864506e-06, + "loss": 0.2899, + "step": 11380 + }, + { + "epoch": 0.72, + "grad_norm": 1.8051648505776428, + "learning_rate": 1.973695582227147e-06, + "loss": 0.27, + "step": 11381 + }, + { + "epoch": 0.72, + "grad_norm": 1.4991138970649578, + "learning_rate": 1.972884910034589e-06, + "loss": 0.2431, + "step": 11382 + }, + { + "epoch": 0.72, + "grad_norm": 2.0111220715082245, + "learning_rate": 1.972074363442413e-06, + "loss": 0.2606, + "step": 11383 + }, + { + "epoch": 0.72, + "grad_norm": 2.8018546620806766, + "learning_rate": 1.971263942484248e-06, + "loss": 0.2494, + "step": 11384 + }, + { + "epoch": 0.72, + "grad_norm": 4.002055503375998, + "learning_rate": 1.9704536471937238e-06, + "loss": 0.2641, + "step": 11385 + }, + { + "epoch": 0.72, + "grad_norm": 1.7357738843295063, + "learning_rate": 1.969643477604458e-06, + "loss": 0.2574, + "step": 11386 + }, + { + "epoch": 0.72, + "grad_norm": 2.028767298644523, + "learning_rate": 1.9688334337500677e-06, + "loss": 0.2617, + "step": 11387 + }, + { + "epoch": 0.72, + "grad_norm": 1.581514765675958, + "learning_rate": 1.968023515664158e-06, + "loss": 0.2517, + "step": 11388 + }, + { + "epoch": 0.72, + "grad_norm": 1.9354875005489085, + "learning_rate": 1.967213723380342e-06, + "loss": 0.2719, + "step": 11389 + }, + { + "epoch": 0.72, + "grad_norm": 2.143422083357361, + "learning_rate": 1.966404056932215e-06, + "loss": 0.2546, + "step": 11390 + }, + { + "epoch": 0.72, + "grad_norm": 1.7764018934828976, + "learning_rate": 1.9655945163533723e-06, + "loss": 0.2585, + "step": 11391 + }, + { + "epoch": 0.72, + "grad_norm": 1.497693339571945, + "learning_rate": 1.964785101677401e-06, + "loss": 0.2645, + "step": 11392 + }, + { + "epoch": 0.72, + "grad_norm": 2.2809630448123803, + "learning_rate": 1.9639758129378888e-06, + "loss": 0.2706, + "step": 11393 + }, + { + "epoch": 0.72, + "grad_norm": 3.075029666733048, + "learning_rate": 1.9631666501684126e-06, + "loss": 0.253, + "step": 11394 + }, + { + "epoch": 0.72, + "grad_norm": 1.643925236239441, + "learning_rate": 1.9623576134025447e-06, + "loss": 0.2568, + "step": 11395 + }, + { + "epoch": 0.72, + "grad_norm": 1.7403959416229888, + "learning_rate": 1.9615487026738546e-06, + "loss": 0.2488, + "step": 11396 + }, + { + "epoch": 0.72, + "grad_norm": 1.897176321303575, + "learning_rate": 1.9607399180159076e-06, + "loss": 0.2625, + "step": 11397 + }, + { + "epoch": 0.72, + "grad_norm": 1.4038872054017089, + "learning_rate": 1.9599312594622594e-06, + "loss": 0.2656, + "step": 11398 + }, + { + "epoch": 0.72, + "grad_norm": 2.4292549886742427, + "learning_rate": 1.9591227270464635e-06, + "loss": 0.2451, + "step": 11399 + }, + { + "epoch": 0.72, + "grad_norm": 1.6982447805255887, + "learning_rate": 1.958314320802065e-06, + "loss": 0.2666, + "step": 11400 + }, + { + "epoch": 0.72, + "grad_norm": 1.4206654848390734, + "learning_rate": 1.9575060407626094e-06, + "loss": 0.2529, + "step": 11401 + }, + { + "epoch": 0.72, + "grad_norm": 1.900697119871712, + "learning_rate": 1.9566978869616327e-06, + "loss": 0.2656, + "step": 11402 + }, + { + "epoch": 0.72, + "grad_norm": 1.6884534322469396, + "learning_rate": 1.9558898594326635e-06, + "loss": 0.2742, + "step": 11403 + }, + { + "epoch": 0.72, + "grad_norm": 2.341516010359561, + "learning_rate": 1.9550819582092317e-06, + "loss": 0.2589, + "step": 11404 + }, + { + "epoch": 0.72, + "grad_norm": 1.6031833805313624, + "learning_rate": 1.9542741833248597e-06, + "loss": 0.2622, + "step": 11405 + }, + { + "epoch": 0.72, + "grad_norm": 2.1997605954118207, + "learning_rate": 1.953466534813062e-06, + "loss": 0.2777, + "step": 11406 + }, + { + "epoch": 0.72, + "grad_norm": 2.261326186835715, + "learning_rate": 1.952659012707348e-06, + "loss": 0.2638, + "step": 11407 + }, + { + "epoch": 0.72, + "grad_norm": 5.663279408984441, + "learning_rate": 1.9518516170412234e-06, + "loss": 0.2684, + "step": 11408 + }, + { + "epoch": 0.72, + "grad_norm": 1.5113837783844868, + "learning_rate": 1.9510443478481905e-06, + "loss": 0.2465, + "step": 11409 + }, + { + "epoch": 0.72, + "grad_norm": 1.8365462034653697, + "learning_rate": 1.950237205161741e-06, + "loss": 0.2643, + "step": 11410 + }, + { + "epoch": 0.72, + "grad_norm": 1.4929882173889322, + "learning_rate": 1.9494301890153695e-06, + "loss": 0.2603, + "step": 11411 + }, + { + "epoch": 0.72, + "grad_norm": 1.408961938331628, + "learning_rate": 1.948623299442556e-06, + "loss": 0.2482, + "step": 11412 + }, + { + "epoch": 0.72, + "grad_norm": 1.737767357578204, + "learning_rate": 1.947816536476783e-06, + "loss": 0.2648, + "step": 11413 + }, + { + "epoch": 0.72, + "grad_norm": 2.524209273130116, + "learning_rate": 1.9470099001515237e-06, + "loss": 0.2671, + "step": 11414 + }, + { + "epoch": 0.72, + "grad_norm": 2.4741269951647045, + "learning_rate": 1.9462033905002457e-06, + "loss": 0.2698, + "step": 11415 + }, + { + "epoch": 0.72, + "grad_norm": 2.2131247086558754, + "learning_rate": 1.945397007556412e-06, + "loss": 0.2547, + "step": 11416 + }, + { + "epoch": 0.72, + "grad_norm": 2.4341785772433284, + "learning_rate": 1.9445907513534834e-06, + "loss": 0.2648, + "step": 11417 + }, + { + "epoch": 0.72, + "grad_norm": 3.5722984405073626, + "learning_rate": 1.94378462192491e-06, + "loss": 0.2631, + "step": 11418 + }, + { + "epoch": 0.72, + "grad_norm": 1.5805557957652592, + "learning_rate": 1.942978619304143e-06, + "loss": 0.2633, + "step": 11419 + }, + { + "epoch": 0.72, + "grad_norm": 1.472081491767904, + "learning_rate": 1.9421727435246214e-06, + "loss": 0.2489, + "step": 11420 + }, + { + "epoch": 0.72, + "grad_norm": 2.2142139857719663, + "learning_rate": 1.941366994619785e-06, + "loss": 0.2494, + "step": 11421 + }, + { + "epoch": 0.72, + "grad_norm": 1.7860826111303763, + "learning_rate": 1.9405613726230653e-06, + "loss": 0.2556, + "step": 11422 + }, + { + "epoch": 0.72, + "grad_norm": 2.8046385687067628, + "learning_rate": 1.939755877567888e-06, + "loss": 0.2595, + "step": 11423 + }, + { + "epoch": 0.72, + "grad_norm": 2.6670271164555346, + "learning_rate": 1.9389505094876736e-06, + "loss": 0.2674, + "step": 11424 + }, + { + "epoch": 0.72, + "grad_norm": 1.8890571202441133, + "learning_rate": 1.938145268415839e-06, + "loss": 0.2969, + "step": 11425 + }, + { + "epoch": 0.72, + "grad_norm": 4.9879525381207985, + "learning_rate": 1.9373401543857983e-06, + "loss": 0.2796, + "step": 11426 + }, + { + "epoch": 0.72, + "grad_norm": 1.3161419496663187, + "learning_rate": 1.936535167430954e-06, + "loss": 0.2518, + "step": 11427 + }, + { + "epoch": 0.72, + "grad_norm": 4.232052173403798, + "learning_rate": 1.935730307584705e-06, + "loss": 0.2599, + "step": 11428 + }, + { + "epoch": 0.72, + "grad_norm": 1.9742934527172964, + "learning_rate": 1.9349255748804506e-06, + "loss": 0.2528, + "step": 11429 + }, + { + "epoch": 0.72, + "grad_norm": 1.8988202940201568, + "learning_rate": 1.934120969351578e-06, + "loss": 0.2792, + "step": 11430 + }, + { + "epoch": 0.72, + "grad_norm": 2.403931595456558, + "learning_rate": 1.9333164910314715e-06, + "loss": 0.2518, + "step": 11431 + }, + { + "epoch": 0.72, + "grad_norm": 5.326714842309047, + "learning_rate": 1.9325121399535103e-06, + "loss": 0.2718, + "step": 11432 + }, + { + "epoch": 0.72, + "grad_norm": 1.867530048596062, + "learning_rate": 1.9317079161510675e-06, + "loss": 0.2716, + "step": 11433 + }, + { + "epoch": 0.72, + "grad_norm": 1.516557314740614, + "learning_rate": 1.9309038196575157e-06, + "loss": 0.2595, + "step": 11434 + }, + { + "epoch": 0.72, + "grad_norm": 1.586599339571851, + "learning_rate": 1.9300998505062157e-06, + "loss": 0.2398, + "step": 11435 + }, + { + "epoch": 0.72, + "grad_norm": 3.2522506558220243, + "learning_rate": 1.9292960087305234e-06, + "loss": 0.2481, + "step": 11436 + }, + { + "epoch": 0.72, + "grad_norm": 2.0020347706698143, + "learning_rate": 1.9284922943637965e-06, + "loss": 0.2502, + "step": 11437 + }, + { + "epoch": 0.72, + "grad_norm": 1.79563012579662, + "learning_rate": 1.9276887074393793e-06, + "loss": 0.2682, + "step": 11438 + }, + { + "epoch": 0.72, + "grad_norm": 1.8305073671706702, + "learning_rate": 1.926885247990615e-06, + "loss": 0.2958, + "step": 11439 + }, + { + "epoch": 0.72, + "grad_norm": 2.72451949456854, + "learning_rate": 1.926081916050838e-06, + "loss": 0.2643, + "step": 11440 + }, + { + "epoch": 0.72, + "grad_norm": 0.6114740381338866, + "learning_rate": 1.9252787116533826e-06, + "loss": 0.459, + "step": 11441 + }, + { + "epoch": 0.72, + "grad_norm": 2.5079612345430435, + "learning_rate": 1.924475634831576e-06, + "loss": 0.2539, + "step": 11442 + }, + { + "epoch": 0.72, + "grad_norm": 2.127769946153632, + "learning_rate": 1.9236726856187387e-06, + "loss": 0.279, + "step": 11443 + }, + { + "epoch": 0.72, + "grad_norm": 1.8740271212326134, + "learning_rate": 1.922869864048184e-06, + "loss": 0.2692, + "step": 11444 + }, + { + "epoch": 0.72, + "grad_norm": 7.823193367665255, + "learning_rate": 1.9220671701532256e-06, + "loss": 0.2842, + "step": 11445 + }, + { + "epoch": 0.72, + "grad_norm": 2.3000630886978786, + "learning_rate": 1.921264603967168e-06, + "loss": 0.2528, + "step": 11446 + }, + { + "epoch": 0.72, + "grad_norm": 2.1792497744263115, + "learning_rate": 1.9204621655233096e-06, + "loss": 0.2544, + "step": 11447 + }, + { + "epoch": 0.72, + "grad_norm": 3.960125826508275, + "learning_rate": 1.9196598548549445e-06, + "loss": 0.2498, + "step": 11448 + }, + { + "epoch": 0.72, + "grad_norm": 2.0879985838242017, + "learning_rate": 1.9188576719953635e-06, + "loss": 0.274, + "step": 11449 + }, + { + "epoch": 0.72, + "grad_norm": 1.5514771715147577, + "learning_rate": 1.918055616977852e-06, + "loss": 0.2523, + "step": 11450 + }, + { + "epoch": 0.72, + "grad_norm": 1.915211518804343, + "learning_rate": 1.917253689835687e-06, + "loss": 0.2548, + "step": 11451 + }, + { + "epoch": 0.72, + "grad_norm": 0.6524053314383431, + "learning_rate": 1.91645189060214e-06, + "loss": 0.482, + "step": 11452 + }, + { + "epoch": 0.72, + "grad_norm": 4.990368602133595, + "learning_rate": 1.915650219310483e-06, + "loss": 0.2472, + "step": 11453 + }, + { + "epoch": 0.72, + "grad_norm": 1.7150313533698642, + "learning_rate": 1.914848675993977e-06, + "loss": 0.2724, + "step": 11454 + }, + { + "epoch": 0.72, + "grad_norm": 2.56708186980046, + "learning_rate": 1.914047260685878e-06, + "loss": 0.2543, + "step": 11455 + }, + { + "epoch": 0.72, + "grad_norm": 2.1096326092215487, + "learning_rate": 1.91324597341944e-06, + "loss": 0.2535, + "step": 11456 + }, + { + "epoch": 0.72, + "grad_norm": 2.1502170936499385, + "learning_rate": 1.9124448142279084e-06, + "loss": 0.2619, + "step": 11457 + }, + { + "epoch": 0.72, + "grad_norm": 2.4409823053965565, + "learning_rate": 1.9116437831445273e-06, + "loss": 0.2714, + "step": 11458 + }, + { + "epoch": 0.72, + "grad_norm": 3.231188912890816, + "learning_rate": 1.9108428802025308e-06, + "loss": 0.2617, + "step": 11459 + }, + { + "epoch": 0.72, + "grad_norm": 3.5516612379231187, + "learning_rate": 1.910042105435149e-06, + "loss": 0.2598, + "step": 11460 + }, + { + "epoch": 0.72, + "grad_norm": 2.5415978599533364, + "learning_rate": 1.90924145887561e-06, + "loss": 0.2527, + "step": 11461 + }, + { + "epoch": 0.72, + "grad_norm": 2.4079412109689216, + "learning_rate": 1.90844094055713e-06, + "loss": 0.2778, + "step": 11462 + }, + { + "epoch": 0.72, + "grad_norm": 1.5578665959114903, + "learning_rate": 1.9076405505129298e-06, + "loss": 0.248, + "step": 11463 + }, + { + "epoch": 0.72, + "grad_norm": 2.7101699307622105, + "learning_rate": 1.9068402887762133e-06, + "loss": 0.269, + "step": 11464 + }, + { + "epoch": 0.72, + "grad_norm": 1.8690068427042108, + "learning_rate": 1.9060401553801887e-06, + "loss": 0.267, + "step": 11465 + }, + { + "epoch": 0.72, + "grad_norm": 2.802343913344231, + "learning_rate": 1.9052401503580541e-06, + "loss": 0.2572, + "step": 11466 + }, + { + "epoch": 0.72, + "grad_norm": 1.9048858238915385, + "learning_rate": 1.904440273743003e-06, + "loss": 0.2667, + "step": 11467 + }, + { + "epoch": 0.72, + "grad_norm": 1.5851466547212005, + "learning_rate": 1.903640525568221e-06, + "loss": 0.27, + "step": 11468 + }, + { + "epoch": 0.72, + "grad_norm": 0.6106298306366249, + "learning_rate": 1.902840905866895e-06, + "loss": 0.4944, + "step": 11469 + }, + { + "epoch": 0.72, + "grad_norm": 1.7960634647164426, + "learning_rate": 1.9020414146721995e-06, + "loss": 0.2557, + "step": 11470 + }, + { + "epoch": 0.72, + "grad_norm": 4.069294754634697, + "learning_rate": 1.9012420520173097e-06, + "loss": 0.265, + "step": 11471 + }, + { + "epoch": 0.72, + "grad_norm": 2.1230797406007387, + "learning_rate": 1.9004428179353895e-06, + "loss": 0.249, + "step": 11472 + }, + { + "epoch": 0.72, + "grad_norm": 1.977106912460412, + "learning_rate": 1.8996437124596045e-06, + "loss": 0.2778, + "step": 11473 + }, + { + "epoch": 0.72, + "grad_norm": 2.148882914640681, + "learning_rate": 1.8988447356231082e-06, + "loss": 0.2437, + "step": 11474 + }, + { + "epoch": 0.72, + "grad_norm": 1.612463440365609, + "learning_rate": 1.8980458874590525e-06, + "loss": 0.2563, + "step": 11475 + }, + { + "epoch": 0.72, + "grad_norm": 1.251921214254672, + "learning_rate": 1.8972471680005805e-06, + "loss": 0.2567, + "step": 11476 + }, + { + "epoch": 0.72, + "grad_norm": 1.6270869313081857, + "learning_rate": 1.8964485772808345e-06, + "loss": 0.2501, + "step": 11477 + }, + { + "epoch": 0.72, + "grad_norm": 2.031496906945291, + "learning_rate": 1.8956501153329516e-06, + "loss": 0.268, + "step": 11478 + }, + { + "epoch": 0.72, + "grad_norm": 1.8361410734328967, + "learning_rate": 1.8948517821900592e-06, + "loss": 0.2653, + "step": 11479 + }, + { + "epoch": 0.72, + "grad_norm": 2.1441496679139687, + "learning_rate": 1.8940535778852804e-06, + "loss": 0.2717, + "step": 11480 + }, + { + "epoch": 0.72, + "grad_norm": 2.7183299102711644, + "learning_rate": 1.8932555024517363e-06, + "loss": 0.2746, + "step": 11481 + }, + { + "epoch": 0.72, + "grad_norm": 2.1089751911758734, + "learning_rate": 1.8924575559225406e-06, + "loss": 0.2465, + "step": 11482 + }, + { + "epoch": 0.72, + "grad_norm": 1.8839783331538997, + "learning_rate": 1.8916597383308e-06, + "loss": 0.2596, + "step": 11483 + }, + { + "epoch": 0.72, + "grad_norm": 3.550563791260173, + "learning_rate": 1.8908620497096159e-06, + "loss": 0.2631, + "step": 11484 + }, + { + "epoch": 0.72, + "grad_norm": 4.710879720987616, + "learning_rate": 1.8900644900920884e-06, + "loss": 0.2597, + "step": 11485 + }, + { + "epoch": 0.72, + "grad_norm": 32.768201741543656, + "learning_rate": 1.8892670595113105e-06, + "loss": 0.2624, + "step": 11486 + }, + { + "epoch": 0.72, + "grad_norm": 2.045250859829823, + "learning_rate": 1.8884697580003674e-06, + "loss": 0.2452, + "step": 11487 + }, + { + "epoch": 0.72, + "grad_norm": 2.4852251987712686, + "learning_rate": 1.8876725855923394e-06, + "loss": 0.2683, + "step": 11488 + }, + { + "epoch": 0.72, + "grad_norm": 1.6527115258742013, + "learning_rate": 1.8868755423203056e-06, + "loss": 0.2781, + "step": 11489 + }, + { + "epoch": 0.72, + "grad_norm": 2.399537261154635, + "learning_rate": 1.886078628217335e-06, + "loss": 0.2605, + "step": 11490 + }, + { + "epoch": 0.72, + "grad_norm": 1.9713359301015374, + "learning_rate": 1.8852818433164927e-06, + "loss": 0.2516, + "step": 11491 + }, + { + "epoch": 0.72, + "grad_norm": 4.296147865855016, + "learning_rate": 1.8844851876508375e-06, + "loss": 0.2785, + "step": 11492 + }, + { + "epoch": 0.72, + "grad_norm": 2.3306562118900755, + "learning_rate": 1.883688661253426e-06, + "loss": 0.2675, + "step": 11493 + }, + { + "epoch": 0.72, + "grad_norm": 2.3282691457943727, + "learning_rate": 1.882892264157309e-06, + "loss": 0.2817, + "step": 11494 + }, + { + "epoch": 0.72, + "grad_norm": 2.276776008424065, + "learning_rate": 1.8820959963955282e-06, + "loss": 0.2816, + "step": 11495 + }, + { + "epoch": 0.72, + "grad_norm": 4.638741689277499, + "learning_rate": 1.8812998580011204e-06, + "loss": 0.268, + "step": 11496 + }, + { + "epoch": 0.72, + "grad_norm": 2.652807009176715, + "learning_rate": 1.880503849007123e-06, + "loss": 0.27, + "step": 11497 + }, + { + "epoch": 0.72, + "grad_norm": 0.67812107606071, + "learning_rate": 1.8797079694465619e-06, + "loss": 0.4994, + "step": 11498 + }, + { + "epoch": 0.72, + "grad_norm": 0.6366241977833366, + "learning_rate": 1.8789122193524594e-06, + "loss": 0.4871, + "step": 11499 + }, + { + "epoch": 0.72, + "grad_norm": 1.891088314246333, + "learning_rate": 1.878116598757831e-06, + "loss": 0.2449, + "step": 11500 + }, + { + "epoch": 0.72, + "grad_norm": 2.89586591882636, + "learning_rate": 1.8773211076956898e-06, + "loss": 0.2506, + "step": 11501 + }, + { + "epoch": 0.72, + "grad_norm": 2.8680385957877492, + "learning_rate": 1.8765257461990444e-06, + "loss": 0.2644, + "step": 11502 + }, + { + "epoch": 0.72, + "grad_norm": 1.5303857899235052, + "learning_rate": 1.875730514300893e-06, + "loss": 0.2511, + "step": 11503 + }, + { + "epoch": 0.72, + "grad_norm": 2.1661314046330924, + "learning_rate": 1.8749354120342307e-06, + "loss": 0.2789, + "step": 11504 + }, + { + "epoch": 0.72, + "grad_norm": 5.91931618367787, + "learning_rate": 1.8741404394320506e-06, + "loss": 0.2493, + "step": 11505 + }, + { + "epoch": 0.72, + "grad_norm": 2.0348641881384446, + "learning_rate": 1.8733455965273355e-06, + "loss": 0.2845, + "step": 11506 + }, + { + "epoch": 0.72, + "grad_norm": 2.931735348327818, + "learning_rate": 1.8725508833530638e-06, + "loss": 0.2584, + "step": 11507 + }, + { + "epoch": 0.72, + "grad_norm": 2.981218916565921, + "learning_rate": 1.8717562999422128e-06, + "loss": 0.2699, + "step": 11508 + }, + { + "epoch": 0.72, + "grad_norm": 3.1187270794333974, + "learning_rate": 1.8709618463277474e-06, + "loss": 0.2518, + "step": 11509 + }, + { + "epoch": 0.72, + "grad_norm": 2.451284277050738, + "learning_rate": 1.8701675225426341e-06, + "loss": 0.2623, + "step": 11510 + }, + { + "epoch": 0.72, + "grad_norm": 5.391133879892232, + "learning_rate": 1.8693733286198296e-06, + "loss": 0.2484, + "step": 11511 + }, + { + "epoch": 0.72, + "grad_norm": 3.650244144192052, + "learning_rate": 1.8685792645922852e-06, + "loss": 0.2504, + "step": 11512 + }, + { + "epoch": 0.72, + "grad_norm": 1.5001549512291834, + "learning_rate": 1.86778533049295e-06, + "loss": 0.2578, + "step": 11513 + }, + { + "epoch": 0.72, + "grad_norm": 2.153463694552769, + "learning_rate": 1.8669915263547656e-06, + "loss": 0.258, + "step": 11514 + }, + { + "epoch": 0.72, + "grad_norm": 6.506997653386509, + "learning_rate": 1.8661978522106654e-06, + "loss": 0.2477, + "step": 11515 + }, + { + "epoch": 0.72, + "grad_norm": 4.008297387897438, + "learning_rate": 1.8654043080935846e-06, + "loss": 0.2602, + "step": 11516 + }, + { + "epoch": 0.72, + "grad_norm": 1.7498243961326454, + "learning_rate": 1.864610894036445e-06, + "loss": 0.2524, + "step": 11517 + }, + { + "epoch": 0.72, + "grad_norm": 1.510139821959095, + "learning_rate": 1.86381761007217e-06, + "loss": 0.2503, + "step": 11518 + }, + { + "epoch": 0.72, + "grad_norm": 2.8953997341756814, + "learning_rate": 1.863024456233673e-06, + "loss": 0.2512, + "step": 11519 + }, + { + "epoch": 0.72, + "grad_norm": 2.9859824432416895, + "learning_rate": 1.8622314325538615e-06, + "loss": 0.2664, + "step": 11520 + }, + { + "epoch": 0.72, + "grad_norm": 1.771046837551444, + "learning_rate": 1.8614385390656432e-06, + "loss": 0.2611, + "step": 11521 + }, + { + "epoch": 0.72, + "grad_norm": 2.571088068936452, + "learning_rate": 1.8606457758019125e-06, + "loss": 0.2409, + "step": 11522 + }, + { + "epoch": 0.72, + "grad_norm": 1.928979470443616, + "learning_rate": 1.8598531427955662e-06, + "loss": 0.2667, + "step": 11523 + }, + { + "epoch": 0.72, + "grad_norm": 1.703417933287143, + "learning_rate": 1.859060640079491e-06, + "loss": 0.2641, + "step": 11524 + }, + { + "epoch": 0.72, + "grad_norm": 4.716672621669686, + "learning_rate": 1.8582682676865666e-06, + "loss": 0.2802, + "step": 11525 + }, + { + "epoch": 0.72, + "grad_norm": 2.834495224753895, + "learning_rate": 1.8574760256496742e-06, + "loss": 0.2664, + "step": 11526 + }, + { + "epoch": 0.72, + "grad_norm": 3.3960608525254616, + "learning_rate": 1.8566839140016829e-06, + "loss": 0.2581, + "step": 11527 + }, + { + "epoch": 0.72, + "grad_norm": 2.3241034315555993, + "learning_rate": 1.8558919327754577e-06, + "loss": 0.2741, + "step": 11528 + }, + { + "epoch": 0.73, + "grad_norm": 2.684780560119694, + "learning_rate": 1.8551000820038628e-06, + "loss": 0.2645, + "step": 11529 + }, + { + "epoch": 0.73, + "grad_norm": 1.8758240644753343, + "learning_rate": 1.8543083617197494e-06, + "loss": 0.2545, + "step": 11530 + }, + { + "epoch": 0.73, + "grad_norm": 2.4815599160007857, + "learning_rate": 1.853516771955971e-06, + "loss": 0.2745, + "step": 11531 + }, + { + "epoch": 0.73, + "grad_norm": 1.8268556016603312, + "learning_rate": 1.852725312745371e-06, + "loss": 0.2628, + "step": 11532 + }, + { + "epoch": 0.73, + "grad_norm": 1.8432119921001295, + "learning_rate": 1.8519339841207857e-06, + "loss": 0.2567, + "step": 11533 + }, + { + "epoch": 0.73, + "grad_norm": 3.9344243019241505, + "learning_rate": 1.8511427861150526e-06, + "loss": 0.2548, + "step": 11534 + }, + { + "epoch": 0.73, + "grad_norm": 1.6120661362937332, + "learning_rate": 1.8503517187609993e-06, + "loss": 0.2566, + "step": 11535 + }, + { + "epoch": 0.73, + "grad_norm": 1.8039736735689818, + "learning_rate": 1.8495607820914451e-06, + "loss": 0.2505, + "step": 11536 + }, + { + "epoch": 0.73, + "grad_norm": 2.5540975630774976, + "learning_rate": 1.8487699761392098e-06, + "loss": 0.2689, + "step": 11537 + }, + { + "epoch": 0.73, + "grad_norm": 1.7908464601448393, + "learning_rate": 1.8479793009371073e-06, + "loss": 0.2442, + "step": 11538 + }, + { + "epoch": 0.73, + "grad_norm": 1.2954945231916777, + "learning_rate": 1.8471887565179426e-06, + "loss": 0.2534, + "step": 11539 + }, + { + "epoch": 0.73, + "grad_norm": 1.3186157692833462, + "learning_rate": 1.8463983429145143e-06, + "loss": 0.2682, + "step": 11540 + }, + { + "epoch": 0.73, + "grad_norm": 1.8801713595759486, + "learning_rate": 1.8456080601596216e-06, + "loss": 0.264, + "step": 11541 + }, + { + "epoch": 0.73, + "grad_norm": 3.462008547163594, + "learning_rate": 1.844817908286054e-06, + "loss": 0.2373, + "step": 11542 + }, + { + "epoch": 0.73, + "grad_norm": 2.6097105168691064, + "learning_rate": 1.8440278873265948e-06, + "loss": 0.2622, + "step": 11543 + }, + { + "epoch": 0.73, + "grad_norm": 1.9709023890444757, + "learning_rate": 1.843237997314023e-06, + "loss": 0.2596, + "step": 11544 + }, + { + "epoch": 0.73, + "grad_norm": 1.8745638645026685, + "learning_rate": 1.8424482382811137e-06, + "loss": 0.2769, + "step": 11545 + }, + { + "epoch": 0.73, + "grad_norm": 5.586185787760726, + "learning_rate": 1.841658610260637e-06, + "loss": 0.2613, + "step": 11546 + }, + { + "epoch": 0.73, + "grad_norm": 1.6403014749982072, + "learning_rate": 1.8408691132853551e-06, + "loss": 0.2676, + "step": 11547 + }, + { + "epoch": 0.73, + "grad_norm": 2.2678189150366417, + "learning_rate": 1.8400797473880223e-06, + "loss": 0.3071, + "step": 11548 + }, + { + "epoch": 0.73, + "grad_norm": 1.6357951822984167, + "learning_rate": 1.8392905126013955e-06, + "loss": 0.2721, + "step": 11549 + }, + { + "epoch": 0.73, + "grad_norm": 2.567934800131614, + "learning_rate": 1.8385014089582192e-06, + "loss": 0.2688, + "step": 11550 + }, + { + "epoch": 0.73, + "grad_norm": 1.3716559900717848, + "learning_rate": 1.8377124364912353e-06, + "loss": 0.253, + "step": 11551 + }, + { + "epoch": 0.73, + "grad_norm": 1.8073247367751648, + "learning_rate": 1.8369235952331777e-06, + "loss": 0.2723, + "step": 11552 + }, + { + "epoch": 0.73, + "grad_norm": 4.31472427613521, + "learning_rate": 1.8361348852167777e-06, + "loss": 0.2721, + "step": 11553 + }, + { + "epoch": 0.73, + "grad_norm": 1.8746213927342585, + "learning_rate": 1.8353463064747629e-06, + "loss": 0.2794, + "step": 11554 + }, + { + "epoch": 0.73, + "grad_norm": 1.976646350966091, + "learning_rate": 1.834557859039851e-06, + "loss": 0.2614, + "step": 11555 + }, + { + "epoch": 0.73, + "grad_norm": 1.8511431337909376, + "learning_rate": 1.8337695429447545e-06, + "loss": 0.2523, + "step": 11556 + }, + { + "epoch": 0.73, + "grad_norm": 2.471850900623397, + "learning_rate": 1.832981358222185e-06, + "loss": 0.2729, + "step": 11557 + }, + { + "epoch": 0.73, + "grad_norm": 1.5496506970491823, + "learning_rate": 1.832193304904844e-06, + "loss": 0.2541, + "step": 11558 + }, + { + "epoch": 0.73, + "grad_norm": 1.615265765841692, + "learning_rate": 1.831405383025428e-06, + "loss": 0.2706, + "step": 11559 + }, + { + "epoch": 0.73, + "grad_norm": 1.5479936667300176, + "learning_rate": 1.8306175926166325e-06, + "loss": 0.2746, + "step": 11560 + }, + { + "epoch": 0.73, + "grad_norm": 1.380276231801311, + "learning_rate": 1.8298299337111408e-06, + "loss": 0.2718, + "step": 11561 + }, + { + "epoch": 0.73, + "grad_norm": 1.4341596745639706, + "learning_rate": 1.8290424063416373e-06, + "loss": 0.2644, + "step": 11562 + }, + { + "epoch": 0.73, + "grad_norm": 3.034541876921524, + "learning_rate": 1.828255010540797e-06, + "loss": 0.297, + "step": 11563 + }, + { + "epoch": 0.73, + "grad_norm": 1.7744960371721548, + "learning_rate": 1.8274677463412882e-06, + "loss": 0.254, + "step": 11564 + }, + { + "epoch": 0.73, + "grad_norm": 1.5001021581761966, + "learning_rate": 1.8266806137757798e-06, + "loss": 0.2655, + "step": 11565 + }, + { + "epoch": 0.73, + "grad_norm": 2.615301559198779, + "learning_rate": 1.8258936128769284e-06, + "loss": 0.2687, + "step": 11566 + }, + { + "epoch": 0.73, + "grad_norm": 6.013126014624267, + "learning_rate": 1.8251067436773872e-06, + "loss": 0.2796, + "step": 11567 + }, + { + "epoch": 0.73, + "grad_norm": 1.8850472742960813, + "learning_rate": 1.8243200062098088e-06, + "loss": 0.2565, + "step": 11568 + }, + { + "epoch": 0.73, + "grad_norm": 1.5647787071432304, + "learning_rate": 1.8235334005068317e-06, + "loss": 0.2748, + "step": 11569 + }, + { + "epoch": 0.73, + "grad_norm": 2.069001867201661, + "learning_rate": 1.8227469266010977e-06, + "loss": 0.2538, + "step": 11570 + }, + { + "epoch": 0.73, + "grad_norm": 4.263115722281822, + "learning_rate": 1.8219605845252376e-06, + "loss": 0.2987, + "step": 11571 + }, + { + "epoch": 0.73, + "grad_norm": 1.9129042617801826, + "learning_rate": 1.8211743743118754e-06, + "loss": 0.284, + "step": 11572 + }, + { + "epoch": 0.73, + "grad_norm": 2.319808480813705, + "learning_rate": 1.8203882959936363e-06, + "loss": 0.2652, + "step": 11573 + }, + { + "epoch": 0.73, + "grad_norm": 2.900489197518082, + "learning_rate": 1.8196023496031335e-06, + "loss": 0.2683, + "step": 11574 + }, + { + "epoch": 0.73, + "grad_norm": 1.4806769121490957, + "learning_rate": 1.8188165351729792e-06, + "loss": 0.2531, + "step": 11575 + }, + { + "epoch": 0.73, + "grad_norm": 3.3055423915071636, + "learning_rate": 1.8180308527357776e-06, + "loss": 0.2787, + "step": 11576 + }, + { + "epoch": 0.73, + "grad_norm": 4.6946897793832365, + "learning_rate": 1.817245302324126e-06, + "loss": 0.2731, + "step": 11577 + }, + { + "epoch": 0.73, + "grad_norm": 2.6195659234728494, + "learning_rate": 1.8164598839706216e-06, + "loss": 0.2789, + "step": 11578 + }, + { + "epoch": 0.73, + "grad_norm": 1.6389624269780703, + "learning_rate": 1.815674597707851e-06, + "loss": 0.2615, + "step": 11579 + }, + { + "epoch": 0.73, + "grad_norm": 3.228590629549094, + "learning_rate": 1.814889443568396e-06, + "loss": 0.2412, + "step": 11580 + }, + { + "epoch": 0.73, + "grad_norm": 1.825630571773635, + "learning_rate": 1.8141044215848368e-06, + "loss": 0.2548, + "step": 11581 + }, + { + "epoch": 0.73, + "grad_norm": 14.064168411558224, + "learning_rate": 1.8133195317897423e-06, + "loss": 0.2622, + "step": 11582 + }, + { + "epoch": 0.73, + "grad_norm": 1.5773045508505517, + "learning_rate": 1.8125347742156823e-06, + "loss": 0.259, + "step": 11583 + }, + { + "epoch": 0.73, + "grad_norm": 1.5084784166258076, + "learning_rate": 1.8117501488952166e-06, + "loss": 0.2521, + "step": 11584 + }, + { + "epoch": 0.73, + "grad_norm": 2.833971890557965, + "learning_rate": 1.810965655860898e-06, + "loss": 0.2571, + "step": 11585 + }, + { + "epoch": 0.73, + "grad_norm": 2.91254258856792, + "learning_rate": 1.8101812951452808e-06, + "loss": 0.2701, + "step": 11586 + }, + { + "epoch": 0.73, + "grad_norm": 0.6042623625113431, + "learning_rate": 1.809397066780907e-06, + "loss": 0.456, + "step": 11587 + }, + { + "epoch": 0.73, + "grad_norm": 1.617086890004137, + "learning_rate": 1.8086129708003142e-06, + "loss": 0.2647, + "step": 11588 + }, + { + "epoch": 0.73, + "grad_norm": 2.5307919202942437, + "learning_rate": 1.8078290072360382e-06, + "loss": 0.2663, + "step": 11589 + }, + { + "epoch": 0.73, + "grad_norm": 2.5452028492532603, + "learning_rate": 1.8070451761206082e-06, + "loss": 0.2675, + "step": 11590 + }, + { + "epoch": 0.73, + "grad_norm": 1.4609771617984315, + "learning_rate": 1.806261477486545e-06, + "loss": 0.263, + "step": 11591 + }, + { + "epoch": 0.73, + "grad_norm": 1.8342722329839665, + "learning_rate": 1.8054779113663662e-06, + "loss": 0.258, + "step": 11592 + }, + { + "epoch": 0.73, + "grad_norm": 1.735699545196827, + "learning_rate": 1.8046944777925806e-06, + "loss": 0.2414, + "step": 11593 + }, + { + "epoch": 0.73, + "grad_norm": 2.20141948652523, + "learning_rate": 1.803911176797699e-06, + "loss": 0.2677, + "step": 11594 + }, + { + "epoch": 0.73, + "grad_norm": 2.323350389575852, + "learning_rate": 1.8031280084142193e-06, + "loss": 0.2714, + "step": 11595 + }, + { + "epoch": 0.73, + "grad_norm": 1.919456952414611, + "learning_rate": 1.802344972674635e-06, + "loss": 0.2565, + "step": 11596 + }, + { + "epoch": 0.73, + "grad_norm": 2.4279110676808817, + "learning_rate": 1.8015620696114377e-06, + "loss": 0.2703, + "step": 11597 + }, + { + "epoch": 0.73, + "grad_norm": 2.823465183847571, + "learning_rate": 1.8007792992571128e-06, + "loss": 0.2616, + "step": 11598 + }, + { + "epoch": 0.73, + "grad_norm": 1.46668559256316, + "learning_rate": 1.7999966616441372e-06, + "loss": 0.2607, + "step": 11599 + }, + { + "epoch": 0.73, + "grad_norm": 4.103914243666078, + "learning_rate": 1.7992141568049837e-06, + "loss": 0.288, + "step": 11600 + }, + { + "epoch": 0.73, + "grad_norm": 1.7392868339710725, + "learning_rate": 1.798431784772119e-06, + "loss": 0.2563, + "step": 11601 + }, + { + "epoch": 0.73, + "grad_norm": 1.827683844614338, + "learning_rate": 1.7976495455780074e-06, + "loss": 0.2777, + "step": 11602 + }, + { + "epoch": 0.73, + "grad_norm": 1.4846781854381141, + "learning_rate": 1.796867439255104e-06, + "loss": 0.2756, + "step": 11603 + }, + { + "epoch": 0.73, + "grad_norm": 3.290617499398284, + "learning_rate": 1.7960854658358584e-06, + "loss": 0.2957, + "step": 11604 + }, + { + "epoch": 0.73, + "grad_norm": 2.596333979997199, + "learning_rate": 1.7953036253527178e-06, + "loss": 0.2638, + "step": 11605 + }, + { + "epoch": 0.73, + "grad_norm": 2.0672337342872313, + "learning_rate": 1.7945219178381236e-06, + "loss": 0.2683, + "step": 11606 + }, + { + "epoch": 0.73, + "grad_norm": 3.5226318927342812, + "learning_rate": 1.7937403433245087e-06, + "loss": 0.28, + "step": 11607 + }, + { + "epoch": 0.73, + "grad_norm": 1.2869851636087017, + "learning_rate": 1.7929589018443016e-06, + "loss": 0.2498, + "step": 11608 + }, + { + "epoch": 0.73, + "grad_norm": 2.5392580115799617, + "learning_rate": 1.7921775934299246e-06, + "loss": 0.2635, + "step": 11609 + }, + { + "epoch": 0.73, + "grad_norm": 8.542464530669765, + "learning_rate": 1.791396418113799e-06, + "loss": 0.2789, + "step": 11610 + }, + { + "epoch": 0.73, + "grad_norm": 2.4273074585088805, + "learning_rate": 1.7906153759283346e-06, + "loss": 0.2502, + "step": 11611 + }, + { + "epoch": 0.73, + "grad_norm": 1.5442717287376975, + "learning_rate": 1.789834466905937e-06, + "loss": 0.2487, + "step": 11612 + }, + { + "epoch": 0.73, + "grad_norm": 1.4770303384864432, + "learning_rate": 1.7890536910790095e-06, + "loss": 0.2492, + "step": 11613 + }, + { + "epoch": 0.73, + "grad_norm": 1.4491829471908595, + "learning_rate": 1.7882730484799494e-06, + "loss": 0.2712, + "step": 11614 + }, + { + "epoch": 0.73, + "grad_norm": 1.5239929949679778, + "learning_rate": 1.787492539141145e-06, + "loss": 0.27, + "step": 11615 + }, + { + "epoch": 0.73, + "grad_norm": 3.1348628007020314, + "learning_rate": 1.7867121630949814e-06, + "loss": 0.2663, + "step": 11616 + }, + { + "epoch": 0.73, + "grad_norm": 1.5677734227878761, + "learning_rate": 1.7859319203738357e-06, + "loss": 0.2536, + "step": 11617 + }, + { + "epoch": 0.73, + "grad_norm": 8.219686016898887, + "learning_rate": 1.785151811010085e-06, + "loss": 0.2414, + "step": 11618 + }, + { + "epoch": 0.73, + "grad_norm": 2.4438572905554685, + "learning_rate": 1.7843718350360944e-06, + "loss": 0.2588, + "step": 11619 + }, + { + "epoch": 0.73, + "grad_norm": 1.814113199465102, + "learning_rate": 1.7835919924842298e-06, + "loss": 0.26, + "step": 11620 + }, + { + "epoch": 0.73, + "grad_norm": 2.7805856542310003, + "learning_rate": 1.7828122833868445e-06, + "loss": 0.2621, + "step": 11621 + }, + { + "epoch": 0.73, + "grad_norm": 2.1671388259741913, + "learning_rate": 1.7820327077762938e-06, + "loss": 0.2487, + "step": 11622 + }, + { + "epoch": 0.73, + "grad_norm": 2.289129757929059, + "learning_rate": 1.7812532656849218e-06, + "loss": 0.2776, + "step": 11623 + }, + { + "epoch": 0.73, + "grad_norm": 1.8509117063410268, + "learning_rate": 1.7804739571450675e-06, + "loss": 0.2709, + "step": 11624 + }, + { + "epoch": 0.73, + "grad_norm": 2.2528244140635003, + "learning_rate": 1.7796947821890681e-06, + "loss": 0.268, + "step": 11625 + }, + { + "epoch": 0.73, + "grad_norm": 3.195180930040564, + "learning_rate": 1.7789157408492513e-06, + "loss": 0.2635, + "step": 11626 + }, + { + "epoch": 0.73, + "grad_norm": 2.200946659581297, + "learning_rate": 1.778136833157943e-06, + "loss": 0.2733, + "step": 11627 + }, + { + "epoch": 0.73, + "grad_norm": 1.9669832001505725, + "learning_rate": 1.7773580591474599e-06, + "loss": 0.2679, + "step": 11628 + }, + { + "epoch": 0.73, + "grad_norm": 2.146853320790772, + "learning_rate": 1.7765794188501133e-06, + "loss": 0.2667, + "step": 11629 + }, + { + "epoch": 0.73, + "grad_norm": 4.671603421163086, + "learning_rate": 1.7758009122982144e-06, + "loss": 0.2478, + "step": 11630 + }, + { + "epoch": 0.73, + "grad_norm": 1.4877291221639142, + "learning_rate": 1.7750225395240623e-06, + "loss": 0.2583, + "step": 11631 + }, + { + "epoch": 0.73, + "grad_norm": 1.791835426391931, + "learning_rate": 1.7742443005599507e-06, + "loss": 0.2683, + "step": 11632 + }, + { + "epoch": 0.73, + "grad_norm": 2.398489748969295, + "learning_rate": 1.7734661954381754e-06, + "loss": 0.2643, + "step": 11633 + }, + { + "epoch": 0.73, + "grad_norm": 1.9239909326068136, + "learning_rate": 1.7726882241910166e-06, + "loss": 0.2454, + "step": 11634 + }, + { + "epoch": 0.73, + "grad_norm": 1.893966136002067, + "learning_rate": 1.771910386850757e-06, + "loss": 0.2524, + "step": 11635 + }, + { + "epoch": 0.73, + "grad_norm": 2.3547275542984183, + "learning_rate": 1.7711326834496694e-06, + "loss": 0.2678, + "step": 11636 + }, + { + "epoch": 0.73, + "grad_norm": 2.4015095807488276, + "learning_rate": 1.7703551140200203e-06, + "loss": 0.2713, + "step": 11637 + }, + { + "epoch": 0.73, + "grad_norm": 5.010584242813043, + "learning_rate": 1.769577678594076e-06, + "loss": 0.2388, + "step": 11638 + }, + { + "epoch": 0.73, + "grad_norm": 2.702886554338077, + "learning_rate": 1.7688003772040912e-06, + "loss": 0.2969, + "step": 11639 + }, + { + "epoch": 0.73, + "grad_norm": 2.2686726796212513, + "learning_rate": 1.7680232098823164e-06, + "loss": 0.2788, + "step": 11640 + }, + { + "epoch": 0.73, + "grad_norm": 1.6408821662912878, + "learning_rate": 1.7672461766609993e-06, + "loss": 0.2493, + "step": 11641 + }, + { + "epoch": 0.73, + "grad_norm": 5.642616086181689, + "learning_rate": 1.7664692775723825e-06, + "loss": 0.2551, + "step": 11642 + }, + { + "epoch": 0.73, + "grad_norm": 1.6519694481770981, + "learning_rate": 1.7656925126486979e-06, + "loss": 0.2634, + "step": 11643 + }, + { + "epoch": 0.73, + "grad_norm": 1.416757240748026, + "learning_rate": 1.7649158819221761e-06, + "loss": 0.2641, + "step": 11644 + }, + { + "epoch": 0.73, + "grad_norm": 2.305871361033636, + "learning_rate": 1.7641393854250395e-06, + "loss": 0.2666, + "step": 11645 + }, + { + "epoch": 0.73, + "grad_norm": 1.4664810327906352, + "learning_rate": 1.7633630231895083e-06, + "loss": 0.2602, + "step": 11646 + }, + { + "epoch": 0.73, + "grad_norm": 1.7818493241273152, + "learning_rate": 1.7625867952477943e-06, + "loss": 0.2669, + "step": 11647 + }, + { + "epoch": 0.73, + "grad_norm": 1.4525700460055968, + "learning_rate": 1.7618107016321023e-06, + "loss": 0.2501, + "step": 11648 + }, + { + "epoch": 0.73, + "grad_norm": 1.7809141559111528, + "learning_rate": 1.7610347423746365e-06, + "loss": 0.2684, + "step": 11649 + }, + { + "epoch": 0.73, + "grad_norm": 1.7893666663528591, + "learning_rate": 1.760258917507594e-06, + "loss": 0.25, + "step": 11650 + }, + { + "epoch": 0.73, + "grad_norm": 1.5303411523326833, + "learning_rate": 1.7594832270631633e-06, + "loss": 0.261, + "step": 11651 + }, + { + "epoch": 0.73, + "grad_norm": 0.6455878897895337, + "learning_rate": 1.7587076710735296e-06, + "loss": 0.4513, + "step": 11652 + }, + { + "epoch": 0.73, + "grad_norm": 2.8996348615728103, + "learning_rate": 1.75793224957087e-06, + "loss": 0.2832, + "step": 11653 + }, + { + "epoch": 0.73, + "grad_norm": 2.2014973717587885, + "learning_rate": 1.7571569625873613e-06, + "loss": 0.2597, + "step": 11654 + }, + { + "epoch": 0.73, + "grad_norm": 2.001370566681674, + "learning_rate": 1.7563818101551704e-06, + "loss": 0.2902, + "step": 11655 + }, + { + "epoch": 0.73, + "grad_norm": 2.6365757940514447, + "learning_rate": 1.7556067923064578e-06, + "loss": 0.2573, + "step": 11656 + }, + { + "epoch": 0.73, + "grad_norm": 1.9545635215836095, + "learning_rate": 1.7548319090733822e-06, + "loss": 0.2548, + "step": 11657 + }, + { + "epoch": 0.73, + "grad_norm": 1.3930601162195142, + "learning_rate": 1.7540571604880963e-06, + "loss": 0.2507, + "step": 11658 + }, + { + "epoch": 0.73, + "grad_norm": 2.077357826721203, + "learning_rate": 1.753282546582744e-06, + "loss": 0.2753, + "step": 11659 + }, + { + "epoch": 0.73, + "grad_norm": 1.5105846893542216, + "learning_rate": 1.7525080673894663e-06, + "loss": 0.262, + "step": 11660 + }, + { + "epoch": 0.73, + "grad_norm": 2.0441988034041034, + "learning_rate": 1.7517337229403946e-06, + "loss": 0.2709, + "step": 11661 + }, + { + "epoch": 0.73, + "grad_norm": 1.696674881499434, + "learning_rate": 1.7509595132676627e-06, + "loss": 0.2694, + "step": 11662 + }, + { + "epoch": 0.73, + "grad_norm": 2.05382432379998, + "learning_rate": 1.750185438403391e-06, + "loss": 0.2597, + "step": 11663 + }, + { + "epoch": 0.73, + "grad_norm": 1.4667357356081046, + "learning_rate": 1.7494114983796966e-06, + "loss": 0.2646, + "step": 11664 + }, + { + "epoch": 0.73, + "grad_norm": 2.0920409162784868, + "learning_rate": 1.7486376932286925e-06, + "loss": 0.2512, + "step": 11665 + }, + { + "epoch": 0.73, + "grad_norm": 2.345179870515097, + "learning_rate": 1.7478640229824872e-06, + "loss": 0.2571, + "step": 11666 + }, + { + "epoch": 0.73, + "grad_norm": 2.7376213684105677, + "learning_rate": 1.7470904876731804e-06, + "loss": 0.2741, + "step": 11667 + }, + { + "epoch": 0.73, + "grad_norm": 2.771613632062838, + "learning_rate": 1.7463170873328667e-06, + "loss": 0.2697, + "step": 11668 + }, + { + "epoch": 0.73, + "grad_norm": 1.55801121445325, + "learning_rate": 1.745543821993635e-06, + "loss": 0.2608, + "step": 11669 + }, + { + "epoch": 0.73, + "grad_norm": 0.5823996231364373, + "learning_rate": 1.7447706916875717e-06, + "loss": 0.4698, + "step": 11670 + }, + { + "epoch": 0.73, + "grad_norm": 3.647196429200539, + "learning_rate": 1.7439976964467532e-06, + "loss": 0.2504, + "step": 11671 + }, + { + "epoch": 0.73, + "grad_norm": 1.6926303658328, + "learning_rate": 1.743224836303255e-06, + "loss": 0.2425, + "step": 11672 + }, + { + "epoch": 0.73, + "grad_norm": 1.6919745608462098, + "learning_rate": 1.7424521112891418e-06, + "loss": 0.2689, + "step": 11673 + }, + { + "epoch": 0.73, + "grad_norm": 1.9286600778162801, + "learning_rate": 1.741679521436478e-06, + "loss": 0.277, + "step": 11674 + }, + { + "epoch": 0.73, + "grad_norm": 34.17848592255194, + "learning_rate": 1.740907066777318e-06, + "loss": 0.2728, + "step": 11675 + }, + { + "epoch": 0.73, + "grad_norm": 1.7389085754189018, + "learning_rate": 1.7401347473437124e-06, + "loss": 0.2624, + "step": 11676 + }, + { + "epoch": 0.73, + "grad_norm": 1.5115580443115493, + "learning_rate": 1.7393625631677052e-06, + "loss": 0.2638, + "step": 11677 + }, + { + "epoch": 0.73, + "grad_norm": 2.299529952387337, + "learning_rate": 1.7385905142813387e-06, + "loss": 0.2683, + "step": 11678 + }, + { + "epoch": 0.73, + "grad_norm": 4.678281920994348, + "learning_rate": 1.7378186007166426e-06, + "loss": 0.2594, + "step": 11679 + }, + { + "epoch": 0.73, + "grad_norm": 2.6147927500913823, + "learning_rate": 1.7370468225056487e-06, + "loss": 0.2489, + "step": 11680 + }, + { + "epoch": 0.73, + "grad_norm": 3.3803832366180733, + "learning_rate": 1.7362751796803757e-06, + "loss": 0.2699, + "step": 11681 + }, + { + "epoch": 0.73, + "grad_norm": 2.2956701821478753, + "learning_rate": 1.7355036722728447e-06, + "loss": 0.2695, + "step": 11682 + }, + { + "epoch": 0.73, + "grad_norm": 2.0852043888052307, + "learning_rate": 1.7347323003150647e-06, + "loss": 0.2632, + "step": 11683 + }, + { + "epoch": 0.73, + "grad_norm": 0.5722202040312103, + "learning_rate": 1.7339610638390413e-06, + "loss": 0.4754, + "step": 11684 + }, + { + "epoch": 0.73, + "grad_norm": 1.4999099087433647, + "learning_rate": 1.733189962876773e-06, + "loss": 0.2473, + "step": 11685 + }, + { + "epoch": 0.73, + "grad_norm": 1.3405561995654947, + "learning_rate": 1.7324189974602552e-06, + "loss": 0.2453, + "step": 11686 + }, + { + "epoch": 0.73, + "grad_norm": 1.7920033792237762, + "learning_rate": 1.731648167621479e-06, + "loss": 0.2598, + "step": 11687 + }, + { + "epoch": 0.74, + "grad_norm": 1.9209915036232665, + "learning_rate": 1.7308774733924254e-06, + "loss": 0.2486, + "step": 11688 + }, + { + "epoch": 0.74, + "grad_norm": 1.2248039585299622, + "learning_rate": 1.7301069148050708e-06, + "loss": 0.2375, + "step": 11689 + }, + { + "epoch": 0.74, + "grad_norm": 2.2562677500023627, + "learning_rate": 1.7293364918913896e-06, + "loss": 0.2654, + "step": 11690 + }, + { + "epoch": 0.74, + "grad_norm": 2.5772938891061674, + "learning_rate": 1.7285662046833469e-06, + "loss": 0.2592, + "step": 11691 + }, + { + "epoch": 0.74, + "grad_norm": 1.263217946340115, + "learning_rate": 1.727796053212903e-06, + "loss": 0.257, + "step": 11692 + }, + { + "epoch": 0.74, + "grad_norm": 1.4642998078322564, + "learning_rate": 1.7270260375120114e-06, + "loss": 0.2497, + "step": 11693 + }, + { + "epoch": 0.74, + "grad_norm": 1.922779491169293, + "learning_rate": 1.7262561576126225e-06, + "loss": 0.255, + "step": 11694 + }, + { + "epoch": 0.74, + "grad_norm": 1.7178176465998325, + "learning_rate": 1.7254864135466825e-06, + "loss": 0.2653, + "step": 11695 + }, + { + "epoch": 0.74, + "grad_norm": 1.791889022442444, + "learning_rate": 1.7247168053461272e-06, + "loss": 0.269, + "step": 11696 + }, + { + "epoch": 0.74, + "grad_norm": 2.0789070804220042, + "learning_rate": 1.7239473330428874e-06, + "loss": 0.2611, + "step": 11697 + }, + { + "epoch": 0.74, + "grad_norm": 2.733873805643619, + "learning_rate": 1.7231779966688938e-06, + "loss": 0.2638, + "step": 11698 + }, + { + "epoch": 0.74, + "grad_norm": 2.8731252908626708, + "learning_rate": 1.7224087962560654e-06, + "loss": 0.2865, + "step": 11699 + }, + { + "epoch": 0.74, + "grad_norm": 0.5683024424539835, + "learning_rate": 1.7216397318363175e-06, + "loss": 0.4845, + "step": 11700 + }, + { + "epoch": 0.74, + "grad_norm": 2.3266079086383, + "learning_rate": 1.7208708034415566e-06, + "loss": 0.2603, + "step": 11701 + }, + { + "epoch": 0.74, + "grad_norm": 1.4772091036325459, + "learning_rate": 1.7201020111036942e-06, + "loss": 0.249, + "step": 11702 + }, + { + "epoch": 0.74, + "grad_norm": 2.3318413488655816, + "learning_rate": 1.7193333548546248e-06, + "loss": 0.2519, + "step": 11703 + }, + { + "epoch": 0.74, + "grad_norm": 5.879203382531289, + "learning_rate": 1.7185648347262419e-06, + "loss": 0.2798, + "step": 11704 + }, + { + "epoch": 0.74, + "grad_norm": 4.337589940419514, + "learning_rate": 1.7177964507504309e-06, + "loss": 0.2612, + "step": 11705 + }, + { + "epoch": 0.74, + "grad_norm": 2.8062267155195184, + "learning_rate": 1.7170282029590762e-06, + "loss": 0.2572, + "step": 11706 + }, + { + "epoch": 0.74, + "grad_norm": 4.588670748118247, + "learning_rate": 1.7162600913840527e-06, + "loss": 0.2658, + "step": 11707 + }, + { + "epoch": 0.74, + "grad_norm": 3.0250954480159216, + "learning_rate": 1.715492116057229e-06, + "loss": 0.27, + "step": 11708 + }, + { + "epoch": 0.74, + "grad_norm": 3.308641864443634, + "learning_rate": 1.7147242770104716e-06, + "loss": 0.2613, + "step": 11709 + }, + { + "epoch": 0.74, + "grad_norm": 2.746783481461446, + "learning_rate": 1.7139565742756409e-06, + "loss": 0.2492, + "step": 11710 + }, + { + "epoch": 0.74, + "grad_norm": 3.0127040872933692, + "learning_rate": 1.713189007884588e-06, + "loss": 0.2541, + "step": 11711 + }, + { + "epoch": 0.74, + "grad_norm": 1.461726681942587, + "learning_rate": 1.7124215778691616e-06, + "loss": 0.25, + "step": 11712 + }, + { + "epoch": 0.74, + "grad_norm": 1.7282673392088093, + "learning_rate": 1.7116542842612017e-06, + "loss": 0.2601, + "step": 11713 + }, + { + "epoch": 0.74, + "grad_norm": 2.184984498330675, + "learning_rate": 1.710887127092548e-06, + "loss": 0.2656, + "step": 11714 + }, + { + "epoch": 0.74, + "grad_norm": 2.3939637027447924, + "learning_rate": 1.7101201063950295e-06, + "loss": 0.2477, + "step": 11715 + }, + { + "epoch": 0.74, + "grad_norm": 2.0316670554870813, + "learning_rate": 1.7093532222004694e-06, + "loss": 0.2789, + "step": 11716 + }, + { + "epoch": 0.74, + "grad_norm": 2.2368565527433213, + "learning_rate": 1.7085864745406894e-06, + "loss": 0.2769, + "step": 11717 + }, + { + "epoch": 0.74, + "grad_norm": 1.6876175509679776, + "learning_rate": 1.707819863447504e-06, + "loss": 0.2523, + "step": 11718 + }, + { + "epoch": 0.74, + "grad_norm": 0.5491355963980498, + "learning_rate": 1.7070533889527202e-06, + "loss": 0.4875, + "step": 11719 + }, + { + "epoch": 0.74, + "grad_norm": 2.0802363436224196, + "learning_rate": 1.7062870510881401e-06, + "loss": 0.277, + "step": 11720 + }, + { + "epoch": 0.74, + "grad_norm": 2.3939600406632313, + "learning_rate": 1.7055208498855591e-06, + "loss": 0.2505, + "step": 11721 + }, + { + "epoch": 0.74, + "grad_norm": 1.6561300208490082, + "learning_rate": 1.7047547853767714e-06, + "loss": 0.2656, + "step": 11722 + }, + { + "epoch": 0.74, + "grad_norm": 3.085061399844055, + "learning_rate": 1.7039888575935588e-06, + "loss": 0.2596, + "step": 11723 + }, + { + "epoch": 0.74, + "grad_norm": 2.35310945864418, + "learning_rate": 1.7032230665677046e-06, + "loss": 0.2597, + "step": 11724 + }, + { + "epoch": 0.74, + "grad_norm": 1.4761506138469251, + "learning_rate": 1.7024574123309795e-06, + "loss": 0.2599, + "step": 11725 + }, + { + "epoch": 0.74, + "grad_norm": 1.4213739859126764, + "learning_rate": 1.7016918949151551e-06, + "loss": 0.2459, + "step": 11726 + }, + { + "epoch": 0.74, + "grad_norm": 1.5246435825088704, + "learning_rate": 1.7009265143519927e-06, + "loss": 0.2595, + "step": 11727 + }, + { + "epoch": 0.74, + "grad_norm": 2.3481570591323324, + "learning_rate": 1.7001612706732484e-06, + "loss": 0.2469, + "step": 11728 + }, + { + "epoch": 0.74, + "grad_norm": 2.235773637995172, + "learning_rate": 1.6993961639106726e-06, + "loss": 0.2686, + "step": 11729 + }, + { + "epoch": 0.74, + "grad_norm": 5.271644764279985, + "learning_rate": 1.6986311940960148e-06, + "loss": 0.2753, + "step": 11730 + }, + { + "epoch": 0.74, + "grad_norm": 1.937126375319672, + "learning_rate": 1.6978663612610102e-06, + "loss": 0.2731, + "step": 11731 + }, + { + "epoch": 0.74, + "grad_norm": 1.8566076457703664, + "learning_rate": 1.6971016654373979e-06, + "loss": 0.25, + "step": 11732 + }, + { + "epoch": 0.74, + "grad_norm": 2.036749952673217, + "learning_rate": 1.6963371066569017e-06, + "loss": 0.2727, + "step": 11733 + }, + { + "epoch": 0.74, + "grad_norm": 2.7444668121962237, + "learning_rate": 1.6955726849512495e-06, + "loss": 0.2608, + "step": 11734 + }, + { + "epoch": 0.74, + "grad_norm": 2.2119610463202544, + "learning_rate": 1.6948084003521553e-06, + "loss": 0.2685, + "step": 11735 + }, + { + "epoch": 0.74, + "grad_norm": 1.5387795986646753, + "learning_rate": 1.6940442528913309e-06, + "loss": 0.2516, + "step": 11736 + }, + { + "epoch": 0.74, + "grad_norm": 2.508764890414543, + "learning_rate": 1.6932802426004812e-06, + "loss": 0.2688, + "step": 11737 + }, + { + "epoch": 0.74, + "grad_norm": 2.67967061402469, + "learning_rate": 1.692516369511308e-06, + "loss": 0.2596, + "step": 11738 + }, + { + "epoch": 0.74, + "grad_norm": 0.6086632782889763, + "learning_rate": 1.6917526336555068e-06, + "loss": 0.5037, + "step": 11739 + }, + { + "epoch": 0.74, + "grad_norm": 7.296951764295234, + "learning_rate": 1.6909890350647645e-06, + "loss": 0.2721, + "step": 11740 + }, + { + "epoch": 0.74, + "grad_norm": 4.745862100151195, + "learning_rate": 1.6902255737707634e-06, + "loss": 0.2608, + "step": 11741 + }, + { + "epoch": 0.74, + "grad_norm": 1.730642746716277, + "learning_rate": 1.6894622498051838e-06, + "loss": 0.2485, + "step": 11742 + }, + { + "epoch": 0.74, + "grad_norm": 3.094308521086035, + "learning_rate": 1.6886990631996951e-06, + "loss": 0.2685, + "step": 11743 + }, + { + "epoch": 0.74, + "grad_norm": 4.691004634565247, + "learning_rate": 1.6879360139859646e-06, + "loss": 0.2427, + "step": 11744 + }, + { + "epoch": 0.74, + "grad_norm": 0.60926938700073, + "learning_rate": 1.6871731021956495e-06, + "loss": 0.4766, + "step": 11745 + }, + { + "epoch": 0.74, + "grad_norm": 1.9327936579866638, + "learning_rate": 1.6864103278604071e-06, + "loss": 0.2583, + "step": 11746 + }, + { + "epoch": 0.74, + "grad_norm": 1.493622110827338, + "learning_rate": 1.685647691011887e-06, + "loss": 0.2675, + "step": 11747 + }, + { + "epoch": 0.74, + "grad_norm": 2.4140127398872653, + "learning_rate": 1.6848851916817316e-06, + "loss": 0.2686, + "step": 11748 + }, + { + "epoch": 0.74, + "grad_norm": 3.9979295985962144, + "learning_rate": 1.6841228299015755e-06, + "loss": 0.2646, + "step": 11749 + }, + { + "epoch": 0.74, + "grad_norm": 2.257469315564016, + "learning_rate": 1.683360605703055e-06, + "loss": 0.2735, + "step": 11750 + }, + { + "epoch": 0.74, + "grad_norm": 3.7783167662283246, + "learning_rate": 1.6825985191177941e-06, + "loss": 0.2637, + "step": 11751 + }, + { + "epoch": 0.74, + "grad_norm": 4.997435546723059, + "learning_rate": 1.681836570177413e-06, + "loss": 0.2706, + "step": 11752 + }, + { + "epoch": 0.74, + "grad_norm": 0.597448862879755, + "learning_rate": 1.681074758913523e-06, + "loss": 0.5039, + "step": 11753 + }, + { + "epoch": 0.74, + "grad_norm": 1.9406822339699066, + "learning_rate": 1.6803130853577392e-06, + "loss": 0.2589, + "step": 11754 + }, + { + "epoch": 0.74, + "grad_norm": 3.901023273563546, + "learning_rate": 1.679551549541662e-06, + "loss": 0.2591, + "step": 11755 + }, + { + "epoch": 0.74, + "grad_norm": 0.578241439993665, + "learning_rate": 1.678790151496889e-06, + "loss": 0.473, + "step": 11756 + }, + { + "epoch": 0.74, + "grad_norm": 13.754896853850894, + "learning_rate": 1.6780288912550102e-06, + "loss": 0.2598, + "step": 11757 + }, + { + "epoch": 0.74, + "grad_norm": 3.9530054450688783, + "learning_rate": 1.6772677688476146e-06, + "loss": 0.2555, + "step": 11758 + }, + { + "epoch": 0.74, + "grad_norm": 1.4590854111521858, + "learning_rate": 1.6765067843062816e-06, + "loss": 0.2727, + "step": 11759 + }, + { + "epoch": 0.74, + "grad_norm": 1.8566288786439034, + "learning_rate": 1.6757459376625852e-06, + "loss": 0.2562, + "step": 11760 + }, + { + "epoch": 0.74, + "grad_norm": 3.599038445668937, + "learning_rate": 1.6749852289480906e-06, + "loss": 0.2493, + "step": 11761 + }, + { + "epoch": 0.74, + "grad_norm": 2.1675105011173987, + "learning_rate": 1.6742246581943683e-06, + "loss": 0.2554, + "step": 11762 + }, + { + "epoch": 0.74, + "grad_norm": 1.8702437625465167, + "learning_rate": 1.673464225432972e-06, + "loss": 0.2645, + "step": 11763 + }, + { + "epoch": 0.74, + "grad_norm": 5.628692368302644, + "learning_rate": 1.672703930695454e-06, + "loss": 0.2567, + "step": 11764 + }, + { + "epoch": 0.74, + "grad_norm": 0.5839239474989595, + "learning_rate": 1.6719437740133577e-06, + "loss": 0.4346, + "step": 11765 + }, + { + "epoch": 0.74, + "grad_norm": 2.07993418322798, + "learning_rate": 1.6711837554182276e-06, + "loss": 0.2704, + "step": 11766 + }, + { + "epoch": 0.74, + "grad_norm": 2.081586985271671, + "learning_rate": 1.6704238749415958e-06, + "loss": 0.2715, + "step": 11767 + }, + { + "epoch": 0.74, + "grad_norm": 2.54605044868619, + "learning_rate": 1.6696641326149897e-06, + "loss": 0.2447, + "step": 11768 + }, + { + "epoch": 0.74, + "grad_norm": 7.49403293486929, + "learning_rate": 1.6689045284699347e-06, + "loss": 0.2713, + "step": 11769 + }, + { + "epoch": 0.74, + "grad_norm": 1.7950145411044751, + "learning_rate": 1.6681450625379487e-06, + "loss": 0.2581, + "step": 11770 + }, + { + "epoch": 0.74, + "grad_norm": 2.933289996831663, + "learning_rate": 1.6673857348505422e-06, + "loss": 0.2577, + "step": 11771 + }, + { + "epoch": 0.74, + "grad_norm": 2.0326330429046733, + "learning_rate": 1.6666265454392206e-06, + "loss": 0.2669, + "step": 11772 + }, + { + "epoch": 0.74, + "grad_norm": 1.9024021251154875, + "learning_rate": 1.665867494335483e-06, + "loss": 0.2706, + "step": 11773 + }, + { + "epoch": 0.74, + "grad_norm": 3.053164362489364, + "learning_rate": 1.6651085815708268e-06, + "loss": 0.2647, + "step": 11774 + }, + { + "epoch": 0.74, + "grad_norm": 4.514542001307621, + "learning_rate": 1.664349807176739e-06, + "loss": 0.2515, + "step": 11775 + }, + { + "epoch": 0.74, + "grad_norm": 3.7235526814462925, + "learning_rate": 1.6635911711847007e-06, + "loss": 0.255, + "step": 11776 + }, + { + "epoch": 0.74, + "grad_norm": 2.166238222918449, + "learning_rate": 1.6628326736261914e-06, + "loss": 0.2648, + "step": 11777 + }, + { + "epoch": 0.74, + "grad_norm": 1.6559977904009444, + "learning_rate": 1.6620743145326829e-06, + "loss": 0.2807, + "step": 11778 + }, + { + "epoch": 0.74, + "grad_norm": 2.0426253329229977, + "learning_rate": 1.6613160939356403e-06, + "loss": 0.255, + "step": 11779 + }, + { + "epoch": 0.74, + "grad_norm": 2.0517655887107757, + "learning_rate": 1.6605580118665233e-06, + "loss": 0.2503, + "step": 11780 + }, + { + "epoch": 0.74, + "grad_norm": 2.1754399538743616, + "learning_rate": 1.6598000683567838e-06, + "loss": 0.2556, + "step": 11781 + }, + { + "epoch": 0.74, + "grad_norm": 3.901653787557663, + "learning_rate": 1.659042263437874e-06, + "loss": 0.2483, + "step": 11782 + }, + { + "epoch": 0.74, + "grad_norm": 7.345049395981367, + "learning_rate": 1.6582845971412332e-06, + "loss": 0.2777, + "step": 11783 + }, + { + "epoch": 0.74, + "grad_norm": 2.06937098899403, + "learning_rate": 1.657527069498302e-06, + "loss": 0.2586, + "step": 11784 + }, + { + "epoch": 0.74, + "grad_norm": 1.5680557772512367, + "learning_rate": 1.6567696805405075e-06, + "loss": 0.2572, + "step": 11785 + }, + { + "epoch": 0.74, + "grad_norm": 1.3874081282609536, + "learning_rate": 1.656012430299279e-06, + "loss": 0.2771, + "step": 11786 + }, + { + "epoch": 0.74, + "grad_norm": 0.5989031663760926, + "learning_rate": 1.655255318806035e-06, + "loss": 0.4963, + "step": 11787 + }, + { + "epoch": 0.74, + "grad_norm": 4.529049548014022, + "learning_rate": 1.6544983460921882e-06, + "loss": 0.2653, + "step": 11788 + }, + { + "epoch": 0.74, + "grad_norm": 1.802033404813695, + "learning_rate": 1.6537415121891454e-06, + "loss": 0.2643, + "step": 11789 + }, + { + "epoch": 0.74, + "grad_norm": 1.6588807491390738, + "learning_rate": 1.6529848171283113e-06, + "loss": 0.2607, + "step": 11790 + }, + { + "epoch": 0.74, + "grad_norm": 1.8993431460926697, + "learning_rate": 1.652228260941084e-06, + "loss": 0.249, + "step": 11791 + }, + { + "epoch": 0.74, + "grad_norm": 14.431290294460897, + "learning_rate": 1.6514718436588517e-06, + "loss": 0.2689, + "step": 11792 + }, + { + "epoch": 0.74, + "grad_norm": 3.036317463653162, + "learning_rate": 1.6507155653129992e-06, + "loss": 0.2694, + "step": 11793 + }, + { + "epoch": 0.74, + "grad_norm": 1.8506399393182689, + "learning_rate": 1.6499594259349083e-06, + "loss": 0.2653, + "step": 11794 + }, + { + "epoch": 0.74, + "grad_norm": 1.4949799470416039, + "learning_rate": 1.6492034255559514e-06, + "loss": 0.2744, + "step": 11795 + }, + { + "epoch": 0.74, + "grad_norm": 5.313066645298156, + "learning_rate": 1.6484475642074955e-06, + "loss": 0.264, + "step": 11796 + }, + { + "epoch": 0.74, + "grad_norm": 2.4319119523253767, + "learning_rate": 1.6476918419209021e-06, + "loss": 0.2578, + "step": 11797 + }, + { + "epoch": 0.74, + "grad_norm": 7.21520285802145, + "learning_rate": 1.6469362587275283e-06, + "loss": 0.2688, + "step": 11798 + }, + { + "epoch": 0.74, + "grad_norm": 1.9102041030243637, + "learning_rate": 1.6461808146587266e-06, + "loss": 0.2512, + "step": 11799 + }, + { + "epoch": 0.74, + "grad_norm": 2.13520231046298, + "learning_rate": 1.6454255097458399e-06, + "loss": 0.2572, + "step": 11800 + }, + { + "epoch": 0.74, + "grad_norm": 0.6482321469633137, + "learning_rate": 1.6446703440202054e-06, + "loss": 0.4889, + "step": 11801 + }, + { + "epoch": 0.74, + "grad_norm": 2.4670376716034883, + "learning_rate": 1.6439153175131601e-06, + "loss": 0.2662, + "step": 11802 + }, + { + "epoch": 0.74, + "grad_norm": 2.500349371351017, + "learning_rate": 1.6431604302560289e-06, + "loss": 0.2621, + "step": 11803 + }, + { + "epoch": 0.74, + "grad_norm": 2.568830629198864, + "learning_rate": 1.6424056822801343e-06, + "loss": 0.27, + "step": 11804 + }, + { + "epoch": 0.74, + "grad_norm": 1.689698718055636, + "learning_rate": 1.6416510736167894e-06, + "loss": 0.2718, + "step": 11805 + }, + { + "epoch": 0.74, + "grad_norm": 4.305639380884586, + "learning_rate": 1.6408966042973073e-06, + "loss": 0.2599, + "step": 11806 + }, + { + "epoch": 0.74, + "grad_norm": 2.9843904255661617, + "learning_rate": 1.6401422743529927e-06, + "loss": 0.2596, + "step": 11807 + }, + { + "epoch": 0.74, + "grad_norm": 1.5982928678592767, + "learning_rate": 1.6393880838151432e-06, + "loss": 0.2589, + "step": 11808 + }, + { + "epoch": 0.74, + "grad_norm": 1.4961203988435854, + "learning_rate": 1.6386340327150496e-06, + "loss": 0.2616, + "step": 11809 + }, + { + "epoch": 0.74, + "grad_norm": 2.2307769427273816, + "learning_rate": 1.6378801210840018e-06, + "loss": 0.2721, + "step": 11810 + }, + { + "epoch": 0.74, + "grad_norm": 1.9924991045431384, + "learning_rate": 1.6371263489532795e-06, + "loss": 0.2602, + "step": 11811 + }, + { + "epoch": 0.74, + "grad_norm": 4.355843185123422, + "learning_rate": 1.6363727163541587e-06, + "loss": 0.2713, + "step": 11812 + }, + { + "epoch": 0.74, + "grad_norm": 1.6402361533211867, + "learning_rate": 1.6356192233179063e-06, + "loss": 0.2499, + "step": 11813 + }, + { + "epoch": 0.74, + "grad_norm": 2.775833491056217, + "learning_rate": 1.634865869875788e-06, + "loss": 0.2841, + "step": 11814 + }, + { + "epoch": 0.74, + "grad_norm": 2.4126877447225055, + "learning_rate": 1.6341126560590637e-06, + "loss": 0.2692, + "step": 11815 + }, + { + "epoch": 0.74, + "grad_norm": 1.721410768568857, + "learning_rate": 1.633359581898984e-06, + "loss": 0.2507, + "step": 11816 + }, + { + "epoch": 0.74, + "grad_norm": 0.588412549744532, + "learning_rate": 1.6326066474267931e-06, + "loss": 0.4395, + "step": 11817 + }, + { + "epoch": 0.74, + "grad_norm": 2.3592591381615065, + "learning_rate": 1.6318538526737354e-06, + "loss": 0.2556, + "step": 11818 + }, + { + "epoch": 0.74, + "grad_norm": 2.297946060006865, + "learning_rate": 1.631101197671044e-06, + "loss": 0.2519, + "step": 11819 + }, + { + "epoch": 0.74, + "grad_norm": 1.6660246422459932, + "learning_rate": 1.6303486824499459e-06, + "loss": 0.2548, + "step": 11820 + }, + { + "epoch": 0.74, + "grad_norm": 1.7470054571205338, + "learning_rate": 1.6295963070416658e-06, + "loss": 0.2657, + "step": 11821 + }, + { + "epoch": 0.74, + "grad_norm": 0.6014955834137027, + "learning_rate": 1.628844071477424e-06, + "loss": 0.4609, + "step": 11822 + }, + { + "epoch": 0.74, + "grad_norm": 2.769016713587465, + "learning_rate": 1.628091975788429e-06, + "loss": 0.2657, + "step": 11823 + }, + { + "epoch": 0.74, + "grad_norm": 1.8453180889738028, + "learning_rate": 1.627340020005887e-06, + "loss": 0.291, + "step": 11824 + }, + { + "epoch": 0.74, + "grad_norm": 1.4959257455667023, + "learning_rate": 1.6265882041609964e-06, + "loss": 0.2688, + "step": 11825 + }, + { + "epoch": 0.74, + "grad_norm": 1.7455441204424988, + "learning_rate": 1.6258365282849547e-06, + "loss": 0.2446, + "step": 11826 + }, + { + "epoch": 0.74, + "grad_norm": 2.5213502263259033, + "learning_rate": 1.6250849924089485e-06, + "loss": 0.2907, + "step": 11827 + }, + { + "epoch": 0.74, + "grad_norm": 1.9003721308085535, + "learning_rate": 1.6243335965641593e-06, + "loss": 0.2483, + "step": 11828 + }, + { + "epoch": 0.74, + "grad_norm": 1.6456645060294284, + "learning_rate": 1.6235823407817647e-06, + "loss": 0.246, + "step": 11829 + }, + { + "epoch": 0.74, + "grad_norm": 2.3052556292761452, + "learning_rate": 1.622831225092938e-06, + "loss": 0.2571, + "step": 11830 + }, + { + "epoch": 0.74, + "grad_norm": 1.7321240346034084, + "learning_rate": 1.6220802495288418e-06, + "loss": 0.2507, + "step": 11831 + }, + { + "epoch": 0.74, + "grad_norm": 2.1755243859919093, + "learning_rate": 1.6213294141206366e-06, + "loss": 0.2687, + "step": 11832 + }, + { + "epoch": 0.74, + "grad_norm": 4.039563658824441, + "learning_rate": 1.6205787188994726e-06, + "loss": 0.2647, + "step": 11833 + }, + { + "epoch": 0.74, + "grad_norm": 5.4653985842077635, + "learning_rate": 1.6198281638965023e-06, + "loss": 0.2693, + "step": 11834 + }, + { + "epoch": 0.74, + "grad_norm": 2.6229795598659216, + "learning_rate": 1.619077749142864e-06, + "loss": 0.2536, + "step": 11835 + }, + { + "epoch": 0.74, + "grad_norm": 14.362800910237805, + "learning_rate": 1.618327474669696e-06, + "loss": 0.2458, + "step": 11836 + }, + { + "epoch": 0.74, + "grad_norm": 2.165323961044263, + "learning_rate": 1.6175773405081263e-06, + "loss": 0.2558, + "step": 11837 + }, + { + "epoch": 0.74, + "grad_norm": 3.0797768971008512, + "learning_rate": 1.6168273466892826e-06, + "loss": 0.2724, + "step": 11838 + }, + { + "epoch": 0.74, + "grad_norm": 1.51796264846945, + "learning_rate": 1.6160774932442808e-06, + "loss": 0.2655, + "step": 11839 + }, + { + "epoch": 0.74, + "grad_norm": 2.81021565265868, + "learning_rate": 1.6153277802042349e-06, + "loss": 0.2753, + "step": 11840 + }, + { + "epoch": 0.74, + "grad_norm": 5.32776950753152, + "learning_rate": 1.6145782076002493e-06, + "loss": 0.2506, + "step": 11841 + }, + { + "epoch": 0.74, + "grad_norm": 3.8041643534287357, + "learning_rate": 1.6138287754634285e-06, + "loss": 0.2615, + "step": 11842 + }, + { + "epoch": 0.74, + "grad_norm": 1.9553419239251053, + "learning_rate": 1.6130794838248653e-06, + "loss": 0.2461, + "step": 11843 + }, + { + "epoch": 0.74, + "grad_norm": 1.8993415032975691, + "learning_rate": 1.6123303327156515e-06, + "loss": 0.241, + "step": 11844 + }, + { + "epoch": 0.74, + "grad_norm": 1.476463975922597, + "learning_rate": 1.6115813221668676e-06, + "loss": 0.252, + "step": 11845 + }, + { + "epoch": 0.74, + "grad_norm": 1.791918695056272, + "learning_rate": 1.6108324522095948e-06, + "loss": 0.269, + "step": 11846 + }, + { + "epoch": 0.75, + "grad_norm": 9.34204523919045, + "learning_rate": 1.6100837228749034e-06, + "loss": 0.2535, + "step": 11847 + }, + { + "epoch": 0.75, + "grad_norm": 1.9681715046276151, + "learning_rate": 1.6093351341938595e-06, + "loss": 0.2697, + "step": 11848 + }, + { + "epoch": 0.75, + "grad_norm": 2.92936775419886, + "learning_rate": 1.6085866861975219e-06, + "loss": 0.2582, + "step": 11849 + }, + { + "epoch": 0.75, + "grad_norm": 2.7529375084635115, + "learning_rate": 1.6078383789169466e-06, + "loss": 0.2626, + "step": 11850 + }, + { + "epoch": 0.75, + "grad_norm": 3.7072302835746838, + "learning_rate": 1.6070902123831832e-06, + "loss": 0.2662, + "step": 11851 + }, + { + "epoch": 0.75, + "grad_norm": 1.57082915701216, + "learning_rate": 1.6063421866272738e-06, + "loss": 0.2618, + "step": 11852 + }, + { + "epoch": 0.75, + "grad_norm": 1.8261968009035916, + "learning_rate": 1.6055943016802533e-06, + "loss": 0.2644, + "step": 11853 + }, + { + "epoch": 0.75, + "grad_norm": 2.4569429665757117, + "learning_rate": 1.604846557573156e-06, + "loss": 0.2488, + "step": 11854 + }, + { + "epoch": 0.75, + "grad_norm": 1.946417091387337, + "learning_rate": 1.6040989543370057e-06, + "loss": 0.2365, + "step": 11855 + }, + { + "epoch": 0.75, + "grad_norm": 2.243356464107158, + "learning_rate": 1.603351492002821e-06, + "loss": 0.249, + "step": 11856 + }, + { + "epoch": 0.75, + "grad_norm": 2.093287370996489, + "learning_rate": 1.602604170601615e-06, + "loss": 0.2695, + "step": 11857 + }, + { + "epoch": 0.75, + "grad_norm": 1.5898220441347646, + "learning_rate": 1.601856990164396e-06, + "loss": 0.2597, + "step": 11858 + }, + { + "epoch": 0.75, + "grad_norm": 2.709914051757555, + "learning_rate": 1.6011099507221677e-06, + "loss": 0.2553, + "step": 11859 + }, + { + "epoch": 0.75, + "grad_norm": 2.9656589445698858, + "learning_rate": 1.600363052305925e-06, + "loss": 0.2483, + "step": 11860 + }, + { + "epoch": 0.75, + "grad_norm": 2.1406403928513473, + "learning_rate": 1.599616294946656e-06, + "loss": 0.2502, + "step": 11861 + }, + { + "epoch": 0.75, + "grad_norm": 1.5859280108116744, + "learning_rate": 1.5988696786753489e-06, + "loss": 0.2568, + "step": 11862 + }, + { + "epoch": 0.75, + "grad_norm": 2.451305252896568, + "learning_rate": 1.5981232035229789e-06, + "loss": 0.2579, + "step": 11863 + }, + { + "epoch": 0.75, + "grad_norm": 1.8968391425248419, + "learning_rate": 1.59737686952052e-06, + "loss": 0.2645, + "step": 11864 + }, + { + "epoch": 0.75, + "grad_norm": 1.7087956113154734, + "learning_rate": 1.596630676698937e-06, + "loss": 0.2586, + "step": 11865 + }, + { + "epoch": 0.75, + "grad_norm": 2.6320847648591372, + "learning_rate": 1.5958846250891924e-06, + "loss": 0.2636, + "step": 11866 + }, + { + "epoch": 0.75, + "grad_norm": 2.3935022032319897, + "learning_rate": 1.5951387147222426e-06, + "loss": 0.2676, + "step": 11867 + }, + { + "epoch": 0.75, + "grad_norm": 1.972545154743655, + "learning_rate": 1.5943929456290353e-06, + "loss": 0.2584, + "step": 11868 + }, + { + "epoch": 0.75, + "grad_norm": 1.3620123371947725, + "learning_rate": 1.5936473178405115e-06, + "loss": 0.2434, + "step": 11869 + }, + { + "epoch": 0.75, + "grad_norm": 1.7552358800792407, + "learning_rate": 1.5929018313876133e-06, + "loss": 0.2584, + "step": 11870 + }, + { + "epoch": 0.75, + "grad_norm": 1.484971098012521, + "learning_rate": 1.5921564863012696e-06, + "loss": 0.2621, + "step": 11871 + }, + { + "epoch": 0.75, + "grad_norm": 1.3294578172874192, + "learning_rate": 1.5914112826124046e-06, + "loss": 0.2528, + "step": 11872 + }, + { + "epoch": 0.75, + "grad_norm": 1.7504983993422945, + "learning_rate": 1.5906662203519413e-06, + "loss": 0.2433, + "step": 11873 + }, + { + "epoch": 0.75, + "grad_norm": 2.3484787834414336, + "learning_rate": 1.5899212995507907e-06, + "loss": 0.272, + "step": 11874 + }, + { + "epoch": 0.75, + "grad_norm": 3.7115203724763024, + "learning_rate": 1.5891765202398636e-06, + "loss": 0.2535, + "step": 11875 + }, + { + "epoch": 0.75, + "grad_norm": 2.4853380230261126, + "learning_rate": 1.5884318824500616e-06, + "loss": 0.2553, + "step": 11876 + }, + { + "epoch": 0.75, + "grad_norm": 1.5846041938247355, + "learning_rate": 1.5876873862122781e-06, + "loss": 0.2719, + "step": 11877 + }, + { + "epoch": 0.75, + "grad_norm": 1.4196277911506971, + "learning_rate": 1.586943031557408e-06, + "loss": 0.2557, + "step": 11878 + }, + { + "epoch": 0.75, + "grad_norm": 1.9084763581467403, + "learning_rate": 1.586198818516334e-06, + "loss": 0.2528, + "step": 11879 + }, + { + "epoch": 0.75, + "grad_norm": 1.710566703156637, + "learning_rate": 1.5854547471199328e-06, + "loss": 0.2591, + "step": 11880 + }, + { + "epoch": 0.75, + "grad_norm": 3.1133888050406626, + "learning_rate": 1.5847108173990804e-06, + "loss": 0.2901, + "step": 11881 + }, + { + "epoch": 0.75, + "grad_norm": 3.8562106526254234, + "learning_rate": 1.583967029384641e-06, + "loss": 0.2629, + "step": 11882 + }, + { + "epoch": 0.75, + "grad_norm": 0.6122073234864452, + "learning_rate": 1.5832233831074783e-06, + "loss": 0.4742, + "step": 11883 + }, + { + "epoch": 0.75, + "grad_norm": 0.6017867089945995, + "learning_rate": 1.582479878598447e-06, + "loss": 0.4793, + "step": 11884 + }, + { + "epoch": 0.75, + "grad_norm": 1.6468932831399952, + "learning_rate": 1.581736515888394e-06, + "loss": 0.2553, + "step": 11885 + }, + { + "epoch": 0.75, + "grad_norm": 2.9967885114687465, + "learning_rate": 1.5809932950081657e-06, + "loss": 0.2653, + "step": 11886 + }, + { + "epoch": 0.75, + "grad_norm": 2.266315630618689, + "learning_rate": 1.5802502159885968e-06, + "loss": 0.2681, + "step": 11887 + }, + { + "epoch": 0.75, + "grad_norm": 4.986349448661057, + "learning_rate": 1.5795072788605226e-06, + "loss": 0.2631, + "step": 11888 + }, + { + "epoch": 0.75, + "grad_norm": 3.7236149444049063, + "learning_rate": 1.5787644836547667e-06, + "loss": 0.2822, + "step": 11889 + }, + { + "epoch": 0.75, + "grad_norm": 2.077695773204747, + "learning_rate": 1.5780218304021477e-06, + "loss": 0.263, + "step": 11890 + }, + { + "epoch": 0.75, + "grad_norm": 2.5311655300179265, + "learning_rate": 1.5772793191334828e-06, + "loss": 0.255, + "step": 11891 + }, + { + "epoch": 0.75, + "grad_norm": 2.4024394966952487, + "learning_rate": 1.576536949879578e-06, + "loss": 0.2611, + "step": 11892 + }, + { + "epoch": 0.75, + "grad_norm": 1.950961361803954, + "learning_rate": 1.5757947226712344e-06, + "loss": 0.2727, + "step": 11893 + }, + { + "epoch": 0.75, + "grad_norm": 18.33122662785955, + "learning_rate": 1.5750526375392517e-06, + "loss": 0.281, + "step": 11894 + }, + { + "epoch": 0.75, + "grad_norm": 1.494641768361291, + "learning_rate": 1.5743106945144166e-06, + "loss": 0.2587, + "step": 11895 + }, + { + "epoch": 0.75, + "grad_norm": 2.407395698512057, + "learning_rate": 1.5735688936275174e-06, + "loss": 0.2778, + "step": 11896 + }, + { + "epoch": 0.75, + "grad_norm": 1.5668321288737272, + "learning_rate": 1.5728272349093287e-06, + "loss": 0.2511, + "step": 11897 + }, + { + "epoch": 0.75, + "grad_norm": 1.810810671072288, + "learning_rate": 1.5720857183906275e-06, + "loss": 0.2566, + "step": 11898 + }, + { + "epoch": 0.75, + "grad_norm": 2.810282556503871, + "learning_rate": 1.571344344102178e-06, + "loss": 0.2534, + "step": 11899 + }, + { + "epoch": 0.75, + "grad_norm": 2.1878011512487316, + "learning_rate": 1.5706031120747417e-06, + "loss": 0.2653, + "step": 11900 + }, + { + "epoch": 0.75, + "grad_norm": 4.0712484006038085, + "learning_rate": 1.569862022339072e-06, + "loss": 0.2483, + "step": 11901 + }, + { + "epoch": 0.75, + "grad_norm": 1.6527144597464218, + "learning_rate": 1.56912107492592e-06, + "loss": 0.273, + "step": 11902 + }, + { + "epoch": 0.75, + "grad_norm": 19.838911551780072, + "learning_rate": 1.56838026986603e-06, + "loss": 0.2706, + "step": 11903 + }, + { + "epoch": 0.75, + "grad_norm": 1.522449482689555, + "learning_rate": 1.567639607190138e-06, + "loss": 0.238, + "step": 11904 + }, + { + "epoch": 0.75, + "grad_norm": 3.014349755394319, + "learning_rate": 1.566899086928973e-06, + "loss": 0.247, + "step": 11905 + }, + { + "epoch": 0.75, + "grad_norm": 1.2942124436971643, + "learning_rate": 1.5661587091132646e-06, + "loss": 0.2423, + "step": 11906 + }, + { + "epoch": 0.75, + "grad_norm": 1.6652853533826246, + "learning_rate": 1.565418473773731e-06, + "loss": 0.2477, + "step": 11907 + }, + { + "epoch": 0.75, + "grad_norm": 2.688960854504309, + "learning_rate": 1.564678380941085e-06, + "loss": 0.2634, + "step": 11908 + }, + { + "epoch": 0.75, + "grad_norm": 1.8093549149604553, + "learning_rate": 1.5639384306460331e-06, + "loss": 0.254, + "step": 11909 + }, + { + "epoch": 0.75, + "grad_norm": 1.751705599615996, + "learning_rate": 1.5631986229192791e-06, + "loss": 0.253, + "step": 11910 + }, + { + "epoch": 0.75, + "grad_norm": 2.4181956495410795, + "learning_rate": 1.5624589577915205e-06, + "loss": 0.2694, + "step": 11911 + }, + { + "epoch": 0.75, + "grad_norm": 2.585333299902547, + "learning_rate": 1.5617194352934451e-06, + "loss": 0.2748, + "step": 11912 + }, + { + "epoch": 0.75, + "grad_norm": 1.3754365359386904, + "learning_rate": 1.5609800554557358e-06, + "loss": 0.2587, + "step": 11913 + }, + { + "epoch": 0.75, + "grad_norm": 2.7493901033412564, + "learning_rate": 1.5602408183090744e-06, + "loss": 0.2659, + "step": 11914 + }, + { + "epoch": 0.75, + "grad_norm": 1.479838596165756, + "learning_rate": 1.559501723884131e-06, + "loss": 0.2382, + "step": 11915 + }, + { + "epoch": 0.75, + "grad_norm": 2.45159273101338, + "learning_rate": 1.5587627722115717e-06, + "loss": 0.244, + "step": 11916 + }, + { + "epoch": 0.75, + "grad_norm": 1.7023586831820152, + "learning_rate": 1.5580239633220563e-06, + "loss": 0.2616, + "step": 11917 + }, + { + "epoch": 0.75, + "grad_norm": 1.8896552588110056, + "learning_rate": 1.55728529724624e-06, + "loss": 0.2639, + "step": 11918 + }, + { + "epoch": 0.75, + "grad_norm": 1.9517282312415587, + "learning_rate": 1.5565467740147732e-06, + "loss": 0.2377, + "step": 11919 + }, + { + "epoch": 0.75, + "grad_norm": 5.248304202353867, + "learning_rate": 1.555808393658297e-06, + "loss": 0.2753, + "step": 11920 + }, + { + "epoch": 0.75, + "grad_norm": 1.7899906901154115, + "learning_rate": 1.5550701562074465e-06, + "loss": 0.2597, + "step": 11921 + }, + { + "epoch": 0.75, + "grad_norm": 1.769585045134498, + "learning_rate": 1.5543320616928558e-06, + "loss": 0.2622, + "step": 11922 + }, + { + "epoch": 0.75, + "grad_norm": 2.4658665214876905, + "learning_rate": 1.5535941101451478e-06, + "loss": 0.2566, + "step": 11923 + }, + { + "epoch": 0.75, + "grad_norm": 2.0309553236705917, + "learning_rate": 1.5528563015949421e-06, + "loss": 0.2774, + "step": 11924 + }, + { + "epoch": 0.75, + "grad_norm": 1.5888897895927225, + "learning_rate": 1.5521186360728497e-06, + "loss": 0.262, + "step": 11925 + }, + { + "epoch": 0.75, + "grad_norm": 2.3547565026352504, + "learning_rate": 1.5513811136094786e-06, + "loss": 0.2545, + "step": 11926 + }, + { + "epoch": 0.75, + "grad_norm": 4.266784297481978, + "learning_rate": 1.5506437342354324e-06, + "loss": 0.2558, + "step": 11927 + }, + { + "epoch": 0.75, + "grad_norm": 2.7020897530161574, + "learning_rate": 1.549906497981305e-06, + "loss": 0.2874, + "step": 11928 + }, + { + "epoch": 0.75, + "grad_norm": 10.027743530206067, + "learning_rate": 1.549169404877683e-06, + "loss": 0.2871, + "step": 11929 + }, + { + "epoch": 0.75, + "grad_norm": 1.3002851894086886, + "learning_rate": 1.5484324549551532e-06, + "loss": 0.2446, + "step": 11930 + }, + { + "epoch": 0.75, + "grad_norm": 1.940854903658385, + "learning_rate": 1.5476956482442918e-06, + "loss": 0.2544, + "step": 11931 + }, + { + "epoch": 0.75, + "grad_norm": 1.6021044647928193, + "learning_rate": 1.5469589847756677e-06, + "loss": 0.2539, + "step": 11932 + }, + { + "epoch": 0.75, + "grad_norm": 1.2905678702315992, + "learning_rate": 1.5462224645798507e-06, + "loss": 0.2675, + "step": 11933 + }, + { + "epoch": 0.75, + "grad_norm": 2.2728942236191934, + "learning_rate": 1.5454860876873968e-06, + "loss": 0.2612, + "step": 11934 + }, + { + "epoch": 0.75, + "grad_norm": 1.296256772914394, + "learning_rate": 1.5447498541288625e-06, + "loss": 0.246, + "step": 11935 + }, + { + "epoch": 0.75, + "grad_norm": 2.1979331063387524, + "learning_rate": 1.544013763934794e-06, + "loss": 0.2654, + "step": 11936 + }, + { + "epoch": 0.75, + "grad_norm": 2.21921664885315, + "learning_rate": 1.543277817135731e-06, + "loss": 0.2635, + "step": 11937 + }, + { + "epoch": 0.75, + "grad_norm": 5.553450963655312, + "learning_rate": 1.5425420137622133e-06, + "loss": 0.2692, + "step": 11938 + }, + { + "epoch": 0.75, + "grad_norm": 2.189599600989339, + "learning_rate": 1.5418063538447687e-06, + "loss": 0.248, + "step": 11939 + }, + { + "epoch": 0.75, + "grad_norm": 2.383964627773125, + "learning_rate": 1.5410708374139189e-06, + "loss": 0.2725, + "step": 11940 + }, + { + "epoch": 0.75, + "grad_norm": 1.4591173037880572, + "learning_rate": 1.5403354645001857e-06, + "loss": 0.2633, + "step": 11941 + }, + { + "epoch": 0.75, + "grad_norm": 1.7464732086978818, + "learning_rate": 1.5396002351340772e-06, + "loss": 0.2695, + "step": 11942 + }, + { + "epoch": 0.75, + "grad_norm": 2.1924013514986886, + "learning_rate": 1.5388651493461026e-06, + "loss": 0.2667, + "step": 11943 + }, + { + "epoch": 0.75, + "grad_norm": 1.8687920252622625, + "learning_rate": 1.5381302071667614e-06, + "loss": 0.2557, + "step": 11944 + }, + { + "epoch": 0.75, + "grad_norm": 2.254162357203387, + "learning_rate": 1.5373954086265447e-06, + "loss": 0.2754, + "step": 11945 + }, + { + "epoch": 0.75, + "grad_norm": 2.1932413521027496, + "learning_rate": 1.5366607537559448e-06, + "loss": 0.2463, + "step": 11946 + }, + { + "epoch": 0.75, + "grad_norm": 4.168190153008269, + "learning_rate": 1.5359262425854398e-06, + "loss": 0.2597, + "step": 11947 + }, + { + "epoch": 0.75, + "grad_norm": 10.760150342410197, + "learning_rate": 1.5351918751455097e-06, + "loss": 0.2689, + "step": 11948 + }, + { + "epoch": 0.75, + "grad_norm": 3.5582380661426294, + "learning_rate": 1.5344576514666231e-06, + "loss": 0.2531, + "step": 11949 + }, + { + "epoch": 0.75, + "grad_norm": 2.2917949160450024, + "learning_rate": 1.533723571579242e-06, + "loss": 0.2466, + "step": 11950 + }, + { + "epoch": 0.75, + "grad_norm": 3.0579349956275412, + "learning_rate": 1.532989635513829e-06, + "loss": 0.2569, + "step": 11951 + }, + { + "epoch": 0.75, + "grad_norm": 3.1387940862761026, + "learning_rate": 1.5322558433008332e-06, + "loss": 0.2787, + "step": 11952 + }, + { + "epoch": 0.75, + "grad_norm": 2.0218715524721236, + "learning_rate": 1.531522194970701e-06, + "loss": 0.2627, + "step": 11953 + }, + { + "epoch": 0.75, + "grad_norm": 1.740251383808904, + "learning_rate": 1.5307886905538732e-06, + "loss": 0.2422, + "step": 11954 + }, + { + "epoch": 0.75, + "grad_norm": 0.594114810697703, + "learning_rate": 1.530055330080787e-06, + "loss": 0.4549, + "step": 11955 + }, + { + "epoch": 0.75, + "grad_norm": 2.8247060917469438, + "learning_rate": 1.5293221135818676e-06, + "loss": 0.2722, + "step": 11956 + }, + { + "epoch": 0.75, + "grad_norm": 1.896839741503334, + "learning_rate": 1.5285890410875388e-06, + "loss": 0.2472, + "step": 11957 + }, + { + "epoch": 0.75, + "grad_norm": 1.6456790709071374, + "learning_rate": 1.527856112628215e-06, + "loss": 0.2497, + "step": 11958 + }, + { + "epoch": 0.75, + "grad_norm": 2.44152185672861, + "learning_rate": 1.5271233282343095e-06, + "loss": 0.2591, + "step": 11959 + }, + { + "epoch": 0.75, + "grad_norm": 2.0892971632742032, + "learning_rate": 1.5263906879362256e-06, + "loss": 0.2578, + "step": 11960 + }, + { + "epoch": 0.75, + "grad_norm": 4.194558456264757, + "learning_rate": 1.525658191764361e-06, + "loss": 0.2721, + "step": 11961 + }, + { + "epoch": 0.75, + "grad_norm": 1.8897807708386882, + "learning_rate": 1.5249258397491086e-06, + "loss": 0.2713, + "step": 11962 + }, + { + "epoch": 0.75, + "grad_norm": 2.0664841821110844, + "learning_rate": 1.5241936319208572e-06, + "loss": 0.2652, + "step": 11963 + }, + { + "epoch": 0.75, + "grad_norm": 1.6448856221330757, + "learning_rate": 1.5234615683099857e-06, + "loss": 0.2601, + "step": 11964 + }, + { + "epoch": 0.75, + "grad_norm": 1.678796882725787, + "learning_rate": 1.5227296489468685e-06, + "loss": 0.2756, + "step": 11965 + }, + { + "epoch": 0.75, + "grad_norm": 1.664808175604224, + "learning_rate": 1.5219978738618734e-06, + "loss": 0.2668, + "step": 11966 + }, + { + "epoch": 0.75, + "grad_norm": 4.100819538826289, + "learning_rate": 1.5212662430853659e-06, + "loss": 0.2499, + "step": 11967 + }, + { + "epoch": 0.75, + "grad_norm": 5.376328737191698, + "learning_rate": 1.5205347566477002e-06, + "loss": 0.2637, + "step": 11968 + }, + { + "epoch": 0.75, + "grad_norm": 0.6641584539080455, + "learning_rate": 1.5198034145792267e-06, + "loss": 0.4767, + "step": 11969 + }, + { + "epoch": 0.75, + "grad_norm": 1.7424025291555487, + "learning_rate": 1.5190722169102906e-06, + "loss": 0.2749, + "step": 11970 + }, + { + "epoch": 0.75, + "grad_norm": 1.6825061395834442, + "learning_rate": 1.518341163671233e-06, + "loss": 0.2453, + "step": 11971 + }, + { + "epoch": 0.75, + "grad_norm": 1.900240666346348, + "learning_rate": 1.5176102548923844e-06, + "loss": 0.2641, + "step": 11972 + }, + { + "epoch": 0.75, + "grad_norm": 1.948203795265517, + "learning_rate": 1.5168794906040719e-06, + "loss": 0.2842, + "step": 11973 + }, + { + "epoch": 0.75, + "grad_norm": 1.9603099547812366, + "learning_rate": 1.516148870836614e-06, + "loss": 0.2641, + "step": 11974 + }, + { + "epoch": 0.75, + "grad_norm": 2.1220814185419328, + "learning_rate": 1.5154183956203295e-06, + "loss": 0.2746, + "step": 11975 + }, + { + "epoch": 0.75, + "grad_norm": 1.771850191668032, + "learning_rate": 1.514688064985525e-06, + "loss": 0.2763, + "step": 11976 + }, + { + "epoch": 0.75, + "grad_norm": 4.574735579899599, + "learning_rate": 1.513957878962502e-06, + "loss": 0.249, + "step": 11977 + }, + { + "epoch": 0.75, + "grad_norm": 2.5782235861767826, + "learning_rate": 1.5132278375815578e-06, + "loss": 0.2788, + "step": 11978 + }, + { + "epoch": 0.75, + "grad_norm": 1.5446073858393137, + "learning_rate": 1.512497940872986e-06, + "loss": 0.2519, + "step": 11979 + }, + { + "epoch": 0.75, + "grad_norm": 2.8054213731930635, + "learning_rate": 1.5117681888670694e-06, + "loss": 0.2471, + "step": 11980 + }, + { + "epoch": 0.75, + "grad_norm": 1.4743759970123511, + "learning_rate": 1.5110385815940842e-06, + "loss": 0.2548, + "step": 11981 + }, + { + "epoch": 0.75, + "grad_norm": 1.678778934250006, + "learning_rate": 1.5103091190843072e-06, + "loss": 0.2675, + "step": 11982 + }, + { + "epoch": 0.75, + "grad_norm": 2.3988365123464415, + "learning_rate": 1.509579801368003e-06, + "loss": 0.2404, + "step": 11983 + }, + { + "epoch": 0.75, + "grad_norm": 3.505055998955773, + "learning_rate": 1.5088506284754312e-06, + "loss": 0.2514, + "step": 11984 + }, + { + "epoch": 0.75, + "grad_norm": 5.609320783732682, + "learning_rate": 1.5081216004368492e-06, + "loss": 0.2704, + "step": 11985 + }, + { + "epoch": 0.75, + "grad_norm": 1.7426797782797734, + "learning_rate": 1.5073927172825032e-06, + "loss": 0.2583, + "step": 11986 + }, + { + "epoch": 0.75, + "grad_norm": 2.4989185183015086, + "learning_rate": 1.506663979042638e-06, + "loss": 0.2446, + "step": 11987 + }, + { + "epoch": 0.75, + "grad_norm": 3.5553763225014214, + "learning_rate": 1.505935385747489e-06, + "loss": 0.2661, + "step": 11988 + }, + { + "epoch": 0.75, + "grad_norm": 1.9648034406003163, + "learning_rate": 1.505206937427286e-06, + "loss": 0.2724, + "step": 11989 + }, + { + "epoch": 0.75, + "grad_norm": 1.6727122932410934, + "learning_rate": 1.5044786341122557e-06, + "loss": 0.2687, + "step": 11990 + }, + { + "epoch": 0.75, + "grad_norm": 2.0541108125247995, + "learning_rate": 1.503750475832616e-06, + "loss": 0.2917, + "step": 11991 + }, + { + "epoch": 0.75, + "grad_norm": 1.864789485460652, + "learning_rate": 1.5030224626185774e-06, + "loss": 0.2374, + "step": 11992 + }, + { + "epoch": 0.75, + "grad_norm": 3.556706876194909, + "learning_rate": 1.5022945945003499e-06, + "loss": 0.2578, + "step": 11993 + }, + { + "epoch": 0.75, + "grad_norm": 8.16915453716689, + "learning_rate": 1.50156687150813e-06, + "loss": 0.2727, + "step": 11994 + }, + { + "epoch": 0.75, + "grad_norm": 1.9587952436999463, + "learning_rate": 1.5008392936721166e-06, + "loss": 0.2529, + "step": 11995 + }, + { + "epoch": 0.75, + "grad_norm": 1.7195241674442108, + "learning_rate": 1.5001118610224963e-06, + "loss": 0.2389, + "step": 11996 + }, + { + "epoch": 0.75, + "grad_norm": 3.7931812262722704, + "learning_rate": 1.4993845735894496e-06, + "loss": 0.2544, + "step": 11997 + }, + { + "epoch": 0.75, + "grad_norm": 2.2405025422970355, + "learning_rate": 1.4986574314031566e-06, + "loss": 0.2645, + "step": 11998 + }, + { + "epoch": 0.75, + "grad_norm": 2.135625269821901, + "learning_rate": 1.497930434493784e-06, + "loss": 0.2536, + "step": 11999 + }, + { + "epoch": 0.75, + "grad_norm": 4.046505744040854, + "learning_rate": 1.4972035828914993e-06, + "loss": 0.2862, + "step": 12000 + }, + { + "epoch": 0.75, + "grad_norm": 3.482562277878135, + "learning_rate": 1.4964768766264604e-06, + "loss": 0.2631, + "step": 12001 + }, + { + "epoch": 0.75, + "grad_norm": 2.534614146089112, + "learning_rate": 1.4957503157288166e-06, + "loss": 0.2569, + "step": 12002 + }, + { + "epoch": 0.75, + "grad_norm": 1.8568030182913553, + "learning_rate": 1.4950239002287181e-06, + "loss": 0.2724, + "step": 12003 + }, + { + "epoch": 0.75, + "grad_norm": 3.555922049773706, + "learning_rate": 1.494297630156304e-06, + "loss": 0.2596, + "step": 12004 + }, + { + "epoch": 0.75, + "grad_norm": 2.725616201416361, + "learning_rate": 1.493571505541706e-06, + "loss": 0.2715, + "step": 12005 + }, + { + "epoch": 0.76, + "grad_norm": 2.7002847467613185, + "learning_rate": 1.492845526415056e-06, + "loss": 0.2787, + "step": 12006 + }, + { + "epoch": 0.76, + "grad_norm": 10.678468794217633, + "learning_rate": 1.492119692806473e-06, + "loss": 0.2497, + "step": 12007 + }, + { + "epoch": 0.76, + "grad_norm": 1.8387834655868671, + "learning_rate": 1.4913940047460762e-06, + "loss": 0.2665, + "step": 12008 + }, + { + "epoch": 0.76, + "grad_norm": 1.716906784132446, + "learning_rate": 1.490668462263974e-06, + "loss": 0.2576, + "step": 12009 + }, + { + "epoch": 0.76, + "grad_norm": 5.013935443864974, + "learning_rate": 1.4899430653902691e-06, + "loss": 0.2543, + "step": 12010 + }, + { + "epoch": 0.76, + "grad_norm": 3.024479615880311, + "learning_rate": 1.4892178141550628e-06, + "loss": 0.2492, + "step": 12011 + }, + { + "epoch": 0.76, + "grad_norm": 3.08530901693931, + "learning_rate": 1.4884927085884448e-06, + "loss": 0.265, + "step": 12012 + }, + { + "epoch": 0.76, + "grad_norm": 2.111331303976927, + "learning_rate": 1.4877677487205005e-06, + "loss": 0.2605, + "step": 12013 + }, + { + "epoch": 0.76, + "grad_norm": 1.5905043143237334, + "learning_rate": 1.4870429345813103e-06, + "loss": 0.2578, + "step": 12014 + }, + { + "epoch": 0.76, + "grad_norm": 0.6271409889503476, + "learning_rate": 1.4863182662009501e-06, + "loss": 0.4487, + "step": 12015 + }, + { + "epoch": 0.76, + "grad_norm": 1.5999760615757777, + "learning_rate": 1.4855937436094858e-06, + "loss": 0.2424, + "step": 12016 + }, + { + "epoch": 0.76, + "grad_norm": 7.238525805627581, + "learning_rate": 1.4848693668369795e-06, + "loss": 0.2494, + "step": 12017 + }, + { + "epoch": 0.76, + "grad_norm": 2.4332015326435306, + "learning_rate": 1.4841451359134857e-06, + "loss": 0.2514, + "step": 12018 + }, + { + "epoch": 0.76, + "grad_norm": 1.7978355334402403, + "learning_rate": 1.483421050869056e-06, + "loss": 0.257, + "step": 12019 + }, + { + "epoch": 0.76, + "grad_norm": 2.8016530563684148, + "learning_rate": 1.482697111733733e-06, + "loss": 0.2761, + "step": 12020 + }, + { + "epoch": 0.76, + "grad_norm": 2.0149743446607356, + "learning_rate": 1.4819733185375535e-06, + "loss": 0.2535, + "step": 12021 + }, + { + "epoch": 0.76, + "grad_norm": 1.885860580385384, + "learning_rate": 1.481249671310549e-06, + "loss": 0.2835, + "step": 12022 + }, + { + "epoch": 0.76, + "grad_norm": 1.4484556718172874, + "learning_rate": 1.4805261700827472e-06, + "loss": 0.2524, + "step": 12023 + }, + { + "epoch": 0.76, + "grad_norm": 1.5016746816025455, + "learning_rate": 1.4798028148841665e-06, + "loss": 0.2574, + "step": 12024 + }, + { + "epoch": 0.76, + "grad_norm": 2.199610801733965, + "learning_rate": 1.4790796057448187e-06, + "loss": 0.2754, + "step": 12025 + }, + { + "epoch": 0.76, + "grad_norm": 2.1407007334441586, + "learning_rate": 1.478356542694711e-06, + "loss": 0.2673, + "step": 12026 + }, + { + "epoch": 0.76, + "grad_norm": 2.7195045978267984, + "learning_rate": 1.4776336257638468e-06, + "loss": 0.26, + "step": 12027 + }, + { + "epoch": 0.76, + "grad_norm": 1.8504353959269848, + "learning_rate": 1.4769108549822198e-06, + "loss": 0.2772, + "step": 12028 + }, + { + "epoch": 0.76, + "grad_norm": 2.6572654675745015, + "learning_rate": 1.476188230379818e-06, + "loss": 0.2517, + "step": 12029 + }, + { + "epoch": 0.76, + "grad_norm": 5.972166544547642, + "learning_rate": 1.475465751986625e-06, + "loss": 0.2586, + "step": 12030 + }, + { + "epoch": 0.76, + "grad_norm": 1.5642743584507386, + "learning_rate": 1.4747434198326206e-06, + "loss": 0.252, + "step": 12031 + }, + { + "epoch": 0.76, + "grad_norm": 3.171954189885917, + "learning_rate": 1.474021233947772e-06, + "loss": 0.2424, + "step": 12032 + }, + { + "epoch": 0.76, + "grad_norm": 3.604920851024164, + "learning_rate": 1.4732991943620462e-06, + "loss": 0.2734, + "step": 12033 + }, + { + "epoch": 0.76, + "grad_norm": 2.744629193997247, + "learning_rate": 1.4725773011053995e-06, + "loss": 0.2484, + "step": 12034 + }, + { + "epoch": 0.76, + "grad_norm": 1.3283157992310137, + "learning_rate": 1.4718555542077873e-06, + "loss": 0.279, + "step": 12035 + }, + { + "epoch": 0.76, + "grad_norm": 1.6727049308694935, + "learning_rate": 1.471133953699153e-06, + "loss": 0.2636, + "step": 12036 + }, + { + "epoch": 0.76, + "grad_norm": 1.5159553532366852, + "learning_rate": 1.4704124996094405e-06, + "loss": 0.2327, + "step": 12037 + }, + { + "epoch": 0.76, + "grad_norm": 0.6535073658917429, + "learning_rate": 1.4696911919685813e-06, + "loss": 0.465, + "step": 12038 + }, + { + "epoch": 0.76, + "grad_norm": 4.195212118883088, + "learning_rate": 1.4689700308065064e-06, + "loss": 0.2502, + "step": 12039 + }, + { + "epoch": 0.76, + "grad_norm": 4.714089820462613, + "learning_rate": 1.4682490161531371e-06, + "loss": 0.2472, + "step": 12040 + }, + { + "epoch": 0.76, + "grad_norm": 2.225937081733685, + "learning_rate": 1.467528148038389e-06, + "loss": 0.2574, + "step": 12041 + }, + { + "epoch": 0.76, + "grad_norm": 3.3797596864482804, + "learning_rate": 1.4668074264921712e-06, + "loss": 0.2474, + "step": 12042 + }, + { + "epoch": 0.76, + "grad_norm": 3.1504471511916403, + "learning_rate": 1.4660868515443899e-06, + "loss": 0.2754, + "step": 12043 + }, + { + "epoch": 0.76, + "grad_norm": 1.9441453582088524, + "learning_rate": 1.4653664232249409e-06, + "loss": 0.2717, + "step": 12044 + }, + { + "epoch": 0.76, + "grad_norm": 1.650606618098762, + "learning_rate": 1.4646461415637188e-06, + "loss": 0.2508, + "step": 12045 + }, + { + "epoch": 0.76, + "grad_norm": 1.818647293254394, + "learning_rate": 1.4639260065906058e-06, + "loss": 0.2645, + "step": 12046 + }, + { + "epoch": 0.76, + "grad_norm": 2.7636787860325356, + "learning_rate": 1.4632060183354857e-06, + "loss": 0.2701, + "step": 12047 + }, + { + "epoch": 0.76, + "grad_norm": 1.5771368792371883, + "learning_rate": 1.4624861768282295e-06, + "loss": 0.2503, + "step": 12048 + }, + { + "epoch": 0.76, + "grad_norm": 1.9217027774237119, + "learning_rate": 1.4617664820987055e-06, + "loss": 0.272, + "step": 12049 + }, + { + "epoch": 0.76, + "grad_norm": 2.1391593192395453, + "learning_rate": 1.461046934176773e-06, + "loss": 0.2544, + "step": 12050 + }, + { + "epoch": 0.76, + "grad_norm": 1.8396394512248049, + "learning_rate": 1.460327533092289e-06, + "loss": 0.2571, + "step": 12051 + }, + { + "epoch": 0.76, + "grad_norm": 2.4925167561067028, + "learning_rate": 1.4596082788751048e-06, + "loss": 0.2481, + "step": 12052 + }, + { + "epoch": 0.76, + "grad_norm": 3.106952728761021, + "learning_rate": 1.4588891715550612e-06, + "loss": 0.2565, + "step": 12053 + }, + { + "epoch": 0.76, + "grad_norm": 2.475499050212135, + "learning_rate": 1.458170211161994e-06, + "loss": 0.2689, + "step": 12054 + }, + { + "epoch": 0.76, + "grad_norm": 2.554015332841206, + "learning_rate": 1.457451397725737e-06, + "loss": 0.2559, + "step": 12055 + }, + { + "epoch": 0.76, + "grad_norm": 2.2286089315703848, + "learning_rate": 1.4567327312761143e-06, + "loss": 0.2537, + "step": 12056 + }, + { + "epoch": 0.76, + "grad_norm": 2.140349767536782, + "learning_rate": 1.4560142118429427e-06, + "loss": 0.2764, + "step": 12057 + }, + { + "epoch": 0.76, + "grad_norm": 1.9077386950447266, + "learning_rate": 1.4552958394560374e-06, + "loss": 0.2356, + "step": 12058 + }, + { + "epoch": 0.76, + "grad_norm": 2.411234553571343, + "learning_rate": 1.4545776141452023e-06, + "loss": 0.2842, + "step": 12059 + }, + { + "epoch": 0.76, + "grad_norm": 2.1401287558122895, + "learning_rate": 1.4538595359402413e-06, + "loss": 0.2646, + "step": 12060 + }, + { + "epoch": 0.76, + "grad_norm": 2.198233857607244, + "learning_rate": 1.4531416048709469e-06, + "loss": 0.2531, + "step": 12061 + }, + { + "epoch": 0.76, + "grad_norm": 1.2902543845869912, + "learning_rate": 1.4524238209671048e-06, + "loss": 0.2575, + "step": 12062 + }, + { + "epoch": 0.76, + "grad_norm": 0.6376816292524472, + "learning_rate": 1.4517061842585017e-06, + "loss": 0.482, + "step": 12063 + }, + { + "epoch": 0.76, + "grad_norm": 2.2892471408943904, + "learning_rate": 1.450988694774912e-06, + "loss": 0.2568, + "step": 12064 + }, + { + "epoch": 0.76, + "grad_norm": 2.6052649285732987, + "learning_rate": 1.4502713525461026e-06, + "loss": 0.2605, + "step": 12065 + }, + { + "epoch": 0.76, + "grad_norm": 5.933754656876215, + "learning_rate": 1.44955415760184e-06, + "loss": 0.2531, + "step": 12066 + }, + { + "epoch": 0.76, + "grad_norm": 1.5154627525102058, + "learning_rate": 1.448837109971884e-06, + "loss": 0.2529, + "step": 12067 + }, + { + "epoch": 0.76, + "grad_norm": 1.2899602501426988, + "learning_rate": 1.448120209685983e-06, + "loss": 0.2479, + "step": 12068 + }, + { + "epoch": 0.76, + "grad_norm": 14.110370146331876, + "learning_rate": 1.4474034567738838e-06, + "loss": 0.2645, + "step": 12069 + }, + { + "epoch": 0.76, + "grad_norm": 1.6665665184508645, + "learning_rate": 1.4466868512653237e-06, + "loss": 0.2937, + "step": 12070 + }, + { + "epoch": 0.76, + "grad_norm": 2.774512981607251, + "learning_rate": 1.4459703931900393e-06, + "loss": 0.2538, + "step": 12071 + }, + { + "epoch": 0.76, + "grad_norm": 3.3921342498113027, + "learning_rate": 1.4452540825777555e-06, + "loss": 0.2513, + "step": 12072 + }, + { + "epoch": 0.76, + "grad_norm": 1.697737022695345, + "learning_rate": 1.4445379194581926e-06, + "loss": 0.2451, + "step": 12073 + }, + { + "epoch": 0.76, + "grad_norm": 2.600425139139422, + "learning_rate": 1.4438219038610662e-06, + "loss": 0.2547, + "step": 12074 + }, + { + "epoch": 0.76, + "grad_norm": 1.585123484955184, + "learning_rate": 1.4431060358160876e-06, + "loss": 0.2588, + "step": 12075 + }, + { + "epoch": 0.76, + "grad_norm": 2.4003655430597806, + "learning_rate": 1.4423903153529568e-06, + "loss": 0.2881, + "step": 12076 + }, + { + "epoch": 0.76, + "grad_norm": 1.3755810571745266, + "learning_rate": 1.441674742501371e-06, + "loss": 0.2571, + "step": 12077 + }, + { + "epoch": 0.76, + "grad_norm": 2.3908024382707085, + "learning_rate": 1.440959317291019e-06, + "loss": 0.2651, + "step": 12078 + }, + { + "epoch": 0.76, + "grad_norm": 2.4494320693999985, + "learning_rate": 1.4402440397515882e-06, + "loss": 0.2506, + "step": 12079 + }, + { + "epoch": 0.76, + "grad_norm": 1.9483648144851375, + "learning_rate": 1.4395289099127545e-06, + "loss": 0.2616, + "step": 12080 + }, + { + "epoch": 0.76, + "grad_norm": 2.104228117768375, + "learning_rate": 1.4388139278041895e-06, + "loss": 0.2622, + "step": 12081 + }, + { + "epoch": 0.76, + "grad_norm": 3.2158394746627743, + "learning_rate": 1.4380990934555593e-06, + "loss": 0.2713, + "step": 12082 + }, + { + "epoch": 0.76, + "grad_norm": 2.209765771543538, + "learning_rate": 1.4373844068965265e-06, + "loss": 0.2498, + "step": 12083 + }, + { + "epoch": 0.76, + "grad_norm": 2.2188412155029993, + "learning_rate": 1.4366698681567425e-06, + "loss": 0.2683, + "step": 12084 + }, + { + "epoch": 0.76, + "grad_norm": 3.2706651879775706, + "learning_rate": 1.4359554772658551e-06, + "loss": 0.2465, + "step": 12085 + }, + { + "epoch": 0.76, + "grad_norm": 1.3405066611028733, + "learning_rate": 1.4352412342535033e-06, + "loss": 0.2461, + "step": 12086 + }, + { + "epoch": 0.76, + "grad_norm": 2.7765449780426272, + "learning_rate": 1.434527139149326e-06, + "loss": 0.2667, + "step": 12087 + }, + { + "epoch": 0.76, + "grad_norm": 2.955786494324836, + "learning_rate": 1.4338131919829513e-06, + "loss": 0.2804, + "step": 12088 + }, + { + "epoch": 0.76, + "grad_norm": 2.6086075855513613, + "learning_rate": 1.4330993927839993e-06, + "loss": 0.2731, + "step": 12089 + }, + { + "epoch": 0.76, + "grad_norm": 2.2540326874061916, + "learning_rate": 1.4323857415820892e-06, + "loss": 0.261, + "step": 12090 + }, + { + "epoch": 0.76, + "grad_norm": 2.211682084578564, + "learning_rate": 1.4316722384068332e-06, + "loss": 0.2623, + "step": 12091 + }, + { + "epoch": 0.76, + "grad_norm": 1.7568744545590538, + "learning_rate": 1.430958883287834e-06, + "loss": 0.2656, + "step": 12092 + }, + { + "epoch": 0.76, + "grad_norm": 1.8693378545721617, + "learning_rate": 1.43024567625469e-06, + "loss": 0.2701, + "step": 12093 + }, + { + "epoch": 0.76, + "grad_norm": 1.9474574766802821, + "learning_rate": 1.4295326173369921e-06, + "loss": 0.2653, + "step": 12094 + }, + { + "epoch": 0.76, + "grad_norm": 3.2457017927086524, + "learning_rate": 1.4288197065643288e-06, + "loss": 0.2696, + "step": 12095 + }, + { + "epoch": 0.76, + "grad_norm": 0.5463452648486415, + "learning_rate": 1.428106943966278e-06, + "loss": 0.4737, + "step": 12096 + }, + { + "epoch": 0.76, + "grad_norm": 1.7727890902270838, + "learning_rate": 1.4273943295724163e-06, + "loss": 0.2806, + "step": 12097 + }, + { + "epoch": 0.76, + "grad_norm": 2.3453058333942383, + "learning_rate": 1.4266818634123075e-06, + "loss": 0.2552, + "step": 12098 + }, + { + "epoch": 0.76, + "grad_norm": 2.686475443139553, + "learning_rate": 1.4259695455155165e-06, + "loss": 0.2444, + "step": 12099 + }, + { + "epoch": 0.76, + "grad_norm": 5.097105243851308, + "learning_rate": 1.4252573759115974e-06, + "loss": 0.2453, + "step": 12100 + }, + { + "epoch": 0.76, + "grad_norm": 3.442296196789242, + "learning_rate": 1.4245453546300996e-06, + "loss": 0.2646, + "step": 12101 + }, + { + "epoch": 0.76, + "grad_norm": 1.737063829014486, + "learning_rate": 1.4238334817005638e-06, + "loss": 0.2488, + "step": 12102 + }, + { + "epoch": 0.76, + "grad_norm": 1.618681622374901, + "learning_rate": 1.4231217571525308e-06, + "loss": 0.2622, + "step": 12103 + }, + { + "epoch": 0.76, + "grad_norm": 1.5502227468010201, + "learning_rate": 1.4224101810155273e-06, + "loss": 0.2547, + "step": 12104 + }, + { + "epoch": 0.76, + "grad_norm": 3.0551417447986835, + "learning_rate": 1.4216987533190819e-06, + "loss": 0.2794, + "step": 12105 + }, + { + "epoch": 0.76, + "grad_norm": 2.6893047780363037, + "learning_rate": 1.4209874740927087e-06, + "loss": 0.2388, + "step": 12106 + }, + { + "epoch": 0.76, + "grad_norm": 5.8125832767489305, + "learning_rate": 1.4202763433659244e-06, + "loss": 0.2748, + "step": 12107 + }, + { + "epoch": 0.76, + "grad_norm": 3.8689003970630766, + "learning_rate": 1.419565361168233e-06, + "loss": 0.2619, + "step": 12108 + }, + { + "epoch": 0.76, + "grad_norm": 6.820352185620478, + "learning_rate": 1.4188545275291343e-06, + "loss": 0.2524, + "step": 12109 + }, + { + "epoch": 0.76, + "grad_norm": 4.904757860928734, + "learning_rate": 1.4181438424781203e-06, + "loss": 0.2708, + "step": 12110 + }, + { + "epoch": 0.76, + "grad_norm": 2.219140879375936, + "learning_rate": 1.4174333060446804e-06, + "loss": 0.2796, + "step": 12111 + }, + { + "epoch": 0.76, + "grad_norm": 21.511571905586756, + "learning_rate": 1.4167229182582974e-06, + "loss": 0.2649, + "step": 12112 + }, + { + "epoch": 0.76, + "grad_norm": 0.5516098362282018, + "learning_rate": 1.4160126791484453e-06, + "loss": 0.4471, + "step": 12113 + }, + { + "epoch": 0.76, + "grad_norm": 1.6622975944093763, + "learning_rate": 1.4153025887445914e-06, + "loss": 0.2423, + "step": 12114 + }, + { + "epoch": 0.76, + "grad_norm": 3.50915692108032, + "learning_rate": 1.4145926470762023e-06, + "loss": 0.2729, + "step": 12115 + }, + { + "epoch": 0.76, + "grad_norm": 2.390954474437639, + "learning_rate": 1.4138828541727322e-06, + "loss": 0.2811, + "step": 12116 + }, + { + "epoch": 0.76, + "grad_norm": 1.882459526932715, + "learning_rate": 1.413173210063632e-06, + "loss": 0.2581, + "step": 12117 + }, + { + "epoch": 0.76, + "grad_norm": 10.78501533400747, + "learning_rate": 1.4124637147783431e-06, + "loss": 0.2893, + "step": 12118 + }, + { + "epoch": 0.76, + "grad_norm": 1.6551772270319498, + "learning_rate": 1.4117543683463103e-06, + "loss": 0.2491, + "step": 12119 + }, + { + "epoch": 0.76, + "grad_norm": 2.4012756677228824, + "learning_rate": 1.4110451707969624e-06, + "loss": 0.2481, + "step": 12120 + }, + { + "epoch": 0.76, + "grad_norm": 2.0419703125098643, + "learning_rate": 1.4103361221597244e-06, + "loss": 0.2601, + "step": 12121 + }, + { + "epoch": 0.76, + "grad_norm": 2.0914647375675712, + "learning_rate": 1.4096272224640155e-06, + "loss": 0.2561, + "step": 12122 + }, + { + "epoch": 0.76, + "grad_norm": 2.2375686612246555, + "learning_rate": 1.4089184717392524e-06, + "loss": 0.2406, + "step": 12123 + }, + { + "epoch": 0.76, + "grad_norm": 1.9218048095779081, + "learning_rate": 1.4082098700148395e-06, + "loss": 0.2569, + "step": 12124 + }, + { + "epoch": 0.76, + "grad_norm": 2.7098076528240505, + "learning_rate": 1.407501417320179e-06, + "loss": 0.2744, + "step": 12125 + }, + { + "epoch": 0.76, + "grad_norm": 2.9171778187326898, + "learning_rate": 1.406793113684663e-06, + "loss": 0.2599, + "step": 12126 + }, + { + "epoch": 0.76, + "grad_norm": 5.74314241740096, + "learning_rate": 1.4060849591376858e-06, + "loss": 0.2549, + "step": 12127 + }, + { + "epoch": 0.76, + "grad_norm": 2.414918815760029, + "learning_rate": 1.405376953708627e-06, + "loss": 0.2585, + "step": 12128 + }, + { + "epoch": 0.76, + "grad_norm": 1.799812980432534, + "learning_rate": 1.4046690974268634e-06, + "loss": 0.2424, + "step": 12129 + }, + { + "epoch": 0.76, + "grad_norm": 1.629016152305734, + "learning_rate": 1.4039613903217636e-06, + "loss": 0.2597, + "step": 12130 + }, + { + "epoch": 0.76, + "grad_norm": 6.118408806726114, + "learning_rate": 1.403253832422694e-06, + "loss": 0.2555, + "step": 12131 + }, + { + "epoch": 0.76, + "grad_norm": 5.47675570741331, + "learning_rate": 1.402546423759012e-06, + "loss": 0.2481, + "step": 12132 + }, + { + "epoch": 0.76, + "grad_norm": 2.5083457030073633, + "learning_rate": 1.401839164360067e-06, + "loss": 0.2457, + "step": 12133 + }, + { + "epoch": 0.76, + "grad_norm": 2.1849445597091224, + "learning_rate": 1.401132054255206e-06, + "loss": 0.2577, + "step": 12134 + }, + { + "epoch": 0.76, + "grad_norm": 2.4755413355219726, + "learning_rate": 1.4004250934737701e-06, + "loss": 0.26, + "step": 12135 + }, + { + "epoch": 0.76, + "grad_norm": 4.156483687758241, + "learning_rate": 1.39971828204509e-06, + "loss": 0.2516, + "step": 12136 + }, + { + "epoch": 0.76, + "grad_norm": 1.6256868513634082, + "learning_rate": 1.3990116199984938e-06, + "loss": 0.2482, + "step": 12137 + }, + { + "epoch": 0.76, + "grad_norm": 1.5934940225949321, + "learning_rate": 1.3983051073632996e-06, + "loss": 0.2511, + "step": 12138 + }, + { + "epoch": 0.76, + "grad_norm": 5.487029862492722, + "learning_rate": 1.397598744168826e-06, + "loss": 0.2706, + "step": 12139 + }, + { + "epoch": 0.76, + "grad_norm": 1.862388876202757, + "learning_rate": 1.3968925304443786e-06, + "loss": 0.2494, + "step": 12140 + }, + { + "epoch": 0.76, + "grad_norm": 1.6227209940447236, + "learning_rate": 1.3961864662192586e-06, + "loss": 0.2535, + "step": 12141 + }, + { + "epoch": 0.76, + "grad_norm": 1.7130382213619153, + "learning_rate": 1.3954805515227632e-06, + "loss": 0.2531, + "step": 12142 + }, + { + "epoch": 0.76, + "grad_norm": 1.8278220899417228, + "learning_rate": 1.3947747863841836e-06, + "loss": 0.2528, + "step": 12143 + }, + { + "epoch": 0.76, + "grad_norm": 1.251253788843478, + "learning_rate": 1.3940691708328018e-06, + "loss": 0.2574, + "step": 12144 + }, + { + "epoch": 0.76, + "grad_norm": 3.746734438790287, + "learning_rate": 1.3933637048978949e-06, + "loss": 0.2588, + "step": 12145 + }, + { + "epoch": 0.76, + "grad_norm": 1.9579740225844458, + "learning_rate": 1.3926583886087326e-06, + "loss": 0.2477, + "step": 12146 + }, + { + "epoch": 0.76, + "grad_norm": 3.138227823451893, + "learning_rate": 1.3919532219945819e-06, + "loss": 0.2992, + "step": 12147 + }, + { + "epoch": 0.76, + "grad_norm": 2.7109840867803783, + "learning_rate": 1.3912482050846999e-06, + "loss": 0.2637, + "step": 12148 + }, + { + "epoch": 0.76, + "grad_norm": 2.216299194263243, + "learning_rate": 1.390543337908341e-06, + "loss": 0.2548, + "step": 12149 + }, + { + "epoch": 0.76, + "grad_norm": 1.7964161006614783, + "learning_rate": 1.3898386204947485e-06, + "loss": 0.2665, + "step": 12150 + }, + { + "epoch": 0.76, + "grad_norm": 1.9030536138531213, + "learning_rate": 1.3891340528731656e-06, + "loss": 0.2543, + "step": 12151 + }, + { + "epoch": 0.76, + "grad_norm": 3.386004139164735, + "learning_rate": 1.3884296350728245e-06, + "loss": 0.2494, + "step": 12152 + }, + { + "epoch": 0.76, + "grad_norm": 2.3586158587829322, + "learning_rate": 1.3877253671229528e-06, + "loss": 0.256, + "step": 12153 + }, + { + "epoch": 0.76, + "grad_norm": 0.5827677368568623, + "learning_rate": 1.38702124905277e-06, + "loss": 0.4691, + "step": 12154 + }, + { + "epoch": 0.76, + "grad_norm": 1.9524956657439987, + "learning_rate": 1.386317280891495e-06, + "loss": 0.2708, + "step": 12155 + }, + { + "epoch": 0.76, + "grad_norm": 1.794127078629825, + "learning_rate": 1.3856134626683325e-06, + "loss": 0.2545, + "step": 12156 + }, + { + "epoch": 0.76, + "grad_norm": 2.9505565417195476, + "learning_rate": 1.384909794412489e-06, + "loss": 0.2647, + "step": 12157 + }, + { + "epoch": 0.76, + "grad_norm": 2.3767627075695774, + "learning_rate": 1.3842062761531577e-06, + "loss": 0.2529, + "step": 12158 + }, + { + "epoch": 0.76, + "grad_norm": 2.275406547439901, + "learning_rate": 1.3835029079195322e-06, + "loss": 0.2542, + "step": 12159 + }, + { + "epoch": 0.76, + "grad_norm": 2.1390473320247203, + "learning_rate": 1.3827996897407942e-06, + "loss": 0.2385, + "step": 12160 + }, + { + "epoch": 0.76, + "grad_norm": 4.502283166060426, + "learning_rate": 1.3820966216461224e-06, + "loss": 0.25, + "step": 12161 + }, + { + "epoch": 0.76, + "grad_norm": 1.8176713406240028, + "learning_rate": 1.3813937036646858e-06, + "loss": 0.2561, + "step": 12162 + }, + { + "epoch": 0.76, + "grad_norm": 2.0244744902021257, + "learning_rate": 1.3806909358256522e-06, + "loss": 0.2474, + "step": 12163 + }, + { + "epoch": 0.76, + "grad_norm": 3.7662667869668818, + "learning_rate": 1.3799883181581818e-06, + "loss": 0.2703, + "step": 12164 + }, + { + "epoch": 0.77, + "grad_norm": 6.787122337335751, + "learning_rate": 1.3792858506914258e-06, + "loss": 0.2507, + "step": 12165 + }, + { + "epoch": 0.77, + "grad_norm": 2.411214800296559, + "learning_rate": 1.37858353345453e-06, + "loss": 0.2721, + "step": 12166 + }, + { + "epoch": 0.77, + "grad_norm": 2.065825198070482, + "learning_rate": 1.3778813664766366e-06, + "loss": 0.2403, + "step": 12167 + }, + { + "epoch": 0.77, + "grad_norm": 0.5864386815362713, + "learning_rate": 1.377179349786879e-06, + "loss": 0.4672, + "step": 12168 + }, + { + "epoch": 0.77, + "grad_norm": 1.9330363884101904, + "learning_rate": 1.376477483414385e-06, + "loss": 0.2511, + "step": 12169 + }, + { + "epoch": 0.77, + "grad_norm": 2.3594106557510997, + "learning_rate": 1.3757757673882753e-06, + "loss": 0.2415, + "step": 12170 + }, + { + "epoch": 0.77, + "grad_norm": 2.854869228502983, + "learning_rate": 1.375074201737666e-06, + "loss": 0.2635, + "step": 12171 + }, + { + "epoch": 0.77, + "grad_norm": 3.5622857479687187, + "learning_rate": 1.3743727864916684e-06, + "loss": 0.2404, + "step": 12172 + }, + { + "epoch": 0.77, + "grad_norm": 1.9452416758872502, + "learning_rate": 1.3736715216793833e-06, + "loss": 0.2608, + "step": 12173 + }, + { + "epoch": 0.77, + "grad_norm": 1.7397170903180896, + "learning_rate": 1.3729704073299066e-06, + "loss": 0.2683, + "step": 12174 + }, + { + "epoch": 0.77, + "grad_norm": 3.022305899674779, + "learning_rate": 1.3722694434723316e-06, + "loss": 0.2855, + "step": 12175 + }, + { + "epoch": 0.77, + "grad_norm": 1.9460262754557436, + "learning_rate": 1.3715686301357407e-06, + "loss": 0.2422, + "step": 12176 + }, + { + "epoch": 0.77, + "grad_norm": 2.843969987891146, + "learning_rate": 1.3708679673492127e-06, + "loss": 0.2627, + "step": 12177 + }, + { + "epoch": 0.77, + "grad_norm": 3.0233249351876323, + "learning_rate": 1.3701674551418154e-06, + "loss": 0.2664, + "step": 12178 + }, + { + "epoch": 0.77, + "grad_norm": 1.773220153771355, + "learning_rate": 1.3694670935426202e-06, + "loss": 0.2419, + "step": 12179 + }, + { + "epoch": 0.77, + "grad_norm": 1.8184339760275756, + "learning_rate": 1.3687668825806843e-06, + "loss": 0.241, + "step": 12180 + }, + { + "epoch": 0.77, + "grad_norm": 2.2042115949749004, + "learning_rate": 1.3680668222850607e-06, + "loss": 0.2504, + "step": 12181 + }, + { + "epoch": 0.77, + "grad_norm": 2.52442515121413, + "learning_rate": 1.3673669126847938e-06, + "loss": 0.2895, + "step": 12182 + }, + { + "epoch": 0.77, + "grad_norm": 3.2367914963257904, + "learning_rate": 1.3666671538089272e-06, + "loss": 0.2564, + "step": 12183 + }, + { + "epoch": 0.77, + "grad_norm": 0.6387767623544527, + "learning_rate": 1.365967545686494e-06, + "loss": 0.4548, + "step": 12184 + }, + { + "epoch": 0.77, + "grad_norm": 1.5423614370986392, + "learning_rate": 1.3652680883465224e-06, + "loss": 0.2467, + "step": 12185 + }, + { + "epoch": 0.77, + "grad_norm": 2.296864002626352, + "learning_rate": 1.364568781818031e-06, + "loss": 0.2464, + "step": 12186 + }, + { + "epoch": 0.77, + "grad_norm": 2.202580176727354, + "learning_rate": 1.3638696261300404e-06, + "loss": 0.2654, + "step": 12187 + }, + { + "epoch": 0.77, + "grad_norm": 2.1249299824419166, + "learning_rate": 1.3631706213115582e-06, + "loss": 0.2424, + "step": 12188 + }, + { + "epoch": 0.77, + "grad_norm": 2.350336052881772, + "learning_rate": 1.3624717673915861e-06, + "loss": 0.2368, + "step": 12189 + }, + { + "epoch": 0.77, + "grad_norm": 3.4282564488224514, + "learning_rate": 1.36177306439912e-06, + "loss": 0.2518, + "step": 12190 + }, + { + "epoch": 0.77, + "grad_norm": 2.3358017943237215, + "learning_rate": 1.3610745123631536e-06, + "loss": 0.2449, + "step": 12191 + }, + { + "epoch": 0.77, + "grad_norm": 9.099503338369686, + "learning_rate": 1.360376111312669e-06, + "loss": 0.2403, + "step": 12192 + }, + { + "epoch": 0.77, + "grad_norm": 3.835473065478738, + "learning_rate": 1.359677861276642e-06, + "loss": 0.2733, + "step": 12193 + }, + { + "epoch": 0.77, + "grad_norm": 0.5521528226802335, + "learning_rate": 1.3589797622840473e-06, + "loss": 0.4471, + "step": 12194 + }, + { + "epoch": 0.77, + "grad_norm": 1.8520327327932877, + "learning_rate": 1.3582818143638505e-06, + "loss": 0.2594, + "step": 12195 + }, + { + "epoch": 0.77, + "grad_norm": 1.6606673204036535, + "learning_rate": 1.35758401754501e-06, + "loss": 0.2568, + "step": 12196 + }, + { + "epoch": 0.77, + "grad_norm": 1.789776641886652, + "learning_rate": 1.3568863718564779e-06, + "loss": 0.26, + "step": 12197 + }, + { + "epoch": 0.77, + "grad_norm": 3.3393705210298186, + "learning_rate": 1.3561888773271997e-06, + "loss": 0.2544, + "step": 12198 + }, + { + "epoch": 0.77, + "grad_norm": 2.2725920425656425, + "learning_rate": 1.355491533986119e-06, + "loss": 0.2607, + "step": 12199 + }, + { + "epoch": 0.77, + "grad_norm": 1.4027854019975927, + "learning_rate": 1.354794341862166e-06, + "loss": 0.2545, + "step": 12200 + }, + { + "epoch": 0.77, + "grad_norm": 2.6602622518563193, + "learning_rate": 1.3540973009842722e-06, + "loss": 0.2488, + "step": 12201 + }, + { + "epoch": 0.77, + "grad_norm": 4.4201286451726185, + "learning_rate": 1.3534004113813553e-06, + "loss": 0.2449, + "step": 12202 + }, + { + "epoch": 0.77, + "grad_norm": 1.4548751633048314, + "learning_rate": 1.3527036730823344e-06, + "loss": 0.2549, + "step": 12203 + }, + { + "epoch": 0.77, + "grad_norm": 10.878016111572727, + "learning_rate": 1.3520070861161162e-06, + "loss": 0.2718, + "step": 12204 + }, + { + "epoch": 0.77, + "grad_norm": 4.554610789370885, + "learning_rate": 1.3513106505116036e-06, + "loss": 0.2627, + "step": 12205 + }, + { + "epoch": 0.77, + "grad_norm": 1.8104370302578667, + "learning_rate": 1.350614366297691e-06, + "loss": 0.2473, + "step": 12206 + }, + { + "epoch": 0.77, + "grad_norm": 2.2328859510418466, + "learning_rate": 1.3499182335032728e-06, + "loss": 0.2394, + "step": 12207 + }, + { + "epoch": 0.77, + "grad_norm": 3.4119699133315216, + "learning_rate": 1.3492222521572286e-06, + "loss": 0.2797, + "step": 12208 + }, + { + "epoch": 0.77, + "grad_norm": 1.9954328470563016, + "learning_rate": 1.3485264222884387e-06, + "loss": 0.2661, + "step": 12209 + }, + { + "epoch": 0.77, + "grad_norm": 2.8542275818763985, + "learning_rate": 1.3478307439257721e-06, + "loss": 0.2546, + "step": 12210 + }, + { + "epoch": 0.77, + "grad_norm": 2.0227989591725586, + "learning_rate": 1.347135217098096e-06, + "loss": 0.2864, + "step": 12211 + }, + { + "epoch": 0.77, + "grad_norm": 1.7970596585819967, + "learning_rate": 1.3464398418342683e-06, + "loss": 0.2425, + "step": 12212 + }, + { + "epoch": 0.77, + "grad_norm": 2.4076250465106765, + "learning_rate": 1.3457446181631412e-06, + "loss": 0.255, + "step": 12213 + }, + { + "epoch": 0.77, + "grad_norm": 3.0487167145072678, + "learning_rate": 1.3450495461135587e-06, + "loss": 0.2798, + "step": 12214 + }, + { + "epoch": 0.77, + "grad_norm": 1.9664566338913183, + "learning_rate": 1.3443546257143624e-06, + "loss": 0.2515, + "step": 12215 + }, + { + "epoch": 0.77, + "grad_norm": 1.9933489856074946, + "learning_rate": 1.3436598569943877e-06, + "loss": 0.272, + "step": 12216 + }, + { + "epoch": 0.77, + "grad_norm": 6.363127307583459, + "learning_rate": 1.34296523998246e-06, + "loss": 0.2543, + "step": 12217 + }, + { + "epoch": 0.77, + "grad_norm": 1.7804329629619244, + "learning_rate": 1.3422707747073982e-06, + "loss": 0.26, + "step": 12218 + }, + { + "epoch": 0.77, + "grad_norm": 18.699902909772035, + "learning_rate": 1.3415764611980203e-06, + "loss": 0.2512, + "step": 12219 + }, + { + "epoch": 0.77, + "grad_norm": 2.5363615952587026, + "learning_rate": 1.3408822994831334e-06, + "loss": 0.2604, + "step": 12220 + }, + { + "epoch": 0.77, + "grad_norm": 1.743496592571209, + "learning_rate": 1.340188289591539e-06, + "loss": 0.2669, + "step": 12221 + }, + { + "epoch": 0.77, + "grad_norm": 2.5999052530662206, + "learning_rate": 1.339494431552032e-06, + "loss": 0.269, + "step": 12222 + }, + { + "epoch": 0.77, + "grad_norm": 2.2818387282556643, + "learning_rate": 1.3388007253934027e-06, + "loss": 0.2514, + "step": 12223 + }, + { + "epoch": 0.77, + "grad_norm": 3.7594046784324404, + "learning_rate": 1.3381071711444354e-06, + "loss": 0.2566, + "step": 12224 + }, + { + "epoch": 0.77, + "grad_norm": 4.996730874594099, + "learning_rate": 1.3374137688339061e-06, + "loss": 0.2631, + "step": 12225 + }, + { + "epoch": 0.77, + "grad_norm": 1.6209726289735795, + "learning_rate": 1.3367205184905834e-06, + "loss": 0.238, + "step": 12226 + }, + { + "epoch": 0.77, + "grad_norm": 3.4512645326127482, + "learning_rate": 1.3360274201432345e-06, + "loss": 0.2676, + "step": 12227 + }, + { + "epoch": 0.77, + "grad_norm": 4.004577670521368, + "learning_rate": 1.3353344738206158e-06, + "loss": 0.2577, + "step": 12228 + }, + { + "epoch": 0.77, + "grad_norm": 1.9752401514118758, + "learning_rate": 1.3346416795514794e-06, + "loss": 0.2645, + "step": 12229 + }, + { + "epoch": 0.77, + "grad_norm": 3.8238158477170305, + "learning_rate": 1.3339490373645686e-06, + "loss": 0.2532, + "step": 12230 + }, + { + "epoch": 0.77, + "grad_norm": 0.5715346091088411, + "learning_rate": 1.3332565472886238e-06, + "loss": 0.4747, + "step": 12231 + }, + { + "epoch": 0.77, + "grad_norm": 3.0124216246458646, + "learning_rate": 1.3325642093523789e-06, + "loss": 0.247, + "step": 12232 + }, + { + "epoch": 0.77, + "grad_norm": 2.4418951142527257, + "learning_rate": 1.3318720235845594e-06, + "loss": 0.253, + "step": 12233 + }, + { + "epoch": 0.77, + "grad_norm": 3.4855711853991473, + "learning_rate": 1.3311799900138834e-06, + "loss": 0.2633, + "step": 12234 + }, + { + "epoch": 0.77, + "grad_norm": 4.7269823728543825, + "learning_rate": 1.3304881086690674e-06, + "loss": 0.262, + "step": 12235 + }, + { + "epoch": 0.77, + "grad_norm": 2.111751311674048, + "learning_rate": 1.3297963795788177e-06, + "loss": 0.2404, + "step": 12236 + }, + { + "epoch": 0.77, + "grad_norm": 1.866616775899338, + "learning_rate": 1.3291048027718357e-06, + "loss": 0.2553, + "step": 12237 + }, + { + "epoch": 0.77, + "grad_norm": 3.0361375820927377, + "learning_rate": 1.3284133782768139e-06, + "loss": 0.2482, + "step": 12238 + }, + { + "epoch": 0.77, + "grad_norm": 1.8277084526024296, + "learning_rate": 1.3277221061224422e-06, + "loss": 0.2697, + "step": 12239 + }, + { + "epoch": 0.77, + "grad_norm": 1.760103770365269, + "learning_rate": 1.3270309863374047e-06, + "loss": 0.277, + "step": 12240 + }, + { + "epoch": 0.77, + "grad_norm": 3.4116259303005694, + "learning_rate": 1.3263400189503756e-06, + "loss": 0.2468, + "step": 12241 + }, + { + "epoch": 0.77, + "grad_norm": 1.8392725729239117, + "learning_rate": 1.3256492039900226e-06, + "loss": 0.241, + "step": 12242 + }, + { + "epoch": 0.77, + "grad_norm": 11.103783520812081, + "learning_rate": 1.324958541485012e-06, + "loss": 0.259, + "step": 12243 + }, + { + "epoch": 0.77, + "grad_norm": 0.6020325443230078, + "learning_rate": 1.3242680314639995e-06, + "loss": 0.486, + "step": 12244 + }, + { + "epoch": 0.77, + "grad_norm": 2.3984756000123766, + "learning_rate": 1.3235776739556333e-06, + "loss": 0.247, + "step": 12245 + }, + { + "epoch": 0.77, + "grad_norm": 2.2871965411899904, + "learning_rate": 1.3228874689885606e-06, + "loss": 0.2402, + "step": 12246 + }, + { + "epoch": 0.77, + "grad_norm": 4.720329837754432, + "learning_rate": 1.3221974165914176e-06, + "loss": 0.2578, + "step": 12247 + }, + { + "epoch": 0.77, + "grad_norm": 2.7224414669061514, + "learning_rate": 1.3215075167928372e-06, + "loss": 0.2637, + "step": 12248 + }, + { + "epoch": 0.77, + "grad_norm": 1.820060765595058, + "learning_rate": 1.3208177696214436e-06, + "loss": 0.2714, + "step": 12249 + }, + { + "epoch": 0.77, + "grad_norm": 1.5356019136963701, + "learning_rate": 1.3201281751058542e-06, + "loss": 0.2641, + "step": 12250 + }, + { + "epoch": 0.77, + "grad_norm": 1.9641856076478836, + "learning_rate": 1.319438733274685e-06, + "loss": 0.2455, + "step": 12251 + }, + { + "epoch": 0.77, + "grad_norm": 2.2092214727781436, + "learning_rate": 1.3187494441565397e-06, + "loss": 0.2526, + "step": 12252 + }, + { + "epoch": 0.77, + "grad_norm": 8.519156337722903, + "learning_rate": 1.318060307780017e-06, + "loss": 0.275, + "step": 12253 + }, + { + "epoch": 0.77, + "grad_norm": 2.9369238548468752, + "learning_rate": 1.3173713241737123e-06, + "loss": 0.2549, + "step": 12254 + }, + { + "epoch": 0.77, + "grad_norm": 1.776159112521937, + "learning_rate": 1.3166824933662136e-06, + "loss": 0.2591, + "step": 12255 + }, + { + "epoch": 0.77, + "grad_norm": 1.7435280964426205, + "learning_rate": 1.3159938153861012e-06, + "loss": 0.2569, + "step": 12256 + }, + { + "epoch": 0.77, + "grad_norm": 2.043634005003201, + "learning_rate": 1.3153052902619484e-06, + "loss": 0.266, + "step": 12257 + }, + { + "epoch": 0.77, + "grad_norm": 2.939685880575297, + "learning_rate": 1.3146169180223228e-06, + "loss": 0.257, + "step": 12258 + }, + { + "epoch": 0.77, + "grad_norm": 2.0139655057295855, + "learning_rate": 1.3139286986957884e-06, + "loss": 0.2564, + "step": 12259 + }, + { + "epoch": 0.77, + "grad_norm": 1.6765394533407212, + "learning_rate": 1.3132406323108981e-06, + "loss": 0.2587, + "step": 12260 + }, + { + "epoch": 0.77, + "grad_norm": 0.5876611684976215, + "learning_rate": 1.3125527188962034e-06, + "loss": 0.4666, + "step": 12261 + }, + { + "epoch": 0.77, + "grad_norm": 3.0890686446704105, + "learning_rate": 1.311864958480245e-06, + "loss": 0.2331, + "step": 12262 + }, + { + "epoch": 0.77, + "grad_norm": 1.677254633816732, + "learning_rate": 1.311177351091562e-06, + "loss": 0.2422, + "step": 12263 + }, + { + "epoch": 0.77, + "grad_norm": 2.256836932168832, + "learning_rate": 1.3104898967586827e-06, + "loss": 0.2553, + "step": 12264 + }, + { + "epoch": 0.77, + "grad_norm": 1.70247783766268, + "learning_rate": 1.3098025955101306e-06, + "loss": 0.2522, + "step": 12265 + }, + { + "epoch": 0.77, + "grad_norm": 2.1079012304757425, + "learning_rate": 1.3091154473744221e-06, + "loss": 0.2481, + "step": 12266 + }, + { + "epoch": 0.77, + "grad_norm": 0.603033998295756, + "learning_rate": 1.308428452380071e-06, + "loss": 0.4413, + "step": 12267 + }, + { + "epoch": 0.77, + "grad_norm": 3.8033667127370485, + "learning_rate": 1.3077416105555784e-06, + "loss": 0.2582, + "step": 12268 + }, + { + "epoch": 0.77, + "grad_norm": 5.134625154010529, + "learning_rate": 1.3070549219294465e-06, + "loss": 0.25, + "step": 12269 + }, + { + "epoch": 0.77, + "grad_norm": 3.9638394449868155, + "learning_rate": 1.306368386530164e-06, + "loss": 0.2741, + "step": 12270 + }, + { + "epoch": 0.77, + "grad_norm": 2.6311224043929804, + "learning_rate": 1.3056820043862189e-06, + "loss": 0.2553, + "step": 12271 + }, + { + "epoch": 0.77, + "grad_norm": 3.800949375937788, + "learning_rate": 1.3049957755260894e-06, + "loss": 0.2474, + "step": 12272 + }, + { + "epoch": 0.77, + "grad_norm": 2.7937288608334936, + "learning_rate": 1.3043096999782478e-06, + "loss": 0.2493, + "step": 12273 + }, + { + "epoch": 0.77, + "grad_norm": 1.947387471232567, + "learning_rate": 1.3036237777711602e-06, + "loss": 0.2383, + "step": 12274 + }, + { + "epoch": 0.77, + "grad_norm": 1.8362005445986116, + "learning_rate": 1.3029380089332872e-06, + "loss": 0.2557, + "step": 12275 + }, + { + "epoch": 0.77, + "grad_norm": 2.2221529467055463, + "learning_rate": 1.3022523934930848e-06, + "loss": 0.2772, + "step": 12276 + }, + { + "epoch": 0.77, + "grad_norm": 2.750850222222142, + "learning_rate": 1.301566931478998e-06, + "loss": 0.2386, + "step": 12277 + }, + { + "epoch": 0.77, + "grad_norm": 2.210715233519939, + "learning_rate": 1.3008816229194671e-06, + "loss": 0.278, + "step": 12278 + }, + { + "epoch": 0.77, + "grad_norm": 2.228814960391766, + "learning_rate": 1.3001964678429296e-06, + "loss": 0.2579, + "step": 12279 + }, + { + "epoch": 0.77, + "grad_norm": 2.7137110584683724, + "learning_rate": 1.2995114662778125e-06, + "loss": 0.2445, + "step": 12280 + }, + { + "epoch": 0.77, + "grad_norm": 1.6586462192318272, + "learning_rate": 1.2988266182525371e-06, + "loss": 0.25, + "step": 12281 + }, + { + "epoch": 0.77, + "grad_norm": 1.5299095831505582, + "learning_rate": 1.298141923795518e-06, + "loss": 0.2341, + "step": 12282 + }, + { + "epoch": 0.77, + "grad_norm": 1.6383193506290523, + "learning_rate": 1.2974573829351655e-06, + "loss": 0.2546, + "step": 12283 + }, + { + "epoch": 0.77, + "grad_norm": 3.1167957018806236, + "learning_rate": 1.2967729956998843e-06, + "loss": 0.2629, + "step": 12284 + }, + { + "epoch": 0.77, + "grad_norm": 3.167159529146566, + "learning_rate": 1.296088762118069e-06, + "loss": 0.2414, + "step": 12285 + }, + { + "epoch": 0.77, + "grad_norm": 1.7300635099688708, + "learning_rate": 1.2954046822181088e-06, + "loss": 0.2661, + "step": 12286 + }, + { + "epoch": 0.77, + "grad_norm": 1.9307077429699162, + "learning_rate": 1.2947207560283891e-06, + "loss": 0.2538, + "step": 12287 + }, + { + "epoch": 0.77, + "grad_norm": 2.343178622654274, + "learning_rate": 1.2940369835772875e-06, + "loss": 0.2707, + "step": 12288 + }, + { + "epoch": 0.77, + "grad_norm": 3.000069593865243, + "learning_rate": 1.2933533648931734e-06, + "loss": 0.2533, + "step": 12289 + }, + { + "epoch": 0.77, + "grad_norm": 1.9158547480528179, + "learning_rate": 1.2926699000044107e-06, + "loss": 0.2614, + "step": 12290 + }, + { + "epoch": 0.77, + "grad_norm": 2.064007732610284, + "learning_rate": 1.2919865889393584e-06, + "loss": 0.266, + "step": 12291 + }, + { + "epoch": 0.77, + "grad_norm": 1.5719989869828415, + "learning_rate": 1.2913034317263706e-06, + "loss": 0.2428, + "step": 12292 + }, + { + "epoch": 0.77, + "grad_norm": 0.5747876282402975, + "learning_rate": 1.2906204283937907e-06, + "loss": 0.4773, + "step": 12293 + }, + { + "epoch": 0.77, + "grad_norm": 6.215520242756292, + "learning_rate": 1.2899375789699564e-06, + "loss": 0.2596, + "step": 12294 + }, + { + "epoch": 0.77, + "grad_norm": 1.8201404682568871, + "learning_rate": 1.289254883483203e-06, + "loss": 0.2761, + "step": 12295 + }, + { + "epoch": 0.77, + "grad_norm": 1.6073309541184808, + "learning_rate": 1.2885723419618556e-06, + "loss": 0.2527, + "step": 12296 + }, + { + "epoch": 0.77, + "grad_norm": 2.1361266884470442, + "learning_rate": 1.2878899544342327e-06, + "loss": 0.2513, + "step": 12297 + }, + { + "epoch": 0.77, + "grad_norm": 1.8028098344253858, + "learning_rate": 1.2872077209286505e-06, + "loss": 0.2404, + "step": 12298 + }, + { + "epoch": 0.77, + "grad_norm": 9.785329958891657, + "learning_rate": 1.2865256414734133e-06, + "loss": 0.2312, + "step": 12299 + }, + { + "epoch": 0.77, + "grad_norm": 6.398529969598262, + "learning_rate": 1.285843716096824e-06, + "loss": 0.2621, + "step": 12300 + }, + { + "epoch": 0.77, + "grad_norm": 2.7311487904016185, + "learning_rate": 1.2851619448271762e-06, + "loss": 0.2635, + "step": 12301 + }, + { + "epoch": 0.77, + "grad_norm": 3.3297448433351207, + "learning_rate": 1.2844803276927565e-06, + "loss": 0.2564, + "step": 12302 + }, + { + "epoch": 0.77, + "grad_norm": 2.5766723875318966, + "learning_rate": 1.283798864721849e-06, + "loss": 0.2797, + "step": 12303 + }, + { + "epoch": 0.77, + "grad_norm": 1.9294270427827505, + "learning_rate": 1.2831175559427266e-06, + "loss": 0.2548, + "step": 12304 + }, + { + "epoch": 0.77, + "grad_norm": 1.5845101405312683, + "learning_rate": 1.2824364013836583e-06, + "loss": 0.2635, + "step": 12305 + }, + { + "epoch": 0.77, + "grad_norm": 3.954355978769204, + "learning_rate": 1.2817554010729071e-06, + "loss": 0.2688, + "step": 12306 + }, + { + "epoch": 0.77, + "grad_norm": 1.4917984473922121, + "learning_rate": 1.2810745550387283e-06, + "loss": 0.264, + "step": 12307 + }, + { + "epoch": 0.77, + "grad_norm": 2.4676256062330664, + "learning_rate": 1.2803938633093726e-06, + "loss": 0.2606, + "step": 12308 + }, + { + "epoch": 0.77, + "grad_norm": 2.3007111312262816, + "learning_rate": 1.2797133259130822e-06, + "loss": 0.2567, + "step": 12309 + }, + { + "epoch": 0.77, + "grad_norm": 1.7365384012426694, + "learning_rate": 1.2790329428780928e-06, + "loss": 0.2528, + "step": 12310 + }, + { + "epoch": 0.77, + "grad_norm": 2.859494170895407, + "learning_rate": 1.2783527142326375e-06, + "loss": 0.262, + "step": 12311 + }, + { + "epoch": 0.77, + "grad_norm": 2.078863950762435, + "learning_rate": 1.277672640004936e-06, + "loss": 0.2674, + "step": 12312 + }, + { + "epoch": 0.77, + "grad_norm": 1.7138783608599124, + "learning_rate": 1.2769927202232108e-06, + "loss": 0.2524, + "step": 12313 + }, + { + "epoch": 0.77, + "grad_norm": 1.8801935359782573, + "learning_rate": 1.2763129549156699e-06, + "loss": 0.2678, + "step": 12314 + }, + { + "epoch": 0.77, + "grad_norm": 1.53751842020365, + "learning_rate": 1.2756333441105168e-06, + "loss": 0.2458, + "step": 12315 + }, + { + "epoch": 0.77, + "grad_norm": 4.082930999696426, + "learning_rate": 1.2749538878359535e-06, + "loss": 0.2641, + "step": 12316 + }, + { + "epoch": 0.77, + "grad_norm": 1.973673385915936, + "learning_rate": 1.2742745861201694e-06, + "loss": 0.2432, + "step": 12317 + }, + { + "epoch": 0.77, + "grad_norm": 2.012117341277662, + "learning_rate": 1.273595438991349e-06, + "loss": 0.2942, + "step": 12318 + }, + { + "epoch": 0.77, + "grad_norm": 1.8174064867377344, + "learning_rate": 1.2729164464776743e-06, + "loss": 0.2483, + "step": 12319 + }, + { + "epoch": 0.77, + "grad_norm": 1.6989239722003788, + "learning_rate": 1.2722376086073147e-06, + "loss": 0.281, + "step": 12320 + }, + { + "epoch": 0.77, + "grad_norm": 1.6236687822976983, + "learning_rate": 1.27155892540844e-06, + "loss": 0.259, + "step": 12321 + }, + { + "epoch": 0.77, + "grad_norm": 1.7435312816188404, + "learning_rate": 1.270880396909207e-06, + "loss": 0.2619, + "step": 12322 + }, + { + "epoch": 0.77, + "grad_norm": 1.9706493887070782, + "learning_rate": 1.2702020231377688e-06, + "loss": 0.2336, + "step": 12323 + }, + { + "epoch": 0.78, + "grad_norm": 1.4683597384257465, + "learning_rate": 1.2695238041222752e-06, + "loss": 0.2698, + "step": 12324 + }, + { + "epoch": 0.78, + "grad_norm": 1.937780816532654, + "learning_rate": 1.2688457398908648e-06, + "loss": 0.2518, + "step": 12325 + }, + { + "epoch": 0.78, + "grad_norm": 2.1077805243028296, + "learning_rate": 1.2681678304716711e-06, + "loss": 0.2603, + "step": 12326 + }, + { + "epoch": 0.78, + "grad_norm": 0.5625326232647218, + "learning_rate": 1.267490075892822e-06, + "loss": 0.4493, + "step": 12327 + }, + { + "epoch": 0.78, + "grad_norm": 2.3462421567618548, + "learning_rate": 1.2668124761824408e-06, + "loss": 0.2449, + "step": 12328 + }, + { + "epoch": 0.78, + "grad_norm": 5.261398561192962, + "learning_rate": 1.266135031368641e-06, + "loss": 0.2699, + "step": 12329 + }, + { + "epoch": 0.78, + "grad_norm": 2.170633701689669, + "learning_rate": 1.265457741479531e-06, + "loss": 0.2718, + "step": 12330 + }, + { + "epoch": 0.78, + "grad_norm": 2.8752811960831397, + "learning_rate": 1.2647806065432106e-06, + "loss": 0.2518, + "step": 12331 + }, + { + "epoch": 0.78, + "grad_norm": 5.166894900555235, + "learning_rate": 1.2641036265877794e-06, + "loss": 0.2643, + "step": 12332 + }, + { + "epoch": 0.78, + "grad_norm": 2.247543311408234, + "learning_rate": 1.2634268016413242e-06, + "loss": 0.2607, + "step": 12333 + }, + { + "epoch": 0.78, + "grad_norm": 1.9844969605356477, + "learning_rate": 1.2627501317319263e-06, + "loss": 0.2614, + "step": 12334 + }, + { + "epoch": 0.78, + "grad_norm": 3.2404791224035177, + "learning_rate": 1.2620736168876636e-06, + "loss": 0.2507, + "step": 12335 + }, + { + "epoch": 0.78, + "grad_norm": 1.941104051602157, + "learning_rate": 1.2613972571366074e-06, + "loss": 0.2586, + "step": 12336 + }, + { + "epoch": 0.78, + "grad_norm": 2.378846833590021, + "learning_rate": 1.2607210525068192e-06, + "loss": 0.2526, + "step": 12337 + }, + { + "epoch": 0.78, + "grad_norm": 1.9412812644753084, + "learning_rate": 1.260045003026355e-06, + "loss": 0.2595, + "step": 12338 + }, + { + "epoch": 0.78, + "grad_norm": 2.836719651860733, + "learning_rate": 1.2593691087232684e-06, + "loss": 0.2706, + "step": 12339 + }, + { + "epoch": 0.78, + "grad_norm": 2.541883754239091, + "learning_rate": 1.258693369625601e-06, + "loss": 0.2536, + "step": 12340 + }, + { + "epoch": 0.78, + "grad_norm": 1.9858546006136708, + "learning_rate": 1.2580177857613912e-06, + "loss": 0.2424, + "step": 12341 + }, + { + "epoch": 0.78, + "grad_norm": 8.818343775609208, + "learning_rate": 1.2573423571586686e-06, + "loss": 0.2537, + "step": 12342 + }, + { + "epoch": 0.78, + "grad_norm": 1.8340129739737918, + "learning_rate": 1.2566670838454593e-06, + "loss": 0.2549, + "step": 12343 + }, + { + "epoch": 0.78, + "grad_norm": 0.5880251809977763, + "learning_rate": 1.2559919658497827e-06, + "loss": 0.4537, + "step": 12344 + }, + { + "epoch": 0.78, + "grad_norm": 2.0419660037749505, + "learning_rate": 1.2553170031996497e-06, + "loss": 0.2734, + "step": 12345 + }, + { + "epoch": 0.78, + "grad_norm": 2.1628444714305917, + "learning_rate": 1.2546421959230642e-06, + "loss": 0.2404, + "step": 12346 + }, + { + "epoch": 0.78, + "grad_norm": 5.122535391561523, + "learning_rate": 1.253967544048028e-06, + "loss": 0.2551, + "step": 12347 + }, + { + "epoch": 0.78, + "grad_norm": 4.0228905299660935, + "learning_rate": 1.2532930476025322e-06, + "loss": 0.2608, + "step": 12348 + }, + { + "epoch": 0.78, + "grad_norm": 3.1976743069358076, + "learning_rate": 1.2526187066145628e-06, + "loss": 0.2584, + "step": 12349 + }, + { + "epoch": 0.78, + "grad_norm": 3.140037319583642, + "learning_rate": 1.2519445211120979e-06, + "loss": 0.2546, + "step": 12350 + }, + { + "epoch": 0.78, + "grad_norm": 1.8626642140723708, + "learning_rate": 1.2512704911231116e-06, + "loss": 0.2392, + "step": 12351 + }, + { + "epoch": 0.78, + "grad_norm": 6.315816455665586, + "learning_rate": 1.2505966166755729e-06, + "loss": 0.2531, + "step": 12352 + }, + { + "epoch": 0.78, + "grad_norm": 3.7528098609029947, + "learning_rate": 1.2499228977974403e-06, + "loss": 0.2682, + "step": 12353 + }, + { + "epoch": 0.78, + "grad_norm": 1.9548536727600647, + "learning_rate": 1.2492493345166662e-06, + "loss": 0.2473, + "step": 12354 + }, + { + "epoch": 0.78, + "grad_norm": 2.5180813166607314, + "learning_rate": 1.2485759268612008e-06, + "loss": 0.2503, + "step": 12355 + }, + { + "epoch": 0.78, + "grad_norm": 2.4406103045390592, + "learning_rate": 1.2479026748589834e-06, + "loss": 0.25, + "step": 12356 + }, + { + "epoch": 0.78, + "grad_norm": 4.0824957613167765, + "learning_rate": 1.2472295785379468e-06, + "loss": 0.2502, + "step": 12357 + }, + { + "epoch": 0.78, + "grad_norm": 1.5232142161234588, + "learning_rate": 1.2465566379260224e-06, + "loss": 0.2542, + "step": 12358 + }, + { + "epoch": 0.78, + "grad_norm": 1.9319094125273095, + "learning_rate": 1.245883853051128e-06, + "loss": 0.2372, + "step": 12359 + }, + { + "epoch": 0.78, + "grad_norm": 1.8080425389440837, + "learning_rate": 1.2452112239411823e-06, + "loss": 0.2629, + "step": 12360 + }, + { + "epoch": 0.78, + "grad_norm": 12.304213693016038, + "learning_rate": 1.2445387506240925e-06, + "loss": 0.2732, + "step": 12361 + }, + { + "epoch": 0.78, + "grad_norm": 0.6106698475194454, + "learning_rate": 1.243866433127759e-06, + "loss": 0.4669, + "step": 12362 + }, + { + "epoch": 0.78, + "grad_norm": 1.8446023964467877, + "learning_rate": 1.24319427148008e-06, + "loss": 0.2544, + "step": 12363 + }, + { + "epoch": 0.78, + "grad_norm": 3.3624951310690876, + "learning_rate": 1.242522265708942e-06, + "loss": 0.2857, + "step": 12364 + }, + { + "epoch": 0.78, + "grad_norm": 5.60000752409501, + "learning_rate": 1.2418504158422306e-06, + "loss": 0.2747, + "step": 12365 + }, + { + "epoch": 0.78, + "grad_norm": 1.652813906098466, + "learning_rate": 1.2411787219078208e-06, + "loss": 0.2423, + "step": 12366 + }, + { + "epoch": 0.78, + "grad_norm": 3.5945675052839787, + "learning_rate": 1.2405071839335808e-06, + "loss": 0.262, + "step": 12367 + }, + { + "epoch": 0.78, + "grad_norm": 2.565938368415108, + "learning_rate": 1.2398358019473766e-06, + "loss": 0.2546, + "step": 12368 + }, + { + "epoch": 0.78, + "grad_norm": 1.755968406847944, + "learning_rate": 1.2391645759770642e-06, + "loss": 0.2445, + "step": 12369 + }, + { + "epoch": 0.78, + "grad_norm": 178.19725017159877, + "learning_rate": 1.238493506050492e-06, + "loss": 0.2667, + "step": 12370 + }, + { + "epoch": 0.78, + "grad_norm": 1.518734467800095, + "learning_rate": 1.2378225921955067e-06, + "loss": 0.2489, + "step": 12371 + }, + { + "epoch": 0.78, + "grad_norm": 3.685700762759292, + "learning_rate": 1.237151834439943e-06, + "loss": 0.2652, + "step": 12372 + }, + { + "epoch": 0.78, + "grad_norm": 1.6633170610302996, + "learning_rate": 1.2364812328116348e-06, + "loss": 0.2425, + "step": 12373 + }, + { + "epoch": 0.78, + "grad_norm": 1.9514821613397824, + "learning_rate": 1.235810787338405e-06, + "loss": 0.2413, + "step": 12374 + }, + { + "epoch": 0.78, + "grad_norm": 4.374962238376874, + "learning_rate": 1.2351404980480702e-06, + "loss": 0.2653, + "step": 12375 + }, + { + "epoch": 0.78, + "grad_norm": 0.5858438771644625, + "learning_rate": 1.2344703649684442e-06, + "loss": 0.4414, + "step": 12376 + }, + { + "epoch": 0.78, + "grad_norm": 2.555637482244403, + "learning_rate": 1.233800388127332e-06, + "loss": 0.2658, + "step": 12377 + }, + { + "epoch": 0.78, + "grad_norm": 1.9046070394648476, + "learning_rate": 1.233130567552529e-06, + "loss": 0.2615, + "step": 12378 + }, + { + "epoch": 0.78, + "grad_norm": 2.2725242366830396, + "learning_rate": 1.2324609032718298e-06, + "loss": 0.2602, + "step": 12379 + }, + { + "epoch": 0.78, + "grad_norm": 1.9590298443285727, + "learning_rate": 1.2317913953130213e-06, + "loss": 0.272, + "step": 12380 + }, + { + "epoch": 0.78, + "grad_norm": 1.9062195970508713, + "learning_rate": 1.2311220437038801e-06, + "loss": 0.2589, + "step": 12381 + }, + { + "epoch": 0.78, + "grad_norm": 1.7911703758876762, + "learning_rate": 1.23045284847218e-06, + "loss": 0.2562, + "step": 12382 + }, + { + "epoch": 0.78, + "grad_norm": 2.5286150250647546, + "learning_rate": 1.2297838096456855e-06, + "loss": 0.2563, + "step": 12383 + }, + { + "epoch": 0.78, + "grad_norm": 1.4972484247860378, + "learning_rate": 1.2291149272521586e-06, + "loss": 0.2602, + "step": 12384 + }, + { + "epoch": 0.78, + "grad_norm": 2.0914695433560055, + "learning_rate": 1.2284462013193515e-06, + "loss": 0.2404, + "step": 12385 + }, + { + "epoch": 0.78, + "grad_norm": 1.8861475027800798, + "learning_rate": 1.227777631875009e-06, + "loss": 0.2666, + "step": 12386 + }, + { + "epoch": 0.78, + "grad_norm": 2.5058317852938243, + "learning_rate": 1.2271092189468726e-06, + "loss": 0.2568, + "step": 12387 + }, + { + "epoch": 0.78, + "grad_norm": 2.8664571856773016, + "learning_rate": 1.2264409625626778e-06, + "loss": 0.245, + "step": 12388 + }, + { + "epoch": 0.78, + "grad_norm": 2.626670678213925, + "learning_rate": 1.2257728627501497e-06, + "loss": 0.2518, + "step": 12389 + }, + { + "epoch": 0.78, + "grad_norm": 1.7524761963452473, + "learning_rate": 1.2251049195370097e-06, + "loss": 0.2582, + "step": 12390 + }, + { + "epoch": 0.78, + "grad_norm": 1.7858765545749833, + "learning_rate": 1.2244371329509697e-06, + "loss": 0.2594, + "step": 12391 + }, + { + "epoch": 0.78, + "grad_norm": 1.6927565945282785, + "learning_rate": 1.223769503019741e-06, + "loss": 0.2558, + "step": 12392 + }, + { + "epoch": 0.78, + "grad_norm": 4.001672851441979, + "learning_rate": 1.2231020297710228e-06, + "loss": 0.2422, + "step": 12393 + }, + { + "epoch": 0.78, + "grad_norm": 3.661360873683687, + "learning_rate": 1.2224347132325082e-06, + "loss": 0.2527, + "step": 12394 + }, + { + "epoch": 0.78, + "grad_norm": 2.009360135903299, + "learning_rate": 1.2217675534318873e-06, + "loss": 0.243, + "step": 12395 + }, + { + "epoch": 0.78, + "grad_norm": 2.2687522923484273, + "learning_rate": 1.2211005503968431e-06, + "loss": 0.2624, + "step": 12396 + }, + { + "epoch": 0.78, + "grad_norm": 2.4004277469686612, + "learning_rate": 1.220433704155049e-06, + "loss": 0.2477, + "step": 12397 + }, + { + "epoch": 0.78, + "grad_norm": 2.0374917725442665, + "learning_rate": 1.219767014734174e-06, + "loss": 0.2686, + "step": 12398 + }, + { + "epoch": 0.78, + "grad_norm": 2.6373522856638223, + "learning_rate": 1.2191004821618785e-06, + "loss": 0.2533, + "step": 12399 + }, + { + "epoch": 0.78, + "grad_norm": 2.590581844065551, + "learning_rate": 1.2184341064658206e-06, + "loss": 0.2631, + "step": 12400 + }, + { + "epoch": 0.78, + "grad_norm": 2.877908650692595, + "learning_rate": 1.2177678876736488e-06, + "loss": 0.254, + "step": 12401 + }, + { + "epoch": 0.78, + "grad_norm": 7.574956335709526, + "learning_rate": 1.2171018258130042e-06, + "loss": 0.2649, + "step": 12402 + }, + { + "epoch": 0.78, + "grad_norm": 4.123557344062702, + "learning_rate": 1.2164359209115235e-06, + "loss": 0.244, + "step": 12403 + }, + { + "epoch": 0.78, + "grad_norm": 1.2841673830322133, + "learning_rate": 1.2157701729968384e-06, + "loss": 0.2629, + "step": 12404 + }, + { + "epoch": 0.78, + "grad_norm": 1.658671307641624, + "learning_rate": 1.2151045820965708e-06, + "loss": 0.2447, + "step": 12405 + }, + { + "epoch": 0.78, + "grad_norm": 3.439038942895115, + "learning_rate": 1.2144391482383361e-06, + "loss": 0.2476, + "step": 12406 + }, + { + "epoch": 0.78, + "grad_norm": 4.109420604086303, + "learning_rate": 1.2137738714497437e-06, + "loss": 0.2548, + "step": 12407 + }, + { + "epoch": 0.78, + "grad_norm": 1.7114240748355343, + "learning_rate": 1.2131087517584006e-06, + "loss": 0.2651, + "step": 12408 + }, + { + "epoch": 0.78, + "grad_norm": 1.9308033531199176, + "learning_rate": 1.2124437891918995e-06, + "loss": 0.2484, + "step": 12409 + }, + { + "epoch": 0.78, + "grad_norm": 1.7548605238780082, + "learning_rate": 1.2117789837778349e-06, + "loss": 0.2356, + "step": 12410 + }, + { + "epoch": 0.78, + "grad_norm": 2.974220225322346, + "learning_rate": 1.2111143355437877e-06, + "loss": 0.234, + "step": 12411 + }, + { + "epoch": 0.78, + "grad_norm": 1.752797791187074, + "learning_rate": 1.2104498445173373e-06, + "loss": 0.2339, + "step": 12412 + }, + { + "epoch": 0.78, + "grad_norm": 0.6263168882674981, + "learning_rate": 1.2097855107260542e-06, + "loss": 0.4699, + "step": 12413 + }, + { + "epoch": 0.78, + "grad_norm": 2.156443661231618, + "learning_rate": 1.209121334197501e-06, + "loss": 0.247, + "step": 12414 + }, + { + "epoch": 0.78, + "grad_norm": 2.4142469559438777, + "learning_rate": 1.2084573149592382e-06, + "loss": 0.2503, + "step": 12415 + }, + { + "epoch": 0.78, + "grad_norm": 1.647771114225083, + "learning_rate": 1.2077934530388163e-06, + "loss": 0.2396, + "step": 12416 + }, + { + "epoch": 0.78, + "grad_norm": 6.529543479823659, + "learning_rate": 1.2071297484637785e-06, + "loss": 0.2468, + "step": 12417 + }, + { + "epoch": 0.78, + "grad_norm": 1.6622057967595985, + "learning_rate": 1.2064662012616651e-06, + "loss": 0.2584, + "step": 12418 + }, + { + "epoch": 0.78, + "grad_norm": 2.868284889800844, + "learning_rate": 1.2058028114600061e-06, + "loss": 0.2506, + "step": 12419 + }, + { + "epoch": 0.78, + "grad_norm": 2.3575051245893266, + "learning_rate": 1.205139579086329e-06, + "loss": 0.2731, + "step": 12420 + }, + { + "epoch": 0.78, + "grad_norm": 1.8052124583456362, + "learning_rate": 1.204476504168151e-06, + "loss": 0.244, + "step": 12421 + }, + { + "epoch": 0.78, + "grad_norm": 1.7006314857029459, + "learning_rate": 1.203813586732983e-06, + "loss": 0.2552, + "step": 12422 + }, + { + "epoch": 0.78, + "grad_norm": 2.815932145949014, + "learning_rate": 1.2031508268083342e-06, + "loss": 0.2782, + "step": 12423 + }, + { + "epoch": 0.78, + "grad_norm": 3.5145252146341597, + "learning_rate": 1.2024882244216996e-06, + "loss": 0.2438, + "step": 12424 + }, + { + "epoch": 0.78, + "grad_norm": 2.940945168592251, + "learning_rate": 1.201825779600575e-06, + "loss": 0.2598, + "step": 12425 + }, + { + "epoch": 0.78, + "grad_norm": 1.6301880384540195, + "learning_rate": 1.2011634923724452e-06, + "loss": 0.2485, + "step": 12426 + }, + { + "epoch": 0.78, + "grad_norm": 1.843311714911491, + "learning_rate": 1.2005013627647889e-06, + "loss": 0.2532, + "step": 12427 + }, + { + "epoch": 0.78, + "grad_norm": 1.7614695281699693, + "learning_rate": 1.1998393908050803e-06, + "loss": 0.2537, + "step": 12428 + }, + { + "epoch": 0.78, + "grad_norm": 2.1405152867619286, + "learning_rate": 1.1991775765207854e-06, + "loss": 0.2679, + "step": 12429 + }, + { + "epoch": 0.78, + "grad_norm": 0.5735871136892312, + "learning_rate": 1.1985159199393626e-06, + "loss": 0.4516, + "step": 12430 + }, + { + "epoch": 0.78, + "grad_norm": 2.7317408857345176, + "learning_rate": 1.1978544210882675e-06, + "loss": 0.2497, + "step": 12431 + }, + { + "epoch": 0.78, + "grad_norm": 2.075251173038843, + "learning_rate": 1.197193079994945e-06, + "loss": 0.267, + "step": 12432 + }, + { + "epoch": 0.78, + "grad_norm": 1.828177396460023, + "learning_rate": 1.1965318966868372e-06, + "loss": 0.2445, + "step": 12433 + }, + { + "epoch": 0.78, + "grad_norm": 1.91106666692495, + "learning_rate": 1.1958708711913769e-06, + "loss": 0.2403, + "step": 12434 + }, + { + "epoch": 0.78, + "grad_norm": 1.7164893516252615, + "learning_rate": 1.1952100035359893e-06, + "loss": 0.261, + "step": 12435 + }, + { + "epoch": 0.78, + "grad_norm": 1.9739400176144315, + "learning_rate": 1.1945492937480984e-06, + "loss": 0.2515, + "step": 12436 + }, + { + "epoch": 0.78, + "grad_norm": 1.6913900386011396, + "learning_rate": 1.1938887418551164e-06, + "loss": 0.254, + "step": 12437 + }, + { + "epoch": 0.78, + "grad_norm": 1.442235477648228, + "learning_rate": 1.1932283478844497e-06, + "loss": 0.2649, + "step": 12438 + }, + { + "epoch": 0.78, + "grad_norm": 1.726568134779693, + "learning_rate": 1.1925681118635008e-06, + "loss": 0.2729, + "step": 12439 + }, + { + "epoch": 0.78, + "grad_norm": 0.6070856432939178, + "learning_rate": 1.1919080338196642e-06, + "loss": 0.4923, + "step": 12440 + }, + { + "epoch": 0.78, + "grad_norm": 3.9357918761025683, + "learning_rate": 1.1912481137803277e-06, + "loss": 0.2582, + "step": 12441 + }, + { + "epoch": 0.78, + "grad_norm": 4.239781688494264, + "learning_rate": 1.1905883517728723e-06, + "loss": 0.2741, + "step": 12442 + }, + { + "epoch": 0.78, + "grad_norm": 12.548832100515837, + "learning_rate": 1.1899287478246707e-06, + "loss": 0.2451, + "step": 12443 + }, + { + "epoch": 0.78, + "grad_norm": 1.9318255389905616, + "learning_rate": 1.1892693019630946e-06, + "loss": 0.2755, + "step": 12444 + }, + { + "epoch": 0.78, + "grad_norm": 2.309551825458991, + "learning_rate": 1.1886100142155032e-06, + "loss": 0.2632, + "step": 12445 + }, + { + "epoch": 0.78, + "grad_norm": 2.5611418476221632, + "learning_rate": 1.1879508846092513e-06, + "loss": 0.2738, + "step": 12446 + }, + { + "epoch": 0.78, + "grad_norm": 3.5195417147947374, + "learning_rate": 1.1872919131716875e-06, + "loss": 0.2497, + "step": 12447 + }, + { + "epoch": 0.78, + "grad_norm": 1.8920029144962827, + "learning_rate": 1.1866330999301562e-06, + "loss": 0.2658, + "step": 12448 + }, + { + "epoch": 0.78, + "grad_norm": 3.4493648452887045, + "learning_rate": 1.1859744449119908e-06, + "loss": 0.2571, + "step": 12449 + }, + { + "epoch": 0.78, + "grad_norm": 1.5764234114088442, + "learning_rate": 1.1853159481445203e-06, + "loss": 0.2499, + "step": 12450 + }, + { + "epoch": 0.78, + "grad_norm": 1.9210018274176985, + "learning_rate": 1.184657609655065e-06, + "loss": 0.2657, + "step": 12451 + }, + { + "epoch": 0.78, + "grad_norm": 2.6734108429947123, + "learning_rate": 1.183999429470944e-06, + "loss": 0.2585, + "step": 12452 + }, + { + "epoch": 0.78, + "grad_norm": 2.85496893988578, + "learning_rate": 1.1833414076194643e-06, + "loss": 0.2543, + "step": 12453 + }, + { + "epoch": 0.78, + "grad_norm": 1.874318287772399, + "learning_rate": 1.1826835441279277e-06, + "loss": 0.2703, + "step": 12454 + }, + { + "epoch": 0.78, + "grad_norm": 2.020251792916572, + "learning_rate": 1.182025839023631e-06, + "loss": 0.2456, + "step": 12455 + }, + { + "epoch": 0.78, + "grad_norm": 3.054236271080544, + "learning_rate": 1.1813682923338654e-06, + "loss": 0.2484, + "step": 12456 + }, + { + "epoch": 0.78, + "grad_norm": 2.5462757259698257, + "learning_rate": 1.1807109040859115e-06, + "loss": 0.256, + "step": 12457 + }, + { + "epoch": 0.78, + "grad_norm": 2.7216708233070572, + "learning_rate": 1.1800536743070467e-06, + "loss": 0.2622, + "step": 12458 + }, + { + "epoch": 0.78, + "grad_norm": 3.7179250263832087, + "learning_rate": 1.1793966030245379e-06, + "loss": 0.2536, + "step": 12459 + }, + { + "epoch": 0.78, + "grad_norm": 0.6603924928332737, + "learning_rate": 1.1787396902656518e-06, + "loss": 0.4963, + "step": 12460 + }, + { + "epoch": 0.78, + "grad_norm": 2.5371285628577036, + "learning_rate": 1.1780829360576418e-06, + "loss": 0.2527, + "step": 12461 + }, + { + "epoch": 0.78, + "grad_norm": 2.1436323176442866, + "learning_rate": 1.1774263404277607e-06, + "loss": 0.2524, + "step": 12462 + }, + { + "epoch": 0.78, + "grad_norm": 2.620969723792433, + "learning_rate": 1.1767699034032492e-06, + "loss": 0.2527, + "step": 12463 + }, + { + "epoch": 0.78, + "grad_norm": 4.06020484986667, + "learning_rate": 1.1761136250113465e-06, + "loss": 0.2586, + "step": 12464 + }, + { + "epoch": 0.78, + "grad_norm": 10.324309916458656, + "learning_rate": 1.1754575052792815e-06, + "loss": 0.2812, + "step": 12465 + }, + { + "epoch": 0.78, + "grad_norm": 2.1097163696447896, + "learning_rate": 1.1748015442342781e-06, + "loss": 0.273, + "step": 12466 + }, + { + "epoch": 0.78, + "grad_norm": 0.5635493235170816, + "learning_rate": 1.1741457419035507e-06, + "loss": 0.4458, + "step": 12467 + }, + { + "epoch": 0.78, + "grad_norm": 9.33909884006406, + "learning_rate": 1.1734900983143137e-06, + "loss": 0.2516, + "step": 12468 + }, + { + "epoch": 0.78, + "grad_norm": 2.627532320798, + "learning_rate": 1.1728346134937684e-06, + "loss": 0.2681, + "step": 12469 + }, + { + "epoch": 0.78, + "grad_norm": 2.4333070540487776, + "learning_rate": 1.1721792874691134e-06, + "loss": 0.2567, + "step": 12470 + }, + { + "epoch": 0.78, + "grad_norm": 1.9691859001338756, + "learning_rate": 1.1715241202675376e-06, + "loss": 0.269, + "step": 12471 + }, + { + "epoch": 0.78, + "grad_norm": 3.214259415331407, + "learning_rate": 1.170869111916228e-06, + "loss": 0.247, + "step": 12472 + }, + { + "epoch": 0.78, + "grad_norm": 2.141149431017653, + "learning_rate": 1.1702142624423597e-06, + "loss": 0.2471, + "step": 12473 + }, + { + "epoch": 0.78, + "grad_norm": 1.4431629562547268, + "learning_rate": 1.169559571873104e-06, + "loss": 0.2388, + "step": 12474 + }, + { + "epoch": 0.78, + "grad_norm": 1.9320162319467629, + "learning_rate": 1.1689050402356245e-06, + "loss": 0.2676, + "step": 12475 + }, + { + "epoch": 0.78, + "grad_norm": 1.9550619027480893, + "learning_rate": 1.1682506675570794e-06, + "loss": 0.2668, + "step": 12476 + }, + { + "epoch": 0.78, + "grad_norm": 3.423839090169735, + "learning_rate": 1.1675964538646212e-06, + "loss": 0.2612, + "step": 12477 + }, + { + "epoch": 0.78, + "grad_norm": 2.703081067910517, + "learning_rate": 1.1669423991853934e-06, + "loss": 0.2656, + "step": 12478 + }, + { + "epoch": 0.78, + "grad_norm": 2.809074591000523, + "learning_rate": 1.1662885035465326e-06, + "loss": 0.2561, + "step": 12479 + }, + { + "epoch": 0.78, + "grad_norm": 6.289841843814653, + "learning_rate": 1.1656347669751721e-06, + "loss": 0.2365, + "step": 12480 + }, + { + "epoch": 0.78, + "grad_norm": 1.7508959446791594, + "learning_rate": 1.1649811894984365e-06, + "loss": 0.2742, + "step": 12481 + }, + { + "epoch": 0.78, + "grad_norm": 2.768419559671135, + "learning_rate": 1.1643277711434426e-06, + "loss": 0.2486, + "step": 12482 + }, + { + "epoch": 0.79, + "grad_norm": 3.1725298500932095, + "learning_rate": 1.1636745119373006e-06, + "loss": 0.2634, + "step": 12483 + }, + { + "epoch": 0.79, + "grad_norm": 2.0440031235292655, + "learning_rate": 1.1630214119071176e-06, + "loss": 0.2552, + "step": 12484 + }, + { + "epoch": 0.79, + "grad_norm": 2.165666677740684, + "learning_rate": 1.1623684710799931e-06, + "loss": 0.2787, + "step": 12485 + }, + { + "epoch": 0.79, + "grad_norm": 2.634598689959782, + "learning_rate": 1.1617156894830168e-06, + "loss": 0.2526, + "step": 12486 + }, + { + "epoch": 0.79, + "grad_norm": 2.0799976191475413, + "learning_rate": 1.1610630671432726e-06, + "loss": 0.2522, + "step": 12487 + }, + { + "epoch": 0.79, + "grad_norm": 6.526831303344769, + "learning_rate": 1.160410604087842e-06, + "loss": 0.275, + "step": 12488 + }, + { + "epoch": 0.79, + "grad_norm": 1.7336969226906678, + "learning_rate": 1.1597583003437957e-06, + "loss": 0.2441, + "step": 12489 + }, + { + "epoch": 0.79, + "grad_norm": 1.3347651756990528, + "learning_rate": 1.1591061559381983e-06, + "loss": 0.2344, + "step": 12490 + }, + { + "epoch": 0.79, + "grad_norm": 1.5842669488335628, + "learning_rate": 1.1584541708981056e-06, + "loss": 0.2454, + "step": 12491 + }, + { + "epoch": 0.79, + "grad_norm": 2.0914782853210023, + "learning_rate": 1.157802345250576e-06, + "loss": 0.241, + "step": 12492 + }, + { + "epoch": 0.79, + "grad_norm": 0.6067020423351305, + "learning_rate": 1.1571506790226512e-06, + "loss": 0.5115, + "step": 12493 + }, + { + "epoch": 0.79, + "grad_norm": 4.577052080039558, + "learning_rate": 1.1564991722413703e-06, + "loss": 0.2581, + "step": 12494 + }, + { + "epoch": 0.79, + "grad_norm": 2.141169498888766, + "learning_rate": 1.1558478249337645e-06, + "loss": 0.2558, + "step": 12495 + }, + { + "epoch": 0.79, + "grad_norm": 1.8072534450279496, + "learning_rate": 1.155196637126862e-06, + "loss": 0.2558, + "step": 12496 + }, + { + "epoch": 0.79, + "grad_norm": 2.4883261753788135, + "learning_rate": 1.1545456088476798e-06, + "loss": 0.2548, + "step": 12497 + }, + { + "epoch": 0.79, + "grad_norm": 2.1176064660172833, + "learning_rate": 1.1538947401232292e-06, + "loss": 0.2452, + "step": 12498 + }, + { + "epoch": 0.79, + "grad_norm": 2.44912751435392, + "learning_rate": 1.1532440309805172e-06, + "loss": 0.2758, + "step": 12499 + }, + { + "epoch": 0.79, + "grad_norm": 2.273582491757086, + "learning_rate": 1.1525934814465445e-06, + "loss": 0.2393, + "step": 12500 + }, + { + "epoch": 0.79, + "grad_norm": 1.9041350921919935, + "learning_rate": 1.151943091548302e-06, + "loss": 0.2604, + "step": 12501 + }, + { + "epoch": 0.79, + "grad_norm": 2.287163244391985, + "learning_rate": 1.1512928613127755e-06, + "loss": 0.2495, + "step": 12502 + }, + { + "epoch": 0.79, + "grad_norm": 3.4595723358799626, + "learning_rate": 1.1506427907669433e-06, + "loss": 0.256, + "step": 12503 + }, + { + "epoch": 0.79, + "grad_norm": 4.49651984953011, + "learning_rate": 1.1499928799377797e-06, + "loss": 0.2595, + "step": 12504 + }, + { + "epoch": 0.79, + "grad_norm": 2.125624539039244, + "learning_rate": 1.1493431288522506e-06, + "loss": 0.2559, + "step": 12505 + }, + { + "epoch": 0.79, + "grad_norm": 1.8762300636033735, + "learning_rate": 1.1486935375373127e-06, + "loss": 0.2578, + "step": 12506 + }, + { + "epoch": 0.79, + "grad_norm": 1.5016800503813323, + "learning_rate": 1.1480441060199205e-06, + "loss": 0.2544, + "step": 12507 + }, + { + "epoch": 0.79, + "grad_norm": 6.838339903371633, + "learning_rate": 1.147394834327022e-06, + "loss": 0.2664, + "step": 12508 + }, + { + "epoch": 0.79, + "grad_norm": 3.881716108332491, + "learning_rate": 1.1467457224855545e-06, + "loss": 0.2537, + "step": 12509 + }, + { + "epoch": 0.79, + "grad_norm": 1.5807947566489926, + "learning_rate": 1.1460967705224513e-06, + "loss": 0.2605, + "step": 12510 + }, + { + "epoch": 0.79, + "grad_norm": 4.26284734727908, + "learning_rate": 1.1454479784646366e-06, + "loss": 0.2579, + "step": 12511 + }, + { + "epoch": 0.79, + "grad_norm": 5.8058990245876085, + "learning_rate": 1.1447993463390338e-06, + "loss": 0.2557, + "step": 12512 + }, + { + "epoch": 0.79, + "grad_norm": 2.6603955876008762, + "learning_rate": 1.1441508741725532e-06, + "loss": 0.2607, + "step": 12513 + }, + { + "epoch": 0.79, + "grad_norm": 1.8662390847806956, + "learning_rate": 1.1435025619921003e-06, + "loss": 0.27, + "step": 12514 + }, + { + "epoch": 0.79, + "grad_norm": 1.8993136860139344, + "learning_rate": 1.1428544098245764e-06, + "loss": 0.2605, + "step": 12515 + }, + { + "epoch": 0.79, + "grad_norm": 1.6618781794788233, + "learning_rate": 1.1422064176968751e-06, + "loss": 0.2393, + "step": 12516 + }, + { + "epoch": 0.79, + "grad_norm": 1.6535574441274803, + "learning_rate": 1.1415585856358818e-06, + "loss": 0.2459, + "step": 12517 + }, + { + "epoch": 0.79, + "grad_norm": 2.1107706388054064, + "learning_rate": 1.140910913668476e-06, + "loss": 0.254, + "step": 12518 + }, + { + "epoch": 0.79, + "grad_norm": 2.0057155683324996, + "learning_rate": 1.1402634018215297e-06, + "loss": 0.2567, + "step": 12519 + }, + { + "epoch": 0.79, + "grad_norm": 1.8805554651096124, + "learning_rate": 1.1396160501219122e-06, + "loss": 0.256, + "step": 12520 + }, + { + "epoch": 0.79, + "grad_norm": 1.5254401353977964, + "learning_rate": 1.1389688585964797e-06, + "loss": 0.2589, + "step": 12521 + }, + { + "epoch": 0.79, + "grad_norm": 2.7727760794380036, + "learning_rate": 1.1383218272720886e-06, + "loss": 0.2486, + "step": 12522 + }, + { + "epoch": 0.79, + "grad_norm": 4.226380985944135, + "learning_rate": 1.1376749561755829e-06, + "loss": 0.251, + "step": 12523 + }, + { + "epoch": 0.79, + "grad_norm": 1.9691395237304765, + "learning_rate": 1.137028245333805e-06, + "loss": 0.2513, + "step": 12524 + }, + { + "epoch": 0.79, + "grad_norm": 3.7968276456284897, + "learning_rate": 1.1363816947735866e-06, + "loss": 0.2797, + "step": 12525 + }, + { + "epoch": 0.79, + "grad_norm": 1.7319762883552918, + "learning_rate": 1.135735304521754e-06, + "loss": 0.2488, + "step": 12526 + }, + { + "epoch": 0.79, + "grad_norm": 2.3259678274218527, + "learning_rate": 1.1350890746051257e-06, + "loss": 0.2511, + "step": 12527 + }, + { + "epoch": 0.79, + "grad_norm": 1.774371210727824, + "learning_rate": 1.1344430050505174e-06, + "loss": 0.2683, + "step": 12528 + }, + { + "epoch": 0.79, + "grad_norm": 3.4778816333882983, + "learning_rate": 1.1337970958847354e-06, + "loss": 0.2398, + "step": 12529 + }, + { + "epoch": 0.79, + "grad_norm": 2.35855918601479, + "learning_rate": 1.1331513471345796e-06, + "loss": 0.2563, + "step": 12530 + }, + { + "epoch": 0.79, + "grad_norm": 2.002575286983129, + "learning_rate": 1.1325057588268406e-06, + "loss": 0.2638, + "step": 12531 + }, + { + "epoch": 0.79, + "grad_norm": 1.794344514719857, + "learning_rate": 1.1318603309883092e-06, + "loss": 0.2573, + "step": 12532 + }, + { + "epoch": 0.79, + "grad_norm": 3.0473068150855536, + "learning_rate": 1.131215063645763e-06, + "loss": 0.2479, + "step": 12533 + }, + { + "epoch": 0.79, + "grad_norm": 2.1101361089676454, + "learning_rate": 1.1305699568259754e-06, + "loss": 0.2535, + "step": 12534 + }, + { + "epoch": 0.79, + "grad_norm": 2.6986855224429824, + "learning_rate": 1.129925010555712e-06, + "loss": 0.2493, + "step": 12535 + }, + { + "epoch": 0.79, + "grad_norm": 2.41802514819793, + "learning_rate": 1.129280224861734e-06, + "loss": 0.2528, + "step": 12536 + }, + { + "epoch": 0.79, + "grad_norm": 2.1709270847637714, + "learning_rate": 1.128635599770796e-06, + "loss": 0.2532, + "step": 12537 + }, + { + "epoch": 0.79, + "grad_norm": 1.6679563241448176, + "learning_rate": 1.1279911353096428e-06, + "loss": 0.2451, + "step": 12538 + }, + { + "epoch": 0.79, + "grad_norm": 1.737360594545524, + "learning_rate": 1.127346831505014e-06, + "loss": 0.2513, + "step": 12539 + }, + { + "epoch": 0.79, + "grad_norm": 2.782237902939371, + "learning_rate": 1.126702688383645e-06, + "loss": 0.2483, + "step": 12540 + }, + { + "epoch": 0.79, + "grad_norm": 2.1581764216894004, + "learning_rate": 1.1260587059722612e-06, + "loss": 0.2604, + "step": 12541 + }, + { + "epoch": 0.79, + "grad_norm": 1.9506341887736045, + "learning_rate": 1.1254148842975826e-06, + "loss": 0.2657, + "step": 12542 + }, + { + "epoch": 0.79, + "grad_norm": 2.031156869127606, + "learning_rate": 1.1247712233863196e-06, + "loss": 0.243, + "step": 12543 + }, + { + "epoch": 0.79, + "grad_norm": 1.8372615209296899, + "learning_rate": 1.1241277232651849e-06, + "loss": 0.2634, + "step": 12544 + }, + { + "epoch": 0.79, + "grad_norm": 4.997029139268055, + "learning_rate": 1.123484383960875e-06, + "loss": 0.2414, + "step": 12545 + }, + { + "epoch": 0.79, + "grad_norm": 3.031356522291658, + "learning_rate": 1.1228412055000831e-06, + "loss": 0.2791, + "step": 12546 + }, + { + "epoch": 0.79, + "grad_norm": 2.188305805998532, + "learning_rate": 1.1221981879094952e-06, + "loss": 0.2515, + "step": 12547 + }, + { + "epoch": 0.79, + "grad_norm": 5.713710686893574, + "learning_rate": 1.121555331215794e-06, + "loss": 0.2685, + "step": 12548 + }, + { + "epoch": 0.79, + "grad_norm": 0.6204778214657698, + "learning_rate": 1.12091263544565e-06, + "loss": 0.4421, + "step": 12549 + }, + { + "epoch": 0.79, + "grad_norm": 1.6060995304773358, + "learning_rate": 1.1202701006257317e-06, + "loss": 0.248, + "step": 12550 + }, + { + "epoch": 0.79, + "grad_norm": 4.884199085345835, + "learning_rate": 1.119627726782695e-06, + "loss": 0.2599, + "step": 12551 + }, + { + "epoch": 0.79, + "grad_norm": 2.031984087381864, + "learning_rate": 1.1189855139431988e-06, + "loss": 0.2416, + "step": 12552 + }, + { + "epoch": 0.79, + "grad_norm": 3.219641225185397, + "learning_rate": 1.1183434621338874e-06, + "loss": 0.267, + "step": 12553 + }, + { + "epoch": 0.79, + "grad_norm": 2.19017248474657, + "learning_rate": 1.1177015713814005e-06, + "loss": 0.2626, + "step": 12554 + }, + { + "epoch": 0.79, + "grad_norm": 2.9210577488538396, + "learning_rate": 1.1170598417123695e-06, + "loss": 0.2715, + "step": 12555 + }, + { + "epoch": 0.79, + "grad_norm": 3.2658083660306367, + "learning_rate": 1.116418273153424e-06, + "loss": 0.239, + "step": 12556 + }, + { + "epoch": 0.79, + "grad_norm": 1.335857299370904, + "learning_rate": 1.1157768657311824e-06, + "loss": 0.2561, + "step": 12557 + }, + { + "epoch": 0.79, + "grad_norm": 1.8964327199140771, + "learning_rate": 1.1151356194722563e-06, + "loss": 0.2807, + "step": 12558 + }, + { + "epoch": 0.79, + "grad_norm": 2.270531759566121, + "learning_rate": 1.114494534403253e-06, + "loss": 0.2648, + "step": 12559 + }, + { + "epoch": 0.79, + "grad_norm": 1.6292387873161762, + "learning_rate": 1.1138536105507752e-06, + "loss": 0.2628, + "step": 12560 + }, + { + "epoch": 0.79, + "grad_norm": 2.33056391057903, + "learning_rate": 1.1132128479414133e-06, + "loss": 0.2614, + "step": 12561 + }, + { + "epoch": 0.79, + "grad_norm": 4.3726849490375415, + "learning_rate": 1.1125722466017547e-06, + "loss": 0.2387, + "step": 12562 + }, + { + "epoch": 0.79, + "grad_norm": 5.069500013054947, + "learning_rate": 1.1119318065583763e-06, + "loss": 0.256, + "step": 12563 + }, + { + "epoch": 0.79, + "grad_norm": 2.369525702333648, + "learning_rate": 1.111291527837855e-06, + "loss": 0.2503, + "step": 12564 + }, + { + "epoch": 0.79, + "grad_norm": 1.6172731811989232, + "learning_rate": 1.110651410466755e-06, + "loss": 0.2294, + "step": 12565 + }, + { + "epoch": 0.79, + "grad_norm": 1.9636983040958742, + "learning_rate": 1.1100114544716351e-06, + "loss": 0.2402, + "step": 12566 + }, + { + "epoch": 0.79, + "grad_norm": 2.0941481280855103, + "learning_rate": 1.1093716598790494e-06, + "loss": 0.2619, + "step": 12567 + }, + { + "epoch": 0.79, + "grad_norm": 2.8640338042927724, + "learning_rate": 1.1087320267155448e-06, + "loss": 0.2475, + "step": 12568 + }, + { + "epoch": 0.79, + "grad_norm": 1.731232259530004, + "learning_rate": 1.108092555007661e-06, + "loss": 0.2579, + "step": 12569 + }, + { + "epoch": 0.79, + "grad_norm": 2.035099166033582, + "learning_rate": 1.1074532447819291e-06, + "loss": 0.2555, + "step": 12570 + }, + { + "epoch": 0.79, + "grad_norm": 2.88827441752297, + "learning_rate": 1.1068140960648753e-06, + "loss": 0.2681, + "step": 12571 + }, + { + "epoch": 0.79, + "grad_norm": 2.257072743708595, + "learning_rate": 1.1061751088830208e-06, + "loss": 0.2593, + "step": 12572 + }, + { + "epoch": 0.79, + "grad_norm": 1.754039672030549, + "learning_rate": 1.1055362832628757e-06, + "loss": 0.2535, + "step": 12573 + }, + { + "epoch": 0.79, + "grad_norm": 1.4491884704594167, + "learning_rate": 1.1048976192309496e-06, + "loss": 0.2379, + "step": 12574 + }, + { + "epoch": 0.79, + "grad_norm": 2.9714295966204913, + "learning_rate": 1.1042591168137379e-06, + "loss": 0.2552, + "step": 12575 + }, + { + "epoch": 0.79, + "grad_norm": 4.122563903119853, + "learning_rate": 1.103620776037736e-06, + "loss": 0.2591, + "step": 12576 + }, + { + "epoch": 0.79, + "grad_norm": 1.7153754520571782, + "learning_rate": 1.1029825969294294e-06, + "loss": 0.2519, + "step": 12577 + }, + { + "epoch": 0.79, + "grad_norm": 2.915400174354112, + "learning_rate": 1.102344579515297e-06, + "loss": 0.2725, + "step": 12578 + }, + { + "epoch": 0.79, + "grad_norm": 1.9739363400590335, + "learning_rate": 1.1017067238218093e-06, + "loss": 0.2466, + "step": 12579 + }, + { + "epoch": 0.79, + "grad_norm": 2.3522173238319866, + "learning_rate": 1.1010690298754352e-06, + "loss": 0.2969, + "step": 12580 + }, + { + "epoch": 0.79, + "grad_norm": 1.8681438226119749, + "learning_rate": 1.1004314977026304e-06, + "loss": 0.256, + "step": 12581 + }, + { + "epoch": 0.79, + "grad_norm": 2.1370796301878596, + "learning_rate": 1.0997941273298512e-06, + "loss": 0.2456, + "step": 12582 + }, + { + "epoch": 0.79, + "grad_norm": 1.5404171831350435, + "learning_rate": 1.09915691878354e-06, + "loss": 0.2436, + "step": 12583 + }, + { + "epoch": 0.79, + "grad_norm": 2.030863537508889, + "learning_rate": 1.0985198720901375e-06, + "loss": 0.2489, + "step": 12584 + }, + { + "epoch": 0.79, + "grad_norm": 2.387414855463944, + "learning_rate": 1.097882987276076e-06, + "loss": 0.2555, + "step": 12585 + }, + { + "epoch": 0.79, + "grad_norm": 1.8833909598958247, + "learning_rate": 1.09724626436778e-06, + "loss": 0.2542, + "step": 12586 + }, + { + "epoch": 0.79, + "grad_norm": 4.31885258932891, + "learning_rate": 1.0966097033916674e-06, + "loss": 0.2561, + "step": 12587 + }, + { + "epoch": 0.79, + "grad_norm": 0.6444723961562331, + "learning_rate": 1.095973304374151e-06, + "loss": 0.456, + "step": 12588 + }, + { + "epoch": 0.79, + "grad_norm": 2.0304281248864497, + "learning_rate": 1.095337067341638e-06, + "loss": 0.2607, + "step": 12589 + }, + { + "epoch": 0.79, + "grad_norm": 2.4028968508352064, + "learning_rate": 1.094700992320526e-06, + "loss": 0.2495, + "step": 12590 + }, + { + "epoch": 0.79, + "grad_norm": 0.6004150796113565, + "learning_rate": 1.0940650793372048e-06, + "loss": 0.4702, + "step": 12591 + }, + { + "epoch": 0.79, + "grad_norm": 1.704736975204606, + "learning_rate": 1.0934293284180625e-06, + "loss": 0.2513, + "step": 12592 + }, + { + "epoch": 0.79, + "grad_norm": 2.7708386570462173, + "learning_rate": 1.092793739589476e-06, + "loss": 0.2751, + "step": 12593 + }, + { + "epoch": 0.79, + "grad_norm": 1.935491697614958, + "learning_rate": 1.0921583128778174e-06, + "loss": 0.256, + "step": 12594 + }, + { + "epoch": 0.79, + "grad_norm": 2.163138681186147, + "learning_rate": 1.0915230483094502e-06, + "loss": 0.245, + "step": 12595 + }, + { + "epoch": 0.79, + "grad_norm": 2.0448580586558363, + "learning_rate": 1.090887945910734e-06, + "loss": 0.2569, + "step": 12596 + }, + { + "epoch": 0.79, + "grad_norm": 1.8075072237919652, + "learning_rate": 1.0902530057080218e-06, + "loss": 0.2554, + "step": 12597 + }, + { + "epoch": 0.79, + "grad_norm": 1.3769456760994443, + "learning_rate": 1.0896182277276568e-06, + "loss": 0.2421, + "step": 12598 + }, + { + "epoch": 0.79, + "grad_norm": 8.635367036395204, + "learning_rate": 1.0889836119959757e-06, + "loss": 0.2763, + "step": 12599 + }, + { + "epoch": 0.79, + "grad_norm": 2.465053926785206, + "learning_rate": 1.0883491585393125e-06, + "loss": 0.2391, + "step": 12600 + }, + { + "epoch": 0.79, + "grad_norm": 1.4718448703491538, + "learning_rate": 1.0877148673839905e-06, + "loss": 0.2541, + "step": 12601 + }, + { + "epoch": 0.79, + "grad_norm": 2.782588444553286, + "learning_rate": 1.0870807385563282e-06, + "loss": 0.2515, + "step": 12602 + }, + { + "epoch": 0.79, + "grad_norm": 2.755272247053251, + "learning_rate": 1.0864467720826343e-06, + "loss": 0.2696, + "step": 12603 + }, + { + "epoch": 0.79, + "grad_norm": 0.60891908528279, + "learning_rate": 1.0858129679892148e-06, + "loss": 0.4695, + "step": 12604 + }, + { + "epoch": 0.79, + "grad_norm": 5.766288661741712, + "learning_rate": 1.0851793263023696e-06, + "loss": 0.2431, + "step": 12605 + }, + { + "epoch": 0.79, + "grad_norm": 8.057605425557707, + "learning_rate": 1.0845458470483877e-06, + "loss": 0.268, + "step": 12606 + }, + { + "epoch": 0.79, + "grad_norm": 1.766783735190372, + "learning_rate": 1.083912530253552e-06, + "loss": 0.2559, + "step": 12607 + }, + { + "epoch": 0.79, + "grad_norm": 1.691451251144815, + "learning_rate": 1.0832793759441418e-06, + "loss": 0.2425, + "step": 12608 + }, + { + "epoch": 0.79, + "grad_norm": 5.444297540711089, + "learning_rate": 1.082646384146428e-06, + "loss": 0.2727, + "step": 12609 + }, + { + "epoch": 0.79, + "grad_norm": 1.9744189851229572, + "learning_rate": 1.0820135548866718e-06, + "loss": 0.2696, + "step": 12610 + }, + { + "epoch": 0.79, + "grad_norm": 1.9574124572550855, + "learning_rate": 1.0813808881911326e-06, + "loss": 0.2555, + "step": 12611 + }, + { + "epoch": 0.79, + "grad_norm": 1.865698936407434, + "learning_rate": 1.0807483840860616e-06, + "loss": 0.2522, + "step": 12612 + }, + { + "epoch": 0.79, + "grad_norm": 5.47363020299997, + "learning_rate": 1.080116042597702e-06, + "loss": 0.2427, + "step": 12613 + }, + { + "epoch": 0.79, + "grad_norm": 1.5894927783966755, + "learning_rate": 1.0794838637522898e-06, + "loss": 0.2488, + "step": 12614 + }, + { + "epoch": 0.79, + "grad_norm": 2.6714157842975013, + "learning_rate": 1.0788518475760545e-06, + "loss": 0.2562, + "step": 12615 + }, + { + "epoch": 0.79, + "grad_norm": 2.059916192120151, + "learning_rate": 1.0782199940952226e-06, + "loss": 0.2521, + "step": 12616 + }, + { + "epoch": 0.79, + "grad_norm": 2.582203056058463, + "learning_rate": 1.0775883033360085e-06, + "loss": 0.2539, + "step": 12617 + }, + { + "epoch": 0.79, + "grad_norm": 2.281674729655202, + "learning_rate": 1.0769567753246214e-06, + "loss": 0.2754, + "step": 12618 + }, + { + "epoch": 0.79, + "grad_norm": 5.247061740126943, + "learning_rate": 1.076325410087266e-06, + "loss": 0.245, + "step": 12619 + }, + { + "epoch": 0.79, + "grad_norm": 2.107378748262202, + "learning_rate": 1.0756942076501397e-06, + "loss": 0.2466, + "step": 12620 + }, + { + "epoch": 0.79, + "grad_norm": 2.070681722068169, + "learning_rate": 1.0750631680394314e-06, + "loss": 0.2543, + "step": 12621 + }, + { + "epoch": 0.79, + "grad_norm": 2.894514451643714, + "learning_rate": 1.0744322912813231e-06, + "loss": 0.2681, + "step": 12622 + }, + { + "epoch": 0.79, + "grad_norm": 7.6935646765427474, + "learning_rate": 1.0738015774019911e-06, + "loss": 0.2756, + "step": 12623 + }, + { + "epoch": 0.79, + "grad_norm": 1.4813043818811578, + "learning_rate": 1.0731710264276062e-06, + "loss": 0.2463, + "step": 12624 + }, + { + "epoch": 0.79, + "grad_norm": 2.0690950187614274, + "learning_rate": 1.072540638384329e-06, + "loss": 0.2559, + "step": 12625 + }, + { + "epoch": 0.79, + "grad_norm": 2.1003674811340676, + "learning_rate": 1.0719104132983176e-06, + "loss": 0.273, + "step": 12626 + }, + { + "epoch": 0.79, + "grad_norm": 1.999234616830355, + "learning_rate": 1.0712803511957199e-06, + "loss": 0.2639, + "step": 12627 + }, + { + "epoch": 0.79, + "grad_norm": 2.0810651801883537, + "learning_rate": 1.0706504521026788e-06, + "loss": 0.2828, + "step": 12628 + }, + { + "epoch": 0.79, + "grad_norm": 2.252220017339744, + "learning_rate": 1.0700207160453308e-06, + "loss": 0.2584, + "step": 12629 + }, + { + "epoch": 0.79, + "grad_norm": 2.5991030471073953, + "learning_rate": 1.0693911430498032e-06, + "loss": 0.243, + "step": 12630 + }, + { + "epoch": 0.79, + "grad_norm": 5.270072849526293, + "learning_rate": 1.0687617331422173e-06, + "loss": 0.2957, + "step": 12631 + }, + { + "epoch": 0.79, + "grad_norm": 2.0615147930890165, + "learning_rate": 1.0681324863486907e-06, + "loss": 0.2353, + "step": 12632 + }, + { + "epoch": 0.79, + "grad_norm": 1.942022182797392, + "learning_rate": 1.0675034026953302e-06, + "loss": 0.2611, + "step": 12633 + }, + { + "epoch": 0.79, + "grad_norm": 1.7247293771471623, + "learning_rate": 1.06687448220824e-06, + "loss": 0.2438, + "step": 12634 + }, + { + "epoch": 0.79, + "grad_norm": 27.532396668252304, + "learning_rate": 1.0662457249135116e-06, + "loss": 0.2613, + "step": 12635 + }, + { + "epoch": 0.79, + "grad_norm": 1.9627645819310615, + "learning_rate": 1.0656171308372371e-06, + "loss": 0.2532, + "step": 12636 + }, + { + "epoch": 0.79, + "grad_norm": 2.860449922467492, + "learning_rate": 1.0649887000054954e-06, + "loss": 0.2438, + "step": 12637 + }, + { + "epoch": 0.79, + "grad_norm": 4.428910228790399, + "learning_rate": 1.0643604324443623e-06, + "loss": 0.2636, + "step": 12638 + }, + { + "epoch": 0.79, + "grad_norm": 1.5791223517173654, + "learning_rate": 1.0637323281799045e-06, + "loss": 0.2629, + "step": 12639 + }, + { + "epoch": 0.79, + "grad_norm": 2.5656433746105467, + "learning_rate": 1.063104387238183e-06, + "loss": 0.2398, + "step": 12640 + }, + { + "epoch": 0.79, + "grad_norm": 4.084940454890411, + "learning_rate": 1.0624766096452555e-06, + "loss": 0.243, + "step": 12641 + }, + { + "epoch": 0.8, + "grad_norm": 1.509478982840512, + "learning_rate": 1.0618489954271667e-06, + "loss": 0.244, + "step": 12642 + }, + { + "epoch": 0.8, + "grad_norm": 2.604940557742458, + "learning_rate": 1.0612215446099566e-06, + "loss": 0.2649, + "step": 12643 + }, + { + "epoch": 0.8, + "grad_norm": 3.152044277074773, + "learning_rate": 1.0605942572196626e-06, + "loss": 0.2692, + "step": 12644 + }, + { + "epoch": 0.8, + "grad_norm": 3.2724483867519862, + "learning_rate": 1.0599671332823096e-06, + "loss": 0.2498, + "step": 12645 + }, + { + "epoch": 0.8, + "grad_norm": 1.3261794034612644, + "learning_rate": 1.0593401728239183e-06, + "loss": 0.2654, + "step": 12646 + }, + { + "epoch": 0.8, + "grad_norm": 1.697217798764649, + "learning_rate": 1.0587133758705015e-06, + "loss": 0.257, + "step": 12647 + }, + { + "epoch": 0.8, + "grad_norm": 0.556527071138105, + "learning_rate": 1.0580867424480674e-06, + "loss": 0.4191, + "step": 12648 + }, + { + "epoch": 0.8, + "grad_norm": 1.7960712602951898, + "learning_rate": 1.0574602725826171e-06, + "loss": 0.2576, + "step": 12649 + }, + { + "epoch": 0.8, + "grad_norm": 1.3832907326687363, + "learning_rate": 1.0568339663001431e-06, + "loss": 0.2487, + "step": 12650 + }, + { + "epoch": 0.8, + "grad_norm": 1.4440824776907264, + "learning_rate": 1.0562078236266304e-06, + "loss": 0.2546, + "step": 12651 + }, + { + "epoch": 0.8, + "grad_norm": 2.7054232843073094, + "learning_rate": 1.0555818445880612e-06, + "loss": 0.2566, + "step": 12652 + }, + { + "epoch": 0.8, + "grad_norm": 2.1606381253869293, + "learning_rate": 1.0549560292104071e-06, + "loss": 0.2563, + "step": 12653 + }, + { + "epoch": 0.8, + "grad_norm": 1.4857746976529542, + "learning_rate": 1.054330377519635e-06, + "loss": 0.2505, + "step": 12654 + }, + { + "epoch": 0.8, + "grad_norm": 1.5759929981336405, + "learning_rate": 1.0537048895417024e-06, + "loss": 0.2412, + "step": 12655 + }, + { + "epoch": 0.8, + "grad_norm": 1.4120537014752426, + "learning_rate": 1.0530795653025634e-06, + "loss": 0.2384, + "step": 12656 + }, + { + "epoch": 0.8, + "grad_norm": 2.0359148928023334, + "learning_rate": 1.052454404828165e-06, + "loss": 0.267, + "step": 12657 + }, + { + "epoch": 0.8, + "grad_norm": 1.715593916937375, + "learning_rate": 1.0518294081444447e-06, + "loss": 0.2517, + "step": 12658 + }, + { + "epoch": 0.8, + "grad_norm": 2.7637411676576837, + "learning_rate": 1.0512045752773336e-06, + "loss": 0.2485, + "step": 12659 + }, + { + "epoch": 0.8, + "grad_norm": 1.9585179326587279, + "learning_rate": 1.0505799062527605e-06, + "loss": 0.258, + "step": 12660 + }, + { + "epoch": 0.8, + "grad_norm": 1.7517473444612268, + "learning_rate": 1.0499554010966418e-06, + "loss": 0.2686, + "step": 12661 + }, + { + "epoch": 0.8, + "grad_norm": 2.1278934042656314, + "learning_rate": 1.0493310598348894e-06, + "loss": 0.2586, + "step": 12662 + }, + { + "epoch": 0.8, + "grad_norm": 2.4294070183837997, + "learning_rate": 1.048706882493407e-06, + "loss": 0.2745, + "step": 12663 + }, + { + "epoch": 0.8, + "grad_norm": 1.563678841463182, + "learning_rate": 1.0480828690980949e-06, + "loss": 0.2446, + "step": 12664 + }, + { + "epoch": 0.8, + "grad_norm": 5.356532435630444, + "learning_rate": 1.0474590196748447e-06, + "loss": 0.2536, + "step": 12665 + }, + { + "epoch": 0.8, + "grad_norm": 2.3055337489752814, + "learning_rate": 1.0468353342495407e-06, + "loss": 0.2638, + "step": 12666 + }, + { + "epoch": 0.8, + "grad_norm": 2.271229517955468, + "learning_rate": 1.0462118128480586e-06, + "loss": 0.2704, + "step": 12667 + }, + { + "epoch": 0.8, + "grad_norm": 4.103510597477528, + "learning_rate": 1.0455884554962725e-06, + "loss": 0.2555, + "step": 12668 + }, + { + "epoch": 0.8, + "grad_norm": 1.7146463056873202, + "learning_rate": 1.0449652622200458e-06, + "loss": 0.2393, + "step": 12669 + }, + { + "epoch": 0.8, + "grad_norm": 2.4774842234541814, + "learning_rate": 1.044342233045233e-06, + "loss": 0.2438, + "step": 12670 + }, + { + "epoch": 0.8, + "grad_norm": 3.0275772087169353, + "learning_rate": 1.0437193679976887e-06, + "loss": 0.2474, + "step": 12671 + }, + { + "epoch": 0.8, + "grad_norm": 2.1172927523323017, + "learning_rate": 1.0430966671032538e-06, + "loss": 0.2519, + "step": 12672 + }, + { + "epoch": 0.8, + "grad_norm": 2.3934315297416164, + "learning_rate": 1.0424741303877678e-06, + "loss": 0.2507, + "step": 12673 + }, + { + "epoch": 0.8, + "grad_norm": 1.9443693351825124, + "learning_rate": 1.0418517578770594e-06, + "loss": 0.2457, + "step": 12674 + }, + { + "epoch": 0.8, + "grad_norm": 1.69091874738381, + "learning_rate": 1.041229549596951e-06, + "loss": 0.2632, + "step": 12675 + }, + { + "epoch": 0.8, + "grad_norm": 2.526762667542659, + "learning_rate": 1.0406075055732612e-06, + "loss": 0.239, + "step": 12676 + }, + { + "epoch": 0.8, + "grad_norm": 1.4981714523450758, + "learning_rate": 1.0399856258317987e-06, + "loss": 0.2469, + "step": 12677 + }, + { + "epoch": 0.8, + "grad_norm": 1.5519275324277573, + "learning_rate": 1.039363910398365e-06, + "loss": 0.2467, + "step": 12678 + }, + { + "epoch": 0.8, + "grad_norm": 1.6470867041201902, + "learning_rate": 1.0387423592987584e-06, + "loss": 0.2581, + "step": 12679 + }, + { + "epoch": 0.8, + "grad_norm": 1.9412357340760982, + "learning_rate": 1.0381209725587666e-06, + "loss": 0.2582, + "step": 12680 + }, + { + "epoch": 0.8, + "grad_norm": 1.693824949661888, + "learning_rate": 1.0374997502041739e-06, + "loss": 0.2697, + "step": 12681 + }, + { + "epoch": 0.8, + "grad_norm": 3.7987068313073133, + "learning_rate": 1.0368786922607548e-06, + "loss": 0.2595, + "step": 12682 + }, + { + "epoch": 0.8, + "grad_norm": 2.366354239649986, + "learning_rate": 1.0362577987542766e-06, + "loss": 0.2667, + "step": 12683 + }, + { + "epoch": 0.8, + "grad_norm": 2.726829433353076, + "learning_rate": 1.035637069710504e-06, + "loss": 0.2383, + "step": 12684 + }, + { + "epoch": 0.8, + "grad_norm": 1.799693095943145, + "learning_rate": 1.0350165051551897e-06, + "loss": 0.2378, + "step": 12685 + }, + { + "epoch": 0.8, + "grad_norm": 2.207478286308404, + "learning_rate": 1.0343961051140845e-06, + "loss": 0.2402, + "step": 12686 + }, + { + "epoch": 0.8, + "grad_norm": 2.626782023532939, + "learning_rate": 1.0337758696129285e-06, + "loss": 0.2538, + "step": 12687 + }, + { + "epoch": 0.8, + "grad_norm": 2.924991570707471, + "learning_rate": 1.0331557986774553e-06, + "loss": 0.2448, + "step": 12688 + }, + { + "epoch": 0.8, + "grad_norm": 2.2163584607766262, + "learning_rate": 1.0325358923333956e-06, + "loss": 0.2855, + "step": 12689 + }, + { + "epoch": 0.8, + "grad_norm": 2.505883828485406, + "learning_rate": 1.031916150606468e-06, + "loss": 0.2641, + "step": 12690 + }, + { + "epoch": 0.8, + "grad_norm": 1.9471321499543168, + "learning_rate": 1.0312965735223868e-06, + "loss": 0.2357, + "step": 12691 + }, + { + "epoch": 0.8, + "grad_norm": 2.48507471862247, + "learning_rate": 1.0306771611068595e-06, + "loss": 0.2602, + "step": 12692 + }, + { + "epoch": 0.8, + "grad_norm": 0.6326021287696546, + "learning_rate": 1.030057913385588e-06, + "loss": 0.4777, + "step": 12693 + }, + { + "epoch": 0.8, + "grad_norm": 4.45425023918084, + "learning_rate": 1.0294388303842661e-06, + "loss": 0.2637, + "step": 12694 + }, + { + "epoch": 0.8, + "grad_norm": 2.2861385267994963, + "learning_rate": 1.0288199121285775e-06, + "loss": 0.2343, + "step": 12695 + }, + { + "epoch": 0.8, + "grad_norm": 1.7641721723017243, + "learning_rate": 1.0282011586442064e-06, + "loss": 0.2537, + "step": 12696 + }, + { + "epoch": 0.8, + "grad_norm": 3.974476382647005, + "learning_rate": 1.0275825699568231e-06, + "loss": 0.2493, + "step": 12697 + }, + { + "epoch": 0.8, + "grad_norm": 2.123960814989126, + "learning_rate": 1.0269641460920954e-06, + "loss": 0.2348, + "step": 12698 + }, + { + "epoch": 0.8, + "grad_norm": 2.3583238287554678, + "learning_rate": 1.0263458870756808e-06, + "loss": 0.2453, + "step": 12699 + }, + { + "epoch": 0.8, + "grad_norm": 2.018289054983028, + "learning_rate": 1.0257277929332332e-06, + "loss": 0.2637, + "step": 12700 + }, + { + "epoch": 0.8, + "grad_norm": 2.665058189261105, + "learning_rate": 1.0251098636904e-06, + "loss": 0.2504, + "step": 12701 + }, + { + "epoch": 0.8, + "grad_norm": 2.91998136874884, + "learning_rate": 1.0244920993728185e-06, + "loss": 0.273, + "step": 12702 + }, + { + "epoch": 0.8, + "grad_norm": 2.3023157744331306, + "learning_rate": 1.0238745000061201e-06, + "loss": 0.2412, + "step": 12703 + }, + { + "epoch": 0.8, + "grad_norm": 3.7980366962356116, + "learning_rate": 1.0232570656159325e-06, + "loss": 0.2548, + "step": 12704 + }, + { + "epoch": 0.8, + "grad_norm": 1.4788772562573218, + "learning_rate": 1.0226397962278723e-06, + "loss": 0.26, + "step": 12705 + }, + { + "epoch": 0.8, + "grad_norm": 1.5763378591316777, + "learning_rate": 1.0220226918675519e-06, + "loss": 0.2523, + "step": 12706 + }, + { + "epoch": 0.8, + "grad_norm": 2.201895235173311, + "learning_rate": 1.0214057525605742e-06, + "loss": 0.2688, + "step": 12707 + }, + { + "epoch": 0.8, + "grad_norm": 1.531095714465481, + "learning_rate": 1.0207889783325386e-06, + "loss": 0.2441, + "step": 12708 + }, + { + "epoch": 0.8, + "grad_norm": 2.7185099593682156, + "learning_rate": 1.0201723692090376e-06, + "loss": 0.2475, + "step": 12709 + }, + { + "epoch": 0.8, + "grad_norm": 5.803324053474017, + "learning_rate": 1.019555925215654e-06, + "loss": 0.2596, + "step": 12710 + }, + { + "epoch": 0.8, + "grad_norm": 1.6728505484323652, + "learning_rate": 1.0189396463779632e-06, + "loss": 0.2465, + "step": 12711 + }, + { + "epoch": 0.8, + "grad_norm": 0.611571019272076, + "learning_rate": 1.0183235327215396e-06, + "loss": 0.4697, + "step": 12712 + }, + { + "epoch": 0.8, + "grad_norm": 1.5883228438460357, + "learning_rate": 1.0177075842719448e-06, + "loss": 0.2656, + "step": 12713 + }, + { + "epoch": 0.8, + "grad_norm": 0.6129906049332171, + "learning_rate": 1.0170918010547355e-06, + "loss": 0.4433, + "step": 12714 + }, + { + "epoch": 0.8, + "grad_norm": 2.389832644539706, + "learning_rate": 1.0164761830954606e-06, + "loss": 0.2487, + "step": 12715 + }, + { + "epoch": 0.8, + "grad_norm": 2.426246001497093, + "learning_rate": 1.0158607304196643e-06, + "loss": 0.2529, + "step": 12716 + }, + { + "epoch": 0.8, + "grad_norm": 2.101607057639297, + "learning_rate": 1.0152454430528847e-06, + "loss": 0.2349, + "step": 12717 + }, + { + "epoch": 0.8, + "grad_norm": 1.9409808877926633, + "learning_rate": 1.0146303210206487e-06, + "loss": 0.2657, + "step": 12718 + }, + { + "epoch": 0.8, + "grad_norm": 2.479657028448129, + "learning_rate": 1.0140153643484784e-06, + "loss": 0.2712, + "step": 12719 + }, + { + "epoch": 0.8, + "grad_norm": 1.708947078782897, + "learning_rate": 1.0134005730618922e-06, + "loss": 0.2384, + "step": 12720 + }, + { + "epoch": 0.8, + "grad_norm": 2.027054384171033, + "learning_rate": 1.012785947186397e-06, + "loss": 0.2579, + "step": 12721 + }, + { + "epoch": 0.8, + "grad_norm": 1.8035305780545818, + "learning_rate": 1.0121714867474936e-06, + "loss": 0.2394, + "step": 12722 + }, + { + "epoch": 0.8, + "grad_norm": 2.134175423304704, + "learning_rate": 1.0115571917706795e-06, + "loss": 0.2453, + "step": 12723 + }, + { + "epoch": 0.8, + "grad_norm": 3.70589306166117, + "learning_rate": 1.010943062281441e-06, + "loss": 0.2683, + "step": 12724 + }, + { + "epoch": 0.8, + "grad_norm": 0.6795361124172253, + "learning_rate": 1.0103290983052611e-06, + "loss": 0.4873, + "step": 12725 + }, + { + "epoch": 0.8, + "grad_norm": 3.470026123728847, + "learning_rate": 1.0097152998676135e-06, + "loss": 0.2792, + "step": 12726 + }, + { + "epoch": 0.8, + "grad_norm": 3.051263251291704, + "learning_rate": 1.009101666993964e-06, + "loss": 0.2619, + "step": 12727 + }, + { + "epoch": 0.8, + "grad_norm": 3.9010949523591494, + "learning_rate": 1.0084881997097768e-06, + "loss": 0.247, + "step": 12728 + }, + { + "epoch": 0.8, + "grad_norm": 2.496350425292508, + "learning_rate": 1.0078748980405033e-06, + "loss": 0.261, + "step": 12729 + }, + { + "epoch": 0.8, + "grad_norm": 1.5563006687611018, + "learning_rate": 1.0072617620115904e-06, + "loss": 0.2591, + "step": 12730 + }, + { + "epoch": 0.8, + "grad_norm": 1.3271151319139887, + "learning_rate": 1.0066487916484803e-06, + "loss": 0.2479, + "step": 12731 + }, + { + "epoch": 0.8, + "grad_norm": 2.9768100611518893, + "learning_rate": 1.0060359869766028e-06, + "loss": 0.2716, + "step": 12732 + }, + { + "epoch": 0.8, + "grad_norm": 2.7156932569004555, + "learning_rate": 1.0054233480213881e-06, + "loss": 0.2751, + "step": 12733 + }, + { + "epoch": 0.8, + "grad_norm": 1.698599394553698, + "learning_rate": 1.0048108748082536e-06, + "loss": 0.2638, + "step": 12734 + }, + { + "epoch": 0.8, + "grad_norm": 2.1124446176601013, + "learning_rate": 1.0041985673626115e-06, + "loss": 0.2565, + "step": 12735 + }, + { + "epoch": 0.8, + "grad_norm": 2.215011893965315, + "learning_rate": 1.0035864257098687e-06, + "loss": 0.2579, + "step": 12736 + }, + { + "epoch": 0.8, + "grad_norm": 3.0804420682947686, + "learning_rate": 1.0029744498754225e-06, + "loss": 0.2562, + "step": 12737 + }, + { + "epoch": 0.8, + "grad_norm": 1.6035858514247574, + "learning_rate": 1.002362639884667e-06, + "loss": 0.2662, + "step": 12738 + }, + { + "epoch": 0.8, + "grad_norm": 2.602340476526753, + "learning_rate": 1.0017509957629868e-06, + "loss": 0.2575, + "step": 12739 + }, + { + "epoch": 0.8, + "grad_norm": 3.558152836341865, + "learning_rate": 1.0011395175357574e-06, + "loss": 0.2493, + "step": 12740 + }, + { + "epoch": 0.8, + "grad_norm": 6.4520598093403265, + "learning_rate": 1.0005282052283539e-06, + "loss": 0.2486, + "step": 12741 + }, + { + "epoch": 0.8, + "grad_norm": 1.725741189843362, + "learning_rate": 9.999170588661388e-07, + "loss": 0.2716, + "step": 12742 + }, + { + "epoch": 0.8, + "grad_norm": 4.360730072590659, + "learning_rate": 9.99306078474469e-07, + "loss": 0.2407, + "step": 12743 + }, + { + "epoch": 0.8, + "grad_norm": 1.755519875333957, + "learning_rate": 9.986952640786972e-07, + "loss": 0.2392, + "step": 12744 + }, + { + "epoch": 0.8, + "grad_norm": 4.155347866804942, + "learning_rate": 9.980846157041645e-07, + "loss": 0.2585, + "step": 12745 + }, + { + "epoch": 0.8, + "grad_norm": 1.6009202595564043, + "learning_rate": 9.974741333762106e-07, + "loss": 0.2654, + "step": 12746 + }, + { + "epoch": 0.8, + "grad_norm": 2.64625703476425, + "learning_rate": 9.968638171201644e-07, + "loss": 0.2722, + "step": 12747 + }, + { + "epoch": 0.8, + "grad_norm": 3.4761751049718286, + "learning_rate": 9.96253666961347e-07, + "loss": 0.2639, + "step": 12748 + }, + { + "epoch": 0.8, + "grad_norm": 2.7387890736189626, + "learning_rate": 9.956436829250782e-07, + "loss": 0.2654, + "step": 12749 + }, + { + "epoch": 0.8, + "grad_norm": 1.40965657231041, + "learning_rate": 9.950338650366659e-07, + "loss": 0.2449, + "step": 12750 + }, + { + "epoch": 0.8, + "grad_norm": 1.7228430507127512, + "learning_rate": 9.944242133214098e-07, + "loss": 0.2585, + "step": 12751 + }, + { + "epoch": 0.8, + "grad_norm": 2.0072685071796164, + "learning_rate": 9.938147278046083e-07, + "loss": 0.2517, + "step": 12752 + }, + { + "epoch": 0.8, + "grad_norm": 3.765746681085658, + "learning_rate": 9.932054085115512e-07, + "loss": 0.2647, + "step": 12753 + }, + { + "epoch": 0.8, + "grad_norm": 3.536878930923567, + "learning_rate": 9.925962554675185e-07, + "loss": 0.2676, + "step": 12754 + }, + { + "epoch": 0.8, + "grad_norm": 1.6169529515121037, + "learning_rate": 9.919872686977849e-07, + "loss": 0.2656, + "step": 12755 + }, + { + "epoch": 0.8, + "grad_norm": 1.7243509227023388, + "learning_rate": 9.913784482276167e-07, + "loss": 0.2635, + "step": 12756 + }, + { + "epoch": 0.8, + "grad_norm": 1.3668179612717561, + "learning_rate": 9.90769794082279e-07, + "loss": 0.2522, + "step": 12757 + }, + { + "epoch": 0.8, + "grad_norm": 1.9917937310395633, + "learning_rate": 9.901613062870236e-07, + "loss": 0.2597, + "step": 12758 + }, + { + "epoch": 0.8, + "grad_norm": 1.6579782239575436, + "learning_rate": 9.89552984867096e-07, + "loss": 0.2537, + "step": 12759 + }, + { + "epoch": 0.8, + "grad_norm": 2.3337970745103087, + "learning_rate": 9.889448298477388e-07, + "loss": 0.2597, + "step": 12760 + }, + { + "epoch": 0.8, + "grad_norm": 0.6246909970505483, + "learning_rate": 9.88336841254186e-07, + "loss": 0.488, + "step": 12761 + }, + { + "epoch": 0.8, + "grad_norm": 3.5945754365392033, + "learning_rate": 9.87729019111664e-07, + "loss": 0.2525, + "step": 12762 + }, + { + "epoch": 0.8, + "grad_norm": 26.287682843088376, + "learning_rate": 9.871213634453908e-07, + "loss": 0.2531, + "step": 12763 + }, + { + "epoch": 0.8, + "grad_norm": 1.913740426617599, + "learning_rate": 9.865138742805792e-07, + "loss": 0.2536, + "step": 12764 + }, + { + "epoch": 0.8, + "grad_norm": 2.300508795335946, + "learning_rate": 9.859065516424365e-07, + "loss": 0.2608, + "step": 12765 + }, + { + "epoch": 0.8, + "grad_norm": 1.8217823616327986, + "learning_rate": 9.852993955561607e-07, + "loss": 0.272, + "step": 12766 + }, + { + "epoch": 0.8, + "grad_norm": 2.057594120481516, + "learning_rate": 9.846924060469433e-07, + "loss": 0.2533, + "step": 12767 + }, + { + "epoch": 0.8, + "grad_norm": 13.860080822210733, + "learning_rate": 9.840855831399694e-07, + "loss": 0.2516, + "step": 12768 + }, + { + "epoch": 0.8, + "grad_norm": 1.7091044263044803, + "learning_rate": 9.834789268604188e-07, + "loss": 0.2561, + "step": 12769 + }, + { + "epoch": 0.8, + "grad_norm": 4.678264760509789, + "learning_rate": 9.828724372334624e-07, + "loss": 0.2418, + "step": 12770 + }, + { + "epoch": 0.8, + "grad_norm": 2.0376825477549967, + "learning_rate": 9.822661142842621e-07, + "loss": 0.2461, + "step": 12771 + }, + { + "epoch": 0.8, + "grad_norm": 2.6357489493609827, + "learning_rate": 9.816599580379783e-07, + "loss": 0.2875, + "step": 12772 + }, + { + "epoch": 0.8, + "grad_norm": 1.4977430951989361, + "learning_rate": 9.8105396851976e-07, + "loss": 0.2596, + "step": 12773 + }, + { + "epoch": 0.8, + "grad_norm": 4.921853057194947, + "learning_rate": 9.8044814575475e-07, + "loss": 0.2389, + "step": 12774 + }, + { + "epoch": 0.8, + "grad_norm": 1.7556084874755709, + "learning_rate": 9.798424897680876e-07, + "loss": 0.2353, + "step": 12775 + }, + { + "epoch": 0.8, + "grad_norm": 2.7325476307067564, + "learning_rate": 9.79237000584899e-07, + "loss": 0.2584, + "step": 12776 + }, + { + "epoch": 0.8, + "grad_norm": 1.9852158607627954, + "learning_rate": 9.786316782303108e-07, + "loss": 0.2637, + "step": 12777 + }, + { + "epoch": 0.8, + "grad_norm": 5.209972925321309, + "learning_rate": 9.78026522729436e-07, + "loss": 0.2469, + "step": 12778 + }, + { + "epoch": 0.8, + "grad_norm": 1.8084338101734356, + "learning_rate": 9.774215341073844e-07, + "loss": 0.2574, + "step": 12779 + }, + { + "epoch": 0.8, + "grad_norm": 1.7451656808315519, + "learning_rate": 9.76816712389259e-07, + "loss": 0.2579, + "step": 12780 + }, + { + "epoch": 0.8, + "grad_norm": 1.6661527500741184, + "learning_rate": 9.762120576001543e-07, + "loss": 0.2327, + "step": 12781 + }, + { + "epoch": 0.8, + "grad_norm": 1.656309263103782, + "learning_rate": 9.756075697651573e-07, + "loss": 0.2496, + "step": 12782 + }, + { + "epoch": 0.8, + "grad_norm": 1.8303826124579643, + "learning_rate": 9.750032489093514e-07, + "loss": 0.2729, + "step": 12783 + }, + { + "epoch": 0.8, + "grad_norm": 1.530801960260885, + "learning_rate": 9.743990950578087e-07, + "loss": 0.2436, + "step": 12784 + }, + { + "epoch": 0.8, + "grad_norm": 1.7175450632254168, + "learning_rate": 9.73795108235599e-07, + "loss": 0.2557, + "step": 12785 + }, + { + "epoch": 0.8, + "grad_norm": 1.8929034367106248, + "learning_rate": 9.731912884677814e-07, + "loss": 0.2563, + "step": 12786 + }, + { + "epoch": 0.8, + "grad_norm": 0.5587418258075241, + "learning_rate": 9.72587635779409e-07, + "loss": 0.4351, + "step": 12787 + }, + { + "epoch": 0.8, + "grad_norm": 3.842594908282135, + "learning_rate": 9.719841501955296e-07, + "loss": 0.2666, + "step": 12788 + }, + { + "epoch": 0.8, + "grad_norm": 3.128309565322895, + "learning_rate": 9.713808317411815e-07, + "loss": 0.2747, + "step": 12789 + }, + { + "epoch": 0.8, + "grad_norm": 1.7326631363657472, + "learning_rate": 9.707776804414e-07, + "loss": 0.2498, + "step": 12790 + }, + { + "epoch": 0.8, + "grad_norm": 3.05237976920696, + "learning_rate": 9.701746963212084e-07, + "loss": 0.2568, + "step": 12791 + }, + { + "epoch": 0.8, + "grad_norm": 5.393489461025843, + "learning_rate": 9.695718794056258e-07, + "loss": 0.2575, + "step": 12792 + }, + { + "epoch": 0.8, + "grad_norm": 4.3888296321583455, + "learning_rate": 9.689692297196657e-07, + "loss": 0.2639, + "step": 12793 + }, + { + "epoch": 0.8, + "grad_norm": 1.5725615619980446, + "learning_rate": 9.683667472883324e-07, + "loss": 0.2482, + "step": 12794 + }, + { + "epoch": 0.8, + "grad_norm": 2.647815195484356, + "learning_rate": 9.677644321366226e-07, + "loss": 0.2362, + "step": 12795 + }, + { + "epoch": 0.8, + "grad_norm": 5.174720726874843, + "learning_rate": 9.671622842895294e-07, + "loss": 0.2453, + "step": 12796 + }, + { + "epoch": 0.8, + "grad_norm": 1.4792700718431802, + "learning_rate": 9.66560303772035e-07, + "loss": 0.2445, + "step": 12797 + }, + { + "epoch": 0.8, + "grad_norm": 1.5752754867579595, + "learning_rate": 9.65958490609119e-07, + "loss": 0.2413, + "step": 12798 + }, + { + "epoch": 0.8, + "grad_norm": 6.7160303960197085, + "learning_rate": 9.653568448257504e-07, + "loss": 0.2587, + "step": 12799 + }, + { + "epoch": 0.8, + "grad_norm": 2.0086538354064927, + "learning_rate": 9.647553664468918e-07, + "loss": 0.2394, + "step": 12800 + }, + { + "epoch": 0.81, + "grad_norm": 1.9664307217398085, + "learning_rate": 9.641540554975015e-07, + "loss": 0.2393, + "step": 12801 + }, + { + "epoch": 0.81, + "grad_norm": 1.615912294402693, + "learning_rate": 9.635529120025282e-07, + "loss": 0.2488, + "step": 12802 + }, + { + "epoch": 0.81, + "grad_norm": 3.917978411956469, + "learning_rate": 9.629519359869121e-07, + "loss": 0.2617, + "step": 12803 + }, + { + "epoch": 0.81, + "grad_norm": 3.1349444477823143, + "learning_rate": 9.623511274755914e-07, + "loss": 0.2725, + "step": 12804 + }, + { + "epoch": 0.81, + "grad_norm": 2.2862184235505376, + "learning_rate": 9.617504864934956e-07, + "loss": 0.2523, + "step": 12805 + }, + { + "epoch": 0.81, + "grad_norm": 1.6156777265235944, + "learning_rate": 9.611500130655443e-07, + "loss": 0.2493, + "step": 12806 + }, + { + "epoch": 0.81, + "grad_norm": 2.8710067540421056, + "learning_rate": 9.60549707216653e-07, + "loss": 0.2344, + "step": 12807 + }, + { + "epoch": 0.81, + "grad_norm": 2.0947206449001587, + "learning_rate": 9.599495689717276e-07, + "loss": 0.248, + "step": 12808 + }, + { + "epoch": 0.81, + "grad_norm": 2.667282840868198, + "learning_rate": 9.593495983556723e-07, + "loss": 0.2886, + "step": 12809 + }, + { + "epoch": 0.81, + "grad_norm": 1.5302462702039108, + "learning_rate": 9.58749795393379e-07, + "loss": 0.2506, + "step": 12810 + }, + { + "epoch": 0.81, + "grad_norm": 2.076053237357009, + "learning_rate": 9.581501601097332e-07, + "loss": 0.2556, + "step": 12811 + }, + { + "epoch": 0.81, + "grad_norm": 2.006810597267588, + "learning_rate": 9.57550692529617e-07, + "loss": 0.2599, + "step": 12812 + }, + { + "epoch": 0.81, + "grad_norm": 2.172004393315114, + "learning_rate": 9.569513926779033e-07, + "loss": 0.2536, + "step": 12813 + }, + { + "epoch": 0.81, + "grad_norm": 12.770538705483302, + "learning_rate": 9.563522605794579e-07, + "loss": 0.2527, + "step": 12814 + }, + { + "epoch": 0.81, + "grad_norm": 11.051092054586464, + "learning_rate": 9.557532962591398e-07, + "loss": 0.2669, + "step": 12815 + }, + { + "epoch": 0.81, + "grad_norm": 2.1166252046240914, + "learning_rate": 9.551544997417995e-07, + "loss": 0.2507, + "step": 12816 + }, + { + "epoch": 0.81, + "grad_norm": 3.1193775209546764, + "learning_rate": 9.545558710522844e-07, + "loss": 0.2492, + "step": 12817 + }, + { + "epoch": 0.81, + "grad_norm": 1.575697964565946, + "learning_rate": 9.53957410215432e-07, + "loss": 0.2512, + "step": 12818 + }, + { + "epoch": 0.81, + "grad_norm": 1.6749766983147105, + "learning_rate": 9.533591172560714e-07, + "loss": 0.2675, + "step": 12819 + }, + { + "epoch": 0.81, + "grad_norm": 3.2145755436199703, + "learning_rate": 9.527609921990294e-07, + "loss": 0.2808, + "step": 12820 + }, + { + "epoch": 0.81, + "grad_norm": 2.2916494501173883, + "learning_rate": 9.521630350691235e-07, + "loss": 0.2596, + "step": 12821 + }, + { + "epoch": 0.81, + "grad_norm": 0.6229630033286424, + "learning_rate": 9.515652458911629e-07, + "loss": 0.4291, + "step": 12822 + }, + { + "epoch": 0.81, + "grad_norm": 2.3131772122071124, + "learning_rate": 9.509676246899513e-07, + "loss": 0.2415, + "step": 12823 + }, + { + "epoch": 0.81, + "grad_norm": 1.8486509994036227, + "learning_rate": 9.503701714902836e-07, + "loss": 0.2713, + "step": 12824 + }, + { + "epoch": 0.81, + "grad_norm": 1.7387775178104907, + "learning_rate": 9.497728863169514e-07, + "loss": 0.2508, + "step": 12825 + }, + { + "epoch": 0.81, + "grad_norm": 1.8702734825778233, + "learning_rate": 9.491757691947367e-07, + "loss": 0.2491, + "step": 12826 + }, + { + "epoch": 0.81, + "grad_norm": 1.7961841237914369, + "learning_rate": 9.485788201484125e-07, + "loss": 0.2639, + "step": 12827 + }, + { + "epoch": 0.81, + "grad_norm": 0.576440342154987, + "learning_rate": 9.479820392027494e-07, + "loss": 0.4522, + "step": 12828 + }, + { + "epoch": 0.81, + "grad_norm": 2.179496433329002, + "learning_rate": 9.473854263825105e-07, + "loss": 0.2623, + "step": 12829 + }, + { + "epoch": 0.81, + "grad_norm": 1.3879735818632433, + "learning_rate": 9.467889817124481e-07, + "loss": 0.2404, + "step": 12830 + }, + { + "epoch": 0.81, + "grad_norm": 3.22086572123504, + "learning_rate": 9.461927052173097e-07, + "loss": 0.2447, + "step": 12831 + }, + { + "epoch": 0.81, + "grad_norm": 2.394871369895212, + "learning_rate": 9.455965969218356e-07, + "loss": 0.2446, + "step": 12832 + }, + { + "epoch": 0.81, + "grad_norm": 4.12420794196283, + "learning_rate": 9.450006568507614e-07, + "loss": 0.255, + "step": 12833 + }, + { + "epoch": 0.81, + "grad_norm": 3.0344644144945154, + "learning_rate": 9.444048850288112e-07, + "loss": 0.2411, + "step": 12834 + }, + { + "epoch": 0.81, + "grad_norm": 3.399867326488132, + "learning_rate": 9.43809281480707e-07, + "loss": 0.2623, + "step": 12835 + }, + { + "epoch": 0.81, + "grad_norm": 2.8563141513055275, + "learning_rate": 9.432138462311591e-07, + "loss": 0.273, + "step": 12836 + }, + { + "epoch": 0.81, + "grad_norm": 2.431892728930356, + "learning_rate": 9.426185793048754e-07, + "loss": 0.2535, + "step": 12837 + }, + { + "epoch": 0.81, + "grad_norm": 2.930204766985941, + "learning_rate": 9.420234807265538e-07, + "loss": 0.2507, + "step": 12838 + }, + { + "epoch": 0.81, + "grad_norm": 2.1713764203900605, + "learning_rate": 9.414285505208859e-07, + "loss": 0.2457, + "step": 12839 + }, + { + "epoch": 0.81, + "grad_norm": 1.9472027670717913, + "learning_rate": 9.408337887125546e-07, + "loss": 0.2546, + "step": 12840 + }, + { + "epoch": 0.81, + "grad_norm": 1.644738162379368, + "learning_rate": 9.402391953262396e-07, + "loss": 0.2392, + "step": 12841 + }, + { + "epoch": 0.81, + "grad_norm": 1.6318367312835584, + "learning_rate": 9.396447703866124e-07, + "loss": 0.2492, + "step": 12842 + }, + { + "epoch": 0.81, + "grad_norm": 1.7337807755938024, + "learning_rate": 9.390505139183359e-07, + "loss": 0.2597, + "step": 12843 + }, + { + "epoch": 0.81, + "grad_norm": 5.795926326609472, + "learning_rate": 9.384564259460655e-07, + "loss": 0.2571, + "step": 12844 + }, + { + "epoch": 0.81, + "grad_norm": 2.3201580163658386, + "learning_rate": 9.378625064944529e-07, + "loss": 0.2558, + "step": 12845 + }, + { + "epoch": 0.81, + "grad_norm": 5.997465048882375, + "learning_rate": 9.372687555881405e-07, + "loss": 0.2662, + "step": 12846 + }, + { + "epoch": 0.81, + "grad_norm": 3.0019004629313737, + "learning_rate": 9.366751732517632e-07, + "loss": 0.2517, + "step": 12847 + }, + { + "epoch": 0.81, + "grad_norm": 4.361205174224972, + "learning_rate": 9.360817595099491e-07, + "loss": 0.2498, + "step": 12848 + }, + { + "epoch": 0.81, + "grad_norm": 1.6482585744755953, + "learning_rate": 9.354885143873216e-07, + "loss": 0.2615, + "step": 12849 + }, + { + "epoch": 0.81, + "grad_norm": 1.5524535099587276, + "learning_rate": 9.348954379084957e-07, + "loss": 0.2543, + "step": 12850 + }, + { + "epoch": 0.81, + "grad_norm": 3.1563532493672573, + "learning_rate": 9.343025300980791e-07, + "loss": 0.2533, + "step": 12851 + }, + { + "epoch": 0.81, + "grad_norm": 0.5901668742846251, + "learning_rate": 9.337097909806703e-07, + "loss": 0.4666, + "step": 12852 + }, + { + "epoch": 0.81, + "grad_norm": 2.0350044462128816, + "learning_rate": 9.331172205808659e-07, + "loss": 0.2605, + "step": 12853 + }, + { + "epoch": 0.81, + "grad_norm": 1.8035053011714821, + "learning_rate": 9.325248189232521e-07, + "loss": 0.2636, + "step": 12854 + }, + { + "epoch": 0.81, + "grad_norm": 1.3109480430631197, + "learning_rate": 9.319325860324069e-07, + "loss": 0.2425, + "step": 12855 + }, + { + "epoch": 0.81, + "grad_norm": 1.790154375725121, + "learning_rate": 9.313405219329041e-07, + "loss": 0.2624, + "step": 12856 + }, + { + "epoch": 0.81, + "grad_norm": 1.9340313613188451, + "learning_rate": 9.307486266493109e-07, + "loss": 0.2546, + "step": 12857 + }, + { + "epoch": 0.81, + "grad_norm": 3.552539089425548, + "learning_rate": 9.301569002061855e-07, + "loss": 0.2468, + "step": 12858 + }, + { + "epoch": 0.81, + "grad_norm": 2.235652719347328, + "learning_rate": 9.295653426280793e-07, + "loss": 0.245, + "step": 12859 + }, + { + "epoch": 0.81, + "grad_norm": 1.5643757271303158, + "learning_rate": 9.289739539395354e-07, + "loss": 0.2407, + "step": 12860 + }, + { + "epoch": 0.81, + "grad_norm": 5.573721095433635, + "learning_rate": 9.283827341650942e-07, + "loss": 0.2391, + "step": 12861 + }, + { + "epoch": 0.81, + "grad_norm": 1.8385291457680781, + "learning_rate": 9.277916833292855e-07, + "loss": 0.2621, + "step": 12862 + }, + { + "epoch": 0.81, + "grad_norm": 1.895979292505263, + "learning_rate": 9.272008014566314e-07, + "loss": 0.2624, + "step": 12863 + }, + { + "epoch": 0.81, + "grad_norm": 2.2921231198056478, + "learning_rate": 9.266100885716506e-07, + "loss": 0.2464, + "step": 12864 + }, + { + "epoch": 0.81, + "grad_norm": 1.461850035186532, + "learning_rate": 9.260195446988535e-07, + "loss": 0.2441, + "step": 12865 + }, + { + "epoch": 0.81, + "grad_norm": 1.6173435668297256, + "learning_rate": 9.254291698627416e-07, + "loss": 0.249, + "step": 12866 + }, + { + "epoch": 0.81, + "grad_norm": 2.168350408546128, + "learning_rate": 9.248389640878109e-07, + "loss": 0.2275, + "step": 12867 + }, + { + "epoch": 0.81, + "grad_norm": 2.2547508996488923, + "learning_rate": 9.242489273985483e-07, + "loss": 0.2603, + "step": 12868 + }, + { + "epoch": 0.81, + "grad_norm": 1.5311450365000812, + "learning_rate": 9.236590598194384e-07, + "loss": 0.2443, + "step": 12869 + }, + { + "epoch": 0.81, + "grad_norm": 1.687600126500834, + "learning_rate": 9.230693613749547e-07, + "loss": 0.2647, + "step": 12870 + }, + { + "epoch": 0.81, + "grad_norm": 2.3168067255507325, + "learning_rate": 9.224798320895634e-07, + "loss": 0.24, + "step": 12871 + }, + { + "epoch": 0.81, + "grad_norm": 2.99266560903549, + "learning_rate": 9.218904719877258e-07, + "loss": 0.2475, + "step": 12872 + }, + { + "epoch": 0.81, + "grad_norm": 1.5829464029758733, + "learning_rate": 9.213012810938976e-07, + "loss": 0.2483, + "step": 12873 + }, + { + "epoch": 0.81, + "grad_norm": 1.9153073565322267, + "learning_rate": 9.207122594325241e-07, + "loss": 0.2744, + "step": 12874 + }, + { + "epoch": 0.81, + "grad_norm": 1.7244034798619865, + "learning_rate": 9.201234070280446e-07, + "loss": 0.2507, + "step": 12875 + }, + { + "epoch": 0.81, + "grad_norm": 2.0440624380199686, + "learning_rate": 9.195347239048902e-07, + "loss": 0.2725, + "step": 12876 + }, + { + "epoch": 0.81, + "grad_norm": 0.5612430259333051, + "learning_rate": 9.189462100874891e-07, + "loss": 0.4663, + "step": 12877 + }, + { + "epoch": 0.81, + "grad_norm": 41.09160100313767, + "learning_rate": 9.183578656002584e-07, + "loss": 0.2568, + "step": 12878 + }, + { + "epoch": 0.81, + "grad_norm": 2.1080926671363964, + "learning_rate": 9.177696904676086e-07, + "loss": 0.279, + "step": 12879 + }, + { + "epoch": 0.81, + "grad_norm": 1.5491718013748426, + "learning_rate": 9.171816847139447e-07, + "loss": 0.2621, + "step": 12880 + }, + { + "epoch": 0.81, + "grad_norm": 1.5374963144072058, + "learning_rate": 9.165938483636666e-07, + "loss": 0.242, + "step": 12881 + }, + { + "epoch": 0.81, + "grad_norm": 1.8972797521043407, + "learning_rate": 9.160061814411625e-07, + "loss": 0.276, + "step": 12882 + }, + { + "epoch": 0.81, + "grad_norm": 3.3082568587378285, + "learning_rate": 9.154186839708157e-07, + "loss": 0.2648, + "step": 12883 + }, + { + "epoch": 0.81, + "grad_norm": 1.782943228581823, + "learning_rate": 9.148313559770011e-07, + "loss": 0.261, + "step": 12884 + }, + { + "epoch": 0.81, + "grad_norm": 0.6283344357888913, + "learning_rate": 9.142441974840915e-07, + "loss": 0.4548, + "step": 12885 + }, + { + "epoch": 0.81, + "grad_norm": 0.5597703822869617, + "learning_rate": 9.136572085164458e-07, + "loss": 0.4444, + "step": 12886 + }, + { + "epoch": 0.81, + "grad_norm": 2.304495916707265, + "learning_rate": 9.130703890984222e-07, + "loss": 0.2518, + "step": 12887 + }, + { + "epoch": 0.81, + "grad_norm": 2.488735308124743, + "learning_rate": 9.124837392543656e-07, + "loss": 0.2576, + "step": 12888 + }, + { + "epoch": 0.81, + "grad_norm": 1.6975954867493765, + "learning_rate": 9.118972590086206e-07, + "loss": 0.2461, + "step": 12889 + }, + { + "epoch": 0.81, + "grad_norm": 1.4664348549370394, + "learning_rate": 9.113109483855193e-07, + "loss": 0.2495, + "step": 12890 + }, + { + "epoch": 0.81, + "grad_norm": 2.4709030237841163, + "learning_rate": 9.107248074093894e-07, + "loss": 0.2503, + "step": 12891 + }, + { + "epoch": 0.81, + "grad_norm": 3.001523179076291, + "learning_rate": 9.101388361045488e-07, + "loss": 0.2319, + "step": 12892 + }, + { + "epoch": 0.81, + "grad_norm": 3.130189121210011, + "learning_rate": 9.095530344953141e-07, + "loss": 0.2468, + "step": 12893 + }, + { + "epoch": 0.81, + "grad_norm": 1.3774295368187583, + "learning_rate": 9.08967402605988e-07, + "loss": 0.2412, + "step": 12894 + }, + { + "epoch": 0.81, + "grad_norm": 2.3103110629274615, + "learning_rate": 9.083819404608724e-07, + "loss": 0.2583, + "step": 12895 + }, + { + "epoch": 0.81, + "grad_norm": 2.244186611790629, + "learning_rate": 9.07796648084256e-07, + "loss": 0.2593, + "step": 12896 + }, + { + "epoch": 0.81, + "grad_norm": 0.6191162625720816, + "learning_rate": 9.072115255004266e-07, + "loss": 0.4698, + "step": 12897 + }, + { + "epoch": 0.81, + "grad_norm": 1.615867861107461, + "learning_rate": 9.066265727336604e-07, + "loss": 0.2664, + "step": 12898 + }, + { + "epoch": 0.81, + "grad_norm": 1.8105279253481417, + "learning_rate": 9.060417898082285e-07, + "loss": 0.2538, + "step": 12899 + }, + { + "epoch": 0.81, + "grad_norm": 3.3438830026991466, + "learning_rate": 9.054571767483932e-07, + "loss": 0.26, + "step": 12900 + }, + { + "epoch": 0.81, + "grad_norm": 2.057403257654064, + "learning_rate": 9.048727335784124e-07, + "loss": 0.2596, + "step": 12901 + }, + { + "epoch": 0.81, + "grad_norm": 2.3587940448726448, + "learning_rate": 9.042884603225372e-07, + "loss": 0.2396, + "step": 12902 + }, + { + "epoch": 0.81, + "grad_norm": 1.7891339199649128, + "learning_rate": 9.03704357005008e-07, + "loss": 0.2504, + "step": 12903 + }, + { + "epoch": 0.81, + "grad_norm": 1.539726229067604, + "learning_rate": 9.031204236500601e-07, + "loss": 0.2624, + "step": 12904 + }, + { + "epoch": 0.81, + "grad_norm": 1.7644042066418655, + "learning_rate": 9.025366602819235e-07, + "loss": 0.2628, + "step": 12905 + }, + { + "epoch": 0.81, + "grad_norm": 1.4714732485959336, + "learning_rate": 9.019530669248195e-07, + "loss": 0.2528, + "step": 12906 + }, + { + "epoch": 0.81, + "grad_norm": 1.682278135449895, + "learning_rate": 9.013696436029607e-07, + "loss": 0.2503, + "step": 12907 + }, + { + "epoch": 0.81, + "grad_norm": 1.6022755323694893, + "learning_rate": 9.007863903405551e-07, + "loss": 0.2609, + "step": 12908 + }, + { + "epoch": 0.81, + "grad_norm": 1.7965323724809963, + "learning_rate": 9.002033071618027e-07, + "loss": 0.267, + "step": 12909 + }, + { + "epoch": 0.81, + "grad_norm": 1.890080830376393, + "learning_rate": 8.996203940908982e-07, + "loss": 0.2665, + "step": 12910 + }, + { + "epoch": 0.81, + "grad_norm": 2.6391509388317074, + "learning_rate": 8.990376511520272e-07, + "loss": 0.2571, + "step": 12911 + }, + { + "epoch": 0.81, + "grad_norm": 2.438269915323138, + "learning_rate": 8.984550783693663e-07, + "loss": 0.2434, + "step": 12912 + }, + { + "epoch": 0.81, + "grad_norm": 2.484500330904699, + "learning_rate": 8.978726757670908e-07, + "loss": 0.2498, + "step": 12913 + }, + { + "epoch": 0.81, + "grad_norm": 0.6036576597089971, + "learning_rate": 8.972904433693646e-07, + "loss": 0.4462, + "step": 12914 + }, + { + "epoch": 0.81, + "grad_norm": 1.653183563269335, + "learning_rate": 8.967083812003446e-07, + "loss": 0.2505, + "step": 12915 + }, + { + "epoch": 0.81, + "grad_norm": 1.416421650311631, + "learning_rate": 8.961264892841798e-07, + "loss": 0.2437, + "step": 12916 + }, + { + "epoch": 0.81, + "grad_norm": 1.8761232962721022, + "learning_rate": 8.955447676450191e-07, + "loss": 0.2705, + "step": 12917 + }, + { + "epoch": 0.81, + "grad_norm": 3.757939120259039, + "learning_rate": 8.94963216306996e-07, + "loss": 0.258, + "step": 12918 + }, + { + "epoch": 0.81, + "grad_norm": 1.4166521640306378, + "learning_rate": 8.943818352942401e-07, + "loss": 0.2511, + "step": 12919 + }, + { + "epoch": 0.81, + "grad_norm": 2.5323323393674855, + "learning_rate": 8.938006246308734e-07, + "loss": 0.2661, + "step": 12920 + }, + { + "epoch": 0.81, + "grad_norm": 3.422876001062616, + "learning_rate": 8.932195843410135e-07, + "loss": 0.2556, + "step": 12921 + }, + { + "epoch": 0.81, + "grad_norm": 1.6779079539753463, + "learning_rate": 8.926387144487675e-07, + "loss": 0.2487, + "step": 12922 + }, + { + "epoch": 0.81, + "grad_norm": 3.8720471401606082, + "learning_rate": 8.920580149782354e-07, + "loss": 0.2805, + "step": 12923 + }, + { + "epoch": 0.81, + "grad_norm": 1.9861607351680166, + "learning_rate": 8.914774859535131e-07, + "loss": 0.269, + "step": 12924 + }, + { + "epoch": 0.81, + "grad_norm": 2.306090823850083, + "learning_rate": 8.908971273986883e-07, + "loss": 0.2541, + "step": 12925 + }, + { + "epoch": 0.81, + "grad_norm": 1.7058781362913975, + "learning_rate": 8.903169393378403e-07, + "loss": 0.249, + "step": 12926 + }, + { + "epoch": 0.81, + "grad_norm": 2.717577300853418, + "learning_rate": 8.897369217950424e-07, + "loss": 0.2565, + "step": 12927 + }, + { + "epoch": 0.81, + "grad_norm": 1.6589760285420476, + "learning_rate": 8.891570747943584e-07, + "loss": 0.2591, + "step": 12928 + }, + { + "epoch": 0.81, + "grad_norm": 2.305319716338477, + "learning_rate": 8.885773983598512e-07, + "loss": 0.283, + "step": 12929 + }, + { + "epoch": 0.81, + "grad_norm": 15.79531730809761, + "learning_rate": 8.879978925155697e-07, + "loss": 0.2434, + "step": 12930 + }, + { + "epoch": 0.81, + "grad_norm": 2.0361184049130663, + "learning_rate": 8.874185572855581e-07, + "loss": 0.2496, + "step": 12931 + }, + { + "epoch": 0.81, + "grad_norm": 2.442694204864611, + "learning_rate": 8.868393926938557e-07, + "loss": 0.2603, + "step": 12932 + }, + { + "epoch": 0.81, + "grad_norm": 2.8625723943847077, + "learning_rate": 8.862603987644942e-07, + "loss": 0.2457, + "step": 12933 + }, + { + "epoch": 0.81, + "grad_norm": 1.4946677941746433, + "learning_rate": 8.856815755214953e-07, + "loss": 0.2475, + "step": 12934 + }, + { + "epoch": 0.81, + "grad_norm": 1.2963165714071727, + "learning_rate": 8.851029229888752e-07, + "loss": 0.2418, + "step": 12935 + }, + { + "epoch": 0.81, + "grad_norm": 0.583915943443398, + "learning_rate": 8.845244411906429e-07, + "loss": 0.4955, + "step": 12936 + }, + { + "epoch": 0.81, + "grad_norm": 2.3722575964806563, + "learning_rate": 8.839461301508028e-07, + "loss": 0.2765, + "step": 12937 + }, + { + "epoch": 0.81, + "grad_norm": 2.2057989099379087, + "learning_rate": 8.833679898933472e-07, + "loss": 0.2733, + "step": 12938 + }, + { + "epoch": 0.81, + "grad_norm": 4.690125514832156, + "learning_rate": 8.827900204422674e-07, + "loss": 0.2689, + "step": 12939 + }, + { + "epoch": 0.81, + "grad_norm": 1.8825434705305653, + "learning_rate": 8.822122218215406e-07, + "loss": 0.2667, + "step": 12940 + }, + { + "epoch": 0.81, + "grad_norm": 1.6403611439779844, + "learning_rate": 8.816345940551446e-07, + "loss": 0.2476, + "step": 12941 + }, + { + "epoch": 0.81, + "grad_norm": 1.505775895939719, + "learning_rate": 8.81057137167044e-07, + "loss": 0.2499, + "step": 12942 + }, + { + "epoch": 0.81, + "grad_norm": 1.2747582625413525, + "learning_rate": 8.80479851181199e-07, + "loss": 0.2676, + "step": 12943 + }, + { + "epoch": 0.81, + "grad_norm": 2.8306581193431004, + "learning_rate": 8.799027361215607e-07, + "loss": 0.2467, + "step": 12944 + }, + { + "epoch": 0.81, + "grad_norm": 1.9694901453239277, + "learning_rate": 8.793257920120774e-07, + "loss": 0.272, + "step": 12945 + }, + { + "epoch": 0.81, + "grad_norm": 1.8756997540171922, + "learning_rate": 8.78749018876685e-07, + "loss": 0.2542, + "step": 12946 + }, + { + "epoch": 0.81, + "grad_norm": 2.621652512605524, + "learning_rate": 8.781724167393168e-07, + "loss": 0.2575, + "step": 12947 + }, + { + "epoch": 0.81, + "grad_norm": 2.6949599893835643, + "learning_rate": 8.775959856238953e-07, + "loss": 0.2553, + "step": 12948 + }, + { + "epoch": 0.81, + "grad_norm": 3.237138056523293, + "learning_rate": 8.770197255543395e-07, + "loss": 0.2743, + "step": 12949 + }, + { + "epoch": 0.81, + "grad_norm": 2.0481638052120377, + "learning_rate": 8.764436365545592e-07, + "loss": 0.2534, + "step": 12950 + }, + { + "epoch": 0.81, + "grad_norm": 2.0585499310221427, + "learning_rate": 8.75867718648456e-07, + "loss": 0.2583, + "step": 12951 + }, + { + "epoch": 0.81, + "grad_norm": 4.588495449316286, + "learning_rate": 8.752919718599256e-07, + "loss": 0.2605, + "step": 12952 + }, + { + "epoch": 0.81, + "grad_norm": 1.775181838047722, + "learning_rate": 8.747163962128574e-07, + "loss": 0.2455, + "step": 12953 + }, + { + "epoch": 0.81, + "grad_norm": 2.716596625811185, + "learning_rate": 8.741409917311344e-07, + "loss": 0.2475, + "step": 12954 + }, + { + "epoch": 0.81, + "grad_norm": 1.9314409012285993, + "learning_rate": 8.735657584386297e-07, + "loss": 0.2515, + "step": 12955 + }, + { + "epoch": 0.81, + "grad_norm": 5.929806241612521, + "learning_rate": 8.729906963592105e-07, + "loss": 0.2601, + "step": 12956 + }, + { + "epoch": 0.81, + "grad_norm": 1.971862189547834, + "learning_rate": 8.724158055167386e-07, + "loss": 0.2582, + "step": 12957 + }, + { + "epoch": 0.81, + "grad_norm": 1.9891388691416703, + "learning_rate": 8.718410859350663e-07, + "loss": 0.2402, + "step": 12958 + }, + { + "epoch": 0.81, + "grad_norm": 4.022258753997461, + "learning_rate": 8.712665376380403e-07, + "loss": 0.2615, + "step": 12959 + }, + { + "epoch": 0.82, + "grad_norm": 1.5026717591833973, + "learning_rate": 8.706921606494973e-07, + "loss": 0.2603, + "step": 12960 + }, + { + "epoch": 0.82, + "grad_norm": 12.121274332580468, + "learning_rate": 8.701179549932709e-07, + "loss": 0.2524, + "step": 12961 + }, + { + "epoch": 0.82, + "grad_norm": 3.721103342967426, + "learning_rate": 8.695439206931877e-07, + "loss": 0.26, + "step": 12962 + }, + { + "epoch": 0.82, + "grad_norm": 3.4580206751263725, + "learning_rate": 8.689700577730637e-07, + "loss": 0.2503, + "step": 12963 + }, + { + "epoch": 0.82, + "grad_norm": 1.6261407089771402, + "learning_rate": 8.68396366256708e-07, + "loss": 0.2521, + "step": 12964 + }, + { + "epoch": 0.82, + "grad_norm": 4.315283498841684, + "learning_rate": 8.678228461679272e-07, + "loss": 0.287, + "step": 12965 + }, + { + "epoch": 0.82, + "grad_norm": 1.870100406785817, + "learning_rate": 8.672494975305157e-07, + "loss": 0.2541, + "step": 12966 + }, + { + "epoch": 0.82, + "grad_norm": 1.8365201530208666, + "learning_rate": 8.666763203682637e-07, + "loss": 0.2448, + "step": 12967 + }, + { + "epoch": 0.82, + "grad_norm": 3.331207549065846, + "learning_rate": 8.661033147049496e-07, + "loss": 0.2659, + "step": 12968 + }, + { + "epoch": 0.82, + "grad_norm": 0.6157615990454055, + "learning_rate": 8.655304805643549e-07, + "loss": 0.4566, + "step": 12969 + }, + { + "epoch": 0.82, + "grad_norm": 1.8008006782529928, + "learning_rate": 8.649578179702434e-07, + "loss": 0.2544, + "step": 12970 + }, + { + "epoch": 0.82, + "grad_norm": 1.532257319195622, + "learning_rate": 8.64385326946377e-07, + "loss": 0.2559, + "step": 12971 + }, + { + "epoch": 0.82, + "grad_norm": 3.867931380705508, + "learning_rate": 8.638130075165079e-07, + "loss": 0.273, + "step": 12972 + }, + { + "epoch": 0.82, + "grad_norm": 2.1692684286071247, + "learning_rate": 8.632408597043851e-07, + "loss": 0.2718, + "step": 12973 + }, + { + "epoch": 0.82, + "grad_norm": 4.268772053840724, + "learning_rate": 8.626688835337471e-07, + "loss": 0.2619, + "step": 12974 + }, + { + "epoch": 0.82, + "grad_norm": 2.6911422353850587, + "learning_rate": 8.620970790283251e-07, + "loss": 0.2482, + "step": 12975 + }, + { + "epoch": 0.82, + "grad_norm": 2.855496193606545, + "learning_rate": 8.615254462118427e-07, + "loss": 0.2643, + "step": 12976 + }, + { + "epoch": 0.82, + "grad_norm": 17.318438943732563, + "learning_rate": 8.609539851080234e-07, + "loss": 0.2607, + "step": 12977 + }, + { + "epoch": 0.82, + "grad_norm": 2.7953197647700785, + "learning_rate": 8.603826957405742e-07, + "loss": 0.2741, + "step": 12978 + }, + { + "epoch": 0.82, + "grad_norm": 3.143269425400992, + "learning_rate": 8.598115781332006e-07, + "loss": 0.2431, + "step": 12979 + }, + { + "epoch": 0.82, + "grad_norm": 3.362342912598155, + "learning_rate": 8.592406323095964e-07, + "loss": 0.2635, + "step": 12980 + }, + { + "epoch": 0.82, + "grad_norm": 0.5253350567382885, + "learning_rate": 8.58669858293455e-07, + "loss": 0.422, + "step": 12981 + }, + { + "epoch": 0.82, + "grad_norm": 2.248639521149241, + "learning_rate": 8.580992561084567e-07, + "loss": 0.2567, + "step": 12982 + }, + { + "epoch": 0.82, + "grad_norm": 1.737356727789742, + "learning_rate": 8.575288257782755e-07, + "loss": 0.2581, + "step": 12983 + }, + { + "epoch": 0.82, + "grad_norm": 1.8994868493645913, + "learning_rate": 8.569585673265818e-07, + "loss": 0.2729, + "step": 12984 + }, + { + "epoch": 0.82, + "grad_norm": 1.7322243587602308, + "learning_rate": 8.563884807770373e-07, + "loss": 0.2507, + "step": 12985 + }, + { + "epoch": 0.82, + "grad_norm": 5.5259055541114055, + "learning_rate": 8.558185661532942e-07, + "loss": 0.267, + "step": 12986 + }, + { + "epoch": 0.82, + "grad_norm": 1.718643461005986, + "learning_rate": 8.552488234789996e-07, + "loss": 0.2719, + "step": 12987 + }, + { + "epoch": 0.82, + "grad_norm": 5.129722357225122, + "learning_rate": 8.546792527777925e-07, + "loss": 0.278, + "step": 12988 + }, + { + "epoch": 0.82, + "grad_norm": 1.4463680420732525, + "learning_rate": 8.541098540733067e-07, + "loss": 0.2546, + "step": 12989 + }, + { + "epoch": 0.82, + "grad_norm": 2.257847127807278, + "learning_rate": 8.535406273891678e-07, + "loss": 0.2536, + "step": 12990 + }, + { + "epoch": 0.82, + "grad_norm": 1.9405059302079728, + "learning_rate": 8.529715727489912e-07, + "loss": 0.2497, + "step": 12991 + }, + { + "epoch": 0.82, + "grad_norm": 2.4784141342344626, + "learning_rate": 8.524026901763905e-07, + "loss": 0.2645, + "step": 12992 + }, + { + "epoch": 0.82, + "grad_norm": 3.9415160324746514, + "learning_rate": 8.518339796949704e-07, + "loss": 0.2873, + "step": 12993 + }, + { + "epoch": 0.82, + "grad_norm": 1.8877086490397215, + "learning_rate": 8.512654413283261e-07, + "loss": 0.2462, + "step": 12994 + }, + { + "epoch": 0.82, + "grad_norm": 4.172067567216997, + "learning_rate": 8.506970751000477e-07, + "loss": 0.2547, + "step": 12995 + }, + { + "epoch": 0.82, + "grad_norm": 9.178190644279585, + "learning_rate": 8.501288810337166e-07, + "loss": 0.2644, + "step": 12996 + }, + { + "epoch": 0.82, + "grad_norm": 1.999811991749933, + "learning_rate": 8.495608591529103e-07, + "loss": 0.2413, + "step": 12997 + }, + { + "epoch": 0.82, + "grad_norm": 1.7508726125273202, + "learning_rate": 8.489930094811949e-07, + "loss": 0.256, + "step": 12998 + }, + { + "epoch": 0.82, + "grad_norm": 3.424967339738821, + "learning_rate": 8.484253320421337e-07, + "loss": 0.2648, + "step": 12999 + }, + { + "epoch": 0.82, + "grad_norm": 1.644868330238792, + "learning_rate": 8.478578268592779e-07, + "loss": 0.2572, + "step": 13000 + }, + { + "epoch": 0.82, + "grad_norm": 2.041706935467664, + "learning_rate": 8.472904939561776e-07, + "loss": 0.2583, + "step": 13001 + }, + { + "epoch": 0.82, + "grad_norm": 2.989928503212847, + "learning_rate": 8.467233333563707e-07, + "loss": 0.2508, + "step": 13002 + }, + { + "epoch": 0.82, + "grad_norm": 1.300913008024875, + "learning_rate": 8.461563450833898e-07, + "loss": 0.2443, + "step": 13003 + }, + { + "epoch": 0.82, + "grad_norm": 3.046459539029601, + "learning_rate": 8.455895291607585e-07, + "loss": 0.2885, + "step": 13004 + }, + { + "epoch": 0.82, + "grad_norm": 3.6342500234576756, + "learning_rate": 8.450228856119974e-07, + "loss": 0.2371, + "step": 13005 + }, + { + "epoch": 0.82, + "grad_norm": 2.8703494527860083, + "learning_rate": 8.444564144606177e-07, + "loss": 0.2566, + "step": 13006 + }, + { + "epoch": 0.82, + "grad_norm": 1.3922039419797838, + "learning_rate": 8.438901157301221e-07, + "loss": 0.2384, + "step": 13007 + }, + { + "epoch": 0.82, + "grad_norm": 6.7703336374296015, + "learning_rate": 8.433239894440071e-07, + "loss": 0.2472, + "step": 13008 + }, + { + "epoch": 0.82, + "grad_norm": 1.9150429948459586, + "learning_rate": 8.427580356257636e-07, + "loss": 0.2544, + "step": 13009 + }, + { + "epoch": 0.82, + "grad_norm": 0.5940597519353604, + "learning_rate": 8.421922542988736e-07, + "loss": 0.4712, + "step": 13010 + }, + { + "epoch": 0.82, + "grad_norm": 1.8368424429118755, + "learning_rate": 8.416266454868122e-07, + "loss": 0.2502, + "step": 13011 + }, + { + "epoch": 0.82, + "grad_norm": 1.8532035972685212, + "learning_rate": 8.410612092130455e-07, + "loss": 0.2496, + "step": 13012 + }, + { + "epoch": 0.82, + "grad_norm": 2.388364709508336, + "learning_rate": 8.40495945501037e-07, + "loss": 0.2618, + "step": 13013 + }, + { + "epoch": 0.82, + "grad_norm": 4.132774989207916, + "learning_rate": 8.399308543742408e-07, + "loss": 0.2689, + "step": 13014 + }, + { + "epoch": 0.82, + "grad_norm": 1.895250884264923, + "learning_rate": 8.393659358561024e-07, + "loss": 0.2568, + "step": 13015 + }, + { + "epoch": 0.82, + "grad_norm": 2.0614659051887556, + "learning_rate": 8.388011899700605e-07, + "loss": 0.2532, + "step": 13016 + }, + { + "epoch": 0.82, + "grad_norm": 1.7982387345417716, + "learning_rate": 8.382366167395495e-07, + "loss": 0.2294, + "step": 13017 + }, + { + "epoch": 0.82, + "grad_norm": 1.8196576703216867, + "learning_rate": 8.37672216187993e-07, + "loss": 0.2392, + "step": 13018 + }, + { + "epoch": 0.82, + "grad_norm": 1.9383974662274779, + "learning_rate": 8.3710798833881e-07, + "loss": 0.2627, + "step": 13019 + }, + { + "epoch": 0.82, + "grad_norm": 1.959002470919046, + "learning_rate": 8.36543933215409e-07, + "loss": 0.26, + "step": 13020 + }, + { + "epoch": 0.82, + "grad_norm": 2.624990470072323, + "learning_rate": 8.359800508411958e-07, + "loss": 0.2714, + "step": 13021 + }, + { + "epoch": 0.82, + "grad_norm": 2.8743607113147993, + "learning_rate": 8.354163412395671e-07, + "loss": 0.2504, + "step": 13022 + }, + { + "epoch": 0.82, + "grad_norm": 0.5898234152130922, + "learning_rate": 8.348528044339122e-07, + "loss": 0.4621, + "step": 13023 + }, + { + "epoch": 0.82, + "grad_norm": 2.3573439550623307, + "learning_rate": 8.34289440447611e-07, + "loss": 0.2858, + "step": 13024 + }, + { + "epoch": 0.82, + "grad_norm": 2.501002413577344, + "learning_rate": 8.33726249304041e-07, + "loss": 0.2591, + "step": 13025 + }, + { + "epoch": 0.82, + "grad_norm": 1.8653577441948457, + "learning_rate": 8.331632310265691e-07, + "loss": 0.2568, + "step": 13026 + }, + { + "epoch": 0.82, + "grad_norm": 14.172463980478453, + "learning_rate": 8.326003856385561e-07, + "loss": 0.2819, + "step": 13027 + }, + { + "epoch": 0.82, + "grad_norm": 3.0252902553124086, + "learning_rate": 8.320377131633545e-07, + "loss": 0.2463, + "step": 13028 + }, + { + "epoch": 0.82, + "grad_norm": 2.009323889116786, + "learning_rate": 8.314752136243104e-07, + "loss": 0.252, + "step": 13029 + }, + { + "epoch": 0.82, + "grad_norm": 0.5814904848572607, + "learning_rate": 8.309128870447658e-07, + "loss": 0.4545, + "step": 13030 + }, + { + "epoch": 0.82, + "grad_norm": 1.6858426417017702, + "learning_rate": 8.303507334480499e-07, + "loss": 0.2535, + "step": 13031 + }, + { + "epoch": 0.82, + "grad_norm": 1.342394518394939, + "learning_rate": 8.297887528574877e-07, + "loss": 0.2461, + "step": 13032 + }, + { + "epoch": 0.82, + "grad_norm": 2.292131029782006, + "learning_rate": 8.292269452963981e-07, + "loss": 0.2667, + "step": 13033 + }, + { + "epoch": 0.82, + "grad_norm": 1.4481962584993637, + "learning_rate": 8.286653107880904e-07, + "loss": 0.2496, + "step": 13034 + }, + { + "epoch": 0.82, + "grad_norm": 1.5152240136766413, + "learning_rate": 8.281038493558674e-07, + "loss": 0.2593, + "step": 13035 + }, + { + "epoch": 0.82, + "grad_norm": 2.622225821286546, + "learning_rate": 8.275425610230269e-07, + "loss": 0.2631, + "step": 13036 + }, + { + "epoch": 0.82, + "grad_norm": 1.6396270837493994, + "learning_rate": 8.269814458128556e-07, + "loss": 0.2505, + "step": 13037 + }, + { + "epoch": 0.82, + "grad_norm": 1.4101603639984648, + "learning_rate": 8.264205037486367e-07, + "loss": 0.2561, + "step": 13038 + }, + { + "epoch": 0.82, + "grad_norm": 2.516095248356477, + "learning_rate": 8.258597348536452e-07, + "loss": 0.2655, + "step": 13039 + }, + { + "epoch": 0.82, + "grad_norm": 2.599263168900209, + "learning_rate": 8.252991391511455e-07, + "loss": 0.2586, + "step": 13040 + }, + { + "epoch": 0.82, + "grad_norm": 1.9634882322785763, + "learning_rate": 8.247387166644011e-07, + "loss": 0.2556, + "step": 13041 + }, + { + "epoch": 0.82, + "grad_norm": 8.347372733686193, + "learning_rate": 8.24178467416663e-07, + "loss": 0.2784, + "step": 13042 + }, + { + "epoch": 0.82, + "grad_norm": 3.183040854119145, + "learning_rate": 8.236183914311769e-07, + "loss": 0.2704, + "step": 13043 + }, + { + "epoch": 0.82, + "grad_norm": 5.055340739105419, + "learning_rate": 8.230584887311826e-07, + "loss": 0.2661, + "step": 13044 + }, + { + "epoch": 0.82, + "grad_norm": 1.8134922061435854, + "learning_rate": 8.224987593399098e-07, + "loss": 0.2686, + "step": 13045 + }, + { + "epoch": 0.82, + "grad_norm": 1.811205474303526, + "learning_rate": 8.219392032805846e-07, + "loss": 0.2595, + "step": 13046 + }, + { + "epoch": 0.82, + "grad_norm": 4.203829807712511, + "learning_rate": 8.213798205764228e-07, + "loss": 0.2692, + "step": 13047 + }, + { + "epoch": 0.82, + "grad_norm": 2.807074794904907, + "learning_rate": 8.208206112506329e-07, + "loss": 0.2538, + "step": 13048 + }, + { + "epoch": 0.82, + "grad_norm": 2.5836032842498233, + "learning_rate": 8.202615753264204e-07, + "loss": 0.3067, + "step": 13049 + }, + { + "epoch": 0.82, + "grad_norm": 2.563150443366848, + "learning_rate": 8.19702712826978e-07, + "loss": 0.2932, + "step": 13050 + }, + { + "epoch": 0.82, + "grad_norm": 4.188312959201578, + "learning_rate": 8.191440237754961e-07, + "loss": 0.2533, + "step": 13051 + }, + { + "epoch": 0.82, + "grad_norm": 1.3865614737569423, + "learning_rate": 8.185855081951538e-07, + "loss": 0.2511, + "step": 13052 + }, + { + "epoch": 0.82, + "grad_norm": 1.8887032677106688, + "learning_rate": 8.180271661091266e-07, + "loss": 0.252, + "step": 13053 + }, + { + "epoch": 0.82, + "grad_norm": 3.8215593645014145, + "learning_rate": 8.174689975405809e-07, + "loss": 0.241, + "step": 13054 + }, + { + "epoch": 0.82, + "grad_norm": 3.4799180184625094, + "learning_rate": 8.169110025126747e-07, + "loss": 0.2423, + "step": 13055 + }, + { + "epoch": 0.82, + "grad_norm": 2.0505103696824696, + "learning_rate": 8.163531810485603e-07, + "loss": 0.2702, + "step": 13056 + }, + { + "epoch": 0.82, + "grad_norm": 0.633765709039043, + "learning_rate": 8.157955331713846e-07, + "loss": 0.4643, + "step": 13057 + }, + { + "epoch": 0.82, + "grad_norm": 1.6442694501102397, + "learning_rate": 8.15238058904283e-07, + "loss": 0.2624, + "step": 13058 + }, + { + "epoch": 0.82, + "grad_norm": 2.4342128978916855, + "learning_rate": 8.146807582703886e-07, + "loss": 0.2529, + "step": 13059 + }, + { + "epoch": 0.82, + "grad_norm": 1.573319810450124, + "learning_rate": 8.141236312928219e-07, + "loss": 0.2547, + "step": 13060 + }, + { + "epoch": 0.82, + "grad_norm": 1.7221318023916532, + "learning_rate": 8.135666779947027e-07, + "loss": 0.2413, + "step": 13061 + }, + { + "epoch": 0.82, + "grad_norm": 2.087675320801948, + "learning_rate": 8.13009898399137e-07, + "loss": 0.2439, + "step": 13062 + }, + { + "epoch": 0.82, + "grad_norm": 2.1914829503325612, + "learning_rate": 8.124532925292283e-07, + "loss": 0.2403, + "step": 13063 + }, + { + "epoch": 0.82, + "grad_norm": 2.3682928785855832, + "learning_rate": 8.118968604080684e-07, + "loss": 0.261, + "step": 13064 + }, + { + "epoch": 0.82, + "grad_norm": 2.031624591214938, + "learning_rate": 8.113406020587472e-07, + "loss": 0.2685, + "step": 13065 + }, + { + "epoch": 0.82, + "grad_norm": 2.358590074475766, + "learning_rate": 8.107845175043455e-07, + "loss": 0.272, + "step": 13066 + }, + { + "epoch": 0.82, + "grad_norm": 2.4451503825110303, + "learning_rate": 8.102286067679354e-07, + "loss": 0.2553, + "step": 13067 + }, + { + "epoch": 0.82, + "grad_norm": 2.508759541027896, + "learning_rate": 8.096728698725803e-07, + "loss": 0.2408, + "step": 13068 + }, + { + "epoch": 0.82, + "grad_norm": 2.233493219583581, + "learning_rate": 8.091173068413427e-07, + "loss": 0.2453, + "step": 13069 + }, + { + "epoch": 0.82, + "grad_norm": 1.7521175580643524, + "learning_rate": 8.085619176972715e-07, + "loss": 0.2529, + "step": 13070 + }, + { + "epoch": 0.82, + "grad_norm": 3.9641078041129796, + "learning_rate": 8.080067024634109e-07, + "loss": 0.2446, + "step": 13071 + }, + { + "epoch": 0.82, + "grad_norm": 0.6226189982102704, + "learning_rate": 8.07451661162797e-07, + "loss": 0.469, + "step": 13072 + }, + { + "epoch": 0.82, + "grad_norm": 1.6846918995664295, + "learning_rate": 8.068967938184608e-07, + "loss": 0.2421, + "step": 13073 + }, + { + "epoch": 0.82, + "grad_norm": 2.890165279115881, + "learning_rate": 8.063421004534256e-07, + "loss": 0.2569, + "step": 13074 + }, + { + "epoch": 0.82, + "grad_norm": 1.6052195605657091, + "learning_rate": 8.057875810907051e-07, + "loss": 0.2359, + "step": 13075 + }, + { + "epoch": 0.82, + "grad_norm": 1.615479030906575, + "learning_rate": 8.052332357533066e-07, + "loss": 0.2479, + "step": 13076 + }, + { + "epoch": 0.82, + "grad_norm": 1.929071088557361, + "learning_rate": 8.046790644642327e-07, + "loss": 0.249, + "step": 13077 + }, + { + "epoch": 0.82, + "grad_norm": 2.3559093829198097, + "learning_rate": 8.041250672464768e-07, + "loss": 0.2535, + "step": 13078 + }, + { + "epoch": 0.82, + "grad_norm": 2.4031842626611275, + "learning_rate": 8.035712441230237e-07, + "loss": 0.2788, + "step": 13079 + }, + { + "epoch": 0.82, + "grad_norm": 1.2772086345128029, + "learning_rate": 8.030175951168528e-07, + "loss": 0.2559, + "step": 13080 + }, + { + "epoch": 0.82, + "grad_norm": 0.6154076525591489, + "learning_rate": 8.024641202509365e-07, + "loss": 0.4888, + "step": 13081 + }, + { + "epoch": 0.82, + "grad_norm": 1.9684861077151803, + "learning_rate": 8.019108195482406e-07, + "loss": 0.2489, + "step": 13082 + }, + { + "epoch": 0.82, + "grad_norm": 2.1455532813566687, + "learning_rate": 8.013576930317218e-07, + "loss": 0.2679, + "step": 13083 + }, + { + "epoch": 0.82, + "grad_norm": 1.9666094734323385, + "learning_rate": 8.008047407243285e-07, + "loss": 0.2671, + "step": 13084 + }, + { + "epoch": 0.82, + "grad_norm": 4.4877253216716495, + "learning_rate": 8.002519626490063e-07, + "loss": 0.2542, + "step": 13085 + }, + { + "epoch": 0.82, + "grad_norm": 2.1984824133815075, + "learning_rate": 7.996993588286894e-07, + "loss": 0.251, + "step": 13086 + }, + { + "epoch": 0.82, + "grad_norm": 1.715523572971109, + "learning_rate": 7.991469292863058e-07, + "loss": 0.2515, + "step": 13087 + }, + { + "epoch": 0.82, + "grad_norm": 2.76257404545504, + "learning_rate": 7.985946740447792e-07, + "loss": 0.2559, + "step": 13088 + }, + { + "epoch": 0.82, + "grad_norm": 2.7938664839444995, + "learning_rate": 7.980425931270203e-07, + "loss": 0.2837, + "step": 13089 + }, + { + "epoch": 0.82, + "grad_norm": 1.3795535571582775, + "learning_rate": 7.974906865559396e-07, + "loss": 0.2578, + "step": 13090 + }, + { + "epoch": 0.82, + "grad_norm": 0.5783046593675997, + "learning_rate": 7.969389543544343e-07, + "loss": 0.4643, + "step": 13091 + }, + { + "epoch": 0.82, + "grad_norm": 2.0700533399711567, + "learning_rate": 7.963873965453961e-07, + "loss": 0.259, + "step": 13092 + }, + { + "epoch": 0.82, + "grad_norm": 1.932886429506239, + "learning_rate": 7.958360131517123e-07, + "loss": 0.2496, + "step": 13093 + }, + { + "epoch": 0.82, + "grad_norm": 1.9467870566710295, + "learning_rate": 7.952848041962601e-07, + "loss": 0.2563, + "step": 13094 + }, + { + "epoch": 0.82, + "grad_norm": 1.5904858129912447, + "learning_rate": 7.947337697019087e-07, + "loss": 0.2476, + "step": 13095 + }, + { + "epoch": 0.82, + "grad_norm": 1.7050276569152154, + "learning_rate": 7.941829096915237e-07, + "loss": 0.2524, + "step": 13096 + }, + { + "epoch": 0.82, + "grad_norm": 2.9404042332245237, + "learning_rate": 7.936322241879585e-07, + "loss": 0.2556, + "step": 13097 + }, + { + "epoch": 0.82, + "grad_norm": 1.4866373699453335, + "learning_rate": 7.930817132140656e-07, + "loss": 0.2457, + "step": 13098 + }, + { + "epoch": 0.82, + "grad_norm": 1.7794633701255003, + "learning_rate": 7.92531376792684e-07, + "loss": 0.2714, + "step": 13099 + }, + { + "epoch": 0.82, + "grad_norm": 3.492381490675387, + "learning_rate": 7.919812149466483e-07, + "loss": 0.2514, + "step": 13100 + }, + { + "epoch": 0.82, + "grad_norm": 2.3275529548383482, + "learning_rate": 7.914312276987873e-07, + "loss": 0.2514, + "step": 13101 + }, + { + "epoch": 0.82, + "grad_norm": 4.607591987618546, + "learning_rate": 7.908814150719185e-07, + "loss": 0.2561, + "step": 13102 + }, + { + "epoch": 0.82, + "grad_norm": 2.048912563356653, + "learning_rate": 7.903317770888574e-07, + "loss": 0.2599, + "step": 13103 + }, + { + "epoch": 0.82, + "grad_norm": 4.764661385919167, + "learning_rate": 7.897823137724081e-07, + "loss": 0.2659, + "step": 13104 + }, + { + "epoch": 0.82, + "grad_norm": 4.161276809576672, + "learning_rate": 7.892330251453672e-07, + "loss": 0.2393, + "step": 13105 + }, + { + "epoch": 0.82, + "grad_norm": 0.6182133375116967, + "learning_rate": 7.886839112305288e-07, + "loss": 0.4546, + "step": 13106 + }, + { + "epoch": 0.82, + "grad_norm": 1.9272856699175505, + "learning_rate": 7.881349720506754e-07, + "loss": 0.2671, + "step": 13107 + }, + { + "epoch": 0.82, + "grad_norm": 5.285158660255171, + "learning_rate": 7.875862076285812e-07, + "loss": 0.287, + "step": 13108 + }, + { + "epoch": 0.82, + "grad_norm": 1.6366069699449706, + "learning_rate": 7.870376179870187e-07, + "loss": 0.238, + "step": 13109 + }, + { + "epoch": 0.82, + "grad_norm": 23.590969575871426, + "learning_rate": 7.864892031487476e-07, + "loss": 0.2653, + "step": 13110 + }, + { + "epoch": 0.82, + "grad_norm": 3.009571809747192, + "learning_rate": 7.859409631365245e-07, + "loss": 0.2476, + "step": 13111 + }, + { + "epoch": 0.82, + "grad_norm": 13.366996876037401, + "learning_rate": 7.853928979730962e-07, + "loss": 0.2549, + "step": 13112 + }, + { + "epoch": 0.82, + "grad_norm": 3.0365944811854693, + "learning_rate": 7.848450076812008e-07, + "loss": 0.2719, + "step": 13113 + }, + { + "epoch": 0.82, + "grad_norm": 2.413728089958753, + "learning_rate": 7.842972922835745e-07, + "loss": 0.2596, + "step": 13114 + }, + { + "epoch": 0.82, + "grad_norm": 1.4686232985740932, + "learning_rate": 7.837497518029419e-07, + "loss": 0.2396, + "step": 13115 + }, + { + "epoch": 0.82, + "grad_norm": 1.712527937302349, + "learning_rate": 7.832023862620192e-07, + "loss": 0.2501, + "step": 13116 + }, + { + "epoch": 0.82, + "grad_norm": 1.7211641572292184, + "learning_rate": 7.826551956835193e-07, + "loss": 0.2667, + "step": 13117 + }, + { + "epoch": 0.82, + "grad_norm": 1.8893581009564415, + "learning_rate": 7.821081800901475e-07, + "loss": 0.2509, + "step": 13118 + }, + { + "epoch": 0.83, + "grad_norm": 1.6012426350371194, + "learning_rate": 7.815613395045996e-07, + "loss": 0.2522, + "step": 13119 + }, + { + "epoch": 0.83, + "grad_norm": 3.0291694474029827, + "learning_rate": 7.810146739495638e-07, + "loss": 0.2445, + "step": 13120 + }, + { + "epoch": 0.83, + "grad_norm": 1.7227236330250217, + "learning_rate": 7.804681834477223e-07, + "loss": 0.2576, + "step": 13121 + }, + { + "epoch": 0.83, + "grad_norm": 0.5701982959971131, + "learning_rate": 7.799218680217513e-07, + "loss": 0.4314, + "step": 13122 + }, + { + "epoch": 0.83, + "grad_norm": 2.602939963796262, + "learning_rate": 7.793757276943181e-07, + "loss": 0.2579, + "step": 13123 + }, + { + "epoch": 0.83, + "grad_norm": 1.744144377902834, + "learning_rate": 7.788297624880814e-07, + "loss": 0.2362, + "step": 13124 + }, + { + "epoch": 0.83, + "grad_norm": 2.3946643028272416, + "learning_rate": 7.782839724256952e-07, + "loss": 0.2598, + "step": 13125 + }, + { + "epoch": 0.83, + "grad_norm": 3.4460012789736063, + "learning_rate": 7.777383575298069e-07, + "loss": 0.2554, + "step": 13126 + }, + { + "epoch": 0.83, + "grad_norm": 2.449953303242268, + "learning_rate": 7.771929178230542e-07, + "loss": 0.2568, + "step": 13127 + }, + { + "epoch": 0.83, + "grad_norm": 2.4017882233989867, + "learning_rate": 7.766476533280659e-07, + "loss": 0.273, + "step": 13128 + }, + { + "epoch": 0.83, + "grad_norm": 2.3111915189622882, + "learning_rate": 7.761025640674696e-07, + "loss": 0.2588, + "step": 13129 + }, + { + "epoch": 0.83, + "grad_norm": 5.916331245414365, + "learning_rate": 7.755576500638806e-07, + "loss": 0.273, + "step": 13130 + }, + { + "epoch": 0.83, + "grad_norm": 4.460799444511176, + "learning_rate": 7.75012911339908e-07, + "loss": 0.2489, + "step": 13131 + }, + { + "epoch": 0.83, + "grad_norm": 2.0456496495781993, + "learning_rate": 7.74468347918153e-07, + "loss": 0.2764, + "step": 13132 + }, + { + "epoch": 0.83, + "grad_norm": 1.6935012493677806, + "learning_rate": 7.739239598212111e-07, + "loss": 0.2417, + "step": 13133 + }, + { + "epoch": 0.83, + "grad_norm": 3.1028167821480643, + "learning_rate": 7.733797470716725e-07, + "loss": 0.2647, + "step": 13134 + }, + { + "epoch": 0.83, + "grad_norm": 1.4120097639929552, + "learning_rate": 7.728357096921152e-07, + "loss": 0.2461, + "step": 13135 + }, + { + "epoch": 0.83, + "grad_norm": 1.8307900217480282, + "learning_rate": 7.722918477051112e-07, + "loss": 0.2509, + "step": 13136 + }, + { + "epoch": 0.83, + "grad_norm": 14.174653470847996, + "learning_rate": 7.717481611332289e-07, + "loss": 0.2601, + "step": 13137 + }, + { + "epoch": 0.83, + "grad_norm": 2.6613766491342608, + "learning_rate": 7.712046499990255e-07, + "loss": 0.2677, + "step": 13138 + }, + { + "epoch": 0.83, + "grad_norm": 5.22315979333206, + "learning_rate": 7.706613143250524e-07, + "loss": 0.2511, + "step": 13139 + }, + { + "epoch": 0.83, + "grad_norm": 1.6642103270753872, + "learning_rate": 7.701181541338526e-07, + "loss": 0.259, + "step": 13140 + }, + { + "epoch": 0.83, + "grad_norm": 7.5889709885115195, + "learning_rate": 7.695751694479636e-07, + "loss": 0.2491, + "step": 13141 + }, + { + "epoch": 0.83, + "grad_norm": 2.247138147154396, + "learning_rate": 7.690323602899163e-07, + "loss": 0.2498, + "step": 13142 + }, + { + "epoch": 0.83, + "grad_norm": 2.5433199558546358, + "learning_rate": 7.684897266822311e-07, + "loss": 0.2634, + "step": 13143 + }, + { + "epoch": 0.83, + "grad_norm": 1.9310444110559235, + "learning_rate": 7.67947268647422e-07, + "loss": 0.2526, + "step": 13144 + }, + { + "epoch": 0.83, + "grad_norm": 2.2473337510819267, + "learning_rate": 7.67404986207999e-07, + "loss": 0.2774, + "step": 13145 + }, + { + "epoch": 0.83, + "grad_norm": 1.7584321078236875, + "learning_rate": 7.668628793864607e-07, + "loss": 0.2491, + "step": 13146 + }, + { + "epoch": 0.83, + "grad_norm": 2.3861916248282093, + "learning_rate": 7.663209482052997e-07, + "loss": 0.2442, + "step": 13147 + }, + { + "epoch": 0.83, + "grad_norm": 2.506798592293803, + "learning_rate": 7.657791926870034e-07, + "loss": 0.2767, + "step": 13148 + }, + { + "epoch": 0.83, + "grad_norm": 1.35585626017092, + "learning_rate": 7.652376128540478e-07, + "loss": 0.2345, + "step": 13149 + }, + { + "epoch": 0.83, + "grad_norm": 1.4655352478852601, + "learning_rate": 7.646962087289073e-07, + "loss": 0.2452, + "step": 13150 + }, + { + "epoch": 0.83, + "grad_norm": 0.5919262894747134, + "learning_rate": 7.641549803340431e-07, + "loss": 0.5069, + "step": 13151 + }, + { + "epoch": 0.83, + "grad_norm": 1.4528374372651491, + "learning_rate": 7.636139276919119e-07, + "loss": 0.2492, + "step": 13152 + }, + { + "epoch": 0.83, + "grad_norm": 3.2757614043041645, + "learning_rate": 7.630730508249639e-07, + "loss": 0.2659, + "step": 13153 + }, + { + "epoch": 0.83, + "grad_norm": 1.702400540208499, + "learning_rate": 7.625323497556414e-07, + "loss": 0.2468, + "step": 13154 + }, + { + "epoch": 0.83, + "grad_norm": 2.078352471027836, + "learning_rate": 7.619918245063768e-07, + "loss": 0.2396, + "step": 13155 + }, + { + "epoch": 0.83, + "grad_norm": 2.5552855032131907, + "learning_rate": 7.614514750996005e-07, + "loss": 0.2879, + "step": 13156 + }, + { + "epoch": 0.83, + "grad_norm": 1.6837207980809006, + "learning_rate": 7.609113015577291e-07, + "loss": 0.2469, + "step": 13157 + }, + { + "epoch": 0.83, + "grad_norm": 2.518276060545688, + "learning_rate": 7.60371303903179e-07, + "loss": 0.2697, + "step": 13158 + }, + { + "epoch": 0.83, + "grad_norm": 6.810011764493329, + "learning_rate": 7.598314821583542e-07, + "loss": 0.26, + "step": 13159 + }, + { + "epoch": 0.83, + "grad_norm": 3.5868077906950226, + "learning_rate": 7.592918363456509e-07, + "loss": 0.2648, + "step": 13160 + }, + { + "epoch": 0.83, + "grad_norm": 11.504520961715231, + "learning_rate": 7.58752366487463e-07, + "loss": 0.2546, + "step": 13161 + }, + { + "epoch": 0.83, + "grad_norm": 2.1614706095236045, + "learning_rate": 7.58213072606171e-07, + "loss": 0.2611, + "step": 13162 + }, + { + "epoch": 0.83, + "grad_norm": 2.6102353747156255, + "learning_rate": 7.576739547241546e-07, + "loss": 0.2494, + "step": 13163 + }, + { + "epoch": 0.83, + "grad_norm": 2.548698086483542, + "learning_rate": 7.571350128637811e-07, + "loss": 0.2656, + "step": 13164 + }, + { + "epoch": 0.83, + "grad_norm": 1.954585837576112, + "learning_rate": 7.565962470474109e-07, + "loss": 0.2697, + "step": 13165 + }, + { + "epoch": 0.83, + "grad_norm": 1.8921314004904175, + "learning_rate": 7.560576572974004e-07, + "loss": 0.2626, + "step": 13166 + }, + { + "epoch": 0.83, + "grad_norm": 1.425673724714399, + "learning_rate": 7.555192436360958e-07, + "loss": 0.2373, + "step": 13167 + }, + { + "epoch": 0.83, + "grad_norm": 2.1670183969758345, + "learning_rate": 7.549810060858359e-07, + "loss": 0.2578, + "step": 13168 + }, + { + "epoch": 0.83, + "grad_norm": 1.4958604366379131, + "learning_rate": 7.544429446689544e-07, + "loss": 0.2669, + "step": 13169 + }, + { + "epoch": 0.83, + "grad_norm": 2.2188583707750205, + "learning_rate": 7.539050594077768e-07, + "loss": 0.2569, + "step": 13170 + }, + { + "epoch": 0.83, + "grad_norm": 2.5344431709143387, + "learning_rate": 7.533673503246203e-07, + "loss": 0.2821, + "step": 13171 + }, + { + "epoch": 0.83, + "grad_norm": 2.3546647669187095, + "learning_rate": 7.528298174417953e-07, + "loss": 0.2604, + "step": 13172 + }, + { + "epoch": 0.83, + "grad_norm": 1.2383655837220653, + "learning_rate": 7.522924607816034e-07, + "loss": 0.2359, + "step": 13173 + }, + { + "epoch": 0.83, + "grad_norm": 3.26240150075405, + "learning_rate": 7.517552803663441e-07, + "loss": 0.2444, + "step": 13174 + }, + { + "epoch": 0.83, + "grad_norm": 1.8469803590004534, + "learning_rate": 7.512182762183035e-07, + "loss": 0.2438, + "step": 13175 + }, + { + "epoch": 0.83, + "grad_norm": 2.8829531334783365, + "learning_rate": 7.506814483597619e-07, + "loss": 0.2579, + "step": 13176 + }, + { + "epoch": 0.83, + "grad_norm": 1.7580614740710847, + "learning_rate": 7.501447968129949e-07, + "loss": 0.2526, + "step": 13177 + }, + { + "epoch": 0.83, + "grad_norm": 2.3951182972511615, + "learning_rate": 7.496083216002703e-07, + "loss": 0.2467, + "step": 13178 + }, + { + "epoch": 0.83, + "grad_norm": 1.5016522555526541, + "learning_rate": 7.490720227438453e-07, + "loss": 0.2339, + "step": 13179 + }, + { + "epoch": 0.83, + "grad_norm": 7.336968063304557, + "learning_rate": 7.485359002659731e-07, + "loss": 0.2534, + "step": 13180 + }, + { + "epoch": 0.83, + "grad_norm": 1.7470124960417608, + "learning_rate": 7.479999541888966e-07, + "loss": 0.2551, + "step": 13181 + }, + { + "epoch": 0.83, + "grad_norm": 7.100544671494195, + "learning_rate": 7.474641845348557e-07, + "loss": 0.2671, + "step": 13182 + }, + { + "epoch": 0.83, + "grad_norm": 2.7244243275115387, + "learning_rate": 7.469285913260787e-07, + "loss": 0.2454, + "step": 13183 + }, + { + "epoch": 0.83, + "grad_norm": 2.3512624149281267, + "learning_rate": 7.46393174584788e-07, + "loss": 0.2696, + "step": 13184 + }, + { + "epoch": 0.83, + "grad_norm": 2.814366230180223, + "learning_rate": 7.458579343331996e-07, + "loss": 0.261, + "step": 13185 + }, + { + "epoch": 0.83, + "grad_norm": 1.7616757227815985, + "learning_rate": 7.453228705935228e-07, + "loss": 0.2757, + "step": 13186 + }, + { + "epoch": 0.83, + "grad_norm": 2.1225086198030043, + "learning_rate": 7.447879833879574e-07, + "loss": 0.2778, + "step": 13187 + }, + { + "epoch": 0.83, + "grad_norm": 1.559266608611239, + "learning_rate": 7.442532727386964e-07, + "loss": 0.2539, + "step": 13188 + }, + { + "epoch": 0.83, + "grad_norm": 2.19378700486509, + "learning_rate": 7.437187386679252e-07, + "loss": 0.2413, + "step": 13189 + }, + { + "epoch": 0.83, + "grad_norm": 3.0478588030052065, + "learning_rate": 7.431843811978246e-07, + "loss": 0.253, + "step": 13190 + }, + { + "epoch": 0.83, + "grad_norm": 2.267341315514969, + "learning_rate": 7.426502003505653e-07, + "loss": 0.2662, + "step": 13191 + }, + { + "epoch": 0.83, + "grad_norm": 1.3898595182236768, + "learning_rate": 7.421161961483098e-07, + "loss": 0.2508, + "step": 13192 + }, + { + "epoch": 0.83, + "grad_norm": 1.727880332474302, + "learning_rate": 7.415823686132162e-07, + "loss": 0.2531, + "step": 13193 + }, + { + "epoch": 0.83, + "grad_norm": 0.6029757269574015, + "learning_rate": 7.410487177674347e-07, + "loss": 0.4658, + "step": 13194 + }, + { + "epoch": 0.83, + "grad_norm": 1.4805363682229404, + "learning_rate": 7.40515243633107e-07, + "loss": 0.2562, + "step": 13195 + }, + { + "epoch": 0.83, + "grad_norm": 1.652345291347373, + "learning_rate": 7.399819462323677e-07, + "loss": 0.2593, + "step": 13196 + }, + { + "epoch": 0.83, + "grad_norm": 2.792579656590822, + "learning_rate": 7.394488255873422e-07, + "loss": 0.2517, + "step": 13197 + }, + { + "epoch": 0.83, + "grad_norm": 1.647559192520862, + "learning_rate": 7.389158817201541e-07, + "loss": 0.2488, + "step": 13198 + }, + { + "epoch": 0.83, + "grad_norm": 2.3104969639439124, + "learning_rate": 7.383831146529136e-07, + "loss": 0.241, + "step": 13199 + }, + { + "epoch": 0.83, + "grad_norm": 1.7519835733076743, + "learning_rate": 7.378505244077283e-07, + "loss": 0.261, + "step": 13200 + }, + { + "epoch": 0.83, + "grad_norm": 1.9419281616050887, + "learning_rate": 7.373181110066941e-07, + "loss": 0.2629, + "step": 13201 + }, + { + "epoch": 0.83, + "grad_norm": 2.056614347365575, + "learning_rate": 7.367858744719036e-07, + "loss": 0.2662, + "step": 13202 + }, + { + "epoch": 0.83, + "grad_norm": 4.19721152563661, + "learning_rate": 7.3625381482544e-07, + "loss": 0.263, + "step": 13203 + }, + { + "epoch": 0.83, + "grad_norm": 1.3233526406143272, + "learning_rate": 7.357219320893789e-07, + "loss": 0.2509, + "step": 13204 + }, + { + "epoch": 0.83, + "grad_norm": 1.899407420166879, + "learning_rate": 7.351902262857874e-07, + "loss": 0.265, + "step": 13205 + }, + { + "epoch": 0.83, + "grad_norm": 2.4408411128453946, + "learning_rate": 7.346586974367298e-07, + "loss": 0.2658, + "step": 13206 + }, + { + "epoch": 0.83, + "grad_norm": 1.8495275953807058, + "learning_rate": 7.341273455642577e-07, + "loss": 0.2699, + "step": 13207 + }, + { + "epoch": 0.83, + "grad_norm": 1.9123545368181403, + "learning_rate": 7.33596170690421e-07, + "loss": 0.2567, + "step": 13208 + }, + { + "epoch": 0.83, + "grad_norm": 2.2028353397593383, + "learning_rate": 7.330651728372551e-07, + "loss": 0.264, + "step": 13209 + }, + { + "epoch": 0.83, + "grad_norm": 1.7664593579300796, + "learning_rate": 7.325343520267957e-07, + "loss": 0.2734, + "step": 13210 + }, + { + "epoch": 0.83, + "grad_norm": 2.070524073509291, + "learning_rate": 7.32003708281066e-07, + "loss": 0.2561, + "step": 13211 + }, + { + "epoch": 0.83, + "grad_norm": 4.088478926229157, + "learning_rate": 7.314732416220821e-07, + "loss": 0.2613, + "step": 13212 + }, + { + "epoch": 0.83, + "grad_norm": 1.8787847196137988, + "learning_rate": 7.309429520718558e-07, + "loss": 0.2474, + "step": 13213 + }, + { + "epoch": 0.83, + "grad_norm": 2.271709088883474, + "learning_rate": 7.304128396523885e-07, + "loss": 0.2565, + "step": 13214 + }, + { + "epoch": 0.83, + "grad_norm": 1.2318000708085362, + "learning_rate": 7.29882904385677e-07, + "loss": 0.281, + "step": 13215 + }, + { + "epoch": 0.83, + "grad_norm": 2.775665621712501, + "learning_rate": 7.293531462937087e-07, + "loss": 0.2556, + "step": 13216 + }, + { + "epoch": 0.83, + "grad_norm": 1.339480552195945, + "learning_rate": 7.288235653984621e-07, + "loss": 0.2503, + "step": 13217 + }, + { + "epoch": 0.83, + "grad_norm": 1.643151832930828, + "learning_rate": 7.282941617219142e-07, + "loss": 0.2349, + "step": 13218 + }, + { + "epoch": 0.83, + "grad_norm": 1.345094487632265, + "learning_rate": 7.277649352860283e-07, + "loss": 0.2469, + "step": 13219 + }, + { + "epoch": 0.83, + "grad_norm": 2.1513817184507706, + "learning_rate": 7.272358861127626e-07, + "loss": 0.2731, + "step": 13220 + }, + { + "epoch": 0.83, + "grad_norm": 1.6633674018639215, + "learning_rate": 7.267070142240706e-07, + "loss": 0.2499, + "step": 13221 + }, + { + "epoch": 0.83, + "grad_norm": 2.9642687538246135, + "learning_rate": 7.261783196418937e-07, + "loss": 0.2829, + "step": 13222 + }, + { + "epoch": 0.83, + "grad_norm": 1.9624608906456469, + "learning_rate": 7.256498023881703e-07, + "loss": 0.2467, + "step": 13223 + }, + { + "epoch": 0.83, + "grad_norm": 1.484682183252951, + "learning_rate": 7.251214624848285e-07, + "loss": 0.2586, + "step": 13224 + }, + { + "epoch": 0.83, + "grad_norm": 1.9693426805083505, + "learning_rate": 7.245932999537897e-07, + "loss": 0.2455, + "step": 13225 + }, + { + "epoch": 0.83, + "grad_norm": 2.792972748711006, + "learning_rate": 7.240653148169696e-07, + "loss": 0.2732, + "step": 13226 + }, + { + "epoch": 0.83, + "grad_norm": 2.354445537378943, + "learning_rate": 7.235375070962747e-07, + "loss": 0.2671, + "step": 13227 + }, + { + "epoch": 0.83, + "grad_norm": 9.647517482193596, + "learning_rate": 7.230098768136035e-07, + "loss": 0.2676, + "step": 13228 + }, + { + "epoch": 0.83, + "grad_norm": 2.466791352935791, + "learning_rate": 7.22482423990849e-07, + "loss": 0.2462, + "step": 13229 + }, + { + "epoch": 0.83, + "grad_norm": 3.4521212964525203, + "learning_rate": 7.219551486498976e-07, + "loss": 0.2588, + "step": 13230 + }, + { + "epoch": 0.83, + "grad_norm": 1.8064537812046055, + "learning_rate": 7.214280508126259e-07, + "loss": 0.2543, + "step": 13231 + }, + { + "epoch": 0.83, + "grad_norm": 5.374520532959189, + "learning_rate": 7.20901130500904e-07, + "loss": 0.2537, + "step": 13232 + }, + { + "epoch": 0.83, + "grad_norm": 3.0395352338624595, + "learning_rate": 7.203743877365938e-07, + "loss": 0.2387, + "step": 13233 + }, + { + "epoch": 0.83, + "grad_norm": 2.892245557213895, + "learning_rate": 7.198478225415522e-07, + "loss": 0.2541, + "step": 13234 + }, + { + "epoch": 0.83, + "grad_norm": 3.7168205865884385, + "learning_rate": 7.193214349376276e-07, + "loss": 0.2518, + "step": 13235 + }, + { + "epoch": 0.83, + "grad_norm": 1.6872993447475668, + "learning_rate": 7.187952249466585e-07, + "loss": 0.261, + "step": 13236 + }, + { + "epoch": 0.83, + "grad_norm": 2.214984353252086, + "learning_rate": 7.182691925904795e-07, + "loss": 0.2493, + "step": 13237 + }, + { + "epoch": 0.83, + "grad_norm": 1.7230628565198083, + "learning_rate": 7.177433378909183e-07, + "loss": 0.2305, + "step": 13238 + }, + { + "epoch": 0.83, + "grad_norm": 2.2796634006638157, + "learning_rate": 7.172176608697923e-07, + "loss": 0.2675, + "step": 13239 + }, + { + "epoch": 0.83, + "grad_norm": 3.098432726002284, + "learning_rate": 7.166921615489125e-07, + "loss": 0.2395, + "step": 13240 + }, + { + "epoch": 0.83, + "grad_norm": 1.9166388897722202, + "learning_rate": 7.16166839950082e-07, + "loss": 0.248, + "step": 13241 + }, + { + "epoch": 0.83, + "grad_norm": 2.1520564944718763, + "learning_rate": 7.156416960950991e-07, + "loss": 0.2768, + "step": 13242 + }, + { + "epoch": 0.83, + "grad_norm": 1.8871636579136695, + "learning_rate": 7.15116730005752e-07, + "loss": 0.2446, + "step": 13243 + }, + { + "epoch": 0.83, + "grad_norm": 2.098539015704749, + "learning_rate": 7.145919417038216e-07, + "loss": 0.2442, + "step": 13244 + }, + { + "epoch": 0.83, + "grad_norm": 1.5020532192567475, + "learning_rate": 7.140673312110835e-07, + "loss": 0.2506, + "step": 13245 + }, + { + "epoch": 0.83, + "grad_norm": 1.376122086608251, + "learning_rate": 7.135428985493053e-07, + "loss": 0.2538, + "step": 13246 + }, + { + "epoch": 0.83, + "grad_norm": 1.8465195022234036, + "learning_rate": 7.130186437402464e-07, + "loss": 0.2473, + "step": 13247 + }, + { + "epoch": 0.83, + "grad_norm": 1.438491086285166, + "learning_rate": 7.124945668056577e-07, + "loss": 0.2552, + "step": 13248 + }, + { + "epoch": 0.83, + "grad_norm": 1.513353316282746, + "learning_rate": 7.119706677672844e-07, + "loss": 0.2412, + "step": 13249 + }, + { + "epoch": 0.83, + "grad_norm": 17.7935000376769, + "learning_rate": 7.114469466468654e-07, + "loss": 0.2554, + "step": 13250 + }, + { + "epoch": 0.83, + "grad_norm": 0.6543348206489981, + "learning_rate": 7.109234034661288e-07, + "loss": 0.474, + "step": 13251 + }, + { + "epoch": 0.83, + "grad_norm": 1.700080862271829, + "learning_rate": 7.104000382467996e-07, + "loss": 0.2575, + "step": 13252 + }, + { + "epoch": 0.83, + "grad_norm": 2.3586833265131317, + "learning_rate": 7.098768510105908e-07, + "loss": 0.253, + "step": 13253 + }, + { + "epoch": 0.83, + "grad_norm": 1.6050639256172698, + "learning_rate": 7.093538417792128e-07, + "loss": 0.2639, + "step": 13254 + }, + { + "epoch": 0.83, + "grad_norm": 2.064275598744791, + "learning_rate": 7.088310105743645e-07, + "loss": 0.2479, + "step": 13255 + }, + { + "epoch": 0.83, + "grad_norm": 0.6008283946217221, + "learning_rate": 7.083083574177402e-07, + "loss": 0.4699, + "step": 13256 + }, + { + "epoch": 0.83, + "grad_norm": 4.864594186193576, + "learning_rate": 7.077858823310235e-07, + "loss": 0.2373, + "step": 13257 + }, + { + "epoch": 0.83, + "grad_norm": 1.4337804645080292, + "learning_rate": 7.072635853358956e-07, + "loss": 0.227, + "step": 13258 + }, + { + "epoch": 0.83, + "grad_norm": 2.7580940825618323, + "learning_rate": 7.067414664540251e-07, + "loss": 0.252, + "step": 13259 + }, + { + "epoch": 0.83, + "grad_norm": 1.630406533128059, + "learning_rate": 7.062195257070781e-07, + "loss": 0.245, + "step": 13260 + }, + { + "epoch": 0.83, + "grad_norm": 1.5015015359620418, + "learning_rate": 7.056977631167083e-07, + "loss": 0.2495, + "step": 13261 + }, + { + "epoch": 0.83, + "grad_norm": 1.3854198484063311, + "learning_rate": 7.051761787045674e-07, + "loss": 0.2632, + "step": 13262 + }, + { + "epoch": 0.83, + "grad_norm": 1.8801030117422544, + "learning_rate": 7.046547724922953e-07, + "loss": 0.2759, + "step": 13263 + }, + { + "epoch": 0.83, + "grad_norm": 1.8163154524820007, + "learning_rate": 7.041335445015258e-07, + "loss": 0.2415, + "step": 13264 + }, + { + "epoch": 0.83, + "grad_norm": 2.217397588848267, + "learning_rate": 7.036124947538847e-07, + "loss": 0.2631, + "step": 13265 + }, + { + "epoch": 0.83, + "grad_norm": 1.6033136688166005, + "learning_rate": 7.030916232709922e-07, + "loss": 0.2685, + "step": 13266 + }, + { + "epoch": 0.83, + "grad_norm": 0.6105248645404295, + "learning_rate": 7.025709300744621e-07, + "loss": 0.488, + "step": 13267 + }, + { + "epoch": 0.83, + "grad_norm": 3.035529685884794, + "learning_rate": 7.020504151858965e-07, + "loss": 0.2497, + "step": 13268 + }, + { + "epoch": 0.83, + "grad_norm": 1.5992868409870489, + "learning_rate": 7.015300786268925e-07, + "loss": 0.2444, + "step": 13269 + }, + { + "epoch": 0.83, + "grad_norm": 4.169102854967177, + "learning_rate": 7.010099204190418e-07, + "loss": 0.2435, + "step": 13270 + }, + { + "epoch": 0.83, + "grad_norm": 2.972304788053702, + "learning_rate": 7.00489940583925e-07, + "loss": 0.2737, + "step": 13271 + }, + { + "epoch": 0.83, + "grad_norm": 0.5843583323031228, + "learning_rate": 6.999701391431174e-07, + "loss": 0.4534, + "step": 13272 + }, + { + "epoch": 0.83, + "grad_norm": 2.4708934859929377, + "learning_rate": 6.994505161181859e-07, + "loss": 0.2625, + "step": 13273 + }, + { + "epoch": 0.83, + "grad_norm": 2.672414264833315, + "learning_rate": 6.989310715306902e-07, + "loss": 0.2716, + "step": 13274 + }, + { + "epoch": 0.83, + "grad_norm": 1.8924993545687268, + "learning_rate": 6.984118054021854e-07, + "loss": 0.2454, + "step": 13275 + }, + { + "epoch": 0.83, + "grad_norm": 2.939114888197796, + "learning_rate": 6.978927177542156e-07, + "loss": 0.2576, + "step": 13276 + }, + { + "epoch": 0.83, + "grad_norm": 3.333441468598719, + "learning_rate": 6.973738086083171e-07, + "loss": 0.2646, + "step": 13277 + }, + { + "epoch": 0.84, + "grad_norm": 2.4711775409493217, + "learning_rate": 6.968550779860222e-07, + "loss": 0.2591, + "step": 13278 + }, + { + "epoch": 0.84, + "grad_norm": 1.5540539054006046, + "learning_rate": 6.963365259088539e-07, + "loss": 0.2614, + "step": 13279 + }, + { + "epoch": 0.84, + "grad_norm": 2.0696361553775606, + "learning_rate": 6.958181523983276e-07, + "loss": 0.23, + "step": 13280 + }, + { + "epoch": 0.84, + "grad_norm": 1.8755504916308086, + "learning_rate": 6.952999574759483e-07, + "loss": 0.2683, + "step": 13281 + }, + { + "epoch": 0.84, + "grad_norm": 3.721954281596013, + "learning_rate": 6.947819411632223e-07, + "loss": 0.2467, + "step": 13282 + }, + { + "epoch": 0.84, + "grad_norm": 6.834870733049077, + "learning_rate": 6.942641034816405e-07, + "loss": 0.2611, + "step": 13283 + }, + { + "epoch": 0.84, + "grad_norm": 2.995436936193978, + "learning_rate": 6.937464444526887e-07, + "loss": 0.2544, + "step": 13284 + }, + { + "epoch": 0.84, + "grad_norm": 1.8579390339059376, + "learning_rate": 6.932289640978446e-07, + "loss": 0.2307, + "step": 13285 + }, + { + "epoch": 0.84, + "grad_norm": 2.5764521190366008, + "learning_rate": 6.927116624385821e-07, + "loss": 0.2841, + "step": 13286 + }, + { + "epoch": 0.84, + "grad_norm": 2.2031118651098205, + "learning_rate": 6.92194539496363e-07, + "loss": 0.2812, + "step": 13287 + }, + { + "epoch": 0.84, + "grad_norm": 1.8542181699164357, + "learning_rate": 6.916775952926425e-07, + "loss": 0.2473, + "step": 13288 + }, + { + "epoch": 0.84, + "grad_norm": 2.453543301183245, + "learning_rate": 6.911608298488715e-07, + "loss": 0.2425, + "step": 13289 + }, + { + "epoch": 0.84, + "grad_norm": 2.2483628857390707, + "learning_rate": 6.906442431864919e-07, + "loss": 0.2574, + "step": 13290 + }, + { + "epoch": 0.84, + "grad_norm": 2.43022301675465, + "learning_rate": 6.90127835326937e-07, + "loss": 0.2297, + "step": 13291 + }, + { + "epoch": 0.84, + "grad_norm": 2.1328799968395646, + "learning_rate": 6.896116062916335e-07, + "loss": 0.2452, + "step": 13292 + }, + { + "epoch": 0.84, + "grad_norm": 2.2169989321064327, + "learning_rate": 6.89095556101999e-07, + "loss": 0.2653, + "step": 13293 + }, + { + "epoch": 0.84, + "grad_norm": 1.4787048257489264, + "learning_rate": 6.885796847794485e-07, + "loss": 0.2352, + "step": 13294 + }, + { + "epoch": 0.84, + "grad_norm": 1.5187295195723474, + "learning_rate": 6.880639923453846e-07, + "loss": 0.2444, + "step": 13295 + }, + { + "epoch": 0.84, + "grad_norm": 2.0944255461533845, + "learning_rate": 6.875484788212039e-07, + "loss": 0.2459, + "step": 13296 + }, + { + "epoch": 0.84, + "grad_norm": 1.7562433286035743, + "learning_rate": 6.870331442282957e-07, + "loss": 0.2441, + "step": 13297 + }, + { + "epoch": 0.84, + "grad_norm": 2.1571936644591587, + "learning_rate": 6.865179885880446e-07, + "loss": 0.2726, + "step": 13298 + }, + { + "epoch": 0.84, + "grad_norm": 1.8982898554061316, + "learning_rate": 6.860030119218241e-07, + "loss": 0.2473, + "step": 13299 + }, + { + "epoch": 0.84, + "grad_norm": 2.7195865458687813, + "learning_rate": 6.854882142510011e-07, + "loss": 0.2874, + "step": 13300 + }, + { + "epoch": 0.84, + "grad_norm": 2.073445083949944, + "learning_rate": 6.849735955969339e-07, + "loss": 0.2666, + "step": 13301 + }, + { + "epoch": 0.84, + "grad_norm": 3.4489188422992814, + "learning_rate": 6.844591559809777e-07, + "loss": 0.2475, + "step": 13302 + }, + { + "epoch": 0.84, + "grad_norm": 2.3863288274383887, + "learning_rate": 6.83944895424477e-07, + "loss": 0.2506, + "step": 13303 + }, + { + "epoch": 0.84, + "grad_norm": 1.4542813608180722, + "learning_rate": 6.834308139487672e-07, + "loss": 0.2434, + "step": 13304 + }, + { + "epoch": 0.84, + "grad_norm": 3.339508487499196, + "learning_rate": 6.829169115751799e-07, + "loss": 0.2617, + "step": 13305 + }, + { + "epoch": 0.84, + "grad_norm": 12.19686953944788, + "learning_rate": 6.824031883250393e-07, + "loss": 0.2518, + "step": 13306 + }, + { + "epoch": 0.84, + "grad_norm": 5.828381161059583, + "learning_rate": 6.818896442196593e-07, + "loss": 0.2425, + "step": 13307 + }, + { + "epoch": 0.84, + "grad_norm": 2.455156198998655, + "learning_rate": 6.81376279280348e-07, + "loss": 0.2705, + "step": 13308 + }, + { + "epoch": 0.84, + "grad_norm": 1.6372669096726111, + "learning_rate": 6.808630935284039e-07, + "loss": 0.2509, + "step": 13309 + }, + { + "epoch": 0.84, + "grad_norm": 2.09744505580353, + "learning_rate": 6.803500869851232e-07, + "loss": 0.2365, + "step": 13310 + }, + { + "epoch": 0.84, + "grad_norm": 1.894054835084033, + "learning_rate": 6.798372596717889e-07, + "loss": 0.2611, + "step": 13311 + }, + { + "epoch": 0.84, + "grad_norm": 0.6162670830008417, + "learning_rate": 6.793246116096813e-07, + "loss": 0.4855, + "step": 13312 + }, + { + "epoch": 0.84, + "grad_norm": 2.4248765821721117, + "learning_rate": 6.788121428200689e-07, + "loss": 0.2712, + "step": 13313 + }, + { + "epoch": 0.84, + "grad_norm": 2.9485942370631357, + "learning_rate": 6.782998533242169e-07, + "loss": 0.2617, + "step": 13314 + }, + { + "epoch": 0.84, + "grad_norm": 2.025037599325337, + "learning_rate": 6.777877431433799e-07, + "loss": 0.2631, + "step": 13315 + }, + { + "epoch": 0.84, + "grad_norm": 3.608005797982861, + "learning_rate": 6.772758122988072e-07, + "loss": 0.2475, + "step": 13316 + }, + { + "epoch": 0.84, + "grad_norm": 5.677616574605857, + "learning_rate": 6.767640608117376e-07, + "loss": 0.2854, + "step": 13317 + }, + { + "epoch": 0.84, + "grad_norm": 2.4455361472405373, + "learning_rate": 6.762524887034072e-07, + "loss": 0.2669, + "step": 13318 + }, + { + "epoch": 0.84, + "grad_norm": 3.889383126616042, + "learning_rate": 6.757410959950395e-07, + "loss": 0.2522, + "step": 13319 + }, + { + "epoch": 0.84, + "grad_norm": 4.433859600496642, + "learning_rate": 6.752298827078563e-07, + "loss": 0.2429, + "step": 13320 + }, + { + "epoch": 0.84, + "grad_norm": 6.962131400877815, + "learning_rate": 6.747188488630651e-07, + "loss": 0.2784, + "step": 13321 + }, + { + "epoch": 0.84, + "grad_norm": 8.928368177247108, + "learning_rate": 6.74207994481873e-07, + "loss": 0.2576, + "step": 13322 + }, + { + "epoch": 0.84, + "grad_norm": 4.2226999805099, + "learning_rate": 6.736973195854741e-07, + "loss": 0.2595, + "step": 13323 + }, + { + "epoch": 0.84, + "grad_norm": 3.0145970102896236, + "learning_rate": 6.731868241950585e-07, + "loss": 0.2345, + "step": 13324 + }, + { + "epoch": 0.84, + "grad_norm": 1.79198770341662, + "learning_rate": 6.726765083318054e-07, + "loss": 0.2609, + "step": 13325 + }, + { + "epoch": 0.84, + "grad_norm": 1.8091921624436693, + "learning_rate": 6.721663720168897e-07, + "loss": 0.25, + "step": 13326 + }, + { + "epoch": 0.84, + "grad_norm": 1.5577873315549133, + "learning_rate": 6.716564152714799e-07, + "loss": 0.2506, + "step": 13327 + }, + { + "epoch": 0.84, + "grad_norm": 1.750512885142994, + "learning_rate": 6.711466381167336e-07, + "loss": 0.2323, + "step": 13328 + }, + { + "epoch": 0.84, + "grad_norm": 2.3412387824989707, + "learning_rate": 6.706370405738011e-07, + "loss": 0.2749, + "step": 13329 + }, + { + "epoch": 0.84, + "grad_norm": 1.4847828746570872, + "learning_rate": 6.701276226638287e-07, + "loss": 0.2463, + "step": 13330 + }, + { + "epoch": 0.84, + "grad_norm": 2.790574376609553, + "learning_rate": 6.696183844079513e-07, + "loss": 0.2551, + "step": 13331 + }, + { + "epoch": 0.84, + "grad_norm": 2.0287865241314864, + "learning_rate": 6.691093258272995e-07, + "loss": 0.2726, + "step": 13332 + }, + { + "epoch": 0.84, + "grad_norm": 1.8746573160134075, + "learning_rate": 6.686004469429919e-07, + "loss": 0.252, + "step": 13333 + }, + { + "epoch": 0.84, + "grad_norm": 3.972510693307056, + "learning_rate": 6.680917477761472e-07, + "loss": 0.2859, + "step": 13334 + }, + { + "epoch": 0.84, + "grad_norm": 1.8567151185780817, + "learning_rate": 6.675832283478701e-07, + "loss": 0.2478, + "step": 13335 + }, + { + "epoch": 0.84, + "grad_norm": 2.6380011904104417, + "learning_rate": 6.670748886792605e-07, + "loss": 0.247, + "step": 13336 + }, + { + "epoch": 0.84, + "grad_norm": 2.3114982050939092, + "learning_rate": 6.665667287914085e-07, + "loss": 0.2753, + "step": 13337 + }, + { + "epoch": 0.84, + "grad_norm": 4.475359041361067, + "learning_rate": 6.660587487054016e-07, + "loss": 0.2607, + "step": 13338 + }, + { + "epoch": 0.84, + "grad_norm": 0.6237312823262653, + "learning_rate": 6.655509484423145e-07, + "loss": 0.4768, + "step": 13339 + }, + { + "epoch": 0.84, + "grad_norm": 1.687907174511587, + "learning_rate": 6.650433280232177e-07, + "loss": 0.2569, + "step": 13340 + }, + { + "epoch": 0.84, + "grad_norm": 1.5854181964847236, + "learning_rate": 6.64535887469171e-07, + "loss": 0.252, + "step": 13341 + }, + { + "epoch": 0.84, + "grad_norm": 2.270724207695215, + "learning_rate": 6.640286268012325e-07, + "loss": 0.2527, + "step": 13342 + }, + { + "epoch": 0.84, + "grad_norm": 1.6781150362675301, + "learning_rate": 6.635215460404482e-07, + "loss": 0.2488, + "step": 13343 + }, + { + "epoch": 0.84, + "grad_norm": 2.107836793691485, + "learning_rate": 6.630146452078579e-07, + "loss": 0.2637, + "step": 13344 + }, + { + "epoch": 0.84, + "grad_norm": 2.5307032776601286, + "learning_rate": 6.625079243244914e-07, + "loss": 0.2651, + "step": 13345 + }, + { + "epoch": 0.84, + "grad_norm": 4.159166297456615, + "learning_rate": 6.62001383411377e-07, + "loss": 0.2504, + "step": 13346 + }, + { + "epoch": 0.84, + "grad_norm": 1.4547549816024652, + "learning_rate": 6.6149502248953e-07, + "loss": 0.2554, + "step": 13347 + }, + { + "epoch": 0.84, + "grad_norm": 1.8706399694321145, + "learning_rate": 6.609888415799598e-07, + "loss": 0.25, + "step": 13348 + }, + { + "epoch": 0.84, + "grad_norm": 0.6056446752643904, + "learning_rate": 6.604828407036695e-07, + "loss": 0.4846, + "step": 13349 + }, + { + "epoch": 0.84, + "grad_norm": 4.340569330601539, + "learning_rate": 6.599770198816547e-07, + "loss": 0.2835, + "step": 13350 + }, + { + "epoch": 0.84, + "grad_norm": 6.345268295357784, + "learning_rate": 6.594713791349022e-07, + "loss": 0.2702, + "step": 13351 + }, + { + "epoch": 0.84, + "grad_norm": 3.0795552831337734, + "learning_rate": 6.589659184843922e-07, + "loss": 0.2562, + "step": 13352 + }, + { + "epoch": 0.84, + "grad_norm": 1.5328508534892715, + "learning_rate": 6.584606379510955e-07, + "loss": 0.2458, + "step": 13353 + }, + { + "epoch": 0.84, + "grad_norm": 1.522539358597986, + "learning_rate": 6.579555375559793e-07, + "loss": 0.2622, + "step": 13354 + }, + { + "epoch": 0.84, + "grad_norm": 1.9604813076177836, + "learning_rate": 6.574506173200008e-07, + "loss": 0.2598, + "step": 13355 + }, + { + "epoch": 0.84, + "grad_norm": 1.385096274681475, + "learning_rate": 6.569458772641074e-07, + "loss": 0.2707, + "step": 13356 + }, + { + "epoch": 0.84, + "grad_norm": 3.670842291996775, + "learning_rate": 6.564413174092443e-07, + "loss": 0.279, + "step": 13357 + }, + { + "epoch": 0.84, + "grad_norm": 1.42196166335391, + "learning_rate": 6.55936937776347e-07, + "loss": 0.2447, + "step": 13358 + }, + { + "epoch": 0.84, + "grad_norm": 1.81297513134383, + "learning_rate": 6.554327383863418e-07, + "loss": 0.2646, + "step": 13359 + }, + { + "epoch": 0.84, + "grad_norm": 1.9493446450335274, + "learning_rate": 6.549287192601494e-07, + "loss": 0.2555, + "step": 13360 + }, + { + "epoch": 0.84, + "grad_norm": 1.8715200415549311, + "learning_rate": 6.54424880418681e-07, + "loss": 0.238, + "step": 13361 + }, + { + "epoch": 0.84, + "grad_norm": 2.266689418164463, + "learning_rate": 6.539212218828439e-07, + "loss": 0.2708, + "step": 13362 + }, + { + "epoch": 0.84, + "grad_norm": 2.5689967724985774, + "learning_rate": 6.53417743673534e-07, + "loss": 0.2632, + "step": 13363 + }, + { + "epoch": 0.84, + "grad_norm": 2.131417965387195, + "learning_rate": 6.529144458116432e-07, + "loss": 0.2539, + "step": 13364 + }, + { + "epoch": 0.84, + "grad_norm": 7.492466989934758, + "learning_rate": 6.524113283180522e-07, + "loss": 0.2598, + "step": 13365 + }, + { + "epoch": 0.84, + "grad_norm": 2.6808072474302356, + "learning_rate": 6.519083912136381e-07, + "loss": 0.2658, + "step": 13366 + }, + { + "epoch": 0.84, + "grad_norm": 1.5444482608135894, + "learning_rate": 6.514056345192682e-07, + "loss": 0.2466, + "step": 13367 + }, + { + "epoch": 0.84, + "grad_norm": 2.100620416313012, + "learning_rate": 6.50903058255803e-07, + "loss": 0.2548, + "step": 13368 + }, + { + "epoch": 0.84, + "grad_norm": 1.6240214069296524, + "learning_rate": 6.504006624440934e-07, + "loss": 0.2517, + "step": 13369 + }, + { + "epoch": 0.84, + "grad_norm": 1.831067554539817, + "learning_rate": 6.498984471049869e-07, + "loss": 0.2366, + "step": 13370 + }, + { + "epoch": 0.84, + "grad_norm": 2.425528581014166, + "learning_rate": 6.493964122593193e-07, + "loss": 0.276, + "step": 13371 + }, + { + "epoch": 0.84, + "grad_norm": 1.7891392092614682, + "learning_rate": 6.488945579279237e-07, + "loss": 0.2584, + "step": 13372 + }, + { + "epoch": 0.84, + "grad_norm": 3.0941617305695686, + "learning_rate": 6.483928841316201e-07, + "loss": 0.2413, + "step": 13373 + }, + { + "epoch": 0.84, + "grad_norm": 1.8179463764117592, + "learning_rate": 6.47891390891226e-07, + "loss": 0.2757, + "step": 13374 + }, + { + "epoch": 0.84, + "grad_norm": 1.5556348084502554, + "learning_rate": 6.473900782275482e-07, + "loss": 0.2557, + "step": 13375 + }, + { + "epoch": 0.84, + "grad_norm": 2.2595594744221827, + "learning_rate": 6.468889461613875e-07, + "loss": 0.2805, + "step": 13376 + }, + { + "epoch": 0.84, + "grad_norm": 1.4269481979911813, + "learning_rate": 6.463879947135349e-07, + "loss": 0.2396, + "step": 13377 + }, + { + "epoch": 0.84, + "grad_norm": 3.5734877924953476, + "learning_rate": 6.458872239047776e-07, + "loss": 0.2814, + "step": 13378 + }, + { + "epoch": 0.84, + "grad_norm": 6.752253659863759, + "learning_rate": 6.453866337558939e-07, + "loss": 0.2464, + "step": 13379 + }, + { + "epoch": 0.84, + "grad_norm": 3.141801705001186, + "learning_rate": 6.448862242876536e-07, + "loss": 0.2844, + "step": 13380 + }, + { + "epoch": 0.84, + "grad_norm": 2.0042754335064585, + "learning_rate": 6.443859955208187e-07, + "loss": 0.2642, + "step": 13381 + }, + { + "epoch": 0.84, + "grad_norm": 2.36639946167784, + "learning_rate": 6.438859474761461e-07, + "loss": 0.261, + "step": 13382 + }, + { + "epoch": 0.84, + "grad_norm": 1.7727691878110658, + "learning_rate": 6.43386080174383e-07, + "loss": 0.2633, + "step": 13383 + }, + { + "epoch": 0.84, + "grad_norm": 2.5624324691703557, + "learning_rate": 6.428863936362694e-07, + "loss": 0.2822, + "step": 13384 + }, + { + "epoch": 0.84, + "grad_norm": 2.512964644654413, + "learning_rate": 6.423868878825373e-07, + "loss": 0.2765, + "step": 13385 + }, + { + "epoch": 0.84, + "grad_norm": 2.037510962546739, + "learning_rate": 6.418875629339133e-07, + "loss": 0.2756, + "step": 13386 + }, + { + "epoch": 0.84, + "grad_norm": 2.0939592328857977, + "learning_rate": 6.413884188111163e-07, + "loss": 0.2613, + "step": 13387 + }, + { + "epoch": 0.84, + "grad_norm": 3.3063107985055353, + "learning_rate": 6.408894555348555e-07, + "loss": 0.2566, + "step": 13388 + }, + { + "epoch": 0.84, + "grad_norm": 2.2158554344475223, + "learning_rate": 6.40390673125833e-07, + "loss": 0.2719, + "step": 13389 + }, + { + "epoch": 0.84, + "grad_norm": 1.8427488080983283, + "learning_rate": 6.398920716047458e-07, + "loss": 0.2431, + "step": 13390 + }, + { + "epoch": 0.84, + "grad_norm": 1.3407186830334374, + "learning_rate": 6.39393650992281e-07, + "loss": 0.2597, + "step": 13391 + }, + { + "epoch": 0.84, + "grad_norm": 3.373527650414928, + "learning_rate": 6.388954113091195e-07, + "loss": 0.2386, + "step": 13392 + }, + { + "epoch": 0.84, + "grad_norm": 1.4077006452566116, + "learning_rate": 6.383973525759318e-07, + "loss": 0.2544, + "step": 13393 + }, + { + "epoch": 0.84, + "grad_norm": 4.117246292700643, + "learning_rate": 6.378994748133855e-07, + "loss": 0.2587, + "step": 13394 + }, + { + "epoch": 0.84, + "grad_norm": 2.6371280794912155, + "learning_rate": 6.374017780421387e-07, + "loss": 0.2621, + "step": 13395 + }, + { + "epoch": 0.84, + "grad_norm": 2.445796724466462, + "learning_rate": 6.369042622828408e-07, + "loss": 0.2597, + "step": 13396 + }, + { + "epoch": 0.84, + "grad_norm": 2.233675456331351, + "learning_rate": 6.364069275561341e-07, + "loss": 0.2545, + "step": 13397 + }, + { + "epoch": 0.84, + "grad_norm": 8.886902055526752, + "learning_rate": 6.359097738826559e-07, + "loss": 0.2439, + "step": 13398 + }, + { + "epoch": 0.84, + "grad_norm": 2.4013864313638993, + "learning_rate": 6.354128012830319e-07, + "loss": 0.2656, + "step": 13399 + }, + { + "epoch": 0.84, + "grad_norm": 1.2310331489065025, + "learning_rate": 6.349160097778839e-07, + "loss": 0.2474, + "step": 13400 + }, + { + "epoch": 0.84, + "grad_norm": 4.737654862163965, + "learning_rate": 6.344193993878223e-07, + "loss": 0.247, + "step": 13401 + }, + { + "epoch": 0.84, + "grad_norm": 0.5846140988984052, + "learning_rate": 6.339229701334543e-07, + "loss": 0.4467, + "step": 13402 + }, + { + "epoch": 0.84, + "grad_norm": 1.4099723567048166, + "learning_rate": 6.334267220353779e-07, + "loss": 0.2469, + "step": 13403 + }, + { + "epoch": 0.84, + "grad_norm": 1.6596960463805375, + "learning_rate": 6.329306551141834e-07, + "loss": 0.2722, + "step": 13404 + }, + { + "epoch": 0.84, + "grad_norm": 4.121620019677974, + "learning_rate": 6.324347693904515e-07, + "loss": 0.2378, + "step": 13405 + }, + { + "epoch": 0.84, + "grad_norm": 1.9842353800341574, + "learning_rate": 6.319390648847596e-07, + "loss": 0.2748, + "step": 13406 + }, + { + "epoch": 0.84, + "grad_norm": 3.082599947725112, + "learning_rate": 6.314435416176745e-07, + "loss": 0.2491, + "step": 13407 + }, + { + "epoch": 0.84, + "grad_norm": 1.825142349968624, + "learning_rate": 6.30948199609755e-07, + "loss": 0.2587, + "step": 13408 + }, + { + "epoch": 0.84, + "grad_norm": 1.5584881233554497, + "learning_rate": 6.304530388815555e-07, + "loss": 0.2518, + "step": 13409 + }, + { + "epoch": 0.84, + "grad_norm": 1.3550523837795236, + "learning_rate": 6.299580594536214e-07, + "loss": 0.2402, + "step": 13410 + }, + { + "epoch": 0.84, + "grad_norm": 13.775748833236635, + "learning_rate": 6.294632613464891e-07, + "loss": 0.2489, + "step": 13411 + }, + { + "epoch": 0.84, + "grad_norm": 1.4795996961937647, + "learning_rate": 6.289686445806897e-07, + "loss": 0.2497, + "step": 13412 + }, + { + "epoch": 0.84, + "grad_norm": 1.737286113311703, + "learning_rate": 6.284742091767437e-07, + "loss": 0.2534, + "step": 13413 + }, + { + "epoch": 0.84, + "grad_norm": 2.0572373881665493, + "learning_rate": 6.279799551551685e-07, + "loss": 0.258, + "step": 13414 + }, + { + "epoch": 0.84, + "grad_norm": 2.7420740158125616, + "learning_rate": 6.274858825364693e-07, + "loss": 0.2472, + "step": 13415 + }, + { + "epoch": 0.84, + "grad_norm": 2.535841739020183, + "learning_rate": 6.269919913411487e-07, + "loss": 0.2548, + "step": 13416 + }, + { + "epoch": 0.84, + "grad_norm": 2.37352567868235, + "learning_rate": 6.264982815896964e-07, + "loss": 0.2687, + "step": 13417 + }, + { + "epoch": 0.84, + "grad_norm": 2.0267893266036907, + "learning_rate": 6.260047533025998e-07, + "loss": 0.2646, + "step": 13418 + }, + { + "epoch": 0.84, + "grad_norm": 4.328546667924095, + "learning_rate": 6.255114065003353e-07, + "loss": 0.2637, + "step": 13419 + }, + { + "epoch": 0.84, + "grad_norm": 2.644197023306434, + "learning_rate": 6.250182412033723e-07, + "loss": 0.2681, + "step": 13420 + }, + { + "epoch": 0.84, + "grad_norm": 4.605194812598993, + "learning_rate": 6.245252574321719e-07, + "loss": 0.2687, + "step": 13421 + }, + { + "epoch": 0.84, + "grad_norm": 2.342267842668547, + "learning_rate": 6.24032455207192e-07, + "loss": 0.2654, + "step": 13422 + }, + { + "epoch": 0.84, + "grad_norm": 1.5008454522030399, + "learning_rate": 6.235398345488769e-07, + "loss": 0.2371, + "step": 13423 + }, + { + "epoch": 0.84, + "grad_norm": 2.216123174695491, + "learning_rate": 6.230473954776683e-07, + "loss": 0.262, + "step": 13424 + }, + { + "epoch": 0.84, + "grad_norm": 3.1194285837111244, + "learning_rate": 6.225551380139966e-07, + "loss": 0.241, + "step": 13425 + }, + { + "epoch": 0.84, + "grad_norm": 1.9680174260662269, + "learning_rate": 6.220630621782886e-07, + "loss": 0.2508, + "step": 13426 + }, + { + "epoch": 0.84, + "grad_norm": 2.765124437463708, + "learning_rate": 6.215711679909603e-07, + "loss": 0.2565, + "step": 13427 + }, + { + "epoch": 0.84, + "grad_norm": 2.0869895718564586, + "learning_rate": 6.210794554724209e-07, + "loss": 0.2593, + "step": 13428 + }, + { + "epoch": 0.84, + "grad_norm": 2.613307153027159, + "learning_rate": 6.205879246430718e-07, + "loss": 0.2364, + "step": 13429 + }, + { + "epoch": 0.84, + "grad_norm": 1.7907274267962165, + "learning_rate": 6.200965755233085e-07, + "loss": 0.2368, + "step": 13430 + }, + { + "epoch": 0.84, + "grad_norm": 2.0093124444776462, + "learning_rate": 6.19605408133519e-07, + "loss": 0.2706, + "step": 13431 + }, + { + "epoch": 0.84, + "grad_norm": 2.8182818504541047, + "learning_rate": 6.191144224940815e-07, + "loss": 0.2419, + "step": 13432 + }, + { + "epoch": 0.84, + "grad_norm": 2.4251340973267377, + "learning_rate": 6.186236186253669e-07, + "loss": 0.2584, + "step": 13433 + }, + { + "epoch": 0.84, + "grad_norm": 1.4680766014110684, + "learning_rate": 6.181329965477417e-07, + "loss": 0.2584, + "step": 13434 + }, + { + "epoch": 0.84, + "grad_norm": 16.852248030799743, + "learning_rate": 6.176425562815613e-07, + "loss": 0.273, + "step": 13435 + }, + { + "epoch": 0.84, + "grad_norm": 2.2213768022193507, + "learning_rate": 6.171522978471755e-07, + "loss": 0.257, + "step": 13436 + }, + { + "epoch": 0.85, + "grad_norm": 1.7927775350305657, + "learning_rate": 6.166622212649248e-07, + "loss": 0.2699, + "step": 13437 + }, + { + "epoch": 0.85, + "grad_norm": 1.7780035068961886, + "learning_rate": 6.161723265551439e-07, + "loss": 0.2439, + "step": 13438 + }, + { + "epoch": 0.85, + "grad_norm": 1.3467121746151032, + "learning_rate": 6.156826137381605e-07, + "loss": 0.2427, + "step": 13439 + }, + { + "epoch": 0.85, + "grad_norm": 1.8487489334041787, + "learning_rate": 6.151930828342933e-07, + "loss": 0.2558, + "step": 13440 + }, + { + "epoch": 0.85, + "grad_norm": 2.1103522994585706, + "learning_rate": 6.147037338638523e-07, + "loss": 0.2488, + "step": 13441 + }, + { + "epoch": 0.85, + "grad_norm": 1.5836701096911896, + "learning_rate": 6.142145668471434e-07, + "loss": 0.248, + "step": 13442 + }, + { + "epoch": 0.85, + "grad_norm": 7.656884641141751, + "learning_rate": 6.137255818044618e-07, + "loss": 0.2581, + "step": 13443 + }, + { + "epoch": 0.85, + "grad_norm": 1.5778648933304922, + "learning_rate": 6.132367787560972e-07, + "loss": 0.2572, + "step": 13444 + }, + { + "epoch": 0.85, + "grad_norm": 2.228521511300527, + "learning_rate": 6.127481577223293e-07, + "loss": 0.2729, + "step": 13445 + }, + { + "epoch": 0.85, + "grad_norm": 1.9416217940728462, + "learning_rate": 6.12259718723433e-07, + "loss": 0.2716, + "step": 13446 + }, + { + "epoch": 0.85, + "grad_norm": 21.189065981942754, + "learning_rate": 6.117714617796755e-07, + "loss": 0.2664, + "step": 13447 + }, + { + "epoch": 0.85, + "grad_norm": 1.5858810560868786, + "learning_rate": 6.11283386911315e-07, + "loss": 0.2734, + "step": 13448 + }, + { + "epoch": 0.85, + "grad_norm": 1.8034508169949752, + "learning_rate": 6.107954941386002e-07, + "loss": 0.2564, + "step": 13449 + }, + { + "epoch": 0.85, + "grad_norm": 2.4459616840143066, + "learning_rate": 6.103077834817778e-07, + "loss": 0.2415, + "step": 13450 + }, + { + "epoch": 0.85, + "grad_norm": 1.5788569586631858, + "learning_rate": 6.098202549610821e-07, + "loss": 0.2687, + "step": 13451 + }, + { + "epoch": 0.85, + "grad_norm": 1.6741883371995105, + "learning_rate": 6.09332908596742e-07, + "loss": 0.2449, + "step": 13452 + }, + { + "epoch": 0.85, + "grad_norm": 3.280979706528602, + "learning_rate": 6.088457444089774e-07, + "loss": 0.2589, + "step": 13453 + }, + { + "epoch": 0.85, + "grad_norm": 2.0334725165814875, + "learning_rate": 6.083587624180021e-07, + "loss": 0.2751, + "step": 13454 + }, + { + "epoch": 0.85, + "grad_norm": 2.0825827042994396, + "learning_rate": 6.078719626440238e-07, + "loss": 0.265, + "step": 13455 + }, + { + "epoch": 0.85, + "grad_norm": 2.2214091746545925, + "learning_rate": 6.073853451072387e-07, + "loss": 0.253, + "step": 13456 + }, + { + "epoch": 0.85, + "grad_norm": 5.350869610333454, + "learning_rate": 6.068989098278366e-07, + "loss": 0.2475, + "step": 13457 + }, + { + "epoch": 0.85, + "grad_norm": 6.5054463094168895, + "learning_rate": 6.064126568260026e-07, + "loss": 0.2722, + "step": 13458 + }, + { + "epoch": 0.85, + "grad_norm": 4.375200479008572, + "learning_rate": 6.059265861219122e-07, + "loss": 0.2624, + "step": 13459 + }, + { + "epoch": 0.85, + "grad_norm": 1.491099614455404, + "learning_rate": 6.054406977357308e-07, + "loss": 0.2601, + "step": 13460 + }, + { + "epoch": 0.85, + "grad_norm": 2.080453525909033, + "learning_rate": 6.049549916876213e-07, + "loss": 0.2557, + "step": 13461 + }, + { + "epoch": 0.85, + "grad_norm": 1.7852621489247482, + "learning_rate": 6.044694679977353e-07, + "loss": 0.2426, + "step": 13462 + }, + { + "epoch": 0.85, + "grad_norm": 3.50377367865537, + "learning_rate": 6.03984126686219e-07, + "loss": 0.2959, + "step": 13463 + }, + { + "epoch": 0.85, + "grad_norm": 2.8221389980922518, + "learning_rate": 6.034989677732095e-07, + "loss": 0.2645, + "step": 13464 + }, + { + "epoch": 0.85, + "grad_norm": 1.782062025100652, + "learning_rate": 6.030139912788363e-07, + "loss": 0.2419, + "step": 13465 + }, + { + "epoch": 0.85, + "grad_norm": 1.7677736292569717, + "learning_rate": 6.025291972232233e-07, + "loss": 0.2638, + "step": 13466 + }, + { + "epoch": 0.85, + "grad_norm": 2.0981586975400246, + "learning_rate": 6.020445856264845e-07, + "loss": 0.2484, + "step": 13467 + }, + { + "epoch": 0.85, + "grad_norm": 5.568847040983956, + "learning_rate": 6.015601565087265e-07, + "loss": 0.2353, + "step": 13468 + }, + { + "epoch": 0.85, + "grad_norm": 1.535192413258961, + "learning_rate": 6.010759098900514e-07, + "loss": 0.2656, + "step": 13469 + }, + { + "epoch": 0.85, + "grad_norm": 1.9884449494052558, + "learning_rate": 6.005918457905491e-07, + "loss": 0.2595, + "step": 13470 + }, + { + "epoch": 0.85, + "grad_norm": 1.727137316725863, + "learning_rate": 6.001079642303059e-07, + "loss": 0.2539, + "step": 13471 + }, + { + "epoch": 0.85, + "grad_norm": 1.5961615519099264, + "learning_rate": 5.996242652293987e-07, + "loss": 0.2397, + "step": 13472 + }, + { + "epoch": 0.85, + "grad_norm": 4.032893371999254, + "learning_rate": 5.99140748807896e-07, + "loss": 0.2435, + "step": 13473 + }, + { + "epoch": 0.85, + "grad_norm": 1.7176294075536171, + "learning_rate": 5.98657414985861e-07, + "loss": 0.2473, + "step": 13474 + }, + { + "epoch": 0.85, + "grad_norm": 1.4133225690296671, + "learning_rate": 5.981742637833465e-07, + "loss": 0.2331, + "step": 13475 + }, + { + "epoch": 0.85, + "grad_norm": 3.8912674668183675, + "learning_rate": 5.976912952204017e-07, + "loss": 0.2321, + "step": 13476 + }, + { + "epoch": 0.85, + "grad_norm": 1.7216631369538566, + "learning_rate": 5.972085093170637e-07, + "loss": 0.2466, + "step": 13477 + }, + { + "epoch": 0.85, + "grad_norm": 4.1920283523365836, + "learning_rate": 5.967259060933644e-07, + "loss": 0.2559, + "step": 13478 + }, + { + "epoch": 0.85, + "grad_norm": 0.6549034677481446, + "learning_rate": 5.962434855693295e-07, + "loss": 0.4658, + "step": 13479 + }, + { + "epoch": 0.85, + "grad_norm": 1.6649424133775486, + "learning_rate": 5.957612477649743e-07, + "loss": 0.2529, + "step": 13480 + }, + { + "epoch": 0.85, + "grad_norm": 3.3857659518307384, + "learning_rate": 5.952791927003066e-07, + "loss": 0.2601, + "step": 13481 + }, + { + "epoch": 0.85, + "grad_norm": 1.6784070898508838, + "learning_rate": 5.9479732039533e-07, + "loss": 0.2544, + "step": 13482 + }, + { + "epoch": 0.85, + "grad_norm": 5.187974414621806, + "learning_rate": 5.94315630870036e-07, + "loss": 0.2726, + "step": 13483 + }, + { + "epoch": 0.85, + "grad_norm": 1.5241617959174945, + "learning_rate": 5.938341241444123e-07, + "loss": 0.2262, + "step": 13484 + }, + { + "epoch": 0.85, + "grad_norm": 3.8372876965840628, + "learning_rate": 5.93352800238437e-07, + "loss": 0.2463, + "step": 13485 + }, + { + "epoch": 0.85, + "grad_norm": 1.8398225505536796, + "learning_rate": 5.928716591720813e-07, + "loss": 0.2664, + "step": 13486 + }, + { + "epoch": 0.85, + "grad_norm": 1.4749406718398648, + "learning_rate": 5.92390700965309e-07, + "loss": 0.2532, + "step": 13487 + }, + { + "epoch": 0.85, + "grad_norm": 2.5737951844579836, + "learning_rate": 5.91909925638075e-07, + "loss": 0.2412, + "step": 13488 + }, + { + "epoch": 0.85, + "grad_norm": 2.508596175108206, + "learning_rate": 5.914293332103266e-07, + "loss": 0.2619, + "step": 13489 + }, + { + "epoch": 0.85, + "grad_norm": 3.189303442915712, + "learning_rate": 5.90948923702006e-07, + "loss": 0.2576, + "step": 13490 + }, + { + "epoch": 0.85, + "grad_norm": 1.539680287597827, + "learning_rate": 5.90468697133047e-07, + "loss": 0.2418, + "step": 13491 + }, + { + "epoch": 0.85, + "grad_norm": 1.6590691911248443, + "learning_rate": 5.89988653523374e-07, + "loss": 0.2508, + "step": 13492 + }, + { + "epoch": 0.85, + "grad_norm": 1.7274892066635184, + "learning_rate": 5.895087928929033e-07, + "loss": 0.2595, + "step": 13493 + }, + { + "epoch": 0.85, + "grad_norm": 1.7782888705814661, + "learning_rate": 5.890291152615479e-07, + "loss": 0.2515, + "step": 13494 + }, + { + "epoch": 0.85, + "grad_norm": 2.8518785472865065, + "learning_rate": 5.885496206492097e-07, + "loss": 0.2582, + "step": 13495 + }, + { + "epoch": 0.85, + "grad_norm": 1.550453497656156, + "learning_rate": 5.880703090757833e-07, + "loss": 0.2589, + "step": 13496 + }, + { + "epoch": 0.85, + "grad_norm": 2.3147100621447727, + "learning_rate": 5.875911805611551e-07, + "loss": 0.2595, + "step": 13497 + }, + { + "epoch": 0.85, + "grad_norm": 2.3720167509519614, + "learning_rate": 5.871122351252056e-07, + "loss": 0.2464, + "step": 13498 + }, + { + "epoch": 0.85, + "grad_norm": 1.546748493072562, + "learning_rate": 5.866334727878093e-07, + "loss": 0.2449, + "step": 13499 + }, + { + "epoch": 0.85, + "grad_norm": 1.3805248661240959, + "learning_rate": 5.861548935688288e-07, + "loss": 0.2417, + "step": 13500 + }, + { + "epoch": 0.85, + "grad_norm": 5.565451613317308, + "learning_rate": 5.856764974881213e-07, + "loss": 0.261, + "step": 13501 + }, + { + "epoch": 0.85, + "grad_norm": 4.48745688257381, + "learning_rate": 5.851982845655368e-07, + "loss": 0.2569, + "step": 13502 + }, + { + "epoch": 0.85, + "grad_norm": 1.2245977594989972, + "learning_rate": 5.847202548209174e-07, + "loss": 0.2456, + "step": 13503 + }, + { + "epoch": 0.85, + "grad_norm": 1.8972157072057165, + "learning_rate": 5.842424082740972e-07, + "loss": 0.2516, + "step": 13504 + }, + { + "epoch": 0.85, + "grad_norm": 3.8840926956516535, + "learning_rate": 5.837647449449019e-07, + "loss": 0.241, + "step": 13505 + }, + { + "epoch": 0.85, + "grad_norm": 1.6658739249500758, + "learning_rate": 5.832872648531512e-07, + "loss": 0.2783, + "step": 13506 + }, + { + "epoch": 0.85, + "grad_norm": 2.0168126895491363, + "learning_rate": 5.828099680186577e-07, + "loss": 0.2419, + "step": 13507 + }, + { + "epoch": 0.85, + "grad_norm": 1.9599077828272837, + "learning_rate": 5.823328544612245e-07, + "loss": 0.2429, + "step": 13508 + }, + { + "epoch": 0.85, + "grad_norm": 1.7095248415156747, + "learning_rate": 5.818559242006472e-07, + "loss": 0.2533, + "step": 13509 + }, + { + "epoch": 0.85, + "grad_norm": 1.7421267957726232, + "learning_rate": 5.813791772567157e-07, + "loss": 0.2464, + "step": 13510 + }, + { + "epoch": 0.85, + "grad_norm": 1.800904870901792, + "learning_rate": 5.809026136492107e-07, + "loss": 0.2547, + "step": 13511 + }, + { + "epoch": 0.85, + "grad_norm": 3.881181483130517, + "learning_rate": 5.804262333979044e-07, + "loss": 0.2457, + "step": 13512 + }, + { + "epoch": 0.85, + "grad_norm": 4.552193320850763, + "learning_rate": 5.799500365225647e-07, + "loss": 0.2456, + "step": 13513 + }, + { + "epoch": 0.85, + "grad_norm": 1.77824722130231, + "learning_rate": 5.794740230429475e-07, + "loss": 0.257, + "step": 13514 + }, + { + "epoch": 0.85, + "grad_norm": 2.251834361956646, + "learning_rate": 5.789981929788063e-07, + "loss": 0.2541, + "step": 13515 + }, + { + "epoch": 0.85, + "grad_norm": 1.7435382526825194, + "learning_rate": 5.785225463498828e-07, + "loss": 0.2603, + "step": 13516 + }, + { + "epoch": 0.85, + "grad_norm": 1.6986746749677004, + "learning_rate": 5.780470831759111e-07, + "loss": 0.2572, + "step": 13517 + }, + { + "epoch": 0.85, + "grad_norm": 4.969212550955034, + "learning_rate": 5.775718034766209e-07, + "loss": 0.2665, + "step": 13518 + }, + { + "epoch": 0.85, + "grad_norm": 1.6106812182464696, + "learning_rate": 5.77096707271732e-07, + "loss": 0.267, + "step": 13519 + }, + { + "epoch": 0.85, + "grad_norm": 2.7315584862627755, + "learning_rate": 5.766217945809554e-07, + "loss": 0.2478, + "step": 13520 + }, + { + "epoch": 0.85, + "grad_norm": 2.6835443075396217, + "learning_rate": 5.761470654239987e-07, + "loss": 0.2552, + "step": 13521 + }, + { + "epoch": 0.85, + "grad_norm": 1.980618816403487, + "learning_rate": 5.756725198205565e-07, + "loss": 0.2636, + "step": 13522 + }, + { + "epoch": 0.85, + "grad_norm": 3.3422479895103936, + "learning_rate": 5.751981577903216e-07, + "loss": 0.2599, + "step": 13523 + }, + { + "epoch": 0.85, + "grad_norm": 1.5699725414163723, + "learning_rate": 5.747239793529735e-07, + "loss": 0.2315, + "step": 13524 + }, + { + "epoch": 0.85, + "grad_norm": 1.614291219139042, + "learning_rate": 5.742499845281874e-07, + "loss": 0.2545, + "step": 13525 + }, + { + "epoch": 0.85, + "grad_norm": 2.364417162361661, + "learning_rate": 5.737761733356312e-07, + "loss": 0.2654, + "step": 13526 + }, + { + "epoch": 0.85, + "grad_norm": 1.5999843211238662, + "learning_rate": 5.733025457949625e-07, + "loss": 0.231, + "step": 13527 + }, + { + "epoch": 0.85, + "grad_norm": 2.106616621026424, + "learning_rate": 5.728291019258347e-07, + "loss": 0.263, + "step": 13528 + }, + { + "epoch": 0.85, + "grad_norm": 1.9067278643917807, + "learning_rate": 5.723558417478914e-07, + "loss": 0.2597, + "step": 13529 + }, + { + "epoch": 0.85, + "grad_norm": 1.8953330005585292, + "learning_rate": 5.718827652807673e-07, + "loss": 0.256, + "step": 13530 + }, + { + "epoch": 0.85, + "grad_norm": 2.104440011041646, + "learning_rate": 5.714098725440936e-07, + "loss": 0.2536, + "step": 13531 + }, + { + "epoch": 0.85, + "grad_norm": 2.749262134426467, + "learning_rate": 5.709371635574907e-07, + "loss": 0.2563, + "step": 13532 + }, + { + "epoch": 0.85, + "grad_norm": 0.5855358582502723, + "learning_rate": 5.704646383405698e-07, + "loss": 0.4516, + "step": 13533 + }, + { + "epoch": 0.85, + "grad_norm": 1.4983159479219632, + "learning_rate": 5.699922969129406e-07, + "loss": 0.2454, + "step": 13534 + }, + { + "epoch": 0.85, + "grad_norm": 3.589545383496322, + "learning_rate": 5.695201392941985e-07, + "loss": 0.2669, + "step": 13535 + }, + { + "epoch": 0.85, + "grad_norm": 1.378932170923166, + "learning_rate": 5.690481655039359e-07, + "loss": 0.2542, + "step": 13536 + }, + { + "epoch": 0.85, + "grad_norm": 2.2701012800953952, + "learning_rate": 5.685763755617357e-07, + "loss": 0.2683, + "step": 13537 + }, + { + "epoch": 0.85, + "grad_norm": 2.00697994797012, + "learning_rate": 5.68104769487171e-07, + "loss": 0.2499, + "step": 13538 + }, + { + "epoch": 0.85, + "grad_norm": 2.9762545860281695, + "learning_rate": 5.676333472998131e-07, + "loss": 0.2611, + "step": 13539 + }, + { + "epoch": 0.85, + "grad_norm": 3.0532568180829864, + "learning_rate": 5.671621090192203e-07, + "loss": 0.2431, + "step": 13540 + }, + { + "epoch": 0.85, + "grad_norm": 5.670252802575042, + "learning_rate": 5.66691054664944e-07, + "loss": 0.2341, + "step": 13541 + }, + { + "epoch": 0.85, + "grad_norm": 2.7785526737823396, + "learning_rate": 5.662201842565301e-07, + "loss": 0.2701, + "step": 13542 + }, + { + "epoch": 0.85, + "grad_norm": 2.335948586720108, + "learning_rate": 5.657494978135169e-07, + "loss": 0.2658, + "step": 13543 + }, + { + "epoch": 0.85, + "grad_norm": 1.6821751756813734, + "learning_rate": 5.652789953554338e-07, + "loss": 0.2459, + "step": 13544 + }, + { + "epoch": 0.85, + "grad_norm": 2.7165543974639266, + "learning_rate": 5.648086769018019e-07, + "loss": 0.2508, + "step": 13545 + }, + { + "epoch": 0.85, + "grad_norm": 1.3152126272537419, + "learning_rate": 5.643385424721342e-07, + "loss": 0.243, + "step": 13546 + }, + { + "epoch": 0.85, + "grad_norm": 2.6376253550899063, + "learning_rate": 5.63868592085941e-07, + "loss": 0.2722, + "step": 13547 + }, + { + "epoch": 0.85, + "grad_norm": 2.2887753226860474, + "learning_rate": 5.633988257627187e-07, + "loss": 0.2433, + "step": 13548 + }, + { + "epoch": 0.85, + "grad_norm": 0.577455232390544, + "learning_rate": 5.629292435219586e-07, + "loss": 0.4639, + "step": 13549 + }, + { + "epoch": 0.85, + "grad_norm": 1.8079642268829752, + "learning_rate": 5.624598453831453e-07, + "loss": 0.2437, + "step": 13550 + }, + { + "epoch": 0.85, + "grad_norm": 1.853634505212085, + "learning_rate": 5.619906313657558e-07, + "loss": 0.2513, + "step": 13551 + }, + { + "epoch": 0.85, + "grad_norm": 2.0104315005559203, + "learning_rate": 5.615216014892577e-07, + "loss": 0.2434, + "step": 13552 + }, + { + "epoch": 0.85, + "grad_norm": 1.7080766982438698, + "learning_rate": 5.610527557731126e-07, + "loss": 0.2691, + "step": 13553 + }, + { + "epoch": 0.85, + "grad_norm": 1.636742689216997, + "learning_rate": 5.605840942367713e-07, + "loss": 0.2523, + "step": 13554 + }, + { + "epoch": 0.85, + "grad_norm": 2.7398274197926993, + "learning_rate": 5.60115616899683e-07, + "loss": 0.2946, + "step": 13555 + }, + { + "epoch": 0.85, + "grad_norm": 1.470485277610429, + "learning_rate": 5.596473237812833e-07, + "loss": 0.2481, + "step": 13556 + }, + { + "epoch": 0.85, + "grad_norm": 2.2016074473377323, + "learning_rate": 5.591792149010022e-07, + "loss": 0.2522, + "step": 13557 + }, + { + "epoch": 0.85, + "grad_norm": 2.0406811642020126, + "learning_rate": 5.587112902782638e-07, + "loss": 0.2464, + "step": 13558 + }, + { + "epoch": 0.85, + "grad_norm": 1.7116847885826194, + "learning_rate": 5.582435499324829e-07, + "loss": 0.2581, + "step": 13559 + }, + { + "epoch": 0.85, + "grad_norm": 2.439569701263958, + "learning_rate": 5.577759938830668e-07, + "loss": 0.277, + "step": 13560 + }, + { + "epoch": 0.85, + "grad_norm": 2.3058979220050366, + "learning_rate": 5.57308622149415e-07, + "loss": 0.2702, + "step": 13561 + }, + { + "epoch": 0.85, + "grad_norm": 1.5615495482226585, + "learning_rate": 5.568414347509188e-07, + "loss": 0.2566, + "step": 13562 + }, + { + "epoch": 0.85, + "grad_norm": 2.491320604001727, + "learning_rate": 5.563744317069642e-07, + "loss": 0.2455, + "step": 13563 + }, + { + "epoch": 0.85, + "grad_norm": 1.9199517882555668, + "learning_rate": 5.559076130369273e-07, + "loss": 0.2255, + "step": 13564 + }, + { + "epoch": 0.85, + "grad_norm": 2.3006377515034426, + "learning_rate": 5.554409787601755e-07, + "loss": 0.242, + "step": 13565 + }, + { + "epoch": 0.85, + "grad_norm": 3.6010076661097696, + "learning_rate": 5.549745288960729e-07, + "loss": 0.2485, + "step": 13566 + }, + { + "epoch": 0.85, + "grad_norm": 2.113651352580854, + "learning_rate": 5.545082634639726e-07, + "loss": 0.2385, + "step": 13567 + }, + { + "epoch": 0.85, + "grad_norm": 1.8076755945116019, + "learning_rate": 5.54042182483221e-07, + "loss": 0.2542, + "step": 13568 + }, + { + "epoch": 0.85, + "grad_norm": 1.5008753258119847, + "learning_rate": 5.535762859731547e-07, + "loss": 0.2428, + "step": 13569 + }, + { + "epoch": 0.85, + "grad_norm": 2.2858870591617326, + "learning_rate": 5.531105739531073e-07, + "loss": 0.2642, + "step": 13570 + }, + { + "epoch": 0.85, + "grad_norm": 2.312244975118548, + "learning_rate": 5.52645046442401e-07, + "loss": 0.2473, + "step": 13571 + }, + { + "epoch": 0.85, + "grad_norm": 2.3237576285969577, + "learning_rate": 5.521797034603499e-07, + "loss": 0.2434, + "step": 13572 + }, + { + "epoch": 0.85, + "grad_norm": 1.5538232390013953, + "learning_rate": 5.517145450262639e-07, + "loss": 0.2557, + "step": 13573 + }, + { + "epoch": 0.85, + "grad_norm": 2.0129384304103244, + "learning_rate": 5.512495711594418e-07, + "loss": 0.2396, + "step": 13574 + }, + { + "epoch": 0.85, + "grad_norm": 1.6514149107581721, + "learning_rate": 5.507847818791778e-07, + "loss": 0.2658, + "step": 13575 + }, + { + "epoch": 0.85, + "grad_norm": 1.6918203015153468, + "learning_rate": 5.503201772047556e-07, + "loss": 0.2524, + "step": 13576 + }, + { + "epoch": 0.85, + "grad_norm": 0.6636673205522431, + "learning_rate": 5.498557571554519e-07, + "loss": 0.4798, + "step": 13577 + }, + { + "epoch": 0.85, + "grad_norm": 2.565588593550987, + "learning_rate": 5.493915217505386e-07, + "loss": 0.2524, + "step": 13578 + }, + { + "epoch": 0.85, + "grad_norm": 2.7679157801907177, + "learning_rate": 5.489274710092746e-07, + "loss": 0.2763, + "step": 13579 + }, + { + "epoch": 0.85, + "grad_norm": 1.4383566699951482, + "learning_rate": 5.484636049509173e-07, + "loss": 0.2352, + "step": 13580 + }, + { + "epoch": 0.85, + "grad_norm": 1.800519061403932, + "learning_rate": 5.47999923594712e-07, + "loss": 0.2356, + "step": 13581 + }, + { + "epoch": 0.85, + "grad_norm": 0.5724452368813069, + "learning_rate": 5.475364269598959e-07, + "loss": 0.4391, + "step": 13582 + }, + { + "epoch": 0.85, + "grad_norm": 2.7872398232081483, + "learning_rate": 5.470731150657033e-07, + "loss": 0.2431, + "step": 13583 + }, + { + "epoch": 0.85, + "grad_norm": 1.934105082982598, + "learning_rate": 5.466099879313563e-07, + "loss": 0.2714, + "step": 13584 + }, + { + "epoch": 0.85, + "grad_norm": 3.8558269694509377, + "learning_rate": 5.461470455760698e-07, + "loss": 0.2742, + "step": 13585 + }, + { + "epoch": 0.85, + "grad_norm": 1.7107592585793334, + "learning_rate": 5.45684288019055e-07, + "loss": 0.2502, + "step": 13586 + }, + { + "epoch": 0.85, + "grad_norm": 1.9950131298505234, + "learning_rate": 5.452217152795092e-07, + "loss": 0.2433, + "step": 13587 + }, + { + "epoch": 0.85, + "grad_norm": 2.3131958912287827, + "learning_rate": 5.447593273766283e-07, + "loss": 0.2603, + "step": 13588 + }, + { + "epoch": 0.85, + "grad_norm": 4.12074925694779, + "learning_rate": 5.442971243295964e-07, + "loss": 0.248, + "step": 13589 + }, + { + "epoch": 0.85, + "grad_norm": 1.9615825929841024, + "learning_rate": 5.438351061575897e-07, + "loss": 0.2313, + "step": 13590 + }, + { + "epoch": 0.85, + "grad_norm": 2.175260952878419, + "learning_rate": 5.433732728797808e-07, + "loss": 0.2481, + "step": 13591 + }, + { + "epoch": 0.85, + "grad_norm": 2.0754082213129466, + "learning_rate": 5.429116245153304e-07, + "loss": 0.2584, + "step": 13592 + }, + { + "epoch": 0.85, + "grad_norm": 1.691298911844114, + "learning_rate": 5.424501610833921e-07, + "loss": 0.2483, + "step": 13593 + }, + { + "epoch": 0.85, + "grad_norm": 1.6745785689146537, + "learning_rate": 5.41988882603115e-07, + "loss": 0.2606, + "step": 13594 + }, + { + "epoch": 0.85, + "grad_norm": 2.6332867781682845, + "learning_rate": 5.415277890936377e-07, + "loss": 0.2592, + "step": 13595 + }, + { + "epoch": 0.86, + "grad_norm": 1.6742408535272966, + "learning_rate": 5.410668805740921e-07, + "loss": 0.2464, + "step": 13596 + }, + { + "epoch": 0.86, + "grad_norm": 5.014704593624091, + "learning_rate": 5.406061570636012e-07, + "loss": 0.3037, + "step": 13597 + }, + { + "epoch": 0.86, + "grad_norm": 2.3033209178746947, + "learning_rate": 5.401456185812809e-07, + "loss": 0.2595, + "step": 13598 + }, + { + "epoch": 0.86, + "grad_norm": 3.3024031502079834, + "learning_rate": 5.396852651462409e-07, + "loss": 0.2795, + "step": 13599 + }, + { + "epoch": 0.86, + "grad_norm": 0.6235627715437748, + "learning_rate": 5.392250967775825e-07, + "loss": 0.4798, + "step": 13600 + }, + { + "epoch": 0.86, + "grad_norm": 1.7957041900238733, + "learning_rate": 5.387651134943966e-07, + "loss": 0.2478, + "step": 13601 + }, + { + "epoch": 0.86, + "grad_norm": 3.086923984648585, + "learning_rate": 5.383053153157703e-07, + "loss": 0.2586, + "step": 13602 + }, + { + "epoch": 0.86, + "grad_norm": 3.866657186222648, + "learning_rate": 5.378457022607819e-07, + "loss": 0.2716, + "step": 13603 + }, + { + "epoch": 0.86, + "grad_norm": 1.7137875550053123, + "learning_rate": 5.373862743485014e-07, + "loss": 0.2462, + "step": 13604 + }, + { + "epoch": 0.86, + "grad_norm": 1.3864030986668119, + "learning_rate": 5.369270315979908e-07, + "loss": 0.2356, + "step": 13605 + }, + { + "epoch": 0.86, + "grad_norm": 2.4348947069003946, + "learning_rate": 5.364679740283041e-07, + "loss": 0.2462, + "step": 13606 + }, + { + "epoch": 0.86, + "grad_norm": 1.5469037905580496, + "learning_rate": 5.360091016584901e-07, + "loss": 0.2552, + "step": 13607 + }, + { + "epoch": 0.86, + "grad_norm": 1.607533664012968, + "learning_rate": 5.355504145075874e-07, + "loss": 0.2375, + "step": 13608 + }, + { + "epoch": 0.86, + "grad_norm": 1.5724723128126032, + "learning_rate": 5.350919125946269e-07, + "loss": 0.2497, + "step": 13609 + }, + { + "epoch": 0.86, + "grad_norm": 2.0558780389893303, + "learning_rate": 5.346335959386333e-07, + "loss": 0.2488, + "step": 13610 + }, + { + "epoch": 0.86, + "grad_norm": 1.9389550348105287, + "learning_rate": 5.341754645586244e-07, + "loss": 0.2505, + "step": 13611 + }, + { + "epoch": 0.86, + "grad_norm": 4.871688509492094, + "learning_rate": 5.337175184736077e-07, + "loss": 0.2274, + "step": 13612 + }, + { + "epoch": 0.86, + "grad_norm": 3.4524242667794094, + "learning_rate": 5.332597577025845e-07, + "loss": 0.2704, + "step": 13613 + }, + { + "epoch": 0.86, + "grad_norm": 1.7026679359135917, + "learning_rate": 5.328021822645463e-07, + "loss": 0.262, + "step": 13614 + }, + { + "epoch": 0.86, + "grad_norm": 1.9682632115171763, + "learning_rate": 5.323447921784813e-07, + "loss": 0.2597, + "step": 13615 + }, + { + "epoch": 0.86, + "grad_norm": 2.8148672515326587, + "learning_rate": 5.318875874633661e-07, + "loss": 0.2491, + "step": 13616 + }, + { + "epoch": 0.86, + "grad_norm": 1.560120130082495, + "learning_rate": 5.314305681381704e-07, + "loss": 0.2768, + "step": 13617 + }, + { + "epoch": 0.86, + "grad_norm": 12.424553683391421, + "learning_rate": 5.309737342218574e-07, + "loss": 0.2412, + "step": 13618 + }, + { + "epoch": 0.86, + "grad_norm": 1.572884717040157, + "learning_rate": 5.30517085733383e-07, + "loss": 0.2529, + "step": 13619 + }, + { + "epoch": 0.86, + "grad_norm": 5.6180434518983775, + "learning_rate": 5.300606226916938e-07, + "loss": 0.2694, + "step": 13620 + }, + { + "epoch": 0.86, + "grad_norm": 2.0940017996641846, + "learning_rate": 5.296043451157285e-07, + "loss": 0.2543, + "step": 13621 + }, + { + "epoch": 0.86, + "grad_norm": 3.3697832821973646, + "learning_rate": 5.29148253024418e-07, + "loss": 0.2648, + "step": 13622 + }, + { + "epoch": 0.86, + "grad_norm": 1.9218459094353313, + "learning_rate": 5.286923464366889e-07, + "loss": 0.2452, + "step": 13623 + }, + { + "epoch": 0.86, + "grad_norm": 2.248390007409386, + "learning_rate": 5.282366253714555e-07, + "loss": 0.2567, + "step": 13624 + }, + { + "epoch": 0.86, + "grad_norm": 1.1861708252662804, + "learning_rate": 5.277810898476283e-07, + "loss": 0.2561, + "step": 13625 + }, + { + "epoch": 0.86, + "grad_norm": 2.343048640813765, + "learning_rate": 5.273257398841059e-07, + "loss": 0.2496, + "step": 13626 + }, + { + "epoch": 0.86, + "grad_norm": 4.602411140422076, + "learning_rate": 5.268705754997838e-07, + "loss": 0.2558, + "step": 13627 + }, + { + "epoch": 0.86, + "grad_norm": 1.6285792398465866, + "learning_rate": 5.26415596713547e-07, + "loss": 0.2552, + "step": 13628 + }, + { + "epoch": 0.86, + "grad_norm": 1.755575220971988, + "learning_rate": 5.259608035442726e-07, + "loss": 0.2617, + "step": 13629 + }, + { + "epoch": 0.86, + "grad_norm": 2.0699700530282445, + "learning_rate": 5.255061960108309e-07, + "loss": 0.2441, + "step": 13630 + }, + { + "epoch": 0.86, + "grad_norm": 2.030379181313497, + "learning_rate": 5.250517741320854e-07, + "loss": 0.2528, + "step": 13631 + }, + { + "epoch": 0.86, + "grad_norm": 2.351254439335697, + "learning_rate": 5.245975379268892e-07, + "loss": 0.247, + "step": 13632 + }, + { + "epoch": 0.86, + "grad_norm": 1.2964371653384898, + "learning_rate": 5.241434874140916e-07, + "loss": 0.2484, + "step": 13633 + }, + { + "epoch": 0.86, + "grad_norm": 2.1064082182356647, + "learning_rate": 5.236896226125293e-07, + "loss": 0.2584, + "step": 13634 + }, + { + "epoch": 0.86, + "grad_norm": 2.374465623231935, + "learning_rate": 5.232359435410366e-07, + "loss": 0.2646, + "step": 13635 + }, + { + "epoch": 0.86, + "grad_norm": 1.9347914803804405, + "learning_rate": 5.227824502184364e-07, + "loss": 0.2734, + "step": 13636 + }, + { + "epoch": 0.86, + "grad_norm": 3.2171314538639177, + "learning_rate": 5.223291426635447e-07, + "loss": 0.2626, + "step": 13637 + }, + { + "epoch": 0.86, + "grad_norm": 2.2351019414444853, + "learning_rate": 5.21876020895169e-07, + "loss": 0.2675, + "step": 13638 + }, + { + "epoch": 0.86, + "grad_norm": 2.2136681821585618, + "learning_rate": 5.214230849321111e-07, + "loss": 0.2506, + "step": 13639 + }, + { + "epoch": 0.86, + "grad_norm": 2.83766489256189, + "learning_rate": 5.209703347931655e-07, + "loss": 0.2735, + "step": 13640 + }, + { + "epoch": 0.86, + "grad_norm": 3.174020625700449, + "learning_rate": 5.205177704971159e-07, + "loss": 0.2671, + "step": 13641 + }, + { + "epoch": 0.86, + "grad_norm": 2.430670413441794, + "learning_rate": 5.200653920627402e-07, + "loss": 0.2969, + "step": 13642 + }, + { + "epoch": 0.86, + "grad_norm": 1.3900125754278547, + "learning_rate": 5.196131995088089e-07, + "loss": 0.2399, + "step": 13643 + }, + { + "epoch": 0.86, + "grad_norm": 2.2168712897411376, + "learning_rate": 5.191611928540846e-07, + "loss": 0.2571, + "step": 13644 + }, + { + "epoch": 0.86, + "grad_norm": 2.418268250393664, + "learning_rate": 5.187093721173198e-07, + "loss": 0.2576, + "step": 13645 + }, + { + "epoch": 0.86, + "grad_norm": 3.0594401712808112, + "learning_rate": 5.182577373172642e-07, + "loss": 0.2432, + "step": 13646 + }, + { + "epoch": 0.86, + "grad_norm": 3.3299983009554044, + "learning_rate": 5.178062884726547e-07, + "loss": 0.2425, + "step": 13647 + }, + { + "epoch": 0.86, + "grad_norm": 1.743647726617732, + "learning_rate": 5.173550256022241e-07, + "loss": 0.2393, + "step": 13648 + }, + { + "epoch": 0.86, + "grad_norm": 2.8736867425503996, + "learning_rate": 5.169039487246958e-07, + "loss": 0.2661, + "step": 13649 + }, + { + "epoch": 0.86, + "grad_norm": 1.7689666442057652, + "learning_rate": 5.164530578587845e-07, + "loss": 0.2541, + "step": 13650 + }, + { + "epoch": 0.86, + "grad_norm": 26.14275002768576, + "learning_rate": 5.160023530232005e-07, + "loss": 0.2637, + "step": 13651 + }, + { + "epoch": 0.86, + "grad_norm": 1.4701743228404707, + "learning_rate": 5.155518342366434e-07, + "loss": 0.2543, + "step": 13652 + }, + { + "epoch": 0.86, + "grad_norm": 3.1312873255106126, + "learning_rate": 5.151015015178046e-07, + "loss": 0.2821, + "step": 13653 + }, + { + "epoch": 0.86, + "grad_norm": 12.76115379036279, + "learning_rate": 5.14651354885371e-07, + "loss": 0.233, + "step": 13654 + }, + { + "epoch": 0.86, + "grad_norm": 1.5255376443447688, + "learning_rate": 5.142013943580205e-07, + "loss": 0.2645, + "step": 13655 + }, + { + "epoch": 0.86, + "grad_norm": 1.810116332603644, + "learning_rate": 5.137516199544218e-07, + "loss": 0.2718, + "step": 13656 + }, + { + "epoch": 0.86, + "grad_norm": 1.7718179422281233, + "learning_rate": 5.133020316932369e-07, + "loss": 0.2302, + "step": 13657 + }, + { + "epoch": 0.86, + "grad_norm": 2.1572912137807267, + "learning_rate": 5.128526295931191e-07, + "loss": 0.2445, + "step": 13658 + }, + { + "epoch": 0.86, + "grad_norm": 4.02531717500136, + "learning_rate": 5.124034136727163e-07, + "loss": 0.2533, + "step": 13659 + }, + { + "epoch": 0.86, + "grad_norm": 2.6125455807078777, + "learning_rate": 5.119543839506668e-07, + "loss": 0.2394, + "step": 13660 + }, + { + "epoch": 0.86, + "grad_norm": 2.12656228148672, + "learning_rate": 5.115055404456009e-07, + "loss": 0.2765, + "step": 13661 + }, + { + "epoch": 0.86, + "grad_norm": 1.8159771758577405, + "learning_rate": 5.110568831761426e-07, + "loss": 0.24, + "step": 13662 + }, + { + "epoch": 0.86, + "grad_norm": 5.327296188057833, + "learning_rate": 5.106084121609084e-07, + "loss": 0.2397, + "step": 13663 + }, + { + "epoch": 0.86, + "grad_norm": 5.4702360420956335, + "learning_rate": 5.101601274185053e-07, + "loss": 0.2505, + "step": 13664 + }, + { + "epoch": 0.86, + "grad_norm": 2.1902812194953794, + "learning_rate": 5.097120289675333e-07, + "loss": 0.2713, + "step": 13665 + }, + { + "epoch": 0.86, + "grad_norm": 1.485408023468269, + "learning_rate": 5.092641168265839e-07, + "loss": 0.2488, + "step": 13666 + }, + { + "epoch": 0.86, + "grad_norm": 2.7370814254015277, + "learning_rate": 5.088163910142436e-07, + "loss": 0.26, + "step": 13667 + }, + { + "epoch": 0.86, + "grad_norm": 1.8861457859821287, + "learning_rate": 5.083688515490881e-07, + "loss": 0.2478, + "step": 13668 + }, + { + "epoch": 0.86, + "grad_norm": 1.9405465602943042, + "learning_rate": 5.079214984496866e-07, + "loss": 0.2441, + "step": 13669 + }, + { + "epoch": 0.86, + "grad_norm": 0.5700041359223784, + "learning_rate": 5.074743317346009e-07, + "loss": 0.4724, + "step": 13670 + }, + { + "epoch": 0.86, + "grad_norm": 1.9281739077401292, + "learning_rate": 5.070273514223856e-07, + "loss": 0.2599, + "step": 13671 + }, + { + "epoch": 0.86, + "grad_norm": 6.494583527548455, + "learning_rate": 5.065805575315863e-07, + "loss": 0.2602, + "step": 13672 + }, + { + "epoch": 0.86, + "grad_norm": 1.7839197396281787, + "learning_rate": 5.061339500807405e-07, + "loss": 0.2468, + "step": 13673 + }, + { + "epoch": 0.86, + "grad_norm": 1.8851465665212364, + "learning_rate": 5.056875290883778e-07, + "loss": 0.2432, + "step": 13674 + }, + { + "epoch": 0.86, + "grad_norm": 1.9874666537147156, + "learning_rate": 5.05241294573024e-07, + "loss": 0.2552, + "step": 13675 + }, + { + "epoch": 0.86, + "grad_norm": 2.640552388955175, + "learning_rate": 5.047952465531913e-07, + "loss": 0.2825, + "step": 13676 + }, + { + "epoch": 0.86, + "grad_norm": 1.8929427231370501, + "learning_rate": 5.04349385047389e-07, + "loss": 0.2486, + "step": 13677 + }, + { + "epoch": 0.86, + "grad_norm": 4.17727300472136, + "learning_rate": 5.039037100741151e-07, + "loss": 0.2703, + "step": 13678 + }, + { + "epoch": 0.86, + "grad_norm": 2.327589387383158, + "learning_rate": 5.03458221651863e-07, + "loss": 0.2643, + "step": 13679 + }, + { + "epoch": 0.86, + "grad_norm": 4.17373665414888, + "learning_rate": 5.030129197991162e-07, + "loss": 0.2398, + "step": 13680 + }, + { + "epoch": 0.86, + "grad_norm": 2.434944858566004, + "learning_rate": 5.025678045343507e-07, + "loss": 0.248, + "step": 13681 + }, + { + "epoch": 0.86, + "grad_norm": 1.5754890071326177, + "learning_rate": 5.021228758760344e-07, + "loss": 0.2445, + "step": 13682 + }, + { + "epoch": 0.86, + "grad_norm": 1.7873413417206307, + "learning_rate": 5.016781338426302e-07, + "loss": 0.256, + "step": 13683 + }, + { + "epoch": 0.86, + "grad_norm": 2.3658169989413795, + "learning_rate": 5.01233578452589e-07, + "loss": 0.2623, + "step": 13684 + }, + { + "epoch": 0.86, + "grad_norm": 2.8189939340242294, + "learning_rate": 5.007892097243588e-07, + "loss": 0.256, + "step": 13685 + }, + { + "epoch": 0.86, + "grad_norm": 2.9382916850983416, + "learning_rate": 5.003450276763744e-07, + "loss": 0.2364, + "step": 13686 + }, + { + "epoch": 0.86, + "grad_norm": 1.8440640191006197, + "learning_rate": 4.999010323270681e-07, + "loss": 0.2578, + "step": 13687 + }, + { + "epoch": 0.86, + "grad_norm": 0.6054567038581964, + "learning_rate": 4.994572236948608e-07, + "loss": 0.4699, + "step": 13688 + }, + { + "epoch": 0.86, + "grad_norm": 2.129324880100568, + "learning_rate": 4.990136017981678e-07, + "loss": 0.253, + "step": 13689 + }, + { + "epoch": 0.86, + "grad_norm": 1.9361134202184722, + "learning_rate": 4.985701666553938e-07, + "loss": 0.2667, + "step": 13690 + }, + { + "epoch": 0.86, + "grad_norm": 1.8162227472397376, + "learning_rate": 4.981269182849391e-07, + "loss": 0.2513, + "step": 13691 + }, + { + "epoch": 0.86, + "grad_norm": 1.6966984178058855, + "learning_rate": 4.976838567051956e-07, + "loss": 0.2838, + "step": 13692 + }, + { + "epoch": 0.86, + "grad_norm": 1.6477148137784117, + "learning_rate": 4.972409819345458e-07, + "loss": 0.2677, + "step": 13693 + }, + { + "epoch": 0.86, + "grad_norm": 1.4327947666479184, + "learning_rate": 4.967982939913646e-07, + "loss": 0.2505, + "step": 13694 + }, + { + "epoch": 0.86, + "grad_norm": 2.744999869600108, + "learning_rate": 4.963557928940215e-07, + "loss": 0.2514, + "step": 13695 + }, + { + "epoch": 0.86, + "grad_norm": 1.366089183714872, + "learning_rate": 4.959134786608766e-07, + "loss": 0.2442, + "step": 13696 + }, + { + "epoch": 0.86, + "grad_norm": 1.657426892235389, + "learning_rate": 4.95471351310281e-07, + "loss": 0.2579, + "step": 13697 + }, + { + "epoch": 0.86, + "grad_norm": 2.26204346002552, + "learning_rate": 4.950294108605791e-07, + "loss": 0.2478, + "step": 13698 + }, + { + "epoch": 0.86, + "grad_norm": 1.7537968602009564, + "learning_rate": 4.94587657330109e-07, + "loss": 0.2577, + "step": 13699 + }, + { + "epoch": 0.86, + "grad_norm": 1.5656007558597471, + "learning_rate": 4.941460907372004e-07, + "loss": 0.2605, + "step": 13700 + }, + { + "epoch": 0.86, + "grad_norm": 1.9827331043292564, + "learning_rate": 4.937047111001736e-07, + "loss": 0.248, + "step": 13701 + }, + { + "epoch": 0.86, + "grad_norm": 1.591557031894639, + "learning_rate": 4.932635184373413e-07, + "loss": 0.2339, + "step": 13702 + }, + { + "epoch": 0.86, + "grad_norm": 1.7381995198494065, + "learning_rate": 4.928225127670123e-07, + "loss": 0.2604, + "step": 13703 + }, + { + "epoch": 0.86, + "grad_norm": 2.173427432588498, + "learning_rate": 4.923816941074822e-07, + "loss": 0.2622, + "step": 13704 + }, + { + "epoch": 0.86, + "grad_norm": 2.787580573787161, + "learning_rate": 4.919410624770421e-07, + "loss": 0.265, + "step": 13705 + }, + { + "epoch": 0.86, + "grad_norm": 1.5860671434993519, + "learning_rate": 4.915006178939724e-07, + "loss": 0.2413, + "step": 13706 + }, + { + "epoch": 0.86, + "grad_norm": 1.9961316065912431, + "learning_rate": 4.910603603765523e-07, + "loss": 0.2378, + "step": 13707 + }, + { + "epoch": 0.86, + "grad_norm": 5.846316850977396, + "learning_rate": 4.906202899430463e-07, + "loss": 0.2671, + "step": 13708 + }, + { + "epoch": 0.86, + "grad_norm": 2.5538424167891645, + "learning_rate": 4.901804066117144e-07, + "loss": 0.2652, + "step": 13709 + }, + { + "epoch": 0.86, + "grad_norm": 2.961948405895061, + "learning_rate": 4.897407104008067e-07, + "loss": 0.2477, + "step": 13710 + }, + { + "epoch": 0.86, + "grad_norm": 1.6083660588937367, + "learning_rate": 4.893012013285692e-07, + "loss": 0.2593, + "step": 13711 + }, + { + "epoch": 0.86, + "grad_norm": 2.1479071891206654, + "learning_rate": 4.88861879413236e-07, + "loss": 0.2587, + "step": 13712 + }, + { + "epoch": 0.86, + "grad_norm": 1.5699512695917646, + "learning_rate": 4.884227446730372e-07, + "loss": 0.2552, + "step": 13713 + }, + { + "epoch": 0.86, + "grad_norm": 1.493464694099462, + "learning_rate": 4.879837971261897e-07, + "loss": 0.2419, + "step": 13714 + }, + { + "epoch": 0.86, + "grad_norm": 3.30024647646415, + "learning_rate": 4.875450367909101e-07, + "loss": 0.245, + "step": 13715 + }, + { + "epoch": 0.86, + "grad_norm": 1.6888980037888235, + "learning_rate": 4.871064636854029e-07, + "loss": 0.255, + "step": 13716 + }, + { + "epoch": 0.86, + "grad_norm": 2.7814713071352997, + "learning_rate": 4.866680778278637e-07, + "loss": 0.2575, + "step": 13717 + }, + { + "epoch": 0.86, + "grad_norm": 1.6932877666833304, + "learning_rate": 4.862298792364817e-07, + "loss": 0.2323, + "step": 13718 + }, + { + "epoch": 0.86, + "grad_norm": 1.8334929928287969, + "learning_rate": 4.857918679294405e-07, + "loss": 0.2474, + "step": 13719 + }, + { + "epoch": 0.86, + "grad_norm": 2.446489705834415, + "learning_rate": 4.853540439249127e-07, + "loss": 0.2401, + "step": 13720 + }, + { + "epoch": 0.86, + "grad_norm": 1.7818780712405435, + "learning_rate": 4.849164072410639e-07, + "loss": 0.2392, + "step": 13721 + }, + { + "epoch": 0.86, + "grad_norm": 2.837794344465931, + "learning_rate": 4.844789578960524e-07, + "loss": 0.248, + "step": 13722 + }, + { + "epoch": 0.86, + "grad_norm": 2.40695280700196, + "learning_rate": 4.840416959080307e-07, + "loss": 0.2362, + "step": 13723 + }, + { + "epoch": 0.86, + "grad_norm": 1.8239567390662956, + "learning_rate": 4.836046212951406e-07, + "loss": 0.2471, + "step": 13724 + }, + { + "epoch": 0.86, + "grad_norm": 5.673927413885572, + "learning_rate": 4.831677340755164e-07, + "loss": 0.255, + "step": 13725 + }, + { + "epoch": 0.86, + "grad_norm": 6.480471311827824, + "learning_rate": 4.827310342672847e-07, + "loss": 0.2653, + "step": 13726 + }, + { + "epoch": 0.86, + "grad_norm": 1.8703191755513764, + "learning_rate": 4.822945218885672e-07, + "loss": 0.2748, + "step": 13727 + }, + { + "epoch": 0.86, + "grad_norm": 16.017521689606188, + "learning_rate": 4.818581969574743e-07, + "loss": 0.2774, + "step": 13728 + }, + { + "epoch": 0.86, + "grad_norm": 3.560156684088177, + "learning_rate": 4.814220594921087e-07, + "loss": 0.2772, + "step": 13729 + }, + { + "epoch": 0.86, + "grad_norm": 3.2924727580181514, + "learning_rate": 4.809861095105678e-07, + "loss": 0.2722, + "step": 13730 + }, + { + "epoch": 0.86, + "grad_norm": 7.2608226204571915, + "learning_rate": 4.805503470309408e-07, + "loss": 0.2402, + "step": 13731 + }, + { + "epoch": 0.86, + "grad_norm": 4.364101019053066, + "learning_rate": 4.801147720713079e-07, + "loss": 0.2558, + "step": 13732 + }, + { + "epoch": 0.86, + "grad_norm": 2.025503460901223, + "learning_rate": 4.796793846497411e-07, + "loss": 0.2509, + "step": 13733 + }, + { + "epoch": 0.86, + "grad_norm": 1.9664366367185016, + "learning_rate": 4.792441847843043e-07, + "loss": 0.2557, + "step": 13734 + }, + { + "epoch": 0.86, + "grad_norm": 2.5237687523827645, + "learning_rate": 4.788091724930571e-07, + "loss": 0.2648, + "step": 13735 + }, + { + "epoch": 0.86, + "grad_norm": 1.3612160673713032, + "learning_rate": 4.783743477940473e-07, + "loss": 0.2591, + "step": 13736 + }, + { + "epoch": 0.86, + "grad_norm": 1.576773378582313, + "learning_rate": 4.779397107053174e-07, + "loss": 0.2589, + "step": 13737 + }, + { + "epoch": 0.86, + "grad_norm": 3.8327080019559063, + "learning_rate": 4.775052612449005e-07, + "loss": 0.2464, + "step": 13738 + }, + { + "epoch": 0.86, + "grad_norm": 0.5667142764524086, + "learning_rate": 4.770709994308242e-07, + "loss": 0.4484, + "step": 13739 + }, + { + "epoch": 0.86, + "grad_norm": 6.785820436627895, + "learning_rate": 4.766369252811054e-07, + "loss": 0.2348, + "step": 13740 + }, + { + "epoch": 0.86, + "grad_norm": 1.524841412966559, + "learning_rate": 4.762030388137551e-07, + "loss": 0.2459, + "step": 13741 + }, + { + "epoch": 0.86, + "grad_norm": 4.5118561493442915, + "learning_rate": 4.75769340046775e-07, + "loss": 0.2508, + "step": 13742 + }, + { + "epoch": 0.86, + "grad_norm": 3.615071272129694, + "learning_rate": 4.7533582899816133e-07, + "loss": 0.2495, + "step": 13743 + }, + { + "epoch": 0.86, + "grad_norm": 2.296283413077024, + "learning_rate": 4.7490250568590137e-07, + "loss": 0.2477, + "step": 13744 + }, + { + "epoch": 0.86, + "grad_norm": 2.759273953850784, + "learning_rate": 4.744693701279735e-07, + "loss": 0.2501, + "step": 13745 + }, + { + "epoch": 0.86, + "grad_norm": 1.827686209491277, + "learning_rate": 4.7403642234234935e-07, + "loss": 0.2392, + "step": 13746 + }, + { + "epoch": 0.86, + "grad_norm": 1.9542298104846654, + "learning_rate": 4.736036623469936e-07, + "loss": 0.2857, + "step": 13747 + }, + { + "epoch": 0.86, + "grad_norm": 1.6202546621098888, + "learning_rate": 4.731710901598624e-07, + "loss": 0.2453, + "step": 13748 + }, + { + "epoch": 0.86, + "grad_norm": 0.6092639131500509, + "learning_rate": 4.7273870579890247e-07, + "loss": 0.455, + "step": 13749 + }, + { + "epoch": 0.86, + "grad_norm": 1.5931816960150724, + "learning_rate": 4.7230650928205447e-07, + "loss": 0.2485, + "step": 13750 + }, + { + "epoch": 0.86, + "grad_norm": 3.50300836641801, + "learning_rate": 4.7187450062725126e-07, + "loss": 0.2433, + "step": 13751 + }, + { + "epoch": 0.86, + "grad_norm": 1.9782678999273278, + "learning_rate": 4.7144267985241856e-07, + "loss": 0.2375, + "step": 13752 + }, + { + "epoch": 0.86, + "grad_norm": 3.880827791846489, + "learning_rate": 4.7101104697547307e-07, + "loss": 0.2495, + "step": 13753 + }, + { + "epoch": 0.86, + "grad_norm": 1.4834969288427793, + "learning_rate": 4.7057960201432275e-07, + "loss": 0.2439, + "step": 13754 + }, + { + "epoch": 0.87, + "grad_norm": 2.6071836496123604, + "learning_rate": 4.7014834498687045e-07, + "loss": 0.26, + "step": 13755 + }, + { + "epoch": 0.87, + "grad_norm": 2.6627098367754756, + "learning_rate": 4.697172759110097e-07, + "loss": 0.2676, + "step": 13756 + }, + { + "epoch": 0.87, + "grad_norm": 2.042789929514817, + "learning_rate": 4.692863948046256e-07, + "loss": 0.2488, + "step": 13757 + }, + { + "epoch": 0.87, + "grad_norm": 1.7472584622802503, + "learning_rate": 4.6885570168559493e-07, + "loss": 0.2474, + "step": 13758 + }, + { + "epoch": 0.87, + "grad_norm": 1.9202966305175444, + "learning_rate": 4.6842519657179066e-07, + "loss": 0.2454, + "step": 13759 + }, + { + "epoch": 0.87, + "grad_norm": 1.9996697367178797, + "learning_rate": 4.67994879481074e-07, + "loss": 0.249, + "step": 13760 + }, + { + "epoch": 0.87, + "grad_norm": 1.6712291343920274, + "learning_rate": 4.6756475043130024e-07, + "loss": 0.2482, + "step": 13761 + }, + { + "epoch": 0.87, + "grad_norm": 1.5700233439317361, + "learning_rate": 4.671348094403139e-07, + "loss": 0.2602, + "step": 13762 + }, + { + "epoch": 0.87, + "grad_norm": 6.035639140420502, + "learning_rate": 4.667050565259568e-07, + "loss": 0.2517, + "step": 13763 + }, + { + "epoch": 0.87, + "grad_norm": 2.480546374106348, + "learning_rate": 4.662754917060591e-07, + "loss": 0.2502, + "step": 13764 + }, + { + "epoch": 0.87, + "grad_norm": 14.413137043776564, + "learning_rate": 4.658461149984439e-07, + "loss": 0.259, + "step": 13765 + }, + { + "epoch": 0.87, + "grad_norm": 1.9089607350914712, + "learning_rate": 4.654169264209246e-07, + "loss": 0.2489, + "step": 13766 + }, + { + "epoch": 0.87, + "grad_norm": 2.7866982543067795, + "learning_rate": 4.649879259913137e-07, + "loss": 0.25, + "step": 13767 + }, + { + "epoch": 0.87, + "grad_norm": 1.6881030706778166, + "learning_rate": 4.645591137274091e-07, + "loss": 0.2475, + "step": 13768 + }, + { + "epoch": 0.87, + "grad_norm": 1.746952614699426, + "learning_rate": 4.6413048964700224e-07, + "loss": 0.2604, + "step": 13769 + }, + { + "epoch": 0.87, + "grad_norm": 4.169900530410927, + "learning_rate": 4.637020537678771e-07, + "loss": 0.2567, + "step": 13770 + }, + { + "epoch": 0.87, + "grad_norm": 3.6558378005064505, + "learning_rate": 4.6327380610781235e-07, + "loss": 0.2552, + "step": 13771 + }, + { + "epoch": 0.87, + "grad_norm": 1.41977707890803, + "learning_rate": 4.628457466845754e-07, + "loss": 0.269, + "step": 13772 + }, + { + "epoch": 0.87, + "grad_norm": 1.9287978507545889, + "learning_rate": 4.6241787551592645e-07, + "loss": 0.2408, + "step": 13773 + }, + { + "epoch": 0.87, + "grad_norm": 2.0971421383884636, + "learning_rate": 4.619901926196191e-07, + "loss": 0.2337, + "step": 13774 + }, + { + "epoch": 0.87, + "grad_norm": 1.8631539706243492, + "learning_rate": 4.615626980134003e-07, + "loss": 0.231, + "step": 13775 + }, + { + "epoch": 0.87, + "grad_norm": 7.092877746318571, + "learning_rate": 4.611353917150063e-07, + "loss": 0.2634, + "step": 13776 + }, + { + "epoch": 0.87, + "grad_norm": 6.893296375540724, + "learning_rate": 4.6070827374216697e-07, + "loss": 0.2471, + "step": 13777 + }, + { + "epoch": 0.87, + "grad_norm": 2.5562991316454764, + "learning_rate": 4.602813441126025e-07, + "loss": 0.2428, + "step": 13778 + }, + { + "epoch": 0.87, + "grad_norm": 2.4261447464071324, + "learning_rate": 4.5985460284403037e-07, + "loss": 0.2529, + "step": 13779 + }, + { + "epoch": 0.87, + "grad_norm": 1.9234299626647677, + "learning_rate": 4.5942804995415423e-07, + "loss": 0.2584, + "step": 13780 + }, + { + "epoch": 0.87, + "grad_norm": 2.9145088635375673, + "learning_rate": 4.590016854606727e-07, + "loss": 0.2472, + "step": 13781 + }, + { + "epoch": 0.87, + "grad_norm": 2.63369464339832, + "learning_rate": 4.585755093812766e-07, + "loss": 0.2552, + "step": 13782 + }, + { + "epoch": 0.87, + "grad_norm": 4.292750135238071, + "learning_rate": 4.5814952173365067e-07, + "loss": 0.2442, + "step": 13783 + }, + { + "epoch": 0.87, + "grad_norm": 2.4874279774455776, + "learning_rate": 4.57723722535468e-07, + "loss": 0.2508, + "step": 13784 + }, + { + "epoch": 0.87, + "grad_norm": 2.206289831535517, + "learning_rate": 4.5729811180439567e-07, + "loss": 0.2893, + "step": 13785 + }, + { + "epoch": 0.87, + "grad_norm": 1.633603228069572, + "learning_rate": 4.568726895580933e-07, + "loss": 0.2497, + "step": 13786 + }, + { + "epoch": 0.87, + "grad_norm": 1.9702627800297179, + "learning_rate": 4.5644745581421293e-07, + "loss": 0.2472, + "step": 13787 + }, + { + "epoch": 0.87, + "grad_norm": 1.9207013771987873, + "learning_rate": 4.560224105903971e-07, + "loss": 0.2396, + "step": 13788 + }, + { + "epoch": 0.87, + "grad_norm": 2.1453534898516478, + "learning_rate": 4.5559755390428284e-07, + "loss": 0.2474, + "step": 13789 + }, + { + "epoch": 0.87, + "grad_norm": 1.5964527090143048, + "learning_rate": 4.5517288577349773e-07, + "loss": 0.2507, + "step": 13790 + }, + { + "epoch": 0.87, + "grad_norm": 4.745744419491947, + "learning_rate": 4.5474840621566264e-07, + "loss": 0.2604, + "step": 13791 + }, + { + "epoch": 0.87, + "grad_norm": 5.319181385274683, + "learning_rate": 4.543241152483896e-07, + "loss": 0.2559, + "step": 13792 + }, + { + "epoch": 0.87, + "grad_norm": 1.4470094108943803, + "learning_rate": 4.539000128892829e-07, + "loss": 0.2411, + "step": 13793 + }, + { + "epoch": 0.87, + "grad_norm": 1.677805217558302, + "learning_rate": 4.534760991559384e-07, + "loss": 0.2533, + "step": 13794 + }, + { + "epoch": 0.87, + "grad_norm": 2.9837572300801964, + "learning_rate": 4.5305237406594705e-07, + "loss": 0.2482, + "step": 13795 + }, + { + "epoch": 0.87, + "grad_norm": 2.477426482139802, + "learning_rate": 4.526288376368887e-07, + "loss": 0.2587, + "step": 13796 + }, + { + "epoch": 0.87, + "grad_norm": 1.5367189378808117, + "learning_rate": 4.5220548988633707e-07, + "loss": 0.2347, + "step": 13797 + }, + { + "epoch": 0.87, + "grad_norm": 1.7406631956229388, + "learning_rate": 4.5178233083185694e-07, + "loss": 0.2459, + "step": 13798 + }, + { + "epoch": 0.87, + "grad_norm": 2.0021300722332454, + "learning_rate": 4.5135936049100715e-07, + "loss": 0.2563, + "step": 13799 + }, + { + "epoch": 0.87, + "grad_norm": 2.8486010054680313, + "learning_rate": 4.509365788813369e-07, + "loss": 0.2592, + "step": 13800 + }, + { + "epoch": 0.87, + "grad_norm": 1.6365894065618019, + "learning_rate": 4.5051398602038775e-07, + "loss": 0.2453, + "step": 13801 + }, + { + "epoch": 0.87, + "grad_norm": 1.6871080344047993, + "learning_rate": 4.5009158192569345e-07, + "loss": 0.2438, + "step": 13802 + }, + { + "epoch": 0.87, + "grad_norm": 1.6626935079153413, + "learning_rate": 4.496693666147811e-07, + "loss": 0.2542, + "step": 13803 + }, + { + "epoch": 0.87, + "grad_norm": 1.5095499566522133, + "learning_rate": 4.4924734010517e-07, + "loss": 0.2382, + "step": 13804 + }, + { + "epoch": 0.87, + "grad_norm": 2.3281388263377885, + "learning_rate": 4.488255024143695e-07, + "loss": 0.234, + "step": 13805 + }, + { + "epoch": 0.87, + "grad_norm": 2.0891323273649456, + "learning_rate": 4.484038535598817e-07, + "loss": 0.2299, + "step": 13806 + }, + { + "epoch": 0.87, + "grad_norm": 2.7026057419641507, + "learning_rate": 4.479823935592037e-07, + "loss": 0.2737, + "step": 13807 + }, + { + "epoch": 0.87, + "grad_norm": 2.185720517961521, + "learning_rate": 4.4756112242982153e-07, + "loss": 0.2582, + "step": 13808 + }, + { + "epoch": 0.87, + "grad_norm": 1.930030917707115, + "learning_rate": 4.471400401892145e-07, + "loss": 0.2391, + "step": 13809 + }, + { + "epoch": 0.87, + "grad_norm": 46.063136179670856, + "learning_rate": 4.4671914685485317e-07, + "loss": 0.2607, + "step": 13810 + }, + { + "epoch": 0.87, + "grad_norm": 2.6535333465397057, + "learning_rate": 4.4629844244420126e-07, + "loss": 0.2566, + "step": 13811 + }, + { + "epoch": 0.87, + "grad_norm": 1.7253326486467209, + "learning_rate": 4.458779269747171e-07, + "loss": 0.2543, + "step": 13812 + }, + { + "epoch": 0.87, + "grad_norm": 2.7635589860368603, + "learning_rate": 4.4545760046384614e-07, + "loss": 0.2454, + "step": 13813 + }, + { + "epoch": 0.87, + "grad_norm": 0.5949430510246059, + "learning_rate": 4.4503746292902825e-07, + "loss": 0.5078, + "step": 13814 + }, + { + "epoch": 0.87, + "grad_norm": 1.6246812372877706, + "learning_rate": 4.4461751438769794e-07, + "loss": 0.2505, + "step": 13815 + }, + { + "epoch": 0.87, + "grad_norm": 1.6625477921143004, + "learning_rate": 4.441977548572779e-07, + "loss": 0.2535, + "step": 13816 + }, + { + "epoch": 0.87, + "grad_norm": 2.0977268379278056, + "learning_rate": 4.4377818435518474e-07, + "loss": 0.2446, + "step": 13817 + }, + { + "epoch": 0.87, + "grad_norm": 2.5827314666332564, + "learning_rate": 4.433588028988267e-07, + "loss": 0.2744, + "step": 13818 + }, + { + "epoch": 0.87, + "grad_norm": 2.8016090532650186, + "learning_rate": 4.4293961050560607e-07, + "loss": 0.2382, + "step": 13819 + }, + { + "epoch": 0.87, + "grad_norm": 5.132460261241813, + "learning_rate": 4.4252060719291556e-07, + "loss": 0.2425, + "step": 13820 + }, + { + "epoch": 0.87, + "grad_norm": 2.5789131835596515, + "learning_rate": 4.4210179297814016e-07, + "loss": 0.2624, + "step": 13821 + }, + { + "epoch": 0.87, + "grad_norm": 2.2596280978187036, + "learning_rate": 4.4168316787865594e-07, + "loss": 0.2693, + "step": 13822 + }, + { + "epoch": 0.87, + "grad_norm": 2.649634793274662, + "learning_rate": 4.412647319118346e-07, + "loss": 0.2507, + "step": 13823 + }, + { + "epoch": 0.87, + "grad_norm": 4.694741837403251, + "learning_rate": 4.4084648509503724e-07, + "loss": 0.2551, + "step": 13824 + }, + { + "epoch": 0.87, + "grad_norm": 1.948856281137073, + "learning_rate": 4.404284274456161e-07, + "loss": 0.2437, + "step": 13825 + }, + { + "epoch": 0.87, + "grad_norm": 2.0648872061383656, + "learning_rate": 4.4001055898091894e-07, + "loss": 0.2397, + "step": 13826 + }, + { + "epoch": 0.87, + "grad_norm": 4.172524952943479, + "learning_rate": 4.395928797182819e-07, + "loss": 0.2837, + "step": 13827 + }, + { + "epoch": 0.87, + "grad_norm": 1.4186274080291301, + "learning_rate": 4.391753896750378e-07, + "loss": 0.2512, + "step": 13828 + }, + { + "epoch": 0.87, + "grad_norm": 1.491560574259554, + "learning_rate": 4.3875808886850777e-07, + "loss": 0.226, + "step": 13829 + }, + { + "epoch": 0.87, + "grad_norm": 1.410133617998824, + "learning_rate": 4.383409773160052e-07, + "loss": 0.2469, + "step": 13830 + }, + { + "epoch": 0.87, + "grad_norm": 1.6049657604459995, + "learning_rate": 4.3792405503483903e-07, + "loss": 0.2737, + "step": 13831 + }, + { + "epoch": 0.87, + "grad_norm": 2.0627550030741637, + "learning_rate": 4.375073220423065e-07, + "loss": 0.2654, + "step": 13832 + }, + { + "epoch": 0.87, + "grad_norm": 1.2355520813069347, + "learning_rate": 4.370907783556988e-07, + "loss": 0.2291, + "step": 13833 + }, + { + "epoch": 0.87, + "grad_norm": 2.11886330585489, + "learning_rate": 4.3667442399229985e-07, + "loss": 0.242, + "step": 13834 + }, + { + "epoch": 0.87, + "grad_norm": 1.6402330554041666, + "learning_rate": 4.362582589693837e-07, + "loss": 0.2468, + "step": 13835 + }, + { + "epoch": 0.87, + "grad_norm": 2.214726740768506, + "learning_rate": 4.358422833042192e-07, + "loss": 0.2672, + "step": 13836 + }, + { + "epoch": 0.87, + "grad_norm": 1.6252222120775173, + "learning_rate": 4.354264970140654e-07, + "loss": 0.259, + "step": 13837 + }, + { + "epoch": 0.87, + "grad_norm": 1.9010466084452147, + "learning_rate": 4.3501090011617286e-07, + "loss": 0.2659, + "step": 13838 + }, + { + "epoch": 0.87, + "grad_norm": 1.9746578949649074, + "learning_rate": 4.3459549262778736e-07, + "loss": 0.2697, + "step": 13839 + }, + { + "epoch": 0.87, + "grad_norm": 1.6810623032065262, + "learning_rate": 4.3418027456614277e-07, + "loss": 0.2365, + "step": 13840 + }, + { + "epoch": 0.87, + "grad_norm": 1.972024926647831, + "learning_rate": 4.337652459484698e-07, + "loss": 0.2651, + "step": 13841 + }, + { + "epoch": 0.87, + "grad_norm": 3.0992942064946996, + "learning_rate": 4.333504067919858e-07, + "loss": 0.2449, + "step": 13842 + }, + { + "epoch": 0.87, + "grad_norm": 2.4261767185343013, + "learning_rate": 4.329357571139059e-07, + "loss": 0.266, + "step": 13843 + }, + { + "epoch": 0.87, + "grad_norm": 1.6206963025094863, + "learning_rate": 4.3252129693143353e-07, + "loss": 0.2383, + "step": 13844 + }, + { + "epoch": 0.87, + "grad_norm": 1.9267780552117144, + "learning_rate": 4.321070262617655e-07, + "loss": 0.2376, + "step": 13845 + }, + { + "epoch": 0.87, + "grad_norm": 2.13778457696387, + "learning_rate": 4.3169294512208917e-07, + "loss": 0.2397, + "step": 13846 + }, + { + "epoch": 0.87, + "grad_norm": 1.8575974450737254, + "learning_rate": 4.3127905352958796e-07, + "loss": 0.2455, + "step": 13847 + }, + { + "epoch": 0.87, + "grad_norm": 2.2793197489275094, + "learning_rate": 4.308653515014327e-07, + "loss": 0.2686, + "step": 13848 + }, + { + "epoch": 0.87, + "grad_norm": 2.2701318796487207, + "learning_rate": 4.304518390547907e-07, + "loss": 0.2414, + "step": 13849 + }, + { + "epoch": 0.87, + "grad_norm": 2.41990295399939, + "learning_rate": 4.300385162068177e-07, + "loss": 0.2486, + "step": 13850 + }, + { + "epoch": 0.87, + "grad_norm": 2.0833381921166843, + "learning_rate": 4.296253829746644e-07, + "loss": 0.2442, + "step": 13851 + }, + { + "epoch": 0.87, + "grad_norm": 1.911491157143949, + "learning_rate": 4.292124393754715e-07, + "loss": 0.2502, + "step": 13852 + }, + { + "epoch": 0.87, + "grad_norm": 1.8997245751700704, + "learning_rate": 4.287996854263737e-07, + "loss": 0.245, + "step": 13853 + }, + { + "epoch": 0.87, + "grad_norm": 12.068283235664751, + "learning_rate": 4.283871211444951e-07, + "loss": 0.2574, + "step": 13854 + }, + { + "epoch": 0.87, + "grad_norm": 1.7681632505693208, + "learning_rate": 4.2797474654695527e-07, + "loss": 0.2595, + "step": 13855 + }, + { + "epoch": 0.87, + "grad_norm": 2.108444660808345, + "learning_rate": 4.2756256165086443e-07, + "loss": 0.2533, + "step": 13856 + }, + { + "epoch": 0.87, + "grad_norm": 4.354750050889478, + "learning_rate": 4.27150566473325e-07, + "loss": 0.2423, + "step": 13857 + }, + { + "epoch": 0.87, + "grad_norm": 2.4171742333104715, + "learning_rate": 4.267387610314294e-07, + "loss": 0.2518, + "step": 13858 + }, + { + "epoch": 0.87, + "grad_norm": 1.537594496454954, + "learning_rate": 4.2632714534226685e-07, + "loss": 0.2562, + "step": 13859 + }, + { + "epoch": 0.87, + "grad_norm": 1.4358005144862205, + "learning_rate": 4.259157194229152e-07, + "loss": 0.2552, + "step": 13860 + }, + { + "epoch": 0.87, + "grad_norm": 2.960417309318742, + "learning_rate": 4.2550448329044426e-07, + "loss": 0.2631, + "step": 13861 + }, + { + "epoch": 0.87, + "grad_norm": 2.0160656549023637, + "learning_rate": 4.2509343696191695e-07, + "loss": 0.2341, + "step": 13862 + }, + { + "epoch": 0.87, + "grad_norm": 2.1833622724811916, + "learning_rate": 4.246825804543886e-07, + "loss": 0.2584, + "step": 13863 + }, + { + "epoch": 0.87, + "grad_norm": 4.0390217373261255, + "learning_rate": 4.242719137849077e-07, + "loss": 0.2477, + "step": 13864 + }, + { + "epoch": 0.87, + "grad_norm": 2.1054123638811757, + "learning_rate": 4.238614369705124e-07, + "loss": 0.2627, + "step": 13865 + }, + { + "epoch": 0.87, + "grad_norm": 1.906825018692233, + "learning_rate": 4.2345115002823345e-07, + "loss": 0.2481, + "step": 13866 + }, + { + "epoch": 0.87, + "grad_norm": 2.700215689055635, + "learning_rate": 4.2304105297509614e-07, + "loss": 0.2509, + "step": 13867 + }, + { + "epoch": 0.87, + "grad_norm": 0.6226385486277936, + "learning_rate": 4.2263114582811515e-07, + "loss": 0.4667, + "step": 13868 + }, + { + "epoch": 0.87, + "grad_norm": 3.782780375876242, + "learning_rate": 4.2222142860429805e-07, + "loss": 0.2603, + "step": 13869 + }, + { + "epoch": 0.87, + "grad_norm": 2.428520394096701, + "learning_rate": 4.2181190132064396e-07, + "loss": 0.2606, + "step": 13870 + }, + { + "epoch": 0.87, + "grad_norm": 2.8714541166991, + "learning_rate": 4.2140256399414603e-07, + "loss": 0.25, + "step": 13871 + }, + { + "epoch": 0.87, + "grad_norm": 1.749954869223884, + "learning_rate": 4.2099341664178894e-07, + "loss": 0.2418, + "step": 13872 + }, + { + "epoch": 0.87, + "grad_norm": 1.5808420850754918, + "learning_rate": 4.205844592805486e-07, + "loss": 0.2579, + "step": 13873 + }, + { + "epoch": 0.87, + "grad_norm": 1.6077489134122638, + "learning_rate": 4.2017569192739195e-07, + "loss": 0.2445, + "step": 13874 + }, + { + "epoch": 0.87, + "grad_norm": 2.2296919846073795, + "learning_rate": 4.1976711459928096e-07, + "loss": 0.2458, + "step": 13875 + }, + { + "epoch": 0.87, + "grad_norm": 2.3033601982153282, + "learning_rate": 4.193587273131683e-07, + "loss": 0.2425, + "step": 13876 + }, + { + "epoch": 0.87, + "grad_norm": 1.5669571185389406, + "learning_rate": 4.189505300859981e-07, + "loss": 0.2415, + "step": 13877 + }, + { + "epoch": 0.87, + "grad_norm": 1.371627591062167, + "learning_rate": 4.185425229347062e-07, + "loss": 0.2718, + "step": 13878 + }, + { + "epoch": 0.87, + "grad_norm": 1.5515246372638904, + "learning_rate": 4.1813470587622305e-07, + "loss": 0.2468, + "step": 13879 + }, + { + "epoch": 0.87, + "grad_norm": 1.5876116038820525, + "learning_rate": 4.1772707892746954e-07, + "loss": 0.2466, + "step": 13880 + }, + { + "epoch": 0.87, + "grad_norm": 1.609968141072775, + "learning_rate": 4.173196421053588e-07, + "loss": 0.2574, + "step": 13881 + }, + { + "epoch": 0.87, + "grad_norm": 1.7243799581860655, + "learning_rate": 4.1691239542679507e-07, + "loss": 0.2411, + "step": 13882 + }, + { + "epoch": 0.87, + "grad_norm": 2.670650289852566, + "learning_rate": 4.165053389086776e-07, + "loss": 0.2344, + "step": 13883 + }, + { + "epoch": 0.87, + "grad_norm": 2.3328112575453748, + "learning_rate": 4.160984725678946e-07, + "loss": 0.2386, + "step": 13884 + }, + { + "epoch": 0.87, + "grad_norm": 1.812917492437089, + "learning_rate": 4.1569179642132694e-07, + "loss": 0.2498, + "step": 13885 + }, + { + "epoch": 0.87, + "grad_norm": 3.894752647902837, + "learning_rate": 4.152853104858506e-07, + "loss": 0.2851, + "step": 13886 + }, + { + "epoch": 0.87, + "grad_norm": 1.7591454201584609, + "learning_rate": 4.148790147783288e-07, + "loss": 0.2484, + "step": 13887 + }, + { + "epoch": 0.87, + "grad_norm": 2.6124279603834912, + "learning_rate": 4.1447290931562136e-07, + "loss": 0.2714, + "step": 13888 + }, + { + "epoch": 0.87, + "grad_norm": 0.6797118342691032, + "learning_rate": 4.1406699411457804e-07, + "loss": 0.4849, + "step": 13889 + }, + { + "epoch": 0.87, + "grad_norm": 1.7244922133291847, + "learning_rate": 4.1366126919203996e-07, + "loss": 0.2346, + "step": 13890 + }, + { + "epoch": 0.87, + "grad_norm": 2.31265028502726, + "learning_rate": 4.1325573456484304e-07, + "loss": 0.2473, + "step": 13891 + }, + { + "epoch": 0.87, + "grad_norm": 11.915228006423384, + "learning_rate": 4.1285039024981265e-07, + "loss": 0.2521, + "step": 13892 + }, + { + "epoch": 0.87, + "grad_norm": 1.970610547757536, + "learning_rate": 4.12445236263766e-07, + "loss": 0.2559, + "step": 13893 + }, + { + "epoch": 0.87, + "grad_norm": 2.088654997788465, + "learning_rate": 4.120402726235156e-07, + "loss": 0.2501, + "step": 13894 + }, + { + "epoch": 0.87, + "grad_norm": 2.5273454258558474, + "learning_rate": 4.116354993458632e-07, + "loss": 0.247, + "step": 13895 + }, + { + "epoch": 0.87, + "grad_norm": 1.7837724964174653, + "learning_rate": 4.1123091644760413e-07, + "loss": 0.2514, + "step": 13896 + }, + { + "epoch": 0.87, + "grad_norm": 7.43589432026048, + "learning_rate": 4.108265239455256e-07, + "loss": 0.2583, + "step": 13897 + }, + { + "epoch": 0.87, + "grad_norm": 1.7085514184862256, + "learning_rate": 4.104223218564046e-07, + "loss": 0.2497, + "step": 13898 + }, + { + "epoch": 0.87, + "grad_norm": 2.785846022395972, + "learning_rate": 4.10018310197014e-07, + "loss": 0.2431, + "step": 13899 + }, + { + "epoch": 0.87, + "grad_norm": 2.3647852333575528, + "learning_rate": 4.096144889841158e-07, + "loss": 0.2541, + "step": 13900 + }, + { + "epoch": 0.87, + "grad_norm": 5.52349487523597, + "learning_rate": 4.092108582344673e-07, + "loss": 0.2454, + "step": 13901 + }, + { + "epoch": 0.87, + "grad_norm": 2.5695289241864465, + "learning_rate": 4.088074179648138e-07, + "loss": 0.2677, + "step": 13902 + }, + { + "epoch": 0.87, + "grad_norm": 1.3692451951898694, + "learning_rate": 4.084041681918949e-07, + "loss": 0.2382, + "step": 13903 + }, + { + "epoch": 0.87, + "grad_norm": 3.023437322970993, + "learning_rate": 4.0800110893244314e-07, + "loss": 0.2872, + "step": 13904 + }, + { + "epoch": 0.87, + "grad_norm": 1.9305002568167204, + "learning_rate": 4.0759824020318197e-07, + "loss": 0.2657, + "step": 13905 + }, + { + "epoch": 0.87, + "grad_norm": 2.9176846662156515, + "learning_rate": 4.0719556202082567e-07, + "loss": 0.277, + "step": 13906 + }, + { + "epoch": 0.87, + "grad_norm": 44.749710952022866, + "learning_rate": 4.067930744020837e-07, + "loss": 0.2478, + "step": 13907 + }, + { + "epoch": 0.87, + "grad_norm": 0.5910365872232302, + "learning_rate": 4.0639077736365606e-07, + "loss": 0.448, + "step": 13908 + }, + { + "epoch": 0.87, + "grad_norm": 1.9684827752178375, + "learning_rate": 4.059886709222344e-07, + "loss": 0.2292, + "step": 13909 + }, + { + "epoch": 0.87, + "grad_norm": 1.9454508858254342, + "learning_rate": 4.055867550945025e-07, + "loss": 0.2514, + "step": 13910 + }, + { + "epoch": 0.87, + "grad_norm": 2.549682611788474, + "learning_rate": 4.05185029897136e-07, + "loss": 0.2571, + "step": 13911 + }, + { + "epoch": 0.87, + "grad_norm": 1.9142156366389025, + "learning_rate": 4.0478349534680496e-07, + "loss": 0.2688, + "step": 13912 + }, + { + "epoch": 0.87, + "grad_norm": 1.6895084210184605, + "learning_rate": 4.043821514601681e-07, + "loss": 0.231, + "step": 13913 + }, + { + "epoch": 0.88, + "grad_norm": 1.7273889713895971, + "learning_rate": 4.039809982538784e-07, + "loss": 0.2617, + "step": 13914 + }, + { + "epoch": 0.88, + "grad_norm": 1.6850846852629544, + "learning_rate": 4.0358003574458017e-07, + "loss": 0.2704, + "step": 13915 + }, + { + "epoch": 0.88, + "grad_norm": 3.836940898758, + "learning_rate": 4.031792639489113e-07, + "loss": 0.2617, + "step": 13916 + }, + { + "epoch": 0.88, + "grad_norm": 1.7979533425324887, + "learning_rate": 4.027786828834995e-07, + "loss": 0.2469, + "step": 13917 + }, + { + "epoch": 0.88, + "grad_norm": 3.8854678790418142, + "learning_rate": 4.023782925649661e-07, + "loss": 0.2533, + "step": 13918 + }, + { + "epoch": 0.88, + "grad_norm": 7.0842772759433394, + "learning_rate": 4.0197809300992206e-07, + "loss": 0.2403, + "step": 13919 + }, + { + "epoch": 0.88, + "grad_norm": 2.74645143191841, + "learning_rate": 4.0157808423497537e-07, + "loss": 0.242, + "step": 13920 + }, + { + "epoch": 0.88, + "grad_norm": 1.9685761744317802, + "learning_rate": 4.011782662567215e-07, + "loss": 0.2506, + "step": 13921 + }, + { + "epoch": 0.88, + "grad_norm": 4.701193911893507, + "learning_rate": 4.007786390917495e-07, + "loss": 0.2859, + "step": 13922 + }, + { + "epoch": 0.88, + "grad_norm": 2.0616095332696727, + "learning_rate": 4.0037920275664e-07, + "loss": 0.2662, + "step": 13923 + }, + { + "epoch": 0.88, + "grad_norm": 21.16385087060391, + "learning_rate": 3.999799572679686e-07, + "loss": 0.2591, + "step": 13924 + }, + { + "epoch": 0.88, + "grad_norm": 1.4668387854806826, + "learning_rate": 3.9958090264229923e-07, + "loss": 0.262, + "step": 13925 + }, + { + "epoch": 0.88, + "grad_norm": 2.007863733481297, + "learning_rate": 3.991820388961887e-07, + "loss": 0.2543, + "step": 13926 + }, + { + "epoch": 0.88, + "grad_norm": 5.130776382220719, + "learning_rate": 3.987833660461882e-07, + "loss": 0.2571, + "step": 13927 + }, + { + "epoch": 0.88, + "grad_norm": 2.5468586335375507, + "learning_rate": 3.9838488410883837e-07, + "loss": 0.248, + "step": 13928 + }, + { + "epoch": 0.88, + "grad_norm": 1.4178728606148994, + "learning_rate": 3.979865931006732e-07, + "loss": 0.2422, + "step": 13929 + }, + { + "epoch": 0.88, + "grad_norm": 1.9995809761593755, + "learning_rate": 3.975884930382173e-07, + "loss": 0.2523, + "step": 13930 + }, + { + "epoch": 0.88, + "grad_norm": 7.640397388681757, + "learning_rate": 3.971905839379903e-07, + "loss": 0.2557, + "step": 13931 + }, + { + "epoch": 0.88, + "grad_norm": 0.5824689224284922, + "learning_rate": 3.9679286581650155e-07, + "loss": 0.4839, + "step": 13932 + }, + { + "epoch": 0.88, + "grad_norm": 3.1923699861803274, + "learning_rate": 3.963953386902536e-07, + "loss": 0.2433, + "step": 13933 + }, + { + "epoch": 0.88, + "grad_norm": 1.6409875478485179, + "learning_rate": 3.9599800257573873e-07, + "loss": 0.2524, + "step": 13934 + }, + { + "epoch": 0.88, + "grad_norm": 2.5029582875661958, + "learning_rate": 3.9560085748944545e-07, + "loss": 0.2452, + "step": 13935 + }, + { + "epoch": 0.88, + "grad_norm": 1.4388595656727534, + "learning_rate": 3.9520390344785053e-07, + "loss": 0.2566, + "step": 13936 + }, + { + "epoch": 0.88, + "grad_norm": 1.8138724186168833, + "learning_rate": 3.948071404674242e-07, + "loss": 0.2362, + "step": 13937 + }, + { + "epoch": 0.88, + "grad_norm": 4.1889370186957295, + "learning_rate": 3.944105685646299e-07, + "loss": 0.2666, + "step": 13938 + }, + { + "epoch": 0.88, + "grad_norm": 1.7682123636206566, + "learning_rate": 3.940141877559212e-07, + "loss": 0.2374, + "step": 13939 + }, + { + "epoch": 0.88, + "grad_norm": 3.6072873298661396, + "learning_rate": 3.9361799805774535e-07, + "loss": 0.2722, + "step": 13940 + }, + { + "epoch": 0.88, + "grad_norm": 1.566699737378959, + "learning_rate": 3.9322199948654104e-07, + "loss": 0.2591, + "step": 13941 + }, + { + "epoch": 0.88, + "grad_norm": 1.9252607057894588, + "learning_rate": 3.928261920587373e-07, + "loss": 0.2651, + "step": 13942 + }, + { + "epoch": 0.88, + "grad_norm": 2.3426039067240128, + "learning_rate": 3.9243057579075925e-07, + "loss": 0.2664, + "step": 13943 + }, + { + "epoch": 0.88, + "grad_norm": 3.7662689443890587, + "learning_rate": 3.9203515069902055e-07, + "loss": 0.2601, + "step": 13944 + }, + { + "epoch": 0.88, + "grad_norm": 4.124532112595285, + "learning_rate": 3.9163991679992684e-07, + "loss": 0.2439, + "step": 13945 + }, + { + "epoch": 0.88, + "grad_norm": 1.6089708090188581, + "learning_rate": 3.9124487410987955e-07, + "loss": 0.2543, + "step": 13946 + }, + { + "epoch": 0.88, + "grad_norm": 5.599972590825963, + "learning_rate": 3.9085002264526775e-07, + "loss": 0.2804, + "step": 13947 + }, + { + "epoch": 0.88, + "grad_norm": 1.4550784722978682, + "learning_rate": 3.904553624224761e-07, + "loss": 0.2425, + "step": 13948 + }, + { + "epoch": 0.88, + "grad_norm": 3.7191182616936205, + "learning_rate": 3.900608934578787e-07, + "loss": 0.2789, + "step": 13949 + }, + { + "epoch": 0.88, + "grad_norm": 3.2779646523062587, + "learning_rate": 3.896666157678425e-07, + "loss": 0.2508, + "step": 13950 + }, + { + "epoch": 0.88, + "grad_norm": 2.2814600805195453, + "learning_rate": 3.8927252936872774e-07, + "loss": 0.2458, + "step": 13951 + }, + { + "epoch": 0.88, + "grad_norm": 2.8321334127552613, + "learning_rate": 3.8887863427688463e-07, + "loss": 0.2674, + "step": 13952 + }, + { + "epoch": 0.88, + "grad_norm": 2.8102398638850765, + "learning_rate": 3.884849305086585e-07, + "loss": 0.2571, + "step": 13953 + }, + { + "epoch": 0.88, + "grad_norm": 3.041175786355012, + "learning_rate": 3.88091418080383e-07, + "loss": 0.2591, + "step": 13954 + }, + { + "epoch": 0.88, + "grad_norm": 2.72017722148624, + "learning_rate": 3.8769809700838546e-07, + "loss": 0.2519, + "step": 13955 + }, + { + "epoch": 0.88, + "grad_norm": 3.3002952112703197, + "learning_rate": 3.8730496730898735e-07, + "loss": 0.2631, + "step": 13956 + }, + { + "epoch": 0.88, + "grad_norm": 1.9066585536648852, + "learning_rate": 3.86912028998499e-07, + "loss": 0.2697, + "step": 13957 + }, + { + "epoch": 0.88, + "grad_norm": 1.880663232172344, + "learning_rate": 3.865192820932234e-07, + "loss": 0.2556, + "step": 13958 + }, + { + "epoch": 0.88, + "grad_norm": 3.5605055531928658, + "learning_rate": 3.861267266094587e-07, + "loss": 0.2604, + "step": 13959 + }, + { + "epoch": 0.88, + "grad_norm": 2.4718300748092226, + "learning_rate": 3.857343625634896e-07, + "loss": 0.2539, + "step": 13960 + }, + { + "epoch": 0.88, + "grad_norm": 2.3417541065616234, + "learning_rate": 3.8534218997159923e-07, + "loss": 0.2506, + "step": 13961 + }, + { + "epoch": 0.88, + "grad_norm": 7.137743805169988, + "learning_rate": 3.8495020885005784e-07, + "loss": 0.2604, + "step": 13962 + }, + { + "epoch": 0.88, + "grad_norm": 1.5818669127051017, + "learning_rate": 3.8455841921512803e-07, + "loss": 0.261, + "step": 13963 + }, + { + "epoch": 0.88, + "grad_norm": 1.9518111291864557, + "learning_rate": 3.8416682108306904e-07, + "loss": 0.2433, + "step": 13964 + }, + { + "epoch": 0.88, + "grad_norm": 2.0509379586656284, + "learning_rate": 3.837754144701267e-07, + "loss": 0.2475, + "step": 13965 + }, + { + "epoch": 0.88, + "grad_norm": 1.8171415430236428, + "learning_rate": 3.8338419939254135e-07, + "loss": 0.2511, + "step": 13966 + }, + { + "epoch": 0.88, + "grad_norm": 1.6052968104197836, + "learning_rate": 3.8299317586654503e-07, + "loss": 0.2385, + "step": 13967 + }, + { + "epoch": 0.88, + "grad_norm": 1.8037691832569482, + "learning_rate": 3.826023439083637e-07, + "loss": 0.2419, + "step": 13968 + }, + { + "epoch": 0.88, + "grad_norm": 1.9415721273173152, + "learning_rate": 3.822117035342127e-07, + "loss": 0.2567, + "step": 13969 + }, + { + "epoch": 0.88, + "grad_norm": 2.835155764947781, + "learning_rate": 3.8182125476030007e-07, + "loss": 0.2549, + "step": 13970 + }, + { + "epoch": 0.88, + "grad_norm": 5.448693023832294, + "learning_rate": 3.8143099760282574e-07, + "loss": 0.2523, + "step": 13971 + }, + { + "epoch": 0.88, + "grad_norm": 2.7763422546295775, + "learning_rate": 3.8104093207798343e-07, + "loss": 0.2626, + "step": 13972 + }, + { + "epoch": 0.88, + "grad_norm": 1.806042382047809, + "learning_rate": 3.8065105820195736e-07, + "loss": 0.2539, + "step": 13973 + }, + { + "epoch": 0.88, + "grad_norm": 1.449936481099475, + "learning_rate": 3.802613759909224e-07, + "loss": 0.2609, + "step": 13974 + }, + { + "epoch": 0.88, + "grad_norm": 1.4197453166483975, + "learning_rate": 3.798718854610489e-07, + "loss": 0.2458, + "step": 13975 + }, + { + "epoch": 0.88, + "grad_norm": 2.3431706323038983, + "learning_rate": 3.794825866284979e-07, + "loss": 0.2467, + "step": 13976 + }, + { + "epoch": 0.88, + "grad_norm": 1.621411461075429, + "learning_rate": 3.7909347950942145e-07, + "loss": 0.2604, + "step": 13977 + }, + { + "epoch": 0.88, + "grad_norm": 1.9454255595883063, + "learning_rate": 3.787045641199638e-07, + "loss": 0.2576, + "step": 13978 + }, + { + "epoch": 0.88, + "grad_norm": 3.528482977675164, + "learning_rate": 3.783158404762616e-07, + "loss": 0.2829, + "step": 13979 + }, + { + "epoch": 0.88, + "grad_norm": 2.0598896829424405, + "learning_rate": 3.7792730859444515e-07, + "loss": 0.244, + "step": 13980 + }, + { + "epoch": 0.88, + "grad_norm": 2.5935719863410593, + "learning_rate": 3.775389684906344e-07, + "loss": 0.2599, + "step": 13981 + }, + { + "epoch": 0.88, + "grad_norm": 3.389542266877843, + "learning_rate": 3.771508201809415e-07, + "loss": 0.2557, + "step": 13982 + }, + { + "epoch": 0.88, + "grad_norm": 1.6116713768741266, + "learning_rate": 3.7676286368147184e-07, + "loss": 0.2321, + "step": 13983 + }, + { + "epoch": 0.88, + "grad_norm": 2.6815315489178015, + "learning_rate": 3.763750990083237e-07, + "loss": 0.2549, + "step": 13984 + }, + { + "epoch": 0.88, + "grad_norm": 2.3457852196671523, + "learning_rate": 3.7598752617758527e-07, + "loss": 0.2725, + "step": 13985 + }, + { + "epoch": 0.88, + "grad_norm": 2.2386337262106424, + "learning_rate": 3.756001452053376e-07, + "loss": 0.2878, + "step": 13986 + }, + { + "epoch": 0.88, + "grad_norm": 1.9042725851865951, + "learning_rate": 3.7521295610765287e-07, + "loss": 0.246, + "step": 13987 + }, + { + "epoch": 0.88, + "grad_norm": 2.2466391317913295, + "learning_rate": 3.748259589005976e-07, + "loss": 0.265, + "step": 13988 + }, + { + "epoch": 0.88, + "grad_norm": 1.748354744579556, + "learning_rate": 3.744391536002279e-07, + "loss": 0.2495, + "step": 13989 + }, + { + "epoch": 0.88, + "grad_norm": 2.063570841130431, + "learning_rate": 3.7405254022259475e-07, + "loss": 0.248, + "step": 13990 + }, + { + "epoch": 0.88, + "grad_norm": 1.4251580723790165, + "learning_rate": 3.73666118783737e-07, + "loss": 0.2545, + "step": 13991 + }, + { + "epoch": 0.88, + "grad_norm": 1.8153410444853335, + "learning_rate": 3.7327988929969083e-07, + "loss": 0.2422, + "step": 13992 + }, + { + "epoch": 0.88, + "grad_norm": 2.5122610022769543, + "learning_rate": 3.728938517864794e-07, + "loss": 0.2398, + "step": 13993 + }, + { + "epoch": 0.88, + "grad_norm": 4.528630755370866, + "learning_rate": 3.725080062601211e-07, + "loss": 0.2474, + "step": 13994 + }, + { + "epoch": 0.88, + "grad_norm": 2.520792065023425, + "learning_rate": 3.721223527366241e-07, + "loss": 0.2541, + "step": 13995 + }, + { + "epoch": 0.88, + "grad_norm": 2.040619967891763, + "learning_rate": 3.7173689123199133e-07, + "loss": 0.2395, + "step": 13996 + }, + { + "epoch": 0.88, + "grad_norm": 2.4496151423932235, + "learning_rate": 3.7135162176221483e-07, + "loss": 0.2449, + "step": 13997 + }, + { + "epoch": 0.88, + "grad_norm": 2.6070179068284256, + "learning_rate": 3.7096654434328197e-07, + "loss": 0.2529, + "step": 13998 + }, + { + "epoch": 0.88, + "grad_norm": 2.5364656885845123, + "learning_rate": 3.7058165899116816e-07, + "loss": 0.2596, + "step": 13999 + }, + { + "epoch": 0.88, + "grad_norm": 1.9744628668453355, + "learning_rate": 3.7019696572184515e-07, + "loss": 0.2331, + "step": 14000 + }, + { + "epoch": 0.88, + "grad_norm": 2.073741734694081, + "learning_rate": 3.698124645512735e-07, + "loss": 0.2638, + "step": 14001 + }, + { + "epoch": 0.88, + "grad_norm": 1.6839561431804393, + "learning_rate": 3.69428155495406e-07, + "loss": 0.2586, + "step": 14002 + }, + { + "epoch": 0.88, + "grad_norm": 2.968946990777144, + "learning_rate": 3.690440385701899e-07, + "loss": 0.2401, + "step": 14003 + }, + { + "epoch": 0.88, + "grad_norm": 1.4699091569746803, + "learning_rate": 3.686601137915613e-07, + "loss": 0.2474, + "step": 14004 + }, + { + "epoch": 0.88, + "grad_norm": 3.703743291144797, + "learning_rate": 3.6827638117545193e-07, + "loss": 0.2546, + "step": 14005 + }, + { + "epoch": 0.88, + "grad_norm": 1.3149306029380574, + "learning_rate": 3.6789284073778187e-07, + "loss": 0.24, + "step": 14006 + }, + { + "epoch": 0.88, + "grad_norm": 1.82716596559018, + "learning_rate": 3.67509492494465e-07, + "loss": 0.253, + "step": 14007 + }, + { + "epoch": 0.88, + "grad_norm": 2.057829474780221, + "learning_rate": 3.6712633646140805e-07, + "loss": 0.251, + "step": 14008 + }, + { + "epoch": 0.88, + "grad_norm": 3.9292991903661747, + "learning_rate": 3.667433726545083e-07, + "loss": 0.2547, + "step": 14009 + }, + { + "epoch": 0.88, + "grad_norm": 1.4976106902865847, + "learning_rate": 3.6636060108965533e-07, + "loss": 0.2447, + "step": 14010 + }, + { + "epoch": 0.88, + "grad_norm": 2.3557776302475415, + "learning_rate": 3.6597802178273136e-07, + "loss": 0.2606, + "step": 14011 + }, + { + "epoch": 0.88, + "grad_norm": 1.4506621013509584, + "learning_rate": 3.655956347496098e-07, + "loss": 0.2412, + "step": 14012 + }, + { + "epoch": 0.88, + "grad_norm": 1.949391643633007, + "learning_rate": 3.6521344000615746e-07, + "loss": 0.2685, + "step": 14013 + }, + { + "epoch": 0.88, + "grad_norm": 1.8876606327230736, + "learning_rate": 3.6483143756823215e-07, + "loss": 0.2476, + "step": 14014 + }, + { + "epoch": 0.88, + "grad_norm": 2.563638569520903, + "learning_rate": 3.644496274516829e-07, + "loss": 0.2509, + "step": 14015 + }, + { + "epoch": 0.88, + "grad_norm": 5.964949734790135, + "learning_rate": 3.6406800967235255e-07, + "loss": 0.2532, + "step": 14016 + }, + { + "epoch": 0.88, + "grad_norm": 1.4108558848561747, + "learning_rate": 3.636865842460752e-07, + "loss": 0.2566, + "step": 14017 + }, + { + "epoch": 0.88, + "grad_norm": 13.420700808848416, + "learning_rate": 3.633053511886753e-07, + "loss": 0.2523, + "step": 14018 + }, + { + "epoch": 0.88, + "grad_norm": 1.425152784819349, + "learning_rate": 3.6292431051597244e-07, + "loss": 0.2413, + "step": 14019 + }, + { + "epoch": 0.88, + "grad_norm": 2.4925792531002458, + "learning_rate": 3.625434622437768e-07, + "loss": 0.2517, + "step": 14020 + }, + { + "epoch": 0.88, + "grad_norm": 1.7139325459117283, + "learning_rate": 3.6216280638789013e-07, + "loss": 0.2476, + "step": 14021 + }, + { + "epoch": 0.88, + "grad_norm": 1.5914139311807765, + "learning_rate": 3.6178234296410595e-07, + "loss": 0.267, + "step": 14022 + }, + { + "epoch": 0.88, + "grad_norm": 2.605044548152336, + "learning_rate": 3.614020719882105e-07, + "loss": 0.2739, + "step": 14023 + }, + { + "epoch": 0.88, + "grad_norm": 0.5833532287491568, + "learning_rate": 3.610219934759829e-07, + "loss": 0.4876, + "step": 14024 + }, + { + "epoch": 0.88, + "grad_norm": 1.5081303045273848, + "learning_rate": 3.606421074431926e-07, + "loss": 0.2441, + "step": 14025 + }, + { + "epoch": 0.88, + "grad_norm": 2.640048597494044, + "learning_rate": 3.6026241390560045e-07, + "loss": 0.263, + "step": 14026 + }, + { + "epoch": 0.88, + "grad_norm": 3.6926999195341494, + "learning_rate": 3.5988291287896216e-07, + "loss": 0.2586, + "step": 14027 + }, + { + "epoch": 0.88, + "grad_norm": 2.026754301064587, + "learning_rate": 3.5950360437902454e-07, + "loss": 0.2507, + "step": 14028 + }, + { + "epoch": 0.88, + "grad_norm": 2.4190361609887163, + "learning_rate": 3.591244884215245e-07, + "loss": 0.2418, + "step": 14029 + }, + { + "epoch": 0.88, + "grad_norm": 3.8392270202784946, + "learning_rate": 3.587455650221927e-07, + "loss": 0.2717, + "step": 14030 + }, + { + "epoch": 0.88, + "grad_norm": 1.9051917167279349, + "learning_rate": 3.5836683419675056e-07, + "loss": 0.2445, + "step": 14031 + }, + { + "epoch": 0.88, + "grad_norm": 1.905415648496474, + "learning_rate": 3.5798829596091325e-07, + "loss": 0.265, + "step": 14032 + }, + { + "epoch": 0.88, + "grad_norm": 2.982409193120526, + "learning_rate": 3.57609950330387e-07, + "loss": 0.2652, + "step": 14033 + }, + { + "epoch": 0.88, + "grad_norm": 1.6893216064066778, + "learning_rate": 3.572317973208689e-07, + "loss": 0.2448, + "step": 14034 + }, + { + "epoch": 0.88, + "grad_norm": 1.6312148138750535, + "learning_rate": 3.568538369480501e-07, + "loss": 0.2357, + "step": 14035 + }, + { + "epoch": 0.88, + "grad_norm": 7.7249586999456525, + "learning_rate": 3.564760692276137e-07, + "loss": 0.252, + "step": 14036 + }, + { + "epoch": 0.88, + "grad_norm": 3.150511611161764, + "learning_rate": 3.560984941752327e-07, + "loss": 0.2446, + "step": 14037 + }, + { + "epoch": 0.88, + "grad_norm": 4.2946408833896355, + "learning_rate": 3.5572111180657396e-07, + "loss": 0.2455, + "step": 14038 + }, + { + "epoch": 0.88, + "grad_norm": 3.7397380751360467, + "learning_rate": 3.553439221372945e-07, + "loss": 0.2622, + "step": 14039 + }, + { + "epoch": 0.88, + "grad_norm": 2.069791112226758, + "learning_rate": 3.549669251830462e-07, + "loss": 0.2431, + "step": 14040 + }, + { + "epoch": 0.88, + "grad_norm": 2.3324155647873717, + "learning_rate": 3.5459012095947097e-07, + "loss": 0.2493, + "step": 14041 + }, + { + "epoch": 0.88, + "grad_norm": 1.3983670371869428, + "learning_rate": 3.542135094822019e-07, + "loss": 0.2521, + "step": 14042 + }, + { + "epoch": 0.88, + "grad_norm": 1.900478955509874, + "learning_rate": 3.538370907668659e-07, + "loss": 0.2554, + "step": 14043 + }, + { + "epoch": 0.88, + "grad_norm": 1.3513096505059168, + "learning_rate": 3.5346086482908273e-07, + "loss": 0.2529, + "step": 14044 + }, + { + "epoch": 0.88, + "grad_norm": 6.418736693991127, + "learning_rate": 3.5308483168446094e-07, + "loss": 0.2541, + "step": 14045 + }, + { + "epoch": 0.88, + "grad_norm": 1.6215455821904665, + "learning_rate": 3.527089913486037e-07, + "loss": 0.2604, + "step": 14046 + }, + { + "epoch": 0.88, + "grad_norm": 2.033883167194892, + "learning_rate": 3.5233334383710406e-07, + "loss": 0.2517, + "step": 14047 + }, + { + "epoch": 0.88, + "grad_norm": 1.6772044399704664, + "learning_rate": 3.5195788916555005e-07, + "loss": 0.2459, + "step": 14048 + }, + { + "epoch": 0.88, + "grad_norm": 3.2660530849734006, + "learning_rate": 3.515826273495182e-07, + "loss": 0.2619, + "step": 14049 + }, + { + "epoch": 0.88, + "grad_norm": 5.069942588734494, + "learning_rate": 3.5120755840458043e-07, + "loss": 0.271, + "step": 14050 + }, + { + "epoch": 0.88, + "grad_norm": 4.523335034358723, + "learning_rate": 3.5083268234629763e-07, + "loss": 0.2791, + "step": 14051 + }, + { + "epoch": 0.88, + "grad_norm": 1.7345082687962807, + "learning_rate": 3.5045799919022515e-07, + "loss": 0.2497, + "step": 14052 + }, + { + "epoch": 0.88, + "grad_norm": 2.2259196000159602, + "learning_rate": 3.500835089519089e-07, + "loss": 0.249, + "step": 14053 + }, + { + "epoch": 0.88, + "grad_norm": 3.157667260164587, + "learning_rate": 3.4970921164688755e-07, + "loss": 0.269, + "step": 14054 + }, + { + "epoch": 0.88, + "grad_norm": 1.957867738843563, + "learning_rate": 3.4933510729068976e-07, + "loss": 0.2485, + "step": 14055 + }, + { + "epoch": 0.88, + "grad_norm": 2.1714570268075017, + "learning_rate": 3.4896119589883925e-07, + "loss": 0.287, + "step": 14056 + }, + { + "epoch": 0.88, + "grad_norm": 1.9125151174917374, + "learning_rate": 3.4858747748684916e-07, + "loss": 0.2367, + "step": 14057 + }, + { + "epoch": 0.88, + "grad_norm": 1.7969448507382682, + "learning_rate": 3.4821395207022767e-07, + "loss": 0.2535, + "step": 14058 + }, + { + "epoch": 0.88, + "grad_norm": 2.0609274176620787, + "learning_rate": 3.4784061966447124e-07, + "loss": 0.2704, + "step": 14059 + }, + { + "epoch": 0.88, + "grad_norm": 1.9087048246241596, + "learning_rate": 3.4746748028507084e-07, + "loss": 0.245, + "step": 14060 + }, + { + "epoch": 0.88, + "grad_norm": 1.714360568446815, + "learning_rate": 3.4709453394750847e-07, + "loss": 0.2378, + "step": 14061 + }, + { + "epoch": 0.88, + "grad_norm": 2.5383303949841536, + "learning_rate": 3.46721780667259e-07, + "loss": 0.2581, + "step": 14062 + }, + { + "epoch": 0.88, + "grad_norm": 1.6783334254549231, + "learning_rate": 3.463492204597868e-07, + "loss": 0.2299, + "step": 14063 + }, + { + "epoch": 0.88, + "grad_norm": 5.326612215247788, + "learning_rate": 3.459768533405511e-07, + "loss": 0.2436, + "step": 14064 + }, + { + "epoch": 0.88, + "grad_norm": 1.981796445493719, + "learning_rate": 3.456046793250034e-07, + "loss": 0.2461, + "step": 14065 + }, + { + "epoch": 0.88, + "grad_norm": 3.5639625510084794, + "learning_rate": 3.4523269842858477e-07, + "loss": 0.2501, + "step": 14066 + }, + { + "epoch": 0.88, + "grad_norm": 2.1504799983494314, + "learning_rate": 3.448609106667283e-07, + "loss": 0.2593, + "step": 14067 + }, + { + "epoch": 0.88, + "grad_norm": 2.439883229790237, + "learning_rate": 3.444893160548618e-07, + "loss": 0.2311, + "step": 14068 + }, + { + "epoch": 0.88, + "grad_norm": 1.7139580722442127, + "learning_rate": 3.4411791460840285e-07, + "loss": 0.2502, + "step": 14069 + }, + { + "epoch": 0.88, + "grad_norm": 2.773717588972613, + "learning_rate": 3.4374670634276073e-07, + "loss": 0.2537, + "step": 14070 + }, + { + "epoch": 0.88, + "grad_norm": 1.7377046459612524, + "learning_rate": 3.4337569127333767e-07, + "loss": 0.2453, + "step": 14071 + }, + { + "epoch": 0.88, + "grad_norm": 2.3101557933414867, + "learning_rate": 3.4300486941552913e-07, + "loss": 0.2499, + "step": 14072 + }, + { + "epoch": 0.89, + "grad_norm": 1.5894446259716395, + "learning_rate": 3.426342407847205e-07, + "loss": 0.2514, + "step": 14073 + }, + { + "epoch": 0.89, + "grad_norm": 1.3154537038250578, + "learning_rate": 3.422638053962896e-07, + "loss": 0.2577, + "step": 14074 + }, + { + "epoch": 0.89, + "grad_norm": 2.073380209818981, + "learning_rate": 3.418935632656062e-07, + "loss": 0.2504, + "step": 14075 + }, + { + "epoch": 0.89, + "grad_norm": 1.567365734048435, + "learning_rate": 3.4152351440803254e-07, + "loss": 0.2563, + "step": 14076 + }, + { + "epoch": 0.89, + "grad_norm": 1.903767169799387, + "learning_rate": 3.41153658838923e-07, + "loss": 0.252, + "step": 14077 + }, + { + "epoch": 0.89, + "grad_norm": 1.5611119479604987, + "learning_rate": 3.407839965736237e-07, + "loss": 0.2565, + "step": 14078 + }, + { + "epoch": 0.89, + "grad_norm": 2.1750747738002634, + "learning_rate": 3.4041452762747007e-07, + "loss": 0.2578, + "step": 14079 + }, + { + "epoch": 0.89, + "grad_norm": 1.7385006166764045, + "learning_rate": 3.4004525201579543e-07, + "loss": 0.2648, + "step": 14080 + }, + { + "epoch": 0.89, + "grad_norm": 1.6157085561575384, + "learning_rate": 3.3967616975392026e-07, + "loss": 0.2698, + "step": 14081 + }, + { + "epoch": 0.89, + "grad_norm": 0.6148333518349004, + "learning_rate": 3.3930728085715903e-07, + "loss": 0.4513, + "step": 14082 + }, + { + "epoch": 0.89, + "grad_norm": 4.023096653748799, + "learning_rate": 3.389385853408156e-07, + "loss": 0.2526, + "step": 14083 + }, + { + "epoch": 0.89, + "grad_norm": 1.7164957939293168, + "learning_rate": 3.385700832201905e-07, + "loss": 0.2379, + "step": 14084 + }, + { + "epoch": 0.89, + "grad_norm": 1.4329485094320742, + "learning_rate": 3.38201774510572e-07, + "loss": 0.2549, + "step": 14085 + }, + { + "epoch": 0.89, + "grad_norm": 1.9916654475637625, + "learning_rate": 3.378336592272419e-07, + "loss": 0.2469, + "step": 14086 + }, + { + "epoch": 0.89, + "grad_norm": 1.6054290856093585, + "learning_rate": 3.374657373854734e-07, + "loss": 0.2778, + "step": 14087 + }, + { + "epoch": 0.89, + "grad_norm": 1.7978478650048377, + "learning_rate": 3.3709800900053437e-07, + "loss": 0.2581, + "step": 14088 + }, + { + "epoch": 0.89, + "grad_norm": 1.5952629654325567, + "learning_rate": 3.367304740876809e-07, + "loss": 0.2488, + "step": 14089 + }, + { + "epoch": 0.89, + "grad_norm": 2.246410194685025, + "learning_rate": 3.3636313266216304e-07, + "loss": 0.2753, + "step": 14090 + }, + { + "epoch": 0.89, + "grad_norm": 1.7010692136279622, + "learning_rate": 3.359959847392219e-07, + "loss": 0.2496, + "step": 14091 + }, + { + "epoch": 0.89, + "grad_norm": 1.4884789518705417, + "learning_rate": 3.35629030334092e-07, + "loss": 0.267, + "step": 14092 + }, + { + "epoch": 0.89, + "grad_norm": 1.9281994986521709, + "learning_rate": 3.352622694619989e-07, + "loss": 0.267, + "step": 14093 + }, + { + "epoch": 0.89, + "grad_norm": 3.592981451803473, + "learning_rate": 3.3489570213815826e-07, + "loss": 0.2518, + "step": 14094 + }, + { + "epoch": 0.89, + "grad_norm": 2.579066271172909, + "learning_rate": 3.3452932837778174e-07, + "loss": 0.2388, + "step": 14095 + }, + { + "epoch": 0.89, + "grad_norm": 1.780184275461993, + "learning_rate": 3.3416314819607056e-07, + "loss": 0.2543, + "step": 14096 + }, + { + "epoch": 0.89, + "grad_norm": 2.710008204718657, + "learning_rate": 3.3379716160821805e-07, + "loss": 0.2681, + "step": 14097 + }, + { + "epoch": 0.89, + "grad_norm": 1.8055841737255092, + "learning_rate": 3.334313686294094e-07, + "loss": 0.267, + "step": 14098 + }, + { + "epoch": 0.89, + "grad_norm": 0.550078482054344, + "learning_rate": 3.330657692748213e-07, + "loss": 0.4701, + "step": 14099 + }, + { + "epoch": 0.89, + "grad_norm": 1.820093325681551, + "learning_rate": 3.327003635596243e-07, + "loss": 0.2367, + "step": 14100 + }, + { + "epoch": 0.89, + "grad_norm": 1.451465411921623, + "learning_rate": 3.3233515149897865e-07, + "loss": 0.2731, + "step": 14101 + }, + { + "epoch": 0.89, + "grad_norm": 2.1110474763313096, + "learning_rate": 3.319701331080394e-07, + "loss": 0.2602, + "step": 14102 + }, + { + "epoch": 0.89, + "grad_norm": 2.5133777270564037, + "learning_rate": 3.316053084019494e-07, + "loss": 0.2365, + "step": 14103 + }, + { + "epoch": 0.89, + "grad_norm": 2.214333969472905, + "learning_rate": 3.312406773958482e-07, + "loss": 0.237, + "step": 14104 + }, + { + "epoch": 0.89, + "grad_norm": 2.24721399983154, + "learning_rate": 3.3087624010486377e-07, + "loss": 0.2385, + "step": 14105 + }, + { + "epoch": 0.89, + "grad_norm": 2.214415640869386, + "learning_rate": 3.305119965441178e-07, + "loss": 0.2538, + "step": 14106 + }, + { + "epoch": 0.89, + "grad_norm": 1.7871951376817026, + "learning_rate": 3.3014794672872165e-07, + "loss": 0.2494, + "step": 14107 + }, + { + "epoch": 0.89, + "grad_norm": 1.7727323497798866, + "learning_rate": 3.2978409067378315e-07, + "loss": 0.2539, + "step": 14108 + }, + { + "epoch": 0.89, + "grad_norm": 3.1630773104422993, + "learning_rate": 3.294204283943969e-07, + "loss": 0.2707, + "step": 14109 + }, + { + "epoch": 0.89, + "grad_norm": 1.4628341334900339, + "learning_rate": 3.2905695990565365e-07, + "loss": 0.2406, + "step": 14110 + }, + { + "epoch": 0.89, + "grad_norm": 4.44670128812514, + "learning_rate": 3.28693685222633e-07, + "loss": 0.2572, + "step": 14111 + }, + { + "epoch": 0.89, + "grad_norm": 1.3998173995318388, + "learning_rate": 3.2833060436040955e-07, + "loss": 0.2351, + "step": 14112 + }, + { + "epoch": 0.89, + "grad_norm": 2.009462336526005, + "learning_rate": 3.2796771733404673e-07, + "loss": 0.273, + "step": 14113 + }, + { + "epoch": 0.89, + "grad_norm": 2.518736069927823, + "learning_rate": 3.2760502415860206e-07, + "loss": 0.2618, + "step": 14114 + }, + { + "epoch": 0.89, + "grad_norm": 2.2074491028281913, + "learning_rate": 3.2724252484912343e-07, + "loss": 0.2364, + "step": 14115 + }, + { + "epoch": 0.89, + "grad_norm": 3.4589832200098267, + "learning_rate": 3.268802194206516e-07, + "loss": 0.2741, + "step": 14116 + }, + { + "epoch": 0.89, + "grad_norm": 1.6906481773450206, + "learning_rate": 3.2651810788822125e-07, + "loss": 0.2707, + "step": 14117 + }, + { + "epoch": 0.89, + "grad_norm": 5.642285376411281, + "learning_rate": 3.261561902668553e-07, + "loss": 0.2485, + "step": 14118 + }, + { + "epoch": 0.89, + "grad_norm": 4.427392316989657, + "learning_rate": 3.257944665715701e-07, + "loss": 0.2606, + "step": 14119 + }, + { + "epoch": 0.89, + "grad_norm": 2.9736124977420078, + "learning_rate": 3.254329368173753e-07, + "loss": 0.2429, + "step": 14120 + }, + { + "epoch": 0.89, + "grad_norm": 2.099895252274889, + "learning_rate": 3.2507160101927113e-07, + "loss": 0.2521, + "step": 14121 + }, + { + "epoch": 0.89, + "grad_norm": 2.658880185049216, + "learning_rate": 3.247104591922495e-07, + "loss": 0.2555, + "step": 14122 + }, + { + "epoch": 0.89, + "grad_norm": 18.07870804946324, + "learning_rate": 3.243495113512951e-07, + "loss": 0.2532, + "step": 14123 + }, + { + "epoch": 0.89, + "grad_norm": 2.3801652884066087, + "learning_rate": 3.239887575113837e-07, + "loss": 0.2649, + "step": 14124 + }, + { + "epoch": 0.89, + "grad_norm": 1.5853525265651602, + "learning_rate": 3.236281976874855e-07, + "loss": 0.2461, + "step": 14125 + }, + { + "epoch": 0.89, + "grad_norm": 1.4957718282050858, + "learning_rate": 3.2326783189455925e-07, + "loss": 0.2521, + "step": 14126 + }, + { + "epoch": 0.89, + "grad_norm": 1.9464437640196859, + "learning_rate": 3.229076601475567e-07, + "loss": 0.241, + "step": 14127 + }, + { + "epoch": 0.89, + "grad_norm": 1.9489841920359445, + "learning_rate": 3.225476824614238e-07, + "loss": 0.2653, + "step": 14128 + }, + { + "epoch": 0.89, + "grad_norm": 1.3831255692450444, + "learning_rate": 3.221878988510957e-07, + "loss": 0.2459, + "step": 14129 + }, + { + "epoch": 0.89, + "grad_norm": 2.194250706861133, + "learning_rate": 3.2182830933149997e-07, + "loss": 0.2692, + "step": 14130 + }, + { + "epoch": 0.89, + "grad_norm": 1.6009420196293114, + "learning_rate": 3.2146891391755586e-07, + "loss": 0.2399, + "step": 14131 + }, + { + "epoch": 0.89, + "grad_norm": 2.0690534754572023, + "learning_rate": 3.21109712624178e-07, + "loss": 0.2596, + "step": 14132 + }, + { + "epoch": 0.89, + "grad_norm": 1.7260369369536, + "learning_rate": 3.207507054662684e-07, + "loss": 0.2451, + "step": 14133 + }, + { + "epoch": 0.89, + "grad_norm": 3.6769983034758233, + "learning_rate": 3.2039189245872404e-07, + "loss": 0.2526, + "step": 14134 + }, + { + "epoch": 0.89, + "grad_norm": 1.7042673355599904, + "learning_rate": 3.2003327361643077e-07, + "loss": 0.241, + "step": 14135 + }, + { + "epoch": 0.89, + "grad_norm": 2.756754665829947, + "learning_rate": 3.196748489542706e-07, + "loss": 0.2406, + "step": 14136 + }, + { + "epoch": 0.89, + "grad_norm": 3.3749427363919344, + "learning_rate": 3.193166184871138e-07, + "loss": 0.2585, + "step": 14137 + }, + { + "epoch": 0.89, + "grad_norm": 2.0212558009921273, + "learning_rate": 3.189585822298241e-07, + "loss": 0.2545, + "step": 14138 + }, + { + "epoch": 0.89, + "grad_norm": 2.5790218799240625, + "learning_rate": 3.186007401972563e-07, + "loss": 0.2491, + "step": 14139 + }, + { + "epoch": 0.89, + "grad_norm": 4.00351015696923, + "learning_rate": 3.182430924042601e-07, + "loss": 0.2524, + "step": 14140 + }, + { + "epoch": 0.89, + "grad_norm": 1.7627894836248061, + "learning_rate": 3.178856388656737e-07, + "loss": 0.2461, + "step": 14141 + }, + { + "epoch": 0.89, + "grad_norm": 2.0022439223692308, + "learning_rate": 3.175283795963291e-07, + "loss": 0.2889, + "step": 14142 + }, + { + "epoch": 0.89, + "grad_norm": 1.8741973059886514, + "learning_rate": 3.171713146110478e-07, + "loss": 0.2544, + "step": 14143 + }, + { + "epoch": 0.89, + "grad_norm": 2.751331496634442, + "learning_rate": 3.168144439246468e-07, + "loss": 0.2607, + "step": 14144 + }, + { + "epoch": 0.89, + "grad_norm": 0.5805750487119463, + "learning_rate": 3.1645776755193314e-07, + "loss": 0.4583, + "step": 14145 + }, + { + "epoch": 0.89, + "grad_norm": 1.5341586703803891, + "learning_rate": 3.16101285507705e-07, + "loss": 0.2458, + "step": 14146 + }, + { + "epoch": 0.89, + "grad_norm": 2.264454355592782, + "learning_rate": 3.1574499780675395e-07, + "loss": 0.2426, + "step": 14147 + }, + { + "epoch": 0.89, + "grad_norm": 2.5053034981439968, + "learning_rate": 3.1538890446386363e-07, + "loss": 0.2665, + "step": 14148 + }, + { + "epoch": 0.89, + "grad_norm": 1.6998832531170613, + "learning_rate": 3.1503300549380833e-07, + "loss": 0.2437, + "step": 14149 + }, + { + "epoch": 0.89, + "grad_norm": 1.5599972917303324, + "learning_rate": 3.1467730091135574e-07, + "loss": 0.2438, + "step": 14150 + }, + { + "epoch": 0.89, + "grad_norm": 1.7217109211200416, + "learning_rate": 3.143217907312629e-07, + "loss": 0.2439, + "step": 14151 + }, + { + "epoch": 0.89, + "grad_norm": 2.288253222037634, + "learning_rate": 3.1396647496828245e-07, + "loss": 0.2578, + "step": 14152 + }, + { + "epoch": 0.89, + "grad_norm": 2.1278280755127885, + "learning_rate": 3.1361135363715544e-07, + "loss": 0.2558, + "step": 14153 + }, + { + "epoch": 0.89, + "grad_norm": 5.190756776753083, + "learning_rate": 3.132564267526178e-07, + "loss": 0.2413, + "step": 14154 + }, + { + "epoch": 0.89, + "grad_norm": 1.800167930523466, + "learning_rate": 3.1290169432939556e-07, + "loss": 0.2439, + "step": 14155 + }, + { + "epoch": 0.89, + "grad_norm": 1.8003421648981754, + "learning_rate": 3.1254715638220745e-07, + "loss": 0.2555, + "step": 14156 + }, + { + "epoch": 0.89, + "grad_norm": 1.6455429528094807, + "learning_rate": 3.12192812925764e-07, + "loss": 0.2428, + "step": 14157 + }, + { + "epoch": 0.89, + "grad_norm": 1.964151007411487, + "learning_rate": 3.118386639747667e-07, + "loss": 0.2515, + "step": 14158 + }, + { + "epoch": 0.89, + "grad_norm": 1.7635693224675286, + "learning_rate": 3.1148470954391e-07, + "loss": 0.2377, + "step": 14159 + }, + { + "epoch": 0.89, + "grad_norm": 4.644987155396305, + "learning_rate": 3.1113094964788095e-07, + "loss": 0.2636, + "step": 14160 + }, + { + "epoch": 0.89, + "grad_norm": 1.8162035050659775, + "learning_rate": 3.107773843013567e-07, + "loss": 0.2484, + "step": 14161 + }, + { + "epoch": 0.89, + "grad_norm": 3.0751770294342355, + "learning_rate": 3.104240135190084e-07, + "loss": 0.2448, + "step": 14162 + }, + { + "epoch": 0.89, + "grad_norm": 7.366092576526622, + "learning_rate": 3.1007083731549705e-07, + "loss": 0.2435, + "step": 14163 + }, + { + "epoch": 0.89, + "grad_norm": 3.1585048475823325, + "learning_rate": 3.0971785570547696e-07, + "loss": 0.2738, + "step": 14164 + }, + { + "epoch": 0.89, + "grad_norm": 5.872299106794302, + "learning_rate": 3.093650687035943e-07, + "loss": 0.2769, + "step": 14165 + }, + { + "epoch": 0.89, + "grad_norm": 3.6620275634652115, + "learning_rate": 3.090124763244867e-07, + "loss": 0.2637, + "step": 14166 + }, + { + "epoch": 0.89, + "grad_norm": 1.8573763061893935, + "learning_rate": 3.086600785827826e-07, + "loss": 0.2516, + "step": 14167 + }, + { + "epoch": 0.89, + "grad_norm": 3.389102685457744, + "learning_rate": 3.0830787549310405e-07, + "loss": 0.2562, + "step": 14168 + }, + { + "epoch": 0.89, + "grad_norm": 2.277507817959017, + "learning_rate": 3.079558670700666e-07, + "loss": 0.2492, + "step": 14169 + }, + { + "epoch": 0.89, + "grad_norm": 3.8974370453711717, + "learning_rate": 3.076040533282737e-07, + "loss": 0.2626, + "step": 14170 + }, + { + "epoch": 0.89, + "grad_norm": 1.743171319336527, + "learning_rate": 3.07252434282323e-07, + "loss": 0.2562, + "step": 14171 + }, + { + "epoch": 0.89, + "grad_norm": 2.3050480895414043, + "learning_rate": 3.069010099468045e-07, + "loss": 0.2441, + "step": 14172 + }, + { + "epoch": 0.89, + "grad_norm": 1.970789597954902, + "learning_rate": 3.0654978033629934e-07, + "loss": 0.2646, + "step": 14173 + }, + { + "epoch": 0.89, + "grad_norm": 2.5200774121937837, + "learning_rate": 3.0619874546537973e-07, + "loss": 0.2297, + "step": 14174 + }, + { + "epoch": 0.89, + "grad_norm": 2.0452896948858283, + "learning_rate": 3.058479053486113e-07, + "loss": 0.249, + "step": 14175 + }, + { + "epoch": 0.89, + "grad_norm": 3.379949916591384, + "learning_rate": 3.0549726000055067e-07, + "loss": 0.257, + "step": 14176 + }, + { + "epoch": 0.89, + "grad_norm": 5.079848781613622, + "learning_rate": 3.051468094357474e-07, + "loss": 0.2511, + "step": 14177 + }, + { + "epoch": 0.89, + "grad_norm": 2.3859222886999807, + "learning_rate": 3.047965536687425e-07, + "loss": 0.2462, + "step": 14178 + }, + { + "epoch": 0.89, + "grad_norm": 1.4474674964475036, + "learning_rate": 3.0444649271406723e-07, + "loss": 0.2438, + "step": 14179 + }, + { + "epoch": 0.89, + "grad_norm": 1.4229944900090208, + "learning_rate": 3.040966265862483e-07, + "loss": 0.2629, + "step": 14180 + }, + { + "epoch": 0.89, + "grad_norm": 1.9965843948030662, + "learning_rate": 3.037469552998007e-07, + "loss": 0.2595, + "step": 14181 + }, + { + "epoch": 0.89, + "grad_norm": 2.320974293952236, + "learning_rate": 3.03397478869234e-07, + "loss": 0.2556, + "step": 14182 + }, + { + "epoch": 0.89, + "grad_norm": 8.860521330464051, + "learning_rate": 3.0304819730904656e-07, + "loss": 0.2445, + "step": 14183 + }, + { + "epoch": 0.89, + "grad_norm": 3.01671078890323, + "learning_rate": 3.0269911063373293e-07, + "loss": 0.248, + "step": 14184 + }, + { + "epoch": 0.89, + "grad_norm": 2.3216123341239854, + "learning_rate": 3.023502188577765e-07, + "loss": 0.2576, + "step": 14185 + }, + { + "epoch": 0.89, + "grad_norm": 1.7143623675867794, + "learning_rate": 3.02001521995654e-07, + "loss": 0.2564, + "step": 14186 + }, + { + "epoch": 0.89, + "grad_norm": 1.9182187327746425, + "learning_rate": 3.016530200618323e-07, + "loss": 0.236, + "step": 14187 + }, + { + "epoch": 0.89, + "grad_norm": 2.512797203465373, + "learning_rate": 3.013047130707725e-07, + "loss": 0.248, + "step": 14188 + }, + { + "epoch": 0.89, + "grad_norm": 1.5181350357604373, + "learning_rate": 3.009566010369258e-07, + "loss": 0.2428, + "step": 14189 + }, + { + "epoch": 0.89, + "grad_norm": 1.5479538287869512, + "learning_rate": 3.006086839747363e-07, + "loss": 0.2248, + "step": 14190 + }, + { + "epoch": 0.89, + "grad_norm": 2.25517044002195, + "learning_rate": 3.002609618986385e-07, + "loss": 0.244, + "step": 14191 + }, + { + "epoch": 0.89, + "grad_norm": 1.7239218655688637, + "learning_rate": 2.9991343482306144e-07, + "loss": 0.2516, + "step": 14192 + }, + { + "epoch": 0.89, + "grad_norm": 5.15709818814808, + "learning_rate": 2.995661027624253e-07, + "loss": 0.2628, + "step": 14193 + }, + { + "epoch": 0.89, + "grad_norm": 7.970519353518592, + "learning_rate": 2.9921896573114005e-07, + "loss": 0.2681, + "step": 14194 + }, + { + "epoch": 0.89, + "grad_norm": 2.254104932453268, + "learning_rate": 2.988720237436088e-07, + "loss": 0.2515, + "step": 14195 + }, + { + "epoch": 0.89, + "grad_norm": 1.833982346041316, + "learning_rate": 2.985252768142283e-07, + "loss": 0.2325, + "step": 14196 + }, + { + "epoch": 0.89, + "grad_norm": 3.8311408798011928, + "learning_rate": 2.981787249573842e-07, + "loss": 0.2432, + "step": 14197 + }, + { + "epoch": 0.89, + "grad_norm": 2.7006812690340607, + "learning_rate": 2.978323681874562e-07, + "loss": 0.2515, + "step": 14198 + }, + { + "epoch": 0.89, + "grad_norm": 2.802188316984068, + "learning_rate": 2.97486206518815e-07, + "loss": 0.2527, + "step": 14199 + }, + { + "epoch": 0.89, + "grad_norm": 2.2877420965198394, + "learning_rate": 2.9714023996582407e-07, + "loss": 0.2373, + "step": 14200 + }, + { + "epoch": 0.89, + "grad_norm": 1.7574268777869833, + "learning_rate": 2.967944685428381e-07, + "loss": 0.2576, + "step": 14201 + }, + { + "epoch": 0.89, + "grad_norm": 2.4443036412371706, + "learning_rate": 2.964488922642034e-07, + "loss": 0.2607, + "step": 14202 + }, + { + "epoch": 0.89, + "grad_norm": 2.4874626471306946, + "learning_rate": 2.961035111442584e-07, + "loss": 0.2362, + "step": 14203 + }, + { + "epoch": 0.89, + "grad_norm": 2.2762819698663903, + "learning_rate": 2.9575832519733396e-07, + "loss": 0.2684, + "step": 14204 + }, + { + "epoch": 0.89, + "grad_norm": 1.5384853321987708, + "learning_rate": 2.9541333443775244e-07, + "loss": 0.2393, + "step": 14205 + }, + { + "epoch": 0.89, + "grad_norm": 1.8635726006801168, + "learning_rate": 2.950685388798269e-07, + "loss": 0.2597, + "step": 14206 + }, + { + "epoch": 0.89, + "grad_norm": 1.778220642726601, + "learning_rate": 2.9472393853786473e-07, + "loss": 0.2505, + "step": 14207 + }, + { + "epoch": 0.89, + "grad_norm": 1.4520739525243302, + "learning_rate": 2.9437953342616453e-07, + "loss": 0.2337, + "step": 14208 + }, + { + "epoch": 0.89, + "grad_norm": 2.024836258829966, + "learning_rate": 2.94035323559016e-07, + "loss": 0.2725, + "step": 14209 + }, + { + "epoch": 0.89, + "grad_norm": 2.1470186297578016, + "learning_rate": 2.936913089507004e-07, + "loss": 0.232, + "step": 14210 + }, + { + "epoch": 0.89, + "grad_norm": 6.491248426266962, + "learning_rate": 2.9334748961549084e-07, + "loss": 0.2528, + "step": 14211 + }, + { + "epoch": 0.89, + "grad_norm": 2.1762916699011883, + "learning_rate": 2.9300386556765483e-07, + "loss": 0.2593, + "step": 14212 + }, + { + "epoch": 0.89, + "grad_norm": 1.62343959964132, + "learning_rate": 2.926604368214486e-07, + "loss": 0.26, + "step": 14213 + }, + { + "epoch": 0.89, + "grad_norm": 2.668567414873467, + "learning_rate": 2.92317203391122e-07, + "loss": 0.2283, + "step": 14214 + }, + { + "epoch": 0.89, + "grad_norm": 2.203581337492975, + "learning_rate": 2.919741652909164e-07, + "loss": 0.2465, + "step": 14215 + }, + { + "epoch": 0.89, + "grad_norm": 3.96762141276433, + "learning_rate": 2.916313225350653e-07, + "loss": 0.2685, + "step": 14216 + }, + { + "epoch": 0.89, + "grad_norm": 1.8209855745035428, + "learning_rate": 2.912886751377941e-07, + "loss": 0.2372, + "step": 14217 + }, + { + "epoch": 0.89, + "grad_norm": 2.9995706117839998, + "learning_rate": 2.909462231133192e-07, + "loss": 0.2564, + "step": 14218 + }, + { + "epoch": 0.89, + "grad_norm": 1.9604757938209916, + "learning_rate": 2.9060396647584867e-07, + "loss": 0.2531, + "step": 14219 + }, + { + "epoch": 0.89, + "grad_norm": 0.60968839104951, + "learning_rate": 2.9026190523958553e-07, + "loss": 0.4309, + "step": 14220 + }, + { + "epoch": 0.89, + "grad_norm": 2.906944155396832, + "learning_rate": 2.899200394187202e-07, + "loss": 0.2336, + "step": 14221 + }, + { + "epoch": 0.89, + "grad_norm": 1.6227806585445994, + "learning_rate": 2.895783690274395e-07, + "loss": 0.2838, + "step": 14222 + }, + { + "epoch": 0.89, + "grad_norm": 2.1101751156505704, + "learning_rate": 2.892368940799184e-07, + "loss": 0.2715, + "step": 14223 + }, + { + "epoch": 0.89, + "grad_norm": 1.9735421721284456, + "learning_rate": 2.88895614590326e-07, + "loss": 0.2304, + "step": 14224 + }, + { + "epoch": 0.89, + "grad_norm": 2.744707844567755, + "learning_rate": 2.885545305728227e-07, + "loss": 0.2486, + "step": 14225 + }, + { + "epoch": 0.89, + "grad_norm": 1.8419252848727798, + "learning_rate": 2.8821364204156045e-07, + "loss": 0.2437, + "step": 14226 + }, + { + "epoch": 0.89, + "grad_norm": 1.4988629913186162, + "learning_rate": 2.878729490106824e-07, + "loss": 0.2621, + "step": 14227 + }, + { + "epoch": 0.89, + "grad_norm": 2.1972532747467786, + "learning_rate": 2.875324514943256e-07, + "loss": 0.2395, + "step": 14228 + }, + { + "epoch": 0.89, + "grad_norm": 2.3631920326134166, + "learning_rate": 2.871921495066182e-07, + "loss": 0.2524, + "step": 14229 + }, + { + "epoch": 0.89, + "grad_norm": 2.707015049201672, + "learning_rate": 2.8685204306168004e-07, + "loss": 0.2359, + "step": 14230 + }, + { + "epoch": 0.89, + "grad_norm": 2.2513505161088827, + "learning_rate": 2.865121321736203e-07, + "loss": 0.2479, + "step": 14231 + }, + { + "epoch": 0.9, + "grad_norm": 2.2149533956543466, + "learning_rate": 2.861724168565461e-07, + "loss": 0.2502, + "step": 14232 + }, + { + "epoch": 0.9, + "grad_norm": 1.4707674493259502, + "learning_rate": 2.8583289712455e-07, + "loss": 0.2347, + "step": 14233 + }, + { + "epoch": 0.9, + "grad_norm": 1.7805922254398394, + "learning_rate": 2.8549357299172077e-07, + "loss": 0.2522, + "step": 14234 + }, + { + "epoch": 0.9, + "grad_norm": 4.128419079823596, + "learning_rate": 2.851544444721366e-07, + "loss": 0.2654, + "step": 14235 + }, + { + "epoch": 0.9, + "grad_norm": 0.5927763243012352, + "learning_rate": 2.8481551157986896e-07, + "loss": 0.4553, + "step": 14236 + }, + { + "epoch": 0.9, + "grad_norm": 1.457753305233146, + "learning_rate": 2.8447677432898115e-07, + "loss": 0.2534, + "step": 14237 + }, + { + "epoch": 0.9, + "grad_norm": 1.431673570825116, + "learning_rate": 2.8413823273352793e-07, + "loss": 0.2714, + "step": 14238 + }, + { + "epoch": 0.9, + "grad_norm": 1.510482169073478, + "learning_rate": 2.8379988680755533e-07, + "loss": 0.2539, + "step": 14239 + }, + { + "epoch": 0.9, + "grad_norm": 3.0339140084886718, + "learning_rate": 2.8346173656510266e-07, + "loss": 0.2578, + "step": 14240 + }, + { + "epoch": 0.9, + "grad_norm": 2.0177448908096074, + "learning_rate": 2.8312378202020043e-07, + "loss": 0.2609, + "step": 14241 + }, + { + "epoch": 0.9, + "grad_norm": 1.7390656271983145, + "learning_rate": 2.827860231868701e-07, + "loss": 0.246, + "step": 14242 + }, + { + "epoch": 0.9, + "grad_norm": 1.8694647885348494, + "learning_rate": 2.824484600791261e-07, + "loss": 0.2546, + "step": 14243 + }, + { + "epoch": 0.9, + "grad_norm": 3.422264378514909, + "learning_rate": 2.821110927109744e-07, + "loss": 0.2517, + "step": 14244 + }, + { + "epoch": 0.9, + "grad_norm": 2.0384834894355954, + "learning_rate": 2.817739210964143e-07, + "loss": 0.2479, + "step": 14245 + }, + { + "epoch": 0.9, + "grad_norm": 2.198299849321342, + "learning_rate": 2.814369452494348e-07, + "loss": 0.2527, + "step": 14246 + }, + { + "epoch": 0.9, + "grad_norm": 1.563116367906744, + "learning_rate": 2.811001651840167e-07, + "loss": 0.2415, + "step": 14247 + }, + { + "epoch": 0.9, + "grad_norm": 3.3240036326220106, + "learning_rate": 2.807635809141357e-07, + "loss": 0.2539, + "step": 14248 + }, + { + "epoch": 0.9, + "grad_norm": 1.7415728194434241, + "learning_rate": 2.804271924537555e-07, + "loss": 0.2681, + "step": 14249 + }, + { + "epoch": 0.9, + "grad_norm": 1.9932981358469144, + "learning_rate": 2.800909998168333e-07, + "loss": 0.2923, + "step": 14250 + }, + { + "epoch": 0.9, + "grad_norm": 2.173290918256281, + "learning_rate": 2.797550030173196e-07, + "loss": 0.2571, + "step": 14251 + }, + { + "epoch": 0.9, + "grad_norm": 1.6016106518789888, + "learning_rate": 2.7941920206915443e-07, + "loss": 0.2537, + "step": 14252 + }, + { + "epoch": 0.9, + "grad_norm": 1.9258956452904854, + "learning_rate": 2.790835969862721e-07, + "loss": 0.2591, + "step": 14253 + }, + { + "epoch": 0.9, + "grad_norm": 1.5544338341820572, + "learning_rate": 2.7874818778259647e-07, + "loss": 0.2523, + "step": 14254 + }, + { + "epoch": 0.9, + "grad_norm": 1.7179957423032626, + "learning_rate": 2.7841297447204374e-07, + "loss": 0.2558, + "step": 14255 + }, + { + "epoch": 0.9, + "grad_norm": 0.6069787880469615, + "learning_rate": 2.780779570685238e-07, + "loss": 0.4768, + "step": 14256 + }, + { + "epoch": 0.9, + "grad_norm": 1.9796644903922684, + "learning_rate": 2.7774313558593667e-07, + "loss": 0.2461, + "step": 14257 + }, + { + "epoch": 0.9, + "grad_norm": 4.394635213376667, + "learning_rate": 2.774085100381735e-07, + "loss": 0.2583, + "step": 14258 + }, + { + "epoch": 0.9, + "grad_norm": 2.704137124756935, + "learning_rate": 2.770740804391209e-07, + "loss": 0.2651, + "step": 14259 + }, + { + "epoch": 0.9, + "grad_norm": 1.5746760808446791, + "learning_rate": 2.767398468026522e-07, + "loss": 0.2547, + "step": 14260 + }, + { + "epoch": 0.9, + "grad_norm": 1.4546499789228757, + "learning_rate": 2.764058091426375e-07, + "loss": 0.2529, + "step": 14261 + }, + { + "epoch": 0.9, + "grad_norm": 1.9139363708796495, + "learning_rate": 2.760719674729362e-07, + "loss": 0.2607, + "step": 14262 + }, + { + "epoch": 0.9, + "grad_norm": 1.6077906748820867, + "learning_rate": 2.757383218073983e-07, + "loss": 0.2686, + "step": 14263 + }, + { + "epoch": 0.9, + "grad_norm": 1.965502298767634, + "learning_rate": 2.7540487215986956e-07, + "loss": 0.2644, + "step": 14264 + }, + { + "epoch": 0.9, + "grad_norm": 6.499242183724396, + "learning_rate": 2.750716185441843e-07, + "loss": 0.2774, + "step": 14265 + }, + { + "epoch": 0.9, + "grad_norm": 1.7116661990277742, + "learning_rate": 2.747385609741704e-07, + "loss": 0.2505, + "step": 14266 + }, + { + "epoch": 0.9, + "grad_norm": 1.5395731190456927, + "learning_rate": 2.7440569946364624e-07, + "loss": 0.2774, + "step": 14267 + }, + { + "epoch": 0.9, + "grad_norm": 1.7632531684727295, + "learning_rate": 2.7407303402642305e-07, + "loss": 0.2357, + "step": 14268 + }, + { + "epoch": 0.9, + "grad_norm": 4.288278037230518, + "learning_rate": 2.737405646763042e-07, + "loss": 0.2523, + "step": 14269 + }, + { + "epoch": 0.9, + "grad_norm": 3.7551981683033855, + "learning_rate": 2.7340829142708413e-07, + "loss": 0.2626, + "step": 14270 + }, + { + "epoch": 0.9, + "grad_norm": 4.598934700154759, + "learning_rate": 2.730762142925492e-07, + "loss": 0.2698, + "step": 14271 + }, + { + "epoch": 0.9, + "grad_norm": 1.3412242311082365, + "learning_rate": 2.727443332864782e-07, + "loss": 0.2633, + "step": 14272 + }, + { + "epoch": 0.9, + "grad_norm": 5.464517531893419, + "learning_rate": 2.724126484226408e-07, + "loss": 0.2382, + "step": 14273 + }, + { + "epoch": 0.9, + "grad_norm": 1.4759007137241236, + "learning_rate": 2.720811597148004e-07, + "loss": 0.2479, + "step": 14274 + }, + { + "epoch": 0.9, + "grad_norm": 25.146667317474584, + "learning_rate": 2.7174986717670995e-07, + "loss": 0.2456, + "step": 14275 + }, + { + "epoch": 0.9, + "grad_norm": 1.3813183177187598, + "learning_rate": 2.7141877082211664e-07, + "loss": 0.2673, + "step": 14276 + }, + { + "epoch": 0.9, + "grad_norm": 1.2713790345902896, + "learning_rate": 2.710878706647574e-07, + "loss": 0.2469, + "step": 14277 + }, + { + "epoch": 0.9, + "grad_norm": 1.790607553768402, + "learning_rate": 2.707571667183617e-07, + "loss": 0.2391, + "step": 14278 + }, + { + "epoch": 0.9, + "grad_norm": 0.5958337388918039, + "learning_rate": 2.704266589966503e-07, + "loss": 0.4604, + "step": 14279 + }, + { + "epoch": 0.9, + "grad_norm": 2.13640474544427, + "learning_rate": 2.700963475133378e-07, + "loss": 0.2527, + "step": 14280 + }, + { + "epoch": 0.9, + "grad_norm": 13.868114083564988, + "learning_rate": 2.6976623228212984e-07, + "loss": 0.2594, + "step": 14281 + }, + { + "epoch": 0.9, + "grad_norm": 4.599182555203552, + "learning_rate": 2.694363133167227e-07, + "loss": 0.2476, + "step": 14282 + }, + { + "epoch": 0.9, + "grad_norm": 1.4228390381837008, + "learning_rate": 2.6910659063080436e-07, + "loss": 0.2458, + "step": 14283 + }, + { + "epoch": 0.9, + "grad_norm": 2.0723920804520524, + "learning_rate": 2.687770642380577e-07, + "loss": 0.2479, + "step": 14284 + }, + { + "epoch": 0.9, + "grad_norm": 1.8751990552545472, + "learning_rate": 2.684477341521541e-07, + "loss": 0.2351, + "step": 14285 + }, + { + "epoch": 0.9, + "grad_norm": 6.010661882498264, + "learning_rate": 2.681186003867581e-07, + "loss": 0.2362, + "step": 14286 + }, + { + "epoch": 0.9, + "grad_norm": 3.0588857871857464, + "learning_rate": 2.6778966295552546e-07, + "loss": 0.2571, + "step": 14287 + }, + { + "epoch": 0.9, + "grad_norm": 1.7254992247541738, + "learning_rate": 2.674609218721053e-07, + "loss": 0.2582, + "step": 14288 + }, + { + "epoch": 0.9, + "grad_norm": 2.4153781255581883, + "learning_rate": 2.6713237715013773e-07, + "loss": 0.2272, + "step": 14289 + }, + { + "epoch": 0.9, + "grad_norm": 2.188195995898477, + "learning_rate": 2.668040288032547e-07, + "loss": 0.2548, + "step": 14290 + }, + { + "epoch": 0.9, + "grad_norm": 1.9173221933713778, + "learning_rate": 2.664758768450787e-07, + "loss": 0.2433, + "step": 14291 + }, + { + "epoch": 0.9, + "grad_norm": 1.454731247284654, + "learning_rate": 2.6614792128922704e-07, + "loss": 0.2399, + "step": 14292 + }, + { + "epoch": 0.9, + "grad_norm": 1.8180687209741582, + "learning_rate": 2.6582016214930617e-07, + "loss": 0.2396, + "step": 14293 + }, + { + "epoch": 0.9, + "grad_norm": 4.232075975795993, + "learning_rate": 2.6549259943891567e-07, + "loss": 0.264, + "step": 14294 + }, + { + "epoch": 0.9, + "grad_norm": 1.4984812271406946, + "learning_rate": 2.6516523317164647e-07, + "loss": 0.2487, + "step": 14295 + }, + { + "epoch": 0.9, + "grad_norm": 1.888018384220596, + "learning_rate": 2.648380633610814e-07, + "loss": 0.2498, + "step": 14296 + }, + { + "epoch": 0.9, + "grad_norm": 3.2812566108764694, + "learning_rate": 2.645110900207959e-07, + "loss": 0.2475, + "step": 14297 + }, + { + "epoch": 0.9, + "grad_norm": 2.02868864682591, + "learning_rate": 2.641843131643573e-07, + "loss": 0.2654, + "step": 14298 + }, + { + "epoch": 0.9, + "grad_norm": 6.377008929284228, + "learning_rate": 2.638577328053221e-07, + "loss": 0.2389, + "step": 14299 + }, + { + "epoch": 0.9, + "grad_norm": 1.6874707178334174, + "learning_rate": 2.6353134895724273e-07, + "loss": 0.2376, + "step": 14300 + }, + { + "epoch": 0.9, + "grad_norm": 0.5864689451606748, + "learning_rate": 2.632051616336606e-07, + "loss": 0.4357, + "step": 14301 + }, + { + "epoch": 0.9, + "grad_norm": 1.7089429809272876, + "learning_rate": 2.628791708481099e-07, + "loss": 0.2554, + "step": 14302 + }, + { + "epoch": 0.9, + "grad_norm": 1.9622773307672783, + "learning_rate": 2.625533766141153e-07, + "loss": 0.2371, + "step": 14303 + }, + { + "epoch": 0.9, + "grad_norm": 0.5524344654306752, + "learning_rate": 2.622277789451966e-07, + "loss": 0.4532, + "step": 14304 + }, + { + "epoch": 0.9, + "grad_norm": 10.304183526786705, + "learning_rate": 2.6190237785486237e-07, + "loss": 0.2519, + "step": 14305 + }, + { + "epoch": 0.9, + "grad_norm": 2.68409037718728, + "learning_rate": 2.615771733566147e-07, + "loss": 0.2468, + "step": 14306 + }, + { + "epoch": 0.9, + "grad_norm": 2.051235304864181, + "learning_rate": 2.6125216546394605e-07, + "loss": 0.2468, + "step": 14307 + }, + { + "epoch": 0.9, + "grad_norm": 1.5563011857453501, + "learning_rate": 2.609273541903423e-07, + "loss": 0.2477, + "step": 14308 + }, + { + "epoch": 0.9, + "grad_norm": 2.5047866240943875, + "learning_rate": 2.606027395492805e-07, + "loss": 0.2472, + "step": 14309 + }, + { + "epoch": 0.9, + "grad_norm": 1.3888584474943344, + "learning_rate": 2.6027832155422816e-07, + "loss": 0.2334, + "step": 14310 + }, + { + "epoch": 0.9, + "grad_norm": 19.57147551750055, + "learning_rate": 2.599541002186479e-07, + "loss": 0.2357, + "step": 14311 + }, + { + "epoch": 0.9, + "grad_norm": 1.9039100663689588, + "learning_rate": 2.5963007555599053e-07, + "loss": 0.2385, + "step": 14312 + }, + { + "epoch": 0.9, + "grad_norm": 3.9219853419890818, + "learning_rate": 2.5930624757970205e-07, + "loss": 0.2683, + "step": 14313 + }, + { + "epoch": 0.9, + "grad_norm": 0.6234271727171482, + "learning_rate": 2.5898261630321717e-07, + "loss": 0.4833, + "step": 14314 + }, + { + "epoch": 0.9, + "grad_norm": 1.272651886494399, + "learning_rate": 2.5865918173996476e-07, + "loss": 0.2535, + "step": 14315 + }, + { + "epoch": 0.9, + "grad_norm": 2.1642482305498887, + "learning_rate": 2.5833594390336447e-07, + "loss": 0.2562, + "step": 14316 + }, + { + "epoch": 0.9, + "grad_norm": 2.4491858440788827, + "learning_rate": 2.580129028068273e-07, + "loss": 0.2455, + "step": 14317 + }, + { + "epoch": 0.9, + "grad_norm": 2.0427806683094007, + "learning_rate": 2.576900584637582e-07, + "loss": 0.2562, + "step": 14318 + }, + { + "epoch": 0.9, + "grad_norm": 1.8198197369485298, + "learning_rate": 2.5736741088755237e-07, + "loss": 0.2501, + "step": 14319 + }, + { + "epoch": 0.9, + "grad_norm": 1.7798821511568765, + "learning_rate": 2.570449600915953e-07, + "loss": 0.2625, + "step": 14320 + }, + { + "epoch": 0.9, + "grad_norm": 1.810146008158133, + "learning_rate": 2.5672270608926855e-07, + "loss": 0.2701, + "step": 14321 + }, + { + "epoch": 0.9, + "grad_norm": 3.6614336260490514, + "learning_rate": 2.5640064889394133e-07, + "loss": 0.2521, + "step": 14322 + }, + { + "epoch": 0.9, + "grad_norm": 2.0944552441167548, + "learning_rate": 2.560787885189758e-07, + "loss": 0.2378, + "step": 14323 + }, + { + "epoch": 0.9, + "grad_norm": 1.766888339721811, + "learning_rate": 2.5575712497772855e-07, + "loss": 0.238, + "step": 14324 + }, + { + "epoch": 0.9, + "grad_norm": 1.3644201234443156, + "learning_rate": 2.554356582835443e-07, + "loss": 0.2379, + "step": 14325 + }, + { + "epoch": 0.9, + "grad_norm": 1.6866521254325533, + "learning_rate": 2.55114388449762e-07, + "loss": 0.2405, + "step": 14326 + }, + { + "epoch": 0.9, + "grad_norm": 1.5681223064819383, + "learning_rate": 2.5479331548971197e-07, + "loss": 0.2653, + "step": 14327 + }, + { + "epoch": 0.9, + "grad_norm": 2.453224986863318, + "learning_rate": 2.544724394167153e-07, + "loss": 0.2489, + "step": 14328 + }, + { + "epoch": 0.9, + "grad_norm": 3.2150220301972587, + "learning_rate": 2.5415176024408685e-07, + "loss": 0.2578, + "step": 14329 + }, + { + "epoch": 0.9, + "grad_norm": 4.820358215394252, + "learning_rate": 2.5383127798513094e-07, + "loss": 0.2462, + "step": 14330 + }, + { + "epoch": 0.9, + "grad_norm": 2.4161023122403655, + "learning_rate": 2.535109926531448e-07, + "loss": 0.2375, + "step": 14331 + }, + { + "epoch": 0.9, + "grad_norm": 1.6473279939021952, + "learning_rate": 2.531909042614189e-07, + "loss": 0.2449, + "step": 14332 + }, + { + "epoch": 0.9, + "grad_norm": 2.0198870517579244, + "learning_rate": 2.528710128232337e-07, + "loss": 0.2579, + "step": 14333 + }, + { + "epoch": 0.9, + "grad_norm": 4.056495398028475, + "learning_rate": 2.525513183518624e-07, + "loss": 0.2614, + "step": 14334 + }, + { + "epoch": 0.9, + "grad_norm": 3.9809781313122357, + "learning_rate": 2.5223182086056897e-07, + "loss": 0.2776, + "step": 14335 + }, + { + "epoch": 0.9, + "grad_norm": 2.0434178773472356, + "learning_rate": 2.5191252036260995e-07, + "loss": 0.2606, + "step": 14336 + }, + { + "epoch": 0.9, + "grad_norm": 24.372143294614947, + "learning_rate": 2.5159341687123416e-07, + "loss": 0.2412, + "step": 14337 + }, + { + "epoch": 0.9, + "grad_norm": 2.2169340053483104, + "learning_rate": 2.512745103996822e-07, + "loss": 0.2494, + "step": 14338 + }, + { + "epoch": 0.9, + "grad_norm": 3.239186597790997, + "learning_rate": 2.5095580096118454e-07, + "loss": 0.2634, + "step": 14339 + }, + { + "epoch": 0.9, + "grad_norm": 1.440272853098814, + "learning_rate": 2.506372885689662e-07, + "loss": 0.2594, + "step": 14340 + }, + { + "epoch": 0.9, + "grad_norm": 2.489070722551848, + "learning_rate": 2.503189732362432e-07, + "loss": 0.2652, + "step": 14341 + }, + { + "epoch": 0.9, + "grad_norm": 1.5590398551133493, + "learning_rate": 2.5000085497622227e-07, + "loss": 0.255, + "step": 14342 + }, + { + "epoch": 0.9, + "grad_norm": 2.299914823370509, + "learning_rate": 2.496829338021028e-07, + "loss": 0.2448, + "step": 14343 + }, + { + "epoch": 0.9, + "grad_norm": 6.067053193087143, + "learning_rate": 2.4936520972707487e-07, + "loss": 0.2686, + "step": 14344 + }, + { + "epoch": 0.9, + "grad_norm": 1.384261052662673, + "learning_rate": 2.49047682764324e-07, + "loss": 0.2452, + "step": 14345 + }, + { + "epoch": 0.9, + "grad_norm": 1.5748765177490964, + "learning_rate": 2.4873035292702243e-07, + "loss": 0.2394, + "step": 14346 + }, + { + "epoch": 0.9, + "grad_norm": 2.994306853704753, + "learning_rate": 2.484132202283379e-07, + "loss": 0.235, + "step": 14347 + }, + { + "epoch": 0.9, + "grad_norm": 4.551821758825019, + "learning_rate": 2.4809628468142834e-07, + "loss": 0.2566, + "step": 14348 + }, + { + "epoch": 0.9, + "grad_norm": 3.728590827808792, + "learning_rate": 2.477795462994448e-07, + "loss": 0.2514, + "step": 14349 + }, + { + "epoch": 0.9, + "grad_norm": 1.7651862151945834, + "learning_rate": 2.47463005095529e-07, + "loss": 0.2599, + "step": 14350 + }, + { + "epoch": 0.9, + "grad_norm": 1.5845281034009686, + "learning_rate": 2.4714666108281436e-07, + "loss": 0.2386, + "step": 14351 + }, + { + "epoch": 0.9, + "grad_norm": 2.3639805165174725, + "learning_rate": 2.468305142744259e-07, + "loss": 0.2458, + "step": 14352 + }, + { + "epoch": 0.9, + "grad_norm": 2.3358090310402124, + "learning_rate": 2.465145646834832e-07, + "loss": 0.2839, + "step": 14353 + }, + { + "epoch": 0.9, + "grad_norm": 1.8326084014107749, + "learning_rate": 2.4619881232309405e-07, + "loss": 0.2583, + "step": 14354 + }, + { + "epoch": 0.9, + "grad_norm": 3.716078901090518, + "learning_rate": 2.458832572063591e-07, + "loss": 0.2569, + "step": 14355 + }, + { + "epoch": 0.9, + "grad_norm": 1.665638152776214, + "learning_rate": 2.4556789934637226e-07, + "loss": 0.2643, + "step": 14356 + }, + { + "epoch": 0.9, + "grad_norm": 2.3691234592486214, + "learning_rate": 2.452527387562187e-07, + "loss": 0.264, + "step": 14357 + }, + { + "epoch": 0.9, + "grad_norm": 1.6303826087677693, + "learning_rate": 2.44937775448974e-07, + "loss": 0.248, + "step": 14358 + }, + { + "epoch": 0.9, + "grad_norm": 2.0313384949244813, + "learning_rate": 2.446230094377067e-07, + "loss": 0.2543, + "step": 14359 + }, + { + "epoch": 0.9, + "grad_norm": 3.520181498866394, + "learning_rate": 2.4430844073547786e-07, + "loss": 0.2543, + "step": 14360 + }, + { + "epoch": 0.9, + "grad_norm": 1.48213903885956, + "learning_rate": 2.439940693553389e-07, + "loss": 0.2608, + "step": 14361 + }, + { + "epoch": 0.9, + "grad_norm": 1.6595244579834318, + "learning_rate": 2.436798953103331e-07, + "loss": 0.2354, + "step": 14362 + }, + { + "epoch": 0.9, + "grad_norm": 2.821776732101652, + "learning_rate": 2.4336591861349734e-07, + "loss": 0.2395, + "step": 14363 + }, + { + "epoch": 0.9, + "grad_norm": 2.6243041882966267, + "learning_rate": 2.430521392778573e-07, + "loss": 0.2387, + "step": 14364 + }, + { + "epoch": 0.9, + "grad_norm": 2.224976937315625, + "learning_rate": 2.4273855731643427e-07, + "loss": 0.265, + "step": 14365 + }, + { + "epoch": 0.9, + "grad_norm": 4.428468718138498, + "learning_rate": 2.4242517274223776e-07, + "loss": 0.2701, + "step": 14366 + }, + { + "epoch": 0.9, + "grad_norm": 3.651647854495142, + "learning_rate": 2.421119855682713e-07, + "loss": 0.2388, + "step": 14367 + }, + { + "epoch": 0.9, + "grad_norm": 2.1431795516245846, + "learning_rate": 2.417989958075295e-07, + "loss": 0.2473, + "step": 14368 + }, + { + "epoch": 0.9, + "grad_norm": 1.736890072614024, + "learning_rate": 2.414862034729998e-07, + "loss": 0.2267, + "step": 14369 + }, + { + "epoch": 0.9, + "grad_norm": 2.6582103219946127, + "learning_rate": 2.4117360857765836e-07, + "loss": 0.2711, + "step": 14370 + }, + { + "epoch": 0.9, + "grad_norm": 1.9213955854946294, + "learning_rate": 2.408612111344771e-07, + "loss": 0.2475, + "step": 14371 + }, + { + "epoch": 0.9, + "grad_norm": 1.453657492157642, + "learning_rate": 2.4054901115641684e-07, + "loss": 0.2423, + "step": 14372 + }, + { + "epoch": 0.9, + "grad_norm": 2.2477633989523333, + "learning_rate": 2.402370086564326e-07, + "loss": 0.2734, + "step": 14373 + }, + { + "epoch": 0.9, + "grad_norm": 3.1075849917982463, + "learning_rate": 2.399252036474686e-07, + "loss": 0.2374, + "step": 14374 + }, + { + "epoch": 0.9, + "grad_norm": 0.5765891430791905, + "learning_rate": 2.396135961424628e-07, + "loss": 0.4522, + "step": 14375 + }, + { + "epoch": 0.9, + "grad_norm": 2.3969949856910713, + "learning_rate": 2.393021861543449e-07, + "loss": 0.2587, + "step": 14376 + }, + { + "epoch": 0.9, + "grad_norm": 2.2012000712684663, + "learning_rate": 2.3899097369603385e-07, + "loss": 0.2437, + "step": 14377 + }, + { + "epoch": 0.9, + "grad_norm": 1.3542205146342408, + "learning_rate": 2.38679958780445e-07, + "loss": 0.2485, + "step": 14378 + }, + { + "epoch": 0.9, + "grad_norm": 3.7431888350531586, + "learning_rate": 2.3836914142048194e-07, + "loss": 0.2611, + "step": 14379 + }, + { + "epoch": 0.9, + "grad_norm": 1.878420594068702, + "learning_rate": 2.3805852162903987e-07, + "loss": 0.2539, + "step": 14380 + }, + { + "epoch": 0.9, + "grad_norm": 2.0501687454298296, + "learning_rate": 2.3774809941900844e-07, + "loss": 0.2462, + "step": 14381 + }, + { + "epoch": 0.9, + "grad_norm": 1.6814288297169528, + "learning_rate": 2.3743787480326742e-07, + "loss": 0.2493, + "step": 14382 + }, + { + "epoch": 0.9, + "grad_norm": 1.4408187485143142, + "learning_rate": 2.3712784779468756e-07, + "loss": 0.2543, + "step": 14383 + }, + { + "epoch": 0.9, + "grad_norm": 2.189193046287574, + "learning_rate": 2.3681801840613362e-07, + "loss": 0.2395, + "step": 14384 + }, + { + "epoch": 0.9, + "grad_norm": 2.229798587352424, + "learning_rate": 2.3650838665045972e-07, + "loss": 0.2501, + "step": 14385 + }, + { + "epoch": 0.9, + "grad_norm": 0.5913219911759955, + "learning_rate": 2.361989525405145e-07, + "loss": 0.5088, + "step": 14386 + }, + { + "epoch": 0.9, + "grad_norm": 1.6949106012743453, + "learning_rate": 2.3588971608913604e-07, + "loss": 0.2478, + "step": 14387 + }, + { + "epoch": 0.9, + "grad_norm": 1.6218761311965364, + "learning_rate": 2.3558067730915513e-07, + "loss": 0.265, + "step": 14388 + }, + { + "epoch": 0.9, + "grad_norm": 0.5670724941813509, + "learning_rate": 2.3527183621339434e-07, + "loss": 0.4422, + "step": 14389 + }, + { + "epoch": 0.9, + "grad_norm": 1.608523501003697, + "learning_rate": 2.3496319281466895e-07, + "loss": 0.242, + "step": 14390 + }, + { + "epoch": 0.91, + "grad_norm": 1.733549344985571, + "learning_rate": 2.346547471257832e-07, + "loss": 0.2607, + "step": 14391 + }, + { + "epoch": 0.91, + "grad_norm": 4.095912393550003, + "learning_rate": 2.3434649915953623e-07, + "loss": 0.2513, + "step": 14392 + }, + { + "epoch": 0.91, + "grad_norm": 3.9514309466854387, + "learning_rate": 2.3403844892871896e-07, + "loss": 0.2403, + "step": 14393 + }, + { + "epoch": 0.91, + "grad_norm": 2.280382700689735, + "learning_rate": 2.337305964461112e-07, + "loss": 0.2472, + "step": 14394 + }, + { + "epoch": 0.91, + "grad_norm": 7.861369097382789, + "learning_rate": 2.334229417244871e-07, + "loss": 0.2915, + "step": 14395 + }, + { + "epoch": 0.91, + "grad_norm": 1.7780582043153892, + "learning_rate": 2.3311548477661038e-07, + "loss": 0.2412, + "step": 14396 + }, + { + "epoch": 0.91, + "grad_norm": 1.3023809363689742, + "learning_rate": 2.3280822561524031e-07, + "loss": 0.2448, + "step": 14397 + }, + { + "epoch": 0.91, + "grad_norm": 1.3497213326356283, + "learning_rate": 2.325011642531244e-07, + "loss": 0.2253, + "step": 14398 + }, + { + "epoch": 0.91, + "grad_norm": 2.0955837113376727, + "learning_rate": 2.3219430070300254e-07, + "loss": 0.2589, + "step": 14399 + }, + { + "epoch": 0.91, + "grad_norm": 2.2183426439622114, + "learning_rate": 2.3188763497760725e-07, + "loss": 0.2486, + "step": 14400 + }, + { + "epoch": 0.91, + "grad_norm": 8.473385434235075, + "learning_rate": 2.3158116708966448e-07, + "loss": 0.2436, + "step": 14401 + }, + { + "epoch": 0.91, + "grad_norm": 2.461460164221812, + "learning_rate": 2.31274897051888e-07, + "loss": 0.2448, + "step": 14402 + }, + { + "epoch": 0.91, + "grad_norm": 5.590503190490564, + "learning_rate": 2.3096882487698703e-07, + "loss": 0.2574, + "step": 14403 + }, + { + "epoch": 0.91, + "grad_norm": 1.8828166767004388, + "learning_rate": 2.306629505776592e-07, + "loss": 0.2563, + "step": 14404 + }, + { + "epoch": 0.91, + "grad_norm": 1.951661123735288, + "learning_rate": 2.3035727416659769e-07, + "loss": 0.2545, + "step": 14405 + }, + { + "epoch": 0.91, + "grad_norm": 1.580776276640741, + "learning_rate": 2.300517956564846e-07, + "loss": 0.2396, + "step": 14406 + }, + { + "epoch": 0.91, + "grad_norm": 0.6372326130218704, + "learning_rate": 2.2974651505999425e-07, + "loss": 0.4515, + "step": 14407 + }, + { + "epoch": 0.91, + "grad_norm": 1.6914910927303681, + "learning_rate": 2.2944143238979366e-07, + "loss": 0.2527, + "step": 14408 + }, + { + "epoch": 0.91, + "grad_norm": 1.5940336644161897, + "learning_rate": 2.291365476585422e-07, + "loss": 0.2505, + "step": 14409 + }, + { + "epoch": 0.91, + "grad_norm": 2.0332868183136243, + "learning_rate": 2.2883186087888977e-07, + "loss": 0.2595, + "step": 14410 + }, + { + "epoch": 0.91, + "grad_norm": 2.3147909450333746, + "learning_rate": 2.2852737206347786e-07, + "loss": 0.2482, + "step": 14411 + }, + { + "epoch": 0.91, + "grad_norm": 2.6875666958481723, + "learning_rate": 2.2822308122493976e-07, + "loss": 0.2426, + "step": 14412 + }, + { + "epoch": 0.91, + "grad_norm": 2.4211114418589434, + "learning_rate": 2.2791898837590197e-07, + "loss": 0.2424, + "step": 14413 + }, + { + "epoch": 0.91, + "grad_norm": 2.3004597385252072, + "learning_rate": 2.2761509352898114e-07, + "loss": 0.2405, + "step": 14414 + }, + { + "epoch": 0.91, + "grad_norm": 2.79546572072472, + "learning_rate": 2.2731139669678714e-07, + "loss": 0.2537, + "step": 14415 + }, + { + "epoch": 0.91, + "grad_norm": 1.8991778276486342, + "learning_rate": 2.2700789789192047e-07, + "loss": 0.2639, + "step": 14416 + }, + { + "epoch": 0.91, + "grad_norm": 2.1232139901719056, + "learning_rate": 2.267045971269738e-07, + "loss": 0.2575, + "step": 14417 + }, + { + "epoch": 0.91, + "grad_norm": 6.485955926120402, + "learning_rate": 2.2640149441453208e-07, + "loss": 0.2552, + "step": 14418 + }, + { + "epoch": 0.91, + "grad_norm": 1.736427901752236, + "learning_rate": 2.2609858976717136e-07, + "loss": 0.2514, + "step": 14419 + }, + { + "epoch": 0.91, + "grad_norm": 3.1880579531991087, + "learning_rate": 2.2579588319745883e-07, + "loss": 0.232, + "step": 14420 + }, + { + "epoch": 0.91, + "grad_norm": 1.815118180142573, + "learning_rate": 2.2549337471795553e-07, + "loss": 0.2424, + "step": 14421 + }, + { + "epoch": 0.91, + "grad_norm": 2.3752905844804904, + "learning_rate": 2.2519106434121252e-07, + "loss": 0.2527, + "step": 14422 + }, + { + "epoch": 0.91, + "grad_norm": 7.792289956779686, + "learning_rate": 2.2488895207977312e-07, + "loss": 0.2513, + "step": 14423 + }, + { + "epoch": 0.91, + "grad_norm": 2.562866945134141, + "learning_rate": 2.245870379461723e-07, + "loss": 0.2743, + "step": 14424 + }, + { + "epoch": 0.91, + "grad_norm": 1.6285545836109259, + "learning_rate": 2.242853219529384e-07, + "loss": 0.2343, + "step": 14425 + }, + { + "epoch": 0.91, + "grad_norm": 6.25526385611361, + "learning_rate": 2.2398380411258858e-07, + "loss": 0.247, + "step": 14426 + }, + { + "epoch": 0.91, + "grad_norm": 1.3273318668415506, + "learning_rate": 2.2368248443763451e-07, + "loss": 0.2348, + "step": 14427 + }, + { + "epoch": 0.91, + "grad_norm": 2.2994044788467543, + "learning_rate": 2.2338136294057677e-07, + "loss": 0.2448, + "step": 14428 + }, + { + "epoch": 0.91, + "grad_norm": 3.657924935810576, + "learning_rate": 2.2308043963391034e-07, + "loss": 0.2749, + "step": 14429 + }, + { + "epoch": 0.91, + "grad_norm": 1.9342738710559804, + "learning_rate": 2.2277971453012193e-07, + "loss": 0.2678, + "step": 14430 + }, + { + "epoch": 0.91, + "grad_norm": 2.58915421386696, + "learning_rate": 2.2247918764168874e-07, + "loss": 0.2363, + "step": 14431 + }, + { + "epoch": 0.91, + "grad_norm": 1.3492375979172468, + "learning_rate": 2.2217885898107915e-07, + "loss": 0.2566, + "step": 14432 + }, + { + "epoch": 0.91, + "grad_norm": 2.0956570689213967, + "learning_rate": 2.2187872856075544e-07, + "loss": 0.2487, + "step": 14433 + }, + { + "epoch": 0.91, + "grad_norm": 2.3599289344086705, + "learning_rate": 2.2157879639317038e-07, + "loss": 0.2549, + "step": 14434 + }, + { + "epoch": 0.91, + "grad_norm": 3.4080477810531007, + "learning_rate": 2.2127906249076903e-07, + "loss": 0.2407, + "step": 14435 + }, + { + "epoch": 0.91, + "grad_norm": 7.772172873548713, + "learning_rate": 2.209795268659859e-07, + "loss": 0.2739, + "step": 14436 + }, + { + "epoch": 0.91, + "grad_norm": 1.7411833702255235, + "learning_rate": 2.2068018953125103e-07, + "loss": 0.2594, + "step": 14437 + }, + { + "epoch": 0.91, + "grad_norm": 3.220970600829254, + "learning_rate": 2.203810504989845e-07, + "loss": 0.2555, + "step": 14438 + }, + { + "epoch": 0.91, + "grad_norm": 0.5872756190965056, + "learning_rate": 2.2008210978159806e-07, + "loss": 0.465, + "step": 14439 + }, + { + "epoch": 0.91, + "grad_norm": 1.851639766650723, + "learning_rate": 2.1978336739149454e-07, + "loss": 0.2621, + "step": 14440 + }, + { + "epoch": 0.91, + "grad_norm": 1.6739715102465174, + "learning_rate": 2.1948482334106957e-07, + "loss": 0.2453, + "step": 14441 + }, + { + "epoch": 0.91, + "grad_norm": 2.050088892825613, + "learning_rate": 2.1918647764271105e-07, + "loss": 0.2378, + "step": 14442 + }, + { + "epoch": 0.91, + "grad_norm": 1.8791823839520525, + "learning_rate": 2.1888833030879685e-07, + "loss": 0.2463, + "step": 14443 + }, + { + "epoch": 0.91, + "grad_norm": 2.755452865345263, + "learning_rate": 2.1859038135169764e-07, + "loss": 0.2743, + "step": 14444 + }, + { + "epoch": 0.91, + "grad_norm": 2.249296166630754, + "learning_rate": 2.1829263078377683e-07, + "loss": 0.259, + "step": 14445 + }, + { + "epoch": 0.91, + "grad_norm": 1.645143216423926, + "learning_rate": 2.179950786173879e-07, + "loss": 0.2462, + "step": 14446 + }, + { + "epoch": 0.91, + "grad_norm": 1.779318669753915, + "learning_rate": 2.1769772486487705e-07, + "loss": 0.2586, + "step": 14447 + }, + { + "epoch": 0.91, + "grad_norm": 4.837384937670963, + "learning_rate": 2.174005695385817e-07, + "loss": 0.2718, + "step": 14448 + }, + { + "epoch": 0.91, + "grad_norm": 3.052738596929044, + "learning_rate": 2.171036126508319e-07, + "loss": 0.2602, + "step": 14449 + }, + { + "epoch": 0.91, + "grad_norm": 2.7928776179684216, + "learning_rate": 2.1680685421394842e-07, + "loss": 0.2711, + "step": 14450 + }, + { + "epoch": 0.91, + "grad_norm": 2.4483639431258877, + "learning_rate": 2.1651029424024417e-07, + "loss": 0.2412, + "step": 14451 + }, + { + "epoch": 0.91, + "grad_norm": 7.2787994474514255, + "learning_rate": 2.1621393274202429e-07, + "loss": 0.259, + "step": 14452 + }, + { + "epoch": 0.91, + "grad_norm": 2.6261215902194026, + "learning_rate": 2.1591776973158564e-07, + "loss": 0.2349, + "step": 14453 + }, + { + "epoch": 0.91, + "grad_norm": 1.6680651222154, + "learning_rate": 2.1562180522121613e-07, + "loss": 0.2498, + "step": 14454 + }, + { + "epoch": 0.91, + "grad_norm": 1.8700943683892384, + "learning_rate": 2.153260392231965e-07, + "loss": 0.2511, + "step": 14455 + }, + { + "epoch": 0.91, + "grad_norm": 1.567964666506948, + "learning_rate": 2.1503047174979695e-07, + "loss": 0.2715, + "step": 14456 + }, + { + "epoch": 0.91, + "grad_norm": 4.614727062017292, + "learning_rate": 2.147351028132827e-07, + "loss": 0.2762, + "step": 14457 + }, + { + "epoch": 0.91, + "grad_norm": 2.8235129338382885, + "learning_rate": 2.144399324259089e-07, + "loss": 0.2731, + "step": 14458 + }, + { + "epoch": 0.91, + "grad_norm": 2.5803840532779754, + "learning_rate": 2.1414496059992183e-07, + "loss": 0.2477, + "step": 14459 + }, + { + "epoch": 0.91, + "grad_norm": 2.0769844778878053, + "learning_rate": 2.138501873475607e-07, + "loss": 0.2418, + "step": 14460 + }, + { + "epoch": 0.91, + "grad_norm": 2.7474217544896757, + "learning_rate": 2.135556126810573e-07, + "loss": 0.241, + "step": 14461 + }, + { + "epoch": 0.91, + "grad_norm": 2.3863284081175946, + "learning_rate": 2.13261236612633e-07, + "loss": 0.2686, + "step": 14462 + }, + { + "epoch": 0.91, + "grad_norm": 0.6041412725538277, + "learning_rate": 2.1296705915450244e-07, + "loss": 0.488, + "step": 14463 + }, + { + "epoch": 0.91, + "grad_norm": 1.34452761724846, + "learning_rate": 2.1267308031887036e-07, + "loss": 0.237, + "step": 14464 + }, + { + "epoch": 0.91, + "grad_norm": 3.4947442283899726, + "learning_rate": 2.123793001179364e-07, + "loss": 0.2694, + "step": 14465 + }, + { + "epoch": 0.91, + "grad_norm": 3.26729705768746, + "learning_rate": 2.1208571856388915e-07, + "loss": 0.2596, + "step": 14466 + }, + { + "epoch": 0.91, + "grad_norm": 1.775837042839868, + "learning_rate": 2.117923356689089e-07, + "loss": 0.2589, + "step": 14467 + }, + { + "epoch": 0.91, + "grad_norm": 2.5906264427923897, + "learning_rate": 2.114991514451692e-07, + "loss": 0.2517, + "step": 14468 + }, + { + "epoch": 0.91, + "grad_norm": 0.6102118575831724, + "learning_rate": 2.112061659048359e-07, + "loss": 0.4594, + "step": 14469 + }, + { + "epoch": 0.91, + "grad_norm": 3.4509361422530795, + "learning_rate": 2.109133790600648e-07, + "loss": 0.2598, + "step": 14470 + }, + { + "epoch": 0.91, + "grad_norm": 4.615792194612049, + "learning_rate": 2.1062079092300402e-07, + "loss": 0.2468, + "step": 14471 + }, + { + "epoch": 0.91, + "grad_norm": 2.0041096957855564, + "learning_rate": 2.103284015057927e-07, + "loss": 0.2534, + "step": 14472 + }, + { + "epoch": 0.91, + "grad_norm": 2.2664870866984965, + "learning_rate": 2.1003621082056392e-07, + "loss": 0.2418, + "step": 14473 + }, + { + "epoch": 0.91, + "grad_norm": 2.3768270188270497, + "learning_rate": 2.097442188794402e-07, + "loss": 0.2837, + "step": 14474 + }, + { + "epoch": 0.91, + "grad_norm": 1.427340659654771, + "learning_rate": 2.0945242569453795e-07, + "loss": 0.253, + "step": 14475 + }, + { + "epoch": 0.91, + "grad_norm": 2.5161216053208117, + "learning_rate": 2.091608312779625e-07, + "loss": 0.2401, + "step": 14476 + }, + { + "epoch": 0.91, + "grad_norm": 2.675503374474496, + "learning_rate": 2.088694356418147e-07, + "loss": 0.2688, + "step": 14477 + }, + { + "epoch": 0.91, + "grad_norm": 1.8923580657315366, + "learning_rate": 2.0857823879818384e-07, + "loss": 0.2454, + "step": 14478 + }, + { + "epoch": 0.91, + "grad_norm": 2.3759402065898474, + "learning_rate": 2.0828724075915298e-07, + "loss": 0.2556, + "step": 14479 + }, + { + "epoch": 0.91, + "grad_norm": 2.476388684288924, + "learning_rate": 2.079964415367941e-07, + "loss": 0.2338, + "step": 14480 + }, + { + "epoch": 0.91, + "grad_norm": 2.4283349735457755, + "learning_rate": 2.0770584114317483e-07, + "loss": 0.2636, + "step": 14481 + }, + { + "epoch": 0.91, + "grad_norm": 1.6486641676674842, + "learning_rate": 2.074154395903527e-07, + "loss": 0.2402, + "step": 14482 + }, + { + "epoch": 0.91, + "grad_norm": 1.6213172513410736, + "learning_rate": 2.0712523689037645e-07, + "loss": 0.2556, + "step": 14483 + }, + { + "epoch": 0.91, + "grad_norm": 1.8537371799773197, + "learning_rate": 2.0683523305528696e-07, + "loss": 0.24, + "step": 14484 + }, + { + "epoch": 0.91, + "grad_norm": 2.7785473319886127, + "learning_rate": 2.0654542809711798e-07, + "loss": 0.2585, + "step": 14485 + }, + { + "epoch": 0.91, + "grad_norm": 1.7651719366311103, + "learning_rate": 2.0625582202789317e-07, + "loss": 0.2319, + "step": 14486 + }, + { + "epoch": 0.91, + "grad_norm": 2.1273793923211395, + "learning_rate": 2.0596641485962854e-07, + "loss": 0.2457, + "step": 14487 + }, + { + "epoch": 0.91, + "grad_norm": 4.21819875878106, + "learning_rate": 2.056772066043322e-07, + "loss": 0.2698, + "step": 14488 + }, + { + "epoch": 0.91, + "grad_norm": 1.5709435104374834, + "learning_rate": 2.053881972740046e-07, + "loss": 0.2511, + "step": 14489 + }, + { + "epoch": 0.91, + "grad_norm": 1.8761287437363046, + "learning_rate": 2.0509938688063723e-07, + "loss": 0.2411, + "step": 14490 + }, + { + "epoch": 0.91, + "grad_norm": 1.9211021468018759, + "learning_rate": 2.0481077543621275e-07, + "loss": 0.2368, + "step": 14491 + }, + { + "epoch": 0.91, + "grad_norm": 1.5115448951643704, + "learning_rate": 2.04522362952706e-07, + "loss": 0.2417, + "step": 14492 + }, + { + "epoch": 0.91, + "grad_norm": 5.088136531250573, + "learning_rate": 2.0423414944208464e-07, + "loss": 0.2441, + "step": 14493 + }, + { + "epoch": 0.91, + "grad_norm": 2.087766755890298, + "learning_rate": 2.0394613491630688e-07, + "loss": 0.2645, + "step": 14494 + }, + { + "epoch": 0.91, + "grad_norm": 2.107246310436187, + "learning_rate": 2.036583193873226e-07, + "loss": 0.2588, + "step": 14495 + }, + { + "epoch": 0.91, + "grad_norm": 2.204240909998728, + "learning_rate": 2.0337070286707283e-07, + "loss": 0.2476, + "step": 14496 + }, + { + "epoch": 0.91, + "grad_norm": 4.413546846926048, + "learning_rate": 2.0308328536749355e-07, + "loss": 0.2509, + "step": 14497 + }, + { + "epoch": 0.91, + "grad_norm": 1.7561510964926772, + "learning_rate": 2.0279606690050856e-07, + "loss": 0.2469, + "step": 14498 + }, + { + "epoch": 0.91, + "grad_norm": 8.338753213289381, + "learning_rate": 2.0250904747803614e-07, + "loss": 0.2612, + "step": 14499 + }, + { + "epoch": 0.91, + "grad_norm": 2.0824586160643634, + "learning_rate": 2.0222222711198392e-07, + "loss": 0.2393, + "step": 14500 + }, + { + "epoch": 0.91, + "grad_norm": 2.007315300107275, + "learning_rate": 2.0193560581425408e-07, + "loss": 0.2318, + "step": 14501 + }, + { + "epoch": 0.91, + "grad_norm": 1.9865525920497915, + "learning_rate": 2.0164918359673769e-07, + "loss": 0.2509, + "step": 14502 + }, + { + "epoch": 0.91, + "grad_norm": 1.7197148031101857, + "learning_rate": 2.013629604713202e-07, + "loss": 0.2382, + "step": 14503 + }, + { + "epoch": 0.91, + "grad_norm": 1.8947838501290832, + "learning_rate": 2.0107693644987492e-07, + "loss": 0.254, + "step": 14504 + }, + { + "epoch": 0.91, + "grad_norm": 4.902731433394685, + "learning_rate": 2.007911115442729e-07, + "loss": 0.2588, + "step": 14505 + }, + { + "epoch": 0.91, + "grad_norm": 2.289651256926037, + "learning_rate": 2.005054857663724e-07, + "loss": 0.2502, + "step": 14506 + }, + { + "epoch": 0.91, + "grad_norm": 2.186328093635684, + "learning_rate": 2.0022005912802345e-07, + "loss": 0.2609, + "step": 14507 + }, + { + "epoch": 0.91, + "grad_norm": 5.237839020173491, + "learning_rate": 1.999348316410693e-07, + "loss": 0.2598, + "step": 14508 + }, + { + "epoch": 0.91, + "grad_norm": 1.3844167421565585, + "learning_rate": 1.9964980331734552e-07, + "loss": 0.2406, + "step": 14509 + }, + { + "epoch": 0.91, + "grad_norm": 0.5850318063128662, + "learning_rate": 1.9936497416867816e-07, + "loss": 0.4568, + "step": 14510 + }, + { + "epoch": 0.91, + "grad_norm": 1.7980759564399063, + "learning_rate": 1.9908034420688394e-07, + "loss": 0.266, + "step": 14511 + }, + { + "epoch": 0.91, + "grad_norm": 1.731553582372513, + "learning_rate": 1.9879591344377335e-07, + "loss": 0.2584, + "step": 14512 + }, + { + "epoch": 0.91, + "grad_norm": 1.4881645457346415, + "learning_rate": 1.9851168189114923e-07, + "loss": 0.2767, + "step": 14513 + }, + { + "epoch": 0.91, + "grad_norm": 1.590704362330037, + "learning_rate": 1.9822764956080375e-07, + "loss": 0.2573, + "step": 14514 + }, + { + "epoch": 0.91, + "grad_norm": 1.706228859345715, + "learning_rate": 1.9794381646452198e-07, + "loss": 0.2427, + "step": 14515 + }, + { + "epoch": 0.91, + "grad_norm": 1.93471096274332, + "learning_rate": 1.9766018261408005e-07, + "loss": 0.2557, + "step": 14516 + }, + { + "epoch": 0.91, + "grad_norm": 4.9366506521476845, + "learning_rate": 1.973767480212474e-07, + "loss": 0.255, + "step": 14517 + }, + { + "epoch": 0.91, + "grad_norm": 2.2290112195682714, + "learning_rate": 1.9709351269778408e-07, + "loss": 0.272, + "step": 14518 + }, + { + "epoch": 0.91, + "grad_norm": 3.3340555024224874, + "learning_rate": 1.9681047665544127e-07, + "loss": 0.2726, + "step": 14519 + }, + { + "epoch": 0.91, + "grad_norm": 1.747504705651108, + "learning_rate": 1.9652763990596345e-07, + "loss": 0.2516, + "step": 14520 + }, + { + "epoch": 0.91, + "grad_norm": 2.562454675871726, + "learning_rate": 1.9624500246108625e-07, + "loss": 0.2443, + "step": 14521 + }, + { + "epoch": 0.91, + "grad_norm": 2.5122784108971654, + "learning_rate": 1.9596256433253635e-07, + "loss": 0.2681, + "step": 14522 + }, + { + "epoch": 0.91, + "grad_norm": 2.092076801555144, + "learning_rate": 1.956803255320322e-07, + "loss": 0.2794, + "step": 14523 + }, + { + "epoch": 0.91, + "grad_norm": 1.704990015234847, + "learning_rate": 1.953982860712844e-07, + "loss": 0.2503, + "step": 14524 + }, + { + "epoch": 0.91, + "grad_norm": 0.6040679585377352, + "learning_rate": 1.9511644596199643e-07, + "loss": 0.4704, + "step": 14525 + }, + { + "epoch": 0.91, + "grad_norm": 3.2386558262934875, + "learning_rate": 1.9483480521586107e-07, + "loss": 0.2623, + "step": 14526 + }, + { + "epoch": 0.91, + "grad_norm": 0.6204712726676649, + "learning_rate": 1.945533638445646e-07, + "loss": 0.4777, + "step": 14527 + }, + { + "epoch": 0.91, + "grad_norm": 2.6546399832572103, + "learning_rate": 1.9427212185978428e-07, + "loss": 0.2554, + "step": 14528 + }, + { + "epoch": 0.91, + "grad_norm": 3.868847229193744, + "learning_rate": 1.9399107927319028e-07, + "loss": 0.2563, + "step": 14529 + }, + { + "epoch": 0.91, + "grad_norm": 2.100870480623982, + "learning_rate": 1.9371023609644268e-07, + "loss": 0.2759, + "step": 14530 + }, + { + "epoch": 0.91, + "grad_norm": 1.7400093479151766, + "learning_rate": 1.9342959234119385e-07, + "loss": 0.2337, + "step": 14531 + }, + { + "epoch": 0.91, + "grad_norm": 1.7265859632734524, + "learning_rate": 1.931491480190889e-07, + "loss": 0.2514, + "step": 14532 + }, + { + "epoch": 0.91, + "grad_norm": 2.2983224300409826, + "learning_rate": 1.9286890314176353e-07, + "loss": 0.2502, + "step": 14533 + }, + { + "epoch": 0.91, + "grad_norm": 2.2969022201778935, + "learning_rate": 1.9258885772084567e-07, + "loss": 0.2416, + "step": 14534 + }, + { + "epoch": 0.91, + "grad_norm": 7.508959980294738, + "learning_rate": 1.9230901176795548e-07, + "loss": 0.2665, + "step": 14535 + }, + { + "epoch": 0.91, + "grad_norm": 3.582752042354064, + "learning_rate": 1.9202936529470363e-07, + "loss": 0.2625, + "step": 14536 + }, + { + "epoch": 0.91, + "grad_norm": 2.081445959892226, + "learning_rate": 1.9174991831269362e-07, + "loss": 0.2679, + "step": 14537 + }, + { + "epoch": 0.91, + "grad_norm": 1.287830484110232, + "learning_rate": 1.9147067083351954e-07, + "loss": 0.256, + "step": 14538 + }, + { + "epoch": 0.91, + "grad_norm": 1.5175800942879811, + "learning_rate": 1.911916228687688e-07, + "loss": 0.2376, + "step": 14539 + }, + { + "epoch": 0.91, + "grad_norm": 1.4282350306356104, + "learning_rate": 1.909127744300182e-07, + "loss": 0.2453, + "step": 14540 + }, + { + "epoch": 0.91, + "grad_norm": 2.5090376650146013, + "learning_rate": 1.9063412552883852e-07, + "loss": 0.2629, + "step": 14541 + }, + { + "epoch": 0.91, + "grad_norm": 3.2789070229309223, + "learning_rate": 1.9035567617679163e-07, + "loss": 0.2532, + "step": 14542 + }, + { + "epoch": 0.91, + "grad_norm": 2.200914609783236, + "learning_rate": 1.9007742638543104e-07, + "loss": 0.2634, + "step": 14543 + }, + { + "epoch": 0.91, + "grad_norm": 1.4406926563074134, + "learning_rate": 1.8979937616630084e-07, + "loss": 0.2368, + "step": 14544 + }, + { + "epoch": 0.91, + "grad_norm": 1.9354443070109852, + "learning_rate": 1.8952152553093906e-07, + "loss": 0.24, + "step": 14545 + }, + { + "epoch": 0.91, + "grad_norm": 2.382994471291222, + "learning_rate": 1.892438744908731e-07, + "loss": 0.2618, + "step": 14546 + }, + { + "epoch": 0.91, + "grad_norm": 2.4377555631615566, + "learning_rate": 1.889664230576238e-07, + "loss": 0.251, + "step": 14547 + }, + { + "epoch": 0.91, + "grad_norm": 2.6563752045766305, + "learning_rate": 1.8868917124270248e-07, + "loss": 0.2372, + "step": 14548 + }, + { + "epoch": 0.91, + "grad_norm": 1.7992839791383575, + "learning_rate": 1.8841211905761326e-07, + "loss": 0.2531, + "step": 14549 + }, + { + "epoch": 0.92, + "grad_norm": 1.48632246323107, + "learning_rate": 1.881352665138525e-07, + "loss": 0.2559, + "step": 14550 + }, + { + "epoch": 0.92, + "grad_norm": 2.2703408625191157, + "learning_rate": 1.8785861362290603e-07, + "loss": 0.2454, + "step": 14551 + }, + { + "epoch": 0.92, + "grad_norm": 2.1449732033066944, + "learning_rate": 1.8758216039625243e-07, + "loss": 0.263, + "step": 14552 + }, + { + "epoch": 0.92, + "grad_norm": 2.95268284637273, + "learning_rate": 1.873059068453631e-07, + "loss": 0.2396, + "step": 14553 + }, + { + "epoch": 0.92, + "grad_norm": 2.5756193425105325, + "learning_rate": 1.870298529817005e-07, + "loss": 0.2581, + "step": 14554 + }, + { + "epoch": 0.92, + "grad_norm": 4.413862692424448, + "learning_rate": 1.8675399881671774e-07, + "loss": 0.2625, + "step": 14555 + }, + { + "epoch": 0.92, + "grad_norm": 1.978311636808432, + "learning_rate": 1.8647834436186009e-07, + "loss": 0.2501, + "step": 14556 + }, + { + "epoch": 0.92, + "grad_norm": 3.0528327277025133, + "learning_rate": 1.862028896285667e-07, + "loss": 0.2568, + "step": 14557 + }, + { + "epoch": 0.92, + "grad_norm": 1.7813793578847024, + "learning_rate": 1.859276346282657e-07, + "loss": 0.2491, + "step": 14558 + }, + { + "epoch": 0.92, + "grad_norm": 1.5221083090039145, + "learning_rate": 1.856525793723779e-07, + "loss": 0.2474, + "step": 14559 + }, + { + "epoch": 0.92, + "grad_norm": 2.390504163933587, + "learning_rate": 1.853777238723148e-07, + "loss": 0.2787, + "step": 14560 + }, + { + "epoch": 0.92, + "grad_norm": 5.200196410811054, + "learning_rate": 1.8510306813948276e-07, + "loss": 0.241, + "step": 14561 + }, + { + "epoch": 0.92, + "grad_norm": 2.498783783971337, + "learning_rate": 1.848286121852766e-07, + "loss": 0.2649, + "step": 14562 + }, + { + "epoch": 0.92, + "grad_norm": 1.679129399596893, + "learning_rate": 1.8455435602108328e-07, + "loss": 0.2649, + "step": 14563 + }, + { + "epoch": 0.92, + "grad_norm": 1.9345832226516606, + "learning_rate": 1.8428029965828265e-07, + "loss": 0.2578, + "step": 14564 + }, + { + "epoch": 0.92, + "grad_norm": 5.894186801501865, + "learning_rate": 1.8400644310824722e-07, + "loss": 0.2427, + "step": 14565 + }, + { + "epoch": 0.92, + "grad_norm": 3.265496929716585, + "learning_rate": 1.8373278638233848e-07, + "loss": 0.2575, + "step": 14566 + }, + { + "epoch": 0.92, + "grad_norm": 5.193911921550233, + "learning_rate": 1.834593294919107e-07, + "loss": 0.2396, + "step": 14567 + }, + { + "epoch": 0.92, + "grad_norm": 5.398539465303125, + "learning_rate": 1.8318607244831033e-07, + "loss": 0.2497, + "step": 14568 + }, + { + "epoch": 0.92, + "grad_norm": 4.081467693048924, + "learning_rate": 1.8291301526287609e-07, + "loss": 0.2617, + "step": 14569 + }, + { + "epoch": 0.92, + "grad_norm": 1.7714467938424623, + "learning_rate": 1.8264015794693668e-07, + "loss": 0.269, + "step": 14570 + }, + { + "epoch": 0.92, + "grad_norm": 1.8395494084039425, + "learning_rate": 1.8236750051181306e-07, + "loss": 0.249, + "step": 14571 + }, + { + "epoch": 0.92, + "grad_norm": 6.3142334681720875, + "learning_rate": 1.8209504296881896e-07, + "loss": 0.2662, + "step": 14572 + }, + { + "epoch": 0.92, + "grad_norm": 2.3435171338601832, + "learning_rate": 1.818227853292598e-07, + "loss": 0.2555, + "step": 14573 + }, + { + "epoch": 0.92, + "grad_norm": 3.7046417545149053, + "learning_rate": 1.8155072760443149e-07, + "loss": 0.2416, + "step": 14574 + }, + { + "epoch": 0.92, + "grad_norm": 1.7680273985641055, + "learning_rate": 1.812788698056217e-07, + "loss": 0.2514, + "step": 14575 + }, + { + "epoch": 0.92, + "grad_norm": 2.417788014275917, + "learning_rate": 1.810072119441103e-07, + "loss": 0.2207, + "step": 14576 + }, + { + "epoch": 0.92, + "grad_norm": 2.4467649856462055, + "learning_rate": 1.8073575403116938e-07, + "loss": 0.261, + "step": 14577 + }, + { + "epoch": 0.92, + "grad_norm": 1.5405873106651053, + "learning_rate": 1.804644960780616e-07, + "loss": 0.2691, + "step": 14578 + }, + { + "epoch": 0.92, + "grad_norm": 2.2291964329191467, + "learning_rate": 1.8019343809604295e-07, + "loss": 0.2568, + "step": 14579 + }, + { + "epoch": 0.92, + "grad_norm": 2.00943408163364, + "learning_rate": 1.7992258009635942e-07, + "loss": 0.2352, + "step": 14580 + }, + { + "epoch": 0.92, + "grad_norm": 2.2836406128994153, + "learning_rate": 1.7965192209024928e-07, + "loss": 0.2456, + "step": 14581 + }, + { + "epoch": 0.92, + "grad_norm": 1.832438125777982, + "learning_rate": 1.7938146408894296e-07, + "loss": 0.2412, + "step": 14582 + }, + { + "epoch": 0.92, + "grad_norm": 1.929832837409836, + "learning_rate": 1.7911120610366262e-07, + "loss": 0.2451, + "step": 14583 + }, + { + "epoch": 0.92, + "grad_norm": 1.6513242660518177, + "learning_rate": 1.7884114814561983e-07, + "loss": 0.2472, + "step": 14584 + }, + { + "epoch": 0.92, + "grad_norm": 3.1905309233386983, + "learning_rate": 1.7857129022602226e-07, + "loss": 0.2466, + "step": 14585 + }, + { + "epoch": 0.92, + "grad_norm": 1.4813312295210759, + "learning_rate": 1.783016323560649e-07, + "loss": 0.2347, + "step": 14586 + }, + { + "epoch": 0.92, + "grad_norm": 2.563174579858422, + "learning_rate": 1.7803217454693766e-07, + "loss": 0.2482, + "step": 14587 + }, + { + "epoch": 0.92, + "grad_norm": 1.9113023448252877, + "learning_rate": 1.7776291680981995e-07, + "loss": 0.2412, + "step": 14588 + }, + { + "epoch": 0.92, + "grad_norm": 2.2786178310528573, + "learning_rate": 1.7749385915588446e-07, + "loss": 0.2415, + "step": 14589 + }, + { + "epoch": 0.92, + "grad_norm": 29.05300778970683, + "learning_rate": 1.7722500159629452e-07, + "loss": 0.2653, + "step": 14590 + }, + { + "epoch": 0.92, + "grad_norm": 4.290002453105786, + "learning_rate": 1.769563441422051e-07, + "loss": 0.2697, + "step": 14591 + }, + { + "epoch": 0.92, + "grad_norm": 2.092070753675626, + "learning_rate": 1.7668788680476334e-07, + "loss": 0.2562, + "step": 14592 + }, + { + "epoch": 0.92, + "grad_norm": 2.3838436164833396, + "learning_rate": 1.7641962959510872e-07, + "loss": 0.252, + "step": 14593 + }, + { + "epoch": 0.92, + "grad_norm": 2.3092892132355134, + "learning_rate": 1.7615157252437121e-07, + "loss": 0.2522, + "step": 14594 + }, + { + "epoch": 0.92, + "grad_norm": 1.8603278509510393, + "learning_rate": 1.7588371560367357e-07, + "loss": 0.253, + "step": 14595 + }, + { + "epoch": 0.92, + "grad_norm": 1.3223387568769163, + "learning_rate": 1.7561605884412858e-07, + "loss": 0.2407, + "step": 14596 + }, + { + "epoch": 0.92, + "grad_norm": 1.5872890972249871, + "learning_rate": 1.7534860225684292e-07, + "loss": 0.2713, + "step": 14597 + }, + { + "epoch": 0.92, + "grad_norm": 4.67526615629178, + "learning_rate": 1.7508134585291325e-07, + "loss": 0.2395, + "step": 14598 + }, + { + "epoch": 0.92, + "grad_norm": 1.986689727135611, + "learning_rate": 1.748142896434285e-07, + "loss": 0.2468, + "step": 14599 + }, + { + "epoch": 0.92, + "grad_norm": 2.0775548200314744, + "learning_rate": 1.7454743363946868e-07, + "loss": 0.249, + "step": 14600 + }, + { + "epoch": 0.92, + "grad_norm": 2.308339774678808, + "learning_rate": 1.7428077785210663e-07, + "loss": 0.2538, + "step": 14601 + }, + { + "epoch": 0.92, + "grad_norm": 1.9148666821890743, + "learning_rate": 1.7401432229240733e-07, + "loss": 0.252, + "step": 14602 + }, + { + "epoch": 0.92, + "grad_norm": 2.2472774814567003, + "learning_rate": 1.7374806697142588e-07, + "loss": 0.2507, + "step": 14603 + }, + { + "epoch": 0.92, + "grad_norm": 2.1864405904788144, + "learning_rate": 1.7348201190020897e-07, + "loss": 0.2447, + "step": 14604 + }, + { + "epoch": 0.92, + "grad_norm": 2.1737520105448525, + "learning_rate": 1.7321615708979666e-07, + "loss": 0.2357, + "step": 14605 + }, + { + "epoch": 0.92, + "grad_norm": 1.8597862814083623, + "learning_rate": 1.729505025512196e-07, + "loss": 0.2456, + "step": 14606 + }, + { + "epoch": 0.92, + "grad_norm": 1.7164992487378634, + "learning_rate": 1.726850482954995e-07, + "loss": 0.2436, + "step": 14607 + }, + { + "epoch": 0.92, + "grad_norm": 6.306614413990062, + "learning_rate": 1.7241979433365086e-07, + "loss": 0.2708, + "step": 14608 + }, + { + "epoch": 0.92, + "grad_norm": 1.5303694058213413, + "learning_rate": 1.721547406766799e-07, + "loss": 0.2535, + "step": 14609 + }, + { + "epoch": 0.92, + "grad_norm": 1.5516930701767238, + "learning_rate": 1.7188988733558397e-07, + "loss": 0.2565, + "step": 14610 + }, + { + "epoch": 0.92, + "grad_norm": 2.152221530150172, + "learning_rate": 1.7162523432135258e-07, + "loss": 0.2453, + "step": 14611 + }, + { + "epoch": 0.92, + "grad_norm": 1.9981888361709716, + "learning_rate": 1.7136078164496585e-07, + "loss": 0.2567, + "step": 14612 + }, + { + "epoch": 0.92, + "grad_norm": 3.4227979480329496, + "learning_rate": 1.7109652931739717e-07, + "loss": 0.2439, + "step": 14613 + }, + { + "epoch": 0.92, + "grad_norm": 2.0908760732808362, + "learning_rate": 1.7083247734961117e-07, + "loss": 0.2493, + "step": 14614 + }, + { + "epoch": 0.92, + "grad_norm": 1.599814462824902, + "learning_rate": 1.705686257525624e-07, + "loss": 0.2726, + "step": 14615 + }, + { + "epoch": 0.92, + "grad_norm": 1.6584689323598625, + "learning_rate": 1.703049745371993e-07, + "loss": 0.2503, + "step": 14616 + }, + { + "epoch": 0.92, + "grad_norm": 4.167687084073127, + "learning_rate": 1.7004152371446146e-07, + "loss": 0.2372, + "step": 14617 + }, + { + "epoch": 0.92, + "grad_norm": 1.3676841305083616, + "learning_rate": 1.6977827329528017e-07, + "loss": 0.2458, + "step": 14618 + }, + { + "epoch": 0.92, + "grad_norm": 3.2506013133346543, + "learning_rate": 1.6951522329057778e-07, + "loss": 0.2397, + "step": 14619 + }, + { + "epoch": 0.92, + "grad_norm": 2.640926651918078, + "learning_rate": 1.6925237371126835e-07, + "loss": 0.2512, + "step": 14620 + }, + { + "epoch": 0.92, + "grad_norm": 2.0368443257180364, + "learning_rate": 1.689897245682587e-07, + "loss": 0.2287, + "step": 14621 + }, + { + "epoch": 0.92, + "grad_norm": 3.0133313326001674, + "learning_rate": 1.687272758724462e-07, + "loss": 0.2415, + "step": 14622 + }, + { + "epoch": 0.92, + "grad_norm": 2.4267649921275667, + "learning_rate": 1.6846502763471939e-07, + "loss": 0.2583, + "step": 14623 + }, + { + "epoch": 0.92, + "grad_norm": 1.3878008863385654, + "learning_rate": 1.682029798659618e-07, + "loss": 0.268, + "step": 14624 + }, + { + "epoch": 0.92, + "grad_norm": 1.4831645260225281, + "learning_rate": 1.679411325770436e-07, + "loss": 0.2292, + "step": 14625 + }, + { + "epoch": 0.92, + "grad_norm": 4.735729228860837, + "learning_rate": 1.6767948577883109e-07, + "loss": 0.2554, + "step": 14626 + }, + { + "epoch": 0.92, + "grad_norm": 2.0309999751510386, + "learning_rate": 1.6741803948218005e-07, + "loss": 0.2595, + "step": 14627 + }, + { + "epoch": 0.92, + "grad_norm": 3.356528461492822, + "learning_rate": 1.6715679369793737e-07, + "loss": 0.2595, + "step": 14628 + }, + { + "epoch": 0.92, + "grad_norm": 1.275932866756124, + "learning_rate": 1.6689574843694433e-07, + "loss": 0.261, + "step": 14629 + }, + { + "epoch": 0.92, + "grad_norm": 2.3970316663035125, + "learning_rate": 1.6663490371003122e-07, + "loss": 0.2604, + "step": 14630 + }, + { + "epoch": 0.92, + "grad_norm": 3.076284490200964, + "learning_rate": 1.663742595280199e-07, + "loss": 0.2542, + "step": 14631 + }, + { + "epoch": 0.92, + "grad_norm": 4.965953426197452, + "learning_rate": 1.661138159017267e-07, + "loss": 0.2699, + "step": 14632 + }, + { + "epoch": 0.92, + "grad_norm": 2.1067068405456113, + "learning_rate": 1.6585357284195748e-07, + "loss": 0.2421, + "step": 14633 + }, + { + "epoch": 0.92, + "grad_norm": 1.3118731101592522, + "learning_rate": 1.6559353035950964e-07, + "loss": 0.2449, + "step": 14634 + }, + { + "epoch": 0.92, + "grad_norm": 2.0456770023365625, + "learning_rate": 1.653336884651735e-07, + "loss": 0.2421, + "step": 14635 + }, + { + "epoch": 0.92, + "grad_norm": 2.2109111172764475, + "learning_rate": 1.650740471697293e-07, + "loss": 0.2572, + "step": 14636 + }, + { + "epoch": 0.92, + "grad_norm": 2.8522415018301728, + "learning_rate": 1.648146064839512e-07, + "loss": 0.2899, + "step": 14637 + }, + { + "epoch": 0.92, + "grad_norm": 2.0267974383100484, + "learning_rate": 1.6455536641860282e-07, + "loss": 0.2638, + "step": 14638 + }, + { + "epoch": 0.92, + "grad_norm": 2.106440965143541, + "learning_rate": 1.6429632698444165e-07, + "loss": 0.2435, + "step": 14639 + }, + { + "epoch": 0.92, + "grad_norm": 2.92420689986729, + "learning_rate": 1.6403748819221464e-07, + "loss": 0.2491, + "step": 14640 + }, + { + "epoch": 0.92, + "grad_norm": 1.6832401826812362, + "learning_rate": 1.6377885005266214e-07, + "loss": 0.246, + "step": 14641 + }, + { + "epoch": 0.92, + "grad_norm": 1.5134706548275112, + "learning_rate": 1.6352041257651496e-07, + "loss": 0.2451, + "step": 14642 + }, + { + "epoch": 0.92, + "grad_norm": 1.9732603558655655, + "learning_rate": 1.6326217577449677e-07, + "loss": 0.2621, + "step": 14643 + }, + { + "epoch": 0.92, + "grad_norm": 1.5648125019183925, + "learning_rate": 1.6300413965732176e-07, + "loss": 0.2573, + "step": 14644 + }, + { + "epoch": 0.92, + "grad_norm": 2.2224410022222307, + "learning_rate": 1.6274630423569582e-07, + "loss": 0.2426, + "step": 14645 + }, + { + "epoch": 0.92, + "grad_norm": 3.098652634617864, + "learning_rate": 1.6248866952031816e-07, + "loss": 0.2636, + "step": 14646 + }, + { + "epoch": 0.92, + "grad_norm": 1.9518050289061362, + "learning_rate": 1.622312355218786e-07, + "loss": 0.2593, + "step": 14647 + }, + { + "epoch": 0.92, + "grad_norm": 2.2932777745542174, + "learning_rate": 1.6197400225105686e-07, + "loss": 0.246, + "step": 14648 + }, + { + "epoch": 0.92, + "grad_norm": 1.716278503850628, + "learning_rate": 1.617169697185278e-07, + "loss": 0.2448, + "step": 14649 + }, + { + "epoch": 0.92, + "grad_norm": 3.036732537531056, + "learning_rate": 1.6146013793495564e-07, + "loss": 0.2515, + "step": 14650 + }, + { + "epoch": 0.92, + "grad_norm": 2.430620711198442, + "learning_rate": 1.612035069109963e-07, + "loss": 0.2367, + "step": 14651 + }, + { + "epoch": 0.92, + "grad_norm": 6.377390063577401, + "learning_rate": 1.6094707665729736e-07, + "loss": 0.2644, + "step": 14652 + }, + { + "epoch": 0.92, + "grad_norm": 2.6673442596094272, + "learning_rate": 1.6069084718449978e-07, + "loss": 0.2721, + "step": 14653 + }, + { + "epoch": 0.92, + "grad_norm": 1.7329225185422226, + "learning_rate": 1.6043481850323505e-07, + "loss": 0.2418, + "step": 14654 + }, + { + "epoch": 0.92, + "grad_norm": 2.859340110203497, + "learning_rate": 1.6017899062412578e-07, + "loss": 0.2399, + "step": 14655 + }, + { + "epoch": 0.92, + "grad_norm": 2.983389765371011, + "learning_rate": 1.5992336355778572e-07, + "loss": 0.2482, + "step": 14656 + }, + { + "epoch": 0.92, + "grad_norm": 2.257335331812706, + "learning_rate": 1.59667937314823e-07, + "loss": 0.2784, + "step": 14657 + }, + { + "epoch": 0.92, + "grad_norm": 1.519793464270759, + "learning_rate": 1.5941271190583473e-07, + "loss": 0.2484, + "step": 14658 + }, + { + "epoch": 0.92, + "grad_norm": 1.7246468564172275, + "learning_rate": 1.5915768734141135e-07, + "loss": 0.2494, + "step": 14659 + }, + { + "epoch": 0.92, + "grad_norm": 6.079357549788692, + "learning_rate": 1.5890286363213326e-07, + "loss": 0.2515, + "step": 14660 + }, + { + "epoch": 0.92, + "grad_norm": 1.3967370792960778, + "learning_rate": 1.5864824078857365e-07, + "loss": 0.2315, + "step": 14661 + }, + { + "epoch": 0.92, + "grad_norm": 1.742727083738558, + "learning_rate": 1.5839381882129856e-07, + "loss": 0.2474, + "step": 14662 + }, + { + "epoch": 0.92, + "grad_norm": 1.7703861623718138, + "learning_rate": 1.5813959774086396e-07, + "loss": 0.2495, + "step": 14663 + }, + { + "epoch": 0.92, + "grad_norm": 2.21980916951687, + "learning_rate": 1.5788557755781642e-07, + "loss": 0.2424, + "step": 14664 + }, + { + "epoch": 0.92, + "grad_norm": 6.670680642865481, + "learning_rate": 1.576317582826975e-07, + "loss": 0.244, + "step": 14665 + }, + { + "epoch": 0.92, + "grad_norm": 2.7967100125911086, + "learning_rate": 1.5737813992603822e-07, + "loss": 0.2396, + "step": 14666 + }, + { + "epoch": 0.92, + "grad_norm": 1.4662996903675816, + "learning_rate": 1.571247224983613e-07, + "loss": 0.2282, + "step": 14667 + }, + { + "epoch": 0.92, + "grad_norm": 2.131216940287285, + "learning_rate": 1.5687150601018053e-07, + "loss": 0.246, + "step": 14668 + }, + { + "epoch": 0.92, + "grad_norm": 2.264780672903606, + "learning_rate": 1.5661849047200361e-07, + "loss": 0.2483, + "step": 14669 + }, + { + "epoch": 0.92, + "grad_norm": 1.9630664181022202, + "learning_rate": 1.5636567589432939e-07, + "loss": 0.2606, + "step": 14670 + }, + { + "epoch": 0.92, + "grad_norm": 1.7084230757094758, + "learning_rate": 1.5611306228764612e-07, + "loss": 0.277, + "step": 14671 + }, + { + "epoch": 0.92, + "grad_norm": 1.6489836653289587, + "learning_rate": 1.5586064966243487e-07, + "loss": 0.2559, + "step": 14672 + }, + { + "epoch": 0.92, + "grad_norm": 1.3012312620610775, + "learning_rate": 1.5560843802917001e-07, + "loss": 0.2304, + "step": 14673 + }, + { + "epoch": 0.92, + "grad_norm": 1.5971566825111916, + "learning_rate": 1.5535642739831602e-07, + "loss": 0.2489, + "step": 14674 + }, + { + "epoch": 0.92, + "grad_norm": 2.060596037769323, + "learning_rate": 1.5510461778032836e-07, + "loss": 0.2475, + "step": 14675 + }, + { + "epoch": 0.92, + "grad_norm": 3.3823634922069425, + "learning_rate": 1.5485300918565593e-07, + "loss": 0.2478, + "step": 14676 + }, + { + "epoch": 0.92, + "grad_norm": 2.1116629273618397, + "learning_rate": 1.5460160162473757e-07, + "loss": 0.2436, + "step": 14677 + }, + { + "epoch": 0.92, + "grad_norm": 4.303742306352026, + "learning_rate": 1.5435039510800555e-07, + "loss": 0.2688, + "step": 14678 + }, + { + "epoch": 0.92, + "grad_norm": 0.5724092937533349, + "learning_rate": 1.5409938964588254e-07, + "loss": 0.4582, + "step": 14679 + }, + { + "epoch": 0.92, + "grad_norm": 1.3640143428741367, + "learning_rate": 1.538485852487831e-07, + "loss": 0.252, + "step": 14680 + }, + { + "epoch": 0.92, + "grad_norm": 7.24412176119006, + "learning_rate": 1.5359798192711383e-07, + "loss": 0.2668, + "step": 14681 + }, + { + "epoch": 0.92, + "grad_norm": 1.9384342811547017, + "learning_rate": 1.5334757969127202e-07, + "loss": 0.2374, + "step": 14682 + }, + { + "epoch": 0.92, + "grad_norm": 1.9555963481625724, + "learning_rate": 1.5309737855164763e-07, + "loss": 0.2467, + "step": 14683 + }, + { + "epoch": 0.92, + "grad_norm": 2.1158433271097215, + "learning_rate": 1.5284737851862296e-07, + "loss": 0.2479, + "step": 14684 + }, + { + "epoch": 0.92, + "grad_norm": 2.0871617596354586, + "learning_rate": 1.525975796025686e-07, + "loss": 0.2537, + "step": 14685 + }, + { + "epoch": 0.92, + "grad_norm": 1.6673814261868143, + "learning_rate": 1.523479818138518e-07, + "loss": 0.2444, + "step": 14686 + }, + { + "epoch": 0.92, + "grad_norm": 1.9984525737896035, + "learning_rate": 1.520985851628276e-07, + "loss": 0.2594, + "step": 14687 + }, + { + "epoch": 0.92, + "grad_norm": 5.968073224426047, + "learning_rate": 1.518493896598433e-07, + "loss": 0.2605, + "step": 14688 + }, + { + "epoch": 0.92, + "grad_norm": 3.527886828538164, + "learning_rate": 1.5160039531523952e-07, + "loss": 0.2522, + "step": 14689 + }, + { + "epoch": 0.92, + "grad_norm": 8.890379303935964, + "learning_rate": 1.5135160213934685e-07, + "loss": 0.2545, + "step": 14690 + }, + { + "epoch": 0.92, + "grad_norm": 1.7934664789798513, + "learning_rate": 1.5110301014248874e-07, + "loss": 0.2629, + "step": 14691 + }, + { + "epoch": 0.92, + "grad_norm": 2.08821812985751, + "learning_rate": 1.508546193349797e-07, + "loss": 0.2369, + "step": 14692 + }, + { + "epoch": 0.92, + "grad_norm": 2.761791256746903, + "learning_rate": 1.5060642972712425e-07, + "loss": 0.2438, + "step": 14693 + }, + { + "epoch": 0.92, + "grad_norm": 1.9194521860460347, + "learning_rate": 1.503584413292225e-07, + "loss": 0.2624, + "step": 14694 + }, + { + "epoch": 0.92, + "grad_norm": 3.392746401289107, + "learning_rate": 1.501106541515629e-07, + "loss": 0.2517, + "step": 14695 + }, + { + "epoch": 0.92, + "grad_norm": 1.9097213975952536, + "learning_rate": 1.4986306820442609e-07, + "loss": 0.2392, + "step": 14696 + }, + { + "epoch": 0.92, + "grad_norm": 4.627763148595492, + "learning_rate": 1.4961568349808607e-07, + "loss": 0.2517, + "step": 14697 + }, + { + "epoch": 0.92, + "grad_norm": 2.1211150282529068, + "learning_rate": 1.493685000428058e-07, + "loss": 0.258, + "step": 14698 + }, + { + "epoch": 0.92, + "grad_norm": 1.3680378321681013, + "learning_rate": 1.491215178488431e-07, + "loss": 0.2396, + "step": 14699 + }, + { + "epoch": 0.92, + "grad_norm": 3.6550561107235926, + "learning_rate": 1.4887473692644428e-07, + "loss": 0.2441, + "step": 14700 + }, + { + "epoch": 0.92, + "grad_norm": 3.03823116777284, + "learning_rate": 1.4862815728584888e-07, + "loss": 0.254, + "step": 14701 + }, + { + "epoch": 0.92, + "grad_norm": 10.08310411298709, + "learning_rate": 1.4838177893728878e-07, + "loss": 0.2368, + "step": 14702 + }, + { + "epoch": 0.92, + "grad_norm": 3.7705972847750595, + "learning_rate": 1.481356018909863e-07, + "loss": 0.2491, + "step": 14703 + }, + { + "epoch": 0.92, + "grad_norm": 5.236174680489363, + "learning_rate": 1.4788962615715498e-07, + "loss": 0.2484, + "step": 14704 + }, + { + "epoch": 0.92, + "grad_norm": 2.7189632644735378, + "learning_rate": 1.4764385174600104e-07, + "loss": 0.2314, + "step": 14705 + }, + { + "epoch": 0.92, + "grad_norm": 2.197762770919978, + "learning_rate": 1.4739827866772304e-07, + "loss": 0.2461, + "step": 14706 + }, + { + "epoch": 0.92, + "grad_norm": 2.039026192120943, + "learning_rate": 1.4715290693251006e-07, + "loss": 0.2515, + "step": 14707 + }, + { + "epoch": 0.92, + "grad_norm": 1.676748054690642, + "learning_rate": 1.4690773655054225e-07, + "loss": 0.2501, + "step": 14708 + }, + { + "epoch": 0.93, + "grad_norm": 2.409012596567277, + "learning_rate": 1.4666276753199204e-07, + "loss": 0.2634, + "step": 14709 + }, + { + "epoch": 0.93, + "grad_norm": 1.662885215623147, + "learning_rate": 1.4641799988702466e-07, + "loss": 0.2401, + "step": 14710 + }, + { + "epoch": 0.93, + "grad_norm": 2.6202459054772875, + "learning_rate": 1.4617343362579528e-07, + "loss": 0.2667, + "step": 14711 + }, + { + "epoch": 0.93, + "grad_norm": 1.967525754697255, + "learning_rate": 1.4592906875845137e-07, + "loss": 0.2438, + "step": 14712 + }, + { + "epoch": 0.93, + "grad_norm": 1.8626807965066814, + "learning_rate": 1.4568490529513203e-07, + "loss": 0.2446, + "step": 14713 + }, + { + "epoch": 0.93, + "grad_norm": 2.0173005214412774, + "learning_rate": 1.454409432459686e-07, + "loss": 0.2634, + "step": 14714 + }, + { + "epoch": 0.93, + "grad_norm": 1.8575748264284897, + "learning_rate": 1.4519718262108296e-07, + "loss": 0.2685, + "step": 14715 + }, + { + "epoch": 0.93, + "grad_norm": 3.4461683819404567, + "learning_rate": 1.4495362343058872e-07, + "loss": 0.2551, + "step": 14716 + }, + { + "epoch": 0.93, + "grad_norm": 1.1198459156949172, + "learning_rate": 1.4471026568459335e-07, + "loss": 0.2503, + "step": 14717 + }, + { + "epoch": 0.93, + "grad_norm": 2.660210048012057, + "learning_rate": 1.4446710939319263e-07, + "loss": 0.2433, + "step": 14718 + }, + { + "epoch": 0.93, + "grad_norm": 2.711256064625923, + "learning_rate": 1.4422415456647577e-07, + "loss": 0.2506, + "step": 14719 + }, + { + "epoch": 0.93, + "grad_norm": 3.548776361514621, + "learning_rate": 1.4398140121452297e-07, + "loss": 0.2609, + "step": 14720 + }, + { + "epoch": 0.93, + "grad_norm": 2.3296713450433755, + "learning_rate": 1.4373884934740733e-07, + "loss": 0.2477, + "step": 14721 + }, + { + "epoch": 0.93, + "grad_norm": 1.6352457523620478, + "learning_rate": 1.43496498975193e-07, + "loss": 0.2445, + "step": 14722 + }, + { + "epoch": 0.93, + "grad_norm": 1.5220597702437866, + "learning_rate": 1.432543501079353e-07, + "loss": 0.2491, + "step": 14723 + }, + { + "epoch": 0.93, + "grad_norm": 2.929715711269501, + "learning_rate": 1.4301240275568006e-07, + "loss": 0.2342, + "step": 14724 + }, + { + "epoch": 0.93, + "grad_norm": 3.6164918458979822, + "learning_rate": 1.4277065692846815e-07, + "loss": 0.2332, + "step": 14725 + }, + { + "epoch": 0.93, + "grad_norm": 1.6748028845409968, + "learning_rate": 1.425291126363293e-07, + "loss": 0.2513, + "step": 14726 + }, + { + "epoch": 0.93, + "grad_norm": 2.2800775503316286, + "learning_rate": 1.422877698892844e-07, + "loss": 0.2614, + "step": 14727 + }, + { + "epoch": 0.93, + "grad_norm": 1.630118151963712, + "learning_rate": 1.420466286973493e-07, + "loss": 0.228, + "step": 14728 + }, + { + "epoch": 0.93, + "grad_norm": 1.7986034412082135, + "learning_rate": 1.4180568907052717e-07, + "loss": 0.2692, + "step": 14729 + }, + { + "epoch": 0.93, + "grad_norm": 6.117392210899359, + "learning_rate": 1.4156495101881662e-07, + "loss": 0.2381, + "step": 14730 + }, + { + "epoch": 0.93, + "grad_norm": 12.937335080838224, + "learning_rate": 1.4132441455220637e-07, + "loss": 0.2561, + "step": 14731 + }, + { + "epoch": 0.93, + "grad_norm": 2.191753514025765, + "learning_rate": 1.4108407968067506e-07, + "loss": 0.2523, + "step": 14732 + }, + { + "epoch": 0.93, + "grad_norm": 1.810721605975489, + "learning_rate": 1.4084394641419641e-07, + "loss": 0.2494, + "step": 14733 + }, + { + "epoch": 0.93, + "grad_norm": 1.8148423028706926, + "learning_rate": 1.4060401476273356e-07, + "loss": 0.2674, + "step": 14734 + }, + { + "epoch": 0.93, + "grad_norm": 7.572950294863925, + "learning_rate": 1.403642847362402e-07, + "loss": 0.2552, + "step": 14735 + }, + { + "epoch": 0.93, + "grad_norm": 1.585125551063138, + "learning_rate": 1.401247563446656e-07, + "loss": 0.2495, + "step": 14736 + }, + { + "epoch": 0.93, + "grad_norm": 3.390141227624101, + "learning_rate": 1.3988542959794627e-07, + "loss": 0.2582, + "step": 14737 + }, + { + "epoch": 0.93, + "grad_norm": 2.6615242836036774, + "learning_rate": 1.3964630450601314e-07, + "loss": 0.2558, + "step": 14738 + }, + { + "epoch": 0.93, + "grad_norm": 1.8693471718371781, + "learning_rate": 1.3940738107878826e-07, + "loss": 0.253, + "step": 14739 + }, + { + "epoch": 0.93, + "grad_norm": 3.1627254114858387, + "learning_rate": 1.3916865932618373e-07, + "loss": 0.2313, + "step": 14740 + }, + { + "epoch": 0.93, + "grad_norm": 2.4294468952066604, + "learning_rate": 1.389301392581055e-07, + "loss": 0.243, + "step": 14741 + }, + { + "epoch": 0.93, + "grad_norm": 2.5798209152898646, + "learning_rate": 1.386918208844501e-07, + "loss": 0.2698, + "step": 14742 + }, + { + "epoch": 0.93, + "grad_norm": 2.345631241114109, + "learning_rate": 1.3845370421510573e-07, + "loss": 0.2437, + "step": 14743 + }, + { + "epoch": 0.93, + "grad_norm": 2.3598763642601126, + "learning_rate": 1.3821578925995282e-07, + "loss": 0.2586, + "step": 14744 + }, + { + "epoch": 0.93, + "grad_norm": 1.4424637281151522, + "learning_rate": 1.3797807602886128e-07, + "loss": 0.2349, + "step": 14745 + }, + { + "epoch": 0.93, + "grad_norm": 5.6177531521316855, + "learning_rate": 1.3774056453169647e-07, + "loss": 0.249, + "step": 14746 + }, + { + "epoch": 0.93, + "grad_norm": 1.7221479334460554, + "learning_rate": 1.3750325477831116e-07, + "loss": 0.2579, + "step": 14747 + }, + { + "epoch": 0.93, + "grad_norm": 2.1935476166031673, + "learning_rate": 1.3726614677855243e-07, + "loss": 0.2647, + "step": 14748 + }, + { + "epoch": 0.93, + "grad_norm": 2.301822660137684, + "learning_rate": 1.3702924054225908e-07, + "loss": 0.2598, + "step": 14749 + }, + { + "epoch": 0.93, + "grad_norm": 1.8087374879222473, + "learning_rate": 1.367925360792599e-07, + "loss": 0.2465, + "step": 14750 + }, + { + "epoch": 0.93, + "grad_norm": 6.626035391338491, + "learning_rate": 1.3655603339937652e-07, + "loss": 0.2419, + "step": 14751 + }, + { + "epoch": 0.93, + "grad_norm": 2.5735250489342802, + "learning_rate": 1.3631973251242158e-07, + "loss": 0.2521, + "step": 14752 + }, + { + "epoch": 0.93, + "grad_norm": 2.5395385324028603, + "learning_rate": 1.3608363342819952e-07, + "loss": 0.2329, + "step": 14753 + }, + { + "epoch": 0.93, + "grad_norm": 1.5198408950056004, + "learning_rate": 1.3584773615650748e-07, + "loss": 0.256, + "step": 14754 + }, + { + "epoch": 0.93, + "grad_norm": 3.9608517463716955, + "learning_rate": 1.3561204070713264e-07, + "loss": 0.2576, + "step": 14755 + }, + { + "epoch": 0.93, + "grad_norm": 2.0182165284492783, + "learning_rate": 1.3537654708985326e-07, + "loss": 0.2557, + "step": 14756 + }, + { + "epoch": 0.93, + "grad_norm": 1.6962218711990547, + "learning_rate": 1.351412553144421e-07, + "loss": 0.2469, + "step": 14757 + }, + { + "epoch": 0.93, + "grad_norm": 1.8662322458064182, + "learning_rate": 1.3490616539066138e-07, + "loss": 0.2358, + "step": 14758 + }, + { + "epoch": 0.93, + "grad_norm": 2.1189782158984234, + "learning_rate": 1.3467127732826545e-07, + "loss": 0.2495, + "step": 14759 + }, + { + "epoch": 0.93, + "grad_norm": 1.8511935868137062, + "learning_rate": 1.3443659113699993e-07, + "loss": 0.2464, + "step": 14760 + }, + { + "epoch": 0.93, + "grad_norm": 4.088782987727659, + "learning_rate": 1.3420210682660194e-07, + "loss": 0.2713, + "step": 14761 + }, + { + "epoch": 0.93, + "grad_norm": 4.7116420973206825, + "learning_rate": 1.3396782440680157e-07, + "loss": 0.2399, + "step": 14762 + }, + { + "epoch": 0.93, + "grad_norm": 2.3910080932062456, + "learning_rate": 1.337337438873193e-07, + "loss": 0.2592, + "step": 14763 + }, + { + "epoch": 0.93, + "grad_norm": 1.8280881242317046, + "learning_rate": 1.3349986527786684e-07, + "loss": 0.2558, + "step": 14764 + }, + { + "epoch": 0.93, + "grad_norm": 4.283105581231077, + "learning_rate": 1.332661885881492e-07, + "loss": 0.2803, + "step": 14765 + }, + { + "epoch": 0.93, + "grad_norm": 2.2888267676262974, + "learning_rate": 1.3303271382786199e-07, + "loss": 0.2386, + "step": 14766 + }, + { + "epoch": 0.93, + "grad_norm": 1.3866444593266483, + "learning_rate": 1.3279944100669183e-07, + "loss": 0.2534, + "step": 14767 + }, + { + "epoch": 0.93, + "grad_norm": 2.0197531724971145, + "learning_rate": 1.3256637013431827e-07, + "loss": 0.2561, + "step": 14768 + }, + { + "epoch": 0.93, + "grad_norm": 3.3216086750641067, + "learning_rate": 1.3233350122041133e-07, + "loss": 0.2549, + "step": 14769 + }, + { + "epoch": 0.93, + "grad_norm": 2.8146968874953524, + "learning_rate": 1.3210083427463326e-07, + "loss": 0.2606, + "step": 14770 + }, + { + "epoch": 0.93, + "grad_norm": 1.782544295627537, + "learning_rate": 1.318683693066386e-07, + "loss": 0.2327, + "step": 14771 + }, + { + "epoch": 0.93, + "grad_norm": 2.2115745025861973, + "learning_rate": 1.3163610632607128e-07, + "loss": 0.2511, + "step": 14772 + }, + { + "epoch": 0.93, + "grad_norm": 4.529932750224458, + "learning_rate": 1.3140404534256912e-07, + "loss": 0.2469, + "step": 14773 + }, + { + "epoch": 0.93, + "grad_norm": 5.592638550689956, + "learning_rate": 1.3117218636576113e-07, + "loss": 0.2387, + "step": 14774 + }, + { + "epoch": 0.93, + "grad_norm": 2.4553856056557426, + "learning_rate": 1.3094052940526735e-07, + "loss": 0.2684, + "step": 14775 + }, + { + "epoch": 0.93, + "grad_norm": 1.79622933342868, + "learning_rate": 1.3070907447069903e-07, + "loss": 0.2478, + "step": 14776 + }, + { + "epoch": 0.93, + "grad_norm": 2.23924406557907, + "learning_rate": 1.3047782157166013e-07, + "loss": 0.274, + "step": 14777 + }, + { + "epoch": 0.93, + "grad_norm": 4.051012920838647, + "learning_rate": 1.3024677071774573e-07, + "loss": 0.2337, + "step": 14778 + }, + { + "epoch": 0.93, + "grad_norm": 2.4814995187614595, + "learning_rate": 1.3001592191854261e-07, + "loss": 0.2735, + "step": 14779 + }, + { + "epoch": 0.93, + "grad_norm": 1.8391784390693475, + "learning_rate": 1.2978527518362815e-07, + "loss": 0.2427, + "step": 14780 + }, + { + "epoch": 0.93, + "grad_norm": 0.613378542900168, + "learning_rate": 1.2955483052257355e-07, + "loss": 0.4245, + "step": 14781 + }, + { + "epoch": 0.93, + "grad_norm": 2.30056942275997, + "learning_rate": 1.2932458794494007e-07, + "loss": 0.2374, + "step": 14782 + }, + { + "epoch": 0.93, + "grad_norm": 1.7433319544034398, + "learning_rate": 1.2909454746028062e-07, + "loss": 0.2659, + "step": 14783 + }, + { + "epoch": 0.93, + "grad_norm": 3.3767453602211006, + "learning_rate": 1.288647090781403e-07, + "loss": 0.2436, + "step": 14784 + }, + { + "epoch": 0.93, + "grad_norm": 1.9230309022735703, + "learning_rate": 1.2863507280805488e-07, + "loss": 0.255, + "step": 14785 + }, + { + "epoch": 0.93, + "grad_norm": 1.2803826526183115, + "learning_rate": 1.2840563865955335e-07, + "loss": 0.2485, + "step": 14786 + }, + { + "epoch": 0.93, + "grad_norm": 2.3656651932396824, + "learning_rate": 1.2817640664215425e-07, + "loss": 0.2539, + "step": 14787 + }, + { + "epoch": 0.93, + "grad_norm": 2.059815424855098, + "learning_rate": 1.2794737676536993e-07, + "loss": 0.2443, + "step": 14788 + }, + { + "epoch": 0.93, + "grad_norm": 2.9648153798521695, + "learning_rate": 1.277185490387023e-07, + "loss": 0.2516, + "step": 14789 + }, + { + "epoch": 0.93, + "grad_norm": 2.101183941336599, + "learning_rate": 1.2748992347164647e-07, + "loss": 0.2503, + "step": 14790 + }, + { + "epoch": 0.93, + "grad_norm": 1.9660790907228658, + "learning_rate": 1.272615000736882e-07, + "loss": 0.2535, + "step": 14791 + }, + { + "epoch": 0.93, + "grad_norm": 1.3704224188798575, + "learning_rate": 1.270332788543055e-07, + "loss": 0.2598, + "step": 14792 + }, + { + "epoch": 0.93, + "grad_norm": 1.4233464489865375, + "learning_rate": 1.2680525982296688e-07, + "loss": 0.2513, + "step": 14793 + }, + { + "epoch": 0.93, + "grad_norm": 1.5876366069095593, + "learning_rate": 1.265774429891342e-07, + "loss": 0.2374, + "step": 14794 + }, + { + "epoch": 0.93, + "grad_norm": 3.1348939968086453, + "learning_rate": 1.2634982836225884e-07, + "loss": 0.244, + "step": 14795 + }, + { + "epoch": 0.93, + "grad_norm": 2.1557460702912743, + "learning_rate": 1.2612241595178654e-07, + "loss": 0.2486, + "step": 14796 + }, + { + "epoch": 0.93, + "grad_norm": 2.403929852882607, + "learning_rate": 1.2589520576715142e-07, + "loss": 0.2632, + "step": 14797 + }, + { + "epoch": 0.93, + "grad_norm": 3.6775150488385586, + "learning_rate": 1.256681978177826e-07, + "loss": 0.2869, + "step": 14798 + }, + { + "epoch": 0.93, + "grad_norm": 2.891385801479705, + "learning_rate": 1.2544139211309757e-07, + "loss": 0.2361, + "step": 14799 + }, + { + "epoch": 0.93, + "grad_norm": 2.3701499166962425, + "learning_rate": 1.2521478866250714e-07, + "loss": 0.2386, + "step": 14800 + }, + { + "epoch": 0.93, + "grad_norm": 1.7063263012705459, + "learning_rate": 1.2498838747541374e-07, + "loss": 0.2405, + "step": 14801 + }, + { + "epoch": 0.93, + "grad_norm": 1.5580683549274503, + "learning_rate": 1.2476218856121103e-07, + "loss": 0.238, + "step": 14802 + }, + { + "epoch": 0.93, + "grad_norm": 2.144137443763496, + "learning_rate": 1.245361919292848e-07, + "loss": 0.2485, + "step": 14803 + }, + { + "epoch": 0.93, + "grad_norm": 5.354295341876992, + "learning_rate": 1.2431039758901198e-07, + "loss": 0.239, + "step": 14804 + }, + { + "epoch": 0.93, + "grad_norm": 1.679861511521898, + "learning_rate": 1.2408480554976067e-07, + "loss": 0.2415, + "step": 14805 + }, + { + "epoch": 0.93, + "grad_norm": 2.945784493613841, + "learning_rate": 1.2385941582089168e-07, + "loss": 0.2479, + "step": 14806 + }, + { + "epoch": 0.93, + "grad_norm": 4.563666531930614, + "learning_rate": 1.2363422841175644e-07, + "loss": 0.2722, + "step": 14807 + }, + { + "epoch": 0.93, + "grad_norm": 1.9128993895990256, + "learning_rate": 1.2340924333169802e-07, + "loss": 0.244, + "step": 14808 + }, + { + "epoch": 0.93, + "grad_norm": 2.144993451579595, + "learning_rate": 1.2318446059005173e-07, + "loss": 0.2555, + "step": 14809 + }, + { + "epoch": 0.93, + "grad_norm": 2.279521249213373, + "learning_rate": 1.2295988019614514e-07, + "loss": 0.2424, + "step": 14810 + }, + { + "epoch": 0.93, + "grad_norm": 2.2654319812179002, + "learning_rate": 1.227355021592952e-07, + "loss": 0.242, + "step": 14811 + }, + { + "epoch": 0.93, + "grad_norm": 5.042725802533616, + "learning_rate": 1.225113264888128e-07, + "loss": 0.2491, + "step": 14812 + }, + { + "epoch": 0.93, + "grad_norm": 1.8868224572855978, + "learning_rate": 1.222873531939983e-07, + "loss": 0.2765, + "step": 14813 + }, + { + "epoch": 0.93, + "grad_norm": 3.576975161629902, + "learning_rate": 1.2206358228414593e-07, + "loss": 0.2748, + "step": 14814 + }, + { + "epoch": 0.93, + "grad_norm": 2.581179031064587, + "learning_rate": 1.218400137685394e-07, + "loss": 0.2444, + "step": 14815 + }, + { + "epoch": 0.93, + "grad_norm": 2.887526890406968, + "learning_rate": 1.2161664765645454e-07, + "loss": 0.2552, + "step": 14816 + }, + { + "epoch": 0.93, + "grad_norm": 1.500305220643014, + "learning_rate": 1.2139348395716066e-07, + "loss": 0.2606, + "step": 14817 + }, + { + "epoch": 0.93, + "grad_norm": 1.79615867270646, + "learning_rate": 1.2117052267991648e-07, + "loss": 0.2489, + "step": 14818 + }, + { + "epoch": 0.93, + "grad_norm": 0.6042147400663841, + "learning_rate": 1.2094776383397344e-07, + "loss": 0.4384, + "step": 14819 + }, + { + "epoch": 0.93, + "grad_norm": 1.5319324540550467, + "learning_rate": 1.2072520742857363e-07, + "loss": 0.2571, + "step": 14820 + }, + { + "epoch": 0.93, + "grad_norm": 1.9823282127894013, + "learning_rate": 1.2050285347295077e-07, + "loss": 0.2385, + "step": 14821 + }, + { + "epoch": 0.93, + "grad_norm": 2.0395177496762456, + "learning_rate": 1.2028070197633246e-07, + "loss": 0.2582, + "step": 14822 + }, + { + "epoch": 0.93, + "grad_norm": 2.045153436724005, + "learning_rate": 1.2005875294793522e-07, + "loss": 0.2829, + "step": 14823 + }, + { + "epoch": 0.93, + "grad_norm": 2.798874262521781, + "learning_rate": 1.1983700639696727e-07, + "loss": 0.243, + "step": 14824 + }, + { + "epoch": 0.93, + "grad_norm": 1.5626622951190008, + "learning_rate": 1.1961546233263011e-07, + "loss": 0.2233, + "step": 14825 + }, + { + "epoch": 0.93, + "grad_norm": 6.677377284479592, + "learning_rate": 1.193941207641164e-07, + "loss": 0.2554, + "step": 14826 + }, + { + "epoch": 0.93, + "grad_norm": 1.9631409156833521, + "learning_rate": 1.1917298170060987e-07, + "loss": 0.273, + "step": 14827 + }, + { + "epoch": 0.93, + "grad_norm": 21.48583609118683, + "learning_rate": 1.1895204515128544e-07, + "loss": 0.2649, + "step": 14828 + }, + { + "epoch": 0.93, + "grad_norm": 3.277225363412385, + "learning_rate": 1.1873131112530967e-07, + "loss": 0.2658, + "step": 14829 + }, + { + "epoch": 0.93, + "grad_norm": 2.1121952784711446, + "learning_rate": 1.1851077963184243e-07, + "loss": 0.2496, + "step": 14830 + }, + { + "epoch": 0.93, + "grad_norm": 1.5936972522637975, + "learning_rate": 1.1829045068003364e-07, + "loss": 0.2751, + "step": 14831 + }, + { + "epoch": 0.93, + "grad_norm": 2.806239788123379, + "learning_rate": 1.1807032427902488e-07, + "loss": 0.2448, + "step": 14832 + }, + { + "epoch": 0.93, + "grad_norm": 1.668576839612423, + "learning_rate": 1.1785040043794882e-07, + "loss": 0.2567, + "step": 14833 + }, + { + "epoch": 0.93, + "grad_norm": 1.5403410715242014, + "learning_rate": 1.1763067916593263e-07, + "loss": 0.2494, + "step": 14834 + }, + { + "epoch": 0.93, + "grad_norm": 2.1104326000099825, + "learning_rate": 1.1741116047209067e-07, + "loss": 0.2458, + "step": 14835 + }, + { + "epoch": 0.93, + "grad_norm": 2.390632507976463, + "learning_rate": 1.1719184436553288e-07, + "loss": 0.2611, + "step": 14836 + }, + { + "epoch": 0.93, + "grad_norm": 2.223425231824431, + "learning_rate": 1.1697273085535755e-07, + "loss": 0.2453, + "step": 14837 + }, + { + "epoch": 0.93, + "grad_norm": 1.835627269380618, + "learning_rate": 1.1675381995065738e-07, + "loss": 0.2377, + "step": 14838 + }, + { + "epoch": 0.93, + "grad_norm": 5.252125941938038, + "learning_rate": 1.1653511166051457e-07, + "loss": 0.2505, + "step": 14839 + }, + { + "epoch": 0.93, + "grad_norm": 1.7293188119700607, + "learning_rate": 1.1631660599400407e-07, + "loss": 0.2301, + "step": 14840 + }, + { + "epoch": 0.93, + "grad_norm": 1.947457346600411, + "learning_rate": 1.1609830296019142e-07, + "loss": 0.2451, + "step": 14841 + }, + { + "epoch": 0.93, + "grad_norm": 4.380286439795994, + "learning_rate": 1.1588020256813604e-07, + "loss": 0.2607, + "step": 14842 + }, + { + "epoch": 0.93, + "grad_norm": 1.6688985718457132, + "learning_rate": 1.1566230482688567e-07, + "loss": 0.2476, + "step": 14843 + }, + { + "epoch": 0.93, + "grad_norm": 1.491500712169069, + "learning_rate": 1.1544460974548199e-07, + "loss": 0.2514, + "step": 14844 + }, + { + "epoch": 0.93, + "grad_norm": 1.8003044903232, + "learning_rate": 1.1522711733295722e-07, + "loss": 0.252, + "step": 14845 + }, + { + "epoch": 0.93, + "grad_norm": 1.9114935174019365, + "learning_rate": 1.1500982759833579e-07, + "loss": 0.2585, + "step": 14846 + }, + { + "epoch": 0.93, + "grad_norm": 2.4443451893432773, + "learning_rate": 1.1479274055063327e-07, + "loss": 0.2583, + "step": 14847 + }, + { + "epoch": 0.93, + "grad_norm": 1.937288000014147, + "learning_rate": 1.1457585619885692e-07, + "loss": 0.2452, + "step": 14848 + }, + { + "epoch": 0.93, + "grad_norm": 1.854551755026898, + "learning_rate": 1.1435917455200562e-07, + "loss": 0.2517, + "step": 14849 + }, + { + "epoch": 0.93, + "grad_norm": 1.9920062840874568, + "learning_rate": 1.1414269561907054e-07, + "loss": 0.2611, + "step": 14850 + }, + { + "epoch": 0.93, + "grad_norm": 3.664366102243156, + "learning_rate": 1.1392641940903337e-07, + "loss": 0.2729, + "step": 14851 + }, + { + "epoch": 0.93, + "grad_norm": 2.3581522728478457, + "learning_rate": 1.1371034593086749e-07, + "loss": 0.2433, + "step": 14852 + }, + { + "epoch": 0.93, + "grad_norm": 2.339598302772925, + "learning_rate": 1.1349447519353796e-07, + "loss": 0.2434, + "step": 14853 + }, + { + "epoch": 0.93, + "grad_norm": 2.3366893826110973, + "learning_rate": 1.1327880720600205e-07, + "loss": 0.2384, + "step": 14854 + }, + { + "epoch": 0.93, + "grad_norm": 0.6284451781576239, + "learning_rate": 1.1306334197720925e-07, + "loss": 0.4828, + "step": 14855 + }, + { + "epoch": 0.93, + "grad_norm": 1.561425902150267, + "learning_rate": 1.12848079516098e-07, + "loss": 0.2665, + "step": 14856 + }, + { + "epoch": 0.93, + "grad_norm": 4.023058133647233, + "learning_rate": 1.1263301983160002e-07, + "loss": 0.2497, + "step": 14857 + }, + { + "epoch": 0.93, + "grad_norm": 1.916481387111643, + "learning_rate": 1.124181629326393e-07, + "loss": 0.2697, + "step": 14858 + }, + { + "epoch": 0.93, + "grad_norm": 2.4385159119139947, + "learning_rate": 1.1220350882813091e-07, + "loss": 0.2673, + "step": 14859 + }, + { + "epoch": 0.93, + "grad_norm": 8.651392137228159, + "learning_rate": 1.1198905752697997e-07, + "loss": 0.2486, + "step": 14860 + }, + { + "epoch": 0.93, + "grad_norm": 2.9391530254969616, + "learning_rate": 1.117748090380849e-07, + "loss": 0.2568, + "step": 14861 + }, + { + "epoch": 0.93, + "grad_norm": 1.8450702886107342, + "learning_rate": 1.1156076337033583e-07, + "loss": 0.2385, + "step": 14862 + }, + { + "epoch": 0.93, + "grad_norm": 1.6987128208037827, + "learning_rate": 1.1134692053261343e-07, + "loss": 0.2416, + "step": 14863 + }, + { + "epoch": 0.93, + "grad_norm": 1.674250944224833, + "learning_rate": 1.1113328053379113e-07, + "loss": 0.2634, + "step": 14864 + }, + { + "epoch": 0.93, + "grad_norm": 2.0697943603454942, + "learning_rate": 1.1091984338273188e-07, + "loss": 0.2233, + "step": 14865 + }, + { + "epoch": 0.93, + "grad_norm": 1.8118060020421574, + "learning_rate": 1.1070660908829245e-07, + "loss": 0.2265, + "step": 14866 + }, + { + "epoch": 0.93, + "grad_norm": 1.651358541009252, + "learning_rate": 1.1049357765932078e-07, + "loss": 0.2519, + "step": 14867 + }, + { + "epoch": 0.94, + "grad_norm": 1.8171109253631073, + "learning_rate": 1.1028074910465481e-07, + "loss": 0.2572, + "step": 14868 + }, + { + "epoch": 0.94, + "grad_norm": 3.3117779403954093, + "learning_rate": 1.1006812343312467e-07, + "loss": 0.2469, + "step": 14869 + }, + { + "epoch": 0.94, + "grad_norm": 0.5892931590116841, + "learning_rate": 1.09855700653555e-07, + "loss": 0.4931, + "step": 14870 + }, + { + "epoch": 0.94, + "grad_norm": 2.5588480046248643, + "learning_rate": 1.0964348077475817e-07, + "loss": 0.2355, + "step": 14871 + }, + { + "epoch": 0.94, + "grad_norm": 2.4196301254817536, + "learning_rate": 1.0943146380553938e-07, + "loss": 0.257, + "step": 14872 + }, + { + "epoch": 0.94, + "grad_norm": 2.6697389354697143, + "learning_rate": 1.092196497546949e-07, + "loss": 0.2584, + "step": 14873 + }, + { + "epoch": 0.94, + "grad_norm": 1.2976118335943434, + "learning_rate": 1.0900803863101494e-07, + "loss": 0.2447, + "step": 14874 + }, + { + "epoch": 0.94, + "grad_norm": 0.5761961831633281, + "learning_rate": 1.0879663044327915e-07, + "loss": 0.4653, + "step": 14875 + }, + { + "epoch": 0.94, + "grad_norm": 1.8734145186457203, + "learning_rate": 1.0858542520025828e-07, + "loss": 0.2423, + "step": 14876 + }, + { + "epoch": 0.94, + "grad_norm": 2.153025758468859, + "learning_rate": 1.0837442291071587e-07, + "loss": 0.2702, + "step": 14877 + }, + { + "epoch": 0.94, + "grad_norm": 2.889608873689613, + "learning_rate": 1.0816362358340826e-07, + "loss": 0.2807, + "step": 14878 + }, + { + "epoch": 0.94, + "grad_norm": 1.5377182230887265, + "learning_rate": 1.0795302722708012e-07, + "loss": 0.2508, + "step": 14879 + }, + { + "epoch": 0.94, + "grad_norm": 2.1302354393816434, + "learning_rate": 1.0774263385047057e-07, + "loss": 0.2591, + "step": 14880 + }, + { + "epoch": 0.94, + "grad_norm": 2.1321161995180526, + "learning_rate": 1.0753244346230818e-07, + "loss": 0.2572, + "step": 14881 + }, + { + "epoch": 0.94, + "grad_norm": 2.977443119313658, + "learning_rate": 1.0732245607131542e-07, + "loss": 0.264, + "step": 14882 + }, + { + "epoch": 0.94, + "grad_norm": 2.755625879920134, + "learning_rate": 1.071126716862042e-07, + "loss": 0.2435, + "step": 14883 + }, + { + "epoch": 0.94, + "grad_norm": 2.2010017414711407, + "learning_rate": 1.0690309031567814e-07, + "loss": 0.2478, + "step": 14884 + }, + { + "epoch": 0.94, + "grad_norm": 2.1106095110096064, + "learning_rate": 1.066937119684347e-07, + "loss": 0.2456, + "step": 14885 + }, + { + "epoch": 0.94, + "grad_norm": 2.1943699278626636, + "learning_rate": 1.0648453665316028e-07, + "loss": 0.2483, + "step": 14886 + }, + { + "epoch": 0.94, + "grad_norm": 1.5251210715085795, + "learning_rate": 1.0627556437853459e-07, + "loss": 0.2629, + "step": 14887 + }, + { + "epoch": 0.94, + "grad_norm": 1.7813309320362822, + "learning_rate": 1.0606679515322849e-07, + "loss": 0.2611, + "step": 14888 + }, + { + "epoch": 0.94, + "grad_norm": 1.591465764685345, + "learning_rate": 1.0585822898590281e-07, + "loss": 0.2479, + "step": 14889 + }, + { + "epoch": 0.94, + "grad_norm": 1.6430687983850112, + "learning_rate": 1.056498658852123e-07, + "loss": 0.2616, + "step": 14890 + }, + { + "epoch": 0.94, + "grad_norm": 1.6878018452247103, + "learning_rate": 1.0544170585980229e-07, + "loss": 0.2474, + "step": 14891 + }, + { + "epoch": 0.94, + "grad_norm": 2.3871371505976424, + "learning_rate": 1.0523374891830972e-07, + "loss": 0.237, + "step": 14892 + }, + { + "epoch": 0.94, + "grad_norm": 5.598496951092441, + "learning_rate": 1.0502599506936273e-07, + "loss": 0.2536, + "step": 14893 + }, + { + "epoch": 0.94, + "grad_norm": 1.4978143617102027, + "learning_rate": 1.0481844432158162e-07, + "loss": 0.2616, + "step": 14894 + }, + { + "epoch": 0.94, + "grad_norm": 1.9994766603064726, + "learning_rate": 1.0461109668357894e-07, + "loss": 0.2497, + "step": 14895 + }, + { + "epoch": 0.94, + "grad_norm": 1.6608440324327385, + "learning_rate": 1.0440395216395616e-07, + "loss": 0.2401, + "step": 14896 + }, + { + "epoch": 0.94, + "grad_norm": 3.151947723333777, + "learning_rate": 1.0419701077130917e-07, + "loss": 0.2757, + "step": 14897 + }, + { + "epoch": 0.94, + "grad_norm": 2.112650879500353, + "learning_rate": 1.0399027251422444e-07, + "loss": 0.2534, + "step": 14898 + }, + { + "epoch": 0.94, + "grad_norm": 2.759699635665989, + "learning_rate": 1.0378373740127901e-07, + "loss": 0.2596, + "step": 14899 + }, + { + "epoch": 0.94, + "grad_norm": 2.6249261058165243, + "learning_rate": 1.0357740544104323e-07, + "loss": 0.2532, + "step": 14900 + }, + { + "epoch": 0.94, + "grad_norm": 2.1306714110980276, + "learning_rate": 1.0337127664207803e-07, + "loss": 0.2454, + "step": 14901 + }, + { + "epoch": 0.94, + "grad_norm": 2.653563634502309, + "learning_rate": 1.0316535101293601e-07, + "loss": 0.2685, + "step": 14902 + }, + { + "epoch": 0.94, + "grad_norm": 2.025075574024474, + "learning_rate": 1.0295962856216146e-07, + "loss": 0.2491, + "step": 14903 + }, + { + "epoch": 0.94, + "grad_norm": 1.9884337748471876, + "learning_rate": 1.0275410929828977e-07, + "loss": 0.2661, + "step": 14904 + }, + { + "epoch": 0.94, + "grad_norm": 2.6602176710280725, + "learning_rate": 1.0254879322984856e-07, + "loss": 0.2496, + "step": 14905 + }, + { + "epoch": 0.94, + "grad_norm": 1.579321150659445, + "learning_rate": 1.0234368036535658e-07, + "loss": 0.2698, + "step": 14906 + }, + { + "epoch": 0.94, + "grad_norm": 3.0472771201422084, + "learning_rate": 1.0213877071332478e-07, + "loss": 0.242, + "step": 14907 + }, + { + "epoch": 0.94, + "grad_norm": 2.2621855761825884, + "learning_rate": 1.0193406428225528e-07, + "loss": 0.266, + "step": 14908 + }, + { + "epoch": 0.94, + "grad_norm": 1.860810566412694, + "learning_rate": 1.017295610806407e-07, + "loss": 0.2344, + "step": 14909 + }, + { + "epoch": 0.94, + "grad_norm": 4.795315979060604, + "learning_rate": 1.0152526111696759e-07, + "loss": 0.2565, + "step": 14910 + }, + { + "epoch": 0.94, + "grad_norm": 2.634323388444125, + "learning_rate": 1.0132116439971196e-07, + "loss": 0.2555, + "step": 14911 + }, + { + "epoch": 0.94, + "grad_norm": 0.5809638260400033, + "learning_rate": 1.0111727093734203e-07, + "loss": 0.4658, + "step": 14912 + }, + { + "epoch": 0.94, + "grad_norm": 2.379199203821618, + "learning_rate": 1.009135807383177e-07, + "loss": 0.2499, + "step": 14913 + }, + { + "epoch": 0.94, + "grad_norm": 2.57060843423349, + "learning_rate": 1.0071009381109053e-07, + "loss": 0.2687, + "step": 14914 + }, + { + "epoch": 0.94, + "grad_norm": 1.6136274412195344, + "learning_rate": 1.0050681016410435e-07, + "loss": 0.2417, + "step": 14915 + }, + { + "epoch": 0.94, + "grad_norm": 2.465798957401808, + "learning_rate": 1.0030372980579294e-07, + "loss": 0.2354, + "step": 14916 + }, + { + "epoch": 0.94, + "grad_norm": 2.478186727032362, + "learning_rate": 1.0010085274458236e-07, + "loss": 0.2527, + "step": 14917 + }, + { + "epoch": 0.94, + "grad_norm": 3.3420232502846567, + "learning_rate": 9.989817898889087e-08, + "loss": 0.2502, + "step": 14918 + }, + { + "epoch": 0.94, + "grad_norm": 0.5862422368537016, + "learning_rate": 9.969570854712785e-08, + "loss": 0.463, + "step": 14919 + }, + { + "epoch": 0.94, + "grad_norm": 4.186868427191014, + "learning_rate": 9.949344142769323e-08, + "loss": 0.2473, + "step": 14920 + }, + { + "epoch": 0.94, + "grad_norm": 3.4484885809275756, + "learning_rate": 9.929137763897923e-08, + "loss": 0.2739, + "step": 14921 + }, + { + "epoch": 0.94, + "grad_norm": 2.5240728567295854, + "learning_rate": 9.908951718937187e-08, + "loss": 0.2563, + "step": 14922 + }, + { + "epoch": 0.94, + "grad_norm": 3.4450657920114627, + "learning_rate": 9.888786008724504e-08, + "loss": 0.2589, + "step": 14923 + }, + { + "epoch": 0.94, + "grad_norm": 6.12220374274891, + "learning_rate": 9.868640634096649e-08, + "loss": 0.2486, + "step": 14924 + }, + { + "epoch": 0.94, + "grad_norm": 3.9133550174936365, + "learning_rate": 9.848515595889452e-08, + "loss": 0.2531, + "step": 14925 + }, + { + "epoch": 0.94, + "grad_norm": 1.6291496748320533, + "learning_rate": 9.828410894937912e-08, + "loss": 0.2408, + "step": 14926 + }, + { + "epoch": 0.94, + "grad_norm": 2.610575991622655, + "learning_rate": 9.808326532076307e-08, + "loss": 0.2346, + "step": 14927 + }, + { + "epoch": 0.94, + "grad_norm": 3.1243219912170472, + "learning_rate": 9.788262508137913e-08, + "loss": 0.2532, + "step": 14928 + }, + { + "epoch": 0.94, + "grad_norm": 2.0825214835054133, + "learning_rate": 9.768218823955122e-08, + "loss": 0.2421, + "step": 14929 + }, + { + "epoch": 0.94, + "grad_norm": 2.534624003973788, + "learning_rate": 9.748195480359768e-08, + "loss": 0.2403, + "step": 14930 + }, + { + "epoch": 0.94, + "grad_norm": 3.8788918273845305, + "learning_rate": 9.728192478182574e-08, + "loss": 0.2333, + "step": 14931 + }, + { + "epoch": 0.94, + "grad_norm": 2.2294369483115686, + "learning_rate": 9.708209818253489e-08, + "loss": 0.2545, + "step": 14932 + }, + { + "epoch": 0.94, + "grad_norm": 1.6569128406506433, + "learning_rate": 9.68824750140157e-08, + "loss": 0.2584, + "step": 14933 + }, + { + "epoch": 0.94, + "grad_norm": 5.83855571000696, + "learning_rate": 9.668305528455212e-08, + "loss": 0.2572, + "step": 14934 + }, + { + "epoch": 0.94, + "grad_norm": 1.9125795231859963, + "learning_rate": 9.648383900241808e-08, + "loss": 0.2447, + "step": 14935 + }, + { + "epoch": 0.94, + "grad_norm": 2.365058972026654, + "learning_rate": 9.628482617587864e-08, + "loss": 0.2636, + "step": 14936 + }, + { + "epoch": 0.94, + "grad_norm": 21.5959645402046, + "learning_rate": 9.608601681319163e-08, + "loss": 0.2661, + "step": 14937 + }, + { + "epoch": 0.94, + "grad_norm": 2.2741830157771217, + "learning_rate": 9.588741092260656e-08, + "loss": 0.2293, + "step": 14938 + }, + { + "epoch": 0.94, + "grad_norm": 1.5611052605208338, + "learning_rate": 9.568900851236351e-08, + "loss": 0.2521, + "step": 14939 + }, + { + "epoch": 0.94, + "grad_norm": 1.9518424864515562, + "learning_rate": 9.549080959069423e-08, + "loss": 0.2522, + "step": 14940 + }, + { + "epoch": 0.94, + "grad_norm": 2.5103558130100954, + "learning_rate": 9.52928141658227e-08, + "loss": 0.2702, + "step": 14941 + }, + { + "epoch": 0.94, + "grad_norm": 3.3833230777335555, + "learning_rate": 9.509502224596401e-08, + "loss": 0.2482, + "step": 14942 + }, + { + "epoch": 0.94, + "grad_norm": 3.6141775631828845, + "learning_rate": 9.489743383932548e-08, + "loss": 0.2636, + "step": 14943 + }, + { + "epoch": 0.94, + "grad_norm": 3.225754419827806, + "learning_rate": 9.470004895410445e-08, + "loss": 0.2495, + "step": 14944 + }, + { + "epoch": 0.94, + "grad_norm": 2.493882899914474, + "learning_rate": 9.450286759849048e-08, + "loss": 0.2477, + "step": 14945 + }, + { + "epoch": 0.94, + "grad_norm": 3.010916841228309, + "learning_rate": 9.430588978066647e-08, + "loss": 0.2546, + "step": 14946 + }, + { + "epoch": 0.94, + "grad_norm": 1.8776254788072027, + "learning_rate": 9.410911550880474e-08, + "loss": 0.2467, + "step": 14947 + }, + { + "epoch": 0.94, + "grad_norm": 2.6419471230118083, + "learning_rate": 9.391254479106993e-08, + "loss": 0.2477, + "step": 14948 + }, + { + "epoch": 0.94, + "grad_norm": 2.398983459069012, + "learning_rate": 9.371617763561658e-08, + "loss": 0.2386, + "step": 14949 + }, + { + "epoch": 0.94, + "grad_norm": 1.8036445531365404, + "learning_rate": 9.352001405059486e-08, + "loss": 0.2445, + "step": 14950 + }, + { + "epoch": 0.94, + "grad_norm": 2.566218181636836, + "learning_rate": 9.332405404414158e-08, + "loss": 0.2581, + "step": 14951 + }, + { + "epoch": 0.94, + "grad_norm": 1.9469322935504314, + "learning_rate": 9.312829762438914e-08, + "loss": 0.2408, + "step": 14952 + }, + { + "epoch": 0.94, + "grad_norm": 3.2739847687461685, + "learning_rate": 9.29327447994588e-08, + "loss": 0.2665, + "step": 14953 + }, + { + "epoch": 0.94, + "grad_norm": 1.6358838442443655, + "learning_rate": 9.27373955774652e-08, + "loss": 0.2404, + "step": 14954 + }, + { + "epoch": 0.94, + "grad_norm": 2.082394203426563, + "learning_rate": 9.254224996651351e-08, + "loss": 0.2456, + "step": 14955 + }, + { + "epoch": 0.94, + "grad_norm": 1.9432220621634133, + "learning_rate": 9.23473079747006e-08, + "loss": 0.2391, + "step": 14956 + }, + { + "epoch": 0.94, + "grad_norm": 1.6062936362069737, + "learning_rate": 9.215256961011443e-08, + "loss": 0.2485, + "step": 14957 + }, + { + "epoch": 0.94, + "grad_norm": 1.917760650477404, + "learning_rate": 9.195803488083521e-08, + "loss": 0.248, + "step": 14958 + }, + { + "epoch": 0.94, + "grad_norm": 1.693826435999172, + "learning_rate": 9.176370379493482e-08, + "loss": 0.2592, + "step": 14959 + }, + { + "epoch": 0.94, + "grad_norm": 2.468578559414902, + "learning_rate": 9.15695763604768e-08, + "loss": 0.2753, + "step": 14960 + }, + { + "epoch": 0.94, + "grad_norm": 1.576035149210496, + "learning_rate": 9.137565258551529e-08, + "loss": 0.247, + "step": 14961 + }, + { + "epoch": 0.94, + "grad_norm": 2.4686507036846277, + "learning_rate": 9.11819324780966e-08, + "loss": 0.2434, + "step": 14962 + }, + { + "epoch": 0.94, + "grad_norm": 6.0674562135055385, + "learning_rate": 9.098841604625874e-08, + "loss": 0.2457, + "step": 14963 + }, + { + "epoch": 0.94, + "grad_norm": 2.1382600137892642, + "learning_rate": 9.079510329803087e-08, + "loss": 0.2476, + "step": 14964 + }, + { + "epoch": 0.94, + "grad_norm": 1.5833971663826814, + "learning_rate": 9.060199424143378e-08, + "loss": 0.27, + "step": 14965 + }, + { + "epoch": 0.94, + "grad_norm": 1.9141649054444287, + "learning_rate": 9.040908888447941e-08, + "loss": 0.2503, + "step": 14966 + }, + { + "epoch": 0.94, + "grad_norm": 4.8754388846088315, + "learning_rate": 9.021638723517357e-08, + "loss": 0.2495, + "step": 14967 + }, + { + "epoch": 0.94, + "grad_norm": 1.5964389953001117, + "learning_rate": 9.002388930150984e-08, + "loss": 0.2387, + "step": 14968 + }, + { + "epoch": 0.94, + "grad_norm": 1.7661147240402528, + "learning_rate": 8.983159509147577e-08, + "loss": 0.2673, + "step": 14969 + }, + { + "epoch": 0.94, + "grad_norm": 1.7161758825257591, + "learning_rate": 8.963950461305104e-08, + "loss": 0.2499, + "step": 14970 + }, + { + "epoch": 0.94, + "grad_norm": 3.12529174164548, + "learning_rate": 8.944761787420486e-08, + "loss": 0.253, + "step": 14971 + }, + { + "epoch": 0.94, + "grad_norm": 1.4962954036350211, + "learning_rate": 8.925593488289918e-08, + "loss": 0.2443, + "step": 14972 + }, + { + "epoch": 0.94, + "grad_norm": 2.9436273201767413, + "learning_rate": 8.906445564708655e-08, + "loss": 0.2393, + "step": 14973 + }, + { + "epoch": 0.94, + "grad_norm": 1.7900452883307112, + "learning_rate": 8.88731801747128e-08, + "loss": 0.2557, + "step": 14974 + }, + { + "epoch": 0.94, + "grad_norm": 1.5962253752787288, + "learning_rate": 8.868210847371384e-08, + "loss": 0.2517, + "step": 14975 + }, + { + "epoch": 0.94, + "grad_norm": 1.5808888653862876, + "learning_rate": 8.84912405520183e-08, + "loss": 0.2523, + "step": 14976 + }, + { + "epoch": 0.94, + "grad_norm": 3.7288831022952715, + "learning_rate": 8.83005764175443e-08, + "loss": 0.2666, + "step": 14977 + }, + { + "epoch": 0.94, + "grad_norm": 2.3162567983125717, + "learning_rate": 8.811011607820386e-08, + "loss": 0.247, + "step": 14978 + }, + { + "epoch": 0.94, + "grad_norm": 2.445641448881334, + "learning_rate": 8.791985954189952e-08, + "loss": 0.2445, + "step": 14979 + }, + { + "epoch": 0.94, + "grad_norm": 3.1815497088289924, + "learning_rate": 8.772980681652444e-08, + "loss": 0.2432, + "step": 14980 + }, + { + "epoch": 0.94, + "grad_norm": 8.421640283671424, + "learning_rate": 8.753995790996505e-08, + "loss": 0.2469, + "step": 14981 + }, + { + "epoch": 0.94, + "grad_norm": 2.9510725059112817, + "learning_rate": 8.735031283009842e-08, + "loss": 0.2497, + "step": 14982 + }, + { + "epoch": 0.94, + "grad_norm": 2.912487167504048, + "learning_rate": 8.716087158479324e-08, + "loss": 0.264, + "step": 14983 + }, + { + "epoch": 0.94, + "grad_norm": 10.834676258213733, + "learning_rate": 8.697163418190935e-08, + "loss": 0.2354, + "step": 14984 + }, + { + "epoch": 0.94, + "grad_norm": 1.7326168534625863, + "learning_rate": 8.678260062929877e-08, + "loss": 0.243, + "step": 14985 + }, + { + "epoch": 0.94, + "grad_norm": 2.2672894190913695, + "learning_rate": 8.659377093480581e-08, + "loss": 0.2508, + "step": 14986 + }, + { + "epoch": 0.94, + "grad_norm": 5.359989194874794, + "learning_rate": 8.640514510626363e-08, + "loss": 0.2445, + "step": 14987 + }, + { + "epoch": 0.94, + "grad_norm": 1.5369905580351089, + "learning_rate": 8.621672315149987e-08, + "loss": 0.2529, + "step": 14988 + }, + { + "epoch": 0.94, + "grad_norm": 2.060738960519203, + "learning_rate": 8.602850507833161e-08, + "loss": 0.2556, + "step": 14989 + }, + { + "epoch": 0.94, + "grad_norm": 2.2529995206791567, + "learning_rate": 8.584049089456981e-08, + "loss": 0.2445, + "step": 14990 + }, + { + "epoch": 0.94, + "grad_norm": 1.6858711108570388, + "learning_rate": 8.56526806080138e-08, + "loss": 0.2659, + "step": 14991 + }, + { + "epoch": 0.94, + "grad_norm": 2.2813124187359204, + "learning_rate": 8.546507422645734e-08, + "loss": 0.265, + "step": 14992 + }, + { + "epoch": 0.94, + "grad_norm": 1.848471024723462, + "learning_rate": 8.52776717576842e-08, + "loss": 0.264, + "step": 14993 + }, + { + "epoch": 0.94, + "grad_norm": 3.4011796314883185, + "learning_rate": 8.509047320946984e-08, + "loss": 0.25, + "step": 14994 + }, + { + "epoch": 0.94, + "grad_norm": 2.0682365920212433, + "learning_rate": 8.490347858958192e-08, + "loss": 0.265, + "step": 14995 + }, + { + "epoch": 0.94, + "grad_norm": 1.7556126978556454, + "learning_rate": 8.471668790577814e-08, + "loss": 0.23, + "step": 14996 + }, + { + "epoch": 0.94, + "grad_norm": 2.0838923037676325, + "learning_rate": 8.45301011658095e-08, + "loss": 0.2506, + "step": 14997 + }, + { + "epoch": 0.94, + "grad_norm": 2.085841390214276, + "learning_rate": 8.434371837741817e-08, + "loss": 0.2425, + "step": 14998 + }, + { + "epoch": 0.94, + "grad_norm": 1.8883906705242588, + "learning_rate": 8.415753954833738e-08, + "loss": 0.2346, + "step": 14999 + }, + { + "epoch": 0.94, + "grad_norm": 4.727605274229623, + "learning_rate": 8.397156468629209e-08, + "loss": 0.2544, + "step": 15000 + }, + { + "epoch": 0.94, + "grad_norm": 1.83609080121886, + "learning_rate": 8.378579379899776e-08, + "loss": 0.2489, + "step": 15001 + }, + { + "epoch": 0.94, + "grad_norm": 3.3191386930002, + "learning_rate": 8.360022689416324e-08, + "loss": 0.2397, + "step": 15002 + }, + { + "epoch": 0.94, + "grad_norm": 3.977769675272214, + "learning_rate": 8.34148639794874e-08, + "loss": 0.273, + "step": 15003 + }, + { + "epoch": 0.94, + "grad_norm": 2.456322072507468, + "learning_rate": 8.322970506266237e-08, + "loss": 0.2705, + "step": 15004 + }, + { + "epoch": 0.94, + "grad_norm": 1.5705162597152489, + "learning_rate": 8.304475015136925e-08, + "loss": 0.2507, + "step": 15005 + }, + { + "epoch": 0.94, + "grad_norm": 1.7703958453252453, + "learning_rate": 8.285999925328359e-08, + "loss": 0.2599, + "step": 15006 + }, + { + "epoch": 0.94, + "grad_norm": 1.6048938773161612, + "learning_rate": 8.267545237607034e-08, + "loss": 0.2806, + "step": 15007 + }, + { + "epoch": 0.94, + "grad_norm": 1.9367012150118517, + "learning_rate": 8.249110952738726e-08, + "loss": 0.2694, + "step": 15008 + }, + { + "epoch": 0.94, + "grad_norm": 1.542691414723443, + "learning_rate": 8.230697071488159e-08, + "loss": 0.2444, + "step": 15009 + }, + { + "epoch": 0.94, + "grad_norm": 1.8070296743870038, + "learning_rate": 8.212303594619497e-08, + "loss": 0.2357, + "step": 15010 + }, + { + "epoch": 0.94, + "grad_norm": 2.6064423374607273, + "learning_rate": 8.193930522895799e-08, + "loss": 0.2422, + "step": 15011 + }, + { + "epoch": 0.94, + "grad_norm": 2.226075116398095, + "learning_rate": 8.175577857079565e-08, + "loss": 0.2658, + "step": 15012 + }, + { + "epoch": 0.94, + "grad_norm": 2.4496437824003148, + "learning_rate": 8.157245597932129e-08, + "loss": 0.2333, + "step": 15013 + }, + { + "epoch": 0.94, + "grad_norm": 1.6669729270492224, + "learning_rate": 8.138933746214218e-08, + "loss": 0.2417, + "step": 15014 + }, + { + "epoch": 0.94, + "grad_norm": 2.7373856422281864, + "learning_rate": 8.120642302685556e-08, + "loss": 0.2746, + "step": 15015 + }, + { + "epoch": 0.94, + "grad_norm": 2.5336644560674157, + "learning_rate": 8.102371268105147e-08, + "loss": 0.2556, + "step": 15016 + }, + { + "epoch": 0.94, + "grad_norm": 2.0907280259399346, + "learning_rate": 8.084120643231052e-08, + "loss": 0.2781, + "step": 15017 + }, + { + "epoch": 0.94, + "grad_norm": 2.130195687230351, + "learning_rate": 8.0658904288205e-08, + "loss": 0.2447, + "step": 15018 + }, + { + "epoch": 0.94, + "grad_norm": 1.9566567457339217, + "learning_rate": 8.047680625629994e-08, + "loss": 0.2598, + "step": 15019 + }, + { + "epoch": 0.94, + "grad_norm": 2.0714324441971175, + "learning_rate": 8.029491234414987e-08, + "loss": 0.245, + "step": 15020 + }, + { + "epoch": 0.94, + "grad_norm": 3.337980440495657, + "learning_rate": 8.01132225593021e-08, + "loss": 0.2596, + "step": 15021 + }, + { + "epoch": 0.94, + "grad_norm": 1.88405739578724, + "learning_rate": 7.993173690929556e-08, + "loss": 0.2569, + "step": 15022 + }, + { + "epoch": 0.94, + "grad_norm": 10.187463976959984, + "learning_rate": 7.975045540166038e-08, + "loss": 0.2596, + "step": 15023 + }, + { + "epoch": 0.94, + "grad_norm": 1.6669701272689876, + "learning_rate": 7.956937804391829e-08, + "loss": 0.2441, + "step": 15024 + }, + { + "epoch": 0.94, + "grad_norm": 4.487160049490356, + "learning_rate": 7.938850484358219e-08, + "loss": 0.2497, + "step": 15025 + }, + { + "epoch": 0.94, + "grad_norm": 4.416276728830592, + "learning_rate": 7.920783580815661e-08, + "loss": 0.2573, + "step": 15026 + }, + { + "epoch": 0.95, + "grad_norm": 1.4075389471818622, + "learning_rate": 7.90273709451389e-08, + "loss": 0.2511, + "step": 15027 + }, + { + "epoch": 0.95, + "grad_norm": 1.9727910733199612, + "learning_rate": 7.884711026201586e-08, + "loss": 0.2477, + "step": 15028 + }, + { + "epoch": 0.95, + "grad_norm": 0.5693306121869144, + "learning_rate": 7.866705376626704e-08, + "loss": 0.4655, + "step": 15029 + }, + { + "epoch": 0.95, + "grad_norm": 2.0371422961176253, + "learning_rate": 7.848720146536426e-08, + "loss": 0.2606, + "step": 15030 + }, + { + "epoch": 0.95, + "grad_norm": 11.406852070741884, + "learning_rate": 7.830755336676821e-08, + "loss": 0.2479, + "step": 15031 + }, + { + "epoch": 0.95, + "grad_norm": 1.5372067319191896, + "learning_rate": 7.812810947793403e-08, + "loss": 0.2406, + "step": 15032 + }, + { + "epoch": 0.95, + "grad_norm": 2.855173786389777, + "learning_rate": 7.79488698063069e-08, + "loss": 0.2345, + "step": 15033 + }, + { + "epoch": 0.95, + "grad_norm": 1.7803893474498058, + "learning_rate": 7.776983435932306e-08, + "loss": 0.2698, + "step": 15034 + }, + { + "epoch": 0.95, + "grad_norm": 3.087583867651988, + "learning_rate": 7.759100314441215e-08, + "loss": 0.2364, + "step": 15035 + }, + { + "epoch": 0.95, + "grad_norm": 2.5924903559618606, + "learning_rate": 7.741237616899377e-08, + "loss": 0.2318, + "step": 15036 + }, + { + "epoch": 0.95, + "grad_norm": 14.902985871795474, + "learning_rate": 7.723395344047924e-08, + "loss": 0.2538, + "step": 15037 + }, + { + "epoch": 0.95, + "grad_norm": 3.4096437595077678, + "learning_rate": 7.705573496627205e-08, + "loss": 0.2553, + "step": 15038 + }, + { + "epoch": 0.95, + "grad_norm": 1.9456729915431172, + "learning_rate": 7.687772075376632e-08, + "loss": 0.2485, + "step": 15039 + }, + { + "epoch": 0.95, + "grad_norm": 2.4682587504945, + "learning_rate": 7.669991081034889e-08, + "loss": 0.256, + "step": 15040 + }, + { + "epoch": 0.95, + "grad_norm": 1.8428692360405576, + "learning_rate": 7.65223051433961e-08, + "loss": 0.2554, + "step": 15041 + }, + { + "epoch": 0.95, + "grad_norm": 3.502670488676359, + "learning_rate": 7.634490376027814e-08, + "loss": 0.2487, + "step": 15042 + }, + { + "epoch": 0.95, + "grad_norm": 1.9273363329139828, + "learning_rate": 7.616770666835582e-08, + "loss": 0.2646, + "step": 15043 + }, + { + "epoch": 0.95, + "grad_norm": 1.7949329714879687, + "learning_rate": 7.599071387498102e-08, + "loss": 0.2502, + "step": 15044 + }, + { + "epoch": 0.95, + "grad_norm": 1.5112986784603015, + "learning_rate": 7.581392538749732e-08, + "loss": 0.2472, + "step": 15045 + }, + { + "epoch": 0.95, + "grad_norm": 3.149502030469735, + "learning_rate": 7.563734121324052e-08, + "loss": 0.2416, + "step": 15046 + }, + { + "epoch": 0.95, + "grad_norm": 2.9709106388728097, + "learning_rate": 7.546096135953696e-08, + "loss": 0.2388, + "step": 15047 + }, + { + "epoch": 0.95, + "grad_norm": 1.7272547408062873, + "learning_rate": 7.528478583370469e-08, + "loss": 0.2577, + "step": 15048 + }, + { + "epoch": 0.95, + "grad_norm": 2.3685895048468675, + "learning_rate": 7.510881464305452e-08, + "loss": 0.2716, + "step": 15049 + }, + { + "epoch": 0.95, + "grad_norm": 1.436988767256087, + "learning_rate": 7.493304779488675e-08, + "loss": 0.2371, + "step": 15050 + }, + { + "epoch": 0.95, + "grad_norm": 1.9474019351298308, + "learning_rate": 7.475748529649496e-08, + "loss": 0.2501, + "step": 15051 + }, + { + "epoch": 0.95, + "grad_norm": 2.414570049926212, + "learning_rate": 7.458212715516389e-08, + "loss": 0.2675, + "step": 15052 + }, + { + "epoch": 0.95, + "grad_norm": 1.5213522155912707, + "learning_rate": 7.440697337816771e-08, + "loss": 0.2414, + "step": 15053 + }, + { + "epoch": 0.95, + "grad_norm": 3.6456077863037066, + "learning_rate": 7.423202397277618e-08, + "loss": 0.2575, + "step": 15054 + }, + { + "epoch": 0.95, + "grad_norm": 1.9318247935661166, + "learning_rate": 7.405727894624626e-08, + "loss": 0.2592, + "step": 15055 + }, + { + "epoch": 0.95, + "grad_norm": 2.316707003497598, + "learning_rate": 7.388273830582937e-08, + "loss": 0.2544, + "step": 15056 + }, + { + "epoch": 0.95, + "grad_norm": 1.678109446238628, + "learning_rate": 7.370840205876806e-08, + "loss": 0.2528, + "step": 15057 + }, + { + "epoch": 0.95, + "grad_norm": 3.0675570350701333, + "learning_rate": 7.353427021229486e-08, + "loss": 0.2776, + "step": 15058 + }, + { + "epoch": 0.95, + "grad_norm": 2.1464382725465234, + "learning_rate": 7.336034277363513e-08, + "loss": 0.2456, + "step": 15059 + }, + { + "epoch": 0.95, + "grad_norm": 3.433912470881727, + "learning_rate": 7.318661975000585e-08, + "loss": 0.2528, + "step": 15060 + }, + { + "epoch": 0.95, + "grad_norm": 1.9775996362674466, + "learning_rate": 7.301310114861404e-08, + "loss": 0.2734, + "step": 15061 + }, + { + "epoch": 0.95, + "grad_norm": 2.356976812157187, + "learning_rate": 7.283978697666005e-08, + "loss": 0.2637, + "step": 15062 + }, + { + "epoch": 0.95, + "grad_norm": 1.4432642208060746, + "learning_rate": 7.266667724133536e-08, + "loss": 0.239, + "step": 15063 + }, + { + "epoch": 0.95, + "grad_norm": 1.875363497132506, + "learning_rate": 7.249377194982199e-08, + "loss": 0.2493, + "step": 15064 + }, + { + "epoch": 0.95, + "grad_norm": 2.750832913721003, + "learning_rate": 7.23210711092942e-08, + "loss": 0.2551, + "step": 15065 + }, + { + "epoch": 0.95, + "grad_norm": 3.1965919730419743, + "learning_rate": 7.214857472691794e-08, + "loss": 0.2591, + "step": 15066 + }, + { + "epoch": 0.95, + "grad_norm": 3.4181288946124706, + "learning_rate": 7.19762828098497e-08, + "loss": 0.2371, + "step": 15067 + }, + { + "epoch": 0.95, + "grad_norm": 1.9364263772404247, + "learning_rate": 7.18041953652393e-08, + "loss": 0.2429, + "step": 15068 + }, + { + "epoch": 0.95, + "grad_norm": 7.27039854486909, + "learning_rate": 7.16323124002255e-08, + "loss": 0.2847, + "step": 15069 + }, + { + "epoch": 0.95, + "grad_norm": 2.423011681705568, + "learning_rate": 7.146063392194147e-08, + "loss": 0.2654, + "step": 15070 + }, + { + "epoch": 0.95, + "grad_norm": 1.917807672109599, + "learning_rate": 7.128915993750929e-08, + "loss": 0.2587, + "step": 15071 + }, + { + "epoch": 0.95, + "grad_norm": 1.7531678404609035, + "learning_rate": 7.111789045404493e-08, + "loss": 0.2276, + "step": 15072 + }, + { + "epoch": 0.95, + "grad_norm": 1.6049922607264762, + "learning_rate": 7.094682547865328e-08, + "loss": 0.2468, + "step": 15073 + }, + { + "epoch": 0.95, + "grad_norm": 2.1576768292165927, + "learning_rate": 7.07759650184331e-08, + "loss": 0.2592, + "step": 15074 + }, + { + "epoch": 0.95, + "grad_norm": 11.952215411700562, + "learning_rate": 7.06053090804737e-08, + "loss": 0.251, + "step": 15075 + }, + { + "epoch": 0.95, + "grad_norm": 2.157583088277711, + "learning_rate": 7.043485767185554e-08, + "loss": 0.2449, + "step": 15076 + }, + { + "epoch": 0.95, + "grad_norm": 0.5970967109403809, + "learning_rate": 7.026461079965075e-08, + "loss": 0.4728, + "step": 15077 + }, + { + "epoch": 0.95, + "grad_norm": 1.564200936615172, + "learning_rate": 7.009456847092311e-08, + "loss": 0.2579, + "step": 15078 + }, + { + "epoch": 0.95, + "grad_norm": 1.3959489720807112, + "learning_rate": 6.99247306927292e-08, + "loss": 0.2551, + "step": 15079 + }, + { + "epoch": 0.95, + "grad_norm": 1.7185456841383286, + "learning_rate": 6.975509747211451e-08, + "loss": 0.2431, + "step": 15080 + }, + { + "epoch": 0.95, + "grad_norm": 2.557992125173991, + "learning_rate": 6.958566881611783e-08, + "loss": 0.2555, + "step": 15081 + }, + { + "epoch": 0.95, + "grad_norm": 2.473151134879203, + "learning_rate": 6.941644473176967e-08, + "loss": 0.2649, + "step": 15082 + }, + { + "epoch": 0.95, + "grad_norm": 1.8596363940244411, + "learning_rate": 6.924742522609052e-08, + "loss": 0.2565, + "step": 15083 + }, + { + "epoch": 0.95, + "grad_norm": 3.3633102982141923, + "learning_rate": 6.90786103060942e-08, + "loss": 0.2463, + "step": 15084 + }, + { + "epoch": 0.95, + "grad_norm": 2.0047519931535174, + "learning_rate": 6.890999997878345e-08, + "loss": 0.2507, + "step": 15085 + }, + { + "epoch": 0.95, + "grad_norm": 1.9008710161498206, + "learning_rate": 6.8741594251156e-08, + "loss": 0.2304, + "step": 15086 + }, + { + "epoch": 0.95, + "grad_norm": 2.4535875985794044, + "learning_rate": 6.857339313019906e-08, + "loss": 0.2404, + "step": 15087 + }, + { + "epoch": 0.95, + "grad_norm": 2.2091228671246594, + "learning_rate": 6.840539662289148e-08, + "loss": 0.2338, + "step": 15088 + }, + { + "epoch": 0.95, + "grad_norm": 1.5789767588666381, + "learning_rate": 6.823760473620267e-08, + "loss": 0.2537, + "step": 15089 + }, + { + "epoch": 0.95, + "grad_norm": 1.5449648442695354, + "learning_rate": 6.807001747709596e-08, + "loss": 0.2328, + "step": 15090 + }, + { + "epoch": 0.95, + "grad_norm": 1.4428670856828945, + "learning_rate": 6.790263485252413e-08, + "loss": 0.2286, + "step": 15091 + }, + { + "epoch": 0.95, + "grad_norm": 1.9390355575658578, + "learning_rate": 6.773545686943272e-08, + "loss": 0.2468, + "step": 15092 + }, + { + "epoch": 0.95, + "grad_norm": 3.6580568443351344, + "learning_rate": 6.756848353475732e-08, + "loss": 0.2672, + "step": 15093 + }, + { + "epoch": 0.95, + "grad_norm": 1.6013723138859637, + "learning_rate": 6.740171485542624e-08, + "loss": 0.2356, + "step": 15094 + }, + { + "epoch": 0.95, + "grad_norm": 1.8487835934831445, + "learning_rate": 6.723515083836008e-08, + "loss": 0.2636, + "step": 15095 + }, + { + "epoch": 0.95, + "grad_norm": 3.0637371279345085, + "learning_rate": 6.706879149046886e-08, + "loss": 0.2614, + "step": 15096 + }, + { + "epoch": 0.95, + "grad_norm": 3.01843306386544, + "learning_rate": 6.690263681865539e-08, + "loss": 0.2572, + "step": 15097 + }, + { + "epoch": 0.95, + "grad_norm": 2.2889947065763074, + "learning_rate": 6.67366868298136e-08, + "loss": 0.2775, + "step": 15098 + }, + { + "epoch": 0.95, + "grad_norm": 1.7265614047933293, + "learning_rate": 6.657094153082911e-08, + "loss": 0.2431, + "step": 15099 + }, + { + "epoch": 0.95, + "grad_norm": 1.9326881289441615, + "learning_rate": 6.640540092857916e-08, + "loss": 0.2275, + "step": 15100 + }, + { + "epoch": 0.95, + "grad_norm": 1.4671067198548577, + "learning_rate": 6.624006502993219e-08, + "loss": 0.2304, + "step": 15101 + }, + { + "epoch": 0.95, + "grad_norm": 5.3049156344853685, + "learning_rate": 6.607493384174768e-08, + "loss": 0.2589, + "step": 15102 + }, + { + "epoch": 0.95, + "grad_norm": 1.7234941969486317, + "learning_rate": 6.59100073708785e-08, + "loss": 0.2327, + "step": 15103 + }, + { + "epoch": 0.95, + "grad_norm": 2.9464370795901975, + "learning_rate": 6.574528562416693e-08, + "loss": 0.2623, + "step": 15104 + }, + { + "epoch": 0.95, + "grad_norm": 2.2161176030459635, + "learning_rate": 6.558076860844752e-08, + "loss": 0.2487, + "step": 15105 + }, + { + "epoch": 0.95, + "grad_norm": 1.4946278049032118, + "learning_rate": 6.54164563305465e-08, + "loss": 0.2314, + "step": 15106 + }, + { + "epoch": 0.95, + "grad_norm": 2.9899022284806125, + "learning_rate": 6.525234879728171e-08, + "loss": 0.2554, + "step": 15107 + }, + { + "epoch": 0.95, + "grad_norm": 2.34642547748058, + "learning_rate": 6.508844601546216e-08, + "loss": 0.2299, + "step": 15108 + }, + { + "epoch": 0.95, + "grad_norm": 1.7554524323469507, + "learning_rate": 6.492474799188797e-08, + "loss": 0.2528, + "step": 15109 + }, + { + "epoch": 0.95, + "grad_norm": 1.8134892649807535, + "learning_rate": 6.476125473335149e-08, + "loss": 0.2243, + "step": 15110 + }, + { + "epoch": 0.95, + "grad_norm": 1.700990344186364, + "learning_rate": 6.459796624663728e-08, + "loss": 0.2436, + "step": 15111 + }, + { + "epoch": 0.95, + "grad_norm": 2.1122002978515972, + "learning_rate": 6.443488253851937e-08, + "loss": 0.2323, + "step": 15112 + }, + { + "epoch": 0.95, + "grad_norm": 1.8460731893107822, + "learning_rate": 6.427200361576458e-08, + "loss": 0.2474, + "step": 15113 + }, + { + "epoch": 0.95, + "grad_norm": 1.770180126530419, + "learning_rate": 6.410932948513138e-08, + "loss": 0.2674, + "step": 15114 + }, + { + "epoch": 0.95, + "grad_norm": 1.761875834139763, + "learning_rate": 6.394686015336882e-08, + "loss": 0.2701, + "step": 15115 + }, + { + "epoch": 0.95, + "grad_norm": 3.8154205062658497, + "learning_rate": 6.378459562721928e-08, + "loss": 0.2609, + "step": 15116 + }, + { + "epoch": 0.95, + "grad_norm": 2.0372390233772855, + "learning_rate": 6.362253591341405e-08, + "loss": 0.2741, + "step": 15117 + }, + { + "epoch": 0.95, + "grad_norm": 6.583921479661432, + "learning_rate": 6.346068101867775e-08, + "loss": 0.2517, + "step": 15118 + }, + { + "epoch": 0.95, + "grad_norm": 1.5572213060348525, + "learning_rate": 6.329903094972612e-08, + "loss": 0.2502, + "step": 15119 + }, + { + "epoch": 0.95, + "grad_norm": 1.6531779228784358, + "learning_rate": 6.313758571326656e-08, + "loss": 0.2426, + "step": 15120 + }, + { + "epoch": 0.95, + "grad_norm": 1.9409091805365677, + "learning_rate": 6.29763453159965e-08, + "loss": 0.2331, + "step": 15121 + }, + { + "epoch": 0.95, + "grad_norm": 3.0001766903916027, + "learning_rate": 6.281530976460781e-08, + "loss": 0.2695, + "step": 15122 + }, + { + "epoch": 0.95, + "grad_norm": 3.1533895809246753, + "learning_rate": 6.265447906578126e-08, + "loss": 0.282, + "step": 15123 + }, + { + "epoch": 0.95, + "grad_norm": 1.6730403870285562, + "learning_rate": 6.24938532261904e-08, + "loss": 0.2532, + "step": 15124 + }, + { + "epoch": 0.95, + "grad_norm": 1.7155203670651034, + "learning_rate": 6.233343225249933e-08, + "loss": 0.2575, + "step": 15125 + }, + { + "epoch": 0.95, + "grad_norm": 4.999608808053576, + "learning_rate": 6.217321615136385e-08, + "loss": 0.2409, + "step": 15126 + }, + { + "epoch": 0.95, + "grad_norm": 0.5850726638459498, + "learning_rate": 6.201320492943252e-08, + "loss": 0.4774, + "step": 15127 + }, + { + "epoch": 0.95, + "grad_norm": 0.6008261510209832, + "learning_rate": 6.18533985933445e-08, + "loss": 0.465, + "step": 15128 + }, + { + "epoch": 0.95, + "grad_norm": 1.5886253994562793, + "learning_rate": 6.169379714973e-08, + "loss": 0.2581, + "step": 15129 + }, + { + "epoch": 0.95, + "grad_norm": 1.8513448530907524, + "learning_rate": 6.153440060521099e-08, + "loss": 0.2394, + "step": 15130 + }, + { + "epoch": 0.95, + "grad_norm": 1.9469107701208122, + "learning_rate": 6.137520896640159e-08, + "loss": 0.2506, + "step": 15131 + }, + { + "epoch": 0.95, + "grad_norm": 1.7405142829403057, + "learning_rate": 6.121622223990708e-08, + "loss": 0.2295, + "step": 15132 + }, + { + "epoch": 0.95, + "grad_norm": 2.4737681393194144, + "learning_rate": 6.105744043232331e-08, + "loss": 0.2386, + "step": 15133 + }, + { + "epoch": 0.95, + "grad_norm": 1.862263669110029, + "learning_rate": 6.089886355023889e-08, + "loss": 0.2727, + "step": 15134 + }, + { + "epoch": 0.95, + "grad_norm": 1.8948451670601327, + "learning_rate": 6.074049160023355e-08, + "loss": 0.2458, + "step": 15135 + }, + { + "epoch": 0.95, + "grad_norm": 1.8907923683215584, + "learning_rate": 6.05823245888787e-08, + "loss": 0.2381, + "step": 15136 + }, + { + "epoch": 0.95, + "grad_norm": 1.933079028711817, + "learning_rate": 6.042436252273631e-08, + "loss": 0.2426, + "step": 15137 + }, + { + "epoch": 0.95, + "grad_norm": 0.6434428434070673, + "learning_rate": 6.026660540836005e-08, + "loss": 0.4853, + "step": 15138 + }, + { + "epoch": 0.95, + "grad_norm": 0.6234215158027452, + "learning_rate": 6.010905325229744e-08, + "loss": 0.4676, + "step": 15139 + }, + { + "epoch": 0.95, + "grad_norm": 2.0697498060359614, + "learning_rate": 5.99517060610838e-08, + "loss": 0.2556, + "step": 15140 + }, + { + "epoch": 0.95, + "grad_norm": 2.6493974884538214, + "learning_rate": 5.979456384124893e-08, + "loss": 0.269, + "step": 15141 + }, + { + "epoch": 0.95, + "grad_norm": 1.8341882391238067, + "learning_rate": 5.963762659931205e-08, + "loss": 0.2673, + "step": 15142 + }, + { + "epoch": 0.95, + "grad_norm": 2.287705961590037, + "learning_rate": 5.948089434178517e-08, + "loss": 0.249, + "step": 15143 + }, + { + "epoch": 0.95, + "grad_norm": 1.897937430213338, + "learning_rate": 5.932436707517142e-08, + "loss": 0.2577, + "step": 15144 + }, + { + "epoch": 0.95, + "grad_norm": 1.8812675656392879, + "learning_rate": 5.91680448059645e-08, + "loss": 0.247, + "step": 15145 + }, + { + "epoch": 0.95, + "grad_norm": 2.3695730808633724, + "learning_rate": 5.901192754065199e-08, + "loss": 0.2478, + "step": 15146 + }, + { + "epoch": 0.95, + "grad_norm": 2.3134578969140613, + "learning_rate": 5.8856015285710946e-08, + "loss": 0.2592, + "step": 15147 + }, + { + "epoch": 0.95, + "grad_norm": 2.519528924904941, + "learning_rate": 5.870030804761007e-08, + "loss": 0.2606, + "step": 15148 + }, + { + "epoch": 0.95, + "grad_norm": 1.8495245079254352, + "learning_rate": 5.8544805832810304e-08, + "loss": 0.2696, + "step": 15149 + }, + { + "epoch": 0.95, + "grad_norm": 1.6019247537774153, + "learning_rate": 5.838950864776316e-08, + "loss": 0.27, + "step": 15150 + }, + { + "epoch": 0.95, + "grad_norm": 2.1327114632354314, + "learning_rate": 5.8234416498912925e-08, + "loss": 0.2542, + "step": 15151 + }, + { + "epoch": 0.95, + "grad_norm": 2.571365519697078, + "learning_rate": 5.8079529392694453e-08, + "loss": 0.2636, + "step": 15152 + }, + { + "epoch": 0.95, + "grad_norm": 3.16703759712683, + "learning_rate": 5.792484733553372e-08, + "loss": 0.2447, + "step": 15153 + }, + { + "epoch": 0.95, + "grad_norm": 2.010961220939667, + "learning_rate": 5.7770370333849466e-08, + "loss": 0.2569, + "step": 15154 + }, + { + "epoch": 0.95, + "grad_norm": 1.640572411925826, + "learning_rate": 5.7616098394051026e-08, + "loss": 0.262, + "step": 15155 + }, + { + "epoch": 0.95, + "grad_norm": 1.5472826722278206, + "learning_rate": 5.746203152253937e-08, + "loss": 0.2489, + "step": 15156 + }, + { + "epoch": 0.95, + "grad_norm": 3.4749328699426574, + "learning_rate": 5.730816972570663e-08, + "loss": 0.249, + "step": 15157 + }, + { + "epoch": 0.95, + "grad_norm": 2.423856974609935, + "learning_rate": 5.715451300993713e-08, + "loss": 0.2591, + "step": 15158 + }, + { + "epoch": 0.95, + "grad_norm": 2.009584840737862, + "learning_rate": 5.700106138160688e-08, + "loss": 0.24, + "step": 15159 + }, + { + "epoch": 0.95, + "grad_norm": 1.3581814776620722, + "learning_rate": 5.684781484708135e-08, + "loss": 0.2549, + "step": 15160 + }, + { + "epoch": 0.95, + "grad_norm": 1.9425870284310038, + "learning_rate": 5.6694773412721e-08, + "loss": 0.2465, + "step": 15161 + }, + { + "epoch": 0.95, + "grad_norm": 1.2810184054673877, + "learning_rate": 5.65419370848741e-08, + "loss": 0.2414, + "step": 15162 + }, + { + "epoch": 0.95, + "grad_norm": 2.703815186506678, + "learning_rate": 5.638930586988334e-08, + "loss": 0.2448, + "step": 15163 + }, + { + "epoch": 0.95, + "grad_norm": 0.6213095471075559, + "learning_rate": 5.623687977408088e-08, + "loss": 0.4262, + "step": 15164 + }, + { + "epoch": 0.95, + "grad_norm": 1.682341354058051, + "learning_rate": 5.6084658803791656e-08, + "loss": 0.2496, + "step": 15165 + }, + { + "epoch": 0.95, + "grad_norm": 3.215689411723757, + "learning_rate": 5.5932642965331184e-08, + "loss": 0.2569, + "step": 15166 + }, + { + "epoch": 0.95, + "grad_norm": 0.6587726584601461, + "learning_rate": 5.578083226500663e-08, + "loss": 0.4948, + "step": 15167 + }, + { + "epoch": 0.95, + "grad_norm": 11.168351878895283, + "learning_rate": 5.562922670911741e-08, + "loss": 0.2371, + "step": 15168 + }, + { + "epoch": 0.95, + "grad_norm": 1.8955329916455563, + "learning_rate": 5.547782630395404e-08, + "loss": 0.2467, + "step": 15169 + }, + { + "epoch": 0.95, + "grad_norm": 2.588851101880383, + "learning_rate": 5.532663105579761e-08, + "loss": 0.2449, + "step": 15170 + }, + { + "epoch": 0.95, + "grad_norm": 1.6489272704683102, + "learning_rate": 5.517564097092254e-08, + "loss": 0.2478, + "step": 15171 + }, + { + "epoch": 0.95, + "grad_norm": 16.90788333060237, + "learning_rate": 5.5024856055593266e-08, + "loss": 0.2429, + "step": 15172 + }, + { + "epoch": 0.95, + "grad_norm": 2.28757202297265, + "learning_rate": 5.487427631606534e-08, + "loss": 0.2411, + "step": 15173 + }, + { + "epoch": 0.95, + "grad_norm": 1.8575776825344417, + "learning_rate": 5.472390175858766e-08, + "loss": 0.2436, + "step": 15174 + }, + { + "epoch": 0.95, + "grad_norm": 2.2881137453485976, + "learning_rate": 5.457373238939856e-08, + "loss": 0.2565, + "step": 15175 + }, + { + "epoch": 0.95, + "grad_norm": 1.8198144124812337, + "learning_rate": 5.4423768214730274e-08, + "loss": 0.2694, + "step": 15176 + }, + { + "epoch": 0.95, + "grad_norm": 2.8031232580146708, + "learning_rate": 5.427400924080395e-08, + "loss": 0.267, + "step": 15177 + }, + { + "epoch": 0.95, + "grad_norm": 2.6278084081181734, + "learning_rate": 5.4124455473832935e-08, + "loss": 0.2529, + "step": 15178 + }, + { + "epoch": 0.95, + "grad_norm": 2.021537135167698, + "learning_rate": 5.397510692002339e-08, + "loss": 0.2541, + "step": 15179 + }, + { + "epoch": 0.95, + "grad_norm": 3.402138276385479, + "learning_rate": 5.3825963585572015e-08, + "loss": 0.2371, + "step": 15180 + }, + { + "epoch": 0.95, + "grad_norm": 1.8101820977372205, + "learning_rate": 5.367702547666664e-08, + "loss": 0.2647, + "step": 15181 + }, + { + "epoch": 0.95, + "grad_norm": 1.8894330996454136, + "learning_rate": 5.3528292599486776e-08, + "loss": 0.2579, + "step": 15182 + }, + { + "epoch": 0.95, + "grad_norm": 1.8951110145995576, + "learning_rate": 5.337976496020469e-08, + "loss": 0.2417, + "step": 15183 + }, + { + "epoch": 0.95, + "grad_norm": 1.9426591456784061, + "learning_rate": 5.323144256498214e-08, + "loss": 0.2446, + "step": 15184 + }, + { + "epoch": 0.95, + "grad_norm": 1.4369025705594134, + "learning_rate": 5.308332541997363e-08, + "loss": 0.2347, + "step": 15185 + }, + { + "epoch": 0.96, + "grad_norm": 1.8318883764582616, + "learning_rate": 5.293541353132425e-08, + "loss": 0.2488, + "step": 15186 + }, + { + "epoch": 0.96, + "grad_norm": 1.6685713685598524, + "learning_rate": 5.2787706905171874e-08, + "loss": 0.2597, + "step": 15187 + }, + { + "epoch": 0.96, + "grad_norm": 1.5352402352444785, + "learning_rate": 5.264020554764438e-08, + "loss": 0.2425, + "step": 15188 + }, + { + "epoch": 0.96, + "grad_norm": 1.7706277661782548, + "learning_rate": 5.249290946486241e-08, + "loss": 0.2435, + "step": 15189 + }, + { + "epoch": 0.96, + "grad_norm": 4.736725762980753, + "learning_rate": 5.23458186629372e-08, + "loss": 0.2516, + "step": 15190 + }, + { + "epoch": 0.96, + "grad_norm": 3.764797371212197, + "learning_rate": 5.2198933147972194e-08, + "loss": 0.2485, + "step": 15191 + }, + { + "epoch": 0.96, + "grad_norm": 2.0022105372767873, + "learning_rate": 5.205225292606198e-08, + "loss": 0.2567, + "step": 15192 + }, + { + "epoch": 0.96, + "grad_norm": 2.594500950542812, + "learning_rate": 5.1905778003292216e-08, + "loss": 0.2422, + "step": 15193 + }, + { + "epoch": 0.96, + "grad_norm": 1.9204515577690455, + "learning_rate": 5.175950838574029e-08, + "loss": 0.2449, + "step": 15194 + }, + { + "epoch": 0.96, + "grad_norm": 1.5331104549298378, + "learning_rate": 5.1613444079475774e-08, + "loss": 0.2333, + "step": 15195 + }, + { + "epoch": 0.96, + "grad_norm": 2.47501339219459, + "learning_rate": 5.1467585090558825e-08, + "loss": 0.2581, + "step": 15196 + }, + { + "epoch": 0.96, + "grad_norm": 2.684539632185198, + "learning_rate": 5.132193142504072e-08, + "loss": 0.247, + "step": 15197 + }, + { + "epoch": 0.96, + "grad_norm": 2.027822068369087, + "learning_rate": 5.1176483088966054e-08, + "loss": 0.265, + "step": 15198 + }, + { + "epoch": 0.96, + "grad_norm": 2.6415789247623427, + "learning_rate": 5.103124008836891e-08, + "loss": 0.2477, + "step": 15199 + }, + { + "epoch": 0.96, + "grad_norm": 8.164109607072536, + "learning_rate": 5.088620242927666e-08, + "loss": 0.2474, + "step": 15200 + }, + { + "epoch": 0.96, + "grad_norm": 2.494603751571222, + "learning_rate": 5.0741370117705634e-08, + "loss": 0.2491, + "step": 15201 + }, + { + "epoch": 0.96, + "grad_norm": 2.0621501866695215, + "learning_rate": 5.0596743159666565e-08, + "loss": 0.2587, + "step": 15202 + }, + { + "epoch": 0.96, + "grad_norm": 1.7005264148884391, + "learning_rate": 5.0452321561159646e-08, + "loss": 0.2458, + "step": 15203 + }, + { + "epoch": 0.96, + "grad_norm": 2.2018407435742287, + "learning_rate": 5.030810532817732e-08, + "loss": 0.256, + "step": 15204 + }, + { + "epoch": 0.96, + "grad_norm": 2.7323948238513522, + "learning_rate": 5.016409446670312e-08, + "loss": 0.2473, + "step": 15205 + }, + { + "epoch": 0.96, + "grad_norm": 3.363654318878327, + "learning_rate": 5.002028898271227e-08, + "loss": 0.2645, + "step": 15206 + }, + { + "epoch": 0.96, + "grad_norm": 2.236699145240007, + "learning_rate": 4.987668888217223e-08, + "loss": 0.2473, + "step": 15207 + }, + { + "epoch": 0.96, + "grad_norm": 2.194310880329028, + "learning_rate": 4.9733294171041e-08, + "loss": 0.2478, + "step": 15208 + }, + { + "epoch": 0.96, + "grad_norm": 3.8973970223237857, + "learning_rate": 4.959010485526772e-08, + "loss": 0.2526, + "step": 15209 + }, + { + "epoch": 0.96, + "grad_norm": 0.6451938731953772, + "learning_rate": 4.9447120940793735e-08, + "loss": 0.466, + "step": 15210 + }, + { + "epoch": 0.96, + "grad_norm": 3.048792251373977, + "learning_rate": 4.9304342433552086e-08, + "loss": 0.2539, + "step": 15211 + }, + { + "epoch": 0.96, + "grad_norm": 5.019522967877576, + "learning_rate": 4.9161769339466926e-08, + "loss": 0.236, + "step": 15212 + }, + { + "epoch": 0.96, + "grad_norm": 1.906157398199811, + "learning_rate": 4.9019401664453516e-08, + "loss": 0.2635, + "step": 15213 + }, + { + "epoch": 0.96, + "grad_norm": 2.303859144721179, + "learning_rate": 4.887723941441824e-08, + "loss": 0.2594, + "step": 15214 + }, + { + "epoch": 0.96, + "grad_norm": 2.3740232477896925, + "learning_rate": 4.8735282595261393e-08, + "loss": 0.2337, + "step": 15215 + }, + { + "epoch": 0.96, + "grad_norm": 4.811416062202169, + "learning_rate": 4.8593531212872136e-08, + "loss": 0.268, + "step": 15216 + }, + { + "epoch": 0.96, + "grad_norm": 2.247098452250411, + "learning_rate": 4.845198527313188e-08, + "loss": 0.2526, + "step": 15217 + }, + { + "epoch": 0.96, + "grad_norm": 1.3133988129658465, + "learning_rate": 4.831064478191316e-08, + "loss": 0.2411, + "step": 15218 + }, + { + "epoch": 0.96, + "grad_norm": 3.008405293056618, + "learning_rate": 4.8169509745081275e-08, + "loss": 0.2535, + "step": 15219 + }, + { + "epoch": 0.96, + "grad_norm": 3.710309663320064, + "learning_rate": 4.80285801684921e-08, + "loss": 0.2553, + "step": 15220 + }, + { + "epoch": 0.96, + "grad_norm": 1.9966221546370129, + "learning_rate": 4.7887856057993175e-08, + "loss": 0.2604, + "step": 15221 + }, + { + "epoch": 0.96, + "grad_norm": 2.1092487647744567, + "learning_rate": 4.774733741942206e-08, + "loss": 0.2524, + "step": 15222 + }, + { + "epoch": 0.96, + "grad_norm": 0.5856665390191502, + "learning_rate": 4.7607024258610744e-08, + "loss": 0.4571, + "step": 15223 + }, + { + "epoch": 0.96, + "grad_norm": 1.4573738058872443, + "learning_rate": 4.746691658138014e-08, + "loss": 0.2579, + "step": 15224 + }, + { + "epoch": 0.96, + "grad_norm": 3.677805411856043, + "learning_rate": 4.732701439354448e-08, + "loss": 0.2383, + "step": 15225 + }, + { + "epoch": 0.96, + "grad_norm": 3.5062308680197356, + "learning_rate": 4.7187317700906896e-08, + "loss": 0.257, + "step": 15226 + }, + { + "epoch": 0.96, + "grad_norm": 3.9946536866791273, + "learning_rate": 4.704782650926554e-08, + "loss": 0.249, + "step": 15227 + }, + { + "epoch": 0.96, + "grad_norm": 1.8148324274248189, + "learning_rate": 4.6908540824406876e-08, + "loss": 0.2462, + "step": 15228 + }, + { + "epoch": 0.96, + "grad_norm": 2.0039248350527212, + "learning_rate": 4.676946065211074e-08, + "loss": 0.236, + "step": 15229 + }, + { + "epoch": 0.96, + "grad_norm": 4.078730454449582, + "learning_rate": 4.6630585998147515e-08, + "loss": 0.2339, + "step": 15230 + }, + { + "epoch": 0.96, + "grad_norm": 4.510118986408492, + "learning_rate": 4.649191686827925e-08, + "loss": 0.2611, + "step": 15231 + }, + { + "epoch": 0.96, + "grad_norm": 2.525831984473719, + "learning_rate": 4.635345326826024e-08, + "loss": 0.2518, + "step": 15232 + }, + { + "epoch": 0.96, + "grad_norm": 7.9945673532516714, + "learning_rate": 4.621519520383477e-08, + "loss": 0.2547, + "step": 15233 + }, + { + "epoch": 0.96, + "grad_norm": 2.5247985117279472, + "learning_rate": 4.607714268073937e-08, + "loss": 0.2339, + "step": 15234 + }, + { + "epoch": 0.96, + "grad_norm": 2.0937951549587672, + "learning_rate": 4.5939295704703344e-08, + "loss": 0.2394, + "step": 15235 + }, + { + "epoch": 0.96, + "grad_norm": 2.5020797471347267, + "learning_rate": 4.58016542814449e-08, + "loss": 0.2435, + "step": 15236 + }, + { + "epoch": 0.96, + "grad_norm": 1.6930752115180783, + "learning_rate": 4.5664218416675565e-08, + "loss": 0.2256, + "step": 15237 + }, + { + "epoch": 0.96, + "grad_norm": 1.582958312268396, + "learning_rate": 4.5526988116097457e-08, + "loss": 0.2464, + "step": 15238 + }, + { + "epoch": 0.96, + "grad_norm": 3.2690202315888017, + "learning_rate": 4.5389963385405467e-08, + "loss": 0.2457, + "step": 15239 + }, + { + "epoch": 0.96, + "grad_norm": 2.265770290941597, + "learning_rate": 4.525314423028393e-08, + "loss": 0.2421, + "step": 15240 + }, + { + "epoch": 0.96, + "grad_norm": 1.7212025690661907, + "learning_rate": 4.5116530656409415e-08, + "loss": 0.2394, + "step": 15241 + }, + { + "epoch": 0.96, + "grad_norm": 1.6488513883421396, + "learning_rate": 4.498012266945129e-08, + "loss": 0.2444, + "step": 15242 + }, + { + "epoch": 0.96, + "grad_norm": 2.300497615099552, + "learning_rate": 4.4843920275068896e-08, + "loss": 0.2392, + "step": 15243 + }, + { + "epoch": 0.96, + "grad_norm": 1.5748133550149743, + "learning_rate": 4.470792347891384e-08, + "loss": 0.2573, + "step": 15244 + }, + { + "epoch": 0.96, + "grad_norm": 1.558385985174884, + "learning_rate": 4.457213228662882e-08, + "loss": 0.2549, + "step": 15245 + }, + { + "epoch": 0.96, + "grad_norm": 4.0750824986953065, + "learning_rate": 4.443654670384767e-08, + "loss": 0.2583, + "step": 15246 + }, + { + "epoch": 0.96, + "grad_norm": 1.5004155359426379, + "learning_rate": 4.430116673619589e-08, + "loss": 0.2519, + "step": 15247 + }, + { + "epoch": 0.96, + "grad_norm": 2.177245468921564, + "learning_rate": 4.4165992389291754e-08, + "loss": 0.2343, + "step": 15248 + }, + { + "epoch": 0.96, + "grad_norm": 2.55994081459802, + "learning_rate": 4.4031023668742454e-08, + "loss": 0.2513, + "step": 15249 + }, + { + "epoch": 0.96, + "grad_norm": 2.236676502308055, + "learning_rate": 4.3896260580149067e-08, + "loss": 0.2675, + "step": 15250 + }, + { + "epoch": 0.96, + "grad_norm": 1.4437197521405114, + "learning_rate": 4.3761703129102664e-08, + "loss": 0.2501, + "step": 15251 + }, + { + "epoch": 0.96, + "grad_norm": 4.823777221525274, + "learning_rate": 4.362735132118656e-08, + "loss": 0.2368, + "step": 15252 + }, + { + "epoch": 0.96, + "grad_norm": 1.5703363684189728, + "learning_rate": 4.349320516197575e-08, + "loss": 0.2443, + "step": 15253 + }, + { + "epoch": 0.96, + "grad_norm": 2.0679800829988095, + "learning_rate": 4.335926465703466e-08, + "loss": 0.2592, + "step": 15254 + }, + { + "epoch": 0.96, + "grad_norm": 2.356343132527862, + "learning_rate": 4.3225529811922186e-08, + "loss": 0.2567, + "step": 15255 + }, + { + "epoch": 0.96, + "grad_norm": 1.9619603710102285, + "learning_rate": 4.309200063218666e-08, + "loss": 0.2355, + "step": 15256 + }, + { + "epoch": 0.96, + "grad_norm": 1.7831380507861792, + "learning_rate": 4.29586771233681e-08, + "loss": 0.2513, + "step": 15257 + }, + { + "epoch": 0.96, + "grad_norm": 1.860489360122903, + "learning_rate": 4.28255592909993e-08, + "loss": 0.248, + "step": 15258 + }, + { + "epoch": 0.96, + "grad_norm": 2.005892768479395, + "learning_rate": 4.2692647140602507e-08, + "loss": 0.2482, + "step": 15259 + }, + { + "epoch": 0.96, + "grad_norm": 2.023296402407743, + "learning_rate": 4.255994067769331e-08, + "loss": 0.2537, + "step": 15260 + }, + { + "epoch": 0.96, + "grad_norm": 2.220633330102731, + "learning_rate": 4.2427439907777866e-08, + "loss": 0.2471, + "step": 15261 + }, + { + "epoch": 0.96, + "grad_norm": 3.869521845471784, + "learning_rate": 4.229514483635288e-08, + "loss": 0.2561, + "step": 15262 + }, + { + "epoch": 0.96, + "grad_norm": 1.8901609326295237, + "learning_rate": 4.216305546890842e-08, + "loss": 0.2535, + "step": 15263 + }, + { + "epoch": 0.96, + "grad_norm": 2.390020115903151, + "learning_rate": 4.2031171810925083e-08, + "loss": 0.2356, + "step": 15264 + }, + { + "epoch": 0.96, + "grad_norm": 0.5936812413296529, + "learning_rate": 4.189949386787462e-08, + "loss": 0.4859, + "step": 15265 + }, + { + "epoch": 0.96, + "grad_norm": 3.337940223191999, + "learning_rate": 4.176802164522042e-08, + "loss": 0.2596, + "step": 15266 + }, + { + "epoch": 0.96, + "grad_norm": 2.4417081749550698, + "learning_rate": 4.163675514841814e-08, + "loss": 0.2438, + "step": 15267 + }, + { + "epoch": 0.96, + "grad_norm": 2.660431688649492, + "learning_rate": 4.1505694382913965e-08, + "loss": 0.2727, + "step": 15268 + }, + { + "epoch": 0.96, + "grad_norm": 14.53405796262389, + "learning_rate": 4.1374839354146325e-08, + "loss": 0.2781, + "step": 15269 + }, + { + "epoch": 0.96, + "grad_norm": 2.377763272319495, + "learning_rate": 4.12441900675431e-08, + "loss": 0.2716, + "step": 15270 + }, + { + "epoch": 0.96, + "grad_norm": 2.9765658395738166, + "learning_rate": 4.111374652852662e-08, + "loss": 0.2513, + "step": 15271 + }, + { + "epoch": 0.96, + "grad_norm": 1.5273943077283598, + "learning_rate": 4.098350874250867e-08, + "loss": 0.2389, + "step": 15272 + }, + { + "epoch": 0.96, + "grad_norm": 4.407040531772963, + "learning_rate": 4.085347671489382e-08, + "loss": 0.2365, + "step": 15273 + }, + { + "epoch": 0.96, + "grad_norm": 2.3047366264035998, + "learning_rate": 4.072365045107551e-08, + "loss": 0.2713, + "step": 15274 + }, + { + "epoch": 0.96, + "grad_norm": 1.8629879975745975, + "learning_rate": 4.059402995644224e-08, + "loss": 0.2543, + "step": 15275 + }, + { + "epoch": 0.96, + "grad_norm": 2.072581575390451, + "learning_rate": 4.046461523637191e-08, + "loss": 0.254, + "step": 15276 + }, + { + "epoch": 0.96, + "grad_norm": 2.3297485353315848, + "learning_rate": 4.033540629623356e-08, + "loss": 0.2598, + "step": 15277 + }, + { + "epoch": 0.96, + "grad_norm": 2.317567473137255, + "learning_rate": 4.020640314138846e-08, + "loss": 0.242, + "step": 15278 + }, + { + "epoch": 0.96, + "grad_norm": 2.165336164376101, + "learning_rate": 4.0077605777189e-08, + "loss": 0.2333, + "step": 15279 + }, + { + "epoch": 0.96, + "grad_norm": 3.028975643073122, + "learning_rate": 3.9949014208979784e-08, + "loss": 0.2373, + "step": 15280 + }, + { + "epoch": 0.96, + "grad_norm": 7.164789527296874, + "learning_rate": 3.9820628442096e-08, + "loss": 0.2557, + "step": 15281 + }, + { + "epoch": 0.96, + "grad_norm": 2.1806254702067998, + "learning_rate": 3.969244848186449e-08, + "loss": 0.267, + "step": 15282 + }, + { + "epoch": 0.96, + "grad_norm": 3.0176171263476332, + "learning_rate": 3.9564474333603776e-08, + "loss": 0.2469, + "step": 15283 + }, + { + "epoch": 0.96, + "grad_norm": 1.6824030486712855, + "learning_rate": 3.943670600262406e-08, + "loss": 0.2523, + "step": 15284 + }, + { + "epoch": 0.96, + "grad_norm": 1.7226523653560994, + "learning_rate": 3.9309143494226097e-08, + "loss": 0.2443, + "step": 15285 + }, + { + "epoch": 0.96, + "grad_norm": 1.582261031254253, + "learning_rate": 3.918178681370288e-08, + "loss": 0.2482, + "step": 15286 + }, + { + "epoch": 0.96, + "grad_norm": 1.6870161425798145, + "learning_rate": 3.905463596633852e-08, + "loss": 0.2538, + "step": 15287 + }, + { + "epoch": 0.96, + "grad_norm": 2.18981167878124, + "learning_rate": 3.8927690957409893e-08, + "loss": 0.2541, + "step": 15288 + }, + { + "epoch": 0.96, + "grad_norm": 1.774093789964343, + "learning_rate": 3.880095179218224e-08, + "loss": 0.2418, + "step": 15289 + }, + { + "epoch": 0.96, + "grad_norm": 1.8224984339852053, + "learning_rate": 3.86744184759158e-08, + "loss": 0.2511, + "step": 15290 + }, + { + "epoch": 0.96, + "grad_norm": 1.8674508338644273, + "learning_rate": 3.8548091013859704e-08, + "loss": 0.2294, + "step": 15291 + }, + { + "epoch": 0.96, + "grad_norm": 1.49393034334191, + "learning_rate": 3.842196941125587e-08, + "loss": 0.2419, + "step": 15292 + }, + { + "epoch": 0.96, + "grad_norm": 8.091091727312069, + "learning_rate": 3.8296053673337884e-08, + "loss": 0.2439, + "step": 15293 + }, + { + "epoch": 0.96, + "grad_norm": 9.896800212058695, + "learning_rate": 3.8170343805328245e-08, + "loss": 0.2341, + "step": 15294 + }, + { + "epoch": 0.96, + "grad_norm": 7.168275542320641, + "learning_rate": 3.8044839812445554e-08, + "loss": 0.2693, + "step": 15295 + }, + { + "epoch": 0.96, + "grad_norm": 1.7257924184085172, + "learning_rate": 3.791954169989509e-08, + "loss": 0.2476, + "step": 15296 + }, + { + "epoch": 0.96, + "grad_norm": 4.648661235010644, + "learning_rate": 3.7794449472877135e-08, + "loss": 0.2383, + "step": 15297 + }, + { + "epoch": 0.96, + "grad_norm": 1.5738737607136402, + "learning_rate": 3.7669563136580875e-08, + "loss": 0.2597, + "step": 15298 + }, + { + "epoch": 0.96, + "grad_norm": 1.802143903144524, + "learning_rate": 3.754488269618883e-08, + "loss": 0.2622, + "step": 15299 + }, + { + "epoch": 0.96, + "grad_norm": 2.3231835161972927, + "learning_rate": 3.742040815687353e-08, + "loss": 0.2636, + "step": 15300 + }, + { + "epoch": 0.96, + "grad_norm": 2.4970093285623225, + "learning_rate": 3.729613952380029e-08, + "loss": 0.2484, + "step": 15301 + }, + { + "epoch": 0.96, + "grad_norm": 3.997487054648361, + "learning_rate": 3.7172076802124426e-08, + "loss": 0.2698, + "step": 15302 + }, + { + "epoch": 0.96, + "grad_norm": 3.5618913184894834, + "learning_rate": 3.704821999699515e-08, + "loss": 0.2391, + "step": 15303 + }, + { + "epoch": 0.96, + "grad_norm": 1.5674667541409975, + "learning_rate": 3.692456911354947e-08, + "loss": 0.246, + "step": 15304 + }, + { + "epoch": 0.96, + "grad_norm": 1.9085429118159454, + "learning_rate": 3.6801124156919385e-08, + "loss": 0.2421, + "step": 15305 + }, + { + "epoch": 0.96, + "grad_norm": 1.8696135249059707, + "learning_rate": 3.667788513222581e-08, + "loss": 0.2791, + "step": 15306 + }, + { + "epoch": 0.96, + "grad_norm": 1.8700458518702057, + "learning_rate": 3.655485204458353e-08, + "loss": 0.2482, + "step": 15307 + }, + { + "epoch": 0.96, + "grad_norm": 1.9142051065697887, + "learning_rate": 3.6432024899095694e-08, + "loss": 0.2413, + "step": 15308 + }, + { + "epoch": 0.96, + "grad_norm": 2.4659636876350683, + "learning_rate": 3.630940370085934e-08, + "loss": 0.2555, + "step": 15309 + }, + { + "epoch": 0.96, + "grad_norm": 1.777845902709994, + "learning_rate": 3.61869884549626e-08, + "loss": 0.2485, + "step": 15310 + }, + { + "epoch": 0.96, + "grad_norm": 1.8942817230681803, + "learning_rate": 3.606477916648477e-08, + "loss": 0.2484, + "step": 15311 + }, + { + "epoch": 0.96, + "grad_norm": 1.51286077522307, + "learning_rate": 3.594277584049566e-08, + "loss": 0.2529, + "step": 15312 + }, + { + "epoch": 0.96, + "grad_norm": 2.048395491240227, + "learning_rate": 3.5820978482058454e-08, + "loss": 0.2435, + "step": 15313 + }, + { + "epoch": 0.96, + "grad_norm": 1.345472689640056, + "learning_rate": 3.569938709622578e-08, + "loss": 0.2331, + "step": 15314 + }, + { + "epoch": 0.96, + "grad_norm": 1.6320413597453367, + "learning_rate": 3.557800168804359e-08, + "loss": 0.2619, + "step": 15315 + }, + { + "epoch": 0.96, + "grad_norm": 1.702694496363717, + "learning_rate": 3.545682226254732e-08, + "loss": 0.2455, + "step": 15316 + }, + { + "epoch": 0.96, + "grad_norm": 1.936633360459089, + "learning_rate": 3.5335848824765706e-08, + "loss": 0.235, + "step": 15317 + }, + { + "epoch": 0.96, + "grad_norm": 1.488881124586551, + "learning_rate": 3.521508137971807e-08, + "loss": 0.25, + "step": 15318 + }, + { + "epoch": 0.96, + "grad_norm": 1.250987039169261, + "learning_rate": 3.5094519932415417e-08, + "loss": 0.2662, + "step": 15319 + }, + { + "epoch": 0.96, + "grad_norm": 2.0940118980209528, + "learning_rate": 3.4974164487859285e-08, + "loss": 0.2497, + "step": 15320 + }, + { + "epoch": 0.96, + "grad_norm": 2.0866958738940644, + "learning_rate": 3.485401505104458e-08, + "loss": 0.276, + "step": 15321 + }, + { + "epoch": 0.96, + "grad_norm": 7.139820748030344, + "learning_rate": 3.473407162695508e-08, + "loss": 0.2411, + "step": 15322 + }, + { + "epoch": 0.96, + "grad_norm": 2.1695480514946923, + "learning_rate": 3.4614334220569036e-08, + "loss": 0.2628, + "step": 15323 + }, + { + "epoch": 0.96, + "grad_norm": 1.8046521207051538, + "learning_rate": 3.449480283685302e-08, + "loss": 0.2346, + "step": 15324 + }, + { + "epoch": 0.96, + "grad_norm": 1.5955712789364611, + "learning_rate": 3.4375477480768084e-08, + "loss": 0.2492, + "step": 15325 + }, + { + "epoch": 0.96, + "grad_norm": 1.2857057519472268, + "learning_rate": 3.4256358157264135e-08, + "loss": 0.2481, + "step": 15326 + }, + { + "epoch": 0.96, + "grad_norm": 2.2482746178734794, + "learning_rate": 3.41374448712839e-08, + "loss": 0.2562, + "step": 15327 + }, + { + "epoch": 0.96, + "grad_norm": 3.036424485527265, + "learning_rate": 3.4018737627761754e-08, + "loss": 0.2627, + "step": 15328 + }, + { + "epoch": 0.96, + "grad_norm": 2.8016908421713853, + "learning_rate": 3.390023643162266e-08, + "loss": 0.2429, + "step": 15329 + }, + { + "epoch": 0.96, + "grad_norm": 1.7336543605057326, + "learning_rate": 3.3781941287783224e-08, + "loss": 0.2595, + "step": 15330 + }, + { + "epoch": 0.96, + "grad_norm": 2.2087324847340573, + "learning_rate": 3.3663852201152314e-08, + "loss": 0.2451, + "step": 15331 + }, + { + "epoch": 0.96, + "grad_norm": 2.0713500706815813, + "learning_rate": 3.354596917662989e-08, + "loss": 0.2574, + "step": 15332 + }, + { + "epoch": 0.96, + "grad_norm": 2.0011230273975733, + "learning_rate": 3.342829221910593e-08, + "loss": 0.2434, + "step": 15333 + }, + { + "epoch": 0.96, + "grad_norm": 2.4172602084071526, + "learning_rate": 3.331082133346375e-08, + "loss": 0.2604, + "step": 15334 + }, + { + "epoch": 0.96, + "grad_norm": 1.5952833301376854, + "learning_rate": 3.3193556524577784e-08, + "loss": 0.2522, + "step": 15335 + }, + { + "epoch": 0.96, + "grad_norm": 2.3358224769108644, + "learning_rate": 3.3076497797313034e-08, + "loss": 0.2514, + "step": 15336 + }, + { + "epoch": 0.96, + "grad_norm": 1.7516779276850174, + "learning_rate": 3.295964515652672e-08, + "loss": 0.2582, + "step": 15337 + }, + { + "epoch": 0.96, + "grad_norm": 1.5643769409130932, + "learning_rate": 3.284299860706719e-08, + "loss": 0.2499, + "step": 15338 + }, + { + "epoch": 0.96, + "grad_norm": 3.434310465857878, + "learning_rate": 3.27265581537739e-08, + "loss": 0.2326, + "step": 15339 + }, + { + "epoch": 0.96, + "grad_norm": 2.992712023337565, + "learning_rate": 3.2610323801479104e-08, + "loss": 0.2592, + "step": 15340 + }, + { + "epoch": 0.96, + "grad_norm": 2.135565201470293, + "learning_rate": 3.249429555500505e-08, + "loss": 0.2453, + "step": 15341 + }, + { + "epoch": 0.96, + "grad_norm": 1.8042016918879589, + "learning_rate": 3.2378473419165665e-08, + "loss": 0.2573, + "step": 15342 + }, + { + "epoch": 0.96, + "grad_norm": 2.7511426969549913, + "learning_rate": 3.226285739876711e-08, + "loss": 0.2443, + "step": 15343 + }, + { + "epoch": 0.96, + "grad_norm": 1.8925565070592203, + "learning_rate": 3.21474474986061e-08, + "loss": 0.2407, + "step": 15344 + }, + { + "epoch": 0.97, + "grad_norm": 1.81522924280968, + "learning_rate": 3.203224372347158e-08, + "loss": 0.2434, + "step": 15345 + }, + { + "epoch": 0.97, + "grad_norm": 2.4283249443979504, + "learning_rate": 3.191724607814306e-08, + "loss": 0.2528, + "step": 15346 + }, + { + "epoch": 0.97, + "grad_norm": 2.6450169322172354, + "learning_rate": 3.1802454567392837e-08, + "loss": 0.2834, + "step": 15347 + }, + { + "epoch": 0.97, + "grad_norm": 4.363494925275413, + "learning_rate": 3.1687869195983215e-08, + "loss": 0.2407, + "step": 15348 + }, + { + "epoch": 0.97, + "grad_norm": 1.6525994502119377, + "learning_rate": 3.157348996866816e-08, + "loss": 0.2457, + "step": 15349 + }, + { + "epoch": 0.97, + "grad_norm": 2.843293513615338, + "learning_rate": 3.145931689019388e-08, + "loss": 0.2526, + "step": 15350 + }, + { + "epoch": 0.97, + "grad_norm": 1.4776435375146455, + "learning_rate": 3.134534996529826e-08, + "loss": 0.2527, + "step": 15351 + }, + { + "epoch": 0.97, + "grad_norm": 2.082787644659762, + "learning_rate": 3.123158919870917e-08, + "loss": 0.2484, + "step": 15352 + }, + { + "epoch": 0.97, + "grad_norm": 2.3803864607492664, + "learning_rate": 3.111803459514673e-08, + "loss": 0.2596, + "step": 15353 + }, + { + "epoch": 0.97, + "grad_norm": 3.4063893010098014, + "learning_rate": 3.1004686159322726e-08, + "loss": 0.237, + "step": 15354 + }, + { + "epoch": 0.97, + "grad_norm": 1.6274120290660266, + "learning_rate": 3.089154389594062e-08, + "loss": 0.2482, + "step": 15355 + }, + { + "epoch": 0.97, + "grad_norm": 1.7085256371492532, + "learning_rate": 3.0778607809694436e-08, + "loss": 0.2512, + "step": 15356 + }, + { + "epoch": 0.97, + "grad_norm": 1.8195484725010698, + "learning_rate": 3.066587790526987e-08, + "loss": 0.241, + "step": 15357 + }, + { + "epoch": 0.97, + "grad_norm": 3.7289982967678537, + "learning_rate": 3.055335418734429e-08, + "loss": 0.2586, + "step": 15358 + }, + { + "epoch": 0.97, + "grad_norm": 2.061965627953282, + "learning_rate": 3.0441036660587284e-08, + "loss": 0.2397, + "step": 15359 + }, + { + "epoch": 0.97, + "grad_norm": 2.354734299441215, + "learning_rate": 3.032892532965848e-08, + "loss": 0.2589, + "step": 15360 + }, + { + "epoch": 0.97, + "grad_norm": 3.004312778668439, + "learning_rate": 3.021702019920969e-08, + "loss": 0.2737, + "step": 15361 + }, + { + "epoch": 0.97, + "grad_norm": 3.0144702909820316, + "learning_rate": 3.010532127388388e-08, + "loss": 0.2508, + "step": 15362 + }, + { + "epoch": 0.97, + "grad_norm": 2.4394008718255376, + "learning_rate": 2.999382855831623e-08, + "loss": 0.2509, + "step": 15363 + }, + { + "epoch": 0.97, + "grad_norm": 1.8584716934521521, + "learning_rate": 2.9882542057131926e-08, + "loss": 0.2515, + "step": 15364 + }, + { + "epoch": 0.97, + "grad_norm": 3.0465589848756323, + "learning_rate": 2.9771461774949494e-08, + "loss": 0.2526, + "step": 15365 + }, + { + "epoch": 0.97, + "grad_norm": 1.3813740686620206, + "learning_rate": 2.9660587716376366e-08, + "loss": 0.252, + "step": 15366 + }, + { + "epoch": 0.97, + "grad_norm": 2.745844013136599, + "learning_rate": 2.954991988601441e-08, + "loss": 0.2466, + "step": 15367 + }, + { + "epoch": 0.97, + "grad_norm": 2.638236706039188, + "learning_rate": 2.9439458288454404e-08, + "loss": 0.2594, + "step": 15368 + }, + { + "epoch": 0.97, + "grad_norm": 25.038448563217997, + "learning_rate": 2.9329202928280452e-08, + "loss": 0.2502, + "step": 15369 + }, + { + "epoch": 0.97, + "grad_norm": 1.5932460614647606, + "learning_rate": 2.9219153810066124e-08, + "loss": 0.2472, + "step": 15370 + }, + { + "epoch": 0.97, + "grad_norm": 2.268113488709931, + "learning_rate": 2.9109310938378875e-08, + "loss": 0.2559, + "step": 15371 + }, + { + "epoch": 0.97, + "grad_norm": 1.813926907059919, + "learning_rate": 2.899967431777506e-08, + "loss": 0.2567, + "step": 15372 + }, + { + "epoch": 0.97, + "grad_norm": 1.452551507658095, + "learning_rate": 2.889024395280493e-08, + "loss": 0.2387, + "step": 15373 + }, + { + "epoch": 0.97, + "grad_norm": 1.3181105248950122, + "learning_rate": 2.8781019848007628e-08, + "loss": 0.2358, + "step": 15374 + }, + { + "epoch": 0.97, + "grad_norm": 0.6590910298493007, + "learning_rate": 2.8672002007915646e-08, + "loss": 0.4294, + "step": 15375 + }, + { + "epoch": 0.97, + "grad_norm": 5.022896727496877, + "learning_rate": 2.856319043705258e-08, + "loss": 0.2814, + "step": 15376 + }, + { + "epoch": 0.97, + "grad_norm": 3.3888693843039, + "learning_rate": 2.8454585139933154e-08, + "loss": 0.2563, + "step": 15377 + }, + { + "epoch": 0.97, + "grad_norm": 1.4575689950990918, + "learning_rate": 2.8346186121063214e-08, + "loss": 0.2508, + "step": 15378 + }, + { + "epoch": 0.97, + "grad_norm": 2.189791923584707, + "learning_rate": 2.8237993384940822e-08, + "loss": 0.2488, + "step": 15379 + }, + { + "epoch": 0.97, + "grad_norm": 1.899432846434351, + "learning_rate": 2.8130006936055167e-08, + "loss": 0.2619, + "step": 15380 + }, + { + "epoch": 0.97, + "grad_norm": 1.512986310092415, + "learning_rate": 2.8022226778885997e-08, + "loss": 0.2454, + "step": 15381 + }, + { + "epoch": 0.97, + "grad_norm": 3.269107239754572, + "learning_rate": 2.7914652917906405e-08, + "loss": 0.2504, + "step": 15382 + }, + { + "epoch": 0.97, + "grad_norm": 1.9599260607219626, + "learning_rate": 2.7807285357578374e-08, + "loss": 0.2604, + "step": 15383 + }, + { + "epoch": 0.97, + "grad_norm": 1.6436723989716344, + "learning_rate": 2.7700124102358896e-08, + "loss": 0.2492, + "step": 15384 + }, + { + "epoch": 0.97, + "grad_norm": 1.8879250478746306, + "learning_rate": 2.759316915669219e-08, + "loss": 0.2588, + "step": 15385 + }, + { + "epoch": 0.97, + "grad_norm": 4.293526808767585, + "learning_rate": 2.7486420525017487e-08, + "loss": 0.2644, + "step": 15386 + }, + { + "epoch": 0.97, + "grad_norm": 1.6808194686052513, + "learning_rate": 2.737987821176291e-08, + "loss": 0.2605, + "step": 15387 + }, + { + "epoch": 0.97, + "grad_norm": 3.4008204885221236, + "learning_rate": 2.7273542221349925e-08, + "loss": 0.2593, + "step": 15388 + }, + { + "epoch": 0.97, + "grad_norm": 1.9107654947024302, + "learning_rate": 2.7167412558189997e-08, + "loss": 0.2473, + "step": 15389 + }, + { + "epoch": 0.97, + "grad_norm": 1.9418409289716292, + "learning_rate": 2.7061489226686832e-08, + "loss": 0.2492, + "step": 15390 + }, + { + "epoch": 0.97, + "grad_norm": 9.665281881741754, + "learning_rate": 2.6955772231235243e-08, + "loss": 0.2605, + "step": 15391 + }, + { + "epoch": 0.97, + "grad_norm": 1.6116081555458277, + "learning_rate": 2.685026157622228e-08, + "loss": 0.231, + "step": 15392 + }, + { + "epoch": 0.97, + "grad_norm": 4.538967965499641, + "learning_rate": 2.6744957266024996e-08, + "loss": 0.2465, + "step": 15393 + }, + { + "epoch": 0.97, + "grad_norm": 3.183306124935992, + "learning_rate": 2.663985930501267e-08, + "loss": 0.2425, + "step": 15394 + }, + { + "epoch": 0.97, + "grad_norm": 2.2725031333267514, + "learning_rate": 2.653496769754682e-08, + "loss": 0.2721, + "step": 15395 + }, + { + "epoch": 0.97, + "grad_norm": 3.0530465258193353, + "learning_rate": 2.6430282447978404e-08, + "loss": 0.2602, + "step": 15396 + }, + { + "epoch": 0.97, + "grad_norm": 0.6272586979980177, + "learning_rate": 2.6325803560652286e-08, + "loss": 0.4753, + "step": 15397 + }, + { + "epoch": 0.97, + "grad_norm": 1.4854977102106606, + "learning_rate": 2.6221531039902214e-08, + "loss": 0.2478, + "step": 15398 + }, + { + "epoch": 0.97, + "grad_norm": 1.603740926459719, + "learning_rate": 2.611746489005529e-08, + "loss": 0.2354, + "step": 15399 + }, + { + "epoch": 0.97, + "grad_norm": 2.184837014693623, + "learning_rate": 2.601360511542972e-08, + "loss": 0.2535, + "step": 15400 + }, + { + "epoch": 0.97, + "grad_norm": 1.5317966287391347, + "learning_rate": 2.5909951720334837e-08, + "loss": 0.2505, + "step": 15401 + }, + { + "epoch": 0.97, + "grad_norm": 2.6660374121280324, + "learning_rate": 2.580650470906998e-08, + "loss": 0.2465, + "step": 15402 + }, + { + "epoch": 0.97, + "grad_norm": 1.8738114424906243, + "learning_rate": 2.570326408592949e-08, + "loss": 0.2437, + "step": 15403 + }, + { + "epoch": 0.97, + "grad_norm": 2.308996330877488, + "learning_rate": 2.56002298551955e-08, + "loss": 0.2437, + "step": 15404 + }, + { + "epoch": 0.97, + "grad_norm": 1.550698509333058, + "learning_rate": 2.549740202114348e-08, + "loss": 0.2572, + "step": 15405 + }, + { + "epoch": 0.97, + "grad_norm": 2.6730352932360026, + "learning_rate": 2.5394780588039458e-08, + "loss": 0.2435, + "step": 15406 + }, + { + "epoch": 0.97, + "grad_norm": 2.4473176371846725, + "learning_rate": 2.5292365560142252e-08, + "loss": 0.2696, + "step": 15407 + }, + { + "epoch": 0.97, + "grad_norm": 2.624639056178406, + "learning_rate": 2.5190156941700684e-08, + "loss": 0.2465, + "step": 15408 + }, + { + "epoch": 0.97, + "grad_norm": 1.3593923238769758, + "learning_rate": 2.508815473695636e-08, + "loss": 0.25, + "step": 15409 + }, + { + "epoch": 0.97, + "grad_norm": 3.7386045395773486, + "learning_rate": 2.4986358950140343e-08, + "loss": 0.2516, + "step": 15410 + }, + { + "epoch": 0.97, + "grad_norm": 1.6639875402424946, + "learning_rate": 2.488476958547703e-08, + "loss": 0.2334, + "step": 15411 + }, + { + "epoch": 0.97, + "grad_norm": 2.0738530249733325, + "learning_rate": 2.478338664718194e-08, + "loss": 0.2721, + "step": 15412 + }, + { + "epoch": 0.97, + "grad_norm": 5.110522461368836, + "learning_rate": 2.4682210139460593e-08, + "loss": 0.2526, + "step": 15413 + }, + { + "epoch": 0.97, + "grad_norm": 2.3880459034330768, + "learning_rate": 2.4581240066511304e-08, + "loss": 0.2446, + "step": 15414 + }, + { + "epoch": 0.97, + "grad_norm": 2.31725079189721, + "learning_rate": 2.4480476432524048e-08, + "loss": 0.2545, + "step": 15415 + }, + { + "epoch": 0.97, + "grad_norm": 1.5664437641217384, + "learning_rate": 2.4379919241679373e-08, + "loss": 0.2427, + "step": 15416 + }, + { + "epoch": 0.97, + "grad_norm": 2.2831622809714314, + "learning_rate": 2.4279568498149497e-08, + "loss": 0.24, + "step": 15417 + }, + { + "epoch": 0.97, + "grad_norm": 2.498901627489701, + "learning_rate": 2.4179424206098314e-08, + "loss": 0.2312, + "step": 15418 + }, + { + "epoch": 0.97, + "grad_norm": 1.7619439479261354, + "learning_rate": 2.4079486369680826e-08, + "loss": 0.2488, + "step": 15419 + }, + { + "epoch": 0.97, + "grad_norm": 1.457464055928428, + "learning_rate": 2.3979754993043724e-08, + "loss": 0.2428, + "step": 15420 + }, + { + "epoch": 0.97, + "grad_norm": 1.5065920712687704, + "learning_rate": 2.3880230080324806e-08, + "loss": 0.2288, + "step": 15421 + }, + { + "epoch": 0.97, + "grad_norm": 2.4139226223849604, + "learning_rate": 2.3780911635653547e-08, + "loss": 0.2548, + "step": 15422 + }, + { + "epoch": 0.97, + "grad_norm": 2.3718508502763997, + "learning_rate": 2.3681799663151096e-08, + "loss": 0.237, + "step": 15423 + }, + { + "epoch": 0.97, + "grad_norm": 1.7419795838586325, + "learning_rate": 2.358289416693027e-08, + "loss": 0.2354, + "step": 15424 + }, + { + "epoch": 0.97, + "grad_norm": 3.1453250036548543, + "learning_rate": 2.3484195151093902e-08, + "loss": 0.2387, + "step": 15425 + }, + { + "epoch": 0.97, + "grad_norm": 0.6081404995631711, + "learning_rate": 2.338570261973705e-08, + "loss": 0.4764, + "step": 15426 + }, + { + "epoch": 0.97, + "grad_norm": 44.83175261527692, + "learning_rate": 2.328741657694755e-08, + "loss": 0.2558, + "step": 15427 + }, + { + "epoch": 0.97, + "grad_norm": 2.0605645560487424, + "learning_rate": 2.3189337026802705e-08, + "loss": 0.2572, + "step": 15428 + }, + { + "epoch": 0.97, + "grad_norm": 1.4817074203972622, + "learning_rate": 2.309146397337203e-08, + "loss": 0.2215, + "step": 15429 + }, + { + "epoch": 0.97, + "grad_norm": 1.8491849328290098, + "learning_rate": 2.2993797420716168e-08, + "loss": 0.2571, + "step": 15430 + }, + { + "epoch": 0.97, + "grad_norm": 1.38646516516461, + "learning_rate": 2.2896337372887988e-08, + "loss": 0.2439, + "step": 15431 + }, + { + "epoch": 0.97, + "grad_norm": 2.3661684537440792, + "learning_rate": 2.279908383393148e-08, + "loss": 0.2512, + "step": 15432 + }, + { + "epoch": 0.97, + "grad_norm": 2.6656297796028037, + "learning_rate": 2.270203680788119e-08, + "loss": 0.2546, + "step": 15433 + }, + { + "epoch": 0.97, + "grad_norm": 2.933725777283793, + "learning_rate": 2.26051962987639e-08, + "loss": 0.2518, + "step": 15434 + }, + { + "epoch": 0.97, + "grad_norm": 6.072465880182918, + "learning_rate": 2.2508562310598057e-08, + "loss": 0.2501, + "step": 15435 + }, + { + "epoch": 0.97, + "grad_norm": 2.1916570925011354, + "learning_rate": 2.2412134847392687e-08, + "loss": 0.2322, + "step": 15436 + }, + { + "epoch": 0.97, + "grad_norm": 1.8154144355138124, + "learning_rate": 2.2315913913149578e-08, + "loss": 0.2652, + "step": 15437 + }, + { + "epoch": 0.97, + "grad_norm": 3.5564233721356238, + "learning_rate": 2.221989951185999e-08, + "loss": 0.2607, + "step": 15438 + }, + { + "epoch": 0.97, + "grad_norm": 1.6670904655104286, + "learning_rate": 2.2124091647509062e-08, + "loss": 0.2332, + "step": 15439 + }, + { + "epoch": 0.97, + "grad_norm": 1.606807819661613, + "learning_rate": 2.202849032407084e-08, + "loss": 0.236, + "step": 15440 + }, + { + "epoch": 0.97, + "grad_norm": 1.765907588009598, + "learning_rate": 2.193309554551215e-08, + "loss": 0.2437, + "step": 15441 + }, + { + "epoch": 0.97, + "grad_norm": 3.44992618005198, + "learning_rate": 2.1837907315791495e-08, + "loss": 0.2725, + "step": 15442 + }, + { + "epoch": 0.97, + "grad_norm": 3.149696000434322, + "learning_rate": 2.174292563885849e-08, + "loss": 0.2739, + "step": 15443 + }, + { + "epoch": 0.97, + "grad_norm": 2.8933918783675505, + "learning_rate": 2.1648150518653875e-08, + "loss": 0.2779, + "step": 15444 + }, + { + "epoch": 0.97, + "grad_norm": 2.5328495552060035, + "learning_rate": 2.1553581959110058e-08, + "loss": 0.2617, + "step": 15445 + }, + { + "epoch": 0.97, + "grad_norm": 4.205367458624773, + "learning_rate": 2.145921996415057e-08, + "loss": 0.2679, + "step": 15446 + }, + { + "epoch": 0.97, + "grad_norm": 2.5079999903802914, + "learning_rate": 2.1365064537691162e-08, + "loss": 0.2599, + "step": 15447 + }, + { + "epoch": 0.97, + "grad_norm": 3.5755684981923537, + "learning_rate": 2.1271115683638154e-08, + "loss": 0.2687, + "step": 15448 + }, + { + "epoch": 0.97, + "grad_norm": 7.141764026252246, + "learning_rate": 2.11773734058901e-08, + "loss": 0.2464, + "step": 15449 + }, + { + "epoch": 0.97, + "grad_norm": 4.341899232552611, + "learning_rate": 2.1083837708335554e-08, + "loss": 0.2536, + "step": 15450 + }, + { + "epoch": 0.97, + "grad_norm": 2.1419034502819514, + "learning_rate": 2.0990508594856407e-08, + "loss": 0.2622, + "step": 15451 + }, + { + "epoch": 0.97, + "grad_norm": 1.9159169909098033, + "learning_rate": 2.089738606932512e-08, + "loss": 0.2618, + "step": 15452 + }, + { + "epoch": 0.97, + "grad_norm": 2.9294412910823078, + "learning_rate": 2.0804470135604714e-08, + "loss": 0.2548, + "step": 15453 + }, + { + "epoch": 0.97, + "grad_norm": 1.9508149838605389, + "learning_rate": 2.0711760797550996e-08, + "loss": 0.2355, + "step": 15454 + }, + { + "epoch": 0.97, + "grad_norm": 2.5438058980307416, + "learning_rate": 2.0619258059010883e-08, + "loss": 0.2515, + "step": 15455 + }, + { + "epoch": 0.97, + "grad_norm": 0.6200725801548135, + "learning_rate": 2.0526961923821864e-08, + "loss": 0.4887, + "step": 15456 + }, + { + "epoch": 0.97, + "grad_norm": 1.3413472880789359, + "learning_rate": 2.0434872395813655e-08, + "loss": 0.2345, + "step": 15457 + }, + { + "epoch": 0.97, + "grad_norm": 3.2186162078452494, + "learning_rate": 2.034298947880764e-08, + "loss": 0.2521, + "step": 15458 + }, + { + "epoch": 0.97, + "grad_norm": 2.486717289994227, + "learning_rate": 2.0251313176615218e-08, + "loss": 0.2518, + "step": 15459 + }, + { + "epoch": 0.97, + "grad_norm": 1.9950808963470017, + "learning_rate": 2.015984349304112e-08, + "loss": 0.2483, + "step": 15460 + }, + { + "epoch": 0.97, + "grad_norm": 3.413461480828005, + "learning_rate": 2.0068580431880647e-08, + "loss": 0.2463, + "step": 15461 + }, + { + "epoch": 0.97, + "grad_norm": 2.2821608242697047, + "learning_rate": 1.9977523996919658e-08, + "loss": 0.2445, + "step": 15462 + }, + { + "epoch": 0.97, + "grad_norm": 1.851568948622649, + "learning_rate": 1.9886674191937348e-08, + "loss": 0.2472, + "step": 15463 + }, + { + "epoch": 0.97, + "grad_norm": 3.0410138494571632, + "learning_rate": 1.9796031020702376e-08, + "loss": 0.2572, + "step": 15464 + }, + { + "epoch": 0.97, + "grad_norm": 1.6359560002023097, + "learning_rate": 1.970559448697562e-08, + "loss": 0.2428, + "step": 15465 + }, + { + "epoch": 0.97, + "grad_norm": 1.4668214706477742, + "learning_rate": 1.9615364594509633e-08, + "loss": 0.2549, + "step": 15466 + }, + { + "epoch": 0.97, + "grad_norm": 0.6060142031172588, + "learning_rate": 1.9525341347048643e-08, + "loss": 0.4653, + "step": 15467 + }, + { + "epoch": 0.97, + "grad_norm": 1.766276291352814, + "learning_rate": 1.943552474832744e-08, + "loss": 0.257, + "step": 15468 + }, + { + "epoch": 0.97, + "grad_norm": 1.4656088738430961, + "learning_rate": 1.934591480207304e-08, + "loss": 0.2327, + "step": 15469 + }, + { + "epoch": 0.97, + "grad_norm": 1.7355086601017375, + "learning_rate": 1.925651151200303e-08, + "loss": 0.2464, + "step": 15470 + }, + { + "epoch": 0.97, + "grad_norm": 1.817621729816515, + "learning_rate": 1.9167314881827214e-08, + "loss": 0.2401, + "step": 15471 + }, + { + "epoch": 0.97, + "grad_norm": 3.0191311831408743, + "learning_rate": 1.9078324915246527e-08, + "loss": 0.2651, + "step": 15472 + }, + { + "epoch": 0.97, + "grad_norm": 2.793188698041432, + "learning_rate": 1.898954161595301e-08, + "loss": 0.2711, + "step": 15473 + }, + { + "epoch": 0.97, + "grad_norm": 2.369588591877338, + "learning_rate": 1.8900964987630388e-08, + "loss": 0.2505, + "step": 15474 + }, + { + "epoch": 0.97, + "grad_norm": 2.923550585192585, + "learning_rate": 1.8812595033954607e-08, + "loss": 0.2578, + "step": 15475 + }, + { + "epoch": 0.97, + "grad_norm": 2.6308461250264963, + "learning_rate": 1.8724431758592177e-08, + "loss": 0.2697, + "step": 15476 + }, + { + "epoch": 0.97, + "grad_norm": 2.9505678082335662, + "learning_rate": 1.8636475165200176e-08, + "loss": 0.2364, + "step": 15477 + }, + { + "epoch": 0.97, + "grad_norm": 2.0326938089135087, + "learning_rate": 1.8548725257429014e-08, + "loss": 0.2496, + "step": 15478 + }, + { + "epoch": 0.97, + "grad_norm": 2.625344297906875, + "learning_rate": 1.8461182038919666e-08, + "loss": 0.2496, + "step": 15479 + }, + { + "epoch": 0.97, + "grad_norm": 2.096852475475712, + "learning_rate": 1.8373845513303124e-08, + "loss": 0.2666, + "step": 15480 + }, + { + "epoch": 0.97, + "grad_norm": 1.6312141642097142, + "learning_rate": 1.8286715684204815e-08, + "loss": 0.2385, + "step": 15481 + }, + { + "epoch": 0.97, + "grad_norm": 2.692134022410049, + "learning_rate": 1.819979255523907e-08, + "loss": 0.2397, + "step": 15482 + }, + { + "epoch": 0.97, + "grad_norm": 11.99791696158461, + "learning_rate": 1.8113076130012453e-08, + "loss": 0.2468, + "step": 15483 + }, + { + "epoch": 0.97, + "grad_norm": 2.399654529619578, + "learning_rate": 1.802656641212375e-08, + "loss": 0.2539, + "step": 15484 + }, + { + "epoch": 0.97, + "grad_norm": 2.4177192326430204, + "learning_rate": 1.7940263405161195e-08, + "loss": 0.2839, + "step": 15485 + }, + { + "epoch": 0.97, + "grad_norm": 1.2935373024116066, + "learning_rate": 1.785416711270638e-08, + "loss": 0.2504, + "step": 15486 + }, + { + "epoch": 0.97, + "grad_norm": 1.9160151498153903, + "learning_rate": 1.7768277538331435e-08, + "loss": 0.2612, + "step": 15487 + }, + { + "epoch": 0.97, + "grad_norm": 3.9762390016418894, + "learning_rate": 1.7682594685600184e-08, + "loss": 0.2539, + "step": 15488 + }, + { + "epoch": 0.97, + "grad_norm": 1.7756054644360466, + "learning_rate": 1.759711855806756e-08, + "loss": 0.2508, + "step": 15489 + }, + { + "epoch": 0.97, + "grad_norm": 0.6438743333723528, + "learning_rate": 1.751184915928017e-08, + "loss": 0.4626, + "step": 15490 + }, + { + "epoch": 0.97, + "grad_norm": 2.0046281064284197, + "learning_rate": 1.742678649277574e-08, + "loss": 0.2559, + "step": 15491 + }, + { + "epoch": 0.97, + "grad_norm": 1.5446583872634272, + "learning_rate": 1.7341930562084775e-08, + "loss": 0.2403, + "step": 15492 + }, + { + "epoch": 0.97, + "grad_norm": 1.942016087724381, + "learning_rate": 1.7257281370726688e-08, + "loss": 0.2788, + "step": 15493 + }, + { + "epoch": 0.97, + "grad_norm": 2.612550085794354, + "learning_rate": 1.7172838922214773e-08, + "loss": 0.2519, + "step": 15494 + }, + { + "epoch": 0.97, + "grad_norm": 1.6222136585881686, + "learning_rate": 1.7088603220051792e-08, + "loss": 0.2407, + "step": 15495 + }, + { + "epoch": 0.97, + "grad_norm": 1.5933577644267796, + "learning_rate": 1.7004574267733832e-08, + "loss": 0.248, + "step": 15496 + }, + { + "epoch": 0.97, + "grad_norm": 3.5081318471711267, + "learning_rate": 1.6920752068746438e-08, + "loss": 0.2567, + "step": 15497 + }, + { + "epoch": 0.97, + "grad_norm": 2.3722598268780937, + "learning_rate": 1.6837136626568496e-08, + "loss": 0.2556, + "step": 15498 + }, + { + "epoch": 0.97, + "grad_norm": 2.1696781911728378, + "learning_rate": 1.6753727944668342e-08, + "loss": 0.2415, + "step": 15499 + }, + { + "epoch": 0.97, + "grad_norm": 2.2465984209971315, + "learning_rate": 1.667052602650765e-08, + "loss": 0.2682, + "step": 15500 + }, + { + "epoch": 0.97, + "grad_norm": 3.073112480747819, + "learning_rate": 1.6587530875538106e-08, + "loss": 0.2451, + "step": 15501 + }, + { + "epoch": 0.97, + "grad_norm": 2.0039550855240713, + "learning_rate": 1.6504742495203064e-08, + "loss": 0.2682, + "step": 15502 + }, + { + "epoch": 0.97, + "grad_norm": 3.716537895168068, + "learning_rate": 1.642216088893811e-08, + "loss": 0.2615, + "step": 15503 + }, + { + "epoch": 0.98, + "grad_norm": 1.6264490824986113, + "learning_rate": 1.6339786060169393e-08, + "loss": 0.2498, + "step": 15504 + }, + { + "epoch": 0.98, + "grad_norm": 1.9944649389374496, + "learning_rate": 1.625761801231529e-08, + "loss": 0.2611, + "step": 15505 + }, + { + "epoch": 0.98, + "grad_norm": 2.9685160961845805, + "learning_rate": 1.617565674878474e-08, + "loss": 0.2435, + "step": 15506 + }, + { + "epoch": 0.98, + "grad_norm": 1.9792377528694678, + "learning_rate": 1.6093902272978357e-08, + "loss": 0.2532, + "step": 15507 + }, + { + "epoch": 0.98, + "grad_norm": 1.5625692901850257, + "learning_rate": 1.6012354588288425e-08, + "loss": 0.2504, + "step": 15508 + }, + { + "epoch": 0.98, + "grad_norm": 2.872568214140006, + "learning_rate": 1.5931013698098353e-08, + "loss": 0.2818, + "step": 15509 + }, + { + "epoch": 0.98, + "grad_norm": 6.658827039770151, + "learning_rate": 1.5849879605783214e-08, + "loss": 0.2456, + "step": 15510 + }, + { + "epoch": 0.98, + "grad_norm": 1.7288425240765597, + "learning_rate": 1.5768952314709763e-08, + "loss": 0.2483, + "step": 15511 + }, + { + "epoch": 0.98, + "grad_norm": 1.8807100627466924, + "learning_rate": 1.5688231828234757e-08, + "loss": 0.248, + "step": 15512 + }, + { + "epoch": 0.98, + "grad_norm": 1.8400376668637661, + "learning_rate": 1.560771814970885e-08, + "loss": 0.2626, + "step": 15513 + }, + { + "epoch": 0.98, + "grad_norm": 1.9019812541798318, + "learning_rate": 1.5527411282471594e-08, + "loss": 0.2478, + "step": 15514 + }, + { + "epoch": 0.98, + "grad_norm": 2.0171208771970015, + "learning_rate": 1.5447311229855876e-08, + "loss": 0.2468, + "step": 15515 + }, + { + "epoch": 0.98, + "grad_norm": 1.9905667456664624, + "learning_rate": 1.5367417995184597e-08, + "loss": 0.2474, + "step": 15516 + }, + { + "epoch": 0.98, + "grad_norm": 2.473877934475984, + "learning_rate": 1.5287731581772326e-08, + "loss": 0.2532, + "step": 15517 + }, + { + "epoch": 0.98, + "grad_norm": 1.4390865518188125, + "learning_rate": 1.5208251992926415e-08, + "loss": 0.2654, + "step": 15518 + }, + { + "epoch": 0.98, + "grad_norm": 2.4060498089965208, + "learning_rate": 1.512897923194423e-08, + "loss": 0.2412, + "step": 15519 + }, + { + "epoch": 0.98, + "grad_norm": 1.5454154221178362, + "learning_rate": 1.504991330211425e-08, + "loss": 0.2609, + "step": 15520 + }, + { + "epoch": 0.98, + "grad_norm": 3.583097433815207, + "learning_rate": 1.4971054206718294e-08, + "loss": 0.2747, + "step": 15521 + }, + { + "epoch": 0.98, + "grad_norm": 2.73913176597066, + "learning_rate": 1.489240194902708e-08, + "loss": 0.23, + "step": 15522 + }, + { + "epoch": 0.98, + "grad_norm": 2.3986844251397104, + "learning_rate": 1.4813956532305218e-08, + "loss": 0.2508, + "step": 15523 + }, + { + "epoch": 0.98, + "grad_norm": 1.873586140642064, + "learning_rate": 1.4735717959806773e-08, + "loss": 0.2375, + "step": 15524 + }, + { + "epoch": 0.98, + "grad_norm": 2.141798204858941, + "learning_rate": 1.4657686234778035e-08, + "loss": 0.2457, + "step": 15525 + }, + { + "epoch": 0.98, + "grad_norm": 3.5805240595153993, + "learning_rate": 1.4579861360457525e-08, + "loss": 0.2656, + "step": 15526 + }, + { + "epoch": 0.98, + "grad_norm": 1.7488438587677326, + "learning_rate": 1.4502243340072663e-08, + "loss": 0.2449, + "step": 15527 + }, + { + "epoch": 0.98, + "grad_norm": 2.712316081614816, + "learning_rate": 1.4424832176845871e-08, + "loss": 0.2637, + "step": 15528 + }, + { + "epoch": 0.98, + "grad_norm": 1.9890059149896477, + "learning_rate": 1.4347627873987912e-08, + "loss": 0.2353, + "step": 15529 + }, + { + "epoch": 0.98, + "grad_norm": 3.5888310231399467, + "learning_rate": 1.4270630434701782e-08, + "loss": 0.2559, + "step": 15530 + }, + { + "epoch": 0.98, + "grad_norm": 1.5157996681402393, + "learning_rate": 1.4193839862183812e-08, + "loss": 0.2622, + "step": 15531 + }, + { + "epoch": 0.98, + "grad_norm": 2.2218685234480606, + "learning_rate": 1.4117256159618676e-08, + "loss": 0.2466, + "step": 15532 + }, + { + "epoch": 0.98, + "grad_norm": 1.744680873333697, + "learning_rate": 1.4040879330184387e-08, + "loss": 0.2638, + "step": 15533 + }, + { + "epoch": 0.98, + "grad_norm": 2.730767422762499, + "learning_rate": 1.3964709377050079e-08, + "loss": 0.2698, + "step": 15534 + }, + { + "epoch": 0.98, + "grad_norm": 4.182953404008009, + "learning_rate": 1.3888746303376554e-08, + "loss": 0.242, + "step": 15535 + }, + { + "epoch": 0.98, + "grad_norm": 2.8149028322694094, + "learning_rate": 1.3812990112315184e-08, + "loss": 0.2658, + "step": 15536 + }, + { + "epoch": 0.98, + "grad_norm": 2.093215393823958, + "learning_rate": 1.3737440807009006e-08, + "loss": 0.2417, + "step": 15537 + }, + { + "epoch": 0.98, + "grad_norm": 5.060166318007312, + "learning_rate": 1.3662098390593292e-08, + "loss": 0.2604, + "step": 15538 + }, + { + "epoch": 0.98, + "grad_norm": 2.620758001831998, + "learning_rate": 1.3586962866193875e-08, + "loss": 0.244, + "step": 15539 + }, + { + "epoch": 0.98, + "grad_norm": 2.927748783609269, + "learning_rate": 1.3512034236927706e-08, + "loss": 0.2574, + "step": 15540 + }, + { + "epoch": 0.98, + "grad_norm": 2.2031237302777953, + "learning_rate": 1.3437312505905075e-08, + "loss": 0.2363, + "step": 15541 + }, + { + "epoch": 0.98, + "grad_norm": 2.546532599190121, + "learning_rate": 1.3362797676224614e-08, + "loss": 0.239, + "step": 15542 + }, + { + "epoch": 0.98, + "grad_norm": 0.5388820068781249, + "learning_rate": 1.3288489750979406e-08, + "loss": 0.4509, + "step": 15543 + }, + { + "epoch": 0.98, + "grad_norm": 1.9298914157208156, + "learning_rate": 1.3214388733252536e-08, + "loss": 0.2424, + "step": 15544 + }, + { + "epoch": 0.98, + "grad_norm": 1.7125518414190737, + "learning_rate": 1.3140494626117661e-08, + "loss": 0.2542, + "step": 15545 + }, + { + "epoch": 0.98, + "grad_norm": 2.8646247302854344, + "learning_rate": 1.3066807432641216e-08, + "loss": 0.254, + "step": 15546 + }, + { + "epoch": 0.98, + "grad_norm": 1.445369105515142, + "learning_rate": 1.2993327155880753e-08, + "loss": 0.2486, + "step": 15547 + }, + { + "epoch": 0.98, + "grad_norm": 2.2019131649084476, + "learning_rate": 1.2920053798885501e-08, + "loss": 0.2489, + "step": 15548 + }, + { + "epoch": 0.98, + "grad_norm": 2.1827807345721975, + "learning_rate": 1.2846987364695252e-08, + "loss": 0.2577, + "step": 15549 + }, + { + "epoch": 0.98, + "grad_norm": 2.213672803846767, + "learning_rate": 1.2774127856341467e-08, + "loss": 0.27, + "step": 15550 + }, + { + "epoch": 0.98, + "grad_norm": 1.7643336230298274, + "learning_rate": 1.2701475276847286e-08, + "loss": 0.2403, + "step": 15551 + }, + { + "epoch": 0.98, + "grad_norm": 2.206378727249492, + "learning_rate": 1.2629029629227518e-08, + "loss": 0.2517, + "step": 15552 + }, + { + "epoch": 0.98, + "grad_norm": 1.9358869169210136, + "learning_rate": 1.2556790916488093e-08, + "loss": 0.2446, + "step": 15553 + }, + { + "epoch": 0.98, + "grad_norm": 1.8228967276809194, + "learning_rate": 1.2484759141625502e-08, + "loss": 0.2459, + "step": 15554 + }, + { + "epoch": 0.98, + "grad_norm": 2.0295069711263496, + "learning_rate": 1.2412934307629575e-08, + "loss": 0.2421, + "step": 15555 + }, + { + "epoch": 0.98, + "grad_norm": 2.7658988365955928, + "learning_rate": 1.2341316417479598e-08, + "loss": 0.2474, + "step": 15556 + }, + { + "epoch": 0.98, + "grad_norm": 3.233087249645574, + "learning_rate": 1.2269905474147636e-08, + "loss": 0.2555, + "step": 15557 + }, + { + "epoch": 0.98, + "grad_norm": 2.5075996025079546, + "learning_rate": 1.2198701480596875e-08, + "loss": 0.2483, + "step": 15558 + }, + { + "epoch": 0.98, + "grad_norm": 1.6876271772076141, + "learning_rate": 1.2127704439781062e-08, + "loss": 0.2457, + "step": 15559 + }, + { + "epoch": 0.98, + "grad_norm": 1.793953200667405, + "learning_rate": 1.2056914354646176e-08, + "loss": 0.2621, + "step": 15560 + }, + { + "epoch": 0.98, + "grad_norm": 2.0639247294016263, + "learning_rate": 1.1986331228129311e-08, + "loss": 0.2438, + "step": 15561 + }, + { + "epoch": 0.98, + "grad_norm": 0.5793979500542205, + "learning_rate": 1.1915955063159235e-08, + "loss": 0.4536, + "step": 15562 + }, + { + "epoch": 0.98, + "grad_norm": 3.7061081259812236, + "learning_rate": 1.1845785862656389e-08, + "loss": 0.2573, + "step": 15563 + }, + { + "epoch": 0.98, + "grad_norm": 2.576161420494555, + "learning_rate": 1.177582362953178e-08, + "loss": 0.2634, + "step": 15564 + }, + { + "epoch": 0.98, + "grad_norm": 2.9030760447820265, + "learning_rate": 1.1706068366688083e-08, + "loss": 0.252, + "step": 15565 + }, + { + "epoch": 0.98, + "grad_norm": 3.0726290133180094, + "learning_rate": 1.1636520077020207e-08, + "loss": 0.2577, + "step": 15566 + }, + { + "epoch": 0.98, + "grad_norm": 1.5106248481346085, + "learning_rate": 1.1567178763413068e-08, + "loss": 0.2475, + "step": 15567 + }, + { + "epoch": 0.98, + "grad_norm": 1.9558351080447594, + "learning_rate": 1.149804442874436e-08, + "loss": 0.2494, + "step": 15568 + }, + { + "epoch": 0.98, + "grad_norm": 2.334714522306353, + "learning_rate": 1.1429117075882345e-08, + "loss": 0.2684, + "step": 15569 + }, + { + "epoch": 0.98, + "grad_norm": 2.570248109556965, + "learning_rate": 1.1360396707686961e-08, + "loss": 0.2425, + "step": 15570 + }, + { + "epoch": 0.98, + "grad_norm": 1.8808737821337238, + "learning_rate": 1.1291883327009257e-08, + "loss": 0.2564, + "step": 15571 + }, + { + "epoch": 0.98, + "grad_norm": 2.4292775300354967, + "learning_rate": 1.1223576936692515e-08, + "loss": 0.2506, + "step": 15572 + }, + { + "epoch": 0.98, + "grad_norm": 4.519205841181352, + "learning_rate": 1.115547753957058e-08, + "loss": 0.2727, + "step": 15573 + }, + { + "epoch": 0.98, + "grad_norm": 1.46784944440388, + "learning_rate": 1.1087585138469525e-08, + "loss": 0.2453, + "step": 15574 + }, + { + "epoch": 0.98, + "grad_norm": 0.6034317401022334, + "learning_rate": 1.101989973620543e-08, + "loss": 0.4681, + "step": 15575 + }, + { + "epoch": 0.98, + "grad_norm": 2.070420111968326, + "learning_rate": 1.095242133558716e-08, + "loss": 0.2451, + "step": 15576 + }, + { + "epoch": 0.98, + "grad_norm": 0.5708556517898078, + "learning_rate": 1.0885149939414141e-08, + "loss": 0.4573, + "step": 15577 + }, + { + "epoch": 0.98, + "grad_norm": 2.197566537185328, + "learning_rate": 1.0818085550478585e-08, + "loss": 0.2465, + "step": 15578 + }, + { + "epoch": 0.98, + "grad_norm": 1.6830089399389538, + "learning_rate": 1.0751228171561601e-08, + "loss": 0.2567, + "step": 15579 + }, + { + "epoch": 0.98, + "grad_norm": 7.2262230473066715, + "learning_rate": 1.0684577805438744e-08, + "loss": 0.2353, + "step": 15580 + }, + { + "epoch": 0.98, + "grad_norm": 3.2733926244862537, + "learning_rate": 1.0618134454874473e-08, + "loss": 0.2528, + "step": 15581 + }, + { + "epoch": 0.98, + "grad_norm": 3.0396419553233973, + "learning_rate": 1.0551898122626025e-08, + "loss": 0.2399, + "step": 15582 + }, + { + "epoch": 0.98, + "grad_norm": 4.229861437779296, + "learning_rate": 1.0485868811441757e-08, + "loss": 0.2535, + "step": 15583 + }, + { + "epoch": 0.98, + "grad_norm": 1.7751439642267388, + "learning_rate": 1.0420046524061145e-08, + "loss": 0.2878, + "step": 15584 + }, + { + "epoch": 0.98, + "grad_norm": 1.6262032897060499, + "learning_rate": 1.0354431263214782e-08, + "loss": 0.2643, + "step": 15585 + }, + { + "epoch": 0.98, + "grad_norm": 2.040780135198672, + "learning_rate": 1.0289023031626044e-08, + "loss": 0.2429, + "step": 15586 + }, + { + "epoch": 0.98, + "grad_norm": 1.965122746676793, + "learning_rate": 1.0223821832008873e-08, + "loss": 0.267, + "step": 15587 + }, + { + "epoch": 0.98, + "grad_norm": 1.4362883851264039, + "learning_rate": 1.0158827667067772e-08, + "loss": 0.2502, + "step": 15588 + }, + { + "epoch": 0.98, + "grad_norm": 3.511057216270994, + "learning_rate": 1.0094040539499473e-08, + "loss": 0.2568, + "step": 15589 + }, + { + "epoch": 0.98, + "grad_norm": 2.246104436570195, + "learning_rate": 1.0029460451992933e-08, + "loss": 0.2413, + "step": 15590 + }, + { + "epoch": 0.98, + "grad_norm": 1.6716867312882933, + "learning_rate": 9.965087407227125e-09, + "loss": 0.2654, + "step": 15591 + }, + { + "epoch": 0.98, + "grad_norm": 1.7472619600651458, + "learning_rate": 9.900921407873243e-09, + "loss": 0.2385, + "step": 15592 + }, + { + "epoch": 0.98, + "grad_norm": 1.725106848403077, + "learning_rate": 9.836962456593602e-09, + "loss": 0.2644, + "step": 15593 + }, + { + "epoch": 0.98, + "grad_norm": 1.6500384360411164, + "learning_rate": 9.77321055604219e-09, + "loss": 0.2547, + "step": 15594 + }, + { + "epoch": 0.98, + "grad_norm": 1.632168563033859, + "learning_rate": 9.709665708863558e-09, + "loss": 0.2495, + "step": 15595 + }, + { + "epoch": 0.98, + "grad_norm": 6.006364411613694, + "learning_rate": 9.646327917694486e-09, + "loss": 0.2405, + "step": 15596 + }, + { + "epoch": 0.98, + "grad_norm": 1.9932137846393272, + "learning_rate": 9.58319718516343e-09, + "loss": 0.2555, + "step": 15597 + }, + { + "epoch": 0.98, + "grad_norm": 1.9316758936017333, + "learning_rate": 9.5202735138894e-09, + "loss": 0.2499, + "step": 15598 + }, + { + "epoch": 0.98, + "grad_norm": 0.5850876663634432, + "learning_rate": 9.45755690648309e-09, + "loss": 0.4632, + "step": 15599 + }, + { + "epoch": 0.98, + "grad_norm": 1.855408833305095, + "learning_rate": 9.395047365547416e-09, + "loss": 0.2405, + "step": 15600 + }, + { + "epoch": 0.98, + "grad_norm": 2.236241917557413, + "learning_rate": 9.332744893675306e-09, + "loss": 0.2761, + "step": 15601 + }, + { + "epoch": 0.98, + "grad_norm": 1.6952567859126504, + "learning_rate": 9.27064949345191e-09, + "loss": 0.25, + "step": 15602 + }, + { + "epoch": 0.98, + "grad_norm": 1.9630476757870707, + "learning_rate": 9.208761167453507e-09, + "loss": 0.2625, + "step": 15603 + }, + { + "epoch": 0.98, + "grad_norm": 1.962904644754545, + "learning_rate": 9.147079918249146e-09, + "loss": 0.2604, + "step": 15604 + }, + { + "epoch": 0.98, + "grad_norm": 2.1188646288857007, + "learning_rate": 9.085605748396231e-09, + "loss": 0.242, + "step": 15605 + }, + { + "epoch": 0.98, + "grad_norm": 1.4531888245836753, + "learning_rate": 9.024338660447162e-09, + "loss": 0.2361, + "step": 15606 + }, + { + "epoch": 0.98, + "grad_norm": 2.7107240539871507, + "learning_rate": 8.963278656942687e-09, + "loss": 0.2487, + "step": 15607 + }, + { + "epoch": 0.98, + "grad_norm": 17.318751386047623, + "learning_rate": 8.902425740416886e-09, + "loss": 0.2462, + "step": 15608 + }, + { + "epoch": 0.98, + "grad_norm": 1.5774320024795008, + "learning_rate": 8.841779913394966e-09, + "loss": 0.2671, + "step": 15609 + }, + { + "epoch": 0.98, + "grad_norm": 1.6641220630708113, + "learning_rate": 8.781341178393244e-09, + "loss": 0.2442, + "step": 15610 + }, + { + "epoch": 0.98, + "grad_norm": 1.9925184886437746, + "learning_rate": 8.72110953791805e-09, + "loss": 0.2538, + "step": 15611 + }, + { + "epoch": 0.98, + "grad_norm": 2.4335687958758183, + "learning_rate": 8.66108499447016e-09, + "loss": 0.2492, + "step": 15612 + }, + { + "epoch": 0.98, + "grad_norm": 1.5373189443861768, + "learning_rate": 8.601267550539805e-09, + "loss": 0.2437, + "step": 15613 + }, + { + "epoch": 0.98, + "grad_norm": 0.5720932307561466, + "learning_rate": 8.541657208607778e-09, + "loss": 0.4494, + "step": 15614 + }, + { + "epoch": 0.98, + "grad_norm": 1.7043081794435382, + "learning_rate": 8.482253971148768e-09, + "loss": 0.266, + "step": 15615 + }, + { + "epoch": 0.98, + "grad_norm": 0.6149659013118141, + "learning_rate": 8.423057840626914e-09, + "loss": 0.4963, + "step": 15616 + }, + { + "epoch": 0.98, + "grad_norm": 2.613573159584638, + "learning_rate": 8.364068819498029e-09, + "loss": 0.2672, + "step": 15617 + }, + { + "epoch": 0.98, + "grad_norm": 2.908704026341994, + "learning_rate": 8.305286910210709e-09, + "loss": 0.2619, + "step": 15618 + }, + { + "epoch": 0.98, + "grad_norm": 1.9634236224880925, + "learning_rate": 8.246712115203004e-09, + "loss": 0.2433, + "step": 15619 + }, + { + "epoch": 0.98, + "grad_norm": 1.8058417900140946, + "learning_rate": 8.188344436905193e-09, + "loss": 0.2485, + "step": 15620 + }, + { + "epoch": 0.98, + "grad_norm": 2.0499520966327007, + "learning_rate": 8.130183877739784e-09, + "loss": 0.2506, + "step": 15621 + }, + { + "epoch": 0.98, + "grad_norm": 1.464304071789921, + "learning_rate": 8.072230440119843e-09, + "loss": 0.2453, + "step": 15622 + }, + { + "epoch": 0.98, + "grad_norm": 1.9687961653155635, + "learning_rate": 8.014484126449008e-09, + "loss": 0.2397, + "step": 15623 + }, + { + "epoch": 0.98, + "grad_norm": 3.0285807789622403, + "learning_rate": 7.9569449391248e-09, + "loss": 0.273, + "step": 15624 + }, + { + "epoch": 0.98, + "grad_norm": 9.591317288318226, + "learning_rate": 7.899612880533092e-09, + "loss": 0.2368, + "step": 15625 + }, + { + "epoch": 0.98, + "grad_norm": 3.6196789453325597, + "learning_rate": 7.842487953054202e-09, + "loss": 0.2361, + "step": 15626 + }, + { + "epoch": 0.98, + "grad_norm": 2.2419885920789207, + "learning_rate": 7.785570159056788e-09, + "loss": 0.2387, + "step": 15627 + }, + { + "epoch": 0.98, + "grad_norm": 1.905001739989543, + "learning_rate": 7.728859500903408e-09, + "loss": 0.2557, + "step": 15628 + }, + { + "epoch": 0.98, + "grad_norm": 1.7865140658066843, + "learning_rate": 7.672355980947177e-09, + "loss": 0.2647, + "step": 15629 + }, + { + "epoch": 0.98, + "grad_norm": 2.1961707316110086, + "learning_rate": 7.616059601532333e-09, + "loss": 0.2467, + "step": 15630 + }, + { + "epoch": 0.98, + "grad_norm": 2.7961826128246297, + "learning_rate": 7.55997036499423e-09, + "loss": 0.2543, + "step": 15631 + }, + { + "epoch": 0.98, + "grad_norm": 1.9538604690082362, + "learning_rate": 7.504088273661003e-09, + "loss": 0.2887, + "step": 15632 + }, + { + "epoch": 0.98, + "grad_norm": 3.2481173055607213, + "learning_rate": 7.448413329850801e-09, + "loss": 0.2598, + "step": 15633 + }, + { + "epoch": 0.98, + "grad_norm": 1.9807230966892162, + "learning_rate": 7.392945535873441e-09, + "loss": 0.2485, + "step": 15634 + }, + { + "epoch": 0.98, + "grad_norm": 1.754303864248791, + "learning_rate": 7.337684894030417e-09, + "loss": 0.2374, + "step": 15635 + }, + { + "epoch": 0.98, + "grad_norm": 7.07331941428063, + "learning_rate": 7.282631406615448e-09, + "loss": 0.2354, + "step": 15636 + }, + { + "epoch": 0.98, + "grad_norm": 1.7190981757398527, + "learning_rate": 7.2277850759117085e-09, + "loss": 0.2255, + "step": 15637 + }, + { + "epoch": 0.98, + "grad_norm": 2.5759893447618776, + "learning_rate": 7.173145904195156e-09, + "loss": 0.2581, + "step": 15638 + }, + { + "epoch": 0.98, + "grad_norm": 1.8790706297505653, + "learning_rate": 7.11871389373342e-09, + "loss": 0.2496, + "step": 15639 + }, + { + "epoch": 0.98, + "grad_norm": 9.577593385186562, + "learning_rate": 7.064489046784695e-09, + "loss": 0.2642, + "step": 15640 + }, + { + "epoch": 0.98, + "grad_norm": 2.6178861605317176, + "learning_rate": 7.010471365598292e-09, + "loss": 0.2423, + "step": 15641 + }, + { + "epoch": 0.98, + "grad_norm": 1.979658809139444, + "learning_rate": 6.956660852416308e-09, + "loss": 0.2475, + "step": 15642 + }, + { + "epoch": 0.98, + "grad_norm": 2.265061644184496, + "learning_rate": 6.903057509470845e-09, + "loss": 0.2513, + "step": 15643 + }, + { + "epoch": 0.98, + "grad_norm": 4.183084640261362, + "learning_rate": 6.8496613389867905e-09, + "loss": 0.2348, + "step": 15644 + }, + { + "epoch": 0.98, + "grad_norm": 3.0015993779853, + "learning_rate": 6.796472343178484e-09, + "loss": 0.2701, + "step": 15645 + }, + { + "epoch": 0.98, + "grad_norm": 9.768687284571175, + "learning_rate": 6.743490524254159e-09, + "loss": 0.2748, + "step": 15646 + }, + { + "epoch": 0.98, + "grad_norm": 2.518585898499266, + "learning_rate": 6.690715884410947e-09, + "loss": 0.2454, + "step": 15647 + }, + { + "epoch": 0.98, + "grad_norm": 32.006304581087356, + "learning_rate": 6.638148425838764e-09, + "loss": 0.2526, + "step": 15648 + }, + { + "epoch": 0.98, + "grad_norm": 1.733578765111345, + "learning_rate": 6.585788150719752e-09, + "loss": 0.2453, + "step": 15649 + }, + { + "epoch": 0.98, + "grad_norm": 4.082609978964282, + "learning_rate": 6.533635061225507e-09, + "loss": 0.2773, + "step": 15650 + }, + { + "epoch": 0.98, + "grad_norm": 3.1356784380508858, + "learning_rate": 6.481689159519855e-09, + "loss": 0.2572, + "step": 15651 + }, + { + "epoch": 0.98, + "grad_norm": 5.234922177587976, + "learning_rate": 6.429950447758293e-09, + "loss": 0.2598, + "step": 15652 + }, + { + "epoch": 0.98, + "grad_norm": 6.653254671064186, + "learning_rate": 6.378418928087993e-09, + "loss": 0.2567, + "step": 15653 + }, + { + "epoch": 0.98, + "grad_norm": 1.8209219972317714, + "learning_rate": 6.327094602646688e-09, + "loss": 0.2629, + "step": 15654 + }, + { + "epoch": 0.98, + "grad_norm": 2.331626509852034, + "learning_rate": 6.275977473563788e-09, + "loss": 0.2584, + "step": 15655 + }, + { + "epoch": 0.98, + "grad_norm": 1.9876333163916338, + "learning_rate": 6.2250675429609274e-09, + "loss": 0.2544, + "step": 15656 + }, + { + "epoch": 0.98, + "grad_norm": 2.5051829218987804, + "learning_rate": 6.174364812949752e-09, + "loss": 0.2595, + "step": 15657 + }, + { + "epoch": 0.98, + "grad_norm": 3.0610047525807573, + "learning_rate": 6.123869285634132e-09, + "loss": 0.2503, + "step": 15658 + }, + { + "epoch": 0.98, + "grad_norm": 1.385699698392544, + "learning_rate": 6.073580963109061e-09, + "loss": 0.2527, + "step": 15659 + }, + { + "epoch": 0.98, + "grad_norm": 2.234972326993217, + "learning_rate": 6.023499847461201e-09, + "loss": 0.2408, + "step": 15660 + }, + { + "epoch": 0.98, + "grad_norm": 1.8396623098913352, + "learning_rate": 5.973625940769445e-09, + "loss": 0.2506, + "step": 15661 + }, + { + "epoch": 0.98, + "grad_norm": 2.006516227653823, + "learning_rate": 5.9239592451015845e-09, + "loss": 0.2293, + "step": 15662 + }, + { + "epoch": 0.99, + "grad_norm": 4.629313662584817, + "learning_rate": 5.874499762519303e-09, + "loss": 0.2426, + "step": 15663 + }, + { + "epoch": 0.99, + "grad_norm": 1.5684966662259532, + "learning_rate": 5.825247495074849e-09, + "loss": 0.2422, + "step": 15664 + }, + { + "epoch": 0.99, + "grad_norm": 1.6761734588470107, + "learning_rate": 5.776202444811585e-09, + "loss": 0.2435, + "step": 15665 + }, + { + "epoch": 0.99, + "grad_norm": 5.046085493563638, + "learning_rate": 5.727364613763997e-09, + "loss": 0.2548, + "step": 15666 + }, + { + "epoch": 0.99, + "grad_norm": 2.0059501740205063, + "learning_rate": 5.678734003958797e-09, + "loss": 0.2685, + "step": 15667 + }, + { + "epoch": 0.99, + "grad_norm": 12.98830537254205, + "learning_rate": 5.63031061741437e-09, + "loss": 0.2554, + "step": 15668 + }, + { + "epoch": 0.99, + "grad_norm": 2.0175163559699087, + "learning_rate": 5.582094456139109e-09, + "loss": 0.2514, + "step": 15669 + }, + { + "epoch": 0.99, + "grad_norm": 3.5522079285470918, + "learning_rate": 5.534085522133637e-09, + "loss": 0.2492, + "step": 15670 + }, + { + "epoch": 0.99, + "grad_norm": 6.112999635984669, + "learning_rate": 5.486283817390248e-09, + "loss": 0.2524, + "step": 15671 + }, + { + "epoch": 0.99, + "grad_norm": 1.520334575559302, + "learning_rate": 5.438689343892356e-09, + "loss": 0.2453, + "step": 15672 + }, + { + "epoch": 0.99, + "grad_norm": 2.1018062447739836, + "learning_rate": 5.391302103615048e-09, + "loss": 0.2641, + "step": 15673 + }, + { + "epoch": 0.99, + "grad_norm": 1.4175783931073094, + "learning_rate": 5.344122098523419e-09, + "loss": 0.2365, + "step": 15674 + }, + { + "epoch": 0.99, + "grad_norm": 2.016727443018837, + "learning_rate": 5.2971493305753465e-09, + "loss": 0.2684, + "step": 15675 + }, + { + "epoch": 0.99, + "grad_norm": 3.2345848687961936, + "learning_rate": 5.2503838017203824e-09, + "loss": 0.258, + "step": 15676 + }, + { + "epoch": 0.99, + "grad_norm": 2.011374306576539, + "learning_rate": 5.2038255138991965e-09, + "loss": 0.2425, + "step": 15677 + }, + { + "epoch": 0.99, + "grad_norm": 14.356641889740356, + "learning_rate": 5.157474469042467e-09, + "loss": 0.231, + "step": 15678 + }, + { + "epoch": 0.99, + "grad_norm": 2.0070010729827286, + "learning_rate": 5.111330669074211e-09, + "loss": 0.2491, + "step": 15679 + }, + { + "epoch": 0.99, + "grad_norm": 1.776880313302346, + "learning_rate": 5.065394115909006e-09, + "loss": 0.2489, + "step": 15680 + }, + { + "epoch": 0.99, + "grad_norm": 2.8691306274525914, + "learning_rate": 5.019664811451997e-09, + "loss": 0.264, + "step": 15681 + }, + { + "epoch": 0.99, + "grad_norm": 2.12316657401478, + "learning_rate": 4.974142757601663e-09, + "loss": 0.2579, + "step": 15682 + }, + { + "epoch": 0.99, + "grad_norm": 0.6520729617270807, + "learning_rate": 4.92882795624594e-09, + "loss": 0.4264, + "step": 15683 + }, + { + "epoch": 0.99, + "grad_norm": 1.5793191284545007, + "learning_rate": 4.8837204092655465e-09, + "loss": 0.2545, + "step": 15684 + }, + { + "epoch": 0.99, + "grad_norm": 4.038925760909837, + "learning_rate": 4.838820118532317e-09, + "loss": 0.2648, + "step": 15685 + }, + { + "epoch": 0.99, + "grad_norm": 2.8814905684080334, + "learning_rate": 4.7941270859086505e-09, + "loss": 0.2638, + "step": 15686 + }, + { + "epoch": 0.99, + "grad_norm": 1.7749038360763734, + "learning_rate": 4.749641313248621e-09, + "loss": 0.2488, + "step": 15687 + }, + { + "epoch": 0.99, + "grad_norm": 6.117625713689163, + "learning_rate": 4.705362802399082e-09, + "loss": 0.273, + "step": 15688 + }, + { + "epoch": 0.99, + "grad_norm": 2.2739046925445483, + "learning_rate": 4.661291555196346e-09, + "loss": 0.2455, + "step": 15689 + }, + { + "epoch": 0.99, + "grad_norm": 4.335449567235336, + "learning_rate": 4.617427573469502e-09, + "loss": 0.2577, + "step": 15690 + }, + { + "epoch": 0.99, + "grad_norm": 1.9443615278019875, + "learning_rate": 4.573770859038207e-09, + "loss": 0.236, + "step": 15691 + }, + { + "epoch": 0.99, + "grad_norm": 2.722616558266238, + "learning_rate": 4.530321413714345e-09, + "loss": 0.2628, + "step": 15692 + }, + { + "epoch": 0.99, + "grad_norm": 2.163307092856542, + "learning_rate": 4.487079239300363e-09, + "loss": 0.2504, + "step": 15693 + }, + { + "epoch": 0.99, + "grad_norm": 1.5143142547497772, + "learning_rate": 4.444044337590381e-09, + "loss": 0.2548, + "step": 15694 + }, + { + "epoch": 0.99, + "grad_norm": 3.357737687257555, + "learning_rate": 4.401216710370193e-09, + "loss": 0.2414, + "step": 15695 + }, + { + "epoch": 0.99, + "grad_norm": 3.296475992183728, + "learning_rate": 4.358596359416712e-09, + "loss": 0.2516, + "step": 15696 + }, + { + "epoch": 0.99, + "grad_norm": 2.3018184705700144, + "learning_rate": 4.316183286497966e-09, + "loss": 0.2452, + "step": 15697 + }, + { + "epoch": 0.99, + "grad_norm": 2.0956838160957654, + "learning_rate": 4.273977493374215e-09, + "loss": 0.2639, + "step": 15698 + }, + { + "epoch": 0.99, + "grad_norm": 1.8657023384057065, + "learning_rate": 4.231978981796836e-09, + "loss": 0.2547, + "step": 15699 + }, + { + "epoch": 0.99, + "grad_norm": 2.5622571930646347, + "learning_rate": 4.190187753507769e-09, + "loss": 0.2471, + "step": 15700 + }, + { + "epoch": 0.99, + "grad_norm": 1.8443428570988152, + "learning_rate": 4.148603810241736e-09, + "loss": 0.2454, + "step": 15701 + }, + { + "epoch": 0.99, + "grad_norm": 2.6598271444935904, + "learning_rate": 4.107227153724025e-09, + "loss": 0.2499, + "step": 15702 + }, + { + "epoch": 0.99, + "grad_norm": 1.7876087746085303, + "learning_rate": 4.066057785670486e-09, + "loss": 0.2591, + "step": 15703 + }, + { + "epoch": 0.99, + "grad_norm": 2.151683819391787, + "learning_rate": 4.0250957077903055e-09, + "loss": 0.239, + "step": 15704 + }, + { + "epoch": 0.99, + "grad_norm": 2.6222148335428344, + "learning_rate": 3.984340921782681e-09, + "loss": 0.2608, + "step": 15705 + }, + { + "epoch": 0.99, + "grad_norm": 2.1699803301021197, + "learning_rate": 3.9437934293384824e-09, + "loss": 0.2648, + "step": 15706 + }, + { + "epoch": 0.99, + "grad_norm": 2.633422960341346, + "learning_rate": 3.903453232140808e-09, + "loss": 0.2743, + "step": 15707 + }, + { + "epoch": 0.99, + "grad_norm": 3.548380016945056, + "learning_rate": 3.863320331862763e-09, + "loss": 0.2577, + "step": 15708 + }, + { + "epoch": 0.99, + "grad_norm": 1.6850794141643164, + "learning_rate": 3.823394730169683e-09, + "loss": 0.2489, + "step": 15709 + }, + { + "epoch": 0.99, + "grad_norm": 1.8572656559590008, + "learning_rate": 3.78367642871802e-09, + "loss": 0.2517, + "step": 15710 + }, + { + "epoch": 0.99, + "grad_norm": 2.5703480181271305, + "learning_rate": 3.744165429155344e-09, + "loss": 0.2683, + "step": 15711 + }, + { + "epoch": 0.99, + "grad_norm": 2.6371975870181545, + "learning_rate": 3.7048617331225667e-09, + "loss": 0.2547, + "step": 15712 + }, + { + "epoch": 0.99, + "grad_norm": 1.740872662529989, + "learning_rate": 3.6657653422489392e-09, + "loss": 0.2464, + "step": 15713 + }, + { + "epoch": 0.99, + "grad_norm": 1.9950757202341172, + "learning_rate": 3.626876258157608e-09, + "loss": 0.2741, + "step": 15714 + }, + { + "epoch": 0.99, + "grad_norm": 3.7426453294714634, + "learning_rate": 3.5881944824611716e-09, + "loss": 0.2569, + "step": 15715 + }, + { + "epoch": 0.99, + "grad_norm": 2.5666520416987435, + "learning_rate": 3.5497200167655676e-09, + "loss": 0.2771, + "step": 15716 + }, + { + "epoch": 0.99, + "grad_norm": 3.1091677599967005, + "learning_rate": 3.511452862666742e-09, + "loss": 0.2572, + "step": 15717 + }, + { + "epoch": 0.99, + "grad_norm": 2.7261772790606957, + "learning_rate": 3.473393021752314e-09, + "loss": 0.2442, + "step": 15718 + }, + { + "epoch": 0.99, + "grad_norm": 3.517246950564713, + "learning_rate": 3.4355404956021297e-09, + "loss": 0.226, + "step": 15719 + }, + { + "epoch": 0.99, + "grad_norm": 2.6825080849977034, + "learning_rate": 3.3978952857854907e-09, + "loss": 0.2485, + "step": 15720 + }, + { + "epoch": 0.99, + "grad_norm": 2.79111575766408, + "learning_rate": 3.3604573938655903e-09, + "loss": 0.2495, + "step": 15721 + }, + { + "epoch": 0.99, + "grad_norm": 2.6244642837968324, + "learning_rate": 3.323226821395631e-09, + "loss": 0.2741, + "step": 15722 + }, + { + "epoch": 0.99, + "grad_norm": 1.8370531258603937, + "learning_rate": 3.2862035699199323e-09, + "loss": 0.2484, + "step": 15723 + }, + { + "epoch": 0.99, + "grad_norm": 1.320558568930975, + "learning_rate": 3.2493876409744886e-09, + "loss": 0.2359, + "step": 15724 + }, + { + "epoch": 0.99, + "grad_norm": 3.3111944569878298, + "learning_rate": 3.212779036087521e-09, + "loss": 0.258, + "step": 15725 + }, + { + "epoch": 0.99, + "grad_norm": 2.748964897584076, + "learning_rate": 3.17637775677726e-09, + "loss": 0.2621, + "step": 15726 + }, + { + "epoch": 0.99, + "grad_norm": 1.8969812154128607, + "learning_rate": 3.1401838045547197e-09, + "loss": 0.2515, + "step": 15727 + }, + { + "epoch": 0.99, + "grad_norm": 2.7619772801912825, + "learning_rate": 3.1041971809209204e-09, + "loss": 0.2615, + "step": 15728 + }, + { + "epoch": 0.99, + "grad_norm": 2.075026575288025, + "learning_rate": 3.068417887370223e-09, + "loss": 0.2475, + "step": 15729 + }, + { + "epoch": 0.99, + "grad_norm": 1.6039535760746912, + "learning_rate": 3.0328459253858854e-09, + "loss": 0.2509, + "step": 15730 + }, + { + "epoch": 0.99, + "grad_norm": 1.519921143423708, + "learning_rate": 2.9974812964445042e-09, + "loss": 0.2557, + "step": 15731 + }, + { + "epoch": 0.99, + "grad_norm": 2.1964090774125284, + "learning_rate": 2.9623240020132395e-09, + "loss": 0.2567, + "step": 15732 + }, + { + "epoch": 0.99, + "grad_norm": 3.817719585498989, + "learning_rate": 2.9273740435514787e-09, + "loss": 0.2522, + "step": 15733 + }, + { + "epoch": 0.99, + "grad_norm": 1.5741963012685687, + "learning_rate": 2.892631422508063e-09, + "loss": 0.2473, + "step": 15734 + }, + { + "epoch": 0.99, + "grad_norm": 3.812412797870793, + "learning_rate": 2.8580961403251726e-09, + "loss": 0.2561, + "step": 15735 + }, + { + "epoch": 0.99, + "grad_norm": 1.9472104741231293, + "learning_rate": 2.8237681984361054e-09, + "loss": 0.2661, + "step": 15736 + }, + { + "epoch": 0.99, + "grad_norm": 9.120596604382238, + "learning_rate": 2.789647598264722e-09, + "loss": 0.2563, + "step": 15737 + }, + { + "epoch": 0.99, + "grad_norm": 2.0668650554990875, + "learning_rate": 2.755734341227112e-09, + "loss": 0.2436, + "step": 15738 + }, + { + "epoch": 0.99, + "grad_norm": 1.5249412288243576, + "learning_rate": 2.722028428730483e-09, + "loss": 0.2396, + "step": 15739 + }, + { + "epoch": 0.99, + "grad_norm": 2.724048197980926, + "learning_rate": 2.6885298621726062e-09, + "loss": 0.2658, + "step": 15740 + }, + { + "epoch": 0.99, + "grad_norm": 2.8083641336883494, + "learning_rate": 2.6552386429434806e-09, + "loss": 0.2402, + "step": 15741 + }, + { + "epoch": 0.99, + "grad_norm": 2.129485889147207, + "learning_rate": 2.6221547724253337e-09, + "loss": 0.2671, + "step": 15742 + }, + { + "epoch": 0.99, + "grad_norm": 1.6534557948417292, + "learning_rate": 2.5892782519904015e-09, + "loss": 0.2695, + "step": 15743 + }, + { + "epoch": 0.99, + "grad_norm": 8.642079432730648, + "learning_rate": 2.5566090830025924e-09, + "loss": 0.2415, + "step": 15744 + }, + { + "epoch": 0.99, + "grad_norm": 1.7533545002534725, + "learning_rate": 2.5241472668174894e-09, + "loss": 0.2553, + "step": 15745 + }, + { + "epoch": 0.99, + "grad_norm": 1.678989742258969, + "learning_rate": 2.4918928047817924e-09, + "loss": 0.2423, + "step": 15746 + }, + { + "epoch": 0.99, + "grad_norm": 4.397889746513507, + "learning_rate": 2.459845698234431e-09, + "loss": 0.2508, + "step": 15747 + }, + { + "epoch": 0.99, + "grad_norm": 2.2909800370640574, + "learning_rate": 2.4280059485043418e-09, + "loss": 0.2471, + "step": 15748 + }, + { + "epoch": 0.99, + "grad_norm": 2.0282399774213684, + "learning_rate": 2.396373556913245e-09, + "loss": 0.2549, + "step": 15749 + }, + { + "epoch": 0.99, + "grad_norm": 2.6786566203014583, + "learning_rate": 2.364948524773425e-09, + "loss": 0.258, + "step": 15750 + }, + { + "epoch": 0.99, + "grad_norm": 2.319857187054752, + "learning_rate": 2.333730853388283e-09, + "loss": 0.2609, + "step": 15751 + }, + { + "epoch": 0.99, + "grad_norm": 1.984921089124113, + "learning_rate": 2.3027205440540047e-09, + "loss": 0.2581, + "step": 15752 + }, + { + "epoch": 0.99, + "grad_norm": 1.7630771807481402, + "learning_rate": 2.271917598056228e-09, + "loss": 0.2774, + "step": 15753 + }, + { + "epoch": 0.99, + "grad_norm": 6.303271519897911, + "learning_rate": 2.24132201667393e-09, + "loss": 0.2545, + "step": 15754 + }, + { + "epoch": 0.99, + "grad_norm": 3.4398670295687306, + "learning_rate": 2.210933801176096e-09, + "loss": 0.2438, + "step": 15755 + }, + { + "epoch": 0.99, + "grad_norm": 2.558926362763449, + "learning_rate": 2.1807529528239392e-09, + "loss": 0.2329, + "step": 15756 + }, + { + "epoch": 0.99, + "grad_norm": 1.820919925346816, + "learning_rate": 2.1507794728692354e-09, + "loss": 0.2495, + "step": 15757 + }, + { + "epoch": 0.99, + "grad_norm": 2.6281178551012414, + "learning_rate": 2.121013362555435e-09, + "loss": 0.2455, + "step": 15758 + }, + { + "epoch": 0.99, + "grad_norm": 8.12444545364323, + "learning_rate": 2.0914546231187717e-09, + "loss": 0.284, + "step": 15759 + }, + { + "epoch": 0.99, + "grad_norm": 3.310457589552341, + "learning_rate": 2.0621032557843755e-09, + "loss": 0.2672, + "step": 15760 + }, + { + "epoch": 0.99, + "grad_norm": 2.2508547205728386, + "learning_rate": 2.0329592617712724e-09, + "loss": 0.2461, + "step": 15761 + }, + { + "epoch": 0.99, + "grad_norm": 2.8216047821695156, + "learning_rate": 2.0040226422873844e-09, + "loss": 0.2507, + "step": 15762 + }, + { + "epoch": 0.99, + "grad_norm": 8.340007702846668, + "learning_rate": 1.975293398534528e-09, + "loss": 0.262, + "step": 15763 + }, + { + "epoch": 0.99, + "grad_norm": 2.0180235361840233, + "learning_rate": 1.9467715317039725e-09, + "loss": 0.2574, + "step": 15764 + }, + { + "epoch": 0.99, + "grad_norm": 3.4677583375723975, + "learning_rate": 1.918457042979216e-09, + "loss": 0.2313, + "step": 15765 + }, + { + "epoch": 0.99, + "grad_norm": 1.9576456015077983, + "learning_rate": 1.8903499335359842e-09, + "loss": 0.2334, + "step": 15766 + }, + { + "epoch": 0.99, + "grad_norm": 2.617471321302201, + "learning_rate": 1.8624502045389015e-09, + "loss": 0.2472, + "step": 15767 + }, + { + "epoch": 0.99, + "grad_norm": 2.239156455603878, + "learning_rate": 1.8347578571470404e-09, + "loss": 0.2591, + "step": 15768 + }, + { + "epoch": 0.99, + "grad_norm": 3.2377567823075415, + "learning_rate": 1.8072728925089266e-09, + "loss": 0.2524, + "step": 15769 + }, + { + "epoch": 0.99, + "grad_norm": 4.370827325043801, + "learning_rate": 1.7799953117642045e-09, + "loss": 0.2456, + "step": 15770 + }, + { + "epoch": 0.99, + "grad_norm": 3.1988984807116836, + "learning_rate": 1.752925116045856e-09, + "loss": 0.269, + "step": 15771 + }, + { + "epoch": 0.99, + "grad_norm": 1.7036850099691692, + "learning_rate": 1.7260623064763172e-09, + "loss": 0.2628, + "step": 15772 + }, + { + "epoch": 0.99, + "grad_norm": 2.498788690928352, + "learning_rate": 1.6994068841702515e-09, + "loss": 0.2555, + "step": 15773 + }, + { + "epoch": 0.99, + "grad_norm": 0.6238314332509672, + "learning_rate": 1.6729588502339966e-09, + "loss": 0.4649, + "step": 15774 + }, + { + "epoch": 0.99, + "grad_norm": 1.5360329641170327, + "learning_rate": 1.6467182057650077e-09, + "loss": 0.2423, + "step": 15775 + }, + { + "epoch": 0.99, + "grad_norm": 1.8678257484980374, + "learning_rate": 1.6206849518513036e-09, + "loss": 0.265, + "step": 15776 + }, + { + "epoch": 0.99, + "grad_norm": 1.4887046116583083, + "learning_rate": 1.5948590895736859e-09, + "loss": 0.2588, + "step": 15777 + }, + { + "epoch": 0.99, + "grad_norm": 1.4301680936463346, + "learning_rate": 1.5692406200035204e-09, + "loss": 0.2352, + "step": 15778 + }, + { + "epoch": 0.99, + "grad_norm": 1.3356872740475663, + "learning_rate": 1.5438295442032902e-09, + "loss": 0.2508, + "step": 15779 + }, + { + "epoch": 0.99, + "grad_norm": 3.7309428404312084, + "learning_rate": 1.5186258632282625e-09, + "loss": 0.2546, + "step": 15780 + }, + { + "epoch": 0.99, + "grad_norm": 1.6027635420829873, + "learning_rate": 1.493629578123712e-09, + "loss": 0.2466, + "step": 15781 + }, + { + "epoch": 0.99, + "grad_norm": 0.5931391141911617, + "learning_rate": 1.4688406899271424e-09, + "loss": 0.4778, + "step": 15782 + }, + { + "epoch": 0.99, + "grad_norm": 6.626739295954805, + "learning_rate": 1.44425919966662e-09, + "loss": 0.263, + "step": 15783 + }, + { + "epoch": 0.99, + "grad_norm": 2.968552764813573, + "learning_rate": 1.4198851083618847e-09, + "loss": 0.2673, + "step": 15784 + }, + { + "epoch": 0.99, + "grad_norm": 1.866082105062887, + "learning_rate": 1.395718417024905e-09, + "loss": 0.2523, + "step": 15785 + }, + { + "epoch": 0.99, + "grad_norm": 2.0104393193516383, + "learning_rate": 1.3717591266576569e-09, + "loss": 0.2483, + "step": 15786 + }, + { + "epoch": 0.99, + "grad_norm": 2.2977593434636496, + "learning_rate": 1.3480072382549003e-09, + "loss": 0.2622, + "step": 15787 + }, + { + "epoch": 0.99, + "grad_norm": 1.8680346634061324, + "learning_rate": 1.3244627528019582e-09, + "loss": 0.242, + "step": 15788 + }, + { + "epoch": 0.99, + "grad_norm": 2.6706379357274512, + "learning_rate": 1.3011256712758268e-09, + "loss": 0.245, + "step": 15789 + }, + { + "epoch": 0.99, + "grad_norm": 1.806097211194816, + "learning_rate": 1.2779959946446209e-09, + "loss": 0.2431, + "step": 15790 + }, + { + "epoch": 0.99, + "grad_norm": 1.9816721896288432, + "learning_rate": 1.2550737238681277e-09, + "loss": 0.2592, + "step": 15791 + }, + { + "epoch": 0.99, + "grad_norm": 2.773295092238538, + "learning_rate": 1.2323588598972536e-09, + "loss": 0.2515, + "step": 15792 + }, + { + "epoch": 0.99, + "grad_norm": 1.7284142664191817, + "learning_rate": 1.209851403675133e-09, + "loss": 0.2438, + "step": 15793 + }, + { + "epoch": 0.99, + "grad_norm": 2.970835538706795, + "learning_rate": 1.1875513561343531e-09, + "loss": 0.2575, + "step": 15794 + }, + { + "epoch": 0.99, + "grad_norm": 3.4136915313654406, + "learning_rate": 1.1654587182013955e-09, + "loss": 0.2522, + "step": 15795 + }, + { + "epoch": 0.99, + "grad_norm": 2.2831527404125618, + "learning_rate": 1.1435734907921936e-09, + "loss": 0.2637, + "step": 15796 + }, + { + "epoch": 0.99, + "grad_norm": 2.4595129764799224, + "learning_rate": 1.1218956748154652e-09, + "loss": 0.2572, + "step": 15797 + }, + { + "epoch": 0.99, + "grad_norm": 1.535759149745823, + "learning_rate": 1.100425271170491e-09, + "loss": 0.2367, + "step": 15798 + }, + { + "epoch": 0.99, + "grad_norm": 2.6849277462826926, + "learning_rate": 1.0791622807471147e-09, + "loss": 0.25, + "step": 15799 + }, + { + "epoch": 0.99, + "grad_norm": 2.577346126166828, + "learning_rate": 1.0581067044290738e-09, + "loss": 0.2434, + "step": 15800 + }, + { + "epoch": 0.99, + "grad_norm": 3.5184578024039044, + "learning_rate": 1.0372585430890037e-09, + "loss": 0.2529, + "step": 15801 + }, + { + "epoch": 0.99, + "grad_norm": 1.7656763929677726, + "learning_rate": 1.016617797592323e-09, + "loss": 0.2501, + "step": 15802 + }, + { + "epoch": 0.99, + "grad_norm": 2.271362466702832, + "learning_rate": 9.96184468795014e-10, + "loss": 0.2572, + "step": 15803 + }, + { + "epoch": 0.99, + "grad_norm": 1.4264577474246123, + "learning_rate": 9.75958557545842e-10, + "loss": 0.2431, + "step": 15804 + }, + { + "epoch": 0.99, + "grad_norm": 1.6250150979802267, + "learning_rate": 9.559400646830253e-10, + "loss": 0.2377, + "step": 15805 + }, + { + "epoch": 0.99, + "grad_norm": 2.1494566019549723, + "learning_rate": 9.36128991037566e-10, + "loss": 0.2628, + "step": 15806 + }, + { + "epoch": 0.99, + "grad_norm": 1.9165050424740038, + "learning_rate": 9.165253374315841e-10, + "loss": 0.2624, + "step": 15807 + }, + { + "epoch": 0.99, + "grad_norm": 1.6692706515997264, + "learning_rate": 8.971291046783182e-10, + "loss": 0.2418, + "step": 15808 + }, + { + "epoch": 0.99, + "grad_norm": 2.037366880622655, + "learning_rate": 8.779402935826797e-10, + "loss": 0.2546, + "step": 15809 + }, + { + "epoch": 0.99, + "grad_norm": 2.026183330748864, + "learning_rate": 8.589589049406988e-10, + "loss": 0.2605, + "step": 15810 + }, + { + "epoch": 0.99, + "grad_norm": 0.6477617273822848, + "learning_rate": 8.401849395395234e-10, + "loss": 0.4714, + "step": 15811 + }, + { + "epoch": 0.99, + "grad_norm": 1.7289607764256854, + "learning_rate": 8.216183981590853e-10, + "loss": 0.2438, + "step": 15812 + }, + { + "epoch": 0.99, + "grad_norm": 2.7450392671904176, + "learning_rate": 8.032592815693241e-10, + "loss": 0.2455, + "step": 15813 + }, + { + "epoch": 0.99, + "grad_norm": 5.457123771551411, + "learning_rate": 7.851075905312977e-10, + "loss": 0.2626, + "step": 15814 + }, + { + "epoch": 0.99, + "grad_norm": 2.861847714252405, + "learning_rate": 7.671633257994026e-10, + "loss": 0.2697, + "step": 15815 + }, + { + "epoch": 0.99, + "grad_norm": 2.0476718483286223, + "learning_rate": 7.494264881174884e-10, + "loss": 0.2674, + "step": 15816 + }, + { + "epoch": 0.99, + "grad_norm": 2.2708692136065403, + "learning_rate": 7.318970782210777e-10, + "loss": 0.2535, + "step": 15817 + }, + { + "epoch": 0.99, + "grad_norm": 2.011896763030752, + "learning_rate": 7.145750968384768e-10, + "loss": 0.2311, + "step": 15818 + }, + { + "epoch": 0.99, + "grad_norm": 1.89508554758061, + "learning_rate": 6.97460544687445e-10, + "loss": 0.264, + "step": 15819 + }, + { + "epoch": 0.99, + "grad_norm": 1.808696185442965, + "learning_rate": 6.805534224785248e-10, + "loss": 0.2623, + "step": 15820 + }, + { + "epoch": 0.99, + "grad_norm": 2.8490841953622192, + "learning_rate": 6.638537309133775e-10, + "loss": 0.2665, + "step": 15821 + }, + { + "epoch": 1.0, + "grad_norm": 2.448466173634456, + "learning_rate": 6.473614706847819e-10, + "loss": 0.2433, + "step": 15822 + }, + { + "epoch": 1.0, + "grad_norm": 2.379150334893618, + "learning_rate": 6.310766424771908e-10, + "loss": 0.2646, + "step": 15823 + }, + { + "epoch": 1.0, + "grad_norm": 1.778485870475655, + "learning_rate": 6.149992469661747e-10, + "loss": 0.29, + "step": 15824 + }, + { + "epoch": 1.0, + "grad_norm": 3.8603618420930625, + "learning_rate": 5.991292848184227e-10, + "loss": 0.2474, + "step": 15825 + }, + { + "epoch": 1.0, + "grad_norm": 3.0839198732403688, + "learning_rate": 5.83466756692852e-10, + "loss": 0.2466, + "step": 15826 + }, + { + "epoch": 1.0, + "grad_norm": 2.881392580062999, + "learning_rate": 5.680116632389432e-10, + "loss": 0.2509, + "step": 15827 + }, + { + "epoch": 1.0, + "grad_norm": 2.4326814213899266, + "learning_rate": 5.527640050984051e-10, + "loss": 0.2532, + "step": 15828 + }, + { + "epoch": 1.0, + "grad_norm": 2.111285348025681, + "learning_rate": 5.377237829040649e-10, + "loss": 0.2332, + "step": 15829 + }, + { + "epoch": 1.0, + "grad_norm": 6.345627418125003, + "learning_rate": 5.228909972793129e-10, + "loss": 0.259, + "step": 15830 + }, + { + "epoch": 1.0, + "grad_norm": 4.885811337884381, + "learning_rate": 5.082656488397675e-10, + "loss": 0.2576, + "step": 15831 + }, + { + "epoch": 1.0, + "grad_norm": 4.720055567255641, + "learning_rate": 4.938477381921658e-10, + "loss": 0.2491, + "step": 15832 + }, + { + "epoch": 1.0, + "grad_norm": 2.8205032582896923, + "learning_rate": 4.796372659354731e-10, + "loss": 0.2564, + "step": 15833 + }, + { + "epoch": 1.0, + "grad_norm": 1.7918494828314753, + "learning_rate": 4.656342326586627e-10, + "loss": 0.2618, + "step": 15834 + }, + { + "epoch": 1.0, + "grad_norm": 2.2529947044315977, + "learning_rate": 4.5183863894293633e-10, + "loss": 0.2503, + "step": 15835 + }, + { + "epoch": 1.0, + "grad_norm": 2.8860136646471233, + "learning_rate": 4.382504853600589e-10, + "loss": 0.2625, + "step": 15836 + }, + { + "epoch": 1.0, + "grad_norm": 2.084821422668374, + "learning_rate": 4.2486977247513384e-10, + "loss": 0.2601, + "step": 15837 + }, + { + "epoch": 1.0, + "grad_norm": 1.4990367180345032, + "learning_rate": 4.116965008421625e-10, + "loss": 0.2525, + "step": 15838 + }, + { + "epoch": 1.0, + "grad_norm": 11.549193529741228, + "learning_rate": 3.987306710079297e-10, + "loss": 0.2696, + "step": 15839 + }, + { + "epoch": 1.0, + "grad_norm": 0.6142007502113539, + "learning_rate": 3.859722835114488e-10, + "loss": 0.475, + "step": 15840 + }, + { + "epoch": 1.0, + "grad_norm": 2.1968732666911017, + "learning_rate": 3.734213388806307e-10, + "loss": 0.2521, + "step": 15841 + }, + { + "epoch": 1.0, + "grad_norm": 3.3236889276011943, + "learning_rate": 3.6107783763728033e-10, + "loss": 0.2618, + "step": 15842 + }, + { + "epoch": 1.0, + "grad_norm": 1.7506306941277392, + "learning_rate": 3.4894178029265537e-10, + "loss": 0.2204, + "step": 15843 + }, + { + "epoch": 1.0, + "grad_norm": 1.660360868628995, + "learning_rate": 3.3701316735135213e-10, + "loss": 0.2597, + "step": 15844 + }, + { + "epoch": 1.0, + "grad_norm": 2.311502716852735, + "learning_rate": 3.2529199930741996e-10, + "loss": 0.2475, + "step": 15845 + }, + { + "epoch": 1.0, + "grad_norm": 1.693419771168762, + "learning_rate": 3.1377827664769156e-10, + "loss": 0.2484, + "step": 15846 + }, + { + "epoch": 1.0, + "grad_norm": 0.6339411603170874, + "learning_rate": 3.024719998495629e-10, + "loss": 0.4808, + "step": 15847 + }, + { + "epoch": 1.0, + "grad_norm": 4.071155479781498, + "learning_rate": 2.9137316938265826e-10, + "loss": 0.2594, + "step": 15848 + }, + { + "epoch": 1.0, + "grad_norm": 1.9132552133377398, + "learning_rate": 2.804817857071651e-10, + "loss": 0.2572, + "step": 15849 + }, + { + "epoch": 1.0, + "grad_norm": 3.448989462257928, + "learning_rate": 2.697978492749442e-10, + "loss": 0.2482, + "step": 15850 + }, + { + "epoch": 1.0, + "grad_norm": 1.550668758577631, + "learning_rate": 2.593213605289746e-10, + "loss": 0.2423, + "step": 15851 + }, + { + "epoch": 1.0, + "grad_norm": 1.6655926186417245, + "learning_rate": 2.4905231990446367e-10, + "loss": 0.246, + "step": 15852 + }, + { + "epoch": 1.0, + "grad_norm": 2.202039656965659, + "learning_rate": 2.3899072782773703e-10, + "loss": 0.2553, + "step": 15853 + }, + { + "epoch": 1.0, + "grad_norm": 1.9603793889571737, + "learning_rate": 2.2913658471512834e-10, + "loss": 0.2544, + "step": 15854 + }, + { + "epoch": 1.0, + "grad_norm": 2.060598463139521, + "learning_rate": 2.1948989097686503e-10, + "loss": 0.2634, + "step": 15855 + }, + { + "epoch": 1.0, + "grad_norm": 2.0960565068295582, + "learning_rate": 2.1005064701262733e-10, + "loss": 0.2478, + "step": 15856 + }, + { + "epoch": 1.0, + "grad_norm": 18.82967641871937, + "learning_rate": 2.008188532132138e-10, + "loss": 0.2424, + "step": 15857 + }, + { + "epoch": 1.0, + "grad_norm": 1.9338115148910051, + "learning_rate": 1.9179450996331673e-10, + "loss": 0.2551, + "step": 15858 + }, + { + "epoch": 1.0, + "grad_norm": 4.993639365186874, + "learning_rate": 1.82977617635971e-10, + "loss": 0.2549, + "step": 15859 + }, + { + "epoch": 1.0, + "grad_norm": 1.7870016346903912, + "learning_rate": 1.7436817659810534e-10, + "loss": 0.26, + "step": 15860 + }, + { + "epoch": 1.0, + "grad_norm": 1.873555944531384, + "learning_rate": 1.6596618720610135e-10, + "loss": 0.2539, + "step": 15861 + }, + { + "epoch": 1.0, + "grad_norm": 1.8403489382086395, + "learning_rate": 1.5777164980856908e-10, + "loss": 0.2622, + "step": 15862 + }, + { + "epoch": 1.0, + "grad_norm": 1.8220043644736423, + "learning_rate": 1.4978456474634694e-10, + "loss": 0.2485, + "step": 15863 + }, + { + "epoch": 1.0, + "grad_norm": 3.5134512089914693, + "learning_rate": 1.4200493234972633e-10, + "loss": 0.2714, + "step": 15864 + }, + { + "epoch": 1.0, + "grad_norm": 1.8601289860607761, + "learning_rate": 1.3443275294233727e-10, + "loss": 0.2703, + "step": 15865 + }, + { + "epoch": 1.0, + "grad_norm": 3.55878366768738, + "learning_rate": 1.2706802683837282e-10, + "loss": 0.2404, + "step": 15866 + }, + { + "epoch": 1.0, + "grad_norm": 2.2219665945766827, + "learning_rate": 1.1991075434258924e-10, + "loss": 0.255, + "step": 15867 + }, + { + "epoch": 1.0, + "grad_norm": 0.5993892094123481, + "learning_rate": 1.1296093575308142e-10, + "loss": 0.4483, + "step": 15868 + }, + { + "epoch": 1.0, + "grad_norm": 1.4817994356802835, + "learning_rate": 1.062185713573971e-10, + "loss": 0.2407, + "step": 15869 + }, + { + "epoch": 1.0, + "grad_norm": 1.5873082189193888, + "learning_rate": 9.96836614353125e-11, + "loss": 0.2353, + "step": 15870 + }, + { + "epoch": 1.0, + "grad_norm": 1.6658670506839874, + "learning_rate": 9.335620625827712e-11, + "loss": 0.2402, + "step": 15871 + }, + { + "epoch": 1.0, + "grad_norm": 1.5357579254195182, + "learning_rate": 8.723620608885875e-11, + "loss": 0.2492, + "step": 15872 + }, + { + "epoch": 1.0, + "grad_norm": 1.9062473174883259, + "learning_rate": 8.13236611807433e-11, + "loss": 0.2477, + "step": 15873 + }, + { + "epoch": 1.0, + "grad_norm": 29.69040100379666, + "learning_rate": 7.56185717792901e-11, + "loss": 0.2662, + "step": 15874 + }, + { + "epoch": 1.0, + "grad_norm": 4.6464851171485275, + "learning_rate": 7.012093812153175e-11, + "loss": 0.2362, + "step": 15875 + }, + { + "epoch": 1.0, + "grad_norm": 2.762516786693178, + "learning_rate": 6.483076043506398e-11, + "loss": 0.2303, + "step": 15876 + }, + { + "epoch": 1.0, + "grad_norm": 4.826107944026365, + "learning_rate": 5.974803893971093e-11, + "loss": 0.2523, + "step": 15877 + }, + { + "epoch": 1.0, + "grad_norm": 5.22165720210061, + "learning_rate": 5.4872773846414984e-11, + "loss": 0.2604, + "step": 15878 + }, + { + "epoch": 1.0, + "grad_norm": 13.016177664600205, + "learning_rate": 5.0204965357236735e-11, + "loss": 0.2421, + "step": 15879 + }, + { + "epoch": 1.0, + "grad_norm": 2.134437829279411, + "learning_rate": 4.5744613665910096e-11, + "loss": 0.2467, + "step": 15880 + }, + { + "epoch": 1.0, + "grad_norm": 3.2137617734187565, + "learning_rate": 4.149171895784232e-11, + "loss": 0.2435, + "step": 15881 + }, + { + "epoch": 1.0, + "grad_norm": 2.1415668129522456, + "learning_rate": 3.744628140900375e-11, + "loss": 0.2699, + "step": 15882 + }, + { + "epoch": 1.0, + "grad_norm": 2.4250313758395325, + "learning_rate": 3.360830118759317e-11, + "loss": 0.2462, + "step": 15883 + }, + { + "epoch": 1.0, + "grad_norm": 2.481129533467969, + "learning_rate": 2.997777845237249e-11, + "loss": 0.2671, + "step": 15884 + }, + { + "epoch": 1.0, + "grad_norm": 1.5735979939100921, + "learning_rate": 2.655471335433202e-11, + "loss": 0.2434, + "step": 15885 + }, + { + "epoch": 1.0, + "grad_norm": 2.087847386570869, + "learning_rate": 2.3339106036135428e-11, + "loss": 0.2548, + "step": 15886 + }, + { + "epoch": 1.0, + "grad_norm": 1.5564851815198115, + "learning_rate": 2.0330956629899256e-11, + "loss": 0.239, + "step": 15887 + }, + { + "epoch": 1.0, + "grad_norm": 3.614140017072922, + "learning_rate": 1.7530265261633817e-11, + "loss": 0.2525, + "step": 15888 + }, + { + "epoch": 1.0, + "grad_norm": 1.4842722954885819, + "learning_rate": 1.493703204624719e-11, + "loss": 0.2512, + "step": 15889 + }, + { + "epoch": 1.0, + "grad_norm": 1.3875573976995257, + "learning_rate": 1.2551257092541236e-11, + "loss": 0.2483, + "step": 15890 + }, + { + "epoch": 1.0, + "grad_norm": 3.1632674817088717, + "learning_rate": 1.0372940499325801e-11, + "loss": 0.2434, + "step": 15891 + }, + { + "epoch": 1.0, + "grad_norm": 2.6108113224693525, + "learning_rate": 8.402082356528951e-12, + "loss": 0.2375, + "step": 15892 + }, + { + "epoch": 1.0, + "grad_norm": 3.0574607295809564, + "learning_rate": 6.638682745752078e-12, + "loss": 0.2501, + "step": 15893 + }, + { + "epoch": 1.0, + "grad_norm": 1.9223745668222756, + "learning_rate": 5.082741740825015e-12, + "loss": 0.2481, + "step": 15894 + }, + { + "epoch": 1.0, + "grad_norm": 2.911128079235186, + "learning_rate": 3.734259406140694e-12, + "loss": 0.2495, + "step": 15895 + }, + { + "epoch": 1.0, + "grad_norm": 0.6302976214860023, + "learning_rate": 2.5932357972102695e-12, + "loss": 0.4538, + "step": 15896 + }, + { + "epoch": 1.0, + "grad_norm": 1.5654230457121145, + "learning_rate": 1.6596709617733298e-12, + "loss": 0.2426, + "step": 15897 + }, + { + "epoch": 1.0, + "grad_norm": 2.76785231886163, + "learning_rate": 9.33564938687681e-13, + "loss": 0.2583, + "step": 15898 + }, + { + "epoch": 1.0, + "grad_norm": 1.8857348369708409, + "learning_rate": 4.149177579293451e-13, + "loss": 0.2469, + "step": 15899 + }, + { + "epoch": 1.0, + "grad_norm": 1.9988417197412622, + "learning_rate": 1.037294405925593e-13, + "loss": 0.2535, + "step": 15900 + }, + { + "epoch": 1.0, + "grad_norm": 7.689559692396786, + "learning_rate": 0.0, + "loss": 0.2487, + "step": 15901 + }, + { + "epoch": 1.0, + "step": 15901, + "total_flos": 1.4822327104372736e+16, + "train_loss": 0.06416682437559874, + "train_runtime": 143299.6981, + "train_samples_per_second": 28.408, + "train_steps_per_second": 0.111 + } + ], + "logging_steps": 1.0, + "max_steps": 15901, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 3000, + "total_flos": 1.4822327104372736e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}