{ "best_metric": null, "best_model_checkpoint": null, "epoch": 10.0, "eval_steps": 500, "global_step": 7180, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.001392757660167131, "grad_norm": 2.8908393383026123, "learning_rate": 4.861111111111111e-07, "loss": 9.5583, "step": 1 }, { "epoch": 0.002785515320334262, "grad_norm": 2.8815290927886963, "learning_rate": 9.722222222222222e-07, "loss": 9.6058, "step": 2 }, { "epoch": 0.004178272980501393, "grad_norm": 2.6509928703308105, "learning_rate": 1.458333333333333e-06, "loss": 9.549, "step": 3 }, { "epoch": 0.005571030640668524, "grad_norm": 2.6149637699127197, "learning_rate": 1.9444444444444444e-06, "loss": 9.5588, "step": 4 }, { "epoch": 0.006963788300835654, "grad_norm": 3.3958027362823486, "learning_rate": 2.4305555555555552e-06, "loss": 9.584, "step": 5 }, { "epoch": 0.008356545961002786, "grad_norm": 2.5388824939727783, "learning_rate": 2.916666666666666e-06, "loss": 9.5191, "step": 6 }, { "epoch": 0.009749303621169917, "grad_norm": 2.521509885787964, "learning_rate": 3.4027777777777774e-06, "loss": 9.5599, "step": 7 }, { "epoch": 0.011142061281337047, "grad_norm": 2.5034923553466797, "learning_rate": 3.888888888888889e-06, "loss": 9.5376, "step": 8 }, { "epoch": 0.012534818941504178, "grad_norm": 2.9331307411193848, "learning_rate": 4.375e-06, "loss": 9.5846, "step": 9 }, { "epoch": 0.013927576601671309, "grad_norm": 2.613980531692505, "learning_rate": 4.8611111111111105e-06, "loss": 9.5551, "step": 10 }, { "epoch": 0.01532033426183844, "grad_norm": 2.579277276992798, "learning_rate": 5.347222222222222e-06, "loss": 9.5235, "step": 11 }, { "epoch": 0.016713091922005572, "grad_norm": 2.4165875911712646, "learning_rate": 5.833333333333332e-06, "loss": 9.4958, "step": 12 }, { "epoch": 0.018105849582172703, "grad_norm": 2.5124220848083496, "learning_rate": 6.319444444444444e-06, "loss": 9.5346, "step": 13 }, { "epoch": 0.019498607242339833, "grad_norm": 2.4491915702819824, "learning_rate": 6.805555555555555e-06, "loss": 9.5037, "step": 14 }, { "epoch": 0.020891364902506964, "grad_norm": 2.270697593688965, "learning_rate": 7.291666666666667e-06, "loss": 9.5149, "step": 15 }, { "epoch": 0.022284122562674095, "grad_norm": 2.4208738803863525, "learning_rate": 7.777777777777777e-06, "loss": 9.5268, "step": 16 }, { "epoch": 0.023676880222841225, "grad_norm": 2.4248204231262207, "learning_rate": 8.263888888888888e-06, "loss": 9.5157, "step": 17 }, { "epoch": 0.025069637883008356, "grad_norm": 2.4280309677124023, "learning_rate": 8.75e-06, "loss": 9.4933, "step": 18 }, { "epoch": 0.026462395543175487, "grad_norm": 3.640717029571533, "learning_rate": 9.23611111111111e-06, "loss": 9.5245, "step": 19 }, { "epoch": 0.027855153203342618, "grad_norm": 2.6451287269592285, "learning_rate": 9.722222222222221e-06, "loss": 9.5873, "step": 20 }, { "epoch": 0.02924791086350975, "grad_norm": 2.3361599445343018, "learning_rate": 1.0208333333333334e-05, "loss": 9.5214, "step": 21 }, { "epoch": 0.03064066852367688, "grad_norm": 2.440614700317383, "learning_rate": 1.0694444444444444e-05, "loss": 9.5212, "step": 22 }, { "epoch": 0.03203342618384401, "grad_norm": 2.296144962310791, "learning_rate": 1.1180555555555554e-05, "loss": 9.4204, "step": 23 }, { "epoch": 0.033426183844011144, "grad_norm": 2.450535774230957, "learning_rate": 1.1666666666666665e-05, "loss": 9.5338, "step": 24 }, { "epoch": 0.034818941504178275, "grad_norm": 2.552171230316162, "learning_rate": 1.2152777777777777e-05, "loss": 9.5031, "step": 25 }, { "epoch": 0.036211699164345405, "grad_norm": 2.9361586570739746, "learning_rate": 1.2638888888888888e-05, "loss": 9.4962, "step": 26 }, { "epoch": 0.037604456824512536, "grad_norm": 2.195554733276367, "learning_rate": 1.3124999999999999e-05, "loss": 9.4691, "step": 27 }, { "epoch": 0.03899721448467967, "grad_norm": 2.3282525539398193, "learning_rate": 1.361111111111111e-05, "loss": 9.4448, "step": 28 }, { "epoch": 0.0403899721448468, "grad_norm": 2.380357503890991, "learning_rate": 1.409722222222222e-05, "loss": 9.4604, "step": 29 }, { "epoch": 0.04178272980501393, "grad_norm": 2.1941680908203125, "learning_rate": 1.4583333333333333e-05, "loss": 9.5046, "step": 30 }, { "epoch": 0.04317548746518106, "grad_norm": 2.200751543045044, "learning_rate": 1.5069444444444444e-05, "loss": 9.4559, "step": 31 }, { "epoch": 0.04456824512534819, "grad_norm": 2.4295594692230225, "learning_rate": 1.5555555555555555e-05, "loss": 9.5226, "step": 32 }, { "epoch": 0.04596100278551532, "grad_norm": 2.3435890674591064, "learning_rate": 1.6041666666666666e-05, "loss": 9.4528, "step": 33 }, { "epoch": 0.04735376044568245, "grad_norm": 3.0860228538513184, "learning_rate": 1.6527777777777777e-05, "loss": 9.4985, "step": 34 }, { "epoch": 0.04874651810584958, "grad_norm": 2.2005224227905273, "learning_rate": 1.7013888888888888e-05, "loss": 9.4564, "step": 35 }, { "epoch": 0.05013927576601671, "grad_norm": 2.600155830383301, "learning_rate": 1.75e-05, "loss": 9.4709, "step": 36 }, { "epoch": 0.05153203342618384, "grad_norm": 2.273688793182373, "learning_rate": 1.798611111111111e-05, "loss": 9.5097, "step": 37 }, { "epoch": 0.052924791086350974, "grad_norm": 2.3479037284851074, "learning_rate": 1.847222222222222e-05, "loss": 9.4957, "step": 38 }, { "epoch": 0.054317548746518104, "grad_norm": 2.629591464996338, "learning_rate": 1.895833333333333e-05, "loss": 9.5332, "step": 39 }, { "epoch": 0.055710306406685235, "grad_norm": 2.1469650268554688, "learning_rate": 1.9444444444444442e-05, "loss": 9.4229, "step": 40 }, { "epoch": 0.057103064066852366, "grad_norm": 2.278468132019043, "learning_rate": 1.9930555555555553e-05, "loss": 9.4944, "step": 41 }, { "epoch": 0.0584958217270195, "grad_norm": 1.952836275100708, "learning_rate": 2.0416666666666667e-05, "loss": 9.4444, "step": 42 }, { "epoch": 0.05988857938718663, "grad_norm": 2.0497686862945557, "learning_rate": 2.0902777777777775e-05, "loss": 9.4685, "step": 43 }, { "epoch": 0.06128133704735376, "grad_norm": 2.3993453979492188, "learning_rate": 2.138888888888889e-05, "loss": 9.4273, "step": 44 }, { "epoch": 0.06267409470752089, "grad_norm": 2.091829299926758, "learning_rate": 2.1874999999999996e-05, "loss": 9.4461, "step": 45 }, { "epoch": 0.06406685236768803, "grad_norm": 2.043539047241211, "learning_rate": 2.2361111111111107e-05, "loss": 9.4678, "step": 46 }, { "epoch": 0.06545961002785515, "grad_norm": 2.401531219482422, "learning_rate": 2.284722222222222e-05, "loss": 9.4719, "step": 47 }, { "epoch": 0.06685236768802229, "grad_norm": 1.9853050708770752, "learning_rate": 2.333333333333333e-05, "loss": 9.4406, "step": 48 }, { "epoch": 0.06824512534818941, "grad_norm": 1.8597462177276611, "learning_rate": 2.3819444444444443e-05, "loss": 9.4266, "step": 49 }, { "epoch": 0.06963788300835655, "grad_norm": 2.1002442836761475, "learning_rate": 2.4305555555555554e-05, "loss": 9.4377, "step": 50 }, { "epoch": 0.07103064066852367, "grad_norm": 2.0721702575683594, "learning_rate": 2.4791666666666665e-05, "loss": 9.378, "step": 51 }, { "epoch": 0.07242339832869081, "grad_norm": 2.1750292778015137, "learning_rate": 2.5277777777777776e-05, "loss": 9.4031, "step": 52 }, { "epoch": 0.07381615598885793, "grad_norm": 2.0567100048065186, "learning_rate": 2.5763888888888887e-05, "loss": 9.4175, "step": 53 }, { "epoch": 0.07520891364902507, "grad_norm": 1.9709560871124268, "learning_rate": 2.6249999999999998e-05, "loss": 9.4306, "step": 54 }, { "epoch": 0.0766016713091922, "grad_norm": 2.089951992034912, "learning_rate": 2.673611111111111e-05, "loss": 9.3999, "step": 55 }, { "epoch": 0.07799442896935933, "grad_norm": 2.057718515396118, "learning_rate": 2.722222222222222e-05, "loss": 9.4385, "step": 56 }, { "epoch": 0.07938718662952646, "grad_norm": 2.2145371437072754, "learning_rate": 2.770833333333333e-05, "loss": 9.4618, "step": 57 }, { "epoch": 0.0807799442896936, "grad_norm": 2.0873260498046875, "learning_rate": 2.819444444444444e-05, "loss": 9.4036, "step": 58 }, { "epoch": 0.08217270194986072, "grad_norm": 2.0868418216705322, "learning_rate": 2.8680555555555552e-05, "loss": 9.4577, "step": 59 }, { "epoch": 0.08356545961002786, "grad_norm": 2.0237319469451904, "learning_rate": 2.9166666666666666e-05, "loss": 9.4307, "step": 60 }, { "epoch": 0.08495821727019498, "grad_norm": 1.981904149055481, "learning_rate": 2.9652777777777774e-05, "loss": 9.3878, "step": 61 }, { "epoch": 0.08635097493036212, "grad_norm": 1.8592807054519653, "learning_rate": 3.0138888888888888e-05, "loss": 9.4487, "step": 62 }, { "epoch": 0.08774373259052924, "grad_norm": 2.1793065071105957, "learning_rate": 3.0625e-05, "loss": 9.4536, "step": 63 }, { "epoch": 0.08913649025069638, "grad_norm": 1.99411141872406, "learning_rate": 3.111111111111111e-05, "loss": 9.4251, "step": 64 }, { "epoch": 0.0905292479108635, "grad_norm": 2.7959253787994385, "learning_rate": 3.159722222222222e-05, "loss": 9.4595, "step": 65 }, { "epoch": 0.09192200557103064, "grad_norm": 1.9733268022537231, "learning_rate": 3.208333333333333e-05, "loss": 9.448, "step": 66 }, { "epoch": 0.09331476323119778, "grad_norm": 1.9736051559448242, "learning_rate": 3.256944444444444e-05, "loss": 9.421, "step": 67 }, { "epoch": 0.0947075208913649, "grad_norm": 2.1202316284179688, "learning_rate": 3.3055555555555553e-05, "loss": 9.4151, "step": 68 }, { "epoch": 0.09610027855153204, "grad_norm": 1.9673542976379395, "learning_rate": 3.3541666666666664e-05, "loss": 9.433, "step": 69 }, { "epoch": 0.09749303621169916, "grad_norm": 2.3342862129211426, "learning_rate": 3.4027777777777775e-05, "loss": 9.4349, "step": 70 }, { "epoch": 0.0988857938718663, "grad_norm": 2.252122402191162, "learning_rate": 3.4513888888888886e-05, "loss": 9.3299, "step": 71 }, { "epoch": 0.10027855153203342, "grad_norm": 1.9738399982452393, "learning_rate": 3.5e-05, "loss": 9.3686, "step": 72 }, { "epoch": 0.10167130919220056, "grad_norm": 1.9239985942840576, "learning_rate": 3.548611111111111e-05, "loss": 9.3583, "step": 73 }, { "epoch": 0.10306406685236769, "grad_norm": 2.0035247802734375, "learning_rate": 3.597222222222222e-05, "loss": 9.3615, "step": 74 }, { "epoch": 0.10445682451253482, "grad_norm": 2.1391425132751465, "learning_rate": 3.645833333333333e-05, "loss": 9.3704, "step": 75 }, { "epoch": 0.10584958217270195, "grad_norm": 2.5549750328063965, "learning_rate": 3.694444444444444e-05, "loss": 9.3467, "step": 76 }, { "epoch": 0.10724233983286909, "grad_norm": 3.201791763305664, "learning_rate": 3.743055555555555e-05, "loss": 9.3759, "step": 77 }, { "epoch": 0.10863509749303621, "grad_norm": 2.117368698120117, "learning_rate": 3.791666666666666e-05, "loss": 9.4053, "step": 78 }, { "epoch": 0.11002785515320335, "grad_norm": 2.5113062858581543, "learning_rate": 3.840277777777778e-05, "loss": 9.3376, "step": 79 }, { "epoch": 0.11142061281337047, "grad_norm": 2.192519187927246, "learning_rate": 3.8888888888888884e-05, "loss": 9.417, "step": 80 }, { "epoch": 0.11281337047353761, "grad_norm": 2.4800620079040527, "learning_rate": 3.9374999999999995e-05, "loss": 9.3578, "step": 81 }, { "epoch": 0.11420612813370473, "grad_norm": 2.34975004196167, "learning_rate": 3.9861111111111106e-05, "loss": 9.3705, "step": 82 }, { "epoch": 0.11559888579387187, "grad_norm": 2.3390395641326904, "learning_rate": 4.034722222222222e-05, "loss": 9.336, "step": 83 }, { "epoch": 0.116991643454039, "grad_norm": 2.483818531036377, "learning_rate": 4.0833333333333334e-05, "loss": 9.3954, "step": 84 }, { "epoch": 0.11838440111420613, "grad_norm": 2.2996625900268555, "learning_rate": 4.131944444444444e-05, "loss": 9.3094, "step": 85 }, { "epoch": 0.11977715877437325, "grad_norm": 2.236759901046753, "learning_rate": 4.180555555555555e-05, "loss": 9.3359, "step": 86 }, { "epoch": 0.12116991643454039, "grad_norm": 2.3694021701812744, "learning_rate": 4.229166666666666e-05, "loss": 9.3774, "step": 87 }, { "epoch": 0.12256267409470752, "grad_norm": 2.4825735092163086, "learning_rate": 4.277777777777778e-05, "loss": 9.2735, "step": 88 }, { "epoch": 0.12395543175487465, "grad_norm": 2.1388025283813477, "learning_rate": 4.326388888888889e-05, "loss": 9.3976, "step": 89 }, { "epoch": 0.12534818941504178, "grad_norm": 2.3984382152557373, "learning_rate": 4.374999999999999e-05, "loss": 9.3027, "step": 90 }, { "epoch": 0.12674094707520892, "grad_norm": 2.3104631900787354, "learning_rate": 4.4236111111111104e-05, "loss": 9.2651, "step": 91 }, { "epoch": 0.12813370473537605, "grad_norm": 2.2130093574523926, "learning_rate": 4.4722222222222215e-05, "loss": 9.2949, "step": 92 }, { "epoch": 0.12952646239554316, "grad_norm": 2.3408825397491455, "learning_rate": 4.520833333333333e-05, "loss": 9.4334, "step": 93 }, { "epoch": 0.1309192200557103, "grad_norm": 2.701294183731079, "learning_rate": 4.569444444444444e-05, "loss": 9.2697, "step": 94 }, { "epoch": 0.13231197771587744, "grad_norm": 2.449645757675171, "learning_rate": 4.6180555555555554e-05, "loss": 9.2557, "step": 95 }, { "epoch": 0.13370473537604458, "grad_norm": 2.396810531616211, "learning_rate": 4.666666666666666e-05, "loss": 9.2513, "step": 96 }, { "epoch": 0.13509749303621169, "grad_norm": 2.6208698749542236, "learning_rate": 4.7152777777777776e-05, "loss": 9.3046, "step": 97 }, { "epoch": 0.13649025069637882, "grad_norm": 2.6074302196502686, "learning_rate": 4.7638888888888887e-05, "loss": 9.34, "step": 98 }, { "epoch": 0.13788300835654596, "grad_norm": 2.6555991172790527, "learning_rate": 4.8125e-05, "loss": 9.3138, "step": 99 }, { "epoch": 0.1392757660167131, "grad_norm": 2.6097066402435303, "learning_rate": 4.861111111111111e-05, "loss": 9.3208, "step": 100 }, { "epoch": 0.14066852367688024, "grad_norm": 2.5649614334106445, "learning_rate": 4.909722222222221e-05, "loss": 9.2206, "step": 101 }, { "epoch": 0.14206128133704735, "grad_norm": 2.6494956016540527, "learning_rate": 4.958333333333333e-05, "loss": 9.3438, "step": 102 }, { "epoch": 0.14345403899721448, "grad_norm": 2.6111457347869873, "learning_rate": 5.006944444444444e-05, "loss": 9.2774, "step": 103 }, { "epoch": 0.14484679665738162, "grad_norm": 2.6649367809295654, "learning_rate": 5.055555555555555e-05, "loss": 9.2373, "step": 104 }, { "epoch": 0.14623955431754876, "grad_norm": 2.546994924545288, "learning_rate": 5.104166666666666e-05, "loss": 9.2823, "step": 105 }, { "epoch": 0.14763231197771587, "grad_norm": 2.5940351486206055, "learning_rate": 5.1527777777777774e-05, "loss": 9.2889, "step": 106 }, { "epoch": 0.149025069637883, "grad_norm": 2.6945221424102783, "learning_rate": 5.2013888888888885e-05, "loss": 9.1161, "step": 107 }, { "epoch": 0.15041782729805014, "grad_norm": 3.0894527435302734, "learning_rate": 5.2499999999999995e-05, "loss": 9.1788, "step": 108 }, { "epoch": 0.15181058495821728, "grad_norm": 2.867466926574707, "learning_rate": 5.2986111111111106e-05, "loss": 9.1579, "step": 109 }, { "epoch": 0.1532033426183844, "grad_norm": 2.731473445892334, "learning_rate": 5.347222222222222e-05, "loss": 9.264, "step": 110 }, { "epoch": 0.15459610027855153, "grad_norm": 2.9084222316741943, "learning_rate": 5.395833333333333e-05, "loss": 9.3098, "step": 111 }, { "epoch": 0.15598885793871867, "grad_norm": 2.7787702083587646, "learning_rate": 5.444444444444444e-05, "loss": 9.3831, "step": 112 }, { "epoch": 0.1573816155988858, "grad_norm": 3.490419387817383, "learning_rate": 5.493055555555555e-05, "loss": 9.2544, "step": 113 }, { "epoch": 0.15877437325905291, "grad_norm": 3.0516364574432373, "learning_rate": 5.541666666666666e-05, "loss": 9.2826, "step": 114 }, { "epoch": 0.16016713091922005, "grad_norm": 3.0757088661193848, "learning_rate": 5.590277777777778e-05, "loss": 9.221, "step": 115 }, { "epoch": 0.1615598885793872, "grad_norm": 3.052006244659424, "learning_rate": 5.638888888888888e-05, "loss": 9.0494, "step": 116 }, { "epoch": 0.16295264623955433, "grad_norm": 2.6261179447174072, "learning_rate": 5.687499999999999e-05, "loss": 9.2244, "step": 117 }, { "epoch": 0.16434540389972144, "grad_norm": 2.775188684463501, "learning_rate": 5.7361111111111104e-05, "loss": 9.2703, "step": 118 }, { "epoch": 0.16573816155988857, "grad_norm": 2.6947615146636963, "learning_rate": 5.7847222222222215e-05, "loss": 9.1994, "step": 119 }, { "epoch": 0.1671309192200557, "grad_norm": 3.2321982383728027, "learning_rate": 5.833333333333333e-05, "loss": 9.0563, "step": 120 }, { "epoch": 0.16852367688022285, "grad_norm": 2.8258464336395264, "learning_rate": 5.881944444444444e-05, "loss": 9.2179, "step": 121 }, { "epoch": 0.16991643454038996, "grad_norm": 3.2115259170532227, "learning_rate": 5.930555555555555e-05, "loss": 9.195, "step": 122 }, { "epoch": 0.1713091922005571, "grad_norm": 2.8792707920074463, "learning_rate": 5.979166666666666e-05, "loss": 9.02, "step": 123 }, { "epoch": 0.17270194986072424, "grad_norm": 2.866339683532715, "learning_rate": 6.0277777777777776e-05, "loss": 9.1544, "step": 124 }, { "epoch": 0.17409470752089137, "grad_norm": 3.1229407787323, "learning_rate": 6.076388888888889e-05, "loss": 9.1192, "step": 125 }, { "epoch": 0.17548746518105848, "grad_norm": 3.0198867321014404, "learning_rate": 6.125e-05, "loss": 9.0572, "step": 126 }, { "epoch": 0.17688022284122562, "grad_norm": 3.060091018676758, "learning_rate": 6.173611111111111e-05, "loss": 9.1694, "step": 127 }, { "epoch": 0.17827298050139276, "grad_norm": 3.0378992557525635, "learning_rate": 6.222222222222222e-05, "loss": 9.0026, "step": 128 }, { "epoch": 0.1796657381615599, "grad_norm": 3.2583515644073486, "learning_rate": 6.270833333333333e-05, "loss": 9.0214, "step": 129 }, { "epoch": 0.181058495821727, "grad_norm": 3.5013654232025146, "learning_rate": 6.319444444444444e-05, "loss": 9.1353, "step": 130 }, { "epoch": 0.18245125348189414, "grad_norm": 3.2136590480804443, "learning_rate": 6.368055555555555e-05, "loss": 9.0555, "step": 131 }, { "epoch": 0.18384401114206128, "grad_norm": 3.704627513885498, "learning_rate": 6.416666666666666e-05, "loss": 9.0568, "step": 132 }, { "epoch": 0.18523676880222842, "grad_norm": 3.6550254821777344, "learning_rate": 6.465277777777777e-05, "loss": 9.0778, "step": 133 }, { "epoch": 0.18662952646239556, "grad_norm": 3.029344320297241, "learning_rate": 6.513888888888889e-05, "loss": 9.0669, "step": 134 }, { "epoch": 0.18802228412256267, "grad_norm": 3.116588592529297, "learning_rate": 6.5625e-05, "loss": 9.0169, "step": 135 }, { "epoch": 0.1894150417827298, "grad_norm": 2.9819633960723877, "learning_rate": 6.611111111111111e-05, "loss": 9.1648, "step": 136 }, { "epoch": 0.19080779944289694, "grad_norm": 3.1296842098236084, "learning_rate": 6.659722222222222e-05, "loss": 9.0234, "step": 137 }, { "epoch": 0.19220055710306408, "grad_norm": 3.7833495140075684, "learning_rate": 6.708333333333333e-05, "loss": 9.1396, "step": 138 }, { "epoch": 0.1935933147632312, "grad_norm": 3.1753551959991455, "learning_rate": 6.756944444444444e-05, "loss": 9.0495, "step": 139 }, { "epoch": 0.19498607242339833, "grad_norm": 3.087599515914917, "learning_rate": 6.805555555555555e-05, "loss": 8.8913, "step": 140 }, { "epoch": 0.19637883008356546, "grad_norm": 3.865417957305908, "learning_rate": 6.854166666666666e-05, "loss": 9.2012, "step": 141 }, { "epoch": 0.1977715877437326, "grad_norm": 3.1938564777374268, "learning_rate": 6.902777777777777e-05, "loss": 9.1955, "step": 142 }, { "epoch": 0.1991643454038997, "grad_norm": 3.7016489505767822, "learning_rate": 6.951388888888888e-05, "loss": 8.9795, "step": 143 }, { "epoch": 0.20055710306406685, "grad_norm": 3.168267250061035, "learning_rate": 7e-05, "loss": 9.0605, "step": 144 }, { "epoch": 0.201949860724234, "grad_norm": 3.670396566390991, "learning_rate": 6.99900511654349e-05, "loss": 9.1897, "step": 145 }, { "epoch": 0.20334261838440112, "grad_norm": 3.8943371772766113, "learning_rate": 6.998010233086982e-05, "loss": 8.989, "step": 146 }, { "epoch": 0.20473537604456823, "grad_norm": 3.5359532833099365, "learning_rate": 6.99701534963047e-05, "loss": 8.9466, "step": 147 }, { "epoch": 0.20612813370473537, "grad_norm": 3.0849764347076416, "learning_rate": 6.996020466173962e-05, "loss": 9.0618, "step": 148 }, { "epoch": 0.2075208913649025, "grad_norm": 3.2177481651306152, "learning_rate": 6.995025582717453e-05, "loss": 9.0375, "step": 149 }, { "epoch": 0.20891364902506965, "grad_norm": 3.689551830291748, "learning_rate": 6.994030699260943e-05, "loss": 8.9582, "step": 150 }, { "epoch": 0.21030640668523676, "grad_norm": 3.431204319000244, "learning_rate": 6.993035815804434e-05, "loss": 9.0407, "step": 151 }, { "epoch": 0.2116991643454039, "grad_norm": 3.074554443359375, "learning_rate": 6.992040932347924e-05, "loss": 8.869, "step": 152 }, { "epoch": 0.21309192200557103, "grad_norm": 3.3490922451019287, "learning_rate": 6.991046048891414e-05, "loss": 9.0683, "step": 153 }, { "epoch": 0.21448467966573817, "grad_norm": 3.005607843399048, "learning_rate": 6.990051165434906e-05, "loss": 9.1739, "step": 154 }, { "epoch": 0.21587743732590528, "grad_norm": 3.483210563659668, "learning_rate": 6.989056281978397e-05, "loss": 9.0874, "step": 155 }, { "epoch": 0.21727019498607242, "grad_norm": 3.241920232772827, "learning_rate": 6.988061398521887e-05, "loss": 9.0501, "step": 156 }, { "epoch": 0.21866295264623956, "grad_norm": 3.062758207321167, "learning_rate": 6.987066515065377e-05, "loss": 9.1537, "step": 157 }, { "epoch": 0.2200557103064067, "grad_norm": 3.220777988433838, "learning_rate": 6.986071631608868e-05, "loss": 8.8843, "step": 158 }, { "epoch": 0.2214484679665738, "grad_norm": 3.283903121948242, "learning_rate": 6.985076748152358e-05, "loss": 8.9421, "step": 159 }, { "epoch": 0.22284122562674094, "grad_norm": 3.5407607555389404, "learning_rate": 6.98408186469585e-05, "loss": 9.0808, "step": 160 }, { "epoch": 0.22423398328690808, "grad_norm": 3.2570924758911133, "learning_rate": 6.98308698123934e-05, "loss": 8.792, "step": 161 }, { "epoch": 0.22562674094707522, "grad_norm": 3.170731544494629, "learning_rate": 6.982092097782831e-05, "loss": 9.017, "step": 162 }, { "epoch": 0.22701949860724233, "grad_norm": 3.8973240852355957, "learning_rate": 6.981097214326321e-05, "loss": 8.9749, "step": 163 }, { "epoch": 0.22841225626740946, "grad_norm": 4.280517101287842, "learning_rate": 6.980102330869812e-05, "loss": 8.9395, "step": 164 }, { "epoch": 0.2298050139275766, "grad_norm": 3.2216036319732666, "learning_rate": 6.979107447413302e-05, "loss": 8.9109, "step": 165 }, { "epoch": 0.23119777158774374, "grad_norm": 3.2336678504943848, "learning_rate": 6.978112563956792e-05, "loss": 9.0659, "step": 166 }, { "epoch": 0.23259052924791088, "grad_norm": 3.7737042903900146, "learning_rate": 6.977117680500283e-05, "loss": 8.9384, "step": 167 }, { "epoch": 0.233983286908078, "grad_norm": 3.4278862476348877, "learning_rate": 6.976122797043775e-05, "loss": 8.9576, "step": 168 }, { "epoch": 0.23537604456824512, "grad_norm": 3.465973377227783, "learning_rate": 6.975127913587265e-05, "loss": 8.9898, "step": 169 }, { "epoch": 0.23676880222841226, "grad_norm": 5.112400531768799, "learning_rate": 6.974133030130755e-05, "loss": 8.7912, "step": 170 }, { "epoch": 0.2381615598885794, "grad_norm": 4.257119178771973, "learning_rate": 6.973138146674246e-05, "loss": 8.899, "step": 171 }, { "epoch": 0.2395543175487465, "grad_norm": 4.034311771392822, "learning_rate": 6.972143263217736e-05, "loss": 8.8859, "step": 172 }, { "epoch": 0.24094707520891365, "grad_norm": 3.9154863357543945, "learning_rate": 6.971148379761228e-05, "loss": 9.074, "step": 173 }, { "epoch": 0.24233983286908078, "grad_norm": 4.049985885620117, "learning_rate": 6.970153496304718e-05, "loss": 8.9757, "step": 174 }, { "epoch": 0.24373259052924792, "grad_norm": 3.8165013790130615, "learning_rate": 6.969158612848209e-05, "loss": 8.9547, "step": 175 }, { "epoch": 0.24512534818941503, "grad_norm": 4.293961048126221, "learning_rate": 6.968163729391699e-05, "loss": 8.8375, "step": 176 }, { "epoch": 0.24651810584958217, "grad_norm": 5.501155376434326, "learning_rate": 6.96716884593519e-05, "loss": 8.9973, "step": 177 }, { "epoch": 0.2479108635097493, "grad_norm": 3.8478055000305176, "learning_rate": 6.96617396247868e-05, "loss": 8.8513, "step": 178 }, { "epoch": 0.24930362116991645, "grad_norm": 3.494331121444702, "learning_rate": 6.965179079022172e-05, "loss": 9.106, "step": 179 }, { "epoch": 0.25069637883008355, "grad_norm": 3.3035213947296143, "learning_rate": 6.964184195565662e-05, "loss": 8.8163, "step": 180 }, { "epoch": 0.2520891364902507, "grad_norm": 3.44191575050354, "learning_rate": 6.963189312109153e-05, "loss": 9.0062, "step": 181 }, { "epoch": 0.25348189415041783, "grad_norm": 4.08702278137207, "learning_rate": 6.962194428652643e-05, "loss": 8.8288, "step": 182 }, { "epoch": 0.25487465181058494, "grad_norm": 3.422994375228882, "learning_rate": 6.961199545196133e-05, "loss": 8.7787, "step": 183 }, { "epoch": 0.2562674094707521, "grad_norm": 3.4032821655273438, "learning_rate": 6.960204661739624e-05, "loss": 8.876, "step": 184 }, { "epoch": 0.2576601671309192, "grad_norm": 3.357682228088379, "learning_rate": 6.959209778283116e-05, "loss": 9.0388, "step": 185 }, { "epoch": 0.2590529247910863, "grad_norm": 3.4004476070404053, "learning_rate": 6.958214894826605e-05, "loss": 9.0445, "step": 186 }, { "epoch": 0.2604456824512535, "grad_norm": 3.507704734802246, "learning_rate": 6.957220011370096e-05, "loss": 8.7071, "step": 187 }, { "epoch": 0.2618384401114206, "grad_norm": 3.7336301803588867, "learning_rate": 6.956225127913587e-05, "loss": 9.0218, "step": 188 }, { "epoch": 0.26323119777158777, "grad_norm": 3.0426900386810303, "learning_rate": 6.955230244457077e-05, "loss": 8.9227, "step": 189 }, { "epoch": 0.2646239554317549, "grad_norm": 3.8145570755004883, "learning_rate": 6.954235361000568e-05, "loss": 9.0258, "step": 190 }, { "epoch": 0.266016713091922, "grad_norm": 3.6831650733947754, "learning_rate": 6.953240477544058e-05, "loss": 9.0218, "step": 191 }, { "epoch": 0.26740947075208915, "grad_norm": 3.8541195392608643, "learning_rate": 6.952245594087548e-05, "loss": 8.9664, "step": 192 }, { "epoch": 0.26880222841225626, "grad_norm": 4.092911720275879, "learning_rate": 6.95125071063104e-05, "loss": 8.8958, "step": 193 }, { "epoch": 0.27019498607242337, "grad_norm": 3.52112078666687, "learning_rate": 6.95025582717453e-05, "loss": 8.7399, "step": 194 }, { "epoch": 0.27158774373259054, "grad_norm": 3.9229259490966797, "learning_rate": 6.949260943718021e-05, "loss": 8.503, "step": 195 }, { "epoch": 0.27298050139275765, "grad_norm": 4.027508735656738, "learning_rate": 6.948266060261511e-05, "loss": 8.8499, "step": 196 }, { "epoch": 0.2743732590529248, "grad_norm": 3.772533655166626, "learning_rate": 6.947271176805002e-05, "loss": 8.8647, "step": 197 }, { "epoch": 0.2757660167130919, "grad_norm": 3.626054048538208, "learning_rate": 6.946276293348492e-05, "loss": 8.7417, "step": 198 }, { "epoch": 0.27715877437325903, "grad_norm": 3.7706661224365234, "learning_rate": 6.945281409891984e-05, "loss": 8.675, "step": 199 }, { "epoch": 0.2785515320334262, "grad_norm": 3.55424165725708, "learning_rate": 6.944286526435474e-05, "loss": 8.7761, "step": 200 }, { "epoch": 0.2799442896935933, "grad_norm": 4.005819320678711, "learning_rate": 6.943291642978965e-05, "loss": 8.7846, "step": 201 }, { "epoch": 0.28133704735376047, "grad_norm": 3.900658369064331, "learning_rate": 6.942296759522455e-05, "loss": 8.7952, "step": 202 }, { "epoch": 0.2827298050139276, "grad_norm": 3.804995059967041, "learning_rate": 6.941301876065946e-05, "loss": 8.7566, "step": 203 }, { "epoch": 0.2841225626740947, "grad_norm": 4.131643295288086, "learning_rate": 6.940306992609437e-05, "loss": 8.8913, "step": 204 }, { "epoch": 0.28551532033426186, "grad_norm": 3.9271464347839355, "learning_rate": 6.939312109152928e-05, "loss": 8.69, "step": 205 }, { "epoch": 0.28690807799442897, "grad_norm": 5.086146354675293, "learning_rate": 6.938317225696418e-05, "loss": 8.7908, "step": 206 }, { "epoch": 0.2883008356545961, "grad_norm": 4.286420822143555, "learning_rate": 6.937322342239909e-05, "loss": 8.4524, "step": 207 }, { "epoch": 0.28969359331476324, "grad_norm": 3.5584073066711426, "learning_rate": 6.936327458783399e-05, "loss": 8.9401, "step": 208 }, { "epoch": 0.29108635097493035, "grad_norm": 3.8784470558166504, "learning_rate": 6.93533257532689e-05, "loss": 8.9656, "step": 209 }, { "epoch": 0.2924791086350975, "grad_norm": 3.98789644241333, "learning_rate": 6.93433769187038e-05, "loss": 8.825, "step": 210 }, { "epoch": 0.2938718662952646, "grad_norm": 4.665347576141357, "learning_rate": 6.93334280841387e-05, "loss": 8.8567, "step": 211 }, { "epoch": 0.29526462395543174, "grad_norm": 4.211238861083984, "learning_rate": 6.932347924957362e-05, "loss": 8.5163, "step": 212 }, { "epoch": 0.2966573816155989, "grad_norm": 4.115207195281982, "learning_rate": 6.931353041500852e-05, "loss": 8.8728, "step": 213 }, { "epoch": 0.298050139275766, "grad_norm": 4.007952690124512, "learning_rate": 6.930358158044343e-05, "loss": 8.8066, "step": 214 }, { "epoch": 0.2994428969359331, "grad_norm": 4.000673770904541, "learning_rate": 6.929363274587833e-05, "loss": 8.8258, "step": 215 }, { "epoch": 0.3008356545961003, "grad_norm": 3.824923038482666, "learning_rate": 6.928368391131324e-05, "loss": 8.6051, "step": 216 }, { "epoch": 0.3022284122562674, "grad_norm": 4.339374542236328, "learning_rate": 6.927373507674814e-05, "loss": 8.8224, "step": 217 }, { "epoch": 0.30362116991643456, "grad_norm": 4.101476669311523, "learning_rate": 6.926378624218306e-05, "loss": 8.4742, "step": 218 }, { "epoch": 0.3050139275766017, "grad_norm": 3.2088582515716553, "learning_rate": 6.925383740761796e-05, "loss": 9.0615, "step": 219 }, { "epoch": 0.3064066852367688, "grad_norm": 3.446979522705078, "learning_rate": 6.924388857305287e-05, "loss": 8.8705, "step": 220 }, { "epoch": 0.30779944289693595, "grad_norm": 3.8296539783477783, "learning_rate": 6.923393973848777e-05, "loss": 8.6779, "step": 221 }, { "epoch": 0.30919220055710306, "grad_norm": 3.905626058578491, "learning_rate": 6.922399090392267e-05, "loss": 8.5484, "step": 222 }, { "epoch": 0.31058495821727017, "grad_norm": 3.4928548336029053, "learning_rate": 6.921404206935758e-05, "loss": 8.7129, "step": 223 }, { "epoch": 0.31197771587743733, "grad_norm": 3.5804247856140137, "learning_rate": 6.92040932347925e-05, "loss": 8.571, "step": 224 }, { "epoch": 0.31337047353760444, "grad_norm": 4.050597667694092, "learning_rate": 6.919414440022739e-05, "loss": 8.6514, "step": 225 }, { "epoch": 0.3147632311977716, "grad_norm": 4.444650650024414, "learning_rate": 6.91841955656623e-05, "loss": 8.5167, "step": 226 }, { "epoch": 0.3161559888579387, "grad_norm": 4.613363265991211, "learning_rate": 6.917424673109721e-05, "loss": 8.7485, "step": 227 }, { "epoch": 0.31754874651810583, "grad_norm": 3.857363700866699, "learning_rate": 6.916429789653211e-05, "loss": 8.4424, "step": 228 }, { "epoch": 0.318941504178273, "grad_norm": 3.773397922515869, "learning_rate": 6.915434906196703e-05, "loss": 8.3823, "step": 229 }, { "epoch": 0.3203342618384401, "grad_norm": 3.7881274223327637, "learning_rate": 6.914440022740192e-05, "loss": 8.5923, "step": 230 }, { "epoch": 0.32172701949860727, "grad_norm": 3.938821792602539, "learning_rate": 6.913445139283684e-05, "loss": 8.6306, "step": 231 }, { "epoch": 0.3231197771587744, "grad_norm": 4.051789283752441, "learning_rate": 6.912450255827174e-05, "loss": 8.7096, "step": 232 }, { "epoch": 0.3245125348189415, "grad_norm": 3.930919647216797, "learning_rate": 6.911455372370665e-05, "loss": 8.8334, "step": 233 }, { "epoch": 0.32590529247910865, "grad_norm": 5.816878318786621, "learning_rate": 6.910460488914155e-05, "loss": 8.6405, "step": 234 }, { "epoch": 0.32729805013927576, "grad_norm": 4.822601318359375, "learning_rate": 6.909465605457645e-05, "loss": 8.7494, "step": 235 }, { "epoch": 0.3286908077994429, "grad_norm": 4.636408805847168, "learning_rate": 6.908470722001136e-05, "loss": 8.7079, "step": 236 }, { "epoch": 0.33008356545961004, "grad_norm": 3.6052465438842773, "learning_rate": 6.907475838544628e-05, "loss": 8.3915, "step": 237 }, { "epoch": 0.33147632311977715, "grad_norm": 3.939838409423828, "learning_rate": 6.906480955088118e-05, "loss": 8.7672, "step": 238 }, { "epoch": 0.3328690807799443, "grad_norm": 3.9982073307037354, "learning_rate": 6.905486071631608e-05, "loss": 8.3698, "step": 239 }, { "epoch": 0.3342618384401114, "grad_norm": 3.5854127407073975, "learning_rate": 6.904491188175099e-05, "loss": 8.6274, "step": 240 }, { "epoch": 0.33565459610027853, "grad_norm": 3.827282428741455, "learning_rate": 6.903496304718589e-05, "loss": 8.5594, "step": 241 }, { "epoch": 0.3370473537604457, "grad_norm": 3.9238662719726562, "learning_rate": 6.90250142126208e-05, "loss": 8.5622, "step": 242 }, { "epoch": 0.3384401114206128, "grad_norm": 4.059924602508545, "learning_rate": 6.901506537805571e-05, "loss": 8.631, "step": 243 }, { "epoch": 0.3398328690807799, "grad_norm": 3.882917642593384, "learning_rate": 6.900511654349062e-05, "loss": 8.5415, "step": 244 }, { "epoch": 0.3412256267409471, "grad_norm": 3.731807231903076, "learning_rate": 6.899516770892552e-05, "loss": 8.7059, "step": 245 }, { "epoch": 0.3426183844011142, "grad_norm": 4.963943958282471, "learning_rate": 6.898521887436043e-05, "loss": 8.2728, "step": 246 }, { "epoch": 0.34401114206128136, "grad_norm": 3.780099391937256, "learning_rate": 6.897527003979533e-05, "loss": 8.6473, "step": 247 }, { "epoch": 0.34540389972144847, "grad_norm": 3.502495527267456, "learning_rate": 6.896532120523023e-05, "loss": 8.64, "step": 248 }, { "epoch": 0.3467966573816156, "grad_norm": 4.078880310058594, "learning_rate": 6.895537237066515e-05, "loss": 8.6649, "step": 249 }, { "epoch": 0.34818941504178275, "grad_norm": 3.965594530105591, "learning_rate": 6.894542353610004e-05, "loss": 8.4793, "step": 250 }, { "epoch": 0.34958217270194986, "grad_norm": 3.7059476375579834, "learning_rate": 6.893547470153496e-05, "loss": 8.6176, "step": 251 }, { "epoch": 0.35097493036211697, "grad_norm": 3.8155102729797363, "learning_rate": 6.892552586696986e-05, "loss": 8.5968, "step": 252 }, { "epoch": 0.35236768802228413, "grad_norm": 3.9428980350494385, "learning_rate": 6.891557703240477e-05, "loss": 8.7197, "step": 253 }, { "epoch": 0.35376044568245124, "grad_norm": 3.9024670124053955, "learning_rate": 6.890562819783967e-05, "loss": 8.33, "step": 254 }, { "epoch": 0.3551532033426184, "grad_norm": 3.6394572257995605, "learning_rate": 6.889567936327458e-05, "loss": 8.4934, "step": 255 }, { "epoch": 0.3565459610027855, "grad_norm": 3.7295875549316406, "learning_rate": 6.888573052870948e-05, "loss": 8.1607, "step": 256 }, { "epoch": 0.3579387186629526, "grad_norm": 3.547926187515259, "learning_rate": 6.88757816941444e-05, "loss": 8.5152, "step": 257 }, { "epoch": 0.3593314763231198, "grad_norm": 4.115841865539551, "learning_rate": 6.88658328595793e-05, "loss": 8.3158, "step": 258 }, { "epoch": 0.3607242339832869, "grad_norm": 3.5963521003723145, "learning_rate": 6.88558840250142e-05, "loss": 8.0662, "step": 259 }, { "epoch": 0.362116991643454, "grad_norm": 3.4882774353027344, "learning_rate": 6.884593519044911e-05, "loss": 8.2915, "step": 260 }, { "epoch": 0.3635097493036212, "grad_norm": 4.000843524932861, "learning_rate": 6.883598635588401e-05, "loss": 8.459, "step": 261 }, { "epoch": 0.3649025069637883, "grad_norm": 3.437972068786621, "learning_rate": 6.882603752131893e-05, "loss": 8.6511, "step": 262 }, { "epoch": 0.36629526462395545, "grad_norm": 3.727342367172241, "learning_rate": 6.881608868675384e-05, "loss": 8.6024, "step": 263 }, { "epoch": 0.36768802228412256, "grad_norm": 3.84175968170166, "learning_rate": 6.880613985218874e-05, "loss": 8.5866, "step": 264 }, { "epoch": 0.36908077994428967, "grad_norm": 4.956315517425537, "learning_rate": 6.879619101762364e-05, "loss": 7.9471, "step": 265 }, { "epoch": 0.37047353760445684, "grad_norm": 4.100083351135254, "learning_rate": 6.878624218305855e-05, "loss": 8.5005, "step": 266 }, { "epoch": 0.37186629526462395, "grad_norm": 4.244174003601074, "learning_rate": 6.877629334849345e-05, "loss": 8.5043, "step": 267 }, { "epoch": 0.3732590529247911, "grad_norm": 4.182929039001465, "learning_rate": 6.876634451392837e-05, "loss": 8.6353, "step": 268 }, { "epoch": 0.3746518105849582, "grad_norm": 4.116519927978516, "learning_rate": 6.875639567936326e-05, "loss": 8.2668, "step": 269 }, { "epoch": 0.37604456824512533, "grad_norm": 4.627055644989014, "learning_rate": 6.874644684479818e-05, "loss": 8.3179, "step": 270 }, { "epoch": 0.3774373259052925, "grad_norm": 4.048768520355225, "learning_rate": 6.873649801023308e-05, "loss": 8.2831, "step": 271 }, { "epoch": 0.3788300835654596, "grad_norm": 4.0451340675354, "learning_rate": 6.872654917566799e-05, "loss": 8.4852, "step": 272 }, { "epoch": 0.3802228412256267, "grad_norm": 3.7257699966430664, "learning_rate": 6.871660034110289e-05, "loss": 8.0703, "step": 273 }, { "epoch": 0.3816155988857939, "grad_norm": 4.093203544616699, "learning_rate": 6.87066515065378e-05, "loss": 8.304, "step": 274 }, { "epoch": 0.383008356545961, "grad_norm": 3.7395145893096924, "learning_rate": 6.86967026719727e-05, "loss": 8.3956, "step": 275 }, { "epoch": 0.38440111420612816, "grad_norm": 4.134032726287842, "learning_rate": 6.868675383740762e-05, "loss": 8.1094, "step": 276 }, { "epoch": 0.38579387186629527, "grad_norm": 4.843389511108398, "learning_rate": 6.867680500284252e-05, "loss": 8.3974, "step": 277 }, { "epoch": 0.3871866295264624, "grad_norm": 3.7470033168792725, "learning_rate": 6.866685616827742e-05, "loss": 8.3527, "step": 278 }, { "epoch": 0.38857938718662954, "grad_norm": 3.5585482120513916, "learning_rate": 6.865690733371233e-05, "loss": 8.2012, "step": 279 }, { "epoch": 0.38997214484679665, "grad_norm": 4.271474361419678, "learning_rate": 6.864695849914723e-05, "loss": 8.6701, "step": 280 }, { "epoch": 0.39136490250696376, "grad_norm": 4.250993728637695, "learning_rate": 6.863700966458214e-05, "loss": 8.3565, "step": 281 }, { "epoch": 0.39275766016713093, "grad_norm": 3.6277823448181152, "learning_rate": 6.862706083001705e-05, "loss": 8.2638, "step": 282 }, { "epoch": 0.39415041782729804, "grad_norm": 4.192377090454102, "learning_rate": 6.861711199545196e-05, "loss": 8.1367, "step": 283 }, { "epoch": 0.3955431754874652, "grad_norm": 3.8814239501953125, "learning_rate": 6.860716316088686e-05, "loss": 8.2968, "step": 284 }, { "epoch": 0.3969359331476323, "grad_norm": 3.861527442932129, "learning_rate": 6.859721432632177e-05, "loss": 8.2238, "step": 285 }, { "epoch": 0.3983286908077994, "grad_norm": 3.7376210689544678, "learning_rate": 6.858726549175667e-05, "loss": 8.2223, "step": 286 }, { "epoch": 0.3997214484679666, "grad_norm": 4.001213550567627, "learning_rate": 6.857731665719159e-05, "loss": 8.3963, "step": 287 }, { "epoch": 0.4011142061281337, "grad_norm": 3.7330715656280518, "learning_rate": 6.856736782262649e-05, "loss": 8.2626, "step": 288 }, { "epoch": 0.4025069637883008, "grad_norm": 4.302996635437012, "learning_rate": 6.85574189880614e-05, "loss": 8.3355, "step": 289 }, { "epoch": 0.403899721448468, "grad_norm": 3.770094156265259, "learning_rate": 6.85474701534963e-05, "loss": 8.3885, "step": 290 }, { "epoch": 0.4052924791086351, "grad_norm": 4.166201591491699, "learning_rate": 6.85375213189312e-05, "loss": 8.2349, "step": 291 }, { "epoch": 0.40668523676880225, "grad_norm": 3.9205539226531982, "learning_rate": 6.852757248436611e-05, "loss": 8.3096, "step": 292 }, { "epoch": 0.40807799442896936, "grad_norm": 3.5723092555999756, "learning_rate": 6.851762364980103e-05, "loss": 8.5081, "step": 293 }, { "epoch": 0.40947075208913647, "grad_norm": 4.560377597808838, "learning_rate": 6.850767481523592e-05, "loss": 8.2322, "step": 294 }, { "epoch": 0.41086350974930363, "grad_norm": 4.03375244140625, "learning_rate": 6.849772598067083e-05, "loss": 8.2581, "step": 295 }, { "epoch": 0.41225626740947074, "grad_norm": 4.069296836853027, "learning_rate": 6.848777714610574e-05, "loss": 8.1669, "step": 296 }, { "epoch": 0.4136490250696379, "grad_norm": 4.926348686218262, "learning_rate": 6.847782831154064e-05, "loss": 8.1485, "step": 297 }, { "epoch": 0.415041782729805, "grad_norm": 4.371025562286377, "learning_rate": 6.846787947697555e-05, "loss": 7.9984, "step": 298 }, { "epoch": 0.41643454038997213, "grad_norm": 4.272550582885742, "learning_rate": 6.845793064241045e-05, "loss": 8.0296, "step": 299 }, { "epoch": 0.4178272980501393, "grad_norm": 4.450153350830078, "learning_rate": 6.844798180784535e-05, "loss": 8.2459, "step": 300 }, { "epoch": 0.4192200557103064, "grad_norm": 4.329676151275635, "learning_rate": 6.843803297328027e-05, "loss": 8.247, "step": 301 }, { "epoch": 0.4206128133704735, "grad_norm": 4.585980415344238, "learning_rate": 6.842808413871518e-05, "loss": 8.3604, "step": 302 }, { "epoch": 0.4220055710306407, "grad_norm": 3.8479998111724854, "learning_rate": 6.841813530415008e-05, "loss": 8.4521, "step": 303 }, { "epoch": 0.4233983286908078, "grad_norm": 4.2120256423950195, "learning_rate": 6.840818646958498e-05, "loss": 8.1643, "step": 304 }, { "epoch": 0.42479108635097496, "grad_norm": 4.131726264953613, "learning_rate": 6.839823763501989e-05, "loss": 8.3119, "step": 305 }, { "epoch": 0.42618384401114207, "grad_norm": 3.7550110816955566, "learning_rate": 6.838828880045479e-05, "loss": 7.8546, "step": 306 }, { "epoch": 0.4275766016713092, "grad_norm": 4.501293182373047, "learning_rate": 6.837833996588971e-05, "loss": 8.387, "step": 307 }, { "epoch": 0.42896935933147634, "grad_norm": 3.984358310699463, "learning_rate": 6.83683911313246e-05, "loss": 8.3124, "step": 308 }, { "epoch": 0.43036211699164345, "grad_norm": 4.523743152618408, "learning_rate": 6.835844229675952e-05, "loss": 8.1569, "step": 309 }, { "epoch": 0.43175487465181056, "grad_norm": 3.7779624462127686, "learning_rate": 6.834849346219442e-05, "loss": 8.1798, "step": 310 }, { "epoch": 0.4331476323119777, "grad_norm": 4.647845268249512, "learning_rate": 6.833854462762933e-05, "loss": 7.9793, "step": 311 }, { "epoch": 0.43454038997214484, "grad_norm": 3.5851359367370605, "learning_rate": 6.832859579306424e-05, "loss": 8.1629, "step": 312 }, { "epoch": 0.435933147632312, "grad_norm": 4.379482269287109, "learning_rate": 6.831864695849913e-05, "loss": 8.2956, "step": 313 }, { "epoch": 0.4373259052924791, "grad_norm": 4.4390645027160645, "learning_rate": 6.830869812393405e-05, "loss": 7.9394, "step": 314 }, { "epoch": 0.4387186629526462, "grad_norm": 4.138993740081787, "learning_rate": 6.829874928936896e-05, "loss": 8.4355, "step": 315 }, { "epoch": 0.4401114206128134, "grad_norm": 4.070654392242432, "learning_rate": 6.828880045480386e-05, "loss": 8.2139, "step": 316 }, { "epoch": 0.4415041782729805, "grad_norm": 4.666057109832764, "learning_rate": 6.827885162023876e-05, "loss": 8.4277, "step": 317 }, { "epoch": 0.4428969359331476, "grad_norm": 4.2710185050964355, "learning_rate": 6.826890278567367e-05, "loss": 8.2777, "step": 318 }, { "epoch": 0.44428969359331477, "grad_norm": 4.994112014770508, "learning_rate": 6.825895395110857e-05, "loss": 8.1703, "step": 319 }, { "epoch": 0.4456824512534819, "grad_norm": 3.9322173595428467, "learning_rate": 6.824900511654349e-05, "loss": 8.0619, "step": 320 }, { "epoch": 0.44707520891364905, "grad_norm": 4.775113105773926, "learning_rate": 6.82390562819784e-05, "loss": 8.3888, "step": 321 }, { "epoch": 0.44846796657381616, "grad_norm": 4.775972366333008, "learning_rate": 6.82291074474133e-05, "loss": 8.2665, "step": 322 }, { "epoch": 0.44986072423398327, "grad_norm": 4.2928290367126465, "learning_rate": 6.82191586128482e-05, "loss": 8.2028, "step": 323 }, { "epoch": 0.45125348189415043, "grad_norm": 3.857879877090454, "learning_rate": 6.820920977828311e-05, "loss": 8.2462, "step": 324 }, { "epoch": 0.45264623955431754, "grad_norm": 3.9455676078796387, "learning_rate": 6.819926094371801e-05, "loss": 8.2978, "step": 325 }, { "epoch": 0.45403899721448465, "grad_norm": 4.97908878326416, "learning_rate": 6.818931210915293e-05, "loss": 8.0867, "step": 326 }, { "epoch": 0.4554317548746518, "grad_norm": 4.033157825469971, "learning_rate": 6.817936327458783e-05, "loss": 8.4074, "step": 327 }, { "epoch": 0.4568245125348189, "grad_norm": 4.895842552185059, "learning_rate": 6.816941444002274e-05, "loss": 8.3586, "step": 328 }, { "epoch": 0.4582172701949861, "grad_norm": 7.515446186065674, "learning_rate": 6.815946560545764e-05, "loss": 7.9998, "step": 329 }, { "epoch": 0.4596100278551532, "grad_norm": 4.775269031524658, "learning_rate": 6.814951677089254e-05, "loss": 8.3389, "step": 330 }, { "epoch": 0.4610027855153203, "grad_norm": 4.500304222106934, "learning_rate": 6.813956793632745e-05, "loss": 8.1957, "step": 331 }, { "epoch": 0.4623955431754875, "grad_norm": 3.8328864574432373, "learning_rate": 6.812961910176237e-05, "loss": 8.4248, "step": 332 }, { "epoch": 0.4637883008356546, "grad_norm": 4.228978157043457, "learning_rate": 6.811967026719726e-05, "loss": 7.9624, "step": 333 }, { "epoch": 0.46518105849582175, "grad_norm": 3.9632623195648193, "learning_rate": 6.810972143263217e-05, "loss": 7.9722, "step": 334 }, { "epoch": 0.46657381615598886, "grad_norm": 4.006036758422852, "learning_rate": 6.809977259806708e-05, "loss": 8.0736, "step": 335 }, { "epoch": 0.467966573816156, "grad_norm": 3.991527557373047, "learning_rate": 6.808982376350198e-05, "loss": 8.3156, "step": 336 }, { "epoch": 0.46935933147632314, "grad_norm": 3.9454505443573, "learning_rate": 6.80798749289369e-05, "loss": 8.2869, "step": 337 }, { "epoch": 0.47075208913649025, "grad_norm": 4.333049774169922, "learning_rate": 6.806992609437179e-05, "loss": 8.0926, "step": 338 }, { "epoch": 0.47214484679665736, "grad_norm": 4.2421112060546875, "learning_rate": 6.80599772598067e-05, "loss": 8.3053, "step": 339 }, { "epoch": 0.4735376044568245, "grad_norm": 4.790825366973877, "learning_rate": 6.805002842524161e-05, "loss": 7.8001, "step": 340 }, { "epoch": 0.47493036211699163, "grad_norm": 4.551433086395264, "learning_rate": 6.804007959067652e-05, "loss": 8.0157, "step": 341 }, { "epoch": 0.4763231197771588, "grad_norm": 4.332050800323486, "learning_rate": 6.803013075611142e-05, "loss": 8.0316, "step": 342 }, { "epoch": 0.4777158774373259, "grad_norm": 4.675700664520264, "learning_rate": 6.802018192154632e-05, "loss": 8.0046, "step": 343 }, { "epoch": 0.479108635097493, "grad_norm": 4.037833213806152, "learning_rate": 6.801023308698123e-05, "loss": 8.1756, "step": 344 }, { "epoch": 0.4805013927576602, "grad_norm": 4.044549942016602, "learning_rate": 6.800028425241615e-05, "loss": 7.8956, "step": 345 }, { "epoch": 0.4818941504178273, "grad_norm": 4.56648588180542, "learning_rate": 6.799033541785105e-05, "loss": 7.8569, "step": 346 }, { "epoch": 0.4832869080779944, "grad_norm": 4.27932071685791, "learning_rate": 6.798038658328595e-05, "loss": 7.983, "step": 347 }, { "epoch": 0.48467966573816157, "grad_norm": 4.309334754943848, "learning_rate": 6.797043774872086e-05, "loss": 8.0926, "step": 348 }, { "epoch": 0.4860724233983287, "grad_norm": 4.4279961585998535, "learning_rate": 6.796048891415576e-05, "loss": 8.1277, "step": 349 }, { "epoch": 0.48746518105849584, "grad_norm": 4.202175140380859, "learning_rate": 6.795054007959067e-05, "loss": 8.2918, "step": 350 }, { "epoch": 0.48885793871866295, "grad_norm": 4.153756618499756, "learning_rate": 6.794059124502558e-05, "loss": 8.0533, "step": 351 }, { "epoch": 0.49025069637883006, "grad_norm": 3.8882815837860107, "learning_rate": 6.793064241046048e-05, "loss": 8.2729, "step": 352 }, { "epoch": 0.49164345403899723, "grad_norm": 4.373082160949707, "learning_rate": 6.792069357589539e-05, "loss": 8.2598, "step": 353 }, { "epoch": 0.49303621169916434, "grad_norm": 4.185801982879639, "learning_rate": 6.79107447413303e-05, "loss": 8.1271, "step": 354 }, { "epoch": 0.49442896935933145, "grad_norm": 4.643874645233154, "learning_rate": 6.79007959067652e-05, "loss": 8.1628, "step": 355 }, { "epoch": 0.4958217270194986, "grad_norm": 4.2254414558410645, "learning_rate": 6.78908470722001e-05, "loss": 8.0874, "step": 356 }, { "epoch": 0.4972144846796657, "grad_norm": 3.6543078422546387, "learning_rate": 6.788089823763501e-05, "loss": 8.12, "step": 357 }, { "epoch": 0.4986072423398329, "grad_norm": 4.454078197479248, "learning_rate": 6.787094940306991e-05, "loss": 8.1818, "step": 358 }, { "epoch": 0.5, "grad_norm": 4.394839286804199, "learning_rate": 6.786100056850483e-05, "loss": 7.8266, "step": 359 }, { "epoch": 0.5013927576601671, "grad_norm": 4.51784610748291, "learning_rate": 6.785105173393973e-05, "loss": 8.377, "step": 360 }, { "epoch": 0.5027855153203342, "grad_norm": 3.762207269668579, "learning_rate": 6.784110289937464e-05, "loss": 8.0772, "step": 361 }, { "epoch": 0.5041782729805014, "grad_norm": 4.488733291625977, "learning_rate": 6.783115406480954e-05, "loss": 7.9061, "step": 362 }, { "epoch": 0.5055710306406686, "grad_norm": 3.9269416332244873, "learning_rate": 6.782120523024445e-05, "loss": 8.1671, "step": 363 }, { "epoch": 0.5069637883008357, "grad_norm": 4.34641170501709, "learning_rate": 6.781125639567935e-05, "loss": 8.2714, "step": 364 }, { "epoch": 0.5083565459610028, "grad_norm": 5.105216979980469, "learning_rate": 6.780130756111427e-05, "loss": 8.1929, "step": 365 }, { "epoch": 0.5097493036211699, "grad_norm": 4.120882987976074, "learning_rate": 6.779135872654917e-05, "loss": 8.2241, "step": 366 }, { "epoch": 0.5111420612813371, "grad_norm": 4.087029933929443, "learning_rate": 6.778140989198408e-05, "loss": 7.9492, "step": 367 }, { "epoch": 0.5125348189415042, "grad_norm": 3.694951057434082, "learning_rate": 6.777146105741898e-05, "loss": 7.9803, "step": 368 }, { "epoch": 0.5139275766016713, "grad_norm": 4.075655460357666, "learning_rate": 6.776151222285389e-05, "loss": 7.917, "step": 369 }, { "epoch": 0.5153203342618384, "grad_norm": 4.9306254386901855, "learning_rate": 6.77515633882888e-05, "loss": 7.8565, "step": 370 }, { "epoch": 0.5167130919220055, "grad_norm": 4.179455280303955, "learning_rate": 6.77416145537237e-05, "loss": 7.8994, "step": 371 }, { "epoch": 0.5181058495821727, "grad_norm": 4.032535552978516, "learning_rate": 6.773166571915861e-05, "loss": 8.0329, "step": 372 }, { "epoch": 0.5194986072423399, "grad_norm": 3.875140905380249, "learning_rate": 6.772171688459351e-05, "loss": 7.4966, "step": 373 }, { "epoch": 0.520891364902507, "grad_norm": 4.436954498291016, "learning_rate": 6.771176805002842e-05, "loss": 8.0252, "step": 374 }, { "epoch": 0.5222841225626741, "grad_norm": 3.833369493484497, "learning_rate": 6.770181921546332e-05, "loss": 8.0937, "step": 375 }, { "epoch": 0.5236768802228412, "grad_norm": 3.9323227405548096, "learning_rate": 6.769187038089824e-05, "loss": 7.9147, "step": 376 }, { "epoch": 0.5250696378830083, "grad_norm": 4.252800941467285, "learning_rate": 6.768192154633313e-05, "loss": 7.9065, "step": 377 }, { "epoch": 0.5264623955431755, "grad_norm": 4.6514482498168945, "learning_rate": 6.767197271176805e-05, "loss": 7.7514, "step": 378 }, { "epoch": 0.5278551532033426, "grad_norm": 4.116487503051758, "learning_rate": 6.766202387720295e-05, "loss": 8.1345, "step": 379 }, { "epoch": 0.5292479108635098, "grad_norm": 4.300816059112549, "learning_rate": 6.765207504263786e-05, "loss": 7.8827, "step": 380 }, { "epoch": 0.5306406685236769, "grad_norm": 3.882556676864624, "learning_rate": 6.764212620807276e-05, "loss": 7.9054, "step": 381 }, { "epoch": 0.532033426183844, "grad_norm": 4.058185577392578, "learning_rate": 6.763217737350767e-05, "loss": 7.7686, "step": 382 }, { "epoch": 0.5334261838440112, "grad_norm": 3.925845146179199, "learning_rate": 6.762222853894257e-05, "loss": 7.9461, "step": 383 }, { "epoch": 0.5348189415041783, "grad_norm": 4.54960823059082, "learning_rate": 6.761227970437749e-05, "loss": 7.9756, "step": 384 }, { "epoch": 0.5362116991643454, "grad_norm": 4.5480427742004395, "learning_rate": 6.760233086981239e-05, "loss": 7.9223, "step": 385 }, { "epoch": 0.5376044568245125, "grad_norm": 3.820448875427246, "learning_rate": 6.75923820352473e-05, "loss": 7.9005, "step": 386 }, { "epoch": 0.5389972144846796, "grad_norm": 4.343704700469971, "learning_rate": 6.75824332006822e-05, "loss": 7.5845, "step": 387 }, { "epoch": 0.5403899721448467, "grad_norm": 4.2667036056518555, "learning_rate": 6.75724843661171e-05, "loss": 7.914, "step": 388 }, { "epoch": 0.541782729805014, "grad_norm": 3.9239776134490967, "learning_rate": 6.756253553155201e-05, "loss": 8.2154, "step": 389 }, { "epoch": 0.5431754874651811, "grad_norm": 3.943493604660034, "learning_rate": 6.755258669698692e-05, "loss": 7.4672, "step": 390 }, { "epoch": 0.5445682451253482, "grad_norm": 4.2096266746521, "learning_rate": 6.754263786242182e-05, "loss": 7.7497, "step": 391 }, { "epoch": 0.5459610027855153, "grad_norm": 3.7932655811309814, "learning_rate": 6.753268902785673e-05, "loss": 8.2188, "step": 392 }, { "epoch": 0.5473537604456824, "grad_norm": 4.857273578643799, "learning_rate": 6.752274019329164e-05, "loss": 7.8588, "step": 393 }, { "epoch": 0.5487465181058496, "grad_norm": 4.0276007652282715, "learning_rate": 6.751279135872654e-05, "loss": 7.9237, "step": 394 }, { "epoch": 0.5501392757660167, "grad_norm": 4.130008697509766, "learning_rate": 6.750284252416146e-05, "loss": 7.5528, "step": 395 }, { "epoch": 0.5515320334261838, "grad_norm": 4.636471271514893, "learning_rate": 6.749289368959635e-05, "loss": 7.684, "step": 396 }, { "epoch": 0.552924791086351, "grad_norm": 4.785216331481934, "learning_rate": 6.748294485503125e-05, "loss": 8.0515, "step": 397 }, { "epoch": 0.5543175487465181, "grad_norm": 4.659693241119385, "learning_rate": 6.747299602046617e-05, "loss": 7.8735, "step": 398 }, { "epoch": 0.5557103064066853, "grad_norm": 4.613758563995361, "learning_rate": 6.746304718590108e-05, "loss": 7.964, "step": 399 }, { "epoch": 0.5571030640668524, "grad_norm": 4.028876781463623, "learning_rate": 6.745309835133598e-05, "loss": 7.5535, "step": 400 }, { "epoch": 0.5584958217270195, "grad_norm": 6.227209091186523, "learning_rate": 6.744314951677088e-05, "loss": 7.8115, "step": 401 }, { "epoch": 0.5598885793871866, "grad_norm": 5.188333511352539, "learning_rate": 6.743320068220579e-05, "loss": 7.6271, "step": 402 }, { "epoch": 0.5612813370473537, "grad_norm": 5.935398101806641, "learning_rate": 6.74232518476407e-05, "loss": 8.2987, "step": 403 }, { "epoch": 0.5626740947075209, "grad_norm": 4.565037727355957, "learning_rate": 6.741330301307561e-05, "loss": 7.4819, "step": 404 }, { "epoch": 0.564066852367688, "grad_norm": 4.986330032348633, "learning_rate": 6.740335417851051e-05, "loss": 7.6441, "step": 405 }, { "epoch": 0.5654596100278552, "grad_norm": 4.388195991516113, "learning_rate": 6.739340534394542e-05, "loss": 7.6117, "step": 406 }, { "epoch": 0.5668523676880223, "grad_norm": 4.697558403015137, "learning_rate": 6.738345650938032e-05, "loss": 7.7427, "step": 407 }, { "epoch": 0.5682451253481894, "grad_norm": 4.241237640380859, "learning_rate": 6.737350767481523e-05, "loss": 7.9022, "step": 408 }, { "epoch": 0.5696378830083565, "grad_norm": 4.273233413696289, "learning_rate": 6.736355884025014e-05, "loss": 7.637, "step": 409 }, { "epoch": 0.5710306406685237, "grad_norm": 4.361372947692871, "learning_rate": 6.735361000568505e-05, "loss": 7.5757, "step": 410 }, { "epoch": 0.5724233983286908, "grad_norm": 4.265485763549805, "learning_rate": 6.734366117111995e-05, "loss": 7.6315, "step": 411 }, { "epoch": 0.5738161559888579, "grad_norm": 4.280587673187256, "learning_rate": 6.733371233655486e-05, "loss": 7.6017, "step": 412 }, { "epoch": 0.575208913649025, "grad_norm": 5.131948471069336, "learning_rate": 6.732376350198976e-05, "loss": 7.973, "step": 413 }, { "epoch": 0.5766016713091922, "grad_norm": 4.602144241333008, "learning_rate": 6.731381466742466e-05, "loss": 7.8972, "step": 414 }, { "epoch": 0.5779944289693594, "grad_norm": 4.569875240325928, "learning_rate": 6.730386583285958e-05, "loss": 7.9031, "step": 415 }, { "epoch": 0.5793871866295265, "grad_norm": 4.378690719604492, "learning_rate": 6.729391699829447e-05, "loss": 7.7355, "step": 416 }, { "epoch": 0.5807799442896936, "grad_norm": 5.248675346374512, "learning_rate": 6.728396816372939e-05, "loss": 7.4518, "step": 417 }, { "epoch": 0.5821727019498607, "grad_norm": 5.819039344787598, "learning_rate": 6.727401932916429e-05, "loss": 7.7974, "step": 418 }, { "epoch": 0.5835654596100278, "grad_norm": 4.727663040161133, "learning_rate": 6.72640704945992e-05, "loss": 7.3703, "step": 419 }, { "epoch": 0.584958217270195, "grad_norm": 3.7929654121398926, "learning_rate": 6.725412166003411e-05, "loss": 7.989, "step": 420 }, { "epoch": 0.5863509749303621, "grad_norm": 4.322558403015137, "learning_rate": 6.7244172825469e-05, "loss": 7.877, "step": 421 }, { "epoch": 0.5877437325905293, "grad_norm": 4.5659613609313965, "learning_rate": 6.723422399090391e-05, "loss": 7.8418, "step": 422 }, { "epoch": 0.5891364902506964, "grad_norm": 4.2391133308410645, "learning_rate": 6.722427515633883e-05, "loss": 7.6219, "step": 423 }, { "epoch": 0.5905292479108635, "grad_norm": 4.069179058074951, "learning_rate": 6.721432632177373e-05, "loss": 7.9688, "step": 424 }, { "epoch": 0.5919220055710307, "grad_norm": 4.541570663452148, "learning_rate": 6.720437748720864e-05, "loss": 7.9766, "step": 425 }, { "epoch": 0.5933147632311978, "grad_norm": 4.306227207183838, "learning_rate": 6.719442865264354e-05, "loss": 7.7377, "step": 426 }, { "epoch": 0.5947075208913649, "grad_norm": 4.158151626586914, "learning_rate": 6.718447981807844e-05, "loss": 7.5197, "step": 427 }, { "epoch": 0.596100278551532, "grad_norm": 4.7426228523254395, "learning_rate": 6.717453098351336e-05, "loss": 7.9801, "step": 428 }, { "epoch": 0.5974930362116991, "grad_norm": 4.626999378204346, "learning_rate": 6.716458214894827e-05, "loss": 7.5905, "step": 429 }, { "epoch": 0.5988857938718662, "grad_norm": 5.324058532714844, "learning_rate": 6.715463331438317e-05, "loss": 8.0235, "step": 430 }, { "epoch": 0.6002785515320335, "grad_norm": 4.126290798187256, "learning_rate": 6.714468447981807e-05, "loss": 7.8866, "step": 431 }, { "epoch": 0.6016713091922006, "grad_norm": 3.7322535514831543, "learning_rate": 6.713473564525298e-05, "loss": 7.4555, "step": 432 }, { "epoch": 0.6030640668523677, "grad_norm": 4.0403876304626465, "learning_rate": 6.712478681068788e-05, "loss": 7.7761, "step": 433 }, { "epoch": 0.6044568245125348, "grad_norm": 4.520780563354492, "learning_rate": 6.71148379761228e-05, "loss": 7.9322, "step": 434 }, { "epoch": 0.6058495821727019, "grad_norm": 3.943120002746582, "learning_rate": 6.710488914155769e-05, "loss": 7.4862, "step": 435 }, { "epoch": 0.6072423398328691, "grad_norm": 5.254693984985352, "learning_rate": 6.709494030699261e-05, "loss": 7.375, "step": 436 }, { "epoch": 0.6086350974930362, "grad_norm": 3.7695791721343994, "learning_rate": 6.708499147242751e-05, "loss": 7.7116, "step": 437 }, { "epoch": 0.6100278551532033, "grad_norm": 4.416331768035889, "learning_rate": 6.707504263786242e-05, "loss": 7.9933, "step": 438 }, { "epoch": 0.6114206128133705, "grad_norm": 3.907776117324829, "learning_rate": 6.706509380329732e-05, "loss": 7.9828, "step": 439 }, { "epoch": 0.6128133704735376, "grad_norm": 4.00132942199707, "learning_rate": 6.705514496873222e-05, "loss": 7.4383, "step": 440 }, { "epoch": 0.6142061281337048, "grad_norm": 3.9430370330810547, "learning_rate": 6.704519613416713e-05, "loss": 7.5872, "step": 441 }, { "epoch": 0.6155988857938719, "grad_norm": 4.282450199127197, "learning_rate": 6.703524729960205e-05, "loss": 7.2941, "step": 442 }, { "epoch": 0.616991643454039, "grad_norm": 4.128746509552002, "learning_rate": 6.702529846503695e-05, "loss": 7.5871, "step": 443 }, { "epoch": 0.6183844011142061, "grad_norm": 4.742000102996826, "learning_rate": 6.701534963047185e-05, "loss": 7.7283, "step": 444 }, { "epoch": 0.6197771587743732, "grad_norm": 4.397759437561035, "learning_rate": 6.700540079590676e-05, "loss": 7.4465, "step": 445 }, { "epoch": 0.6211699164345403, "grad_norm": 5.393130302429199, "learning_rate": 6.699545196134166e-05, "loss": 8.0633, "step": 446 }, { "epoch": 0.6225626740947076, "grad_norm": 4.592435836791992, "learning_rate": 6.698550312677657e-05, "loss": 7.5154, "step": 447 }, { "epoch": 0.6239554317548747, "grad_norm": 6.1243109703063965, "learning_rate": 6.697555429221148e-05, "loss": 7.3979, "step": 448 }, { "epoch": 0.6253481894150418, "grad_norm": 5.653916835784912, "learning_rate": 6.696560545764639e-05, "loss": 7.3102, "step": 449 }, { "epoch": 0.6267409470752089, "grad_norm": 4.128157138824463, "learning_rate": 6.695565662308129e-05, "loss": 7.6995, "step": 450 }, { "epoch": 0.628133704735376, "grad_norm": 4.498085975646973, "learning_rate": 6.69457077885162e-05, "loss": 7.3869, "step": 451 }, { "epoch": 0.6295264623955432, "grad_norm": 4.387475490570068, "learning_rate": 6.69357589539511e-05, "loss": 7.3913, "step": 452 }, { "epoch": 0.6309192200557103, "grad_norm": 4.345146179199219, "learning_rate": 6.692581011938602e-05, "loss": 7.7166, "step": 453 }, { "epoch": 0.6323119777158774, "grad_norm": 4.356617450714111, "learning_rate": 6.691586128482092e-05, "loss": 7.2329, "step": 454 }, { "epoch": 0.6337047353760445, "grad_norm": 4.563919544219971, "learning_rate": 6.690591245025583e-05, "loss": 7.4578, "step": 455 }, { "epoch": 0.6350974930362117, "grad_norm": 3.964571952819824, "learning_rate": 6.689596361569073e-05, "loss": 7.3362, "step": 456 }, { "epoch": 0.6364902506963789, "grad_norm": 3.775155782699585, "learning_rate": 6.688601478112563e-05, "loss": 7.1363, "step": 457 }, { "epoch": 0.637883008356546, "grad_norm": 4.498113632202148, "learning_rate": 6.687606594656054e-05, "loss": 7.3597, "step": 458 }, { "epoch": 0.6392757660167131, "grad_norm": 4.065338134765625, "learning_rate": 6.686611711199546e-05, "loss": 7.8226, "step": 459 }, { "epoch": 0.6406685236768802, "grad_norm": 4.917151927947998, "learning_rate": 6.685616827743035e-05, "loss": 7.6501, "step": 460 }, { "epoch": 0.6420612813370473, "grad_norm": 4.308130741119385, "learning_rate": 6.684621944286526e-05, "loss": 7.6445, "step": 461 }, { "epoch": 0.6434540389972145, "grad_norm": 3.901914596557617, "learning_rate": 6.683627060830017e-05, "loss": 7.8137, "step": 462 }, { "epoch": 0.6448467966573816, "grad_norm": 3.91216778755188, "learning_rate": 6.682632177373507e-05, "loss": 7.7947, "step": 463 }, { "epoch": 0.6462395543175488, "grad_norm": 4.771979331970215, "learning_rate": 6.681637293916998e-05, "loss": 7.6896, "step": 464 }, { "epoch": 0.6476323119777159, "grad_norm": 5.148728847503662, "learning_rate": 6.680642410460488e-05, "loss": 7.4895, "step": 465 }, { "epoch": 0.649025069637883, "grad_norm": 4.330319404602051, "learning_rate": 6.679647527003978e-05, "loss": 7.6376, "step": 466 }, { "epoch": 0.6504178272980501, "grad_norm": 4.768857002258301, "learning_rate": 6.67865264354747e-05, "loss": 7.7503, "step": 467 }, { "epoch": 0.6518105849582173, "grad_norm": 4.0677900314331055, "learning_rate": 6.67765776009096e-05, "loss": 7.5107, "step": 468 }, { "epoch": 0.6532033426183844, "grad_norm": 5.170895576477051, "learning_rate": 6.676662876634451e-05, "loss": 7.6665, "step": 469 }, { "epoch": 0.6545961002785515, "grad_norm": 6.822413444519043, "learning_rate": 6.675667993177941e-05, "loss": 7.3319, "step": 470 }, { "epoch": 0.6559888579387186, "grad_norm": 4.619007587432861, "learning_rate": 6.674673109721432e-05, "loss": 7.4256, "step": 471 }, { "epoch": 0.6573816155988857, "grad_norm": 4.023612022399902, "learning_rate": 6.673678226264922e-05, "loss": 7.5926, "step": 472 }, { "epoch": 0.658774373259053, "grad_norm": 4.313607215881348, "learning_rate": 6.672683342808414e-05, "loss": 7.7134, "step": 473 }, { "epoch": 0.6601671309192201, "grad_norm": 4.842207908630371, "learning_rate": 6.671688459351903e-05, "loss": 7.4796, "step": 474 }, { "epoch": 0.6615598885793872, "grad_norm": 8.598404884338379, "learning_rate": 6.670693575895395e-05, "loss": 7.584, "step": 475 }, { "epoch": 0.6629526462395543, "grad_norm": 4.583897113800049, "learning_rate": 6.669698692438885e-05, "loss": 7.6584, "step": 476 }, { "epoch": 0.6643454038997214, "grad_norm": 5.060597896575928, "learning_rate": 6.668703808982376e-05, "loss": 7.2289, "step": 477 }, { "epoch": 0.6657381615598886, "grad_norm": 4.662160873413086, "learning_rate": 6.667708925525867e-05, "loss": 7.1151, "step": 478 }, { "epoch": 0.6671309192200557, "grad_norm": 4.092073917388916, "learning_rate": 6.666714042069356e-05, "loss": 7.0661, "step": 479 }, { "epoch": 0.6685236768802229, "grad_norm": 4.688446998596191, "learning_rate": 6.665719158612847e-05, "loss": 7.8714, "step": 480 }, { "epoch": 0.66991643454039, "grad_norm": 6.650211334228516, "learning_rate": 6.664724275156339e-05, "loss": 7.4456, "step": 481 }, { "epoch": 0.6713091922005571, "grad_norm": 4.948154926300049, "learning_rate": 6.663729391699829e-05, "loss": 7.3168, "step": 482 }, { "epoch": 0.6727019498607242, "grad_norm": 4.459090232849121, "learning_rate": 6.66273450824332e-05, "loss": 7.7445, "step": 483 }, { "epoch": 0.6740947075208914, "grad_norm": 4.993417263031006, "learning_rate": 6.66173962478681e-05, "loss": 7.5161, "step": 484 }, { "epoch": 0.6754874651810585, "grad_norm": 4.263315200805664, "learning_rate": 6.6607447413303e-05, "loss": 7.7636, "step": 485 }, { "epoch": 0.6768802228412256, "grad_norm": 3.9328417778015137, "learning_rate": 6.659749857873792e-05, "loss": 7.7585, "step": 486 }, { "epoch": 0.6782729805013927, "grad_norm": 4.226478576660156, "learning_rate": 6.658754974417282e-05, "loss": 7.7491, "step": 487 }, { "epoch": 0.6796657381615598, "grad_norm": 4.163766860961914, "learning_rate": 6.657760090960773e-05, "loss": 7.2713, "step": 488 }, { "epoch": 0.6810584958217271, "grad_norm": 4.728254318237305, "learning_rate": 6.656765207504263e-05, "loss": 7.5866, "step": 489 }, { "epoch": 0.6824512534818942, "grad_norm": 4.2728657722473145, "learning_rate": 6.655770324047754e-05, "loss": 7.4582, "step": 490 }, { "epoch": 0.6838440111420613, "grad_norm": 4.443821430206299, "learning_rate": 6.654775440591244e-05, "loss": 7.4313, "step": 491 }, { "epoch": 0.6852367688022284, "grad_norm": 4.36879301071167, "learning_rate": 6.653780557134736e-05, "loss": 7.3954, "step": 492 }, { "epoch": 0.6866295264623955, "grad_norm": 3.9841506481170654, "learning_rate": 6.652785673678226e-05, "loss": 7.3566, "step": 493 }, { "epoch": 0.6880222841225627, "grad_norm": 4.223026752471924, "learning_rate": 6.651790790221717e-05, "loss": 7.4146, "step": 494 }, { "epoch": 0.6894150417827298, "grad_norm": 4.6790008544921875, "learning_rate": 6.650795906765207e-05, "loss": 7.3331, "step": 495 }, { "epoch": 0.6908077994428969, "grad_norm": 4.291107654571533, "learning_rate": 6.649801023308697e-05, "loss": 7.2829, "step": 496 }, { "epoch": 0.692200557103064, "grad_norm": 4.316932678222656, "learning_rate": 6.648806139852188e-05, "loss": 7.41, "step": 497 }, { "epoch": 0.6935933147632312, "grad_norm": 4.292764663696289, "learning_rate": 6.64781125639568e-05, "loss": 7.2584, "step": 498 }, { "epoch": 0.6949860724233984, "grad_norm": 4.277196407318115, "learning_rate": 6.646816372939169e-05, "loss": 7.3955, "step": 499 }, { "epoch": 0.6963788300835655, "grad_norm": 4.318631649017334, "learning_rate": 6.64582148948266e-05, "loss": 7.3698, "step": 500 }, { "epoch": 0.6977715877437326, "grad_norm": 4.1690263748168945, "learning_rate": 6.644826606026151e-05, "loss": 7.2454, "step": 501 }, { "epoch": 0.6991643454038997, "grad_norm": 3.916719675064087, "learning_rate": 6.643831722569641e-05, "loss": 7.1476, "step": 502 }, { "epoch": 0.7005571030640668, "grad_norm": 3.9907374382019043, "learning_rate": 6.642836839113132e-05, "loss": 7.4952, "step": 503 }, { "epoch": 0.7019498607242339, "grad_norm": 4.071834564208984, "learning_rate": 6.641841955656622e-05, "loss": 7.492, "step": 504 }, { "epoch": 0.7033426183844012, "grad_norm": 4.0513458251953125, "learning_rate": 6.640847072200112e-05, "loss": 7.4342, "step": 505 }, { "epoch": 0.7047353760445683, "grad_norm": 4.521594524383545, "learning_rate": 6.639852188743604e-05, "loss": 7.3739, "step": 506 }, { "epoch": 0.7061281337047354, "grad_norm": 4.986645221710205, "learning_rate": 6.638857305287095e-05, "loss": 7.55, "step": 507 }, { "epoch": 0.7075208913649025, "grad_norm": 4.119948387145996, "learning_rate": 6.637862421830585e-05, "loss": 7.7317, "step": 508 }, { "epoch": 0.7089136490250696, "grad_norm": 5.314901351928711, "learning_rate": 6.636867538374075e-05, "loss": 7.7665, "step": 509 }, { "epoch": 0.7103064066852368, "grad_norm": 4.315718650817871, "learning_rate": 6.635872654917566e-05, "loss": 7.1784, "step": 510 }, { "epoch": 0.7116991643454039, "grad_norm": 3.93410325050354, "learning_rate": 6.634877771461058e-05, "loss": 7.4012, "step": 511 }, { "epoch": 0.713091922005571, "grad_norm": 4.619295597076416, "learning_rate": 6.633882888004548e-05, "loss": 7.178, "step": 512 }, { "epoch": 0.7144846796657381, "grad_norm": 4.923532485961914, "learning_rate": 6.632888004548038e-05, "loss": 7.4176, "step": 513 }, { "epoch": 0.7158774373259053, "grad_norm": 5.159674167633057, "learning_rate": 6.631893121091529e-05, "loss": 7.2846, "step": 514 }, { "epoch": 0.7172701949860725, "grad_norm": 3.867479085922241, "learning_rate": 6.630898237635019e-05, "loss": 7.142, "step": 515 }, { "epoch": 0.7186629526462396, "grad_norm": 4.792901039123535, "learning_rate": 6.62990335417851e-05, "loss": 7.4178, "step": 516 }, { "epoch": 0.7200557103064067, "grad_norm": 4.785666465759277, "learning_rate": 6.628908470722001e-05, "loss": 7.1373, "step": 517 }, { "epoch": 0.7214484679665738, "grad_norm": 5.257824420928955, "learning_rate": 6.62791358726549e-05, "loss": 7.3563, "step": 518 }, { "epoch": 0.7228412256267409, "grad_norm": 4.598160266876221, "learning_rate": 6.626918703808982e-05, "loss": 7.2522, "step": 519 }, { "epoch": 0.724233983286908, "grad_norm": 4.3474040031433105, "learning_rate": 6.625923820352473e-05, "loss": 7.7297, "step": 520 }, { "epoch": 0.7256267409470752, "grad_norm": 4.432811737060547, "learning_rate": 6.624928936895963e-05, "loss": 7.3761, "step": 521 }, { "epoch": 0.7270194986072424, "grad_norm": 3.7676665782928467, "learning_rate": 6.623934053439453e-05, "loss": 7.2723, "step": 522 }, { "epoch": 0.7284122562674095, "grad_norm": 4.212183952331543, "learning_rate": 6.622939169982944e-05, "loss": 7.1899, "step": 523 }, { "epoch": 0.7298050139275766, "grad_norm": 3.817636013031006, "learning_rate": 6.621944286526434e-05, "loss": 7.0919, "step": 524 }, { "epoch": 0.7311977715877437, "grad_norm": 3.760101079940796, "learning_rate": 6.620949403069926e-05, "loss": 7.0633, "step": 525 }, { "epoch": 0.7325905292479109, "grad_norm": 4.175741672515869, "learning_rate": 6.619954519613416e-05, "loss": 7.053, "step": 526 }, { "epoch": 0.733983286908078, "grad_norm": 4.742794036865234, "learning_rate": 6.618959636156907e-05, "loss": 7.4681, "step": 527 }, { "epoch": 0.7353760445682451, "grad_norm": 5.325410842895508, "learning_rate": 6.617964752700397e-05, "loss": 7.5734, "step": 528 }, { "epoch": 0.7367688022284122, "grad_norm": 4.095676422119141, "learning_rate": 6.616969869243888e-05, "loss": 7.5398, "step": 529 }, { "epoch": 0.7381615598885793, "grad_norm": 5.706197738647461, "learning_rate": 6.615974985787378e-05, "loss": 7.4122, "step": 530 }, { "epoch": 0.7395543175487466, "grad_norm": 4.12428617477417, "learning_rate": 6.61498010233087e-05, "loss": 7.1195, "step": 531 }, { "epoch": 0.7409470752089137, "grad_norm": 4.2953691482543945, "learning_rate": 6.61398521887436e-05, "loss": 6.9206, "step": 532 }, { "epoch": 0.7423398328690808, "grad_norm": 4.770367622375488, "learning_rate": 6.61299033541785e-05, "loss": 7.0371, "step": 533 }, { "epoch": 0.7437325905292479, "grad_norm": 4.273123264312744, "learning_rate": 6.611995451961341e-05, "loss": 6.5632, "step": 534 }, { "epoch": 0.745125348189415, "grad_norm": 5.233716011047363, "learning_rate": 6.611000568504831e-05, "loss": 7.3385, "step": 535 }, { "epoch": 0.7465181058495822, "grad_norm": 4.813663959503174, "learning_rate": 6.610005685048323e-05, "loss": 7.2633, "step": 536 }, { "epoch": 0.7479108635097493, "grad_norm": 4.405749797821045, "learning_rate": 6.609010801591814e-05, "loss": 7.4617, "step": 537 }, { "epoch": 0.7493036211699164, "grad_norm": 4.777441024780273, "learning_rate": 6.608015918135304e-05, "loss": 7.5012, "step": 538 }, { "epoch": 0.7506963788300836, "grad_norm": 3.9238295555114746, "learning_rate": 6.607021034678794e-05, "loss": 7.3552, "step": 539 }, { "epoch": 0.7520891364902507, "grad_norm": 5.207787990570068, "learning_rate": 6.606026151222285e-05, "loss": 6.8543, "step": 540 }, { "epoch": 0.7534818941504178, "grad_norm": 4.217895984649658, "learning_rate": 6.605031267765775e-05, "loss": 7.2798, "step": 541 }, { "epoch": 0.754874651810585, "grad_norm": 4.337287425994873, "learning_rate": 6.604036384309267e-05, "loss": 7.2047, "step": 542 }, { "epoch": 0.7562674094707521, "grad_norm": 5.529717922210693, "learning_rate": 6.603041500852756e-05, "loss": 7.3142, "step": 543 }, { "epoch": 0.7576601671309192, "grad_norm": 4.267019271850586, "learning_rate": 6.602046617396248e-05, "loss": 6.9866, "step": 544 }, { "epoch": 0.7590529247910863, "grad_norm": 4.3455352783203125, "learning_rate": 6.601051733939738e-05, "loss": 6.9549, "step": 545 }, { "epoch": 0.7604456824512534, "grad_norm": 4.68610143661499, "learning_rate": 6.600056850483229e-05, "loss": 7.5779, "step": 546 }, { "epoch": 0.7618384401114207, "grad_norm": 5.094757080078125, "learning_rate": 6.599061967026719e-05, "loss": 7.4815, "step": 547 }, { "epoch": 0.7632311977715878, "grad_norm": 4.233945846557617, "learning_rate": 6.59806708357021e-05, "loss": 7.3032, "step": 548 }, { "epoch": 0.7646239554317549, "grad_norm": 4.713217258453369, "learning_rate": 6.5970722001137e-05, "loss": 7.3833, "step": 549 }, { "epoch": 0.766016713091922, "grad_norm": 6.269781112670898, "learning_rate": 6.596077316657192e-05, "loss": 7.212, "step": 550 }, { "epoch": 0.7674094707520891, "grad_norm": 4.588368892669678, "learning_rate": 6.595082433200682e-05, "loss": 7.448, "step": 551 }, { "epoch": 0.7688022284122563, "grad_norm": 4.187366008758545, "learning_rate": 6.594087549744172e-05, "loss": 7.0892, "step": 552 }, { "epoch": 0.7701949860724234, "grad_norm": 4.566118240356445, "learning_rate": 6.593092666287663e-05, "loss": 6.8282, "step": 553 }, { "epoch": 0.7715877437325905, "grad_norm": 5.366352081298828, "learning_rate": 6.592097782831153e-05, "loss": 7.4461, "step": 554 }, { "epoch": 0.7729805013927576, "grad_norm": 4.582744598388672, "learning_rate": 6.591102899374644e-05, "loss": 7.0199, "step": 555 }, { "epoch": 0.7743732590529248, "grad_norm": 3.6933884620666504, "learning_rate": 6.590108015918135e-05, "loss": 6.9732, "step": 556 }, { "epoch": 0.775766016713092, "grad_norm": 4.238746166229248, "learning_rate": 6.589113132461624e-05, "loss": 7.621, "step": 557 }, { "epoch": 0.7771587743732591, "grad_norm": 5.870996475219727, "learning_rate": 6.588118249005116e-05, "loss": 7.1813, "step": 558 }, { "epoch": 0.7785515320334262, "grad_norm": 4.53729248046875, "learning_rate": 6.587123365548607e-05, "loss": 7.2066, "step": 559 }, { "epoch": 0.7799442896935933, "grad_norm": 5.991088390350342, "learning_rate": 6.586128482092097e-05, "loss": 7.1529, "step": 560 }, { "epoch": 0.7813370473537604, "grad_norm": 7.669244289398193, "learning_rate": 6.585133598635589e-05, "loss": 7.3021, "step": 561 }, { "epoch": 0.7827298050139275, "grad_norm": 7.90692663192749, "learning_rate": 6.584138715179078e-05, "loss": 7.4128, "step": 562 }, { "epoch": 0.7841225626740947, "grad_norm": 5.141284942626953, "learning_rate": 6.583143831722568e-05, "loss": 7.0841, "step": 563 }, { "epoch": 0.7855153203342619, "grad_norm": 5.19080924987793, "learning_rate": 6.58214894826606e-05, "loss": 7.2239, "step": 564 }, { "epoch": 0.786908077994429, "grad_norm": 4.016646862030029, "learning_rate": 6.58115406480955e-05, "loss": 6.8987, "step": 565 }, { "epoch": 0.7883008356545961, "grad_norm": 4.256837368011475, "learning_rate": 6.580159181353041e-05, "loss": 6.9851, "step": 566 }, { "epoch": 0.7896935933147632, "grad_norm": 4.62431001663208, "learning_rate": 6.579164297896531e-05, "loss": 6.9119, "step": 567 }, { "epoch": 0.7910863509749304, "grad_norm": 4.101048946380615, "learning_rate": 6.578169414440022e-05, "loss": 7.0689, "step": 568 }, { "epoch": 0.7924791086350975, "grad_norm": 4.175568103790283, "learning_rate": 6.577174530983513e-05, "loss": 7.0084, "step": 569 }, { "epoch": 0.7938718662952646, "grad_norm": 4.61623477935791, "learning_rate": 6.576179647527004e-05, "loss": 7.0022, "step": 570 }, { "epoch": 0.7952646239554317, "grad_norm": 4.684569358825684, "learning_rate": 6.575184764070494e-05, "loss": 7.2584, "step": 571 }, { "epoch": 0.7966573816155988, "grad_norm": 5.298076629638672, "learning_rate": 6.574189880613985e-05, "loss": 7.5078, "step": 572 }, { "epoch": 0.7980501392757661, "grad_norm": 4.708342552185059, "learning_rate": 6.573194997157475e-05, "loss": 7.0561, "step": 573 }, { "epoch": 0.7994428969359332, "grad_norm": 4.700664520263672, "learning_rate": 6.572200113700965e-05, "loss": 6.9512, "step": 574 }, { "epoch": 0.8008356545961003, "grad_norm": 4.937999248504639, "learning_rate": 6.571205230244457e-05, "loss": 7.3912, "step": 575 }, { "epoch": 0.8022284122562674, "grad_norm": 4.863896369934082, "learning_rate": 6.570210346787948e-05, "loss": 7.213, "step": 576 }, { "epoch": 0.8036211699164345, "grad_norm": 4.428140640258789, "learning_rate": 6.569215463331438e-05, "loss": 7.0807, "step": 577 }, { "epoch": 0.8050139275766016, "grad_norm": 4.518229961395264, "learning_rate": 6.568220579874928e-05, "loss": 7.2289, "step": 578 }, { "epoch": 0.8064066852367688, "grad_norm": 4.143688201904297, "learning_rate": 6.567225696418419e-05, "loss": 7.0841, "step": 579 }, { "epoch": 0.807799442896936, "grad_norm": 4.04298734664917, "learning_rate": 6.566230812961909e-05, "loss": 6.8355, "step": 580 }, { "epoch": 0.8091922005571031, "grad_norm": 3.7745213508605957, "learning_rate": 6.565235929505401e-05, "loss": 6.7909, "step": 581 }, { "epoch": 0.8105849582172702, "grad_norm": 4.9903483390808105, "learning_rate": 6.56424104604889e-05, "loss": 7.1245, "step": 582 }, { "epoch": 0.8119777158774373, "grad_norm": 4.211728096008301, "learning_rate": 6.563246162592382e-05, "loss": 7.1137, "step": 583 }, { "epoch": 0.8133704735376045, "grad_norm": 3.8617141246795654, "learning_rate": 6.562251279135872e-05, "loss": 7.6446, "step": 584 }, { "epoch": 0.8147632311977716, "grad_norm": 3.9979238510131836, "learning_rate": 6.561256395679363e-05, "loss": 7.1116, "step": 585 }, { "epoch": 0.8161559888579387, "grad_norm": 4.649385929107666, "learning_rate": 6.560261512222853e-05, "loss": 7.062, "step": 586 }, { "epoch": 0.8175487465181058, "grad_norm": 4.784141540527344, "learning_rate": 6.559266628766343e-05, "loss": 6.5499, "step": 587 }, { "epoch": 0.8189415041782729, "grad_norm": 6.651003360748291, "learning_rate": 6.558271745309834e-05, "loss": 7.5502, "step": 588 }, { "epoch": 0.8203342618384402, "grad_norm": 4.67574405670166, "learning_rate": 6.557276861853326e-05, "loss": 6.9083, "step": 589 }, { "epoch": 0.8217270194986073, "grad_norm": 5.133337497711182, "learning_rate": 6.556281978396816e-05, "loss": 7.2497, "step": 590 }, { "epoch": 0.8231197771587744, "grad_norm": 4.505591869354248, "learning_rate": 6.555287094940306e-05, "loss": 7.2742, "step": 591 }, { "epoch": 0.8245125348189415, "grad_norm": 4.713627338409424, "learning_rate": 6.554292211483797e-05, "loss": 7.1844, "step": 592 }, { "epoch": 0.8259052924791086, "grad_norm": 4.773281097412109, "learning_rate": 6.553297328027287e-05, "loss": 6.9555, "step": 593 }, { "epoch": 0.8272980501392758, "grad_norm": 4.636007308959961, "learning_rate": 6.552302444570779e-05, "loss": 6.8225, "step": 594 }, { "epoch": 0.8286908077994429, "grad_norm": 4.04464054107666, "learning_rate": 6.55130756111427e-05, "loss": 7.1568, "step": 595 }, { "epoch": 0.83008356545961, "grad_norm": 4.1389994621276855, "learning_rate": 6.55031267765776e-05, "loss": 7.1188, "step": 596 }, { "epoch": 0.8314763231197771, "grad_norm": 4.6370086669921875, "learning_rate": 6.54931779420125e-05, "loss": 6.9095, "step": 597 }, { "epoch": 0.8328690807799443, "grad_norm": 5.183930397033691, "learning_rate": 6.54832291074474e-05, "loss": 7.2433, "step": 598 }, { "epoch": 0.8342618384401114, "grad_norm": 5.050951957702637, "learning_rate": 6.547328027288231e-05, "loss": 7.2824, "step": 599 }, { "epoch": 0.8356545961002786, "grad_norm": 4.163578987121582, "learning_rate": 6.546333143831723e-05, "loss": 6.8322, "step": 600 }, { "epoch": 0.8370473537604457, "grad_norm": 4.691806316375732, "learning_rate": 6.545338260375212e-05, "loss": 7.0893, "step": 601 }, { "epoch": 0.8384401114206128, "grad_norm": 4.006926536560059, "learning_rate": 6.544343376918704e-05, "loss": 6.8651, "step": 602 }, { "epoch": 0.8398328690807799, "grad_norm": 4.914746284484863, "learning_rate": 6.543348493462194e-05, "loss": 6.9708, "step": 603 }, { "epoch": 0.841225626740947, "grad_norm": 4.245934009552002, "learning_rate": 6.542353610005684e-05, "loss": 6.7655, "step": 604 }, { "epoch": 0.8426183844011143, "grad_norm": 4.1350860595703125, "learning_rate": 6.541358726549175e-05, "loss": 7.2147, "step": 605 }, { "epoch": 0.8440111420612814, "grad_norm": 4.082108974456787, "learning_rate": 6.540363843092665e-05, "loss": 7.0206, "step": 606 }, { "epoch": 0.8454038997214485, "grad_norm": 4.45560884475708, "learning_rate": 6.539368959636156e-05, "loss": 6.652, "step": 607 }, { "epoch": 0.8467966573816156, "grad_norm": 4.454686641693115, "learning_rate": 6.538374076179647e-05, "loss": 7.4298, "step": 608 }, { "epoch": 0.8481894150417827, "grad_norm": 4.466418743133545, "learning_rate": 6.537379192723138e-05, "loss": 7.5666, "step": 609 }, { "epoch": 0.8495821727019499, "grad_norm": 3.795778512954712, "learning_rate": 6.536384309266628e-05, "loss": 7.0145, "step": 610 }, { "epoch": 0.850974930362117, "grad_norm": 4.251140117645264, "learning_rate": 6.535389425810119e-05, "loss": 7.2689, "step": 611 }, { "epoch": 0.8523676880222841, "grad_norm": 4.9372406005859375, "learning_rate": 6.534394542353609e-05, "loss": 7.038, "step": 612 }, { "epoch": 0.8537604456824512, "grad_norm": 4.656927108764648, "learning_rate": 6.5333996588971e-05, "loss": 7.2663, "step": 613 }, { "epoch": 0.8551532033426184, "grad_norm": 4.058319091796875, "learning_rate": 6.532404775440591e-05, "loss": 7.1084, "step": 614 }, { "epoch": 0.8565459610027855, "grad_norm": 4.106003761291504, "learning_rate": 6.531409891984082e-05, "loss": 6.6016, "step": 615 }, { "epoch": 0.8579387186629527, "grad_norm": 6.479761600494385, "learning_rate": 6.530415008527572e-05, "loss": 7.2559, "step": 616 }, { "epoch": 0.8593314763231198, "grad_norm": 4.016816139221191, "learning_rate": 6.529420125071062e-05, "loss": 7.1392, "step": 617 }, { "epoch": 0.8607242339832869, "grad_norm": 4.016937732696533, "learning_rate": 6.528425241614553e-05, "loss": 6.538, "step": 618 }, { "epoch": 0.862116991643454, "grad_norm": 4.666384696960449, "learning_rate": 6.527430358158045e-05, "loss": 6.7963, "step": 619 }, { "epoch": 0.8635097493036211, "grad_norm": 4.206920623779297, "learning_rate": 6.526435474701535e-05, "loss": 7.2645, "step": 620 }, { "epoch": 0.8649025069637883, "grad_norm": 4.449158191680908, "learning_rate": 6.525440591245024e-05, "loss": 7.0839, "step": 621 }, { "epoch": 0.8662952646239555, "grad_norm": 6.0619964599609375, "learning_rate": 6.524445707788516e-05, "loss": 7.0886, "step": 622 }, { "epoch": 0.8676880222841226, "grad_norm": 4.405318737030029, "learning_rate": 6.523450824332006e-05, "loss": 6.8355, "step": 623 }, { "epoch": 0.8690807799442897, "grad_norm": 4.437068462371826, "learning_rate": 6.522455940875497e-05, "loss": 7.0541, "step": 624 }, { "epoch": 0.8704735376044568, "grad_norm": 4.335897922515869, "learning_rate": 6.521461057418988e-05, "loss": 7.2348, "step": 625 }, { "epoch": 0.871866295264624, "grad_norm": 4.303770065307617, "learning_rate": 6.520466173962477e-05, "loss": 7.127, "step": 626 }, { "epoch": 0.8732590529247911, "grad_norm": 4.896326065063477, "learning_rate": 6.519471290505969e-05, "loss": 7.2584, "step": 627 }, { "epoch": 0.8746518105849582, "grad_norm": 4.51694917678833, "learning_rate": 6.51847640704946e-05, "loss": 6.6806, "step": 628 }, { "epoch": 0.8760445682451253, "grad_norm": 4.327085018157959, "learning_rate": 6.51748152359295e-05, "loss": 7.2585, "step": 629 }, { "epoch": 0.8774373259052924, "grad_norm": 9.302055358886719, "learning_rate": 6.51648664013644e-05, "loss": 6.5966, "step": 630 }, { "epoch": 0.8788300835654597, "grad_norm": 4.5956597328186035, "learning_rate": 6.515491756679931e-05, "loss": 7.3094, "step": 631 }, { "epoch": 0.8802228412256268, "grad_norm": 4.286032199859619, "learning_rate": 6.514496873223421e-05, "loss": 6.7288, "step": 632 }, { "epoch": 0.8816155988857939, "grad_norm": 4.732513427734375, "learning_rate": 6.513501989766913e-05, "loss": 7.3134, "step": 633 }, { "epoch": 0.883008356545961, "grad_norm": 5.090188026428223, "learning_rate": 6.512507106310403e-05, "loss": 6.529, "step": 634 }, { "epoch": 0.8844011142061281, "grad_norm": 4.097725868225098, "learning_rate": 6.511512222853894e-05, "loss": 6.4284, "step": 635 }, { "epoch": 0.8857938718662952, "grad_norm": 4.353823184967041, "learning_rate": 6.510517339397384e-05, "loss": 7.0736, "step": 636 }, { "epoch": 0.8871866295264624, "grad_norm": 5.056899547576904, "learning_rate": 6.509522455940875e-05, "loss": 7.3754, "step": 637 }, { "epoch": 0.8885793871866295, "grad_norm": 4.356399059295654, "learning_rate": 6.508527572484365e-05, "loss": 6.7254, "step": 638 }, { "epoch": 0.8899721448467967, "grad_norm": 4.472954273223877, "learning_rate": 6.507532689027857e-05, "loss": 7.1479, "step": 639 }, { "epoch": 0.8913649025069638, "grad_norm": 4.049185752868652, "learning_rate": 6.506537805571346e-05, "loss": 7.0321, "step": 640 }, { "epoch": 0.8927576601671309, "grad_norm": 4.421054363250732, "learning_rate": 6.505542922114838e-05, "loss": 6.8861, "step": 641 }, { "epoch": 0.8941504178272981, "grad_norm": 4.027243137359619, "learning_rate": 6.504548038658328e-05, "loss": 7.047, "step": 642 }, { "epoch": 0.8955431754874652, "grad_norm": 4.731100559234619, "learning_rate": 6.503553155201818e-05, "loss": 6.6766, "step": 643 }, { "epoch": 0.8969359331476323, "grad_norm": 4.997005939483643, "learning_rate": 6.50255827174531e-05, "loss": 6.964, "step": 644 }, { "epoch": 0.8983286908077994, "grad_norm": 4.219837188720703, "learning_rate": 6.501563388288799e-05, "loss": 7.0504, "step": 645 }, { "epoch": 0.8997214484679665, "grad_norm": 4.365358829498291, "learning_rate": 6.50056850483229e-05, "loss": 6.7434, "step": 646 }, { "epoch": 0.9011142061281338, "grad_norm": 4.429034233093262, "learning_rate": 6.499573621375781e-05, "loss": 6.408, "step": 647 }, { "epoch": 0.9025069637883009, "grad_norm": 4.476465702056885, "learning_rate": 6.498578737919272e-05, "loss": 6.9395, "step": 648 }, { "epoch": 0.903899721448468, "grad_norm": 3.8384079933166504, "learning_rate": 6.497583854462762e-05, "loss": 6.5423, "step": 649 }, { "epoch": 0.9052924791086351, "grad_norm": 4.713837623596191, "learning_rate": 6.496588971006253e-05, "loss": 7.1157, "step": 650 }, { "epoch": 0.9066852367688022, "grad_norm": 4.122503280639648, "learning_rate": 6.495594087549743e-05, "loss": 6.6999, "step": 651 }, { "epoch": 0.9080779944289693, "grad_norm": 5.028951644897461, "learning_rate": 6.494599204093235e-05, "loss": 6.7512, "step": 652 }, { "epoch": 0.9094707520891365, "grad_norm": 4.512173175811768, "learning_rate": 6.493604320636725e-05, "loss": 6.8538, "step": 653 }, { "epoch": 0.9108635097493036, "grad_norm": 4.629344463348389, "learning_rate": 6.492609437180216e-05, "loss": 6.8566, "step": 654 }, { "epoch": 0.9122562674094707, "grad_norm": 4.417125701904297, "learning_rate": 6.491614553723706e-05, "loss": 7.0721, "step": 655 }, { "epoch": 0.9136490250696379, "grad_norm": 4.363222122192383, "learning_rate": 6.490619670267196e-05, "loss": 6.8486, "step": 656 }, { "epoch": 0.915041782729805, "grad_norm": 4.881872177124023, "learning_rate": 6.489624786810687e-05, "loss": 6.7772, "step": 657 }, { "epoch": 0.9164345403899722, "grad_norm": 6.637619495391846, "learning_rate": 6.488629903354179e-05, "loss": 7.021, "step": 658 }, { "epoch": 0.9178272980501393, "grad_norm": 4.867405891418457, "learning_rate": 6.487635019897669e-05, "loss": 7.0793, "step": 659 }, { "epoch": 0.9192200557103064, "grad_norm": 6.487813472747803, "learning_rate": 6.48664013644116e-05, "loss": 6.1775, "step": 660 }, { "epoch": 0.9206128133704735, "grad_norm": 3.9346566200256348, "learning_rate": 6.48564525298465e-05, "loss": 6.9936, "step": 661 }, { "epoch": 0.9220055710306406, "grad_norm": 4.957814693450928, "learning_rate": 6.48465036952814e-05, "loss": 6.7964, "step": 662 }, { "epoch": 0.9233983286908078, "grad_norm": 4.183300495147705, "learning_rate": 6.48365548607163e-05, "loss": 6.7053, "step": 663 }, { "epoch": 0.924791086350975, "grad_norm": 4.829319000244141, "learning_rate": 6.482660602615122e-05, "loss": 6.8848, "step": 664 }, { "epoch": 0.9261838440111421, "grad_norm": 4.310740947723389, "learning_rate": 6.481665719158611e-05, "loss": 6.8218, "step": 665 }, { "epoch": 0.9275766016713092, "grad_norm": 4.705667972564697, "learning_rate": 6.480670835702103e-05, "loss": 6.938, "step": 666 }, { "epoch": 0.9289693593314763, "grad_norm": 5.496898651123047, "learning_rate": 6.479675952245594e-05, "loss": 6.8801, "step": 667 }, { "epoch": 0.9303621169916435, "grad_norm": 4.279628753662109, "learning_rate": 6.478681068789084e-05, "loss": 6.7455, "step": 668 }, { "epoch": 0.9317548746518106, "grad_norm": 5.3207244873046875, "learning_rate": 6.477686185332574e-05, "loss": 6.8616, "step": 669 }, { "epoch": 0.9331476323119777, "grad_norm": 4.7314934730529785, "learning_rate": 6.476691301876065e-05, "loss": 6.5074, "step": 670 }, { "epoch": 0.9345403899721448, "grad_norm": 6.029335021972656, "learning_rate": 6.475696418419555e-05, "loss": 6.8448, "step": 671 }, { "epoch": 0.935933147632312, "grad_norm": 4.07269287109375, "learning_rate": 6.474701534963047e-05, "loss": 6.6289, "step": 672 }, { "epoch": 0.9373259052924791, "grad_norm": 4.37167501449585, "learning_rate": 6.473706651506537e-05, "loss": 6.3312, "step": 673 }, { "epoch": 0.9387186629526463, "grad_norm": 4.4684367179870605, "learning_rate": 6.472711768050028e-05, "loss": 6.6532, "step": 674 }, { "epoch": 0.9401114206128134, "grad_norm": 4.46663761138916, "learning_rate": 6.471716884593518e-05, "loss": 6.7704, "step": 675 }, { "epoch": 0.9415041782729805, "grad_norm": 4.736171722412109, "learning_rate": 6.470722001137009e-05, "loss": 6.7296, "step": 676 }, { "epoch": 0.9428969359331476, "grad_norm": 4.486076831817627, "learning_rate": 6.4697271176805e-05, "loss": 6.6435, "step": 677 }, { "epoch": 0.9442896935933147, "grad_norm": 4.807506561279297, "learning_rate": 6.468732234223991e-05, "loss": 6.6223, "step": 678 }, { "epoch": 0.9456824512534819, "grad_norm": 6.329336166381836, "learning_rate": 6.467737350767481e-05, "loss": 6.631, "step": 679 }, { "epoch": 0.947075208913649, "grad_norm": 5.18367338180542, "learning_rate": 6.466742467310972e-05, "loss": 6.7117, "step": 680 }, { "epoch": 0.9484679665738162, "grad_norm": 5.845064640045166, "learning_rate": 6.465747583854462e-05, "loss": 7.154, "step": 681 }, { "epoch": 0.9498607242339833, "grad_norm": 4.247320175170898, "learning_rate": 6.464752700397952e-05, "loss": 6.5966, "step": 682 }, { "epoch": 0.9512534818941504, "grad_norm": 4.217019557952881, "learning_rate": 6.463757816941444e-05, "loss": 6.8241, "step": 683 }, { "epoch": 0.9526462395543176, "grad_norm": 4.208700180053711, "learning_rate": 6.462762933484933e-05, "loss": 7.0147, "step": 684 }, { "epoch": 0.9540389972144847, "grad_norm": 3.9506630897521973, "learning_rate": 6.461768050028425e-05, "loss": 6.1799, "step": 685 }, { "epoch": 0.9554317548746518, "grad_norm": 4.633251190185547, "learning_rate": 6.460773166571915e-05, "loss": 6.586, "step": 686 }, { "epoch": 0.9568245125348189, "grad_norm": 4.501231670379639, "learning_rate": 6.459778283115406e-05, "loss": 6.69, "step": 687 }, { "epoch": 0.958217270194986, "grad_norm": 4.209427356719971, "learning_rate": 6.458783399658896e-05, "loss": 6.65, "step": 688 }, { "epoch": 0.9596100278551533, "grad_norm": 3.9768435955047607, "learning_rate": 6.457788516202387e-05, "loss": 6.829, "step": 689 }, { "epoch": 0.9610027855153204, "grad_norm": 4.2762532234191895, "learning_rate": 6.456793632745877e-05, "loss": 6.7769, "step": 690 }, { "epoch": 0.9623955431754875, "grad_norm": 4.354592800140381, "learning_rate": 6.455798749289369e-05, "loss": 6.8221, "step": 691 }, { "epoch": 0.9637883008356546, "grad_norm": 4.69321346282959, "learning_rate": 6.454803865832859e-05, "loss": 6.6603, "step": 692 }, { "epoch": 0.9651810584958217, "grad_norm": 4.885592937469482, "learning_rate": 6.45380898237635e-05, "loss": 6.8437, "step": 693 }, { "epoch": 0.9665738161559888, "grad_norm": 4.524178981781006, "learning_rate": 6.45281409891984e-05, "loss": 6.8486, "step": 694 }, { "epoch": 0.967966573816156, "grad_norm": 5.346595287322998, "learning_rate": 6.45181921546333e-05, "loss": 6.6868, "step": 695 }, { "epoch": 0.9693593314763231, "grad_norm": 5.525434970855713, "learning_rate": 6.450824332006821e-05, "loss": 6.5876, "step": 696 }, { "epoch": 0.9707520891364902, "grad_norm": 5.755575180053711, "learning_rate": 6.449829448550313e-05, "loss": 6.8912, "step": 697 }, { "epoch": 0.9721448467966574, "grad_norm": 4.54886531829834, "learning_rate": 6.448834565093803e-05, "loss": 6.8111, "step": 698 }, { "epoch": 0.9735376044568245, "grad_norm": 4.352001667022705, "learning_rate": 6.447839681637293e-05, "loss": 6.6811, "step": 699 }, { "epoch": 0.9749303621169917, "grad_norm": 4.668796062469482, "learning_rate": 6.446844798180784e-05, "loss": 6.5804, "step": 700 }, { "epoch": 0.9763231197771588, "grad_norm": 4.026281356811523, "learning_rate": 6.445849914724274e-05, "loss": 6.8115, "step": 701 }, { "epoch": 0.9777158774373259, "grad_norm": 4.048024654388428, "learning_rate": 6.444855031267766e-05, "loss": 7.0243, "step": 702 }, { "epoch": 0.979108635097493, "grad_norm": 4.501350402832031, "learning_rate": 6.443860147811256e-05, "loss": 6.6124, "step": 703 }, { "epoch": 0.9805013927576601, "grad_norm": 3.951732635498047, "learning_rate": 6.442865264354745e-05, "loss": 6.655, "step": 704 }, { "epoch": 0.9818941504178273, "grad_norm": 4.734480857849121, "learning_rate": 6.441870380898237e-05, "loss": 6.8814, "step": 705 }, { "epoch": 0.9832869080779945, "grad_norm": 5.22509765625, "learning_rate": 6.440875497441728e-05, "loss": 6.7328, "step": 706 }, { "epoch": 0.9846796657381616, "grad_norm": 4.631306171417236, "learning_rate": 6.439880613985218e-05, "loss": 6.925, "step": 707 }, { "epoch": 0.9860724233983287, "grad_norm": 4.773249626159668, "learning_rate": 6.43888573052871e-05, "loss": 6.3109, "step": 708 }, { "epoch": 0.9874651810584958, "grad_norm": 6.017326354980469, "learning_rate": 6.437890847072199e-05, "loss": 6.6903, "step": 709 }, { "epoch": 0.9888579387186629, "grad_norm": 4.226189136505127, "learning_rate": 6.43689596361569e-05, "loss": 6.7982, "step": 710 }, { "epoch": 0.9902506963788301, "grad_norm": 5.146072864532471, "learning_rate": 6.435901080159181e-05, "loss": 6.6298, "step": 711 }, { "epoch": 0.9916434540389972, "grad_norm": 4.559305191040039, "learning_rate": 6.434906196702671e-05, "loss": 6.7671, "step": 712 }, { "epoch": 0.9930362116991643, "grad_norm": 4.815158367156982, "learning_rate": 6.433911313246162e-05, "loss": 7.0803, "step": 713 }, { "epoch": 0.9944289693593314, "grad_norm": 4.042872905731201, "learning_rate": 6.432916429789652e-05, "loss": 6.8644, "step": 714 }, { "epoch": 0.9958217270194986, "grad_norm": 5.020940780639648, "learning_rate": 6.431921546333143e-05, "loss": 6.006, "step": 715 }, { "epoch": 0.9972144846796658, "grad_norm": 4.264742374420166, "learning_rate": 6.430926662876634e-05, "loss": 6.5417, "step": 716 }, { "epoch": 0.9986072423398329, "grad_norm": 4.906822681427002, "learning_rate": 6.429931779420125e-05, "loss": 6.5299, "step": 717 }, { "epoch": 1.0, "grad_norm": 4.636160373687744, "learning_rate": 6.428936895963615e-05, "loss": 7.2965, "step": 718 }, { "epoch": 1.001392757660167, "grad_norm": 4.3554911613464355, "learning_rate": 6.427942012507106e-05, "loss": 6.2307, "step": 719 }, { "epoch": 1.0027855153203342, "grad_norm": 4.056817054748535, "learning_rate": 6.426947129050596e-05, "loss": 6.0645, "step": 720 }, { "epoch": 1.0041782729805013, "grad_norm": 4.727801322937012, "learning_rate": 6.425952245594086e-05, "loss": 6.7937, "step": 721 }, { "epoch": 1.0055710306406684, "grad_norm": 4.018832683563232, "learning_rate": 6.424957362137578e-05, "loss": 6.2987, "step": 722 }, { "epoch": 1.0069637883008355, "grad_norm": 4.134768962860107, "learning_rate": 6.423962478681069e-05, "loss": 6.3883, "step": 723 }, { "epoch": 1.0083565459610029, "grad_norm": 4.1882710456848145, "learning_rate": 6.422967595224559e-05, "loss": 6.0631, "step": 724 }, { "epoch": 1.00974930362117, "grad_norm": 4.130897045135498, "learning_rate": 6.42197271176805e-05, "loss": 6.6044, "step": 725 }, { "epoch": 1.011142061281337, "grad_norm": 4.134621620178223, "learning_rate": 6.42097782831154e-05, "loss": 6.4306, "step": 726 }, { "epoch": 1.0125348189415042, "grad_norm": 4.082250595092773, "learning_rate": 6.41998294485503e-05, "loss": 6.7674, "step": 727 }, { "epoch": 1.0139275766016713, "grad_norm": 4.204432487487793, "learning_rate": 6.418988061398521e-05, "loss": 5.7767, "step": 728 }, { "epoch": 1.0153203342618384, "grad_norm": 3.8960046768188477, "learning_rate": 6.417993177942011e-05, "loss": 6.7049, "step": 729 }, { "epoch": 1.0167130919220055, "grad_norm": 4.618951320648193, "learning_rate": 6.416998294485503e-05, "loss": 6.3384, "step": 730 }, { "epoch": 1.0181058495821727, "grad_norm": 4.198172092437744, "learning_rate": 6.416003411028993e-05, "loss": 6.2329, "step": 731 }, { "epoch": 1.0194986072423398, "grad_norm": 4.364832401275635, "learning_rate": 6.415008527572484e-05, "loss": 6.6683, "step": 732 }, { "epoch": 1.0208913649025069, "grad_norm": 4.310018062591553, "learning_rate": 6.414013644115974e-05, "loss": 6.4213, "step": 733 }, { "epoch": 1.0222841225626742, "grad_norm": 4.494109153747559, "learning_rate": 6.413018760659464e-05, "loss": 6.8315, "step": 734 }, { "epoch": 1.0236768802228413, "grad_norm": 4.4797892570495605, "learning_rate": 6.412023877202956e-05, "loss": 6.0234, "step": 735 }, { "epoch": 1.0250696378830084, "grad_norm": 4.1381964683532715, "learning_rate": 6.411028993746447e-05, "loss": 6.4072, "step": 736 }, { "epoch": 1.0264623955431755, "grad_norm": 3.576007604598999, "learning_rate": 6.410034110289937e-05, "loss": 5.8613, "step": 737 }, { "epoch": 1.0278551532033426, "grad_norm": 4.08755350112915, "learning_rate": 6.409039226833427e-05, "loss": 5.5571, "step": 738 }, { "epoch": 1.0292479108635098, "grad_norm": 5.901718616485596, "learning_rate": 6.408044343376918e-05, "loss": 6.4184, "step": 739 }, { "epoch": 1.0306406685236769, "grad_norm": 3.864762306213379, "learning_rate": 6.407049459920408e-05, "loss": 5.9471, "step": 740 }, { "epoch": 1.032033426183844, "grad_norm": 4.546471118927002, "learning_rate": 6.4060545764639e-05, "loss": 6.2533, "step": 741 }, { "epoch": 1.033426183844011, "grad_norm": 3.783642530441284, "learning_rate": 6.40505969300739e-05, "loss": 6.6279, "step": 742 }, { "epoch": 1.0348189415041782, "grad_norm": 4.507882118225098, "learning_rate": 6.404064809550881e-05, "loss": 6.4094, "step": 743 }, { "epoch": 1.0362116991643453, "grad_norm": 4.0628790855407715, "learning_rate": 6.403069926094371e-05, "loss": 6.7273, "step": 744 }, { "epoch": 1.0376044568245126, "grad_norm": 4.369492053985596, "learning_rate": 6.402075042637862e-05, "loss": 6.3731, "step": 745 }, { "epoch": 1.0389972144846797, "grad_norm": 3.927105188369751, "learning_rate": 6.401080159181352e-05, "loss": 6.0893, "step": 746 }, { "epoch": 1.0403899721448469, "grad_norm": 4.238670825958252, "learning_rate": 6.400085275724844e-05, "loss": 6.2224, "step": 747 }, { "epoch": 1.041782729805014, "grad_norm": 4.1663103103637695, "learning_rate": 6.399090392268333e-05, "loss": 6.3216, "step": 748 }, { "epoch": 1.043175487465181, "grad_norm": 5.088364601135254, "learning_rate": 6.398095508811825e-05, "loss": 5.7618, "step": 749 }, { "epoch": 1.0445682451253482, "grad_norm": 4.116178035736084, "learning_rate": 6.397100625355315e-05, "loss": 6.4555, "step": 750 }, { "epoch": 1.0459610027855153, "grad_norm": 3.701273202896118, "learning_rate": 6.396105741898805e-05, "loss": 6.2116, "step": 751 }, { "epoch": 1.0473537604456824, "grad_norm": 4.711391925811768, "learning_rate": 6.395110858442296e-05, "loss": 6.5953, "step": 752 }, { "epoch": 1.0487465181058495, "grad_norm": 4.637911319732666, "learning_rate": 6.394115974985786e-05, "loss": 6.6138, "step": 753 }, { "epoch": 1.0501392757660166, "grad_norm": 4.483766555786133, "learning_rate": 6.393121091529277e-05, "loss": 6.5945, "step": 754 }, { "epoch": 1.051532033426184, "grad_norm": 3.9736287593841553, "learning_rate": 6.392126208072768e-05, "loss": 6.7025, "step": 755 }, { "epoch": 1.052924791086351, "grad_norm": 4.410802841186523, "learning_rate": 6.391131324616259e-05, "loss": 6.4296, "step": 756 }, { "epoch": 1.0543175487465182, "grad_norm": 4.418691158294678, "learning_rate": 6.390136441159749e-05, "loss": 6.5703, "step": 757 }, { "epoch": 1.0557103064066853, "grad_norm": 4.40813684463501, "learning_rate": 6.38914155770324e-05, "loss": 6.2693, "step": 758 }, { "epoch": 1.0571030640668524, "grad_norm": 3.891906499862671, "learning_rate": 6.38814667424673e-05, "loss": 6.3551, "step": 759 }, { "epoch": 1.0584958217270195, "grad_norm": 4.650675296783447, "learning_rate": 6.387151790790222e-05, "loss": 6.7067, "step": 760 }, { "epoch": 1.0598885793871866, "grad_norm": 4.566289901733398, "learning_rate": 6.386156907333712e-05, "loss": 6.4354, "step": 761 }, { "epoch": 1.0612813370473537, "grad_norm": 4.452826023101807, "learning_rate": 6.385162023877203e-05, "loss": 6.0533, "step": 762 }, { "epoch": 1.0626740947075208, "grad_norm": 3.898571491241455, "learning_rate": 6.384167140420693e-05, "loss": 6.054, "step": 763 }, { "epoch": 1.064066852367688, "grad_norm": 3.97408127784729, "learning_rate": 6.383172256964183e-05, "loss": 6.556, "step": 764 }, { "epoch": 1.065459610027855, "grad_norm": 4.9180755615234375, "learning_rate": 6.382177373507674e-05, "loss": 5.8876, "step": 765 }, { "epoch": 1.0668523676880224, "grad_norm": 4.586277484893799, "learning_rate": 6.381182490051166e-05, "loss": 6.3013, "step": 766 }, { "epoch": 1.0682451253481895, "grad_norm": 3.9467272758483887, "learning_rate": 6.380187606594655e-05, "loss": 6.6091, "step": 767 }, { "epoch": 1.0696378830083566, "grad_norm": 3.837667226791382, "learning_rate": 6.379192723138146e-05, "loss": 6.3902, "step": 768 }, { "epoch": 1.0710306406685237, "grad_norm": 4.07935905456543, "learning_rate": 6.378197839681637e-05, "loss": 6.0804, "step": 769 }, { "epoch": 1.0724233983286908, "grad_norm": 4.16187858581543, "learning_rate": 6.377202956225127e-05, "loss": 6.5067, "step": 770 }, { "epoch": 1.073816155988858, "grad_norm": 3.817171335220337, "learning_rate": 6.376208072768618e-05, "loss": 6.6519, "step": 771 }, { "epoch": 1.075208913649025, "grad_norm": 3.7888450622558594, "learning_rate": 6.375213189312108e-05, "loss": 6.3623, "step": 772 }, { "epoch": 1.0766016713091922, "grad_norm": 6.532416343688965, "learning_rate": 6.374218305855599e-05, "loss": 6.1087, "step": 773 }, { "epoch": 1.0779944289693593, "grad_norm": 3.9244799613952637, "learning_rate": 6.37322342239909e-05, "loss": 6.3221, "step": 774 }, { "epoch": 1.0793871866295264, "grad_norm": 3.6228652000427246, "learning_rate": 6.372228538942581e-05, "loss": 6.0867, "step": 775 }, { "epoch": 1.0807799442896937, "grad_norm": 4.465415954589844, "learning_rate": 6.371233655486071e-05, "loss": 6.3228, "step": 776 }, { "epoch": 1.0821727019498608, "grad_norm": 4.021289348602295, "learning_rate": 6.370238772029562e-05, "loss": 6.1475, "step": 777 }, { "epoch": 1.083565459610028, "grad_norm": 4.0907392501831055, "learning_rate": 6.369243888573052e-05, "loss": 5.8969, "step": 778 }, { "epoch": 1.084958217270195, "grad_norm": 3.709568500518799, "learning_rate": 6.368249005116542e-05, "loss": 5.8814, "step": 779 }, { "epoch": 1.0863509749303621, "grad_norm": 4.08675479888916, "learning_rate": 6.367254121660034e-05, "loss": 6.4905, "step": 780 }, { "epoch": 1.0877437325905293, "grad_norm": 4.593238830566406, "learning_rate": 6.366259238203524e-05, "loss": 6.1173, "step": 781 }, { "epoch": 1.0891364902506964, "grad_norm": 4.014584064483643, "learning_rate": 6.365264354747015e-05, "loss": 5.8148, "step": 782 }, { "epoch": 1.0905292479108635, "grad_norm": 4.625162601470947, "learning_rate": 6.364269471290505e-05, "loss": 6.3614, "step": 783 }, { "epoch": 1.0919220055710306, "grad_norm": 4.606537342071533, "learning_rate": 6.363274587833996e-05, "loss": 6.1496, "step": 784 }, { "epoch": 1.0933147632311977, "grad_norm": 4.191118240356445, "learning_rate": 6.362279704377487e-05, "loss": 6.619, "step": 785 }, { "epoch": 1.0947075208913648, "grad_norm": 3.9036128520965576, "learning_rate": 6.361284820920978e-05, "loss": 6.5763, "step": 786 }, { "epoch": 1.0961002785515321, "grad_norm": 4.702339172363281, "learning_rate": 6.360289937464467e-05, "loss": 6.4284, "step": 787 }, { "epoch": 1.0974930362116992, "grad_norm": 4.698242664337158, "learning_rate": 6.359295054007959e-05, "loss": 5.9575, "step": 788 }, { "epoch": 1.0988857938718664, "grad_norm": 4.833227157592773, "learning_rate": 6.358300170551449e-05, "loss": 6.1474, "step": 789 }, { "epoch": 1.1002785515320335, "grad_norm": 4.603786945343018, "learning_rate": 6.35730528709494e-05, "loss": 6.1101, "step": 790 }, { "epoch": 1.1016713091922006, "grad_norm": 4.136576175689697, "learning_rate": 6.356310403638431e-05, "loss": 6.2686, "step": 791 }, { "epoch": 1.1030640668523677, "grad_norm": 4.749037742614746, "learning_rate": 6.35531552018192e-05, "loss": 6.1223, "step": 792 }, { "epoch": 1.1044568245125348, "grad_norm": 3.8633317947387695, "learning_rate": 6.354320636725412e-05, "loss": 6.1911, "step": 793 }, { "epoch": 1.105849582172702, "grad_norm": 3.7554922103881836, "learning_rate": 6.353325753268902e-05, "loss": 6.0206, "step": 794 }, { "epoch": 1.107242339832869, "grad_norm": 4.667807579040527, "learning_rate": 6.352330869812393e-05, "loss": 5.9013, "step": 795 }, { "epoch": 1.1086350974930361, "grad_norm": 3.96821665763855, "learning_rate": 6.351335986355883e-05, "loss": 5.9689, "step": 796 }, { "epoch": 1.1100278551532035, "grad_norm": 3.9820969104766846, "learning_rate": 6.350341102899374e-05, "loss": 5.959, "step": 797 }, { "epoch": 1.1114206128133706, "grad_norm": 4.256950378417969, "learning_rate": 6.349346219442864e-05, "loss": 6.0054, "step": 798 }, { "epoch": 1.1128133704735377, "grad_norm": 5.839420318603516, "learning_rate": 6.348351335986356e-05, "loss": 5.7755, "step": 799 }, { "epoch": 1.1142061281337048, "grad_norm": 3.9156510829925537, "learning_rate": 6.347356452529846e-05, "loss": 6.0378, "step": 800 }, { "epoch": 1.115598885793872, "grad_norm": 3.987285614013672, "learning_rate": 6.346361569073337e-05, "loss": 6.0337, "step": 801 }, { "epoch": 1.116991643454039, "grad_norm": 4.3043293952941895, "learning_rate": 6.345366685616827e-05, "loss": 5.9843, "step": 802 }, { "epoch": 1.1183844011142061, "grad_norm": 4.419588088989258, "learning_rate": 6.344371802160318e-05, "loss": 6.3288, "step": 803 }, { "epoch": 1.1197771587743732, "grad_norm": 4.007911205291748, "learning_rate": 6.343376918703808e-05, "loss": 6.5837, "step": 804 }, { "epoch": 1.1211699164345403, "grad_norm": 4.451642036437988, "learning_rate": 6.3423820352473e-05, "loss": 5.979, "step": 805 }, { "epoch": 1.1225626740947074, "grad_norm": 3.8725087642669678, "learning_rate": 6.34138715179079e-05, "loss": 6.2178, "step": 806 }, { "epoch": 1.1239554317548746, "grad_norm": 4.166767120361328, "learning_rate": 6.34039226833428e-05, "loss": 6.4615, "step": 807 }, { "epoch": 1.1253481894150417, "grad_norm": 5.3962721824646, "learning_rate": 6.339397384877771e-05, "loss": 5.6746, "step": 808 }, { "epoch": 1.126740947075209, "grad_norm": 4.789846420288086, "learning_rate": 6.338402501421261e-05, "loss": 6.1639, "step": 809 }, { "epoch": 1.128133704735376, "grad_norm": 4.104311466217041, "learning_rate": 6.337407617964752e-05, "loss": 6.1073, "step": 810 }, { "epoch": 1.1295264623955432, "grad_norm": 3.9826443195343018, "learning_rate": 6.336412734508242e-05, "loss": 5.5452, "step": 811 }, { "epoch": 1.1309192200557103, "grad_norm": 4.094364643096924, "learning_rate": 6.335417851051733e-05, "loss": 6.1048, "step": 812 }, { "epoch": 1.1323119777158774, "grad_norm": 4.958089828491211, "learning_rate": 6.334422967595224e-05, "loss": 5.8935, "step": 813 }, { "epoch": 1.1337047353760445, "grad_norm": 4.460774898529053, "learning_rate": 6.333428084138715e-05, "loss": 6.2401, "step": 814 }, { "epoch": 1.1350974930362117, "grad_norm": 4.0596513748168945, "learning_rate": 6.332433200682205e-05, "loss": 6.0652, "step": 815 }, { "epoch": 1.1364902506963788, "grad_norm": 3.9710872173309326, "learning_rate": 6.331438317225696e-05, "loss": 5.9651, "step": 816 }, { "epoch": 1.1378830083565459, "grad_norm": 4.852348804473877, "learning_rate": 6.330443433769186e-05, "loss": 6.3512, "step": 817 }, { "epoch": 1.1392757660167132, "grad_norm": 4.2245683670043945, "learning_rate": 6.329448550312678e-05, "loss": 5.7914, "step": 818 }, { "epoch": 1.1406685236768803, "grad_norm": 3.978698492050171, "learning_rate": 6.328453666856168e-05, "loss": 5.9029, "step": 819 }, { "epoch": 1.1420612813370474, "grad_norm": 5.231683731079102, "learning_rate": 6.327458783399659e-05, "loss": 5.9183, "step": 820 }, { "epoch": 1.1434540389972145, "grad_norm": 4.089069366455078, "learning_rate": 6.326463899943149e-05, "loss": 6.038, "step": 821 }, { "epoch": 1.1448467966573816, "grad_norm": 5.573705196380615, "learning_rate": 6.32546901648664e-05, "loss": 6.1843, "step": 822 }, { "epoch": 1.1462395543175488, "grad_norm": 3.8310587406158447, "learning_rate": 6.32447413303013e-05, "loss": 5.6359, "step": 823 }, { "epoch": 1.1476323119777159, "grad_norm": 4.118086814880371, "learning_rate": 6.323479249573621e-05, "loss": 5.8516, "step": 824 }, { "epoch": 1.149025069637883, "grad_norm": 4.978765964508057, "learning_rate": 6.322484366117112e-05, "loss": 6.0637, "step": 825 }, { "epoch": 1.15041782729805, "grad_norm": 3.9830269813537598, "learning_rate": 6.321489482660602e-05, "loss": 6.0359, "step": 826 }, { "epoch": 1.1518105849582172, "grad_norm": 4.337949752807617, "learning_rate": 6.320494599204093e-05, "loss": 5.6737, "step": 827 }, { "epoch": 1.1532033426183843, "grad_norm": 4.050119876861572, "learning_rate": 6.319499715747583e-05, "loss": 5.9892, "step": 828 }, { "epoch": 1.1545961002785514, "grad_norm": 4.22210693359375, "learning_rate": 6.318504832291074e-05, "loss": 6.4008, "step": 829 }, { "epoch": 1.1559888579387188, "grad_norm": 3.744811534881592, "learning_rate": 6.317509948834565e-05, "loss": 5.7396, "step": 830 }, { "epoch": 1.1573816155988859, "grad_norm": 4.1426262855529785, "learning_rate": 6.316515065378054e-05, "loss": 6.0505, "step": 831 }, { "epoch": 1.158774373259053, "grad_norm": 4.062531471252441, "learning_rate": 6.315520181921546e-05, "loss": 6.2989, "step": 832 }, { "epoch": 1.16016713091922, "grad_norm": 4.024596214294434, "learning_rate": 6.314525298465037e-05, "loss": 5.9928, "step": 833 }, { "epoch": 1.1615598885793872, "grad_norm": 4.976029872894287, "learning_rate": 6.313530415008527e-05, "loss": 6.171, "step": 834 }, { "epoch": 1.1629526462395543, "grad_norm": 4.145542144775391, "learning_rate": 6.312535531552017e-05, "loss": 6.0952, "step": 835 }, { "epoch": 1.1643454038997214, "grad_norm": 4.032221794128418, "learning_rate": 6.311540648095508e-05, "loss": 6.1791, "step": 836 }, { "epoch": 1.1657381615598885, "grad_norm": 4.050756454467773, "learning_rate": 6.310545764638998e-05, "loss": 6.4111, "step": 837 }, { "epoch": 1.1671309192200556, "grad_norm": 5.001420021057129, "learning_rate": 6.30955088118249e-05, "loss": 5.6097, "step": 838 }, { "epoch": 1.168523676880223, "grad_norm": 6.472048282623291, "learning_rate": 6.30855599772598e-05, "loss": 6.3765, "step": 839 }, { "epoch": 1.16991643454039, "grad_norm": 4.391728401184082, "learning_rate": 6.307561114269471e-05, "loss": 6.5625, "step": 840 }, { "epoch": 1.1713091922005572, "grad_norm": 4.546194553375244, "learning_rate": 6.306566230812961e-05, "loss": 5.5986, "step": 841 }, { "epoch": 1.1727019498607243, "grad_norm": 5.060868740081787, "learning_rate": 6.305571347356452e-05, "loss": 5.7148, "step": 842 }, { "epoch": 1.1740947075208914, "grad_norm": 4.985201358795166, "learning_rate": 6.304576463899943e-05, "loss": 5.9249, "step": 843 }, { "epoch": 1.1754874651810585, "grad_norm": 4.410891532897949, "learning_rate": 6.303581580443434e-05, "loss": 6.3042, "step": 844 }, { "epoch": 1.1768802228412256, "grad_norm": 3.9722976684570312, "learning_rate": 6.302586696986924e-05, "loss": 6.1316, "step": 845 }, { "epoch": 1.1782729805013927, "grad_norm": 4.459285259246826, "learning_rate": 6.301591813530415e-05, "loss": 6.1813, "step": 846 }, { "epoch": 1.1796657381615598, "grad_norm": 4.7457075119018555, "learning_rate": 6.300596930073905e-05, "loss": 5.5983, "step": 847 }, { "epoch": 1.181058495821727, "grad_norm": 4.224425792694092, "learning_rate": 6.299602046617395e-05, "loss": 5.9078, "step": 848 }, { "epoch": 1.182451253481894, "grad_norm": 4.264475345611572, "learning_rate": 6.298607163160887e-05, "loss": 5.9993, "step": 849 }, { "epoch": 1.1838440111420612, "grad_norm": 4.388229846954346, "learning_rate": 6.297612279704378e-05, "loss": 5.8776, "step": 850 }, { "epoch": 1.1852367688022285, "grad_norm": 4.292633056640625, "learning_rate": 6.296617396247868e-05, "loss": 5.742, "step": 851 }, { "epoch": 1.1866295264623956, "grad_norm": 3.895598888397217, "learning_rate": 6.295622512791358e-05, "loss": 5.6649, "step": 852 }, { "epoch": 1.1880222841225627, "grad_norm": 4.261716842651367, "learning_rate": 6.294627629334849e-05, "loss": 5.9047, "step": 853 }, { "epoch": 1.1894150417827298, "grad_norm": 4.003418445587158, "learning_rate": 6.293632745878339e-05, "loss": 5.6362, "step": 854 }, { "epoch": 1.190807799442897, "grad_norm": 4.029558181762695, "learning_rate": 6.29263786242183e-05, "loss": 5.9837, "step": 855 }, { "epoch": 1.192200557103064, "grad_norm": 4.504885673522949, "learning_rate": 6.29164297896532e-05, "loss": 6.1724, "step": 856 }, { "epoch": 1.1935933147632312, "grad_norm": 4.499607563018799, "learning_rate": 6.290648095508812e-05, "loss": 5.9879, "step": 857 }, { "epoch": 1.1949860724233983, "grad_norm": 4.0793585777282715, "learning_rate": 6.289653212052302e-05, "loss": 5.8677, "step": 858 }, { "epoch": 1.1963788300835654, "grad_norm": 4.758726596832275, "learning_rate": 6.288658328595793e-05, "loss": 5.6511, "step": 859 }, { "epoch": 1.1977715877437327, "grad_norm": 6.490820407867432, "learning_rate": 6.287663445139283e-05, "loss": 6.3513, "step": 860 }, { "epoch": 1.1991643454038998, "grad_norm": 4.483609199523926, "learning_rate": 6.286668561682773e-05, "loss": 5.9408, "step": 861 }, { "epoch": 1.200557103064067, "grad_norm": 3.9243340492248535, "learning_rate": 6.285673678226264e-05, "loss": 5.7365, "step": 862 }, { "epoch": 1.201949860724234, "grad_norm": 5.145392417907715, "learning_rate": 6.284678794769756e-05, "loss": 6.0241, "step": 863 }, { "epoch": 1.2033426183844012, "grad_norm": 5.115814208984375, "learning_rate": 6.283683911313246e-05, "loss": 6.0067, "step": 864 }, { "epoch": 1.2047353760445683, "grad_norm": 4.754467964172363, "learning_rate": 6.282689027856736e-05, "loss": 6.2052, "step": 865 }, { "epoch": 1.2061281337047354, "grad_norm": 4.245777606964111, "learning_rate": 6.281694144400227e-05, "loss": 6.0602, "step": 866 }, { "epoch": 1.2075208913649025, "grad_norm": 4.805239677429199, "learning_rate": 6.280699260943717e-05, "loss": 6.0325, "step": 867 }, { "epoch": 1.2089136490250696, "grad_norm": 4.5950236320495605, "learning_rate": 6.279704377487208e-05, "loss": 5.8024, "step": 868 }, { "epoch": 1.2103064066852367, "grad_norm": 4.148370265960693, "learning_rate": 6.278709494030699e-05, "loss": 5.753, "step": 869 }, { "epoch": 1.2116991643454038, "grad_norm": 4.147583484649658, "learning_rate": 6.277714610574188e-05, "loss": 6.1212, "step": 870 }, { "epoch": 1.213091922005571, "grad_norm": 4.20698356628418, "learning_rate": 6.27671972711768e-05, "loss": 6.0757, "step": 871 }, { "epoch": 1.2144846796657383, "grad_norm": 4.499080181121826, "learning_rate": 6.27572484366117e-05, "loss": 6.1316, "step": 872 }, { "epoch": 1.2158774373259054, "grad_norm": 4.6047868728637695, "learning_rate": 6.274729960204661e-05, "loss": 5.973, "step": 873 }, { "epoch": 1.2172701949860725, "grad_norm": 5.152363300323486, "learning_rate": 6.273735076748153e-05, "loss": 5.8796, "step": 874 }, { "epoch": 1.2186629526462396, "grad_norm": 5.067062854766846, "learning_rate": 6.272740193291642e-05, "loss": 6.1757, "step": 875 }, { "epoch": 1.2200557103064067, "grad_norm": 3.904432535171509, "learning_rate": 6.271745309835134e-05, "loss": 6.1742, "step": 876 }, { "epoch": 1.2214484679665738, "grad_norm": 4.014632701873779, "learning_rate": 6.270750426378624e-05, "loss": 5.6793, "step": 877 }, { "epoch": 1.222841225626741, "grad_norm": 3.9907188415527344, "learning_rate": 6.269755542922114e-05, "loss": 6.129, "step": 878 }, { "epoch": 1.224233983286908, "grad_norm": 4.22652530670166, "learning_rate": 6.268760659465605e-05, "loss": 5.8635, "step": 879 }, { "epoch": 1.2256267409470751, "grad_norm": 4.706326007843018, "learning_rate": 6.267765776009095e-05, "loss": 6.4357, "step": 880 }, { "epoch": 1.2270194986072422, "grad_norm": 4.264747619628906, "learning_rate": 6.266770892552586e-05, "loss": 5.7687, "step": 881 }, { "epoch": 1.2284122562674096, "grad_norm": 4.919692516326904, "learning_rate": 6.265776009096077e-05, "loss": 5.8591, "step": 882 }, { "epoch": 1.2298050139275767, "grad_norm": 4.682892799377441, "learning_rate": 6.264781125639568e-05, "loss": 5.3311, "step": 883 }, { "epoch": 1.2311977715877438, "grad_norm": 4.29823112487793, "learning_rate": 6.263786242183058e-05, "loss": 5.2996, "step": 884 }, { "epoch": 1.232590529247911, "grad_norm": 4.313159465789795, "learning_rate": 6.262791358726549e-05, "loss": 5.9182, "step": 885 }, { "epoch": 1.233983286908078, "grad_norm": 4.632452487945557, "learning_rate": 6.261796475270039e-05, "loss": 5.4301, "step": 886 }, { "epoch": 1.2353760445682451, "grad_norm": 4.984564781188965, "learning_rate": 6.26080159181353e-05, "loss": 6.1695, "step": 887 }, { "epoch": 1.2367688022284122, "grad_norm": 4.197230339050293, "learning_rate": 6.259806708357021e-05, "loss": 5.8395, "step": 888 }, { "epoch": 1.2381615598885793, "grad_norm": 4.51923942565918, "learning_rate": 6.258811824900512e-05, "loss": 5.9988, "step": 889 }, { "epoch": 1.2395543175487465, "grad_norm": 4.738932132720947, "learning_rate": 6.257816941444002e-05, "loss": 5.5031, "step": 890 }, { "epoch": 1.2409470752089136, "grad_norm": 4.4604082107543945, "learning_rate": 6.256822057987492e-05, "loss": 5.5689, "step": 891 }, { "epoch": 1.2423398328690807, "grad_norm": 4.486937522888184, "learning_rate": 6.255827174530983e-05, "loss": 5.9191, "step": 892 }, { "epoch": 1.243732590529248, "grad_norm": 5.033269882202148, "learning_rate": 6.254832291074473e-05, "loss": 6.1984, "step": 893 }, { "epoch": 1.2451253481894151, "grad_norm": 3.9928624629974365, "learning_rate": 6.253837407617965e-05, "loss": 5.8658, "step": 894 }, { "epoch": 1.2465181058495822, "grad_norm": 3.888197183609009, "learning_rate": 6.252842524161454e-05, "loss": 5.5798, "step": 895 }, { "epoch": 1.2479108635097493, "grad_norm": 4.839468002319336, "learning_rate": 6.251847640704946e-05, "loss": 5.523, "step": 896 }, { "epoch": 1.2493036211699164, "grad_norm": 4.236422538757324, "learning_rate": 6.250852757248436e-05, "loss": 5.6453, "step": 897 }, { "epoch": 1.2506963788300836, "grad_norm": 4.6251397132873535, "learning_rate": 6.249857873791927e-05, "loss": 5.9148, "step": 898 }, { "epoch": 1.2520891364902507, "grad_norm": 5.287298202514648, "learning_rate": 6.248862990335417e-05, "loss": 5.96, "step": 899 }, { "epoch": 1.2534818941504178, "grad_norm": 4.414423942565918, "learning_rate": 6.247868106878907e-05, "loss": 5.7607, "step": 900 }, { "epoch": 1.2548746518105849, "grad_norm": 4.138334274291992, "learning_rate": 6.246873223422399e-05, "loss": 5.8021, "step": 901 }, { "epoch": 1.2562674094707522, "grad_norm": 4.3649749755859375, "learning_rate": 6.24587833996589e-05, "loss": 6.0395, "step": 902 }, { "epoch": 1.2576601671309193, "grad_norm": 3.8956754207611084, "learning_rate": 6.24488345650938e-05, "loss": 5.4605, "step": 903 }, { "epoch": 1.2590529247910864, "grad_norm": 4.663502216339111, "learning_rate": 6.24388857305287e-05, "loss": 6.1287, "step": 904 }, { "epoch": 1.2604456824512535, "grad_norm": 3.9989981651306152, "learning_rate": 6.242893689596361e-05, "loss": 5.9307, "step": 905 }, { "epoch": 1.2618384401114207, "grad_norm": 4.225207328796387, "learning_rate": 6.241898806139851e-05, "loss": 6.0563, "step": 906 }, { "epoch": 1.2632311977715878, "grad_norm": 4.495071887969971, "learning_rate": 6.240903922683343e-05, "loss": 5.7611, "step": 907 }, { "epoch": 1.2646239554317549, "grad_norm": 4.2864508628845215, "learning_rate": 6.239909039226833e-05, "loss": 5.8417, "step": 908 }, { "epoch": 1.266016713091922, "grad_norm": 4.327315330505371, "learning_rate": 6.238914155770324e-05, "loss": 5.904, "step": 909 }, { "epoch": 1.267409470752089, "grad_norm": 4.177762031555176, "learning_rate": 6.237919272313814e-05, "loss": 5.8393, "step": 910 }, { "epoch": 1.2688022284122562, "grad_norm": 4.884469509124756, "learning_rate": 6.236924388857305e-05, "loss": 6.0954, "step": 911 }, { "epoch": 1.2701949860724233, "grad_norm": 4.177820205688477, "learning_rate": 6.235929505400795e-05, "loss": 5.5047, "step": 912 }, { "epoch": 1.2715877437325904, "grad_norm": 4.735227584838867, "learning_rate": 6.234934621944287e-05, "loss": 5.7605, "step": 913 }, { "epoch": 1.2729805013927575, "grad_norm": 4.475788593292236, "learning_rate": 6.233939738487776e-05, "loss": 5.4821, "step": 914 }, { "epoch": 1.2743732590529249, "grad_norm": 4.623432159423828, "learning_rate": 6.232944855031268e-05, "loss": 6.0002, "step": 915 }, { "epoch": 1.275766016713092, "grad_norm": 4.811022758483887, "learning_rate": 6.231949971574758e-05, "loss": 5.6893, "step": 916 }, { "epoch": 1.277158774373259, "grad_norm": 4.068156719207764, "learning_rate": 6.230955088118248e-05, "loss": 6.2387, "step": 917 }, { "epoch": 1.2785515320334262, "grad_norm": 3.8421871662139893, "learning_rate": 6.229960204661739e-05, "loss": 5.6591, "step": 918 }, { "epoch": 1.2799442896935933, "grad_norm": 3.815600633621216, "learning_rate": 6.228965321205229e-05, "loss": 5.6912, "step": 919 }, { "epoch": 1.2813370473537604, "grad_norm": 4.52863883972168, "learning_rate": 6.22797043774872e-05, "loss": 6.0405, "step": 920 }, { "epoch": 1.2827298050139275, "grad_norm": 4.008754730224609, "learning_rate": 6.226975554292211e-05, "loss": 6.003, "step": 921 }, { "epoch": 1.2841225626740946, "grad_norm": 4.417988300323486, "learning_rate": 6.225980670835702e-05, "loss": 5.611, "step": 922 }, { "epoch": 1.285515320334262, "grad_norm": 3.9807190895080566, "learning_rate": 6.224985787379192e-05, "loss": 5.4935, "step": 923 }, { "epoch": 1.286908077994429, "grad_norm": 4.328763484954834, "learning_rate": 6.223990903922683e-05, "loss": 6.0448, "step": 924 }, { "epoch": 1.2883008356545962, "grad_norm": 5.232528209686279, "learning_rate": 6.222996020466173e-05, "loss": 5.8381, "step": 925 }, { "epoch": 1.2896935933147633, "grad_norm": 4.714798450469971, "learning_rate": 6.222001137009665e-05, "loss": 6.0027, "step": 926 }, { "epoch": 1.2910863509749304, "grad_norm": 4.252414226531982, "learning_rate": 6.221006253553155e-05, "loss": 6.1091, "step": 927 }, { "epoch": 1.2924791086350975, "grad_norm": 4.840965747833252, "learning_rate": 6.220011370096646e-05, "loss": 5.7015, "step": 928 }, { "epoch": 1.2938718662952646, "grad_norm": 4.268723487854004, "learning_rate": 6.219016486640136e-05, "loss": 5.6105, "step": 929 }, { "epoch": 1.2952646239554317, "grad_norm": 4.211379528045654, "learning_rate": 6.218021603183626e-05, "loss": 5.5058, "step": 930 }, { "epoch": 1.2966573816155988, "grad_norm": 4.496066093444824, "learning_rate": 6.217026719727117e-05, "loss": 5.664, "step": 931 }, { "epoch": 1.298050139275766, "grad_norm": 4.24421501159668, "learning_rate": 6.216031836270609e-05, "loss": 5.8674, "step": 932 }, { "epoch": 1.299442896935933, "grad_norm": 4.360576152801514, "learning_rate": 6.215036952814099e-05, "loss": 5.7603, "step": 933 }, { "epoch": 1.3008356545961002, "grad_norm": 4.054847717285156, "learning_rate": 6.21404206935759e-05, "loss": 5.8031, "step": 934 }, { "epoch": 1.3022284122562673, "grad_norm": 5.198960304260254, "learning_rate": 6.21304718590108e-05, "loss": 5.6797, "step": 935 }, { "epoch": 1.3036211699164346, "grad_norm": 4.035637378692627, "learning_rate": 6.21205230244457e-05, "loss": 5.6923, "step": 936 }, { "epoch": 1.3050139275766017, "grad_norm": 4.253226280212402, "learning_rate": 6.21105741898806e-05, "loss": 5.7791, "step": 937 }, { "epoch": 1.3064066852367688, "grad_norm": 4.880700588226318, "learning_rate": 6.210062535531551e-05, "loss": 5.876, "step": 938 }, { "epoch": 1.307799442896936, "grad_norm": 5.53300142288208, "learning_rate": 6.209067652075041e-05, "loss": 6.1308, "step": 939 }, { "epoch": 1.309192200557103, "grad_norm": 4.972252368927002, "learning_rate": 6.208072768618533e-05, "loss": 5.8277, "step": 940 }, { "epoch": 1.3105849582172702, "grad_norm": 5.015955448150635, "learning_rate": 6.207077885162024e-05, "loss": 5.9938, "step": 941 }, { "epoch": 1.3119777158774373, "grad_norm": 5.864644527435303, "learning_rate": 6.206083001705514e-05, "loss": 5.6609, "step": 942 }, { "epoch": 1.3133704735376044, "grad_norm": 7.571382522583008, "learning_rate": 6.205088118249004e-05, "loss": 5.5346, "step": 943 }, { "epoch": 1.3147632311977717, "grad_norm": 4.031362533569336, "learning_rate": 6.204093234792495e-05, "loss": 5.9596, "step": 944 }, { "epoch": 1.3161559888579388, "grad_norm": 4.2481608390808105, "learning_rate": 6.203098351335985e-05, "loss": 5.9164, "step": 945 }, { "epoch": 1.317548746518106, "grad_norm": 5.277892589569092, "learning_rate": 6.202103467879477e-05, "loss": 5.7101, "step": 946 }, { "epoch": 1.318941504178273, "grad_norm": 4.687514781951904, "learning_rate": 6.201108584422967e-05, "loss": 5.9427, "step": 947 }, { "epoch": 1.3203342618384402, "grad_norm": 3.968960762023926, "learning_rate": 6.200113700966458e-05, "loss": 5.7682, "step": 948 }, { "epoch": 1.3217270194986073, "grad_norm": 4.346205711364746, "learning_rate": 6.199118817509948e-05, "loss": 5.6468, "step": 949 }, { "epoch": 1.3231197771587744, "grad_norm": 4.177730560302734, "learning_rate": 6.198123934053439e-05, "loss": 5.9089, "step": 950 }, { "epoch": 1.3245125348189415, "grad_norm": 4.508565902709961, "learning_rate": 6.197129050596929e-05, "loss": 5.9606, "step": 951 }, { "epoch": 1.3259052924791086, "grad_norm": 5.372434139251709, "learning_rate": 6.196134167140421e-05, "loss": 5.928, "step": 952 }, { "epoch": 1.3272980501392757, "grad_norm": 3.947396755218506, "learning_rate": 6.19513928368391e-05, "loss": 5.9353, "step": 953 }, { "epoch": 1.3286908077994428, "grad_norm": 5.454845428466797, "learning_rate": 6.194144400227402e-05, "loss": 6.1017, "step": 954 }, { "epoch": 1.33008356545961, "grad_norm": 4.1838908195495605, "learning_rate": 6.193149516770892e-05, "loss": 5.1224, "step": 955 }, { "epoch": 1.331476323119777, "grad_norm": 4.65695333480835, "learning_rate": 6.192154633314382e-05, "loss": 6.1432, "step": 956 }, { "epoch": 1.3328690807799444, "grad_norm": 4.286771774291992, "learning_rate": 6.191159749857874e-05, "loss": 5.5884, "step": 957 }, { "epoch": 1.3342618384401115, "grad_norm": 4.350987911224365, "learning_rate": 6.190164866401363e-05, "loss": 6.1831, "step": 958 }, { "epoch": 1.3356545961002786, "grad_norm": 4.22835636138916, "learning_rate": 6.189169982944855e-05, "loss": 5.999, "step": 959 }, { "epoch": 1.3370473537604457, "grad_norm": 5.089576244354248, "learning_rate": 6.188175099488345e-05, "loss": 5.7303, "step": 960 }, { "epoch": 1.3384401114206128, "grad_norm": 4.221466541290283, "learning_rate": 6.187180216031836e-05, "loss": 5.778, "step": 961 }, { "epoch": 1.33983286908078, "grad_norm": 5.235956192016602, "learning_rate": 6.186185332575326e-05, "loss": 5.3759, "step": 962 }, { "epoch": 1.341225626740947, "grad_norm": 4.110906600952148, "learning_rate": 6.185190449118817e-05, "loss": 6.1872, "step": 963 }, { "epoch": 1.3426183844011141, "grad_norm": 6.1621994972229, "learning_rate": 6.184195565662307e-05, "loss": 5.6299, "step": 964 }, { "epoch": 1.3440111420612815, "grad_norm": 4.876462459564209, "learning_rate": 6.183200682205799e-05, "loss": 5.7035, "step": 965 }, { "epoch": 1.3454038997214486, "grad_norm": 4.035791873931885, "learning_rate": 6.182205798749289e-05, "loss": 5.9675, "step": 966 }, { "epoch": 1.3467966573816157, "grad_norm": 5.427221298217773, "learning_rate": 6.18121091529278e-05, "loss": 5.9598, "step": 967 }, { "epoch": 1.3481894150417828, "grad_norm": 4.459470748901367, "learning_rate": 6.18021603183627e-05, "loss": 6.2836, "step": 968 }, { "epoch": 1.34958217270195, "grad_norm": 4.781061172485352, "learning_rate": 6.17922114837976e-05, "loss": 5.4981, "step": 969 }, { "epoch": 1.350974930362117, "grad_norm": 4.126856327056885, "learning_rate": 6.178226264923251e-05, "loss": 5.426, "step": 970 }, { "epoch": 1.3523676880222841, "grad_norm": 5.340152740478516, "learning_rate": 6.177231381466743e-05, "loss": 5.7245, "step": 971 }, { "epoch": 1.3537604456824512, "grad_norm": 3.787040948867798, "learning_rate": 6.176236498010233e-05, "loss": 5.8324, "step": 972 }, { "epoch": 1.3551532033426184, "grad_norm": 4.234292984008789, "learning_rate": 6.175241614553723e-05, "loss": 5.6917, "step": 973 }, { "epoch": 1.3565459610027855, "grad_norm": 4.556074142456055, "learning_rate": 6.174246731097214e-05, "loss": 5.7874, "step": 974 }, { "epoch": 1.3579387186629526, "grad_norm": 5.05952262878418, "learning_rate": 6.173251847640704e-05, "loss": 5.3797, "step": 975 }, { "epoch": 1.3593314763231197, "grad_norm": 4.6962504386901855, "learning_rate": 6.172256964184195e-05, "loss": 6.0405, "step": 976 }, { "epoch": 1.3607242339832868, "grad_norm": 4.279208183288574, "learning_rate": 6.171262080727686e-05, "loss": 5.4048, "step": 977 }, { "epoch": 1.362116991643454, "grad_norm": 3.862438440322876, "learning_rate": 6.170267197271175e-05, "loss": 5.5516, "step": 978 }, { "epoch": 1.3635097493036212, "grad_norm": 4.292626857757568, "learning_rate": 6.169272313814667e-05, "loss": 5.7087, "step": 979 }, { "epoch": 1.3649025069637883, "grad_norm": 4.5584588050842285, "learning_rate": 6.168277430358158e-05, "loss": 5.8243, "step": 980 }, { "epoch": 1.3662952646239555, "grad_norm": 5.150217533111572, "learning_rate": 6.167282546901648e-05, "loss": 5.679, "step": 981 }, { "epoch": 1.3676880222841226, "grad_norm": 4.363905906677246, "learning_rate": 6.166287663445138e-05, "loss": 5.2874, "step": 982 }, { "epoch": 1.3690807799442897, "grad_norm": 4.296676158905029, "learning_rate": 6.165292779988629e-05, "loss": 5.6455, "step": 983 }, { "epoch": 1.3704735376044568, "grad_norm": 3.8754825592041016, "learning_rate": 6.16429789653212e-05, "loss": 5.6513, "step": 984 }, { "epoch": 1.371866295264624, "grad_norm": 3.8887650966644287, "learning_rate": 6.163303013075611e-05, "loss": 5.0899, "step": 985 }, { "epoch": 1.3732590529247912, "grad_norm": 3.969674825668335, "learning_rate": 6.162308129619101e-05, "loss": 5.2191, "step": 986 }, { "epoch": 1.3746518105849583, "grad_norm": 4.9046783447265625, "learning_rate": 6.161313246162592e-05, "loss": 5.2244, "step": 987 }, { "epoch": 1.3760445682451254, "grad_norm": 4.269351005554199, "learning_rate": 6.160318362706082e-05, "loss": 5.659, "step": 988 }, { "epoch": 1.3774373259052926, "grad_norm": 4.419259071350098, "learning_rate": 6.159323479249573e-05, "loss": 5.7227, "step": 989 }, { "epoch": 1.3788300835654597, "grad_norm": 3.671412944793701, "learning_rate": 6.158328595793064e-05, "loss": 5.4442, "step": 990 }, { "epoch": 1.3802228412256268, "grad_norm": 11.605569839477539, "learning_rate": 6.157333712336555e-05, "loss": 5.3918, "step": 991 }, { "epoch": 1.3816155988857939, "grad_norm": 4.40183687210083, "learning_rate": 6.156338828880045e-05, "loss": 5.4837, "step": 992 }, { "epoch": 1.383008356545961, "grad_norm": 6.407291889190674, "learning_rate": 6.155343945423536e-05, "loss": 5.7197, "step": 993 }, { "epoch": 1.384401114206128, "grad_norm": 4.451000213623047, "learning_rate": 6.154349061967026e-05, "loss": 6.1491, "step": 994 }, { "epoch": 1.3857938718662952, "grad_norm": 4.515896797180176, "learning_rate": 6.153354178510516e-05, "loss": 5.5752, "step": 995 }, { "epoch": 1.3871866295264623, "grad_norm": 5.924260139465332, "learning_rate": 6.152359295054008e-05, "loss": 5.4646, "step": 996 }, { "epoch": 1.3885793871866294, "grad_norm": 4.985195159912109, "learning_rate": 6.151364411597497e-05, "loss": 5.9752, "step": 997 }, { "epoch": 1.3899721448467965, "grad_norm": 4.6726298332214355, "learning_rate": 6.150369528140989e-05, "loss": 5.7832, "step": 998 }, { "epoch": 1.3913649025069637, "grad_norm": 4.758754730224609, "learning_rate": 6.14937464468448e-05, "loss": 6.0052, "step": 999 }, { "epoch": 1.392757660167131, "grad_norm": 4.372791767120361, "learning_rate": 6.14837976122797e-05, "loss": 5.267, "step": 1000 }, { "epoch": 1.394150417827298, "grad_norm": 4.5017828941345215, "learning_rate": 6.14738487777146e-05, "loss": 5.7571, "step": 1001 }, { "epoch": 1.3955431754874652, "grad_norm": 4.74721622467041, "learning_rate": 6.14638999431495e-05, "loss": 5.2723, "step": 1002 }, { "epoch": 1.3969359331476323, "grad_norm": 4.530693054199219, "learning_rate": 6.145395110858441e-05, "loss": 5.2973, "step": 1003 }, { "epoch": 1.3983286908077994, "grad_norm": 3.9919230937957764, "learning_rate": 6.144400227401933e-05, "loss": 5.5479, "step": 1004 }, { "epoch": 1.3997214484679665, "grad_norm": 4.236709117889404, "learning_rate": 6.143405343945423e-05, "loss": 5.4198, "step": 1005 }, { "epoch": 1.4011142061281336, "grad_norm": 4.950026988983154, "learning_rate": 6.142410460488914e-05, "loss": 5.3283, "step": 1006 }, { "epoch": 1.4025069637883008, "grad_norm": 4.193325996398926, "learning_rate": 6.141415577032404e-05, "loss": 5.4861, "step": 1007 }, { "epoch": 1.403899721448468, "grad_norm": 3.9612650871276855, "learning_rate": 6.140420693575894e-05, "loss": 5.3246, "step": 1008 }, { "epoch": 1.4052924791086352, "grad_norm": 6.121573448181152, "learning_rate": 6.139425810119385e-05, "loss": 5.6648, "step": 1009 }, { "epoch": 1.4066852367688023, "grad_norm": 4.664121150970459, "learning_rate": 6.138430926662877e-05, "loss": 5.4626, "step": 1010 }, { "epoch": 1.4080779944289694, "grad_norm": 5.541447639465332, "learning_rate": 6.137436043206367e-05, "loss": 6.0914, "step": 1011 }, { "epoch": 1.4094707520891365, "grad_norm": 5.062973976135254, "learning_rate": 6.136441159749857e-05, "loss": 5.7945, "step": 1012 }, { "epoch": 1.4108635097493036, "grad_norm": 4.804659843444824, "learning_rate": 6.135446276293348e-05, "loss": 5.6537, "step": 1013 }, { "epoch": 1.4122562674094707, "grad_norm": 5.041073322296143, "learning_rate": 6.134451392836838e-05, "loss": 5.7487, "step": 1014 }, { "epoch": 1.4136490250696379, "grad_norm": 4.3413310050964355, "learning_rate": 6.13345650938033e-05, "loss": 5.4096, "step": 1015 }, { "epoch": 1.415041782729805, "grad_norm": 5.038837432861328, "learning_rate": 6.13246162592382e-05, "loss": 5.3533, "step": 1016 }, { "epoch": 1.416434540389972, "grad_norm": 4.292027950286865, "learning_rate": 6.131466742467311e-05, "loss": 5.0028, "step": 1017 }, { "epoch": 1.4178272980501392, "grad_norm": 5.166059494018555, "learning_rate": 6.130471859010801e-05, "loss": 5.7436, "step": 1018 }, { "epoch": 1.4192200557103063, "grad_norm": 4.512571811676025, "learning_rate": 6.129476975554292e-05, "loss": 5.8252, "step": 1019 }, { "epoch": 1.4206128133704734, "grad_norm": 5.226598739624023, "learning_rate": 6.128482092097782e-05, "loss": 5.2616, "step": 1020 }, { "epoch": 1.4220055710306407, "grad_norm": 4.709335803985596, "learning_rate": 6.127487208641274e-05, "loss": 5.5318, "step": 1021 }, { "epoch": 1.4233983286908078, "grad_norm": 4.278756618499756, "learning_rate": 6.126492325184763e-05, "loss": 5.1678, "step": 1022 }, { "epoch": 1.424791086350975, "grad_norm": 3.8250346183776855, "learning_rate": 6.125497441728255e-05, "loss": 5.5314, "step": 1023 }, { "epoch": 1.426183844011142, "grad_norm": 4.649449825286865, "learning_rate": 6.124502558271745e-05, "loss": 4.8598, "step": 1024 }, { "epoch": 1.4275766016713092, "grad_norm": 4.135913372039795, "learning_rate": 6.123507674815235e-05, "loss": 5.3614, "step": 1025 }, { "epoch": 1.4289693593314763, "grad_norm": 5.673917293548584, "learning_rate": 6.122512791358726e-05, "loss": 5.6734, "step": 1026 }, { "epoch": 1.4303621169916434, "grad_norm": 4.483156681060791, "learning_rate": 6.121517907902216e-05, "loss": 5.7856, "step": 1027 }, { "epoch": 1.4317548746518105, "grad_norm": 7.711784362792969, "learning_rate": 6.120523024445707e-05, "loss": 5.2445, "step": 1028 }, { "epoch": 1.4331476323119778, "grad_norm": 4.991926670074463, "learning_rate": 6.119528140989198e-05, "loss": 5.5703, "step": 1029 }, { "epoch": 1.434540389972145, "grad_norm": 4.263853073120117, "learning_rate": 6.118533257532689e-05, "loss": 5.4425, "step": 1030 }, { "epoch": 1.435933147632312, "grad_norm": 4.021599292755127, "learning_rate": 6.117538374076179e-05, "loss": 5.3901, "step": 1031 }, { "epoch": 1.4373259052924792, "grad_norm": 4.648274898529053, "learning_rate": 6.11654349061967e-05, "loss": 5.3981, "step": 1032 }, { "epoch": 1.4387186629526463, "grad_norm": 4.237360000610352, "learning_rate": 6.11554860716316e-05, "loss": 6.0682, "step": 1033 }, { "epoch": 1.4401114206128134, "grad_norm": 4.454067707061768, "learning_rate": 6.11455372370665e-05, "loss": 5.6093, "step": 1034 }, { "epoch": 1.4415041782729805, "grad_norm": 6.402667999267578, "learning_rate": 6.113558840250142e-05, "loss": 5.6968, "step": 1035 }, { "epoch": 1.4428969359331476, "grad_norm": 4.66150426864624, "learning_rate": 6.112563956793631e-05, "loss": 6.0225, "step": 1036 }, { "epoch": 1.4442896935933147, "grad_norm": 4.917180061340332, "learning_rate": 6.111569073337123e-05, "loss": 5.4053, "step": 1037 }, { "epoch": 1.4456824512534818, "grad_norm": 4.644591808319092, "learning_rate": 6.110574189880613e-05, "loss": 5.4277, "step": 1038 }, { "epoch": 1.447075208913649, "grad_norm": 6.097512722015381, "learning_rate": 6.109579306424104e-05, "loss": 5.1661, "step": 1039 }, { "epoch": 1.448467966573816, "grad_norm": 4.1244025230407715, "learning_rate": 6.108584422967596e-05, "loss": 5.6467, "step": 1040 }, { "epoch": 1.4498607242339832, "grad_norm": 4.131136417388916, "learning_rate": 6.107589539511085e-05, "loss": 5.8465, "step": 1041 }, { "epoch": 1.4512534818941505, "grad_norm": 5.074693202972412, "learning_rate": 6.106594656054576e-05, "loss": 5.3554, "step": 1042 }, { "epoch": 1.4526462395543176, "grad_norm": 6.52849006652832, "learning_rate": 6.105599772598067e-05, "loss": 5.3501, "step": 1043 }, { "epoch": 1.4540389972144847, "grad_norm": 5.787081241607666, "learning_rate": 6.104604889141557e-05, "loss": 5.4217, "step": 1044 }, { "epoch": 1.4554317548746518, "grad_norm": 4.454585552215576, "learning_rate": 6.103610005685048e-05, "loss": 5.2065, "step": 1045 }, { "epoch": 1.456824512534819, "grad_norm": 5.2344279289245605, "learning_rate": 6.102615122228538e-05, "loss": 5.732, "step": 1046 }, { "epoch": 1.458217270194986, "grad_norm": 4.843843460083008, "learning_rate": 6.101620238772029e-05, "loss": 5.6284, "step": 1047 }, { "epoch": 1.4596100278551531, "grad_norm": 4.481762409210205, "learning_rate": 6.1006253553155195e-05, "loss": 5.4837, "step": 1048 }, { "epoch": 1.4610027855153203, "grad_norm": 4.521838188171387, "learning_rate": 6.0996304718590106e-05, "loss": 5.5173, "step": 1049 }, { "epoch": 1.4623955431754876, "grad_norm": 4.370110511779785, "learning_rate": 6.098635588402501e-05, "loss": 5.377, "step": 1050 }, { "epoch": 1.4637883008356547, "grad_norm": 4.479923725128174, "learning_rate": 6.0976407049459914e-05, "loss": 5.6361, "step": 1051 }, { "epoch": 1.4651810584958218, "grad_norm": 4.193077087402344, "learning_rate": 6.096645821489482e-05, "loss": 5.5678, "step": 1052 }, { "epoch": 1.466573816155989, "grad_norm": 4.2606201171875, "learning_rate": 6.095650938032973e-05, "loss": 5.7236, "step": 1053 }, { "epoch": 1.467966573816156, "grad_norm": 4.565093994140625, "learning_rate": 6.094656054576463e-05, "loss": 5.4948, "step": 1054 }, { "epoch": 1.4693593314763231, "grad_norm": 4.295390605926514, "learning_rate": 6.0936611711199544e-05, "loss": 5.2635, "step": 1055 }, { "epoch": 1.4707520891364902, "grad_norm": 4.787730693817139, "learning_rate": 6.092666287663444e-05, "loss": 5.4875, "step": 1056 }, { "epoch": 1.4721448467966574, "grad_norm": 5.01603364944458, "learning_rate": 6.091671404206935e-05, "loss": 5.441, "step": 1057 }, { "epoch": 1.4735376044568245, "grad_norm": 7.9381513595581055, "learning_rate": 6.0906765207504256e-05, "loss": 5.5475, "step": 1058 }, { "epoch": 1.4749303621169916, "grad_norm": 4.661684036254883, "learning_rate": 6.089681637293917e-05, "loss": 5.6567, "step": 1059 }, { "epoch": 1.4763231197771587, "grad_norm": 4.058624744415283, "learning_rate": 6.088686753837407e-05, "loss": 5.0811, "step": 1060 }, { "epoch": 1.4777158774373258, "grad_norm": 4.226449012756348, "learning_rate": 6.0876918703808975e-05, "loss": 5.6178, "step": 1061 }, { "epoch": 1.479108635097493, "grad_norm": 4.778773784637451, "learning_rate": 6.086696986924388e-05, "loss": 5.9535, "step": 1062 }, { "epoch": 1.4805013927576602, "grad_norm": 5.2507195472717285, "learning_rate": 6.085702103467879e-05, "loss": 5.4184, "step": 1063 }, { "epoch": 1.4818941504178273, "grad_norm": 4.835323333740234, "learning_rate": 6.08470722001137e-05, "loss": 5.4343, "step": 1064 }, { "epoch": 1.4832869080779945, "grad_norm": 4.113168716430664, "learning_rate": 6.0837123365548605e-05, "loss": 5.5131, "step": 1065 }, { "epoch": 1.4846796657381616, "grad_norm": 5.054234504699707, "learning_rate": 6.08271745309835e-05, "loss": 5.1919, "step": 1066 }, { "epoch": 1.4860724233983287, "grad_norm": 4.585954666137695, "learning_rate": 6.081722569641841e-05, "loss": 5.4392, "step": 1067 }, { "epoch": 1.4874651810584958, "grad_norm": 4.478633880615234, "learning_rate": 6.0807276861853324e-05, "loss": 5.1403, "step": 1068 }, { "epoch": 1.488857938718663, "grad_norm": 4.157210350036621, "learning_rate": 6.079732802728823e-05, "loss": 5.3341, "step": 1069 }, { "epoch": 1.49025069637883, "grad_norm": 4.241940021514893, "learning_rate": 6.078737919272313e-05, "loss": 5.5515, "step": 1070 }, { "epoch": 1.4916434540389973, "grad_norm": 4.752518177032471, "learning_rate": 6.0777430358158036e-05, "loss": 5.4118, "step": 1071 }, { "epoch": 1.4930362116991645, "grad_norm": 4.162990093231201, "learning_rate": 6.076748152359295e-05, "loss": 5.3923, "step": 1072 }, { "epoch": 1.4944289693593316, "grad_norm": 6.27323579788208, "learning_rate": 6.075753268902785e-05, "loss": 5.3445, "step": 1073 }, { "epoch": 1.4958217270194987, "grad_norm": 3.8739569187164307, "learning_rate": 6.074758385446276e-05, "loss": 5.7046, "step": 1074 }, { "epoch": 1.4972144846796658, "grad_norm": 4.954774856567383, "learning_rate": 6.073763501989766e-05, "loss": 5.2156, "step": 1075 }, { "epoch": 1.498607242339833, "grad_norm": 4.059577941894531, "learning_rate": 6.072768618533257e-05, "loss": 5.1942, "step": 1076 }, { "epoch": 1.5, "grad_norm": 4.890732288360596, "learning_rate": 6.0717737350767474e-05, "loss": 5.3576, "step": 1077 }, { "epoch": 1.501392757660167, "grad_norm": 4.2035298347473145, "learning_rate": 6.0707788516202385e-05, "loss": 5.3458, "step": 1078 }, { "epoch": 1.5027855153203342, "grad_norm": 4.541333198547363, "learning_rate": 6.069783968163729e-05, "loss": 5.855, "step": 1079 }, { "epoch": 1.5041782729805013, "grad_norm": 4.331189155578613, "learning_rate": 6.0687890847072193e-05, "loss": 5.4807, "step": 1080 }, { "epoch": 1.5055710306406684, "grad_norm": 4.317488193511963, "learning_rate": 6.06779420125071e-05, "loss": 5.7551, "step": 1081 }, { "epoch": 1.5069637883008355, "grad_norm": 4.339413166046143, "learning_rate": 6.066799317794201e-05, "loss": 5.0802, "step": 1082 }, { "epoch": 1.5083565459610027, "grad_norm": 4.928534984588623, "learning_rate": 6.065804434337691e-05, "loss": 5.745, "step": 1083 }, { "epoch": 1.5097493036211698, "grad_norm": 3.766230583190918, "learning_rate": 6.064809550881182e-05, "loss": 5.3793, "step": 1084 }, { "epoch": 1.511142061281337, "grad_norm": 4.159375190734863, "learning_rate": 6.063814667424672e-05, "loss": 5.3454, "step": 1085 }, { "epoch": 1.5125348189415042, "grad_norm": 4.078312873840332, "learning_rate": 6.062819783968163e-05, "loss": 4.8024, "step": 1086 }, { "epoch": 1.5139275766016713, "grad_norm": 4.715867042541504, "learning_rate": 6.0618249005116536e-05, "loss": 5.3176, "step": 1087 }, { "epoch": 1.5153203342618384, "grad_norm": 6.341579914093018, "learning_rate": 6.0608300170551446e-05, "loss": 5.4676, "step": 1088 }, { "epoch": 1.5167130919220055, "grad_norm": 4.221826553344727, "learning_rate": 6.059835133598635e-05, "loss": 5.2045, "step": 1089 }, { "epoch": 1.5181058495821727, "grad_norm": 4.108441352844238, "learning_rate": 6.0588402501421255e-05, "loss": 4.8501, "step": 1090 }, { "epoch": 1.51949860724234, "grad_norm": 4.748363018035889, "learning_rate": 6.057845366685616e-05, "loss": 5.6348, "step": 1091 }, { "epoch": 1.520891364902507, "grad_norm": 5.1843132972717285, "learning_rate": 6.056850483229107e-05, "loss": 5.2767, "step": 1092 }, { "epoch": 1.5222841225626742, "grad_norm": 7.4928436279296875, "learning_rate": 6.055855599772598e-05, "loss": 5.3334, "step": 1093 }, { "epoch": 1.5236768802228413, "grad_norm": 4.244089126586914, "learning_rate": 6.0548607163160884e-05, "loss": 5.6978, "step": 1094 }, { "epoch": 1.5250696378830084, "grad_norm": 5.482670783996582, "learning_rate": 6.053865832859578e-05, "loss": 5.0617, "step": 1095 }, { "epoch": 1.5264623955431755, "grad_norm": 4.926762104034424, "learning_rate": 6.052870949403069e-05, "loss": 5.0647, "step": 1096 }, { "epoch": 1.5278551532033426, "grad_norm": 4.932678699493408, "learning_rate": 6.05187606594656e-05, "loss": 5.175, "step": 1097 }, { "epoch": 1.5292479108635098, "grad_norm": 4.782625198364258, "learning_rate": 6.050881182490051e-05, "loss": 5.2746, "step": 1098 }, { "epoch": 1.5306406685236769, "grad_norm": 5.648792743682861, "learning_rate": 6.049886299033542e-05, "loss": 6.1578, "step": 1099 }, { "epoch": 1.532033426183844, "grad_norm": 4.749377727508545, "learning_rate": 6.0488914155770316e-05, "loss": 5.2959, "step": 1100 }, { "epoch": 1.533426183844011, "grad_norm": 4.50879430770874, "learning_rate": 6.0478965321205226e-05, "loss": 5.4286, "step": 1101 }, { "epoch": 1.5348189415041782, "grad_norm": 5.779273986816406, "learning_rate": 6.046901648664013e-05, "loss": 5.2275, "step": 1102 }, { "epoch": 1.5362116991643453, "grad_norm": 5.6755690574646, "learning_rate": 6.045906765207504e-05, "loss": 5.3256, "step": 1103 }, { "epoch": 1.5376044568245124, "grad_norm": 4.357021331787109, "learning_rate": 6.0449118817509945e-05, "loss": 5.2706, "step": 1104 }, { "epoch": 1.5389972144846795, "grad_norm": 4.6982502937316895, "learning_rate": 6.043916998294485e-05, "loss": 5.273, "step": 1105 }, { "epoch": 1.5403899721448466, "grad_norm": 4.713165283203125, "learning_rate": 6.0429221148379754e-05, "loss": 5.6245, "step": 1106 }, { "epoch": 1.541782729805014, "grad_norm": 4.421494960784912, "learning_rate": 6.0419272313814664e-05, "loss": 5.2232, "step": 1107 }, { "epoch": 1.543175487465181, "grad_norm": 4.202319622039795, "learning_rate": 6.040932347924957e-05, "loss": 5.5722, "step": 1108 }, { "epoch": 1.5445682451253482, "grad_norm": 4.8798089027404785, "learning_rate": 6.039937464468448e-05, "loss": 5.61, "step": 1109 }, { "epoch": 1.5459610027855153, "grad_norm": 3.93410587310791, "learning_rate": 6.038942581011938e-05, "loss": 5.9666, "step": 1110 }, { "epoch": 1.5473537604456824, "grad_norm": 4.741140842437744, "learning_rate": 6.037947697555429e-05, "loss": 5.4437, "step": 1111 }, { "epoch": 1.5487465181058497, "grad_norm": 4.0675177574157715, "learning_rate": 6.036952814098919e-05, "loss": 5.6989, "step": 1112 }, { "epoch": 1.5501392757660168, "grad_norm": 4.973630428314209, "learning_rate": 6.03595793064241e-05, "loss": 5.7861, "step": 1113 }, { "epoch": 1.551532033426184, "grad_norm": 3.842400550842285, "learning_rate": 6.0349630471859e-05, "loss": 5.1912, "step": 1114 }, { "epoch": 1.552924791086351, "grad_norm": 5.069960594177246, "learning_rate": 6.033968163729391e-05, "loss": 5.3714, "step": 1115 }, { "epoch": 1.5543175487465182, "grad_norm": 4.760850429534912, "learning_rate": 6.0329732802728815e-05, "loss": 5.0273, "step": 1116 }, { "epoch": 1.5557103064066853, "grad_norm": 4.261945724487305, "learning_rate": 6.0319783968163726e-05, "loss": 4.9191, "step": 1117 }, { "epoch": 1.5571030640668524, "grad_norm": 4.528093338012695, "learning_rate": 6.030983513359863e-05, "loss": 4.8012, "step": 1118 }, { "epoch": 1.5584958217270195, "grad_norm": 3.9830808639526367, "learning_rate": 6.0299886299033534e-05, "loss": 5.2467, "step": 1119 }, { "epoch": 1.5598885793871866, "grad_norm": 4.702970504760742, "learning_rate": 6.028993746446844e-05, "loss": 5.3754, "step": 1120 }, { "epoch": 1.5612813370473537, "grad_norm": 5.310110569000244, "learning_rate": 6.027998862990335e-05, "loss": 5.2087, "step": 1121 }, { "epoch": 1.5626740947075208, "grad_norm": 4.7408127784729, "learning_rate": 6.027003979533826e-05, "loss": 5.0399, "step": 1122 }, { "epoch": 1.564066852367688, "grad_norm": 5.498900890350342, "learning_rate": 6.0260090960773164e-05, "loss": 5.0449, "step": 1123 }, { "epoch": 1.565459610027855, "grad_norm": 5.8566484451293945, "learning_rate": 6.025014212620806e-05, "loss": 5.5611, "step": 1124 }, { "epoch": 1.5668523676880222, "grad_norm": 4.719356060028076, "learning_rate": 6.024019329164297e-05, "loss": 5.1937, "step": 1125 }, { "epoch": 1.5682451253481893, "grad_norm": 5.191523551940918, "learning_rate": 6.023024445707788e-05, "loss": 5.4316, "step": 1126 }, { "epoch": 1.5696378830083564, "grad_norm": 4.1674652099609375, "learning_rate": 6.022029562251279e-05, "loss": 5.3627, "step": 1127 }, { "epoch": 1.5710306406685237, "grad_norm": 4.102329254150391, "learning_rate": 6.02103467879477e-05, "loss": 5.3458, "step": 1128 }, { "epoch": 1.5724233983286908, "grad_norm": 5.134708881378174, "learning_rate": 6.0200397953382595e-05, "loss": 5.4587, "step": 1129 }, { "epoch": 1.573816155988858, "grad_norm": 4.33356237411499, "learning_rate": 6.0190449118817506e-05, "loss": 5.099, "step": 1130 }, { "epoch": 1.575208913649025, "grad_norm": 5.126018047332764, "learning_rate": 6.018050028425241e-05, "loss": 5.8141, "step": 1131 }, { "epoch": 1.5766016713091922, "grad_norm": 4.2986321449279785, "learning_rate": 6.017055144968732e-05, "loss": 5.7301, "step": 1132 }, { "epoch": 1.5779944289693595, "grad_norm": 5.645534992218018, "learning_rate": 6.0160602615122225e-05, "loss": 5.5876, "step": 1133 }, { "epoch": 1.5793871866295266, "grad_norm": 4.3288373947143555, "learning_rate": 6.015065378055713e-05, "loss": 4.9602, "step": 1134 }, { "epoch": 1.5807799442896937, "grad_norm": 4.655366897583008, "learning_rate": 6.014070494599203e-05, "loss": 5.3268, "step": 1135 }, { "epoch": 1.5821727019498608, "grad_norm": 4.137612819671631, "learning_rate": 6.0130756111426944e-05, "loss": 5.232, "step": 1136 }, { "epoch": 1.583565459610028, "grad_norm": 4.0097150802612305, "learning_rate": 6.012080727686185e-05, "loss": 4.9411, "step": 1137 }, { "epoch": 1.584958217270195, "grad_norm": 4.4006242752075195, "learning_rate": 6.011085844229676e-05, "loss": 5.2862, "step": 1138 }, { "epoch": 1.5863509749303621, "grad_norm": 6.664628982543945, "learning_rate": 6.0100909607731656e-05, "loss": 5.4074, "step": 1139 }, { "epoch": 1.5877437325905293, "grad_norm": 5.05947732925415, "learning_rate": 6.009096077316657e-05, "loss": 4.9289, "step": 1140 }, { "epoch": 1.5891364902506964, "grad_norm": 4.076658248901367, "learning_rate": 6.008101193860147e-05, "loss": 5.2478, "step": 1141 }, { "epoch": 1.5905292479108635, "grad_norm": 4.9287309646606445, "learning_rate": 6.007106310403638e-05, "loss": 5.638, "step": 1142 }, { "epoch": 1.5919220055710306, "grad_norm": 4.976055145263672, "learning_rate": 6.0061114269471286e-05, "loss": 4.9995, "step": 1143 }, { "epoch": 1.5933147632311977, "grad_norm": 4.467281818389893, "learning_rate": 6.005116543490619e-05, "loss": 5.0632, "step": 1144 }, { "epoch": 1.5947075208913648, "grad_norm": 10.33066463470459, "learning_rate": 6.0041216600341094e-05, "loss": 5.5778, "step": 1145 }, { "epoch": 1.596100278551532, "grad_norm": 4.165136814117432, "learning_rate": 6.0031267765776005e-05, "loss": 5.0624, "step": 1146 }, { "epoch": 1.597493036211699, "grad_norm": 4.316961765289307, "learning_rate": 6.0021318931210916e-05, "loss": 5.7191, "step": 1147 }, { "epoch": 1.5988857938718661, "grad_norm": 4.557807445526123, "learning_rate": 6.001137009664582e-05, "loss": 5.7798, "step": 1148 }, { "epoch": 1.6002785515320335, "grad_norm": 5.020558834075928, "learning_rate": 6.000142126208072e-05, "loss": 5.1111, "step": 1149 }, { "epoch": 1.6016713091922006, "grad_norm": 4.93497371673584, "learning_rate": 5.999147242751563e-05, "loss": 5.0666, "step": 1150 }, { "epoch": 1.6030640668523677, "grad_norm": 4.127545356750488, "learning_rate": 5.998152359295054e-05, "loss": 5.3712, "step": 1151 }, { "epoch": 1.6044568245125348, "grad_norm": 5.075907230377197, "learning_rate": 5.997157475838544e-05, "loss": 5.0848, "step": 1152 }, { "epoch": 1.605849582172702, "grad_norm": 4.087526321411133, "learning_rate": 5.996162592382034e-05, "loss": 4.9592, "step": 1153 }, { "epoch": 1.6072423398328692, "grad_norm": 4.246445178985596, "learning_rate": 5.995167708925525e-05, "loss": 5.1955, "step": 1154 }, { "epoch": 1.6086350974930363, "grad_norm": 5.3127617835998535, "learning_rate": 5.994172825469016e-05, "loss": 5.2628, "step": 1155 }, { "epoch": 1.6100278551532035, "grad_norm": 4.276273250579834, "learning_rate": 5.9931779420125066e-05, "loss": 5.3533, "step": 1156 }, { "epoch": 1.6114206128133706, "grad_norm": 4.265781402587891, "learning_rate": 5.992183058555998e-05, "loss": 5.4358, "step": 1157 }, { "epoch": 1.6128133704735377, "grad_norm": 5.589931011199951, "learning_rate": 5.9911881750994874e-05, "loss": 5.4369, "step": 1158 }, { "epoch": 1.6142061281337048, "grad_norm": 5.889350414276123, "learning_rate": 5.9901932916429785e-05, "loss": 5.1033, "step": 1159 }, { "epoch": 1.615598885793872, "grad_norm": 3.8457164764404297, "learning_rate": 5.989198408186469e-05, "loss": 5.4135, "step": 1160 }, { "epoch": 1.616991643454039, "grad_norm": 7.082851886749268, "learning_rate": 5.98820352472996e-05, "loss": 5.1141, "step": 1161 }, { "epoch": 1.6183844011142061, "grad_norm": 4.579845905303955, "learning_rate": 5.9872086412734504e-05, "loss": 5.0879, "step": 1162 }, { "epoch": 1.6197771587743732, "grad_norm": 4.529720306396484, "learning_rate": 5.986213757816941e-05, "loss": 4.983, "step": 1163 }, { "epoch": 1.6211699164345403, "grad_norm": 4.010012149810791, "learning_rate": 5.985218874360431e-05, "loss": 5.1261, "step": 1164 }, { "epoch": 1.6225626740947074, "grad_norm": 5.6930460929870605, "learning_rate": 5.984223990903922e-05, "loss": 5.7673, "step": 1165 }, { "epoch": 1.6239554317548746, "grad_norm": 4.764626502990723, "learning_rate": 5.983229107447413e-05, "loss": 5.0265, "step": 1166 }, { "epoch": 1.6253481894150417, "grad_norm": 4.306996822357178, "learning_rate": 5.982234223990904e-05, "loss": 5.1573, "step": 1167 }, { "epoch": 1.6267409470752088, "grad_norm": 3.859175682067871, "learning_rate": 5.9812393405343935e-05, "loss": 4.9783, "step": 1168 }, { "epoch": 1.6281337047353759, "grad_norm": 3.9782230854034424, "learning_rate": 5.9802444570778846e-05, "loss": 5.3576, "step": 1169 }, { "epoch": 1.6295264623955432, "grad_norm": 4.932841777801514, "learning_rate": 5.979249573621375e-05, "loss": 5.5647, "step": 1170 }, { "epoch": 1.6309192200557103, "grad_norm": 4.469640254974365, "learning_rate": 5.978254690164866e-05, "loss": 5.3512, "step": 1171 }, { "epoch": 1.6323119777158774, "grad_norm": 4.7257981300354, "learning_rate": 5.9772598067083565e-05, "loss": 5.4221, "step": 1172 }, { "epoch": 1.6337047353760445, "grad_norm": 4.525543212890625, "learning_rate": 5.976264923251847e-05, "loss": 5.2254, "step": 1173 }, { "epoch": 1.6350974930362117, "grad_norm": 4.767129421234131, "learning_rate": 5.975270039795337e-05, "loss": 4.7804, "step": 1174 }, { "epoch": 1.636490250696379, "grad_norm": 4.485895156860352, "learning_rate": 5.9742751563388284e-05, "loss": 5.3896, "step": 1175 }, { "epoch": 1.637883008356546, "grad_norm": 3.918351888656616, "learning_rate": 5.9732802728823195e-05, "loss": 5.066, "step": 1176 }, { "epoch": 1.6392757660167132, "grad_norm": 4.607816696166992, "learning_rate": 5.97228538942581e-05, "loss": 5.2288, "step": 1177 }, { "epoch": 1.6406685236768803, "grad_norm": 5.340575695037842, "learning_rate": 5.9712905059692996e-05, "loss": 5.02, "step": 1178 }, { "epoch": 1.6420612813370474, "grad_norm": 4.5595383644104, "learning_rate": 5.970295622512791e-05, "loss": 4.5346, "step": 1179 }, { "epoch": 1.6434540389972145, "grad_norm": 5.011353015899658, "learning_rate": 5.969300739056282e-05, "loss": 4.9287, "step": 1180 }, { "epoch": 1.6448467966573816, "grad_norm": 3.863245964050293, "learning_rate": 5.968305855599772e-05, "loss": 5.3482, "step": 1181 }, { "epoch": 1.6462395543175488, "grad_norm": 4.161905765533447, "learning_rate": 5.967310972143263e-05, "loss": 5.464, "step": 1182 }, { "epoch": 1.6476323119777159, "grad_norm": 4.196154594421387, "learning_rate": 5.966316088686753e-05, "loss": 5.4455, "step": 1183 }, { "epoch": 1.649025069637883, "grad_norm": 4.436659812927246, "learning_rate": 5.965321205230244e-05, "loss": 4.9179, "step": 1184 }, { "epoch": 1.65041782729805, "grad_norm": 3.823091506958008, "learning_rate": 5.9643263217737345e-05, "loss": 5.2561, "step": 1185 }, { "epoch": 1.6518105849582172, "grad_norm": 4.765748023986816, "learning_rate": 5.9633314383172256e-05, "loss": 5.0113, "step": 1186 }, { "epoch": 1.6532033426183843, "grad_norm": 4.5616865158081055, "learning_rate": 5.962336554860716e-05, "loss": 5.0939, "step": 1187 }, { "epoch": 1.6545961002785514, "grad_norm": 4.232777118682861, "learning_rate": 5.9613416714042064e-05, "loss": 5.4659, "step": 1188 }, { "epoch": 1.6559888579387185, "grad_norm": 5.522634983062744, "learning_rate": 5.960346787947697e-05, "loss": 4.767, "step": 1189 }, { "epoch": 1.6573816155988856, "grad_norm": 4.595826148986816, "learning_rate": 5.959351904491188e-05, "loss": 4.8471, "step": 1190 }, { "epoch": 1.658774373259053, "grad_norm": 4.422351360321045, "learning_rate": 5.958357021034678e-05, "loss": 4.5475, "step": 1191 }, { "epoch": 1.66016713091922, "grad_norm": 4.2058587074279785, "learning_rate": 5.9573621375781694e-05, "loss": 5.1124, "step": 1192 }, { "epoch": 1.6615598885793872, "grad_norm": 6.848339080810547, "learning_rate": 5.956367254121659e-05, "loss": 5.3895, "step": 1193 }, { "epoch": 1.6629526462395543, "grad_norm": 5.851674556732178, "learning_rate": 5.95537237066515e-05, "loss": 5.0473, "step": 1194 }, { "epoch": 1.6643454038997214, "grad_norm": 4.681227684020996, "learning_rate": 5.9543774872086406e-05, "loss": 5.1896, "step": 1195 }, { "epoch": 1.6657381615598887, "grad_norm": 8.910616874694824, "learning_rate": 5.953382603752132e-05, "loss": 5.2984, "step": 1196 }, { "epoch": 1.6671309192200559, "grad_norm": 4.738180637359619, "learning_rate": 5.9523877202956214e-05, "loss": 5.1951, "step": 1197 }, { "epoch": 1.668523676880223, "grad_norm": 4.421966552734375, "learning_rate": 5.9513928368391125e-05, "loss": 5.0105, "step": 1198 }, { "epoch": 1.66991643454039, "grad_norm": 4.483490943908691, "learning_rate": 5.950397953382603e-05, "loss": 4.8556, "step": 1199 }, { "epoch": 1.6713091922005572, "grad_norm": 4.8139729499816895, "learning_rate": 5.949403069926094e-05, "loss": 5.4025, "step": 1200 }, { "epoch": 1.6727019498607243, "grad_norm": 5.430883884429932, "learning_rate": 5.9484081864695844e-05, "loss": 5.1012, "step": 1201 }, { "epoch": 1.6740947075208914, "grad_norm": 4.058892726898193, "learning_rate": 5.947413303013075e-05, "loss": 4.9456, "step": 1202 }, { "epoch": 1.6754874651810585, "grad_norm": 4.60087251663208, "learning_rate": 5.946418419556565e-05, "loss": 5.1531, "step": 1203 }, { "epoch": 1.6768802228412256, "grad_norm": 5.3073272705078125, "learning_rate": 5.945423536100056e-05, "loss": 5.061, "step": 1204 }, { "epoch": 1.6782729805013927, "grad_norm": 5.039947509765625, "learning_rate": 5.9444286526435474e-05, "loss": 5.0029, "step": 1205 }, { "epoch": 1.6796657381615598, "grad_norm": 6.318732738494873, "learning_rate": 5.943433769187038e-05, "loss": 4.9684, "step": 1206 }, { "epoch": 1.681058495821727, "grad_norm": 4.097740173339844, "learning_rate": 5.9424388857305275e-05, "loss": 4.6838, "step": 1207 }, { "epoch": 1.682451253481894, "grad_norm": 3.9446823596954346, "learning_rate": 5.9414440022740186e-05, "loss": 4.6736, "step": 1208 }, { "epoch": 1.6838440111420612, "grad_norm": 4.035701751708984, "learning_rate": 5.94044911881751e-05, "loss": 5.2903, "step": 1209 }, { "epoch": 1.6852367688022283, "grad_norm": 5.2842583656311035, "learning_rate": 5.939454235361e-05, "loss": 5.4825, "step": 1210 }, { "epoch": 1.6866295264623954, "grad_norm": 4.32784366607666, "learning_rate": 5.938459351904491e-05, "loss": 5.0295, "step": 1211 }, { "epoch": 1.6880222841225627, "grad_norm": 4.509637355804443, "learning_rate": 5.937464468447981e-05, "loss": 5.1461, "step": 1212 }, { "epoch": 1.6894150417827298, "grad_norm": 4.043561935424805, "learning_rate": 5.936469584991472e-05, "loss": 4.7742, "step": 1213 }, { "epoch": 1.690807799442897, "grad_norm": 5.205489635467529, "learning_rate": 5.9354747015349624e-05, "loss": 4.6934, "step": 1214 }, { "epoch": 1.692200557103064, "grad_norm": 4.248347282409668, "learning_rate": 5.9344798180784535e-05, "loss": 5.4886, "step": 1215 }, { "epoch": 1.6935933147632312, "grad_norm": 4.5697197914123535, "learning_rate": 5.933484934621944e-05, "loss": 5.0912, "step": 1216 }, { "epoch": 1.6949860724233985, "grad_norm": 4.81688928604126, "learning_rate": 5.932490051165434e-05, "loss": 5.4905, "step": 1217 }, { "epoch": 1.6963788300835656, "grad_norm": 5.066283702850342, "learning_rate": 5.931495167708925e-05, "loss": 4.8654, "step": 1218 }, { "epoch": 1.6977715877437327, "grad_norm": 4.438613414764404, "learning_rate": 5.930500284252416e-05, "loss": 5.0706, "step": 1219 }, { "epoch": 1.6991643454038998, "grad_norm": 4.010351181030273, "learning_rate": 5.929505400795906e-05, "loss": 5.4198, "step": 1220 }, { "epoch": 1.700557103064067, "grad_norm": 3.8836967945098877, "learning_rate": 5.928510517339397e-05, "loss": 5.3495, "step": 1221 }, { "epoch": 1.701949860724234, "grad_norm": 9.083257675170898, "learning_rate": 5.927515633882887e-05, "loss": 5.0419, "step": 1222 }, { "epoch": 1.7033426183844012, "grad_norm": 4.5670623779296875, "learning_rate": 5.926520750426378e-05, "loss": 4.8311, "step": 1223 }, { "epoch": 1.7047353760445683, "grad_norm": 4.448812484741211, "learning_rate": 5.9255258669698685e-05, "loss": 4.8841, "step": 1224 }, { "epoch": 1.7061281337047354, "grad_norm": 4.513146877288818, "learning_rate": 5.9245309835133596e-05, "loss": 5.0136, "step": 1225 }, { "epoch": 1.7075208913649025, "grad_norm": 4.682188034057617, "learning_rate": 5.92353610005685e-05, "loss": 4.8929, "step": 1226 }, { "epoch": 1.7089136490250696, "grad_norm": 4.83218240737915, "learning_rate": 5.9225412166003404e-05, "loss": 5.3724, "step": 1227 }, { "epoch": 1.7103064066852367, "grad_norm": 4.96373987197876, "learning_rate": 5.921546333143831e-05, "loss": 5.282, "step": 1228 }, { "epoch": 1.7116991643454038, "grad_norm": 4.282477378845215, "learning_rate": 5.920551449687322e-05, "loss": 4.4825, "step": 1229 }, { "epoch": 1.713091922005571, "grad_norm": 5.492599010467529, "learning_rate": 5.919556566230812e-05, "loss": 5.0134, "step": 1230 }, { "epoch": 1.714484679665738, "grad_norm": 5.012602806091309, "learning_rate": 5.9185616827743034e-05, "loss": 5.3737, "step": 1231 }, { "epoch": 1.7158774373259051, "grad_norm": 7.142603874206543, "learning_rate": 5.917566799317793e-05, "loss": 5.2829, "step": 1232 }, { "epoch": 1.7172701949860725, "grad_norm": 5.519572734832764, "learning_rate": 5.916571915861284e-05, "loss": 5.2057, "step": 1233 }, { "epoch": 1.7186629526462396, "grad_norm": 4.653135776519775, "learning_rate": 5.915577032404775e-05, "loss": 4.9784, "step": 1234 }, { "epoch": 1.7200557103064067, "grad_norm": 5.516115188598633, "learning_rate": 5.914582148948266e-05, "loss": 5.2837, "step": 1235 }, { "epoch": 1.7214484679665738, "grad_norm": 4.786309242248535, "learning_rate": 5.913587265491757e-05, "loss": 5.5864, "step": 1236 }, { "epoch": 1.722841225626741, "grad_norm": 4.339987277984619, "learning_rate": 5.9125923820352465e-05, "loss": 4.6804, "step": 1237 }, { "epoch": 1.724233983286908, "grad_norm": 4.238324165344238, "learning_rate": 5.9115974985787376e-05, "loss": 5.0248, "step": 1238 }, { "epoch": 1.7256267409470754, "grad_norm": 4.193300247192383, "learning_rate": 5.910602615122228e-05, "loss": 4.8384, "step": 1239 }, { "epoch": 1.7270194986072425, "grad_norm": 4.547834873199463, "learning_rate": 5.909607731665719e-05, "loss": 5.26, "step": 1240 }, { "epoch": 1.7284122562674096, "grad_norm": 5.412426471710205, "learning_rate": 5.908612848209209e-05, "loss": 5.0807, "step": 1241 }, { "epoch": 1.7298050139275767, "grad_norm": 4.3400139808654785, "learning_rate": 5.9076179647527e-05, "loss": 4.8661, "step": 1242 }, { "epoch": 1.7311977715877438, "grad_norm": 4.30125093460083, "learning_rate": 5.90662308129619e-05, "loss": 5.465, "step": 1243 }, { "epoch": 1.732590529247911, "grad_norm": 7.581824779510498, "learning_rate": 5.9056281978396814e-05, "loss": 4.9617, "step": 1244 }, { "epoch": 1.733983286908078, "grad_norm": 4.4453020095825195, "learning_rate": 5.904633314383172e-05, "loss": 4.8422, "step": 1245 }, { "epoch": 1.7353760445682451, "grad_norm": 4.197065353393555, "learning_rate": 5.903638430926662e-05, "loss": 5.1462, "step": 1246 }, { "epoch": 1.7367688022284122, "grad_norm": 4.419856548309326, "learning_rate": 5.9026435474701526e-05, "loss": 5.35, "step": 1247 }, { "epoch": 1.7381615598885793, "grad_norm": 5.0929789543151855, "learning_rate": 5.901648664013644e-05, "loss": 4.7436, "step": 1248 }, { "epoch": 1.7395543175487465, "grad_norm": 5.040703296661377, "learning_rate": 5.900653780557134e-05, "loss": 5.0925, "step": 1249 }, { "epoch": 1.7409470752089136, "grad_norm": 3.77506160736084, "learning_rate": 5.899658897100625e-05, "loss": 4.6411, "step": 1250 }, { "epoch": 1.7423398328690807, "grad_norm": 4.22568416595459, "learning_rate": 5.898664013644115e-05, "loss": 5.2016, "step": 1251 }, { "epoch": 1.7437325905292478, "grad_norm": 4.241969585418701, "learning_rate": 5.897669130187606e-05, "loss": 5.059, "step": 1252 }, { "epoch": 1.745125348189415, "grad_norm": 4.156410217285156, "learning_rate": 5.8966742467310964e-05, "loss": 5.1263, "step": 1253 }, { "epoch": 1.7465181058495822, "grad_norm": 3.999236822128296, "learning_rate": 5.8956793632745875e-05, "loss": 4.89, "step": 1254 }, { "epoch": 1.7479108635097493, "grad_norm": 4.329075336456299, "learning_rate": 5.894684479818078e-05, "loss": 4.871, "step": 1255 }, { "epoch": 1.7493036211699164, "grad_norm": 5.040160179138184, "learning_rate": 5.893689596361568e-05, "loss": 4.9036, "step": 1256 }, { "epoch": 1.7506963788300836, "grad_norm": 4.441908836364746, "learning_rate": 5.892694712905059e-05, "loss": 5.5134, "step": 1257 }, { "epoch": 1.7520891364902507, "grad_norm": 6.066964149475098, "learning_rate": 5.89169982944855e-05, "loss": 5.083, "step": 1258 }, { "epoch": 1.7534818941504178, "grad_norm": 4.725863456726074, "learning_rate": 5.89070494599204e-05, "loss": 5.5413, "step": 1259 }, { "epoch": 1.754874651810585, "grad_norm": 4.986145496368408, "learning_rate": 5.889710062535531e-05, "loss": 5.0532, "step": 1260 }, { "epoch": 1.7562674094707522, "grad_norm": 4.244281768798828, "learning_rate": 5.888715179079021e-05, "loss": 5.3201, "step": 1261 }, { "epoch": 1.7576601671309193, "grad_norm": 4.933448791503906, "learning_rate": 5.887720295622512e-05, "loss": 5.2326, "step": 1262 }, { "epoch": 1.7590529247910864, "grad_norm": 4.831197738647461, "learning_rate": 5.886725412166003e-05, "loss": 5.0347, "step": 1263 }, { "epoch": 1.7604456824512535, "grad_norm": 6.52940034866333, "learning_rate": 5.8857305287094936e-05, "loss": 4.7476, "step": 1264 }, { "epoch": 1.7618384401114207, "grad_norm": 4.689281463623047, "learning_rate": 5.884735645252985e-05, "loss": 5.3087, "step": 1265 }, { "epoch": 1.7632311977715878, "grad_norm": 4.21406888961792, "learning_rate": 5.8837407617964744e-05, "loss": 5.3117, "step": 1266 }, { "epoch": 1.7646239554317549, "grad_norm": 4.888186931610107, "learning_rate": 5.8827458783399655e-05, "loss": 4.9891, "step": 1267 }, { "epoch": 1.766016713091922, "grad_norm": 6.084913730621338, "learning_rate": 5.881750994883456e-05, "loss": 4.9797, "step": 1268 }, { "epoch": 1.767409470752089, "grad_norm": 9.895624160766602, "learning_rate": 5.880756111426947e-05, "loss": 5.259, "step": 1269 }, { "epoch": 1.7688022284122562, "grad_norm": 4.0565571784973145, "learning_rate": 5.8797612279704374e-05, "loss": 4.9253, "step": 1270 }, { "epoch": 1.7701949860724233, "grad_norm": 5.655186653137207, "learning_rate": 5.878766344513928e-05, "loss": 4.9211, "step": 1271 }, { "epoch": 1.7715877437325904, "grad_norm": 3.942230463027954, "learning_rate": 5.877771461057418e-05, "loss": 5.1804, "step": 1272 }, { "epoch": 1.7729805013927575, "grad_norm": 5.159029483795166, "learning_rate": 5.876776577600909e-05, "loss": 5.0458, "step": 1273 }, { "epoch": 1.7743732590529246, "grad_norm": 5.114142894744873, "learning_rate": 5.8757816941444e-05, "loss": 4.7971, "step": 1274 }, { "epoch": 1.775766016713092, "grad_norm": 5.485936641693115, "learning_rate": 5.874786810687891e-05, "loss": 4.6306, "step": 1275 }, { "epoch": 1.777158774373259, "grad_norm": 6.276236057281494, "learning_rate": 5.8737919272313805e-05, "loss": 5.1194, "step": 1276 }, { "epoch": 1.7785515320334262, "grad_norm": 4.214354991912842, "learning_rate": 5.8727970437748716e-05, "loss": 4.6878, "step": 1277 }, { "epoch": 1.7799442896935933, "grad_norm": 6.638233184814453, "learning_rate": 5.871802160318362e-05, "loss": 4.8345, "step": 1278 }, { "epoch": 1.7813370473537604, "grad_norm": 4.968391418457031, "learning_rate": 5.870807276861853e-05, "loss": 4.3399, "step": 1279 }, { "epoch": 1.7827298050139275, "grad_norm": 4.857850074768066, "learning_rate": 5.8698123934053435e-05, "loss": 5.1188, "step": 1280 }, { "epoch": 1.7841225626740949, "grad_norm": 4.397397518157959, "learning_rate": 5.868817509948834e-05, "loss": 5.0782, "step": 1281 }, { "epoch": 1.785515320334262, "grad_norm": 4.862459659576416, "learning_rate": 5.8678226264923243e-05, "loss": 4.7992, "step": 1282 }, { "epoch": 1.786908077994429, "grad_norm": 4.83461856842041, "learning_rate": 5.8668277430358154e-05, "loss": 5.2456, "step": 1283 }, { "epoch": 1.7883008356545962, "grad_norm": 5.348004341125488, "learning_rate": 5.865832859579306e-05, "loss": 4.909, "step": 1284 }, { "epoch": 1.7896935933147633, "grad_norm": 3.98246169090271, "learning_rate": 5.864837976122796e-05, "loss": 4.9827, "step": 1285 }, { "epoch": 1.7910863509749304, "grad_norm": 4.485197067260742, "learning_rate": 5.8638430926662867e-05, "loss": 4.8303, "step": 1286 }, { "epoch": 1.7924791086350975, "grad_norm": 4.2006731033325195, "learning_rate": 5.862848209209778e-05, "loss": 4.3103, "step": 1287 }, { "epoch": 1.7938718662952646, "grad_norm": 5.053812503814697, "learning_rate": 5.861853325753269e-05, "loss": 5.1545, "step": 1288 }, { "epoch": 1.7952646239554317, "grad_norm": 5.753026962280273, "learning_rate": 5.860858442296759e-05, "loss": 5.0855, "step": 1289 }, { "epoch": 1.7966573816155988, "grad_norm": 4.728921413421631, "learning_rate": 5.859863558840249e-05, "loss": 4.6271, "step": 1290 }, { "epoch": 1.798050139275766, "grad_norm": 4.88857364654541, "learning_rate": 5.85886867538374e-05, "loss": 4.8991, "step": 1291 }, { "epoch": 1.799442896935933, "grad_norm": 4.64939022064209, "learning_rate": 5.857873791927231e-05, "loss": 4.7404, "step": 1292 }, { "epoch": 1.8008356545961002, "grad_norm": 4.134711265563965, "learning_rate": 5.8568789084707215e-05, "loss": 4.9327, "step": 1293 }, { "epoch": 1.8022284122562673, "grad_norm": 4.269225120544434, "learning_rate": 5.8558840250142126e-05, "loss": 5.1741, "step": 1294 }, { "epoch": 1.8036211699164344, "grad_norm": 4.530089378356934, "learning_rate": 5.8548891415577024e-05, "loss": 4.7819, "step": 1295 }, { "epoch": 1.8050139275766015, "grad_norm": 5.171576976776123, "learning_rate": 5.8538942581011934e-05, "loss": 5.1002, "step": 1296 }, { "epoch": 1.8064066852367688, "grad_norm": 4.574953079223633, "learning_rate": 5.852899374644684e-05, "loss": 5.2196, "step": 1297 }, { "epoch": 1.807799442896936, "grad_norm": 4.656442165374756, "learning_rate": 5.851904491188175e-05, "loss": 4.8261, "step": 1298 }, { "epoch": 1.809192200557103, "grad_norm": 5.161946773529053, "learning_rate": 5.8509096077316653e-05, "loss": 5.1474, "step": 1299 }, { "epoch": 1.8105849582172702, "grad_norm": 5.265983581542969, "learning_rate": 5.849914724275156e-05, "loss": 4.902, "step": 1300 }, { "epoch": 1.8119777158774373, "grad_norm": 4.652301788330078, "learning_rate": 5.848919840818646e-05, "loss": 4.722, "step": 1301 }, { "epoch": 1.8133704735376046, "grad_norm": 4.341517925262451, "learning_rate": 5.847924957362137e-05, "loss": 5.2609, "step": 1302 }, { "epoch": 1.8147632311977717, "grad_norm": 5.4265313148498535, "learning_rate": 5.8469300739056277e-05, "loss": 5.1165, "step": 1303 }, { "epoch": 1.8161559888579388, "grad_norm": 4.894313812255859, "learning_rate": 5.845935190449119e-05, "loss": 4.8027, "step": 1304 }, { "epoch": 1.817548746518106, "grad_norm": 4.270108222961426, "learning_rate": 5.8449403069926085e-05, "loss": 4.9183, "step": 1305 }, { "epoch": 1.818941504178273, "grad_norm": 6.259138107299805, "learning_rate": 5.8439454235360996e-05, "loss": 4.9659, "step": 1306 }, { "epoch": 1.8203342618384402, "grad_norm": 4.402987480163574, "learning_rate": 5.84295054007959e-05, "loss": 4.5167, "step": 1307 }, { "epoch": 1.8217270194986073, "grad_norm": 4.210943698883057, "learning_rate": 5.841955656623081e-05, "loss": 5.0627, "step": 1308 }, { "epoch": 1.8231197771587744, "grad_norm": 4.103539943695068, "learning_rate": 5.8409607731665715e-05, "loss": 5.1827, "step": 1309 }, { "epoch": 1.8245125348189415, "grad_norm": 6.2264299392700195, "learning_rate": 5.839965889710062e-05, "loss": 5.2231, "step": 1310 }, { "epoch": 1.8259052924791086, "grad_norm": 6.247641563415527, "learning_rate": 5.838971006253552e-05, "loss": 5.0428, "step": 1311 }, { "epoch": 1.8272980501392757, "grad_norm": 4.584635257720947, "learning_rate": 5.8379761227970434e-05, "loss": 4.9395, "step": 1312 }, { "epoch": 1.8286908077994428, "grad_norm": 3.6346242427825928, "learning_rate": 5.836981239340534e-05, "loss": 4.8325, "step": 1313 }, { "epoch": 1.83008356545961, "grad_norm": 4.009475231170654, "learning_rate": 5.835986355884025e-05, "loss": 4.6872, "step": 1314 }, { "epoch": 1.831476323119777, "grad_norm": 4.166362762451172, "learning_rate": 5.8349914724275146e-05, "loss": 5.1893, "step": 1315 }, { "epoch": 1.8328690807799441, "grad_norm": 5.342883586883545, "learning_rate": 5.8339965889710057e-05, "loss": 5.0689, "step": 1316 }, { "epoch": 1.8342618384401113, "grad_norm": 4.411332607269287, "learning_rate": 5.833001705514497e-05, "loss": 4.847, "step": 1317 }, { "epoch": 1.8356545961002786, "grad_norm": 5.047307968139648, "learning_rate": 5.832006822057987e-05, "loss": 5.1624, "step": 1318 }, { "epoch": 1.8370473537604457, "grad_norm": 5.890045166015625, "learning_rate": 5.831011938601478e-05, "loss": 4.8237, "step": 1319 }, { "epoch": 1.8384401114206128, "grad_norm": 5.271467208862305, "learning_rate": 5.830017055144968e-05, "loss": 5.0535, "step": 1320 }, { "epoch": 1.83983286908078, "grad_norm": 4.287512302398682, "learning_rate": 5.829022171688459e-05, "loss": 4.3351, "step": 1321 }, { "epoch": 1.841225626740947, "grad_norm": 4.520585536956787, "learning_rate": 5.8280272882319495e-05, "loss": 5.2462, "step": 1322 }, { "epoch": 1.8426183844011144, "grad_norm": 4.573328971862793, "learning_rate": 5.8270324047754405e-05, "loss": 4.6179, "step": 1323 }, { "epoch": 1.8440111420612815, "grad_norm": 4.49533748626709, "learning_rate": 5.82603752131893e-05, "loss": 5.1825, "step": 1324 }, { "epoch": 1.8454038997214486, "grad_norm": 4.156423568725586, "learning_rate": 5.8250426378624214e-05, "loss": 5.0544, "step": 1325 }, { "epoch": 1.8467966573816157, "grad_norm": 4.7064433097839355, "learning_rate": 5.824047754405912e-05, "loss": 5.0148, "step": 1326 }, { "epoch": 1.8481894150417828, "grad_norm": 4.827240467071533, "learning_rate": 5.823052870949403e-05, "loss": 4.6688, "step": 1327 }, { "epoch": 1.84958217270195, "grad_norm": 5.145040035247803, "learning_rate": 5.822057987492893e-05, "loss": 5.3015, "step": 1328 }, { "epoch": 1.850974930362117, "grad_norm": 7.89101505279541, "learning_rate": 5.821063104036384e-05, "loss": 4.8715, "step": 1329 }, { "epoch": 1.8523676880222841, "grad_norm": 4.7502923011779785, "learning_rate": 5.820068220579874e-05, "loss": 5.0511, "step": 1330 }, { "epoch": 1.8537604456824512, "grad_norm": 4.670762538909912, "learning_rate": 5.819073337123365e-05, "loss": 4.7304, "step": 1331 }, { "epoch": 1.8551532033426184, "grad_norm": 3.9312586784362793, "learning_rate": 5.8180784536668556e-05, "loss": 4.8302, "step": 1332 }, { "epoch": 1.8565459610027855, "grad_norm": 4.732985496520996, "learning_rate": 5.8170835702103467e-05, "loss": 4.81, "step": 1333 }, { "epoch": 1.8579387186629526, "grad_norm": 5.042375564575195, "learning_rate": 5.8160886867538364e-05, "loss": 4.7127, "step": 1334 }, { "epoch": 1.8593314763231197, "grad_norm": 4.160640239715576, "learning_rate": 5.8150938032973275e-05, "loss": 5.0566, "step": 1335 }, { "epoch": 1.8607242339832868, "grad_norm": 4.453600883483887, "learning_rate": 5.814098919840818e-05, "loss": 4.7741, "step": 1336 }, { "epoch": 1.862116991643454, "grad_norm": 4.191370964050293, "learning_rate": 5.813104036384309e-05, "loss": 4.9208, "step": 1337 }, { "epoch": 1.863509749303621, "grad_norm": 4.520490646362305, "learning_rate": 5.8121091529277994e-05, "loss": 4.9874, "step": 1338 }, { "epoch": 1.8649025069637883, "grad_norm": 5.264113426208496, "learning_rate": 5.81111426947129e-05, "loss": 4.6024, "step": 1339 }, { "epoch": 1.8662952646239555, "grad_norm": 4.082907199859619, "learning_rate": 5.81011938601478e-05, "loss": 5.0785, "step": 1340 }, { "epoch": 1.8676880222841226, "grad_norm": 3.9186766147613525, "learning_rate": 5.809124502558271e-05, "loss": 4.5587, "step": 1341 }, { "epoch": 1.8690807799442897, "grad_norm": 4.418737888336182, "learning_rate": 5.808129619101762e-05, "loss": 4.3297, "step": 1342 }, { "epoch": 1.8704735376044568, "grad_norm": 4.381232738494873, "learning_rate": 5.807134735645253e-05, "loss": 4.4724, "step": 1343 }, { "epoch": 1.8718662952646241, "grad_norm": 4.592318534851074, "learning_rate": 5.8061398521887425e-05, "loss": 4.6123, "step": 1344 }, { "epoch": 1.8732590529247912, "grad_norm": 6.614316463470459, "learning_rate": 5.8051449687322336e-05, "loss": 4.9827, "step": 1345 }, { "epoch": 1.8746518105849583, "grad_norm": 4.040852069854736, "learning_rate": 5.8041500852757247e-05, "loss": 4.7856, "step": 1346 }, { "epoch": 1.8760445682451254, "grad_norm": 4.913686275482178, "learning_rate": 5.803155201819215e-05, "loss": 5.0156, "step": 1347 }, { "epoch": 1.8774373259052926, "grad_norm": 5.432875633239746, "learning_rate": 5.802160318362706e-05, "loss": 4.8061, "step": 1348 }, { "epoch": 1.8788300835654597, "grad_norm": 4.370886325836182, "learning_rate": 5.801165434906196e-05, "loss": 4.8098, "step": 1349 }, { "epoch": 1.8802228412256268, "grad_norm": 4.216629505157471, "learning_rate": 5.800170551449687e-05, "loss": 4.3694, "step": 1350 }, { "epoch": 1.8816155988857939, "grad_norm": 4.49286413192749, "learning_rate": 5.7991756679931774e-05, "loss": 5.1319, "step": 1351 }, { "epoch": 1.883008356545961, "grad_norm": 4.426698684692383, "learning_rate": 5.7981807845366685e-05, "loss": 5.2308, "step": 1352 }, { "epoch": 1.884401114206128, "grad_norm": 4.446043491363525, "learning_rate": 5.797185901080159e-05, "loss": 4.736, "step": 1353 }, { "epoch": 1.8857938718662952, "grad_norm": 4.116539001464844, "learning_rate": 5.796191017623649e-05, "loss": 4.8607, "step": 1354 }, { "epoch": 1.8871866295264623, "grad_norm": 3.835434913635254, "learning_rate": 5.79519613416714e-05, "loss": 4.304, "step": 1355 }, { "epoch": 1.8885793871866294, "grad_norm": 4.829227447509766, "learning_rate": 5.794201250710631e-05, "loss": 4.9886, "step": 1356 }, { "epoch": 1.8899721448467965, "grad_norm": 6.427865982055664, "learning_rate": 5.793206367254121e-05, "loss": 4.2493, "step": 1357 }, { "epoch": 1.8913649025069637, "grad_norm": 7.588675498962402, "learning_rate": 5.792211483797612e-05, "loss": 5.2044, "step": 1358 }, { "epoch": 1.8927576601671308, "grad_norm": 5.770694255828857, "learning_rate": 5.791216600341102e-05, "loss": 4.5427, "step": 1359 }, { "epoch": 1.894150417827298, "grad_norm": 4.1365485191345215, "learning_rate": 5.790221716884593e-05, "loss": 5.0175, "step": 1360 }, { "epoch": 1.8955431754874652, "grad_norm": 3.9512853622436523, "learning_rate": 5.7892268334280835e-05, "loss": 5.1879, "step": 1361 }, { "epoch": 1.8969359331476323, "grad_norm": 4.1711530685424805, "learning_rate": 5.7882319499715746e-05, "loss": 4.5685, "step": 1362 }, { "epoch": 1.8983286908077994, "grad_norm": 4.351253986358643, "learning_rate": 5.787237066515065e-05, "loss": 4.6267, "step": 1363 }, { "epoch": 1.8997214484679665, "grad_norm": 4.42673397064209, "learning_rate": 5.7862421830585554e-05, "loss": 5.4112, "step": 1364 }, { "epoch": 1.9011142061281339, "grad_norm": 4.138251304626465, "learning_rate": 5.785247299602046e-05, "loss": 4.8453, "step": 1365 }, { "epoch": 1.902506963788301, "grad_norm": 4.287871360778809, "learning_rate": 5.784252416145537e-05, "loss": 4.9142, "step": 1366 }, { "epoch": 1.903899721448468, "grad_norm": 4.384206295013428, "learning_rate": 5.783257532689027e-05, "loss": 4.8998, "step": 1367 }, { "epoch": 1.9052924791086352, "grad_norm": 6.017545223236084, "learning_rate": 5.782262649232518e-05, "loss": 4.8551, "step": 1368 }, { "epoch": 1.9066852367688023, "grad_norm": 5.367592811584473, "learning_rate": 5.781267765776008e-05, "loss": 5.2526, "step": 1369 }, { "epoch": 1.9080779944289694, "grad_norm": 5.079102039337158, "learning_rate": 5.780272882319499e-05, "loss": 4.7068, "step": 1370 }, { "epoch": 1.9094707520891365, "grad_norm": 5.6808648109436035, "learning_rate": 5.7792779988629896e-05, "loss": 4.6206, "step": 1371 }, { "epoch": 1.9108635097493036, "grad_norm": 3.902747869491577, "learning_rate": 5.778283115406481e-05, "loss": 4.2311, "step": 1372 }, { "epoch": 1.9122562674094707, "grad_norm": 4.13630485534668, "learning_rate": 5.7772882319499704e-05, "loss": 4.94, "step": 1373 }, { "epoch": 1.9136490250696379, "grad_norm": 4.3250250816345215, "learning_rate": 5.7762933484934615e-05, "loss": 4.6258, "step": 1374 }, { "epoch": 1.915041782729805, "grad_norm": 4.0559186935424805, "learning_rate": 5.7752984650369526e-05, "loss": 5.2015, "step": 1375 }, { "epoch": 1.916434540389972, "grad_norm": 4.951042175292969, "learning_rate": 5.774303581580443e-05, "loss": 5.2368, "step": 1376 }, { "epoch": 1.9178272980501392, "grad_norm": 4.291134834289551, "learning_rate": 5.773308698123934e-05, "loss": 4.7236, "step": 1377 }, { "epoch": 1.9192200557103063, "grad_norm": 6.4045515060424805, "learning_rate": 5.772313814667424e-05, "loss": 5.3078, "step": 1378 }, { "epoch": 1.9206128133704734, "grad_norm": 4.630138874053955, "learning_rate": 5.771318931210915e-05, "loss": 4.6285, "step": 1379 }, { "epoch": 1.9220055710306405, "grad_norm": 5.62636661529541, "learning_rate": 5.770324047754405e-05, "loss": 4.3099, "step": 1380 }, { "epoch": 1.9233983286908078, "grad_norm": 4.544035911560059, "learning_rate": 5.7693291642978964e-05, "loss": 4.6767, "step": 1381 }, { "epoch": 1.924791086350975, "grad_norm": 4.146763324737549, "learning_rate": 5.768334280841387e-05, "loss": 4.5114, "step": 1382 }, { "epoch": 1.926183844011142, "grad_norm": 5.176914215087891, "learning_rate": 5.767339397384877e-05, "loss": 4.3382, "step": 1383 }, { "epoch": 1.9275766016713092, "grad_norm": 4.158557415008545, "learning_rate": 5.7663445139283676e-05, "loss": 5.1691, "step": 1384 }, { "epoch": 1.9289693593314763, "grad_norm": 5.481085777282715, "learning_rate": 5.765349630471859e-05, "loss": 4.883, "step": 1385 }, { "epoch": 1.9303621169916436, "grad_norm": 4.413912296295166, "learning_rate": 5.764354747015349e-05, "loss": 4.4083, "step": 1386 }, { "epoch": 1.9317548746518107, "grad_norm": 7.916518211364746, "learning_rate": 5.76335986355884e-05, "loss": 5.2687, "step": 1387 }, { "epoch": 1.9331476323119778, "grad_norm": 3.890836000442505, "learning_rate": 5.76236498010233e-05, "loss": 4.4788, "step": 1388 }, { "epoch": 1.934540389972145, "grad_norm": 4.351379871368408, "learning_rate": 5.761370096645821e-05, "loss": 4.386, "step": 1389 }, { "epoch": 1.935933147632312, "grad_norm": 6.305099010467529, "learning_rate": 5.7603752131893114e-05, "loss": 4.857, "step": 1390 }, { "epoch": 1.9373259052924792, "grad_norm": 5.599578857421875, "learning_rate": 5.7593803297328025e-05, "loss": 5.7696, "step": 1391 }, { "epoch": 1.9387186629526463, "grad_norm": 3.866544723510742, "learning_rate": 5.758385446276293e-05, "loss": 4.3325, "step": 1392 }, { "epoch": 1.9401114206128134, "grad_norm": 4.895143508911133, "learning_rate": 5.757390562819783e-05, "loss": 4.8292, "step": 1393 }, { "epoch": 1.9415041782729805, "grad_norm": 4.506255149841309, "learning_rate": 5.756395679363274e-05, "loss": 4.5273, "step": 1394 }, { "epoch": 1.9428969359331476, "grad_norm": 4.85635232925415, "learning_rate": 5.755400795906765e-05, "loss": 5.0543, "step": 1395 }, { "epoch": 1.9442896935933147, "grad_norm": 5.331640720367432, "learning_rate": 5.754405912450255e-05, "loss": 4.5892, "step": 1396 }, { "epoch": 1.9456824512534818, "grad_norm": 3.6229052543640137, "learning_rate": 5.753411028993746e-05, "loss": 4.6695, "step": 1397 }, { "epoch": 1.947075208913649, "grad_norm": 4.780959129333496, "learning_rate": 5.752416145537236e-05, "loss": 4.4121, "step": 1398 }, { "epoch": 1.948467966573816, "grad_norm": 4.7896809577941895, "learning_rate": 5.751421262080727e-05, "loss": 4.9175, "step": 1399 }, { "epoch": 1.9498607242339832, "grad_norm": 4.381043434143066, "learning_rate": 5.750426378624218e-05, "loss": 4.8115, "step": 1400 }, { "epoch": 1.9512534818941503, "grad_norm": 4.62701940536499, "learning_rate": 5.7494314951677086e-05, "loss": 4.9335, "step": 1401 }, { "epoch": 1.9526462395543176, "grad_norm": 4.220277309417725, "learning_rate": 5.7484366117112e-05, "loss": 4.5878, "step": 1402 }, { "epoch": 1.9540389972144847, "grad_norm": 5.747648239135742, "learning_rate": 5.7474417282546894e-05, "loss": 4.9775, "step": 1403 }, { "epoch": 1.9554317548746518, "grad_norm": 5.05411958694458, "learning_rate": 5.7464468447981805e-05, "loss": 4.7716, "step": 1404 }, { "epoch": 1.956824512534819, "grad_norm": 4.94806432723999, "learning_rate": 5.745451961341671e-05, "loss": 4.5957, "step": 1405 }, { "epoch": 1.958217270194986, "grad_norm": 4.678225994110107, "learning_rate": 5.744457077885162e-05, "loss": 5.2641, "step": 1406 }, { "epoch": 1.9596100278551534, "grad_norm": 3.895622491836548, "learning_rate": 5.7434621944286524e-05, "loss": 4.472, "step": 1407 }, { "epoch": 1.9610027855153205, "grad_norm": 4.450009346008301, "learning_rate": 5.742467310972143e-05, "loss": 4.5857, "step": 1408 }, { "epoch": 1.9623955431754876, "grad_norm": 4.481184005737305, "learning_rate": 5.741472427515633e-05, "loss": 4.3904, "step": 1409 }, { "epoch": 1.9637883008356547, "grad_norm": 7.812098979949951, "learning_rate": 5.740477544059124e-05, "loss": 4.6505, "step": 1410 }, { "epoch": 1.9651810584958218, "grad_norm": 5.122473239898682, "learning_rate": 5.739482660602615e-05, "loss": 5.3639, "step": 1411 }, { "epoch": 1.966573816155989, "grad_norm": 4.021027088165283, "learning_rate": 5.738487777146105e-05, "loss": 4.5642, "step": 1412 }, { "epoch": 1.967966573816156, "grad_norm": 4.1866536140441895, "learning_rate": 5.7374928936895955e-05, "loss": 4.5343, "step": 1413 }, { "epoch": 1.9693593314763231, "grad_norm": 5.105562686920166, "learning_rate": 5.7364980102330866e-05, "loss": 4.7158, "step": 1414 }, { "epoch": 1.9707520891364902, "grad_norm": 4.511186122894287, "learning_rate": 5.735503126776577e-05, "loss": 5.0681, "step": 1415 }, { "epoch": 1.9721448467966574, "grad_norm": 4.926334857940674, "learning_rate": 5.734508243320068e-05, "loss": 5.056, "step": 1416 }, { "epoch": 1.9735376044568245, "grad_norm": 5.372550010681152, "learning_rate": 5.733513359863558e-05, "loss": 4.9231, "step": 1417 }, { "epoch": 1.9749303621169916, "grad_norm": 4.97346830368042, "learning_rate": 5.732518476407049e-05, "loss": 4.8801, "step": 1418 }, { "epoch": 1.9763231197771587, "grad_norm": 4.553598403930664, "learning_rate": 5.731523592950539e-05, "loss": 4.871, "step": 1419 }, { "epoch": 1.9777158774373258, "grad_norm": 4.302711486816406, "learning_rate": 5.7305287094940304e-05, "loss": 4.4123, "step": 1420 }, { "epoch": 1.979108635097493, "grad_norm": 5.098697185516357, "learning_rate": 5.729533826037521e-05, "loss": 4.1993, "step": 1421 }, { "epoch": 1.98050139275766, "grad_norm": 5.240197658538818, "learning_rate": 5.728538942581011e-05, "loss": 4.6883, "step": 1422 }, { "epoch": 1.9818941504178273, "grad_norm": 5.173120498657227, "learning_rate": 5.7275440591245016e-05, "loss": 5.0508, "step": 1423 }, { "epoch": 1.9832869080779945, "grad_norm": 5.3469390869140625, "learning_rate": 5.726549175667993e-05, "loss": 5.003, "step": 1424 }, { "epoch": 1.9846796657381616, "grad_norm": 4.947526454925537, "learning_rate": 5.725554292211483e-05, "loss": 5.0937, "step": 1425 }, { "epoch": 1.9860724233983287, "grad_norm": 4.714720249176025, "learning_rate": 5.724559408754974e-05, "loss": 4.5825, "step": 1426 }, { "epoch": 1.9874651810584958, "grad_norm": 5.9887471199035645, "learning_rate": 5.723564525298464e-05, "loss": 4.2472, "step": 1427 }, { "epoch": 1.988857938718663, "grad_norm": 4.950433731079102, "learning_rate": 5.722569641841955e-05, "loss": 4.8435, "step": 1428 }, { "epoch": 1.9902506963788302, "grad_norm": 4.301307201385498, "learning_rate": 5.721574758385446e-05, "loss": 4.5511, "step": 1429 }, { "epoch": 1.9916434540389973, "grad_norm": 4.059776306152344, "learning_rate": 5.7205798749289365e-05, "loss": 4.4791, "step": 1430 }, { "epoch": 1.9930362116991645, "grad_norm": 6.272601127624512, "learning_rate": 5.7195849914724276e-05, "loss": 4.6268, "step": 1431 }, { "epoch": 1.9944289693593316, "grad_norm": 6.988149642944336, "learning_rate": 5.718590108015917e-05, "loss": 4.3559, "step": 1432 }, { "epoch": 1.9958217270194987, "grad_norm": 5.213869094848633, "learning_rate": 5.7175952245594084e-05, "loss": 4.905, "step": 1433 }, { "epoch": 1.9972144846796658, "grad_norm": 4.848118305206299, "learning_rate": 5.716600341102899e-05, "loss": 4.5364, "step": 1434 }, { "epoch": 1.998607242339833, "grad_norm": 5.0276899337768555, "learning_rate": 5.71560545764639e-05, "loss": 4.815, "step": 1435 }, { "epoch": 2.0, "grad_norm": 4.235401153564453, "learning_rate": 5.71461057418988e-05, "loss": 4.4951, "step": 1436 }, { "epoch": 2.001392757660167, "grad_norm": 6.121840476989746, "learning_rate": 5.713615690733371e-05, "loss": 3.9839, "step": 1437 }, { "epoch": 2.002785515320334, "grad_norm": 4.8736748695373535, "learning_rate": 5.712620807276861e-05, "loss": 4.0053, "step": 1438 }, { "epoch": 2.0041782729805013, "grad_norm": 6.055631160736084, "learning_rate": 5.711625923820352e-05, "loss": 4.4934, "step": 1439 }, { "epoch": 2.0055710306406684, "grad_norm": 6.15172815322876, "learning_rate": 5.7106310403638426e-05, "loss": 4.4508, "step": 1440 }, { "epoch": 2.0069637883008355, "grad_norm": 3.874490976333618, "learning_rate": 5.709636156907334e-05, "loss": 4.2319, "step": 1441 }, { "epoch": 2.0083565459610027, "grad_norm": 4.357259273529053, "learning_rate": 5.7086412734508234e-05, "loss": 4.091, "step": 1442 }, { "epoch": 2.0097493036211698, "grad_norm": 4.5031328201293945, "learning_rate": 5.7076463899943145e-05, "loss": 4.3625, "step": 1443 }, { "epoch": 2.011142061281337, "grad_norm": 3.8418807983398438, "learning_rate": 5.706651506537805e-05, "loss": 3.8717, "step": 1444 }, { "epoch": 2.012534818941504, "grad_norm": 3.67148494720459, "learning_rate": 5.705656623081296e-05, "loss": 4.0714, "step": 1445 }, { "epoch": 2.013927576601671, "grad_norm": 4.8723320960998535, "learning_rate": 5.7046617396247864e-05, "loss": 4.1978, "step": 1446 }, { "epoch": 2.0153203342618387, "grad_norm": 4.041756629943848, "learning_rate": 5.703666856168277e-05, "loss": 4.7872, "step": 1447 }, { "epoch": 2.0167130919220058, "grad_norm": 4.530393600463867, "learning_rate": 5.702671972711767e-05, "loss": 4.1579, "step": 1448 }, { "epoch": 2.018105849582173, "grad_norm": 4.1396918296813965, "learning_rate": 5.701677089255258e-05, "loss": 4.4417, "step": 1449 }, { "epoch": 2.01949860724234, "grad_norm": 6.0312323570251465, "learning_rate": 5.700682205798749e-05, "loss": 4.3929, "step": 1450 }, { "epoch": 2.020891364902507, "grad_norm": 4.5816874504089355, "learning_rate": 5.69968732234224e-05, "loss": 4.5516, "step": 1451 }, { "epoch": 2.022284122562674, "grad_norm": 4.764605522155762, "learning_rate": 5.6986924388857295e-05, "loss": 4.2814, "step": 1452 }, { "epoch": 2.0236768802228413, "grad_norm": 4.251420021057129, "learning_rate": 5.6976975554292206e-05, "loss": 4.2084, "step": 1453 }, { "epoch": 2.0250696378830084, "grad_norm": 4.817270755767822, "learning_rate": 5.696702671972711e-05, "loss": 4.5213, "step": 1454 }, { "epoch": 2.0264623955431755, "grad_norm": 5.079899311065674, "learning_rate": 5.695707788516202e-05, "loss": 4.439, "step": 1455 }, { "epoch": 2.0278551532033426, "grad_norm": 5.405300617218018, "learning_rate": 5.694712905059692e-05, "loss": 3.9677, "step": 1456 }, { "epoch": 2.0292479108635098, "grad_norm": 3.9286108016967773, "learning_rate": 5.693718021603183e-05, "loss": 4.2411, "step": 1457 }, { "epoch": 2.030640668523677, "grad_norm": 3.5010762214660645, "learning_rate": 5.692723138146674e-05, "loss": 3.9355, "step": 1458 }, { "epoch": 2.032033426183844, "grad_norm": 4.042318820953369, "learning_rate": 5.6917282546901644e-05, "loss": 4.0139, "step": 1459 }, { "epoch": 2.033426183844011, "grad_norm": 4.314542293548584, "learning_rate": 5.6907333712336555e-05, "loss": 4.1083, "step": 1460 }, { "epoch": 2.034818941504178, "grad_norm": 4.028563976287842, "learning_rate": 5.689738487777145e-05, "loss": 4.2008, "step": 1461 }, { "epoch": 2.0362116991643453, "grad_norm": 5.131320476531982, "learning_rate": 5.688743604320636e-05, "loss": 4.6816, "step": 1462 }, { "epoch": 2.0376044568245124, "grad_norm": 4.556960105895996, "learning_rate": 5.687748720864127e-05, "loss": 4.4714, "step": 1463 }, { "epoch": 2.0389972144846795, "grad_norm": 4.0364251136779785, "learning_rate": 5.686753837407618e-05, "loss": 3.961, "step": 1464 }, { "epoch": 2.0403899721448466, "grad_norm": 3.756049871444702, "learning_rate": 5.685758953951108e-05, "loss": 4.0901, "step": 1465 }, { "epoch": 2.0417827298050137, "grad_norm": 3.9342846870422363, "learning_rate": 5.6847640704945986e-05, "loss": 4.273, "step": 1466 }, { "epoch": 2.043175487465181, "grad_norm": 4.469848155975342, "learning_rate": 5.683769187038089e-05, "loss": 4.0942, "step": 1467 }, { "epoch": 2.0445682451253484, "grad_norm": 4.504777908325195, "learning_rate": 5.68277430358158e-05, "loss": 4.1311, "step": 1468 }, { "epoch": 2.0459610027855155, "grad_norm": 6.825126647949219, "learning_rate": 5.6817794201250705e-05, "loss": 4.2753, "step": 1469 }, { "epoch": 2.0473537604456826, "grad_norm": 4.323886394500732, "learning_rate": 5.6807845366685616e-05, "loss": 4.7228, "step": 1470 }, { "epoch": 2.0487465181058497, "grad_norm": 3.9217405319213867, "learning_rate": 5.6797896532120513e-05, "loss": 4.4236, "step": 1471 }, { "epoch": 2.050139275766017, "grad_norm": 3.924076795578003, "learning_rate": 5.6787947697555424e-05, "loss": 4.2186, "step": 1472 }, { "epoch": 2.051532033426184, "grad_norm": 4.097128868103027, "learning_rate": 5.677799886299033e-05, "loss": 4.4548, "step": 1473 }, { "epoch": 2.052924791086351, "grad_norm": 4.621747016906738, "learning_rate": 5.676805002842524e-05, "loss": 4.4286, "step": 1474 }, { "epoch": 2.054317548746518, "grad_norm": 4.6758856773376465, "learning_rate": 5.675810119386014e-05, "loss": 4.2658, "step": 1475 }, { "epoch": 2.0557103064066853, "grad_norm": 4.02954626083374, "learning_rate": 5.674815235929505e-05, "loss": 4.005, "step": 1476 }, { "epoch": 2.0571030640668524, "grad_norm": 3.7476682662963867, "learning_rate": 5.673820352472995e-05, "loss": 3.9305, "step": 1477 }, { "epoch": 2.0584958217270195, "grad_norm": 3.8278512954711914, "learning_rate": 5.672825469016486e-05, "loss": 4.4554, "step": 1478 }, { "epoch": 2.0598885793871866, "grad_norm": 4.185937881469727, "learning_rate": 5.6718305855599766e-05, "loss": 4.175, "step": 1479 }, { "epoch": 2.0612813370473537, "grad_norm": 3.753911018371582, "learning_rate": 5.670835702103468e-05, "loss": 4.3212, "step": 1480 }, { "epoch": 2.062674094707521, "grad_norm": 4.38871431350708, "learning_rate": 5.6698408186469575e-05, "loss": 4.4653, "step": 1481 }, { "epoch": 2.064066852367688, "grad_norm": 5.867580413818359, "learning_rate": 5.6688459351904485e-05, "loss": 4.1974, "step": 1482 }, { "epoch": 2.065459610027855, "grad_norm": 4.391541481018066, "learning_rate": 5.667851051733939e-05, "loss": 4.0392, "step": 1483 }, { "epoch": 2.066852367688022, "grad_norm": 3.861218214035034, "learning_rate": 5.66685616827743e-05, "loss": 4.1293, "step": 1484 }, { "epoch": 2.0682451253481893, "grad_norm": 4.242783069610596, "learning_rate": 5.665861284820921e-05, "loss": 4.0947, "step": 1485 }, { "epoch": 2.0696378830083564, "grad_norm": 4.565762519836426, "learning_rate": 5.664866401364411e-05, "loss": 4.0485, "step": 1486 }, { "epoch": 2.0710306406685235, "grad_norm": 5.0193963050842285, "learning_rate": 5.663871517907902e-05, "loss": 3.8263, "step": 1487 }, { "epoch": 2.0724233983286906, "grad_norm": 3.990995168685913, "learning_rate": 5.6628766344513923e-05, "loss": 4.1442, "step": 1488 }, { "epoch": 2.073816155988858, "grad_norm": 4.26665735244751, "learning_rate": 5.6618817509948834e-05, "loss": 4.2591, "step": 1489 }, { "epoch": 2.0752089136490253, "grad_norm": 5.239255428314209, "learning_rate": 5.660886867538374e-05, "loss": 4.3352, "step": 1490 }, { "epoch": 2.0766016713091924, "grad_norm": 4.049388885498047, "learning_rate": 5.659891984081864e-05, "loss": 3.8159, "step": 1491 }, { "epoch": 2.0779944289693595, "grad_norm": 4.513409614562988, "learning_rate": 5.6588971006253546e-05, "loss": 3.9364, "step": 1492 }, { "epoch": 2.0793871866295266, "grad_norm": 5.870997428894043, "learning_rate": 5.657902217168846e-05, "loss": 4.2862, "step": 1493 }, { "epoch": 2.0807799442896937, "grad_norm": 4.047566890716553, "learning_rate": 5.656907333712336e-05, "loss": 4.6385, "step": 1494 }, { "epoch": 2.082172701949861, "grad_norm": 4.456325531005859, "learning_rate": 5.655912450255827e-05, "loss": 4.2397, "step": 1495 }, { "epoch": 2.083565459610028, "grad_norm": 3.933844804763794, "learning_rate": 5.654917566799317e-05, "loss": 4.3366, "step": 1496 }, { "epoch": 2.084958217270195, "grad_norm": 4.979091644287109, "learning_rate": 5.653922683342808e-05, "loss": 4.1172, "step": 1497 }, { "epoch": 2.086350974930362, "grad_norm": 4.2216668128967285, "learning_rate": 5.6529277998862984e-05, "loss": 3.8411, "step": 1498 }, { "epoch": 2.0877437325905293, "grad_norm": 4.793976783752441, "learning_rate": 5.6519329164297895e-05, "loss": 4.4086, "step": 1499 }, { "epoch": 2.0891364902506964, "grad_norm": 4.201914310455322, "learning_rate": 5.650938032973279e-05, "loss": 4.7244, "step": 1500 }, { "epoch": 2.0905292479108635, "grad_norm": 4.169383525848389, "learning_rate": 5.6499431495167703e-05, "loss": 4.1203, "step": 1501 }, { "epoch": 2.0919220055710306, "grad_norm": 4.854778289794922, "learning_rate": 5.648948266060261e-05, "loss": 4.4488, "step": 1502 }, { "epoch": 2.0933147632311977, "grad_norm": 3.9533584117889404, "learning_rate": 5.647953382603752e-05, "loss": 4.0845, "step": 1503 }, { "epoch": 2.094707520891365, "grad_norm": 4.016109943389893, "learning_rate": 5.646958499147242e-05, "loss": 4.1466, "step": 1504 }, { "epoch": 2.096100278551532, "grad_norm": 4.391005516052246, "learning_rate": 5.6459636156907327e-05, "loss": 4.1644, "step": 1505 }, { "epoch": 2.097493036211699, "grad_norm": 3.59956693649292, "learning_rate": 5.644968732234223e-05, "loss": 3.8031, "step": 1506 }, { "epoch": 2.098885793871866, "grad_norm": 4.031566143035889, "learning_rate": 5.643973848777714e-05, "loss": 4.1626, "step": 1507 }, { "epoch": 2.1002785515320332, "grad_norm": 4.36628532409668, "learning_rate": 5.6429789653212046e-05, "loss": 4.2109, "step": 1508 }, { "epoch": 2.1016713091922004, "grad_norm": 4.576984882354736, "learning_rate": 5.6419840818646956e-05, "loss": 4.2118, "step": 1509 }, { "epoch": 2.103064066852368, "grad_norm": 4.1856794357299805, "learning_rate": 5.6409891984081854e-05, "loss": 4.9522, "step": 1510 }, { "epoch": 2.104456824512535, "grad_norm": 5.25437593460083, "learning_rate": 5.6399943149516765e-05, "loss": 4.12, "step": 1511 }, { "epoch": 2.105849582172702, "grad_norm": 4.273394584655762, "learning_rate": 5.638999431495167e-05, "loss": 4.6421, "step": 1512 }, { "epoch": 2.1072423398328692, "grad_norm": 4.079535007476807, "learning_rate": 5.638004548038658e-05, "loss": 3.9771, "step": 1513 }, { "epoch": 2.1086350974930363, "grad_norm": 4.427313804626465, "learning_rate": 5.637009664582149e-05, "loss": 4.2386, "step": 1514 }, { "epoch": 2.1100278551532035, "grad_norm": 4.569276809692383, "learning_rate": 5.636014781125639e-05, "loss": 4.9382, "step": 1515 }, { "epoch": 2.1114206128133706, "grad_norm": 4.124020099639893, "learning_rate": 5.63501989766913e-05, "loss": 4.2301, "step": 1516 }, { "epoch": 2.1128133704735377, "grad_norm": 4.050805568695068, "learning_rate": 5.63402501421262e-05, "loss": 3.7955, "step": 1517 }, { "epoch": 2.114206128133705, "grad_norm": 4.00832462310791, "learning_rate": 5.6330301307561113e-05, "loss": 4.3878, "step": 1518 }, { "epoch": 2.115598885793872, "grad_norm": 3.9795923233032227, "learning_rate": 5.632035247299602e-05, "loss": 4.6169, "step": 1519 }, { "epoch": 2.116991643454039, "grad_norm": 4.110733985900879, "learning_rate": 5.631040363843092e-05, "loss": 4.2281, "step": 1520 }, { "epoch": 2.118384401114206, "grad_norm": 3.6762120723724365, "learning_rate": 5.6300454803865826e-05, "loss": 3.7926, "step": 1521 }, { "epoch": 2.1197771587743732, "grad_norm": 5.0108256340026855, "learning_rate": 5.6290505969300736e-05, "loss": 3.8425, "step": 1522 }, { "epoch": 2.1211699164345403, "grad_norm": 4.064860820770264, "learning_rate": 5.628055713473564e-05, "loss": 4.1895, "step": 1523 }, { "epoch": 2.1225626740947074, "grad_norm": 4.2516703605651855, "learning_rate": 5.627060830017055e-05, "loss": 4.0237, "step": 1524 }, { "epoch": 2.1239554317548746, "grad_norm": 3.8588855266571045, "learning_rate": 5.626065946560545e-05, "loss": 3.8886, "step": 1525 }, { "epoch": 2.1253481894150417, "grad_norm": 4.1031575202941895, "learning_rate": 5.625071063104036e-05, "loss": 4.1133, "step": 1526 }, { "epoch": 2.1267409470752088, "grad_norm": 4.086325168609619, "learning_rate": 5.6240761796475264e-05, "loss": 3.9396, "step": 1527 }, { "epoch": 2.128133704735376, "grad_norm": 4.331142902374268, "learning_rate": 5.6230812961910174e-05, "loss": 4.292, "step": 1528 }, { "epoch": 2.129526462395543, "grad_norm": 4.0734381675720215, "learning_rate": 5.622086412734508e-05, "loss": 4.1097, "step": 1529 }, { "epoch": 2.13091922005571, "grad_norm": 4.513016700744629, "learning_rate": 5.621091529277998e-05, "loss": 4.2115, "step": 1530 }, { "epoch": 2.1323119777158777, "grad_norm": 4.545360088348389, "learning_rate": 5.620096645821489e-05, "loss": 4.2847, "step": 1531 }, { "epoch": 2.1337047353760448, "grad_norm": 4.601300239562988, "learning_rate": 5.61910176236498e-05, "loss": 4.2504, "step": 1532 }, { "epoch": 2.135097493036212, "grad_norm": 4.783657550811768, "learning_rate": 5.61810687890847e-05, "loss": 4.0739, "step": 1533 }, { "epoch": 2.136490250696379, "grad_norm": 4.459740161895752, "learning_rate": 5.617111995451961e-05, "loss": 4.1157, "step": 1534 }, { "epoch": 2.137883008356546, "grad_norm": 4.170529842376709, "learning_rate": 5.616117111995451e-05, "loss": 3.8235, "step": 1535 }, { "epoch": 2.139275766016713, "grad_norm": 4.170540809631348, "learning_rate": 5.615122228538942e-05, "loss": 4.309, "step": 1536 }, { "epoch": 2.1406685236768803, "grad_norm": 3.9952211380004883, "learning_rate": 5.6141273450824325e-05, "loss": 4.7077, "step": 1537 }, { "epoch": 2.1420612813370474, "grad_norm": 5.134542465209961, "learning_rate": 5.6131324616259236e-05, "loss": 3.7039, "step": 1538 }, { "epoch": 2.1434540389972145, "grad_norm": 4.411221981048584, "learning_rate": 5.612137578169413e-05, "loss": 4.0268, "step": 1539 }, { "epoch": 2.1448467966573816, "grad_norm": 4.216711044311523, "learning_rate": 5.6111426947129044e-05, "loss": 4.642, "step": 1540 }, { "epoch": 2.1462395543175488, "grad_norm": 4.027458190917969, "learning_rate": 5.6101478112563955e-05, "loss": 4.0452, "step": 1541 }, { "epoch": 2.147632311977716, "grad_norm": 5.340506076812744, "learning_rate": 5.609152927799886e-05, "loss": 4.4872, "step": 1542 }, { "epoch": 2.149025069637883, "grad_norm": 4.38348388671875, "learning_rate": 5.608158044343377e-05, "loss": 4.3302, "step": 1543 }, { "epoch": 2.15041782729805, "grad_norm": 4.876546382904053, "learning_rate": 5.607163160886867e-05, "loss": 3.5804, "step": 1544 }, { "epoch": 2.151810584958217, "grad_norm": 4.479104995727539, "learning_rate": 5.606168277430358e-05, "loss": 3.9673, "step": 1545 }, { "epoch": 2.1532033426183843, "grad_norm": 4.716986656188965, "learning_rate": 5.605173393973848e-05, "loss": 4.1712, "step": 1546 }, { "epoch": 2.1545961002785514, "grad_norm": 4.21389102935791, "learning_rate": 5.604178510517339e-05, "loss": 4.393, "step": 1547 }, { "epoch": 2.1559888579387185, "grad_norm": 5.340983867645264, "learning_rate": 5.60318362706083e-05, "loss": 4.8122, "step": 1548 }, { "epoch": 2.1573816155988856, "grad_norm": 4.190916538238525, "learning_rate": 5.60218874360432e-05, "loss": 4.5195, "step": 1549 }, { "epoch": 2.1587743732590527, "grad_norm": 4.221656799316406, "learning_rate": 5.6011938601478105e-05, "loss": 4.0933, "step": 1550 }, { "epoch": 2.16016713091922, "grad_norm": 3.9489657878875732, "learning_rate": 5.6001989766913016e-05, "loss": 3.8152, "step": 1551 }, { "epoch": 2.1615598885793874, "grad_norm": 4.408094882965088, "learning_rate": 5.599204093234792e-05, "loss": 3.9514, "step": 1552 }, { "epoch": 2.1629526462395545, "grad_norm": 4.304285049438477, "learning_rate": 5.598209209778283e-05, "loss": 4.5502, "step": 1553 }, { "epoch": 2.1643454038997216, "grad_norm": 4.349761962890625, "learning_rate": 5.597214326321773e-05, "loss": 3.9968, "step": 1554 }, { "epoch": 2.1657381615598887, "grad_norm": 4.542641639709473, "learning_rate": 5.596219442865264e-05, "loss": 3.821, "step": 1555 }, { "epoch": 2.167130919220056, "grad_norm": 3.8207194805145264, "learning_rate": 5.595224559408754e-05, "loss": 3.6482, "step": 1556 }, { "epoch": 2.168523676880223, "grad_norm": 4.042953014373779, "learning_rate": 5.5942296759522454e-05, "loss": 4.6605, "step": 1557 }, { "epoch": 2.16991643454039, "grad_norm": 4.262577533721924, "learning_rate": 5.593234792495736e-05, "loss": 4.587, "step": 1558 }, { "epoch": 2.171309192200557, "grad_norm": 4.657623291015625, "learning_rate": 5.592239909039226e-05, "loss": 4.4044, "step": 1559 }, { "epoch": 2.1727019498607243, "grad_norm": 4.761733531951904, "learning_rate": 5.5912450255827166e-05, "loss": 4.3288, "step": 1560 }, { "epoch": 2.1740947075208914, "grad_norm": 4.737750053405762, "learning_rate": 5.590250142126208e-05, "loss": 4.235, "step": 1561 }, { "epoch": 2.1754874651810585, "grad_norm": 4.017462253570557, "learning_rate": 5.589255258669698e-05, "loss": 4.2454, "step": 1562 }, { "epoch": 2.1768802228412256, "grad_norm": 4.1256585121154785, "learning_rate": 5.588260375213189e-05, "loss": 4.2497, "step": 1563 }, { "epoch": 2.1782729805013927, "grad_norm": 8.097579002380371, "learning_rate": 5.587265491756679e-05, "loss": 4.1784, "step": 1564 }, { "epoch": 2.17966573816156, "grad_norm": 4.777957916259766, "learning_rate": 5.58627060830017e-05, "loss": 4.0151, "step": 1565 }, { "epoch": 2.181058495821727, "grad_norm": 4.157647609710693, "learning_rate": 5.5852757248436604e-05, "loss": 3.9234, "step": 1566 }, { "epoch": 2.182451253481894, "grad_norm": 3.7936336994171143, "learning_rate": 5.5842808413871515e-05, "loss": 4.0243, "step": 1567 }, { "epoch": 2.183844011142061, "grad_norm": 5.389497756958008, "learning_rate": 5.5832859579306426e-05, "loss": 4.4381, "step": 1568 }, { "epoch": 2.1852367688022283, "grad_norm": 7.03663444519043, "learning_rate": 5.582291074474132e-05, "loss": 4.0193, "step": 1569 }, { "epoch": 2.1866295264623954, "grad_norm": 4.235686779022217, "learning_rate": 5.5812961910176234e-05, "loss": 4.0826, "step": 1570 }, { "epoch": 2.1880222841225625, "grad_norm": 4.34411096572876, "learning_rate": 5.580301307561114e-05, "loss": 4.3301, "step": 1571 }, { "epoch": 2.1894150417827296, "grad_norm": 4.225539684295654, "learning_rate": 5.579306424104605e-05, "loss": 4.5005, "step": 1572 }, { "epoch": 2.190807799442897, "grad_norm": 4.322992324829102, "learning_rate": 5.578311540648095e-05, "loss": 4.3125, "step": 1573 }, { "epoch": 2.1922005571030643, "grad_norm": 3.9500017166137695, "learning_rate": 5.577316657191586e-05, "loss": 3.9417, "step": 1574 }, { "epoch": 2.1935933147632314, "grad_norm": 3.9060707092285156, "learning_rate": 5.576321773735076e-05, "loss": 4.125, "step": 1575 }, { "epoch": 2.1949860724233985, "grad_norm": 4.2026286125183105, "learning_rate": 5.575326890278567e-05, "loss": 3.6776, "step": 1576 }, { "epoch": 2.1963788300835656, "grad_norm": 4.400326251983643, "learning_rate": 5.5743320068220576e-05, "loss": 4.4655, "step": 1577 }, { "epoch": 2.1977715877437327, "grad_norm": 5.644049644470215, "learning_rate": 5.573337123365549e-05, "loss": 4.0425, "step": 1578 }, { "epoch": 2.1991643454039, "grad_norm": 6.8138251304626465, "learning_rate": 5.5723422399090384e-05, "loss": 3.9512, "step": 1579 }, { "epoch": 2.200557103064067, "grad_norm": 5.015941619873047, "learning_rate": 5.5713473564525295e-05, "loss": 4.0271, "step": 1580 }, { "epoch": 2.201949860724234, "grad_norm": 4.783520698547363, "learning_rate": 5.57035247299602e-05, "loss": 4.2316, "step": 1581 }, { "epoch": 2.203342618384401, "grad_norm": 4.280081748962402, "learning_rate": 5.569357589539511e-05, "loss": 4.0603, "step": 1582 }, { "epoch": 2.2047353760445683, "grad_norm": 5.039388656616211, "learning_rate": 5.568362706083001e-05, "loss": 3.9209, "step": 1583 }, { "epoch": 2.2061281337047354, "grad_norm": 4.157800197601318, "learning_rate": 5.567367822626492e-05, "loss": 3.6996, "step": 1584 }, { "epoch": 2.2075208913649025, "grad_norm": 4.173427581787109, "learning_rate": 5.566372939169982e-05, "loss": 4.289, "step": 1585 }, { "epoch": 2.2089136490250696, "grad_norm": 4.1454267501831055, "learning_rate": 5.565378055713473e-05, "loss": 3.8151, "step": 1586 }, { "epoch": 2.2103064066852367, "grad_norm": 4.3231401443481445, "learning_rate": 5.564383172256964e-05, "loss": 4.2357, "step": 1587 }, { "epoch": 2.211699164345404, "grad_norm": 4.394160270690918, "learning_rate": 5.563388288800454e-05, "loss": 3.8161, "step": 1588 }, { "epoch": 2.213091922005571, "grad_norm": 5.923463344573975, "learning_rate": 5.5623934053439445e-05, "loss": 4.2231, "step": 1589 }, { "epoch": 2.214484679665738, "grad_norm": 4.116878032684326, "learning_rate": 5.5613985218874356e-05, "loss": 4.1096, "step": 1590 }, { "epoch": 2.215877437325905, "grad_norm": 4.993494033813477, "learning_rate": 5.560403638430926e-05, "loss": 4.1901, "step": 1591 }, { "epoch": 2.2172701949860723, "grad_norm": 4.756366729736328, "learning_rate": 5.559408754974417e-05, "loss": 4.1018, "step": 1592 }, { "epoch": 2.2186629526462394, "grad_norm": 4.143631935119629, "learning_rate": 5.558413871517907e-05, "loss": 4.0226, "step": 1593 }, { "epoch": 2.220055710306407, "grad_norm": 4.060478210449219, "learning_rate": 5.557418988061398e-05, "loss": 4.1714, "step": 1594 }, { "epoch": 2.2214484679665736, "grad_norm": 5.196919918060303, "learning_rate": 5.556424104604888e-05, "loss": 3.6259, "step": 1595 }, { "epoch": 2.222841225626741, "grad_norm": 4.83811616897583, "learning_rate": 5.5554292211483794e-05, "loss": 4.1439, "step": 1596 }, { "epoch": 2.2242339832869082, "grad_norm": 4.05018949508667, "learning_rate": 5.5544343376918705e-05, "loss": 3.5583, "step": 1597 }, { "epoch": 2.2256267409470754, "grad_norm": 4.4459638595581055, "learning_rate": 5.55343945423536e-05, "loss": 4.5046, "step": 1598 }, { "epoch": 2.2270194986072425, "grad_norm": 4.6933417320251465, "learning_rate": 5.552444570778851e-05, "loss": 4.2784, "step": 1599 }, { "epoch": 2.2284122562674096, "grad_norm": 4.105106830596924, "learning_rate": 5.551449687322342e-05, "loss": 4.1781, "step": 1600 }, { "epoch": 2.2298050139275767, "grad_norm": 4.288883209228516, "learning_rate": 5.550454803865833e-05, "loss": 4.2866, "step": 1601 }, { "epoch": 2.231197771587744, "grad_norm": 5.926295757293701, "learning_rate": 5.549459920409323e-05, "loss": 4.2175, "step": 1602 }, { "epoch": 2.232590529247911, "grad_norm": 3.661668539047241, "learning_rate": 5.5484650369528136e-05, "loss": 4.0713, "step": 1603 }, { "epoch": 2.233983286908078, "grad_norm": 3.785322904586792, "learning_rate": 5.547470153496304e-05, "loss": 4.1502, "step": 1604 }, { "epoch": 2.235376044568245, "grad_norm": 4.111382007598877, "learning_rate": 5.546475270039795e-05, "loss": 3.8881, "step": 1605 }, { "epoch": 2.2367688022284122, "grad_norm": 4.228845596313477, "learning_rate": 5.5454803865832855e-05, "loss": 3.7675, "step": 1606 }, { "epoch": 2.2381615598885793, "grad_norm": 3.797921895980835, "learning_rate": 5.5444855031267766e-05, "loss": 4.4189, "step": 1607 }, { "epoch": 2.2395543175487465, "grad_norm": 3.7675411701202393, "learning_rate": 5.543490619670266e-05, "loss": 3.5243, "step": 1608 }, { "epoch": 2.2409470752089136, "grad_norm": 4.016136646270752, "learning_rate": 5.5424957362137574e-05, "loss": 3.9185, "step": 1609 }, { "epoch": 2.2423398328690807, "grad_norm": 3.8654441833496094, "learning_rate": 5.541500852757248e-05, "loss": 3.8333, "step": 1610 }, { "epoch": 2.243732590529248, "grad_norm": 4.411679744720459, "learning_rate": 5.540505969300739e-05, "loss": 3.6448, "step": 1611 }, { "epoch": 2.245125348189415, "grad_norm": 4.223076820373535, "learning_rate": 5.539511085844229e-05, "loss": 4.1067, "step": 1612 }, { "epoch": 2.246518105849582, "grad_norm": 3.976116895675659, "learning_rate": 5.53851620238772e-05, "loss": 3.9666, "step": 1613 }, { "epoch": 2.247910863509749, "grad_norm": 4.159890174865723, "learning_rate": 5.53752131893121e-05, "loss": 4.2775, "step": 1614 }, { "epoch": 2.2493036211699167, "grad_norm": 4.389676570892334, "learning_rate": 5.536526435474701e-05, "loss": 4.1369, "step": 1615 }, { "epoch": 2.2506963788300833, "grad_norm": 5.163928985595703, "learning_rate": 5.5355315520181916e-05, "loss": 4.3889, "step": 1616 }, { "epoch": 2.252089136490251, "grad_norm": 4.48787784576416, "learning_rate": 5.534536668561683e-05, "loss": 4.2583, "step": 1617 }, { "epoch": 2.253481894150418, "grad_norm": 4.052666187286377, "learning_rate": 5.5335417851051724e-05, "loss": 4.0006, "step": 1618 }, { "epoch": 2.254874651810585, "grad_norm": 4.186406135559082, "learning_rate": 5.5325469016486635e-05, "loss": 4.1602, "step": 1619 }, { "epoch": 2.256267409470752, "grad_norm": 7.1013383865356445, "learning_rate": 5.531552018192154e-05, "loss": 4.0932, "step": 1620 }, { "epoch": 2.2576601671309193, "grad_norm": 3.8866419792175293, "learning_rate": 5.530557134735645e-05, "loss": 3.7892, "step": 1621 }, { "epoch": 2.2590529247910864, "grad_norm": 3.7705419063568115, "learning_rate": 5.529562251279136e-05, "loss": 3.7626, "step": 1622 }, { "epoch": 2.2604456824512535, "grad_norm": 3.958216428756714, "learning_rate": 5.528567367822626e-05, "loss": 4.2034, "step": 1623 }, { "epoch": 2.2618384401114207, "grad_norm": 4.2496771812438965, "learning_rate": 5.527572484366116e-05, "loss": 4.0623, "step": 1624 }, { "epoch": 2.2632311977715878, "grad_norm": 4.305548191070557, "learning_rate": 5.526577600909607e-05, "loss": 4.2394, "step": 1625 }, { "epoch": 2.264623955431755, "grad_norm": 4.462040901184082, "learning_rate": 5.5255827174530984e-05, "loss": 4.3093, "step": 1626 }, { "epoch": 2.266016713091922, "grad_norm": 4.265384197235107, "learning_rate": 5.524587833996588e-05, "loss": 3.8959, "step": 1627 }, { "epoch": 2.267409470752089, "grad_norm": 3.958977222442627, "learning_rate": 5.523592950540079e-05, "loss": 3.7723, "step": 1628 }, { "epoch": 2.268802228412256, "grad_norm": 4.284531116485596, "learning_rate": 5.5225980670835696e-05, "loss": 3.8134, "step": 1629 }, { "epoch": 2.2701949860724233, "grad_norm": 4.096482753753662, "learning_rate": 5.521603183627061e-05, "loss": 4.0498, "step": 1630 }, { "epoch": 2.2715877437325904, "grad_norm": 4.294747829437256, "learning_rate": 5.520608300170551e-05, "loss": 3.9642, "step": 1631 }, { "epoch": 2.2729805013927575, "grad_norm": 4.197109699249268, "learning_rate": 5.5196134167140415e-05, "loss": 4.0461, "step": 1632 }, { "epoch": 2.2743732590529246, "grad_norm": 3.998181104660034, "learning_rate": 5.518618533257532e-05, "loss": 4.4055, "step": 1633 }, { "epoch": 2.2757660167130918, "grad_norm": 4.521604061126709, "learning_rate": 5.517623649801023e-05, "loss": 3.6582, "step": 1634 }, { "epoch": 2.277158774373259, "grad_norm": 4.392470359802246, "learning_rate": 5.5166287663445134e-05, "loss": 4.3384, "step": 1635 }, { "epoch": 2.2785515320334264, "grad_norm": 4.117228984832764, "learning_rate": 5.5156338828880045e-05, "loss": 3.6421, "step": 1636 }, { "epoch": 2.279944289693593, "grad_norm": 4.706968784332275, "learning_rate": 5.514638999431494e-05, "loss": 4.5145, "step": 1637 }, { "epoch": 2.2813370473537606, "grad_norm": 4.281352519989014, "learning_rate": 5.513644115974985e-05, "loss": 3.9539, "step": 1638 }, { "epoch": 2.2827298050139277, "grad_norm": 4.370902061462402, "learning_rate": 5.512649232518476e-05, "loss": 3.755, "step": 1639 }, { "epoch": 2.284122562674095, "grad_norm": 4.418724060058594, "learning_rate": 5.511654349061967e-05, "loss": 3.9502, "step": 1640 }, { "epoch": 2.285515320334262, "grad_norm": 4.078126907348633, "learning_rate": 5.510659465605457e-05, "loss": 3.6778, "step": 1641 }, { "epoch": 2.286908077994429, "grad_norm": 4.293686389923096, "learning_rate": 5.5096645821489476e-05, "loss": 3.4956, "step": 1642 }, { "epoch": 2.288300835654596, "grad_norm": 5.51296854019165, "learning_rate": 5.508669698692438e-05, "loss": 4.2429, "step": 1643 }, { "epoch": 2.2896935933147633, "grad_norm": 4.143667221069336, "learning_rate": 5.507674815235929e-05, "loss": 4.142, "step": 1644 }, { "epoch": 2.2910863509749304, "grad_norm": 3.7358782291412354, "learning_rate": 5.5066799317794195e-05, "loss": 3.9213, "step": 1645 }, { "epoch": 2.2924791086350975, "grad_norm": 4.326070785522461, "learning_rate": 5.5056850483229106e-05, "loss": 3.5167, "step": 1646 }, { "epoch": 2.2938718662952646, "grad_norm": 4.527500629425049, "learning_rate": 5.5046901648664e-05, "loss": 3.7828, "step": 1647 }, { "epoch": 2.2952646239554317, "grad_norm": 4.36574649810791, "learning_rate": 5.5036952814098914e-05, "loss": 4.2175, "step": 1648 }, { "epoch": 2.296657381615599, "grad_norm": 3.914255142211914, "learning_rate": 5.502700397953382e-05, "loss": 3.7232, "step": 1649 }, { "epoch": 2.298050139275766, "grad_norm": 4.246000289916992, "learning_rate": 5.501705514496873e-05, "loss": 4.3355, "step": 1650 }, { "epoch": 2.299442896935933, "grad_norm": 5.617733955383301, "learning_rate": 5.500710631040364e-05, "loss": 3.876, "step": 1651 }, { "epoch": 2.3008356545961, "grad_norm": 3.8478708267211914, "learning_rate": 5.499715747583854e-05, "loss": 4.0928, "step": 1652 }, { "epoch": 2.3022284122562673, "grad_norm": 4.152899742126465, "learning_rate": 5.498720864127344e-05, "loss": 3.7558, "step": 1653 }, { "epoch": 2.3036211699164344, "grad_norm": 4.24927282333374, "learning_rate": 5.497725980670835e-05, "loss": 4.2481, "step": 1654 }, { "epoch": 2.3050139275766015, "grad_norm": 3.5137290954589844, "learning_rate": 5.496731097214326e-05, "loss": 3.8982, "step": 1655 }, { "epoch": 2.3064066852367686, "grad_norm": 3.882204294204712, "learning_rate": 5.495736213757817e-05, "loss": 3.545, "step": 1656 }, { "epoch": 2.307799442896936, "grad_norm": 4.548402786254883, "learning_rate": 5.494741330301307e-05, "loss": 4.0283, "step": 1657 }, { "epoch": 2.309192200557103, "grad_norm": 3.8759939670562744, "learning_rate": 5.4937464468447975e-05, "loss": 4.1759, "step": 1658 }, { "epoch": 2.3105849582172704, "grad_norm": 4.622664928436279, "learning_rate": 5.4927515633882886e-05, "loss": 4.1117, "step": 1659 }, { "epoch": 2.3119777158774375, "grad_norm": 4.528932571411133, "learning_rate": 5.491756679931779e-05, "loss": 3.9803, "step": 1660 }, { "epoch": 2.3133704735376046, "grad_norm": 3.8661184310913086, "learning_rate": 5.49076179647527e-05, "loss": 4.0703, "step": 1661 }, { "epoch": 2.3147632311977717, "grad_norm": 4.365589618682861, "learning_rate": 5.48976691301876e-05, "loss": 4.349, "step": 1662 }, { "epoch": 2.316155988857939, "grad_norm": 4.564374923706055, "learning_rate": 5.488772029562251e-05, "loss": 3.8839, "step": 1663 }, { "epoch": 2.317548746518106, "grad_norm": 3.963067054748535, "learning_rate": 5.487777146105741e-05, "loss": 3.8955, "step": 1664 }, { "epoch": 2.318941504178273, "grad_norm": 4.266085147857666, "learning_rate": 5.4867822626492324e-05, "loss": 3.9313, "step": 1665 }, { "epoch": 2.32033426183844, "grad_norm": 4.137028217315674, "learning_rate": 5.485787379192723e-05, "loss": 3.6683, "step": 1666 }, { "epoch": 2.3217270194986073, "grad_norm": 3.8003323078155518, "learning_rate": 5.484792495736213e-05, "loss": 4.1103, "step": 1667 }, { "epoch": 2.3231197771587744, "grad_norm": 4.106907844543457, "learning_rate": 5.4837976122797036e-05, "loss": 3.9397, "step": 1668 }, { "epoch": 2.3245125348189415, "grad_norm": 4.142886638641357, "learning_rate": 5.482802728823195e-05, "loss": 4.1045, "step": 1669 }, { "epoch": 2.3259052924791086, "grad_norm": 4.56312370300293, "learning_rate": 5.481807845366685e-05, "loss": 3.7163, "step": 1670 }, { "epoch": 2.3272980501392757, "grad_norm": 4.902716159820557, "learning_rate": 5.4808129619101755e-05, "loss": 3.5701, "step": 1671 }, { "epoch": 2.328690807799443, "grad_norm": 4.238404273986816, "learning_rate": 5.479818078453666e-05, "loss": 4.078, "step": 1672 }, { "epoch": 2.33008356545961, "grad_norm": 5.5242133140563965, "learning_rate": 5.478823194997157e-05, "loss": 3.8578, "step": 1673 }, { "epoch": 2.331476323119777, "grad_norm": 5.297257900238037, "learning_rate": 5.4778283115406474e-05, "loss": 3.6976, "step": 1674 }, { "epoch": 2.332869080779944, "grad_norm": 4.090051651000977, "learning_rate": 5.4768334280841385e-05, "loss": 3.7153, "step": 1675 }, { "epoch": 2.3342618384401113, "grad_norm": 4.634399890899658, "learning_rate": 5.475838544627628e-05, "loss": 3.9084, "step": 1676 }, { "epoch": 2.3356545961002784, "grad_norm": 4.183865070343018, "learning_rate": 5.474843661171119e-05, "loss": 3.9903, "step": 1677 }, { "epoch": 2.337047353760446, "grad_norm": 4.400298118591309, "learning_rate": 5.47384877771461e-05, "loss": 3.9344, "step": 1678 }, { "epoch": 2.3384401114206126, "grad_norm": 4.828468322753906, "learning_rate": 5.472853894258101e-05, "loss": 4.3338, "step": 1679 }, { "epoch": 2.33983286908078, "grad_norm": 4.010335922241211, "learning_rate": 5.471859010801592e-05, "loss": 3.8026, "step": 1680 }, { "epoch": 2.3412256267409473, "grad_norm": 4.217126846313477, "learning_rate": 5.4708641273450816e-05, "loss": 4.0941, "step": 1681 }, { "epoch": 2.3426183844011144, "grad_norm": 11.835099220275879, "learning_rate": 5.469869243888573e-05, "loss": 4.6217, "step": 1682 }, { "epoch": 2.3440111420612815, "grad_norm": 4.50076961517334, "learning_rate": 5.468874360432063e-05, "loss": 3.9282, "step": 1683 }, { "epoch": 2.3454038997214486, "grad_norm": 4.27176570892334, "learning_rate": 5.467879476975554e-05, "loss": 3.771, "step": 1684 }, { "epoch": 2.3467966573816157, "grad_norm": 3.9045350551605225, "learning_rate": 5.4668845935190446e-05, "loss": 4.0347, "step": 1685 }, { "epoch": 2.348189415041783, "grad_norm": 4.14553165435791, "learning_rate": 5.465889710062535e-05, "loss": 3.8646, "step": 1686 }, { "epoch": 2.34958217270195, "grad_norm": 3.7285969257354736, "learning_rate": 5.4648948266060254e-05, "loss": 4.1016, "step": 1687 }, { "epoch": 2.350974930362117, "grad_norm": 3.9861230850219727, "learning_rate": 5.4638999431495165e-05, "loss": 3.8058, "step": 1688 }, { "epoch": 2.352367688022284, "grad_norm": 4.064573287963867, "learning_rate": 5.462905059693007e-05, "loss": 4.2472, "step": 1689 }, { "epoch": 2.3537604456824512, "grad_norm": 6.249232769012451, "learning_rate": 5.461910176236498e-05, "loss": 3.7758, "step": 1690 }, { "epoch": 2.3551532033426184, "grad_norm": 4.630408763885498, "learning_rate": 5.460915292779988e-05, "loss": 4.0598, "step": 1691 }, { "epoch": 2.3565459610027855, "grad_norm": 4.128727912902832, "learning_rate": 5.459920409323479e-05, "loss": 3.7818, "step": 1692 }, { "epoch": 2.3579387186629526, "grad_norm": 4.089692115783691, "learning_rate": 5.458925525866969e-05, "loss": 3.6243, "step": 1693 }, { "epoch": 2.3593314763231197, "grad_norm": 4.420183181762695, "learning_rate": 5.45793064241046e-05, "loss": 4.1857, "step": 1694 }, { "epoch": 2.360724233983287, "grad_norm": 12.535345077514648, "learning_rate": 5.456935758953951e-05, "loss": 4.0737, "step": 1695 }, { "epoch": 2.362116991643454, "grad_norm": 4.6984076499938965, "learning_rate": 5.455940875497441e-05, "loss": 3.5673, "step": 1696 }, { "epoch": 2.363509749303621, "grad_norm": 4.740118503570557, "learning_rate": 5.4549459920409316e-05, "loss": 4.0019, "step": 1697 }, { "epoch": 2.364902506963788, "grad_norm": 4.562565326690674, "learning_rate": 5.4539511085844226e-05, "loss": 3.9198, "step": 1698 }, { "epoch": 2.3662952646239557, "grad_norm": 4.111634254455566, "learning_rate": 5.452956225127913e-05, "loss": 3.8422, "step": 1699 }, { "epoch": 2.3676880222841223, "grad_norm": 4.0607171058654785, "learning_rate": 5.451961341671404e-05, "loss": 4.0549, "step": 1700 }, { "epoch": 2.36908077994429, "grad_norm": 5.0293378829956055, "learning_rate": 5.450966458214894e-05, "loss": 4.3338, "step": 1701 }, { "epoch": 2.370473537604457, "grad_norm": 4.337513446807861, "learning_rate": 5.449971574758385e-05, "loss": 4.1525, "step": 1702 }, { "epoch": 2.371866295264624, "grad_norm": 4.8907880783081055, "learning_rate": 5.4489766913018754e-05, "loss": 3.6912, "step": 1703 }, { "epoch": 2.3732590529247912, "grad_norm": 4.43142032623291, "learning_rate": 5.4479818078453664e-05, "loss": 3.9502, "step": 1704 }, { "epoch": 2.3746518105849583, "grad_norm": 7.17999792098999, "learning_rate": 5.4469869243888575e-05, "loss": 3.9594, "step": 1705 }, { "epoch": 2.3760445682451254, "grad_norm": 4.692847728729248, "learning_rate": 5.445992040932347e-05, "loss": 3.913, "step": 1706 }, { "epoch": 2.3774373259052926, "grad_norm": 4.352915287017822, "learning_rate": 5.4449971574758377e-05, "loss": 3.8952, "step": 1707 }, { "epoch": 2.3788300835654597, "grad_norm": 4.265293598175049, "learning_rate": 5.444002274019329e-05, "loss": 3.4984, "step": 1708 }, { "epoch": 2.3802228412256268, "grad_norm": 4.7866668701171875, "learning_rate": 5.44300739056282e-05, "loss": 3.5351, "step": 1709 }, { "epoch": 2.381615598885794, "grad_norm": 4.605996608734131, "learning_rate": 5.4420125071063096e-05, "loss": 4.3709, "step": 1710 }, { "epoch": 2.383008356545961, "grad_norm": 4.188661098480225, "learning_rate": 5.4410176236498006e-05, "loss": 3.954, "step": 1711 }, { "epoch": 2.384401114206128, "grad_norm": 5.439994812011719, "learning_rate": 5.440022740193291e-05, "loss": 4.3375, "step": 1712 }, { "epoch": 2.385793871866295, "grad_norm": 4.298741817474365, "learning_rate": 5.439027856736782e-05, "loss": 3.3614, "step": 1713 }, { "epoch": 2.3871866295264623, "grad_norm": 3.844583511352539, "learning_rate": 5.4380329732802725e-05, "loss": 4.1435, "step": 1714 }, { "epoch": 2.3885793871866294, "grad_norm": 4.180552959442139, "learning_rate": 5.437038089823763e-05, "loss": 3.3169, "step": 1715 }, { "epoch": 2.3899721448467965, "grad_norm": 5.350617408752441, "learning_rate": 5.4360432063672534e-05, "loss": 3.6092, "step": 1716 }, { "epoch": 2.3913649025069637, "grad_norm": 4.727535724639893, "learning_rate": 5.4350483229107444e-05, "loss": 3.7643, "step": 1717 }, { "epoch": 2.3927576601671308, "grad_norm": 4.809422492980957, "learning_rate": 5.434053439454235e-05, "loss": 4.3711, "step": 1718 }, { "epoch": 2.394150417827298, "grad_norm": 4.07980489730835, "learning_rate": 5.433058555997726e-05, "loss": 3.738, "step": 1719 }, { "epoch": 2.3955431754874654, "grad_norm": 3.852039098739624, "learning_rate": 5.432063672541216e-05, "loss": 3.6097, "step": 1720 }, { "epoch": 2.396935933147632, "grad_norm": 5.284004211425781, "learning_rate": 5.431068789084707e-05, "loss": 3.5813, "step": 1721 }, { "epoch": 2.3983286908077996, "grad_norm": 4.828900337219238, "learning_rate": 5.430073905628197e-05, "loss": 3.8873, "step": 1722 }, { "epoch": 2.3997214484679668, "grad_norm": 3.7450292110443115, "learning_rate": 5.429079022171688e-05, "loss": 4.0325, "step": 1723 }, { "epoch": 2.401114206128134, "grad_norm": 4.9826765060424805, "learning_rate": 5.4280841387151787e-05, "loss": 3.7105, "step": 1724 }, { "epoch": 2.402506963788301, "grad_norm": 4.0178141593933105, "learning_rate": 5.427089255258669e-05, "loss": 3.2191, "step": 1725 }, { "epoch": 2.403899721448468, "grad_norm": 4.679317951202393, "learning_rate": 5.4260943718021595e-05, "loss": 3.6095, "step": 1726 }, { "epoch": 2.405292479108635, "grad_norm": 5.885893821716309, "learning_rate": 5.4250994883456506e-05, "loss": 3.7421, "step": 1727 }, { "epoch": 2.4066852367688023, "grad_norm": 4.902598857879639, "learning_rate": 5.424104604889141e-05, "loss": 3.624, "step": 1728 }, { "epoch": 2.4080779944289694, "grad_norm": 3.8110177516937256, "learning_rate": 5.423109721432632e-05, "loss": 3.6195, "step": 1729 }, { "epoch": 2.4094707520891365, "grad_norm": 4.584103584289551, "learning_rate": 5.422114837976122e-05, "loss": 3.7773, "step": 1730 }, { "epoch": 2.4108635097493036, "grad_norm": 4.206098556518555, "learning_rate": 5.421119954519613e-05, "loss": 4.3632, "step": 1731 }, { "epoch": 2.4122562674094707, "grad_norm": 4.200994491577148, "learning_rate": 5.420125071063103e-05, "loss": 3.4025, "step": 1732 }, { "epoch": 2.413649025069638, "grad_norm": 4.245271682739258, "learning_rate": 5.4191301876065944e-05, "loss": 3.7452, "step": 1733 }, { "epoch": 2.415041782729805, "grad_norm": 4.359995365142822, "learning_rate": 5.4181353041500854e-05, "loss": 4.0386, "step": 1734 }, { "epoch": 2.416434540389972, "grad_norm": 4.1396484375, "learning_rate": 5.417140420693575e-05, "loss": 4.4054, "step": 1735 }, { "epoch": 2.417827298050139, "grad_norm": 3.848989248275757, "learning_rate": 5.4161455372370656e-05, "loss": 3.7372, "step": 1736 }, { "epoch": 2.4192200557103063, "grad_norm": 3.9558351039886475, "learning_rate": 5.4151506537805567e-05, "loss": 3.6096, "step": 1737 }, { "epoch": 2.4206128133704734, "grad_norm": 4.3685526847839355, "learning_rate": 5.414155770324048e-05, "loss": 4.0534, "step": 1738 }, { "epoch": 2.4220055710306405, "grad_norm": 4.487792491912842, "learning_rate": 5.413160886867538e-05, "loss": 3.5896, "step": 1739 }, { "epoch": 2.4233983286908076, "grad_norm": 4.340232849121094, "learning_rate": 5.4121660034110286e-05, "loss": 3.6079, "step": 1740 }, { "epoch": 2.424791086350975, "grad_norm": 4.081694602966309, "learning_rate": 5.411171119954519e-05, "loss": 3.8478, "step": 1741 }, { "epoch": 2.426183844011142, "grad_norm": 4.9409708976745605, "learning_rate": 5.41017623649801e-05, "loss": 3.682, "step": 1742 }, { "epoch": 2.4275766016713094, "grad_norm": 4.346573352813721, "learning_rate": 5.4091813530415005e-05, "loss": 3.8987, "step": 1743 }, { "epoch": 2.4289693593314765, "grad_norm": 4.81184720993042, "learning_rate": 5.4081864695849915e-05, "loss": 3.8868, "step": 1744 }, { "epoch": 2.4303621169916436, "grad_norm": 3.792619466781616, "learning_rate": 5.407191586128481e-05, "loss": 3.4401, "step": 1745 }, { "epoch": 2.4317548746518107, "grad_norm": 5.124619483947754, "learning_rate": 5.4061967026719724e-05, "loss": 4.2587, "step": 1746 }, { "epoch": 2.433147632311978, "grad_norm": 3.9748077392578125, "learning_rate": 5.405201819215463e-05, "loss": 3.7289, "step": 1747 }, { "epoch": 2.434540389972145, "grad_norm": 4.482245922088623, "learning_rate": 5.404206935758954e-05, "loss": 3.7506, "step": 1748 }, { "epoch": 2.435933147632312, "grad_norm": 4.309085845947266, "learning_rate": 5.403212052302444e-05, "loss": 3.3056, "step": 1749 }, { "epoch": 2.437325905292479, "grad_norm": 4.857260227203369, "learning_rate": 5.402217168845935e-05, "loss": 3.9873, "step": 1750 }, { "epoch": 2.4387186629526463, "grad_norm": 4.169651985168457, "learning_rate": 5.401222285389425e-05, "loss": 3.5028, "step": 1751 }, { "epoch": 2.4401114206128134, "grad_norm": 4.409386157989502, "learning_rate": 5.400227401932916e-05, "loss": 4.1041, "step": 1752 }, { "epoch": 2.4415041782729805, "grad_norm": 4.25468111038208, "learning_rate": 5.3992325184764066e-05, "loss": 3.7783, "step": 1753 }, { "epoch": 2.4428969359331476, "grad_norm": 4.455057144165039, "learning_rate": 5.398237635019897e-05, "loss": 3.7907, "step": 1754 }, { "epoch": 2.4442896935933147, "grad_norm": 4.924750328063965, "learning_rate": 5.3972427515633874e-05, "loss": 4.0321, "step": 1755 }, { "epoch": 2.445682451253482, "grad_norm": 4.541840076446533, "learning_rate": 5.3962478681068785e-05, "loss": 4.1903, "step": 1756 }, { "epoch": 2.447075208913649, "grad_norm": 5.0196614265441895, "learning_rate": 5.395252984650369e-05, "loss": 3.9073, "step": 1757 }, { "epoch": 2.448467966573816, "grad_norm": 5.129353046417236, "learning_rate": 5.39425810119386e-05, "loss": 3.5858, "step": 1758 }, { "epoch": 2.449860724233983, "grad_norm": 4.817824363708496, "learning_rate": 5.39326321773735e-05, "loss": 4.1497, "step": 1759 }, { "epoch": 2.4512534818941503, "grad_norm": 3.773118734359741, "learning_rate": 5.392268334280841e-05, "loss": 3.655, "step": 1760 }, { "epoch": 2.4526462395543174, "grad_norm": 4.434882164001465, "learning_rate": 5.391273450824331e-05, "loss": 3.9889, "step": 1761 }, { "epoch": 2.4540389972144845, "grad_norm": 3.593027114868164, "learning_rate": 5.390278567367822e-05, "loss": 3.4343, "step": 1762 }, { "epoch": 2.4554317548746516, "grad_norm": 7.124451637268066, "learning_rate": 5.3892836839113134e-05, "loss": 3.7918, "step": 1763 }, { "epoch": 2.456824512534819, "grad_norm": 5.207165241241455, "learning_rate": 5.388288800454803e-05, "loss": 3.9079, "step": 1764 }, { "epoch": 2.4582172701949863, "grad_norm": 4.422651290893555, "learning_rate": 5.3872939169982935e-05, "loss": 4.0438, "step": 1765 }, { "epoch": 2.4596100278551534, "grad_norm": 4.428305625915527, "learning_rate": 5.3862990335417846e-05, "loss": 3.8374, "step": 1766 }, { "epoch": 2.4610027855153205, "grad_norm": 4.756076812744141, "learning_rate": 5.3853041500852757e-05, "loss": 3.7995, "step": 1767 }, { "epoch": 2.4623955431754876, "grad_norm": 4.255223274230957, "learning_rate": 5.384309266628766e-05, "loss": 3.9773, "step": 1768 }, { "epoch": 2.4637883008356547, "grad_norm": 4.941119194030762, "learning_rate": 5.3833143831722565e-05, "loss": 3.708, "step": 1769 }, { "epoch": 2.465181058495822, "grad_norm": 4.289640426635742, "learning_rate": 5.382319499715747e-05, "loss": 4.0151, "step": 1770 }, { "epoch": 2.466573816155989, "grad_norm": 4.04726505279541, "learning_rate": 5.381324616259238e-05, "loss": 3.9457, "step": 1771 }, { "epoch": 2.467966573816156, "grad_norm": 4.4292449951171875, "learning_rate": 5.3803297328027284e-05, "loss": 3.8842, "step": 1772 }, { "epoch": 2.469359331476323, "grad_norm": 4.514986991882324, "learning_rate": 5.3793348493462195e-05, "loss": 4.1711, "step": 1773 }, { "epoch": 2.4707520891364902, "grad_norm": 4.214544296264648, "learning_rate": 5.378339965889709e-05, "loss": 3.409, "step": 1774 }, { "epoch": 2.4721448467966574, "grad_norm": 4.354898452758789, "learning_rate": 5.3773450824332e-05, "loss": 3.914, "step": 1775 }, { "epoch": 2.4735376044568245, "grad_norm": 5.114170074462891, "learning_rate": 5.376350198976691e-05, "loss": 3.7704, "step": 1776 }, { "epoch": 2.4749303621169916, "grad_norm": 4.445180416107178, "learning_rate": 5.375355315520182e-05, "loss": 4.1139, "step": 1777 }, { "epoch": 2.4763231197771587, "grad_norm": 4.674963474273682, "learning_rate": 5.374360432063672e-05, "loss": 3.4318, "step": 1778 }, { "epoch": 2.477715877437326, "grad_norm": 3.9881012439727783, "learning_rate": 5.3733655486071626e-05, "loss": 3.5508, "step": 1779 }, { "epoch": 2.479108635097493, "grad_norm": 4.56321382522583, "learning_rate": 5.372370665150653e-05, "loss": 3.7149, "step": 1780 }, { "epoch": 2.48050139275766, "grad_norm": 4.433288097381592, "learning_rate": 5.371375781694144e-05, "loss": 3.9132, "step": 1781 }, { "epoch": 2.481894150417827, "grad_norm": 4.969962120056152, "learning_rate": 5.3703808982376345e-05, "loss": 4.5837, "step": 1782 }, { "epoch": 2.4832869080779942, "grad_norm": 4.592645645141602, "learning_rate": 5.3693860147811256e-05, "loss": 3.9482, "step": 1783 }, { "epoch": 2.4846796657381613, "grad_norm": 4.448911666870117, "learning_rate": 5.368391131324615e-05, "loss": 3.6117, "step": 1784 }, { "epoch": 2.486072423398329, "grad_norm": 4.952700614929199, "learning_rate": 5.3673962478681064e-05, "loss": 3.5696, "step": 1785 }, { "epoch": 2.487465181058496, "grad_norm": 4.145140171051025, "learning_rate": 5.366401364411597e-05, "loss": 4.1032, "step": 1786 }, { "epoch": 2.488857938718663, "grad_norm": 5.181249141693115, "learning_rate": 5.365406480955088e-05, "loss": 3.7582, "step": 1787 }, { "epoch": 2.4902506963788302, "grad_norm": 4.133391857147217, "learning_rate": 5.364411597498579e-05, "loss": 3.3871, "step": 1788 }, { "epoch": 2.4916434540389973, "grad_norm": 4.500240325927734, "learning_rate": 5.363416714042069e-05, "loss": 3.747, "step": 1789 }, { "epoch": 2.4930362116991645, "grad_norm": 4.199702262878418, "learning_rate": 5.362421830585559e-05, "loss": 4.1196, "step": 1790 }, { "epoch": 2.4944289693593316, "grad_norm": 4.462384223937988, "learning_rate": 5.36142694712905e-05, "loss": 3.7021, "step": 1791 }, { "epoch": 2.4958217270194987, "grad_norm": 4.20314359664917, "learning_rate": 5.360432063672541e-05, "loss": 3.3557, "step": 1792 }, { "epoch": 2.497214484679666, "grad_norm": 3.8399412631988525, "learning_rate": 5.359437180216032e-05, "loss": 3.3568, "step": 1793 }, { "epoch": 2.498607242339833, "grad_norm": 4.548267364501953, "learning_rate": 5.358442296759522e-05, "loss": 4.0814, "step": 1794 }, { "epoch": 2.5, "grad_norm": 5.398299217224121, "learning_rate": 5.3574474133030125e-05, "loss": 3.4655, "step": 1795 }, { "epoch": 2.501392757660167, "grad_norm": 4.408416271209717, "learning_rate": 5.3564525298465036e-05, "loss": 3.5049, "step": 1796 }, { "epoch": 2.502785515320334, "grad_norm": 4.241742134094238, "learning_rate": 5.355457646389994e-05, "loss": 3.9685, "step": 1797 }, { "epoch": 2.5041782729805013, "grad_norm": 4.418053150177002, "learning_rate": 5.3544627629334844e-05, "loss": 4.0235, "step": 1798 }, { "epoch": 2.5055710306406684, "grad_norm": 4.750143527984619, "learning_rate": 5.353467879476975e-05, "loss": 3.5644, "step": 1799 }, { "epoch": 2.5069637883008355, "grad_norm": 4.102102279663086, "learning_rate": 5.352472996020466e-05, "loss": 3.7058, "step": 1800 }, { "epoch": 2.5083565459610027, "grad_norm": 4.465678691864014, "learning_rate": 5.351478112563956e-05, "loss": 3.4684, "step": 1801 }, { "epoch": 2.5097493036211698, "grad_norm": 5.520817756652832, "learning_rate": 5.3504832291074474e-05, "loss": 3.8512, "step": 1802 }, { "epoch": 2.511142061281337, "grad_norm": 4.454752445220947, "learning_rate": 5.349488345650937e-05, "loss": 4.0388, "step": 1803 }, { "epoch": 2.5125348189415044, "grad_norm": 3.9050180912017822, "learning_rate": 5.348493462194428e-05, "loss": 3.5277, "step": 1804 }, { "epoch": 2.513927576601671, "grad_norm": 4.071952819824219, "learning_rate": 5.3474985787379186e-05, "loss": 3.7311, "step": 1805 }, { "epoch": 2.5153203342618387, "grad_norm": 4.0145063400268555, "learning_rate": 5.34650369528141e-05, "loss": 3.4427, "step": 1806 }, { "epoch": 2.5167130919220053, "grad_norm": 8.699322700500488, "learning_rate": 5.3455088118249e-05, "loss": 3.5338, "step": 1807 }, { "epoch": 2.518105849582173, "grad_norm": 3.9393362998962402, "learning_rate": 5.3445139283683905e-05, "loss": 3.6937, "step": 1808 }, { "epoch": 2.51949860724234, "grad_norm": 5.14155912399292, "learning_rate": 5.343519044911881e-05, "loss": 3.9188, "step": 1809 }, { "epoch": 2.520891364902507, "grad_norm": 4.432671070098877, "learning_rate": 5.342524161455372e-05, "loss": 3.8052, "step": 1810 }, { "epoch": 2.522284122562674, "grad_norm": 4.019896030426025, "learning_rate": 5.3415292779988624e-05, "loss": 3.853, "step": 1811 }, { "epoch": 2.5236768802228413, "grad_norm": 3.7733829021453857, "learning_rate": 5.3405343945423535e-05, "loss": 3.1545, "step": 1812 }, { "epoch": 2.5250696378830084, "grad_norm": 4.388171195983887, "learning_rate": 5.339539511085843e-05, "loss": 3.7329, "step": 1813 }, { "epoch": 2.5264623955431755, "grad_norm": 4.052828311920166, "learning_rate": 5.338544627629334e-05, "loss": 3.4381, "step": 1814 }, { "epoch": 2.5278551532033426, "grad_norm": 4.079244136810303, "learning_rate": 5.337549744172825e-05, "loss": 3.7339, "step": 1815 }, { "epoch": 2.5292479108635098, "grad_norm": 5.9524946212768555, "learning_rate": 5.336554860716316e-05, "loss": 3.6238, "step": 1816 }, { "epoch": 2.530640668523677, "grad_norm": 4.794691562652588, "learning_rate": 5.335559977259807e-05, "loss": 3.9297, "step": 1817 }, { "epoch": 2.532033426183844, "grad_norm": 4.042095184326172, "learning_rate": 5.3345650938032966e-05, "loss": 3.5151, "step": 1818 }, { "epoch": 2.533426183844011, "grad_norm": 5.037164211273193, "learning_rate": 5.333570210346787e-05, "loss": 3.9337, "step": 1819 }, { "epoch": 2.534818941504178, "grad_norm": 4.347875118255615, "learning_rate": 5.332575326890278e-05, "loss": 4.1536, "step": 1820 }, { "epoch": 2.5362116991643453, "grad_norm": 5.549806594848633, "learning_rate": 5.331580443433769e-05, "loss": 3.4958, "step": 1821 }, { "epoch": 2.5376044568245124, "grad_norm": 5.315230369567871, "learning_rate": 5.3305855599772596e-05, "loss": 4.1105, "step": 1822 }, { "epoch": 2.5389972144846795, "grad_norm": 4.203426361083984, "learning_rate": 5.32959067652075e-05, "loss": 3.3643, "step": 1823 }, { "epoch": 2.5403899721448466, "grad_norm": 4.214809417724609, "learning_rate": 5.3285957930642404e-05, "loss": 3.9121, "step": 1824 }, { "epoch": 2.541782729805014, "grad_norm": 5.017365455627441, "learning_rate": 5.3276009096077315e-05, "loss": 3.7667, "step": 1825 }, { "epoch": 2.543175487465181, "grad_norm": 4.704203128814697, "learning_rate": 5.326606026151222e-05, "loss": 3.5719, "step": 1826 }, { "epoch": 2.5445682451253484, "grad_norm": 4.263749599456787, "learning_rate": 5.325611142694713e-05, "loss": 3.8484, "step": 1827 }, { "epoch": 2.545961002785515, "grad_norm": 4.997926235198975, "learning_rate": 5.324616259238203e-05, "loss": 3.5273, "step": 1828 }, { "epoch": 2.5473537604456826, "grad_norm": 4.328037738800049, "learning_rate": 5.323621375781694e-05, "loss": 3.6637, "step": 1829 }, { "epoch": 2.5487465181058497, "grad_norm": 4.116519927978516, "learning_rate": 5.322626492325184e-05, "loss": 3.5132, "step": 1830 }, { "epoch": 2.550139275766017, "grad_norm": 4.875148296356201, "learning_rate": 5.321631608868675e-05, "loss": 3.6665, "step": 1831 }, { "epoch": 2.551532033426184, "grad_norm": 4.478740215301514, "learning_rate": 5.320636725412166e-05, "loss": 3.4473, "step": 1832 }, { "epoch": 2.552924791086351, "grad_norm": 6.236715316772461, "learning_rate": 5.319641841955656e-05, "loss": 3.518, "step": 1833 }, { "epoch": 2.554317548746518, "grad_norm": 4.370816707611084, "learning_rate": 5.3186469584991465e-05, "loss": 3.5055, "step": 1834 }, { "epoch": 2.5557103064066853, "grad_norm": 4.396291255950928, "learning_rate": 5.3176520750426376e-05, "loss": 3.7455, "step": 1835 }, { "epoch": 2.5571030640668524, "grad_norm": 5.377126216888428, "learning_rate": 5.316657191586128e-05, "loss": 3.8579, "step": 1836 }, { "epoch": 2.5584958217270195, "grad_norm": 4.379935264587402, "learning_rate": 5.315662308129619e-05, "loss": 3.9947, "step": 1837 }, { "epoch": 2.5598885793871866, "grad_norm": 3.9767608642578125, "learning_rate": 5.314667424673109e-05, "loss": 3.6255, "step": 1838 }, { "epoch": 2.5612813370473537, "grad_norm": 5.672046661376953, "learning_rate": 5.3136725412166e-05, "loss": 3.6386, "step": 1839 }, { "epoch": 2.562674094707521, "grad_norm": 4.641458034515381, "learning_rate": 5.31267765776009e-05, "loss": 4.0477, "step": 1840 }, { "epoch": 2.564066852367688, "grad_norm": 4.017055511474609, "learning_rate": 5.3116827743035814e-05, "loss": 3.8583, "step": 1841 }, { "epoch": 2.565459610027855, "grad_norm": 4.255927085876465, "learning_rate": 5.310687890847071e-05, "loss": 3.5727, "step": 1842 }, { "epoch": 2.566852367688022, "grad_norm": 5.480453968048096, "learning_rate": 5.309693007390562e-05, "loss": 3.3779, "step": 1843 }, { "epoch": 2.5682451253481893, "grad_norm": 3.9198381900787354, "learning_rate": 5.3086981239340526e-05, "loss": 4.0592, "step": 1844 }, { "epoch": 2.5696378830083564, "grad_norm": 4.344557285308838, "learning_rate": 5.307703240477544e-05, "loss": 3.9158, "step": 1845 }, { "epoch": 2.571030640668524, "grad_norm": 4.110029220581055, "learning_rate": 5.306708357021035e-05, "loss": 4.1278, "step": 1846 }, { "epoch": 2.5724233983286906, "grad_norm": 3.936121940612793, "learning_rate": 5.3057134735645245e-05, "loss": 3.5425, "step": 1847 }, { "epoch": 2.573816155988858, "grad_norm": 3.695356607437134, "learning_rate": 5.304718590108015e-05, "loss": 3.4381, "step": 1848 }, { "epoch": 2.575208913649025, "grad_norm": 8.275914192199707, "learning_rate": 5.303723706651506e-05, "loss": 3.4775, "step": 1849 }, { "epoch": 2.5766016713091924, "grad_norm": 5.141213417053223, "learning_rate": 5.302728823194997e-05, "loss": 3.695, "step": 1850 }, { "epoch": 2.5779944289693595, "grad_norm": 4.1147332191467285, "learning_rate": 5.3017339397384875e-05, "loss": 3.3717, "step": 1851 }, { "epoch": 2.5793871866295266, "grad_norm": 5.368847846984863, "learning_rate": 5.300739056281978e-05, "loss": 3.7957, "step": 1852 }, { "epoch": 2.5807799442896937, "grad_norm": 4.1998395919799805, "learning_rate": 5.299744172825468e-05, "loss": 3.9779, "step": 1853 }, { "epoch": 2.582172701949861, "grad_norm": 3.5952372550964355, "learning_rate": 5.2987492893689594e-05, "loss": 3.3549, "step": 1854 }, { "epoch": 2.583565459610028, "grad_norm": 4.264059543609619, "learning_rate": 5.29775440591245e-05, "loss": 3.5971, "step": 1855 }, { "epoch": 2.584958217270195, "grad_norm": 4.747745037078857, "learning_rate": 5.296759522455941e-05, "loss": 3.5043, "step": 1856 }, { "epoch": 2.586350974930362, "grad_norm": 4.186683654785156, "learning_rate": 5.2957646389994306e-05, "loss": 3.9387, "step": 1857 }, { "epoch": 2.5877437325905293, "grad_norm": 5.196044921875, "learning_rate": 5.294769755542922e-05, "loss": 3.9363, "step": 1858 }, { "epoch": 2.5891364902506964, "grad_norm": 4.816099166870117, "learning_rate": 5.293774872086412e-05, "loss": 3.5953, "step": 1859 }, { "epoch": 2.5905292479108635, "grad_norm": 4.190717697143555, "learning_rate": 5.292779988629903e-05, "loss": 3.8588, "step": 1860 }, { "epoch": 2.5919220055710306, "grad_norm": 4.201430320739746, "learning_rate": 5.2917851051733936e-05, "loss": 4.0909, "step": 1861 }, { "epoch": 2.5933147632311977, "grad_norm": 4.05234956741333, "learning_rate": 5.290790221716884e-05, "loss": 3.6215, "step": 1862 }, { "epoch": 2.594707520891365, "grad_norm": 3.9277195930480957, "learning_rate": 5.2897953382603744e-05, "loss": 3.5398, "step": 1863 }, { "epoch": 2.596100278551532, "grad_norm": 4.1085591316223145, "learning_rate": 5.2888004548038655e-05, "loss": 4.4546, "step": 1864 }, { "epoch": 2.597493036211699, "grad_norm": 5.142462730407715, "learning_rate": 5.287805571347356e-05, "loss": 3.4016, "step": 1865 }, { "epoch": 2.598885793871866, "grad_norm": 5.263397693634033, "learning_rate": 5.286810687890847e-05, "loss": 3.6781, "step": 1866 }, { "epoch": 2.6002785515320337, "grad_norm": 4.9327497482299805, "learning_rate": 5.285815804434337e-05, "loss": 3.7238, "step": 1867 }, { "epoch": 2.6016713091922004, "grad_norm": 3.7192282676696777, "learning_rate": 5.284820920977828e-05, "loss": 3.5964, "step": 1868 }, { "epoch": 2.603064066852368, "grad_norm": 7.3771257400512695, "learning_rate": 5.283826037521318e-05, "loss": 3.53, "step": 1869 }, { "epoch": 2.6044568245125346, "grad_norm": 3.78019118309021, "learning_rate": 5.282831154064809e-05, "loss": 3.4685, "step": 1870 }, { "epoch": 2.605849582172702, "grad_norm": 4.528641223907471, "learning_rate": 5.2818362706083e-05, "loss": 3.5435, "step": 1871 }, { "epoch": 2.6072423398328692, "grad_norm": 6.675357818603516, "learning_rate": 5.28084138715179e-05, "loss": 3.4348, "step": 1872 }, { "epoch": 2.6086350974930363, "grad_norm": 4.7266130447387695, "learning_rate": 5.2798465036952805e-05, "loss": 3.5718, "step": 1873 }, { "epoch": 2.6100278551532035, "grad_norm": 4.423274040222168, "learning_rate": 5.2788516202387716e-05, "loss": 3.1636, "step": 1874 }, { "epoch": 2.6114206128133706, "grad_norm": 4.587574481964111, "learning_rate": 5.277856736782263e-05, "loss": 3.8414, "step": 1875 }, { "epoch": 2.6128133704735377, "grad_norm": 3.895914316177368, "learning_rate": 5.276861853325753e-05, "loss": 4.0406, "step": 1876 }, { "epoch": 2.614206128133705, "grad_norm": 4.607090950012207, "learning_rate": 5.275866969869243e-05, "loss": 4.121, "step": 1877 }, { "epoch": 2.615598885793872, "grad_norm": 9.816665649414062, "learning_rate": 5.274872086412734e-05, "loss": 3.8753, "step": 1878 }, { "epoch": 2.616991643454039, "grad_norm": 5.306396484375, "learning_rate": 5.273877202956225e-05, "loss": 3.7716, "step": 1879 }, { "epoch": 2.618384401114206, "grad_norm": 5.104506492614746, "learning_rate": 5.2728823194997154e-05, "loss": 3.6848, "step": 1880 }, { "epoch": 2.6197771587743732, "grad_norm": 4.126043796539307, "learning_rate": 5.2718874360432065e-05, "loss": 3.7632, "step": 1881 }, { "epoch": 2.6211699164345403, "grad_norm": 3.921090602874756, "learning_rate": 5.270892552586696e-05, "loss": 3.8218, "step": 1882 }, { "epoch": 2.6225626740947074, "grad_norm": 4.4806952476501465, "learning_rate": 5.269897669130187e-05, "loss": 3.6962, "step": 1883 }, { "epoch": 2.6239554317548746, "grad_norm": 5.107631683349609, "learning_rate": 5.268902785673678e-05, "loss": 3.8599, "step": 1884 }, { "epoch": 2.6253481894150417, "grad_norm": 4.719388961791992, "learning_rate": 5.267907902217169e-05, "loss": 3.8638, "step": 1885 }, { "epoch": 2.6267409470752088, "grad_norm": 5.310430526733398, "learning_rate": 5.2669130187606585e-05, "loss": 3.7588, "step": 1886 }, { "epoch": 2.628133704735376, "grad_norm": 4.737289905548096, "learning_rate": 5.2659181353041496e-05, "loss": 3.6632, "step": 1887 }, { "epoch": 2.6295264623955434, "grad_norm": 4.28202486038208, "learning_rate": 5.26492325184764e-05, "loss": 3.5808, "step": 1888 }, { "epoch": 2.63091922005571, "grad_norm": 4.6112494468688965, "learning_rate": 5.263928368391131e-05, "loss": 4.0221, "step": 1889 }, { "epoch": 2.6323119777158777, "grad_norm": 3.8772616386413574, "learning_rate": 5.2629334849346215e-05, "loss": 3.6025, "step": 1890 }, { "epoch": 2.6337047353760443, "grad_norm": 4.3526105880737305, "learning_rate": 5.261938601478112e-05, "loss": 3.611, "step": 1891 }, { "epoch": 2.635097493036212, "grad_norm": 4.766617298126221, "learning_rate": 5.2609437180216023e-05, "loss": 3.4089, "step": 1892 }, { "epoch": 2.636490250696379, "grad_norm": 4.573330879211426, "learning_rate": 5.2599488345650934e-05, "loss": 3.8785, "step": 1893 }, { "epoch": 2.637883008356546, "grad_norm": 4.703368186950684, "learning_rate": 5.258953951108584e-05, "loss": 3.6822, "step": 1894 }, { "epoch": 2.639275766016713, "grad_norm": 4.373324871063232, "learning_rate": 5.257959067652075e-05, "loss": 3.6716, "step": 1895 }, { "epoch": 2.6406685236768803, "grad_norm": 5.367077350616455, "learning_rate": 5.2569641841955647e-05, "loss": 4.0347, "step": 1896 }, { "epoch": 2.6420612813370474, "grad_norm": 5.086892127990723, "learning_rate": 5.255969300739056e-05, "loss": 3.6172, "step": 1897 }, { "epoch": 2.6434540389972145, "grad_norm": 4.16098690032959, "learning_rate": 5.254974417282546e-05, "loss": 3.7007, "step": 1898 }, { "epoch": 2.6448467966573816, "grad_norm": 4.16826057434082, "learning_rate": 5.253979533826037e-05, "loss": 4.066, "step": 1899 }, { "epoch": 2.6462395543175488, "grad_norm": 4.992652893066406, "learning_rate": 5.252984650369528e-05, "loss": 3.7062, "step": 1900 }, { "epoch": 2.647632311977716, "grad_norm": 4.250000953674316, "learning_rate": 5.251989766913018e-05, "loss": 3.1994, "step": 1901 }, { "epoch": 2.649025069637883, "grad_norm": 4.818694591522217, "learning_rate": 5.2509948834565085e-05, "loss": 4.2614, "step": 1902 }, { "epoch": 2.65041782729805, "grad_norm": 4.342881202697754, "learning_rate": 5.2499999999999995e-05, "loss": 3.6534, "step": 1903 }, { "epoch": 2.651810584958217, "grad_norm": 4.088788032531738, "learning_rate": 5.2490051165434906e-05, "loss": 3.302, "step": 1904 }, { "epoch": 2.6532033426183843, "grad_norm": 4.922445774078369, "learning_rate": 5.248010233086981e-05, "loss": 3.4726, "step": 1905 }, { "epoch": 2.6545961002785514, "grad_norm": 3.9478759765625, "learning_rate": 5.247015349630471e-05, "loss": 3.6109, "step": 1906 }, { "epoch": 2.6559888579387185, "grad_norm": 4.833628177642822, "learning_rate": 5.246020466173962e-05, "loss": 3.974, "step": 1907 }, { "epoch": 2.6573816155988856, "grad_norm": 3.918222427368164, "learning_rate": 5.245025582717453e-05, "loss": 3.327, "step": 1908 }, { "epoch": 2.658774373259053, "grad_norm": 4.314101219177246, "learning_rate": 5.2440306992609433e-05, "loss": 3.7502, "step": 1909 }, { "epoch": 2.66016713091922, "grad_norm": 4.25202751159668, "learning_rate": 5.2430358158044344e-05, "loss": 3.861, "step": 1910 }, { "epoch": 2.6615598885793874, "grad_norm": 3.9286015033721924, "learning_rate": 5.242040932347924e-05, "loss": 3.4853, "step": 1911 }, { "epoch": 2.662952646239554, "grad_norm": 4.644457817077637, "learning_rate": 5.241046048891415e-05, "loss": 3.548, "step": 1912 }, { "epoch": 2.6643454038997216, "grad_norm": 4.0190229415893555, "learning_rate": 5.2400511654349056e-05, "loss": 3.3824, "step": 1913 }, { "epoch": 2.6657381615598887, "grad_norm": 4.23594856262207, "learning_rate": 5.239056281978397e-05, "loss": 3.1528, "step": 1914 }, { "epoch": 2.667130919220056, "grad_norm": 4.653962135314941, "learning_rate": 5.238061398521887e-05, "loss": 3.5549, "step": 1915 }, { "epoch": 2.668523676880223, "grad_norm": 4.541530132293701, "learning_rate": 5.2370665150653775e-05, "loss": 3.427, "step": 1916 }, { "epoch": 2.66991643454039, "grad_norm": 4.232297897338867, "learning_rate": 5.236071631608868e-05, "loss": 3.5785, "step": 1917 }, { "epoch": 2.671309192200557, "grad_norm": 5.324038982391357, "learning_rate": 5.235076748152359e-05, "loss": 3.8548, "step": 1918 }, { "epoch": 2.6727019498607243, "grad_norm": 6.1439080238342285, "learning_rate": 5.2340818646958494e-05, "loss": 3.6788, "step": 1919 }, { "epoch": 2.6740947075208914, "grad_norm": 3.6627092361450195, "learning_rate": 5.2330869812393405e-05, "loss": 3.436, "step": 1920 }, { "epoch": 2.6754874651810585, "grad_norm": 4.146340370178223, "learning_rate": 5.23209209778283e-05, "loss": 3.6203, "step": 1921 }, { "epoch": 2.6768802228412256, "grad_norm": 4.556893348693848, "learning_rate": 5.2310972143263213e-05, "loss": 3.4084, "step": 1922 }, { "epoch": 2.6782729805013927, "grad_norm": 4.068876266479492, "learning_rate": 5.230102330869812e-05, "loss": 3.5732, "step": 1923 }, { "epoch": 2.67966573816156, "grad_norm": 4.500432968139648, "learning_rate": 5.229107447413303e-05, "loss": 3.8973, "step": 1924 }, { "epoch": 2.681058495821727, "grad_norm": 7.038255214691162, "learning_rate": 5.2281125639567926e-05, "loss": 3.7381, "step": 1925 }, { "epoch": 2.682451253481894, "grad_norm": 4.796656131744385, "learning_rate": 5.2271176805002837e-05, "loss": 3.7804, "step": 1926 }, { "epoch": 2.683844011142061, "grad_norm": 4.580173492431641, "learning_rate": 5.226122797043774e-05, "loss": 3.7832, "step": 1927 }, { "epoch": 2.6852367688022283, "grad_norm": 4.559717655181885, "learning_rate": 5.225127913587265e-05, "loss": 3.5402, "step": 1928 }, { "epoch": 2.6866295264623954, "grad_norm": 4.069858551025391, "learning_rate": 5.224133030130756e-05, "loss": 3.7922, "step": 1929 }, { "epoch": 2.688022284122563, "grad_norm": 4.148495197296143, "learning_rate": 5.223138146674246e-05, "loss": 3.5519, "step": 1930 }, { "epoch": 2.6894150417827296, "grad_norm": 4.800426483154297, "learning_rate": 5.2221432632177364e-05, "loss": 3.7697, "step": 1931 }, { "epoch": 2.690807799442897, "grad_norm": 9.859857559204102, "learning_rate": 5.2211483797612275e-05, "loss": 3.2831, "step": 1932 }, { "epoch": 2.692200557103064, "grad_norm": 4.45426607131958, "learning_rate": 5.2201534963047185e-05, "loss": 3.8314, "step": 1933 }, { "epoch": 2.6935933147632314, "grad_norm": 4.1094465255737305, "learning_rate": 5.219158612848209e-05, "loss": 3.389, "step": 1934 }, { "epoch": 2.6949860724233985, "grad_norm": 4.332690238952637, "learning_rate": 5.2181637293916994e-05, "loss": 4.0012, "step": 1935 }, { "epoch": 2.6963788300835656, "grad_norm": 4.643312454223633, "learning_rate": 5.21716884593519e-05, "loss": 3.3735, "step": 1936 }, { "epoch": 2.6977715877437327, "grad_norm": 5.339315414428711, "learning_rate": 5.216173962478681e-05, "loss": 3.2747, "step": 1937 }, { "epoch": 2.6991643454039, "grad_norm": 4.450771808624268, "learning_rate": 5.215179079022171e-05, "loss": 3.4963, "step": 1938 }, { "epoch": 2.700557103064067, "grad_norm": 4.209102630615234, "learning_rate": 5.2141841955656623e-05, "loss": 3.3696, "step": 1939 }, { "epoch": 2.701949860724234, "grad_norm": 4.320992469787598, "learning_rate": 5.213189312109152e-05, "loss": 3.6478, "step": 1940 }, { "epoch": 2.703342618384401, "grad_norm": 4.924124717712402, "learning_rate": 5.212194428652643e-05, "loss": 3.8642, "step": 1941 }, { "epoch": 2.7047353760445683, "grad_norm": 4.025848865509033, "learning_rate": 5.2111995451961336e-05, "loss": 3.1843, "step": 1942 }, { "epoch": 2.7061281337047354, "grad_norm": 5.065040111541748, "learning_rate": 5.2102046617396246e-05, "loss": 3.7114, "step": 1943 }, { "epoch": 2.7075208913649025, "grad_norm": 3.9760985374450684, "learning_rate": 5.209209778283115e-05, "loss": 3.402, "step": 1944 }, { "epoch": 2.7089136490250696, "grad_norm": 4.428731441497803, "learning_rate": 5.2082148948266055e-05, "loss": 3.6306, "step": 1945 }, { "epoch": 2.7103064066852367, "grad_norm": 3.8420968055725098, "learning_rate": 5.207220011370096e-05, "loss": 3.0539, "step": 1946 }, { "epoch": 2.711699164345404, "grad_norm": 4.728443622589111, "learning_rate": 5.206225127913587e-05, "loss": 4.1489, "step": 1947 }, { "epoch": 2.713091922005571, "grad_norm": 4.3703765869140625, "learning_rate": 5.2052302444570774e-05, "loss": 3.4861, "step": 1948 }, { "epoch": 2.714484679665738, "grad_norm": 4.901739120483398, "learning_rate": 5.2042353610005684e-05, "loss": 3.2697, "step": 1949 }, { "epoch": 2.715877437325905, "grad_norm": 4.627745628356934, "learning_rate": 5.203240477544058e-05, "loss": 3.3958, "step": 1950 }, { "epoch": 2.7172701949860727, "grad_norm": 5.152009963989258, "learning_rate": 5.202245594087549e-05, "loss": 2.9311, "step": 1951 }, { "epoch": 2.7186629526462394, "grad_norm": 4.1321587562561035, "learning_rate": 5.20125071063104e-05, "loss": 3.2103, "step": 1952 }, { "epoch": 2.720055710306407, "grad_norm": 4.1402740478515625, "learning_rate": 5.200255827174531e-05, "loss": 3.6239, "step": 1953 }, { "epoch": 2.7214484679665736, "grad_norm": 5.09497594833374, "learning_rate": 5.199260943718021e-05, "loss": 3.589, "step": 1954 }, { "epoch": 2.722841225626741, "grad_norm": 4.253708362579346, "learning_rate": 5.1982660602615116e-05, "loss": 3.2043, "step": 1955 }, { "epoch": 2.724233983286908, "grad_norm": 4.034773826599121, "learning_rate": 5.197271176805002e-05, "loss": 3.555, "step": 1956 }, { "epoch": 2.7256267409470754, "grad_norm": 4.14765739440918, "learning_rate": 5.196276293348493e-05, "loss": 3.5134, "step": 1957 }, { "epoch": 2.7270194986072425, "grad_norm": 4.335793972015381, "learning_rate": 5.195281409891984e-05, "loss": 3.3533, "step": 1958 }, { "epoch": 2.7284122562674096, "grad_norm": 5.370896339416504, "learning_rate": 5.1942865264354746e-05, "loss": 3.7931, "step": 1959 }, { "epoch": 2.7298050139275767, "grad_norm": 4.351439476013184, "learning_rate": 5.193291642978964e-05, "loss": 3.7763, "step": 1960 }, { "epoch": 2.731197771587744, "grad_norm": 4.499411106109619, "learning_rate": 5.1922967595224554e-05, "loss": 3.453, "step": 1961 }, { "epoch": 2.732590529247911, "grad_norm": 6.719456672668457, "learning_rate": 5.1913018760659465e-05, "loss": 4.2591, "step": 1962 }, { "epoch": 2.733983286908078, "grad_norm": 4.187103271484375, "learning_rate": 5.190306992609437e-05, "loss": 3.712, "step": 1963 }, { "epoch": 2.735376044568245, "grad_norm": 4.134088516235352, "learning_rate": 5.189312109152928e-05, "loss": 3.4247, "step": 1964 }, { "epoch": 2.7367688022284122, "grad_norm": 3.8819053173065186, "learning_rate": 5.188317225696418e-05, "loss": 3.1055, "step": 1965 }, { "epoch": 2.7381615598885793, "grad_norm": 4.02744197845459, "learning_rate": 5.187322342239909e-05, "loss": 3.665, "step": 1966 }, { "epoch": 2.7395543175487465, "grad_norm": 3.618467330932617, "learning_rate": 5.186327458783399e-05, "loss": 3.2742, "step": 1967 }, { "epoch": 2.7409470752089136, "grad_norm": 4.1958537101745605, "learning_rate": 5.18533257532689e-05, "loss": 3.7362, "step": 1968 }, { "epoch": 2.7423398328690807, "grad_norm": 4.144404411315918, "learning_rate": 5.18433769187038e-05, "loss": 3.7999, "step": 1969 }, { "epoch": 2.743732590529248, "grad_norm": 4.595558166503906, "learning_rate": 5.183342808413871e-05, "loss": 3.6315, "step": 1970 }, { "epoch": 2.745125348189415, "grad_norm": 3.7925376892089844, "learning_rate": 5.1823479249573615e-05, "loss": 2.9733, "step": 1971 }, { "epoch": 2.7465181058495824, "grad_norm": 3.7566452026367188, "learning_rate": 5.1813530415008526e-05, "loss": 3.198, "step": 1972 }, { "epoch": 2.747910863509749, "grad_norm": 4.324394226074219, "learning_rate": 5.180358158044343e-05, "loss": 3.6722, "step": 1973 }, { "epoch": 2.7493036211699167, "grad_norm": 5.738814353942871, "learning_rate": 5.1793632745878334e-05, "loss": 3.5951, "step": 1974 }, { "epoch": 2.7506963788300833, "grad_norm": 4.4775567054748535, "learning_rate": 5.178368391131324e-05, "loss": 3.524, "step": 1975 }, { "epoch": 2.752089136490251, "grad_norm": 4.070470333099365, "learning_rate": 5.177373507674815e-05, "loss": 3.2178, "step": 1976 }, { "epoch": 2.7534818941504176, "grad_norm": 4.293017864227295, "learning_rate": 5.176378624218305e-05, "loss": 3.8558, "step": 1977 }, { "epoch": 2.754874651810585, "grad_norm": 4.2213640213012695, "learning_rate": 5.1753837407617964e-05, "loss": 3.5929, "step": 1978 }, { "epoch": 2.756267409470752, "grad_norm": 4.38236665725708, "learning_rate": 5.174388857305286e-05, "loss": 3.4067, "step": 1979 }, { "epoch": 2.7576601671309193, "grad_norm": 4.706287384033203, "learning_rate": 5.173393973848777e-05, "loss": 3.7148, "step": 1980 }, { "epoch": 2.7590529247910864, "grad_norm": 4.4334516525268555, "learning_rate": 5.1723990903922676e-05, "loss": 3.5652, "step": 1981 }, { "epoch": 2.7604456824512535, "grad_norm": 4.028783798217773, "learning_rate": 5.171404206935759e-05, "loss": 3.4242, "step": 1982 }, { "epoch": 2.7618384401114207, "grad_norm": 4.340991020202637, "learning_rate": 5.170409323479249e-05, "loss": 3.861, "step": 1983 }, { "epoch": 2.7632311977715878, "grad_norm": 4.295785427093506, "learning_rate": 5.1694144400227395e-05, "loss": 3.5997, "step": 1984 }, { "epoch": 2.764623955431755, "grad_norm": 6.218203544616699, "learning_rate": 5.16841955656623e-05, "loss": 3.4819, "step": 1985 }, { "epoch": 2.766016713091922, "grad_norm": 4.786428451538086, "learning_rate": 5.167424673109721e-05, "loss": 3.01, "step": 1986 }, { "epoch": 2.767409470752089, "grad_norm": 4.1766252517700195, "learning_rate": 5.166429789653212e-05, "loss": 3.3219, "step": 1987 }, { "epoch": 2.768802228412256, "grad_norm": 4.175639629364014, "learning_rate": 5.1654349061967025e-05, "loss": 3.8378, "step": 1988 }, { "epoch": 2.7701949860724233, "grad_norm": 5.461728096008301, "learning_rate": 5.164440022740192e-05, "loss": 3.6676, "step": 1989 }, { "epoch": 2.7715877437325904, "grad_norm": 4.121284008026123, "learning_rate": 5.163445139283683e-05, "loss": 3.7472, "step": 1990 }, { "epoch": 2.7729805013927575, "grad_norm": 4.15053653717041, "learning_rate": 5.1624502558271744e-05, "loss": 3.297, "step": 1991 }, { "epoch": 2.7743732590529246, "grad_norm": 4.42463493347168, "learning_rate": 5.161455372370665e-05, "loss": 3.6092, "step": 1992 }, { "epoch": 2.775766016713092, "grad_norm": 3.930119752883911, "learning_rate": 5.160460488914156e-05, "loss": 3.3793, "step": 1993 }, { "epoch": 2.777158774373259, "grad_norm": 4.388279438018799, "learning_rate": 5.1594656054576456e-05, "loss": 3.472, "step": 1994 }, { "epoch": 2.7785515320334264, "grad_norm": 5.646612167358398, "learning_rate": 5.158470722001137e-05, "loss": 3.2413, "step": 1995 }, { "epoch": 2.779944289693593, "grad_norm": 4.878632545471191, "learning_rate": 5.157475838544627e-05, "loss": 3.386, "step": 1996 }, { "epoch": 2.7813370473537606, "grad_norm": 4.885819911956787, "learning_rate": 5.156480955088118e-05, "loss": 3.8848, "step": 1997 }, { "epoch": 2.7827298050139273, "grad_norm": 4.037961959838867, "learning_rate": 5.1554860716316086e-05, "loss": 3.2975, "step": 1998 }, { "epoch": 2.784122562674095, "grad_norm": 4.697548866271973, "learning_rate": 5.154491188175099e-05, "loss": 4.0258, "step": 1999 }, { "epoch": 2.785515320334262, "grad_norm": 5.877390384674072, "learning_rate": 5.1534963047185894e-05, "loss": 3.9183, "step": 2000 }, { "epoch": 2.786908077994429, "grad_norm": 4.240438461303711, "learning_rate": 5.1525014212620805e-05, "loss": 3.7199, "step": 2001 }, { "epoch": 2.788300835654596, "grad_norm": 3.9093611240386963, "learning_rate": 5.151506537805571e-05, "loss": 3.2419, "step": 2002 }, { "epoch": 2.7896935933147633, "grad_norm": 4.7583699226379395, "learning_rate": 5.150511654349062e-05, "loss": 3.2971, "step": 2003 }, { "epoch": 2.7910863509749304, "grad_norm": 9.54155445098877, "learning_rate": 5.149516770892552e-05, "loss": 3.7672, "step": 2004 }, { "epoch": 2.7924791086350975, "grad_norm": 4.426088333129883, "learning_rate": 5.148521887436043e-05, "loss": 3.5291, "step": 2005 }, { "epoch": 2.7938718662952646, "grad_norm": 5.26630163192749, "learning_rate": 5.147527003979533e-05, "loss": 3.2663, "step": 2006 }, { "epoch": 2.7952646239554317, "grad_norm": 4.465476036071777, "learning_rate": 5.146532120523024e-05, "loss": 3.1189, "step": 2007 }, { "epoch": 2.796657381615599, "grad_norm": 5.240188121795654, "learning_rate": 5.145537237066515e-05, "loss": 3.5407, "step": 2008 }, { "epoch": 2.798050139275766, "grad_norm": 3.9314498901367188, "learning_rate": 5.144542353610005e-05, "loss": 3.2046, "step": 2009 }, { "epoch": 2.799442896935933, "grad_norm": 5.52628231048584, "learning_rate": 5.1435474701534955e-05, "loss": 3.216, "step": 2010 }, { "epoch": 2.8008356545961, "grad_norm": 3.9009246826171875, "learning_rate": 5.1425525866969866e-05, "loss": 3.3723, "step": 2011 }, { "epoch": 2.8022284122562673, "grad_norm": 4.057968616485596, "learning_rate": 5.141557703240477e-05, "loss": 3.6863, "step": 2012 }, { "epoch": 2.8036211699164344, "grad_norm": 4.20530891418457, "learning_rate": 5.1405628197839674e-05, "loss": 3.3286, "step": 2013 }, { "epoch": 2.8050139275766015, "grad_norm": 3.844698905944824, "learning_rate": 5.139567936327458e-05, "loss": 3.5184, "step": 2014 }, { "epoch": 2.8064066852367686, "grad_norm": 4.140641689300537, "learning_rate": 5.138573052870949e-05, "loss": 3.512, "step": 2015 }, { "epoch": 2.807799442896936, "grad_norm": 5.2917985916137695, "learning_rate": 5.13757816941444e-05, "loss": 3.1181, "step": 2016 }, { "epoch": 2.809192200557103, "grad_norm": 4.944843292236328, "learning_rate": 5.1365832859579304e-05, "loss": 4.0601, "step": 2017 }, { "epoch": 2.8105849582172704, "grad_norm": 4.644077777862549, "learning_rate": 5.13558840250142e-05, "loss": 3.1175, "step": 2018 }, { "epoch": 2.811977715877437, "grad_norm": 4.8350677490234375, "learning_rate": 5.134593519044911e-05, "loss": 3.7456, "step": 2019 }, { "epoch": 2.8133704735376046, "grad_norm": 4.3641510009765625, "learning_rate": 5.133598635588402e-05, "loss": 3.3445, "step": 2020 }, { "epoch": 2.8147632311977717, "grad_norm": 4.085535049438477, "learning_rate": 5.132603752131893e-05, "loss": 3.5831, "step": 2021 }, { "epoch": 2.816155988857939, "grad_norm": 6.408960819244385, "learning_rate": 5.131608868675384e-05, "loss": 3.8303, "step": 2022 }, { "epoch": 2.817548746518106, "grad_norm": 4.139590263366699, "learning_rate": 5.1306139852188735e-05, "loss": 3.2063, "step": 2023 }, { "epoch": 2.818941504178273, "grad_norm": 3.946959972381592, "learning_rate": 5.1296191017623646e-05, "loss": 3.3457, "step": 2024 }, { "epoch": 2.82033426183844, "grad_norm": 4.58073091506958, "learning_rate": 5.128624218305855e-05, "loss": 4.1379, "step": 2025 }, { "epoch": 2.8217270194986073, "grad_norm": 3.9743804931640625, "learning_rate": 5.127629334849346e-05, "loss": 3.5062, "step": 2026 }, { "epoch": 2.8231197771587744, "grad_norm": 4.281818866729736, "learning_rate": 5.1266344513928365e-05, "loss": 3.447, "step": 2027 }, { "epoch": 2.8245125348189415, "grad_norm": 4.268107891082764, "learning_rate": 5.125639567936327e-05, "loss": 3.0485, "step": 2028 }, { "epoch": 2.8259052924791086, "grad_norm": 4.711485862731934, "learning_rate": 5.124644684479817e-05, "loss": 3.8763, "step": 2029 }, { "epoch": 2.8272980501392757, "grad_norm": 4.70322322845459, "learning_rate": 5.1236498010233084e-05, "loss": 3.6961, "step": 2030 }, { "epoch": 2.828690807799443, "grad_norm": 6.191211700439453, "learning_rate": 5.122654917566799e-05, "loss": 3.2545, "step": 2031 }, { "epoch": 2.83008356545961, "grad_norm": 4.219863414764404, "learning_rate": 5.12166003411029e-05, "loss": 3.268, "step": 2032 }, { "epoch": 2.831476323119777, "grad_norm": 4.127409934997559, "learning_rate": 5.1206651506537796e-05, "loss": 3.1905, "step": 2033 }, { "epoch": 2.832869080779944, "grad_norm": 4.068781852722168, "learning_rate": 5.119670267197271e-05, "loss": 2.9591, "step": 2034 }, { "epoch": 2.8342618384401113, "grad_norm": 3.95893931388855, "learning_rate": 5.118675383740761e-05, "loss": 2.8394, "step": 2035 }, { "epoch": 2.8356545961002784, "grad_norm": 3.969543933868408, "learning_rate": 5.117680500284252e-05, "loss": 3.4354, "step": 2036 }, { "epoch": 2.837047353760446, "grad_norm": 3.9751126766204834, "learning_rate": 5.1166856168277426e-05, "loss": 3.3789, "step": 2037 }, { "epoch": 2.8384401114206126, "grad_norm": 4.16307258605957, "learning_rate": 5.115690733371233e-05, "loss": 3.5744, "step": 2038 }, { "epoch": 2.83983286908078, "grad_norm": 4.139352798461914, "learning_rate": 5.1146958499147234e-05, "loss": 3.4809, "step": 2039 }, { "epoch": 2.841225626740947, "grad_norm": 4.255465507507324, "learning_rate": 5.1137009664582145e-05, "loss": 3.2271, "step": 2040 }, { "epoch": 2.8426183844011144, "grad_norm": 5.983844757080078, "learning_rate": 5.1127060830017056e-05, "loss": 3.0895, "step": 2041 }, { "epoch": 2.8440111420612815, "grad_norm": 5.0439910888671875, "learning_rate": 5.111711199545196e-05, "loss": 3.5335, "step": 2042 }, { "epoch": 2.8454038997214486, "grad_norm": 3.8901827335357666, "learning_rate": 5.110716316088686e-05, "loss": 3.5002, "step": 2043 }, { "epoch": 2.8467966573816157, "grad_norm": 4.606709957122803, "learning_rate": 5.109721432632177e-05, "loss": 3.2298, "step": 2044 }, { "epoch": 2.848189415041783, "grad_norm": 5.750291347503662, "learning_rate": 5.108726549175668e-05, "loss": 3.3431, "step": 2045 }, { "epoch": 2.84958217270195, "grad_norm": 4.332377910614014, "learning_rate": 5.107731665719158e-05, "loss": 3.8921, "step": 2046 }, { "epoch": 2.850974930362117, "grad_norm": 3.8708927631378174, "learning_rate": 5.1067367822626494e-05, "loss": 3.5135, "step": 2047 }, { "epoch": 2.852367688022284, "grad_norm": 3.9907753467559814, "learning_rate": 5.105741898806139e-05, "loss": 3.1292, "step": 2048 }, { "epoch": 2.8537604456824512, "grad_norm": 4.350218772888184, "learning_rate": 5.10474701534963e-05, "loss": 3.5061, "step": 2049 }, { "epoch": 2.8551532033426184, "grad_norm": 4.249178409576416, "learning_rate": 5.1037521318931206e-05, "loss": 3.537, "step": 2050 }, { "epoch": 2.8565459610027855, "grad_norm": 4.3957061767578125, "learning_rate": 5.102757248436612e-05, "loss": 3.0038, "step": 2051 }, { "epoch": 2.8579387186629526, "grad_norm": 4.325258731842041, "learning_rate": 5.101762364980102e-05, "loss": 3.8054, "step": 2052 }, { "epoch": 2.8593314763231197, "grad_norm": 4.3150153160095215, "learning_rate": 5.1007674815235925e-05, "loss": 3.2175, "step": 2053 }, { "epoch": 2.860724233983287, "grad_norm": 4.273927211761475, "learning_rate": 5.099772598067083e-05, "loss": 3.3386, "step": 2054 }, { "epoch": 2.862116991643454, "grad_norm": 4.8481903076171875, "learning_rate": 5.098777714610574e-05, "loss": 3.4993, "step": 2055 }, { "epoch": 2.863509749303621, "grad_norm": 3.8663220405578613, "learning_rate": 5.0977828311540644e-05, "loss": 3.2226, "step": 2056 }, { "epoch": 2.864902506963788, "grad_norm": 4.236097812652588, "learning_rate": 5.096787947697555e-05, "loss": 3.1875, "step": 2057 }, { "epoch": 2.8662952646239557, "grad_norm": 5.388729572296143, "learning_rate": 5.095793064241045e-05, "loss": 3.0379, "step": 2058 }, { "epoch": 2.8676880222841223, "grad_norm": 3.7443923950195312, "learning_rate": 5.094798180784536e-05, "loss": 3.1198, "step": 2059 }, { "epoch": 2.86908077994429, "grad_norm": 3.982881546020508, "learning_rate": 5.093803297328027e-05, "loss": 3.639, "step": 2060 }, { "epoch": 2.8704735376044566, "grad_norm": 5.106269836425781, "learning_rate": 5.092808413871518e-05, "loss": 3.3559, "step": 2061 }, { "epoch": 2.871866295264624, "grad_norm": 4.366353511810303, "learning_rate": 5.0918135304150075e-05, "loss": 3.5405, "step": 2062 }, { "epoch": 2.8732590529247912, "grad_norm": 4.137770652770996, "learning_rate": 5.0908186469584986e-05, "loss": 3.283, "step": 2063 }, { "epoch": 2.8746518105849583, "grad_norm": 4.251780986785889, "learning_rate": 5.089823763501989e-05, "loss": 3.3503, "step": 2064 }, { "epoch": 2.8760445682451254, "grad_norm": 4.264405250549316, "learning_rate": 5.08882888004548e-05, "loss": 3.3128, "step": 2065 }, { "epoch": 2.8774373259052926, "grad_norm": 4.219046115875244, "learning_rate": 5.0878339965889705e-05, "loss": 3.2743, "step": 2066 }, { "epoch": 2.8788300835654597, "grad_norm": 4.376394271850586, "learning_rate": 5.086839113132461e-05, "loss": 3.7509, "step": 2067 }, { "epoch": 2.8802228412256268, "grad_norm": 3.643519401550293, "learning_rate": 5.085844229675951e-05, "loss": 3.1443, "step": 2068 }, { "epoch": 2.881615598885794, "grad_norm": 4.250158309936523, "learning_rate": 5.0848493462194424e-05, "loss": 3.3089, "step": 2069 }, { "epoch": 2.883008356545961, "grad_norm": 4.211243152618408, "learning_rate": 5.0838544627629335e-05, "loss": 3.6829, "step": 2070 }, { "epoch": 2.884401114206128, "grad_norm": 3.8670225143432617, "learning_rate": 5.082859579306424e-05, "loss": 3.1512, "step": 2071 }, { "epoch": 2.885793871866295, "grad_norm": 4.404150485992432, "learning_rate": 5.0818646958499136e-05, "loss": 3.4532, "step": 2072 }, { "epoch": 2.8871866295264623, "grad_norm": 5.2248101234436035, "learning_rate": 5.080869812393405e-05, "loss": 3.2853, "step": 2073 }, { "epoch": 2.8885793871866294, "grad_norm": 4.03637170791626, "learning_rate": 5.079874928936896e-05, "loss": 3.4545, "step": 2074 }, { "epoch": 2.8899721448467965, "grad_norm": 4.957915306091309, "learning_rate": 5.078880045480386e-05, "loss": 3.2843, "step": 2075 }, { "epoch": 2.8913649025069637, "grad_norm": 4.652523517608643, "learning_rate": 5.077885162023877e-05, "loss": 3.7638, "step": 2076 }, { "epoch": 2.8927576601671308, "grad_norm": 5.459745407104492, "learning_rate": 5.076890278567367e-05, "loss": 3.1137, "step": 2077 }, { "epoch": 2.894150417827298, "grad_norm": 4.156223773956299, "learning_rate": 5.075895395110858e-05, "loss": 3.0294, "step": 2078 }, { "epoch": 2.8955431754874654, "grad_norm": 4.485111236572266, "learning_rate": 5.0749005116543485e-05, "loss": 3.6693, "step": 2079 }, { "epoch": 2.896935933147632, "grad_norm": 4.370935440063477, "learning_rate": 5.0739056281978396e-05, "loss": 3.4302, "step": 2080 }, { "epoch": 2.8983286908077996, "grad_norm": 4.037283897399902, "learning_rate": 5.07291074474133e-05, "loss": 3.3939, "step": 2081 }, { "epoch": 2.8997214484679663, "grad_norm": 4.518899440765381, "learning_rate": 5.0719158612848204e-05, "loss": 3.8476, "step": 2082 }, { "epoch": 2.901114206128134, "grad_norm": 5.807830810546875, "learning_rate": 5.070920977828311e-05, "loss": 3.7524, "step": 2083 }, { "epoch": 2.902506963788301, "grad_norm": 5.00191593170166, "learning_rate": 5.069926094371802e-05, "loss": 3.457, "step": 2084 }, { "epoch": 2.903899721448468, "grad_norm": 4.1947503089904785, "learning_rate": 5.068931210915292e-05, "loss": 3.9833, "step": 2085 }, { "epoch": 2.905292479108635, "grad_norm": 4.5223259925842285, "learning_rate": 5.0679363274587834e-05, "loss": 3.3464, "step": 2086 }, { "epoch": 2.9066852367688023, "grad_norm": 4.826055526733398, "learning_rate": 5.066941444002273e-05, "loss": 3.7102, "step": 2087 }, { "epoch": 2.9080779944289694, "grad_norm": 3.685760498046875, "learning_rate": 5.065946560545764e-05, "loss": 3.3121, "step": 2088 }, { "epoch": 2.9094707520891365, "grad_norm": 4.318071365356445, "learning_rate": 5.0649516770892546e-05, "loss": 3.7514, "step": 2089 }, { "epoch": 2.9108635097493036, "grad_norm": 3.9932713508605957, "learning_rate": 5.063956793632746e-05, "loss": 3.6518, "step": 2090 }, { "epoch": 2.9122562674094707, "grad_norm": 5.094545841217041, "learning_rate": 5.062961910176236e-05, "loss": 3.3245, "step": 2091 }, { "epoch": 2.913649025069638, "grad_norm": 4.561112403869629, "learning_rate": 5.0619670267197265e-05, "loss": 3.7716, "step": 2092 }, { "epoch": 2.915041782729805, "grad_norm": 3.9744319915771484, "learning_rate": 5.060972143263217e-05, "loss": 2.7508, "step": 2093 }, { "epoch": 2.916434540389972, "grad_norm": 4.323774337768555, "learning_rate": 5.059977259806708e-05, "loss": 3.3937, "step": 2094 }, { "epoch": 2.917827298050139, "grad_norm": 4.414550304412842, "learning_rate": 5.0589823763501984e-05, "loss": 3.7095, "step": 2095 }, { "epoch": 2.9192200557103063, "grad_norm": 4.5342912673950195, "learning_rate": 5.0579874928936895e-05, "loss": 3.3317, "step": 2096 }, { "epoch": 2.9206128133704734, "grad_norm": 6.246424674987793, "learning_rate": 5.056992609437179e-05, "loss": 3.6733, "step": 2097 }, { "epoch": 2.9220055710306405, "grad_norm": 6.608816146850586, "learning_rate": 5.05599772598067e-05, "loss": 3.3776, "step": 2098 }, { "epoch": 2.9233983286908076, "grad_norm": 4.423455238342285, "learning_rate": 5.0550028425241614e-05, "loss": 3.4785, "step": 2099 }, { "epoch": 2.924791086350975, "grad_norm": 7.79443359375, "learning_rate": 5.054007959067652e-05, "loss": 3.7376, "step": 2100 }, { "epoch": 2.926183844011142, "grad_norm": 4.41956901550293, "learning_rate": 5.0530130756111416e-05, "loss": 3.5199, "step": 2101 }, { "epoch": 2.9275766016713094, "grad_norm": 4.584619522094727, "learning_rate": 5.0520181921546326e-05, "loss": 3.8453, "step": 2102 }, { "epoch": 2.928969359331476, "grad_norm": 4.029069900512695, "learning_rate": 5.051023308698124e-05, "loss": 3.2393, "step": 2103 }, { "epoch": 2.9303621169916436, "grad_norm": 4.323663711547852, "learning_rate": 5.050028425241614e-05, "loss": 3.7648, "step": 2104 }, { "epoch": 2.9317548746518107, "grad_norm": 4.1720404624938965, "learning_rate": 5.049033541785105e-05, "loss": 2.879, "step": 2105 }, { "epoch": 2.933147632311978, "grad_norm": 4.187463760375977, "learning_rate": 5.048038658328595e-05, "loss": 2.8754, "step": 2106 }, { "epoch": 2.934540389972145, "grad_norm": 4.060224533081055, "learning_rate": 5.047043774872086e-05, "loss": 3.123, "step": 2107 }, { "epoch": 2.935933147632312, "grad_norm": 4.4883904457092285, "learning_rate": 5.0460488914155764e-05, "loss": 3.6887, "step": 2108 }, { "epoch": 2.937325905292479, "grad_norm": 4.690547943115234, "learning_rate": 5.0450540079590675e-05, "loss": 3.657, "step": 2109 }, { "epoch": 2.9387186629526463, "grad_norm": 5.70568323135376, "learning_rate": 5.044059124502558e-05, "loss": 3.3645, "step": 2110 }, { "epoch": 2.9401114206128134, "grad_norm": 5.02766227722168, "learning_rate": 5.0430642410460483e-05, "loss": 3.3286, "step": 2111 }, { "epoch": 2.9415041782729805, "grad_norm": 4.652088165283203, "learning_rate": 5.042069357589539e-05, "loss": 3.1985, "step": 2112 }, { "epoch": 2.9428969359331476, "grad_norm": 4.628662586212158, "learning_rate": 5.04107447413303e-05, "loss": 3.6984, "step": 2113 }, { "epoch": 2.9442896935933147, "grad_norm": 3.9817137718200684, "learning_rate": 5.04007959067652e-05, "loss": 2.973, "step": 2114 }, { "epoch": 2.945682451253482, "grad_norm": 4.922610282897949, "learning_rate": 5.039084707220011e-05, "loss": 3.0331, "step": 2115 }, { "epoch": 2.947075208913649, "grad_norm": 3.9922170639038086, "learning_rate": 5.038089823763501e-05, "loss": 3.1564, "step": 2116 }, { "epoch": 2.948467966573816, "grad_norm": 4.425002574920654, "learning_rate": 5.037094940306992e-05, "loss": 3.6502, "step": 2117 }, { "epoch": 2.949860724233983, "grad_norm": 4.016625881195068, "learning_rate": 5.0361000568504826e-05, "loss": 3.3882, "step": 2118 }, { "epoch": 2.9512534818941503, "grad_norm": 4.493213653564453, "learning_rate": 5.0351051733939736e-05, "loss": 3.1698, "step": 2119 }, { "epoch": 2.9526462395543174, "grad_norm": 4.914352893829346, "learning_rate": 5.034110289937464e-05, "loss": 3.6761, "step": 2120 }, { "epoch": 2.954038997214485, "grad_norm": 5.080784320831299, "learning_rate": 5.0331154064809545e-05, "loss": 3.7529, "step": 2121 }, { "epoch": 2.9554317548746516, "grad_norm": 4.149218559265137, "learning_rate": 5.032120523024445e-05, "loss": 3.1979, "step": 2122 }, { "epoch": 2.956824512534819, "grad_norm": 9.52330493927002, "learning_rate": 5.031125639567936e-05, "loss": 2.9479, "step": 2123 }, { "epoch": 2.958217270194986, "grad_norm": 5.114642143249512, "learning_rate": 5.0301307561114264e-05, "loss": 3.9038, "step": 2124 }, { "epoch": 2.9596100278551534, "grad_norm": 4.347400665283203, "learning_rate": 5.0291358726549174e-05, "loss": 3.6515, "step": 2125 }, { "epoch": 2.9610027855153205, "grad_norm": 5.055492877960205, "learning_rate": 5.028140989198407e-05, "loss": 3.9448, "step": 2126 }, { "epoch": 2.9623955431754876, "grad_norm": 5.06788444519043, "learning_rate": 5.027146105741898e-05, "loss": 3.5084, "step": 2127 }, { "epoch": 2.9637883008356547, "grad_norm": 4.336512565612793, "learning_rate": 5.026151222285389e-05, "loss": 3.4349, "step": 2128 }, { "epoch": 2.965181058495822, "grad_norm": 3.7679922580718994, "learning_rate": 5.02515633882888e-05, "loss": 3.2156, "step": 2129 }, { "epoch": 2.966573816155989, "grad_norm": 5.449789047241211, "learning_rate": 5.024161455372371e-05, "loss": 3.5185, "step": 2130 }, { "epoch": 2.967966573816156, "grad_norm": 3.7605178356170654, "learning_rate": 5.0231665719158606e-05, "loss": 3.0766, "step": 2131 }, { "epoch": 2.969359331476323, "grad_norm": 3.99495267868042, "learning_rate": 5.0221716884593516e-05, "loss": 3.2622, "step": 2132 }, { "epoch": 2.9707520891364902, "grad_norm": 4.847777366638184, "learning_rate": 5.021176805002842e-05, "loss": 3.2505, "step": 2133 }, { "epoch": 2.9721448467966574, "grad_norm": 4.027635097503662, "learning_rate": 5.020181921546333e-05, "loss": 3.1319, "step": 2134 }, { "epoch": 2.9735376044568245, "grad_norm": 4.147630214691162, "learning_rate": 5.0191870380898235e-05, "loss": 3.1158, "step": 2135 }, { "epoch": 2.9749303621169916, "grad_norm": 4.148489475250244, "learning_rate": 5.018192154633314e-05, "loss": 3.3226, "step": 2136 }, { "epoch": 2.9763231197771587, "grad_norm": 4.217398643493652, "learning_rate": 5.0171972711768044e-05, "loss": 3.651, "step": 2137 }, { "epoch": 2.977715877437326, "grad_norm": 5.1949381828308105, "learning_rate": 5.0162023877202954e-05, "loss": 3.4924, "step": 2138 }, { "epoch": 2.979108635097493, "grad_norm": 4.664876461029053, "learning_rate": 5.015207504263786e-05, "loss": 3.4569, "step": 2139 }, { "epoch": 2.98050139275766, "grad_norm": 4.591869354248047, "learning_rate": 5.014212620807276e-05, "loss": 3.2384, "step": 2140 }, { "epoch": 2.981894150417827, "grad_norm": 4.423793792724609, "learning_rate": 5.013217737350767e-05, "loss": 3.1595, "step": 2141 }, { "epoch": 2.9832869080779947, "grad_norm": 4.307295322418213, "learning_rate": 5.012222853894258e-05, "loss": 3.0333, "step": 2142 }, { "epoch": 2.9846796657381613, "grad_norm": 4.102986812591553, "learning_rate": 5.011227970437748e-05, "loss": 3.3809, "step": 2143 }, { "epoch": 2.986072423398329, "grad_norm": 4.3870768547058105, "learning_rate": 5.010233086981239e-05, "loss": 3.4217, "step": 2144 }, { "epoch": 2.9874651810584956, "grad_norm": 3.7008109092712402, "learning_rate": 5.009238203524729e-05, "loss": 2.917, "step": 2145 }, { "epoch": 2.988857938718663, "grad_norm": 5.337913513183594, "learning_rate": 5.00824332006822e-05, "loss": 3.0094, "step": 2146 }, { "epoch": 2.9902506963788302, "grad_norm": 5.179196834564209, "learning_rate": 5.0072484366117105e-05, "loss": 3.2392, "step": 2147 }, { "epoch": 2.9916434540389973, "grad_norm": 5.839849472045898, "learning_rate": 5.0062535531552016e-05, "loss": 3.3579, "step": 2148 }, { "epoch": 2.9930362116991645, "grad_norm": 4.230762481689453, "learning_rate": 5.005258669698692e-05, "loss": 3.1051, "step": 2149 }, { "epoch": 2.9944289693593316, "grad_norm": 3.6010777950286865, "learning_rate": 5.0042637862421824e-05, "loss": 3.1442, "step": 2150 }, { "epoch": 2.9958217270194987, "grad_norm": 4.144538879394531, "learning_rate": 5.003268902785673e-05, "loss": 3.5637, "step": 2151 }, { "epoch": 2.997214484679666, "grad_norm": 3.8748812675476074, "learning_rate": 5.002274019329164e-05, "loss": 3.5695, "step": 2152 }, { "epoch": 2.998607242339833, "grad_norm": 3.9042489528656006, "learning_rate": 5.001279135872655e-05, "loss": 3.1174, "step": 2153 }, { "epoch": 3.0, "grad_norm": 3.9408371448516846, "learning_rate": 5.0002842524161454e-05, "loss": 3.4697, "step": 2154 }, { "epoch": 3.001392757660167, "grad_norm": 3.8024237155914307, "learning_rate": 4.999289368959635e-05, "loss": 3.0165, "step": 2155 }, { "epoch": 3.002785515320334, "grad_norm": 3.8605546951293945, "learning_rate": 4.998294485503126e-05, "loss": 2.6671, "step": 2156 }, { "epoch": 3.0041782729805013, "grad_norm": 4.001540660858154, "learning_rate": 4.997299602046617e-05, "loss": 2.9679, "step": 2157 }, { "epoch": 3.0055710306406684, "grad_norm": 3.791510581970215, "learning_rate": 4.9963047185901077e-05, "loss": 2.7511, "step": 2158 }, { "epoch": 3.0069637883008355, "grad_norm": 3.9625966548919678, "learning_rate": 4.995309835133599e-05, "loss": 2.7843, "step": 2159 }, { "epoch": 3.0083565459610027, "grad_norm": 4.191686630249023, "learning_rate": 4.9943149516770885e-05, "loss": 2.6372, "step": 2160 }, { "epoch": 3.0097493036211698, "grad_norm": 4.725612163543701, "learning_rate": 4.9933200682205796e-05, "loss": 2.6615, "step": 2161 }, { "epoch": 3.011142061281337, "grad_norm": 3.901984214782715, "learning_rate": 4.99232518476407e-05, "loss": 3.2304, "step": 2162 }, { "epoch": 3.012534818941504, "grad_norm": 4.5678229331970215, "learning_rate": 4.991330301307561e-05, "loss": 2.8722, "step": 2163 }, { "epoch": 3.013927576601671, "grad_norm": 3.9938695430755615, "learning_rate": 4.9903354178510515e-05, "loss": 2.9263, "step": 2164 }, { "epoch": 3.0153203342618387, "grad_norm": 4.176566123962402, "learning_rate": 4.989340534394542e-05, "loss": 2.9044, "step": 2165 }, { "epoch": 3.0167130919220058, "grad_norm": 4.377198696136475, "learning_rate": 4.988345650938032e-05, "loss": 3.1145, "step": 2166 }, { "epoch": 3.018105849582173, "grad_norm": 3.675611972808838, "learning_rate": 4.9873507674815234e-05, "loss": 3.1403, "step": 2167 }, { "epoch": 3.01949860724234, "grad_norm": 3.7560067176818848, "learning_rate": 4.986355884025014e-05, "loss": 3.1056, "step": 2168 }, { "epoch": 3.020891364902507, "grad_norm": 3.496814012527466, "learning_rate": 4.985361000568505e-05, "loss": 2.5276, "step": 2169 }, { "epoch": 3.022284122562674, "grad_norm": 3.7299299240112305, "learning_rate": 4.9843661171119946e-05, "loss": 2.7589, "step": 2170 }, { "epoch": 3.0236768802228413, "grad_norm": 4.047593593597412, "learning_rate": 4.983371233655486e-05, "loss": 3.0481, "step": 2171 }, { "epoch": 3.0250696378830084, "grad_norm": 3.7720863819122314, "learning_rate": 4.982376350198976e-05, "loss": 3.2035, "step": 2172 }, { "epoch": 3.0264623955431755, "grad_norm": 4.240333080291748, "learning_rate": 4.981381466742467e-05, "loss": 2.4098, "step": 2173 }, { "epoch": 3.0278551532033426, "grad_norm": 3.621859550476074, "learning_rate": 4.9803865832859576e-05, "loss": 3.319, "step": 2174 }, { "epoch": 3.0292479108635098, "grad_norm": 3.9396166801452637, "learning_rate": 4.979391699829448e-05, "loss": 2.6429, "step": 2175 }, { "epoch": 3.030640668523677, "grad_norm": 4.1023077964782715, "learning_rate": 4.9783968163729384e-05, "loss": 2.5816, "step": 2176 }, { "epoch": 3.032033426183844, "grad_norm": 3.9348316192626953, "learning_rate": 4.9774019329164295e-05, "loss": 3.1643, "step": 2177 }, { "epoch": 3.033426183844011, "grad_norm": 6.0089592933654785, "learning_rate": 4.97640704945992e-05, "loss": 3.0826, "step": 2178 }, { "epoch": 3.034818941504178, "grad_norm": 3.588162899017334, "learning_rate": 4.975412166003411e-05, "loss": 2.942, "step": 2179 }, { "epoch": 3.0362116991643453, "grad_norm": 5.2281718254089355, "learning_rate": 4.974417282546901e-05, "loss": 2.4846, "step": 2180 }, { "epoch": 3.0376044568245124, "grad_norm": 3.4828572273254395, "learning_rate": 4.973422399090392e-05, "loss": 2.6165, "step": 2181 }, { "epoch": 3.0389972144846795, "grad_norm": 4.738771438598633, "learning_rate": 4.972427515633883e-05, "loss": 3.1826, "step": 2182 }, { "epoch": 3.0403899721448466, "grad_norm": 3.785372018814087, "learning_rate": 4.971432632177373e-05, "loss": 2.872, "step": 2183 }, { "epoch": 3.0417827298050137, "grad_norm": 3.934426784515381, "learning_rate": 4.970437748720863e-05, "loss": 3.3355, "step": 2184 }, { "epoch": 3.043175487465181, "grad_norm": 4.093682765960693, "learning_rate": 4.969442865264354e-05, "loss": 3.0227, "step": 2185 }, { "epoch": 3.0445682451253484, "grad_norm": 3.8305344581604004, "learning_rate": 4.968447981807845e-05, "loss": 2.8497, "step": 2186 }, { "epoch": 3.0459610027855155, "grad_norm": 3.768501043319702, "learning_rate": 4.9674530983513356e-05, "loss": 3.252, "step": 2187 }, { "epoch": 3.0473537604456826, "grad_norm": 3.5746545791625977, "learning_rate": 4.966458214894827e-05, "loss": 2.6726, "step": 2188 }, { "epoch": 3.0487465181058497, "grad_norm": 3.641993999481201, "learning_rate": 4.9654633314383164e-05, "loss": 2.8944, "step": 2189 }, { "epoch": 3.050139275766017, "grad_norm": 3.740527629852295, "learning_rate": 4.9644684479818075e-05, "loss": 2.858, "step": 2190 }, { "epoch": 3.051532033426184, "grad_norm": 3.6628808975219727, "learning_rate": 4.963473564525298e-05, "loss": 2.5151, "step": 2191 }, { "epoch": 3.052924791086351, "grad_norm": 3.681525707244873, "learning_rate": 4.962478681068789e-05, "loss": 2.8237, "step": 2192 }, { "epoch": 3.054317548746518, "grad_norm": 4.520201206207275, "learning_rate": 4.9614837976122794e-05, "loss": 2.6566, "step": 2193 }, { "epoch": 3.0557103064066853, "grad_norm": 4.108491897583008, "learning_rate": 4.96048891415577e-05, "loss": 3.2071, "step": 2194 }, { "epoch": 3.0571030640668524, "grad_norm": 4.114054203033447, "learning_rate": 4.95949403069926e-05, "loss": 2.6665, "step": 2195 }, { "epoch": 3.0584958217270195, "grad_norm": 4.240559101104736, "learning_rate": 4.958499147242751e-05, "loss": 2.9796, "step": 2196 }, { "epoch": 3.0598885793871866, "grad_norm": 3.4781413078308105, "learning_rate": 4.957504263786242e-05, "loss": 2.5954, "step": 2197 }, { "epoch": 3.0612813370473537, "grad_norm": 3.296037435531616, "learning_rate": 4.956509380329733e-05, "loss": 2.4479, "step": 2198 }, { "epoch": 3.062674094707521, "grad_norm": 4.22749662399292, "learning_rate": 4.9555144968732225e-05, "loss": 2.7733, "step": 2199 }, { "epoch": 3.064066852367688, "grad_norm": 5.45635461807251, "learning_rate": 4.9545196134167136e-05, "loss": 2.6946, "step": 2200 }, { "epoch": 3.065459610027855, "grad_norm": 3.8924145698547363, "learning_rate": 4.953524729960204e-05, "loss": 3.0797, "step": 2201 }, { "epoch": 3.066852367688022, "grad_norm": 3.5238921642303467, "learning_rate": 4.952529846503695e-05, "loss": 2.869, "step": 2202 }, { "epoch": 3.0682451253481893, "grad_norm": 4.308964729309082, "learning_rate": 4.9515349630471855e-05, "loss": 3.0236, "step": 2203 }, { "epoch": 3.0696378830083564, "grad_norm": 4.2634477615356445, "learning_rate": 4.950540079590676e-05, "loss": 2.9298, "step": 2204 }, { "epoch": 3.0710306406685235, "grad_norm": 3.6639902591705322, "learning_rate": 4.949545196134166e-05, "loss": 2.8863, "step": 2205 }, { "epoch": 3.0724233983286906, "grad_norm": 3.3267507553100586, "learning_rate": 4.9485503126776574e-05, "loss": 2.7208, "step": 2206 }, { "epoch": 3.073816155988858, "grad_norm": 4.104797840118408, "learning_rate": 4.947555429221148e-05, "loss": 3.1285, "step": 2207 }, { "epoch": 3.0752089136490253, "grad_norm": 3.7130515575408936, "learning_rate": 4.946560545764639e-05, "loss": 2.8338, "step": 2208 }, { "epoch": 3.0766016713091924, "grad_norm": 4.037384510040283, "learning_rate": 4.9455656623081286e-05, "loss": 2.7723, "step": 2209 }, { "epoch": 3.0779944289693595, "grad_norm": 3.7449655532836914, "learning_rate": 4.94457077885162e-05, "loss": 2.7966, "step": 2210 }, { "epoch": 3.0793871866295266, "grad_norm": 3.6848769187927246, "learning_rate": 4.943575895395111e-05, "loss": 2.9968, "step": 2211 }, { "epoch": 3.0807799442896937, "grad_norm": 3.8089780807495117, "learning_rate": 4.942581011938601e-05, "loss": 2.7478, "step": 2212 }, { "epoch": 3.082172701949861, "grad_norm": 5.400775909423828, "learning_rate": 4.941586128482092e-05, "loss": 2.8524, "step": 2213 }, { "epoch": 3.083565459610028, "grad_norm": 4.017539978027344, "learning_rate": 4.940591245025582e-05, "loss": 2.6039, "step": 2214 }, { "epoch": 3.084958217270195, "grad_norm": 3.5601894855499268, "learning_rate": 4.939596361569073e-05, "loss": 2.9378, "step": 2215 }, { "epoch": 3.086350974930362, "grad_norm": 3.6896731853485107, "learning_rate": 4.9386014781125635e-05, "loss": 2.8304, "step": 2216 }, { "epoch": 3.0877437325905293, "grad_norm": 4.51615047454834, "learning_rate": 4.9376065946560546e-05, "loss": 3.2497, "step": 2217 }, { "epoch": 3.0891364902506964, "grad_norm": 4.1750102043151855, "learning_rate": 4.936611711199545e-05, "loss": 3.1508, "step": 2218 }, { "epoch": 3.0905292479108635, "grad_norm": 4.524855136871338, "learning_rate": 4.9356168277430354e-05, "loss": 2.818, "step": 2219 }, { "epoch": 3.0919220055710306, "grad_norm": 4.195107936859131, "learning_rate": 4.934621944286526e-05, "loss": 3.273, "step": 2220 }, { "epoch": 3.0933147632311977, "grad_norm": 5.05075740814209, "learning_rate": 4.933627060830017e-05, "loss": 2.6328, "step": 2221 }, { "epoch": 3.094707520891365, "grad_norm": 3.962237596511841, "learning_rate": 4.932632177373507e-05, "loss": 2.7848, "step": 2222 }, { "epoch": 3.096100278551532, "grad_norm": 4.436080455780029, "learning_rate": 4.9316372939169984e-05, "loss": 2.9761, "step": 2223 }, { "epoch": 3.097493036211699, "grad_norm": 4.083108901977539, "learning_rate": 4.930642410460488e-05, "loss": 3.057, "step": 2224 }, { "epoch": 3.098885793871866, "grad_norm": 3.324781894683838, "learning_rate": 4.929647527003979e-05, "loss": 2.4656, "step": 2225 }, { "epoch": 3.1002785515320332, "grad_norm": 3.7265758514404297, "learning_rate": 4.9286526435474696e-05, "loss": 2.6025, "step": 2226 }, { "epoch": 3.1016713091922004, "grad_norm": 4.01738166809082, "learning_rate": 4.927657760090961e-05, "loss": 2.8352, "step": 2227 }, { "epoch": 3.103064066852368, "grad_norm": 4.062448978424072, "learning_rate": 4.9266628766344504e-05, "loss": 3.2861, "step": 2228 }, { "epoch": 3.104456824512535, "grad_norm": 3.786773443222046, "learning_rate": 4.9256679931779415e-05, "loss": 2.6291, "step": 2229 }, { "epoch": 3.105849582172702, "grad_norm": 4.09287166595459, "learning_rate": 4.924673109721432e-05, "loss": 2.8467, "step": 2230 }, { "epoch": 3.1072423398328692, "grad_norm": 4.1100311279296875, "learning_rate": 4.923678226264923e-05, "loss": 2.9087, "step": 2231 }, { "epoch": 3.1086350974930363, "grad_norm": 3.9571471214294434, "learning_rate": 4.9226833428084134e-05, "loss": 2.4647, "step": 2232 }, { "epoch": 3.1100278551532035, "grad_norm": 4.0034260749816895, "learning_rate": 4.921688459351904e-05, "loss": 2.7365, "step": 2233 }, { "epoch": 3.1114206128133706, "grad_norm": 3.7767553329467773, "learning_rate": 4.920693575895394e-05, "loss": 2.9574, "step": 2234 }, { "epoch": 3.1128133704735377, "grad_norm": 6.777519702911377, "learning_rate": 4.919698692438885e-05, "loss": 2.9007, "step": 2235 }, { "epoch": 3.114206128133705, "grad_norm": 3.5743494033813477, "learning_rate": 4.918703808982376e-05, "loss": 2.678, "step": 2236 }, { "epoch": 3.115598885793872, "grad_norm": 3.626544237136841, "learning_rate": 4.917708925525867e-05, "loss": 2.7042, "step": 2237 }, { "epoch": 3.116991643454039, "grad_norm": 3.875890016555786, "learning_rate": 4.9167140420693565e-05, "loss": 3.0001, "step": 2238 }, { "epoch": 3.118384401114206, "grad_norm": 3.7919387817382812, "learning_rate": 4.9157191586128476e-05, "loss": 2.9725, "step": 2239 }, { "epoch": 3.1197771587743732, "grad_norm": 3.46844744682312, "learning_rate": 4.914724275156339e-05, "loss": 2.8531, "step": 2240 }, { "epoch": 3.1211699164345403, "grad_norm": 4.005001068115234, "learning_rate": 4.913729391699829e-05, "loss": 2.8883, "step": 2241 }, { "epoch": 3.1225626740947074, "grad_norm": 4.705630779266357, "learning_rate": 4.91273450824332e-05, "loss": 3.0957, "step": 2242 }, { "epoch": 3.1239554317548746, "grad_norm": 3.739565372467041, "learning_rate": 4.91173962478681e-05, "loss": 2.4562, "step": 2243 }, { "epoch": 3.1253481894150417, "grad_norm": 3.5640506744384766, "learning_rate": 4.910744741330301e-05, "loss": 3.0103, "step": 2244 }, { "epoch": 3.1267409470752088, "grad_norm": 3.751476764678955, "learning_rate": 4.9097498578737914e-05, "loss": 2.7388, "step": 2245 }, { "epoch": 3.128133704735376, "grad_norm": 3.8549933433532715, "learning_rate": 4.9087549744172825e-05, "loss": 2.7139, "step": 2246 }, { "epoch": 3.129526462395543, "grad_norm": 4.479499340057373, "learning_rate": 4.907760090960773e-05, "loss": 2.7457, "step": 2247 }, { "epoch": 3.13091922005571, "grad_norm": 3.822279691696167, "learning_rate": 4.906765207504263e-05, "loss": 2.8294, "step": 2248 }, { "epoch": 3.1323119777158777, "grad_norm": 3.825222969055176, "learning_rate": 4.905770324047754e-05, "loss": 2.8442, "step": 2249 }, { "epoch": 3.1337047353760448, "grad_norm": 4.0725531578063965, "learning_rate": 4.904775440591245e-05, "loss": 2.6, "step": 2250 }, { "epoch": 3.135097493036212, "grad_norm": 3.854177236557007, "learning_rate": 4.903780557134735e-05, "loss": 2.6668, "step": 2251 }, { "epoch": 3.136490250696379, "grad_norm": 3.6467463970184326, "learning_rate": 4.902785673678226e-05, "loss": 2.7115, "step": 2252 }, { "epoch": 3.137883008356546, "grad_norm": 3.687175989151001, "learning_rate": 4.901790790221716e-05, "loss": 2.8751, "step": 2253 }, { "epoch": 3.139275766016713, "grad_norm": 3.6928493976593018, "learning_rate": 4.900795906765207e-05, "loss": 2.992, "step": 2254 }, { "epoch": 3.1406685236768803, "grad_norm": 4.653807163238525, "learning_rate": 4.8998010233086975e-05, "loss": 2.9187, "step": 2255 }, { "epoch": 3.1420612813370474, "grad_norm": 3.7770557403564453, "learning_rate": 4.8988061398521886e-05, "loss": 3.0289, "step": 2256 }, { "epoch": 3.1434540389972145, "grad_norm": 3.6280288696289062, "learning_rate": 4.897811256395679e-05, "loss": 2.6832, "step": 2257 }, { "epoch": 3.1448467966573816, "grad_norm": 4.064112186431885, "learning_rate": 4.8968163729391694e-05, "loss": 3.05, "step": 2258 }, { "epoch": 3.1462395543175488, "grad_norm": 3.938138961791992, "learning_rate": 4.89582148948266e-05, "loss": 2.9412, "step": 2259 }, { "epoch": 3.147632311977716, "grad_norm": 3.529313802719116, "learning_rate": 4.894826606026151e-05, "loss": 2.9496, "step": 2260 }, { "epoch": 3.149025069637883, "grad_norm": 3.544926643371582, "learning_rate": 4.893831722569641e-05, "loss": 2.7199, "step": 2261 }, { "epoch": 3.15041782729805, "grad_norm": 3.8904011249542236, "learning_rate": 4.8928368391131324e-05, "loss": 3.0123, "step": 2262 }, { "epoch": 3.151810584958217, "grad_norm": 3.769857883453369, "learning_rate": 4.891841955656622e-05, "loss": 2.856, "step": 2263 }, { "epoch": 3.1532033426183843, "grad_norm": 3.689384698867798, "learning_rate": 4.890847072200113e-05, "loss": 2.8636, "step": 2264 }, { "epoch": 3.1545961002785514, "grad_norm": 3.706634044647217, "learning_rate": 4.8898521887436036e-05, "loss": 2.7097, "step": 2265 }, { "epoch": 3.1559888579387185, "grad_norm": 3.793800115585327, "learning_rate": 4.888857305287095e-05, "loss": 3.2398, "step": 2266 }, { "epoch": 3.1573816155988856, "grad_norm": 3.964437246322632, "learning_rate": 4.887862421830586e-05, "loss": 2.64, "step": 2267 }, { "epoch": 3.1587743732590527, "grad_norm": 3.7391197681427, "learning_rate": 4.8868675383740755e-05, "loss": 2.8607, "step": 2268 }, { "epoch": 3.16016713091922, "grad_norm": 4.067811012268066, "learning_rate": 4.8858726549175666e-05, "loss": 2.6981, "step": 2269 }, { "epoch": 3.1615598885793874, "grad_norm": 4.685571670532227, "learning_rate": 4.884877771461057e-05, "loss": 2.9726, "step": 2270 }, { "epoch": 3.1629526462395545, "grad_norm": 4.283031463623047, "learning_rate": 4.883882888004548e-05, "loss": 3.1427, "step": 2271 }, { "epoch": 3.1643454038997216, "grad_norm": 4.66605281829834, "learning_rate": 4.882888004548038e-05, "loss": 2.9865, "step": 2272 }, { "epoch": 3.1657381615598887, "grad_norm": 4.3901166915893555, "learning_rate": 4.881893121091529e-05, "loss": 3.0573, "step": 2273 }, { "epoch": 3.167130919220056, "grad_norm": 3.78541898727417, "learning_rate": 4.880898237635019e-05, "loss": 2.9784, "step": 2274 }, { "epoch": 3.168523676880223, "grad_norm": 3.5710551738739014, "learning_rate": 4.8799033541785104e-05, "loss": 2.7992, "step": 2275 }, { "epoch": 3.16991643454039, "grad_norm": 3.791133165359497, "learning_rate": 4.878908470722001e-05, "loss": 2.9119, "step": 2276 }, { "epoch": 3.171309192200557, "grad_norm": 3.72632098197937, "learning_rate": 4.877913587265491e-05, "loss": 3.1021, "step": 2277 }, { "epoch": 3.1727019498607243, "grad_norm": 3.671421766281128, "learning_rate": 4.8769187038089816e-05, "loss": 2.7327, "step": 2278 }, { "epoch": 3.1740947075208914, "grad_norm": 3.593618869781494, "learning_rate": 4.875923820352473e-05, "loss": 2.7006, "step": 2279 }, { "epoch": 3.1754874651810585, "grad_norm": 4.296664237976074, "learning_rate": 4.874928936895963e-05, "loss": 2.6943, "step": 2280 }, { "epoch": 3.1768802228412256, "grad_norm": 3.8954293727874756, "learning_rate": 4.873934053439454e-05, "loss": 2.8782, "step": 2281 }, { "epoch": 3.1782729805013927, "grad_norm": 6.061985969543457, "learning_rate": 4.872939169982944e-05, "loss": 2.8432, "step": 2282 }, { "epoch": 3.17966573816156, "grad_norm": 4.803857326507568, "learning_rate": 4.871944286526435e-05, "loss": 3.7857, "step": 2283 }, { "epoch": 3.181058495821727, "grad_norm": 4.052546501159668, "learning_rate": 4.8709494030699254e-05, "loss": 2.9245, "step": 2284 }, { "epoch": 3.182451253481894, "grad_norm": 6.619697570800781, "learning_rate": 4.8699545196134165e-05, "loss": 3.0243, "step": 2285 }, { "epoch": 3.183844011142061, "grad_norm": 4.285874843597412, "learning_rate": 4.868959636156907e-05, "loss": 2.9593, "step": 2286 }, { "epoch": 3.1852367688022283, "grad_norm": 3.6087124347686768, "learning_rate": 4.867964752700397e-05, "loss": 2.9436, "step": 2287 }, { "epoch": 3.1866295264623954, "grad_norm": 3.975787878036499, "learning_rate": 4.866969869243888e-05, "loss": 2.801, "step": 2288 }, { "epoch": 3.1880222841225625, "grad_norm": 4.284610271453857, "learning_rate": 4.865974985787379e-05, "loss": 2.9373, "step": 2289 }, { "epoch": 3.1894150417827296, "grad_norm": 4.059457778930664, "learning_rate": 4.864980102330869e-05, "loss": 2.5689, "step": 2290 }, { "epoch": 3.190807799442897, "grad_norm": 3.7924251556396484, "learning_rate": 4.86398521887436e-05, "loss": 2.8105, "step": 2291 }, { "epoch": 3.1922005571030643, "grad_norm": 4.193859577178955, "learning_rate": 4.86299033541785e-05, "loss": 2.7084, "step": 2292 }, { "epoch": 3.1935933147632314, "grad_norm": 3.9864838123321533, "learning_rate": 4.861995451961341e-05, "loss": 2.9286, "step": 2293 }, { "epoch": 3.1949860724233985, "grad_norm": 4.141392707824707, "learning_rate": 4.861000568504832e-05, "loss": 2.7396, "step": 2294 }, { "epoch": 3.1963788300835656, "grad_norm": 4.376644134521484, "learning_rate": 4.8600056850483226e-05, "loss": 3.1385, "step": 2295 }, { "epoch": 3.1977715877437327, "grad_norm": 3.9887776374816895, "learning_rate": 4.859010801591814e-05, "loss": 2.8945, "step": 2296 }, { "epoch": 3.1991643454039, "grad_norm": 4.6822004318237305, "learning_rate": 4.8580159181353034e-05, "loss": 3.0602, "step": 2297 }, { "epoch": 3.200557103064067, "grad_norm": 4.151376247406006, "learning_rate": 4.8570210346787945e-05, "loss": 2.9018, "step": 2298 }, { "epoch": 3.201949860724234, "grad_norm": 4.49393892288208, "learning_rate": 4.856026151222285e-05, "loss": 3.2492, "step": 2299 }, { "epoch": 3.203342618384401, "grad_norm": 3.828725814819336, "learning_rate": 4.855031267765776e-05, "loss": 2.7855, "step": 2300 }, { "epoch": 3.2047353760445683, "grad_norm": 3.9831230640411377, "learning_rate": 4.8540363843092664e-05, "loss": 3.0632, "step": 2301 }, { "epoch": 3.2061281337047354, "grad_norm": 4.14377498626709, "learning_rate": 4.853041500852757e-05, "loss": 3.1884, "step": 2302 }, { "epoch": 3.2075208913649025, "grad_norm": 3.670644521713257, "learning_rate": 4.852046617396247e-05, "loss": 3.0383, "step": 2303 }, { "epoch": 3.2089136490250696, "grad_norm": 3.681523323059082, "learning_rate": 4.851051733939738e-05, "loss": 2.7217, "step": 2304 }, { "epoch": 3.2103064066852367, "grad_norm": 3.5931689739227295, "learning_rate": 4.850056850483229e-05, "loss": 2.8416, "step": 2305 }, { "epoch": 3.211699164345404, "grad_norm": 4.240254878997803, "learning_rate": 4.84906196702672e-05, "loss": 2.602, "step": 2306 }, { "epoch": 3.213091922005571, "grad_norm": 4.246129035949707, "learning_rate": 4.8480670835702095e-05, "loss": 3.0062, "step": 2307 }, { "epoch": 3.214484679665738, "grad_norm": 4.037646770477295, "learning_rate": 4.8470722001137006e-05, "loss": 2.8875, "step": 2308 }, { "epoch": 3.215877437325905, "grad_norm": 4.305235385894775, "learning_rate": 4.846077316657191e-05, "loss": 3.1422, "step": 2309 }, { "epoch": 3.2172701949860723, "grad_norm": 3.8170394897460938, "learning_rate": 4.845082433200682e-05, "loss": 2.6108, "step": 2310 }, { "epoch": 3.2186629526462394, "grad_norm": 4.461042404174805, "learning_rate": 4.844087549744172e-05, "loss": 2.9229, "step": 2311 }, { "epoch": 3.220055710306407, "grad_norm": 6.392706871032715, "learning_rate": 4.843092666287663e-05, "loss": 3.0824, "step": 2312 }, { "epoch": 3.2214484679665736, "grad_norm": 5.156744956970215, "learning_rate": 4.8420977828311533e-05, "loss": 2.6974, "step": 2313 }, { "epoch": 3.222841225626741, "grad_norm": 4.121723651885986, "learning_rate": 4.8411028993746444e-05, "loss": 2.751, "step": 2314 }, { "epoch": 3.2242339832869082, "grad_norm": 4.308473110198975, "learning_rate": 4.840108015918135e-05, "loss": 2.9132, "step": 2315 }, { "epoch": 3.2256267409470754, "grad_norm": 4.490592956542969, "learning_rate": 4.839113132461625e-05, "loss": 3.0115, "step": 2316 }, { "epoch": 3.2270194986072425, "grad_norm": 4.198948860168457, "learning_rate": 4.8381182490051157e-05, "loss": 3.1584, "step": 2317 }, { "epoch": 3.2284122562674096, "grad_norm": 4.588038921356201, "learning_rate": 4.837123365548607e-05, "loss": 3.1666, "step": 2318 }, { "epoch": 3.2298050139275767, "grad_norm": 4.044364929199219, "learning_rate": 4.836128482092097e-05, "loss": 2.7106, "step": 2319 }, { "epoch": 3.231197771587744, "grad_norm": 3.7738356590270996, "learning_rate": 4.835133598635588e-05, "loss": 2.654, "step": 2320 }, { "epoch": 3.232590529247911, "grad_norm": 4.125843048095703, "learning_rate": 4.834138715179078e-05, "loss": 3.1454, "step": 2321 }, { "epoch": 3.233983286908078, "grad_norm": 3.8941538333892822, "learning_rate": 4.833143831722569e-05, "loss": 2.7563, "step": 2322 }, { "epoch": 3.235376044568245, "grad_norm": 3.7710914611816406, "learning_rate": 4.83214894826606e-05, "loss": 2.5314, "step": 2323 }, { "epoch": 3.2367688022284122, "grad_norm": 3.6377623081207275, "learning_rate": 4.8311540648095505e-05, "loss": 3.0108, "step": 2324 }, { "epoch": 3.2381615598885793, "grad_norm": 4.007593631744385, "learning_rate": 4.8301591813530416e-05, "loss": 2.9897, "step": 2325 }, { "epoch": 3.2395543175487465, "grad_norm": 3.7874231338500977, "learning_rate": 4.8291642978965314e-05, "loss": 2.3803, "step": 2326 }, { "epoch": 3.2409470752089136, "grad_norm": 4.07582426071167, "learning_rate": 4.8281694144400224e-05, "loss": 3.0804, "step": 2327 }, { "epoch": 3.2423398328690807, "grad_norm": 4.140418529510498, "learning_rate": 4.827174530983513e-05, "loss": 2.8582, "step": 2328 }, { "epoch": 3.243732590529248, "grad_norm": 3.6024129390716553, "learning_rate": 4.826179647527004e-05, "loss": 2.9104, "step": 2329 }, { "epoch": 3.245125348189415, "grad_norm": 5.591879367828369, "learning_rate": 4.8251847640704943e-05, "loss": 2.6721, "step": 2330 }, { "epoch": 3.246518105849582, "grad_norm": 4.41704797744751, "learning_rate": 4.824189880613985e-05, "loss": 2.9928, "step": 2331 }, { "epoch": 3.247910863509749, "grad_norm": 4.577050685882568, "learning_rate": 4.823194997157475e-05, "loss": 2.8255, "step": 2332 }, { "epoch": 3.2493036211699167, "grad_norm": 4.430422782897949, "learning_rate": 4.822200113700966e-05, "loss": 3.1678, "step": 2333 }, { "epoch": 3.2506963788300833, "grad_norm": 5.460785865783691, "learning_rate": 4.8212052302444566e-05, "loss": 2.6286, "step": 2334 }, { "epoch": 3.252089136490251, "grad_norm": 5.823190212249756, "learning_rate": 4.820210346787948e-05, "loss": 2.568, "step": 2335 }, { "epoch": 3.253481894150418, "grad_norm": 3.726700782775879, "learning_rate": 4.8192154633314375e-05, "loss": 3.0072, "step": 2336 }, { "epoch": 3.254874651810585, "grad_norm": 3.914973735809326, "learning_rate": 4.8182205798749285e-05, "loss": 2.8886, "step": 2337 }, { "epoch": 3.256267409470752, "grad_norm": 4.194888114929199, "learning_rate": 4.817225696418419e-05, "loss": 2.8432, "step": 2338 }, { "epoch": 3.2576601671309193, "grad_norm": 3.9118857383728027, "learning_rate": 4.81623081296191e-05, "loss": 2.8313, "step": 2339 }, { "epoch": 3.2590529247910864, "grad_norm": 3.5121490955352783, "learning_rate": 4.8152359295054004e-05, "loss": 2.5016, "step": 2340 }, { "epoch": 3.2604456824512535, "grad_norm": 4.009439468383789, "learning_rate": 4.814241046048891e-05, "loss": 2.7574, "step": 2341 }, { "epoch": 3.2618384401114207, "grad_norm": 7.137579917907715, "learning_rate": 4.813246162592381e-05, "loss": 3.2454, "step": 2342 }, { "epoch": 3.2632311977715878, "grad_norm": 3.6958656311035156, "learning_rate": 4.8122512791358723e-05, "loss": 2.8281, "step": 2343 }, { "epoch": 3.264623955431755, "grad_norm": 3.7219202518463135, "learning_rate": 4.811256395679363e-05, "loss": 2.6826, "step": 2344 }, { "epoch": 3.266016713091922, "grad_norm": 3.782970905303955, "learning_rate": 4.810261512222854e-05, "loss": 2.6493, "step": 2345 }, { "epoch": 3.267409470752089, "grad_norm": 3.8960306644439697, "learning_rate": 4.8092666287663436e-05, "loss": 3.3011, "step": 2346 }, { "epoch": 3.268802228412256, "grad_norm": 4.1278252601623535, "learning_rate": 4.8082717453098347e-05, "loss": 3.2684, "step": 2347 }, { "epoch": 3.2701949860724233, "grad_norm": 3.858278512954712, "learning_rate": 4.807276861853325e-05, "loss": 2.7097, "step": 2348 }, { "epoch": 3.2715877437325904, "grad_norm": 4.56193208694458, "learning_rate": 4.806281978396816e-05, "loss": 2.8349, "step": 2349 }, { "epoch": 3.2729805013927575, "grad_norm": 4.361737251281738, "learning_rate": 4.805287094940307e-05, "loss": 2.8348, "step": 2350 }, { "epoch": 3.2743732590529246, "grad_norm": 3.825388193130493, "learning_rate": 4.804292211483797e-05, "loss": 2.6212, "step": 2351 }, { "epoch": 3.2757660167130918, "grad_norm": 4.141374588012695, "learning_rate": 4.803297328027288e-05, "loss": 2.9403, "step": 2352 }, { "epoch": 3.277158774373259, "grad_norm": 3.733556032180786, "learning_rate": 4.8023024445707785e-05, "loss": 2.7187, "step": 2353 }, { "epoch": 3.2785515320334264, "grad_norm": 4.972504138946533, "learning_rate": 4.8013075611142695e-05, "loss": 2.8249, "step": 2354 }, { "epoch": 3.279944289693593, "grad_norm": 4.3905839920043945, "learning_rate": 4.800312677657759e-05, "loss": 2.6179, "step": 2355 }, { "epoch": 3.2813370473537606, "grad_norm": 4.249562740325928, "learning_rate": 4.7993177942012504e-05, "loss": 2.8928, "step": 2356 }, { "epoch": 3.2827298050139277, "grad_norm": 4.65199613571167, "learning_rate": 4.798322910744741e-05, "loss": 2.766, "step": 2357 }, { "epoch": 3.284122562674095, "grad_norm": 3.633455514907837, "learning_rate": 4.797328027288232e-05, "loss": 2.6158, "step": 2358 }, { "epoch": 3.285515320334262, "grad_norm": 3.461538791656494, "learning_rate": 4.796333143831722e-05, "loss": 2.3918, "step": 2359 }, { "epoch": 3.286908077994429, "grad_norm": 3.6453847885131836, "learning_rate": 4.795338260375213e-05, "loss": 2.8247, "step": 2360 }, { "epoch": 3.288300835654596, "grad_norm": 4.0668158531188965, "learning_rate": 4.794343376918703e-05, "loss": 2.8784, "step": 2361 }, { "epoch": 3.2896935933147633, "grad_norm": 3.821699857711792, "learning_rate": 4.793348493462194e-05, "loss": 2.9337, "step": 2362 }, { "epoch": 3.2910863509749304, "grad_norm": 4.014733791351318, "learning_rate": 4.7923536100056846e-05, "loss": 3.0672, "step": 2363 }, { "epoch": 3.2924791086350975, "grad_norm": 3.88468599319458, "learning_rate": 4.7913587265491757e-05, "loss": 2.5454, "step": 2364 }, { "epoch": 3.2938718662952646, "grad_norm": 3.78873872756958, "learning_rate": 4.7903638430926654e-05, "loss": 2.8817, "step": 2365 }, { "epoch": 3.2952646239554317, "grad_norm": 4.950428485870361, "learning_rate": 4.7893689596361565e-05, "loss": 2.8254, "step": 2366 }, { "epoch": 3.296657381615599, "grad_norm": 3.688955545425415, "learning_rate": 4.788374076179647e-05, "loss": 2.8069, "step": 2367 }, { "epoch": 3.298050139275766, "grad_norm": 4.237142562866211, "learning_rate": 4.787379192723138e-05, "loss": 3.0093, "step": 2368 }, { "epoch": 3.299442896935933, "grad_norm": 3.334515333175659, "learning_rate": 4.7863843092666284e-05, "loss": 2.3511, "step": 2369 }, { "epoch": 3.3008356545961, "grad_norm": 3.505284309387207, "learning_rate": 4.785389425810119e-05, "loss": 2.2776, "step": 2370 }, { "epoch": 3.3022284122562673, "grad_norm": 4.798534870147705, "learning_rate": 4.784394542353609e-05, "loss": 3.3179, "step": 2371 }, { "epoch": 3.3036211699164344, "grad_norm": 4.9942240715026855, "learning_rate": 4.7833996588971e-05, "loss": 2.6916, "step": 2372 }, { "epoch": 3.3050139275766015, "grad_norm": 3.5696606636047363, "learning_rate": 4.782404775440591e-05, "loss": 2.7702, "step": 2373 }, { "epoch": 3.3064066852367686, "grad_norm": 3.6908276081085205, "learning_rate": 4.781409891984082e-05, "loss": 2.6051, "step": 2374 }, { "epoch": 3.307799442896936, "grad_norm": 4.1333394050598145, "learning_rate": 4.7804150085275715e-05, "loss": 2.917, "step": 2375 }, { "epoch": 3.309192200557103, "grad_norm": 4.748862266540527, "learning_rate": 4.7794201250710626e-05, "loss": 2.6613, "step": 2376 }, { "epoch": 3.3105849582172704, "grad_norm": 4.2038254737854, "learning_rate": 4.778425241614553e-05, "loss": 2.8604, "step": 2377 }, { "epoch": 3.3119777158774375, "grad_norm": 4.11339807510376, "learning_rate": 4.777430358158044e-05, "loss": 2.9838, "step": 2378 }, { "epoch": 3.3133704735376046, "grad_norm": 3.785566806793213, "learning_rate": 4.776435474701535e-05, "loss": 2.4237, "step": 2379 }, { "epoch": 3.3147632311977717, "grad_norm": 3.6763899326324463, "learning_rate": 4.775440591245025e-05, "loss": 2.7251, "step": 2380 }, { "epoch": 3.316155988857939, "grad_norm": 3.59271502494812, "learning_rate": 4.774445707788516e-05, "loss": 2.3965, "step": 2381 }, { "epoch": 3.317548746518106, "grad_norm": 4.120853424072266, "learning_rate": 4.7734508243320064e-05, "loss": 2.7869, "step": 2382 }, { "epoch": 3.318941504178273, "grad_norm": 3.6389341354370117, "learning_rate": 4.7724559408754975e-05, "loss": 2.7214, "step": 2383 }, { "epoch": 3.32033426183844, "grad_norm": 10.967327117919922, "learning_rate": 4.771461057418988e-05, "loss": 2.7669, "step": 2384 }, { "epoch": 3.3217270194986073, "grad_norm": 3.867356061935425, "learning_rate": 4.770466173962478e-05, "loss": 2.6242, "step": 2385 }, { "epoch": 3.3231197771587744, "grad_norm": 3.5651192665100098, "learning_rate": 4.769471290505969e-05, "loss": 2.4967, "step": 2386 }, { "epoch": 3.3245125348189415, "grad_norm": 3.5389902591705322, "learning_rate": 4.76847640704946e-05, "loss": 2.7493, "step": 2387 }, { "epoch": 3.3259052924791086, "grad_norm": 6.526866912841797, "learning_rate": 4.76748152359295e-05, "loss": 2.8418, "step": 2388 }, { "epoch": 3.3272980501392757, "grad_norm": 4.658627510070801, "learning_rate": 4.766486640136441e-05, "loss": 2.858, "step": 2389 }, { "epoch": 3.328690807799443, "grad_norm": 3.680142641067505, "learning_rate": 4.765491756679931e-05, "loss": 2.7746, "step": 2390 }, { "epoch": 3.33008356545961, "grad_norm": 4.72170877456665, "learning_rate": 4.764496873223422e-05, "loss": 2.8055, "step": 2391 }, { "epoch": 3.331476323119777, "grad_norm": 4.868217468261719, "learning_rate": 4.7635019897669125e-05, "loss": 2.6525, "step": 2392 }, { "epoch": 3.332869080779944, "grad_norm": 4.732217311859131, "learning_rate": 4.7625071063104036e-05, "loss": 2.8297, "step": 2393 }, { "epoch": 3.3342618384401113, "grad_norm": 3.6755013465881348, "learning_rate": 4.761512222853894e-05, "loss": 2.8107, "step": 2394 }, { "epoch": 3.3356545961002784, "grad_norm": 4.290006160736084, "learning_rate": 4.7605173393973844e-05, "loss": 2.7585, "step": 2395 }, { "epoch": 3.337047353760446, "grad_norm": 3.6859662532806396, "learning_rate": 4.759522455940875e-05, "loss": 2.469, "step": 2396 }, { "epoch": 3.3384401114206126, "grad_norm": 3.754054069519043, "learning_rate": 4.758527572484366e-05, "loss": 2.9326, "step": 2397 }, { "epoch": 3.33983286908078, "grad_norm": 4.631336688995361, "learning_rate": 4.757532689027856e-05, "loss": 3.0036, "step": 2398 }, { "epoch": 3.3412256267409473, "grad_norm": 3.606992244720459, "learning_rate": 4.756537805571347e-05, "loss": 2.7054, "step": 2399 }, { "epoch": 3.3426183844011144, "grad_norm": 3.855879545211792, "learning_rate": 4.755542922114837e-05, "loss": 2.8321, "step": 2400 }, { "epoch": 3.3440111420612815, "grad_norm": 3.460355520248413, "learning_rate": 4.754548038658328e-05, "loss": 2.5688, "step": 2401 }, { "epoch": 3.3454038997214486, "grad_norm": 4.121874809265137, "learning_rate": 4.7535531552018186e-05, "loss": 2.8504, "step": 2402 }, { "epoch": 3.3467966573816157, "grad_norm": 4.156680583953857, "learning_rate": 4.75255827174531e-05, "loss": 2.8232, "step": 2403 }, { "epoch": 3.348189415041783, "grad_norm": 3.8161020278930664, "learning_rate": 4.7515633882887994e-05, "loss": 2.6313, "step": 2404 }, { "epoch": 3.34958217270195, "grad_norm": 4.528938293457031, "learning_rate": 4.7505685048322905e-05, "loss": 2.9876, "step": 2405 }, { "epoch": 3.350974930362117, "grad_norm": 3.8600189685821533, "learning_rate": 4.749573621375781e-05, "loss": 2.6049, "step": 2406 }, { "epoch": 3.352367688022284, "grad_norm": 5.053176403045654, "learning_rate": 4.748578737919272e-05, "loss": 3.0565, "step": 2407 }, { "epoch": 3.3537604456824512, "grad_norm": 4.079180717468262, "learning_rate": 4.747583854462763e-05, "loss": 2.7893, "step": 2408 }, { "epoch": 3.3551532033426184, "grad_norm": 3.8171560764312744, "learning_rate": 4.746588971006253e-05, "loss": 2.7338, "step": 2409 }, { "epoch": 3.3565459610027855, "grad_norm": 3.907892942428589, "learning_rate": 4.745594087549744e-05, "loss": 2.8174, "step": 2410 }, { "epoch": 3.3579387186629526, "grad_norm": 7.531633377075195, "learning_rate": 4.744599204093234e-05, "loss": 3.2603, "step": 2411 }, { "epoch": 3.3593314763231197, "grad_norm": 4.269933223724365, "learning_rate": 4.7436043206367254e-05, "loss": 2.9665, "step": 2412 }, { "epoch": 3.360724233983287, "grad_norm": 3.293668508529663, "learning_rate": 4.742609437180216e-05, "loss": 2.3873, "step": 2413 }, { "epoch": 3.362116991643454, "grad_norm": 4.437375545501709, "learning_rate": 4.741614553723706e-05, "loss": 2.9374, "step": 2414 }, { "epoch": 3.363509749303621, "grad_norm": 3.826282024383545, "learning_rate": 4.7406196702671966e-05, "loss": 2.9442, "step": 2415 }, { "epoch": 3.364902506963788, "grad_norm": 3.6247544288635254, "learning_rate": 4.739624786810688e-05, "loss": 2.9575, "step": 2416 }, { "epoch": 3.3662952646239557, "grad_norm": 3.819434404373169, "learning_rate": 4.738629903354178e-05, "loss": 2.6576, "step": 2417 }, { "epoch": 3.3676880222841223, "grad_norm": 4.170529365539551, "learning_rate": 4.737635019897669e-05, "loss": 2.7199, "step": 2418 }, { "epoch": 3.36908077994429, "grad_norm": 3.944037914276123, "learning_rate": 4.736640136441159e-05, "loss": 2.5178, "step": 2419 }, { "epoch": 3.370473537604457, "grad_norm": 4.178440570831299, "learning_rate": 4.73564525298465e-05, "loss": 2.5889, "step": 2420 }, { "epoch": 3.371866295264624, "grad_norm": 4.817466735839844, "learning_rate": 4.7346503695281404e-05, "loss": 2.7761, "step": 2421 }, { "epoch": 3.3732590529247912, "grad_norm": 4.657197952270508, "learning_rate": 4.7336554860716315e-05, "loss": 2.6743, "step": 2422 }, { "epoch": 3.3746518105849583, "grad_norm": 4.162113189697266, "learning_rate": 4.732660602615122e-05, "loss": 2.6157, "step": 2423 }, { "epoch": 3.3760445682451254, "grad_norm": 3.604059934616089, "learning_rate": 4.731665719158612e-05, "loss": 2.6157, "step": 2424 }, { "epoch": 3.3774373259052926, "grad_norm": 4.041855812072754, "learning_rate": 4.730670835702103e-05, "loss": 2.8309, "step": 2425 }, { "epoch": 3.3788300835654597, "grad_norm": 3.923802614212036, "learning_rate": 4.729675952245594e-05, "loss": 3.0207, "step": 2426 }, { "epoch": 3.3802228412256268, "grad_norm": 4.154032230377197, "learning_rate": 4.728681068789084e-05, "loss": 3.1268, "step": 2427 }, { "epoch": 3.381615598885794, "grad_norm": 3.595344066619873, "learning_rate": 4.727686185332575e-05, "loss": 2.6528, "step": 2428 }, { "epoch": 3.383008356545961, "grad_norm": 4.08375883102417, "learning_rate": 4.726691301876065e-05, "loss": 2.7236, "step": 2429 }, { "epoch": 3.384401114206128, "grad_norm": 4.021947383880615, "learning_rate": 4.725696418419556e-05, "loss": 2.6887, "step": 2430 }, { "epoch": 3.385793871866295, "grad_norm": 4.529158115386963, "learning_rate": 4.7247015349630465e-05, "loss": 3.2312, "step": 2431 }, { "epoch": 3.3871866295264623, "grad_norm": 4.145749568939209, "learning_rate": 4.7237066515065376e-05, "loss": 2.9076, "step": 2432 }, { "epoch": 3.3885793871866294, "grad_norm": 3.9654574394226074, "learning_rate": 4.722711768050029e-05, "loss": 2.9585, "step": 2433 }, { "epoch": 3.3899721448467965, "grad_norm": 4.129126071929932, "learning_rate": 4.7217168845935184e-05, "loss": 2.7876, "step": 2434 }, { "epoch": 3.3913649025069637, "grad_norm": 4.0180182456970215, "learning_rate": 4.7207220011370095e-05, "loss": 2.8569, "step": 2435 }, { "epoch": 3.3927576601671308, "grad_norm": 6.775938510894775, "learning_rate": 4.7197271176805e-05, "loss": 2.8111, "step": 2436 }, { "epoch": 3.394150417827298, "grad_norm": 4.824405193328857, "learning_rate": 4.718732234223991e-05, "loss": 2.7421, "step": 2437 }, { "epoch": 3.3955431754874654, "grad_norm": 4.232151985168457, "learning_rate": 4.7177373507674814e-05, "loss": 2.6584, "step": 2438 }, { "epoch": 3.396935933147632, "grad_norm": 3.910310745239258, "learning_rate": 4.716742467310972e-05, "loss": 2.7436, "step": 2439 }, { "epoch": 3.3983286908077996, "grad_norm": 3.6283116340637207, "learning_rate": 4.715747583854462e-05, "loss": 2.9569, "step": 2440 }, { "epoch": 3.3997214484679668, "grad_norm": 4.020989894866943, "learning_rate": 4.714752700397953e-05, "loss": 2.662, "step": 2441 }, { "epoch": 3.401114206128134, "grad_norm": 4.190573215484619, "learning_rate": 4.713757816941444e-05, "loss": 2.8246, "step": 2442 }, { "epoch": 3.402506963788301, "grad_norm": 4.062912464141846, "learning_rate": 4.712762933484934e-05, "loss": 2.9073, "step": 2443 }, { "epoch": 3.403899721448468, "grad_norm": 4.1267170906066895, "learning_rate": 4.7117680500284245e-05, "loss": 3.395, "step": 2444 }, { "epoch": 3.405292479108635, "grad_norm": 3.808474063873291, "learning_rate": 4.7107731665719156e-05, "loss": 2.4232, "step": 2445 }, { "epoch": 3.4066852367688023, "grad_norm": 4.739579677581787, "learning_rate": 4.709778283115406e-05, "loss": 2.6484, "step": 2446 }, { "epoch": 3.4080779944289694, "grad_norm": 3.631610155105591, "learning_rate": 4.708783399658897e-05, "loss": 2.6488, "step": 2447 }, { "epoch": 3.4094707520891365, "grad_norm": 3.909418821334839, "learning_rate": 4.707788516202387e-05, "loss": 2.7904, "step": 2448 }, { "epoch": 3.4108635097493036, "grad_norm": 3.8568201065063477, "learning_rate": 4.706793632745878e-05, "loss": 2.4642, "step": 2449 }, { "epoch": 3.4122562674094707, "grad_norm": 3.8794257640838623, "learning_rate": 4.705798749289368e-05, "loss": 2.9015, "step": 2450 }, { "epoch": 3.413649025069638, "grad_norm": 4.410316467285156, "learning_rate": 4.7048038658328594e-05, "loss": 2.4628, "step": 2451 }, { "epoch": 3.415041782729805, "grad_norm": 4.268566131591797, "learning_rate": 4.70380898237635e-05, "loss": 2.8573, "step": 2452 }, { "epoch": 3.416434540389972, "grad_norm": 3.9342827796936035, "learning_rate": 4.70281409891984e-05, "loss": 2.292, "step": 2453 }, { "epoch": 3.417827298050139, "grad_norm": 4.145666599273682, "learning_rate": 4.7018192154633306e-05, "loss": 3.166, "step": 2454 }, { "epoch": 3.4192200557103063, "grad_norm": 5.339357376098633, "learning_rate": 4.700824332006822e-05, "loss": 2.6585, "step": 2455 }, { "epoch": 3.4206128133704734, "grad_norm": 3.945842981338501, "learning_rate": 4.699829448550312e-05, "loss": 2.5021, "step": 2456 }, { "epoch": 3.4220055710306405, "grad_norm": 3.6610825061798096, "learning_rate": 4.698834565093803e-05, "loss": 2.8762, "step": 2457 }, { "epoch": 3.4233983286908076, "grad_norm": 3.8504912853240967, "learning_rate": 4.697839681637293e-05, "loss": 2.8399, "step": 2458 }, { "epoch": 3.424791086350975, "grad_norm": 5.630061626434326, "learning_rate": 4.696844798180784e-05, "loss": 2.7062, "step": 2459 }, { "epoch": 3.426183844011142, "grad_norm": 4.1890692710876465, "learning_rate": 4.6958499147242744e-05, "loss": 2.9676, "step": 2460 }, { "epoch": 3.4275766016713094, "grad_norm": 3.9940645694732666, "learning_rate": 4.6948550312677655e-05, "loss": 2.5669, "step": 2461 }, { "epoch": 3.4289693593314765, "grad_norm": 3.853699207305908, "learning_rate": 4.6938601478112566e-05, "loss": 2.9325, "step": 2462 }, { "epoch": 3.4303621169916436, "grad_norm": 3.8538451194763184, "learning_rate": 4.692865264354746e-05, "loss": 2.9975, "step": 2463 }, { "epoch": 3.4317548746518107, "grad_norm": 4.6403069496154785, "learning_rate": 4.6918703808982374e-05, "loss": 3.2556, "step": 2464 }, { "epoch": 3.433147632311978, "grad_norm": 4.27023983001709, "learning_rate": 4.690875497441728e-05, "loss": 2.9309, "step": 2465 }, { "epoch": 3.434540389972145, "grad_norm": 3.9356207847595215, "learning_rate": 4.689880613985219e-05, "loss": 2.5403, "step": 2466 }, { "epoch": 3.435933147632312, "grad_norm": 4.530642986297607, "learning_rate": 4.688885730528709e-05, "loss": 2.6984, "step": 2467 }, { "epoch": 3.437325905292479, "grad_norm": 3.673515558242798, "learning_rate": 4.6878908470722e-05, "loss": 2.4632, "step": 2468 }, { "epoch": 3.4387186629526463, "grad_norm": 3.577712059020996, "learning_rate": 4.68689596361569e-05, "loss": 2.5888, "step": 2469 }, { "epoch": 3.4401114206128134, "grad_norm": 3.673774242401123, "learning_rate": 4.685901080159181e-05, "loss": 2.502, "step": 2470 }, { "epoch": 3.4415041782729805, "grad_norm": 4.250201225280762, "learning_rate": 4.6849061967026716e-05, "loss": 2.9086, "step": 2471 }, { "epoch": 3.4428969359331476, "grad_norm": 4.001884937286377, "learning_rate": 4.683911313246163e-05, "loss": 2.6975, "step": 2472 }, { "epoch": 3.4442896935933147, "grad_norm": 4.061591148376465, "learning_rate": 4.6829164297896524e-05, "loss": 2.492, "step": 2473 }, { "epoch": 3.445682451253482, "grad_norm": 3.9153151512145996, "learning_rate": 4.6819215463331435e-05, "loss": 2.8333, "step": 2474 }, { "epoch": 3.447075208913649, "grad_norm": 3.529660940170288, "learning_rate": 4.680926662876634e-05, "loss": 2.6082, "step": 2475 }, { "epoch": 3.448467966573816, "grad_norm": 3.9232723712921143, "learning_rate": 4.679931779420125e-05, "loss": 2.8638, "step": 2476 }, { "epoch": 3.449860724233983, "grad_norm": 3.741406202316284, "learning_rate": 4.6789368959636154e-05, "loss": 2.4546, "step": 2477 }, { "epoch": 3.4512534818941503, "grad_norm": 3.93326997756958, "learning_rate": 4.677942012507106e-05, "loss": 3.0198, "step": 2478 }, { "epoch": 3.4526462395543174, "grad_norm": 4.116133213043213, "learning_rate": 4.676947129050596e-05, "loss": 2.5921, "step": 2479 }, { "epoch": 3.4540389972144845, "grad_norm": 4.393657684326172, "learning_rate": 4.675952245594087e-05, "loss": 2.7356, "step": 2480 }, { "epoch": 3.4554317548746516, "grad_norm": 4.12740421295166, "learning_rate": 4.674957362137578e-05, "loss": 2.7773, "step": 2481 }, { "epoch": 3.456824512534819, "grad_norm": 4.11367654800415, "learning_rate": 4.673962478681069e-05, "loss": 3.455, "step": 2482 }, { "epoch": 3.4582172701949863, "grad_norm": 4.02070426940918, "learning_rate": 4.6729675952245585e-05, "loss": 3.1256, "step": 2483 }, { "epoch": 3.4596100278551534, "grad_norm": 4.215537071228027, "learning_rate": 4.6719727117680496e-05, "loss": 2.6202, "step": 2484 }, { "epoch": 3.4610027855153205, "grad_norm": 3.678616523742676, "learning_rate": 4.67097782831154e-05, "loss": 2.9246, "step": 2485 }, { "epoch": 3.4623955431754876, "grad_norm": 5.492940902709961, "learning_rate": 4.669982944855031e-05, "loss": 2.6919, "step": 2486 }, { "epoch": 3.4637883008356547, "grad_norm": 4.754304885864258, "learning_rate": 4.668988061398521e-05, "loss": 2.8382, "step": 2487 }, { "epoch": 3.465181058495822, "grad_norm": 3.8414769172668457, "learning_rate": 4.667993177942012e-05, "loss": 2.2719, "step": 2488 }, { "epoch": 3.466573816155989, "grad_norm": 3.788646697998047, "learning_rate": 4.666998294485502e-05, "loss": 2.2803, "step": 2489 }, { "epoch": 3.467966573816156, "grad_norm": 3.948223114013672, "learning_rate": 4.6660034110289934e-05, "loss": 2.5775, "step": 2490 }, { "epoch": 3.469359331476323, "grad_norm": 4.155910968780518, "learning_rate": 4.6650085275724845e-05, "loss": 2.9139, "step": 2491 }, { "epoch": 3.4707520891364902, "grad_norm": 6.739853858947754, "learning_rate": 4.664013644115974e-05, "loss": 2.9066, "step": 2492 }, { "epoch": 3.4721448467966574, "grad_norm": 4.00689172744751, "learning_rate": 4.663018760659465e-05, "loss": 2.856, "step": 2493 }, { "epoch": 3.4735376044568245, "grad_norm": 3.6261701583862305, "learning_rate": 4.662023877202956e-05, "loss": 2.8077, "step": 2494 }, { "epoch": 3.4749303621169916, "grad_norm": 3.793961524963379, "learning_rate": 4.661028993746447e-05, "loss": 2.6904, "step": 2495 }, { "epoch": 3.4763231197771587, "grad_norm": 4.114292621612549, "learning_rate": 4.660034110289937e-05, "loss": 3.1544, "step": 2496 }, { "epoch": 3.477715877437326, "grad_norm": 3.8729336261749268, "learning_rate": 4.6590392268334276e-05, "loss": 2.6785, "step": 2497 }, { "epoch": 3.479108635097493, "grad_norm": 4.672178268432617, "learning_rate": 4.658044343376918e-05, "loss": 2.5692, "step": 2498 }, { "epoch": 3.48050139275766, "grad_norm": 3.973593235015869, "learning_rate": 4.657049459920409e-05, "loss": 2.5247, "step": 2499 }, { "epoch": 3.481894150417827, "grad_norm": 4.204497814178467, "learning_rate": 4.6560545764638995e-05, "loss": 3.3036, "step": 2500 }, { "epoch": 3.4832869080779942, "grad_norm": 3.5338776111602783, "learning_rate": 4.6550596930073906e-05, "loss": 2.2795, "step": 2501 }, { "epoch": 3.4846796657381613, "grad_norm": 3.7186532020568848, "learning_rate": 4.6540648095508803e-05, "loss": 2.4927, "step": 2502 }, { "epoch": 3.486072423398329, "grad_norm": 4.113531112670898, "learning_rate": 4.6530699260943714e-05, "loss": 2.3812, "step": 2503 }, { "epoch": 3.487465181058496, "grad_norm": 3.9955506324768066, "learning_rate": 4.652075042637862e-05, "loss": 3.0143, "step": 2504 }, { "epoch": 3.488857938718663, "grad_norm": 3.6784656047821045, "learning_rate": 4.651080159181353e-05, "loss": 2.3338, "step": 2505 }, { "epoch": 3.4902506963788302, "grad_norm": 4.05626916885376, "learning_rate": 4.650085275724843e-05, "loss": 2.8512, "step": 2506 }, { "epoch": 3.4916434540389973, "grad_norm": 4.049886226654053, "learning_rate": 4.649090392268334e-05, "loss": 2.8965, "step": 2507 }, { "epoch": 3.4930362116991645, "grad_norm": 4.836441993713379, "learning_rate": 4.648095508811824e-05, "loss": 2.6381, "step": 2508 }, { "epoch": 3.4944289693593316, "grad_norm": 4.007538318634033, "learning_rate": 4.647100625355315e-05, "loss": 2.3314, "step": 2509 }, { "epoch": 3.4958217270194987, "grad_norm": 5.4083662033081055, "learning_rate": 4.6461057418988056e-05, "loss": 2.4072, "step": 2510 }, { "epoch": 3.497214484679666, "grad_norm": 3.6691737174987793, "learning_rate": 4.645110858442297e-05, "loss": 2.3187, "step": 2511 }, { "epoch": 3.498607242339833, "grad_norm": 5.101102828979492, "learning_rate": 4.6441159749857865e-05, "loss": 2.8893, "step": 2512 }, { "epoch": 3.5, "grad_norm": 3.9152889251708984, "learning_rate": 4.6431210915292775e-05, "loss": 2.4472, "step": 2513 }, { "epoch": 3.501392757660167, "grad_norm": 4.1873555183410645, "learning_rate": 4.642126208072768e-05, "loss": 2.7775, "step": 2514 }, { "epoch": 3.502785515320334, "grad_norm": 3.612882614135742, "learning_rate": 4.641131324616259e-05, "loss": 2.703, "step": 2515 }, { "epoch": 3.5041782729805013, "grad_norm": 4.355597972869873, "learning_rate": 4.64013644115975e-05, "loss": 2.9082, "step": 2516 }, { "epoch": 3.5055710306406684, "grad_norm": 3.797044277191162, "learning_rate": 4.63914155770324e-05, "loss": 2.6401, "step": 2517 }, { "epoch": 3.5069637883008355, "grad_norm": 3.89890456199646, "learning_rate": 4.63814667424673e-05, "loss": 2.5919, "step": 2518 }, { "epoch": 3.5083565459610027, "grad_norm": 3.6076433658599854, "learning_rate": 4.637151790790221e-05, "loss": 2.422, "step": 2519 }, { "epoch": 3.5097493036211698, "grad_norm": 3.984490156173706, "learning_rate": 4.6361569073337124e-05, "loss": 2.6891, "step": 2520 }, { "epoch": 3.511142061281337, "grad_norm": 3.5938799381256104, "learning_rate": 4.635162023877203e-05, "loss": 2.7263, "step": 2521 }, { "epoch": 3.5125348189415044, "grad_norm": 4.015036106109619, "learning_rate": 4.634167140420693e-05, "loss": 2.8112, "step": 2522 }, { "epoch": 3.513927576601671, "grad_norm": 3.9240949153900146, "learning_rate": 4.6331722569641836e-05, "loss": 2.7094, "step": 2523 }, { "epoch": 3.5153203342618387, "grad_norm": 4.509311676025391, "learning_rate": 4.632177373507675e-05, "loss": 3.0363, "step": 2524 }, { "epoch": 3.5167130919220053, "grad_norm": 4.242368221282959, "learning_rate": 4.631182490051165e-05, "loss": 2.9884, "step": 2525 }, { "epoch": 3.518105849582173, "grad_norm": 5.659360885620117, "learning_rate": 4.6301876065946555e-05, "loss": 2.782, "step": 2526 }, { "epoch": 3.51949860724234, "grad_norm": 4.399890422821045, "learning_rate": 4.629192723138146e-05, "loss": 2.7044, "step": 2527 }, { "epoch": 3.520891364902507, "grad_norm": 4.138412952423096, "learning_rate": 4.628197839681637e-05, "loss": 2.7621, "step": 2528 }, { "epoch": 3.522284122562674, "grad_norm": 4.267613887786865, "learning_rate": 4.6272029562251274e-05, "loss": 2.5129, "step": 2529 }, { "epoch": 3.5236768802228413, "grad_norm": 3.9596104621887207, "learning_rate": 4.6262080727686185e-05, "loss": 2.6278, "step": 2530 }, { "epoch": 3.5250696378830084, "grad_norm": 3.815171003341675, "learning_rate": 4.625213189312108e-05, "loss": 2.4257, "step": 2531 }, { "epoch": 3.5264623955431755, "grad_norm": 5.524709224700928, "learning_rate": 4.6242183058555993e-05, "loss": 2.7696, "step": 2532 }, { "epoch": 3.5278551532033426, "grad_norm": 4.13335657119751, "learning_rate": 4.62322342239909e-05, "loss": 3.011, "step": 2533 }, { "epoch": 3.5292479108635098, "grad_norm": 4.052757263183594, "learning_rate": 4.622228538942581e-05, "loss": 2.6474, "step": 2534 }, { "epoch": 3.530640668523677, "grad_norm": 3.7928733825683594, "learning_rate": 4.621233655486071e-05, "loss": 2.4821, "step": 2535 }, { "epoch": 3.532033426183844, "grad_norm": 3.940757989883423, "learning_rate": 4.6202387720295617e-05, "loss": 2.5799, "step": 2536 }, { "epoch": 3.533426183844011, "grad_norm": 4.096225738525391, "learning_rate": 4.619243888573052e-05, "loss": 2.8428, "step": 2537 }, { "epoch": 3.534818941504178, "grad_norm": 4.849141597747803, "learning_rate": 4.618249005116543e-05, "loss": 2.3243, "step": 2538 }, { "epoch": 3.5362116991643453, "grad_norm": 4.354057312011719, "learning_rate": 4.6172541216600336e-05, "loss": 2.5553, "step": 2539 }, { "epoch": 3.5376044568245124, "grad_norm": 4.655442714691162, "learning_rate": 4.6162592382035246e-05, "loss": 2.5603, "step": 2540 }, { "epoch": 3.5389972144846795, "grad_norm": 3.835197687149048, "learning_rate": 4.6152643547470144e-05, "loss": 2.3249, "step": 2541 }, { "epoch": 3.5403899721448466, "grad_norm": 3.771993398666382, "learning_rate": 4.6142694712905055e-05, "loss": 2.9533, "step": 2542 }, { "epoch": 3.541782729805014, "grad_norm": 4.148719310760498, "learning_rate": 4.613274587833996e-05, "loss": 2.7036, "step": 2543 }, { "epoch": 3.543175487465181, "grad_norm": 4.081655025482178, "learning_rate": 4.612279704377487e-05, "loss": 2.9815, "step": 2544 }, { "epoch": 3.5445682451253484, "grad_norm": 4.131512641906738, "learning_rate": 4.611284820920978e-05, "loss": 2.6722, "step": 2545 }, { "epoch": 3.545961002785515, "grad_norm": 4.983314514160156, "learning_rate": 4.610289937464468e-05, "loss": 2.4037, "step": 2546 }, { "epoch": 3.5473537604456826, "grad_norm": 4.0679097175598145, "learning_rate": 4.609295054007959e-05, "loss": 2.5493, "step": 2547 }, { "epoch": 3.5487465181058497, "grad_norm": 3.8854141235351562, "learning_rate": 4.608300170551449e-05, "loss": 2.9201, "step": 2548 }, { "epoch": 3.550139275766017, "grad_norm": 4.404861927032471, "learning_rate": 4.6073052870949403e-05, "loss": 2.3592, "step": 2549 }, { "epoch": 3.551532033426184, "grad_norm": 4.275940895080566, "learning_rate": 4.606310403638431e-05, "loss": 2.4952, "step": 2550 }, { "epoch": 3.552924791086351, "grad_norm": 4.227753162384033, "learning_rate": 4.605315520181921e-05, "loss": 2.7161, "step": 2551 }, { "epoch": 3.554317548746518, "grad_norm": 4.181053638458252, "learning_rate": 4.6043206367254116e-05, "loss": 2.8687, "step": 2552 }, { "epoch": 3.5557103064066853, "grad_norm": 3.793886661529541, "learning_rate": 4.6033257532689026e-05, "loss": 2.7349, "step": 2553 }, { "epoch": 3.5571030640668524, "grad_norm": 3.7882189750671387, "learning_rate": 4.602330869812393e-05, "loss": 2.308, "step": 2554 }, { "epoch": 3.5584958217270195, "grad_norm": 3.880540370941162, "learning_rate": 4.601335986355884e-05, "loss": 2.4352, "step": 2555 }, { "epoch": 3.5598885793871866, "grad_norm": 3.4791016578674316, "learning_rate": 4.600341102899374e-05, "loss": 2.4757, "step": 2556 }, { "epoch": 3.5612813370473537, "grad_norm": 3.6199772357940674, "learning_rate": 4.599346219442865e-05, "loss": 2.4552, "step": 2557 }, { "epoch": 3.562674094707521, "grad_norm": 4.231076240539551, "learning_rate": 4.5983513359863554e-05, "loss": 2.8545, "step": 2558 }, { "epoch": 3.564066852367688, "grad_norm": 3.7487900257110596, "learning_rate": 4.5973564525298464e-05, "loss": 2.5915, "step": 2559 }, { "epoch": 3.565459610027855, "grad_norm": 4.234863758087158, "learning_rate": 4.596361569073337e-05, "loss": 2.4, "step": 2560 }, { "epoch": 3.566852367688022, "grad_norm": 4.43113374710083, "learning_rate": 4.595366685616827e-05, "loss": 2.569, "step": 2561 }, { "epoch": 3.5682451253481893, "grad_norm": 4.24310827255249, "learning_rate": 4.594371802160318e-05, "loss": 2.8166, "step": 2562 }, { "epoch": 3.5696378830083564, "grad_norm": 4.14475679397583, "learning_rate": 4.593376918703809e-05, "loss": 2.6704, "step": 2563 }, { "epoch": 3.571030640668524, "grad_norm": 3.697046995162964, "learning_rate": 4.592382035247299e-05, "loss": 2.403, "step": 2564 }, { "epoch": 3.5724233983286906, "grad_norm": 3.756159543991089, "learning_rate": 4.59138715179079e-05, "loss": 2.9002, "step": 2565 }, { "epoch": 3.573816155988858, "grad_norm": 3.5814194679260254, "learning_rate": 4.59039226833428e-05, "loss": 2.3633, "step": 2566 }, { "epoch": 3.575208913649025, "grad_norm": 3.5276687145233154, "learning_rate": 4.589397384877771e-05, "loss": 2.3979, "step": 2567 }, { "epoch": 3.5766016713091924, "grad_norm": 3.988581418991089, "learning_rate": 4.5884025014212615e-05, "loss": 2.6663, "step": 2568 }, { "epoch": 3.5779944289693595, "grad_norm": 3.8402259349823, "learning_rate": 4.5874076179647526e-05, "loss": 2.734, "step": 2569 }, { "epoch": 3.5793871866295266, "grad_norm": 4.858250141143799, "learning_rate": 4.586412734508242e-05, "loss": 2.404, "step": 2570 }, { "epoch": 3.5807799442896937, "grad_norm": 4.333425998687744, "learning_rate": 4.5854178510517334e-05, "loss": 3.1859, "step": 2571 }, { "epoch": 3.582172701949861, "grad_norm": 4.770310878753662, "learning_rate": 4.584422967595224e-05, "loss": 2.8104, "step": 2572 }, { "epoch": 3.583565459610028, "grad_norm": 4.867776870727539, "learning_rate": 4.583428084138715e-05, "loss": 2.8107, "step": 2573 }, { "epoch": 3.584958217270195, "grad_norm": 3.9517855644226074, "learning_rate": 4.582433200682206e-05, "loss": 2.9689, "step": 2574 }, { "epoch": 3.586350974930362, "grad_norm": 3.9247171878814697, "learning_rate": 4.581438317225696e-05, "loss": 2.4604, "step": 2575 }, { "epoch": 3.5877437325905293, "grad_norm": 3.239447832107544, "learning_rate": 4.580443433769187e-05, "loss": 2.2073, "step": 2576 }, { "epoch": 3.5891364902506964, "grad_norm": 3.8720920085906982, "learning_rate": 4.579448550312677e-05, "loss": 2.6184, "step": 2577 }, { "epoch": 3.5905292479108635, "grad_norm": 4.217468738555908, "learning_rate": 4.578453666856168e-05, "loss": 2.8072, "step": 2578 }, { "epoch": 3.5919220055710306, "grad_norm": 4.635210990905762, "learning_rate": 4.577458783399659e-05, "loss": 2.4138, "step": 2579 }, { "epoch": 3.5933147632311977, "grad_norm": 4.103201389312744, "learning_rate": 4.576463899943149e-05, "loss": 2.5774, "step": 2580 }, { "epoch": 3.594707520891365, "grad_norm": 4.105129241943359, "learning_rate": 4.5754690164866395e-05, "loss": 2.5725, "step": 2581 }, { "epoch": 3.596100278551532, "grad_norm": 4.445010185241699, "learning_rate": 4.5744741330301306e-05, "loss": 2.3895, "step": 2582 }, { "epoch": 3.597493036211699, "grad_norm": 3.87135910987854, "learning_rate": 4.573479249573621e-05, "loss": 2.5712, "step": 2583 }, { "epoch": 3.598885793871866, "grad_norm": 4.35500955581665, "learning_rate": 4.572484366117112e-05, "loss": 2.7363, "step": 2584 }, { "epoch": 3.6002785515320337, "grad_norm": 4.664234638214111, "learning_rate": 4.571489482660602e-05, "loss": 2.9104, "step": 2585 }, { "epoch": 3.6016713091922004, "grad_norm": 4.105116844177246, "learning_rate": 4.570494599204093e-05, "loss": 2.3681, "step": 2586 }, { "epoch": 3.603064066852368, "grad_norm": 3.8014397621154785, "learning_rate": 4.569499715747583e-05, "loss": 2.685, "step": 2587 }, { "epoch": 3.6044568245125346, "grad_norm": 4.089648723602295, "learning_rate": 4.5685048322910744e-05, "loss": 2.4354, "step": 2588 }, { "epoch": 3.605849582172702, "grad_norm": 4.380812168121338, "learning_rate": 4.567509948834565e-05, "loss": 2.6941, "step": 2589 }, { "epoch": 3.6072423398328692, "grad_norm": 4.799055576324463, "learning_rate": 4.566515065378055e-05, "loss": 3.1246, "step": 2590 }, { "epoch": 3.6086350974930363, "grad_norm": 4.490792274475098, "learning_rate": 4.5655201819215456e-05, "loss": 2.8987, "step": 2591 }, { "epoch": 3.6100278551532035, "grad_norm": 3.71956467628479, "learning_rate": 4.564525298465037e-05, "loss": 2.7376, "step": 2592 }, { "epoch": 3.6114206128133706, "grad_norm": 3.9480626583099365, "learning_rate": 4.563530415008527e-05, "loss": 2.6768, "step": 2593 }, { "epoch": 3.6128133704735377, "grad_norm": 5.132822513580322, "learning_rate": 4.562535531552018e-05, "loss": 2.8901, "step": 2594 }, { "epoch": 3.614206128133705, "grad_norm": 4.388399600982666, "learning_rate": 4.561540648095508e-05, "loss": 2.8447, "step": 2595 }, { "epoch": 3.615598885793872, "grad_norm": 3.8116705417633057, "learning_rate": 4.560545764638999e-05, "loss": 2.5332, "step": 2596 }, { "epoch": 3.616991643454039, "grad_norm": 6.826498985290527, "learning_rate": 4.5595508811824894e-05, "loss": 2.6834, "step": 2597 }, { "epoch": 3.618384401114206, "grad_norm": 3.922801971435547, "learning_rate": 4.5585559977259805e-05, "loss": 2.9515, "step": 2598 }, { "epoch": 3.6197771587743732, "grad_norm": 4.208038806915283, "learning_rate": 4.5575611142694716e-05, "loss": 2.5785, "step": 2599 }, { "epoch": 3.6211699164345403, "grad_norm": 3.956136465072632, "learning_rate": 4.556566230812961e-05, "loss": 2.4827, "step": 2600 }, { "epoch": 3.6225626740947074, "grad_norm": 4.067508697509766, "learning_rate": 4.555571347356452e-05, "loss": 2.242, "step": 2601 }, { "epoch": 3.6239554317548746, "grad_norm": 4.087154388427734, "learning_rate": 4.554576463899943e-05, "loss": 2.6914, "step": 2602 }, { "epoch": 3.6253481894150417, "grad_norm": 3.8476922512054443, "learning_rate": 4.553581580443434e-05, "loss": 2.582, "step": 2603 }, { "epoch": 3.6267409470752088, "grad_norm": 3.9368791580200195, "learning_rate": 4.552586696986924e-05, "loss": 2.1679, "step": 2604 }, { "epoch": 3.628133704735376, "grad_norm": 4.296043395996094, "learning_rate": 4.551591813530415e-05, "loss": 2.8044, "step": 2605 }, { "epoch": 3.6295264623955434, "grad_norm": 4.281557559967041, "learning_rate": 4.550596930073905e-05, "loss": 2.8132, "step": 2606 }, { "epoch": 3.63091922005571, "grad_norm": 3.6667840480804443, "learning_rate": 4.549602046617396e-05, "loss": 2.562, "step": 2607 }, { "epoch": 3.6323119777158777, "grad_norm": 3.9428160190582275, "learning_rate": 4.5486071631608866e-05, "loss": 2.726, "step": 2608 }, { "epoch": 3.6337047353760443, "grad_norm": 4.410280227661133, "learning_rate": 4.547612279704378e-05, "loss": 2.5515, "step": 2609 }, { "epoch": 3.635097493036212, "grad_norm": 3.685770034790039, "learning_rate": 4.5466173962478674e-05, "loss": 2.566, "step": 2610 }, { "epoch": 3.636490250696379, "grad_norm": 5.172934055328369, "learning_rate": 4.5456225127913585e-05, "loss": 2.1931, "step": 2611 }, { "epoch": 3.637883008356546, "grad_norm": 3.7498984336853027, "learning_rate": 4.544627629334849e-05, "loss": 2.2915, "step": 2612 }, { "epoch": 3.639275766016713, "grad_norm": 3.7322449684143066, "learning_rate": 4.54363274587834e-05, "loss": 2.2766, "step": 2613 }, { "epoch": 3.6406685236768803, "grad_norm": 4.4291090965271, "learning_rate": 4.54263786242183e-05, "loss": 2.5531, "step": 2614 }, { "epoch": 3.6420612813370474, "grad_norm": 4.109873294830322, "learning_rate": 4.541642978965321e-05, "loss": 2.727, "step": 2615 }, { "epoch": 3.6434540389972145, "grad_norm": 3.9789352416992188, "learning_rate": 4.540648095508811e-05, "loss": 2.346, "step": 2616 }, { "epoch": 3.6448467966573816, "grad_norm": 3.961202383041382, "learning_rate": 4.539653212052302e-05, "loss": 2.4444, "step": 2617 }, { "epoch": 3.6462395543175488, "grad_norm": 3.8635852336883545, "learning_rate": 4.538658328595793e-05, "loss": 2.6535, "step": 2618 }, { "epoch": 3.647632311977716, "grad_norm": 3.8671016693115234, "learning_rate": 4.537663445139283e-05, "loss": 2.5828, "step": 2619 }, { "epoch": 3.649025069637883, "grad_norm": 3.676393985748291, "learning_rate": 4.5366685616827735e-05, "loss": 2.4813, "step": 2620 }, { "epoch": 3.65041782729805, "grad_norm": 4.046565532684326, "learning_rate": 4.5356736782262646e-05, "loss": 2.5864, "step": 2621 }, { "epoch": 3.651810584958217, "grad_norm": 3.296393394470215, "learning_rate": 4.534678794769755e-05, "loss": 2.194, "step": 2622 }, { "epoch": 3.6532033426183843, "grad_norm": 3.507030725479126, "learning_rate": 4.533683911313246e-05, "loss": 2.6079, "step": 2623 }, { "epoch": 3.6545961002785514, "grad_norm": 4.159409523010254, "learning_rate": 4.532689027856736e-05, "loss": 2.7253, "step": 2624 }, { "epoch": 3.6559888579387185, "grad_norm": 3.7661807537078857, "learning_rate": 4.531694144400227e-05, "loss": 2.7663, "step": 2625 }, { "epoch": 3.6573816155988856, "grad_norm": 4.4413323402404785, "learning_rate": 4.530699260943717e-05, "loss": 2.7119, "step": 2626 }, { "epoch": 3.658774373259053, "grad_norm": 4.101630210876465, "learning_rate": 4.5297043774872084e-05, "loss": 2.089, "step": 2627 }, { "epoch": 3.66016713091922, "grad_norm": 4.0640034675598145, "learning_rate": 4.5287094940306995e-05, "loss": 2.997, "step": 2628 }, { "epoch": 3.6615598885793874, "grad_norm": 3.9334235191345215, "learning_rate": 4.527714610574189e-05, "loss": 2.6158, "step": 2629 }, { "epoch": 3.662952646239554, "grad_norm": 4.678755760192871, "learning_rate": 4.5267197271176796e-05, "loss": 2.5875, "step": 2630 }, { "epoch": 3.6643454038997216, "grad_norm": 3.4286279678344727, "learning_rate": 4.525724843661171e-05, "loss": 2.301, "step": 2631 }, { "epoch": 3.6657381615598887, "grad_norm": 4.978024959564209, "learning_rate": 4.524729960204662e-05, "loss": 3.3852, "step": 2632 }, { "epoch": 3.667130919220056, "grad_norm": 3.922593116760254, "learning_rate": 4.523735076748152e-05, "loss": 2.5403, "step": 2633 }, { "epoch": 3.668523676880223, "grad_norm": 3.587224245071411, "learning_rate": 4.5227401932916426e-05, "loss": 2.6945, "step": 2634 }, { "epoch": 3.66991643454039, "grad_norm": 3.586855173110962, "learning_rate": 4.521745309835133e-05, "loss": 2.4708, "step": 2635 }, { "epoch": 3.671309192200557, "grad_norm": 4.244228839874268, "learning_rate": 4.520750426378624e-05, "loss": 2.5793, "step": 2636 }, { "epoch": 3.6727019498607243, "grad_norm": 3.6994094848632812, "learning_rate": 4.5197555429221145e-05, "loss": 2.2796, "step": 2637 }, { "epoch": 3.6740947075208914, "grad_norm": 3.9471349716186523, "learning_rate": 4.5187606594656056e-05, "loss": 2.9239, "step": 2638 }, { "epoch": 3.6754874651810585, "grad_norm": 3.8247509002685547, "learning_rate": 4.517765776009095e-05, "loss": 2.3052, "step": 2639 }, { "epoch": 3.6768802228412256, "grad_norm": 4.448849678039551, "learning_rate": 4.5167708925525864e-05, "loss": 2.5112, "step": 2640 }, { "epoch": 3.6782729805013927, "grad_norm": 4.322947978973389, "learning_rate": 4.515776009096077e-05, "loss": 2.663, "step": 2641 }, { "epoch": 3.67966573816156, "grad_norm": 3.362961769104004, "learning_rate": 4.514781125639568e-05, "loss": 2.422, "step": 2642 }, { "epoch": 3.681058495821727, "grad_norm": 3.8426859378814697, "learning_rate": 4.513786242183058e-05, "loss": 2.6184, "step": 2643 }, { "epoch": 3.682451253481894, "grad_norm": 4.057262897491455, "learning_rate": 4.512791358726549e-05, "loss": 2.7178, "step": 2644 }, { "epoch": 3.683844011142061, "grad_norm": 4.93907356262207, "learning_rate": 4.511796475270039e-05, "loss": 2.766, "step": 2645 }, { "epoch": 3.6852367688022283, "grad_norm": 4.774403095245361, "learning_rate": 4.51080159181353e-05, "loss": 2.6246, "step": 2646 }, { "epoch": 3.6866295264623954, "grad_norm": 3.8961191177368164, "learning_rate": 4.5098067083570206e-05, "loss": 2.6065, "step": 2647 }, { "epoch": 3.688022284122563, "grad_norm": 4.014467716217041, "learning_rate": 4.508811824900512e-05, "loss": 2.3052, "step": 2648 }, { "epoch": 3.6894150417827296, "grad_norm": 3.5372679233551025, "learning_rate": 4.5078169414440014e-05, "loss": 2.2911, "step": 2649 }, { "epoch": 3.690807799442897, "grad_norm": 4.601160526275635, "learning_rate": 4.5068220579874925e-05, "loss": 2.5624, "step": 2650 }, { "epoch": 3.692200557103064, "grad_norm": 4.832505226135254, "learning_rate": 4.505827174530983e-05, "loss": 2.713, "step": 2651 }, { "epoch": 3.6935933147632314, "grad_norm": 3.792067527770996, "learning_rate": 4.504832291074474e-05, "loss": 2.8528, "step": 2652 }, { "epoch": 3.6949860724233985, "grad_norm": 8.431668281555176, "learning_rate": 4.503837407617965e-05, "loss": 2.4823, "step": 2653 }, { "epoch": 3.6963788300835656, "grad_norm": 4.345920562744141, "learning_rate": 4.502842524161455e-05, "loss": 3.0058, "step": 2654 }, { "epoch": 3.6977715877437327, "grad_norm": 3.7541372776031494, "learning_rate": 4.501847640704945e-05, "loss": 2.2991, "step": 2655 }, { "epoch": 3.6991643454039, "grad_norm": 3.7721927165985107, "learning_rate": 4.500852757248436e-05, "loss": 2.5565, "step": 2656 }, { "epoch": 3.700557103064067, "grad_norm": 4.094515800476074, "learning_rate": 4.4998578737919274e-05, "loss": 2.4353, "step": 2657 }, { "epoch": 3.701949860724234, "grad_norm": 3.893754005432129, "learning_rate": 4.498862990335417e-05, "loss": 2.3503, "step": 2658 }, { "epoch": 3.703342618384401, "grad_norm": 3.969780206680298, "learning_rate": 4.4978681068789075e-05, "loss": 2.6403, "step": 2659 }, { "epoch": 3.7047353760445683, "grad_norm": 3.9899258613586426, "learning_rate": 4.4968732234223986e-05, "loss": 2.6126, "step": 2660 }, { "epoch": 3.7061281337047354, "grad_norm": 4.702632904052734, "learning_rate": 4.49587833996589e-05, "loss": 2.6044, "step": 2661 }, { "epoch": 3.7075208913649025, "grad_norm": 3.7289204597473145, "learning_rate": 4.49488345650938e-05, "loss": 2.4549, "step": 2662 }, { "epoch": 3.7089136490250696, "grad_norm": 3.6642167568206787, "learning_rate": 4.4938885730528705e-05, "loss": 2.7396, "step": 2663 }, { "epoch": 3.7103064066852367, "grad_norm": 3.549678325653076, "learning_rate": 4.492893689596361e-05, "loss": 2.2628, "step": 2664 }, { "epoch": 3.711699164345404, "grad_norm": 4.3354997634887695, "learning_rate": 4.491898806139852e-05, "loss": 2.9193, "step": 2665 }, { "epoch": 3.713091922005571, "grad_norm": 4.62516975402832, "learning_rate": 4.4909039226833424e-05, "loss": 2.2609, "step": 2666 }, { "epoch": 3.714484679665738, "grad_norm": 3.9241013526916504, "learning_rate": 4.4899090392268335e-05, "loss": 2.3211, "step": 2667 }, { "epoch": 3.715877437325905, "grad_norm": 3.7552497386932373, "learning_rate": 4.488914155770323e-05, "loss": 2.1628, "step": 2668 }, { "epoch": 3.7172701949860727, "grad_norm": 3.472870349884033, "learning_rate": 4.487919272313814e-05, "loss": 2.4897, "step": 2669 }, { "epoch": 3.7186629526462394, "grad_norm": 4.359379291534424, "learning_rate": 4.486924388857305e-05, "loss": 2.8167, "step": 2670 }, { "epoch": 3.720055710306407, "grad_norm": 3.691875696182251, "learning_rate": 4.485929505400796e-05, "loss": 2.5296, "step": 2671 }, { "epoch": 3.7214484679665736, "grad_norm": 3.495535373687744, "learning_rate": 4.484934621944286e-05, "loss": 2.241, "step": 2672 }, { "epoch": 3.722841225626741, "grad_norm": 4.101537704467773, "learning_rate": 4.4839397384877766e-05, "loss": 2.2998, "step": 2673 }, { "epoch": 3.724233983286908, "grad_norm": 3.879869222640991, "learning_rate": 4.482944855031267e-05, "loss": 2.7232, "step": 2674 }, { "epoch": 3.7256267409470754, "grad_norm": 3.9464526176452637, "learning_rate": 4.481949971574758e-05, "loss": 2.5505, "step": 2675 }, { "epoch": 3.7270194986072425, "grad_norm": 4.150542736053467, "learning_rate": 4.4809550881182485e-05, "loss": 2.5747, "step": 2676 }, { "epoch": 3.7284122562674096, "grad_norm": 4.016899108886719, "learning_rate": 4.4799602046617396e-05, "loss": 2.0973, "step": 2677 }, { "epoch": 3.7298050139275767, "grad_norm": 4.207693576812744, "learning_rate": 4.478965321205229e-05, "loss": 2.2592, "step": 2678 }, { "epoch": 3.731197771587744, "grad_norm": 4.351635456085205, "learning_rate": 4.4779704377487204e-05, "loss": 2.7811, "step": 2679 }, { "epoch": 3.732590529247911, "grad_norm": 4.195699691772461, "learning_rate": 4.476975554292211e-05, "loss": 2.8685, "step": 2680 }, { "epoch": 3.733983286908078, "grad_norm": 4.452206611633301, "learning_rate": 4.475980670835702e-05, "loss": 2.619, "step": 2681 }, { "epoch": 3.735376044568245, "grad_norm": 3.9670653343200684, "learning_rate": 4.474985787379193e-05, "loss": 2.5058, "step": 2682 }, { "epoch": 3.7367688022284122, "grad_norm": 5.118613243103027, "learning_rate": 4.473990903922683e-05, "loss": 2.8105, "step": 2683 }, { "epoch": 3.7381615598885793, "grad_norm": 4.750264644622803, "learning_rate": 4.472996020466173e-05, "loss": 2.6768, "step": 2684 }, { "epoch": 3.7395543175487465, "grad_norm": 3.6492106914520264, "learning_rate": 4.472001137009664e-05, "loss": 2.5859, "step": 2685 }, { "epoch": 3.7409470752089136, "grad_norm": 3.87990403175354, "learning_rate": 4.471006253553155e-05, "loss": 2.1777, "step": 2686 }, { "epoch": 3.7423398328690807, "grad_norm": 3.9534242153167725, "learning_rate": 4.470011370096646e-05, "loss": 2.5173, "step": 2687 }, { "epoch": 3.743732590529248, "grad_norm": 6.62573766708374, "learning_rate": 4.469016486640136e-05, "loss": 2.5524, "step": 2688 }, { "epoch": 3.745125348189415, "grad_norm": 3.6964545249938965, "learning_rate": 4.4680216031836265e-05, "loss": 2.6479, "step": 2689 }, { "epoch": 3.7465181058495824, "grad_norm": 4.097850799560547, "learning_rate": 4.4670267197271176e-05, "loss": 2.2815, "step": 2690 }, { "epoch": 3.747910863509749, "grad_norm": 3.6248035430908203, "learning_rate": 4.466031836270608e-05, "loss": 2.5696, "step": 2691 }, { "epoch": 3.7493036211699167, "grad_norm": 4.852518558502197, "learning_rate": 4.465036952814099e-05, "loss": 2.3997, "step": 2692 }, { "epoch": 3.7506963788300833, "grad_norm": 3.2907462120056152, "learning_rate": 4.464042069357589e-05, "loss": 2.3746, "step": 2693 }, { "epoch": 3.752089136490251, "grad_norm": 4.214056968688965, "learning_rate": 4.46304718590108e-05, "loss": 2.8967, "step": 2694 }, { "epoch": 3.7534818941504176, "grad_norm": 3.662506341934204, "learning_rate": 4.46205230244457e-05, "loss": 2.5207, "step": 2695 }, { "epoch": 3.754874651810585, "grad_norm": 3.9540629386901855, "learning_rate": 4.4610574189880614e-05, "loss": 2.7571, "step": 2696 }, { "epoch": 3.756267409470752, "grad_norm": 3.5481412410736084, "learning_rate": 4.460062535531551e-05, "loss": 2.4145, "step": 2697 }, { "epoch": 3.7576601671309193, "grad_norm": 4.129096984863281, "learning_rate": 4.459067652075042e-05, "loss": 2.9039, "step": 2698 }, { "epoch": 3.7590529247910864, "grad_norm": 3.8313071727752686, "learning_rate": 4.4580727686185326e-05, "loss": 2.2129, "step": 2699 }, { "epoch": 3.7604456824512535, "grad_norm": 3.844362258911133, "learning_rate": 4.457077885162024e-05, "loss": 2.4664, "step": 2700 }, { "epoch": 3.7618384401114207, "grad_norm": 3.797652006149292, "learning_rate": 4.456083001705514e-05, "loss": 2.2317, "step": 2701 }, { "epoch": 3.7632311977715878, "grad_norm": 4.749051094055176, "learning_rate": 4.4550881182490045e-05, "loss": 2.5312, "step": 2702 }, { "epoch": 3.764623955431755, "grad_norm": 4.275303840637207, "learning_rate": 4.454093234792495e-05, "loss": 2.9626, "step": 2703 }, { "epoch": 3.766016713091922, "grad_norm": 4.067491054534912, "learning_rate": 4.453098351335986e-05, "loss": 2.4986, "step": 2704 }, { "epoch": 3.767409470752089, "grad_norm": 4.040477275848389, "learning_rate": 4.4521034678794764e-05, "loss": 2.6475, "step": 2705 }, { "epoch": 3.768802228412256, "grad_norm": 3.455617666244507, "learning_rate": 4.4511085844229675e-05, "loss": 2.3625, "step": 2706 }, { "epoch": 3.7701949860724233, "grad_norm": 3.8546557426452637, "learning_rate": 4.450113700966457e-05, "loss": 2.3496, "step": 2707 }, { "epoch": 3.7715877437325904, "grad_norm": 3.701049566268921, "learning_rate": 4.449118817509948e-05, "loss": 2.3824, "step": 2708 }, { "epoch": 3.7729805013927575, "grad_norm": 4.046526908874512, "learning_rate": 4.448123934053439e-05, "loss": 2.5241, "step": 2709 }, { "epoch": 3.7743732590529246, "grad_norm": 4.742768287658691, "learning_rate": 4.44712905059693e-05, "loss": 2.7906, "step": 2710 }, { "epoch": 3.775766016713092, "grad_norm": 5.611998558044434, "learning_rate": 4.446134167140421e-05, "loss": 2.3346, "step": 2711 }, { "epoch": 3.777158774373259, "grad_norm": 4.185859203338623, "learning_rate": 4.4451392836839106e-05, "loss": 2.4558, "step": 2712 }, { "epoch": 3.7785515320334264, "grad_norm": 3.4776322841644287, "learning_rate": 4.444144400227401e-05, "loss": 2.036, "step": 2713 }, { "epoch": 3.779944289693593, "grad_norm": 3.5825843811035156, "learning_rate": 4.443149516770892e-05, "loss": 2.5068, "step": 2714 }, { "epoch": 3.7813370473537606, "grad_norm": 3.74259352684021, "learning_rate": 4.442154633314383e-05, "loss": 2.3882, "step": 2715 }, { "epoch": 3.7827298050139273, "grad_norm": 4.258945465087891, "learning_rate": 4.4411597498578736e-05, "loss": 2.7586, "step": 2716 }, { "epoch": 3.784122562674095, "grad_norm": 3.828115940093994, "learning_rate": 4.440164866401364e-05, "loss": 2.3588, "step": 2717 }, { "epoch": 3.785515320334262, "grad_norm": 4.125650405883789, "learning_rate": 4.4391699829448544e-05, "loss": 2.6631, "step": 2718 }, { "epoch": 3.786908077994429, "grad_norm": 4.53554630279541, "learning_rate": 4.4381750994883455e-05, "loss": 2.772, "step": 2719 }, { "epoch": 3.788300835654596, "grad_norm": 4.092480182647705, "learning_rate": 4.437180216031836e-05, "loss": 2.4638, "step": 2720 }, { "epoch": 3.7896935933147633, "grad_norm": 4.31044864654541, "learning_rate": 4.436185332575327e-05, "loss": 3.0333, "step": 2721 }, { "epoch": 3.7910863509749304, "grad_norm": 5.6687421798706055, "learning_rate": 4.435190449118817e-05, "loss": 2.5189, "step": 2722 }, { "epoch": 3.7924791086350975, "grad_norm": 3.9867589473724365, "learning_rate": 4.434195565662308e-05, "loss": 1.9807, "step": 2723 }, { "epoch": 3.7938718662952646, "grad_norm": 4.122042655944824, "learning_rate": 4.433200682205798e-05, "loss": 2.2054, "step": 2724 }, { "epoch": 3.7952646239554317, "grad_norm": 9.375899314880371, "learning_rate": 4.432205798749289e-05, "loss": 2.737, "step": 2725 }, { "epoch": 3.796657381615599, "grad_norm": 4.639389514923096, "learning_rate": 4.43121091529278e-05, "loss": 2.355, "step": 2726 }, { "epoch": 3.798050139275766, "grad_norm": 4.087195873260498, "learning_rate": 4.43021603183627e-05, "loss": 2.8663, "step": 2727 }, { "epoch": 3.799442896935933, "grad_norm": 4.093467712402344, "learning_rate": 4.4292211483797605e-05, "loss": 2.6564, "step": 2728 }, { "epoch": 3.8008356545961, "grad_norm": 4.882555961608887, "learning_rate": 4.4282262649232516e-05, "loss": 2.361, "step": 2729 }, { "epoch": 3.8022284122562673, "grad_norm": 3.6350321769714355, "learning_rate": 4.427231381466742e-05, "loss": 2.6936, "step": 2730 }, { "epoch": 3.8036211699164344, "grad_norm": 3.577855348587036, "learning_rate": 4.426236498010233e-05, "loss": 2.1247, "step": 2731 }, { "epoch": 3.8050139275766015, "grad_norm": 4.559157848358154, "learning_rate": 4.425241614553723e-05, "loss": 2.8159, "step": 2732 }, { "epoch": 3.8064066852367686, "grad_norm": 4.250999927520752, "learning_rate": 4.424246731097214e-05, "loss": 2.4847, "step": 2733 }, { "epoch": 3.807799442896936, "grad_norm": 3.956256151199341, "learning_rate": 4.4232518476407044e-05, "loss": 2.5316, "step": 2734 }, { "epoch": 3.809192200557103, "grad_norm": 4.939937114715576, "learning_rate": 4.4222569641841954e-05, "loss": 2.788, "step": 2735 }, { "epoch": 3.8105849582172704, "grad_norm": 3.885408401489258, "learning_rate": 4.421262080727686e-05, "loss": 2.5442, "step": 2736 }, { "epoch": 3.811977715877437, "grad_norm": 3.944845676422119, "learning_rate": 4.420267197271176e-05, "loss": 2.247, "step": 2737 }, { "epoch": 3.8133704735376046, "grad_norm": 4.235049247741699, "learning_rate": 4.4192723138146667e-05, "loss": 2.3736, "step": 2738 }, { "epoch": 3.8147632311977717, "grad_norm": 3.783020257949829, "learning_rate": 4.418277430358158e-05, "loss": 2.8785, "step": 2739 }, { "epoch": 3.816155988857939, "grad_norm": 3.18996524810791, "learning_rate": 4.417282546901649e-05, "loss": 2.0317, "step": 2740 }, { "epoch": 3.817548746518106, "grad_norm": 3.9531784057617188, "learning_rate": 4.4162876634451386e-05, "loss": 2.6962, "step": 2741 }, { "epoch": 3.818941504178273, "grad_norm": 5.225775241851807, "learning_rate": 4.415292779988629e-05, "loss": 2.593, "step": 2742 }, { "epoch": 3.82033426183844, "grad_norm": 4.643198490142822, "learning_rate": 4.41429789653212e-05, "loss": 2.5271, "step": 2743 }, { "epoch": 3.8217270194986073, "grad_norm": 4.245351791381836, "learning_rate": 4.413303013075611e-05, "loss": 2.7188, "step": 2744 }, { "epoch": 3.8231197771587744, "grad_norm": 3.899301767349243, "learning_rate": 4.4123081296191015e-05, "loss": 2.2749, "step": 2745 }, { "epoch": 3.8245125348189415, "grad_norm": 3.4520652294158936, "learning_rate": 4.411313246162592e-05, "loss": 2.0981, "step": 2746 }, { "epoch": 3.8259052924791086, "grad_norm": 4.36779260635376, "learning_rate": 4.4103183627060824e-05, "loss": 2.4021, "step": 2747 }, { "epoch": 3.8272980501392757, "grad_norm": 4.111340045928955, "learning_rate": 4.4093234792495734e-05, "loss": 2.5331, "step": 2748 }, { "epoch": 3.828690807799443, "grad_norm": 4.206274032592773, "learning_rate": 4.408328595793064e-05, "loss": 2.6779, "step": 2749 }, { "epoch": 3.83008356545961, "grad_norm": 3.6885695457458496, "learning_rate": 4.407333712336555e-05, "loss": 2.2924, "step": 2750 }, { "epoch": 3.831476323119777, "grad_norm": 4.14463472366333, "learning_rate": 4.406338828880045e-05, "loss": 2.731, "step": 2751 }, { "epoch": 3.832869080779944, "grad_norm": 4.981577396392822, "learning_rate": 4.405343945423536e-05, "loss": 2.5577, "step": 2752 }, { "epoch": 3.8342618384401113, "grad_norm": 4.107276439666748, "learning_rate": 4.404349061967026e-05, "loss": 2.5428, "step": 2753 }, { "epoch": 3.8356545961002784, "grad_norm": 3.483412027359009, "learning_rate": 4.403354178510517e-05, "loss": 2.1633, "step": 2754 }, { "epoch": 3.837047353760446, "grad_norm": 3.5000691413879395, "learning_rate": 4.4023592950540077e-05, "loss": 2.2543, "step": 2755 }, { "epoch": 3.8384401114206126, "grad_norm": 3.731545925140381, "learning_rate": 4.401364411597498e-05, "loss": 2.678, "step": 2756 }, { "epoch": 3.83983286908078, "grad_norm": 3.8793838024139404, "learning_rate": 4.4003695281409885e-05, "loss": 2.1981, "step": 2757 }, { "epoch": 3.841225626740947, "grad_norm": 3.9608139991760254, "learning_rate": 4.3993746446844796e-05, "loss": 2.4132, "step": 2758 }, { "epoch": 3.8426183844011144, "grad_norm": 5.246201515197754, "learning_rate": 4.39837976122797e-05, "loss": 2.4739, "step": 2759 }, { "epoch": 3.8440111420612815, "grad_norm": 3.9054718017578125, "learning_rate": 4.397384877771461e-05, "loss": 2.4072, "step": 2760 }, { "epoch": 3.8454038997214486, "grad_norm": 3.5004072189331055, "learning_rate": 4.396389994314951e-05, "loss": 2.2678, "step": 2761 }, { "epoch": 3.8467966573816157, "grad_norm": 4.606179237365723, "learning_rate": 4.395395110858442e-05, "loss": 2.3299, "step": 2762 }, { "epoch": 3.848189415041783, "grad_norm": 3.8552868366241455, "learning_rate": 4.394400227401932e-05, "loss": 2.9338, "step": 2763 }, { "epoch": 3.84958217270195, "grad_norm": 3.881601095199585, "learning_rate": 4.3934053439454234e-05, "loss": 2.5753, "step": 2764 }, { "epoch": 3.850974930362117, "grad_norm": 4.5721659660339355, "learning_rate": 4.392410460488914e-05, "loss": 2.6409, "step": 2765 }, { "epoch": 3.852367688022284, "grad_norm": 4.485108852386475, "learning_rate": 4.391415577032404e-05, "loss": 2.3553, "step": 2766 }, { "epoch": 3.8537604456824512, "grad_norm": 4.302353858947754, "learning_rate": 4.3904206935758946e-05, "loss": 2.3465, "step": 2767 }, { "epoch": 3.8551532033426184, "grad_norm": 3.646881580352783, "learning_rate": 4.3894258101193857e-05, "loss": 2.2946, "step": 2768 }, { "epoch": 3.8565459610027855, "grad_norm": 4.009352207183838, "learning_rate": 4.388430926662877e-05, "loss": 2.5868, "step": 2769 }, { "epoch": 3.8579387186629526, "grad_norm": 4.348059177398682, "learning_rate": 4.387436043206367e-05, "loss": 2.1755, "step": 2770 }, { "epoch": 3.8593314763231197, "grad_norm": 3.8434622287750244, "learning_rate": 4.386441159749857e-05, "loss": 2.3478, "step": 2771 }, { "epoch": 3.860724233983287, "grad_norm": 5.061528205871582, "learning_rate": 4.385446276293348e-05, "loss": 2.6413, "step": 2772 }, { "epoch": 3.862116991643454, "grad_norm": 4.642963886260986, "learning_rate": 4.384451392836839e-05, "loss": 2.2953, "step": 2773 }, { "epoch": 3.863509749303621, "grad_norm": 4.183655261993408, "learning_rate": 4.3834565093803295e-05, "loss": 2.6933, "step": 2774 }, { "epoch": 3.864902506963788, "grad_norm": 3.5605030059814453, "learning_rate": 4.3824616259238205e-05, "loss": 2.4551, "step": 2775 }, { "epoch": 3.8662952646239557, "grad_norm": 4.208330154418945, "learning_rate": 4.38146674246731e-05, "loss": 2.7612, "step": 2776 }, { "epoch": 3.8676880222841223, "grad_norm": 4.467424392700195, "learning_rate": 4.3804718590108014e-05, "loss": 2.3246, "step": 2777 }, { "epoch": 3.86908077994429, "grad_norm": 3.943737268447876, "learning_rate": 4.379476975554292e-05, "loss": 2.2271, "step": 2778 }, { "epoch": 3.8704735376044566, "grad_norm": 3.7545974254608154, "learning_rate": 4.378482092097783e-05, "loss": 2.4962, "step": 2779 }, { "epoch": 3.871866295264624, "grad_norm": 4.182112693786621, "learning_rate": 4.377487208641273e-05, "loss": 2.8764, "step": 2780 }, { "epoch": 3.8732590529247912, "grad_norm": 4.237875461578369, "learning_rate": 4.376492325184764e-05, "loss": 2.3709, "step": 2781 }, { "epoch": 3.8746518105849583, "grad_norm": 4.057324409484863, "learning_rate": 4.375497441728254e-05, "loss": 2.5617, "step": 2782 }, { "epoch": 3.8760445682451254, "grad_norm": 4.075103282928467, "learning_rate": 4.374502558271745e-05, "loss": 2.3773, "step": 2783 }, { "epoch": 3.8774373259052926, "grad_norm": 4.05328893661499, "learning_rate": 4.3735076748152356e-05, "loss": 2.5776, "step": 2784 }, { "epoch": 3.8788300835654597, "grad_norm": 3.459596633911133, "learning_rate": 4.372512791358726e-05, "loss": 2.2137, "step": 2785 }, { "epoch": 3.8802228412256268, "grad_norm": 3.943903923034668, "learning_rate": 4.3715179079022164e-05, "loss": 2.2072, "step": 2786 }, { "epoch": 3.881615598885794, "grad_norm": 3.6659719944000244, "learning_rate": 4.3705230244457075e-05, "loss": 2.2368, "step": 2787 }, { "epoch": 3.883008356545961, "grad_norm": 3.9039340019226074, "learning_rate": 4.369528140989198e-05, "loss": 2.3746, "step": 2788 }, { "epoch": 3.884401114206128, "grad_norm": 3.675025701522827, "learning_rate": 4.368533257532689e-05, "loss": 2.4561, "step": 2789 }, { "epoch": 3.885793871866295, "grad_norm": 4.125492572784424, "learning_rate": 4.367538374076179e-05, "loss": 2.2744, "step": 2790 }, { "epoch": 3.8871866295264623, "grad_norm": 5.663262844085693, "learning_rate": 4.36654349061967e-05, "loss": 2.8396, "step": 2791 }, { "epoch": 3.8885793871866294, "grad_norm": 3.851743459701538, "learning_rate": 4.36554860716316e-05, "loss": 2.2684, "step": 2792 }, { "epoch": 3.8899721448467965, "grad_norm": 3.9266207218170166, "learning_rate": 4.364553723706651e-05, "loss": 2.4336, "step": 2793 }, { "epoch": 3.8913649025069637, "grad_norm": 3.5865793228149414, "learning_rate": 4.3635588402501424e-05, "loss": 2.214, "step": 2794 }, { "epoch": 3.8927576601671308, "grad_norm": 3.5342860221862793, "learning_rate": 4.362563956793632e-05, "loss": 2.462, "step": 2795 }, { "epoch": 3.894150417827298, "grad_norm": 3.487074375152588, "learning_rate": 4.3615690733371225e-05, "loss": 2.2586, "step": 2796 }, { "epoch": 3.8955431754874654, "grad_norm": 3.9496841430664062, "learning_rate": 4.3605741898806136e-05, "loss": 2.6275, "step": 2797 }, { "epoch": 3.896935933147632, "grad_norm": 4.085710048675537, "learning_rate": 4.3595793064241047e-05, "loss": 2.3676, "step": 2798 }, { "epoch": 3.8983286908077996, "grad_norm": 4.594913959503174, "learning_rate": 4.358584422967595e-05, "loss": 2.553, "step": 2799 }, { "epoch": 3.8997214484679663, "grad_norm": 3.9324839115142822, "learning_rate": 4.3575895395110855e-05, "loss": 2.8682, "step": 2800 }, { "epoch": 3.901114206128134, "grad_norm": 3.9619557857513428, "learning_rate": 4.356594656054576e-05, "loss": 2.2718, "step": 2801 }, { "epoch": 3.902506963788301, "grad_norm": 4.3043084144592285, "learning_rate": 4.355599772598067e-05, "loss": 2.6799, "step": 2802 }, { "epoch": 3.903899721448468, "grad_norm": 5.833071231842041, "learning_rate": 4.3546048891415574e-05, "loss": 2.2675, "step": 2803 }, { "epoch": 3.905292479108635, "grad_norm": 4.207306385040283, "learning_rate": 4.3536100056850485e-05, "loss": 2.7466, "step": 2804 }, { "epoch": 3.9066852367688023, "grad_norm": 3.897665500640869, "learning_rate": 4.352615122228538e-05, "loss": 2.5291, "step": 2805 }, { "epoch": 3.9080779944289694, "grad_norm": 4.754603862762451, "learning_rate": 4.351620238772029e-05, "loss": 2.1749, "step": 2806 }, { "epoch": 3.9094707520891365, "grad_norm": 6.453915119171143, "learning_rate": 4.35062535531552e-05, "loss": 2.2067, "step": 2807 }, { "epoch": 3.9108635097493036, "grad_norm": 3.9593875408172607, "learning_rate": 4.349630471859011e-05, "loss": 2.1644, "step": 2808 }, { "epoch": 3.9122562674094707, "grad_norm": 3.7252540588378906, "learning_rate": 4.348635588402501e-05, "loss": 2.5891, "step": 2809 }, { "epoch": 3.913649025069638, "grad_norm": 3.7336575984954834, "learning_rate": 4.3476407049459916e-05, "loss": 2.6783, "step": 2810 }, { "epoch": 3.915041782729805, "grad_norm": 3.7686209678649902, "learning_rate": 4.346645821489482e-05, "loss": 2.2132, "step": 2811 }, { "epoch": 3.916434540389972, "grad_norm": 4.698162078857422, "learning_rate": 4.345650938032973e-05, "loss": 2.2761, "step": 2812 }, { "epoch": 3.917827298050139, "grad_norm": 4.409482955932617, "learning_rate": 4.3446560545764635e-05, "loss": 2.3576, "step": 2813 }, { "epoch": 3.9192200557103063, "grad_norm": 4.060924053192139, "learning_rate": 4.3436611711199546e-05, "loss": 2.5578, "step": 2814 }, { "epoch": 3.9206128133704734, "grad_norm": 3.9536235332489014, "learning_rate": 4.342666287663444e-05, "loss": 2.3978, "step": 2815 }, { "epoch": 3.9220055710306405, "grad_norm": 3.7282447814941406, "learning_rate": 4.3416714042069354e-05, "loss": 2.2678, "step": 2816 }, { "epoch": 3.9233983286908076, "grad_norm": 3.6149067878723145, "learning_rate": 4.340676520750426e-05, "loss": 2.371, "step": 2817 }, { "epoch": 3.924791086350975, "grad_norm": 3.990464210510254, "learning_rate": 4.339681637293917e-05, "loss": 2.5115, "step": 2818 }, { "epoch": 3.926183844011142, "grad_norm": 3.926725387573242, "learning_rate": 4.338686753837407e-05, "loss": 2.4013, "step": 2819 }, { "epoch": 3.9275766016713094, "grad_norm": 4.2266645431518555, "learning_rate": 4.337691870380898e-05, "loss": 2.46, "step": 2820 }, { "epoch": 3.928969359331476, "grad_norm": 4.060005187988281, "learning_rate": 4.336696986924388e-05, "loss": 2.6188, "step": 2821 }, { "epoch": 3.9303621169916436, "grad_norm": 4.332579612731934, "learning_rate": 4.335702103467879e-05, "loss": 2.2021, "step": 2822 }, { "epoch": 3.9317548746518107, "grad_norm": 3.5953142642974854, "learning_rate": 4.33470722001137e-05, "loss": 2.3435, "step": 2823 }, { "epoch": 3.933147632311978, "grad_norm": 4.64898681640625, "learning_rate": 4.333712336554861e-05, "loss": 2.4961, "step": 2824 }, { "epoch": 3.934540389972145, "grad_norm": 4.039651393890381, "learning_rate": 4.3327174530983504e-05, "loss": 2.0873, "step": 2825 }, { "epoch": 3.935933147632312, "grad_norm": 3.3444809913635254, "learning_rate": 4.3317225696418415e-05, "loss": 1.8563, "step": 2826 }, { "epoch": 3.937325905292479, "grad_norm": 3.111057996749878, "learning_rate": 4.3307276861853326e-05, "loss": 2.0835, "step": 2827 }, { "epoch": 3.9387186629526463, "grad_norm": 3.7342870235443115, "learning_rate": 4.329732802728823e-05, "loss": 2.1274, "step": 2828 }, { "epoch": 3.9401114206128134, "grad_norm": 4.010530948638916, "learning_rate": 4.3287379192723134e-05, "loss": 2.5928, "step": 2829 }, { "epoch": 3.9415041782729805, "grad_norm": 4.297623157501221, "learning_rate": 4.327743035815804e-05, "loss": 2.4479, "step": 2830 }, { "epoch": 3.9428969359331476, "grad_norm": 3.8704824447631836, "learning_rate": 4.326748152359295e-05, "loss": 2.3586, "step": 2831 }, { "epoch": 3.9442896935933147, "grad_norm": 3.575561761856079, "learning_rate": 4.325753268902785e-05, "loss": 2.1189, "step": 2832 }, { "epoch": 3.945682451253482, "grad_norm": 4.465643405914307, "learning_rate": 4.3247583854462764e-05, "loss": 1.9393, "step": 2833 }, { "epoch": 3.947075208913649, "grad_norm": 4.366245746612549, "learning_rate": 4.323763501989766e-05, "loss": 2.1547, "step": 2834 }, { "epoch": 3.948467966573816, "grad_norm": 4.428537845611572, "learning_rate": 4.322768618533257e-05, "loss": 2.7843, "step": 2835 }, { "epoch": 3.949860724233983, "grad_norm": 4.058773994445801, "learning_rate": 4.3217737350767476e-05, "loss": 2.3258, "step": 2836 }, { "epoch": 3.9512534818941503, "grad_norm": 3.489189386367798, "learning_rate": 4.320778851620239e-05, "loss": 2.3345, "step": 2837 }, { "epoch": 3.9526462395543174, "grad_norm": 5.146334648132324, "learning_rate": 4.319783968163729e-05, "loss": 2.5709, "step": 2838 }, { "epoch": 3.954038997214485, "grad_norm": 4.094646453857422, "learning_rate": 4.3187890847072195e-05, "loss": 2.3741, "step": 2839 }, { "epoch": 3.9554317548746516, "grad_norm": 3.887098789215088, "learning_rate": 4.31779420125071e-05, "loss": 2.1513, "step": 2840 }, { "epoch": 3.956824512534819, "grad_norm": 4.42372465133667, "learning_rate": 4.316799317794201e-05, "loss": 2.3141, "step": 2841 }, { "epoch": 3.958217270194986, "grad_norm": 4.165892601013184, "learning_rate": 4.3158044343376914e-05, "loss": 2.6135, "step": 2842 }, { "epoch": 3.9596100278551534, "grad_norm": 4.015235900878906, "learning_rate": 4.3148095508811825e-05, "loss": 2.4057, "step": 2843 }, { "epoch": 3.9610027855153205, "grad_norm": 4.624741554260254, "learning_rate": 4.313814667424672e-05, "loss": 2.7105, "step": 2844 }, { "epoch": 3.9623955431754876, "grad_norm": 3.801917314529419, "learning_rate": 4.312819783968163e-05, "loss": 2.603, "step": 2845 }, { "epoch": 3.9637883008356547, "grad_norm": 3.7009623050689697, "learning_rate": 4.311824900511654e-05, "loss": 2.2056, "step": 2846 }, { "epoch": 3.965181058495822, "grad_norm": 4.001925468444824, "learning_rate": 4.310830017055145e-05, "loss": 2.0946, "step": 2847 }, { "epoch": 3.966573816155989, "grad_norm": 4.103342533111572, "learning_rate": 4.309835133598635e-05, "loss": 2.8683, "step": 2848 }, { "epoch": 3.967966573816156, "grad_norm": 4.002389430999756, "learning_rate": 4.3088402501421256e-05, "loss": 2.6372, "step": 2849 }, { "epoch": 3.969359331476323, "grad_norm": 3.70853590965271, "learning_rate": 4.307845366685616e-05, "loss": 1.8587, "step": 2850 }, { "epoch": 3.9707520891364902, "grad_norm": 3.8358755111694336, "learning_rate": 4.306850483229107e-05, "loss": 2.5599, "step": 2851 }, { "epoch": 3.9721448467966574, "grad_norm": 4.751933574676514, "learning_rate": 4.305855599772598e-05, "loss": 2.4604, "step": 2852 }, { "epoch": 3.9735376044568245, "grad_norm": 3.9108285903930664, "learning_rate": 4.3048607163160886e-05, "loss": 2.5878, "step": 2853 }, { "epoch": 3.9749303621169916, "grad_norm": 4.267486095428467, "learning_rate": 4.303865832859578e-05, "loss": 2.4386, "step": 2854 }, { "epoch": 3.9763231197771587, "grad_norm": 4.0073323249816895, "learning_rate": 4.3028709494030694e-05, "loss": 2.6417, "step": 2855 }, { "epoch": 3.977715877437326, "grad_norm": 3.7361462116241455, "learning_rate": 4.3018760659465605e-05, "loss": 2.5317, "step": 2856 }, { "epoch": 3.979108635097493, "grad_norm": 5.070940017700195, "learning_rate": 4.300881182490051e-05, "loss": 2.4261, "step": 2857 }, { "epoch": 3.98050139275766, "grad_norm": 7.305459976196289, "learning_rate": 4.299886299033542e-05, "loss": 2.2394, "step": 2858 }, { "epoch": 3.981894150417827, "grad_norm": 5.064977169036865, "learning_rate": 4.298891415577032e-05, "loss": 2.8809, "step": 2859 }, { "epoch": 3.9832869080779947, "grad_norm": 4.64074182510376, "learning_rate": 4.297896532120523e-05, "loss": 2.3022, "step": 2860 }, { "epoch": 3.9846796657381613, "grad_norm": 4.33528995513916, "learning_rate": 4.296901648664013e-05, "loss": 2.1313, "step": 2861 }, { "epoch": 3.986072423398329, "grad_norm": 3.887798547744751, "learning_rate": 4.295906765207504e-05, "loss": 2.212, "step": 2862 }, { "epoch": 3.9874651810584956, "grad_norm": 3.91961407661438, "learning_rate": 4.294911881750995e-05, "loss": 2.2057, "step": 2863 }, { "epoch": 3.988857938718663, "grad_norm": 3.9067459106445312, "learning_rate": 4.293916998294485e-05, "loss": 2.5234, "step": 2864 }, { "epoch": 3.9902506963788302, "grad_norm": 3.912951707839966, "learning_rate": 4.2929221148379755e-05, "loss": 2.2181, "step": 2865 }, { "epoch": 3.9916434540389973, "grad_norm": 3.513366937637329, "learning_rate": 4.2919272313814666e-05, "loss": 2.3435, "step": 2866 }, { "epoch": 3.9930362116991645, "grad_norm": 3.928295135498047, "learning_rate": 4.290932347924957e-05, "loss": 2.4312, "step": 2867 }, { "epoch": 3.9944289693593316, "grad_norm": 4.228482246398926, "learning_rate": 4.289937464468448e-05, "loss": 2.7527, "step": 2868 }, { "epoch": 3.9958217270194987, "grad_norm": 4.201807022094727, "learning_rate": 4.288942581011938e-05, "loss": 2.5804, "step": 2869 }, { "epoch": 3.997214484679666, "grad_norm": 3.8392558097839355, "learning_rate": 4.287947697555429e-05, "loss": 2.3846, "step": 2870 }, { "epoch": 3.998607242339833, "grad_norm": 4.189210891723633, "learning_rate": 4.286952814098919e-05, "loss": 2.2174, "step": 2871 }, { "epoch": 4.0, "grad_norm": 4.876611232757568, "learning_rate": 4.2859579306424104e-05, "loss": 2.778, "step": 2872 }, { "epoch": 4.0013927576601676, "grad_norm": 3.852813720703125, "learning_rate": 4.2849630471859e-05, "loss": 2.2284, "step": 2873 }, { "epoch": 4.002785515320334, "grad_norm": 3.3349533081054688, "learning_rate": 4.283968163729391e-05, "loss": 2.0552, "step": 2874 }, { "epoch": 4.004178272980502, "grad_norm": 3.858365058898926, "learning_rate": 4.2829732802728816e-05, "loss": 2.1243, "step": 2875 }, { "epoch": 4.005571030640668, "grad_norm": 3.3642642498016357, "learning_rate": 4.281978396816373e-05, "loss": 1.8814, "step": 2876 }, { "epoch": 4.006963788300836, "grad_norm": 4.753489971160889, "learning_rate": 4.280983513359863e-05, "loss": 2.1598, "step": 2877 }, { "epoch": 4.008356545961003, "grad_norm": 4.414871692657471, "learning_rate": 4.2799886299033535e-05, "loss": 1.9104, "step": 2878 }, { "epoch": 4.00974930362117, "grad_norm": 3.775564432144165, "learning_rate": 4.278993746446844e-05, "loss": 1.8956, "step": 2879 }, { "epoch": 4.011142061281337, "grad_norm": 3.8523037433624268, "learning_rate": 4.277998862990335e-05, "loss": 1.9274, "step": 2880 }, { "epoch": 4.012534818941504, "grad_norm": 3.4568839073181152, "learning_rate": 4.277003979533826e-05, "loss": 1.9917, "step": 2881 }, { "epoch": 4.013927576601671, "grad_norm": 3.492802381515503, "learning_rate": 4.2760090960773165e-05, "loss": 2.1702, "step": 2882 }, { "epoch": 4.015320334261839, "grad_norm": 3.5977847576141357, "learning_rate": 4.275014212620806e-05, "loss": 2.2774, "step": 2883 }, { "epoch": 4.016713091922005, "grad_norm": 4.024469375610352, "learning_rate": 4.274019329164297e-05, "loss": 2.1418, "step": 2884 }, { "epoch": 4.018105849582173, "grad_norm": 3.895038366317749, "learning_rate": 4.2730244457077884e-05, "loss": 2.3048, "step": 2885 }, { "epoch": 4.0194986072423395, "grad_norm": 3.8219165802001953, "learning_rate": 4.272029562251279e-05, "loss": 1.807, "step": 2886 }, { "epoch": 4.020891364902507, "grad_norm": 3.387883424758911, "learning_rate": 4.27103467879477e-05, "loss": 1.8544, "step": 2887 }, { "epoch": 4.022284122562674, "grad_norm": 3.87650203704834, "learning_rate": 4.2700397953382596e-05, "loss": 2.0758, "step": 2888 }, { "epoch": 4.023676880222841, "grad_norm": 3.484063148498535, "learning_rate": 4.269044911881751e-05, "loss": 2.0685, "step": 2889 }, { "epoch": 4.025069637883008, "grad_norm": 3.4695568084716797, "learning_rate": 4.268050028425241e-05, "loss": 2.0025, "step": 2890 }, { "epoch": 4.0264623955431755, "grad_norm": 3.7375874519348145, "learning_rate": 4.267055144968732e-05, "loss": 2.1755, "step": 2891 }, { "epoch": 4.027855153203342, "grad_norm": 3.4818079471588135, "learning_rate": 4.2660602615122226e-05, "loss": 1.8245, "step": 2892 }, { "epoch": 4.02924791086351, "grad_norm": 3.2014174461364746, "learning_rate": 4.265065378055713e-05, "loss": 1.7686, "step": 2893 }, { "epoch": 4.030640668523677, "grad_norm": 3.6586625576019287, "learning_rate": 4.2640704945992034e-05, "loss": 2.2697, "step": 2894 }, { "epoch": 4.032033426183844, "grad_norm": 4.191351890563965, "learning_rate": 4.2630756111426945e-05, "loss": 2.1766, "step": 2895 }, { "epoch": 4.0334261838440115, "grad_norm": 3.260627031326294, "learning_rate": 4.262080727686185e-05, "loss": 2.201, "step": 2896 }, { "epoch": 4.034818941504178, "grad_norm": 3.9480907917022705, "learning_rate": 4.261085844229676e-05, "loss": 2.0782, "step": 2897 }, { "epoch": 4.036211699164346, "grad_norm": 3.4172251224517822, "learning_rate": 4.260090960773166e-05, "loss": 1.9945, "step": 2898 }, { "epoch": 4.037604456824512, "grad_norm": 3.900606393814087, "learning_rate": 4.259096077316657e-05, "loss": 2.163, "step": 2899 }, { "epoch": 4.03899721448468, "grad_norm": 4.304780006408691, "learning_rate": 4.258101193860147e-05, "loss": 2.3168, "step": 2900 }, { "epoch": 4.040389972144847, "grad_norm": 3.3004045486450195, "learning_rate": 4.257106310403638e-05, "loss": 2.1544, "step": 2901 }, { "epoch": 4.041782729805014, "grad_norm": 3.5081892013549805, "learning_rate": 4.256111426947129e-05, "loss": 2.0612, "step": 2902 }, { "epoch": 4.043175487465181, "grad_norm": 3.855876922607422, "learning_rate": 4.255116543490619e-05, "loss": 2.1881, "step": 2903 }, { "epoch": 4.044568245125348, "grad_norm": 3.5717360973358154, "learning_rate": 4.2541216600341095e-05, "loss": 1.8758, "step": 2904 }, { "epoch": 4.045961002785515, "grad_norm": 3.3564541339874268, "learning_rate": 4.2531267765776006e-05, "loss": 1.6998, "step": 2905 }, { "epoch": 4.047353760445683, "grad_norm": 3.6321773529052734, "learning_rate": 4.252131893121092e-05, "loss": 1.789, "step": 2906 }, { "epoch": 4.048746518105849, "grad_norm": 3.646332263946533, "learning_rate": 4.251137009664582e-05, "loss": 1.8159, "step": 2907 }, { "epoch": 4.050139275766017, "grad_norm": 3.75654673576355, "learning_rate": 4.250142126208072e-05, "loss": 1.9863, "step": 2908 }, { "epoch": 4.0515320334261835, "grad_norm": 3.1938226222991943, "learning_rate": 4.249147242751563e-05, "loss": 1.7361, "step": 2909 }, { "epoch": 4.052924791086351, "grad_norm": 4.553583145141602, "learning_rate": 4.248152359295054e-05, "loss": 2.097, "step": 2910 }, { "epoch": 4.054317548746518, "grad_norm": 3.4492368698120117, "learning_rate": 4.2471574758385444e-05, "loss": 2.1103, "step": 2911 }, { "epoch": 4.055710306406685, "grad_norm": 3.943769693374634, "learning_rate": 4.246162592382034e-05, "loss": 2.1472, "step": 2912 }, { "epoch": 4.057103064066852, "grad_norm": 3.1672894954681396, "learning_rate": 4.245167708925525e-05, "loss": 1.9289, "step": 2913 }, { "epoch": 4.0584958217270195, "grad_norm": 3.6658668518066406, "learning_rate": 4.244172825469016e-05, "loss": 2.1979, "step": 2914 }, { "epoch": 4.059888579387187, "grad_norm": 3.32881236076355, "learning_rate": 4.243177942012507e-05, "loss": 1.8956, "step": 2915 }, { "epoch": 4.061281337047354, "grad_norm": 3.316603660583496, "learning_rate": 4.242183058555998e-05, "loss": 1.9835, "step": 2916 }, { "epoch": 4.062674094707521, "grad_norm": 3.3299334049224854, "learning_rate": 4.2411881750994875e-05, "loss": 1.7492, "step": 2917 }, { "epoch": 4.064066852367688, "grad_norm": 4.63602876663208, "learning_rate": 4.2401932916429786e-05, "loss": 2.1778, "step": 2918 }, { "epoch": 4.0654596100278555, "grad_norm": 3.0447895526885986, "learning_rate": 4.239198408186469e-05, "loss": 1.9583, "step": 2919 }, { "epoch": 4.066852367688022, "grad_norm": 3.6520590782165527, "learning_rate": 4.23820352472996e-05, "loss": 2.0538, "step": 2920 }, { "epoch": 4.06824512534819, "grad_norm": 4.1695146560668945, "learning_rate": 4.2372086412734505e-05, "loss": 2.5048, "step": 2921 }, { "epoch": 4.069637883008356, "grad_norm": 3.669248104095459, "learning_rate": 4.236213757816941e-05, "loss": 2.1274, "step": 2922 }, { "epoch": 4.071030640668524, "grad_norm": 3.8280062675476074, "learning_rate": 4.2352188743604313e-05, "loss": 2.0354, "step": 2923 }, { "epoch": 4.072423398328691, "grad_norm": 3.502394199371338, "learning_rate": 4.2342239909039224e-05, "loss": 2.1068, "step": 2924 }, { "epoch": 4.073816155988858, "grad_norm": 3.5648293495178223, "learning_rate": 4.233229107447413e-05, "loss": 2.4372, "step": 2925 }, { "epoch": 4.075208913649025, "grad_norm": 6.297248840332031, "learning_rate": 4.232234223990904e-05, "loss": 2.0153, "step": 2926 }, { "epoch": 4.076601671309192, "grad_norm": 3.234280586242676, "learning_rate": 4.2312393405343937e-05, "loss": 1.8531, "step": 2927 }, { "epoch": 4.077994428969359, "grad_norm": 7.501010894775391, "learning_rate": 4.230244457077885e-05, "loss": 1.9979, "step": 2928 }, { "epoch": 4.079387186629527, "grad_norm": 4.206099510192871, "learning_rate": 4.229249573621375e-05, "loss": 2.0006, "step": 2929 }, { "epoch": 4.080779944289693, "grad_norm": 3.550398349761963, "learning_rate": 4.228254690164866e-05, "loss": 2.3617, "step": 2930 }, { "epoch": 4.082172701949861, "grad_norm": 3.0064449310302734, "learning_rate": 4.2272598067083566e-05, "loss": 1.71, "step": 2931 }, { "epoch": 4.0835654596100275, "grad_norm": 3.389880418777466, "learning_rate": 4.226264923251847e-05, "loss": 1.7805, "step": 2932 }, { "epoch": 4.084958217270195, "grad_norm": 3.2349741458892822, "learning_rate": 4.2252700397953375e-05, "loss": 1.8118, "step": 2933 }, { "epoch": 4.086350974930362, "grad_norm": 4.4300432205200195, "learning_rate": 4.2242751563388285e-05, "loss": 2.3677, "step": 2934 }, { "epoch": 4.087743732590529, "grad_norm": 3.7126057147979736, "learning_rate": 4.2232802728823196e-05, "loss": 2.4539, "step": 2935 }, { "epoch": 4.089136490250697, "grad_norm": 3.426689386367798, "learning_rate": 4.22228538942581e-05, "loss": 2.1186, "step": 2936 }, { "epoch": 4.0905292479108635, "grad_norm": 3.7912368774414062, "learning_rate": 4.2212905059693e-05, "loss": 1.9035, "step": 2937 }, { "epoch": 4.091922005571031, "grad_norm": 3.738041877746582, "learning_rate": 4.220295622512791e-05, "loss": 2.0842, "step": 2938 }, { "epoch": 4.093314763231198, "grad_norm": 5.30759334564209, "learning_rate": 4.219300739056282e-05, "loss": 1.9049, "step": 2939 }, { "epoch": 4.094707520891365, "grad_norm": 5.949930191040039, "learning_rate": 4.2183058555997723e-05, "loss": 2.1455, "step": 2940 }, { "epoch": 4.096100278551532, "grad_norm": 3.648623466491699, "learning_rate": 4.2173109721432634e-05, "loss": 2.1864, "step": 2941 }, { "epoch": 4.0974930362116995, "grad_norm": 5.2805328369140625, "learning_rate": 4.216316088686753e-05, "loss": 1.8307, "step": 2942 }, { "epoch": 4.098885793871866, "grad_norm": 3.3273491859436035, "learning_rate": 4.215321205230244e-05, "loss": 1.9127, "step": 2943 }, { "epoch": 4.100278551532034, "grad_norm": 3.942098379135132, "learning_rate": 4.2143263217737346e-05, "loss": 2.0882, "step": 2944 }, { "epoch": 4.1016713091922, "grad_norm": 3.925662040710449, "learning_rate": 4.213331438317226e-05, "loss": 2.1373, "step": 2945 }, { "epoch": 4.103064066852368, "grad_norm": 3.572674036026001, "learning_rate": 4.212336554860716e-05, "loss": 2.2118, "step": 2946 }, { "epoch": 4.104456824512535, "grad_norm": 3.682919979095459, "learning_rate": 4.2113416714042065e-05, "loss": 2.02, "step": 2947 }, { "epoch": 4.105849582172702, "grad_norm": 3.69718337059021, "learning_rate": 4.210346787947697e-05, "loss": 1.9632, "step": 2948 }, { "epoch": 4.107242339832869, "grad_norm": 3.8436195850372314, "learning_rate": 4.209351904491188e-05, "loss": 2.0702, "step": 2949 }, { "epoch": 4.108635097493036, "grad_norm": 3.8712940216064453, "learning_rate": 4.2083570210346784e-05, "loss": 2.3238, "step": 2950 }, { "epoch": 4.110027855153203, "grad_norm": 3.6358802318573, "learning_rate": 4.2073621375781695e-05, "loss": 1.9997, "step": 2951 }, { "epoch": 4.111420612813371, "grad_norm": 3.9602253437042236, "learning_rate": 4.206367254121659e-05, "loss": 2.311, "step": 2952 }, { "epoch": 4.112813370473537, "grad_norm": 3.319279193878174, "learning_rate": 4.2053723706651503e-05, "loss": 2.0274, "step": 2953 }, { "epoch": 4.114206128133705, "grad_norm": 3.474858045578003, "learning_rate": 4.204377487208641e-05, "loss": 2.1625, "step": 2954 }, { "epoch": 4.1155988857938715, "grad_norm": 3.547170400619507, "learning_rate": 4.203382603752132e-05, "loss": 2.1221, "step": 2955 }, { "epoch": 4.116991643454039, "grad_norm": 3.437105417251587, "learning_rate": 4.2023877202956216e-05, "loss": 2.1375, "step": 2956 }, { "epoch": 4.118384401114207, "grad_norm": 3.3093979358673096, "learning_rate": 4.2013928368391127e-05, "loss": 1.9692, "step": 2957 }, { "epoch": 4.119777158774373, "grad_norm": 3.6160924434661865, "learning_rate": 4.200397953382603e-05, "loss": 2.1094, "step": 2958 }, { "epoch": 4.121169916434541, "grad_norm": 3.928183078765869, "learning_rate": 4.199403069926094e-05, "loss": 1.9048, "step": 2959 }, { "epoch": 4.1225626740947074, "grad_norm": 3.6659960746765137, "learning_rate": 4.1984081864695846e-05, "loss": 1.847, "step": 2960 }, { "epoch": 4.123955431754875, "grad_norm": 4.066314697265625, "learning_rate": 4.197413303013075e-05, "loss": 2.4269, "step": 2961 }, { "epoch": 4.125348189415042, "grad_norm": 3.5907700061798096, "learning_rate": 4.1964184195565654e-05, "loss": 1.8727, "step": 2962 }, { "epoch": 4.126740947075209, "grad_norm": 3.6240313053131104, "learning_rate": 4.1954235361000565e-05, "loss": 2.1292, "step": 2963 }, { "epoch": 4.128133704735376, "grad_norm": 3.639030933380127, "learning_rate": 4.1944286526435475e-05, "loss": 1.9945, "step": 2964 }, { "epoch": 4.129526462395543, "grad_norm": 3.69741153717041, "learning_rate": 4.193433769187038e-05, "loss": 2.123, "step": 2965 }, { "epoch": 4.13091922005571, "grad_norm": 4.221983432769775, "learning_rate": 4.192438885730528e-05, "loss": 2.3641, "step": 2966 }, { "epoch": 4.132311977715878, "grad_norm": 3.8189964294433594, "learning_rate": 4.191444002274019e-05, "loss": 2.0556, "step": 2967 }, { "epoch": 4.133704735376044, "grad_norm": 3.6743671894073486, "learning_rate": 4.19044911881751e-05, "loss": 2.0531, "step": 2968 }, { "epoch": 4.135097493036212, "grad_norm": 3.03293776512146, "learning_rate": 4.189454235361e-05, "loss": 1.8302, "step": 2969 }, { "epoch": 4.1364902506963785, "grad_norm": 4.1291046142578125, "learning_rate": 4.1884593519044913e-05, "loss": 1.7203, "step": 2970 }, { "epoch": 4.137883008356546, "grad_norm": 4.686734676361084, "learning_rate": 4.187464468447981e-05, "loss": 2.0674, "step": 2971 }, { "epoch": 4.139275766016713, "grad_norm": 3.4521591663360596, "learning_rate": 4.186469584991472e-05, "loss": 1.8709, "step": 2972 }, { "epoch": 4.14066852367688, "grad_norm": 5.810756206512451, "learning_rate": 4.1854747015349626e-05, "loss": 2.2401, "step": 2973 }, { "epoch": 4.142061281337047, "grad_norm": 3.6473703384399414, "learning_rate": 4.1844798180784536e-05, "loss": 2.1024, "step": 2974 }, { "epoch": 4.1434540389972145, "grad_norm": 3.4020042419433594, "learning_rate": 4.183484934621944e-05, "loss": 1.8226, "step": 2975 }, { "epoch": 4.144846796657381, "grad_norm": 3.425886869430542, "learning_rate": 4.1824900511654345e-05, "loss": 2.0591, "step": 2976 }, { "epoch": 4.146239554317549, "grad_norm": 3.797455310821533, "learning_rate": 4.181495167708925e-05, "loss": 2.2521, "step": 2977 }, { "epoch": 4.147632311977716, "grad_norm": 3.6039366722106934, "learning_rate": 4.180500284252416e-05, "loss": 2.2441, "step": 2978 }, { "epoch": 4.149025069637883, "grad_norm": 3.46376633644104, "learning_rate": 4.1795054007959064e-05, "loss": 2.1358, "step": 2979 }, { "epoch": 4.1504178272980505, "grad_norm": 3.632885217666626, "learning_rate": 4.1785105173393974e-05, "loss": 2.1073, "step": 2980 }, { "epoch": 4.151810584958217, "grad_norm": 3.925652265548706, "learning_rate": 4.177515633882887e-05, "loss": 2.0397, "step": 2981 }, { "epoch": 4.153203342618385, "grad_norm": 3.7493999004364014, "learning_rate": 4.176520750426378e-05, "loss": 2.0389, "step": 2982 }, { "epoch": 4.154596100278551, "grad_norm": 4.511785984039307, "learning_rate": 4.175525866969869e-05, "loss": 2.3307, "step": 2983 }, { "epoch": 4.155988857938719, "grad_norm": 3.2160871028900146, "learning_rate": 4.17453098351336e-05, "loss": 1.6668, "step": 2984 }, { "epoch": 4.157381615598886, "grad_norm": 3.8907697200775146, "learning_rate": 4.17353610005685e-05, "loss": 1.6878, "step": 2985 }, { "epoch": 4.158774373259053, "grad_norm": 4.556327819824219, "learning_rate": 4.1725412166003406e-05, "loss": 1.9801, "step": 2986 }, { "epoch": 4.16016713091922, "grad_norm": 3.8720271587371826, "learning_rate": 4.171546333143831e-05, "loss": 2.238, "step": 2987 }, { "epoch": 4.161559888579387, "grad_norm": 3.6748290061950684, "learning_rate": 4.170551449687322e-05, "loss": 2.2768, "step": 2988 }, { "epoch": 4.162952646239554, "grad_norm": 7.88837194442749, "learning_rate": 4.1695565662308125e-05, "loss": 2.2326, "step": 2989 }, { "epoch": 4.164345403899722, "grad_norm": 3.624276638031006, "learning_rate": 4.1685616827743036e-05, "loss": 2.2305, "step": 2990 }, { "epoch": 4.165738161559888, "grad_norm": 4.0302414894104, "learning_rate": 4.167566799317793e-05, "loss": 1.9653, "step": 2991 }, { "epoch": 4.167130919220056, "grad_norm": 3.4902541637420654, "learning_rate": 4.1665719158612844e-05, "loss": 2.1037, "step": 2992 }, { "epoch": 4.1685236768802225, "grad_norm": 3.871030569076538, "learning_rate": 4.1655770324047755e-05, "loss": 2.134, "step": 2993 }, { "epoch": 4.16991643454039, "grad_norm": 3.963440418243408, "learning_rate": 4.164582148948266e-05, "loss": 2.197, "step": 2994 }, { "epoch": 4.171309192200557, "grad_norm": 3.3340349197387695, "learning_rate": 4.163587265491757e-05, "loss": 1.8162, "step": 2995 }, { "epoch": 4.172701949860724, "grad_norm": 3.550037145614624, "learning_rate": 4.162592382035247e-05, "loss": 1.7161, "step": 2996 }, { "epoch": 4.174094707520891, "grad_norm": 3.3833820819854736, "learning_rate": 4.161597498578738e-05, "loss": 1.789, "step": 2997 }, { "epoch": 4.1754874651810585, "grad_norm": 3.274717330932617, "learning_rate": 4.160602615122228e-05, "loss": 1.7684, "step": 2998 }, { "epoch": 4.176880222841225, "grad_norm": 3.4522273540496826, "learning_rate": 4.159607731665719e-05, "loss": 1.9979, "step": 2999 }, { "epoch": 4.178272980501393, "grad_norm": 3.6612470149993896, "learning_rate": 4.158612848209209e-05, "loss": 1.9038, "step": 3000 }, { "epoch": 4.17966573816156, "grad_norm": 3.563570022583008, "learning_rate": 4.1576179647527e-05, "loss": 2.0586, "step": 3001 }, { "epoch": 4.181058495821727, "grad_norm": 3.3609519004821777, "learning_rate": 4.1566230812961905e-05, "loss": 1.6657, "step": 3002 }, { "epoch": 4.1824512534818945, "grad_norm": 3.595749616622925, "learning_rate": 4.1556281978396816e-05, "loss": 1.984, "step": 3003 }, { "epoch": 4.183844011142061, "grad_norm": 3.685802698135376, "learning_rate": 4.154633314383172e-05, "loss": 1.9852, "step": 3004 }, { "epoch": 4.185236768802229, "grad_norm": 3.6919596195220947, "learning_rate": 4.1536384309266624e-05, "loss": 2.2896, "step": 3005 }, { "epoch": 4.186629526462395, "grad_norm": 4.045046329498291, "learning_rate": 4.152643547470153e-05, "loss": 1.9941, "step": 3006 }, { "epoch": 4.188022284122563, "grad_norm": 4.211788654327393, "learning_rate": 4.151648664013644e-05, "loss": 1.8658, "step": 3007 }, { "epoch": 4.18941504178273, "grad_norm": 3.4465973377227783, "learning_rate": 4.150653780557134e-05, "loss": 1.9987, "step": 3008 }, { "epoch": 4.190807799442897, "grad_norm": 3.3546183109283447, "learning_rate": 4.1496588971006254e-05, "loss": 1.9247, "step": 3009 }, { "epoch": 4.192200557103064, "grad_norm": 3.7048683166503906, "learning_rate": 4.148664013644115e-05, "loss": 2.0285, "step": 3010 }, { "epoch": 4.193593314763231, "grad_norm": 3.253375768661499, "learning_rate": 4.147669130187606e-05, "loss": 1.9872, "step": 3011 }, { "epoch": 4.194986072423398, "grad_norm": 3.39601469039917, "learning_rate": 4.1466742467310966e-05, "loss": 1.7865, "step": 3012 }, { "epoch": 4.196378830083566, "grad_norm": 3.560274362564087, "learning_rate": 4.145679363274588e-05, "loss": 1.9759, "step": 3013 }, { "epoch": 4.197771587743732, "grad_norm": 3.3260228633880615, "learning_rate": 4.144684479818078e-05, "loss": 1.7974, "step": 3014 }, { "epoch": 4.1991643454039, "grad_norm": 3.7586898803710938, "learning_rate": 4.1436895963615685e-05, "loss": 2.3483, "step": 3015 }, { "epoch": 4.2005571030640665, "grad_norm": 3.291304349899292, "learning_rate": 4.142694712905059e-05, "loss": 1.8448, "step": 3016 }, { "epoch": 4.201949860724234, "grad_norm": 3.7757081985473633, "learning_rate": 4.14169982944855e-05, "loss": 1.7119, "step": 3017 }, { "epoch": 4.203342618384401, "grad_norm": 3.5498621463775635, "learning_rate": 4.1407049459920404e-05, "loss": 1.9627, "step": 3018 }, { "epoch": 4.204735376044568, "grad_norm": 3.449716567993164, "learning_rate": 4.1397100625355315e-05, "loss": 1.9399, "step": 3019 }, { "epoch": 4.206128133704736, "grad_norm": 3.383650779724121, "learning_rate": 4.138715179079021e-05, "loss": 1.8659, "step": 3020 }, { "epoch": 4.2075208913649025, "grad_norm": 3.405827522277832, "learning_rate": 4.137720295622512e-05, "loss": 1.6832, "step": 3021 }, { "epoch": 4.20891364902507, "grad_norm": 3.6824240684509277, "learning_rate": 4.1367254121660034e-05, "loss": 2.4785, "step": 3022 }, { "epoch": 4.210306406685237, "grad_norm": 3.528207778930664, "learning_rate": 4.135730528709494e-05, "loss": 2.0306, "step": 3023 }, { "epoch": 4.211699164345404, "grad_norm": 3.8610434532165527, "learning_rate": 4.134735645252985e-05, "loss": 2.5256, "step": 3024 }, { "epoch": 4.213091922005571, "grad_norm": 3.419217586517334, "learning_rate": 4.1337407617964746e-05, "loss": 1.9272, "step": 3025 }, { "epoch": 4.2144846796657385, "grad_norm": 3.739485502243042, "learning_rate": 4.132745878339966e-05, "loss": 2.0173, "step": 3026 }, { "epoch": 4.215877437325905, "grad_norm": 3.5592610836029053, "learning_rate": 4.131750994883456e-05, "loss": 2.1957, "step": 3027 }, { "epoch": 4.217270194986073, "grad_norm": 3.525094747543335, "learning_rate": 4.130756111426947e-05, "loss": 2.2607, "step": 3028 }, { "epoch": 4.218662952646239, "grad_norm": 4.066737174987793, "learning_rate": 4.1297612279704376e-05, "loss": 2.1214, "step": 3029 }, { "epoch": 4.220055710306407, "grad_norm": 5.838018894195557, "learning_rate": 4.128766344513928e-05, "loss": 2.3155, "step": 3030 }, { "epoch": 4.221448467966574, "grad_norm": 3.724910259246826, "learning_rate": 4.1277714610574184e-05, "loss": 2.2864, "step": 3031 }, { "epoch": 4.222841225626741, "grad_norm": 3.427699089050293, "learning_rate": 4.1267765776009095e-05, "loss": 1.9764, "step": 3032 }, { "epoch": 4.224233983286908, "grad_norm": 3.9147324562072754, "learning_rate": 4.1257816941444e-05, "loss": 2.2344, "step": 3033 }, { "epoch": 4.225626740947075, "grad_norm": 3.2432050704956055, "learning_rate": 4.124786810687891e-05, "loss": 1.5073, "step": 3034 }, { "epoch": 4.227019498607242, "grad_norm": 4.5403032302856445, "learning_rate": 4.123791927231381e-05, "loss": 1.8843, "step": 3035 }, { "epoch": 4.22841225626741, "grad_norm": 3.7331345081329346, "learning_rate": 4.122797043774872e-05, "loss": 2.126, "step": 3036 }, { "epoch": 4.229805013927576, "grad_norm": 3.9318816661834717, "learning_rate": 4.121802160318362e-05, "loss": 2.052, "step": 3037 }, { "epoch": 4.231197771587744, "grad_norm": 4.157036304473877, "learning_rate": 4.120807276861853e-05, "loss": 1.9879, "step": 3038 }, { "epoch": 4.2325905292479105, "grad_norm": 3.6946725845336914, "learning_rate": 4.119812393405344e-05, "loss": 1.6204, "step": 3039 }, { "epoch": 4.233983286908078, "grad_norm": 3.3176159858703613, "learning_rate": 4.118817509948834e-05, "loss": 1.7824, "step": 3040 }, { "epoch": 4.235376044568245, "grad_norm": 3.661743402481079, "learning_rate": 4.1178226264923245e-05, "loss": 2.0289, "step": 3041 }, { "epoch": 4.236768802228412, "grad_norm": 3.5172345638275146, "learning_rate": 4.1168277430358156e-05, "loss": 1.8457, "step": 3042 }, { "epoch": 4.23816155988858, "grad_norm": 3.398505210876465, "learning_rate": 4.115832859579306e-05, "loss": 1.7284, "step": 3043 }, { "epoch": 4.2395543175487465, "grad_norm": 3.7780115604400635, "learning_rate": 4.1148379761227964e-05, "loss": 2.1162, "step": 3044 }, { "epoch": 4.240947075208914, "grad_norm": 4.354976654052734, "learning_rate": 4.113843092666287e-05, "loss": 2.555, "step": 3045 }, { "epoch": 4.242339832869081, "grad_norm": 3.9801361560821533, "learning_rate": 4.112848209209778e-05, "loss": 2.3081, "step": 3046 }, { "epoch": 4.243732590529248, "grad_norm": 3.451056718826294, "learning_rate": 4.111853325753269e-05, "loss": 2.1083, "step": 3047 }, { "epoch": 4.245125348189415, "grad_norm": 3.6292426586151123, "learning_rate": 4.1108584422967594e-05, "loss": 1.8113, "step": 3048 }, { "epoch": 4.2465181058495824, "grad_norm": 3.9774768352508545, "learning_rate": 4.109863558840249e-05, "loss": 1.9119, "step": 3049 }, { "epoch": 4.247910863509749, "grad_norm": 3.4504611492156982, "learning_rate": 4.10886867538374e-05, "loss": 1.8894, "step": 3050 }, { "epoch": 4.249303621169917, "grad_norm": 4.225854873657227, "learning_rate": 4.107873791927231e-05, "loss": 2.3452, "step": 3051 }, { "epoch": 4.250696378830083, "grad_norm": 3.4578373432159424, "learning_rate": 4.106878908470722e-05, "loss": 1.7064, "step": 3052 }, { "epoch": 4.252089136490251, "grad_norm": 4.071245193481445, "learning_rate": 4.105884025014213e-05, "loss": 2.2979, "step": 3053 }, { "epoch": 4.2534818941504176, "grad_norm": 3.548414707183838, "learning_rate": 4.1048891415577025e-05, "loss": 2.2977, "step": 3054 }, { "epoch": 4.254874651810585, "grad_norm": 3.9203896522521973, "learning_rate": 4.1038942581011936e-05, "loss": 1.9007, "step": 3055 }, { "epoch": 4.256267409470752, "grad_norm": 3.354689359664917, "learning_rate": 4.102899374644684e-05, "loss": 2.0673, "step": 3056 }, { "epoch": 4.257660167130919, "grad_norm": 3.854431629180908, "learning_rate": 4.101904491188175e-05, "loss": 2.1364, "step": 3057 }, { "epoch": 4.259052924791086, "grad_norm": 3.1632230281829834, "learning_rate": 4.1009096077316655e-05, "loss": 1.9589, "step": 3058 }, { "epoch": 4.2604456824512535, "grad_norm": 3.4056951999664307, "learning_rate": 4.099914724275156e-05, "loss": 1.7211, "step": 3059 }, { "epoch": 4.26183844011142, "grad_norm": 3.9283065795898438, "learning_rate": 4.098919840818646e-05, "loss": 2.3037, "step": 3060 }, { "epoch": 4.263231197771588, "grad_norm": 3.8917815685272217, "learning_rate": 4.0979249573621374e-05, "loss": 2.0805, "step": 3061 }, { "epoch": 4.264623955431755, "grad_norm": 3.466789960861206, "learning_rate": 4.096930073905628e-05, "loss": 1.7961, "step": 3062 }, { "epoch": 4.266016713091922, "grad_norm": 3.8265340328216553, "learning_rate": 4.095935190449119e-05, "loss": 2.1436, "step": 3063 }, { "epoch": 4.2674094707520895, "grad_norm": 3.4796509742736816, "learning_rate": 4.0949403069926086e-05, "loss": 1.6483, "step": 3064 }, { "epoch": 4.268802228412256, "grad_norm": 3.108628988265991, "learning_rate": 4.0939454235361e-05, "loss": 1.7464, "step": 3065 }, { "epoch": 4.270194986072424, "grad_norm": 3.9219963550567627, "learning_rate": 4.09295054007959e-05, "loss": 1.8372, "step": 3066 }, { "epoch": 4.27158774373259, "grad_norm": 4.510400295257568, "learning_rate": 4.091955656623081e-05, "loss": 1.7882, "step": 3067 }, { "epoch": 4.272980501392758, "grad_norm": 3.6423346996307373, "learning_rate": 4.0909607731665716e-05, "loss": 2.1274, "step": 3068 }, { "epoch": 4.274373259052925, "grad_norm": 3.5593934059143066, "learning_rate": 4.089965889710062e-05, "loss": 1.8293, "step": 3069 }, { "epoch": 4.275766016713092, "grad_norm": 3.4503164291381836, "learning_rate": 4.0889710062535524e-05, "loss": 1.9578, "step": 3070 }, { "epoch": 4.277158774373259, "grad_norm": 3.593991994857788, "learning_rate": 4.0879761227970435e-05, "loss": 1.9559, "step": 3071 }, { "epoch": 4.278551532033426, "grad_norm": 3.898189067840576, "learning_rate": 4.086981239340534e-05, "loss": 2.4211, "step": 3072 }, { "epoch": 4.279944289693593, "grad_norm": 3.5171823501586914, "learning_rate": 4.085986355884025e-05, "loss": 1.7952, "step": 3073 }, { "epoch": 4.281337047353761, "grad_norm": 3.7271249294281006, "learning_rate": 4.084991472427515e-05, "loss": 2.1002, "step": 3074 }, { "epoch": 4.282729805013927, "grad_norm": 4.111434459686279, "learning_rate": 4.083996588971006e-05, "loss": 2.3111, "step": 3075 }, { "epoch": 4.284122562674095, "grad_norm": 3.5992376804351807, "learning_rate": 4.083001705514497e-05, "loss": 1.8289, "step": 3076 }, { "epoch": 4.2855153203342615, "grad_norm": 3.5889201164245605, "learning_rate": 4.082006822057987e-05, "loss": 2.1275, "step": 3077 }, { "epoch": 4.286908077994429, "grad_norm": 4.682888984680176, "learning_rate": 4.0810119386014784e-05, "loss": 2.1404, "step": 3078 }, { "epoch": 4.288300835654596, "grad_norm": 3.8084707260131836, "learning_rate": 4.080017055144968e-05, "loss": 1.8622, "step": 3079 }, { "epoch": 4.289693593314763, "grad_norm": 3.434199094772339, "learning_rate": 4.079022171688459e-05, "loss": 1.909, "step": 3080 }, { "epoch": 4.29108635097493, "grad_norm": 3.2674553394317627, "learning_rate": 4.0780272882319496e-05, "loss": 1.8136, "step": 3081 }, { "epoch": 4.2924791086350975, "grad_norm": 3.6555733680725098, "learning_rate": 4.077032404775441e-05, "loss": 1.9099, "step": 3082 }, { "epoch": 4.293871866295264, "grad_norm": 6.769938945770264, "learning_rate": 4.0760375213189304e-05, "loss": 1.9184, "step": 3083 }, { "epoch": 4.295264623955432, "grad_norm": 3.5074851512908936, "learning_rate": 4.0750426378624215e-05, "loss": 1.9444, "step": 3084 }, { "epoch": 4.296657381615599, "grad_norm": 4.210463523864746, "learning_rate": 4.074047754405912e-05, "loss": 1.9365, "step": 3085 }, { "epoch": 4.298050139275766, "grad_norm": 3.9172286987304688, "learning_rate": 4.073052870949403e-05, "loss": 2.022, "step": 3086 }, { "epoch": 4.2994428969359335, "grad_norm": 3.6236064434051514, "learning_rate": 4.0720579874928934e-05, "loss": 2.1497, "step": 3087 }, { "epoch": 4.3008356545961, "grad_norm": 4.024149417877197, "learning_rate": 4.071063104036384e-05, "loss": 2.3488, "step": 3088 }, { "epoch": 4.302228412256268, "grad_norm": 5.756144046783447, "learning_rate": 4.070068220579874e-05, "loss": 1.8647, "step": 3089 }, { "epoch": 4.303621169916434, "grad_norm": 3.4565234184265137, "learning_rate": 4.069073337123365e-05, "loss": 2.3004, "step": 3090 }, { "epoch": 4.305013927576602, "grad_norm": 3.532681941986084, "learning_rate": 4.068078453666856e-05, "loss": 1.7751, "step": 3091 }, { "epoch": 4.306406685236769, "grad_norm": 4.027726650238037, "learning_rate": 4.067083570210347e-05, "loss": 2.2313, "step": 3092 }, { "epoch": 4.307799442896936, "grad_norm": 3.7110137939453125, "learning_rate": 4.0660886867538365e-05, "loss": 2.0806, "step": 3093 }, { "epoch": 4.309192200557103, "grad_norm": 4.137944221496582, "learning_rate": 4.0650938032973276e-05, "loss": 2.1601, "step": 3094 }, { "epoch": 4.31058495821727, "grad_norm": 3.6126034259796143, "learning_rate": 4.064098919840818e-05, "loss": 2.1295, "step": 3095 }, { "epoch": 4.311977715877437, "grad_norm": 4.281980514526367, "learning_rate": 4.063104036384309e-05, "loss": 2.3458, "step": 3096 }, { "epoch": 4.313370473537605, "grad_norm": 3.6331682205200195, "learning_rate": 4.0621091529277995e-05, "loss": 1.6873, "step": 3097 }, { "epoch": 4.314763231197771, "grad_norm": 3.5378000736236572, "learning_rate": 4.06111426947129e-05, "loss": 1.5119, "step": 3098 }, { "epoch": 4.316155988857939, "grad_norm": 3.42341685295105, "learning_rate": 4.06011938601478e-05, "loss": 1.7202, "step": 3099 }, { "epoch": 4.3175487465181055, "grad_norm": 3.846165418624878, "learning_rate": 4.0591245025582714e-05, "loss": 1.9793, "step": 3100 }, { "epoch": 4.318941504178273, "grad_norm": 3.754512071609497, "learning_rate": 4.058129619101762e-05, "loss": 2.1661, "step": 3101 }, { "epoch": 4.32033426183844, "grad_norm": 6.259832859039307, "learning_rate": 4.057134735645253e-05, "loss": 2.3145, "step": 3102 }, { "epoch": 4.321727019498607, "grad_norm": 3.235311985015869, "learning_rate": 4.0561398521887426e-05, "loss": 1.8265, "step": 3103 }, { "epoch": 4.323119777158775, "grad_norm": 3.2759287357330322, "learning_rate": 4.055144968732234e-05, "loss": 1.8827, "step": 3104 }, { "epoch": 4.3245125348189415, "grad_norm": 3.4988794326782227, "learning_rate": 4.054150085275725e-05, "loss": 1.925, "step": 3105 }, { "epoch": 4.325905292479109, "grad_norm": 3.346482038497925, "learning_rate": 4.053155201819215e-05, "loss": 1.8684, "step": 3106 }, { "epoch": 4.327298050139276, "grad_norm": 3.441643714904785, "learning_rate": 4.052160318362706e-05, "loss": 1.7618, "step": 3107 }, { "epoch": 4.328690807799443, "grad_norm": 3.765227794647217, "learning_rate": 4.051165434906196e-05, "loss": 1.8764, "step": 3108 }, { "epoch": 4.33008356545961, "grad_norm": 4.57131814956665, "learning_rate": 4.050170551449687e-05, "loss": 2.296, "step": 3109 }, { "epoch": 4.3314763231197775, "grad_norm": 3.5450220108032227, "learning_rate": 4.0491756679931775e-05, "loss": 1.8936, "step": 3110 }, { "epoch": 4.332869080779944, "grad_norm": 3.6709892749786377, "learning_rate": 4.0481807845366686e-05, "loss": 2.3217, "step": 3111 }, { "epoch": 4.334261838440112, "grad_norm": 7.978917121887207, "learning_rate": 4.047185901080159e-05, "loss": 2.1545, "step": 3112 }, { "epoch": 4.335654596100278, "grad_norm": 3.426018238067627, "learning_rate": 4.0461910176236494e-05, "loss": 1.7577, "step": 3113 }, { "epoch": 4.337047353760446, "grad_norm": 5.184499740600586, "learning_rate": 4.04519613416714e-05, "loss": 1.9331, "step": 3114 }, { "epoch": 4.338440111420613, "grad_norm": 3.6694085597991943, "learning_rate": 4.044201250710631e-05, "loss": 2.2532, "step": 3115 }, { "epoch": 4.33983286908078, "grad_norm": 3.379873275756836, "learning_rate": 4.043206367254121e-05, "loss": 1.7379, "step": 3116 }, { "epoch": 4.341225626740947, "grad_norm": 3.5996804237365723, "learning_rate": 4.0422114837976124e-05, "loss": 1.9487, "step": 3117 }, { "epoch": 4.342618384401114, "grad_norm": 4.125095367431641, "learning_rate": 4.041216600341102e-05, "loss": 2.3051, "step": 3118 }, { "epoch": 4.344011142061281, "grad_norm": 3.487342596054077, "learning_rate": 4.040221716884593e-05, "loss": 2.0359, "step": 3119 }, { "epoch": 4.345403899721449, "grad_norm": 4.149247646331787, "learning_rate": 4.0392268334280836e-05, "loss": 1.8801, "step": 3120 }, { "epoch": 4.346796657381615, "grad_norm": 3.596158981323242, "learning_rate": 4.038231949971575e-05, "loss": 1.9599, "step": 3121 }, { "epoch": 4.348189415041783, "grad_norm": 3.6659107208251953, "learning_rate": 4.037237066515065e-05, "loss": 2.0527, "step": 3122 }, { "epoch": 4.3495821727019495, "grad_norm": 3.5319089889526367, "learning_rate": 4.0362421830585555e-05, "loss": 1.9435, "step": 3123 }, { "epoch": 4.350974930362117, "grad_norm": 3.8539369106292725, "learning_rate": 4.035247299602046e-05, "loss": 1.7566, "step": 3124 }, { "epoch": 4.352367688022284, "grad_norm": 3.5454413890838623, "learning_rate": 4.034252416145537e-05, "loss": 1.8269, "step": 3125 }, { "epoch": 4.353760445682451, "grad_norm": 5.290493965148926, "learning_rate": 4.0332575326890274e-05, "loss": 2.0032, "step": 3126 }, { "epoch": 4.355153203342619, "grad_norm": 3.8820858001708984, "learning_rate": 4.032262649232518e-05, "loss": 2.0953, "step": 3127 }, { "epoch": 4.3565459610027855, "grad_norm": 3.353656768798828, "learning_rate": 4.031267765776008e-05, "loss": 2.0047, "step": 3128 }, { "epoch": 4.357938718662953, "grad_norm": 3.3565492630004883, "learning_rate": 4.030272882319499e-05, "loss": 2.1301, "step": 3129 }, { "epoch": 4.35933147632312, "grad_norm": 3.818831205368042, "learning_rate": 4.02927799886299e-05, "loss": 1.9086, "step": 3130 }, { "epoch": 4.360724233983287, "grad_norm": 3.3351616859436035, "learning_rate": 4.028283115406481e-05, "loss": 1.6885, "step": 3131 }, { "epoch": 4.362116991643454, "grad_norm": 3.724071979522705, "learning_rate": 4.0272882319499706e-05, "loss": 2.3188, "step": 3132 }, { "epoch": 4.3635097493036215, "grad_norm": 4.343799591064453, "learning_rate": 4.0262933484934616e-05, "loss": 2.209, "step": 3133 }, { "epoch": 4.364902506963788, "grad_norm": 4.293625831604004, "learning_rate": 4.025298465036953e-05, "loss": 2.315, "step": 3134 }, { "epoch": 4.366295264623956, "grad_norm": 3.800335645675659, "learning_rate": 4.024303581580443e-05, "loss": 1.8779, "step": 3135 }, { "epoch": 4.367688022284122, "grad_norm": 4.923582553863525, "learning_rate": 4.023308698123934e-05, "loss": 2.2346, "step": 3136 }, { "epoch": 4.36908077994429, "grad_norm": 3.8185031414031982, "learning_rate": 4.022313814667424e-05, "loss": 1.7602, "step": 3137 }, { "epoch": 4.370473537604457, "grad_norm": 3.3094518184661865, "learning_rate": 4.021318931210915e-05, "loss": 1.8159, "step": 3138 }, { "epoch": 4.371866295264624, "grad_norm": 3.3837151527404785, "learning_rate": 4.0203240477544054e-05, "loss": 2.0494, "step": 3139 }, { "epoch": 4.373259052924791, "grad_norm": 3.692568302154541, "learning_rate": 4.0193291642978965e-05, "loss": 1.951, "step": 3140 }, { "epoch": 4.374651810584958, "grad_norm": 3.4474568367004395, "learning_rate": 4.018334280841387e-05, "loss": 2.0262, "step": 3141 }, { "epoch": 4.376044568245125, "grad_norm": 3.624847888946533, "learning_rate": 4.0173393973848773e-05, "loss": 2.3257, "step": 3142 }, { "epoch": 4.3774373259052926, "grad_norm": 3.9856884479522705, "learning_rate": 4.016344513928368e-05, "loss": 2.4661, "step": 3143 }, { "epoch": 4.378830083565459, "grad_norm": 3.9437685012817383, "learning_rate": 4.015349630471859e-05, "loss": 2.1859, "step": 3144 }, { "epoch": 4.380222841225627, "grad_norm": 3.850909948348999, "learning_rate": 4.014354747015349e-05, "loss": 2.142, "step": 3145 }, { "epoch": 4.381615598885794, "grad_norm": 3.9388673305511475, "learning_rate": 4.01335986355884e-05, "loss": 2.2247, "step": 3146 }, { "epoch": 4.383008356545961, "grad_norm": 5.086613655090332, "learning_rate": 4.01236498010233e-05, "loss": 2.4244, "step": 3147 }, { "epoch": 4.3844011142061285, "grad_norm": 3.144426107406616, "learning_rate": 4.011370096645821e-05, "loss": 1.3181, "step": 3148 }, { "epoch": 4.385793871866295, "grad_norm": 3.7608373165130615, "learning_rate": 4.0103752131893116e-05, "loss": 1.9655, "step": 3149 }, { "epoch": 4.387186629526463, "grad_norm": 3.7845873832702637, "learning_rate": 4.0093803297328026e-05, "loss": 2.1247, "step": 3150 }, { "epoch": 4.388579387186629, "grad_norm": 3.580233573913574, "learning_rate": 4.008385446276293e-05, "loss": 1.903, "step": 3151 }, { "epoch": 4.389972144846797, "grad_norm": 4.054161071777344, "learning_rate": 4.0073905628197835e-05, "loss": 1.8935, "step": 3152 }, { "epoch": 4.391364902506964, "grad_norm": 3.250823974609375, "learning_rate": 4.006395679363274e-05, "loss": 1.6088, "step": 3153 }, { "epoch": 4.392757660167131, "grad_norm": 3.544598340988159, "learning_rate": 4.005400795906765e-05, "loss": 1.9608, "step": 3154 }, { "epoch": 4.394150417827298, "grad_norm": 4.119276523590088, "learning_rate": 4.0044059124502554e-05, "loss": 2.116, "step": 3155 }, { "epoch": 4.395543175487465, "grad_norm": 6.200865745544434, "learning_rate": 4.0034110289937464e-05, "loss": 1.8136, "step": 3156 }, { "epoch": 4.396935933147632, "grad_norm": 3.2142770290374756, "learning_rate": 4.002416145537236e-05, "loss": 1.5655, "step": 3157 }, { "epoch": 4.3983286908078, "grad_norm": 4.196826934814453, "learning_rate": 4.001421262080727e-05, "loss": 2.0261, "step": 3158 }, { "epoch": 4.399721448467966, "grad_norm": 3.638808488845825, "learning_rate": 4.000426378624218e-05, "loss": 1.9075, "step": 3159 }, { "epoch": 4.401114206128134, "grad_norm": 3.1467158794403076, "learning_rate": 3.999431495167709e-05, "loss": 2.0249, "step": 3160 }, { "epoch": 4.4025069637883005, "grad_norm": 3.6672751903533936, "learning_rate": 3.9984366117112e-05, "loss": 1.9514, "step": 3161 }, { "epoch": 4.403899721448468, "grad_norm": 3.508639335632324, "learning_rate": 3.9974417282546896e-05, "loss": 1.762, "step": 3162 }, { "epoch": 4.405292479108635, "grad_norm": 3.3780128955841064, "learning_rate": 3.9964468447981806e-05, "loss": 2.0071, "step": 3163 }, { "epoch": 4.406685236768802, "grad_norm": 3.5957813262939453, "learning_rate": 3.995451961341671e-05, "loss": 2.0229, "step": 3164 }, { "epoch": 4.408077994428969, "grad_norm": 4.51296854019165, "learning_rate": 3.994457077885162e-05, "loss": 1.9209, "step": 3165 }, { "epoch": 4.4094707520891365, "grad_norm": 3.6258111000061035, "learning_rate": 3.9934621944286525e-05, "loss": 2.0767, "step": 3166 }, { "epoch": 4.410863509749303, "grad_norm": 3.485344886779785, "learning_rate": 3.992467310972143e-05, "loss": 1.8335, "step": 3167 }, { "epoch": 4.412256267409471, "grad_norm": 3.6054396629333496, "learning_rate": 3.9914724275156334e-05, "loss": 1.8713, "step": 3168 }, { "epoch": 4.413649025069638, "grad_norm": 3.383699893951416, "learning_rate": 3.9904775440591244e-05, "loss": 1.7211, "step": 3169 }, { "epoch": 4.415041782729805, "grad_norm": 3.8874316215515137, "learning_rate": 3.989482660602615e-05, "loss": 2.1207, "step": 3170 }, { "epoch": 4.4164345403899725, "grad_norm": 6.061794757843018, "learning_rate": 3.988487777146105e-05, "loss": 1.5361, "step": 3171 }, { "epoch": 4.417827298050139, "grad_norm": 3.5344083309173584, "learning_rate": 3.987492893689596e-05, "loss": 2.1361, "step": 3172 }, { "epoch": 4.419220055710307, "grad_norm": 3.9785499572753906, "learning_rate": 3.986498010233087e-05, "loss": 2.3867, "step": 3173 }, { "epoch": 4.420612813370473, "grad_norm": 3.3845369815826416, "learning_rate": 3.985503126776577e-05, "loss": 1.5828, "step": 3174 }, { "epoch": 4.422005571030641, "grad_norm": 3.99165940284729, "learning_rate": 3.984508243320068e-05, "loss": 1.9207, "step": 3175 }, { "epoch": 4.423398328690808, "grad_norm": 3.9511852264404297, "learning_rate": 3.983513359863558e-05, "loss": 2.1961, "step": 3176 }, { "epoch": 4.424791086350975, "grad_norm": 4.167506694793701, "learning_rate": 3.982518476407049e-05, "loss": 2.242, "step": 3177 }, { "epoch": 4.426183844011142, "grad_norm": 4.409764289855957, "learning_rate": 3.9815235929505395e-05, "loss": 2.0475, "step": 3178 }, { "epoch": 4.427576601671309, "grad_norm": 3.4161460399627686, "learning_rate": 3.9805287094940306e-05, "loss": 1.8625, "step": 3179 }, { "epoch": 4.428969359331476, "grad_norm": 3.3184471130371094, "learning_rate": 3.979533826037521e-05, "loss": 1.7828, "step": 3180 }, { "epoch": 4.430362116991644, "grad_norm": 4.11037015914917, "learning_rate": 3.9785389425810114e-05, "loss": 1.5766, "step": 3181 }, { "epoch": 4.43175487465181, "grad_norm": 4.095383644104004, "learning_rate": 3.977544059124502e-05, "loss": 1.8509, "step": 3182 }, { "epoch": 4.433147632311978, "grad_norm": 4.078642845153809, "learning_rate": 3.976549175667993e-05, "loss": 1.9048, "step": 3183 }, { "epoch": 4.4345403899721445, "grad_norm": 3.6307432651519775, "learning_rate": 3.975554292211483e-05, "loss": 1.7649, "step": 3184 }, { "epoch": 4.435933147632312, "grad_norm": 3.682129383087158, "learning_rate": 3.9745594087549744e-05, "loss": 1.8686, "step": 3185 }, { "epoch": 4.437325905292479, "grad_norm": 3.911226987838745, "learning_rate": 3.973564525298464e-05, "loss": 2.2978, "step": 3186 }, { "epoch": 4.438718662952646, "grad_norm": 3.18542742729187, "learning_rate": 3.972569641841955e-05, "loss": 1.5903, "step": 3187 }, { "epoch": 4.440111420612814, "grad_norm": 3.3501884937286377, "learning_rate": 3.971574758385446e-05, "loss": 1.3564, "step": 3188 }, { "epoch": 4.4415041782729805, "grad_norm": 4.470577716827393, "learning_rate": 3.9705798749289367e-05, "loss": 1.8826, "step": 3189 }, { "epoch": 4.442896935933147, "grad_norm": 3.6222612857818604, "learning_rate": 3.969584991472428e-05, "loss": 1.8286, "step": 3190 }, { "epoch": 4.444289693593315, "grad_norm": 3.79026460647583, "learning_rate": 3.9685901080159175e-05, "loss": 2.1936, "step": 3191 }, { "epoch": 4.445682451253482, "grad_norm": 3.869800329208374, "learning_rate": 3.9675952245594086e-05, "loss": 2.1218, "step": 3192 }, { "epoch": 4.447075208913649, "grad_norm": 3.4443883895874023, "learning_rate": 3.966600341102899e-05, "loss": 2.0388, "step": 3193 }, { "epoch": 4.4484679665738165, "grad_norm": 3.4055368900299072, "learning_rate": 3.96560545764639e-05, "loss": 1.6828, "step": 3194 }, { "epoch": 4.449860724233983, "grad_norm": 3.665839910507202, "learning_rate": 3.9646105741898805e-05, "loss": 2.0586, "step": 3195 }, { "epoch": 4.451253481894151, "grad_norm": 3.516286849975586, "learning_rate": 3.963615690733371e-05, "loss": 2.0749, "step": 3196 }, { "epoch": 4.452646239554317, "grad_norm": 4.141451835632324, "learning_rate": 3.962620807276861e-05, "loss": 2.3455, "step": 3197 }, { "epoch": 4.454038997214485, "grad_norm": 4.327914237976074, "learning_rate": 3.9616259238203524e-05, "loss": 2.3216, "step": 3198 }, { "epoch": 4.455431754874652, "grad_norm": 3.882049560546875, "learning_rate": 3.960631040363843e-05, "loss": 2.0532, "step": 3199 }, { "epoch": 4.456824512534819, "grad_norm": 3.3833377361297607, "learning_rate": 3.959636156907334e-05, "loss": 1.5927, "step": 3200 }, { "epoch": 4.458217270194986, "grad_norm": 3.203939199447632, "learning_rate": 3.9586412734508236e-05, "loss": 1.6464, "step": 3201 }, { "epoch": 4.459610027855153, "grad_norm": 3.5028281211853027, "learning_rate": 3.957646389994315e-05, "loss": 1.9767, "step": 3202 }, { "epoch": 4.46100278551532, "grad_norm": 4.2450666427612305, "learning_rate": 3.956651506537805e-05, "loss": 1.7586, "step": 3203 }, { "epoch": 4.462395543175488, "grad_norm": 3.6107778549194336, "learning_rate": 3.955656623081296e-05, "loss": 1.8337, "step": 3204 }, { "epoch": 4.463788300835654, "grad_norm": 3.225759267807007, "learning_rate": 3.9546617396247866e-05, "loss": 1.7373, "step": 3205 }, { "epoch": 4.465181058495822, "grad_norm": 3.8535566329956055, "learning_rate": 3.953666856168277e-05, "loss": 1.9903, "step": 3206 }, { "epoch": 4.4665738161559885, "grad_norm": 3.146134376525879, "learning_rate": 3.9526719727117674e-05, "loss": 1.5497, "step": 3207 }, { "epoch": 4.467966573816156, "grad_norm": 3.726294994354248, "learning_rate": 3.9516770892552585e-05, "loss": 2.1184, "step": 3208 }, { "epoch": 4.469359331476323, "grad_norm": 3.102433681488037, "learning_rate": 3.950682205798749e-05, "loss": 1.513, "step": 3209 }, { "epoch": 4.47075208913649, "grad_norm": 4.219781398773193, "learning_rate": 3.94968732234224e-05, "loss": 1.9301, "step": 3210 }, { "epoch": 4.472144846796658, "grad_norm": 3.8631253242492676, "learning_rate": 3.94869243888573e-05, "loss": 1.9974, "step": 3211 }, { "epoch": 4.4735376044568245, "grad_norm": 3.584851026535034, "learning_rate": 3.947697555429221e-05, "loss": 1.9607, "step": 3212 }, { "epoch": 4.474930362116992, "grad_norm": 3.701464891433716, "learning_rate": 3.946702671972711e-05, "loss": 1.8906, "step": 3213 }, { "epoch": 4.476323119777159, "grad_norm": 3.6301727294921875, "learning_rate": 3.945707788516202e-05, "loss": 1.9891, "step": 3214 }, { "epoch": 4.477715877437326, "grad_norm": 3.6762855052948, "learning_rate": 3.944712905059692e-05, "loss": 1.6908, "step": 3215 }, { "epoch": 4.479108635097493, "grad_norm": 3.854891061782837, "learning_rate": 3.943718021603183e-05, "loss": 2.3549, "step": 3216 }, { "epoch": 4.4805013927576605, "grad_norm": 3.8298966884613037, "learning_rate": 3.942723138146674e-05, "loss": 1.9698, "step": 3217 }, { "epoch": 4.481894150417827, "grad_norm": 4.6079912185668945, "learning_rate": 3.9417282546901646e-05, "loss": 1.8695, "step": 3218 }, { "epoch": 4.483286908077995, "grad_norm": 3.379887580871582, "learning_rate": 3.940733371233656e-05, "loss": 1.6363, "step": 3219 }, { "epoch": 4.484679665738161, "grad_norm": 3.9019153118133545, "learning_rate": 3.9397384877771454e-05, "loss": 1.6475, "step": 3220 }, { "epoch": 4.486072423398329, "grad_norm": 4.2392897605896, "learning_rate": 3.9387436043206365e-05, "loss": 2.2761, "step": 3221 }, { "epoch": 4.487465181058496, "grad_norm": 3.652559995651245, "learning_rate": 3.937748720864127e-05, "loss": 2.1351, "step": 3222 }, { "epoch": 4.488857938718663, "grad_norm": 3.350752592086792, "learning_rate": 3.936753837407618e-05, "loss": 1.6802, "step": 3223 }, { "epoch": 4.49025069637883, "grad_norm": 3.8682892322540283, "learning_rate": 3.9357589539511084e-05, "loss": 2.0835, "step": 3224 }, { "epoch": 4.491643454038997, "grad_norm": 3.9659388065338135, "learning_rate": 3.934764070494599e-05, "loss": 2.0425, "step": 3225 }, { "epoch": 4.493036211699164, "grad_norm": 4.409570693969727, "learning_rate": 3.933769187038089e-05, "loss": 1.8561, "step": 3226 }, { "epoch": 4.494428969359332, "grad_norm": 3.5168051719665527, "learning_rate": 3.93277430358158e-05, "loss": 1.995, "step": 3227 }, { "epoch": 4.495821727019498, "grad_norm": 3.20039701461792, "learning_rate": 3.931779420125071e-05, "loss": 1.6086, "step": 3228 }, { "epoch": 4.497214484679666, "grad_norm": 3.4170303344726562, "learning_rate": 3.930784536668562e-05, "loss": 1.7782, "step": 3229 }, { "epoch": 4.498607242339833, "grad_norm": 3.441661834716797, "learning_rate": 3.9297896532120515e-05, "loss": 1.5542, "step": 3230 }, { "epoch": 4.5, "grad_norm": 4.464572906494141, "learning_rate": 3.9287947697555426e-05, "loss": 1.8985, "step": 3231 }, { "epoch": 4.501392757660167, "grad_norm": 3.6527247428894043, "learning_rate": 3.927799886299033e-05, "loss": 1.9854, "step": 3232 }, { "epoch": 4.502785515320334, "grad_norm": 3.33829927444458, "learning_rate": 3.926805002842524e-05, "loss": 1.8484, "step": 3233 }, { "epoch": 4.504178272980502, "grad_norm": 3.4014110565185547, "learning_rate": 3.9258101193860145e-05, "loss": 1.9452, "step": 3234 }, { "epoch": 4.505571030640668, "grad_norm": 3.632737398147583, "learning_rate": 3.924815235929505e-05, "loss": 1.8714, "step": 3235 }, { "epoch": 4.506963788300836, "grad_norm": 3.3304083347320557, "learning_rate": 3.923820352472995e-05, "loss": 1.8572, "step": 3236 }, { "epoch": 4.508356545961003, "grad_norm": 3.681295156478882, "learning_rate": 3.9228254690164864e-05, "loss": 1.8286, "step": 3237 }, { "epoch": 4.50974930362117, "grad_norm": 3.4436419010162354, "learning_rate": 3.921830585559977e-05, "loss": 1.8035, "step": 3238 }, { "epoch": 4.511142061281337, "grad_norm": 3.5060203075408936, "learning_rate": 3.920835702103468e-05, "loss": 1.9942, "step": 3239 }, { "epoch": 4.512534818941504, "grad_norm": 3.54351806640625, "learning_rate": 3.9198408186469576e-05, "loss": 1.8422, "step": 3240 }, { "epoch": 4.513927576601671, "grad_norm": 4.204930305480957, "learning_rate": 3.918845935190449e-05, "loss": 1.98, "step": 3241 }, { "epoch": 4.515320334261839, "grad_norm": 4.28328800201416, "learning_rate": 3.917851051733939e-05, "loss": 1.7543, "step": 3242 }, { "epoch": 4.516713091922005, "grad_norm": 3.403895378112793, "learning_rate": 3.91685616827743e-05, "loss": 1.797, "step": 3243 }, { "epoch": 4.518105849582173, "grad_norm": 4.003249645233154, "learning_rate": 3.915861284820921e-05, "loss": 1.915, "step": 3244 }, { "epoch": 4.5194986072423395, "grad_norm": 3.870506763458252, "learning_rate": 3.914866401364411e-05, "loss": 1.8718, "step": 3245 }, { "epoch": 4.520891364902507, "grad_norm": 3.469045877456665, "learning_rate": 3.913871517907902e-05, "loss": 1.9785, "step": 3246 }, { "epoch": 4.522284122562674, "grad_norm": 3.2671995162963867, "learning_rate": 3.9128766344513925e-05, "loss": 1.9109, "step": 3247 }, { "epoch": 4.523676880222841, "grad_norm": 3.3625526428222656, "learning_rate": 3.9118817509948836e-05, "loss": 1.7483, "step": 3248 }, { "epoch": 4.525069637883008, "grad_norm": 3.8905181884765625, "learning_rate": 3.910886867538374e-05, "loss": 2.4342, "step": 3249 }, { "epoch": 4.5264623955431755, "grad_norm": 4.0306243896484375, "learning_rate": 3.9098919840818644e-05, "loss": 2.2661, "step": 3250 }, { "epoch": 4.527855153203342, "grad_norm": 4.22361946105957, "learning_rate": 3.908897100625355e-05, "loss": 1.9663, "step": 3251 }, { "epoch": 4.52924791086351, "grad_norm": 4.0032806396484375, "learning_rate": 3.907902217168846e-05, "loss": 2.3843, "step": 3252 }, { "epoch": 4.530640668523677, "grad_norm": 3.5702474117279053, "learning_rate": 3.906907333712336e-05, "loss": 1.9604, "step": 3253 }, { "epoch": 4.532033426183844, "grad_norm": 4.1573710441589355, "learning_rate": 3.9059124502558274e-05, "loss": 2.1405, "step": 3254 }, { "epoch": 4.5334261838440115, "grad_norm": 3.67299485206604, "learning_rate": 3.904917566799317e-05, "loss": 1.682, "step": 3255 }, { "epoch": 4.534818941504178, "grad_norm": 3.53426456451416, "learning_rate": 3.903922683342808e-05, "loss": 1.9956, "step": 3256 }, { "epoch": 4.536211699164346, "grad_norm": 3.801340341567993, "learning_rate": 3.9029277998862986e-05, "loss": 2.4488, "step": 3257 }, { "epoch": 4.537604456824512, "grad_norm": 3.5699822902679443, "learning_rate": 3.90193291642979e-05, "loss": 1.9663, "step": 3258 }, { "epoch": 4.53899721448468, "grad_norm": 4.429647445678711, "learning_rate": 3.9009380329732794e-05, "loss": 1.8785, "step": 3259 }, { "epoch": 4.540389972144847, "grad_norm": 3.939802885055542, "learning_rate": 3.8999431495167705e-05, "loss": 2.0498, "step": 3260 }, { "epoch": 4.541782729805014, "grad_norm": 3.1633245944976807, "learning_rate": 3.898948266060261e-05, "loss": 1.4715, "step": 3261 }, { "epoch": 4.543175487465181, "grad_norm": 3.1759512424468994, "learning_rate": 3.897953382603752e-05, "loss": 1.6324, "step": 3262 }, { "epoch": 4.544568245125348, "grad_norm": 3.4950180053710938, "learning_rate": 3.8969584991472424e-05, "loss": 1.8991, "step": 3263 }, { "epoch": 4.545961002785515, "grad_norm": 3.6954591274261475, "learning_rate": 3.895963615690733e-05, "loss": 1.9564, "step": 3264 }, { "epoch": 4.547353760445683, "grad_norm": 3.3584558963775635, "learning_rate": 3.894968732234223e-05, "loss": 1.9633, "step": 3265 }, { "epoch": 4.548746518105849, "grad_norm": 4.469377040863037, "learning_rate": 3.893973848777714e-05, "loss": 1.4949, "step": 3266 }, { "epoch": 4.550139275766017, "grad_norm": 3.7999792098999023, "learning_rate": 3.892978965321205e-05, "loss": 1.5881, "step": 3267 }, { "epoch": 4.5515320334261835, "grad_norm": 3.7202396392822266, "learning_rate": 3.891984081864696e-05, "loss": 2.0198, "step": 3268 }, { "epoch": 4.552924791086351, "grad_norm": 3.222660541534424, "learning_rate": 3.8909891984081855e-05, "loss": 1.7424, "step": 3269 }, { "epoch": 4.554317548746518, "grad_norm": 3.1136035919189453, "learning_rate": 3.8899943149516766e-05, "loss": 1.6429, "step": 3270 }, { "epoch": 4.555710306406685, "grad_norm": 4.08917236328125, "learning_rate": 3.888999431495167e-05, "loss": 1.8382, "step": 3271 }, { "epoch": 4.557103064066853, "grad_norm": 3.8375301361083984, "learning_rate": 3.888004548038658e-05, "loss": 1.93, "step": 3272 }, { "epoch": 4.5584958217270195, "grad_norm": 3.7703723907470703, "learning_rate": 3.887009664582149e-05, "loss": 1.9536, "step": 3273 }, { "epoch": 4.559888579387186, "grad_norm": 3.5565130710601807, "learning_rate": 3.886014781125639e-05, "loss": 2.0665, "step": 3274 }, { "epoch": 4.561281337047354, "grad_norm": 3.986150026321411, "learning_rate": 3.88501989766913e-05, "loss": 1.7912, "step": 3275 }, { "epoch": 4.562674094707521, "grad_norm": 3.5278992652893066, "learning_rate": 3.8840250142126204e-05, "loss": 1.6781, "step": 3276 }, { "epoch": 4.564066852367688, "grad_norm": 5.309002876281738, "learning_rate": 3.8830301307561115e-05, "loss": 1.7468, "step": 3277 }, { "epoch": 4.5654596100278555, "grad_norm": 3.887995958328247, "learning_rate": 3.882035247299602e-05, "loss": 1.7888, "step": 3278 }, { "epoch": 4.566852367688022, "grad_norm": 2.9909744262695312, "learning_rate": 3.881040363843092e-05, "loss": 1.3848, "step": 3279 }, { "epoch": 4.56824512534819, "grad_norm": 3.7661995887756348, "learning_rate": 3.880045480386583e-05, "loss": 1.7164, "step": 3280 }, { "epoch": 4.569637883008356, "grad_norm": 4.145754337310791, "learning_rate": 3.879050596930074e-05, "loss": 1.9152, "step": 3281 }, { "epoch": 4.571030640668524, "grad_norm": 3.8252034187316895, "learning_rate": 3.878055713473564e-05, "loss": 2.0524, "step": 3282 }, { "epoch": 4.572423398328691, "grad_norm": 4.5574164390563965, "learning_rate": 3.877060830017055e-05, "loss": 1.9094, "step": 3283 }, { "epoch": 4.573816155988858, "grad_norm": 4.0055012702941895, "learning_rate": 3.876065946560545e-05, "loss": 2.1733, "step": 3284 }, { "epoch": 4.575208913649025, "grad_norm": 3.6039435863494873, "learning_rate": 3.875071063104036e-05, "loss": 1.7209, "step": 3285 }, { "epoch": 4.576601671309192, "grad_norm": 3.5629451274871826, "learning_rate": 3.8740761796475265e-05, "loss": 2.0675, "step": 3286 }, { "epoch": 4.577994428969359, "grad_norm": 4.082374572753906, "learning_rate": 3.8730812961910176e-05, "loss": 1.9238, "step": 3287 }, { "epoch": 4.579387186629527, "grad_norm": 3.620668888092041, "learning_rate": 3.872086412734508e-05, "loss": 2.0418, "step": 3288 }, { "epoch": 4.580779944289693, "grad_norm": 3.4701499938964844, "learning_rate": 3.8710915292779984e-05, "loss": 1.7259, "step": 3289 }, { "epoch": 4.582172701949861, "grad_norm": 4.000033855438232, "learning_rate": 3.870096645821489e-05, "loss": 2.0742, "step": 3290 }, { "epoch": 4.5835654596100275, "grad_norm": 5.691669464111328, "learning_rate": 3.86910176236498e-05, "loss": 1.8358, "step": 3291 }, { "epoch": 4.584958217270195, "grad_norm": 3.2448883056640625, "learning_rate": 3.86810687890847e-05, "loss": 1.7106, "step": 3292 }, { "epoch": 4.586350974930362, "grad_norm": 3.6653659343719482, "learning_rate": 3.8671119954519614e-05, "loss": 1.8573, "step": 3293 }, { "epoch": 4.587743732590529, "grad_norm": 3.8329854011535645, "learning_rate": 3.866117111995451e-05, "loss": 2.0607, "step": 3294 }, { "epoch": 4.589136490250697, "grad_norm": 3.4483561515808105, "learning_rate": 3.865122228538942e-05, "loss": 1.7451, "step": 3295 }, { "epoch": 4.5905292479108635, "grad_norm": 3.327455520629883, "learning_rate": 3.8641273450824326e-05, "loss": 1.849, "step": 3296 }, { "epoch": 4.591922005571031, "grad_norm": 3.6674351692199707, "learning_rate": 3.863132461625924e-05, "loss": 1.687, "step": 3297 }, { "epoch": 4.593314763231198, "grad_norm": 4.475555419921875, "learning_rate": 3.8621375781694134e-05, "loss": 1.7948, "step": 3298 }, { "epoch": 4.594707520891365, "grad_norm": 3.7766876220703125, "learning_rate": 3.8611426947129045e-05, "loss": 1.87, "step": 3299 }, { "epoch": 4.596100278551532, "grad_norm": 3.52447247505188, "learning_rate": 3.8601478112563956e-05, "loss": 1.9438, "step": 3300 }, { "epoch": 4.5974930362116995, "grad_norm": 3.5671544075012207, "learning_rate": 3.859152927799886e-05, "loss": 2.0848, "step": 3301 }, { "epoch": 4.598885793871866, "grad_norm": 3.7626819610595703, "learning_rate": 3.858158044343377e-05, "loss": 1.7089, "step": 3302 }, { "epoch": 4.600278551532034, "grad_norm": 3.4325287342071533, "learning_rate": 3.857163160886867e-05, "loss": 1.6694, "step": 3303 }, { "epoch": 4.6016713091922, "grad_norm": 3.536877393722534, "learning_rate": 3.856168277430358e-05, "loss": 1.7793, "step": 3304 }, { "epoch": 4.603064066852368, "grad_norm": 3.5021097660064697, "learning_rate": 3.855173393973848e-05, "loss": 1.5806, "step": 3305 }, { "epoch": 4.604456824512535, "grad_norm": 4.428031921386719, "learning_rate": 3.8541785105173394e-05, "loss": 1.749, "step": 3306 }, { "epoch": 4.605849582172702, "grad_norm": 3.478275775909424, "learning_rate": 3.85318362706083e-05, "loss": 1.8083, "step": 3307 }, { "epoch": 4.607242339832869, "grad_norm": 3.6865408420562744, "learning_rate": 3.85218874360432e-05, "loss": 2.1574, "step": 3308 }, { "epoch": 4.608635097493036, "grad_norm": 3.195882558822632, "learning_rate": 3.8511938601478106e-05, "loss": 1.8088, "step": 3309 }, { "epoch": 4.610027855153203, "grad_norm": 3.710249662399292, "learning_rate": 3.850198976691302e-05, "loss": 2.3266, "step": 3310 }, { "epoch": 4.611420612813371, "grad_norm": 3.181318521499634, "learning_rate": 3.849204093234792e-05, "loss": 1.7057, "step": 3311 }, { "epoch": 4.612813370473537, "grad_norm": 3.261418342590332, "learning_rate": 3.848209209778283e-05, "loss": 2.0305, "step": 3312 }, { "epoch": 4.614206128133705, "grad_norm": 3.871584415435791, "learning_rate": 3.847214326321773e-05, "loss": 1.8123, "step": 3313 }, { "epoch": 4.615598885793872, "grad_norm": 3.5570790767669678, "learning_rate": 3.846219442865264e-05, "loss": 1.8704, "step": 3314 }, { "epoch": 4.616991643454039, "grad_norm": 3.5778913497924805, "learning_rate": 3.8452245594087544e-05, "loss": 2.0716, "step": 3315 }, { "epoch": 4.618384401114206, "grad_norm": 3.684006929397583, "learning_rate": 3.8442296759522455e-05, "loss": 1.622, "step": 3316 }, { "epoch": 4.619777158774373, "grad_norm": 4.059329032897949, "learning_rate": 3.843234792495736e-05, "loss": 2.3033, "step": 3317 }, { "epoch": 4.621169916434541, "grad_norm": 3.6581759452819824, "learning_rate": 3.842239909039226e-05, "loss": 2.1392, "step": 3318 }, { "epoch": 4.6225626740947074, "grad_norm": 3.5814528465270996, "learning_rate": 3.841245025582717e-05, "loss": 1.8765, "step": 3319 }, { "epoch": 4.623955431754875, "grad_norm": 3.860010862350464, "learning_rate": 3.840250142126208e-05, "loss": 2.1593, "step": 3320 }, { "epoch": 4.625348189415042, "grad_norm": 4.103090286254883, "learning_rate": 3.839255258669698e-05, "loss": 2.0104, "step": 3321 }, { "epoch": 4.626740947075209, "grad_norm": 3.645543098449707, "learning_rate": 3.838260375213189e-05, "loss": 1.7529, "step": 3322 }, { "epoch": 4.628133704735376, "grad_norm": 4.183781623840332, "learning_rate": 3.837265491756679e-05, "loss": 1.8564, "step": 3323 }, { "epoch": 4.629526462395543, "grad_norm": 3.8604724407196045, "learning_rate": 3.83627060830017e-05, "loss": 2.3319, "step": 3324 }, { "epoch": 4.63091922005571, "grad_norm": 3.996187210083008, "learning_rate": 3.8352757248436605e-05, "loss": 2.2144, "step": 3325 }, { "epoch": 4.632311977715878, "grad_norm": 3.186293125152588, "learning_rate": 3.8342808413871516e-05, "loss": 1.8631, "step": 3326 }, { "epoch": 4.633704735376044, "grad_norm": 3.9150609970092773, "learning_rate": 3.833285957930643e-05, "loss": 1.9915, "step": 3327 }, { "epoch": 4.635097493036212, "grad_norm": 3.5735533237457275, "learning_rate": 3.8322910744741324e-05, "loss": 1.8528, "step": 3328 }, { "epoch": 4.6364902506963785, "grad_norm": 3.7437288761138916, "learning_rate": 3.8312961910176235e-05, "loss": 2.0451, "step": 3329 }, { "epoch": 4.637883008356546, "grad_norm": 3.541717767715454, "learning_rate": 3.830301307561114e-05, "loss": 1.7844, "step": 3330 }, { "epoch": 4.639275766016713, "grad_norm": 3.573110818862915, "learning_rate": 3.829306424104605e-05, "loss": 1.8942, "step": 3331 }, { "epoch": 4.64066852367688, "grad_norm": 4.328987121582031, "learning_rate": 3.8283115406480954e-05, "loss": 2.1885, "step": 3332 }, { "epoch": 4.642061281337047, "grad_norm": 4.351795196533203, "learning_rate": 3.827316657191586e-05, "loss": 1.9701, "step": 3333 }, { "epoch": 4.6434540389972145, "grad_norm": 3.7214114665985107, "learning_rate": 3.826321773735076e-05, "loss": 1.8902, "step": 3334 }, { "epoch": 4.644846796657381, "grad_norm": 4.347647190093994, "learning_rate": 3.825326890278567e-05, "loss": 1.9213, "step": 3335 }, { "epoch": 4.646239554317549, "grad_norm": 4.3626909255981445, "learning_rate": 3.824332006822058e-05, "loss": 2.0552, "step": 3336 }, { "epoch": 4.647632311977716, "grad_norm": 5.106423854827881, "learning_rate": 3.823337123365549e-05, "loss": 2.0844, "step": 3337 }, { "epoch": 4.649025069637883, "grad_norm": 3.6310203075408936, "learning_rate": 3.8223422399090385e-05, "loss": 1.6776, "step": 3338 }, { "epoch": 4.65041782729805, "grad_norm": 4.02175235748291, "learning_rate": 3.8213473564525296e-05, "loss": 2.0913, "step": 3339 }, { "epoch": 4.651810584958217, "grad_norm": 3.364502191543579, "learning_rate": 3.82035247299602e-05, "loss": 1.6781, "step": 3340 }, { "epoch": 4.653203342618385, "grad_norm": 3.457348108291626, "learning_rate": 3.819357589539511e-05, "loss": 2.0024, "step": 3341 }, { "epoch": 4.654596100278551, "grad_norm": 3.8440089225769043, "learning_rate": 3.818362706083001e-05, "loss": 1.85, "step": 3342 }, { "epoch": 4.655988857938719, "grad_norm": 3.5063223838806152, "learning_rate": 3.817367822626492e-05, "loss": 1.8078, "step": 3343 }, { "epoch": 4.657381615598886, "grad_norm": 3.236264228820801, "learning_rate": 3.8163729391699823e-05, "loss": 1.6326, "step": 3344 }, { "epoch": 4.658774373259053, "grad_norm": 3.800360918045044, "learning_rate": 3.8153780557134734e-05, "loss": 2.0866, "step": 3345 }, { "epoch": 4.66016713091922, "grad_norm": 3.2970800399780273, "learning_rate": 3.814383172256964e-05, "loss": 1.8191, "step": 3346 }, { "epoch": 4.661559888579387, "grad_norm": 4.163506984710693, "learning_rate": 3.813388288800454e-05, "loss": 1.781, "step": 3347 }, { "epoch": 4.662952646239554, "grad_norm": 3.7520077228546143, "learning_rate": 3.8123934053439447e-05, "loss": 1.9486, "step": 3348 }, { "epoch": 4.664345403899722, "grad_norm": 3.782254934310913, "learning_rate": 3.811398521887436e-05, "loss": 2.0663, "step": 3349 }, { "epoch": 4.665738161559888, "grad_norm": 3.776536464691162, "learning_rate": 3.810403638430926e-05, "loss": 1.8538, "step": 3350 }, { "epoch": 4.667130919220056, "grad_norm": 3.2819957733154297, "learning_rate": 3.809408754974417e-05, "loss": 1.7891, "step": 3351 }, { "epoch": 4.6685236768802225, "grad_norm": 4.100096702575684, "learning_rate": 3.808413871517907e-05, "loss": 1.9173, "step": 3352 }, { "epoch": 4.66991643454039, "grad_norm": 4.292869567871094, "learning_rate": 3.807418988061398e-05, "loss": 1.9693, "step": 3353 }, { "epoch": 4.671309192200557, "grad_norm": 3.77502703666687, "learning_rate": 3.8064241046048885e-05, "loss": 1.8048, "step": 3354 }, { "epoch": 4.672701949860724, "grad_norm": 3.7283897399902344, "learning_rate": 3.8054292211483795e-05, "loss": 1.7996, "step": 3355 }, { "epoch": 4.674094707520892, "grad_norm": 3.706955671310425, "learning_rate": 3.8044343376918706e-05, "loss": 1.9962, "step": 3356 }, { "epoch": 4.6754874651810585, "grad_norm": 3.4003515243530273, "learning_rate": 3.8034394542353604e-05, "loss": 1.5745, "step": 3357 }, { "epoch": 4.676880222841225, "grad_norm": 3.4651854038238525, "learning_rate": 3.8024445707788514e-05, "loss": 1.9375, "step": 3358 }, { "epoch": 4.678272980501393, "grad_norm": 3.9952895641326904, "learning_rate": 3.801449687322342e-05, "loss": 1.9034, "step": 3359 }, { "epoch": 4.67966573816156, "grad_norm": 3.463488817214966, "learning_rate": 3.800454803865833e-05, "loss": 1.8098, "step": 3360 }, { "epoch": 4.681058495821727, "grad_norm": 3.772735834121704, "learning_rate": 3.7994599204093233e-05, "loss": 1.9488, "step": 3361 }, { "epoch": 4.6824512534818945, "grad_norm": 3.6313321590423584, "learning_rate": 3.798465036952814e-05, "loss": 1.8965, "step": 3362 }, { "epoch": 4.683844011142061, "grad_norm": 3.1754677295684814, "learning_rate": 3.797470153496304e-05, "loss": 1.7626, "step": 3363 }, { "epoch": 4.685236768802229, "grad_norm": 3.6511735916137695, "learning_rate": 3.796475270039795e-05, "loss": 1.8018, "step": 3364 }, { "epoch": 4.686629526462395, "grad_norm": 4.095413684844971, "learning_rate": 3.7954803865832856e-05, "loss": 2.2402, "step": 3365 }, { "epoch": 4.688022284122563, "grad_norm": 3.337965726852417, "learning_rate": 3.794485503126777e-05, "loss": 1.6362, "step": 3366 }, { "epoch": 4.68941504178273, "grad_norm": 3.9275214672088623, "learning_rate": 3.7934906196702665e-05, "loss": 1.5543, "step": 3367 }, { "epoch": 4.690807799442897, "grad_norm": 3.974381446838379, "learning_rate": 3.7924957362137575e-05, "loss": 1.8771, "step": 3368 }, { "epoch": 4.692200557103064, "grad_norm": 4.257464408874512, "learning_rate": 3.791500852757248e-05, "loss": 1.8199, "step": 3369 }, { "epoch": 4.693593314763231, "grad_norm": 3.9581267833709717, "learning_rate": 3.790505969300739e-05, "loss": 2.0797, "step": 3370 }, { "epoch": 4.694986072423398, "grad_norm": 4.050745964050293, "learning_rate": 3.7895110858442294e-05, "loss": 1.5836, "step": 3371 }, { "epoch": 4.696378830083566, "grad_norm": 3.8411006927490234, "learning_rate": 3.78851620238772e-05, "loss": 1.9422, "step": 3372 }, { "epoch": 4.697771587743732, "grad_norm": 3.5904810428619385, "learning_rate": 3.78752131893121e-05, "loss": 1.6732, "step": 3373 }, { "epoch": 4.6991643454039, "grad_norm": 4.09559965133667, "learning_rate": 3.7865264354747013e-05, "loss": 1.9492, "step": 3374 }, { "epoch": 4.7005571030640665, "grad_norm": 3.2541232109069824, "learning_rate": 3.785531552018192e-05, "loss": 1.856, "step": 3375 }, { "epoch": 4.701949860724234, "grad_norm": 3.921887159347534, "learning_rate": 3.784536668561683e-05, "loss": 1.9857, "step": 3376 }, { "epoch": 4.703342618384401, "grad_norm": 3.925410509109497, "learning_rate": 3.7835417851051726e-05, "loss": 2.043, "step": 3377 }, { "epoch": 4.704735376044568, "grad_norm": 3.9396636486053467, "learning_rate": 3.7825469016486637e-05, "loss": 1.8628, "step": 3378 }, { "epoch": 4.706128133704736, "grad_norm": 3.7705271244049072, "learning_rate": 3.781552018192154e-05, "loss": 1.8439, "step": 3379 }, { "epoch": 4.7075208913649025, "grad_norm": 4.013179302215576, "learning_rate": 3.780557134735645e-05, "loss": 1.8958, "step": 3380 }, { "epoch": 4.708913649025069, "grad_norm": 4.116738319396973, "learning_rate": 3.779562251279136e-05, "loss": 1.9555, "step": 3381 }, { "epoch": 4.710306406685237, "grad_norm": 3.614222764968872, "learning_rate": 3.778567367822626e-05, "loss": 1.8712, "step": 3382 }, { "epoch": 4.711699164345404, "grad_norm": 4.237809658050537, "learning_rate": 3.7775724843661164e-05, "loss": 1.8454, "step": 3383 }, { "epoch": 4.713091922005571, "grad_norm": 3.4724388122558594, "learning_rate": 3.7765776009096075e-05, "loss": 1.7271, "step": 3384 }, { "epoch": 4.7144846796657385, "grad_norm": 3.5229036808013916, "learning_rate": 3.7755827174530985e-05, "loss": 1.9329, "step": 3385 }, { "epoch": 4.715877437325905, "grad_norm": 3.386718988418579, "learning_rate": 3.774587833996588e-05, "loss": 1.9594, "step": 3386 }, { "epoch": 4.717270194986073, "grad_norm": 3.710028648376465, "learning_rate": 3.7735929505400794e-05, "loss": 1.7565, "step": 3387 }, { "epoch": 4.718662952646239, "grad_norm": 4.659442901611328, "learning_rate": 3.77259806708357e-05, "loss": 1.7809, "step": 3388 }, { "epoch": 4.720055710306407, "grad_norm": 3.3721377849578857, "learning_rate": 3.771603183627061e-05, "loss": 1.675, "step": 3389 }, { "epoch": 4.721448467966574, "grad_norm": 3.6961467266082764, "learning_rate": 3.770608300170551e-05, "loss": 1.9312, "step": 3390 }, { "epoch": 4.722841225626741, "grad_norm": 3.807015895843506, "learning_rate": 3.769613416714042e-05, "loss": 1.9269, "step": 3391 }, { "epoch": 4.724233983286908, "grad_norm": 4.433807849884033, "learning_rate": 3.768618533257532e-05, "loss": 1.7858, "step": 3392 }, { "epoch": 4.725626740947075, "grad_norm": 3.145347833633423, "learning_rate": 3.767623649801023e-05, "loss": 1.4089, "step": 3393 }, { "epoch": 4.727019498607242, "grad_norm": 3.8560850620269775, "learning_rate": 3.7666287663445136e-05, "loss": 2.0344, "step": 3394 }, { "epoch": 4.72841225626741, "grad_norm": 4.012540340423584, "learning_rate": 3.7656338828880046e-05, "loss": 1.768, "step": 3395 }, { "epoch": 4.729805013927576, "grad_norm": 3.9792680740356445, "learning_rate": 3.7646389994314944e-05, "loss": 1.7829, "step": 3396 }, { "epoch": 4.731197771587744, "grad_norm": 3.9786007404327393, "learning_rate": 3.7636441159749855e-05, "loss": 2.1115, "step": 3397 }, { "epoch": 4.732590529247911, "grad_norm": 3.971142053604126, "learning_rate": 3.762649232518476e-05, "loss": 1.7771, "step": 3398 }, { "epoch": 4.733983286908078, "grad_norm": 3.923093557357788, "learning_rate": 3.761654349061967e-05, "loss": 1.8994, "step": 3399 }, { "epoch": 4.735376044568245, "grad_norm": 3.5467283725738525, "learning_rate": 3.7606594656054574e-05, "loss": 1.6225, "step": 3400 }, { "epoch": 4.736768802228412, "grad_norm": 4.023221492767334, "learning_rate": 3.759664582148948e-05, "loss": 1.7095, "step": 3401 }, { "epoch": 4.73816155988858, "grad_norm": 4.696751594543457, "learning_rate": 3.758669698692438e-05, "loss": 1.7263, "step": 3402 }, { "epoch": 4.7395543175487465, "grad_norm": 3.790797472000122, "learning_rate": 3.757674815235929e-05, "loss": 1.845, "step": 3403 }, { "epoch": 4.740947075208914, "grad_norm": 3.5207347869873047, "learning_rate": 3.75667993177942e-05, "loss": 2.0653, "step": 3404 }, { "epoch": 4.742339832869081, "grad_norm": 3.7906086444854736, "learning_rate": 3.755685048322911e-05, "loss": 1.7273, "step": 3405 }, { "epoch": 4.743732590529248, "grad_norm": 3.459949493408203, "learning_rate": 3.7546901648664005e-05, "loss": 1.6935, "step": 3406 }, { "epoch": 4.745125348189415, "grad_norm": 7.457804203033447, "learning_rate": 3.7536952814098916e-05, "loss": 1.7552, "step": 3407 }, { "epoch": 4.7465181058495824, "grad_norm": 4.607371807098389, "learning_rate": 3.752700397953382e-05, "loss": 2.175, "step": 3408 }, { "epoch": 4.747910863509749, "grad_norm": 3.4577276706695557, "learning_rate": 3.751705514496873e-05, "loss": 1.6552, "step": 3409 }, { "epoch": 4.749303621169917, "grad_norm": 3.7358248233795166, "learning_rate": 3.750710631040364e-05, "loss": 1.8011, "step": 3410 }, { "epoch": 4.750696378830083, "grad_norm": 3.9303715229034424, "learning_rate": 3.749715747583854e-05, "loss": 1.7354, "step": 3411 }, { "epoch": 4.752089136490251, "grad_norm": 5.308157444000244, "learning_rate": 3.748720864127344e-05, "loss": 1.6196, "step": 3412 }, { "epoch": 4.7534818941504176, "grad_norm": 4.389355182647705, "learning_rate": 3.7477259806708354e-05, "loss": 1.7116, "step": 3413 }, { "epoch": 4.754874651810585, "grad_norm": 4.6615376472473145, "learning_rate": 3.7467310972143265e-05, "loss": 1.6443, "step": 3414 }, { "epoch": 4.756267409470752, "grad_norm": 3.608461380004883, "learning_rate": 3.745736213757817e-05, "loss": 2.0131, "step": 3415 }, { "epoch": 4.757660167130919, "grad_norm": 4.320363998413086, "learning_rate": 3.744741330301307e-05, "loss": 1.9642, "step": 3416 }, { "epoch": 4.759052924791086, "grad_norm": 3.189528226852417, "learning_rate": 3.743746446844798e-05, "loss": 1.6693, "step": 3417 }, { "epoch": 4.7604456824512535, "grad_norm": 3.9767041206359863, "learning_rate": 3.742751563388289e-05, "loss": 2.0967, "step": 3418 }, { "epoch": 4.76183844011142, "grad_norm": 3.6122829914093018, "learning_rate": 3.741756679931779e-05, "loss": 2.1152, "step": 3419 }, { "epoch": 4.763231197771588, "grad_norm": 3.6352224349975586, "learning_rate": 3.74076179647527e-05, "loss": 1.8194, "step": 3420 }, { "epoch": 4.764623955431755, "grad_norm": 3.21540904045105, "learning_rate": 3.73976691301876e-05, "loss": 1.6025, "step": 3421 }, { "epoch": 4.766016713091922, "grad_norm": 4.017698287963867, "learning_rate": 3.738772029562251e-05, "loss": 2.1644, "step": 3422 }, { "epoch": 4.767409470752089, "grad_norm": 3.6868584156036377, "learning_rate": 3.7377771461057415e-05, "loss": 2.0127, "step": 3423 }, { "epoch": 4.768802228412256, "grad_norm": 3.684056043624878, "learning_rate": 3.7367822626492326e-05, "loss": 1.9572, "step": 3424 }, { "epoch": 4.770194986072424, "grad_norm": 3.913855791091919, "learning_rate": 3.735787379192723e-05, "loss": 1.9114, "step": 3425 }, { "epoch": 4.77158774373259, "grad_norm": 3.8981807231903076, "learning_rate": 3.7347924957362134e-05, "loss": 2.2089, "step": 3426 }, { "epoch": 4.772980501392758, "grad_norm": 3.620997905731201, "learning_rate": 3.733797612279704e-05, "loss": 1.7761, "step": 3427 }, { "epoch": 4.774373259052925, "grad_norm": 3.9985365867614746, "learning_rate": 3.732802728823195e-05, "loss": 1.6269, "step": 3428 }, { "epoch": 4.775766016713092, "grad_norm": 3.124957323074341, "learning_rate": 3.731807845366685e-05, "loss": 1.6921, "step": 3429 }, { "epoch": 4.777158774373259, "grad_norm": 3.801938533782959, "learning_rate": 3.730812961910176e-05, "loss": 2.0213, "step": 3430 }, { "epoch": 4.778551532033426, "grad_norm": 3.622676134109497, "learning_rate": 3.729818078453666e-05, "loss": 1.7529, "step": 3431 }, { "epoch": 4.779944289693593, "grad_norm": 4.099602222442627, "learning_rate": 3.728823194997157e-05, "loss": 1.8388, "step": 3432 }, { "epoch": 4.781337047353761, "grad_norm": 3.208228826522827, "learning_rate": 3.7278283115406476e-05, "loss": 1.6376, "step": 3433 }, { "epoch": 4.782729805013927, "grad_norm": 4.3150105476379395, "learning_rate": 3.726833428084139e-05, "loss": 1.4145, "step": 3434 }, { "epoch": 4.784122562674095, "grad_norm": 4.187281608581543, "learning_rate": 3.7258385446276284e-05, "loss": 2.0179, "step": 3435 }, { "epoch": 4.7855153203342615, "grad_norm": 4.075669288635254, "learning_rate": 3.7248436611711195e-05, "loss": 1.7502, "step": 3436 }, { "epoch": 4.786908077994429, "grad_norm": 3.068662166595459, "learning_rate": 3.72384877771461e-05, "loss": 1.5979, "step": 3437 }, { "epoch": 4.788300835654596, "grad_norm": 3.2184665203094482, "learning_rate": 3.722853894258101e-05, "loss": 1.5834, "step": 3438 }, { "epoch": 4.789693593314763, "grad_norm": 3.7012887001037598, "learning_rate": 3.721859010801592e-05, "loss": 1.6719, "step": 3439 }, { "epoch": 4.791086350974931, "grad_norm": 3.8084328174591064, "learning_rate": 3.720864127345082e-05, "loss": 1.9171, "step": 3440 }, { "epoch": 4.7924791086350975, "grad_norm": 3.2045934200286865, "learning_rate": 3.719869243888573e-05, "loss": 1.5753, "step": 3441 }, { "epoch": 4.793871866295264, "grad_norm": 3.7816498279571533, "learning_rate": 3.718874360432063e-05, "loss": 2.1607, "step": 3442 }, { "epoch": 4.795264623955432, "grad_norm": 3.9390110969543457, "learning_rate": 3.7178794769755544e-05, "loss": 2.2207, "step": 3443 }, { "epoch": 4.796657381615599, "grad_norm": 3.879908561706543, "learning_rate": 3.716884593519045e-05, "loss": 1.7727, "step": 3444 }, { "epoch": 4.798050139275766, "grad_norm": 3.260246753692627, "learning_rate": 3.715889710062535e-05, "loss": 1.7271, "step": 3445 }, { "epoch": 4.7994428969359335, "grad_norm": 3.562711238861084, "learning_rate": 3.7148948266060256e-05, "loss": 1.5258, "step": 3446 }, { "epoch": 4.8008356545961, "grad_norm": 4.049978733062744, "learning_rate": 3.713899943149517e-05, "loss": 2.1744, "step": 3447 }, { "epoch": 4.802228412256268, "grad_norm": 3.903008460998535, "learning_rate": 3.712905059693007e-05, "loss": 1.7587, "step": 3448 }, { "epoch": 4.803621169916434, "grad_norm": 3.714756965637207, "learning_rate": 3.711910176236498e-05, "loss": 1.7595, "step": 3449 }, { "epoch": 4.805013927576602, "grad_norm": 3.900696277618408, "learning_rate": 3.710915292779988e-05, "loss": 1.9418, "step": 3450 }, { "epoch": 4.806406685236769, "grad_norm": 4.046336650848389, "learning_rate": 3.709920409323479e-05, "loss": 2.1615, "step": 3451 }, { "epoch": 4.807799442896936, "grad_norm": 2.856611728668213, "learning_rate": 3.7089255258669694e-05, "loss": 1.3214, "step": 3452 }, { "epoch": 4.809192200557103, "grad_norm": 3.4597105979919434, "learning_rate": 3.7079306424104605e-05, "loss": 1.4322, "step": 3453 }, { "epoch": 4.81058495821727, "grad_norm": 3.6220455169677734, "learning_rate": 3.706935758953951e-05, "loss": 1.9229, "step": 3454 }, { "epoch": 4.811977715877437, "grad_norm": 3.5215296745300293, "learning_rate": 3.705940875497441e-05, "loss": 1.8309, "step": 3455 }, { "epoch": 4.813370473537605, "grad_norm": 4.009708404541016, "learning_rate": 3.704945992040932e-05, "loss": 1.731, "step": 3456 }, { "epoch": 4.814763231197771, "grad_norm": 3.590940475463867, "learning_rate": 3.703951108584423e-05, "loss": 1.8334, "step": 3457 }, { "epoch": 4.816155988857939, "grad_norm": 3.6801607608795166, "learning_rate": 3.702956225127913e-05, "loss": 1.982, "step": 3458 }, { "epoch": 4.8175487465181055, "grad_norm": 3.8056976795196533, "learning_rate": 3.701961341671404e-05, "loss": 1.7785, "step": 3459 }, { "epoch": 4.818941504178273, "grad_norm": 3.730794906616211, "learning_rate": 3.700966458214894e-05, "loss": 1.7429, "step": 3460 }, { "epoch": 4.82033426183844, "grad_norm": 3.3455824851989746, "learning_rate": 3.699971574758385e-05, "loss": 1.6526, "step": 3461 }, { "epoch": 4.821727019498607, "grad_norm": 4.02980899810791, "learning_rate": 3.6989766913018755e-05, "loss": 1.7632, "step": 3462 }, { "epoch": 4.823119777158775, "grad_norm": 3.611778497695923, "learning_rate": 3.6979818078453666e-05, "loss": 1.524, "step": 3463 }, { "epoch": 4.8245125348189415, "grad_norm": 3.3080599308013916, "learning_rate": 3.696986924388858e-05, "loss": 1.7114, "step": 3464 }, { "epoch": 4.825905292479108, "grad_norm": 3.5850789546966553, "learning_rate": 3.6959920409323474e-05, "loss": 1.4328, "step": 3465 }, { "epoch": 4.827298050139276, "grad_norm": 3.918975353240967, "learning_rate": 3.694997157475838e-05, "loss": 1.9371, "step": 3466 }, { "epoch": 4.828690807799443, "grad_norm": 4.104087829589844, "learning_rate": 3.694002274019329e-05, "loss": 1.971, "step": 3467 }, { "epoch": 4.83008356545961, "grad_norm": 3.403681993484497, "learning_rate": 3.69300739056282e-05, "loss": 1.9956, "step": 3468 }, { "epoch": 4.8314763231197775, "grad_norm": 3.927757978439331, "learning_rate": 3.69201250710631e-05, "loss": 1.6046, "step": 3469 }, { "epoch": 4.832869080779944, "grad_norm": 4.31494140625, "learning_rate": 3.691017623649801e-05, "loss": 1.8073, "step": 3470 }, { "epoch": 4.834261838440112, "grad_norm": 3.8312408924102783, "learning_rate": 3.690022740193291e-05, "loss": 1.6918, "step": 3471 }, { "epoch": 4.835654596100278, "grad_norm": 3.8084018230438232, "learning_rate": 3.689027856736782e-05, "loss": 1.9344, "step": 3472 }, { "epoch": 4.837047353760446, "grad_norm": 3.9358959197998047, "learning_rate": 3.688032973280273e-05, "loss": 1.9348, "step": 3473 }, { "epoch": 4.838440111420613, "grad_norm": 4.218131065368652, "learning_rate": 3.687038089823763e-05, "loss": 1.7566, "step": 3474 }, { "epoch": 4.83983286908078, "grad_norm": 3.7826144695281982, "learning_rate": 3.6860432063672535e-05, "loss": 1.8968, "step": 3475 }, { "epoch": 4.841225626740947, "grad_norm": 4.4024200439453125, "learning_rate": 3.6850483229107446e-05, "loss": 1.9906, "step": 3476 }, { "epoch": 4.842618384401114, "grad_norm": 3.888853073120117, "learning_rate": 3.684053439454235e-05, "loss": 1.8098, "step": 3477 }, { "epoch": 4.844011142061281, "grad_norm": 3.795907974243164, "learning_rate": 3.683058555997726e-05, "loss": 2.0344, "step": 3478 }, { "epoch": 4.845403899721449, "grad_norm": 4.283677577972412, "learning_rate": 3.682063672541216e-05, "loss": 1.7764, "step": 3479 }, { "epoch": 4.846796657381615, "grad_norm": 3.312293767929077, "learning_rate": 3.681068789084707e-05, "loss": 1.6458, "step": 3480 }, { "epoch": 4.848189415041783, "grad_norm": 3.818984270095825, "learning_rate": 3.680073905628197e-05, "loss": 1.9209, "step": 3481 }, { "epoch": 4.84958217270195, "grad_norm": 3.8890469074249268, "learning_rate": 3.6790790221716884e-05, "loss": 2.1366, "step": 3482 }, { "epoch": 4.850974930362117, "grad_norm": 4.156975269317627, "learning_rate": 3.678084138715179e-05, "loss": 1.8456, "step": 3483 }, { "epoch": 4.852367688022284, "grad_norm": 3.5987191200256348, "learning_rate": 3.677089255258669e-05, "loss": 1.7846, "step": 3484 }, { "epoch": 4.853760445682451, "grad_norm": 4.193032741546631, "learning_rate": 3.6760943718021596e-05, "loss": 2.1748, "step": 3485 }, { "epoch": 4.855153203342619, "grad_norm": 3.4617221355438232, "learning_rate": 3.675099488345651e-05, "loss": 1.5625, "step": 3486 }, { "epoch": 4.8565459610027855, "grad_norm": 3.506861686706543, "learning_rate": 3.674104604889141e-05, "loss": 1.6646, "step": 3487 }, { "epoch": 4.857938718662953, "grad_norm": 3.827099323272705, "learning_rate": 3.673109721432632e-05, "loss": 2.0211, "step": 3488 }, { "epoch": 4.85933147632312, "grad_norm": 4.375415802001953, "learning_rate": 3.672114837976122e-05, "loss": 2.2855, "step": 3489 }, { "epoch": 4.860724233983287, "grad_norm": 3.5745644569396973, "learning_rate": 3.671119954519613e-05, "loss": 1.8843, "step": 3490 }, { "epoch": 4.862116991643454, "grad_norm": 4.019834518432617, "learning_rate": 3.6701250710631034e-05, "loss": 1.6538, "step": 3491 }, { "epoch": 4.8635097493036215, "grad_norm": 4.201918125152588, "learning_rate": 3.6691301876065945e-05, "loss": 1.6402, "step": 3492 }, { "epoch": 4.864902506963788, "grad_norm": 3.5356197357177734, "learning_rate": 3.6681353041500856e-05, "loss": 1.6881, "step": 3493 }, { "epoch": 4.866295264623956, "grad_norm": 3.792555332183838, "learning_rate": 3.667140420693575e-05, "loss": 2.1385, "step": 3494 }, { "epoch": 4.867688022284122, "grad_norm": 3.5409021377563477, "learning_rate": 3.666145537237066e-05, "loss": 1.6562, "step": 3495 }, { "epoch": 4.86908077994429, "grad_norm": 3.8827669620513916, "learning_rate": 3.665150653780557e-05, "loss": 1.8948, "step": 3496 }, { "epoch": 4.870473537604457, "grad_norm": 3.8804070949554443, "learning_rate": 3.664155770324048e-05, "loss": 1.6538, "step": 3497 }, { "epoch": 4.871866295264624, "grad_norm": 3.7477757930755615, "learning_rate": 3.663160886867538e-05, "loss": 1.733, "step": 3498 }, { "epoch": 4.873259052924791, "grad_norm": 3.519540548324585, "learning_rate": 3.662166003411029e-05, "loss": 2.0008, "step": 3499 }, { "epoch": 4.874651810584958, "grad_norm": 3.5823540687561035, "learning_rate": 3.661171119954519e-05, "loss": 1.645, "step": 3500 }, { "epoch": 4.876044568245125, "grad_norm": 4.832265377044678, "learning_rate": 3.66017623649801e-05, "loss": 1.6547, "step": 3501 }, { "epoch": 4.8774373259052926, "grad_norm": 3.9604856967926025, "learning_rate": 3.6591813530415006e-05, "loss": 2.0138, "step": 3502 }, { "epoch": 4.878830083565459, "grad_norm": 3.8182246685028076, "learning_rate": 3.658186469584992e-05, "loss": 1.6823, "step": 3503 }, { "epoch": 4.880222841225627, "grad_norm": 3.1158838272094727, "learning_rate": 3.6571915861284814e-05, "loss": 1.594, "step": 3504 }, { "epoch": 4.881615598885794, "grad_norm": 3.876467227935791, "learning_rate": 3.6561967026719725e-05, "loss": 1.7696, "step": 3505 }, { "epoch": 4.883008356545961, "grad_norm": 3.868966817855835, "learning_rate": 3.655201819215463e-05, "loss": 1.9037, "step": 3506 }, { "epoch": 4.884401114206128, "grad_norm": 3.630047082901001, "learning_rate": 3.654206935758954e-05, "loss": 1.504, "step": 3507 }, { "epoch": 4.885793871866295, "grad_norm": 3.239949941635132, "learning_rate": 3.6532120523024444e-05, "loss": 1.3695, "step": 3508 }, { "epoch": 4.887186629526463, "grad_norm": 4.650155544281006, "learning_rate": 3.652217168845935e-05, "loss": 2.0668, "step": 3509 }, { "epoch": 4.888579387186629, "grad_norm": 3.227562427520752, "learning_rate": 3.651222285389425e-05, "loss": 1.6167, "step": 3510 }, { "epoch": 4.889972144846797, "grad_norm": 3.7339327335357666, "learning_rate": 3.650227401932916e-05, "loss": 1.631, "step": 3511 }, { "epoch": 4.891364902506964, "grad_norm": 3.2770235538482666, "learning_rate": 3.649232518476407e-05, "loss": 1.6926, "step": 3512 }, { "epoch": 4.892757660167131, "grad_norm": 4.106587886810303, "learning_rate": 3.648237635019897e-05, "loss": 1.7791, "step": 3513 }, { "epoch": 4.894150417827298, "grad_norm": 4.150693416595459, "learning_rate": 3.6472427515633875e-05, "loss": 1.6498, "step": 3514 }, { "epoch": 4.895543175487465, "grad_norm": 3.1088902950286865, "learning_rate": 3.6462478681068786e-05, "loss": 1.641, "step": 3515 }, { "epoch": 4.896935933147632, "grad_norm": 3.614098310470581, "learning_rate": 3.645252984650369e-05, "loss": 1.518, "step": 3516 }, { "epoch": 4.8983286908078, "grad_norm": 3.6146419048309326, "learning_rate": 3.64425810119386e-05, "loss": 1.9188, "step": 3517 }, { "epoch": 4.899721448467966, "grad_norm": 3.583493947982788, "learning_rate": 3.64326321773735e-05, "loss": 1.8812, "step": 3518 }, { "epoch": 4.901114206128134, "grad_norm": 4.195201873779297, "learning_rate": 3.642268334280841e-05, "loss": 2.0226, "step": 3519 }, { "epoch": 4.9025069637883005, "grad_norm": 3.681187868118286, "learning_rate": 3.641273450824331e-05, "loss": 1.8083, "step": 3520 }, { "epoch": 4.903899721448468, "grad_norm": 3.536716938018799, "learning_rate": 3.6402785673678224e-05, "loss": 1.6227, "step": 3521 }, { "epoch": 4.905292479108635, "grad_norm": 3.566226005554199, "learning_rate": 3.6392836839113135e-05, "loss": 1.817, "step": 3522 }, { "epoch": 4.906685236768802, "grad_norm": 4.066131591796875, "learning_rate": 3.638288800454803e-05, "loss": 2.0704, "step": 3523 }, { "epoch": 4.908077994428969, "grad_norm": 3.2537622451782227, "learning_rate": 3.6372939169982936e-05, "loss": 1.7477, "step": 3524 }, { "epoch": 4.9094707520891365, "grad_norm": 4.299447059631348, "learning_rate": 3.636299033541785e-05, "loss": 1.8149, "step": 3525 }, { "epoch": 4.910863509749303, "grad_norm": 7.490352153778076, "learning_rate": 3.635304150085276e-05, "loss": 1.3041, "step": 3526 }, { "epoch": 4.912256267409471, "grad_norm": 4.535417079925537, "learning_rate": 3.634309266628766e-05, "loss": 1.9322, "step": 3527 }, { "epoch": 4.913649025069638, "grad_norm": 3.613455295562744, "learning_rate": 3.6333143831722566e-05, "loss": 1.7951, "step": 3528 }, { "epoch": 4.915041782729805, "grad_norm": 3.567254066467285, "learning_rate": 3.632319499715747e-05, "loss": 1.9571, "step": 3529 }, { "epoch": 4.9164345403899725, "grad_norm": 3.3372793197631836, "learning_rate": 3.631324616259238e-05, "loss": 1.5199, "step": 3530 }, { "epoch": 4.917827298050139, "grad_norm": 3.837397336959839, "learning_rate": 3.6303297328027285e-05, "loss": 1.5254, "step": 3531 }, { "epoch": 4.919220055710307, "grad_norm": 5.37261962890625, "learning_rate": 3.6293348493462196e-05, "loss": 1.8113, "step": 3532 }, { "epoch": 4.920612813370473, "grad_norm": 3.5950417518615723, "learning_rate": 3.6283399658897093e-05, "loss": 1.7721, "step": 3533 }, { "epoch": 4.922005571030641, "grad_norm": 3.993096113204956, "learning_rate": 3.6273450824332004e-05, "loss": 1.9407, "step": 3534 }, { "epoch": 4.923398328690808, "grad_norm": 4.111208915710449, "learning_rate": 3.626350198976691e-05, "loss": 1.4839, "step": 3535 }, { "epoch": 4.924791086350975, "grad_norm": 4.141539573669434, "learning_rate": 3.625355315520182e-05, "loss": 1.8831, "step": 3536 }, { "epoch": 4.926183844011142, "grad_norm": 3.235837697982788, "learning_rate": 3.624360432063672e-05, "loss": 1.625, "step": 3537 }, { "epoch": 4.927576601671309, "grad_norm": 3.5801455974578857, "learning_rate": 3.623365548607163e-05, "loss": 1.6913, "step": 3538 }, { "epoch": 4.928969359331476, "grad_norm": 4.944547176361084, "learning_rate": 3.622370665150653e-05, "loss": 1.6985, "step": 3539 }, { "epoch": 4.930362116991644, "grad_norm": 3.601283550262451, "learning_rate": 3.621375781694144e-05, "loss": 1.6085, "step": 3540 }, { "epoch": 4.93175487465181, "grad_norm": 5.916871070861816, "learning_rate": 3.6203808982376346e-05, "loss": 1.947, "step": 3541 }, { "epoch": 4.933147632311978, "grad_norm": 5.327152729034424, "learning_rate": 3.619386014781126e-05, "loss": 1.8474, "step": 3542 }, { "epoch": 4.9345403899721445, "grad_norm": 3.569394588470459, "learning_rate": 3.6183911313246155e-05, "loss": 1.8715, "step": 3543 }, { "epoch": 4.935933147632312, "grad_norm": 3.9451210498809814, "learning_rate": 3.6173962478681065e-05, "loss": 1.9203, "step": 3544 }, { "epoch": 4.937325905292479, "grad_norm": 3.293942928314209, "learning_rate": 3.616401364411597e-05, "loss": 1.4795, "step": 3545 }, { "epoch": 4.938718662952646, "grad_norm": 3.8608896732330322, "learning_rate": 3.615406480955088e-05, "loss": 1.7539, "step": 3546 }, { "epoch": 4.940111420612814, "grad_norm": 3.3696210384368896, "learning_rate": 3.614411597498579e-05, "loss": 1.5044, "step": 3547 }, { "epoch": 4.9415041782729805, "grad_norm": 3.858611583709717, "learning_rate": 3.613416714042069e-05, "loss": 1.5807, "step": 3548 }, { "epoch": 4.942896935933147, "grad_norm": 3.8843953609466553, "learning_rate": 3.612421830585559e-05, "loss": 2.3736, "step": 3549 }, { "epoch": 4.944289693593315, "grad_norm": 3.9137628078460693, "learning_rate": 3.61142694712905e-05, "loss": 1.7743, "step": 3550 }, { "epoch": 4.945682451253482, "grad_norm": 3.500291347503662, "learning_rate": 3.6104320636725414e-05, "loss": 1.9175, "step": 3551 }, { "epoch": 4.947075208913649, "grad_norm": 3.385045289993286, "learning_rate": 3.609437180216032e-05, "loss": 1.6842, "step": 3552 }, { "epoch": 4.9484679665738165, "grad_norm": 6.60980224609375, "learning_rate": 3.608442296759522e-05, "loss": 1.7951, "step": 3553 }, { "epoch": 4.949860724233983, "grad_norm": 3.7822020053863525, "learning_rate": 3.6074474133030126e-05, "loss": 1.557, "step": 3554 }, { "epoch": 4.951253481894151, "grad_norm": 3.5959599018096924, "learning_rate": 3.606452529846504e-05, "loss": 1.7329, "step": 3555 }, { "epoch": 4.952646239554317, "grad_norm": 3.565088987350464, "learning_rate": 3.605457646389994e-05, "loss": 1.6584, "step": 3556 }, { "epoch": 4.954038997214485, "grad_norm": 3.444204807281494, "learning_rate": 3.6044627629334845e-05, "loss": 1.9458, "step": 3557 }, { "epoch": 4.955431754874652, "grad_norm": 3.907775402069092, "learning_rate": 3.603467879476975e-05, "loss": 1.7874, "step": 3558 }, { "epoch": 4.956824512534819, "grad_norm": 3.466712474822998, "learning_rate": 3.602472996020466e-05, "loss": 1.5166, "step": 3559 }, { "epoch": 4.958217270194986, "grad_norm": 3.595690965652466, "learning_rate": 3.6014781125639564e-05, "loss": 1.9107, "step": 3560 }, { "epoch": 4.959610027855153, "grad_norm": 4.689317226409912, "learning_rate": 3.6004832291074475e-05, "loss": 2.0874, "step": 3561 }, { "epoch": 4.96100278551532, "grad_norm": 3.4974448680877686, "learning_rate": 3.599488345650937e-05, "loss": 1.9071, "step": 3562 }, { "epoch": 4.962395543175488, "grad_norm": 4.746645450592041, "learning_rate": 3.5984934621944283e-05, "loss": 1.6609, "step": 3563 }, { "epoch": 4.963788300835654, "grad_norm": 3.7093594074249268, "learning_rate": 3.597498578737919e-05, "loss": 1.7767, "step": 3564 }, { "epoch": 4.965181058495822, "grad_norm": 3.8114004135131836, "learning_rate": 3.59650369528141e-05, "loss": 2.0213, "step": 3565 }, { "epoch": 4.9665738161559885, "grad_norm": 3.6004419326782227, "learning_rate": 3.5955088118249e-05, "loss": 1.8763, "step": 3566 }, { "epoch": 4.967966573816156, "grad_norm": 3.915437698364258, "learning_rate": 3.5945139283683907e-05, "loss": 1.7432, "step": 3567 }, { "epoch": 4.969359331476323, "grad_norm": 3.8346073627471924, "learning_rate": 3.593519044911881e-05, "loss": 1.9158, "step": 3568 }, { "epoch": 4.97075208913649, "grad_norm": 3.7203516960144043, "learning_rate": 3.592524161455372e-05, "loss": 1.5696, "step": 3569 }, { "epoch": 4.972144846796658, "grad_norm": 3.4037482738494873, "learning_rate": 3.5915292779988626e-05, "loss": 1.7977, "step": 3570 }, { "epoch": 4.9735376044568245, "grad_norm": 2.969834566116333, "learning_rate": 3.5905343945423536e-05, "loss": 1.561, "step": 3571 }, { "epoch": 4.974930362116992, "grad_norm": 3.761119842529297, "learning_rate": 3.5895395110858434e-05, "loss": 1.5814, "step": 3572 }, { "epoch": 4.976323119777159, "grad_norm": 4.0623979568481445, "learning_rate": 3.5885446276293345e-05, "loss": 1.9379, "step": 3573 }, { "epoch": 4.977715877437326, "grad_norm": 4.7986626625061035, "learning_rate": 3.587549744172825e-05, "loss": 1.8651, "step": 3574 }, { "epoch": 4.979108635097493, "grad_norm": 3.2597880363464355, "learning_rate": 3.586554860716316e-05, "loss": 1.6728, "step": 3575 }, { "epoch": 4.9805013927576605, "grad_norm": 4.0020527839660645, "learning_rate": 3.585559977259807e-05, "loss": 1.4982, "step": 3576 }, { "epoch": 4.981894150417827, "grad_norm": 3.393110513687134, "learning_rate": 3.584565093803297e-05, "loss": 1.6846, "step": 3577 }, { "epoch": 4.983286908077995, "grad_norm": 3.4916343688964844, "learning_rate": 3.583570210346787e-05, "loss": 1.6884, "step": 3578 }, { "epoch": 4.984679665738161, "grad_norm": 4.145401954650879, "learning_rate": 3.582575326890278e-05, "loss": 1.6155, "step": 3579 }, { "epoch": 4.986072423398329, "grad_norm": 3.450040578842163, "learning_rate": 3.581580443433769e-05, "loss": 1.5963, "step": 3580 }, { "epoch": 4.987465181058496, "grad_norm": 4.401824474334717, "learning_rate": 3.58058555997726e-05, "loss": 2.0137, "step": 3581 }, { "epoch": 4.988857938718663, "grad_norm": 3.62214994430542, "learning_rate": 3.57959067652075e-05, "loss": 1.8588, "step": 3582 }, { "epoch": 4.99025069637883, "grad_norm": 4.877983570098877, "learning_rate": 3.5785957930642406e-05, "loss": 2.2727, "step": 3583 }, { "epoch": 4.991643454038997, "grad_norm": 3.382337808609009, "learning_rate": 3.5776009096077316e-05, "loss": 1.514, "step": 3584 }, { "epoch": 4.993036211699164, "grad_norm": 3.798260450363159, "learning_rate": 3.576606026151222e-05, "loss": 1.5972, "step": 3585 }, { "epoch": 4.994428969359332, "grad_norm": 4.307199478149414, "learning_rate": 3.575611142694713e-05, "loss": 1.6715, "step": 3586 }, { "epoch": 4.995821727019498, "grad_norm": 3.9402637481689453, "learning_rate": 3.574616259238203e-05, "loss": 1.7135, "step": 3587 }, { "epoch": 4.997214484679666, "grad_norm": 3.9421639442443848, "learning_rate": 3.573621375781694e-05, "loss": 2.3556, "step": 3588 }, { "epoch": 4.998607242339833, "grad_norm": 3.5371787548065186, "learning_rate": 3.5726264923251844e-05, "loss": 1.6727, "step": 3589 }, { "epoch": 5.0, "grad_norm": 3.5636303424835205, "learning_rate": 3.5716316088686754e-05, "loss": 1.4795, "step": 3590 }, { "epoch": 5.0013927576601676, "grad_norm": 6.826588153839111, "learning_rate": 3.570636725412166e-05, "loss": 1.4351, "step": 3591 }, { "epoch": 5.002785515320334, "grad_norm": 3.244941234588623, "learning_rate": 3.569641841955656e-05, "loss": 1.4898, "step": 3592 }, { "epoch": 5.004178272980502, "grad_norm": 2.8874032497406006, "learning_rate": 3.568646958499147e-05, "loss": 1.38, "step": 3593 }, { "epoch": 5.005571030640668, "grad_norm": 3.873886823654175, "learning_rate": 3.567652075042638e-05, "loss": 1.4358, "step": 3594 }, { "epoch": 5.006963788300836, "grad_norm": 3.3421318531036377, "learning_rate": 3.566657191586128e-05, "loss": 1.3531, "step": 3595 }, { "epoch": 5.008356545961003, "grad_norm": 3.3321070671081543, "learning_rate": 3.565662308129619e-05, "loss": 1.4568, "step": 3596 }, { "epoch": 5.00974930362117, "grad_norm": 3.352158546447754, "learning_rate": 3.564667424673109e-05, "loss": 1.5221, "step": 3597 }, { "epoch": 5.011142061281337, "grad_norm": 3.3171162605285645, "learning_rate": 3.5636725412166e-05, "loss": 1.4319, "step": 3598 }, { "epoch": 5.012534818941504, "grad_norm": 3.1241772174835205, "learning_rate": 3.5626776577600905e-05, "loss": 1.6024, "step": 3599 }, { "epoch": 5.013927576601671, "grad_norm": 2.815913677215576, "learning_rate": 3.5616827743035816e-05, "loss": 1.1249, "step": 3600 }, { "epoch": 5.015320334261839, "grad_norm": 3.150897741317749, "learning_rate": 3.560687890847071e-05, "loss": 1.3773, "step": 3601 }, { "epoch": 5.016713091922005, "grad_norm": 3.2558064460754395, "learning_rate": 3.5596930073905624e-05, "loss": 1.5564, "step": 3602 }, { "epoch": 5.018105849582173, "grad_norm": 2.789930582046509, "learning_rate": 3.558698123934053e-05, "loss": 0.979, "step": 3603 }, { "epoch": 5.0194986072423395, "grad_norm": 3.3550565242767334, "learning_rate": 3.557703240477544e-05, "loss": 1.8087, "step": 3604 }, { "epoch": 5.020891364902507, "grad_norm": 3.4039390087127686, "learning_rate": 3.556708357021035e-05, "loss": 1.4341, "step": 3605 }, { "epoch": 5.022284122562674, "grad_norm": 4.22948694229126, "learning_rate": 3.555713473564525e-05, "loss": 1.3164, "step": 3606 }, { "epoch": 5.023676880222841, "grad_norm": 3.8464784622192383, "learning_rate": 3.554718590108015e-05, "loss": 1.23, "step": 3607 }, { "epoch": 5.025069637883008, "grad_norm": 3.5483877658843994, "learning_rate": 3.553723706651506e-05, "loss": 1.2695, "step": 3608 }, { "epoch": 5.0264623955431755, "grad_norm": 3.2288761138916016, "learning_rate": 3.552728823194997e-05, "loss": 1.2609, "step": 3609 }, { "epoch": 5.027855153203342, "grad_norm": 3.2547757625579834, "learning_rate": 3.551733939738488e-05, "loss": 1.5669, "step": 3610 }, { "epoch": 5.02924791086351, "grad_norm": 2.923926591873169, "learning_rate": 3.550739056281978e-05, "loss": 1.4724, "step": 3611 }, { "epoch": 5.030640668523677, "grad_norm": 3.3608334064483643, "learning_rate": 3.5497441728254685e-05, "loss": 1.5402, "step": 3612 }, { "epoch": 5.032033426183844, "grad_norm": 3.311286449432373, "learning_rate": 3.5487492893689596e-05, "loss": 1.3233, "step": 3613 }, { "epoch": 5.0334261838440115, "grad_norm": 3.551703691482544, "learning_rate": 3.54775440591245e-05, "loss": 1.7847, "step": 3614 }, { "epoch": 5.034818941504178, "grad_norm": 3.485243797302246, "learning_rate": 3.546759522455941e-05, "loss": 1.5752, "step": 3615 }, { "epoch": 5.036211699164346, "grad_norm": 3.2045066356658936, "learning_rate": 3.545764638999431e-05, "loss": 1.2544, "step": 3616 }, { "epoch": 5.037604456824512, "grad_norm": 3.0100302696228027, "learning_rate": 3.544769755542922e-05, "loss": 1.2859, "step": 3617 }, { "epoch": 5.03899721448468, "grad_norm": 3.7358620166778564, "learning_rate": 3.543774872086412e-05, "loss": 1.171, "step": 3618 }, { "epoch": 5.040389972144847, "grad_norm": 3.3091506958007812, "learning_rate": 3.5427799886299034e-05, "loss": 1.4943, "step": 3619 }, { "epoch": 5.041782729805014, "grad_norm": 3.4011595249176025, "learning_rate": 3.541785105173394e-05, "loss": 1.6702, "step": 3620 }, { "epoch": 5.043175487465181, "grad_norm": 3.6097514629364014, "learning_rate": 3.540790221716884e-05, "loss": 1.8783, "step": 3621 }, { "epoch": 5.044568245125348, "grad_norm": 3.3654654026031494, "learning_rate": 3.5397953382603746e-05, "loss": 1.6149, "step": 3622 }, { "epoch": 5.045961002785515, "grad_norm": 3.2377398014068604, "learning_rate": 3.538800454803866e-05, "loss": 1.5624, "step": 3623 }, { "epoch": 5.047353760445683, "grad_norm": 3.1147098541259766, "learning_rate": 3.537805571347356e-05, "loss": 1.4554, "step": 3624 }, { "epoch": 5.048746518105849, "grad_norm": 3.2079262733459473, "learning_rate": 3.536810687890847e-05, "loss": 1.7468, "step": 3625 }, { "epoch": 5.050139275766017, "grad_norm": 2.7711851596832275, "learning_rate": 3.535815804434337e-05, "loss": 1.2825, "step": 3626 }, { "epoch": 5.0515320334261835, "grad_norm": 4.073577880859375, "learning_rate": 3.534820920977828e-05, "loss": 1.2833, "step": 3627 }, { "epoch": 5.052924791086351, "grad_norm": 3.3940837383270264, "learning_rate": 3.5338260375213184e-05, "loss": 1.4993, "step": 3628 }, { "epoch": 5.054317548746518, "grad_norm": 3.7020013332366943, "learning_rate": 3.5328311540648095e-05, "loss": 1.8527, "step": 3629 }, { "epoch": 5.055710306406685, "grad_norm": 3.2459218502044678, "learning_rate": 3.5318362706083e-05, "loss": 1.5839, "step": 3630 }, { "epoch": 5.057103064066852, "grad_norm": 2.827864646911621, "learning_rate": 3.53084138715179e-05, "loss": 1.3861, "step": 3631 }, { "epoch": 5.0584958217270195, "grad_norm": 3.8657968044281006, "learning_rate": 3.529846503695281e-05, "loss": 1.8368, "step": 3632 }, { "epoch": 5.059888579387187, "grad_norm": 3.375610113143921, "learning_rate": 3.528851620238772e-05, "loss": 1.713, "step": 3633 }, { "epoch": 5.061281337047354, "grad_norm": 3.10119366645813, "learning_rate": 3.527856736782263e-05, "loss": 1.48, "step": 3634 }, { "epoch": 5.062674094707521, "grad_norm": 3.3581926822662354, "learning_rate": 3.526861853325753e-05, "loss": 1.4784, "step": 3635 }, { "epoch": 5.064066852367688, "grad_norm": 3.330660581588745, "learning_rate": 3.525866969869243e-05, "loss": 1.5715, "step": 3636 }, { "epoch": 5.0654596100278555, "grad_norm": 3.0002896785736084, "learning_rate": 3.524872086412734e-05, "loss": 1.3374, "step": 3637 }, { "epoch": 5.066852367688022, "grad_norm": 3.3638551235198975, "learning_rate": 3.523877202956225e-05, "loss": 1.4083, "step": 3638 }, { "epoch": 5.06824512534819, "grad_norm": 3.5368261337280273, "learning_rate": 3.5228823194997156e-05, "loss": 1.3304, "step": 3639 }, { "epoch": 5.069637883008356, "grad_norm": 3.7917847633361816, "learning_rate": 3.521887436043207e-05, "loss": 1.6021, "step": 3640 }, { "epoch": 5.071030640668524, "grad_norm": 3.3980519771575928, "learning_rate": 3.5208925525866964e-05, "loss": 1.772, "step": 3641 }, { "epoch": 5.072423398328691, "grad_norm": 3.2356252670288086, "learning_rate": 3.5198976691301875e-05, "loss": 1.5132, "step": 3642 }, { "epoch": 5.073816155988858, "grad_norm": 3.7738420963287354, "learning_rate": 3.518902785673678e-05, "loss": 1.4201, "step": 3643 }, { "epoch": 5.075208913649025, "grad_norm": 2.7929110527038574, "learning_rate": 3.517907902217169e-05, "loss": 1.1405, "step": 3644 }, { "epoch": 5.076601671309192, "grad_norm": 3.162795066833496, "learning_rate": 3.516913018760659e-05, "loss": 1.7356, "step": 3645 }, { "epoch": 5.077994428969359, "grad_norm": 3.6293771266937256, "learning_rate": 3.51591813530415e-05, "loss": 1.6064, "step": 3646 }, { "epoch": 5.079387186629527, "grad_norm": 3.622387647628784, "learning_rate": 3.51492325184764e-05, "loss": 1.6365, "step": 3647 }, { "epoch": 5.080779944289693, "grad_norm": 3.730894088745117, "learning_rate": 3.513928368391131e-05, "loss": 1.6652, "step": 3648 }, { "epoch": 5.082172701949861, "grad_norm": 2.8982667922973633, "learning_rate": 3.512933484934622e-05, "loss": 1.1618, "step": 3649 }, { "epoch": 5.0835654596100275, "grad_norm": 3.325928211212158, "learning_rate": 3.511938601478112e-05, "loss": 1.5396, "step": 3650 }, { "epoch": 5.084958217270195, "grad_norm": 2.7456600666046143, "learning_rate": 3.5109437180216025e-05, "loss": 1.1465, "step": 3651 }, { "epoch": 5.086350974930362, "grad_norm": 3.5660476684570312, "learning_rate": 3.5099488345650936e-05, "loss": 1.5811, "step": 3652 }, { "epoch": 5.087743732590529, "grad_norm": 3.5288870334625244, "learning_rate": 3.508953951108584e-05, "loss": 1.787, "step": 3653 }, { "epoch": 5.089136490250697, "grad_norm": 3.0032618045806885, "learning_rate": 3.507959067652075e-05, "loss": 1.3188, "step": 3654 }, { "epoch": 5.0905292479108635, "grad_norm": 3.1997663974761963, "learning_rate": 3.506964184195565e-05, "loss": 1.4057, "step": 3655 }, { "epoch": 5.091922005571031, "grad_norm": 3.2036070823669434, "learning_rate": 3.505969300739056e-05, "loss": 1.425, "step": 3656 }, { "epoch": 5.093314763231198, "grad_norm": 3.3824801445007324, "learning_rate": 3.504974417282546e-05, "loss": 1.6274, "step": 3657 }, { "epoch": 5.094707520891365, "grad_norm": 3.0331170558929443, "learning_rate": 3.5039795338260374e-05, "loss": 1.2897, "step": 3658 }, { "epoch": 5.096100278551532, "grad_norm": 3.129055976867676, "learning_rate": 3.5029846503695285e-05, "loss": 1.5442, "step": 3659 }, { "epoch": 5.0974930362116995, "grad_norm": 3.255399227142334, "learning_rate": 3.501989766913018e-05, "loss": 1.3974, "step": 3660 }, { "epoch": 5.098885793871866, "grad_norm": 3.486367702484131, "learning_rate": 3.5009948834565086e-05, "loss": 1.4974, "step": 3661 }, { "epoch": 5.100278551532034, "grad_norm": 3.3133184909820557, "learning_rate": 3.5e-05, "loss": 1.5463, "step": 3662 }, { "epoch": 5.1016713091922, "grad_norm": 3.101465940475464, "learning_rate": 3.499005116543491e-05, "loss": 1.2295, "step": 3663 }, { "epoch": 5.103064066852368, "grad_norm": 2.898435115814209, "learning_rate": 3.498010233086981e-05, "loss": 1.3083, "step": 3664 }, { "epoch": 5.104456824512535, "grad_norm": 7.913095474243164, "learning_rate": 3.4970153496304716e-05, "loss": 1.5649, "step": 3665 }, { "epoch": 5.105849582172702, "grad_norm": 2.8633811473846436, "learning_rate": 3.496020466173962e-05, "loss": 1.3258, "step": 3666 }, { "epoch": 5.107242339832869, "grad_norm": 3.5359859466552734, "learning_rate": 3.495025582717453e-05, "loss": 1.7369, "step": 3667 }, { "epoch": 5.108635097493036, "grad_norm": 3.1591122150421143, "learning_rate": 3.4940306992609435e-05, "loss": 1.3495, "step": 3668 }, { "epoch": 5.110027855153203, "grad_norm": 3.358830690383911, "learning_rate": 3.493035815804434e-05, "loss": 1.599, "step": 3669 }, { "epoch": 5.111420612813371, "grad_norm": 2.988240957260132, "learning_rate": 3.492040932347925e-05, "loss": 1.4411, "step": 3670 }, { "epoch": 5.112813370473537, "grad_norm": 3.552689790725708, "learning_rate": 3.4910460488914154e-05, "loss": 1.5288, "step": 3671 }, { "epoch": 5.114206128133705, "grad_norm": 3.24813175201416, "learning_rate": 3.490051165434906e-05, "loss": 1.6736, "step": 3672 }, { "epoch": 5.1155988857938715, "grad_norm": 3.0939056873321533, "learning_rate": 3.489056281978396e-05, "loss": 1.2816, "step": 3673 }, { "epoch": 5.116991643454039, "grad_norm": 3.16573166847229, "learning_rate": 3.488061398521887e-05, "loss": 1.435, "step": 3674 }, { "epoch": 5.118384401114207, "grad_norm": 3.6670165061950684, "learning_rate": 3.487066515065378e-05, "loss": 1.4094, "step": 3675 }, { "epoch": 5.119777158774373, "grad_norm": 3.0940935611724854, "learning_rate": 3.486071631608868e-05, "loss": 1.5027, "step": 3676 }, { "epoch": 5.121169916434541, "grad_norm": 3.1497867107391357, "learning_rate": 3.485076748152359e-05, "loss": 1.2258, "step": 3677 }, { "epoch": 5.1225626740947074, "grad_norm": 2.927849054336548, "learning_rate": 3.4840818646958496e-05, "loss": 1.3154, "step": 3678 }, { "epoch": 5.123955431754875, "grad_norm": 3.4527838230133057, "learning_rate": 3.48308698123934e-05, "loss": 1.4148, "step": 3679 }, { "epoch": 5.125348189415042, "grad_norm": 6.127671718597412, "learning_rate": 3.482092097782831e-05, "loss": 1.4014, "step": 3680 }, { "epoch": 5.126740947075209, "grad_norm": 3.6558492183685303, "learning_rate": 3.4810972143263215e-05, "loss": 1.4898, "step": 3681 }, { "epoch": 5.128133704735376, "grad_norm": 4.456975936889648, "learning_rate": 3.480102330869812e-05, "loss": 1.6801, "step": 3682 }, { "epoch": 5.129526462395543, "grad_norm": 3.446354866027832, "learning_rate": 3.479107447413302e-05, "loss": 1.6189, "step": 3683 }, { "epoch": 5.13091922005571, "grad_norm": 3.003618001937866, "learning_rate": 3.4781125639567934e-05, "loss": 1.5376, "step": 3684 }, { "epoch": 5.132311977715878, "grad_norm": 3.30557918548584, "learning_rate": 3.477117680500284e-05, "loss": 1.4387, "step": 3685 }, { "epoch": 5.133704735376044, "grad_norm": 3.599032163619995, "learning_rate": 3.476122797043774e-05, "loss": 1.7406, "step": 3686 }, { "epoch": 5.135097493036212, "grad_norm": 3.6804332733154297, "learning_rate": 3.475127913587265e-05, "loss": 1.8243, "step": 3687 }, { "epoch": 5.1364902506963785, "grad_norm": 2.995417356491089, "learning_rate": 3.474133030130756e-05, "loss": 1.4121, "step": 3688 }, { "epoch": 5.137883008356546, "grad_norm": 2.8882551193237305, "learning_rate": 3.473138146674246e-05, "loss": 1.2631, "step": 3689 }, { "epoch": 5.139275766016713, "grad_norm": 3.0136148929595947, "learning_rate": 3.472143263217737e-05, "loss": 1.5149, "step": 3690 }, { "epoch": 5.14066852367688, "grad_norm": 3.514186382293701, "learning_rate": 3.4711483797612276e-05, "loss": 1.6868, "step": 3691 }, { "epoch": 5.142061281337047, "grad_norm": 3.5419411659240723, "learning_rate": 3.470153496304719e-05, "loss": 1.7281, "step": 3692 }, { "epoch": 5.1434540389972145, "grad_norm": 3.458965301513672, "learning_rate": 3.469158612848209e-05, "loss": 1.5011, "step": 3693 }, { "epoch": 5.144846796657381, "grad_norm": 3.9583911895751953, "learning_rate": 3.4681637293916995e-05, "loss": 1.5089, "step": 3694 }, { "epoch": 5.146239554317549, "grad_norm": 3.239198684692383, "learning_rate": 3.46716884593519e-05, "loss": 1.5014, "step": 3695 }, { "epoch": 5.147632311977716, "grad_norm": 3.087223768234253, "learning_rate": 3.466173962478681e-05, "loss": 1.5708, "step": 3696 }, { "epoch": 5.149025069637883, "grad_norm": 3.3014509677886963, "learning_rate": 3.4651790790221714e-05, "loss": 1.5735, "step": 3697 }, { "epoch": 5.1504178272980505, "grad_norm": 3.4644100666046143, "learning_rate": 3.464184195565662e-05, "loss": 1.5335, "step": 3698 }, { "epoch": 5.151810584958217, "grad_norm": 3.331061601638794, "learning_rate": 3.463189312109153e-05, "loss": 1.4699, "step": 3699 }, { "epoch": 5.153203342618385, "grad_norm": 3.240514039993286, "learning_rate": 3.462194428652643e-05, "loss": 1.2046, "step": 3700 }, { "epoch": 5.154596100278551, "grad_norm": 3.537564277648926, "learning_rate": 3.461199545196134e-05, "loss": 1.4906, "step": 3701 }, { "epoch": 5.155988857938719, "grad_norm": 4.372491836547852, "learning_rate": 3.460204661739625e-05, "loss": 1.5823, "step": 3702 }, { "epoch": 5.157381615598886, "grad_norm": 3.1347551345825195, "learning_rate": 3.459209778283115e-05, "loss": 1.3192, "step": 3703 }, { "epoch": 5.158774373259053, "grad_norm": 3.348072052001953, "learning_rate": 3.4582148948266056e-05, "loss": 1.478, "step": 3704 }, { "epoch": 5.16016713091922, "grad_norm": 3.973855972290039, "learning_rate": 3.457220011370096e-05, "loss": 1.7168, "step": 3705 }, { "epoch": 5.161559888579387, "grad_norm": 3.481712579727173, "learning_rate": 3.456225127913587e-05, "loss": 1.5055, "step": 3706 }, { "epoch": 5.162952646239554, "grad_norm": 3.2055375576019287, "learning_rate": 3.4552302444570775e-05, "loss": 1.3556, "step": 3707 }, { "epoch": 5.164345403899722, "grad_norm": 2.8089704513549805, "learning_rate": 3.454235361000568e-05, "loss": 1.3841, "step": 3708 }, { "epoch": 5.165738161559888, "grad_norm": 2.9740664958953857, "learning_rate": 3.453240477544059e-05, "loss": 1.5267, "step": 3709 }, { "epoch": 5.167130919220056, "grad_norm": 3.2898945808410645, "learning_rate": 3.4522455940875494e-05, "loss": 1.2577, "step": 3710 }, { "epoch": 5.1685236768802225, "grad_norm": 3.3988595008850098, "learning_rate": 3.45125071063104e-05, "loss": 1.3032, "step": 3711 }, { "epoch": 5.16991643454039, "grad_norm": 3.444390058517456, "learning_rate": 3.450255827174531e-05, "loss": 1.4137, "step": 3712 }, { "epoch": 5.171309192200557, "grad_norm": 2.9359185695648193, "learning_rate": 3.449260943718021e-05, "loss": 1.3204, "step": 3713 }, { "epoch": 5.172701949860724, "grad_norm": 3.4491429328918457, "learning_rate": 3.448266060261512e-05, "loss": 1.3318, "step": 3714 }, { "epoch": 5.174094707520891, "grad_norm": 3.563459873199463, "learning_rate": 3.447271176805002e-05, "loss": 1.3859, "step": 3715 }, { "epoch": 5.1754874651810585, "grad_norm": 4.120718479156494, "learning_rate": 3.446276293348493e-05, "loss": 1.708, "step": 3716 }, { "epoch": 5.176880222841225, "grad_norm": 3.0903916358947754, "learning_rate": 3.4452814098919836e-05, "loss": 1.303, "step": 3717 }, { "epoch": 5.178272980501393, "grad_norm": 3.113990068435669, "learning_rate": 3.444286526435474e-05, "loss": 1.3673, "step": 3718 }, { "epoch": 5.17966573816156, "grad_norm": 3.122307538986206, "learning_rate": 3.443291642978965e-05, "loss": 1.3382, "step": 3719 }, { "epoch": 5.181058495821727, "grad_norm": 3.141376495361328, "learning_rate": 3.4422967595224555e-05, "loss": 1.4865, "step": 3720 }, { "epoch": 5.1824512534818945, "grad_norm": 2.90157151222229, "learning_rate": 3.4413018760659466e-05, "loss": 1.4591, "step": 3721 }, { "epoch": 5.183844011142061, "grad_norm": 3.2187490463256836, "learning_rate": 3.440306992609437e-05, "loss": 1.4385, "step": 3722 }, { "epoch": 5.185236768802229, "grad_norm": 3.39043927192688, "learning_rate": 3.4393121091529274e-05, "loss": 1.7379, "step": 3723 }, { "epoch": 5.186629526462395, "grad_norm": 3.1450371742248535, "learning_rate": 3.4383172256964185e-05, "loss": 1.5507, "step": 3724 }, { "epoch": 5.188022284122563, "grad_norm": 3.4779839515686035, "learning_rate": 3.437322342239909e-05, "loss": 1.3501, "step": 3725 }, { "epoch": 5.18941504178273, "grad_norm": 3.0941572189331055, "learning_rate": 3.436327458783399e-05, "loss": 1.2161, "step": 3726 }, { "epoch": 5.190807799442897, "grad_norm": 3.643517017364502, "learning_rate": 3.43533257532689e-05, "loss": 1.3892, "step": 3727 }, { "epoch": 5.192200557103064, "grad_norm": 3.092085123062134, "learning_rate": 3.434337691870381e-05, "loss": 1.4778, "step": 3728 }, { "epoch": 5.193593314763231, "grad_norm": 3.0536091327667236, "learning_rate": 3.433342808413871e-05, "loss": 1.3603, "step": 3729 }, { "epoch": 5.194986072423398, "grad_norm": 3.3252575397491455, "learning_rate": 3.4323479249573616e-05, "loss": 1.593, "step": 3730 }, { "epoch": 5.196378830083566, "grad_norm": 3.5904736518859863, "learning_rate": 3.431353041500853e-05, "loss": 1.6221, "step": 3731 }, { "epoch": 5.197771587743732, "grad_norm": 3.3816256523132324, "learning_rate": 3.430358158044343e-05, "loss": 1.4817, "step": 3732 }, { "epoch": 5.1991643454039, "grad_norm": 3.674961566925049, "learning_rate": 3.4293632745878335e-05, "loss": 1.7566, "step": 3733 }, { "epoch": 5.2005571030640665, "grad_norm": 3.5699350833892822, "learning_rate": 3.4283683911313246e-05, "loss": 1.6321, "step": 3734 }, { "epoch": 5.201949860724234, "grad_norm": 3.3462531566619873, "learning_rate": 3.427373507674815e-05, "loss": 1.432, "step": 3735 }, { "epoch": 5.203342618384401, "grad_norm": 3.26177978515625, "learning_rate": 3.4263786242183054e-05, "loss": 1.3974, "step": 3736 }, { "epoch": 5.204735376044568, "grad_norm": 3.213848829269409, "learning_rate": 3.425383740761796e-05, "loss": 1.2837, "step": 3737 }, { "epoch": 5.206128133704736, "grad_norm": 3.2751214504241943, "learning_rate": 3.424388857305287e-05, "loss": 1.3041, "step": 3738 }, { "epoch": 5.2075208913649025, "grad_norm": 3.244657516479492, "learning_rate": 3.423393973848777e-05, "loss": 1.5692, "step": 3739 }, { "epoch": 5.20891364902507, "grad_norm": 3.5669984817504883, "learning_rate": 3.422399090392268e-05, "loss": 1.4378, "step": 3740 }, { "epoch": 5.210306406685237, "grad_norm": 2.9351229667663574, "learning_rate": 3.421404206935759e-05, "loss": 1.198, "step": 3741 }, { "epoch": 5.211699164345404, "grad_norm": 3.4855310916900635, "learning_rate": 3.420409323479249e-05, "loss": 1.5302, "step": 3742 }, { "epoch": 5.213091922005571, "grad_norm": 2.974017381668091, "learning_rate": 3.4194144400227396e-05, "loss": 1.2527, "step": 3743 }, { "epoch": 5.2144846796657385, "grad_norm": 3.190744400024414, "learning_rate": 3.41841955656623e-05, "loss": 1.4676, "step": 3744 }, { "epoch": 5.215877437325905, "grad_norm": 3.7585272789001465, "learning_rate": 3.417424673109721e-05, "loss": 1.9378, "step": 3745 }, { "epoch": 5.217270194986073, "grad_norm": 3.2659990787506104, "learning_rate": 3.416429789653212e-05, "loss": 1.4307, "step": 3746 }, { "epoch": 5.218662952646239, "grad_norm": 3.387321949005127, "learning_rate": 3.4154349061967026e-05, "loss": 1.4444, "step": 3747 }, { "epoch": 5.220055710306407, "grad_norm": 3.2577359676361084, "learning_rate": 3.414440022740193e-05, "loss": 1.5049, "step": 3748 }, { "epoch": 5.221448467966574, "grad_norm": 3.1732935905456543, "learning_rate": 3.4134451392836834e-05, "loss": 1.5884, "step": 3749 }, { "epoch": 5.222841225626741, "grad_norm": 3.5600271224975586, "learning_rate": 3.4124502558271745e-05, "loss": 1.8239, "step": 3750 }, { "epoch": 5.224233983286908, "grad_norm": 3.043459415435791, "learning_rate": 3.411455372370665e-05, "loss": 1.3098, "step": 3751 }, { "epoch": 5.225626740947075, "grad_norm": 3.24894380569458, "learning_rate": 3.4104604889141553e-05, "loss": 1.1873, "step": 3752 }, { "epoch": 5.227019498607242, "grad_norm": 3.9069149494171143, "learning_rate": 3.4094656054576464e-05, "loss": 1.3383, "step": 3753 }, { "epoch": 5.22841225626741, "grad_norm": 2.6927483081817627, "learning_rate": 3.408470722001137e-05, "loss": 1.0927, "step": 3754 }, { "epoch": 5.229805013927576, "grad_norm": 3.414612293243408, "learning_rate": 3.407475838544627e-05, "loss": 1.3463, "step": 3755 }, { "epoch": 5.231197771587744, "grad_norm": 3.3081040382385254, "learning_rate": 3.406480955088118e-05, "loss": 1.4498, "step": 3756 }, { "epoch": 5.2325905292479105, "grad_norm": 4.235991954803467, "learning_rate": 3.405486071631609e-05, "loss": 1.7435, "step": 3757 }, { "epoch": 5.233983286908078, "grad_norm": 3.453193426132202, "learning_rate": 3.404491188175099e-05, "loss": 1.3693, "step": 3758 }, { "epoch": 5.235376044568245, "grad_norm": 3.6752874851226807, "learning_rate": 3.4034963047185895e-05, "loss": 1.448, "step": 3759 }, { "epoch": 5.236768802228412, "grad_norm": 3.6105453968048096, "learning_rate": 3.4025014212620806e-05, "loss": 1.6106, "step": 3760 }, { "epoch": 5.23816155988858, "grad_norm": 2.91959547996521, "learning_rate": 3.401506537805571e-05, "loss": 1.3171, "step": 3761 }, { "epoch": 5.2395543175487465, "grad_norm": 3.491471290588379, "learning_rate": 3.4005116543490614e-05, "loss": 1.4736, "step": 3762 }, { "epoch": 5.240947075208914, "grad_norm": 3.8632736206054688, "learning_rate": 3.3995167708925525e-05, "loss": 1.4344, "step": 3763 }, { "epoch": 5.242339832869081, "grad_norm": 3.767866849899292, "learning_rate": 3.398521887436043e-05, "loss": 1.2747, "step": 3764 }, { "epoch": 5.243732590529248, "grad_norm": 3.0411815643310547, "learning_rate": 3.3975270039795333e-05, "loss": 1.3078, "step": 3765 }, { "epoch": 5.245125348189415, "grad_norm": 3.113668918609619, "learning_rate": 3.396532120523024e-05, "loss": 1.4455, "step": 3766 }, { "epoch": 5.2465181058495824, "grad_norm": 3.476451873779297, "learning_rate": 3.395537237066515e-05, "loss": 1.712, "step": 3767 }, { "epoch": 5.247910863509749, "grad_norm": 3.588526964187622, "learning_rate": 3.394542353610005e-05, "loss": 1.5368, "step": 3768 }, { "epoch": 5.249303621169917, "grad_norm": 2.9576282501220703, "learning_rate": 3.3935474701534957e-05, "loss": 1.2277, "step": 3769 }, { "epoch": 5.250696378830083, "grad_norm": 3.746317148208618, "learning_rate": 3.392552586696987e-05, "loss": 1.6772, "step": 3770 }, { "epoch": 5.252089136490251, "grad_norm": 3.6188101768493652, "learning_rate": 3.391557703240477e-05, "loss": 1.8135, "step": 3771 }, { "epoch": 5.2534818941504176, "grad_norm": 2.8830478191375732, "learning_rate": 3.3905628197839676e-05, "loss": 1.3099, "step": 3772 }, { "epoch": 5.254874651810585, "grad_norm": 3.50219988822937, "learning_rate": 3.3895679363274586e-05, "loss": 1.4418, "step": 3773 }, { "epoch": 5.256267409470752, "grad_norm": 3.100625991821289, "learning_rate": 3.388573052870949e-05, "loss": 1.0908, "step": 3774 }, { "epoch": 5.257660167130919, "grad_norm": 2.9331486225128174, "learning_rate": 3.38757816941444e-05, "loss": 1.0758, "step": 3775 }, { "epoch": 5.259052924791086, "grad_norm": 3.0599770545959473, "learning_rate": 3.3865832859579305e-05, "loss": 1.3869, "step": 3776 }, { "epoch": 5.2604456824512535, "grad_norm": 2.9075305461883545, "learning_rate": 3.385588402501421e-05, "loss": 1.12, "step": 3777 }, { "epoch": 5.26183844011142, "grad_norm": 3.2149698734283447, "learning_rate": 3.384593519044912e-05, "loss": 1.352, "step": 3778 }, { "epoch": 5.263231197771588, "grad_norm": 3.346226453781128, "learning_rate": 3.3835986355884024e-05, "loss": 1.1678, "step": 3779 }, { "epoch": 5.264623955431755, "grad_norm": 3.9807395935058594, "learning_rate": 3.382603752131893e-05, "loss": 1.9116, "step": 3780 }, { "epoch": 5.266016713091922, "grad_norm": 3.3949508666992188, "learning_rate": 3.381608868675383e-05, "loss": 1.587, "step": 3781 }, { "epoch": 5.2674094707520895, "grad_norm": 3.343817710876465, "learning_rate": 3.3806139852188743e-05, "loss": 1.3577, "step": 3782 }, { "epoch": 5.268802228412256, "grad_norm": 3.4596922397613525, "learning_rate": 3.379619101762365e-05, "loss": 1.2473, "step": 3783 }, { "epoch": 5.270194986072424, "grad_norm": 2.865809679031372, "learning_rate": 3.378624218305855e-05, "loss": 1.4484, "step": 3784 }, { "epoch": 5.27158774373259, "grad_norm": 3.792099952697754, "learning_rate": 3.377629334849346e-05, "loss": 1.6435, "step": 3785 }, { "epoch": 5.272980501392758, "grad_norm": 4.077797889709473, "learning_rate": 3.3766344513928367e-05, "loss": 1.4314, "step": 3786 }, { "epoch": 5.274373259052925, "grad_norm": 3.7077529430389404, "learning_rate": 3.375639567936327e-05, "loss": 1.4233, "step": 3787 }, { "epoch": 5.275766016713092, "grad_norm": 3.0881690979003906, "learning_rate": 3.3746446844798175e-05, "loss": 1.2292, "step": 3788 }, { "epoch": 5.277158774373259, "grad_norm": 4.164643287658691, "learning_rate": 3.3736498010233086e-05, "loss": 2.0144, "step": 3789 }, { "epoch": 5.278551532033426, "grad_norm": 3.2468597888946533, "learning_rate": 3.372654917566799e-05, "loss": 1.4749, "step": 3790 }, { "epoch": 5.279944289693593, "grad_norm": 3.39668607711792, "learning_rate": 3.3716600341102894e-05, "loss": 1.3774, "step": 3791 }, { "epoch": 5.281337047353761, "grad_norm": 3.3434839248657227, "learning_rate": 3.3706651506537805e-05, "loss": 1.5875, "step": 3792 }, { "epoch": 5.282729805013927, "grad_norm": 3.2673895359039307, "learning_rate": 3.369670267197271e-05, "loss": 1.5603, "step": 3793 }, { "epoch": 5.284122562674095, "grad_norm": 3.286590099334717, "learning_rate": 3.368675383740761e-05, "loss": 1.3379, "step": 3794 }, { "epoch": 5.2855153203342615, "grad_norm": 3.299992084503174, "learning_rate": 3.3676805002842524e-05, "loss": 1.5589, "step": 3795 }, { "epoch": 5.286908077994429, "grad_norm": 3.6170554161071777, "learning_rate": 3.366685616827743e-05, "loss": 1.6821, "step": 3796 }, { "epoch": 5.288300835654596, "grad_norm": 3.3409528732299805, "learning_rate": 3.365690733371233e-05, "loss": 1.5481, "step": 3797 }, { "epoch": 5.289693593314763, "grad_norm": 3.209533929824829, "learning_rate": 3.3646958499147236e-05, "loss": 1.3921, "step": 3798 }, { "epoch": 5.29108635097493, "grad_norm": 3.2695765495300293, "learning_rate": 3.3637009664582147e-05, "loss": 1.2969, "step": 3799 }, { "epoch": 5.2924791086350975, "grad_norm": 3.567899703979492, "learning_rate": 3.362706083001706e-05, "loss": 1.5348, "step": 3800 }, { "epoch": 5.293871866295264, "grad_norm": 3.4814798831939697, "learning_rate": 3.3617111995451955e-05, "loss": 1.228, "step": 3801 }, { "epoch": 5.295264623955432, "grad_norm": 3.332674503326416, "learning_rate": 3.3607163160886866e-05, "loss": 1.2396, "step": 3802 }, { "epoch": 5.296657381615599, "grad_norm": 4.195977687835693, "learning_rate": 3.359721432632177e-05, "loss": 1.5116, "step": 3803 }, { "epoch": 5.298050139275766, "grad_norm": 3.3662610054016113, "learning_rate": 3.358726549175668e-05, "loss": 1.5008, "step": 3804 }, { "epoch": 5.2994428969359335, "grad_norm": 3.4025464057922363, "learning_rate": 3.3577316657191585e-05, "loss": 1.6831, "step": 3805 }, { "epoch": 5.3008356545961, "grad_norm": 3.5710318088531494, "learning_rate": 3.356736782262649e-05, "loss": 1.577, "step": 3806 }, { "epoch": 5.302228412256268, "grad_norm": 3.1233208179473877, "learning_rate": 3.35574189880614e-05, "loss": 1.4833, "step": 3807 }, { "epoch": 5.303621169916434, "grad_norm": 3.377131462097168, "learning_rate": 3.3547470153496304e-05, "loss": 1.6813, "step": 3808 }, { "epoch": 5.305013927576602, "grad_norm": 3.482731342315674, "learning_rate": 3.353752131893121e-05, "loss": 1.5225, "step": 3809 }, { "epoch": 5.306406685236769, "grad_norm": 3.1043663024902344, "learning_rate": 3.352757248436611e-05, "loss": 1.2276, "step": 3810 }, { "epoch": 5.307799442896936, "grad_norm": 3.382906913757324, "learning_rate": 3.351762364980102e-05, "loss": 1.3289, "step": 3811 }, { "epoch": 5.309192200557103, "grad_norm": 3.4567394256591797, "learning_rate": 3.350767481523593e-05, "loss": 1.4484, "step": 3812 }, { "epoch": 5.31058495821727, "grad_norm": 3.7682783603668213, "learning_rate": 3.349772598067083e-05, "loss": 1.9119, "step": 3813 }, { "epoch": 5.311977715877437, "grad_norm": 3.658247470855713, "learning_rate": 3.348777714610574e-05, "loss": 1.6523, "step": 3814 }, { "epoch": 5.313370473537605, "grad_norm": 2.793503522872925, "learning_rate": 3.3477828311540646e-05, "loss": 1.2379, "step": 3815 }, { "epoch": 5.314763231197771, "grad_norm": 3.6581826210021973, "learning_rate": 3.346787947697555e-05, "loss": 1.6143, "step": 3816 }, { "epoch": 5.316155988857939, "grad_norm": 3.794476270675659, "learning_rate": 3.345793064241046e-05, "loss": 1.5009, "step": 3817 }, { "epoch": 5.3175487465181055, "grad_norm": 3.8433616161346436, "learning_rate": 3.3447981807845365e-05, "loss": 1.4117, "step": 3818 }, { "epoch": 5.318941504178273, "grad_norm": 2.8389317989349365, "learning_rate": 3.343803297328027e-05, "loss": 1.2281, "step": 3819 }, { "epoch": 5.32033426183844, "grad_norm": 3.3493552207946777, "learning_rate": 3.342808413871517e-05, "loss": 1.4341, "step": 3820 }, { "epoch": 5.321727019498607, "grad_norm": 4.652235507965088, "learning_rate": 3.3418135304150084e-05, "loss": 1.3654, "step": 3821 }, { "epoch": 5.323119777158775, "grad_norm": 3.134294271469116, "learning_rate": 3.340818646958499e-05, "loss": 1.5002, "step": 3822 }, { "epoch": 5.3245125348189415, "grad_norm": 3.2703158855438232, "learning_rate": 3.339823763501989e-05, "loss": 1.4319, "step": 3823 }, { "epoch": 5.325905292479109, "grad_norm": 3.316800117492676, "learning_rate": 3.33882888004548e-05, "loss": 1.4161, "step": 3824 }, { "epoch": 5.327298050139276, "grad_norm": 2.9278154373168945, "learning_rate": 3.337833996588971e-05, "loss": 1.2484, "step": 3825 }, { "epoch": 5.328690807799443, "grad_norm": 3.4309632778167725, "learning_rate": 3.336839113132461e-05, "loss": 1.4719, "step": 3826 }, { "epoch": 5.33008356545961, "grad_norm": 3.2639284133911133, "learning_rate": 3.3358442296759515e-05, "loss": 1.474, "step": 3827 }, { "epoch": 5.3314763231197775, "grad_norm": 3.263230323791504, "learning_rate": 3.3348493462194426e-05, "loss": 1.5239, "step": 3828 }, { "epoch": 5.332869080779944, "grad_norm": 3.6524100303649902, "learning_rate": 3.3338544627629337e-05, "loss": 1.4968, "step": 3829 }, { "epoch": 5.334261838440112, "grad_norm": 3.077557325363159, "learning_rate": 3.3328595793064234e-05, "loss": 1.0446, "step": 3830 }, { "epoch": 5.335654596100278, "grad_norm": 3.468564987182617, "learning_rate": 3.3318646958499145e-05, "loss": 1.5119, "step": 3831 }, { "epoch": 5.337047353760446, "grad_norm": 3.4484708309173584, "learning_rate": 3.330869812393405e-05, "loss": 1.5385, "step": 3832 }, { "epoch": 5.338440111420613, "grad_norm": 3.177473306655884, "learning_rate": 3.329874928936896e-05, "loss": 1.1245, "step": 3833 }, { "epoch": 5.33983286908078, "grad_norm": 3.467501163482666, "learning_rate": 3.3288800454803864e-05, "loss": 1.576, "step": 3834 }, { "epoch": 5.341225626740947, "grad_norm": 3.00364089012146, "learning_rate": 3.327885162023877e-05, "loss": 1.2415, "step": 3835 }, { "epoch": 5.342618384401114, "grad_norm": 4.731228351593018, "learning_rate": 3.326890278567368e-05, "loss": 1.3672, "step": 3836 }, { "epoch": 5.344011142061281, "grad_norm": 3.201103925704956, "learning_rate": 3.325895395110858e-05, "loss": 1.4716, "step": 3837 }, { "epoch": 5.345403899721449, "grad_norm": 3.115396022796631, "learning_rate": 3.324900511654349e-05, "loss": 1.2102, "step": 3838 }, { "epoch": 5.346796657381615, "grad_norm": 3.1591529846191406, "learning_rate": 3.32390562819784e-05, "loss": 1.2851, "step": 3839 }, { "epoch": 5.348189415041783, "grad_norm": 2.949540138244629, "learning_rate": 3.32291074474133e-05, "loss": 1.2902, "step": 3840 }, { "epoch": 5.3495821727019495, "grad_norm": 3.423433780670166, "learning_rate": 3.3219158612848206e-05, "loss": 1.4414, "step": 3841 }, { "epoch": 5.350974930362117, "grad_norm": 3.4719269275665283, "learning_rate": 3.320920977828311e-05, "loss": 1.5675, "step": 3842 }, { "epoch": 5.352367688022284, "grad_norm": 3.1271326541900635, "learning_rate": 3.319926094371802e-05, "loss": 1.1859, "step": 3843 }, { "epoch": 5.353760445682451, "grad_norm": 3.346649646759033, "learning_rate": 3.3189312109152925e-05, "loss": 1.2755, "step": 3844 }, { "epoch": 5.355153203342619, "grad_norm": 3.6410646438598633, "learning_rate": 3.317936327458783e-05, "loss": 1.6861, "step": 3845 }, { "epoch": 5.3565459610027855, "grad_norm": 3.6382384300231934, "learning_rate": 3.316941444002274e-05, "loss": 1.5115, "step": 3846 }, { "epoch": 5.357938718662953, "grad_norm": 3.3190386295318604, "learning_rate": 3.3159465605457644e-05, "loss": 1.5371, "step": 3847 }, { "epoch": 5.35933147632312, "grad_norm": 5.246037006378174, "learning_rate": 3.314951677089255e-05, "loss": 1.5903, "step": 3848 }, { "epoch": 5.360724233983287, "grad_norm": 5.078573226928711, "learning_rate": 3.313956793632745e-05, "loss": 1.3486, "step": 3849 }, { "epoch": 5.362116991643454, "grad_norm": 2.915461778640747, "learning_rate": 3.312961910176236e-05, "loss": 1.3303, "step": 3850 }, { "epoch": 5.3635097493036215, "grad_norm": 4.221773147583008, "learning_rate": 3.311967026719727e-05, "loss": 1.562, "step": 3851 }, { "epoch": 5.364902506963788, "grad_norm": 4.714282035827637, "learning_rate": 3.310972143263217e-05, "loss": 1.506, "step": 3852 }, { "epoch": 5.366295264623956, "grad_norm": 3.0521111488342285, "learning_rate": 3.309977259806708e-05, "loss": 1.1443, "step": 3853 }, { "epoch": 5.367688022284122, "grad_norm": 3.915872097015381, "learning_rate": 3.3089823763501986e-05, "loss": 1.6592, "step": 3854 }, { "epoch": 5.36908077994429, "grad_norm": 3.1428794860839844, "learning_rate": 3.307987492893689e-05, "loss": 1.3548, "step": 3855 }, { "epoch": 5.370473537604457, "grad_norm": 3.683709144592285, "learning_rate": 3.30699260943718e-05, "loss": 1.6792, "step": 3856 }, { "epoch": 5.371866295264624, "grad_norm": 3.304654359817505, "learning_rate": 3.3059977259806705e-05, "loss": 1.4264, "step": 3857 }, { "epoch": 5.373259052924791, "grad_norm": 3.9296340942382812, "learning_rate": 3.3050028425241616e-05, "loss": 1.7694, "step": 3858 }, { "epoch": 5.374651810584958, "grad_norm": 3.66880202293396, "learning_rate": 3.304007959067652e-05, "loss": 1.9336, "step": 3859 }, { "epoch": 5.376044568245125, "grad_norm": 3.4877336025238037, "learning_rate": 3.3030130756111424e-05, "loss": 1.248, "step": 3860 }, { "epoch": 5.3774373259052926, "grad_norm": 3.447391986846924, "learning_rate": 3.3020181921546335e-05, "loss": 1.5482, "step": 3861 }, { "epoch": 5.378830083565459, "grad_norm": 6.742697715759277, "learning_rate": 3.301023308698124e-05, "loss": 1.5847, "step": 3862 }, { "epoch": 5.380222841225627, "grad_norm": 3.357435464859009, "learning_rate": 3.300028425241614e-05, "loss": 1.3434, "step": 3863 }, { "epoch": 5.381615598885794, "grad_norm": 3.4757871627807617, "learning_rate": 3.299033541785105e-05, "loss": 1.4059, "step": 3864 }, { "epoch": 5.383008356545961, "grad_norm": 4.1926350593566895, "learning_rate": 3.298038658328596e-05, "loss": 1.5506, "step": 3865 }, { "epoch": 5.3844011142061285, "grad_norm": 3.4357733726501465, "learning_rate": 3.297043774872086e-05, "loss": 1.6557, "step": 3866 }, { "epoch": 5.385793871866295, "grad_norm": 3.7321584224700928, "learning_rate": 3.2960488914155766e-05, "loss": 1.612, "step": 3867 }, { "epoch": 5.387186629526463, "grad_norm": 3.0687263011932373, "learning_rate": 3.295054007959068e-05, "loss": 1.4702, "step": 3868 }, { "epoch": 5.388579387186629, "grad_norm": 3.526613473892212, "learning_rate": 3.294059124502558e-05, "loss": 1.6784, "step": 3869 }, { "epoch": 5.389972144846797, "grad_norm": 3.673717737197876, "learning_rate": 3.2930642410460485e-05, "loss": 1.3199, "step": 3870 }, { "epoch": 5.391364902506964, "grad_norm": 3.002877712249756, "learning_rate": 3.292069357589539e-05, "loss": 1.2214, "step": 3871 }, { "epoch": 5.392757660167131, "grad_norm": 3.2002980709075928, "learning_rate": 3.29107447413303e-05, "loss": 1.4956, "step": 3872 }, { "epoch": 5.394150417827298, "grad_norm": 2.985261917114258, "learning_rate": 3.2900795906765204e-05, "loss": 1.3173, "step": 3873 }, { "epoch": 5.395543175487465, "grad_norm": 3.5086417198181152, "learning_rate": 3.289084707220011e-05, "loss": 1.3256, "step": 3874 }, { "epoch": 5.396935933147632, "grad_norm": 2.9871435165405273, "learning_rate": 3.288089823763502e-05, "loss": 1.4463, "step": 3875 }, { "epoch": 5.3983286908078, "grad_norm": 3.4604439735412598, "learning_rate": 3.287094940306992e-05, "loss": 1.4626, "step": 3876 }, { "epoch": 5.399721448467966, "grad_norm": 3.3530375957489014, "learning_rate": 3.286100056850483e-05, "loss": 1.3948, "step": 3877 }, { "epoch": 5.401114206128134, "grad_norm": 4.094565391540527, "learning_rate": 3.285105173393974e-05, "loss": 1.4901, "step": 3878 }, { "epoch": 5.4025069637883005, "grad_norm": 3.299420118331909, "learning_rate": 3.284110289937464e-05, "loss": 1.4508, "step": 3879 }, { "epoch": 5.403899721448468, "grad_norm": 3.154717445373535, "learning_rate": 3.2831154064809546e-05, "loss": 1.1577, "step": 3880 }, { "epoch": 5.405292479108635, "grad_norm": 3.361546277999878, "learning_rate": 3.282120523024445e-05, "loss": 1.3684, "step": 3881 }, { "epoch": 5.406685236768802, "grad_norm": 4.2829909324646, "learning_rate": 3.281125639567936e-05, "loss": 1.5438, "step": 3882 }, { "epoch": 5.408077994428969, "grad_norm": 3.4301254749298096, "learning_rate": 3.2801307561114265e-05, "loss": 1.3747, "step": 3883 }, { "epoch": 5.4094707520891365, "grad_norm": 3.082698106765747, "learning_rate": 3.279135872654917e-05, "loss": 1.3142, "step": 3884 }, { "epoch": 5.410863509749303, "grad_norm": 3.568859815597534, "learning_rate": 3.278140989198408e-05, "loss": 1.6341, "step": 3885 }, { "epoch": 5.412256267409471, "grad_norm": 2.8952035903930664, "learning_rate": 3.2771461057418984e-05, "loss": 1.2079, "step": 3886 }, { "epoch": 5.413649025069638, "grad_norm": 3.228323221206665, "learning_rate": 3.2761512222853895e-05, "loss": 1.4066, "step": 3887 }, { "epoch": 5.415041782729805, "grad_norm": 3.5163016319274902, "learning_rate": 3.27515633882888e-05, "loss": 1.5318, "step": 3888 }, { "epoch": 5.4164345403899725, "grad_norm": 3.3017849922180176, "learning_rate": 3.27416145537237e-05, "loss": 1.2807, "step": 3889 }, { "epoch": 5.417827298050139, "grad_norm": 3.5008184909820557, "learning_rate": 3.2731665719158614e-05, "loss": 1.4937, "step": 3890 }, { "epoch": 5.419220055710307, "grad_norm": 3.5234081745147705, "learning_rate": 3.272171688459352e-05, "loss": 1.4323, "step": 3891 }, { "epoch": 5.420612813370473, "grad_norm": 3.7362451553344727, "learning_rate": 3.271176805002842e-05, "loss": 1.4941, "step": 3892 }, { "epoch": 5.422005571030641, "grad_norm": 2.8877837657928467, "learning_rate": 3.2701819215463326e-05, "loss": 1.3242, "step": 3893 }, { "epoch": 5.423398328690808, "grad_norm": 3.115877866744995, "learning_rate": 3.269187038089824e-05, "loss": 1.3975, "step": 3894 }, { "epoch": 5.424791086350975, "grad_norm": 3.7254276275634766, "learning_rate": 3.268192154633314e-05, "loss": 1.394, "step": 3895 }, { "epoch": 5.426183844011142, "grad_norm": 3.117959499359131, "learning_rate": 3.2671972711768045e-05, "loss": 1.3566, "step": 3896 }, { "epoch": 5.427576601671309, "grad_norm": 3.875413417816162, "learning_rate": 3.2662023877202956e-05, "loss": 1.3033, "step": 3897 }, { "epoch": 5.428969359331476, "grad_norm": 3.9885122776031494, "learning_rate": 3.265207504263786e-05, "loss": 1.6364, "step": 3898 }, { "epoch": 5.430362116991644, "grad_norm": 2.8498659133911133, "learning_rate": 3.2642126208072764e-05, "loss": 1.3442, "step": 3899 }, { "epoch": 5.43175487465181, "grad_norm": 3.3412253856658936, "learning_rate": 3.2632177373507675e-05, "loss": 1.3209, "step": 3900 }, { "epoch": 5.433147632311978, "grad_norm": 3.333075523376465, "learning_rate": 3.262222853894258e-05, "loss": 1.5815, "step": 3901 }, { "epoch": 5.4345403899721445, "grad_norm": 3.547705888748169, "learning_rate": 3.261227970437748e-05, "loss": 1.6916, "step": 3902 }, { "epoch": 5.435933147632312, "grad_norm": 3.714667320251465, "learning_rate": 3.260233086981239e-05, "loss": 1.5171, "step": 3903 }, { "epoch": 5.437325905292479, "grad_norm": 3.384195566177368, "learning_rate": 3.25923820352473e-05, "loss": 1.4183, "step": 3904 }, { "epoch": 5.438718662952646, "grad_norm": 3.1017160415649414, "learning_rate": 3.25824332006822e-05, "loss": 1.4028, "step": 3905 }, { "epoch": 5.440111420612814, "grad_norm": 4.297664165496826, "learning_rate": 3.2572484366117106e-05, "loss": 1.5649, "step": 3906 }, { "epoch": 5.4415041782729805, "grad_norm": 3.9159302711486816, "learning_rate": 3.256253553155202e-05, "loss": 1.4022, "step": 3907 }, { "epoch": 5.442896935933147, "grad_norm": 3.391209840774536, "learning_rate": 3.255258669698692e-05, "loss": 1.3308, "step": 3908 }, { "epoch": 5.444289693593315, "grad_norm": 3.9618213176727295, "learning_rate": 3.2542637862421825e-05, "loss": 1.5808, "step": 3909 }, { "epoch": 5.445682451253482, "grad_norm": 3.6104896068573, "learning_rate": 3.253268902785673e-05, "loss": 1.4468, "step": 3910 }, { "epoch": 5.447075208913649, "grad_norm": 3.209728240966797, "learning_rate": 3.252274019329164e-05, "loss": 1.3776, "step": 3911 }, { "epoch": 5.4484679665738165, "grad_norm": 3.180717706680298, "learning_rate": 3.251279135872655e-05, "loss": 1.2867, "step": 3912 }, { "epoch": 5.449860724233983, "grad_norm": 3.3016552925109863, "learning_rate": 3.250284252416145e-05, "loss": 1.3566, "step": 3913 }, { "epoch": 5.451253481894151, "grad_norm": 3.617339611053467, "learning_rate": 3.249289368959636e-05, "loss": 1.6453, "step": 3914 }, { "epoch": 5.452646239554317, "grad_norm": 3.0904672145843506, "learning_rate": 3.248294485503126e-05, "loss": 1.3079, "step": 3915 }, { "epoch": 5.454038997214485, "grad_norm": 3.719731330871582, "learning_rate": 3.2472996020466174e-05, "loss": 1.4557, "step": 3916 }, { "epoch": 5.455431754874652, "grad_norm": 3.3843271732330322, "learning_rate": 3.246304718590108e-05, "loss": 1.4903, "step": 3917 }, { "epoch": 5.456824512534819, "grad_norm": 3.0128655433654785, "learning_rate": 3.245309835133598e-05, "loss": 1.191, "step": 3918 }, { "epoch": 5.458217270194986, "grad_norm": 3.1585681438446045, "learning_rate": 3.244314951677089e-05, "loss": 1.4083, "step": 3919 }, { "epoch": 5.459610027855153, "grad_norm": 3.3376874923706055, "learning_rate": 3.24332006822058e-05, "loss": 1.6112, "step": 3920 }, { "epoch": 5.46100278551532, "grad_norm": 3.072366237640381, "learning_rate": 3.24232518476407e-05, "loss": 1.1329, "step": 3921 }, { "epoch": 5.462395543175488, "grad_norm": 3.20548415184021, "learning_rate": 3.241330301307561e-05, "loss": 1.4211, "step": 3922 }, { "epoch": 5.463788300835654, "grad_norm": 3.1418893337249756, "learning_rate": 3.2403354178510516e-05, "loss": 1.3566, "step": 3923 }, { "epoch": 5.465181058495822, "grad_norm": 3.166402578353882, "learning_rate": 3.239340534394542e-05, "loss": 1.1901, "step": 3924 }, { "epoch": 5.4665738161559885, "grad_norm": 3.5038914680480957, "learning_rate": 3.2383456509380324e-05, "loss": 1.5809, "step": 3925 }, { "epoch": 5.467966573816156, "grad_norm": 3.311577081680298, "learning_rate": 3.2373507674815235e-05, "loss": 1.4073, "step": 3926 }, { "epoch": 5.469359331476323, "grad_norm": 3.3907663822174072, "learning_rate": 3.236355884025014e-05, "loss": 1.2873, "step": 3927 }, { "epoch": 5.47075208913649, "grad_norm": 3.972849130630493, "learning_rate": 3.235361000568504e-05, "loss": 1.4817, "step": 3928 }, { "epoch": 5.472144846796658, "grad_norm": 3.3276312351226807, "learning_rate": 3.2343661171119954e-05, "loss": 1.284, "step": 3929 }, { "epoch": 5.4735376044568245, "grad_norm": 2.837625026702881, "learning_rate": 3.233371233655486e-05, "loss": 1.099, "step": 3930 }, { "epoch": 5.474930362116992, "grad_norm": 3.2056424617767334, "learning_rate": 3.232376350198976e-05, "loss": 1.3397, "step": 3931 }, { "epoch": 5.476323119777159, "grad_norm": 3.7157018184661865, "learning_rate": 3.2313814667424666e-05, "loss": 1.6531, "step": 3932 }, { "epoch": 5.477715877437326, "grad_norm": 3.524932861328125, "learning_rate": 3.230386583285958e-05, "loss": 1.2025, "step": 3933 }, { "epoch": 5.479108635097493, "grad_norm": 3.128558874130249, "learning_rate": 3.229391699829448e-05, "loss": 1.3906, "step": 3934 }, { "epoch": 5.4805013927576605, "grad_norm": 2.9211344718933105, "learning_rate": 3.2283968163729385e-05, "loss": 1.2087, "step": 3935 }, { "epoch": 5.481894150417827, "grad_norm": 3.499192714691162, "learning_rate": 3.2274019329164296e-05, "loss": 1.3862, "step": 3936 }, { "epoch": 5.483286908077995, "grad_norm": 2.7200067043304443, "learning_rate": 3.22640704945992e-05, "loss": 1.1445, "step": 3937 }, { "epoch": 5.484679665738161, "grad_norm": 3.847712755203247, "learning_rate": 3.2254121660034104e-05, "loss": 1.3631, "step": 3938 }, { "epoch": 5.486072423398329, "grad_norm": 4.017714977264404, "learning_rate": 3.2244172825469015e-05, "loss": 1.6958, "step": 3939 }, { "epoch": 5.487465181058496, "grad_norm": 3.3647825717926025, "learning_rate": 3.223422399090392e-05, "loss": 1.376, "step": 3940 }, { "epoch": 5.488857938718663, "grad_norm": 2.96148943901062, "learning_rate": 3.222427515633883e-05, "loss": 1.1843, "step": 3941 }, { "epoch": 5.49025069637883, "grad_norm": 3.516737222671509, "learning_rate": 3.221432632177373e-05, "loss": 1.6118, "step": 3942 }, { "epoch": 5.491643454038997, "grad_norm": 2.888239622116089, "learning_rate": 3.220437748720864e-05, "loss": 1.2634, "step": 3943 }, { "epoch": 5.493036211699164, "grad_norm": 3.1161155700683594, "learning_rate": 3.219442865264355e-05, "loss": 1.2417, "step": 3944 }, { "epoch": 5.494428969359332, "grad_norm": 3.035766124725342, "learning_rate": 3.218447981807845e-05, "loss": 1.2204, "step": 3945 }, { "epoch": 5.495821727019498, "grad_norm": 3.7109148502349854, "learning_rate": 3.217453098351336e-05, "loss": 1.5756, "step": 3946 }, { "epoch": 5.497214484679666, "grad_norm": 3.837930679321289, "learning_rate": 3.216458214894826e-05, "loss": 1.187, "step": 3947 }, { "epoch": 5.498607242339833, "grad_norm": 3.503160238265991, "learning_rate": 3.215463331438317e-05, "loss": 1.3288, "step": 3948 }, { "epoch": 5.5, "grad_norm": 2.700408697128296, "learning_rate": 3.2144684479818076e-05, "loss": 1.1584, "step": 3949 }, { "epoch": 5.501392757660167, "grad_norm": 3.317624092102051, "learning_rate": 3.213473564525298e-05, "loss": 1.3599, "step": 3950 }, { "epoch": 5.502785515320334, "grad_norm": 3.801151990890503, "learning_rate": 3.212478681068789e-05, "loss": 1.4942, "step": 3951 }, { "epoch": 5.504178272980502, "grad_norm": 3.7417783737182617, "learning_rate": 3.2114837976122795e-05, "loss": 1.1773, "step": 3952 }, { "epoch": 5.505571030640668, "grad_norm": 3.4485790729522705, "learning_rate": 3.21048891415577e-05, "loss": 1.4223, "step": 3953 }, { "epoch": 5.506963788300836, "grad_norm": 3.1726791858673096, "learning_rate": 3.2094940306992603e-05, "loss": 1.2985, "step": 3954 }, { "epoch": 5.508356545961003, "grad_norm": 3.7761385440826416, "learning_rate": 3.2084991472427514e-05, "loss": 1.3546, "step": 3955 }, { "epoch": 5.50974930362117, "grad_norm": 3.064495086669922, "learning_rate": 3.207504263786242e-05, "loss": 1.3108, "step": 3956 }, { "epoch": 5.511142061281337, "grad_norm": 3.1682968139648438, "learning_rate": 3.206509380329732e-05, "loss": 1.2865, "step": 3957 }, { "epoch": 5.512534818941504, "grad_norm": 2.9550204277038574, "learning_rate": 3.205514496873223e-05, "loss": 1.1755, "step": 3958 }, { "epoch": 5.513927576601671, "grad_norm": 3.430180788040161, "learning_rate": 3.204519613416714e-05, "loss": 1.3638, "step": 3959 }, { "epoch": 5.515320334261839, "grad_norm": 3.227559804916382, "learning_rate": 3.203524729960204e-05, "loss": 1.3122, "step": 3960 }, { "epoch": 5.516713091922005, "grad_norm": 2.9942450523376465, "learning_rate": 3.202529846503695e-05, "loss": 1.2739, "step": 3961 }, { "epoch": 5.518105849582173, "grad_norm": 4.473596096038818, "learning_rate": 3.2015349630471856e-05, "loss": 1.5011, "step": 3962 }, { "epoch": 5.5194986072423395, "grad_norm": 3.246814250946045, "learning_rate": 3.200540079590676e-05, "loss": 1.4086, "step": 3963 }, { "epoch": 5.520891364902507, "grad_norm": 3.2652289867401123, "learning_rate": 3.1995451961341665e-05, "loss": 1.4842, "step": 3964 }, { "epoch": 5.522284122562674, "grad_norm": 3.1682024002075195, "learning_rate": 3.1985503126776575e-05, "loss": 1.3278, "step": 3965 }, { "epoch": 5.523676880222841, "grad_norm": 3.0474061965942383, "learning_rate": 3.197555429221148e-05, "loss": 1.3106, "step": 3966 }, { "epoch": 5.525069637883008, "grad_norm": 3.323626756668091, "learning_rate": 3.1965605457646384e-05, "loss": 1.3913, "step": 3967 }, { "epoch": 5.5264623955431755, "grad_norm": 3.210984230041504, "learning_rate": 3.1955656623081294e-05, "loss": 1.2502, "step": 3968 }, { "epoch": 5.527855153203342, "grad_norm": 4.018118858337402, "learning_rate": 3.19457077885162e-05, "loss": 1.1894, "step": 3969 }, { "epoch": 5.52924791086351, "grad_norm": 2.9094223976135254, "learning_rate": 3.193575895395111e-05, "loss": 1.1678, "step": 3970 }, { "epoch": 5.530640668523677, "grad_norm": 3.7920138835906982, "learning_rate": 3.192581011938601e-05, "loss": 1.4131, "step": 3971 }, { "epoch": 5.532033426183844, "grad_norm": 3.238229513168335, "learning_rate": 3.191586128482092e-05, "loss": 1.4148, "step": 3972 }, { "epoch": 5.5334261838440115, "grad_norm": 3.8313090801239014, "learning_rate": 3.190591245025583e-05, "loss": 1.6562, "step": 3973 }, { "epoch": 5.534818941504178, "grad_norm": 5.136152744293213, "learning_rate": 3.189596361569073e-05, "loss": 1.4177, "step": 3974 }, { "epoch": 5.536211699164346, "grad_norm": 3.5499415397644043, "learning_rate": 3.1886014781125636e-05, "loss": 1.449, "step": 3975 }, { "epoch": 5.537604456824512, "grad_norm": 2.9771597385406494, "learning_rate": 3.187606594656054e-05, "loss": 1.1568, "step": 3976 }, { "epoch": 5.53899721448468, "grad_norm": 3.252581834793091, "learning_rate": 3.186611711199545e-05, "loss": 1.2848, "step": 3977 }, { "epoch": 5.540389972144847, "grad_norm": 3.4198732376098633, "learning_rate": 3.1856168277430355e-05, "loss": 1.3266, "step": 3978 }, { "epoch": 5.541782729805014, "grad_norm": 3.03037691116333, "learning_rate": 3.184621944286526e-05, "loss": 1.2643, "step": 3979 }, { "epoch": 5.543175487465181, "grad_norm": 3.268641233444214, "learning_rate": 3.183627060830017e-05, "loss": 1.468, "step": 3980 }, { "epoch": 5.544568245125348, "grad_norm": 4.733526229858398, "learning_rate": 3.1826321773735074e-05, "loss": 1.5814, "step": 3981 }, { "epoch": 5.545961002785515, "grad_norm": 4.59187650680542, "learning_rate": 3.181637293916998e-05, "loss": 1.6145, "step": 3982 }, { "epoch": 5.547353760445683, "grad_norm": 3.706252098083496, "learning_rate": 3.180642410460489e-05, "loss": 1.4103, "step": 3983 }, { "epoch": 5.548746518105849, "grad_norm": 2.993542194366455, "learning_rate": 3.1796475270039793e-05, "loss": 1.1197, "step": 3984 }, { "epoch": 5.550139275766017, "grad_norm": 3.199744939804077, "learning_rate": 3.17865264354747e-05, "loss": 1.5324, "step": 3985 }, { "epoch": 5.5515320334261835, "grad_norm": 3.2078278064727783, "learning_rate": 3.17765776009096e-05, "loss": 1.3773, "step": 3986 }, { "epoch": 5.552924791086351, "grad_norm": 3.8803422451019287, "learning_rate": 3.176662876634451e-05, "loss": 1.3681, "step": 3987 }, { "epoch": 5.554317548746518, "grad_norm": 3.2783961296081543, "learning_rate": 3.1756679931779417e-05, "loss": 1.4319, "step": 3988 }, { "epoch": 5.555710306406685, "grad_norm": 6.760887145996094, "learning_rate": 3.174673109721432e-05, "loss": 1.2848, "step": 3989 }, { "epoch": 5.557103064066853, "grad_norm": 3.0715172290802, "learning_rate": 3.173678226264923e-05, "loss": 1.3646, "step": 3990 }, { "epoch": 5.5584958217270195, "grad_norm": 3.3107216358184814, "learning_rate": 3.1726833428084136e-05, "loss": 1.5385, "step": 3991 }, { "epoch": 5.559888579387186, "grad_norm": 3.1300973892211914, "learning_rate": 3.171688459351904e-05, "loss": 1.3358, "step": 3992 }, { "epoch": 5.561281337047354, "grad_norm": 3.567563056945801, "learning_rate": 3.170693575895395e-05, "loss": 1.3929, "step": 3993 }, { "epoch": 5.562674094707521, "grad_norm": 3.478569269180298, "learning_rate": 3.1696986924388855e-05, "loss": 1.4469, "step": 3994 }, { "epoch": 5.564066852367688, "grad_norm": 3.3098061084747314, "learning_rate": 3.168703808982376e-05, "loss": 1.2011, "step": 3995 }, { "epoch": 5.5654596100278555, "grad_norm": 3.060587167739868, "learning_rate": 3.167708925525866e-05, "loss": 1.3309, "step": 3996 }, { "epoch": 5.566852367688022, "grad_norm": 2.8655707836151123, "learning_rate": 3.1667140420693574e-05, "loss": 1.2528, "step": 3997 }, { "epoch": 5.56824512534819, "grad_norm": 3.060734748840332, "learning_rate": 3.165719158612848e-05, "loss": 1.5167, "step": 3998 }, { "epoch": 5.569637883008356, "grad_norm": 3.5213658809661865, "learning_rate": 3.164724275156339e-05, "loss": 1.504, "step": 3999 }, { "epoch": 5.571030640668524, "grad_norm": 2.8732447624206543, "learning_rate": 3.163729391699829e-05, "loss": 1.4204, "step": 4000 }, { "epoch": 5.572423398328691, "grad_norm": 3.171318769454956, "learning_rate": 3.16273450824332e-05, "loss": 1.2582, "step": 4001 }, { "epoch": 5.573816155988858, "grad_norm": 3.1689140796661377, "learning_rate": 3.161739624786811e-05, "loss": 1.2342, "step": 4002 }, { "epoch": 5.575208913649025, "grad_norm": 3.2862606048583984, "learning_rate": 3.160744741330301e-05, "loss": 1.2617, "step": 4003 }, { "epoch": 5.576601671309192, "grad_norm": 3.740560293197632, "learning_rate": 3.1597498578737916e-05, "loss": 1.598, "step": 4004 }, { "epoch": 5.577994428969359, "grad_norm": 3.5847156047821045, "learning_rate": 3.1587549744172826e-05, "loss": 1.3149, "step": 4005 }, { "epoch": 5.579387186629527, "grad_norm": 3.189554452896118, "learning_rate": 3.157760090960773e-05, "loss": 1.2423, "step": 4006 }, { "epoch": 5.580779944289693, "grad_norm": 3.277650833129883, "learning_rate": 3.1567652075042635e-05, "loss": 1.3428, "step": 4007 }, { "epoch": 5.582172701949861, "grad_norm": 3.879124879837036, "learning_rate": 3.155770324047754e-05, "loss": 1.3948, "step": 4008 }, { "epoch": 5.5835654596100275, "grad_norm": 3.133305072784424, "learning_rate": 3.154775440591245e-05, "loss": 1.4726, "step": 4009 }, { "epoch": 5.584958217270195, "grad_norm": 3.1936330795288086, "learning_rate": 3.1537805571347354e-05, "loss": 1.0911, "step": 4010 }, { "epoch": 5.586350974930362, "grad_norm": 3.7738914489746094, "learning_rate": 3.152785673678226e-05, "loss": 1.5611, "step": 4011 }, { "epoch": 5.587743732590529, "grad_norm": 3.346017837524414, "learning_rate": 3.151790790221717e-05, "loss": 1.4063, "step": 4012 }, { "epoch": 5.589136490250697, "grad_norm": 3.697584629058838, "learning_rate": 3.150795906765207e-05, "loss": 1.4792, "step": 4013 }, { "epoch": 5.5905292479108635, "grad_norm": 3.020110845565796, "learning_rate": 3.149801023308698e-05, "loss": 1.2814, "step": 4014 }, { "epoch": 5.591922005571031, "grad_norm": 4.786131381988525, "learning_rate": 3.148806139852189e-05, "loss": 1.3081, "step": 4015 }, { "epoch": 5.593314763231198, "grad_norm": 3.489091396331787, "learning_rate": 3.147811256395679e-05, "loss": 1.3555, "step": 4016 }, { "epoch": 5.594707520891365, "grad_norm": 3.1483490467071533, "learning_rate": 3.1468163729391696e-05, "loss": 1.4145, "step": 4017 }, { "epoch": 5.596100278551532, "grad_norm": 3.646615505218506, "learning_rate": 3.14582148948266e-05, "loss": 1.5155, "step": 4018 }, { "epoch": 5.5974930362116995, "grad_norm": 3.1701669692993164, "learning_rate": 3.144826606026151e-05, "loss": 1.3475, "step": 4019 }, { "epoch": 5.598885793871866, "grad_norm": 3.67806339263916, "learning_rate": 3.1438317225696415e-05, "loss": 1.5856, "step": 4020 }, { "epoch": 5.600278551532034, "grad_norm": 3.745802402496338, "learning_rate": 3.142836839113132e-05, "loss": 1.7255, "step": 4021 }, { "epoch": 5.6016713091922, "grad_norm": 3.7711427211761475, "learning_rate": 3.141841955656623e-05, "loss": 1.5741, "step": 4022 }, { "epoch": 5.603064066852368, "grad_norm": 3.1372427940368652, "learning_rate": 3.1408470722001134e-05, "loss": 1.2582, "step": 4023 }, { "epoch": 5.604456824512535, "grad_norm": 2.933021306991577, "learning_rate": 3.139852188743604e-05, "loss": 1.2301, "step": 4024 }, { "epoch": 5.605849582172702, "grad_norm": 2.75142240524292, "learning_rate": 3.138857305287094e-05, "loss": 1.0087, "step": 4025 }, { "epoch": 5.607242339832869, "grad_norm": 3.52231502532959, "learning_rate": 3.137862421830585e-05, "loss": 1.513, "step": 4026 }, { "epoch": 5.608635097493036, "grad_norm": 3.888263702392578, "learning_rate": 3.1368675383740764e-05, "loss": 1.6725, "step": 4027 }, { "epoch": 5.610027855153203, "grad_norm": 3.135993480682373, "learning_rate": 3.135872654917567e-05, "loss": 1.3214, "step": 4028 }, { "epoch": 5.611420612813371, "grad_norm": 2.9921624660491943, "learning_rate": 3.134877771461057e-05, "loss": 1.1343, "step": 4029 }, { "epoch": 5.612813370473537, "grad_norm": 3.49458646774292, "learning_rate": 3.1338828880045476e-05, "loss": 1.3498, "step": 4030 }, { "epoch": 5.614206128133705, "grad_norm": 3.1544137001037598, "learning_rate": 3.132888004548039e-05, "loss": 1.1794, "step": 4031 }, { "epoch": 5.615598885793872, "grad_norm": 3.238394260406494, "learning_rate": 3.131893121091529e-05, "loss": 1.3715, "step": 4032 }, { "epoch": 5.616991643454039, "grad_norm": 3.5697274208068848, "learning_rate": 3.1308982376350195e-05, "loss": 1.457, "step": 4033 }, { "epoch": 5.618384401114206, "grad_norm": 3.0798115730285645, "learning_rate": 3.1299033541785106e-05, "loss": 1.5299, "step": 4034 }, { "epoch": 5.619777158774373, "grad_norm": 2.8842337131500244, "learning_rate": 3.128908470722001e-05, "loss": 1.234, "step": 4035 }, { "epoch": 5.621169916434541, "grad_norm": 3.178698778152466, "learning_rate": 3.1279135872654914e-05, "loss": 1.3407, "step": 4036 }, { "epoch": 5.6225626740947074, "grad_norm": 4.927010536193848, "learning_rate": 3.1269187038089825e-05, "loss": 1.2835, "step": 4037 }, { "epoch": 5.623955431754875, "grad_norm": 3.533519983291626, "learning_rate": 3.125923820352473e-05, "loss": 1.1547, "step": 4038 }, { "epoch": 5.625348189415042, "grad_norm": 3.076141595840454, "learning_rate": 3.124928936895963e-05, "loss": 1.3105, "step": 4039 }, { "epoch": 5.626740947075209, "grad_norm": 3.7596917152404785, "learning_rate": 3.123934053439454e-05, "loss": 1.3619, "step": 4040 }, { "epoch": 5.628133704735376, "grad_norm": 3.091172456741333, "learning_rate": 3.122939169982945e-05, "loss": 1.2091, "step": 4041 }, { "epoch": 5.629526462395543, "grad_norm": 3.2155330181121826, "learning_rate": 3.121944286526435e-05, "loss": 1.2206, "step": 4042 }, { "epoch": 5.63091922005571, "grad_norm": 3.5321922302246094, "learning_rate": 3.1209494030699256e-05, "loss": 1.4917, "step": 4043 }, { "epoch": 5.632311977715878, "grad_norm": 3.7770276069641113, "learning_rate": 3.119954519613417e-05, "loss": 1.5045, "step": 4044 }, { "epoch": 5.633704735376044, "grad_norm": 3.619699239730835, "learning_rate": 3.118959636156907e-05, "loss": 1.2959, "step": 4045 }, { "epoch": 5.635097493036212, "grad_norm": 3.2381839752197266, "learning_rate": 3.1179647527003975e-05, "loss": 1.3476, "step": 4046 }, { "epoch": 5.6364902506963785, "grad_norm": 3.1673707962036133, "learning_rate": 3.116969869243888e-05, "loss": 1.3016, "step": 4047 }, { "epoch": 5.637883008356546, "grad_norm": 3.2762868404388428, "learning_rate": 3.115974985787379e-05, "loss": 1.331, "step": 4048 }, { "epoch": 5.639275766016713, "grad_norm": 3.825324773788452, "learning_rate": 3.1149801023308694e-05, "loss": 1.5748, "step": 4049 }, { "epoch": 5.64066852367688, "grad_norm": 3.2996816635131836, "learning_rate": 3.11398521887436e-05, "loss": 1.4018, "step": 4050 }, { "epoch": 5.642061281337047, "grad_norm": 3.464852809906006, "learning_rate": 3.112990335417851e-05, "loss": 1.4006, "step": 4051 }, { "epoch": 5.6434540389972145, "grad_norm": 3.3543944358825684, "learning_rate": 3.111995451961341e-05, "loss": 1.3048, "step": 4052 }, { "epoch": 5.644846796657381, "grad_norm": 3.9715123176574707, "learning_rate": 3.1110005685048324e-05, "loss": 1.381, "step": 4053 }, { "epoch": 5.646239554317549, "grad_norm": 3.423572063446045, "learning_rate": 3.110005685048323e-05, "loss": 1.4331, "step": 4054 }, { "epoch": 5.647632311977716, "grad_norm": 3.0576534271240234, "learning_rate": 3.109010801591813e-05, "loss": 1.4171, "step": 4055 }, { "epoch": 5.649025069637883, "grad_norm": 3.4302568435668945, "learning_rate": 3.108015918135304e-05, "loss": 1.4861, "step": 4056 }, { "epoch": 5.65041782729805, "grad_norm": 3.7648096084594727, "learning_rate": 3.107021034678795e-05, "loss": 1.5186, "step": 4057 }, { "epoch": 5.651810584958217, "grad_norm": 3.2167465686798096, "learning_rate": 3.106026151222285e-05, "loss": 1.5403, "step": 4058 }, { "epoch": 5.653203342618385, "grad_norm": 3.5104517936706543, "learning_rate": 3.1050312677657755e-05, "loss": 1.2002, "step": 4059 }, { "epoch": 5.654596100278551, "grad_norm": 4.983837127685547, "learning_rate": 3.1040363843092666e-05, "loss": 1.343, "step": 4060 }, { "epoch": 5.655988857938719, "grad_norm": 3.564985752105713, "learning_rate": 3.103041500852757e-05, "loss": 1.4958, "step": 4061 }, { "epoch": 5.657381615598886, "grad_norm": 3.7939937114715576, "learning_rate": 3.1020466173962474e-05, "loss": 1.4437, "step": 4062 }, { "epoch": 5.658774373259053, "grad_norm": 3.5555875301361084, "learning_rate": 3.1010517339397385e-05, "loss": 1.2997, "step": 4063 }, { "epoch": 5.66016713091922, "grad_norm": 2.7710838317871094, "learning_rate": 3.100056850483229e-05, "loss": 1.008, "step": 4064 }, { "epoch": 5.661559888579387, "grad_norm": 3.532623767852783, "learning_rate": 3.099061967026719e-05, "loss": 1.6255, "step": 4065 }, { "epoch": 5.662952646239554, "grad_norm": 3.8716318607330322, "learning_rate": 3.0980670835702104e-05, "loss": 1.3818, "step": 4066 }, { "epoch": 5.664345403899722, "grad_norm": 3.149153470993042, "learning_rate": 3.097072200113701e-05, "loss": 1.3997, "step": 4067 }, { "epoch": 5.665738161559888, "grad_norm": 3.526543378829956, "learning_rate": 3.096077316657191e-05, "loss": 1.3464, "step": 4068 }, { "epoch": 5.667130919220056, "grad_norm": 3.077786922454834, "learning_rate": 3.0950824332006816e-05, "loss": 1.1928, "step": 4069 }, { "epoch": 5.6685236768802225, "grad_norm": 3.1415812969207764, "learning_rate": 3.094087549744173e-05, "loss": 1.3901, "step": 4070 }, { "epoch": 5.66991643454039, "grad_norm": 3.5665364265441895, "learning_rate": 3.093092666287663e-05, "loss": 1.1578, "step": 4071 }, { "epoch": 5.671309192200557, "grad_norm": 3.6948559284210205, "learning_rate": 3.0920977828311535e-05, "loss": 1.5388, "step": 4072 }, { "epoch": 5.672701949860724, "grad_norm": 3.619840621948242, "learning_rate": 3.0911028993746446e-05, "loss": 1.5306, "step": 4073 }, { "epoch": 5.674094707520892, "grad_norm": 3.6319541931152344, "learning_rate": 3.090108015918135e-05, "loss": 1.6165, "step": 4074 }, { "epoch": 5.6754874651810585, "grad_norm": 3.2584638595581055, "learning_rate": 3.0891131324616254e-05, "loss": 1.2343, "step": 4075 }, { "epoch": 5.676880222841225, "grad_norm": 3.5607893466949463, "learning_rate": 3.0881182490051165e-05, "loss": 1.5375, "step": 4076 }, { "epoch": 5.678272980501393, "grad_norm": 3.6344685554504395, "learning_rate": 3.087123365548607e-05, "loss": 1.468, "step": 4077 }, { "epoch": 5.67966573816156, "grad_norm": 3.4850354194641113, "learning_rate": 3.086128482092097e-05, "loss": 1.3957, "step": 4078 }, { "epoch": 5.681058495821727, "grad_norm": 3.4105865955352783, "learning_rate": 3.085133598635588e-05, "loss": 1.3103, "step": 4079 }, { "epoch": 5.6824512534818945, "grad_norm": 3.7823452949523926, "learning_rate": 3.084138715179079e-05, "loss": 1.519, "step": 4080 }, { "epoch": 5.683844011142061, "grad_norm": 4.052648544311523, "learning_rate": 3.083143831722569e-05, "loss": 1.4329, "step": 4081 }, { "epoch": 5.685236768802229, "grad_norm": 3.6204118728637695, "learning_rate": 3.08214894826606e-05, "loss": 1.4498, "step": 4082 }, { "epoch": 5.686629526462395, "grad_norm": 3.150477170944214, "learning_rate": 3.081154064809551e-05, "loss": 1.51, "step": 4083 }, { "epoch": 5.688022284122563, "grad_norm": 3.188532590866089, "learning_rate": 3.080159181353041e-05, "loss": 1.5072, "step": 4084 }, { "epoch": 5.68941504178273, "grad_norm": 3.531343936920166, "learning_rate": 3.079164297896532e-05, "loss": 1.3479, "step": 4085 }, { "epoch": 5.690807799442897, "grad_norm": 3.5683412551879883, "learning_rate": 3.0781694144400226e-05, "loss": 1.8486, "step": 4086 }, { "epoch": 5.692200557103064, "grad_norm": 2.9277305603027344, "learning_rate": 3.077174530983513e-05, "loss": 1.0993, "step": 4087 }, { "epoch": 5.693593314763231, "grad_norm": 3.2109014987945557, "learning_rate": 3.076179647527004e-05, "loss": 1.3071, "step": 4088 }, { "epoch": 5.694986072423398, "grad_norm": 3.4162259101867676, "learning_rate": 3.0751847640704945e-05, "loss": 1.4459, "step": 4089 }, { "epoch": 5.696378830083566, "grad_norm": 3.262427806854248, "learning_rate": 3.074189880613985e-05, "loss": 1.271, "step": 4090 }, { "epoch": 5.697771587743732, "grad_norm": 3.9636881351470947, "learning_rate": 3.073194997157475e-05, "loss": 1.1394, "step": 4091 }, { "epoch": 5.6991643454039, "grad_norm": 4.1263885498046875, "learning_rate": 3.0722001137009664e-05, "loss": 1.2762, "step": 4092 }, { "epoch": 5.7005571030640665, "grad_norm": 3.5942258834838867, "learning_rate": 3.071205230244457e-05, "loss": 1.5294, "step": 4093 }, { "epoch": 5.701949860724234, "grad_norm": 3.5829882621765137, "learning_rate": 3.070210346787947e-05, "loss": 1.6246, "step": 4094 }, { "epoch": 5.703342618384401, "grad_norm": 3.2303061485290527, "learning_rate": 3.069215463331438e-05, "loss": 1.2655, "step": 4095 }, { "epoch": 5.704735376044568, "grad_norm": 2.990727186203003, "learning_rate": 3.068220579874929e-05, "loss": 1.2716, "step": 4096 }, { "epoch": 5.706128133704736, "grad_norm": 3.8842310905456543, "learning_rate": 3.067225696418419e-05, "loss": 1.6645, "step": 4097 }, { "epoch": 5.7075208913649025, "grad_norm": 3.260706901550293, "learning_rate": 3.06623081296191e-05, "loss": 1.3646, "step": 4098 }, { "epoch": 5.708913649025069, "grad_norm": 3.34687876701355, "learning_rate": 3.0652359295054006e-05, "loss": 1.4088, "step": 4099 }, { "epoch": 5.710306406685237, "grad_norm": 3.53373384475708, "learning_rate": 3.064241046048891e-05, "loss": 1.4936, "step": 4100 }, { "epoch": 5.711699164345404, "grad_norm": 3.292091131210327, "learning_rate": 3.0632461625923814e-05, "loss": 1.2043, "step": 4101 }, { "epoch": 5.713091922005571, "grad_norm": 3.288079023361206, "learning_rate": 3.0622512791358725e-05, "loss": 1.5383, "step": 4102 }, { "epoch": 5.7144846796657385, "grad_norm": 3.857797861099243, "learning_rate": 3.061256395679363e-05, "loss": 1.3433, "step": 4103 }, { "epoch": 5.715877437325905, "grad_norm": 3.4742066860198975, "learning_rate": 3.060261512222853e-05, "loss": 1.4904, "step": 4104 }, { "epoch": 5.717270194986073, "grad_norm": 2.6940255165100098, "learning_rate": 3.0592666287663444e-05, "loss": 1.1487, "step": 4105 }, { "epoch": 5.718662952646239, "grad_norm": 3.0581483840942383, "learning_rate": 3.058271745309835e-05, "loss": 1.2375, "step": 4106 }, { "epoch": 5.720055710306407, "grad_norm": 3.8022048473358154, "learning_rate": 3.057276861853325e-05, "loss": 1.5468, "step": 4107 }, { "epoch": 5.721448467966574, "grad_norm": 3.0157785415649414, "learning_rate": 3.0562819783968156e-05, "loss": 1.2329, "step": 4108 }, { "epoch": 5.722841225626741, "grad_norm": 2.8184762001037598, "learning_rate": 3.055287094940307e-05, "loss": 1.1472, "step": 4109 }, { "epoch": 5.724233983286908, "grad_norm": 3.1936144828796387, "learning_rate": 3.054292211483798e-05, "loss": 1.2509, "step": 4110 }, { "epoch": 5.725626740947075, "grad_norm": 4.00202751159668, "learning_rate": 3.053297328027288e-05, "loss": 1.6249, "step": 4111 }, { "epoch": 5.727019498607242, "grad_norm": 3.5175881385803223, "learning_rate": 3.0523024445707786e-05, "loss": 1.2482, "step": 4112 }, { "epoch": 5.72841225626741, "grad_norm": 3.3421785831451416, "learning_rate": 3.051307561114269e-05, "loss": 1.1038, "step": 4113 }, { "epoch": 5.729805013927576, "grad_norm": 3.4558494091033936, "learning_rate": 3.0503126776577598e-05, "loss": 1.3373, "step": 4114 }, { "epoch": 5.731197771587744, "grad_norm": 3.02663254737854, "learning_rate": 3.0493177942012505e-05, "loss": 1.5091, "step": 4115 }, { "epoch": 5.732590529247911, "grad_norm": 2.962890386581421, "learning_rate": 3.048322910744741e-05, "loss": 1.1872, "step": 4116 }, { "epoch": 5.733983286908078, "grad_norm": 3.380704879760742, "learning_rate": 3.0473280272882317e-05, "loss": 1.2305, "step": 4117 }, { "epoch": 5.735376044568245, "grad_norm": 3.2142016887664795, "learning_rate": 3.046333143831722e-05, "loss": 1.3784, "step": 4118 }, { "epoch": 5.736768802228412, "grad_norm": 2.860426187515259, "learning_rate": 3.0453382603752128e-05, "loss": 1.179, "step": 4119 }, { "epoch": 5.73816155988858, "grad_norm": 3.4320616722106934, "learning_rate": 3.0443433769187036e-05, "loss": 1.4381, "step": 4120 }, { "epoch": 5.7395543175487465, "grad_norm": 3.2273716926574707, "learning_rate": 3.043348493462194e-05, "loss": 1.2319, "step": 4121 }, { "epoch": 5.740947075208914, "grad_norm": 3.51316499710083, "learning_rate": 3.042353610005685e-05, "loss": 1.6669, "step": 4122 }, { "epoch": 5.742339832869081, "grad_norm": 2.818206787109375, "learning_rate": 3.041358726549175e-05, "loss": 1.1421, "step": 4123 }, { "epoch": 5.743732590529248, "grad_norm": 3.244997262954712, "learning_rate": 3.0403638430926662e-05, "loss": 1.2826, "step": 4124 }, { "epoch": 5.745125348189415, "grad_norm": 3.031724452972412, "learning_rate": 3.0393689596361566e-05, "loss": 1.2507, "step": 4125 }, { "epoch": 5.7465181058495824, "grad_norm": 3.2619550228118896, "learning_rate": 3.0383740761796474e-05, "loss": 1.2841, "step": 4126 }, { "epoch": 5.747910863509749, "grad_norm": 3.3931632041931152, "learning_rate": 3.037379192723138e-05, "loss": 1.3306, "step": 4127 }, { "epoch": 5.749303621169917, "grad_norm": 3.3280770778656006, "learning_rate": 3.0363843092666285e-05, "loss": 1.4295, "step": 4128 }, { "epoch": 5.750696378830083, "grad_norm": 3.037776470184326, "learning_rate": 3.0353894258101193e-05, "loss": 1.2161, "step": 4129 }, { "epoch": 5.752089136490251, "grad_norm": 3.152137517929077, "learning_rate": 3.0343945423536097e-05, "loss": 1.4588, "step": 4130 }, { "epoch": 5.7534818941504176, "grad_norm": 3.9945080280303955, "learning_rate": 3.0333996588971004e-05, "loss": 1.4879, "step": 4131 }, { "epoch": 5.754874651810585, "grad_norm": 3.6053826808929443, "learning_rate": 3.032404775440591e-05, "loss": 1.4856, "step": 4132 }, { "epoch": 5.756267409470752, "grad_norm": 3.221766710281372, "learning_rate": 3.0314098919840816e-05, "loss": 1.3328, "step": 4133 }, { "epoch": 5.757660167130919, "grad_norm": 3.8348989486694336, "learning_rate": 3.0304150085275723e-05, "loss": 1.493, "step": 4134 }, { "epoch": 5.759052924791086, "grad_norm": 3.57279372215271, "learning_rate": 3.0294201250710627e-05, "loss": 1.4078, "step": 4135 }, { "epoch": 5.7604456824512535, "grad_norm": 3.2637948989868164, "learning_rate": 3.0284252416145535e-05, "loss": 1.2166, "step": 4136 }, { "epoch": 5.76183844011142, "grad_norm": 3.51096773147583, "learning_rate": 3.0274303581580442e-05, "loss": 1.5175, "step": 4137 }, { "epoch": 5.763231197771588, "grad_norm": 3.0070669651031494, "learning_rate": 3.0264354747015346e-05, "loss": 1.0949, "step": 4138 }, { "epoch": 5.764623955431755, "grad_norm": 3.6944706439971924, "learning_rate": 3.0254405912450254e-05, "loss": 1.457, "step": 4139 }, { "epoch": 5.766016713091922, "grad_norm": 3.577955484390259, "learning_rate": 3.0244457077885158e-05, "loss": 1.4939, "step": 4140 }, { "epoch": 5.767409470752089, "grad_norm": 3.0088469982147217, "learning_rate": 3.0234508243320065e-05, "loss": 1.3536, "step": 4141 }, { "epoch": 5.768802228412256, "grad_norm": 3.2172975540161133, "learning_rate": 3.0224559408754973e-05, "loss": 1.3453, "step": 4142 }, { "epoch": 5.770194986072424, "grad_norm": 2.8926684856414795, "learning_rate": 3.0214610574189877e-05, "loss": 1.2709, "step": 4143 }, { "epoch": 5.77158774373259, "grad_norm": 2.8702945709228516, "learning_rate": 3.0204661739624784e-05, "loss": 1.0931, "step": 4144 }, { "epoch": 5.772980501392758, "grad_norm": 3.240285634994507, "learning_rate": 3.019471290505969e-05, "loss": 0.9439, "step": 4145 }, { "epoch": 5.774373259052925, "grad_norm": 3.271115779876709, "learning_rate": 3.0184764070494596e-05, "loss": 1.4925, "step": 4146 }, { "epoch": 5.775766016713092, "grad_norm": 3.020358085632324, "learning_rate": 3.01748152359295e-05, "loss": 1.2876, "step": 4147 }, { "epoch": 5.777158774373259, "grad_norm": 3.0515573024749756, "learning_rate": 3.0164866401364407e-05, "loss": 1.0941, "step": 4148 }, { "epoch": 5.778551532033426, "grad_norm": 3.369999408721924, "learning_rate": 3.0154917566799315e-05, "loss": 1.2354, "step": 4149 }, { "epoch": 5.779944289693593, "grad_norm": 4.339197158813477, "learning_rate": 3.014496873223422e-05, "loss": 1.9558, "step": 4150 }, { "epoch": 5.781337047353761, "grad_norm": 3.7317991256713867, "learning_rate": 3.013501989766913e-05, "loss": 1.2342, "step": 4151 }, { "epoch": 5.782729805013927, "grad_norm": 3.094419002532959, "learning_rate": 3.012507106310403e-05, "loss": 1.1601, "step": 4152 }, { "epoch": 5.784122562674095, "grad_norm": 2.936570644378662, "learning_rate": 3.011512222853894e-05, "loss": 1.1971, "step": 4153 }, { "epoch": 5.7855153203342615, "grad_norm": 3.0709304809570312, "learning_rate": 3.010517339397385e-05, "loss": 1.1642, "step": 4154 }, { "epoch": 5.786908077994429, "grad_norm": 2.7466046810150146, "learning_rate": 3.0095224559408753e-05, "loss": 0.943, "step": 4155 }, { "epoch": 5.788300835654596, "grad_norm": 3.3312766551971436, "learning_rate": 3.008527572484366e-05, "loss": 1.4297, "step": 4156 }, { "epoch": 5.789693593314763, "grad_norm": 3.617126703262329, "learning_rate": 3.0075326890278564e-05, "loss": 1.3232, "step": 4157 }, { "epoch": 5.791086350974931, "grad_norm": 2.7909839153289795, "learning_rate": 3.0065378055713472e-05, "loss": 0.8445, "step": 4158 }, { "epoch": 5.7924791086350975, "grad_norm": 3.9032790660858154, "learning_rate": 3.005542922114838e-05, "loss": 1.6201, "step": 4159 }, { "epoch": 5.793871866295264, "grad_norm": 3.076613426208496, "learning_rate": 3.0045480386583283e-05, "loss": 1.3427, "step": 4160 }, { "epoch": 5.795264623955432, "grad_norm": 4.433011531829834, "learning_rate": 3.003553155201819e-05, "loss": 1.3001, "step": 4161 }, { "epoch": 5.796657381615599, "grad_norm": 3.7605056762695312, "learning_rate": 3.0025582717453095e-05, "loss": 1.3189, "step": 4162 }, { "epoch": 5.798050139275766, "grad_norm": 3.1744890213012695, "learning_rate": 3.0015633882888002e-05, "loss": 1.4072, "step": 4163 }, { "epoch": 5.7994428969359335, "grad_norm": 3.4143013954162598, "learning_rate": 3.000568504832291e-05, "loss": 1.3463, "step": 4164 }, { "epoch": 5.8008356545961, "grad_norm": 3.0669949054718018, "learning_rate": 2.9995736213757814e-05, "loss": 1.227, "step": 4165 }, { "epoch": 5.802228412256268, "grad_norm": 3.3583242893218994, "learning_rate": 2.998578737919272e-05, "loss": 1.4553, "step": 4166 }, { "epoch": 5.803621169916434, "grad_norm": 3.753074884414673, "learning_rate": 2.9975838544627625e-05, "loss": 1.5123, "step": 4167 }, { "epoch": 5.805013927576602, "grad_norm": 3.2437632083892822, "learning_rate": 2.9965889710062533e-05, "loss": 1.0924, "step": 4168 }, { "epoch": 5.806406685236769, "grad_norm": 3.4535083770751953, "learning_rate": 2.9955940875497437e-05, "loss": 1.3092, "step": 4169 }, { "epoch": 5.807799442896936, "grad_norm": 3.2413816452026367, "learning_rate": 2.9945992040932344e-05, "loss": 1.4041, "step": 4170 }, { "epoch": 5.809192200557103, "grad_norm": 3.1956446170806885, "learning_rate": 2.9936043206367252e-05, "loss": 1.3137, "step": 4171 }, { "epoch": 5.81058495821727, "grad_norm": 3.5328941345214844, "learning_rate": 2.9926094371802156e-05, "loss": 1.5614, "step": 4172 }, { "epoch": 5.811977715877437, "grad_norm": 3.4689059257507324, "learning_rate": 2.9916145537237063e-05, "loss": 1.4592, "step": 4173 }, { "epoch": 5.813370473537605, "grad_norm": 3.411120891571045, "learning_rate": 2.9906196702671968e-05, "loss": 1.3544, "step": 4174 }, { "epoch": 5.814763231197771, "grad_norm": 3.5169613361358643, "learning_rate": 2.9896247868106875e-05, "loss": 1.541, "step": 4175 }, { "epoch": 5.816155988857939, "grad_norm": 2.96293568611145, "learning_rate": 2.9886299033541782e-05, "loss": 1.0733, "step": 4176 }, { "epoch": 5.8175487465181055, "grad_norm": 3.3015525341033936, "learning_rate": 2.9876350198976687e-05, "loss": 1.5115, "step": 4177 }, { "epoch": 5.818941504178273, "grad_norm": 2.850428581237793, "learning_rate": 2.9866401364411597e-05, "loss": 1.2325, "step": 4178 }, { "epoch": 5.82033426183844, "grad_norm": 3.6030232906341553, "learning_rate": 2.9856452529846498e-05, "loss": 1.4975, "step": 4179 }, { "epoch": 5.821727019498607, "grad_norm": 3.1357882022857666, "learning_rate": 2.984650369528141e-05, "loss": 1.4008, "step": 4180 }, { "epoch": 5.823119777158775, "grad_norm": 3.1677186489105225, "learning_rate": 2.9836554860716316e-05, "loss": 1.2208, "step": 4181 }, { "epoch": 5.8245125348189415, "grad_norm": 3.15153431892395, "learning_rate": 2.982660602615122e-05, "loss": 1.3147, "step": 4182 }, { "epoch": 5.825905292479108, "grad_norm": 3.3322243690490723, "learning_rate": 2.9816657191586128e-05, "loss": 1.3051, "step": 4183 }, { "epoch": 5.827298050139276, "grad_norm": 3.433753252029419, "learning_rate": 2.9806708357021032e-05, "loss": 1.4682, "step": 4184 }, { "epoch": 5.828690807799443, "grad_norm": 3.517406940460205, "learning_rate": 2.979675952245594e-05, "loss": 1.383, "step": 4185 }, { "epoch": 5.83008356545961, "grad_norm": 10.21381664276123, "learning_rate": 2.9786810687890847e-05, "loss": 1.2204, "step": 4186 }, { "epoch": 5.8314763231197775, "grad_norm": 3.349642515182495, "learning_rate": 2.977686185332575e-05, "loss": 1.3719, "step": 4187 }, { "epoch": 5.832869080779944, "grad_norm": 3.384669303894043, "learning_rate": 2.976691301876066e-05, "loss": 1.42, "step": 4188 }, { "epoch": 5.834261838440112, "grad_norm": 3.919182300567627, "learning_rate": 2.9756964184195563e-05, "loss": 1.5257, "step": 4189 }, { "epoch": 5.835654596100278, "grad_norm": 3.144423484802246, "learning_rate": 2.974701534963047e-05, "loss": 1.4346, "step": 4190 }, { "epoch": 5.837047353760446, "grad_norm": 3.360140800476074, "learning_rate": 2.9737066515065374e-05, "loss": 1.628, "step": 4191 }, { "epoch": 5.838440111420613, "grad_norm": 3.2371537685394287, "learning_rate": 2.972711768050028e-05, "loss": 1.3053, "step": 4192 }, { "epoch": 5.83983286908078, "grad_norm": 3.167996883392334, "learning_rate": 2.971716884593519e-05, "loss": 1.3235, "step": 4193 }, { "epoch": 5.841225626740947, "grad_norm": 3.457688570022583, "learning_rate": 2.9707220011370093e-05, "loss": 1.3575, "step": 4194 }, { "epoch": 5.842618384401114, "grad_norm": 3.499300956726074, "learning_rate": 2.9697271176805e-05, "loss": 1.407, "step": 4195 }, { "epoch": 5.844011142061281, "grad_norm": 3.7530620098114014, "learning_rate": 2.9687322342239905e-05, "loss": 1.3062, "step": 4196 }, { "epoch": 5.845403899721449, "grad_norm": 3.6542885303497314, "learning_rate": 2.9677373507674812e-05, "loss": 1.4013, "step": 4197 }, { "epoch": 5.846796657381615, "grad_norm": 3.788815975189209, "learning_rate": 2.966742467310972e-05, "loss": 1.7402, "step": 4198 }, { "epoch": 5.848189415041783, "grad_norm": 3.198843479156494, "learning_rate": 2.9657475838544624e-05, "loss": 1.1983, "step": 4199 }, { "epoch": 5.84958217270195, "grad_norm": 3.5123395919799805, "learning_rate": 2.964752700397953e-05, "loss": 1.3479, "step": 4200 }, { "epoch": 5.850974930362117, "grad_norm": 3.186678409576416, "learning_rate": 2.9637578169414435e-05, "loss": 1.3358, "step": 4201 }, { "epoch": 5.852367688022284, "grad_norm": 3.2986679077148438, "learning_rate": 2.9627629334849343e-05, "loss": 1.4147, "step": 4202 }, { "epoch": 5.853760445682451, "grad_norm": 3.591884136199951, "learning_rate": 2.961768050028425e-05, "loss": 1.252, "step": 4203 }, { "epoch": 5.855153203342619, "grad_norm": 4.0336503982543945, "learning_rate": 2.9607731665719154e-05, "loss": 1.545, "step": 4204 }, { "epoch": 5.8565459610027855, "grad_norm": 3.571032762527466, "learning_rate": 2.959778283115406e-05, "loss": 1.4103, "step": 4205 }, { "epoch": 5.857938718662953, "grad_norm": 3.2887074947357178, "learning_rate": 2.9587833996588966e-05, "loss": 1.3755, "step": 4206 }, { "epoch": 5.85933147632312, "grad_norm": 2.8116984367370605, "learning_rate": 2.9577885162023877e-05, "loss": 1.0999, "step": 4207 }, { "epoch": 5.860724233983287, "grad_norm": 3.0161566734313965, "learning_rate": 2.9567936327458784e-05, "loss": 1.2861, "step": 4208 }, { "epoch": 5.862116991643454, "grad_norm": 3.3580868244171143, "learning_rate": 2.9557987492893688e-05, "loss": 1.3462, "step": 4209 }, { "epoch": 5.8635097493036215, "grad_norm": 3.395779609680176, "learning_rate": 2.9548038658328596e-05, "loss": 1.3673, "step": 4210 }, { "epoch": 5.864902506963788, "grad_norm": 3.057870388031006, "learning_rate": 2.95380898237635e-05, "loss": 1.0038, "step": 4211 }, { "epoch": 5.866295264623956, "grad_norm": 2.9318737983703613, "learning_rate": 2.9528140989198407e-05, "loss": 1.415, "step": 4212 }, { "epoch": 5.867688022284122, "grad_norm": 3.789201021194458, "learning_rate": 2.951819215463331e-05, "loss": 1.2342, "step": 4213 }, { "epoch": 5.86908077994429, "grad_norm": 3.4682655334472656, "learning_rate": 2.950824332006822e-05, "loss": 1.3848, "step": 4214 }, { "epoch": 5.870473537604457, "grad_norm": 3.241466522216797, "learning_rate": 2.9498294485503126e-05, "loss": 1.6755, "step": 4215 }, { "epoch": 5.871866295264624, "grad_norm": 3.4985568523406982, "learning_rate": 2.948834565093803e-05, "loss": 1.2251, "step": 4216 }, { "epoch": 5.873259052924791, "grad_norm": 3.2599987983703613, "learning_rate": 2.9478396816372938e-05, "loss": 1.308, "step": 4217 }, { "epoch": 5.874651810584958, "grad_norm": 3.268479108810425, "learning_rate": 2.946844798180784e-05, "loss": 1.2174, "step": 4218 }, { "epoch": 5.876044568245125, "grad_norm": 3.1882810592651367, "learning_rate": 2.945849914724275e-05, "loss": 1.2337, "step": 4219 }, { "epoch": 5.8774373259052926, "grad_norm": 3.6085448265075684, "learning_rate": 2.9448550312677657e-05, "loss": 1.7037, "step": 4220 }, { "epoch": 5.878830083565459, "grad_norm": 3.001582384109497, "learning_rate": 2.943860147811256e-05, "loss": 1.2232, "step": 4221 }, { "epoch": 5.880222841225627, "grad_norm": 3.2147443294525146, "learning_rate": 2.9428652643547468e-05, "loss": 1.2098, "step": 4222 }, { "epoch": 5.881615598885794, "grad_norm": 3.6638669967651367, "learning_rate": 2.9418703808982372e-05, "loss": 1.5788, "step": 4223 }, { "epoch": 5.883008356545961, "grad_norm": 2.899076461791992, "learning_rate": 2.940875497441728e-05, "loss": 0.9433, "step": 4224 }, { "epoch": 5.884401114206128, "grad_norm": 3.244516372680664, "learning_rate": 2.9398806139852187e-05, "loss": 1.4991, "step": 4225 }, { "epoch": 5.885793871866295, "grad_norm": 3.1038339138031006, "learning_rate": 2.938885730528709e-05, "loss": 1.0099, "step": 4226 }, { "epoch": 5.887186629526463, "grad_norm": 2.9299960136413574, "learning_rate": 2.9378908470722e-05, "loss": 1.1283, "step": 4227 }, { "epoch": 5.888579387186629, "grad_norm": 3.4644925594329834, "learning_rate": 2.9368959636156903e-05, "loss": 1.4124, "step": 4228 }, { "epoch": 5.889972144846797, "grad_norm": 3.1208677291870117, "learning_rate": 2.935901080159181e-05, "loss": 1.5089, "step": 4229 }, { "epoch": 5.891364902506964, "grad_norm": 3.68741774559021, "learning_rate": 2.9349061967026718e-05, "loss": 1.3013, "step": 4230 }, { "epoch": 5.892757660167131, "grad_norm": 3.068450450897217, "learning_rate": 2.9339113132461622e-05, "loss": 1.0985, "step": 4231 }, { "epoch": 5.894150417827298, "grad_norm": 3.0982754230499268, "learning_rate": 2.932916429789653e-05, "loss": 1.1662, "step": 4232 }, { "epoch": 5.895543175487465, "grad_norm": 3.9106130599975586, "learning_rate": 2.9319215463331433e-05, "loss": 1.6737, "step": 4233 }, { "epoch": 5.896935933147632, "grad_norm": 3.0923335552215576, "learning_rate": 2.9309266628766344e-05, "loss": 1.0318, "step": 4234 }, { "epoch": 5.8983286908078, "grad_norm": 3.1148061752319336, "learning_rate": 2.9299317794201245e-05, "loss": 1.074, "step": 4235 }, { "epoch": 5.899721448467966, "grad_norm": 3.4015417098999023, "learning_rate": 2.9289368959636156e-05, "loss": 1.54, "step": 4236 }, { "epoch": 5.901114206128134, "grad_norm": 3.440244436264038, "learning_rate": 2.9279420125071063e-05, "loss": 1.2495, "step": 4237 }, { "epoch": 5.9025069637883005, "grad_norm": 3.1867411136627197, "learning_rate": 2.9269471290505967e-05, "loss": 1.2098, "step": 4238 }, { "epoch": 5.903899721448468, "grad_norm": 3.6312007904052734, "learning_rate": 2.9259522455940875e-05, "loss": 1.6599, "step": 4239 }, { "epoch": 5.905292479108635, "grad_norm": 3.264538288116455, "learning_rate": 2.924957362137578e-05, "loss": 1.1512, "step": 4240 }, { "epoch": 5.906685236768802, "grad_norm": 3.1987524032592773, "learning_rate": 2.9239624786810686e-05, "loss": 1.3559, "step": 4241 }, { "epoch": 5.908077994428969, "grad_norm": 3.2913811206817627, "learning_rate": 2.9229675952245594e-05, "loss": 1.3152, "step": 4242 }, { "epoch": 5.9094707520891365, "grad_norm": 3.5033583641052246, "learning_rate": 2.9219727117680498e-05, "loss": 1.3457, "step": 4243 }, { "epoch": 5.910863509749303, "grad_norm": 3.714859962463379, "learning_rate": 2.9209778283115405e-05, "loss": 1.5776, "step": 4244 }, { "epoch": 5.912256267409471, "grad_norm": 3.511657476425171, "learning_rate": 2.919982944855031e-05, "loss": 1.523, "step": 4245 }, { "epoch": 5.913649025069638, "grad_norm": 3.459751605987549, "learning_rate": 2.9189880613985217e-05, "loss": 1.2272, "step": 4246 }, { "epoch": 5.915041782729805, "grad_norm": 3.478177070617676, "learning_rate": 2.9179931779420124e-05, "loss": 1.5362, "step": 4247 }, { "epoch": 5.9164345403899725, "grad_norm": 3.2239317893981934, "learning_rate": 2.9169982944855028e-05, "loss": 1.0754, "step": 4248 }, { "epoch": 5.917827298050139, "grad_norm": 3.0095248222351074, "learning_rate": 2.9160034110289936e-05, "loss": 1.3243, "step": 4249 }, { "epoch": 5.919220055710307, "grad_norm": 3.2675821781158447, "learning_rate": 2.915008527572484e-05, "loss": 1.048, "step": 4250 }, { "epoch": 5.920612813370473, "grad_norm": 3.6328365802764893, "learning_rate": 2.9140136441159747e-05, "loss": 1.4779, "step": 4251 }, { "epoch": 5.922005571030641, "grad_norm": 3.0589301586151123, "learning_rate": 2.913018760659465e-05, "loss": 1.3803, "step": 4252 }, { "epoch": 5.923398328690808, "grad_norm": 3.189467430114746, "learning_rate": 2.912023877202956e-05, "loss": 1.2764, "step": 4253 }, { "epoch": 5.924791086350975, "grad_norm": 3.7781779766082764, "learning_rate": 2.9110289937464466e-05, "loss": 1.3555, "step": 4254 }, { "epoch": 5.926183844011142, "grad_norm": 3.6133739948272705, "learning_rate": 2.910034110289937e-05, "loss": 1.3811, "step": 4255 }, { "epoch": 5.927576601671309, "grad_norm": 3.5860114097595215, "learning_rate": 2.9090392268334278e-05, "loss": 1.5179, "step": 4256 }, { "epoch": 5.928969359331476, "grad_norm": 4.614382743835449, "learning_rate": 2.9080443433769182e-05, "loss": 1.2738, "step": 4257 }, { "epoch": 5.930362116991644, "grad_norm": 8.058302879333496, "learning_rate": 2.907049459920409e-05, "loss": 1.409, "step": 4258 }, { "epoch": 5.93175487465181, "grad_norm": 4.036209583282471, "learning_rate": 2.9060545764638997e-05, "loss": 1.3674, "step": 4259 }, { "epoch": 5.933147632311978, "grad_norm": 3.4181134700775146, "learning_rate": 2.90505969300739e-05, "loss": 1.1795, "step": 4260 }, { "epoch": 5.9345403899721445, "grad_norm": 3.0663325786590576, "learning_rate": 2.904064809550881e-05, "loss": 1.066, "step": 4261 }, { "epoch": 5.935933147632312, "grad_norm": 3.1197872161865234, "learning_rate": 2.9030699260943712e-05, "loss": 1.2864, "step": 4262 }, { "epoch": 5.937325905292479, "grad_norm": 3.1056711673736572, "learning_rate": 2.9020750426378623e-05, "loss": 1.031, "step": 4263 }, { "epoch": 5.938718662952646, "grad_norm": 4.707518100738525, "learning_rate": 2.901080159181353e-05, "loss": 1.2698, "step": 4264 }, { "epoch": 5.940111420612814, "grad_norm": 3.6590993404388428, "learning_rate": 2.9000852757248435e-05, "loss": 1.4503, "step": 4265 }, { "epoch": 5.9415041782729805, "grad_norm": 3.107004165649414, "learning_rate": 2.8990903922683342e-05, "loss": 1.1497, "step": 4266 }, { "epoch": 5.942896935933147, "grad_norm": 3.4972264766693115, "learning_rate": 2.8980955088118246e-05, "loss": 1.4542, "step": 4267 }, { "epoch": 5.944289693593315, "grad_norm": 3.4089934825897217, "learning_rate": 2.8971006253553154e-05, "loss": 1.3729, "step": 4268 }, { "epoch": 5.945682451253482, "grad_norm": 3.4364445209503174, "learning_rate": 2.896105741898806e-05, "loss": 1.6305, "step": 4269 }, { "epoch": 5.947075208913649, "grad_norm": 2.60617995262146, "learning_rate": 2.8951108584422965e-05, "loss": 1.1789, "step": 4270 }, { "epoch": 5.9484679665738165, "grad_norm": 3.87069034576416, "learning_rate": 2.8941159749857873e-05, "loss": 1.3996, "step": 4271 }, { "epoch": 5.949860724233983, "grad_norm": 2.914503335952759, "learning_rate": 2.8931210915292777e-05, "loss": 1.1826, "step": 4272 }, { "epoch": 5.951253481894151, "grad_norm": 3.2550365924835205, "learning_rate": 2.8921262080727684e-05, "loss": 1.5075, "step": 4273 }, { "epoch": 5.952646239554317, "grad_norm": 3.322737693786621, "learning_rate": 2.891131324616259e-05, "loss": 1.4445, "step": 4274 }, { "epoch": 5.954038997214485, "grad_norm": 3.8737401962280273, "learning_rate": 2.8901364411597496e-05, "loss": 1.2271, "step": 4275 }, { "epoch": 5.955431754874652, "grad_norm": 2.9896321296691895, "learning_rate": 2.8891415577032403e-05, "loss": 1.2047, "step": 4276 }, { "epoch": 5.956824512534819, "grad_norm": 3.766829490661621, "learning_rate": 2.8881466742467307e-05, "loss": 1.1086, "step": 4277 }, { "epoch": 5.958217270194986, "grad_norm": 3.792320728302002, "learning_rate": 2.8871517907902215e-05, "loss": 1.1187, "step": 4278 }, { "epoch": 5.959610027855153, "grad_norm": 3.7886745929718018, "learning_rate": 2.886156907333712e-05, "loss": 1.1448, "step": 4279 }, { "epoch": 5.96100278551532, "grad_norm": 3.2813446521759033, "learning_rate": 2.8851620238772026e-05, "loss": 1.2934, "step": 4280 }, { "epoch": 5.962395543175488, "grad_norm": 3.670713186264038, "learning_rate": 2.8841671404206934e-05, "loss": 1.504, "step": 4281 }, { "epoch": 5.963788300835654, "grad_norm": 3.573119640350342, "learning_rate": 2.8831722569641838e-05, "loss": 1.1831, "step": 4282 }, { "epoch": 5.965181058495822, "grad_norm": 3.8057022094726562, "learning_rate": 2.8821773735076745e-05, "loss": 1.5948, "step": 4283 }, { "epoch": 5.9665738161559885, "grad_norm": 3.496189594268799, "learning_rate": 2.881182490051165e-05, "loss": 1.3005, "step": 4284 }, { "epoch": 5.967966573816156, "grad_norm": 3.821584463119507, "learning_rate": 2.8801876065946557e-05, "loss": 1.4228, "step": 4285 }, { "epoch": 5.969359331476323, "grad_norm": 3.7349889278411865, "learning_rate": 2.8791927231381464e-05, "loss": 1.4551, "step": 4286 }, { "epoch": 5.97075208913649, "grad_norm": 3.0262346267700195, "learning_rate": 2.878197839681637e-05, "loss": 1.0351, "step": 4287 }, { "epoch": 5.972144846796658, "grad_norm": 3.5886991024017334, "learning_rate": 2.8772029562251276e-05, "loss": 1.5945, "step": 4288 }, { "epoch": 5.9735376044568245, "grad_norm": 4.177419185638428, "learning_rate": 2.876208072768618e-05, "loss": 1.4383, "step": 4289 }, { "epoch": 5.974930362116992, "grad_norm": 3.3451030254364014, "learning_rate": 2.875213189312109e-05, "loss": 1.138, "step": 4290 }, { "epoch": 5.976323119777159, "grad_norm": 3.395125389099121, "learning_rate": 2.8742183058556e-05, "loss": 1.4852, "step": 4291 }, { "epoch": 5.977715877437326, "grad_norm": 3.767486095428467, "learning_rate": 2.8732234223990902e-05, "loss": 1.5145, "step": 4292 }, { "epoch": 5.979108635097493, "grad_norm": 3.584868907928467, "learning_rate": 2.872228538942581e-05, "loss": 1.5485, "step": 4293 }, { "epoch": 5.9805013927576605, "grad_norm": 3.1592671871185303, "learning_rate": 2.8712336554860714e-05, "loss": 1.3211, "step": 4294 }, { "epoch": 5.981894150417827, "grad_norm": 7.4499831199646, "learning_rate": 2.870238772029562e-05, "loss": 1.4428, "step": 4295 }, { "epoch": 5.983286908077995, "grad_norm": 3.348339796066284, "learning_rate": 2.8692438885730526e-05, "loss": 1.1395, "step": 4296 }, { "epoch": 5.984679665738161, "grad_norm": 3.1295952796936035, "learning_rate": 2.8682490051165433e-05, "loss": 1.4351, "step": 4297 }, { "epoch": 5.986072423398329, "grad_norm": 3.1701250076293945, "learning_rate": 2.867254121660034e-05, "loss": 1.37, "step": 4298 }, { "epoch": 5.987465181058496, "grad_norm": 3.4694182872772217, "learning_rate": 2.8662592382035245e-05, "loss": 1.3412, "step": 4299 }, { "epoch": 5.988857938718663, "grad_norm": 3.265918254852295, "learning_rate": 2.8652643547470152e-05, "loss": 1.3245, "step": 4300 }, { "epoch": 5.99025069637883, "grad_norm": 3.54642653465271, "learning_rate": 2.8642694712905056e-05, "loss": 1.3947, "step": 4301 }, { "epoch": 5.991643454038997, "grad_norm": 3.4953207969665527, "learning_rate": 2.8632745878339964e-05, "loss": 1.2307, "step": 4302 }, { "epoch": 5.993036211699164, "grad_norm": 3.104365825653076, "learning_rate": 2.862279704377487e-05, "loss": 1.2262, "step": 4303 }, { "epoch": 5.994428969359332, "grad_norm": 3.410261869430542, "learning_rate": 2.8612848209209775e-05, "loss": 1.2567, "step": 4304 }, { "epoch": 5.995821727019498, "grad_norm": 3.7561678886413574, "learning_rate": 2.8602899374644683e-05, "loss": 1.5103, "step": 4305 }, { "epoch": 5.997214484679666, "grad_norm": 2.7139320373535156, "learning_rate": 2.8592950540079587e-05, "loss": 1.003, "step": 4306 }, { "epoch": 5.998607242339833, "grad_norm": 3.683227300643921, "learning_rate": 2.8583001705514494e-05, "loss": 1.171, "step": 4307 }, { "epoch": 6.0, "grad_norm": 2.929830312728882, "learning_rate": 2.85730528709494e-05, "loss": 1.0885, "step": 4308 }, { "epoch": 6.0013927576601676, "grad_norm": 3.2844910621643066, "learning_rate": 2.8563104036384306e-05, "loss": 1.0426, "step": 4309 }, { "epoch": 6.002785515320334, "grad_norm": 3.4652326107025146, "learning_rate": 2.8553155201819213e-05, "loss": 1.3491, "step": 4310 }, { "epoch": 6.004178272980502, "grad_norm": 3.4769561290740967, "learning_rate": 2.8543206367254117e-05, "loss": 1.3252, "step": 4311 }, { "epoch": 6.005571030640668, "grad_norm": 3.0269827842712402, "learning_rate": 2.8533257532689025e-05, "loss": 1.2031, "step": 4312 }, { "epoch": 6.006963788300836, "grad_norm": 2.7160186767578125, "learning_rate": 2.8523308698123932e-05, "loss": 0.8197, "step": 4313 }, { "epoch": 6.008356545961003, "grad_norm": 3.1668927669525146, "learning_rate": 2.8513359863558836e-05, "loss": 1.1986, "step": 4314 }, { "epoch": 6.00974930362117, "grad_norm": 3.0711967945098877, "learning_rate": 2.8503411028993744e-05, "loss": 1.206, "step": 4315 }, { "epoch": 6.011142061281337, "grad_norm": 3.2880496978759766, "learning_rate": 2.8493462194428648e-05, "loss": 1.4118, "step": 4316 }, { "epoch": 6.012534818941504, "grad_norm": 2.922769069671631, "learning_rate": 2.8483513359863555e-05, "loss": 1.1758, "step": 4317 }, { "epoch": 6.013927576601671, "grad_norm": 3.00602650642395, "learning_rate": 2.847356452529846e-05, "loss": 1.2837, "step": 4318 }, { "epoch": 6.015320334261839, "grad_norm": 3.077406644821167, "learning_rate": 2.846361569073337e-05, "loss": 1.383, "step": 4319 }, { "epoch": 6.016713091922005, "grad_norm": 3.1378235816955566, "learning_rate": 2.8453666856168278e-05, "loss": 0.9669, "step": 4320 }, { "epoch": 6.018105849582173, "grad_norm": 2.8575522899627686, "learning_rate": 2.844371802160318e-05, "loss": 1.0698, "step": 4321 }, { "epoch": 6.0194986072423395, "grad_norm": 2.893104314804077, "learning_rate": 2.843376918703809e-05, "loss": 1.2204, "step": 4322 }, { "epoch": 6.020891364902507, "grad_norm": 3.258876085281372, "learning_rate": 2.8423820352472993e-05, "loss": 1.1175, "step": 4323 }, { "epoch": 6.022284122562674, "grad_norm": 2.6355395317077637, "learning_rate": 2.84138715179079e-05, "loss": 0.7807, "step": 4324 }, { "epoch": 6.023676880222841, "grad_norm": 2.9973530769348145, "learning_rate": 2.8403922683342808e-05, "loss": 1.0241, "step": 4325 }, { "epoch": 6.025069637883008, "grad_norm": 3.0352323055267334, "learning_rate": 2.8393973848777712e-05, "loss": 1.1441, "step": 4326 }, { "epoch": 6.0264623955431755, "grad_norm": 3.1082520484924316, "learning_rate": 2.838402501421262e-05, "loss": 1.0703, "step": 4327 }, { "epoch": 6.027855153203342, "grad_norm": 3.024894952774048, "learning_rate": 2.8374076179647524e-05, "loss": 1.0597, "step": 4328 }, { "epoch": 6.02924791086351, "grad_norm": 3.3348255157470703, "learning_rate": 2.836412734508243e-05, "loss": 1.3704, "step": 4329 }, { "epoch": 6.030640668523677, "grad_norm": 2.8505239486694336, "learning_rate": 2.835417851051734e-05, "loss": 0.9118, "step": 4330 }, { "epoch": 6.032033426183844, "grad_norm": 2.8822379112243652, "learning_rate": 2.8344229675952243e-05, "loss": 1.0973, "step": 4331 }, { "epoch": 6.0334261838440115, "grad_norm": 3.1755788326263428, "learning_rate": 2.833428084138715e-05, "loss": 0.9507, "step": 4332 }, { "epoch": 6.034818941504178, "grad_norm": 2.5899782180786133, "learning_rate": 2.8324332006822054e-05, "loss": 0.9368, "step": 4333 }, { "epoch": 6.036211699164346, "grad_norm": 2.606720209121704, "learning_rate": 2.8314383172256962e-05, "loss": 0.9887, "step": 4334 }, { "epoch": 6.037604456824512, "grad_norm": 2.9312844276428223, "learning_rate": 2.830443433769187e-05, "loss": 1.0675, "step": 4335 }, { "epoch": 6.03899721448468, "grad_norm": 4.057625770568848, "learning_rate": 2.8294485503126773e-05, "loss": 1.2126, "step": 4336 }, { "epoch": 6.040389972144847, "grad_norm": 2.791701316833496, "learning_rate": 2.828453666856168e-05, "loss": 1.1019, "step": 4337 }, { "epoch": 6.041782729805014, "grad_norm": 2.6868085861206055, "learning_rate": 2.8274587833996585e-05, "loss": 0.8865, "step": 4338 }, { "epoch": 6.043175487465181, "grad_norm": 3.106109142303467, "learning_rate": 2.8264638999431492e-05, "loss": 1.1152, "step": 4339 }, { "epoch": 6.044568245125348, "grad_norm": 3.098327875137329, "learning_rate": 2.8254690164866396e-05, "loss": 1.3081, "step": 4340 }, { "epoch": 6.045961002785515, "grad_norm": 2.9800162315368652, "learning_rate": 2.8244741330301304e-05, "loss": 1.0212, "step": 4341 }, { "epoch": 6.047353760445683, "grad_norm": 2.679067850112915, "learning_rate": 2.823479249573621e-05, "loss": 0.9206, "step": 4342 }, { "epoch": 6.048746518105849, "grad_norm": 2.7627615928649902, "learning_rate": 2.8224843661171115e-05, "loss": 0.9192, "step": 4343 }, { "epoch": 6.050139275766017, "grad_norm": 2.181574821472168, "learning_rate": 2.8214894826606023e-05, "loss": 0.7656, "step": 4344 }, { "epoch": 6.0515320334261835, "grad_norm": 3.5677804946899414, "learning_rate": 2.8204945992040927e-05, "loss": 0.9876, "step": 4345 }, { "epoch": 6.052924791086351, "grad_norm": 3.1361148357391357, "learning_rate": 2.8194997157475834e-05, "loss": 1.1568, "step": 4346 }, { "epoch": 6.054317548746518, "grad_norm": 2.8180770874023438, "learning_rate": 2.8185048322910745e-05, "loss": 0.9822, "step": 4347 }, { "epoch": 6.055710306406685, "grad_norm": 2.7147934436798096, "learning_rate": 2.817509948834565e-05, "loss": 0.9644, "step": 4348 }, { "epoch": 6.057103064066852, "grad_norm": 3.5782716274261475, "learning_rate": 2.8165150653780557e-05, "loss": 1.0967, "step": 4349 }, { "epoch": 6.0584958217270195, "grad_norm": 3.360884189605713, "learning_rate": 2.815520181921546e-05, "loss": 0.8963, "step": 4350 }, { "epoch": 6.059888579387187, "grad_norm": 2.692962169647217, "learning_rate": 2.8145252984650368e-05, "loss": 1.2067, "step": 4351 }, { "epoch": 6.061281337047354, "grad_norm": 3.3448727130889893, "learning_rate": 2.8135304150085276e-05, "loss": 1.1568, "step": 4352 }, { "epoch": 6.062674094707521, "grad_norm": 3.0642430782318115, "learning_rate": 2.812535531552018e-05, "loss": 0.9382, "step": 4353 }, { "epoch": 6.064066852367688, "grad_norm": 3.427797555923462, "learning_rate": 2.8115406480955087e-05, "loss": 1.0851, "step": 4354 }, { "epoch": 6.0654596100278555, "grad_norm": 3.3021860122680664, "learning_rate": 2.810545764638999e-05, "loss": 0.9075, "step": 4355 }, { "epoch": 6.066852367688022, "grad_norm": 3.595940113067627, "learning_rate": 2.80955088118249e-05, "loss": 1.2991, "step": 4356 }, { "epoch": 6.06824512534819, "grad_norm": 3.1379806995391846, "learning_rate": 2.8085559977259806e-05, "loss": 1.1563, "step": 4357 }, { "epoch": 6.069637883008356, "grad_norm": 3.217236280441284, "learning_rate": 2.807561114269471e-05, "loss": 1.3122, "step": 4358 }, { "epoch": 6.071030640668524, "grad_norm": 3.2345099449157715, "learning_rate": 2.8065662308129618e-05, "loss": 1.1902, "step": 4359 }, { "epoch": 6.072423398328691, "grad_norm": 2.8844897747039795, "learning_rate": 2.8055713473564522e-05, "loss": 1.1736, "step": 4360 }, { "epoch": 6.073816155988858, "grad_norm": 2.722585439682007, "learning_rate": 2.804576463899943e-05, "loss": 1.079, "step": 4361 }, { "epoch": 6.075208913649025, "grad_norm": 3.402970790863037, "learning_rate": 2.8035815804434333e-05, "loss": 1.3633, "step": 4362 }, { "epoch": 6.076601671309192, "grad_norm": 3.323962450027466, "learning_rate": 2.802586696986924e-05, "loss": 1.2016, "step": 4363 }, { "epoch": 6.077994428969359, "grad_norm": 2.6824100017547607, "learning_rate": 2.801591813530415e-05, "loss": 1.1126, "step": 4364 }, { "epoch": 6.079387186629527, "grad_norm": 2.8601081371307373, "learning_rate": 2.8005969300739052e-05, "loss": 1.1906, "step": 4365 }, { "epoch": 6.080779944289693, "grad_norm": 3.135958194732666, "learning_rate": 2.799602046617396e-05, "loss": 1.0786, "step": 4366 }, { "epoch": 6.082172701949861, "grad_norm": 2.9230966567993164, "learning_rate": 2.7986071631608864e-05, "loss": 1.1039, "step": 4367 }, { "epoch": 6.0835654596100275, "grad_norm": 3.0857913494110107, "learning_rate": 2.797612279704377e-05, "loss": 1.0456, "step": 4368 }, { "epoch": 6.084958217270195, "grad_norm": 3.4706621170043945, "learning_rate": 2.796617396247868e-05, "loss": 1.2033, "step": 4369 }, { "epoch": 6.086350974930362, "grad_norm": 2.7866897583007812, "learning_rate": 2.7956225127913583e-05, "loss": 0.7676, "step": 4370 }, { "epoch": 6.087743732590529, "grad_norm": 3.1143743991851807, "learning_rate": 2.794627629334849e-05, "loss": 1.2894, "step": 4371 }, { "epoch": 6.089136490250697, "grad_norm": 2.9497530460357666, "learning_rate": 2.7936327458783394e-05, "loss": 0.8985, "step": 4372 }, { "epoch": 6.0905292479108635, "grad_norm": 2.946164131164551, "learning_rate": 2.7926378624218302e-05, "loss": 1.0038, "step": 4373 }, { "epoch": 6.091922005571031, "grad_norm": 3.4043126106262207, "learning_rate": 2.7916429789653213e-05, "loss": 1.2819, "step": 4374 }, { "epoch": 6.093314763231198, "grad_norm": 3.185086965560913, "learning_rate": 2.7906480955088117e-05, "loss": 1.3676, "step": 4375 }, { "epoch": 6.094707520891365, "grad_norm": 3.4986045360565186, "learning_rate": 2.7896532120523024e-05, "loss": 1.0832, "step": 4376 }, { "epoch": 6.096100278551532, "grad_norm": 4.043941497802734, "learning_rate": 2.788658328595793e-05, "loss": 1.059, "step": 4377 }, { "epoch": 6.0974930362116995, "grad_norm": 3.022905111312866, "learning_rate": 2.7876634451392836e-05, "loss": 1.1137, "step": 4378 }, { "epoch": 6.098885793871866, "grad_norm": 2.5002658367156982, "learning_rate": 2.7866685616827743e-05, "loss": 0.9002, "step": 4379 }, { "epoch": 6.100278551532034, "grad_norm": 3.5448226928710938, "learning_rate": 2.7856736782262647e-05, "loss": 1.2442, "step": 4380 }, { "epoch": 6.1016713091922, "grad_norm": 2.954012870788574, "learning_rate": 2.7846787947697555e-05, "loss": 0.9172, "step": 4381 }, { "epoch": 6.103064066852368, "grad_norm": 3.0760223865509033, "learning_rate": 2.783683911313246e-05, "loss": 1.0496, "step": 4382 }, { "epoch": 6.104456824512535, "grad_norm": 2.6588072776794434, "learning_rate": 2.7826890278567366e-05, "loss": 1.0475, "step": 4383 }, { "epoch": 6.105849582172702, "grad_norm": 3.1436309814453125, "learning_rate": 2.781694144400227e-05, "loss": 1.0528, "step": 4384 }, { "epoch": 6.107242339832869, "grad_norm": 2.879953622817993, "learning_rate": 2.7806992609437178e-05, "loss": 1.0093, "step": 4385 }, { "epoch": 6.108635097493036, "grad_norm": 3.093233346939087, "learning_rate": 2.7797043774872085e-05, "loss": 1.0947, "step": 4386 }, { "epoch": 6.110027855153203, "grad_norm": 2.9168739318847656, "learning_rate": 2.778709494030699e-05, "loss": 1.1076, "step": 4387 }, { "epoch": 6.111420612813371, "grad_norm": 3.2096879482269287, "learning_rate": 2.7777146105741897e-05, "loss": 1.0626, "step": 4388 }, { "epoch": 6.112813370473537, "grad_norm": 2.9247891902923584, "learning_rate": 2.77671972711768e-05, "loss": 1.0168, "step": 4389 }, { "epoch": 6.114206128133705, "grad_norm": 3.6016478538513184, "learning_rate": 2.775724843661171e-05, "loss": 1.3709, "step": 4390 }, { "epoch": 6.1155988857938715, "grad_norm": 2.836681604385376, "learning_rate": 2.7747299602046616e-05, "loss": 1.0959, "step": 4391 }, { "epoch": 6.116991643454039, "grad_norm": 2.8651485443115234, "learning_rate": 2.773735076748152e-05, "loss": 1.1398, "step": 4392 }, { "epoch": 6.118384401114207, "grad_norm": 2.827662706375122, "learning_rate": 2.7727401932916427e-05, "loss": 1.0757, "step": 4393 }, { "epoch": 6.119777158774373, "grad_norm": 2.425075054168701, "learning_rate": 2.771745309835133e-05, "loss": 0.7563, "step": 4394 }, { "epoch": 6.121169916434541, "grad_norm": 2.821561098098755, "learning_rate": 2.770750426378624e-05, "loss": 0.8801, "step": 4395 }, { "epoch": 6.1225626740947074, "grad_norm": 2.993879795074463, "learning_rate": 2.7697555429221146e-05, "loss": 0.8296, "step": 4396 }, { "epoch": 6.123955431754875, "grad_norm": 3.3326897621154785, "learning_rate": 2.768760659465605e-05, "loss": 1.1056, "step": 4397 }, { "epoch": 6.125348189415042, "grad_norm": 4.613895416259766, "learning_rate": 2.7677657760090958e-05, "loss": 1.1644, "step": 4398 }, { "epoch": 6.126740947075209, "grad_norm": 3.4767098426818848, "learning_rate": 2.7667708925525862e-05, "loss": 1.2945, "step": 4399 }, { "epoch": 6.128133704735376, "grad_norm": 3.333099842071533, "learning_rate": 2.765776009096077e-05, "loss": 1.1153, "step": 4400 }, { "epoch": 6.129526462395543, "grad_norm": 2.639409065246582, "learning_rate": 2.764781125639568e-05, "loss": 0.9969, "step": 4401 }, { "epoch": 6.13091922005571, "grad_norm": 2.9064671993255615, "learning_rate": 2.763786242183058e-05, "loss": 0.9964, "step": 4402 }, { "epoch": 6.132311977715878, "grad_norm": 2.537750482559204, "learning_rate": 2.7627913587265492e-05, "loss": 0.913, "step": 4403 }, { "epoch": 6.133704735376044, "grad_norm": 2.829019546508789, "learning_rate": 2.7617964752700396e-05, "loss": 0.9105, "step": 4404 }, { "epoch": 6.135097493036212, "grad_norm": 3.537351369857788, "learning_rate": 2.7608015918135303e-05, "loss": 1.0077, "step": 4405 }, { "epoch": 6.1364902506963785, "grad_norm": 2.815483570098877, "learning_rate": 2.7598067083570208e-05, "loss": 1.0808, "step": 4406 }, { "epoch": 6.137883008356546, "grad_norm": 3.0557570457458496, "learning_rate": 2.7588118249005115e-05, "loss": 0.9736, "step": 4407 }, { "epoch": 6.139275766016713, "grad_norm": 2.9972572326660156, "learning_rate": 2.7578169414440022e-05, "loss": 1.1775, "step": 4408 }, { "epoch": 6.14066852367688, "grad_norm": 3.338998794555664, "learning_rate": 2.7568220579874927e-05, "loss": 1.2179, "step": 4409 }, { "epoch": 6.142061281337047, "grad_norm": 3.2950048446655273, "learning_rate": 2.7558271745309834e-05, "loss": 1.2744, "step": 4410 }, { "epoch": 6.1434540389972145, "grad_norm": 3.7593259811401367, "learning_rate": 2.7548322910744738e-05, "loss": 1.0514, "step": 4411 }, { "epoch": 6.144846796657381, "grad_norm": 3.059964656829834, "learning_rate": 2.7538374076179646e-05, "loss": 1.1883, "step": 4412 }, { "epoch": 6.146239554317549, "grad_norm": 2.710592269897461, "learning_rate": 2.7528425241614553e-05, "loss": 0.8026, "step": 4413 }, { "epoch": 6.147632311977716, "grad_norm": 3.130387544631958, "learning_rate": 2.7518476407049457e-05, "loss": 1.2624, "step": 4414 }, { "epoch": 6.149025069637883, "grad_norm": 3.216870069503784, "learning_rate": 2.7508527572484365e-05, "loss": 1.0916, "step": 4415 }, { "epoch": 6.1504178272980505, "grad_norm": 2.8923614025115967, "learning_rate": 2.749857873791927e-05, "loss": 1.0776, "step": 4416 }, { "epoch": 6.151810584958217, "grad_norm": 2.855976104736328, "learning_rate": 2.7488629903354176e-05, "loss": 1.0076, "step": 4417 }, { "epoch": 6.153203342618385, "grad_norm": 3.043898105621338, "learning_rate": 2.7478681068789084e-05, "loss": 0.9476, "step": 4418 }, { "epoch": 6.154596100278551, "grad_norm": 3.2427806854248047, "learning_rate": 2.7468732234223988e-05, "loss": 1.1407, "step": 4419 }, { "epoch": 6.155988857938719, "grad_norm": 2.4030675888061523, "learning_rate": 2.7458783399658895e-05, "loss": 0.8496, "step": 4420 }, { "epoch": 6.157381615598886, "grad_norm": 2.9312615394592285, "learning_rate": 2.74488345650938e-05, "loss": 1.0472, "step": 4421 }, { "epoch": 6.158774373259053, "grad_norm": 2.7273006439208984, "learning_rate": 2.7438885730528707e-05, "loss": 1.0319, "step": 4422 }, { "epoch": 6.16016713091922, "grad_norm": 2.653557062149048, "learning_rate": 2.7428936895963614e-05, "loss": 1.0142, "step": 4423 }, { "epoch": 6.161559888579387, "grad_norm": 2.500417947769165, "learning_rate": 2.7418988061398518e-05, "loss": 0.7894, "step": 4424 }, { "epoch": 6.162952646239554, "grad_norm": 3.8580479621887207, "learning_rate": 2.7409039226833426e-05, "loss": 0.9945, "step": 4425 }, { "epoch": 6.164345403899722, "grad_norm": 2.5263960361480713, "learning_rate": 2.739909039226833e-05, "loss": 0.7399, "step": 4426 }, { "epoch": 6.165738161559888, "grad_norm": 3.226283073425293, "learning_rate": 2.7389141557703237e-05, "loss": 1.0198, "step": 4427 }, { "epoch": 6.167130919220056, "grad_norm": 3.1259124279022217, "learning_rate": 2.737919272313814e-05, "loss": 1.0195, "step": 4428 }, { "epoch": 6.1685236768802225, "grad_norm": 2.9556758403778076, "learning_rate": 2.736924388857305e-05, "loss": 0.9375, "step": 4429 }, { "epoch": 6.16991643454039, "grad_norm": 3.263813018798828, "learning_rate": 2.735929505400796e-05, "loss": 1.273, "step": 4430 }, { "epoch": 6.171309192200557, "grad_norm": 2.9776573181152344, "learning_rate": 2.7349346219442864e-05, "loss": 1.2021, "step": 4431 }, { "epoch": 6.172701949860724, "grad_norm": 2.862586498260498, "learning_rate": 2.733939738487777e-05, "loss": 1.0884, "step": 4432 }, { "epoch": 6.174094707520891, "grad_norm": 3.386265993118286, "learning_rate": 2.7329448550312675e-05, "loss": 1.2135, "step": 4433 }, { "epoch": 6.1754874651810585, "grad_norm": 3.206834316253662, "learning_rate": 2.7319499715747583e-05, "loss": 1.1536, "step": 4434 }, { "epoch": 6.176880222841225, "grad_norm": 3.6165032386779785, "learning_rate": 2.730955088118249e-05, "loss": 1.3512, "step": 4435 }, { "epoch": 6.178272980501393, "grad_norm": 2.75239634513855, "learning_rate": 2.7299602046617394e-05, "loss": 1.1119, "step": 4436 }, { "epoch": 6.17966573816156, "grad_norm": 3.360097646713257, "learning_rate": 2.72896532120523e-05, "loss": 1.3204, "step": 4437 }, { "epoch": 6.181058495821727, "grad_norm": 3.0297152996063232, "learning_rate": 2.7279704377487206e-05, "loss": 0.9583, "step": 4438 }, { "epoch": 6.1824512534818945, "grad_norm": 3.2654929161071777, "learning_rate": 2.7269755542922113e-05, "loss": 1.0884, "step": 4439 }, { "epoch": 6.183844011142061, "grad_norm": 3.2654433250427246, "learning_rate": 2.725980670835702e-05, "loss": 1.0021, "step": 4440 }, { "epoch": 6.185236768802229, "grad_norm": 2.9692389965057373, "learning_rate": 2.7249857873791925e-05, "loss": 1.0119, "step": 4441 }, { "epoch": 6.186629526462395, "grad_norm": 3.0016331672668457, "learning_rate": 2.7239909039226832e-05, "loss": 1.0429, "step": 4442 }, { "epoch": 6.188022284122563, "grad_norm": 3.1427507400512695, "learning_rate": 2.7229960204661736e-05, "loss": 1.121, "step": 4443 }, { "epoch": 6.18941504178273, "grad_norm": 3.3997702598571777, "learning_rate": 2.7220011370096644e-05, "loss": 1.3677, "step": 4444 }, { "epoch": 6.190807799442897, "grad_norm": 2.856048583984375, "learning_rate": 2.7210062535531548e-05, "loss": 1.1527, "step": 4445 }, { "epoch": 6.192200557103064, "grad_norm": 2.6541314125061035, "learning_rate": 2.7200113700966455e-05, "loss": 0.9218, "step": 4446 }, { "epoch": 6.193593314763231, "grad_norm": 3.25169038772583, "learning_rate": 2.7190164866401363e-05, "loss": 1.3243, "step": 4447 }, { "epoch": 6.194986072423398, "grad_norm": 2.727038860321045, "learning_rate": 2.7180216031836267e-05, "loss": 1.0709, "step": 4448 }, { "epoch": 6.196378830083566, "grad_norm": 3.0074374675750732, "learning_rate": 2.7170267197271174e-05, "loss": 1.136, "step": 4449 }, { "epoch": 6.197771587743732, "grad_norm": 3.121025800704956, "learning_rate": 2.716031836270608e-05, "loss": 1.2403, "step": 4450 }, { "epoch": 6.1991643454039, "grad_norm": 3.5447516441345215, "learning_rate": 2.7150369528140986e-05, "loss": 1.0727, "step": 4451 }, { "epoch": 6.2005571030640665, "grad_norm": 3.221599578857422, "learning_rate": 2.7140420693575893e-05, "loss": 1.1793, "step": 4452 }, { "epoch": 6.201949860724234, "grad_norm": 2.852677345275879, "learning_rate": 2.7130471859010797e-05, "loss": 1.1591, "step": 4453 }, { "epoch": 6.203342618384401, "grad_norm": 2.823099136352539, "learning_rate": 2.7120523024445705e-05, "loss": 0.9864, "step": 4454 }, { "epoch": 6.204735376044568, "grad_norm": 2.9134204387664795, "learning_rate": 2.711057418988061e-05, "loss": 1.1051, "step": 4455 }, { "epoch": 6.206128133704736, "grad_norm": 3.437868356704712, "learning_rate": 2.7100625355315516e-05, "loss": 1.1092, "step": 4456 }, { "epoch": 6.2075208913649025, "grad_norm": 3.328800916671753, "learning_rate": 2.7090676520750427e-05, "loss": 1.1272, "step": 4457 }, { "epoch": 6.20891364902507, "grad_norm": 3.6444592475891113, "learning_rate": 2.7080727686185328e-05, "loss": 1.4687, "step": 4458 }, { "epoch": 6.210306406685237, "grad_norm": 3.0198705196380615, "learning_rate": 2.707077885162024e-05, "loss": 1.0773, "step": 4459 }, { "epoch": 6.211699164345404, "grad_norm": 3.073758363723755, "learning_rate": 2.7060830017055143e-05, "loss": 1.2888, "step": 4460 }, { "epoch": 6.213091922005571, "grad_norm": 2.8396661281585693, "learning_rate": 2.705088118249005e-05, "loss": 1.0346, "step": 4461 }, { "epoch": 6.2144846796657385, "grad_norm": 2.5152876377105713, "learning_rate": 2.7040932347924958e-05, "loss": 0.8936, "step": 4462 }, { "epoch": 6.215877437325905, "grad_norm": 2.855208158493042, "learning_rate": 2.7030983513359862e-05, "loss": 0.9975, "step": 4463 }, { "epoch": 6.217270194986073, "grad_norm": 3.0980777740478516, "learning_rate": 2.702103467879477e-05, "loss": 1.1262, "step": 4464 }, { "epoch": 6.218662952646239, "grad_norm": 3.4908106327056885, "learning_rate": 2.7011085844229673e-05, "loss": 0.8788, "step": 4465 }, { "epoch": 6.220055710306407, "grad_norm": 2.9197728633880615, "learning_rate": 2.700113700966458e-05, "loss": 1.0582, "step": 4466 }, { "epoch": 6.221448467966574, "grad_norm": 2.9431533813476562, "learning_rate": 2.6991188175099485e-05, "loss": 1.3294, "step": 4467 }, { "epoch": 6.222841225626741, "grad_norm": 3.098060369491577, "learning_rate": 2.6981239340534392e-05, "loss": 0.9908, "step": 4468 }, { "epoch": 6.224233983286908, "grad_norm": 2.678053855895996, "learning_rate": 2.69712905059693e-05, "loss": 0.9772, "step": 4469 }, { "epoch": 6.225626740947075, "grad_norm": 4.149014949798584, "learning_rate": 2.6961341671404204e-05, "loss": 0.9701, "step": 4470 }, { "epoch": 6.227019498607242, "grad_norm": 3.3159608840942383, "learning_rate": 2.695139283683911e-05, "loss": 1.1843, "step": 4471 }, { "epoch": 6.22841225626741, "grad_norm": 2.8942387104034424, "learning_rate": 2.6941444002274015e-05, "loss": 0.9166, "step": 4472 }, { "epoch": 6.229805013927576, "grad_norm": 3.1490697860717773, "learning_rate": 2.6931495167708923e-05, "loss": 1.087, "step": 4473 }, { "epoch": 6.231197771587744, "grad_norm": 3.0416064262390137, "learning_rate": 2.692154633314383e-05, "loss": 1.069, "step": 4474 }, { "epoch": 6.2325905292479105, "grad_norm": 3.295464277267456, "learning_rate": 2.6911597498578734e-05, "loss": 1.0931, "step": 4475 }, { "epoch": 6.233983286908078, "grad_norm": 2.9210784435272217, "learning_rate": 2.6901648664013642e-05, "loss": 0.8531, "step": 4476 }, { "epoch": 6.235376044568245, "grad_norm": 3.215038537979126, "learning_rate": 2.6891699829448546e-05, "loss": 1.1697, "step": 4477 }, { "epoch": 6.236768802228412, "grad_norm": 2.8260319232940674, "learning_rate": 2.6881750994883453e-05, "loss": 1.1174, "step": 4478 }, { "epoch": 6.23816155988858, "grad_norm": 2.42219614982605, "learning_rate": 2.687180216031836e-05, "loss": 0.8495, "step": 4479 }, { "epoch": 6.2395543175487465, "grad_norm": 3.514204740524292, "learning_rate": 2.6861853325753265e-05, "loss": 1.2002, "step": 4480 }, { "epoch": 6.240947075208914, "grad_norm": 3.183506488800049, "learning_rate": 2.6851904491188172e-05, "loss": 1.0287, "step": 4481 }, { "epoch": 6.242339832869081, "grad_norm": 3.0259408950805664, "learning_rate": 2.6841955656623076e-05, "loss": 0.9779, "step": 4482 }, { "epoch": 6.243732590529248, "grad_norm": 3.318598747253418, "learning_rate": 2.6832006822057984e-05, "loss": 1.0195, "step": 4483 }, { "epoch": 6.245125348189415, "grad_norm": 2.9120140075683594, "learning_rate": 2.6822057987492895e-05, "loss": 1.1234, "step": 4484 }, { "epoch": 6.2465181058495824, "grad_norm": 3.182194709777832, "learning_rate": 2.6812109152927795e-05, "loss": 1.2401, "step": 4485 }, { "epoch": 6.247910863509749, "grad_norm": 2.682565450668335, "learning_rate": 2.6802160318362706e-05, "loss": 1.0734, "step": 4486 }, { "epoch": 6.249303621169917, "grad_norm": 3.1601169109344482, "learning_rate": 2.679221148379761e-05, "loss": 1.203, "step": 4487 }, { "epoch": 6.250696378830083, "grad_norm": 2.9164812564849854, "learning_rate": 2.6782262649232518e-05, "loss": 1.2156, "step": 4488 }, { "epoch": 6.252089136490251, "grad_norm": 2.9238650798797607, "learning_rate": 2.6772313814667422e-05, "loss": 1.0867, "step": 4489 }, { "epoch": 6.2534818941504176, "grad_norm": 2.9365594387054443, "learning_rate": 2.676236498010233e-05, "loss": 1.1165, "step": 4490 }, { "epoch": 6.254874651810585, "grad_norm": 2.6445999145507812, "learning_rate": 2.6752416145537237e-05, "loss": 1.0766, "step": 4491 }, { "epoch": 6.256267409470752, "grad_norm": 2.6067068576812744, "learning_rate": 2.674246731097214e-05, "loss": 1.0061, "step": 4492 }, { "epoch": 6.257660167130919, "grad_norm": 3.738860607147217, "learning_rate": 2.673251847640705e-05, "loss": 1.0554, "step": 4493 }, { "epoch": 6.259052924791086, "grad_norm": 3.5218794345855713, "learning_rate": 2.6722569641841952e-05, "loss": 1.1866, "step": 4494 }, { "epoch": 6.2604456824512535, "grad_norm": 2.8027846813201904, "learning_rate": 2.671262080727686e-05, "loss": 0.9919, "step": 4495 }, { "epoch": 6.26183844011142, "grad_norm": 2.9861958026885986, "learning_rate": 2.6702671972711767e-05, "loss": 1.1291, "step": 4496 }, { "epoch": 6.263231197771588, "grad_norm": 3.143234968185425, "learning_rate": 2.669272313814667e-05, "loss": 0.8558, "step": 4497 }, { "epoch": 6.264623955431755, "grad_norm": 2.6168835163116455, "learning_rate": 2.668277430358158e-05, "loss": 0.944, "step": 4498 }, { "epoch": 6.266016713091922, "grad_norm": 3.6969757080078125, "learning_rate": 2.6672825469016483e-05, "loss": 1.1644, "step": 4499 }, { "epoch": 6.2674094707520895, "grad_norm": 2.8025553226470947, "learning_rate": 2.666287663445139e-05, "loss": 1.009, "step": 4500 }, { "epoch": 6.268802228412256, "grad_norm": 2.835324764251709, "learning_rate": 2.6652927799886298e-05, "loss": 0.9676, "step": 4501 }, { "epoch": 6.270194986072424, "grad_norm": 3.0212371349334717, "learning_rate": 2.6642978965321202e-05, "loss": 1.0412, "step": 4502 }, { "epoch": 6.27158774373259, "grad_norm": 3.110295295715332, "learning_rate": 2.663303013075611e-05, "loss": 1.0001, "step": 4503 }, { "epoch": 6.272980501392758, "grad_norm": 3.0676543712615967, "learning_rate": 2.6623081296191014e-05, "loss": 0.9887, "step": 4504 }, { "epoch": 6.274373259052925, "grad_norm": 3.4942009449005127, "learning_rate": 2.661313246162592e-05, "loss": 1.2879, "step": 4505 }, { "epoch": 6.275766016713092, "grad_norm": 3.999800682067871, "learning_rate": 2.660318362706083e-05, "loss": 1.1081, "step": 4506 }, { "epoch": 6.277158774373259, "grad_norm": 6.410895824432373, "learning_rate": 2.6593234792495733e-05, "loss": 1.2086, "step": 4507 }, { "epoch": 6.278551532033426, "grad_norm": 2.805187463760376, "learning_rate": 2.658328595793064e-05, "loss": 1.0742, "step": 4508 }, { "epoch": 6.279944289693593, "grad_norm": 3.0566322803497314, "learning_rate": 2.6573337123365544e-05, "loss": 1.0853, "step": 4509 }, { "epoch": 6.281337047353761, "grad_norm": 2.8116209506988525, "learning_rate": 2.656338828880045e-05, "loss": 0.8955, "step": 4510 }, { "epoch": 6.282729805013927, "grad_norm": 2.5915589332580566, "learning_rate": 2.6553439454235356e-05, "loss": 0.7699, "step": 4511 }, { "epoch": 6.284122562674095, "grad_norm": 3.1562087535858154, "learning_rate": 2.6543490619670263e-05, "loss": 0.9947, "step": 4512 }, { "epoch": 6.2855153203342615, "grad_norm": 2.7000367641448975, "learning_rate": 2.6533541785105174e-05, "loss": 0.9771, "step": 4513 }, { "epoch": 6.286908077994429, "grad_norm": 2.8173441886901855, "learning_rate": 2.6523592950540075e-05, "loss": 1.0611, "step": 4514 }, { "epoch": 6.288300835654596, "grad_norm": 2.9903464317321777, "learning_rate": 2.6513644115974986e-05, "loss": 1.0874, "step": 4515 }, { "epoch": 6.289693593314763, "grad_norm": 3.273923397064209, "learning_rate": 2.650369528140989e-05, "loss": 1.0667, "step": 4516 }, { "epoch": 6.29108635097493, "grad_norm": 3.0306897163391113, "learning_rate": 2.6493746446844797e-05, "loss": 1.0308, "step": 4517 }, { "epoch": 6.2924791086350975, "grad_norm": 2.948537588119507, "learning_rate": 2.6483797612279705e-05, "loss": 0.9645, "step": 4518 }, { "epoch": 6.293871866295264, "grad_norm": 2.477851390838623, "learning_rate": 2.647384877771461e-05, "loss": 0.7948, "step": 4519 }, { "epoch": 6.295264623955432, "grad_norm": 2.6905126571655273, "learning_rate": 2.6463899943149516e-05, "loss": 0.9911, "step": 4520 }, { "epoch": 6.296657381615599, "grad_norm": 2.6656227111816406, "learning_rate": 2.645395110858442e-05, "loss": 0.9001, "step": 4521 }, { "epoch": 6.298050139275766, "grad_norm": 2.6179158687591553, "learning_rate": 2.6444002274019328e-05, "loss": 0.8578, "step": 4522 }, { "epoch": 6.2994428969359335, "grad_norm": 2.883927822113037, "learning_rate": 2.6434053439454235e-05, "loss": 1.0552, "step": 4523 }, { "epoch": 6.3008356545961, "grad_norm": 2.9616405963897705, "learning_rate": 2.642410460488914e-05, "loss": 1.0918, "step": 4524 }, { "epoch": 6.302228412256268, "grad_norm": 2.888624906539917, "learning_rate": 2.6414155770324047e-05, "loss": 1.2251, "step": 4525 }, { "epoch": 6.303621169916434, "grad_norm": 3.91120982170105, "learning_rate": 2.640420693575895e-05, "loss": 1.2118, "step": 4526 }, { "epoch": 6.305013927576602, "grad_norm": 2.8514249324798584, "learning_rate": 2.6394258101193858e-05, "loss": 0.9345, "step": 4527 }, { "epoch": 6.306406685236769, "grad_norm": 2.232351303100586, "learning_rate": 2.6384309266628766e-05, "loss": 0.7428, "step": 4528 }, { "epoch": 6.307799442896936, "grad_norm": 2.5607776641845703, "learning_rate": 2.637436043206367e-05, "loss": 0.7218, "step": 4529 }, { "epoch": 6.309192200557103, "grad_norm": 2.8364899158477783, "learning_rate": 2.6364411597498577e-05, "loss": 1.019, "step": 4530 }, { "epoch": 6.31058495821727, "grad_norm": 2.9029994010925293, "learning_rate": 2.635446276293348e-05, "loss": 1.183, "step": 4531 }, { "epoch": 6.311977715877437, "grad_norm": 3.020650863647461, "learning_rate": 2.634451392836839e-05, "loss": 1.0243, "step": 4532 }, { "epoch": 6.313370473537605, "grad_norm": 2.6905677318573, "learning_rate": 2.6334565093803293e-05, "loss": 0.8836, "step": 4533 }, { "epoch": 6.314763231197771, "grad_norm": 2.8317041397094727, "learning_rate": 2.63246162592382e-05, "loss": 0.9563, "step": 4534 }, { "epoch": 6.316155988857939, "grad_norm": 3.1123011112213135, "learning_rate": 2.6314667424673108e-05, "loss": 1.1499, "step": 4535 }, { "epoch": 6.3175487465181055, "grad_norm": 3.1473801136016846, "learning_rate": 2.6304718590108012e-05, "loss": 1.1845, "step": 4536 }, { "epoch": 6.318941504178273, "grad_norm": 2.918656349182129, "learning_rate": 2.629476975554292e-05, "loss": 1.2055, "step": 4537 }, { "epoch": 6.32033426183844, "grad_norm": 3.118304491043091, "learning_rate": 2.6284820920977823e-05, "loss": 1.3297, "step": 4538 }, { "epoch": 6.321727019498607, "grad_norm": 2.6895864009857178, "learning_rate": 2.627487208641273e-05, "loss": 1.184, "step": 4539 }, { "epoch": 6.323119777158775, "grad_norm": 2.6843533515930176, "learning_rate": 2.626492325184764e-05, "loss": 1.1, "step": 4540 }, { "epoch": 6.3245125348189415, "grad_norm": 3.0688438415527344, "learning_rate": 2.6254974417282542e-05, "loss": 1.1488, "step": 4541 }, { "epoch": 6.325905292479109, "grad_norm": 2.361597776412964, "learning_rate": 2.6245025582717453e-05, "loss": 0.8017, "step": 4542 }, { "epoch": 6.327298050139276, "grad_norm": 3.0336103439331055, "learning_rate": 2.6235076748152354e-05, "loss": 1.1591, "step": 4543 }, { "epoch": 6.328690807799443, "grad_norm": 2.713315010070801, "learning_rate": 2.6225127913587265e-05, "loss": 0.9068, "step": 4544 }, { "epoch": 6.33008356545961, "grad_norm": 2.8144984245300293, "learning_rate": 2.6215179079022172e-05, "loss": 1.0332, "step": 4545 }, { "epoch": 6.3314763231197775, "grad_norm": 3.7023849487304688, "learning_rate": 2.6205230244457076e-05, "loss": 1.1772, "step": 4546 }, { "epoch": 6.332869080779944, "grad_norm": 3.373838186264038, "learning_rate": 2.6195281409891984e-05, "loss": 1.203, "step": 4547 }, { "epoch": 6.334261838440112, "grad_norm": 2.8458609580993652, "learning_rate": 2.6185332575326888e-05, "loss": 1.1772, "step": 4548 }, { "epoch": 6.335654596100278, "grad_norm": 2.9250094890594482, "learning_rate": 2.6175383740761795e-05, "loss": 1.2326, "step": 4549 }, { "epoch": 6.337047353760446, "grad_norm": 2.676616907119751, "learning_rate": 2.6165434906196703e-05, "loss": 1.0522, "step": 4550 }, { "epoch": 6.338440111420613, "grad_norm": 3.5170352458953857, "learning_rate": 2.6155486071631607e-05, "loss": 0.9828, "step": 4551 }, { "epoch": 6.33983286908078, "grad_norm": 3.3684849739074707, "learning_rate": 2.6145537237066514e-05, "loss": 1.0967, "step": 4552 }, { "epoch": 6.341225626740947, "grad_norm": 2.9907102584838867, "learning_rate": 2.6135588402501418e-05, "loss": 1.0293, "step": 4553 }, { "epoch": 6.342618384401114, "grad_norm": 2.501798629760742, "learning_rate": 2.6125639567936326e-05, "loss": 0.9224, "step": 4554 }, { "epoch": 6.344011142061281, "grad_norm": 3.068803310394287, "learning_rate": 2.611569073337123e-05, "loss": 1.229, "step": 4555 }, { "epoch": 6.345403899721449, "grad_norm": 3.6298787593841553, "learning_rate": 2.6105741898806137e-05, "loss": 1.2226, "step": 4556 }, { "epoch": 6.346796657381615, "grad_norm": 4.111496448516846, "learning_rate": 2.6095793064241045e-05, "loss": 1.3388, "step": 4557 }, { "epoch": 6.348189415041783, "grad_norm": 2.6730756759643555, "learning_rate": 2.608584422967595e-05, "loss": 0.9996, "step": 4558 }, { "epoch": 6.3495821727019495, "grad_norm": 2.889697313308716, "learning_rate": 2.6075895395110856e-05, "loss": 1.0544, "step": 4559 }, { "epoch": 6.350974930362117, "grad_norm": 2.5355618000030518, "learning_rate": 2.606594656054576e-05, "loss": 0.9156, "step": 4560 }, { "epoch": 6.352367688022284, "grad_norm": 2.8138792514801025, "learning_rate": 2.6055997725980668e-05, "loss": 0.9829, "step": 4561 }, { "epoch": 6.353760445682451, "grad_norm": 3.2617111206054688, "learning_rate": 2.6046048891415575e-05, "loss": 1.1128, "step": 4562 }, { "epoch": 6.355153203342619, "grad_norm": 2.78334641456604, "learning_rate": 2.603610005685048e-05, "loss": 0.9547, "step": 4563 }, { "epoch": 6.3565459610027855, "grad_norm": 3.299154043197632, "learning_rate": 2.6026151222285387e-05, "loss": 1.1187, "step": 4564 }, { "epoch": 6.357938718662953, "grad_norm": 3.1545679569244385, "learning_rate": 2.601620238772029e-05, "loss": 0.9695, "step": 4565 }, { "epoch": 6.35933147632312, "grad_norm": 3.2818355560302734, "learning_rate": 2.60062535531552e-05, "loss": 1.1084, "step": 4566 }, { "epoch": 6.360724233983287, "grad_norm": 2.984417676925659, "learning_rate": 2.5996304718590106e-05, "loss": 0.9893, "step": 4567 }, { "epoch": 6.362116991643454, "grad_norm": 3.286147117614746, "learning_rate": 2.598635588402501e-05, "loss": 1.3968, "step": 4568 }, { "epoch": 6.3635097493036215, "grad_norm": 3.4263768196105957, "learning_rate": 2.597640704945992e-05, "loss": 1.2713, "step": 4569 }, { "epoch": 6.364902506963788, "grad_norm": 2.975414514541626, "learning_rate": 2.596645821489482e-05, "loss": 1.2003, "step": 4570 }, { "epoch": 6.366295264623956, "grad_norm": 2.4351494312286377, "learning_rate": 2.5956509380329732e-05, "loss": 0.9211, "step": 4571 }, { "epoch": 6.367688022284122, "grad_norm": 3.6765012741088867, "learning_rate": 2.594656054576464e-05, "loss": 1.0735, "step": 4572 }, { "epoch": 6.36908077994429, "grad_norm": 3.1842734813690186, "learning_rate": 2.5936611711199544e-05, "loss": 1.1938, "step": 4573 }, { "epoch": 6.370473537604457, "grad_norm": 3.16499400138855, "learning_rate": 2.592666287663445e-05, "loss": 1.3683, "step": 4574 }, { "epoch": 6.371866295264624, "grad_norm": 2.929141044616699, "learning_rate": 2.5916714042069355e-05, "loss": 1.0272, "step": 4575 }, { "epoch": 6.373259052924791, "grad_norm": 3.245147228240967, "learning_rate": 2.5906765207504263e-05, "loss": 1.1937, "step": 4576 }, { "epoch": 6.374651810584958, "grad_norm": 3.472449541091919, "learning_rate": 2.5896816372939167e-05, "loss": 1.3552, "step": 4577 }, { "epoch": 6.376044568245125, "grad_norm": 2.93967866897583, "learning_rate": 2.5886867538374074e-05, "loss": 0.9408, "step": 4578 }, { "epoch": 6.3774373259052926, "grad_norm": 3.1434576511383057, "learning_rate": 2.5876918703808982e-05, "loss": 1.1183, "step": 4579 }, { "epoch": 6.378830083565459, "grad_norm": 2.9085288047790527, "learning_rate": 2.5866969869243886e-05, "loss": 0.9078, "step": 4580 }, { "epoch": 6.380222841225627, "grad_norm": 4.131046772003174, "learning_rate": 2.5857021034678793e-05, "loss": 1.1996, "step": 4581 }, { "epoch": 6.381615598885794, "grad_norm": 3.206854820251465, "learning_rate": 2.5847072200113697e-05, "loss": 1.1907, "step": 4582 }, { "epoch": 6.383008356545961, "grad_norm": 3.003754138946533, "learning_rate": 2.5837123365548605e-05, "loss": 1.1902, "step": 4583 }, { "epoch": 6.3844011142061285, "grad_norm": 3.2510647773742676, "learning_rate": 2.5827174530983512e-05, "loss": 1.2185, "step": 4584 }, { "epoch": 6.385793871866295, "grad_norm": 3.215944528579712, "learning_rate": 2.5817225696418416e-05, "loss": 1.0377, "step": 4585 }, { "epoch": 6.387186629526463, "grad_norm": 3.019026756286621, "learning_rate": 2.5807276861853324e-05, "loss": 1.1211, "step": 4586 }, { "epoch": 6.388579387186629, "grad_norm": 2.9317314624786377, "learning_rate": 2.5797328027288228e-05, "loss": 1.0638, "step": 4587 }, { "epoch": 6.389972144846797, "grad_norm": 2.9449985027313232, "learning_rate": 2.5787379192723135e-05, "loss": 0.9736, "step": 4588 }, { "epoch": 6.391364902506964, "grad_norm": 3.33158540725708, "learning_rate": 2.5777430358158043e-05, "loss": 1.0651, "step": 4589 }, { "epoch": 6.392757660167131, "grad_norm": 2.5780160427093506, "learning_rate": 2.5767481523592947e-05, "loss": 1.0129, "step": 4590 }, { "epoch": 6.394150417827298, "grad_norm": 3.0476839542388916, "learning_rate": 2.5757532689027854e-05, "loss": 1.0014, "step": 4591 }, { "epoch": 6.395543175487465, "grad_norm": 3.832833766937256, "learning_rate": 2.574758385446276e-05, "loss": 0.9095, "step": 4592 }, { "epoch": 6.396935933147632, "grad_norm": 2.8570642471313477, "learning_rate": 2.5737635019897666e-05, "loss": 0.968, "step": 4593 }, { "epoch": 6.3983286908078, "grad_norm": 3.2362117767333984, "learning_rate": 2.5727686185332573e-05, "loss": 1.0836, "step": 4594 }, { "epoch": 6.399721448467966, "grad_norm": 2.5439629554748535, "learning_rate": 2.5717737350767478e-05, "loss": 0.9426, "step": 4595 }, { "epoch": 6.401114206128134, "grad_norm": 3.088709592819214, "learning_rate": 2.5707788516202385e-05, "loss": 1.1695, "step": 4596 }, { "epoch": 6.4025069637883005, "grad_norm": 2.907419443130493, "learning_rate": 2.569783968163729e-05, "loss": 1.0151, "step": 4597 }, { "epoch": 6.403899721448468, "grad_norm": 3.443770408630371, "learning_rate": 2.56878908470722e-05, "loss": 1.1531, "step": 4598 }, { "epoch": 6.405292479108635, "grad_norm": 3.1635823249816895, "learning_rate": 2.56779420125071e-05, "loss": 1.0283, "step": 4599 }, { "epoch": 6.406685236768802, "grad_norm": 2.810631513595581, "learning_rate": 2.566799317794201e-05, "loss": 1.0196, "step": 4600 }, { "epoch": 6.408077994428969, "grad_norm": 2.9825165271759033, "learning_rate": 2.565804434337692e-05, "loss": 1.3187, "step": 4601 }, { "epoch": 6.4094707520891365, "grad_norm": 4.131698131561279, "learning_rate": 2.5648095508811823e-05, "loss": 1.0415, "step": 4602 }, { "epoch": 6.410863509749303, "grad_norm": 2.7010226249694824, "learning_rate": 2.563814667424673e-05, "loss": 0.9784, "step": 4603 }, { "epoch": 6.412256267409471, "grad_norm": 4.202706813812256, "learning_rate": 2.5628197839681635e-05, "loss": 0.9339, "step": 4604 }, { "epoch": 6.413649025069638, "grad_norm": 3.086254119873047, "learning_rate": 2.5618249005116542e-05, "loss": 0.8264, "step": 4605 }, { "epoch": 6.415041782729805, "grad_norm": 2.9942097663879395, "learning_rate": 2.560830017055145e-05, "loss": 1.0963, "step": 4606 }, { "epoch": 6.4164345403899725, "grad_norm": 3.093679428100586, "learning_rate": 2.5598351335986354e-05, "loss": 1.1648, "step": 4607 }, { "epoch": 6.417827298050139, "grad_norm": 2.9956912994384766, "learning_rate": 2.558840250142126e-05, "loss": 1.0576, "step": 4608 }, { "epoch": 6.419220055710307, "grad_norm": 3.0251107215881348, "learning_rate": 2.5578453666856165e-05, "loss": 0.9207, "step": 4609 }, { "epoch": 6.420612813370473, "grad_norm": 2.97255539894104, "learning_rate": 2.5568504832291073e-05, "loss": 1.0619, "step": 4610 }, { "epoch": 6.422005571030641, "grad_norm": 2.7506778240203857, "learning_rate": 2.555855599772598e-05, "loss": 1.0987, "step": 4611 }, { "epoch": 6.423398328690808, "grad_norm": 2.845851182937622, "learning_rate": 2.5548607163160884e-05, "loss": 0.9654, "step": 4612 }, { "epoch": 6.424791086350975, "grad_norm": 2.7043139934539795, "learning_rate": 2.553865832859579e-05, "loss": 0.962, "step": 4613 }, { "epoch": 6.426183844011142, "grad_norm": 3.166102170944214, "learning_rate": 2.5528709494030696e-05, "loss": 1.2582, "step": 4614 }, { "epoch": 6.427576601671309, "grad_norm": 2.819932460784912, "learning_rate": 2.5518760659465603e-05, "loss": 1.0251, "step": 4615 }, { "epoch": 6.428969359331476, "grad_norm": 4.832479476928711, "learning_rate": 2.550881182490051e-05, "loss": 0.9833, "step": 4616 }, { "epoch": 6.430362116991644, "grad_norm": 2.6273465156555176, "learning_rate": 2.5498862990335415e-05, "loss": 0.8972, "step": 4617 }, { "epoch": 6.43175487465181, "grad_norm": 3.3082454204559326, "learning_rate": 2.5488914155770322e-05, "loss": 1.0965, "step": 4618 }, { "epoch": 6.433147632311978, "grad_norm": 3.153756856918335, "learning_rate": 2.5478965321205226e-05, "loss": 1.1271, "step": 4619 }, { "epoch": 6.4345403899721445, "grad_norm": 3.0597243309020996, "learning_rate": 2.5469016486640134e-05, "loss": 1.1421, "step": 4620 }, { "epoch": 6.435933147632312, "grad_norm": 2.59921932220459, "learning_rate": 2.5459067652075038e-05, "loss": 0.892, "step": 4621 }, { "epoch": 6.437325905292479, "grad_norm": 2.940541982650757, "learning_rate": 2.5449118817509945e-05, "loss": 1.1599, "step": 4622 }, { "epoch": 6.438718662952646, "grad_norm": 2.700190782546997, "learning_rate": 2.5439169982944853e-05, "loss": 0.8381, "step": 4623 }, { "epoch": 6.440111420612814, "grad_norm": 2.8738365173339844, "learning_rate": 2.5429221148379757e-05, "loss": 1.0608, "step": 4624 }, { "epoch": 6.4415041782729805, "grad_norm": 3.1238081455230713, "learning_rate": 2.5419272313814668e-05, "loss": 1.0864, "step": 4625 }, { "epoch": 6.442896935933147, "grad_norm": 2.8366260528564453, "learning_rate": 2.5409323479249568e-05, "loss": 0.9429, "step": 4626 }, { "epoch": 6.444289693593315, "grad_norm": 3.097949981689453, "learning_rate": 2.539937464468448e-05, "loss": 1.0946, "step": 4627 }, { "epoch": 6.445682451253482, "grad_norm": 4.2688422203063965, "learning_rate": 2.5389425810119387e-05, "loss": 1.0365, "step": 4628 }, { "epoch": 6.447075208913649, "grad_norm": 2.8227672576904297, "learning_rate": 2.537947697555429e-05, "loss": 1.1122, "step": 4629 }, { "epoch": 6.4484679665738165, "grad_norm": 3.191445827484131, "learning_rate": 2.5369528140989198e-05, "loss": 1.275, "step": 4630 }, { "epoch": 6.449860724233983, "grad_norm": 3.0683882236480713, "learning_rate": 2.5359579306424102e-05, "loss": 1.0693, "step": 4631 }, { "epoch": 6.451253481894151, "grad_norm": 2.7502219676971436, "learning_rate": 2.534963047185901e-05, "loss": 0.9373, "step": 4632 }, { "epoch": 6.452646239554317, "grad_norm": 2.8942861557006836, "learning_rate": 2.5339681637293917e-05, "loss": 1.1393, "step": 4633 }, { "epoch": 6.454038997214485, "grad_norm": 2.999723196029663, "learning_rate": 2.532973280272882e-05, "loss": 0.8987, "step": 4634 }, { "epoch": 6.455431754874652, "grad_norm": 2.8345420360565186, "learning_rate": 2.531978396816373e-05, "loss": 1.0373, "step": 4635 }, { "epoch": 6.456824512534819, "grad_norm": 2.9698808193206787, "learning_rate": 2.5309835133598633e-05, "loss": 1.2927, "step": 4636 }, { "epoch": 6.458217270194986, "grad_norm": 2.84393048286438, "learning_rate": 2.529988629903354e-05, "loss": 0.9664, "step": 4637 }, { "epoch": 6.459610027855153, "grad_norm": 2.92636775970459, "learning_rate": 2.5289937464468448e-05, "loss": 1.1069, "step": 4638 }, { "epoch": 6.46100278551532, "grad_norm": 3.5068624019622803, "learning_rate": 2.527998862990335e-05, "loss": 0.959, "step": 4639 }, { "epoch": 6.462395543175488, "grad_norm": 2.868551015853882, "learning_rate": 2.527003979533826e-05, "loss": 1.1338, "step": 4640 }, { "epoch": 6.463788300835654, "grad_norm": 3.7234344482421875, "learning_rate": 2.5260090960773163e-05, "loss": 0.8658, "step": 4641 }, { "epoch": 6.465181058495822, "grad_norm": 3.3896803855895996, "learning_rate": 2.525014212620807e-05, "loss": 1.1506, "step": 4642 }, { "epoch": 6.4665738161559885, "grad_norm": 2.8069541454315186, "learning_rate": 2.5240193291642975e-05, "loss": 1.0095, "step": 4643 }, { "epoch": 6.467966573816156, "grad_norm": 2.879835605621338, "learning_rate": 2.5230244457077882e-05, "loss": 1.1378, "step": 4644 }, { "epoch": 6.469359331476323, "grad_norm": 3.1890411376953125, "learning_rate": 2.522029562251279e-05, "loss": 1.3044, "step": 4645 }, { "epoch": 6.47075208913649, "grad_norm": 3.6902568340301514, "learning_rate": 2.5210346787947694e-05, "loss": 1.1575, "step": 4646 }, { "epoch": 6.472144846796658, "grad_norm": 3.353255033493042, "learning_rate": 2.52003979533826e-05, "loss": 1.1234, "step": 4647 }, { "epoch": 6.4735376044568245, "grad_norm": 3.339797258377075, "learning_rate": 2.5190449118817505e-05, "loss": 1.3185, "step": 4648 }, { "epoch": 6.474930362116992, "grad_norm": 2.759061813354492, "learning_rate": 2.5180500284252413e-05, "loss": 0.8372, "step": 4649 }, { "epoch": 6.476323119777159, "grad_norm": 3.0016627311706543, "learning_rate": 2.517055144968732e-05, "loss": 0.9608, "step": 4650 }, { "epoch": 6.477715877437326, "grad_norm": 3.210092782974243, "learning_rate": 2.5160602615122224e-05, "loss": 1.1231, "step": 4651 }, { "epoch": 6.479108635097493, "grad_norm": 2.6207337379455566, "learning_rate": 2.5150653780557132e-05, "loss": 0.8677, "step": 4652 }, { "epoch": 6.4805013927576605, "grad_norm": 2.916144847869873, "learning_rate": 2.5140704945992036e-05, "loss": 1.0909, "step": 4653 }, { "epoch": 6.481894150417827, "grad_norm": 3.0932586193084717, "learning_rate": 2.5130756111426947e-05, "loss": 1.2029, "step": 4654 }, { "epoch": 6.483286908077995, "grad_norm": 2.834848403930664, "learning_rate": 2.5120807276861854e-05, "loss": 1.0181, "step": 4655 }, { "epoch": 6.484679665738161, "grad_norm": 3.0963003635406494, "learning_rate": 2.5110858442296758e-05, "loss": 0.9916, "step": 4656 }, { "epoch": 6.486072423398329, "grad_norm": 2.952533721923828, "learning_rate": 2.5100909607731666e-05, "loss": 0.9978, "step": 4657 }, { "epoch": 6.487465181058496, "grad_norm": 3.1474313735961914, "learning_rate": 2.509096077316657e-05, "loss": 1.1436, "step": 4658 }, { "epoch": 6.488857938718663, "grad_norm": 2.6717944145202637, "learning_rate": 2.5081011938601477e-05, "loss": 0.8815, "step": 4659 }, { "epoch": 6.49025069637883, "grad_norm": 3.0935699939727783, "learning_rate": 2.507106310403638e-05, "loss": 1.1013, "step": 4660 }, { "epoch": 6.491643454038997, "grad_norm": 3.12080454826355, "learning_rate": 2.506111426947129e-05, "loss": 1.098, "step": 4661 }, { "epoch": 6.493036211699164, "grad_norm": 3.0284221172332764, "learning_rate": 2.5051165434906196e-05, "loss": 1.1351, "step": 4662 }, { "epoch": 6.494428969359332, "grad_norm": 3.2609972953796387, "learning_rate": 2.50412166003411e-05, "loss": 1.2471, "step": 4663 }, { "epoch": 6.495821727019498, "grad_norm": 3.3732669353485107, "learning_rate": 2.5031267765776008e-05, "loss": 0.9781, "step": 4664 }, { "epoch": 6.497214484679666, "grad_norm": 3.2345945835113525, "learning_rate": 2.5021318931210912e-05, "loss": 1.2421, "step": 4665 }, { "epoch": 6.498607242339833, "grad_norm": 3.080406904220581, "learning_rate": 2.501137009664582e-05, "loss": 1.0013, "step": 4666 }, { "epoch": 6.5, "grad_norm": 4.212028980255127, "learning_rate": 2.5001421262080727e-05, "loss": 0.8842, "step": 4667 }, { "epoch": 6.501392757660167, "grad_norm": 2.7986233234405518, "learning_rate": 2.499147242751563e-05, "loss": 1.0137, "step": 4668 }, { "epoch": 6.502785515320334, "grad_norm": 3.28351092338562, "learning_rate": 2.4981523592950538e-05, "loss": 1.0776, "step": 4669 }, { "epoch": 6.504178272980502, "grad_norm": 2.7406668663024902, "learning_rate": 2.4971574758385442e-05, "loss": 1.0218, "step": 4670 }, { "epoch": 6.505571030640668, "grad_norm": 2.6849937438964844, "learning_rate": 2.496162592382035e-05, "loss": 0.8841, "step": 4671 }, { "epoch": 6.506963788300836, "grad_norm": 3.0540127754211426, "learning_rate": 2.4951677089255257e-05, "loss": 0.9514, "step": 4672 }, { "epoch": 6.508356545961003, "grad_norm": 3.3202834129333496, "learning_rate": 2.494172825469016e-05, "loss": 1.1729, "step": 4673 }, { "epoch": 6.50974930362117, "grad_norm": 3.6286327838897705, "learning_rate": 2.493177942012507e-05, "loss": 0.9764, "step": 4674 }, { "epoch": 6.511142061281337, "grad_norm": 3.012465238571167, "learning_rate": 2.4921830585559973e-05, "loss": 1.1726, "step": 4675 }, { "epoch": 6.512534818941504, "grad_norm": 3.3141849040985107, "learning_rate": 2.491188175099488e-05, "loss": 1.1859, "step": 4676 }, { "epoch": 6.513927576601671, "grad_norm": 3.096423625946045, "learning_rate": 2.4901932916429788e-05, "loss": 1.1932, "step": 4677 }, { "epoch": 6.515320334261839, "grad_norm": 2.589930772781372, "learning_rate": 2.4891984081864692e-05, "loss": 0.7744, "step": 4678 }, { "epoch": 6.516713091922005, "grad_norm": 2.926628351211548, "learning_rate": 2.48820352472996e-05, "loss": 0.8675, "step": 4679 }, { "epoch": 6.518105849582173, "grad_norm": 3.745406150817871, "learning_rate": 2.4872086412734503e-05, "loss": 1.1075, "step": 4680 }, { "epoch": 6.5194986072423395, "grad_norm": 3.591724157333374, "learning_rate": 2.4862137578169414e-05, "loss": 0.9095, "step": 4681 }, { "epoch": 6.520891364902507, "grad_norm": 2.5653629302978516, "learning_rate": 2.4852188743604315e-05, "loss": 0.9503, "step": 4682 }, { "epoch": 6.522284122562674, "grad_norm": 3.108179807662964, "learning_rate": 2.4842239909039226e-05, "loss": 1.1386, "step": 4683 }, { "epoch": 6.523676880222841, "grad_norm": 3.205650806427002, "learning_rate": 2.4832291074474133e-05, "loss": 1.0266, "step": 4684 }, { "epoch": 6.525069637883008, "grad_norm": 2.7524988651275635, "learning_rate": 2.4822342239909037e-05, "loss": 1.0027, "step": 4685 }, { "epoch": 6.5264623955431755, "grad_norm": 2.7650883197784424, "learning_rate": 2.4812393405343945e-05, "loss": 0.9588, "step": 4686 }, { "epoch": 6.527855153203342, "grad_norm": 3.339064598083496, "learning_rate": 2.480244457077885e-05, "loss": 1.1757, "step": 4687 }, { "epoch": 6.52924791086351, "grad_norm": 3.030686140060425, "learning_rate": 2.4792495736213756e-05, "loss": 0.9491, "step": 4688 }, { "epoch": 6.530640668523677, "grad_norm": 3.2274553775787354, "learning_rate": 2.4782546901648664e-05, "loss": 1.2159, "step": 4689 }, { "epoch": 6.532033426183844, "grad_norm": 3.967367172241211, "learning_rate": 2.4772598067083568e-05, "loss": 1.3414, "step": 4690 }, { "epoch": 6.5334261838440115, "grad_norm": 3.752794027328491, "learning_rate": 2.4762649232518475e-05, "loss": 1.1098, "step": 4691 }, { "epoch": 6.534818941504178, "grad_norm": 3.29585337638855, "learning_rate": 2.475270039795338e-05, "loss": 1.1735, "step": 4692 }, { "epoch": 6.536211699164346, "grad_norm": 3.18642520904541, "learning_rate": 2.4742751563388287e-05, "loss": 1.3173, "step": 4693 }, { "epoch": 6.537604456824512, "grad_norm": 2.4995405673980713, "learning_rate": 2.4732802728823194e-05, "loss": 0.8807, "step": 4694 }, { "epoch": 6.53899721448468, "grad_norm": 2.5437867641448975, "learning_rate": 2.47228538942581e-05, "loss": 0.9895, "step": 4695 }, { "epoch": 6.540389972144847, "grad_norm": 3.1137325763702393, "learning_rate": 2.4712905059693006e-05, "loss": 1.2948, "step": 4696 }, { "epoch": 6.541782729805014, "grad_norm": 3.344264507293701, "learning_rate": 2.470295622512791e-05, "loss": 1.3989, "step": 4697 }, { "epoch": 6.543175487465181, "grad_norm": 3.345322370529175, "learning_rate": 2.4693007390562817e-05, "loss": 1.2011, "step": 4698 }, { "epoch": 6.544568245125348, "grad_norm": 2.8151438236236572, "learning_rate": 2.4683058555997725e-05, "loss": 0.9592, "step": 4699 }, { "epoch": 6.545961002785515, "grad_norm": 2.728134870529175, "learning_rate": 2.467310972143263e-05, "loss": 0.9775, "step": 4700 }, { "epoch": 6.547353760445683, "grad_norm": 2.9509196281433105, "learning_rate": 2.4663160886867536e-05, "loss": 1.1995, "step": 4701 }, { "epoch": 6.548746518105849, "grad_norm": 3.4511666297912598, "learning_rate": 2.465321205230244e-05, "loss": 1.1744, "step": 4702 }, { "epoch": 6.550139275766017, "grad_norm": 2.9215309619903564, "learning_rate": 2.4643263217737348e-05, "loss": 0.9128, "step": 4703 }, { "epoch": 6.5515320334261835, "grad_norm": 2.8722798824310303, "learning_rate": 2.4633314383172252e-05, "loss": 0.9149, "step": 4704 }, { "epoch": 6.552924791086351, "grad_norm": 2.5877463817596436, "learning_rate": 2.462336554860716e-05, "loss": 0.8475, "step": 4705 }, { "epoch": 6.554317548746518, "grad_norm": 3.049581527709961, "learning_rate": 2.4613416714042067e-05, "loss": 0.9758, "step": 4706 }, { "epoch": 6.555710306406685, "grad_norm": 4.076055526733398, "learning_rate": 2.460346787947697e-05, "loss": 0.9216, "step": 4707 }, { "epoch": 6.557103064066853, "grad_norm": 2.760948657989502, "learning_rate": 2.459351904491188e-05, "loss": 0.8467, "step": 4708 }, { "epoch": 6.5584958217270195, "grad_norm": 2.908273458480835, "learning_rate": 2.4583570210346783e-05, "loss": 0.9611, "step": 4709 }, { "epoch": 6.559888579387186, "grad_norm": 2.923372268676758, "learning_rate": 2.4573621375781693e-05, "loss": 1.0828, "step": 4710 }, { "epoch": 6.561281337047354, "grad_norm": 3.5005199909210205, "learning_rate": 2.45636725412166e-05, "loss": 1.0053, "step": 4711 }, { "epoch": 6.562674094707521, "grad_norm": 3.210413694381714, "learning_rate": 2.4553723706651505e-05, "loss": 1.1403, "step": 4712 }, { "epoch": 6.564066852367688, "grad_norm": 3.584632635116577, "learning_rate": 2.4543774872086412e-05, "loss": 1.1124, "step": 4713 }, { "epoch": 6.5654596100278555, "grad_norm": 3.279512882232666, "learning_rate": 2.4533826037521317e-05, "loss": 1.1136, "step": 4714 }, { "epoch": 6.566852367688022, "grad_norm": 3.1212546825408936, "learning_rate": 2.4523877202956224e-05, "loss": 0.9316, "step": 4715 }, { "epoch": 6.56824512534819, "grad_norm": 2.8250529766082764, "learning_rate": 2.451392836839113e-05, "loss": 1.0379, "step": 4716 }, { "epoch": 6.569637883008356, "grad_norm": 3.100564479827881, "learning_rate": 2.4503979533826036e-05, "loss": 1.0759, "step": 4717 }, { "epoch": 6.571030640668524, "grad_norm": 3.492800235748291, "learning_rate": 2.4494030699260943e-05, "loss": 1.0498, "step": 4718 }, { "epoch": 6.572423398328691, "grad_norm": 2.630836009979248, "learning_rate": 2.4484081864695847e-05, "loss": 0.8225, "step": 4719 }, { "epoch": 6.573816155988858, "grad_norm": 2.584005117416382, "learning_rate": 2.4474133030130755e-05, "loss": 1.018, "step": 4720 }, { "epoch": 6.575208913649025, "grad_norm": 2.8945672512054443, "learning_rate": 2.4464184195565662e-05, "loss": 0.9315, "step": 4721 }, { "epoch": 6.576601671309192, "grad_norm": 3.119586229324341, "learning_rate": 2.4454235361000566e-05, "loss": 1.2142, "step": 4722 }, { "epoch": 6.577994428969359, "grad_norm": 3.0555262565612793, "learning_rate": 2.4444286526435474e-05, "loss": 0.9332, "step": 4723 }, { "epoch": 6.579387186629527, "grad_norm": 2.706542730331421, "learning_rate": 2.4434337691870378e-05, "loss": 0.9654, "step": 4724 }, { "epoch": 6.580779944289693, "grad_norm": 3.118386745452881, "learning_rate": 2.4424388857305285e-05, "loss": 1.4078, "step": 4725 }, { "epoch": 6.582172701949861, "grad_norm": 3.0359275341033936, "learning_rate": 2.441444002274019e-05, "loss": 1.1302, "step": 4726 }, { "epoch": 6.5835654596100275, "grad_norm": 2.9190561771392822, "learning_rate": 2.4404491188175097e-05, "loss": 1.0922, "step": 4727 }, { "epoch": 6.584958217270195, "grad_norm": 2.814833641052246, "learning_rate": 2.4394542353610004e-05, "loss": 0.8571, "step": 4728 }, { "epoch": 6.586350974930362, "grad_norm": 2.996880292892456, "learning_rate": 2.4384593519044908e-05, "loss": 0.8809, "step": 4729 }, { "epoch": 6.587743732590529, "grad_norm": 3.106786012649536, "learning_rate": 2.4374644684479816e-05, "loss": 1.0673, "step": 4730 }, { "epoch": 6.589136490250697, "grad_norm": 2.694575309753418, "learning_rate": 2.436469584991472e-05, "loss": 1.0702, "step": 4731 }, { "epoch": 6.5905292479108635, "grad_norm": 2.9143412113189697, "learning_rate": 2.4354747015349627e-05, "loss": 0.9471, "step": 4732 }, { "epoch": 6.591922005571031, "grad_norm": 2.377490282058716, "learning_rate": 2.4344798180784535e-05, "loss": 0.6882, "step": 4733 }, { "epoch": 6.593314763231198, "grad_norm": 2.679741144180298, "learning_rate": 2.433484934621944e-05, "loss": 0.8205, "step": 4734 }, { "epoch": 6.594707520891365, "grad_norm": 2.5864362716674805, "learning_rate": 2.4324900511654346e-05, "loss": 1.0964, "step": 4735 }, { "epoch": 6.596100278551532, "grad_norm": 2.768282175064087, "learning_rate": 2.431495167708925e-05, "loss": 0.8348, "step": 4736 }, { "epoch": 6.5974930362116995, "grad_norm": 3.049933910369873, "learning_rate": 2.430500284252416e-05, "loss": 1.1628, "step": 4737 }, { "epoch": 6.598885793871866, "grad_norm": 3.35349440574646, "learning_rate": 2.429505400795907e-05, "loss": 1.2172, "step": 4738 }, { "epoch": 6.600278551532034, "grad_norm": 3.1047682762145996, "learning_rate": 2.4285105173393973e-05, "loss": 0.9607, "step": 4739 }, { "epoch": 6.6016713091922, "grad_norm": 2.599682331085205, "learning_rate": 2.427515633882888e-05, "loss": 0.8289, "step": 4740 }, { "epoch": 6.603064066852368, "grad_norm": 2.819077491760254, "learning_rate": 2.4265207504263784e-05, "loss": 1.2117, "step": 4741 }, { "epoch": 6.604456824512535, "grad_norm": 3.0849640369415283, "learning_rate": 2.425525866969869e-05, "loss": 1.0484, "step": 4742 }, { "epoch": 6.605849582172702, "grad_norm": 3.0797269344329834, "learning_rate": 2.42453098351336e-05, "loss": 0.9468, "step": 4743 }, { "epoch": 6.607242339832869, "grad_norm": 2.7996320724487305, "learning_rate": 2.4235361000568503e-05, "loss": 0.8944, "step": 4744 }, { "epoch": 6.608635097493036, "grad_norm": 2.971978187561035, "learning_rate": 2.422541216600341e-05, "loss": 1.0863, "step": 4745 }, { "epoch": 6.610027855153203, "grad_norm": 2.5103025436401367, "learning_rate": 2.4215463331438315e-05, "loss": 0.7514, "step": 4746 }, { "epoch": 6.611420612813371, "grad_norm": 3.3744678497314453, "learning_rate": 2.4205514496873222e-05, "loss": 0.7911, "step": 4747 }, { "epoch": 6.612813370473537, "grad_norm": 2.7324295043945312, "learning_rate": 2.4195565662308126e-05, "loss": 0.8992, "step": 4748 }, { "epoch": 6.614206128133705, "grad_norm": 2.771432876586914, "learning_rate": 2.4185616827743034e-05, "loss": 0.9506, "step": 4749 }, { "epoch": 6.615598885793872, "grad_norm": 3.6363251209259033, "learning_rate": 2.417566799317794e-05, "loss": 1.2449, "step": 4750 }, { "epoch": 6.616991643454039, "grad_norm": 3.2286977767944336, "learning_rate": 2.4165719158612845e-05, "loss": 1.2933, "step": 4751 }, { "epoch": 6.618384401114206, "grad_norm": 2.9055263996124268, "learning_rate": 2.4155770324047753e-05, "loss": 1.1623, "step": 4752 }, { "epoch": 6.619777158774373, "grad_norm": 3.2123911380767822, "learning_rate": 2.4145821489482657e-05, "loss": 1.0442, "step": 4753 }, { "epoch": 6.621169916434541, "grad_norm": 3.097463607788086, "learning_rate": 2.4135872654917564e-05, "loss": 0.9573, "step": 4754 }, { "epoch": 6.6225626740947074, "grad_norm": 2.822230815887451, "learning_rate": 2.4125923820352472e-05, "loss": 1.026, "step": 4755 }, { "epoch": 6.623955431754875, "grad_norm": 3.2758986949920654, "learning_rate": 2.4115974985787376e-05, "loss": 1.0508, "step": 4756 }, { "epoch": 6.625348189415042, "grad_norm": 2.6393706798553467, "learning_rate": 2.4106026151222283e-05, "loss": 0.8872, "step": 4757 }, { "epoch": 6.626740947075209, "grad_norm": 2.6567206382751465, "learning_rate": 2.4096077316657187e-05, "loss": 0.9524, "step": 4758 }, { "epoch": 6.628133704735376, "grad_norm": 3.7393648624420166, "learning_rate": 2.4086128482092095e-05, "loss": 1.002, "step": 4759 }, { "epoch": 6.629526462395543, "grad_norm": 2.7445218563079834, "learning_rate": 2.4076179647527002e-05, "loss": 0.9604, "step": 4760 }, { "epoch": 6.63091922005571, "grad_norm": 2.7881252765655518, "learning_rate": 2.4066230812961906e-05, "loss": 0.8748, "step": 4761 }, { "epoch": 6.632311977715878, "grad_norm": 2.8022923469543457, "learning_rate": 2.4056281978396814e-05, "loss": 0.9567, "step": 4762 }, { "epoch": 6.633704735376044, "grad_norm": 2.6584715843200684, "learning_rate": 2.4046333143831718e-05, "loss": 0.8187, "step": 4763 }, { "epoch": 6.635097493036212, "grad_norm": 2.696336507797241, "learning_rate": 2.4036384309266625e-05, "loss": 0.9962, "step": 4764 }, { "epoch": 6.6364902506963785, "grad_norm": 2.634849786758423, "learning_rate": 2.4026435474701536e-05, "loss": 0.9339, "step": 4765 }, { "epoch": 6.637883008356546, "grad_norm": 2.7696452140808105, "learning_rate": 2.401648664013644e-05, "loss": 0.9384, "step": 4766 }, { "epoch": 6.639275766016713, "grad_norm": 2.92993426322937, "learning_rate": 2.4006537805571348e-05, "loss": 1.0852, "step": 4767 }, { "epoch": 6.64066852367688, "grad_norm": 3.3232686519622803, "learning_rate": 2.3996588971006252e-05, "loss": 1.1916, "step": 4768 }, { "epoch": 6.642061281337047, "grad_norm": 4.471713066101074, "learning_rate": 2.398664013644116e-05, "loss": 1.1487, "step": 4769 }, { "epoch": 6.6434540389972145, "grad_norm": 2.7921526432037354, "learning_rate": 2.3976691301876063e-05, "loss": 0.9126, "step": 4770 }, { "epoch": 6.644846796657381, "grad_norm": 3.0503218173980713, "learning_rate": 2.396674246731097e-05, "loss": 1.0404, "step": 4771 }, { "epoch": 6.646239554317549, "grad_norm": 2.566943407058716, "learning_rate": 2.3956793632745878e-05, "loss": 0.8334, "step": 4772 }, { "epoch": 6.647632311977716, "grad_norm": 3.2688379287719727, "learning_rate": 2.3946844798180782e-05, "loss": 0.947, "step": 4773 }, { "epoch": 6.649025069637883, "grad_norm": 3.023160457611084, "learning_rate": 2.393689596361569e-05, "loss": 1.06, "step": 4774 }, { "epoch": 6.65041782729805, "grad_norm": 3.023071527481079, "learning_rate": 2.3926947129050594e-05, "loss": 1.0209, "step": 4775 }, { "epoch": 6.651810584958217, "grad_norm": 2.9008774757385254, "learning_rate": 2.39169982944855e-05, "loss": 1.1054, "step": 4776 }, { "epoch": 6.653203342618385, "grad_norm": 2.7631795406341553, "learning_rate": 2.390704945992041e-05, "loss": 0.9356, "step": 4777 }, { "epoch": 6.654596100278551, "grad_norm": 2.6900882720947266, "learning_rate": 2.3897100625355313e-05, "loss": 0.8931, "step": 4778 }, { "epoch": 6.655988857938719, "grad_norm": 2.8253777027130127, "learning_rate": 2.388715179079022e-05, "loss": 1.0883, "step": 4779 }, { "epoch": 6.657381615598886, "grad_norm": 2.7304461002349854, "learning_rate": 2.3877202956225124e-05, "loss": 1.068, "step": 4780 }, { "epoch": 6.658774373259053, "grad_norm": 3.2457969188690186, "learning_rate": 2.3867254121660032e-05, "loss": 1.0144, "step": 4781 }, { "epoch": 6.66016713091922, "grad_norm": 3.1068785190582275, "learning_rate": 2.385730528709494e-05, "loss": 0.962, "step": 4782 }, { "epoch": 6.661559888579387, "grad_norm": 3.2889511585235596, "learning_rate": 2.3847356452529843e-05, "loss": 1.0686, "step": 4783 }, { "epoch": 6.662952646239554, "grad_norm": 3.2533276081085205, "learning_rate": 2.383740761796475e-05, "loss": 0.9554, "step": 4784 }, { "epoch": 6.664345403899722, "grad_norm": 3.1829142570495605, "learning_rate": 2.3827458783399655e-05, "loss": 1.0605, "step": 4785 }, { "epoch": 6.665738161559888, "grad_norm": 2.736853837966919, "learning_rate": 2.3817509948834562e-05, "loss": 0.9768, "step": 4786 }, { "epoch": 6.667130919220056, "grad_norm": 3.2278525829315186, "learning_rate": 2.380756111426947e-05, "loss": 1.0798, "step": 4787 }, { "epoch": 6.6685236768802225, "grad_norm": 3.094557762145996, "learning_rate": 2.3797612279704374e-05, "loss": 1.0931, "step": 4788 }, { "epoch": 6.66991643454039, "grad_norm": 2.9677906036376953, "learning_rate": 2.378766344513928e-05, "loss": 1.1247, "step": 4789 }, { "epoch": 6.671309192200557, "grad_norm": 2.5754425525665283, "learning_rate": 2.3777714610574185e-05, "loss": 0.7597, "step": 4790 }, { "epoch": 6.672701949860724, "grad_norm": 2.9235668182373047, "learning_rate": 2.3767765776009093e-05, "loss": 1.137, "step": 4791 }, { "epoch": 6.674094707520892, "grad_norm": 3.0413553714752197, "learning_rate": 2.3757816941443997e-05, "loss": 1.1092, "step": 4792 }, { "epoch": 6.6754874651810585, "grad_norm": 2.9375197887420654, "learning_rate": 2.3747868106878904e-05, "loss": 1.0091, "step": 4793 }, { "epoch": 6.676880222841225, "grad_norm": 2.6702022552490234, "learning_rate": 2.3737919272313815e-05, "loss": 0.9394, "step": 4794 }, { "epoch": 6.678272980501393, "grad_norm": 7.281650066375732, "learning_rate": 2.372797043774872e-05, "loss": 1.1282, "step": 4795 }, { "epoch": 6.67966573816156, "grad_norm": 2.7565412521362305, "learning_rate": 2.3718021603183627e-05, "loss": 0.9102, "step": 4796 }, { "epoch": 6.681058495821727, "grad_norm": 3.12174391746521, "learning_rate": 2.370807276861853e-05, "loss": 1.1007, "step": 4797 }, { "epoch": 6.6824512534818945, "grad_norm": 3.0235435962677, "learning_rate": 2.369812393405344e-05, "loss": 1.1611, "step": 4798 }, { "epoch": 6.683844011142061, "grad_norm": 2.963348388671875, "learning_rate": 2.3688175099488346e-05, "loss": 0.9288, "step": 4799 }, { "epoch": 6.685236768802229, "grad_norm": 3.0740180015563965, "learning_rate": 2.367822626492325e-05, "loss": 0.9463, "step": 4800 }, { "epoch": 6.686629526462395, "grad_norm": 3.2976651191711426, "learning_rate": 2.3668277430358157e-05, "loss": 1.284, "step": 4801 }, { "epoch": 6.688022284122563, "grad_norm": 3.295809745788574, "learning_rate": 2.365832859579306e-05, "loss": 1.0205, "step": 4802 }, { "epoch": 6.68941504178273, "grad_norm": 3.338949680328369, "learning_rate": 2.364837976122797e-05, "loss": 1.3288, "step": 4803 }, { "epoch": 6.690807799442897, "grad_norm": 3.4728505611419678, "learning_rate": 2.3638430926662876e-05, "loss": 0.9714, "step": 4804 }, { "epoch": 6.692200557103064, "grad_norm": 2.947736978530884, "learning_rate": 2.362848209209778e-05, "loss": 1.1931, "step": 4805 }, { "epoch": 6.693593314763231, "grad_norm": 2.6117374897003174, "learning_rate": 2.3618533257532688e-05, "loss": 0.8665, "step": 4806 }, { "epoch": 6.694986072423398, "grad_norm": 2.668964385986328, "learning_rate": 2.3608584422967592e-05, "loss": 0.8744, "step": 4807 }, { "epoch": 6.696378830083566, "grad_norm": 2.5390703678131104, "learning_rate": 2.35986355884025e-05, "loss": 1.0152, "step": 4808 }, { "epoch": 6.697771587743732, "grad_norm": 2.81025767326355, "learning_rate": 2.3588686753837407e-05, "loss": 0.9227, "step": 4809 }, { "epoch": 6.6991643454039, "grad_norm": 3.149915933609009, "learning_rate": 2.357873791927231e-05, "loss": 1.142, "step": 4810 }, { "epoch": 6.7005571030640665, "grad_norm": 3.086437702178955, "learning_rate": 2.356878908470722e-05, "loss": 1.2156, "step": 4811 }, { "epoch": 6.701949860724234, "grad_norm": 3.5764501094818115, "learning_rate": 2.3558840250142123e-05, "loss": 1.164, "step": 4812 }, { "epoch": 6.703342618384401, "grad_norm": 6.518548011779785, "learning_rate": 2.354889141557703e-05, "loss": 0.9738, "step": 4813 }, { "epoch": 6.704735376044568, "grad_norm": 2.80240797996521, "learning_rate": 2.3538942581011934e-05, "loss": 0.9904, "step": 4814 }, { "epoch": 6.706128133704736, "grad_norm": 3.277841091156006, "learning_rate": 2.352899374644684e-05, "loss": 0.9248, "step": 4815 }, { "epoch": 6.7075208913649025, "grad_norm": 3.215149164199829, "learning_rate": 2.351904491188175e-05, "loss": 0.9506, "step": 4816 }, { "epoch": 6.708913649025069, "grad_norm": 3.2901055812835693, "learning_rate": 2.3509096077316653e-05, "loss": 1.2405, "step": 4817 }, { "epoch": 6.710306406685237, "grad_norm": 2.942967653274536, "learning_rate": 2.349914724275156e-05, "loss": 1.1618, "step": 4818 }, { "epoch": 6.711699164345404, "grad_norm": 2.7844669818878174, "learning_rate": 2.3489198408186465e-05, "loss": 1.0405, "step": 4819 }, { "epoch": 6.713091922005571, "grad_norm": 3.1708924770355225, "learning_rate": 2.3479249573621372e-05, "loss": 1.3228, "step": 4820 }, { "epoch": 6.7144846796657385, "grad_norm": 3.003021240234375, "learning_rate": 2.3469300739056283e-05, "loss": 1.0084, "step": 4821 }, { "epoch": 6.715877437325905, "grad_norm": 3.2701704502105713, "learning_rate": 2.3459351904491187e-05, "loss": 1.1496, "step": 4822 }, { "epoch": 6.717270194986073, "grad_norm": 3.280702590942383, "learning_rate": 2.3449403069926094e-05, "loss": 0.8639, "step": 4823 }, { "epoch": 6.718662952646239, "grad_norm": 2.408778667449951, "learning_rate": 2.3439454235361e-05, "loss": 0.7839, "step": 4824 }, { "epoch": 6.720055710306407, "grad_norm": 3.651390314102173, "learning_rate": 2.3429505400795906e-05, "loss": 1.1629, "step": 4825 }, { "epoch": 6.721448467966574, "grad_norm": 3.0692644119262695, "learning_rate": 2.3419556566230813e-05, "loss": 1.2663, "step": 4826 }, { "epoch": 6.722841225626741, "grad_norm": 2.9470434188842773, "learning_rate": 2.3409607731665718e-05, "loss": 1.1087, "step": 4827 }, { "epoch": 6.724233983286908, "grad_norm": 2.759246826171875, "learning_rate": 2.3399658897100625e-05, "loss": 0.9773, "step": 4828 }, { "epoch": 6.725626740947075, "grad_norm": 2.8075499534606934, "learning_rate": 2.338971006253553e-05, "loss": 0.749, "step": 4829 }, { "epoch": 6.727019498607242, "grad_norm": 2.875091791152954, "learning_rate": 2.3379761227970437e-05, "loss": 1.0342, "step": 4830 }, { "epoch": 6.72841225626741, "grad_norm": 3.200989246368408, "learning_rate": 2.3369812393405344e-05, "loss": 1.0565, "step": 4831 }, { "epoch": 6.729805013927576, "grad_norm": 2.9280784130096436, "learning_rate": 2.3359863558840248e-05, "loss": 0.9753, "step": 4832 }, { "epoch": 6.731197771587744, "grad_norm": 2.8760924339294434, "learning_rate": 2.3349914724275156e-05, "loss": 1.0051, "step": 4833 }, { "epoch": 6.732590529247911, "grad_norm": 2.6996631622314453, "learning_rate": 2.333996588971006e-05, "loss": 0.9777, "step": 4834 }, { "epoch": 6.733983286908078, "grad_norm": 3.017238140106201, "learning_rate": 2.3330017055144967e-05, "loss": 0.8874, "step": 4835 }, { "epoch": 6.735376044568245, "grad_norm": 3.1263272762298584, "learning_rate": 2.332006822057987e-05, "loss": 0.9936, "step": 4836 }, { "epoch": 6.736768802228412, "grad_norm": 3.133610248565674, "learning_rate": 2.331011938601478e-05, "loss": 0.9494, "step": 4837 }, { "epoch": 6.73816155988858, "grad_norm": 2.646540403366089, "learning_rate": 2.3300170551449686e-05, "loss": 0.856, "step": 4838 }, { "epoch": 6.7395543175487465, "grad_norm": 3.041557550430298, "learning_rate": 2.329022171688459e-05, "loss": 0.9367, "step": 4839 }, { "epoch": 6.740947075208914, "grad_norm": 2.9367825984954834, "learning_rate": 2.3280272882319498e-05, "loss": 1.0595, "step": 4840 }, { "epoch": 6.742339832869081, "grad_norm": 3.084501266479492, "learning_rate": 2.3270324047754402e-05, "loss": 0.8773, "step": 4841 }, { "epoch": 6.743732590529248, "grad_norm": 3.071359634399414, "learning_rate": 2.326037521318931e-05, "loss": 1.049, "step": 4842 }, { "epoch": 6.745125348189415, "grad_norm": 3.3054821491241455, "learning_rate": 2.3250426378624217e-05, "loss": 1.1725, "step": 4843 }, { "epoch": 6.7465181058495824, "grad_norm": 2.5150833129882812, "learning_rate": 2.324047754405912e-05, "loss": 0.7595, "step": 4844 }, { "epoch": 6.747910863509749, "grad_norm": 2.453789472579956, "learning_rate": 2.3230528709494028e-05, "loss": 0.891, "step": 4845 }, { "epoch": 6.749303621169917, "grad_norm": 2.9088315963745117, "learning_rate": 2.3220579874928932e-05, "loss": 1.0837, "step": 4846 }, { "epoch": 6.750696378830083, "grad_norm": 2.64329195022583, "learning_rate": 2.321063104036384e-05, "loss": 0.8332, "step": 4847 }, { "epoch": 6.752089136490251, "grad_norm": 2.5478029251098633, "learning_rate": 2.320068220579875e-05, "loss": 0.9777, "step": 4848 }, { "epoch": 6.7534818941504176, "grad_norm": 2.7847177982330322, "learning_rate": 2.319073337123365e-05, "loss": 0.9929, "step": 4849 }, { "epoch": 6.754874651810585, "grad_norm": 2.8384597301483154, "learning_rate": 2.3180784536668562e-05, "loss": 1.0823, "step": 4850 }, { "epoch": 6.756267409470752, "grad_norm": 3.376582145690918, "learning_rate": 2.3170835702103466e-05, "loss": 1.1255, "step": 4851 }, { "epoch": 6.757660167130919, "grad_norm": 2.6226806640625, "learning_rate": 2.3160886867538374e-05, "loss": 0.8217, "step": 4852 }, { "epoch": 6.759052924791086, "grad_norm": 2.9121294021606445, "learning_rate": 2.3150938032973278e-05, "loss": 1.0547, "step": 4853 }, { "epoch": 6.7604456824512535, "grad_norm": 3.0356545448303223, "learning_rate": 2.3140989198408185e-05, "loss": 1.0487, "step": 4854 }, { "epoch": 6.76183844011142, "grad_norm": 3.278174638748169, "learning_rate": 2.3131040363843093e-05, "loss": 0.8931, "step": 4855 }, { "epoch": 6.763231197771588, "grad_norm": 3.302827835083008, "learning_rate": 2.3121091529277997e-05, "loss": 1.1367, "step": 4856 }, { "epoch": 6.764623955431755, "grad_norm": 2.7777512073516846, "learning_rate": 2.3111142694712904e-05, "loss": 0.9247, "step": 4857 }, { "epoch": 6.766016713091922, "grad_norm": 2.9615275859832764, "learning_rate": 2.3101193860147808e-05, "loss": 0.9697, "step": 4858 }, { "epoch": 6.767409470752089, "grad_norm": 3.0469748973846436, "learning_rate": 2.3091245025582716e-05, "loss": 1.2354, "step": 4859 }, { "epoch": 6.768802228412256, "grad_norm": 3.0362789630889893, "learning_rate": 2.3081296191017623e-05, "loss": 0.9566, "step": 4860 }, { "epoch": 6.770194986072424, "grad_norm": 2.6087095737457275, "learning_rate": 2.3071347356452527e-05, "loss": 1.0061, "step": 4861 }, { "epoch": 6.77158774373259, "grad_norm": 3.168748140335083, "learning_rate": 2.3061398521887435e-05, "loss": 0.8952, "step": 4862 }, { "epoch": 6.772980501392758, "grad_norm": 2.2951464653015137, "learning_rate": 2.305144968732234e-05, "loss": 0.7511, "step": 4863 }, { "epoch": 6.774373259052925, "grad_norm": 2.996347188949585, "learning_rate": 2.3041500852757246e-05, "loss": 0.8698, "step": 4864 }, { "epoch": 6.775766016713092, "grad_norm": 2.909731149673462, "learning_rate": 2.3031552018192154e-05, "loss": 1.0732, "step": 4865 }, { "epoch": 6.777158774373259, "grad_norm": 3.417818546295166, "learning_rate": 2.3021603183627058e-05, "loss": 1.0244, "step": 4866 }, { "epoch": 6.778551532033426, "grad_norm": 2.939164400100708, "learning_rate": 2.3011654349061965e-05, "loss": 0.6911, "step": 4867 }, { "epoch": 6.779944289693593, "grad_norm": 2.8852765560150146, "learning_rate": 2.300170551449687e-05, "loss": 1.0562, "step": 4868 }, { "epoch": 6.781337047353761, "grad_norm": 2.835871934890747, "learning_rate": 2.2991756679931777e-05, "loss": 0.9956, "step": 4869 }, { "epoch": 6.782729805013927, "grad_norm": 3.3156003952026367, "learning_rate": 2.2981807845366684e-05, "loss": 1.0879, "step": 4870 }, { "epoch": 6.784122562674095, "grad_norm": 3.052037000656128, "learning_rate": 2.297185901080159e-05, "loss": 1.0994, "step": 4871 }, { "epoch": 6.7855153203342615, "grad_norm": 2.884131908416748, "learning_rate": 2.2961910176236496e-05, "loss": 1.028, "step": 4872 }, { "epoch": 6.786908077994429, "grad_norm": 2.5954511165618896, "learning_rate": 2.29519613416714e-05, "loss": 0.8262, "step": 4873 }, { "epoch": 6.788300835654596, "grad_norm": 3.0728766918182373, "learning_rate": 2.2942012507106307e-05, "loss": 1.1055, "step": 4874 }, { "epoch": 6.789693593314763, "grad_norm": 2.8305482864379883, "learning_rate": 2.293206367254121e-05, "loss": 0.9566, "step": 4875 }, { "epoch": 6.791086350974931, "grad_norm": 2.665647029876709, "learning_rate": 2.292211483797612e-05, "loss": 0.8036, "step": 4876 }, { "epoch": 6.7924791086350975, "grad_norm": 2.9441335201263428, "learning_rate": 2.291216600341103e-05, "loss": 1.0342, "step": 4877 }, { "epoch": 6.793871866295264, "grad_norm": 4.162250995635986, "learning_rate": 2.2902217168845934e-05, "loss": 0.7745, "step": 4878 }, { "epoch": 6.795264623955432, "grad_norm": 2.99715518951416, "learning_rate": 2.289226833428084e-05, "loss": 1.1928, "step": 4879 }, { "epoch": 6.796657381615599, "grad_norm": 3.211400270462036, "learning_rate": 2.2882319499715745e-05, "loss": 0.9964, "step": 4880 }, { "epoch": 6.798050139275766, "grad_norm": 2.7228331565856934, "learning_rate": 2.2872370665150653e-05, "loss": 0.9169, "step": 4881 }, { "epoch": 6.7994428969359335, "grad_norm": 2.626319646835327, "learning_rate": 2.286242183058556e-05, "loss": 0.8734, "step": 4882 }, { "epoch": 6.8008356545961, "grad_norm": 2.9297358989715576, "learning_rate": 2.2852472996020464e-05, "loss": 1.199, "step": 4883 }, { "epoch": 6.802228412256268, "grad_norm": 2.9165637493133545, "learning_rate": 2.2842524161455372e-05, "loss": 1.0327, "step": 4884 }, { "epoch": 6.803621169916434, "grad_norm": 2.8690686225891113, "learning_rate": 2.2832575326890276e-05, "loss": 0.913, "step": 4885 }, { "epoch": 6.805013927576602, "grad_norm": 4.286510467529297, "learning_rate": 2.2822626492325183e-05, "loss": 0.9175, "step": 4886 }, { "epoch": 6.806406685236769, "grad_norm": 2.6812071800231934, "learning_rate": 2.281267765776009e-05, "loss": 0.8382, "step": 4887 }, { "epoch": 6.807799442896936, "grad_norm": 3.066721200942993, "learning_rate": 2.2802728823194995e-05, "loss": 0.9701, "step": 4888 }, { "epoch": 6.809192200557103, "grad_norm": 3.317391872406006, "learning_rate": 2.2792779988629902e-05, "loss": 1.1758, "step": 4889 }, { "epoch": 6.81058495821727, "grad_norm": 2.943735122680664, "learning_rate": 2.2782831154064806e-05, "loss": 0.9858, "step": 4890 }, { "epoch": 6.811977715877437, "grad_norm": 3.0627267360687256, "learning_rate": 2.2772882319499714e-05, "loss": 1.1379, "step": 4891 }, { "epoch": 6.813370473537605, "grad_norm": 2.906496524810791, "learning_rate": 2.276293348493462e-05, "loss": 0.9958, "step": 4892 }, { "epoch": 6.814763231197771, "grad_norm": 3.279766798019409, "learning_rate": 2.2752984650369525e-05, "loss": 1.0487, "step": 4893 }, { "epoch": 6.816155988857939, "grad_norm": 2.6565163135528564, "learning_rate": 2.2743035815804433e-05, "loss": 0.8614, "step": 4894 }, { "epoch": 6.8175487465181055, "grad_norm": 2.6851377487182617, "learning_rate": 2.2733086981239337e-05, "loss": 0.8974, "step": 4895 }, { "epoch": 6.818941504178273, "grad_norm": 2.879284143447876, "learning_rate": 2.2723138146674244e-05, "loss": 0.899, "step": 4896 }, { "epoch": 6.82033426183844, "grad_norm": 3.6307334899902344, "learning_rate": 2.271318931210915e-05, "loss": 1.2624, "step": 4897 }, { "epoch": 6.821727019498607, "grad_norm": 3.167030096054077, "learning_rate": 2.2703240477544056e-05, "loss": 1.0152, "step": 4898 }, { "epoch": 6.823119777158775, "grad_norm": 2.485337018966675, "learning_rate": 2.2693291642978963e-05, "loss": 0.8044, "step": 4899 }, { "epoch": 6.8245125348189415, "grad_norm": 3.0829617977142334, "learning_rate": 2.2683342808413868e-05, "loss": 1.2321, "step": 4900 }, { "epoch": 6.825905292479108, "grad_norm": 3.10870361328125, "learning_rate": 2.2673393973848775e-05, "loss": 0.907, "step": 4901 }, { "epoch": 6.827298050139276, "grad_norm": 3.1099343299865723, "learning_rate": 2.266344513928368e-05, "loss": 0.9595, "step": 4902 }, { "epoch": 6.828690807799443, "grad_norm": 3.0900535583496094, "learning_rate": 2.2653496304718587e-05, "loss": 1.0796, "step": 4903 }, { "epoch": 6.83008356545961, "grad_norm": 3.012258529663086, "learning_rate": 2.2643547470153497e-05, "loss": 0.9799, "step": 4904 }, { "epoch": 6.8314763231197775, "grad_norm": 2.9189305305480957, "learning_rate": 2.2633598635588398e-05, "loss": 0.8298, "step": 4905 }, { "epoch": 6.832869080779944, "grad_norm": 3.2075531482696533, "learning_rate": 2.262364980102331e-05, "loss": 1.2077, "step": 4906 }, { "epoch": 6.834261838440112, "grad_norm": 3.070300340652466, "learning_rate": 2.2613700966458213e-05, "loss": 1.1166, "step": 4907 }, { "epoch": 6.835654596100278, "grad_norm": 2.730436325073242, "learning_rate": 2.260375213189312e-05, "loss": 1.137, "step": 4908 }, { "epoch": 6.837047353760446, "grad_norm": 2.8196377754211426, "learning_rate": 2.2593803297328028e-05, "loss": 0.8568, "step": 4909 }, { "epoch": 6.838440111420613, "grad_norm": 3.2209866046905518, "learning_rate": 2.2583854462762932e-05, "loss": 1.1469, "step": 4910 }, { "epoch": 6.83983286908078, "grad_norm": 3.142430067062378, "learning_rate": 2.257390562819784e-05, "loss": 1.1244, "step": 4911 }, { "epoch": 6.841225626740947, "grad_norm": 2.8419435024261475, "learning_rate": 2.2563956793632744e-05, "loss": 0.8251, "step": 4912 }, { "epoch": 6.842618384401114, "grad_norm": 3.740586519241333, "learning_rate": 2.255400795906765e-05, "loss": 1.0554, "step": 4913 }, { "epoch": 6.844011142061281, "grad_norm": 2.52078914642334, "learning_rate": 2.254405912450256e-05, "loss": 0.8627, "step": 4914 }, { "epoch": 6.845403899721449, "grad_norm": 3.096212387084961, "learning_rate": 2.2534110289937463e-05, "loss": 1.0125, "step": 4915 }, { "epoch": 6.846796657381615, "grad_norm": 3.0929019451141357, "learning_rate": 2.252416145537237e-05, "loss": 1.1388, "step": 4916 }, { "epoch": 6.848189415041783, "grad_norm": 2.9732069969177246, "learning_rate": 2.2514212620807274e-05, "loss": 1.0101, "step": 4917 }, { "epoch": 6.84958217270195, "grad_norm": 3.1545639038085938, "learning_rate": 2.250426378624218e-05, "loss": 1.0123, "step": 4918 }, { "epoch": 6.850974930362117, "grad_norm": 3.0801069736480713, "learning_rate": 2.2494314951677086e-05, "loss": 1.1101, "step": 4919 }, { "epoch": 6.852367688022284, "grad_norm": 3.8419761657714844, "learning_rate": 2.2484366117111993e-05, "loss": 0.8489, "step": 4920 }, { "epoch": 6.853760445682451, "grad_norm": 3.294038772583008, "learning_rate": 2.24744172825469e-05, "loss": 1.1795, "step": 4921 }, { "epoch": 6.855153203342619, "grad_norm": 3.377178430557251, "learning_rate": 2.2464468447981805e-05, "loss": 1.3105, "step": 4922 }, { "epoch": 6.8565459610027855, "grad_norm": 3.4253134727478027, "learning_rate": 2.2454519613416712e-05, "loss": 1.1638, "step": 4923 }, { "epoch": 6.857938718662953, "grad_norm": 2.7638607025146484, "learning_rate": 2.2444570778851616e-05, "loss": 0.9349, "step": 4924 }, { "epoch": 6.85933147632312, "grad_norm": 2.787493944168091, "learning_rate": 2.2434621944286524e-05, "loss": 0.9409, "step": 4925 }, { "epoch": 6.860724233983287, "grad_norm": 2.876359224319458, "learning_rate": 2.242467310972143e-05, "loss": 0.852, "step": 4926 }, { "epoch": 6.862116991643454, "grad_norm": 3.78804874420166, "learning_rate": 2.2414724275156335e-05, "loss": 1.2609, "step": 4927 }, { "epoch": 6.8635097493036215, "grad_norm": 2.9731123447418213, "learning_rate": 2.2404775440591243e-05, "loss": 1.0511, "step": 4928 }, { "epoch": 6.864902506963788, "grad_norm": 2.677267074584961, "learning_rate": 2.2394826606026147e-05, "loss": 0.9392, "step": 4929 }, { "epoch": 6.866295264623956, "grad_norm": 2.678102970123291, "learning_rate": 2.2384877771461054e-05, "loss": 0.9781, "step": 4930 }, { "epoch": 6.867688022284122, "grad_norm": 3.74009108543396, "learning_rate": 2.2374928936895965e-05, "loss": 0.9932, "step": 4931 }, { "epoch": 6.86908077994429, "grad_norm": 2.4893105030059814, "learning_rate": 2.2364980102330866e-05, "loss": 0.7554, "step": 4932 }, { "epoch": 6.870473537604457, "grad_norm": 2.7979495525360107, "learning_rate": 2.2355031267765777e-05, "loss": 0.819, "step": 4933 }, { "epoch": 6.871866295264624, "grad_norm": 2.7396624088287354, "learning_rate": 2.234508243320068e-05, "loss": 0.8175, "step": 4934 }, { "epoch": 6.873259052924791, "grad_norm": 3.2493722438812256, "learning_rate": 2.2335133598635588e-05, "loss": 0.9786, "step": 4935 }, { "epoch": 6.874651810584958, "grad_norm": 2.8006935119628906, "learning_rate": 2.2325184764070496e-05, "loss": 0.8901, "step": 4936 }, { "epoch": 6.876044568245125, "grad_norm": 2.727621078491211, "learning_rate": 2.23152359295054e-05, "loss": 0.9357, "step": 4937 }, { "epoch": 6.8774373259052926, "grad_norm": 3.2232580184936523, "learning_rate": 2.2305287094940307e-05, "loss": 1.0972, "step": 4938 }, { "epoch": 6.878830083565459, "grad_norm": 3.3753669261932373, "learning_rate": 2.229533826037521e-05, "loss": 0.9813, "step": 4939 }, { "epoch": 6.880222841225627, "grad_norm": 3.5995097160339355, "learning_rate": 2.228538942581012e-05, "loss": 1.5259, "step": 4940 }, { "epoch": 6.881615598885794, "grad_norm": 2.841562032699585, "learning_rate": 2.2275440591245023e-05, "loss": 1.1019, "step": 4941 }, { "epoch": 6.883008356545961, "grad_norm": 4.627342224121094, "learning_rate": 2.226549175667993e-05, "loss": 0.9604, "step": 4942 }, { "epoch": 6.884401114206128, "grad_norm": 3.2343509197235107, "learning_rate": 2.2255542922114838e-05, "loss": 1.168, "step": 4943 }, { "epoch": 6.885793871866295, "grad_norm": 3.2918992042541504, "learning_rate": 2.224559408754974e-05, "loss": 1.2578, "step": 4944 }, { "epoch": 6.887186629526463, "grad_norm": 2.8083975315093994, "learning_rate": 2.223564525298465e-05, "loss": 0.9497, "step": 4945 }, { "epoch": 6.888579387186629, "grad_norm": 2.8898448944091797, "learning_rate": 2.2225696418419553e-05, "loss": 0.9155, "step": 4946 }, { "epoch": 6.889972144846797, "grad_norm": 3.120607376098633, "learning_rate": 2.221574758385446e-05, "loss": 1.1165, "step": 4947 }, { "epoch": 6.891364902506964, "grad_norm": 2.907076835632324, "learning_rate": 2.2205798749289368e-05, "loss": 0.7774, "step": 4948 }, { "epoch": 6.892757660167131, "grad_norm": 2.8598861694335938, "learning_rate": 2.2195849914724272e-05, "loss": 0.8315, "step": 4949 }, { "epoch": 6.894150417827298, "grad_norm": 2.8945460319519043, "learning_rate": 2.218590108015918e-05, "loss": 0.9248, "step": 4950 }, { "epoch": 6.895543175487465, "grad_norm": 3.612288236618042, "learning_rate": 2.2175952245594084e-05, "loss": 1.3396, "step": 4951 }, { "epoch": 6.896935933147632, "grad_norm": 2.9821317195892334, "learning_rate": 2.216600341102899e-05, "loss": 0.8911, "step": 4952 }, { "epoch": 6.8983286908078, "grad_norm": 3.016906261444092, "learning_rate": 2.21560545764639e-05, "loss": 1.109, "step": 4953 }, { "epoch": 6.899721448467966, "grad_norm": 3.6679413318634033, "learning_rate": 2.2146105741898803e-05, "loss": 0.9454, "step": 4954 }, { "epoch": 6.901114206128134, "grad_norm": 2.7348663806915283, "learning_rate": 2.213615690733371e-05, "loss": 1.0042, "step": 4955 }, { "epoch": 6.9025069637883005, "grad_norm": 2.812051296234131, "learning_rate": 2.2126208072768614e-05, "loss": 0.9994, "step": 4956 }, { "epoch": 6.903899721448468, "grad_norm": 3.1831793785095215, "learning_rate": 2.2116259238203522e-05, "loss": 1.2802, "step": 4957 }, { "epoch": 6.905292479108635, "grad_norm": 4.422842025756836, "learning_rate": 2.210631040363843e-05, "loss": 0.8655, "step": 4958 }, { "epoch": 6.906685236768802, "grad_norm": 3.1010208129882812, "learning_rate": 2.2096361569073333e-05, "loss": 1.0346, "step": 4959 }, { "epoch": 6.908077994428969, "grad_norm": 2.766726493835449, "learning_rate": 2.2086412734508244e-05, "loss": 1.0624, "step": 4960 }, { "epoch": 6.9094707520891365, "grad_norm": 3.0663726329803467, "learning_rate": 2.2076463899943145e-05, "loss": 0.9082, "step": 4961 }, { "epoch": 6.910863509749303, "grad_norm": 3.0582809448242188, "learning_rate": 2.2066515065378056e-05, "loss": 1.2008, "step": 4962 }, { "epoch": 6.912256267409471, "grad_norm": 2.92802095413208, "learning_rate": 2.205656623081296e-05, "loss": 1.0997, "step": 4963 }, { "epoch": 6.913649025069638, "grad_norm": 3.3929343223571777, "learning_rate": 2.2046617396247867e-05, "loss": 1.1293, "step": 4964 }, { "epoch": 6.915041782729805, "grad_norm": 2.8015310764312744, "learning_rate": 2.2036668561682775e-05, "loss": 0.9593, "step": 4965 }, { "epoch": 6.9164345403899725, "grad_norm": 2.6430611610412598, "learning_rate": 2.202671972711768e-05, "loss": 0.9387, "step": 4966 }, { "epoch": 6.917827298050139, "grad_norm": 3.121490478515625, "learning_rate": 2.2016770892552586e-05, "loss": 0.8176, "step": 4967 }, { "epoch": 6.919220055710307, "grad_norm": 2.7189342975616455, "learning_rate": 2.200682205798749e-05, "loss": 0.731, "step": 4968 }, { "epoch": 6.920612813370473, "grad_norm": 2.969587564468384, "learning_rate": 2.1996873223422398e-05, "loss": 1.0109, "step": 4969 }, { "epoch": 6.922005571030641, "grad_norm": 2.9785587787628174, "learning_rate": 2.1986924388857305e-05, "loss": 1.0273, "step": 4970 }, { "epoch": 6.923398328690808, "grad_norm": 3.127487897872925, "learning_rate": 2.197697555429221e-05, "loss": 1.1461, "step": 4971 }, { "epoch": 6.924791086350975, "grad_norm": 3.0932915210723877, "learning_rate": 2.1967026719727117e-05, "loss": 0.9521, "step": 4972 }, { "epoch": 6.926183844011142, "grad_norm": 3.6915338039398193, "learning_rate": 2.195707788516202e-05, "loss": 1.2715, "step": 4973 }, { "epoch": 6.927576601671309, "grad_norm": 2.8909175395965576, "learning_rate": 2.1947129050596928e-05, "loss": 1.2073, "step": 4974 }, { "epoch": 6.928969359331476, "grad_norm": 2.9219141006469727, "learning_rate": 2.1937180216031836e-05, "loss": 0.8063, "step": 4975 }, { "epoch": 6.930362116991644, "grad_norm": 4.024686336517334, "learning_rate": 2.192723138146674e-05, "loss": 0.8786, "step": 4976 }, { "epoch": 6.93175487465181, "grad_norm": 2.914431571960449, "learning_rate": 2.1917282546901647e-05, "loss": 0.7909, "step": 4977 }, { "epoch": 6.933147632311978, "grad_norm": 3.4128596782684326, "learning_rate": 2.190733371233655e-05, "loss": 1.1857, "step": 4978 }, { "epoch": 6.9345403899721445, "grad_norm": 2.566725254058838, "learning_rate": 2.189738487777146e-05, "loss": 0.8833, "step": 4979 }, { "epoch": 6.935933147632312, "grad_norm": 3.3989527225494385, "learning_rate": 2.1887436043206366e-05, "loss": 1.0414, "step": 4980 }, { "epoch": 6.937325905292479, "grad_norm": 3.1205458641052246, "learning_rate": 2.187748720864127e-05, "loss": 0.8835, "step": 4981 }, { "epoch": 6.938718662952646, "grad_norm": 3.2083778381347656, "learning_rate": 2.1867538374076178e-05, "loss": 1.2242, "step": 4982 }, { "epoch": 6.940111420612814, "grad_norm": 3.1415822505950928, "learning_rate": 2.1857589539511082e-05, "loss": 0.8324, "step": 4983 }, { "epoch": 6.9415041782729805, "grad_norm": 3.6850414276123047, "learning_rate": 2.184764070494599e-05, "loss": 1.0104, "step": 4984 }, { "epoch": 6.942896935933147, "grad_norm": 2.9321935176849365, "learning_rate": 2.1837691870380893e-05, "loss": 0.8893, "step": 4985 }, { "epoch": 6.944289693593315, "grad_norm": 3.8329665660858154, "learning_rate": 2.18277430358158e-05, "loss": 1.1802, "step": 4986 }, { "epoch": 6.945682451253482, "grad_norm": 3.4439475536346436, "learning_rate": 2.1817794201250712e-05, "loss": 0.8457, "step": 4987 }, { "epoch": 6.947075208913649, "grad_norm": 3.002546548843384, "learning_rate": 2.1807845366685612e-05, "loss": 1.1196, "step": 4988 }, { "epoch": 6.9484679665738165, "grad_norm": 3.2283008098602295, "learning_rate": 2.1797896532120523e-05, "loss": 1.1945, "step": 4989 }, { "epoch": 6.949860724233983, "grad_norm": 2.5133228302001953, "learning_rate": 2.1787947697555427e-05, "loss": 0.7714, "step": 4990 }, { "epoch": 6.951253481894151, "grad_norm": 3.184483528137207, "learning_rate": 2.1777998862990335e-05, "loss": 0.8774, "step": 4991 }, { "epoch": 6.952646239554317, "grad_norm": 2.887610912322998, "learning_rate": 2.1768050028425242e-05, "loss": 0.8744, "step": 4992 }, { "epoch": 6.954038997214485, "grad_norm": 2.69132661819458, "learning_rate": 2.1758101193860146e-05, "loss": 0.8415, "step": 4993 }, { "epoch": 6.955431754874652, "grad_norm": 3.511922597885132, "learning_rate": 2.1748152359295054e-05, "loss": 0.8937, "step": 4994 }, { "epoch": 6.956824512534819, "grad_norm": 3.0658130645751953, "learning_rate": 2.1738203524729958e-05, "loss": 0.8997, "step": 4995 }, { "epoch": 6.958217270194986, "grad_norm": 2.714297294616699, "learning_rate": 2.1728254690164865e-05, "loss": 1.1088, "step": 4996 }, { "epoch": 6.959610027855153, "grad_norm": 2.500307083129883, "learning_rate": 2.1718305855599773e-05, "loss": 0.9374, "step": 4997 }, { "epoch": 6.96100278551532, "grad_norm": 2.758538007736206, "learning_rate": 2.1708357021034677e-05, "loss": 0.9487, "step": 4998 }, { "epoch": 6.962395543175488, "grad_norm": 3.0099618434906006, "learning_rate": 2.1698408186469584e-05, "loss": 1.0268, "step": 4999 }, { "epoch": 6.963788300835654, "grad_norm": 3.660125970840454, "learning_rate": 2.168845935190449e-05, "loss": 1.0164, "step": 5000 }, { "epoch": 6.965181058495822, "grad_norm": 3.014194965362549, "learning_rate": 2.1678510517339396e-05, "loss": 1.1265, "step": 5001 }, { "epoch": 6.9665738161559885, "grad_norm": 2.936569929122925, "learning_rate": 2.1668561682774303e-05, "loss": 0.9369, "step": 5002 }, { "epoch": 6.967966573816156, "grad_norm": 3.5617330074310303, "learning_rate": 2.1658612848209207e-05, "loss": 1.2657, "step": 5003 }, { "epoch": 6.969359331476323, "grad_norm": 3.6318094730377197, "learning_rate": 2.1648664013644115e-05, "loss": 1.1003, "step": 5004 }, { "epoch": 6.97075208913649, "grad_norm": 2.7930498123168945, "learning_rate": 2.163871517907902e-05, "loss": 0.9573, "step": 5005 }, { "epoch": 6.972144846796658, "grad_norm": 3.4401938915252686, "learning_rate": 2.1628766344513926e-05, "loss": 1.0708, "step": 5006 }, { "epoch": 6.9735376044568245, "grad_norm": 2.823951005935669, "learning_rate": 2.161881750994883e-05, "loss": 1.1048, "step": 5007 }, { "epoch": 6.974930362116992, "grad_norm": 2.9734506607055664, "learning_rate": 2.1608868675383738e-05, "loss": 1.0475, "step": 5008 }, { "epoch": 6.976323119777159, "grad_norm": 2.6455373764038086, "learning_rate": 2.1598919840818645e-05, "loss": 0.8868, "step": 5009 }, { "epoch": 6.977715877437326, "grad_norm": 3.0616531372070312, "learning_rate": 2.158897100625355e-05, "loss": 0.8475, "step": 5010 }, { "epoch": 6.979108635097493, "grad_norm": 2.7626075744628906, "learning_rate": 2.1579022171688457e-05, "loss": 0.9221, "step": 5011 }, { "epoch": 6.9805013927576605, "grad_norm": 3.4198381900787354, "learning_rate": 2.156907333712336e-05, "loss": 1.2821, "step": 5012 }, { "epoch": 6.981894150417827, "grad_norm": 2.8367843627929688, "learning_rate": 2.155912450255827e-05, "loss": 0.9523, "step": 5013 }, { "epoch": 6.983286908077995, "grad_norm": 3.0300216674804688, "learning_rate": 2.1549175667993176e-05, "loss": 0.9777, "step": 5014 }, { "epoch": 6.984679665738161, "grad_norm": 3.0950534343719482, "learning_rate": 2.153922683342808e-05, "loss": 1.0669, "step": 5015 }, { "epoch": 6.986072423398329, "grad_norm": 3.0042805671691895, "learning_rate": 2.152927799886299e-05, "loss": 0.8434, "step": 5016 }, { "epoch": 6.987465181058496, "grad_norm": 2.593696355819702, "learning_rate": 2.151932916429789e-05, "loss": 0.7874, "step": 5017 }, { "epoch": 6.988857938718663, "grad_norm": 2.6960651874542236, "learning_rate": 2.1509380329732802e-05, "loss": 0.9296, "step": 5018 }, { "epoch": 6.99025069637883, "grad_norm": 3.2014400959014893, "learning_rate": 2.149943149516771e-05, "loss": 0.9696, "step": 5019 }, { "epoch": 6.991643454038997, "grad_norm": 2.7455062866210938, "learning_rate": 2.1489482660602614e-05, "loss": 0.8764, "step": 5020 }, { "epoch": 6.993036211699164, "grad_norm": 3.0886213779449463, "learning_rate": 2.147953382603752e-05, "loss": 0.8537, "step": 5021 }, { "epoch": 6.994428969359332, "grad_norm": 2.88210391998291, "learning_rate": 2.1469584991472426e-05, "loss": 0.875, "step": 5022 }, { "epoch": 6.995821727019498, "grad_norm": 3.03379487991333, "learning_rate": 2.1459636156907333e-05, "loss": 0.9391, "step": 5023 }, { "epoch": 6.997214484679666, "grad_norm": 2.986440420150757, "learning_rate": 2.144968732234224e-05, "loss": 1.0542, "step": 5024 }, { "epoch": 6.998607242339833, "grad_norm": 3.2636523246765137, "learning_rate": 2.1439738487777145e-05, "loss": 0.9983, "step": 5025 }, { "epoch": 7.0, "grad_norm": 3.2439088821411133, "learning_rate": 2.1429789653212052e-05, "loss": 1.0126, "step": 5026 }, { "epoch": 7.0013927576601676, "grad_norm": 2.588900566101074, "learning_rate": 2.1419840818646956e-05, "loss": 0.7241, "step": 5027 }, { "epoch": 7.002785515320334, "grad_norm": 2.518993616104126, "learning_rate": 2.1409891984081864e-05, "loss": 0.8023, "step": 5028 }, { "epoch": 7.004178272980502, "grad_norm": 2.8074703216552734, "learning_rate": 2.1399943149516768e-05, "loss": 0.983, "step": 5029 }, { "epoch": 7.005571030640668, "grad_norm": 2.823148012161255, "learning_rate": 2.1389994314951675e-05, "loss": 1.0526, "step": 5030 }, { "epoch": 7.006963788300836, "grad_norm": 2.761317014694214, "learning_rate": 2.1380045480386583e-05, "loss": 0.8701, "step": 5031 }, { "epoch": 7.008356545961003, "grad_norm": 2.7132506370544434, "learning_rate": 2.1370096645821487e-05, "loss": 0.7854, "step": 5032 }, { "epoch": 7.00974930362117, "grad_norm": 2.4156360626220703, "learning_rate": 2.1360147811256394e-05, "loss": 0.7814, "step": 5033 }, { "epoch": 7.011142061281337, "grad_norm": 2.9176406860351562, "learning_rate": 2.1350198976691298e-05, "loss": 0.8162, "step": 5034 }, { "epoch": 7.012534818941504, "grad_norm": 2.6062848567962646, "learning_rate": 2.1340250142126206e-05, "loss": 0.8725, "step": 5035 }, { "epoch": 7.013927576601671, "grad_norm": 2.6114494800567627, "learning_rate": 2.1330301307561113e-05, "loss": 0.9053, "step": 5036 }, { "epoch": 7.015320334261839, "grad_norm": 2.632460117340088, "learning_rate": 2.1320352472996017e-05, "loss": 0.7621, "step": 5037 }, { "epoch": 7.016713091922005, "grad_norm": 2.561506509780884, "learning_rate": 2.1310403638430925e-05, "loss": 0.8962, "step": 5038 }, { "epoch": 7.018105849582173, "grad_norm": 3.0773773193359375, "learning_rate": 2.130045480386583e-05, "loss": 0.9322, "step": 5039 }, { "epoch": 7.0194986072423395, "grad_norm": 2.61908221244812, "learning_rate": 2.1290505969300736e-05, "loss": 0.942, "step": 5040 }, { "epoch": 7.020891364902507, "grad_norm": 2.6667587757110596, "learning_rate": 2.1280557134735644e-05, "loss": 0.8027, "step": 5041 }, { "epoch": 7.022284122562674, "grad_norm": 2.5058159828186035, "learning_rate": 2.1270608300170548e-05, "loss": 0.7292, "step": 5042 }, { "epoch": 7.023676880222841, "grad_norm": 2.6387643814086914, "learning_rate": 2.126065946560546e-05, "loss": 0.7403, "step": 5043 }, { "epoch": 7.025069637883008, "grad_norm": 2.9340646266937256, "learning_rate": 2.125071063104036e-05, "loss": 1.1487, "step": 5044 }, { "epoch": 7.0264623955431755, "grad_norm": 3.019251823425293, "learning_rate": 2.124076179647527e-05, "loss": 0.8989, "step": 5045 }, { "epoch": 7.027855153203342, "grad_norm": 2.4151124954223633, "learning_rate": 2.123081296191017e-05, "loss": 0.8461, "step": 5046 }, { "epoch": 7.02924791086351, "grad_norm": 2.8883981704711914, "learning_rate": 2.122086412734508e-05, "loss": 0.9723, "step": 5047 }, { "epoch": 7.030640668523677, "grad_norm": 2.4514729976654053, "learning_rate": 2.121091529277999e-05, "loss": 0.7727, "step": 5048 }, { "epoch": 7.032033426183844, "grad_norm": 2.7968037128448486, "learning_rate": 2.1200966458214893e-05, "loss": 0.8198, "step": 5049 }, { "epoch": 7.0334261838440115, "grad_norm": 4.603396892547607, "learning_rate": 2.11910176236498e-05, "loss": 0.9419, "step": 5050 }, { "epoch": 7.034818941504178, "grad_norm": 2.6633098125457764, "learning_rate": 2.1181068789084705e-05, "loss": 0.8245, "step": 5051 }, { "epoch": 7.036211699164346, "grad_norm": 3.1830623149871826, "learning_rate": 2.1171119954519612e-05, "loss": 1.1548, "step": 5052 }, { "epoch": 7.037604456824512, "grad_norm": 2.8647260665893555, "learning_rate": 2.116117111995452e-05, "loss": 0.8362, "step": 5053 }, { "epoch": 7.03899721448468, "grad_norm": 2.6832895278930664, "learning_rate": 2.1151222285389424e-05, "loss": 0.9162, "step": 5054 }, { "epoch": 7.040389972144847, "grad_norm": 2.7217798233032227, "learning_rate": 2.114127345082433e-05, "loss": 0.8067, "step": 5055 }, { "epoch": 7.041782729805014, "grad_norm": 2.254660129547119, "learning_rate": 2.1131324616259235e-05, "loss": 0.6615, "step": 5056 }, { "epoch": 7.043175487465181, "grad_norm": 2.911212205886841, "learning_rate": 2.1121375781694143e-05, "loss": 0.9648, "step": 5057 }, { "epoch": 7.044568245125348, "grad_norm": 2.566112756729126, "learning_rate": 2.111142694712905e-05, "loss": 0.6842, "step": 5058 }, { "epoch": 7.045961002785515, "grad_norm": 2.482128143310547, "learning_rate": 2.1101478112563954e-05, "loss": 0.7122, "step": 5059 }, { "epoch": 7.047353760445683, "grad_norm": 2.571878671646118, "learning_rate": 2.1091529277998862e-05, "loss": 0.7384, "step": 5060 }, { "epoch": 7.048746518105849, "grad_norm": 2.926588773727417, "learning_rate": 2.1081580443433766e-05, "loss": 1.0739, "step": 5061 }, { "epoch": 7.050139275766017, "grad_norm": 2.393764019012451, "learning_rate": 2.1071631608868673e-05, "loss": 0.7013, "step": 5062 }, { "epoch": 7.0515320334261835, "grad_norm": 2.4707753658294678, "learning_rate": 2.106168277430358e-05, "loss": 0.7492, "step": 5063 }, { "epoch": 7.052924791086351, "grad_norm": 2.3997011184692383, "learning_rate": 2.1051733939738485e-05, "loss": 0.822, "step": 5064 }, { "epoch": 7.054317548746518, "grad_norm": 2.5442023277282715, "learning_rate": 2.1041785105173392e-05, "loss": 0.7408, "step": 5065 }, { "epoch": 7.055710306406685, "grad_norm": 2.7726078033447266, "learning_rate": 2.1031836270608296e-05, "loss": 0.9467, "step": 5066 }, { "epoch": 7.057103064066852, "grad_norm": 2.851839542388916, "learning_rate": 2.1021887436043204e-05, "loss": 1.0702, "step": 5067 }, { "epoch": 7.0584958217270195, "grad_norm": 3.1124398708343506, "learning_rate": 2.1011938601478108e-05, "loss": 0.9318, "step": 5068 }, { "epoch": 7.059888579387187, "grad_norm": 2.5048420429229736, "learning_rate": 2.1001989766913015e-05, "loss": 0.844, "step": 5069 }, { "epoch": 7.061281337047354, "grad_norm": 2.6897008419036865, "learning_rate": 2.0992040932347923e-05, "loss": 0.8532, "step": 5070 }, { "epoch": 7.062674094707521, "grad_norm": 2.5082085132598877, "learning_rate": 2.0982092097782827e-05, "loss": 0.7295, "step": 5071 }, { "epoch": 7.064066852367688, "grad_norm": 2.720109224319458, "learning_rate": 2.0972143263217738e-05, "loss": 0.7822, "step": 5072 }, { "epoch": 7.0654596100278555, "grad_norm": 2.456808567047119, "learning_rate": 2.096219442865264e-05, "loss": 0.76, "step": 5073 }, { "epoch": 7.066852367688022, "grad_norm": 2.958230495452881, "learning_rate": 2.095224559408755e-05, "loss": 0.799, "step": 5074 }, { "epoch": 7.06824512534819, "grad_norm": 2.6701319217681885, "learning_rate": 2.0942296759522457e-05, "loss": 0.9165, "step": 5075 }, { "epoch": 7.069637883008356, "grad_norm": 2.6504392623901367, "learning_rate": 2.093234792495736e-05, "loss": 0.7042, "step": 5076 }, { "epoch": 7.071030640668524, "grad_norm": 2.80842924118042, "learning_rate": 2.0922399090392268e-05, "loss": 0.8763, "step": 5077 }, { "epoch": 7.072423398328691, "grad_norm": 4.207406997680664, "learning_rate": 2.0912450255827172e-05, "loss": 0.9335, "step": 5078 }, { "epoch": 7.073816155988858, "grad_norm": 2.4932899475097656, "learning_rate": 2.090250142126208e-05, "loss": 0.7264, "step": 5079 }, { "epoch": 7.075208913649025, "grad_norm": 3.238790988922119, "learning_rate": 2.0892552586696987e-05, "loss": 0.9106, "step": 5080 }, { "epoch": 7.076601671309192, "grad_norm": 3.1530511379241943, "learning_rate": 2.088260375213189e-05, "loss": 0.9639, "step": 5081 }, { "epoch": 7.077994428969359, "grad_norm": 3.010418176651001, "learning_rate": 2.08726549175668e-05, "loss": 0.8949, "step": 5082 }, { "epoch": 7.079387186629527, "grad_norm": 2.6341300010681152, "learning_rate": 2.0862706083001703e-05, "loss": 0.6999, "step": 5083 }, { "epoch": 7.080779944289693, "grad_norm": 2.839822292327881, "learning_rate": 2.085275724843661e-05, "loss": 0.8473, "step": 5084 }, { "epoch": 7.082172701949861, "grad_norm": 2.499387264251709, "learning_rate": 2.0842808413871518e-05, "loss": 0.8796, "step": 5085 }, { "epoch": 7.0835654596100275, "grad_norm": 3.1768767833709717, "learning_rate": 2.0832859579306422e-05, "loss": 0.8339, "step": 5086 }, { "epoch": 7.084958217270195, "grad_norm": 2.5548417568206787, "learning_rate": 2.082291074474133e-05, "loss": 0.8024, "step": 5087 }, { "epoch": 7.086350974930362, "grad_norm": 2.81540846824646, "learning_rate": 2.0812961910176233e-05, "loss": 0.8978, "step": 5088 }, { "epoch": 7.087743732590529, "grad_norm": 2.507047176361084, "learning_rate": 2.080301307561114e-05, "loss": 0.8813, "step": 5089 }, { "epoch": 7.089136490250697, "grad_norm": 2.84696888923645, "learning_rate": 2.0793064241046045e-05, "loss": 0.6542, "step": 5090 }, { "epoch": 7.0905292479108635, "grad_norm": 2.5703725814819336, "learning_rate": 2.0783115406480952e-05, "loss": 0.8221, "step": 5091 }, { "epoch": 7.091922005571031, "grad_norm": 2.9748427867889404, "learning_rate": 2.077316657191586e-05, "loss": 0.7694, "step": 5092 }, { "epoch": 7.093314763231198, "grad_norm": 2.238007068634033, "learning_rate": 2.0763217737350764e-05, "loss": 0.6962, "step": 5093 }, { "epoch": 7.094707520891365, "grad_norm": 2.6781530380249023, "learning_rate": 2.075326890278567e-05, "loss": 0.8532, "step": 5094 }, { "epoch": 7.096100278551532, "grad_norm": 2.589348554611206, "learning_rate": 2.0743320068220575e-05, "loss": 0.7188, "step": 5095 }, { "epoch": 7.0974930362116995, "grad_norm": 2.998473644256592, "learning_rate": 2.0733371233655483e-05, "loss": 0.9301, "step": 5096 }, { "epoch": 7.098885793871866, "grad_norm": 2.7394397258758545, "learning_rate": 2.072342239909039e-05, "loss": 0.8027, "step": 5097 }, { "epoch": 7.100278551532034, "grad_norm": 2.572772979736328, "learning_rate": 2.0713473564525294e-05, "loss": 0.7165, "step": 5098 }, { "epoch": 7.1016713091922, "grad_norm": 2.7826836109161377, "learning_rate": 2.0703524729960202e-05, "loss": 0.9358, "step": 5099 }, { "epoch": 7.103064066852368, "grad_norm": 3.2955169677734375, "learning_rate": 2.0693575895395106e-05, "loss": 0.8409, "step": 5100 }, { "epoch": 7.104456824512535, "grad_norm": 2.598841905593872, "learning_rate": 2.0683627060830017e-05, "loss": 0.8871, "step": 5101 }, { "epoch": 7.105849582172702, "grad_norm": 2.574275255203247, "learning_rate": 2.0673678226264924e-05, "loss": 0.6962, "step": 5102 }, { "epoch": 7.107242339832869, "grad_norm": 2.3531739711761475, "learning_rate": 2.066372939169983e-05, "loss": 0.6944, "step": 5103 }, { "epoch": 7.108635097493036, "grad_norm": 2.294454574584961, "learning_rate": 2.0653780557134736e-05, "loss": 0.7277, "step": 5104 }, { "epoch": 7.110027855153203, "grad_norm": 3.7097771167755127, "learning_rate": 2.064383172256964e-05, "loss": 0.8847, "step": 5105 }, { "epoch": 7.111420612813371, "grad_norm": 2.7075202465057373, "learning_rate": 2.0633882888004547e-05, "loss": 0.8689, "step": 5106 }, { "epoch": 7.112813370473537, "grad_norm": 2.550398111343384, "learning_rate": 2.0623934053439455e-05, "loss": 0.6873, "step": 5107 }, { "epoch": 7.114206128133705, "grad_norm": 2.507601499557495, "learning_rate": 2.061398521887436e-05, "loss": 0.949, "step": 5108 }, { "epoch": 7.1155988857938715, "grad_norm": 2.350555181503296, "learning_rate": 2.0604036384309266e-05, "loss": 0.7499, "step": 5109 }, { "epoch": 7.116991643454039, "grad_norm": 2.904186487197876, "learning_rate": 2.059408754974417e-05, "loss": 0.9101, "step": 5110 }, { "epoch": 7.118384401114207, "grad_norm": 2.7214269638061523, "learning_rate": 2.0584138715179078e-05, "loss": 0.9063, "step": 5111 }, { "epoch": 7.119777158774373, "grad_norm": 2.905555486679077, "learning_rate": 2.0574189880613982e-05, "loss": 0.8439, "step": 5112 }, { "epoch": 7.121169916434541, "grad_norm": 2.5376923084259033, "learning_rate": 2.056424104604889e-05, "loss": 0.7845, "step": 5113 }, { "epoch": 7.1225626740947074, "grad_norm": 2.6675326824188232, "learning_rate": 2.0554292211483797e-05, "loss": 0.9121, "step": 5114 }, { "epoch": 7.123955431754875, "grad_norm": 3.5976321697235107, "learning_rate": 2.05443433769187e-05, "loss": 0.972, "step": 5115 }, { "epoch": 7.125348189415042, "grad_norm": 2.5115559101104736, "learning_rate": 2.053439454235361e-05, "loss": 0.7596, "step": 5116 }, { "epoch": 7.126740947075209, "grad_norm": 2.221853733062744, "learning_rate": 2.0524445707788513e-05, "loss": 0.6731, "step": 5117 }, { "epoch": 7.128133704735376, "grad_norm": 2.7109086513519287, "learning_rate": 2.051449687322342e-05, "loss": 0.9109, "step": 5118 }, { "epoch": 7.129526462395543, "grad_norm": 2.7682974338531494, "learning_rate": 2.0504548038658327e-05, "loss": 0.8008, "step": 5119 }, { "epoch": 7.13091922005571, "grad_norm": 2.805457830429077, "learning_rate": 2.049459920409323e-05, "loss": 0.8912, "step": 5120 }, { "epoch": 7.132311977715878, "grad_norm": 3.161466360092163, "learning_rate": 2.048465036952814e-05, "loss": 0.9236, "step": 5121 }, { "epoch": 7.133704735376044, "grad_norm": 2.6468088626861572, "learning_rate": 2.0474701534963043e-05, "loss": 0.8531, "step": 5122 }, { "epoch": 7.135097493036212, "grad_norm": 2.6612088680267334, "learning_rate": 2.046475270039795e-05, "loss": 0.8459, "step": 5123 }, { "epoch": 7.1364902506963785, "grad_norm": 2.7777202129364014, "learning_rate": 2.0454803865832858e-05, "loss": 0.9561, "step": 5124 }, { "epoch": 7.137883008356546, "grad_norm": 3.0060179233551025, "learning_rate": 2.0444855031267762e-05, "loss": 0.9452, "step": 5125 }, { "epoch": 7.139275766016713, "grad_norm": 3.0659730434417725, "learning_rate": 2.043490619670267e-05, "loss": 0.8891, "step": 5126 }, { "epoch": 7.14066852367688, "grad_norm": 2.5169434547424316, "learning_rate": 2.0424957362137574e-05, "loss": 0.7024, "step": 5127 }, { "epoch": 7.142061281337047, "grad_norm": 2.351672887802124, "learning_rate": 2.0415008527572484e-05, "loss": 0.6795, "step": 5128 }, { "epoch": 7.1434540389972145, "grad_norm": 2.4826416969299316, "learning_rate": 2.0405059693007392e-05, "loss": 0.745, "step": 5129 }, { "epoch": 7.144846796657381, "grad_norm": 2.7235655784606934, "learning_rate": 2.0395110858442296e-05, "loss": 0.8468, "step": 5130 }, { "epoch": 7.146239554317549, "grad_norm": 2.200775623321533, "learning_rate": 2.0385162023877203e-05, "loss": 0.7456, "step": 5131 }, { "epoch": 7.147632311977716, "grad_norm": 2.8050622940063477, "learning_rate": 2.0375213189312108e-05, "loss": 0.6287, "step": 5132 }, { "epoch": 7.149025069637883, "grad_norm": 2.5859289169311523, "learning_rate": 2.0365264354747015e-05, "loss": 0.8106, "step": 5133 }, { "epoch": 7.1504178272980505, "grad_norm": 3.303654909133911, "learning_rate": 2.035531552018192e-05, "loss": 0.9672, "step": 5134 }, { "epoch": 7.151810584958217, "grad_norm": 3.0739753246307373, "learning_rate": 2.0345366685616827e-05, "loss": 0.8583, "step": 5135 }, { "epoch": 7.153203342618385, "grad_norm": 2.913184404373169, "learning_rate": 2.0335417851051734e-05, "loss": 0.9952, "step": 5136 }, { "epoch": 7.154596100278551, "grad_norm": 2.4681808948516846, "learning_rate": 2.0325469016486638e-05, "loss": 0.8304, "step": 5137 }, { "epoch": 7.155988857938719, "grad_norm": 2.6157567501068115, "learning_rate": 2.0315520181921546e-05, "loss": 0.7232, "step": 5138 }, { "epoch": 7.157381615598886, "grad_norm": 2.5770859718322754, "learning_rate": 2.030557134735645e-05, "loss": 0.8426, "step": 5139 }, { "epoch": 7.158774373259053, "grad_norm": 2.7727253437042236, "learning_rate": 2.0295622512791357e-05, "loss": 0.8057, "step": 5140 }, { "epoch": 7.16016713091922, "grad_norm": 2.745166778564453, "learning_rate": 2.0285673678226265e-05, "loss": 0.9122, "step": 5141 }, { "epoch": 7.161559888579387, "grad_norm": 2.6689202785491943, "learning_rate": 2.027572484366117e-05, "loss": 0.8866, "step": 5142 }, { "epoch": 7.162952646239554, "grad_norm": 2.7326817512512207, "learning_rate": 2.0265776009096076e-05, "loss": 0.9073, "step": 5143 }, { "epoch": 7.164345403899722, "grad_norm": 2.2113630771636963, "learning_rate": 2.025582717453098e-05, "loss": 0.8135, "step": 5144 }, { "epoch": 7.165738161559888, "grad_norm": 4.216705322265625, "learning_rate": 2.0245878339965888e-05, "loss": 0.8057, "step": 5145 }, { "epoch": 7.167130919220056, "grad_norm": 2.7352399826049805, "learning_rate": 2.0235929505400795e-05, "loss": 0.8108, "step": 5146 }, { "epoch": 7.1685236768802225, "grad_norm": 2.788586139678955, "learning_rate": 2.02259806708357e-05, "loss": 0.6843, "step": 5147 }, { "epoch": 7.16991643454039, "grad_norm": 2.4862241744995117, "learning_rate": 2.0216031836270607e-05, "loss": 0.826, "step": 5148 }, { "epoch": 7.171309192200557, "grad_norm": 3.241105556488037, "learning_rate": 2.020608300170551e-05, "loss": 1.0344, "step": 5149 }, { "epoch": 7.172701949860724, "grad_norm": 2.471928834915161, "learning_rate": 2.0196134167140418e-05, "loss": 0.6736, "step": 5150 }, { "epoch": 7.174094707520891, "grad_norm": 2.711073875427246, "learning_rate": 2.0186185332575326e-05, "loss": 0.911, "step": 5151 }, { "epoch": 7.1754874651810585, "grad_norm": 2.5336215496063232, "learning_rate": 2.017623649801023e-05, "loss": 0.936, "step": 5152 }, { "epoch": 7.176880222841225, "grad_norm": 2.245718479156494, "learning_rate": 2.0166287663445137e-05, "loss": 0.632, "step": 5153 }, { "epoch": 7.178272980501393, "grad_norm": 2.371717929840088, "learning_rate": 2.015633882888004e-05, "loss": 0.7806, "step": 5154 }, { "epoch": 7.17966573816156, "grad_norm": 2.528045415878296, "learning_rate": 2.014638999431495e-05, "loss": 0.8388, "step": 5155 }, { "epoch": 7.181058495821727, "grad_norm": 2.6796059608459473, "learning_rate": 2.0136441159749853e-05, "loss": 0.8448, "step": 5156 }, { "epoch": 7.1824512534818945, "grad_norm": 2.8471596240997314, "learning_rate": 2.0126492325184764e-05, "loss": 0.8743, "step": 5157 }, { "epoch": 7.183844011142061, "grad_norm": 2.8560431003570557, "learning_rate": 2.011654349061967e-05, "loss": 0.9575, "step": 5158 }, { "epoch": 7.185236768802229, "grad_norm": 3.3372654914855957, "learning_rate": 2.0106594656054575e-05, "loss": 1.3444, "step": 5159 }, { "epoch": 7.186629526462395, "grad_norm": 2.640535593032837, "learning_rate": 2.0096645821489483e-05, "loss": 0.8092, "step": 5160 }, { "epoch": 7.188022284122563, "grad_norm": 2.883291482925415, "learning_rate": 2.0086696986924387e-05, "loss": 0.7063, "step": 5161 }, { "epoch": 7.18941504178273, "grad_norm": 2.960609197616577, "learning_rate": 2.0076748152359294e-05, "loss": 1.1612, "step": 5162 }, { "epoch": 7.190807799442897, "grad_norm": 2.7009124755859375, "learning_rate": 2.00667993177942e-05, "loss": 1.0279, "step": 5163 }, { "epoch": 7.192200557103064, "grad_norm": 2.5741305351257324, "learning_rate": 2.0056850483229106e-05, "loss": 0.7547, "step": 5164 }, { "epoch": 7.193593314763231, "grad_norm": 2.402419090270996, "learning_rate": 2.0046901648664013e-05, "loss": 0.7029, "step": 5165 }, { "epoch": 7.194986072423398, "grad_norm": 2.8439228534698486, "learning_rate": 2.0036952814098917e-05, "loss": 0.7626, "step": 5166 }, { "epoch": 7.196378830083566, "grad_norm": 2.8873085975646973, "learning_rate": 2.0027003979533825e-05, "loss": 0.9064, "step": 5167 }, { "epoch": 7.197771587743732, "grad_norm": 2.371703624725342, "learning_rate": 2.0017055144968732e-05, "loss": 0.5794, "step": 5168 }, { "epoch": 7.1991643454039, "grad_norm": 2.3308355808258057, "learning_rate": 2.0007106310403636e-05, "loss": 0.6031, "step": 5169 }, { "epoch": 7.2005571030640665, "grad_norm": 2.594644784927368, "learning_rate": 1.9997157475838544e-05, "loss": 0.7663, "step": 5170 }, { "epoch": 7.201949860724234, "grad_norm": 2.617103338241577, "learning_rate": 1.9987208641273448e-05, "loss": 0.801, "step": 5171 }, { "epoch": 7.203342618384401, "grad_norm": 3.003443717956543, "learning_rate": 1.9977259806708355e-05, "loss": 0.9119, "step": 5172 }, { "epoch": 7.204735376044568, "grad_norm": 2.8007192611694336, "learning_rate": 1.9967310972143263e-05, "loss": 0.9762, "step": 5173 }, { "epoch": 7.206128133704736, "grad_norm": 2.3412129878997803, "learning_rate": 1.9957362137578167e-05, "loss": 0.6466, "step": 5174 }, { "epoch": 7.2075208913649025, "grad_norm": 3.0760936737060547, "learning_rate": 1.9947413303013074e-05, "loss": 1.1183, "step": 5175 }, { "epoch": 7.20891364902507, "grad_norm": 2.990391969680786, "learning_rate": 1.993746446844798e-05, "loss": 0.7956, "step": 5176 }, { "epoch": 7.210306406685237, "grad_norm": 2.6078553199768066, "learning_rate": 1.9927515633882886e-05, "loss": 0.742, "step": 5177 }, { "epoch": 7.211699164345404, "grad_norm": 3.205244541168213, "learning_rate": 1.991756679931779e-05, "loss": 0.8327, "step": 5178 }, { "epoch": 7.213091922005571, "grad_norm": 3.2693397998809814, "learning_rate": 1.9907617964752697e-05, "loss": 0.8681, "step": 5179 }, { "epoch": 7.2144846796657385, "grad_norm": 3.0199766159057617, "learning_rate": 1.9897669130187605e-05, "loss": 1.0342, "step": 5180 }, { "epoch": 7.215877437325905, "grad_norm": 2.707958221435547, "learning_rate": 1.988772029562251e-05, "loss": 0.8669, "step": 5181 }, { "epoch": 7.217270194986073, "grad_norm": 2.774451971054077, "learning_rate": 1.9877771461057416e-05, "loss": 0.8854, "step": 5182 }, { "epoch": 7.218662952646239, "grad_norm": 2.6234207153320312, "learning_rate": 1.986782262649232e-05, "loss": 0.6751, "step": 5183 }, { "epoch": 7.220055710306407, "grad_norm": 2.521958827972412, "learning_rate": 1.985787379192723e-05, "loss": 0.7473, "step": 5184 }, { "epoch": 7.221448467966574, "grad_norm": 2.656883716583252, "learning_rate": 1.984792495736214e-05, "loss": 0.8238, "step": 5185 }, { "epoch": 7.222841225626741, "grad_norm": 2.4660890102386475, "learning_rate": 1.9837976122797043e-05, "loss": 0.7723, "step": 5186 }, { "epoch": 7.224233983286908, "grad_norm": 2.383047342300415, "learning_rate": 1.982802728823195e-05, "loss": 0.6429, "step": 5187 }, { "epoch": 7.225626740947075, "grad_norm": 2.556492805480957, "learning_rate": 1.9818078453666854e-05, "loss": 0.7697, "step": 5188 }, { "epoch": 7.227019498607242, "grad_norm": 4.027998924255371, "learning_rate": 1.9808129619101762e-05, "loss": 0.9606, "step": 5189 }, { "epoch": 7.22841225626741, "grad_norm": 2.40372896194458, "learning_rate": 1.979818078453667e-05, "loss": 0.717, "step": 5190 }, { "epoch": 7.229805013927576, "grad_norm": 2.6457722187042236, "learning_rate": 1.9788231949971573e-05, "loss": 0.8272, "step": 5191 }, { "epoch": 7.231197771587744, "grad_norm": 2.5490403175354004, "learning_rate": 1.977828311540648e-05, "loss": 0.824, "step": 5192 }, { "epoch": 7.2325905292479105, "grad_norm": 2.8429832458496094, "learning_rate": 1.9768334280841385e-05, "loss": 0.8409, "step": 5193 }, { "epoch": 7.233983286908078, "grad_norm": 2.7641210556030273, "learning_rate": 1.9758385446276292e-05, "loss": 0.8567, "step": 5194 }, { "epoch": 7.235376044568245, "grad_norm": 2.108842134475708, "learning_rate": 1.97484366117112e-05, "loss": 0.6063, "step": 5195 }, { "epoch": 7.236768802228412, "grad_norm": 2.6179306507110596, "learning_rate": 1.9738487777146104e-05, "loss": 0.672, "step": 5196 }, { "epoch": 7.23816155988858, "grad_norm": 2.7160799503326416, "learning_rate": 1.972853894258101e-05, "loss": 0.8693, "step": 5197 }, { "epoch": 7.2395543175487465, "grad_norm": 2.631033420562744, "learning_rate": 1.9718590108015915e-05, "loss": 0.7287, "step": 5198 }, { "epoch": 7.240947075208914, "grad_norm": 2.4316320419311523, "learning_rate": 1.9708641273450823e-05, "loss": 0.7627, "step": 5199 }, { "epoch": 7.242339832869081, "grad_norm": 2.6915841102600098, "learning_rate": 1.9698692438885727e-05, "loss": 0.9119, "step": 5200 }, { "epoch": 7.243732590529248, "grad_norm": 2.515749931335449, "learning_rate": 1.9688743604320634e-05, "loss": 0.8842, "step": 5201 }, { "epoch": 7.245125348189415, "grad_norm": 2.6935107707977295, "learning_rate": 1.9678794769755542e-05, "loss": 0.7339, "step": 5202 }, { "epoch": 7.2465181058495824, "grad_norm": 2.112510919570923, "learning_rate": 1.9668845935190446e-05, "loss": 0.5913, "step": 5203 }, { "epoch": 7.247910863509749, "grad_norm": 2.314663887023926, "learning_rate": 1.9658897100625353e-05, "loss": 0.6436, "step": 5204 }, { "epoch": 7.249303621169917, "grad_norm": 2.8983373641967773, "learning_rate": 1.9648948266060257e-05, "loss": 0.7483, "step": 5205 }, { "epoch": 7.250696378830083, "grad_norm": 2.8968334197998047, "learning_rate": 1.9638999431495165e-05, "loss": 0.8495, "step": 5206 }, { "epoch": 7.252089136490251, "grad_norm": 2.4824650287628174, "learning_rate": 1.9629050596930072e-05, "loss": 0.684, "step": 5207 }, { "epoch": 7.2534818941504176, "grad_norm": 2.26605486869812, "learning_rate": 1.9619101762364976e-05, "loss": 0.7048, "step": 5208 }, { "epoch": 7.254874651810585, "grad_norm": 2.54241681098938, "learning_rate": 1.9609152927799884e-05, "loss": 0.8729, "step": 5209 }, { "epoch": 7.256267409470752, "grad_norm": 3.6656627655029297, "learning_rate": 1.9599204093234788e-05, "loss": 0.8071, "step": 5210 }, { "epoch": 7.257660167130919, "grad_norm": 3.096910238265991, "learning_rate": 1.9589255258669695e-05, "loss": 0.8771, "step": 5211 }, { "epoch": 7.259052924791086, "grad_norm": 2.730132579803467, "learning_rate": 1.9579306424104606e-05, "loss": 0.8259, "step": 5212 }, { "epoch": 7.2604456824512535, "grad_norm": 2.580861806869507, "learning_rate": 1.956935758953951e-05, "loss": 0.9026, "step": 5213 }, { "epoch": 7.26183844011142, "grad_norm": 2.4602959156036377, "learning_rate": 1.9559408754974418e-05, "loss": 0.7339, "step": 5214 }, { "epoch": 7.263231197771588, "grad_norm": 2.3018956184387207, "learning_rate": 1.9549459920409322e-05, "loss": 0.793, "step": 5215 }, { "epoch": 7.264623955431755, "grad_norm": 2.997117280960083, "learning_rate": 1.953951108584423e-05, "loss": 0.8779, "step": 5216 }, { "epoch": 7.266016713091922, "grad_norm": 2.6175875663757324, "learning_rate": 1.9529562251279137e-05, "loss": 0.7928, "step": 5217 }, { "epoch": 7.2674094707520895, "grad_norm": 3.443099021911621, "learning_rate": 1.951961341671404e-05, "loss": 0.7839, "step": 5218 }, { "epoch": 7.268802228412256, "grad_norm": 2.713618755340576, "learning_rate": 1.950966458214895e-05, "loss": 0.9359, "step": 5219 }, { "epoch": 7.270194986072424, "grad_norm": 2.643413543701172, "learning_rate": 1.9499715747583853e-05, "loss": 0.8584, "step": 5220 }, { "epoch": 7.27158774373259, "grad_norm": 2.323481559753418, "learning_rate": 1.948976691301876e-05, "loss": 0.7348, "step": 5221 }, { "epoch": 7.272980501392758, "grad_norm": 2.8908908367156982, "learning_rate": 1.9479818078453664e-05, "loss": 1.0852, "step": 5222 }, { "epoch": 7.274373259052925, "grad_norm": 2.4041051864624023, "learning_rate": 1.946986924388857e-05, "loss": 0.7248, "step": 5223 }, { "epoch": 7.275766016713092, "grad_norm": 2.178449869155884, "learning_rate": 1.945992040932348e-05, "loss": 0.7322, "step": 5224 }, { "epoch": 7.277158774373259, "grad_norm": 2.3139548301696777, "learning_rate": 1.9449971574758383e-05, "loss": 0.7602, "step": 5225 }, { "epoch": 7.278551532033426, "grad_norm": 2.421529531478882, "learning_rate": 1.944002274019329e-05, "loss": 0.6924, "step": 5226 }, { "epoch": 7.279944289693593, "grad_norm": 2.8418025970458984, "learning_rate": 1.9430073905628195e-05, "loss": 0.9891, "step": 5227 }, { "epoch": 7.281337047353761, "grad_norm": 2.5942490100860596, "learning_rate": 1.9420125071063102e-05, "loss": 0.8317, "step": 5228 }, { "epoch": 7.282729805013927, "grad_norm": 3.0093631744384766, "learning_rate": 1.941017623649801e-05, "loss": 0.9615, "step": 5229 }, { "epoch": 7.284122562674095, "grad_norm": 3.2489662170410156, "learning_rate": 1.9400227401932914e-05, "loss": 1.0558, "step": 5230 }, { "epoch": 7.2855153203342615, "grad_norm": 2.364912509918213, "learning_rate": 1.939027856736782e-05, "loss": 0.7061, "step": 5231 }, { "epoch": 7.286908077994429, "grad_norm": 3.0428390502929688, "learning_rate": 1.9380329732802725e-05, "loss": 0.9384, "step": 5232 }, { "epoch": 7.288300835654596, "grad_norm": 2.7181684970855713, "learning_rate": 1.9370380898237633e-05, "loss": 0.795, "step": 5233 }, { "epoch": 7.289693593314763, "grad_norm": 2.540064811706543, "learning_rate": 1.936043206367254e-05, "loss": 0.7905, "step": 5234 }, { "epoch": 7.29108635097493, "grad_norm": 2.6257853507995605, "learning_rate": 1.9350483229107444e-05, "loss": 1.0153, "step": 5235 }, { "epoch": 7.2924791086350975, "grad_norm": 2.388836145401001, "learning_rate": 1.934053439454235e-05, "loss": 0.6819, "step": 5236 }, { "epoch": 7.293871866295264, "grad_norm": 2.7063469886779785, "learning_rate": 1.9330585559977256e-05, "loss": 1.0169, "step": 5237 }, { "epoch": 7.295264623955432, "grad_norm": 2.6595163345336914, "learning_rate": 1.9320636725412163e-05, "loss": 0.7897, "step": 5238 }, { "epoch": 7.296657381615599, "grad_norm": 2.7297656536102295, "learning_rate": 1.9310687890847067e-05, "loss": 0.8888, "step": 5239 }, { "epoch": 7.298050139275766, "grad_norm": 2.5360867977142334, "learning_rate": 1.9300739056281978e-05, "loss": 0.7609, "step": 5240 }, { "epoch": 7.2994428969359335, "grad_norm": 2.6307790279388428, "learning_rate": 1.9290790221716886e-05, "loss": 0.6898, "step": 5241 }, { "epoch": 7.3008356545961, "grad_norm": 3.1441152095794678, "learning_rate": 1.928084138715179e-05, "loss": 1.0444, "step": 5242 }, { "epoch": 7.302228412256268, "grad_norm": 3.023423910140991, "learning_rate": 1.9270892552586697e-05, "loss": 0.9404, "step": 5243 }, { "epoch": 7.303621169916434, "grad_norm": 2.8225855827331543, "learning_rate": 1.92609437180216e-05, "loss": 0.8027, "step": 5244 }, { "epoch": 7.305013927576602, "grad_norm": 2.899827003479004, "learning_rate": 1.925099488345651e-05, "loss": 1.0238, "step": 5245 }, { "epoch": 7.306406685236769, "grad_norm": 3.0850961208343506, "learning_rate": 1.9241046048891416e-05, "loss": 0.9852, "step": 5246 }, { "epoch": 7.307799442896936, "grad_norm": 3.0236339569091797, "learning_rate": 1.923109721432632e-05, "loss": 0.9381, "step": 5247 }, { "epoch": 7.309192200557103, "grad_norm": 3.084923267364502, "learning_rate": 1.9221148379761228e-05, "loss": 0.8701, "step": 5248 }, { "epoch": 7.31058495821727, "grad_norm": 3.112299919128418, "learning_rate": 1.921119954519613e-05, "loss": 0.8863, "step": 5249 }, { "epoch": 7.311977715877437, "grad_norm": 2.5260541439056396, "learning_rate": 1.920125071063104e-05, "loss": 0.6948, "step": 5250 }, { "epoch": 7.313370473537605, "grad_norm": 2.752619981765747, "learning_rate": 1.9191301876065947e-05, "loss": 0.833, "step": 5251 }, { "epoch": 7.314763231197771, "grad_norm": 2.4809863567352295, "learning_rate": 1.918135304150085e-05, "loss": 0.6482, "step": 5252 }, { "epoch": 7.316155988857939, "grad_norm": 2.7999536991119385, "learning_rate": 1.9171404206935758e-05, "loss": 0.7211, "step": 5253 }, { "epoch": 7.3175487465181055, "grad_norm": 3.0541927814483643, "learning_rate": 1.9161455372370662e-05, "loss": 1.1089, "step": 5254 }, { "epoch": 7.318941504178273, "grad_norm": 2.6121320724487305, "learning_rate": 1.915150653780557e-05, "loss": 0.8468, "step": 5255 }, { "epoch": 7.32033426183844, "grad_norm": 2.517087936401367, "learning_rate": 1.9141557703240477e-05, "loss": 0.7409, "step": 5256 }, { "epoch": 7.321727019498607, "grad_norm": 2.7593190670013428, "learning_rate": 1.913160886867538e-05, "loss": 0.9637, "step": 5257 }, { "epoch": 7.323119777158775, "grad_norm": 2.8267886638641357, "learning_rate": 1.912166003411029e-05, "loss": 0.7191, "step": 5258 }, { "epoch": 7.3245125348189415, "grad_norm": 2.4519171714782715, "learning_rate": 1.9111711199545193e-05, "loss": 0.6571, "step": 5259 }, { "epoch": 7.325905292479109, "grad_norm": 2.661771774291992, "learning_rate": 1.91017623649801e-05, "loss": 0.774, "step": 5260 }, { "epoch": 7.327298050139276, "grad_norm": 3.1244726181030273, "learning_rate": 1.9091813530415004e-05, "loss": 1.0265, "step": 5261 }, { "epoch": 7.328690807799443, "grad_norm": 2.6667983531951904, "learning_rate": 1.9081864695849912e-05, "loss": 0.871, "step": 5262 }, { "epoch": 7.33008356545961, "grad_norm": 2.4615135192871094, "learning_rate": 1.907191586128482e-05, "loss": 0.703, "step": 5263 }, { "epoch": 7.3314763231197775, "grad_norm": 2.2512056827545166, "learning_rate": 1.9061967026719723e-05, "loss": 0.7559, "step": 5264 }, { "epoch": 7.332869080779944, "grad_norm": 2.763364553451538, "learning_rate": 1.905201819215463e-05, "loss": 0.8774, "step": 5265 }, { "epoch": 7.334261838440112, "grad_norm": 2.2631046772003174, "learning_rate": 1.9042069357589535e-05, "loss": 0.664, "step": 5266 }, { "epoch": 7.335654596100278, "grad_norm": 2.8672075271606445, "learning_rate": 1.9032120523024442e-05, "loss": 0.9558, "step": 5267 }, { "epoch": 7.337047353760446, "grad_norm": 2.6277668476104736, "learning_rate": 1.9022171688459353e-05, "loss": 0.7777, "step": 5268 }, { "epoch": 7.338440111420613, "grad_norm": 2.4149839878082275, "learning_rate": 1.9012222853894257e-05, "loss": 0.8391, "step": 5269 }, { "epoch": 7.33983286908078, "grad_norm": 2.5299253463745117, "learning_rate": 1.9002274019329165e-05, "loss": 0.8632, "step": 5270 }, { "epoch": 7.341225626740947, "grad_norm": 2.412696123123169, "learning_rate": 1.899232518476407e-05, "loss": 0.6032, "step": 5271 }, { "epoch": 7.342618384401114, "grad_norm": 2.2152018547058105, "learning_rate": 1.8982376350198976e-05, "loss": 0.5657, "step": 5272 }, { "epoch": 7.344011142061281, "grad_norm": 3.163825273513794, "learning_rate": 1.8972427515633884e-05, "loss": 0.8335, "step": 5273 }, { "epoch": 7.345403899721449, "grad_norm": 2.466665029525757, "learning_rate": 1.8962478681068788e-05, "loss": 0.776, "step": 5274 }, { "epoch": 7.346796657381615, "grad_norm": 2.686173439025879, "learning_rate": 1.8952529846503695e-05, "loss": 0.8854, "step": 5275 }, { "epoch": 7.348189415041783, "grad_norm": 2.6543776988983154, "learning_rate": 1.89425810119386e-05, "loss": 0.7509, "step": 5276 }, { "epoch": 7.3495821727019495, "grad_norm": 2.4722702503204346, "learning_rate": 1.8932632177373507e-05, "loss": 0.7713, "step": 5277 }, { "epoch": 7.350974930362117, "grad_norm": 2.601783037185669, "learning_rate": 1.8922683342808414e-05, "loss": 0.8249, "step": 5278 }, { "epoch": 7.352367688022284, "grad_norm": 2.874080181121826, "learning_rate": 1.8912734508243318e-05, "loss": 0.8515, "step": 5279 }, { "epoch": 7.353760445682451, "grad_norm": 2.882293462753296, "learning_rate": 1.8902785673678226e-05, "loss": 0.8188, "step": 5280 }, { "epoch": 7.355153203342619, "grad_norm": 2.140166997909546, "learning_rate": 1.889283683911313e-05, "loss": 0.5261, "step": 5281 }, { "epoch": 7.3565459610027855, "grad_norm": 2.9048073291778564, "learning_rate": 1.8882888004548037e-05, "loss": 0.8111, "step": 5282 }, { "epoch": 7.357938718662953, "grad_norm": 2.34552001953125, "learning_rate": 1.887293916998294e-05, "loss": 0.7812, "step": 5283 }, { "epoch": 7.35933147632312, "grad_norm": 2.349907159805298, "learning_rate": 1.886299033541785e-05, "loss": 0.6517, "step": 5284 }, { "epoch": 7.360724233983287, "grad_norm": 3.0905449390411377, "learning_rate": 1.8853041500852756e-05, "loss": 0.9429, "step": 5285 }, { "epoch": 7.362116991643454, "grad_norm": 2.25343918800354, "learning_rate": 1.884309266628766e-05, "loss": 0.6692, "step": 5286 }, { "epoch": 7.3635097493036215, "grad_norm": 2.5146708488464355, "learning_rate": 1.8833143831722568e-05, "loss": 0.9155, "step": 5287 }, { "epoch": 7.364902506963788, "grad_norm": 2.932051420211792, "learning_rate": 1.8823194997157472e-05, "loss": 0.7189, "step": 5288 }, { "epoch": 7.366295264623956, "grad_norm": 2.626209020614624, "learning_rate": 1.881324616259238e-05, "loss": 0.7896, "step": 5289 }, { "epoch": 7.367688022284122, "grad_norm": 2.668407440185547, "learning_rate": 1.8803297328027287e-05, "loss": 0.8976, "step": 5290 }, { "epoch": 7.36908077994429, "grad_norm": 2.6616017818450928, "learning_rate": 1.879334849346219e-05, "loss": 0.8108, "step": 5291 }, { "epoch": 7.370473537604457, "grad_norm": 2.352094888687134, "learning_rate": 1.87833996588971e-05, "loss": 0.7182, "step": 5292 }, { "epoch": 7.371866295264624, "grad_norm": 4.22088098526001, "learning_rate": 1.8773450824332002e-05, "loss": 1.0058, "step": 5293 }, { "epoch": 7.373259052924791, "grad_norm": 2.9428882598876953, "learning_rate": 1.876350198976691e-05, "loss": 0.8682, "step": 5294 }, { "epoch": 7.374651810584958, "grad_norm": 2.534942865371704, "learning_rate": 1.875355315520182e-05, "loss": 0.7611, "step": 5295 }, { "epoch": 7.376044568245125, "grad_norm": 2.342184543609619, "learning_rate": 1.874360432063672e-05, "loss": 0.695, "step": 5296 }, { "epoch": 7.3774373259052926, "grad_norm": 2.9026925563812256, "learning_rate": 1.8733655486071632e-05, "loss": 0.7989, "step": 5297 }, { "epoch": 7.378830083565459, "grad_norm": 2.5647268295288086, "learning_rate": 1.8723706651506536e-05, "loss": 0.7226, "step": 5298 }, { "epoch": 7.380222841225627, "grad_norm": 2.5180187225341797, "learning_rate": 1.8713757816941444e-05, "loss": 0.7938, "step": 5299 }, { "epoch": 7.381615598885794, "grad_norm": 2.7397608757019043, "learning_rate": 1.870380898237635e-05, "loss": 0.6367, "step": 5300 }, { "epoch": 7.383008356545961, "grad_norm": 2.5732569694519043, "learning_rate": 1.8693860147811255e-05, "loss": 0.7301, "step": 5301 }, { "epoch": 7.3844011142061285, "grad_norm": 2.7441160678863525, "learning_rate": 1.8683911313246163e-05, "loss": 0.8464, "step": 5302 }, { "epoch": 7.385793871866295, "grad_norm": 3.1986751556396484, "learning_rate": 1.8673962478681067e-05, "loss": 0.9428, "step": 5303 }, { "epoch": 7.387186629526463, "grad_norm": 2.3839962482452393, "learning_rate": 1.8664013644115974e-05, "loss": 0.6604, "step": 5304 }, { "epoch": 7.388579387186629, "grad_norm": 2.7603166103363037, "learning_rate": 1.865406480955088e-05, "loss": 0.9198, "step": 5305 }, { "epoch": 7.389972144846797, "grad_norm": 2.3966386318206787, "learning_rate": 1.8644115974985786e-05, "loss": 0.7653, "step": 5306 }, { "epoch": 7.391364902506964, "grad_norm": 2.5830929279327393, "learning_rate": 1.8634167140420693e-05, "loss": 0.8256, "step": 5307 }, { "epoch": 7.392757660167131, "grad_norm": 3.0383450984954834, "learning_rate": 1.8624218305855597e-05, "loss": 0.9586, "step": 5308 }, { "epoch": 7.394150417827298, "grad_norm": 2.53407883644104, "learning_rate": 1.8614269471290505e-05, "loss": 0.675, "step": 5309 }, { "epoch": 7.395543175487465, "grad_norm": 2.7543201446533203, "learning_rate": 1.860432063672541e-05, "loss": 0.9026, "step": 5310 }, { "epoch": 7.396935933147632, "grad_norm": 2.485504627227783, "learning_rate": 1.8594371802160316e-05, "loss": 0.8323, "step": 5311 }, { "epoch": 7.3983286908078, "grad_norm": 2.7948977947235107, "learning_rate": 1.8584422967595224e-05, "loss": 0.8652, "step": 5312 }, { "epoch": 7.399721448467966, "grad_norm": 2.5813651084899902, "learning_rate": 1.8574474133030128e-05, "loss": 0.8066, "step": 5313 }, { "epoch": 7.401114206128134, "grad_norm": 2.6509640216827393, "learning_rate": 1.8564525298465035e-05, "loss": 0.7535, "step": 5314 }, { "epoch": 7.4025069637883005, "grad_norm": 2.642306089401245, "learning_rate": 1.855457646389994e-05, "loss": 0.8696, "step": 5315 }, { "epoch": 7.403899721448468, "grad_norm": 2.6423439979553223, "learning_rate": 1.8544627629334847e-05, "loss": 0.6899, "step": 5316 }, { "epoch": 7.405292479108635, "grad_norm": 2.9575910568237305, "learning_rate": 1.8534678794769754e-05, "loss": 1.0177, "step": 5317 }, { "epoch": 7.406685236768802, "grad_norm": 2.517331123352051, "learning_rate": 1.852472996020466e-05, "loss": 0.7426, "step": 5318 }, { "epoch": 7.408077994428969, "grad_norm": 2.621168851852417, "learning_rate": 1.8514781125639566e-05, "loss": 0.7847, "step": 5319 }, { "epoch": 7.4094707520891365, "grad_norm": 2.5754759311676025, "learning_rate": 1.850483229107447e-05, "loss": 0.7888, "step": 5320 }, { "epoch": 7.410863509749303, "grad_norm": 3.159336805343628, "learning_rate": 1.8494883456509378e-05, "loss": 0.7203, "step": 5321 }, { "epoch": 7.412256267409471, "grad_norm": 2.5982353687286377, "learning_rate": 1.848493462194429e-05, "loss": 0.7717, "step": 5322 }, { "epoch": 7.413649025069638, "grad_norm": 2.3548851013183594, "learning_rate": 1.847498578737919e-05, "loss": 0.6459, "step": 5323 }, { "epoch": 7.415041782729805, "grad_norm": 2.8021745681762695, "learning_rate": 1.84650369528141e-05, "loss": 0.906, "step": 5324 }, { "epoch": 7.4164345403899725, "grad_norm": 3.013658285140991, "learning_rate": 1.8455088118249004e-05, "loss": 0.8091, "step": 5325 }, { "epoch": 7.417827298050139, "grad_norm": 2.7148406505584717, "learning_rate": 1.844513928368391e-05, "loss": 0.7998, "step": 5326 }, { "epoch": 7.419220055710307, "grad_norm": 2.475613594055176, "learning_rate": 1.8435190449118816e-05, "loss": 0.7506, "step": 5327 }, { "epoch": 7.420612813370473, "grad_norm": 2.788057804107666, "learning_rate": 1.8425241614553723e-05, "loss": 0.7888, "step": 5328 }, { "epoch": 7.422005571030641, "grad_norm": 2.3782296180725098, "learning_rate": 1.841529277998863e-05, "loss": 0.6336, "step": 5329 }, { "epoch": 7.423398328690808, "grad_norm": 2.264065980911255, "learning_rate": 1.8405343945423535e-05, "loss": 0.6319, "step": 5330 }, { "epoch": 7.424791086350975, "grad_norm": 2.715252637863159, "learning_rate": 1.8395395110858442e-05, "loss": 0.8039, "step": 5331 }, { "epoch": 7.426183844011142, "grad_norm": 2.309450626373291, "learning_rate": 1.8385446276293346e-05, "loss": 0.6296, "step": 5332 }, { "epoch": 7.427576601671309, "grad_norm": 2.5203335285186768, "learning_rate": 1.8375497441728254e-05, "loss": 0.8469, "step": 5333 }, { "epoch": 7.428969359331476, "grad_norm": 4.202627182006836, "learning_rate": 1.836554860716316e-05, "loss": 0.819, "step": 5334 }, { "epoch": 7.430362116991644, "grad_norm": 2.361088514328003, "learning_rate": 1.8355599772598065e-05, "loss": 0.7707, "step": 5335 }, { "epoch": 7.43175487465181, "grad_norm": 2.1766088008880615, "learning_rate": 1.8345650938032973e-05, "loss": 0.6533, "step": 5336 }, { "epoch": 7.433147632311978, "grad_norm": 3.1537468433380127, "learning_rate": 1.8335702103467877e-05, "loss": 0.9132, "step": 5337 }, { "epoch": 7.4345403899721445, "grad_norm": 2.5507802963256836, "learning_rate": 1.8325753268902784e-05, "loss": 0.8233, "step": 5338 }, { "epoch": 7.435933147632312, "grad_norm": 2.8491318225860596, "learning_rate": 1.831580443433769e-05, "loss": 0.7669, "step": 5339 }, { "epoch": 7.437325905292479, "grad_norm": 2.901625394821167, "learning_rate": 1.8305855599772596e-05, "loss": 0.8406, "step": 5340 }, { "epoch": 7.438718662952646, "grad_norm": 3.1791460514068604, "learning_rate": 1.8295906765207503e-05, "loss": 0.7758, "step": 5341 }, { "epoch": 7.440111420612814, "grad_norm": 3.0664150714874268, "learning_rate": 1.8285957930642407e-05, "loss": 0.842, "step": 5342 }, { "epoch": 7.4415041782729805, "grad_norm": 3.25173020362854, "learning_rate": 1.8276009096077315e-05, "loss": 0.9466, "step": 5343 }, { "epoch": 7.442896935933147, "grad_norm": 2.729762554168701, "learning_rate": 1.8266060261512222e-05, "loss": 0.846, "step": 5344 }, { "epoch": 7.444289693593315, "grad_norm": 2.436539888381958, "learning_rate": 1.8256111426947126e-05, "loss": 0.6487, "step": 5345 }, { "epoch": 7.445682451253482, "grad_norm": 2.9036285877227783, "learning_rate": 1.8246162592382034e-05, "loss": 0.8984, "step": 5346 }, { "epoch": 7.447075208913649, "grad_norm": 2.7688663005828857, "learning_rate": 1.8236213757816938e-05, "loss": 0.7661, "step": 5347 }, { "epoch": 7.4484679665738165, "grad_norm": 2.9770824909210205, "learning_rate": 1.8226264923251845e-05, "loss": 1.0991, "step": 5348 }, { "epoch": 7.449860724233983, "grad_norm": 2.889711856842041, "learning_rate": 1.821631608868675e-05, "loss": 1.0009, "step": 5349 }, { "epoch": 7.451253481894151, "grad_norm": 2.694047212600708, "learning_rate": 1.8206367254121657e-05, "loss": 0.6695, "step": 5350 }, { "epoch": 7.452646239554317, "grad_norm": 2.5336525440216064, "learning_rate": 1.8196418419556568e-05, "loss": 0.8706, "step": 5351 }, { "epoch": 7.454038997214485, "grad_norm": 3.0518062114715576, "learning_rate": 1.8186469584991468e-05, "loss": 0.9837, "step": 5352 }, { "epoch": 7.455431754874652, "grad_norm": 2.8290791511535645, "learning_rate": 1.817652075042638e-05, "loss": 0.7497, "step": 5353 }, { "epoch": 7.456824512534819, "grad_norm": 2.8035120964050293, "learning_rate": 1.8166571915861283e-05, "loss": 0.8461, "step": 5354 }, { "epoch": 7.458217270194986, "grad_norm": 2.4490153789520264, "learning_rate": 1.815662308129619e-05, "loss": 0.7488, "step": 5355 }, { "epoch": 7.459610027855153, "grad_norm": 2.6081271171569824, "learning_rate": 1.8146674246731098e-05, "loss": 0.7607, "step": 5356 }, { "epoch": 7.46100278551532, "grad_norm": 2.4156997203826904, "learning_rate": 1.8136725412166002e-05, "loss": 0.7671, "step": 5357 }, { "epoch": 7.462395543175488, "grad_norm": 2.8010993003845215, "learning_rate": 1.812677657760091e-05, "loss": 0.7032, "step": 5358 }, { "epoch": 7.463788300835654, "grad_norm": 2.6486268043518066, "learning_rate": 1.8116827743035814e-05, "loss": 0.714, "step": 5359 }, { "epoch": 7.465181058495822, "grad_norm": 2.284186840057373, "learning_rate": 1.810687890847072e-05, "loss": 0.6935, "step": 5360 }, { "epoch": 7.4665738161559885, "grad_norm": 7.3192853927612305, "learning_rate": 1.809693007390563e-05, "loss": 0.631, "step": 5361 }, { "epoch": 7.467966573816156, "grad_norm": 3.7031171321868896, "learning_rate": 1.8086981239340533e-05, "loss": 0.9192, "step": 5362 }, { "epoch": 7.469359331476323, "grad_norm": 2.848693370819092, "learning_rate": 1.807703240477544e-05, "loss": 0.9586, "step": 5363 }, { "epoch": 7.47075208913649, "grad_norm": 2.508500814437866, "learning_rate": 1.8067083570210344e-05, "loss": 0.7331, "step": 5364 }, { "epoch": 7.472144846796658, "grad_norm": 2.4824063777923584, "learning_rate": 1.805713473564525e-05, "loss": 0.7332, "step": 5365 }, { "epoch": 7.4735376044568245, "grad_norm": 2.697272300720215, "learning_rate": 1.804718590108016e-05, "loss": 1.0376, "step": 5366 }, { "epoch": 7.474930362116992, "grad_norm": 2.364535331726074, "learning_rate": 1.8037237066515063e-05, "loss": 0.6744, "step": 5367 }, { "epoch": 7.476323119777159, "grad_norm": 2.8499128818511963, "learning_rate": 1.802728823194997e-05, "loss": 0.9371, "step": 5368 }, { "epoch": 7.477715877437326, "grad_norm": 2.528367519378662, "learning_rate": 1.8017339397384875e-05, "loss": 0.8048, "step": 5369 }, { "epoch": 7.479108635097493, "grad_norm": 2.3179337978363037, "learning_rate": 1.8007390562819782e-05, "loss": 0.6478, "step": 5370 }, { "epoch": 7.4805013927576605, "grad_norm": 2.8684136867523193, "learning_rate": 1.7997441728254686e-05, "loss": 1.0433, "step": 5371 }, { "epoch": 7.481894150417827, "grad_norm": 2.4741365909576416, "learning_rate": 1.7987492893689594e-05, "loss": 0.7669, "step": 5372 }, { "epoch": 7.483286908077995, "grad_norm": 3.0352306365966797, "learning_rate": 1.79775440591245e-05, "loss": 1.1622, "step": 5373 }, { "epoch": 7.484679665738161, "grad_norm": 2.81583571434021, "learning_rate": 1.7967595224559405e-05, "loss": 0.9287, "step": 5374 }, { "epoch": 7.486072423398329, "grad_norm": 2.670285701751709, "learning_rate": 1.7957646389994313e-05, "loss": 0.8969, "step": 5375 }, { "epoch": 7.487465181058496, "grad_norm": 2.5652999877929688, "learning_rate": 1.7947697555429217e-05, "loss": 0.955, "step": 5376 }, { "epoch": 7.488857938718663, "grad_norm": 2.609499216079712, "learning_rate": 1.7937748720864124e-05, "loss": 0.9204, "step": 5377 }, { "epoch": 7.49025069637883, "grad_norm": 2.5414206981658936, "learning_rate": 1.7927799886299035e-05, "loss": 0.6906, "step": 5378 }, { "epoch": 7.491643454038997, "grad_norm": 2.760133743286133, "learning_rate": 1.7917851051733936e-05, "loss": 0.8683, "step": 5379 }, { "epoch": 7.493036211699164, "grad_norm": 2.682169198989868, "learning_rate": 1.7907902217168847e-05, "loss": 0.8578, "step": 5380 }, { "epoch": 7.494428969359332, "grad_norm": 2.098057270050049, "learning_rate": 1.789795338260375e-05, "loss": 0.5941, "step": 5381 }, { "epoch": 7.495821727019498, "grad_norm": 3.0723142623901367, "learning_rate": 1.7888004548038658e-05, "loss": 0.9611, "step": 5382 }, { "epoch": 7.497214484679666, "grad_norm": 2.3826351165771484, "learning_rate": 1.7878055713473566e-05, "loss": 0.6693, "step": 5383 }, { "epoch": 7.498607242339833, "grad_norm": 2.4851551055908203, "learning_rate": 1.786810687890847e-05, "loss": 0.8697, "step": 5384 }, { "epoch": 7.5, "grad_norm": 2.494851589202881, "learning_rate": 1.7858158044343377e-05, "loss": 0.7803, "step": 5385 }, { "epoch": 7.501392757660167, "grad_norm": 2.3112099170684814, "learning_rate": 1.784820920977828e-05, "loss": 0.8052, "step": 5386 }, { "epoch": 7.502785515320334, "grad_norm": 2.399679660797119, "learning_rate": 1.783826037521319e-05, "loss": 0.73, "step": 5387 }, { "epoch": 7.504178272980502, "grad_norm": 2.62286639213562, "learning_rate": 1.7828311540648096e-05, "loss": 0.87, "step": 5388 }, { "epoch": 7.505571030640668, "grad_norm": 3.6493852138519287, "learning_rate": 1.7818362706083e-05, "loss": 0.6422, "step": 5389 }, { "epoch": 7.506963788300836, "grad_norm": 2.447942018508911, "learning_rate": 1.7808413871517908e-05, "loss": 0.7403, "step": 5390 }, { "epoch": 7.508356545961003, "grad_norm": 2.873213768005371, "learning_rate": 1.7798465036952812e-05, "loss": 0.7787, "step": 5391 }, { "epoch": 7.50974930362117, "grad_norm": 2.470106840133667, "learning_rate": 1.778851620238772e-05, "loss": 0.8113, "step": 5392 }, { "epoch": 7.511142061281337, "grad_norm": 2.59989070892334, "learning_rate": 1.7778567367822623e-05, "loss": 0.7656, "step": 5393 }, { "epoch": 7.512534818941504, "grad_norm": 2.6054182052612305, "learning_rate": 1.776861853325753e-05, "loss": 0.6932, "step": 5394 }, { "epoch": 7.513927576601671, "grad_norm": 2.237776041030884, "learning_rate": 1.775866969869244e-05, "loss": 0.6018, "step": 5395 }, { "epoch": 7.515320334261839, "grad_norm": 2.5836374759674072, "learning_rate": 1.7748720864127342e-05, "loss": 0.7404, "step": 5396 }, { "epoch": 7.516713091922005, "grad_norm": 2.885637044906616, "learning_rate": 1.773877202956225e-05, "loss": 0.7564, "step": 5397 }, { "epoch": 7.518105849582173, "grad_norm": 2.3777401447296143, "learning_rate": 1.7728823194997154e-05, "loss": 0.8696, "step": 5398 }, { "epoch": 7.5194986072423395, "grad_norm": 2.781888484954834, "learning_rate": 1.771887436043206e-05, "loss": 0.9645, "step": 5399 }, { "epoch": 7.520891364902507, "grad_norm": 2.661517381668091, "learning_rate": 1.770892552586697e-05, "loss": 0.8297, "step": 5400 }, { "epoch": 7.522284122562674, "grad_norm": 2.6107850074768066, "learning_rate": 1.7698976691301873e-05, "loss": 0.8516, "step": 5401 }, { "epoch": 7.523676880222841, "grad_norm": 2.5141971111297607, "learning_rate": 1.768902785673678e-05, "loss": 0.8158, "step": 5402 }, { "epoch": 7.525069637883008, "grad_norm": 3.4384422302246094, "learning_rate": 1.7679079022171684e-05, "loss": 0.8487, "step": 5403 }, { "epoch": 7.5264623955431755, "grad_norm": 2.5683579444885254, "learning_rate": 1.7669130187606592e-05, "loss": 0.8595, "step": 5404 }, { "epoch": 7.527855153203342, "grad_norm": 2.845391035079956, "learning_rate": 1.76591813530415e-05, "loss": 0.8601, "step": 5405 }, { "epoch": 7.52924791086351, "grad_norm": 2.3391435146331787, "learning_rate": 1.7649232518476403e-05, "loss": 0.6663, "step": 5406 }, { "epoch": 7.530640668523677, "grad_norm": 2.5470023155212402, "learning_rate": 1.7639283683911314e-05, "loss": 0.824, "step": 5407 }, { "epoch": 7.532033426183844, "grad_norm": 2.9565205574035645, "learning_rate": 1.7629334849346215e-05, "loss": 0.9813, "step": 5408 }, { "epoch": 7.5334261838440115, "grad_norm": 2.5947036743164062, "learning_rate": 1.7619386014781126e-05, "loss": 0.8166, "step": 5409 }, { "epoch": 7.534818941504178, "grad_norm": 2.6413090229034424, "learning_rate": 1.7609437180216033e-05, "loss": 0.6797, "step": 5410 }, { "epoch": 7.536211699164346, "grad_norm": 2.1500332355499268, "learning_rate": 1.7599488345650937e-05, "loss": 0.555, "step": 5411 }, { "epoch": 7.537604456824512, "grad_norm": 2.5348427295684814, "learning_rate": 1.7589539511085845e-05, "loss": 0.7153, "step": 5412 }, { "epoch": 7.53899721448468, "grad_norm": 2.384018898010254, "learning_rate": 1.757959067652075e-05, "loss": 0.7399, "step": 5413 }, { "epoch": 7.540389972144847, "grad_norm": 2.929490566253662, "learning_rate": 1.7569641841955656e-05, "loss": 0.8884, "step": 5414 }, { "epoch": 7.541782729805014, "grad_norm": 2.9986398220062256, "learning_rate": 1.755969300739056e-05, "loss": 0.6664, "step": 5415 }, { "epoch": 7.543175487465181, "grad_norm": 2.725666046142578, "learning_rate": 1.7549744172825468e-05, "loss": 0.8639, "step": 5416 }, { "epoch": 7.544568245125348, "grad_norm": 2.7333314418792725, "learning_rate": 1.7539795338260375e-05, "loss": 0.8251, "step": 5417 }, { "epoch": 7.545961002785515, "grad_norm": 2.9461610317230225, "learning_rate": 1.752984650369528e-05, "loss": 0.8615, "step": 5418 }, { "epoch": 7.547353760445683, "grad_norm": 2.54337477684021, "learning_rate": 1.7519897669130187e-05, "loss": 0.8794, "step": 5419 }, { "epoch": 7.548746518105849, "grad_norm": 6.415521621704102, "learning_rate": 1.750994883456509e-05, "loss": 0.7321, "step": 5420 }, { "epoch": 7.550139275766017, "grad_norm": 1.941736102104187, "learning_rate": 1.75e-05, "loss": 0.5779, "step": 5421 }, { "epoch": 7.5515320334261835, "grad_norm": 2.8460235595703125, "learning_rate": 1.7490051165434906e-05, "loss": 0.8093, "step": 5422 }, { "epoch": 7.552924791086351, "grad_norm": 2.7201473712921143, "learning_rate": 1.748010233086981e-05, "loss": 0.8864, "step": 5423 }, { "epoch": 7.554317548746518, "grad_norm": 3.1468052864074707, "learning_rate": 1.7470153496304717e-05, "loss": 0.6753, "step": 5424 }, { "epoch": 7.555710306406685, "grad_norm": 2.8339996337890625, "learning_rate": 1.7460204661739625e-05, "loss": 0.8595, "step": 5425 }, { "epoch": 7.557103064066853, "grad_norm": 2.769394874572754, "learning_rate": 1.745025582717453e-05, "loss": 0.9072, "step": 5426 }, { "epoch": 7.5584958217270195, "grad_norm": 2.3809988498687744, "learning_rate": 1.7440306992609436e-05, "loss": 0.6453, "step": 5427 }, { "epoch": 7.559888579387186, "grad_norm": 2.7767210006713867, "learning_rate": 1.743035815804434e-05, "loss": 0.823, "step": 5428 }, { "epoch": 7.561281337047354, "grad_norm": 2.5831570625305176, "learning_rate": 1.7420409323479248e-05, "loss": 0.7418, "step": 5429 }, { "epoch": 7.562674094707521, "grad_norm": 3.282571315765381, "learning_rate": 1.7410460488914155e-05, "loss": 1.0822, "step": 5430 }, { "epoch": 7.564066852367688, "grad_norm": 2.7830049991607666, "learning_rate": 1.740051165434906e-05, "loss": 0.9058, "step": 5431 }, { "epoch": 7.5654596100278555, "grad_norm": 2.606900691986084, "learning_rate": 1.7390562819783967e-05, "loss": 0.7775, "step": 5432 }, { "epoch": 7.566852367688022, "grad_norm": 2.7287631034851074, "learning_rate": 1.738061398521887e-05, "loss": 0.9519, "step": 5433 }, { "epoch": 7.56824512534819, "grad_norm": 2.7785258293151855, "learning_rate": 1.737066515065378e-05, "loss": 0.8458, "step": 5434 }, { "epoch": 7.569637883008356, "grad_norm": 2.9038608074188232, "learning_rate": 1.7360716316088686e-05, "loss": 1.0261, "step": 5435 }, { "epoch": 7.571030640668524, "grad_norm": 2.3288161754608154, "learning_rate": 1.7350767481523593e-05, "loss": 0.643, "step": 5436 }, { "epoch": 7.572423398328691, "grad_norm": 2.881404161453247, "learning_rate": 1.7340818646958498e-05, "loss": 0.8509, "step": 5437 }, { "epoch": 7.573816155988858, "grad_norm": 2.6631224155426025, "learning_rate": 1.7330869812393405e-05, "loss": 0.5832, "step": 5438 }, { "epoch": 7.575208913649025, "grad_norm": 2.52154278755188, "learning_rate": 1.732092097782831e-05, "loss": 0.8464, "step": 5439 }, { "epoch": 7.576601671309192, "grad_norm": 2.851818561553955, "learning_rate": 1.7310972143263217e-05, "loss": 0.7793, "step": 5440 }, { "epoch": 7.577994428969359, "grad_norm": 2.672607421875, "learning_rate": 1.7301023308698124e-05, "loss": 0.7822, "step": 5441 }, { "epoch": 7.579387186629527, "grad_norm": 2.48591685295105, "learning_rate": 1.7291074474133028e-05, "loss": 0.7613, "step": 5442 }, { "epoch": 7.580779944289693, "grad_norm": 2.7468457221984863, "learning_rate": 1.7281125639567936e-05, "loss": 0.8192, "step": 5443 }, { "epoch": 7.582172701949861, "grad_norm": 2.4549994468688965, "learning_rate": 1.727117680500284e-05, "loss": 0.6692, "step": 5444 }, { "epoch": 7.5835654596100275, "grad_norm": 3.481426477432251, "learning_rate": 1.7261227970437747e-05, "loss": 0.8286, "step": 5445 }, { "epoch": 7.584958217270195, "grad_norm": 2.9519455432891846, "learning_rate": 1.7251279135872655e-05, "loss": 0.9889, "step": 5446 }, { "epoch": 7.586350974930362, "grad_norm": 4.644660949707031, "learning_rate": 1.724133030130756e-05, "loss": 0.961, "step": 5447 }, { "epoch": 7.587743732590529, "grad_norm": 2.657379388809204, "learning_rate": 1.7231381466742466e-05, "loss": 0.7251, "step": 5448 }, { "epoch": 7.589136490250697, "grad_norm": 2.662083148956299, "learning_rate": 1.722143263217737e-05, "loss": 0.7343, "step": 5449 }, { "epoch": 7.5905292479108635, "grad_norm": 2.9999523162841797, "learning_rate": 1.7211483797612278e-05, "loss": 0.9716, "step": 5450 }, { "epoch": 7.591922005571031, "grad_norm": 2.435251474380493, "learning_rate": 1.7201534963047185e-05, "loss": 0.6349, "step": 5451 }, { "epoch": 7.593314763231198, "grad_norm": 2.6481752395629883, "learning_rate": 1.7191586128482093e-05, "loss": 0.7385, "step": 5452 }, { "epoch": 7.594707520891365, "grad_norm": 2.7415566444396973, "learning_rate": 1.7181637293916997e-05, "loss": 1.0074, "step": 5453 }, { "epoch": 7.596100278551532, "grad_norm": 2.5755679607391357, "learning_rate": 1.7171688459351904e-05, "loss": 0.7337, "step": 5454 }, { "epoch": 7.5974930362116995, "grad_norm": 2.529637575149536, "learning_rate": 1.7161739624786808e-05, "loss": 0.7974, "step": 5455 }, { "epoch": 7.598885793871866, "grad_norm": 2.3905506134033203, "learning_rate": 1.7151790790221716e-05, "loss": 0.7598, "step": 5456 }, { "epoch": 7.600278551532034, "grad_norm": 2.273625612258911, "learning_rate": 1.7141841955656623e-05, "loss": 0.6601, "step": 5457 }, { "epoch": 7.6016713091922, "grad_norm": 2.8713133335113525, "learning_rate": 1.7131893121091527e-05, "loss": 1.0206, "step": 5458 }, { "epoch": 7.603064066852368, "grad_norm": 2.36533784866333, "learning_rate": 1.7121944286526435e-05, "loss": 0.6262, "step": 5459 }, { "epoch": 7.604456824512535, "grad_norm": 2.8157126903533936, "learning_rate": 1.711199545196134e-05, "loss": 0.7364, "step": 5460 }, { "epoch": 7.605849582172702, "grad_norm": 2.8216311931610107, "learning_rate": 1.7102046617396246e-05, "loss": 0.7561, "step": 5461 }, { "epoch": 7.607242339832869, "grad_norm": 2.4692447185516357, "learning_rate": 1.709209778283115e-05, "loss": 0.8601, "step": 5462 }, { "epoch": 7.608635097493036, "grad_norm": 2.7294795513153076, "learning_rate": 1.708214894826606e-05, "loss": 0.735, "step": 5463 }, { "epoch": 7.610027855153203, "grad_norm": 2.7090420722961426, "learning_rate": 1.7072200113700965e-05, "loss": 0.7506, "step": 5464 }, { "epoch": 7.611420612813371, "grad_norm": 2.555973768234253, "learning_rate": 1.7062251279135873e-05, "loss": 0.6659, "step": 5465 }, { "epoch": 7.612813370473537, "grad_norm": 2.3970787525177, "learning_rate": 1.7052302444570777e-05, "loss": 0.6477, "step": 5466 }, { "epoch": 7.614206128133705, "grad_norm": 2.8345634937286377, "learning_rate": 1.7042353610005684e-05, "loss": 0.9087, "step": 5467 }, { "epoch": 7.615598885793872, "grad_norm": 2.9403674602508545, "learning_rate": 1.703240477544059e-05, "loss": 0.8148, "step": 5468 }, { "epoch": 7.616991643454039, "grad_norm": 3.1836977005004883, "learning_rate": 1.7022455940875496e-05, "loss": 0.7056, "step": 5469 }, { "epoch": 7.618384401114206, "grad_norm": 2.4465065002441406, "learning_rate": 1.7012507106310403e-05, "loss": 0.728, "step": 5470 }, { "epoch": 7.619777158774373, "grad_norm": 2.708162307739258, "learning_rate": 1.7002558271745307e-05, "loss": 0.8091, "step": 5471 }, { "epoch": 7.621169916434541, "grad_norm": 2.887277603149414, "learning_rate": 1.6992609437180215e-05, "loss": 0.9227, "step": 5472 }, { "epoch": 7.6225626740947074, "grad_norm": 2.8316197395324707, "learning_rate": 1.698266060261512e-05, "loss": 0.9379, "step": 5473 }, { "epoch": 7.623955431754875, "grad_norm": 3.01243257522583, "learning_rate": 1.6972711768050026e-05, "loss": 1.0515, "step": 5474 }, { "epoch": 7.625348189415042, "grad_norm": 2.652139186859131, "learning_rate": 1.6962762933484934e-05, "loss": 0.7844, "step": 5475 }, { "epoch": 7.626740947075209, "grad_norm": 1.931072473526001, "learning_rate": 1.6952814098919838e-05, "loss": 0.5464, "step": 5476 }, { "epoch": 7.628133704735376, "grad_norm": 2.547229528427124, "learning_rate": 1.6942865264354745e-05, "loss": 0.5843, "step": 5477 }, { "epoch": 7.629526462395543, "grad_norm": 2.6638574600219727, "learning_rate": 1.6932916429789653e-05, "loss": 0.9491, "step": 5478 }, { "epoch": 7.63091922005571, "grad_norm": 2.8186087608337402, "learning_rate": 1.692296759522456e-05, "loss": 0.9055, "step": 5479 }, { "epoch": 7.632311977715878, "grad_norm": 2.598853349685669, "learning_rate": 1.6913018760659464e-05, "loss": 0.6766, "step": 5480 }, { "epoch": 7.633704735376044, "grad_norm": 2.5343191623687744, "learning_rate": 1.6903069926094372e-05, "loss": 0.6529, "step": 5481 }, { "epoch": 7.635097493036212, "grad_norm": 2.8590381145477295, "learning_rate": 1.6893121091529276e-05, "loss": 0.98, "step": 5482 }, { "epoch": 7.6364902506963785, "grad_norm": 2.413297653198242, "learning_rate": 1.6883172256964183e-05, "loss": 0.8172, "step": 5483 }, { "epoch": 7.637883008356546, "grad_norm": 2.6304757595062256, "learning_rate": 1.6873223422399087e-05, "loss": 0.8413, "step": 5484 }, { "epoch": 7.639275766016713, "grad_norm": 2.8911056518554688, "learning_rate": 1.6863274587833995e-05, "loss": 0.9724, "step": 5485 }, { "epoch": 7.64066852367688, "grad_norm": 3.1047544479370117, "learning_rate": 1.6853325753268902e-05, "loss": 0.8266, "step": 5486 }, { "epoch": 7.642061281337047, "grad_norm": 2.703200340270996, "learning_rate": 1.6843376918703806e-05, "loss": 0.9002, "step": 5487 }, { "epoch": 7.6434540389972145, "grad_norm": 2.464510440826416, "learning_rate": 1.6833428084138714e-05, "loss": 0.6793, "step": 5488 }, { "epoch": 7.644846796657381, "grad_norm": 3.0875465869903564, "learning_rate": 1.6823479249573618e-05, "loss": 0.8917, "step": 5489 }, { "epoch": 7.646239554317549, "grad_norm": 1.9251242876052856, "learning_rate": 1.681353041500853e-05, "loss": 0.4798, "step": 5490 }, { "epoch": 7.647632311977716, "grad_norm": 2.643867254257202, "learning_rate": 1.6803581580443433e-05, "loss": 0.8798, "step": 5491 }, { "epoch": 7.649025069637883, "grad_norm": 2.3868167400360107, "learning_rate": 1.679363274587834e-05, "loss": 0.8146, "step": 5492 }, { "epoch": 7.65041782729805, "grad_norm": 2.5355112552642822, "learning_rate": 1.6783683911313244e-05, "loss": 0.7204, "step": 5493 }, { "epoch": 7.651810584958217, "grad_norm": 3.13287091255188, "learning_rate": 1.6773735076748152e-05, "loss": 1.0552, "step": 5494 }, { "epoch": 7.653203342618385, "grad_norm": 2.4425160884857178, "learning_rate": 1.6763786242183056e-05, "loss": 0.8627, "step": 5495 }, { "epoch": 7.654596100278551, "grad_norm": 2.3554141521453857, "learning_rate": 1.6753837407617963e-05, "loss": 0.6306, "step": 5496 }, { "epoch": 7.655988857938719, "grad_norm": 2.2212038040161133, "learning_rate": 1.674388857305287e-05, "loss": 0.6204, "step": 5497 }, { "epoch": 7.657381615598886, "grad_norm": 2.7523720264434814, "learning_rate": 1.6733939738487775e-05, "loss": 0.796, "step": 5498 }, { "epoch": 7.658774373259053, "grad_norm": 2.4031729698181152, "learning_rate": 1.6723990903922682e-05, "loss": 0.8283, "step": 5499 }, { "epoch": 7.66016713091922, "grad_norm": 2.490022659301758, "learning_rate": 1.6714042069357586e-05, "loss": 0.6693, "step": 5500 }, { "epoch": 7.661559888579387, "grad_norm": 2.371617078781128, "learning_rate": 1.6704093234792494e-05, "loss": 0.6041, "step": 5501 }, { "epoch": 7.662952646239554, "grad_norm": 2.6778881549835205, "learning_rate": 1.66941444002274e-05, "loss": 0.6822, "step": 5502 }, { "epoch": 7.664345403899722, "grad_norm": 2.798565149307251, "learning_rate": 1.6684195565662305e-05, "loss": 0.8008, "step": 5503 }, { "epoch": 7.665738161559888, "grad_norm": 2.421142101287842, "learning_rate": 1.6674246731097213e-05, "loss": 0.7211, "step": 5504 }, { "epoch": 7.667130919220056, "grad_norm": 2.256908416748047, "learning_rate": 1.6664297896532117e-05, "loss": 0.5448, "step": 5505 }, { "epoch": 7.6685236768802225, "grad_norm": 2.430297374725342, "learning_rate": 1.6654349061967024e-05, "loss": 0.674, "step": 5506 }, { "epoch": 7.66991643454039, "grad_norm": 2.560138463973999, "learning_rate": 1.6644400227401932e-05, "loss": 0.7353, "step": 5507 }, { "epoch": 7.671309192200557, "grad_norm": 2.754990577697754, "learning_rate": 1.663445139283684e-05, "loss": 0.7594, "step": 5508 }, { "epoch": 7.672701949860724, "grad_norm": 2.3859455585479736, "learning_rate": 1.6624502558271743e-05, "loss": 0.6086, "step": 5509 }, { "epoch": 7.674094707520892, "grad_norm": 2.340557098388672, "learning_rate": 1.661455372370665e-05, "loss": 0.6464, "step": 5510 }, { "epoch": 7.6754874651810585, "grad_norm": 2.4551546573638916, "learning_rate": 1.6604604889141555e-05, "loss": 0.6357, "step": 5511 }, { "epoch": 7.676880222841225, "grad_norm": 2.8065202236175537, "learning_rate": 1.6594656054576462e-05, "loss": 0.8873, "step": 5512 }, { "epoch": 7.678272980501393, "grad_norm": 2.6035358905792236, "learning_rate": 1.658470722001137e-05, "loss": 0.7739, "step": 5513 }, { "epoch": 7.67966573816156, "grad_norm": 2.8600263595581055, "learning_rate": 1.6574758385446274e-05, "loss": 0.9284, "step": 5514 }, { "epoch": 7.681058495821727, "grad_norm": 2.302539587020874, "learning_rate": 1.656480955088118e-05, "loss": 0.7256, "step": 5515 }, { "epoch": 7.6824512534818945, "grad_norm": 3.3458428382873535, "learning_rate": 1.6554860716316085e-05, "loss": 0.8755, "step": 5516 }, { "epoch": 7.683844011142061, "grad_norm": 2.4608819484710693, "learning_rate": 1.6544911881750993e-05, "loss": 0.7326, "step": 5517 }, { "epoch": 7.685236768802229, "grad_norm": 2.4221181869506836, "learning_rate": 1.65349630471859e-05, "loss": 0.6242, "step": 5518 }, { "epoch": 7.686629526462395, "grad_norm": 2.9095842838287354, "learning_rate": 1.6525014212620808e-05, "loss": 0.9393, "step": 5519 }, { "epoch": 7.688022284122563, "grad_norm": 2.33331298828125, "learning_rate": 1.6515065378055712e-05, "loss": 0.6507, "step": 5520 }, { "epoch": 7.68941504178273, "grad_norm": 2.502535820007324, "learning_rate": 1.650511654349062e-05, "loss": 0.7312, "step": 5521 }, { "epoch": 7.690807799442897, "grad_norm": 2.9326870441436768, "learning_rate": 1.6495167708925523e-05, "loss": 1.0834, "step": 5522 }, { "epoch": 7.692200557103064, "grad_norm": 2.883514165878296, "learning_rate": 1.648521887436043e-05, "loss": 0.7977, "step": 5523 }, { "epoch": 7.693593314763231, "grad_norm": 2.428057909011841, "learning_rate": 1.647527003979534e-05, "loss": 0.8281, "step": 5524 }, { "epoch": 7.694986072423398, "grad_norm": 2.342104434967041, "learning_rate": 1.6465321205230242e-05, "loss": 0.6763, "step": 5525 }, { "epoch": 7.696378830083566, "grad_norm": 2.424226999282837, "learning_rate": 1.645537237066515e-05, "loss": 0.6285, "step": 5526 }, { "epoch": 7.697771587743732, "grad_norm": 2.470301389694214, "learning_rate": 1.6445423536100054e-05, "loss": 0.819, "step": 5527 }, { "epoch": 7.6991643454039, "grad_norm": 2.714398145675659, "learning_rate": 1.643547470153496e-05, "loss": 0.8148, "step": 5528 }, { "epoch": 7.7005571030640665, "grad_norm": 2.368927001953125, "learning_rate": 1.642552586696987e-05, "loss": 0.7107, "step": 5529 }, { "epoch": 7.701949860724234, "grad_norm": 2.540170669555664, "learning_rate": 1.6415577032404773e-05, "loss": 0.8445, "step": 5530 }, { "epoch": 7.703342618384401, "grad_norm": 2.7775514125823975, "learning_rate": 1.640562819783968e-05, "loss": 0.7228, "step": 5531 }, { "epoch": 7.704735376044568, "grad_norm": 2.2772164344787598, "learning_rate": 1.6395679363274585e-05, "loss": 0.806, "step": 5532 }, { "epoch": 7.706128133704736, "grad_norm": 2.8098080158233643, "learning_rate": 1.6385730528709492e-05, "loss": 0.8471, "step": 5533 }, { "epoch": 7.7075208913649025, "grad_norm": 2.630612850189209, "learning_rate": 1.63757816941444e-05, "loss": 0.7689, "step": 5534 }, { "epoch": 7.708913649025069, "grad_norm": 2.955442190170288, "learning_rate": 1.6365832859579307e-05, "loss": 0.7642, "step": 5535 }, { "epoch": 7.710306406685237, "grad_norm": 3.060309648513794, "learning_rate": 1.635588402501421e-05, "loss": 0.7033, "step": 5536 }, { "epoch": 7.711699164345404, "grad_norm": 2.650979518890381, "learning_rate": 1.634593519044912e-05, "loss": 0.6853, "step": 5537 }, { "epoch": 7.713091922005571, "grad_norm": 2.5225212574005127, "learning_rate": 1.6335986355884023e-05, "loss": 0.7362, "step": 5538 }, { "epoch": 7.7144846796657385, "grad_norm": 2.7491307258605957, "learning_rate": 1.632603752131893e-05, "loss": 0.8206, "step": 5539 }, { "epoch": 7.715877437325905, "grad_norm": 2.786600351333618, "learning_rate": 1.6316088686753837e-05, "loss": 0.8513, "step": 5540 }, { "epoch": 7.717270194986073, "grad_norm": 2.575843334197998, "learning_rate": 1.630613985218874e-05, "loss": 0.8263, "step": 5541 }, { "epoch": 7.718662952646239, "grad_norm": 2.4344546794891357, "learning_rate": 1.629619101762365e-05, "loss": 0.7748, "step": 5542 }, { "epoch": 7.720055710306407, "grad_norm": 2.481569290161133, "learning_rate": 1.6286242183058553e-05, "loss": 0.6716, "step": 5543 }, { "epoch": 7.721448467966574, "grad_norm": 2.69612979888916, "learning_rate": 1.627629334849346e-05, "loss": 0.9076, "step": 5544 }, { "epoch": 7.722841225626741, "grad_norm": 2.8241164684295654, "learning_rate": 1.6266344513928365e-05, "loss": 0.8286, "step": 5545 }, { "epoch": 7.724233983286908, "grad_norm": 2.8578298091888428, "learning_rate": 1.6256395679363275e-05, "loss": 0.7248, "step": 5546 }, { "epoch": 7.725626740947075, "grad_norm": 2.6153275966644287, "learning_rate": 1.624644684479818e-05, "loss": 0.9127, "step": 5547 }, { "epoch": 7.727019498607242, "grad_norm": 4.352839469909668, "learning_rate": 1.6236498010233087e-05, "loss": 0.7752, "step": 5548 }, { "epoch": 7.72841225626741, "grad_norm": 2.616942882537842, "learning_rate": 1.622654917566799e-05, "loss": 0.7326, "step": 5549 }, { "epoch": 7.729805013927576, "grad_norm": 2.4175174236297607, "learning_rate": 1.62166003411029e-05, "loss": 0.7237, "step": 5550 }, { "epoch": 7.731197771587744, "grad_norm": 3.226278781890869, "learning_rate": 1.6206651506537806e-05, "loss": 0.861, "step": 5551 }, { "epoch": 7.732590529247911, "grad_norm": 3.5635745525360107, "learning_rate": 1.619670267197271e-05, "loss": 0.7789, "step": 5552 }, { "epoch": 7.733983286908078, "grad_norm": 2.7078680992126465, "learning_rate": 1.6186753837407618e-05, "loss": 0.7734, "step": 5553 }, { "epoch": 7.735376044568245, "grad_norm": 2.684532403945923, "learning_rate": 1.617680500284252e-05, "loss": 0.7954, "step": 5554 }, { "epoch": 7.736768802228412, "grad_norm": 2.4593377113342285, "learning_rate": 1.616685616827743e-05, "loss": 0.7353, "step": 5555 }, { "epoch": 7.73816155988858, "grad_norm": 2.7976765632629395, "learning_rate": 1.6156907333712333e-05, "loss": 0.7252, "step": 5556 }, { "epoch": 7.7395543175487465, "grad_norm": 2.5154716968536377, "learning_rate": 1.614695849914724e-05, "loss": 0.7299, "step": 5557 }, { "epoch": 7.740947075208914, "grad_norm": 2.710759162902832, "learning_rate": 1.6137009664582148e-05, "loss": 0.884, "step": 5558 }, { "epoch": 7.742339832869081, "grad_norm": 2.633753538131714, "learning_rate": 1.6127060830017052e-05, "loss": 0.7493, "step": 5559 }, { "epoch": 7.743732590529248, "grad_norm": 2.786937713623047, "learning_rate": 1.611711199545196e-05, "loss": 0.8833, "step": 5560 }, { "epoch": 7.745125348189415, "grad_norm": 2.3996152877807617, "learning_rate": 1.6107163160886864e-05, "loss": 0.5208, "step": 5561 }, { "epoch": 7.7465181058495824, "grad_norm": 2.7960236072540283, "learning_rate": 1.6097214326321775e-05, "loss": 0.8819, "step": 5562 }, { "epoch": 7.747910863509749, "grad_norm": 2.499798536300659, "learning_rate": 1.608726549175668e-05, "loss": 0.6926, "step": 5563 }, { "epoch": 7.749303621169917, "grad_norm": 2.8297054767608643, "learning_rate": 1.6077316657191586e-05, "loss": 0.8726, "step": 5564 }, { "epoch": 7.750696378830083, "grad_norm": 2.852834701538086, "learning_rate": 1.606736782262649e-05, "loss": 0.9254, "step": 5565 }, { "epoch": 7.752089136490251, "grad_norm": 2.609112501144409, "learning_rate": 1.6057418988061398e-05, "loss": 0.6887, "step": 5566 }, { "epoch": 7.7534818941504176, "grad_norm": 2.426393985748291, "learning_rate": 1.6047470153496302e-05, "loss": 0.7099, "step": 5567 }, { "epoch": 7.754874651810585, "grad_norm": 3.0347394943237305, "learning_rate": 1.603752131893121e-05, "loss": 0.834, "step": 5568 }, { "epoch": 7.756267409470752, "grad_norm": 2.8130438327789307, "learning_rate": 1.6027572484366117e-05, "loss": 0.7637, "step": 5569 }, { "epoch": 7.757660167130919, "grad_norm": 2.989290714263916, "learning_rate": 1.601762364980102e-05, "loss": 1.0875, "step": 5570 }, { "epoch": 7.759052924791086, "grad_norm": 2.9280595779418945, "learning_rate": 1.6007674815235928e-05, "loss": 0.7164, "step": 5571 }, { "epoch": 7.7604456824512535, "grad_norm": 2.8934755325317383, "learning_rate": 1.5997725980670832e-05, "loss": 0.8064, "step": 5572 }, { "epoch": 7.76183844011142, "grad_norm": 2.6276254653930664, "learning_rate": 1.598777714610574e-05, "loss": 0.6988, "step": 5573 }, { "epoch": 7.763231197771588, "grad_norm": 3.078321933746338, "learning_rate": 1.5977828311540647e-05, "loss": 0.8105, "step": 5574 }, { "epoch": 7.764623955431755, "grad_norm": 2.2949001789093018, "learning_rate": 1.5967879476975555e-05, "loss": 0.5527, "step": 5575 }, { "epoch": 7.766016713091922, "grad_norm": 2.639017105102539, "learning_rate": 1.595793064241046e-05, "loss": 0.737, "step": 5576 }, { "epoch": 7.767409470752089, "grad_norm": 3.0754311084747314, "learning_rate": 1.5947981807845366e-05, "loss": 0.8296, "step": 5577 }, { "epoch": 7.768802228412256, "grad_norm": 2.674020767211914, "learning_rate": 1.593803297328027e-05, "loss": 0.8421, "step": 5578 }, { "epoch": 7.770194986072424, "grad_norm": 2.3705127239227295, "learning_rate": 1.5928084138715178e-05, "loss": 0.6161, "step": 5579 }, { "epoch": 7.77158774373259, "grad_norm": 2.5708835124969482, "learning_rate": 1.5918135304150085e-05, "loss": 0.7274, "step": 5580 }, { "epoch": 7.772980501392758, "grad_norm": 2.8865108489990234, "learning_rate": 1.590818646958499e-05, "loss": 0.7965, "step": 5581 }, { "epoch": 7.774373259052925, "grad_norm": 2.9721262454986572, "learning_rate": 1.5898237635019897e-05, "loss": 0.784, "step": 5582 }, { "epoch": 7.775766016713092, "grad_norm": 2.682227849960327, "learning_rate": 1.58882888004548e-05, "loss": 0.6563, "step": 5583 }, { "epoch": 7.777158774373259, "grad_norm": 2.9040980339050293, "learning_rate": 1.5878339965889708e-05, "loss": 0.7591, "step": 5584 }, { "epoch": 7.778551532033426, "grad_norm": 2.747191905975342, "learning_rate": 1.5868391131324616e-05, "loss": 0.7726, "step": 5585 }, { "epoch": 7.779944289693593, "grad_norm": 2.908496856689453, "learning_rate": 1.585844229675952e-05, "loss": 0.7806, "step": 5586 }, { "epoch": 7.781337047353761, "grad_norm": 2.671966552734375, "learning_rate": 1.5848493462194427e-05, "loss": 0.6197, "step": 5587 }, { "epoch": 7.782729805013927, "grad_norm": 2.7716825008392334, "learning_rate": 1.583854462762933e-05, "loss": 0.9544, "step": 5588 }, { "epoch": 7.784122562674095, "grad_norm": 3.0952935218811035, "learning_rate": 1.582859579306424e-05, "loss": 0.8225, "step": 5589 }, { "epoch": 7.7855153203342615, "grad_norm": 2.783839464187622, "learning_rate": 1.5818646958499146e-05, "loss": 0.6937, "step": 5590 }, { "epoch": 7.786908077994429, "grad_norm": 2.6238903999328613, "learning_rate": 1.5808698123934054e-05, "loss": 0.7029, "step": 5591 }, { "epoch": 7.788300835654596, "grad_norm": 2.4679155349731445, "learning_rate": 1.5798749289368958e-05, "loss": 0.7577, "step": 5592 }, { "epoch": 7.789693593314763, "grad_norm": 2.842315912246704, "learning_rate": 1.5788800454803865e-05, "loss": 0.7747, "step": 5593 }, { "epoch": 7.791086350974931, "grad_norm": 2.250731945037842, "learning_rate": 1.577885162023877e-05, "loss": 0.586, "step": 5594 }, { "epoch": 7.7924791086350975, "grad_norm": 2.5972139835357666, "learning_rate": 1.5768902785673677e-05, "loss": 0.7453, "step": 5595 }, { "epoch": 7.793871866295264, "grad_norm": 3.002807140350342, "learning_rate": 1.5758953951108584e-05, "loss": 0.8393, "step": 5596 }, { "epoch": 7.795264623955432, "grad_norm": 2.551990032196045, "learning_rate": 1.574900511654349e-05, "loss": 0.7232, "step": 5597 }, { "epoch": 7.796657381615599, "grad_norm": 2.2206497192382812, "learning_rate": 1.5739056281978396e-05, "loss": 0.6792, "step": 5598 }, { "epoch": 7.798050139275766, "grad_norm": 2.9718410968780518, "learning_rate": 1.57291074474133e-05, "loss": 1.0455, "step": 5599 }, { "epoch": 7.7994428969359335, "grad_norm": 3.1451773643493652, "learning_rate": 1.5719158612848207e-05, "loss": 0.7256, "step": 5600 }, { "epoch": 7.8008356545961, "grad_norm": 2.6781413555145264, "learning_rate": 1.5709209778283115e-05, "loss": 0.869, "step": 5601 }, { "epoch": 7.802228412256268, "grad_norm": 2.681276798248291, "learning_rate": 1.569926094371802e-05, "loss": 0.8421, "step": 5602 }, { "epoch": 7.803621169916434, "grad_norm": 2.325745105743408, "learning_rate": 1.5689312109152926e-05, "loss": 0.6422, "step": 5603 }, { "epoch": 7.805013927576602, "grad_norm": 3.1258108615875244, "learning_rate": 1.5679363274587834e-05, "loss": 0.8142, "step": 5604 }, { "epoch": 7.806406685236769, "grad_norm": 2.8076539039611816, "learning_rate": 1.5669414440022738e-05, "loss": 0.9406, "step": 5605 }, { "epoch": 7.807799442896936, "grad_norm": 2.98014497756958, "learning_rate": 1.5659465605457645e-05, "loss": 0.82, "step": 5606 }, { "epoch": 7.809192200557103, "grad_norm": 2.5118582248687744, "learning_rate": 1.5649516770892553e-05, "loss": 0.7875, "step": 5607 }, { "epoch": 7.81058495821727, "grad_norm": 2.427443265914917, "learning_rate": 1.5639567936327457e-05, "loss": 0.7352, "step": 5608 }, { "epoch": 7.811977715877437, "grad_norm": 2.578648567199707, "learning_rate": 1.5629619101762364e-05, "loss": 0.6233, "step": 5609 }, { "epoch": 7.813370473537605, "grad_norm": 2.2706801891326904, "learning_rate": 1.561967026719727e-05, "loss": 0.6984, "step": 5610 }, { "epoch": 7.814763231197771, "grad_norm": 2.4212236404418945, "learning_rate": 1.5609721432632176e-05, "loss": 0.6463, "step": 5611 }, { "epoch": 7.816155988857939, "grad_norm": 2.941241502761841, "learning_rate": 1.5599772598067083e-05, "loss": 0.8984, "step": 5612 }, { "epoch": 7.8175487465181055, "grad_norm": 2.364943265914917, "learning_rate": 1.5589823763501987e-05, "loss": 0.7095, "step": 5613 }, { "epoch": 7.818941504178273, "grad_norm": 2.514819383621216, "learning_rate": 1.5579874928936895e-05, "loss": 0.8956, "step": 5614 }, { "epoch": 7.82033426183844, "grad_norm": 2.2634267807006836, "learning_rate": 1.55699260943718e-05, "loss": 0.7432, "step": 5615 }, { "epoch": 7.821727019498607, "grad_norm": 2.539466619491577, "learning_rate": 1.5559977259806706e-05, "loss": 0.745, "step": 5616 }, { "epoch": 7.823119777158775, "grad_norm": 3.175222158432007, "learning_rate": 1.5550028425241614e-05, "loss": 0.7331, "step": 5617 }, { "epoch": 7.8245125348189415, "grad_norm": 2.66514253616333, "learning_rate": 1.554007959067652e-05, "loss": 0.646, "step": 5618 }, { "epoch": 7.825905292479108, "grad_norm": 4.708824157714844, "learning_rate": 1.5530130756111425e-05, "loss": 0.6814, "step": 5619 }, { "epoch": 7.827298050139276, "grad_norm": 2.8916537761688232, "learning_rate": 1.5520181921546333e-05, "loss": 0.9828, "step": 5620 }, { "epoch": 7.828690807799443, "grad_norm": 2.7039146423339844, "learning_rate": 1.5510233086981237e-05, "loss": 1.0111, "step": 5621 }, { "epoch": 7.83008356545961, "grad_norm": 2.8030519485473633, "learning_rate": 1.5500284252416144e-05, "loss": 0.8601, "step": 5622 }, { "epoch": 7.8314763231197775, "grad_norm": 2.383009672164917, "learning_rate": 1.5490335417851052e-05, "loss": 0.8488, "step": 5623 }, { "epoch": 7.832869080779944, "grad_norm": 2.9474103450775146, "learning_rate": 1.5480386583285956e-05, "loss": 0.88, "step": 5624 }, { "epoch": 7.834261838440112, "grad_norm": 2.38859224319458, "learning_rate": 1.5470437748720863e-05, "loss": 0.7978, "step": 5625 }, { "epoch": 7.835654596100278, "grad_norm": 2.316699743270874, "learning_rate": 1.5460488914155768e-05, "loss": 0.6718, "step": 5626 }, { "epoch": 7.837047353760446, "grad_norm": 2.29079008102417, "learning_rate": 1.5450540079590675e-05, "loss": 0.661, "step": 5627 }, { "epoch": 7.838440111420613, "grad_norm": 2.701103925704956, "learning_rate": 1.5440591245025582e-05, "loss": 0.8497, "step": 5628 }, { "epoch": 7.83983286908078, "grad_norm": 3.1637492179870605, "learning_rate": 1.5430642410460487e-05, "loss": 1.0143, "step": 5629 }, { "epoch": 7.841225626740947, "grad_norm": 2.374556064605713, "learning_rate": 1.5420693575895394e-05, "loss": 0.6001, "step": 5630 }, { "epoch": 7.842618384401114, "grad_norm": 2.5755841732025146, "learning_rate": 1.54107447413303e-05, "loss": 0.7803, "step": 5631 }, { "epoch": 7.844011142061281, "grad_norm": 2.7818307876586914, "learning_rate": 1.5400795906765206e-05, "loss": 0.9121, "step": 5632 }, { "epoch": 7.845403899721449, "grad_norm": 2.5337882041931152, "learning_rate": 1.5390847072200113e-05, "loss": 0.7806, "step": 5633 }, { "epoch": 7.846796657381615, "grad_norm": 3.340794086456299, "learning_rate": 1.538089823763502e-05, "loss": 0.9883, "step": 5634 }, { "epoch": 7.848189415041783, "grad_norm": 4.154270172119141, "learning_rate": 1.5370949403069925e-05, "loss": 0.8184, "step": 5635 }, { "epoch": 7.84958217270195, "grad_norm": 2.594151496887207, "learning_rate": 1.5361000568504832e-05, "loss": 0.6705, "step": 5636 }, { "epoch": 7.850974930362117, "grad_norm": 2.5860893726348877, "learning_rate": 1.5351051733939736e-05, "loss": 0.8204, "step": 5637 }, { "epoch": 7.852367688022284, "grad_norm": 2.2906734943389893, "learning_rate": 1.5341102899374644e-05, "loss": 0.7963, "step": 5638 }, { "epoch": 7.853760445682451, "grad_norm": 2.1849262714385986, "learning_rate": 1.533115406480955e-05, "loss": 0.7335, "step": 5639 }, { "epoch": 7.855153203342619, "grad_norm": 2.508540153503418, "learning_rate": 1.5321205230244455e-05, "loss": 0.7183, "step": 5640 }, { "epoch": 7.8565459610027855, "grad_norm": 2.3108091354370117, "learning_rate": 1.5311256395679363e-05, "loss": 0.6723, "step": 5641 }, { "epoch": 7.857938718662953, "grad_norm": 2.8738508224487305, "learning_rate": 1.5301307561114267e-05, "loss": 0.8377, "step": 5642 }, { "epoch": 7.85933147632312, "grad_norm": 2.4582111835479736, "learning_rate": 1.5291358726549174e-05, "loss": 0.7304, "step": 5643 }, { "epoch": 7.860724233983287, "grad_norm": 2.4296915531158447, "learning_rate": 1.5281409891984078e-05, "loss": 0.7116, "step": 5644 }, { "epoch": 7.862116991643454, "grad_norm": 2.100569248199463, "learning_rate": 1.527146105741899e-05, "loss": 0.5974, "step": 5645 }, { "epoch": 7.8635097493036215, "grad_norm": 2.504051923751831, "learning_rate": 1.5261512222853893e-05, "loss": 0.7527, "step": 5646 }, { "epoch": 7.864902506963788, "grad_norm": 2.7811291217803955, "learning_rate": 1.5251563388288799e-05, "loss": 0.8601, "step": 5647 }, { "epoch": 7.866295264623956, "grad_norm": 2.9225103855133057, "learning_rate": 1.5241614553723705e-05, "loss": 0.8525, "step": 5648 }, { "epoch": 7.867688022284122, "grad_norm": 2.3884072303771973, "learning_rate": 1.523166571915861e-05, "loss": 0.6826, "step": 5649 }, { "epoch": 7.86908077994429, "grad_norm": 2.5493416786193848, "learning_rate": 1.5221716884593518e-05, "loss": 0.5761, "step": 5650 }, { "epoch": 7.870473537604457, "grad_norm": 2.43230938911438, "learning_rate": 1.5211768050028425e-05, "loss": 0.7446, "step": 5651 }, { "epoch": 7.871866295264624, "grad_norm": 4.126525402069092, "learning_rate": 1.5201819215463331e-05, "loss": 0.7879, "step": 5652 }, { "epoch": 7.873259052924791, "grad_norm": 2.607642889022827, "learning_rate": 1.5191870380898237e-05, "loss": 0.8421, "step": 5653 }, { "epoch": 7.874651810584958, "grad_norm": 2.85125732421875, "learning_rate": 1.5181921546333143e-05, "loss": 0.7928, "step": 5654 }, { "epoch": 7.876044568245125, "grad_norm": 2.496875762939453, "learning_rate": 1.5171972711768048e-05, "loss": 0.785, "step": 5655 }, { "epoch": 7.8774373259052926, "grad_norm": 2.5948171615600586, "learning_rate": 1.5162023877202956e-05, "loss": 0.7341, "step": 5656 }, { "epoch": 7.878830083565459, "grad_norm": 3.0469987392425537, "learning_rate": 1.5152075042637862e-05, "loss": 0.8603, "step": 5657 }, { "epoch": 7.880222841225627, "grad_norm": 2.4342963695526123, "learning_rate": 1.5142126208072767e-05, "loss": 0.6999, "step": 5658 }, { "epoch": 7.881615598885794, "grad_norm": 2.8552112579345703, "learning_rate": 1.5132177373507673e-05, "loss": 0.8601, "step": 5659 }, { "epoch": 7.883008356545961, "grad_norm": 2.7369091510772705, "learning_rate": 1.5122228538942579e-05, "loss": 0.8949, "step": 5660 }, { "epoch": 7.884401114206128, "grad_norm": 2.396972179412842, "learning_rate": 1.5112279704377486e-05, "loss": 0.7546, "step": 5661 }, { "epoch": 7.885793871866295, "grad_norm": 2.2595653533935547, "learning_rate": 1.5102330869812392e-05, "loss": 0.6355, "step": 5662 }, { "epoch": 7.887186629526463, "grad_norm": 2.827040195465088, "learning_rate": 1.5092382035247298e-05, "loss": 0.8552, "step": 5663 }, { "epoch": 7.888579387186629, "grad_norm": 2.8968186378479004, "learning_rate": 1.5082433200682204e-05, "loss": 0.8445, "step": 5664 }, { "epoch": 7.889972144846797, "grad_norm": 2.5456926822662354, "learning_rate": 1.507248436611711e-05, "loss": 0.7291, "step": 5665 }, { "epoch": 7.891364902506964, "grad_norm": 2.5830819606781006, "learning_rate": 1.5062535531552015e-05, "loss": 0.7048, "step": 5666 }, { "epoch": 7.892757660167131, "grad_norm": 2.4604547023773193, "learning_rate": 1.5052586696986924e-05, "loss": 0.7161, "step": 5667 }, { "epoch": 7.894150417827298, "grad_norm": 2.7419490814208984, "learning_rate": 1.504263786242183e-05, "loss": 1.0447, "step": 5668 }, { "epoch": 7.895543175487465, "grad_norm": 2.6495437622070312, "learning_rate": 1.5032689027856736e-05, "loss": 0.7987, "step": 5669 }, { "epoch": 7.896935933147632, "grad_norm": 2.556122303009033, "learning_rate": 1.5022740193291642e-05, "loss": 0.8719, "step": 5670 }, { "epoch": 7.8983286908078, "grad_norm": 2.394429922103882, "learning_rate": 1.5012791358726547e-05, "loss": 0.7168, "step": 5671 }, { "epoch": 7.899721448467966, "grad_norm": 2.569481134414673, "learning_rate": 1.5002842524161455e-05, "loss": 0.7791, "step": 5672 }, { "epoch": 7.901114206128134, "grad_norm": 2.329503059387207, "learning_rate": 1.499289368959636e-05, "loss": 0.609, "step": 5673 }, { "epoch": 7.9025069637883005, "grad_norm": 2.459470272064209, "learning_rate": 1.4982944855031266e-05, "loss": 0.7467, "step": 5674 }, { "epoch": 7.903899721448468, "grad_norm": 2.417170524597168, "learning_rate": 1.4972996020466172e-05, "loss": 0.7591, "step": 5675 }, { "epoch": 7.905292479108635, "grad_norm": 2.2791519165039062, "learning_rate": 1.4963047185901078e-05, "loss": 0.6549, "step": 5676 }, { "epoch": 7.906685236768802, "grad_norm": 2.9026219844818115, "learning_rate": 1.4953098351335984e-05, "loss": 0.7479, "step": 5677 }, { "epoch": 7.908077994428969, "grad_norm": 2.889512300491333, "learning_rate": 1.4943149516770891e-05, "loss": 0.8028, "step": 5678 }, { "epoch": 7.9094707520891365, "grad_norm": 2.6889922618865967, "learning_rate": 1.4933200682205799e-05, "loss": 0.7674, "step": 5679 }, { "epoch": 7.910863509749303, "grad_norm": 2.6476426124572754, "learning_rate": 1.4923251847640704e-05, "loss": 0.6657, "step": 5680 }, { "epoch": 7.912256267409471, "grad_norm": 2.9579553604125977, "learning_rate": 1.491330301307561e-05, "loss": 0.8853, "step": 5681 }, { "epoch": 7.913649025069638, "grad_norm": 1.990238904953003, "learning_rate": 1.4903354178510516e-05, "loss": 0.4864, "step": 5682 }, { "epoch": 7.915041782729805, "grad_norm": 2.6816718578338623, "learning_rate": 1.4893405343945423e-05, "loss": 0.8325, "step": 5683 }, { "epoch": 7.9164345403899725, "grad_norm": 2.970919132232666, "learning_rate": 1.488345650938033e-05, "loss": 0.8256, "step": 5684 }, { "epoch": 7.917827298050139, "grad_norm": 2.6956913471221924, "learning_rate": 1.4873507674815235e-05, "loss": 0.7907, "step": 5685 }, { "epoch": 7.919220055710307, "grad_norm": 2.150650978088379, "learning_rate": 1.486355884025014e-05, "loss": 0.7931, "step": 5686 }, { "epoch": 7.920612813370473, "grad_norm": 2.735020399093628, "learning_rate": 1.4853610005685047e-05, "loss": 0.7559, "step": 5687 }, { "epoch": 7.922005571030641, "grad_norm": 2.7567038536071777, "learning_rate": 1.4843661171119952e-05, "loss": 0.9638, "step": 5688 }, { "epoch": 7.923398328690808, "grad_norm": 2.8808114528656006, "learning_rate": 1.483371233655486e-05, "loss": 0.7945, "step": 5689 }, { "epoch": 7.924791086350975, "grad_norm": 2.803769111633301, "learning_rate": 1.4823763501989766e-05, "loss": 1.0416, "step": 5690 }, { "epoch": 7.926183844011142, "grad_norm": 2.9110865592956543, "learning_rate": 1.4813814667424671e-05, "loss": 0.7346, "step": 5691 }, { "epoch": 7.927576601671309, "grad_norm": 3.0096850395202637, "learning_rate": 1.4803865832859577e-05, "loss": 0.7868, "step": 5692 }, { "epoch": 7.928969359331476, "grad_norm": 2.6202094554901123, "learning_rate": 1.4793916998294483e-05, "loss": 0.8138, "step": 5693 }, { "epoch": 7.930362116991644, "grad_norm": 2.9771244525909424, "learning_rate": 1.4783968163729392e-05, "loss": 0.864, "step": 5694 }, { "epoch": 7.93175487465181, "grad_norm": 2.3767271041870117, "learning_rate": 1.4774019329164298e-05, "loss": 0.6459, "step": 5695 }, { "epoch": 7.933147632311978, "grad_norm": 2.8269755840301514, "learning_rate": 1.4764070494599204e-05, "loss": 0.8445, "step": 5696 }, { "epoch": 7.9345403899721445, "grad_norm": 2.45312237739563, "learning_rate": 1.475412166003411e-05, "loss": 0.7114, "step": 5697 }, { "epoch": 7.935933147632312, "grad_norm": 2.465182065963745, "learning_rate": 1.4744172825469015e-05, "loss": 0.6793, "step": 5698 }, { "epoch": 7.937325905292479, "grad_norm": 2.35618257522583, "learning_rate": 1.473422399090392e-05, "loss": 0.7439, "step": 5699 }, { "epoch": 7.938718662952646, "grad_norm": 2.613288402557373, "learning_rate": 1.4724275156338828e-05, "loss": 0.8685, "step": 5700 }, { "epoch": 7.940111420612814, "grad_norm": 2.3746984004974365, "learning_rate": 1.4714326321773734e-05, "loss": 0.778, "step": 5701 }, { "epoch": 7.9415041782729805, "grad_norm": 2.941166400909424, "learning_rate": 1.470437748720864e-05, "loss": 0.6549, "step": 5702 }, { "epoch": 7.942896935933147, "grad_norm": 2.9136996269226074, "learning_rate": 1.4694428652643546e-05, "loss": 0.9501, "step": 5703 }, { "epoch": 7.944289693593315, "grad_norm": 3.1199958324432373, "learning_rate": 1.4684479818078451e-05, "loss": 0.931, "step": 5704 }, { "epoch": 7.945682451253482, "grad_norm": 2.5881006717681885, "learning_rate": 1.4674530983513359e-05, "loss": 0.7387, "step": 5705 }, { "epoch": 7.947075208913649, "grad_norm": 2.805751085281372, "learning_rate": 1.4664582148948265e-05, "loss": 0.8509, "step": 5706 }, { "epoch": 7.9484679665738165, "grad_norm": 2.4799911975860596, "learning_rate": 1.4654633314383172e-05, "loss": 0.7261, "step": 5707 }, { "epoch": 7.949860724233983, "grad_norm": 2.4309656620025635, "learning_rate": 1.4644684479818078e-05, "loss": 0.7461, "step": 5708 }, { "epoch": 7.951253481894151, "grad_norm": 2.6114726066589355, "learning_rate": 1.4634735645252984e-05, "loss": 0.7763, "step": 5709 }, { "epoch": 7.952646239554317, "grad_norm": 2.3870186805725098, "learning_rate": 1.462478681068789e-05, "loss": 0.6467, "step": 5710 }, { "epoch": 7.954038997214485, "grad_norm": 2.8836123943328857, "learning_rate": 1.4614837976122797e-05, "loss": 0.9972, "step": 5711 }, { "epoch": 7.955431754874652, "grad_norm": 3.082615375518799, "learning_rate": 1.4604889141557703e-05, "loss": 0.9587, "step": 5712 }, { "epoch": 7.956824512534819, "grad_norm": 2.9020183086395264, "learning_rate": 1.4594940306992608e-05, "loss": 0.9053, "step": 5713 }, { "epoch": 7.958217270194986, "grad_norm": 2.556014060974121, "learning_rate": 1.4584991472427514e-05, "loss": 0.7685, "step": 5714 }, { "epoch": 7.959610027855153, "grad_norm": 2.619906425476074, "learning_rate": 1.457504263786242e-05, "loss": 0.7352, "step": 5715 }, { "epoch": 7.96100278551532, "grad_norm": 2.594573497772217, "learning_rate": 1.4565093803297326e-05, "loss": 0.743, "step": 5716 }, { "epoch": 7.962395543175488, "grad_norm": 2.6292269229888916, "learning_rate": 1.4555144968732233e-05, "loss": 0.6509, "step": 5717 }, { "epoch": 7.963788300835654, "grad_norm": 2.802565336227417, "learning_rate": 1.4545196134167139e-05, "loss": 0.9216, "step": 5718 }, { "epoch": 7.965181058495822, "grad_norm": 2.444504737854004, "learning_rate": 1.4535247299602045e-05, "loss": 0.7161, "step": 5719 }, { "epoch": 7.9665738161559885, "grad_norm": 2.265277862548828, "learning_rate": 1.452529846503695e-05, "loss": 0.5789, "step": 5720 }, { "epoch": 7.967966573816156, "grad_norm": 2.3342485427856445, "learning_rate": 1.4515349630471856e-05, "loss": 0.6536, "step": 5721 }, { "epoch": 7.969359331476323, "grad_norm": 2.153320550918579, "learning_rate": 1.4505400795906765e-05, "loss": 0.6483, "step": 5722 }, { "epoch": 7.97075208913649, "grad_norm": 2.940307140350342, "learning_rate": 1.4495451961341671e-05, "loss": 0.9323, "step": 5723 }, { "epoch": 7.972144846796658, "grad_norm": 2.91743803024292, "learning_rate": 1.4485503126776577e-05, "loss": 0.8105, "step": 5724 }, { "epoch": 7.9735376044568245, "grad_norm": 2.5661327838897705, "learning_rate": 1.4475554292211483e-05, "loss": 0.8149, "step": 5725 }, { "epoch": 7.974930362116992, "grad_norm": 2.6780052185058594, "learning_rate": 1.4465605457646388e-05, "loss": 0.9606, "step": 5726 }, { "epoch": 7.976323119777159, "grad_norm": 2.502511501312256, "learning_rate": 1.4455656623081294e-05, "loss": 0.6927, "step": 5727 }, { "epoch": 7.977715877437326, "grad_norm": 2.9799442291259766, "learning_rate": 1.4445707788516202e-05, "loss": 0.8756, "step": 5728 }, { "epoch": 7.979108635097493, "grad_norm": 2.6761505603790283, "learning_rate": 1.4435758953951107e-05, "loss": 0.8301, "step": 5729 }, { "epoch": 7.9805013927576605, "grad_norm": 3.159679889678955, "learning_rate": 1.4425810119386013e-05, "loss": 0.7773, "step": 5730 }, { "epoch": 7.981894150417827, "grad_norm": 2.164862632751465, "learning_rate": 1.4415861284820919e-05, "loss": 0.7532, "step": 5731 }, { "epoch": 7.983286908077995, "grad_norm": 3.3656914234161377, "learning_rate": 1.4405912450255825e-05, "loss": 0.9429, "step": 5732 }, { "epoch": 7.984679665738161, "grad_norm": 2.541602373123169, "learning_rate": 1.4395963615690732e-05, "loss": 0.7399, "step": 5733 }, { "epoch": 7.986072423398329, "grad_norm": 2.8899080753326416, "learning_rate": 1.4386014781125638e-05, "loss": 0.8363, "step": 5734 }, { "epoch": 7.987465181058496, "grad_norm": 2.7722439765930176, "learning_rate": 1.4376065946560545e-05, "loss": 0.8408, "step": 5735 }, { "epoch": 7.988857938718663, "grad_norm": 2.579982280731201, "learning_rate": 1.4366117111995451e-05, "loss": 0.7021, "step": 5736 }, { "epoch": 7.99025069637883, "grad_norm": 2.7417590618133545, "learning_rate": 1.4356168277430357e-05, "loss": 0.8916, "step": 5737 }, { "epoch": 7.991643454038997, "grad_norm": 2.7365336418151855, "learning_rate": 1.4346219442865263e-05, "loss": 0.9358, "step": 5738 }, { "epoch": 7.993036211699164, "grad_norm": 2.475696325302124, "learning_rate": 1.433627060830017e-05, "loss": 0.7145, "step": 5739 }, { "epoch": 7.994428969359332, "grad_norm": 2.2998290061950684, "learning_rate": 1.4326321773735076e-05, "loss": 0.575, "step": 5740 }, { "epoch": 7.995821727019498, "grad_norm": 2.5487418174743652, "learning_rate": 1.4316372939169982e-05, "loss": 0.785, "step": 5741 }, { "epoch": 7.997214484679666, "grad_norm": 2.654986619949341, "learning_rate": 1.4306424104604888e-05, "loss": 0.6867, "step": 5742 }, { "epoch": 7.998607242339833, "grad_norm": 2.1407663822174072, "learning_rate": 1.4296475270039793e-05, "loss": 0.6288, "step": 5743 }, { "epoch": 8.0, "grad_norm": 2.55751633644104, "learning_rate": 1.42865264354747e-05, "loss": 0.7525, "step": 5744 }, { "epoch": 8.001392757660167, "grad_norm": 2.391916275024414, "learning_rate": 1.4276577600909607e-05, "loss": 0.7063, "step": 5745 }, { "epoch": 8.002785515320335, "grad_norm": 2.0450918674468994, "learning_rate": 1.4266628766344512e-05, "loss": 0.4889, "step": 5746 }, { "epoch": 8.004178272980502, "grad_norm": 2.7029154300689697, "learning_rate": 1.4256679931779418e-05, "loss": 0.7572, "step": 5747 }, { "epoch": 8.005571030640668, "grad_norm": 2.054926872253418, "learning_rate": 1.4246731097214324e-05, "loss": 0.5982, "step": 5748 }, { "epoch": 8.006963788300835, "grad_norm": 2.607769727706909, "learning_rate": 1.423678226264923e-05, "loss": 0.7556, "step": 5749 }, { "epoch": 8.008356545961004, "grad_norm": 2.3893003463745117, "learning_rate": 1.4226833428084139e-05, "loss": 0.6554, "step": 5750 }, { "epoch": 8.00974930362117, "grad_norm": 2.1952672004699707, "learning_rate": 1.4216884593519045e-05, "loss": 0.5623, "step": 5751 }, { "epoch": 8.011142061281337, "grad_norm": 2.2442495822906494, "learning_rate": 1.420693575895395e-05, "loss": 0.5921, "step": 5752 }, { "epoch": 8.012534818941504, "grad_norm": 2.343412160873413, "learning_rate": 1.4196986924388856e-05, "loss": 0.6303, "step": 5753 }, { "epoch": 8.013927576601672, "grad_norm": 2.431013822555542, "learning_rate": 1.4187038089823762e-05, "loss": 0.5914, "step": 5754 }, { "epoch": 8.015320334261839, "grad_norm": 2.8709309101104736, "learning_rate": 1.417708925525867e-05, "loss": 0.677, "step": 5755 }, { "epoch": 8.016713091922005, "grad_norm": 2.6854491233825684, "learning_rate": 1.4167140420693575e-05, "loss": 0.9044, "step": 5756 }, { "epoch": 8.018105849582172, "grad_norm": 2.2322912216186523, "learning_rate": 1.4157191586128481e-05, "loss": 0.5336, "step": 5757 }, { "epoch": 8.01949860724234, "grad_norm": 2.5030367374420166, "learning_rate": 1.4147242751563387e-05, "loss": 0.6665, "step": 5758 }, { "epoch": 8.020891364902507, "grad_norm": 2.7106118202209473, "learning_rate": 1.4137293916998292e-05, "loss": 0.6792, "step": 5759 }, { "epoch": 8.022284122562674, "grad_norm": 3.1940295696258545, "learning_rate": 1.4127345082433198e-05, "loss": 0.6484, "step": 5760 }, { "epoch": 8.02367688022284, "grad_norm": 2.362699508666992, "learning_rate": 1.4117396247868106e-05, "loss": 0.729, "step": 5761 }, { "epoch": 8.025069637883009, "grad_norm": 2.236466407775879, "learning_rate": 1.4107447413303011e-05, "loss": 0.5975, "step": 5762 }, { "epoch": 8.026462395543176, "grad_norm": 2.5531554222106934, "learning_rate": 1.4097498578737917e-05, "loss": 0.9051, "step": 5763 }, { "epoch": 8.027855153203342, "grad_norm": 2.501363754272461, "learning_rate": 1.4087549744172825e-05, "loss": 0.7443, "step": 5764 }, { "epoch": 8.029247910863509, "grad_norm": 2.6491715908050537, "learning_rate": 1.407760090960773e-05, "loss": 0.7844, "step": 5765 }, { "epoch": 8.030640668523677, "grad_norm": 2.756535291671753, "learning_rate": 1.4067652075042638e-05, "loss": 0.7348, "step": 5766 }, { "epoch": 8.032033426183844, "grad_norm": 2.7806143760681152, "learning_rate": 1.4057703240477544e-05, "loss": 0.6787, "step": 5767 }, { "epoch": 8.03342618384401, "grad_norm": 2.6109886169433594, "learning_rate": 1.404775440591245e-05, "loss": 0.8259, "step": 5768 }, { "epoch": 8.034818941504179, "grad_norm": 2.5723721981048584, "learning_rate": 1.4037805571347355e-05, "loss": 0.911, "step": 5769 }, { "epoch": 8.036211699164346, "grad_norm": 2.319932460784912, "learning_rate": 1.4027856736782261e-05, "loss": 0.552, "step": 5770 }, { "epoch": 8.037604456824512, "grad_norm": 2.5703868865966797, "learning_rate": 1.4017907902217167e-05, "loss": 0.6775, "step": 5771 }, { "epoch": 8.038997214484679, "grad_norm": 2.100165605545044, "learning_rate": 1.4007959067652074e-05, "loss": 0.5726, "step": 5772 }, { "epoch": 8.040389972144848, "grad_norm": 2.1826250553131104, "learning_rate": 1.399801023308698e-05, "loss": 0.7185, "step": 5773 }, { "epoch": 8.041782729805014, "grad_norm": 2.448641061782837, "learning_rate": 1.3988061398521886e-05, "loss": 0.7163, "step": 5774 }, { "epoch": 8.04317548746518, "grad_norm": 2.128293037414551, "learning_rate": 1.3978112563956791e-05, "loss": 0.6839, "step": 5775 }, { "epoch": 8.044568245125348, "grad_norm": 2.3341121673583984, "learning_rate": 1.3968163729391697e-05, "loss": 0.6366, "step": 5776 }, { "epoch": 8.045961002785516, "grad_norm": 2.6132967472076416, "learning_rate": 1.3958214894826606e-05, "loss": 0.6322, "step": 5777 }, { "epoch": 8.047353760445683, "grad_norm": 2.2238409519195557, "learning_rate": 1.3948266060261512e-05, "loss": 0.5845, "step": 5778 }, { "epoch": 8.04874651810585, "grad_norm": 2.2329790592193604, "learning_rate": 1.3938317225696418e-05, "loss": 0.6621, "step": 5779 }, { "epoch": 8.050139275766016, "grad_norm": 2.3248584270477295, "learning_rate": 1.3928368391131324e-05, "loss": 0.7721, "step": 5780 }, { "epoch": 8.051532033426184, "grad_norm": 2.487091302871704, "learning_rate": 1.391841955656623e-05, "loss": 0.7894, "step": 5781 }, { "epoch": 8.052924791086351, "grad_norm": 2.1566736698150635, "learning_rate": 1.3908470722001135e-05, "loss": 0.621, "step": 5782 }, { "epoch": 8.054317548746518, "grad_norm": 2.67290997505188, "learning_rate": 1.3898521887436043e-05, "loss": 0.8812, "step": 5783 }, { "epoch": 8.055710306406684, "grad_norm": 2.1292243003845215, "learning_rate": 1.3888573052870948e-05, "loss": 0.5414, "step": 5784 }, { "epoch": 8.057103064066853, "grad_norm": 4.09104061126709, "learning_rate": 1.3878624218305854e-05, "loss": 0.7092, "step": 5785 }, { "epoch": 8.05849582172702, "grad_norm": 2.084872245788574, "learning_rate": 1.386867538374076e-05, "loss": 0.5472, "step": 5786 }, { "epoch": 8.059888579387186, "grad_norm": 1.9917781352996826, "learning_rate": 1.3858726549175666e-05, "loss": 0.5196, "step": 5787 }, { "epoch": 8.061281337047355, "grad_norm": 2.74969482421875, "learning_rate": 1.3848777714610573e-05, "loss": 0.7509, "step": 5788 }, { "epoch": 8.062674094707521, "grad_norm": 2.2629988193511963, "learning_rate": 1.3838828880045479e-05, "loss": 0.5508, "step": 5789 }, { "epoch": 8.064066852367688, "grad_norm": 3.313831090927124, "learning_rate": 1.3828880045480385e-05, "loss": 1.0226, "step": 5790 }, { "epoch": 8.065459610027855, "grad_norm": 2.245662212371826, "learning_rate": 1.381893121091529e-05, "loss": 0.6747, "step": 5791 }, { "epoch": 8.066852367688023, "grad_norm": 2.039762496948242, "learning_rate": 1.3808982376350198e-05, "loss": 0.6079, "step": 5792 }, { "epoch": 8.06824512534819, "grad_norm": 2.0505447387695312, "learning_rate": 1.3799033541785104e-05, "loss": 0.5687, "step": 5793 }, { "epoch": 8.069637883008356, "grad_norm": 2.716017484664917, "learning_rate": 1.3789084707220011e-05, "loss": 0.7565, "step": 5794 }, { "epoch": 8.071030640668523, "grad_norm": 2.498021125793457, "learning_rate": 1.3779135872654917e-05, "loss": 0.9454, "step": 5795 }, { "epoch": 8.072423398328691, "grad_norm": 2.0985682010650635, "learning_rate": 1.3769187038089823e-05, "loss": 0.6228, "step": 5796 }, { "epoch": 8.073816155988858, "grad_norm": 2.0225183963775635, "learning_rate": 1.3759238203524729e-05, "loss": 0.5725, "step": 5797 }, { "epoch": 8.075208913649025, "grad_norm": 2.3394439220428467, "learning_rate": 1.3749289368959634e-05, "loss": 0.6084, "step": 5798 }, { "epoch": 8.076601671309191, "grad_norm": 2.3332998752593994, "learning_rate": 1.3739340534394542e-05, "loss": 0.6034, "step": 5799 }, { "epoch": 8.07799442896936, "grad_norm": 2.2177438735961914, "learning_rate": 1.3729391699829448e-05, "loss": 0.5445, "step": 5800 }, { "epoch": 8.079387186629527, "grad_norm": 2.416872978210449, "learning_rate": 1.3719442865264353e-05, "loss": 0.6073, "step": 5801 }, { "epoch": 8.080779944289693, "grad_norm": 2.3437678813934326, "learning_rate": 1.3709494030699259e-05, "loss": 0.6007, "step": 5802 }, { "epoch": 8.08217270194986, "grad_norm": 2.0099823474884033, "learning_rate": 1.3699545196134165e-05, "loss": 0.5035, "step": 5803 }, { "epoch": 8.083565459610028, "grad_norm": 2.5457677841186523, "learning_rate": 1.368959636156907e-05, "loss": 0.7358, "step": 5804 }, { "epoch": 8.084958217270195, "grad_norm": 2.1337838172912598, "learning_rate": 1.367964752700398e-05, "loss": 0.6909, "step": 5805 }, { "epoch": 8.086350974930362, "grad_norm": 2.7705256938934326, "learning_rate": 1.3669698692438886e-05, "loss": 0.75, "step": 5806 }, { "epoch": 8.087743732590528, "grad_norm": 2.366692066192627, "learning_rate": 1.3659749857873791e-05, "loss": 0.6185, "step": 5807 }, { "epoch": 8.089136490250697, "grad_norm": 2.7063069343566895, "learning_rate": 1.3649801023308697e-05, "loss": 0.7608, "step": 5808 }, { "epoch": 8.090529247910863, "grad_norm": 2.7802040576934814, "learning_rate": 1.3639852188743603e-05, "loss": 0.7523, "step": 5809 }, { "epoch": 8.09192200557103, "grad_norm": 2.513087034225464, "learning_rate": 1.362990335417851e-05, "loss": 0.6017, "step": 5810 }, { "epoch": 8.093314763231199, "grad_norm": 2.6714537143707275, "learning_rate": 1.3619954519613416e-05, "loss": 0.863, "step": 5811 }, { "epoch": 8.094707520891365, "grad_norm": 2.186694860458374, "learning_rate": 1.3610005685048322e-05, "loss": 0.5695, "step": 5812 }, { "epoch": 8.096100278551532, "grad_norm": 2.306868314743042, "learning_rate": 1.3600056850483228e-05, "loss": 0.6054, "step": 5813 }, { "epoch": 8.097493036211699, "grad_norm": 2.0273630619049072, "learning_rate": 1.3590108015918133e-05, "loss": 0.5264, "step": 5814 }, { "epoch": 8.098885793871867, "grad_norm": 2.6661019325256348, "learning_rate": 1.358015918135304e-05, "loss": 0.7713, "step": 5815 }, { "epoch": 8.100278551532034, "grad_norm": 2.276488780975342, "learning_rate": 1.3570210346787947e-05, "loss": 0.5874, "step": 5816 }, { "epoch": 8.1016713091922, "grad_norm": 2.4340615272521973, "learning_rate": 1.3560261512222852e-05, "loss": 0.7338, "step": 5817 }, { "epoch": 8.103064066852367, "grad_norm": 2.042086362838745, "learning_rate": 1.3550312677657758e-05, "loss": 0.5316, "step": 5818 }, { "epoch": 8.104456824512535, "grad_norm": 2.4385125637054443, "learning_rate": 1.3540363843092664e-05, "loss": 0.7456, "step": 5819 }, { "epoch": 8.105849582172702, "grad_norm": 2.796144723892212, "learning_rate": 1.3530415008527571e-05, "loss": 0.7485, "step": 5820 }, { "epoch": 8.107242339832869, "grad_norm": 2.8352746963500977, "learning_rate": 1.3520466173962479e-05, "loss": 0.6964, "step": 5821 }, { "epoch": 8.108635097493035, "grad_norm": 2.162991762161255, "learning_rate": 1.3510517339397385e-05, "loss": 0.6187, "step": 5822 }, { "epoch": 8.110027855153204, "grad_norm": 2.3581225872039795, "learning_rate": 1.350056850483229e-05, "loss": 0.7305, "step": 5823 }, { "epoch": 8.11142061281337, "grad_norm": 2.6001851558685303, "learning_rate": 1.3490619670267196e-05, "loss": 0.6513, "step": 5824 }, { "epoch": 8.112813370473537, "grad_norm": 2.750192165374756, "learning_rate": 1.3480670835702102e-05, "loss": 0.7932, "step": 5825 }, { "epoch": 8.114206128133704, "grad_norm": 2.251194953918457, "learning_rate": 1.3470722001137008e-05, "loss": 0.6281, "step": 5826 }, { "epoch": 8.115598885793872, "grad_norm": 2.7577006816864014, "learning_rate": 1.3460773166571915e-05, "loss": 0.712, "step": 5827 }, { "epoch": 8.116991643454039, "grad_norm": 2.456524133682251, "learning_rate": 1.3450824332006821e-05, "loss": 0.6428, "step": 5828 }, { "epoch": 8.118384401114206, "grad_norm": 2.1486616134643555, "learning_rate": 1.3440875497441727e-05, "loss": 0.6689, "step": 5829 }, { "epoch": 8.119777158774374, "grad_norm": 2.1123476028442383, "learning_rate": 1.3430926662876632e-05, "loss": 0.5007, "step": 5830 }, { "epoch": 8.12116991643454, "grad_norm": 2.353919506072998, "learning_rate": 1.3420977828311538e-05, "loss": 0.6885, "step": 5831 }, { "epoch": 8.122562674094707, "grad_norm": 2.1658499240875244, "learning_rate": 1.3411028993746447e-05, "loss": 0.5995, "step": 5832 }, { "epoch": 8.123955431754874, "grad_norm": 2.5123870372772217, "learning_rate": 1.3401080159181353e-05, "loss": 0.6298, "step": 5833 }, { "epoch": 8.125348189415043, "grad_norm": 2.5471677780151367, "learning_rate": 1.3391131324616259e-05, "loss": 0.6832, "step": 5834 }, { "epoch": 8.12674094707521, "grad_norm": 2.5444443225860596, "learning_rate": 1.3381182490051165e-05, "loss": 0.648, "step": 5835 }, { "epoch": 8.128133704735376, "grad_norm": 2.4492671489715576, "learning_rate": 1.337123365548607e-05, "loss": 0.6774, "step": 5836 }, { "epoch": 8.129526462395543, "grad_norm": 2.365010976791382, "learning_rate": 1.3361284820920976e-05, "loss": 0.7052, "step": 5837 }, { "epoch": 8.130919220055711, "grad_norm": 2.1995530128479004, "learning_rate": 1.3351335986355884e-05, "loss": 0.6682, "step": 5838 }, { "epoch": 8.132311977715878, "grad_norm": 2.2907214164733887, "learning_rate": 1.334138715179079e-05, "loss": 0.5913, "step": 5839 }, { "epoch": 8.133704735376044, "grad_norm": 2.1191372871398926, "learning_rate": 1.3331438317225695e-05, "loss": 0.5251, "step": 5840 }, { "epoch": 8.135097493036211, "grad_norm": 2.6210310459136963, "learning_rate": 1.3321489482660601e-05, "loss": 0.5285, "step": 5841 }, { "epoch": 8.13649025069638, "grad_norm": 2.2320189476013184, "learning_rate": 1.3311540648095507e-05, "loss": 0.6324, "step": 5842 }, { "epoch": 8.137883008356546, "grad_norm": 2.838926315307617, "learning_rate": 1.3301591813530414e-05, "loss": 0.6681, "step": 5843 }, { "epoch": 8.139275766016713, "grad_norm": 2.5873560905456543, "learning_rate": 1.329164297896532e-05, "loss": 0.7269, "step": 5844 }, { "epoch": 8.14066852367688, "grad_norm": 2.117415428161621, "learning_rate": 1.3281694144400226e-05, "loss": 0.6531, "step": 5845 }, { "epoch": 8.142061281337048, "grad_norm": 2.1007559299468994, "learning_rate": 1.3271745309835132e-05, "loss": 0.5534, "step": 5846 }, { "epoch": 8.143454038997215, "grad_norm": 2.65936279296875, "learning_rate": 1.3261796475270037e-05, "loss": 0.6624, "step": 5847 }, { "epoch": 8.144846796657381, "grad_norm": 2.128936290740967, "learning_rate": 1.3251847640704945e-05, "loss": 0.6995, "step": 5848 }, { "epoch": 8.14623955431755, "grad_norm": 2.5572454929351807, "learning_rate": 1.3241898806139852e-05, "loss": 0.7573, "step": 5849 }, { "epoch": 8.147632311977716, "grad_norm": 2.2091126441955566, "learning_rate": 1.3231949971574758e-05, "loss": 0.6345, "step": 5850 }, { "epoch": 8.149025069637883, "grad_norm": 2.3995816707611084, "learning_rate": 1.3222001137009664e-05, "loss": 0.7709, "step": 5851 }, { "epoch": 8.15041782729805, "grad_norm": 2.8272969722747803, "learning_rate": 1.321205230244457e-05, "loss": 0.6315, "step": 5852 }, { "epoch": 8.151810584958218, "grad_norm": 2.152749538421631, "learning_rate": 1.3202103467879475e-05, "loss": 0.645, "step": 5853 }, { "epoch": 8.153203342618385, "grad_norm": 2.56510853767395, "learning_rate": 1.3192154633314383e-05, "loss": 0.7223, "step": 5854 }, { "epoch": 8.154596100278551, "grad_norm": 2.3824665546417236, "learning_rate": 1.3182205798749289e-05, "loss": 0.85, "step": 5855 }, { "epoch": 8.155988857938718, "grad_norm": 2.052708864212036, "learning_rate": 1.3172256964184194e-05, "loss": 0.6054, "step": 5856 }, { "epoch": 8.157381615598887, "grad_norm": 2.3831207752227783, "learning_rate": 1.31623081296191e-05, "loss": 0.6628, "step": 5857 }, { "epoch": 8.158774373259053, "grad_norm": 2.5285627841949463, "learning_rate": 1.3152359295054006e-05, "loss": 0.91, "step": 5858 }, { "epoch": 8.16016713091922, "grad_norm": 2.5442399978637695, "learning_rate": 1.3142410460488912e-05, "loss": 0.5158, "step": 5859 }, { "epoch": 8.161559888579387, "grad_norm": 2.2371113300323486, "learning_rate": 1.313246162592382e-05, "loss": 0.6232, "step": 5860 }, { "epoch": 8.162952646239555, "grad_norm": 2.5069162845611572, "learning_rate": 1.3122512791358727e-05, "loss": 0.6472, "step": 5861 }, { "epoch": 8.164345403899722, "grad_norm": 1.9748362302780151, "learning_rate": 1.3112563956793632e-05, "loss": 0.4711, "step": 5862 }, { "epoch": 8.165738161559888, "grad_norm": 2.6397035121917725, "learning_rate": 1.3102615122228538e-05, "loss": 0.6641, "step": 5863 }, { "epoch": 8.167130919220055, "grad_norm": 2.3952383995056152, "learning_rate": 1.3092666287663444e-05, "loss": 0.6823, "step": 5864 }, { "epoch": 8.168523676880223, "grad_norm": 2.539874315261841, "learning_rate": 1.3082717453098351e-05, "loss": 0.7934, "step": 5865 }, { "epoch": 8.16991643454039, "grad_norm": 2.3489582538604736, "learning_rate": 1.3072768618533257e-05, "loss": 0.7011, "step": 5866 }, { "epoch": 8.171309192200557, "grad_norm": 2.5650250911712646, "learning_rate": 1.3062819783968163e-05, "loss": 0.6256, "step": 5867 }, { "epoch": 8.172701949860723, "grad_norm": 2.726438045501709, "learning_rate": 1.3052870949403069e-05, "loss": 0.7297, "step": 5868 }, { "epoch": 8.174094707520892, "grad_norm": 2.390582799911499, "learning_rate": 1.3042922114837974e-05, "loss": 0.6669, "step": 5869 }, { "epoch": 8.175487465181059, "grad_norm": 2.175386667251587, "learning_rate": 1.303297328027288e-05, "loss": 0.5627, "step": 5870 }, { "epoch": 8.176880222841225, "grad_norm": 2.3071563243865967, "learning_rate": 1.3023024445707788e-05, "loss": 0.6578, "step": 5871 }, { "epoch": 8.178272980501394, "grad_norm": 2.320700168609619, "learning_rate": 1.3013075611142693e-05, "loss": 0.6997, "step": 5872 }, { "epoch": 8.17966573816156, "grad_norm": 2.4623591899871826, "learning_rate": 1.30031267765776e-05, "loss": 0.707, "step": 5873 }, { "epoch": 8.181058495821727, "grad_norm": 1.9218674898147583, "learning_rate": 1.2993177942012505e-05, "loss": 0.4918, "step": 5874 }, { "epoch": 8.182451253481894, "grad_norm": 2.1007614135742188, "learning_rate": 1.298322910744741e-05, "loss": 0.5732, "step": 5875 }, { "epoch": 8.183844011142062, "grad_norm": 2.257274627685547, "learning_rate": 1.297328027288232e-05, "loss": 0.6475, "step": 5876 }, { "epoch": 8.185236768802229, "grad_norm": 2.255138635635376, "learning_rate": 1.2963331438317226e-05, "loss": 0.6226, "step": 5877 }, { "epoch": 8.186629526462395, "grad_norm": 2.243072271347046, "learning_rate": 1.2953382603752131e-05, "loss": 0.6013, "step": 5878 }, { "epoch": 8.188022284122562, "grad_norm": 2.5485637187957764, "learning_rate": 1.2943433769187037e-05, "loss": 0.6773, "step": 5879 }, { "epoch": 8.18941504178273, "grad_norm": 2.3386006355285645, "learning_rate": 1.2933484934621943e-05, "loss": 0.5755, "step": 5880 }, { "epoch": 8.190807799442897, "grad_norm": 2.494722843170166, "learning_rate": 1.2923536100056849e-05, "loss": 0.6814, "step": 5881 }, { "epoch": 8.192200557103064, "grad_norm": 2.4812235832214355, "learning_rate": 1.2913587265491756e-05, "loss": 0.7207, "step": 5882 }, { "epoch": 8.19359331476323, "grad_norm": 2.194302797317505, "learning_rate": 1.2903638430926662e-05, "loss": 0.6737, "step": 5883 }, { "epoch": 8.194986072423399, "grad_norm": 2.4026405811309814, "learning_rate": 1.2893689596361568e-05, "loss": 0.7586, "step": 5884 }, { "epoch": 8.196378830083566, "grad_norm": 2.3500783443450928, "learning_rate": 1.2883740761796473e-05, "loss": 0.6373, "step": 5885 }, { "epoch": 8.197771587743732, "grad_norm": 2.294128894805908, "learning_rate": 1.287379192723138e-05, "loss": 0.5818, "step": 5886 }, { "epoch": 8.199164345403899, "grad_norm": 3.788611888885498, "learning_rate": 1.2863843092666287e-05, "loss": 0.7514, "step": 5887 }, { "epoch": 8.200557103064067, "grad_norm": 2.2748560905456543, "learning_rate": 1.2853894258101192e-05, "loss": 0.6306, "step": 5888 }, { "epoch": 8.201949860724234, "grad_norm": 2.349860191345215, "learning_rate": 1.28439454235361e-05, "loss": 0.6139, "step": 5889 }, { "epoch": 8.2033426183844, "grad_norm": 2.2774569988250732, "learning_rate": 1.2833996588971006e-05, "loss": 0.649, "step": 5890 }, { "epoch": 8.204735376044567, "grad_norm": 2.288731813430786, "learning_rate": 1.2824047754405911e-05, "loss": 0.6084, "step": 5891 }, { "epoch": 8.206128133704736, "grad_norm": 3.4352667331695557, "learning_rate": 1.2814098919840817e-05, "loss": 0.4728, "step": 5892 }, { "epoch": 8.207520891364902, "grad_norm": 2.0849528312683105, "learning_rate": 1.2804150085275725e-05, "loss": 0.5846, "step": 5893 }, { "epoch": 8.20891364902507, "grad_norm": 3.286938428878784, "learning_rate": 1.279420125071063e-05, "loss": 0.5766, "step": 5894 }, { "epoch": 8.210306406685238, "grad_norm": 2.490229845046997, "learning_rate": 1.2784252416145536e-05, "loss": 0.7511, "step": 5895 }, { "epoch": 8.211699164345404, "grad_norm": 2.1200239658355713, "learning_rate": 1.2774303581580442e-05, "loss": 0.5537, "step": 5896 }, { "epoch": 8.213091922005571, "grad_norm": 2.2673428058624268, "learning_rate": 1.2764354747015348e-05, "loss": 0.6893, "step": 5897 }, { "epoch": 8.214484679665738, "grad_norm": 2.736982822418213, "learning_rate": 1.2754405912450255e-05, "loss": 0.7344, "step": 5898 }, { "epoch": 8.215877437325906, "grad_norm": 2.560347318649292, "learning_rate": 1.2744457077885161e-05, "loss": 0.7171, "step": 5899 }, { "epoch": 8.217270194986073, "grad_norm": 2.484302520751953, "learning_rate": 1.2734508243320067e-05, "loss": 0.6429, "step": 5900 }, { "epoch": 8.21866295264624, "grad_norm": 2.39678692817688, "learning_rate": 1.2724559408754973e-05, "loss": 0.6213, "step": 5901 }, { "epoch": 8.220055710306406, "grad_norm": 2.588151693344116, "learning_rate": 1.2714610574189878e-05, "loss": 0.738, "step": 5902 }, { "epoch": 8.221448467966574, "grad_norm": 2.280254364013672, "learning_rate": 1.2704661739624784e-05, "loss": 0.6531, "step": 5903 }, { "epoch": 8.222841225626741, "grad_norm": 2.2270069122314453, "learning_rate": 1.2694712905059693e-05, "loss": 0.6994, "step": 5904 }, { "epoch": 8.224233983286908, "grad_norm": 2.891584873199463, "learning_rate": 1.2684764070494599e-05, "loss": 0.6786, "step": 5905 }, { "epoch": 8.225626740947074, "grad_norm": 2.5043554306030273, "learning_rate": 1.2674815235929505e-05, "loss": 0.6914, "step": 5906 }, { "epoch": 8.227019498607243, "grad_norm": 2.258073329925537, "learning_rate": 1.266486640136441e-05, "loss": 0.6952, "step": 5907 }, { "epoch": 8.22841225626741, "grad_norm": 2.1068105697631836, "learning_rate": 1.2654917566799316e-05, "loss": 0.499, "step": 5908 }, { "epoch": 8.229805013927576, "grad_norm": 2.5303189754486084, "learning_rate": 1.2644968732234224e-05, "loss": 0.7266, "step": 5909 }, { "epoch": 8.231197771587743, "grad_norm": 2.7753899097442627, "learning_rate": 1.263501989766913e-05, "loss": 0.8295, "step": 5910 }, { "epoch": 8.232590529247911, "grad_norm": 2.471743106842041, "learning_rate": 1.2625071063104035e-05, "loss": 0.5109, "step": 5911 }, { "epoch": 8.233983286908078, "grad_norm": 2.314821481704712, "learning_rate": 1.2615122228538941e-05, "loss": 0.7335, "step": 5912 }, { "epoch": 8.235376044568245, "grad_norm": 2.624682903289795, "learning_rate": 1.2605173393973847e-05, "loss": 0.7699, "step": 5913 }, { "epoch": 8.236768802228413, "grad_norm": 2.130018711090088, "learning_rate": 1.2595224559408753e-05, "loss": 0.5058, "step": 5914 }, { "epoch": 8.23816155988858, "grad_norm": 2.4204928874969482, "learning_rate": 1.258527572484366e-05, "loss": 0.8315, "step": 5915 }, { "epoch": 8.239554317548746, "grad_norm": 2.3368215560913086, "learning_rate": 1.2575326890278566e-05, "loss": 0.5695, "step": 5916 }, { "epoch": 8.240947075208913, "grad_norm": 2.070472478866577, "learning_rate": 1.2565378055713473e-05, "loss": 0.5549, "step": 5917 }, { "epoch": 8.242339832869082, "grad_norm": 2.43178653717041, "learning_rate": 1.2555429221148379e-05, "loss": 0.7577, "step": 5918 }, { "epoch": 8.243732590529248, "grad_norm": 2.3918399810791016, "learning_rate": 1.2545480386583285e-05, "loss": 0.6415, "step": 5919 }, { "epoch": 8.245125348189415, "grad_norm": 2.3452882766723633, "learning_rate": 1.253553155201819e-05, "loss": 0.7346, "step": 5920 }, { "epoch": 8.246518105849582, "grad_norm": 2.5305464267730713, "learning_rate": 1.2525582717453098e-05, "loss": 0.7734, "step": 5921 }, { "epoch": 8.24791086350975, "grad_norm": 2.26212477684021, "learning_rate": 1.2515633882888004e-05, "loss": 0.478, "step": 5922 }, { "epoch": 8.249303621169917, "grad_norm": 2.480069637298584, "learning_rate": 1.250568504832291e-05, "loss": 0.6553, "step": 5923 }, { "epoch": 8.250696378830083, "grad_norm": 2.397341012954712, "learning_rate": 1.2495736213757815e-05, "loss": 0.6801, "step": 5924 }, { "epoch": 8.25208913649025, "grad_norm": 2.522189140319824, "learning_rate": 1.2485787379192721e-05, "loss": 0.6564, "step": 5925 }, { "epoch": 8.253481894150418, "grad_norm": 2.4263949394226074, "learning_rate": 1.2475838544627629e-05, "loss": 0.6019, "step": 5926 }, { "epoch": 8.254874651810585, "grad_norm": 2.0276315212249756, "learning_rate": 1.2465889710062534e-05, "loss": 0.6907, "step": 5927 }, { "epoch": 8.256267409470752, "grad_norm": 2.6812098026275635, "learning_rate": 1.245594087549744e-05, "loss": 0.6119, "step": 5928 }, { "epoch": 8.257660167130918, "grad_norm": 1.9010913372039795, "learning_rate": 1.2445992040932346e-05, "loss": 0.4513, "step": 5929 }, { "epoch": 8.259052924791087, "grad_norm": 2.417888879776001, "learning_rate": 1.2436043206367252e-05, "loss": 0.7542, "step": 5930 }, { "epoch": 8.260445682451254, "grad_norm": 2.392244577407837, "learning_rate": 1.2426094371802157e-05, "loss": 0.7648, "step": 5931 }, { "epoch": 8.26183844011142, "grad_norm": 2.4280102252960205, "learning_rate": 1.2416145537237067e-05, "loss": 0.6441, "step": 5932 }, { "epoch": 8.263231197771589, "grad_norm": 2.130587100982666, "learning_rate": 1.2406196702671972e-05, "loss": 0.6038, "step": 5933 }, { "epoch": 8.264623955431755, "grad_norm": 2.398505687713623, "learning_rate": 1.2396247868106878e-05, "loss": 0.7898, "step": 5934 }, { "epoch": 8.266016713091922, "grad_norm": 2.954123020172119, "learning_rate": 1.2386299033541784e-05, "loss": 0.7322, "step": 5935 }, { "epoch": 8.267409470752089, "grad_norm": 2.5999250411987305, "learning_rate": 1.237635019897669e-05, "loss": 0.7961, "step": 5936 }, { "epoch": 8.268802228412257, "grad_norm": 2.0471620559692383, "learning_rate": 1.2366401364411597e-05, "loss": 0.5758, "step": 5937 }, { "epoch": 8.270194986072424, "grad_norm": 2.710904598236084, "learning_rate": 1.2356452529846503e-05, "loss": 0.6447, "step": 5938 }, { "epoch": 8.27158774373259, "grad_norm": 2.6580443382263184, "learning_rate": 1.2346503695281409e-05, "loss": 0.5315, "step": 5939 }, { "epoch": 8.272980501392757, "grad_norm": 2.3212594985961914, "learning_rate": 1.2336554860716315e-05, "loss": 0.5587, "step": 5940 }, { "epoch": 8.274373259052926, "grad_norm": 2.2812185287475586, "learning_rate": 1.232660602615122e-05, "loss": 0.7328, "step": 5941 }, { "epoch": 8.275766016713092, "grad_norm": 2.3596303462982178, "learning_rate": 1.2316657191586126e-05, "loss": 0.7359, "step": 5942 }, { "epoch": 8.277158774373259, "grad_norm": 2.8402531147003174, "learning_rate": 1.2306708357021034e-05, "loss": 0.7279, "step": 5943 }, { "epoch": 8.278551532033426, "grad_norm": 2.4832041263580322, "learning_rate": 1.229675952245594e-05, "loss": 0.7613, "step": 5944 }, { "epoch": 8.279944289693594, "grad_norm": 2.0726089477539062, "learning_rate": 1.2286810687890847e-05, "loss": 0.6109, "step": 5945 }, { "epoch": 8.28133704735376, "grad_norm": 2.011716604232788, "learning_rate": 1.2276861853325753e-05, "loss": 0.5422, "step": 5946 }, { "epoch": 8.282729805013927, "grad_norm": 2.4055304527282715, "learning_rate": 1.2266913018760658e-05, "loss": 0.6343, "step": 5947 }, { "epoch": 8.284122562674094, "grad_norm": 1.9171804189682007, "learning_rate": 1.2256964184195566e-05, "loss": 0.4508, "step": 5948 }, { "epoch": 8.285515320334262, "grad_norm": 2.1899428367614746, "learning_rate": 1.2247015349630472e-05, "loss": 0.6534, "step": 5949 }, { "epoch": 8.286908077994429, "grad_norm": 2.282148838043213, "learning_rate": 1.2237066515065377e-05, "loss": 0.5948, "step": 5950 }, { "epoch": 8.288300835654596, "grad_norm": 2.289850950241089, "learning_rate": 1.2227117680500283e-05, "loss": 0.5804, "step": 5951 }, { "epoch": 8.289693593314762, "grad_norm": 2.1792290210723877, "learning_rate": 1.2217168845935189e-05, "loss": 0.5808, "step": 5952 }, { "epoch": 8.29108635097493, "grad_norm": 2.6128134727478027, "learning_rate": 1.2207220011370095e-05, "loss": 0.7482, "step": 5953 }, { "epoch": 8.292479108635098, "grad_norm": 2.1930418014526367, "learning_rate": 1.2197271176805002e-05, "loss": 0.6192, "step": 5954 }, { "epoch": 8.293871866295264, "grad_norm": 2.435877561569214, "learning_rate": 1.2187322342239908e-05, "loss": 0.7605, "step": 5955 }, { "epoch": 8.295264623955433, "grad_norm": 2.295370101928711, "learning_rate": 1.2177373507674814e-05, "loss": 0.6078, "step": 5956 }, { "epoch": 8.2966573816156, "grad_norm": 2.4461095333099365, "learning_rate": 1.216742467310972e-05, "loss": 0.621, "step": 5957 }, { "epoch": 8.298050139275766, "grad_norm": 2.2723653316497803, "learning_rate": 1.2157475838544625e-05, "loss": 0.5849, "step": 5958 }, { "epoch": 8.299442896935933, "grad_norm": 1.9850846529006958, "learning_rate": 1.2147527003979534e-05, "loss": 0.5557, "step": 5959 }, { "epoch": 8.300835654596101, "grad_norm": 2.227482557296753, "learning_rate": 1.213757816941444e-05, "loss": 0.5069, "step": 5960 }, { "epoch": 8.302228412256268, "grad_norm": 2.16172456741333, "learning_rate": 1.2127629334849346e-05, "loss": 0.5123, "step": 5961 }, { "epoch": 8.303621169916434, "grad_norm": 2.513965368270874, "learning_rate": 1.2117680500284252e-05, "loss": 0.6231, "step": 5962 }, { "epoch": 8.305013927576601, "grad_norm": 2.4063384532928467, "learning_rate": 1.2107731665719157e-05, "loss": 0.7008, "step": 5963 }, { "epoch": 8.30640668523677, "grad_norm": 2.155329465866089, "learning_rate": 1.2097782831154063e-05, "loss": 0.6408, "step": 5964 }, { "epoch": 8.307799442896936, "grad_norm": 2.879249095916748, "learning_rate": 1.208783399658897e-05, "loss": 0.6533, "step": 5965 }, { "epoch": 8.309192200557103, "grad_norm": 3.0566322803497314, "learning_rate": 1.2077885162023876e-05, "loss": 0.8159, "step": 5966 }, { "epoch": 8.31058495821727, "grad_norm": 2.2032434940338135, "learning_rate": 1.2067936327458782e-05, "loss": 0.6026, "step": 5967 }, { "epoch": 8.311977715877438, "grad_norm": 2.7518277168273926, "learning_rate": 1.2057987492893688e-05, "loss": 0.8638, "step": 5968 }, { "epoch": 8.313370473537605, "grad_norm": 2.5776665210723877, "learning_rate": 1.2048038658328594e-05, "loss": 0.7632, "step": 5969 }, { "epoch": 8.314763231197771, "grad_norm": 2.0614118576049805, "learning_rate": 1.2038089823763501e-05, "loss": 0.4663, "step": 5970 }, { "epoch": 8.316155988857938, "grad_norm": 2.5430283546447754, "learning_rate": 1.2028140989198407e-05, "loss": 0.6766, "step": 5971 }, { "epoch": 8.317548746518106, "grad_norm": 2.0144097805023193, "learning_rate": 1.2018192154633313e-05, "loss": 0.573, "step": 5972 }, { "epoch": 8.318941504178273, "grad_norm": 2.3663172721862793, "learning_rate": 1.200824332006822e-05, "loss": 0.7452, "step": 5973 }, { "epoch": 8.32033426183844, "grad_norm": 2.3292689323425293, "learning_rate": 1.1998294485503126e-05, "loss": 0.4725, "step": 5974 }, { "epoch": 8.321727019498606, "grad_norm": 2.3262789249420166, "learning_rate": 1.1988345650938032e-05, "loss": 0.6988, "step": 5975 }, { "epoch": 8.323119777158775, "grad_norm": 2.0353403091430664, "learning_rate": 1.1978396816372939e-05, "loss": 0.5519, "step": 5976 }, { "epoch": 8.324512534818941, "grad_norm": 2.522467851638794, "learning_rate": 1.1968447981807845e-05, "loss": 0.6967, "step": 5977 }, { "epoch": 8.325905292479108, "grad_norm": 2.53556489944458, "learning_rate": 1.195849914724275e-05, "loss": 0.7762, "step": 5978 }, { "epoch": 8.327298050139277, "grad_norm": 2.6604411602020264, "learning_rate": 1.1948550312677656e-05, "loss": 0.838, "step": 5979 }, { "epoch": 8.328690807799443, "grad_norm": 2.690307378768921, "learning_rate": 1.1938601478112562e-05, "loss": 0.7921, "step": 5980 }, { "epoch": 8.33008356545961, "grad_norm": 2.9018197059631348, "learning_rate": 1.192865264354747e-05, "loss": 0.8936, "step": 5981 }, { "epoch": 8.331476323119777, "grad_norm": 2.4915647506713867, "learning_rate": 1.1918703808982375e-05, "loss": 0.6134, "step": 5982 }, { "epoch": 8.332869080779945, "grad_norm": 2.345963716506958, "learning_rate": 1.1908754974417281e-05, "loss": 0.7294, "step": 5983 }, { "epoch": 8.334261838440112, "grad_norm": 2.099630832672119, "learning_rate": 1.1898806139852187e-05, "loss": 0.6348, "step": 5984 }, { "epoch": 8.335654596100278, "grad_norm": 2.1514601707458496, "learning_rate": 1.1888857305287093e-05, "loss": 0.6229, "step": 5985 }, { "epoch": 8.337047353760445, "grad_norm": 2.3876328468322754, "learning_rate": 1.1878908470721999e-05, "loss": 0.6337, "step": 5986 }, { "epoch": 8.338440111420613, "grad_norm": 2.335765838623047, "learning_rate": 1.1868959636156908e-05, "loss": 0.632, "step": 5987 }, { "epoch": 8.33983286908078, "grad_norm": 2.3674325942993164, "learning_rate": 1.1859010801591813e-05, "loss": 0.6469, "step": 5988 }, { "epoch": 8.341225626740947, "grad_norm": 2.791717529296875, "learning_rate": 1.184906196702672e-05, "loss": 0.7054, "step": 5989 }, { "epoch": 8.342618384401113, "grad_norm": 2.028801679611206, "learning_rate": 1.1839113132461625e-05, "loss": 0.5468, "step": 5990 }, { "epoch": 8.344011142061282, "grad_norm": 2.0309956073760986, "learning_rate": 1.182916429789653e-05, "loss": 0.5663, "step": 5991 }, { "epoch": 8.345403899721449, "grad_norm": 2.292309522628784, "learning_rate": 1.1819215463331438e-05, "loss": 0.5529, "step": 5992 }, { "epoch": 8.346796657381615, "grad_norm": 2.561776876449585, "learning_rate": 1.1809266628766344e-05, "loss": 0.6528, "step": 5993 }, { "epoch": 8.348189415041782, "grad_norm": 2.0604283809661865, "learning_rate": 1.179931779420125e-05, "loss": 0.514, "step": 5994 }, { "epoch": 8.34958217270195, "grad_norm": 2.6872153282165527, "learning_rate": 1.1789368959636156e-05, "loss": 0.7647, "step": 5995 }, { "epoch": 8.350974930362117, "grad_norm": 2.186255693435669, "learning_rate": 1.1779420125071061e-05, "loss": 0.5133, "step": 5996 }, { "epoch": 8.352367688022284, "grad_norm": 2.1541671752929688, "learning_rate": 1.1769471290505967e-05, "loss": 0.5094, "step": 5997 }, { "epoch": 8.35376044568245, "grad_norm": 2.272148847579956, "learning_rate": 1.1759522455940875e-05, "loss": 0.5915, "step": 5998 }, { "epoch": 8.355153203342619, "grad_norm": 2.8752074241638184, "learning_rate": 1.174957362137578e-05, "loss": 0.6677, "step": 5999 }, { "epoch": 8.356545961002785, "grad_norm": 2.014286756515503, "learning_rate": 1.1739624786810686e-05, "loss": 0.4691, "step": 6000 }, { "epoch": 8.357938718662952, "grad_norm": 2.2888171672821045, "learning_rate": 1.1729675952245594e-05, "loss": 0.5322, "step": 6001 }, { "epoch": 8.35933147632312, "grad_norm": 2.31115460395813, "learning_rate": 1.17197271176805e-05, "loss": 0.7171, "step": 6002 }, { "epoch": 8.360724233983287, "grad_norm": 4.947230815887451, "learning_rate": 1.1709778283115407e-05, "loss": 0.4356, "step": 6003 }, { "epoch": 8.362116991643454, "grad_norm": 2.268806219100952, "learning_rate": 1.1699829448550313e-05, "loss": 0.5276, "step": 6004 }, { "epoch": 8.36350974930362, "grad_norm": 2.08506178855896, "learning_rate": 1.1689880613985218e-05, "loss": 0.5546, "step": 6005 }, { "epoch": 8.364902506963789, "grad_norm": 2.5294394493103027, "learning_rate": 1.1679931779420124e-05, "loss": 0.7976, "step": 6006 }, { "epoch": 8.366295264623956, "grad_norm": 2.3474745750427246, "learning_rate": 1.166998294485503e-05, "loss": 0.7576, "step": 6007 }, { "epoch": 8.367688022284122, "grad_norm": 2.291719913482666, "learning_rate": 1.1660034110289936e-05, "loss": 0.6335, "step": 6008 }, { "epoch": 8.369080779944289, "grad_norm": 2.443223237991333, "learning_rate": 1.1650085275724843e-05, "loss": 0.6618, "step": 6009 }, { "epoch": 8.370473537604457, "grad_norm": 2.556466817855835, "learning_rate": 1.1640136441159749e-05, "loss": 0.5996, "step": 6010 }, { "epoch": 8.371866295264624, "grad_norm": 2.3878207206726074, "learning_rate": 1.1630187606594655e-05, "loss": 0.6798, "step": 6011 }, { "epoch": 8.37325905292479, "grad_norm": 2.4887008666992188, "learning_rate": 1.162023877202956e-05, "loss": 0.6105, "step": 6012 }, { "epoch": 8.374651810584957, "grad_norm": 2.777446985244751, "learning_rate": 1.1610289937464466e-05, "loss": 0.6818, "step": 6013 }, { "epoch": 8.376044568245126, "grad_norm": 2.743042469024658, "learning_rate": 1.1600341102899375e-05, "loss": 0.8038, "step": 6014 }, { "epoch": 8.377437325905293, "grad_norm": 1.8929640054702759, "learning_rate": 1.1590392268334281e-05, "loss": 0.518, "step": 6015 }, { "epoch": 8.37883008356546, "grad_norm": 2.203227996826172, "learning_rate": 1.1580443433769187e-05, "loss": 0.5197, "step": 6016 }, { "epoch": 8.380222841225626, "grad_norm": 2.3382623195648193, "learning_rate": 1.1570494599204093e-05, "loss": 0.5744, "step": 6017 }, { "epoch": 8.381615598885794, "grad_norm": 2.7288174629211426, "learning_rate": 1.1560545764638998e-05, "loss": 0.8261, "step": 6018 }, { "epoch": 8.383008356545961, "grad_norm": 2.1930718421936035, "learning_rate": 1.1550596930073904e-05, "loss": 0.5449, "step": 6019 }, { "epoch": 8.384401114206128, "grad_norm": 2.059731960296631, "learning_rate": 1.1540648095508812e-05, "loss": 0.5807, "step": 6020 }, { "epoch": 8.385793871866296, "grad_norm": 2.0126492977142334, "learning_rate": 1.1530699260943717e-05, "loss": 0.479, "step": 6021 }, { "epoch": 8.387186629526463, "grad_norm": 3.5227551460266113, "learning_rate": 1.1520750426378623e-05, "loss": 0.6927, "step": 6022 }, { "epoch": 8.38857938718663, "grad_norm": 2.895655393600464, "learning_rate": 1.1510801591813529e-05, "loss": 0.8958, "step": 6023 }, { "epoch": 8.389972144846796, "grad_norm": 2.062605381011963, "learning_rate": 1.1500852757248435e-05, "loss": 0.5053, "step": 6024 }, { "epoch": 8.391364902506965, "grad_norm": 2.0187153816223145, "learning_rate": 1.1490903922683342e-05, "loss": 0.5552, "step": 6025 }, { "epoch": 8.392757660167131, "grad_norm": 2.3866779804229736, "learning_rate": 1.1480955088118248e-05, "loss": 0.6162, "step": 6026 }, { "epoch": 8.394150417827298, "grad_norm": 2.3278214931488037, "learning_rate": 1.1471006253553154e-05, "loss": 0.6794, "step": 6027 }, { "epoch": 8.395543175487465, "grad_norm": 2.476162910461426, "learning_rate": 1.146105741898806e-05, "loss": 0.682, "step": 6028 }, { "epoch": 8.396935933147633, "grad_norm": 2.229062080383301, "learning_rate": 1.1451108584422967e-05, "loss": 0.6901, "step": 6029 }, { "epoch": 8.3983286908078, "grad_norm": 2.519990921020508, "learning_rate": 1.1441159749857873e-05, "loss": 0.6159, "step": 6030 }, { "epoch": 8.399721448467966, "grad_norm": 1.9492111206054688, "learning_rate": 1.143121091529278e-05, "loss": 0.5927, "step": 6031 }, { "epoch": 8.401114206128133, "grad_norm": 2.318389415740967, "learning_rate": 1.1421262080727686e-05, "loss": 0.6085, "step": 6032 }, { "epoch": 8.402506963788301, "grad_norm": 2.561988115310669, "learning_rate": 1.1411313246162592e-05, "loss": 0.7195, "step": 6033 }, { "epoch": 8.403899721448468, "grad_norm": 2.4790420532226562, "learning_rate": 1.1401364411597497e-05, "loss": 0.8115, "step": 6034 }, { "epoch": 8.405292479108635, "grad_norm": 2.3170247077941895, "learning_rate": 1.1391415577032403e-05, "loss": 0.711, "step": 6035 }, { "epoch": 8.406685236768801, "grad_norm": 2.3760390281677246, "learning_rate": 1.138146674246731e-05, "loss": 0.6746, "step": 6036 }, { "epoch": 8.40807799442897, "grad_norm": 2.3536174297332764, "learning_rate": 1.1371517907902216e-05, "loss": 0.7556, "step": 6037 }, { "epoch": 8.409470752089137, "grad_norm": 2.398081064224243, "learning_rate": 1.1361569073337122e-05, "loss": 0.577, "step": 6038 }, { "epoch": 8.410863509749303, "grad_norm": 2.490631341934204, "learning_rate": 1.1351620238772028e-05, "loss": 0.776, "step": 6039 }, { "epoch": 8.412256267409472, "grad_norm": 2.167231798171997, "learning_rate": 1.1341671404206934e-05, "loss": 0.6009, "step": 6040 }, { "epoch": 8.413649025069638, "grad_norm": 3.3253467082977295, "learning_rate": 1.133172256964184e-05, "loss": 0.7999, "step": 6041 }, { "epoch": 8.415041782729805, "grad_norm": 2.2623608112335205, "learning_rate": 1.1321773735076749e-05, "loss": 0.5736, "step": 6042 }, { "epoch": 8.416434540389972, "grad_norm": 2.054021120071411, "learning_rate": 1.1311824900511654e-05, "loss": 0.5795, "step": 6043 }, { "epoch": 8.41782729805014, "grad_norm": 2.448329448699951, "learning_rate": 1.130187606594656e-05, "loss": 0.8009, "step": 6044 }, { "epoch": 8.419220055710307, "grad_norm": 2.1401994228363037, "learning_rate": 1.1291927231381466e-05, "loss": 0.597, "step": 6045 }, { "epoch": 8.420612813370473, "grad_norm": 2.19303035736084, "learning_rate": 1.1281978396816372e-05, "loss": 0.6198, "step": 6046 }, { "epoch": 8.42200557103064, "grad_norm": 2.4928131103515625, "learning_rate": 1.127202956225128e-05, "loss": 0.5572, "step": 6047 }, { "epoch": 8.423398328690809, "grad_norm": 3.331573009490967, "learning_rate": 1.1262080727686185e-05, "loss": 0.9076, "step": 6048 }, { "epoch": 8.424791086350975, "grad_norm": 2.0832619667053223, "learning_rate": 1.125213189312109e-05, "loss": 0.5759, "step": 6049 }, { "epoch": 8.426183844011142, "grad_norm": 2.446834087371826, "learning_rate": 1.1242183058555997e-05, "loss": 0.7688, "step": 6050 }, { "epoch": 8.427576601671309, "grad_norm": 2.683854818344116, "learning_rate": 1.1232234223990902e-05, "loss": 0.8399, "step": 6051 }, { "epoch": 8.428969359331477, "grad_norm": 2.356480121612549, "learning_rate": 1.1222285389425808e-05, "loss": 0.6126, "step": 6052 }, { "epoch": 8.430362116991644, "grad_norm": 2.6480844020843506, "learning_rate": 1.1212336554860716e-05, "loss": 0.6399, "step": 6053 }, { "epoch": 8.43175487465181, "grad_norm": 2.059575319290161, "learning_rate": 1.1202387720295621e-05, "loss": 0.5944, "step": 6054 }, { "epoch": 8.433147632311977, "grad_norm": 2.2821314334869385, "learning_rate": 1.1192438885730527e-05, "loss": 0.5851, "step": 6055 }, { "epoch": 8.434540389972145, "grad_norm": 2.48276948928833, "learning_rate": 1.1182490051165433e-05, "loss": 0.6728, "step": 6056 }, { "epoch": 8.435933147632312, "grad_norm": 2.7306902408599854, "learning_rate": 1.117254121660034e-05, "loss": 0.7469, "step": 6057 }, { "epoch": 8.437325905292479, "grad_norm": 2.428895950317383, "learning_rate": 1.1162592382035248e-05, "loss": 0.6445, "step": 6058 }, { "epoch": 8.438718662952645, "grad_norm": 2.553884744644165, "learning_rate": 1.1152643547470154e-05, "loss": 0.7236, "step": 6059 }, { "epoch": 8.440111420612814, "grad_norm": 2.110875129699707, "learning_rate": 1.114269471290506e-05, "loss": 0.5479, "step": 6060 }, { "epoch": 8.44150417827298, "grad_norm": 2.2131245136260986, "learning_rate": 1.1132745878339965e-05, "loss": 0.6372, "step": 6061 }, { "epoch": 8.442896935933147, "grad_norm": 2.3978965282440186, "learning_rate": 1.112279704377487e-05, "loss": 0.6745, "step": 6062 }, { "epoch": 8.444289693593316, "grad_norm": 2.78007435798645, "learning_rate": 1.1112848209209777e-05, "loss": 0.7903, "step": 6063 }, { "epoch": 8.445682451253482, "grad_norm": 2.0242226123809814, "learning_rate": 1.1102899374644684e-05, "loss": 0.5913, "step": 6064 }, { "epoch": 8.447075208913649, "grad_norm": 2.1386795043945312, "learning_rate": 1.109295054007959e-05, "loss": 0.5568, "step": 6065 }, { "epoch": 8.448467966573816, "grad_norm": 2.5141165256500244, "learning_rate": 1.1083001705514496e-05, "loss": 0.7348, "step": 6066 }, { "epoch": 8.449860724233984, "grad_norm": 2.2028284072875977, "learning_rate": 1.1073052870949401e-05, "loss": 0.5872, "step": 6067 }, { "epoch": 8.45125348189415, "grad_norm": 2.050466299057007, "learning_rate": 1.1063104036384307e-05, "loss": 0.5879, "step": 6068 }, { "epoch": 8.452646239554317, "grad_norm": 2.748208999633789, "learning_rate": 1.1053155201819215e-05, "loss": 0.9008, "step": 6069 }, { "epoch": 8.454038997214484, "grad_norm": 2.5057647228240967, "learning_rate": 1.1043206367254122e-05, "loss": 0.5594, "step": 6070 }, { "epoch": 8.455431754874652, "grad_norm": 2.995257616043091, "learning_rate": 1.1033257532689028e-05, "loss": 0.6864, "step": 6071 }, { "epoch": 8.45682451253482, "grad_norm": 2.420017957687378, "learning_rate": 1.1023308698123934e-05, "loss": 0.5537, "step": 6072 }, { "epoch": 8.458217270194986, "grad_norm": 2.578761100769043, "learning_rate": 1.101335986355884e-05, "loss": 0.7239, "step": 6073 }, { "epoch": 8.459610027855152, "grad_norm": 2.409921884536743, "learning_rate": 1.1003411028993745e-05, "loss": 0.6184, "step": 6074 }, { "epoch": 8.461002785515321, "grad_norm": 2.3940603733062744, "learning_rate": 1.0993462194428653e-05, "loss": 0.7035, "step": 6075 }, { "epoch": 8.462395543175488, "grad_norm": 2.53452730178833, "learning_rate": 1.0983513359863558e-05, "loss": 0.8525, "step": 6076 }, { "epoch": 8.463788300835654, "grad_norm": 2.4821276664733887, "learning_rate": 1.0973564525298464e-05, "loss": 0.6698, "step": 6077 }, { "epoch": 8.465181058495821, "grad_norm": 2.6680209636688232, "learning_rate": 1.096361569073337e-05, "loss": 0.8518, "step": 6078 }, { "epoch": 8.46657381615599, "grad_norm": 2.228168487548828, "learning_rate": 1.0953666856168276e-05, "loss": 0.6164, "step": 6079 }, { "epoch": 8.467966573816156, "grad_norm": 2.6836867332458496, "learning_rate": 1.0943718021603183e-05, "loss": 0.8314, "step": 6080 }, { "epoch": 8.469359331476323, "grad_norm": 2.4073612689971924, "learning_rate": 1.0933769187038089e-05, "loss": 0.6607, "step": 6081 }, { "epoch": 8.47075208913649, "grad_norm": 2.2259182929992676, "learning_rate": 1.0923820352472995e-05, "loss": 0.6018, "step": 6082 }, { "epoch": 8.472144846796658, "grad_norm": 2.0883285999298096, "learning_rate": 1.09138715179079e-05, "loss": 0.5973, "step": 6083 }, { "epoch": 8.473537604456824, "grad_norm": 2.151155948638916, "learning_rate": 1.0903922683342806e-05, "loss": 0.6066, "step": 6084 }, { "epoch": 8.474930362116991, "grad_norm": 2.8162882328033447, "learning_rate": 1.0893973848777714e-05, "loss": 0.6641, "step": 6085 }, { "epoch": 8.47632311977716, "grad_norm": 2.4459331035614014, "learning_rate": 1.0884025014212621e-05, "loss": 0.6417, "step": 6086 }, { "epoch": 8.477715877437326, "grad_norm": 2.411317825317383, "learning_rate": 1.0874076179647527e-05, "loss": 0.6312, "step": 6087 }, { "epoch": 8.479108635097493, "grad_norm": 2.0946240425109863, "learning_rate": 1.0864127345082433e-05, "loss": 0.487, "step": 6088 }, { "epoch": 8.48050139275766, "grad_norm": 2.4434168338775635, "learning_rate": 1.0854178510517338e-05, "loss": 0.7161, "step": 6089 }, { "epoch": 8.481894150417828, "grad_norm": 2.4298110008239746, "learning_rate": 1.0844229675952244e-05, "loss": 0.7625, "step": 6090 }, { "epoch": 8.483286908077995, "grad_norm": 1.8792526721954346, "learning_rate": 1.0834280841387152e-05, "loss": 0.484, "step": 6091 }, { "epoch": 8.484679665738161, "grad_norm": 2.4808316230773926, "learning_rate": 1.0824332006822057e-05, "loss": 0.6855, "step": 6092 }, { "epoch": 8.486072423398328, "grad_norm": 2.2407937049865723, "learning_rate": 1.0814383172256963e-05, "loss": 0.6131, "step": 6093 }, { "epoch": 8.487465181058496, "grad_norm": 2.2950949668884277, "learning_rate": 1.0804434337691869e-05, "loss": 0.6918, "step": 6094 }, { "epoch": 8.488857938718663, "grad_norm": 2.2543063163757324, "learning_rate": 1.0794485503126775e-05, "loss": 0.6633, "step": 6095 }, { "epoch": 8.49025069637883, "grad_norm": 2.020587205886841, "learning_rate": 1.078453666856168e-05, "loss": 0.517, "step": 6096 }, { "epoch": 8.491643454038996, "grad_norm": 2.5152981281280518, "learning_rate": 1.0774587833996588e-05, "loss": 0.7744, "step": 6097 }, { "epoch": 8.493036211699165, "grad_norm": 2.577939987182617, "learning_rate": 1.0764638999431495e-05, "loss": 0.7159, "step": 6098 }, { "epoch": 8.494428969359332, "grad_norm": 2.560106039047241, "learning_rate": 1.0754690164866401e-05, "loss": 0.724, "step": 6099 }, { "epoch": 8.495821727019498, "grad_norm": 2.466357469558716, "learning_rate": 1.0744741330301307e-05, "loss": 0.7571, "step": 6100 }, { "epoch": 8.497214484679665, "grad_norm": 2.45121169090271, "learning_rate": 1.0734792495736213e-05, "loss": 0.7114, "step": 6101 }, { "epoch": 8.498607242339833, "grad_norm": 2.4164838790893555, "learning_rate": 1.072484366117112e-05, "loss": 0.687, "step": 6102 }, { "epoch": 8.5, "grad_norm": 2.3164844512939453, "learning_rate": 1.0714894826606026e-05, "loss": 0.6274, "step": 6103 }, { "epoch": 8.501392757660167, "grad_norm": 2.7290728092193604, "learning_rate": 1.0704945992040932e-05, "loss": 0.6255, "step": 6104 }, { "epoch": 8.502785515320333, "grad_norm": 2.267289400100708, "learning_rate": 1.0694997157475838e-05, "loss": 0.578, "step": 6105 }, { "epoch": 8.504178272980502, "grad_norm": 2.3413429260253906, "learning_rate": 1.0685048322910743e-05, "loss": 0.5592, "step": 6106 }, { "epoch": 8.505571030640668, "grad_norm": 1.9599863290786743, "learning_rate": 1.0675099488345649e-05, "loss": 0.4539, "step": 6107 }, { "epoch": 8.506963788300835, "grad_norm": 2.0619986057281494, "learning_rate": 1.0665150653780557e-05, "loss": 0.5147, "step": 6108 }, { "epoch": 8.508356545961004, "grad_norm": 2.3537232875823975, "learning_rate": 1.0655201819215462e-05, "loss": 0.638, "step": 6109 }, { "epoch": 8.50974930362117, "grad_norm": 2.3712711334228516, "learning_rate": 1.0645252984650368e-05, "loss": 0.7834, "step": 6110 }, { "epoch": 8.511142061281337, "grad_norm": 1.855421543121338, "learning_rate": 1.0635304150085274e-05, "loss": 0.5217, "step": 6111 }, { "epoch": 8.512534818941504, "grad_norm": 2.5245094299316406, "learning_rate": 1.062535531552018e-05, "loss": 0.6808, "step": 6112 }, { "epoch": 8.513927576601672, "grad_norm": 2.324967384338379, "learning_rate": 1.0615406480955085e-05, "loss": 0.751, "step": 6113 }, { "epoch": 8.515320334261839, "grad_norm": 2.984036922454834, "learning_rate": 1.0605457646389995e-05, "loss": 0.8782, "step": 6114 }, { "epoch": 8.516713091922005, "grad_norm": 2.8980345726013184, "learning_rate": 1.05955088118249e-05, "loss": 0.7493, "step": 6115 }, { "epoch": 8.518105849582172, "grad_norm": 2.2524449825286865, "learning_rate": 1.0585559977259806e-05, "loss": 0.6959, "step": 6116 }, { "epoch": 8.51949860724234, "grad_norm": 2.000723361968994, "learning_rate": 1.0575611142694712e-05, "loss": 0.514, "step": 6117 }, { "epoch": 8.520891364902507, "grad_norm": 2.113044500350952, "learning_rate": 1.0565662308129618e-05, "loss": 0.6553, "step": 6118 }, { "epoch": 8.522284122562674, "grad_norm": 1.9838714599609375, "learning_rate": 1.0555713473564525e-05, "loss": 0.533, "step": 6119 }, { "epoch": 8.52367688022284, "grad_norm": 2.3189139366149902, "learning_rate": 1.0545764638999431e-05, "loss": 0.5607, "step": 6120 }, { "epoch": 8.525069637883009, "grad_norm": 2.4928135871887207, "learning_rate": 1.0535815804434337e-05, "loss": 0.7224, "step": 6121 }, { "epoch": 8.526462395543176, "grad_norm": 1.9980363845825195, "learning_rate": 1.0525866969869242e-05, "loss": 0.4732, "step": 6122 }, { "epoch": 8.527855153203342, "grad_norm": 2.1374313831329346, "learning_rate": 1.0515918135304148e-05, "loss": 0.6443, "step": 6123 }, { "epoch": 8.52924791086351, "grad_norm": 2.423882246017456, "learning_rate": 1.0505969300739054e-05, "loss": 0.7051, "step": 6124 }, { "epoch": 8.530640668523677, "grad_norm": 2.315556526184082, "learning_rate": 1.0496020466173961e-05, "loss": 0.5865, "step": 6125 }, { "epoch": 8.532033426183844, "grad_norm": 2.257117509841919, "learning_rate": 1.0486071631608869e-05, "loss": 0.6034, "step": 6126 }, { "epoch": 8.53342618384401, "grad_norm": 2.1576497554779053, "learning_rate": 1.0476122797043775e-05, "loss": 0.5834, "step": 6127 }, { "epoch": 8.534818941504179, "grad_norm": 2.4187119007110596, "learning_rate": 1.046617396247868e-05, "loss": 0.7332, "step": 6128 }, { "epoch": 8.536211699164346, "grad_norm": 2.3867664337158203, "learning_rate": 1.0456225127913586e-05, "loss": 0.6255, "step": 6129 }, { "epoch": 8.537604456824512, "grad_norm": 2.352771520614624, "learning_rate": 1.0446276293348494e-05, "loss": 0.6072, "step": 6130 }, { "epoch": 8.538997214484679, "grad_norm": 2.334275722503662, "learning_rate": 1.04363274587834e-05, "loss": 0.7366, "step": 6131 }, { "epoch": 8.540389972144848, "grad_norm": 2.602456569671631, "learning_rate": 1.0426378624218305e-05, "loss": 0.7915, "step": 6132 }, { "epoch": 8.541782729805014, "grad_norm": 2.212661027908325, "learning_rate": 1.0416429789653211e-05, "loss": 0.6547, "step": 6133 }, { "epoch": 8.54317548746518, "grad_norm": 2.5381555557250977, "learning_rate": 1.0406480955088117e-05, "loss": 0.7952, "step": 6134 }, { "epoch": 8.544568245125348, "grad_norm": 2.6080639362335205, "learning_rate": 1.0396532120523022e-05, "loss": 0.8796, "step": 6135 }, { "epoch": 8.545961002785516, "grad_norm": 2.1430845260620117, "learning_rate": 1.038658328595793e-05, "loss": 0.702, "step": 6136 }, { "epoch": 8.547353760445683, "grad_norm": 2.3446338176727295, "learning_rate": 1.0376634451392836e-05, "loss": 0.5499, "step": 6137 }, { "epoch": 8.54874651810585, "grad_norm": 2.168344736099243, "learning_rate": 1.0366685616827741e-05, "loss": 0.536, "step": 6138 }, { "epoch": 8.550139275766016, "grad_norm": 2.585874319076538, "learning_rate": 1.0356736782262647e-05, "loss": 0.6948, "step": 6139 }, { "epoch": 8.551532033426184, "grad_norm": 2.2352216243743896, "learning_rate": 1.0346787947697553e-05, "loss": 0.609, "step": 6140 }, { "epoch": 8.552924791086351, "grad_norm": 3.1932296752929688, "learning_rate": 1.0336839113132462e-05, "loss": 0.6917, "step": 6141 }, { "epoch": 8.554317548746518, "grad_norm": 2.3370463848114014, "learning_rate": 1.0326890278567368e-05, "loss": 0.5339, "step": 6142 }, { "epoch": 8.555710306406684, "grad_norm": 2.7332851886749268, "learning_rate": 1.0316941444002274e-05, "loss": 0.8413, "step": 6143 }, { "epoch": 8.557103064066853, "grad_norm": 2.489694595336914, "learning_rate": 1.030699260943718e-05, "loss": 0.7553, "step": 6144 }, { "epoch": 8.55849582172702, "grad_norm": 2.6534366607666016, "learning_rate": 1.0297043774872085e-05, "loss": 0.8061, "step": 6145 }, { "epoch": 8.559888579387186, "grad_norm": 2.3838272094726562, "learning_rate": 1.0287094940306991e-05, "loss": 0.6884, "step": 6146 }, { "epoch": 8.561281337047355, "grad_norm": 2.1057419776916504, "learning_rate": 1.0277146105741898e-05, "loss": 0.5985, "step": 6147 }, { "epoch": 8.562674094707521, "grad_norm": 2.3954782485961914, "learning_rate": 1.0267197271176804e-05, "loss": 0.5937, "step": 6148 }, { "epoch": 8.564066852367688, "grad_norm": 2.503591299057007, "learning_rate": 1.025724843661171e-05, "loss": 0.774, "step": 6149 }, { "epoch": 8.565459610027855, "grad_norm": 2.894399404525757, "learning_rate": 1.0247299602046616e-05, "loss": 0.5723, "step": 6150 }, { "epoch": 8.566852367688023, "grad_norm": 2.4418928623199463, "learning_rate": 1.0237350767481522e-05, "loss": 0.7451, "step": 6151 }, { "epoch": 8.56824512534819, "grad_norm": 2.3711578845977783, "learning_rate": 1.0227401932916429e-05, "loss": 0.7051, "step": 6152 }, { "epoch": 8.569637883008356, "grad_norm": 2.750633716583252, "learning_rate": 1.0217453098351335e-05, "loss": 0.7105, "step": 6153 }, { "epoch": 8.571030640668523, "grad_norm": 2.5556020736694336, "learning_rate": 1.0207504263786242e-05, "loss": 0.7731, "step": 6154 }, { "epoch": 8.572423398328691, "grad_norm": 2.362330913543701, "learning_rate": 1.0197555429221148e-05, "loss": 0.7308, "step": 6155 }, { "epoch": 8.573816155988858, "grad_norm": 2.430133581161499, "learning_rate": 1.0187606594656054e-05, "loss": 0.6906, "step": 6156 }, { "epoch": 8.575208913649025, "grad_norm": 2.301302194595337, "learning_rate": 1.017765776009096e-05, "loss": 0.5844, "step": 6157 }, { "epoch": 8.576601671309191, "grad_norm": 2.2957372665405273, "learning_rate": 1.0167708925525867e-05, "loss": 0.6841, "step": 6158 }, { "epoch": 8.57799442896936, "grad_norm": 2.3437983989715576, "learning_rate": 1.0157760090960773e-05, "loss": 0.6135, "step": 6159 }, { "epoch": 8.579387186629527, "grad_norm": 2.098742961883545, "learning_rate": 1.0147811256395679e-05, "loss": 0.4693, "step": 6160 }, { "epoch": 8.580779944289693, "grad_norm": 2.3117828369140625, "learning_rate": 1.0137862421830584e-05, "loss": 0.5804, "step": 6161 }, { "epoch": 8.58217270194986, "grad_norm": 2.6008477210998535, "learning_rate": 1.012791358726549e-05, "loss": 0.672, "step": 6162 }, { "epoch": 8.583565459610028, "grad_norm": 3.014817476272583, "learning_rate": 1.0117964752700398e-05, "loss": 0.5709, "step": 6163 }, { "epoch": 8.584958217270195, "grad_norm": 2.3694300651550293, "learning_rate": 1.0108015918135303e-05, "loss": 0.6873, "step": 6164 }, { "epoch": 8.586350974930362, "grad_norm": 2.311758279800415, "learning_rate": 1.0098067083570209e-05, "loss": 0.5831, "step": 6165 }, { "epoch": 8.587743732590528, "grad_norm": 2.4471564292907715, "learning_rate": 1.0088118249005115e-05, "loss": 0.6819, "step": 6166 }, { "epoch": 8.589136490250697, "grad_norm": 2.028956413269043, "learning_rate": 1.007816941444002e-05, "loss": 0.5429, "step": 6167 }, { "epoch": 8.590529247910863, "grad_norm": 2.548349380493164, "learning_rate": 1.0068220579874926e-05, "loss": 0.8417, "step": 6168 }, { "epoch": 8.59192200557103, "grad_norm": 1.9181674718856812, "learning_rate": 1.0058271745309836e-05, "loss": 0.5545, "step": 6169 }, { "epoch": 8.593314763231199, "grad_norm": 1.998604416847229, "learning_rate": 1.0048322910744741e-05, "loss": 0.4737, "step": 6170 }, { "epoch": 8.594707520891365, "grad_norm": 1.970977783203125, "learning_rate": 1.0038374076179647e-05, "loss": 0.5835, "step": 6171 }, { "epoch": 8.596100278551532, "grad_norm": 2.541653871536255, "learning_rate": 1.0028425241614553e-05, "loss": 0.7278, "step": 6172 }, { "epoch": 8.597493036211699, "grad_norm": 2.5394837856292725, "learning_rate": 1.0018476407049459e-05, "loss": 0.7412, "step": 6173 }, { "epoch": 8.598885793871867, "grad_norm": 2.08638596534729, "learning_rate": 1.0008527572484366e-05, "loss": 0.5682, "step": 6174 }, { "epoch": 8.600278551532034, "grad_norm": 2.239345073699951, "learning_rate": 9.998578737919272e-06, "loss": 0.6036, "step": 6175 }, { "epoch": 8.6016713091922, "grad_norm": 2.568204641342163, "learning_rate": 9.988629903354178e-06, "loss": 0.8099, "step": 6176 }, { "epoch": 8.603064066852367, "grad_norm": 2.748413324356079, "learning_rate": 9.978681068789083e-06, "loss": 0.9198, "step": 6177 }, { "epoch": 8.604456824512535, "grad_norm": 2.369399309158325, "learning_rate": 9.96873223422399e-06, "loss": 0.6536, "step": 6178 }, { "epoch": 8.605849582172702, "grad_norm": 1.9755353927612305, "learning_rate": 9.958783399658895e-06, "loss": 0.6128, "step": 6179 }, { "epoch": 8.607242339832869, "grad_norm": 2.522010087966919, "learning_rate": 9.948834565093802e-06, "loss": 0.501, "step": 6180 }, { "epoch": 8.608635097493035, "grad_norm": 2.2480692863464355, "learning_rate": 9.938885730528708e-06, "loss": 0.4739, "step": 6181 }, { "epoch": 8.610027855153204, "grad_norm": 2.5440099239349365, "learning_rate": 9.928936895963616e-06, "loss": 0.6548, "step": 6182 }, { "epoch": 8.61142061281337, "grad_norm": 2.194700002670288, "learning_rate": 9.918988061398521e-06, "loss": 0.7249, "step": 6183 }, { "epoch": 8.612813370473537, "grad_norm": 2.014458179473877, "learning_rate": 9.909039226833427e-06, "loss": 0.5552, "step": 6184 }, { "epoch": 8.614206128133706, "grad_norm": 2.3420283794403076, "learning_rate": 9.899090392268335e-06, "loss": 0.7262, "step": 6185 }, { "epoch": 8.615598885793872, "grad_norm": 2.8859808444976807, "learning_rate": 9.88914155770324e-06, "loss": 0.998, "step": 6186 }, { "epoch": 8.616991643454039, "grad_norm": 2.879088878631592, "learning_rate": 9.879192723138146e-06, "loss": 0.8435, "step": 6187 }, { "epoch": 8.618384401114206, "grad_norm": 2.1142868995666504, "learning_rate": 9.869243888573052e-06, "loss": 0.5091, "step": 6188 }, { "epoch": 8.619777158774372, "grad_norm": 2.2945990562438965, "learning_rate": 9.859295054007958e-06, "loss": 0.682, "step": 6189 }, { "epoch": 8.62116991643454, "grad_norm": 2.226578712463379, "learning_rate": 9.849346219442863e-06, "loss": 0.6142, "step": 6190 }, { "epoch": 8.622562674094707, "grad_norm": 2.5178959369659424, "learning_rate": 9.839397384877771e-06, "loss": 0.6461, "step": 6191 }, { "epoch": 8.623955431754874, "grad_norm": 2.4162328243255615, "learning_rate": 9.829448550312677e-06, "loss": 0.6262, "step": 6192 }, { "epoch": 8.625348189415043, "grad_norm": 2.248405933380127, "learning_rate": 9.819499715747582e-06, "loss": 0.639, "step": 6193 }, { "epoch": 8.62674094707521, "grad_norm": 2.142740249633789, "learning_rate": 9.809550881182488e-06, "loss": 0.5656, "step": 6194 }, { "epoch": 8.628133704735376, "grad_norm": 11.2222318649292, "learning_rate": 9.799602046617394e-06, "loss": 0.6877, "step": 6195 }, { "epoch": 8.629526462395543, "grad_norm": 2.5588018894195557, "learning_rate": 9.789653212052303e-06, "loss": 0.7678, "step": 6196 }, { "epoch": 8.630919220055711, "grad_norm": 2.0188772678375244, "learning_rate": 9.779704377487209e-06, "loss": 0.5143, "step": 6197 }, { "epoch": 8.632311977715878, "grad_norm": 2.4895517826080322, "learning_rate": 9.769755542922115e-06, "loss": 0.741, "step": 6198 }, { "epoch": 8.633704735376044, "grad_norm": 2.028364658355713, "learning_rate": 9.75980670835702e-06, "loss": 0.5493, "step": 6199 }, { "epoch": 8.635097493036211, "grad_norm": 2.5568647384643555, "learning_rate": 9.749857873791926e-06, "loss": 0.6039, "step": 6200 }, { "epoch": 8.63649025069638, "grad_norm": 2.497607707977295, "learning_rate": 9.739909039226832e-06, "loss": 0.6868, "step": 6201 }, { "epoch": 8.637883008356546, "grad_norm": 2.6691572666168213, "learning_rate": 9.72996020466174e-06, "loss": 0.8506, "step": 6202 }, { "epoch": 8.639275766016713, "grad_norm": 2.588259220123291, "learning_rate": 9.720011370096645e-06, "loss": 0.6523, "step": 6203 }, { "epoch": 8.64066852367688, "grad_norm": 2.6926379203796387, "learning_rate": 9.710062535531551e-06, "loss": 0.7702, "step": 6204 }, { "epoch": 8.642061281337048, "grad_norm": 2.3415701389312744, "learning_rate": 9.700113700966457e-06, "loss": 0.7117, "step": 6205 }, { "epoch": 8.643454038997215, "grad_norm": 2.263221263885498, "learning_rate": 9.690164866401363e-06, "loss": 0.6067, "step": 6206 }, { "epoch": 8.644846796657381, "grad_norm": 3.0019490718841553, "learning_rate": 9.68021603183627e-06, "loss": 0.6614, "step": 6207 }, { "epoch": 8.64623955431755, "grad_norm": 2.1951980590820312, "learning_rate": 9.670267197271176e-06, "loss": 0.5938, "step": 6208 }, { "epoch": 8.647632311977716, "grad_norm": 2.2621564865112305, "learning_rate": 9.660318362706082e-06, "loss": 0.6477, "step": 6209 }, { "epoch": 8.649025069637883, "grad_norm": 2.6822450160980225, "learning_rate": 9.650369528140989e-06, "loss": 0.7119, "step": 6210 }, { "epoch": 8.65041782729805, "grad_norm": 2.1923351287841797, "learning_rate": 9.640420693575895e-06, "loss": 0.6461, "step": 6211 }, { "epoch": 8.651810584958218, "grad_norm": 2.5055742263793945, "learning_rate": 9.6304718590108e-06, "loss": 0.7208, "step": 6212 }, { "epoch": 8.653203342618385, "grad_norm": 2.129638433456421, "learning_rate": 9.620523024445708e-06, "loss": 0.6233, "step": 6213 }, { "epoch": 8.654596100278551, "grad_norm": 2.059680461883545, "learning_rate": 9.610574189880614e-06, "loss": 0.5662, "step": 6214 }, { "epoch": 8.655988857938718, "grad_norm": 2.1819725036621094, "learning_rate": 9.60062535531552e-06, "loss": 0.5014, "step": 6215 }, { "epoch": 8.657381615598887, "grad_norm": 2.555784225463867, "learning_rate": 9.590676520750425e-06, "loss": 0.6371, "step": 6216 }, { "epoch": 8.658774373259053, "grad_norm": 2.4387550354003906, "learning_rate": 9.580727686185331e-06, "loss": 0.6615, "step": 6217 }, { "epoch": 8.66016713091922, "grad_norm": 1.9884040355682373, "learning_rate": 9.570778851620239e-06, "loss": 0.5488, "step": 6218 }, { "epoch": 8.661559888579387, "grad_norm": 2.188859224319458, "learning_rate": 9.560830017055144e-06, "loss": 0.5665, "step": 6219 }, { "epoch": 8.662952646239555, "grad_norm": 2.0982136726379395, "learning_rate": 9.55088118249005e-06, "loss": 0.6464, "step": 6220 }, { "epoch": 8.664345403899722, "grad_norm": 1.977082371711731, "learning_rate": 9.540932347924956e-06, "loss": 0.4598, "step": 6221 }, { "epoch": 8.665738161559888, "grad_norm": 2.4717345237731934, "learning_rate": 9.530983513359862e-06, "loss": 0.6632, "step": 6222 }, { "epoch": 8.667130919220055, "grad_norm": 2.65958571434021, "learning_rate": 9.521034678794767e-06, "loss": 0.7021, "step": 6223 }, { "epoch": 8.668523676880223, "grad_norm": 1.9559311866760254, "learning_rate": 9.511085844229677e-06, "loss": 0.4133, "step": 6224 }, { "epoch": 8.66991643454039, "grad_norm": 2.413815975189209, "learning_rate": 9.501137009664582e-06, "loss": 0.6967, "step": 6225 }, { "epoch": 8.671309192200557, "grad_norm": 2.0799384117126465, "learning_rate": 9.491188175099488e-06, "loss": 0.6857, "step": 6226 }, { "epoch": 8.672701949860723, "grad_norm": 2.4632363319396973, "learning_rate": 9.481239340534394e-06, "loss": 0.69, "step": 6227 }, { "epoch": 8.674094707520892, "grad_norm": 2.4808237552642822, "learning_rate": 9.4712905059693e-06, "loss": 0.6757, "step": 6228 }, { "epoch": 8.675487465181059, "grad_norm": 2.4378230571746826, "learning_rate": 9.461341671404207e-06, "loss": 0.6528, "step": 6229 }, { "epoch": 8.676880222841225, "grad_norm": 3.099447250366211, "learning_rate": 9.451392836839113e-06, "loss": 0.6603, "step": 6230 }, { "epoch": 8.678272980501394, "grad_norm": 2.001962900161743, "learning_rate": 9.441444002274019e-06, "loss": 0.4916, "step": 6231 }, { "epoch": 8.67966573816156, "grad_norm": 2.306098222732544, "learning_rate": 9.431495167708924e-06, "loss": 0.6936, "step": 6232 }, { "epoch": 8.681058495821727, "grad_norm": 2.3302414417266846, "learning_rate": 9.42154633314383e-06, "loss": 0.6671, "step": 6233 }, { "epoch": 8.682451253481894, "grad_norm": 2.6579461097717285, "learning_rate": 9.411597498578736e-06, "loss": 0.5523, "step": 6234 }, { "epoch": 8.683844011142062, "grad_norm": 2.4258522987365723, "learning_rate": 9.401648664013643e-06, "loss": 0.6678, "step": 6235 }, { "epoch": 8.685236768802229, "grad_norm": 2.312063694000244, "learning_rate": 9.39169982944855e-06, "loss": 0.6703, "step": 6236 }, { "epoch": 8.686629526462395, "grad_norm": 2.2572250366210938, "learning_rate": 9.381750994883455e-06, "loss": 0.6261, "step": 6237 }, { "epoch": 8.688022284122562, "grad_norm": 2.1370370388031006, "learning_rate": 9.37180216031836e-06, "loss": 0.6779, "step": 6238 }, { "epoch": 8.68941504178273, "grad_norm": 2.361268997192383, "learning_rate": 9.361853325753268e-06, "loss": 0.6468, "step": 6239 }, { "epoch": 8.690807799442897, "grad_norm": 2.149677038192749, "learning_rate": 9.351904491188176e-06, "loss": 0.601, "step": 6240 }, { "epoch": 8.692200557103064, "grad_norm": 1.8978216648101807, "learning_rate": 9.341955656623081e-06, "loss": 0.4258, "step": 6241 }, { "epoch": 8.69359331476323, "grad_norm": 2.113858461380005, "learning_rate": 9.332006822057987e-06, "loss": 0.5342, "step": 6242 }, { "epoch": 8.694986072423399, "grad_norm": 2.1423451900482178, "learning_rate": 9.322057987492893e-06, "loss": 0.5411, "step": 6243 }, { "epoch": 8.696378830083566, "grad_norm": 2.2512307167053223, "learning_rate": 9.312109152927799e-06, "loss": 0.6263, "step": 6244 }, { "epoch": 8.697771587743732, "grad_norm": 2.957404375076294, "learning_rate": 9.302160318362704e-06, "loss": 0.7694, "step": 6245 }, { "epoch": 8.699164345403899, "grad_norm": 2.7469513416290283, "learning_rate": 9.292211483797612e-06, "loss": 0.6972, "step": 6246 }, { "epoch": 8.700557103064067, "grad_norm": 2.0505714416503906, "learning_rate": 9.282262649232518e-06, "loss": 0.5273, "step": 6247 }, { "epoch": 8.701949860724234, "grad_norm": 2.007658004760742, "learning_rate": 9.272313814667423e-06, "loss": 0.5611, "step": 6248 }, { "epoch": 8.7033426183844, "grad_norm": 1.9458181858062744, "learning_rate": 9.26236498010233e-06, "loss": 0.4343, "step": 6249 }, { "epoch": 8.704735376044567, "grad_norm": 2.614433765411377, "learning_rate": 9.252416145537235e-06, "loss": 0.879, "step": 6250 }, { "epoch": 8.706128133704736, "grad_norm": 2.5961923599243164, "learning_rate": 9.242467310972144e-06, "loss": 0.7532, "step": 6251 }, { "epoch": 8.707520891364902, "grad_norm": 1.9644964933395386, "learning_rate": 9.23251847640705e-06, "loss": 0.4588, "step": 6252 }, { "epoch": 8.70891364902507, "grad_norm": 2.656649351119995, "learning_rate": 9.222569641841956e-06, "loss": 0.9547, "step": 6253 }, { "epoch": 8.710306406685238, "grad_norm": 2.868112802505493, "learning_rate": 9.212620807276861e-06, "loss": 0.7435, "step": 6254 }, { "epoch": 8.711699164345404, "grad_norm": 2.573540687561035, "learning_rate": 9.202671972711767e-06, "loss": 0.7418, "step": 6255 }, { "epoch": 8.713091922005571, "grad_norm": 2.7416908740997314, "learning_rate": 9.192723138146673e-06, "loss": 0.6664, "step": 6256 }, { "epoch": 8.714484679665738, "grad_norm": 2.4586341381073, "learning_rate": 9.18277430358158e-06, "loss": 0.701, "step": 6257 }, { "epoch": 8.715877437325906, "grad_norm": 2.4834976196289062, "learning_rate": 9.172825469016486e-06, "loss": 0.7483, "step": 6258 }, { "epoch": 8.717270194986073, "grad_norm": 2.4113335609436035, "learning_rate": 9.162876634451392e-06, "loss": 0.5699, "step": 6259 }, { "epoch": 8.71866295264624, "grad_norm": 2.408886194229126, "learning_rate": 9.152927799886298e-06, "loss": 0.5494, "step": 6260 }, { "epoch": 8.720055710306406, "grad_norm": 2.3952715396881104, "learning_rate": 9.142978965321204e-06, "loss": 0.7711, "step": 6261 }, { "epoch": 8.721448467966574, "grad_norm": 2.3807334899902344, "learning_rate": 9.133030130756111e-06, "loss": 0.4661, "step": 6262 }, { "epoch": 8.722841225626741, "grad_norm": 3.1220288276672363, "learning_rate": 9.123081296191017e-06, "loss": 0.7197, "step": 6263 }, { "epoch": 8.724233983286908, "grad_norm": 2.0899627208709717, "learning_rate": 9.113132461625923e-06, "loss": 0.545, "step": 6264 }, { "epoch": 8.725626740947074, "grad_norm": 2.251028060913086, "learning_rate": 9.103183627060828e-06, "loss": 0.6706, "step": 6265 }, { "epoch": 8.727019498607243, "grad_norm": 2.0511791706085205, "learning_rate": 9.093234792495734e-06, "loss": 0.6125, "step": 6266 }, { "epoch": 8.72841225626741, "grad_norm": 1.962691307067871, "learning_rate": 9.083285957930642e-06, "loss": 0.455, "step": 6267 }, { "epoch": 8.729805013927576, "grad_norm": 2.496546983718872, "learning_rate": 9.073337123365549e-06, "loss": 0.8317, "step": 6268 }, { "epoch": 8.731197771587743, "grad_norm": 2.121131181716919, "learning_rate": 9.063388288800455e-06, "loss": 0.5414, "step": 6269 }, { "epoch": 8.732590529247911, "grad_norm": 2.086209774017334, "learning_rate": 9.05343945423536e-06, "loss": 0.532, "step": 6270 }, { "epoch": 8.733983286908078, "grad_norm": 1.8627957105636597, "learning_rate": 9.043490619670266e-06, "loss": 0.4795, "step": 6271 }, { "epoch": 8.735376044568245, "grad_norm": 3.083702564239502, "learning_rate": 9.033541785105172e-06, "loss": 0.6021, "step": 6272 }, { "epoch": 8.736768802228411, "grad_norm": 2.2250123023986816, "learning_rate": 9.02359295054008e-06, "loss": 0.6004, "step": 6273 }, { "epoch": 8.73816155988858, "grad_norm": 2.6575896739959717, "learning_rate": 9.013644115974985e-06, "loss": 0.706, "step": 6274 }, { "epoch": 8.739554317548746, "grad_norm": 2.206486463546753, "learning_rate": 9.003695281409891e-06, "loss": 0.5474, "step": 6275 }, { "epoch": 8.740947075208913, "grad_norm": 2.0687928199768066, "learning_rate": 8.993746446844797e-06, "loss": 0.4435, "step": 6276 }, { "epoch": 8.742339832869082, "grad_norm": 2.3520524501800537, "learning_rate": 8.983797612279703e-06, "loss": 0.5865, "step": 6277 }, { "epoch": 8.743732590529248, "grad_norm": 2.687025547027588, "learning_rate": 8.973848777714608e-06, "loss": 0.8384, "step": 6278 }, { "epoch": 8.745125348189415, "grad_norm": 2.443650484085083, "learning_rate": 8.963899943149518e-06, "loss": 0.7113, "step": 6279 }, { "epoch": 8.746518105849582, "grad_norm": 4.449234485626221, "learning_rate": 8.953951108584423e-06, "loss": 0.7642, "step": 6280 }, { "epoch": 8.74791086350975, "grad_norm": 2.805539131164551, "learning_rate": 8.944002274019329e-06, "loss": 0.5477, "step": 6281 }, { "epoch": 8.749303621169917, "grad_norm": 2.4428164958953857, "learning_rate": 8.934053439454235e-06, "loss": 0.5837, "step": 6282 }, { "epoch": 8.750696378830083, "grad_norm": 2.272738456726074, "learning_rate": 8.92410460488914e-06, "loss": 0.5737, "step": 6283 }, { "epoch": 8.75208913649025, "grad_norm": 2.465785503387451, "learning_rate": 8.914155770324048e-06, "loss": 0.6352, "step": 6284 }, { "epoch": 8.753481894150418, "grad_norm": 2.5750913619995117, "learning_rate": 8.904206935758954e-06, "loss": 0.7051, "step": 6285 }, { "epoch": 8.754874651810585, "grad_norm": 2.47235107421875, "learning_rate": 8.89425810119386e-06, "loss": 0.6843, "step": 6286 }, { "epoch": 8.756267409470752, "grad_norm": 2.425645112991333, "learning_rate": 8.884309266628765e-06, "loss": 0.563, "step": 6287 }, { "epoch": 8.757660167130918, "grad_norm": 2.2004497051239014, "learning_rate": 8.874360432063671e-06, "loss": 0.6022, "step": 6288 }, { "epoch": 8.759052924791087, "grad_norm": 2.0673093795776367, "learning_rate": 8.864411597498577e-06, "loss": 0.5229, "step": 6289 }, { "epoch": 8.760445682451254, "grad_norm": 2.6072375774383545, "learning_rate": 8.854462762933484e-06, "loss": 0.8253, "step": 6290 }, { "epoch": 8.76183844011142, "grad_norm": 2.562784433364868, "learning_rate": 8.84451392836839e-06, "loss": 0.6664, "step": 6291 }, { "epoch": 8.763231197771589, "grad_norm": 2.4072632789611816, "learning_rate": 8.834565093803296e-06, "loss": 0.6328, "step": 6292 }, { "epoch": 8.764623955431755, "grad_norm": 2.034414529800415, "learning_rate": 8.824616259238202e-06, "loss": 0.5251, "step": 6293 }, { "epoch": 8.766016713091922, "grad_norm": 2.5559253692626953, "learning_rate": 8.814667424673107e-06, "loss": 0.7352, "step": 6294 }, { "epoch": 8.767409470752089, "grad_norm": 9.379827499389648, "learning_rate": 8.804718590108017e-06, "loss": 0.6291, "step": 6295 }, { "epoch": 8.768802228412257, "grad_norm": 2.123366117477417, "learning_rate": 8.794769755542922e-06, "loss": 0.5206, "step": 6296 }, { "epoch": 8.770194986072424, "grad_norm": 2.862316608428955, "learning_rate": 8.784820920977828e-06, "loss": 0.6797, "step": 6297 }, { "epoch": 8.77158774373259, "grad_norm": 2.3365907669067383, "learning_rate": 8.774872086412734e-06, "loss": 0.5556, "step": 6298 }, { "epoch": 8.772980501392757, "grad_norm": 2.0204622745513916, "learning_rate": 8.76492325184764e-06, "loss": 0.5291, "step": 6299 }, { "epoch": 8.774373259052926, "grad_norm": 2.288923978805542, "learning_rate": 8.754974417282546e-06, "loss": 0.6139, "step": 6300 }, { "epoch": 8.775766016713092, "grad_norm": 2.1300530433654785, "learning_rate": 8.745025582717453e-06, "loss": 0.6121, "step": 6301 }, { "epoch": 8.777158774373259, "grad_norm": 2.426448106765747, "learning_rate": 8.735076748152359e-06, "loss": 0.8317, "step": 6302 }, { "epoch": 8.778551532033426, "grad_norm": 1.941066026687622, "learning_rate": 8.725127913587265e-06, "loss": 0.5266, "step": 6303 }, { "epoch": 8.779944289693594, "grad_norm": 2.1124699115753174, "learning_rate": 8.71517907902217e-06, "loss": 0.5945, "step": 6304 }, { "epoch": 8.78133704735376, "grad_norm": 2.294548749923706, "learning_rate": 8.705230244457078e-06, "loss": 0.5947, "step": 6305 }, { "epoch": 8.782729805013927, "grad_norm": 2.6320042610168457, "learning_rate": 8.695281409891984e-06, "loss": 0.6269, "step": 6306 }, { "epoch": 8.784122562674094, "grad_norm": 2.093122720718384, "learning_rate": 8.68533257532689e-06, "loss": 0.3532, "step": 6307 }, { "epoch": 8.785515320334262, "grad_norm": 2.3614604473114014, "learning_rate": 8.675383740761797e-06, "loss": 0.6072, "step": 6308 }, { "epoch": 8.786908077994429, "grad_norm": 2.1654114723205566, "learning_rate": 8.665434906196703e-06, "loss": 0.4391, "step": 6309 }, { "epoch": 8.788300835654596, "grad_norm": 2.2236990928649902, "learning_rate": 8.655486071631608e-06, "loss": 0.5794, "step": 6310 }, { "epoch": 8.789693593314762, "grad_norm": 2.972788095474243, "learning_rate": 8.645537237066514e-06, "loss": 0.6736, "step": 6311 }, { "epoch": 8.79108635097493, "grad_norm": 1.9443541765213013, "learning_rate": 8.63558840250142e-06, "loss": 0.5474, "step": 6312 }, { "epoch": 8.792479108635098, "grad_norm": 3.0000555515289307, "learning_rate": 8.625639567936327e-06, "loss": 0.8171, "step": 6313 }, { "epoch": 8.793871866295264, "grad_norm": 2.4785919189453125, "learning_rate": 8.615690733371233e-06, "loss": 0.6326, "step": 6314 }, { "epoch": 8.795264623955433, "grad_norm": 2.1436679363250732, "learning_rate": 8.605741898806139e-06, "loss": 0.6217, "step": 6315 }, { "epoch": 8.7966573816156, "grad_norm": 2.8069963455200195, "learning_rate": 8.595793064241046e-06, "loss": 0.7795, "step": 6316 }, { "epoch": 8.798050139275766, "grad_norm": 2.0932371616363525, "learning_rate": 8.585844229675952e-06, "loss": 0.6109, "step": 6317 }, { "epoch": 8.799442896935933, "grad_norm": 1.9473953247070312, "learning_rate": 8.575895395110858e-06, "loss": 0.5751, "step": 6318 }, { "epoch": 8.800835654596101, "grad_norm": 2.075078248977661, "learning_rate": 8.565946560545764e-06, "loss": 0.6307, "step": 6319 }, { "epoch": 8.802228412256268, "grad_norm": 2.0488667488098145, "learning_rate": 8.55599772598067e-06, "loss": 0.4987, "step": 6320 }, { "epoch": 8.803621169916434, "grad_norm": 2.2753357887268066, "learning_rate": 8.546048891415575e-06, "loss": 0.6262, "step": 6321 }, { "epoch": 8.805013927576601, "grad_norm": 5.119298934936523, "learning_rate": 8.536100056850483e-06, "loss": 0.8127, "step": 6322 }, { "epoch": 8.80640668523677, "grad_norm": 2.296943187713623, "learning_rate": 8.526151222285388e-06, "loss": 0.6588, "step": 6323 }, { "epoch": 8.807799442896936, "grad_norm": 2.4178757667541504, "learning_rate": 8.516202387720296e-06, "loss": 0.4879, "step": 6324 }, { "epoch": 8.809192200557103, "grad_norm": 2.0936036109924316, "learning_rate": 8.506253553155202e-06, "loss": 0.55, "step": 6325 }, { "epoch": 8.81058495821727, "grad_norm": 1.953722596168518, "learning_rate": 8.496304718590107e-06, "loss": 0.4707, "step": 6326 }, { "epoch": 8.811977715877438, "grad_norm": 2.4794445037841797, "learning_rate": 8.486355884025013e-06, "loss": 0.7046, "step": 6327 }, { "epoch": 8.813370473537605, "grad_norm": 2.4205312728881836, "learning_rate": 8.476407049459919e-06, "loss": 0.7026, "step": 6328 }, { "epoch": 8.814763231197771, "grad_norm": 2.315272569656372, "learning_rate": 8.466458214894826e-06, "loss": 0.6319, "step": 6329 }, { "epoch": 8.816155988857938, "grad_norm": 2.3596596717834473, "learning_rate": 8.456509380329732e-06, "loss": 0.6448, "step": 6330 }, { "epoch": 8.817548746518106, "grad_norm": 2.025387763977051, "learning_rate": 8.446560545764638e-06, "loss": 0.5503, "step": 6331 }, { "epoch": 8.818941504178273, "grad_norm": 1.8706268072128296, "learning_rate": 8.436611711199544e-06, "loss": 0.5552, "step": 6332 }, { "epoch": 8.82033426183844, "grad_norm": 2.4644529819488525, "learning_rate": 8.426662876634451e-06, "loss": 0.6394, "step": 6333 }, { "epoch": 8.821727019498606, "grad_norm": 2.3819756507873535, "learning_rate": 8.416714042069357e-06, "loss": 0.7558, "step": 6334 }, { "epoch": 8.823119777158775, "grad_norm": 2.2065489292144775, "learning_rate": 8.406765207504264e-06, "loss": 0.4688, "step": 6335 }, { "epoch": 8.824512534818941, "grad_norm": 2.437697172164917, "learning_rate": 8.39681637293917e-06, "loss": 0.6055, "step": 6336 }, { "epoch": 8.825905292479108, "grad_norm": 1.6681405305862427, "learning_rate": 8.386867538374076e-06, "loss": 0.4151, "step": 6337 }, { "epoch": 8.827298050139277, "grad_norm": 2.163010597229004, "learning_rate": 8.376918703808982e-06, "loss": 0.5549, "step": 6338 }, { "epoch": 8.828690807799443, "grad_norm": 3.0924649238586426, "learning_rate": 8.366969869243887e-06, "loss": 0.5241, "step": 6339 }, { "epoch": 8.83008356545961, "grad_norm": 2.328601598739624, "learning_rate": 8.357021034678793e-06, "loss": 0.5889, "step": 6340 }, { "epoch": 8.831476323119777, "grad_norm": 2.064568281173706, "learning_rate": 8.3470722001137e-06, "loss": 0.5624, "step": 6341 }, { "epoch": 8.832869080779945, "grad_norm": 2.0899322032928467, "learning_rate": 8.337123365548606e-06, "loss": 0.5557, "step": 6342 }, { "epoch": 8.834261838440112, "grad_norm": 2.094360589981079, "learning_rate": 8.327174530983512e-06, "loss": 0.5485, "step": 6343 }, { "epoch": 8.835654596100278, "grad_norm": 2.1241538524627686, "learning_rate": 8.31722569641842e-06, "loss": 0.6834, "step": 6344 }, { "epoch": 8.837047353760445, "grad_norm": 2.5731396675109863, "learning_rate": 8.307276861853325e-06, "loss": 0.7289, "step": 6345 }, { "epoch": 8.838440111420613, "grad_norm": 2.2893941402435303, "learning_rate": 8.297328027288231e-06, "loss": 0.6117, "step": 6346 }, { "epoch": 8.83983286908078, "grad_norm": 2.296159505844116, "learning_rate": 8.287379192723137e-06, "loss": 0.5925, "step": 6347 }, { "epoch": 8.841225626740947, "grad_norm": 2.214094877243042, "learning_rate": 8.277430358158043e-06, "loss": 0.6989, "step": 6348 }, { "epoch": 8.842618384401113, "grad_norm": 2.2275171279907227, "learning_rate": 8.26748152359295e-06, "loss": 0.5738, "step": 6349 }, { "epoch": 8.844011142061282, "grad_norm": 2.639855146408081, "learning_rate": 8.257532689027856e-06, "loss": 0.6824, "step": 6350 }, { "epoch": 8.845403899721449, "grad_norm": 2.3448219299316406, "learning_rate": 8.247583854462762e-06, "loss": 0.6128, "step": 6351 }, { "epoch": 8.846796657381615, "grad_norm": 2.3506157398223877, "learning_rate": 8.23763501989767e-06, "loss": 0.7248, "step": 6352 }, { "epoch": 8.848189415041782, "grad_norm": 2.2729196548461914, "learning_rate": 8.227686185332575e-06, "loss": 0.6088, "step": 6353 }, { "epoch": 8.84958217270195, "grad_norm": 2.17976975440979, "learning_rate": 8.21773735076748e-06, "loss": 0.5869, "step": 6354 }, { "epoch": 8.850974930362117, "grad_norm": 2.1733644008636475, "learning_rate": 8.207788516202387e-06, "loss": 0.6588, "step": 6355 }, { "epoch": 8.852367688022284, "grad_norm": 2.554656982421875, "learning_rate": 8.197839681637292e-06, "loss": 0.6929, "step": 6356 }, { "epoch": 8.85376044568245, "grad_norm": 1.9149433374404907, "learning_rate": 8.1878908470722e-06, "loss": 0.5418, "step": 6357 }, { "epoch": 8.855153203342619, "grad_norm": 2.480768918991089, "learning_rate": 8.177942012507106e-06, "loss": 0.6574, "step": 6358 }, { "epoch": 8.856545961002785, "grad_norm": 2.1273303031921387, "learning_rate": 8.167993177942011e-06, "loss": 0.5279, "step": 6359 }, { "epoch": 8.857938718662952, "grad_norm": 2.612752914428711, "learning_rate": 8.158044343376919e-06, "loss": 0.8142, "step": 6360 }, { "epoch": 8.85933147632312, "grad_norm": 2.2889060974121094, "learning_rate": 8.148095508811825e-06, "loss": 0.6762, "step": 6361 }, { "epoch": 8.860724233983287, "grad_norm": 2.2247323989868164, "learning_rate": 8.13814667424673e-06, "loss": 0.5483, "step": 6362 }, { "epoch": 8.862116991643454, "grad_norm": 1.9490143060684204, "learning_rate": 8.128197839681638e-06, "loss": 0.5258, "step": 6363 }, { "epoch": 8.86350974930362, "grad_norm": 1.9298173189163208, "learning_rate": 8.118249005116544e-06, "loss": 0.4203, "step": 6364 }, { "epoch": 8.864902506963789, "grad_norm": 2.6802420616149902, "learning_rate": 8.10830017055145e-06, "loss": 0.7453, "step": 6365 }, { "epoch": 8.866295264623956, "grad_norm": 2.622260093688965, "learning_rate": 8.098351335986355e-06, "loss": 0.7084, "step": 6366 }, { "epoch": 8.867688022284122, "grad_norm": 2.229451894760132, "learning_rate": 8.08840250142126e-06, "loss": 0.6352, "step": 6367 }, { "epoch": 8.869080779944289, "grad_norm": 2.427901268005371, "learning_rate": 8.078453666856167e-06, "loss": 0.609, "step": 6368 }, { "epoch": 8.870473537604457, "grad_norm": 2.5861806869506836, "learning_rate": 8.068504832291074e-06, "loss": 0.8659, "step": 6369 }, { "epoch": 8.871866295264624, "grad_norm": 2.042874336242676, "learning_rate": 8.05855599772598e-06, "loss": 0.5616, "step": 6370 }, { "epoch": 8.87325905292479, "grad_norm": 2.3502142429351807, "learning_rate": 8.048607163160887e-06, "loss": 0.6585, "step": 6371 }, { "epoch": 8.874651810584957, "grad_norm": 2.520488977432251, "learning_rate": 8.038658328595793e-06, "loss": 0.7801, "step": 6372 }, { "epoch": 8.876044568245126, "grad_norm": 2.4678702354431152, "learning_rate": 8.028709494030699e-06, "loss": 0.6454, "step": 6373 }, { "epoch": 8.877437325905293, "grad_norm": 2.1264488697052, "learning_rate": 8.018760659465605e-06, "loss": 0.6294, "step": 6374 }, { "epoch": 8.87883008356546, "grad_norm": 2.327361583709717, "learning_rate": 8.00881182490051e-06, "loss": 0.7171, "step": 6375 }, { "epoch": 8.880222841225628, "grad_norm": 2.119594097137451, "learning_rate": 7.998862990335416e-06, "loss": 0.5455, "step": 6376 }, { "epoch": 8.881615598885794, "grad_norm": 2.0090112686157227, "learning_rate": 7.988914155770324e-06, "loss": 0.5778, "step": 6377 }, { "epoch": 8.883008356545961, "grad_norm": 2.290604591369629, "learning_rate": 7.97896532120523e-06, "loss": 0.7082, "step": 6378 }, { "epoch": 8.884401114206128, "grad_norm": 2.384650468826294, "learning_rate": 7.969016486640135e-06, "loss": 0.6976, "step": 6379 }, { "epoch": 8.885793871866294, "grad_norm": 2.0418272018432617, "learning_rate": 7.959067652075043e-06, "loss": 0.4512, "step": 6380 }, { "epoch": 8.887186629526463, "grad_norm": 4.094249725341797, "learning_rate": 7.949118817509948e-06, "loss": 0.519, "step": 6381 }, { "epoch": 8.88857938718663, "grad_norm": 2.6078884601593018, "learning_rate": 7.939169982944854e-06, "loss": 0.8992, "step": 6382 }, { "epoch": 8.889972144846796, "grad_norm": 2.191352367401123, "learning_rate": 7.92922114837976e-06, "loss": 0.6294, "step": 6383 }, { "epoch": 8.891364902506965, "grad_norm": 2.573664903640747, "learning_rate": 7.919272313814666e-06, "loss": 0.6933, "step": 6384 }, { "epoch": 8.892757660167131, "grad_norm": 1.7536120414733887, "learning_rate": 7.909323479249573e-06, "loss": 0.4474, "step": 6385 }, { "epoch": 8.894150417827298, "grad_norm": 1.9037634134292603, "learning_rate": 7.899374644684479e-06, "loss": 0.427, "step": 6386 }, { "epoch": 8.895543175487465, "grad_norm": 2.3941051959991455, "learning_rate": 7.889425810119385e-06, "loss": 0.6907, "step": 6387 }, { "epoch": 8.896935933147633, "grad_norm": 2.87742018699646, "learning_rate": 7.879476975554292e-06, "loss": 0.7522, "step": 6388 }, { "epoch": 8.8983286908078, "grad_norm": 2.0602097511291504, "learning_rate": 7.869528140989198e-06, "loss": 0.5668, "step": 6389 }, { "epoch": 8.899721448467966, "grad_norm": 2.61000657081604, "learning_rate": 7.859579306424104e-06, "loss": 0.6381, "step": 6390 }, { "epoch": 8.901114206128133, "grad_norm": 2.4334514141082764, "learning_rate": 7.84963047185901e-06, "loss": 0.5955, "step": 6391 }, { "epoch": 8.902506963788301, "grad_norm": 2.498460292816162, "learning_rate": 7.839681637293917e-06, "loss": 0.7454, "step": 6392 }, { "epoch": 8.903899721448468, "grad_norm": 2.1475067138671875, "learning_rate": 7.829732802728823e-06, "loss": 0.5761, "step": 6393 }, { "epoch": 8.905292479108635, "grad_norm": 2.2519261837005615, "learning_rate": 7.819783968163728e-06, "loss": 0.6272, "step": 6394 }, { "epoch": 8.906685236768801, "grad_norm": 2.2598562240600586, "learning_rate": 7.809835133598634e-06, "loss": 0.7187, "step": 6395 }, { "epoch": 8.90807799442897, "grad_norm": 2.552800416946411, "learning_rate": 7.799886299033542e-06, "loss": 0.5676, "step": 6396 }, { "epoch": 8.909470752089137, "grad_norm": 2.866748809814453, "learning_rate": 7.789937464468447e-06, "loss": 0.9511, "step": 6397 }, { "epoch": 8.910863509749303, "grad_norm": 2.2814176082611084, "learning_rate": 7.779988629903353e-06, "loss": 0.688, "step": 6398 }, { "epoch": 8.912256267409472, "grad_norm": 2.148735523223877, "learning_rate": 7.77003979533826e-06, "loss": 0.4889, "step": 6399 }, { "epoch": 8.913649025069638, "grad_norm": 2.1620900630950928, "learning_rate": 7.760090960773166e-06, "loss": 0.5491, "step": 6400 }, { "epoch": 8.915041782729805, "grad_norm": 2.202448844909668, "learning_rate": 7.750142126208072e-06, "loss": 0.6079, "step": 6401 }, { "epoch": 8.916434540389972, "grad_norm": 2.5117387771606445, "learning_rate": 7.740193291642978e-06, "loss": 0.7127, "step": 6402 }, { "epoch": 8.91782729805014, "grad_norm": 2.3837361335754395, "learning_rate": 7.730244457077884e-06, "loss": 0.6081, "step": 6403 }, { "epoch": 8.919220055710307, "grad_norm": 2.114670991897583, "learning_rate": 7.720295622512791e-06, "loss": 0.4769, "step": 6404 }, { "epoch": 8.920612813370473, "grad_norm": 2.271439552307129, "learning_rate": 7.710346787947697e-06, "loss": 0.637, "step": 6405 }, { "epoch": 8.92200557103064, "grad_norm": 1.810683012008667, "learning_rate": 7.700397953382603e-06, "loss": 0.5446, "step": 6406 }, { "epoch": 8.923398328690809, "grad_norm": 2.686549425125122, "learning_rate": 7.69044911881751e-06, "loss": 0.7877, "step": 6407 }, { "epoch": 8.924791086350975, "grad_norm": 2.7595272064208984, "learning_rate": 7.680500284252416e-06, "loss": 0.7785, "step": 6408 }, { "epoch": 8.926183844011142, "grad_norm": 2.541989803314209, "learning_rate": 7.670551449687322e-06, "loss": 0.6519, "step": 6409 }, { "epoch": 8.927576601671309, "grad_norm": 2.749061346054077, "learning_rate": 7.660602615122228e-06, "loss": 0.8212, "step": 6410 }, { "epoch": 8.928969359331477, "grad_norm": 2.9554824829101562, "learning_rate": 7.650653780557133e-06, "loss": 0.645, "step": 6411 }, { "epoch": 8.930362116991644, "grad_norm": 2.175075054168701, "learning_rate": 7.640704945992039e-06, "loss": 0.5324, "step": 6412 }, { "epoch": 8.93175487465181, "grad_norm": 1.6839343309402466, "learning_rate": 7.630756111426947e-06, "loss": 0.4137, "step": 6413 }, { "epoch": 8.933147632311977, "grad_norm": 2.240232467651367, "learning_rate": 7.620807276861852e-06, "loss": 0.5484, "step": 6414 }, { "epoch": 8.934540389972145, "grad_norm": 2.220383882522583, "learning_rate": 7.610858442296759e-06, "loss": 0.5923, "step": 6415 }, { "epoch": 8.935933147632312, "grad_norm": 2.379972457885742, "learning_rate": 7.6009096077316655e-06, "loss": 0.7216, "step": 6416 }, { "epoch": 8.937325905292479, "grad_norm": 2.165274143218994, "learning_rate": 7.590960773166571e-06, "loss": 0.497, "step": 6417 }, { "epoch": 8.938718662952645, "grad_norm": 2.3317317962646484, "learning_rate": 7.581011938601478e-06, "loss": 0.6365, "step": 6418 }, { "epoch": 8.940111420612814, "grad_norm": 2.669895887374878, "learning_rate": 7.571063104036384e-06, "loss": 0.7133, "step": 6419 }, { "epoch": 8.94150417827298, "grad_norm": 2.572749137878418, "learning_rate": 7.5611142694712895e-06, "loss": 0.7673, "step": 6420 }, { "epoch": 8.942896935933147, "grad_norm": 2.1923329830169678, "learning_rate": 7.551165434906196e-06, "loss": 0.5866, "step": 6421 }, { "epoch": 8.944289693593316, "grad_norm": 2.577394962310791, "learning_rate": 7.541216600341102e-06, "loss": 0.6512, "step": 6422 }, { "epoch": 8.945682451253482, "grad_norm": 2.8903701305389404, "learning_rate": 7.531267765776008e-06, "loss": 0.9225, "step": 6423 }, { "epoch": 8.947075208913649, "grad_norm": 2.316950559616089, "learning_rate": 7.521318931210915e-06, "loss": 0.5731, "step": 6424 }, { "epoch": 8.948467966573816, "grad_norm": 2.4975435733795166, "learning_rate": 7.511370096645821e-06, "loss": 0.5957, "step": 6425 }, { "epoch": 8.949860724233984, "grad_norm": 2.2214558124542236, "learning_rate": 7.5014212620807275e-06, "loss": 0.5169, "step": 6426 }, { "epoch": 8.95125348189415, "grad_norm": 2.169856309890747, "learning_rate": 7.491472427515633e-06, "loss": 0.4889, "step": 6427 }, { "epoch": 8.952646239554317, "grad_norm": 1.9657492637634277, "learning_rate": 7.481523592950539e-06, "loss": 0.4941, "step": 6428 }, { "epoch": 8.954038997214484, "grad_norm": 2.2034287452697754, "learning_rate": 7.471574758385446e-06, "loss": 0.59, "step": 6429 }, { "epoch": 8.955431754874652, "grad_norm": 2.387701988220215, "learning_rate": 7.461625923820352e-06, "loss": 0.7226, "step": 6430 }, { "epoch": 8.95682451253482, "grad_norm": 2.715440273284912, "learning_rate": 7.451677089255258e-06, "loss": 0.7354, "step": 6431 }, { "epoch": 8.958217270194986, "grad_norm": 2.4074158668518066, "learning_rate": 7.441728254690165e-06, "loss": 0.8206, "step": 6432 }, { "epoch": 8.959610027855152, "grad_norm": 2.1775918006896973, "learning_rate": 7.43177942012507e-06, "loss": 0.6853, "step": 6433 }, { "epoch": 8.961002785515321, "grad_norm": 2.364708185195923, "learning_rate": 7.421830585559976e-06, "loss": 0.6133, "step": 6434 }, { "epoch": 8.962395543175488, "grad_norm": 2.233593225479126, "learning_rate": 7.411881750994883e-06, "loss": 0.5081, "step": 6435 }, { "epoch": 8.963788300835654, "grad_norm": 2.5921921730041504, "learning_rate": 7.4019329164297885e-06, "loss": 0.7114, "step": 6436 }, { "epoch": 8.965181058495821, "grad_norm": 2.1150667667388916, "learning_rate": 7.391984081864696e-06, "loss": 0.5276, "step": 6437 }, { "epoch": 8.96657381615599, "grad_norm": 2.1465353965759277, "learning_rate": 7.382035247299602e-06, "loss": 0.5026, "step": 6438 }, { "epoch": 8.967966573816156, "grad_norm": 2.005256414413452, "learning_rate": 7.3720864127345075e-06, "loss": 0.4283, "step": 6439 }, { "epoch": 8.969359331476323, "grad_norm": 2.601466655731201, "learning_rate": 7.362137578169414e-06, "loss": 0.8654, "step": 6440 }, { "epoch": 8.97075208913649, "grad_norm": 2.86332631111145, "learning_rate": 7.35218874360432e-06, "loss": 0.7487, "step": 6441 }, { "epoch": 8.972144846796658, "grad_norm": 2.120083808898926, "learning_rate": 7.342239909039226e-06, "loss": 0.46, "step": 6442 }, { "epoch": 8.973537604456824, "grad_norm": 2.6671149730682373, "learning_rate": 7.332291074474132e-06, "loss": 0.6688, "step": 6443 }, { "epoch": 8.974930362116991, "grad_norm": 2.4337751865386963, "learning_rate": 7.322342239909039e-06, "loss": 0.6878, "step": 6444 }, { "epoch": 8.97632311977716, "grad_norm": 2.029585123062134, "learning_rate": 7.312393405343945e-06, "loss": 0.5332, "step": 6445 }, { "epoch": 8.977715877437326, "grad_norm": 1.9723401069641113, "learning_rate": 7.302444570778851e-06, "loss": 0.5103, "step": 6446 }, { "epoch": 8.979108635097493, "grad_norm": 2.330951452255249, "learning_rate": 7.292495736213757e-06, "loss": 0.6988, "step": 6447 }, { "epoch": 8.98050139275766, "grad_norm": 2.130052328109741, "learning_rate": 7.282546901648663e-06, "loss": 0.5799, "step": 6448 }, { "epoch": 8.981894150417828, "grad_norm": 2.8521018028259277, "learning_rate": 7.2725980670835695e-06, "loss": 0.8852, "step": 6449 }, { "epoch": 8.983286908077995, "grad_norm": 2.0426878929138184, "learning_rate": 7.262649232518475e-06, "loss": 0.5008, "step": 6450 }, { "epoch": 8.984679665738161, "grad_norm": 2.6988277435302734, "learning_rate": 7.252700397953383e-06, "loss": 0.6231, "step": 6451 }, { "epoch": 8.986072423398328, "grad_norm": 2.195624828338623, "learning_rate": 7.2427515633882885e-06, "loss": 0.7232, "step": 6452 }, { "epoch": 8.987465181058496, "grad_norm": 2.369209051132202, "learning_rate": 7.232802728823194e-06, "loss": 0.5215, "step": 6453 }, { "epoch": 8.988857938718663, "grad_norm": 2.2820773124694824, "learning_rate": 7.222853894258101e-06, "loss": 0.5163, "step": 6454 }, { "epoch": 8.99025069637883, "grad_norm": 2.16398549079895, "learning_rate": 7.212905059693007e-06, "loss": 0.5011, "step": 6455 }, { "epoch": 8.991643454038996, "grad_norm": 2.217278003692627, "learning_rate": 7.202956225127912e-06, "loss": 0.6777, "step": 6456 }, { "epoch": 8.993036211699165, "grad_norm": 2.1077592372894287, "learning_rate": 7.193007390562819e-06, "loss": 0.4697, "step": 6457 }, { "epoch": 8.994428969359332, "grad_norm": 2.4509129524230957, "learning_rate": 7.183058555997726e-06, "loss": 0.7212, "step": 6458 }, { "epoch": 8.995821727019498, "grad_norm": 2.3753774166107178, "learning_rate": 7.173109721432631e-06, "loss": 0.5633, "step": 6459 }, { "epoch": 8.997214484679667, "grad_norm": 2.4649858474731445, "learning_rate": 7.163160886867538e-06, "loss": 0.6322, "step": 6460 }, { "epoch": 8.998607242339833, "grad_norm": 1.9081060886383057, "learning_rate": 7.153212052302444e-06, "loss": 0.4937, "step": 6461 }, { "epoch": 9.0, "grad_norm": 2.2499141693115234, "learning_rate": 7.14326321773735e-06, "loss": 0.5931, "step": 6462 }, { "epoch": 9.001392757660167, "grad_norm": 2.136467456817627, "learning_rate": 7.133314383172256e-06, "loss": 0.4405, "step": 6463 }, { "epoch": 9.002785515320335, "grad_norm": 2.313202142715454, "learning_rate": 7.123365548607162e-06, "loss": 0.6688, "step": 6464 }, { "epoch": 9.004178272980502, "grad_norm": 2.090893030166626, "learning_rate": 7.113416714042069e-06, "loss": 0.4964, "step": 6465 }, { "epoch": 9.005571030640668, "grad_norm": 2.3237464427948, "learning_rate": 7.103467879476975e-06, "loss": 0.4757, "step": 6466 }, { "epoch": 9.006963788300835, "grad_norm": 1.8766562938690186, "learning_rate": 7.093519044911881e-06, "loss": 0.5428, "step": 6467 }, { "epoch": 9.008356545961004, "grad_norm": 2.2717623710632324, "learning_rate": 7.0835702103467875e-06, "loss": 0.6654, "step": 6468 }, { "epoch": 9.00974930362117, "grad_norm": 2.110006332397461, "learning_rate": 7.073621375781693e-06, "loss": 0.5555, "step": 6469 }, { "epoch": 9.011142061281337, "grad_norm": 2.2815473079681396, "learning_rate": 7.063672541216599e-06, "loss": 0.595, "step": 6470 }, { "epoch": 9.012534818941504, "grad_norm": 2.1525909900665283, "learning_rate": 7.053723706651506e-06, "loss": 0.5173, "step": 6471 }, { "epoch": 9.013927576601672, "grad_norm": 2.469860792160034, "learning_rate": 7.043774872086412e-06, "loss": 0.6452, "step": 6472 }, { "epoch": 9.015320334261839, "grad_norm": 1.8618592023849487, "learning_rate": 7.033826037521319e-06, "loss": 0.5143, "step": 6473 }, { "epoch": 9.016713091922005, "grad_norm": 2.2681984901428223, "learning_rate": 7.023877202956225e-06, "loss": 0.6183, "step": 6474 }, { "epoch": 9.018105849582172, "grad_norm": 2.1257636547088623, "learning_rate": 7.0139283683911305e-06, "loss": 0.6137, "step": 6475 }, { "epoch": 9.01949860724234, "grad_norm": 2.035341739654541, "learning_rate": 7.003979533826037e-06, "loss": 0.5467, "step": 6476 }, { "epoch": 9.020891364902507, "grad_norm": 2.1111531257629395, "learning_rate": 6.994030699260943e-06, "loss": 0.5556, "step": 6477 }, { "epoch": 9.022284122562674, "grad_norm": 2.399853229522705, "learning_rate": 6.984081864695849e-06, "loss": 0.7357, "step": 6478 }, { "epoch": 9.02367688022284, "grad_norm": 1.8878366947174072, "learning_rate": 6.974133030130756e-06, "loss": 0.3905, "step": 6479 }, { "epoch": 9.025069637883009, "grad_norm": 1.9461958408355713, "learning_rate": 6.964184195565662e-06, "loss": 0.5955, "step": 6480 }, { "epoch": 9.026462395543176, "grad_norm": 2.0812342166900635, "learning_rate": 6.954235361000568e-06, "loss": 0.599, "step": 6481 }, { "epoch": 9.027855153203342, "grad_norm": 2.3478505611419678, "learning_rate": 6.944286526435474e-06, "loss": 0.6056, "step": 6482 }, { "epoch": 9.029247910863509, "grad_norm": 2.2185094356536865, "learning_rate": 6.93433769187038e-06, "loss": 0.656, "step": 6483 }, { "epoch": 9.030640668523677, "grad_norm": 2.0745649337768555, "learning_rate": 6.924388857305287e-06, "loss": 0.5492, "step": 6484 }, { "epoch": 9.032033426183844, "grad_norm": 2.6149706840515137, "learning_rate": 6.914440022740192e-06, "loss": 0.7426, "step": 6485 }, { "epoch": 9.03342618384401, "grad_norm": 2.519866704940796, "learning_rate": 6.904491188175099e-06, "loss": 0.6276, "step": 6486 }, { "epoch": 9.034818941504179, "grad_norm": 1.6276168823242188, "learning_rate": 6.894542353610006e-06, "loss": 0.4102, "step": 6487 }, { "epoch": 9.036211699164346, "grad_norm": 1.8854120969772339, "learning_rate": 6.884593519044911e-06, "loss": 0.4847, "step": 6488 }, { "epoch": 9.037604456824512, "grad_norm": 2.044466733932495, "learning_rate": 6.874644684479817e-06, "loss": 0.6135, "step": 6489 }, { "epoch": 9.038997214484679, "grad_norm": 2.0263922214508057, "learning_rate": 6.864695849914724e-06, "loss": 0.5746, "step": 6490 }, { "epoch": 9.040389972144848, "grad_norm": 2.099761486053467, "learning_rate": 6.8547470153496295e-06, "loss": 0.5026, "step": 6491 }, { "epoch": 9.041782729805014, "grad_norm": 1.8517907857894897, "learning_rate": 6.844798180784535e-06, "loss": 0.4566, "step": 6492 }, { "epoch": 9.04317548746518, "grad_norm": 2.01879620552063, "learning_rate": 6.834849346219443e-06, "loss": 0.6084, "step": 6493 }, { "epoch": 9.044568245125348, "grad_norm": 1.533672571182251, "learning_rate": 6.8249005116543485e-06, "loss": 0.3441, "step": 6494 }, { "epoch": 9.045961002785516, "grad_norm": 1.9208685159683228, "learning_rate": 6.814951677089255e-06, "loss": 0.43, "step": 6495 }, { "epoch": 9.047353760445683, "grad_norm": 2.1694424152374268, "learning_rate": 6.805002842524161e-06, "loss": 0.6528, "step": 6496 }, { "epoch": 9.04874651810585, "grad_norm": 1.8260539770126343, "learning_rate": 6.795054007959067e-06, "loss": 0.4115, "step": 6497 }, { "epoch": 9.050139275766016, "grad_norm": 2.137845754623413, "learning_rate": 6.785105173393973e-06, "loss": 0.6022, "step": 6498 }, { "epoch": 9.051532033426184, "grad_norm": 2.4480721950531006, "learning_rate": 6.775156338828879e-06, "loss": 0.8338, "step": 6499 }, { "epoch": 9.052924791086351, "grad_norm": 2.1186275482177734, "learning_rate": 6.765207504263786e-06, "loss": 0.5907, "step": 6500 }, { "epoch": 9.054317548746518, "grad_norm": 2.262209415435791, "learning_rate": 6.755258669698692e-06, "loss": 0.5248, "step": 6501 }, { "epoch": 9.055710306406684, "grad_norm": 2.148144483566284, "learning_rate": 6.745309835133598e-06, "loss": 0.5823, "step": 6502 }, { "epoch": 9.057103064066853, "grad_norm": 2.2517294883728027, "learning_rate": 6.735361000568504e-06, "loss": 0.455, "step": 6503 }, { "epoch": 9.05849582172702, "grad_norm": 1.8921222686767578, "learning_rate": 6.7254121660034105e-06, "loss": 0.4669, "step": 6504 }, { "epoch": 9.059888579387186, "grad_norm": 2.537829637527466, "learning_rate": 6.715463331438316e-06, "loss": 0.6485, "step": 6505 }, { "epoch": 9.061281337047355, "grad_norm": 2.7535204887390137, "learning_rate": 6.705514496873224e-06, "loss": 0.6671, "step": 6506 }, { "epoch": 9.062674094707521, "grad_norm": 2.0473270416259766, "learning_rate": 6.6955656623081295e-06, "loss": 0.5458, "step": 6507 }, { "epoch": 9.064066852367688, "grad_norm": 1.9972840547561646, "learning_rate": 6.685616827743035e-06, "loss": 0.5426, "step": 6508 }, { "epoch": 9.065459610027855, "grad_norm": 2.451608657836914, "learning_rate": 6.675667993177942e-06, "loss": 0.5775, "step": 6509 }, { "epoch": 9.066852367688023, "grad_norm": 2.119060754776001, "learning_rate": 6.665719158612848e-06, "loss": 0.5437, "step": 6510 }, { "epoch": 9.06824512534819, "grad_norm": 2.03897762298584, "learning_rate": 6.655770324047753e-06, "loss": 0.6292, "step": 6511 }, { "epoch": 9.069637883008356, "grad_norm": 1.972775936126709, "learning_rate": 6.64582148948266e-06, "loss": 0.5501, "step": 6512 }, { "epoch": 9.071030640668523, "grad_norm": 1.6815879344940186, "learning_rate": 6.635872654917566e-06, "loss": 0.4543, "step": 6513 }, { "epoch": 9.072423398328691, "grad_norm": 2.2197539806365967, "learning_rate": 6.625923820352472e-06, "loss": 0.6066, "step": 6514 }, { "epoch": 9.073816155988858, "grad_norm": 2.0606441497802734, "learning_rate": 6.615974985787379e-06, "loss": 0.4188, "step": 6515 }, { "epoch": 9.075208913649025, "grad_norm": 2.4126522541046143, "learning_rate": 6.606026151222285e-06, "loss": 0.679, "step": 6516 }, { "epoch": 9.076601671309191, "grad_norm": 2.097259283065796, "learning_rate": 6.596077316657191e-06, "loss": 0.4659, "step": 6517 }, { "epoch": 9.07799442896936, "grad_norm": 1.8794084787368774, "learning_rate": 6.586128482092097e-06, "loss": 0.4761, "step": 6518 }, { "epoch": 9.079387186629527, "grad_norm": 2.0499138832092285, "learning_rate": 6.576179647527003e-06, "loss": 0.5466, "step": 6519 }, { "epoch": 9.080779944289693, "grad_norm": 2.3997960090637207, "learning_rate": 6.56623081296191e-06, "loss": 0.7621, "step": 6520 }, { "epoch": 9.08217270194986, "grad_norm": 2.227309465408325, "learning_rate": 6.556281978396816e-06, "loss": 0.5626, "step": 6521 }, { "epoch": 9.083565459610028, "grad_norm": 2.1702988147735596, "learning_rate": 6.546333143831722e-06, "loss": 0.571, "step": 6522 }, { "epoch": 9.084958217270195, "grad_norm": 1.8819431066513062, "learning_rate": 6.5363843092666286e-06, "loss": 0.4433, "step": 6523 }, { "epoch": 9.086350974930362, "grad_norm": 2.238947629928589, "learning_rate": 6.526435474701534e-06, "loss": 0.662, "step": 6524 }, { "epoch": 9.087743732590528, "grad_norm": 2.6882331371307373, "learning_rate": 6.51648664013644e-06, "loss": 0.6829, "step": 6525 }, { "epoch": 9.089136490250697, "grad_norm": 2.0446624755859375, "learning_rate": 6.506537805571347e-06, "loss": 0.5858, "step": 6526 }, { "epoch": 9.090529247910863, "grad_norm": 1.9051544666290283, "learning_rate": 6.4965889710062525e-06, "loss": 0.4415, "step": 6527 }, { "epoch": 9.09192200557103, "grad_norm": 2.035181999206543, "learning_rate": 6.48664013644116e-06, "loss": 0.497, "step": 6528 }, { "epoch": 9.093314763231199, "grad_norm": 2.2840728759765625, "learning_rate": 6.476691301876066e-06, "loss": 0.729, "step": 6529 }, { "epoch": 9.094707520891365, "grad_norm": 2.1286890506744385, "learning_rate": 6.4667424673109715e-06, "loss": 0.5436, "step": 6530 }, { "epoch": 9.096100278551532, "grad_norm": 2.1142804622650146, "learning_rate": 6.456793632745878e-06, "loss": 0.5353, "step": 6531 }, { "epoch": 9.097493036211699, "grad_norm": 1.9269057512283325, "learning_rate": 6.446844798180784e-06, "loss": 0.4997, "step": 6532 }, { "epoch": 9.098885793871867, "grad_norm": 2.1926798820495605, "learning_rate": 6.43689596361569e-06, "loss": 0.5438, "step": 6533 }, { "epoch": 9.100278551532034, "grad_norm": 2.042038917541504, "learning_rate": 6.426947129050596e-06, "loss": 0.508, "step": 6534 }, { "epoch": 9.1016713091922, "grad_norm": 1.7788965702056885, "learning_rate": 6.416998294485503e-06, "loss": 0.4407, "step": 6535 }, { "epoch": 9.103064066852367, "grad_norm": 2.239969253540039, "learning_rate": 6.407049459920409e-06, "loss": 0.6697, "step": 6536 }, { "epoch": 9.104456824512535, "grad_norm": 2.0588786602020264, "learning_rate": 6.397100625355315e-06, "loss": 0.582, "step": 6537 }, { "epoch": 9.105849582172702, "grad_norm": 1.8924956321716309, "learning_rate": 6.387151790790221e-06, "loss": 0.4345, "step": 6538 }, { "epoch": 9.107242339832869, "grad_norm": 2.6685168743133545, "learning_rate": 6.377202956225128e-06, "loss": 0.5076, "step": 6539 }, { "epoch": 9.108635097493035, "grad_norm": 1.8214253187179565, "learning_rate": 6.367254121660033e-06, "loss": 0.5259, "step": 6540 }, { "epoch": 9.110027855153204, "grad_norm": 1.6199214458465576, "learning_rate": 6.357305287094939e-06, "loss": 0.4196, "step": 6541 }, { "epoch": 9.11142061281337, "grad_norm": 1.9098056554794312, "learning_rate": 6.347356452529847e-06, "loss": 0.5348, "step": 6542 }, { "epoch": 9.112813370473537, "grad_norm": 2.2226037979125977, "learning_rate": 6.337407617964752e-06, "loss": 0.5473, "step": 6543 }, { "epoch": 9.114206128133704, "grad_norm": 2.042248487472534, "learning_rate": 6.327458783399658e-06, "loss": 0.4278, "step": 6544 }, { "epoch": 9.115598885793872, "grad_norm": 2.474745988845825, "learning_rate": 6.317509948834565e-06, "loss": 0.6941, "step": 6545 }, { "epoch": 9.116991643454039, "grad_norm": 2.1232476234436035, "learning_rate": 6.3075611142694706e-06, "loss": 0.5255, "step": 6546 }, { "epoch": 9.118384401114206, "grad_norm": 2.09792423248291, "learning_rate": 6.297612279704376e-06, "loss": 0.509, "step": 6547 }, { "epoch": 9.119777158774374, "grad_norm": 2.5328168869018555, "learning_rate": 6.287663445139283e-06, "loss": 0.6558, "step": 6548 }, { "epoch": 9.12116991643454, "grad_norm": 2.478328227996826, "learning_rate": 6.2777146105741896e-06, "loss": 0.8837, "step": 6549 }, { "epoch": 9.122562674094707, "grad_norm": 2.2600510120391846, "learning_rate": 6.267765776009095e-06, "loss": 0.6868, "step": 6550 }, { "epoch": 9.123955431754874, "grad_norm": 2.1773951053619385, "learning_rate": 6.257816941444002e-06, "loss": 0.6119, "step": 6551 }, { "epoch": 9.125348189415043, "grad_norm": 2.062988519668579, "learning_rate": 6.247868106878908e-06, "loss": 0.5197, "step": 6552 }, { "epoch": 9.12674094707521, "grad_norm": 2.266681432723999, "learning_rate": 6.237919272313814e-06, "loss": 0.4914, "step": 6553 }, { "epoch": 9.128133704735376, "grad_norm": 2.474975824356079, "learning_rate": 6.22797043774872e-06, "loss": 0.4814, "step": 6554 }, { "epoch": 9.129526462395543, "grad_norm": 1.9066749811172485, "learning_rate": 6.218021603183626e-06, "loss": 0.5745, "step": 6555 }, { "epoch": 9.130919220055711, "grad_norm": 2.4156877994537354, "learning_rate": 6.208072768618533e-06, "loss": 0.6683, "step": 6556 }, { "epoch": 9.132311977715878, "grad_norm": 1.9416658878326416, "learning_rate": 6.198123934053439e-06, "loss": 0.4561, "step": 6557 }, { "epoch": 9.133704735376044, "grad_norm": 1.8224204778671265, "learning_rate": 6.188175099488345e-06, "loss": 0.4899, "step": 6558 }, { "epoch": 9.135097493036211, "grad_norm": 2.1181678771972656, "learning_rate": 6.1782262649232515e-06, "loss": 0.6304, "step": 6559 }, { "epoch": 9.13649025069638, "grad_norm": 2.207368850708008, "learning_rate": 6.168277430358157e-06, "loss": 0.5457, "step": 6560 }, { "epoch": 9.137883008356546, "grad_norm": 2.40140438079834, "learning_rate": 6.158328595793063e-06, "loss": 0.5149, "step": 6561 }, { "epoch": 9.139275766016713, "grad_norm": 2.3443496227264404, "learning_rate": 6.14837976122797e-06, "loss": 0.5534, "step": 6562 }, { "epoch": 9.14066852367688, "grad_norm": 2.1881232261657715, "learning_rate": 6.138430926662876e-06, "loss": 0.5862, "step": 6563 }, { "epoch": 9.142061281337048, "grad_norm": 2.1861469745635986, "learning_rate": 6.128482092097783e-06, "loss": 0.7328, "step": 6564 }, { "epoch": 9.143454038997215, "grad_norm": 2.315837860107422, "learning_rate": 6.118533257532689e-06, "loss": 0.6537, "step": 6565 }, { "epoch": 9.144846796657381, "grad_norm": 2.104191303253174, "learning_rate": 6.108584422967594e-06, "loss": 0.581, "step": 6566 }, { "epoch": 9.14623955431755, "grad_norm": 2.272362470626831, "learning_rate": 6.098635588402501e-06, "loss": 0.6664, "step": 6567 }, { "epoch": 9.147632311977716, "grad_norm": 2.103689193725586, "learning_rate": 6.088686753837407e-06, "loss": 0.5321, "step": 6568 }, { "epoch": 9.149025069637883, "grad_norm": 2.0636544227600098, "learning_rate": 6.0787379192723126e-06, "loss": 0.5396, "step": 6569 }, { "epoch": 9.15041782729805, "grad_norm": 2.2445733547210693, "learning_rate": 6.06878908470722e-06, "loss": 0.529, "step": 6570 }, { "epoch": 9.151810584958218, "grad_norm": 1.8259060382843018, "learning_rate": 6.058840250142126e-06, "loss": 0.4758, "step": 6571 }, { "epoch": 9.153203342618385, "grad_norm": 2.264030694961548, "learning_rate": 6.0488914155770316e-06, "loss": 0.603, "step": 6572 }, { "epoch": 9.154596100278551, "grad_norm": 2.1737589836120605, "learning_rate": 6.038942581011938e-06, "loss": 0.5718, "step": 6573 }, { "epoch": 9.155988857938718, "grad_norm": 2.5338075160980225, "learning_rate": 6.028993746446844e-06, "loss": 0.8101, "step": 6574 }, { "epoch": 9.157381615598887, "grad_norm": 2.3226256370544434, "learning_rate": 6.0190449118817506e-06, "loss": 0.6567, "step": 6575 }, { "epoch": 9.158774373259053, "grad_norm": 1.9931647777557373, "learning_rate": 6.009096077316656e-06, "loss": 0.5562, "step": 6576 }, { "epoch": 9.16016713091922, "grad_norm": 2.3122940063476562, "learning_rate": 5.999147242751563e-06, "loss": 0.5887, "step": 6577 }, { "epoch": 9.161559888579387, "grad_norm": 1.7064476013183594, "learning_rate": 5.9891984081864696e-06, "loss": 0.3999, "step": 6578 }, { "epoch": 9.162952646239555, "grad_norm": 1.9466005563735962, "learning_rate": 5.979249573621375e-06, "loss": 0.4791, "step": 6579 }, { "epoch": 9.164345403899722, "grad_norm": 2.7882297039031982, "learning_rate": 5.969300739056281e-06, "loss": 0.6314, "step": 6580 }, { "epoch": 9.165738161559888, "grad_norm": 2.1359176635742188, "learning_rate": 5.959351904491188e-06, "loss": 0.6299, "step": 6581 }, { "epoch": 9.167130919220055, "grad_norm": 2.3436279296875, "learning_rate": 5.9494030699260935e-06, "loss": 0.5784, "step": 6582 }, { "epoch": 9.168523676880223, "grad_norm": 2.3204479217529297, "learning_rate": 5.939454235360999e-06, "loss": 0.7025, "step": 6583 }, { "epoch": 9.16991643454039, "grad_norm": 1.9044424295425415, "learning_rate": 5.929505400795907e-06, "loss": 0.5345, "step": 6584 }, { "epoch": 9.171309192200557, "grad_norm": 2.173640012741089, "learning_rate": 5.9195565662308125e-06, "loss": 0.5031, "step": 6585 }, { "epoch": 9.172701949860723, "grad_norm": 2.1724770069122314, "learning_rate": 5.909607731665719e-06, "loss": 0.5905, "step": 6586 }, { "epoch": 9.174094707520892, "grad_norm": 5.491093158721924, "learning_rate": 5.899658897100625e-06, "loss": 0.5465, "step": 6587 }, { "epoch": 9.175487465181059, "grad_norm": 2.1875393390655518, "learning_rate": 5.889710062535531e-06, "loss": 0.5646, "step": 6588 }, { "epoch": 9.176880222841225, "grad_norm": 2.554903507232666, "learning_rate": 5.879761227970437e-06, "loss": 0.6221, "step": 6589 }, { "epoch": 9.178272980501394, "grad_norm": 2.604617118835449, "learning_rate": 5.869812393405343e-06, "loss": 0.8008, "step": 6590 }, { "epoch": 9.17966573816156, "grad_norm": 2.177938222885132, "learning_rate": 5.85986355884025e-06, "loss": 0.5306, "step": 6591 }, { "epoch": 9.181058495821727, "grad_norm": 1.9429033994674683, "learning_rate": 5.849914724275156e-06, "loss": 0.4525, "step": 6592 }, { "epoch": 9.182451253481894, "grad_norm": 2.2409863471984863, "learning_rate": 5.839965889710062e-06, "loss": 0.6216, "step": 6593 }, { "epoch": 9.183844011142062, "grad_norm": 2.0866949558258057, "learning_rate": 5.830017055144968e-06, "loss": 0.4791, "step": 6594 }, { "epoch": 9.185236768802229, "grad_norm": 1.870818018913269, "learning_rate": 5.820068220579874e-06, "loss": 0.4292, "step": 6595 }, { "epoch": 9.186629526462395, "grad_norm": 2.2606542110443115, "learning_rate": 5.81011938601478e-06, "loss": 0.599, "step": 6596 }, { "epoch": 9.188022284122562, "grad_norm": 2.318547010421753, "learning_rate": 5.800170551449688e-06, "loss": 0.4828, "step": 6597 }, { "epoch": 9.18941504178273, "grad_norm": 2.084038019180298, "learning_rate": 5.790221716884593e-06, "loss": 0.5901, "step": 6598 }, { "epoch": 9.190807799442897, "grad_norm": 2.06911563873291, "learning_rate": 5.780272882319499e-06, "loss": 0.5821, "step": 6599 }, { "epoch": 9.192200557103064, "grad_norm": 3.8607749938964844, "learning_rate": 5.770324047754406e-06, "loss": 0.612, "step": 6600 }, { "epoch": 9.19359331476323, "grad_norm": 2.122738838195801, "learning_rate": 5.7603752131893116e-06, "loss": 0.58, "step": 6601 }, { "epoch": 9.194986072423399, "grad_norm": 2.52789568901062, "learning_rate": 5.750426378624217e-06, "loss": 0.7067, "step": 6602 }, { "epoch": 9.196378830083566, "grad_norm": 2.0914368629455566, "learning_rate": 5.740477544059124e-06, "loss": 0.543, "step": 6603 }, { "epoch": 9.197771587743732, "grad_norm": 2.113074779510498, "learning_rate": 5.73052870949403e-06, "loss": 0.5929, "step": 6604 }, { "epoch": 9.199164345403899, "grad_norm": 2.141188621520996, "learning_rate": 5.720579874928936e-06, "loss": 0.5811, "step": 6605 }, { "epoch": 9.200557103064067, "grad_norm": 2.0961978435516357, "learning_rate": 5.710631040363843e-06, "loss": 0.6667, "step": 6606 }, { "epoch": 9.201949860724234, "grad_norm": 2.0160956382751465, "learning_rate": 5.700682205798749e-06, "loss": 0.5359, "step": 6607 }, { "epoch": 9.2033426183844, "grad_norm": 2.2770156860351562, "learning_rate": 5.690733371233655e-06, "loss": 0.6647, "step": 6608 }, { "epoch": 9.204735376044567, "grad_norm": 2.2007791996002197, "learning_rate": 5.680784536668561e-06, "loss": 0.6502, "step": 6609 }, { "epoch": 9.206128133704736, "grad_norm": 2.4591970443725586, "learning_rate": 5.670835702103467e-06, "loss": 0.5854, "step": 6610 }, { "epoch": 9.207520891364902, "grad_norm": 2.0855774879455566, "learning_rate": 5.660886867538374e-06, "loss": 0.5477, "step": 6611 }, { "epoch": 9.20891364902507, "grad_norm": 1.9735785722732544, "learning_rate": 5.65093803297328e-06, "loss": 0.5616, "step": 6612 }, { "epoch": 9.210306406685238, "grad_norm": 1.9095104932785034, "learning_rate": 5.640989198408186e-06, "loss": 0.4562, "step": 6613 }, { "epoch": 9.211699164345404, "grad_norm": 2.0682218074798584, "learning_rate": 5.6310403638430925e-06, "loss": 0.5367, "step": 6614 }, { "epoch": 9.213091922005571, "grad_norm": 2.425248861312866, "learning_rate": 5.621091529277998e-06, "loss": 0.6349, "step": 6615 }, { "epoch": 9.214484679665738, "grad_norm": 1.8182013034820557, "learning_rate": 5.611142694712904e-06, "loss": 0.4559, "step": 6616 }, { "epoch": 9.215877437325906, "grad_norm": 2.4161479473114014, "learning_rate": 5.601193860147811e-06, "loss": 0.6224, "step": 6617 }, { "epoch": 9.217270194986073, "grad_norm": 2.2463033199310303, "learning_rate": 5.591245025582716e-06, "loss": 0.6994, "step": 6618 }, { "epoch": 9.21866295264624, "grad_norm": 2.5108425617218018, "learning_rate": 5.581296191017624e-06, "loss": 0.5931, "step": 6619 }, { "epoch": 9.220055710306406, "grad_norm": 1.9878058433532715, "learning_rate": 5.57134735645253e-06, "loss": 0.5535, "step": 6620 }, { "epoch": 9.221448467966574, "grad_norm": 2.05832839012146, "learning_rate": 5.561398521887435e-06, "loss": 0.4756, "step": 6621 }, { "epoch": 9.222841225626741, "grad_norm": 1.9694043397903442, "learning_rate": 5.551449687322342e-06, "loss": 0.4976, "step": 6622 }, { "epoch": 9.224233983286908, "grad_norm": 2.2591166496276855, "learning_rate": 5.541500852757248e-06, "loss": 0.5794, "step": 6623 }, { "epoch": 9.225626740947074, "grad_norm": 1.9439704418182373, "learning_rate": 5.5315520181921536e-06, "loss": 0.4655, "step": 6624 }, { "epoch": 9.227019498607243, "grad_norm": 2.149198532104492, "learning_rate": 5.521603183627061e-06, "loss": 0.6658, "step": 6625 }, { "epoch": 9.22841225626741, "grad_norm": 1.7805689573287964, "learning_rate": 5.511654349061967e-06, "loss": 0.4119, "step": 6626 }, { "epoch": 9.229805013927576, "grad_norm": 1.9849166870117188, "learning_rate": 5.5017055144968726e-06, "loss": 0.5005, "step": 6627 }, { "epoch": 9.231197771587743, "grad_norm": 2.055591344833374, "learning_rate": 5.491756679931779e-06, "loss": 0.5138, "step": 6628 }, { "epoch": 9.232590529247911, "grad_norm": 1.8095927238464355, "learning_rate": 5.481807845366685e-06, "loss": 0.4968, "step": 6629 }, { "epoch": 9.233983286908078, "grad_norm": 1.74369478225708, "learning_rate": 5.4718590108015916e-06, "loss": 0.471, "step": 6630 }, { "epoch": 9.235376044568245, "grad_norm": 2.1644845008850098, "learning_rate": 5.461910176236497e-06, "loss": 0.4978, "step": 6631 }, { "epoch": 9.236768802228413, "grad_norm": 1.9678279161453247, "learning_rate": 5.451961341671403e-06, "loss": 0.4752, "step": 6632 }, { "epoch": 9.23816155988858, "grad_norm": 2.2192115783691406, "learning_rate": 5.4420125071063106e-06, "loss": 0.4683, "step": 6633 }, { "epoch": 9.239554317548746, "grad_norm": 2.171332836151123, "learning_rate": 5.432063672541216e-06, "loss": 0.6633, "step": 6634 }, { "epoch": 9.240947075208913, "grad_norm": 2.0996429920196533, "learning_rate": 5.422114837976122e-06, "loss": 0.5084, "step": 6635 }, { "epoch": 9.242339832869082, "grad_norm": 7.0336594581604, "learning_rate": 5.412166003411029e-06, "loss": 0.5198, "step": 6636 }, { "epoch": 9.243732590529248, "grad_norm": 2.24898099899292, "learning_rate": 5.4022171688459345e-06, "loss": 0.5915, "step": 6637 }, { "epoch": 9.245125348189415, "grad_norm": 1.914724588394165, "learning_rate": 5.39226833428084e-06, "loss": 0.5765, "step": 6638 }, { "epoch": 9.246518105849582, "grad_norm": 2.361132860183716, "learning_rate": 5.382319499715748e-06, "loss": 0.6781, "step": 6639 }, { "epoch": 9.24791086350975, "grad_norm": 2.349442720413208, "learning_rate": 5.3723706651506535e-06, "loss": 0.7101, "step": 6640 }, { "epoch": 9.249303621169917, "grad_norm": 2.002004384994507, "learning_rate": 5.36242183058556e-06, "loss": 0.5796, "step": 6641 }, { "epoch": 9.250696378830083, "grad_norm": 2.1309149265289307, "learning_rate": 5.352472996020466e-06, "loss": 0.5567, "step": 6642 }, { "epoch": 9.25208913649025, "grad_norm": 2.001441478729248, "learning_rate": 5.342524161455372e-06, "loss": 0.4673, "step": 6643 }, { "epoch": 9.253481894150418, "grad_norm": 2.335261821746826, "learning_rate": 5.332575326890278e-06, "loss": 0.596, "step": 6644 }, { "epoch": 9.254874651810585, "grad_norm": 2.1615543365478516, "learning_rate": 5.322626492325184e-06, "loss": 0.5647, "step": 6645 }, { "epoch": 9.256267409470752, "grad_norm": 2.391345977783203, "learning_rate": 5.31267765776009e-06, "loss": 0.7754, "step": 6646 }, { "epoch": 9.257660167130918, "grad_norm": 2.632402181625366, "learning_rate": 5.302728823194997e-06, "loss": 0.6171, "step": 6647 }, { "epoch": 9.259052924791087, "grad_norm": 2.426246404647827, "learning_rate": 5.292779988629903e-06, "loss": 0.7527, "step": 6648 }, { "epoch": 9.260445682451254, "grad_norm": 1.877597451210022, "learning_rate": 5.282831154064809e-06, "loss": 0.4197, "step": 6649 }, { "epoch": 9.26183844011142, "grad_norm": 2.0923333168029785, "learning_rate": 5.2728823194997154e-06, "loss": 0.5272, "step": 6650 }, { "epoch": 9.263231197771589, "grad_norm": 2.482633352279663, "learning_rate": 5.262933484934621e-06, "loss": 0.7377, "step": 6651 }, { "epoch": 9.264623955431755, "grad_norm": 2.743605136871338, "learning_rate": 5.252984650369527e-06, "loss": 0.6502, "step": 6652 }, { "epoch": 9.266016713091922, "grad_norm": 1.9745889902114868, "learning_rate": 5.2430358158044344e-06, "loss": 0.6234, "step": 6653 }, { "epoch": 9.267409470752089, "grad_norm": 1.8544808626174927, "learning_rate": 5.23308698123934e-06, "loss": 0.4923, "step": 6654 }, { "epoch": 9.268802228412257, "grad_norm": 2.3628005981445312, "learning_rate": 5.223138146674247e-06, "loss": 0.7013, "step": 6655 }, { "epoch": 9.270194986072424, "grad_norm": 2.123011827468872, "learning_rate": 5.213189312109153e-06, "loss": 0.6657, "step": 6656 }, { "epoch": 9.27158774373259, "grad_norm": 2.0028347969055176, "learning_rate": 5.203240477544058e-06, "loss": 0.4596, "step": 6657 }, { "epoch": 9.272980501392757, "grad_norm": 2.028984546661377, "learning_rate": 5.193291642978965e-06, "loss": 0.5634, "step": 6658 }, { "epoch": 9.274373259052926, "grad_norm": 2.5099196434020996, "learning_rate": 5.183342808413871e-06, "loss": 0.8089, "step": 6659 }, { "epoch": 9.275766016713092, "grad_norm": 2.106228828430176, "learning_rate": 5.1733939738487765e-06, "loss": 0.5925, "step": 6660 }, { "epoch": 9.277158774373259, "grad_norm": 1.9736827611923218, "learning_rate": 5.163445139283684e-06, "loss": 0.5531, "step": 6661 }, { "epoch": 9.278551532033426, "grad_norm": 2.2319600582122803, "learning_rate": 5.15349630471859e-06, "loss": 0.613, "step": 6662 }, { "epoch": 9.279944289693594, "grad_norm": 2.1375272274017334, "learning_rate": 5.1435474701534955e-06, "loss": 0.594, "step": 6663 }, { "epoch": 9.28133704735376, "grad_norm": 5.751987934112549, "learning_rate": 5.133598635588402e-06, "loss": 0.4896, "step": 6664 }, { "epoch": 9.282729805013927, "grad_norm": 2.2547049522399902, "learning_rate": 5.123649801023308e-06, "loss": 0.6529, "step": 6665 }, { "epoch": 9.284122562674094, "grad_norm": 1.929680585861206, "learning_rate": 5.1137009664582145e-06, "loss": 0.4584, "step": 6666 }, { "epoch": 9.285515320334262, "grad_norm": 2.387620210647583, "learning_rate": 5.103752131893121e-06, "loss": 0.545, "step": 6667 }, { "epoch": 9.286908077994429, "grad_norm": 1.8755158185958862, "learning_rate": 5.093803297328027e-06, "loss": 0.4759, "step": 6668 }, { "epoch": 9.288300835654596, "grad_norm": 2.2386081218719482, "learning_rate": 5.0838544627629335e-06, "loss": 0.5903, "step": 6669 }, { "epoch": 9.289693593314762, "grad_norm": 2.1858484745025635, "learning_rate": 5.073905628197839e-06, "loss": 0.4471, "step": 6670 }, { "epoch": 9.29108635097493, "grad_norm": 2.3394856452941895, "learning_rate": 5.063956793632745e-06, "loss": 0.5806, "step": 6671 }, { "epoch": 9.292479108635098, "grad_norm": 2.412019729614258, "learning_rate": 5.054007959067652e-06, "loss": 0.6955, "step": 6672 }, { "epoch": 9.293871866295264, "grad_norm": 1.8427023887634277, "learning_rate": 5.0440591245025574e-06, "loss": 0.5396, "step": 6673 }, { "epoch": 9.295264623955433, "grad_norm": 2.1877055168151855, "learning_rate": 5.034110289937463e-06, "loss": 0.5349, "step": 6674 }, { "epoch": 9.2966573816156, "grad_norm": 2.2356770038604736, "learning_rate": 5.024161455372371e-06, "loss": 0.6146, "step": 6675 }, { "epoch": 9.298050139275766, "grad_norm": 2.124812126159668, "learning_rate": 5.0142126208072764e-06, "loss": 0.5767, "step": 6676 }, { "epoch": 9.299442896935933, "grad_norm": 2.352794647216797, "learning_rate": 5.004263786242183e-06, "loss": 0.66, "step": 6677 }, { "epoch": 9.300835654596101, "grad_norm": 2.1764092445373535, "learning_rate": 4.994314951677089e-06, "loss": 0.7739, "step": 6678 }, { "epoch": 9.302228412256268, "grad_norm": 2.2855031490325928, "learning_rate": 4.984366117111995e-06, "loss": 0.6507, "step": 6679 }, { "epoch": 9.303621169916434, "grad_norm": 1.9761897325515747, "learning_rate": 4.974417282546901e-06, "loss": 0.6233, "step": 6680 }, { "epoch": 9.305013927576601, "grad_norm": 2.1757235527038574, "learning_rate": 4.964468447981808e-06, "loss": 0.5873, "step": 6681 }, { "epoch": 9.30640668523677, "grad_norm": 1.8421281576156616, "learning_rate": 4.954519613416714e-06, "loss": 0.54, "step": 6682 }, { "epoch": 9.307799442896936, "grad_norm": 1.8912826776504517, "learning_rate": 4.94457077885162e-06, "loss": 0.4241, "step": 6683 }, { "epoch": 9.309192200557103, "grad_norm": 2.3895020484924316, "learning_rate": 4.934621944286526e-06, "loss": 0.6684, "step": 6684 }, { "epoch": 9.31058495821727, "grad_norm": 2.4576313495635986, "learning_rate": 4.924673109721432e-06, "loss": 0.6911, "step": 6685 }, { "epoch": 9.311977715877438, "grad_norm": 2.165168523788452, "learning_rate": 4.914724275156338e-06, "loss": 0.6065, "step": 6686 }, { "epoch": 9.313370473537605, "grad_norm": 1.9459168910980225, "learning_rate": 4.904775440591244e-06, "loss": 0.443, "step": 6687 }, { "epoch": 9.314763231197771, "grad_norm": 2.1418256759643555, "learning_rate": 4.894826606026152e-06, "loss": 0.4688, "step": 6688 }, { "epoch": 9.316155988857938, "grad_norm": 2.76389741897583, "learning_rate": 4.884877771461057e-06, "loss": 0.9413, "step": 6689 }, { "epoch": 9.317548746518106, "grad_norm": 2.0555858612060547, "learning_rate": 4.874928936895963e-06, "loss": 0.5609, "step": 6690 }, { "epoch": 9.318941504178273, "grad_norm": 2.319624185562134, "learning_rate": 4.86498010233087e-06, "loss": 0.5781, "step": 6691 }, { "epoch": 9.32033426183844, "grad_norm": 1.983708381652832, "learning_rate": 4.8550312677657755e-06, "loss": 0.5228, "step": 6692 }, { "epoch": 9.321727019498606, "grad_norm": 2.338756799697876, "learning_rate": 4.845082433200681e-06, "loss": 0.6649, "step": 6693 }, { "epoch": 9.323119777158775, "grad_norm": 2.617812156677246, "learning_rate": 4.835133598635588e-06, "loss": 0.6386, "step": 6694 }, { "epoch": 9.324512534818941, "grad_norm": 1.9342000484466553, "learning_rate": 4.8251847640704945e-06, "loss": 0.5451, "step": 6695 }, { "epoch": 9.325905292479108, "grad_norm": 2.455490827560425, "learning_rate": 4.8152359295054e-06, "loss": 0.6154, "step": 6696 }, { "epoch": 9.327298050139277, "grad_norm": 2.2828080654144287, "learning_rate": 4.805287094940307e-06, "loss": 0.6426, "step": 6697 }, { "epoch": 9.328690807799443, "grad_norm": 2.1832237243652344, "learning_rate": 4.795338260375213e-06, "loss": 0.5574, "step": 6698 }, { "epoch": 9.33008356545961, "grad_norm": 2.169520378112793, "learning_rate": 4.785389425810119e-06, "loss": 0.666, "step": 6699 }, { "epoch": 9.331476323119777, "grad_norm": 2.042065143585205, "learning_rate": 4.775440591245025e-06, "loss": 0.4648, "step": 6700 }, { "epoch": 9.332869080779945, "grad_norm": 2.5215961933135986, "learning_rate": 4.765491756679931e-06, "loss": 0.6618, "step": 6701 }, { "epoch": 9.334261838440112, "grad_norm": 2.3360421657562256, "learning_rate": 4.755542922114838e-06, "loss": 0.628, "step": 6702 }, { "epoch": 9.335654596100278, "grad_norm": 3.0754506587982178, "learning_rate": 4.745594087549744e-06, "loss": 0.5901, "step": 6703 }, { "epoch": 9.337047353760445, "grad_norm": 2.697869062423706, "learning_rate": 4.73564525298465e-06, "loss": 0.545, "step": 6704 }, { "epoch": 9.338440111420613, "grad_norm": 1.9758132696151733, "learning_rate": 4.7256964184195564e-06, "loss": 0.5344, "step": 6705 }, { "epoch": 9.33983286908078, "grad_norm": 2.1251981258392334, "learning_rate": 4.715747583854462e-06, "loss": 0.3737, "step": 6706 }, { "epoch": 9.341225626740947, "grad_norm": 1.9647719860076904, "learning_rate": 4.705798749289368e-06, "loss": 0.4168, "step": 6707 }, { "epoch": 9.342618384401113, "grad_norm": 1.8448026180267334, "learning_rate": 4.695849914724275e-06, "loss": 0.4246, "step": 6708 }, { "epoch": 9.344011142061282, "grad_norm": 2.073429584503174, "learning_rate": 4.68590108015918e-06, "loss": 0.4977, "step": 6709 }, { "epoch": 9.345403899721449, "grad_norm": 2.2559731006622314, "learning_rate": 4.675952245594088e-06, "loss": 0.5836, "step": 6710 }, { "epoch": 9.346796657381615, "grad_norm": 1.941716194152832, "learning_rate": 4.666003411028994e-06, "loss": 0.5033, "step": 6711 }, { "epoch": 9.348189415041782, "grad_norm": 1.8942596912384033, "learning_rate": 4.656054576463899e-06, "loss": 0.4844, "step": 6712 }, { "epoch": 9.34958217270195, "grad_norm": 2.08406662940979, "learning_rate": 4.646105741898806e-06, "loss": 0.5139, "step": 6713 }, { "epoch": 9.350974930362117, "grad_norm": 2.2924787998199463, "learning_rate": 4.636156907333712e-06, "loss": 0.5734, "step": 6714 }, { "epoch": 9.352367688022284, "grad_norm": 2.0736114978790283, "learning_rate": 4.6262080727686175e-06, "loss": 0.5575, "step": 6715 }, { "epoch": 9.35376044568245, "grad_norm": 1.6771340370178223, "learning_rate": 4.616259238203525e-06, "loss": 0.3436, "step": 6716 }, { "epoch": 9.355153203342619, "grad_norm": 2.115764856338501, "learning_rate": 4.606310403638431e-06, "loss": 0.4658, "step": 6717 }, { "epoch": 9.356545961002785, "grad_norm": 2.183945894241333, "learning_rate": 4.5963615690733365e-06, "loss": 0.6343, "step": 6718 }, { "epoch": 9.357938718662952, "grad_norm": 2.2778444290161133, "learning_rate": 4.586412734508243e-06, "loss": 0.7001, "step": 6719 }, { "epoch": 9.35933147632312, "grad_norm": 2.308894395828247, "learning_rate": 4.576463899943149e-06, "loss": 0.6485, "step": 6720 }, { "epoch": 9.360724233983287, "grad_norm": 2.110804796218872, "learning_rate": 4.5665150653780555e-06, "loss": 0.5745, "step": 6721 }, { "epoch": 9.362116991643454, "grad_norm": 1.934041976928711, "learning_rate": 4.556566230812961e-06, "loss": 0.5032, "step": 6722 }, { "epoch": 9.36350974930362, "grad_norm": 1.87200129032135, "learning_rate": 4.546617396247867e-06, "loss": 0.4351, "step": 6723 }, { "epoch": 9.364902506963789, "grad_norm": 1.953367829322815, "learning_rate": 4.5366685616827745e-06, "loss": 0.5016, "step": 6724 }, { "epoch": 9.366295264623956, "grad_norm": 2.0533089637756348, "learning_rate": 4.52671972711768e-06, "loss": 0.6008, "step": 6725 }, { "epoch": 9.367688022284122, "grad_norm": 2.586660385131836, "learning_rate": 4.516770892552586e-06, "loss": 0.6865, "step": 6726 }, { "epoch": 9.369080779944289, "grad_norm": 2.1469037532806396, "learning_rate": 4.506822057987493e-06, "loss": 0.5693, "step": 6727 }, { "epoch": 9.370473537604457, "grad_norm": 2.256702423095703, "learning_rate": 4.4968732234223984e-06, "loss": 0.5153, "step": 6728 }, { "epoch": 9.371866295264624, "grad_norm": 1.728001356124878, "learning_rate": 4.486924388857304e-06, "loss": 0.4704, "step": 6729 }, { "epoch": 9.37325905292479, "grad_norm": 2.113049268722534, "learning_rate": 4.476975554292212e-06, "loss": 0.5075, "step": 6730 }, { "epoch": 9.374651810584957, "grad_norm": 2.0204129219055176, "learning_rate": 4.4670267197271174e-06, "loss": 0.4786, "step": 6731 }, { "epoch": 9.376044568245126, "grad_norm": 1.8315682411193848, "learning_rate": 4.457077885162024e-06, "loss": 0.4078, "step": 6732 }, { "epoch": 9.377437325905293, "grad_norm": 2.0842456817626953, "learning_rate": 4.44712905059693e-06, "loss": 0.6208, "step": 6733 }, { "epoch": 9.37883008356546, "grad_norm": 2.3982343673706055, "learning_rate": 4.437180216031836e-06, "loss": 0.6758, "step": 6734 }, { "epoch": 9.380222841225626, "grad_norm": 1.9927856922149658, "learning_rate": 4.427231381466742e-06, "loss": 0.4766, "step": 6735 }, { "epoch": 9.381615598885794, "grad_norm": 2.199819803237915, "learning_rate": 4.417282546901648e-06, "loss": 0.6303, "step": 6736 }, { "epoch": 9.383008356545961, "grad_norm": 1.983742594718933, "learning_rate": 4.407333712336554e-06, "loss": 0.5537, "step": 6737 }, { "epoch": 9.384401114206128, "grad_norm": 2.1473169326782227, "learning_rate": 4.397384877771461e-06, "loss": 0.6254, "step": 6738 }, { "epoch": 9.385793871866296, "grad_norm": 2.208550453186035, "learning_rate": 4.387436043206367e-06, "loss": 0.5505, "step": 6739 }, { "epoch": 9.387186629526463, "grad_norm": 2.168860673904419, "learning_rate": 4.377487208641273e-06, "loss": 0.4925, "step": 6740 }, { "epoch": 9.38857938718663, "grad_norm": 2.385719060897827, "learning_rate": 4.367538374076179e-06, "loss": 0.6254, "step": 6741 }, { "epoch": 9.389972144846796, "grad_norm": 1.8590692281723022, "learning_rate": 4.357589539511085e-06, "loss": 0.5158, "step": 6742 }, { "epoch": 9.391364902506965, "grad_norm": 2.19563364982605, "learning_rate": 4.347640704945992e-06, "loss": 0.5806, "step": 6743 }, { "epoch": 9.392757660167131, "grad_norm": 1.9516217708587646, "learning_rate": 4.337691870380898e-06, "loss": 0.5567, "step": 6744 }, { "epoch": 9.394150417827298, "grad_norm": 2.4490139484405518, "learning_rate": 4.327743035815804e-06, "loss": 0.617, "step": 6745 }, { "epoch": 9.395543175487465, "grad_norm": 1.9458948373794556, "learning_rate": 4.31779420125071e-06, "loss": 0.5444, "step": 6746 }, { "epoch": 9.396935933147633, "grad_norm": 2.157991647720337, "learning_rate": 4.3078453666856165e-06, "loss": 0.5913, "step": 6747 }, { "epoch": 9.3983286908078, "grad_norm": 2.0901591777801514, "learning_rate": 4.297896532120523e-06, "loss": 0.4922, "step": 6748 }, { "epoch": 9.399721448467966, "grad_norm": 2.1302778720855713, "learning_rate": 4.287947697555429e-06, "loss": 0.5691, "step": 6749 }, { "epoch": 9.401114206128133, "grad_norm": 2.671572685241699, "learning_rate": 4.277998862990335e-06, "loss": 0.7691, "step": 6750 }, { "epoch": 9.402506963788301, "grad_norm": 1.9120711088180542, "learning_rate": 4.268050028425241e-06, "loss": 0.5165, "step": 6751 }, { "epoch": 9.403899721448468, "grad_norm": 2.2509145736694336, "learning_rate": 4.258101193860148e-06, "loss": 0.6263, "step": 6752 }, { "epoch": 9.405292479108635, "grad_norm": 2.2784059047698975, "learning_rate": 4.248152359295054e-06, "loss": 0.5518, "step": 6753 }, { "epoch": 9.406685236768801, "grad_norm": 1.7827047109603882, "learning_rate": 4.2382035247299594e-06, "loss": 0.426, "step": 6754 }, { "epoch": 9.40807799442897, "grad_norm": 2.2618050575256348, "learning_rate": 4.228254690164866e-06, "loss": 0.6541, "step": 6755 }, { "epoch": 9.409470752089137, "grad_norm": 2.3271522521972656, "learning_rate": 4.218305855599772e-06, "loss": 0.5396, "step": 6756 }, { "epoch": 9.410863509749303, "grad_norm": 2.2148826122283936, "learning_rate": 4.2083570210346784e-06, "loss": 0.6406, "step": 6757 }, { "epoch": 9.412256267409472, "grad_norm": 2.005791425704956, "learning_rate": 4.198408186469585e-06, "loss": 0.4294, "step": 6758 }, { "epoch": 9.413649025069638, "grad_norm": 1.8397232294082642, "learning_rate": 4.188459351904491e-06, "loss": 0.4662, "step": 6759 }, { "epoch": 9.415041782729805, "grad_norm": 1.7420717477798462, "learning_rate": 4.178510517339397e-06, "loss": 0.3784, "step": 6760 }, { "epoch": 9.416434540389972, "grad_norm": 2.056633234024048, "learning_rate": 4.168561682774303e-06, "loss": 0.5443, "step": 6761 }, { "epoch": 9.41782729805014, "grad_norm": 1.816161036491394, "learning_rate": 4.15861284820921e-06, "loss": 0.4245, "step": 6762 }, { "epoch": 9.419220055710307, "grad_norm": 2.3265507221221924, "learning_rate": 4.148664013644116e-06, "loss": 0.6085, "step": 6763 }, { "epoch": 9.420612813370473, "grad_norm": 1.9604753255844116, "learning_rate": 4.138715179079021e-06, "loss": 0.5458, "step": 6764 }, { "epoch": 9.42200557103064, "grad_norm": 1.7473949193954468, "learning_rate": 4.128766344513928e-06, "loss": 0.4521, "step": 6765 }, { "epoch": 9.423398328690809, "grad_norm": 1.8409470319747925, "learning_rate": 4.118817509948835e-06, "loss": 0.4751, "step": 6766 }, { "epoch": 9.424791086350975, "grad_norm": 2.09124755859375, "learning_rate": 4.10886867538374e-06, "loss": 0.5173, "step": 6767 }, { "epoch": 9.426183844011142, "grad_norm": 1.998836636543274, "learning_rate": 4.098919840818646e-06, "loss": 0.6055, "step": 6768 }, { "epoch": 9.427576601671309, "grad_norm": 2.2272489070892334, "learning_rate": 4.088971006253553e-06, "loss": 0.5947, "step": 6769 }, { "epoch": 9.428969359331477, "grad_norm": 2.0613739490509033, "learning_rate": 4.079022171688459e-06, "loss": 0.513, "step": 6770 }, { "epoch": 9.430362116991644, "grad_norm": 2.181584119796753, "learning_rate": 4.069073337123365e-06, "loss": 0.5332, "step": 6771 }, { "epoch": 9.43175487465181, "grad_norm": 2.1950395107269287, "learning_rate": 4.059124502558272e-06, "loss": 0.6239, "step": 6772 }, { "epoch": 9.433147632311977, "grad_norm": 2.2424991130828857, "learning_rate": 4.0491756679931775e-06, "loss": 0.5076, "step": 6773 }, { "epoch": 9.434540389972145, "grad_norm": 2.6309731006622314, "learning_rate": 4.039226833428083e-06, "loss": 0.7419, "step": 6774 }, { "epoch": 9.435933147632312, "grad_norm": 2.420696258544922, "learning_rate": 4.02927799886299e-06, "loss": 0.651, "step": 6775 }, { "epoch": 9.437325905292479, "grad_norm": 2.2619361877441406, "learning_rate": 4.0193291642978965e-06, "loss": 0.6268, "step": 6776 }, { "epoch": 9.438718662952645, "grad_norm": 2.0040783882141113, "learning_rate": 4.009380329732802e-06, "loss": 0.4365, "step": 6777 }, { "epoch": 9.440111420612814, "grad_norm": 2.1652371883392334, "learning_rate": 3.999431495167708e-06, "loss": 0.6763, "step": 6778 }, { "epoch": 9.44150417827298, "grad_norm": 1.946468472480774, "learning_rate": 3.989482660602615e-06, "loss": 0.531, "step": 6779 }, { "epoch": 9.442896935933147, "grad_norm": 2.0985682010650635, "learning_rate": 3.979533826037521e-06, "loss": 0.5662, "step": 6780 }, { "epoch": 9.444289693593316, "grad_norm": 2.9044721126556396, "learning_rate": 3.969584991472427e-06, "loss": 0.7111, "step": 6781 }, { "epoch": 9.445682451253482, "grad_norm": 1.901016354560852, "learning_rate": 3.959636156907333e-06, "loss": 0.424, "step": 6782 }, { "epoch": 9.447075208913649, "grad_norm": 2.362870693206787, "learning_rate": 3.9496873223422395e-06, "loss": 0.4705, "step": 6783 }, { "epoch": 9.448467966573816, "grad_norm": 1.503623366355896, "learning_rate": 3.939738487777146e-06, "loss": 0.3077, "step": 6784 }, { "epoch": 9.449860724233984, "grad_norm": 1.8384147882461548, "learning_rate": 3.929789653212052e-06, "loss": 0.3329, "step": 6785 }, { "epoch": 9.45125348189415, "grad_norm": 1.9747810363769531, "learning_rate": 3.9198408186469585e-06, "loss": 0.5828, "step": 6786 }, { "epoch": 9.452646239554317, "grad_norm": 1.9929118156433105, "learning_rate": 3.909891984081864e-06, "loss": 0.4236, "step": 6787 }, { "epoch": 9.454038997214484, "grad_norm": 2.373485803604126, "learning_rate": 3.899943149516771e-06, "loss": 0.5345, "step": 6788 }, { "epoch": 9.455431754874652, "grad_norm": 1.934966802597046, "learning_rate": 3.889994314951677e-06, "loss": 0.5323, "step": 6789 }, { "epoch": 9.45682451253482, "grad_norm": 2.5214738845825195, "learning_rate": 3.880045480386583e-06, "loss": 0.7866, "step": 6790 }, { "epoch": 9.458217270194986, "grad_norm": 1.8218555450439453, "learning_rate": 3.870096645821489e-06, "loss": 0.4144, "step": 6791 }, { "epoch": 9.459610027855152, "grad_norm": 1.9606658220291138, "learning_rate": 3.860147811256396e-06, "loss": 0.5411, "step": 6792 }, { "epoch": 9.461002785515321, "grad_norm": 1.7190555334091187, "learning_rate": 3.850198976691301e-06, "loss": 0.4121, "step": 6793 }, { "epoch": 9.462395543175488, "grad_norm": 2.5075480937957764, "learning_rate": 3.840250142126208e-06, "loss": 0.5883, "step": 6794 }, { "epoch": 9.463788300835654, "grad_norm": 2.277167320251465, "learning_rate": 3.830301307561114e-06, "loss": 0.5809, "step": 6795 }, { "epoch": 9.465181058495821, "grad_norm": 1.7093855142593384, "learning_rate": 3.8203524729960195e-06, "loss": 0.4828, "step": 6796 }, { "epoch": 9.46657381615599, "grad_norm": 1.923637866973877, "learning_rate": 3.810403638430926e-06, "loss": 0.53, "step": 6797 }, { "epoch": 9.467966573816156, "grad_norm": 2.2378780841827393, "learning_rate": 3.8004548038658328e-06, "loss": 0.5775, "step": 6798 }, { "epoch": 9.469359331476323, "grad_norm": 2.268008232116699, "learning_rate": 3.790505969300739e-06, "loss": 0.533, "step": 6799 }, { "epoch": 9.47075208913649, "grad_norm": 2.2728209495544434, "learning_rate": 3.7805571347356447e-06, "loss": 0.6456, "step": 6800 }, { "epoch": 9.472144846796658, "grad_norm": 2.1043710708618164, "learning_rate": 3.770608300170551e-06, "loss": 0.5893, "step": 6801 }, { "epoch": 9.473537604456824, "grad_norm": 1.8332468271255493, "learning_rate": 3.7606594656054575e-06, "loss": 0.4217, "step": 6802 }, { "epoch": 9.474930362116991, "grad_norm": 1.8673499822616577, "learning_rate": 3.7507106310403637e-06, "loss": 0.5118, "step": 6803 }, { "epoch": 9.47632311977716, "grad_norm": 2.1703574657440186, "learning_rate": 3.7407617964752695e-06, "loss": 0.7188, "step": 6804 }, { "epoch": 9.477715877437326, "grad_norm": 2.5039427280426025, "learning_rate": 3.730812961910176e-06, "loss": 0.5681, "step": 6805 }, { "epoch": 9.479108635097493, "grad_norm": 2.1298906803131104, "learning_rate": 3.7208641273450823e-06, "loss": 0.4972, "step": 6806 }, { "epoch": 9.48050139275766, "grad_norm": 1.9694808721542358, "learning_rate": 3.710915292779988e-06, "loss": 0.4645, "step": 6807 }, { "epoch": 9.481894150417828, "grad_norm": 2.109868049621582, "learning_rate": 3.7009664582148943e-06, "loss": 0.5743, "step": 6808 }, { "epoch": 9.483286908077995, "grad_norm": 2.0913796424865723, "learning_rate": 3.691017623649801e-06, "loss": 0.5902, "step": 6809 }, { "epoch": 9.484679665738161, "grad_norm": 2.116060972213745, "learning_rate": 3.681068789084707e-06, "loss": 0.6449, "step": 6810 }, { "epoch": 9.486072423398328, "grad_norm": 2.1506569385528564, "learning_rate": 3.671119954519613e-06, "loss": 0.6338, "step": 6811 }, { "epoch": 9.487465181058496, "grad_norm": 2.254997730255127, "learning_rate": 3.6611711199545195e-06, "loss": 0.513, "step": 6812 }, { "epoch": 9.488857938718663, "grad_norm": 2.3468072414398193, "learning_rate": 3.6512222853894257e-06, "loss": 0.6028, "step": 6813 }, { "epoch": 9.49025069637883, "grad_norm": 1.951341152191162, "learning_rate": 3.6412734508243314e-06, "loss": 0.5671, "step": 6814 }, { "epoch": 9.491643454038996, "grad_norm": 2.2778260707855225, "learning_rate": 3.6313246162592376e-06, "loss": 0.4638, "step": 6815 }, { "epoch": 9.493036211699165, "grad_norm": 2.0231311321258545, "learning_rate": 3.6213757816941442e-06, "loss": 0.5473, "step": 6816 }, { "epoch": 9.494428969359332, "grad_norm": 2.116697072982788, "learning_rate": 3.6114269471290504e-06, "loss": 0.4566, "step": 6817 }, { "epoch": 9.495821727019498, "grad_norm": 1.9370697736740112, "learning_rate": 3.601478112563956e-06, "loss": 0.4776, "step": 6818 }, { "epoch": 9.497214484679665, "grad_norm": 1.903659462928772, "learning_rate": 3.591529277998863e-06, "loss": 0.507, "step": 6819 }, { "epoch": 9.498607242339833, "grad_norm": 2.2818992137908936, "learning_rate": 3.581580443433769e-06, "loss": 0.7046, "step": 6820 }, { "epoch": 9.5, "grad_norm": 2.027432918548584, "learning_rate": 3.571631608868675e-06, "loss": 0.4643, "step": 6821 }, { "epoch": 9.501392757660167, "grad_norm": 2.352668046951294, "learning_rate": 3.561682774303581e-06, "loss": 0.7195, "step": 6822 }, { "epoch": 9.502785515320333, "grad_norm": 2.2707459926605225, "learning_rate": 3.5517339397384876e-06, "loss": 0.6042, "step": 6823 }, { "epoch": 9.504178272980502, "grad_norm": 2.524899959564209, "learning_rate": 3.5417851051733938e-06, "loss": 0.6264, "step": 6824 }, { "epoch": 9.505571030640668, "grad_norm": 1.987490177154541, "learning_rate": 3.5318362706082995e-06, "loss": 0.5492, "step": 6825 }, { "epoch": 9.506963788300835, "grad_norm": 1.8484636545181274, "learning_rate": 3.521887436043206e-06, "loss": 0.472, "step": 6826 }, { "epoch": 9.508356545961004, "grad_norm": 2.245314836502075, "learning_rate": 3.5119386014781123e-06, "loss": 0.6465, "step": 6827 }, { "epoch": 9.50974930362117, "grad_norm": 2.340188503265381, "learning_rate": 3.5019897669130185e-06, "loss": 0.6109, "step": 6828 }, { "epoch": 9.511142061281337, "grad_norm": 2.700007200241089, "learning_rate": 3.4920409323479243e-06, "loss": 0.6385, "step": 6829 }, { "epoch": 9.512534818941504, "grad_norm": 1.9856232404708862, "learning_rate": 3.482092097782831e-06, "loss": 0.4791, "step": 6830 }, { "epoch": 9.513927576601672, "grad_norm": 1.9098279476165771, "learning_rate": 3.472143263217737e-06, "loss": 0.4022, "step": 6831 }, { "epoch": 9.515320334261839, "grad_norm": 2.438234567642212, "learning_rate": 3.4621944286526433e-06, "loss": 0.7368, "step": 6832 }, { "epoch": 9.516713091922005, "grad_norm": 1.7444934844970703, "learning_rate": 3.4522455940875495e-06, "loss": 0.5008, "step": 6833 }, { "epoch": 9.518105849582172, "grad_norm": 2.0556321144104004, "learning_rate": 3.4422967595224557e-06, "loss": 0.4713, "step": 6834 }, { "epoch": 9.51949860724234, "grad_norm": 2.0487747192382812, "learning_rate": 3.432347924957362e-06, "loss": 0.5607, "step": 6835 }, { "epoch": 9.520891364902507, "grad_norm": 2.1954009532928467, "learning_rate": 3.4223990903922677e-06, "loss": 0.5943, "step": 6836 }, { "epoch": 9.522284122562674, "grad_norm": 2.11663556098938, "learning_rate": 3.4124502558271743e-06, "loss": 0.5555, "step": 6837 }, { "epoch": 9.52367688022284, "grad_norm": 2.2933733463287354, "learning_rate": 3.4025014212620805e-06, "loss": 0.689, "step": 6838 }, { "epoch": 9.525069637883009, "grad_norm": 2.073500156402588, "learning_rate": 3.3925525866969867e-06, "loss": 0.4693, "step": 6839 }, { "epoch": 9.526462395543176, "grad_norm": 1.6908445358276367, "learning_rate": 3.382603752131893e-06, "loss": 0.4207, "step": 6840 }, { "epoch": 9.527855153203342, "grad_norm": 2.208913803100586, "learning_rate": 3.372654917566799e-06, "loss": 0.6036, "step": 6841 }, { "epoch": 9.52924791086351, "grad_norm": 2.4213709831237793, "learning_rate": 3.3627060830017052e-06, "loss": 0.6669, "step": 6842 }, { "epoch": 9.530640668523677, "grad_norm": 2.1149485111236572, "learning_rate": 3.352757248436612e-06, "loss": 0.5837, "step": 6843 }, { "epoch": 9.532033426183844, "grad_norm": 1.683316707611084, "learning_rate": 3.3428084138715176e-06, "loss": 0.3735, "step": 6844 }, { "epoch": 9.53342618384401, "grad_norm": 2.1535401344299316, "learning_rate": 3.332859579306424e-06, "loss": 0.5639, "step": 6845 }, { "epoch": 9.534818941504179, "grad_norm": 2.3267405033111572, "learning_rate": 3.32291074474133e-06, "loss": 0.6312, "step": 6846 }, { "epoch": 9.536211699164346, "grad_norm": 2.0664336681365967, "learning_rate": 3.312961910176236e-06, "loss": 0.6014, "step": 6847 }, { "epoch": 9.537604456824512, "grad_norm": 2.2134811878204346, "learning_rate": 3.3030130756111424e-06, "loss": 0.548, "step": 6848 }, { "epoch": 9.538997214484679, "grad_norm": 2.397930145263672, "learning_rate": 3.2930642410460486e-06, "loss": 0.7653, "step": 6849 }, { "epoch": 9.540389972144848, "grad_norm": 1.931451439857483, "learning_rate": 3.283115406480955e-06, "loss": 0.5529, "step": 6850 }, { "epoch": 9.541782729805014, "grad_norm": 2.0318799018859863, "learning_rate": 3.273166571915861e-06, "loss": 0.3954, "step": 6851 }, { "epoch": 9.54317548746518, "grad_norm": 2.2449874877929688, "learning_rate": 3.263217737350767e-06, "loss": 0.5723, "step": 6852 }, { "epoch": 9.544568245125348, "grad_norm": 1.9648016691207886, "learning_rate": 3.2532689027856734e-06, "loss": 0.6213, "step": 6853 }, { "epoch": 9.545961002785516, "grad_norm": 1.7888097763061523, "learning_rate": 3.24332006822058e-06, "loss": 0.4491, "step": 6854 }, { "epoch": 9.547353760445683, "grad_norm": 2.4542691707611084, "learning_rate": 3.2333712336554857e-06, "loss": 0.6643, "step": 6855 }, { "epoch": 9.54874651810585, "grad_norm": 2.0693256855010986, "learning_rate": 3.223422399090392e-06, "loss": 0.4155, "step": 6856 }, { "epoch": 9.550139275766016, "grad_norm": 1.9995914697647095, "learning_rate": 3.213473564525298e-06, "loss": 0.492, "step": 6857 }, { "epoch": 9.551532033426184, "grad_norm": 2.548415184020996, "learning_rate": 3.2035247299602043e-06, "loss": 0.6386, "step": 6858 }, { "epoch": 9.552924791086351, "grad_norm": 2.3077518939971924, "learning_rate": 3.1935758953951105e-06, "loss": 0.7407, "step": 6859 }, { "epoch": 9.554317548746518, "grad_norm": 2.6892733573913574, "learning_rate": 3.1836270608300167e-06, "loss": 0.5772, "step": 6860 }, { "epoch": 9.555710306406684, "grad_norm": 2.148426055908203, "learning_rate": 3.1736782262649233e-06, "loss": 0.5233, "step": 6861 }, { "epoch": 9.557103064066853, "grad_norm": 1.9864565134048462, "learning_rate": 3.163729391699829e-06, "loss": 0.5015, "step": 6862 }, { "epoch": 9.55849582172702, "grad_norm": 2.461149215698242, "learning_rate": 3.1537805571347353e-06, "loss": 0.8165, "step": 6863 }, { "epoch": 9.559888579387186, "grad_norm": 2.3243892192840576, "learning_rate": 3.1438317225696415e-06, "loss": 0.6147, "step": 6864 }, { "epoch": 9.561281337047355, "grad_norm": 2.165677785873413, "learning_rate": 3.1338828880045477e-06, "loss": 0.4625, "step": 6865 }, { "epoch": 9.562674094707521, "grad_norm": 1.836151123046875, "learning_rate": 3.123934053439454e-06, "loss": 0.3987, "step": 6866 }, { "epoch": 9.564066852367688, "grad_norm": 2.061715602874756, "learning_rate": 3.11398521887436e-06, "loss": 0.609, "step": 6867 }, { "epoch": 9.565459610027855, "grad_norm": 1.9791380167007446, "learning_rate": 3.1040363843092667e-06, "loss": 0.5075, "step": 6868 }, { "epoch": 9.566852367688023, "grad_norm": 2.4446194171905518, "learning_rate": 3.0940875497441724e-06, "loss": 0.7393, "step": 6869 }, { "epoch": 9.56824512534819, "grad_norm": 2.0252926349639893, "learning_rate": 3.0841387151790786e-06, "loss": 0.5545, "step": 6870 }, { "epoch": 9.569637883008356, "grad_norm": 2.1388938426971436, "learning_rate": 3.074189880613985e-06, "loss": 0.6642, "step": 6871 }, { "epoch": 9.571030640668523, "grad_norm": 1.940800428390503, "learning_rate": 3.0642410460488914e-06, "loss": 0.4897, "step": 6872 }, { "epoch": 9.572423398328691, "grad_norm": 1.8035876750946045, "learning_rate": 3.054292211483797e-06, "loss": 0.4695, "step": 6873 }, { "epoch": 9.573816155988858, "grad_norm": 2.1073710918426514, "learning_rate": 3.0443433769187034e-06, "loss": 0.5661, "step": 6874 }, { "epoch": 9.575208913649025, "grad_norm": 2.249248504638672, "learning_rate": 3.03439454235361e-06, "loss": 0.7143, "step": 6875 }, { "epoch": 9.576601671309191, "grad_norm": 2.2620809078216553, "learning_rate": 3.0244457077885158e-06, "loss": 0.4422, "step": 6876 }, { "epoch": 9.57799442896936, "grad_norm": 2.0016865730285645, "learning_rate": 3.014496873223422e-06, "loss": 0.522, "step": 6877 }, { "epoch": 9.579387186629527, "grad_norm": 2.14836049079895, "learning_rate": 3.004548038658328e-06, "loss": 0.6262, "step": 6878 }, { "epoch": 9.580779944289693, "grad_norm": 2.43053936958313, "learning_rate": 2.9945992040932348e-06, "loss": 0.7094, "step": 6879 }, { "epoch": 9.58217270194986, "grad_norm": 2.0721077919006348, "learning_rate": 2.9846503695281406e-06, "loss": 0.4594, "step": 6880 }, { "epoch": 9.583565459610028, "grad_norm": 2.3775992393493652, "learning_rate": 2.9747015349630467e-06, "loss": 0.6097, "step": 6881 }, { "epoch": 9.584958217270195, "grad_norm": 1.7133889198303223, "learning_rate": 2.9647527003979534e-06, "loss": 0.4903, "step": 6882 }, { "epoch": 9.586350974930362, "grad_norm": 2.000835418701172, "learning_rate": 2.9548038658328596e-06, "loss": 0.5703, "step": 6883 }, { "epoch": 9.587743732590528, "grad_norm": 2.188347816467285, "learning_rate": 2.9448550312677653e-06, "loss": 0.5702, "step": 6884 }, { "epoch": 9.589136490250697, "grad_norm": 2.036536693572998, "learning_rate": 2.9349061967026715e-06, "loss": 0.5254, "step": 6885 }, { "epoch": 9.590529247910863, "grad_norm": 2.263777256011963, "learning_rate": 2.924957362137578e-06, "loss": 0.5976, "step": 6886 }, { "epoch": 9.59192200557103, "grad_norm": 1.9548321962356567, "learning_rate": 2.915008527572484e-06, "loss": 0.4607, "step": 6887 }, { "epoch": 9.593314763231199, "grad_norm": 2.230546236038208, "learning_rate": 2.90505969300739e-06, "loss": 0.7865, "step": 6888 }, { "epoch": 9.594707520891365, "grad_norm": 2.3613121509552, "learning_rate": 2.8951108584422967e-06, "loss": 0.7575, "step": 6889 }, { "epoch": 9.596100278551532, "grad_norm": 1.9772577285766602, "learning_rate": 2.885162023877203e-06, "loss": 0.5071, "step": 6890 }, { "epoch": 9.597493036211699, "grad_norm": 2.2921950817108154, "learning_rate": 2.8752131893121087e-06, "loss": 0.6333, "step": 6891 }, { "epoch": 9.598885793871867, "grad_norm": 2.2228214740753174, "learning_rate": 2.865264354747015e-06, "loss": 0.6499, "step": 6892 }, { "epoch": 9.600278551532034, "grad_norm": 1.8338245153427124, "learning_rate": 2.8553155201819215e-06, "loss": 0.5282, "step": 6893 }, { "epoch": 9.6016713091922, "grad_norm": 1.925538420677185, "learning_rate": 2.8453666856168277e-06, "loss": 0.4687, "step": 6894 }, { "epoch": 9.603064066852367, "grad_norm": 1.5537601709365845, "learning_rate": 2.8354178510517334e-06, "loss": 0.3136, "step": 6895 }, { "epoch": 9.604456824512535, "grad_norm": 1.8593311309814453, "learning_rate": 2.82546901648664e-06, "loss": 0.4402, "step": 6896 }, { "epoch": 9.605849582172702, "grad_norm": 2.3682188987731934, "learning_rate": 2.8155201819215462e-06, "loss": 0.6091, "step": 6897 }, { "epoch": 9.607242339832869, "grad_norm": 2.1733884811401367, "learning_rate": 2.805571347356452e-06, "loss": 0.71, "step": 6898 }, { "epoch": 9.608635097493035, "grad_norm": 2.267303466796875, "learning_rate": 2.795622512791358e-06, "loss": 0.5613, "step": 6899 }, { "epoch": 9.610027855153204, "grad_norm": 2.11245059967041, "learning_rate": 2.785673678226265e-06, "loss": 0.4584, "step": 6900 }, { "epoch": 9.61142061281337, "grad_norm": 1.9563874006271362, "learning_rate": 2.775724843661171e-06, "loss": 0.6084, "step": 6901 }, { "epoch": 9.612813370473537, "grad_norm": 2.390535831451416, "learning_rate": 2.7657760090960768e-06, "loss": 0.734, "step": 6902 }, { "epoch": 9.614206128133706, "grad_norm": 2.5485520362854004, "learning_rate": 2.7558271745309834e-06, "loss": 0.5524, "step": 6903 }, { "epoch": 9.615598885793872, "grad_norm": 2.0168912410736084, "learning_rate": 2.7458783399658896e-06, "loss": 0.5521, "step": 6904 }, { "epoch": 9.616991643454039, "grad_norm": 1.8310315608978271, "learning_rate": 2.7359295054007958e-06, "loss": 0.5109, "step": 6905 }, { "epoch": 9.618384401114206, "grad_norm": 2.0850415229797363, "learning_rate": 2.7259806708357016e-06, "loss": 0.516, "step": 6906 }, { "epoch": 9.619777158774372, "grad_norm": 2.121399164199829, "learning_rate": 2.716031836270608e-06, "loss": 0.4571, "step": 6907 }, { "epoch": 9.62116991643454, "grad_norm": 2.0826025009155273, "learning_rate": 2.7060830017055144e-06, "loss": 0.6349, "step": 6908 }, { "epoch": 9.622562674094707, "grad_norm": 2.023716688156128, "learning_rate": 2.69613416714042e-06, "loss": 0.5386, "step": 6909 }, { "epoch": 9.623955431754874, "grad_norm": 2.1747753620147705, "learning_rate": 2.6861853325753267e-06, "loss": 0.5967, "step": 6910 }, { "epoch": 9.625348189415043, "grad_norm": 1.7980483770370483, "learning_rate": 2.676236498010233e-06, "loss": 0.4663, "step": 6911 }, { "epoch": 9.62674094707521, "grad_norm": 2.0955042839050293, "learning_rate": 2.666287663445139e-06, "loss": 0.5562, "step": 6912 }, { "epoch": 9.628133704735376, "grad_norm": 2.251206636428833, "learning_rate": 2.656338828880045e-06, "loss": 0.5868, "step": 6913 }, { "epoch": 9.629526462395543, "grad_norm": 1.8673650026321411, "learning_rate": 2.6463899943149515e-06, "loss": 0.5288, "step": 6914 }, { "epoch": 9.630919220055711, "grad_norm": 2.5020244121551514, "learning_rate": 2.6364411597498577e-06, "loss": 0.6089, "step": 6915 }, { "epoch": 9.632311977715878, "grad_norm": 2.392106533050537, "learning_rate": 2.6264923251847635e-06, "loss": 0.6196, "step": 6916 }, { "epoch": 9.633704735376044, "grad_norm": 2.0079996585845947, "learning_rate": 2.61654349061967e-06, "loss": 0.5551, "step": 6917 }, { "epoch": 9.635097493036211, "grad_norm": 2.3050472736358643, "learning_rate": 2.6065946560545763e-06, "loss": 0.5626, "step": 6918 }, { "epoch": 9.63649025069638, "grad_norm": 2.0892646312713623, "learning_rate": 2.5966458214894825e-06, "loss": 0.5317, "step": 6919 }, { "epoch": 9.637883008356546, "grad_norm": 2.22564959526062, "learning_rate": 2.5866969869243883e-06, "loss": 0.5402, "step": 6920 }, { "epoch": 9.639275766016713, "grad_norm": 1.9598835706710815, "learning_rate": 2.576748152359295e-06, "loss": 0.4181, "step": 6921 }, { "epoch": 9.64066852367688, "grad_norm": 1.9309732913970947, "learning_rate": 2.566799317794201e-06, "loss": 0.5266, "step": 6922 }, { "epoch": 9.642061281337048, "grad_norm": 2.229978561401367, "learning_rate": 2.5568504832291073e-06, "loss": 0.524, "step": 6923 }, { "epoch": 9.643454038997215, "grad_norm": 2.2351856231689453, "learning_rate": 2.5469016486640134e-06, "loss": 0.6238, "step": 6924 }, { "epoch": 9.644846796657381, "grad_norm": 2.101581573486328, "learning_rate": 2.5369528140989196e-06, "loss": 0.5739, "step": 6925 }, { "epoch": 9.64623955431755, "grad_norm": 1.8099535703659058, "learning_rate": 2.527003979533826e-06, "loss": 0.4697, "step": 6926 }, { "epoch": 9.647632311977716, "grad_norm": 2.1131985187530518, "learning_rate": 2.5170551449687316e-06, "loss": 0.5731, "step": 6927 }, { "epoch": 9.649025069637883, "grad_norm": 2.122297763824463, "learning_rate": 2.5071063104036382e-06, "loss": 0.7187, "step": 6928 }, { "epoch": 9.65041782729805, "grad_norm": 2.3489110469818115, "learning_rate": 2.4971574758385444e-06, "loss": 0.8092, "step": 6929 }, { "epoch": 9.651810584958218, "grad_norm": 1.7810783386230469, "learning_rate": 2.4872086412734506e-06, "loss": 0.4758, "step": 6930 }, { "epoch": 9.653203342618385, "grad_norm": 2.2817227840423584, "learning_rate": 2.477259806708357e-06, "loss": 0.64, "step": 6931 }, { "epoch": 9.654596100278551, "grad_norm": 2.217214584350586, "learning_rate": 2.467310972143263e-06, "loss": 0.5357, "step": 6932 }, { "epoch": 9.655988857938718, "grad_norm": 2.1526618003845215, "learning_rate": 2.457362137578169e-06, "loss": 0.6023, "step": 6933 }, { "epoch": 9.657381615598887, "grad_norm": 2.233933925628662, "learning_rate": 2.447413303013076e-06, "loss": 0.5486, "step": 6934 }, { "epoch": 9.658774373259053, "grad_norm": 2.1165871620178223, "learning_rate": 2.4374644684479816e-06, "loss": 0.6014, "step": 6935 }, { "epoch": 9.66016713091922, "grad_norm": 1.994081735610962, "learning_rate": 2.4275156338828878e-06, "loss": 0.5341, "step": 6936 }, { "epoch": 9.661559888579387, "grad_norm": 2.0053889751434326, "learning_rate": 2.417566799317794e-06, "loss": 0.5132, "step": 6937 }, { "epoch": 9.662952646239555, "grad_norm": 2.163989305496216, "learning_rate": 2.4076179647527e-06, "loss": 0.6876, "step": 6938 }, { "epoch": 9.664345403899722, "grad_norm": 2.0896172523498535, "learning_rate": 2.3976691301876063e-06, "loss": 0.579, "step": 6939 }, { "epoch": 9.665738161559888, "grad_norm": 2.0439982414245605, "learning_rate": 2.3877202956225125e-06, "loss": 0.4529, "step": 6940 }, { "epoch": 9.667130919220055, "grad_norm": 1.9303287267684937, "learning_rate": 2.377771461057419e-06, "loss": 0.5237, "step": 6941 }, { "epoch": 9.668523676880223, "grad_norm": 2.6126015186309814, "learning_rate": 2.367822626492325e-06, "loss": 0.6362, "step": 6942 }, { "epoch": 9.66991643454039, "grad_norm": 2.1996190547943115, "learning_rate": 2.357873791927231e-06, "loss": 0.5479, "step": 6943 }, { "epoch": 9.671309192200557, "grad_norm": 1.8324912786483765, "learning_rate": 2.3479249573621373e-06, "loss": 0.4954, "step": 6944 }, { "epoch": 9.672701949860723, "grad_norm": 1.8486496210098267, "learning_rate": 2.337976122797044e-06, "loss": 0.4296, "step": 6945 }, { "epoch": 9.674094707520892, "grad_norm": 1.6869251728057861, "learning_rate": 2.3280272882319497e-06, "loss": 0.3979, "step": 6946 }, { "epoch": 9.675487465181059, "grad_norm": 2.0417392253875732, "learning_rate": 2.318078453666856e-06, "loss": 0.5187, "step": 6947 }, { "epoch": 9.676880222841225, "grad_norm": 2.3201656341552734, "learning_rate": 2.3081296191017625e-06, "loss": 0.5462, "step": 6948 }, { "epoch": 9.678272980501394, "grad_norm": 1.9886524677276611, "learning_rate": 2.2981807845366683e-06, "loss": 0.5588, "step": 6949 }, { "epoch": 9.67966573816156, "grad_norm": 2.0460445880889893, "learning_rate": 2.2882319499715745e-06, "loss": 0.4677, "step": 6950 }, { "epoch": 9.681058495821727, "grad_norm": 2.216416835784912, "learning_rate": 2.2782831154064806e-06, "loss": 0.615, "step": 6951 }, { "epoch": 9.682451253481894, "grad_norm": 2.2661871910095215, "learning_rate": 2.2683342808413873e-06, "loss": 0.6978, "step": 6952 }, { "epoch": 9.683844011142062, "grad_norm": 2.5274295806884766, "learning_rate": 2.258385446276293e-06, "loss": 0.6989, "step": 6953 }, { "epoch": 9.685236768802229, "grad_norm": 1.9905458688735962, "learning_rate": 2.2484366117111992e-06, "loss": 0.5145, "step": 6954 }, { "epoch": 9.686629526462395, "grad_norm": 2.3244149684906006, "learning_rate": 2.238487777146106e-06, "loss": 0.5652, "step": 6955 }, { "epoch": 9.688022284122562, "grad_norm": 2.5218443870544434, "learning_rate": 2.228538942581012e-06, "loss": 0.5619, "step": 6956 }, { "epoch": 9.68941504178273, "grad_norm": 2.1873762607574463, "learning_rate": 2.218590108015918e-06, "loss": 0.6727, "step": 6957 }, { "epoch": 9.690807799442897, "grad_norm": 2.5588035583496094, "learning_rate": 2.208641273450824e-06, "loss": 0.6758, "step": 6958 }, { "epoch": 9.692200557103064, "grad_norm": 2.6251144409179688, "learning_rate": 2.1986924388857306e-06, "loss": 0.5718, "step": 6959 }, { "epoch": 9.69359331476323, "grad_norm": 2.7621402740478516, "learning_rate": 2.1887436043206364e-06, "loss": 0.5315, "step": 6960 }, { "epoch": 9.694986072423399, "grad_norm": 2.1926655769348145, "learning_rate": 2.1787947697555426e-06, "loss": 0.6315, "step": 6961 }, { "epoch": 9.696378830083566, "grad_norm": 2.6438217163085938, "learning_rate": 2.168845935190449e-06, "loss": 0.6022, "step": 6962 }, { "epoch": 9.697771587743732, "grad_norm": 1.984354853630066, "learning_rate": 2.158897100625355e-06, "loss": 0.4491, "step": 6963 }, { "epoch": 9.699164345403899, "grad_norm": 2.20831298828125, "learning_rate": 2.1489482660602616e-06, "loss": 0.5286, "step": 6964 }, { "epoch": 9.700557103064067, "grad_norm": 2.2550482749938965, "learning_rate": 2.1389994314951673e-06, "loss": 0.7065, "step": 6965 }, { "epoch": 9.701949860724234, "grad_norm": 1.811528205871582, "learning_rate": 2.129050596930074e-06, "loss": 0.3716, "step": 6966 }, { "epoch": 9.7033426183844, "grad_norm": 2.439908504486084, "learning_rate": 2.1191017623649797e-06, "loss": 0.6874, "step": 6967 }, { "epoch": 9.704735376044567, "grad_norm": 2.070103168487549, "learning_rate": 2.109152927799886e-06, "loss": 0.6209, "step": 6968 }, { "epoch": 9.706128133704736, "grad_norm": 2.3902666568756104, "learning_rate": 2.0992040932347925e-06, "loss": 0.6534, "step": 6969 }, { "epoch": 9.707520891364902, "grad_norm": 2.2055246829986572, "learning_rate": 2.0892552586696983e-06, "loss": 0.5116, "step": 6970 }, { "epoch": 9.70891364902507, "grad_norm": 1.8873040676116943, "learning_rate": 2.079306424104605e-06, "loss": 0.496, "step": 6971 }, { "epoch": 9.710306406685238, "grad_norm": 2.046614646911621, "learning_rate": 2.0693575895395107e-06, "loss": 0.4669, "step": 6972 }, { "epoch": 9.711699164345404, "grad_norm": 1.6519601345062256, "learning_rate": 2.0594087549744173e-06, "loss": 0.4092, "step": 6973 }, { "epoch": 9.713091922005571, "grad_norm": 2.278075933456421, "learning_rate": 2.049459920409323e-06, "loss": 0.7151, "step": 6974 }, { "epoch": 9.714484679665738, "grad_norm": 2.32436203956604, "learning_rate": 2.0395110858442297e-06, "loss": 0.5953, "step": 6975 }, { "epoch": 9.715877437325906, "grad_norm": 1.7162848711013794, "learning_rate": 2.029562251279136e-06, "loss": 0.4261, "step": 6976 }, { "epoch": 9.717270194986073, "grad_norm": 1.9412891864776611, "learning_rate": 2.0196134167140416e-06, "loss": 0.4663, "step": 6977 }, { "epoch": 9.71866295264624, "grad_norm": 2.11315655708313, "learning_rate": 2.0096645821489483e-06, "loss": 0.6713, "step": 6978 }, { "epoch": 9.720055710306406, "grad_norm": 2.1294631958007812, "learning_rate": 1.999715747583854e-06, "loss": 0.5247, "step": 6979 }, { "epoch": 9.721448467966574, "grad_norm": 2.0747945308685303, "learning_rate": 1.9897669130187606e-06, "loss": 0.5031, "step": 6980 }, { "epoch": 9.722841225626741, "grad_norm": 2.102787733078003, "learning_rate": 1.9798180784536664e-06, "loss": 0.6355, "step": 6981 }, { "epoch": 9.724233983286908, "grad_norm": 2.1588542461395264, "learning_rate": 1.969869243888573e-06, "loss": 0.5202, "step": 6982 }, { "epoch": 9.725626740947074, "grad_norm": 2.134801149368286, "learning_rate": 1.9599204093234792e-06, "loss": 0.6076, "step": 6983 }, { "epoch": 9.727019498607243, "grad_norm": 2.18527889251709, "learning_rate": 1.9499715747583854e-06, "loss": 0.5973, "step": 6984 }, { "epoch": 9.72841225626741, "grad_norm": 2.2865958213806152, "learning_rate": 1.9400227401932916e-06, "loss": 0.5338, "step": 6985 }, { "epoch": 9.729805013927576, "grad_norm": 2.043187141418457, "learning_rate": 1.930073905628198e-06, "loss": 0.4877, "step": 6986 }, { "epoch": 9.731197771587743, "grad_norm": 2.2247226238250732, "learning_rate": 1.920125071063104e-06, "loss": 0.4591, "step": 6987 }, { "epoch": 9.732590529247911, "grad_norm": 1.741791009902954, "learning_rate": 1.9101762364980098e-06, "loss": 0.4618, "step": 6988 }, { "epoch": 9.733983286908078, "grad_norm": 1.886433482170105, "learning_rate": 1.9002274019329164e-06, "loss": 0.4592, "step": 6989 }, { "epoch": 9.735376044568245, "grad_norm": 1.9228953123092651, "learning_rate": 1.8902785673678224e-06, "loss": 0.4409, "step": 6990 }, { "epoch": 9.736768802228411, "grad_norm": 2.4497694969177246, "learning_rate": 1.8803297328027288e-06, "loss": 0.6366, "step": 6991 }, { "epoch": 9.73816155988858, "grad_norm": 1.7542403936386108, "learning_rate": 1.8703808982376347e-06, "loss": 0.427, "step": 6992 }, { "epoch": 9.739554317548746, "grad_norm": 1.9917913675308228, "learning_rate": 1.8604320636725412e-06, "loss": 0.4881, "step": 6993 }, { "epoch": 9.740947075208913, "grad_norm": 1.8209863901138306, "learning_rate": 1.8504832291074471e-06, "loss": 0.431, "step": 6994 }, { "epoch": 9.742339832869082, "grad_norm": 2.16464900970459, "learning_rate": 1.8405343945423535e-06, "loss": 0.5031, "step": 6995 }, { "epoch": 9.743732590529248, "grad_norm": 2.160353660583496, "learning_rate": 1.8305855599772597e-06, "loss": 0.5782, "step": 6996 }, { "epoch": 9.745125348189415, "grad_norm": 2.2408394813537598, "learning_rate": 1.8206367254121657e-06, "loss": 0.4689, "step": 6997 }, { "epoch": 9.746518105849582, "grad_norm": 1.8952913284301758, "learning_rate": 1.8106878908470721e-06, "loss": 0.4133, "step": 6998 }, { "epoch": 9.74791086350975, "grad_norm": 2.0435421466827393, "learning_rate": 1.800739056281978e-06, "loss": 0.5561, "step": 6999 }, { "epoch": 9.749303621169917, "grad_norm": 1.984863042831421, "learning_rate": 1.7907902217168845e-06, "loss": 0.4682, "step": 7000 }, { "epoch": 9.750696378830083, "grad_norm": 1.9193624258041382, "learning_rate": 1.7808413871517905e-06, "loss": 0.5653, "step": 7001 }, { "epoch": 9.75208913649025, "grad_norm": 2.372758626937866, "learning_rate": 1.7708925525866969e-06, "loss": 0.5681, "step": 7002 }, { "epoch": 9.753481894150418, "grad_norm": 2.1380016803741455, "learning_rate": 1.760943718021603e-06, "loss": 0.457, "step": 7003 }, { "epoch": 9.754874651810585, "grad_norm": 1.9594526290893555, "learning_rate": 1.7509948834565093e-06, "loss": 0.5065, "step": 7004 }, { "epoch": 9.756267409470752, "grad_norm": 2.3975584506988525, "learning_rate": 1.7410460488914155e-06, "loss": 0.7089, "step": 7005 }, { "epoch": 9.757660167130918, "grad_norm": 2.2214443683624268, "learning_rate": 1.7310972143263217e-06, "loss": 0.5737, "step": 7006 }, { "epoch": 9.759052924791087, "grad_norm": 2.1189019680023193, "learning_rate": 1.7211483797612278e-06, "loss": 0.5466, "step": 7007 }, { "epoch": 9.760445682451254, "grad_norm": 2.492465019226074, "learning_rate": 1.7111995451961338e-06, "loss": 0.7129, "step": 7008 }, { "epoch": 9.76183844011142, "grad_norm": 2.2690608501434326, "learning_rate": 1.7012507106310402e-06, "loss": 0.6258, "step": 7009 }, { "epoch": 9.763231197771589, "grad_norm": 2.0491721630096436, "learning_rate": 1.6913018760659464e-06, "loss": 0.4888, "step": 7010 }, { "epoch": 9.764623955431755, "grad_norm": 2.462674379348755, "learning_rate": 1.6813530415008526e-06, "loss": 0.6891, "step": 7011 }, { "epoch": 9.766016713091922, "grad_norm": 3.877023935317993, "learning_rate": 1.6714042069357588e-06, "loss": 0.6741, "step": 7012 }, { "epoch": 9.767409470752089, "grad_norm": 2.1356163024902344, "learning_rate": 1.661455372370665e-06, "loss": 0.535, "step": 7013 }, { "epoch": 9.768802228412257, "grad_norm": 2.041917562484741, "learning_rate": 1.6515065378055712e-06, "loss": 0.5095, "step": 7014 }, { "epoch": 9.770194986072424, "grad_norm": 2.0338964462280273, "learning_rate": 1.6415577032404776e-06, "loss": 0.4833, "step": 7015 }, { "epoch": 9.77158774373259, "grad_norm": 2.070000648498535, "learning_rate": 1.6316088686753836e-06, "loss": 0.579, "step": 7016 }, { "epoch": 9.772980501392757, "grad_norm": 2.4562582969665527, "learning_rate": 1.62166003411029e-06, "loss": 0.721, "step": 7017 }, { "epoch": 9.774373259052926, "grad_norm": 2.306575059890747, "learning_rate": 1.611711199545196e-06, "loss": 0.6792, "step": 7018 }, { "epoch": 9.775766016713092, "grad_norm": 2.3740251064300537, "learning_rate": 1.6017623649801022e-06, "loss": 0.6151, "step": 7019 }, { "epoch": 9.777158774373259, "grad_norm": 2.5305862426757812, "learning_rate": 1.5918135304150084e-06, "loss": 0.6339, "step": 7020 }, { "epoch": 9.778551532033426, "grad_norm": 1.8700345754623413, "learning_rate": 1.5818646958499145e-06, "loss": 0.4044, "step": 7021 }, { "epoch": 9.779944289693594, "grad_norm": 2.096038341522217, "learning_rate": 1.5719158612848207e-06, "loss": 0.4863, "step": 7022 }, { "epoch": 9.78133704735376, "grad_norm": 2.018458127975464, "learning_rate": 1.561967026719727e-06, "loss": 0.4877, "step": 7023 }, { "epoch": 9.782729805013927, "grad_norm": 1.793351173400879, "learning_rate": 1.5520181921546333e-06, "loss": 0.5304, "step": 7024 }, { "epoch": 9.784122562674094, "grad_norm": 1.891086220741272, "learning_rate": 1.5420693575895393e-06, "loss": 0.4283, "step": 7025 }, { "epoch": 9.785515320334262, "grad_norm": 2.475612163543701, "learning_rate": 1.5321205230244457e-06, "loss": 0.5958, "step": 7026 }, { "epoch": 9.786908077994429, "grad_norm": 1.8320320844650269, "learning_rate": 1.5221716884593517e-06, "loss": 0.4097, "step": 7027 }, { "epoch": 9.788300835654596, "grad_norm": 1.9003340005874634, "learning_rate": 1.5122228538942579e-06, "loss": 0.5038, "step": 7028 }, { "epoch": 9.789693593314762, "grad_norm": 2.2090916633605957, "learning_rate": 1.502274019329164e-06, "loss": 0.6872, "step": 7029 }, { "epoch": 9.79108635097493, "grad_norm": 1.9826990365982056, "learning_rate": 1.4923251847640703e-06, "loss": 0.5234, "step": 7030 }, { "epoch": 9.792479108635098, "grad_norm": 1.7873624563217163, "learning_rate": 1.4823763501989767e-06, "loss": 0.4322, "step": 7031 }, { "epoch": 9.793871866295264, "grad_norm": 2.042003870010376, "learning_rate": 1.4724275156338827e-06, "loss": 0.5007, "step": 7032 }, { "epoch": 9.795264623955433, "grad_norm": 2.289184093475342, "learning_rate": 1.462478681068789e-06, "loss": 0.605, "step": 7033 }, { "epoch": 9.7966573816156, "grad_norm": 3.1260499954223633, "learning_rate": 1.452529846503695e-06, "loss": 0.6609, "step": 7034 }, { "epoch": 9.798050139275766, "grad_norm": 1.9191718101501465, "learning_rate": 1.4425810119386014e-06, "loss": 0.4787, "step": 7035 }, { "epoch": 9.799442896935933, "grad_norm": 2.3396522998809814, "learning_rate": 1.4326321773735074e-06, "loss": 0.6021, "step": 7036 }, { "epoch": 9.800835654596101, "grad_norm": 1.95595121383667, "learning_rate": 1.4226833428084138e-06, "loss": 0.386, "step": 7037 }, { "epoch": 9.802228412256268, "grad_norm": 2.068617820739746, "learning_rate": 1.41273450824332e-06, "loss": 0.6495, "step": 7038 }, { "epoch": 9.803621169916434, "grad_norm": 2.383984088897705, "learning_rate": 1.402785673678226e-06, "loss": 0.6752, "step": 7039 }, { "epoch": 9.805013927576601, "grad_norm": 2.521237373352051, "learning_rate": 1.3928368391131324e-06, "loss": 0.6401, "step": 7040 }, { "epoch": 9.80640668523677, "grad_norm": 1.938478946685791, "learning_rate": 1.3828880045480384e-06, "loss": 0.5717, "step": 7041 }, { "epoch": 9.807799442896936, "grad_norm": 1.7377694845199585, "learning_rate": 1.3729391699829448e-06, "loss": 0.4244, "step": 7042 }, { "epoch": 9.809192200557103, "grad_norm": 2.3117964267730713, "learning_rate": 1.3629903354178508e-06, "loss": 0.6161, "step": 7043 }, { "epoch": 9.81058495821727, "grad_norm": 2.130361795425415, "learning_rate": 1.3530415008527572e-06, "loss": 0.5192, "step": 7044 }, { "epoch": 9.811977715877438, "grad_norm": 2.2317090034484863, "learning_rate": 1.3430926662876634e-06, "loss": 0.4929, "step": 7045 }, { "epoch": 9.813370473537605, "grad_norm": 2.0268118381500244, "learning_rate": 1.3331438317225696e-06, "loss": 0.6043, "step": 7046 }, { "epoch": 9.814763231197771, "grad_norm": 2.0953783988952637, "learning_rate": 1.3231949971574758e-06, "loss": 0.5248, "step": 7047 }, { "epoch": 9.816155988857938, "grad_norm": 1.854250431060791, "learning_rate": 1.3132461625923817e-06, "loss": 0.5196, "step": 7048 }, { "epoch": 9.817548746518106, "grad_norm": 2.054757595062256, "learning_rate": 1.3032973280272881e-06, "loss": 0.5887, "step": 7049 }, { "epoch": 9.818941504178273, "grad_norm": 2.184720516204834, "learning_rate": 1.2933484934621941e-06, "loss": 0.5577, "step": 7050 }, { "epoch": 9.82033426183844, "grad_norm": 2.0001440048217773, "learning_rate": 1.2833996588971005e-06, "loss": 0.4692, "step": 7051 }, { "epoch": 9.821727019498606, "grad_norm": 1.9419692754745483, "learning_rate": 1.2734508243320067e-06, "loss": 0.5137, "step": 7052 }, { "epoch": 9.823119777158775, "grad_norm": 2.0789315700531006, "learning_rate": 1.263501989766913e-06, "loss": 0.4364, "step": 7053 }, { "epoch": 9.824512534818941, "grad_norm": 1.8646198511123657, "learning_rate": 1.2535531552018191e-06, "loss": 0.4988, "step": 7054 }, { "epoch": 9.825905292479108, "grad_norm": 1.80806565284729, "learning_rate": 1.2436043206367253e-06, "loss": 0.4913, "step": 7055 }, { "epoch": 9.827298050139277, "grad_norm": 2.497965097427368, "learning_rate": 1.2336554860716315e-06, "loss": 0.5724, "step": 7056 }, { "epoch": 9.828690807799443, "grad_norm": 1.9820994138717651, "learning_rate": 1.223706651506538e-06, "loss": 0.5237, "step": 7057 }, { "epoch": 9.83008356545961, "grad_norm": 1.5877418518066406, "learning_rate": 1.2137578169414439e-06, "loss": 0.3922, "step": 7058 }, { "epoch": 9.831476323119777, "grad_norm": 2.035332441329956, "learning_rate": 1.20380898237635e-06, "loss": 0.5689, "step": 7059 }, { "epoch": 9.832869080779945, "grad_norm": 2.17326283454895, "learning_rate": 1.1938601478112563e-06, "loss": 0.6454, "step": 7060 }, { "epoch": 9.834261838440112, "grad_norm": 2.0205471515655518, "learning_rate": 1.1839113132461625e-06, "loss": 0.5385, "step": 7061 }, { "epoch": 9.835654596100278, "grad_norm": 1.927617073059082, "learning_rate": 1.1739624786810686e-06, "loss": 0.4974, "step": 7062 }, { "epoch": 9.837047353760445, "grad_norm": 1.6703886985778809, "learning_rate": 1.1640136441159748e-06, "loss": 0.3911, "step": 7063 }, { "epoch": 9.838440111420613, "grad_norm": 2.4799304008483887, "learning_rate": 1.1540648095508812e-06, "loss": 0.6664, "step": 7064 }, { "epoch": 9.83983286908078, "grad_norm": 2.192898988723755, "learning_rate": 1.1441159749857872e-06, "loss": 0.6836, "step": 7065 }, { "epoch": 9.841225626740947, "grad_norm": 1.9271363019943237, "learning_rate": 1.1341671404206936e-06, "loss": 0.4509, "step": 7066 }, { "epoch": 9.842618384401113, "grad_norm": 1.6498823165893555, "learning_rate": 1.1242183058555996e-06, "loss": 0.406, "step": 7067 }, { "epoch": 9.844011142061282, "grad_norm": 2.2960798740386963, "learning_rate": 1.114269471290506e-06, "loss": 0.7303, "step": 7068 }, { "epoch": 9.845403899721449, "grad_norm": 2.159357786178589, "learning_rate": 1.104320636725412e-06, "loss": 0.6743, "step": 7069 }, { "epoch": 9.846796657381615, "grad_norm": 1.9666335582733154, "learning_rate": 1.0943718021603182e-06, "loss": 0.4534, "step": 7070 }, { "epoch": 9.848189415041782, "grad_norm": 1.9147229194641113, "learning_rate": 1.0844229675952246e-06, "loss": 0.5186, "step": 7071 }, { "epoch": 9.84958217270195, "grad_norm": 1.8351507186889648, "learning_rate": 1.0744741330301308e-06, "loss": 0.4674, "step": 7072 }, { "epoch": 9.850974930362117, "grad_norm": 2.234679937362671, "learning_rate": 1.064525298465037e-06, "loss": 0.6809, "step": 7073 }, { "epoch": 9.852367688022284, "grad_norm": 1.74303138256073, "learning_rate": 1.054576463899943e-06, "loss": 0.3896, "step": 7074 }, { "epoch": 9.85376044568245, "grad_norm": 2.025094747543335, "learning_rate": 1.0446276293348492e-06, "loss": 0.5635, "step": 7075 }, { "epoch": 9.855153203342619, "grad_norm": 2.3703227043151855, "learning_rate": 1.0346787947697553e-06, "loss": 0.6656, "step": 7076 }, { "epoch": 9.856545961002785, "grad_norm": 2.1646673679351807, "learning_rate": 1.0247299602046615e-06, "loss": 0.5863, "step": 7077 }, { "epoch": 9.857938718662952, "grad_norm": 2.322664737701416, "learning_rate": 1.014781125639568e-06, "loss": 0.552, "step": 7078 }, { "epoch": 9.85933147632312, "grad_norm": 2.183500051498413, "learning_rate": 1.0048322910744741e-06, "loss": 0.576, "step": 7079 }, { "epoch": 9.860724233983287, "grad_norm": 3.6958634853363037, "learning_rate": 9.948834565093803e-07, "loss": 0.4973, "step": 7080 }, { "epoch": 9.862116991643454, "grad_norm": 2.184685468673706, "learning_rate": 9.849346219442865e-07, "loss": 0.5928, "step": 7081 }, { "epoch": 9.86350974930362, "grad_norm": 1.7897632122039795, "learning_rate": 9.749857873791927e-07, "loss": 0.4739, "step": 7082 }, { "epoch": 9.864902506963789, "grad_norm": 2.104130744934082, "learning_rate": 9.65036952814099e-07, "loss": 0.504, "step": 7083 }, { "epoch": 9.866295264623956, "grad_norm": 1.6344671249389648, "learning_rate": 9.550881182490049e-07, "loss": 0.4088, "step": 7084 }, { "epoch": 9.867688022284122, "grad_norm": 2.351177215576172, "learning_rate": 9.451392836839112e-07, "loss": 0.3861, "step": 7085 }, { "epoch": 9.869080779944289, "grad_norm": 2.248156785964966, "learning_rate": 9.351904491188174e-07, "loss": 0.6107, "step": 7086 }, { "epoch": 9.870473537604457, "grad_norm": 2.3252768516540527, "learning_rate": 9.252416145537236e-07, "loss": 0.6035, "step": 7087 }, { "epoch": 9.871866295264624, "grad_norm": 1.9758137464523315, "learning_rate": 9.152927799886299e-07, "loss": 0.4369, "step": 7088 }, { "epoch": 9.87325905292479, "grad_norm": 2.075624465942383, "learning_rate": 9.053439454235361e-07, "loss": 0.5758, "step": 7089 }, { "epoch": 9.874651810584957, "grad_norm": 1.892717719078064, "learning_rate": 8.953951108584422e-07, "loss": 0.4483, "step": 7090 }, { "epoch": 9.876044568245126, "grad_norm": 2.014420747756958, "learning_rate": 8.854462762933484e-07, "loss": 0.6405, "step": 7091 }, { "epoch": 9.877437325905293, "grad_norm": 1.8224351406097412, "learning_rate": 8.754974417282546e-07, "loss": 0.4362, "step": 7092 }, { "epoch": 9.87883008356546, "grad_norm": 2.152827024459839, "learning_rate": 8.655486071631608e-07, "loss": 0.4982, "step": 7093 }, { "epoch": 9.880222841225628, "grad_norm": 1.9619979858398438, "learning_rate": 8.555997725980669e-07, "loss": 0.4952, "step": 7094 }, { "epoch": 9.881615598885794, "grad_norm": 2.4509429931640625, "learning_rate": 8.456509380329732e-07, "loss": 0.6591, "step": 7095 }, { "epoch": 9.883008356545961, "grad_norm": 2.236889362335205, "learning_rate": 8.357021034678794e-07, "loss": 0.6118, "step": 7096 }, { "epoch": 9.884401114206128, "grad_norm": 2.0628082752227783, "learning_rate": 8.257532689027856e-07, "loss": 0.5942, "step": 7097 }, { "epoch": 9.885793871866294, "grad_norm": 2.272329092025757, "learning_rate": 8.158044343376918e-07, "loss": 0.6116, "step": 7098 }, { "epoch": 9.887186629526463, "grad_norm": 2.2354860305786133, "learning_rate": 8.05855599772598e-07, "loss": 0.5908, "step": 7099 }, { "epoch": 9.88857938718663, "grad_norm": 1.988637089729309, "learning_rate": 7.959067652075042e-07, "loss": 0.4926, "step": 7100 }, { "epoch": 9.889972144846796, "grad_norm": 2.5095486640930176, "learning_rate": 7.859579306424104e-07, "loss": 0.5934, "step": 7101 }, { "epoch": 9.891364902506965, "grad_norm": 2.0553159713745117, "learning_rate": 7.760090960773167e-07, "loss": 0.5687, "step": 7102 }, { "epoch": 9.892757660167131, "grad_norm": 2.137578248977661, "learning_rate": 7.660602615122229e-07, "loss": 0.5213, "step": 7103 }, { "epoch": 9.894150417827298, "grad_norm": 2.0234615802764893, "learning_rate": 7.561114269471289e-07, "loss": 0.4872, "step": 7104 }, { "epoch": 9.895543175487465, "grad_norm": 1.8893370628356934, "learning_rate": 7.461625923820351e-07, "loss": 0.6072, "step": 7105 }, { "epoch": 9.896935933147633, "grad_norm": 2.027456283569336, "learning_rate": 7.362137578169413e-07, "loss": 0.5366, "step": 7106 }, { "epoch": 9.8983286908078, "grad_norm": 2.1616761684417725, "learning_rate": 7.262649232518475e-07, "loss": 0.5211, "step": 7107 }, { "epoch": 9.899721448467966, "grad_norm": 2.145045518875122, "learning_rate": 7.163160886867537e-07, "loss": 0.5437, "step": 7108 }, { "epoch": 9.901114206128133, "grad_norm": 1.9581626653671265, "learning_rate": 7.0636725412166e-07, "loss": 0.5937, "step": 7109 }, { "epoch": 9.902506963788301, "grad_norm": 2.170980453491211, "learning_rate": 6.964184195565662e-07, "loss": 0.7115, "step": 7110 }, { "epoch": 9.903899721448468, "grad_norm": 2.0341475009918213, "learning_rate": 6.864695849914724e-07, "loss": 0.572, "step": 7111 }, { "epoch": 9.905292479108635, "grad_norm": 2.295051097869873, "learning_rate": 6.765207504263786e-07, "loss": 0.5601, "step": 7112 }, { "epoch": 9.906685236768801, "grad_norm": 1.8335386514663696, "learning_rate": 6.665719158612848e-07, "loss": 0.3945, "step": 7113 }, { "epoch": 9.90807799442897, "grad_norm": 2.219296932220459, "learning_rate": 6.566230812961909e-07, "loss": 0.6245, "step": 7114 }, { "epoch": 9.909470752089137, "grad_norm": 2.0192878246307373, "learning_rate": 6.466742467310971e-07, "loss": 0.5099, "step": 7115 }, { "epoch": 9.910863509749303, "grad_norm": 2.1819090843200684, "learning_rate": 6.367254121660034e-07, "loss": 0.5059, "step": 7116 }, { "epoch": 9.912256267409472, "grad_norm": 1.8525947332382202, "learning_rate": 6.267765776009096e-07, "loss": 0.4052, "step": 7117 }, { "epoch": 9.913649025069638, "grad_norm": 2.503136396408081, "learning_rate": 6.168277430358157e-07, "loss": 0.6114, "step": 7118 }, { "epoch": 9.915041782729805, "grad_norm": 2.1122305393218994, "learning_rate": 6.068789084707219e-07, "loss": 0.5232, "step": 7119 }, { "epoch": 9.916434540389972, "grad_norm": 1.8866273164749146, "learning_rate": 5.969300739056281e-07, "loss": 0.5607, "step": 7120 }, { "epoch": 9.91782729805014, "grad_norm": 2.3245203495025635, "learning_rate": 5.869812393405343e-07, "loss": 0.5957, "step": 7121 }, { "epoch": 9.919220055710307, "grad_norm": 1.9134774208068848, "learning_rate": 5.770324047754406e-07, "loss": 0.5709, "step": 7122 }, { "epoch": 9.920612813370473, "grad_norm": 1.9723302125930786, "learning_rate": 5.670835702103468e-07, "loss": 0.4957, "step": 7123 }, { "epoch": 9.92200557103064, "grad_norm": 2.3764803409576416, "learning_rate": 5.57134735645253e-07, "loss": 0.6448, "step": 7124 }, { "epoch": 9.923398328690809, "grad_norm": 1.8192933797836304, "learning_rate": 5.471859010801591e-07, "loss": 0.4076, "step": 7125 }, { "epoch": 9.924791086350975, "grad_norm": 2.030700445175171, "learning_rate": 5.372370665150654e-07, "loss": 0.5361, "step": 7126 }, { "epoch": 9.926183844011142, "grad_norm": 1.692780613899231, "learning_rate": 5.272882319499715e-07, "loss": 0.4395, "step": 7127 }, { "epoch": 9.927576601671309, "grad_norm": 1.9694042205810547, "learning_rate": 5.173393973848777e-07, "loss": 0.4873, "step": 7128 }, { "epoch": 9.928969359331477, "grad_norm": 2.3899731636047363, "learning_rate": 5.07390562819784e-07, "loss": 0.6094, "step": 7129 }, { "epoch": 9.930362116991644, "grad_norm": 1.9247006177902222, "learning_rate": 4.974417282546902e-07, "loss": 0.5094, "step": 7130 }, { "epoch": 9.93175487465181, "grad_norm": 2.264085531234741, "learning_rate": 4.874928936895964e-07, "loss": 0.5281, "step": 7131 }, { "epoch": 9.933147632311977, "grad_norm": 1.9155348539352417, "learning_rate": 4.775440591245024e-07, "loss": 0.43, "step": 7132 }, { "epoch": 9.934540389972145, "grad_norm": 2.153841972351074, "learning_rate": 4.675952245594087e-07, "loss": 0.5799, "step": 7133 }, { "epoch": 9.935933147632312, "grad_norm": 2.1322433948516846, "learning_rate": 4.5764638999431493e-07, "loss": 0.7184, "step": 7134 }, { "epoch": 9.937325905292479, "grad_norm": 2.050550699234009, "learning_rate": 4.476975554292211e-07, "loss": 0.5051, "step": 7135 }, { "epoch": 9.938718662952645, "grad_norm": 1.484911561012268, "learning_rate": 4.377487208641273e-07, "loss": 0.3215, "step": 7136 }, { "epoch": 9.940111420612814, "grad_norm": 1.9908186197280884, "learning_rate": 4.2779988629903346e-07, "loss": 0.4299, "step": 7137 }, { "epoch": 9.94150417827298, "grad_norm": 2.2484664916992188, "learning_rate": 4.178510517339397e-07, "loss": 0.5732, "step": 7138 }, { "epoch": 9.942896935933147, "grad_norm": 2.211859703063965, "learning_rate": 4.079022171688459e-07, "loss": 0.7317, "step": 7139 }, { "epoch": 9.944289693593316, "grad_norm": 1.795798897743225, "learning_rate": 3.979533826037521e-07, "loss": 0.4229, "step": 7140 }, { "epoch": 9.945682451253482, "grad_norm": 1.943630576133728, "learning_rate": 3.8800454803865833e-07, "loss": 0.4857, "step": 7141 }, { "epoch": 9.947075208913649, "grad_norm": 2.133408784866333, "learning_rate": 3.7805571347356447e-07, "loss": 0.5843, "step": 7142 }, { "epoch": 9.948467966573816, "grad_norm": 1.9784224033355713, "learning_rate": 3.6810687890847067e-07, "loss": 0.4794, "step": 7143 }, { "epoch": 9.949860724233984, "grad_norm": 2.6341850757598877, "learning_rate": 3.5815804434337686e-07, "loss": 0.7599, "step": 7144 }, { "epoch": 9.95125348189415, "grad_norm": 2.1995270252227783, "learning_rate": 3.482092097782831e-07, "loss": 0.5679, "step": 7145 }, { "epoch": 9.952646239554317, "grad_norm": 1.9202507734298706, "learning_rate": 3.382603752131893e-07, "loss": 0.4951, "step": 7146 }, { "epoch": 9.954038997214484, "grad_norm": 2.0263235569000244, "learning_rate": 3.2831154064809544e-07, "loss": 0.5948, "step": 7147 }, { "epoch": 9.955431754874652, "grad_norm": 2.282881021499634, "learning_rate": 3.183627060830017e-07, "loss": 0.5656, "step": 7148 }, { "epoch": 9.95682451253482, "grad_norm": 2.2149147987365723, "learning_rate": 3.0841387151790787e-07, "loss": 0.718, "step": 7149 }, { "epoch": 9.958217270194986, "grad_norm": 1.9350368976593018, "learning_rate": 2.9846503695281407e-07, "loss": 0.5345, "step": 7150 }, { "epoch": 9.959610027855152, "grad_norm": 1.8354272842407227, "learning_rate": 2.885162023877203e-07, "loss": 0.5589, "step": 7151 }, { "epoch": 9.961002785515321, "grad_norm": 1.8528400659561157, "learning_rate": 2.785673678226265e-07, "loss": 0.526, "step": 7152 }, { "epoch": 9.962395543175488, "grad_norm": 2.0718677043914795, "learning_rate": 2.686185332575327e-07, "loss": 0.6267, "step": 7153 }, { "epoch": 9.963788300835654, "grad_norm": 2.057283878326416, "learning_rate": 2.5866969869243884e-07, "loss": 0.5607, "step": 7154 }, { "epoch": 9.965181058495821, "grad_norm": 1.8653982877731323, "learning_rate": 2.487208641273451e-07, "loss": 0.5396, "step": 7155 }, { "epoch": 9.96657381615599, "grad_norm": 2.0835800170898438, "learning_rate": 2.387720295622512e-07, "loss": 0.5667, "step": 7156 }, { "epoch": 9.967966573816156, "grad_norm": 2.089766025543213, "learning_rate": 2.2882319499715747e-07, "loss": 0.5627, "step": 7157 }, { "epoch": 9.969359331476323, "grad_norm": 1.9209058284759521, "learning_rate": 2.1887436043206366e-07, "loss": 0.451, "step": 7158 }, { "epoch": 9.97075208913649, "grad_norm": 1.976711392402649, "learning_rate": 2.0892552586696985e-07, "loss": 0.4475, "step": 7159 }, { "epoch": 9.972144846796658, "grad_norm": 2.1920390129089355, "learning_rate": 1.9897669130187604e-07, "loss": 0.5494, "step": 7160 }, { "epoch": 9.973537604456824, "grad_norm": 2.1984899044036865, "learning_rate": 1.8902785673678224e-07, "loss": 0.6537, "step": 7161 }, { "epoch": 9.974930362116991, "grad_norm": 2.0709049701690674, "learning_rate": 1.7907902217168843e-07, "loss": 0.5861, "step": 7162 }, { "epoch": 9.97632311977716, "grad_norm": 2.4112558364868164, "learning_rate": 1.6913018760659465e-07, "loss": 0.5579, "step": 7163 }, { "epoch": 9.977715877437326, "grad_norm": 2.3421363830566406, "learning_rate": 1.5918135304150084e-07, "loss": 0.5993, "step": 7164 }, { "epoch": 9.979108635097493, "grad_norm": 1.897717833518982, "learning_rate": 1.4923251847640703e-07, "loss": 0.4771, "step": 7165 }, { "epoch": 9.98050139275766, "grad_norm": 1.9442927837371826, "learning_rate": 1.3928368391131325e-07, "loss": 0.4929, "step": 7166 }, { "epoch": 9.981894150417828, "grad_norm": 2.2649590969085693, "learning_rate": 1.2933484934621942e-07, "loss": 0.5187, "step": 7167 }, { "epoch": 9.983286908077995, "grad_norm": 2.1614112854003906, "learning_rate": 1.193860147811256e-07, "loss": 0.6393, "step": 7168 }, { "epoch": 9.984679665738161, "grad_norm": 2.230652332305908, "learning_rate": 1.0943718021603183e-07, "loss": 0.5468, "step": 7169 }, { "epoch": 9.986072423398328, "grad_norm": 1.8186545372009277, "learning_rate": 9.948834565093802e-08, "loss": 0.3882, "step": 7170 }, { "epoch": 9.987465181058496, "grad_norm": 1.8561375141143799, "learning_rate": 8.953951108584421e-08, "loss": 0.5552, "step": 7171 }, { "epoch": 9.988857938718663, "grad_norm": 2.0926353931427, "learning_rate": 7.959067652075042e-08, "loss": 0.5638, "step": 7172 }, { "epoch": 9.99025069637883, "grad_norm": 2.2047741413116455, "learning_rate": 6.964184195565663e-08, "loss": 0.6578, "step": 7173 }, { "epoch": 9.991643454038996, "grad_norm": 1.7672758102416992, "learning_rate": 5.96930073905628e-08, "loss": 0.399, "step": 7174 }, { "epoch": 9.993036211699165, "grad_norm": 2.1503844261169434, "learning_rate": 4.974417282546901e-08, "loss": 0.5326, "step": 7175 }, { "epoch": 9.994428969359332, "grad_norm": 2.4471487998962402, "learning_rate": 3.979533826037521e-08, "loss": 0.7769, "step": 7176 }, { "epoch": 9.995821727019498, "grad_norm": 1.9396417140960693, "learning_rate": 2.98465036952814e-08, "loss": 0.409, "step": 7177 }, { "epoch": 9.997214484679667, "grad_norm": 1.933851718902588, "learning_rate": 1.9897669130187605e-08, "loss": 0.5888, "step": 7178 }, { "epoch": 9.998607242339833, "grad_norm": 2.037083148956299, "learning_rate": 9.948834565093803e-09, "loss": 0.5062, "step": 7179 }, { "epoch": 10.0, "grad_norm": 2.248157501220703, "learning_rate": 0.0, "loss": 0.6042, "step": 7180 }, { "epoch": 10.0, "step": 7180, "total_flos": 8.941285959873331e+17, "train_loss": 2.6341500356917926, "train_runtime": 43087.7162, "train_samples_per_second": 21.33, "train_steps_per_second": 0.167 } ], "logging_steps": 1.0, "max_steps": 7180, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8.941285959873331e+17, "train_batch_size": 128, "trial_name": null, "trial_params": null }