{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.7499691218246943, "eval_steps": 759, "global_step": 2277, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0003293672032607353, "grad_norm": 3.2412221431732178, "learning_rate": 2.0000000000000003e-06, "loss": 3.3662, "step": 1 }, { "epoch": 0.0003293672032607353, "eval_loss": 3.95210599899292, "eval_runtime": 812.7321, "eval_samples_per_second": 3.146, "eval_steps_per_second": 1.574, "step": 1 }, { "epoch": 0.0006587344065214706, "grad_norm": 3.8850836753845215, "learning_rate": 4.000000000000001e-06, "loss": 3.2364, "step": 2 }, { "epoch": 0.000988101609782206, "grad_norm": 3.7942073345184326, "learning_rate": 6e-06, "loss": 3.3337, "step": 3 }, { "epoch": 0.0013174688130429412, "grad_norm": 4.045947074890137, "learning_rate": 8.000000000000001e-06, "loss": 3.523, "step": 4 }, { "epoch": 0.0016468360163036766, "grad_norm": 3.6181905269622803, "learning_rate": 1e-05, "loss": 3.1772, "step": 5 }, { "epoch": 0.001976203219564412, "grad_norm": 4.814149379730225, "learning_rate": 1.2e-05, "loss": 3.871, "step": 6 }, { "epoch": 0.002305570422825147, "grad_norm": 4.0820112228393555, "learning_rate": 1.4000000000000001e-05, "loss": 3.7199, "step": 7 }, { "epoch": 0.0026349376260858823, "grad_norm": 4.483249187469482, "learning_rate": 1.6000000000000003e-05, "loss": 3.5852, "step": 8 }, { "epoch": 0.002964304829346618, "grad_norm": 5.03192138671875, "learning_rate": 1.8e-05, "loss": 3.5165, "step": 9 }, { "epoch": 0.003293672032607353, "grad_norm": 4.41070556640625, "learning_rate": 2e-05, "loss": 3.1522, "step": 10 }, { "epoch": 0.0036230392358680883, "grad_norm": 4.54289436340332, "learning_rate": 2.2000000000000003e-05, "loss": 3.113, "step": 11 }, { "epoch": 0.003952406439128824, "grad_norm": 5.175803184509277, "learning_rate": 2.4e-05, "loss": 3.0833, "step": 12 }, { "epoch": 0.004281773642389559, "grad_norm": 5.162367820739746, "learning_rate": 2.6000000000000002e-05, "loss": 3.2495, "step": 13 }, { "epoch": 0.004611140845650294, "grad_norm": 5.13967227935791, "learning_rate": 2.8000000000000003e-05, "loss": 3.1866, "step": 14 }, { "epoch": 0.0049405080489110294, "grad_norm": 4.819941520690918, "learning_rate": 3e-05, "loss": 2.8483, "step": 15 }, { "epoch": 0.005269875252171765, "grad_norm": 4.200347900390625, "learning_rate": 3.2000000000000005e-05, "loss": 3.0112, "step": 16 }, { "epoch": 0.005599242455432501, "grad_norm": 3.748765230178833, "learning_rate": 3.4000000000000007e-05, "loss": 2.4482, "step": 17 }, { "epoch": 0.005928609658693236, "grad_norm": 4.706761360168457, "learning_rate": 3.6e-05, "loss": 2.7336, "step": 18 }, { "epoch": 0.006257976861953971, "grad_norm": 4.122259616851807, "learning_rate": 3.8e-05, "loss": 2.2935, "step": 19 }, { "epoch": 0.006587344065214706, "grad_norm": 3.4756200313568115, "learning_rate": 4e-05, "loss": 2.4558, "step": 20 }, { "epoch": 0.006916711268475441, "grad_norm": 4.478485107421875, "learning_rate": 4.2e-05, "loss": 2.7799, "step": 21 }, { "epoch": 0.007246078471736177, "grad_norm": 3.8455657958984375, "learning_rate": 4.4000000000000006e-05, "loss": 2.3758, "step": 22 }, { "epoch": 0.007575445674996912, "grad_norm": 4.439505100250244, "learning_rate": 4.600000000000001e-05, "loss": 2.4312, "step": 23 }, { "epoch": 0.007904812878257648, "grad_norm": 5.45050048828125, "learning_rate": 4.8e-05, "loss": 2.3483, "step": 24 }, { "epoch": 0.008234180081518383, "grad_norm": 6.179609298706055, "learning_rate": 5e-05, "loss": 2.1845, "step": 25 }, { "epoch": 0.008563547284779118, "grad_norm": 3.980731964111328, "learning_rate": 5.2000000000000004e-05, "loss": 2.7766, "step": 26 }, { "epoch": 0.008892914488039853, "grad_norm": 3.381983995437622, "learning_rate": 5.4000000000000005e-05, "loss": 2.5084, "step": 27 }, { "epoch": 0.009222281691300589, "grad_norm": 3.6727781295776367, "learning_rate": 5.6000000000000006e-05, "loss": 2.8683, "step": 28 }, { "epoch": 0.009551648894561324, "grad_norm": 3.6398210525512695, "learning_rate": 5.8e-05, "loss": 2.4222, "step": 29 }, { "epoch": 0.009881016097822059, "grad_norm": 3.005326986312866, "learning_rate": 6e-05, "loss": 2.5306, "step": 30 }, { "epoch": 0.010210383301082794, "grad_norm": 3.8456501960754395, "learning_rate": 6.2e-05, "loss": 2.7113, "step": 31 }, { "epoch": 0.01053975050434353, "grad_norm": 2.880403518676758, "learning_rate": 6.400000000000001e-05, "loss": 2.3046, "step": 32 }, { "epoch": 0.010869117707604266, "grad_norm": 3.180576801300049, "learning_rate": 6.6e-05, "loss": 2.5024, "step": 33 }, { "epoch": 0.011198484910865001, "grad_norm": 3.2660038471221924, "learning_rate": 6.800000000000001e-05, "loss": 2.6287, "step": 34 }, { "epoch": 0.011527852114125737, "grad_norm": 2.8470723628997803, "learning_rate": 7e-05, "loss": 2.3198, "step": 35 }, { "epoch": 0.011857219317386472, "grad_norm": 3.0491487979888916, "learning_rate": 7.2e-05, "loss": 2.3468, "step": 36 }, { "epoch": 0.012186586520647207, "grad_norm": 3.558762788772583, "learning_rate": 7.4e-05, "loss": 2.5952, "step": 37 }, { "epoch": 0.012515953723907942, "grad_norm": 3.0420889854431152, "learning_rate": 7.6e-05, "loss": 2.2754, "step": 38 }, { "epoch": 0.012845320927168677, "grad_norm": 2.9328525066375732, "learning_rate": 7.800000000000001e-05, "loss": 1.9192, "step": 39 }, { "epoch": 0.013174688130429412, "grad_norm": 2.9751060009002686, "learning_rate": 8e-05, "loss": 2.4325, "step": 40 }, { "epoch": 0.013504055333690148, "grad_norm": 2.924984931945801, "learning_rate": 8.2e-05, "loss": 2.376, "step": 41 }, { "epoch": 0.013833422536950883, "grad_norm": 2.9344444274902344, "learning_rate": 8.4e-05, "loss": 2.3072, "step": 42 }, { "epoch": 0.014162789740211618, "grad_norm": 3.2519760131835938, "learning_rate": 8.6e-05, "loss": 2.3017, "step": 43 }, { "epoch": 0.014492156943472353, "grad_norm": 2.9699273109436035, "learning_rate": 8.800000000000001e-05, "loss": 2.2801, "step": 44 }, { "epoch": 0.014821524146733088, "grad_norm": 3.125183343887329, "learning_rate": 9e-05, "loss": 2.1891, "step": 45 }, { "epoch": 0.015150891349993824, "grad_norm": 4.185096740722656, "learning_rate": 9.200000000000001e-05, "loss": 2.3958, "step": 46 }, { "epoch": 0.01548025855325456, "grad_norm": 4.01360559463501, "learning_rate": 9.4e-05, "loss": 2.3923, "step": 47 }, { "epoch": 0.015809625756515296, "grad_norm": 3.355820417404175, "learning_rate": 9.6e-05, "loss": 2.0905, "step": 48 }, { "epoch": 0.01613899295977603, "grad_norm": 4.308192729949951, "learning_rate": 9.8e-05, "loss": 2.0585, "step": 49 }, { "epoch": 0.016468360163036766, "grad_norm": 5.098552227020264, "learning_rate": 0.0001, "loss": 2.7164, "step": 50 }, { "epoch": 0.0167977273662975, "grad_norm": 2.8389906883239746, "learning_rate": 9.999997232675378e-05, "loss": 2.7739, "step": 51 }, { "epoch": 0.017127094569558236, "grad_norm": 3.0663294792175293, "learning_rate": 9.999988930704576e-05, "loss": 2.7123, "step": 52 }, { "epoch": 0.01745646177281897, "grad_norm": 3.3522698879241943, "learning_rate": 9.99997509409678e-05, "loss": 2.4191, "step": 53 }, { "epoch": 0.017785828976079707, "grad_norm": 2.815250873565674, "learning_rate": 9.999955722867312e-05, "loss": 2.6329, "step": 54 }, { "epoch": 0.018115196179340442, "grad_norm": 2.9915289878845215, "learning_rate": 9.999930817037609e-05, "loss": 2.4786, "step": 55 }, { "epoch": 0.018444563382601177, "grad_norm": 3.284151554107666, "learning_rate": 9.999900376635243e-05, "loss": 2.4249, "step": 56 }, { "epoch": 0.018773930585861912, "grad_norm": 2.4534027576446533, "learning_rate": 9.999864401693908e-05, "loss": 2.356, "step": 57 }, { "epoch": 0.019103297789122647, "grad_norm": 2.522468328475952, "learning_rate": 9.999822892253428e-05, "loss": 2.3852, "step": 58 }, { "epoch": 0.019432664992383383, "grad_norm": 2.9176650047302246, "learning_rate": 9.999775848359749e-05, "loss": 2.2109, "step": 59 }, { "epoch": 0.019762032195644118, "grad_norm": 3.167917013168335, "learning_rate": 9.999723270064945e-05, "loss": 2.3511, "step": 60 }, { "epoch": 0.020091399398904853, "grad_norm": 2.6770145893096924, "learning_rate": 9.999665157427218e-05, "loss": 2.6865, "step": 61 }, { "epoch": 0.020420766602165588, "grad_norm": 2.729907989501953, "learning_rate": 9.999601510510895e-05, "loss": 2.3012, "step": 62 }, { "epoch": 0.020750133805426323, "grad_norm": 2.7215871810913086, "learning_rate": 9.999532329386425e-05, "loss": 2.0514, "step": 63 }, { "epoch": 0.02107950100868706, "grad_norm": 2.7751474380493164, "learning_rate": 9.999457614130391e-05, "loss": 2.4867, "step": 64 }, { "epoch": 0.021408868211947794, "grad_norm": 2.4375815391540527, "learning_rate": 9.999377364825496e-05, "loss": 2.2029, "step": 65 }, { "epoch": 0.021738235415208532, "grad_norm": 2.7066309452056885, "learning_rate": 9.99929158156057e-05, "loss": 2.1274, "step": 66 }, { "epoch": 0.022067602618469268, "grad_norm": 3.297457695007324, "learning_rate": 9.999200264430569e-05, "loss": 2.5863, "step": 67 }, { "epoch": 0.022396969821730003, "grad_norm": 3.2771711349487305, "learning_rate": 9.999103413536574e-05, "loss": 2.334, "step": 68 }, { "epoch": 0.022726337024990738, "grad_norm": 2.9507834911346436, "learning_rate": 9.999001028985795e-05, "loss": 2.3783, "step": 69 }, { "epoch": 0.023055704228251473, "grad_norm": 2.6401610374450684, "learning_rate": 9.998893110891562e-05, "loss": 1.9256, "step": 70 }, { "epoch": 0.023385071431512208, "grad_norm": 3.0901267528533936, "learning_rate": 9.998779659373332e-05, "loss": 2.1796, "step": 71 }, { "epoch": 0.023714438634772943, "grad_norm": 3.132500171661377, "learning_rate": 9.99866067455669e-05, "loss": 2.2996, "step": 72 }, { "epoch": 0.02404380583803368, "grad_norm": 3.1304476261138916, "learning_rate": 9.998536156573343e-05, "loss": 2.1264, "step": 73 }, { "epoch": 0.024373173041294414, "grad_norm": 3.315488576889038, "learning_rate": 9.998406105561125e-05, "loss": 1.984, "step": 74 }, { "epoch": 0.02470254024455515, "grad_norm": 7.0925774574279785, "learning_rate": 9.99827052166399e-05, "loss": 2.533, "step": 75 }, { "epoch": 0.025031907447815884, "grad_norm": 2.485670328140259, "learning_rate": 9.998129405032022e-05, "loss": 2.4111, "step": 76 }, { "epoch": 0.02536127465107662, "grad_norm": 2.9346237182617188, "learning_rate": 9.997982755821428e-05, "loss": 2.4954, "step": 77 }, { "epoch": 0.025690641854337355, "grad_norm": 2.6853690147399902, "learning_rate": 9.997830574194538e-05, "loss": 2.7849, "step": 78 }, { "epoch": 0.02602000905759809, "grad_norm": 2.614497661590576, "learning_rate": 9.997672860319804e-05, "loss": 2.7054, "step": 79 }, { "epoch": 0.026349376260858825, "grad_norm": 2.476630687713623, "learning_rate": 9.997509614371807e-05, "loss": 2.3267, "step": 80 }, { "epoch": 0.02667874346411956, "grad_norm": 2.7469708919525146, "learning_rate": 9.997340836531249e-05, "loss": 2.3416, "step": 81 }, { "epoch": 0.027008110667380295, "grad_norm": 2.3859927654266357, "learning_rate": 9.997166526984954e-05, "loss": 2.1941, "step": 82 }, { "epoch": 0.02733747787064103, "grad_norm": 2.3550686836242676, "learning_rate": 9.996986685925868e-05, "loss": 2.2731, "step": 83 }, { "epoch": 0.027666845073901766, "grad_norm": 2.6245596408843994, "learning_rate": 9.996801313553068e-05, "loss": 2.1934, "step": 84 }, { "epoch": 0.0279962122771625, "grad_norm": 2.337979793548584, "learning_rate": 9.996610410071742e-05, "loss": 2.0485, "step": 85 }, { "epoch": 0.028325579480423236, "grad_norm": 2.2919275760650635, "learning_rate": 9.996413975693214e-05, "loss": 2.1097, "step": 86 }, { "epoch": 0.02865494668368397, "grad_norm": 2.4434707164764404, "learning_rate": 9.996212010634917e-05, "loss": 2.1181, "step": 87 }, { "epoch": 0.028984313886944706, "grad_norm": 2.2074220180511475, "learning_rate": 9.996004515120414e-05, "loss": 2.1341, "step": 88 }, { "epoch": 0.02931368109020544, "grad_norm": 2.670886993408203, "learning_rate": 9.995791489379388e-05, "loss": 2.2701, "step": 89 }, { "epoch": 0.029643048293466177, "grad_norm": 3.0483765602111816, "learning_rate": 9.995572933647643e-05, "loss": 2.2896, "step": 90 }, { "epoch": 0.029972415496726912, "grad_norm": 2.74815034866333, "learning_rate": 9.995348848167107e-05, "loss": 2.3094, "step": 91 }, { "epoch": 0.030301782699987647, "grad_norm": 3.188274621963501, "learning_rate": 9.995119233185825e-05, "loss": 2.3274, "step": 92 }, { "epoch": 0.030631149903248386, "grad_norm": 2.603101968765259, "learning_rate": 9.994884088957966e-05, "loss": 2.0735, "step": 93 }, { "epoch": 0.03096051710650912, "grad_norm": 2.7947723865509033, "learning_rate": 9.994643415743817e-05, "loss": 2.571, "step": 94 }, { "epoch": 0.03128988430976985, "grad_norm": 2.8341128826141357, "learning_rate": 9.994397213809786e-05, "loss": 2.4703, "step": 95 }, { "epoch": 0.03161925151303059, "grad_norm": 2.8162853717803955, "learning_rate": 9.994145483428403e-05, "loss": 2.2555, "step": 96 }, { "epoch": 0.03194861871629132, "grad_norm": 2.9116039276123047, "learning_rate": 9.993888224878313e-05, "loss": 2.0821, "step": 97 }, { "epoch": 0.03227798591955206, "grad_norm": 3.1359169483184814, "learning_rate": 9.993625438444287e-05, "loss": 1.8411, "step": 98 }, { "epoch": 0.03260735312281279, "grad_norm": 3.9783849716186523, "learning_rate": 9.993357124417209e-05, "loss": 2.3985, "step": 99 }, { "epoch": 0.03293672032607353, "grad_norm": 3.3506076335906982, "learning_rate": 9.993083283094084e-05, "loss": 2.2946, "step": 100 }, { "epoch": 0.033266087529334264, "grad_norm": 2.503321409225464, "learning_rate": 9.992803914778034e-05, "loss": 2.6129, "step": 101 }, { "epoch": 0.033595454732595, "grad_norm": 3.6384737491607666, "learning_rate": 9.992519019778301e-05, "loss": 2.6609, "step": 102 }, { "epoch": 0.033924821935855734, "grad_norm": 2.3811521530151367, "learning_rate": 9.992228598410244e-05, "loss": 2.3603, "step": 103 }, { "epoch": 0.03425418913911647, "grad_norm": 2.4915661811828613, "learning_rate": 9.991932650995341e-05, "loss": 2.2988, "step": 104 }, { "epoch": 0.034583556342377204, "grad_norm": 2.7037079334259033, "learning_rate": 9.991631177861182e-05, "loss": 2.4169, "step": 105 }, { "epoch": 0.03491292354563794, "grad_norm": 2.6235880851745605, "learning_rate": 9.991324179341478e-05, "loss": 2.2463, "step": 106 }, { "epoch": 0.03524229074889868, "grad_norm": 2.3382320404052734, "learning_rate": 9.991011655776055e-05, "loss": 2.2194, "step": 107 }, { "epoch": 0.03557165795215941, "grad_norm": 2.669034957885742, "learning_rate": 9.990693607510853e-05, "loss": 2.4065, "step": 108 }, { "epoch": 0.03590102515542015, "grad_norm": 2.219449281692505, "learning_rate": 9.990370034897931e-05, "loss": 2.0945, "step": 109 }, { "epoch": 0.036230392358680884, "grad_norm": 3.07869291305542, "learning_rate": 9.99004093829546e-05, "loss": 2.3285, "step": 110 }, { "epoch": 0.03655975956194162, "grad_norm": 2.4732134342193604, "learning_rate": 9.989706318067728e-05, "loss": 2.0391, "step": 111 }, { "epoch": 0.036889126765202354, "grad_norm": 2.9737696647644043, "learning_rate": 9.989366174585135e-05, "loss": 2.0482, "step": 112 }, { "epoch": 0.03721849396846309, "grad_norm": 2.8496570587158203, "learning_rate": 9.989020508224197e-05, "loss": 2.3701, "step": 113 }, { "epoch": 0.037547861171723824, "grad_norm": 3.108743190765381, "learning_rate": 9.98866931936754e-05, "loss": 2.4347, "step": 114 }, { "epoch": 0.03787722837498456, "grad_norm": 2.642160177230835, "learning_rate": 9.988312608403909e-05, "loss": 2.3828, "step": 115 }, { "epoch": 0.038206595578245295, "grad_norm": 2.7258148193359375, "learning_rate": 9.987950375728157e-05, "loss": 2.0533, "step": 116 }, { "epoch": 0.038535962781506033, "grad_norm": 2.6295080184936523, "learning_rate": 9.98758262174125e-05, "loss": 2.3752, "step": 117 }, { "epoch": 0.038865329984766765, "grad_norm": 2.905092477798462, "learning_rate": 9.987209346850263e-05, "loss": 2.031, "step": 118 }, { "epoch": 0.039194697188027504, "grad_norm": 2.780714988708496, "learning_rate": 9.986830551468388e-05, "loss": 2.0643, "step": 119 }, { "epoch": 0.039524064391288236, "grad_norm": 2.858346939086914, "learning_rate": 9.986446236014925e-05, "loss": 2.3293, "step": 120 }, { "epoch": 0.039853431594548974, "grad_norm": 3.3452649116516113, "learning_rate": 9.986056400915284e-05, "loss": 2.4871, "step": 121 }, { "epoch": 0.040182798797809706, "grad_norm": 2.921239137649536, "learning_rate": 9.985661046600984e-05, "loss": 2.3667, "step": 122 }, { "epoch": 0.040512166001070445, "grad_norm": 3.3637163639068604, "learning_rate": 9.985260173509656e-05, "loss": 2.2286, "step": 123 }, { "epoch": 0.040841533204331176, "grad_norm": 2.8042445182800293, "learning_rate": 9.984853782085035e-05, "loss": 1.9868, "step": 124 }, { "epoch": 0.041170900407591915, "grad_norm": 3.1596014499664307, "learning_rate": 9.984441872776973e-05, "loss": 1.9366, "step": 125 }, { "epoch": 0.04150026761085265, "grad_norm": 2.2009477615356445, "learning_rate": 9.984024446041423e-05, "loss": 2.7554, "step": 126 }, { "epoch": 0.041829634814113385, "grad_norm": 2.5528178215026855, "learning_rate": 9.983601502340443e-05, "loss": 2.4577, "step": 127 }, { "epoch": 0.04215900201737412, "grad_norm": 2.445089101791382, "learning_rate": 9.983173042142208e-05, "loss": 2.3525, "step": 128 }, { "epoch": 0.042488369220634856, "grad_norm": 2.8235416412353516, "learning_rate": 9.98273906592099e-05, "loss": 2.6086, "step": 129 }, { "epoch": 0.04281773642389559, "grad_norm": 2.569711446762085, "learning_rate": 9.98229957415717e-05, "loss": 2.4181, "step": 130 }, { "epoch": 0.043147103627156326, "grad_norm": 2.627051591873169, "learning_rate": 9.981854567337237e-05, "loss": 2.307, "step": 131 }, { "epoch": 0.043476470830417065, "grad_norm": 2.5236432552337646, "learning_rate": 9.98140404595378e-05, "loss": 2.5002, "step": 132 }, { "epoch": 0.043805838033677796, "grad_norm": 2.83263897895813, "learning_rate": 9.980948010505493e-05, "loss": 2.6664, "step": 133 }, { "epoch": 0.044135205236938535, "grad_norm": 2.404651641845703, "learning_rate": 9.98048646149718e-05, "loss": 2.4746, "step": 134 }, { "epoch": 0.04446457244019927, "grad_norm": 2.768198013305664, "learning_rate": 9.980019399439741e-05, "loss": 2.3371, "step": 135 }, { "epoch": 0.044793939643460005, "grad_norm": 3.2502036094665527, "learning_rate": 9.97954682485018e-05, "loss": 2.7268, "step": 136 }, { "epoch": 0.04512330684672074, "grad_norm": 2.356938362121582, "learning_rate": 9.979068738251605e-05, "loss": 2.271, "step": 137 }, { "epoch": 0.045452674049981476, "grad_norm": 2.790741205215454, "learning_rate": 9.978585140173225e-05, "loss": 2.5781, "step": 138 }, { "epoch": 0.04578204125324221, "grad_norm": 3.1573290824890137, "learning_rate": 9.978096031150346e-05, "loss": 2.3545, "step": 139 }, { "epoch": 0.046111408456502946, "grad_norm": 2.6217617988586426, "learning_rate": 9.977601411724382e-05, "loss": 2.2068, "step": 140 }, { "epoch": 0.04644077565976368, "grad_norm": 3.0412278175354004, "learning_rate": 9.977101282442839e-05, "loss": 2.386, "step": 141 }, { "epoch": 0.046770142863024416, "grad_norm": 3.4128334522247314, "learning_rate": 9.976595643859326e-05, "loss": 2.5365, "step": 142 }, { "epoch": 0.04709951006628515, "grad_norm": 3.5312652587890625, "learning_rate": 9.976084496533547e-05, "loss": 2.2243, "step": 143 }, { "epoch": 0.04742887726954589, "grad_norm": 2.8533828258514404, "learning_rate": 9.97556784103131e-05, "loss": 2.2667, "step": 144 }, { "epoch": 0.04775824447280662, "grad_norm": 3.081562042236328, "learning_rate": 9.975045677924515e-05, "loss": 2.3508, "step": 145 }, { "epoch": 0.04808761167606736, "grad_norm": 3.2083470821380615, "learning_rate": 9.97451800779116e-05, "loss": 2.4371, "step": 146 }, { "epoch": 0.04841697887932809, "grad_norm": 3.0021450519561768, "learning_rate": 9.973984831215337e-05, "loss": 1.9932, "step": 147 }, { "epoch": 0.04874634608258883, "grad_norm": 3.146559953689575, "learning_rate": 9.973446148787238e-05, "loss": 2.0892, "step": 148 }, { "epoch": 0.04907571328584956, "grad_norm": 3.2698886394500732, "learning_rate": 9.972901961103145e-05, "loss": 1.9643, "step": 149 }, { "epoch": 0.0494050804891103, "grad_norm": 3.640223503112793, "learning_rate": 9.972352268765434e-05, "loss": 2.0784, "step": 150 }, { "epoch": 0.04973444769237103, "grad_norm": 2.1278653144836426, "learning_rate": 9.971797072382579e-05, "loss": 2.4746, "step": 151 }, { "epoch": 0.05006381489563177, "grad_norm": 2.588524341583252, "learning_rate": 9.971236372569142e-05, "loss": 2.2406, "step": 152 }, { "epoch": 0.0503931820988925, "grad_norm": 2.5750515460968018, "learning_rate": 9.97067016994578e-05, "loss": 2.254, "step": 153 }, { "epoch": 0.05072254930215324, "grad_norm": 2.47708797454834, "learning_rate": 9.970098465139236e-05, "loss": 2.2676, "step": 154 }, { "epoch": 0.05105191650541397, "grad_norm": 2.496344566345215, "learning_rate": 9.969521258782351e-05, "loss": 2.3978, "step": 155 }, { "epoch": 0.05138128370867471, "grad_norm": 2.462228775024414, "learning_rate": 9.968938551514048e-05, "loss": 2.3217, "step": 156 }, { "epoch": 0.05171065091193544, "grad_norm": 2.3593785762786865, "learning_rate": 9.968350343979346e-05, "loss": 2.0463, "step": 157 }, { "epoch": 0.05204001811519618, "grad_norm": 2.4684934616088867, "learning_rate": 9.967756636829348e-05, "loss": 2.3118, "step": 158 }, { "epoch": 0.05236938531845692, "grad_norm": 3.1452877521514893, "learning_rate": 9.967157430721248e-05, "loss": 2.3831, "step": 159 }, { "epoch": 0.05269875252171765, "grad_norm": 2.745805025100708, "learning_rate": 9.966552726318323e-05, "loss": 2.3436, "step": 160 }, { "epoch": 0.05302811972497839, "grad_norm": 2.490478038787842, "learning_rate": 9.965942524289941e-05, "loss": 2.2698, "step": 161 }, { "epoch": 0.05335748692823912, "grad_norm": 2.3748586177825928, "learning_rate": 9.96532682531155e-05, "loss": 2.4236, "step": 162 }, { "epoch": 0.05368685413149986, "grad_norm": 2.378679037094116, "learning_rate": 9.964705630064686e-05, "loss": 2.1829, "step": 163 }, { "epoch": 0.05401622133476059, "grad_norm": 2.8342976570129395, "learning_rate": 9.964078939236971e-05, "loss": 2.3079, "step": 164 }, { "epoch": 0.05434558853802133, "grad_norm": 2.9072232246398926, "learning_rate": 9.963446753522104e-05, "loss": 2.3423, "step": 165 }, { "epoch": 0.05467495574128206, "grad_norm": 2.8593156337738037, "learning_rate": 9.962809073619875e-05, "loss": 2.2235, "step": 166 }, { "epoch": 0.0550043229445428, "grad_norm": 2.832493543624878, "learning_rate": 9.962165900236146e-05, "loss": 2.2889, "step": 167 }, { "epoch": 0.05533369014780353, "grad_norm": 2.806488037109375, "learning_rate": 9.961517234082866e-05, "loss": 2.1615, "step": 168 }, { "epoch": 0.05566305735106427, "grad_norm": 3.301941394805908, "learning_rate": 9.960863075878067e-05, "loss": 2.2195, "step": 169 }, { "epoch": 0.055992424554325, "grad_norm": 2.458503484725952, "learning_rate": 9.960203426345851e-05, "loss": 2.1645, "step": 170 }, { "epoch": 0.05632179175758574, "grad_norm": 2.415736675262451, "learning_rate": 9.959538286216408e-05, "loss": 2.012, "step": 171 }, { "epoch": 0.05665115896084647, "grad_norm": 2.7939391136169434, "learning_rate": 9.958867656225997e-05, "loss": 2.3091, "step": 172 }, { "epoch": 0.05698052616410721, "grad_norm": 2.971738576889038, "learning_rate": 9.958191537116963e-05, "loss": 2.1566, "step": 173 }, { "epoch": 0.05730989336736794, "grad_norm": 3.0159671306610107, "learning_rate": 9.957509929637719e-05, "loss": 2.0143, "step": 174 }, { "epoch": 0.05763926057062868, "grad_norm": 3.120633125305176, "learning_rate": 9.956822834542759e-05, "loss": 1.8494, "step": 175 }, { "epoch": 0.05796862777388941, "grad_norm": 2.3217828273773193, "learning_rate": 9.956130252592646e-05, "loss": 2.6393, "step": 176 }, { "epoch": 0.05829799497715015, "grad_norm": 2.3948323726654053, "learning_rate": 9.955432184554024e-05, "loss": 2.6342, "step": 177 }, { "epoch": 0.05862736218041088, "grad_norm": 2.1750946044921875, "learning_rate": 9.9547286311996e-05, "loss": 2.3683, "step": 178 }, { "epoch": 0.05895672938367162, "grad_norm": 2.6148295402526855, "learning_rate": 9.954019593308163e-05, "loss": 2.5178, "step": 179 }, { "epoch": 0.05928609658693235, "grad_norm": 2.6671082973480225, "learning_rate": 9.953305071664566e-05, "loss": 2.3501, "step": 180 }, { "epoch": 0.05961546379019309, "grad_norm": 2.740058422088623, "learning_rate": 9.952585067059734e-05, "loss": 2.3677, "step": 181 }, { "epoch": 0.059944830993453824, "grad_norm": 2.2972073554992676, "learning_rate": 9.951859580290664e-05, "loss": 2.5598, "step": 182 }, { "epoch": 0.06027419819671456, "grad_norm": 2.5012662410736084, "learning_rate": 9.951128612160417e-05, "loss": 2.247, "step": 183 }, { "epoch": 0.060603565399975294, "grad_norm": 2.5461273193359375, "learning_rate": 9.950392163478121e-05, "loss": 2.4683, "step": 184 }, { "epoch": 0.06093293260323603, "grad_norm": 3.3869452476501465, "learning_rate": 9.949650235058978e-05, "loss": 2.5158, "step": 185 }, { "epoch": 0.06126229980649677, "grad_norm": 2.2931103706359863, "learning_rate": 9.948902827724248e-05, "loss": 2.2837, "step": 186 }, { "epoch": 0.0615916670097575, "grad_norm": 2.4342219829559326, "learning_rate": 9.94814994230126e-05, "loss": 2.2988, "step": 187 }, { "epoch": 0.06192103421301824, "grad_norm": 2.924483299255371, "learning_rate": 9.947391579623401e-05, "loss": 2.4679, "step": 188 }, { "epoch": 0.06225040141627897, "grad_norm": 2.381253480911255, "learning_rate": 9.946627740530131e-05, "loss": 2.0651, "step": 189 }, { "epoch": 0.0625797686195397, "grad_norm": 2.3202030658721924, "learning_rate": 9.945858425866962e-05, "loss": 2.0079, "step": 190 }, { "epoch": 0.06290913582280044, "grad_norm": 2.717766523361206, "learning_rate": 9.945083636485476e-05, "loss": 2.6815, "step": 191 }, { "epoch": 0.06323850302606118, "grad_norm": 3.0983986854553223, "learning_rate": 9.944303373243307e-05, "loss": 2.4154, "step": 192 }, { "epoch": 0.06356787022932192, "grad_norm": 2.5674819946289062, "learning_rate": 9.943517637004151e-05, "loss": 1.8935, "step": 193 }, { "epoch": 0.06389723743258265, "grad_norm": 2.8902697563171387, "learning_rate": 9.942726428637765e-05, "loss": 2.19, "step": 194 }, { "epoch": 0.06422660463584338, "grad_norm": 2.824228525161743, "learning_rate": 9.941929749019961e-05, "loss": 1.7962, "step": 195 }, { "epoch": 0.06455597183910412, "grad_norm": 2.9178125858306885, "learning_rate": 9.941127599032605e-05, "loss": 1.9707, "step": 196 }, { "epoch": 0.06488533904236486, "grad_norm": 3.112900495529175, "learning_rate": 9.940319979563624e-05, "loss": 2.1085, "step": 197 }, { "epoch": 0.06521470624562559, "grad_norm": 2.4243252277374268, "learning_rate": 9.939506891506993e-05, "loss": 1.6683, "step": 198 }, { "epoch": 0.06554407344888633, "grad_norm": 3.3545095920562744, "learning_rate": 9.938688335762747e-05, "loss": 1.9903, "step": 199 }, { "epoch": 0.06587344065214706, "grad_norm": 4.020653247833252, "learning_rate": 9.937864313236968e-05, "loss": 1.9782, "step": 200 }, { "epoch": 0.0662028078554078, "grad_norm": 2.1895785331726074, "learning_rate": 9.93703482484179e-05, "loss": 2.4232, "step": 201 }, { "epoch": 0.06653217505866853, "grad_norm": 2.779294967651367, "learning_rate": 9.9361998714954e-05, "loss": 2.4474, "step": 202 }, { "epoch": 0.06686154226192927, "grad_norm": 2.666214942932129, "learning_rate": 9.935359454122033e-05, "loss": 2.3747, "step": 203 }, { "epoch": 0.06719090946519, "grad_norm": 2.4927260875701904, "learning_rate": 9.93451357365197e-05, "loss": 2.3229, "step": 204 }, { "epoch": 0.06752027666845074, "grad_norm": 2.6281189918518066, "learning_rate": 9.933662231021543e-05, "loss": 2.3106, "step": 205 }, { "epoch": 0.06784964387171147, "grad_norm": 2.487201452255249, "learning_rate": 9.932805427173128e-05, "loss": 2.1396, "step": 206 }, { "epoch": 0.0681790110749722, "grad_norm": 2.6833136081695557, "learning_rate": 9.931943163055148e-05, "loss": 2.6855, "step": 207 }, { "epoch": 0.06850837827823295, "grad_norm": 2.670117139816284, "learning_rate": 9.931075439622069e-05, "loss": 2.0407, "step": 208 }, { "epoch": 0.06883774548149368, "grad_norm": 2.8142549991607666, "learning_rate": 9.930202257834397e-05, "loss": 2.5156, "step": 209 }, { "epoch": 0.06916711268475441, "grad_norm": 2.5977020263671875, "learning_rate": 9.929323618658686e-05, "loss": 2.2659, "step": 210 }, { "epoch": 0.06949647988801515, "grad_norm": 2.188446521759033, "learning_rate": 9.928439523067526e-05, "loss": 1.853, "step": 211 }, { "epoch": 0.06982584709127589, "grad_norm": 2.691819906234741, "learning_rate": 9.92754997203955e-05, "loss": 2.0574, "step": 212 }, { "epoch": 0.07015521429453662, "grad_norm": 2.599579334259033, "learning_rate": 9.926654966559427e-05, "loss": 2.1189, "step": 213 }, { "epoch": 0.07048458149779736, "grad_norm": 3.0365447998046875, "learning_rate": 9.925754507617868e-05, "loss": 2.2052, "step": 214 }, { "epoch": 0.07081394870105809, "grad_norm": 3.086376190185547, "learning_rate": 9.924848596211618e-05, "loss": 2.5058, "step": 215 }, { "epoch": 0.07114331590431883, "grad_norm": 3.420269012451172, "learning_rate": 9.923937233343453e-05, "loss": 2.0489, "step": 216 }, { "epoch": 0.07147268310757957, "grad_norm": 2.6292338371276855, "learning_rate": 9.923020420022191e-05, "loss": 2.4083, "step": 217 }, { "epoch": 0.0718020503108403, "grad_norm": 3.046985387802124, "learning_rate": 9.92209815726268e-05, "loss": 2.2224, "step": 218 }, { "epoch": 0.07213141751410103, "grad_norm": 3.1213648319244385, "learning_rate": 9.921170446085798e-05, "loss": 2.0798, "step": 219 }, { "epoch": 0.07246078471736177, "grad_norm": 2.71501088142395, "learning_rate": 9.920237287518462e-05, "loss": 2.3788, "step": 220 }, { "epoch": 0.0727901519206225, "grad_norm": 2.7265591621398926, "learning_rate": 9.919298682593605e-05, "loss": 2.0445, "step": 221 }, { "epoch": 0.07311951912388324, "grad_norm": 3.3277218341827393, "learning_rate": 9.918354632350202e-05, "loss": 2.1541, "step": 222 }, { "epoch": 0.07344888632714397, "grad_norm": 3.288353443145752, "learning_rate": 9.917405137833249e-05, "loss": 2.2666, "step": 223 }, { "epoch": 0.07377825353040471, "grad_norm": 3.049253463745117, "learning_rate": 9.916450200093771e-05, "loss": 1.8277, "step": 224 }, { "epoch": 0.07410762073366545, "grad_norm": 3.3985233306884766, "learning_rate": 9.915489820188814e-05, "loss": 1.9754, "step": 225 }, { "epoch": 0.07443698793692619, "grad_norm": 2.7046058177948, "learning_rate": 9.914523999181456e-05, "loss": 2.3886, "step": 226 }, { "epoch": 0.07476635514018691, "grad_norm": 2.472142219543457, "learning_rate": 9.91355273814079e-05, "loss": 2.4031, "step": 227 }, { "epoch": 0.07509572234344765, "grad_norm": 2.1829640865325928, "learning_rate": 9.912576038141933e-05, "loss": 2.34, "step": 228 }, { "epoch": 0.07542508954670839, "grad_norm": 2.4181277751922607, "learning_rate": 9.911593900266026e-05, "loss": 2.1865, "step": 229 }, { "epoch": 0.07575445674996913, "grad_norm": 2.6987533569335938, "learning_rate": 9.910606325600223e-05, "loss": 2.3855, "step": 230 }, { "epoch": 0.07608382395322985, "grad_norm": 2.5802574157714844, "learning_rate": 9.909613315237702e-05, "loss": 2.3325, "step": 231 }, { "epoch": 0.07641319115649059, "grad_norm": 2.356382369995117, "learning_rate": 9.90861487027766e-05, "loss": 2.1446, "step": 232 }, { "epoch": 0.07674255835975133, "grad_norm": 2.3594143390655518, "learning_rate": 9.907610991825298e-05, "loss": 2.204, "step": 233 }, { "epoch": 0.07707192556301207, "grad_norm": 2.47929310798645, "learning_rate": 9.906601680991842e-05, "loss": 2.2276, "step": 234 }, { "epoch": 0.07740129276627279, "grad_norm": 3.100281000137329, "learning_rate": 9.905586938894531e-05, "loss": 2.3447, "step": 235 }, { "epoch": 0.07773065996953353, "grad_norm": 2.3578147888183594, "learning_rate": 9.904566766656612e-05, "loss": 2.1765, "step": 236 }, { "epoch": 0.07806002717279427, "grad_norm": 2.6452767848968506, "learning_rate": 9.903541165407341e-05, "loss": 2.2725, "step": 237 }, { "epoch": 0.07838939437605501, "grad_norm": 2.8530819416046143, "learning_rate": 9.902510136281989e-05, "loss": 2.1286, "step": 238 }, { "epoch": 0.07871876157931575, "grad_norm": 3.382469892501831, "learning_rate": 9.901473680421833e-05, "loss": 2.5109, "step": 239 }, { "epoch": 0.07904812878257647, "grad_norm": 2.7474164962768555, "learning_rate": 9.900431798974158e-05, "loss": 2.0808, "step": 240 }, { "epoch": 0.07937749598583721, "grad_norm": 2.889378070831299, "learning_rate": 9.899384493092252e-05, "loss": 2.6418, "step": 241 }, { "epoch": 0.07970686318909795, "grad_norm": 2.788848876953125, "learning_rate": 9.89833176393541e-05, "loss": 2.4061, "step": 242 }, { "epoch": 0.08003623039235869, "grad_norm": 2.540822982788086, "learning_rate": 9.897273612668927e-05, "loss": 1.9808, "step": 243 }, { "epoch": 0.08036559759561941, "grad_norm": 2.4531843662261963, "learning_rate": 9.896210040464105e-05, "loss": 1.9014, "step": 244 }, { "epoch": 0.08069496479888015, "grad_norm": 3.2541840076446533, "learning_rate": 9.895141048498244e-05, "loss": 2.5161, "step": 245 }, { "epoch": 0.08102433200214089, "grad_norm": 2.396268367767334, "learning_rate": 9.89406663795464e-05, "loss": 1.944, "step": 246 }, { "epoch": 0.08135369920540163, "grad_norm": 3.345994710922241, "learning_rate": 9.892986810022594e-05, "loss": 2.4834, "step": 247 }, { "epoch": 0.08168306640866235, "grad_norm": 2.90889048576355, "learning_rate": 9.891901565897397e-05, "loss": 2.041, "step": 248 }, { "epoch": 0.08201243361192309, "grad_norm": 3.424887180328369, "learning_rate": 9.89081090678034e-05, "loss": 2.1515, "step": 249 }, { "epoch": 0.08234180081518383, "grad_norm": 3.129890203475952, "learning_rate": 9.889714833878705e-05, "loss": 1.5253, "step": 250 }, { "epoch": 0.08267116801844457, "grad_norm": 2.033660411834717, "learning_rate": 9.888613348405766e-05, "loss": 2.4284, "step": 251 }, { "epoch": 0.0830005352217053, "grad_norm": 2.726742744445801, "learning_rate": 9.887506451580794e-05, "loss": 2.5739, "step": 252 }, { "epoch": 0.08332990242496603, "grad_norm": 2.1317944526672363, "learning_rate": 9.886394144629044e-05, "loss": 2.3195, "step": 253 }, { "epoch": 0.08365926962822677, "grad_norm": 2.2404332160949707, "learning_rate": 9.885276428781763e-05, "loss": 2.6881, "step": 254 }, { "epoch": 0.08398863683148751, "grad_norm": 2.376636028289795, "learning_rate": 9.88415330527618e-05, "loss": 2.2499, "step": 255 }, { "epoch": 0.08431800403474823, "grad_norm": 2.24052095413208, "learning_rate": 9.88302477535552e-05, "loss": 2.4867, "step": 256 }, { "epoch": 0.08464737123800897, "grad_norm": 2.6284337043762207, "learning_rate": 9.881890840268981e-05, "loss": 2.5267, "step": 257 }, { "epoch": 0.08497673844126971, "grad_norm": 2.5630033016204834, "learning_rate": 9.880751501271755e-05, "loss": 2.3627, "step": 258 }, { "epoch": 0.08530610564453045, "grad_norm": 2.3372888565063477, "learning_rate": 9.879606759625004e-05, "loss": 2.4922, "step": 259 }, { "epoch": 0.08563547284779117, "grad_norm": 2.3916890621185303, "learning_rate": 9.878456616595882e-05, "loss": 2.1065, "step": 260 }, { "epoch": 0.08596484005105191, "grad_norm": 2.6594431400299072, "learning_rate": 9.877301073457515e-05, "loss": 2.034, "step": 261 }, { "epoch": 0.08629420725431265, "grad_norm": 2.3828513622283936, "learning_rate": 9.876140131489008e-05, "loss": 1.9415, "step": 262 }, { "epoch": 0.08662357445757339, "grad_norm": 2.558377265930176, "learning_rate": 9.874973791975442e-05, "loss": 2.1253, "step": 263 }, { "epoch": 0.08695294166083413, "grad_norm": 2.41732120513916, "learning_rate": 9.873802056207872e-05, "loss": 2.4188, "step": 264 }, { "epoch": 0.08728230886409485, "grad_norm": 2.653940439224243, "learning_rate": 9.872624925483329e-05, "loss": 2.1664, "step": 265 }, { "epoch": 0.08761167606735559, "grad_norm": 2.6138205528259277, "learning_rate": 9.871442401104816e-05, "loss": 1.9422, "step": 266 }, { "epoch": 0.08794104327061633, "grad_norm": 2.4396393299102783, "learning_rate": 9.870254484381299e-05, "loss": 2.2988, "step": 267 }, { "epoch": 0.08827041047387707, "grad_norm": 3.269818067550659, "learning_rate": 9.869061176627724e-05, "loss": 2.4621, "step": 268 }, { "epoch": 0.0885997776771378, "grad_norm": 2.802405595779419, "learning_rate": 9.867862479164996e-05, "loss": 2.0724, "step": 269 }, { "epoch": 0.08892914488039853, "grad_norm": 3.132948160171509, "learning_rate": 9.866658393319988e-05, "loss": 2.3857, "step": 270 }, { "epoch": 0.08925851208365927, "grad_norm": 3.446258544921875, "learning_rate": 9.865448920425541e-05, "loss": 2.1038, "step": 271 }, { "epoch": 0.08958787928692001, "grad_norm": 3.5878865718841553, "learning_rate": 9.864234061820458e-05, "loss": 2.4794, "step": 272 }, { "epoch": 0.08991724649018074, "grad_norm": 2.7170724868774414, "learning_rate": 9.863013818849499e-05, "loss": 2.0187, "step": 273 }, { "epoch": 0.09024661369344147, "grad_norm": 3.4046213626861572, "learning_rate": 9.861788192863388e-05, "loss": 1.9167, "step": 274 }, { "epoch": 0.09057598089670221, "grad_norm": 4.091611385345459, "learning_rate": 9.860557185218808e-05, "loss": 2.203, "step": 275 }, { "epoch": 0.09090534809996295, "grad_norm": 1.9869258403778076, "learning_rate": 9.859320797278397e-05, "loss": 2.4434, "step": 276 }, { "epoch": 0.09123471530322368, "grad_norm": 2.5179264545440674, "learning_rate": 9.85807903041075e-05, "loss": 2.2607, "step": 277 }, { "epoch": 0.09156408250648441, "grad_norm": 2.7460920810699463, "learning_rate": 9.856831885990416e-05, "loss": 2.5456, "step": 278 }, { "epoch": 0.09189344970974515, "grad_norm": 2.803849458694458, "learning_rate": 9.855579365397898e-05, "loss": 2.5656, "step": 279 }, { "epoch": 0.09222281691300589, "grad_norm": 2.4078571796417236, "learning_rate": 9.854321470019646e-05, "loss": 2.4701, "step": 280 }, { "epoch": 0.09255218411626662, "grad_norm": 2.735297679901123, "learning_rate": 9.853058201248063e-05, "loss": 2.4882, "step": 281 }, { "epoch": 0.09288155131952736, "grad_norm": 2.441884756088257, "learning_rate": 9.851789560481499e-05, "loss": 2.2741, "step": 282 }, { "epoch": 0.0932109185227881, "grad_norm": 2.480804681777954, "learning_rate": 9.85051554912425e-05, "loss": 2.2021, "step": 283 }, { "epoch": 0.09354028572604883, "grad_norm": 2.59104585647583, "learning_rate": 9.849236168586558e-05, "loss": 2.7452, "step": 284 }, { "epoch": 0.09386965292930956, "grad_norm": 2.392718553543091, "learning_rate": 9.847951420284605e-05, "loss": 2.231, "step": 285 }, { "epoch": 0.0941990201325703, "grad_norm": 2.880892753601074, "learning_rate": 9.84666130564052e-05, "loss": 2.5514, "step": 286 }, { "epoch": 0.09452838733583104, "grad_norm": 2.457305431365967, "learning_rate": 9.845365826082368e-05, "loss": 2.261, "step": 287 }, { "epoch": 0.09485775453909177, "grad_norm": 2.8255691528320312, "learning_rate": 9.844064983044157e-05, "loss": 2.4296, "step": 288 }, { "epoch": 0.0951871217423525, "grad_norm": 2.97965407371521, "learning_rate": 9.842758777965824e-05, "loss": 2.3662, "step": 289 }, { "epoch": 0.09551648894561324, "grad_norm": 2.454676389694214, "learning_rate": 9.841447212293249e-05, "loss": 2.1213, "step": 290 }, { "epoch": 0.09584585614887398, "grad_norm": 3.022413492202759, "learning_rate": 9.840130287478245e-05, "loss": 2.4408, "step": 291 }, { "epoch": 0.09617522335213471, "grad_norm": 3.0308666229248047, "learning_rate": 9.83880800497855e-05, "loss": 2.5066, "step": 292 }, { "epoch": 0.09650459055539545, "grad_norm": 2.674705743789673, "learning_rate": 9.837480366257844e-05, "loss": 2.2084, "step": 293 }, { "epoch": 0.09683395775865618, "grad_norm": 2.988152503967285, "learning_rate": 9.836147372785726e-05, "loss": 2.2093, "step": 294 }, { "epoch": 0.09716332496191692, "grad_norm": 2.502009868621826, "learning_rate": 9.834809026037728e-05, "loss": 2.259, "step": 295 }, { "epoch": 0.09749269216517766, "grad_norm": 2.590487241744995, "learning_rate": 9.833465327495306e-05, "loss": 2.1714, "step": 296 }, { "epoch": 0.0978220593684384, "grad_norm": 2.34224534034729, "learning_rate": 9.83211627864584e-05, "loss": 1.7337, "step": 297 }, { "epoch": 0.09815142657169912, "grad_norm": 2.985863447189331, "learning_rate": 9.83076188098263e-05, "loss": 2.0128, "step": 298 }, { "epoch": 0.09848079377495986, "grad_norm": 2.8273167610168457, "learning_rate": 9.829402136004904e-05, "loss": 2.0277, "step": 299 }, { "epoch": 0.0988101609782206, "grad_norm": 3.533780813217163, "learning_rate": 9.8280370452178e-05, "loss": 2.0994, "step": 300 }, { "epoch": 0.09913952818148133, "grad_norm": 2.0469958782196045, "learning_rate": 9.82666661013238e-05, "loss": 2.4583, "step": 301 }, { "epoch": 0.09946889538474206, "grad_norm": 2.3586039543151855, "learning_rate": 9.825290832265617e-05, "loss": 2.4677, "step": 302 }, { "epoch": 0.0997982625880028, "grad_norm": 2.269946813583374, "learning_rate": 9.823909713140404e-05, "loss": 2.3393, "step": 303 }, { "epoch": 0.10012762979126354, "grad_norm": 2.338125705718994, "learning_rate": 9.82252325428554e-05, "loss": 2.4787, "step": 304 }, { "epoch": 0.10045699699452428, "grad_norm": 2.1585426330566406, "learning_rate": 9.821131457235739e-05, "loss": 2.5393, "step": 305 }, { "epoch": 0.100786364197785, "grad_norm": 2.6568026542663574, "learning_rate": 9.819734323531624e-05, "loss": 2.5194, "step": 306 }, { "epoch": 0.10111573140104574, "grad_norm": 2.206839084625244, "learning_rate": 9.818331854719722e-05, "loss": 2.4154, "step": 307 }, { "epoch": 0.10144509860430648, "grad_norm": 2.444082498550415, "learning_rate": 9.816924052352468e-05, "loss": 2.2583, "step": 308 }, { "epoch": 0.10177446580756722, "grad_norm": 2.4031789302825928, "learning_rate": 9.815510917988202e-05, "loss": 2.5014, "step": 309 }, { "epoch": 0.10210383301082794, "grad_norm": 2.008598566055298, "learning_rate": 9.814092453191163e-05, "loss": 2.1755, "step": 310 }, { "epoch": 0.10243320021408868, "grad_norm": 2.6430673599243164, "learning_rate": 9.812668659531492e-05, "loss": 2.4391, "step": 311 }, { "epoch": 0.10276256741734942, "grad_norm": 2.4818711280822754, "learning_rate": 9.811239538585229e-05, "loss": 2.2518, "step": 312 }, { "epoch": 0.10309193462061016, "grad_norm": 2.733666181564331, "learning_rate": 9.80980509193431e-05, "loss": 2.3118, "step": 313 }, { "epoch": 0.10342130182387088, "grad_norm": 2.3446598052978516, "learning_rate": 9.808365321166568e-05, "loss": 2.3457, "step": 314 }, { "epoch": 0.10375066902713162, "grad_norm": 2.2961266040802, "learning_rate": 9.806920227875729e-05, "loss": 1.9795, "step": 315 }, { "epoch": 0.10408003623039236, "grad_norm": 2.767897844314575, "learning_rate": 9.805469813661408e-05, "loss": 2.2274, "step": 316 }, { "epoch": 0.1044094034336531, "grad_norm": 2.9815187454223633, "learning_rate": 9.804014080129111e-05, "loss": 2.4279, "step": 317 }, { "epoch": 0.10473877063691384, "grad_norm": 2.3294548988342285, "learning_rate": 9.802553028890237e-05, "loss": 1.9681, "step": 318 }, { "epoch": 0.10506813784017456, "grad_norm": 3.110809564590454, "learning_rate": 9.801086661562062e-05, "loss": 2.3353, "step": 319 }, { "epoch": 0.1053975050434353, "grad_norm": 2.6092398166656494, "learning_rate": 9.799614979767757e-05, "loss": 2.1682, "step": 320 }, { "epoch": 0.10572687224669604, "grad_norm": 3.033212184906006, "learning_rate": 9.798137985136367e-05, "loss": 1.9523, "step": 321 }, { "epoch": 0.10605623944995678, "grad_norm": 2.9443624019622803, "learning_rate": 9.79665567930282e-05, "loss": 2.1126, "step": 322 }, { "epoch": 0.1063856066532175, "grad_norm": 3.2337043285369873, "learning_rate": 9.795168063907929e-05, "loss": 2.1455, "step": 323 }, { "epoch": 0.10671497385647824, "grad_norm": 2.5773916244506836, "learning_rate": 9.793675140598377e-05, "loss": 1.7072, "step": 324 }, { "epoch": 0.10704434105973898, "grad_norm": 3.643908977508545, "learning_rate": 9.792176911026727e-05, "loss": 2.2543, "step": 325 }, { "epoch": 0.10737370826299972, "grad_norm": 1.8910375833511353, "learning_rate": 9.790673376851414e-05, "loss": 2.5806, "step": 326 }, { "epoch": 0.10770307546626044, "grad_norm": 2.3565833568573, "learning_rate": 9.789164539736741e-05, "loss": 2.4109, "step": 327 }, { "epoch": 0.10803244266952118, "grad_norm": 2.3572747707366943, "learning_rate": 9.78765040135289e-05, "loss": 2.3321, "step": 328 }, { "epoch": 0.10836180987278192, "grad_norm": 2.759070873260498, "learning_rate": 9.786130963375904e-05, "loss": 2.4526, "step": 329 }, { "epoch": 0.10869117707604266, "grad_norm": 2.419929027557373, "learning_rate": 9.784606227487693e-05, "loss": 2.1635, "step": 330 }, { "epoch": 0.10902054427930338, "grad_norm": 2.4334778785705566, "learning_rate": 9.783076195376036e-05, "loss": 2.5785, "step": 331 }, { "epoch": 0.10934991148256412, "grad_norm": 2.6062378883361816, "learning_rate": 9.781540868734567e-05, "loss": 2.4172, "step": 332 }, { "epoch": 0.10967927868582486, "grad_norm": 2.5069472789764404, "learning_rate": 9.780000249262787e-05, "loss": 2.3606, "step": 333 }, { "epoch": 0.1100086458890856, "grad_norm": 2.3731164932250977, "learning_rate": 9.778454338666053e-05, "loss": 2.3024, "step": 334 }, { "epoch": 0.11033801309234632, "grad_norm": 2.8083693981170654, "learning_rate": 9.776903138655581e-05, "loss": 2.426, "step": 335 }, { "epoch": 0.11066738029560706, "grad_norm": 2.4966108798980713, "learning_rate": 9.775346650948439e-05, "loss": 2.2338, "step": 336 }, { "epoch": 0.1109967474988678, "grad_norm": 2.9775607585906982, "learning_rate": 9.77378487726755e-05, "loss": 2.7963, "step": 337 }, { "epoch": 0.11132611470212854, "grad_norm": 2.692918062210083, "learning_rate": 9.77221781934169e-05, "loss": 2.4771, "step": 338 }, { "epoch": 0.11165548190538926, "grad_norm": 2.5929603576660156, "learning_rate": 9.770645478905481e-05, "loss": 2.4814, "step": 339 }, { "epoch": 0.11198484910865, "grad_norm": 2.6808626651763916, "learning_rate": 9.76906785769939e-05, "loss": 2.1842, "step": 340 }, { "epoch": 0.11231421631191074, "grad_norm": 2.846548080444336, "learning_rate": 9.767484957469739e-05, "loss": 2.1864, "step": 341 }, { "epoch": 0.11264358351517148, "grad_norm": 2.9507384300231934, "learning_rate": 9.765896779968685e-05, "loss": 2.3369, "step": 342 }, { "epoch": 0.11297295071843222, "grad_norm": 3.360044002532959, "learning_rate": 9.764303326954226e-05, "loss": 2.2568, "step": 343 }, { "epoch": 0.11330231792169294, "grad_norm": 2.5054543018341064, "learning_rate": 9.762704600190207e-05, "loss": 1.8953, "step": 344 }, { "epoch": 0.11363168512495368, "grad_norm": 3.037480592727661, "learning_rate": 9.761100601446304e-05, "loss": 2.4476, "step": 345 }, { "epoch": 0.11396105232821442, "grad_norm": 3.0009777545928955, "learning_rate": 9.759491332498032e-05, "loss": 2.3728, "step": 346 }, { "epoch": 0.11429041953147516, "grad_norm": 2.856393814086914, "learning_rate": 9.757876795126739e-05, "loss": 1.9786, "step": 347 }, { "epoch": 0.11461978673473588, "grad_norm": 3.3026115894317627, "learning_rate": 9.756256991119603e-05, "loss": 2.6015, "step": 348 }, { "epoch": 0.11494915393799662, "grad_norm": 3.5809285640716553, "learning_rate": 9.754631922269636e-05, "loss": 2.307, "step": 349 }, { "epoch": 0.11527852114125736, "grad_norm": 3.11942195892334, "learning_rate": 9.753001590375674e-05, "loss": 2.0157, "step": 350 }, { "epoch": 0.1156078883445181, "grad_norm": 2.313547372817993, "learning_rate": 9.75136599724238e-05, "loss": 2.4783, "step": 351 }, { "epoch": 0.11593725554777883, "grad_norm": 2.4683542251586914, "learning_rate": 9.74972514468024e-05, "loss": 2.4294, "step": 352 }, { "epoch": 0.11626662275103956, "grad_norm": 2.895655632019043, "learning_rate": 9.748079034505565e-05, "loss": 2.6217, "step": 353 }, { "epoch": 0.1165959899543003, "grad_norm": 2.646327495574951, "learning_rate": 9.746427668540481e-05, "loss": 2.5583, "step": 354 }, { "epoch": 0.11692535715756104, "grad_norm": 2.4664547443389893, "learning_rate": 9.744771048612935e-05, "loss": 2.4272, "step": 355 }, { "epoch": 0.11725472436082177, "grad_norm": 2.5208280086517334, "learning_rate": 9.743109176556689e-05, "loss": 2.7008, "step": 356 }, { "epoch": 0.1175840915640825, "grad_norm": 2.5895395278930664, "learning_rate": 9.741442054211319e-05, "loss": 2.4581, "step": 357 }, { "epoch": 0.11791345876734324, "grad_norm": 2.3770651817321777, "learning_rate": 9.739769683422214e-05, "loss": 2.2697, "step": 358 }, { "epoch": 0.11824282597060398, "grad_norm": 2.263289451599121, "learning_rate": 9.738092066040568e-05, "loss": 2.1457, "step": 359 }, { "epoch": 0.1185721931738647, "grad_norm": 3.5201501846313477, "learning_rate": 9.736409203923388e-05, "loss": 2.4604, "step": 360 }, { "epoch": 0.11890156037712545, "grad_norm": 2.3830199241638184, "learning_rate": 9.734721098933484e-05, "loss": 2.0263, "step": 361 }, { "epoch": 0.11923092758038618, "grad_norm": 2.431260347366333, "learning_rate": 9.73302775293947e-05, "loss": 2.2848, "step": 362 }, { "epoch": 0.11956029478364692, "grad_norm": 2.4053165912628174, "learning_rate": 9.73132916781576e-05, "loss": 2.0885, "step": 363 }, { "epoch": 0.11988966198690765, "grad_norm": 2.584679126739502, "learning_rate": 9.72962534544257e-05, "loss": 2.2592, "step": 364 }, { "epoch": 0.12021902919016839, "grad_norm": 2.6707746982574463, "learning_rate": 9.727916287705912e-05, "loss": 2.135, "step": 365 }, { "epoch": 0.12054839639342912, "grad_norm": 2.802182197570801, "learning_rate": 9.72620199649759e-05, "loss": 2.6641, "step": 366 }, { "epoch": 0.12087776359668986, "grad_norm": 3.923982620239258, "learning_rate": 9.724482473715207e-05, "loss": 2.3298, "step": 367 }, { "epoch": 0.12120713079995059, "grad_norm": 2.7437329292297363, "learning_rate": 9.722757721262154e-05, "loss": 2.2673, "step": 368 }, { "epoch": 0.12153649800321133, "grad_norm": 2.640639543533325, "learning_rate": 9.72102774104761e-05, "loss": 2.2149, "step": 369 }, { "epoch": 0.12186586520647207, "grad_norm": 3.37756609916687, "learning_rate": 9.719292534986543e-05, "loss": 2.0474, "step": 370 }, { "epoch": 0.1221952324097328, "grad_norm": 2.524691581726074, "learning_rate": 9.717552104999703e-05, "loss": 2.0271, "step": 371 }, { "epoch": 0.12252459961299354, "grad_norm": 2.769646406173706, "learning_rate": 9.715806453013625e-05, "loss": 2.0452, "step": 372 }, { "epoch": 0.12285396681625427, "grad_norm": 2.9961395263671875, "learning_rate": 9.714055580960622e-05, "loss": 1.9754, "step": 373 }, { "epoch": 0.123183334019515, "grad_norm": 3.112914800643921, "learning_rate": 9.712299490778786e-05, "loss": 2.1425, "step": 374 }, { "epoch": 0.12351270122277574, "grad_norm": 3.4256157875061035, "learning_rate": 9.710538184411991e-05, "loss": 1.9117, "step": 375 }, { "epoch": 0.12384206842603648, "grad_norm": 1.9335092306137085, "learning_rate": 9.708771663809872e-05, "loss": 2.4769, "step": 376 }, { "epoch": 0.12417143562929721, "grad_norm": 2.1666107177734375, "learning_rate": 9.706999930927848e-05, "loss": 2.2265, "step": 377 }, { "epoch": 0.12450080283255795, "grad_norm": 2.2735953330993652, "learning_rate": 9.7052229877271e-05, "loss": 2.4031, "step": 378 }, { "epoch": 0.12483017003581869, "grad_norm": 2.531383514404297, "learning_rate": 9.703440836174583e-05, "loss": 2.4251, "step": 379 }, { "epoch": 0.1251595372390794, "grad_norm": 2.608523368835449, "learning_rate": 9.701653478243013e-05, "loss": 2.4985, "step": 380 }, { "epoch": 0.12548890444234015, "grad_norm": 2.4094133377075195, "learning_rate": 9.699860915910868e-05, "loss": 1.9842, "step": 381 }, { "epoch": 0.1258182716456009, "grad_norm": 2.278822183609009, "learning_rate": 9.698063151162389e-05, "loss": 2.0549, "step": 382 }, { "epoch": 0.12614763884886163, "grad_norm": 2.65533709526062, "learning_rate": 9.696260185987576e-05, "loss": 2.4869, "step": 383 }, { "epoch": 0.12647700605212236, "grad_norm": 2.8089699745178223, "learning_rate": 9.694452022382186e-05, "loss": 2.3468, "step": 384 }, { "epoch": 0.1268063732553831, "grad_norm": 2.4362573623657227, "learning_rate": 9.692638662347728e-05, "loss": 2.5076, "step": 385 }, { "epoch": 0.12713574045864384, "grad_norm": 3.138957977294922, "learning_rate": 9.690820107891466e-05, "loss": 2.3684, "step": 386 }, { "epoch": 0.12746510766190455, "grad_norm": 3.3768868446350098, "learning_rate": 9.68899636102641e-05, "loss": 2.7286, "step": 387 }, { "epoch": 0.1277944748651653, "grad_norm": 2.5961859226226807, "learning_rate": 9.68716742377132e-05, "loss": 2.1429, "step": 388 }, { "epoch": 0.12812384206842603, "grad_norm": 2.7435123920440674, "learning_rate": 9.685333298150702e-05, "loss": 2.2378, "step": 389 }, { "epoch": 0.12845320927168677, "grad_norm": 2.207853317260742, "learning_rate": 9.683493986194808e-05, "loss": 2.0057, "step": 390 }, { "epoch": 0.1287825764749475, "grad_norm": 3.450223922729492, "learning_rate": 9.681649489939619e-05, "loss": 2.5243, "step": 391 }, { "epoch": 0.12911194367820825, "grad_norm": 2.479057788848877, "learning_rate": 9.67979981142687e-05, "loss": 1.9977, "step": 392 }, { "epoch": 0.12944131088146899, "grad_norm": 2.4462480545043945, "learning_rate": 9.677944952704023e-05, "loss": 1.9085, "step": 393 }, { "epoch": 0.12977067808472972, "grad_norm": 3.183197021484375, "learning_rate": 9.676084915824276e-05, "loss": 2.6043, "step": 394 }, { "epoch": 0.13010004528799043, "grad_norm": 2.802366256713867, "learning_rate": 9.674219702846561e-05, "loss": 2.3849, "step": 395 }, { "epoch": 0.13042941249125117, "grad_norm": 3.248178482055664, "learning_rate": 9.672349315835535e-05, "loss": 2.3824, "step": 396 }, { "epoch": 0.1307587796945119, "grad_norm": 2.677258253097534, "learning_rate": 9.670473756861588e-05, "loss": 1.8382, "step": 397 }, { "epoch": 0.13108814689777265, "grad_norm": 2.8905255794525146, "learning_rate": 9.668593028000831e-05, "loss": 1.9997, "step": 398 }, { "epoch": 0.1314175141010334, "grad_norm": 2.8859400749206543, "learning_rate": 9.6667071313351e-05, "loss": 1.904, "step": 399 }, { "epoch": 0.13174688130429413, "grad_norm": 3.3241395950317383, "learning_rate": 9.664816068951947e-05, "loss": 1.7025, "step": 400 }, { "epoch": 0.13207624850755487, "grad_norm": 2.0233919620513916, "learning_rate": 9.662919842944651e-05, "loss": 2.3296, "step": 401 }, { "epoch": 0.1324056157108156, "grad_norm": 2.4507946968078613, "learning_rate": 9.661018455412197e-05, "loss": 2.431, "step": 402 }, { "epoch": 0.13273498291407634, "grad_norm": 2.341461658477783, "learning_rate": 9.659111908459288e-05, "loss": 2.5369, "step": 403 }, { "epoch": 0.13306435011733705, "grad_norm": 2.3247382640838623, "learning_rate": 9.657200204196337e-05, "loss": 2.3336, "step": 404 }, { "epoch": 0.1333937173205978, "grad_norm": 2.6917145252227783, "learning_rate": 9.65528334473947e-05, "loss": 2.5245, "step": 405 }, { "epoch": 0.13372308452385853, "grad_norm": 2.5759096145629883, "learning_rate": 9.653361332210513e-05, "loss": 2.5367, "step": 406 }, { "epoch": 0.13405245172711927, "grad_norm": 3.123220682144165, "learning_rate": 9.651434168737e-05, "loss": 2.3026, "step": 407 }, { "epoch": 0.13438181893038, "grad_norm": 1.9661033153533936, "learning_rate": 9.649501856452165e-05, "loss": 1.9149, "step": 408 }, { "epoch": 0.13471118613364075, "grad_norm": 2.2201578617095947, "learning_rate": 9.647564397494944e-05, "loss": 2.4501, "step": 409 }, { "epoch": 0.1350405533369015, "grad_norm": 2.485238790512085, "learning_rate": 9.645621794009967e-05, "loss": 2.3879, "step": 410 }, { "epoch": 0.13536992054016223, "grad_norm": 2.429931163787842, "learning_rate": 9.643674048147558e-05, "loss": 2.3842, "step": 411 }, { "epoch": 0.13569928774342294, "grad_norm": 2.674360513687134, "learning_rate": 9.641721162063739e-05, "loss": 2.2783, "step": 412 }, { "epoch": 0.13602865494668367, "grad_norm": 2.56316876411438, "learning_rate": 9.639763137920214e-05, "loss": 2.2338, "step": 413 }, { "epoch": 0.1363580221499444, "grad_norm": 2.6116116046905518, "learning_rate": 9.637799977884381e-05, "loss": 2.2164, "step": 414 }, { "epoch": 0.13668738935320515, "grad_norm": 2.379546880722046, "learning_rate": 9.635831684129318e-05, "loss": 1.8579, "step": 415 }, { "epoch": 0.1370167565564659, "grad_norm": 2.250623941421509, "learning_rate": 9.63385825883379e-05, "loss": 1.9759, "step": 416 }, { "epoch": 0.13734612375972663, "grad_norm": 2.5346574783325195, "learning_rate": 9.63187970418224e-05, "loss": 2.1965, "step": 417 }, { "epoch": 0.13767549096298737, "grad_norm": 2.7046220302581787, "learning_rate": 9.62989602236479e-05, "loss": 2.207, "step": 418 }, { "epoch": 0.1380048581662481, "grad_norm": 2.8171615600585938, "learning_rate": 9.627907215577236e-05, "loss": 2.1648, "step": 419 }, { "epoch": 0.13833422536950882, "grad_norm": 2.6681857109069824, "learning_rate": 9.625913286021046e-05, "loss": 2.2677, "step": 420 }, { "epoch": 0.13866359257276956, "grad_norm": 2.8763694763183594, "learning_rate": 9.623914235903362e-05, "loss": 1.9661, "step": 421 }, { "epoch": 0.1389929597760303, "grad_norm": 2.8630616664886475, "learning_rate": 9.621910067436992e-05, "loss": 2.1464, "step": 422 }, { "epoch": 0.13932232697929103, "grad_norm": 3.0975148677825928, "learning_rate": 9.61990078284041e-05, "loss": 2.1959, "step": 423 }, { "epoch": 0.13965169418255177, "grad_norm": 3.8912901878356934, "learning_rate": 9.617886384337751e-05, "loss": 2.1552, "step": 424 }, { "epoch": 0.1399810613858125, "grad_norm": 3.1637001037597656, "learning_rate": 9.615866874158816e-05, "loss": 1.8698, "step": 425 }, { "epoch": 0.14031042858907325, "grad_norm": 2.0410444736480713, "learning_rate": 9.613842254539058e-05, "loss": 2.6313, "step": 426 }, { "epoch": 0.140639795792334, "grad_norm": 2.081902503967285, "learning_rate": 9.611812527719593e-05, "loss": 2.444, "step": 427 }, { "epoch": 0.14096916299559473, "grad_norm": 2.232922077178955, "learning_rate": 9.609777695947182e-05, "loss": 2.1034, "step": 428 }, { "epoch": 0.14129853019885544, "grad_norm": 2.2757458686828613, "learning_rate": 9.607737761474242e-05, "loss": 2.5483, "step": 429 }, { "epoch": 0.14162789740211618, "grad_norm": 2.3721635341644287, "learning_rate": 9.60569272655884e-05, "loss": 2.6453, "step": 430 }, { "epoch": 0.14195726460537691, "grad_norm": 2.366621971130371, "learning_rate": 9.603642593464683e-05, "loss": 2.3196, "step": 431 }, { "epoch": 0.14228663180863765, "grad_norm": 2.4534189701080322, "learning_rate": 9.601587364461127e-05, "loss": 2.4592, "step": 432 }, { "epoch": 0.1426159990118984, "grad_norm": 2.350576639175415, "learning_rate": 9.599527041823164e-05, "loss": 2.0809, "step": 433 }, { "epoch": 0.14294536621515913, "grad_norm": 2.2983639240264893, "learning_rate": 9.59746162783143e-05, "loss": 2.1881, "step": 434 }, { "epoch": 0.14327473341841987, "grad_norm": 2.5199313163757324, "learning_rate": 9.595391124772189e-05, "loss": 2.2941, "step": 435 }, { "epoch": 0.1436041006216806, "grad_norm": 2.4015860557556152, "learning_rate": 9.593315534937345e-05, "loss": 2.2748, "step": 436 }, { "epoch": 0.14393346782494132, "grad_norm": 2.675955057144165, "learning_rate": 9.591234860624431e-05, "loss": 2.1505, "step": 437 }, { "epoch": 0.14426283502820206, "grad_norm": 2.4210708141326904, "learning_rate": 9.589149104136605e-05, "loss": 2.2871, "step": 438 }, { "epoch": 0.1445922022314628, "grad_norm": 2.417851686477661, "learning_rate": 9.587058267782656e-05, "loss": 2.1308, "step": 439 }, { "epoch": 0.14492156943472354, "grad_norm": 2.456392526626587, "learning_rate": 9.584962353876992e-05, "loss": 1.8146, "step": 440 }, { "epoch": 0.14525093663798427, "grad_norm": 2.4803600311279297, "learning_rate": 9.582861364739642e-05, "loss": 2.2325, "step": 441 }, { "epoch": 0.145580303841245, "grad_norm": 2.4525747299194336, "learning_rate": 9.580755302696256e-05, "loss": 2.0824, "step": 442 }, { "epoch": 0.14590967104450575, "grad_norm": 3.1618423461914062, "learning_rate": 9.578644170078093e-05, "loss": 2.3309, "step": 443 }, { "epoch": 0.1462390382477665, "grad_norm": 3.0864148139953613, "learning_rate": 9.576527969222031e-05, "loss": 2.2085, "step": 444 }, { "epoch": 0.1465684054510272, "grad_norm": 2.7343194484710693, "learning_rate": 9.574406702470558e-05, "loss": 2.2462, "step": 445 }, { "epoch": 0.14689777265428794, "grad_norm": 2.57436466217041, "learning_rate": 9.572280372171763e-05, "loss": 2.0448, "step": 446 }, { "epoch": 0.14722713985754868, "grad_norm": 2.608705759048462, "learning_rate": 9.570148980679347e-05, "loss": 1.9546, "step": 447 }, { "epoch": 0.14755650706080942, "grad_norm": 2.5100154876708984, "learning_rate": 9.56801253035261e-05, "loss": 1.6897, "step": 448 }, { "epoch": 0.14788587426407016, "grad_norm": 3.806117296218872, "learning_rate": 9.565871023556455e-05, "loss": 1.8645, "step": 449 }, { "epoch": 0.1482152414673309, "grad_norm": 3.5076465606689453, "learning_rate": 9.563724462661376e-05, "loss": 2.2529, "step": 450 }, { "epoch": 0.14854460867059163, "grad_norm": 2.111990451812744, "learning_rate": 9.561572850043467e-05, "loss": 2.5226, "step": 451 }, { "epoch": 0.14887397587385237, "grad_norm": 2.713508367538452, "learning_rate": 9.559416188084416e-05, "loss": 2.548, "step": 452 }, { "epoch": 0.1492033430771131, "grad_norm": 2.7908146381378174, "learning_rate": 9.557254479171489e-05, "loss": 2.8369, "step": 453 }, { "epoch": 0.14953271028037382, "grad_norm": 2.3136017322540283, "learning_rate": 9.555087725697554e-05, "loss": 2.3652, "step": 454 }, { "epoch": 0.14986207748363456, "grad_norm": 2.1749308109283447, "learning_rate": 9.552915930061048e-05, "loss": 2.2855, "step": 455 }, { "epoch": 0.1501914446868953, "grad_norm": 2.302049398422241, "learning_rate": 9.550739094666002e-05, "loss": 2.3987, "step": 456 }, { "epoch": 0.15052081189015604, "grad_norm": 2.3191065788269043, "learning_rate": 9.548557221922017e-05, "loss": 2.4523, "step": 457 }, { "epoch": 0.15085017909341678, "grad_norm": 2.958138942718506, "learning_rate": 9.546370314244273e-05, "loss": 2.2964, "step": 458 }, { "epoch": 0.1511795462966775, "grad_norm": 2.8905577659606934, "learning_rate": 9.544178374053524e-05, "loss": 2.6665, "step": 459 }, { "epoch": 0.15150891349993825, "grad_norm": 2.6026418209075928, "learning_rate": 9.541981403776095e-05, "loss": 2.5692, "step": 460 }, { "epoch": 0.151838280703199, "grad_norm": 2.7903120517730713, "learning_rate": 9.539779405843876e-05, "loss": 2.439, "step": 461 }, { "epoch": 0.1521676479064597, "grad_norm": 3.0123376846313477, "learning_rate": 9.537572382694328e-05, "loss": 2.592, "step": 462 }, { "epoch": 0.15249701510972044, "grad_norm": 2.3931968212127686, "learning_rate": 9.535360336770467e-05, "loss": 2.2706, "step": 463 }, { "epoch": 0.15282638231298118, "grad_norm": 2.501070022583008, "learning_rate": 9.533143270520873e-05, "loss": 1.8545, "step": 464 }, { "epoch": 0.15315574951624192, "grad_norm": 2.836297035217285, "learning_rate": 9.530921186399684e-05, "loss": 2.3613, "step": 465 }, { "epoch": 0.15348511671950266, "grad_norm": 2.518571138381958, "learning_rate": 9.528694086866592e-05, "loss": 2.1096, "step": 466 }, { "epoch": 0.1538144839227634, "grad_norm": 2.5794365406036377, "learning_rate": 9.526461974386838e-05, "loss": 2.0714, "step": 467 }, { "epoch": 0.15414385112602413, "grad_norm": 2.955522060394287, "learning_rate": 9.524224851431214e-05, "loss": 2.1713, "step": 468 }, { "epoch": 0.15447321832928487, "grad_norm": 3.465235948562622, "learning_rate": 9.521982720476062e-05, "loss": 2.2217, "step": 469 }, { "epoch": 0.15480258553254558, "grad_norm": 2.497987985610962, "learning_rate": 9.519735584003257e-05, "loss": 1.9994, "step": 470 }, { "epoch": 0.15513195273580632, "grad_norm": 2.911043643951416, "learning_rate": 9.517483444500228e-05, "loss": 1.9883, "step": 471 }, { "epoch": 0.15546131993906706, "grad_norm": 2.7313320636749268, "learning_rate": 9.51522630445993e-05, "loss": 2.1227, "step": 472 }, { "epoch": 0.1557906871423278, "grad_norm": 3.4482412338256836, "learning_rate": 9.512964166380864e-05, "loss": 2.2148, "step": 473 }, { "epoch": 0.15612005434558854, "grad_norm": 2.664477825164795, "learning_rate": 9.510697032767053e-05, "loss": 1.7443, "step": 474 }, { "epoch": 0.15644942154884928, "grad_norm": 2.8564321994781494, "learning_rate": 9.508424906128058e-05, "loss": 1.9044, "step": 475 }, { "epoch": 0.15677878875211002, "grad_norm": 1.97623872756958, "learning_rate": 9.506147788978965e-05, "loss": 2.3548, "step": 476 }, { "epoch": 0.15710815595537075, "grad_norm": 2.3602867126464844, "learning_rate": 9.503865683840378e-05, "loss": 2.5651, "step": 477 }, { "epoch": 0.1574375231586315, "grad_norm": 2.1931259632110596, "learning_rate": 9.501578593238432e-05, "loss": 2.1644, "step": 478 }, { "epoch": 0.1577668903618922, "grad_norm": 2.284613609313965, "learning_rate": 9.499286519704773e-05, "loss": 2.2849, "step": 479 }, { "epoch": 0.15809625756515294, "grad_norm": 2.498391628265381, "learning_rate": 9.49698946577657e-05, "loss": 2.5516, "step": 480 }, { "epoch": 0.15842562476841368, "grad_norm": 2.4198145866394043, "learning_rate": 9.494687433996493e-05, "loss": 2.194, "step": 481 }, { "epoch": 0.15875499197167442, "grad_norm": 2.404466390609741, "learning_rate": 9.492380426912737e-05, "loss": 2.3409, "step": 482 }, { "epoch": 0.15908435917493516, "grad_norm": 2.326627254486084, "learning_rate": 9.490068447078992e-05, "loss": 2.5327, "step": 483 }, { "epoch": 0.1594137263781959, "grad_norm": 2.2029290199279785, "learning_rate": 9.487751497054461e-05, "loss": 2.3595, "step": 484 }, { "epoch": 0.15974309358145664, "grad_norm": 2.798891544342041, "learning_rate": 9.485429579403843e-05, "loss": 2.5281, "step": 485 }, { "epoch": 0.16007246078471737, "grad_norm": 2.421884775161743, "learning_rate": 9.483102696697339e-05, "loss": 2.3762, "step": 486 }, { "epoch": 0.16040182798797809, "grad_norm": 2.618962049484253, "learning_rate": 9.480770851510644e-05, "loss": 2.5659, "step": 487 }, { "epoch": 0.16073119519123882, "grad_norm": 2.394176721572876, "learning_rate": 9.478434046424948e-05, "loss": 2.0389, "step": 488 }, { "epoch": 0.16106056239449956, "grad_norm": 2.6122007369995117, "learning_rate": 9.47609228402693e-05, "loss": 2.2712, "step": 489 }, { "epoch": 0.1613899295977603, "grad_norm": 3.2165706157684326, "learning_rate": 9.473745566908756e-05, "loss": 2.2492, "step": 490 }, { "epoch": 0.16171929680102104, "grad_norm": 2.527129650115967, "learning_rate": 9.471393897668078e-05, "loss": 2.4401, "step": 491 }, { "epoch": 0.16204866400428178, "grad_norm": 2.758704900741577, "learning_rate": 9.469037278908029e-05, "loss": 2.2011, "step": 492 }, { "epoch": 0.16237803120754252, "grad_norm": 2.8025074005126953, "learning_rate": 9.46667571323722e-05, "loss": 2.0435, "step": 493 }, { "epoch": 0.16270739841080326, "grad_norm": 2.798211097717285, "learning_rate": 9.464309203269739e-05, "loss": 2.2279, "step": 494 }, { "epoch": 0.16303676561406397, "grad_norm": 2.942800521850586, "learning_rate": 9.461937751625145e-05, "loss": 2.082, "step": 495 }, { "epoch": 0.1633661328173247, "grad_norm": 2.7005648612976074, "learning_rate": 9.459561360928472e-05, "loss": 2.2156, "step": 496 }, { "epoch": 0.16369550002058544, "grad_norm": 2.9613757133483887, "learning_rate": 9.457180033810216e-05, "loss": 2.3678, "step": 497 }, { "epoch": 0.16402486722384618, "grad_norm": 2.888354539871216, "learning_rate": 9.454793772906336e-05, "loss": 1.9226, "step": 498 }, { "epoch": 0.16435423442710692, "grad_norm": 3.021557331085205, "learning_rate": 9.452402580858261e-05, "loss": 2.1693, "step": 499 }, { "epoch": 0.16468360163036766, "grad_norm": 3.1927671432495117, "learning_rate": 9.45000646031287e-05, "loss": 1.6268, "step": 500 }, { "epoch": 0.1650129688336284, "grad_norm": 1.9104129076004028, "learning_rate": 9.447605413922499e-05, "loss": 2.6226, "step": 501 }, { "epoch": 0.16534233603688914, "grad_norm": 1.981658935546875, "learning_rate": 9.44519944434494e-05, "loss": 2.288, "step": 502 }, { "epoch": 0.16567170324014988, "grad_norm": 2.278552532196045, "learning_rate": 9.442788554243431e-05, "loss": 2.4093, "step": 503 }, { "epoch": 0.1660010704434106, "grad_norm": 2.9332423210144043, "learning_rate": 9.440372746286661e-05, "loss": 2.5302, "step": 504 }, { "epoch": 0.16633043764667133, "grad_norm": 2.4313466548919678, "learning_rate": 9.437952023148757e-05, "loss": 2.52, "step": 505 }, { "epoch": 0.16665980484993206, "grad_norm": 2.3095078468322754, "learning_rate": 9.43552638750929e-05, "loss": 2.424, "step": 506 }, { "epoch": 0.1669891720531928, "grad_norm": 2.3015639781951904, "learning_rate": 9.433095842053272e-05, "loss": 2.5717, "step": 507 }, { "epoch": 0.16731853925645354, "grad_norm": 2.5541129112243652, "learning_rate": 9.43066038947114e-05, "loss": 2.5831, "step": 508 }, { "epoch": 0.16764790645971428, "grad_norm": 2.421571969985962, "learning_rate": 9.428220032458776e-05, "loss": 2.4438, "step": 509 }, { "epoch": 0.16797727366297502, "grad_norm": 2.5124785900115967, "learning_rate": 9.425774773717479e-05, "loss": 2.2709, "step": 510 }, { "epoch": 0.16830664086623576, "grad_norm": 2.592477560043335, "learning_rate": 9.423324615953982e-05, "loss": 2.3089, "step": 511 }, { "epoch": 0.16863600806949647, "grad_norm": 2.2433090209960938, "learning_rate": 9.420869561880434e-05, "loss": 2.318, "step": 512 }, { "epoch": 0.1689653752727572, "grad_norm": 2.6666271686553955, "learning_rate": 9.418409614214412e-05, "loss": 2.3501, "step": 513 }, { "epoch": 0.16929474247601795, "grad_norm": 2.3065149784088135, "learning_rate": 9.415944775678902e-05, "loss": 2.0432, "step": 514 }, { "epoch": 0.16962410967927868, "grad_norm": 3.0598034858703613, "learning_rate": 9.41347504900231e-05, "loss": 2.6206, "step": 515 }, { "epoch": 0.16995347688253942, "grad_norm": 2.051896095275879, "learning_rate": 9.411000436918449e-05, "loss": 1.9563, "step": 516 }, { "epoch": 0.17028284408580016, "grad_norm": 3.1122515201568604, "learning_rate": 9.408520942166541e-05, "loss": 2.2816, "step": 517 }, { "epoch": 0.1706122112890609, "grad_norm": 2.379194974899292, "learning_rate": 9.406036567491213e-05, "loss": 2.4151, "step": 518 }, { "epoch": 0.17094157849232164, "grad_norm": 2.4957690238952637, "learning_rate": 9.403547315642493e-05, "loss": 1.8377, "step": 519 }, { "epoch": 0.17127094569558235, "grad_norm": 2.6822221279144287, "learning_rate": 9.401053189375809e-05, "loss": 2.0926, "step": 520 }, { "epoch": 0.1716003128988431, "grad_norm": 2.902961492538452, "learning_rate": 9.398554191451983e-05, "loss": 2.1816, "step": 521 }, { "epoch": 0.17192968010210383, "grad_norm": 3.225004196166992, "learning_rate": 9.396050324637228e-05, "loss": 2.5233, "step": 522 }, { "epoch": 0.17225904730536457, "grad_norm": 2.8753762245178223, "learning_rate": 9.393541591703156e-05, "loss": 2.3371, "step": 523 }, { "epoch": 0.1725884145086253, "grad_norm": 3.5578978061676025, "learning_rate": 9.39102799542675e-05, "loss": 2.3127, "step": 524 }, { "epoch": 0.17291778171188604, "grad_norm": 3.2395496368408203, "learning_rate": 9.388509538590391e-05, "loss": 2.0262, "step": 525 }, { "epoch": 0.17324714891514678, "grad_norm": 2.063525438308716, "learning_rate": 9.385986223981833e-05, "loss": 2.5577, "step": 526 }, { "epoch": 0.17357651611840752, "grad_norm": 2.227280616760254, "learning_rate": 9.383458054394206e-05, "loss": 2.5892, "step": 527 }, { "epoch": 0.17390588332166826, "grad_norm": 2.1608548164367676, "learning_rate": 9.380925032626015e-05, "loss": 2.5988, "step": 528 }, { "epoch": 0.17423525052492897, "grad_norm": 2.333763360977173, "learning_rate": 9.378387161481142e-05, "loss": 2.4371, "step": 529 }, { "epoch": 0.1745646177281897, "grad_norm": 2.1032464504241943, "learning_rate": 9.375844443768829e-05, "loss": 2.1269, "step": 530 }, { "epoch": 0.17489398493145045, "grad_norm": 2.7457637786865234, "learning_rate": 9.373296882303688e-05, "loss": 2.4893, "step": 531 }, { "epoch": 0.17522335213471119, "grad_norm": 2.414435625076294, "learning_rate": 9.37074447990569e-05, "loss": 2.4437, "step": 532 }, { "epoch": 0.17555271933797192, "grad_norm": 2.7596664428710938, "learning_rate": 9.368187239400166e-05, "loss": 2.3113, "step": 533 }, { "epoch": 0.17588208654123266, "grad_norm": 2.7227630615234375, "learning_rate": 9.3656251636178e-05, "loss": 2.2728, "step": 534 }, { "epoch": 0.1762114537444934, "grad_norm": 2.8053038120269775, "learning_rate": 9.363058255394632e-05, "loss": 2.3559, "step": 535 }, { "epoch": 0.17654082094775414, "grad_norm": 2.6018600463867188, "learning_rate": 9.360486517572049e-05, "loss": 2.2176, "step": 536 }, { "epoch": 0.17687018815101485, "grad_norm": 2.3456811904907227, "learning_rate": 9.357909952996784e-05, "loss": 2.1538, "step": 537 }, { "epoch": 0.1771995553542756, "grad_norm": 2.6372413635253906, "learning_rate": 9.355328564520914e-05, "loss": 2.1687, "step": 538 }, { "epoch": 0.17752892255753633, "grad_norm": 2.4839794635772705, "learning_rate": 9.352742355001853e-05, "loss": 2.2029, "step": 539 }, { "epoch": 0.17785828976079707, "grad_norm": 2.5911648273468018, "learning_rate": 9.350151327302356e-05, "loss": 2.0988, "step": 540 }, { "epoch": 0.1781876569640578, "grad_norm": 2.3182106018066406, "learning_rate": 9.347555484290507e-05, "loss": 1.9714, "step": 541 }, { "epoch": 0.17851702416731854, "grad_norm": 3.564013957977295, "learning_rate": 9.344954828839722e-05, "loss": 2.3427, "step": 542 }, { "epoch": 0.17884639137057928, "grad_norm": 2.816439628601074, "learning_rate": 9.342349363828748e-05, "loss": 2.198, "step": 543 }, { "epoch": 0.17917575857384002, "grad_norm": 2.91499924659729, "learning_rate": 9.339739092141647e-05, "loss": 2.2565, "step": 544 }, { "epoch": 0.17950512577710073, "grad_norm": 3.363369941711426, "learning_rate": 9.337124016667809e-05, "loss": 2.1877, "step": 545 }, { "epoch": 0.17983449298036147, "grad_norm": 3.335421085357666, "learning_rate": 9.334504140301938e-05, "loss": 2.2754, "step": 546 }, { "epoch": 0.1801638601836222, "grad_norm": 3.2244529724121094, "learning_rate": 9.331879465944056e-05, "loss": 2.0735, "step": 547 }, { "epoch": 0.18049322738688295, "grad_norm": 2.5784671306610107, "learning_rate": 9.32924999649949e-05, "loss": 2.0593, "step": 548 }, { "epoch": 0.1808225945901437, "grad_norm": 3.826298236846924, "learning_rate": 9.326615734878878e-05, "loss": 2.4232, "step": 549 }, { "epoch": 0.18115196179340443, "grad_norm": 3.9787750244140625, "learning_rate": 9.323976683998168e-05, "loss": 1.9951, "step": 550 }, { "epoch": 0.18148132899666516, "grad_norm": 2.1491682529449463, "learning_rate": 9.321332846778599e-05, "loss": 2.497, "step": 551 }, { "epoch": 0.1818106961999259, "grad_norm": 2.5805752277374268, "learning_rate": 9.318684226146714e-05, "loss": 2.343, "step": 552 }, { "epoch": 0.18214006340318661, "grad_norm": 2.671565055847168, "learning_rate": 9.316030825034354e-05, "loss": 2.314, "step": 553 }, { "epoch": 0.18246943060644735, "grad_norm": 2.459749221801758, "learning_rate": 9.313372646378643e-05, "loss": 2.3503, "step": 554 }, { "epoch": 0.1827987978097081, "grad_norm": 2.3150362968444824, "learning_rate": 9.310709693122002e-05, "loss": 1.9586, "step": 555 }, { "epoch": 0.18312816501296883, "grad_norm": 2.813565254211426, "learning_rate": 9.308041968212131e-05, "loss": 2.467, "step": 556 }, { "epoch": 0.18345753221622957, "grad_norm": 2.5931906700134277, "learning_rate": 9.305369474602015e-05, "loss": 2.245, "step": 557 }, { "epoch": 0.1837868994194903, "grad_norm": 2.7060670852661133, "learning_rate": 9.302692215249918e-05, "loss": 2.7003, "step": 558 }, { "epoch": 0.18411626662275105, "grad_norm": 2.543288230895996, "learning_rate": 9.300010193119376e-05, "loss": 2.2948, "step": 559 }, { "epoch": 0.18444563382601178, "grad_norm": 2.492933750152588, "learning_rate": 9.297323411179202e-05, "loss": 2.2784, "step": 560 }, { "epoch": 0.18477500102927252, "grad_norm": 2.2065117359161377, "learning_rate": 9.294631872403474e-05, "loss": 2.2058, "step": 561 }, { "epoch": 0.18510436823253323, "grad_norm": 3.046983003616333, "learning_rate": 9.291935579771536e-05, "loss": 2.3053, "step": 562 }, { "epoch": 0.18543373543579397, "grad_norm": 2.53873610496521, "learning_rate": 9.289234536267996e-05, "loss": 2.4844, "step": 563 }, { "epoch": 0.1857631026390547, "grad_norm": 2.3958847522735596, "learning_rate": 9.286528744882719e-05, "loss": 2.0553, "step": 564 }, { "epoch": 0.18609246984231545, "grad_norm": 3.1099889278411865, "learning_rate": 9.283818208610826e-05, "loss": 2.7911, "step": 565 }, { "epoch": 0.1864218370455762, "grad_norm": 2.7768149375915527, "learning_rate": 9.28110293045269e-05, "loss": 2.3027, "step": 566 }, { "epoch": 0.18675120424883693, "grad_norm": 2.551649570465088, "learning_rate": 9.278382913413935e-05, "loss": 1.9763, "step": 567 }, { "epoch": 0.18708057145209767, "grad_norm": 2.6487247943878174, "learning_rate": 9.27565816050543e-05, "loss": 2.2145, "step": 568 }, { "epoch": 0.1874099386553584, "grad_norm": 3.50011944770813, "learning_rate": 9.272928674743282e-05, "loss": 2.0824, "step": 569 }, { "epoch": 0.18773930585861912, "grad_norm": 2.5027475357055664, "learning_rate": 9.270194459148841e-05, "loss": 2.0633, "step": 570 }, { "epoch": 0.18806867306187985, "grad_norm": 2.7622134685516357, "learning_rate": 9.267455516748693e-05, "loss": 2.4109, "step": 571 }, { "epoch": 0.1883980402651406, "grad_norm": 2.5253777503967285, "learning_rate": 9.264711850574657e-05, "loss": 1.8391, "step": 572 }, { "epoch": 0.18872740746840133, "grad_norm": 3.9980525970458984, "learning_rate": 9.261963463663775e-05, "loss": 2.4288, "step": 573 }, { "epoch": 0.18905677467166207, "grad_norm": 3.201934814453125, "learning_rate": 9.25921035905832e-05, "loss": 2.1041, "step": 574 }, { "epoch": 0.1893861418749228, "grad_norm": 3.45932674407959, "learning_rate": 9.256452539805787e-05, "loss": 1.9162, "step": 575 }, { "epoch": 0.18971550907818355, "grad_norm": 2.3997511863708496, "learning_rate": 9.253690008958886e-05, "loss": 2.7296, "step": 576 }, { "epoch": 0.19004487628144429, "grad_norm": 2.100877523422241, "learning_rate": 9.250922769575548e-05, "loss": 2.2391, "step": 577 }, { "epoch": 0.190374243484705, "grad_norm": 2.0351099967956543, "learning_rate": 9.248150824718911e-05, "loss": 2.3191, "step": 578 }, { "epoch": 0.19070361068796574, "grad_norm": 2.368957757949829, "learning_rate": 9.245374177457323e-05, "loss": 2.2142, "step": 579 }, { "epoch": 0.19103297789122647, "grad_norm": 2.49444580078125, "learning_rate": 9.242592830864339e-05, "loss": 2.3158, "step": 580 }, { "epoch": 0.1913623450944872, "grad_norm": 2.4949872493743896, "learning_rate": 9.239806788018714e-05, "loss": 2.3183, "step": 581 }, { "epoch": 0.19169171229774795, "grad_norm": 2.6433401107788086, "learning_rate": 9.2370160520044e-05, "loss": 2.4043, "step": 582 }, { "epoch": 0.1920210795010087, "grad_norm": 2.3650400638580322, "learning_rate": 9.23422062591055e-05, "loss": 2.2305, "step": 583 }, { "epoch": 0.19235044670426943, "grad_norm": 2.623833179473877, "learning_rate": 9.231420512831501e-05, "loss": 2.2516, "step": 584 }, { "epoch": 0.19267981390753017, "grad_norm": 2.3704848289489746, "learning_rate": 9.228615715866785e-05, "loss": 2.1916, "step": 585 }, { "epoch": 0.1930091811107909, "grad_norm": 2.6421825885772705, "learning_rate": 9.225806238121113e-05, "loss": 2.1705, "step": 586 }, { "epoch": 0.19333854831405162, "grad_norm": 2.507702589035034, "learning_rate": 9.222992082704381e-05, "loss": 2.2798, "step": 587 }, { "epoch": 0.19366791551731236, "grad_norm": 2.977654457092285, "learning_rate": 9.22017325273166e-05, "loss": 2.3934, "step": 588 }, { "epoch": 0.1939972827205731, "grad_norm": 2.6255760192871094, "learning_rate": 9.217349751323199e-05, "loss": 2.168, "step": 589 }, { "epoch": 0.19432664992383383, "grad_norm": 2.767040252685547, "learning_rate": 9.214521581604415e-05, "loss": 2.3227, "step": 590 }, { "epoch": 0.19465601712709457, "grad_norm": 2.74783992767334, "learning_rate": 9.211688746705894e-05, "loss": 2.5598, "step": 591 }, { "epoch": 0.1949853843303553, "grad_norm": 3.419125556945801, "learning_rate": 9.208851249763385e-05, "loss": 2.1441, "step": 592 }, { "epoch": 0.19531475153361605, "grad_norm": 2.531890392303467, "learning_rate": 9.206009093917798e-05, "loss": 2.2535, "step": 593 }, { "epoch": 0.1956441187368768, "grad_norm": 2.928637742996216, "learning_rate": 9.203162282315201e-05, "loss": 1.9981, "step": 594 }, { "epoch": 0.1959734859401375, "grad_norm": 2.740192174911499, "learning_rate": 9.200310818106813e-05, "loss": 2.1965, "step": 595 }, { "epoch": 0.19630285314339824, "grad_norm": 2.5833513736724854, "learning_rate": 9.197454704449007e-05, "loss": 1.9684, "step": 596 }, { "epoch": 0.19663222034665898, "grad_norm": 2.816075563430786, "learning_rate": 9.194593944503298e-05, "loss": 1.9281, "step": 597 }, { "epoch": 0.19696158754991971, "grad_norm": 2.4949111938476562, "learning_rate": 9.19172854143635e-05, "loss": 1.8233, "step": 598 }, { "epoch": 0.19729095475318045, "grad_norm": 3.0978989601135254, "learning_rate": 9.18885849841996e-05, "loss": 1.9407, "step": 599 }, { "epoch": 0.1976203219564412, "grad_norm": 3.714794397354126, "learning_rate": 9.185983818631066e-05, "loss": 2.082, "step": 600 }, { "epoch": 0.19794968915970193, "grad_norm": 2.1515870094299316, "learning_rate": 9.183104505251735e-05, "loss": 2.5559, "step": 601 }, { "epoch": 0.19827905636296267, "grad_norm": 2.2169995307922363, "learning_rate": 9.180220561469167e-05, "loss": 2.2574, "step": 602 }, { "epoch": 0.19860842356622338, "grad_norm": 2.327986717224121, "learning_rate": 9.177331990475685e-05, "loss": 2.4616, "step": 603 }, { "epoch": 0.19893779076948412, "grad_norm": 3.0551161766052246, "learning_rate": 9.174438795468734e-05, "loss": 2.602, "step": 604 }, { "epoch": 0.19926715797274486, "grad_norm": 2.404407024383545, "learning_rate": 9.171540979650879e-05, "loss": 2.4922, "step": 605 }, { "epoch": 0.1995965251760056, "grad_norm": 2.458751916885376, "learning_rate": 9.168638546229796e-05, "loss": 2.3752, "step": 606 }, { "epoch": 0.19992589237926633, "grad_norm": 3.044050693511963, "learning_rate": 9.165731498418277e-05, "loss": 2.6778, "step": 607 }, { "epoch": 0.20025525958252707, "grad_norm": 2.5065767765045166, "learning_rate": 9.162819839434223e-05, "loss": 2.3761, "step": 608 }, { "epoch": 0.2005846267857878, "grad_norm": 2.5272152423858643, "learning_rate": 9.15990357250063e-05, "loss": 2.2685, "step": 609 }, { "epoch": 0.20091399398904855, "grad_norm": 3.1523914337158203, "learning_rate": 9.156982700845606e-05, "loss": 2.516, "step": 610 }, { "epoch": 0.2012433611923093, "grad_norm": 2.7383854389190674, "learning_rate": 9.154057227702348e-05, "loss": 2.4093, "step": 611 }, { "epoch": 0.20157272839557, "grad_norm": 3.010639190673828, "learning_rate": 9.151127156309151e-05, "loss": 2.5929, "step": 612 }, { "epoch": 0.20190209559883074, "grad_norm": 3.279294967651367, "learning_rate": 9.1481924899094e-05, "loss": 2.266, "step": 613 }, { "epoch": 0.20223146280209148, "grad_norm": 3.147496223449707, "learning_rate": 9.145253231751563e-05, "loss": 2.1337, "step": 614 }, { "epoch": 0.20256083000535222, "grad_norm": 2.710343599319458, "learning_rate": 9.142309385089191e-05, "loss": 2.0808, "step": 615 }, { "epoch": 0.20289019720861295, "grad_norm": 2.6864521503448486, "learning_rate": 9.139360953180918e-05, "loss": 2.1494, "step": 616 }, { "epoch": 0.2032195644118737, "grad_norm": 2.7441837787628174, "learning_rate": 9.136407939290451e-05, "loss": 2.5828, "step": 617 }, { "epoch": 0.20354893161513443, "grad_norm": 3.46586537361145, "learning_rate": 9.13345034668657e-05, "loss": 2.6775, "step": 618 }, { "epoch": 0.20387829881839517, "grad_norm": 2.7498793601989746, "learning_rate": 9.130488178643119e-05, "loss": 2.0033, "step": 619 }, { "epoch": 0.20420766602165588, "grad_norm": 3.1091184616088867, "learning_rate": 9.127521438439015e-05, "loss": 2.4799, "step": 620 }, { "epoch": 0.20453703322491662, "grad_norm": 2.3706719875335693, "learning_rate": 9.124550129358227e-05, "loss": 1.8221, "step": 621 }, { "epoch": 0.20486640042817736, "grad_norm": 2.63212251663208, "learning_rate": 9.121574254689788e-05, "loss": 2.0923, "step": 622 }, { "epoch": 0.2051957676314381, "grad_norm": 3.275517225265503, "learning_rate": 9.118593817727782e-05, "loss": 2.1223, "step": 623 }, { "epoch": 0.20552513483469884, "grad_norm": 3.2868645191192627, "learning_rate": 9.115608821771347e-05, "loss": 2.1894, "step": 624 }, { "epoch": 0.20585450203795957, "grad_norm": 3.2911734580993652, "learning_rate": 9.112619270124658e-05, "loss": 2.0714, "step": 625 }, { "epoch": 0.2061838692412203, "grad_norm": 1.8351259231567383, "learning_rate": 9.109625166096942e-05, "loss": 2.5735, "step": 626 }, { "epoch": 0.20651323644448105, "grad_norm": 2.289583206176758, "learning_rate": 9.106626513002464e-05, "loss": 2.2809, "step": 627 }, { "epoch": 0.20684260364774176, "grad_norm": 2.3926987648010254, "learning_rate": 9.103623314160518e-05, "loss": 2.4775, "step": 628 }, { "epoch": 0.2071719708510025, "grad_norm": 2.8485803604125977, "learning_rate": 9.100615572895439e-05, "loss": 2.5643, "step": 629 }, { "epoch": 0.20750133805426324, "grad_norm": 2.601393461227417, "learning_rate": 9.097603292536583e-05, "loss": 2.1331, "step": 630 }, { "epoch": 0.20783070525752398, "grad_norm": 2.364741086959839, "learning_rate": 9.094586476418335e-05, "loss": 2.2653, "step": 631 }, { "epoch": 0.20816007246078472, "grad_norm": 2.307363510131836, "learning_rate": 9.091565127880096e-05, "loss": 2.8223, "step": 632 }, { "epoch": 0.20848943966404546, "grad_norm": 2.207406997680664, "learning_rate": 9.088539250266287e-05, "loss": 2.2839, "step": 633 }, { "epoch": 0.2088188068673062, "grad_norm": 2.271679401397705, "learning_rate": 9.085508846926345e-05, "loss": 2.1932, "step": 634 }, { "epoch": 0.20914817407056693, "grad_norm": 2.566964864730835, "learning_rate": 9.082473921214714e-05, "loss": 2.2745, "step": 635 }, { "epoch": 0.20947754127382767, "grad_norm": 2.6665945053100586, "learning_rate": 9.07943447649084e-05, "loss": 2.2285, "step": 636 }, { "epoch": 0.20980690847708838, "grad_norm": 2.232455253601074, "learning_rate": 9.07639051611918e-05, "loss": 2.0741, "step": 637 }, { "epoch": 0.21013627568034912, "grad_norm": 2.365143299102783, "learning_rate": 9.07334204346918e-05, "loss": 1.9137, "step": 638 }, { "epoch": 0.21046564288360986, "grad_norm": 2.6072494983673096, "learning_rate": 9.070289061915289e-05, "loss": 2.235, "step": 639 }, { "epoch": 0.2107950100868706, "grad_norm": 2.8605432510375977, "learning_rate": 9.06723157483694e-05, "loss": 2.2824, "step": 640 }, { "epoch": 0.21112437729013134, "grad_norm": 2.70151424407959, "learning_rate": 9.064169585618561e-05, "loss": 2.202, "step": 641 }, { "epoch": 0.21145374449339208, "grad_norm": 2.540966272354126, "learning_rate": 9.061103097649554e-05, "loss": 2.0908, "step": 642 }, { "epoch": 0.21178311169665281, "grad_norm": 2.4950785636901855, "learning_rate": 9.05803211432431e-05, "loss": 1.9015, "step": 643 }, { "epoch": 0.21211247889991355, "grad_norm": 2.925896406173706, "learning_rate": 9.054956639042194e-05, "loss": 2.2201, "step": 644 }, { "epoch": 0.21244184610317426, "grad_norm": 3.0082502365112305, "learning_rate": 9.051876675207535e-05, "loss": 2.3677, "step": 645 }, { "epoch": 0.212771213306435, "grad_norm": 2.554163694381714, "learning_rate": 9.048792226229642e-05, "loss": 1.9325, "step": 646 }, { "epoch": 0.21310058050969574, "grad_norm": 2.7593939304351807, "learning_rate": 9.04570329552278e-05, "loss": 2.2431, "step": 647 }, { "epoch": 0.21342994771295648, "grad_norm": 3.1389267444610596, "learning_rate": 9.042609886506183e-05, "loss": 2.2714, "step": 648 }, { "epoch": 0.21375931491621722, "grad_norm": 2.6484322547912598, "learning_rate": 9.039512002604034e-05, "loss": 2.0055, "step": 649 }, { "epoch": 0.21408868211947796, "grad_norm": 3.6394948959350586, "learning_rate": 9.036409647245474e-05, "loss": 2.2731, "step": 650 }, { "epoch": 0.2144180493227387, "grad_norm": 1.968092679977417, "learning_rate": 9.033302823864595e-05, "loss": 2.3822, "step": 651 }, { "epoch": 0.21474741652599943, "grad_norm": 2.4255237579345703, "learning_rate": 9.03019153590043e-05, "loss": 2.5761, "step": 652 }, { "epoch": 0.21507678372926015, "grad_norm": 2.2358973026275635, "learning_rate": 9.027075786796957e-05, "loss": 2.2964, "step": 653 }, { "epoch": 0.21540615093252088, "grad_norm": 3.0591135025024414, "learning_rate": 9.023955580003092e-05, "loss": 2.609, "step": 654 }, { "epoch": 0.21573551813578162, "grad_norm": 2.4640660285949707, "learning_rate": 9.020830918972684e-05, "loss": 2.4714, "step": 655 }, { "epoch": 0.21606488533904236, "grad_norm": 2.639633893966675, "learning_rate": 9.017701807164516e-05, "loss": 1.9724, "step": 656 }, { "epoch": 0.2163942525423031, "grad_norm": 2.35853910446167, "learning_rate": 9.014568248042292e-05, "loss": 2.1988, "step": 657 }, { "epoch": 0.21672361974556384, "grad_norm": 2.6157610416412354, "learning_rate": 9.011430245074645e-05, "loss": 2.4365, "step": 658 }, { "epoch": 0.21705298694882458, "grad_norm": 2.606468439102173, "learning_rate": 9.008287801735124e-05, "loss": 2.1511, "step": 659 }, { "epoch": 0.21738235415208532, "grad_norm": 2.8526015281677246, "learning_rate": 9.005140921502193e-05, "loss": 2.366, "step": 660 }, { "epoch": 0.21771172135534606, "grad_norm": 2.663635492324829, "learning_rate": 9.001989607859226e-05, "loss": 2.418, "step": 661 }, { "epoch": 0.21804108855860677, "grad_norm": 2.6935718059539795, "learning_rate": 8.998833864294507e-05, "loss": 2.327, "step": 662 }, { "epoch": 0.2183704557618675, "grad_norm": 2.840543031692505, "learning_rate": 8.995673694301223e-05, "loss": 2.106, "step": 663 }, { "epoch": 0.21869982296512824, "grad_norm": 2.640491485595703, "learning_rate": 8.99250910137746e-05, "loss": 1.9047, "step": 664 }, { "epoch": 0.21902919016838898, "grad_norm": 2.4656693935394287, "learning_rate": 8.989340089026203e-05, "loss": 1.9766, "step": 665 }, { "epoch": 0.21935855737164972, "grad_norm": 3.2263641357421875, "learning_rate": 8.986166660755321e-05, "loss": 2.4087, "step": 666 }, { "epoch": 0.21968792457491046, "grad_norm": 2.758763313293457, "learning_rate": 8.982988820077582e-05, "loss": 2.2525, "step": 667 }, { "epoch": 0.2200172917781712, "grad_norm": 2.882479429244995, "learning_rate": 8.979806570510631e-05, "loss": 2.3654, "step": 668 }, { "epoch": 0.22034665898143194, "grad_norm": 2.9043984413146973, "learning_rate": 8.976619915576994e-05, "loss": 1.992, "step": 669 }, { "epoch": 0.22067602618469265, "grad_norm": 2.918984889984131, "learning_rate": 8.973428858804073e-05, "loss": 2.2139, "step": 670 }, { "epoch": 0.22100539338795339, "grad_norm": 2.9728167057037354, "learning_rate": 8.970233403724146e-05, "loss": 2.0774, "step": 671 }, { "epoch": 0.22133476059121412, "grad_norm": 2.9786489009857178, "learning_rate": 8.96703355387436e-05, "loss": 2.1803, "step": 672 }, { "epoch": 0.22166412779447486, "grad_norm": 3.05253267288208, "learning_rate": 8.963829312796718e-05, "loss": 2.1476, "step": 673 }, { "epoch": 0.2219934949977356, "grad_norm": 2.640706777572632, "learning_rate": 8.960620684038097e-05, "loss": 2.1194, "step": 674 }, { "epoch": 0.22232286220099634, "grad_norm": 3.7444612979888916, "learning_rate": 8.95740767115022e-05, "loss": 2.1602, "step": 675 }, { "epoch": 0.22265222940425708, "grad_norm": 1.9276431798934937, "learning_rate": 8.95419027768967e-05, "loss": 2.1702, "step": 676 }, { "epoch": 0.22298159660751782, "grad_norm": 2.062800168991089, "learning_rate": 8.95096850721787e-05, "loss": 2.2015, "step": 677 }, { "epoch": 0.22331096381077853, "grad_norm": 2.143061637878418, "learning_rate": 8.947742363301098e-05, "loss": 2.4098, "step": 678 }, { "epoch": 0.22364033101403927, "grad_norm": 2.5208868980407715, "learning_rate": 8.944511849510469e-05, "loss": 2.5313, "step": 679 }, { "epoch": 0.2239696982173, "grad_norm": 2.5163257122039795, "learning_rate": 8.941276969421935e-05, "loss": 2.4371, "step": 680 }, { "epoch": 0.22429906542056074, "grad_norm": 2.5549163818359375, "learning_rate": 8.938037726616281e-05, "loss": 2.3066, "step": 681 }, { "epoch": 0.22462843262382148, "grad_norm": 2.5244922637939453, "learning_rate": 8.934794124679121e-05, "loss": 2.4772, "step": 682 }, { "epoch": 0.22495779982708222, "grad_norm": 2.399825096130371, "learning_rate": 8.931546167200895e-05, "loss": 2.5803, "step": 683 }, { "epoch": 0.22528716703034296, "grad_norm": 2.410311460494995, "learning_rate": 8.928293857776866e-05, "loss": 2.0751, "step": 684 }, { "epoch": 0.2256165342336037, "grad_norm": 2.29736590385437, "learning_rate": 8.925037200007109e-05, "loss": 2.2536, "step": 685 }, { "epoch": 0.22594590143686444, "grad_norm": 2.3026459217071533, "learning_rate": 8.921776197496518e-05, "loss": 2.0917, "step": 686 }, { "epoch": 0.22627526864012515, "grad_norm": 2.490049362182617, "learning_rate": 8.918510853854794e-05, "loss": 2.0983, "step": 687 }, { "epoch": 0.2266046358433859, "grad_norm": 3.218235969543457, "learning_rate": 8.915241172696441e-05, "loss": 2.3049, "step": 688 }, { "epoch": 0.22693400304664663, "grad_norm": 2.5197603702545166, "learning_rate": 8.911967157640771e-05, "loss": 2.1897, "step": 689 }, { "epoch": 0.22726337024990736, "grad_norm": 2.4397006034851074, "learning_rate": 8.908688812311884e-05, "loss": 1.9715, "step": 690 }, { "epoch": 0.2275927374531681, "grad_norm": 2.9002199172973633, "learning_rate": 8.905406140338683e-05, "loss": 2.2867, "step": 691 }, { "epoch": 0.22792210465642884, "grad_norm": 2.6698174476623535, "learning_rate": 8.902119145354852e-05, "loss": 2.4758, "step": 692 }, { "epoch": 0.22825147185968958, "grad_norm": 3.1235620975494385, "learning_rate": 8.898827830998864e-05, "loss": 2.1978, "step": 693 }, { "epoch": 0.22858083906295032, "grad_norm": 2.7362828254699707, "learning_rate": 8.895532200913976e-05, "loss": 2.326, "step": 694 }, { "epoch": 0.22891020626621103, "grad_norm": 2.6940107345581055, "learning_rate": 8.892232258748217e-05, "loss": 2.0774, "step": 695 }, { "epoch": 0.22923957346947177, "grad_norm": 2.963489294052124, "learning_rate": 8.888928008154393e-05, "loss": 2.2016, "step": 696 }, { "epoch": 0.2295689406727325, "grad_norm": 2.9156124591827393, "learning_rate": 8.885619452790078e-05, "loss": 2.4717, "step": 697 }, { "epoch": 0.22989830787599325, "grad_norm": 3.306898355484009, "learning_rate": 8.882306596317606e-05, "loss": 2.2373, "step": 698 }, { "epoch": 0.23022767507925399, "grad_norm": 2.8223652839660645, "learning_rate": 8.878989442404082e-05, "loss": 1.9008, "step": 699 }, { "epoch": 0.23055704228251472, "grad_norm": 2.8709568977355957, "learning_rate": 8.87566799472136e-05, "loss": 1.9383, "step": 700 }, { "epoch": 0.23088640948577546, "grad_norm": 2.175177574157715, "learning_rate": 8.872342256946051e-05, "loss": 2.3998, "step": 701 }, { "epoch": 0.2312157766890362, "grad_norm": 2.408965587615967, "learning_rate": 8.869012232759512e-05, "loss": 2.6233, "step": 702 }, { "epoch": 0.2315451438922969, "grad_norm": 2.2874746322631836, "learning_rate": 8.865677925847848e-05, "loss": 2.0586, "step": 703 }, { "epoch": 0.23187451109555765, "grad_norm": 2.436617374420166, "learning_rate": 8.862339339901902e-05, "loss": 2.324, "step": 704 }, { "epoch": 0.2322038782988184, "grad_norm": 2.8110032081604004, "learning_rate": 8.858996478617253e-05, "loss": 2.4255, "step": 705 }, { "epoch": 0.23253324550207913, "grad_norm": 2.681027889251709, "learning_rate": 8.855649345694216e-05, "loss": 2.5485, "step": 706 }, { "epoch": 0.23286261270533987, "grad_norm": 2.6605820655822754, "learning_rate": 8.852297944837831e-05, "loss": 2.5063, "step": 707 }, { "epoch": 0.2331919799086006, "grad_norm": 2.8735132217407227, "learning_rate": 8.848942279757864e-05, "loss": 2.1953, "step": 708 }, { "epoch": 0.23352134711186134, "grad_norm": 2.5370914936065674, "learning_rate": 8.845582354168802e-05, "loss": 1.9245, "step": 709 }, { "epoch": 0.23385071431512208, "grad_norm": 2.733022928237915, "learning_rate": 8.842218171789846e-05, "loss": 2.109, "step": 710 }, { "epoch": 0.2341800815183828, "grad_norm": 3.819556951522827, "learning_rate": 8.838849736344909e-05, "loss": 2.405, "step": 711 }, { "epoch": 0.23450944872164353, "grad_norm": 2.486302137374878, "learning_rate": 8.835477051562613e-05, "loss": 2.1244, "step": 712 }, { "epoch": 0.23483881592490427, "grad_norm": 3.0748279094696045, "learning_rate": 8.832100121176285e-05, "loss": 2.2095, "step": 713 }, { "epoch": 0.235168183128165, "grad_norm": 3.035090208053589, "learning_rate": 8.828718948923949e-05, "loss": 1.9369, "step": 714 }, { "epoch": 0.23549755033142575, "grad_norm": 2.62367582321167, "learning_rate": 8.825333538548326e-05, "loss": 2.1745, "step": 715 }, { "epoch": 0.2358269175346865, "grad_norm": 2.8850131034851074, "learning_rate": 8.821943893796826e-05, "loss": 2.3818, "step": 716 }, { "epoch": 0.23615628473794723, "grad_norm": 3.0183825492858887, "learning_rate": 8.81855001842155e-05, "loss": 2.2054, "step": 717 }, { "epoch": 0.23648565194120796, "grad_norm": 2.698676586151123, "learning_rate": 8.81515191617928e-05, "loss": 2.24, "step": 718 }, { "epoch": 0.2368150191444687, "grad_norm": 2.8108513355255127, "learning_rate": 8.811749590831475e-05, "loss": 2.1469, "step": 719 }, { "epoch": 0.2371443863477294, "grad_norm": 2.8320016860961914, "learning_rate": 8.808343046144271e-05, "loss": 2.1924, "step": 720 }, { "epoch": 0.23747375355099015, "grad_norm": 3.3814282417297363, "learning_rate": 8.804932285888477e-05, "loss": 2.3818, "step": 721 }, { "epoch": 0.2378031207542509, "grad_norm": 3.158623695373535, "learning_rate": 8.80151731383956e-05, "loss": 2.3962, "step": 722 }, { "epoch": 0.23813248795751163, "grad_norm": 2.743208646774292, "learning_rate": 8.798098133777659e-05, "loss": 1.779, "step": 723 }, { "epoch": 0.23846185516077237, "grad_norm": 3.1935389041900635, "learning_rate": 8.794674749487565e-05, "loss": 2.1095, "step": 724 }, { "epoch": 0.2387912223640331, "grad_norm": 3.515878438949585, "learning_rate": 8.791247164758722e-05, "loss": 1.9777, "step": 725 }, { "epoch": 0.23912058956729385, "grad_norm": 2.7194595336914062, "learning_rate": 8.78781538338523e-05, "loss": 2.3815, "step": 726 }, { "epoch": 0.23944995677055458, "grad_norm": 2.3084821701049805, "learning_rate": 8.784379409165828e-05, "loss": 2.2721, "step": 727 }, { "epoch": 0.2397793239738153, "grad_norm": 2.4673047065734863, "learning_rate": 8.780939245903898e-05, "loss": 2.1049, "step": 728 }, { "epoch": 0.24010869117707603, "grad_norm": 2.5129013061523438, "learning_rate": 8.77749489740746e-05, "loss": 2.3995, "step": 729 }, { "epoch": 0.24043805838033677, "grad_norm": 2.559809923171997, "learning_rate": 8.774046367489166e-05, "loss": 2.3422, "step": 730 }, { "epoch": 0.2407674255835975, "grad_norm": 2.3660426139831543, "learning_rate": 8.770593659966298e-05, "loss": 2.0759, "step": 731 }, { "epoch": 0.24109679278685825, "grad_norm": 2.502711057662964, "learning_rate": 8.767136778660759e-05, "loss": 2.2224, "step": 732 }, { "epoch": 0.241426159990119, "grad_norm": 2.396199941635132, "learning_rate": 8.763675727399075e-05, "loss": 2.4539, "step": 733 }, { "epoch": 0.24175552719337973, "grad_norm": 2.830613851547241, "learning_rate": 8.760210510012387e-05, "loss": 2.4058, "step": 734 }, { "epoch": 0.24208489439664047, "grad_norm": 2.6569724082946777, "learning_rate": 8.756741130336448e-05, "loss": 2.4416, "step": 735 }, { "epoch": 0.24241426159990118, "grad_norm": 3.4117109775543213, "learning_rate": 8.753267592211616e-05, "loss": 2.483, "step": 736 }, { "epoch": 0.24274362880316191, "grad_norm": 2.8380532264709473, "learning_rate": 8.749789899482856e-05, "loss": 2.4988, "step": 737 }, { "epoch": 0.24307299600642265, "grad_norm": 2.325429916381836, "learning_rate": 8.74630805599973e-05, "loss": 1.8716, "step": 738 }, { "epoch": 0.2434023632096834, "grad_norm": 2.5336060523986816, "learning_rate": 8.742822065616393e-05, "loss": 1.8746, "step": 739 }, { "epoch": 0.24373173041294413, "grad_norm": 2.6919002532958984, "learning_rate": 8.739331932191592e-05, "loss": 2.2107, "step": 740 }, { "epoch": 0.24406109761620487, "grad_norm": 3.1390397548675537, "learning_rate": 8.735837659588661e-05, "loss": 2.5521, "step": 741 }, { "epoch": 0.2443904648194656, "grad_norm": 2.4885716438293457, "learning_rate": 8.732339251675516e-05, "loss": 2.1718, "step": 742 }, { "epoch": 0.24471983202272635, "grad_norm": 2.5541298389434814, "learning_rate": 8.728836712324646e-05, "loss": 2.1153, "step": 743 }, { "epoch": 0.24504919922598709, "grad_norm": 2.7616307735443115, "learning_rate": 8.725330045413117e-05, "loss": 2.0635, "step": 744 }, { "epoch": 0.2453785664292478, "grad_norm": 2.8875701427459717, "learning_rate": 8.721819254822565e-05, "loss": 2.3036, "step": 745 }, { "epoch": 0.24570793363250854, "grad_norm": 3.5547261238098145, "learning_rate": 8.718304344439186e-05, "loss": 2.5285, "step": 746 }, { "epoch": 0.24603730083576927, "grad_norm": 3.126332998275757, "learning_rate": 8.714785318153742e-05, "loss": 2.1983, "step": 747 }, { "epoch": 0.24636666803903, "grad_norm": 2.997291088104248, "learning_rate": 8.711262179861547e-05, "loss": 2.2518, "step": 748 }, { "epoch": 0.24669603524229075, "grad_norm": 2.786508083343506, "learning_rate": 8.70773493346247e-05, "loss": 1.8481, "step": 749 }, { "epoch": 0.2470254024455515, "grad_norm": 3.188028573989868, "learning_rate": 8.704203582860922e-05, "loss": 1.929, "step": 750 }, { "epoch": 0.24735476964881223, "grad_norm": 1.924673318862915, "learning_rate": 8.700668131965861e-05, "loss": 2.319, "step": 751 }, { "epoch": 0.24768413685207297, "grad_norm": 2.163625955581665, "learning_rate": 8.697128584690785e-05, "loss": 2.2707, "step": 752 }, { "epoch": 0.24801350405533368, "grad_norm": 2.0983192920684814, "learning_rate": 8.693584944953723e-05, "loss": 2.4019, "step": 753 }, { "epoch": 0.24834287125859442, "grad_norm": 2.2360665798187256, "learning_rate": 8.690037216677236e-05, "loss": 2.5004, "step": 754 }, { "epoch": 0.24867223846185516, "grad_norm": 2.301961660385132, "learning_rate": 8.686485403788411e-05, "loss": 2.2912, "step": 755 }, { "epoch": 0.2490016056651159, "grad_norm": 2.593334436416626, "learning_rate": 8.682929510218855e-05, "loss": 2.7432, "step": 756 }, { "epoch": 0.24933097286837663, "grad_norm": 2.3250508308410645, "learning_rate": 8.679369539904693e-05, "loss": 2.525, "step": 757 }, { "epoch": 0.24966034007163737, "grad_norm": 2.4877517223358154, "learning_rate": 8.675805496786563e-05, "loss": 2.2531, "step": 758 }, { "epoch": 0.2499897072748981, "grad_norm": 2.423072099685669, "learning_rate": 8.672237384809609e-05, "loss": 2.3282, "step": 759 }, { "epoch": 0.2499897072748981, "eval_loss": 2.2632205486297607, "eval_runtime": 795.2953, "eval_samples_per_second": 3.215, "eval_steps_per_second": 1.608, "step": 759 }, { "epoch": 0.2503190744781588, "grad_norm": 2.326564073562622, "learning_rate": 8.668665207923482e-05, "loss": 2.0632, "step": 760 }, { "epoch": 0.2506484416814196, "grad_norm": 2.884378671646118, "learning_rate": 8.665088970082331e-05, "loss": 2.5123, "step": 761 }, { "epoch": 0.2509778088846803, "grad_norm": 2.3617143630981445, "learning_rate": 8.661508675244801e-05, "loss": 2.1108, "step": 762 }, { "epoch": 0.25130717608794106, "grad_norm": 2.4837584495544434, "learning_rate": 8.657924327374025e-05, "loss": 2.3088, "step": 763 }, { "epoch": 0.2516365432912018, "grad_norm": 2.4218530654907227, "learning_rate": 8.654335930437627e-05, "loss": 1.8391, "step": 764 }, { "epoch": 0.25196591049446254, "grad_norm": 2.5416500568389893, "learning_rate": 8.650743488407709e-05, "loss": 2.1901, "step": 765 }, { "epoch": 0.25229527769772325, "grad_norm": 2.6304333209991455, "learning_rate": 8.647147005260854e-05, "loss": 2.4257, "step": 766 }, { "epoch": 0.25262464490098396, "grad_norm": 3.04967999458313, "learning_rate": 8.643546484978115e-05, "loss": 2.1485, "step": 767 }, { "epoch": 0.25295401210424473, "grad_norm": 2.736220359802246, "learning_rate": 8.639941931545017e-05, "loss": 2.0484, "step": 768 }, { "epoch": 0.25328337930750544, "grad_norm": 2.4498915672302246, "learning_rate": 8.636333348951546e-05, "loss": 1.9845, "step": 769 }, { "epoch": 0.2536127465107662, "grad_norm": 3.074000597000122, "learning_rate": 8.63272074119215e-05, "loss": 2.0217, "step": 770 }, { "epoch": 0.2539421137140269, "grad_norm": 2.7875514030456543, "learning_rate": 8.629104112265735e-05, "loss": 2.1935, "step": 771 }, { "epoch": 0.2542714809172877, "grad_norm": 2.7638540267944336, "learning_rate": 8.625483466175652e-05, "loss": 2.1215, "step": 772 }, { "epoch": 0.2546008481205484, "grad_norm": 2.99078106880188, "learning_rate": 8.621858806929705e-05, "loss": 1.8759, "step": 773 }, { "epoch": 0.2549302153238091, "grad_norm": 3.0909841060638428, "learning_rate": 8.618230138540136e-05, "loss": 1.8021, "step": 774 }, { "epoch": 0.2552595825270699, "grad_norm": 3.437960147857666, "learning_rate": 8.614597465023626e-05, "loss": 1.9589, "step": 775 }, { "epoch": 0.2555889497303306, "grad_norm": 2.2460529804229736, "learning_rate": 8.61096079040129e-05, "loss": 2.2812, "step": 776 }, { "epoch": 0.25591831693359135, "grad_norm": 2.4062023162841797, "learning_rate": 8.607320118698674e-05, "loss": 2.1588, "step": 777 }, { "epoch": 0.25624768413685206, "grad_norm": 3.011057138442993, "learning_rate": 8.603675453945742e-05, "loss": 2.5091, "step": 778 }, { "epoch": 0.2565770513401128, "grad_norm": 2.4397964477539062, "learning_rate": 8.600026800176885e-05, "loss": 2.4397, "step": 779 }, { "epoch": 0.25690641854337354, "grad_norm": 2.548110246658325, "learning_rate": 8.596374161430907e-05, "loss": 2.4302, "step": 780 }, { "epoch": 0.2572357857466343, "grad_norm": 2.477574586868286, "learning_rate": 8.592717541751021e-05, "loss": 2.2464, "step": 781 }, { "epoch": 0.257565152949895, "grad_norm": 2.5044217109680176, "learning_rate": 8.58905694518485e-05, "loss": 2.13, "step": 782 }, { "epoch": 0.2578945201531557, "grad_norm": 2.3912570476531982, "learning_rate": 8.585392375784418e-05, "loss": 2.3024, "step": 783 }, { "epoch": 0.2582238873564165, "grad_norm": 2.51187801361084, "learning_rate": 8.581723837606144e-05, "loss": 2.3613, "step": 784 }, { "epoch": 0.2585532545596772, "grad_norm": 2.4491524696350098, "learning_rate": 8.578051334710844e-05, "loss": 2.0575, "step": 785 }, { "epoch": 0.25888262176293797, "grad_norm": 2.263449192047119, "learning_rate": 8.574374871163721e-05, "loss": 2.0833, "step": 786 }, { "epoch": 0.2592119889661987, "grad_norm": 2.714989423751831, "learning_rate": 8.570694451034362e-05, "loss": 2.2463, "step": 787 }, { "epoch": 0.25954135616945945, "grad_norm": 2.5804946422576904, "learning_rate": 8.567010078396735e-05, "loss": 2.2576, "step": 788 }, { "epoch": 0.25987072337272016, "grad_norm": 2.759988784790039, "learning_rate": 8.56332175732918e-05, "loss": 2.2353, "step": 789 }, { "epoch": 0.26020009057598087, "grad_norm": 2.70926570892334, "learning_rate": 8.559629491914412e-05, "loss": 2.0866, "step": 790 }, { "epoch": 0.26052945777924164, "grad_norm": 3.139805555343628, "learning_rate": 8.555933286239508e-05, "loss": 2.4547, "step": 791 }, { "epoch": 0.26085882498250235, "grad_norm": 2.390231132507324, "learning_rate": 8.552233144395907e-05, "loss": 1.9382, "step": 792 }, { "epoch": 0.2611881921857631, "grad_norm": 2.88167142868042, "learning_rate": 8.54852907047941e-05, "loss": 2.1961, "step": 793 }, { "epoch": 0.2615175593890238, "grad_norm": 2.3817718029022217, "learning_rate": 8.544821068590165e-05, "loss": 1.7874, "step": 794 }, { "epoch": 0.2618469265922846, "grad_norm": 2.8125686645507812, "learning_rate": 8.54110914283267e-05, "loss": 2.1755, "step": 795 }, { "epoch": 0.2621762937955453, "grad_norm": 2.719560384750366, "learning_rate": 8.537393297315767e-05, "loss": 2.0847, "step": 796 }, { "epoch": 0.26250566099880607, "grad_norm": 2.7798001766204834, "learning_rate": 8.533673536152638e-05, "loss": 1.8334, "step": 797 }, { "epoch": 0.2628350282020668, "grad_norm": 4.041043758392334, "learning_rate": 8.529949863460793e-05, "loss": 2.3578, "step": 798 }, { "epoch": 0.2631643954053275, "grad_norm": 3.4026458263397217, "learning_rate": 8.52622228336208e-05, "loss": 2.0998, "step": 799 }, { "epoch": 0.26349376260858826, "grad_norm": 4.440029144287109, "learning_rate": 8.522490799982669e-05, "loss": 1.7791, "step": 800 }, { "epoch": 0.26382312981184897, "grad_norm": 2.11838436126709, "learning_rate": 8.518755417453048e-05, "loss": 2.4855, "step": 801 }, { "epoch": 0.26415249701510973, "grad_norm": 2.5172719955444336, "learning_rate": 8.515016139908024e-05, "loss": 2.2894, "step": 802 }, { "epoch": 0.26448186421837044, "grad_norm": 2.1991279125213623, "learning_rate": 8.511272971486717e-05, "loss": 2.2683, "step": 803 }, { "epoch": 0.2648112314216312, "grad_norm": 2.425428867340088, "learning_rate": 8.507525916332549e-05, "loss": 2.2494, "step": 804 }, { "epoch": 0.2651405986248919, "grad_norm": 2.5921618938446045, "learning_rate": 8.503774978593248e-05, "loss": 2.2534, "step": 805 }, { "epoch": 0.2654699658281527, "grad_norm": 2.6209821701049805, "learning_rate": 8.500020162420841e-05, "loss": 2.3588, "step": 806 }, { "epoch": 0.2657993330314134, "grad_norm": 2.933804750442505, "learning_rate": 8.496261471971645e-05, "loss": 2.5508, "step": 807 }, { "epoch": 0.2661287002346741, "grad_norm": 2.4999735355377197, "learning_rate": 8.492498911406266e-05, "loss": 2.0871, "step": 808 }, { "epoch": 0.2664580674379349, "grad_norm": 2.6054482460021973, "learning_rate": 8.488732484889594e-05, "loss": 2.1609, "step": 809 }, { "epoch": 0.2667874346411956, "grad_norm": 2.588648557662964, "learning_rate": 8.4849621965908e-05, "loss": 2.2175, "step": 810 }, { "epoch": 0.26711680184445635, "grad_norm": 2.7105255126953125, "learning_rate": 8.481188050683328e-05, "loss": 2.1727, "step": 811 }, { "epoch": 0.26744616904771706, "grad_norm": 2.858750581741333, "learning_rate": 8.477410051344895e-05, "loss": 2.6405, "step": 812 }, { "epoch": 0.26777553625097783, "grad_norm": 2.562743663787842, "learning_rate": 8.473628202757478e-05, "loss": 2.2222, "step": 813 }, { "epoch": 0.26810490345423854, "grad_norm": 2.7561135292053223, "learning_rate": 8.46984250910732e-05, "loss": 2.3755, "step": 814 }, { "epoch": 0.26843427065749925, "grad_norm": 2.5868582725524902, "learning_rate": 8.466052974584918e-05, "loss": 2.3861, "step": 815 }, { "epoch": 0.26876363786076, "grad_norm": 2.836190938949585, "learning_rate": 8.46225960338502e-05, "loss": 2.1303, "step": 816 }, { "epoch": 0.26909300506402073, "grad_norm": 3.028157949447632, "learning_rate": 8.458462399706623e-05, "loss": 2.2734, "step": 817 }, { "epoch": 0.2694223722672815, "grad_norm": 2.804187536239624, "learning_rate": 8.454661367752965e-05, "loss": 2.1357, "step": 818 }, { "epoch": 0.2697517394705422, "grad_norm": 2.80621075630188, "learning_rate": 8.450856511731519e-05, "loss": 1.732, "step": 819 }, { "epoch": 0.270081106673803, "grad_norm": 2.5028510093688965, "learning_rate": 8.447047835853999e-05, "loss": 1.8806, "step": 820 }, { "epoch": 0.2704104738770637, "grad_norm": 2.597686529159546, "learning_rate": 8.443235344336337e-05, "loss": 2.1617, "step": 821 }, { "epoch": 0.27073984108032445, "grad_norm": 2.7407145500183105, "learning_rate": 8.439419041398698e-05, "loss": 2.157, "step": 822 }, { "epoch": 0.27106920828358516, "grad_norm": 3.096123695373535, "learning_rate": 8.435598931265459e-05, "loss": 1.9549, "step": 823 }, { "epoch": 0.27139857548684587, "grad_norm": 4.0686516761779785, "learning_rate": 8.431775018165211e-05, "loss": 2.139, "step": 824 }, { "epoch": 0.27172794269010664, "grad_norm": 2.7538039684295654, "learning_rate": 8.427947306330764e-05, "loss": 1.8598, "step": 825 }, { "epoch": 0.27205730989336735, "grad_norm": 2.3142900466918945, "learning_rate": 8.424115799999122e-05, "loss": 2.3159, "step": 826 }, { "epoch": 0.2723866770966281, "grad_norm": 2.2066521644592285, "learning_rate": 8.420280503411495e-05, "loss": 2.2287, "step": 827 }, { "epoch": 0.2727160442998888, "grad_norm": 2.188645839691162, "learning_rate": 8.416441420813288e-05, "loss": 2.2174, "step": 828 }, { "epoch": 0.2730454115031496, "grad_norm": 2.808419704437256, "learning_rate": 8.412598556454096e-05, "loss": 2.6173, "step": 829 }, { "epoch": 0.2733747787064103, "grad_norm": 2.461317300796509, "learning_rate": 8.408751914587699e-05, "loss": 2.5954, "step": 830 }, { "epoch": 0.27370414590967107, "grad_norm": 2.9252431392669678, "learning_rate": 8.40490149947206e-05, "loss": 2.4617, "step": 831 }, { "epoch": 0.2740335131129318, "grad_norm": 2.5286619663238525, "learning_rate": 8.401047315369319e-05, "loss": 2.5215, "step": 832 }, { "epoch": 0.2743628803161925, "grad_norm": 2.650164842605591, "learning_rate": 8.397189366545786e-05, "loss": 2.1433, "step": 833 }, { "epoch": 0.27469224751945326, "grad_norm": 2.2741010189056396, "learning_rate": 8.393327657271944e-05, "loss": 2.0067, "step": 834 }, { "epoch": 0.27502161472271397, "grad_norm": 2.3971540927886963, "learning_rate": 8.389462191822428e-05, "loss": 2.2778, "step": 835 }, { "epoch": 0.27535098192597474, "grad_norm": 2.9199283123016357, "learning_rate": 8.385592974476042e-05, "loss": 2.375, "step": 836 }, { "epoch": 0.27568034912923545, "grad_norm": 2.3536462783813477, "learning_rate": 8.381720009515736e-05, "loss": 2.2465, "step": 837 }, { "epoch": 0.2760097163324962, "grad_norm": 2.7381389141082764, "learning_rate": 8.377843301228611e-05, "loss": 2.201, "step": 838 }, { "epoch": 0.2763390835357569, "grad_norm": 2.795403003692627, "learning_rate": 8.373962853905912e-05, "loss": 2.1858, "step": 839 }, { "epoch": 0.27666845073901764, "grad_norm": 2.549487829208374, "learning_rate": 8.37007867184302e-05, "loss": 2.0456, "step": 840 }, { "epoch": 0.2769978179422784, "grad_norm": 2.419311046600342, "learning_rate": 8.366190759339453e-05, "loss": 1.8163, "step": 841 }, { "epoch": 0.2773271851455391, "grad_norm": 2.8960893154144287, "learning_rate": 8.362299120698858e-05, "loss": 2.0316, "step": 842 }, { "epoch": 0.2776565523487999, "grad_norm": 2.440534830093384, "learning_rate": 8.358403760229006e-05, "loss": 1.8316, "step": 843 }, { "epoch": 0.2779859195520606, "grad_norm": 2.9307198524475098, "learning_rate": 8.354504682241786e-05, "loss": 2.5626, "step": 844 }, { "epoch": 0.27831528675532136, "grad_norm": 2.7770323753356934, "learning_rate": 8.350601891053207e-05, "loss": 2.2409, "step": 845 }, { "epoch": 0.27864465395858207, "grad_norm": 2.7087655067443848, "learning_rate": 8.346695390983382e-05, "loss": 1.9896, "step": 846 }, { "epoch": 0.27897402116184283, "grad_norm": 2.673274517059326, "learning_rate": 8.342785186356534e-05, "loss": 2.0128, "step": 847 }, { "epoch": 0.27930338836510354, "grad_norm": 3.0754430294036865, "learning_rate": 8.338871281500984e-05, "loss": 2.008, "step": 848 }, { "epoch": 0.27963275556836426, "grad_norm": 2.8769547939300537, "learning_rate": 8.334953680749152e-05, "loss": 1.8857, "step": 849 }, { "epoch": 0.279962122771625, "grad_norm": 3.0530593395233154, "learning_rate": 8.331032388437546e-05, "loss": 1.967, "step": 850 }, { "epoch": 0.28029148997488573, "grad_norm": 1.8923742771148682, "learning_rate": 8.32710740890676e-05, "loss": 2.4918, "step": 851 }, { "epoch": 0.2806208571781465, "grad_norm": 2.0591697692871094, "learning_rate": 8.323178746501475e-05, "loss": 2.2662, "step": 852 }, { "epoch": 0.2809502243814072, "grad_norm": 2.4539167881011963, "learning_rate": 8.319246405570441e-05, "loss": 2.4903, "step": 853 }, { "epoch": 0.281279591584668, "grad_norm": 2.1013600826263428, "learning_rate": 8.315310390466487e-05, "loss": 2.1026, "step": 854 }, { "epoch": 0.2816089587879287, "grad_norm": 2.0880558490753174, "learning_rate": 8.311370705546501e-05, "loss": 2.3886, "step": 855 }, { "epoch": 0.28193832599118945, "grad_norm": 2.4675230979919434, "learning_rate": 8.307427355171443e-05, "loss": 2.2145, "step": 856 }, { "epoch": 0.28226769319445016, "grad_norm": 2.3811771869659424, "learning_rate": 8.303480343706321e-05, "loss": 2.6376, "step": 857 }, { "epoch": 0.2825970603977109, "grad_norm": 2.4288272857666016, "learning_rate": 8.299529675520201e-05, "loss": 2.2381, "step": 858 }, { "epoch": 0.28292642760097164, "grad_norm": 2.4424099922180176, "learning_rate": 8.295575354986196e-05, "loss": 2.4726, "step": 859 }, { "epoch": 0.28325579480423235, "grad_norm": 2.6503515243530273, "learning_rate": 8.291617386481463e-05, "loss": 2.4256, "step": 860 }, { "epoch": 0.2835851620074931, "grad_norm": 2.2516067028045654, "learning_rate": 8.287655774387193e-05, "loss": 1.9415, "step": 861 }, { "epoch": 0.28391452921075383, "grad_norm": 2.435704231262207, "learning_rate": 8.283690523088613e-05, "loss": 2.0356, "step": 862 }, { "epoch": 0.2842438964140146, "grad_norm": 2.6750681400299072, "learning_rate": 8.279721636974978e-05, "loss": 2.2815, "step": 863 }, { "epoch": 0.2845732636172753, "grad_norm": 3.6107234954833984, "learning_rate": 8.275749120439569e-05, "loss": 2.2028, "step": 864 }, { "epoch": 0.284902630820536, "grad_norm": 2.5893361568450928, "learning_rate": 8.27177297787968e-05, "loss": 2.1027, "step": 865 }, { "epoch": 0.2852319980237968, "grad_norm": 2.4202561378479004, "learning_rate": 8.267793213696624e-05, "loss": 2.0077, "step": 866 }, { "epoch": 0.2855613652270575, "grad_norm": 2.8321430683135986, "learning_rate": 8.263809832295719e-05, "loss": 1.9374, "step": 867 }, { "epoch": 0.28589073243031826, "grad_norm": 2.830946683883667, "learning_rate": 8.25982283808629e-05, "loss": 2.3701, "step": 868 }, { "epoch": 0.286220099633579, "grad_norm": 2.626882791519165, "learning_rate": 8.255832235481659e-05, "loss": 2.2236, "step": 869 }, { "epoch": 0.28654946683683974, "grad_norm": 2.398036003112793, "learning_rate": 8.251838028899143e-05, "loss": 2.2344, "step": 870 }, { "epoch": 0.28687883404010045, "grad_norm": 3.660982608795166, "learning_rate": 8.247840222760052e-05, "loss": 2.2156, "step": 871 }, { "epoch": 0.2872082012433612, "grad_norm": 3.3050460815429688, "learning_rate": 8.243838821489671e-05, "loss": 2.0017, "step": 872 }, { "epoch": 0.2875375684466219, "grad_norm": 3.096376895904541, "learning_rate": 8.239833829517276e-05, "loss": 1.9194, "step": 873 }, { "epoch": 0.28786693564988264, "grad_norm": 2.761716365814209, "learning_rate": 8.235825251276108e-05, "loss": 1.9807, "step": 874 }, { "epoch": 0.2881963028531434, "grad_norm": 3.867349624633789, "learning_rate": 8.231813091203385e-05, "loss": 1.9222, "step": 875 }, { "epoch": 0.2885256700564041, "grad_norm": 2.335017442703247, "learning_rate": 8.227797353740286e-05, "loss": 2.4247, "step": 876 }, { "epoch": 0.2888550372596649, "grad_norm": 2.4474287033081055, "learning_rate": 8.223778043331948e-05, "loss": 2.6098, "step": 877 }, { "epoch": 0.2891844044629256, "grad_norm": 2.22808575630188, "learning_rate": 8.219755164427469e-05, "loss": 2.2784, "step": 878 }, { "epoch": 0.28951377166618636, "grad_norm": 2.265848398208618, "learning_rate": 8.215728721479892e-05, "loss": 2.2767, "step": 879 }, { "epoch": 0.28984313886944707, "grad_norm": 2.5640437602996826, "learning_rate": 8.211698718946208e-05, "loss": 1.9147, "step": 880 }, { "epoch": 0.29017250607270784, "grad_norm": 2.4126553535461426, "learning_rate": 8.207665161287345e-05, "loss": 2.2837, "step": 881 }, { "epoch": 0.29050187327596855, "grad_norm": 2.4454050064086914, "learning_rate": 8.20362805296817e-05, "loss": 2.3148, "step": 882 }, { "epoch": 0.29083124047922926, "grad_norm": 2.776961088180542, "learning_rate": 8.19958739845748e-05, "loss": 2.7321, "step": 883 }, { "epoch": 0.29116060768249, "grad_norm": 2.721742630004883, "learning_rate": 8.195543202227993e-05, "loss": 2.3516, "step": 884 }, { "epoch": 0.29148997488575074, "grad_norm": 2.417945384979248, "learning_rate": 8.191495468756354e-05, "loss": 2.0815, "step": 885 }, { "epoch": 0.2918193420890115, "grad_norm": 2.7946279048919678, "learning_rate": 8.187444202523116e-05, "loss": 2.5238, "step": 886 }, { "epoch": 0.2921487092922722, "grad_norm": 2.2200636863708496, "learning_rate": 8.183389408012752e-05, "loss": 2.2116, "step": 887 }, { "epoch": 0.292478076495533, "grad_norm": 2.7158875465393066, "learning_rate": 8.179331089713629e-05, "loss": 2.2724, "step": 888 }, { "epoch": 0.2928074436987937, "grad_norm": 3.0811691284179688, "learning_rate": 8.175269252118023e-05, "loss": 2.481, "step": 889 }, { "epoch": 0.2931368109020544, "grad_norm": 2.898715019226074, "learning_rate": 8.171203899722105e-05, "loss": 1.9759, "step": 890 }, { "epoch": 0.29346617810531517, "grad_norm": 3.110511541366577, "learning_rate": 8.167135037025933e-05, "loss": 2.3635, "step": 891 }, { "epoch": 0.2937955453085759, "grad_norm": 2.4876153469085693, "learning_rate": 8.163062668533454e-05, "loss": 2.2224, "step": 892 }, { "epoch": 0.29412491251183664, "grad_norm": 2.962139844894409, "learning_rate": 8.158986798752492e-05, "loss": 2.1494, "step": 893 }, { "epoch": 0.29445427971509736, "grad_norm": 2.6857833862304688, "learning_rate": 8.154907432194751e-05, "loss": 1.9066, "step": 894 }, { "epoch": 0.2947836469183581, "grad_norm": 3.1264843940734863, "learning_rate": 8.150824573375804e-05, "loss": 2.1503, "step": 895 }, { "epoch": 0.29511301412161883, "grad_norm": 3.0480618476867676, "learning_rate": 8.146738226815087e-05, "loss": 2.2263, "step": 896 }, { "epoch": 0.2954423813248796, "grad_norm": 2.6517231464385986, "learning_rate": 8.142648397035899e-05, "loss": 1.98, "step": 897 }, { "epoch": 0.2957717485281403, "grad_norm": 2.765357255935669, "learning_rate": 8.138555088565398e-05, "loss": 1.8292, "step": 898 }, { "epoch": 0.296101115731401, "grad_norm": 2.788534641265869, "learning_rate": 8.134458305934587e-05, "loss": 1.7272, "step": 899 }, { "epoch": 0.2964304829346618, "grad_norm": 3.0059924125671387, "learning_rate": 8.130358053678315e-05, "loss": 2.0136, "step": 900 }, { "epoch": 0.2967598501379225, "grad_norm": 2.0941505432128906, "learning_rate": 8.126254336335279e-05, "loss": 2.5732, "step": 901 }, { "epoch": 0.29708921734118326, "grad_norm": 2.0665740966796875, "learning_rate": 8.122147158448002e-05, "loss": 2.4054, "step": 902 }, { "epoch": 0.297418584544444, "grad_norm": 2.2430355548858643, "learning_rate": 8.118036524562841e-05, "loss": 2.4054, "step": 903 }, { "epoch": 0.29774795174770474, "grad_norm": 2.2699286937713623, "learning_rate": 8.113922439229982e-05, "loss": 2.1162, "step": 904 }, { "epoch": 0.29807731895096545, "grad_norm": 2.1671202182769775, "learning_rate": 8.109804907003429e-05, "loss": 2.3838, "step": 905 }, { "epoch": 0.2984066861542262, "grad_norm": 2.175811529159546, "learning_rate": 8.105683932441e-05, "loss": 2.0848, "step": 906 }, { "epoch": 0.29873605335748693, "grad_norm": 3.168586492538452, "learning_rate": 8.101559520104323e-05, "loss": 2.4891, "step": 907 }, { "epoch": 0.29906542056074764, "grad_norm": 2.39568829536438, "learning_rate": 8.097431674558838e-05, "loss": 2.3179, "step": 908 }, { "epoch": 0.2993947877640084, "grad_norm": 2.7230722904205322, "learning_rate": 8.093300400373775e-05, "loss": 2.2789, "step": 909 }, { "epoch": 0.2997241549672691, "grad_norm": 2.686748504638672, "learning_rate": 8.08916570212217e-05, "loss": 2.1697, "step": 910 }, { "epoch": 0.3000535221705299, "grad_norm": 2.3742809295654297, "learning_rate": 8.08502758438084e-05, "loss": 1.9485, "step": 911 }, { "epoch": 0.3003828893737906, "grad_norm": 2.234506607055664, "learning_rate": 8.080886051730391e-05, "loss": 2.2139, "step": 912 }, { "epoch": 0.30071225657705136, "grad_norm": 2.438096284866333, "learning_rate": 8.076741108755212e-05, "loss": 2.338, "step": 913 }, { "epoch": 0.3010416237803121, "grad_norm": 2.3559389114379883, "learning_rate": 8.072592760043463e-05, "loss": 2.3608, "step": 914 }, { "epoch": 0.3013709909835728, "grad_norm": 2.8507134914398193, "learning_rate": 8.068441010187073e-05, "loss": 2.3359, "step": 915 }, { "epoch": 0.30170035818683355, "grad_norm": 3.0053601264953613, "learning_rate": 8.06428586378174e-05, "loss": 2.2598, "step": 916 }, { "epoch": 0.30202972539009426, "grad_norm": 2.6545512676239014, "learning_rate": 8.06012732542692e-05, "loss": 2.2021, "step": 917 }, { "epoch": 0.302359092593355, "grad_norm": 2.2311315536499023, "learning_rate": 8.05596539972582e-05, "loss": 2.0849, "step": 918 }, { "epoch": 0.30268845979661574, "grad_norm": 3.3348686695098877, "learning_rate": 8.051800091285404e-05, "loss": 2.2344, "step": 919 }, { "epoch": 0.3030178269998765, "grad_norm": 2.89279842376709, "learning_rate": 8.047631404716374e-05, "loss": 2.5125, "step": 920 }, { "epoch": 0.3033471942031372, "grad_norm": 2.6193151473999023, "learning_rate": 8.043459344633173e-05, "loss": 1.7652, "step": 921 }, { "epoch": 0.303676561406398, "grad_norm": 2.6484382152557373, "learning_rate": 8.039283915653979e-05, "loss": 1.9555, "step": 922 }, { "epoch": 0.3040059286096587, "grad_norm": 2.553636312484741, "learning_rate": 8.035105122400701e-05, "loss": 1.8701, "step": 923 }, { "epoch": 0.3043352958129194, "grad_norm": 3.712156057357788, "learning_rate": 8.030922969498968e-05, "loss": 2.2537, "step": 924 }, { "epoch": 0.30466466301618017, "grad_norm": 3.272002696990967, "learning_rate": 8.026737461578132e-05, "loss": 2.1644, "step": 925 }, { "epoch": 0.3049940302194409, "grad_norm": 2.4612069129943848, "learning_rate": 8.022548603271252e-05, "loss": 2.4379, "step": 926 }, { "epoch": 0.30532339742270165, "grad_norm": 3.1458165645599365, "learning_rate": 8.018356399215104e-05, "loss": 2.6069, "step": 927 }, { "epoch": 0.30565276462596236, "grad_norm": 2.0101094245910645, "learning_rate": 8.014160854050164e-05, "loss": 2.185, "step": 928 }, { "epoch": 0.3059821318292231, "grad_norm": 2.1659107208251953, "learning_rate": 8.009961972420607e-05, "loss": 2.1568, "step": 929 }, { "epoch": 0.30631149903248384, "grad_norm": 2.243806838989258, "learning_rate": 8.005759758974296e-05, "loss": 2.4728, "step": 930 }, { "epoch": 0.3066408662357446, "grad_norm": 2.525348424911499, "learning_rate": 8.001554218362791e-05, "loss": 2.6265, "step": 931 }, { "epoch": 0.3069702334390053, "grad_norm": 2.3284335136413574, "learning_rate": 7.997345355241328e-05, "loss": 2.3675, "step": 932 }, { "epoch": 0.307299600642266, "grad_norm": 2.4215810298919678, "learning_rate": 7.993133174268826e-05, "loss": 2.1221, "step": 933 }, { "epoch": 0.3076289678455268, "grad_norm": 2.8312551975250244, "learning_rate": 7.988917680107871e-05, "loss": 2.5939, "step": 934 }, { "epoch": 0.3079583350487875, "grad_norm": 2.665449857711792, "learning_rate": 7.984698877424718e-05, "loss": 2.24, "step": 935 }, { "epoch": 0.30828770225204827, "grad_norm": 2.5410232543945312, "learning_rate": 7.980476770889289e-05, "loss": 2.2135, "step": 936 }, { "epoch": 0.308617069455309, "grad_norm": 2.556459426879883, "learning_rate": 7.976251365175158e-05, "loss": 2.2969, "step": 937 }, { "epoch": 0.30894643665856975, "grad_norm": 3.4073662757873535, "learning_rate": 7.972022664959554e-05, "loss": 2.3814, "step": 938 }, { "epoch": 0.30927580386183046, "grad_norm": 2.888021230697632, "learning_rate": 7.96779067492335e-05, "loss": 2.4569, "step": 939 }, { "epoch": 0.30960517106509117, "grad_norm": 2.819286584854126, "learning_rate": 7.963555399751063e-05, "loss": 2.1128, "step": 940 }, { "epoch": 0.30993453826835193, "grad_norm": 2.760483741760254, "learning_rate": 7.959316844130846e-05, "loss": 2.4026, "step": 941 }, { "epoch": 0.31026390547161264, "grad_norm": 2.6997532844543457, "learning_rate": 7.95507501275448e-05, "loss": 2.0134, "step": 942 }, { "epoch": 0.3105932726748734, "grad_norm": 2.7757112979888916, "learning_rate": 7.950829910317379e-05, "loss": 2.207, "step": 943 }, { "epoch": 0.3109226398781341, "grad_norm": 2.5524275302886963, "learning_rate": 7.946581541518569e-05, "loss": 1.7721, "step": 944 }, { "epoch": 0.3112520070813949, "grad_norm": 2.4421706199645996, "learning_rate": 7.942329911060703e-05, "loss": 2.1257, "step": 945 }, { "epoch": 0.3115813742846556, "grad_norm": 2.7249844074249268, "learning_rate": 7.938075023650029e-05, "loss": 1.9599, "step": 946 }, { "epoch": 0.31191074148791637, "grad_norm": 2.9589931964874268, "learning_rate": 7.933816883996415e-05, "loss": 2.3046, "step": 947 }, { "epoch": 0.3122401086911771, "grad_norm": 2.614107608795166, "learning_rate": 7.92955549681332e-05, "loss": 1.8874, "step": 948 }, { "epoch": 0.3125694758944378, "grad_norm": 3.2241110801696777, "learning_rate": 7.925290866817802e-05, "loss": 2.3943, "step": 949 }, { "epoch": 0.31289884309769855, "grad_norm": 3.045088291168213, "learning_rate": 7.921022998730507e-05, "loss": 1.961, "step": 950 }, { "epoch": 0.31322821030095926, "grad_norm": 2.1225154399871826, "learning_rate": 7.916751897275665e-05, "loss": 2.749, "step": 951 }, { "epoch": 0.31355757750422003, "grad_norm": 2.060499668121338, "learning_rate": 7.912477567181086e-05, "loss": 2.2708, "step": 952 }, { "epoch": 0.31388694470748074, "grad_norm": 2.4853827953338623, "learning_rate": 7.908200013178156e-05, "loss": 2.2059, "step": 953 }, { "epoch": 0.3142163119107415, "grad_norm": 2.317662477493286, "learning_rate": 7.903919240001824e-05, "loss": 2.4268, "step": 954 }, { "epoch": 0.3145456791140022, "grad_norm": 2.5555713176727295, "learning_rate": 7.899635252390606e-05, "loss": 2.4417, "step": 955 }, { "epoch": 0.314875046317263, "grad_norm": 2.6387650966644287, "learning_rate": 7.895348055086577e-05, "loss": 2.3655, "step": 956 }, { "epoch": 0.3152044135205237, "grad_norm": 2.4570939540863037, "learning_rate": 7.891057652835361e-05, "loss": 2.4014, "step": 957 }, { "epoch": 0.3155337807237844, "grad_norm": 2.4972894191741943, "learning_rate": 7.886764050386135e-05, "loss": 2.3593, "step": 958 }, { "epoch": 0.3158631479270452, "grad_norm": 2.601262092590332, "learning_rate": 7.882467252491617e-05, "loss": 2.4721, "step": 959 }, { "epoch": 0.3161925151303059, "grad_norm": 2.5913026332855225, "learning_rate": 7.878167263908056e-05, "loss": 2.2818, "step": 960 }, { "epoch": 0.31652188233356665, "grad_norm": 2.6688802242279053, "learning_rate": 7.873864089395243e-05, "loss": 2.1604, "step": 961 }, { "epoch": 0.31685124953682736, "grad_norm": 2.4998068809509277, "learning_rate": 7.869557733716488e-05, "loss": 1.95, "step": 962 }, { "epoch": 0.31718061674008813, "grad_norm": 2.5691847801208496, "learning_rate": 7.865248201638623e-05, "loss": 2.1274, "step": 963 }, { "epoch": 0.31750998394334884, "grad_norm": 2.2748234272003174, "learning_rate": 7.860935497932e-05, "loss": 1.9988, "step": 964 }, { "epoch": 0.31783935114660955, "grad_norm": 2.8164100646972656, "learning_rate": 7.856619627370479e-05, "loss": 2.3755, "step": 965 }, { "epoch": 0.3181687183498703, "grad_norm": 2.8128836154937744, "learning_rate": 7.852300594731425e-05, "loss": 2.1516, "step": 966 }, { "epoch": 0.318498085553131, "grad_norm": 2.7594070434570312, "learning_rate": 7.847978404795704e-05, "loss": 2.177, "step": 967 }, { "epoch": 0.3188274527563918, "grad_norm": 2.687586784362793, "learning_rate": 7.843653062347679e-05, "loss": 2.4126, "step": 968 }, { "epoch": 0.3191568199596525, "grad_norm": 2.610460042953491, "learning_rate": 7.8393245721752e-05, "loss": 2.3253, "step": 969 }, { "epoch": 0.31948618716291327, "grad_norm": 2.525682210922241, "learning_rate": 7.8349929390696e-05, "loss": 1.9292, "step": 970 }, { "epoch": 0.319815554366174, "grad_norm": 3.1786725521087646, "learning_rate": 7.830658167825696e-05, "loss": 1.956, "step": 971 }, { "epoch": 0.32014492156943475, "grad_norm": 2.809406280517578, "learning_rate": 7.826320263241771e-05, "loss": 2.1892, "step": 972 }, { "epoch": 0.32047428877269546, "grad_norm": 3.148411750793457, "learning_rate": 7.821979230119587e-05, "loss": 2.0155, "step": 973 }, { "epoch": 0.32080365597595617, "grad_norm": 3.3248815536499023, "learning_rate": 7.81763507326436e-05, "loss": 1.7539, "step": 974 }, { "epoch": 0.32113302317921694, "grad_norm": 3.2047150135040283, "learning_rate": 7.813287797484768e-05, "loss": 2.028, "step": 975 }, { "epoch": 0.32146239038247765, "grad_norm": 2.169050455093384, "learning_rate": 7.808937407592938e-05, "loss": 2.4696, "step": 976 }, { "epoch": 0.3217917575857384, "grad_norm": 2.0645432472229004, "learning_rate": 7.804583908404448e-05, "loss": 2.1029, "step": 977 }, { "epoch": 0.3221211247889991, "grad_norm": 2.496066093444824, "learning_rate": 7.800227304738317e-05, "loss": 2.3193, "step": 978 }, { "epoch": 0.3224504919922599, "grad_norm": 2.2872400283813477, "learning_rate": 7.795867601416998e-05, "loss": 2.4506, "step": 979 }, { "epoch": 0.3227798591955206, "grad_norm": 2.6444544792175293, "learning_rate": 7.791504803266377e-05, "loss": 2.818, "step": 980 }, { "epoch": 0.32310922639878137, "grad_norm": 2.2178843021392822, "learning_rate": 7.787138915115768e-05, "loss": 1.9535, "step": 981 }, { "epoch": 0.3234385936020421, "grad_norm": 2.528449535369873, "learning_rate": 7.782769941797899e-05, "loss": 2.1525, "step": 982 }, { "epoch": 0.3237679608053028, "grad_norm": 2.597266435623169, "learning_rate": 7.778397888148921e-05, "loss": 2.4295, "step": 983 }, { "epoch": 0.32409732800856356, "grad_norm": 2.5413691997528076, "learning_rate": 7.774022759008386e-05, "loss": 2.3796, "step": 984 }, { "epoch": 0.32442669521182427, "grad_norm": 2.5218312740325928, "learning_rate": 7.76964455921926e-05, "loss": 2.4834, "step": 985 }, { "epoch": 0.32475606241508503, "grad_norm": 2.420651435852051, "learning_rate": 7.7652632936279e-05, "loss": 2.1914, "step": 986 }, { "epoch": 0.32508542961834574, "grad_norm": 2.723442792892456, "learning_rate": 7.760878967084059e-05, "loss": 2.5173, "step": 987 }, { "epoch": 0.3254147968216065, "grad_norm": 2.400263786315918, "learning_rate": 7.756491584440882e-05, "loss": 2.1829, "step": 988 }, { "epoch": 0.3257441640248672, "grad_norm": 2.4888710975646973, "learning_rate": 7.75210115055489e-05, "loss": 2.1607, "step": 989 }, { "epoch": 0.32607353122812793, "grad_norm": 2.6729047298431396, "learning_rate": 7.747707670285989e-05, "loss": 2.3433, "step": 990 }, { "epoch": 0.3264028984313887, "grad_norm": 2.9809768199920654, "learning_rate": 7.743311148497452e-05, "loss": 2.3124, "step": 991 }, { "epoch": 0.3267322656346494, "grad_norm": 2.287365198135376, "learning_rate": 7.73891159005592e-05, "loss": 2.0576, "step": 992 }, { "epoch": 0.3270616328379102, "grad_norm": 2.4659500122070312, "learning_rate": 7.734508999831394e-05, "loss": 2.0228, "step": 993 }, { "epoch": 0.3273910000411709, "grad_norm": 2.66711163520813, "learning_rate": 7.730103382697236e-05, "loss": 2.1277, "step": 994 }, { "epoch": 0.32772036724443165, "grad_norm": 2.902860641479492, "learning_rate": 7.725694743530153e-05, "loss": 2.3815, "step": 995 }, { "epoch": 0.32804973444769236, "grad_norm": 2.8413193225860596, "learning_rate": 7.721283087210199e-05, "loss": 2.2505, "step": 996 }, { "epoch": 0.32837910165095313, "grad_norm": 3.3534953594207764, "learning_rate": 7.716868418620768e-05, "loss": 2.2123, "step": 997 }, { "epoch": 0.32870846885421384, "grad_norm": 2.6357085704803467, "learning_rate": 7.71245074264859e-05, "loss": 1.8754, "step": 998 }, { "epoch": 0.32903783605747455, "grad_norm": 3.168962001800537, "learning_rate": 7.70803006418372e-05, "loss": 1.9192, "step": 999 }, { "epoch": 0.3293672032607353, "grad_norm": 3.3432531356811523, "learning_rate": 7.703606388119542e-05, "loss": 1.7519, "step": 1000 }, { "epoch": 0.32969657046399603, "grad_norm": 1.857453465461731, "learning_rate": 7.699179719352752e-05, "loss": 2.5652, "step": 1001 }, { "epoch": 0.3300259376672568, "grad_norm": 1.9321167469024658, "learning_rate": 7.694750062783363e-05, "loss": 2.1067, "step": 1002 }, { "epoch": 0.3303553048705175, "grad_norm": 2.123394250869751, "learning_rate": 7.690317423314696e-05, "loss": 2.311, "step": 1003 }, { "epoch": 0.3306846720737783, "grad_norm": 2.345137357711792, "learning_rate": 7.685881805853369e-05, "loss": 2.1725, "step": 1004 }, { "epoch": 0.331014039277039, "grad_norm": 2.3156449794769287, "learning_rate": 7.6814432153093e-05, "loss": 2.3528, "step": 1005 }, { "epoch": 0.33134340648029975, "grad_norm": 2.4214887619018555, "learning_rate": 7.6770016565957e-05, "loss": 2.0471, "step": 1006 }, { "epoch": 0.33167277368356046, "grad_norm": 2.3343491554260254, "learning_rate": 7.672557134629059e-05, "loss": 2.1914, "step": 1007 }, { "epoch": 0.3320021408868212, "grad_norm": 2.713235378265381, "learning_rate": 7.668109654329154e-05, "loss": 2.4468, "step": 1008 }, { "epoch": 0.33233150809008194, "grad_norm": 3.0459039211273193, "learning_rate": 7.663659220619033e-05, "loss": 2.4859, "step": 1009 }, { "epoch": 0.33266087529334265, "grad_norm": 2.7493438720703125, "learning_rate": 7.659205838425013e-05, "loss": 2.4392, "step": 1010 }, { "epoch": 0.3329902424966034, "grad_norm": 2.3039515018463135, "learning_rate": 7.654749512676676e-05, "loss": 2.0348, "step": 1011 }, { "epoch": 0.33331960969986413, "grad_norm": 2.667750597000122, "learning_rate": 7.650290248306863e-05, "loss": 1.949, "step": 1012 }, { "epoch": 0.3336489769031249, "grad_norm": 2.9373788833618164, "learning_rate": 7.645828050251665e-05, "loss": 2.591, "step": 1013 }, { "epoch": 0.3339783441063856, "grad_norm": 3.202880620956421, "learning_rate": 7.641362923450424e-05, "loss": 2.5421, "step": 1014 }, { "epoch": 0.3343077113096463, "grad_norm": 2.4246268272399902, "learning_rate": 7.636894872845722e-05, "loss": 2.1465, "step": 1015 }, { "epoch": 0.3346370785129071, "grad_norm": 2.732734203338623, "learning_rate": 7.632423903383374e-05, "loss": 2.1029, "step": 1016 }, { "epoch": 0.3349664457161678, "grad_norm": 3.3760523796081543, "learning_rate": 7.627950020012434e-05, "loss": 2.0353, "step": 1017 }, { "epoch": 0.33529581291942856, "grad_norm": 3.0070247650146484, "learning_rate": 7.623473227685176e-05, "loss": 2.365, "step": 1018 }, { "epoch": 0.33562518012268927, "grad_norm": 2.8173720836639404, "learning_rate": 7.618993531357094e-05, "loss": 2.0214, "step": 1019 }, { "epoch": 0.33595454732595004, "grad_norm": 2.5577118396759033, "learning_rate": 7.614510935986898e-05, "loss": 1.8253, "step": 1020 }, { "epoch": 0.33628391452921075, "grad_norm": 2.8958218097686768, "learning_rate": 7.610025446536509e-05, "loss": 2.2123, "step": 1021 }, { "epoch": 0.3366132817324715, "grad_norm": 3.0846564769744873, "learning_rate": 7.605537067971045e-05, "loss": 2.0833, "step": 1022 }, { "epoch": 0.3369426489357322, "grad_norm": 3.0642452239990234, "learning_rate": 7.601045805258828e-05, "loss": 1.8784, "step": 1023 }, { "epoch": 0.33727201613899294, "grad_norm": 4.532246112823486, "learning_rate": 7.596551663371372e-05, "loss": 2.4327, "step": 1024 }, { "epoch": 0.3376013833422537, "grad_norm": 3.3755955696105957, "learning_rate": 7.592054647283375e-05, "loss": 1.7789, "step": 1025 }, { "epoch": 0.3379307505455144, "grad_norm": 1.943467140197754, "learning_rate": 7.587554761972718e-05, "loss": 2.3184, "step": 1026 }, { "epoch": 0.3382601177487752, "grad_norm": 2.520038604736328, "learning_rate": 7.583052012420461e-05, "loss": 2.2305, "step": 1027 }, { "epoch": 0.3385894849520359, "grad_norm": 2.3161160945892334, "learning_rate": 7.57854640361083e-05, "loss": 2.2228, "step": 1028 }, { "epoch": 0.33891885215529666, "grad_norm": 2.4692294597625732, "learning_rate": 7.574037940531218e-05, "loss": 2.251, "step": 1029 }, { "epoch": 0.33924821935855737, "grad_norm": 2.4752914905548096, "learning_rate": 7.569526628172177e-05, "loss": 2.2454, "step": 1030 }, { "epoch": 0.33957758656181813, "grad_norm": 2.6334524154663086, "learning_rate": 7.565012471527416e-05, "loss": 2.5675, "step": 1031 }, { "epoch": 0.33990695376507885, "grad_norm": 2.3727474212646484, "learning_rate": 7.560495475593785e-05, "loss": 2.2345, "step": 1032 }, { "epoch": 0.34023632096833956, "grad_norm": 2.6379683017730713, "learning_rate": 7.555975645371285e-05, "loss": 2.4907, "step": 1033 }, { "epoch": 0.3405656881716003, "grad_norm": 2.5622193813323975, "learning_rate": 7.55145298586305e-05, "loss": 2.3077, "step": 1034 }, { "epoch": 0.34089505537486103, "grad_norm": 2.888123035430908, "learning_rate": 7.546927502075348e-05, "loss": 2.2664, "step": 1035 }, { "epoch": 0.3412244225781218, "grad_norm": 2.4946439266204834, "learning_rate": 7.542399199017568e-05, "loss": 2.3239, "step": 1036 }, { "epoch": 0.3415537897813825, "grad_norm": 2.6942648887634277, "learning_rate": 7.53786808170223e-05, "loss": 2.501, "step": 1037 }, { "epoch": 0.3418831569846433, "grad_norm": 2.577432632446289, "learning_rate": 7.53333415514496e-05, "loss": 2.3515, "step": 1038 }, { "epoch": 0.342212524187904, "grad_norm": 2.2709529399871826, "learning_rate": 7.528797424364496e-05, "loss": 1.9878, "step": 1039 }, { "epoch": 0.3425418913911647, "grad_norm": 2.2214181423187256, "learning_rate": 7.524257894382681e-05, "loss": 1.8414, "step": 1040 }, { "epoch": 0.34287125859442547, "grad_norm": 2.8622872829437256, "learning_rate": 7.519715570224457e-05, "loss": 1.9751, "step": 1041 }, { "epoch": 0.3432006257976862, "grad_norm": 2.7308106422424316, "learning_rate": 7.515170456917857e-05, "loss": 2.1777, "step": 1042 }, { "epoch": 0.34352999300094694, "grad_norm": 3.008892297744751, "learning_rate": 7.510622559494002e-05, "loss": 2.1712, "step": 1043 }, { "epoch": 0.34385936020420765, "grad_norm": 2.538496732711792, "learning_rate": 7.5060718829871e-05, "loss": 1.5176, "step": 1044 }, { "epoch": 0.3441887274074684, "grad_norm": 2.9815316200256348, "learning_rate": 7.501518432434424e-05, "loss": 1.9269, "step": 1045 }, { "epoch": 0.34451809461072913, "grad_norm": 2.786571741104126, "learning_rate": 7.49696221287633e-05, "loss": 2.0934, "step": 1046 }, { "epoch": 0.3448474618139899, "grad_norm": 2.8320257663726807, "learning_rate": 7.49240322935623e-05, "loss": 1.7772, "step": 1047 }, { "epoch": 0.3451768290172506, "grad_norm": 3.3367862701416016, "learning_rate": 7.487841486920599e-05, "loss": 2.2513, "step": 1048 }, { "epoch": 0.3455061962205113, "grad_norm": 3.6575772762298584, "learning_rate": 7.48327699061897e-05, "loss": 2.2954, "step": 1049 }, { "epoch": 0.3458355634237721, "grad_norm": 3.485363721847534, "learning_rate": 7.478709745503913e-05, "loss": 1.9509, "step": 1050 }, { "epoch": 0.3461649306270328, "grad_norm": 2.551144599914551, "learning_rate": 7.474139756631056e-05, "loss": 2.5046, "step": 1051 }, { "epoch": 0.34649429783029356, "grad_norm": 2.3824918270111084, "learning_rate": 7.46956702905905e-05, "loss": 2.3288, "step": 1052 }, { "epoch": 0.3468236650335543, "grad_norm": 2.3063457012176514, "learning_rate": 7.464991567849586e-05, "loss": 2.2688, "step": 1053 }, { "epoch": 0.34715303223681504, "grad_norm": 2.4683382511138916, "learning_rate": 7.460413378067379e-05, "loss": 2.4154, "step": 1054 }, { "epoch": 0.34748239944007575, "grad_norm": 2.572463035583496, "learning_rate": 7.455832464780162e-05, "loss": 2.4033, "step": 1055 }, { "epoch": 0.3478117666433365, "grad_norm": 2.575000762939453, "learning_rate": 7.451248833058687e-05, "loss": 2.2997, "step": 1056 }, { "epoch": 0.34814113384659723, "grad_norm": 2.226249933242798, "learning_rate": 7.446662487976713e-05, "loss": 2.1283, "step": 1057 }, { "epoch": 0.34847050104985794, "grad_norm": 2.5053114891052246, "learning_rate": 7.442073434610997e-05, "loss": 2.1099, "step": 1058 }, { "epoch": 0.3487998682531187, "grad_norm": 2.71102237701416, "learning_rate": 7.437481678041307e-05, "loss": 2.3652, "step": 1059 }, { "epoch": 0.3491292354563794, "grad_norm": 3.036839723587036, "learning_rate": 7.43288722335039e-05, "loss": 1.9926, "step": 1060 }, { "epoch": 0.3494586026596402, "grad_norm": 2.7319066524505615, "learning_rate": 7.428290075623987e-05, "loss": 2.5434, "step": 1061 }, { "epoch": 0.3497879698629009, "grad_norm": 2.671752691268921, "learning_rate": 7.423690239950818e-05, "loss": 2.1477, "step": 1062 }, { "epoch": 0.35011733706616166, "grad_norm": 3.032055616378784, "learning_rate": 7.419087721422576e-05, "loss": 2.3655, "step": 1063 }, { "epoch": 0.35044670426942237, "grad_norm": 2.8573625087738037, "learning_rate": 7.414482525133928e-05, "loss": 2.6337, "step": 1064 }, { "epoch": 0.3507760714726831, "grad_norm": 2.5970046520233154, "learning_rate": 7.409874656182506e-05, "loss": 1.9232, "step": 1065 }, { "epoch": 0.35110543867594385, "grad_norm": 2.5858652591705322, "learning_rate": 7.405264119668894e-05, "loss": 2.4204, "step": 1066 }, { "epoch": 0.35143480587920456, "grad_norm": 3.2637603282928467, "learning_rate": 7.400650920696633e-05, "loss": 2.6369, "step": 1067 }, { "epoch": 0.3517641730824653, "grad_norm": 2.568941831588745, "learning_rate": 7.396035064372214e-05, "loss": 2.2192, "step": 1068 }, { "epoch": 0.35209354028572604, "grad_norm": 3.0660107135772705, "learning_rate": 7.39141655580506e-05, "loss": 2.1387, "step": 1069 }, { "epoch": 0.3524229074889868, "grad_norm": 3.537260055541992, "learning_rate": 7.386795400107539e-05, "loss": 2.1265, "step": 1070 }, { "epoch": 0.3527522746922475, "grad_norm": 2.7157890796661377, "learning_rate": 7.382171602394948e-05, "loss": 1.7636, "step": 1071 }, { "epoch": 0.3530816418955083, "grad_norm": 3.227700710296631, "learning_rate": 7.377545167785506e-05, "loss": 2.1963, "step": 1072 }, { "epoch": 0.353411009098769, "grad_norm": 2.9469199180603027, "learning_rate": 7.372916101400349e-05, "loss": 1.8702, "step": 1073 }, { "epoch": 0.3537403763020297, "grad_norm": 2.9933621883392334, "learning_rate": 7.368284408363531e-05, "loss": 1.6875, "step": 1074 }, { "epoch": 0.35406974350529047, "grad_norm": 3.143301248550415, "learning_rate": 7.363650093802012e-05, "loss": 1.8475, "step": 1075 }, { "epoch": 0.3543991107085512, "grad_norm": 1.9010746479034424, "learning_rate": 7.35901316284565e-05, "loss": 2.4547, "step": 1076 }, { "epoch": 0.35472847791181195, "grad_norm": 1.9302737712860107, "learning_rate": 7.354373620627205e-05, "loss": 2.2665, "step": 1077 }, { "epoch": 0.35505784511507266, "grad_norm": 2.267845392227173, "learning_rate": 7.349731472282325e-05, "loss": 2.3649, "step": 1078 }, { "epoch": 0.3553872123183334, "grad_norm": 2.5052695274353027, "learning_rate": 7.345086722949539e-05, "loss": 2.3261, "step": 1079 }, { "epoch": 0.35571657952159413, "grad_norm": 2.4212067127227783, "learning_rate": 7.340439377770263e-05, "loss": 2.1668, "step": 1080 }, { "epoch": 0.35604594672485484, "grad_norm": 2.1286873817443848, "learning_rate": 7.335789441888781e-05, "loss": 2.1269, "step": 1081 }, { "epoch": 0.3563753139281156, "grad_norm": 2.6201021671295166, "learning_rate": 7.331136920452244e-05, "loss": 2.498, "step": 1082 }, { "epoch": 0.3567046811313763, "grad_norm": 2.8522377014160156, "learning_rate": 7.326481818610668e-05, "loss": 2.2456, "step": 1083 }, { "epoch": 0.3570340483346371, "grad_norm": 2.749220609664917, "learning_rate": 7.321824141516926e-05, "loss": 2.2296, "step": 1084 }, { "epoch": 0.3573634155378978, "grad_norm": 2.6423492431640625, "learning_rate": 7.317163894326735e-05, "loss": 2.0497, "step": 1085 }, { "epoch": 0.35769278274115857, "grad_norm": 2.9978692531585693, "learning_rate": 7.312501082198666e-05, "loss": 2.4605, "step": 1086 }, { "epoch": 0.3580221499444193, "grad_norm": 2.5553252696990967, "learning_rate": 7.307835710294125e-05, "loss": 1.9969, "step": 1087 }, { "epoch": 0.35835151714768004, "grad_norm": 3.3244998455047607, "learning_rate": 7.303167783777349e-05, "loss": 2.2522, "step": 1088 }, { "epoch": 0.35868088435094075, "grad_norm": 2.7964744567871094, "learning_rate": 7.298497307815406e-05, "loss": 2.2721, "step": 1089 }, { "epoch": 0.35901025155420146, "grad_norm": 2.972792625427246, "learning_rate": 7.293824287578185e-05, "loss": 2.2599, "step": 1090 }, { "epoch": 0.35933961875746223, "grad_norm": 2.953064441680908, "learning_rate": 7.289148728238392e-05, "loss": 2.4102, "step": 1091 }, { "epoch": 0.35966898596072294, "grad_norm": 3.144014835357666, "learning_rate": 7.284470634971544e-05, "loss": 2.4154, "step": 1092 }, { "epoch": 0.3599983531639837, "grad_norm": 2.5227506160736084, "learning_rate": 7.279790012955961e-05, "loss": 1.9305, "step": 1093 }, { "epoch": 0.3603277203672444, "grad_norm": 3.1171419620513916, "learning_rate": 7.275106867372762e-05, "loss": 2.2716, "step": 1094 }, { "epoch": 0.3606570875705052, "grad_norm": 2.6330418586730957, "learning_rate": 7.270421203405863e-05, "loss": 1.8731, "step": 1095 }, { "epoch": 0.3609864547737659, "grad_norm": 2.5206286907196045, "learning_rate": 7.265733026241966e-05, "loss": 1.8349, "step": 1096 }, { "epoch": 0.36131582197702666, "grad_norm": 2.8334505558013916, "learning_rate": 7.261042341070552e-05, "loss": 1.4885, "step": 1097 }, { "epoch": 0.3616451891802874, "grad_norm": 3.205961227416992, "learning_rate": 7.256349153083881e-05, "loss": 1.9328, "step": 1098 }, { "epoch": 0.3619745563835481, "grad_norm": 3.352240562438965, "learning_rate": 7.251653467476983e-05, "loss": 2.0997, "step": 1099 }, { "epoch": 0.36230392358680885, "grad_norm": 3.4709959030151367, "learning_rate": 7.246955289447653e-05, "loss": 1.7248, "step": 1100 }, { "epoch": 0.36263329079006956, "grad_norm": 2.071342945098877, "learning_rate": 7.242254624196443e-05, "loss": 2.2652, "step": 1101 }, { "epoch": 0.36296265799333033, "grad_norm": 2.2652060985565186, "learning_rate": 7.237551476926661e-05, "loss": 2.3592, "step": 1102 }, { "epoch": 0.36329202519659104, "grad_norm": 2.8345634937286377, "learning_rate": 7.232845852844361e-05, "loss": 2.6346, "step": 1103 }, { "epoch": 0.3636213923998518, "grad_norm": 2.3296942710876465, "learning_rate": 7.228137757158338e-05, "loss": 2.0973, "step": 1104 }, { "epoch": 0.3639507596031125, "grad_norm": 2.4769644737243652, "learning_rate": 7.223427195080126e-05, "loss": 2.2648, "step": 1105 }, { "epoch": 0.36428012680637323, "grad_norm": 2.285839080810547, "learning_rate": 7.218714171823984e-05, "loss": 2.1648, "step": 1106 }, { "epoch": 0.364609494009634, "grad_norm": 2.404283046722412, "learning_rate": 7.2139986926069e-05, "loss": 2.3887, "step": 1107 }, { "epoch": 0.3649388612128947, "grad_norm": 2.4443814754486084, "learning_rate": 7.209280762648576e-05, "loss": 2.227, "step": 1108 }, { "epoch": 0.36526822841615547, "grad_norm": 2.4258129596710205, "learning_rate": 7.204560387171432e-05, "loss": 2.2972, "step": 1109 }, { "epoch": 0.3655975956194162, "grad_norm": 2.630659341812134, "learning_rate": 7.199837571400591e-05, "loss": 2.3323, "step": 1110 }, { "epoch": 0.36592696282267695, "grad_norm": 2.332148313522339, "learning_rate": 7.195112320563881e-05, "loss": 2.0955, "step": 1111 }, { "epoch": 0.36625633002593766, "grad_norm": 2.656656503677368, "learning_rate": 7.190384639891822e-05, "loss": 2.016, "step": 1112 }, { "epoch": 0.3665856972291984, "grad_norm": 2.663341760635376, "learning_rate": 7.185654534617623e-05, "loss": 2.3012, "step": 1113 }, { "epoch": 0.36691506443245914, "grad_norm": 2.5622754096984863, "learning_rate": 7.180922009977181e-05, "loss": 1.9749, "step": 1114 }, { "epoch": 0.36724443163571985, "grad_norm": 2.865852117538452, "learning_rate": 7.176187071209069e-05, "loss": 1.648, "step": 1115 }, { "epoch": 0.3675737988389806, "grad_norm": 2.8726089000701904, "learning_rate": 7.171449723554531e-05, "loss": 2.0493, "step": 1116 }, { "epoch": 0.3679031660422413, "grad_norm": 2.720703125, "learning_rate": 7.166709972257478e-05, "loss": 2.3402, "step": 1117 }, { "epoch": 0.3682325332455021, "grad_norm": 3.307445764541626, "learning_rate": 7.161967822564483e-05, "loss": 1.8724, "step": 1118 }, { "epoch": 0.3685619004487628, "grad_norm": 3.013970375061035, "learning_rate": 7.157223279724775e-05, "loss": 2.1087, "step": 1119 }, { "epoch": 0.36889126765202357, "grad_norm": 2.9190618991851807, "learning_rate": 7.152476348990224e-05, "loss": 2.1204, "step": 1120 }, { "epoch": 0.3692206348552843, "grad_norm": 2.860560178756714, "learning_rate": 7.147727035615355e-05, "loss": 2.0234, "step": 1121 }, { "epoch": 0.36955000205854505, "grad_norm": 3.1333353519439697, "learning_rate": 7.142975344857325e-05, "loss": 2.1354, "step": 1122 }, { "epoch": 0.36987936926180576, "grad_norm": 3.1250181198120117, "learning_rate": 7.138221281975919e-05, "loss": 2.2142, "step": 1123 }, { "epoch": 0.37020873646506647, "grad_norm": 3.049356460571289, "learning_rate": 7.133464852233553e-05, "loss": 1.9174, "step": 1124 }, { "epoch": 0.37053810366832723, "grad_norm": 3.432244062423706, "learning_rate": 7.12870606089526e-05, "loss": 1.4808, "step": 1125 }, { "epoch": 0.37086747087158795, "grad_norm": 1.7517294883728027, "learning_rate": 7.123944913228688e-05, "loss": 2.5009, "step": 1126 }, { "epoch": 0.3711968380748487, "grad_norm": 2.627819299697876, "learning_rate": 7.119181414504095e-05, "loss": 2.4364, "step": 1127 }, { "epoch": 0.3715262052781094, "grad_norm": 2.8850955963134766, "learning_rate": 7.11441556999434e-05, "loss": 2.4496, "step": 1128 }, { "epoch": 0.3718555724813702, "grad_norm": 2.320338726043701, "learning_rate": 7.109647384974876e-05, "loss": 1.9873, "step": 1129 }, { "epoch": 0.3721849396846309, "grad_norm": 2.7479662895202637, "learning_rate": 7.104876864723751e-05, "loss": 2.4404, "step": 1130 }, { "epoch": 0.3725143068878916, "grad_norm": 2.618581533432007, "learning_rate": 7.100104014521598e-05, "loss": 2.0846, "step": 1131 }, { "epoch": 0.3728436740911524, "grad_norm": 3.1042213439941406, "learning_rate": 7.095328839651625e-05, "loss": 2.6769, "step": 1132 }, { "epoch": 0.3731730412944131, "grad_norm": 2.402482271194458, "learning_rate": 7.090551345399616e-05, "loss": 2.1385, "step": 1133 }, { "epoch": 0.37350240849767385, "grad_norm": 2.716562271118164, "learning_rate": 7.085771537053923e-05, "loss": 2.2691, "step": 1134 }, { "epoch": 0.37383177570093457, "grad_norm": 2.5292835235595703, "learning_rate": 7.080989419905456e-05, "loss": 2.0973, "step": 1135 }, { "epoch": 0.37416114290419533, "grad_norm": 2.4309535026550293, "learning_rate": 7.076204999247686e-05, "loss": 2.2923, "step": 1136 }, { "epoch": 0.37449051010745604, "grad_norm": 2.3152859210968018, "learning_rate": 7.071418280376629e-05, "loss": 1.898, "step": 1137 }, { "epoch": 0.3748198773107168, "grad_norm": 2.7684056758880615, "learning_rate": 7.06662926859085e-05, "loss": 2.192, "step": 1138 }, { "epoch": 0.3751492445139775, "grad_norm": 2.886624813079834, "learning_rate": 7.061837969191445e-05, "loss": 2.2443, "step": 1139 }, { "epoch": 0.37547861171723823, "grad_norm": 2.6988983154296875, "learning_rate": 7.05704438748205e-05, "loss": 2.0046, "step": 1140 }, { "epoch": 0.375807978920499, "grad_norm": 2.849220037460327, "learning_rate": 7.05224852876882e-05, "loss": 2.2765, "step": 1141 }, { "epoch": 0.3761373461237597, "grad_norm": 2.494020462036133, "learning_rate": 7.047450398360438e-05, "loss": 2.1473, "step": 1142 }, { "epoch": 0.3764667133270205, "grad_norm": 2.90225887298584, "learning_rate": 7.042650001568097e-05, "loss": 2.3484, "step": 1143 }, { "epoch": 0.3767960805302812, "grad_norm": 2.873544692993164, "learning_rate": 7.037847343705496e-05, "loss": 2.0771, "step": 1144 }, { "epoch": 0.37712544773354195, "grad_norm": 2.2210004329681396, "learning_rate": 7.033042430088844e-05, "loss": 1.6013, "step": 1145 }, { "epoch": 0.37745481493680266, "grad_norm": 2.627661943435669, "learning_rate": 7.028235266036841e-05, "loss": 2.0004, "step": 1146 }, { "epoch": 0.37778418214006343, "grad_norm": 2.7000410556793213, "learning_rate": 7.023425856870683e-05, "loss": 1.8592, "step": 1147 }, { "epoch": 0.37811354934332414, "grad_norm": 3.242762327194214, "learning_rate": 7.018614207914047e-05, "loss": 2.1389, "step": 1148 }, { "epoch": 0.37844291654658485, "grad_norm": 3.2626426219940186, "learning_rate": 7.013800324493089e-05, "loss": 2.0661, "step": 1149 }, { "epoch": 0.3787722837498456, "grad_norm": 4.664367198944092, "learning_rate": 7.008984211936446e-05, "loss": 2.0999, "step": 1150 }, { "epoch": 0.37910165095310633, "grad_norm": 1.8425556421279907, "learning_rate": 7.00416587557521e-05, "loss": 2.4092, "step": 1151 }, { "epoch": 0.3794310181563671, "grad_norm": 2.1894047260284424, "learning_rate": 6.999345320742945e-05, "loss": 2.4542, "step": 1152 }, { "epoch": 0.3797603853596278, "grad_norm": 2.095588207244873, "learning_rate": 6.994522552775666e-05, "loss": 2.1205, "step": 1153 }, { "epoch": 0.38008975256288857, "grad_norm": 2.427074909210205, "learning_rate": 6.98969757701184e-05, "loss": 2.2412, "step": 1154 }, { "epoch": 0.3804191197661493, "grad_norm": 2.322599411010742, "learning_rate": 6.984870398792374e-05, "loss": 2.0839, "step": 1155 }, { "epoch": 0.38074848696941, "grad_norm": 2.316253900527954, "learning_rate": 6.980041023460619e-05, "loss": 2.2265, "step": 1156 }, { "epoch": 0.38107785417267076, "grad_norm": 2.33427095413208, "learning_rate": 6.975209456362353e-05, "loss": 2.3765, "step": 1157 }, { "epoch": 0.38140722137593147, "grad_norm": 2.23207426071167, "learning_rate": 6.97037570284578e-05, "loss": 1.9018, "step": 1158 }, { "epoch": 0.38173658857919224, "grad_norm": 2.3566267490386963, "learning_rate": 6.965539768261531e-05, "loss": 2.1916, "step": 1159 }, { "epoch": 0.38206595578245295, "grad_norm": 2.377549409866333, "learning_rate": 6.960701657962641e-05, "loss": 2.4266, "step": 1160 }, { "epoch": 0.3823953229857137, "grad_norm": 2.494011163711548, "learning_rate": 6.955861377304564e-05, "loss": 2.1707, "step": 1161 }, { "epoch": 0.3827246901889744, "grad_norm": 2.6809515953063965, "learning_rate": 6.951018931645146e-05, "loss": 2.3089, "step": 1162 }, { "epoch": 0.3830540573922352, "grad_norm": 2.6300032138824463, "learning_rate": 6.946174326344637e-05, "loss": 2.1554, "step": 1163 }, { "epoch": 0.3833834245954959, "grad_norm": 2.4809701442718506, "learning_rate": 6.941327566765675e-05, "loss": 2.1591, "step": 1164 }, { "epoch": 0.3837127917987566, "grad_norm": 2.6750636100769043, "learning_rate": 6.936478658273285e-05, "loss": 2.3151, "step": 1165 }, { "epoch": 0.3840421590020174, "grad_norm": 2.421334981918335, "learning_rate": 6.931627606234865e-05, "loss": 1.9933, "step": 1166 }, { "epoch": 0.3843715262052781, "grad_norm": 2.758087158203125, "learning_rate": 6.92677441602019e-05, "loss": 2.2134, "step": 1167 }, { "epoch": 0.38470089340853886, "grad_norm": 3.3581409454345703, "learning_rate": 6.921919093001402e-05, "loss": 2.3097, "step": 1168 }, { "epoch": 0.38503026061179957, "grad_norm": 3.3593056201934814, "learning_rate": 6.917061642553005e-05, "loss": 1.9195, "step": 1169 }, { "epoch": 0.38535962781506033, "grad_norm": 2.783876895904541, "learning_rate": 6.91220207005185e-05, "loss": 1.8634, "step": 1170 }, { "epoch": 0.38568899501832105, "grad_norm": 2.8132247924804688, "learning_rate": 6.907340380877149e-05, "loss": 2.1477, "step": 1171 }, { "epoch": 0.3860183622215818, "grad_norm": 2.878632068634033, "learning_rate": 6.902476580410449e-05, "loss": 2.254, "step": 1172 }, { "epoch": 0.3863477294248425, "grad_norm": 3.4279277324676514, "learning_rate": 6.897610674035634e-05, "loss": 1.95, "step": 1173 }, { "epoch": 0.38667709662810323, "grad_norm": 3.4778835773468018, "learning_rate": 6.892742667138923e-05, "loss": 1.8198, "step": 1174 }, { "epoch": 0.387006463831364, "grad_norm": 3.2819323539733887, "learning_rate": 6.887872565108859e-05, "loss": 1.8538, "step": 1175 }, { "epoch": 0.3873358310346247, "grad_norm": 1.8319649696350098, "learning_rate": 6.883000373336299e-05, "loss": 2.4531, "step": 1176 }, { "epoch": 0.3876651982378855, "grad_norm": 2.054316520690918, "learning_rate": 6.878126097214421e-05, "loss": 1.9968, "step": 1177 }, { "epoch": 0.3879945654411462, "grad_norm": 2.3420443534851074, "learning_rate": 6.873249742138709e-05, "loss": 2.2451, "step": 1178 }, { "epoch": 0.38832393264440696, "grad_norm": 1.8880903720855713, "learning_rate": 6.868371313506941e-05, "loss": 2.3786, "step": 1179 }, { "epoch": 0.38865329984766767, "grad_norm": 2.393928289413452, "learning_rate": 6.863490816719196e-05, "loss": 2.1902, "step": 1180 }, { "epoch": 0.3889826670509284, "grad_norm": 2.3493289947509766, "learning_rate": 6.858608257177846e-05, "loss": 2.053, "step": 1181 }, { "epoch": 0.38931203425418914, "grad_norm": 2.4995625019073486, "learning_rate": 6.853723640287535e-05, "loss": 2.3704, "step": 1182 }, { "epoch": 0.38964140145744985, "grad_norm": 2.5445573329925537, "learning_rate": 6.848836971455197e-05, "loss": 1.9575, "step": 1183 }, { "epoch": 0.3899707686607106, "grad_norm": 2.5418543815612793, "learning_rate": 6.84394825609003e-05, "loss": 2.4032, "step": 1184 }, { "epoch": 0.39030013586397133, "grad_norm": 3.205739736557007, "learning_rate": 6.839057499603497e-05, "loss": 2.6626, "step": 1185 }, { "epoch": 0.3906295030672321, "grad_norm": 2.529710531234741, "learning_rate": 6.834164707409326e-05, "loss": 2.1196, "step": 1186 }, { "epoch": 0.3909588702704928, "grad_norm": 2.9174511432647705, "learning_rate": 6.829269884923491e-05, "loss": 2.4384, "step": 1187 }, { "epoch": 0.3912882374737536, "grad_norm": 2.5731706619262695, "learning_rate": 6.82437303756422e-05, "loss": 2.21, "step": 1188 }, { "epoch": 0.3916176046770143, "grad_norm": 3.202188014984131, "learning_rate": 6.819474170751978e-05, "loss": 2.5139, "step": 1189 }, { "epoch": 0.391946971880275, "grad_norm": 2.6559245586395264, "learning_rate": 6.814573289909466e-05, "loss": 2.3044, "step": 1190 }, { "epoch": 0.39227633908353576, "grad_norm": 3.1062824726104736, "learning_rate": 6.809670400461618e-05, "loss": 2.0021, "step": 1191 }, { "epoch": 0.3926057062867965, "grad_norm": 2.513209342956543, "learning_rate": 6.804765507835587e-05, "loss": 1.8991, "step": 1192 }, { "epoch": 0.39293507349005724, "grad_norm": 2.8009233474731445, "learning_rate": 6.799858617460744e-05, "loss": 2.3419, "step": 1193 }, { "epoch": 0.39326444069331795, "grad_norm": 2.8235421180725098, "learning_rate": 6.794949734768674e-05, "loss": 2.2565, "step": 1194 }, { "epoch": 0.3935938078965787, "grad_norm": 2.455665111541748, "learning_rate": 6.790038865193167e-05, "loss": 1.9415, "step": 1195 }, { "epoch": 0.39392317509983943, "grad_norm": 2.5987555980682373, "learning_rate": 6.785126014170207e-05, "loss": 1.8636, "step": 1196 }, { "epoch": 0.3942525423031002, "grad_norm": 2.7285478115081787, "learning_rate": 6.780211187137981e-05, "loss": 2.0818, "step": 1197 }, { "epoch": 0.3945819095063609, "grad_norm": 3.124101161956787, "learning_rate": 6.775294389536852e-05, "loss": 2.1855, "step": 1198 }, { "epoch": 0.3949112767096216, "grad_norm": 2.466757297515869, "learning_rate": 6.770375626809373e-05, "loss": 1.6696, "step": 1199 }, { "epoch": 0.3952406439128824, "grad_norm": 3.0805959701538086, "learning_rate": 6.76545490440027e-05, "loss": 2.1575, "step": 1200 }, { "epoch": 0.3955700111161431, "grad_norm": 2.310868978500366, "learning_rate": 6.760532227756435e-05, "loss": 2.3745, "step": 1201 }, { "epoch": 0.39589937831940386, "grad_norm": 2.0340662002563477, "learning_rate": 6.755607602326928e-05, "loss": 2.2478, "step": 1202 }, { "epoch": 0.39622874552266457, "grad_norm": 2.500797748565674, "learning_rate": 6.750681033562964e-05, "loss": 2.593, "step": 1203 }, { "epoch": 0.39655811272592534, "grad_norm": 2.07173228263855, "learning_rate": 6.745752526917907e-05, "loss": 2.3154, "step": 1204 }, { "epoch": 0.39688747992918605, "grad_norm": 2.0684971809387207, "learning_rate": 6.74082208784727e-05, "loss": 2.0726, "step": 1205 }, { "epoch": 0.39721684713244676, "grad_norm": 2.2718148231506348, "learning_rate": 6.735889721808703e-05, "loss": 2.2801, "step": 1206 }, { "epoch": 0.3975462143357075, "grad_norm": 2.2261784076690674, "learning_rate": 6.730955434261986e-05, "loss": 2.1202, "step": 1207 }, { "epoch": 0.39787558153896824, "grad_norm": 2.9758946895599365, "learning_rate": 6.726019230669034e-05, "loss": 2.4945, "step": 1208 }, { "epoch": 0.398204948742229, "grad_norm": 2.5161709785461426, "learning_rate": 6.721081116493874e-05, "loss": 2.1904, "step": 1209 }, { "epoch": 0.3985343159454897, "grad_norm": 2.922379493713379, "learning_rate": 6.716141097202657e-05, "loss": 2.5583, "step": 1210 }, { "epoch": 0.3988636831487505, "grad_norm": 2.4563610553741455, "learning_rate": 6.711199178263632e-05, "loss": 2.355, "step": 1211 }, { "epoch": 0.3991930503520112, "grad_norm": 3.078686237335205, "learning_rate": 6.706255365147161e-05, "loss": 2.604, "step": 1212 }, { "epoch": 0.39952241755527196, "grad_norm": 2.5945472717285156, "learning_rate": 6.701309663325696e-05, "loss": 2.3826, "step": 1213 }, { "epoch": 0.39985178475853267, "grad_norm": 2.8002431392669678, "learning_rate": 6.696362078273781e-05, "loss": 2.1889, "step": 1214 }, { "epoch": 0.4001811519617934, "grad_norm": 2.4396965503692627, "learning_rate": 6.69141261546805e-05, "loss": 1.9122, "step": 1215 }, { "epoch": 0.40051051916505415, "grad_norm": 2.8927268981933594, "learning_rate": 6.686461280387203e-05, "loss": 2.2138, "step": 1216 }, { "epoch": 0.40083988636831486, "grad_norm": 2.633314847946167, "learning_rate": 6.681508078512031e-05, "loss": 1.7317, "step": 1217 }, { "epoch": 0.4011692535715756, "grad_norm": 3.017258882522583, "learning_rate": 6.676553015325372e-05, "loss": 2.1284, "step": 1218 }, { "epoch": 0.40149862077483633, "grad_norm": 2.7072062492370605, "learning_rate": 6.67159609631214e-05, "loss": 1.8943, "step": 1219 }, { "epoch": 0.4018279879780971, "grad_norm": 2.961946487426758, "learning_rate": 6.666637326959293e-05, "loss": 2.2615, "step": 1220 }, { "epoch": 0.4021573551813578, "grad_norm": 2.648956775665283, "learning_rate": 6.661676712755842e-05, "loss": 1.9888, "step": 1221 }, { "epoch": 0.4024867223846186, "grad_norm": 2.841571092605591, "learning_rate": 6.656714259192839e-05, "loss": 2.2434, "step": 1222 }, { "epoch": 0.4028160895878793, "grad_norm": 2.6502773761749268, "learning_rate": 6.65174997176337e-05, "loss": 1.9564, "step": 1223 }, { "epoch": 0.40314545679114, "grad_norm": 2.589456796646118, "learning_rate": 6.646783855962555e-05, "loss": 1.664, "step": 1224 }, { "epoch": 0.40347482399440077, "grad_norm": 2.849358320236206, "learning_rate": 6.641815917287535e-05, "loss": 1.6624, "step": 1225 }, { "epoch": 0.4038041911976615, "grad_norm": 2.1147572994232178, "learning_rate": 6.63684616123747e-05, "loss": 2.2939, "step": 1226 }, { "epoch": 0.40413355840092224, "grad_norm": 2.6644229888916016, "learning_rate": 6.631874593313531e-05, "loss": 2.2582, "step": 1227 }, { "epoch": 0.40446292560418295, "grad_norm": 2.094411849975586, "learning_rate": 6.626901219018895e-05, "loss": 2.3132, "step": 1228 }, { "epoch": 0.4047922928074437, "grad_norm": 2.631877899169922, "learning_rate": 6.621926043858739e-05, "loss": 2.1575, "step": 1229 }, { "epoch": 0.40512166001070443, "grad_norm": 2.610651731491089, "learning_rate": 6.616949073340232e-05, "loss": 2.3285, "step": 1230 }, { "epoch": 0.40545102721396514, "grad_norm": 2.4871439933776855, "learning_rate": 6.611970312972531e-05, "loss": 2.0776, "step": 1231 }, { "epoch": 0.4057803944172259, "grad_norm": 2.7471463680267334, "learning_rate": 6.606989768266776e-05, "loss": 2.5548, "step": 1232 }, { "epoch": 0.4061097616204866, "grad_norm": 2.746669292449951, "learning_rate": 6.602007444736077e-05, "loss": 2.2351, "step": 1233 }, { "epoch": 0.4064391288237474, "grad_norm": 2.907914161682129, "learning_rate": 6.597023347895524e-05, "loss": 2.7394, "step": 1234 }, { "epoch": 0.4067684960270081, "grad_norm": 2.477755308151245, "learning_rate": 6.592037483262156e-05, "loss": 2.0522, "step": 1235 }, { "epoch": 0.40709786323026886, "grad_norm": 2.537019968032837, "learning_rate": 6.587049856354977e-05, "loss": 2.2179, "step": 1236 }, { "epoch": 0.4074272304335296, "grad_norm": 2.399907350540161, "learning_rate": 6.582060472694939e-05, "loss": 2.2994, "step": 1237 }, { "epoch": 0.40775659763679034, "grad_norm": 2.9592201709747314, "learning_rate": 6.577069337804944e-05, "loss": 2.246, "step": 1238 }, { "epoch": 0.40808596484005105, "grad_norm": 2.831124782562256, "learning_rate": 6.572076457209822e-05, "loss": 2.0093, "step": 1239 }, { "epoch": 0.40841533204331176, "grad_norm": 2.620274066925049, "learning_rate": 6.567081836436346e-05, "loss": 1.8906, "step": 1240 }, { "epoch": 0.40874469924657253, "grad_norm": 2.5152428150177, "learning_rate": 6.562085481013211e-05, "loss": 2.2982, "step": 1241 }, { "epoch": 0.40907406644983324, "grad_norm": 3.2875349521636963, "learning_rate": 6.55708739647103e-05, "loss": 2.4582, "step": 1242 }, { "epoch": 0.409403433653094, "grad_norm": 2.650205612182617, "learning_rate": 6.552087588342332e-05, "loss": 2.1196, "step": 1243 }, { "epoch": 0.4097328008563547, "grad_norm": 2.7873198986053467, "learning_rate": 6.547086062161555e-05, "loss": 1.9238, "step": 1244 }, { "epoch": 0.4100621680596155, "grad_norm": 2.823976993560791, "learning_rate": 6.542082823465037e-05, "loss": 1.9929, "step": 1245 }, { "epoch": 0.4103915352628762, "grad_norm": 3.1395912170410156, "learning_rate": 6.537077877791011e-05, "loss": 2.3549, "step": 1246 }, { "epoch": 0.41072090246613696, "grad_norm": 2.8609724044799805, "learning_rate": 6.532071230679604e-05, "loss": 2.1222, "step": 1247 }, { "epoch": 0.41105026966939767, "grad_norm": 2.8728225231170654, "learning_rate": 6.527062887672819e-05, "loss": 2.1451, "step": 1248 }, { "epoch": 0.4113796368726584, "grad_norm": 3.3821842670440674, "learning_rate": 6.522052854314544e-05, "loss": 2.3055, "step": 1249 }, { "epoch": 0.41170900407591915, "grad_norm": 3.682586431503296, "learning_rate": 6.517041136150534e-05, "loss": 1.9983, "step": 1250 }, { "epoch": 0.41203837127917986, "grad_norm": 2.3784186840057373, "learning_rate": 6.512027738728407e-05, "loss": 2.4458, "step": 1251 }, { "epoch": 0.4123677384824406, "grad_norm": 2.384232997894287, "learning_rate": 6.507012667597643e-05, "loss": 2.5749, "step": 1252 }, { "epoch": 0.41269710568570134, "grad_norm": 2.3909189701080322, "learning_rate": 6.501995928309577e-05, "loss": 2.3923, "step": 1253 }, { "epoch": 0.4130264728889621, "grad_norm": 2.478942394256592, "learning_rate": 6.496977526417383e-05, "loss": 2.5673, "step": 1254 }, { "epoch": 0.4133558400922228, "grad_norm": 2.6806704998016357, "learning_rate": 6.491957467476081e-05, "loss": 2.1562, "step": 1255 }, { "epoch": 0.4136852072954835, "grad_norm": 2.777129888534546, "learning_rate": 6.486935757042529e-05, "loss": 2.3145, "step": 1256 }, { "epoch": 0.4140145744987443, "grad_norm": 2.46478533744812, "learning_rate": 6.481912400675402e-05, "loss": 2.1051, "step": 1257 }, { "epoch": 0.414343941702005, "grad_norm": 2.9172468185424805, "learning_rate": 6.476887403935204e-05, "loss": 2.32, "step": 1258 }, { "epoch": 0.41467330890526577, "grad_norm": 2.7249600887298584, "learning_rate": 6.471860772384256e-05, "loss": 2.2816, "step": 1259 }, { "epoch": 0.4150026761085265, "grad_norm": 2.6038973331451416, "learning_rate": 6.466832511586687e-05, "loss": 2.1756, "step": 1260 }, { "epoch": 0.41533204331178725, "grad_norm": 2.6487905979156494, "learning_rate": 6.461802627108426e-05, "loss": 2.6021, "step": 1261 }, { "epoch": 0.41566141051504796, "grad_norm": 2.545063018798828, "learning_rate": 6.456771124517205e-05, "loss": 2.0434, "step": 1262 }, { "epoch": 0.4159907777183087, "grad_norm": 2.7668240070343018, "learning_rate": 6.451738009382542e-05, "loss": 2.3299, "step": 1263 }, { "epoch": 0.41632014492156943, "grad_norm": 2.4981627464294434, "learning_rate": 6.446703287275745e-05, "loss": 1.9392, "step": 1264 }, { "epoch": 0.41664951212483015, "grad_norm": 2.7257003784179688, "learning_rate": 6.441666963769897e-05, "loss": 2.0421, "step": 1265 }, { "epoch": 0.4169788793280909, "grad_norm": 2.8627986907958984, "learning_rate": 6.436629044439854e-05, "loss": 2.3554, "step": 1266 }, { "epoch": 0.4173082465313516, "grad_norm": 2.81748628616333, "learning_rate": 6.43158953486224e-05, "loss": 1.9084, "step": 1267 }, { "epoch": 0.4176376137346124, "grad_norm": 2.941399335861206, "learning_rate": 6.426548440615438e-05, "loss": 2.0619, "step": 1268 }, { "epoch": 0.4179669809378731, "grad_norm": 3.05411434173584, "learning_rate": 6.421505767279588e-05, "loss": 2.1444, "step": 1269 }, { "epoch": 0.41829634814113387, "grad_norm": 2.318077564239502, "learning_rate": 6.416461520436571e-05, "loss": 1.7884, "step": 1270 }, { "epoch": 0.4186257153443946, "grad_norm": 2.866882562637329, "learning_rate": 6.411415705670021e-05, "loss": 2.0699, "step": 1271 }, { "epoch": 0.41895508254765534, "grad_norm": 2.7729244232177734, "learning_rate": 6.406368328565295e-05, "loss": 2.2929, "step": 1272 }, { "epoch": 0.41928444975091606, "grad_norm": 3.3135483264923096, "learning_rate": 6.401319394709489e-05, "loss": 2.4658, "step": 1273 }, { "epoch": 0.41961381695417677, "grad_norm": 3.1503915786743164, "learning_rate": 6.396268909691414e-05, "loss": 2.117, "step": 1274 }, { "epoch": 0.41994318415743753, "grad_norm": 2.9462268352508545, "learning_rate": 6.391216879101608e-05, "loss": 2.0571, "step": 1275 }, { "epoch": 0.42027255136069824, "grad_norm": 2.186408042907715, "learning_rate": 6.386163308532314e-05, "loss": 2.5956, "step": 1276 }, { "epoch": 0.420601918563959, "grad_norm": 2.4059898853302, "learning_rate": 6.381108203577476e-05, "loss": 2.4601, "step": 1277 }, { "epoch": 0.4209312857672197, "grad_norm": 2.4408154487609863, "learning_rate": 6.376051569832742e-05, "loss": 2.537, "step": 1278 }, { "epoch": 0.4212606529704805, "grad_norm": 2.3048956394195557, "learning_rate": 6.370993412895454e-05, "loss": 2.4092, "step": 1279 }, { "epoch": 0.4215900201737412, "grad_norm": 2.4109179973602295, "learning_rate": 6.365933738364634e-05, "loss": 2.2523, "step": 1280 }, { "epoch": 0.4219193873770019, "grad_norm": 2.6136887073516846, "learning_rate": 6.360872551840988e-05, "loss": 2.0975, "step": 1281 }, { "epoch": 0.4222487545802627, "grad_norm": 2.697741985321045, "learning_rate": 6.355809858926893e-05, "loss": 2.5818, "step": 1282 }, { "epoch": 0.4225781217835234, "grad_norm": 2.3792057037353516, "learning_rate": 6.350745665226396e-05, "loss": 2.1916, "step": 1283 }, { "epoch": 0.42290748898678415, "grad_norm": 2.742786169052124, "learning_rate": 6.345679976345205e-05, "loss": 2.5287, "step": 1284 }, { "epoch": 0.42323685619004486, "grad_norm": 2.550384998321533, "learning_rate": 6.34061279789068e-05, "loss": 2.3782, "step": 1285 }, { "epoch": 0.42356622339330563, "grad_norm": 3.0929136276245117, "learning_rate": 6.335544135471834e-05, "loss": 2.3501, "step": 1286 }, { "epoch": 0.42389559059656634, "grad_norm": 2.920880079269409, "learning_rate": 6.330473994699318e-05, "loss": 2.3128, "step": 1287 }, { "epoch": 0.4242249577998271, "grad_norm": 2.258695125579834, "learning_rate": 6.325402381185426e-05, "loss": 2.0221, "step": 1288 }, { "epoch": 0.4245543250030878, "grad_norm": 2.6346912384033203, "learning_rate": 6.320329300544076e-05, "loss": 2.2641, "step": 1289 }, { "epoch": 0.42488369220634853, "grad_norm": 3.050056219100952, "learning_rate": 6.315254758390814e-05, "loss": 2.4125, "step": 1290 }, { "epoch": 0.4252130594096093, "grad_norm": 2.530167818069458, "learning_rate": 6.3101787603428e-05, "loss": 2.0573, "step": 1291 }, { "epoch": 0.42554242661287, "grad_norm": 3.0110995769500732, "learning_rate": 6.305101312018809e-05, "loss": 2.6722, "step": 1292 }, { "epoch": 0.4258717938161308, "grad_norm": 2.6352243423461914, "learning_rate": 6.300022419039219e-05, "loss": 2.0293, "step": 1293 }, { "epoch": 0.4262011610193915, "grad_norm": 2.864758253097534, "learning_rate": 6.294942087026011e-05, "loss": 1.9841, "step": 1294 }, { "epoch": 0.42653052822265225, "grad_norm": 3.216989517211914, "learning_rate": 6.289860321602754e-05, "loss": 2.005, "step": 1295 }, { "epoch": 0.42685989542591296, "grad_norm": 3.134946584701538, "learning_rate": 6.284777128394603e-05, "loss": 2.3214, "step": 1296 }, { "epoch": 0.4271892626291737, "grad_norm": 3.151312828063965, "learning_rate": 6.279692513028304e-05, "loss": 2.2082, "step": 1297 }, { "epoch": 0.42751862983243444, "grad_norm": 2.858823776245117, "learning_rate": 6.274606481132163e-05, "loss": 2.0439, "step": 1298 }, { "epoch": 0.42784799703569515, "grad_norm": 2.6305992603302, "learning_rate": 6.269519038336062e-05, "loss": 1.7326, "step": 1299 }, { "epoch": 0.4281773642389559, "grad_norm": 4.182598114013672, "learning_rate": 6.264430190271444e-05, "loss": 1.8915, "step": 1300 }, { "epoch": 0.4285067314422166, "grad_norm": 2.144503116607666, "learning_rate": 6.259339942571307e-05, "loss": 2.3162, "step": 1301 }, { "epoch": 0.4288360986454774, "grad_norm": 2.138836622238159, "learning_rate": 6.254248300870198e-05, "loss": 2.3545, "step": 1302 }, { "epoch": 0.4291654658487381, "grad_norm": 2.6747050285339355, "learning_rate": 6.249155270804206e-05, "loss": 2.6766, "step": 1303 }, { "epoch": 0.42949483305199887, "grad_norm": 2.2941064834594727, "learning_rate": 6.24406085801096e-05, "loss": 2.6815, "step": 1304 }, { "epoch": 0.4298242002552596, "grad_norm": 2.450798988342285, "learning_rate": 6.238965068129616e-05, "loss": 2.3887, "step": 1305 }, { "epoch": 0.4301535674585203, "grad_norm": 2.7236177921295166, "learning_rate": 6.233867906800856e-05, "loss": 2.6746, "step": 1306 }, { "epoch": 0.43048293466178106, "grad_norm": 2.221660852432251, "learning_rate": 6.22876937966688e-05, "loss": 2.4207, "step": 1307 }, { "epoch": 0.43081230186504177, "grad_norm": 2.9190986156463623, "learning_rate": 6.2236694923714e-05, "loss": 2.5129, "step": 1308 }, { "epoch": 0.43114166906830254, "grad_norm": 2.183797597885132, "learning_rate": 6.218568250559634e-05, "loss": 2.209, "step": 1309 }, { "epoch": 0.43147103627156325, "grad_norm": 2.7680306434631348, "learning_rate": 6.2134656598783e-05, "loss": 1.9334, "step": 1310 }, { "epoch": 0.431800403474824, "grad_norm": 2.6052422523498535, "learning_rate": 6.208361725975605e-05, "loss": 2.2873, "step": 1311 }, { "epoch": 0.4321297706780847, "grad_norm": 2.873652696609497, "learning_rate": 6.203256454501248e-05, "loss": 2.5024, "step": 1312 }, { "epoch": 0.4324591378813455, "grad_norm": 3.202440023422241, "learning_rate": 6.198149851106407e-05, "loss": 2.2169, "step": 1313 }, { "epoch": 0.4327885050846062, "grad_norm": 2.7076804637908936, "learning_rate": 6.19304192144373e-05, "loss": 2.0037, "step": 1314 }, { "epoch": 0.4331178722878669, "grad_norm": 3.2877442836761475, "learning_rate": 6.187932671167342e-05, "loss": 2.6063, "step": 1315 }, { "epoch": 0.4334472394911277, "grad_norm": 2.6223089694976807, "learning_rate": 6.18282210593282e-05, "loss": 1.959, "step": 1316 }, { "epoch": 0.4337766066943884, "grad_norm": 3.001115083694458, "learning_rate": 6.177710231397203e-05, "loss": 2.1598, "step": 1317 }, { "epoch": 0.43410597389764916, "grad_norm": 2.4432201385498047, "learning_rate": 6.172597053218978e-05, "loss": 1.7752, "step": 1318 }, { "epoch": 0.43443534110090987, "grad_norm": 2.4193572998046875, "learning_rate": 6.167482577058075e-05, "loss": 2.0442, "step": 1319 }, { "epoch": 0.43476470830417063, "grad_norm": 2.7738735675811768, "learning_rate": 6.162366808575857e-05, "loss": 2.1012, "step": 1320 }, { "epoch": 0.43509407550743134, "grad_norm": 2.398259401321411, "learning_rate": 6.157249753435124e-05, "loss": 1.641, "step": 1321 }, { "epoch": 0.4354234427106921, "grad_norm": 2.803773880004883, "learning_rate": 6.152131417300098e-05, "loss": 2.1607, "step": 1322 }, { "epoch": 0.4357528099139528, "grad_norm": 3.041407346725464, "learning_rate": 6.147011805836414e-05, "loss": 1.8865, "step": 1323 }, { "epoch": 0.43608217711721353, "grad_norm": 4.000000476837158, "learning_rate": 6.141890924711126e-05, "loss": 1.8523, "step": 1324 }, { "epoch": 0.4364115443204743, "grad_norm": 3.650296688079834, "learning_rate": 6.136768779592691e-05, "loss": 2.2463, "step": 1325 }, { "epoch": 0.436740911523735, "grad_norm": 2.288025140762329, "learning_rate": 6.13164537615096e-05, "loss": 2.5091, "step": 1326 }, { "epoch": 0.4370702787269958, "grad_norm": 2.0283560752868652, "learning_rate": 6.126520720057186e-05, "loss": 2.2552, "step": 1327 }, { "epoch": 0.4373996459302565, "grad_norm": 2.147888422012329, "learning_rate": 6.121394816984e-05, "loss": 2.3754, "step": 1328 }, { "epoch": 0.43772901313351725, "grad_norm": 2.3866569995880127, "learning_rate": 6.11626767260542e-05, "loss": 2.2485, "step": 1329 }, { "epoch": 0.43805838033677796, "grad_norm": 2.4724600315093994, "learning_rate": 6.111139292596834e-05, "loss": 2.3486, "step": 1330 }, { "epoch": 0.4383877475400387, "grad_norm": 2.1698498725891113, "learning_rate": 6.106009682634997e-05, "loss": 2.2673, "step": 1331 }, { "epoch": 0.43871711474329944, "grad_norm": 2.1860690116882324, "learning_rate": 6.100878848398032e-05, "loss": 2.1531, "step": 1332 }, { "epoch": 0.43904648194656015, "grad_norm": 2.3432462215423584, "learning_rate": 6.095746795565408e-05, "loss": 2.3907, "step": 1333 }, { "epoch": 0.4393758491498209, "grad_norm": 3.00425386428833, "learning_rate": 6.090613529817949e-05, "loss": 2.5884, "step": 1334 }, { "epoch": 0.43970521635308163, "grad_norm": 3.2894668579101562, "learning_rate": 6.085479056837821e-05, "loss": 2.2589, "step": 1335 }, { "epoch": 0.4400345835563424, "grad_norm": 2.9489943981170654, "learning_rate": 6.0803433823085244e-05, "loss": 2.2962, "step": 1336 }, { "epoch": 0.4403639507596031, "grad_norm": 3.29754638671875, "learning_rate": 6.075206511914891e-05, "loss": 2.4655, "step": 1337 }, { "epoch": 0.4406933179628639, "grad_norm": 2.559438705444336, "learning_rate": 6.070068451343074e-05, "loss": 2.2637, "step": 1338 }, { "epoch": 0.4410226851661246, "grad_norm": 3.065319061279297, "learning_rate": 6.0649292062805494e-05, "loss": 2.1736, "step": 1339 }, { "epoch": 0.4413520523693853, "grad_norm": 2.943509101867676, "learning_rate": 6.059788782416099e-05, "loss": 2.0683, "step": 1340 }, { "epoch": 0.44168141957264606, "grad_norm": 2.7584707736968994, "learning_rate": 6.054647185439814e-05, "loss": 2.3206, "step": 1341 }, { "epoch": 0.44201078677590677, "grad_norm": 2.789400577545166, "learning_rate": 6.049504421043078e-05, "loss": 1.8737, "step": 1342 }, { "epoch": 0.44234015397916754, "grad_norm": 2.9476754665374756, "learning_rate": 6.0443604949185706e-05, "loss": 2.2538, "step": 1343 }, { "epoch": 0.44266952118242825, "grad_norm": 2.6708343029022217, "learning_rate": 6.0392154127602595e-05, "loss": 2.156, "step": 1344 }, { "epoch": 0.442998888385689, "grad_norm": 2.7900948524475098, "learning_rate": 6.0340691802633884e-05, "loss": 1.8873, "step": 1345 }, { "epoch": 0.4433282555889497, "grad_norm": 2.6233713626861572, "learning_rate": 6.028921803124476e-05, "loss": 1.7659, "step": 1346 }, { "epoch": 0.4436576227922105, "grad_norm": 2.7721614837646484, "learning_rate": 6.023773287041308e-05, "loss": 2.0038, "step": 1347 }, { "epoch": 0.4439869899954712, "grad_norm": 2.7499945163726807, "learning_rate": 6.01862363771293e-05, "loss": 2.1821, "step": 1348 }, { "epoch": 0.4443163571987319, "grad_norm": 2.655479907989502, "learning_rate": 6.013472860839642e-05, "loss": 1.9229, "step": 1349 }, { "epoch": 0.4446457244019927, "grad_norm": 2.927269458770752, "learning_rate": 6.008320962122994e-05, "loss": 1.8664, "step": 1350 }, { "epoch": 0.4449750916052534, "grad_norm": 1.9483531713485718, "learning_rate": 6.003167947265777e-05, "loss": 2.5558, "step": 1351 }, { "epoch": 0.44530445880851416, "grad_norm": 2.5765058994293213, "learning_rate": 5.9980138219720125e-05, "loss": 2.6047, "step": 1352 }, { "epoch": 0.44563382601177487, "grad_norm": 2.483147382736206, "learning_rate": 5.992858591946961e-05, "loss": 2.324, "step": 1353 }, { "epoch": 0.44596319321503564, "grad_norm": 2.8535609245300293, "learning_rate": 5.987702262897098e-05, "loss": 2.395, "step": 1354 }, { "epoch": 0.44629256041829635, "grad_norm": 2.455509662628174, "learning_rate": 5.9825448405301175e-05, "loss": 2.3842, "step": 1355 }, { "epoch": 0.44662192762155706, "grad_norm": 2.347088575363159, "learning_rate": 5.977386330554926e-05, "loss": 2.2313, "step": 1356 }, { "epoch": 0.4469512948248178, "grad_norm": 2.4735326766967773, "learning_rate": 5.9722267386816324e-05, "loss": 2.1965, "step": 1357 }, { "epoch": 0.44728066202807854, "grad_norm": 2.5871574878692627, "learning_rate": 5.967066070621541e-05, "loss": 2.2823, "step": 1358 }, { "epoch": 0.4476100292313393, "grad_norm": 2.4709925651550293, "learning_rate": 5.9619043320871494e-05, "loss": 2.2467, "step": 1359 }, { "epoch": 0.4479393964346, "grad_norm": 2.6086020469665527, "learning_rate": 5.956741528792142e-05, "loss": 2.2654, "step": 1360 }, { "epoch": 0.4482687636378608, "grad_norm": 2.7858760356903076, "learning_rate": 5.951577666451379e-05, "loss": 2.3658, "step": 1361 }, { "epoch": 0.4485981308411215, "grad_norm": 2.6391570568084717, "learning_rate": 5.946412750780892e-05, "loss": 2.1763, "step": 1362 }, { "epoch": 0.44892749804438226, "grad_norm": 2.597666025161743, "learning_rate": 5.941246787497884e-05, "loss": 2.1193, "step": 1363 }, { "epoch": 0.44925686524764297, "grad_norm": 2.591607093811035, "learning_rate": 5.9360797823207104e-05, "loss": 1.9894, "step": 1364 }, { "epoch": 0.4495862324509037, "grad_norm": 2.4428396224975586, "learning_rate": 5.930911740968884e-05, "loss": 2.1478, "step": 1365 }, { "epoch": 0.44991559965416444, "grad_norm": 2.8427088260650635, "learning_rate": 5.9257426691630656e-05, "loss": 2.0804, "step": 1366 }, { "epoch": 0.45024496685742516, "grad_norm": 2.4200117588043213, "learning_rate": 5.920572572625056e-05, "loss": 2.0263, "step": 1367 }, { "epoch": 0.4505743340606859, "grad_norm": 2.4792933464050293, "learning_rate": 5.915401457077785e-05, "loss": 2.1457, "step": 1368 }, { "epoch": 0.45090370126394663, "grad_norm": 3.0544183254241943, "learning_rate": 5.910229328245319e-05, "loss": 2.476, "step": 1369 }, { "epoch": 0.4512330684672074, "grad_norm": 3.122922897338867, "learning_rate": 5.90505619185284e-05, "loss": 1.8849, "step": 1370 }, { "epoch": 0.4515624356704681, "grad_norm": 3.2922286987304688, "learning_rate": 5.899882053626646e-05, "loss": 2.1517, "step": 1371 }, { "epoch": 0.4518918028737289, "grad_norm": 3.095803737640381, "learning_rate": 5.8947069192941493e-05, "loss": 2.1022, "step": 1372 }, { "epoch": 0.4522211700769896, "grad_norm": 3.426305055618286, "learning_rate": 5.889530794583855e-05, "loss": 2.1558, "step": 1373 }, { "epoch": 0.4525505372802503, "grad_norm": 3.2254509925842285, "learning_rate": 5.8843536852253745e-05, "loss": 2.2135, "step": 1374 }, { "epoch": 0.45287990448351106, "grad_norm": 3.3690662384033203, "learning_rate": 5.879175596949401e-05, "loss": 2.0148, "step": 1375 }, { "epoch": 0.4532092716867718, "grad_norm": 2.666074275970459, "learning_rate": 5.8739965354877194e-05, "loss": 2.445, "step": 1376 }, { "epoch": 0.45353863889003254, "grad_norm": 2.3343346118927, "learning_rate": 5.8688165065731826e-05, "loss": 2.332, "step": 1377 }, { "epoch": 0.45386800609329325, "grad_norm": 2.3201119899749756, "learning_rate": 5.8636355159397225e-05, "loss": 2.5834, "step": 1378 }, { "epoch": 0.454197373296554, "grad_norm": 2.493422269821167, "learning_rate": 5.858453569322332e-05, "loss": 2.5553, "step": 1379 }, { "epoch": 0.45452674049981473, "grad_norm": 2.374323606491089, "learning_rate": 5.853270672457061e-05, "loss": 2.1962, "step": 1380 }, { "epoch": 0.45485610770307544, "grad_norm": 2.812485694885254, "learning_rate": 5.8480868310810124e-05, "loss": 2.3164, "step": 1381 }, { "epoch": 0.4551854749063362, "grad_norm": 2.3375439643859863, "learning_rate": 5.8429020509323385e-05, "loss": 2.4357, "step": 1382 }, { "epoch": 0.4555148421095969, "grad_norm": 2.376716136932373, "learning_rate": 5.837716337750223e-05, "loss": 2.5416, "step": 1383 }, { "epoch": 0.4558442093128577, "grad_norm": 2.5697383880615234, "learning_rate": 5.8325296972748864e-05, "loss": 2.3119, "step": 1384 }, { "epoch": 0.4561735765161184, "grad_norm": 2.5520308017730713, "learning_rate": 5.827342135247581e-05, "loss": 2.3784, "step": 1385 }, { "epoch": 0.45650294371937916, "grad_norm": 2.750321865081787, "learning_rate": 5.8221536574105694e-05, "loss": 2.5948, "step": 1386 }, { "epoch": 0.4568323109226399, "grad_norm": 2.980220317840576, "learning_rate": 5.816964269507135e-05, "loss": 2.5563, "step": 1387 }, { "epoch": 0.45716167812590064, "grad_norm": 2.898198366165161, "learning_rate": 5.811773977281565e-05, "loss": 2.2648, "step": 1388 }, { "epoch": 0.45749104532916135, "grad_norm": 3.0173451900482178, "learning_rate": 5.806582786479149e-05, "loss": 2.2571, "step": 1389 }, { "epoch": 0.45782041253242206, "grad_norm": 2.6375420093536377, "learning_rate": 5.801390702846171e-05, "loss": 2.0654, "step": 1390 }, { "epoch": 0.4581497797356828, "grad_norm": 2.8282692432403564, "learning_rate": 5.796197732129905e-05, "loss": 2.3666, "step": 1391 }, { "epoch": 0.45847914693894354, "grad_norm": 2.815434694290161, "learning_rate": 5.7910038800786e-05, "loss": 2.2048, "step": 1392 }, { "epoch": 0.4588085141422043, "grad_norm": 2.911081075668335, "learning_rate": 5.7858091524414926e-05, "loss": 2.3954, "step": 1393 }, { "epoch": 0.459137881345465, "grad_norm": 2.4357879161834717, "learning_rate": 5.780613554968777e-05, "loss": 1.9379, "step": 1394 }, { "epoch": 0.4594672485487258, "grad_norm": 3.2798242568969727, "learning_rate": 5.775417093411619e-05, "loss": 2.3849, "step": 1395 }, { "epoch": 0.4597966157519865, "grad_norm": 2.6205365657806396, "learning_rate": 5.770219773522133e-05, "loss": 2.2528, "step": 1396 }, { "epoch": 0.4601259829552472, "grad_norm": 2.84399676322937, "learning_rate": 5.765021601053391e-05, "loss": 1.8288, "step": 1397 }, { "epoch": 0.46045535015850797, "grad_norm": 2.869101047515869, "learning_rate": 5.7598225817594035e-05, "loss": 2.3284, "step": 1398 }, { "epoch": 0.4607847173617687, "grad_norm": 2.6911323070526123, "learning_rate": 5.754622721395119e-05, "loss": 2.0477, "step": 1399 }, { "epoch": 0.46111408456502945, "grad_norm": 3.472745656967163, "learning_rate": 5.74942202571642e-05, "loss": 1.8559, "step": 1400 }, { "epoch": 0.46144345176829016, "grad_norm": 2.722027063369751, "learning_rate": 5.744220500480113e-05, "loss": 2.5318, "step": 1401 }, { "epoch": 0.4617728189715509, "grad_norm": 2.0918314456939697, "learning_rate": 5.739018151443918e-05, "loss": 2.1847, "step": 1402 }, { "epoch": 0.46210218617481164, "grad_norm": 2.0019867420196533, "learning_rate": 5.733814984366474e-05, "loss": 2.2064, "step": 1403 }, { "epoch": 0.4624315533780724, "grad_norm": 2.3096742630004883, "learning_rate": 5.7286110050073194e-05, "loss": 2.0878, "step": 1404 }, { "epoch": 0.4627609205813331, "grad_norm": 2.7274067401885986, "learning_rate": 5.723406219126895e-05, "loss": 2.2556, "step": 1405 }, { "epoch": 0.4630902877845938, "grad_norm": 2.2362143993377686, "learning_rate": 5.718200632486534e-05, "loss": 2.0013, "step": 1406 }, { "epoch": 0.4634196549878546, "grad_norm": 2.458976984024048, "learning_rate": 5.7129942508484556e-05, "loss": 2.1049, "step": 1407 }, { "epoch": 0.4637490221911153, "grad_norm": 2.278069496154785, "learning_rate": 5.707787079975758e-05, "loss": 2.0951, "step": 1408 }, { "epoch": 0.46407838939437607, "grad_norm": 2.235696315765381, "learning_rate": 5.702579125632416e-05, "loss": 2.4472, "step": 1409 }, { "epoch": 0.4644077565976368, "grad_norm": 2.662332773208618, "learning_rate": 5.697370393583269e-05, "loss": 2.1896, "step": 1410 }, { "epoch": 0.46473712380089754, "grad_norm": 2.677049398422241, "learning_rate": 5.692160889594017e-05, "loss": 2.0806, "step": 1411 }, { "epoch": 0.46506649100415826, "grad_norm": 2.7263095378875732, "learning_rate": 5.686950619431215e-05, "loss": 2.1826, "step": 1412 }, { "epoch": 0.465395858207419, "grad_norm": 2.625284194946289, "learning_rate": 5.6817395888622694e-05, "loss": 2.4464, "step": 1413 }, { "epoch": 0.46572522541067973, "grad_norm": 2.6425976753234863, "learning_rate": 5.6765278036554225e-05, "loss": 2.0412, "step": 1414 }, { "epoch": 0.46605459261394044, "grad_norm": 2.918874502182007, "learning_rate": 5.671315269579756e-05, "loss": 2.3, "step": 1415 }, { "epoch": 0.4663839598172012, "grad_norm": 2.5336811542510986, "learning_rate": 5.6661019924051814e-05, "loss": 2.4006, "step": 1416 }, { "epoch": 0.4667133270204619, "grad_norm": 2.6169357299804688, "learning_rate": 5.6608879779024274e-05, "loss": 2.1464, "step": 1417 }, { "epoch": 0.4670426942237227, "grad_norm": 2.734600305557251, "learning_rate": 5.6556732318430437e-05, "loss": 2.0906, "step": 1418 }, { "epoch": 0.4673720614269834, "grad_norm": 2.7634074687957764, "learning_rate": 5.650457759999389e-05, "loss": 2.1538, "step": 1419 }, { "epoch": 0.46770142863024416, "grad_norm": 2.391335964202881, "learning_rate": 5.6452415681446256e-05, "loss": 1.7893, "step": 1420 }, { "epoch": 0.4680307958335049, "grad_norm": 2.9604604244232178, "learning_rate": 5.6400246620527096e-05, "loss": 2.1226, "step": 1421 }, { "epoch": 0.4683601630367656, "grad_norm": 2.650703191757202, "learning_rate": 5.6348070474983905e-05, "loss": 1.8164, "step": 1422 }, { "epoch": 0.46868953024002635, "grad_norm": 2.603179454803467, "learning_rate": 5.629588730257205e-05, "loss": 1.6553, "step": 1423 }, { "epoch": 0.46901889744328706, "grad_norm": 2.903170108795166, "learning_rate": 5.6243697161054584e-05, "loss": 1.8419, "step": 1424 }, { "epoch": 0.46934826464654783, "grad_norm": 3.1051528453826904, "learning_rate": 5.619150010820238e-05, "loss": 1.6226, "step": 1425 }, { "epoch": 0.46967763184980854, "grad_norm": 1.8390017747879028, "learning_rate": 5.613929620179389e-05, "loss": 2.2257, "step": 1426 }, { "epoch": 0.4700069990530693, "grad_norm": 2.490265130996704, "learning_rate": 5.608708549961519e-05, "loss": 2.3091, "step": 1427 }, { "epoch": 0.47033636625633, "grad_norm": 2.267498254776001, "learning_rate": 5.603486805945984e-05, "loss": 2.4721, "step": 1428 }, { "epoch": 0.4706657334595908, "grad_norm": 2.788170337677002, "learning_rate": 5.598264393912891e-05, "loss": 2.453, "step": 1429 }, { "epoch": 0.4709951006628515, "grad_norm": 2.2585389614105225, "learning_rate": 5.593041319643083e-05, "loss": 2.2228, "step": 1430 }, { "epoch": 0.4713244678661122, "grad_norm": 2.556220531463623, "learning_rate": 5.587817588918137e-05, "loss": 2.3444, "step": 1431 }, { "epoch": 0.471653835069373, "grad_norm": 2.604313611984253, "learning_rate": 5.582593207520357e-05, "loss": 2.1515, "step": 1432 }, { "epoch": 0.4719832022726337, "grad_norm": 2.6779942512512207, "learning_rate": 5.577368181232764e-05, "loss": 2.1363, "step": 1433 }, { "epoch": 0.47231256947589445, "grad_norm": 2.4980621337890625, "learning_rate": 5.572142515839098e-05, "loss": 2.0295, "step": 1434 }, { "epoch": 0.47264193667915516, "grad_norm": 2.331618309020996, "learning_rate": 5.5669162171238046e-05, "loss": 2.1247, "step": 1435 }, { "epoch": 0.4729713038824159, "grad_norm": 3.248292922973633, "learning_rate": 5.5616892908720274e-05, "loss": 2.2727, "step": 1436 }, { "epoch": 0.47330067108567664, "grad_norm": 2.3975675106048584, "learning_rate": 5.556461742869609e-05, "loss": 2.3504, "step": 1437 }, { "epoch": 0.4736300382889374, "grad_norm": 2.7226855754852295, "learning_rate": 5.551233578903078e-05, "loss": 2.0893, "step": 1438 }, { "epoch": 0.4739594054921981, "grad_norm": 2.8404018878936768, "learning_rate": 5.5460048047596434e-05, "loss": 2.4199, "step": 1439 }, { "epoch": 0.4742887726954588, "grad_norm": 2.4289791584014893, "learning_rate": 5.540775426227194e-05, "loss": 2.4747, "step": 1440 }, { "epoch": 0.4746181398987196, "grad_norm": 2.772507905960083, "learning_rate": 5.535545449094283e-05, "loss": 2.054, "step": 1441 }, { "epoch": 0.4749475071019803, "grad_norm": 2.7915074825286865, "learning_rate": 5.5303148791501305e-05, "loss": 2.3271, "step": 1442 }, { "epoch": 0.47527687430524107, "grad_norm": 2.894771099090576, "learning_rate": 5.525083722184607e-05, "loss": 2.2738, "step": 1443 }, { "epoch": 0.4756062415085018, "grad_norm": 2.797031879425049, "learning_rate": 5.519851983988239e-05, "loss": 2.3225, "step": 1444 }, { "epoch": 0.47593560871176255, "grad_norm": 3.053269386291504, "learning_rate": 5.514619670352192e-05, "loss": 2.1418, "step": 1445 }, { "epoch": 0.47626497591502326, "grad_norm": 2.743236780166626, "learning_rate": 5.5093867870682725e-05, "loss": 1.966, "step": 1446 }, { "epoch": 0.47659434311828397, "grad_norm": 2.998206615447998, "learning_rate": 5.504153339928914e-05, "loss": 2.2283, "step": 1447 }, { "epoch": 0.47692371032154474, "grad_norm": 3.0275509357452393, "learning_rate": 5.498919334727175e-05, "loss": 2.0081, "step": 1448 }, { "epoch": 0.47725307752480545, "grad_norm": 3.7755558490753174, "learning_rate": 5.4936847772567314e-05, "loss": 2.3072, "step": 1449 }, { "epoch": 0.4775824447280662, "grad_norm": 3.9976677894592285, "learning_rate": 5.488449673311872e-05, "loss": 2.2594, "step": 1450 }, { "epoch": 0.4779118119313269, "grad_norm": 2.3137824535369873, "learning_rate": 5.48321402868749e-05, "loss": 2.6206, "step": 1451 }, { "epoch": 0.4782411791345877, "grad_norm": 2.003905773162842, "learning_rate": 5.477977849179076e-05, "loss": 2.221, "step": 1452 }, { "epoch": 0.4785705463378484, "grad_norm": 2.37841534614563, "learning_rate": 5.4727411405827136e-05, "loss": 2.5322, "step": 1453 }, { "epoch": 0.47889991354110917, "grad_norm": 2.259744644165039, "learning_rate": 5.467503908695073e-05, "loss": 1.9622, "step": 1454 }, { "epoch": 0.4792292807443699, "grad_norm": 2.1672475337982178, "learning_rate": 5.4622661593133996e-05, "loss": 2.0246, "step": 1455 }, { "epoch": 0.4795586479476306, "grad_norm": 2.359017848968506, "learning_rate": 5.457027898235517e-05, "loss": 2.0392, "step": 1456 }, { "epoch": 0.47988801515089136, "grad_norm": 2.5766847133636475, "learning_rate": 5.451789131259814e-05, "loss": 2.2329, "step": 1457 }, { "epoch": 0.48021738235415207, "grad_norm": 2.3741567134857178, "learning_rate": 5.446549864185233e-05, "loss": 2.2631, "step": 1458 }, { "epoch": 0.48054674955741283, "grad_norm": 2.526066541671753, "learning_rate": 5.44131010281128e-05, "loss": 2.1493, "step": 1459 }, { "epoch": 0.48087611676067354, "grad_norm": 2.551849126815796, "learning_rate": 5.4360698529380004e-05, "loss": 2.3144, "step": 1460 }, { "epoch": 0.4812054839639343, "grad_norm": 2.2339227199554443, "learning_rate": 5.4308291203659855e-05, "loss": 2.2004, "step": 1461 }, { "epoch": 0.481534851167195, "grad_norm": 2.5119829177856445, "learning_rate": 5.425587910896357e-05, "loss": 2.0944, "step": 1462 }, { "epoch": 0.4818642183704558, "grad_norm": 2.8552167415618896, "learning_rate": 5.4203462303307685e-05, "loss": 2.1718, "step": 1463 }, { "epoch": 0.4821935855737165, "grad_norm": 2.6687982082366943, "learning_rate": 5.4151040844713886e-05, "loss": 2.2164, "step": 1464 }, { "epoch": 0.4825229527769772, "grad_norm": 2.963787317276001, "learning_rate": 5.409861479120908e-05, "loss": 2.22, "step": 1465 }, { "epoch": 0.482852319980238, "grad_norm": 2.990370750427246, "learning_rate": 5.404618420082521e-05, "loss": 2.2234, "step": 1466 }, { "epoch": 0.4831816871834987, "grad_norm": 2.695317506790161, "learning_rate": 5.39937491315993e-05, "loss": 2.2448, "step": 1467 }, { "epoch": 0.48351105438675945, "grad_norm": 2.775763988494873, "learning_rate": 5.394130964157324e-05, "loss": 2.1806, "step": 1468 }, { "epoch": 0.48384042159002016, "grad_norm": 2.3641860485076904, "learning_rate": 5.388886578879392e-05, "loss": 1.927, "step": 1469 }, { "epoch": 0.48416978879328093, "grad_norm": 2.720329999923706, "learning_rate": 5.383641763131297e-05, "loss": 1.9248, "step": 1470 }, { "epoch": 0.48449915599654164, "grad_norm": 2.7788572311401367, "learning_rate": 5.378396522718683e-05, "loss": 1.9824, "step": 1471 }, { "epoch": 0.48482852319980235, "grad_norm": 3.20474910736084, "learning_rate": 5.373150863447662e-05, "loss": 1.5423, "step": 1472 }, { "epoch": 0.4851578904030631, "grad_norm": 2.918916940689087, "learning_rate": 5.367904791124815e-05, "loss": 1.9387, "step": 1473 }, { "epoch": 0.48548725760632383, "grad_norm": 3.4597556591033936, "learning_rate": 5.3626583115571716e-05, "loss": 2.226, "step": 1474 }, { "epoch": 0.4858166248095846, "grad_norm": 3.169464588165283, "learning_rate": 5.357411430552216e-05, "loss": 2.0575, "step": 1475 }, { "epoch": 0.4861459920128453, "grad_norm": 2.0502803325653076, "learning_rate": 5.352164153917882e-05, "loss": 2.3078, "step": 1476 }, { "epoch": 0.4864753592161061, "grad_norm": 2.205932855606079, "learning_rate": 5.3469164874625345e-05, "loss": 2.3107, "step": 1477 }, { "epoch": 0.4868047264193668, "grad_norm": 1.9236304759979248, "learning_rate": 5.341668436994971e-05, "loss": 2.22, "step": 1478 }, { "epoch": 0.48713409362262755, "grad_norm": 2.331031322479248, "learning_rate": 5.3364200083244175e-05, "loss": 2.1931, "step": 1479 }, { "epoch": 0.48746346082588826, "grad_norm": 2.279069185256958, "learning_rate": 5.3311712072605136e-05, "loss": 2.2918, "step": 1480 }, { "epoch": 0.487792828029149, "grad_norm": 2.3501081466674805, "learning_rate": 5.325922039613316e-05, "loss": 2.28, "step": 1481 }, { "epoch": 0.48812219523240974, "grad_norm": 2.4179487228393555, "learning_rate": 5.320672511193285e-05, "loss": 2.4011, "step": 1482 }, { "epoch": 0.48845156243567045, "grad_norm": 2.3031511306762695, "learning_rate": 5.315422627811278e-05, "loss": 2.0346, "step": 1483 }, { "epoch": 0.4887809296389312, "grad_norm": 3.2005553245544434, "learning_rate": 5.310172395278551e-05, "loss": 2.5121, "step": 1484 }, { "epoch": 0.4891102968421919, "grad_norm": 2.487902879714966, "learning_rate": 5.3049218194067394e-05, "loss": 2.0688, "step": 1485 }, { "epoch": 0.4894396640454527, "grad_norm": 2.2701635360717773, "learning_rate": 5.299670906007866e-05, "loss": 2.0644, "step": 1486 }, { "epoch": 0.4897690312487134, "grad_norm": 3.022916078567505, "learning_rate": 5.294419660894322e-05, "loss": 2.3781, "step": 1487 }, { "epoch": 0.49009839845197417, "grad_norm": 2.469449758529663, "learning_rate": 5.2891680898788665e-05, "loss": 1.7029, "step": 1488 }, { "epoch": 0.4904277656552349, "grad_norm": 2.625373363494873, "learning_rate": 5.283916198774621e-05, "loss": 2.0506, "step": 1489 }, { "epoch": 0.4907571328584956, "grad_norm": 2.7571139335632324, "learning_rate": 5.2786639933950597e-05, "loss": 2.1408, "step": 1490 }, { "epoch": 0.49108650006175636, "grad_norm": 2.4763495922088623, "learning_rate": 5.273411479554008e-05, "loss": 1.9425, "step": 1491 }, { "epoch": 0.49141586726501707, "grad_norm": 3.169473171234131, "learning_rate": 5.2681586630656276e-05, "loss": 2.4006, "step": 1492 }, { "epoch": 0.49174523446827784, "grad_norm": 2.6018457412719727, "learning_rate": 5.262905549744419e-05, "loss": 2.0221, "step": 1493 }, { "epoch": 0.49207460167153855, "grad_norm": 2.7356503009796143, "learning_rate": 5.25765214540521e-05, "loss": 2.1555, "step": 1494 }, { "epoch": 0.4924039688747993, "grad_norm": 2.443387269973755, "learning_rate": 5.2523984558631514e-05, "loss": 2.0985, "step": 1495 }, { "epoch": 0.49273333607806, "grad_norm": 2.6359920501708984, "learning_rate": 5.247144486933706e-05, "loss": 2.217, "step": 1496 }, { "epoch": 0.49306270328132074, "grad_norm": 2.7415084838867188, "learning_rate": 5.241890244432652e-05, "loss": 1.8597, "step": 1497 }, { "epoch": 0.4933920704845815, "grad_norm": 2.734299421310425, "learning_rate": 5.236635734176069e-05, "loss": 2.2436, "step": 1498 }, { "epoch": 0.4937214376878422, "grad_norm": 2.9822301864624023, "learning_rate": 5.231380961980326e-05, "loss": 2.0025, "step": 1499 }, { "epoch": 0.494050804891103, "grad_norm": 3.2635011672973633, "learning_rate": 5.226125933662088e-05, "loss": 2.073, "step": 1500 }, { "epoch": 0.4943801720943637, "grad_norm": 2.0584778785705566, "learning_rate": 5.220870655038308e-05, "loss": 2.2952, "step": 1501 }, { "epoch": 0.49470953929762446, "grad_norm": 2.4790139198303223, "learning_rate": 5.2156151319262045e-05, "loss": 2.5564, "step": 1502 }, { "epoch": 0.49503890650088517, "grad_norm": 2.687093496322632, "learning_rate": 5.2103593701432776e-05, "loss": 2.0692, "step": 1503 }, { "epoch": 0.49536827370414593, "grad_norm": 2.1168501377105713, "learning_rate": 5.2051033755072834e-05, "loss": 2.065, "step": 1504 }, { "epoch": 0.49569764090740664, "grad_norm": 2.593621253967285, "learning_rate": 5.199847153836241e-05, "loss": 2.2862, "step": 1505 }, { "epoch": 0.49602700811066736, "grad_norm": 2.273329257965088, "learning_rate": 5.194590710948419e-05, "loss": 1.9901, "step": 1506 }, { "epoch": 0.4963563753139281, "grad_norm": 2.755690336227417, "learning_rate": 5.189334052662331e-05, "loss": 2.1752, "step": 1507 }, { "epoch": 0.49668574251718883, "grad_norm": 2.4404118061065674, "learning_rate": 5.1840771847967286e-05, "loss": 2.1709, "step": 1508 }, { "epoch": 0.4970151097204496, "grad_norm": 2.30769681930542, "learning_rate": 5.178820113170596e-05, "loss": 2.2124, "step": 1509 }, { "epoch": 0.4973444769237103, "grad_norm": 2.8207881450653076, "learning_rate": 5.1735628436031436e-05, "loss": 2.2862, "step": 1510 }, { "epoch": 0.4976738441269711, "grad_norm": 2.278991460800171, "learning_rate": 5.1683053819137975e-05, "loss": 2.0548, "step": 1511 }, { "epoch": 0.4980032113302318, "grad_norm": 2.4886574745178223, "learning_rate": 5.1630477339221994e-05, "loss": 2.3737, "step": 1512 }, { "epoch": 0.49833257853349255, "grad_norm": 2.911540985107422, "learning_rate": 5.1577899054481996e-05, "loss": 2.0253, "step": 1513 }, { "epoch": 0.49866194573675326, "grad_norm": 2.7352163791656494, "learning_rate": 5.1525319023118434e-05, "loss": 2.3057, "step": 1514 }, { "epoch": 0.498991312940014, "grad_norm": 2.4180266857147217, "learning_rate": 5.147273730333372e-05, "loss": 2.3452, "step": 1515 }, { "epoch": 0.49932068014327474, "grad_norm": 2.5061450004577637, "learning_rate": 5.14201539533321e-05, "loss": 1.8195, "step": 1516 }, { "epoch": 0.49965004734653545, "grad_norm": 2.4668986797332764, "learning_rate": 5.1367569031319715e-05, "loss": 1.9367, "step": 1517 }, { "epoch": 0.4999794145497962, "grad_norm": 2.661878824234009, "learning_rate": 5.1314982595504335e-05, "loss": 1.9323, "step": 1518 }, { "epoch": 0.4999794145497962, "eval_loss": 2.212698221206665, "eval_runtime": 764.8335, "eval_samples_per_second": 3.343, "eval_steps_per_second": 1.672, "step": 1518 }, { "epoch": 0.5003087817530569, "grad_norm": 3.1498970985412598, "learning_rate": 5.126239470409546e-05, "loss": 2.2301, "step": 1519 }, { "epoch": 0.5006381489563176, "grad_norm": 2.7907960414886475, "learning_rate": 5.1209805415304224e-05, "loss": 2.0078, "step": 1520 }, { "epoch": 0.5009675161595785, "grad_norm": 3.4877936840057373, "learning_rate": 5.115721478734323e-05, "loss": 2.7673, "step": 1521 }, { "epoch": 0.5012968833628392, "grad_norm": 3.1865594387054443, "learning_rate": 5.1104622878426664e-05, "loss": 1.9934, "step": 1522 }, { "epoch": 0.5016262505660999, "grad_norm": 2.964200258255005, "learning_rate": 5.105202974677008e-05, "loss": 1.8623, "step": 1523 }, { "epoch": 0.5019556177693606, "grad_norm": 3.1753077507019043, "learning_rate": 5.099943545059035e-05, "loss": 1.9103, "step": 1524 }, { "epoch": 0.5022849849726213, "grad_norm": 3.5453011989593506, "learning_rate": 5.094684004810568e-05, "loss": 2.223, "step": 1525 }, { "epoch": 0.5026143521758821, "grad_norm": 1.9202203750610352, "learning_rate": 5.089424359753553e-05, "loss": 2.3095, "step": 1526 }, { "epoch": 0.5029437193791428, "grad_norm": 2.3142552375793457, "learning_rate": 5.084164615710042e-05, "loss": 2.0537, "step": 1527 }, { "epoch": 0.5032730865824036, "grad_norm": 2.545691728591919, "learning_rate": 5.078904778502206e-05, "loss": 2.222, "step": 1528 }, { "epoch": 0.5036024537856643, "grad_norm": 2.6911139488220215, "learning_rate": 5.0736448539523174e-05, "loss": 2.2372, "step": 1529 }, { "epoch": 0.5039318209889251, "grad_norm": 2.5566017627716064, "learning_rate": 5.06838484788274e-05, "loss": 2.2237, "step": 1530 }, { "epoch": 0.5042611881921858, "grad_norm": 3.251295804977417, "learning_rate": 5.063124766115933e-05, "loss": 2.6693, "step": 1531 }, { "epoch": 0.5045905553954465, "grad_norm": 2.5430779457092285, "learning_rate": 5.057864614474439e-05, "loss": 2.6509, "step": 1532 }, { "epoch": 0.5049199225987072, "grad_norm": 2.509148597717285, "learning_rate": 5.052604398780877e-05, "loss": 2.2311, "step": 1533 }, { "epoch": 0.5052492898019679, "grad_norm": 2.435283899307251, "learning_rate": 5.047344124857933e-05, "loss": 2.2194, "step": 1534 }, { "epoch": 0.5055786570052287, "grad_norm": 2.5991721153259277, "learning_rate": 5.0420837985283664e-05, "loss": 2.3425, "step": 1535 }, { "epoch": 0.5059080242084895, "grad_norm": 2.383134126663208, "learning_rate": 5.036823425614986e-05, "loss": 2.3328, "step": 1536 }, { "epoch": 0.5062373914117502, "grad_norm": 3.121763229370117, "learning_rate": 5.0315630119406565e-05, "loss": 2.4681, "step": 1537 }, { "epoch": 0.5065667586150109, "grad_norm": 2.824033260345459, "learning_rate": 5.0263025633282866e-05, "loss": 2.3247, "step": 1538 }, { "epoch": 0.5068961258182716, "grad_norm": 2.942976713180542, "learning_rate": 5.021042085600827e-05, "loss": 2.254, "step": 1539 }, { "epoch": 0.5072254930215324, "grad_norm": 2.7724997997283936, "learning_rate": 5.015781584581252e-05, "loss": 2.0605, "step": 1540 }, { "epoch": 0.5075548602247931, "grad_norm": 2.791670322418213, "learning_rate": 5.0105210660925704e-05, "loss": 2.0396, "step": 1541 }, { "epoch": 0.5078842274280538, "grad_norm": 2.5132830142974854, "learning_rate": 5.0052605359578086e-05, "loss": 1.7545, "step": 1542 }, { "epoch": 0.5082135946313145, "grad_norm": 2.5325276851654053, "learning_rate": 5e-05, "loss": 1.737, "step": 1543 }, { "epoch": 0.5085429618345754, "grad_norm": 3.5093564987182617, "learning_rate": 4.994739464042193e-05, "loss": 2.3988, "step": 1544 }, { "epoch": 0.5088723290378361, "grad_norm": 3.0075719356536865, "learning_rate": 4.989478933907431e-05, "loss": 2.0844, "step": 1545 }, { "epoch": 0.5092016962410968, "grad_norm": 2.6146292686462402, "learning_rate": 4.9842184154187486e-05, "loss": 2.0929, "step": 1546 }, { "epoch": 0.5095310634443575, "grad_norm": 2.921995162963867, "learning_rate": 4.978957914399174e-05, "loss": 2.162, "step": 1547 }, { "epoch": 0.5098604306476182, "grad_norm": 3.16227650642395, "learning_rate": 4.973697436671714e-05, "loss": 2.2389, "step": 1548 }, { "epoch": 0.510189797850879, "grad_norm": 2.8888611793518066, "learning_rate": 4.9684369880593446e-05, "loss": 1.9225, "step": 1549 }, { "epoch": 0.5105191650541397, "grad_norm": 3.1876556873321533, "learning_rate": 4.963176574385015e-05, "loss": 1.7554, "step": 1550 }, { "epoch": 0.5108485322574005, "grad_norm": 2.1988892555236816, "learning_rate": 4.957916201471635e-05, "loss": 2.6424, "step": 1551 }, { "epoch": 0.5111778994606612, "grad_norm": 2.707622766494751, "learning_rate": 4.952655875142068e-05, "loss": 2.4397, "step": 1552 }, { "epoch": 0.511507266663922, "grad_norm": 2.7695624828338623, "learning_rate": 4.947395601219126e-05, "loss": 2.2901, "step": 1553 }, { "epoch": 0.5118366338671827, "grad_norm": 2.7299704551696777, "learning_rate": 4.9421353855255624e-05, "loss": 2.4199, "step": 1554 }, { "epoch": 0.5121660010704434, "grad_norm": 2.7451207637786865, "learning_rate": 4.936875233884069e-05, "loss": 2.1827, "step": 1555 }, { "epoch": 0.5124953682737041, "grad_norm": 2.116290807723999, "learning_rate": 4.931615152117262e-05, "loss": 1.9413, "step": 1556 }, { "epoch": 0.5128247354769648, "grad_norm": 2.1912240982055664, "learning_rate": 4.926355146047685e-05, "loss": 2.3163, "step": 1557 }, { "epoch": 0.5131541026802257, "grad_norm": 2.397310495376587, "learning_rate": 4.9210952214977954e-05, "loss": 2.022, "step": 1558 }, { "epoch": 0.5134834698834864, "grad_norm": 2.5802431106567383, "learning_rate": 4.915835384289958e-05, "loss": 2.3069, "step": 1559 }, { "epoch": 0.5138128370867471, "grad_norm": 2.7203660011291504, "learning_rate": 4.9105756402464486e-05, "loss": 2.6652, "step": 1560 }, { "epoch": 0.5141422042900078, "grad_norm": 2.513911247253418, "learning_rate": 4.905315995189431e-05, "loss": 2.012, "step": 1561 }, { "epoch": 0.5144715714932686, "grad_norm": 3.2579002380371094, "learning_rate": 4.900056454940965e-05, "loss": 2.3097, "step": 1562 }, { "epoch": 0.5148009386965293, "grad_norm": 3.077726364135742, "learning_rate": 4.894797025322993e-05, "loss": 2.1601, "step": 1563 }, { "epoch": 0.51513030589979, "grad_norm": 3.066802501678467, "learning_rate": 4.8895377121573334e-05, "loss": 2.5118, "step": 1564 }, { "epoch": 0.5154596731030507, "grad_norm": 2.064492702484131, "learning_rate": 4.884278521265677e-05, "loss": 1.8952, "step": 1565 }, { "epoch": 0.5157890403063115, "grad_norm": 2.6084978580474854, "learning_rate": 4.879019458469579e-05, "loss": 2.2112, "step": 1566 }, { "epoch": 0.5161184075095723, "grad_norm": 2.9014785289764404, "learning_rate": 4.873760529590455e-05, "loss": 2.0184, "step": 1567 }, { "epoch": 0.516447774712833, "grad_norm": 2.735502243041992, "learning_rate": 4.8685017404495683e-05, "loss": 2.0125, "step": 1568 }, { "epoch": 0.5167771419160937, "grad_norm": 2.60123348236084, "learning_rate": 4.863243096868029e-05, "loss": 2.0574, "step": 1569 }, { "epoch": 0.5171065091193544, "grad_norm": 2.9571070671081543, "learning_rate": 4.85798460466679e-05, "loss": 2.1061, "step": 1570 }, { "epoch": 0.5174358763226152, "grad_norm": 2.7076237201690674, "learning_rate": 4.85272626966663e-05, "loss": 2.0312, "step": 1571 }, { "epoch": 0.5177652435258759, "grad_norm": 2.76377010345459, "learning_rate": 4.847468097688157e-05, "loss": 1.9232, "step": 1572 }, { "epoch": 0.5180946107291367, "grad_norm": 2.923283338546753, "learning_rate": 4.8422100945518015e-05, "loss": 2.0029, "step": 1573 }, { "epoch": 0.5184239779323974, "grad_norm": 2.9562487602233887, "learning_rate": 4.836952266077801e-05, "loss": 2.1253, "step": 1574 }, { "epoch": 0.5187533451356581, "grad_norm": 3.1904914379119873, "learning_rate": 4.8316946180862036e-05, "loss": 2.2682, "step": 1575 }, { "epoch": 0.5190827123389189, "grad_norm": 2.154838800430298, "learning_rate": 4.826437156396858e-05, "loss": 2.6257, "step": 1576 }, { "epoch": 0.5194120795421796, "grad_norm": 2.501347541809082, "learning_rate": 4.821179886829405e-05, "loss": 2.0692, "step": 1577 }, { "epoch": 0.5197414467454403, "grad_norm": 2.355531692504883, "learning_rate": 4.815922815203272e-05, "loss": 2.2775, "step": 1578 }, { "epoch": 0.520070813948701, "grad_norm": 2.39996075630188, "learning_rate": 4.8106659473376695e-05, "loss": 2.3885, "step": 1579 }, { "epoch": 0.5204001811519617, "grad_norm": 2.339895486831665, "learning_rate": 4.805409289051582e-05, "loss": 2.2418, "step": 1580 }, { "epoch": 0.5207295483552226, "grad_norm": 2.3130595684051514, "learning_rate": 4.80015284616376e-05, "loss": 2.0223, "step": 1581 }, { "epoch": 0.5210589155584833, "grad_norm": 2.9214603900909424, "learning_rate": 4.794896624492718e-05, "loss": 2.4215, "step": 1582 }, { "epoch": 0.521388282761744, "grad_norm": 2.5012197494506836, "learning_rate": 4.789640629856725e-05, "loss": 2.3697, "step": 1583 }, { "epoch": 0.5217176499650047, "grad_norm": 2.809415817260742, "learning_rate": 4.7843848680737966e-05, "loss": 2.1699, "step": 1584 }, { "epoch": 0.5220470171682655, "grad_norm": 2.9632301330566406, "learning_rate": 4.779129344961694e-05, "loss": 2.2572, "step": 1585 }, { "epoch": 0.5223763843715262, "grad_norm": 2.583806276321411, "learning_rate": 4.7738740663379135e-05, "loss": 2.1202, "step": 1586 }, { "epoch": 0.5227057515747869, "grad_norm": 3.0589942932128906, "learning_rate": 4.768619038019677e-05, "loss": 2.2599, "step": 1587 }, { "epoch": 0.5230351187780476, "grad_norm": 2.770036220550537, "learning_rate": 4.7633642658239345e-05, "loss": 1.9833, "step": 1588 }, { "epoch": 0.5233644859813084, "grad_norm": 3.0664565563201904, "learning_rate": 4.758109755567348e-05, "loss": 2.0921, "step": 1589 }, { "epoch": 0.5236938531845692, "grad_norm": 2.592406749725342, "learning_rate": 4.752855513066293e-05, "loss": 1.9637, "step": 1590 }, { "epoch": 0.5240232203878299, "grad_norm": 2.8975071907043457, "learning_rate": 4.747601544136849e-05, "loss": 2.2294, "step": 1591 }, { "epoch": 0.5243525875910906, "grad_norm": 2.795776844024658, "learning_rate": 4.742347854594791e-05, "loss": 2.2874, "step": 1592 }, { "epoch": 0.5246819547943513, "grad_norm": 2.9049065113067627, "learning_rate": 4.737094450255581e-05, "loss": 2.2325, "step": 1593 }, { "epoch": 0.5250113219976121, "grad_norm": 3.0760581493377686, "learning_rate": 4.731841336934372e-05, "loss": 2.2926, "step": 1594 }, { "epoch": 0.5253406892008728, "grad_norm": 2.4571144580841064, "learning_rate": 4.726588520445993e-05, "loss": 2.0032, "step": 1595 }, { "epoch": 0.5256700564041336, "grad_norm": 3.1097042560577393, "learning_rate": 4.721336006604941e-05, "loss": 1.9375, "step": 1596 }, { "epoch": 0.5259994236073943, "grad_norm": 2.750455617904663, "learning_rate": 4.71608380122538e-05, "loss": 2.1235, "step": 1597 }, { "epoch": 0.526328790810655, "grad_norm": 2.6923322677612305, "learning_rate": 4.710831910121135e-05, "loss": 1.7062, "step": 1598 }, { "epoch": 0.5266581580139158, "grad_norm": 3.282520055770874, "learning_rate": 4.7055803391056795e-05, "loss": 2.2095, "step": 1599 }, { "epoch": 0.5269875252171765, "grad_norm": 2.924677610397339, "learning_rate": 4.700329093992135e-05, "loss": 1.6683, "step": 1600 }, { "epoch": 0.5273168924204372, "grad_norm": 2.091074228286743, "learning_rate": 4.695078180593262e-05, "loss": 2.4635, "step": 1601 }, { "epoch": 0.5276462596236979, "grad_norm": 2.103097677230835, "learning_rate": 4.689827604721451e-05, "loss": 2.1667, "step": 1602 }, { "epoch": 0.5279756268269588, "grad_norm": 2.7929534912109375, "learning_rate": 4.684577372188723e-05, "loss": 2.7994, "step": 1603 }, { "epoch": 0.5283049940302195, "grad_norm": 2.534282684326172, "learning_rate": 4.679327488806716e-05, "loss": 2.3134, "step": 1604 }, { "epoch": 0.5286343612334802, "grad_norm": 2.393833875656128, "learning_rate": 4.674077960386685e-05, "loss": 2.3605, "step": 1605 }, { "epoch": 0.5289637284367409, "grad_norm": 2.5953030586242676, "learning_rate": 4.668828792739487e-05, "loss": 2.2843, "step": 1606 }, { "epoch": 0.5292930956400016, "grad_norm": 2.9071319103240967, "learning_rate": 4.6635799916755836e-05, "loss": 2.35, "step": 1607 }, { "epoch": 0.5296224628432624, "grad_norm": 2.4467287063598633, "learning_rate": 4.65833156300503e-05, "loss": 2.1182, "step": 1608 }, { "epoch": 0.5299518300465231, "grad_norm": 2.2593226432800293, "learning_rate": 4.653083512537467e-05, "loss": 2.3638, "step": 1609 }, { "epoch": 0.5302811972497838, "grad_norm": 2.5257232189178467, "learning_rate": 4.6478358460821184e-05, "loss": 2.2619, "step": 1610 }, { "epoch": 0.5306105644530446, "grad_norm": 2.365927219390869, "learning_rate": 4.642588569447785e-05, "loss": 2.1333, "step": 1611 }, { "epoch": 0.5309399316563054, "grad_norm": 2.346470594406128, "learning_rate": 4.637341688442831e-05, "loss": 2.1835, "step": 1612 }, { "epoch": 0.5312692988595661, "grad_norm": 2.5479326248168945, "learning_rate": 4.6320952088751876e-05, "loss": 2.1302, "step": 1613 }, { "epoch": 0.5315986660628268, "grad_norm": 2.7286736965179443, "learning_rate": 4.626849136552338e-05, "loss": 2.0868, "step": 1614 }, { "epoch": 0.5319280332660875, "grad_norm": 2.7833032608032227, "learning_rate": 4.6216034772813195e-05, "loss": 2.5112, "step": 1615 }, { "epoch": 0.5322574004693482, "grad_norm": 2.5549001693725586, "learning_rate": 4.616358236868705e-05, "loss": 2.0309, "step": 1616 }, { "epoch": 0.532586767672609, "grad_norm": 3.951969623565674, "learning_rate": 4.61111342112061e-05, "loss": 2.3313, "step": 1617 }, { "epoch": 0.5329161348758698, "grad_norm": 2.874086380004883, "learning_rate": 4.605869035842677e-05, "loss": 2.2732, "step": 1618 }, { "epoch": 0.5332455020791305, "grad_norm": 2.5527641773223877, "learning_rate": 4.6006250868400726e-05, "loss": 1.9412, "step": 1619 }, { "epoch": 0.5335748692823912, "grad_norm": 2.8807637691497803, "learning_rate": 4.595381579917478e-05, "loss": 2.283, "step": 1620 }, { "epoch": 0.533904236485652, "grad_norm": 2.820483922958374, "learning_rate": 4.5901385208790924e-05, "loss": 2.3101, "step": 1621 }, { "epoch": 0.5342336036889127, "grad_norm": 2.790860652923584, "learning_rate": 4.584895915528611e-05, "loss": 2.0345, "step": 1622 }, { "epoch": 0.5345629708921734, "grad_norm": 2.5604536533355713, "learning_rate": 4.579653769669233e-05, "loss": 1.8557, "step": 1623 }, { "epoch": 0.5348923380954341, "grad_norm": 3.321770668029785, "learning_rate": 4.574412089103643e-05, "loss": 1.9905, "step": 1624 }, { "epoch": 0.5352217052986948, "grad_norm": 3.51432204246521, "learning_rate": 4.569170879634014e-05, "loss": 1.9512, "step": 1625 }, { "epoch": 0.5355510725019557, "grad_norm": 2.5113067626953125, "learning_rate": 4.563930147062e-05, "loss": 2.393, "step": 1626 }, { "epoch": 0.5358804397052164, "grad_norm": 1.724272608757019, "learning_rate": 4.558689897188721e-05, "loss": 2.1496, "step": 1627 }, { "epoch": 0.5362098069084771, "grad_norm": 2.2979555130004883, "learning_rate": 4.553450135814768e-05, "loss": 2.4137, "step": 1628 }, { "epoch": 0.5365391741117378, "grad_norm": 2.4890944957733154, "learning_rate": 4.548210868740188e-05, "loss": 2.3685, "step": 1629 }, { "epoch": 0.5368685413149985, "grad_norm": 2.3017985820770264, "learning_rate": 4.5429721017644835e-05, "loss": 2.4172, "step": 1630 }, { "epoch": 0.5371979085182593, "grad_norm": 2.6837596893310547, "learning_rate": 4.537733840686601e-05, "loss": 2.424, "step": 1631 }, { "epoch": 0.53752727572152, "grad_norm": 2.583272695541382, "learning_rate": 4.532496091304929e-05, "loss": 2.5061, "step": 1632 }, { "epoch": 0.5378566429247807, "grad_norm": 2.5158188343048096, "learning_rate": 4.5272588594172875e-05, "loss": 2.3061, "step": 1633 }, { "epoch": 0.5381860101280415, "grad_norm": 2.558367967605591, "learning_rate": 4.522022150820925e-05, "loss": 2.0809, "step": 1634 }, { "epoch": 0.5385153773313023, "grad_norm": 2.473259687423706, "learning_rate": 4.516785971312511e-05, "loss": 2.2443, "step": 1635 }, { "epoch": 0.538844744534563, "grad_norm": 2.3816640377044678, "learning_rate": 4.51155032668813e-05, "loss": 2.0707, "step": 1636 }, { "epoch": 0.5391741117378237, "grad_norm": 2.75467586517334, "learning_rate": 4.5063152227432705e-05, "loss": 2.307, "step": 1637 }, { "epoch": 0.5395034789410844, "grad_norm": 2.2179129123687744, "learning_rate": 4.501080665272827e-05, "loss": 2.1801, "step": 1638 }, { "epoch": 0.5398328461443451, "grad_norm": 2.5906922817230225, "learning_rate": 4.495846660071088e-05, "loss": 2.136, "step": 1639 }, { "epoch": 0.540162213347606, "grad_norm": 3.0321009159088135, "learning_rate": 4.490613212931729e-05, "loss": 2.5038, "step": 1640 }, { "epoch": 0.5404915805508667, "grad_norm": 2.619051694869995, "learning_rate": 4.485380329647808e-05, "loss": 2.2298, "step": 1641 }, { "epoch": 0.5408209477541274, "grad_norm": 3.0019948482513428, "learning_rate": 4.480148016011762e-05, "loss": 2.0229, "step": 1642 }, { "epoch": 0.5411503149573881, "grad_norm": 2.2649595737457275, "learning_rate": 4.4749162778153954e-05, "loss": 1.8919, "step": 1643 }, { "epoch": 0.5414796821606489, "grad_norm": 2.849843740463257, "learning_rate": 4.469685120849872e-05, "loss": 2.0903, "step": 1644 }, { "epoch": 0.5418090493639096, "grad_norm": 3.021907091140747, "learning_rate": 4.4644545509057185e-05, "loss": 2.3181, "step": 1645 }, { "epoch": 0.5421384165671703, "grad_norm": 2.7080013751983643, "learning_rate": 4.459224573772808e-05, "loss": 1.8712, "step": 1646 }, { "epoch": 0.542467783770431, "grad_norm": 2.6799404621124268, "learning_rate": 4.453995195240358e-05, "loss": 1.4717, "step": 1647 }, { "epoch": 0.5427971509736917, "grad_norm": 3.217162609100342, "learning_rate": 4.448766421096924e-05, "loss": 2.263, "step": 1648 }, { "epoch": 0.5431265181769526, "grad_norm": 3.4610049724578857, "learning_rate": 4.443538257130393e-05, "loss": 2.0226, "step": 1649 }, { "epoch": 0.5434558853802133, "grad_norm": 3.7474148273468018, "learning_rate": 4.4383107091279724e-05, "loss": 1.9848, "step": 1650 }, { "epoch": 0.543785252583474, "grad_norm": 1.9629192352294922, "learning_rate": 4.433083782876196e-05, "loss": 2.4658, "step": 1651 }, { "epoch": 0.5441146197867347, "grad_norm": 2.639218807220459, "learning_rate": 4.427857484160902e-05, "loss": 2.5598, "step": 1652 }, { "epoch": 0.5444439869899955, "grad_norm": 2.2541027069091797, "learning_rate": 4.4226318187672357e-05, "loss": 2.4328, "step": 1653 }, { "epoch": 0.5447733541932562, "grad_norm": 2.1472740173339844, "learning_rate": 4.4174067924796444e-05, "loss": 2.1159, "step": 1654 }, { "epoch": 0.5451027213965169, "grad_norm": 2.170586585998535, "learning_rate": 4.4121824110818636e-05, "loss": 2.193, "step": 1655 }, { "epoch": 0.5454320885997777, "grad_norm": 2.1903934478759766, "learning_rate": 4.406958680356917e-05, "loss": 2.0941, "step": 1656 }, { "epoch": 0.5457614558030384, "grad_norm": 2.934429883956909, "learning_rate": 4.4017356060871084e-05, "loss": 2.7441, "step": 1657 }, { "epoch": 0.5460908230062992, "grad_norm": 2.7060625553131104, "learning_rate": 4.396513194054017e-05, "loss": 2.2778, "step": 1658 }, { "epoch": 0.5464201902095599, "grad_norm": 2.726825714111328, "learning_rate": 4.3912914500384825e-05, "loss": 2.4721, "step": 1659 }, { "epoch": 0.5467495574128206, "grad_norm": 2.721712589263916, "learning_rate": 4.386070379820612e-05, "loss": 2.6526, "step": 1660 }, { "epoch": 0.5470789246160813, "grad_norm": 2.7384960651397705, "learning_rate": 4.380849989179764e-05, "loss": 2.0193, "step": 1661 }, { "epoch": 0.5474082918193421, "grad_norm": 2.4211504459381104, "learning_rate": 4.375630283894543e-05, "loss": 1.86, "step": 1662 }, { "epoch": 0.5477376590226029, "grad_norm": 2.456686019897461, "learning_rate": 4.370411269742797e-05, "loss": 2.2503, "step": 1663 }, { "epoch": 0.5480670262258636, "grad_norm": 2.7705039978027344, "learning_rate": 4.36519295250161e-05, "loss": 2.1732, "step": 1664 }, { "epoch": 0.5483963934291243, "grad_norm": 2.6563832759857178, "learning_rate": 4.3599753379472916e-05, "loss": 1.8923, "step": 1665 }, { "epoch": 0.548725760632385, "grad_norm": 3.264117479324341, "learning_rate": 4.354758431855375e-05, "loss": 2.1551, "step": 1666 }, { "epoch": 0.5490551278356458, "grad_norm": 3.068531036376953, "learning_rate": 4.349542240000612e-05, "loss": 2.342, "step": 1667 }, { "epoch": 0.5493844950389065, "grad_norm": 2.412754535675049, "learning_rate": 4.344326768156957e-05, "loss": 1.961, "step": 1668 }, { "epoch": 0.5497138622421672, "grad_norm": 2.5070178508758545, "learning_rate": 4.339112022097574e-05, "loss": 2.0561, "step": 1669 }, { "epoch": 0.5500432294454279, "grad_norm": 3.7027029991149902, "learning_rate": 4.33389800759482e-05, "loss": 2.4097, "step": 1670 }, { "epoch": 0.5503725966486888, "grad_norm": 2.5179436206817627, "learning_rate": 4.3286847304202446e-05, "loss": 2.1429, "step": 1671 }, { "epoch": 0.5507019638519495, "grad_norm": 2.777573823928833, "learning_rate": 4.323472196344579e-05, "loss": 1.8857, "step": 1672 }, { "epoch": 0.5510313310552102, "grad_norm": 2.600545644760132, "learning_rate": 4.3182604111377324e-05, "loss": 1.7022, "step": 1673 }, { "epoch": 0.5513606982584709, "grad_norm": 3.057023048400879, "learning_rate": 4.3130493805687864e-05, "loss": 2.03, "step": 1674 }, { "epoch": 0.5516900654617316, "grad_norm": 3.4655754566192627, "learning_rate": 4.3078391104059854e-05, "loss": 1.8471, "step": 1675 }, { "epoch": 0.5520194326649924, "grad_norm": 2.065298318862915, "learning_rate": 4.3026296064167334e-05, "loss": 2.4574, "step": 1676 }, { "epoch": 0.5523487998682531, "grad_norm": 2.120046615600586, "learning_rate": 4.297420874367586e-05, "loss": 2.1619, "step": 1677 }, { "epoch": 0.5526781670715138, "grad_norm": 2.614112615585327, "learning_rate": 4.292212920024243e-05, "loss": 2.3831, "step": 1678 }, { "epoch": 0.5530075342747746, "grad_norm": 2.4009101390838623, "learning_rate": 4.287005749151546e-05, "loss": 2.4063, "step": 1679 }, { "epoch": 0.5533369014780353, "grad_norm": 2.5356178283691406, "learning_rate": 4.281799367513467e-05, "loss": 2.2704, "step": 1680 }, { "epoch": 0.5536662686812961, "grad_norm": 2.489375352859497, "learning_rate": 4.276593780873105e-05, "loss": 2.1472, "step": 1681 }, { "epoch": 0.5539956358845568, "grad_norm": 2.381366491317749, "learning_rate": 4.271388994992682e-05, "loss": 2.0989, "step": 1682 }, { "epoch": 0.5543250030878175, "grad_norm": 2.4626338481903076, "learning_rate": 4.266185015633527e-05, "loss": 2.2773, "step": 1683 }, { "epoch": 0.5546543702910782, "grad_norm": 2.7277164459228516, "learning_rate": 4.260981848556081e-05, "loss": 2.3875, "step": 1684 }, { "epoch": 0.554983737494339, "grad_norm": 2.389749765396118, "learning_rate": 4.2557794995198876e-05, "loss": 2.0901, "step": 1685 }, { "epoch": 0.5553131046975998, "grad_norm": 2.2613539695739746, "learning_rate": 4.25057797428358e-05, "loss": 2.3888, "step": 1686 }, { "epoch": 0.5556424719008605, "grad_norm": 2.8356738090515137, "learning_rate": 4.245377278604881e-05, "loss": 2.1503, "step": 1687 }, { "epoch": 0.5559718391041212, "grad_norm": 2.471827745437622, "learning_rate": 4.240177418240597e-05, "loss": 2.1751, "step": 1688 }, { "epoch": 0.5563012063073819, "grad_norm": 2.6936299800872803, "learning_rate": 4.234978398946611e-05, "loss": 2.2912, "step": 1689 }, { "epoch": 0.5566305735106427, "grad_norm": 2.365645170211792, "learning_rate": 4.2297802264778676e-05, "loss": 2.0137, "step": 1690 }, { "epoch": 0.5569599407139034, "grad_norm": 2.8537073135375977, "learning_rate": 4.224582906588382e-05, "loss": 1.9902, "step": 1691 }, { "epoch": 0.5572893079171641, "grad_norm": 2.6542677879333496, "learning_rate": 4.2193864450312236e-05, "loss": 2.0595, "step": 1692 }, { "epoch": 0.5576186751204248, "grad_norm": 2.9382853507995605, "learning_rate": 4.214190847558509e-05, "loss": 2.1753, "step": 1693 }, { "epoch": 0.5579480423236857, "grad_norm": 2.9632253646850586, "learning_rate": 4.2089961199214e-05, "loss": 2.0176, "step": 1694 }, { "epoch": 0.5582774095269464, "grad_norm": 3.0138721466064453, "learning_rate": 4.203802267870097e-05, "loss": 1.9451, "step": 1695 }, { "epoch": 0.5586067767302071, "grad_norm": 2.569395065307617, "learning_rate": 4.1986092971538306e-05, "loss": 1.8728, "step": 1696 }, { "epoch": 0.5589361439334678, "grad_norm": 2.5671072006225586, "learning_rate": 4.193417213520852e-05, "loss": 1.9411, "step": 1697 }, { "epoch": 0.5592655111367285, "grad_norm": 2.9711215496063232, "learning_rate": 4.1882260227184354e-05, "loss": 2.2304, "step": 1698 }, { "epoch": 0.5595948783399893, "grad_norm": 2.82068133354187, "learning_rate": 4.1830357304928664e-05, "loss": 1.9563, "step": 1699 }, { "epoch": 0.55992424554325, "grad_norm": 3.187591314315796, "learning_rate": 4.177846342589431e-05, "loss": 1.8838, "step": 1700 }, { "epoch": 0.5602536127465108, "grad_norm": 2.2826199531555176, "learning_rate": 4.17265786475242e-05, "loss": 2.3018, "step": 1701 }, { "epoch": 0.5605829799497715, "grad_norm": 2.2217764854431152, "learning_rate": 4.167470302725114e-05, "loss": 2.3331, "step": 1702 }, { "epoch": 0.5609123471530323, "grad_norm": 2.2242486476898193, "learning_rate": 4.16228366224978e-05, "loss": 2.2806, "step": 1703 }, { "epoch": 0.561241714356293, "grad_norm": 2.391951560974121, "learning_rate": 4.157097949067664e-05, "loss": 2.4716, "step": 1704 }, { "epoch": 0.5615710815595537, "grad_norm": 2.481973886489868, "learning_rate": 4.1519131689189894e-05, "loss": 2.1806, "step": 1705 }, { "epoch": 0.5619004487628144, "grad_norm": 2.7211787700653076, "learning_rate": 4.146729327542942e-05, "loss": 2.3138, "step": 1706 }, { "epoch": 0.5622298159660751, "grad_norm": 2.246211528778076, "learning_rate": 4.1415464306776704e-05, "loss": 2.339, "step": 1707 }, { "epoch": 0.562559183169336, "grad_norm": 2.3714375495910645, "learning_rate": 4.136364484060279e-05, "loss": 2.3938, "step": 1708 }, { "epoch": 0.5628885503725967, "grad_norm": 2.315984010696411, "learning_rate": 4.131183493426819e-05, "loss": 2.0192, "step": 1709 }, { "epoch": 0.5632179175758574, "grad_norm": 3.121124029159546, "learning_rate": 4.126003464512283e-05, "loss": 2.5295, "step": 1710 }, { "epoch": 0.5635472847791181, "grad_norm": 2.801370620727539, "learning_rate": 4.120824403050598e-05, "loss": 2.0249, "step": 1711 }, { "epoch": 0.5638766519823789, "grad_norm": 2.729098320007324, "learning_rate": 4.115646314774626e-05, "loss": 2.1085, "step": 1712 }, { "epoch": 0.5642060191856396, "grad_norm": 2.5545506477355957, "learning_rate": 4.110469205416144e-05, "loss": 2.1962, "step": 1713 }, { "epoch": 0.5645353863889003, "grad_norm": 2.6717350482940674, "learning_rate": 4.105293080705852e-05, "loss": 2.124, "step": 1714 }, { "epoch": 0.564864753592161, "grad_norm": 2.6053621768951416, "learning_rate": 4.100117946373353e-05, "loss": 2.4692, "step": 1715 }, { "epoch": 0.5651941207954218, "grad_norm": 2.648757219314575, "learning_rate": 4.09494380814716e-05, "loss": 2.2406, "step": 1716 }, { "epoch": 0.5655234879986826, "grad_norm": 2.497184991836548, "learning_rate": 4.089770671754683e-05, "loss": 1.9408, "step": 1717 }, { "epoch": 0.5658528552019433, "grad_norm": 3.321964740753174, "learning_rate": 4.0845985429222156e-05, "loss": 1.9579, "step": 1718 }, { "epoch": 0.566182222405204, "grad_norm": 3.0194036960601807, "learning_rate": 4.079427427374945e-05, "loss": 2.1183, "step": 1719 }, { "epoch": 0.5665115896084647, "grad_norm": 3.1421256065368652, "learning_rate": 4.0742573308369356e-05, "loss": 2.041, "step": 1720 }, { "epoch": 0.5668409568117255, "grad_norm": 2.7292587757110596, "learning_rate": 4.069088259031117e-05, "loss": 1.9314, "step": 1721 }, { "epoch": 0.5671703240149862, "grad_norm": 2.4826438426971436, "learning_rate": 4.0639202176792914e-05, "loss": 1.9008, "step": 1722 }, { "epoch": 0.567499691218247, "grad_norm": 3.0865981578826904, "learning_rate": 4.0587532125021173e-05, "loss": 2.1393, "step": 1723 }, { "epoch": 0.5678290584215077, "grad_norm": 3.053807020187378, "learning_rate": 4.053587249219108e-05, "loss": 2.0525, "step": 1724 }, { "epoch": 0.5681584256247684, "grad_norm": 3.325843572616577, "learning_rate": 4.048422333548622e-05, "loss": 1.8684, "step": 1725 }, { "epoch": 0.5684877928280292, "grad_norm": 2.406651020050049, "learning_rate": 4.043258471207858e-05, "loss": 2.2508, "step": 1726 }, { "epoch": 0.5688171600312899, "grad_norm": 2.377819061279297, "learning_rate": 4.038095667912851e-05, "loss": 2.4474, "step": 1727 }, { "epoch": 0.5691465272345506, "grad_norm": 2.348951816558838, "learning_rate": 4.03293392937846e-05, "loss": 2.4961, "step": 1728 }, { "epoch": 0.5694758944378113, "grad_norm": 2.3945837020874023, "learning_rate": 4.027773261318368e-05, "loss": 2.359, "step": 1729 }, { "epoch": 0.569805261641072, "grad_norm": 2.5263919830322266, "learning_rate": 4.022613669445075e-05, "loss": 2.2331, "step": 1730 }, { "epoch": 0.5701346288443329, "grad_norm": 2.5810630321502686, "learning_rate": 4.0174551594698836e-05, "loss": 2.0708, "step": 1731 }, { "epoch": 0.5704639960475936, "grad_norm": 2.346571207046509, "learning_rate": 4.012297737102903e-05, "loss": 2.222, "step": 1732 }, { "epoch": 0.5707933632508543, "grad_norm": 2.419288158416748, "learning_rate": 4.00714140805304e-05, "loss": 2.1208, "step": 1733 }, { "epoch": 0.571122730454115, "grad_norm": 2.1099019050598145, "learning_rate": 4.0019861780279886e-05, "loss": 2.0208, "step": 1734 }, { "epoch": 0.5714520976573758, "grad_norm": 2.775758981704712, "learning_rate": 3.9968320527342265e-05, "loss": 2.4634, "step": 1735 }, { "epoch": 0.5717814648606365, "grad_norm": 2.646881103515625, "learning_rate": 3.991679037877008e-05, "loss": 2.1785, "step": 1736 }, { "epoch": 0.5721108320638972, "grad_norm": 2.8634984493255615, "learning_rate": 3.98652713916036e-05, "loss": 2.4894, "step": 1737 }, { "epoch": 0.572440199267158, "grad_norm": 2.8133482933044434, "learning_rate": 3.981376362287072e-05, "loss": 2.1222, "step": 1738 }, { "epoch": 0.5727695664704187, "grad_norm": 2.9048733711242676, "learning_rate": 3.9762267129586934e-05, "loss": 2.2615, "step": 1739 }, { "epoch": 0.5730989336736795, "grad_norm": 2.7857484817504883, "learning_rate": 3.971078196875526e-05, "loss": 2.0236, "step": 1740 }, { "epoch": 0.5734283008769402, "grad_norm": 2.681157350540161, "learning_rate": 3.965930819736613e-05, "loss": 1.9672, "step": 1741 }, { "epoch": 0.5737576680802009, "grad_norm": 2.8750627040863037, "learning_rate": 3.960784587239741e-05, "loss": 2.0623, "step": 1742 }, { "epoch": 0.5740870352834616, "grad_norm": 2.7466681003570557, "learning_rate": 3.95563950508143e-05, "loss": 2.1302, "step": 1743 }, { "epoch": 0.5744164024867224, "grad_norm": 3.013389825820923, "learning_rate": 3.950495578956923e-05, "loss": 2.1544, "step": 1744 }, { "epoch": 0.5747457696899831, "grad_norm": 2.87448787689209, "learning_rate": 3.9453528145601875e-05, "loss": 2.085, "step": 1745 }, { "epoch": 0.5750751368932439, "grad_norm": 2.522451162338257, "learning_rate": 3.9402112175839005e-05, "loss": 2.0753, "step": 1746 }, { "epoch": 0.5754045040965046, "grad_norm": 2.751380681991577, "learning_rate": 3.9350707937194504e-05, "loss": 2.3245, "step": 1747 }, { "epoch": 0.5757338712997653, "grad_norm": 2.8028810024261475, "learning_rate": 3.929931548656925e-05, "loss": 1.9075, "step": 1748 }, { "epoch": 0.5760632385030261, "grad_norm": 2.744546413421631, "learning_rate": 3.924793488085111e-05, "loss": 1.889, "step": 1749 }, { "epoch": 0.5763926057062868, "grad_norm": 2.8225655555725098, "learning_rate": 3.9196566176914775e-05, "loss": 1.7777, "step": 1750 }, { "epoch": 0.5767219729095475, "grad_norm": 2.291588306427002, "learning_rate": 3.91452094316218e-05, "loss": 2.5541, "step": 1751 }, { "epoch": 0.5770513401128082, "grad_norm": 2.511906862258911, "learning_rate": 3.909386470182053e-05, "loss": 2.2869, "step": 1752 }, { "epoch": 0.577380707316069, "grad_norm": 2.5849432945251465, "learning_rate": 3.9042532044345934e-05, "loss": 2.3356, "step": 1753 }, { "epoch": 0.5777100745193298, "grad_norm": 2.452805280685425, "learning_rate": 3.899121151601969e-05, "loss": 2.3691, "step": 1754 }, { "epoch": 0.5780394417225905, "grad_norm": 2.3843283653259277, "learning_rate": 3.893990317365003e-05, "loss": 2.2706, "step": 1755 }, { "epoch": 0.5783688089258512, "grad_norm": 2.5602409839630127, "learning_rate": 3.888860707403167e-05, "loss": 2.2975, "step": 1756 }, { "epoch": 0.5786981761291119, "grad_norm": 2.4516329765319824, "learning_rate": 3.88373232739458e-05, "loss": 2.2493, "step": 1757 }, { "epoch": 0.5790275433323727, "grad_norm": 2.7871108055114746, "learning_rate": 3.878605183016001e-05, "loss": 2.2575, "step": 1758 }, { "epoch": 0.5793569105356334, "grad_norm": 2.2499959468841553, "learning_rate": 3.873479279942815e-05, "loss": 2.1826, "step": 1759 }, { "epoch": 0.5796862777388941, "grad_norm": 2.571885347366333, "learning_rate": 3.8683546238490396e-05, "loss": 2.0972, "step": 1760 }, { "epoch": 0.5800156449421549, "grad_norm": 2.976599931716919, "learning_rate": 3.86323122040731e-05, "loss": 2.4199, "step": 1761 }, { "epoch": 0.5803450121454157, "grad_norm": 2.53981876373291, "learning_rate": 3.858109075288875e-05, "loss": 2.0601, "step": 1762 }, { "epoch": 0.5806743793486764, "grad_norm": 2.889158248901367, "learning_rate": 3.852988194163587e-05, "loss": 2.5461, "step": 1763 }, { "epoch": 0.5810037465519371, "grad_norm": 2.411149740219116, "learning_rate": 3.847868582699904e-05, "loss": 2.027, "step": 1764 }, { "epoch": 0.5813331137551978, "grad_norm": 2.5908608436584473, "learning_rate": 3.8427502465648776e-05, "loss": 2.4121, "step": 1765 }, { "epoch": 0.5816624809584585, "grad_norm": 2.567713499069214, "learning_rate": 3.8376331914241446e-05, "loss": 1.9522, "step": 1766 }, { "epoch": 0.5819918481617193, "grad_norm": 2.662276029586792, "learning_rate": 3.832517422941928e-05, "loss": 2.1719, "step": 1767 }, { "epoch": 0.58232121536498, "grad_norm": 2.723975419998169, "learning_rate": 3.8274029467810245e-05, "loss": 2.2837, "step": 1768 }, { "epoch": 0.5826505825682408, "grad_norm": 2.5226011276245117, "learning_rate": 3.822289768602799e-05, "loss": 1.8823, "step": 1769 }, { "epoch": 0.5829799497715015, "grad_norm": 2.713068962097168, "learning_rate": 3.817177894067182e-05, "loss": 2.2014, "step": 1770 }, { "epoch": 0.5833093169747623, "grad_norm": 2.9934639930725098, "learning_rate": 3.81206732883266e-05, "loss": 2.0643, "step": 1771 }, { "epoch": 0.583638684178023, "grad_norm": 2.897000312805176, "learning_rate": 3.8069580785562686e-05, "loss": 2.0665, "step": 1772 }, { "epoch": 0.5839680513812837, "grad_norm": 2.8157401084899902, "learning_rate": 3.8018501488935936e-05, "loss": 1.9893, "step": 1773 }, { "epoch": 0.5842974185845444, "grad_norm": 3.132835865020752, "learning_rate": 3.796743545498751e-05, "loss": 1.8069, "step": 1774 }, { "epoch": 0.5846267857878051, "grad_norm": 3.8937413692474365, "learning_rate": 3.791638274024394e-05, "loss": 1.9874, "step": 1775 }, { "epoch": 0.584956152991066, "grad_norm": 2.0166850090026855, "learning_rate": 3.7865343401217e-05, "loss": 2.5567, "step": 1776 }, { "epoch": 0.5852855201943267, "grad_norm": 2.2458670139312744, "learning_rate": 3.781431749440365e-05, "loss": 2.3101, "step": 1777 }, { "epoch": 0.5856148873975874, "grad_norm": 2.270277738571167, "learning_rate": 3.7763305076286e-05, "loss": 2.5257, "step": 1778 }, { "epoch": 0.5859442546008481, "grad_norm": 2.389111042022705, "learning_rate": 3.7712306203331205e-05, "loss": 2.4234, "step": 1779 }, { "epoch": 0.5862736218041088, "grad_norm": 2.423489809036255, "learning_rate": 3.766132093199146e-05, "loss": 2.3983, "step": 1780 }, { "epoch": 0.5866029890073696, "grad_norm": 2.0370967388153076, "learning_rate": 3.761034931870386e-05, "loss": 2.3665, "step": 1781 }, { "epoch": 0.5869323562106303, "grad_norm": 2.6825292110443115, "learning_rate": 3.7559391419890414e-05, "loss": 2.561, "step": 1782 }, { "epoch": 0.587261723413891, "grad_norm": 2.5416064262390137, "learning_rate": 3.7508447291957956e-05, "loss": 2.5464, "step": 1783 }, { "epoch": 0.5875910906171518, "grad_norm": 2.385080099105835, "learning_rate": 3.7457516991298036e-05, "loss": 2.143, "step": 1784 }, { "epoch": 0.5879204578204126, "grad_norm": 3.0276386737823486, "learning_rate": 3.740660057428694e-05, "loss": 2.3312, "step": 1785 }, { "epoch": 0.5882498250236733, "grad_norm": 2.4158527851104736, "learning_rate": 3.735569809728556e-05, "loss": 2.2548, "step": 1786 }, { "epoch": 0.588579192226934, "grad_norm": 2.7139041423797607, "learning_rate": 3.730480961663939e-05, "loss": 2.2324, "step": 1787 }, { "epoch": 0.5889085594301947, "grad_norm": 2.506220579147339, "learning_rate": 3.7253935188678386e-05, "loss": 2.2346, "step": 1788 }, { "epoch": 0.5892379266334554, "grad_norm": 2.876122236251831, "learning_rate": 3.720307486971697e-05, "loss": 2.5066, "step": 1789 }, { "epoch": 0.5895672938367162, "grad_norm": 2.6854052543640137, "learning_rate": 3.715222871605397e-05, "loss": 2.2645, "step": 1790 }, { "epoch": 0.589896661039977, "grad_norm": 2.4886717796325684, "learning_rate": 3.710139678397249e-05, "loss": 2.0398, "step": 1791 }, { "epoch": 0.5902260282432377, "grad_norm": 2.610877513885498, "learning_rate": 3.7050579129739904e-05, "loss": 2.2531, "step": 1792 }, { "epoch": 0.5905553954464984, "grad_norm": 2.7850184440612793, "learning_rate": 3.699977580960782e-05, "loss": 2.1468, "step": 1793 }, { "epoch": 0.5908847626497592, "grad_norm": 2.5817010402679443, "learning_rate": 3.694898687981193e-05, "loss": 2.2793, "step": 1794 }, { "epoch": 0.5912141298530199, "grad_norm": 2.5994668006896973, "learning_rate": 3.689821239657202e-05, "loss": 2.0205, "step": 1795 }, { "epoch": 0.5915434970562806, "grad_norm": 2.431635856628418, "learning_rate": 3.684745241609189e-05, "loss": 1.9347, "step": 1796 }, { "epoch": 0.5918728642595413, "grad_norm": 2.6019957065582275, "learning_rate": 3.6796706994559255e-05, "loss": 1.982, "step": 1797 }, { "epoch": 0.592202231462802, "grad_norm": 2.8037219047546387, "learning_rate": 3.6745976188145755e-05, "loss": 1.8823, "step": 1798 }, { "epoch": 0.5925315986660629, "grad_norm": 2.723085641860962, "learning_rate": 3.6695260053006825e-05, "loss": 1.5098, "step": 1799 }, { "epoch": 0.5928609658693236, "grad_norm": 2.8144564628601074, "learning_rate": 3.664455864528169e-05, "loss": 1.6849, "step": 1800 }, { "epoch": 0.5931903330725843, "grad_norm": 2.097994327545166, "learning_rate": 3.659387202109322e-05, "loss": 2.33, "step": 1801 }, { "epoch": 0.593519700275845, "grad_norm": 2.1393368244171143, "learning_rate": 3.6543200236547956e-05, "loss": 2.2933, "step": 1802 }, { "epoch": 0.5938490674791058, "grad_norm": 2.3984479904174805, "learning_rate": 3.649254334773604e-05, "loss": 2.487, "step": 1803 }, { "epoch": 0.5941784346823665, "grad_norm": 2.082158088684082, "learning_rate": 3.6441901410731064e-05, "loss": 2.2317, "step": 1804 }, { "epoch": 0.5945078018856272, "grad_norm": 2.287839412689209, "learning_rate": 3.639127448159013e-05, "loss": 2.2207, "step": 1805 }, { "epoch": 0.594837169088888, "grad_norm": 2.7098779678344727, "learning_rate": 3.634066261635366e-05, "loss": 2.7025, "step": 1806 }, { "epoch": 0.5951665362921487, "grad_norm": 2.2493362426757812, "learning_rate": 3.629006587104546e-05, "loss": 2.2232, "step": 1807 }, { "epoch": 0.5954959034954095, "grad_norm": 2.7404730319976807, "learning_rate": 3.623948430167258e-05, "loss": 2.2783, "step": 1808 }, { "epoch": 0.5958252706986702, "grad_norm": 2.522005796432495, "learning_rate": 3.6188917964225256e-05, "loss": 2.1626, "step": 1809 }, { "epoch": 0.5961546379019309, "grad_norm": 2.4059793949127197, "learning_rate": 3.613836691467688e-05, "loss": 2.338, "step": 1810 }, { "epoch": 0.5964840051051916, "grad_norm": 2.5347237586975098, "learning_rate": 3.608783120898392e-05, "loss": 2.3689, "step": 1811 }, { "epoch": 0.5968133723084524, "grad_norm": 2.6831188201904297, "learning_rate": 3.603731090308586e-05, "loss": 2.1, "step": 1812 }, { "epoch": 0.5971427395117131, "grad_norm": 2.634315013885498, "learning_rate": 3.598680605290513e-05, "loss": 2.4177, "step": 1813 }, { "epoch": 0.5974721067149739, "grad_norm": 2.7967567443847656, "learning_rate": 3.593631671434706e-05, "loss": 2.406, "step": 1814 }, { "epoch": 0.5978014739182346, "grad_norm": 2.2884232997894287, "learning_rate": 3.588584294329981e-05, "loss": 1.8473, "step": 1815 }, { "epoch": 0.5981308411214953, "grad_norm": 2.73101806640625, "learning_rate": 3.5835384795634285e-05, "loss": 2.1567, "step": 1816 }, { "epoch": 0.5984602083247561, "grad_norm": 2.5716552734375, "learning_rate": 3.578494232720413e-05, "loss": 2.1229, "step": 1817 }, { "epoch": 0.5987895755280168, "grad_norm": 2.678837537765503, "learning_rate": 3.573451559384563e-05, "loss": 2.1249, "step": 1818 }, { "epoch": 0.5991189427312775, "grad_norm": 2.7849841117858887, "learning_rate": 3.568410465137762e-05, "loss": 1.8091, "step": 1819 }, { "epoch": 0.5994483099345382, "grad_norm": 2.5732271671295166, "learning_rate": 3.563370955560147e-05, "loss": 2.0009, "step": 1820 }, { "epoch": 0.5997776771377991, "grad_norm": 2.8257884979248047, "learning_rate": 3.558333036230105e-05, "loss": 2.0211, "step": 1821 }, { "epoch": 0.6001070443410598, "grad_norm": 2.7145349979400635, "learning_rate": 3.553296712724256e-05, "loss": 2.0294, "step": 1822 }, { "epoch": 0.6004364115443205, "grad_norm": 2.5253031253814697, "learning_rate": 3.548261990617459e-05, "loss": 1.754, "step": 1823 }, { "epoch": 0.6007657787475812, "grad_norm": 3.2652945518493652, "learning_rate": 3.543228875482796e-05, "loss": 2.1409, "step": 1824 }, { "epoch": 0.6010951459508419, "grad_norm": 4.042289733886719, "learning_rate": 3.538197372891575e-05, "loss": 2.3513, "step": 1825 }, { "epoch": 0.6014245131541027, "grad_norm": 2.0667660236358643, "learning_rate": 3.533167488413315e-05, "loss": 2.6748, "step": 1826 }, { "epoch": 0.6017538803573634, "grad_norm": 2.205390691757202, "learning_rate": 3.528139227615744e-05, "loss": 2.2744, "step": 1827 }, { "epoch": 0.6020832475606241, "grad_norm": 2.3584322929382324, "learning_rate": 3.5231125960647974e-05, "loss": 2.0508, "step": 1828 }, { "epoch": 0.6024126147638849, "grad_norm": 2.636195659637451, "learning_rate": 3.5180875993246005e-05, "loss": 2.3369, "step": 1829 }, { "epoch": 0.6027419819671456, "grad_norm": 2.2127506732940674, "learning_rate": 3.513064242957473e-05, "loss": 2.2606, "step": 1830 }, { "epoch": 0.6030713491704064, "grad_norm": 2.4520390033721924, "learning_rate": 3.50804253252392e-05, "loss": 2.2014, "step": 1831 }, { "epoch": 0.6034007163736671, "grad_norm": 2.1871538162231445, "learning_rate": 3.503022473582619e-05, "loss": 2.0991, "step": 1832 }, { "epoch": 0.6037300835769278, "grad_norm": 2.8282971382141113, "learning_rate": 3.498004071690424e-05, "loss": 2.5463, "step": 1833 }, { "epoch": 0.6040594507801885, "grad_norm": 2.2347660064697266, "learning_rate": 3.492987332402356e-05, "loss": 2.189, "step": 1834 }, { "epoch": 0.6043888179834493, "grad_norm": 2.6401114463806152, "learning_rate": 3.487972261271594e-05, "loss": 2.2381, "step": 1835 }, { "epoch": 0.60471818518671, "grad_norm": 2.1282289028167725, "learning_rate": 3.482958863849467e-05, "loss": 2.0018, "step": 1836 }, { "epoch": 0.6050475523899708, "grad_norm": 2.5486040115356445, "learning_rate": 3.477947145685456e-05, "loss": 2.2497, "step": 1837 }, { "epoch": 0.6053769195932315, "grad_norm": 2.6828370094299316, "learning_rate": 3.47293711232718e-05, "loss": 2.3501, "step": 1838 }, { "epoch": 0.6057062867964922, "grad_norm": 3.3055360317230225, "learning_rate": 3.467928769320397e-05, "loss": 2.3995, "step": 1839 }, { "epoch": 0.606035653999753, "grad_norm": 2.5054500102996826, "learning_rate": 3.462922122208989e-05, "loss": 1.9583, "step": 1840 }, { "epoch": 0.6063650212030137, "grad_norm": 2.4648640155792236, "learning_rate": 3.457917176534964e-05, "loss": 2.1108, "step": 1841 }, { "epoch": 0.6066943884062744, "grad_norm": 2.776409149169922, "learning_rate": 3.452913937838446e-05, "loss": 2.2656, "step": 1842 }, { "epoch": 0.6070237556095351, "grad_norm": 2.3845038414001465, "learning_rate": 3.447912411657669e-05, "loss": 1.9668, "step": 1843 }, { "epoch": 0.607353122812796, "grad_norm": 2.431183099746704, "learning_rate": 3.442912603528971e-05, "loss": 2.0438, "step": 1844 }, { "epoch": 0.6076824900160567, "grad_norm": 2.8428471088409424, "learning_rate": 3.43791451898679e-05, "loss": 2.3823, "step": 1845 }, { "epoch": 0.6080118572193174, "grad_norm": 3.1004912853240967, "learning_rate": 3.432918163563654e-05, "loss": 2.1599, "step": 1846 }, { "epoch": 0.6083412244225781, "grad_norm": 3.722520112991333, "learning_rate": 3.4279235427901785e-05, "loss": 2.3052, "step": 1847 }, { "epoch": 0.6086705916258388, "grad_norm": 2.688524007797241, "learning_rate": 3.422930662195058e-05, "loss": 1.7832, "step": 1848 }, { "epoch": 0.6089999588290996, "grad_norm": 2.663485527038574, "learning_rate": 3.417939527305062e-05, "loss": 1.9782, "step": 1849 }, { "epoch": 0.6093293260323603, "grad_norm": 3.491044759750366, "learning_rate": 3.412950143645025e-05, "loss": 1.9132, "step": 1850 }, { "epoch": 0.609658693235621, "grad_norm": 1.7720582485198975, "learning_rate": 3.407962516737846e-05, "loss": 2.376, "step": 1851 }, { "epoch": 0.6099880604388818, "grad_norm": 2.126549005508423, "learning_rate": 3.402976652104477e-05, "loss": 1.8721, "step": 1852 }, { "epoch": 0.6103174276421426, "grad_norm": 2.1321988105773926, "learning_rate": 3.3979925552639224e-05, "loss": 2.2343, "step": 1853 }, { "epoch": 0.6106467948454033, "grad_norm": 2.4410877227783203, "learning_rate": 3.3930102317332255e-05, "loss": 2.3504, "step": 1854 }, { "epoch": 0.610976162048664, "grad_norm": 2.425161361694336, "learning_rate": 3.38802968702747e-05, "loss": 2.1846, "step": 1855 }, { "epoch": 0.6113055292519247, "grad_norm": 2.4529266357421875, "learning_rate": 3.383050926659771e-05, "loss": 2.3507, "step": 1856 }, { "epoch": 0.6116348964551854, "grad_norm": 2.2834224700927734, "learning_rate": 3.378073956141263e-05, "loss": 2.2235, "step": 1857 }, { "epoch": 0.6119642636584463, "grad_norm": 2.8359508514404297, "learning_rate": 3.3730987809811064e-05, "loss": 2.3002, "step": 1858 }, { "epoch": 0.612293630861707, "grad_norm": 2.947082042694092, "learning_rate": 3.368125406686472e-05, "loss": 2.2171, "step": 1859 }, { "epoch": 0.6126229980649677, "grad_norm": 2.474220037460327, "learning_rate": 3.3631538387625325e-05, "loss": 2.3359, "step": 1860 }, { "epoch": 0.6129523652682284, "grad_norm": 2.4733357429504395, "learning_rate": 3.3581840827124665e-05, "loss": 2.3127, "step": 1861 }, { "epoch": 0.6132817324714892, "grad_norm": 2.6637370586395264, "learning_rate": 3.353216144037448e-05, "loss": 2.2253, "step": 1862 }, { "epoch": 0.6136110996747499, "grad_norm": 2.6768898963928223, "learning_rate": 3.3482500282366304e-05, "loss": 2.0104, "step": 1863 }, { "epoch": 0.6139404668780106, "grad_norm": 2.4703683853149414, "learning_rate": 3.3432857408071626e-05, "loss": 2.4931, "step": 1864 }, { "epoch": 0.6142698340812713, "grad_norm": 2.216553211212158, "learning_rate": 3.338323287244158e-05, "loss": 2.0299, "step": 1865 }, { "epoch": 0.614599201284532, "grad_norm": 2.8111774921417236, "learning_rate": 3.333362673040706e-05, "loss": 2.5135, "step": 1866 }, { "epoch": 0.6149285684877929, "grad_norm": 2.6286466121673584, "learning_rate": 3.328403903687859e-05, "loss": 2.1726, "step": 1867 }, { "epoch": 0.6152579356910536, "grad_norm": 2.4072277545928955, "learning_rate": 3.323446984674627e-05, "loss": 2.1707, "step": 1868 }, { "epoch": 0.6155873028943143, "grad_norm": 2.4936254024505615, "learning_rate": 3.3184919214879696e-05, "loss": 1.993, "step": 1869 }, { "epoch": 0.615916670097575, "grad_norm": 2.5818185806274414, "learning_rate": 3.313538719612796e-05, "loss": 2.1305, "step": 1870 }, { "epoch": 0.6162460373008357, "grad_norm": 2.4668569564819336, "learning_rate": 3.308587384531953e-05, "loss": 1.7286, "step": 1871 }, { "epoch": 0.6165754045040965, "grad_norm": 2.7164247035980225, "learning_rate": 3.30363792172622e-05, "loss": 1.9667, "step": 1872 }, { "epoch": 0.6169047717073572, "grad_norm": 2.8039917945861816, "learning_rate": 3.2986903366743056e-05, "loss": 1.9472, "step": 1873 }, { "epoch": 0.617234138910618, "grad_norm": 2.910883903503418, "learning_rate": 3.293744634852841e-05, "loss": 2.1599, "step": 1874 }, { "epoch": 0.6175635061138787, "grad_norm": 2.8668439388275146, "learning_rate": 3.288800821736369e-05, "loss": 1.6236, "step": 1875 }, { "epoch": 0.6178928733171395, "grad_norm": 1.9064861536026, "learning_rate": 3.2838589027973444e-05, "loss": 2.2323, "step": 1876 }, { "epoch": 0.6182222405204002, "grad_norm": 2.109698534011841, "learning_rate": 3.278918883506126e-05, "loss": 2.2206, "step": 1877 }, { "epoch": 0.6185516077236609, "grad_norm": 2.2354044914245605, "learning_rate": 3.2739807693309675e-05, "loss": 2.2063, "step": 1878 }, { "epoch": 0.6188809749269216, "grad_norm": 2.030697822570801, "learning_rate": 3.269044565738014e-05, "loss": 2.0355, "step": 1879 }, { "epoch": 0.6192103421301823, "grad_norm": 2.2695322036743164, "learning_rate": 3.2641102781912994e-05, "loss": 2.0768, "step": 1880 }, { "epoch": 0.6195397093334432, "grad_norm": 2.603289842605591, "learning_rate": 3.259177912152732e-05, "loss": 2.2277, "step": 1881 }, { "epoch": 0.6198690765367039, "grad_norm": 2.7053635120391846, "learning_rate": 3.254247473082094e-05, "loss": 2.565, "step": 1882 }, { "epoch": 0.6201984437399646, "grad_norm": 2.517655849456787, "learning_rate": 3.249318966437037e-05, "loss": 2.2004, "step": 1883 }, { "epoch": 0.6205278109432253, "grad_norm": 2.3863282203674316, "learning_rate": 3.244392397673073e-05, "loss": 1.9685, "step": 1884 }, { "epoch": 0.6208571781464861, "grad_norm": 2.740966320037842, "learning_rate": 3.239467772243566e-05, "loss": 2.052, "step": 1885 }, { "epoch": 0.6211865453497468, "grad_norm": 2.55629301071167, "learning_rate": 3.234545095599732e-05, "loss": 2.1718, "step": 1886 }, { "epoch": 0.6215159125530075, "grad_norm": 2.6892268657684326, "learning_rate": 3.22962437319063e-05, "loss": 2.1374, "step": 1887 }, { "epoch": 0.6218452797562682, "grad_norm": 2.641112804412842, "learning_rate": 3.2247056104631505e-05, "loss": 2.2445, "step": 1888 }, { "epoch": 0.622174646959529, "grad_norm": 2.7154104709625244, "learning_rate": 3.2197888128620224e-05, "loss": 2.1997, "step": 1889 }, { "epoch": 0.6225040141627898, "grad_norm": 2.327333450317383, "learning_rate": 3.2148739858297936e-05, "loss": 2.0276, "step": 1890 }, { "epoch": 0.6228333813660505, "grad_norm": 2.518122434616089, "learning_rate": 3.209961134806836e-05, "loss": 2.1551, "step": 1891 }, { "epoch": 0.6231627485693112, "grad_norm": 2.6456973552703857, "learning_rate": 3.205050265231327e-05, "loss": 2.1652, "step": 1892 }, { "epoch": 0.6234921157725719, "grad_norm": 2.701111316680908, "learning_rate": 3.2001413825392574e-05, "loss": 2.2113, "step": 1893 }, { "epoch": 0.6238214829758327, "grad_norm": 2.5059456825256348, "learning_rate": 3.195234492164414e-05, "loss": 2.0026, "step": 1894 }, { "epoch": 0.6241508501790934, "grad_norm": 2.855924367904663, "learning_rate": 3.190329599538382e-05, "loss": 2.1053, "step": 1895 }, { "epoch": 0.6244802173823542, "grad_norm": 2.850864887237549, "learning_rate": 3.185426710090534e-05, "loss": 2.3436, "step": 1896 }, { "epoch": 0.6248095845856149, "grad_norm": 2.7715375423431396, "learning_rate": 3.180525829248023e-05, "loss": 1.9533, "step": 1897 }, { "epoch": 0.6251389517888756, "grad_norm": 3.283682107925415, "learning_rate": 3.1756269624357806e-05, "loss": 1.8841, "step": 1898 }, { "epoch": 0.6254683189921364, "grad_norm": 2.5927112102508545, "learning_rate": 3.17073011507651e-05, "loss": 1.5226, "step": 1899 }, { "epoch": 0.6257976861953971, "grad_norm": 3.1886491775512695, "learning_rate": 3.165835292590675e-05, "loss": 2.0477, "step": 1900 }, { "epoch": 0.6261270533986578, "grad_norm": 2.009676218032837, "learning_rate": 3.160942500396503e-05, "loss": 2.5363, "step": 1901 }, { "epoch": 0.6264564206019185, "grad_norm": 2.3882784843444824, "learning_rate": 3.1560517439099715e-05, "loss": 2.4409, "step": 1902 }, { "epoch": 0.6267857878051794, "grad_norm": 2.1249465942382812, "learning_rate": 3.151163028544804e-05, "loss": 2.4448, "step": 1903 }, { "epoch": 0.6271151550084401, "grad_norm": 2.7128207683563232, "learning_rate": 3.146276359712466e-05, "loss": 2.3409, "step": 1904 }, { "epoch": 0.6274445222117008, "grad_norm": 2.4485268592834473, "learning_rate": 3.141391742822156e-05, "loss": 2.4835, "step": 1905 }, { "epoch": 0.6277738894149615, "grad_norm": 2.6135315895080566, "learning_rate": 3.136509183280805e-05, "loss": 2.4324, "step": 1906 }, { "epoch": 0.6281032566182222, "grad_norm": 2.25262713432312, "learning_rate": 3.131628686493061e-05, "loss": 2.3033, "step": 1907 }, { "epoch": 0.628432623821483, "grad_norm": 2.3247017860412598, "learning_rate": 3.1267502578612926e-05, "loss": 2.1639, "step": 1908 }, { "epoch": 0.6287619910247437, "grad_norm": 2.385807514190674, "learning_rate": 3.121873902785579e-05, "loss": 2.1444, "step": 1909 }, { "epoch": 0.6290913582280044, "grad_norm": 2.7242722511291504, "learning_rate": 3.116999626663701e-05, "loss": 2.3064, "step": 1910 }, { "epoch": 0.6294207254312651, "grad_norm": 2.3983850479125977, "learning_rate": 3.112127434891143e-05, "loss": 2.275, "step": 1911 }, { "epoch": 0.629750092634526, "grad_norm": 2.6150460243225098, "learning_rate": 3.107257332861078e-05, "loss": 2.321, "step": 1912 }, { "epoch": 0.6300794598377867, "grad_norm": 2.5469534397125244, "learning_rate": 3.1023893259643666e-05, "loss": 1.8771, "step": 1913 }, { "epoch": 0.6304088270410474, "grad_norm": 2.3631060123443604, "learning_rate": 3.0975234195895526e-05, "loss": 1.8878, "step": 1914 }, { "epoch": 0.6307381942443081, "grad_norm": 2.889796495437622, "learning_rate": 3.092659619122852e-05, "loss": 2.1742, "step": 1915 }, { "epoch": 0.6310675614475688, "grad_norm": 2.6340792179107666, "learning_rate": 3.087797929948151e-05, "loss": 1.9413, "step": 1916 }, { "epoch": 0.6313969286508296, "grad_norm": 2.559239149093628, "learning_rate": 3.0829383574469976e-05, "loss": 2.0802, "step": 1917 }, { "epoch": 0.6317262958540903, "grad_norm": 2.652400016784668, "learning_rate": 3.078080906998599e-05, "loss": 2.24, "step": 1918 }, { "epoch": 0.6320556630573511, "grad_norm": 2.5503621101379395, "learning_rate": 3.073225583979812e-05, "loss": 1.9102, "step": 1919 }, { "epoch": 0.6323850302606118, "grad_norm": 2.8662219047546387, "learning_rate": 3.068372393765137e-05, "loss": 2.2393, "step": 1920 }, { "epoch": 0.6327143974638725, "grad_norm": 3.3057799339294434, "learning_rate": 3.063521341726717e-05, "loss": 2.1511, "step": 1921 }, { "epoch": 0.6330437646671333, "grad_norm": 2.57186222076416, "learning_rate": 3.0586724332343266e-05, "loss": 1.9987, "step": 1922 }, { "epoch": 0.633373131870394, "grad_norm": 2.6434319019317627, "learning_rate": 3.053825673655365e-05, "loss": 1.7956, "step": 1923 }, { "epoch": 0.6337024990736547, "grad_norm": 2.6144514083862305, "learning_rate": 3.0489810683548546e-05, "loss": 2.0272, "step": 1924 }, { "epoch": 0.6340318662769154, "grad_norm": 3.186408281326294, "learning_rate": 3.0441386226954372e-05, "loss": 1.9882, "step": 1925 }, { "epoch": 0.6343612334801763, "grad_norm": 1.8592625856399536, "learning_rate": 3.0392983420373577e-05, "loss": 2.4248, "step": 1926 }, { "epoch": 0.634690600683437, "grad_norm": 2.230592966079712, "learning_rate": 3.0344602317384695e-05, "loss": 2.476, "step": 1927 }, { "epoch": 0.6350199678866977, "grad_norm": 2.3445613384246826, "learning_rate": 3.0296242971542194e-05, "loss": 2.1382, "step": 1928 }, { "epoch": 0.6353493350899584, "grad_norm": 2.5234224796295166, "learning_rate": 3.024790543637648e-05, "loss": 2.5123, "step": 1929 }, { "epoch": 0.6356787022932191, "grad_norm": 2.1975531578063965, "learning_rate": 3.0199589765393823e-05, "loss": 2.071, "step": 1930 }, { "epoch": 0.6360080694964799, "grad_norm": 2.5757479667663574, "learning_rate": 3.015129601207627e-05, "loss": 2.2572, "step": 1931 }, { "epoch": 0.6363374366997406, "grad_norm": 2.23809552192688, "learning_rate": 3.0103024229881617e-05, "loss": 1.8197, "step": 1932 }, { "epoch": 0.6366668039030013, "grad_norm": 2.522535562515259, "learning_rate": 3.0054774472243346e-05, "loss": 2.4502, "step": 1933 }, { "epoch": 0.636996171106262, "grad_norm": 2.562291145324707, "learning_rate": 3.0006546792570566e-05, "loss": 2.3762, "step": 1934 }, { "epoch": 0.6373255383095229, "grad_norm": 2.2808642387390137, "learning_rate": 2.9958341244247913e-05, "loss": 2.0, "step": 1935 }, { "epoch": 0.6376549055127836, "grad_norm": 2.5651497840881348, "learning_rate": 2.991015788063556e-05, "loss": 2.2833, "step": 1936 }, { "epoch": 0.6379842727160443, "grad_norm": 2.647606372833252, "learning_rate": 2.9861996755069112e-05, "loss": 2.1614, "step": 1937 }, { "epoch": 0.638313639919305, "grad_norm": 3.004729747772217, "learning_rate": 2.9813857920859544e-05, "loss": 2.3966, "step": 1938 }, { "epoch": 0.6386430071225657, "grad_norm": 2.419417142868042, "learning_rate": 2.9765741431293175e-05, "loss": 2.2321, "step": 1939 }, { "epoch": 0.6389723743258265, "grad_norm": 2.413052558898926, "learning_rate": 2.97176473396316e-05, "loss": 2.2033, "step": 1940 }, { "epoch": 0.6393017415290873, "grad_norm": 2.419487237930298, "learning_rate": 2.9669575699111575e-05, "loss": 2.2214, "step": 1941 }, { "epoch": 0.639631108732348, "grad_norm": 2.965186834335327, "learning_rate": 2.9621526562945058e-05, "loss": 2.3811, "step": 1942 }, { "epoch": 0.6399604759356087, "grad_norm": 2.3459784984588623, "learning_rate": 2.9573499984319053e-05, "loss": 2.0426, "step": 1943 }, { "epoch": 0.6402898431388695, "grad_norm": 2.5563299655914307, "learning_rate": 2.9525496016395637e-05, "loss": 2.0472, "step": 1944 }, { "epoch": 0.6406192103421302, "grad_norm": 2.555187225341797, "learning_rate": 2.9477514712311803e-05, "loss": 1.9391, "step": 1945 }, { "epoch": 0.6409485775453909, "grad_norm": 2.444321632385254, "learning_rate": 2.942955612517952e-05, "loss": 1.6775, "step": 1946 }, { "epoch": 0.6412779447486516, "grad_norm": 2.7038373947143555, "learning_rate": 2.9381620308085566e-05, "loss": 1.811, "step": 1947 }, { "epoch": 0.6416073119519123, "grad_norm": 3.07734751701355, "learning_rate": 2.9333707314091525e-05, "loss": 2.1634, "step": 1948 }, { "epoch": 0.6419366791551732, "grad_norm": 3.2388222217559814, "learning_rate": 2.9285817196233722e-05, "loss": 1.7891, "step": 1949 }, { "epoch": 0.6422660463584339, "grad_norm": 3.074317216873169, "learning_rate": 2.9237950007523164e-05, "loss": 1.944, "step": 1950 }, { "epoch": 0.6425954135616946, "grad_norm": 2.3127830028533936, "learning_rate": 2.919010580094546e-05, "loss": 2.2439, "step": 1951 }, { "epoch": 0.6429247807649553, "grad_norm": 2.1176540851593018, "learning_rate": 2.91422846294608e-05, "loss": 2.2749, "step": 1952 }, { "epoch": 0.6432541479682161, "grad_norm": 2.5770678520202637, "learning_rate": 2.9094486546003857e-05, "loss": 2.2961, "step": 1953 }, { "epoch": 0.6435835151714768, "grad_norm": 2.203704595565796, "learning_rate": 2.9046711603483766e-05, "loss": 2.1581, "step": 1954 }, { "epoch": 0.6439128823747375, "grad_norm": 2.189901828765869, "learning_rate": 2.8998959854784026e-05, "loss": 2.4594, "step": 1955 }, { "epoch": 0.6442422495779982, "grad_norm": 2.659796714782715, "learning_rate": 2.8951231352762486e-05, "loss": 2.3197, "step": 1956 }, { "epoch": 0.644571616781259, "grad_norm": 2.2193856239318848, "learning_rate": 2.890352615025124e-05, "loss": 2.2613, "step": 1957 }, { "epoch": 0.6449009839845198, "grad_norm": 2.119441270828247, "learning_rate": 2.885584430005661e-05, "loss": 2.2418, "step": 1958 }, { "epoch": 0.6452303511877805, "grad_norm": 2.3953168392181396, "learning_rate": 2.8808185854959047e-05, "loss": 2.2826, "step": 1959 }, { "epoch": 0.6455597183910412, "grad_norm": 2.431501626968384, "learning_rate": 2.876055086771313e-05, "loss": 2.21, "step": 1960 }, { "epoch": 0.6458890855943019, "grad_norm": 2.9048917293548584, "learning_rate": 2.871293939104742e-05, "loss": 2.5832, "step": 1961 }, { "epoch": 0.6462184527975627, "grad_norm": 2.4806578159332275, "learning_rate": 2.8665351477664492e-05, "loss": 2.4109, "step": 1962 }, { "epoch": 0.6465478200008234, "grad_norm": 2.117027521133423, "learning_rate": 2.861778718024082e-05, "loss": 1.8737, "step": 1963 }, { "epoch": 0.6468771872040842, "grad_norm": 2.629340410232544, "learning_rate": 2.8570246551426762e-05, "loss": 2.6089, "step": 1964 }, { "epoch": 0.6472065544073449, "grad_norm": 2.8343591690063477, "learning_rate": 2.852272964384644e-05, "loss": 2.2693, "step": 1965 }, { "epoch": 0.6475359216106056, "grad_norm": 2.3691017627716064, "learning_rate": 2.8475236510097752e-05, "loss": 2.2384, "step": 1966 }, { "epoch": 0.6478652888138664, "grad_norm": 2.574763536453247, "learning_rate": 2.842776720275228e-05, "loss": 2.1394, "step": 1967 }, { "epoch": 0.6481946560171271, "grad_norm": 2.5222694873809814, "learning_rate": 2.838032177435518e-05, "loss": 1.6567, "step": 1968 }, { "epoch": 0.6485240232203878, "grad_norm": 2.61596941947937, "learning_rate": 2.8332900277425233e-05, "loss": 2.2399, "step": 1969 }, { "epoch": 0.6488533904236485, "grad_norm": 2.659245252609253, "learning_rate": 2.8285502764454703e-05, "loss": 1.9834, "step": 1970 }, { "epoch": 0.6491827576269092, "grad_norm": 2.8906190395355225, "learning_rate": 2.8238129287909314e-05, "loss": 2.1137, "step": 1971 }, { "epoch": 0.6495121248301701, "grad_norm": 2.697786331176758, "learning_rate": 2.8190779900228185e-05, "loss": 1.913, "step": 1972 }, { "epoch": 0.6498414920334308, "grad_norm": 3.1825335025787354, "learning_rate": 2.8143454653823787e-05, "loss": 2.0227, "step": 1973 }, { "epoch": 0.6501708592366915, "grad_norm": 3.1395328044891357, "learning_rate": 2.8096153601081805e-05, "loss": 2.05, "step": 1974 }, { "epoch": 0.6505002264399522, "grad_norm": 3.7124152183532715, "learning_rate": 2.8048876794361206e-05, "loss": 2.0609, "step": 1975 }, { "epoch": 0.650829593643213, "grad_norm": 1.9795833826065063, "learning_rate": 2.80016242859941e-05, "loss": 2.3731, "step": 1976 }, { "epoch": 0.6511589608464737, "grad_norm": 2.223407745361328, "learning_rate": 2.7954396128285698e-05, "loss": 2.5673, "step": 1977 }, { "epoch": 0.6514883280497344, "grad_norm": 1.970519781112671, "learning_rate": 2.7907192373514256e-05, "loss": 2.3043, "step": 1978 }, { "epoch": 0.6518176952529952, "grad_norm": 2.7503855228424072, "learning_rate": 2.7860013073931024e-05, "loss": 2.5426, "step": 1979 }, { "epoch": 0.6521470624562559, "grad_norm": 2.135634183883667, "learning_rate": 2.781285828176019e-05, "loss": 2.0703, "step": 1980 }, { "epoch": 0.6524764296595167, "grad_norm": 2.544966220855713, "learning_rate": 2.7765728049198768e-05, "loss": 2.2, "step": 1981 }, { "epoch": 0.6528057968627774, "grad_norm": 2.2923483848571777, "learning_rate": 2.771862242841663e-05, "loss": 2.0587, "step": 1982 }, { "epoch": 0.6531351640660381, "grad_norm": 2.587148427963257, "learning_rate": 2.7671541471556404e-05, "loss": 2.542, "step": 1983 }, { "epoch": 0.6534645312692988, "grad_norm": 2.211090564727783, "learning_rate": 2.7624485230733403e-05, "loss": 2.2066, "step": 1984 }, { "epoch": 0.6537938984725596, "grad_norm": 2.4215967655181885, "learning_rate": 2.7577453758035588e-05, "loss": 2.0838, "step": 1985 }, { "epoch": 0.6541232656758204, "grad_norm": 2.3949782848358154, "learning_rate": 2.753044710552349e-05, "loss": 2.002, "step": 1986 }, { "epoch": 0.6544526328790811, "grad_norm": 2.762983560562134, "learning_rate": 2.748346532523018e-05, "loss": 2.3399, "step": 1987 }, { "epoch": 0.6547820000823418, "grad_norm": 2.6567108631134033, "learning_rate": 2.7436508469161195e-05, "loss": 2.2037, "step": 1988 }, { "epoch": 0.6551113672856025, "grad_norm": 2.348445415496826, "learning_rate": 2.7389576589294486e-05, "loss": 2.0968, "step": 1989 }, { "epoch": 0.6554407344888633, "grad_norm": 2.819063663482666, "learning_rate": 2.734266973758034e-05, "loss": 2.4161, "step": 1990 }, { "epoch": 0.655770101692124, "grad_norm": 2.5999984741210938, "learning_rate": 2.7295787965941355e-05, "loss": 2.1207, "step": 1991 }, { "epoch": 0.6560994688953847, "grad_norm": 2.612342357635498, "learning_rate": 2.7248931326272386e-05, "loss": 2.018, "step": 1992 }, { "epoch": 0.6564288360986454, "grad_norm": 2.783804416656494, "learning_rate": 2.720209987044041e-05, "loss": 2.2325, "step": 1993 }, { "epoch": 0.6567582033019063, "grad_norm": 2.30511212348938, "learning_rate": 2.7155293650284573e-05, "loss": 1.9004, "step": 1994 }, { "epoch": 0.657087570505167, "grad_norm": 2.7158255577087402, "learning_rate": 2.710851271761609e-05, "loss": 1.8252, "step": 1995 }, { "epoch": 0.6574169377084277, "grad_norm": 2.2666115760803223, "learning_rate": 2.7061757124218162e-05, "loss": 1.7978, "step": 1996 }, { "epoch": 0.6577463049116884, "grad_norm": 2.959491491317749, "learning_rate": 2.7015026921845952e-05, "loss": 1.9785, "step": 1997 }, { "epoch": 0.6580756721149491, "grad_norm": 3.017894983291626, "learning_rate": 2.696832216222654e-05, "loss": 1.828, "step": 1998 }, { "epoch": 0.6584050393182099, "grad_norm": 3.1254472732543945, "learning_rate": 2.6921642897058776e-05, "loss": 2.0924, "step": 1999 }, { "epoch": 0.6587344065214706, "grad_norm": 2.893326997756958, "learning_rate": 2.6874989178013345e-05, "loss": 1.7982, "step": 2000 }, { "epoch": 0.6590637737247313, "grad_norm": 1.9270685911178589, "learning_rate": 2.6828361056732653e-05, "loss": 2.466, "step": 2001 }, { "epoch": 0.6593931409279921, "grad_norm": 1.8927141427993774, "learning_rate": 2.678175858483075e-05, "loss": 2.4379, "step": 2002 }, { "epoch": 0.6597225081312529, "grad_norm": 2.279390811920166, "learning_rate": 2.6735181813893318e-05, "loss": 2.3186, "step": 2003 }, { "epoch": 0.6600518753345136, "grad_norm": 2.184950828552246, "learning_rate": 2.6688630795477554e-05, "loss": 2.1413, "step": 2004 }, { "epoch": 0.6603812425377743, "grad_norm": 2.0760080814361572, "learning_rate": 2.664210558111221e-05, "loss": 2.1418, "step": 2005 }, { "epoch": 0.660710609741035, "grad_norm": 2.2783913612365723, "learning_rate": 2.6595606222297376e-05, "loss": 2.1825, "step": 2006 }, { "epoch": 0.6610399769442957, "grad_norm": 2.2643139362335205, "learning_rate": 2.6549132770504615e-05, "loss": 2.1718, "step": 2007 }, { "epoch": 0.6613693441475565, "grad_norm": 2.3673267364501953, "learning_rate": 2.6502685277176765e-05, "loss": 2.2925, "step": 2008 }, { "epoch": 0.6616987113508173, "grad_norm": 2.5755388736724854, "learning_rate": 2.6456263793727953e-05, "loss": 2.3991, "step": 2009 }, { "epoch": 0.662028078554078, "grad_norm": 3.254749298095703, "learning_rate": 2.6409868371543506e-05, "loss": 2.3747, "step": 2010 }, { "epoch": 0.6623574457573387, "grad_norm": 2.4208810329437256, "learning_rate": 2.636349906197991e-05, "loss": 2.168, "step": 2011 }, { "epoch": 0.6626868129605995, "grad_norm": 2.396467447280884, "learning_rate": 2.631715591636471e-05, "loss": 2.1355, "step": 2012 }, { "epoch": 0.6630161801638602, "grad_norm": 2.6166560649871826, "learning_rate": 2.6270838985996525e-05, "loss": 2.1752, "step": 2013 }, { "epoch": 0.6633455473671209, "grad_norm": 2.6235897541046143, "learning_rate": 2.6224548322144964e-05, "loss": 2.3333, "step": 2014 }, { "epoch": 0.6636749145703816, "grad_norm": 2.3000566959381104, "learning_rate": 2.6178283976050532e-05, "loss": 2.1201, "step": 2015 }, { "epoch": 0.6640042817736423, "grad_norm": 2.6367340087890625, "learning_rate": 2.6132045998924616e-05, "loss": 2.2484, "step": 2016 }, { "epoch": 0.6643336489769032, "grad_norm": 2.8426151275634766, "learning_rate": 2.6085834441949418e-05, "loss": 2.3913, "step": 2017 }, { "epoch": 0.6646630161801639, "grad_norm": 3.0466115474700928, "learning_rate": 2.6039649356277885e-05, "loss": 2.3497, "step": 2018 }, { "epoch": 0.6649923833834246, "grad_norm": 2.5447306632995605, "learning_rate": 2.599349079303367e-05, "loss": 2.0447, "step": 2019 }, { "epoch": 0.6653217505866853, "grad_norm": 2.6460211277008057, "learning_rate": 2.594735880331106e-05, "loss": 1.9458, "step": 2020 }, { "epoch": 0.665651117789946, "grad_norm": 2.6749167442321777, "learning_rate": 2.5901253438174938e-05, "loss": 1.8064, "step": 2021 }, { "epoch": 0.6659804849932068, "grad_norm": 2.8169589042663574, "learning_rate": 2.5855174748660704e-05, "loss": 2.2488, "step": 2022 }, { "epoch": 0.6663098521964675, "grad_norm": 3.0470190048217773, "learning_rate": 2.5809122785774254e-05, "loss": 1.9864, "step": 2023 }, { "epoch": 0.6666392193997283, "grad_norm": 2.856595277786255, "learning_rate": 2.5763097600491847e-05, "loss": 1.8953, "step": 2024 }, { "epoch": 0.666968586602989, "grad_norm": 3.369244337081909, "learning_rate": 2.5717099243760147e-05, "loss": 1.5753, "step": 2025 }, { "epoch": 0.6672979538062498, "grad_norm": 1.7922537326812744, "learning_rate": 2.5671127766496105e-05, "loss": 2.545, "step": 2026 }, { "epoch": 0.6676273210095105, "grad_norm": 2.0254573822021484, "learning_rate": 2.5625183219586935e-05, "loss": 2.2592, "step": 2027 }, { "epoch": 0.6679566882127712, "grad_norm": 2.4665722846984863, "learning_rate": 2.5579265653890016e-05, "loss": 2.4563, "step": 2028 }, { "epoch": 0.6682860554160319, "grad_norm": 2.5361671447753906, "learning_rate": 2.5533375120232885e-05, "loss": 2.3127, "step": 2029 }, { "epoch": 0.6686154226192926, "grad_norm": 2.1621336936950684, "learning_rate": 2.5487511669413143e-05, "loss": 2.2734, "step": 2030 }, { "epoch": 0.6689447898225535, "grad_norm": 2.921595811843872, "learning_rate": 2.5441675352198392e-05, "loss": 2.1833, "step": 2031 }, { "epoch": 0.6692741570258142, "grad_norm": 1.9871540069580078, "learning_rate": 2.5395866219326224e-05, "loss": 2.2722, "step": 2032 }, { "epoch": 0.6696035242290749, "grad_norm": 2.5396902561187744, "learning_rate": 2.5350084321504148e-05, "loss": 2.3714, "step": 2033 }, { "epoch": 0.6699328914323356, "grad_norm": 2.6981403827667236, "learning_rate": 2.5304329709409508e-05, "loss": 2.5758, "step": 2034 }, { "epoch": 0.6702622586355964, "grad_norm": 2.476912498474121, "learning_rate": 2.525860243368945e-05, "loss": 2.1581, "step": 2035 }, { "epoch": 0.6705916258388571, "grad_norm": 2.6519880294799805, "learning_rate": 2.5212902544960882e-05, "loss": 2.0924, "step": 2036 }, { "epoch": 0.6709209930421178, "grad_norm": 2.709019660949707, "learning_rate": 2.516723009381033e-05, "loss": 2.3023, "step": 2037 }, { "epoch": 0.6712503602453785, "grad_norm": 2.7613868713378906, "learning_rate": 2.512158513079402e-05, "loss": 2.2279, "step": 2038 }, { "epoch": 0.6715797274486393, "grad_norm": 2.635972023010254, "learning_rate": 2.507596770643772e-05, "loss": 2.2506, "step": 2039 }, { "epoch": 0.6719090946519001, "grad_norm": 2.5571701526641846, "learning_rate": 2.5030377871236714e-05, "loss": 1.9826, "step": 2040 }, { "epoch": 0.6722384618551608, "grad_norm": 2.326291799545288, "learning_rate": 2.4984815675655766e-05, "loss": 2.1142, "step": 2041 }, { "epoch": 0.6725678290584215, "grad_norm": 2.433256149291992, "learning_rate": 2.4939281170129015e-05, "loss": 1.8394, "step": 2042 }, { "epoch": 0.6728971962616822, "grad_norm": 2.478874921798706, "learning_rate": 2.4893774405059993e-05, "loss": 2.0067, "step": 2043 }, { "epoch": 0.673226563464943, "grad_norm": 2.6590943336486816, "learning_rate": 2.4848295430821455e-05, "loss": 1.8746, "step": 2044 }, { "epoch": 0.6735559306682037, "grad_norm": 2.759122610092163, "learning_rate": 2.4802844297755455e-05, "loss": 2.5029, "step": 2045 }, { "epoch": 0.6738852978714645, "grad_norm": 2.439481258392334, "learning_rate": 2.4757421056173184e-05, "loss": 1.7643, "step": 2046 }, { "epoch": 0.6742146650747252, "grad_norm": 2.6136834621429443, "learning_rate": 2.4712025756355033e-05, "loss": 2.0499, "step": 2047 }, { "epoch": 0.6745440322779859, "grad_norm": 3.705885648727417, "learning_rate": 2.466665844855041e-05, "loss": 2.6908, "step": 2048 }, { "epoch": 0.6748733994812467, "grad_norm": 3.136033773422241, "learning_rate": 2.4621319182977697e-05, "loss": 1.8605, "step": 2049 }, { "epoch": 0.6752027666845074, "grad_norm": 3.363560199737549, "learning_rate": 2.457600800982431e-05, "loss": 1.5852, "step": 2050 }, { "epoch": 0.6755321338877681, "grad_norm": 2.1761481761932373, "learning_rate": 2.4530724979246535e-05, "loss": 2.4601, "step": 2051 }, { "epoch": 0.6758615010910288, "grad_norm": 2.2820851802825928, "learning_rate": 2.44854701413695e-05, "loss": 2.2097, "step": 2052 }, { "epoch": 0.6761908682942896, "grad_norm": 2.133443593978882, "learning_rate": 2.444024354628715e-05, "loss": 2.2759, "step": 2053 }, { "epoch": 0.6765202354975504, "grad_norm": 2.1510062217712402, "learning_rate": 2.4395045244062172e-05, "loss": 2.1092, "step": 2054 }, { "epoch": 0.6768496027008111, "grad_norm": 2.543733596801758, "learning_rate": 2.4349875284725863e-05, "loss": 2.52, "step": 2055 }, { "epoch": 0.6771789699040718, "grad_norm": 2.35121488571167, "learning_rate": 2.430473371827824e-05, "loss": 2.5922, "step": 2056 }, { "epoch": 0.6775083371073325, "grad_norm": 2.656498432159424, "learning_rate": 2.425962059468783e-05, "loss": 2.6134, "step": 2057 }, { "epoch": 0.6778377043105933, "grad_norm": 2.513655662536621, "learning_rate": 2.421453596389171e-05, "loss": 2.4393, "step": 2058 }, { "epoch": 0.678167071513854, "grad_norm": 2.9910781383514404, "learning_rate": 2.4169479875795396e-05, "loss": 2.5802, "step": 2059 }, { "epoch": 0.6784964387171147, "grad_norm": 2.754699230194092, "learning_rate": 2.4124452380272817e-05, "loss": 2.0555, "step": 2060 }, { "epoch": 0.6788258059203754, "grad_norm": 2.328061819076538, "learning_rate": 2.4079453527166273e-05, "loss": 2.4093, "step": 2061 }, { "epoch": 0.6791551731236363, "grad_norm": 2.4432404041290283, "learning_rate": 2.4034483366286305e-05, "loss": 2.0952, "step": 2062 }, { "epoch": 0.679484540326897, "grad_norm": 2.6212449073791504, "learning_rate": 2.3989541947411735e-05, "loss": 1.9832, "step": 2063 }, { "epoch": 0.6798139075301577, "grad_norm": 2.5926594734191895, "learning_rate": 2.3944629320289568e-05, "loss": 1.9186, "step": 2064 }, { "epoch": 0.6801432747334184, "grad_norm": 2.6798171997070312, "learning_rate": 2.3899745534634925e-05, "loss": 1.7767, "step": 2065 }, { "epoch": 0.6804726419366791, "grad_norm": 2.3956661224365234, "learning_rate": 2.3854890640131018e-05, "loss": 2.2176, "step": 2066 }, { "epoch": 0.6808020091399399, "grad_norm": 2.882429838180542, "learning_rate": 2.3810064686429062e-05, "loss": 2.1865, "step": 2067 }, { "epoch": 0.6811313763432006, "grad_norm": 2.6350555419921875, "learning_rate": 2.3765267723148267e-05, "loss": 2.0534, "step": 2068 }, { "epoch": 0.6814607435464614, "grad_norm": 3.9625144004821777, "learning_rate": 2.3720499799875677e-05, "loss": 2.4213, "step": 2069 }, { "epoch": 0.6817901107497221, "grad_norm": 2.6471993923187256, "learning_rate": 2.3675760966166276e-05, "loss": 1.8502, "step": 2070 }, { "epoch": 0.6821194779529828, "grad_norm": 2.543469190597534, "learning_rate": 2.3631051271542816e-05, "loss": 1.7004, "step": 2071 }, { "epoch": 0.6824488451562436, "grad_norm": 3.3397600650787354, "learning_rate": 2.358637076549578e-05, "loss": 2.3005, "step": 2072 }, { "epoch": 0.6827782123595043, "grad_norm": 3.1723084449768066, "learning_rate": 2.3541719497483362e-05, "loss": 2.3147, "step": 2073 }, { "epoch": 0.683107579562765, "grad_norm": 2.94470477104187, "learning_rate": 2.3497097516931398e-05, "loss": 2.1635, "step": 2074 }, { "epoch": 0.6834369467660257, "grad_norm": 3.1503822803497314, "learning_rate": 2.3452504873233262e-05, "loss": 1.9506, "step": 2075 }, { "epoch": 0.6837663139692866, "grad_norm": 2.039581298828125, "learning_rate": 2.3407941615749888e-05, "loss": 2.0947, "step": 2076 }, { "epoch": 0.6840956811725473, "grad_norm": 2.2650582790374756, "learning_rate": 2.3363407793809666e-05, "loss": 2.3766, "step": 2077 }, { "epoch": 0.684425048375808, "grad_norm": 2.530653238296509, "learning_rate": 2.3318903456708445e-05, "loss": 2.3375, "step": 2078 }, { "epoch": 0.6847544155790687, "grad_norm": 2.3660364151000977, "learning_rate": 2.3274428653709412e-05, "loss": 2.625, "step": 2079 }, { "epoch": 0.6850837827823294, "grad_norm": 2.313480854034424, "learning_rate": 2.3229983434043006e-05, "loss": 2.233, "step": 2080 }, { "epoch": 0.6854131499855902, "grad_norm": 2.3391940593719482, "learning_rate": 2.3185567846906997e-05, "loss": 2.1382, "step": 2081 }, { "epoch": 0.6857425171888509, "grad_norm": 2.3489909172058105, "learning_rate": 2.3141181941466312e-05, "loss": 1.8627, "step": 2082 }, { "epoch": 0.6860718843921116, "grad_norm": 2.5285491943359375, "learning_rate": 2.3096825766853043e-05, "loss": 2.1704, "step": 2083 }, { "epoch": 0.6864012515953724, "grad_norm": 2.5529744625091553, "learning_rate": 2.3052499372166366e-05, "loss": 2.0689, "step": 2084 }, { "epoch": 0.6867306187986332, "grad_norm": 2.355783462524414, "learning_rate": 2.300820280647248e-05, "loss": 2.1035, "step": 2085 }, { "epoch": 0.6870599860018939, "grad_norm": 2.845242738723755, "learning_rate": 2.29639361188046e-05, "loss": 2.3495, "step": 2086 }, { "epoch": 0.6873893532051546, "grad_norm": 2.588106632232666, "learning_rate": 2.2919699358162817e-05, "loss": 2.4312, "step": 2087 }, { "epoch": 0.6877187204084153, "grad_norm": 2.835247755050659, "learning_rate": 2.2875492573514123e-05, "loss": 2.4685, "step": 2088 }, { "epoch": 0.688048087611676, "grad_norm": 2.293931722640991, "learning_rate": 2.2831315813792336e-05, "loss": 1.9804, "step": 2089 }, { "epoch": 0.6883774548149368, "grad_norm": 2.318871259689331, "learning_rate": 2.2787169127898027e-05, "loss": 1.7806, "step": 2090 }, { "epoch": 0.6887068220181976, "grad_norm": 2.732576370239258, "learning_rate": 2.2743052564698487e-05, "loss": 2.298, "step": 2091 }, { "epoch": 0.6890361892214583, "grad_norm": 2.569328784942627, "learning_rate": 2.2698966173027663e-05, "loss": 2.3241, "step": 2092 }, { "epoch": 0.689365556424719, "grad_norm": 2.7927603721618652, "learning_rate": 2.2654910001686076e-05, "loss": 2.0825, "step": 2093 }, { "epoch": 0.6896949236279798, "grad_norm": 2.9026594161987305, "learning_rate": 2.261088409944082e-05, "loss": 2.0577, "step": 2094 }, { "epoch": 0.6900242908312405, "grad_norm": 3.1268606185913086, "learning_rate": 2.2566888515025498e-05, "loss": 2.2409, "step": 2095 }, { "epoch": 0.6903536580345012, "grad_norm": 2.6002843379974365, "learning_rate": 2.252292329714012e-05, "loss": 1.8221, "step": 2096 }, { "epoch": 0.6906830252377619, "grad_norm": 3.102250337600708, "learning_rate": 2.2478988494451102e-05, "loss": 1.8374, "step": 2097 }, { "epoch": 0.6910123924410226, "grad_norm": 2.739004611968994, "learning_rate": 2.2435084155591195e-05, "loss": 1.8381, "step": 2098 }, { "epoch": 0.6913417596442835, "grad_norm": 3.433546781539917, "learning_rate": 2.2391210329159433e-05, "loss": 2.4603, "step": 2099 }, { "epoch": 0.6916711268475442, "grad_norm": 3.0737321376800537, "learning_rate": 2.234736706372103e-05, "loss": 1.8963, "step": 2100 }, { "epoch": 0.6920004940508049, "grad_norm": 1.8924542665481567, "learning_rate": 2.2303554407807426e-05, "loss": 2.3809, "step": 2101 }, { "epoch": 0.6923298612540656, "grad_norm": 2.463632583618164, "learning_rate": 2.2259772409916153e-05, "loss": 2.5391, "step": 2102 }, { "epoch": 0.6926592284573264, "grad_norm": 2.449293375015259, "learning_rate": 2.2216021118510815e-05, "loss": 2.3246, "step": 2103 }, { "epoch": 0.6929885956605871, "grad_norm": 2.3340203762054443, "learning_rate": 2.2172300582021022e-05, "loss": 2.3163, "step": 2104 }, { "epoch": 0.6933179628638478, "grad_norm": 2.3864715099334717, "learning_rate": 2.2128610848842336e-05, "loss": 2.5004, "step": 2105 }, { "epoch": 0.6936473300671085, "grad_norm": 2.2127561569213867, "learning_rate": 2.208495196733625e-05, "loss": 2.1619, "step": 2106 }, { "epoch": 0.6939766972703693, "grad_norm": 2.7245256900787354, "learning_rate": 2.2041323985830027e-05, "loss": 2.3766, "step": 2107 }, { "epoch": 0.6943060644736301, "grad_norm": 2.3172929286956787, "learning_rate": 2.1997726952616836e-05, "loss": 2.2989, "step": 2108 }, { "epoch": 0.6946354316768908, "grad_norm": 2.4857938289642334, "learning_rate": 2.1954160915955525e-05, "loss": 2.3859, "step": 2109 }, { "epoch": 0.6949647988801515, "grad_norm": 2.176250457763672, "learning_rate": 2.1910625924070623e-05, "loss": 2.0642, "step": 2110 }, { "epoch": 0.6952941660834122, "grad_norm": 2.457257032394409, "learning_rate": 2.186712202515234e-05, "loss": 2.092, "step": 2111 }, { "epoch": 0.695623533286673, "grad_norm": 3.2521281242370605, "learning_rate": 2.1823649267356412e-05, "loss": 2.2814, "step": 2112 }, { "epoch": 0.6959529004899337, "grad_norm": 2.5254688262939453, "learning_rate": 2.1780207698804134e-05, "loss": 2.0433, "step": 2113 }, { "epoch": 0.6962822676931945, "grad_norm": 2.5245742797851562, "learning_rate": 2.1736797367582284e-05, "loss": 2.1671, "step": 2114 }, { "epoch": 0.6966116348964552, "grad_norm": 2.781444787979126, "learning_rate": 2.169341832174306e-05, "loss": 2.3001, "step": 2115 }, { "epoch": 0.6969410020997159, "grad_norm": 2.258967876434326, "learning_rate": 2.1650070609304002e-05, "loss": 1.9225, "step": 2116 }, { "epoch": 0.6972703693029767, "grad_norm": 2.339409112930298, "learning_rate": 2.1606754278248025e-05, "loss": 2.0533, "step": 2117 }, { "epoch": 0.6975997365062374, "grad_norm": 2.8763694763183594, "learning_rate": 2.1563469376523228e-05, "loss": 1.9008, "step": 2118 }, { "epoch": 0.6979291037094981, "grad_norm": 2.7212729454040527, "learning_rate": 2.152021595204297e-05, "loss": 2.1429, "step": 2119 }, { "epoch": 0.6982584709127588, "grad_norm": 2.8974947929382324, "learning_rate": 2.1476994052685766e-05, "loss": 1.9576, "step": 2120 }, { "epoch": 0.6985878381160195, "grad_norm": 2.508470058441162, "learning_rate": 2.1433803726295227e-05, "loss": 1.931, "step": 2121 }, { "epoch": 0.6989172053192804, "grad_norm": 2.526657819747925, "learning_rate": 2.1390645020680006e-05, "loss": 1.7588, "step": 2122 }, { "epoch": 0.6992465725225411, "grad_norm": 2.8346455097198486, "learning_rate": 2.1347517983613773e-05, "loss": 2.1006, "step": 2123 }, { "epoch": 0.6995759397258018, "grad_norm": 3.421928882598877, "learning_rate": 2.1304422662835146e-05, "loss": 2.4451, "step": 2124 }, { "epoch": 0.6999053069290625, "grad_norm": 2.873753547668457, "learning_rate": 2.126135910604758e-05, "loss": 1.7832, "step": 2125 }, { "epoch": 0.7002346741323233, "grad_norm": 2.716782331466675, "learning_rate": 2.1218327360919438e-05, "loss": 2.4388, "step": 2126 }, { "epoch": 0.700564041335584, "grad_norm": 2.3297483921051025, "learning_rate": 2.1175327475083844e-05, "loss": 2.5349, "step": 2127 }, { "epoch": 0.7008934085388447, "grad_norm": 2.592759847640991, "learning_rate": 2.1132359496138648e-05, "loss": 2.3805, "step": 2128 }, { "epoch": 0.7012227757421055, "grad_norm": 2.579590082168579, "learning_rate": 2.108942347164639e-05, "loss": 2.4302, "step": 2129 }, { "epoch": 0.7015521429453662, "grad_norm": 2.581580877304077, "learning_rate": 2.104651944913426e-05, "loss": 2.2602, "step": 2130 }, { "epoch": 0.701881510148627, "grad_norm": 2.675879955291748, "learning_rate": 2.1003647476093962e-05, "loss": 2.706, "step": 2131 }, { "epoch": 0.7022108773518877, "grad_norm": 2.9450440406799316, "learning_rate": 2.0960807599981785e-05, "loss": 2.5483, "step": 2132 }, { "epoch": 0.7025402445551484, "grad_norm": 2.4769771099090576, "learning_rate": 2.0917999868218457e-05, "loss": 2.2321, "step": 2133 }, { "epoch": 0.7028696117584091, "grad_norm": 2.61974835395813, "learning_rate": 2.087522432818914e-05, "loss": 2.3689, "step": 2134 }, { "epoch": 0.7031989789616699, "grad_norm": 2.3466622829437256, "learning_rate": 2.0832481027243357e-05, "loss": 2.2311, "step": 2135 }, { "epoch": 0.7035283461649307, "grad_norm": 2.35425066947937, "learning_rate": 2.0789770012694937e-05, "loss": 2.0013, "step": 2136 }, { "epoch": 0.7038577133681914, "grad_norm": 2.2261617183685303, "learning_rate": 2.0747091331822005e-05, "loss": 1.9601, "step": 2137 }, { "epoch": 0.7041870805714521, "grad_norm": 2.35845685005188, "learning_rate": 2.0704445031866803e-05, "loss": 1.9554, "step": 2138 }, { "epoch": 0.7045164477747128, "grad_norm": 2.579216480255127, "learning_rate": 2.066183116003586e-05, "loss": 2.0605, "step": 2139 }, { "epoch": 0.7048458149779736, "grad_norm": 2.4616470336914062, "learning_rate": 2.0619249763499708e-05, "loss": 2.1647, "step": 2140 }, { "epoch": 0.7051751821812343, "grad_norm": 2.7063252925872803, "learning_rate": 2.057670088939298e-05, "loss": 2.1963, "step": 2141 }, { "epoch": 0.705504549384495, "grad_norm": 2.8242859840393066, "learning_rate": 2.053418458481431e-05, "loss": 2.2085, "step": 2142 }, { "epoch": 0.7058339165877557, "grad_norm": 3.1535305976867676, "learning_rate": 2.0491700896826222e-05, "loss": 2.0284, "step": 2143 }, { "epoch": 0.7061632837910166, "grad_norm": 2.366270065307617, "learning_rate": 2.04492498724552e-05, "loss": 2.1194, "step": 2144 }, { "epoch": 0.7064926509942773, "grad_norm": 2.758836269378662, "learning_rate": 2.0406831558691552e-05, "loss": 2.2551, "step": 2145 }, { "epoch": 0.706822018197538, "grad_norm": 2.47495698928833, "learning_rate": 2.0364446002489372e-05, "loss": 1.9995, "step": 2146 }, { "epoch": 0.7071513854007987, "grad_norm": 2.167638063430786, "learning_rate": 2.03220932507665e-05, "loss": 1.8277, "step": 2147 }, { "epoch": 0.7074807526040594, "grad_norm": 2.6053552627563477, "learning_rate": 2.0279773350404464e-05, "loss": 1.7099, "step": 2148 }, { "epoch": 0.7078101198073202, "grad_norm": 2.8557825088500977, "learning_rate": 2.0237486348248437e-05, "loss": 1.6977, "step": 2149 }, { "epoch": 0.7081394870105809, "grad_norm": 3.3039448261260986, "learning_rate": 2.0195232291107125e-05, "loss": 1.5792, "step": 2150 }, { "epoch": 0.7084688542138416, "grad_norm": 1.8703773021697998, "learning_rate": 2.0153011225752832e-05, "loss": 2.436, "step": 2151 }, { "epoch": 0.7087982214171024, "grad_norm": 1.92439866065979, "learning_rate": 2.0110823198921314e-05, "loss": 2.1712, "step": 2152 }, { "epoch": 0.7091275886203632, "grad_norm": 2.654240846633911, "learning_rate": 2.0068668257311752e-05, "loss": 2.7122, "step": 2153 }, { "epoch": 0.7094569558236239, "grad_norm": 2.478987455368042, "learning_rate": 2.0026546447586715e-05, "loss": 2.1828, "step": 2154 }, { "epoch": 0.7097863230268846, "grad_norm": 2.456247091293335, "learning_rate": 1.9984457816372103e-05, "loss": 2.4922, "step": 2155 }, { "epoch": 0.7101156902301453, "grad_norm": 2.527040958404541, "learning_rate": 1.994240241025705e-05, "loss": 2.265, "step": 2156 }, { "epoch": 0.710445057433406, "grad_norm": 2.492377519607544, "learning_rate": 1.990038027579395e-05, "loss": 2.2534, "step": 2157 }, { "epoch": 0.7107744246366668, "grad_norm": 3.028960943222046, "learning_rate": 1.9858391459498367e-05, "loss": 2.4119, "step": 2158 }, { "epoch": 0.7111037918399276, "grad_norm": 2.4781980514526367, "learning_rate": 1.9816436007848964e-05, "loss": 2.2563, "step": 2159 }, { "epoch": 0.7114331590431883, "grad_norm": 2.4208078384399414, "learning_rate": 1.9774513967287496e-05, "loss": 2.1364, "step": 2160 }, { "epoch": 0.711762526246449, "grad_norm": 2.3853366374969482, "learning_rate": 1.9732625384218705e-05, "loss": 2.0286, "step": 2161 }, { "epoch": 0.7120918934497097, "grad_norm": 2.075531244277954, "learning_rate": 1.9690770305010346e-05, "loss": 2.1623, "step": 2162 }, { "epoch": 0.7124212606529705, "grad_norm": 2.8861477375030518, "learning_rate": 1.9648948775993014e-05, "loss": 2.0822, "step": 2163 }, { "epoch": 0.7127506278562312, "grad_norm": 2.6686360836029053, "learning_rate": 1.9607160843460225e-05, "loss": 2.3996, "step": 2164 }, { "epoch": 0.7130799950594919, "grad_norm": 2.642855644226074, "learning_rate": 1.956540655366829e-05, "loss": 2.1804, "step": 2165 }, { "epoch": 0.7134093622627526, "grad_norm": 2.506636142730713, "learning_rate": 1.952368595283628e-05, "loss": 1.6528, "step": 2166 }, { "epoch": 0.7137387294660135, "grad_norm": 2.5572757720947266, "learning_rate": 1.9481999087145973e-05, "loss": 1.9408, "step": 2167 }, { "epoch": 0.7140680966692742, "grad_norm": 2.670609474182129, "learning_rate": 1.9440346002741798e-05, "loss": 2.2826, "step": 2168 }, { "epoch": 0.7143974638725349, "grad_norm": 2.513737916946411, "learning_rate": 1.939872674573081e-05, "loss": 2.0831, "step": 2169 }, { "epoch": 0.7147268310757956, "grad_norm": 2.594437837600708, "learning_rate": 1.93571413621826e-05, "loss": 2.0226, "step": 2170 }, { "epoch": 0.7150561982790563, "grad_norm": 3.0486814975738525, "learning_rate": 1.9315589898129266e-05, "loss": 1.8639, "step": 2171 }, { "epoch": 0.7153855654823171, "grad_norm": 2.8681252002716064, "learning_rate": 1.9274072399565373e-05, "loss": 1.9047, "step": 2172 }, { "epoch": 0.7157149326855778, "grad_norm": 2.4376771450042725, "learning_rate": 1.9232588912447875e-05, "loss": 1.6213, "step": 2173 }, { "epoch": 0.7160442998888386, "grad_norm": 2.7798798084259033, "learning_rate": 1.9191139482696097e-05, "loss": 1.9778, "step": 2174 }, { "epoch": 0.7163736670920993, "grad_norm": 2.8711724281311035, "learning_rate": 1.9149724156191618e-05, "loss": 1.7021, "step": 2175 }, { "epoch": 0.7167030342953601, "grad_norm": 2.1148149967193604, "learning_rate": 1.9108342978778317e-05, "loss": 2.3465, "step": 2176 }, { "epoch": 0.7170324014986208, "grad_norm": 2.3499250411987305, "learning_rate": 1.9066995996262248e-05, "loss": 2.2567, "step": 2177 }, { "epoch": 0.7173617687018815, "grad_norm": 2.2933461666107178, "learning_rate": 1.902568325441163e-05, "loss": 2.1633, "step": 2178 }, { "epoch": 0.7176911359051422, "grad_norm": 2.383384943008423, "learning_rate": 1.898440479895677e-05, "loss": 2.3886, "step": 2179 }, { "epoch": 0.7180205031084029, "grad_norm": 2.2387309074401855, "learning_rate": 1.894316067559003e-05, "loss": 2.02, "step": 2180 }, { "epoch": 0.7183498703116638, "grad_norm": 2.4402246475219727, "learning_rate": 1.890195092996573e-05, "loss": 2.1891, "step": 2181 }, { "epoch": 0.7186792375149245, "grad_norm": 2.534137010574341, "learning_rate": 1.886077560770019e-05, "loss": 2.3046, "step": 2182 }, { "epoch": 0.7190086047181852, "grad_norm": 2.543569803237915, "learning_rate": 1.88196347543716e-05, "loss": 2.5529, "step": 2183 }, { "epoch": 0.7193379719214459, "grad_norm": 2.6904938220977783, "learning_rate": 1.8778528415519998e-05, "loss": 1.9117, "step": 2184 }, { "epoch": 0.7196673391247067, "grad_norm": 2.512737989425659, "learning_rate": 1.873745663664722e-05, "loss": 2.093, "step": 2185 }, { "epoch": 0.7199967063279674, "grad_norm": 2.5019516944885254, "learning_rate": 1.869641946321684e-05, "loss": 2.3069, "step": 2186 }, { "epoch": 0.7203260735312281, "grad_norm": 2.564549446105957, "learning_rate": 1.8655416940654152e-05, "loss": 1.94, "step": 2187 }, { "epoch": 0.7206554407344888, "grad_norm": 2.498377561569214, "learning_rate": 1.8614449114346033e-05, "loss": 2.317, "step": 2188 }, { "epoch": 0.7209848079377495, "grad_norm": 2.7334401607513428, "learning_rate": 1.8573516029641015e-05, "loss": 2.0478, "step": 2189 }, { "epoch": 0.7213141751410104, "grad_norm": 2.2970240116119385, "learning_rate": 1.8532617731849144e-05, "loss": 1.6288, "step": 2190 }, { "epoch": 0.7216435423442711, "grad_norm": 2.7691264152526855, "learning_rate": 1.8491754266241973e-05, "loss": 2.3471, "step": 2191 }, { "epoch": 0.7219729095475318, "grad_norm": 2.845181941986084, "learning_rate": 1.8450925678052495e-05, "loss": 2.3704, "step": 2192 }, { "epoch": 0.7223022767507925, "grad_norm": 2.658377170562744, "learning_rate": 1.8410132012475094e-05, "loss": 2.0011, "step": 2193 }, { "epoch": 0.7226316439540533, "grad_norm": 2.7496695518493652, "learning_rate": 1.8369373314665483e-05, "loss": 1.9333, "step": 2194 }, { "epoch": 0.722961011157314, "grad_norm": 3.0554986000061035, "learning_rate": 1.8328649629740685e-05, "loss": 2.1955, "step": 2195 }, { "epoch": 0.7232903783605747, "grad_norm": 2.52851939201355, "learning_rate": 1.8287961002778964e-05, "loss": 2.1551, "step": 2196 }, { "epoch": 0.7236197455638355, "grad_norm": 2.8639140129089355, "learning_rate": 1.824730747881978e-05, "loss": 2.0154, "step": 2197 }, { "epoch": 0.7239491127670962, "grad_norm": 3.166149139404297, "learning_rate": 1.8206689102863728e-05, "loss": 2.148, "step": 2198 }, { "epoch": 0.724278479970357, "grad_norm": 3.3647382259368896, "learning_rate": 1.81661059198725e-05, "loss": 2.2395, "step": 2199 }, { "epoch": 0.7246078471736177, "grad_norm": 2.8727433681488037, "learning_rate": 1.8125557974768837e-05, "loss": 1.7405, "step": 2200 }, { "epoch": 0.7249372143768784, "grad_norm": 2.2409629821777344, "learning_rate": 1.8085045312436465e-05, "loss": 2.6778, "step": 2201 }, { "epoch": 0.7252665815801391, "grad_norm": 1.9362051486968994, "learning_rate": 1.804456797772006e-05, "loss": 2.1103, "step": 2202 }, { "epoch": 0.7255959487834, "grad_norm": 2.2799839973449707, "learning_rate": 1.80041260154252e-05, "loss": 2.3589, "step": 2203 }, { "epoch": 0.7259253159866607, "grad_norm": 2.4302492141723633, "learning_rate": 1.796371947031829e-05, "loss": 2.2235, "step": 2204 }, { "epoch": 0.7262546831899214, "grad_norm": 2.615129232406616, "learning_rate": 1.7923348387126566e-05, "loss": 2.3434, "step": 2205 }, { "epoch": 0.7265840503931821, "grad_norm": 2.2985572814941406, "learning_rate": 1.788301281053794e-05, "loss": 2.2154, "step": 2206 }, { "epoch": 0.7269134175964428, "grad_norm": 2.6856024265289307, "learning_rate": 1.7842712785201094e-05, "loss": 2.4275, "step": 2207 }, { "epoch": 0.7272427847997036, "grad_norm": 2.337125778198242, "learning_rate": 1.7802448355725322e-05, "loss": 2.2931, "step": 2208 }, { "epoch": 0.7275721520029643, "grad_norm": 2.6537628173828125, "learning_rate": 1.7762219566680528e-05, "loss": 2.2855, "step": 2209 }, { "epoch": 0.727901519206225, "grad_norm": 3.049884796142578, "learning_rate": 1.7722026462597153e-05, "loss": 2.5902, "step": 2210 }, { "epoch": 0.7282308864094857, "grad_norm": 2.5096991062164307, "learning_rate": 1.768186908796617e-05, "loss": 2.1373, "step": 2211 }, { "epoch": 0.7285602536127465, "grad_norm": 2.4747989177703857, "learning_rate": 1.764174748723893e-05, "loss": 2.626, "step": 2212 }, { "epoch": 0.7288896208160073, "grad_norm": 2.4736328125, "learning_rate": 1.7601661704827253e-05, "loss": 2.303, "step": 2213 }, { "epoch": 0.729218988019268, "grad_norm": 2.823103666305542, "learning_rate": 1.7561611785103294e-05, "loss": 2.1824, "step": 2214 }, { "epoch": 0.7295483552225287, "grad_norm": 2.7458789348602295, "learning_rate": 1.7521597772399496e-05, "loss": 1.7698, "step": 2215 }, { "epoch": 0.7298777224257894, "grad_norm": 2.636065721511841, "learning_rate": 1.748161971100856e-05, "loss": 2.2537, "step": 2216 }, { "epoch": 0.7302070896290502, "grad_norm": 2.7386367321014404, "learning_rate": 1.7441677645183413e-05, "loss": 2.2158, "step": 2217 }, { "epoch": 0.7305364568323109, "grad_norm": 2.3262336254119873, "learning_rate": 1.740177161913712e-05, "loss": 1.9395, "step": 2218 }, { "epoch": 0.7308658240355717, "grad_norm": 2.5685794353485107, "learning_rate": 1.736190167704283e-05, "loss": 2.0057, "step": 2219 }, { "epoch": 0.7311951912388324, "grad_norm": 2.8580093383789062, "learning_rate": 1.732206786303378e-05, "loss": 2.178, "step": 2220 }, { "epoch": 0.7315245584420931, "grad_norm": 2.475217580795288, "learning_rate": 1.7282270221203213e-05, "loss": 1.9048, "step": 2221 }, { "epoch": 0.7318539256453539, "grad_norm": 2.6721620559692383, "learning_rate": 1.7242508795604324e-05, "loss": 1.9798, "step": 2222 }, { "epoch": 0.7321832928486146, "grad_norm": 3.082043409347534, "learning_rate": 1.720278363025022e-05, "loss": 2.1289, "step": 2223 }, { "epoch": 0.7325126600518753, "grad_norm": 3.0245370864868164, "learning_rate": 1.716309476911388e-05, "loss": 1.8127, "step": 2224 }, { "epoch": 0.732842027255136, "grad_norm": 3.863074779510498, "learning_rate": 1.7123442256128097e-05, "loss": 1.9633, "step": 2225 }, { "epoch": 0.7331713944583969, "grad_norm": 1.7904800176620483, "learning_rate": 1.7083826135185393e-05, "loss": 2.3501, "step": 2226 }, { "epoch": 0.7335007616616576, "grad_norm": 2.111262083053589, "learning_rate": 1.7044246450138053e-05, "loss": 2.4573, "step": 2227 }, { "epoch": 0.7338301288649183, "grad_norm": 2.706902503967285, "learning_rate": 1.700470324479801e-05, "loss": 2.0959, "step": 2228 }, { "epoch": 0.734159496068179, "grad_norm": 2.2584450244903564, "learning_rate": 1.6965196562936796e-05, "loss": 2.2161, "step": 2229 }, { "epoch": 0.7344888632714397, "grad_norm": 2.040581226348877, "learning_rate": 1.6925726448285588e-05, "loss": 2.0735, "step": 2230 }, { "epoch": 0.7348182304747005, "grad_norm": 2.670949697494507, "learning_rate": 1.6886292944534994e-05, "loss": 2.2781, "step": 2231 }, { "epoch": 0.7351475976779612, "grad_norm": 2.4085488319396973, "learning_rate": 1.6846896095335146e-05, "loss": 2.1216, "step": 2232 }, { "epoch": 0.7354769648812219, "grad_norm": 2.76834774017334, "learning_rate": 1.680753594429559e-05, "loss": 2.4634, "step": 2233 }, { "epoch": 0.7358063320844827, "grad_norm": 2.398747444152832, "learning_rate": 1.6768212534985257e-05, "loss": 2.1542, "step": 2234 }, { "epoch": 0.7361356992877435, "grad_norm": 2.2678730487823486, "learning_rate": 1.672892591093239e-05, "loss": 2.2039, "step": 2235 }, { "epoch": 0.7364650664910042, "grad_norm": 2.4844462871551514, "learning_rate": 1.6689676115624563e-05, "loss": 2.0138, "step": 2236 }, { "epoch": 0.7367944336942649, "grad_norm": 2.4144296646118164, "learning_rate": 1.6650463192508496e-05, "loss": 1.9737, "step": 2237 }, { "epoch": 0.7371238008975256, "grad_norm": 3.1505680084228516, "learning_rate": 1.6611287184990172e-05, "loss": 2.4489, "step": 2238 }, { "epoch": 0.7374531681007863, "grad_norm": 2.5178029537200928, "learning_rate": 1.6572148136434678e-05, "loss": 2.2807, "step": 2239 }, { "epoch": 0.7377825353040471, "grad_norm": 2.765470266342163, "learning_rate": 1.6533046090166195e-05, "loss": 2.3461, "step": 2240 }, { "epoch": 0.7381119025073078, "grad_norm": 2.7750508785247803, "learning_rate": 1.6493981089467943e-05, "loss": 2.2532, "step": 2241 }, { "epoch": 0.7384412697105686, "grad_norm": 2.7615838050842285, "learning_rate": 1.645495317758214e-05, "loss": 2.0369, "step": 2242 }, { "epoch": 0.7387706369138293, "grad_norm": 2.5372447967529297, "learning_rate": 1.641596239770996e-05, "loss": 2.1105, "step": 2243 }, { "epoch": 0.7391000041170901, "grad_norm": 2.304556369781494, "learning_rate": 1.6377008793011433e-05, "loss": 1.6536, "step": 2244 }, { "epoch": 0.7394293713203508, "grad_norm": 2.7731292247772217, "learning_rate": 1.633809240660548e-05, "loss": 1.9395, "step": 2245 }, { "epoch": 0.7397587385236115, "grad_norm": 2.9680540561676025, "learning_rate": 1.6299213281569815e-05, "loss": 1.7278, "step": 2246 }, { "epoch": 0.7400881057268722, "grad_norm": 2.882687568664551, "learning_rate": 1.626037146094089e-05, "loss": 1.9297, "step": 2247 }, { "epoch": 0.7404174729301329, "grad_norm": 3.0989298820495605, "learning_rate": 1.6221566987713893e-05, "loss": 2.0538, "step": 2248 }, { "epoch": 0.7407468401333938, "grad_norm": 3.140981435775757, "learning_rate": 1.618279990484266e-05, "loss": 2.1415, "step": 2249 }, { "epoch": 0.7410762073366545, "grad_norm": 3.206493616104126, "learning_rate": 1.6144070255239597e-05, "loss": 2.1208, "step": 2250 }, { "epoch": 0.7414055745399152, "grad_norm": 2.1097187995910645, "learning_rate": 1.6105378081775735e-05, "loss": 2.5872, "step": 2251 }, { "epoch": 0.7417349417431759, "grad_norm": 2.210814952850342, "learning_rate": 1.6066723427280582e-05, "loss": 2.6126, "step": 2252 }, { "epoch": 0.7420643089464367, "grad_norm": 2.039604902267456, "learning_rate": 1.6028106334542144e-05, "loss": 2.2881, "step": 2253 }, { "epoch": 0.7423936761496974, "grad_norm": 2.2926268577575684, "learning_rate": 1.5989526846306824e-05, "loss": 2.4734, "step": 2254 }, { "epoch": 0.7427230433529581, "grad_norm": 2.4341986179351807, "learning_rate": 1.5950985005279413e-05, "loss": 2.3901, "step": 2255 }, { "epoch": 0.7430524105562188, "grad_norm": 2.212266206741333, "learning_rate": 1.5912480854123042e-05, "loss": 2.5487, "step": 2256 }, { "epoch": 0.7433817777594796, "grad_norm": 2.5183675289154053, "learning_rate": 1.5874014435459068e-05, "loss": 2.3042, "step": 2257 }, { "epoch": 0.7437111449627404, "grad_norm": 2.696521759033203, "learning_rate": 1.5835585791867135e-05, "loss": 2.2133, "step": 2258 }, { "epoch": 0.7440405121660011, "grad_norm": 2.3256208896636963, "learning_rate": 1.579719496588506e-05, "loss": 2.2106, "step": 2259 }, { "epoch": 0.7443698793692618, "grad_norm": 2.5763232707977295, "learning_rate": 1.5758842000008772e-05, "loss": 2.4074, "step": 2260 }, { "epoch": 0.7446992465725225, "grad_norm": 2.66152024269104, "learning_rate": 1.572052693669237e-05, "loss": 1.9538, "step": 2261 }, { "epoch": 0.7450286137757832, "grad_norm": 3.3669986724853516, "learning_rate": 1.568224981834789e-05, "loss": 2.4397, "step": 2262 }, { "epoch": 0.745357980979044, "grad_norm": 2.784435510635376, "learning_rate": 1.5644010687345433e-05, "loss": 2.4693, "step": 2263 }, { "epoch": 0.7456873481823048, "grad_norm": 2.511077404022217, "learning_rate": 1.5605809586013033e-05, "loss": 2.2116, "step": 2264 }, { "epoch": 0.7460167153855655, "grad_norm": 2.888571262359619, "learning_rate": 1.556764655663662e-05, "loss": 2.5169, "step": 2265 }, { "epoch": 0.7463460825888262, "grad_norm": 2.7775685787200928, "learning_rate": 1.552952164146001e-05, "loss": 1.8714, "step": 2266 }, { "epoch": 0.746675449792087, "grad_norm": 2.7570314407348633, "learning_rate": 1.5491434882684796e-05, "loss": 2.1423, "step": 2267 }, { "epoch": 0.7470048169953477, "grad_norm": 3.36289381980896, "learning_rate": 1.545338632247037e-05, "loss": 2.1092, "step": 2268 }, { "epoch": 0.7473341841986084, "grad_norm": 2.6311471462249756, "learning_rate": 1.541537600293378e-05, "loss": 1.8818, "step": 2269 }, { "epoch": 0.7476635514018691, "grad_norm": 2.5641837120056152, "learning_rate": 1.5377403966149806e-05, "loss": 1.7303, "step": 2270 }, { "epoch": 0.7479929186051298, "grad_norm": 2.628680467605591, "learning_rate": 1.533947025415083e-05, "loss": 2.2235, "step": 2271 }, { "epoch": 0.7483222858083907, "grad_norm": 3.235398530960083, "learning_rate": 1.53015749089268e-05, "loss": 1.8414, "step": 2272 }, { "epoch": 0.7486516530116514, "grad_norm": 2.6925714015960693, "learning_rate": 1.5263717972425222e-05, "loss": 1.6467, "step": 2273 }, { "epoch": 0.7489810202149121, "grad_norm": 2.408013343811035, "learning_rate": 1.5225899486551065e-05, "loss": 1.8224, "step": 2274 }, { "epoch": 0.7493103874181728, "grad_norm": 3.5030288696289062, "learning_rate": 1.5188119493166726e-05, "loss": 2.0291, "step": 2275 }, { "epoch": 0.7496397546214336, "grad_norm": 2.0207650661468506, "learning_rate": 1.515037803409201e-05, "loss": 2.4186, "step": 2276 }, { "epoch": 0.7499691218246943, "grad_norm": 2.019650459289551, "learning_rate": 1.5112675151104066e-05, "loss": 2.1816, "step": 2277 }, { "epoch": 0.7499691218246943, "eval_loss": 2.19006085395813, "eval_runtime": 765.4555, "eval_samples_per_second": 3.34, "eval_steps_per_second": 1.671, "step": 2277 } ], "logging_steps": 1, "max_steps": 3036, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 759, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.492436769658372e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }