diff --git "a/last-checkpoint/trainer_state.json" "b/last-checkpoint/trainer_state.json" new file mode 100644--- /dev/null +++ "b/last-checkpoint/trainer_state.json" @@ -0,0 +1,5362 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.2499897072748981, + "eval_steps": 759, + "global_step": 759, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0003293672032607353, + "grad_norm": 3.2412221431732178, + "learning_rate": 2.0000000000000003e-06, + "loss": 3.3662, + "step": 1 + }, + { + "epoch": 0.0003293672032607353, + "eval_loss": 3.95210599899292, + "eval_runtime": 812.7321, + "eval_samples_per_second": 3.146, + "eval_steps_per_second": 1.574, + "step": 1 + }, + { + "epoch": 0.0006587344065214706, + "grad_norm": 3.8850836753845215, + "learning_rate": 4.000000000000001e-06, + "loss": 3.2364, + "step": 2 + }, + { + "epoch": 0.000988101609782206, + "grad_norm": 3.7942073345184326, + "learning_rate": 6e-06, + "loss": 3.3337, + "step": 3 + }, + { + "epoch": 0.0013174688130429412, + "grad_norm": 4.045947074890137, + "learning_rate": 8.000000000000001e-06, + "loss": 3.523, + "step": 4 + }, + { + "epoch": 0.0016468360163036766, + "grad_norm": 3.6181905269622803, + "learning_rate": 1e-05, + "loss": 3.1772, + "step": 5 + }, + { + "epoch": 0.001976203219564412, + "grad_norm": 4.814149379730225, + "learning_rate": 1.2e-05, + "loss": 3.871, + "step": 6 + }, + { + "epoch": 0.002305570422825147, + "grad_norm": 4.0820112228393555, + "learning_rate": 1.4000000000000001e-05, + "loss": 3.7199, + "step": 7 + }, + { + "epoch": 0.0026349376260858823, + "grad_norm": 4.483249187469482, + "learning_rate": 1.6000000000000003e-05, + "loss": 3.5852, + "step": 8 + }, + { + "epoch": 0.002964304829346618, + "grad_norm": 5.03192138671875, + "learning_rate": 1.8e-05, + "loss": 3.5165, + "step": 9 + }, + { + "epoch": 0.003293672032607353, + "grad_norm": 4.41070556640625, + "learning_rate": 2e-05, + "loss": 3.1522, + "step": 10 + }, + { + "epoch": 0.0036230392358680883, + "grad_norm": 4.54289436340332, + "learning_rate": 2.2000000000000003e-05, + "loss": 3.113, + "step": 11 + }, + { + "epoch": 0.003952406439128824, + "grad_norm": 5.175803184509277, + "learning_rate": 2.4e-05, + "loss": 3.0833, + "step": 12 + }, + { + "epoch": 0.004281773642389559, + "grad_norm": 5.162367820739746, + "learning_rate": 2.6000000000000002e-05, + "loss": 3.2495, + "step": 13 + }, + { + "epoch": 0.004611140845650294, + "grad_norm": 5.13967227935791, + "learning_rate": 2.8000000000000003e-05, + "loss": 3.1866, + "step": 14 + }, + { + "epoch": 0.0049405080489110294, + "grad_norm": 4.819941520690918, + "learning_rate": 3e-05, + "loss": 2.8483, + "step": 15 + }, + { + "epoch": 0.005269875252171765, + "grad_norm": 4.200347900390625, + "learning_rate": 3.2000000000000005e-05, + "loss": 3.0112, + "step": 16 + }, + { + "epoch": 0.005599242455432501, + "grad_norm": 3.748765230178833, + "learning_rate": 3.4000000000000007e-05, + "loss": 2.4482, + "step": 17 + }, + { + "epoch": 0.005928609658693236, + "grad_norm": 4.706761360168457, + "learning_rate": 3.6e-05, + "loss": 2.7336, + "step": 18 + }, + { + "epoch": 0.006257976861953971, + "grad_norm": 4.122259616851807, + "learning_rate": 3.8e-05, + "loss": 2.2935, + "step": 19 + }, + { + "epoch": 0.006587344065214706, + "grad_norm": 3.4756200313568115, + "learning_rate": 4e-05, + "loss": 2.4558, + "step": 20 + }, + { + "epoch": 0.006916711268475441, + "grad_norm": 4.478485107421875, + "learning_rate": 4.2e-05, + "loss": 2.7799, + "step": 21 + }, + { + "epoch": 0.007246078471736177, + "grad_norm": 3.8455657958984375, + "learning_rate": 4.4000000000000006e-05, + "loss": 2.3758, + "step": 22 + }, + { + "epoch": 0.007575445674996912, + "grad_norm": 4.439505100250244, + "learning_rate": 4.600000000000001e-05, + "loss": 2.4312, + "step": 23 + }, + { + "epoch": 0.007904812878257648, + "grad_norm": 5.45050048828125, + "learning_rate": 4.8e-05, + "loss": 2.3483, + "step": 24 + }, + { + "epoch": 0.008234180081518383, + "grad_norm": 6.179609298706055, + "learning_rate": 5e-05, + "loss": 2.1845, + "step": 25 + }, + { + "epoch": 0.008563547284779118, + "grad_norm": 3.980731964111328, + "learning_rate": 5.2000000000000004e-05, + "loss": 2.7766, + "step": 26 + }, + { + "epoch": 0.008892914488039853, + "grad_norm": 3.381983995437622, + "learning_rate": 5.4000000000000005e-05, + "loss": 2.5084, + "step": 27 + }, + { + "epoch": 0.009222281691300589, + "grad_norm": 3.6727781295776367, + "learning_rate": 5.6000000000000006e-05, + "loss": 2.8683, + "step": 28 + }, + { + "epoch": 0.009551648894561324, + "grad_norm": 3.6398210525512695, + "learning_rate": 5.8e-05, + "loss": 2.4222, + "step": 29 + }, + { + "epoch": 0.009881016097822059, + "grad_norm": 3.005326986312866, + "learning_rate": 6e-05, + "loss": 2.5306, + "step": 30 + }, + { + "epoch": 0.010210383301082794, + "grad_norm": 3.8456501960754395, + "learning_rate": 6.2e-05, + "loss": 2.7113, + "step": 31 + }, + { + "epoch": 0.01053975050434353, + "grad_norm": 2.880403518676758, + "learning_rate": 6.400000000000001e-05, + "loss": 2.3046, + "step": 32 + }, + { + "epoch": 0.010869117707604266, + "grad_norm": 3.180576801300049, + "learning_rate": 6.6e-05, + "loss": 2.5024, + "step": 33 + }, + { + "epoch": 0.011198484910865001, + "grad_norm": 3.2660038471221924, + "learning_rate": 6.800000000000001e-05, + "loss": 2.6287, + "step": 34 + }, + { + "epoch": 0.011527852114125737, + "grad_norm": 2.8470723628997803, + "learning_rate": 7e-05, + "loss": 2.3198, + "step": 35 + }, + { + "epoch": 0.011857219317386472, + "grad_norm": 3.0491487979888916, + "learning_rate": 7.2e-05, + "loss": 2.3468, + "step": 36 + }, + { + "epoch": 0.012186586520647207, + "grad_norm": 3.558762788772583, + "learning_rate": 7.4e-05, + "loss": 2.5952, + "step": 37 + }, + { + "epoch": 0.012515953723907942, + "grad_norm": 3.0420889854431152, + "learning_rate": 7.6e-05, + "loss": 2.2754, + "step": 38 + }, + { + "epoch": 0.012845320927168677, + "grad_norm": 2.9328525066375732, + "learning_rate": 7.800000000000001e-05, + "loss": 1.9192, + "step": 39 + }, + { + "epoch": 0.013174688130429412, + "grad_norm": 2.9751060009002686, + "learning_rate": 8e-05, + "loss": 2.4325, + "step": 40 + }, + { + "epoch": 0.013504055333690148, + "grad_norm": 2.924984931945801, + "learning_rate": 8.2e-05, + "loss": 2.376, + "step": 41 + }, + { + "epoch": 0.013833422536950883, + "grad_norm": 2.9344444274902344, + "learning_rate": 8.4e-05, + "loss": 2.3072, + "step": 42 + }, + { + "epoch": 0.014162789740211618, + "grad_norm": 3.2519760131835938, + "learning_rate": 8.6e-05, + "loss": 2.3017, + "step": 43 + }, + { + "epoch": 0.014492156943472353, + "grad_norm": 2.9699273109436035, + "learning_rate": 8.800000000000001e-05, + "loss": 2.2801, + "step": 44 + }, + { + "epoch": 0.014821524146733088, + "grad_norm": 3.125183343887329, + "learning_rate": 9e-05, + "loss": 2.1891, + "step": 45 + }, + { + "epoch": 0.015150891349993824, + "grad_norm": 4.185096740722656, + "learning_rate": 9.200000000000001e-05, + "loss": 2.3958, + "step": 46 + }, + { + "epoch": 0.01548025855325456, + "grad_norm": 4.01360559463501, + "learning_rate": 9.4e-05, + "loss": 2.3923, + "step": 47 + }, + { + "epoch": 0.015809625756515296, + "grad_norm": 3.355820417404175, + "learning_rate": 9.6e-05, + "loss": 2.0905, + "step": 48 + }, + { + "epoch": 0.01613899295977603, + "grad_norm": 4.308192729949951, + "learning_rate": 9.8e-05, + "loss": 2.0585, + "step": 49 + }, + { + "epoch": 0.016468360163036766, + "grad_norm": 5.098552227020264, + "learning_rate": 0.0001, + "loss": 2.7164, + "step": 50 + }, + { + "epoch": 0.0167977273662975, + "grad_norm": 2.8389906883239746, + "learning_rate": 9.999997232675378e-05, + "loss": 2.7739, + "step": 51 + }, + { + "epoch": 0.017127094569558236, + "grad_norm": 3.0663294792175293, + "learning_rate": 9.999988930704576e-05, + "loss": 2.7123, + "step": 52 + }, + { + "epoch": 0.01745646177281897, + "grad_norm": 3.3522698879241943, + "learning_rate": 9.99997509409678e-05, + "loss": 2.4191, + "step": 53 + }, + { + "epoch": 0.017785828976079707, + "grad_norm": 2.815250873565674, + "learning_rate": 9.999955722867312e-05, + "loss": 2.6329, + "step": 54 + }, + { + "epoch": 0.018115196179340442, + "grad_norm": 2.9915289878845215, + "learning_rate": 9.999930817037609e-05, + "loss": 2.4786, + "step": 55 + }, + { + "epoch": 0.018444563382601177, + "grad_norm": 3.284151554107666, + "learning_rate": 9.999900376635243e-05, + "loss": 2.4249, + "step": 56 + }, + { + "epoch": 0.018773930585861912, + "grad_norm": 2.4534027576446533, + "learning_rate": 9.999864401693908e-05, + "loss": 2.356, + "step": 57 + }, + { + "epoch": 0.019103297789122647, + "grad_norm": 2.522468328475952, + "learning_rate": 9.999822892253428e-05, + "loss": 2.3852, + "step": 58 + }, + { + "epoch": 0.019432664992383383, + "grad_norm": 2.9176650047302246, + "learning_rate": 9.999775848359749e-05, + "loss": 2.2109, + "step": 59 + }, + { + "epoch": 0.019762032195644118, + "grad_norm": 3.167917013168335, + "learning_rate": 9.999723270064945e-05, + "loss": 2.3511, + "step": 60 + }, + { + "epoch": 0.020091399398904853, + "grad_norm": 2.6770145893096924, + "learning_rate": 9.999665157427218e-05, + "loss": 2.6865, + "step": 61 + }, + { + "epoch": 0.020420766602165588, + "grad_norm": 2.729907989501953, + "learning_rate": 9.999601510510895e-05, + "loss": 2.3012, + "step": 62 + }, + { + "epoch": 0.020750133805426323, + "grad_norm": 2.7215871810913086, + "learning_rate": 9.999532329386425e-05, + "loss": 2.0514, + "step": 63 + }, + { + "epoch": 0.02107950100868706, + "grad_norm": 2.7751474380493164, + "learning_rate": 9.999457614130391e-05, + "loss": 2.4867, + "step": 64 + }, + { + "epoch": 0.021408868211947794, + "grad_norm": 2.4375815391540527, + "learning_rate": 9.999377364825496e-05, + "loss": 2.2029, + "step": 65 + }, + { + "epoch": 0.021738235415208532, + "grad_norm": 2.7066309452056885, + "learning_rate": 9.99929158156057e-05, + "loss": 2.1274, + "step": 66 + }, + { + "epoch": 0.022067602618469268, + "grad_norm": 3.297457695007324, + "learning_rate": 9.999200264430569e-05, + "loss": 2.5863, + "step": 67 + }, + { + "epoch": 0.022396969821730003, + "grad_norm": 3.2771711349487305, + "learning_rate": 9.999103413536574e-05, + "loss": 2.334, + "step": 68 + }, + { + "epoch": 0.022726337024990738, + "grad_norm": 2.9507834911346436, + "learning_rate": 9.999001028985795e-05, + "loss": 2.3783, + "step": 69 + }, + { + "epoch": 0.023055704228251473, + "grad_norm": 2.6401610374450684, + "learning_rate": 9.998893110891562e-05, + "loss": 1.9256, + "step": 70 + }, + { + "epoch": 0.023385071431512208, + "grad_norm": 3.0901267528533936, + "learning_rate": 9.998779659373332e-05, + "loss": 2.1796, + "step": 71 + }, + { + "epoch": 0.023714438634772943, + "grad_norm": 3.132500171661377, + "learning_rate": 9.99866067455669e-05, + "loss": 2.2996, + "step": 72 + }, + { + "epoch": 0.02404380583803368, + "grad_norm": 3.1304476261138916, + "learning_rate": 9.998536156573343e-05, + "loss": 2.1264, + "step": 73 + }, + { + "epoch": 0.024373173041294414, + "grad_norm": 3.315488576889038, + "learning_rate": 9.998406105561125e-05, + "loss": 1.984, + "step": 74 + }, + { + "epoch": 0.02470254024455515, + "grad_norm": 7.0925774574279785, + "learning_rate": 9.99827052166399e-05, + "loss": 2.533, + "step": 75 + }, + { + "epoch": 0.025031907447815884, + "grad_norm": 2.485670328140259, + "learning_rate": 9.998129405032022e-05, + "loss": 2.4111, + "step": 76 + }, + { + "epoch": 0.02536127465107662, + "grad_norm": 2.9346237182617188, + "learning_rate": 9.997982755821428e-05, + "loss": 2.4954, + "step": 77 + }, + { + "epoch": 0.025690641854337355, + "grad_norm": 2.6853690147399902, + "learning_rate": 9.997830574194538e-05, + "loss": 2.7849, + "step": 78 + }, + { + "epoch": 0.02602000905759809, + "grad_norm": 2.614497661590576, + "learning_rate": 9.997672860319804e-05, + "loss": 2.7054, + "step": 79 + }, + { + "epoch": 0.026349376260858825, + "grad_norm": 2.476630687713623, + "learning_rate": 9.997509614371807e-05, + "loss": 2.3267, + "step": 80 + }, + { + "epoch": 0.02667874346411956, + "grad_norm": 2.7469708919525146, + "learning_rate": 9.997340836531249e-05, + "loss": 2.3416, + "step": 81 + }, + { + "epoch": 0.027008110667380295, + "grad_norm": 2.3859927654266357, + "learning_rate": 9.997166526984954e-05, + "loss": 2.1941, + "step": 82 + }, + { + "epoch": 0.02733747787064103, + "grad_norm": 2.3550686836242676, + "learning_rate": 9.996986685925868e-05, + "loss": 2.2731, + "step": 83 + }, + { + "epoch": 0.027666845073901766, + "grad_norm": 2.6245596408843994, + "learning_rate": 9.996801313553068e-05, + "loss": 2.1934, + "step": 84 + }, + { + "epoch": 0.0279962122771625, + "grad_norm": 2.337979793548584, + "learning_rate": 9.996610410071742e-05, + "loss": 2.0485, + "step": 85 + }, + { + "epoch": 0.028325579480423236, + "grad_norm": 2.2919275760650635, + "learning_rate": 9.996413975693214e-05, + "loss": 2.1097, + "step": 86 + }, + { + "epoch": 0.02865494668368397, + "grad_norm": 2.4434707164764404, + "learning_rate": 9.996212010634917e-05, + "loss": 2.1181, + "step": 87 + }, + { + "epoch": 0.028984313886944706, + "grad_norm": 2.2074220180511475, + "learning_rate": 9.996004515120414e-05, + "loss": 2.1341, + "step": 88 + }, + { + "epoch": 0.02931368109020544, + "grad_norm": 2.670886993408203, + "learning_rate": 9.995791489379388e-05, + "loss": 2.2701, + "step": 89 + }, + { + "epoch": 0.029643048293466177, + "grad_norm": 3.0483765602111816, + "learning_rate": 9.995572933647643e-05, + "loss": 2.2896, + "step": 90 + }, + { + "epoch": 0.029972415496726912, + "grad_norm": 2.74815034866333, + "learning_rate": 9.995348848167107e-05, + "loss": 2.3094, + "step": 91 + }, + { + "epoch": 0.030301782699987647, + "grad_norm": 3.188274621963501, + "learning_rate": 9.995119233185825e-05, + "loss": 2.3274, + "step": 92 + }, + { + "epoch": 0.030631149903248386, + "grad_norm": 2.603101968765259, + "learning_rate": 9.994884088957966e-05, + "loss": 2.0735, + "step": 93 + }, + { + "epoch": 0.03096051710650912, + "grad_norm": 2.7947723865509033, + "learning_rate": 9.994643415743817e-05, + "loss": 2.571, + "step": 94 + }, + { + "epoch": 0.03128988430976985, + "grad_norm": 2.8341128826141357, + "learning_rate": 9.994397213809786e-05, + "loss": 2.4703, + "step": 95 + }, + { + "epoch": 0.03161925151303059, + "grad_norm": 2.8162853717803955, + "learning_rate": 9.994145483428403e-05, + "loss": 2.2555, + "step": 96 + }, + { + "epoch": 0.03194861871629132, + "grad_norm": 2.9116039276123047, + "learning_rate": 9.993888224878313e-05, + "loss": 2.0821, + "step": 97 + }, + { + "epoch": 0.03227798591955206, + "grad_norm": 3.1359169483184814, + "learning_rate": 9.993625438444287e-05, + "loss": 1.8411, + "step": 98 + }, + { + "epoch": 0.03260735312281279, + "grad_norm": 3.9783849716186523, + "learning_rate": 9.993357124417209e-05, + "loss": 2.3985, + "step": 99 + }, + { + "epoch": 0.03293672032607353, + "grad_norm": 3.3506076335906982, + "learning_rate": 9.993083283094084e-05, + "loss": 2.2946, + "step": 100 + }, + { + "epoch": 0.033266087529334264, + "grad_norm": 2.503321409225464, + "learning_rate": 9.992803914778034e-05, + "loss": 2.6129, + "step": 101 + }, + { + "epoch": 0.033595454732595, + "grad_norm": 3.6384737491607666, + "learning_rate": 9.992519019778301e-05, + "loss": 2.6609, + "step": 102 + }, + { + "epoch": 0.033924821935855734, + "grad_norm": 2.3811521530151367, + "learning_rate": 9.992228598410244e-05, + "loss": 2.3603, + "step": 103 + }, + { + "epoch": 0.03425418913911647, + "grad_norm": 2.4915661811828613, + "learning_rate": 9.991932650995341e-05, + "loss": 2.2988, + "step": 104 + }, + { + "epoch": 0.034583556342377204, + "grad_norm": 2.7037079334259033, + "learning_rate": 9.991631177861182e-05, + "loss": 2.4169, + "step": 105 + }, + { + "epoch": 0.03491292354563794, + "grad_norm": 2.6235880851745605, + "learning_rate": 9.991324179341478e-05, + "loss": 2.2463, + "step": 106 + }, + { + "epoch": 0.03524229074889868, + "grad_norm": 2.3382320404052734, + "learning_rate": 9.991011655776055e-05, + "loss": 2.2194, + "step": 107 + }, + { + "epoch": 0.03557165795215941, + "grad_norm": 2.669034957885742, + "learning_rate": 9.990693607510853e-05, + "loss": 2.4065, + "step": 108 + }, + { + "epoch": 0.03590102515542015, + "grad_norm": 2.219449281692505, + "learning_rate": 9.990370034897931e-05, + "loss": 2.0945, + "step": 109 + }, + { + "epoch": 0.036230392358680884, + "grad_norm": 3.07869291305542, + "learning_rate": 9.99004093829546e-05, + "loss": 2.3285, + "step": 110 + }, + { + "epoch": 0.03655975956194162, + "grad_norm": 2.4732134342193604, + "learning_rate": 9.989706318067728e-05, + "loss": 2.0391, + "step": 111 + }, + { + "epoch": 0.036889126765202354, + "grad_norm": 2.9737696647644043, + "learning_rate": 9.989366174585135e-05, + "loss": 2.0482, + "step": 112 + }, + { + "epoch": 0.03721849396846309, + "grad_norm": 2.8496570587158203, + "learning_rate": 9.989020508224197e-05, + "loss": 2.3701, + "step": 113 + }, + { + "epoch": 0.037547861171723824, + "grad_norm": 3.108743190765381, + "learning_rate": 9.98866931936754e-05, + "loss": 2.4347, + "step": 114 + }, + { + "epoch": 0.03787722837498456, + "grad_norm": 2.642160177230835, + "learning_rate": 9.988312608403909e-05, + "loss": 2.3828, + "step": 115 + }, + { + "epoch": 0.038206595578245295, + "grad_norm": 2.7258148193359375, + "learning_rate": 9.987950375728157e-05, + "loss": 2.0533, + "step": 116 + }, + { + "epoch": 0.038535962781506033, + "grad_norm": 2.6295080184936523, + "learning_rate": 9.98758262174125e-05, + "loss": 2.3752, + "step": 117 + }, + { + "epoch": 0.038865329984766765, + "grad_norm": 2.905092477798462, + "learning_rate": 9.987209346850263e-05, + "loss": 2.031, + "step": 118 + }, + { + "epoch": 0.039194697188027504, + "grad_norm": 2.780714988708496, + "learning_rate": 9.986830551468388e-05, + "loss": 2.0643, + "step": 119 + }, + { + "epoch": 0.039524064391288236, + "grad_norm": 2.858346939086914, + "learning_rate": 9.986446236014925e-05, + "loss": 2.3293, + "step": 120 + }, + { + "epoch": 0.039853431594548974, + "grad_norm": 3.3452649116516113, + "learning_rate": 9.986056400915284e-05, + "loss": 2.4871, + "step": 121 + }, + { + "epoch": 0.040182798797809706, + "grad_norm": 2.921239137649536, + "learning_rate": 9.985661046600984e-05, + "loss": 2.3667, + "step": 122 + }, + { + "epoch": 0.040512166001070445, + "grad_norm": 3.3637163639068604, + "learning_rate": 9.985260173509656e-05, + "loss": 2.2286, + "step": 123 + }, + { + "epoch": 0.040841533204331176, + "grad_norm": 2.8042445182800293, + "learning_rate": 9.984853782085035e-05, + "loss": 1.9868, + "step": 124 + }, + { + "epoch": 0.041170900407591915, + "grad_norm": 3.1596014499664307, + "learning_rate": 9.984441872776973e-05, + "loss": 1.9366, + "step": 125 + }, + { + "epoch": 0.04150026761085265, + "grad_norm": 2.2009477615356445, + "learning_rate": 9.984024446041423e-05, + "loss": 2.7554, + "step": 126 + }, + { + "epoch": 0.041829634814113385, + "grad_norm": 2.5528178215026855, + "learning_rate": 9.983601502340443e-05, + "loss": 2.4577, + "step": 127 + }, + { + "epoch": 0.04215900201737412, + "grad_norm": 2.445089101791382, + "learning_rate": 9.983173042142208e-05, + "loss": 2.3525, + "step": 128 + }, + { + "epoch": 0.042488369220634856, + "grad_norm": 2.8235416412353516, + "learning_rate": 9.98273906592099e-05, + "loss": 2.6086, + "step": 129 + }, + { + "epoch": 0.04281773642389559, + "grad_norm": 2.569711446762085, + "learning_rate": 9.98229957415717e-05, + "loss": 2.4181, + "step": 130 + }, + { + "epoch": 0.043147103627156326, + "grad_norm": 2.627051591873169, + "learning_rate": 9.981854567337237e-05, + "loss": 2.307, + "step": 131 + }, + { + "epoch": 0.043476470830417065, + "grad_norm": 2.5236432552337646, + "learning_rate": 9.98140404595378e-05, + "loss": 2.5002, + "step": 132 + }, + { + "epoch": 0.043805838033677796, + "grad_norm": 2.83263897895813, + "learning_rate": 9.980948010505493e-05, + "loss": 2.6664, + "step": 133 + }, + { + "epoch": 0.044135205236938535, + "grad_norm": 2.404651641845703, + "learning_rate": 9.98048646149718e-05, + "loss": 2.4746, + "step": 134 + }, + { + "epoch": 0.04446457244019927, + "grad_norm": 2.768198013305664, + "learning_rate": 9.980019399439741e-05, + "loss": 2.3371, + "step": 135 + }, + { + "epoch": 0.044793939643460005, + "grad_norm": 3.2502036094665527, + "learning_rate": 9.97954682485018e-05, + "loss": 2.7268, + "step": 136 + }, + { + "epoch": 0.04512330684672074, + "grad_norm": 2.356938362121582, + "learning_rate": 9.979068738251605e-05, + "loss": 2.271, + "step": 137 + }, + { + "epoch": 0.045452674049981476, + "grad_norm": 2.790741205215454, + "learning_rate": 9.978585140173225e-05, + "loss": 2.5781, + "step": 138 + }, + { + "epoch": 0.04578204125324221, + "grad_norm": 3.1573290824890137, + "learning_rate": 9.978096031150346e-05, + "loss": 2.3545, + "step": 139 + }, + { + "epoch": 0.046111408456502946, + "grad_norm": 2.6217617988586426, + "learning_rate": 9.977601411724382e-05, + "loss": 2.2068, + "step": 140 + }, + { + "epoch": 0.04644077565976368, + "grad_norm": 3.0412278175354004, + "learning_rate": 9.977101282442839e-05, + "loss": 2.386, + "step": 141 + }, + { + "epoch": 0.046770142863024416, + "grad_norm": 3.4128334522247314, + "learning_rate": 9.976595643859326e-05, + "loss": 2.5365, + "step": 142 + }, + { + "epoch": 0.04709951006628515, + "grad_norm": 3.5312652587890625, + "learning_rate": 9.976084496533547e-05, + "loss": 2.2243, + "step": 143 + }, + { + "epoch": 0.04742887726954589, + "grad_norm": 2.8533828258514404, + "learning_rate": 9.97556784103131e-05, + "loss": 2.2667, + "step": 144 + }, + { + "epoch": 0.04775824447280662, + "grad_norm": 3.081562042236328, + "learning_rate": 9.975045677924515e-05, + "loss": 2.3508, + "step": 145 + }, + { + "epoch": 0.04808761167606736, + "grad_norm": 3.2083470821380615, + "learning_rate": 9.97451800779116e-05, + "loss": 2.4371, + "step": 146 + }, + { + "epoch": 0.04841697887932809, + "grad_norm": 3.0021450519561768, + "learning_rate": 9.973984831215337e-05, + "loss": 1.9932, + "step": 147 + }, + { + "epoch": 0.04874634608258883, + "grad_norm": 3.146559953689575, + "learning_rate": 9.973446148787238e-05, + "loss": 2.0892, + "step": 148 + }, + { + "epoch": 0.04907571328584956, + "grad_norm": 3.2698886394500732, + "learning_rate": 9.972901961103145e-05, + "loss": 1.9643, + "step": 149 + }, + { + "epoch": 0.0494050804891103, + "grad_norm": 3.640223503112793, + "learning_rate": 9.972352268765434e-05, + "loss": 2.0784, + "step": 150 + }, + { + "epoch": 0.04973444769237103, + "grad_norm": 2.1278653144836426, + "learning_rate": 9.971797072382579e-05, + "loss": 2.4746, + "step": 151 + }, + { + "epoch": 0.05006381489563177, + "grad_norm": 2.588524341583252, + "learning_rate": 9.971236372569142e-05, + "loss": 2.2406, + "step": 152 + }, + { + "epoch": 0.0503931820988925, + "grad_norm": 2.5750515460968018, + "learning_rate": 9.97067016994578e-05, + "loss": 2.254, + "step": 153 + }, + { + "epoch": 0.05072254930215324, + "grad_norm": 2.47708797454834, + "learning_rate": 9.970098465139236e-05, + "loss": 2.2676, + "step": 154 + }, + { + "epoch": 0.05105191650541397, + "grad_norm": 2.496344566345215, + "learning_rate": 9.969521258782351e-05, + "loss": 2.3978, + "step": 155 + }, + { + "epoch": 0.05138128370867471, + "grad_norm": 2.462228775024414, + "learning_rate": 9.968938551514048e-05, + "loss": 2.3217, + "step": 156 + }, + { + "epoch": 0.05171065091193544, + "grad_norm": 2.3593785762786865, + "learning_rate": 9.968350343979346e-05, + "loss": 2.0463, + "step": 157 + }, + { + "epoch": 0.05204001811519618, + "grad_norm": 2.4684934616088867, + "learning_rate": 9.967756636829348e-05, + "loss": 2.3118, + "step": 158 + }, + { + "epoch": 0.05236938531845692, + "grad_norm": 3.1452877521514893, + "learning_rate": 9.967157430721248e-05, + "loss": 2.3831, + "step": 159 + }, + { + "epoch": 0.05269875252171765, + "grad_norm": 2.745805025100708, + "learning_rate": 9.966552726318323e-05, + "loss": 2.3436, + "step": 160 + }, + { + "epoch": 0.05302811972497839, + "grad_norm": 2.490478038787842, + "learning_rate": 9.965942524289941e-05, + "loss": 2.2698, + "step": 161 + }, + { + "epoch": 0.05335748692823912, + "grad_norm": 2.3748586177825928, + "learning_rate": 9.96532682531155e-05, + "loss": 2.4236, + "step": 162 + }, + { + "epoch": 0.05368685413149986, + "grad_norm": 2.378679037094116, + "learning_rate": 9.964705630064686e-05, + "loss": 2.1829, + "step": 163 + }, + { + "epoch": 0.05401622133476059, + "grad_norm": 2.8342976570129395, + "learning_rate": 9.964078939236971e-05, + "loss": 2.3079, + "step": 164 + }, + { + "epoch": 0.05434558853802133, + "grad_norm": 2.9072232246398926, + "learning_rate": 9.963446753522104e-05, + "loss": 2.3423, + "step": 165 + }, + { + "epoch": 0.05467495574128206, + "grad_norm": 2.8593156337738037, + "learning_rate": 9.962809073619875e-05, + "loss": 2.2235, + "step": 166 + }, + { + "epoch": 0.0550043229445428, + "grad_norm": 2.832493543624878, + "learning_rate": 9.962165900236146e-05, + "loss": 2.2889, + "step": 167 + }, + { + "epoch": 0.05533369014780353, + "grad_norm": 2.806488037109375, + "learning_rate": 9.961517234082866e-05, + "loss": 2.1615, + "step": 168 + }, + { + "epoch": 0.05566305735106427, + "grad_norm": 3.301941394805908, + "learning_rate": 9.960863075878067e-05, + "loss": 2.2195, + "step": 169 + }, + { + "epoch": 0.055992424554325, + "grad_norm": 2.458503484725952, + "learning_rate": 9.960203426345851e-05, + "loss": 2.1645, + "step": 170 + }, + { + "epoch": 0.05632179175758574, + "grad_norm": 2.415736675262451, + "learning_rate": 9.959538286216408e-05, + "loss": 2.012, + "step": 171 + }, + { + "epoch": 0.05665115896084647, + "grad_norm": 2.7939391136169434, + "learning_rate": 9.958867656225997e-05, + "loss": 2.3091, + "step": 172 + }, + { + "epoch": 0.05698052616410721, + "grad_norm": 2.971738576889038, + "learning_rate": 9.958191537116963e-05, + "loss": 2.1566, + "step": 173 + }, + { + "epoch": 0.05730989336736794, + "grad_norm": 3.0159671306610107, + "learning_rate": 9.957509929637719e-05, + "loss": 2.0143, + "step": 174 + }, + { + "epoch": 0.05763926057062868, + "grad_norm": 3.120633125305176, + "learning_rate": 9.956822834542759e-05, + "loss": 1.8494, + "step": 175 + }, + { + "epoch": 0.05796862777388941, + "grad_norm": 2.3217828273773193, + "learning_rate": 9.956130252592646e-05, + "loss": 2.6393, + "step": 176 + }, + { + "epoch": 0.05829799497715015, + "grad_norm": 2.3948323726654053, + "learning_rate": 9.955432184554024e-05, + "loss": 2.6342, + "step": 177 + }, + { + "epoch": 0.05862736218041088, + "grad_norm": 2.1750946044921875, + "learning_rate": 9.9547286311996e-05, + "loss": 2.3683, + "step": 178 + }, + { + "epoch": 0.05895672938367162, + "grad_norm": 2.6148295402526855, + "learning_rate": 9.954019593308163e-05, + "loss": 2.5178, + "step": 179 + }, + { + "epoch": 0.05928609658693235, + "grad_norm": 2.6671082973480225, + "learning_rate": 9.953305071664566e-05, + "loss": 2.3501, + "step": 180 + }, + { + "epoch": 0.05961546379019309, + "grad_norm": 2.740058422088623, + "learning_rate": 9.952585067059734e-05, + "loss": 2.3677, + "step": 181 + }, + { + "epoch": 0.059944830993453824, + "grad_norm": 2.2972073554992676, + "learning_rate": 9.951859580290664e-05, + "loss": 2.5598, + "step": 182 + }, + { + "epoch": 0.06027419819671456, + "grad_norm": 2.5012662410736084, + "learning_rate": 9.951128612160417e-05, + "loss": 2.247, + "step": 183 + }, + { + "epoch": 0.060603565399975294, + "grad_norm": 2.5461273193359375, + "learning_rate": 9.950392163478121e-05, + "loss": 2.4683, + "step": 184 + }, + { + "epoch": 0.06093293260323603, + "grad_norm": 3.3869452476501465, + "learning_rate": 9.949650235058978e-05, + "loss": 2.5158, + "step": 185 + }, + { + "epoch": 0.06126229980649677, + "grad_norm": 2.2931103706359863, + "learning_rate": 9.948902827724248e-05, + "loss": 2.2837, + "step": 186 + }, + { + "epoch": 0.0615916670097575, + "grad_norm": 2.4342219829559326, + "learning_rate": 9.94814994230126e-05, + "loss": 2.2988, + "step": 187 + }, + { + "epoch": 0.06192103421301824, + "grad_norm": 2.924483299255371, + "learning_rate": 9.947391579623401e-05, + "loss": 2.4679, + "step": 188 + }, + { + "epoch": 0.06225040141627897, + "grad_norm": 2.381253480911255, + "learning_rate": 9.946627740530131e-05, + "loss": 2.0651, + "step": 189 + }, + { + "epoch": 0.0625797686195397, + "grad_norm": 2.3202030658721924, + "learning_rate": 9.945858425866962e-05, + "loss": 2.0079, + "step": 190 + }, + { + "epoch": 0.06290913582280044, + "grad_norm": 2.717766523361206, + "learning_rate": 9.945083636485476e-05, + "loss": 2.6815, + "step": 191 + }, + { + "epoch": 0.06323850302606118, + "grad_norm": 3.0983986854553223, + "learning_rate": 9.944303373243307e-05, + "loss": 2.4154, + "step": 192 + }, + { + "epoch": 0.06356787022932192, + "grad_norm": 2.5674819946289062, + "learning_rate": 9.943517637004151e-05, + "loss": 1.8935, + "step": 193 + }, + { + "epoch": 0.06389723743258265, + "grad_norm": 2.8902697563171387, + "learning_rate": 9.942726428637765e-05, + "loss": 2.19, + "step": 194 + }, + { + "epoch": 0.06422660463584338, + "grad_norm": 2.824228525161743, + "learning_rate": 9.941929749019961e-05, + "loss": 1.7962, + "step": 195 + }, + { + "epoch": 0.06455597183910412, + "grad_norm": 2.9178125858306885, + "learning_rate": 9.941127599032605e-05, + "loss": 1.9707, + "step": 196 + }, + { + "epoch": 0.06488533904236486, + "grad_norm": 3.112900495529175, + "learning_rate": 9.940319979563624e-05, + "loss": 2.1085, + "step": 197 + }, + { + "epoch": 0.06521470624562559, + "grad_norm": 2.4243252277374268, + "learning_rate": 9.939506891506993e-05, + "loss": 1.6683, + "step": 198 + }, + { + "epoch": 0.06554407344888633, + "grad_norm": 3.3545095920562744, + "learning_rate": 9.938688335762747e-05, + "loss": 1.9903, + "step": 199 + }, + { + "epoch": 0.06587344065214706, + "grad_norm": 4.020653247833252, + "learning_rate": 9.937864313236968e-05, + "loss": 1.9782, + "step": 200 + }, + { + "epoch": 0.0662028078554078, + "grad_norm": 2.1895785331726074, + "learning_rate": 9.93703482484179e-05, + "loss": 2.4232, + "step": 201 + }, + { + "epoch": 0.06653217505866853, + "grad_norm": 2.779294967651367, + "learning_rate": 9.9361998714954e-05, + "loss": 2.4474, + "step": 202 + }, + { + "epoch": 0.06686154226192927, + "grad_norm": 2.666214942932129, + "learning_rate": 9.935359454122033e-05, + "loss": 2.3747, + "step": 203 + }, + { + "epoch": 0.06719090946519, + "grad_norm": 2.4927260875701904, + "learning_rate": 9.93451357365197e-05, + "loss": 2.3229, + "step": 204 + }, + { + "epoch": 0.06752027666845074, + "grad_norm": 2.6281189918518066, + "learning_rate": 9.933662231021543e-05, + "loss": 2.3106, + "step": 205 + }, + { + "epoch": 0.06784964387171147, + "grad_norm": 2.487201452255249, + "learning_rate": 9.932805427173128e-05, + "loss": 2.1396, + "step": 206 + }, + { + "epoch": 0.0681790110749722, + "grad_norm": 2.6833136081695557, + "learning_rate": 9.931943163055148e-05, + "loss": 2.6855, + "step": 207 + }, + { + "epoch": 0.06850837827823295, + "grad_norm": 2.670117139816284, + "learning_rate": 9.931075439622069e-05, + "loss": 2.0407, + "step": 208 + }, + { + "epoch": 0.06883774548149368, + "grad_norm": 2.8142549991607666, + "learning_rate": 9.930202257834397e-05, + "loss": 2.5156, + "step": 209 + }, + { + "epoch": 0.06916711268475441, + "grad_norm": 2.5977020263671875, + "learning_rate": 9.929323618658686e-05, + "loss": 2.2659, + "step": 210 + }, + { + "epoch": 0.06949647988801515, + "grad_norm": 2.188446521759033, + "learning_rate": 9.928439523067526e-05, + "loss": 1.853, + "step": 211 + }, + { + "epoch": 0.06982584709127589, + "grad_norm": 2.691819906234741, + "learning_rate": 9.92754997203955e-05, + "loss": 2.0574, + "step": 212 + }, + { + "epoch": 0.07015521429453662, + "grad_norm": 2.599579334259033, + "learning_rate": 9.926654966559427e-05, + "loss": 2.1189, + "step": 213 + }, + { + "epoch": 0.07048458149779736, + "grad_norm": 3.0365447998046875, + "learning_rate": 9.925754507617868e-05, + "loss": 2.2052, + "step": 214 + }, + { + "epoch": 0.07081394870105809, + "grad_norm": 3.086376190185547, + "learning_rate": 9.924848596211618e-05, + "loss": 2.5058, + "step": 215 + }, + { + "epoch": 0.07114331590431883, + "grad_norm": 3.420269012451172, + "learning_rate": 9.923937233343453e-05, + "loss": 2.0489, + "step": 216 + }, + { + "epoch": 0.07147268310757957, + "grad_norm": 2.6292338371276855, + "learning_rate": 9.923020420022191e-05, + "loss": 2.4083, + "step": 217 + }, + { + "epoch": 0.0718020503108403, + "grad_norm": 3.046985387802124, + "learning_rate": 9.92209815726268e-05, + "loss": 2.2224, + "step": 218 + }, + { + "epoch": 0.07213141751410103, + "grad_norm": 3.1213648319244385, + "learning_rate": 9.921170446085798e-05, + "loss": 2.0798, + "step": 219 + }, + { + "epoch": 0.07246078471736177, + "grad_norm": 2.71501088142395, + "learning_rate": 9.920237287518462e-05, + "loss": 2.3788, + "step": 220 + }, + { + "epoch": 0.0727901519206225, + "grad_norm": 2.7265591621398926, + "learning_rate": 9.919298682593605e-05, + "loss": 2.0445, + "step": 221 + }, + { + "epoch": 0.07311951912388324, + "grad_norm": 3.3277218341827393, + "learning_rate": 9.918354632350202e-05, + "loss": 2.1541, + "step": 222 + }, + { + "epoch": 0.07344888632714397, + "grad_norm": 3.288353443145752, + "learning_rate": 9.917405137833249e-05, + "loss": 2.2666, + "step": 223 + }, + { + "epoch": 0.07377825353040471, + "grad_norm": 3.049253463745117, + "learning_rate": 9.916450200093771e-05, + "loss": 1.8277, + "step": 224 + }, + { + "epoch": 0.07410762073366545, + "grad_norm": 3.3985233306884766, + "learning_rate": 9.915489820188814e-05, + "loss": 1.9754, + "step": 225 + }, + { + "epoch": 0.07443698793692619, + "grad_norm": 2.7046058177948, + "learning_rate": 9.914523999181456e-05, + "loss": 2.3886, + "step": 226 + }, + { + "epoch": 0.07476635514018691, + "grad_norm": 2.472142219543457, + "learning_rate": 9.91355273814079e-05, + "loss": 2.4031, + "step": 227 + }, + { + "epoch": 0.07509572234344765, + "grad_norm": 2.1829640865325928, + "learning_rate": 9.912576038141933e-05, + "loss": 2.34, + "step": 228 + }, + { + "epoch": 0.07542508954670839, + "grad_norm": 2.4181277751922607, + "learning_rate": 9.911593900266026e-05, + "loss": 2.1865, + "step": 229 + }, + { + "epoch": 0.07575445674996913, + "grad_norm": 2.6987533569335938, + "learning_rate": 9.910606325600223e-05, + "loss": 2.3855, + "step": 230 + }, + { + "epoch": 0.07608382395322985, + "grad_norm": 2.5802574157714844, + "learning_rate": 9.909613315237702e-05, + "loss": 2.3325, + "step": 231 + }, + { + "epoch": 0.07641319115649059, + "grad_norm": 2.356382369995117, + "learning_rate": 9.90861487027766e-05, + "loss": 2.1446, + "step": 232 + }, + { + "epoch": 0.07674255835975133, + "grad_norm": 2.3594143390655518, + "learning_rate": 9.907610991825298e-05, + "loss": 2.204, + "step": 233 + }, + { + "epoch": 0.07707192556301207, + "grad_norm": 2.47929310798645, + "learning_rate": 9.906601680991842e-05, + "loss": 2.2276, + "step": 234 + }, + { + "epoch": 0.07740129276627279, + "grad_norm": 3.100281000137329, + "learning_rate": 9.905586938894531e-05, + "loss": 2.3447, + "step": 235 + }, + { + "epoch": 0.07773065996953353, + "grad_norm": 2.3578147888183594, + "learning_rate": 9.904566766656612e-05, + "loss": 2.1765, + "step": 236 + }, + { + "epoch": 0.07806002717279427, + "grad_norm": 2.6452767848968506, + "learning_rate": 9.903541165407341e-05, + "loss": 2.2725, + "step": 237 + }, + { + "epoch": 0.07838939437605501, + "grad_norm": 2.8530819416046143, + "learning_rate": 9.902510136281989e-05, + "loss": 2.1286, + "step": 238 + }, + { + "epoch": 0.07871876157931575, + "grad_norm": 3.382469892501831, + "learning_rate": 9.901473680421833e-05, + "loss": 2.5109, + "step": 239 + }, + { + "epoch": 0.07904812878257647, + "grad_norm": 2.7474164962768555, + "learning_rate": 9.900431798974158e-05, + "loss": 2.0808, + "step": 240 + }, + { + "epoch": 0.07937749598583721, + "grad_norm": 2.889378070831299, + "learning_rate": 9.899384493092252e-05, + "loss": 2.6418, + "step": 241 + }, + { + "epoch": 0.07970686318909795, + "grad_norm": 2.788848876953125, + "learning_rate": 9.89833176393541e-05, + "loss": 2.4061, + "step": 242 + }, + { + "epoch": 0.08003623039235869, + "grad_norm": 2.540822982788086, + "learning_rate": 9.897273612668927e-05, + "loss": 1.9808, + "step": 243 + }, + { + "epoch": 0.08036559759561941, + "grad_norm": 2.4531843662261963, + "learning_rate": 9.896210040464105e-05, + "loss": 1.9014, + "step": 244 + }, + { + "epoch": 0.08069496479888015, + "grad_norm": 3.2541840076446533, + "learning_rate": 9.895141048498244e-05, + "loss": 2.5161, + "step": 245 + }, + { + "epoch": 0.08102433200214089, + "grad_norm": 2.396268367767334, + "learning_rate": 9.89406663795464e-05, + "loss": 1.944, + "step": 246 + }, + { + "epoch": 0.08135369920540163, + "grad_norm": 3.345994710922241, + "learning_rate": 9.892986810022594e-05, + "loss": 2.4834, + "step": 247 + }, + { + "epoch": 0.08168306640866235, + "grad_norm": 2.90889048576355, + "learning_rate": 9.891901565897397e-05, + "loss": 2.041, + "step": 248 + }, + { + "epoch": 0.08201243361192309, + "grad_norm": 3.424887180328369, + "learning_rate": 9.89081090678034e-05, + "loss": 2.1515, + "step": 249 + }, + { + "epoch": 0.08234180081518383, + "grad_norm": 3.129890203475952, + "learning_rate": 9.889714833878705e-05, + "loss": 1.5253, + "step": 250 + }, + { + "epoch": 0.08267116801844457, + "grad_norm": 2.033660411834717, + "learning_rate": 9.888613348405766e-05, + "loss": 2.4284, + "step": 251 + }, + { + "epoch": 0.0830005352217053, + "grad_norm": 2.726742744445801, + "learning_rate": 9.887506451580794e-05, + "loss": 2.5739, + "step": 252 + }, + { + "epoch": 0.08332990242496603, + "grad_norm": 2.1317944526672363, + "learning_rate": 9.886394144629044e-05, + "loss": 2.3195, + "step": 253 + }, + { + "epoch": 0.08365926962822677, + "grad_norm": 2.2404332160949707, + "learning_rate": 9.885276428781763e-05, + "loss": 2.6881, + "step": 254 + }, + { + "epoch": 0.08398863683148751, + "grad_norm": 2.376636028289795, + "learning_rate": 9.88415330527618e-05, + "loss": 2.2499, + "step": 255 + }, + { + "epoch": 0.08431800403474823, + "grad_norm": 2.24052095413208, + "learning_rate": 9.88302477535552e-05, + "loss": 2.4867, + "step": 256 + }, + { + "epoch": 0.08464737123800897, + "grad_norm": 2.6284337043762207, + "learning_rate": 9.881890840268981e-05, + "loss": 2.5267, + "step": 257 + }, + { + "epoch": 0.08497673844126971, + "grad_norm": 2.5630033016204834, + "learning_rate": 9.880751501271755e-05, + "loss": 2.3627, + "step": 258 + }, + { + "epoch": 0.08530610564453045, + "grad_norm": 2.3372888565063477, + "learning_rate": 9.879606759625004e-05, + "loss": 2.4922, + "step": 259 + }, + { + "epoch": 0.08563547284779117, + "grad_norm": 2.3916890621185303, + "learning_rate": 9.878456616595882e-05, + "loss": 2.1065, + "step": 260 + }, + { + "epoch": 0.08596484005105191, + "grad_norm": 2.6594431400299072, + "learning_rate": 9.877301073457515e-05, + "loss": 2.034, + "step": 261 + }, + { + "epoch": 0.08629420725431265, + "grad_norm": 2.3828513622283936, + "learning_rate": 9.876140131489008e-05, + "loss": 1.9415, + "step": 262 + }, + { + "epoch": 0.08662357445757339, + "grad_norm": 2.558377265930176, + "learning_rate": 9.874973791975442e-05, + "loss": 2.1253, + "step": 263 + }, + { + "epoch": 0.08695294166083413, + "grad_norm": 2.41732120513916, + "learning_rate": 9.873802056207872e-05, + "loss": 2.4188, + "step": 264 + }, + { + "epoch": 0.08728230886409485, + "grad_norm": 2.653940439224243, + "learning_rate": 9.872624925483329e-05, + "loss": 2.1664, + "step": 265 + }, + { + "epoch": 0.08761167606735559, + "grad_norm": 2.6138205528259277, + "learning_rate": 9.871442401104816e-05, + "loss": 1.9422, + "step": 266 + }, + { + "epoch": 0.08794104327061633, + "grad_norm": 2.4396393299102783, + "learning_rate": 9.870254484381299e-05, + "loss": 2.2988, + "step": 267 + }, + { + "epoch": 0.08827041047387707, + "grad_norm": 3.269818067550659, + "learning_rate": 9.869061176627724e-05, + "loss": 2.4621, + "step": 268 + }, + { + "epoch": 0.0885997776771378, + "grad_norm": 2.802405595779419, + "learning_rate": 9.867862479164996e-05, + "loss": 2.0724, + "step": 269 + }, + { + "epoch": 0.08892914488039853, + "grad_norm": 3.132948160171509, + "learning_rate": 9.866658393319988e-05, + "loss": 2.3857, + "step": 270 + }, + { + "epoch": 0.08925851208365927, + "grad_norm": 3.446258544921875, + "learning_rate": 9.865448920425541e-05, + "loss": 2.1038, + "step": 271 + }, + { + "epoch": 0.08958787928692001, + "grad_norm": 3.5878865718841553, + "learning_rate": 9.864234061820458e-05, + "loss": 2.4794, + "step": 272 + }, + { + "epoch": 0.08991724649018074, + "grad_norm": 2.7170724868774414, + "learning_rate": 9.863013818849499e-05, + "loss": 2.0187, + "step": 273 + }, + { + "epoch": 0.09024661369344147, + "grad_norm": 3.4046213626861572, + "learning_rate": 9.861788192863388e-05, + "loss": 1.9167, + "step": 274 + }, + { + "epoch": 0.09057598089670221, + "grad_norm": 4.091611385345459, + "learning_rate": 9.860557185218808e-05, + "loss": 2.203, + "step": 275 + }, + { + "epoch": 0.09090534809996295, + "grad_norm": 1.9869258403778076, + "learning_rate": 9.859320797278397e-05, + "loss": 2.4434, + "step": 276 + }, + { + "epoch": 0.09123471530322368, + "grad_norm": 2.5179264545440674, + "learning_rate": 9.85807903041075e-05, + "loss": 2.2607, + "step": 277 + }, + { + "epoch": 0.09156408250648441, + "grad_norm": 2.7460920810699463, + "learning_rate": 9.856831885990416e-05, + "loss": 2.5456, + "step": 278 + }, + { + "epoch": 0.09189344970974515, + "grad_norm": 2.803849458694458, + "learning_rate": 9.855579365397898e-05, + "loss": 2.5656, + "step": 279 + }, + { + "epoch": 0.09222281691300589, + "grad_norm": 2.4078571796417236, + "learning_rate": 9.854321470019646e-05, + "loss": 2.4701, + "step": 280 + }, + { + "epoch": 0.09255218411626662, + "grad_norm": 2.735297679901123, + "learning_rate": 9.853058201248063e-05, + "loss": 2.4882, + "step": 281 + }, + { + "epoch": 0.09288155131952736, + "grad_norm": 2.441884756088257, + "learning_rate": 9.851789560481499e-05, + "loss": 2.2741, + "step": 282 + }, + { + "epoch": 0.0932109185227881, + "grad_norm": 2.480804681777954, + "learning_rate": 9.85051554912425e-05, + "loss": 2.2021, + "step": 283 + }, + { + "epoch": 0.09354028572604883, + "grad_norm": 2.59104585647583, + "learning_rate": 9.849236168586558e-05, + "loss": 2.7452, + "step": 284 + }, + { + "epoch": 0.09386965292930956, + "grad_norm": 2.392718553543091, + "learning_rate": 9.847951420284605e-05, + "loss": 2.231, + "step": 285 + }, + { + "epoch": 0.0941990201325703, + "grad_norm": 2.880892753601074, + "learning_rate": 9.84666130564052e-05, + "loss": 2.5514, + "step": 286 + }, + { + "epoch": 0.09452838733583104, + "grad_norm": 2.457305431365967, + "learning_rate": 9.845365826082368e-05, + "loss": 2.261, + "step": 287 + }, + { + "epoch": 0.09485775453909177, + "grad_norm": 2.8255691528320312, + "learning_rate": 9.844064983044157e-05, + "loss": 2.4296, + "step": 288 + }, + { + "epoch": 0.0951871217423525, + "grad_norm": 2.97965407371521, + "learning_rate": 9.842758777965824e-05, + "loss": 2.3662, + "step": 289 + }, + { + "epoch": 0.09551648894561324, + "grad_norm": 2.454676389694214, + "learning_rate": 9.841447212293249e-05, + "loss": 2.1213, + "step": 290 + }, + { + "epoch": 0.09584585614887398, + "grad_norm": 3.022413492202759, + "learning_rate": 9.840130287478245e-05, + "loss": 2.4408, + "step": 291 + }, + { + "epoch": 0.09617522335213471, + "grad_norm": 3.0308666229248047, + "learning_rate": 9.83880800497855e-05, + "loss": 2.5066, + "step": 292 + }, + { + "epoch": 0.09650459055539545, + "grad_norm": 2.674705743789673, + "learning_rate": 9.837480366257844e-05, + "loss": 2.2084, + "step": 293 + }, + { + "epoch": 0.09683395775865618, + "grad_norm": 2.988152503967285, + "learning_rate": 9.836147372785726e-05, + "loss": 2.2093, + "step": 294 + }, + { + "epoch": 0.09716332496191692, + "grad_norm": 2.502009868621826, + "learning_rate": 9.834809026037728e-05, + "loss": 2.259, + "step": 295 + }, + { + "epoch": 0.09749269216517766, + "grad_norm": 2.590487241744995, + "learning_rate": 9.833465327495306e-05, + "loss": 2.1714, + "step": 296 + }, + { + "epoch": 0.0978220593684384, + "grad_norm": 2.34224534034729, + "learning_rate": 9.83211627864584e-05, + "loss": 1.7337, + "step": 297 + }, + { + "epoch": 0.09815142657169912, + "grad_norm": 2.985863447189331, + "learning_rate": 9.83076188098263e-05, + "loss": 2.0128, + "step": 298 + }, + { + "epoch": 0.09848079377495986, + "grad_norm": 2.8273167610168457, + "learning_rate": 9.829402136004904e-05, + "loss": 2.0277, + "step": 299 + }, + { + "epoch": 0.0988101609782206, + "grad_norm": 3.533780813217163, + "learning_rate": 9.8280370452178e-05, + "loss": 2.0994, + "step": 300 + }, + { + "epoch": 0.09913952818148133, + "grad_norm": 2.0469958782196045, + "learning_rate": 9.82666661013238e-05, + "loss": 2.4583, + "step": 301 + }, + { + "epoch": 0.09946889538474206, + "grad_norm": 2.3586039543151855, + "learning_rate": 9.825290832265617e-05, + "loss": 2.4677, + "step": 302 + }, + { + "epoch": 0.0997982625880028, + "grad_norm": 2.269946813583374, + "learning_rate": 9.823909713140404e-05, + "loss": 2.3393, + "step": 303 + }, + { + "epoch": 0.10012762979126354, + "grad_norm": 2.338125705718994, + "learning_rate": 9.82252325428554e-05, + "loss": 2.4787, + "step": 304 + }, + { + "epoch": 0.10045699699452428, + "grad_norm": 2.1585426330566406, + "learning_rate": 9.821131457235739e-05, + "loss": 2.5393, + "step": 305 + }, + { + "epoch": 0.100786364197785, + "grad_norm": 2.6568026542663574, + "learning_rate": 9.819734323531624e-05, + "loss": 2.5194, + "step": 306 + }, + { + "epoch": 0.10111573140104574, + "grad_norm": 2.206839084625244, + "learning_rate": 9.818331854719722e-05, + "loss": 2.4154, + "step": 307 + }, + { + "epoch": 0.10144509860430648, + "grad_norm": 2.444082498550415, + "learning_rate": 9.816924052352468e-05, + "loss": 2.2583, + "step": 308 + }, + { + "epoch": 0.10177446580756722, + "grad_norm": 2.4031789302825928, + "learning_rate": 9.815510917988202e-05, + "loss": 2.5014, + "step": 309 + }, + { + "epoch": 0.10210383301082794, + "grad_norm": 2.008598566055298, + "learning_rate": 9.814092453191163e-05, + "loss": 2.1755, + "step": 310 + }, + { + "epoch": 0.10243320021408868, + "grad_norm": 2.6430673599243164, + "learning_rate": 9.812668659531492e-05, + "loss": 2.4391, + "step": 311 + }, + { + "epoch": 0.10276256741734942, + "grad_norm": 2.4818711280822754, + "learning_rate": 9.811239538585229e-05, + "loss": 2.2518, + "step": 312 + }, + { + "epoch": 0.10309193462061016, + "grad_norm": 2.733666181564331, + "learning_rate": 9.80980509193431e-05, + "loss": 2.3118, + "step": 313 + }, + { + "epoch": 0.10342130182387088, + "grad_norm": 2.3446598052978516, + "learning_rate": 9.808365321166568e-05, + "loss": 2.3457, + "step": 314 + }, + { + "epoch": 0.10375066902713162, + "grad_norm": 2.2961266040802, + "learning_rate": 9.806920227875729e-05, + "loss": 1.9795, + "step": 315 + }, + { + "epoch": 0.10408003623039236, + "grad_norm": 2.767897844314575, + "learning_rate": 9.805469813661408e-05, + "loss": 2.2274, + "step": 316 + }, + { + "epoch": 0.1044094034336531, + "grad_norm": 2.9815187454223633, + "learning_rate": 9.804014080129111e-05, + "loss": 2.4279, + "step": 317 + }, + { + "epoch": 0.10473877063691384, + "grad_norm": 2.3294548988342285, + "learning_rate": 9.802553028890237e-05, + "loss": 1.9681, + "step": 318 + }, + { + "epoch": 0.10506813784017456, + "grad_norm": 3.110809564590454, + "learning_rate": 9.801086661562062e-05, + "loss": 2.3353, + "step": 319 + }, + { + "epoch": 0.1053975050434353, + "grad_norm": 2.6092398166656494, + "learning_rate": 9.799614979767757e-05, + "loss": 2.1682, + "step": 320 + }, + { + "epoch": 0.10572687224669604, + "grad_norm": 3.033212184906006, + "learning_rate": 9.798137985136367e-05, + "loss": 1.9523, + "step": 321 + }, + { + "epoch": 0.10605623944995678, + "grad_norm": 2.9443624019622803, + "learning_rate": 9.79665567930282e-05, + "loss": 2.1126, + "step": 322 + }, + { + "epoch": 0.1063856066532175, + "grad_norm": 3.2337043285369873, + "learning_rate": 9.795168063907929e-05, + "loss": 2.1455, + "step": 323 + }, + { + "epoch": 0.10671497385647824, + "grad_norm": 2.5773916244506836, + "learning_rate": 9.793675140598377e-05, + "loss": 1.7072, + "step": 324 + }, + { + "epoch": 0.10704434105973898, + "grad_norm": 3.643908977508545, + "learning_rate": 9.792176911026727e-05, + "loss": 2.2543, + "step": 325 + }, + { + "epoch": 0.10737370826299972, + "grad_norm": 1.8910375833511353, + "learning_rate": 9.790673376851414e-05, + "loss": 2.5806, + "step": 326 + }, + { + "epoch": 0.10770307546626044, + "grad_norm": 2.3565833568573, + "learning_rate": 9.789164539736741e-05, + "loss": 2.4109, + "step": 327 + }, + { + "epoch": 0.10803244266952118, + "grad_norm": 2.3572747707366943, + "learning_rate": 9.78765040135289e-05, + "loss": 2.3321, + "step": 328 + }, + { + "epoch": 0.10836180987278192, + "grad_norm": 2.759070873260498, + "learning_rate": 9.786130963375904e-05, + "loss": 2.4526, + "step": 329 + }, + { + "epoch": 0.10869117707604266, + "grad_norm": 2.419929027557373, + "learning_rate": 9.784606227487693e-05, + "loss": 2.1635, + "step": 330 + }, + { + "epoch": 0.10902054427930338, + "grad_norm": 2.4334778785705566, + "learning_rate": 9.783076195376036e-05, + "loss": 2.5785, + "step": 331 + }, + { + "epoch": 0.10934991148256412, + "grad_norm": 2.6062378883361816, + "learning_rate": 9.781540868734567e-05, + "loss": 2.4172, + "step": 332 + }, + { + "epoch": 0.10967927868582486, + "grad_norm": 2.5069472789764404, + "learning_rate": 9.780000249262787e-05, + "loss": 2.3606, + "step": 333 + }, + { + "epoch": 0.1100086458890856, + "grad_norm": 2.3731164932250977, + "learning_rate": 9.778454338666053e-05, + "loss": 2.3024, + "step": 334 + }, + { + "epoch": 0.11033801309234632, + "grad_norm": 2.8083693981170654, + "learning_rate": 9.776903138655581e-05, + "loss": 2.426, + "step": 335 + }, + { + "epoch": 0.11066738029560706, + "grad_norm": 2.4966108798980713, + "learning_rate": 9.775346650948439e-05, + "loss": 2.2338, + "step": 336 + }, + { + "epoch": 0.1109967474988678, + "grad_norm": 2.9775607585906982, + "learning_rate": 9.77378487726755e-05, + "loss": 2.7963, + "step": 337 + }, + { + "epoch": 0.11132611470212854, + "grad_norm": 2.692918062210083, + "learning_rate": 9.77221781934169e-05, + "loss": 2.4771, + "step": 338 + }, + { + "epoch": 0.11165548190538926, + "grad_norm": 2.5929603576660156, + "learning_rate": 9.770645478905481e-05, + "loss": 2.4814, + "step": 339 + }, + { + "epoch": 0.11198484910865, + "grad_norm": 2.6808626651763916, + "learning_rate": 9.76906785769939e-05, + "loss": 2.1842, + "step": 340 + }, + { + "epoch": 0.11231421631191074, + "grad_norm": 2.846548080444336, + "learning_rate": 9.767484957469739e-05, + "loss": 2.1864, + "step": 341 + }, + { + "epoch": 0.11264358351517148, + "grad_norm": 2.9507384300231934, + "learning_rate": 9.765896779968685e-05, + "loss": 2.3369, + "step": 342 + }, + { + "epoch": 0.11297295071843222, + "grad_norm": 3.360044002532959, + "learning_rate": 9.764303326954226e-05, + "loss": 2.2568, + "step": 343 + }, + { + "epoch": 0.11330231792169294, + "grad_norm": 2.5054543018341064, + "learning_rate": 9.762704600190207e-05, + "loss": 1.8953, + "step": 344 + }, + { + "epoch": 0.11363168512495368, + "grad_norm": 3.037480592727661, + "learning_rate": 9.761100601446304e-05, + "loss": 2.4476, + "step": 345 + }, + { + "epoch": 0.11396105232821442, + "grad_norm": 3.0009777545928955, + "learning_rate": 9.759491332498032e-05, + "loss": 2.3728, + "step": 346 + }, + { + "epoch": 0.11429041953147516, + "grad_norm": 2.856393814086914, + "learning_rate": 9.757876795126739e-05, + "loss": 1.9786, + "step": 347 + }, + { + "epoch": 0.11461978673473588, + "grad_norm": 3.3026115894317627, + "learning_rate": 9.756256991119603e-05, + "loss": 2.6015, + "step": 348 + }, + { + "epoch": 0.11494915393799662, + "grad_norm": 3.5809285640716553, + "learning_rate": 9.754631922269636e-05, + "loss": 2.307, + "step": 349 + }, + { + "epoch": 0.11527852114125736, + "grad_norm": 3.11942195892334, + "learning_rate": 9.753001590375674e-05, + "loss": 2.0157, + "step": 350 + }, + { + "epoch": 0.1156078883445181, + "grad_norm": 2.313547372817993, + "learning_rate": 9.75136599724238e-05, + "loss": 2.4783, + "step": 351 + }, + { + "epoch": 0.11593725554777883, + "grad_norm": 2.4683542251586914, + "learning_rate": 9.74972514468024e-05, + "loss": 2.4294, + "step": 352 + }, + { + "epoch": 0.11626662275103956, + "grad_norm": 2.895655632019043, + "learning_rate": 9.748079034505565e-05, + "loss": 2.6217, + "step": 353 + }, + { + "epoch": 0.1165959899543003, + "grad_norm": 2.646327495574951, + "learning_rate": 9.746427668540481e-05, + "loss": 2.5583, + "step": 354 + }, + { + "epoch": 0.11692535715756104, + "grad_norm": 2.4664547443389893, + "learning_rate": 9.744771048612935e-05, + "loss": 2.4272, + "step": 355 + }, + { + "epoch": 0.11725472436082177, + "grad_norm": 2.5208280086517334, + "learning_rate": 9.743109176556689e-05, + "loss": 2.7008, + "step": 356 + }, + { + "epoch": 0.1175840915640825, + "grad_norm": 2.5895395278930664, + "learning_rate": 9.741442054211319e-05, + "loss": 2.4581, + "step": 357 + }, + { + "epoch": 0.11791345876734324, + "grad_norm": 2.3770651817321777, + "learning_rate": 9.739769683422214e-05, + "loss": 2.2697, + "step": 358 + }, + { + "epoch": 0.11824282597060398, + "grad_norm": 2.263289451599121, + "learning_rate": 9.738092066040568e-05, + "loss": 2.1457, + "step": 359 + }, + { + "epoch": 0.1185721931738647, + "grad_norm": 3.5201501846313477, + "learning_rate": 9.736409203923388e-05, + "loss": 2.4604, + "step": 360 + }, + { + "epoch": 0.11890156037712545, + "grad_norm": 2.3830199241638184, + "learning_rate": 9.734721098933484e-05, + "loss": 2.0263, + "step": 361 + }, + { + "epoch": 0.11923092758038618, + "grad_norm": 2.431260347366333, + "learning_rate": 9.73302775293947e-05, + "loss": 2.2848, + "step": 362 + }, + { + "epoch": 0.11956029478364692, + "grad_norm": 2.4053165912628174, + "learning_rate": 9.73132916781576e-05, + "loss": 2.0885, + "step": 363 + }, + { + "epoch": 0.11988966198690765, + "grad_norm": 2.584679126739502, + "learning_rate": 9.72962534544257e-05, + "loss": 2.2592, + "step": 364 + }, + { + "epoch": 0.12021902919016839, + "grad_norm": 2.6707746982574463, + "learning_rate": 9.727916287705912e-05, + "loss": 2.135, + "step": 365 + }, + { + "epoch": 0.12054839639342912, + "grad_norm": 2.802182197570801, + "learning_rate": 9.72620199649759e-05, + "loss": 2.6641, + "step": 366 + }, + { + "epoch": 0.12087776359668986, + "grad_norm": 3.923982620239258, + "learning_rate": 9.724482473715207e-05, + "loss": 2.3298, + "step": 367 + }, + { + "epoch": 0.12120713079995059, + "grad_norm": 2.7437329292297363, + "learning_rate": 9.722757721262154e-05, + "loss": 2.2673, + "step": 368 + }, + { + "epoch": 0.12153649800321133, + "grad_norm": 2.640639543533325, + "learning_rate": 9.72102774104761e-05, + "loss": 2.2149, + "step": 369 + }, + { + "epoch": 0.12186586520647207, + "grad_norm": 3.37756609916687, + "learning_rate": 9.719292534986543e-05, + "loss": 2.0474, + "step": 370 + }, + { + "epoch": 0.1221952324097328, + "grad_norm": 2.524691581726074, + "learning_rate": 9.717552104999703e-05, + "loss": 2.0271, + "step": 371 + }, + { + "epoch": 0.12252459961299354, + "grad_norm": 2.769646406173706, + "learning_rate": 9.715806453013625e-05, + "loss": 2.0452, + "step": 372 + }, + { + "epoch": 0.12285396681625427, + "grad_norm": 2.9961395263671875, + "learning_rate": 9.714055580960622e-05, + "loss": 1.9754, + "step": 373 + }, + { + "epoch": 0.123183334019515, + "grad_norm": 3.112914800643921, + "learning_rate": 9.712299490778786e-05, + "loss": 2.1425, + "step": 374 + }, + { + "epoch": 0.12351270122277574, + "grad_norm": 3.4256157875061035, + "learning_rate": 9.710538184411991e-05, + "loss": 1.9117, + "step": 375 + }, + { + "epoch": 0.12384206842603648, + "grad_norm": 1.9335092306137085, + "learning_rate": 9.708771663809872e-05, + "loss": 2.4769, + "step": 376 + }, + { + "epoch": 0.12417143562929721, + "grad_norm": 2.1666107177734375, + "learning_rate": 9.706999930927848e-05, + "loss": 2.2265, + "step": 377 + }, + { + "epoch": 0.12450080283255795, + "grad_norm": 2.2735953330993652, + "learning_rate": 9.7052229877271e-05, + "loss": 2.4031, + "step": 378 + }, + { + "epoch": 0.12483017003581869, + "grad_norm": 2.531383514404297, + "learning_rate": 9.703440836174583e-05, + "loss": 2.4251, + "step": 379 + }, + { + "epoch": 0.1251595372390794, + "grad_norm": 2.608523368835449, + "learning_rate": 9.701653478243013e-05, + "loss": 2.4985, + "step": 380 + }, + { + "epoch": 0.12548890444234015, + "grad_norm": 2.4094133377075195, + "learning_rate": 9.699860915910868e-05, + "loss": 1.9842, + "step": 381 + }, + { + "epoch": 0.1258182716456009, + "grad_norm": 2.278822183609009, + "learning_rate": 9.698063151162389e-05, + "loss": 2.0549, + "step": 382 + }, + { + "epoch": 0.12614763884886163, + "grad_norm": 2.65533709526062, + "learning_rate": 9.696260185987576e-05, + "loss": 2.4869, + "step": 383 + }, + { + "epoch": 0.12647700605212236, + "grad_norm": 2.8089699745178223, + "learning_rate": 9.694452022382186e-05, + "loss": 2.3468, + "step": 384 + }, + { + "epoch": 0.1268063732553831, + "grad_norm": 2.4362573623657227, + "learning_rate": 9.692638662347728e-05, + "loss": 2.5076, + "step": 385 + }, + { + "epoch": 0.12713574045864384, + "grad_norm": 3.138957977294922, + "learning_rate": 9.690820107891466e-05, + "loss": 2.3684, + "step": 386 + }, + { + "epoch": 0.12746510766190455, + "grad_norm": 3.3768868446350098, + "learning_rate": 9.68899636102641e-05, + "loss": 2.7286, + "step": 387 + }, + { + "epoch": 0.1277944748651653, + "grad_norm": 2.5961859226226807, + "learning_rate": 9.68716742377132e-05, + "loss": 2.1429, + "step": 388 + }, + { + "epoch": 0.12812384206842603, + "grad_norm": 2.7435123920440674, + "learning_rate": 9.685333298150702e-05, + "loss": 2.2378, + "step": 389 + }, + { + "epoch": 0.12845320927168677, + "grad_norm": 2.207853317260742, + "learning_rate": 9.683493986194808e-05, + "loss": 2.0057, + "step": 390 + }, + { + "epoch": 0.1287825764749475, + "grad_norm": 3.450223922729492, + "learning_rate": 9.681649489939619e-05, + "loss": 2.5243, + "step": 391 + }, + { + "epoch": 0.12911194367820825, + "grad_norm": 2.479057788848877, + "learning_rate": 9.67979981142687e-05, + "loss": 1.9977, + "step": 392 + }, + { + "epoch": 0.12944131088146899, + "grad_norm": 2.4462480545043945, + "learning_rate": 9.677944952704023e-05, + "loss": 1.9085, + "step": 393 + }, + { + "epoch": 0.12977067808472972, + "grad_norm": 3.183197021484375, + "learning_rate": 9.676084915824276e-05, + "loss": 2.6043, + "step": 394 + }, + { + "epoch": 0.13010004528799043, + "grad_norm": 2.802366256713867, + "learning_rate": 9.674219702846561e-05, + "loss": 2.3849, + "step": 395 + }, + { + "epoch": 0.13042941249125117, + "grad_norm": 3.248178482055664, + "learning_rate": 9.672349315835535e-05, + "loss": 2.3824, + "step": 396 + }, + { + "epoch": 0.1307587796945119, + "grad_norm": 2.677258253097534, + "learning_rate": 9.670473756861588e-05, + "loss": 1.8382, + "step": 397 + }, + { + "epoch": 0.13108814689777265, + "grad_norm": 2.8905255794525146, + "learning_rate": 9.668593028000831e-05, + "loss": 1.9997, + "step": 398 + }, + { + "epoch": 0.1314175141010334, + "grad_norm": 2.8859400749206543, + "learning_rate": 9.6667071313351e-05, + "loss": 1.904, + "step": 399 + }, + { + "epoch": 0.13174688130429413, + "grad_norm": 3.3241395950317383, + "learning_rate": 9.664816068951947e-05, + "loss": 1.7025, + "step": 400 + }, + { + "epoch": 0.13207624850755487, + "grad_norm": 2.0233919620513916, + "learning_rate": 9.662919842944651e-05, + "loss": 2.3296, + "step": 401 + }, + { + "epoch": 0.1324056157108156, + "grad_norm": 2.4507946968078613, + "learning_rate": 9.661018455412197e-05, + "loss": 2.431, + "step": 402 + }, + { + "epoch": 0.13273498291407634, + "grad_norm": 2.341461658477783, + "learning_rate": 9.659111908459288e-05, + "loss": 2.5369, + "step": 403 + }, + { + "epoch": 0.13306435011733705, + "grad_norm": 2.3247382640838623, + "learning_rate": 9.657200204196337e-05, + "loss": 2.3336, + "step": 404 + }, + { + "epoch": 0.1333937173205978, + "grad_norm": 2.6917145252227783, + "learning_rate": 9.65528334473947e-05, + "loss": 2.5245, + "step": 405 + }, + { + "epoch": 0.13372308452385853, + "grad_norm": 2.5759096145629883, + "learning_rate": 9.653361332210513e-05, + "loss": 2.5367, + "step": 406 + }, + { + "epoch": 0.13405245172711927, + "grad_norm": 3.123220682144165, + "learning_rate": 9.651434168737e-05, + "loss": 2.3026, + "step": 407 + }, + { + "epoch": 0.13438181893038, + "grad_norm": 1.9661033153533936, + "learning_rate": 9.649501856452165e-05, + "loss": 1.9149, + "step": 408 + }, + { + "epoch": 0.13471118613364075, + "grad_norm": 2.2201578617095947, + "learning_rate": 9.647564397494944e-05, + "loss": 2.4501, + "step": 409 + }, + { + "epoch": 0.1350405533369015, + "grad_norm": 2.485238790512085, + "learning_rate": 9.645621794009967e-05, + "loss": 2.3879, + "step": 410 + }, + { + "epoch": 0.13536992054016223, + "grad_norm": 2.429931163787842, + "learning_rate": 9.643674048147558e-05, + "loss": 2.3842, + "step": 411 + }, + { + "epoch": 0.13569928774342294, + "grad_norm": 2.674360513687134, + "learning_rate": 9.641721162063739e-05, + "loss": 2.2783, + "step": 412 + }, + { + "epoch": 0.13602865494668367, + "grad_norm": 2.56316876411438, + "learning_rate": 9.639763137920214e-05, + "loss": 2.2338, + "step": 413 + }, + { + "epoch": 0.1363580221499444, + "grad_norm": 2.6116116046905518, + "learning_rate": 9.637799977884381e-05, + "loss": 2.2164, + "step": 414 + }, + { + "epoch": 0.13668738935320515, + "grad_norm": 2.379546880722046, + "learning_rate": 9.635831684129318e-05, + "loss": 1.8579, + "step": 415 + }, + { + "epoch": 0.1370167565564659, + "grad_norm": 2.250623941421509, + "learning_rate": 9.63385825883379e-05, + "loss": 1.9759, + "step": 416 + }, + { + "epoch": 0.13734612375972663, + "grad_norm": 2.5346574783325195, + "learning_rate": 9.63187970418224e-05, + "loss": 2.1965, + "step": 417 + }, + { + "epoch": 0.13767549096298737, + "grad_norm": 2.7046220302581787, + "learning_rate": 9.62989602236479e-05, + "loss": 2.207, + "step": 418 + }, + { + "epoch": 0.1380048581662481, + "grad_norm": 2.8171615600585938, + "learning_rate": 9.627907215577236e-05, + "loss": 2.1648, + "step": 419 + }, + { + "epoch": 0.13833422536950882, + "grad_norm": 2.6681857109069824, + "learning_rate": 9.625913286021046e-05, + "loss": 2.2677, + "step": 420 + }, + { + "epoch": 0.13866359257276956, + "grad_norm": 2.8763694763183594, + "learning_rate": 9.623914235903362e-05, + "loss": 1.9661, + "step": 421 + }, + { + "epoch": 0.1389929597760303, + "grad_norm": 2.8630616664886475, + "learning_rate": 9.621910067436992e-05, + "loss": 2.1464, + "step": 422 + }, + { + "epoch": 0.13932232697929103, + "grad_norm": 3.0975148677825928, + "learning_rate": 9.61990078284041e-05, + "loss": 2.1959, + "step": 423 + }, + { + "epoch": 0.13965169418255177, + "grad_norm": 3.8912901878356934, + "learning_rate": 9.617886384337751e-05, + "loss": 2.1552, + "step": 424 + }, + { + "epoch": 0.1399810613858125, + "grad_norm": 3.1637001037597656, + "learning_rate": 9.615866874158816e-05, + "loss": 1.8698, + "step": 425 + }, + { + "epoch": 0.14031042858907325, + "grad_norm": 2.0410444736480713, + "learning_rate": 9.613842254539058e-05, + "loss": 2.6313, + "step": 426 + }, + { + "epoch": 0.140639795792334, + "grad_norm": 2.081902503967285, + "learning_rate": 9.611812527719593e-05, + "loss": 2.444, + "step": 427 + }, + { + "epoch": 0.14096916299559473, + "grad_norm": 2.232922077178955, + "learning_rate": 9.609777695947182e-05, + "loss": 2.1034, + "step": 428 + }, + { + "epoch": 0.14129853019885544, + "grad_norm": 2.2757458686828613, + "learning_rate": 9.607737761474242e-05, + "loss": 2.5483, + "step": 429 + }, + { + "epoch": 0.14162789740211618, + "grad_norm": 2.3721635341644287, + "learning_rate": 9.60569272655884e-05, + "loss": 2.6453, + "step": 430 + }, + { + "epoch": 0.14195726460537691, + "grad_norm": 2.366621971130371, + "learning_rate": 9.603642593464683e-05, + "loss": 2.3196, + "step": 431 + }, + { + "epoch": 0.14228663180863765, + "grad_norm": 2.4534189701080322, + "learning_rate": 9.601587364461127e-05, + "loss": 2.4592, + "step": 432 + }, + { + "epoch": 0.1426159990118984, + "grad_norm": 2.350576639175415, + "learning_rate": 9.599527041823164e-05, + "loss": 2.0809, + "step": 433 + }, + { + "epoch": 0.14294536621515913, + "grad_norm": 2.2983639240264893, + "learning_rate": 9.59746162783143e-05, + "loss": 2.1881, + "step": 434 + }, + { + "epoch": 0.14327473341841987, + "grad_norm": 2.5199313163757324, + "learning_rate": 9.595391124772189e-05, + "loss": 2.2941, + "step": 435 + }, + { + "epoch": 0.1436041006216806, + "grad_norm": 2.4015860557556152, + "learning_rate": 9.593315534937345e-05, + "loss": 2.2748, + "step": 436 + }, + { + "epoch": 0.14393346782494132, + "grad_norm": 2.675955057144165, + "learning_rate": 9.591234860624431e-05, + "loss": 2.1505, + "step": 437 + }, + { + "epoch": 0.14426283502820206, + "grad_norm": 2.4210708141326904, + "learning_rate": 9.589149104136605e-05, + "loss": 2.2871, + "step": 438 + }, + { + "epoch": 0.1445922022314628, + "grad_norm": 2.417851686477661, + "learning_rate": 9.587058267782656e-05, + "loss": 2.1308, + "step": 439 + }, + { + "epoch": 0.14492156943472354, + "grad_norm": 2.456392526626587, + "learning_rate": 9.584962353876992e-05, + "loss": 1.8146, + "step": 440 + }, + { + "epoch": 0.14525093663798427, + "grad_norm": 2.4803600311279297, + "learning_rate": 9.582861364739642e-05, + "loss": 2.2325, + "step": 441 + }, + { + "epoch": 0.145580303841245, + "grad_norm": 2.4525747299194336, + "learning_rate": 9.580755302696256e-05, + "loss": 2.0824, + "step": 442 + }, + { + "epoch": 0.14590967104450575, + "grad_norm": 3.1618423461914062, + "learning_rate": 9.578644170078093e-05, + "loss": 2.3309, + "step": 443 + }, + { + "epoch": 0.1462390382477665, + "grad_norm": 3.0864148139953613, + "learning_rate": 9.576527969222031e-05, + "loss": 2.2085, + "step": 444 + }, + { + "epoch": 0.1465684054510272, + "grad_norm": 2.7343194484710693, + "learning_rate": 9.574406702470558e-05, + "loss": 2.2462, + "step": 445 + }, + { + "epoch": 0.14689777265428794, + "grad_norm": 2.57436466217041, + "learning_rate": 9.572280372171763e-05, + "loss": 2.0448, + "step": 446 + }, + { + "epoch": 0.14722713985754868, + "grad_norm": 2.608705759048462, + "learning_rate": 9.570148980679347e-05, + "loss": 1.9546, + "step": 447 + }, + { + "epoch": 0.14755650706080942, + "grad_norm": 2.5100154876708984, + "learning_rate": 9.56801253035261e-05, + "loss": 1.6897, + "step": 448 + }, + { + "epoch": 0.14788587426407016, + "grad_norm": 3.806117296218872, + "learning_rate": 9.565871023556455e-05, + "loss": 1.8645, + "step": 449 + }, + { + "epoch": 0.1482152414673309, + "grad_norm": 3.5076465606689453, + "learning_rate": 9.563724462661376e-05, + "loss": 2.2529, + "step": 450 + }, + { + "epoch": 0.14854460867059163, + "grad_norm": 2.111990451812744, + "learning_rate": 9.561572850043467e-05, + "loss": 2.5226, + "step": 451 + }, + { + "epoch": 0.14887397587385237, + "grad_norm": 2.713508367538452, + "learning_rate": 9.559416188084416e-05, + "loss": 2.548, + "step": 452 + }, + { + "epoch": 0.1492033430771131, + "grad_norm": 2.7908146381378174, + "learning_rate": 9.557254479171489e-05, + "loss": 2.8369, + "step": 453 + }, + { + "epoch": 0.14953271028037382, + "grad_norm": 2.3136017322540283, + "learning_rate": 9.555087725697554e-05, + "loss": 2.3652, + "step": 454 + }, + { + "epoch": 0.14986207748363456, + "grad_norm": 2.1749308109283447, + "learning_rate": 9.552915930061048e-05, + "loss": 2.2855, + "step": 455 + }, + { + "epoch": 0.1501914446868953, + "grad_norm": 2.302049398422241, + "learning_rate": 9.550739094666002e-05, + "loss": 2.3987, + "step": 456 + }, + { + "epoch": 0.15052081189015604, + "grad_norm": 2.3191065788269043, + "learning_rate": 9.548557221922017e-05, + "loss": 2.4523, + "step": 457 + }, + { + "epoch": 0.15085017909341678, + "grad_norm": 2.958138942718506, + "learning_rate": 9.546370314244273e-05, + "loss": 2.2964, + "step": 458 + }, + { + "epoch": 0.1511795462966775, + "grad_norm": 2.8905577659606934, + "learning_rate": 9.544178374053524e-05, + "loss": 2.6665, + "step": 459 + }, + { + "epoch": 0.15150891349993825, + "grad_norm": 2.6026418209075928, + "learning_rate": 9.541981403776095e-05, + "loss": 2.5692, + "step": 460 + }, + { + "epoch": 0.151838280703199, + "grad_norm": 2.7903120517730713, + "learning_rate": 9.539779405843876e-05, + "loss": 2.439, + "step": 461 + }, + { + "epoch": 0.1521676479064597, + "grad_norm": 3.0123376846313477, + "learning_rate": 9.537572382694328e-05, + "loss": 2.592, + "step": 462 + }, + { + "epoch": 0.15249701510972044, + "grad_norm": 2.3931968212127686, + "learning_rate": 9.535360336770467e-05, + "loss": 2.2706, + "step": 463 + }, + { + "epoch": 0.15282638231298118, + "grad_norm": 2.501070022583008, + "learning_rate": 9.533143270520873e-05, + "loss": 1.8545, + "step": 464 + }, + { + "epoch": 0.15315574951624192, + "grad_norm": 2.836297035217285, + "learning_rate": 9.530921186399684e-05, + "loss": 2.3613, + "step": 465 + }, + { + "epoch": 0.15348511671950266, + "grad_norm": 2.518571138381958, + "learning_rate": 9.528694086866592e-05, + "loss": 2.1096, + "step": 466 + }, + { + "epoch": 0.1538144839227634, + "grad_norm": 2.5794365406036377, + "learning_rate": 9.526461974386838e-05, + "loss": 2.0714, + "step": 467 + }, + { + "epoch": 0.15414385112602413, + "grad_norm": 2.955522060394287, + "learning_rate": 9.524224851431214e-05, + "loss": 2.1713, + "step": 468 + }, + { + "epoch": 0.15447321832928487, + "grad_norm": 3.465235948562622, + "learning_rate": 9.521982720476062e-05, + "loss": 2.2217, + "step": 469 + }, + { + "epoch": 0.15480258553254558, + "grad_norm": 2.497987985610962, + "learning_rate": 9.519735584003257e-05, + "loss": 1.9994, + "step": 470 + }, + { + "epoch": 0.15513195273580632, + "grad_norm": 2.911043643951416, + "learning_rate": 9.517483444500228e-05, + "loss": 1.9883, + "step": 471 + }, + { + "epoch": 0.15546131993906706, + "grad_norm": 2.7313320636749268, + "learning_rate": 9.51522630445993e-05, + "loss": 2.1227, + "step": 472 + }, + { + "epoch": 0.1557906871423278, + "grad_norm": 3.4482412338256836, + "learning_rate": 9.512964166380864e-05, + "loss": 2.2148, + "step": 473 + }, + { + "epoch": 0.15612005434558854, + "grad_norm": 2.664477825164795, + "learning_rate": 9.510697032767053e-05, + "loss": 1.7443, + "step": 474 + }, + { + "epoch": 0.15644942154884928, + "grad_norm": 2.8564321994781494, + "learning_rate": 9.508424906128058e-05, + "loss": 1.9044, + "step": 475 + }, + { + "epoch": 0.15677878875211002, + "grad_norm": 1.97623872756958, + "learning_rate": 9.506147788978965e-05, + "loss": 2.3548, + "step": 476 + }, + { + "epoch": 0.15710815595537075, + "grad_norm": 2.3602867126464844, + "learning_rate": 9.503865683840378e-05, + "loss": 2.5651, + "step": 477 + }, + { + "epoch": 0.1574375231586315, + "grad_norm": 2.1931259632110596, + "learning_rate": 9.501578593238432e-05, + "loss": 2.1644, + "step": 478 + }, + { + "epoch": 0.1577668903618922, + "grad_norm": 2.284613609313965, + "learning_rate": 9.499286519704773e-05, + "loss": 2.2849, + "step": 479 + }, + { + "epoch": 0.15809625756515294, + "grad_norm": 2.498391628265381, + "learning_rate": 9.49698946577657e-05, + "loss": 2.5516, + "step": 480 + }, + { + "epoch": 0.15842562476841368, + "grad_norm": 2.4198145866394043, + "learning_rate": 9.494687433996493e-05, + "loss": 2.194, + "step": 481 + }, + { + "epoch": 0.15875499197167442, + "grad_norm": 2.404466390609741, + "learning_rate": 9.492380426912737e-05, + "loss": 2.3409, + "step": 482 + }, + { + "epoch": 0.15908435917493516, + "grad_norm": 2.326627254486084, + "learning_rate": 9.490068447078992e-05, + "loss": 2.5327, + "step": 483 + }, + { + "epoch": 0.1594137263781959, + "grad_norm": 2.2029290199279785, + "learning_rate": 9.487751497054461e-05, + "loss": 2.3595, + "step": 484 + }, + { + "epoch": 0.15974309358145664, + "grad_norm": 2.798891544342041, + "learning_rate": 9.485429579403843e-05, + "loss": 2.5281, + "step": 485 + }, + { + "epoch": 0.16007246078471737, + "grad_norm": 2.421884775161743, + "learning_rate": 9.483102696697339e-05, + "loss": 2.3762, + "step": 486 + }, + { + "epoch": 0.16040182798797809, + "grad_norm": 2.618962049484253, + "learning_rate": 9.480770851510644e-05, + "loss": 2.5659, + "step": 487 + }, + { + "epoch": 0.16073119519123882, + "grad_norm": 2.394176721572876, + "learning_rate": 9.478434046424948e-05, + "loss": 2.0389, + "step": 488 + }, + { + "epoch": 0.16106056239449956, + "grad_norm": 2.6122007369995117, + "learning_rate": 9.47609228402693e-05, + "loss": 2.2712, + "step": 489 + }, + { + "epoch": 0.1613899295977603, + "grad_norm": 3.2165706157684326, + "learning_rate": 9.473745566908756e-05, + "loss": 2.2492, + "step": 490 + }, + { + "epoch": 0.16171929680102104, + "grad_norm": 2.527129650115967, + "learning_rate": 9.471393897668078e-05, + "loss": 2.4401, + "step": 491 + }, + { + "epoch": 0.16204866400428178, + "grad_norm": 2.758704900741577, + "learning_rate": 9.469037278908029e-05, + "loss": 2.2011, + "step": 492 + }, + { + "epoch": 0.16237803120754252, + "grad_norm": 2.8025074005126953, + "learning_rate": 9.46667571323722e-05, + "loss": 2.0435, + "step": 493 + }, + { + "epoch": 0.16270739841080326, + "grad_norm": 2.798211097717285, + "learning_rate": 9.464309203269739e-05, + "loss": 2.2279, + "step": 494 + }, + { + "epoch": 0.16303676561406397, + "grad_norm": 2.942800521850586, + "learning_rate": 9.461937751625145e-05, + "loss": 2.082, + "step": 495 + }, + { + "epoch": 0.1633661328173247, + "grad_norm": 2.7005648612976074, + "learning_rate": 9.459561360928472e-05, + "loss": 2.2156, + "step": 496 + }, + { + "epoch": 0.16369550002058544, + "grad_norm": 2.9613757133483887, + "learning_rate": 9.457180033810216e-05, + "loss": 2.3678, + "step": 497 + }, + { + "epoch": 0.16402486722384618, + "grad_norm": 2.888354539871216, + "learning_rate": 9.454793772906336e-05, + "loss": 1.9226, + "step": 498 + }, + { + "epoch": 0.16435423442710692, + "grad_norm": 3.021557331085205, + "learning_rate": 9.452402580858261e-05, + "loss": 2.1693, + "step": 499 + }, + { + "epoch": 0.16468360163036766, + "grad_norm": 3.1927671432495117, + "learning_rate": 9.45000646031287e-05, + "loss": 1.6268, + "step": 500 + }, + { + "epoch": 0.1650129688336284, + "grad_norm": 1.9104129076004028, + "learning_rate": 9.447605413922499e-05, + "loss": 2.6226, + "step": 501 + }, + { + "epoch": 0.16534233603688914, + "grad_norm": 1.981658935546875, + "learning_rate": 9.44519944434494e-05, + "loss": 2.288, + "step": 502 + }, + { + "epoch": 0.16567170324014988, + "grad_norm": 2.278552532196045, + "learning_rate": 9.442788554243431e-05, + "loss": 2.4093, + "step": 503 + }, + { + "epoch": 0.1660010704434106, + "grad_norm": 2.9332423210144043, + "learning_rate": 9.440372746286661e-05, + "loss": 2.5302, + "step": 504 + }, + { + "epoch": 0.16633043764667133, + "grad_norm": 2.4313466548919678, + "learning_rate": 9.437952023148757e-05, + "loss": 2.52, + "step": 505 + }, + { + "epoch": 0.16665980484993206, + "grad_norm": 2.3095078468322754, + "learning_rate": 9.43552638750929e-05, + "loss": 2.424, + "step": 506 + }, + { + "epoch": 0.1669891720531928, + "grad_norm": 2.3015639781951904, + "learning_rate": 9.433095842053272e-05, + "loss": 2.5717, + "step": 507 + }, + { + "epoch": 0.16731853925645354, + "grad_norm": 2.5541129112243652, + "learning_rate": 9.43066038947114e-05, + "loss": 2.5831, + "step": 508 + }, + { + "epoch": 0.16764790645971428, + "grad_norm": 2.421571969985962, + "learning_rate": 9.428220032458776e-05, + "loss": 2.4438, + "step": 509 + }, + { + "epoch": 0.16797727366297502, + "grad_norm": 2.5124785900115967, + "learning_rate": 9.425774773717479e-05, + "loss": 2.2709, + "step": 510 + }, + { + "epoch": 0.16830664086623576, + "grad_norm": 2.592477560043335, + "learning_rate": 9.423324615953982e-05, + "loss": 2.3089, + "step": 511 + }, + { + "epoch": 0.16863600806949647, + "grad_norm": 2.2433090209960938, + "learning_rate": 9.420869561880434e-05, + "loss": 2.318, + "step": 512 + }, + { + "epoch": 0.1689653752727572, + "grad_norm": 2.6666271686553955, + "learning_rate": 9.418409614214412e-05, + "loss": 2.3501, + "step": 513 + }, + { + "epoch": 0.16929474247601795, + "grad_norm": 2.3065149784088135, + "learning_rate": 9.415944775678902e-05, + "loss": 2.0432, + "step": 514 + }, + { + "epoch": 0.16962410967927868, + "grad_norm": 3.0598034858703613, + "learning_rate": 9.41347504900231e-05, + "loss": 2.6206, + "step": 515 + }, + { + "epoch": 0.16995347688253942, + "grad_norm": 2.051896095275879, + "learning_rate": 9.411000436918449e-05, + "loss": 1.9563, + "step": 516 + }, + { + "epoch": 0.17028284408580016, + "grad_norm": 3.1122515201568604, + "learning_rate": 9.408520942166541e-05, + "loss": 2.2816, + "step": 517 + }, + { + "epoch": 0.1706122112890609, + "grad_norm": 2.379194974899292, + "learning_rate": 9.406036567491213e-05, + "loss": 2.4151, + "step": 518 + }, + { + "epoch": 0.17094157849232164, + "grad_norm": 2.4957690238952637, + "learning_rate": 9.403547315642493e-05, + "loss": 1.8377, + "step": 519 + }, + { + "epoch": 0.17127094569558235, + "grad_norm": 2.6822221279144287, + "learning_rate": 9.401053189375809e-05, + "loss": 2.0926, + "step": 520 + }, + { + "epoch": 0.1716003128988431, + "grad_norm": 2.902961492538452, + "learning_rate": 9.398554191451983e-05, + "loss": 2.1816, + "step": 521 + }, + { + "epoch": 0.17192968010210383, + "grad_norm": 3.225004196166992, + "learning_rate": 9.396050324637228e-05, + "loss": 2.5233, + "step": 522 + }, + { + "epoch": 0.17225904730536457, + "grad_norm": 2.8753762245178223, + "learning_rate": 9.393541591703156e-05, + "loss": 2.3371, + "step": 523 + }, + { + "epoch": 0.1725884145086253, + "grad_norm": 3.5578978061676025, + "learning_rate": 9.39102799542675e-05, + "loss": 2.3127, + "step": 524 + }, + { + "epoch": 0.17291778171188604, + "grad_norm": 3.2395496368408203, + "learning_rate": 9.388509538590391e-05, + "loss": 2.0262, + "step": 525 + }, + { + "epoch": 0.17324714891514678, + "grad_norm": 2.063525438308716, + "learning_rate": 9.385986223981833e-05, + "loss": 2.5577, + "step": 526 + }, + { + "epoch": 0.17357651611840752, + "grad_norm": 2.227280616760254, + "learning_rate": 9.383458054394206e-05, + "loss": 2.5892, + "step": 527 + }, + { + "epoch": 0.17390588332166826, + "grad_norm": 2.1608548164367676, + "learning_rate": 9.380925032626015e-05, + "loss": 2.5988, + "step": 528 + }, + { + "epoch": 0.17423525052492897, + "grad_norm": 2.333763360977173, + "learning_rate": 9.378387161481142e-05, + "loss": 2.4371, + "step": 529 + }, + { + "epoch": 0.1745646177281897, + "grad_norm": 2.1032464504241943, + "learning_rate": 9.375844443768829e-05, + "loss": 2.1269, + "step": 530 + }, + { + "epoch": 0.17489398493145045, + "grad_norm": 2.7457637786865234, + "learning_rate": 9.373296882303688e-05, + "loss": 2.4893, + "step": 531 + }, + { + "epoch": 0.17522335213471119, + "grad_norm": 2.414435625076294, + "learning_rate": 9.37074447990569e-05, + "loss": 2.4437, + "step": 532 + }, + { + "epoch": 0.17555271933797192, + "grad_norm": 2.7596664428710938, + "learning_rate": 9.368187239400166e-05, + "loss": 2.3113, + "step": 533 + }, + { + "epoch": 0.17588208654123266, + "grad_norm": 2.7227630615234375, + "learning_rate": 9.3656251636178e-05, + "loss": 2.2728, + "step": 534 + }, + { + "epoch": 0.1762114537444934, + "grad_norm": 2.8053038120269775, + "learning_rate": 9.363058255394632e-05, + "loss": 2.3559, + "step": 535 + }, + { + "epoch": 0.17654082094775414, + "grad_norm": 2.6018600463867188, + "learning_rate": 9.360486517572049e-05, + "loss": 2.2176, + "step": 536 + }, + { + "epoch": 0.17687018815101485, + "grad_norm": 2.3456811904907227, + "learning_rate": 9.357909952996784e-05, + "loss": 2.1538, + "step": 537 + }, + { + "epoch": 0.1771995553542756, + "grad_norm": 2.6372413635253906, + "learning_rate": 9.355328564520914e-05, + "loss": 2.1687, + "step": 538 + }, + { + "epoch": 0.17752892255753633, + "grad_norm": 2.4839794635772705, + "learning_rate": 9.352742355001853e-05, + "loss": 2.2029, + "step": 539 + }, + { + "epoch": 0.17785828976079707, + "grad_norm": 2.5911648273468018, + "learning_rate": 9.350151327302356e-05, + "loss": 2.0988, + "step": 540 + }, + { + "epoch": 0.1781876569640578, + "grad_norm": 2.3182106018066406, + "learning_rate": 9.347555484290507e-05, + "loss": 1.9714, + "step": 541 + }, + { + "epoch": 0.17851702416731854, + "grad_norm": 3.564013957977295, + "learning_rate": 9.344954828839722e-05, + "loss": 2.3427, + "step": 542 + }, + { + "epoch": 0.17884639137057928, + "grad_norm": 2.816439628601074, + "learning_rate": 9.342349363828748e-05, + "loss": 2.198, + "step": 543 + }, + { + "epoch": 0.17917575857384002, + "grad_norm": 2.91499924659729, + "learning_rate": 9.339739092141647e-05, + "loss": 2.2565, + "step": 544 + }, + { + "epoch": 0.17950512577710073, + "grad_norm": 3.363369941711426, + "learning_rate": 9.337124016667809e-05, + "loss": 2.1877, + "step": 545 + }, + { + "epoch": 0.17983449298036147, + "grad_norm": 3.335421085357666, + "learning_rate": 9.334504140301938e-05, + "loss": 2.2754, + "step": 546 + }, + { + "epoch": 0.1801638601836222, + "grad_norm": 3.2244529724121094, + "learning_rate": 9.331879465944056e-05, + "loss": 2.0735, + "step": 547 + }, + { + "epoch": 0.18049322738688295, + "grad_norm": 2.5784671306610107, + "learning_rate": 9.32924999649949e-05, + "loss": 2.0593, + "step": 548 + }, + { + "epoch": 0.1808225945901437, + "grad_norm": 3.826298236846924, + "learning_rate": 9.326615734878878e-05, + "loss": 2.4232, + "step": 549 + }, + { + "epoch": 0.18115196179340443, + "grad_norm": 3.9787750244140625, + "learning_rate": 9.323976683998168e-05, + "loss": 1.9951, + "step": 550 + }, + { + "epoch": 0.18148132899666516, + "grad_norm": 2.1491682529449463, + "learning_rate": 9.321332846778599e-05, + "loss": 2.497, + "step": 551 + }, + { + "epoch": 0.1818106961999259, + "grad_norm": 2.5805752277374268, + "learning_rate": 9.318684226146714e-05, + "loss": 2.343, + "step": 552 + }, + { + "epoch": 0.18214006340318661, + "grad_norm": 2.671565055847168, + "learning_rate": 9.316030825034354e-05, + "loss": 2.314, + "step": 553 + }, + { + "epoch": 0.18246943060644735, + "grad_norm": 2.459749221801758, + "learning_rate": 9.313372646378643e-05, + "loss": 2.3503, + "step": 554 + }, + { + "epoch": 0.1827987978097081, + "grad_norm": 2.3150362968444824, + "learning_rate": 9.310709693122002e-05, + "loss": 1.9586, + "step": 555 + }, + { + "epoch": 0.18312816501296883, + "grad_norm": 2.813565254211426, + "learning_rate": 9.308041968212131e-05, + "loss": 2.467, + "step": 556 + }, + { + "epoch": 0.18345753221622957, + "grad_norm": 2.5931906700134277, + "learning_rate": 9.305369474602015e-05, + "loss": 2.245, + "step": 557 + }, + { + "epoch": 0.1837868994194903, + "grad_norm": 2.7060670852661133, + "learning_rate": 9.302692215249918e-05, + "loss": 2.7003, + "step": 558 + }, + { + "epoch": 0.18411626662275105, + "grad_norm": 2.543288230895996, + "learning_rate": 9.300010193119376e-05, + "loss": 2.2948, + "step": 559 + }, + { + "epoch": 0.18444563382601178, + "grad_norm": 2.492933750152588, + "learning_rate": 9.297323411179202e-05, + "loss": 2.2784, + "step": 560 + }, + { + "epoch": 0.18477500102927252, + "grad_norm": 2.2065117359161377, + "learning_rate": 9.294631872403474e-05, + "loss": 2.2058, + "step": 561 + }, + { + "epoch": 0.18510436823253323, + "grad_norm": 3.046983003616333, + "learning_rate": 9.291935579771536e-05, + "loss": 2.3053, + "step": 562 + }, + { + "epoch": 0.18543373543579397, + "grad_norm": 2.53873610496521, + "learning_rate": 9.289234536267996e-05, + "loss": 2.4844, + "step": 563 + }, + { + "epoch": 0.1857631026390547, + "grad_norm": 2.3958847522735596, + "learning_rate": 9.286528744882719e-05, + "loss": 2.0553, + "step": 564 + }, + { + "epoch": 0.18609246984231545, + "grad_norm": 3.1099889278411865, + "learning_rate": 9.283818208610826e-05, + "loss": 2.7911, + "step": 565 + }, + { + "epoch": 0.1864218370455762, + "grad_norm": 2.7768149375915527, + "learning_rate": 9.28110293045269e-05, + "loss": 2.3027, + "step": 566 + }, + { + "epoch": 0.18675120424883693, + "grad_norm": 2.551649570465088, + "learning_rate": 9.278382913413935e-05, + "loss": 1.9763, + "step": 567 + }, + { + "epoch": 0.18708057145209767, + "grad_norm": 2.6487247943878174, + "learning_rate": 9.27565816050543e-05, + "loss": 2.2145, + "step": 568 + }, + { + "epoch": 0.1874099386553584, + "grad_norm": 3.50011944770813, + "learning_rate": 9.272928674743282e-05, + "loss": 2.0824, + "step": 569 + }, + { + "epoch": 0.18773930585861912, + "grad_norm": 2.5027475357055664, + "learning_rate": 9.270194459148841e-05, + "loss": 2.0633, + "step": 570 + }, + { + "epoch": 0.18806867306187985, + "grad_norm": 2.7622134685516357, + "learning_rate": 9.267455516748693e-05, + "loss": 2.4109, + "step": 571 + }, + { + "epoch": 0.1883980402651406, + "grad_norm": 2.5253777503967285, + "learning_rate": 9.264711850574657e-05, + "loss": 1.8391, + "step": 572 + }, + { + "epoch": 0.18872740746840133, + "grad_norm": 3.9980525970458984, + "learning_rate": 9.261963463663775e-05, + "loss": 2.4288, + "step": 573 + }, + { + "epoch": 0.18905677467166207, + "grad_norm": 3.201934814453125, + "learning_rate": 9.25921035905832e-05, + "loss": 2.1041, + "step": 574 + }, + { + "epoch": 0.1893861418749228, + "grad_norm": 3.45932674407959, + "learning_rate": 9.256452539805787e-05, + "loss": 1.9162, + "step": 575 + }, + { + "epoch": 0.18971550907818355, + "grad_norm": 2.3997511863708496, + "learning_rate": 9.253690008958886e-05, + "loss": 2.7296, + "step": 576 + }, + { + "epoch": 0.19004487628144429, + "grad_norm": 2.100877523422241, + "learning_rate": 9.250922769575548e-05, + "loss": 2.2391, + "step": 577 + }, + { + "epoch": 0.190374243484705, + "grad_norm": 2.0351099967956543, + "learning_rate": 9.248150824718911e-05, + "loss": 2.3191, + "step": 578 + }, + { + "epoch": 0.19070361068796574, + "grad_norm": 2.368957757949829, + "learning_rate": 9.245374177457323e-05, + "loss": 2.2142, + "step": 579 + }, + { + "epoch": 0.19103297789122647, + "grad_norm": 2.49444580078125, + "learning_rate": 9.242592830864339e-05, + "loss": 2.3158, + "step": 580 + }, + { + "epoch": 0.1913623450944872, + "grad_norm": 2.4949872493743896, + "learning_rate": 9.239806788018714e-05, + "loss": 2.3183, + "step": 581 + }, + { + "epoch": 0.19169171229774795, + "grad_norm": 2.6433401107788086, + "learning_rate": 9.2370160520044e-05, + "loss": 2.4043, + "step": 582 + }, + { + "epoch": 0.1920210795010087, + "grad_norm": 2.3650400638580322, + "learning_rate": 9.23422062591055e-05, + "loss": 2.2305, + "step": 583 + }, + { + "epoch": 0.19235044670426943, + "grad_norm": 2.623833179473877, + "learning_rate": 9.231420512831501e-05, + "loss": 2.2516, + "step": 584 + }, + { + "epoch": 0.19267981390753017, + "grad_norm": 2.3704848289489746, + "learning_rate": 9.228615715866785e-05, + "loss": 2.1916, + "step": 585 + }, + { + "epoch": 0.1930091811107909, + "grad_norm": 2.6421825885772705, + "learning_rate": 9.225806238121113e-05, + "loss": 2.1705, + "step": 586 + }, + { + "epoch": 0.19333854831405162, + "grad_norm": 2.507702589035034, + "learning_rate": 9.222992082704381e-05, + "loss": 2.2798, + "step": 587 + }, + { + "epoch": 0.19366791551731236, + "grad_norm": 2.977654457092285, + "learning_rate": 9.22017325273166e-05, + "loss": 2.3934, + "step": 588 + }, + { + "epoch": 0.1939972827205731, + "grad_norm": 2.6255760192871094, + "learning_rate": 9.217349751323199e-05, + "loss": 2.168, + "step": 589 + }, + { + "epoch": 0.19432664992383383, + "grad_norm": 2.767040252685547, + "learning_rate": 9.214521581604415e-05, + "loss": 2.3227, + "step": 590 + }, + { + "epoch": 0.19465601712709457, + "grad_norm": 2.74783992767334, + "learning_rate": 9.211688746705894e-05, + "loss": 2.5598, + "step": 591 + }, + { + "epoch": 0.1949853843303553, + "grad_norm": 3.419125556945801, + "learning_rate": 9.208851249763385e-05, + "loss": 2.1441, + "step": 592 + }, + { + "epoch": 0.19531475153361605, + "grad_norm": 2.531890392303467, + "learning_rate": 9.206009093917798e-05, + "loss": 2.2535, + "step": 593 + }, + { + "epoch": 0.1956441187368768, + "grad_norm": 2.928637742996216, + "learning_rate": 9.203162282315201e-05, + "loss": 1.9981, + "step": 594 + }, + { + "epoch": 0.1959734859401375, + "grad_norm": 2.740192174911499, + "learning_rate": 9.200310818106813e-05, + "loss": 2.1965, + "step": 595 + }, + { + "epoch": 0.19630285314339824, + "grad_norm": 2.5833513736724854, + "learning_rate": 9.197454704449007e-05, + "loss": 1.9684, + "step": 596 + }, + { + "epoch": 0.19663222034665898, + "grad_norm": 2.816075563430786, + "learning_rate": 9.194593944503298e-05, + "loss": 1.9281, + "step": 597 + }, + { + "epoch": 0.19696158754991971, + "grad_norm": 2.4949111938476562, + "learning_rate": 9.19172854143635e-05, + "loss": 1.8233, + "step": 598 + }, + { + "epoch": 0.19729095475318045, + "grad_norm": 3.0978989601135254, + "learning_rate": 9.18885849841996e-05, + "loss": 1.9407, + "step": 599 + }, + { + "epoch": 0.1976203219564412, + "grad_norm": 3.714794397354126, + "learning_rate": 9.185983818631066e-05, + "loss": 2.082, + "step": 600 + }, + { + "epoch": 0.19794968915970193, + "grad_norm": 2.1515870094299316, + "learning_rate": 9.183104505251735e-05, + "loss": 2.5559, + "step": 601 + }, + { + "epoch": 0.19827905636296267, + "grad_norm": 2.2169995307922363, + "learning_rate": 9.180220561469167e-05, + "loss": 2.2574, + "step": 602 + }, + { + "epoch": 0.19860842356622338, + "grad_norm": 2.327986717224121, + "learning_rate": 9.177331990475685e-05, + "loss": 2.4616, + "step": 603 + }, + { + "epoch": 0.19893779076948412, + "grad_norm": 3.0551161766052246, + "learning_rate": 9.174438795468734e-05, + "loss": 2.602, + "step": 604 + }, + { + "epoch": 0.19926715797274486, + "grad_norm": 2.404407024383545, + "learning_rate": 9.171540979650879e-05, + "loss": 2.4922, + "step": 605 + }, + { + "epoch": 0.1995965251760056, + "grad_norm": 2.458751916885376, + "learning_rate": 9.168638546229796e-05, + "loss": 2.3752, + "step": 606 + }, + { + "epoch": 0.19992589237926633, + "grad_norm": 3.044050693511963, + "learning_rate": 9.165731498418277e-05, + "loss": 2.6778, + "step": 607 + }, + { + "epoch": 0.20025525958252707, + "grad_norm": 2.5065767765045166, + "learning_rate": 9.162819839434223e-05, + "loss": 2.3761, + "step": 608 + }, + { + "epoch": 0.2005846267857878, + "grad_norm": 2.5272152423858643, + "learning_rate": 9.15990357250063e-05, + "loss": 2.2685, + "step": 609 + }, + { + "epoch": 0.20091399398904855, + "grad_norm": 3.1523914337158203, + "learning_rate": 9.156982700845606e-05, + "loss": 2.516, + "step": 610 + }, + { + "epoch": 0.2012433611923093, + "grad_norm": 2.7383854389190674, + "learning_rate": 9.154057227702348e-05, + "loss": 2.4093, + "step": 611 + }, + { + "epoch": 0.20157272839557, + "grad_norm": 3.010639190673828, + "learning_rate": 9.151127156309151e-05, + "loss": 2.5929, + "step": 612 + }, + { + "epoch": 0.20190209559883074, + "grad_norm": 3.279294967651367, + "learning_rate": 9.1481924899094e-05, + "loss": 2.266, + "step": 613 + }, + { + "epoch": 0.20223146280209148, + "grad_norm": 3.147496223449707, + "learning_rate": 9.145253231751563e-05, + "loss": 2.1337, + "step": 614 + }, + { + "epoch": 0.20256083000535222, + "grad_norm": 2.710343599319458, + "learning_rate": 9.142309385089191e-05, + "loss": 2.0808, + "step": 615 + }, + { + "epoch": 0.20289019720861295, + "grad_norm": 2.6864521503448486, + "learning_rate": 9.139360953180918e-05, + "loss": 2.1494, + "step": 616 + }, + { + "epoch": 0.2032195644118737, + "grad_norm": 2.7441837787628174, + "learning_rate": 9.136407939290451e-05, + "loss": 2.5828, + "step": 617 + }, + { + "epoch": 0.20354893161513443, + "grad_norm": 3.46586537361145, + "learning_rate": 9.13345034668657e-05, + "loss": 2.6775, + "step": 618 + }, + { + "epoch": 0.20387829881839517, + "grad_norm": 2.7498793601989746, + "learning_rate": 9.130488178643119e-05, + "loss": 2.0033, + "step": 619 + }, + { + "epoch": 0.20420766602165588, + "grad_norm": 3.1091184616088867, + "learning_rate": 9.127521438439015e-05, + "loss": 2.4799, + "step": 620 + }, + { + "epoch": 0.20453703322491662, + "grad_norm": 2.3706719875335693, + "learning_rate": 9.124550129358227e-05, + "loss": 1.8221, + "step": 621 + }, + { + "epoch": 0.20486640042817736, + "grad_norm": 2.63212251663208, + "learning_rate": 9.121574254689788e-05, + "loss": 2.0923, + "step": 622 + }, + { + "epoch": 0.2051957676314381, + "grad_norm": 3.275517225265503, + "learning_rate": 9.118593817727782e-05, + "loss": 2.1223, + "step": 623 + }, + { + "epoch": 0.20552513483469884, + "grad_norm": 3.2868645191192627, + "learning_rate": 9.115608821771347e-05, + "loss": 2.1894, + "step": 624 + }, + { + "epoch": 0.20585450203795957, + "grad_norm": 3.2911734580993652, + "learning_rate": 9.112619270124658e-05, + "loss": 2.0714, + "step": 625 + }, + { + "epoch": 0.2061838692412203, + "grad_norm": 1.8351259231567383, + "learning_rate": 9.109625166096942e-05, + "loss": 2.5735, + "step": 626 + }, + { + "epoch": 0.20651323644448105, + "grad_norm": 2.289583206176758, + "learning_rate": 9.106626513002464e-05, + "loss": 2.2809, + "step": 627 + }, + { + "epoch": 0.20684260364774176, + "grad_norm": 2.3926987648010254, + "learning_rate": 9.103623314160518e-05, + "loss": 2.4775, + "step": 628 + }, + { + "epoch": 0.2071719708510025, + "grad_norm": 2.8485803604125977, + "learning_rate": 9.100615572895439e-05, + "loss": 2.5643, + "step": 629 + }, + { + "epoch": 0.20750133805426324, + "grad_norm": 2.601393461227417, + "learning_rate": 9.097603292536583e-05, + "loss": 2.1331, + "step": 630 + }, + { + "epoch": 0.20783070525752398, + "grad_norm": 2.364741086959839, + "learning_rate": 9.094586476418335e-05, + "loss": 2.2653, + "step": 631 + }, + { + "epoch": 0.20816007246078472, + "grad_norm": 2.307363510131836, + "learning_rate": 9.091565127880096e-05, + "loss": 2.8223, + "step": 632 + }, + { + "epoch": 0.20848943966404546, + "grad_norm": 2.207406997680664, + "learning_rate": 9.088539250266287e-05, + "loss": 2.2839, + "step": 633 + }, + { + "epoch": 0.2088188068673062, + "grad_norm": 2.271679401397705, + "learning_rate": 9.085508846926345e-05, + "loss": 2.1932, + "step": 634 + }, + { + "epoch": 0.20914817407056693, + "grad_norm": 2.566964864730835, + "learning_rate": 9.082473921214714e-05, + "loss": 2.2745, + "step": 635 + }, + { + "epoch": 0.20947754127382767, + "grad_norm": 2.6665945053100586, + "learning_rate": 9.07943447649084e-05, + "loss": 2.2285, + "step": 636 + }, + { + "epoch": 0.20980690847708838, + "grad_norm": 2.232455253601074, + "learning_rate": 9.07639051611918e-05, + "loss": 2.0741, + "step": 637 + }, + { + "epoch": 0.21013627568034912, + "grad_norm": 2.365143299102783, + "learning_rate": 9.07334204346918e-05, + "loss": 1.9137, + "step": 638 + }, + { + "epoch": 0.21046564288360986, + "grad_norm": 2.6072494983673096, + "learning_rate": 9.070289061915289e-05, + "loss": 2.235, + "step": 639 + }, + { + "epoch": 0.2107950100868706, + "grad_norm": 2.8605432510375977, + "learning_rate": 9.06723157483694e-05, + "loss": 2.2824, + "step": 640 + }, + { + "epoch": 0.21112437729013134, + "grad_norm": 2.70151424407959, + "learning_rate": 9.064169585618561e-05, + "loss": 2.202, + "step": 641 + }, + { + "epoch": 0.21145374449339208, + "grad_norm": 2.540966272354126, + "learning_rate": 9.061103097649554e-05, + "loss": 2.0908, + "step": 642 + }, + { + "epoch": 0.21178311169665281, + "grad_norm": 2.4950785636901855, + "learning_rate": 9.05803211432431e-05, + "loss": 1.9015, + "step": 643 + }, + { + "epoch": 0.21211247889991355, + "grad_norm": 2.925896406173706, + "learning_rate": 9.054956639042194e-05, + "loss": 2.2201, + "step": 644 + }, + { + "epoch": 0.21244184610317426, + "grad_norm": 3.0082502365112305, + "learning_rate": 9.051876675207535e-05, + "loss": 2.3677, + "step": 645 + }, + { + "epoch": 0.212771213306435, + "grad_norm": 2.554163694381714, + "learning_rate": 9.048792226229642e-05, + "loss": 1.9325, + "step": 646 + }, + { + "epoch": 0.21310058050969574, + "grad_norm": 2.7593939304351807, + "learning_rate": 9.04570329552278e-05, + "loss": 2.2431, + "step": 647 + }, + { + "epoch": 0.21342994771295648, + "grad_norm": 3.1389267444610596, + "learning_rate": 9.042609886506183e-05, + "loss": 2.2714, + "step": 648 + }, + { + "epoch": 0.21375931491621722, + "grad_norm": 2.6484322547912598, + "learning_rate": 9.039512002604034e-05, + "loss": 2.0055, + "step": 649 + }, + { + "epoch": 0.21408868211947796, + "grad_norm": 3.6394948959350586, + "learning_rate": 9.036409647245474e-05, + "loss": 2.2731, + "step": 650 + }, + { + "epoch": 0.2144180493227387, + "grad_norm": 1.968092679977417, + "learning_rate": 9.033302823864595e-05, + "loss": 2.3822, + "step": 651 + }, + { + "epoch": 0.21474741652599943, + "grad_norm": 2.4255237579345703, + "learning_rate": 9.03019153590043e-05, + "loss": 2.5761, + "step": 652 + }, + { + "epoch": 0.21507678372926015, + "grad_norm": 2.2358973026275635, + "learning_rate": 9.027075786796957e-05, + "loss": 2.2964, + "step": 653 + }, + { + "epoch": 0.21540615093252088, + "grad_norm": 3.0591135025024414, + "learning_rate": 9.023955580003092e-05, + "loss": 2.609, + "step": 654 + }, + { + "epoch": 0.21573551813578162, + "grad_norm": 2.4640660285949707, + "learning_rate": 9.020830918972684e-05, + "loss": 2.4714, + "step": 655 + }, + { + "epoch": 0.21606488533904236, + "grad_norm": 2.639633893966675, + "learning_rate": 9.017701807164516e-05, + "loss": 1.9724, + "step": 656 + }, + { + "epoch": 0.2163942525423031, + "grad_norm": 2.35853910446167, + "learning_rate": 9.014568248042292e-05, + "loss": 2.1988, + "step": 657 + }, + { + "epoch": 0.21672361974556384, + "grad_norm": 2.6157610416412354, + "learning_rate": 9.011430245074645e-05, + "loss": 2.4365, + "step": 658 + }, + { + "epoch": 0.21705298694882458, + "grad_norm": 2.606468439102173, + "learning_rate": 9.008287801735124e-05, + "loss": 2.1511, + "step": 659 + }, + { + "epoch": 0.21738235415208532, + "grad_norm": 2.8526015281677246, + "learning_rate": 9.005140921502193e-05, + "loss": 2.366, + "step": 660 + }, + { + "epoch": 0.21771172135534606, + "grad_norm": 2.663635492324829, + "learning_rate": 9.001989607859226e-05, + "loss": 2.418, + "step": 661 + }, + { + "epoch": 0.21804108855860677, + "grad_norm": 2.6935718059539795, + "learning_rate": 8.998833864294507e-05, + "loss": 2.327, + "step": 662 + }, + { + "epoch": 0.2183704557618675, + "grad_norm": 2.840543031692505, + "learning_rate": 8.995673694301223e-05, + "loss": 2.106, + "step": 663 + }, + { + "epoch": 0.21869982296512824, + "grad_norm": 2.640491485595703, + "learning_rate": 8.99250910137746e-05, + "loss": 1.9047, + "step": 664 + }, + { + "epoch": 0.21902919016838898, + "grad_norm": 2.4656693935394287, + "learning_rate": 8.989340089026203e-05, + "loss": 1.9766, + "step": 665 + }, + { + "epoch": 0.21935855737164972, + "grad_norm": 3.2263641357421875, + "learning_rate": 8.986166660755321e-05, + "loss": 2.4087, + "step": 666 + }, + { + "epoch": 0.21968792457491046, + "grad_norm": 2.758763313293457, + "learning_rate": 8.982988820077582e-05, + "loss": 2.2525, + "step": 667 + }, + { + "epoch": 0.2200172917781712, + "grad_norm": 2.882479429244995, + "learning_rate": 8.979806570510631e-05, + "loss": 2.3654, + "step": 668 + }, + { + "epoch": 0.22034665898143194, + "grad_norm": 2.9043984413146973, + "learning_rate": 8.976619915576994e-05, + "loss": 1.992, + "step": 669 + }, + { + "epoch": 0.22067602618469265, + "grad_norm": 2.918984889984131, + "learning_rate": 8.973428858804073e-05, + "loss": 2.2139, + "step": 670 + }, + { + "epoch": 0.22100539338795339, + "grad_norm": 2.9728167057037354, + "learning_rate": 8.970233403724146e-05, + "loss": 2.0774, + "step": 671 + }, + { + "epoch": 0.22133476059121412, + "grad_norm": 2.9786489009857178, + "learning_rate": 8.96703355387436e-05, + "loss": 2.1803, + "step": 672 + }, + { + "epoch": 0.22166412779447486, + "grad_norm": 3.05253267288208, + "learning_rate": 8.963829312796718e-05, + "loss": 2.1476, + "step": 673 + }, + { + "epoch": 0.2219934949977356, + "grad_norm": 2.640706777572632, + "learning_rate": 8.960620684038097e-05, + "loss": 2.1194, + "step": 674 + }, + { + "epoch": 0.22232286220099634, + "grad_norm": 3.7444612979888916, + "learning_rate": 8.95740767115022e-05, + "loss": 2.1602, + "step": 675 + }, + { + "epoch": 0.22265222940425708, + "grad_norm": 1.9276431798934937, + "learning_rate": 8.95419027768967e-05, + "loss": 2.1702, + "step": 676 + }, + { + "epoch": 0.22298159660751782, + "grad_norm": 2.062800168991089, + "learning_rate": 8.95096850721787e-05, + "loss": 2.2015, + "step": 677 + }, + { + "epoch": 0.22331096381077853, + "grad_norm": 2.143061637878418, + "learning_rate": 8.947742363301098e-05, + "loss": 2.4098, + "step": 678 + }, + { + "epoch": 0.22364033101403927, + "grad_norm": 2.5208868980407715, + "learning_rate": 8.944511849510469e-05, + "loss": 2.5313, + "step": 679 + }, + { + "epoch": 0.2239696982173, + "grad_norm": 2.5163257122039795, + "learning_rate": 8.941276969421935e-05, + "loss": 2.4371, + "step": 680 + }, + { + "epoch": 0.22429906542056074, + "grad_norm": 2.5549163818359375, + "learning_rate": 8.938037726616281e-05, + "loss": 2.3066, + "step": 681 + }, + { + "epoch": 0.22462843262382148, + "grad_norm": 2.5244922637939453, + "learning_rate": 8.934794124679121e-05, + "loss": 2.4772, + "step": 682 + }, + { + "epoch": 0.22495779982708222, + "grad_norm": 2.399825096130371, + "learning_rate": 8.931546167200895e-05, + "loss": 2.5803, + "step": 683 + }, + { + "epoch": 0.22528716703034296, + "grad_norm": 2.410311460494995, + "learning_rate": 8.928293857776866e-05, + "loss": 2.0751, + "step": 684 + }, + { + "epoch": 0.2256165342336037, + "grad_norm": 2.29736590385437, + "learning_rate": 8.925037200007109e-05, + "loss": 2.2536, + "step": 685 + }, + { + "epoch": 0.22594590143686444, + "grad_norm": 2.3026459217071533, + "learning_rate": 8.921776197496518e-05, + "loss": 2.0917, + "step": 686 + }, + { + "epoch": 0.22627526864012515, + "grad_norm": 2.490049362182617, + "learning_rate": 8.918510853854794e-05, + "loss": 2.0983, + "step": 687 + }, + { + "epoch": 0.2266046358433859, + "grad_norm": 3.218235969543457, + "learning_rate": 8.915241172696441e-05, + "loss": 2.3049, + "step": 688 + }, + { + "epoch": 0.22693400304664663, + "grad_norm": 2.5197603702545166, + "learning_rate": 8.911967157640771e-05, + "loss": 2.1897, + "step": 689 + }, + { + "epoch": 0.22726337024990736, + "grad_norm": 2.4397006034851074, + "learning_rate": 8.908688812311884e-05, + "loss": 1.9715, + "step": 690 + }, + { + "epoch": 0.2275927374531681, + "grad_norm": 2.9002199172973633, + "learning_rate": 8.905406140338683e-05, + "loss": 2.2867, + "step": 691 + }, + { + "epoch": 0.22792210465642884, + "grad_norm": 2.6698174476623535, + "learning_rate": 8.902119145354852e-05, + "loss": 2.4758, + "step": 692 + }, + { + "epoch": 0.22825147185968958, + "grad_norm": 3.1235620975494385, + "learning_rate": 8.898827830998864e-05, + "loss": 2.1978, + "step": 693 + }, + { + "epoch": 0.22858083906295032, + "grad_norm": 2.7362828254699707, + "learning_rate": 8.895532200913976e-05, + "loss": 2.326, + "step": 694 + }, + { + "epoch": 0.22891020626621103, + "grad_norm": 2.6940107345581055, + "learning_rate": 8.892232258748217e-05, + "loss": 2.0774, + "step": 695 + }, + { + "epoch": 0.22923957346947177, + "grad_norm": 2.963489294052124, + "learning_rate": 8.888928008154393e-05, + "loss": 2.2016, + "step": 696 + }, + { + "epoch": 0.2295689406727325, + "grad_norm": 2.9156124591827393, + "learning_rate": 8.885619452790078e-05, + "loss": 2.4717, + "step": 697 + }, + { + "epoch": 0.22989830787599325, + "grad_norm": 3.306898355484009, + "learning_rate": 8.882306596317606e-05, + "loss": 2.2373, + "step": 698 + }, + { + "epoch": 0.23022767507925399, + "grad_norm": 2.8223652839660645, + "learning_rate": 8.878989442404082e-05, + "loss": 1.9008, + "step": 699 + }, + { + "epoch": 0.23055704228251472, + "grad_norm": 2.8709568977355957, + "learning_rate": 8.87566799472136e-05, + "loss": 1.9383, + "step": 700 + }, + { + "epoch": 0.23088640948577546, + "grad_norm": 2.175177574157715, + "learning_rate": 8.872342256946051e-05, + "loss": 2.3998, + "step": 701 + }, + { + "epoch": 0.2312157766890362, + "grad_norm": 2.408965587615967, + "learning_rate": 8.869012232759512e-05, + "loss": 2.6233, + "step": 702 + }, + { + "epoch": 0.2315451438922969, + "grad_norm": 2.2874746322631836, + "learning_rate": 8.865677925847848e-05, + "loss": 2.0586, + "step": 703 + }, + { + "epoch": 0.23187451109555765, + "grad_norm": 2.436617374420166, + "learning_rate": 8.862339339901902e-05, + "loss": 2.324, + "step": 704 + }, + { + "epoch": 0.2322038782988184, + "grad_norm": 2.8110032081604004, + "learning_rate": 8.858996478617253e-05, + "loss": 2.4255, + "step": 705 + }, + { + "epoch": 0.23253324550207913, + "grad_norm": 2.681027889251709, + "learning_rate": 8.855649345694216e-05, + "loss": 2.5485, + "step": 706 + }, + { + "epoch": 0.23286261270533987, + "grad_norm": 2.6605820655822754, + "learning_rate": 8.852297944837831e-05, + "loss": 2.5063, + "step": 707 + }, + { + "epoch": 0.2331919799086006, + "grad_norm": 2.8735132217407227, + "learning_rate": 8.848942279757864e-05, + "loss": 2.1953, + "step": 708 + }, + { + "epoch": 0.23352134711186134, + "grad_norm": 2.5370914936065674, + "learning_rate": 8.845582354168802e-05, + "loss": 1.9245, + "step": 709 + }, + { + "epoch": 0.23385071431512208, + "grad_norm": 2.733022928237915, + "learning_rate": 8.842218171789846e-05, + "loss": 2.109, + "step": 710 + }, + { + "epoch": 0.2341800815183828, + "grad_norm": 3.819556951522827, + "learning_rate": 8.838849736344909e-05, + "loss": 2.405, + "step": 711 + }, + { + "epoch": 0.23450944872164353, + "grad_norm": 2.486302137374878, + "learning_rate": 8.835477051562613e-05, + "loss": 2.1244, + "step": 712 + }, + { + "epoch": 0.23483881592490427, + "grad_norm": 3.0748279094696045, + "learning_rate": 8.832100121176285e-05, + "loss": 2.2095, + "step": 713 + }, + { + "epoch": 0.235168183128165, + "grad_norm": 3.035090208053589, + "learning_rate": 8.828718948923949e-05, + "loss": 1.9369, + "step": 714 + }, + { + "epoch": 0.23549755033142575, + "grad_norm": 2.62367582321167, + "learning_rate": 8.825333538548326e-05, + "loss": 2.1745, + "step": 715 + }, + { + "epoch": 0.2358269175346865, + "grad_norm": 2.8850131034851074, + "learning_rate": 8.821943893796826e-05, + "loss": 2.3818, + "step": 716 + }, + { + "epoch": 0.23615628473794723, + "grad_norm": 3.0183825492858887, + "learning_rate": 8.81855001842155e-05, + "loss": 2.2054, + "step": 717 + }, + { + "epoch": 0.23648565194120796, + "grad_norm": 2.698676586151123, + "learning_rate": 8.81515191617928e-05, + "loss": 2.24, + "step": 718 + }, + { + "epoch": 0.2368150191444687, + "grad_norm": 2.8108513355255127, + "learning_rate": 8.811749590831475e-05, + "loss": 2.1469, + "step": 719 + }, + { + "epoch": 0.2371443863477294, + "grad_norm": 2.8320016860961914, + "learning_rate": 8.808343046144271e-05, + "loss": 2.1924, + "step": 720 + }, + { + "epoch": 0.23747375355099015, + "grad_norm": 3.3814282417297363, + "learning_rate": 8.804932285888477e-05, + "loss": 2.3818, + "step": 721 + }, + { + "epoch": 0.2378031207542509, + "grad_norm": 3.158623695373535, + "learning_rate": 8.80151731383956e-05, + "loss": 2.3962, + "step": 722 + }, + { + "epoch": 0.23813248795751163, + "grad_norm": 2.743208646774292, + "learning_rate": 8.798098133777659e-05, + "loss": 1.779, + "step": 723 + }, + { + "epoch": 0.23846185516077237, + "grad_norm": 3.1935389041900635, + "learning_rate": 8.794674749487565e-05, + "loss": 2.1095, + "step": 724 + }, + { + "epoch": 0.2387912223640331, + "grad_norm": 3.515878438949585, + "learning_rate": 8.791247164758722e-05, + "loss": 1.9777, + "step": 725 + }, + { + "epoch": 0.23912058956729385, + "grad_norm": 2.7194595336914062, + "learning_rate": 8.78781538338523e-05, + "loss": 2.3815, + "step": 726 + }, + { + "epoch": 0.23944995677055458, + "grad_norm": 2.3084821701049805, + "learning_rate": 8.784379409165828e-05, + "loss": 2.2721, + "step": 727 + }, + { + "epoch": 0.2397793239738153, + "grad_norm": 2.4673047065734863, + "learning_rate": 8.780939245903898e-05, + "loss": 2.1049, + "step": 728 + }, + { + "epoch": 0.24010869117707603, + "grad_norm": 2.5129013061523438, + "learning_rate": 8.77749489740746e-05, + "loss": 2.3995, + "step": 729 + }, + { + "epoch": 0.24043805838033677, + "grad_norm": 2.559809923171997, + "learning_rate": 8.774046367489166e-05, + "loss": 2.3422, + "step": 730 + }, + { + "epoch": 0.2407674255835975, + "grad_norm": 2.3660426139831543, + "learning_rate": 8.770593659966298e-05, + "loss": 2.0759, + "step": 731 + }, + { + "epoch": 0.24109679278685825, + "grad_norm": 2.502711057662964, + "learning_rate": 8.767136778660759e-05, + "loss": 2.2224, + "step": 732 + }, + { + "epoch": 0.241426159990119, + "grad_norm": 2.396199941635132, + "learning_rate": 8.763675727399075e-05, + "loss": 2.4539, + "step": 733 + }, + { + "epoch": 0.24175552719337973, + "grad_norm": 2.830613851547241, + "learning_rate": 8.760210510012387e-05, + "loss": 2.4058, + "step": 734 + }, + { + "epoch": 0.24208489439664047, + "grad_norm": 2.6569724082946777, + "learning_rate": 8.756741130336448e-05, + "loss": 2.4416, + "step": 735 + }, + { + "epoch": 0.24241426159990118, + "grad_norm": 3.4117109775543213, + "learning_rate": 8.753267592211616e-05, + "loss": 2.483, + "step": 736 + }, + { + "epoch": 0.24274362880316191, + "grad_norm": 2.8380532264709473, + "learning_rate": 8.749789899482856e-05, + "loss": 2.4988, + "step": 737 + }, + { + "epoch": 0.24307299600642265, + "grad_norm": 2.325429916381836, + "learning_rate": 8.74630805599973e-05, + "loss": 1.8716, + "step": 738 + }, + { + "epoch": 0.2434023632096834, + "grad_norm": 2.5336060523986816, + "learning_rate": 8.742822065616393e-05, + "loss": 1.8746, + "step": 739 + }, + { + "epoch": 0.24373173041294413, + "grad_norm": 2.6919002532958984, + "learning_rate": 8.739331932191592e-05, + "loss": 2.2107, + "step": 740 + }, + { + "epoch": 0.24406109761620487, + "grad_norm": 3.1390397548675537, + "learning_rate": 8.735837659588661e-05, + "loss": 2.5521, + "step": 741 + }, + { + "epoch": 0.2443904648194656, + "grad_norm": 2.4885716438293457, + "learning_rate": 8.732339251675516e-05, + "loss": 2.1718, + "step": 742 + }, + { + "epoch": 0.24471983202272635, + "grad_norm": 2.5541298389434814, + "learning_rate": 8.728836712324646e-05, + "loss": 2.1153, + "step": 743 + }, + { + "epoch": 0.24504919922598709, + "grad_norm": 2.7616307735443115, + "learning_rate": 8.725330045413117e-05, + "loss": 2.0635, + "step": 744 + }, + { + "epoch": 0.2453785664292478, + "grad_norm": 2.8875701427459717, + "learning_rate": 8.721819254822565e-05, + "loss": 2.3036, + "step": 745 + }, + { + "epoch": 0.24570793363250854, + "grad_norm": 3.5547261238098145, + "learning_rate": 8.718304344439186e-05, + "loss": 2.5285, + "step": 746 + }, + { + "epoch": 0.24603730083576927, + "grad_norm": 3.126332998275757, + "learning_rate": 8.714785318153742e-05, + "loss": 2.1983, + "step": 747 + }, + { + "epoch": 0.24636666803903, + "grad_norm": 2.997291088104248, + "learning_rate": 8.711262179861547e-05, + "loss": 2.2518, + "step": 748 + }, + { + "epoch": 0.24669603524229075, + "grad_norm": 2.786508083343506, + "learning_rate": 8.70773493346247e-05, + "loss": 1.8481, + "step": 749 + }, + { + "epoch": 0.2470254024455515, + "grad_norm": 3.188028573989868, + "learning_rate": 8.704203582860922e-05, + "loss": 1.929, + "step": 750 + }, + { + "epoch": 0.24735476964881223, + "grad_norm": 1.924673318862915, + "learning_rate": 8.700668131965861e-05, + "loss": 2.319, + "step": 751 + }, + { + "epoch": 0.24768413685207297, + "grad_norm": 2.163625955581665, + "learning_rate": 8.697128584690785e-05, + "loss": 2.2707, + "step": 752 + }, + { + "epoch": 0.24801350405533368, + "grad_norm": 2.0983192920684814, + "learning_rate": 8.693584944953723e-05, + "loss": 2.4019, + "step": 753 + }, + { + "epoch": 0.24834287125859442, + "grad_norm": 2.2360665798187256, + "learning_rate": 8.690037216677236e-05, + "loss": 2.5004, + "step": 754 + }, + { + "epoch": 0.24867223846185516, + "grad_norm": 2.301961660385132, + "learning_rate": 8.686485403788411e-05, + "loss": 2.2912, + "step": 755 + }, + { + "epoch": 0.2490016056651159, + "grad_norm": 2.593334436416626, + "learning_rate": 8.682929510218855e-05, + "loss": 2.7432, + "step": 756 + }, + { + "epoch": 0.24933097286837663, + "grad_norm": 2.3250508308410645, + "learning_rate": 8.679369539904693e-05, + "loss": 2.525, + "step": 757 + }, + { + "epoch": 0.24966034007163737, + "grad_norm": 2.4877517223358154, + "learning_rate": 8.675805496786563e-05, + "loss": 2.2531, + "step": 758 + }, + { + "epoch": 0.2499897072748981, + "grad_norm": 2.423072099685669, + "learning_rate": 8.672237384809609e-05, + "loss": 2.3282, + "step": 759 + }, + { + "epoch": 0.2499897072748981, + "eval_loss": 2.2632205486297607, + "eval_runtime": 795.2953, + "eval_samples_per_second": 3.215, + "eval_steps_per_second": 1.608, + "step": 759 + } + ], + "logging_steps": 1, + "max_steps": 3036, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 759, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.164145589886124e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}