{ "best_metric": null, "best_model_checkpoint": null, "epoch": 125.0, "eval_steps": 500, "global_step": 8000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.15625, "grad_norm": 0.6168534159660339, "learning_rate": 0.00019999922893814705, "loss": 1.9025, "step": 10 }, { "epoch": 0.3125, "grad_norm": 0.6524689793586731, "learning_rate": 0.00019999691576447898, "loss": 1.6781, "step": 20 }, { "epoch": 0.46875, "grad_norm": 0.7400289177894592, "learning_rate": 0.00019999306051466772, "loss": 1.5857, "step": 30 }, { "epoch": 0.625, "grad_norm": 0.6026787161827087, "learning_rate": 0.00019998766324816607, "loss": 1.4631, "step": 40 }, { "epoch": 0.78125, "grad_norm": 0.676242470741272, "learning_rate": 0.0001999807240482065, "loss": 1.4429, "step": 50 }, { "epoch": 0.9375, "grad_norm": 0.5645259022712708, "learning_rate": 0.00019997224302180006, "loss": 1.4138, "step": 60 }, { "epoch": 1.09375, "grad_norm": 0.6464071273803711, "learning_rate": 0.0001999622202997347, "loss": 1.4142, "step": 70 }, { "epoch": 1.25, "grad_norm": 0.5914332270622253, "learning_rate": 0.00019995065603657316, "loss": 1.3459, "step": 80 }, { "epoch": 1.40625, "grad_norm": 0.6979939341545105, "learning_rate": 0.0001999375504106507, "loss": 1.3659, "step": 90 }, { "epoch": 1.5625, "grad_norm": 0.7593270540237427, "learning_rate": 0.0001999229036240723, "loss": 1.3414, "step": 100 }, { "epoch": 1.71875, "grad_norm": 0.9171484112739563, "learning_rate": 0.0001999067159027095, "loss": 1.3622, "step": 110 }, { "epoch": 1.875, "grad_norm": 0.9084861874580383, "learning_rate": 0.00019988898749619702, "loss": 1.3028, "step": 120 }, { "epoch": 2.03125, "grad_norm": 0.7756249904632568, "learning_rate": 0.00019986971867792875, "loss": 1.2742, "step": 130 }, { "epoch": 2.1875, "grad_norm": 0.813124418258667, "learning_rate": 0.00019984890974505381, "loss": 1.3174, "step": 140 }, { "epoch": 2.34375, "grad_norm": 0.7856652140617371, "learning_rate": 0.00019982656101847162, "loss": 1.2747, "step": 150 }, { "epoch": 2.5, "grad_norm": 0.8684581518173218, "learning_rate": 0.00019980267284282717, "loss": 1.2513, "step": 160 }, { "epoch": 2.65625, "grad_norm": 1.0259562730789185, "learning_rate": 0.00019977724558650572, "loss": 1.2616, "step": 170 }, { "epoch": 2.8125, "grad_norm": 0.9384151101112366, "learning_rate": 0.00019975027964162702, "loss": 1.2425, "step": 180 }, { "epoch": 2.96875, "grad_norm": 0.9272423982620239, "learning_rate": 0.0001997217754240393, "loss": 1.1985, "step": 190 }, { "epoch": 3.125, "grad_norm": 1.033215045928955, "learning_rate": 0.0001996917333733128, "loss": 1.2136, "step": 200 }, { "epoch": 3.28125, "grad_norm": 1.1179872751235962, "learning_rate": 0.00019966015395273318, "loss": 1.1696, "step": 210 }, { "epoch": 3.4375, "grad_norm": 1.0058550834655762, "learning_rate": 0.00019962703764929413, "loss": 1.2167, "step": 220 }, { "epoch": 3.59375, "grad_norm": 1.0203720331192017, "learning_rate": 0.00019959238497369003, "loss": 1.2328, "step": 230 }, { "epoch": 3.75, "grad_norm": 0.9134518504142761, "learning_rate": 0.00019955619646030802, "loss": 1.185, "step": 240 }, { "epoch": 3.90625, "grad_norm": 1.0784626007080078, "learning_rate": 0.0001995184726672197, "loss": 1.1593, "step": 250 }, { "epoch": 4.0625, "grad_norm": 1.117414951324463, "learning_rate": 0.00019947921417617267, "loss": 1.1788, "step": 260 }, { "epoch": 4.21875, "grad_norm": 1.0805243253707886, "learning_rate": 0.0001994384215925814, "loss": 1.1486, "step": 270 }, { "epoch": 4.375, "grad_norm": 1.2142518758773804, "learning_rate": 0.000199396095545518, "loss": 1.1243, "step": 280 }, { "epoch": 4.53125, "grad_norm": 1.1736822128295898, "learning_rate": 0.00019935223668770245, "loss": 1.0945, "step": 290 }, { "epoch": 4.6875, "grad_norm": 1.0896259546279907, "learning_rate": 0.00019930684569549264, "loss": 1.1531, "step": 300 }, { "epoch": 4.84375, "grad_norm": 1.172074794769287, "learning_rate": 0.00019925992326887379, "loss": 1.15, "step": 310 }, { "epoch": 5.0, "grad_norm": 1.1381478309631348, "learning_rate": 0.0001992114701314478, "loss": 1.1261, "step": 320 }, { "epoch": 5.15625, "grad_norm": 1.2212597131729126, "learning_rate": 0.00019916148703042193, "loss": 1.0793, "step": 330 }, { "epoch": 5.3125, "grad_norm": 1.1290854215621948, "learning_rate": 0.0001991099747365975, "loss": 1.1324, "step": 340 }, { "epoch": 5.46875, "grad_norm": 1.298770785331726, "learning_rate": 0.00019905693404435773, "loss": 1.0982, "step": 350 }, { "epoch": 5.625, "grad_norm": 1.1428149938583374, "learning_rate": 0.00019900236577165576, "loss": 1.0749, "step": 360 }, { "epoch": 5.78125, "grad_norm": 1.2018159627914429, "learning_rate": 0.00019894627076000185, "loss": 1.0501, "step": 370 }, { "epoch": 5.9375, "grad_norm": 1.2703092098236084, "learning_rate": 0.0001988886498744505, "loss": 1.077, "step": 380 }, { "epoch": 6.09375, "grad_norm": 1.2956756353378296, "learning_rate": 0.00019882950400358694, "loss": 1.0571, "step": 390 }, { "epoch": 6.25, "grad_norm": 1.324562668800354, "learning_rate": 0.00019876883405951377, "loss": 1.0494, "step": 400 }, { "epoch": 6.40625, "grad_norm": 1.2973648309707642, "learning_rate": 0.00019870664097783655, "loss": 1.0362, "step": 410 }, { "epoch": 6.5625, "grad_norm": 1.3334016799926758, "learning_rate": 0.00019864292571764955, "loss": 1.0537, "step": 420 }, { "epoch": 6.71875, "grad_norm": 1.3302708864212036, "learning_rate": 0.0001985776892615209, "loss": 1.0082, "step": 430 }, { "epoch": 6.875, "grad_norm": 1.346130609512329, "learning_rate": 0.0001985109326154774, "loss": 1.0633, "step": 440 }, { "epoch": 7.03125, "grad_norm": 1.2094783782958984, "learning_rate": 0.00019844265680898918, "loss": 1.0505, "step": 450 }, { "epoch": 7.1875, "grad_norm": 1.3239214420318604, "learning_rate": 0.00019837286289495361, "loss": 0.9989, "step": 460 }, { "epoch": 7.34375, "grad_norm": 1.3971266746520996, "learning_rate": 0.00019830155194967917, "loss": 0.9816, "step": 470 }, { "epoch": 7.5, "grad_norm": 1.3986828327178955, "learning_rate": 0.0001982287250728689, "loss": 0.9951, "step": 480 }, { "epoch": 7.65625, "grad_norm": 1.4055757522583008, "learning_rate": 0.00019815438338760327, "loss": 1.0279, "step": 490 }, { "epoch": 7.8125, "grad_norm": 1.5168254375457764, "learning_rate": 0.00019807852804032305, "loss": 1.0069, "step": 500 }, { "epoch": 7.96875, "grad_norm": 1.3308908939361572, "learning_rate": 0.00019800116020081158, "loss": 0.9957, "step": 510 }, { "epoch": 8.125, "grad_norm": 1.629763126373291, "learning_rate": 0.00019792228106217658, "loss": 0.9368, "step": 520 }, { "epoch": 8.28125, "grad_norm": 1.487622618675232, "learning_rate": 0.000197841891840832, "loss": 1.0098, "step": 530 }, { "epoch": 8.4375, "grad_norm": 1.3082317113876343, "learning_rate": 0.0001977599937764791, "loss": 0.9657, "step": 540 }, { "epoch": 8.59375, "grad_norm": 1.4622434377670288, "learning_rate": 0.00019767658813208726, "loss": 0.9627, "step": 550 }, { "epoch": 8.75, "grad_norm": 1.4137543439865112, "learning_rate": 0.00019759167619387476, "loss": 0.9818, "step": 560 }, { "epoch": 8.90625, "grad_norm": 1.4448939561843872, "learning_rate": 0.0001975052592712887, "loss": 0.932, "step": 570 }, { "epoch": 9.0625, "grad_norm": 1.4551522731781006, "learning_rate": 0.00019741733869698495, "loss": 0.9693, "step": 580 }, { "epoch": 9.21875, "grad_norm": 1.4330525398254395, "learning_rate": 0.0001973279158268075, "loss": 0.9367, "step": 590 }, { "epoch": 9.375, "grad_norm": 1.3818308115005493, "learning_rate": 0.00019723699203976766, "loss": 0.9344, "step": 600 }, { "epoch": 9.53125, "grad_norm": 1.4964330196380615, "learning_rate": 0.00019714456873802272, "loss": 0.9046, "step": 610 }, { "epoch": 9.6875, "grad_norm": 1.3817216157913208, "learning_rate": 0.00019705064734685425, "loss": 0.8933, "step": 620 }, { "epoch": 9.84375, "grad_norm": 1.6543223857879639, "learning_rate": 0.00019695522931464636, "loss": 0.937, "step": 630 }, { "epoch": 10.0, "grad_norm": 1.4744540452957153, "learning_rate": 0.0001968583161128631, "loss": 0.9789, "step": 640 }, { "epoch": 10.15625, "grad_norm": 1.5337893962860107, "learning_rate": 0.00019675990923602598, "loss": 0.8901, "step": 650 }, { "epoch": 10.3125, "grad_norm": 1.4886064529418945, "learning_rate": 0.00019666001020169073, "loss": 0.8747, "step": 660 }, { "epoch": 10.46875, "grad_norm": 1.4878489971160889, "learning_rate": 0.00019655862055042406, "loss": 0.9148, "step": 670 }, { "epoch": 10.625, "grad_norm": 1.537325143814087, "learning_rate": 0.00019645574184577982, "loss": 0.8928, "step": 680 }, { "epoch": 10.78125, "grad_norm": 1.5436025857925415, "learning_rate": 0.00019635137567427488, "loss": 0.9109, "step": 690 }, { "epoch": 10.9375, "grad_norm": 1.7200922966003418, "learning_rate": 0.00019624552364536473, "loss": 0.9218, "step": 700 }, { "epoch": 11.09375, "grad_norm": 1.4200806617736816, "learning_rate": 0.00019613818739141862, "loss": 0.8591, "step": 710 }, { "epoch": 11.25, "grad_norm": 1.618428349494934, "learning_rate": 0.0001960293685676943, "loss": 0.849, "step": 720 }, { "epoch": 11.40625, "grad_norm": 1.538055658340454, "learning_rate": 0.00019591906885231276, "loss": 0.8597, "step": 730 }, { "epoch": 11.5625, "grad_norm": 1.4808735847473145, "learning_rate": 0.00019580728994623195, "loss": 0.8691, "step": 740 }, { "epoch": 11.71875, "grad_norm": 1.5934367179870605, "learning_rate": 0.0001956940335732209, "loss": 0.9137, "step": 750 }, { "epoch": 11.875, "grad_norm": 1.584621787071228, "learning_rate": 0.00019557930147983302, "loss": 0.8624, "step": 760 }, { "epoch": 12.03125, "grad_norm": 1.503152847290039, "learning_rate": 0.0001954630954353791, "loss": 0.8308, "step": 770 }, { "epoch": 12.1875, "grad_norm": 1.3880295753479004, "learning_rate": 0.0001953454172319001, "loss": 0.7993, "step": 780 }, { "epoch": 12.34375, "grad_norm": 1.665929913520813, "learning_rate": 0.00019522626868413956, "loss": 0.8547, "step": 790 }, { "epoch": 12.5, "grad_norm": 1.6523503065109253, "learning_rate": 0.00019510565162951537, "loss": 0.8312, "step": 800 }, { "epoch": 12.65625, "grad_norm": 1.761229157447815, "learning_rate": 0.00019498356792809176, "loss": 0.8257, "step": 810 }, { "epoch": 12.8125, "grad_norm": 1.6496795415878296, "learning_rate": 0.00019486001946255046, "loss": 0.8549, "step": 820 }, { "epoch": 12.96875, "grad_norm": 1.601778507232666, "learning_rate": 0.00019473500813816164, "loss": 0.8432, "step": 830 }, { "epoch": 13.125, "grad_norm": 1.645071268081665, "learning_rate": 0.00019460853588275454, "loss": 0.8002, "step": 840 }, { "epoch": 13.28125, "grad_norm": 1.704184889793396, "learning_rate": 0.00019448060464668783, "loss": 0.8224, "step": 850 }, { "epoch": 13.4375, "grad_norm": 1.6473637819290161, "learning_rate": 0.00019435121640281938, "loss": 0.7799, "step": 860 }, { "epoch": 13.59375, "grad_norm": 1.859934687614441, "learning_rate": 0.00019422037314647598, "loss": 0.7917, "step": 870 }, { "epoch": 13.75, "grad_norm": 1.631324052810669, "learning_rate": 0.00019408807689542257, "loss": 0.8419, "step": 880 }, { "epoch": 13.90625, "grad_norm": 1.794649362564087, "learning_rate": 0.00019395432968983092, "loss": 0.8033, "step": 890 }, { "epoch": 14.0625, "grad_norm": 1.7658714056015015, "learning_rate": 0.00019381913359224842, "loss": 0.785, "step": 900 }, { "epoch": 14.21875, "grad_norm": 1.7977893352508545, "learning_rate": 0.00019368249068756613, "loss": 0.7421, "step": 910 }, { "epoch": 14.375, "grad_norm": 2.016810178756714, "learning_rate": 0.00019354440308298675, "loss": 0.7994, "step": 920 }, { "epoch": 14.53125, "grad_norm": 1.9253318309783936, "learning_rate": 0.00019340487290799184, "loss": 0.7738, "step": 930 }, { "epoch": 14.6875, "grad_norm": 1.7931846380233765, "learning_rate": 0.00019326390231430942, "loss": 0.7725, "step": 940 }, { "epoch": 14.84375, "grad_norm": 1.9552454948425293, "learning_rate": 0.00019312149347588037, "loss": 0.7717, "step": 950 }, { "epoch": 15.0, "grad_norm": 1.7500364780426025, "learning_rate": 0.00019297764858882514, "loss": 0.7987, "step": 960 }, { "epoch": 15.15625, "grad_norm": 1.8716918230056763, "learning_rate": 0.00019283236987140988, "loss": 0.7371, "step": 970 }, { "epoch": 15.3125, "grad_norm": 1.7724356651306152, "learning_rate": 0.00019268565956401208, "loss": 0.7553, "step": 980 }, { "epoch": 15.46875, "grad_norm": 1.8796881437301636, "learning_rate": 0.00019253751992908622, "loss": 0.7531, "step": 990 }, { "epoch": 15.625, "grad_norm": 1.8478586673736572, "learning_rate": 0.0001923879532511287, "loss": 0.7347, "step": 1000 }, { "epoch": 15.78125, "grad_norm": 1.9052739143371582, "learning_rate": 0.00019223696183664267, "loss": 0.7333, "step": 1010 }, { "epoch": 15.9375, "grad_norm": 2.0365710258483887, "learning_rate": 0.00019208454801410266, "loss": 0.7623, "step": 1020 }, { "epoch": 16.09375, "grad_norm": 2.4890291690826416, "learning_rate": 0.00019193071413391822, "loss": 0.7259, "step": 1030 }, { "epoch": 16.25, "grad_norm": 1.7967720031738281, "learning_rate": 0.00019177546256839812, "loss": 0.7239, "step": 1040 }, { "epoch": 16.40625, "grad_norm": 1.8451858758926392, "learning_rate": 0.00019161879571171362, "loss": 0.7212, "step": 1050 }, { "epoch": 16.5625, "grad_norm": 2.0714006423950195, "learning_rate": 0.00019146071597986138, "loss": 0.6777, "step": 1060 }, { "epoch": 16.71875, "grad_norm": 1.8511077165603638, "learning_rate": 0.00019130122581062644, "loss": 0.7451, "step": 1070 }, { "epoch": 16.875, "grad_norm": 1.961347222328186, "learning_rate": 0.00019114032766354453, "loss": 0.7201, "step": 1080 }, { "epoch": 17.03125, "grad_norm": 1.7642933130264282, "learning_rate": 0.0001909780240198641, "loss": 0.7593, "step": 1090 }, { "epoch": 17.1875, "grad_norm": 1.9495702981948853, "learning_rate": 0.00019081431738250814, "loss": 0.6792, "step": 1100 }, { "epoch": 17.34375, "grad_norm": 2.0456838607788086, "learning_rate": 0.00019064921027603547, "loss": 0.6827, "step": 1110 }, { "epoch": 17.5, "grad_norm": 2.0225956439971924, "learning_rate": 0.00019048270524660196, "loss": 0.7065, "step": 1120 }, { "epoch": 17.65625, "grad_norm": 2.0372891426086426, "learning_rate": 0.00019031480486192114, "loss": 0.6961, "step": 1130 }, { "epoch": 17.8125, "grad_norm": 1.9680888652801514, "learning_rate": 0.00019014551171122457, "loss": 0.6928, "step": 1140 }, { "epoch": 17.96875, "grad_norm": 1.925260305404663, "learning_rate": 0.00018997482840522217, "loss": 0.7214, "step": 1150 }, { "epoch": 18.125, "grad_norm": 2.222017526626587, "learning_rate": 0.00018980275757606157, "loss": 0.6755, "step": 1160 }, { "epoch": 18.28125, "grad_norm": 1.8654993772506714, "learning_rate": 0.0001896293018772879, "loss": 0.6734, "step": 1170 }, { "epoch": 18.4375, "grad_norm": 2.079108953475952, "learning_rate": 0.0001894544639838025, "loss": 0.6914, "step": 1180 }, { "epoch": 18.59375, "grad_norm": 2.1051278114318848, "learning_rate": 0.0001892782465918221, "loss": 0.6685, "step": 1190 }, { "epoch": 18.75, "grad_norm": 2.200761079788208, "learning_rate": 0.0001891006524188368, "loss": 0.6701, "step": 1200 }, { "epoch": 18.90625, "grad_norm": 2.0247373580932617, "learning_rate": 0.00018892168420356847, "loss": 0.644, "step": 1210 }, { "epoch": 19.0625, "grad_norm": 1.7870880365371704, "learning_rate": 0.00018874134470592835, "loss": 0.6479, "step": 1220 }, { "epoch": 19.21875, "grad_norm": 2.407914638519287, "learning_rate": 0.0001885596367069746, "loss": 0.6514, "step": 1230 }, { "epoch": 19.375, "grad_norm": 2.0643310546875, "learning_rate": 0.00018837656300886937, "loss": 0.623, "step": 1240 }, { "epoch": 19.53125, "grad_norm": 1.943108081817627, "learning_rate": 0.0001881921264348355, "loss": 0.6305, "step": 1250 }, { "epoch": 19.6875, "grad_norm": 1.9791898727416992, "learning_rate": 0.00018800632982911322, "loss": 0.6691, "step": 1260 }, { "epoch": 19.84375, "grad_norm": 1.9491395950317383, "learning_rate": 0.0001878191760569159, "loss": 0.6466, "step": 1270 }, { "epoch": 20.0, "grad_norm": 2.236191987991333, "learning_rate": 0.00018763066800438636, "loss": 0.6777, "step": 1280 }, { "epoch": 20.15625, "grad_norm": 2.0054900646209717, "learning_rate": 0.00018744080857855191, "loss": 0.5978, "step": 1290 }, { "epoch": 20.3125, "grad_norm": 1.9910144805908203, "learning_rate": 0.00018724960070727972, "loss": 0.616, "step": 1300 }, { "epoch": 20.46875, "grad_norm": 2.0548858642578125, "learning_rate": 0.00018705704733923176, "loss": 0.6339, "step": 1310 }, { "epoch": 20.625, "grad_norm": 2.190333843231201, "learning_rate": 0.00018686315144381913, "loss": 0.6166, "step": 1320 }, { "epoch": 20.78125, "grad_norm": 2.069106101989746, "learning_rate": 0.00018666791601115642, "loss": 0.6425, "step": 1330 }, { "epoch": 20.9375, "grad_norm": 1.9265080690383911, "learning_rate": 0.0001864713440520155, "loss": 0.633, "step": 1340 }, { "epoch": 21.09375, "grad_norm": 2.2421185970306396, "learning_rate": 0.0001862734385977792, "loss": 0.6081, "step": 1350 }, { "epoch": 21.25, "grad_norm": 2.1831634044647217, "learning_rate": 0.0001860742027003944, "loss": 0.5864, "step": 1360 }, { "epoch": 21.40625, "grad_norm": 2.443437337875366, "learning_rate": 0.00018587363943232504, "loss": 0.6257, "step": 1370 }, { "epoch": 21.5625, "grad_norm": 2.1479828357696533, "learning_rate": 0.00018567175188650498, "loss": 0.6078, "step": 1380 }, { "epoch": 21.71875, "grad_norm": 2.1301865577697754, "learning_rate": 0.0001854685431762898, "loss": 0.5923, "step": 1390 }, { "epoch": 21.875, "grad_norm": 2.476592779159546, "learning_rate": 0.00018526401643540922, "loss": 0.5948, "step": 1400 }, { "epoch": 22.03125, "grad_norm": 1.8967773914337158, "learning_rate": 0.00018505817481791862, "loss": 0.603, "step": 1410 }, { "epoch": 22.1875, "grad_norm": 2.07448148727417, "learning_rate": 0.00018485102149815038, "loss": 0.5644, "step": 1420 }, { "epoch": 22.34375, "grad_norm": 2.0168447494506836, "learning_rate": 0.00018464255967066493, "loss": 0.5823, "step": 1430 }, { "epoch": 22.5, "grad_norm": 2.0524818897247314, "learning_rate": 0.00018443279255020152, "loss": 0.5657, "step": 1440 }, { "epoch": 22.65625, "grad_norm": 2.0055761337280273, "learning_rate": 0.00018422172337162867, "loss": 0.6089, "step": 1450 }, { "epoch": 22.8125, "grad_norm": 2.2025387287139893, "learning_rate": 0.0001840093553898942, "loss": 0.5753, "step": 1460 }, { "epoch": 22.96875, "grad_norm": 2.3673453330993652, "learning_rate": 0.00018379569187997513, "loss": 0.5791, "step": 1470 }, { "epoch": 23.125, "grad_norm": 2.2405545711517334, "learning_rate": 0.00018358073613682706, "loss": 0.542, "step": 1480 }, { "epoch": 23.28125, "grad_norm": 2.062706708908081, "learning_rate": 0.00018336449147533345, "loss": 0.582, "step": 1490 }, { "epoch": 23.4375, "grad_norm": 2.4270834922790527, "learning_rate": 0.00018314696123025454, "loss": 0.5674, "step": 1500 }, { "epoch": 23.59375, "grad_norm": 2.288257122039795, "learning_rate": 0.00018292814875617576, "loss": 0.551, "step": 1510 }, { "epoch": 23.75, "grad_norm": 2.270174503326416, "learning_rate": 0.00018270805742745617, "loss": 0.5723, "step": 1520 }, { "epoch": 23.90625, "grad_norm": 2.3951213359832764, "learning_rate": 0.00018248669063817636, "loss": 0.5529, "step": 1530 }, { "epoch": 24.0625, "grad_norm": 2.5293216705322266, "learning_rate": 0.000182264051802086, "loss": 0.5531, "step": 1540 }, { "epoch": 24.21875, "grad_norm": 2.5577666759490967, "learning_rate": 0.00018204014435255135, "loss": 0.5189, "step": 1550 }, { "epoch": 24.375, "grad_norm": 2.211329460144043, "learning_rate": 0.00018181497174250236, "loss": 0.5425, "step": 1560 }, { "epoch": 24.53125, "grad_norm": 2.0459985733032227, "learning_rate": 0.00018158853744437914, "loss": 0.5486, "step": 1570 }, { "epoch": 24.6875, "grad_norm": 2.2284512519836426, "learning_rate": 0.00018136084495007872, "loss": 0.5268, "step": 1580 }, { "epoch": 24.84375, "grad_norm": 2.029223918914795, "learning_rate": 0.000181131897770901, "loss": 0.5421, "step": 1590 }, { "epoch": 25.0, "grad_norm": 2.371617078781128, "learning_rate": 0.00018090169943749476, "loss": 0.5612, "step": 1600 }, { "epoch": 25.15625, "grad_norm": 2.1253976821899414, "learning_rate": 0.000180670253499803, "loss": 0.5142, "step": 1610 }, { "epoch": 25.3125, "grad_norm": 2.6901040077209473, "learning_rate": 0.00018043756352700846, "loss": 0.5331, "step": 1620 }, { "epoch": 25.46875, "grad_norm": 2.1748178005218506, "learning_rate": 0.00018020363310747834, "loss": 0.5031, "step": 1630 }, { "epoch": 25.625, "grad_norm": 2.2645554542541504, "learning_rate": 0.00017996846584870908, "loss": 0.5171, "step": 1640 }, { "epoch": 25.78125, "grad_norm": 2.1726343631744385, "learning_rate": 0.00017973206537727073, "loss": 0.5104, "step": 1650 }, { "epoch": 25.9375, "grad_norm": 2.242905855178833, "learning_rate": 0.000179494435338751, "loss": 0.5543, "step": 1660 }, { "epoch": 26.09375, "grad_norm": 2.870544195175171, "learning_rate": 0.0001792555793976991, "loss": 0.4795, "step": 1670 }, { "epoch": 26.25, "grad_norm": 2.156977891921997, "learning_rate": 0.00017901550123756906, "loss": 0.4995, "step": 1680 }, { "epoch": 26.40625, "grad_norm": 2.332562208175659, "learning_rate": 0.0001787742045606631, "loss": 0.4938, "step": 1690 }, { "epoch": 26.5625, "grad_norm": 2.302238702774048, "learning_rate": 0.00017853169308807448, "loss": 0.5322, "step": 1700 }, { "epoch": 26.71875, "grad_norm": 2.610508680343628, "learning_rate": 0.00017828797055963018, "loss": 0.5243, "step": 1710 }, { "epoch": 26.875, "grad_norm": 2.429201126098633, "learning_rate": 0.000178043040733833, "loss": 0.4932, "step": 1720 }, { "epoch": 27.03125, "grad_norm": 2.024339199066162, "learning_rate": 0.00017779690738780387, "loss": 0.516, "step": 1730 }, { "epoch": 27.1875, "grad_norm": 2.3692007064819336, "learning_rate": 0.00017754957431722346, "loss": 0.4796, "step": 1740 }, { "epoch": 27.34375, "grad_norm": 2.3139917850494385, "learning_rate": 0.0001773010453362737, "loss": 0.4633, "step": 1750 }, { "epoch": 27.5, "grad_norm": 2.060307502746582, "learning_rate": 0.00017705132427757895, "loss": 0.4818, "step": 1760 }, { "epoch": 27.65625, "grad_norm": 2.431605577468872, "learning_rate": 0.00017680041499214678, "loss": 0.4941, "step": 1770 }, { "epoch": 27.8125, "grad_norm": 2.228679895401001, "learning_rate": 0.00017654832134930882, "loss": 0.4937, "step": 1780 }, { "epoch": 27.96875, "grad_norm": 2.1984705924987793, "learning_rate": 0.00017629504723666088, "loss": 0.5022, "step": 1790 }, { "epoch": 28.125, "grad_norm": 2.8257086277008057, "learning_rate": 0.0001760405965600031, "loss": 0.4586, "step": 1800 }, { "epoch": 28.28125, "grad_norm": 2.157179594039917, "learning_rate": 0.0001757849732432797, "loss": 0.4596, "step": 1810 }, { "epoch": 28.4375, "grad_norm": 2.935502290725708, "learning_rate": 0.00017552818122851838, "loss": 0.4741, "step": 1820 }, { "epoch": 28.59375, "grad_norm": 2.6089301109313965, "learning_rate": 0.0001752702244757697, "loss": 0.4661, "step": 1830 }, { "epoch": 28.75, "grad_norm": 2.3858306407928467, "learning_rate": 0.00017501110696304596, "loss": 0.4942, "step": 1840 }, { "epoch": 28.90625, "grad_norm": 2.238586187362671, "learning_rate": 0.0001747508326862597, "loss": 0.461, "step": 1850 }, { "epoch": 29.0625, "grad_norm": 2.3924996852874756, "learning_rate": 0.00017448940565916222, "loss": 0.478, "step": 1860 }, { "epoch": 29.21875, "grad_norm": 2.1921300888061523, "learning_rate": 0.0001742268299132817, "loss": 0.4361, "step": 1870 }, { "epoch": 29.375, "grad_norm": 2.3493051528930664, "learning_rate": 0.000173963109497861, "loss": 0.4497, "step": 1880 }, { "epoch": 29.53125, "grad_norm": 2.3282814025878906, "learning_rate": 0.0001736982484797951, "loss": 0.4601, "step": 1890 }, { "epoch": 29.6875, "grad_norm": 2.0696048736572266, "learning_rate": 0.00017343225094356855, "loss": 0.4692, "step": 1900 }, { "epoch": 29.84375, "grad_norm": 2.1270716190338135, "learning_rate": 0.0001731651209911925, "loss": 0.4467, "step": 1910 }, { "epoch": 30.0, "grad_norm": 2.4350905418395996, "learning_rate": 0.00017289686274214118, "loss": 0.4578, "step": 1920 }, { "epoch": 30.15625, "grad_norm": 2.1730527877807617, "learning_rate": 0.00017262748033328867, "loss": 0.4147, "step": 1930 }, { "epoch": 30.3125, "grad_norm": 2.2917675971984863, "learning_rate": 0.00017235697791884494, "loss": 0.4064, "step": 1940 }, { "epoch": 30.46875, "grad_norm": 2.6113858222961426, "learning_rate": 0.00017208535967029188, "loss": 0.4486, "step": 1950 }, { "epoch": 30.625, "grad_norm": 2.4307138919830322, "learning_rate": 0.00017181262977631888, "loss": 0.4359, "step": 1960 }, { "epoch": 30.78125, "grad_norm": 2.257676601409912, "learning_rate": 0.0001715387924427583, "loss": 0.4759, "step": 1970 }, { "epoch": 30.9375, "grad_norm": 2.196704864501953, "learning_rate": 0.00017126385189252053, "loss": 0.4618, "step": 1980 }, { "epoch": 31.09375, "grad_norm": 2.23207950592041, "learning_rate": 0.00017098781236552903, "loss": 0.4165, "step": 1990 }, { "epoch": 31.25, "grad_norm": 2.283047914505005, "learning_rate": 0.00017071067811865476, "loss": 0.4206, "step": 2000 }, { "epoch": 31.40625, "grad_norm": 2.4019672870635986, "learning_rate": 0.00017043245342565063, "loss": 0.4286, "step": 2010 }, { "epoch": 31.5625, "grad_norm": 2.395625591278076, "learning_rate": 0.0001701531425770856, "loss": 0.4219, "step": 2020 }, { "epoch": 31.71875, "grad_norm": 2.2177236080169678, "learning_rate": 0.00016987274988027843, "loss": 0.4338, "step": 2030 }, { "epoch": 31.875, "grad_norm": 2.526252269744873, "learning_rate": 0.00016959127965923142, "loss": 0.4337, "step": 2040 }, { "epoch": 32.03125, "grad_norm": 2.5658786296844482, "learning_rate": 0.0001693087362545636, "loss": 0.4374, "step": 2050 }, { "epoch": 32.1875, "grad_norm": 2.5486271381378174, "learning_rate": 0.00016902512402344373, "loss": 0.41, "step": 2060 }, { "epoch": 32.34375, "grad_norm": 2.4342575073242188, "learning_rate": 0.00016874044733952327, "loss": 0.4051, "step": 2070 }, { "epoch": 32.5, "grad_norm": 2.0787465572357178, "learning_rate": 0.00016845471059286887, "loss": 0.4065, "step": 2080 }, { "epoch": 32.65625, "grad_norm": 2.413379430770874, "learning_rate": 0.00016816791818989466, "loss": 0.4121, "step": 2090 }, { "epoch": 32.8125, "grad_norm": 2.255675792694092, "learning_rate": 0.0001678800745532942, "loss": 0.4199, "step": 2100 }, { "epoch": 32.96875, "grad_norm": 2.7167301177978516, "learning_rate": 0.00016759118412197247, "loss": 0.4219, "step": 2110 }, { "epoch": 33.125, "grad_norm": 2.5120389461517334, "learning_rate": 0.00016730125135097735, "loss": 0.4126, "step": 2120 }, { "epoch": 33.28125, "grad_norm": 2.2318458557128906, "learning_rate": 0.0001670102807114308, "loss": 0.3766, "step": 2130 }, { "epoch": 33.4375, "grad_norm": 2.6558902263641357, "learning_rate": 0.00016671827669045998, "loss": 0.3897, "step": 2140 }, { "epoch": 33.59375, "grad_norm": 2.1000514030456543, "learning_rate": 0.00016642524379112817, "loss": 0.3809, "step": 2150 }, { "epoch": 33.75, "grad_norm": 2.422773838043213, "learning_rate": 0.00016613118653236518, "loss": 0.4301, "step": 2160 }, { "epoch": 33.90625, "grad_norm": 2.6482996940612793, "learning_rate": 0.00016583610944889774, "loss": 0.419, "step": 2170 }, { "epoch": 34.0625, "grad_norm": 2.4564175605773926, "learning_rate": 0.0001655400170911794, "loss": 0.4099, "step": 2180 }, { "epoch": 34.21875, "grad_norm": 2.3099935054779053, "learning_rate": 0.0001652429140253207, "loss": 0.3742, "step": 2190 }, { "epoch": 34.375, "grad_norm": 2.5124337673187256, "learning_rate": 0.00016494480483301836, "loss": 0.3817, "step": 2200 }, { "epoch": 34.53125, "grad_norm": 2.3638722896575928, "learning_rate": 0.000164645694111485, "loss": 0.3978, "step": 2210 }, { "epoch": 34.6875, "grad_norm": 2.6030874252319336, "learning_rate": 0.0001643455864733779, "loss": 0.3908, "step": 2220 }, { "epoch": 34.84375, "grad_norm": 2.0865495204925537, "learning_rate": 0.0001640444865467281, "loss": 0.3782, "step": 2230 }, { "epoch": 35.0, "grad_norm": 2.1129355430603027, "learning_rate": 0.000163742398974869, "loss": 0.3998, "step": 2240 }, { "epoch": 35.15625, "grad_norm": 2.4917526245117188, "learning_rate": 0.00016343932841636456, "loss": 0.3733, "step": 2250 }, { "epoch": 35.3125, "grad_norm": 2.4784963130950928, "learning_rate": 0.00016313527954493778, "loss": 0.3557, "step": 2260 }, { "epoch": 35.46875, "grad_norm": 2.292530059814453, "learning_rate": 0.00016283025704939838, "loss": 0.3821, "step": 2270 }, { "epoch": 35.625, "grad_norm": 2.420898199081421, "learning_rate": 0.00016252426563357055, "loss": 0.382, "step": 2280 }, { "epoch": 35.78125, "grad_norm": 2.4212636947631836, "learning_rate": 0.0001622173100162204, "loss": 0.3963, "step": 2290 }, { "epoch": 35.9375, "grad_norm": 2.462033987045288, "learning_rate": 0.00016190939493098344, "loss": 0.3786, "step": 2300 }, { "epoch": 36.09375, "grad_norm": 2.4041311740875244, "learning_rate": 0.000161600525126291, "loss": 0.3625, "step": 2310 }, { "epoch": 36.25, "grad_norm": 2.265742540359497, "learning_rate": 0.00016129070536529766, "loss": 0.36, "step": 2320 }, { "epoch": 36.40625, "grad_norm": 2.326496124267578, "learning_rate": 0.0001609799404258074, "loss": 0.3655, "step": 2330 }, { "epoch": 36.5625, "grad_norm": 2.161198139190674, "learning_rate": 0.00016066823510019998, "loss": 0.3545, "step": 2340 }, { "epoch": 36.71875, "grad_norm": 2.603285312652588, "learning_rate": 0.00016035559419535716, "loss": 0.3748, "step": 2350 }, { "epoch": 36.875, "grad_norm": 2.2837438583374023, "learning_rate": 0.00016004202253258842, "loss": 0.3791, "step": 2360 }, { "epoch": 37.03125, "grad_norm": 2.172354221343994, "learning_rate": 0.00015972752494755672, "loss": 0.3756, "step": 2370 }, { "epoch": 37.1875, "grad_norm": 2.015866756439209, "learning_rate": 0.00015941210629020388, "loss": 0.344, "step": 2380 }, { "epoch": 37.34375, "grad_norm": 2.2509257793426514, "learning_rate": 0.00015909577142467573, "loss": 0.3454, "step": 2390 }, { "epoch": 37.5, "grad_norm": 2.574324131011963, "learning_rate": 0.00015877852522924732, "loss": 0.3518, "step": 2400 }, { "epoch": 37.65625, "grad_norm": 2.231295108795166, "learning_rate": 0.00015846037259624736, "loss": 0.3607, "step": 2410 }, { "epoch": 37.8125, "grad_norm": 2.4786765575408936, "learning_rate": 0.00015814131843198308, "loss": 0.3568, "step": 2420 }, { "epoch": 37.96875, "grad_norm": 2.8630552291870117, "learning_rate": 0.0001578213676566643, "loss": 0.3787, "step": 2430 }, { "epoch": 38.125, "grad_norm": 2.386781692504883, "learning_rate": 0.00015750052520432787, "loss": 0.3387, "step": 2440 }, { "epoch": 38.28125, "grad_norm": 2.238036632537842, "learning_rate": 0.00015717879602276122, "loss": 0.3514, "step": 2450 }, { "epoch": 38.4375, "grad_norm": 2.1005611419677734, "learning_rate": 0.0001568561850734264, "loss": 0.3522, "step": 2460 }, { "epoch": 38.59375, "grad_norm": 2.509990930557251, "learning_rate": 0.00015653269733138328, "loss": 0.3429, "step": 2470 }, { "epoch": 38.75, "grad_norm": 2.3400230407714844, "learning_rate": 0.00015620833778521307, "loss": 0.3429, "step": 2480 }, { "epoch": 38.90625, "grad_norm": 2.3299190998077393, "learning_rate": 0.00015588311143694116, "loss": 0.3623, "step": 2490 }, { "epoch": 39.0625, "grad_norm": 2.1369211673736572, "learning_rate": 0.00015555702330196023, "loss": 0.3222, "step": 2500 }, { "epoch": 39.21875, "grad_norm": 2.5863115787506104, "learning_rate": 0.0001552300784089527, "loss": 0.3244, "step": 2510 }, { "epoch": 39.375, "grad_norm": 2.0398120880126953, "learning_rate": 0.0001549022817998132, "loss": 0.336, "step": 2520 }, { "epoch": 39.53125, "grad_norm": 2.0763356685638428, "learning_rate": 0.000154573638529571, "loss": 0.3179, "step": 2530 }, { "epoch": 39.6875, "grad_norm": 2.39857816696167, "learning_rate": 0.00015424415366631188, "loss": 0.3508, "step": 2540 }, { "epoch": 39.84375, "grad_norm": 2.5503716468811035, "learning_rate": 0.00015391383229110007, "loss": 0.3489, "step": 2550 }, { "epoch": 40.0, "grad_norm": 2.2811882495880127, "learning_rate": 0.00015358267949789966, "loss": 0.3612, "step": 2560 }, { "epoch": 40.15625, "grad_norm": 1.873091459274292, "learning_rate": 0.00015325070039349655, "loss": 0.3116, "step": 2570 }, { "epoch": 40.3125, "grad_norm": 2.053217887878418, "learning_rate": 0.00015291790009741907, "loss": 0.3229, "step": 2580 }, { "epoch": 40.46875, "grad_norm": 2.6282942295074463, "learning_rate": 0.00015258428374185956, "loss": 0.3257, "step": 2590 }, { "epoch": 40.625, "grad_norm": 2.7164459228515625, "learning_rate": 0.0001522498564715949, "loss": 0.3414, "step": 2600 }, { "epoch": 40.78125, "grad_norm": 1.9660325050354004, "learning_rate": 0.0001519146234439073, "loss": 0.3296, "step": 2610 }, { "epoch": 40.9375, "grad_norm": 2.421337842941284, "learning_rate": 0.00015157858982850475, "loss": 0.3425, "step": 2620 }, { "epoch": 41.09375, "grad_norm": 2.4952664375305176, "learning_rate": 0.00015124176080744134, "loss": 0.3221, "step": 2630 }, { "epoch": 41.25, "grad_norm": 2.1518049240112305, "learning_rate": 0.00015090414157503714, "loss": 0.3121, "step": 2640 }, { "epoch": 41.40625, "grad_norm": 2.6512484550476074, "learning_rate": 0.00015056573733779848, "loss": 0.321, "step": 2650 }, { "epoch": 41.5625, "grad_norm": 2.3250184059143066, "learning_rate": 0.00015022655331433727, "loss": 0.3237, "step": 2660 }, { "epoch": 41.71875, "grad_norm": 2.1326417922973633, "learning_rate": 0.00014988659473529075, "loss": 0.3228, "step": 2670 }, { "epoch": 41.875, "grad_norm": 2.113276958465576, "learning_rate": 0.00014954586684324078, "loss": 0.3161, "step": 2680 }, { "epoch": 42.03125, "grad_norm": 2.1591405868530273, "learning_rate": 0.0001492043748926329, "loss": 0.3217, "step": 2690 }, { "epoch": 42.1875, "grad_norm": 2.3071820735931396, "learning_rate": 0.00014886212414969553, "loss": 0.3088, "step": 2700 }, { "epoch": 42.34375, "grad_norm": 2.0190141201019287, "learning_rate": 0.0001485191198923584, "loss": 0.3015, "step": 2710 }, { "epoch": 42.5, "grad_norm": 2.286022901535034, "learning_rate": 0.00014817536741017152, "loss": 0.3204, "step": 2720 }, { "epoch": 42.65625, "grad_norm": 2.2908082008361816, "learning_rate": 0.00014783087200422344, "loss": 0.3051, "step": 2730 }, { "epoch": 42.8125, "grad_norm": 2.825416326522827, "learning_rate": 0.00014748563898705946, "loss": 0.3305, "step": 2740 }, { "epoch": 42.96875, "grad_norm": 2.2812416553497314, "learning_rate": 0.0001471396736825998, "loss": 0.3176, "step": 2750 }, { "epoch": 43.125, "grad_norm": 2.5278165340423584, "learning_rate": 0.00014679298142605734, "loss": 0.316, "step": 2760 }, { "epoch": 43.28125, "grad_norm": 2.1231706142425537, "learning_rate": 0.00014644556756385565, "loss": 0.2954, "step": 2770 }, { "epoch": 43.4375, "grad_norm": 2.234170913696289, "learning_rate": 0.00014609743745354624, "loss": 0.3081, "step": 2780 }, { "epoch": 43.59375, "grad_norm": 2.217315435409546, "learning_rate": 0.00014574859646372605, "loss": 0.2936, "step": 2790 }, { "epoch": 43.75, "grad_norm": 2.164661407470703, "learning_rate": 0.00014539904997395468, "loss": 0.3105, "step": 2800 }, { "epoch": 43.90625, "grad_norm": 2.3690619468688965, "learning_rate": 0.00014504880337467145, "loss": 0.321, "step": 2810 }, { "epoch": 44.0625, "grad_norm": 2.202134370803833, "learning_rate": 0.00014469786206711214, "loss": 0.2908, "step": 2820 }, { "epoch": 44.21875, "grad_norm": 2.483181953430176, "learning_rate": 0.00014434623146322587, "loss": 0.2933, "step": 2830 }, { "epoch": 44.375, "grad_norm": 2.187701463699341, "learning_rate": 0.00014399391698559152, "loss": 0.3073, "step": 2840 }, { "epoch": 44.53125, "grad_norm": 2.346254587173462, "learning_rate": 0.0001436409240673342, "loss": 0.3062, "step": 2850 }, { "epoch": 44.6875, "grad_norm": 2.4697351455688477, "learning_rate": 0.00014328725815204144, "loss": 0.2915, "step": 2860 }, { "epoch": 44.84375, "grad_norm": 2.093778371810913, "learning_rate": 0.00014293292469367902, "loss": 0.3092, "step": 2870 }, { "epoch": 45.0, "grad_norm": 2.5177674293518066, "learning_rate": 0.00014257792915650728, "loss": 0.3092, "step": 2880 }, { "epoch": 45.15625, "grad_norm": 1.9424992799758911, "learning_rate": 0.00014222227701499656, "loss": 0.2945, "step": 2890 }, { "epoch": 45.3125, "grad_norm": 2.399775981903076, "learning_rate": 0.0001418659737537428, "loss": 0.2849, "step": 2900 }, { "epoch": 45.46875, "grad_norm": 2.5698862075805664, "learning_rate": 0.00014150902486738314, "loss": 0.293, "step": 2910 }, { "epoch": 45.625, "grad_norm": 2.2189462184906006, "learning_rate": 0.00014115143586051088, "loss": 0.3007, "step": 2920 }, { "epoch": 45.78125, "grad_norm": 2.7480437755584717, "learning_rate": 0.00014079321224759093, "loss": 0.2906, "step": 2930 }, { "epoch": 45.9375, "grad_norm": 2.392237663269043, "learning_rate": 0.00014043435955287452, "loss": 0.2981, "step": 2940 }, { "epoch": 46.09375, "grad_norm": 2.056471109390259, "learning_rate": 0.0001400748833103141, "loss": 0.2773, "step": 2950 }, { "epoch": 46.25, "grad_norm": 1.9949535131454468, "learning_rate": 0.00013971478906347806, "loss": 0.2852, "step": 2960 }, { "epoch": 46.40625, "grad_norm": 2.4332056045532227, "learning_rate": 0.00013935408236546515, "loss": 0.2846, "step": 2970 }, { "epoch": 46.5625, "grad_norm": 2.5237514972686768, "learning_rate": 0.00013899276877881884, "loss": 0.2864, "step": 2980 }, { "epoch": 46.71875, "grad_norm": 2.497544050216675, "learning_rate": 0.00013863085387544162, "loss": 0.2883, "step": 2990 }, { "epoch": 46.875, "grad_norm": 2.5128774642944336, "learning_rate": 0.000138268343236509, "loss": 0.2993, "step": 3000 }, { "epoch": 47.03125, "grad_norm": 1.5527375936508179, "learning_rate": 0.0001379052424523835, "loss": 0.2926, "step": 3010 }, { "epoch": 47.1875, "grad_norm": 2.4135637283325195, "learning_rate": 0.00013754155712252832, "loss": 0.2769, "step": 3020 }, { "epoch": 47.34375, "grad_norm": 2.0572760105133057, "learning_rate": 0.00013717729285542122, "loss": 0.2841, "step": 3030 }, { "epoch": 47.5, "grad_norm": 2.7703893184661865, "learning_rate": 0.00013681245526846783, "loss": 0.2704, "step": 3040 }, { "epoch": 47.65625, "grad_norm": 2.430634021759033, "learning_rate": 0.000136447049987915, "loss": 0.2842, "step": 3050 }, { "epoch": 47.8125, "grad_norm": 2.4779250621795654, "learning_rate": 0.0001360810826487642, "loss": 0.2922, "step": 3060 }, { "epoch": 47.96875, "grad_norm": 2.3544280529022217, "learning_rate": 0.00013571455889468457, "loss": 0.2909, "step": 3070 }, { "epoch": 48.125, "grad_norm": 1.854184627532959, "learning_rate": 0.00013534748437792573, "loss": 0.2672, "step": 3080 }, { "epoch": 48.28125, "grad_norm": 2.352564573287964, "learning_rate": 0.00013497986475923088, "loss": 0.2725, "step": 3090 }, { "epoch": 48.4375, "grad_norm": 2.332357883453369, "learning_rate": 0.0001346117057077493, "loss": 0.2736, "step": 3100 }, { "epoch": 48.59375, "grad_norm": 1.9822356700897217, "learning_rate": 0.000134243012900949, "loss": 0.2735, "step": 3110 }, { "epoch": 48.75, "grad_norm": 2.2452166080474854, "learning_rate": 0.00013387379202452917, "loss": 0.2793, "step": 3120 }, { "epoch": 48.90625, "grad_norm": 1.6397210359573364, "learning_rate": 0.0001335040487723324, "loss": 0.2823, "step": 3130 }, { "epoch": 49.0625, "grad_norm": 2.0738863945007324, "learning_rate": 0.0001331337888462571, "loss": 0.267, "step": 3140 }, { "epoch": 49.21875, "grad_norm": 1.7931309938430786, "learning_rate": 0.00013276301795616936, "loss": 0.2685, "step": 3150 }, { "epoch": 49.375, "grad_norm": 1.9530521631240845, "learning_rate": 0.00013239174181981495, "loss": 0.2606, "step": 3160 }, { "epoch": 49.53125, "grad_norm": 2.3546273708343506, "learning_rate": 0.00013201996616273118, "loss": 0.269, "step": 3170 }, { "epoch": 49.6875, "grad_norm": 2.2347261905670166, "learning_rate": 0.00013164769671815862, "loss": 0.274, "step": 3180 }, { "epoch": 49.84375, "grad_norm": 2.118546485900879, "learning_rate": 0.0001312749392269526, "loss": 0.2775, "step": 3190 }, { "epoch": 50.0, "grad_norm": 2.236036777496338, "learning_rate": 0.00013090169943749476, "loss": 0.28, "step": 3200 }, { "epoch": 50.15625, "grad_norm": 1.9887173175811768, "learning_rate": 0.00013052798310560438, "loss": 0.2666, "step": 3210 }, { "epoch": 50.3125, "grad_norm": 2.284052610397339, "learning_rate": 0.00013015379599444957, "loss": 0.2616, "step": 3220 }, { "epoch": 50.46875, "grad_norm": 1.860603928565979, "learning_rate": 0.00012977914387445855, "loss": 0.2665, "step": 3230 }, { "epoch": 50.625, "grad_norm": 1.7840187549591064, "learning_rate": 0.0001294040325232304, "loss": 0.2648, "step": 3240 }, { "epoch": 50.78125, "grad_norm": 2.2570018768310547, "learning_rate": 0.00012902846772544624, "loss": 0.258, "step": 3250 }, { "epoch": 50.9375, "grad_norm": 1.921676516532898, "learning_rate": 0.00012865245527277986, "loss": 0.269, "step": 3260 }, { "epoch": 51.09375, "grad_norm": 2.2320163249969482, "learning_rate": 0.00012827600096380833, "loss": 0.2676, "step": 3270 }, { "epoch": 51.25, "grad_norm": 1.9506572484970093, "learning_rate": 0.00012789911060392294, "loss": 0.2564, "step": 3280 }, { "epoch": 51.40625, "grad_norm": 1.990869164466858, "learning_rate": 0.00012752179000523917, "loss": 0.2611, "step": 3290 }, { "epoch": 51.5625, "grad_norm": 2.245129108428955, "learning_rate": 0.00012714404498650743, "loss": 0.2619, "step": 3300 }, { "epoch": 51.71875, "grad_norm": 2.441230535507202, "learning_rate": 0.00012676588137302327, "loss": 0.2638, "step": 3310 }, { "epoch": 51.875, "grad_norm": 1.967659831047058, "learning_rate": 0.0001263873049965373, "loss": 0.2574, "step": 3320 }, { "epoch": 52.03125, "grad_norm": 1.8834177255630493, "learning_rate": 0.00012600832169516567, "loss": 0.2602, "step": 3330 }, { "epoch": 52.1875, "grad_norm": 2.202791452407837, "learning_rate": 0.00012562893731329967, "loss": 0.2369, "step": 3340 }, { "epoch": 52.34375, "grad_norm": 1.9373564720153809, "learning_rate": 0.0001252491577015158, "loss": 0.2615, "step": 3350 }, { "epoch": 52.5, "grad_norm": 1.8653074502944946, "learning_rate": 0.0001248689887164855, "loss": 0.2513, "step": 3360 }, { "epoch": 52.65625, "grad_norm": 2.0396597385406494, "learning_rate": 0.00012448843622088476, "loss": 0.2605, "step": 3370 }, { "epoch": 52.8125, "grad_norm": 1.9073066711425781, "learning_rate": 0.00012410750608330388, "loss": 0.2611, "step": 3380 }, { "epoch": 52.96875, "grad_norm": 2.09374737739563, "learning_rate": 0.00012372620417815678, "loss": 0.2665, "step": 3390 }, { "epoch": 53.125, "grad_norm": 2.1771106719970703, "learning_rate": 0.00012334453638559057, "loss": 0.2334, "step": 3400 }, { "epoch": 53.28125, "grad_norm": 2.0365121364593506, "learning_rate": 0.0001229625085913947, "loss": 0.2471, "step": 3410 }, { "epoch": 53.4375, "grad_norm": 2.06864595413208, "learning_rate": 0.0001225801266869104, "loss": 0.2585, "step": 3420 }, { "epoch": 53.59375, "grad_norm": 1.9580702781677246, "learning_rate": 0.0001221973965689396, "loss": 0.2543, "step": 3430 }, { "epoch": 53.75, "grad_norm": 1.988387107849121, "learning_rate": 0.00012181432413965428, "loss": 0.2428, "step": 3440 }, { "epoch": 53.90625, "grad_norm": 2.0155484676361084, "learning_rate": 0.00012143091530650508, "loss": 0.2635, "step": 3450 }, { "epoch": 54.0625, "grad_norm": 1.7660094499588013, "learning_rate": 0.00012104717598213056, "loss": 0.2456, "step": 3460 }, { "epoch": 54.21875, "grad_norm": 2.1635379791259766, "learning_rate": 0.00012066311208426581, "loss": 0.2319, "step": 3470 }, { "epoch": 54.375, "grad_norm": 1.7314647436141968, "learning_rate": 0.00012027872953565125, "loss": 0.2486, "step": 3480 }, { "epoch": 54.53125, "grad_norm": 1.8251374959945679, "learning_rate": 0.00011989403426394125, "loss": 0.2429, "step": 3490 }, { "epoch": 54.6875, "grad_norm": 2.038282632827759, "learning_rate": 0.00011950903220161285, "loss": 0.249, "step": 3500 }, { "epoch": 54.84375, "grad_norm": 2.0042312145233154, "learning_rate": 0.00011912372928587406, "loss": 0.2563, "step": 3510 }, { "epoch": 55.0, "grad_norm": 2.0749430656433105, "learning_rate": 0.00011873813145857249, "loss": 0.2583, "step": 3520 }, { "epoch": 55.15625, "grad_norm": 1.9021109342575073, "learning_rate": 0.00011835224466610366, "loss": 0.2374, "step": 3530 }, { "epoch": 55.3125, "grad_norm": 2.4008102416992188, "learning_rate": 0.00011796607485931928, "loss": 0.2435, "step": 3540 }, { "epoch": 55.46875, "grad_norm": 2.771571397781372, "learning_rate": 0.00011757962799343547, "loss": 0.2487, "step": 3550 }, { "epoch": 55.625, "grad_norm": 1.8457822799682617, "learning_rate": 0.00011719291002794096, "loss": 0.2355, "step": 3560 }, { "epoch": 55.78125, "grad_norm": 1.9695429801940918, "learning_rate": 0.0001168059269265052, "loss": 0.2493, "step": 3570 }, { "epoch": 55.9375, "grad_norm": 2.004483461380005, "learning_rate": 0.0001164186846568863, "loss": 0.2493, "step": 3580 }, { "epoch": 56.09375, "grad_norm": 1.6778466701507568, "learning_rate": 0.00011603118919083913, "loss": 0.2328, "step": 3590 }, { "epoch": 56.25, "grad_norm": 1.9205918312072754, "learning_rate": 0.0001156434465040231, "loss": 0.2421, "step": 3600 }, { "epoch": 56.40625, "grad_norm": 2.071448802947998, "learning_rate": 0.0001152554625759101, "loss": 0.2392, "step": 3610 }, { "epoch": 56.5625, "grad_norm": 1.8244534730911255, "learning_rate": 0.00011486724338969232, "loss": 0.2348, "step": 3620 }, { "epoch": 56.71875, "grad_norm": 2.3694911003112793, "learning_rate": 0.00011447879493218979, "loss": 0.2471, "step": 3630 }, { "epoch": 56.875, "grad_norm": 1.7422678470611572, "learning_rate": 0.00011409012319375827, "loss": 0.244, "step": 3640 }, { "epoch": 57.03125, "grad_norm": 1.80690336227417, "learning_rate": 0.00011370123416819682, "loss": 0.2417, "step": 3650 }, { "epoch": 57.1875, "grad_norm": 1.9183917045593262, "learning_rate": 0.00011331213385265524, "loss": 0.2347, "step": 3660 }, { "epoch": 57.34375, "grad_norm": 2.4743127822875977, "learning_rate": 0.00011292282824754177, "loss": 0.2265, "step": 3670 }, { "epoch": 57.5, "grad_norm": 2.031683921813965, "learning_rate": 0.00011253332335643043, "loss": 0.2284, "step": 3680 }, { "epoch": 57.65625, "grad_norm": 2.3016927242279053, "learning_rate": 0.00011214362518596848, "loss": 0.2426, "step": 3690 }, { "epoch": 57.8125, "grad_norm": 2.0599136352539062, "learning_rate": 0.00011175373974578378, "loss": 0.2408, "step": 3700 }, { "epoch": 57.96875, "grad_norm": 1.9112536907196045, "learning_rate": 0.00011136367304839212, "loss": 0.2428, "step": 3710 }, { "epoch": 58.125, "grad_norm": 1.677746057510376, "learning_rate": 0.00011097343110910452, "loss": 0.2246, "step": 3720 }, { "epoch": 58.28125, "grad_norm": 2.0791244506835938, "learning_rate": 0.00011058301994593448, "loss": 0.2356, "step": 3730 }, { "epoch": 58.4375, "grad_norm": 1.9308363199234009, "learning_rate": 0.000110192445579505, "loss": 0.2258, "step": 3740 }, { "epoch": 58.59375, "grad_norm": 1.9701329469680786, "learning_rate": 0.0001098017140329561, "loss": 0.2416, "step": 3750 }, { "epoch": 58.75, "grad_norm": 1.7241312265396118, "learning_rate": 0.00010941083133185146, "loss": 0.2317, "step": 3760 }, { "epoch": 58.90625, "grad_norm": 1.8373721837997437, "learning_rate": 0.00010901980350408592, "loss": 0.2345, "step": 3770 }, { "epoch": 59.0625, "grad_norm": 2.176316022872925, "learning_rate": 0.00010862863657979237, "loss": 0.2296, "step": 3780 }, { "epoch": 59.21875, "grad_norm": 1.9734270572662354, "learning_rate": 0.00010823733659124857, "loss": 0.2241, "step": 3790 }, { "epoch": 59.375, "grad_norm": 2.1557183265686035, "learning_rate": 0.0001078459095727845, "loss": 0.2315, "step": 3800 }, { "epoch": 59.53125, "grad_norm": 2.2887086868286133, "learning_rate": 0.00010745436156068897, "loss": 0.2287, "step": 3810 }, { "epoch": 59.6875, "grad_norm": 1.7714214324951172, "learning_rate": 0.00010706269859311669, "loss": 0.2373, "step": 3820 }, { "epoch": 59.84375, "grad_norm": 1.8595536947250366, "learning_rate": 0.00010667092670999513, "loss": 0.2343, "step": 3830 }, { "epoch": 60.0, "grad_norm": 1.7251347303390503, "learning_rate": 0.00010627905195293135, "loss": 0.2254, "step": 3840 }, { "epoch": 60.15625, "grad_norm": 2.3099634647369385, "learning_rate": 0.0001058870803651189, "loss": 0.2238, "step": 3850 }, { "epoch": 60.3125, "grad_norm": 2.2596757411956787, "learning_rate": 0.0001054950179912446, "loss": 0.2289, "step": 3860 }, { "epoch": 60.46875, "grad_norm": 1.6570988893508911, "learning_rate": 0.00010510287087739517, "loss": 0.2282, "step": 3870 }, { "epoch": 60.625, "grad_norm": 2.195847749710083, "learning_rate": 0.00010471064507096426, "loss": 0.2286, "step": 3880 }, { "epoch": 60.78125, "grad_norm": 2.108572006225586, "learning_rate": 0.00010431834662055904, "loss": 0.2246, "step": 3890 }, { "epoch": 60.9375, "grad_norm": 1.8193440437316895, "learning_rate": 0.00010392598157590688, "loss": 0.2255, "step": 3900 }, { "epoch": 61.09375, "grad_norm": 1.710485577583313, "learning_rate": 0.00010353355598776214, "loss": 0.222, "step": 3910 }, { "epoch": 61.25, "grad_norm": 2.229994773864746, "learning_rate": 0.00010314107590781284, "loss": 0.2205, "step": 3920 }, { "epoch": 61.40625, "grad_norm": 1.7697498798370361, "learning_rate": 0.00010274854738858735, "loss": 0.2208, "step": 3930 }, { "epoch": 61.5625, "grad_norm": 2.168346405029297, "learning_rate": 0.00010235597648336104, "loss": 0.2235, "step": 3940 }, { "epoch": 61.71875, "grad_norm": 1.8804610967636108, "learning_rate": 0.00010196336924606283, "loss": 0.2284, "step": 3950 }, { "epoch": 61.875, "grad_norm": 1.934240698814392, "learning_rate": 0.00010157073173118208, "loss": 0.2255, "step": 3960 }, { "epoch": 62.03125, "grad_norm": 1.456990361213684, "learning_rate": 0.00010117806999367492, "loss": 0.2226, "step": 3970 }, { "epoch": 62.1875, "grad_norm": 1.7687050104141235, "learning_rate": 0.00010078539008887114, "loss": 0.2146, "step": 3980 }, { "epoch": 62.34375, "grad_norm": 1.7442452907562256, "learning_rate": 0.0001003926980723806, "loss": 0.2209, "step": 3990 }, { "epoch": 62.5, "grad_norm": 1.596010684967041, "learning_rate": 0.0001, "loss": 0.2199, "step": 4000 }, { "epoch": 62.65625, "grad_norm": 1.7170418500900269, "learning_rate": 9.960730192761942e-05, "loss": 0.2191, "step": 4010 }, { "epoch": 62.8125, "grad_norm": 1.9422141313552856, "learning_rate": 9.921460991112891e-05, "loss": 0.2157, "step": 4020 }, { "epoch": 62.96875, "grad_norm": 1.6883125305175781, "learning_rate": 9.882193000632506e-05, "loss": 0.2239, "step": 4030 }, { "epoch": 63.125, "grad_norm": 1.9851441383361816, "learning_rate": 9.842926826881796e-05, "loss": 0.2146, "step": 4040 }, { "epoch": 63.28125, "grad_norm": 1.519945740699768, "learning_rate": 9.807589324283746e-05, "loss": 0.217, "step": 4050 }, { "epoch": 63.4375, "grad_norm": 1.9202287197113037, "learning_rate": 9.768328270532056e-05, "loss": 0.2126, "step": 4060 }, { "epoch": 63.59375, "grad_norm": 2.1956114768981934, "learning_rate": 9.729070789445026e-05, "loss": 0.2223, "step": 4070 }, { "epoch": 63.75, "grad_norm": 1.8512564897537231, "learning_rate": 9.68981748642158e-05, "loss": 0.2162, "step": 4080 }, { "epoch": 63.90625, "grad_norm": 2.0431482791900635, "learning_rate": 9.650568966796203e-05, "loss": 0.2195, "step": 4090 }, { "epoch": 64.0625, "grad_norm": 1.8963639736175537, "learning_rate": 9.61132583582963e-05, "loss": 0.2226, "step": 4100 }, { "epoch": 64.21875, "grad_norm": 1.5588335990905762, "learning_rate": 9.572088698699478e-05, "loss": 0.2116, "step": 4110 }, { "epoch": 64.375, "grad_norm": 1.9125704765319824, "learning_rate": 9.532858160490945e-05, "loss": 0.2126, "step": 4120 }, { "epoch": 64.53125, "grad_norm": 1.8598657846450806, "learning_rate": 9.493634826187461e-05, "loss": 0.214, "step": 4130 }, { "epoch": 64.6875, "grad_norm": 1.7466522455215454, "learning_rate": 9.454419300661355e-05, "loss": 0.221, "step": 4140 }, { "epoch": 64.84375, "grad_norm": 1.8226146697998047, "learning_rate": 9.415212188664553e-05, "loss": 0.2161, "step": 4150 }, { "epoch": 65.0, "grad_norm": 1.7260197401046753, "learning_rate": 9.376014094819217e-05, "loss": 0.2147, "step": 4160 }, { "epoch": 65.15625, "grad_norm": 1.7944152355194092, "learning_rate": 9.336825623608448e-05, "loss": 0.2085, "step": 4170 }, { "epoch": 65.3125, "grad_norm": 2.3139121532440186, "learning_rate": 9.297647379366944e-05, "loss": 0.2112, "step": 4180 }, { "epoch": 65.46875, "grad_norm": 1.478004813194275, "learning_rate": 9.258479966271703e-05, "loss": 0.2128, "step": 4190 }, { "epoch": 65.625, "grad_norm": 1.8647255897521973, "learning_rate": 9.219323988332688e-05, "loss": 0.2055, "step": 4200 }, { "epoch": 65.78125, "grad_norm": 1.883954405784607, "learning_rate": 9.180180049383518e-05, "loss": 0.2131, "step": 4210 }, { "epoch": 65.9375, "grad_norm": 1.5575131177902222, "learning_rate": 9.141048753072144e-05, "loss": 0.2177, "step": 4220 }, { "epoch": 66.09375, "grad_norm": 2.0962812900543213, "learning_rate": 9.101930702851577e-05, "loss": 0.2128, "step": 4230 }, { "epoch": 66.25, "grad_norm": 1.675104022026062, "learning_rate": 9.062826501970535e-05, "loss": 0.2096, "step": 4240 }, { "epoch": 66.40625, "grad_norm": 1.7287991046905518, "learning_rate": 9.023736753464172e-05, "loss": 0.2074, "step": 4250 }, { "epoch": 66.5625, "grad_norm": 1.6228628158569336, "learning_rate": 8.984662060144763e-05, "loss": 0.2094, "step": 4260 }, { "epoch": 66.71875, "grad_norm": 2.205287218093872, "learning_rate": 8.94560302459242e-05, "loss": 0.2129, "step": 4270 }, { "epoch": 66.875, "grad_norm": 1.7029268741607666, "learning_rate": 8.90656024914579e-05, "loss": 0.2164, "step": 4280 }, { "epoch": 67.03125, "grad_norm": 1.6045933961868286, "learning_rate": 8.86753433589277e-05, "loss": 0.2124, "step": 4290 }, { "epoch": 67.1875, "grad_norm": 2.085911273956299, "learning_rate": 8.82852588666121e-05, "loss": 0.2074, "step": 4300 }, { "epoch": 67.34375, "grad_norm": 2.3079230785369873, "learning_rate": 8.789535503009667e-05, "loss": 0.2066, "step": 4310 }, { "epoch": 67.5, "grad_norm": 2.0734775066375732, "learning_rate": 8.750563786218081e-05, "loss": 0.2086, "step": 4320 }, { "epoch": 67.65625, "grad_norm": 2.025387763977051, "learning_rate": 8.711611337278537e-05, "loss": 0.207, "step": 4330 }, { "epoch": 67.8125, "grad_norm": 1.9657328128814697, "learning_rate": 8.672678756885984e-05, "loss": 0.2118, "step": 4340 }, { "epoch": 67.96875, "grad_norm": 2.05079984664917, "learning_rate": 8.633766645428974e-05, "loss": 0.2097, "step": 4350 }, { "epoch": 68.125, "grad_norm": 1.5424699783325195, "learning_rate": 8.594875602980407e-05, "loss": 0.2054, "step": 4360 }, { "epoch": 68.28125, "grad_norm": 1.584788203239441, "learning_rate": 8.55600622928826e-05, "loss": 0.2083, "step": 4370 }, { "epoch": 68.4375, "grad_norm": 1.837462067604065, "learning_rate": 8.517159123766363e-05, "loss": 0.2065, "step": 4380 }, { "epoch": 68.59375, "grad_norm": 2.0221145153045654, "learning_rate": 8.478334885485137e-05, "loss": 0.202, "step": 4390 }, { "epoch": 68.75, "grad_norm": 1.71314537525177, "learning_rate": 8.439534113162368e-05, "loss": 0.2086, "step": 4400 }, { "epoch": 68.90625, "grad_norm": 1.6273434162139893, "learning_rate": 8.40075740515396e-05, "loss": 0.2107, "step": 4410 }, { "epoch": 69.0625, "grad_norm": 1.656145691871643, "learning_rate": 8.362005359444727e-05, "loss": 0.2023, "step": 4420 }, { "epoch": 69.21875, "grad_norm": 1.59807550907135, "learning_rate": 8.323278573639139e-05, "loss": 0.2037, "step": 4430 }, { "epoch": 69.375, "grad_norm": 1.8091068267822266, "learning_rate": 8.284577644952155e-05, "loss": 0.2046, "step": 4440 }, { "epoch": 69.53125, "grad_norm": 1.5284535884857178, "learning_rate": 8.245903170199962e-05, "loss": 0.2066, "step": 4450 }, { "epoch": 69.6875, "grad_norm": 1.7114146947860718, "learning_rate": 8.207255745790805e-05, "loss": 0.2002, "step": 4460 }, { "epoch": 69.84375, "grad_norm": 1.703376293182373, "learning_rate": 8.168635967715776e-05, "loss": 0.2054, "step": 4470 }, { "epoch": 70.0, "grad_norm": 1.896020531654358, "learning_rate": 8.130044431539634e-05, "loss": 0.2059, "step": 4480 }, { "epoch": 70.15625, "grad_norm": 1.8438818454742432, "learning_rate": 8.091481732391599e-05, "loss": 0.1955, "step": 4490 }, { "epoch": 70.3125, "grad_norm": 1.7322238683700562, "learning_rate": 8.052948464956205e-05, "loss": 0.2014, "step": 4500 }, { "epoch": 70.46875, "grad_norm": 1.8049304485321045, "learning_rate": 8.01444522346409e-05, "loss": 0.1978, "step": 4510 }, { "epoch": 70.625, "grad_norm": 1.759329080581665, "learning_rate": 7.975972601682884e-05, "loss": 0.2057, "step": 4520 }, { "epoch": 70.78125, "grad_norm": 1.6832956075668335, "learning_rate": 7.937531192908e-05, "loss": 0.2038, "step": 4530 }, { "epoch": 70.9375, "grad_norm": 1.7098455429077148, "learning_rate": 7.899121589953515e-05, "loss": 0.2077, "step": 4540 }, { "epoch": 71.09375, "grad_norm": 1.477565050125122, "learning_rate": 7.860744385143022e-05, "loss": 0.1924, "step": 4550 }, { "epoch": 71.25, "grad_norm": 1.5800806283950806, "learning_rate": 7.822400170300497e-05, "loss": 0.1943, "step": 4560 }, { "epoch": 71.40625, "grad_norm": 1.497495412826538, "learning_rate": 7.784089536741164e-05, "loss": 0.1944, "step": 4570 }, { "epoch": 71.5625, "grad_norm": 1.9330024719238281, "learning_rate": 7.745813075262389e-05, "loss": 0.2039, "step": 4580 }, { "epoch": 71.71875, "grad_norm": 2.2339158058166504, "learning_rate": 7.707571376134548e-05, "loss": 0.2052, "step": 4590 }, { "epoch": 71.875, "grad_norm": 1.9479732513427734, "learning_rate": 7.669365029091964e-05, "loss": 0.2103, "step": 4600 }, { "epoch": 72.03125, "grad_norm": 1.2796049118041992, "learning_rate": 7.63119462332376e-05, "loss": 0.2049, "step": 4610 }, { "epoch": 72.1875, "grad_norm": 1.4597240686416626, "learning_rate": 7.593060747464818e-05, "loss": 0.1842, "step": 4620 }, { "epoch": 72.34375, "grad_norm": 1.857862114906311, "learning_rate": 7.554963989586675e-05, "loss": 0.193, "step": 4630 }, { "epoch": 72.5, "grad_norm": 1.5953953266143799, "learning_rate": 7.51690493718846e-05, "loss": 0.2033, "step": 4640 }, { "epoch": 72.65625, "grad_norm": 1.7451122999191284, "learning_rate": 7.478884177187855e-05, "loss": 0.2082, "step": 4650 }, { "epoch": 72.8125, "grad_norm": 1.7878004312515259, "learning_rate": 7.440902295912005e-05, "loss": 0.2025, "step": 4660 }, { "epoch": 72.96875, "grad_norm": 1.911655306816101, "learning_rate": 7.402959879088505e-05, "loss": 0.2051, "step": 4670 }, { "epoch": 73.125, "grad_norm": 1.783508062362671, "learning_rate": 7.365057511836359e-05, "loss": 0.1968, "step": 4680 }, { "epoch": 73.28125, "grad_norm": 1.643697738647461, "learning_rate": 7.327195778656962e-05, "loss": 0.1925, "step": 4690 }, { "epoch": 73.4375, "grad_norm": 1.5406553745269775, "learning_rate": 7.289375263425073e-05, "loss": 0.1969, "step": 4700 }, { "epoch": 73.59375, "grad_norm": 1.8887746334075928, "learning_rate": 7.251596549379827e-05, "loss": 0.1971, "step": 4710 }, { "epoch": 73.75, "grad_norm": 1.6867214441299438, "learning_rate": 7.21386021911572e-05, "loss": 0.197, "step": 4720 }, { "epoch": 73.90625, "grad_norm": 1.8071234226226807, "learning_rate": 7.176166854573659e-05, "loss": 0.1978, "step": 4730 }, { "epoch": 74.0625, "grad_norm": 1.795962929725647, "learning_rate": 7.138517037031944e-05, "loss": 0.2011, "step": 4740 }, { "epoch": 74.21875, "grad_norm": 1.666642427444458, "learning_rate": 7.100911347097343e-05, "loss": 0.1908, "step": 4750 }, { "epoch": 74.375, "grad_norm": 1.6308237314224243, "learning_rate": 7.06335036469611e-05, "loss": 0.1929, "step": 4760 }, { "epoch": 74.53125, "grad_norm": 1.6244676113128662, "learning_rate": 7.025834669065064e-05, "loss": 0.1959, "step": 4770 }, { "epoch": 74.6875, "grad_norm": 1.6163049936294556, "learning_rate": 6.988364838742639e-05, "loss": 0.1989, "step": 4780 }, { "epoch": 74.84375, "grad_norm": 1.8937861919403076, "learning_rate": 6.950941451559972e-05, "loss": 0.1916, "step": 4790 }, { "epoch": 75.0, "grad_norm": 1.4619686603546143, "learning_rate": 6.913565084631979e-05, "loss": 0.2031, "step": 4800 }, { "epoch": 75.15625, "grad_norm": 1.46817147731781, "learning_rate": 6.87623631434849e-05, "loss": 0.1837, "step": 4810 }, { "epoch": 75.3125, "grad_norm": 1.4131444692611694, "learning_rate": 6.838955716365309e-05, "loss": 0.1875, "step": 4820 }, { "epoch": 75.46875, "grad_norm": 1.7622783184051514, "learning_rate": 6.801723865595381e-05, "loss": 0.1971, "step": 4830 }, { "epoch": 75.625, "grad_norm": 1.4485961198806763, "learning_rate": 6.764541336199899e-05, "loss": 0.1921, "step": 4840 }, { "epoch": 75.78125, "grad_norm": 1.8549201488494873, "learning_rate": 6.727408701579467e-05, "loss": 0.2013, "step": 4850 }, { "epoch": 75.9375, "grad_norm": 1.809103012084961, "learning_rate": 6.690326534365248e-05, "loss": 0.1975, "step": 4860 }, { "epoch": 76.09375, "grad_norm": 1.4220165014266968, "learning_rate": 6.653295406410126e-05, "loss": 0.1908, "step": 4870 }, { "epoch": 76.25, "grad_norm": 1.5359421968460083, "learning_rate": 6.616315888779907e-05, "loss": 0.1903, "step": 4880 }, { "epoch": 76.40625, "grad_norm": 2.1324145793914795, "learning_rate": 6.579388551744499e-05, "loss": 0.1874, "step": 4890 }, { "epoch": 76.5625, "grad_norm": 1.7252235412597656, "learning_rate": 6.542513964769122e-05, "loss": 0.1929, "step": 4900 }, { "epoch": 76.71875, "grad_norm": 1.6541646718978882, "learning_rate": 6.50569269650552e-05, "loss": 0.1902, "step": 4910 }, { "epoch": 76.875, "grad_norm": 1.481459379196167, "learning_rate": 6.468925314783205e-05, "loss": 0.1958, "step": 4920 }, { "epoch": 77.03125, "grad_norm": 1.6920355558395386, "learning_rate": 6.432212386600676e-05, "loss": 0.1952, "step": 4930 }, { "epoch": 77.1875, "grad_norm": 1.567058801651001, "learning_rate": 6.395554478116717e-05, "loss": 0.1858, "step": 4940 }, { "epoch": 77.34375, "grad_norm": 2.0057098865509033, "learning_rate": 6.358952154641611e-05, "loss": 0.1886, "step": 4950 }, { "epoch": 77.5, "grad_norm": 2.1384148597717285, "learning_rate": 6.322405980628472e-05, "loss": 0.1943, "step": 4960 }, { "epoch": 77.65625, "grad_norm": 1.9071811437606812, "learning_rate": 6.28591651966451e-05, "loss": 0.1923, "step": 4970 }, { "epoch": 77.8125, "grad_norm": 1.9044047594070435, "learning_rate": 6.249484334462356e-05, "loss": 0.1967, "step": 4980 }, { "epoch": 77.96875, "grad_norm": 1.7569500207901, "learning_rate": 6.213109986851376e-05, "loss": 0.1919, "step": 4990 }, { "epoch": 78.125, "grad_norm": 1.7204627990722656, "learning_rate": 6.176794037769003e-05, "loss": 0.1844, "step": 5000 }, { "epoch": 78.28125, "grad_norm": 1.8979413509368896, "learning_rate": 6.140537047252092e-05, "loss": 0.1893, "step": 5010 }, { "epoch": 78.4375, "grad_norm": 1.5315955877304077, "learning_rate": 6.104339574428301e-05, "loss": 0.1856, "step": 5020 }, { "epoch": 78.59375, "grad_norm": 1.5335980653762817, "learning_rate": 6.0682021775074324e-05, "loss": 0.1939, "step": 5030 }, { "epoch": 78.75, "grad_norm": 1.9664844274520874, "learning_rate": 6.032125413772851e-05, "loss": 0.1925, "step": 5040 }, { "epoch": 78.90625, "grad_norm": 1.4986895322799683, "learning_rate": 5.996109839572881e-05, "loss": 0.193, "step": 5050 }, { "epoch": 79.0625, "grad_norm": 1.6273597478866577, "learning_rate": 5.960156010312236e-05, "loss": 0.1845, "step": 5060 }, { "epoch": 79.21875, "grad_norm": 1.3411659002304077, "learning_rate": 5.9242644804434354e-05, "loss": 0.182, "step": 5070 }, { "epoch": 79.375, "grad_norm": 1.550490379333496, "learning_rate": 5.888435803458272e-05, "loss": 0.1838, "step": 5080 }, { "epoch": 79.53125, "grad_norm": 1.4418952465057373, "learning_rate": 5.852670531879261e-05, "loss": 0.1913, "step": 5090 }, { "epoch": 79.6875, "grad_norm": 1.4645665884017944, "learning_rate": 5.816969217251144e-05, "loss": 0.1867, "step": 5100 }, { "epoch": 79.84375, "grad_norm": 1.539569616317749, "learning_rate": 5.781332410132352e-05, "loss": 0.1927, "step": 5110 }, { "epoch": 80.0, "grad_norm": 1.5742814540863037, "learning_rate": 5.745760660086532e-05, "loss": 0.1917, "step": 5120 }, { "epoch": 80.15625, "grad_norm": 1.3147929906845093, "learning_rate": 5.7102545156740805e-05, "loss": 0.181, "step": 5130 }, { "epoch": 80.3125, "grad_norm": 1.8531893491744995, "learning_rate": 5.6748145244436615e-05, "loss": 0.183, "step": 5140 }, { "epoch": 80.46875, "grad_norm": 1.2582364082336426, "learning_rate": 5.639441232923785e-05, "loss": 0.1834, "step": 5150 }, { "epoch": 80.625, "grad_norm": 1.6824560165405273, "learning_rate": 5.604135186614368e-05, "loss": 0.1859, "step": 5160 }, { "epoch": 80.78125, "grad_norm": 1.5116562843322754, "learning_rate": 5.568896929978316e-05, "loss": 0.1908, "step": 5170 }, { "epoch": 80.9375, "grad_norm": 1.6242319345474243, "learning_rate": 5.5337270064331306e-05, "loss": 0.1934, "step": 5180 }, { "epoch": 81.09375, "grad_norm": 1.4753321409225464, "learning_rate": 5.498625958342556e-05, "loss": 0.1889, "step": 5190 }, { "epoch": 81.25, "grad_norm": 1.5960818529129028, "learning_rate": 5.463594327008168e-05, "loss": 0.1798, "step": 5200 }, { "epoch": 81.40625, "grad_norm": 1.2806315422058105, "learning_rate": 5.4286326526610555e-05, "loss": 0.1845, "step": 5210 }, { "epoch": 81.5625, "grad_norm": 1.4679008722305298, "learning_rate": 5.3937414744534874e-05, "loss": 0.1858, "step": 5220 }, { "epoch": 81.71875, "grad_norm": 1.5440151691436768, "learning_rate": 5.3589213304506014e-05, "loss": 0.1855, "step": 5230 }, { "epoch": 81.875, "grad_norm": 1.5244407653808594, "learning_rate": 5.324172757622081e-05, "loss": 0.1876, "step": 5240 }, { "epoch": 82.03125, "grad_norm": 1.654781460762024, "learning_rate": 5.289496291833917e-05, "loss": 0.1887, "step": 5250 }, { "epoch": 82.1875, "grad_norm": 1.3543646335601807, "learning_rate": 5.254892467840098e-05, "loss": 0.1803, "step": 5260 }, { "epoch": 82.34375, "grad_norm": 1.4349260330200195, "learning_rate": 5.220361819274407e-05, "loss": 0.1791, "step": 5270 }, { "epoch": 82.5, "grad_norm": 1.5351791381835938, "learning_rate": 5.18590487864215e-05, "loss": 0.1851, "step": 5280 }, { "epoch": 82.65625, "grad_norm": 1.2516671419143677, "learning_rate": 5.151522177311989e-05, "loss": 0.1861, "step": 5290 }, { "epoch": 82.8125, "grad_norm": 1.4409736394882202, "learning_rate": 5.1172142455077e-05, "loss": 0.1861, "step": 5300 }, { "epoch": 82.96875, "grad_norm": 1.476853847503662, "learning_rate": 5.0829816123000395e-05, "loss": 0.1862, "step": 5310 }, { "epoch": 83.125, "grad_norm": 1.789379596710205, "learning_rate": 5.048824805598562e-05, "loss": 0.1841, "step": 5320 }, { "epoch": 83.28125, "grad_norm": 1.3781448602676392, "learning_rate": 5.014744352143477e-05, "loss": 0.1853, "step": 5330 }, { "epoch": 83.4375, "grad_norm": 1.5546520948410034, "learning_rate": 4.980740777497529e-05, "loss": 0.1787, "step": 5340 }, { "epoch": 83.59375, "grad_norm": 1.8084760904312134, "learning_rate": 4.946814606037917e-05, "loss": 0.1791, "step": 5350 }, { "epoch": 83.75, "grad_norm": 1.777557134628296, "learning_rate": 4.9129663609481694e-05, "loss": 0.1819, "step": 5360 }, { "epoch": 83.90625, "grad_norm": 1.4378726482391357, "learning_rate": 4.87919656421009e-05, "loss": 0.1872, "step": 5370 }, { "epoch": 84.0625, "grad_norm": 1.484693169593811, "learning_rate": 4.845505736595729e-05, "loss": 0.1887, "step": 5380 }, { "epoch": 84.21875, "grad_norm": 1.4815410375595093, "learning_rate": 4.811894397659319e-05, "loss": 0.1784, "step": 5390 }, { "epoch": 84.375, "grad_norm": 1.424680233001709, "learning_rate": 4.778363065729292e-05, "loss": 0.1821, "step": 5400 }, { "epoch": 84.53125, "grad_norm": 1.4403337240219116, "learning_rate": 4.7449122579002605e-05, "loss": 0.1839, "step": 5410 }, { "epoch": 84.6875, "grad_norm": 1.7450019121170044, "learning_rate": 4.711542490025067e-05, "loss": 0.1843, "step": 5420 }, { "epoch": 84.84375, "grad_norm": 1.3429001569747925, "learning_rate": 4.678254276706807e-05, "loss": 0.181, "step": 5430 }, { "epoch": 85.0, "grad_norm": 1.5108141899108887, "learning_rate": 4.6450481312909136e-05, "loss": 0.1815, "step": 5440 }, { "epoch": 85.15625, "grad_norm": 1.2566031217575073, "learning_rate": 4.6119245658572284e-05, "loss": 0.1772, "step": 5450 }, { "epoch": 85.3125, "grad_norm": 1.5348423719406128, "learning_rate": 4.578884091212103e-05, "loss": 0.1774, "step": 5460 }, { "epoch": 85.46875, "grad_norm": 1.8427592515945435, "learning_rate": 4.545927216880525e-05, "loss": 0.1815, "step": 5470 }, { "epoch": 85.625, "grad_norm": 1.6190025806427002, "learning_rate": 4.513054451098276e-05, "loss": 0.1862, "step": 5480 }, { "epoch": 85.78125, "grad_norm": 1.5374351739883423, "learning_rate": 4.4802663008040676e-05, "loss": 0.1798, "step": 5490 }, { "epoch": 85.9375, "grad_norm": 1.5920672416687012, "learning_rate": 4.4475632716317304e-05, "loss": 0.1816, "step": 5500 }, { "epoch": 86.09375, "grad_norm": 1.5544840097427368, "learning_rate": 4.414945867902437e-05, "loss": 0.176, "step": 5510 }, { "epoch": 86.25, "grad_norm": 1.6388013362884521, "learning_rate": 4.3824145926169036e-05, "loss": 0.1752, "step": 5520 }, { "epoch": 86.40625, "grad_norm": 1.461263656616211, "learning_rate": 4.349969947447632e-05, "loss": 0.1797, "step": 5530 }, { "epoch": 86.5625, "grad_norm": 1.5496662855148315, "learning_rate": 4.317612432731193e-05, "loss": 0.1817, "step": 5540 }, { "epoch": 86.71875, "grad_norm": 1.4612733125686646, "learning_rate": 4.2853425474604846e-05, "loss": 0.178, "step": 5550 }, { "epoch": 86.875, "grad_norm": 1.3939430713653564, "learning_rate": 4.253160789277066e-05, "loss": 0.1816, "step": 5560 }, { "epoch": 87.03125, "grad_norm": 1.5148475170135498, "learning_rate": 4.221067654463452e-05, "loss": 0.1889, "step": 5570 }, { "epoch": 87.1875, "grad_norm": 1.7056833505630493, "learning_rate": 4.189063637935487e-05, "loss": 0.1691, "step": 5580 }, { "epoch": 87.34375, "grad_norm": 1.3451744318008423, "learning_rate": 4.157149233234693e-05, "loss": 0.181, "step": 5590 }, { "epoch": 87.5, "grad_norm": 1.4934422969818115, "learning_rate": 4.1253249325206713e-05, "loss": 0.1784, "step": 5600 }, { "epoch": 87.65625, "grad_norm": 1.5852150917053223, "learning_rate": 4.0935912265635124e-05, "loss": 0.1848, "step": 5610 }, { "epoch": 87.8125, "grad_norm": 1.593066930770874, "learning_rate": 4.061948604736214e-05, "loss": 0.1769, "step": 5620 }, { "epoch": 87.96875, "grad_norm": 1.4550515413284302, "learning_rate": 4.0303975550071474e-05, "loss": 0.1803, "step": 5630 }, { "epoch": 88.125, "grad_norm": 1.4054372310638428, "learning_rate": 3.99893856393253e-05, "loss": 0.1781, "step": 5640 }, { "epoch": 88.28125, "grad_norm": 1.4826765060424805, "learning_rate": 3.967572116648927e-05, "loss": 0.1786, "step": 5650 }, { "epoch": 88.4375, "grad_norm": 1.4517465829849243, "learning_rate": 3.936298696865749e-05, "loss": 0.1795, "step": 5660 }, { "epoch": 88.59375, "grad_norm": 2.170179843902588, "learning_rate": 3.905118786857824e-05, "loss": 0.1755, "step": 5670 }, { "epoch": 88.75, "grad_norm": 1.5206856727600098, "learning_rate": 3.87403286745793e-05, "loss": 0.1725, "step": 5680 }, { "epoch": 88.90625, "grad_norm": 1.4970860481262207, "learning_rate": 3.843041418049402e-05, "loss": 0.1818, "step": 5690 }, { "epoch": 89.0625, "grad_norm": 1.3487545251846313, "learning_rate": 3.8121449165587255e-05, "loss": 0.1792, "step": 5700 }, { "epoch": 89.21875, "grad_norm": 1.6100648641586304, "learning_rate": 3.7813438394481794e-05, "loss": 0.1764, "step": 5710 }, { "epoch": 89.375, "grad_norm": 1.2975910902023315, "learning_rate": 3.750638661708469e-05, "loss": 0.1744, "step": 5720 }, { "epoch": 89.53125, "grad_norm": 1.2627249956130981, "learning_rate": 3.7200298568514214e-05, "loss": 0.1763, "step": 5730 }, { "epoch": 89.6875, "grad_norm": 1.3773270845413208, "learning_rate": 3.689517896902678e-05, "loss": 0.1803, "step": 5740 }, { "epoch": 89.84375, "grad_norm": 1.8265244960784912, "learning_rate": 3.659103252394401e-05, "loss": 0.1772, "step": 5750 }, { "epoch": 90.0, "grad_norm": 1.4150211811065674, "learning_rate": 3.628786392358028e-05, "loss": 0.1778, "step": 5760 }, { "epoch": 90.15625, "grad_norm": 1.5570526123046875, "learning_rate": 3.598567784317056e-05, "loss": 0.173, "step": 5770 }, { "epoch": 90.3125, "grad_norm": 1.424486756324768, "learning_rate": 3.5684478942797984e-05, "loss": 0.179, "step": 5780 }, { "epoch": 90.46875, "grad_norm": 1.4627338647842407, "learning_rate": 3.538427186732216e-05, "loss": 0.1776, "step": 5790 }, { "epoch": 90.625, "grad_norm": 1.7183500528335571, "learning_rate": 3.508506124630759e-05, "loss": 0.1795, "step": 5800 }, { "epoch": 90.78125, "grad_norm": 1.4211575984954834, "learning_rate": 3.478685169395224e-05, "loss": 0.1769, "step": 5810 }, { "epoch": 90.9375, "grad_norm": 1.5053857564926147, "learning_rate": 3.448964780901622e-05, "loss": 0.1751, "step": 5820 }, { "epoch": 91.09375, "grad_norm": 1.6190037727355957, "learning_rate": 3.4193454174751196e-05, "loss": 0.1752, "step": 5830 }, { "epoch": 91.25, "grad_norm": 1.623235821723938, "learning_rate": 3.389827535882931e-05, "loss": 0.17, "step": 5840 }, { "epoch": 91.40625, "grad_norm": 1.6231571435928345, "learning_rate": 3.3604115913273146e-05, "loss": 0.1808, "step": 5850 }, { "epoch": 91.5625, "grad_norm": 1.3666538000106812, "learning_rate": 3.331098037438518e-05, "loss": 0.1769, "step": 5860 }, { "epoch": 91.71875, "grad_norm": 1.6779783964157104, "learning_rate": 3.301887326267812e-05, "loss": 0.1757, "step": 5870 }, { "epoch": 91.875, "grad_norm": 1.3442100286483765, "learning_rate": 3.272779908280493e-05, "loss": 0.1735, "step": 5880 }, { "epoch": 92.03125, "grad_norm": 1.3602097034454346, "learning_rate": 3.2437762323489485e-05, "loss": 0.1739, "step": 5890 }, { "epoch": 92.1875, "grad_norm": 1.383625864982605, "learning_rate": 3.214876745745754e-05, "loss": 0.1732, "step": 5900 }, { "epoch": 92.34375, "grad_norm": 1.433128833770752, "learning_rate": 3.1860818941367367e-05, "loss": 0.1716, "step": 5910 }, { "epoch": 92.5, "grad_norm": 1.3733683824539185, "learning_rate": 3.1573921215741254e-05, "loss": 0.1743, "step": 5920 }, { "epoch": 92.65625, "grad_norm": 1.2076114416122437, "learning_rate": 3.1288078704897074e-05, "loss": 0.1717, "step": 5930 }, { "epoch": 92.8125, "grad_norm": 1.4948092699050903, "learning_rate": 3.1003295816879985e-05, "loss": 0.1784, "step": 5940 }, { "epoch": 92.96875, "grad_norm": 1.3194231986999512, "learning_rate": 3.071957694339435e-05, "loss": 0.1767, "step": 5950 }, { "epoch": 93.125, "grad_norm": 1.4278184175491333, "learning_rate": 3.0436926459736227e-05, "loss": 0.171, "step": 5960 }, { "epoch": 93.28125, "grad_norm": 2.0391736030578613, "learning_rate": 3.0155348724725673e-05, "loss": 0.1763, "step": 5970 }, { "epoch": 93.4375, "grad_norm": 1.4166723489761353, "learning_rate": 2.987484808063976e-05, "loss": 0.1738, "step": 5980 }, { "epoch": 93.59375, "grad_norm": 1.4654409885406494, "learning_rate": 2.959542885314536e-05, "loss": 0.1715, "step": 5990 }, { "epoch": 93.75, "grad_norm": 1.3458061218261719, "learning_rate": 2.9317095351232648e-05, "loss": 0.1758, "step": 6000 }, { "epoch": 93.90625, "grad_norm": 1.6262662410736084, "learning_rate": 2.9039851867148505e-05, "loss": 0.1776, "step": 6010 }, { "epoch": 94.0625, "grad_norm": 1.4164749383926392, "learning_rate": 2.876370267633045e-05, "loss": 0.1693, "step": 6020 }, { "epoch": 94.21875, "grad_norm": 1.3470439910888672, "learning_rate": 2.8488652037340634e-05, "loss": 0.1681, "step": 6030 }, { "epoch": 94.375, "grad_norm": 1.382950782775879, "learning_rate": 2.8214704191800145e-05, "loss": 0.1719, "step": 6040 }, { "epoch": 94.53125, "grad_norm": 1.6302028894424438, "learning_rate": 2.7941863364323595e-05, "loss": 0.1738, "step": 6050 }, { "epoch": 94.6875, "grad_norm": 1.4697049856185913, "learning_rate": 2.7670133762454175e-05, "loss": 0.1753, "step": 6060 }, { "epoch": 94.84375, "grad_norm": 1.3718417882919312, "learning_rate": 2.739951957659843e-05, "loss": 0.1774, "step": 6070 }, { "epoch": 95.0, "grad_norm": 1.6622495651245117, "learning_rate": 2.715692393953073e-05, "loss": 0.1724, "step": 6080 }, { "epoch": 95.15625, "grad_norm": 1.3040485382080078, "learning_rate": 2.688844052701359e-05, "loss": 0.164, "step": 6090 }, { "epoch": 95.3125, "grad_norm": 1.5511151552200317, "learning_rate": 2.6621084585186828e-05, "loss": 0.1725, "step": 6100 }, { "epoch": 95.46875, "grad_norm": 1.6277883052825928, "learning_rate": 2.635486023700976e-05, "loss": 0.1736, "step": 6110 }, { "epoch": 95.625, "grad_norm": 1.9094816446304321, "learning_rate": 2.6089771587991206e-05, "loss": 0.1749, "step": 6120 }, { "epoch": 95.78125, "grad_norm": 1.1980408430099487, "learning_rate": 2.582582272612609e-05, "loss": 0.1786, "step": 6130 }, { "epoch": 95.9375, "grad_norm": 1.4505863189697266, "learning_rate": 2.556301772183233e-05, "loss": 0.1739, "step": 6140 }, { "epoch": 96.09375, "grad_norm": 1.252650260925293, "learning_rate": 2.5301360627888236e-05, "loss": 0.1676, "step": 6150 }, { "epoch": 96.25, "grad_norm": 1.5206024646759033, "learning_rate": 2.5040855479369853e-05, "loss": 0.1677, "step": 6160 }, { "epoch": 96.40625, "grad_norm": 1.4388213157653809, "learning_rate": 2.4781506293588873e-05, "loss": 0.1748, "step": 6170 }, { "epoch": 96.5625, "grad_norm": 1.395816683769226, "learning_rate": 2.4523317070030515e-05, "loss": 0.1763, "step": 6180 }, { "epoch": 96.71875, "grad_norm": 1.2511937618255615, "learning_rate": 2.426629179029206e-05, "loss": 0.1644, "step": 6190 }, { "epoch": 96.875, "grad_norm": 1.137650489807129, "learning_rate": 2.401043441802121e-05, "loss": 0.1748, "step": 6200 }, { "epoch": 97.03125, "grad_norm": 1.197763442993164, "learning_rate": 2.37557488988552e-05, "loss": 0.1788, "step": 6210 }, { "epoch": 97.1875, "grad_norm": 1.7034637928009033, "learning_rate": 2.350223916035983e-05, "loss": 0.1681, "step": 6220 }, { "epoch": 97.34375, "grad_norm": 1.2496026754379272, "learning_rate": 2.3249909111968815e-05, "loss": 0.1741, "step": 6230 }, { "epoch": 97.5, "grad_norm": 1.8080981969833374, "learning_rate": 2.2998762644923656e-05, "loss": 0.1697, "step": 6240 }, { "epoch": 97.65625, "grad_norm": 1.5005358457565308, "learning_rate": 2.2748803632213557e-05, "loss": 0.1709, "step": 6250 }, { "epoch": 97.8125, "grad_norm": 1.2584110498428345, "learning_rate": 2.250003592851575e-05, "loss": 0.1712, "step": 6260 }, { "epoch": 97.96875, "grad_norm": 1.4121007919311523, "learning_rate": 2.225246337013591e-05, "loss": 0.1725, "step": 6270 }, { "epoch": 98.125, "grad_norm": 1.5540642738342285, "learning_rate": 2.2006089774949203e-05, "loss": 0.1735, "step": 6280 }, { "epoch": 98.28125, "grad_norm": 1.2959460020065308, "learning_rate": 2.1760918942341192e-05, "loss": 0.1646, "step": 6290 }, { "epoch": 98.4375, "grad_norm": 1.627122402191162, "learning_rate": 2.151695465314946e-05, "loss": 0.1715, "step": 6300 }, { "epoch": 98.59375, "grad_norm": 1.5968302488327026, "learning_rate": 2.127420066960508e-05, "loss": 0.1754, "step": 6310 }, { "epoch": 98.75, "grad_norm": 1.3676623106002808, "learning_rate": 2.1032660735274858e-05, "loss": 0.1746, "step": 6320 }, { "epoch": 98.90625, "grad_norm": 1.2594794034957886, "learning_rate": 2.07923385750033e-05, "loss": 0.1695, "step": 6330 }, { "epoch": 99.0625, "grad_norm": 1.6086690425872803, "learning_rate": 2.0553237894855458e-05, "loss": 0.1728, "step": 6340 }, { "epoch": 99.21875, "grad_norm": 1.2404333353042603, "learning_rate": 2.031536238205961e-05, "loss": 0.1709, "step": 6350 }, { "epoch": 99.375, "grad_norm": 1.3804398775100708, "learning_rate": 2.0078715704950423e-05, "loss": 0.1739, "step": 6360 }, { "epoch": 99.53125, "grad_norm": 1.37823486328125, "learning_rate": 1.9843301512912327e-05, "loss": 0.1676, "step": 6370 }, { "epoch": 99.6875, "grad_norm": 1.1960835456848145, "learning_rate": 1.9609123436323508e-05, "loss": 0.1678, "step": 6380 }, { "epoch": 99.84375, "grad_norm": 1.3781687021255493, "learning_rate": 1.937618508649954e-05, "loss": 0.1673, "step": 6390 }, { "epoch": 100.0, "grad_norm": 1.2113115787506104, "learning_rate": 1.914449005563791e-05, "loss": 0.1747, "step": 6400 }, { "epoch": 100.15625, "grad_norm": 1.298387885093689, "learning_rate": 1.891404191676265e-05, "loss": 0.1637, "step": 6410 }, { "epoch": 100.3125, "grad_norm": 1.1972380876541138, "learning_rate": 1.868484422366914e-05, "loss": 0.1721, "step": 6420 }, { "epoch": 100.46875, "grad_norm": 2.0465595722198486, "learning_rate": 1.8456900510869334e-05, "loss": 0.1723, "step": 6430 }, { "epoch": 100.625, "grad_norm": 1.6506719589233398, "learning_rate": 1.823021429353724e-05, "loss": 0.1689, "step": 6440 }, { "epoch": 100.78125, "grad_norm": 1.431828498840332, "learning_rate": 1.8004789067454764e-05, "loss": 0.1688, "step": 6450 }, { "epoch": 100.9375, "grad_norm": 1.4944453239440918, "learning_rate": 1.7780628308957792e-05, "loss": 0.1753, "step": 6460 }, { "epoch": 101.09375, "grad_norm": 1.3024934530258179, "learning_rate": 1.7557735474882465e-05, "loss": 0.1659, "step": 6470 }, { "epoch": 101.25, "grad_norm": 1.1864546537399292, "learning_rate": 1.733611400251206e-05, "loss": 0.1686, "step": 6480 }, { "epoch": 101.40625, "grad_norm": 1.250319480895996, "learning_rate": 1.7115767309523812e-05, "loss": 0.1684, "step": 6490 }, { "epoch": 101.5625, "grad_norm": 1.1592826843261719, "learning_rate": 1.6896698793936316e-05, "loss": 0.1734, "step": 6500 }, { "epoch": 101.71875, "grad_norm": 1.490527629852295, "learning_rate": 1.6678911834057098e-05, "loss": 0.1719, "step": 6510 }, { "epoch": 101.875, "grad_norm": 1.3874304294586182, "learning_rate": 1.64624097884305e-05, "loss": 0.1703, "step": 6520 }, { "epoch": 102.03125, "grad_norm": 1.2031797170639038, "learning_rate": 1.6247195995785837e-05, "loss": 0.1707, "step": 6530 }, { "epoch": 102.1875, "grad_norm": 1.4145324230194092, "learning_rate": 1.6033273774986067e-05, "loss": 0.1727, "step": 6540 }, { "epoch": 102.34375, "grad_norm": 1.491633415222168, "learning_rate": 1.5820646424976482e-05, "loss": 0.167, "step": 6550 }, { "epoch": 102.5, "grad_norm": 1.3200361728668213, "learning_rate": 1.560931722473381e-05, "loss": 0.1665, "step": 6560 }, { "epoch": 102.65625, "grad_norm": 1.3420644998550415, "learning_rate": 1.539928943321579e-05, "loss": 0.1697, "step": 6570 }, { "epoch": 102.8125, "grad_norm": 1.4412531852722168, "learning_rate": 1.5190566289310747e-05, "loss": 0.1711, "step": 6580 }, { "epoch": 102.96875, "grad_norm": 1.22170090675354, "learning_rate": 1.4983151011787788e-05, "loss": 0.1681, "step": 6590 }, { "epoch": 103.125, "grad_norm": 1.2543760538101196, "learning_rate": 1.4777046799247052e-05, "loss": 0.1647, "step": 6600 }, { "epoch": 103.28125, "grad_norm": 1.4133225679397583, "learning_rate": 1.4572256830070497e-05, "loss": 0.1702, "step": 6610 }, { "epoch": 103.4375, "grad_norm": 1.3233059644699097, "learning_rate": 1.4368784262372736e-05, "loss": 0.169, "step": 6620 }, { "epoch": 103.59375, "grad_norm": 1.228870153427124, "learning_rate": 1.4166632233952504e-05, "loss": 0.1717, "step": 6630 }, { "epoch": 103.75, "grad_norm": 1.2790390253067017, "learning_rate": 1.3965803862244164e-05, "loss": 0.1655, "step": 6640 }, { "epoch": 103.90625, "grad_norm": 1.3549610376358032, "learning_rate": 1.3766302244269624e-05, "loss": 0.167, "step": 6650 }, { "epoch": 104.0625, "grad_norm": 1.2854844331741333, "learning_rate": 1.3568130456590588e-05, "loss": 0.1663, "step": 6660 }, { "epoch": 104.21875, "grad_norm": 1.270037055015564, "learning_rate": 1.3371291555261256e-05, "loss": 0.1627, "step": 6670 }, { "epoch": 104.375, "grad_norm": 1.276854395866394, "learning_rate": 1.3175788575780945e-05, "loss": 0.1732, "step": 6680 }, { "epoch": 104.53125, "grad_norm": 1.3133665323257446, "learning_rate": 1.2981624533047432e-05, "loss": 0.1683, "step": 6690 }, { "epoch": 104.6875, "grad_norm": 1.3067399263381958, "learning_rate": 1.2788802421310463e-05, "loss": 0.1683, "step": 6700 }, { "epoch": 104.84375, "grad_norm": 1.3023086786270142, "learning_rate": 1.2597325214125566e-05, "loss": 0.1711, "step": 6710 }, { "epoch": 105.0, "grad_norm": 1.4500195980072021, "learning_rate": 1.2407195864308096e-05, "loss": 0.1714, "step": 6720 }, { "epoch": 105.15625, "grad_norm": 1.4866911172866821, "learning_rate": 1.2218417303887842e-05, "loss": 0.1693, "step": 6730 }, { "epoch": 105.3125, "grad_norm": 1.0444018840789795, "learning_rate": 1.2030992444063726e-05, "loss": 0.1649, "step": 6740 }, { "epoch": 105.46875, "grad_norm": 1.5279264450073242, "learning_rate": 1.184492417515899e-05, "loss": 0.1683, "step": 6750 }, { "epoch": 105.625, "grad_norm": 1.2571183443069458, "learning_rate": 1.1660215366576454e-05, "loss": 0.1691, "step": 6760 }, { "epoch": 105.78125, "grad_norm": 1.3707107305526733, "learning_rate": 1.1476868866754486e-05, "loss": 0.1662, "step": 6770 }, { "epoch": 105.9375, "grad_norm": 1.2116854190826416, "learning_rate": 1.1294887503122919e-05, "loss": 0.1664, "step": 6780 }, { "epoch": 106.09375, "grad_norm": 1.4501864910125732, "learning_rate": 1.1114274082059451e-05, "loss": 0.1663, "step": 6790 }, { "epoch": 106.25, "grad_norm": 1.7519021034240723, "learning_rate": 1.093503138884654e-05, "loss": 0.167, "step": 6800 }, { "epoch": 106.40625, "grad_norm": 1.4105887413024902, "learning_rate": 1.0757162187628222e-05, "loss": 0.1688, "step": 6810 }, { "epoch": 106.5625, "grad_norm": 1.1597462892532349, "learning_rate": 1.0580669221367589e-05, "loss": 0.1639, "step": 6820 }, { "epoch": 106.71875, "grad_norm": 1.3421756029129028, "learning_rate": 1.0405555211804518e-05, "loss": 0.1689, "step": 6830 }, { "epoch": 106.875, "grad_norm": 1.3345783948898315, "learning_rate": 1.0231822859413709e-05, "loss": 0.1717, "step": 6840 }, { "epoch": 107.03125, "grad_norm": 1.2460522651672363, "learning_rate": 1.0059474843362892e-05, "loss": 0.1649, "step": 6850 }, { "epoch": 107.1875, "grad_norm": 1.328639268875122, "learning_rate": 9.888513821471735e-06, "loss": 0.1623, "step": 6860 }, { "epoch": 107.34375, "grad_norm": 1.4876681566238403, "learning_rate": 9.71894243017063e-06, "loss": 0.1668, "step": 6870 }, { "epoch": 107.5, "grad_norm": 1.2804096937179565, "learning_rate": 9.55076328446024e-06, "loss": 0.1693, "step": 6880 }, { "epoch": 107.65625, "grad_norm": 1.2431553602218628, "learning_rate": 9.383978977871021e-06, "loss": 0.1669, "step": 6890 }, { "epoch": 107.8125, "grad_norm": 1.6604621410369873, "learning_rate": 9.218592082423316e-06, "loss": 0.1686, "step": 6900 }, { "epoch": 107.96875, "grad_norm": 1.3399462699890137, "learning_rate": 9.054605148587624e-06, "loss": 0.1689, "step": 6910 }, { "epoch": 108.125, "grad_norm": 1.9574614763259888, "learning_rate": 8.892020705245341e-06, "loss": 0.1635, "step": 6920 }, { "epoch": 108.28125, "grad_norm": 1.5866326093673706, "learning_rate": 8.730841259649725e-06, "loss": 0.1645, "step": 6930 }, { "epoch": 108.4375, "grad_norm": 1.3083440065383911, "learning_rate": 8.571069297387202e-06, "loss": 0.1682, "step": 6940 }, { "epoch": 108.59375, "grad_norm": 1.4477092027664185, "learning_rate": 8.41270728233905e-06, "loss": 0.1634, "step": 6950 }, { "epoch": 108.75, "grad_norm": 1.584891676902771, "learning_rate": 8.255757656643482e-06, "loss": 0.1685, "step": 6960 }, { "epoch": 108.90625, "grad_norm": 1.29928457736969, "learning_rate": 8.100222840657878e-06, "loss": 0.1703, "step": 6970 }, { "epoch": 109.0625, "grad_norm": 1.2276784181594849, "learning_rate": 7.946105232921474e-06, "loss": 0.1695, "step": 6980 }, { "epoch": 109.21875, "grad_norm": 1.2223650217056274, "learning_rate": 7.793407210118464e-06, "loss": 0.1651, "step": 6990 }, { "epoch": 109.375, "grad_norm": 1.1992684602737427, "learning_rate": 7.642131127041275e-06, "loss": 0.1693, "step": 7000 }, { "epoch": 109.53125, "grad_norm": 1.429261565208435, "learning_rate": 7.492279316554207e-06, "loss": 0.1676, "step": 7010 }, { "epoch": 109.6875, "grad_norm": 1.4679069519042969, "learning_rate": 7.343854089557556e-06, "loss": 0.1687, "step": 7020 }, { "epoch": 109.84375, "grad_norm": 1.261810541152954, "learning_rate": 7.1968577349519565e-06, "loss": 0.162, "step": 7030 }, { "epoch": 110.0, "grad_norm": 1.4870808124542236, "learning_rate": 7.051292519603014e-06, "loss": 0.1682, "step": 7040 }, { "epoch": 110.15625, "grad_norm": 1.2893967628479004, "learning_rate": 6.907160688306425e-06, "loss": 0.1637, "step": 7050 }, { "epoch": 110.3125, "grad_norm": 1.3684417009353638, "learning_rate": 6.764464463753362e-06, "loss": 0.1674, "step": 7060 }, { "epoch": 110.46875, "grad_norm": 1.3469291925430298, "learning_rate": 6.6232060464961e-06, "loss": 0.1628, "step": 7070 }, { "epoch": 110.625, "grad_norm": 1.1284126043319702, "learning_rate": 6.48338761491416e-06, "loss": 0.1654, "step": 7080 }, { "epoch": 110.78125, "grad_norm": 1.8623775243759155, "learning_rate": 6.345011325180772e-06, "loss": 0.1677, "step": 7090 }, { "epoch": 110.9375, "grad_norm": 1.4487367868423462, "learning_rate": 6.2080793112294755e-06, "loss": 0.1674, "step": 7100 }, { "epoch": 111.09375, "grad_norm": 1.5421063899993896, "learning_rate": 6.072593684721295e-06, "loss": 0.1657, "step": 7110 }, { "epoch": 111.25, "grad_norm": 1.2256017923355103, "learning_rate": 5.938556535012229e-06, "loss": 0.1626, "step": 7120 }, { "epoch": 111.40625, "grad_norm": 1.5549722909927368, "learning_rate": 5.805969929120947e-06, "loss": 0.1684, "step": 7130 }, { "epoch": 111.5625, "grad_norm": 1.4856703281402588, "learning_rate": 5.674835911696885e-06, "loss": 0.1642, "step": 7140 }, { "epoch": 111.71875, "grad_norm": 1.4706385135650635, "learning_rate": 5.545156504988857e-06, "loss": 0.1689, "step": 7150 }, { "epoch": 111.875, "grad_norm": 1.3578001260757446, "learning_rate": 5.4169337088136984e-06, "loss": 0.1656, "step": 7160 }, { "epoch": 112.03125, "grad_norm": 1.2759300470352173, "learning_rate": 5.290169500525577e-06, "loss": 0.1676, "step": 7170 }, { "epoch": 112.1875, "grad_norm": 1.3984161615371704, "learning_rate": 5.164865834985377e-06, "loss": 0.1641, "step": 7180 }, { "epoch": 112.34375, "grad_norm": 1.5849649906158447, "learning_rate": 5.0410246445306455e-06, "loss": 0.1674, "step": 7190 }, { "epoch": 112.5, "grad_norm": 1.6095918416976929, "learning_rate": 4.918647838945711e-06, "loss": 0.1676, "step": 7200 }, { "epoch": 112.65625, "grad_norm": 1.3017394542694092, "learning_rate": 4.797737305432337e-06, "loss": 0.1637, "step": 7210 }, { "epoch": 112.8125, "grad_norm": 1.5304924249649048, "learning_rate": 4.678294908580505e-06, "loss": 0.1664, "step": 7220 }, { "epoch": 112.96875, "grad_norm": 1.3150688409805298, "learning_rate": 4.560322490339741e-06, "loss": 0.1651, "step": 7230 }, { "epoch": 113.125, "grad_norm": 1.2478370666503906, "learning_rate": 4.443821869990661e-06, "loss": 0.1627, "step": 7240 }, { "epoch": 113.28125, "grad_norm": 1.1985414028167725, "learning_rate": 4.328794844116946e-06, "loss": 0.1673, "step": 7250 }, { "epoch": 113.4375, "grad_norm": 1.6331043243408203, "learning_rate": 4.215243186577656e-06, "loss": 0.1677, "step": 7260 }, { "epoch": 113.59375, "grad_norm": 1.3105155229568481, "learning_rate": 4.103168648479794e-06, "loss": 0.1633, "step": 7270 }, { "epoch": 113.75, "grad_norm": 1.6450676918029785, "learning_rate": 3.992572958151397e-06, "loss": 0.1698, "step": 7280 }, { "epoch": 113.90625, "grad_norm": 1.1760913133621216, "learning_rate": 3.883457821114811e-06, "loss": 0.1667, "step": 7290 }, { "epoch": 114.0625, "grad_norm": 1.5002760887145996, "learning_rate": 3.7758249200604444e-06, "loss": 0.1661, "step": 7300 }, { "epoch": 114.21875, "grad_norm": 1.1639865636825562, "learning_rate": 3.6696759148207583e-06, "loss": 0.1633, "step": 7310 }, { "epoch": 114.375, "grad_norm": 1.173621654510498, "learning_rate": 3.5650124423447507e-06, "loss": 0.1643, "step": 7320 }, { "epoch": 114.53125, "grad_norm": 1.1538360118865967, "learning_rate": 3.461836116672612e-06, "loss": 0.1675, "step": 7330 }, { "epoch": 114.6875, "grad_norm": 1.4228068590164185, "learning_rate": 3.3601485289109446e-06, "loss": 0.1661, "step": 7340 }, { "epoch": 114.84375, "grad_norm": 1.2065403461456299, "learning_rate": 3.259951247208148e-06, "loss": 0.1649, "step": 7350 }, { "epoch": 115.0, "grad_norm": 1.54328191280365, "learning_rate": 3.1612458167302518e-06, "loss": 0.1614, "step": 7360 }, { "epoch": 115.15625, "grad_norm": 1.7460312843322754, "learning_rate": 3.064033759637064e-06, "loss": 0.165, "step": 7370 }, { "epoch": 115.3125, "grad_norm": 1.4038829803466797, "learning_rate": 2.968316575058816e-06, "loss": 0.1602, "step": 7380 }, { "epoch": 115.46875, "grad_norm": 1.490884780883789, "learning_rate": 2.8740957390728796e-06, "loss": 0.1662, "step": 7390 }, { "epoch": 115.625, "grad_norm": 1.2959210872650146, "learning_rate": 2.7813727046810846e-06, "loss": 0.1649, "step": 7400 }, { "epoch": 115.78125, "grad_norm": 1.3744455575942993, "learning_rate": 2.690148901787337e-06, "loss": 0.1706, "step": 7410 }, { "epoch": 115.9375, "grad_norm": 1.475809931755066, "learning_rate": 2.600425737175549e-06, "loss": 0.1661, "step": 7420 }, { "epoch": 116.09375, "grad_norm": 1.2936959266662598, "learning_rate": 2.5122045944878882e-06, "loss": 0.171, "step": 7430 }, { "epoch": 116.25, "grad_norm": 1.2538620233535767, "learning_rate": 2.4254868342035054e-06, "loss": 0.1666, "step": 7440 }, { "epoch": 116.40625, "grad_norm": 1.0937186479568481, "learning_rate": 2.3402737936175425e-06, "loss": 0.1633, "step": 7450 }, { "epoch": 116.5625, "grad_norm": 1.490662932395935, "learning_rate": 2.2565667868205133e-06, "loss": 0.1652, "step": 7460 }, { "epoch": 116.71875, "grad_norm": 1.2698343992233276, "learning_rate": 2.174367104677999e-06, "loss": 0.1676, "step": 7470 }, { "epoch": 116.875, "grad_norm": 1.3303751945495605, "learning_rate": 2.093676014810797e-06, "loss": 0.1623, "step": 7480 }, { "epoch": 117.03125, "grad_norm": 1.4380319118499756, "learning_rate": 2.014494761575314e-06, "loss": 0.1603, "step": 7490 }, { "epoch": 117.1875, "grad_norm": 1.2588881254196167, "learning_rate": 1.9368245660444372e-06, "loss": 0.1652, "step": 7500 }, { "epoch": 117.34375, "grad_norm": 1.4556334018707275, "learning_rate": 1.8606666259886719e-06, "loss": 0.1623, "step": 7510 }, { "epoch": 117.5, "grad_norm": 1.451319932937622, "learning_rate": 1.7860221158576662e-06, "loss": 0.1696, "step": 7520 }, { "epoch": 117.65625, "grad_norm": 1.4228541851043701, "learning_rate": 1.712892186762083e-06, "loss": 0.1616, "step": 7530 }, { "epoch": 117.8125, "grad_norm": 1.5599255561828613, "learning_rate": 1.6412779664559119e-06, "loss": 0.1641, "step": 7540 }, { "epoch": 117.96875, "grad_norm": 1.4440757036209106, "learning_rate": 1.5711805593190188e-06, "loss": 0.1676, "step": 7550 }, { "epoch": 118.125, "grad_norm": 1.2332494258880615, "learning_rate": 1.5026010463401352e-06, "loss": 0.1689, "step": 7560 }, { "epoch": 118.28125, "grad_norm": 1.2784188985824585, "learning_rate": 1.4355404851001952e-06, "loss": 0.1606, "step": 7570 }, { "epoch": 118.4375, "grad_norm": 1.3040153980255127, "learning_rate": 1.3699999097560034e-06, "loss": 0.1652, "step": 7580 }, { "epoch": 118.59375, "grad_norm": 1.331621766090393, "learning_rate": 1.3059803310243147e-06, "loss": 0.165, "step": 7590 }, { "epoch": 118.75, "grad_norm": 1.4870508909225464, "learning_rate": 1.2434827361662237e-06, "loss": 0.1654, "step": 7600 }, { "epoch": 118.90625, "grad_norm": 1.3456453084945679, "learning_rate": 1.1825080889719563e-06, "loss": 0.1638, "step": 7610 }, { "epoch": 119.0625, "grad_norm": 1.5864441394805908, "learning_rate": 1.1230573297460022e-06, "loss": 0.1668, "step": 7620 }, { "epoch": 119.21875, "grad_norm": 1.2966445684432983, "learning_rate": 1.0651313752926052e-06, "loss": 0.1644, "step": 7630 }, { "epoch": 119.375, "grad_norm": 1.3712732791900635, "learning_rate": 1.0087311189016623e-06, "loss": 0.1641, "step": 7640 }, { "epoch": 119.53125, "grad_norm": 1.4837236404418945, "learning_rate": 9.538574303348813e-07, "loss": 0.1639, "step": 7650 }, { "epoch": 119.6875, "grad_norm": 1.2135469913482666, "learning_rate": 9.005111558124223e-07, "loss": 0.1648, "step": 7660 }, { "epoch": 119.84375, "grad_norm": 1.3776029348373413, "learning_rate": 8.486931179998325e-07, "loss": 0.1646, "step": 7670 }, { "epoch": 120.0, "grad_norm": 1.459794521331787, "learning_rate": 7.984041159953659e-07, "loss": 0.1681, "step": 7680 }, { "epoch": 120.15625, "grad_norm": 1.116540551185608, "learning_rate": 7.496449253176274e-07, "loss": 0.1627, "step": 7690 }, { "epoch": 120.3125, "grad_norm": 1.203869104385376, "learning_rate": 7.0241629789366e-07, "loss": 0.1655, "step": 7700 }, { "epoch": 120.46875, "grad_norm": 1.3655177354812622, "learning_rate": 6.567189620473424e-07, "loss": 0.1689, "step": 7710 }, { "epoch": 120.625, "grad_norm": 1.3210086822509766, "learning_rate": 6.12553622488099e-07, "loss": 0.164, "step": 7720 }, { "epoch": 120.78125, "grad_norm": 1.2926894426345825, "learning_rate": 5.699209603001076e-07, "loss": 0.1651, "step": 7730 }, { "epoch": 120.9375, "grad_norm": 1.445736050605774, "learning_rate": 5.288216329317641e-07, "loss": 0.1626, "step": 7740 }, { "epoch": 121.09375, "grad_norm": 1.3684810400009155, "learning_rate": 4.892562741855455e-07, "loss": 0.1661, "step": 7750 }, { "epoch": 121.25, "grad_norm": 1.2554900646209717, "learning_rate": 4.512254942082073e-07, "loss": 0.1561, "step": 7760 }, { "epoch": 121.40625, "grad_norm": 1.4690603017807007, "learning_rate": 4.1472987948143473e-07, "loss": 0.1674, "step": 7770 }, { "epoch": 121.5625, "grad_norm": 1.4324357509613037, "learning_rate": 3.797699928127507e-07, "loss": 0.1698, "step": 7780 }, { "epoch": 121.71875, "grad_norm": 1.0271071195602417, "learning_rate": 3.463463733268557e-07, "loss": 0.1661, "step": 7790 }, { "epoch": 121.875, "grad_norm": 1.151157021522522, "learning_rate": 3.144595364573233e-07, "loss": 0.16, "step": 7800 }, { "epoch": 122.03125, "grad_norm": 1.1873737573623657, "learning_rate": 2.841099739386066e-07, "loss": 0.1667, "step": 7810 }, { "epoch": 122.1875, "grad_norm": 1.3843321800231934, "learning_rate": 2.552981537985111e-07, "loss": 0.1608, "step": 7820 }, { "epoch": 122.34375, "grad_norm": 1.2830843925476074, "learning_rate": 2.2802452035093346e-07, "loss": 0.1639, "step": 7830 }, { "epoch": 122.5, "grad_norm": 1.2948521375656128, "learning_rate": 2.022894941890674e-07, "loss": 0.167, "step": 7840 }, { "epoch": 122.65625, "grad_norm": 1.2428685426712036, "learning_rate": 1.7809347217881966e-07, "loss": 0.161, "step": 7850 }, { "epoch": 122.8125, "grad_norm": 1.55159592628479, "learning_rate": 1.5543682745280396e-07, "loss": 0.1648, "step": 7860 }, { "epoch": 122.96875, "grad_norm": 1.3191789388656616, "learning_rate": 1.3431990940450113e-07, "loss": 0.1698, "step": 7870 }, { "epoch": 123.125, "grad_norm": 1.4506255388259888, "learning_rate": 1.1474304368291888e-07, "loss": 0.1676, "step": 7880 }, { "epoch": 123.28125, "grad_norm": 1.2544479370117188, "learning_rate": 9.670653218752934e-08, "loss": 0.1649, "step": 7890 }, { "epoch": 123.4375, "grad_norm": 1.184103012084961, "learning_rate": 8.021065306366149e-08, "loss": 0.16, "step": 7900 }, { "epoch": 123.59375, "grad_norm": 1.0644958019256592, "learning_rate": 6.525566069817135e-08, "loss": 0.1632, "step": 7910 }, { "epoch": 123.75, "grad_norm": 1.4110705852508545, "learning_rate": 5.18417857155562e-08, "loss": 0.1688, "step": 7920 }, { "epoch": 123.90625, "grad_norm": 1.2266552448272705, "learning_rate": 3.996923497434635e-08, "loss": 0.1702, "step": 7930 }, { "epoch": 124.0625, "grad_norm": 1.2383413314819336, "learning_rate": 2.9638191563974294e-08, "loss": 0.1597, "step": 7940 }, { "epoch": 124.21875, "grad_norm": 1.2518082857131958, "learning_rate": 2.084881480188816e-08, "loss": 0.1639, "step": 7950 }, { "epoch": 124.375, "grad_norm": 1.429856777191162, "learning_rate": 1.3601240231175816e-08, "loss": 0.1697, "step": 7960 }, { "epoch": 124.53125, "grad_norm": 1.521837592124939, "learning_rate": 7.895579618388827e-09, "loss": 0.1649, "step": 7970 }, { "epoch": 124.6875, "grad_norm": 1.2289235591888428, "learning_rate": 3.731920951877133e-09, "loss": 0.1648, "step": 7980 }, { "epoch": 124.84375, "grad_norm": 1.900389313697815, "learning_rate": 1.1103284404012648e-09, "loss": 0.1635, "step": 7990 }, { "epoch": 125.0, "grad_norm": 1.170914888381958, "learning_rate": 3.084251216645129e-11, "loss": 0.1648, "step": 8000 } ], "logging_steps": 10, "max_steps": 8000, "num_input_tokens_seen": 0, "num_train_epochs": 125, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.7740235738629325e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }