diff --git "a/last-checkpoint/trainer_state.json" "b/last-checkpoint/trainer_state.json" --- "a/last-checkpoint/trainer_state.json" +++ "b/last-checkpoint/trainer_state.json" @@ -1,9 +1,9 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 0.250214364460414, + "epoch": 0.500428728920828, "eval_steps": 766, - "global_step": 766, + "global_step": 1532, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, @@ -5385,6 +5385,5376 @@ "eval_samples_per_second": 5.164, "eval_steps_per_second": 2.582, "step": 766 + }, + { + "epoch": 0.25054101506675924, + "grad_norm": 0.4362111985683441, + "learning_rate": 8.664921401487023e-05, + "loss": 1.0782, + "step": 767 + }, + { + "epoch": 0.2508676656731044, + "grad_norm": 0.4964749813079834, + "learning_rate": 8.661370661824e-05, + "loss": 1.1945, + "step": 768 + }, + { + "epoch": 0.2511943162794496, + "grad_norm": 0.5608397722244263, + "learning_rate": 8.657815936302337e-05, + "loss": 1.2661, + "step": 769 + }, + { + "epoch": 0.2515209668857948, + "grad_norm": 0.5862159132957458, + "learning_rate": 8.654257228791795e-05, + "loss": 1.0976, + "step": 770 + }, + { + "epoch": 0.25184761749213996, + "grad_norm": 0.7066771984100342, + "learning_rate": 8.650694543166475e-05, + "loss": 1.274, + "step": 771 + }, + { + "epoch": 0.25217426809848514, + "grad_norm": 0.7534899711608887, + "learning_rate": 8.647127883304799e-05, + "loss": 1.1669, + "step": 772 + }, + { + "epoch": 0.2525009187048303, + "grad_norm": 0.9842721819877625, + "learning_rate": 8.643557253089525e-05, + "loss": 1.3533, + "step": 773 + }, + { + "epoch": 0.25282756931117556, + "grad_norm": 1.294683814048767, + "learning_rate": 8.639982656407729e-05, + "loss": 1.7016, + "step": 774 + }, + { + "epoch": 0.25315421991752074, + "grad_norm": 1.7022143602371216, + "learning_rate": 8.636404097150802e-05, + "loss": 1.9253, + "step": 775 + }, + { + "epoch": 0.2534808705238659, + "grad_norm": 0.19106809794902802, + "learning_rate": 8.632821579214456e-05, + "loss": 0.7292, + "step": 776 + }, + { + "epoch": 0.2538075211302111, + "grad_norm": 0.2291499525308609, + "learning_rate": 8.629235106498708e-05, + "loss": 0.8147, + "step": 777 + }, + { + "epoch": 0.2541341717365563, + "grad_norm": 0.23071865737438202, + "learning_rate": 8.625644682907879e-05, + "loss": 0.7612, + "step": 778 + }, + { + "epoch": 0.25446082234290146, + "grad_norm": 0.24651503562927246, + "learning_rate": 8.622050312350594e-05, + "loss": 0.874, + "step": 779 + }, + { + "epoch": 0.25478747294924664, + "grad_norm": 0.2605278789997101, + "learning_rate": 8.618451998739774e-05, + "loss": 0.8279, + "step": 780 + }, + { + "epoch": 0.2551141235555919, + "grad_norm": 0.2556253969669342, + "learning_rate": 8.614849745992632e-05, + "loss": 0.8052, + "step": 781 + }, + { + "epoch": 0.25544077416193706, + "grad_norm": 0.28385311365127563, + "learning_rate": 8.611243558030668e-05, + "loss": 0.8039, + "step": 782 + }, + { + "epoch": 0.25576742476828224, + "grad_norm": 0.303835928440094, + "learning_rate": 8.60763343877967e-05, + "loss": 0.9079, + "step": 783 + }, + { + "epoch": 0.2560940753746274, + "grad_norm": 0.31251901388168335, + "learning_rate": 8.604019392169702e-05, + "loss": 0.8428, + "step": 784 + }, + { + "epoch": 0.2564207259809726, + "grad_norm": 0.335475891828537, + "learning_rate": 8.600401422135104e-05, + "loss": 0.9193, + "step": 785 + }, + { + "epoch": 0.2567473765873178, + "grad_norm": 0.3252756595611572, + "learning_rate": 8.596779532614488e-05, + "loss": 0.8837, + "step": 786 + }, + { + "epoch": 0.25707402719366296, + "grad_norm": 0.3637792766094208, + "learning_rate": 8.593153727550732e-05, + "loss": 0.9312, + "step": 787 + }, + { + "epoch": 0.2574006778000082, + "grad_norm": 0.4200628399848938, + "learning_rate": 8.589524010890977e-05, + "loss": 1.0147, + "step": 788 + }, + { + "epoch": 0.2577273284063534, + "grad_norm": 0.3719130754470825, + "learning_rate": 8.585890386586623e-05, + "loss": 0.9036, + "step": 789 + }, + { + "epoch": 0.25805397901269855, + "grad_norm": 0.4389430284500122, + "learning_rate": 8.582252858593324e-05, + "loss": 1.0327, + "step": 790 + }, + { + "epoch": 0.25838062961904373, + "grad_norm": 0.445206880569458, + "learning_rate": 8.578611430870979e-05, + "loss": 1.1391, + "step": 791 + }, + { + "epoch": 0.2587072802253889, + "grad_norm": 0.5291475057601929, + "learning_rate": 8.574966107383744e-05, + "loss": 1.2331, + "step": 792 + }, + { + "epoch": 0.2590339308317341, + "grad_norm": 0.5588846802711487, + "learning_rate": 8.5713168921e-05, + "loss": 1.0723, + "step": 793 + }, + { + "epoch": 0.2593605814380793, + "grad_norm": 0.6296544075012207, + "learning_rate": 8.567663788992377e-05, + "loss": 1.2545, + "step": 794 + }, + { + "epoch": 0.2596872320444245, + "grad_norm": 0.6767513155937195, + "learning_rate": 8.564006802037734e-05, + "loss": 1.1453, + "step": 795 + }, + { + "epoch": 0.2600138826507697, + "grad_norm": 0.7685205340385437, + "learning_rate": 8.560345935217155e-05, + "loss": 1.169, + "step": 796 + }, + { + "epoch": 0.26034053325711487, + "grad_norm": 0.975924015045166, + "learning_rate": 8.556681192515952e-05, + "loss": 1.3188, + "step": 797 + }, + { + "epoch": 0.26066718386346005, + "grad_norm": 1.1853704452514648, + "learning_rate": 8.553012577923653e-05, + "loss": 1.5078, + "step": 798 + }, + { + "epoch": 0.26099383446980523, + "grad_norm": 1.6453309059143066, + "learning_rate": 8.549340095434006e-05, + "loss": 1.5953, + "step": 799 + }, + { + "epoch": 0.2613204850761504, + "grad_norm": 1.7937952280044556, + "learning_rate": 8.54566374904496e-05, + "loss": 2.4296, + "step": 800 + }, + { + "epoch": 0.2616471356824956, + "grad_norm": 0.190406933426857, + "learning_rate": 8.541983542758685e-05, + "loss": 0.7342, + "step": 801 + }, + { + "epoch": 0.2619737862888408, + "grad_norm": 0.22838981449604034, + "learning_rate": 8.538299480581538e-05, + "loss": 0.7493, + "step": 802 + }, + { + "epoch": 0.262300436895186, + "grad_norm": 0.2559290826320648, + "learning_rate": 8.53461156652408e-05, + "loss": 0.8504, + "step": 803 + }, + { + "epoch": 0.2626270875015312, + "grad_norm": 0.2588362693786621, + "learning_rate": 8.53091980460107e-05, + "loss": 0.9, + "step": 804 + }, + { + "epoch": 0.26295373810787637, + "grad_norm": 0.2569828927516937, + "learning_rate": 8.527224198831447e-05, + "loss": 0.813, + "step": 805 + }, + { + "epoch": 0.26328038871422155, + "grad_norm": 0.2703791558742523, + "learning_rate": 8.523524753238342e-05, + "loss": 0.8616, + "step": 806 + }, + { + "epoch": 0.2636070393205667, + "grad_norm": 0.27567189931869507, + "learning_rate": 8.519821471849061e-05, + "loss": 0.9035, + "step": 807 + }, + { + "epoch": 0.2639336899269119, + "grad_norm": 0.2831965386867523, + "learning_rate": 8.516114358695089e-05, + "loss": 0.94, + "step": 808 + }, + { + "epoch": 0.2642603405332571, + "grad_norm": 0.28172117471694946, + "learning_rate": 8.51240341781208e-05, + "loss": 0.8232, + "step": 809 + }, + { + "epoch": 0.2645869911396023, + "grad_norm": 0.3024875521659851, + "learning_rate": 8.508688653239858e-05, + "loss": 0.8993, + "step": 810 + }, + { + "epoch": 0.2649136417459475, + "grad_norm": 0.3354199528694153, + "learning_rate": 8.504970069022404e-05, + "loss": 0.89, + "step": 811 + }, + { + "epoch": 0.2652402923522927, + "grad_norm": 0.34784260392189026, + "learning_rate": 8.501247669207864e-05, + "loss": 1.0391, + "step": 812 + }, + { + "epoch": 0.26556694295863786, + "grad_norm": 0.3697127103805542, + "learning_rate": 8.497521457848532e-05, + "loss": 1.0011, + "step": 813 + }, + { + "epoch": 0.26589359356498304, + "grad_norm": 0.37952542304992676, + "learning_rate": 8.493791439000855e-05, + "loss": 0.8449, + "step": 814 + }, + { + "epoch": 0.2662202441713282, + "grad_norm": 0.42014080286026, + "learning_rate": 8.490057616725424e-05, + "loss": 0.9275, + "step": 815 + }, + { + "epoch": 0.2665468947776734, + "grad_norm": 0.46746087074279785, + "learning_rate": 8.48631999508697e-05, + "loss": 1.2106, + "step": 816 + }, + { + "epoch": 0.26687354538401864, + "grad_norm": 0.5447984933853149, + "learning_rate": 8.482578578154361e-05, + "loss": 1.1964, + "step": 817 + }, + { + "epoch": 0.2672001959903638, + "grad_norm": 0.5636278390884399, + "learning_rate": 8.478833370000594e-05, + "loss": 1.1554, + "step": 818 + }, + { + "epoch": 0.267526846596709, + "grad_norm": 0.6589205861091614, + "learning_rate": 8.475084374702797e-05, + "loss": 1.2816, + "step": 819 + }, + { + "epoch": 0.2678534972030542, + "grad_norm": 0.716610312461853, + "learning_rate": 8.47133159634222e-05, + "loss": 1.2345, + "step": 820 + }, + { + "epoch": 0.26818014780939936, + "grad_norm": 0.866495668888092, + "learning_rate": 8.467575039004227e-05, + "loss": 1.2124, + "step": 821 + }, + { + "epoch": 0.26850679841574454, + "grad_norm": 1.089608073234558, + "learning_rate": 8.463814706778304e-05, + "loss": 1.69, + "step": 822 + }, + { + "epoch": 0.2688334490220897, + "grad_norm": 1.1608694791793823, + "learning_rate": 8.460050603758035e-05, + "loss": 1.732, + "step": 823 + }, + { + "epoch": 0.26916009962843496, + "grad_norm": 1.2007168531417847, + "learning_rate": 8.456282734041121e-05, + "loss": 1.83, + "step": 824 + }, + { + "epoch": 0.26948675023478014, + "grad_norm": 1.6301062107086182, + "learning_rate": 8.452511101729357e-05, + "loss": 1.664, + "step": 825 + }, + { + "epoch": 0.2698134008411253, + "grad_norm": 0.1741432547569275, + "learning_rate": 8.448735710928635e-05, + "loss": 0.6838, + "step": 826 + }, + { + "epoch": 0.2701400514474705, + "grad_norm": 0.21807898581027985, + "learning_rate": 8.444956565748937e-05, + "loss": 0.8771, + "step": 827 + }, + { + "epoch": 0.2704667020538157, + "grad_norm": 0.23946182429790497, + "learning_rate": 8.441173670304337e-05, + "loss": 0.7937, + "step": 828 + }, + { + "epoch": 0.27079335266016086, + "grad_norm": 0.24787762761116028, + "learning_rate": 8.437387028712984e-05, + "loss": 0.8956, + "step": 829 + }, + { + "epoch": 0.27112000326650604, + "grad_norm": 0.26188749074935913, + "learning_rate": 8.433596645097114e-05, + "loss": 0.8621, + "step": 830 + }, + { + "epoch": 0.2714466538728513, + "grad_norm": 0.28136762976646423, + "learning_rate": 8.429802523583032e-05, + "loss": 0.8664, + "step": 831 + }, + { + "epoch": 0.27177330447919645, + "grad_norm": 0.2663547992706299, + "learning_rate": 8.42600466830111e-05, + "loss": 0.8265, + "step": 832 + }, + { + "epoch": 0.27209995508554163, + "grad_norm": 0.2984652817249298, + "learning_rate": 8.422203083385791e-05, + "loss": 0.9242, + "step": 833 + }, + { + "epoch": 0.2724266056918868, + "grad_norm": 0.30643805861473083, + "learning_rate": 8.418397772975571e-05, + "loss": 0.914, + "step": 834 + }, + { + "epoch": 0.272753256298232, + "grad_norm": 0.33440685272216797, + "learning_rate": 8.414588741213004e-05, + "loss": 0.8974, + "step": 835 + }, + { + "epoch": 0.2730799069045772, + "grad_norm": 0.34194639325141907, + "learning_rate": 8.410775992244699e-05, + "loss": 0.9481, + "step": 836 + }, + { + "epoch": 0.27340655751092235, + "grad_norm": 0.3455904722213745, + "learning_rate": 8.406959530221308e-05, + "loss": 0.8916, + "step": 837 + }, + { + "epoch": 0.2737332081172676, + "grad_norm": 0.38675469160079956, + "learning_rate": 8.403139359297526e-05, + "loss": 0.981, + "step": 838 + }, + { + "epoch": 0.27405985872361277, + "grad_norm": 0.38662129640579224, + "learning_rate": 8.399315483632087e-05, + "loss": 0.9426, + "step": 839 + }, + { + "epoch": 0.27438650932995795, + "grad_norm": 0.4323444366455078, + "learning_rate": 8.395487907387751e-05, + "loss": 0.9693, + "step": 840 + }, + { + "epoch": 0.27471315993630313, + "grad_norm": 0.4435561001300812, + "learning_rate": 8.391656634731319e-05, + "loss": 0.9983, + "step": 841 + }, + { + "epoch": 0.2750398105426483, + "grad_norm": 0.5139126777648926, + "learning_rate": 8.387821669833606e-05, + "loss": 1.198, + "step": 842 + }, + { + "epoch": 0.2753664611489935, + "grad_norm": 0.5524410605430603, + "learning_rate": 8.383983016869448e-05, + "loss": 1.18, + "step": 843 + }, + { + "epoch": 0.27569311175533867, + "grad_norm": 0.6392261981964111, + "learning_rate": 8.380140680017703e-05, + "loss": 1.2321, + "step": 844 + }, + { + "epoch": 0.2760197623616839, + "grad_norm": 0.6585685610771179, + "learning_rate": 8.376294663461227e-05, + "loss": 1.2603, + "step": 845 + }, + { + "epoch": 0.2763464129680291, + "grad_norm": 0.8117931485176086, + "learning_rate": 8.372444971386894e-05, + "loss": 1.3741, + "step": 846 + }, + { + "epoch": 0.27667306357437427, + "grad_norm": 0.9951660633087158, + "learning_rate": 8.368591607985571e-05, + "loss": 1.3947, + "step": 847 + }, + { + "epoch": 0.27699971418071945, + "grad_norm": 1.1902111768722534, + "learning_rate": 8.364734577452127e-05, + "loss": 1.5964, + "step": 848 + }, + { + "epoch": 0.2773263647870646, + "grad_norm": 1.2848105430603027, + "learning_rate": 8.360873883985418e-05, + "loss": 1.3661, + "step": 849 + }, + { + "epoch": 0.2776530153934098, + "grad_norm": 1.5601998567581177, + "learning_rate": 8.357009531788293e-05, + "loss": 1.5122, + "step": 850 + }, + { + "epoch": 0.277979665999755, + "grad_norm": 0.17106448113918304, + "learning_rate": 8.353141525067579e-05, + "loss": 0.6374, + "step": 851 + }, + { + "epoch": 0.2783063166061002, + "grad_norm": 0.21366915106773376, + "learning_rate": 8.349269868034087e-05, + "loss": 0.8114, + "step": 852 + }, + { + "epoch": 0.2786329672124454, + "grad_norm": 0.2161540687084198, + "learning_rate": 8.345394564902594e-05, + "loss": 0.8004, + "step": 853 + }, + { + "epoch": 0.2789596178187906, + "grad_norm": 0.24288657307624817, + "learning_rate": 8.341515619891856e-05, + "loss": 0.8151, + "step": 854 + }, + { + "epoch": 0.27928626842513576, + "grad_norm": 0.25571781396865845, + "learning_rate": 8.337633037224583e-05, + "loss": 0.8574, + "step": 855 + }, + { + "epoch": 0.27961291903148094, + "grad_norm": 0.30678462982177734, + "learning_rate": 8.333746821127455e-05, + "loss": 0.8622, + "step": 856 + }, + { + "epoch": 0.2799395696378261, + "grad_norm": 0.2756674289703369, + "learning_rate": 8.329856975831103e-05, + "loss": 0.8822, + "step": 857 + }, + { + "epoch": 0.2802662202441713, + "grad_norm": 0.2979314923286438, + "learning_rate": 8.325963505570104e-05, + "loss": 0.8659, + "step": 858 + }, + { + "epoch": 0.28059287085051654, + "grad_norm": 0.2941073775291443, + "learning_rate": 8.322066414582992e-05, + "loss": 0.8474, + "step": 859 + }, + { + "epoch": 0.2809195214568617, + "grad_norm": 0.32220232486724854, + "learning_rate": 8.318165707112233e-05, + "loss": 0.865, + "step": 860 + }, + { + "epoch": 0.2812461720632069, + "grad_norm": 0.33228668570518494, + "learning_rate": 8.314261387404234e-05, + "loss": 0.9357, + "step": 861 + }, + { + "epoch": 0.2815728226695521, + "grad_norm": 0.3545335829257965, + "learning_rate": 8.310353459709333e-05, + "loss": 0.9402, + "step": 862 + }, + { + "epoch": 0.28189947327589726, + "grad_norm": 0.3884226679801941, + "learning_rate": 8.306441928281798e-05, + "loss": 1.0393, + "step": 863 + }, + { + "epoch": 0.28222612388224244, + "grad_norm": 0.40320566296577454, + "learning_rate": 8.302526797379822e-05, + "loss": 0.961, + "step": 864 + }, + { + "epoch": 0.2825527744885876, + "grad_norm": 0.4174101948738098, + "learning_rate": 8.298608071265507e-05, + "loss": 0.9835, + "step": 865 + }, + { + "epoch": 0.28287942509493286, + "grad_norm": 0.4447181820869446, + "learning_rate": 8.29468575420488e-05, + "loss": 1.0212, + "step": 866 + }, + { + "epoch": 0.28320607570127804, + "grad_norm": 0.46445220708847046, + "learning_rate": 8.29075985046787e-05, + "loss": 1.0336, + "step": 867 + }, + { + "epoch": 0.2835327263076232, + "grad_norm": 0.5036008358001709, + "learning_rate": 8.286830364328314e-05, + "loss": 1.0846, + "step": 868 + }, + { + "epoch": 0.2838593769139684, + "grad_norm": 0.5870608687400818, + "learning_rate": 8.282897300063946e-05, + "loss": 1.051, + "step": 869 + }, + { + "epoch": 0.2841860275203136, + "grad_norm": 0.6841760277748108, + "learning_rate": 8.278960661956401e-05, + "loss": 1.366, + "step": 870 + }, + { + "epoch": 0.28451267812665876, + "grad_norm": 0.799982488155365, + "learning_rate": 8.275020454291195e-05, + "loss": 1.2628, + "step": 871 + }, + { + "epoch": 0.28483932873300394, + "grad_norm": 1.000924825668335, + "learning_rate": 8.271076681357741e-05, + "loss": 1.4612, + "step": 872 + }, + { + "epoch": 0.2851659793393492, + "grad_norm": 1.1383004188537598, + "learning_rate": 8.267129347449322e-05, + "loss": 1.6271, + "step": 873 + }, + { + "epoch": 0.28549262994569435, + "grad_norm": 1.2377307415008545, + "learning_rate": 8.26317845686311e-05, + "loss": 1.2689, + "step": 874 + }, + { + "epoch": 0.28581928055203953, + "grad_norm": 1.6466963291168213, + "learning_rate": 8.259224013900137e-05, + "loss": 1.6594, + "step": 875 + }, + { + "epoch": 0.2861459311583847, + "grad_norm": 0.2105908840894699, + "learning_rate": 8.255266022865309e-05, + "loss": 0.7822, + "step": 876 + }, + { + "epoch": 0.2864725817647299, + "grad_norm": 0.22548696398735046, + "learning_rate": 8.251304488067393e-05, + "loss": 0.7938, + "step": 877 + }, + { + "epoch": 0.2867992323710751, + "grad_norm": 0.24404418468475342, + "learning_rate": 8.247339413819015e-05, + "loss": 0.7887, + "step": 878 + }, + { + "epoch": 0.28712588297742025, + "grad_norm": 0.25363683700561523, + "learning_rate": 8.243370804436649e-05, + "loss": 0.9139, + "step": 879 + }, + { + "epoch": 0.2874525335837655, + "grad_norm": 0.25927162170410156, + "learning_rate": 8.239398664240627e-05, + "loss": 0.8177, + "step": 880 + }, + { + "epoch": 0.28777918419011067, + "grad_norm": 0.2730761170387268, + "learning_rate": 8.235422997555114e-05, + "loss": 0.9055, + "step": 881 + }, + { + "epoch": 0.28810583479645585, + "grad_norm": 0.2785848081111908, + "learning_rate": 8.231443808708122e-05, + "loss": 0.9572, + "step": 882 + }, + { + "epoch": 0.28843248540280103, + "grad_norm": 0.27791181206703186, + "learning_rate": 8.227461102031493e-05, + "loss": 0.803, + "step": 883 + }, + { + "epoch": 0.2887591360091462, + "grad_norm": 0.28716158866882324, + "learning_rate": 8.2234748818609e-05, + "loss": 0.8582, + "step": 884 + }, + { + "epoch": 0.2890857866154914, + "grad_norm": 0.3149946630001068, + "learning_rate": 8.21948515253584e-05, + "loss": 0.9189, + "step": 885 + }, + { + "epoch": 0.28941243722183657, + "grad_norm": 0.3081023395061493, + "learning_rate": 8.215491918399633e-05, + "loss": 0.8939, + "step": 886 + }, + { + "epoch": 0.2897390878281818, + "grad_norm": 0.31537654995918274, + "learning_rate": 8.211495183799413e-05, + "loss": 0.8625, + "step": 887 + }, + { + "epoch": 0.290065738434527, + "grad_norm": 0.3406592905521393, + "learning_rate": 8.20749495308612e-05, + "loss": 0.9604, + "step": 888 + }, + { + "epoch": 0.29039238904087217, + "grad_norm": 0.3376745581626892, + "learning_rate": 8.20349123061451e-05, + "loss": 0.8982, + "step": 889 + }, + { + "epoch": 0.29071903964721735, + "grad_norm": 0.3791499435901642, + "learning_rate": 8.19948402074313e-05, + "loss": 0.9649, + "step": 890 + }, + { + "epoch": 0.29104569025356253, + "grad_norm": 0.4459807872772217, + "learning_rate": 8.195473327834329e-05, + "loss": 1.1565, + "step": 891 + }, + { + "epoch": 0.2913723408599077, + "grad_norm": 0.45107483863830566, + "learning_rate": 8.191459156254247e-05, + "loss": 0.9899, + "step": 892 + }, + { + "epoch": 0.2916989914662529, + "grad_norm": 0.50025475025177, + "learning_rate": 8.187441510372808e-05, + "loss": 1.1841, + "step": 893 + }, + { + "epoch": 0.2920256420725981, + "grad_norm": 0.5542461276054382, + "learning_rate": 8.183420394563724e-05, + "loss": 1.183, + "step": 894 + }, + { + "epoch": 0.2923522926789433, + "grad_norm": 0.6245063543319702, + "learning_rate": 8.179395813204477e-05, + "loss": 1.3295, + "step": 895 + }, + { + "epoch": 0.2926789432852885, + "grad_norm": 0.7166218757629395, + "learning_rate": 8.17536777067633e-05, + "loss": 1.3525, + "step": 896 + }, + { + "epoch": 0.29300559389163366, + "grad_norm": 0.9630147814750671, + "learning_rate": 8.171336271364308e-05, + "loss": 1.4074, + "step": 897 + }, + { + "epoch": 0.29333224449797884, + "grad_norm": 1.0214118957519531, + "learning_rate": 8.167301319657201e-05, + "loss": 1.4875, + "step": 898 + }, + { + "epoch": 0.293658895104324, + "grad_norm": 1.444045901298523, + "learning_rate": 8.163262919947557e-05, + "loss": 1.6154, + "step": 899 + }, + { + "epoch": 0.2939855457106692, + "grad_norm": 1.9283764362335205, + "learning_rate": 8.159221076631678e-05, + "loss": 2.093, + "step": 900 + }, + { + "epoch": 0.29431219631701444, + "grad_norm": 0.2009824812412262, + "learning_rate": 8.155175794109614e-05, + "loss": 0.8172, + "step": 901 + }, + { + "epoch": 0.2946388469233596, + "grad_norm": 0.21653543412685394, + "learning_rate": 8.15112707678516e-05, + "loss": 0.8646, + "step": 902 + }, + { + "epoch": 0.2949654975297048, + "grad_norm": 0.26212337613105774, + "learning_rate": 8.14707492906585e-05, + "loss": 0.9042, + "step": 903 + }, + { + "epoch": 0.29529214813605, + "grad_norm": 0.26977843046188354, + "learning_rate": 8.143019355362952e-05, + "loss": 0.8568, + "step": 904 + }, + { + "epoch": 0.29561879874239516, + "grad_norm": 0.2698366045951843, + "learning_rate": 8.138960360091463e-05, + "loss": 0.8116, + "step": 905 + }, + { + "epoch": 0.29594544934874034, + "grad_norm": 0.29477614164352417, + "learning_rate": 8.134897947670108e-05, + "loss": 0.8601, + "step": 906 + }, + { + "epoch": 0.2962720999550855, + "grad_norm": 0.2830169200897217, + "learning_rate": 8.130832122521327e-05, + "loss": 0.8487, + "step": 907 + }, + { + "epoch": 0.29659875056143076, + "grad_norm": 0.3099175691604614, + "learning_rate": 8.12676288907128e-05, + "loss": 0.9419, + "step": 908 + }, + { + "epoch": 0.29692540116777594, + "grad_norm": 0.3113427758216858, + "learning_rate": 8.122690251749834e-05, + "loss": 0.94, + "step": 909 + }, + { + "epoch": 0.2972520517741211, + "grad_norm": 0.317848265171051, + "learning_rate": 8.118614214990561e-05, + "loss": 0.9457, + "step": 910 + }, + { + "epoch": 0.2975787023804663, + "grad_norm": 0.32310110330581665, + "learning_rate": 8.114534783230739e-05, + "loss": 0.93, + "step": 911 + }, + { + "epoch": 0.2979053529868115, + "grad_norm": 0.3455216884613037, + "learning_rate": 8.110451960911333e-05, + "loss": 0.9539, + "step": 912 + }, + { + "epoch": 0.29823200359315666, + "grad_norm": 0.3817412853240967, + "learning_rate": 8.106365752477012e-05, + "loss": 1.0052, + "step": 913 + }, + { + "epoch": 0.29855865419950184, + "grad_norm": 0.39544934034347534, + "learning_rate": 8.102276162376117e-05, + "loss": 0.9988, + "step": 914 + }, + { + "epoch": 0.298885304805847, + "grad_norm": 0.43939855694770813, + "learning_rate": 8.09818319506068e-05, + "loss": 1.1415, + "step": 915 + }, + { + "epoch": 0.29921195541219225, + "grad_norm": 0.442859947681427, + "learning_rate": 8.094086854986405e-05, + "loss": 1.0522, + "step": 916 + }, + { + "epoch": 0.29953860601853743, + "grad_norm": 0.4725445508956909, + "learning_rate": 8.089987146612669e-05, + "loss": 1.0607, + "step": 917 + }, + { + "epoch": 0.2998652566248826, + "grad_norm": 0.5423445701599121, + "learning_rate": 8.085884074402518e-05, + "loss": 1.1992, + "step": 918 + }, + { + "epoch": 0.3001919072312278, + "grad_norm": 0.5503812432289124, + "learning_rate": 8.081777642822657e-05, + "loss": 1.1793, + "step": 919 + }, + { + "epoch": 0.300518557837573, + "grad_norm": 0.5893210172653198, + "learning_rate": 8.077667856343449e-05, + "loss": 1.0644, + "step": 920 + }, + { + "epoch": 0.30084520844391816, + "grad_norm": 0.7819690108299255, + "learning_rate": 8.073554719438908e-05, + "loss": 1.4015, + "step": 921 + }, + { + "epoch": 0.30117185905026334, + "grad_norm": 0.8868228793144226, + "learning_rate": 8.069438236586695e-05, + "loss": 1.3257, + "step": 922 + }, + { + "epoch": 0.30149850965660857, + "grad_norm": 1.201609492301941, + "learning_rate": 8.065318412268119e-05, + "loss": 1.3868, + "step": 923 + }, + { + "epoch": 0.30182516026295375, + "grad_norm": 1.1145105361938477, + "learning_rate": 8.061195250968121e-05, + "loss": 1.4812, + "step": 924 + }, + { + "epoch": 0.30215181086929893, + "grad_norm": 1.567140817642212, + "learning_rate": 8.057068757175276e-05, + "loss": 1.737, + "step": 925 + }, + { + "epoch": 0.3024784614756441, + "grad_norm": 0.21669495105743408, + "learning_rate": 8.052938935381786e-05, + "loss": 0.7445, + "step": 926 + }, + { + "epoch": 0.3028051120819893, + "grad_norm": 0.22615468502044678, + "learning_rate": 8.048805790083481e-05, + "loss": 0.8108, + "step": 927 + }, + { + "epoch": 0.30313176268833447, + "grad_norm": 0.2774372100830078, + "learning_rate": 8.0446693257798e-05, + "loss": 0.8657, + "step": 928 + }, + { + "epoch": 0.30345841329467965, + "grad_norm": 0.2653120756149292, + "learning_rate": 8.040529546973805e-05, + "loss": 0.8441, + "step": 929 + }, + { + "epoch": 0.3037850639010249, + "grad_norm": 0.303362637758255, + "learning_rate": 8.036386458172161e-05, + "loss": 0.869, + "step": 930 + }, + { + "epoch": 0.30411171450737007, + "grad_norm": 0.29433003067970276, + "learning_rate": 8.032240063885133e-05, + "loss": 0.8341, + "step": 931 + }, + { + "epoch": 0.30443836511371525, + "grad_norm": 0.301315575838089, + "learning_rate": 8.028090368626591e-05, + "loss": 0.8746, + "step": 932 + }, + { + "epoch": 0.30476501572006043, + "grad_norm": 0.30583563446998596, + "learning_rate": 8.023937376913996e-05, + "loss": 0.8948, + "step": 933 + }, + { + "epoch": 0.3050916663264056, + "grad_norm": 0.3299638330936432, + "learning_rate": 8.019781093268396e-05, + "loss": 0.8029, + "step": 934 + }, + { + "epoch": 0.3054183169327508, + "grad_norm": 0.3463474214076996, + "learning_rate": 8.015621522214429e-05, + "loss": 0.8575, + "step": 935 + }, + { + "epoch": 0.30574496753909597, + "grad_norm": 0.3461097776889801, + "learning_rate": 8.0114586682803e-05, + "loss": 0.9876, + "step": 936 + }, + { + "epoch": 0.3060716181454412, + "grad_norm": 0.354095458984375, + "learning_rate": 8.007292535997799e-05, + "loss": 0.9575, + "step": 937 + }, + { + "epoch": 0.3063982687517864, + "grad_norm": 0.3798564076423645, + "learning_rate": 8.00312312990228e-05, + "loss": 1.0164, + "step": 938 + }, + { + "epoch": 0.30672491935813156, + "grad_norm": 0.3784736692905426, + "learning_rate": 7.998950454532662e-05, + "loss": 0.97, + "step": 939 + }, + { + "epoch": 0.30705156996447674, + "grad_norm": 0.40412405133247375, + "learning_rate": 7.99477451443142e-05, + "loss": 0.9975, + "step": 940 + }, + { + "epoch": 0.3073782205708219, + "grad_norm": 0.4350655674934387, + "learning_rate": 7.990595314144587e-05, + "loss": 1.005, + "step": 941 + }, + { + "epoch": 0.3077048711771671, + "grad_norm": 0.4765323996543884, + "learning_rate": 7.986412858221746e-05, + "loss": 1.0305, + "step": 942 + }, + { + "epoch": 0.3080315217835123, + "grad_norm": 0.5018781423568726, + "learning_rate": 7.982227151216019e-05, + "loss": 1.0618, + "step": 943 + }, + { + "epoch": 0.3083581723898575, + "grad_norm": 0.574617862701416, + "learning_rate": 7.978038197684073e-05, + "loss": 1.1572, + "step": 944 + }, + { + "epoch": 0.3086848229962027, + "grad_norm": 0.6548877358436584, + "learning_rate": 7.973846002186103e-05, + "loss": 1.2569, + "step": 945 + }, + { + "epoch": 0.3090114736025479, + "grad_norm": 0.7577599883079529, + "learning_rate": 7.969650569285839e-05, + "loss": 1.3086, + "step": 946 + }, + { + "epoch": 0.30933812420889306, + "grad_norm": 0.8888007998466492, + "learning_rate": 7.965451903550531e-05, + "loss": 1.1316, + "step": 947 + }, + { + "epoch": 0.30966477481523824, + "grad_norm": 1.0013303756713867, + "learning_rate": 7.961250009550953e-05, + "loss": 1.3751, + "step": 948 + }, + { + "epoch": 0.3099914254215834, + "grad_norm": 1.2923164367675781, + "learning_rate": 7.95704489186139e-05, + "loss": 1.5979, + "step": 949 + }, + { + "epoch": 0.3103180760279286, + "grad_norm": 1.940639853477478, + "learning_rate": 7.952836555059635e-05, + "loss": 2.0671, + "step": 950 + }, + { + "epoch": 0.31064472663427384, + "grad_norm": 0.20804435014724731, + "learning_rate": 7.94862500372699e-05, + "loss": 0.7342, + "step": 951 + }, + { + "epoch": 0.310971377240619, + "grad_norm": 0.22356468439102173, + "learning_rate": 7.944410242448253e-05, + "loss": 0.8373, + "step": 952 + }, + { + "epoch": 0.3112980278469642, + "grad_norm": 0.22745850682258606, + "learning_rate": 7.940192275811717e-05, + "loss": 0.8846, + "step": 953 + }, + { + "epoch": 0.3116246784533094, + "grad_norm": 0.22750817239284515, + "learning_rate": 7.935971108409166e-05, + "loss": 0.8154, + "step": 954 + }, + { + "epoch": 0.31195132905965456, + "grad_norm": 0.26259639859199524, + "learning_rate": 7.931746744835865e-05, + "loss": 0.8305, + "step": 955 + }, + { + "epoch": 0.31227797966599974, + "grad_norm": 0.26343998312950134, + "learning_rate": 7.927519189690562e-05, + "loss": 0.9104, + "step": 956 + }, + { + "epoch": 0.3126046302723449, + "grad_norm": 0.28506144881248474, + "learning_rate": 7.923288447575479e-05, + "loss": 0.8731, + "step": 957 + }, + { + "epoch": 0.31293128087869015, + "grad_norm": 0.2877998948097229, + "learning_rate": 7.919054523096306e-05, + "loss": 0.9688, + "step": 958 + }, + { + "epoch": 0.31325793148503533, + "grad_norm": 0.2984800636768341, + "learning_rate": 7.914817420862196e-05, + "loss": 0.8833, + "step": 959 + }, + { + "epoch": 0.3135845820913805, + "grad_norm": 0.3453596234321594, + "learning_rate": 7.910577145485765e-05, + "loss": 0.8775, + "step": 960 + }, + { + "epoch": 0.3139112326977257, + "grad_norm": 0.3432406187057495, + "learning_rate": 7.906333701583082e-05, + "loss": 0.8728, + "step": 961 + }, + { + "epoch": 0.3142378833040709, + "grad_norm": 0.3565903604030609, + "learning_rate": 7.902087093773663e-05, + "loss": 0.9814, + "step": 962 + }, + { + "epoch": 0.31456453391041606, + "grad_norm": 0.3908556401729584, + "learning_rate": 7.897837326680473e-05, + "loss": 1.0056, + "step": 963 + }, + { + "epoch": 0.31489118451676124, + "grad_norm": 0.392490029335022, + "learning_rate": 7.89358440492991e-05, + "loss": 1.0565, + "step": 964 + }, + { + "epoch": 0.31521783512310647, + "grad_norm": 0.4333183765411377, + "learning_rate": 7.889328333151814e-05, + "loss": 1.0835, + "step": 965 + }, + { + "epoch": 0.31554448572945165, + "grad_norm": 0.5302640199661255, + "learning_rate": 7.885069115979447e-05, + "loss": 1.112, + "step": 966 + }, + { + "epoch": 0.31587113633579683, + "grad_norm": 0.5419654846191406, + "learning_rate": 7.880806758049499e-05, + "loss": 1.166, + "step": 967 + }, + { + "epoch": 0.316197786942142, + "grad_norm": 0.5007505416870117, + "learning_rate": 7.876541264002078e-05, + "loss": 0.9182, + "step": 968 + }, + { + "epoch": 0.3165244375484872, + "grad_norm": 0.6056339144706726, + "learning_rate": 7.872272638480706e-05, + "loss": 1.2492, + "step": 969 + }, + { + "epoch": 0.31685108815483237, + "grad_norm": 0.7134028673171997, + "learning_rate": 7.868000886132316e-05, + "loss": 1.3831, + "step": 970 + }, + { + "epoch": 0.31717773876117755, + "grad_norm": 0.7521294355392456, + "learning_rate": 7.863726011607243e-05, + "loss": 1.4182, + "step": 971 + }, + { + "epoch": 0.3175043893675228, + "grad_norm": 0.9576146602630615, + "learning_rate": 7.859448019559217e-05, + "loss": 1.386, + "step": 972 + }, + { + "epoch": 0.31783103997386797, + "grad_norm": 1.1564505100250244, + "learning_rate": 7.855166914645372e-05, + "loss": 1.4906, + "step": 973 + }, + { + "epoch": 0.31815769058021315, + "grad_norm": 1.5212230682373047, + "learning_rate": 7.850882701526218e-05, + "loss": 1.6107, + "step": 974 + }, + { + "epoch": 0.31848434118655833, + "grad_norm": 2.16402268409729, + "learning_rate": 7.846595384865662e-05, + "loss": 2.8417, + "step": 975 + }, + { + "epoch": 0.3188109917929035, + "grad_norm": 0.20599240064620972, + "learning_rate": 7.84230496933098e-05, + "loss": 0.794, + "step": 976 + }, + { + "epoch": 0.3191376423992487, + "grad_norm": 0.21791720390319824, + "learning_rate": 7.838011459592824e-05, + "loss": 0.8069, + "step": 977 + }, + { + "epoch": 0.31946429300559387, + "grad_norm": 0.24189670383930206, + "learning_rate": 7.833714860325215e-05, + "loss": 0.8253, + "step": 978 + }, + { + "epoch": 0.3197909436119391, + "grad_norm": 0.23616375029087067, + "learning_rate": 7.829415176205539e-05, + "loss": 0.8416, + "step": 979 + }, + { + "epoch": 0.3201175942182843, + "grad_norm": 0.2511419951915741, + "learning_rate": 7.825112411914535e-05, + "loss": 0.8737, + "step": 980 + }, + { + "epoch": 0.32044424482462947, + "grad_norm": 0.24740338325500488, + "learning_rate": 7.820806572136301e-05, + "loss": 0.6864, + "step": 981 + }, + { + "epoch": 0.32077089543097465, + "grad_norm": 0.27360159158706665, + "learning_rate": 7.81649766155828e-05, + "loss": 0.9015, + "step": 982 + }, + { + "epoch": 0.3210975460373198, + "grad_norm": 0.27167263627052307, + "learning_rate": 7.812185684871261e-05, + "loss": 0.8641, + "step": 983 + }, + { + "epoch": 0.321424196643665, + "grad_norm": 0.27509331703186035, + "learning_rate": 7.807870646769364e-05, + "loss": 0.8864, + "step": 984 + }, + { + "epoch": 0.3217508472500102, + "grad_norm": 0.2927476167678833, + "learning_rate": 7.80355255195005e-05, + "loss": 0.8999, + "step": 985 + }, + { + "epoch": 0.3220774978563554, + "grad_norm": 0.30333781242370605, + "learning_rate": 7.799231405114102e-05, + "loss": 0.8984, + "step": 986 + }, + { + "epoch": 0.3224041484627006, + "grad_norm": 0.29891061782836914, + "learning_rate": 7.794907210965627e-05, + "loss": 0.8042, + "step": 987 + }, + { + "epoch": 0.3227307990690458, + "grad_norm": 0.31980669498443604, + "learning_rate": 7.790579974212052e-05, + "loss": 0.9125, + "step": 988 + }, + { + "epoch": 0.32305744967539096, + "grad_norm": 0.3497137427330017, + "learning_rate": 7.78624969956411e-05, + "loss": 1.0623, + "step": 989 + }, + { + "epoch": 0.32338410028173614, + "grad_norm": 0.3825185298919678, + "learning_rate": 7.781916391735847e-05, + "loss": 0.9571, + "step": 990 + }, + { + "epoch": 0.3237107508880813, + "grad_norm": 0.44200360774993896, + "learning_rate": 7.77758005544461e-05, + "loss": 0.9618, + "step": 991 + }, + { + "epoch": 0.3240374014944265, + "grad_norm": 0.4567243754863739, + "learning_rate": 7.773240695411042e-05, + "loss": 1.0149, + "step": 992 + }, + { + "epoch": 0.32436405210077174, + "grad_norm": 0.5161774158477783, + "learning_rate": 7.768898316359076e-05, + "loss": 1.1654, + "step": 993 + }, + { + "epoch": 0.3246907027071169, + "grad_norm": 0.5306985974311829, + "learning_rate": 7.764552923015935e-05, + "loss": 1.1412, + "step": 994 + }, + { + "epoch": 0.3250173533134621, + "grad_norm": 0.6168984174728394, + "learning_rate": 7.76020452011212e-05, + "loss": 1.158, + "step": 995 + }, + { + "epoch": 0.3253440039198073, + "grad_norm": 0.707940936088562, + "learning_rate": 7.755853112381411e-05, + "loss": 1.3226, + "step": 996 + }, + { + "epoch": 0.32567065452615246, + "grad_norm": 0.8782923817634583, + "learning_rate": 7.751498704560858e-05, + "loss": 1.3097, + "step": 997 + }, + { + "epoch": 0.32599730513249764, + "grad_norm": 1.1435363292694092, + "learning_rate": 7.747141301390777e-05, + "loss": 1.5753, + "step": 998 + }, + { + "epoch": 0.3263239557388428, + "grad_norm": 1.199203372001648, + "learning_rate": 7.742780907614742e-05, + "loss": 1.2035, + "step": 999 + }, + { + "epoch": 0.32665060634518805, + "grad_norm": 1.617131233215332, + "learning_rate": 7.73841752797959e-05, + "loss": 1.5258, + "step": 1000 + }, + { + "epoch": 0.32697725695153323, + "grad_norm": 0.21024766564369202, + "learning_rate": 7.734051167235404e-05, + "loss": 0.8054, + "step": 1001 + }, + { + "epoch": 0.3273039075578784, + "grad_norm": 0.23412029445171356, + "learning_rate": 7.729681830135506e-05, + "loss": 0.8206, + "step": 1002 + }, + { + "epoch": 0.3276305581642236, + "grad_norm": 0.26261237263679504, + "learning_rate": 7.725309521436473e-05, + "loss": 0.8588, + "step": 1003 + }, + { + "epoch": 0.3279572087705688, + "grad_norm": 0.2516871690750122, + "learning_rate": 7.720934245898101e-05, + "loss": 0.842, + "step": 1004 + }, + { + "epoch": 0.32828385937691396, + "grad_norm": 0.268623411655426, + "learning_rate": 7.716556008283428e-05, + "loss": 0.7826, + "step": 1005 + }, + { + "epoch": 0.32861050998325914, + "grad_norm": 0.282501220703125, + "learning_rate": 7.712174813358709e-05, + "loss": 0.8606, + "step": 1006 + }, + { + "epoch": 0.32893716058960437, + "grad_norm": 0.2929759621620178, + "learning_rate": 7.707790665893422e-05, + "loss": 0.9098, + "step": 1007 + }, + { + "epoch": 0.32926381119594955, + "grad_norm": 0.30565446615219116, + "learning_rate": 7.703403570660259e-05, + "loss": 0.905, + "step": 1008 + }, + { + "epoch": 0.32959046180229473, + "grad_norm": 0.3160889446735382, + "learning_rate": 7.699013532435119e-05, + "loss": 0.9065, + "step": 1009 + }, + { + "epoch": 0.3299171124086399, + "grad_norm": 0.34107425808906555, + "learning_rate": 7.694620555997107e-05, + "loss": 0.9131, + "step": 1010 + }, + { + "epoch": 0.3302437630149851, + "grad_norm": 0.3465249538421631, + "learning_rate": 7.690224646128526e-05, + "loss": 0.8906, + "step": 1011 + }, + { + "epoch": 0.3305704136213303, + "grad_norm": 0.36077815294265747, + "learning_rate": 7.685825807614872e-05, + "loss": 0.9187, + "step": 1012 + }, + { + "epoch": 0.33089706422767545, + "grad_norm": 0.37028247117996216, + "learning_rate": 7.681424045244829e-05, + "loss": 0.912, + "step": 1013 + }, + { + "epoch": 0.3312237148340207, + "grad_norm": 0.3931543827056885, + "learning_rate": 7.677019363810268e-05, + "loss": 1.0148, + "step": 1014 + }, + { + "epoch": 0.33155036544036587, + "grad_norm": 0.4445292055606842, + "learning_rate": 7.672611768106227e-05, + "loss": 1.015, + "step": 1015 + }, + { + "epoch": 0.33187701604671105, + "grad_norm": 0.49006712436676025, + "learning_rate": 7.668201262930927e-05, + "loss": 0.9695, + "step": 1016 + }, + { + "epoch": 0.33220366665305623, + "grad_norm": 0.5494130253791809, + "learning_rate": 7.663787853085755e-05, + "loss": 1.2143, + "step": 1017 + }, + { + "epoch": 0.3325303172594014, + "grad_norm": 0.5494057536125183, + "learning_rate": 7.659371543375258e-05, + "loss": 1.1659, + "step": 1018 + }, + { + "epoch": 0.3328569678657466, + "grad_norm": 0.5971184372901917, + "learning_rate": 7.654952338607137e-05, + "loss": 1.1892, + "step": 1019 + }, + { + "epoch": 0.33318361847209177, + "grad_norm": 0.7538458704948425, + "learning_rate": 7.650530243592248e-05, + "loss": 1.1604, + "step": 1020 + }, + { + "epoch": 0.33351026907843695, + "grad_norm": 0.9583027362823486, + "learning_rate": 7.646105263144595e-05, + "loss": 1.3371, + "step": 1021 + }, + { + "epoch": 0.3338369196847822, + "grad_norm": 1.0476845502853394, + "learning_rate": 7.64167740208132e-05, + "loss": 1.3216, + "step": 1022 + }, + { + "epoch": 0.33416357029112737, + "grad_norm": 1.0443453788757324, + "learning_rate": 7.637246665222704e-05, + "loss": 1.225, + "step": 1023 + }, + { + "epoch": 0.33449022089747255, + "grad_norm": 1.3026036024093628, + "learning_rate": 7.632813057392151e-05, + "loss": 1.4609, + "step": 1024 + }, + { + "epoch": 0.3348168715038177, + "grad_norm": 1.555985450744629, + "learning_rate": 7.628376583416204e-05, + "loss": 1.7155, + "step": 1025 + }, + { + "epoch": 0.3351435221101629, + "grad_norm": 0.19100616872310638, + "learning_rate": 7.623937248124513e-05, + "loss": 0.7413, + "step": 1026 + }, + { + "epoch": 0.3354701727165081, + "grad_norm": 0.22931453585624695, + "learning_rate": 7.619495056349849e-05, + "loss": 0.894, + "step": 1027 + }, + { + "epoch": 0.33579682332285327, + "grad_norm": 0.2343945950269699, + "learning_rate": 7.615050012928092e-05, + "loss": 0.8266, + "step": 1028 + }, + { + "epoch": 0.3361234739291985, + "grad_norm": 0.27221325039863586, + "learning_rate": 7.610602122698227e-05, + "loss": 0.8433, + "step": 1029 + }, + { + "epoch": 0.3364501245355437, + "grad_norm": 0.2684449553489685, + "learning_rate": 7.606151390502337e-05, + "loss": 0.7658, + "step": 1030 + }, + { + "epoch": 0.33677677514188886, + "grad_norm": 0.2658660411834717, + "learning_rate": 7.6016978211856e-05, + "loss": 0.8912, + "step": 1031 + }, + { + "epoch": 0.33710342574823404, + "grad_norm": 0.27399852871894836, + "learning_rate": 7.597241419596279e-05, + "loss": 0.8434, + "step": 1032 + }, + { + "epoch": 0.3374300763545792, + "grad_norm": 0.27935051918029785, + "learning_rate": 7.592782190585725e-05, + "loss": 0.8468, + "step": 1033 + }, + { + "epoch": 0.3377567269609244, + "grad_norm": 0.3147827684879303, + "learning_rate": 7.588320139008365e-05, + "loss": 0.8781, + "step": 1034 + }, + { + "epoch": 0.3380833775672696, + "grad_norm": 0.31882667541503906, + "learning_rate": 7.583855269721697e-05, + "loss": 0.9354, + "step": 1035 + }, + { + "epoch": 0.3384100281736148, + "grad_norm": 0.3448106348514557, + "learning_rate": 7.579387587586292e-05, + "loss": 0.8532, + "step": 1036 + }, + { + "epoch": 0.33873667877996, + "grad_norm": 0.3815080225467682, + "learning_rate": 7.574917097465774e-05, + "loss": 0.945, + "step": 1037 + }, + { + "epoch": 0.3390633293863052, + "grad_norm": 0.3913522958755493, + "learning_rate": 7.570443804226833e-05, + "loss": 0.8925, + "step": 1038 + }, + { + "epoch": 0.33938997999265036, + "grad_norm": 0.42671769857406616, + "learning_rate": 7.565967712739205e-05, + "loss": 1.0106, + "step": 1039 + }, + { + "epoch": 0.33971663059899554, + "grad_norm": 0.4863613247871399, + "learning_rate": 7.561488827875675e-05, + "loss": 1.0284, + "step": 1040 + }, + { + "epoch": 0.3400432812053407, + "grad_norm": 0.507326066493988, + "learning_rate": 7.557007154512065e-05, + "loss": 1.0476, + "step": 1041 + }, + { + "epoch": 0.3403699318116859, + "grad_norm": 0.5292708277702332, + "learning_rate": 7.55252269752724e-05, + "loss": 1.0151, + "step": 1042 + }, + { + "epoch": 0.34069658241803114, + "grad_norm": 0.5429087281227112, + "learning_rate": 7.548035461803087e-05, + "loss": 1.0862, + "step": 1043 + }, + { + "epoch": 0.3410232330243763, + "grad_norm": 0.6534260511398315, + "learning_rate": 7.543545452224523e-05, + "loss": 1.1288, + "step": 1044 + }, + { + "epoch": 0.3413498836307215, + "grad_norm": 0.8366156816482544, + "learning_rate": 7.539052673679483e-05, + "loss": 1.5115, + "step": 1045 + }, + { + "epoch": 0.3416765342370667, + "grad_norm": 0.8968381881713867, + "learning_rate": 7.534557131058917e-05, + "loss": 1.1607, + "step": 1046 + }, + { + "epoch": 0.34200318484341186, + "grad_norm": 1.1521880626678467, + "learning_rate": 7.530058829256785e-05, + "loss": 1.2325, + "step": 1047 + }, + { + "epoch": 0.34232983544975704, + "grad_norm": 1.234046459197998, + "learning_rate": 7.525557773170048e-05, + "loss": 1.6835, + "step": 1048 + }, + { + "epoch": 0.3426564860561022, + "grad_norm": 2.0295400619506836, + "learning_rate": 7.521053967698669e-05, + "loss": 2.3934, + "step": 1049 + }, + { + "epoch": 0.34298313666244745, + "grad_norm": 2.4439151287078857, + "learning_rate": 7.516547417745598e-05, + "loss": 2.534, + "step": 1050 + }, + { + "epoch": 0.34330978726879263, + "grad_norm": 0.20437824726104736, + "learning_rate": 7.512038128216782e-05, + "loss": 0.8084, + "step": 1051 + }, + { + "epoch": 0.3436364378751378, + "grad_norm": 0.22019585967063904, + "learning_rate": 7.507526104021141e-05, + "loss": 0.7823, + "step": 1052 + }, + { + "epoch": 0.343963088481483, + "grad_norm": 0.24627773463726044, + "learning_rate": 7.503011350070579e-05, + "loss": 0.87, + "step": 1053 + }, + { + "epoch": 0.3442897390878282, + "grad_norm": 0.2620931565761566, + "learning_rate": 7.498493871279967e-05, + "loss": 0.8776, + "step": 1054 + }, + { + "epoch": 0.34461638969417335, + "grad_norm": 0.2705698609352112, + "learning_rate": 7.493973672567144e-05, + "loss": 0.8141, + "step": 1055 + }, + { + "epoch": 0.34494304030051853, + "grad_norm": 0.2702180743217468, + "learning_rate": 7.48945075885291e-05, + "loss": 0.871, + "step": 1056 + }, + { + "epoch": 0.34526969090686377, + "grad_norm": 0.29457515478134155, + "learning_rate": 7.484925135061022e-05, + "loss": 0.9953, + "step": 1057 + }, + { + "epoch": 0.34559634151320895, + "grad_norm": 0.27578428387641907, + "learning_rate": 7.480396806118186e-05, + "loss": 0.8458, + "step": 1058 + }, + { + "epoch": 0.34592299211955413, + "grad_norm": 0.3459113836288452, + "learning_rate": 7.475865776954051e-05, + "loss": 0.9171, + "step": 1059 + }, + { + "epoch": 0.3462496427258993, + "grad_norm": 0.300814151763916, + "learning_rate": 7.47133205250121e-05, + "loss": 0.9346, + "step": 1060 + }, + { + "epoch": 0.3465762933322445, + "grad_norm": 0.31863105297088623, + "learning_rate": 7.466795637695184e-05, + "loss": 0.9133, + "step": 1061 + }, + { + "epoch": 0.34690294393858967, + "grad_norm": 0.3595605790615082, + "learning_rate": 7.462256537474429e-05, + "loss": 0.9152, + "step": 1062 + }, + { + "epoch": 0.34722959454493485, + "grad_norm": 0.3383408784866333, + "learning_rate": 7.457714756780322e-05, + "loss": 0.8437, + "step": 1063 + }, + { + "epoch": 0.3475562451512801, + "grad_norm": 0.3689708709716797, + "learning_rate": 7.453170300557156e-05, + "loss": 1.0964, + "step": 1064 + }, + { + "epoch": 0.34788289575762527, + "grad_norm": 0.3869151175022125, + "learning_rate": 7.448623173752139e-05, + "loss": 1.0209, + "step": 1065 + }, + { + "epoch": 0.34820954636397045, + "grad_norm": 0.4113345146179199, + "learning_rate": 7.444073381315388e-05, + "loss": 1.0003, + "step": 1066 + }, + { + "epoch": 0.3485361969703156, + "grad_norm": 0.4470776915550232, + "learning_rate": 7.439520928199917e-05, + "loss": 1.0222, + "step": 1067 + }, + { + "epoch": 0.3488628475766608, + "grad_norm": 0.4541364908218384, + "learning_rate": 7.434965819361638e-05, + "loss": 0.9672, + "step": 1068 + }, + { + "epoch": 0.349189498183006, + "grad_norm": 0.4776630103588104, + "learning_rate": 7.430408059759357e-05, + "loss": 1.1079, + "step": 1069 + }, + { + "epoch": 0.34951614878935117, + "grad_norm": 0.5205119252204895, + "learning_rate": 7.425847654354764e-05, + "loss": 0.9712, + "step": 1070 + }, + { + "epoch": 0.3498427993956964, + "grad_norm": 0.58984375, + "learning_rate": 7.421284608112431e-05, + "loss": 1.2611, + "step": 1071 + }, + { + "epoch": 0.3501694500020416, + "grad_norm": 0.8614017367362976, + "learning_rate": 7.416718925999797e-05, + "loss": 1.2577, + "step": 1072 + }, + { + "epoch": 0.35049610060838676, + "grad_norm": 0.9795945286750793, + "learning_rate": 7.412150612987182e-05, + "loss": 1.2588, + "step": 1073 + }, + { + "epoch": 0.35082275121473194, + "grad_norm": 1.1379746198654175, + "learning_rate": 7.407579674047763e-05, + "loss": 1.1972, + "step": 1074 + }, + { + "epoch": 0.3511494018210771, + "grad_norm": 1.3634546995162964, + "learning_rate": 7.403006114157575e-05, + "loss": 1.9137, + "step": 1075 + }, + { + "epoch": 0.3514760524274223, + "grad_norm": 0.19136269390583038, + "learning_rate": 7.398429938295511e-05, + "loss": 0.6746, + "step": 1076 + }, + { + "epoch": 0.3518027030337675, + "grad_norm": 0.25908541679382324, + "learning_rate": 7.393851151443307e-05, + "loss": 0.8532, + "step": 1077 + }, + { + "epoch": 0.3521293536401127, + "grad_norm": 0.2595674991607666, + "learning_rate": 7.389269758585546e-05, + "loss": 0.8023, + "step": 1078 + }, + { + "epoch": 0.3524560042464579, + "grad_norm": 0.27125805616378784, + "learning_rate": 7.384685764709645e-05, + "loss": 0.8122, + "step": 1079 + }, + { + "epoch": 0.3527826548528031, + "grad_norm": 0.282625675201416, + "learning_rate": 7.380099174805852e-05, + "loss": 0.878, + "step": 1080 + }, + { + "epoch": 0.35310930545914826, + "grad_norm": 0.28404882550239563, + "learning_rate": 7.375509993867242e-05, + "loss": 0.7957, + "step": 1081 + }, + { + "epoch": 0.35343595606549344, + "grad_norm": 0.3119284212589264, + "learning_rate": 7.370918226889713e-05, + "loss": 0.8864, + "step": 1082 + }, + { + "epoch": 0.3537626066718386, + "grad_norm": 0.3108453154563904, + "learning_rate": 7.366323878871973e-05, + "loss": 0.9002, + "step": 1083 + }, + { + "epoch": 0.3540892572781838, + "grad_norm": 0.3398928642272949, + "learning_rate": 7.361726954815547e-05, + "loss": 0.8979, + "step": 1084 + }, + { + "epoch": 0.35441590788452904, + "grad_norm": 0.33085235953330994, + "learning_rate": 7.357127459724755e-05, + "loss": 0.9091, + "step": 1085 + }, + { + "epoch": 0.3547425584908742, + "grad_norm": 0.39169177412986755, + "learning_rate": 7.352525398606724e-05, + "loss": 0.8724, + "step": 1086 + }, + { + "epoch": 0.3550692090972194, + "grad_norm": 0.3578439950942993, + "learning_rate": 7.347920776471374e-05, + "loss": 0.8364, + "step": 1087 + }, + { + "epoch": 0.3553958597035646, + "grad_norm": 0.34978634119033813, + "learning_rate": 7.343313598331406e-05, + "loss": 0.8644, + "step": 1088 + }, + { + "epoch": 0.35572251030990976, + "grad_norm": 0.40582436323165894, + "learning_rate": 7.33870386920231e-05, + "loss": 1.0534, + "step": 1089 + }, + { + "epoch": 0.35604916091625494, + "grad_norm": 0.4203055202960968, + "learning_rate": 7.33409159410235e-05, + "loss": 0.9638, + "step": 1090 + }, + { + "epoch": 0.3563758115226001, + "grad_norm": 0.45389387011528015, + "learning_rate": 7.329476778052565e-05, + "loss": 1.0034, + "step": 1091 + }, + { + "epoch": 0.35670246212894535, + "grad_norm": 0.47722360491752625, + "learning_rate": 7.324859426076756e-05, + "loss": 0.9551, + "step": 1092 + }, + { + "epoch": 0.35702911273529053, + "grad_norm": 0.4822474718093872, + "learning_rate": 7.320239543201489e-05, + "loss": 1.0724, + "step": 1093 + }, + { + "epoch": 0.3573557633416357, + "grad_norm": 0.559536337852478, + "learning_rate": 7.315617134456079e-05, + "loss": 1.2312, + "step": 1094 + }, + { + "epoch": 0.3576824139479809, + "grad_norm": 0.5810602307319641, + "learning_rate": 7.310992204872595e-05, + "loss": 0.9705, + "step": 1095 + }, + { + "epoch": 0.3580090645543261, + "grad_norm": 0.7775752544403076, + "learning_rate": 7.306364759485853e-05, + "loss": 1.3561, + "step": 1096 + }, + { + "epoch": 0.35833571516067125, + "grad_norm": 0.8379966020584106, + "learning_rate": 7.301734803333403e-05, + "loss": 1.2233, + "step": 1097 + }, + { + "epoch": 0.35866236576701643, + "grad_norm": 0.927143931388855, + "learning_rate": 7.297102341455528e-05, + "loss": 1.3993, + "step": 1098 + }, + { + "epoch": 0.35898901637336167, + "grad_norm": 1.2338497638702393, + "learning_rate": 7.292467378895243e-05, + "loss": 1.2431, + "step": 1099 + }, + { + "epoch": 0.35931566697970685, + "grad_norm": 1.6832832098007202, + "learning_rate": 7.28782992069828e-05, + "loss": 1.5705, + "step": 1100 + }, + { + "epoch": 0.35964231758605203, + "grad_norm": 0.2085774838924408, + "learning_rate": 7.283189971913094e-05, + "loss": 0.8405, + "step": 1101 + }, + { + "epoch": 0.3599689681923972, + "grad_norm": 0.23439809679985046, + "learning_rate": 7.278547537590845e-05, + "loss": 0.7714, + "step": 1102 + }, + { + "epoch": 0.3602956187987424, + "grad_norm": 0.25842559337615967, + "learning_rate": 7.273902622785405e-05, + "loss": 0.8394, + "step": 1103 + }, + { + "epoch": 0.36062226940508757, + "grad_norm": 0.2644081115722656, + "learning_rate": 7.269255232553339e-05, + "loss": 0.8266, + "step": 1104 + }, + { + "epoch": 0.36094892001143275, + "grad_norm": 0.26839661598205566, + "learning_rate": 7.264605371953915e-05, + "loss": 0.8877, + "step": 1105 + }, + { + "epoch": 0.361275570617778, + "grad_norm": 0.2822781503200531, + "learning_rate": 7.259953046049084e-05, + "loss": 0.9052, + "step": 1106 + }, + { + "epoch": 0.36160222122412317, + "grad_norm": 0.30244433879852295, + "learning_rate": 7.255298259903482e-05, + "loss": 0.9289, + "step": 1107 + }, + { + "epoch": 0.36192887183046835, + "grad_norm": 0.29979464411735535, + "learning_rate": 7.250641018584428e-05, + "loss": 0.9039, + "step": 1108 + }, + { + "epoch": 0.3622555224368135, + "grad_norm": 0.31162193417549133, + "learning_rate": 7.245981327161905e-05, + "loss": 0.968, + "step": 1109 + }, + { + "epoch": 0.3625821730431587, + "grad_norm": 0.32680559158325195, + "learning_rate": 7.241319190708575e-05, + "loss": 0.8592, + "step": 1110 + }, + { + "epoch": 0.3629088236495039, + "grad_norm": 0.332279235124588, + "learning_rate": 7.236654614299748e-05, + "loss": 0.8753, + "step": 1111 + }, + { + "epoch": 0.36323547425584907, + "grad_norm": 0.36247286200523376, + "learning_rate": 7.231987603013401e-05, + "loss": 0.9475, + "step": 1112 + }, + { + "epoch": 0.3635621248621943, + "grad_norm": 0.3807973563671112, + "learning_rate": 7.227318161930157e-05, + "loss": 0.9499, + "step": 1113 + }, + { + "epoch": 0.3638887754685395, + "grad_norm": 0.40924564003944397, + "learning_rate": 7.222646296133287e-05, + "loss": 0.9879, + "step": 1114 + }, + { + "epoch": 0.36421542607488466, + "grad_norm": 0.43880122900009155, + "learning_rate": 7.217972010708696e-05, + "loss": 0.9948, + "step": 1115 + }, + { + "epoch": 0.36454207668122984, + "grad_norm": 0.4344307780265808, + "learning_rate": 7.213295310744928e-05, + "loss": 1.0174, + "step": 1116 + }, + { + "epoch": 0.364868727287575, + "grad_norm": 0.4451688528060913, + "learning_rate": 7.208616201333156e-05, + "loss": 0.9336, + "step": 1117 + }, + { + "epoch": 0.3651953778939202, + "grad_norm": 0.5209891200065613, + "learning_rate": 7.203934687567173e-05, + "loss": 1.0403, + "step": 1118 + }, + { + "epoch": 0.3655220285002654, + "grad_norm": 0.5560318231582642, + "learning_rate": 7.199250774543391e-05, + "loss": 1.0855, + "step": 1119 + }, + { + "epoch": 0.3658486791066106, + "grad_norm": 0.6434129476547241, + "learning_rate": 7.194564467360834e-05, + "loss": 1.1184, + "step": 1120 + }, + { + "epoch": 0.3661753297129558, + "grad_norm": 0.8035168051719666, + "learning_rate": 7.189875771121129e-05, + "loss": 1.1999, + "step": 1121 + }, + { + "epoch": 0.366501980319301, + "grad_norm": 0.9151061177253723, + "learning_rate": 7.18518469092851e-05, + "loss": 1.2509, + "step": 1122 + }, + { + "epoch": 0.36682863092564616, + "grad_norm": 1.119385004043579, + "learning_rate": 7.180491231889802e-05, + "loss": 1.6826, + "step": 1123 + }, + { + "epoch": 0.36715528153199134, + "grad_norm": 1.2659269571304321, + "learning_rate": 7.17579539911442e-05, + "loss": 1.6055, + "step": 1124 + }, + { + "epoch": 0.3674819321383365, + "grad_norm": 1.693357229232788, + "learning_rate": 7.171097197714363e-05, + "loss": 1.8833, + "step": 1125 + }, + { + "epoch": 0.3678085827446817, + "grad_norm": 0.19237908720970154, + "learning_rate": 7.166396632804212e-05, + "loss": 0.646, + "step": 1126 + }, + { + "epoch": 0.3681352333510269, + "grad_norm": 0.25181078910827637, + "learning_rate": 7.161693709501114e-05, + "loss": 0.8216, + "step": 1127 + }, + { + "epoch": 0.3684618839573721, + "grad_norm": 0.26569420099258423, + "learning_rate": 7.156988432924791e-05, + "loss": 0.8657, + "step": 1128 + }, + { + "epoch": 0.3687885345637173, + "grad_norm": 0.2794082462787628, + "learning_rate": 7.152280808197522e-05, + "loss": 0.7595, + "step": 1129 + }, + { + "epoch": 0.3691151851700625, + "grad_norm": 0.2683536112308502, + "learning_rate": 7.147570840444145e-05, + "loss": 0.8327, + "step": 1130 + }, + { + "epoch": 0.36944183577640766, + "grad_norm": 0.27695196866989136, + "learning_rate": 7.142858534792045e-05, + "loss": 0.8344, + "step": 1131 + }, + { + "epoch": 0.36976848638275284, + "grad_norm": 0.28606536984443665, + "learning_rate": 7.138143896371157e-05, + "loss": 0.8636, + "step": 1132 + }, + { + "epoch": 0.370095136989098, + "grad_norm": 0.2818695902824402, + "learning_rate": 7.133426930313951e-05, + "loss": 0.8219, + "step": 1133 + }, + { + "epoch": 0.3704217875954432, + "grad_norm": 0.3215191960334778, + "learning_rate": 7.128707641755434e-05, + "loss": 0.9497, + "step": 1134 + }, + { + "epoch": 0.37074843820178843, + "grad_norm": 0.3110451400279999, + "learning_rate": 7.123986035833141e-05, + "loss": 0.8942, + "step": 1135 + }, + { + "epoch": 0.3710750888081336, + "grad_norm": 0.31207698583602905, + "learning_rate": 7.119262117687127e-05, + "loss": 0.9575, + "step": 1136 + }, + { + "epoch": 0.3714017394144788, + "grad_norm": 0.314654678106308, + "learning_rate": 7.114535892459967e-05, + "loss": 0.8195, + "step": 1137 + }, + { + "epoch": 0.371728390020824, + "grad_norm": 0.34628891944885254, + "learning_rate": 7.109807365296748e-05, + "loss": 0.8786, + "step": 1138 + }, + { + "epoch": 0.37205504062716915, + "grad_norm": 0.3640112578868866, + "learning_rate": 7.105076541345058e-05, + "loss": 0.9262, + "step": 1139 + }, + { + "epoch": 0.37238169123351433, + "grad_norm": 0.4010256230831146, + "learning_rate": 7.100343425754993e-05, + "loss": 0.9402, + "step": 1140 + }, + { + "epoch": 0.3727083418398595, + "grad_norm": 0.4014125466346741, + "learning_rate": 7.095608023679138e-05, + "loss": 1.0435, + "step": 1141 + }, + { + "epoch": 0.37303499244620475, + "grad_norm": 0.46626025438308716, + "learning_rate": 7.090870340272568e-05, + "loss": 0.992, + "step": 1142 + }, + { + "epoch": 0.37336164305254993, + "grad_norm": 0.5319213271141052, + "learning_rate": 7.086130380692841e-05, + "loss": 1.1192, + "step": 1143 + }, + { + "epoch": 0.3736882936588951, + "grad_norm": 0.5756850838661194, + "learning_rate": 7.081388150099999e-05, + "loss": 1.1887, + "step": 1144 + }, + { + "epoch": 0.3740149442652403, + "grad_norm": 0.6586177349090576, + "learning_rate": 7.076643653656549e-05, + "loss": 1.2207, + "step": 1145 + }, + { + "epoch": 0.37434159487158547, + "grad_norm": 0.6644164323806763, + "learning_rate": 7.071896896527464e-05, + "loss": 1.194, + "step": 1146 + }, + { + "epoch": 0.37466824547793065, + "grad_norm": 0.9249393939971924, + "learning_rate": 7.067147883880185e-05, + "loss": 1.3419, + "step": 1147 + }, + { + "epoch": 0.37499489608427583, + "grad_norm": 1.1207913160324097, + "learning_rate": 7.062396620884605e-05, + "loss": 1.5119, + "step": 1148 + }, + { + "epoch": 0.37532154669062107, + "grad_norm": 1.3481462001800537, + "learning_rate": 7.057643112713063e-05, + "loss": 1.2274, + "step": 1149 + }, + { + "epoch": 0.37564819729696625, + "grad_norm": 1.6863646507263184, + "learning_rate": 7.05288736454035e-05, + "loss": 2.1351, + "step": 1150 + }, + { + "epoch": 0.3759748479033114, + "grad_norm": 0.19038569927215576, + "learning_rate": 7.048129381543687e-05, + "loss": 0.6988, + "step": 1151 + }, + { + "epoch": 0.3763014985096566, + "grad_norm": 0.2359238564968109, + "learning_rate": 7.043369168902732e-05, + "loss": 0.8121, + "step": 1152 + }, + { + "epoch": 0.3766281491160018, + "grad_norm": 0.25924691557884216, + "learning_rate": 7.038606731799574e-05, + "loss": 0.8618, + "step": 1153 + }, + { + "epoch": 0.37695479972234697, + "grad_norm": 0.24869892001152039, + "learning_rate": 7.033842075418718e-05, + "loss": 0.8703, + "step": 1154 + }, + { + "epoch": 0.37728145032869215, + "grad_norm": 0.2840399742126465, + "learning_rate": 7.029075204947085e-05, + "loss": 0.7961, + "step": 1155 + }, + { + "epoch": 0.3776081009350374, + "grad_norm": 0.2748403549194336, + "learning_rate": 7.024306125574009e-05, + "loss": 0.9051, + "step": 1156 + }, + { + "epoch": 0.37793475154138256, + "grad_norm": 0.284496009349823, + "learning_rate": 7.019534842491228e-05, + "loss": 0.8884, + "step": 1157 + }, + { + "epoch": 0.37826140214772774, + "grad_norm": 0.3085760176181793, + "learning_rate": 7.014761360892882e-05, + "loss": 0.9337, + "step": 1158 + }, + { + "epoch": 0.3785880527540729, + "grad_norm": 0.3391752243041992, + "learning_rate": 7.009985685975495e-05, + "loss": 0.9858, + "step": 1159 + }, + { + "epoch": 0.3789147033604181, + "grad_norm": 0.3305496871471405, + "learning_rate": 7.005207822937988e-05, + "loss": 0.8196, + "step": 1160 + }, + { + "epoch": 0.3792413539667633, + "grad_norm": 0.34923458099365234, + "learning_rate": 7.00042777698166e-05, + "loss": 0.908, + "step": 1161 + }, + { + "epoch": 0.37956800457310846, + "grad_norm": 0.41269630193710327, + "learning_rate": 6.99564555331019e-05, + "loss": 0.9519, + "step": 1162 + }, + { + "epoch": 0.3798946551794537, + "grad_norm": 0.3687300682067871, + "learning_rate": 6.990861157129622e-05, + "loss": 1.0597, + "step": 1163 + }, + { + "epoch": 0.3802213057857989, + "grad_norm": 0.40411537885665894, + "learning_rate": 6.986074593648367e-05, + "loss": 0.9359, + "step": 1164 + }, + { + "epoch": 0.38054795639214406, + "grad_norm": 0.4321344792842865, + "learning_rate": 6.981285868077198e-05, + "loss": 1.0351, + "step": 1165 + }, + { + "epoch": 0.38087460699848924, + "grad_norm": 0.4461156725883484, + "learning_rate": 6.976494985629242e-05, + "loss": 1.0586, + "step": 1166 + }, + { + "epoch": 0.3812012576048344, + "grad_norm": 0.5063657164573669, + "learning_rate": 6.971701951519972e-05, + "loss": 1.163, + "step": 1167 + }, + { + "epoch": 0.3815279082111796, + "grad_norm": 0.5494873523712158, + "learning_rate": 6.966906770967199e-05, + "loss": 1.1235, + "step": 1168 + }, + { + "epoch": 0.3818545588175248, + "grad_norm": 0.6220507621765137, + "learning_rate": 6.962109449191077e-05, + "loss": 1.2436, + "step": 1169 + }, + { + "epoch": 0.38218120942387, + "grad_norm": 0.6774858832359314, + "learning_rate": 6.957309991414092e-05, + "loss": 1.3258, + "step": 1170 + }, + { + "epoch": 0.3825078600302152, + "grad_norm": 0.8799903392791748, + "learning_rate": 6.952508402861051e-05, + "loss": 1.4338, + "step": 1171 + }, + { + "epoch": 0.3828345106365604, + "grad_norm": 1.1180598735809326, + "learning_rate": 6.94770468875908e-05, + "loss": 1.8223, + "step": 1172 + }, + { + "epoch": 0.38316116124290556, + "grad_norm": 1.154464602470398, + "learning_rate": 6.942898854337621e-05, + "loss": 1.3532, + "step": 1173 + }, + { + "epoch": 0.38348781184925074, + "grad_norm": 1.271112084388733, + "learning_rate": 6.938090904828428e-05, + "loss": 1.5536, + "step": 1174 + }, + { + "epoch": 0.3838144624555959, + "grad_norm": 1.7882853746414185, + "learning_rate": 6.933280845465551e-05, + "loss": 1.9543, + "step": 1175 + }, + { + "epoch": 0.3841411130619411, + "grad_norm": 0.1903381198644638, + "learning_rate": 6.92846868148534e-05, + "loss": 0.6825, + "step": 1176 + }, + { + "epoch": 0.38446776366828633, + "grad_norm": 0.21171070635318756, + "learning_rate": 6.923654418126434e-05, + "loss": 0.7353, + "step": 1177 + }, + { + "epoch": 0.3847944142746315, + "grad_norm": 0.22646358609199524, + "learning_rate": 6.918838060629762e-05, + "loss": 0.7659, + "step": 1178 + }, + { + "epoch": 0.3851210648809767, + "grad_norm": 0.2556747496128082, + "learning_rate": 6.914019614238527e-05, + "loss": 0.8941, + "step": 1179 + }, + { + "epoch": 0.3854477154873219, + "grad_norm": 0.278025358915329, + "learning_rate": 6.909199084198212e-05, + "loss": 0.8642, + "step": 1180 + }, + { + "epoch": 0.38577436609366705, + "grad_norm": 0.2806093692779541, + "learning_rate": 6.904376475756563e-05, + "loss": 0.8426, + "step": 1181 + }, + { + "epoch": 0.38610101670001223, + "grad_norm": 0.3061642646789551, + "learning_rate": 6.899551794163592e-05, + "loss": 0.9506, + "step": 1182 + }, + { + "epoch": 0.3864276673063574, + "grad_norm": 0.31807222962379456, + "learning_rate": 6.894725044671566e-05, + "loss": 0.896, + "step": 1183 + }, + { + "epoch": 0.38675431791270265, + "grad_norm": 0.330069899559021, + "learning_rate": 6.889896232535004e-05, + "loss": 0.9255, + "step": 1184 + }, + { + "epoch": 0.38708096851904783, + "grad_norm": 0.3431348502635956, + "learning_rate": 6.885065363010671e-05, + "loss": 0.9511, + "step": 1185 + }, + { + "epoch": 0.387407619125393, + "grad_norm": 0.3761681020259857, + "learning_rate": 6.88023244135757e-05, + "loss": 0.9307, + "step": 1186 + }, + { + "epoch": 0.3877342697317382, + "grad_norm": 0.352243572473526, + "learning_rate": 6.875397472836937e-05, + "loss": 0.9937, + "step": 1187 + }, + { + "epoch": 0.38806092033808337, + "grad_norm": 0.38508570194244385, + "learning_rate": 6.870560462712243e-05, + "loss": 0.95, + "step": 1188 + }, + { + "epoch": 0.38838757094442855, + "grad_norm": 0.41849932074546814, + "learning_rate": 6.865721416249175e-05, + "loss": 1.0807, + "step": 1189 + }, + { + "epoch": 0.38871422155077373, + "grad_norm": 0.4687519669532776, + "learning_rate": 6.860880338715638e-05, + "loss": 1.0743, + "step": 1190 + }, + { + "epoch": 0.38904087215711897, + "grad_norm": 0.4491303563117981, + "learning_rate": 6.856037235381746e-05, + "loss": 1.1039, + "step": 1191 + }, + { + "epoch": 0.38936752276346415, + "grad_norm": 0.5182649493217468, + "learning_rate": 6.851192111519826e-05, + "loss": 0.9833, + "step": 1192 + }, + { + "epoch": 0.3896941733698093, + "grad_norm": 0.5373132824897766, + "learning_rate": 6.846344972404399e-05, + "loss": 1.1743, + "step": 1193 + }, + { + "epoch": 0.3900208239761545, + "grad_norm": 0.5523989796638489, + "learning_rate": 6.841495823312177e-05, + "loss": 1.1548, + "step": 1194 + }, + { + "epoch": 0.3903474745824997, + "grad_norm": 0.6846272349357605, + "learning_rate": 6.836644669522065e-05, + "loss": 1.2027, + "step": 1195 + }, + { + "epoch": 0.39067412518884487, + "grad_norm": 0.7508804202079773, + "learning_rate": 6.831791516315151e-05, + "loss": 1.2353, + "step": 1196 + }, + { + "epoch": 0.39100077579519005, + "grad_norm": 1.0120075941085815, + "learning_rate": 6.826936368974696e-05, + "loss": 1.6322, + "step": 1197 + }, + { + "epoch": 0.3913274264015353, + "grad_norm": 1.1982024908065796, + "learning_rate": 6.822079232786134e-05, + "loss": 1.674, + "step": 1198 + }, + { + "epoch": 0.39165407700788046, + "grad_norm": 1.4332988262176514, + "learning_rate": 6.817220113037062e-05, + "loss": 1.2769, + "step": 1199 + }, + { + "epoch": 0.39198072761422564, + "grad_norm": 1.8049565553665161, + "learning_rate": 6.81235901501724e-05, + "loss": 1.7269, + "step": 1200 + }, + { + "epoch": 0.3923073782205708, + "grad_norm": 0.20991112291812897, + "learning_rate": 6.807495944018577e-05, + "loss": 0.7704, + "step": 1201 + }, + { + "epoch": 0.392634028826916, + "grad_norm": 0.23148652911186218, + "learning_rate": 6.802630905335137e-05, + "loss": 0.7713, + "step": 1202 + }, + { + "epoch": 0.3929606794332612, + "grad_norm": 0.24124076962471008, + "learning_rate": 6.797763904263115e-05, + "loss": 0.8331, + "step": 1203 + }, + { + "epoch": 0.39328733003960636, + "grad_norm": 0.2620198130607605, + "learning_rate": 6.792894946100854e-05, + "loss": 0.8113, + "step": 1204 + }, + { + "epoch": 0.3936139806459516, + "grad_norm": 0.26299262046813965, + "learning_rate": 6.788024036148821e-05, + "loss": 0.8568, + "step": 1205 + }, + { + "epoch": 0.3939406312522968, + "grad_norm": 0.27623122930526733, + "learning_rate": 6.783151179709609e-05, + "loss": 0.8808, + "step": 1206 + }, + { + "epoch": 0.39426728185864196, + "grad_norm": 0.3048432171344757, + "learning_rate": 6.778276382087926e-05, + "loss": 0.9913, + "step": 1207 + }, + { + "epoch": 0.39459393246498714, + "grad_norm": 0.3026992678642273, + "learning_rate": 6.773399648590602e-05, + "loss": 0.9271, + "step": 1208 + }, + { + "epoch": 0.3949205830713323, + "grad_norm": 0.3055267632007599, + "learning_rate": 6.768520984526569e-05, + "loss": 0.8672, + "step": 1209 + }, + { + "epoch": 0.3952472336776775, + "grad_norm": 0.2968289256095886, + "learning_rate": 6.76364039520686e-05, + "loss": 0.7734, + "step": 1210 + }, + { + "epoch": 0.3955738842840227, + "grad_norm": 0.32888948917388916, + "learning_rate": 6.758757885944608e-05, + "loss": 0.9411, + "step": 1211 + }, + { + "epoch": 0.3959005348903679, + "grad_norm": 0.334842711687088, + "learning_rate": 6.75387346205503e-05, + "loss": 0.9304, + "step": 1212 + }, + { + "epoch": 0.3962271854967131, + "grad_norm": 0.3538110852241516, + "learning_rate": 6.74898712885543e-05, + "loss": 1.0136, + "step": 1213 + }, + { + "epoch": 0.3965538361030583, + "grad_norm": 0.4119766652584076, + "learning_rate": 6.744098891665194e-05, + "loss": 0.9688, + "step": 1214 + }, + { + "epoch": 0.39688048670940346, + "grad_norm": 0.40925925970077515, + "learning_rate": 6.739208755805778e-05, + "loss": 0.9954, + "step": 1215 + }, + { + "epoch": 0.39720713731574864, + "grad_norm": 0.4552319049835205, + "learning_rate": 6.734316726600702e-05, + "loss": 0.96, + "step": 1216 + }, + { + "epoch": 0.3975337879220938, + "grad_norm": 0.45739418268203735, + "learning_rate": 6.729422809375551e-05, + "loss": 0.9417, + "step": 1217 + }, + { + "epoch": 0.397860438528439, + "grad_norm": 0.5212276577949524, + "learning_rate": 6.724527009457966e-05, + "loss": 1.0401, + "step": 1218 + }, + { + "epoch": 0.39818708913478423, + "grad_norm": 0.5565895438194275, + "learning_rate": 6.719629332177634e-05, + "loss": 1.1682, + "step": 1219 + }, + { + "epoch": 0.3985137397411294, + "grad_norm": 0.6466187834739685, + "learning_rate": 6.714729782866291e-05, + "loss": 1.2577, + "step": 1220 + }, + { + "epoch": 0.3988403903474746, + "grad_norm": 0.7221575975418091, + "learning_rate": 6.709828366857702e-05, + "loss": 1.2678, + "step": 1221 + }, + { + "epoch": 0.3991670409538198, + "grad_norm": 0.9043142199516296, + "learning_rate": 6.704925089487675e-05, + "loss": 1.3762, + "step": 1222 + }, + { + "epoch": 0.39949369156016495, + "grad_norm": 1.0332460403442383, + "learning_rate": 6.700019956094035e-05, + "loss": 1.2477, + "step": 1223 + }, + { + "epoch": 0.39982034216651013, + "grad_norm": 1.4478338956832886, + "learning_rate": 6.695112972016633e-05, + "loss": 1.4505, + "step": 1224 + }, + { + "epoch": 0.4001469927728553, + "grad_norm": 1.8933912515640259, + "learning_rate": 6.690204142597333e-05, + "loss": 1.4914, + "step": 1225 + }, + { + "epoch": 0.40047364337920055, + "grad_norm": 0.21706634759902954, + "learning_rate": 6.68529347318001e-05, + "loss": 0.7785, + "step": 1226 + }, + { + "epoch": 0.40080029398554573, + "grad_norm": 0.2556523084640503, + "learning_rate": 6.680380969110537e-05, + "loss": 0.8661, + "step": 1227 + }, + { + "epoch": 0.4011269445918909, + "grad_norm": 0.2508922815322876, + "learning_rate": 6.67546663573679e-05, + "loss": 0.86, + "step": 1228 + }, + { + "epoch": 0.4014535951982361, + "grad_norm": 0.2551276385784149, + "learning_rate": 6.670550478408632e-05, + "loss": 0.8221, + "step": 1229 + }, + { + "epoch": 0.40178024580458127, + "grad_norm": 0.2717500627040863, + "learning_rate": 6.665632502477914e-05, + "loss": 0.8709, + "step": 1230 + }, + { + "epoch": 0.40210689641092645, + "grad_norm": 0.2777538299560547, + "learning_rate": 6.660712713298468e-05, + "loss": 0.8241, + "step": 1231 + }, + { + "epoch": 0.40243354701727163, + "grad_norm": 0.29997768998146057, + "learning_rate": 6.655791116226094e-05, + "loss": 0.8535, + "step": 1232 + }, + { + "epoch": 0.4027601976236168, + "grad_norm": 0.3282143771648407, + "learning_rate": 6.650867716618567e-05, + "loss": 0.9073, + "step": 1233 + }, + { + "epoch": 0.40308684822996205, + "grad_norm": 0.3342377841472626, + "learning_rate": 6.645942519835623e-05, + "loss": 0.9987, + "step": 1234 + }, + { + "epoch": 0.4034134988363072, + "grad_norm": 0.31825751066207886, + "learning_rate": 6.64101553123895e-05, + "loss": 0.7599, + "step": 1235 + }, + { + "epoch": 0.4037401494426524, + "grad_norm": 0.3360239863395691, + "learning_rate": 6.636086756192193e-05, + "loss": 0.7941, + "step": 1236 + }, + { + "epoch": 0.4040668000489976, + "grad_norm": 0.3323500156402588, + "learning_rate": 6.631156200060935e-05, + "loss": 0.945, + "step": 1237 + }, + { + "epoch": 0.40439345065534277, + "grad_norm": 0.4027165174484253, + "learning_rate": 6.626223868212702e-05, + "loss": 0.9231, + "step": 1238 + }, + { + "epoch": 0.40472010126168795, + "grad_norm": 0.42029932141304016, + "learning_rate": 6.621289766016955e-05, + "loss": 0.9594, + "step": 1239 + }, + { + "epoch": 0.40504675186803313, + "grad_norm": 0.4373820424079895, + "learning_rate": 6.616353898845076e-05, + "loss": 0.9466, + "step": 1240 + }, + { + "epoch": 0.40537340247437836, + "grad_norm": 0.48459959030151367, + "learning_rate": 6.611416272070377e-05, + "loss": 0.9867, + "step": 1241 + }, + { + "epoch": 0.40570005308072354, + "grad_norm": 0.4633182883262634, + "learning_rate": 6.606476891068074e-05, + "loss": 0.9491, + "step": 1242 + }, + { + "epoch": 0.4060267036870687, + "grad_norm": 0.5320441722869873, + "learning_rate": 6.601535761215305e-05, + "loss": 1.1246, + "step": 1243 + }, + { + "epoch": 0.4063533542934139, + "grad_norm": 0.5146458148956299, + "learning_rate": 6.596592887891103e-05, + "loss": 1.0492, + "step": 1244 + }, + { + "epoch": 0.4066800048997591, + "grad_norm": 0.6374284029006958, + "learning_rate": 6.591648276476402e-05, + "loss": 1.1577, + "step": 1245 + }, + { + "epoch": 0.40700665550610426, + "grad_norm": 0.6750211715698242, + "learning_rate": 6.586701932354031e-05, + "loss": 1.2225, + "step": 1246 + }, + { + "epoch": 0.40733330611244944, + "grad_norm": 0.9183369278907776, + "learning_rate": 6.581753860908699e-05, + "loss": 1.301, + "step": 1247 + }, + { + "epoch": 0.4076599567187947, + "grad_norm": 1.0853521823883057, + "learning_rate": 6.576804067527002e-05, + "loss": 1.5473, + "step": 1248 + }, + { + "epoch": 0.40798660732513986, + "grad_norm": 1.1827778816223145, + "learning_rate": 6.571852557597407e-05, + "loss": 1.4952, + "step": 1249 + }, + { + "epoch": 0.40831325793148504, + "grad_norm": 1.3170537948608398, + "learning_rate": 6.566899336510248e-05, + "loss": 1.6403, + "step": 1250 + }, + { + "epoch": 0.4086399085378302, + "grad_norm": 0.2041776031255722, + "learning_rate": 6.561944409657726e-05, + "loss": 0.6767, + "step": 1251 + }, + { + "epoch": 0.4089665591441754, + "grad_norm": 0.23141330480575562, + "learning_rate": 6.556987782433894e-05, + "loss": 0.8457, + "step": 1252 + }, + { + "epoch": 0.4092932097505206, + "grad_norm": 0.25107938051223755, + "learning_rate": 6.552029460234664e-05, + "loss": 0.8766, + "step": 1253 + }, + { + "epoch": 0.40961986035686576, + "grad_norm": 0.26021620631217957, + "learning_rate": 6.547069448457785e-05, + "loss": 0.9075, + "step": 1254 + }, + { + "epoch": 0.409946510963211, + "grad_norm": 0.29366645216941833, + "learning_rate": 6.542107752502848e-05, + "loss": 0.8482, + "step": 1255 + }, + { + "epoch": 0.4102731615695562, + "grad_norm": 0.2926657497882843, + "learning_rate": 6.537144377771279e-05, + "loss": 0.8924, + "step": 1256 + }, + { + "epoch": 0.41059981217590136, + "grad_norm": 0.30195921659469604, + "learning_rate": 6.53217932966633e-05, + "loss": 0.9467, + "step": 1257 + }, + { + "epoch": 0.41092646278224654, + "grad_norm": 0.29315873980522156, + "learning_rate": 6.527212613593074e-05, + "loss": 0.7876, + "step": 1258 + }, + { + "epoch": 0.4112531133885917, + "grad_norm": 0.29667970538139343, + "learning_rate": 6.522244234958404e-05, + "loss": 0.8602, + "step": 1259 + }, + { + "epoch": 0.4115797639949369, + "grad_norm": 0.3073457181453705, + "learning_rate": 6.517274199171019e-05, + "loss": 0.9042, + "step": 1260 + }, + { + "epoch": 0.4119064146012821, + "grad_norm": 0.3291796147823334, + "learning_rate": 6.512302511641419e-05, + "loss": 0.9297, + "step": 1261 + }, + { + "epoch": 0.4122330652076273, + "grad_norm": 0.36080867052078247, + "learning_rate": 6.507329177781911e-05, + "loss": 0.8909, + "step": 1262 + }, + { + "epoch": 0.4125597158139725, + "grad_norm": 0.39371705055236816, + "learning_rate": 6.502354203006588e-05, + "loss": 1.0463, + "step": 1263 + }, + { + "epoch": 0.4128863664203177, + "grad_norm": 0.3974571228027344, + "learning_rate": 6.497377592731329e-05, + "loss": 0.9081, + "step": 1264 + }, + { + "epoch": 0.41321301702666285, + "grad_norm": 0.41697776317596436, + "learning_rate": 6.492399352373795e-05, + "loss": 0.9524, + "step": 1265 + }, + { + "epoch": 0.41353966763300803, + "grad_norm": 0.4638747572898865, + "learning_rate": 6.487419487353421e-05, + "loss": 1.0998, + "step": 1266 + }, + { + "epoch": 0.4138663182393532, + "grad_norm": 0.5170689821243286, + "learning_rate": 6.482438003091414e-05, + "loss": 1.212, + "step": 1267 + }, + { + "epoch": 0.4141929688456984, + "grad_norm": 0.5564174652099609, + "learning_rate": 6.47745490501074e-05, + "loss": 1.0914, + "step": 1268 + }, + { + "epoch": 0.41451961945204363, + "grad_norm": 0.7147950530052185, + "learning_rate": 6.47247019853612e-05, + "loss": 1.2383, + "step": 1269 + }, + { + "epoch": 0.4148462700583888, + "grad_norm": 0.6999964118003845, + "learning_rate": 6.467483889094033e-05, + "loss": 1.2716, + "step": 1270 + }, + { + "epoch": 0.415172920664734, + "grad_norm": 0.9136797785758972, + "learning_rate": 6.462495982112697e-05, + "loss": 1.1405, + "step": 1271 + }, + { + "epoch": 0.41549957127107917, + "grad_norm": 0.9877931475639343, + "learning_rate": 6.457506483022068e-05, + "loss": 1.5912, + "step": 1272 + }, + { + "epoch": 0.41582622187742435, + "grad_norm": 1.415334939956665, + "learning_rate": 6.452515397253844e-05, + "loss": 1.7894, + "step": 1273 + }, + { + "epoch": 0.41615287248376953, + "grad_norm": 1.3017021417617798, + "learning_rate": 6.44752273024144e-05, + "loss": 1.4178, + "step": 1274 + }, + { + "epoch": 0.4164795230901147, + "grad_norm": 1.7668936252593994, + "learning_rate": 6.442528487419996e-05, + "loss": 1.9307, + "step": 1275 + }, + { + "epoch": 0.41680617369645995, + "grad_norm": 0.20308293402194977, + "learning_rate": 6.437532674226372e-05, + "loss": 0.7709, + "step": 1276 + }, + { + "epoch": 0.4171328243028051, + "grad_norm": 0.21541514992713928, + "learning_rate": 6.432535296099132e-05, + "loss": 0.7705, + "step": 1277 + }, + { + "epoch": 0.4174594749091503, + "grad_norm": 0.24598878622055054, + "learning_rate": 6.427536358478542e-05, + "loss": 0.8412, + "step": 1278 + }, + { + "epoch": 0.4177861255154955, + "grad_norm": 0.2694220244884491, + "learning_rate": 6.422535866806576e-05, + "loss": 0.8584, + "step": 1279 + }, + { + "epoch": 0.41811277612184067, + "grad_norm": 0.2715321183204651, + "learning_rate": 6.417533826526888e-05, + "loss": 0.8804, + "step": 1280 + }, + { + "epoch": 0.41843942672818585, + "grad_norm": 0.26800206303596497, + "learning_rate": 6.412530243084824e-05, + "loss": 0.7912, + "step": 1281 + }, + { + "epoch": 0.41876607733453103, + "grad_norm": 0.2986787259578705, + "learning_rate": 6.407525121927409e-05, + "loss": 0.8956, + "step": 1282 + }, + { + "epoch": 0.41909272794087626, + "grad_norm": 0.3155290186405182, + "learning_rate": 6.40251846850334e-05, + "loss": 0.8807, + "step": 1283 + }, + { + "epoch": 0.41941937854722144, + "grad_norm": 0.3152468502521515, + "learning_rate": 6.397510288262986e-05, + "loss": 0.8617, + "step": 1284 + }, + { + "epoch": 0.4197460291535666, + "grad_norm": 0.33172619342803955, + "learning_rate": 6.392500586658376e-05, + "loss": 0.9051, + "step": 1285 + }, + { + "epoch": 0.4200726797599118, + "grad_norm": 0.39893805980682373, + "learning_rate": 6.387489369143191e-05, + "loss": 1.0182, + "step": 1286 + }, + { + "epoch": 0.420399330366257, + "grad_norm": 0.34438183903694153, + "learning_rate": 6.38247664117277e-05, + "loss": 0.848, + "step": 1287 + }, + { + "epoch": 0.42072598097260216, + "grad_norm": 0.37023016810417175, + "learning_rate": 6.377462408204093e-05, + "loss": 0.8406, + "step": 1288 + }, + { + "epoch": 0.42105263157894735, + "grad_norm": 0.41819244623184204, + "learning_rate": 6.372446675695778e-05, + "loss": 0.9544, + "step": 1289 + }, + { + "epoch": 0.4213792821852926, + "grad_norm": 0.4439786374568939, + "learning_rate": 6.367429449108072e-05, + "loss": 1.0086, + "step": 1290 + }, + { + "epoch": 0.42170593279163776, + "grad_norm": 0.49234476685523987, + "learning_rate": 6.362410733902855e-05, + "loss": 1.0202, + "step": 1291 + }, + { + "epoch": 0.42203258339798294, + "grad_norm": 0.5539708733558655, + "learning_rate": 6.357390535543623e-05, + "loss": 1.1179, + "step": 1292 + }, + { + "epoch": 0.4223592340043281, + "grad_norm": 0.5921236276626587, + "learning_rate": 6.35236885949549e-05, + "loss": 1.3069, + "step": 1293 + }, + { + "epoch": 0.4226858846106733, + "grad_norm": 0.5858140587806702, + "learning_rate": 6.347345711225176e-05, + "loss": 1.2039, + "step": 1294 + }, + { + "epoch": 0.4230125352170185, + "grad_norm": 0.7424845695495605, + "learning_rate": 6.342321096201003e-05, + "loss": 1.4528, + "step": 1295 + }, + { + "epoch": 0.42333918582336366, + "grad_norm": 0.8141841292381287, + "learning_rate": 6.33729501989289e-05, + "loss": 1.4032, + "step": 1296 + }, + { + "epoch": 0.4236658364297089, + "grad_norm": 1.0641597509384155, + "learning_rate": 6.332267487772352e-05, + "loss": 1.4153, + "step": 1297 + }, + { + "epoch": 0.4239924870360541, + "grad_norm": 1.2870875597000122, + "learning_rate": 6.327238505312484e-05, + "loss": 1.6028, + "step": 1298 + }, + { + "epoch": 0.42431913764239926, + "grad_norm": 1.4582892656326294, + "learning_rate": 6.322208077987958e-05, + "loss": 1.3119, + "step": 1299 + }, + { + "epoch": 0.42464578824874444, + "grad_norm": 1.7859705686569214, + "learning_rate": 6.317176211275022e-05, + "loss": 1.7957, + "step": 1300 + }, + { + "epoch": 0.4249724388550896, + "grad_norm": 0.19623303413391113, + "learning_rate": 6.312142910651492e-05, + "loss": 0.7682, + "step": 1301 + }, + { + "epoch": 0.4252990894614348, + "grad_norm": 0.22996629774570465, + "learning_rate": 6.307108181596743e-05, + "loss": 0.8693, + "step": 1302 + }, + { + "epoch": 0.42562574006778, + "grad_norm": 0.24149446189403534, + "learning_rate": 6.302072029591707e-05, + "loss": 0.8444, + "step": 1303 + }, + { + "epoch": 0.4259523906741252, + "grad_norm": 0.25310564041137695, + "learning_rate": 6.297034460118861e-05, + "loss": 0.8612, + "step": 1304 + }, + { + "epoch": 0.4262790412804704, + "grad_norm": 0.26213711500167847, + "learning_rate": 6.29199547866223e-05, + "loss": 0.8087, + "step": 1305 + }, + { + "epoch": 0.4266056918868156, + "grad_norm": 0.28710299730300903, + "learning_rate": 6.286955090707371e-05, + "loss": 0.877, + "step": 1306 + }, + { + "epoch": 0.42693234249316075, + "grad_norm": 0.3216702342033386, + "learning_rate": 6.281913301741378e-05, + "loss": 0.9834, + "step": 1307 + }, + { + "epoch": 0.42725899309950593, + "grad_norm": 0.3331913948059082, + "learning_rate": 6.276870117252867e-05, + "loss": 1.0185, + "step": 1308 + }, + { + "epoch": 0.4275856437058511, + "grad_norm": 0.3292556405067444, + "learning_rate": 6.271825542731971e-05, + "loss": 0.9194, + "step": 1309 + }, + { + "epoch": 0.4279122943121963, + "grad_norm": 0.32852330803871155, + "learning_rate": 6.26677958367034e-05, + "loss": 0.9441, + "step": 1310 + }, + { + "epoch": 0.42823894491854153, + "grad_norm": 0.33652690052986145, + "learning_rate": 6.261732245561129e-05, + "loss": 0.8488, + "step": 1311 + }, + { + "epoch": 0.4285655955248867, + "grad_norm": 0.3745363652706146, + "learning_rate": 6.256683533898995e-05, + "loss": 0.8509, + "step": 1312 + }, + { + "epoch": 0.4288922461312319, + "grad_norm": 0.3785022795200348, + "learning_rate": 6.251633454180091e-05, + "loss": 0.8239, + "step": 1313 + }, + { + "epoch": 0.42921889673757707, + "grad_norm": 0.4264315366744995, + "learning_rate": 6.24658201190206e-05, + "loss": 1.065, + "step": 1314 + }, + { + "epoch": 0.42954554734392225, + "grad_norm": 0.43724411725997925, + "learning_rate": 6.241529212564025e-05, + "loss": 0.9565, + "step": 1315 + }, + { + "epoch": 0.42987219795026743, + "grad_norm": 0.468350887298584, + "learning_rate": 6.236475061666588e-05, + "loss": 1.0929, + "step": 1316 + }, + { + "epoch": 0.4301988485566126, + "grad_norm": 0.5075478553771973, + "learning_rate": 6.231419564711826e-05, + "loss": 1.0679, + "step": 1317 + }, + { + "epoch": 0.43052549916295785, + "grad_norm": 0.5075100660324097, + "learning_rate": 6.226362727203272e-05, + "loss": 1.0657, + "step": 1318 + }, + { + "epoch": 0.43085214976930303, + "grad_norm": 0.5745466947555542, + "learning_rate": 6.22130455464593e-05, + "loss": 1.0917, + "step": 1319 + }, + { + "epoch": 0.4311788003756482, + "grad_norm": 0.6840534210205078, + "learning_rate": 6.216245052546251e-05, + "loss": 1.0422, + "step": 1320 + }, + { + "epoch": 0.4315054509819934, + "grad_norm": 0.7854779958724976, + "learning_rate": 6.211184226412131e-05, + "loss": 1.2574, + "step": 1321 + }, + { + "epoch": 0.43183210158833857, + "grad_norm": 0.9489834904670715, + "learning_rate": 6.206122081752913e-05, + "loss": 1.4515, + "step": 1322 + }, + { + "epoch": 0.43215875219468375, + "grad_norm": 0.9874739050865173, + "learning_rate": 6.201058624079371e-05, + "loss": 1.2792, + "step": 1323 + }, + { + "epoch": 0.43248540280102893, + "grad_norm": 1.4044889211654663, + "learning_rate": 6.195993858903713e-05, + "loss": 1.5486, + "step": 1324 + }, + { + "epoch": 0.43281205340737416, + "grad_norm": 2.303447723388672, + "learning_rate": 6.190927791739565e-05, + "loss": 1.998, + "step": 1325 + }, + { + "epoch": 0.43313870401371934, + "grad_norm": 0.1984492391347885, + "learning_rate": 6.185860428101974e-05, + "loss": 0.7388, + "step": 1326 + }, + { + "epoch": 0.4334653546200645, + "grad_norm": 0.24812500178813934, + "learning_rate": 6.180791773507396e-05, + "loss": 0.7755, + "step": 1327 + }, + { + "epoch": 0.4337920052264097, + "grad_norm": 0.2576265335083008, + "learning_rate": 6.175721833473697e-05, + "loss": 0.796, + "step": 1328 + }, + { + "epoch": 0.4341186558327549, + "grad_norm": 0.2764826714992523, + "learning_rate": 6.170650613520137e-05, + "loss": 0.8901, + "step": 1329 + }, + { + "epoch": 0.43444530643910007, + "grad_norm": 0.28213411569595337, + "learning_rate": 6.16557811916737e-05, + "loss": 0.8744, + "step": 1330 + }, + { + "epoch": 0.43477195704544525, + "grad_norm": 0.2741318345069885, + "learning_rate": 6.160504355937441e-05, + "loss": 0.7944, + "step": 1331 + }, + { + "epoch": 0.4350986076517905, + "grad_norm": 0.3129563331604004, + "learning_rate": 6.155429329353772e-05, + "loss": 0.9256, + "step": 1332 + }, + { + "epoch": 0.43542525825813566, + "grad_norm": 0.2871616780757904, + "learning_rate": 6.150353044941166e-05, + "loss": 0.87, + "step": 1333 + }, + { + "epoch": 0.43575190886448084, + "grad_norm": 0.2862168550491333, + "learning_rate": 6.145275508225789e-05, + "loss": 0.9194, + "step": 1334 + }, + { + "epoch": 0.436078559470826, + "grad_norm": 0.3074105381965637, + "learning_rate": 6.140196724735173e-05, + "loss": 0.8538, + "step": 1335 + }, + { + "epoch": 0.4364052100771712, + "grad_norm": 0.3276168704032898, + "learning_rate": 6.135116699998208e-05, + "loss": 0.903, + "step": 1336 + }, + { + "epoch": 0.4367318606835164, + "grad_norm": 0.3311191499233246, + "learning_rate": 6.130035439545137e-05, + "loss": 0.8529, + "step": 1337 + }, + { + "epoch": 0.43705851128986156, + "grad_norm": 0.36968547105789185, + "learning_rate": 6.12495294890754e-05, + "loss": 1.0096, + "step": 1338 + }, + { + "epoch": 0.43738516189620674, + "grad_norm": 0.3730202317237854, + "learning_rate": 6.119869233618347e-05, + "loss": 0.8173, + "step": 1339 + }, + { + "epoch": 0.437711812502552, + "grad_norm": 0.3859659433364868, + "learning_rate": 6.114784299211812e-05, + "loss": 0.9545, + "step": 1340 + }, + { + "epoch": 0.43803846310889716, + "grad_norm": 0.4324837327003479, + "learning_rate": 6.109698151223524e-05, + "loss": 1.0696, + "step": 1341 + }, + { + "epoch": 0.43836511371524234, + "grad_norm": 0.509543776512146, + "learning_rate": 6.10461079519039e-05, + "loss": 0.9784, + "step": 1342 + }, + { + "epoch": 0.4386917643215875, + "grad_norm": 0.5357346534729004, + "learning_rate": 6.099522236650628e-05, + "loss": 1.109, + "step": 1343 + }, + { + "epoch": 0.4390184149279327, + "grad_norm": 0.5872611403465271, + "learning_rate": 6.09443248114377e-05, + "loss": 1.1932, + "step": 1344 + }, + { + "epoch": 0.4393450655342779, + "grad_norm": 0.6826693415641785, + "learning_rate": 6.089341534210652e-05, + "loss": 1.2509, + "step": 1345 + }, + { + "epoch": 0.43967171614062306, + "grad_norm": 0.7309541702270508, + "learning_rate": 6.084249401393403e-05, + "loss": 1.2348, + "step": 1346 + }, + { + "epoch": 0.4399983667469683, + "grad_norm": 0.897671639919281, + "learning_rate": 6.0791560882354424e-05, + "loss": 1.3749, + "step": 1347 + }, + { + "epoch": 0.4403250173533135, + "grad_norm": 0.9450259804725647, + "learning_rate": 6.07406160028148e-05, + "loss": 1.3895, + "step": 1348 + }, + { + "epoch": 0.44065166795965865, + "grad_norm": 1.1689409017562866, + "learning_rate": 6.0689659430775e-05, + "loss": 1.5146, + "step": 1349 + }, + { + "epoch": 0.44097831856600384, + "grad_norm": 1.6155471801757812, + "learning_rate": 6.063869122170761e-05, + "loss": 1.6751, + "step": 1350 + }, + { + "epoch": 0.441304969172349, + "grad_norm": 0.227553129196167, + "learning_rate": 6.058771143109789e-05, + "loss": 0.8437, + "step": 1351 + }, + { + "epoch": 0.4416316197786942, + "grad_norm": 0.25323745608329773, + "learning_rate": 6.053672011444369e-05, + "loss": 0.8681, + "step": 1352 + }, + { + "epoch": 0.4419582703850394, + "grad_norm": 0.26084786653518677, + "learning_rate": 6.048571732725543e-05, + "loss": 0.8891, + "step": 1353 + }, + { + "epoch": 0.4422849209913846, + "grad_norm": 0.2727871239185333, + "learning_rate": 6.043470312505599e-05, + "loss": 0.7828, + "step": 1354 + }, + { + "epoch": 0.4426115715977298, + "grad_norm": 0.2814069390296936, + "learning_rate": 6.038367756338072e-05, + "loss": 0.8744, + "step": 1355 + }, + { + "epoch": 0.44293822220407497, + "grad_norm": 0.2910081446170807, + "learning_rate": 6.0332640697777273e-05, + "loss": 0.7748, + "step": 1356 + }, + { + "epoch": 0.44326487281042015, + "grad_norm": 0.2924244701862335, + "learning_rate": 6.028159258380567e-05, + "loss": 0.769, + "step": 1357 + }, + { + "epoch": 0.44359152341676533, + "grad_norm": 0.30017054080963135, + "learning_rate": 6.0230533277038127e-05, + "loss": 0.8374, + "step": 1358 + }, + { + "epoch": 0.4439181740231105, + "grad_norm": 0.31334736943244934, + "learning_rate": 6.01794628330591e-05, + "loss": 0.8224, + "step": 1359 + }, + { + "epoch": 0.4442448246294557, + "grad_norm": 0.3215758800506592, + "learning_rate": 6.01283813074651e-05, + "loss": 0.8468, + "step": 1360 + }, + { + "epoch": 0.44457147523580093, + "grad_norm": 0.3455059230327606, + "learning_rate": 6.007728875586476e-05, + "loss": 0.9048, + "step": 1361 + }, + { + "epoch": 0.4448981258421461, + "grad_norm": 0.36075106263160706, + "learning_rate": 6.002618523387868e-05, + "loss": 0.8895, + "step": 1362 + }, + { + "epoch": 0.4452247764484913, + "grad_norm": 0.37401801347732544, + "learning_rate": 5.9975070797139446e-05, + "loss": 0.8716, + "step": 1363 + }, + { + "epoch": 0.44555142705483647, + "grad_norm": 0.40770915150642395, + "learning_rate": 5.992394550129148e-05, + "loss": 0.9879, + "step": 1364 + }, + { + "epoch": 0.44587807766118165, + "grad_norm": 0.4294146001338959, + "learning_rate": 5.9872809401991034e-05, + "loss": 1.0252, + "step": 1365 + }, + { + "epoch": 0.44620472826752683, + "grad_norm": 0.4494366943836212, + "learning_rate": 5.9821662554906144e-05, + "loss": 1.0504, + "step": 1366 + }, + { + "epoch": 0.446531378873872, + "grad_norm": 0.49997806549072266, + "learning_rate": 5.977050501571653e-05, + "loss": 0.9862, + "step": 1367 + }, + { + "epoch": 0.44685802948021724, + "grad_norm": 0.5091699957847595, + "learning_rate": 5.971933684011355e-05, + "loss": 0.9708, + "step": 1368 + }, + { + "epoch": 0.4471846800865624, + "grad_norm": 0.6185327172279358, + "learning_rate": 5.966815808380015e-05, + "loss": 1.1143, + "step": 1369 + }, + { + "epoch": 0.4475113306929076, + "grad_norm": 0.7364172339439392, + "learning_rate": 5.961696880249079e-05, + "loss": 1.3154, + "step": 1370 + }, + { + "epoch": 0.4478379812992528, + "grad_norm": 0.8025093674659729, + "learning_rate": 5.9565769051911376e-05, + "loss": 1.1813, + "step": 1371 + }, + { + "epoch": 0.44816463190559797, + "grad_norm": 0.9280282258987427, + "learning_rate": 5.951455888779925e-05, + "loss": 1.0802, + "step": 1372 + }, + { + "epoch": 0.44849128251194315, + "grad_norm": 1.183429479598999, + "learning_rate": 5.9463338365903035e-05, + "loss": 1.1206, + "step": 1373 + }, + { + "epoch": 0.4488179331182883, + "grad_norm": 1.2044509649276733, + "learning_rate": 5.941210754198266e-05, + "loss": 1.4664, + "step": 1374 + }, + { + "epoch": 0.44914458372463356, + "grad_norm": 1.775682806968689, + "learning_rate": 5.936086647180928e-05, + "loss": 2.0872, + "step": 1375 + }, + { + "epoch": 0.44947123433097874, + "grad_norm": 0.19256451725959778, + "learning_rate": 5.9309615211165185e-05, + "loss": 0.7676, + "step": 1376 + }, + { + "epoch": 0.4497978849373239, + "grad_norm": 0.22578822076320648, + "learning_rate": 5.925835381584377e-05, + "loss": 0.797, + "step": 1377 + }, + { + "epoch": 0.4501245355436691, + "grad_norm": 0.24020101130008698, + "learning_rate": 5.9207082341649454e-05, + "loss": 0.7621, + "step": 1378 + }, + { + "epoch": 0.4504511861500143, + "grad_norm": 0.262273371219635, + "learning_rate": 5.9155800844397625e-05, + "loss": 0.785, + "step": 1379 + }, + { + "epoch": 0.45077783675635946, + "grad_norm": 0.2687849700450897, + "learning_rate": 5.9104509379914586e-05, + "loss": 0.847, + "step": 1380 + }, + { + "epoch": 0.45110448736270464, + "grad_norm": 0.2742252051830292, + "learning_rate": 5.905320800403752e-05, + "loss": 0.798, + "step": 1381 + }, + { + "epoch": 0.4514311379690499, + "grad_norm": 0.28020089864730835, + "learning_rate": 5.900189677261434e-05, + "loss": 0.7844, + "step": 1382 + }, + { + "epoch": 0.45175778857539506, + "grad_norm": 0.3002532422542572, + "learning_rate": 5.8950575741503744e-05, + "loss": 0.9231, + "step": 1383 + }, + { + "epoch": 0.45208443918174024, + "grad_norm": 0.2936133146286011, + "learning_rate": 5.889924496657506e-05, + "loss": 0.8288, + "step": 1384 + }, + { + "epoch": 0.4524110897880854, + "grad_norm": 0.3212607502937317, + "learning_rate": 5.884790450370825e-05, + "loss": 0.8293, + "step": 1385 + }, + { + "epoch": 0.4527377403944306, + "grad_norm": 0.3269292414188385, + "learning_rate": 5.87965544087938e-05, + "loss": 0.8997, + "step": 1386 + }, + { + "epoch": 0.4530643910007758, + "grad_norm": 0.35607802867889404, + "learning_rate": 5.874519473773271e-05, + "loss": 0.9919, + "step": 1387 + }, + { + "epoch": 0.45339104160712096, + "grad_norm": 0.3684079647064209, + "learning_rate": 5.869382554643639e-05, + "loss": 0.9916, + "step": 1388 + }, + { + "epoch": 0.4537176922134662, + "grad_norm": 0.3907953202724457, + "learning_rate": 5.864244689082659e-05, + "loss": 1.0021, + "step": 1389 + }, + { + "epoch": 0.4540443428198114, + "grad_norm": 0.3965863883495331, + "learning_rate": 5.85910588268354e-05, + "loss": 0.8898, + "step": 1390 + }, + { + "epoch": 0.45437099342615656, + "grad_norm": 0.4447333514690399, + "learning_rate": 5.853966141040512e-05, + "loss": 0.9954, + "step": 1391 + }, + { + "epoch": 0.45469764403250174, + "grad_norm": 0.4704182744026184, + "learning_rate": 5.848825469748828e-05, + "loss": 1.0979, + "step": 1392 + }, + { + "epoch": 0.4550242946388469, + "grad_norm": 0.47324079275131226, + "learning_rate": 5.843683874404746e-05, + "loss": 0.9627, + "step": 1393 + }, + { + "epoch": 0.4553509452451921, + "grad_norm": 0.618899941444397, + "learning_rate": 5.838541360605538e-05, + "loss": 1.104, + "step": 1394 + }, + { + "epoch": 0.4556775958515373, + "grad_norm": 0.6069836616516113, + "learning_rate": 5.833397933949469e-05, + "loss": 1.2049, + "step": 1395 + }, + { + "epoch": 0.4560042464578825, + "grad_norm": 0.7420676946640015, + "learning_rate": 5.8282536000358e-05, + "loss": 1.2061, + "step": 1396 + }, + { + "epoch": 0.4563308970642277, + "grad_norm": 1.0195621252059937, + "learning_rate": 5.823108364464782e-05, + "loss": 1.4885, + "step": 1397 + }, + { + "epoch": 0.45665754767057287, + "grad_norm": 1.0798938274383545, + "learning_rate": 5.817962232837645e-05, + "loss": 1.4734, + "step": 1398 + }, + { + "epoch": 0.45698419827691805, + "grad_norm": 1.271763563156128, + "learning_rate": 5.8128152107565946e-05, + "loss": 1.8257, + "step": 1399 + }, + { + "epoch": 0.45731084888326323, + "grad_norm": 1.607129693031311, + "learning_rate": 5.807667303824806e-05, + "loss": 1.8979, + "step": 1400 + }, + { + "epoch": 0.4576374994896084, + "grad_norm": 0.2017160803079605, + "learning_rate": 5.8025185176464204e-05, + "loss": 0.6849, + "step": 1401 + }, + { + "epoch": 0.4579641500959536, + "grad_norm": 0.21873755753040314, + "learning_rate": 5.7973688578265304e-05, + "loss": 0.7677, + "step": 1402 + }, + { + "epoch": 0.45829080070229883, + "grad_norm": 0.25168201327323914, + "learning_rate": 5.792218329971184e-05, + "loss": 0.8155, + "step": 1403 + }, + { + "epoch": 0.458617451308644, + "grad_norm": 0.2593826651573181, + "learning_rate": 5.7870669396873754e-05, + "loss": 0.8939, + "step": 1404 + }, + { + "epoch": 0.4589441019149892, + "grad_norm": 0.24560409784317017, + "learning_rate": 5.7819146925830324e-05, + "loss": 0.7275, + "step": 1405 + }, + { + "epoch": 0.45927075252133437, + "grad_norm": 0.273952841758728, + "learning_rate": 5.7767615942670204e-05, + "loss": 0.8585, + "step": 1406 + }, + { + "epoch": 0.45959740312767955, + "grad_norm": 0.2819421887397766, + "learning_rate": 5.7716076503491314e-05, + "loss": 0.8319, + "step": 1407 + }, + { + "epoch": 0.45992405373402473, + "grad_norm": 0.3027994632720947, + "learning_rate": 5.766452866440072e-05, + "loss": 0.8609, + "step": 1408 + }, + { + "epoch": 0.4602507043403699, + "grad_norm": 0.3026481866836548, + "learning_rate": 5.761297248151469e-05, + "loss": 0.8992, + "step": 1409 + }, + { + "epoch": 0.46057735494671515, + "grad_norm": 0.3143552839756012, + "learning_rate": 5.756140801095858e-05, + "loss": 0.9488, + "step": 1410 + }, + { + "epoch": 0.4609040055530603, + "grad_norm": 0.3467939496040344, + "learning_rate": 5.750983530886672e-05, + "loss": 1.0314, + "step": 1411 + }, + { + "epoch": 0.4612306561594055, + "grad_norm": 0.3449691832065582, + "learning_rate": 5.745825443138246e-05, + "loss": 0.9967, + "step": 1412 + }, + { + "epoch": 0.4615573067657507, + "grad_norm": 0.35669663548469543, + "learning_rate": 5.740666543465798e-05, + "loss": 1.001, + "step": 1413 + }, + { + "epoch": 0.46188395737209587, + "grad_norm": 0.40556496381759644, + "learning_rate": 5.735506837485437e-05, + "loss": 0.9929, + "step": 1414 + }, + { + "epoch": 0.46221060797844105, + "grad_norm": 0.3812430500984192, + "learning_rate": 5.730346330814145e-05, + "loss": 1.0193, + "step": 1415 + }, + { + "epoch": 0.4625372585847862, + "grad_norm": 0.44601941108703613, + "learning_rate": 5.7251850290697774e-05, + "loss": 0.9902, + "step": 1416 + }, + { + "epoch": 0.46286390919113146, + "grad_norm": 0.42666640877723694, + "learning_rate": 5.7200229378710546e-05, + "loss": 1.0404, + "step": 1417 + }, + { + "epoch": 0.46319055979747664, + "grad_norm": 0.4538075625896454, + "learning_rate": 5.714860062837557e-05, + "loss": 1.0535, + "step": 1418 + }, + { + "epoch": 0.4635172104038218, + "grad_norm": 0.5554001331329346, + "learning_rate": 5.7096964095897174e-05, + "loss": 1.1326, + "step": 1419 + }, + { + "epoch": 0.463843861010167, + "grad_norm": 0.5905495882034302, + "learning_rate": 5.7045319837488186e-05, + "loss": 1.2233, + "step": 1420 + }, + { + "epoch": 0.4641705116165122, + "grad_norm": 0.6906095147132874, + "learning_rate": 5.6993667909369794e-05, + "loss": 1.292, + "step": 1421 + }, + { + "epoch": 0.46449716222285736, + "grad_norm": 0.793144941329956, + "learning_rate": 5.694200836777158e-05, + "loss": 1.2007, + "step": 1422 + }, + { + "epoch": 0.46482381282920254, + "grad_norm": 1.0440678596496582, + "learning_rate": 5.68903412689314e-05, + "loss": 1.3643, + "step": 1423 + }, + { + "epoch": 0.4651504634355478, + "grad_norm": 1.2731975317001343, + "learning_rate": 5.683866666909533e-05, + "loss": 1.6552, + "step": 1424 + }, + { + "epoch": 0.46547711404189296, + "grad_norm": 1.792528510093689, + "learning_rate": 5.6786984624517636e-05, + "loss": 1.6505, + "step": 1425 + }, + { + "epoch": 0.46580376464823814, + "grad_norm": 0.20550623536109924, + "learning_rate": 5.6735295191460636e-05, + "loss": 0.7223, + "step": 1426 + }, + { + "epoch": 0.4661304152545833, + "grad_norm": 0.24376057088375092, + "learning_rate": 5.668359842619474e-05, + "loss": 0.8327, + "step": 1427 + }, + { + "epoch": 0.4664570658609285, + "grad_norm": 0.23944585025310516, + "learning_rate": 5.663189438499833e-05, + "loss": 0.8034, + "step": 1428 + }, + { + "epoch": 0.4667837164672737, + "grad_norm": 0.2565159499645233, + "learning_rate": 5.6580183124157714e-05, + "loss": 0.8629, + "step": 1429 + }, + { + "epoch": 0.46711036707361886, + "grad_norm": 0.2701554298400879, + "learning_rate": 5.652846469996702e-05, + "loss": 0.8289, + "step": 1430 + }, + { + "epoch": 0.4674370176799641, + "grad_norm": 0.30046141147613525, + "learning_rate": 5.647673916872822e-05, + "loss": 0.8658, + "step": 1431 + }, + { + "epoch": 0.4677636682863093, + "grad_norm": 0.2801269590854645, + "learning_rate": 5.6425006586751004e-05, + "loss": 0.9112, + "step": 1432 + }, + { + "epoch": 0.46809031889265446, + "grad_norm": 0.3160743713378906, + "learning_rate": 5.6373267010352736e-05, + "loss": 0.8969, + "step": 1433 + }, + { + "epoch": 0.46841696949899964, + "grad_norm": 0.3159767687320709, + "learning_rate": 5.632152049585843e-05, + "loss": 0.9211, + "step": 1434 + }, + { + "epoch": 0.4687436201053448, + "grad_norm": 0.31142839789390564, + "learning_rate": 5.626976709960057e-05, + "loss": 0.8939, + "step": 1435 + }, + { + "epoch": 0.46907027071169, + "grad_norm": 0.31656891107559204, + "learning_rate": 5.621800687791922e-05, + "loss": 0.8701, + "step": 1436 + }, + { + "epoch": 0.4693969213180352, + "grad_norm": 0.3449065387248993, + "learning_rate": 5.616623988716181e-05, + "loss": 0.9217, + "step": 1437 + }, + { + "epoch": 0.4697235719243804, + "grad_norm": 0.39621448516845703, + "learning_rate": 5.611446618368319e-05, + "loss": 0.9877, + "step": 1438 + }, + { + "epoch": 0.4700502225307256, + "grad_norm": 0.39008018374443054, + "learning_rate": 5.606268582384548e-05, + "loss": 0.889, + "step": 1439 + }, + { + "epoch": 0.4703768731370708, + "grad_norm": 0.443905234336853, + "learning_rate": 5.601089886401808e-05, + "loss": 0.9621, + "step": 1440 + }, + { + "epoch": 0.47070352374341595, + "grad_norm": 0.46601757407188416, + "learning_rate": 5.595910536057753e-05, + "loss": 0.9748, + "step": 1441 + }, + { + "epoch": 0.47103017434976113, + "grad_norm": 0.4815382957458496, + "learning_rate": 5.5907305369907534e-05, + "loss": 0.9838, + "step": 1442 + }, + { + "epoch": 0.4713568249561063, + "grad_norm": 0.5498834252357483, + "learning_rate": 5.5855498948398846e-05, + "loss": 1.069, + "step": 1443 + }, + { + "epoch": 0.4716834755624515, + "grad_norm": 0.5492748022079468, + "learning_rate": 5.5803686152449184e-05, + "loss": 0.996, + "step": 1444 + }, + { + "epoch": 0.4720101261687967, + "grad_norm": 0.6215171217918396, + "learning_rate": 5.575186703846328e-05, + "loss": 1.0071, + "step": 1445 + }, + { + "epoch": 0.4723367767751419, + "grad_norm": 0.8084948658943176, + "learning_rate": 5.5700041662852684e-05, + "loss": 1.2216, + "step": 1446 + }, + { + "epoch": 0.4726634273814871, + "grad_norm": 0.9253719449043274, + "learning_rate": 5.56482100820358e-05, + "loss": 1.1535, + "step": 1447 + }, + { + "epoch": 0.47299007798783227, + "grad_norm": 1.2240797281265259, + "learning_rate": 5.559637235243773e-05, + "loss": 1.5206, + "step": 1448 + }, + { + "epoch": 0.47331672859417745, + "grad_norm": 1.3156620264053345, + "learning_rate": 5.5544528530490345e-05, + "loss": 1.3771, + "step": 1449 + }, + { + "epoch": 0.47364337920052263, + "grad_norm": 1.7129418849945068, + "learning_rate": 5.5492678672632094e-05, + "loss": 2.4055, + "step": 1450 + }, + { + "epoch": 0.4739700298068678, + "grad_norm": 0.17648789286613464, + "learning_rate": 5.5440822835308026e-05, + "loss": 0.6887, + "step": 1451 + }, + { + "epoch": 0.474296680413213, + "grad_norm": 0.22524596750736237, + "learning_rate": 5.538896107496967e-05, + "loss": 0.7308, + "step": 1452 + }, + { + "epoch": 0.4746233310195582, + "grad_norm": 0.23709015548229218, + "learning_rate": 5.5337093448075025e-05, + "loss": 0.8185, + "step": 1453 + }, + { + "epoch": 0.4749499816259034, + "grad_norm": 0.24522243440151215, + "learning_rate": 5.528522001108849e-05, + "loss": 0.7561, + "step": 1454 + }, + { + "epoch": 0.4752766322322486, + "grad_norm": 0.2625182569026947, + "learning_rate": 5.523334082048075e-05, + "loss": 0.8076, + "step": 1455 + }, + { + "epoch": 0.47560328283859377, + "grad_norm": 0.27700942754745483, + "learning_rate": 5.5181455932728785e-05, + "loss": 0.8427, + "step": 1456 + }, + { + "epoch": 0.47592993344493895, + "grad_norm": 0.2912745475769043, + "learning_rate": 5.512956540431577e-05, + "loss": 0.8505, + "step": 1457 + }, + { + "epoch": 0.4762565840512841, + "grad_norm": 0.2912481129169464, + "learning_rate": 5.5077669291731006e-05, + "loss": 0.9061, + "step": 1458 + }, + { + "epoch": 0.4765832346576293, + "grad_norm": 0.36340487003326416, + "learning_rate": 5.502576765146989e-05, + "loss": 0.9754, + "step": 1459 + }, + { + "epoch": 0.47690988526397454, + "grad_norm": 0.3103990852832794, + "learning_rate": 5.497386054003385e-05, + "loss": 0.9334, + "step": 1460 + }, + { + "epoch": 0.4772365358703197, + "grad_norm": 0.33423370122909546, + "learning_rate": 5.492194801393023e-05, + "loss": 0.8043, + "step": 1461 + }, + { + "epoch": 0.4775631864766649, + "grad_norm": 0.3521210253238678, + "learning_rate": 5.487003012967228e-05, + "loss": 0.9447, + "step": 1462 + }, + { + "epoch": 0.4778898370830101, + "grad_norm": 0.342441201210022, + "learning_rate": 5.4818106943779105e-05, + "loss": 0.9133, + "step": 1463 + }, + { + "epoch": 0.47821648768935526, + "grad_norm": 0.39511221647262573, + "learning_rate": 5.476617851277559e-05, + "loss": 1.0284, + "step": 1464 + }, + { + "epoch": 0.47854313829570044, + "grad_norm": 0.39670830965042114, + "learning_rate": 5.471424489319227e-05, + "loss": 0.9131, + "step": 1465 + }, + { + "epoch": 0.4788697889020456, + "grad_norm": 0.4468030333518982, + "learning_rate": 5.466230614156539e-05, + "loss": 0.93, + "step": 1466 + }, + { + "epoch": 0.47919643950839086, + "grad_norm": 0.47740623354911804, + "learning_rate": 5.461036231443676e-05, + "loss": 1.0926, + "step": 1467 + }, + { + "epoch": 0.47952309011473604, + "grad_norm": 0.5126772522926331, + "learning_rate": 5.455841346835371e-05, + "loss": 0.9977, + "step": 1468 + }, + { + "epoch": 0.4798497407210812, + "grad_norm": 0.5842729210853577, + "learning_rate": 5.4506459659869036e-05, + "loss": 1.0636, + "step": 1469 + }, + { + "epoch": 0.4801763913274264, + "grad_norm": 0.633167564868927, + "learning_rate": 5.445450094554094e-05, + "loss": 1.2103, + "step": 1470 + }, + { + "epoch": 0.4805030419337716, + "grad_norm": 0.7382978200912476, + "learning_rate": 5.440253738193297e-05, + "loss": 1.3609, + "step": 1471 + }, + { + "epoch": 0.48082969254011676, + "grad_norm": 1.0082590579986572, + "learning_rate": 5.435056902561393e-05, + "loss": 1.5858, + "step": 1472 + }, + { + "epoch": 0.48115634314646194, + "grad_norm": 1.1187267303466797, + "learning_rate": 5.4298595933157884e-05, + "loss": 1.5626, + "step": 1473 + }, + { + "epoch": 0.4814829937528072, + "grad_norm": 1.2796682119369507, + "learning_rate": 5.4246618161144006e-05, + "loss": 1.774, + "step": 1474 + }, + { + "epoch": 0.48180964435915236, + "grad_norm": 1.722538709640503, + "learning_rate": 5.4194635766156575e-05, + "loss": 2.2056, + "step": 1475 + }, + { + "epoch": 0.48213629496549754, + "grad_norm": 0.22073139250278473, + "learning_rate": 5.414264880478493e-05, + "loss": 0.8193, + "step": 1476 + }, + { + "epoch": 0.4824629455718427, + "grad_norm": 0.23783589899539948, + "learning_rate": 5.409065733362337e-05, + "loss": 0.8528, + "step": 1477 + }, + { + "epoch": 0.4827895961781879, + "grad_norm": 0.2497948557138443, + "learning_rate": 5.403866140927109e-05, + "loss": 0.7723, + "step": 1478 + }, + { + "epoch": 0.4831162467845331, + "grad_norm": 0.25545740127563477, + "learning_rate": 5.3986661088332115e-05, + "loss": 0.8591, + "step": 1479 + }, + { + "epoch": 0.48344289739087826, + "grad_norm": 0.2744065821170807, + "learning_rate": 5.3934656427415295e-05, + "loss": 0.8505, + "step": 1480 + }, + { + "epoch": 0.4837695479972235, + "grad_norm": 0.2947370111942291, + "learning_rate": 5.3882647483134196e-05, + "loss": 0.8756, + "step": 1481 + }, + { + "epoch": 0.4840961986035687, + "grad_norm": 0.3097544312477112, + "learning_rate": 5.3830634312107056e-05, + "loss": 0.8883, + "step": 1482 + }, + { + "epoch": 0.48442284920991385, + "grad_norm": 0.31207266449928284, + "learning_rate": 5.3778616970956663e-05, + "loss": 0.8534, + "step": 1483 + }, + { + "epoch": 0.48474949981625903, + "grad_norm": 0.38118842244148254, + "learning_rate": 5.3726595516310405e-05, + "loss": 0.9782, + "step": 1484 + }, + { + "epoch": 0.4850761504226042, + "grad_norm": 0.315085232257843, + "learning_rate": 5.367457000480011e-05, + "loss": 0.8068, + "step": 1485 + }, + { + "epoch": 0.4854028010289494, + "grad_norm": 0.34587183594703674, + "learning_rate": 5.3622540493062046e-05, + "loss": 0.8694, + "step": 1486 + }, + { + "epoch": 0.4857294516352946, + "grad_norm": 0.3551337718963623, + "learning_rate": 5.3570507037736826e-05, + "loss": 0.9294, + "step": 1487 + }, + { + "epoch": 0.4860561022416398, + "grad_norm": 0.3878238797187805, + "learning_rate": 5.351846969546935e-05, + "loss": 0.8701, + "step": 1488 + }, + { + "epoch": 0.486382752847985, + "grad_norm": 0.4408702254295349, + "learning_rate": 5.346642852290876e-05, + "loss": 1.2004, + "step": 1489 + }, + { + "epoch": 0.48670940345433017, + "grad_norm": 0.5383159518241882, + "learning_rate": 5.341438357670838e-05, + "loss": 1.0944, + "step": 1490 + }, + { + "epoch": 0.48703605406067535, + "grad_norm": 0.48991337418556213, + "learning_rate": 5.336233491352559e-05, + "loss": 1.0647, + "step": 1491 + }, + { + "epoch": 0.48736270466702053, + "grad_norm": 0.5360439419746399, + "learning_rate": 5.3310282590021875e-05, + "loss": 1.0419, + "step": 1492 + }, + { + "epoch": 0.4876893552733657, + "grad_norm": 0.5581495761871338, + "learning_rate": 5.325822666286268e-05, + "loss": 1.2277, + "step": 1493 + }, + { + "epoch": 0.4880160058797109, + "grad_norm": 0.6016497015953064, + "learning_rate": 5.320616718871736e-05, + "loss": 1.0748, + "step": 1494 + }, + { + "epoch": 0.4883426564860561, + "grad_norm": 0.6058472990989685, + "learning_rate": 5.315410422425917e-05, + "loss": 1.1947, + "step": 1495 + }, + { + "epoch": 0.4886693070924013, + "grad_norm": 0.7601789236068726, + "learning_rate": 5.310203782616513e-05, + "loss": 1.4049, + "step": 1496 + }, + { + "epoch": 0.4889959576987465, + "grad_norm": 0.9896534085273743, + "learning_rate": 5.304996805111599e-05, + "loss": 1.3961, + "step": 1497 + }, + { + "epoch": 0.48932260830509167, + "grad_norm": 1.224902629852295, + "learning_rate": 5.299789495579621e-05, + "loss": 1.6296, + "step": 1498 + }, + { + "epoch": 0.48964925891143685, + "grad_norm": 1.2904133796691895, + "learning_rate": 5.294581859689387e-05, + "loss": 1.5572, + "step": 1499 + }, + { + "epoch": 0.489975909517782, + "grad_norm": 1.9275236129760742, + "learning_rate": 5.2893739031100554e-05, + "loss": 2.2083, + "step": 1500 + }, + { + "epoch": 0.4903025601241272, + "grad_norm": 0.18931369483470917, + "learning_rate": 5.2841656315111366e-05, + "loss": 0.69, + "step": 1501 + }, + { + "epoch": 0.49062921073047244, + "grad_norm": 0.2273533195257187, + "learning_rate": 5.278957050562485e-05, + "loss": 0.8017, + "step": 1502 + }, + { + "epoch": 0.4909558613368176, + "grad_norm": 0.24772115051746368, + "learning_rate": 5.273748165934289e-05, + "loss": 0.8714, + "step": 1503 + }, + { + "epoch": 0.4912825119431628, + "grad_norm": 0.2568557858467102, + "learning_rate": 5.268538983297072e-05, + "loss": 0.827, + "step": 1504 + }, + { + "epoch": 0.491609162549508, + "grad_norm": 0.25856682658195496, + "learning_rate": 5.263329508321676e-05, + "loss": 0.8657, + "step": 1505 + }, + { + "epoch": 0.49193581315585316, + "grad_norm": 0.2738785445690155, + "learning_rate": 5.258119746679266e-05, + "loss": 0.8616, + "step": 1506 + }, + { + "epoch": 0.49226246376219834, + "grad_norm": 0.28627508878707886, + "learning_rate": 5.252909704041318e-05, + "loss": 0.7742, + "step": 1507 + }, + { + "epoch": 0.4925891143685435, + "grad_norm": 0.3042372763156891, + "learning_rate": 5.247699386079613e-05, + "loss": 0.874, + "step": 1508 + }, + { + "epoch": 0.49291576497488876, + "grad_norm": 0.328125536441803, + "learning_rate": 5.2424887984662294e-05, + "loss": 0.8562, + "step": 1509 + }, + { + "epoch": 0.49324241558123394, + "grad_norm": 0.3250144124031067, + "learning_rate": 5.2372779468735446e-05, + "loss": 0.9107, + "step": 1510 + }, + { + "epoch": 0.4935690661875791, + "grad_norm": 0.35796797275543213, + "learning_rate": 5.232066836974219e-05, + "loss": 0.98, + "step": 1511 + }, + { + "epoch": 0.4938957167939243, + "grad_norm": 0.3610784113407135, + "learning_rate": 5.226855474441197e-05, + "loss": 0.9441, + "step": 1512 + }, + { + "epoch": 0.4942223674002695, + "grad_norm": 0.36936962604522705, + "learning_rate": 5.2216438649476954e-05, + "loss": 0.9559, + "step": 1513 + }, + { + "epoch": 0.49454901800661466, + "grad_norm": 0.3782363533973694, + "learning_rate": 5.2164320141672006e-05, + "loss": 0.8496, + "step": 1514 + }, + { + "epoch": 0.49487566861295984, + "grad_norm": 0.37794479727745056, + "learning_rate": 5.211219927773464e-05, + "loss": 0.8517, + "step": 1515 + }, + { + "epoch": 0.4952023192193051, + "grad_norm": 0.48705726861953735, + "learning_rate": 5.206007611440491e-05, + "loss": 1.0162, + "step": 1516 + }, + { + "epoch": 0.49552896982565026, + "grad_norm": 0.4541557729244232, + "learning_rate": 5.200795070842539e-05, + "loss": 1.0469, + "step": 1517 + }, + { + "epoch": 0.49585562043199544, + "grad_norm": 0.4859015643596649, + "learning_rate": 5.195582311654107e-05, + "loss": 0.9468, + "step": 1518 + }, + { + "epoch": 0.4961822710383406, + "grad_norm": 0.6146647334098816, + "learning_rate": 5.190369339549933e-05, + "loss": 1.0863, + "step": 1519 + }, + { + "epoch": 0.4965089216446858, + "grad_norm": 0.6606687307357788, + "learning_rate": 5.18515616020499e-05, + "loss": 1.3782, + "step": 1520 + }, + { + "epoch": 0.496835572251031, + "grad_norm": 0.7691556215286255, + "learning_rate": 5.179942779294472e-05, + "loss": 1.1182, + "step": 1521 + }, + { + "epoch": 0.49716222285737616, + "grad_norm": 0.9613664746284485, + "learning_rate": 5.174729202493794e-05, + "loss": 1.3084, + "step": 1522 + }, + { + "epoch": 0.4974888734637214, + "grad_norm": 1.210338830947876, + "learning_rate": 5.169515435478587e-05, + "loss": 1.4691, + "step": 1523 + }, + { + "epoch": 0.4978155240700666, + "grad_norm": 1.183440923690796, + "learning_rate": 5.164301483924685e-05, + "loss": 1.4419, + "step": 1524 + }, + { + "epoch": 0.49814217467641175, + "grad_norm": 1.764573574066162, + "learning_rate": 5.159087353508125e-05, + "loss": 2.0149, + "step": 1525 + }, + { + "epoch": 0.49846882528275693, + "grad_norm": 0.2246791124343872, + "learning_rate": 5.153873049905138e-05, + "loss": 0.7747, + "step": 1526 + }, + { + "epoch": 0.4987954758891021, + "grad_norm": 0.24696335196495056, + "learning_rate": 5.1486585787921427e-05, + "loss": 0.7843, + "step": 1527 + }, + { + "epoch": 0.4991221264954473, + "grad_norm": 0.24796541035175323, + "learning_rate": 5.1434439458457426e-05, + "loss": 0.8426, + "step": 1528 + }, + { + "epoch": 0.4994487771017925, + "grad_norm": 0.2630245089530945, + "learning_rate": 5.1382291567427175e-05, + "loss": 0.9231, + "step": 1529 + }, + { + "epoch": 0.4997754277081377, + "grad_norm": 0.2528044879436493, + "learning_rate": 5.133014217160014e-05, + "loss": 0.7866, + "step": 1530 + }, + { + "epoch": 0.5001020783144828, + "grad_norm": 0.2634943425655365, + "learning_rate": 5.127799132774744e-05, + "loss": 0.7849, + "step": 1531 + }, + { + "epoch": 0.500428728920828, + "grad_norm": 0.27956002950668335, + "learning_rate": 5.122583909264178e-05, + "loss": 0.8601, + "step": 1532 + }, + { + "epoch": 0.500428728920828, + "eval_loss": 1.0430185794830322, + "eval_runtime": 500.0584, + "eval_samples_per_second": 5.155, + "eval_steps_per_second": 2.578, + "step": 1532 } ], "logging_steps": 1, @@ -5404,7 +10774,7 @@ "attributes": {} } }, - "total_flos": 5.04481580677333e+17, + "total_flos": 1.0089220027501773e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null