diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,13333 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 496.73202614379085, + "eval_steps": 500, + "global_step": 19000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.26143790849673204, + "grad_norm": 0.7138202786445618, + "learning_rate": 0.00019999986330190926, + "loss": 2.463, + "step": 10 + }, + { + "epoch": 0.5228758169934641, + "grad_norm": 0.790814220905304, + "learning_rate": 0.00019999945320801072, + "loss": 2.1963, + "step": 20 + }, + { + "epoch": 0.7843137254901961, + "grad_norm": 0.8431825041770935, + "learning_rate": 0.00019999876971942557, + "loss": 1.7823, + "step": 30 + }, + { + "epoch": 1.0457516339869282, + "grad_norm": 0.8540534377098083, + "learning_rate": 0.0001999978128380225, + "loss": 1.4428, + "step": 40 + }, + { + "epoch": 1.3071895424836601, + "grad_norm": 0.8526931405067444, + "learning_rate": 0.00019999658256641747, + "loss": 1.1373, + "step": 50 + }, + { + "epoch": 1.5686274509803921, + "grad_norm": 1.1626569032669067, + "learning_rate": 0.00019999507890797408, + "loss": 0.8482, + "step": 60 + }, + { + "epoch": 1.8300653594771243, + "grad_norm": 1.191985845565796, + "learning_rate": 0.0001999933018668033, + "loss": 0.6793, + "step": 70 + }, + { + "epoch": 2.0915032679738563, + "grad_norm": 0.8180794715881348, + "learning_rate": 0.0001999912514477634, + "loss": 0.5741, + "step": 80 + }, + { + "epoch": 2.3529411764705883, + "grad_norm": 0.7215670347213745, + "learning_rate": 0.00019998892765646026, + "loss": 0.424, + "step": 90 + }, + { + "epoch": 2.6143790849673203, + "grad_norm": 0.9890243411064148, + "learning_rate": 0.0001999863304992469, + "loss": 0.4335, + "step": 100 + }, + { + "epoch": 2.8758169934640523, + "grad_norm": 0.8641286492347717, + "learning_rate": 0.00019998345998322397, + "loss": 0.4301, + "step": 110 + }, + { + "epoch": 3.1372549019607843, + "grad_norm": 0.8116602301597595, + "learning_rate": 0.0001999803161162393, + "loss": 0.3547, + "step": 120 + }, + { + "epoch": 3.3986928104575163, + "grad_norm": 0.9898081421852112, + "learning_rate": 0.0001999768989068881, + "loss": 0.3373, + "step": 130 + }, + { + "epoch": 3.6601307189542482, + "grad_norm": 0.762677788734436, + "learning_rate": 0.0001999732083645129, + "loss": 0.3405, + "step": 140 + }, + { + "epoch": 3.9215686274509802, + "grad_norm": 0.9873971343040466, + "learning_rate": 0.0001999692444992035, + "loss": 0.3842, + "step": 150 + }, + { + "epoch": 4.183006535947713, + "grad_norm": 0.9828356504440308, + "learning_rate": 0.00019996500732179695, + "loss": 0.313, + "step": 160 + }, + { + "epoch": 4.444444444444445, + "grad_norm": 0.8973499536514282, + "learning_rate": 0.0001999604968438775, + "loss": 0.3191, + "step": 170 + }, + { + "epoch": 4.705882352941177, + "grad_norm": 0.9795909523963928, + "learning_rate": 0.0001999557130777767, + "loss": 0.3101, + "step": 180 + }, + { + "epoch": 4.967320261437909, + "grad_norm": 1.2029128074645996, + "learning_rate": 0.00019995065603657316, + "loss": 0.3403, + "step": 190 + }, + { + "epoch": 5.228758169934641, + "grad_norm": 1.0984058380126953, + "learning_rate": 0.00019994532573409262, + "loss": 0.2631, + "step": 200 + }, + { + "epoch": 5.490196078431373, + "grad_norm": 0.8392295241355896, + "learning_rate": 0.0001999397221849079, + "loss": 0.2942, + "step": 210 + }, + { + "epoch": 5.751633986928105, + "grad_norm": 0.8194178938865662, + "learning_rate": 0.00019993384540433894, + "loss": 0.3027, + "step": 220 + }, + { + "epoch": 6.0130718954248366, + "grad_norm": 1.1430736780166626, + "learning_rate": 0.00019992769540845258, + "loss": 0.312, + "step": 230 + }, + { + "epoch": 6.2745098039215685, + "grad_norm": 1.0457488298416138, + "learning_rate": 0.00019992127221406275, + "loss": 0.2631, + "step": 240 + }, + { + "epoch": 6.5359477124183005, + "grad_norm": 0.7075788974761963, + "learning_rate": 0.0001999145758387301, + "loss": 0.2814, + "step": 250 + }, + { + "epoch": 6.7973856209150325, + "grad_norm": 0.8417288064956665, + "learning_rate": 0.00019990760630076237, + "loss": 0.2945, + "step": 260 + }, + { + "epoch": 7.0588235294117645, + "grad_norm": 1.1337305307388306, + "learning_rate": 0.000199900363619214, + "loss": 0.288, + "step": 270 + }, + { + "epoch": 7.3202614379084965, + "grad_norm": 0.9480029940605164, + "learning_rate": 0.00019989284781388617, + "loss": 0.2696, + "step": 280 + }, + { + "epoch": 7.5816993464052285, + "grad_norm": 0.9333047270774841, + "learning_rate": 0.0001998850589053268, + "loss": 0.2647, + "step": 290 + }, + { + "epoch": 7.8431372549019605, + "grad_norm": 0.8173583149909973, + "learning_rate": 0.00019987699691483048, + "loss": 0.2595, + "step": 300 + }, + { + "epoch": 8.104575163398692, + "grad_norm": 1.2094272375106812, + "learning_rate": 0.0001998686618644384, + "loss": 0.2696, + "step": 310 + }, + { + "epoch": 8.366013071895425, + "grad_norm": 0.8234169483184814, + "learning_rate": 0.00019986005377693825, + "loss": 0.2446, + "step": 320 + }, + { + "epoch": 8.627450980392156, + "grad_norm": 1.0735175609588623, + "learning_rate": 0.00019985117267586424, + "loss": 0.2667, + "step": 330 + }, + { + "epoch": 8.88888888888889, + "grad_norm": 0.8714181184768677, + "learning_rate": 0.00019984201858549693, + "loss": 0.2737, + "step": 340 + }, + { + "epoch": 9.15032679738562, + "grad_norm": 1.0795856714248657, + "learning_rate": 0.00019983259153086327, + "loss": 0.2782, + "step": 350 + }, + { + "epoch": 9.411764705882353, + "grad_norm": 1.0457770824432373, + "learning_rate": 0.00019982289153773646, + "loss": 0.2415, + "step": 360 + }, + { + "epoch": 9.673202614379084, + "grad_norm": 0.8930032849311829, + "learning_rate": 0.00019981291863263592, + "loss": 0.2492, + "step": 370 + }, + { + "epoch": 9.934640522875817, + "grad_norm": 1.0066367387771606, + "learning_rate": 0.00019980267284282717, + "loss": 0.2467, + "step": 380 + }, + { + "epoch": 10.196078431372548, + "grad_norm": 1.148116111755371, + "learning_rate": 0.00019979215419632182, + "loss": 0.2323, + "step": 390 + }, + { + "epoch": 10.457516339869281, + "grad_norm": 1.03926420211792, + "learning_rate": 0.00019978136272187747, + "loss": 0.237, + "step": 400 + }, + { + "epoch": 10.718954248366012, + "grad_norm": 0.9982312917709351, + "learning_rate": 0.00019977029844899758, + "loss": 0.2597, + "step": 410 + }, + { + "epoch": 10.980392156862745, + "grad_norm": 1.2115322351455688, + "learning_rate": 0.00019975896140793142, + "loss": 0.2409, + "step": 420 + }, + { + "epoch": 11.241830065359476, + "grad_norm": 1.4162884950637817, + "learning_rate": 0.0001997473516296741, + "loss": 0.2381, + "step": 430 + }, + { + "epoch": 11.50326797385621, + "grad_norm": 1.3810129165649414, + "learning_rate": 0.00019973546914596623, + "loss": 0.2491, + "step": 440 + }, + { + "epoch": 11.764705882352942, + "grad_norm": 1.0123716592788696, + "learning_rate": 0.0001997233139892941, + "loss": 0.2401, + "step": 450 + }, + { + "epoch": 12.026143790849673, + "grad_norm": 0.9082147479057312, + "learning_rate": 0.0001997108861928895, + "loss": 0.2382, + "step": 460 + }, + { + "epoch": 12.287581699346406, + "grad_norm": 0.9988301992416382, + "learning_rate": 0.00019969818579072945, + "loss": 0.2217, + "step": 470 + }, + { + "epoch": 12.549019607843137, + "grad_norm": 1.2657634019851685, + "learning_rate": 0.00019968521281753642, + "loss": 0.2417, + "step": 480 + }, + { + "epoch": 12.81045751633987, + "grad_norm": 0.8032001852989197, + "learning_rate": 0.00019967196730877803, + "loss": 0.2309, + "step": 490 + }, + { + "epoch": 13.071895424836601, + "grad_norm": 1.2560818195343018, + "learning_rate": 0.000199658449300667, + "loss": 0.2306, + "step": 500 + }, + { + "epoch": 13.333333333333334, + "grad_norm": 0.9680474996566772, + "learning_rate": 0.000199644658830161, + "loss": 0.217, + "step": 510 + }, + { + "epoch": 13.594771241830065, + "grad_norm": 0.8553510904312134, + "learning_rate": 0.00019963059593496268, + "loss": 0.2362, + "step": 520 + }, + { + "epoch": 13.856209150326798, + "grad_norm": 1.1278111934661865, + "learning_rate": 0.00019961626065351947, + "loss": 0.2252, + "step": 530 + }, + { + "epoch": 14.117647058823529, + "grad_norm": 0.8731440305709839, + "learning_rate": 0.0001996016530250235, + "loss": 0.2101, + "step": 540 + }, + { + "epoch": 14.379084967320262, + "grad_norm": 1.0482535362243652, + "learning_rate": 0.00019958677308941139, + "loss": 0.2113, + "step": 550 + }, + { + "epoch": 14.640522875816993, + "grad_norm": 0.8031530380249023, + "learning_rate": 0.0001995716208873644, + "loss": 0.2285, + "step": 560 + }, + { + "epoch": 14.901960784313726, + "grad_norm": 1.2343863248825073, + "learning_rate": 0.00019955619646030802, + "loss": 0.2225, + "step": 570 + }, + { + "epoch": 15.163398692810457, + "grad_norm": 1.141625165939331, + "learning_rate": 0.00019954049985041204, + "loss": 0.222, + "step": 580 + }, + { + "epoch": 15.42483660130719, + "grad_norm": 1.163808822631836, + "learning_rate": 0.00019952453110059045, + "loss": 0.2099, + "step": 590 + }, + { + "epoch": 15.686274509803921, + "grad_norm": 1.0026309490203857, + "learning_rate": 0.00019950829025450114, + "loss": 0.222, + "step": 600 + }, + { + "epoch": 15.947712418300654, + "grad_norm": 0.9590998888015747, + "learning_rate": 0.00019949177735654601, + "loss": 0.2148, + "step": 610 + }, + { + "epoch": 16.209150326797385, + "grad_norm": 1.1316090822219849, + "learning_rate": 0.00019947499245187068, + "loss": 0.2063, + "step": 620 + }, + { + "epoch": 16.470588235294116, + "grad_norm": 1.1160564422607422, + "learning_rate": 0.00019945793558636437, + "loss": 0.2083, + "step": 630 + }, + { + "epoch": 16.73202614379085, + "grad_norm": 1.0182403326034546, + "learning_rate": 0.00019944060680666002, + "loss": 0.2133, + "step": 640 + }, + { + "epoch": 16.99346405228758, + "grad_norm": 0.9575550556182861, + "learning_rate": 0.0001994230061601338, + "loss": 0.2088, + "step": 650 + }, + { + "epoch": 17.254901960784313, + "grad_norm": 1.2370105981826782, + "learning_rate": 0.00019940513369490516, + "loss": 0.205, + "step": 660 + }, + { + "epoch": 17.516339869281047, + "grad_norm": 0.7969275712966919, + "learning_rate": 0.00019938698945983676, + "loss": 0.2053, + "step": 670 + }, + { + "epoch": 17.77777777777778, + "grad_norm": 1.0018616914749146, + "learning_rate": 0.0001993685735045343, + "loss": 0.2086, + "step": 680 + }, + { + "epoch": 18.03921568627451, + "grad_norm": 0.9241987466812134, + "learning_rate": 0.00019934988587934623, + "loss": 0.2025, + "step": 690 + }, + { + "epoch": 18.30065359477124, + "grad_norm": 1.1506460905075073, + "learning_rate": 0.00019933092663536382, + "loss": 0.1951, + "step": 700 + }, + { + "epoch": 18.562091503267975, + "grad_norm": 1.3489614725112915, + "learning_rate": 0.00019931169582442095, + "loss": 0.2139, + "step": 710 + }, + { + "epoch": 18.823529411764707, + "grad_norm": 1.0618153810501099, + "learning_rate": 0.00019929219349909392, + "loss": 0.2014, + "step": 720 + }, + { + "epoch": 19.084967320261438, + "grad_norm": 1.1984943151474, + "learning_rate": 0.0001992724197127013, + "loss": 0.2084, + "step": 730 + }, + { + "epoch": 19.34640522875817, + "grad_norm": 0.9617899656295776, + "learning_rate": 0.0001992523745193039, + "loss": 0.1913, + "step": 740 + }, + { + "epoch": 19.607843137254903, + "grad_norm": 0.951430082321167, + "learning_rate": 0.0001992320579737045, + "loss": 0.1922, + "step": 750 + }, + { + "epoch": 19.869281045751634, + "grad_norm": 1.0308970212936401, + "learning_rate": 0.0001992114701314478, + "loss": 0.2028, + "step": 760 + }, + { + "epoch": 20.130718954248366, + "grad_norm": 1.298295497894287, + "learning_rate": 0.0001991906110488201, + "loss": 0.1985, + "step": 770 + }, + { + "epoch": 20.392156862745097, + "grad_norm": 0.980803370475769, + "learning_rate": 0.0001991694807828494, + "loss": 0.2017, + "step": 780 + }, + { + "epoch": 20.65359477124183, + "grad_norm": 0.9850193858146667, + "learning_rate": 0.000199148079391305, + "loss": 0.1808, + "step": 790 + }, + { + "epoch": 20.915032679738562, + "grad_norm": 0.8435742855072021, + "learning_rate": 0.00019912640693269752, + "loss": 0.1991, + "step": 800 + }, + { + "epoch": 21.176470588235293, + "grad_norm": 1.2562199831008911, + "learning_rate": 0.00019910446346627862, + "loss": 0.1902, + "step": 810 + }, + { + "epoch": 21.437908496732025, + "grad_norm": 1.2811332941055298, + "learning_rate": 0.0001990822490520409, + "loss": 0.1895, + "step": 820 + }, + { + "epoch": 21.69934640522876, + "grad_norm": 1.185219407081604, + "learning_rate": 0.00019905976375071772, + "loss": 0.1873, + "step": 830 + }, + { + "epoch": 21.96078431372549, + "grad_norm": 1.0126444101333618, + "learning_rate": 0.000199037007623783, + "loss": 0.2022, + "step": 840 + }, + { + "epoch": 22.22222222222222, + "grad_norm": 1.0308198928833008, + "learning_rate": 0.00019901398073345118, + "loss": 0.186, + "step": 850 + }, + { + "epoch": 22.483660130718953, + "grad_norm": 0.9931133389472961, + "learning_rate": 0.00019899068314267688, + "loss": 0.1892, + "step": 860 + }, + { + "epoch": 22.745098039215687, + "grad_norm": 1.1834982633590698, + "learning_rate": 0.00019896711491515482, + "loss": 0.1866, + "step": 870 + }, + { + "epoch": 23.00653594771242, + "grad_norm": 1.0095534324645996, + "learning_rate": 0.0001989432761153196, + "loss": 0.1877, + "step": 880 + }, + { + "epoch": 23.26797385620915, + "grad_norm": 1.0486822128295898, + "learning_rate": 0.00019891916680834566, + "loss": 0.1848, + "step": 890 + }, + { + "epoch": 23.529411764705884, + "grad_norm": 1.0561635494232178, + "learning_rate": 0.00019889478706014687, + "loss": 0.1825, + "step": 900 + }, + { + "epoch": 23.790849673202615, + "grad_norm": 1.0218344926834106, + "learning_rate": 0.00019887013693737653, + "loss": 0.1781, + "step": 910 + }, + { + "epoch": 24.052287581699346, + "grad_norm": 0.907986044883728, + "learning_rate": 0.00019884521650742715, + "loss": 0.1925, + "step": 920 + }, + { + "epoch": 24.313725490196077, + "grad_norm": 1.3268672227859497, + "learning_rate": 0.00019882002583843024, + "loss": 0.1783, + "step": 930 + }, + { + "epoch": 24.575163398692812, + "grad_norm": 0.8679687976837158, + "learning_rate": 0.00019879456499925614, + "loss": 0.1814, + "step": 940 + }, + { + "epoch": 24.836601307189543, + "grad_norm": 1.194421648979187, + "learning_rate": 0.00019876883405951377, + "loss": 0.1818, + "step": 950 + }, + { + "epoch": 25.098039215686274, + "grad_norm": 1.2440810203552246, + "learning_rate": 0.00019874283308955057, + "loss": 0.1841, + "step": 960 + }, + { + "epoch": 25.359477124183005, + "grad_norm": 0.8485437035560608, + "learning_rate": 0.0001987165621604522, + "loss": 0.1721, + "step": 970 + }, + { + "epoch": 25.62091503267974, + "grad_norm": 1.1203727722167969, + "learning_rate": 0.00019869002134404235, + "loss": 0.1772, + "step": 980 + }, + { + "epoch": 25.88235294117647, + "grad_norm": 1.2701900005340576, + "learning_rate": 0.0001986632107128826, + "loss": 0.1823, + "step": 990 + }, + { + "epoch": 26.143790849673202, + "grad_norm": 0.7330073118209839, + "learning_rate": 0.00019863613034027224, + "loss": 0.1762, + "step": 1000 + }, + { + "epoch": 26.405228758169933, + "grad_norm": 1.0160173177719116, + "learning_rate": 0.0001986087803002479, + "loss": 0.1748, + "step": 1010 + }, + { + "epoch": 26.666666666666668, + "grad_norm": 1.2125264406204224, + "learning_rate": 0.00019858116066758362, + "loss": 0.1818, + "step": 1020 + }, + { + "epoch": 26.9281045751634, + "grad_norm": 1.3150966167449951, + "learning_rate": 0.00019855327151779042, + "loss": 0.1791, + "step": 1030 + }, + { + "epoch": 27.18954248366013, + "grad_norm": 0.9498183131217957, + "learning_rate": 0.00019852511292711608, + "loss": 0.1746, + "step": 1040 + }, + { + "epoch": 27.45098039215686, + "grad_norm": 1.101545810699463, + "learning_rate": 0.0001984966849725452, + "loss": 0.17, + "step": 1050 + }, + { + "epoch": 27.712418300653596, + "grad_norm": 0.9787285923957825, + "learning_rate": 0.00019846798773179866, + "loss": 0.1759, + "step": 1060 + }, + { + "epoch": 27.973856209150327, + "grad_norm": 0.9753018021583557, + "learning_rate": 0.00019843902128333367, + "loss": 0.1823, + "step": 1070 + }, + { + "epoch": 28.235294117647058, + "grad_norm": 1.1331627368927002, + "learning_rate": 0.0001984097857063434, + "loss": 0.1682, + "step": 1080 + }, + { + "epoch": 28.49673202614379, + "grad_norm": 1.199524998664856, + "learning_rate": 0.00019838028108075671, + "loss": 0.1735, + "step": 1090 + }, + { + "epoch": 28.758169934640524, + "grad_norm": 0.83395916223526, + "learning_rate": 0.00019835050748723824, + "loss": 0.1683, + "step": 1100 + }, + { + "epoch": 29.019607843137255, + "grad_norm": 0.954365611076355, + "learning_rate": 0.0001983204650071878, + "loss": 0.1764, + "step": 1110 + }, + { + "epoch": 29.281045751633986, + "grad_norm": 1.1084930896759033, + "learning_rate": 0.00019829015372274038, + "loss": 0.1663, + "step": 1120 + }, + { + "epoch": 29.54248366013072, + "grad_norm": 0.9061603546142578, + "learning_rate": 0.00019825957371676588, + "loss": 0.1618, + "step": 1130 + }, + { + "epoch": 29.80392156862745, + "grad_norm": 1.2576953172683716, + "learning_rate": 0.0001982287250728689, + "loss": 0.1773, + "step": 1140 + }, + { + "epoch": 30.065359477124183, + "grad_norm": 0.680230438709259, + "learning_rate": 0.0001981976078753884, + "loss": 0.1749, + "step": 1150 + }, + { + "epoch": 30.326797385620914, + "grad_norm": 1.0407228469848633, + "learning_rate": 0.0001981662222093976, + "loss": 0.1608, + "step": 1160 + }, + { + "epoch": 30.58823529411765, + "grad_norm": 1.0547195672988892, + "learning_rate": 0.0001981345681607038, + "loss": 0.1626, + "step": 1170 + }, + { + "epoch": 30.84967320261438, + "grad_norm": 1.2236053943634033, + "learning_rate": 0.00019810264581584787, + "loss": 0.1701, + "step": 1180 + }, + { + "epoch": 31.11111111111111, + "grad_norm": 1.0089291334152222, + "learning_rate": 0.00019807045526210436, + "loss": 0.1679, + "step": 1190 + }, + { + "epoch": 31.372549019607842, + "grad_norm": 1.067421555519104, + "learning_rate": 0.00019803799658748094, + "loss": 0.1614, + "step": 1200 + }, + { + "epoch": 31.633986928104576, + "grad_norm": 1.1328610181808472, + "learning_rate": 0.00019800526988071843, + "loss": 0.1691, + "step": 1210 + }, + { + "epoch": 31.895424836601308, + "grad_norm": 1.216069221496582, + "learning_rate": 0.0001979722752312904, + "loss": 0.1674, + "step": 1220 + }, + { + "epoch": 32.15686274509804, + "grad_norm": 0.9567670822143555, + "learning_rate": 0.00019793901272940293, + "loss": 0.1592, + "step": 1230 + }, + { + "epoch": 32.41830065359477, + "grad_norm": 0.9017887115478516, + "learning_rate": 0.00019790548246599447, + "loss": 0.1598, + "step": 1240 + }, + { + "epoch": 32.6797385620915, + "grad_norm": 1.1113390922546387, + "learning_rate": 0.00019787168453273544, + "loss": 0.1659, + "step": 1250 + }, + { + "epoch": 32.94117647058823, + "grad_norm": 1.2047311067581177, + "learning_rate": 0.00019783761902202813, + "loss": 0.1682, + "step": 1260 + }, + { + "epoch": 33.20261437908497, + "grad_norm": 1.0487027168273926, + "learning_rate": 0.00019780328602700636, + "loss": 0.16, + "step": 1270 + }, + { + "epoch": 33.4640522875817, + "grad_norm": 1.0085713863372803, + "learning_rate": 0.00019776868564153516, + "loss": 0.1544, + "step": 1280 + }, + { + "epoch": 33.72549019607843, + "grad_norm": 1.049383521080017, + "learning_rate": 0.0001977338179602107, + "loss": 0.1687, + "step": 1290 + }, + { + "epoch": 33.98692810457516, + "grad_norm": 1.0386006832122803, + "learning_rate": 0.00019769868307835994, + "loss": 0.1637, + "step": 1300 + }, + { + "epoch": 34.248366013071895, + "grad_norm": 1.0458704233169556, + "learning_rate": 0.00019766328109204024, + "loss": 0.1578, + "step": 1310 + }, + { + "epoch": 34.509803921568626, + "grad_norm": 1.0181804895401, + "learning_rate": 0.00019762761209803927, + "loss": 0.1556, + "step": 1320 + }, + { + "epoch": 34.77124183006536, + "grad_norm": 0.8453966975212097, + "learning_rate": 0.00019759167619387476, + "loss": 0.1594, + "step": 1330 + }, + { + "epoch": 35.032679738562095, + "grad_norm": 0.8666368126869202, + "learning_rate": 0.00019755547347779403, + "loss": 0.1639, + "step": 1340 + }, + { + "epoch": 35.294117647058826, + "grad_norm": 0.8416698575019836, + "learning_rate": 0.000197519004048774, + "loss": 0.1495, + "step": 1350 + }, + { + "epoch": 35.55555555555556, + "grad_norm": 1.3281161785125732, + "learning_rate": 0.0001974822680065206, + "loss": 0.1574, + "step": 1360 + }, + { + "epoch": 35.81699346405229, + "grad_norm": 0.953856348991394, + "learning_rate": 0.00019744526545146886, + "loss": 0.1614, + "step": 1370 + }, + { + "epoch": 36.07843137254902, + "grad_norm": 0.6768958568572998, + "learning_rate": 0.00019740799648478233, + "loss": 0.1571, + "step": 1380 + }, + { + "epoch": 36.33986928104575, + "grad_norm": 1.09906005859375, + "learning_rate": 0.0001973704612083529, + "loss": 0.1551, + "step": 1390 + }, + { + "epoch": 36.60130718954248, + "grad_norm": 1.172494649887085, + "learning_rate": 0.0001973326597248006, + "loss": 0.1597, + "step": 1400 + }, + { + "epoch": 36.86274509803921, + "grad_norm": 1.0662363767623901, + "learning_rate": 0.00019729459213747326, + "loss": 0.1625, + "step": 1410 + }, + { + "epoch": 37.12418300653595, + "grad_norm": 0.7910565137863159, + "learning_rate": 0.00019725625855044617, + "loss": 0.1504, + "step": 1420 + }, + { + "epoch": 37.38562091503268, + "grad_norm": 1.1636861562728882, + "learning_rate": 0.00019721765906852197, + "loss": 0.1508, + "step": 1430 + }, + { + "epoch": 37.64705882352941, + "grad_norm": 1.0530037879943848, + "learning_rate": 0.00019717879379723012, + "loss": 0.1543, + "step": 1440 + }, + { + "epoch": 37.908496732026144, + "grad_norm": 1.4254627227783203, + "learning_rate": 0.00019713966284282678, + "loss": 0.1624, + "step": 1450 + }, + { + "epoch": 38.169934640522875, + "grad_norm": 1.1266350746154785, + "learning_rate": 0.0001971002663122945, + "loss": 0.1505, + "step": 1460 + }, + { + "epoch": 38.431372549019606, + "grad_norm": 0.8052533864974976, + "learning_rate": 0.00019706060431334187, + "loss": 0.1473, + "step": 1470 + }, + { + "epoch": 38.69281045751634, + "grad_norm": 1.1820138692855835, + "learning_rate": 0.00019702067695440332, + "loss": 0.1512, + "step": 1480 + }, + { + "epoch": 38.95424836601307, + "grad_norm": 1.0051476955413818, + "learning_rate": 0.0001969804843446387, + "loss": 0.164, + "step": 1490 + }, + { + "epoch": 39.21568627450981, + "grad_norm": 1.0447814464569092, + "learning_rate": 0.00019694002659393305, + "loss": 0.1503, + "step": 1500 + }, + { + "epoch": 39.47712418300654, + "grad_norm": 0.8165223598480225, + "learning_rate": 0.00019689930381289634, + "loss": 0.1525, + "step": 1510 + }, + { + "epoch": 39.73856209150327, + "grad_norm": 0.9069657325744629, + "learning_rate": 0.0001968583161128631, + "loss": 0.1539, + "step": 1520 + }, + { + "epoch": 40.0, + "grad_norm": 1.4593009948730469, + "learning_rate": 0.00019681706360589216, + "loss": 0.1572, + "step": 1530 + }, + { + "epoch": 40.26143790849673, + "grad_norm": 1.3013628721237183, + "learning_rate": 0.00019677554640476624, + "loss": 0.1459, + "step": 1540 + }, + { + "epoch": 40.52287581699346, + "grad_norm": 1.1023391485214233, + "learning_rate": 0.00019673376462299184, + "loss": 0.1526, + "step": 1550 + }, + { + "epoch": 40.78431372549019, + "grad_norm": 1.0940853357315063, + "learning_rate": 0.00019669171837479873, + "loss": 0.1581, + "step": 1560 + }, + { + "epoch": 41.04575163398693, + "grad_norm": 0.940166175365448, + "learning_rate": 0.00019664940777513974, + "loss": 0.1492, + "step": 1570 + }, + { + "epoch": 41.30718954248366, + "grad_norm": 0.9569455981254578, + "learning_rate": 0.00019660683293969041, + "loss": 0.1453, + "step": 1580 + }, + { + "epoch": 41.568627450980394, + "grad_norm": 0.7959948182106018, + "learning_rate": 0.0001965639939848488, + "loss": 0.1505, + "step": 1590 + }, + { + "epoch": 41.830065359477125, + "grad_norm": 0.991958498954773, + "learning_rate": 0.00019652089102773488, + "loss": 0.1518, + "step": 1600 + }, + { + "epoch": 42.091503267973856, + "grad_norm": 1.0243059396743774, + "learning_rate": 0.00019647752418619054, + "loss": 0.1515, + "step": 1610 + }, + { + "epoch": 42.35294117647059, + "grad_norm": 0.826440155506134, + "learning_rate": 0.00019643389357877907, + "loss": 0.1465, + "step": 1620 + }, + { + "epoch": 42.61437908496732, + "grad_norm": 0.8773382902145386, + "learning_rate": 0.00019638999932478486, + "loss": 0.1468, + "step": 1630 + }, + { + "epoch": 42.87581699346405, + "grad_norm": 0.7697790861129761, + "learning_rate": 0.00019634584154421317, + "loss": 0.1442, + "step": 1640 + }, + { + "epoch": 43.13725490196079, + "grad_norm": 0.7857301831245422, + "learning_rate": 0.00019630142035778964, + "loss": 0.1473, + "step": 1650 + }, + { + "epoch": 43.39869281045752, + "grad_norm": 0.8533034920692444, + "learning_rate": 0.00019625673588696008, + "loss": 0.1453, + "step": 1660 + }, + { + "epoch": 43.66013071895425, + "grad_norm": 0.8283002376556396, + "learning_rate": 0.0001962117882538902, + "loss": 0.1469, + "step": 1670 + }, + { + "epoch": 43.92156862745098, + "grad_norm": 1.0466629266738892, + "learning_rate": 0.00019616657758146503, + "loss": 0.1501, + "step": 1680 + }, + { + "epoch": 44.18300653594771, + "grad_norm": 0.7576202750205994, + "learning_rate": 0.00019612110399328892, + "loss": 0.1438, + "step": 1690 + }, + { + "epoch": 44.44444444444444, + "grad_norm": 0.8621203303337097, + "learning_rate": 0.00019607536761368484, + "loss": 0.1382, + "step": 1700 + }, + { + "epoch": 44.705882352941174, + "grad_norm": 0.864361047744751, + "learning_rate": 0.0001960293685676943, + "loss": 0.1488, + "step": 1710 + }, + { + "epoch": 44.967320261437905, + "grad_norm": 1.3524795770645142, + "learning_rate": 0.00019598310698107702, + "loss": 0.1501, + "step": 1720 + }, + { + "epoch": 45.22875816993464, + "grad_norm": 0.8673699498176575, + "learning_rate": 0.00019593658298031034, + "loss": 0.1446, + "step": 1730 + }, + { + "epoch": 45.490196078431374, + "grad_norm": 0.7573821544647217, + "learning_rate": 0.0001958897966925891, + "loss": 0.1404, + "step": 1740 + }, + { + "epoch": 45.751633986928105, + "grad_norm": 1.00885009765625, + "learning_rate": 0.0001958427482458253, + "loss": 0.1499, + "step": 1750 + }, + { + "epoch": 46.01307189542484, + "grad_norm": 1.3049975633621216, + "learning_rate": 0.0001957954377686475, + "loss": 0.1494, + "step": 1760 + }, + { + "epoch": 46.27450980392157, + "grad_norm": 0.8838759660720825, + "learning_rate": 0.00019574786539040077, + "loss": 0.1401, + "step": 1770 + }, + { + "epoch": 46.5359477124183, + "grad_norm": 0.7831525206565857, + "learning_rate": 0.00019570003124114619, + "loss": 0.1471, + "step": 1780 + }, + { + "epoch": 46.79738562091503, + "grad_norm": 0.7171520590782166, + "learning_rate": 0.00019565193545166052, + "loss": 0.1414, + "step": 1790 + }, + { + "epoch": 47.05882352941177, + "grad_norm": 0.7733674645423889, + "learning_rate": 0.00019560357815343577, + "loss": 0.1465, + "step": 1800 + }, + { + "epoch": 47.3202614379085, + "grad_norm": 0.833474338054657, + "learning_rate": 0.00019555495947867895, + "loss": 0.1402, + "step": 1810 + }, + { + "epoch": 47.58169934640523, + "grad_norm": 0.8420645594596863, + "learning_rate": 0.0001955060795603117, + "loss": 0.1412, + "step": 1820 + }, + { + "epoch": 47.84313725490196, + "grad_norm": 0.9879257678985596, + "learning_rate": 0.00019545693853196983, + "loss": 0.1444, + "step": 1830 + }, + { + "epoch": 48.10457516339869, + "grad_norm": 0.9052282571792603, + "learning_rate": 0.000195407536528003, + "loss": 0.1433, + "step": 1840 + }, + { + "epoch": 48.36601307189542, + "grad_norm": 1.194151759147644, + "learning_rate": 0.00019535787368347442, + "loss": 0.1419, + "step": 1850 + }, + { + "epoch": 48.627450980392155, + "grad_norm": 1.0415573120117188, + "learning_rate": 0.00019530795013416046, + "loss": 0.1413, + "step": 1860 + }, + { + "epoch": 48.888888888888886, + "grad_norm": 0.9441238641738892, + "learning_rate": 0.00019525776601655014, + "loss": 0.1473, + "step": 1870 + }, + { + "epoch": 49.150326797385624, + "grad_norm": 0.7534403800964355, + "learning_rate": 0.00019520732146784491, + "loss": 0.1386, + "step": 1880 + }, + { + "epoch": 49.411764705882355, + "grad_norm": 1.0693260431289673, + "learning_rate": 0.0001951566166259583, + "loss": 0.1419, + "step": 1890 + }, + { + "epoch": 49.673202614379086, + "grad_norm": 0.9667706489562988, + "learning_rate": 0.00019510565162951537, + "loss": 0.1411, + "step": 1900 + }, + { + "epoch": 49.93464052287582, + "grad_norm": 0.8170807361602783, + "learning_rate": 0.0001950544266178525, + "loss": 0.146, + "step": 1910 + }, + { + "epoch": 50.19607843137255, + "grad_norm": 0.820950448513031, + "learning_rate": 0.00019500294173101687, + "loss": 0.1409, + "step": 1920 + }, + { + "epoch": 50.45751633986928, + "grad_norm": 0.9750927686691284, + "learning_rate": 0.00019495119710976626, + "loss": 0.1381, + "step": 1930 + }, + { + "epoch": 50.71895424836601, + "grad_norm": 0.9925455451011658, + "learning_rate": 0.00019489919289556845, + "loss": 0.1455, + "step": 1940 + }, + { + "epoch": 50.98039215686274, + "grad_norm": 0.7548269033432007, + "learning_rate": 0.00019484692923060095, + "loss": 0.1399, + "step": 1950 + }, + { + "epoch": 51.24183006535948, + "grad_norm": 0.7531190514564514, + "learning_rate": 0.0001947944062577507, + "loss": 0.1343, + "step": 1960 + }, + { + "epoch": 51.50326797385621, + "grad_norm": 0.9674361348152161, + "learning_rate": 0.0001947416241206134, + "loss": 0.1382, + "step": 1970 + }, + { + "epoch": 51.76470588235294, + "grad_norm": 0.9091185331344604, + "learning_rate": 0.0001946885829634935, + "loss": 0.1445, + "step": 1980 + }, + { + "epoch": 52.02614379084967, + "grad_norm": 0.987326979637146, + "learning_rate": 0.00019463528293140345, + "loss": 0.1453, + "step": 1990 + }, + { + "epoch": 52.287581699346404, + "grad_norm": 0.6855144500732422, + "learning_rate": 0.00019458172417006347, + "loss": 0.1349, + "step": 2000 + }, + { + "epoch": 52.549019607843135, + "grad_norm": 0.7305368781089783, + "learning_rate": 0.00019452790682590124, + "loss": 0.1397, + "step": 2010 + }, + { + "epoch": 52.810457516339866, + "grad_norm": 0.9413096904754639, + "learning_rate": 0.00019447383104605125, + "loss": 0.1446, + "step": 2020 + }, + { + "epoch": 53.071895424836605, + "grad_norm": 0.7378990650177002, + "learning_rate": 0.0001944194969783547, + "loss": 0.1409, + "step": 2030 + }, + { + "epoch": 53.333333333333336, + "grad_norm": 0.6865296959877014, + "learning_rate": 0.00019436490477135878, + "loss": 0.1352, + "step": 2040 + }, + { + "epoch": 53.59477124183007, + "grad_norm": 0.71687912940979, + "learning_rate": 0.00019431005457431653, + "loss": 0.1423, + "step": 2050 + }, + { + "epoch": 53.8562091503268, + "grad_norm": 0.7911390662193298, + "learning_rate": 0.0001942549465371863, + "loss": 0.1374, + "step": 2060 + }, + { + "epoch": 54.11764705882353, + "grad_norm": 0.8051833510398865, + "learning_rate": 0.00019419958081063138, + "loss": 0.1363, + "step": 2070 + }, + { + "epoch": 54.37908496732026, + "grad_norm": 0.6623182892799377, + "learning_rate": 0.00019414395754601947, + "loss": 0.1375, + "step": 2080 + }, + { + "epoch": 54.64052287581699, + "grad_norm": 0.8147381544113159, + "learning_rate": 0.00019408807689542257, + "loss": 0.1331, + "step": 2090 + }, + { + "epoch": 54.90196078431372, + "grad_norm": 0.7623370885848999, + "learning_rate": 0.00019403193901161613, + "loss": 0.14, + "step": 2100 + }, + { + "epoch": 55.16339869281046, + "grad_norm": 0.8199501037597656, + "learning_rate": 0.00019397554404807906, + "loss": 0.1389, + "step": 2110 + }, + { + "epoch": 55.42483660130719, + "grad_norm": 1.0729644298553467, + "learning_rate": 0.00019391889215899299, + "loss": 0.1376, + "step": 2120 + }, + { + "epoch": 55.68627450980392, + "grad_norm": 0.6667032241821289, + "learning_rate": 0.00019386198349924207, + "loss": 0.1359, + "step": 2130 + }, + { + "epoch": 55.947712418300654, + "grad_norm": 0.9523484706878662, + "learning_rate": 0.00019380481822441235, + "loss": 0.1423, + "step": 2140 + }, + { + "epoch": 56.209150326797385, + "grad_norm": 0.8002906441688538, + "learning_rate": 0.00019374739649079153, + "loss": 0.1315, + "step": 2150 + }, + { + "epoch": 56.470588235294116, + "grad_norm": 0.6775190830230713, + "learning_rate": 0.00019368971845536845, + "loss": 0.1342, + "step": 2160 + }, + { + "epoch": 56.73202614379085, + "grad_norm": 0.870460569858551, + "learning_rate": 0.00019363178427583266, + "loss": 0.1398, + "step": 2170 + }, + { + "epoch": 56.99346405228758, + "grad_norm": 0.8694108128547668, + "learning_rate": 0.000193573594110574, + "loss": 0.1453, + "step": 2180 + }, + { + "epoch": 57.254901960784316, + "grad_norm": 1.0844203233718872, + "learning_rate": 0.00019351514811868207, + "loss": 0.1336, + "step": 2190 + }, + { + "epoch": 57.51633986928105, + "grad_norm": 0.7382115125656128, + "learning_rate": 0.0001934564464599461, + "loss": 0.1379, + "step": 2200 + }, + { + "epoch": 57.77777777777778, + "grad_norm": 0.996875524520874, + "learning_rate": 0.0001933974892948541, + "loss": 0.1353, + "step": 2210 + }, + { + "epoch": 58.03921568627451, + "grad_norm": 0.7248476147651672, + "learning_rate": 0.0001933382767845928, + "loss": 0.1386, + "step": 2220 + }, + { + "epoch": 58.30065359477124, + "grad_norm": 0.671381413936615, + "learning_rate": 0.00019327880909104683, + "loss": 0.1347, + "step": 2230 + }, + { + "epoch": 58.56209150326797, + "grad_norm": 0.5566993355751038, + "learning_rate": 0.00019321908637679865, + "loss": 0.1365, + "step": 2240 + }, + { + "epoch": 58.8235294117647, + "grad_norm": 0.8637556433677673, + "learning_rate": 0.0001931591088051279, + "loss": 0.1344, + "step": 2250 + }, + { + "epoch": 59.08496732026144, + "grad_norm": 0.6594296097755432, + "learning_rate": 0.00019309887654001096, + "loss": 0.1359, + "step": 2260 + }, + { + "epoch": 59.34640522875817, + "grad_norm": 0.8212260603904724, + "learning_rate": 0.0001930383897461205, + "loss": 0.1286, + "step": 2270 + }, + { + "epoch": 59.6078431372549, + "grad_norm": 0.7467018365859985, + "learning_rate": 0.00019297764858882514, + "loss": 0.1379, + "step": 2280 + }, + { + "epoch": 59.869281045751634, + "grad_norm": 0.7997498512268066, + "learning_rate": 0.0001929166532341889, + "loss": 0.1379, + "step": 2290 + }, + { + "epoch": 60.130718954248366, + "grad_norm": 0.5982789993286133, + "learning_rate": 0.00019285540384897073, + "loss": 0.1323, + "step": 2300 + }, + { + "epoch": 60.3921568627451, + "grad_norm": 0.8748401403427124, + "learning_rate": 0.00019279390060062407, + "loss": 0.1317, + "step": 2310 + }, + { + "epoch": 60.65359477124183, + "grad_norm": 0.965622067451477, + "learning_rate": 0.00019273214365729655, + "loss": 0.1355, + "step": 2320 + }, + { + "epoch": 60.91503267973856, + "grad_norm": 0.5320419073104858, + "learning_rate": 0.0001926701331878292, + "loss": 0.1383, + "step": 2330 + }, + { + "epoch": 61.1764705882353, + "grad_norm": 0.8016908764839172, + "learning_rate": 0.00019260786936175635, + "loss": 0.1356, + "step": 2340 + }, + { + "epoch": 61.43790849673203, + "grad_norm": 0.5881072878837585, + "learning_rate": 0.00019254535234930486, + "loss": 0.1344, + "step": 2350 + }, + { + "epoch": 61.69934640522876, + "grad_norm": 0.7665542960166931, + "learning_rate": 0.00019248258232139388, + "loss": 0.1352, + "step": 2360 + }, + { + "epoch": 61.96078431372549, + "grad_norm": 0.709434449672699, + "learning_rate": 0.0001924195594496343, + "loss": 0.1352, + "step": 2370 + }, + { + "epoch": 62.22222222222222, + "grad_norm": 0.5670029520988464, + "learning_rate": 0.00019235628390632822, + "loss": 0.1282, + "step": 2380 + }, + { + "epoch": 62.48366013071895, + "grad_norm": 0.9107353091239929, + "learning_rate": 0.0001922927558644685, + "loss": 0.1294, + "step": 2390 + }, + { + "epoch": 62.745098039215684, + "grad_norm": 0.6936364769935608, + "learning_rate": 0.00019222897549773848, + "loss": 0.1342, + "step": 2400 + }, + { + "epoch": 63.00653594771242, + "grad_norm": 0.7786808013916016, + "learning_rate": 0.0001921649429805112, + "loss": 0.1399, + "step": 2410 + }, + { + "epoch": 63.26797385620915, + "grad_norm": 0.6474490165710449, + "learning_rate": 0.00019210065848784913, + "loss": 0.1266, + "step": 2420 + }, + { + "epoch": 63.529411764705884, + "grad_norm": 0.8766961097717285, + "learning_rate": 0.00019203612219550358, + "loss": 0.1319, + "step": 2430 + }, + { + "epoch": 63.790849673202615, + "grad_norm": 0.8006402254104614, + "learning_rate": 0.00019197133427991436, + "loss": 0.1338, + "step": 2440 + }, + { + "epoch": 64.05228758169935, + "grad_norm": 0.7504803538322449, + "learning_rate": 0.00019190629491820912, + "loss": 0.1408, + "step": 2450 + }, + { + "epoch": 64.31372549019608, + "grad_norm": 0.814525842666626, + "learning_rate": 0.000191841004288203, + "loss": 0.1326, + "step": 2460 + }, + { + "epoch": 64.57516339869281, + "grad_norm": 0.8137060403823853, + "learning_rate": 0.00019177546256839812, + "loss": 0.1316, + "step": 2470 + }, + { + "epoch": 64.83660130718954, + "grad_norm": 0.7670857310295105, + "learning_rate": 0.000191709669937983, + "loss": 0.1348, + "step": 2480 + }, + { + "epoch": 65.09803921568627, + "grad_norm": 0.646980345249176, + "learning_rate": 0.00019164362657683222, + "loss": 0.1333, + "step": 2490 + }, + { + "epoch": 65.359477124183, + "grad_norm": 1.1511456966400146, + "learning_rate": 0.00019157733266550575, + "loss": 0.1341, + "step": 2500 + }, + { + "epoch": 65.62091503267973, + "grad_norm": 0.6792668104171753, + "learning_rate": 0.00019151078838524867, + "loss": 0.1306, + "step": 2510 + }, + { + "epoch": 65.88235294117646, + "grad_norm": 0.8312354683876038, + "learning_rate": 0.00019144399391799043, + "loss": 0.1354, + "step": 2520 + }, + { + "epoch": 66.14379084967321, + "grad_norm": 0.7580569386482239, + "learning_rate": 0.00019137694944634464, + "loss": 0.1333, + "step": 2530 + }, + { + "epoch": 66.40522875816994, + "grad_norm": 0.7622929811477661, + "learning_rate": 0.0001913096551536083, + "loss": 0.1302, + "step": 2540 + }, + { + "epoch": 66.66666666666667, + "grad_norm": 0.9241070747375488, + "learning_rate": 0.00019124211122376137, + "loss": 0.131, + "step": 2550 + }, + { + "epoch": 66.9281045751634, + "grad_norm": 0.94864821434021, + "learning_rate": 0.00019117431784146645, + "loss": 0.1374, + "step": 2560 + }, + { + "epoch": 67.18954248366013, + "grad_norm": 0.6629276871681213, + "learning_rate": 0.00019110627519206805, + "loss": 0.1281, + "step": 2570 + }, + { + "epoch": 67.45098039215686, + "grad_norm": 0.868118941783905, + "learning_rate": 0.00019103798346159213, + "loss": 0.1308, + "step": 2580 + }, + { + "epoch": 67.7124183006536, + "grad_norm": 0.7864583730697632, + "learning_rate": 0.00019096944283674571, + "loss": 0.1332, + "step": 2590 + }, + { + "epoch": 67.97385620915033, + "grad_norm": 0.7753363251686096, + "learning_rate": 0.00019090065350491626, + "loss": 0.136, + "step": 2600 + }, + { + "epoch": 68.23529411764706, + "grad_norm": 0.4261542856693268, + "learning_rate": 0.00019083161565417115, + "loss": 0.1266, + "step": 2610 + }, + { + "epoch": 68.49673202614379, + "grad_norm": 0.7071658968925476, + "learning_rate": 0.00019076232947325722, + "loss": 0.1277, + "step": 2620 + }, + { + "epoch": 68.75816993464052, + "grad_norm": 0.913390040397644, + "learning_rate": 0.00019069279515160025, + "loss": 0.1335, + "step": 2630 + }, + { + "epoch": 69.01960784313725, + "grad_norm": 0.8911900520324707, + "learning_rate": 0.00019062301287930446, + "loss": 0.1352, + "step": 2640 + }, + { + "epoch": 69.28104575163398, + "grad_norm": 0.7245123982429504, + "learning_rate": 0.00019055298284715192, + "loss": 0.1286, + "step": 2650 + }, + { + "epoch": 69.54248366013071, + "grad_norm": 0.7637701034545898, + "learning_rate": 0.00019048270524660196, + "loss": 0.1325, + "step": 2660 + }, + { + "epoch": 69.80392156862744, + "grad_norm": 0.5611773133277893, + "learning_rate": 0.00019041218026979095, + "loss": 0.132, + "step": 2670 + }, + { + "epoch": 70.06535947712419, + "grad_norm": 0.6525811553001404, + "learning_rate": 0.0001903414081095315, + "loss": 0.1303, + "step": 2680 + }, + { + "epoch": 70.32679738562092, + "grad_norm": 0.5672200322151184, + "learning_rate": 0.0001902703889593119, + "loss": 0.1253, + "step": 2690 + }, + { + "epoch": 70.58823529411765, + "grad_norm": 0.698940098285675, + "learning_rate": 0.00019019912301329592, + "loss": 0.1261, + "step": 2700 + }, + { + "epoch": 70.84967320261438, + "grad_norm": 0.616579532623291, + "learning_rate": 0.0001901276104663218, + "loss": 0.1354, + "step": 2710 + }, + { + "epoch": 71.11111111111111, + "grad_norm": 0.5749228000640869, + "learning_rate": 0.00019005585151390223, + "loss": 0.1297, + "step": 2720 + }, + { + "epoch": 71.37254901960785, + "grad_norm": 0.9096235632896423, + "learning_rate": 0.00018998384635222334, + "loss": 0.1263, + "step": 2730 + }, + { + "epoch": 71.63398692810458, + "grad_norm": 0.8338767290115356, + "learning_rate": 0.0001899115951781446, + "loss": 0.1301, + "step": 2740 + }, + { + "epoch": 71.89542483660131, + "grad_norm": 0.7974913716316223, + "learning_rate": 0.0001898390981891979, + "loss": 0.1337, + "step": 2750 + }, + { + "epoch": 72.15686274509804, + "grad_norm": 0.5519678592681885, + "learning_rate": 0.00018976635558358722, + "loss": 0.1287, + "step": 2760 + }, + { + "epoch": 72.41830065359477, + "grad_norm": 0.7348606586456299, + "learning_rate": 0.00018969336756018815, + "loss": 0.1272, + "step": 2770 + }, + { + "epoch": 72.6797385620915, + "grad_norm": 0.6163354516029358, + "learning_rate": 0.00018962013431854702, + "loss": 0.1294, + "step": 2780 + }, + { + "epoch": 72.94117647058823, + "grad_norm": 0.8160730004310608, + "learning_rate": 0.00018954665605888088, + "loss": 0.1358, + "step": 2790 + }, + { + "epoch": 73.20261437908496, + "grad_norm": 0.6369748711585999, + "learning_rate": 0.00018947293298207635, + "loss": 0.1265, + "step": 2800 + }, + { + "epoch": 73.4640522875817, + "grad_norm": 0.8559942245483398, + "learning_rate": 0.0001893989652896896, + "loss": 0.1301, + "step": 2810 + }, + { + "epoch": 73.72549019607843, + "grad_norm": 0.5984659790992737, + "learning_rate": 0.0001893247531839454, + "loss": 0.1317, + "step": 2820 + }, + { + "epoch": 73.98692810457516, + "grad_norm": 0.7031850814819336, + "learning_rate": 0.0001892502968677369, + "loss": 0.1331, + "step": 2830 + }, + { + "epoch": 74.2483660130719, + "grad_norm": 0.7341207265853882, + "learning_rate": 0.00018917559654462474, + "loss": 0.1268, + "step": 2840 + }, + { + "epoch": 74.50980392156863, + "grad_norm": 0.6445427536964417, + "learning_rate": 0.0001891006524188368, + "loss": 0.1271, + "step": 2850 + }, + { + "epoch": 74.77124183006536, + "grad_norm": 0.7867002487182617, + "learning_rate": 0.00018902546469526743, + "loss": 0.134, + "step": 2860 + }, + { + "epoch": 75.0326797385621, + "grad_norm": 0.5532680153846741, + "learning_rate": 0.00018895003357947705, + "loss": 0.1271, + "step": 2870 + }, + { + "epoch": 75.29411764705883, + "grad_norm": 0.6302035450935364, + "learning_rate": 0.00018887435927769137, + "loss": 0.1254, + "step": 2880 + }, + { + "epoch": 75.55555555555556, + "grad_norm": 0.6897308826446533, + "learning_rate": 0.0001887984419968011, + "loss": 0.1271, + "step": 2890 + }, + { + "epoch": 75.81699346405229, + "grad_norm": 0.5863034725189209, + "learning_rate": 0.0001887222819443612, + "loss": 0.1258, + "step": 2900 + }, + { + "epoch": 76.07843137254902, + "grad_norm": 0.6140713095664978, + "learning_rate": 0.00018864587932859028, + "loss": 0.1323, + "step": 2910 + }, + { + "epoch": 76.33986928104575, + "grad_norm": 0.8416547775268555, + "learning_rate": 0.00018856923435837022, + "loss": 0.1254, + "step": 2920 + }, + { + "epoch": 76.60130718954248, + "grad_norm": 0.6811145544052124, + "learning_rate": 0.00018849234724324543, + "loss": 0.1269, + "step": 2930 + }, + { + "epoch": 76.86274509803921, + "grad_norm": 0.7164523601531982, + "learning_rate": 0.00018841521819342236, + "loss": 0.1328, + "step": 2940 + }, + { + "epoch": 77.12418300653594, + "grad_norm": 0.6024401783943176, + "learning_rate": 0.0001883378474197689, + "loss": 0.1281, + "step": 2950 + }, + { + "epoch": 77.38562091503267, + "grad_norm": 0.6707399487495422, + "learning_rate": 0.0001882602351338137, + "loss": 0.1278, + "step": 2960 + }, + { + "epoch": 77.6470588235294, + "grad_norm": 0.7291834354400635, + "learning_rate": 0.00018818238154774588, + "loss": 0.1306, + "step": 2970 + }, + { + "epoch": 77.90849673202614, + "grad_norm": 0.8068193793296814, + "learning_rate": 0.00018810428687441414, + "loss": 0.1321, + "step": 2980 + }, + { + "epoch": 78.16993464052288, + "grad_norm": 0.5824514627456665, + "learning_rate": 0.00018802595132732635, + "loss": 0.1228, + "step": 2990 + }, + { + "epoch": 78.43137254901961, + "grad_norm": 0.7537580132484436, + "learning_rate": 0.0001879473751206489, + "loss": 0.1255, + "step": 3000 + }, + { + "epoch": 78.69281045751634, + "grad_norm": 0.7544134259223938, + "learning_rate": 0.00018786855846920615, + "loss": 0.1286, + "step": 3010 + }, + { + "epoch": 78.95424836601308, + "grad_norm": 0.695929229259491, + "learning_rate": 0.00018778950158847976, + "loss": 0.1332, + "step": 3020 + }, + { + "epoch": 79.2156862745098, + "grad_norm": 0.5690885782241821, + "learning_rate": 0.0001877102046946083, + "loss": 0.1209, + "step": 3030 + }, + { + "epoch": 79.47712418300654, + "grad_norm": 0.7582102417945862, + "learning_rate": 0.00018763066800438636, + "loss": 0.1267, + "step": 3040 + }, + { + "epoch": 79.73856209150327, + "grad_norm": 0.48170891404151917, + "learning_rate": 0.0001875508917352643, + "loss": 0.1304, + "step": 3050 + }, + { + "epoch": 80.0, + "grad_norm": 0.7321044206619263, + "learning_rate": 0.00018747087610534736, + "loss": 0.1319, + "step": 3060 + }, + { + "epoch": 80.26143790849673, + "grad_norm": 0.6163921356201172, + "learning_rate": 0.00018739062133339517, + "loss": 0.1239, + "step": 3070 + }, + { + "epoch": 80.52287581699346, + "grad_norm": 0.8649224042892456, + "learning_rate": 0.00018731012763882133, + "loss": 0.1294, + "step": 3080 + }, + { + "epoch": 80.7843137254902, + "grad_norm": 0.635418713092804, + "learning_rate": 0.0001872293952416924, + "loss": 0.1274, + "step": 3090 + }, + { + "epoch": 81.04575163398692, + "grad_norm": 0.5143495202064514, + "learning_rate": 0.00018714842436272773, + "loss": 0.1272, + "step": 3100 + }, + { + "epoch": 81.30718954248366, + "grad_norm": 0.5938199758529663, + "learning_rate": 0.00018706721522329862, + "loss": 0.1274, + "step": 3110 + }, + { + "epoch": 81.56862745098039, + "grad_norm": 0.6700434684753418, + "learning_rate": 0.00018698576804542777, + "loss": 0.1239, + "step": 3120 + }, + { + "epoch": 81.83006535947712, + "grad_norm": 0.6935993432998657, + "learning_rate": 0.00018690408305178856, + "loss": 0.129, + "step": 3130 + }, + { + "epoch": 82.09150326797386, + "grad_norm": 0.616468071937561, + "learning_rate": 0.00018682216046570475, + "loss": 0.1309, + "step": 3140 + }, + { + "epoch": 82.3529411764706, + "grad_norm": 0.6784673929214478, + "learning_rate": 0.00018674000051114952, + "loss": 0.1256, + "step": 3150 + }, + { + "epoch": 82.61437908496733, + "grad_norm": 0.6315344572067261, + "learning_rate": 0.00018665760341274505, + "loss": 0.1248, + "step": 3160 + }, + { + "epoch": 82.87581699346406, + "grad_norm": 0.5992943644523621, + "learning_rate": 0.00018657496939576186, + "loss": 0.1289, + "step": 3170 + }, + { + "epoch": 83.13725490196079, + "grad_norm": 0.5748761892318726, + "learning_rate": 0.0001864920986861182, + "loss": 0.13, + "step": 3180 + }, + { + "epoch": 83.39869281045752, + "grad_norm": 0.4966670572757721, + "learning_rate": 0.00018640899151037944, + "loss": 0.125, + "step": 3190 + }, + { + "epoch": 83.66013071895425, + "grad_norm": 0.7322400212287903, + "learning_rate": 0.00018632564809575742, + "loss": 0.1296, + "step": 3200 + }, + { + "epoch": 83.92156862745098, + "grad_norm": 0.8756809234619141, + "learning_rate": 0.0001862420686701098, + "loss": 0.1258, + "step": 3210 + }, + { + "epoch": 84.18300653594771, + "grad_norm": 0.6855464577674866, + "learning_rate": 0.0001861582534619396, + "loss": 0.1293, + "step": 3220 + }, + { + "epoch": 84.44444444444444, + "grad_norm": 0.5204270482063293, + "learning_rate": 0.0001860742027003944, + "loss": 0.1239, + "step": 3230 + }, + { + "epoch": 84.70588235294117, + "grad_norm": 0.6892998814582825, + "learning_rate": 0.00018598991661526572, + "loss": 0.1276, + "step": 3240 + }, + { + "epoch": 84.9673202614379, + "grad_norm": 0.5613439083099365, + "learning_rate": 0.00018590539543698854, + "loss": 0.1291, + "step": 3250 + }, + { + "epoch": 85.22875816993464, + "grad_norm": 0.6739300489425659, + "learning_rate": 0.0001858206393966405, + "loss": 0.1264, + "step": 3260 + }, + { + "epoch": 85.49019607843137, + "grad_norm": 0.6257854700088501, + "learning_rate": 0.00018573564872594145, + "loss": 0.1228, + "step": 3270 + }, + { + "epoch": 85.7516339869281, + "grad_norm": 0.7718020081520081, + "learning_rate": 0.00018565042365725258, + "loss": 0.1272, + "step": 3280 + }, + { + "epoch": 86.01307189542484, + "grad_norm": 0.599905252456665, + "learning_rate": 0.00018556496442357595, + "loss": 0.131, + "step": 3290 + }, + { + "epoch": 86.27450980392157, + "grad_norm": 0.4919801950454712, + "learning_rate": 0.0001854792712585539, + "loss": 0.1187, + "step": 3300 + }, + { + "epoch": 86.5359477124183, + "grad_norm": 0.7735366821289062, + "learning_rate": 0.00018539334439646824, + "loss": 0.1292, + "step": 3310 + }, + { + "epoch": 86.79738562091504, + "grad_norm": 0.5821804404258728, + "learning_rate": 0.00018530718407223974, + "loss": 0.1241, + "step": 3320 + }, + { + "epoch": 87.05882352941177, + "grad_norm": 0.5197769403457642, + "learning_rate": 0.00018522079052142747, + "loss": 0.1327, + "step": 3330 + }, + { + "epoch": 87.3202614379085, + "grad_norm": 0.728046715259552, + "learning_rate": 0.00018513416398022802, + "loss": 0.125, + "step": 3340 + }, + { + "epoch": 87.58169934640523, + "grad_norm": 0.5133826732635498, + "learning_rate": 0.0001850473046854751, + "loss": 0.125, + "step": 3350 + }, + { + "epoch": 87.84313725490196, + "grad_norm": 0.5190628170967102, + "learning_rate": 0.0001849602128746387, + "loss": 0.1266, + "step": 3360 + }, + { + "epoch": 88.10457516339869, + "grad_norm": 0.5673719048500061, + "learning_rate": 0.00018487288878582447, + "loss": 0.1264, + "step": 3370 + }, + { + "epoch": 88.36601307189542, + "grad_norm": 0.5447489023208618, + "learning_rate": 0.00018478533265777318, + "loss": 0.1227, + "step": 3380 + }, + { + "epoch": 88.62745098039215, + "grad_norm": 0.7577037811279297, + "learning_rate": 0.00018469754472985993, + "loss": 0.1301, + "step": 3390 + }, + { + "epoch": 88.88888888888889, + "grad_norm": 0.6202467083930969, + "learning_rate": 0.00018460952524209355, + "loss": 0.1282, + "step": 3400 + }, + { + "epoch": 89.15032679738562, + "grad_norm": 0.5380336046218872, + "learning_rate": 0.00018452127443511598, + "loss": 0.1208, + "step": 3410 + }, + { + "epoch": 89.41176470588235, + "grad_norm": 0.6075873970985413, + "learning_rate": 0.00018443279255020152, + "loss": 0.1253, + "step": 3420 + }, + { + "epoch": 89.67320261437908, + "grad_norm": 0.6594501733779907, + "learning_rate": 0.0001843440798292563, + "loss": 0.1292, + "step": 3430 + }, + { + "epoch": 89.93464052287581, + "grad_norm": 0.8450397849082947, + "learning_rate": 0.00018425513651481747, + "loss": 0.1272, + "step": 3440 + }, + { + "epoch": 90.19607843137256, + "grad_norm": 0.5860391855239868, + "learning_rate": 0.00018416596285005272, + "loss": 0.1235, + "step": 3450 + }, + { + "epoch": 90.45751633986929, + "grad_norm": 1.0777153968811035, + "learning_rate": 0.0001840765590787594, + "loss": 0.1236, + "step": 3460 + }, + { + "epoch": 90.71895424836602, + "grad_norm": 0.5201640129089355, + "learning_rate": 0.00018398692544536397, + "loss": 0.1257, + "step": 3470 + }, + { + "epoch": 90.98039215686275, + "grad_norm": 0.6437315344810486, + "learning_rate": 0.00018389706219492147, + "loss": 0.1309, + "step": 3480 + }, + { + "epoch": 91.24183006535948, + "grad_norm": 0.7315922975540161, + "learning_rate": 0.00018380696957311449, + "loss": 0.1202, + "step": 3490 + }, + { + "epoch": 91.50326797385621, + "grad_norm": 0.6596841216087341, + "learning_rate": 0.00018371664782625287, + "loss": 0.123, + "step": 3500 + }, + { + "epoch": 91.76470588235294, + "grad_norm": 0.7001237869262695, + "learning_rate": 0.0001836260972012728, + "loss": 0.131, + "step": 3510 + }, + { + "epoch": 92.02614379084967, + "grad_norm": 0.5139759182929993, + "learning_rate": 0.00018353531794573625, + "loss": 0.1269, + "step": 3520 + }, + { + "epoch": 92.2875816993464, + "grad_norm": 0.5682742595672607, + "learning_rate": 0.0001834443103078302, + "loss": 0.1219, + "step": 3530 + }, + { + "epoch": 92.54901960784314, + "grad_norm": 0.5991658568382263, + "learning_rate": 0.0001833530745363661, + "loss": 0.1253, + "step": 3540 + }, + { + "epoch": 92.81045751633987, + "grad_norm": 0.600954532623291, + "learning_rate": 0.00018326161088077903, + "loss": 0.1286, + "step": 3550 + }, + { + "epoch": 93.0718954248366, + "grad_norm": 0.6942703723907471, + "learning_rate": 0.00018316991959112716, + "loss": 0.129, + "step": 3560 + }, + { + "epoch": 93.33333333333333, + "grad_norm": 0.723839282989502, + "learning_rate": 0.00018307800091809097, + "loss": 0.1243, + "step": 3570 + }, + { + "epoch": 93.59477124183006, + "grad_norm": 0.8791704773902893, + "learning_rate": 0.0001829858551129726, + "loss": 0.1266, + "step": 3580 + }, + { + "epoch": 93.85620915032679, + "grad_norm": 0.6914976835250854, + "learning_rate": 0.00018289348242769515, + "loss": 0.1252, + "step": 3590 + }, + { + "epoch": 94.11764705882354, + "grad_norm": 0.40806615352630615, + "learning_rate": 0.00018280088311480201, + "loss": 0.1245, + "step": 3600 + }, + { + "epoch": 94.37908496732027, + "grad_norm": 0.47459840774536133, + "learning_rate": 0.00018270805742745617, + "loss": 0.124, + "step": 3610 + }, + { + "epoch": 94.640522875817, + "grad_norm": 0.5964429378509521, + "learning_rate": 0.00018261500561943955, + "loss": 0.1247, + "step": 3620 + }, + { + "epoch": 94.90196078431373, + "grad_norm": 0.5425547957420349, + "learning_rate": 0.00018252172794515223, + "loss": 0.1291, + "step": 3630 + }, + { + "epoch": 95.16339869281046, + "grad_norm": 0.6276856660842896, + "learning_rate": 0.00018242822465961176, + "loss": 0.1291, + "step": 3640 + }, + { + "epoch": 95.42483660130719, + "grad_norm": 1.3327772617340088, + "learning_rate": 0.00018233449601845258, + "loss": 0.1209, + "step": 3650 + }, + { + "epoch": 95.68627450980392, + "grad_norm": 0.9959401488304138, + "learning_rate": 0.00018224054227792524, + "loss": 0.1237, + "step": 3660 + }, + { + "epoch": 95.94771241830065, + "grad_norm": 0.6117873787879944, + "learning_rate": 0.00018214636369489563, + "loss": 0.1336, + "step": 3670 + }, + { + "epoch": 96.20915032679738, + "grad_norm": 0.5268058776855469, + "learning_rate": 0.00018205196052684445, + "loss": 0.1212, + "step": 3680 + }, + { + "epoch": 96.47058823529412, + "grad_norm": 0.6816233396530151, + "learning_rate": 0.00018195733303186633, + "loss": 0.1291, + "step": 3690 + }, + { + "epoch": 96.73202614379085, + "grad_norm": 0.6062767505645752, + "learning_rate": 0.00018186248146866927, + "loss": 0.1248, + "step": 3700 + }, + { + "epoch": 96.99346405228758, + "grad_norm": 0.9882863759994507, + "learning_rate": 0.0001817674060965737, + "loss": 0.1297, + "step": 3710 + }, + { + "epoch": 97.25490196078431, + "grad_norm": 0.5438330769538879, + "learning_rate": 0.00018167210717551224, + "loss": 0.1207, + "step": 3720 + }, + { + "epoch": 97.51633986928104, + "grad_norm": 0.5340378284454346, + "learning_rate": 0.00018157658496602833, + "loss": 0.1219, + "step": 3730 + }, + { + "epoch": 97.77777777777777, + "grad_norm": 0.6972679495811462, + "learning_rate": 0.00018148083972927616, + "loss": 0.1272, + "step": 3740 + }, + { + "epoch": 98.03921568627452, + "grad_norm": 0.5716882348060608, + "learning_rate": 0.0001813848717270195, + "loss": 0.1293, + "step": 3750 + }, + { + "epoch": 98.30065359477125, + "grad_norm": 0.6160675883293152, + "learning_rate": 0.00018128868122163123, + "loss": 0.1231, + "step": 3760 + }, + { + "epoch": 98.56209150326798, + "grad_norm": 0.6029688715934753, + "learning_rate": 0.00018119226847609245, + "loss": 0.1243, + "step": 3770 + }, + { + "epoch": 98.82352941176471, + "grad_norm": 0.48037102818489075, + "learning_rate": 0.000181095633753992, + "loss": 0.1244, + "step": 3780 + }, + { + "epoch": 99.08496732026144, + "grad_norm": 0.5181258916854858, + "learning_rate": 0.0001809987773195255, + "loss": 0.1293, + "step": 3790 + }, + { + "epoch": 99.34640522875817, + "grad_norm": 0.5500502586364746, + "learning_rate": 0.00018090169943749476, + "loss": 0.1223, + "step": 3800 + }, + { + "epoch": 99.6078431372549, + "grad_norm": 0.5222991704940796, + "learning_rate": 0.00018080440037330695, + "loss": 0.1234, + "step": 3810 + }, + { + "epoch": 99.86928104575163, + "grad_norm": 0.44636550545692444, + "learning_rate": 0.00018070688039297403, + "loss": 0.1229, + "step": 3820 + }, + { + "epoch": 100.13071895424837, + "grad_norm": 0.6223607063293457, + "learning_rate": 0.00018060913976311192, + "loss": 0.1248, + "step": 3830 + }, + { + "epoch": 100.3921568627451, + "grad_norm": 0.5242628455162048, + "learning_rate": 0.00018051117875093976, + "loss": 0.1228, + "step": 3840 + }, + { + "epoch": 100.65359477124183, + "grad_norm": 0.5237357020378113, + "learning_rate": 0.00018041299762427916, + "loss": 0.1249, + "step": 3850 + }, + { + "epoch": 100.91503267973856, + "grad_norm": 0.48059752583503723, + "learning_rate": 0.00018031459665155363, + "loss": 0.1244, + "step": 3860 + }, + { + "epoch": 101.17647058823529, + "grad_norm": 0.6612676382064819, + "learning_rate": 0.00018021597610178768, + "loss": 0.1232, + "step": 3870 + }, + { + "epoch": 101.43790849673202, + "grad_norm": 0.5979042053222656, + "learning_rate": 0.00018011713624460608, + "loss": 0.1238, + "step": 3880 + }, + { + "epoch": 101.69934640522875, + "grad_norm": 0.5387808084487915, + "learning_rate": 0.0001800180773502333, + "loss": 0.1256, + "step": 3890 + }, + { + "epoch": 101.96078431372548, + "grad_norm": 0.526494026184082, + "learning_rate": 0.0001799187996894925, + "loss": 0.1237, + "step": 3900 + }, + { + "epoch": 102.22222222222223, + "grad_norm": 0.5353228449821472, + "learning_rate": 0.00017981930353380503, + "loss": 0.1196, + "step": 3910 + }, + { + "epoch": 102.48366013071896, + "grad_norm": 0.4083467125892639, + "learning_rate": 0.0001797195891551896, + "loss": 0.1242, + "step": 3920 + }, + { + "epoch": 102.74509803921569, + "grad_norm": 0.4955314099788666, + "learning_rate": 0.0001796196568262615, + "loss": 0.1273, + "step": 3930 + }, + { + "epoch": 103.00653594771242, + "grad_norm": 0.6663371920585632, + "learning_rate": 0.00017951950682023191, + "loss": 0.1271, + "step": 3940 + }, + { + "epoch": 103.26797385620915, + "grad_norm": 0.5380375385284424, + "learning_rate": 0.0001794191394109071, + "loss": 0.1197, + "step": 3950 + }, + { + "epoch": 103.52941176470588, + "grad_norm": 0.5058585405349731, + "learning_rate": 0.00017931855487268782, + "loss": 0.1253, + "step": 3960 + }, + { + "epoch": 103.79084967320262, + "grad_norm": 0.4719361364841461, + "learning_rate": 0.0001792177534805682, + "loss": 0.1221, + "step": 3970 + }, + { + "epoch": 104.05228758169935, + "grad_norm": 0.5762635469436646, + "learning_rate": 0.00017911673551013551, + "loss": 0.1273, + "step": 3980 + }, + { + "epoch": 104.31372549019608, + "grad_norm": 0.5460364818572998, + "learning_rate": 0.00017901550123756906, + "loss": 0.1207, + "step": 3990 + }, + { + "epoch": 104.57516339869281, + "grad_norm": 0.6880167722702026, + "learning_rate": 0.00017891405093963938, + "loss": 0.123, + "step": 4000 + }, + { + "epoch": 104.83660130718954, + "grad_norm": 0.5890039205551147, + "learning_rate": 0.00017881238489370776, + "loss": 0.1261, + "step": 4010 + }, + { + "epoch": 105.09803921568627, + "grad_norm": 0.5330226421356201, + "learning_rate": 0.00017871050337772525, + "loss": 0.1257, + "step": 4020 + }, + { + "epoch": 105.359477124183, + "grad_norm": 0.7687032222747803, + "learning_rate": 0.00017860840667023212, + "loss": 0.1191, + "step": 4030 + }, + { + "epoch": 105.62091503267973, + "grad_norm": 0.5523906946182251, + "learning_rate": 0.0001785060950503568, + "loss": 0.1222, + "step": 4040 + }, + { + "epoch": 105.88235294117646, + "grad_norm": 0.710148811340332, + "learning_rate": 0.0001784035687978153, + "loss": 0.1285, + "step": 4050 + }, + { + "epoch": 106.14379084967321, + "grad_norm": 0.5394968390464783, + "learning_rate": 0.0001783008281929106, + "loss": 0.1236, + "step": 4060 + }, + { + "epoch": 106.40522875816994, + "grad_norm": 0.9128403663635254, + "learning_rate": 0.0001781978735165315, + "loss": 0.123, + "step": 4070 + }, + { + "epoch": 106.66666666666667, + "grad_norm": 0.6581969857215881, + "learning_rate": 0.0001780947050501522, + "loss": 0.1233, + "step": 4080 + }, + { + "epoch": 106.9281045751634, + "grad_norm": 0.7595931887626648, + "learning_rate": 0.00017799132307583132, + "loss": 0.1265, + "step": 4090 + }, + { + "epoch": 107.18954248366013, + "grad_norm": 0.4747548997402191, + "learning_rate": 0.00017788772787621126, + "loss": 0.1205, + "step": 4100 + }, + { + "epoch": 107.45098039215686, + "grad_norm": 0.6191473603248596, + "learning_rate": 0.0001777839197345173, + "loss": 0.1227, + "step": 4110 + }, + { + "epoch": 107.7124183006536, + "grad_norm": 0.6291956305503845, + "learning_rate": 0.00017767989893455698, + "loss": 0.1241, + "step": 4120 + }, + { + "epoch": 107.97385620915033, + "grad_norm": 0.8078779578208923, + "learning_rate": 0.00017757566576071914, + "loss": 0.1276, + "step": 4130 + }, + { + "epoch": 108.23529411764706, + "grad_norm": 0.534305989742279, + "learning_rate": 0.00017747122049797335, + "loss": 0.1237, + "step": 4140 + }, + { + "epoch": 108.49673202614379, + "grad_norm": 0.6715922951698303, + "learning_rate": 0.00017736656343186896, + "loss": 0.1222, + "step": 4150 + }, + { + "epoch": 108.75816993464052, + "grad_norm": 0.4797965884208679, + "learning_rate": 0.00017726169484853438, + "loss": 0.1233, + "step": 4160 + }, + { + "epoch": 109.01960784313725, + "grad_norm": 0.5531324148178101, + "learning_rate": 0.0001771566150346763, + "loss": 0.1248, + "step": 4170 + }, + { + "epoch": 109.28104575163398, + "grad_norm": 0.6008434891700745, + "learning_rate": 0.00017705132427757895, + "loss": 0.1191, + "step": 4180 + }, + { + "epoch": 109.54248366013071, + "grad_norm": 0.4768673777580261, + "learning_rate": 0.0001769458228651032, + "loss": 0.1221, + "step": 4190 + }, + { + "epoch": 109.80392156862744, + "grad_norm": 0.6039496064186096, + "learning_rate": 0.00017684011108568592, + "loss": 0.1272, + "step": 4200 + }, + { + "epoch": 110.06535947712419, + "grad_norm": 0.4699447453022003, + "learning_rate": 0.00017673418922833903, + "loss": 0.124, + "step": 4210 + }, + { + "epoch": 110.32679738562092, + "grad_norm": 0.5179737210273743, + "learning_rate": 0.00017662805758264893, + "loss": 0.1216, + "step": 4220 + }, + { + "epoch": 110.58823529411765, + "grad_norm": 0.5560812950134277, + "learning_rate": 0.0001765217164387754, + "loss": 0.1254, + "step": 4230 + }, + { + "epoch": 110.84967320261438, + "grad_norm": 0.5947309732437134, + "learning_rate": 0.00017641516608745114, + "loss": 0.126, + "step": 4240 + }, + { + "epoch": 111.11111111111111, + "grad_norm": 0.5091108083724976, + "learning_rate": 0.00017630840681998066, + "loss": 0.1253, + "step": 4250 + }, + { + "epoch": 111.37254901960785, + "grad_norm": 0.6985629796981812, + "learning_rate": 0.00017620143892823977, + "loss": 0.1234, + "step": 4260 + }, + { + "epoch": 111.63398692810458, + "grad_norm": 0.4949619770050049, + "learning_rate": 0.00017609426270467462, + "loss": 0.1243, + "step": 4270 + }, + { + "epoch": 111.89542483660131, + "grad_norm": 0.6455708742141724, + "learning_rate": 0.00017598687844230088, + "loss": 0.1268, + "step": 4280 + }, + { + "epoch": 112.15686274509804, + "grad_norm": 0.45366206765174866, + "learning_rate": 0.000175879286434703, + "loss": 0.1218, + "step": 4290 + }, + { + "epoch": 112.41830065359477, + "grad_norm": 0.4446161687374115, + "learning_rate": 0.0001757714869760335, + "loss": 0.1159, + "step": 4300 + }, + { + "epoch": 112.6797385620915, + "grad_norm": 0.5011366009712219, + "learning_rate": 0.00017566348036101187, + "loss": 0.1237, + "step": 4310 + }, + { + "epoch": 112.94117647058823, + "grad_norm": 0.6293872594833374, + "learning_rate": 0.0001755552668849242, + "loss": 0.1261, + "step": 4320 + }, + { + "epoch": 113.20261437908496, + "grad_norm": 0.5598844885826111, + "learning_rate": 0.0001754468468436219, + "loss": 0.1257, + "step": 4330 + }, + { + "epoch": 113.4640522875817, + "grad_norm": 0.4953804314136505, + "learning_rate": 0.00017533822053352128, + "loss": 0.1193, + "step": 4340 + }, + { + "epoch": 113.72549019607843, + "grad_norm": 0.4813307225704193, + "learning_rate": 0.0001752293882516025, + "loss": 0.1203, + "step": 4350 + }, + { + "epoch": 113.98692810457516, + "grad_norm": 0.5343033075332642, + "learning_rate": 0.00017512035029540885, + "loss": 0.1287, + "step": 4360 + }, + { + "epoch": 114.2483660130719, + "grad_norm": 0.3980255722999573, + "learning_rate": 0.00017501110696304596, + "loss": 0.1194, + "step": 4370 + }, + { + "epoch": 114.50980392156863, + "grad_norm": 0.49828794598579407, + "learning_rate": 0.00017490165855318094, + "loss": 0.1225, + "step": 4380 + }, + { + "epoch": 114.77124183006536, + "grad_norm": 0.5137373208999634, + "learning_rate": 0.00017479200536504156, + "loss": 0.1246, + "step": 4390 + }, + { + "epoch": 115.0326797385621, + "grad_norm": 0.4799012839794159, + "learning_rate": 0.0001746821476984154, + "loss": 0.1267, + "step": 4400 + }, + { + "epoch": 115.29411764705883, + "grad_norm": 0.4675128757953644, + "learning_rate": 0.00017457208585364918, + "loss": 0.1212, + "step": 4410 + }, + { + "epoch": 115.55555555555556, + "grad_norm": 0.5032903552055359, + "learning_rate": 0.00017446182013164778, + "loss": 0.1233, + "step": 4420 + }, + { + "epoch": 115.81699346405229, + "grad_norm": 0.5266563892364502, + "learning_rate": 0.00017435135083387345, + "loss": 0.1259, + "step": 4430 + }, + { + "epoch": 116.07843137254902, + "grad_norm": 0.5633729696273804, + "learning_rate": 0.000174240678262345, + "loss": 0.1229, + "step": 4440 + }, + { + "epoch": 116.33986928104575, + "grad_norm": 0.6905789375305176, + "learning_rate": 0.0001741298027196371, + "loss": 0.1198, + "step": 4450 + }, + { + "epoch": 116.60130718954248, + "grad_norm": 0.39248159527778625, + "learning_rate": 0.00017401872450887917, + "loss": 0.1201, + "step": 4460 + }, + { + "epoch": 116.86274509803921, + "grad_norm": 0.48288437724113464, + "learning_rate": 0.00017390744393375486, + "loss": 0.1258, + "step": 4470 + }, + { + "epoch": 117.12418300653594, + "grad_norm": 0.5383310914039612, + "learning_rate": 0.00017379596129850098, + "loss": 0.1219, + "step": 4480 + }, + { + "epoch": 117.38562091503267, + "grad_norm": 0.5818719267845154, + "learning_rate": 0.00017368427690790677, + "loss": 0.122, + "step": 4490 + }, + { + "epoch": 117.6470588235294, + "grad_norm": 0.6253119111061096, + "learning_rate": 0.00017357239106731317, + "loss": 0.1218, + "step": 4500 + }, + { + "epoch": 117.90849673202614, + "grad_norm": 0.560431182384491, + "learning_rate": 0.00017346030408261172, + "loss": 0.1254, + "step": 4510 + }, + { + "epoch": 118.16993464052288, + "grad_norm": 0.4778011441230774, + "learning_rate": 0.000173348016260244, + "loss": 0.1264, + "step": 4520 + }, + { + "epoch": 118.43137254901961, + "grad_norm": 0.44935017824172974, + "learning_rate": 0.00017323552790720058, + "loss": 0.1194, + "step": 4530 + }, + { + "epoch": 118.69281045751634, + "grad_norm": 0.5576562881469727, + "learning_rate": 0.00017312283933102038, + "loss": 0.1262, + "step": 4540 + }, + { + "epoch": 118.95424836601308, + "grad_norm": 0.43395355343818665, + "learning_rate": 0.00017300995083978965, + "loss": 0.1206, + "step": 4550 + }, + { + "epoch": 119.2156862745098, + "grad_norm": 0.5254650712013245, + "learning_rate": 0.00017289686274214118, + "loss": 0.1222, + "step": 4560 + }, + { + "epoch": 119.47712418300654, + "grad_norm": 0.5673931241035461, + "learning_rate": 0.0001727835753472535, + "loss": 0.1186, + "step": 4570 + }, + { + "epoch": 119.73856209150327, + "grad_norm": 0.38074398040771484, + "learning_rate": 0.0001726700889648501, + "loss": 0.1214, + "step": 4580 + }, + { + "epoch": 120.0, + "grad_norm": 0.5379620790481567, + "learning_rate": 0.00017255640390519836, + "loss": 0.1262, + "step": 4590 + }, + { + "epoch": 120.26143790849673, + "grad_norm": 0.46410471200942993, + "learning_rate": 0.00017244252047910892, + "loss": 0.1187, + "step": 4600 + }, + { + "epoch": 120.52287581699346, + "grad_norm": 0.5001809000968933, + "learning_rate": 0.00017232843899793468, + "loss": 0.1183, + "step": 4610 + }, + { + "epoch": 120.7843137254902, + "grad_norm": 0.402111291885376, + "learning_rate": 0.00017221415977357007, + "loss": 0.1243, + "step": 4620 + }, + { + "epoch": 121.04575163398692, + "grad_norm": 0.389432817697525, + "learning_rate": 0.00017209968311845012, + "loss": 0.1246, + "step": 4630 + }, + { + "epoch": 121.30718954248366, + "grad_norm": 0.5195785760879517, + "learning_rate": 0.00017198500934554966, + "loss": 0.1171, + "step": 4640 + }, + { + "epoch": 121.56862745098039, + "grad_norm": 0.4785175323486328, + "learning_rate": 0.0001718701387683824, + "loss": 0.1199, + "step": 4650 + }, + { + "epoch": 121.83006535947712, + "grad_norm": 0.5644505620002747, + "learning_rate": 0.0001717550717010001, + "loss": 0.1251, + "step": 4660 + }, + { + "epoch": 122.09150326797386, + "grad_norm": 0.506691038608551, + "learning_rate": 0.0001716398084579917, + "loss": 0.1259, + "step": 4670 + }, + { + "epoch": 122.3529411764706, + "grad_norm": 0.491377592086792, + "learning_rate": 0.00017152434935448256, + "loss": 0.1203, + "step": 4680 + }, + { + "epoch": 122.61437908496733, + "grad_norm": 0.5997154712677002, + "learning_rate": 0.00017140869470613342, + "loss": 0.1179, + "step": 4690 + }, + { + "epoch": 122.87581699346406, + "grad_norm": 0.4591546356678009, + "learning_rate": 0.00017129284482913972, + "loss": 0.1218, + "step": 4700 + }, + { + "epoch": 123.13725490196079, + "grad_norm": 0.3804793655872345, + "learning_rate": 0.00017117680004023056, + "loss": 0.1195, + "step": 4710 + }, + { + "epoch": 123.39869281045752, + "grad_norm": 0.4563983082771301, + "learning_rate": 0.00017106056065666793, + "loss": 0.1177, + "step": 4720 + }, + { + "epoch": 123.66013071895425, + "grad_norm": 0.5122204422950745, + "learning_rate": 0.00017094412699624595, + "loss": 0.1221, + "step": 4730 + }, + { + "epoch": 123.92156862745098, + "grad_norm": 0.4884132146835327, + "learning_rate": 0.00017082749937728973, + "loss": 0.1245, + "step": 4740 + }, + { + "epoch": 124.18300653594771, + "grad_norm": 0.38031864166259766, + "learning_rate": 0.00017071067811865476, + "loss": 0.1196, + "step": 4750 + }, + { + "epoch": 124.44444444444444, + "grad_norm": 0.4657631814479828, + "learning_rate": 0.0001705936635397259, + "loss": 0.1197, + "step": 4760 + }, + { + "epoch": 124.70588235294117, + "grad_norm": 0.5115662217140198, + "learning_rate": 0.00017047645596041653, + "loss": 0.1217, + "step": 4770 + }, + { + "epoch": 124.9673202614379, + "grad_norm": 0.39813730120658875, + "learning_rate": 0.0001703590557011677, + "loss": 0.12, + "step": 4780 + }, + { + "epoch": 125.22875816993464, + "grad_norm": 0.4247078001499176, + "learning_rate": 0.00017024146308294724, + "loss": 0.1169, + "step": 4790 + }, + { + "epoch": 125.49019607843137, + "grad_norm": 0.41345420479774475, + "learning_rate": 0.00017012367842724887, + "loss": 0.1212, + "step": 4800 + }, + { + "epoch": 125.7516339869281, + "grad_norm": 0.43008559942245483, + "learning_rate": 0.00017000570205609136, + "loss": 0.1198, + "step": 4810 + }, + { + "epoch": 126.01307189542484, + "grad_norm": 0.47986850142478943, + "learning_rate": 0.00016988753429201755, + "loss": 0.125, + "step": 4820 + }, + { + "epoch": 126.27450980392157, + "grad_norm": 0.3475463390350342, + "learning_rate": 0.00016976917545809367, + "loss": 0.1146, + "step": 4830 + }, + { + "epoch": 126.5359477124183, + "grad_norm": 0.38883545994758606, + "learning_rate": 0.00016965062587790823, + "loss": 0.1189, + "step": 4840 + }, + { + "epoch": 126.79738562091504, + "grad_norm": 0.44550979137420654, + "learning_rate": 0.00016953188587557122, + "loss": 0.1247, + "step": 4850 + }, + { + "epoch": 127.05882352941177, + "grad_norm": 0.3760294020175934, + "learning_rate": 0.0001694129557757133, + "loss": 0.125, + "step": 4860 + }, + { + "epoch": 127.3202614379085, + "grad_norm": 0.46050703525543213, + "learning_rate": 0.0001692938359034848, + "loss": 0.1161, + "step": 4870 + }, + { + "epoch": 127.58169934640523, + "grad_norm": 0.48216715455055237, + "learning_rate": 0.00016917452658455495, + "loss": 0.1198, + "step": 4880 + }, + { + "epoch": 127.84313725490196, + "grad_norm": 0.4969286620616913, + "learning_rate": 0.00016905502814511082, + "loss": 0.1231, + "step": 4890 + }, + { + "epoch": 128.1045751633987, + "grad_norm": 0.3721112012863159, + "learning_rate": 0.0001689353409118566, + "loss": 0.1275, + "step": 4900 + }, + { + "epoch": 128.36601307189542, + "grad_norm": 0.518031895160675, + "learning_rate": 0.0001688154652120126, + "loss": 0.118, + "step": 4910 + }, + { + "epoch": 128.62745098039215, + "grad_norm": 0.49686360359191895, + "learning_rate": 0.00016869540137331445, + "loss": 0.1179, + "step": 4920 + }, + { + "epoch": 128.88888888888889, + "grad_norm": 0.5079768300056458, + "learning_rate": 0.00016857514972401207, + "loss": 0.1252, + "step": 4930 + }, + { + "epoch": 129.15032679738562, + "grad_norm": 0.4462049901485443, + "learning_rate": 0.00016845471059286887, + "loss": 0.1196, + "step": 4940 + }, + { + "epoch": 129.41176470588235, + "grad_norm": 0.4287566840648651, + "learning_rate": 0.00016833408430916085, + "loss": 0.1198, + "step": 4950 + }, + { + "epoch": 129.67320261437908, + "grad_norm": 0.4560422897338867, + "learning_rate": 0.00016821327120267567, + "loss": 0.1218, + "step": 4960 + }, + { + "epoch": 129.9346405228758, + "grad_norm": 0.40253371000289917, + "learning_rate": 0.0001680922716037117, + "loss": 0.1219, + "step": 4970 + }, + { + "epoch": 130.19607843137254, + "grad_norm": 0.33488982915878296, + "learning_rate": 0.00016797108584307732, + "loss": 0.1202, + "step": 4980 + }, + { + "epoch": 130.45751633986927, + "grad_norm": 0.4206305146217346, + "learning_rate": 0.00016784971425208965, + "loss": 0.1189, + "step": 4990 + }, + { + "epoch": 130.718954248366, + "grad_norm": 0.4833426773548126, + "learning_rate": 0.00016772815716257412, + "loss": 0.1209, + "step": 5000 + }, + { + "epoch": 130.98039215686273, + "grad_norm": 0.5023657083511353, + "learning_rate": 0.00016760641490686307, + "loss": 0.1221, + "step": 5010 + }, + { + "epoch": 131.24183006535947, + "grad_norm": 0.4237312972545624, + "learning_rate": 0.0001674844878177952, + "loss": 0.1204, + "step": 5020 + }, + { + "epoch": 131.5032679738562, + "grad_norm": 0.42303699254989624, + "learning_rate": 0.00016736237622871452, + "loss": 0.1143, + "step": 5030 + }, + { + "epoch": 131.76470588235293, + "grad_norm": 0.4471609890460968, + "learning_rate": 0.00016724008047346947, + "loss": 0.1242, + "step": 5040 + }, + { + "epoch": 132.0261437908497, + "grad_norm": 0.43694472312927246, + "learning_rate": 0.00016711760088641196, + "loss": 0.1245, + "step": 5050 + }, + { + "epoch": 132.28758169934642, + "grad_norm": 0.460407018661499, + "learning_rate": 0.0001669949378023965, + "loss": 0.119, + "step": 5060 + }, + { + "epoch": 132.54901960784315, + "grad_norm": 0.39372462034225464, + "learning_rate": 0.00016687209155677929, + "loss": 0.1159, + "step": 5070 + }, + { + "epoch": 132.81045751633988, + "grad_norm": 0.38842689990997314, + "learning_rate": 0.00016674906248541726, + "loss": 0.1241, + "step": 5080 + }, + { + "epoch": 133.0718954248366, + "grad_norm": 0.40953928232192993, + "learning_rate": 0.00016662585092466723, + "loss": 0.1254, + "step": 5090 + }, + { + "epoch": 133.33333333333334, + "grad_norm": 0.4259697496891022, + "learning_rate": 0.0001665024572113848, + "loss": 0.1187, + "step": 5100 + }, + { + "epoch": 133.59477124183007, + "grad_norm": 0.46274760365486145, + "learning_rate": 0.00016637888168292384, + "loss": 0.1174, + "step": 5110 + }, + { + "epoch": 133.8562091503268, + "grad_norm": 0.5445423126220703, + "learning_rate": 0.000166255124677135, + "loss": 0.121, + "step": 5120 + }, + { + "epoch": 134.11764705882354, + "grad_norm": 0.38918498158454895, + "learning_rate": 0.00016613118653236518, + "loss": 0.122, + "step": 5130 + }, + { + "epoch": 134.37908496732027, + "grad_norm": 0.3996954560279846, + "learning_rate": 0.00016600706758745668, + "loss": 0.1202, + "step": 5140 + }, + { + "epoch": 134.640522875817, + "grad_norm": 0.42393553256988525, + "learning_rate": 0.0001658827681817458, + "loss": 0.1226, + "step": 5150 + }, + { + "epoch": 134.90196078431373, + "grad_norm": 0.5191143751144409, + "learning_rate": 0.00016575828865506245, + "loss": 0.1208, + "step": 5160 + }, + { + "epoch": 135.16339869281046, + "grad_norm": 0.3241782784461975, + "learning_rate": 0.00016563362934772892, + "loss": 0.119, + "step": 5170 + }, + { + "epoch": 135.4248366013072, + "grad_norm": 0.4285520017147064, + "learning_rate": 0.00016550879060055895, + "loss": 0.1167, + "step": 5180 + }, + { + "epoch": 135.68627450980392, + "grad_norm": 0.4682152271270752, + "learning_rate": 0.00016538377275485691, + "loss": 0.1234, + "step": 5190 + }, + { + "epoch": 135.94771241830065, + "grad_norm": 0.5249207615852356, + "learning_rate": 0.00016525857615241687, + "loss": 0.122, + "step": 5200 + }, + { + "epoch": 136.20915032679738, + "grad_norm": 0.3867131173610687, + "learning_rate": 0.00016513320113552152, + "loss": 0.1183, + "step": 5210 + }, + { + "epoch": 136.47058823529412, + "grad_norm": 0.437995582818985, + "learning_rate": 0.0001650076480469413, + "loss": 0.1216, + "step": 5220 + }, + { + "epoch": 136.73202614379085, + "grad_norm": 0.47116023302078247, + "learning_rate": 0.0001648819172299337, + "loss": 0.1208, + "step": 5230 + }, + { + "epoch": 136.99346405228758, + "grad_norm": 0.4164656698703766, + "learning_rate": 0.0001647560090282419, + "loss": 0.1206, + "step": 5240 + }, + { + "epoch": 137.2549019607843, + "grad_norm": 0.41538143157958984, + "learning_rate": 0.00016462992378609407, + "loss": 0.1144, + "step": 5250 + }, + { + "epoch": 137.51633986928104, + "grad_norm": 0.48748481273651123, + "learning_rate": 0.00016450366184820255, + "loss": 0.1241, + "step": 5260 + }, + { + "epoch": 137.77777777777777, + "grad_norm": 0.49401819705963135, + "learning_rate": 0.00016437722355976258, + "loss": 0.1237, + "step": 5270 + }, + { + "epoch": 138.0392156862745, + "grad_norm": 0.41417965292930603, + "learning_rate": 0.00016425060926645167, + "loss": 0.1196, + "step": 5280 + }, + { + "epoch": 138.30065359477123, + "grad_norm": 0.34107911586761475, + "learning_rate": 0.00016412381931442838, + "loss": 0.1149, + "step": 5290 + }, + { + "epoch": 138.56209150326796, + "grad_norm": 0.5301646590232849, + "learning_rate": 0.00016399685405033167, + "loss": 0.1173, + "step": 5300 + }, + { + "epoch": 138.8235294117647, + "grad_norm": 0.500399649143219, + "learning_rate": 0.0001638697138212797, + "loss": 0.123, + "step": 5310 + }, + { + "epoch": 139.08496732026143, + "grad_norm": 0.43028515577316284, + "learning_rate": 0.000163742398974869, + "loss": 0.1254, + "step": 5320 + }, + { + "epoch": 139.34640522875816, + "grad_norm": 0.3889625668525696, + "learning_rate": 0.0001636149098591735, + "loss": 0.1163, + "step": 5330 + }, + { + "epoch": 139.6078431372549, + "grad_norm": 0.4060676693916321, + "learning_rate": 0.00016348724682274353, + "loss": 0.1185, + "step": 5340 + }, + { + "epoch": 139.86928104575162, + "grad_norm": 0.44942519068717957, + "learning_rate": 0.00016335941021460506, + "loss": 0.1216, + "step": 5350 + }, + { + "epoch": 140.13071895424838, + "grad_norm": 0.4668162167072296, + "learning_rate": 0.00016323140038425842, + "loss": 0.1222, + "step": 5360 + }, + { + "epoch": 140.3921568627451, + "grad_norm": 0.4359992444515228, + "learning_rate": 0.00016310321768167762, + "loss": 0.1183, + "step": 5370 + }, + { + "epoch": 140.65359477124184, + "grad_norm": 0.48698097467422485, + "learning_rate": 0.00016297486245730927, + "loss": 0.1182, + "step": 5380 + }, + { + "epoch": 140.91503267973857, + "grad_norm": 0.4088447391986847, + "learning_rate": 0.0001628463350620716, + "loss": 0.1259, + "step": 5390 + }, + { + "epoch": 141.1764705882353, + "grad_norm": 0.407049298286438, + "learning_rate": 0.0001627176358473537, + "loss": 0.1209, + "step": 5400 + }, + { + "epoch": 141.43790849673204, + "grad_norm": 0.4178103804588318, + "learning_rate": 0.00016258876516501424, + "loss": 0.1221, + "step": 5410 + }, + { + "epoch": 141.69934640522877, + "grad_norm": 0.44751349091529846, + "learning_rate": 0.0001624597233673808, + "loss": 0.1201, + "step": 5420 + }, + { + "epoch": 141.9607843137255, + "grad_norm": 0.4328495264053345, + "learning_rate": 0.00016233051080724868, + "loss": 0.1221, + "step": 5430 + }, + { + "epoch": 142.22222222222223, + "grad_norm": 0.44036683440208435, + "learning_rate": 0.0001622011278378801, + "loss": 0.1167, + "step": 5440 + }, + { + "epoch": 142.48366013071896, + "grad_norm": 0.5208015441894531, + "learning_rate": 0.00016207157481300312, + "loss": 0.1191, + "step": 5450 + }, + { + "epoch": 142.7450980392157, + "grad_norm": 0.4127715826034546, + "learning_rate": 0.00016194185208681083, + "loss": 0.1215, + "step": 5460 + }, + { + "epoch": 143.00653594771242, + "grad_norm": 0.4773133397102356, + "learning_rate": 0.00016181196001396019, + "loss": 0.1204, + "step": 5470 + }, + { + "epoch": 143.26797385620915, + "grad_norm": 0.37050458788871765, + "learning_rate": 0.0001616818989495711, + "loss": 0.1149, + "step": 5480 + }, + { + "epoch": 143.52941176470588, + "grad_norm": 0.34875085949897766, + "learning_rate": 0.00016155166924922566, + "loss": 0.1201, + "step": 5490 + }, + { + "epoch": 143.79084967320262, + "grad_norm": 0.3563850224018097, + "learning_rate": 0.0001614212712689668, + "loss": 0.1218, + "step": 5500 + }, + { + "epoch": 144.05228758169935, + "grad_norm": 0.37683019042015076, + "learning_rate": 0.00016129070536529766, + "loss": 0.1199, + "step": 5510 + }, + { + "epoch": 144.31372549019608, + "grad_norm": 0.415547639131546, + "learning_rate": 0.00016115997189518043, + "loss": 0.1165, + "step": 5520 + }, + { + "epoch": 144.5751633986928, + "grad_norm": 0.3484375476837158, + "learning_rate": 0.00016102907121603543, + "loss": 0.1197, + "step": 5530 + }, + { + "epoch": 144.83660130718954, + "grad_norm": 0.3772566616535187, + "learning_rate": 0.00016089800368574014, + "loss": 0.1236, + "step": 5540 + }, + { + "epoch": 145.09803921568627, + "grad_norm": 0.35378995537757874, + "learning_rate": 0.00016076676966262813, + "loss": 0.1188, + "step": 5550 + }, + { + "epoch": 145.359477124183, + "grad_norm": 0.34689784049987793, + "learning_rate": 0.00016063536950548826, + "loss": 0.1169, + "step": 5560 + }, + { + "epoch": 145.62091503267973, + "grad_norm": 0.35840165615081787, + "learning_rate": 0.0001605038035735635, + "loss": 0.1199, + "step": 5570 + }, + { + "epoch": 145.88235294117646, + "grad_norm": 0.40955159068107605, + "learning_rate": 0.0001603720722265501, + "loss": 0.1236, + "step": 5580 + }, + { + "epoch": 146.1437908496732, + "grad_norm": 0.3746141493320465, + "learning_rate": 0.00016024017582459652, + "loss": 0.1176, + "step": 5590 + }, + { + "epoch": 146.40522875816993, + "grad_norm": 0.4733276069164276, + "learning_rate": 0.00016010811472830252, + "loss": 0.1198, + "step": 5600 + }, + { + "epoch": 146.66666666666666, + "grad_norm": 0.4011492431163788, + "learning_rate": 0.00015997588929871808, + "loss": 0.1199, + "step": 5610 + }, + { + "epoch": 146.9281045751634, + "grad_norm": 0.5024054050445557, + "learning_rate": 0.00015985674620589864, + "loss": 0.121, + "step": 5620 + }, + { + "epoch": 147.18954248366012, + "grad_norm": 0.4476403594017029, + "learning_rate": 0.00015972420953936335, + "loss": 0.1189, + "step": 5630 + }, + { + "epoch": 147.45098039215685, + "grad_norm": 0.45632895827293396, + "learning_rate": 0.0001595915095891198, + "loss": 0.1168, + "step": 5640 + }, + { + "epoch": 147.71241830065358, + "grad_norm": 0.48059502243995667, + "learning_rate": 0.00015945864671796452, + "loss": 0.1226, + "step": 5650 + }, + { + "epoch": 147.9738562091503, + "grad_norm": 0.3604694604873657, + "learning_rate": 0.0001593256212891395, + "loss": 0.1218, + "step": 5660 + }, + { + "epoch": 148.23529411764707, + "grad_norm": 1.3692547082901, + "learning_rate": 0.00015919243366633126, + "loss": 0.1168, + "step": 5670 + }, + { + "epoch": 148.4967320261438, + "grad_norm": 0.70684415102005, + "learning_rate": 0.00015905908421366962, + "loss": 0.1165, + "step": 5680 + }, + { + "epoch": 148.75816993464053, + "grad_norm": 0.9317061901092529, + "learning_rate": 0.0001589255732957269, + "loss": 0.1231, + "step": 5690 + }, + { + "epoch": 149.01960784313727, + "grad_norm": 0.6531818509101868, + "learning_rate": 0.00015879190127751684, + "loss": 0.1257, + "step": 5700 + }, + { + "epoch": 149.281045751634, + "grad_norm": 0.5326528549194336, + "learning_rate": 0.00015865806852449367, + "loss": 0.1136, + "step": 5710 + }, + { + "epoch": 149.54248366013073, + "grad_norm": 0.5003425478935242, + "learning_rate": 0.00015852407540255104, + "loss": 0.123, + "step": 5720 + }, + { + "epoch": 149.80392156862746, + "grad_norm": 0.4232426583766937, + "learning_rate": 0.00015838992227802093, + "loss": 0.1245, + "step": 5730 + }, + { + "epoch": 150.0653594771242, + "grad_norm": 0.3918308615684509, + "learning_rate": 0.00015825560951767298, + "loss": 0.118, + "step": 5740 + }, + { + "epoch": 150.32679738562092, + "grad_norm": 0.4566536545753479, + "learning_rate": 0.00015812113748871304, + "loss": 0.1194, + "step": 5750 + }, + { + "epoch": 150.58823529411765, + "grad_norm": 0.4792901277542114, + "learning_rate": 0.00015798650655878262, + "loss": 0.1235, + "step": 5760 + }, + { + "epoch": 150.84967320261438, + "grad_norm": 0.4351622462272644, + "learning_rate": 0.00015785171709595743, + "loss": 0.1183, + "step": 5770 + }, + { + "epoch": 151.11111111111111, + "grad_norm": 0.4398731589317322, + "learning_rate": 0.0001577167694687468, + "loss": 0.1213, + "step": 5780 + }, + { + "epoch": 151.37254901960785, + "grad_norm": 0.4514036476612091, + "learning_rate": 0.00015758166404609232, + "loss": 0.1184, + "step": 5790 + }, + { + "epoch": 151.63398692810458, + "grad_norm": 0.46276119351387024, + "learning_rate": 0.0001574464011973671, + "loss": 0.1182, + "step": 5800 + }, + { + "epoch": 151.8954248366013, + "grad_norm": 0.4101347327232361, + "learning_rate": 0.00015731098129237458, + "loss": 0.1245, + "step": 5810 + }, + { + "epoch": 152.15686274509804, + "grad_norm": 0.45118212699890137, + "learning_rate": 0.00015717540470134761, + "loss": 0.1173, + "step": 5820 + }, + { + "epoch": 152.41830065359477, + "grad_norm": 0.47819066047668457, + "learning_rate": 0.00015703967179494748, + "loss": 0.1193, + "step": 5830 + }, + { + "epoch": 152.6797385620915, + "grad_norm": 0.4761189818382263, + "learning_rate": 0.00015690378294426266, + "loss": 0.1208, + "step": 5840 + }, + { + "epoch": 152.94117647058823, + "grad_norm": 0.47973722219467163, + "learning_rate": 0.00015676773852080813, + "loss": 0.1205, + "step": 5850 + }, + { + "epoch": 153.20261437908496, + "grad_norm": 0.4598498046398163, + "learning_rate": 0.0001566315388965242, + "loss": 0.119, + "step": 5860 + }, + { + "epoch": 153.4640522875817, + "grad_norm": 0.42741018533706665, + "learning_rate": 0.00015649518444377537, + "loss": 0.1176, + "step": 5870 + }, + { + "epoch": 153.72549019607843, + "grad_norm": 0.40698984265327454, + "learning_rate": 0.00015635867553534955, + "loss": 0.1215, + "step": 5880 + }, + { + "epoch": 153.98692810457516, + "grad_norm": 0.35974419116973877, + "learning_rate": 0.00015622201254445684, + "loss": 0.1221, + "step": 5890 + }, + { + "epoch": 154.2483660130719, + "grad_norm": 0.4019433856010437, + "learning_rate": 0.0001560851958447287, + "loss": 0.1168, + "step": 5900 + }, + { + "epoch": 154.50980392156862, + "grad_norm": 0.4910339117050171, + "learning_rate": 0.0001559482258102167, + "loss": 0.1213, + "step": 5910 + }, + { + "epoch": 154.77124183006535, + "grad_norm": 0.40200385451316833, + "learning_rate": 0.00015581110281539173, + "loss": 0.12, + "step": 5920 + }, + { + "epoch": 155.03267973856208, + "grad_norm": 0.36135050654411316, + "learning_rate": 0.0001556738272351428, + "loss": 0.1166, + "step": 5930 + }, + { + "epoch": 155.2941176470588, + "grad_norm": 0.41371849179267883, + "learning_rate": 0.00015553639944477612, + "loss": 0.1184, + "step": 5940 + }, + { + "epoch": 155.55555555555554, + "grad_norm": 0.4281088411808014, + "learning_rate": 0.000155398819820014, + "loss": 0.1187, + "step": 5950 + }, + { + "epoch": 155.81699346405227, + "grad_norm": 0.530092716217041, + "learning_rate": 0.00015526108873699387, + "loss": 0.1203, + "step": 5960 + }, + { + "epoch": 156.07843137254903, + "grad_norm": 0.3755616843700409, + "learning_rate": 0.00015512320657226728, + "loss": 0.1175, + "step": 5970 + }, + { + "epoch": 156.33986928104576, + "grad_norm": 0.35955289006233215, + "learning_rate": 0.00015498517370279884, + "loss": 0.1173, + "step": 5980 + }, + { + "epoch": 156.6013071895425, + "grad_norm": 0.406409353017807, + "learning_rate": 0.00015484699050596505, + "loss": 0.1221, + "step": 5990 + }, + { + "epoch": 156.86274509803923, + "grad_norm": 0.4339780807495117, + "learning_rate": 0.00015470865735955357, + "loss": 0.1183, + "step": 6000 + }, + { + "epoch": 157.12418300653596, + "grad_norm": 0.35623541474342346, + "learning_rate": 0.00015457017464176191, + "loss": 0.1215, + "step": 6010 + }, + { + "epoch": 157.3856209150327, + "grad_norm": 0.43511274456977844, + "learning_rate": 0.0001544315427311965, + "loss": 0.1169, + "step": 6020 + }, + { + "epoch": 157.64705882352942, + "grad_norm": 0.4758042097091675, + "learning_rate": 0.00015429276200687177, + "loss": 0.1177, + "step": 6030 + }, + { + "epoch": 157.90849673202615, + "grad_norm": 0.3972184360027313, + "learning_rate": 0.00015415383284820888, + "loss": 0.123, + "step": 6040 + }, + { + "epoch": 158.16993464052288, + "grad_norm": 0.3831634223461151, + "learning_rate": 0.0001540147556350348, + "loss": 0.1187, + "step": 6050 + }, + { + "epoch": 158.4313725490196, + "grad_norm": 0.3707033097743988, + "learning_rate": 0.0001538755307475814, + "loss": 0.1166, + "step": 6060 + }, + { + "epoch": 158.69281045751634, + "grad_norm": 0.40588223934173584, + "learning_rate": 0.00015373615856648418, + "loss": 0.1201, + "step": 6070 + }, + { + "epoch": 158.95424836601308, + "grad_norm": 0.5210367441177368, + "learning_rate": 0.0001535966394727813, + "loss": 0.1223, + "step": 6080 + }, + { + "epoch": 159.2156862745098, + "grad_norm": 0.3465724587440491, + "learning_rate": 0.00015345697384791274, + "loss": 0.1162, + "step": 6090 + }, + { + "epoch": 159.47712418300654, + "grad_norm": 0.48409783840179443, + "learning_rate": 0.00015331716207371888, + "loss": 0.1186, + "step": 6100 + }, + { + "epoch": 159.73856209150327, + "grad_norm": 0.49288466572761536, + "learning_rate": 0.00015317720453243981, + "loss": 0.1201, + "step": 6110 + }, + { + "epoch": 160.0, + "grad_norm": 0.41373661160469055, + "learning_rate": 0.00015303710160671416, + "loss": 0.1221, + "step": 6120 + }, + { + "epoch": 160.26143790849673, + "grad_norm": 0.30801451206207275, + "learning_rate": 0.00015289685367957792, + "loss": 0.1187, + "step": 6130 + }, + { + "epoch": 160.52287581699346, + "grad_norm": 0.4610970616340637, + "learning_rate": 0.0001527564611344636, + "loss": 0.1183, + "step": 6140 + }, + { + "epoch": 160.7843137254902, + "grad_norm": 0.3737730383872986, + "learning_rate": 0.000152615924355199, + "loss": 0.1174, + "step": 6150 + }, + { + "epoch": 161.04575163398692, + "grad_norm": 0.4367047846317291, + "learning_rate": 0.00015247524372600637, + "loss": 0.1245, + "step": 6160 + }, + { + "epoch": 161.30718954248366, + "grad_norm": 0.4288366734981537, + "learning_rate": 0.00015233441963150113, + "loss": 0.1165, + "step": 6170 + }, + { + "epoch": 161.5686274509804, + "grad_norm": 0.5186516642570496, + "learning_rate": 0.00015219345245669105, + "loss": 0.1196, + "step": 6180 + }, + { + "epoch": 161.83006535947712, + "grad_norm": 0.32446616888046265, + "learning_rate": 0.00015205234258697494, + "loss": 0.1206, + "step": 6190 + }, + { + "epoch": 162.09150326797385, + "grad_norm": 0.34923261404037476, + "learning_rate": 0.00015191109040814176, + "loss": 0.1184, + "step": 6200 + }, + { + "epoch": 162.35294117647058, + "grad_norm": 0.4656030833721161, + "learning_rate": 0.0001517696963063697, + "loss": 0.1186, + "step": 6210 + }, + { + "epoch": 162.6143790849673, + "grad_norm": 0.3549903631210327, + "learning_rate": 0.0001516281606682247, + "loss": 0.1204, + "step": 6220 + }, + { + "epoch": 162.87581699346404, + "grad_norm": 0.3952346742153168, + "learning_rate": 0.0001514864838806599, + "loss": 0.1161, + "step": 6230 + }, + { + "epoch": 163.13725490196077, + "grad_norm": 0.3911607563495636, + "learning_rate": 0.0001513446663310141, + "loss": 0.1198, + "step": 6240 + }, + { + "epoch": 163.3986928104575, + "grad_norm": 0.346305787563324, + "learning_rate": 0.00015120270840701124, + "loss": 0.1118, + "step": 6250 + }, + { + "epoch": 163.66013071895424, + "grad_norm": 0.42828211188316345, + "learning_rate": 0.0001510606104967587, + "loss": 0.1195, + "step": 6260 + }, + { + "epoch": 163.92156862745097, + "grad_norm": 0.4532879590988159, + "learning_rate": 0.00015091837298874682, + "loss": 0.1241, + "step": 6270 + }, + { + "epoch": 164.18300653594773, + "grad_norm": 0.33145442605018616, + "learning_rate": 0.00015077599627184754, + "loss": 0.116, + "step": 6280 + }, + { + "epoch": 164.44444444444446, + "grad_norm": 0.4350489675998688, + "learning_rate": 0.00015063348073531324, + "loss": 0.1173, + "step": 6290 + }, + { + "epoch": 164.7058823529412, + "grad_norm": 0.4127410054206848, + "learning_rate": 0.00015049082676877614, + "loss": 0.1209, + "step": 6300 + }, + { + "epoch": 164.96732026143792, + "grad_norm": 0.36359113454818726, + "learning_rate": 0.00015034803476224657, + "loss": 0.1215, + "step": 6310 + }, + { + "epoch": 165.22875816993465, + "grad_norm": 0.3537936210632324, + "learning_rate": 0.00015020510510611255, + "loss": 0.1203, + "step": 6320 + }, + { + "epoch": 165.49019607843138, + "grad_norm": 0.33729109168052673, + "learning_rate": 0.00015006203819113823, + "loss": 0.1162, + "step": 6330 + }, + { + "epoch": 165.7516339869281, + "grad_norm": 0.4218641221523285, + "learning_rate": 0.00014991883440846308, + "loss": 0.1162, + "step": 6340 + }, + { + "epoch": 166.01307189542484, + "grad_norm": 0.44932490587234497, + "learning_rate": 0.00014977549414960084, + "loss": 0.1243, + "step": 6350 + }, + { + "epoch": 166.27450980392157, + "grad_norm": 0.430530846118927, + "learning_rate": 0.00014963201780643823, + "loss": 0.1167, + "step": 6360 + }, + { + "epoch": 166.5359477124183, + "grad_norm": 0.4142252504825592, + "learning_rate": 0.00014948840577123416, + "loss": 0.1172, + "step": 6370 + }, + { + "epoch": 166.79738562091504, + "grad_norm": 0.2989576458930969, + "learning_rate": 0.00014934465843661842, + "loss": 0.1197, + "step": 6380 + }, + { + "epoch": 167.05882352941177, + "grad_norm": 0.37099623680114746, + "learning_rate": 0.00014920077619559073, + "loss": 0.1207, + "step": 6390 + }, + { + "epoch": 167.3202614379085, + "grad_norm": 0.3625079393386841, + "learning_rate": 0.00014905675944151966, + "loss": 0.1186, + "step": 6400 + }, + { + "epoch": 167.58169934640523, + "grad_norm": 0.4216829836368561, + "learning_rate": 0.00014891260856814148, + "loss": 0.1154, + "step": 6410 + }, + { + "epoch": 167.84313725490196, + "grad_norm": 0.44304612278938293, + "learning_rate": 0.0001487683239695592, + "loss": 0.1211, + "step": 6420 + }, + { + "epoch": 168.1045751633987, + "grad_norm": 0.38862261176109314, + "learning_rate": 0.00014862390604024144, + "loss": 0.1166, + "step": 6430 + }, + { + "epoch": 168.36601307189542, + "grad_norm": 0.38382863998413086, + "learning_rate": 0.00014847935517502123, + "loss": 0.1139, + "step": 6440 + }, + { + "epoch": 168.62745098039215, + "grad_norm": 0.42605075240135193, + "learning_rate": 0.00014833467176909515, + "loss": 0.1209, + "step": 6450 + }, + { + "epoch": 168.88888888888889, + "grad_norm": 0.3948115408420563, + "learning_rate": 0.00014818985621802212, + "loss": 0.12, + "step": 6460 + }, + { + "epoch": 169.15032679738562, + "grad_norm": 0.33048856258392334, + "learning_rate": 0.00014804490891772232, + "loss": 0.1185, + "step": 6470 + }, + { + "epoch": 169.41176470588235, + "grad_norm": 0.33838731050491333, + "learning_rate": 0.00014789983026447612, + "loss": 0.1139, + "step": 6480 + }, + { + "epoch": 169.67320261437908, + "grad_norm": 0.40736544132232666, + "learning_rate": 0.000147754620654923, + "loss": 0.1212, + "step": 6490 + }, + { + "epoch": 169.9346405228758, + "grad_norm": 0.5088180899620056, + "learning_rate": 0.00014760928048606055, + "loss": 0.1211, + "step": 6500 + }, + { + "epoch": 170.19607843137254, + "grad_norm": 0.36517319083213806, + "learning_rate": 0.00014746381015524323, + "loss": 0.1204, + "step": 6510 + }, + { + "epoch": 170.45751633986927, + "grad_norm": 0.4173334836959839, + "learning_rate": 0.00014731821006018131, + "loss": 0.1138, + "step": 6520 + }, + { + "epoch": 170.718954248366, + "grad_norm": 0.3489775061607361, + "learning_rate": 0.00014717248059893992, + "loss": 0.1197, + "step": 6530 + }, + { + "epoch": 170.98039215686273, + "grad_norm": 0.3684404492378235, + "learning_rate": 0.00014702662216993785, + "loss": 0.1226, + "step": 6540 + }, + { + "epoch": 171.24183006535947, + "grad_norm": 0.3432365953922272, + "learning_rate": 0.0001468806351719465, + "loss": 0.1186, + "step": 6550 + }, + { + "epoch": 171.5032679738562, + "grad_norm": 0.4222089350223541, + "learning_rate": 0.0001467345200040887, + "loss": 0.1193, + "step": 6560 + }, + { + "epoch": 171.76470588235293, + "grad_norm": 0.43939515948295593, + "learning_rate": 0.0001465882770658378, + "loss": 0.1197, + "step": 6570 + }, + { + "epoch": 172.0261437908497, + "grad_norm": 0.3959405720233917, + "learning_rate": 0.00014644190675701632, + "loss": 0.1188, + "step": 6580 + }, + { + "epoch": 172.28758169934642, + "grad_norm": 0.3560381531715393, + "learning_rate": 0.00014629540947779516, + "loss": 0.111, + "step": 6590 + }, + { + "epoch": 172.54901960784315, + "grad_norm": 0.31908681988716125, + "learning_rate": 0.0001461487856286923, + "loss": 0.1179, + "step": 6600 + }, + { + "epoch": 172.81045751633988, + "grad_norm": 0.4550021290779114, + "learning_rate": 0.0001460020356105717, + "loss": 0.1229, + "step": 6610 + }, + { + "epoch": 173.0718954248366, + "grad_norm": 0.31013500690460205, + "learning_rate": 0.00014585515982464234, + "loss": 0.1192, + "step": 6620 + }, + { + "epoch": 173.33333333333334, + "grad_norm": 0.41469523310661316, + "learning_rate": 0.00014570815867245696, + "loss": 0.1183, + "step": 6630 + }, + { + "epoch": 173.59477124183007, + "grad_norm": 0.40070420503616333, + "learning_rate": 0.00014556103255591114, + "loss": 0.1177, + "step": 6640 + }, + { + "epoch": 173.8562091503268, + "grad_norm": 0.3658972978591919, + "learning_rate": 0.0001454137818772421, + "loss": 0.1203, + "step": 6650 + }, + { + "epoch": 174.11764705882354, + "grad_norm": 0.4114379584789276, + "learning_rate": 0.00014526640703902747, + "loss": 0.1159, + "step": 6660 + }, + { + "epoch": 174.37908496732027, + "grad_norm": 0.37510064244270325, + "learning_rate": 0.00014511890844418453, + "loss": 0.1205, + "step": 6670 + }, + { + "epoch": 174.640522875817, + "grad_norm": 0.41856005787849426, + "learning_rate": 0.00014497128649596875, + "loss": 0.1169, + "step": 6680 + }, + { + "epoch": 174.90196078431373, + "grad_norm": 0.35670095682144165, + "learning_rate": 0.00014482354159797288, + "loss": 0.1187, + "step": 6690 + }, + { + "epoch": 175.16339869281046, + "grad_norm": 0.3349741995334625, + "learning_rate": 0.0001446756741541259, + "loss": 0.1191, + "step": 6700 + }, + { + "epoch": 175.4248366013072, + "grad_norm": 0.4219711422920227, + "learning_rate": 0.00014452768456869173, + "loss": 0.1167, + "step": 6710 + }, + { + "epoch": 175.68627450980392, + "grad_norm": 0.30478203296661377, + "learning_rate": 0.0001443795732462682, + "loss": 0.1172, + "step": 6720 + }, + { + "epoch": 175.94771241830065, + "grad_norm": 0.3981144428253174, + "learning_rate": 0.00014423134059178607, + "loss": 0.121, + "step": 6730 + }, + { + "epoch": 176.20915032679738, + "grad_norm": 0.3811021149158478, + "learning_rate": 0.00014408298701050774, + "loss": 0.1182, + "step": 6740 + }, + { + "epoch": 176.47058823529412, + "grad_norm": 0.4203794002532959, + "learning_rate": 0.00014393451290802619, + "loss": 0.1191, + "step": 6750 + }, + { + "epoch": 176.73202614379085, + "grad_norm": 0.38078659772872925, + "learning_rate": 0.000143785918690264, + "loss": 0.117, + "step": 6760 + }, + { + "epoch": 176.99346405228758, + "grad_norm": 0.3496187627315521, + "learning_rate": 0.0001436372047634721, + "loss": 0.1197, + "step": 6770 + }, + { + "epoch": 177.2549019607843, + "grad_norm": 0.3604697287082672, + "learning_rate": 0.00014348837153422864, + "loss": 0.1182, + "step": 6780 + }, + { + "epoch": 177.51633986928104, + "grad_norm": 0.3979366421699524, + "learning_rate": 0.000143339419409438, + "loss": 0.1169, + "step": 6790 + }, + { + "epoch": 177.77777777777777, + "grad_norm": 0.4077358543872833, + "learning_rate": 0.00014319034879632962, + "loss": 0.1189, + "step": 6800 + }, + { + "epoch": 178.0392156862745, + "grad_norm": 0.377392053604126, + "learning_rate": 0.00014304116010245685, + "loss": 0.1205, + "step": 6810 + }, + { + "epoch": 178.30065359477123, + "grad_norm": 0.9518114924430847, + "learning_rate": 0.00014289185373569585, + "loss": 0.1185, + "step": 6820 + }, + { + "epoch": 178.56209150326796, + "grad_norm": 0.4851488173007965, + "learning_rate": 0.00014274243010424457, + "loss": 0.1222, + "step": 6830 + }, + { + "epoch": 178.8235294117647, + "grad_norm": 0.4688913822174072, + "learning_rate": 0.00014259288961662153, + "loss": 0.1211, + "step": 6840 + }, + { + "epoch": 179.08496732026143, + "grad_norm": 0.4016769528388977, + "learning_rate": 0.00014244323268166467, + "loss": 0.1216, + "step": 6850 + }, + { + "epoch": 179.34640522875816, + "grad_norm": 0.661056637763977, + "learning_rate": 0.00014229345970853032, + "loss": 0.1203, + "step": 6860 + }, + { + "epoch": 179.6078431372549, + "grad_norm": 0.8823976516723633, + "learning_rate": 0.00014214357110669211, + "loss": 0.1218, + "step": 6870 + }, + { + "epoch": 179.86928104575162, + "grad_norm": 0.6820003390312195, + "learning_rate": 0.00014199356728593977, + "loss": 0.1225, + "step": 6880 + }, + { + "epoch": 180.13071895424838, + "grad_norm": 1.0853325128555298, + "learning_rate": 0.000141843448656378, + "loss": 0.1259, + "step": 6890 + }, + { + "epoch": 180.3921568627451, + "grad_norm": 0.9575023055076599, + "learning_rate": 0.00014169321562842535, + "loss": 0.1179, + "step": 6900 + }, + { + "epoch": 180.65359477124184, + "grad_norm": 0.4306863248348236, + "learning_rate": 0.00014154286861281325, + "loss": 0.124, + "step": 6910 + }, + { + "epoch": 180.91503267973857, + "grad_norm": 0.48212000727653503, + "learning_rate": 0.00014139240802058464, + "loss": 0.1216, + "step": 6920 + }, + { + "epoch": 181.1764705882353, + "grad_norm": 0.7190878391265869, + "learning_rate": 0.0001412418342630931, + "loss": 0.121, + "step": 6930 + }, + { + "epoch": 181.43790849673204, + "grad_norm": 0.6326178908348083, + "learning_rate": 0.0001410911477520015, + "loss": 0.122, + "step": 6940 + }, + { + "epoch": 181.69934640522877, + "grad_norm": 0.6375879645347595, + "learning_rate": 0.000140940348899281, + "loss": 0.1231, + "step": 6950 + }, + { + "epoch": 181.9607843137255, + "grad_norm": 0.8026982545852661, + "learning_rate": 0.0001407894381172099, + "loss": 0.1278, + "step": 6960 + }, + { + "epoch": 182.22222222222223, + "grad_norm": 0.8561803102493286, + "learning_rate": 0.00014063841581837255, + "loss": 0.1169, + "step": 6970 + }, + { + "epoch": 182.48366013071896, + "grad_norm": 0.6810368895530701, + "learning_rate": 0.00014048728241565812, + "loss": 0.1255, + "step": 6980 + }, + { + "epoch": 182.7450980392157, + "grad_norm": 0.8623539209365845, + "learning_rate": 0.00014033603832225956, + "loss": 0.1241, + "step": 6990 + }, + { + "epoch": 183.00653594771242, + "grad_norm": 0.5568687915802002, + "learning_rate": 0.00014018468395167246, + "loss": 0.1291, + "step": 7000 + }, + { + "epoch": 183.26797385620915, + "grad_norm": 0.7413952946662903, + "learning_rate": 0.00014003321971769385, + "loss": 0.1186, + "step": 7010 + }, + { + "epoch": 183.52941176470588, + "grad_norm": 0.39381900429725647, + "learning_rate": 0.00013988164603442126, + "loss": 0.1181, + "step": 7020 + }, + { + "epoch": 183.79084967320262, + "grad_norm": 0.8067420721054077, + "learning_rate": 0.00013972996331625126, + "loss": 0.1233, + "step": 7030 + }, + { + "epoch": 184.05228758169935, + "grad_norm": 0.4819212257862091, + "learning_rate": 0.00013957817197787865, + "loss": 0.1228, + "step": 7040 + }, + { + "epoch": 184.31372549019608, + "grad_norm": 0.5741551518440247, + "learning_rate": 0.00013942627243429512, + "loss": 0.1186, + "step": 7050 + }, + { + "epoch": 184.5751633986928, + "grad_norm": 0.44853687286376953, + "learning_rate": 0.0001392742651007882, + "loss": 0.1195, + "step": 7060 + }, + { + "epoch": 184.83660130718954, + "grad_norm": 0.39852526783943176, + "learning_rate": 0.00013912215039294028, + "loss": 0.1197, + "step": 7070 + }, + { + "epoch": 185.09803921568627, + "grad_norm": 0.3141544461250305, + "learning_rate": 0.000138969928726627, + "loss": 0.1196, + "step": 7080 + }, + { + "epoch": 185.359477124183, + "grad_norm": 0.40422919392585754, + "learning_rate": 0.00013881760051801667, + "loss": 0.117, + "step": 7090 + }, + { + "epoch": 185.62091503267973, + "grad_norm": 0.38428983092308044, + "learning_rate": 0.00013866516618356875, + "loss": 0.1165, + "step": 7100 + }, + { + "epoch": 185.88235294117646, + "grad_norm": 0.42969101667404175, + "learning_rate": 0.00013851262614003292, + "loss": 0.1211, + "step": 7110 + }, + { + "epoch": 186.1437908496732, + "grad_norm": 0.35507911443710327, + "learning_rate": 0.0001383599808044478, + "loss": 0.1167, + "step": 7120 + }, + { + "epoch": 186.40522875816993, + "grad_norm": 0.3685397207736969, + "learning_rate": 0.00013820723059413995, + "loss": 0.1169, + "step": 7130 + }, + { + "epoch": 186.66666666666666, + "grad_norm": 0.4494422972202301, + "learning_rate": 0.00013805437592672262, + "loss": 0.1144, + "step": 7140 + }, + { + "epoch": 186.9281045751634, + "grad_norm": 0.5370398759841919, + "learning_rate": 0.00013790141722009458, + "loss": 0.1222, + "step": 7150 + }, + { + "epoch": 187.18954248366012, + "grad_norm": 0.4284844994544983, + "learning_rate": 0.00013774835489243912, + "loss": 0.1185, + "step": 7160 + }, + { + "epoch": 187.45098039215685, + "grad_norm": 0.5446126461029053, + "learning_rate": 0.0001375951893622228, + "loss": 0.1202, + "step": 7170 + }, + { + "epoch": 187.71241830065358, + "grad_norm": 0.388539582490921, + "learning_rate": 0.00013744192104819437, + "loss": 0.118, + "step": 7180 + }, + { + "epoch": 187.9738562091503, + "grad_norm": 0.3365419805049896, + "learning_rate": 0.00013728855036938348, + "loss": 0.1169, + "step": 7190 + }, + { + "epoch": 188.23529411764707, + "grad_norm": 0.36737489700317383, + "learning_rate": 0.00013713507774509973, + "loss": 0.1145, + "step": 7200 + }, + { + "epoch": 188.4967320261438, + "grad_norm": 0.44414252042770386, + "learning_rate": 0.00013698150359493143, + "loss": 0.1166, + "step": 7210 + }, + { + "epoch": 188.75816993464053, + "grad_norm": 0.3925827443599701, + "learning_rate": 0.00013682782833874442, + "loss": 0.1198, + "step": 7220 + }, + { + "epoch": 189.01960784313727, + "grad_norm": 0.46063002943992615, + "learning_rate": 0.00013667405239668106, + "loss": 0.1211, + "step": 7230 + }, + { + "epoch": 189.281045751634, + "grad_norm": 0.33849823474884033, + "learning_rate": 0.0001365201761891588, + "loss": 0.1114, + "step": 7240 + }, + { + "epoch": 189.54248366013073, + "grad_norm": 0.4201454222202301, + "learning_rate": 0.00013636620013686936, + "loss": 0.1176, + "step": 7250 + }, + { + "epoch": 189.80392156862746, + "grad_norm": 0.3787872791290283, + "learning_rate": 0.00013621212466077736, + "loss": 0.1181, + "step": 7260 + }, + { + "epoch": 190.0653594771242, + "grad_norm": 0.3665624260902405, + "learning_rate": 0.00013605795018211932, + "loss": 0.1196, + "step": 7270 + }, + { + "epoch": 190.32679738562092, + "grad_norm": 0.3760620355606079, + "learning_rate": 0.0001359036771224024, + "loss": 0.1164, + "step": 7280 + }, + { + "epoch": 190.58823529411765, + "grad_norm": 0.4009071886539459, + "learning_rate": 0.00013574930590340314, + "loss": 0.1208, + "step": 7290 + }, + { + "epoch": 190.84967320261438, + "grad_norm": 0.3013065755367279, + "learning_rate": 0.00013559483694716663, + "loss": 0.119, + "step": 7300 + }, + { + "epoch": 191.11111111111111, + "grad_norm": 0.3610047996044159, + "learning_rate": 0.00013544027067600512, + "loss": 0.115, + "step": 7310 + }, + { + "epoch": 191.37254901960785, + "grad_norm": 0.3881337344646454, + "learning_rate": 0.00013528560751249687, + "loss": 0.1157, + "step": 7320 + }, + { + "epoch": 191.63398692810458, + "grad_norm": 0.36018821597099304, + "learning_rate": 0.00013513084787948504, + "loss": 0.1158, + "step": 7330 + }, + { + "epoch": 191.8954248366013, + "grad_norm": 0.4123559296131134, + "learning_rate": 0.00013497599220007656, + "loss": 0.1193, + "step": 7340 + }, + { + "epoch": 192.15686274509804, + "grad_norm": 0.35036417841911316, + "learning_rate": 0.00013482104089764096, + "loss": 0.117, + "step": 7350 + }, + { + "epoch": 192.41830065359477, + "grad_norm": 0.4409710168838501, + "learning_rate": 0.0001346659943958092, + "loss": 0.1147, + "step": 7360 + }, + { + "epoch": 192.6797385620915, + "grad_norm": 0.3965952694416046, + "learning_rate": 0.0001345108531184725, + "loss": 0.1178, + "step": 7370 + }, + { + "epoch": 192.94117647058823, + "grad_norm": 0.34038016200065613, + "learning_rate": 0.00013435561748978113, + "loss": 0.1216, + "step": 7380 + }, + { + "epoch": 193.20261437908496, + "grad_norm": 0.30114054679870605, + "learning_rate": 0.00013420028793414344, + "loss": 0.1153, + "step": 7390 + }, + { + "epoch": 193.4640522875817, + "grad_norm": 0.34901145100593567, + "learning_rate": 0.00013404486487622442, + "loss": 0.1176, + "step": 7400 + }, + { + "epoch": 193.72549019607843, + "grad_norm": 0.4058627784252167, + "learning_rate": 0.00013388934874094489, + "loss": 0.1168, + "step": 7410 + }, + { + "epoch": 193.98692810457516, + "grad_norm": 0.31916511058807373, + "learning_rate": 0.00013373373995347995, + "loss": 0.1214, + "step": 7420 + }, + { + "epoch": 194.2483660130719, + "grad_norm": 0.3582738935947418, + "learning_rate": 0.00013357803893925807, + "loss": 0.1155, + "step": 7430 + }, + { + "epoch": 194.50980392156862, + "grad_norm": 0.3602858781814575, + "learning_rate": 0.00013342224612395993, + "loss": 0.121, + "step": 7440 + }, + { + "epoch": 194.77124183006535, + "grad_norm": 0.30642154812812805, + "learning_rate": 0.0001332663619335171, + "loss": 0.1149, + "step": 7450 + }, + { + "epoch": 195.03267973856208, + "grad_norm": 0.32698705792427063, + "learning_rate": 0.00013311038679411104, + "loss": 0.1184, + "step": 7460 + }, + { + "epoch": 195.2941176470588, + "grad_norm": 0.34545981884002686, + "learning_rate": 0.00013295432113217176, + "loss": 0.1139, + "step": 7470 + }, + { + "epoch": 195.55555555555554, + "grad_norm": 0.2823973298072815, + "learning_rate": 0.00013279816537437687, + "loss": 0.1156, + "step": 7480 + }, + { + "epoch": 195.81699346405227, + "grad_norm": 0.3787044882774353, + "learning_rate": 0.00013264191994765028, + "loss": 0.1199, + "step": 7490 + }, + { + "epoch": 196.07843137254903, + "grad_norm": 0.2924850583076477, + "learning_rate": 0.00013248558527916094, + "loss": 0.1169, + "step": 7500 + }, + { + "epoch": 196.33986928104576, + "grad_norm": 0.307283490896225, + "learning_rate": 0.00013232916179632193, + "loss": 0.1156, + "step": 7510 + }, + { + "epoch": 196.6013071895425, + "grad_norm": 0.3378380835056305, + "learning_rate": 0.00013217264992678907, + "loss": 0.1156, + "step": 7520 + }, + { + "epoch": 196.86274509803923, + "grad_norm": 0.3563665449619293, + "learning_rate": 0.00013201605009845977, + "loss": 0.1197, + "step": 7530 + }, + { + "epoch": 197.12418300653596, + "grad_norm": 0.32628506422042847, + "learning_rate": 0.00013185936273947207, + "loss": 0.118, + "step": 7540 + }, + { + "epoch": 197.3856209150327, + "grad_norm": 0.4200800657272339, + "learning_rate": 0.00013170258827820318, + "loss": 0.1185, + "step": 7550 + }, + { + "epoch": 197.64705882352942, + "grad_norm": 0.39217332005500793, + "learning_rate": 0.00013154572714326848, + "loss": 0.1141, + "step": 7560 + }, + { + "epoch": 197.90849673202615, + "grad_norm": 0.37061387300491333, + "learning_rate": 0.00013138877976352035, + "loss": 0.1179, + "step": 7570 + }, + { + "epoch": 198.16993464052288, + "grad_norm": 0.311758816242218, + "learning_rate": 0.00013123174656804693, + "loss": 0.1191, + "step": 7580 + }, + { + "epoch": 198.4313725490196, + "grad_norm": 0.35725489258766174, + "learning_rate": 0.00013107462798617097, + "loss": 0.1135, + "step": 7590 + }, + { + "epoch": 198.69281045751634, + "grad_norm": 0.35550937056541443, + "learning_rate": 0.0001309174244474487, + "loss": 0.117, + "step": 7600 + }, + { + "epoch": 198.95424836601308, + "grad_norm": 0.3795972466468811, + "learning_rate": 0.00013076013638166852, + "loss": 0.1215, + "step": 7610 + }, + { + "epoch": 199.2156862745098, + "grad_norm": 0.3138248026371002, + "learning_rate": 0.0001306027642188501, + "loss": 0.1133, + "step": 7620 + }, + { + "epoch": 199.47712418300654, + "grad_norm": 0.4113026559352875, + "learning_rate": 0.00013044530838924283, + "loss": 0.1178, + "step": 7630 + }, + { + "epoch": 199.73856209150327, + "grad_norm": 0.3749026358127594, + "learning_rate": 0.00013028776932332497, + "loss": 0.1183, + "step": 7640 + }, + { + "epoch": 200.0, + "grad_norm": 0.4342474043369293, + "learning_rate": 0.00013013014745180237, + "loss": 0.1204, + "step": 7650 + }, + { + "epoch": 200.26143790849673, + "grad_norm": 0.348385214805603, + "learning_rate": 0.0001299724432056071, + "loss": 0.1152, + "step": 7660 + }, + { + "epoch": 200.52287581699346, + "grad_norm": 0.4169091284275055, + "learning_rate": 0.00012981465701589664, + "loss": 0.1179, + "step": 7670 + }, + { + "epoch": 200.7843137254902, + "grad_norm": 0.36309823393821716, + "learning_rate": 0.00012965678931405232, + "loss": 0.1173, + "step": 7680 + }, + { + "epoch": 201.04575163398692, + "grad_norm": 0.3717479705810547, + "learning_rate": 0.00012949884053167846, + "loss": 0.1168, + "step": 7690 + }, + { + "epoch": 201.30718954248366, + "grad_norm": 0.3208405673503876, + "learning_rate": 0.00012934081110060105, + "loss": 0.1159, + "step": 7700 + }, + { + "epoch": 201.5686274509804, + "grad_norm": 0.348286509513855, + "learning_rate": 0.00012918270145286642, + "loss": 0.1153, + "step": 7710 + }, + { + "epoch": 201.83006535947712, + "grad_norm": 0.418100506067276, + "learning_rate": 0.00012902451202074038, + "loss": 0.1165, + "step": 7720 + }, + { + "epoch": 202.09150326797385, + "grad_norm": 0.3307877480983734, + "learning_rate": 0.00012886624323670676, + "loss": 0.121, + "step": 7730 + }, + { + "epoch": 202.35294117647058, + "grad_norm": 0.45300108194351196, + "learning_rate": 0.0001287078955334664, + "loss": 0.1147, + "step": 7740 + }, + { + "epoch": 202.6143790849673, + "grad_norm": 0.37652915716171265, + "learning_rate": 0.00012854946934393586, + "loss": 0.1165, + "step": 7750 + }, + { + "epoch": 202.87581699346404, + "grad_norm": 0.35546672344207764, + "learning_rate": 0.0001283909651012463, + "loss": 0.1186, + "step": 7760 + }, + { + "epoch": 203.13725490196077, + "grad_norm": 0.3252201974391937, + "learning_rate": 0.00012823238323874224, + "loss": 0.1182, + "step": 7770 + }, + { + "epoch": 203.3986928104575, + "grad_norm": 0.36323535442352295, + "learning_rate": 0.00012807372418998045, + "loss": 0.1143, + "step": 7780 + }, + { + "epoch": 203.66013071895424, + "grad_norm": 0.37268149852752686, + "learning_rate": 0.00012791498838872874, + "loss": 0.1175, + "step": 7790 + }, + { + "epoch": 203.92156862745097, + "grad_norm": 0.3726952075958252, + "learning_rate": 0.00012775617626896468, + "loss": 0.1182, + "step": 7800 + }, + { + "epoch": 204.18300653594773, + "grad_norm": 0.34012719988822937, + "learning_rate": 0.0001275972882648746, + "loss": 0.1178, + "step": 7810 + }, + { + "epoch": 204.44444444444446, + "grad_norm": 0.3655106723308563, + "learning_rate": 0.0001274383248108522, + "loss": 0.1148, + "step": 7820 + }, + { + "epoch": 204.7058823529412, + "grad_norm": 0.3548455238342285, + "learning_rate": 0.00012727928634149744, + "loss": 0.1191, + "step": 7830 + }, + { + "epoch": 204.96732026143792, + "grad_norm": 0.3439328670501709, + "learning_rate": 0.00012712017329161553, + "loss": 0.1176, + "step": 7840 + }, + { + "epoch": 205.22875816993465, + "grad_norm": 0.3686840534210205, + "learning_rate": 0.00012696098609621542, + "loss": 0.1167, + "step": 7850 + }, + { + "epoch": 205.49019607843138, + "grad_norm": 0.39468914270401, + "learning_rate": 0.00012680172519050883, + "loss": 0.1162, + "step": 7860 + }, + { + "epoch": 205.7516339869281, + "grad_norm": 0.31318965554237366, + "learning_rate": 0.00012664239100990897, + "loss": 0.1176, + "step": 7870 + }, + { + "epoch": 206.01307189542484, + "grad_norm": 0.3187226355075836, + "learning_rate": 0.00012648298399002946, + "loss": 0.1186, + "step": 7880 + }, + { + "epoch": 206.27450980392157, + "grad_norm": 0.350243479013443, + "learning_rate": 0.0001263235045666829, + "loss": 0.1142, + "step": 7890 + }, + { + "epoch": 206.5359477124183, + "grad_norm": 0.3626524806022644, + "learning_rate": 0.00012616395317588007, + "loss": 0.1152, + "step": 7900 + }, + { + "epoch": 206.79738562091504, + "grad_norm": 0.3574896454811096, + "learning_rate": 0.00012600433025382833, + "loss": 0.1173, + "step": 7910 + }, + { + "epoch": 207.05882352941177, + "grad_norm": 0.3026224374771118, + "learning_rate": 0.00012584463623693064, + "loss": 0.1206, + "step": 7920 + }, + { + "epoch": 207.3202614379085, + "grad_norm": 0.3898176848888397, + "learning_rate": 0.00012568487156178434, + "loss": 0.1132, + "step": 7930 + }, + { + "epoch": 207.58169934640523, + "grad_norm": 0.3034379780292511, + "learning_rate": 0.00012552503666517998, + "loss": 0.1162, + "step": 7940 + }, + { + "epoch": 207.84313725490196, + "grad_norm": 0.38479849696159363, + "learning_rate": 0.00012536513198410006, + "loss": 0.1191, + "step": 7950 + }, + { + "epoch": 208.1045751633987, + "grad_norm": 0.4212307631969452, + "learning_rate": 0.00012520515795571785, + "loss": 0.1182, + "step": 7960 + }, + { + "epoch": 208.36601307189542, + "grad_norm": 0.44150879979133606, + "learning_rate": 0.00012504511501739622, + "loss": 0.1161, + "step": 7970 + }, + { + "epoch": 208.62745098039215, + "grad_norm": 0.49862542748451233, + "learning_rate": 0.0001248850036066865, + "loss": 0.1186, + "step": 7980 + }, + { + "epoch": 208.88888888888889, + "grad_norm": 0.37723541259765625, + "learning_rate": 0.00012472482416132712, + "loss": 0.1159, + "step": 7990 + }, + { + "epoch": 209.15032679738562, + "grad_norm": 0.39432400465011597, + "learning_rate": 0.00012456457711924266, + "loss": 0.1139, + "step": 8000 + }, + { + "epoch": 209.41176470588235, + "grad_norm": 0.320744127035141, + "learning_rate": 0.0001244042629185423, + "loss": 0.1138, + "step": 8010 + }, + { + "epoch": 209.67320261437908, + "grad_norm": 0.37809300422668457, + "learning_rate": 0.00012424388199751903, + "loss": 0.1164, + "step": 8020 + }, + { + "epoch": 209.9346405228758, + "grad_norm": 0.3541184961795807, + "learning_rate": 0.0001240834347946481, + "loss": 0.1208, + "step": 8030 + }, + { + "epoch": 210.19607843137254, + "grad_norm": 0.38199830055236816, + "learning_rate": 0.00012392292174858606, + "loss": 0.1171, + "step": 8040 + }, + { + "epoch": 210.45751633986927, + "grad_norm": 0.30296802520751953, + "learning_rate": 0.00012376234329816949, + "loss": 0.1173, + "step": 8050 + }, + { + "epoch": 210.718954248366, + "grad_norm": 0.38834238052368164, + "learning_rate": 0.00012360169988241367, + "loss": 0.1184, + "step": 8060 + }, + { + "epoch": 210.98039215686273, + "grad_norm": 0.3523692190647125, + "learning_rate": 0.0001234409919405116, + "loss": 0.1163, + "step": 8070 + }, + { + "epoch": 211.24183006535947, + "grad_norm": 0.31442514061927795, + "learning_rate": 0.0001232802199118327, + "loss": 0.1154, + "step": 8080 + }, + { + "epoch": 211.5032679738562, + "grad_norm": 0.365710586309433, + "learning_rate": 0.00012311938423592152, + "loss": 0.1143, + "step": 8090 + }, + { + "epoch": 211.76470588235293, + "grad_norm": 0.36913615465164185, + "learning_rate": 0.00012295848535249658, + "loss": 0.1169, + "step": 8100 + }, + { + "epoch": 212.0261437908497, + "grad_norm": 0.2978919744491577, + "learning_rate": 0.0001227975237014494, + "loss": 0.1208, + "step": 8110 + }, + { + "epoch": 212.28758169934642, + "grad_norm": 0.3304121494293213, + "learning_rate": 0.00012263649972284294, + "loss": 0.1137, + "step": 8120 + }, + { + "epoch": 212.54901960784315, + "grad_norm": 0.3513433337211609, + "learning_rate": 0.00012247541385691058, + "loss": 0.1198, + "step": 8130 + }, + { + "epoch": 212.81045751633988, + "grad_norm": 0.4109828472137451, + "learning_rate": 0.000122314266544055, + "loss": 0.1185, + "step": 8140 + }, + { + "epoch": 213.0718954248366, + "grad_norm": 0.2574869692325592, + "learning_rate": 0.00012215305822484672, + "loss": 0.1133, + "step": 8150 + }, + { + "epoch": 213.33333333333334, + "grad_norm": 0.307307630777359, + "learning_rate": 0.00012199178934002317, + "loss": 0.1153, + "step": 8160 + }, + { + "epoch": 213.59477124183007, + "grad_norm": 0.3619244396686554, + "learning_rate": 0.00012183046033048736, + "loss": 0.1173, + "step": 8170 + }, + { + "epoch": 213.8562091503268, + "grad_norm": 0.39120060205459595, + "learning_rate": 0.00012166907163730656, + "loss": 0.1166, + "step": 8180 + }, + { + "epoch": 214.11764705882354, + "grad_norm": 0.3365266025066376, + "learning_rate": 0.00012150762370171136, + "loss": 0.1201, + "step": 8190 + }, + { + "epoch": 214.37908496732027, + "grad_norm": 0.4538247883319855, + "learning_rate": 0.00012134611696509419, + "loss": 0.1157, + "step": 8200 + }, + { + "epoch": 214.640522875817, + "grad_norm": 0.3183276951313019, + "learning_rate": 0.00012118455186900836, + "loss": 0.1128, + "step": 8210 + }, + { + "epoch": 214.90196078431373, + "grad_norm": 0.8887193202972412, + "learning_rate": 0.00012102292885516666, + "loss": 0.1214, + "step": 8220 + }, + { + "epoch": 215.16339869281046, + "grad_norm": 0.3512011766433716, + "learning_rate": 0.00012086124836544024, + "loss": 0.1171, + "step": 8230 + }, + { + "epoch": 215.4248366013072, + "grad_norm": 0.3707999289035797, + "learning_rate": 0.00012069951084185733, + "loss": 0.1164, + "step": 8240 + }, + { + "epoch": 215.68627450980392, + "grad_norm": 0.3448478877544403, + "learning_rate": 0.00012053771672660221, + "loss": 0.1188, + "step": 8250 + }, + { + "epoch": 215.94771241830065, + "grad_norm": 0.3120673596858978, + "learning_rate": 0.00012037586646201378, + "loss": 0.1177, + "step": 8260 + }, + { + "epoch": 216.20915032679738, + "grad_norm": 0.46193230152130127, + "learning_rate": 0.00012021396049058451, + "loss": 0.1163, + "step": 8270 + }, + { + "epoch": 216.47058823529412, + "grad_norm": 0.34690043330192566, + "learning_rate": 0.00012005199925495914, + "loss": 0.1157, + "step": 8280 + }, + { + "epoch": 216.73202614379085, + "grad_norm": 0.4091133177280426, + "learning_rate": 0.00011988998319793346, + "loss": 0.1175, + "step": 8290 + }, + { + "epoch": 216.99346405228758, + "grad_norm": 0.32104718685150146, + "learning_rate": 0.00011972791276245321, + "loss": 0.1182, + "step": 8300 + }, + { + "epoch": 217.2549019607843, + "grad_norm": 0.3396114408969879, + "learning_rate": 0.00011956578839161279, + "loss": 0.1178, + "step": 8310 + }, + { + "epoch": 217.51633986928104, + "grad_norm": 0.39414018392562866, + "learning_rate": 0.00011940361052865401, + "loss": 0.1184, + "step": 8320 + }, + { + "epoch": 217.77777777777777, + "grad_norm": 0.3328598141670227, + "learning_rate": 0.000119241379616965, + "loss": 0.114, + "step": 8330 + }, + { + "epoch": 218.0392156862745, + "grad_norm": 0.334634929895401, + "learning_rate": 0.00011907909610007884, + "loss": 0.116, + "step": 8340 + }, + { + "epoch": 218.30065359477123, + "grad_norm": 0.32335221767425537, + "learning_rate": 0.00011891676042167246, + "loss": 0.1176, + "step": 8350 + }, + { + "epoch": 218.56209150326796, + "grad_norm": 0.2827315926551819, + "learning_rate": 0.00011875437302556543, + "loss": 0.1162, + "step": 8360 + }, + { + "epoch": 218.8235294117647, + "grad_norm": 0.3528691828250885, + "learning_rate": 0.0001185919343557187, + "loss": 0.1164, + "step": 8370 + }, + { + "epoch": 219.08496732026143, + "grad_norm": 0.28620168566703796, + "learning_rate": 0.00011842944485623335, + "loss": 0.1172, + "step": 8380 + }, + { + "epoch": 219.34640522875816, + "grad_norm": 0.38785773515701294, + "learning_rate": 0.0001182669049713495, + "loss": 0.1156, + "step": 8390 + }, + { + "epoch": 219.6078431372549, + "grad_norm": 0.30017638206481934, + "learning_rate": 0.00011810431514544496, + "loss": 0.1149, + "step": 8400 + }, + { + "epoch": 219.86928104575162, + "grad_norm": 0.3550892770290375, + "learning_rate": 0.00011794167582303412, + "loss": 0.1184, + "step": 8410 + }, + { + "epoch": 220.13071895424838, + "grad_norm": 0.2966924011707306, + "learning_rate": 0.00011777898744876673, + "loss": 0.1166, + "step": 8420 + }, + { + "epoch": 220.3921568627451, + "grad_norm": 0.2973492443561554, + "learning_rate": 0.00011761625046742651, + "loss": 0.1152, + "step": 8430 + }, + { + "epoch": 220.65359477124184, + "grad_norm": 0.3460974395275116, + "learning_rate": 0.00011745346532393017, + "loss": 0.115, + "step": 8440 + }, + { + "epoch": 220.91503267973857, + "grad_norm": 0.3842596113681793, + "learning_rate": 0.0001172906324633261, + "loss": 0.118, + "step": 8450 + }, + { + "epoch": 221.1764705882353, + "grad_norm": 0.3122817575931549, + "learning_rate": 0.00011712775233079311, + "loss": 0.1159, + "step": 8460 + }, + { + "epoch": 221.43790849673204, + "grad_norm": 0.3846839368343353, + "learning_rate": 0.00011696482537163933, + "loss": 0.1144, + "step": 8470 + }, + { + "epoch": 221.69934640522877, + "grad_norm": 0.3940717875957489, + "learning_rate": 0.00011680185203130075, + "loss": 0.1156, + "step": 8480 + }, + { + "epoch": 221.9607843137255, + "grad_norm": 0.35572728514671326, + "learning_rate": 0.00011663883275534029, + "loss": 0.1221, + "step": 8490 + }, + { + "epoch": 222.22222222222223, + "grad_norm": 0.2821555733680725, + "learning_rate": 0.0001164757679894464, + "loss": 0.1159, + "step": 8500 + }, + { + "epoch": 222.48366013071896, + "grad_norm": 0.4151257574558258, + "learning_rate": 0.00011631265817943198, + "loss": 0.1136, + "step": 8510 + }, + { + "epoch": 222.7450980392157, + "grad_norm": 0.3188461661338806, + "learning_rate": 0.000116149503771233, + "loss": 0.1156, + "step": 8520 + }, + { + "epoch": 223.00653594771242, + "grad_norm": 0.3545646667480469, + "learning_rate": 0.00011598630521090734, + "loss": 0.1183, + "step": 8530 + }, + { + "epoch": 223.26797385620915, + "grad_norm": 0.3330908417701721, + "learning_rate": 0.00011582306294463372, + "loss": 0.1144, + "step": 8540 + }, + { + "epoch": 223.52941176470588, + "grad_norm": 0.3198162913322449, + "learning_rate": 0.00011565977741871018, + "loss": 0.1182, + "step": 8550 + }, + { + "epoch": 223.79084967320262, + "grad_norm": 0.38695403933525085, + "learning_rate": 0.00011549644907955315, + "loss": 0.1156, + "step": 8560 + }, + { + "epoch": 224.05228758169935, + "grad_norm": 0.3442593514919281, + "learning_rate": 0.00011533307837369607, + "loss": 0.1191, + "step": 8570 + }, + { + "epoch": 224.31372549019608, + "grad_norm": 0.3656689524650574, + "learning_rate": 0.00011516966574778822, + "loss": 0.1154, + "step": 8580 + }, + { + "epoch": 224.5751633986928, + "grad_norm": 0.3869684636592865, + "learning_rate": 0.00011500621164859347, + "loss": 0.1176, + "step": 8590 + }, + { + "epoch": 224.83660130718954, + "grad_norm": 0.36732858419418335, + "learning_rate": 0.00011484271652298906, + "loss": 0.1121, + "step": 8600 + }, + { + "epoch": 225.09803921568627, + "grad_norm": 0.35245758295059204, + "learning_rate": 0.00011467918081796445, + "loss": 0.1189, + "step": 8610 + }, + { + "epoch": 225.359477124183, + "grad_norm": 0.3855646848678589, + "learning_rate": 0.00011451560498062, + "loss": 0.1169, + "step": 8620 + }, + { + "epoch": 225.62091503267973, + "grad_norm": 0.28450724482536316, + "learning_rate": 0.00011435198945816584, + "loss": 0.1142, + "step": 8630 + }, + { + "epoch": 225.88235294117646, + "grad_norm": 0.3665756583213806, + "learning_rate": 0.00011418833469792047, + "loss": 0.1202, + "step": 8640 + }, + { + "epoch": 226.1437908496732, + "grad_norm": 0.23904922604560852, + "learning_rate": 0.00011402464114730989, + "loss": 0.1133, + "step": 8650 + }, + { + "epoch": 226.40522875816993, + "grad_norm": 0.3275870382785797, + "learning_rate": 0.0001138609092538659, + "loss": 0.1138, + "step": 8660 + }, + { + "epoch": 226.66666666666666, + "grad_norm": 0.3351685404777527, + "learning_rate": 0.00011369713946522532, + "loss": 0.1143, + "step": 8670 + }, + { + "epoch": 226.9281045751634, + "grad_norm": 0.29419705271720886, + "learning_rate": 0.00011353333222912843, + "loss": 0.1198, + "step": 8680 + }, + { + "epoch": 227.18954248366012, + "grad_norm": 0.3791927993297577, + "learning_rate": 0.00011336948799341798, + "loss": 0.116, + "step": 8690 + }, + { + "epoch": 227.45098039215685, + "grad_norm": 0.487420916557312, + "learning_rate": 0.00011320560720603792, + "loss": 0.1183, + "step": 8700 + }, + { + "epoch": 227.71241830065358, + "grad_norm": 0.2873092591762543, + "learning_rate": 0.00011304169031503197, + "loss": 0.1146, + "step": 8710 + }, + { + "epoch": 227.9738562091503, + "grad_norm": 0.3828370273113251, + "learning_rate": 0.00011287773776854273, + "loss": 0.1202, + "step": 8720 + }, + { + "epoch": 228.23529411764707, + "grad_norm": 0.3244374990463257, + "learning_rate": 0.00011271375001481015, + "loss": 0.1141, + "step": 8730 + }, + { + "epoch": 228.4967320261438, + "grad_norm": 0.3474271893501282, + "learning_rate": 0.0001125497275021705, + "loss": 0.1144, + "step": 8740 + }, + { + "epoch": 228.75816993464053, + "grad_norm": 0.43531376123428345, + "learning_rate": 0.00011238567067905507, + "loss": 0.1183, + "step": 8750 + }, + { + "epoch": 229.01960784313727, + "grad_norm": 0.27749860286712646, + "learning_rate": 0.00011222157999398895, + "loss": 0.1166, + "step": 8760 + }, + { + "epoch": 229.281045751634, + "grad_norm": 0.35005658864974976, + "learning_rate": 0.00011205745589558982, + "loss": 0.1135, + "step": 8770 + }, + { + "epoch": 229.54248366013073, + "grad_norm": 0.3147236704826355, + "learning_rate": 0.00011189329883256668, + "loss": 0.1161, + "step": 8780 + }, + { + "epoch": 229.80392156862746, + "grad_norm": 0.3600068688392639, + "learning_rate": 0.00011172910925371865, + "loss": 0.121, + "step": 8790 + }, + { + "epoch": 230.0653594771242, + "grad_norm": 0.2795291543006897, + "learning_rate": 0.00011156488760793383, + "loss": 0.1145, + "step": 8800 + }, + { + "epoch": 230.32679738562092, + "grad_norm": 0.33064013719558716, + "learning_rate": 0.00011140063434418788, + "loss": 0.1133, + "step": 8810 + }, + { + "epoch": 230.58823529411765, + "grad_norm": 0.38541939854621887, + "learning_rate": 0.00011123634991154294, + "loss": 0.1172, + "step": 8820 + }, + { + "epoch": 230.84967320261438, + "grad_norm": 0.4045771062374115, + "learning_rate": 0.00011107203475914643, + "loss": 0.1183, + "step": 8830 + }, + { + "epoch": 231.11111111111111, + "grad_norm": 0.2556995153427124, + "learning_rate": 0.00011090768933622966, + "loss": 0.1141, + "step": 8840 + }, + { + "epoch": 231.37254901960785, + "grad_norm": 0.2857918441295624, + "learning_rate": 0.00011074331409210677, + "loss": 0.1128, + "step": 8850 + }, + { + "epoch": 231.63398692810458, + "grad_norm": 0.4650552570819855, + "learning_rate": 0.00011057890947617338, + "loss": 0.1198, + "step": 8860 + }, + { + "epoch": 231.8954248366013, + "grad_norm": 0.31443580985069275, + "learning_rate": 0.00011041447593790544, + "loss": 0.1166, + "step": 8870 + }, + { + "epoch": 232.15686274509804, + "grad_norm": 0.32467028498649597, + "learning_rate": 0.00011025001392685794, + "loss": 0.1167, + "step": 8880 + }, + { + "epoch": 232.41830065359477, + "grad_norm": 0.2936582565307617, + "learning_rate": 0.00011008552389266376, + "loss": 0.1145, + "step": 8890 + }, + { + "epoch": 232.6797385620915, + "grad_norm": 0.31260111927986145, + "learning_rate": 0.0001099210062850324, + "loss": 0.1147, + "step": 8900 + }, + { + "epoch": 232.94117647058823, + "grad_norm": 0.2891233265399933, + "learning_rate": 0.00010975646155374868, + "loss": 0.1194, + "step": 8910 + }, + { + "epoch": 233.20261437908496, + "grad_norm": 0.3409523367881775, + "learning_rate": 0.00010959189014867161, + "loss": 0.1156, + "step": 8920 + }, + { + "epoch": 233.4640522875817, + "grad_norm": 0.29765674471855164, + "learning_rate": 0.00010942729251973313, + "loss": 0.1151, + "step": 8930 + }, + { + "epoch": 233.72549019607843, + "grad_norm": 0.3181908428668976, + "learning_rate": 0.0001092626691169369, + "loss": 0.1181, + "step": 8940 + }, + { + "epoch": 233.98692810457516, + "grad_norm": 0.3303356170654297, + "learning_rate": 0.00010909802039035701, + "loss": 0.1163, + "step": 8950 + }, + { + "epoch": 234.2483660130719, + "grad_norm": 0.3642708957195282, + "learning_rate": 0.00010893334679013676, + "loss": 0.1132, + "step": 8960 + }, + { + "epoch": 234.50980392156862, + "grad_norm": 0.36728131771087646, + "learning_rate": 0.00010876864876648751, + "loss": 0.1159, + "step": 8970 + }, + { + "epoch": 234.77124183006535, + "grad_norm": 0.34145310521125793, + "learning_rate": 0.00010860392676968736, + "loss": 0.1168, + "step": 8980 + }, + { + "epoch": 235.03267973856208, + "grad_norm": 0.34180009365081787, + "learning_rate": 0.00010843918125007995, + "loss": 0.1201, + "step": 8990 + }, + { + "epoch": 235.2941176470588, + "grad_norm": 0.3454407751560211, + "learning_rate": 0.00010827441265807328, + "loss": 0.1149, + "step": 9000 + }, + { + "epoch": 235.55555555555554, + "grad_norm": 0.3704677224159241, + "learning_rate": 0.00010810962144413834, + "loss": 0.1117, + "step": 9010 + }, + { + "epoch": 235.81699346405227, + "grad_norm": 0.3329799175262451, + "learning_rate": 0.00010794480805880804, + "loss": 0.1181, + "step": 9020 + }, + { + "epoch": 236.07843137254903, + "grad_norm": 0.3676550090312958, + "learning_rate": 0.00010777997295267588, + "loss": 0.1148, + "step": 9030 + }, + { + "epoch": 236.33986928104576, + "grad_norm": 0.34352195262908936, + "learning_rate": 0.00010761511657639474, + "loss": 0.1154, + "step": 9040 + }, + { + "epoch": 236.6013071895425, + "grad_norm": 0.3431718349456787, + "learning_rate": 0.00010745023938067569, + "loss": 0.1154, + "step": 9050 + }, + { + "epoch": 236.86274509803923, + "grad_norm": 0.37354880571365356, + "learning_rate": 0.00010728534181628665, + "loss": 0.1186, + "step": 9060 + }, + { + "epoch": 237.12418300653596, + "grad_norm": 0.2959931790828705, + "learning_rate": 0.00010712042433405125, + "loss": 0.1153, + "step": 9070 + }, + { + "epoch": 237.3856209150327, + "grad_norm": 0.3524629771709442, + "learning_rate": 0.00010695548738484762, + "loss": 0.1135, + "step": 9080 + }, + { + "epoch": 237.64705882352942, + "grad_norm": 0.37540528178215027, + "learning_rate": 0.00010679053141960709, + "loss": 0.1199, + "step": 9090 + }, + { + "epoch": 237.90849673202615, + "grad_norm": 0.4271695017814636, + "learning_rate": 0.00010662555688931295, + "loss": 0.119, + "step": 9100 + }, + { + "epoch": 238.16993464052288, + "grad_norm": 0.3042963147163391, + "learning_rate": 0.00010646056424499926, + "loss": 0.1134, + "step": 9110 + }, + { + "epoch": 238.4313725490196, + "grad_norm": 0.3255995512008667, + "learning_rate": 0.00010629555393774962, + "loss": 0.1155, + "step": 9120 + }, + { + "epoch": 238.69281045751634, + "grad_norm": 0.3537774980068207, + "learning_rate": 0.00010613052641869592, + "loss": 0.1181, + "step": 9130 + }, + { + "epoch": 238.95424836601308, + "grad_norm": 0.34757131338119507, + "learning_rate": 0.00010596548213901708, + "loss": 0.1167, + "step": 9140 + }, + { + "epoch": 239.2156862745098, + "grad_norm": 0.2638850808143616, + "learning_rate": 0.00010580042154993786, + "loss": 0.1128, + "step": 9150 + }, + { + "epoch": 239.47712418300654, + "grad_norm": 0.39982908964157104, + "learning_rate": 0.00010563534510272763, + "loss": 0.1166, + "step": 9160 + }, + { + "epoch": 239.73856209150327, + "grad_norm": 0.4114007353782654, + "learning_rate": 0.00010547025324869903, + "loss": 0.1164, + "step": 9170 + }, + { + "epoch": 240.0, + "grad_norm": 0.4774865210056305, + "learning_rate": 0.00010530514643920697, + "loss": 0.1184, + "step": 9180 + }, + { + "epoch": 240.26143790849673, + "grad_norm": 0.2834888994693756, + "learning_rate": 0.00010514002512564714, + "loss": 0.112, + "step": 9190 + }, + { + "epoch": 240.52287581699346, + "grad_norm": 0.3496600091457367, + "learning_rate": 0.00010497488975945484, + "loss": 0.1152, + "step": 9200 + }, + { + "epoch": 240.7843137254902, + "grad_norm": 0.35766535997390747, + "learning_rate": 0.00010480974079210392, + "loss": 0.1181, + "step": 9210 + }, + { + "epoch": 241.04575163398692, + "grad_norm": 0.30494368076324463, + "learning_rate": 0.00010464457867510533, + "loss": 0.1177, + "step": 9220 + }, + { + "epoch": 241.30718954248366, + "grad_norm": 0.33818888664245605, + "learning_rate": 0.00010447940386000601, + "loss": 0.1138, + "step": 9230 + }, + { + "epoch": 241.5686274509804, + "grad_norm": 0.36898425221443176, + "learning_rate": 0.00010431421679838758, + "loss": 0.1136, + "step": 9240 + }, + { + "epoch": 241.83006535947712, + "grad_norm": 0.3632810711860657, + "learning_rate": 0.00010414901794186514, + "loss": 0.1196, + "step": 9250 + }, + { + "epoch": 242.09150326797385, + "grad_norm": 0.3109024167060852, + "learning_rate": 0.00010398380774208609, + "loss": 0.1166, + "step": 9260 + }, + { + "epoch": 242.35294117647058, + "grad_norm": 0.29130157828330994, + "learning_rate": 0.00010381858665072878, + "loss": 0.1143, + "step": 9270 + }, + { + "epoch": 242.6143790849673, + "grad_norm": 0.32316336035728455, + "learning_rate": 0.00010365335511950142, + "loss": 0.1122, + "step": 9280 + }, + { + "epoch": 242.87581699346404, + "grad_norm": 0.3277498185634613, + "learning_rate": 0.00010348811360014063, + "loss": 0.1192, + "step": 9290 + }, + { + "epoch": 243.13725490196077, + "grad_norm": 0.3015691637992859, + "learning_rate": 0.00010332286254441049, + "loss": 0.1161, + "step": 9300 + }, + { + "epoch": 243.3986928104575, + "grad_norm": 0.36535218358039856, + "learning_rate": 0.00010315760240410097, + "loss": 0.1162, + "step": 9310 + }, + { + "epoch": 243.66013071895424, + "grad_norm": 0.40011629462242126, + "learning_rate": 0.0001029923336310271, + "loss": 0.1183, + "step": 9320 + }, + { + "epoch": 243.92156862745097, + "grad_norm": 0.3597753643989563, + "learning_rate": 0.00010282705667702734, + "loss": 0.115, + "step": 9330 + }, + { + "epoch": 244.18300653594773, + "grad_norm": 0.37653326988220215, + "learning_rate": 0.00010266177199396257, + "loss": 0.1148, + "step": 9340 + }, + { + "epoch": 244.44444444444446, + "grad_norm": 0.30488893389701843, + "learning_rate": 0.00010249648003371482, + "loss": 0.1152, + "step": 9350 + }, + { + "epoch": 244.7058823529412, + "grad_norm": 0.351938396692276, + "learning_rate": 0.00010233118124818595, + "loss": 0.1174, + "step": 9360 + }, + { + "epoch": 244.96732026143792, + "grad_norm": 0.3121070861816406, + "learning_rate": 0.0001021658760892966, + "loss": 0.1136, + "step": 9370 + }, + { + "epoch": 245.22875816993465, + "grad_norm": 0.33020684123039246, + "learning_rate": 0.0001020005650089847, + "loss": 0.1108, + "step": 9380 + }, + { + "epoch": 245.49019607843138, + "grad_norm": 0.431669145822525, + "learning_rate": 0.00010183524845920447, + "loss": 0.1175, + "step": 9390 + }, + { + "epoch": 245.7516339869281, + "grad_norm": 0.2955373227596283, + "learning_rate": 0.00010166992689192505, + "loss": 0.1176, + "step": 9400 + }, + { + "epoch": 246.01307189542484, + "grad_norm": 0.3435670733451843, + "learning_rate": 0.00010150460075912922, + "loss": 0.1167, + "step": 9410 + }, + { + "epoch": 246.27450980392157, + "grad_norm": 0.3121711313724518, + "learning_rate": 0.00010133927051281243, + "loss": 0.114, + "step": 9420 + }, + { + "epoch": 246.5359477124183, + "grad_norm": 0.32946914434432983, + "learning_rate": 0.00010117393660498116, + "loss": 0.1133, + "step": 9430 + }, + { + "epoch": 246.79738562091504, + "grad_norm": 0.30625325441360474, + "learning_rate": 0.00010100859948765204, + "loss": 0.119, + "step": 9440 + }, + { + "epoch": 247.05882352941177, + "grad_norm": 0.312551885843277, + "learning_rate": 0.00010084325961285046, + "loss": 0.1157, + "step": 9450 + }, + { + "epoch": 247.3202614379085, + "grad_norm": 0.24457718431949615, + "learning_rate": 0.00010067791743260924, + "loss": 0.11, + "step": 9460 + }, + { + "epoch": 247.58169934640523, + "grad_norm": 0.33879950642585754, + "learning_rate": 0.00010051257339896771, + "loss": 0.1174, + "step": 9470 + }, + { + "epoch": 247.84313725490196, + "grad_norm": 0.33240655064582825, + "learning_rate": 0.00010034722796397004, + "loss": 0.1179, + "step": 9480 + }, + { + "epoch": 248.1045751633987, + "grad_norm": 0.30761128664016724, + "learning_rate": 0.00010018188157966442, + "loss": 0.1172, + "step": 9490 + }, + { + "epoch": 248.36601307189542, + "grad_norm": 0.30122920870780945, + "learning_rate": 0.00010001653469810145, + "loss": 0.1152, + "step": 9500 + }, + { + "epoch": 248.62745098039215, + "grad_norm": 0.3038076162338257, + "learning_rate": 9.985118777133329e-05, + "loss": 0.1161, + "step": 9510 + }, + { + "epoch": 248.88888888888889, + "grad_norm": 0.38542667031288147, + "learning_rate": 9.968584125141204e-05, + "loss": 0.1167, + "step": 9520 + }, + { + "epoch": 249.15032679738562, + "grad_norm": 0.2926845848560333, + "learning_rate": 9.952049559038885e-05, + "loss": 0.1153, + "step": 9530 + }, + { + "epoch": 249.41176470588235, + "grad_norm": 0.35238757729530334, + "learning_rate": 9.935515124031239e-05, + "loss": 0.1134, + "step": 9540 + }, + { + "epoch": 249.67320261437908, + "grad_norm": 0.31012189388275146, + "learning_rate": 9.918980865322782e-05, + "loss": 0.116, + "step": 9550 + }, + { + "epoch": 249.9346405228758, + "grad_norm": 0.3544747531414032, + "learning_rate": 9.902446828117545e-05, + "loss": 0.1189, + "step": 9560 + }, + { + "epoch": 250.19607843137254, + "grad_norm": 0.4231995940208435, + "learning_rate": 9.885913057618955e-05, + "loss": 0.1148, + "step": 9570 + }, + { + "epoch": 250.45751633986927, + "grad_norm": 0.3783901333808899, + "learning_rate": 9.869379599029708e-05, + "loss": 0.1158, + "step": 9580 + }, + { + "epoch": 250.718954248366, + "grad_norm": 0.32323822379112244, + "learning_rate": 9.852846497551653e-05, + "loss": 0.1172, + "step": 9590 + }, + { + "epoch": 250.98039215686273, + "grad_norm": 0.34913671016693115, + "learning_rate": 9.836313798385653e-05, + "loss": 0.1143, + "step": 9600 + }, + { + "epoch": 251.24183006535947, + "grad_norm": 0.2595481276512146, + "learning_rate": 9.819781546731476e-05, + "loss": 0.1137, + "step": 9610 + }, + { + "epoch": 251.5032679738562, + "grad_norm": 0.30071359872817993, + "learning_rate": 9.803249787787669e-05, + "loss": 0.1141, + "step": 9620 + }, + { + "epoch": 251.76470588235293, + "grad_norm": 0.32096344232559204, + "learning_rate": 9.786718566751431e-05, + "loss": 0.1176, + "step": 9630 + }, + { + "epoch": 252.0261437908497, + "grad_norm": 0.30524900555610657, + "learning_rate": 9.770187928818487e-05, + "loss": 0.118, + "step": 9640 + }, + { + "epoch": 252.28758169934642, + "grad_norm": 0.3532697856426239, + "learning_rate": 9.75365791918297e-05, + "loss": 0.1158, + "step": 9650 + }, + { + "epoch": 252.54901960784315, + "grad_norm": 0.29913774132728577, + "learning_rate": 9.737128583037295e-05, + "loss": 0.1145, + "step": 9660 + }, + { + "epoch": 252.81045751633988, + "grad_norm": 0.35638126730918884, + "learning_rate": 9.720599965572036e-05, + "loss": 0.1145, + "step": 9670 + }, + { + "epoch": 253.0718954248366, + "grad_norm": 0.28130611777305603, + "learning_rate": 9.704072111975802e-05, + "loss": 0.1143, + "step": 9680 + }, + { + "epoch": 253.33333333333334, + "grad_norm": 0.39420101046562195, + "learning_rate": 9.687545067435116e-05, + "loss": 0.1128, + "step": 9690 + }, + { + "epoch": 253.59477124183007, + "grad_norm": 0.33940833806991577, + "learning_rate": 9.671018877134284e-05, + "loss": 0.1152, + "step": 9700 + }, + { + "epoch": 253.8562091503268, + "grad_norm": 0.38873952627182007, + "learning_rate": 9.654493586255278e-05, + "loss": 0.1207, + "step": 9710 + }, + { + "epoch": 254.11764705882354, + "grad_norm": 0.242060124874115, + "learning_rate": 9.637969239977614e-05, + "loss": 0.1143, + "step": 9720 + }, + { + "epoch": 254.37908496732027, + "grad_norm": 0.29076987504959106, + "learning_rate": 9.62144588347823e-05, + "loss": 0.1119, + "step": 9730 + }, + { + "epoch": 254.640522875817, + "grad_norm": 0.28226718306541443, + "learning_rate": 9.604923561931337e-05, + "loss": 0.1151, + "step": 9740 + }, + { + "epoch": 254.90196078431373, + "grad_norm": 0.3857768177986145, + "learning_rate": 9.588402320508342e-05, + "loss": 0.1177, + "step": 9750 + }, + { + "epoch": 255.16339869281046, + "grad_norm": 0.2802301347255707, + "learning_rate": 9.571882204377687e-05, + "loss": 0.1194, + "step": 9760 + }, + { + "epoch": 255.4248366013072, + "grad_norm": 0.2925792932510376, + "learning_rate": 9.555363258704737e-05, + "loss": 0.1099, + "step": 9770 + }, + { + "epoch": 255.68627450980392, + "grad_norm": 0.3970232605934143, + "learning_rate": 9.538845528651665e-05, + "loss": 0.1168, + "step": 9780 + }, + { + "epoch": 255.94771241830065, + "grad_norm": 0.32095807790756226, + "learning_rate": 9.5223290593773e-05, + "loss": 0.1202, + "step": 9790 + }, + { + "epoch": 256.2091503267974, + "grad_norm": 0.2640714943408966, + "learning_rate": 9.50581389603705e-05, + "loss": 0.1147, + "step": 9800 + }, + { + "epoch": 256.47058823529414, + "grad_norm": 0.31910863518714905, + "learning_rate": 9.489300083782737e-05, + "loss": 0.1157, + "step": 9810 + }, + { + "epoch": 256.73202614379085, + "grad_norm": 0.32858380675315857, + "learning_rate": 9.472787667762493e-05, + "loss": 0.1166, + "step": 9820 + }, + { + "epoch": 256.9934640522876, + "grad_norm": 0.3285580575466156, + "learning_rate": 9.456276693120639e-05, + "loss": 0.1159, + "step": 9830 + }, + { + "epoch": 257.2549019607843, + "grad_norm": 0.35466259717941284, + "learning_rate": 9.43976720499754e-05, + "loss": 0.1129, + "step": 9840 + }, + { + "epoch": 257.51633986928107, + "grad_norm": 0.30083316564559937, + "learning_rate": 9.423259248529511e-05, + "loss": 0.1155, + "step": 9850 + }, + { + "epoch": 257.77777777777777, + "grad_norm": 0.37580639123916626, + "learning_rate": 9.406752868848673e-05, + "loss": 0.1166, + "step": 9860 + }, + { + "epoch": 258.03921568627453, + "grad_norm": 0.36361733078956604, + "learning_rate": 9.390248111082842e-05, + "loss": 0.1191, + "step": 9870 + }, + { + "epoch": 258.30065359477123, + "grad_norm": 0.3550747334957123, + "learning_rate": 9.373745020355387e-05, + "loss": 0.115, + "step": 9880 + }, + { + "epoch": 258.562091503268, + "grad_norm": 0.2830721437931061, + "learning_rate": 9.357243641785134e-05, + "loss": 0.1162, + "step": 9890 + }, + { + "epoch": 258.8235294117647, + "grad_norm": 0.2829933762550354, + "learning_rate": 9.340744020486222e-05, + "loss": 0.1133, + "step": 9900 + }, + { + "epoch": 259.08496732026146, + "grad_norm": 0.29842811822891235, + "learning_rate": 9.324246201567984e-05, + "loss": 0.1147, + "step": 9910 + }, + { + "epoch": 259.34640522875816, + "grad_norm": 0.32608217000961304, + "learning_rate": 9.30775023013483e-05, + "loss": 0.1143, + "step": 9920 + }, + { + "epoch": 259.6078431372549, + "grad_norm": 0.38989517092704773, + "learning_rate": 9.291256151286109e-05, + "loss": 0.1172, + "step": 9930 + }, + { + "epoch": 259.8692810457516, + "grad_norm": 0.3222101330757141, + "learning_rate": 9.274764010116008e-05, + "loss": 0.1171, + "step": 9940 + }, + { + "epoch": 260.1307189542484, + "grad_norm": 0.3502764105796814, + "learning_rate": 9.25827385171341e-05, + "loss": 0.1135, + "step": 9950 + }, + { + "epoch": 260.3921568627451, + "grad_norm": 0.2739408612251282, + "learning_rate": 9.241785721161779e-05, + "loss": 0.1119, + "step": 9960 + }, + { + "epoch": 260.65359477124184, + "grad_norm": 0.289969801902771, + "learning_rate": 9.225299663539038e-05, + "loss": 0.1165, + "step": 9970 + }, + { + "epoch": 260.91503267973854, + "grad_norm": 0.4188239276409149, + "learning_rate": 9.20881572391743e-05, + "loss": 0.1183, + "step": 9980 + }, + { + "epoch": 261.1764705882353, + "grad_norm": 0.30096152424812317, + "learning_rate": 9.192333947363423e-05, + "loss": 0.1143, + "step": 9990 + }, + { + "epoch": 261.437908496732, + "grad_norm": 0.3192864954471588, + "learning_rate": 9.175854378937563e-05, + "loss": 0.1124, + "step": 10000 + }, + { + "epoch": 261.69934640522877, + "grad_norm": 0.3204704523086548, + "learning_rate": 9.159377063694365e-05, + "loss": 0.1145, + "step": 10010 + }, + { + "epoch": 261.96078431372547, + "grad_norm": 0.45152097940444946, + "learning_rate": 9.142902046682171e-05, + "loss": 0.119, + "step": 10020 + }, + { + "epoch": 262.22222222222223, + "grad_norm": 0.3182998597621918, + "learning_rate": 9.126429372943053e-05, + "loss": 0.1164, + "step": 10030 + }, + { + "epoch": 262.48366013071893, + "grad_norm": 0.2740117311477661, + "learning_rate": 9.109959087512673e-05, + "loss": 0.1142, + "step": 10040 + }, + { + "epoch": 262.7450980392157, + "grad_norm": 0.3511454164981842, + "learning_rate": 9.093491235420162e-05, + "loss": 0.1154, + "step": 10050 + }, + { + "epoch": 263.0065359477124, + "grad_norm": 0.3681274652481079, + "learning_rate": 9.077025861688003e-05, + "loss": 0.1172, + "step": 10060 + }, + { + "epoch": 263.26797385620915, + "grad_norm": 0.31222420930862427, + "learning_rate": 9.060563011331888e-05, + "loss": 0.1122, + "step": 10070 + }, + { + "epoch": 263.52941176470586, + "grad_norm": 0.32507622241973877, + "learning_rate": 9.044102729360627e-05, + "loss": 0.1127, + "step": 10080 + }, + { + "epoch": 263.7908496732026, + "grad_norm": 0.4463390111923218, + "learning_rate": 9.027645060776006e-05, + "loss": 0.1199, + "step": 10090 + }, + { + "epoch": 264.0522875816994, + "grad_norm": 0.27414682507514954, + "learning_rate": 9.011190050572659e-05, + "loss": 0.1166, + "step": 10100 + }, + { + "epoch": 264.3137254901961, + "grad_norm": 0.35508808493614197, + "learning_rate": 8.994737743737961e-05, + "loss": 0.116, + "step": 10110 + }, + { + "epoch": 264.57516339869284, + "grad_norm": 0.29134297370910645, + "learning_rate": 8.978288185251881e-05, + "loss": 0.1134, + "step": 10120 + }, + { + "epoch": 264.83660130718954, + "grad_norm": 0.3992556631565094, + "learning_rate": 8.961841420086886e-05, + "loss": 0.1177, + "step": 10130 + }, + { + "epoch": 265.0980392156863, + "grad_norm": 0.24055442214012146, + "learning_rate": 8.945397493207809e-05, + "loss": 0.1124, + "step": 10140 + }, + { + "epoch": 265.359477124183, + "grad_norm": 0.3088826835155487, + "learning_rate": 8.92895644957172e-05, + "loss": 0.1138, + "step": 10150 + }, + { + "epoch": 265.62091503267976, + "grad_norm": 0.4448016285896301, + "learning_rate": 8.912518334127795e-05, + "loss": 0.1183, + "step": 10160 + }, + { + "epoch": 265.88235294117646, + "grad_norm": 0.31662651896476746, + "learning_rate": 8.896083191817221e-05, + "loss": 0.1166, + "step": 10170 + }, + { + "epoch": 266.1437908496732, + "grad_norm": 0.26434990763664246, + "learning_rate": 8.879651067573044e-05, + "loss": 0.1122, + "step": 10180 + }, + { + "epoch": 266.4052287581699, + "grad_norm": 0.25964638590812683, + "learning_rate": 8.86322200632007e-05, + "loss": 0.1123, + "step": 10190 + }, + { + "epoch": 266.6666666666667, + "grad_norm": 0.35823872685432434, + "learning_rate": 8.846796052974727e-05, + "loss": 0.1156, + "step": 10200 + }, + { + "epoch": 266.9281045751634, + "grad_norm": 0.37653812766075134, + "learning_rate": 8.830373252444937e-05, + "loss": 0.119, + "step": 10210 + }, + { + "epoch": 267.18954248366015, + "grad_norm": 0.33142727613449097, + "learning_rate": 8.813953649630012e-05, + "loss": 0.1158, + "step": 10220 + }, + { + "epoch": 267.45098039215685, + "grad_norm": 0.3292408883571625, + "learning_rate": 8.797537289420519e-05, + "loss": 0.1137, + "step": 10230 + }, + { + "epoch": 267.7124183006536, + "grad_norm": 0.40815725922584534, + "learning_rate": 8.781124216698161e-05, + "loss": 0.1146, + "step": 10240 + }, + { + "epoch": 267.9738562091503, + "grad_norm": 0.36364370584487915, + "learning_rate": 8.764714476335657e-05, + "loss": 0.1189, + "step": 10250 + }, + { + "epoch": 268.2352941176471, + "grad_norm": 0.3509305715560913, + "learning_rate": 8.748308113196602e-05, + "loss": 0.1144, + "step": 10260 + }, + { + "epoch": 268.4967320261438, + "grad_norm": 0.31511905789375305, + "learning_rate": 8.731905172135369e-05, + "loss": 0.113, + "step": 10270 + }, + { + "epoch": 268.75816993464053, + "grad_norm": 0.37249597907066345, + "learning_rate": 8.715505697996971e-05, + "loss": 0.1169, + "step": 10280 + }, + { + "epoch": 269.01960784313724, + "grad_norm": 0.3088653087615967, + "learning_rate": 8.699109735616952e-05, + "loss": 0.1186, + "step": 10290 + }, + { + "epoch": 269.281045751634, + "grad_norm": 0.3084110617637634, + "learning_rate": 8.68271732982124e-05, + "loss": 0.1107, + "step": 10300 + }, + { + "epoch": 269.5424836601307, + "grad_norm": 0.31472697854042053, + "learning_rate": 8.666328525426045e-05, + "loss": 0.117, + "step": 10310 + }, + { + "epoch": 269.80392156862746, + "grad_norm": 0.28198713064193726, + "learning_rate": 8.649943367237736e-05, + "loss": 0.1148, + "step": 10320 + }, + { + "epoch": 270.06535947712416, + "grad_norm": 0.3456663191318512, + "learning_rate": 8.633561900052708e-05, + "loss": 0.1184, + "step": 10330 + }, + { + "epoch": 270.3267973856209, + "grad_norm": 0.2773241102695465, + "learning_rate": 8.617184168657275e-05, + "loss": 0.1117, + "step": 10340 + }, + { + "epoch": 270.5882352941176, + "grad_norm": 0.34341636300086975, + "learning_rate": 8.600810217827515e-05, + "loss": 0.114, + "step": 10350 + }, + { + "epoch": 270.8496732026144, + "grad_norm": 0.3856281042098999, + "learning_rate": 8.584440092329193e-05, + "loss": 0.1174, + "step": 10360 + }, + { + "epoch": 271.1111111111111, + "grad_norm": 0.28968098759651184, + "learning_rate": 8.568073836917607e-05, + "loss": 0.1161, + "step": 10370 + }, + { + "epoch": 271.37254901960785, + "grad_norm": 0.3231571912765503, + "learning_rate": 8.551711496337469e-05, + "loss": 0.1162, + "step": 10380 + }, + { + "epoch": 271.63398692810455, + "grad_norm": 0.31247514486312866, + "learning_rate": 8.535353115322806e-05, + "loss": 0.1143, + "step": 10390 + }, + { + "epoch": 271.8954248366013, + "grad_norm": 0.3014558255672455, + "learning_rate": 8.518998738596791e-05, + "loss": 0.1173, + "step": 10400 + }, + { + "epoch": 272.15686274509807, + "grad_norm": 0.27148059010505676, + "learning_rate": 8.502648410871675e-05, + "loss": 0.11, + "step": 10410 + }, + { + "epoch": 272.41830065359477, + "grad_norm": 0.35765767097473145, + "learning_rate": 8.486302176848624e-05, + "loss": 0.1151, + "step": 10420 + }, + { + "epoch": 272.67973856209153, + "grad_norm": 0.3530411124229431, + "learning_rate": 8.469960081217627e-05, + "loss": 0.1165, + "step": 10430 + }, + { + "epoch": 272.94117647058823, + "grad_norm": 0.3553902804851532, + "learning_rate": 8.45362216865734e-05, + "loss": 0.1165, + "step": 10440 + }, + { + "epoch": 273.202614379085, + "grad_norm": 0.2596057653427124, + "learning_rate": 8.437288483834997e-05, + "loss": 0.1139, + "step": 10450 + }, + { + "epoch": 273.4640522875817, + "grad_norm": 0.29487183690071106, + "learning_rate": 8.420959071406266e-05, + "loss": 0.118, + "step": 10460 + }, + { + "epoch": 273.72549019607845, + "grad_norm": 0.3081933557987213, + "learning_rate": 8.404633976015134e-05, + "loss": 0.1147, + "step": 10470 + }, + { + "epoch": 273.98692810457516, + "grad_norm": 0.3125523030757904, + "learning_rate": 8.388313242293802e-05, + "loss": 0.1136, + "step": 10480 + }, + { + "epoch": 274.2483660130719, + "grad_norm": 0.35245898365974426, + "learning_rate": 8.371996914862519e-05, + "loss": 0.1136, + "step": 10490 + }, + { + "epoch": 274.5098039215686, + "grad_norm": 0.36222124099731445, + "learning_rate": 8.355685038329504e-05, + "loss": 0.1148, + "step": 10500 + }, + { + "epoch": 274.7712418300654, + "grad_norm": 0.34575581550598145, + "learning_rate": 8.339377657290808e-05, + "loss": 0.114, + "step": 10510 + }, + { + "epoch": 275.0326797385621, + "grad_norm": 0.4005025625228882, + "learning_rate": 8.323074816330183e-05, + "loss": 0.1188, + "step": 10520 + }, + { + "epoch": 275.29411764705884, + "grad_norm": 0.32579559087753296, + "learning_rate": 8.306776560018985e-05, + "loss": 0.1138, + "step": 10530 + }, + { + "epoch": 275.55555555555554, + "grad_norm": 0.27991876006126404, + "learning_rate": 8.29048293291601e-05, + "loss": 0.1125, + "step": 10540 + }, + { + "epoch": 275.8169934640523, + "grad_norm": 0.3079290986061096, + "learning_rate": 8.27419397956742e-05, + "loss": 0.1142, + "step": 10550 + }, + { + "epoch": 276.078431372549, + "grad_norm": 0.25126445293426514, + "learning_rate": 8.257909744506589e-05, + "loss": 0.1178, + "step": 10560 + }, + { + "epoch": 276.33986928104576, + "grad_norm": 0.2905788719654083, + "learning_rate": 8.241630272253998e-05, + "loss": 0.1125, + "step": 10570 + }, + { + "epoch": 276.60130718954247, + "grad_norm": 0.38739460706710815, + "learning_rate": 8.225355607317096e-05, + "loss": 0.1152, + "step": 10580 + }, + { + "epoch": 276.8627450980392, + "grad_norm": 0.33940622210502625, + "learning_rate": 8.2090857941902e-05, + "loss": 0.1168, + "step": 10590 + }, + { + "epoch": 277.12418300653593, + "grad_norm": 0.2534843981266022, + "learning_rate": 8.192820877354357e-05, + "loss": 0.1112, + "step": 10600 + }, + { + "epoch": 277.3856209150327, + "grad_norm": 0.3085792660713196, + "learning_rate": 8.176560901277229e-05, + "loss": 0.1165, + "step": 10610 + }, + { + "epoch": 277.6470588235294, + "grad_norm": 0.32212698459625244, + "learning_rate": 8.16030591041297e-05, + "loss": 0.1138, + "step": 10620 + }, + { + "epoch": 277.90849673202615, + "grad_norm": 0.31586194038391113, + "learning_rate": 8.144055949202101e-05, + "loss": 0.1186, + "step": 10630 + }, + { + "epoch": 278.16993464052285, + "grad_norm": 0.2730450928211212, + "learning_rate": 8.127811062071398e-05, + "loss": 0.1163, + "step": 10640 + }, + { + "epoch": 278.4313725490196, + "grad_norm": 0.2704620659351349, + "learning_rate": 8.111571293433764e-05, + "loss": 0.1104, + "step": 10650 + }, + { + "epoch": 278.6928104575163, + "grad_norm": 0.3569394052028656, + "learning_rate": 8.095336687688102e-05, + "loss": 0.1171, + "step": 10660 + }, + { + "epoch": 278.9542483660131, + "grad_norm": 0.28940895199775696, + "learning_rate": 8.079107289219209e-05, + "loss": 0.1172, + "step": 10670 + }, + { + "epoch": 279.2156862745098, + "grad_norm": 0.30181649327278137, + "learning_rate": 8.062883142397635e-05, + "loss": 0.1124, + "step": 10680 + }, + { + "epoch": 279.47712418300654, + "grad_norm": 0.3073585629463196, + "learning_rate": 8.046664291579584e-05, + "loss": 0.1137, + "step": 10690 + }, + { + "epoch": 279.73856209150324, + "grad_norm": 0.30750972032546997, + "learning_rate": 8.03045078110677e-05, + "loss": 0.115, + "step": 10700 + }, + { + "epoch": 280.0, + "grad_norm": 0.33145976066589355, + "learning_rate": 8.014242655306315e-05, + "loss": 0.1187, + "step": 10710 + }, + { + "epoch": 280.26143790849676, + "grad_norm": 0.2784498631954193, + "learning_rate": 7.998039958490613e-05, + "loss": 0.1113, + "step": 10720 + }, + { + "epoch": 280.52287581699346, + "grad_norm": 0.3815988302230835, + "learning_rate": 7.981842734957221e-05, + "loss": 0.1154, + "step": 10730 + }, + { + "epoch": 280.7843137254902, + "grad_norm": 0.35885512828826904, + "learning_rate": 7.965651028988726e-05, + "loss": 0.1192, + "step": 10740 + }, + { + "epoch": 281.0457516339869, + "grad_norm": 0.3109668791294098, + "learning_rate": 7.949464884852638e-05, + "loss": 0.114, + "step": 10750 + }, + { + "epoch": 281.3071895424837, + "grad_norm": 0.2830188274383545, + "learning_rate": 7.933284346801258e-05, + "loss": 0.1111, + "step": 10760 + }, + { + "epoch": 281.5686274509804, + "grad_norm": 0.30252325534820557, + "learning_rate": 7.917109459071553e-05, + "loss": 0.113, + "step": 10770 + }, + { + "epoch": 281.83006535947715, + "grad_norm": 0.3681179881095886, + "learning_rate": 7.900940265885052e-05, + "loss": 0.1172, + "step": 10780 + }, + { + "epoch": 282.09150326797385, + "grad_norm": 0.3035522401332855, + "learning_rate": 7.884776811447712e-05, + "loss": 0.1148, + "step": 10790 + }, + { + "epoch": 282.3529411764706, + "grad_norm": 0.2895407974720001, + "learning_rate": 7.8686191399498e-05, + "loss": 0.1125, + "step": 10800 + }, + { + "epoch": 282.6143790849673, + "grad_norm": 0.37216058373451233, + "learning_rate": 7.852467295565775e-05, + "loss": 0.1155, + "step": 10810 + }, + { + "epoch": 282.87581699346407, + "grad_norm": 0.3305184543132782, + "learning_rate": 7.836321322454159e-05, + "loss": 0.1163, + "step": 10820 + }, + { + "epoch": 283.1372549019608, + "grad_norm": 0.30414262413978577, + "learning_rate": 7.820181264757427e-05, + "loss": 0.1154, + "step": 10830 + }, + { + "epoch": 283.39869281045753, + "grad_norm": 0.2971784174442291, + "learning_rate": 7.80404716660188e-05, + "loss": 0.1126, + "step": 10840 + }, + { + "epoch": 283.66013071895424, + "grad_norm": 0.36141228675842285, + "learning_rate": 7.787919072097531e-05, + "loss": 0.1155, + "step": 10850 + }, + { + "epoch": 283.921568627451, + "grad_norm": 0.2840782403945923, + "learning_rate": 7.771797025337968e-05, + "loss": 0.1198, + "step": 10860 + }, + { + "epoch": 284.1830065359477, + "grad_norm": 0.2825259268283844, + "learning_rate": 7.755681070400253e-05, + "loss": 0.1123, + "step": 10870 + }, + { + "epoch": 284.44444444444446, + "grad_norm": 0.3692414462566376, + "learning_rate": 7.739571251344794e-05, + "loss": 0.1157, + "step": 10880 + }, + { + "epoch": 284.70588235294116, + "grad_norm": 0.2816650867462158, + "learning_rate": 7.723467612215219e-05, + "loss": 0.1159, + "step": 10890 + }, + { + "epoch": 284.9673202614379, + "grad_norm": 0.33248651027679443, + "learning_rate": 7.707370197038265e-05, + "loss": 0.1158, + "step": 10900 + }, + { + "epoch": 285.2287581699346, + "grad_norm": 0.3684132695198059, + "learning_rate": 7.691279049823646e-05, + "loss": 0.1138, + "step": 10910 + }, + { + "epoch": 285.4901960784314, + "grad_norm": 0.28321191668510437, + "learning_rate": 7.675194214563948e-05, + "loss": 0.1137, + "step": 10920 + }, + { + "epoch": 285.7516339869281, + "grad_norm": 0.39478781819343567, + "learning_rate": 7.659115735234494e-05, + "loss": 0.1193, + "step": 10930 + }, + { + "epoch": 286.01307189542484, + "grad_norm": 0.3251636028289795, + "learning_rate": 7.643043655793235e-05, + "loss": 0.1118, + "step": 10940 + }, + { + "epoch": 286.27450980392155, + "grad_norm": 0.32104846835136414, + "learning_rate": 7.62697802018062e-05, + "loss": 0.1134, + "step": 10950 + }, + { + "epoch": 286.5359477124183, + "grad_norm": 0.3238525092601776, + "learning_rate": 7.610918872319483e-05, + "loss": 0.1126, + "step": 10960 + }, + { + "epoch": 286.797385620915, + "grad_norm": 0.3236340284347534, + "learning_rate": 7.594866256114921e-05, + "loss": 0.1178, + "step": 10970 + }, + { + "epoch": 287.05882352941177, + "grad_norm": 0.2717225253582001, + "learning_rate": 7.578820215454178e-05, + "loss": 0.1153, + "step": 10980 + }, + { + "epoch": 287.32026143790847, + "grad_norm": 0.34041455388069153, + "learning_rate": 7.562780794206514e-05, + "loss": 0.1129, + "step": 10990 + }, + { + "epoch": 287.58169934640523, + "grad_norm": 0.27632108330726624, + "learning_rate": 7.546748036223091e-05, + "loss": 0.1154, + "step": 11000 + }, + { + "epoch": 287.84313725490193, + "grad_norm": 0.4436096251010895, + "learning_rate": 7.530721985336861e-05, + "loss": 0.1157, + "step": 11010 + }, + { + "epoch": 288.1045751633987, + "grad_norm": 0.3542175889015198, + "learning_rate": 7.514702685362434e-05, + "loss": 0.1161, + "step": 11020 + }, + { + "epoch": 288.36601307189545, + "grad_norm": 0.32913169264793396, + "learning_rate": 7.498690180095963e-05, + "loss": 0.115, + "step": 11030 + }, + { + "epoch": 288.62745098039215, + "grad_norm": 0.4230281710624695, + "learning_rate": 7.48268451331503e-05, + "loss": 0.1147, + "step": 11040 + }, + { + "epoch": 288.8888888888889, + "grad_norm": 0.31047868728637695, + "learning_rate": 7.466685728778513e-05, + "loss": 0.1136, + "step": 11050 + }, + { + "epoch": 289.1503267973856, + "grad_norm": 0.2447769194841385, + "learning_rate": 7.450693870226478e-05, + "loss": 0.1124, + "step": 11060 + }, + { + "epoch": 289.4117647058824, + "grad_norm": 0.2783534824848175, + "learning_rate": 7.434708981380057e-05, + "loss": 0.1145, + "step": 11070 + }, + { + "epoch": 289.6732026143791, + "grad_norm": 0.2729416787624359, + "learning_rate": 7.418731105941328e-05, + "loss": 0.1152, + "step": 11080 + }, + { + "epoch": 289.93464052287584, + "grad_norm": 0.29936328530311584, + "learning_rate": 7.402760287593189e-05, + "loss": 0.1175, + "step": 11090 + }, + { + "epoch": 290.19607843137254, + "grad_norm": 0.3042921721935272, + "learning_rate": 7.386796569999246e-05, + "loss": 0.1128, + "step": 11100 + }, + { + "epoch": 290.4575163398693, + "grad_norm": 0.3261522948741913, + "learning_rate": 7.370839996803697e-05, + "loss": 0.1144, + "step": 11110 + }, + { + "epoch": 290.718954248366, + "grad_norm": 0.30387112498283386, + "learning_rate": 7.354890611631202e-05, + "loss": 0.1139, + "step": 11120 + }, + { + "epoch": 290.98039215686276, + "grad_norm": 0.3516126275062561, + "learning_rate": 7.338948458086774e-05, + "loss": 0.1167, + "step": 11130 + }, + { + "epoch": 291.24183006535947, + "grad_norm": 0.2861202359199524, + "learning_rate": 7.323013579755647e-05, + "loss": 0.1138, + "step": 11140 + }, + { + "epoch": 291.5032679738562, + "grad_norm": 0.3707253634929657, + "learning_rate": 7.307086020203173e-05, + "loss": 0.1136, + "step": 11150 + }, + { + "epoch": 291.7647058823529, + "grad_norm": 0.34529221057891846, + "learning_rate": 7.29116582297469e-05, + "loss": 0.1167, + "step": 11160 + }, + { + "epoch": 292.0261437908497, + "grad_norm": 0.2350694239139557, + "learning_rate": 7.275253031595413e-05, + "loss": 0.1139, + "step": 11170 + }, + { + "epoch": 292.2875816993464, + "grad_norm": 0.35073357820510864, + "learning_rate": 7.259347689570304e-05, + "loss": 0.1141, + "step": 11180 + }, + { + "epoch": 292.54901960784315, + "grad_norm": 0.3456834554672241, + "learning_rate": 7.243449840383958e-05, + "loss": 0.1117, + "step": 11190 + }, + { + "epoch": 292.81045751633985, + "grad_norm": 0.3201320171356201, + "learning_rate": 7.227559527500489e-05, + "loss": 0.1177, + "step": 11200 + }, + { + "epoch": 293.0718954248366, + "grad_norm": 0.2568330466747284, + "learning_rate": 7.211676794363407e-05, + "loss": 0.1144, + "step": 11210 + }, + { + "epoch": 293.3333333333333, + "grad_norm": 0.29000136256217957, + "learning_rate": 7.1958016843955e-05, + "loss": 0.1121, + "step": 11220 + }, + { + "epoch": 293.5947712418301, + "grad_norm": 0.30854013562202454, + "learning_rate": 7.179934240998706e-05, + "loss": 0.1206, + "step": 11230 + }, + { + "epoch": 293.8562091503268, + "grad_norm": 0.27561551332473755, + "learning_rate": 7.164074507554015e-05, + "loss": 0.1137, + "step": 11240 + }, + { + "epoch": 294.11764705882354, + "grad_norm": 0.28580352663993835, + "learning_rate": 7.148222527421331e-05, + "loss": 0.1125, + "step": 11250 + }, + { + "epoch": 294.37908496732024, + "grad_norm": 0.29733917117118835, + "learning_rate": 7.132378343939361e-05, + "loss": 0.1142, + "step": 11260 + }, + { + "epoch": 294.640522875817, + "grad_norm": 0.3131226897239685, + "learning_rate": 7.116542000425501e-05, + "loss": 0.1151, + "step": 11270 + }, + { + "epoch": 294.9019607843137, + "grad_norm": 0.2978150248527527, + "learning_rate": 7.100713540175706e-05, + "loss": 0.1152, + "step": 11280 + }, + { + "epoch": 295.16339869281046, + "grad_norm": 0.31860676407814026, + "learning_rate": 7.084893006464383e-05, + "loss": 0.1136, + "step": 11290 + }, + { + "epoch": 295.42483660130716, + "grad_norm": 0.36646994948387146, + "learning_rate": 7.069080442544267e-05, + "loss": 0.1127, + "step": 11300 + }, + { + "epoch": 295.6862745098039, + "grad_norm": 0.3542885184288025, + "learning_rate": 7.053275891646303e-05, + "loss": 0.1134, + "step": 11310 + }, + { + "epoch": 295.9477124183006, + "grad_norm": 0.46107035875320435, + "learning_rate": 7.037479396979535e-05, + "loss": 0.1177, + "step": 11320 + }, + { + "epoch": 296.2091503267974, + "grad_norm": 0.32113420963287354, + "learning_rate": 7.02169100173097e-05, + "loss": 0.1149, + "step": 11330 + }, + { + "epoch": 296.47058823529414, + "grad_norm": 0.3642306625843048, + "learning_rate": 7.005910749065478e-05, + "loss": 0.1157, + "step": 11340 + }, + { + "epoch": 296.73202614379085, + "grad_norm": 0.35815343260765076, + "learning_rate": 6.990138682125671e-05, + "loss": 0.1135, + "step": 11350 + }, + { + "epoch": 296.9934640522876, + "grad_norm": 0.365302175283432, + "learning_rate": 6.974374844031779e-05, + "loss": 0.1158, + "step": 11360 + }, + { + "epoch": 297.2549019607843, + "grad_norm": 0.26450252532958984, + "learning_rate": 6.958619277881524e-05, + "loss": 0.1148, + "step": 11370 + }, + { + "epoch": 297.51633986928107, + "grad_norm": 0.29163211584091187, + "learning_rate": 6.942872026750029e-05, + "loss": 0.1134, + "step": 11380 + }, + { + "epoch": 297.77777777777777, + "grad_norm": 0.3450673520565033, + "learning_rate": 6.927133133689678e-05, + "loss": 0.1158, + "step": 11390 + }, + { + "epoch": 298.03921568627453, + "grad_norm": 0.2773265838623047, + "learning_rate": 6.911402641730003e-05, + "loss": 0.1136, + "step": 11400 + }, + { + "epoch": 298.30065359477123, + "grad_norm": 0.29598814249038696, + "learning_rate": 6.895680593877571e-05, + "loss": 0.1168, + "step": 11410 + }, + { + "epoch": 298.562091503268, + "grad_norm": 0.2809271812438965, + "learning_rate": 6.879967033115853e-05, + "loss": 0.1142, + "step": 11420 + }, + { + "epoch": 298.8235294117647, + "grad_norm": 0.2930368185043335, + "learning_rate": 6.864262002405129e-05, + "loss": 0.1148, + "step": 11430 + }, + { + "epoch": 299.08496732026146, + "grad_norm": 0.25596320629119873, + "learning_rate": 6.848565544682352e-05, + "loss": 0.1114, + "step": 11440 + }, + { + "epoch": 299.34640522875816, + "grad_norm": 0.26733312010765076, + "learning_rate": 6.832877702861037e-05, + "loss": 0.1138, + "step": 11450 + }, + { + "epoch": 299.6078431372549, + "grad_norm": 0.33392226696014404, + "learning_rate": 6.817198519831154e-05, + "loss": 0.116, + "step": 11460 + }, + { + "epoch": 299.8692810457516, + "grad_norm": 0.28333500027656555, + "learning_rate": 6.801528038458974e-05, + "loss": 0.1129, + "step": 11470 + }, + { + "epoch": 300.1307189542484, + "grad_norm": 0.28260430693626404, + "learning_rate": 6.785866301587007e-05, + "loss": 0.1151, + "step": 11480 + }, + { + "epoch": 300.3921568627451, + "grad_norm": 0.35571715235710144, + "learning_rate": 6.770213352033839e-05, + "loss": 0.1129, + "step": 11490 + }, + { + "epoch": 300.65359477124184, + "grad_norm": 0.3103904724121094, + "learning_rate": 6.754569232594042e-05, + "loss": 0.1145, + "step": 11500 + }, + { + "epoch": 300.91503267973854, + "grad_norm": 0.3481941223144531, + "learning_rate": 6.738933986038033e-05, + "loss": 0.1163, + "step": 11510 + }, + { + "epoch": 301.1764705882353, + "grad_norm": 0.2778976559638977, + "learning_rate": 6.72330765511198e-05, + "loss": 0.1141, + "step": 11520 + }, + { + "epoch": 301.437908496732, + "grad_norm": 0.472317099571228, + "learning_rate": 6.70769028253768e-05, + "loss": 0.1159, + "step": 11530 + }, + { + "epoch": 301.69934640522877, + "grad_norm": 0.3168316185474396, + "learning_rate": 6.692081911012431e-05, + "loss": 0.117, + "step": 11540 + }, + { + "epoch": 301.96078431372547, + "grad_norm": 0.2718193531036377, + "learning_rate": 6.676482583208929e-05, + "loss": 0.114, + "step": 11550 + }, + { + "epoch": 302.22222222222223, + "grad_norm": 0.32255101203918457, + "learning_rate": 6.660892341775132e-05, + "loss": 0.1129, + "step": 11560 + }, + { + "epoch": 302.48366013071893, + "grad_norm": 0.4141261577606201, + "learning_rate": 6.645311229334167e-05, + "loss": 0.1154, + "step": 11570 + }, + { + "epoch": 302.7450980392157, + "grad_norm": 0.28321996331214905, + "learning_rate": 6.629739288484204e-05, + "loss": 0.1125, + "step": 11580 + }, + { + "epoch": 303.0065359477124, + "grad_norm": 0.27643659710884094, + "learning_rate": 6.614176561798335e-05, + "loss": 0.1153, + "step": 11590 + }, + { + "epoch": 303.26797385620915, + "grad_norm": 0.3224067986011505, + "learning_rate": 6.598623091824461e-05, + "loss": 0.1103, + "step": 11600 + }, + { + "epoch": 303.52941176470586, + "grad_norm": 0.31204745173454285, + "learning_rate": 6.583078921085167e-05, + "loss": 0.114, + "step": 11610 + }, + { + "epoch": 303.7908496732026, + "grad_norm": 0.3363180458545685, + "learning_rate": 6.567544092077631e-05, + "loss": 0.1164, + "step": 11620 + }, + { + "epoch": 304.0522875816994, + "grad_norm": 0.3073705732822418, + "learning_rate": 6.55357076825483e-05, + "loss": 0.1173, + "step": 11630 + }, + { + "epoch": 304.3137254901961, + "grad_norm": 0.38721707463264465, + "learning_rate": 6.538053805525763e-05, + "loss": 0.1128, + "step": 11640 + }, + { + "epoch": 304.57516339869284, + "grad_norm": 0.3045378625392914, + "learning_rate": 6.522546307625399e-05, + "loss": 0.1154, + "step": 11650 + }, + { + "epoch": 304.83660130718954, + "grad_norm": 0.29825395345687866, + "learning_rate": 6.507048316950648e-05, + "loss": 0.1135, + "step": 11660 + }, + { + "epoch": 305.0980392156863, + "grad_norm": 0.32951053977012634, + "learning_rate": 6.491559875872415e-05, + "loss": 0.1168, + "step": 11670 + }, + { + "epoch": 305.359477124183, + "grad_norm": 0.25783228874206543, + "learning_rate": 6.476081026735513e-05, + "loss": 0.1094, + "step": 11680 + }, + { + "epoch": 305.62091503267976, + "grad_norm": 0.3310535252094269, + "learning_rate": 6.460611811858521e-05, + "loss": 0.1162, + "step": 11690 + }, + { + "epoch": 305.88235294117646, + "grad_norm": 0.3044494390487671, + "learning_rate": 6.445152273533687e-05, + "loss": 0.1172, + "step": 11700 + }, + { + "epoch": 306.1437908496732, + "grad_norm": 0.2858486771583557, + "learning_rate": 6.429702454026798e-05, + "loss": 0.1128, + "step": 11710 + }, + { + "epoch": 306.4052287581699, + "grad_norm": 0.28763970732688904, + "learning_rate": 6.414262395577065e-05, + "loss": 0.1123, + "step": 11720 + }, + { + "epoch": 306.6666666666667, + "grad_norm": 0.33545467257499695, + "learning_rate": 6.398832140397022e-05, + "loss": 0.117, + "step": 11730 + }, + { + "epoch": 306.9281045751634, + "grad_norm": 0.33642587065696716, + "learning_rate": 6.383411730672394e-05, + "loss": 0.1168, + "step": 11740 + }, + { + "epoch": 307.18954248366015, + "grad_norm": 0.3563029170036316, + "learning_rate": 6.368001208561998e-05, + "loss": 0.1135, + "step": 11750 + }, + { + "epoch": 307.45098039215685, + "grad_norm": 0.31238290667533875, + "learning_rate": 6.352600616197615e-05, + "loss": 0.1139, + "step": 11760 + }, + { + "epoch": 307.7124183006536, + "grad_norm": 0.3694748282432556, + "learning_rate": 6.337209995683867e-05, + "loss": 0.1166, + "step": 11770 + }, + { + "epoch": 307.9738562091503, + "grad_norm": 0.2764953672885895, + "learning_rate": 6.321829389098126e-05, + "loss": 0.1135, + "step": 11780 + }, + { + "epoch": 308.2352941176471, + "grad_norm": 0.28875818848609924, + "learning_rate": 6.306458838490385e-05, + "loss": 0.1121, + "step": 11790 + }, + { + "epoch": 308.4967320261438, + "grad_norm": 0.3236418068408966, + "learning_rate": 6.291098385883146e-05, + "loss": 0.1093, + "step": 11800 + }, + { + "epoch": 308.75816993464053, + "grad_norm": 0.2844972014427185, + "learning_rate": 6.275748073271292e-05, + "loss": 0.1184, + "step": 11810 + }, + { + "epoch": 309.01960784313724, + "grad_norm": 0.39473089575767517, + "learning_rate": 6.260407942621998e-05, + "loss": 0.1169, + "step": 11820 + }, + { + "epoch": 309.281045751634, + "grad_norm": 0.2659190893173218, + "learning_rate": 6.245078035874591e-05, + "loss": 0.1132, + "step": 11830 + }, + { + "epoch": 309.5424836601307, + "grad_norm": 0.28653818368911743, + "learning_rate": 6.22975839494045e-05, + "loss": 0.1137, + "step": 11840 + }, + { + "epoch": 309.80392156862746, + "grad_norm": 0.311483770608902, + "learning_rate": 6.214449061702898e-05, + "loss": 0.1127, + "step": 11850 + }, + { + "epoch": 310.06535947712416, + "grad_norm": 0.2597990930080414, + "learning_rate": 6.199150078017057e-05, + "loss": 0.1135, + "step": 11860 + }, + { + "epoch": 310.3267973856209, + "grad_norm": 0.32311543822288513, + "learning_rate": 6.183861485709765e-05, + "loss": 0.1154, + "step": 11870 + }, + { + "epoch": 310.5882352941176, + "grad_norm": 0.3250320553779602, + "learning_rate": 6.168583326579456e-05, + "loss": 0.1135, + "step": 11880 + }, + { + "epoch": 310.8496732026144, + "grad_norm": 0.2815916836261749, + "learning_rate": 6.153315642396025e-05, + "loss": 0.1145, + "step": 11890 + }, + { + "epoch": 311.1111111111111, + "grad_norm": 0.3453966975212097, + "learning_rate": 6.13805847490075e-05, + "loss": 0.1141, + "step": 11900 + }, + { + "epoch": 311.37254901960785, + "grad_norm": 0.3138068914413452, + "learning_rate": 6.122811865806131e-05, + "loss": 0.1127, + "step": 11910 + }, + { + "epoch": 311.63398692810455, + "grad_norm": 0.3160851299762726, + "learning_rate": 6.107575856795822e-05, + "loss": 0.1143, + "step": 11920 + }, + { + "epoch": 311.8954248366013, + "grad_norm": 0.3921455144882202, + "learning_rate": 6.0923504895244875e-05, + "loss": 0.1145, + "step": 11930 + }, + { + "epoch": 312.15686274509807, + "grad_norm": 0.31224459409713745, + "learning_rate": 6.077135805617705e-05, + "loss": 0.1113, + "step": 11940 + }, + { + "epoch": 312.41830065359477, + "grad_norm": 0.330119788646698, + "learning_rate": 6.061931846671833e-05, + "loss": 0.1154, + "step": 11950 + }, + { + "epoch": 312.67973856209153, + "grad_norm": 0.3644309639930725, + "learning_rate": 6.046738654253918e-05, + "loss": 0.1165, + "step": 11960 + }, + { + "epoch": 312.94117647058823, + "grad_norm": 0.30337658524513245, + "learning_rate": 6.031556269901567e-05, + "loss": 0.1154, + "step": 11970 + }, + { + "epoch": 313.202614379085, + "grad_norm": 0.31728994846343994, + "learning_rate": 6.0163847351228395e-05, + "loss": 0.1113, + "step": 11980 + }, + { + "epoch": 313.4640522875817, + "grad_norm": 0.32171520590782166, + "learning_rate": 6.0012240913961334e-05, + "loss": 0.1156, + "step": 11990 + }, + { + "epoch": 313.72549019607845, + "grad_norm": 0.2858985364437103, + "learning_rate": 5.986074380170068e-05, + "loss": 0.1139, + "step": 12000 + }, + { + "epoch": 313.98692810457516, + "grad_norm": 0.32188278436660767, + "learning_rate": 5.9709356428633746e-05, + "loss": 0.1163, + "step": 12010 + }, + { + "epoch": 314.2483660130719, + "grad_norm": 0.2941359877586365, + "learning_rate": 5.955807920864784e-05, + "loss": 0.1123, + "step": 12020 + }, + { + "epoch": 314.5098039215686, + "grad_norm": 0.29654598236083984, + "learning_rate": 5.940691255532912e-05, + "loss": 0.1116, + "step": 12030 + }, + { + "epoch": 314.7712418300654, + "grad_norm": 0.29309600591659546, + "learning_rate": 5.9255856881961444e-05, + "loss": 0.113, + "step": 12040 + }, + { + "epoch": 315.0326797385621, + "grad_norm": 0.3037513792514801, + "learning_rate": 5.910491260152522e-05, + "loss": 0.1196, + "step": 12050 + }, + { + "epoch": 315.29411764705884, + "grad_norm": 0.37008363008499146, + "learning_rate": 5.8954080126696366e-05, + "loss": 0.1123, + "step": 12060 + }, + { + "epoch": 315.55555555555554, + "grad_norm": 0.2903066575527191, + "learning_rate": 5.880335986984512e-05, + "loss": 0.1139, + "step": 12070 + }, + { + "epoch": 315.8169934640523, + "grad_norm": 0.42981478571891785, + "learning_rate": 5.865275224303491e-05, + "loss": 0.1145, + "step": 12080 + }, + { + "epoch": 316.078431372549, + "grad_norm": 0.2848221957683563, + "learning_rate": 5.850225765802122e-05, + "loss": 0.1178, + "step": 12090 + }, + { + "epoch": 316.33986928104576, + "grad_norm": 0.3206409811973572, + "learning_rate": 5.835187652625047e-05, + "loss": 0.1125, + "step": 12100 + }, + { + "epoch": 316.60130718954247, + "grad_norm": 0.30614250898361206, + "learning_rate": 5.820160925885902e-05, + "loss": 0.1139, + "step": 12110 + }, + { + "epoch": 316.8627450980392, + "grad_norm": 0.32410693168640137, + "learning_rate": 5.8051456266671746e-05, + "loss": 0.1144, + "step": 12120 + }, + { + "epoch": 317.12418300653593, + "grad_norm": 0.3014141619205475, + "learning_rate": 5.790141796020132e-05, + "loss": 0.114, + "step": 12130 + }, + { + "epoch": 317.3856209150327, + "grad_norm": 0.30295130610466003, + "learning_rate": 5.7751494749646575e-05, + "loss": 0.115, + "step": 12140 + }, + { + "epoch": 317.6470588235294, + "grad_norm": 0.3472057282924652, + "learning_rate": 5.7601687044891925e-05, + "loss": 0.1126, + "step": 12150 + }, + { + "epoch": 317.90849673202615, + "grad_norm": 0.2875531315803528, + "learning_rate": 5.745199525550596e-05, + "loss": 0.1153, + "step": 12160 + }, + { + "epoch": 318.16993464052285, + "grad_norm": 0.3130854070186615, + "learning_rate": 5.730241979074025e-05, + "loss": 0.1143, + "step": 12170 + }, + { + "epoch": 318.4313725490196, + "grad_norm": 0.27834898233413696, + "learning_rate": 5.71529610595285e-05, + "loss": 0.1129, + "step": 12180 + }, + { + "epoch": 318.6928104575163, + "grad_norm": 0.37341535091400146, + "learning_rate": 5.7003619470485016e-05, + "loss": 0.1133, + "step": 12190 + }, + { + "epoch": 318.9542483660131, + "grad_norm": 0.31970056891441345, + "learning_rate": 5.6854395431904094e-05, + "loss": 0.1151, + "step": 12200 + }, + { + "epoch": 319.2156862745098, + "grad_norm": 0.3472343683242798, + "learning_rate": 5.6705289351758584e-05, + "loss": 0.1109, + "step": 12210 + }, + { + "epoch": 319.47712418300654, + "grad_norm": 0.3212395906448364, + "learning_rate": 5.6556301637698785e-05, + "loss": 0.1119, + "step": 12220 + }, + { + "epoch": 319.73856209150324, + "grad_norm": 0.40940025448799133, + "learning_rate": 5.6407432697051424e-05, + "loss": 0.1174, + "step": 12230 + }, + { + "epoch": 320.0, + "grad_norm": 0.3223443925380707, + "learning_rate": 5.625868293681844e-05, + "loss": 0.1155, + "step": 12240 + }, + { + "epoch": 320.26143790849676, + "grad_norm": 0.295582115650177, + "learning_rate": 5.611005276367605e-05, + "loss": 0.1145, + "step": 12250 + }, + { + "epoch": 320.52287581699346, + "grad_norm": 0.3024216592311859, + "learning_rate": 5.596154258397353e-05, + "loss": 0.1104, + "step": 12260 + }, + { + "epoch": 320.7843137254902, + "grad_norm": 0.3048156797885895, + "learning_rate": 5.581315280373195e-05, + "loss": 0.1134, + "step": 12270 + }, + { + "epoch": 321.0457516339869, + "grad_norm": 0.256906121969223, + "learning_rate": 5.566488382864334e-05, + "loss": 0.1156, + "step": 12280 + }, + { + "epoch": 321.3071895424837, + "grad_norm": 0.2839376628398895, + "learning_rate": 5.55167360640694e-05, + "loss": 0.1141, + "step": 12290 + }, + { + "epoch": 321.5686274509804, + "grad_norm": 0.27784302830696106, + "learning_rate": 5.536870991504044e-05, + "loss": 0.1135, + "step": 12300 + }, + { + "epoch": 321.83006535947715, + "grad_norm": 0.2698685824871063, + "learning_rate": 5.522080578625438e-05, + "loss": 0.1134, + "step": 12310 + }, + { + "epoch": 322.09150326797385, + "grad_norm": 0.3085765838623047, + "learning_rate": 5.507302408207542e-05, + "loss": 0.1138, + "step": 12320 + }, + { + "epoch": 322.3529411764706, + "grad_norm": 0.3435112237930298, + "learning_rate": 5.492536520653307e-05, + "loss": 0.1124, + "step": 12330 + }, + { + "epoch": 322.6143790849673, + "grad_norm": 0.3543654680252075, + "learning_rate": 5.4777829563321046e-05, + "loss": 0.1152, + "step": 12340 + }, + { + "epoch": 322.87581699346407, + "grad_norm": 0.2766146957874298, + "learning_rate": 5.463041755579619e-05, + "loss": 0.1144, + "step": 12350 + }, + { + "epoch": 323.1372549019608, + "grad_norm": 0.3165183961391449, + "learning_rate": 5.4483129586977386e-05, + "loss": 0.1154, + "step": 12360 + }, + { + "epoch": 323.39869281045753, + "grad_norm": 0.3891643285751343, + "learning_rate": 5.433596605954415e-05, + "loss": 0.1137, + "step": 12370 + }, + { + "epoch": 323.66013071895424, + "grad_norm": 0.28057435154914856, + "learning_rate": 5.4188927375836074e-05, + "loss": 0.1116, + "step": 12380 + }, + { + "epoch": 323.921568627451, + "grad_norm": 0.2857229709625244, + "learning_rate": 5.404201393785122e-05, + "loss": 0.1143, + "step": 12390 + }, + { + "epoch": 324.1830065359477, + "grad_norm": 0.31078898906707764, + "learning_rate": 5.389522614724536e-05, + "loss": 0.113, + "step": 12400 + }, + { + "epoch": 324.44444444444446, + "grad_norm": 0.2750803530216217, + "learning_rate": 5.374856440533078e-05, + "loss": 0.1129, + "step": 12410 + }, + { + "epoch": 324.70588235294116, + "grad_norm": 0.3001578748226166, + "learning_rate": 5.360202911307493e-05, + "loss": 0.1167, + "step": 12420 + }, + { + "epoch": 324.9673202614379, + "grad_norm": 0.2825237810611725, + "learning_rate": 5.345562067109984e-05, + "loss": 0.1131, + "step": 12430 + }, + { + "epoch": 325.2287581699346, + "grad_norm": 0.40779075026512146, + "learning_rate": 5.3309339479680485e-05, + "loss": 0.1165, + "step": 12440 + }, + { + "epoch": 325.4901960784314, + "grad_norm": 0.30739492177963257, + "learning_rate": 5.316318593874415e-05, + "loss": 0.1128, + "step": 12450 + }, + { + "epoch": 325.7516339869281, + "grad_norm": 0.2763489782810211, + "learning_rate": 5.301716044786902e-05, + "loss": 0.1109, + "step": 12460 + }, + { + "epoch": 326.01307189542484, + "grad_norm": 0.3952416479587555, + "learning_rate": 5.287126340628312e-05, + "loss": 0.1146, + "step": 12470 + }, + { + "epoch": 326.27450980392155, + "grad_norm": 0.36667054891586304, + "learning_rate": 5.2725495212863494e-05, + "loss": 0.1158, + "step": 12480 + }, + { + "epoch": 326.5359477124183, + "grad_norm": 0.2867089807987213, + "learning_rate": 5.25798562661348e-05, + "loss": 0.1112, + "step": 12490 + }, + { + "epoch": 326.797385620915, + "grad_norm": 0.39078038930892944, + "learning_rate": 5.2434346964268344e-05, + "loss": 0.1159, + "step": 12500 + }, + { + "epoch": 327.05882352941177, + "grad_norm": 0.30490151047706604, + "learning_rate": 5.2288967705081e-05, + "loss": 0.116, + "step": 12510 + }, + { + "epoch": 327.32026143790847, + "grad_norm": 0.2912214696407318, + "learning_rate": 5.214371888603409e-05, + "loss": 0.1097, + "step": 12520 + }, + { + "epoch": 327.58169934640523, + "grad_norm": 0.32746198773384094, + "learning_rate": 5.199860090423233e-05, + "loss": 0.1136, + "step": 12530 + }, + { + "epoch": 327.84313725490193, + "grad_norm": 0.48329511284828186, + "learning_rate": 5.185361415642283e-05, + "loss": 0.1163, + "step": 12540 + }, + { + "epoch": 328.1045751633987, + "grad_norm": 0.34038567543029785, + "learning_rate": 5.170875903899375e-05, + "loss": 0.1136, + "step": 12550 + }, + { + "epoch": 328.36601307189545, + "grad_norm": 0.3485356569290161, + "learning_rate": 5.1564035947973456e-05, + "loss": 0.1106, + "step": 12560 + }, + { + "epoch": 328.62745098039215, + "grad_norm": 0.3116021156311035, + "learning_rate": 5.141944527902932e-05, + "loss": 0.1147, + "step": 12570 + }, + { + "epoch": 328.8888888888889, + "grad_norm": 0.32443079352378845, + "learning_rate": 5.127498742746675e-05, + "loss": 0.1181, + "step": 12580 + }, + { + "epoch": 329.1503267973856, + "grad_norm": 0.28348246216773987, + "learning_rate": 5.113066278822807e-05, + "loss": 0.1125, + "step": 12590 + }, + { + "epoch": 329.4117647058824, + "grad_norm": 0.30532312393188477, + "learning_rate": 5.098647175589118e-05, + "loss": 0.1122, + "step": 12600 + }, + { + "epoch": 329.6732026143791, + "grad_norm": 0.37380146980285645, + "learning_rate": 5.084241472466897e-05, + "loss": 0.1149, + "step": 12610 + }, + { + "epoch": 329.93464052287584, + "grad_norm": 0.2732062339782715, + "learning_rate": 5.069849208840779e-05, + "loss": 0.1124, + "step": 12620 + }, + { + "epoch": 330.19607843137254, + "grad_norm": 0.3351595997810364, + "learning_rate": 5.055470424058666e-05, + "loss": 0.1137, + "step": 12630 + }, + { + "epoch": 330.4575163398693, + "grad_norm": 0.33784759044647217, + "learning_rate": 5.041105157431616e-05, + "loss": 0.114, + "step": 12640 + }, + { + "epoch": 330.718954248366, + "grad_norm": 0.30338507890701294, + "learning_rate": 5.026753448233703e-05, + "loss": 0.1135, + "step": 12650 + }, + { + "epoch": 330.98039215686276, + "grad_norm": 0.35373762249946594, + "learning_rate": 5.012415335701962e-05, + "loss": 0.1145, + "step": 12660 + }, + { + "epoch": 331.24183006535947, + "grad_norm": 0.3398035764694214, + "learning_rate": 4.99809085903624e-05, + "loss": 0.1119, + "step": 12670 + }, + { + "epoch": 331.5032679738562, + "grad_norm": 0.3283422291278839, + "learning_rate": 4.983780057399111e-05, + "loss": 0.1151, + "step": 12680 + }, + { + "epoch": 331.7647058823529, + "grad_norm": 0.30405548214912415, + "learning_rate": 4.9694829699157695e-05, + "loss": 0.1136, + "step": 12690 + }, + { + "epoch": 332.0261437908497, + "grad_norm": 0.3022817373275757, + "learning_rate": 4.9551996356738915e-05, + "loss": 0.1128, + "step": 12700 + }, + { + "epoch": 332.2875816993464, + "grad_norm": 0.3224383592605591, + "learning_rate": 4.940930093723578e-05, + "loss": 0.1129, + "step": 12710 + }, + { + "epoch": 332.54901960784315, + "grad_norm": 0.3512856662273407, + "learning_rate": 4.9266743830772034e-05, + "loss": 0.1125, + "step": 12720 + }, + { + "epoch": 332.81045751633985, + "grad_norm": 0.32787182927131653, + "learning_rate": 4.9124325427093455e-05, + "loss": 0.1132, + "step": 12730 + }, + { + "epoch": 333.0718954248366, + "grad_norm": 0.29392367601394653, + "learning_rate": 4.898204611556647e-05, + "loss": 0.1154, + "step": 12740 + }, + { + "epoch": 333.3333333333333, + "grad_norm": 0.30844447016716003, + "learning_rate": 4.883990628517725e-05, + "loss": 0.112, + "step": 12750 + }, + { + "epoch": 333.5947712418301, + "grad_norm": 0.38730964064598083, + "learning_rate": 4.869790632453075e-05, + "loss": 0.1178, + "step": 12760 + }, + { + "epoch": 333.8562091503268, + "grad_norm": 0.3621443808078766, + "learning_rate": 4.8556046621849346e-05, + "loss": 0.1114, + "step": 12770 + }, + { + "epoch": 334.11764705882354, + "grad_norm": 0.31759655475616455, + "learning_rate": 4.841432756497214e-05, + "loss": 0.1159, + "step": 12780 + }, + { + "epoch": 334.37908496732024, + "grad_norm": 0.2881384789943695, + "learning_rate": 4.827274954135358e-05, + "loss": 0.1123, + "step": 12790 + }, + { + "epoch": 334.640522875817, + "grad_norm": 0.3464260697364807, + "learning_rate": 4.813131293806253e-05, + "loss": 0.1125, + "step": 12800 + }, + { + "epoch": 334.9019607843137, + "grad_norm": 0.29048338532447815, + "learning_rate": 4.7990018141781344e-05, + "loss": 0.1154, + "step": 12810 + }, + { + "epoch": 335.16339869281046, + "grad_norm": 0.301886647939682, + "learning_rate": 4.7848865538804535e-05, + "loss": 0.1129, + "step": 12820 + }, + { + "epoch": 335.42483660130716, + "grad_norm": 0.3119642436504364, + "learning_rate": 4.770785551503798e-05, + "loss": 0.1118, + "step": 12830 + }, + { + "epoch": 335.6862745098039, + "grad_norm": 0.32431304454803467, + "learning_rate": 4.756698845599769e-05, + "loss": 0.1127, + "step": 12840 + }, + { + "epoch": 335.9477124183006, + "grad_norm": 0.41362830996513367, + "learning_rate": 4.7426264746808755e-05, + "loss": 0.1163, + "step": 12850 + }, + { + "epoch": 336.2091503267974, + "grad_norm": 0.2932046055793762, + "learning_rate": 4.728568477220453e-05, + "loss": 0.1129, + "step": 12860 + }, + { + "epoch": 336.47058823529414, + "grad_norm": 0.32170718908309937, + "learning_rate": 4.714524891652524e-05, + "loss": 0.1123, + "step": 12870 + }, + { + "epoch": 336.73202614379085, + "grad_norm": 0.32445117831230164, + "learning_rate": 4.7004957563717134e-05, + "loss": 0.1132, + "step": 12880 + }, + { + "epoch": 336.9934640522876, + "grad_norm": 0.39626502990722656, + "learning_rate": 4.686481109733146e-05, + "loss": 0.1167, + "step": 12890 + }, + { + "epoch": 337.2549019607843, + "grad_norm": 0.2989867925643921, + "learning_rate": 4.6724809900523256e-05, + "loss": 0.1099, + "step": 12900 + }, + { + "epoch": 337.51633986928107, + "grad_norm": 0.303310364484787, + "learning_rate": 4.658495435605051e-05, + "loss": 0.1151, + "step": 12910 + }, + { + "epoch": 337.77777777777777, + "grad_norm": 0.28980472683906555, + "learning_rate": 4.6445244846272916e-05, + "loss": 0.1143, + "step": 12920 + }, + { + "epoch": 338.03921568627453, + "grad_norm": 0.2834646999835968, + "learning_rate": 4.630568175315088e-05, + "loss": 0.1138, + "step": 12930 + }, + { + "epoch": 338.30065359477123, + "grad_norm": 0.2876114249229431, + "learning_rate": 4.6166265458244665e-05, + "loss": 0.1142, + "step": 12940 + }, + { + "epoch": 338.562091503268, + "grad_norm": 0.35517698526382446, + "learning_rate": 4.6026996342713e-05, + "loss": 0.1116, + "step": 12950 + }, + { + "epoch": 338.8235294117647, + "grad_norm": 0.310722678899765, + "learning_rate": 4.588787478731242e-05, + "loss": 0.1145, + "step": 12960 + }, + { + "epoch": 339.08496732026146, + "grad_norm": 0.2860526740550995, + "learning_rate": 4.574890117239592e-05, + "loss": 0.1144, + "step": 12970 + }, + { + "epoch": 339.34640522875816, + "grad_norm": 0.3224795162677765, + "learning_rate": 4.5610075877912e-05, + "loss": 0.1144, + "step": 12980 + }, + { + "epoch": 339.6078431372549, + "grad_norm": 0.3138015866279602, + "learning_rate": 4.5471399283403784e-05, + "loss": 0.1126, + "step": 12990 + }, + { + "epoch": 339.8692810457516, + "grad_norm": 0.3215549886226654, + "learning_rate": 4.533287176800772e-05, + "loss": 0.114, + "step": 13000 + }, + { + "epoch": 340.1307189542484, + "grad_norm": 0.2976275384426117, + "learning_rate": 4.5194493710452825e-05, + "loss": 0.1131, + "step": 13010 + }, + { + "epoch": 340.3921568627451, + "grad_norm": 0.3737821877002716, + "learning_rate": 4.505626548905938e-05, + "loss": 0.1141, + "step": 13020 + }, + { + "epoch": 340.65359477124184, + "grad_norm": 0.3438400328159332, + "learning_rate": 4.491818748173804e-05, + "loss": 0.1133, + "step": 13030 + }, + { + "epoch": 340.91503267973854, + "grad_norm": 0.34674420952796936, + "learning_rate": 4.478026006598885e-05, + "loss": 0.1134, + "step": 13040 + }, + { + "epoch": 341.1764705882353, + "grad_norm": 0.31675341725349426, + "learning_rate": 4.464248361890006e-05, + "loss": 0.112, + "step": 13050 + }, + { + "epoch": 341.437908496732, + "grad_norm": 0.2978508770465851, + "learning_rate": 4.4504858517147265e-05, + "loss": 0.1124, + "step": 13060 + }, + { + "epoch": 341.69934640522877, + "grad_norm": 0.32540374994277954, + "learning_rate": 4.43673851369922e-05, + "loss": 0.1104, + "step": 13070 + }, + { + "epoch": 341.96078431372547, + "grad_norm": 0.3203536570072174, + "learning_rate": 4.423006385428181e-05, + "loss": 0.119, + "step": 13080 + }, + { + "epoch": 342.22222222222223, + "grad_norm": 0.3339461386203766, + "learning_rate": 4.409289504444732e-05, + "loss": 0.1154, + "step": 13090 + }, + { + "epoch": 342.48366013071893, + "grad_norm": 0.30102136731147766, + "learning_rate": 4.3955879082502926e-05, + "loss": 0.1116, + "step": 13100 + }, + { + "epoch": 342.7450980392157, + "grad_norm": 0.31064385175704956, + "learning_rate": 4.381901634304512e-05, + "loss": 0.1145, + "step": 13110 + }, + { + "epoch": 343.0065359477124, + "grad_norm": 0.30831557512283325, + "learning_rate": 4.368230720025137e-05, + "loss": 0.1121, + "step": 13120 + }, + { + "epoch": 343.26797385620915, + "grad_norm": 0.31142985820770264, + "learning_rate": 4.35457520278792e-05, + "loss": 0.1136, + "step": 13130 + }, + { + "epoch": 343.52941176470586, + "grad_norm": 0.2939104735851288, + "learning_rate": 4.340935119926534e-05, + "loss": 0.1129, + "step": 13140 + }, + { + "epoch": 343.7908496732026, + "grad_norm": 0.30027836561203003, + "learning_rate": 4.327310508732437e-05, + "loss": 0.1144, + "step": 13150 + }, + { + "epoch": 344.0522875816994, + "grad_norm": 0.25800076127052307, + "learning_rate": 4.3137014064547965e-05, + "loss": 0.1144, + "step": 13160 + }, + { + "epoch": 344.3137254901961, + "grad_norm": 0.3547254204750061, + "learning_rate": 4.3001078503003825e-05, + "loss": 0.1115, + "step": 13170 + }, + { + "epoch": 344.57516339869284, + "grad_norm": 0.2877454459667206, + "learning_rate": 4.286529877433453e-05, + "loss": 0.1131, + "step": 13180 + }, + { + "epoch": 344.83660130718954, + "grad_norm": 0.36441680788993835, + "learning_rate": 4.272967524975673e-05, + "loss": 0.1127, + "step": 13190 + }, + { + "epoch": 345.0980392156863, + "grad_norm": 0.3238193988800049, + "learning_rate": 4.2594208300059946e-05, + "loss": 0.1185, + "step": 13200 + }, + { + "epoch": 345.359477124183, + "grad_norm": 0.2932418882846832, + "learning_rate": 4.245889829560559e-05, + "loss": 0.1134, + "step": 13210 + }, + { + "epoch": 345.62091503267976, + "grad_norm": 0.2501135766506195, + "learning_rate": 4.232374560632614e-05, + "loss": 0.108, + "step": 13220 + }, + { + "epoch": 345.88235294117646, + "grad_norm": 0.39940959215164185, + "learning_rate": 4.218875060172379e-05, + "loss": 0.1159, + "step": 13230 + }, + { + "epoch": 346.1437908496732, + "grad_norm": 0.30277755856513977, + "learning_rate": 4.2053913650869816e-05, + "loss": 0.1145, + "step": 13240 + }, + { + "epoch": 346.4052287581699, + "grad_norm": 0.2736481726169586, + "learning_rate": 4.191923512240327e-05, + "loss": 0.1114, + "step": 13250 + }, + { + "epoch": 346.6666666666667, + "grad_norm": 0.3326358199119568, + "learning_rate": 4.1784715384530035e-05, + "loss": 0.1131, + "step": 13260 + }, + { + "epoch": 346.9281045751634, + "grad_norm": 0.31444019079208374, + "learning_rate": 4.165035480502204e-05, + "loss": 0.1164, + "step": 13270 + }, + { + "epoch": 347.18954248366015, + "grad_norm": 0.3125884532928467, + "learning_rate": 4.1516153751215895e-05, + "loss": 0.1157, + "step": 13280 + }, + { + "epoch": 347.45098039215685, + "grad_norm": 0.3291109800338745, + "learning_rate": 4.138211259001222e-05, + "loss": 0.1151, + "step": 13290 + }, + { + "epoch": 347.7124183006536, + "grad_norm": 0.385786771774292, + "learning_rate": 4.1248231687874414e-05, + "loss": 0.1105, + "step": 13300 + }, + { + "epoch": 347.9738562091503, + "grad_norm": 0.32350867986679077, + "learning_rate": 4.1114511410827714e-05, + "loss": 0.1148, + "step": 13310 + }, + { + "epoch": 348.2352941176471, + "grad_norm": 0.2844417691230774, + "learning_rate": 4.098095212445831e-05, + "loss": 0.113, + "step": 13320 + }, + { + "epoch": 348.4967320261438, + "grad_norm": 0.298141747713089, + "learning_rate": 4.084755419391213e-05, + "loss": 0.1118, + "step": 13330 + }, + { + "epoch": 348.75816993464053, + "grad_norm": 0.2766071856021881, + "learning_rate": 4.071431798389408e-05, + "loss": 0.1134, + "step": 13340 + }, + { + "epoch": 349.01960784313724, + "grad_norm": 0.3159619867801666, + "learning_rate": 4.058124385866685e-05, + "loss": 0.1138, + "step": 13350 + }, + { + "epoch": 349.281045751634, + "grad_norm": 0.2848956286907196, + "learning_rate": 4.044833218204998e-05, + "loss": 0.1128, + "step": 13360 + }, + { + "epoch": 349.5424836601307, + "grad_norm": 0.28767091035842896, + "learning_rate": 4.031558331741897e-05, + "loss": 0.1114, + "step": 13370 + }, + { + "epoch": 349.80392156862746, + "grad_norm": 0.30374428629875183, + "learning_rate": 4.01829976277041e-05, + "loss": 0.1143, + "step": 13380 + }, + { + "epoch": 350.06535947712416, + "grad_norm": 0.2949487864971161, + "learning_rate": 4.005057547538964e-05, + "loss": 0.1147, + "step": 13390 + }, + { + "epoch": 350.3267973856209, + "grad_norm": 0.2627740502357483, + "learning_rate": 3.991831722251268e-05, + "loss": 0.1116, + "step": 13400 + }, + { + "epoch": 350.5882352941176, + "grad_norm": 0.3931275010108948, + "learning_rate": 3.978622323066217e-05, + "loss": 0.1147, + "step": 13410 + }, + { + "epoch": 350.8496732026144, + "grad_norm": 0.27783629298210144, + "learning_rate": 3.965429386097813e-05, + "loss": 0.1137, + "step": 13420 + }, + { + "epoch": 351.1111111111111, + "grad_norm": 0.29457733035087585, + "learning_rate": 3.952252947415038e-05, + "loss": 0.1134, + "step": 13430 + }, + { + "epoch": 351.37254901960785, + "grad_norm": 0.27213922142982483, + "learning_rate": 3.9390930430417696e-05, + "loss": 0.1098, + "step": 13440 + }, + { + "epoch": 351.63398692810455, + "grad_norm": 0.2786455452442169, + "learning_rate": 3.925949708956689e-05, + "loss": 0.1134, + "step": 13450 + }, + { + "epoch": 351.8954248366013, + "grad_norm": 0.326171338558197, + "learning_rate": 3.9128229810931626e-05, + "loss": 0.1145, + "step": 13460 + }, + { + "epoch": 352.15686274509807, + "grad_norm": 0.2971991002559662, + "learning_rate": 3.8997128953391727e-05, + "loss": 0.1141, + "step": 13470 + }, + { + "epoch": 352.41830065359477, + "grad_norm": 0.3068319261074066, + "learning_rate": 3.886619487537187e-05, + "loss": 0.1101, + "step": 13480 + }, + { + "epoch": 352.67973856209153, + "grad_norm": 0.285197377204895, + "learning_rate": 3.873542793484081e-05, + "loss": 0.1134, + "step": 13490 + }, + { + "epoch": 352.94117647058823, + "grad_norm": 0.30382540822029114, + "learning_rate": 3.860482848931042e-05, + "loss": 0.1161, + "step": 13500 + }, + { + "epoch": 353.202614379085, + "grad_norm": 0.32687294483184814, + "learning_rate": 3.847439689583454e-05, + "loss": 0.1113, + "step": 13510 + }, + { + "epoch": 353.4640522875817, + "grad_norm": 0.27805617451667786, + "learning_rate": 3.834413351100823e-05, + "loss": 0.1139, + "step": 13520 + }, + { + "epoch": 353.72549019607845, + "grad_norm": 0.33675503730773926, + "learning_rate": 3.821403869096658e-05, + "loss": 0.1166, + "step": 13530 + }, + { + "epoch": 353.98692810457516, + "grad_norm": 0.3418455123901367, + "learning_rate": 3.808411279138383e-05, + "loss": 0.1113, + "step": 13540 + }, + { + "epoch": 354.2483660130719, + "grad_norm": 0.2951265275478363, + "learning_rate": 3.7954356167472485e-05, + "loss": 0.1149, + "step": 13550 + }, + { + "epoch": 354.5098039215686, + "grad_norm": 0.3082883358001709, + "learning_rate": 3.782476917398213e-05, + "loss": 0.1125, + "step": 13560 + }, + { + "epoch": 354.7712418300654, + "grad_norm": 0.30738532543182373, + "learning_rate": 3.7695352165198774e-05, + "loss": 0.1139, + "step": 13570 + }, + { + "epoch": 355.0326797385621, + "grad_norm": 0.28504759073257446, + "learning_rate": 3.7566105494943435e-05, + "loss": 0.1122, + "step": 13580 + }, + { + "epoch": 355.29411764705884, + "grad_norm": 0.2820607125759125, + "learning_rate": 3.743702951657163e-05, + "loss": 0.1104, + "step": 13590 + }, + { + "epoch": 355.55555555555554, + "grad_norm": 0.38755741715431213, + "learning_rate": 3.730812458297222e-05, + "loss": 0.1149, + "step": 13600 + }, + { + "epoch": 355.8169934640523, + "grad_norm": 0.31920456886291504, + "learning_rate": 3.717939104656626e-05, + "loss": 0.1128, + "step": 13610 + }, + { + "epoch": 356.078431372549, + "grad_norm": 0.4119099974632263, + "learning_rate": 3.7050829259306466e-05, + "loss": 0.1142, + "step": 13620 + }, + { + "epoch": 356.33986928104576, + "grad_norm": 0.31791460514068604, + "learning_rate": 3.692243957267568e-05, + "loss": 0.1095, + "step": 13630 + }, + { + "epoch": 356.60130718954247, + "grad_norm": 0.24925312399864197, + "learning_rate": 3.679422233768651e-05, + "loss": 0.1179, + "step": 13640 + }, + { + "epoch": 356.8627450980392, + "grad_norm": 0.3441096246242523, + "learning_rate": 3.6666177904879994e-05, + "loss": 0.1141, + "step": 13650 + }, + { + "epoch": 357.12418300653593, + "grad_norm": 0.2764681577682495, + "learning_rate": 3.655108595056173e-05, + "loss": 0.1124, + "step": 13660 + }, + { + "epoch": 357.3856209150327, + "grad_norm": 0.29715630412101746, + "learning_rate": 3.6423370805949876e-05, + "loss": 0.1119, + "step": 13670 + }, + { + "epoch": 357.6470588235294, + "grad_norm": 0.282524049282074, + "learning_rate": 3.629582947741461e-05, + "loss": 0.1155, + "step": 13680 + }, + { + "epoch": 357.90849673202615, + "grad_norm": 0.3714596629142761, + "learning_rate": 3.616846231364902e-05, + "loss": 0.1121, + "step": 13690 + }, + { + "epoch": 358.16993464052285, + "grad_norm": 0.2508104145526886, + "learning_rate": 3.604126966287004e-05, + "loss": 0.1124, + "step": 13700 + }, + { + "epoch": 358.4313725490196, + "grad_norm": 0.30074048042297363, + "learning_rate": 3.591425187281756e-05, + "loss": 0.1143, + "step": 13710 + }, + { + "epoch": 358.6928104575163, + "grad_norm": 0.3082398474216461, + "learning_rate": 3.578740929075333e-05, + "loss": 0.1139, + "step": 13720 + }, + { + "epoch": 358.9542483660131, + "grad_norm": 0.3137090504169464, + "learning_rate": 3.5660742263460203e-05, + "loss": 0.1116, + "step": 13730 + }, + { + "epoch": 359.2156862745098, + "grad_norm": 0.33088791370391846, + "learning_rate": 3.553425113724088e-05, + "loss": 0.1108, + "step": 13740 + }, + { + "epoch": 359.47712418300654, + "grad_norm": 0.29915428161621094, + "learning_rate": 3.5407936257917326e-05, + "loss": 0.1142, + "step": 13750 + }, + { + "epoch": 359.73856209150324, + "grad_norm": 0.2895696461200714, + "learning_rate": 3.5281797970829635e-05, + "loss": 0.1125, + "step": 13760 + }, + { + "epoch": 360.0, + "grad_norm": 0.33440282940864563, + "learning_rate": 3.5155836620835006e-05, + "loss": 0.115, + "step": 13770 + }, + { + "epoch": 360.26143790849676, + "grad_norm": 0.3408808708190918, + "learning_rate": 3.5030052552307044e-05, + "loss": 0.1107, + "step": 13780 + }, + { + "epoch": 360.52287581699346, + "grad_norm": 0.35789361596107483, + "learning_rate": 3.490444610913447e-05, + "loss": 0.1147, + "step": 13790 + }, + { + "epoch": 360.7843137254902, + "grad_norm": 0.30055010318756104, + "learning_rate": 3.477901763472057e-05, + "loss": 0.1123, + "step": 13800 + }, + { + "epoch": 361.0457516339869, + "grad_norm": 0.3059723973274231, + "learning_rate": 3.465376747198203e-05, + "loss": 0.1129, + "step": 13810 + }, + { + "epoch": 361.3071895424837, + "grad_norm": 0.32120776176452637, + "learning_rate": 3.452869596334798e-05, + "loss": 0.1127, + "step": 13820 + }, + { + "epoch": 361.5686274509804, + "grad_norm": 0.3302242159843445, + "learning_rate": 3.440380345075915e-05, + "loss": 0.1121, + "step": 13830 + }, + { + "epoch": 361.83006535947715, + "grad_norm": 0.3632891774177551, + "learning_rate": 3.427909027566688e-05, + "loss": 0.1118, + "step": 13840 + }, + { + "epoch": 362.09150326797385, + "grad_norm": 0.26599353551864624, + "learning_rate": 3.415455677903224e-05, + "loss": 0.1149, + "step": 13850 + }, + { + "epoch": 362.3529411764706, + "grad_norm": 0.29619720578193665, + "learning_rate": 3.403020330132509e-05, + "loss": 0.1107, + "step": 13860 + }, + { + "epoch": 362.6143790849673, + "grad_norm": 0.3457578718662262, + "learning_rate": 3.3906030182523077e-05, + "loss": 0.1157, + "step": 13870 + }, + { + "epoch": 362.87581699346407, + "grad_norm": 0.33504214882850647, + "learning_rate": 3.378203776211075e-05, + "loss": 0.1117, + "step": 13880 + }, + { + "epoch": 363.1372549019608, + "grad_norm": 0.3943881392478943, + "learning_rate": 3.365822637907862e-05, + "loss": 0.1158, + "step": 13890 + }, + { + "epoch": 363.39869281045753, + "grad_norm": 0.4457724392414093, + "learning_rate": 3.353459637192231e-05, + "loss": 0.1122, + "step": 13900 + }, + { + "epoch": 363.66013071895424, + "grad_norm": 0.3329622745513916, + "learning_rate": 3.341114807864158e-05, + "loss": 0.1119, + "step": 13910 + }, + { + "epoch": 363.921568627451, + "grad_norm": 0.3475494682788849, + "learning_rate": 3.328788183673932e-05, + "loss": 0.1142, + "step": 13920 + }, + { + "epoch": 364.1830065359477, + "grad_norm": 0.2990402281284332, + "learning_rate": 3.316479798322072e-05, + "loss": 0.1116, + "step": 13930 + }, + { + "epoch": 364.44444444444446, + "grad_norm": 0.24483919143676758, + "learning_rate": 3.3041896854592305e-05, + "loss": 0.1128, + "step": 13940 + }, + { + "epoch": 364.70588235294116, + "grad_norm": 0.31257379055023193, + "learning_rate": 3.2919178786861104e-05, + "loss": 0.1143, + "step": 13950 + }, + { + "epoch": 364.9673202614379, + "grad_norm": 0.3180519938468933, + "learning_rate": 3.279664411553368e-05, + "loss": 0.1138, + "step": 13960 + }, + { + "epoch": 365.2287581699346, + "grad_norm": 0.32743024826049805, + "learning_rate": 3.267429317561504e-05, + "loss": 0.1112, + "step": 13970 + }, + { + "epoch": 365.4901960784314, + "grad_norm": 0.3621465861797333, + "learning_rate": 3.2552126301608043e-05, + "loss": 0.1136, + "step": 13980 + }, + { + "epoch": 365.7516339869281, + "grad_norm": 0.3021661639213562, + "learning_rate": 3.243014382751224e-05, + "loss": 0.1142, + "step": 13990 + }, + { + "epoch": 366.01307189542484, + "grad_norm": 0.4712045192718506, + "learning_rate": 3.230834608682305e-05, + "loss": 0.1143, + "step": 14000 + }, + { + "epoch": 366.27450980392155, + "grad_norm": 0.28367525339126587, + "learning_rate": 3.218673341253092e-05, + "loss": 0.1144, + "step": 14010 + }, + { + "epoch": 366.5359477124183, + "grad_norm": 0.36586108803749084, + "learning_rate": 3.206530613712014e-05, + "loss": 0.1121, + "step": 14020 + }, + { + "epoch": 366.797385620915, + "grad_norm": 0.2866863012313843, + "learning_rate": 3.194406459256833e-05, + "loss": 0.1137, + "step": 14030 + }, + { + "epoch": 367.05882352941177, + "grad_norm": 0.31462156772613525, + "learning_rate": 3.182300911034518e-05, + "loss": 0.1129, + "step": 14040 + }, + { + "epoch": 367.32026143790847, + "grad_norm": 0.2943307161331177, + "learning_rate": 3.17021400214118e-05, + "loss": 0.1101, + "step": 14050 + }, + { + "epoch": 367.58169934640523, + "grad_norm": 0.31919369101524353, + "learning_rate": 3.158145765621971e-05, + "loss": 0.1132, + "step": 14060 + }, + { + "epoch": 367.84313725490193, + "grad_norm": 0.31484705209732056, + "learning_rate": 3.1460962344709774e-05, + "loss": 0.1127, + "step": 14070 + }, + { + "epoch": 368.1045751633987, + "grad_norm": 0.3020099997520447, + "learning_rate": 3.1340654416311656e-05, + "loss": 0.1153, + "step": 14080 + }, + { + "epoch": 368.36601307189545, + "grad_norm": 0.29994770884513855, + "learning_rate": 3.1220534199942585e-05, + "loss": 0.113, + "step": 14090 + }, + { + "epoch": 368.62745098039215, + "grad_norm": 0.28115540742874146, + "learning_rate": 3.1100602024006707e-05, + "loss": 0.1115, + "step": 14100 + }, + { + "epoch": 368.8888888888889, + "grad_norm": 0.37485307455062866, + "learning_rate": 3.098085821639398e-05, + "loss": 0.1172, + "step": 14110 + }, + { + "epoch": 369.1503267973856, + "grad_norm": 0.3369976282119751, + "learning_rate": 3.086130310447937e-05, + "loss": 0.11, + "step": 14120 + }, + { + "epoch": 369.4117647058824, + "grad_norm": 0.2802003026008606, + "learning_rate": 3.074193701512204e-05, + "loss": 0.1113, + "step": 14130 + }, + { + "epoch": 369.6732026143791, + "grad_norm": 0.3496047854423523, + "learning_rate": 3.0622760274664275e-05, + "loss": 0.1167, + "step": 14140 + }, + { + "epoch": 369.93464052287584, + "grad_norm": 0.2986941933631897, + "learning_rate": 3.0503773208930787e-05, + "loss": 0.1118, + "step": 14150 + }, + { + "epoch": 370.19607843137254, + "grad_norm": 0.2768370807170868, + "learning_rate": 3.038497614322763e-05, + "loss": 0.1123, + "step": 14160 + }, + { + "epoch": 370.4575163398693, + "grad_norm": 0.3114430010318756, + "learning_rate": 3.0266369402341433e-05, + "loss": 0.1152, + "step": 14170 + }, + { + "epoch": 370.718954248366, + "grad_norm": 0.30271750688552856, + "learning_rate": 3.0147953310538546e-05, + "loss": 0.1107, + "step": 14180 + }, + { + "epoch": 370.98039215686276, + "grad_norm": 0.33484479784965515, + "learning_rate": 3.0029728191563977e-05, + "loss": 0.112, + "step": 14190 + }, + { + "epoch": 371.24183006535947, + "grad_norm": 0.428725004196167, + "learning_rate": 2.9911694368640764e-05, + "loss": 0.1123, + "step": 14200 + }, + { + "epoch": 371.5032679738562, + "grad_norm": 0.3206990361213684, + "learning_rate": 2.9793852164468826e-05, + "loss": 0.1128, + "step": 14210 + }, + { + "epoch": 371.7647058823529, + "grad_norm": 0.3497907817363739, + "learning_rate": 2.9676201901224233e-05, + "loss": 0.1133, + "step": 14220 + }, + { + "epoch": 372.0261437908497, + "grad_norm": 0.3005002439022064, + "learning_rate": 2.955874390055836e-05, + "loss": 0.1127, + "step": 14230 + }, + { + "epoch": 372.2875816993464, + "grad_norm": 0.2785640358924866, + "learning_rate": 2.9441478483596862e-05, + "loss": 0.1137, + "step": 14240 + }, + { + "epoch": 372.54901960784315, + "grad_norm": 0.33614808320999146, + "learning_rate": 2.9324405970938906e-05, + "loss": 0.1105, + "step": 14250 + }, + { + "epoch": 372.81045751633985, + "grad_norm": 0.2854107618331909, + "learning_rate": 2.9207526682656306e-05, + "loss": 0.1153, + "step": 14260 + }, + { + "epoch": 373.0718954248366, + "grad_norm": 0.25472962856292725, + "learning_rate": 2.909084093829252e-05, + "loss": 0.1101, + "step": 14270 + }, + { + "epoch": 373.3333333333333, + "grad_norm": 0.34902864694595337, + "learning_rate": 2.897434905686198e-05, + "loss": 0.1105, + "step": 14280 + }, + { + "epoch": 373.5947712418301, + "grad_norm": 0.2810913622379303, + "learning_rate": 2.8858051356849014e-05, + "loss": 0.1148, + "step": 14290 + }, + { + "epoch": 373.8562091503268, + "grad_norm": 0.34702831506729126, + "learning_rate": 2.8741948156207056e-05, + "loss": 0.1137, + "step": 14300 + }, + { + "epoch": 374.11764705882354, + "grad_norm": 0.31400611996650696, + "learning_rate": 2.8626039772357882e-05, + "loss": 0.1122, + "step": 14310 + }, + { + "epoch": 374.37908496732024, + "grad_norm": 0.3254755139350891, + "learning_rate": 2.8510326522190545e-05, + "loss": 0.1113, + "step": 14320 + }, + { + "epoch": 374.640522875817, + "grad_norm": 0.31944742798805237, + "learning_rate": 2.8394808722060696e-05, + "loss": 0.1145, + "step": 14330 + }, + { + "epoch": 374.9019607843137, + "grad_norm": 0.30928805470466614, + "learning_rate": 2.8279486687789558e-05, + "loss": 0.1135, + "step": 14340 + }, + { + "epoch": 375.16339869281046, + "grad_norm": 0.3214901089668274, + "learning_rate": 2.8164360734663142e-05, + "loss": 0.111, + "step": 14350 + }, + { + "epoch": 375.42483660130716, + "grad_norm": 0.2930542230606079, + "learning_rate": 2.8049431177431486e-05, + "loss": 0.1139, + "step": 14360 + }, + { + "epoch": 375.6862745098039, + "grad_norm": 0.3097141683101654, + "learning_rate": 2.7934698330307518e-05, + "loss": 0.1127, + "step": 14370 + }, + { + "epoch": 375.9477124183006, + "grad_norm": 0.33316555619239807, + "learning_rate": 2.782016250696655e-05, + "loss": 0.1139, + "step": 14380 + }, + { + "epoch": 376.2091503267974, + "grad_norm": 0.3537690043449402, + "learning_rate": 2.77058240205451e-05, + "loss": 0.1126, + "step": 14390 + }, + { + "epoch": 376.47058823529414, + "grad_norm": 0.35879310965538025, + "learning_rate": 2.7591683183640215e-05, + "loss": 0.1145, + "step": 14400 + }, + { + "epoch": 376.73202614379085, + "grad_norm": 0.3203032314777374, + "learning_rate": 2.7477740308308618e-05, + "loss": 0.1093, + "step": 14410 + }, + { + "epoch": 376.9934640522876, + "grad_norm": 0.3569747507572174, + "learning_rate": 2.7363995706065737e-05, + "loss": 0.1143, + "step": 14420 + }, + { + "epoch": 377.2549019607843, + "grad_norm": 0.33088165521621704, + "learning_rate": 2.7250449687885028e-05, + "loss": 0.1112, + "step": 14430 + }, + { + "epoch": 377.51633986928107, + "grad_norm": 0.4784368872642517, + "learning_rate": 2.7137102564196937e-05, + "loss": 0.1136, + "step": 14440 + }, + { + "epoch": 377.77777777777777, + "grad_norm": 0.39264166355133057, + "learning_rate": 2.702395464488814e-05, + "loss": 0.115, + "step": 14450 + }, + { + "epoch": 378.03921568627453, + "grad_norm": 0.3187940716743469, + "learning_rate": 2.6911006239300794e-05, + "loss": 0.1105, + "step": 14460 + }, + { + "epoch": 378.30065359477123, + "grad_norm": 0.30897989869117737, + "learning_rate": 2.6798257656231464e-05, + "loss": 0.1127, + "step": 14470 + }, + { + "epoch": 378.562091503268, + "grad_norm": 0.40761929750442505, + "learning_rate": 2.668570920393052e-05, + "loss": 0.1139, + "step": 14480 + }, + { + "epoch": 378.8235294117647, + "grad_norm": 0.32026803493499756, + "learning_rate": 2.657336119010112e-05, + "loss": 0.1136, + "step": 14490 + }, + { + "epoch": 379.08496732026146, + "grad_norm": 0.30582597851753235, + "learning_rate": 2.646121392189841e-05, + "loss": 0.112, + "step": 14500 + }, + { + "epoch": 379.34640522875816, + "grad_norm": 0.2957599461078644, + "learning_rate": 2.6349267705928793e-05, + "loss": 0.1105, + "step": 14510 + }, + { + "epoch": 379.6078431372549, + "grad_norm": 0.2888094484806061, + "learning_rate": 2.623752284824893e-05, + "loss": 0.1119, + "step": 14520 + }, + { + "epoch": 379.8692810457516, + "grad_norm": 0.32123002409935, + "learning_rate": 2.6125979654364952e-05, + "loss": 0.1138, + "step": 14530 + }, + { + "epoch": 380.1307189542484, + "grad_norm": 0.2851218283176422, + "learning_rate": 2.601463842923175e-05, + "loss": 0.1113, + "step": 14540 + }, + { + "epoch": 380.3921568627451, + "grad_norm": 0.31959566473960876, + "learning_rate": 2.5903499477251936e-05, + "loss": 0.1136, + "step": 14550 + }, + { + "epoch": 380.65359477124184, + "grad_norm": 0.32848477363586426, + "learning_rate": 2.5792563102275213e-05, + "loss": 0.1107, + "step": 14560 + }, + { + "epoch": 380.91503267973854, + "grad_norm": 0.31065839529037476, + "learning_rate": 2.568182960759735e-05, + "loss": 0.1144, + "step": 14570 + }, + { + "epoch": 381.1764705882353, + "grad_norm": 0.3056698739528656, + "learning_rate": 2.5571299295959496e-05, + "loss": 0.1127, + "step": 14580 + }, + { + "epoch": 381.437908496732, + "grad_norm": 0.29230251908302307, + "learning_rate": 2.546097246954734e-05, + "loss": 0.1102, + "step": 14590 + }, + { + "epoch": 381.69934640522877, + "grad_norm": 0.3203261196613312, + "learning_rate": 2.5350849429990152e-05, + "loss": 0.113, + "step": 14600 + }, + { + "epoch": 381.96078431372547, + "grad_norm": 0.406775563955307, + "learning_rate": 2.524093047836018e-05, + "loss": 0.1152, + "step": 14610 + }, + { + "epoch": 382.22222222222223, + "grad_norm": 0.2875988781452179, + "learning_rate": 2.5131215915171624e-05, + "loss": 0.113, + "step": 14620 + }, + { + "epoch": 382.48366013071893, + "grad_norm": 0.29914501309394836, + "learning_rate": 2.5021706040379854e-05, + "loss": 0.1104, + "step": 14630 + }, + { + "epoch": 382.7450980392157, + "grad_norm": 0.3158552944660187, + "learning_rate": 2.4912401153380772e-05, + "loss": 0.1152, + "step": 14640 + }, + { + "epoch": 383.0065359477124, + "grad_norm": 0.45473095774650574, + "learning_rate": 2.4803301553009694e-05, + "loss": 0.112, + "step": 14650 + }, + { + "epoch": 383.26797385620915, + "grad_norm": 0.32687652111053467, + "learning_rate": 2.4694407537540808e-05, + "loss": 0.1119, + "step": 14660 + }, + { + "epoch": 383.52941176470586, + "grad_norm": 0.3493711054325104, + "learning_rate": 2.4585719404686192e-05, + "loss": 0.1152, + "step": 14670 + }, + { + "epoch": 383.7908496732026, + "grad_norm": 0.3303303122520447, + "learning_rate": 2.4477237451595004e-05, + "loss": 0.1105, + "step": 14680 + }, + { + "epoch": 384.0522875816994, + "grad_norm": 0.29483675956726074, + "learning_rate": 2.436896197485282e-05, + "loss": 0.1136, + "step": 14690 + }, + { + "epoch": 384.3137254901961, + "grad_norm": 0.28735411167144775, + "learning_rate": 2.42608932704806e-05, + "loss": 0.1104, + "step": 14700 + }, + { + "epoch": 384.57516339869284, + "grad_norm": 0.30859243869781494, + "learning_rate": 2.415303163393412e-05, + "loss": 0.1133, + "step": 14710 + }, + { + "epoch": 384.83660130718954, + "grad_norm": 0.32482388615608215, + "learning_rate": 2.404537736010295e-05, + "loss": 0.1131, + "step": 14720 + }, + { + "epoch": 385.0980392156863, + "grad_norm": 0.32856839895248413, + "learning_rate": 2.3937930743309723e-05, + "loss": 0.113, + "step": 14730 + }, + { + "epoch": 385.359477124183, + "grad_norm": 0.3382302224636078, + "learning_rate": 2.3830692077309446e-05, + "loss": 0.111, + "step": 14740 + }, + { + "epoch": 385.62091503267976, + "grad_norm": 0.4078975319862366, + "learning_rate": 2.3723661655288487e-05, + "loss": 0.1147, + "step": 14750 + }, + { + "epoch": 385.88235294117646, + "grad_norm": 0.32090675830841064, + "learning_rate": 2.3616839769863984e-05, + "loss": 0.1121, + "step": 14760 + }, + { + "epoch": 386.1437908496732, + "grad_norm": 0.3056178092956543, + "learning_rate": 2.351022671308287e-05, + "loss": 0.1116, + "step": 14770 + }, + { + "epoch": 386.4052287581699, + "grad_norm": 0.33630117774009705, + "learning_rate": 2.3403822776421135e-05, + "loss": 0.113, + "step": 14780 + }, + { + "epoch": 386.6666666666667, + "grad_norm": 0.3036194145679474, + "learning_rate": 2.3297628250783154e-05, + "loss": 0.1124, + "step": 14790 + }, + { + "epoch": 386.9281045751634, + "grad_norm": 0.35577064752578735, + "learning_rate": 2.3191643426500653e-05, + "loss": 0.113, + "step": 14800 + }, + { + "epoch": 387.18954248366015, + "grad_norm": 0.3300045430660248, + "learning_rate": 2.3085868593332073e-05, + "loss": 0.1126, + "step": 14810 + }, + { + "epoch": 387.45098039215685, + "grad_norm": 0.28813156485557556, + "learning_rate": 2.298030404046183e-05, + "loss": 0.1106, + "step": 14820 + }, + { + "epoch": 387.7124183006536, + "grad_norm": 0.40409666299819946, + "learning_rate": 2.2874950056499324e-05, + "loss": 0.118, + "step": 14830 + }, + { + "epoch": 387.9738562091503, + "grad_norm": 0.3458053171634674, + "learning_rate": 2.2769806929478377e-05, + "loss": 0.1093, + "step": 14840 + }, + { + "epoch": 388.2352941176471, + "grad_norm": 0.3175845742225647, + "learning_rate": 2.266487494685625e-05, + "loss": 0.1122, + "step": 14850 + }, + { + "epoch": 388.4967320261438, + "grad_norm": 0.31592434644699097, + "learning_rate": 2.2560154395512967e-05, + "loss": 0.115, + "step": 14860 + }, + { + "epoch": 388.75816993464053, + "grad_norm": 0.31762829422950745, + "learning_rate": 2.245564556175056e-05, + "loss": 0.1127, + "step": 14870 + }, + { + "epoch": 389.01960784313724, + "grad_norm": 0.27463746070861816, + "learning_rate": 2.235134873129213e-05, + "loss": 0.1095, + "step": 14880 + }, + { + "epoch": 389.281045751634, + "grad_norm": 0.2556680738925934, + "learning_rate": 2.2247264189281304e-05, + "loss": 0.1127, + "step": 14890 + }, + { + "epoch": 389.5424836601307, + "grad_norm": 0.3717378079891205, + "learning_rate": 2.214339222028119e-05, + "loss": 0.1127, + "step": 14900 + }, + { + "epoch": 389.80392156862746, + "grad_norm": 0.33163613080978394, + "learning_rate": 2.2039733108273774e-05, + "loss": 0.1129, + "step": 14910 + }, + { + "epoch": 390.06535947712416, + "grad_norm": 0.33852529525756836, + "learning_rate": 2.193628713665916e-05, + "loss": 0.1126, + "step": 14920 + }, + { + "epoch": 390.3267973856209, + "grad_norm": 0.38463908433914185, + "learning_rate": 2.183305458825464e-05, + "loss": 0.1107, + "step": 14930 + }, + { + "epoch": 390.5882352941176, + "grad_norm": 0.29137659072875977, + "learning_rate": 2.1730035745294098e-05, + "loss": 0.1135, + "step": 14940 + }, + { + "epoch": 390.8496732026144, + "grad_norm": 0.3932526111602783, + "learning_rate": 2.1627230889427096e-05, + "loss": 0.1138, + "step": 14950 + }, + { + "epoch": 391.1111111111111, + "grad_norm": 0.30873677134513855, + "learning_rate": 2.1524640301718167e-05, + "loss": 0.1129, + "step": 14960 + }, + { + "epoch": 391.37254901960785, + "grad_norm": 0.28704315423965454, + "learning_rate": 2.1422264262646097e-05, + "loss": 0.1134, + "step": 14970 + }, + { + "epoch": 391.63398692810455, + "grad_norm": 0.3116617798805237, + "learning_rate": 2.1320103052103024e-05, + "loss": 0.1113, + "step": 14980 + }, + { + "epoch": 391.8954248366013, + "grad_norm": 0.3618609607219696, + "learning_rate": 2.1218156949393853e-05, + "loss": 0.1132, + "step": 14990 + }, + { + "epoch": 392.15686274509807, + "grad_norm": 0.30935177206993103, + "learning_rate": 2.111642623323531e-05, + "loss": 0.112, + "step": 15000 + }, + { + "epoch": 392.41830065359477, + "grad_norm": 0.32789334654808044, + "learning_rate": 2.1014911181755247e-05, + "loss": 0.1127, + "step": 15010 + }, + { + "epoch": 392.67973856209153, + "grad_norm": 0.3063357472419739, + "learning_rate": 2.0913612072492006e-05, + "loss": 0.1144, + "step": 15020 + }, + { + "epoch": 392.94117647058823, + "grad_norm": 0.3251248896121979, + "learning_rate": 2.0812529182393424e-05, + "loss": 0.1107, + "step": 15030 + }, + { + "epoch": 393.202614379085, + "grad_norm": 0.31256797909736633, + "learning_rate": 2.071166278781632e-05, + "loss": 0.11, + "step": 15040 + }, + { + "epoch": 393.4640522875817, + "grad_norm": 0.29990172386169434, + "learning_rate": 2.061101316452554e-05, + "loss": 0.1126, + "step": 15050 + }, + { + "epoch": 393.72549019607845, + "grad_norm": 0.34786394238471985, + "learning_rate": 2.0510580587693273e-05, + "loss": 0.1133, + "step": 15060 + }, + { + "epoch": 393.98692810457516, + "grad_norm": 0.33727478981018066, + "learning_rate": 2.0410365331898416e-05, + "loss": 0.1145, + "step": 15070 + }, + { + "epoch": 394.2483660130719, + "grad_norm": 0.31517842411994934, + "learning_rate": 2.0310367671125618e-05, + "loss": 0.1113, + "step": 15080 + }, + { + "epoch": 394.5098039215686, + "grad_norm": 0.36189743876457214, + "learning_rate": 2.021058787876464e-05, + "loss": 0.1151, + "step": 15090 + }, + { + "epoch": 394.7712418300654, + "grad_norm": 0.3063863515853882, + "learning_rate": 2.0111026227609675e-05, + "loss": 0.1124, + "step": 15100 + }, + { + "epoch": 395.0326797385621, + "grad_norm": 0.3760160207748413, + "learning_rate": 2.0011682989858427e-05, + "loss": 0.1107, + "step": 15110 + }, + { + "epoch": 395.29411764705884, + "grad_norm": 0.31587451696395874, + "learning_rate": 1.991255843711156e-05, + "loss": 0.1103, + "step": 15120 + }, + { + "epoch": 395.55555555555554, + "grad_norm": 0.3024122416973114, + "learning_rate": 1.98136528403718e-05, + "loss": 0.1093, + "step": 15130 + }, + { + "epoch": 395.8169934640523, + "grad_norm": 0.32457295060157776, + "learning_rate": 1.971496647004324e-05, + "loss": 0.1158, + "step": 15140 + }, + { + "epoch": 396.078431372549, + "grad_norm": 0.3319365978240967, + "learning_rate": 1.9616499595930692e-05, + "loss": 0.1151, + "step": 15150 + }, + { + "epoch": 396.33986928104576, + "grad_norm": 0.33728495240211487, + "learning_rate": 1.9518252487238797e-05, + "loss": 0.1118, + "step": 15160 + }, + { + "epoch": 396.60130718954247, + "grad_norm": 0.34540119767189026, + "learning_rate": 1.9420225412571435e-05, + "loss": 0.1121, + "step": 15170 + }, + { + "epoch": 396.8627450980392, + "grad_norm": 0.30559542775154114, + "learning_rate": 1.9322418639930863e-05, + "loss": 0.1125, + "step": 15180 + }, + { + "epoch": 397.12418300653593, + "grad_norm": 0.3552384376525879, + "learning_rate": 1.9224832436717045e-05, + "loss": 0.1133, + "step": 15190 + }, + { + "epoch": 397.3856209150327, + "grad_norm": 0.38219109177589417, + "learning_rate": 1.912746706972697e-05, + "loss": 0.1122, + "step": 15200 + }, + { + "epoch": 397.6470588235294, + "grad_norm": 0.38125985860824585, + "learning_rate": 1.90303228051538e-05, + "loss": 0.112, + "step": 15210 + }, + { + "epoch": 397.90849673202615, + "grad_norm": 0.3336564600467682, + "learning_rate": 1.89333999085863e-05, + "loss": 0.1127, + "step": 15220 + }, + { + "epoch": 398.16993464052285, + "grad_norm": 0.29126253724098206, + "learning_rate": 1.8836698645007888e-05, + "loss": 0.1115, + "step": 15230 + }, + { + "epoch": 398.4313725490196, + "grad_norm": 0.4003722667694092, + "learning_rate": 1.8740219278796167e-05, + "loss": 0.1117, + "step": 15240 + }, + { + "epoch": 398.6928104575163, + "grad_norm": 0.348296582698822, + "learning_rate": 1.8643962073722064e-05, + "loss": 0.1133, + "step": 15250 + }, + { + "epoch": 398.9542483660131, + "grad_norm": 0.37177759408950806, + "learning_rate": 1.854792729294905e-05, + "loss": 0.1121, + "step": 15260 + }, + { + "epoch": 399.2156862745098, + "grad_norm": 0.3493458032608032, + "learning_rate": 1.8452115199032638e-05, + "loss": 0.1116, + "step": 15270 + }, + { + "epoch": 399.47712418300654, + "grad_norm": 0.30324786901474, + "learning_rate": 1.835652605391931e-05, + "loss": 0.1116, + "step": 15280 + }, + { + "epoch": 399.73856209150324, + "grad_norm": 0.2836083471775055, + "learning_rate": 1.826116011894621e-05, + "loss": 0.111, + "step": 15290 + }, + { + "epoch": 400.0, + "grad_norm": 0.3711930513381958, + "learning_rate": 1.8166017654840184e-05, + "loss": 0.115, + "step": 15300 + }, + { + "epoch": 400.26143790849676, + "grad_norm": 0.3117862343788147, + "learning_rate": 1.8071098921717033e-05, + "loss": 0.1129, + "step": 15310 + }, + { + "epoch": 400.52287581699346, + "grad_norm": 0.3351728916168213, + "learning_rate": 1.797640417908104e-05, + "loss": 0.1117, + "step": 15320 + }, + { + "epoch": 400.7843137254902, + "grad_norm": 0.36364924907684326, + "learning_rate": 1.7881933685823905e-05, + "loss": 0.1146, + "step": 15330 + }, + { + "epoch": 401.0457516339869, + "grad_norm": 0.30129241943359375, + "learning_rate": 1.7787687700224397e-05, + "loss": 0.1101, + "step": 15340 + }, + { + "epoch": 401.3071895424837, + "grad_norm": 0.29976898431777954, + "learning_rate": 1.769366647994748e-05, + "loss": 0.1131, + "step": 15350 + }, + { + "epoch": 401.5686274509804, + "grad_norm": 0.28711673617362976, + "learning_rate": 1.7599870282043552e-05, + "loss": 0.1131, + "step": 15360 + }, + { + "epoch": 401.83006535947715, + "grad_norm": 0.32417982816696167, + "learning_rate": 1.750629936294782e-05, + "loss": 0.112, + "step": 15370 + }, + { + "epoch": 402.09150326797385, + "grad_norm": 0.2811816930770874, + "learning_rate": 1.7412953978479595e-05, + "loss": 0.1132, + "step": 15380 + }, + { + "epoch": 402.3529411764706, + "grad_norm": 0.3149554431438446, + "learning_rate": 1.7319834383841616e-05, + "loss": 0.11, + "step": 15390 + }, + { + "epoch": 402.6143790849673, + "grad_norm": 0.3484453856945038, + "learning_rate": 1.7226940833619322e-05, + "loss": 0.111, + "step": 15400 + }, + { + "epoch": 402.87581699346407, + "grad_norm": 0.3713397681713104, + "learning_rate": 1.7134273581780113e-05, + "loss": 0.1151, + "step": 15410 + }, + { + "epoch": 403.1372549019608, + "grad_norm": 0.36278796195983887, + "learning_rate": 1.7041832881672703e-05, + "loss": 0.1136, + "step": 15420 + }, + { + "epoch": 403.39869281045753, + "grad_norm": 0.29296210408210754, + "learning_rate": 1.6949618986026416e-05, + "loss": 0.1099, + "step": 15430 + }, + { + "epoch": 403.66013071895424, + "grad_norm": 0.32640260457992554, + "learning_rate": 1.6857632146950564e-05, + "loss": 0.1118, + "step": 15440 + }, + { + "epoch": 403.921568627451, + "grad_norm": 0.3738747537136078, + "learning_rate": 1.6765872615933677e-05, + "loss": 0.1151, + "step": 15450 + }, + { + "epoch": 404.1830065359477, + "grad_norm": 0.3174304962158203, + "learning_rate": 1.6674340643842733e-05, + "loss": 0.1092, + "step": 15460 + }, + { + "epoch": 404.44444444444446, + "grad_norm": 0.3067731261253357, + "learning_rate": 1.6583036480922697e-05, + "loss": 0.1133, + "step": 15470 + }, + { + "epoch": 404.70588235294116, + "grad_norm": 0.3003155589103699, + "learning_rate": 1.6491960376795635e-05, + "loss": 0.1139, + "step": 15480 + }, + { + "epoch": 404.9673202614379, + "grad_norm": 0.3668583631515503, + "learning_rate": 1.6401112580460167e-05, + "loss": 0.1122, + "step": 15490 + }, + { + "epoch": 405.2287581699346, + "grad_norm": 0.3048345148563385, + "learning_rate": 1.6310493340290723e-05, + "loss": 0.1122, + "step": 15500 + }, + { + "epoch": 405.4901960784314, + "grad_norm": 0.308072566986084, + "learning_rate": 1.622010290403677e-05, + "loss": 0.1172, + "step": 15510 + }, + { + "epoch": 405.7516339869281, + "grad_norm": 0.3233634829521179, + "learning_rate": 1.6129941518822366e-05, + "loss": 0.1103, + "step": 15520 + }, + { + "epoch": 406.01307189542484, + "grad_norm": 0.34668582677841187, + "learning_rate": 1.6040009431145266e-05, + "loss": 0.1088, + "step": 15530 + }, + { + "epoch": 406.27450980392155, + "grad_norm": 0.4414779841899872, + "learning_rate": 1.5950306886876366e-05, + "loss": 0.1117, + "step": 15540 + }, + { + "epoch": 406.5359477124183, + "grad_norm": 0.3160878121852875, + "learning_rate": 1.586083413125906e-05, + "loss": 0.1113, + "step": 15550 + }, + { + "epoch": 406.797385620915, + "grad_norm": 0.3153967261314392, + "learning_rate": 1.577159140890835e-05, + "loss": 0.1136, + "step": 15560 + }, + { + "epoch": 407.05882352941177, + "grad_norm": 0.29611125588417053, + "learning_rate": 1.568257896381049e-05, + "loss": 0.1119, + "step": 15570 + }, + { + "epoch": 407.32026143790847, + "grad_norm": 0.3124426603317261, + "learning_rate": 1.5593797039322076e-05, + "loss": 0.1136, + "step": 15580 + }, + { + "epoch": 407.58169934640523, + "grad_norm": 0.29556897282600403, + "learning_rate": 1.5505245878169528e-05, + "loss": 0.1133, + "step": 15590 + }, + { + "epoch": 407.84313725490193, + "grad_norm": 0.3371742069721222, + "learning_rate": 1.541692572244833e-05, + "loss": 0.1084, + "step": 15600 + }, + { + "epoch": 408.1045751633987, + "grad_norm": 0.3488721549510956, + "learning_rate": 1.5328836813622393e-05, + "loss": 0.1134, + "step": 15610 + }, + { + "epoch": 408.36601307189545, + "grad_norm": 0.31725990772247314, + "learning_rate": 1.5240979392523458e-05, + "loss": 0.113, + "step": 15620 + }, + { + "epoch": 408.62745098039215, + "grad_norm": 0.37460392713546753, + "learning_rate": 1.5153353699350337e-05, + "loss": 0.1115, + "step": 15630 + }, + { + "epoch": 408.8888888888889, + "grad_norm": 0.3097977936267853, + "learning_rate": 1.5065959973668353e-05, + "loss": 0.1125, + "step": 15640 + }, + { + "epoch": 409.1503267973856, + "grad_norm": 0.27604126930236816, + "learning_rate": 1.4978798454408605e-05, + "loss": 0.1116, + "step": 15650 + }, + { + "epoch": 409.4117647058824, + "grad_norm": 0.30955255031585693, + "learning_rate": 1.489186937986734e-05, + "loss": 0.11, + "step": 15660 + }, + { + "epoch": 409.6732026143791, + "grad_norm": 0.310028076171875, + "learning_rate": 1.4805172987705362e-05, + "loss": 0.1135, + "step": 15670 + }, + { + "epoch": 409.93464052287584, + "grad_norm": 0.3490021824836731, + "learning_rate": 1.471870951494726e-05, + "loss": 0.1142, + "step": 15680 + }, + { + "epoch": 410.19607843137254, + "grad_norm": 0.40562987327575684, + "learning_rate": 1.4641091730943024e-05, + "loss": 0.1104, + "step": 15690 + }, + { + "epoch": 410.4575163398693, + "grad_norm": 0.33675485849380493, + "learning_rate": 1.4555071455773993e-05, + "loss": 0.1106, + "step": 15700 + }, + { + "epoch": 410.718954248366, + "grad_norm": 0.3096622824668884, + "learning_rate": 1.4469284783776893e-05, + "loss": 0.1133, + "step": 15710 + }, + { + "epoch": 410.98039215686276, + "grad_norm": 0.3052927851676941, + "learning_rate": 1.43837319494892e-05, + "loss": 0.1142, + "step": 15720 + }, + { + "epoch": 411.24183006535947, + "grad_norm": 0.3547520935535431, + "learning_rate": 1.4298413186809123e-05, + "loss": 0.1137, + "step": 15730 + }, + { + "epoch": 411.5032679738562, + "grad_norm": 0.3303002417087555, + "learning_rate": 1.4213328728994857e-05, + "loss": 0.1126, + "step": 15740 + }, + { + "epoch": 411.7647058823529, + "grad_norm": 0.3191388249397278, + "learning_rate": 1.4128478808664125e-05, + "loss": 0.1101, + "step": 15750 + }, + { + "epoch": 412.0261437908497, + "grad_norm": 0.29005515575408936, + "learning_rate": 1.4043863657793332e-05, + "loss": 0.1122, + "step": 15760 + }, + { + "epoch": 412.2875816993464, + "grad_norm": 0.33250725269317627, + "learning_rate": 1.3959483507717042e-05, + "loss": 0.1104, + "step": 15770 + }, + { + "epoch": 412.54901960784315, + "grad_norm": 0.2780681252479553, + "learning_rate": 1.3875338589127418e-05, + "loss": 0.1114, + "step": 15780 + }, + { + "epoch": 412.81045751633985, + "grad_norm": 0.35173699259757996, + "learning_rate": 1.3791429132073408e-05, + "loss": 0.1113, + "step": 15790 + }, + { + "epoch": 413.0718954248366, + "grad_norm": 0.3443647027015686, + "learning_rate": 1.3707755365960317e-05, + "loss": 0.1156, + "step": 15800 + }, + { + "epoch": 413.3333333333333, + "grad_norm": 0.2871156930923462, + "learning_rate": 1.3624317519548979e-05, + "loss": 0.1131, + "step": 15810 + }, + { + "epoch": 413.5947712418301, + "grad_norm": 0.26214319467544556, + "learning_rate": 1.3541115820955285e-05, + "loss": 0.1083, + "step": 15820 + }, + { + "epoch": 413.8562091503268, + "grad_norm": 0.3014472723007202, + "learning_rate": 1.3458150497649525e-05, + "loss": 0.1142, + "step": 15830 + }, + { + "epoch": 414.11764705882354, + "grad_norm": 0.3216531276702881, + "learning_rate": 1.3375421776455699e-05, + "loss": 0.1124, + "step": 15840 + }, + { + "epoch": 414.37908496732024, + "grad_norm": 0.3679593503475189, + "learning_rate": 1.3292929883550998e-05, + "loss": 0.1134, + "step": 15850 + }, + { + "epoch": 414.640522875817, + "grad_norm": 0.2999366819858551, + "learning_rate": 1.3210675044465103e-05, + "loss": 0.1124, + "step": 15860 + }, + { + "epoch": 414.9019607843137, + "grad_norm": 0.34971389174461365, + "learning_rate": 1.3128657484079566e-05, + "loss": 0.1131, + "step": 15870 + }, + { + "epoch": 415.16339869281046, + "grad_norm": 0.3751276433467865, + "learning_rate": 1.3046877426627313e-05, + "loss": 0.1118, + "step": 15880 + }, + { + "epoch": 415.42483660130716, + "grad_norm": 0.32183781266212463, + "learning_rate": 1.2965335095691889e-05, + "loss": 0.1129, + "step": 15890 + }, + { + "epoch": 415.6862745098039, + "grad_norm": 0.35736966133117676, + "learning_rate": 1.2884030714206874e-05, + "loss": 0.1109, + "step": 15900 + }, + { + "epoch": 415.9477124183006, + "grad_norm": 0.3015005588531494, + "learning_rate": 1.2802964504455395e-05, + "loss": 0.1124, + "step": 15910 + }, + { + "epoch": 416.2091503267974, + "grad_norm": 0.34793102741241455, + "learning_rate": 1.272213668806933e-05, + "loss": 0.1146, + "step": 15920 + }, + { + "epoch": 416.47058823529414, + "grad_norm": 0.3514239490032196, + "learning_rate": 1.2641547486028882e-05, + "loss": 0.1106, + "step": 15930 + }, + { + "epoch": 416.73202614379085, + "grad_norm": 0.31007125973701477, + "learning_rate": 1.2561197118661828e-05, + "loss": 0.1102, + "step": 15940 + }, + { + "epoch": 416.9934640522876, + "grad_norm": 0.31926625967025757, + "learning_rate": 1.2481085805643e-05, + "loss": 0.1129, + "step": 15950 + }, + { + "epoch": 417.2549019607843, + "grad_norm": 0.3109895884990692, + "learning_rate": 1.2401213765993691e-05, + "loss": 0.1124, + "step": 15960 + }, + { + "epoch": 417.51633986928107, + "grad_norm": 0.33066773414611816, + "learning_rate": 1.2321581218080979e-05, + "loss": 0.112, + "step": 15970 + }, + { + "epoch": 417.77777777777777, + "grad_norm": 0.4178493618965149, + "learning_rate": 1.2242188379617236e-05, + "loss": 0.1121, + "step": 15980 + }, + { + "epoch": 418.03921568627453, + "grad_norm": 0.3649527430534363, + "learning_rate": 1.2163035467659444e-05, + "loss": 0.1127, + "step": 15990 + }, + { + "epoch": 418.30065359477123, + "grad_norm": 0.29703032970428467, + "learning_rate": 1.2084122698608625e-05, + "loss": 0.1118, + "step": 16000 + }, + { + "epoch": 418.562091503268, + "grad_norm": 0.31829434633255005, + "learning_rate": 1.2005450288209297e-05, + "loss": 0.1123, + "step": 16010 + }, + { + "epoch": 418.8235294117647, + "grad_norm": 0.3175608217716217, + "learning_rate": 1.1927018451548811e-05, + "loss": 0.1096, + "step": 16020 + }, + { + "epoch": 419.08496732026146, + "grad_norm": 0.30831390619277954, + "learning_rate": 1.1848827403056828e-05, + "loss": 0.1153, + "step": 16030 + }, + { + "epoch": 419.34640522875816, + "grad_norm": 0.3015325665473938, + "learning_rate": 1.1770877356504683e-05, + "loss": 0.1087, + "step": 16040 + }, + { + "epoch": 419.6078431372549, + "grad_norm": 0.362508624792099, + "learning_rate": 1.1693168525004805e-05, + "loss": 0.113, + "step": 16050 + }, + { + "epoch": 419.8692810457516, + "grad_norm": 0.36422449350357056, + "learning_rate": 1.1615701121010214e-05, + "loss": 0.1133, + "step": 16060 + }, + { + "epoch": 420.1307189542484, + "grad_norm": 0.27966681122779846, + "learning_rate": 1.1538475356313794e-05, + "loss": 0.1118, + "step": 16070 + }, + { + "epoch": 420.3921568627451, + "grad_norm": 0.362490713596344, + "learning_rate": 1.1461491442047878e-05, + "loss": 0.1122, + "step": 16080 + }, + { + "epoch": 420.65359477124184, + "grad_norm": 0.3337494432926178, + "learning_rate": 1.138474958868352e-05, + "loss": 0.11, + "step": 16090 + }, + { + "epoch": 420.91503267973854, + "grad_norm": 0.3513115346431732, + "learning_rate": 1.1308250006029997e-05, + "loss": 0.1138, + "step": 16100 + }, + { + "epoch": 421.1764705882353, + "grad_norm": 0.28625595569610596, + "learning_rate": 1.123199290323429e-05, + "loss": 0.1115, + "step": 16110 + }, + { + "epoch": 421.437908496732, + "grad_norm": 0.3301337957382202, + "learning_rate": 1.1155978488780384e-05, + "loss": 0.1113, + "step": 16120 + }, + { + "epoch": 421.69934640522877, + "grad_norm": 0.42644599080085754, + "learning_rate": 1.1080206970488793e-05, + "loss": 0.1157, + "step": 16130 + }, + { + "epoch": 421.96078431372547, + "grad_norm": 0.35698893666267395, + "learning_rate": 1.1004678555515957e-05, + "loss": 0.1112, + "step": 16140 + }, + { + "epoch": 422.22222222222223, + "grad_norm": 0.37550118565559387, + "learning_rate": 1.0929393450353654e-05, + "loss": 0.1127, + "step": 16150 + }, + { + "epoch": 422.48366013071893, + "grad_norm": 0.31585901975631714, + "learning_rate": 1.0854351860828527e-05, + "loss": 0.1109, + "step": 16160 + }, + { + "epoch": 422.7450980392157, + "grad_norm": 0.3062879741191864, + "learning_rate": 1.0779553992101387e-05, + "loss": 0.1123, + "step": 16170 + }, + { + "epoch": 423.0065359477124, + "grad_norm": 0.38038599491119385, + "learning_rate": 1.0705000048666735e-05, + "loss": 0.1117, + "step": 16180 + }, + { + "epoch": 423.26797385620915, + "grad_norm": 0.30192792415618896, + "learning_rate": 1.0630690234352259e-05, + "loss": 0.1096, + "step": 16190 + }, + { + "epoch": 423.52941176470586, + "grad_norm": 0.2793295383453369, + "learning_rate": 1.0556624752318101e-05, + "loss": 0.1139, + "step": 16200 + }, + { + "epoch": 423.7908496732026, + "grad_norm": 0.35322046279907227, + "learning_rate": 1.0482803805056507e-05, + "loss": 0.1133, + "step": 16210 + }, + { + "epoch": 424.0522875816994, + "grad_norm": 0.33232808113098145, + "learning_rate": 1.0409227594391102e-05, + "loss": 0.1096, + "step": 16220 + }, + { + "epoch": 424.3137254901961, + "grad_norm": 0.2976301908493042, + "learning_rate": 1.0335896321476413e-05, + "loss": 0.1115, + "step": 16230 + }, + { + "epoch": 424.57516339869284, + "grad_norm": 0.35135358572006226, + "learning_rate": 1.0262810186797389e-05, + "loss": 0.1116, + "step": 16240 + }, + { + "epoch": 424.83660130718954, + "grad_norm": 0.31133124232292175, + "learning_rate": 1.0189969390168696e-05, + "loss": 0.1139, + "step": 16250 + }, + { + "epoch": 425.0980392156863, + "grad_norm": 0.3206573724746704, + "learning_rate": 1.0117374130734314e-05, + "loss": 0.1106, + "step": 16260 + }, + { + "epoch": 425.359477124183, + "grad_norm": 0.29747602343559265, + "learning_rate": 1.0045024606966902e-05, + "loss": 0.1112, + "step": 16270 + }, + { + "epoch": 425.62091503267976, + "grad_norm": 0.3697657585144043, + "learning_rate": 9.972921016667269e-06, + "loss": 0.1128, + "step": 16280 + }, + { + "epoch": 425.88235294117646, + "grad_norm": 0.3043394088745117, + "learning_rate": 9.90106355696393e-06, + "loss": 0.1128, + "step": 16290 + }, + { + "epoch": 426.1437908496732, + "grad_norm": 0.34919267892837524, + "learning_rate": 9.82945242431238e-06, + "loss": 0.111, + "step": 16300 + }, + { + "epoch": 426.4052287581699, + "grad_norm": 0.3754013478755951, + "learning_rate": 9.758087814494764e-06, + "loss": 0.1133, + "step": 16310 + }, + { + "epoch": 426.6666666666667, + "grad_norm": 0.2977403402328491, + "learning_rate": 9.686969922619193e-06, + "loss": 0.1112, + "step": 16320 + }, + { + "epoch": 426.9281045751634, + "grad_norm": 0.27013343572616577, + "learning_rate": 9.616098943119234e-06, + "loss": 0.1104, + "step": 16330 + }, + { + "epoch": 427.18954248366015, + "grad_norm": 0.35326704382896423, + "learning_rate": 9.545475069753484e-06, + "loss": 0.1113, + "step": 16340 + }, + { + "epoch": 427.45098039215685, + "grad_norm": 0.39166802167892456, + "learning_rate": 9.475098495604884e-06, + "loss": 0.1082, + "step": 16350 + }, + { + "epoch": 427.7124183006536, + "grad_norm": 0.3371388614177704, + "learning_rate": 9.404969413080322e-06, + "loss": 0.1146, + "step": 16360 + }, + { + "epoch": 427.9738562091503, + "grad_norm": 0.3773285448551178, + "learning_rate": 9.335088013910021e-06, + "loss": 0.116, + "step": 16370 + }, + { + "epoch": 428.2352941176471, + "grad_norm": 0.31114351749420166, + "learning_rate": 9.265454489147052e-06, + "loss": 0.1115, + "step": 16380 + }, + { + "epoch": 428.4967320261438, + "grad_norm": 0.31893211603164673, + "learning_rate": 9.196069029166831e-06, + "loss": 0.1146, + "step": 16390 + }, + { + "epoch": 428.75816993464053, + "grad_norm": 0.3122820556163788, + "learning_rate": 9.126931823666517e-06, + "loss": 0.1105, + "step": 16400 + }, + { + "epoch": 429.01960784313724, + "grad_norm": 0.41064098477363586, + "learning_rate": 9.058043061664655e-06, + "loss": 0.1125, + "step": 16410 + }, + { + "epoch": 429.281045751634, + "grad_norm": 0.30506137013435364, + "learning_rate": 8.989402931500434e-06, + "loss": 0.1094, + "step": 16420 + }, + { + "epoch": 429.5424836601307, + "grad_norm": 0.298270583152771, + "learning_rate": 8.921011620833364e-06, + "loss": 0.1134, + "step": 16430 + }, + { + "epoch": 429.80392156862746, + "grad_norm": 0.3039408326148987, + "learning_rate": 8.852869316642688e-06, + "loss": 0.1129, + "step": 16440 + }, + { + "epoch": 430.06535947712416, + "grad_norm": 0.33438700437545776, + "learning_rate": 8.78497620522687e-06, + "loss": 0.1124, + "step": 16450 + }, + { + "epoch": 430.3267973856209, + "grad_norm": 0.3313377797603607, + "learning_rate": 8.717332472203033e-06, + "loss": 0.1095, + "step": 16460 + }, + { + "epoch": 430.5882352941176, + "grad_norm": 0.3241206705570221, + "learning_rate": 8.649938302506633e-06, + "loss": 0.1133, + "step": 16470 + }, + { + "epoch": 430.8496732026144, + "grad_norm": 0.3204852342605591, + "learning_rate": 8.582793880390693e-06, + "loss": 0.1111, + "step": 16480 + }, + { + "epoch": 431.1111111111111, + "grad_norm": 0.315046489238739, + "learning_rate": 8.515899389425542e-06, + "loss": 0.1135, + "step": 16490 + }, + { + "epoch": 431.37254901960785, + "grad_norm": 0.3156817555427551, + "learning_rate": 8.449255012498148e-06, + "loss": 0.1084, + "step": 16500 + }, + { + "epoch": 431.63398692810455, + "grad_norm": 0.33890920877456665, + "learning_rate": 8.382860931811687e-06, + "loss": 0.113, + "step": 16510 + }, + { + "epoch": 431.8954248366013, + "grad_norm": 0.3318261206150055, + "learning_rate": 8.31671732888506e-06, + "loss": 0.1128, + "step": 16520 + }, + { + "epoch": 432.15686274509807, + "grad_norm": 0.3495106101036072, + "learning_rate": 8.250824384552314e-06, + "loss": 0.1134, + "step": 16530 + }, + { + "epoch": 432.41830065359477, + "grad_norm": 0.3283674120903015, + "learning_rate": 8.185182278962288e-06, + "loss": 0.112, + "step": 16540 + }, + { + "epoch": 432.67973856209153, + "grad_norm": 0.3081677556037903, + "learning_rate": 8.119791191577975e-06, + "loss": 0.112, + "step": 16550 + }, + { + "epoch": 432.94117647058823, + "grad_norm": 0.3371969759464264, + "learning_rate": 8.054651301176087e-06, + "loss": 0.1108, + "step": 16560 + }, + { + "epoch": 433.202614379085, + "grad_norm": 0.3539562225341797, + "learning_rate": 7.989762785846633e-06, + "loss": 0.1116, + "step": 16570 + }, + { + "epoch": 433.4640522875817, + "grad_norm": 0.36196842789649963, + "learning_rate": 7.925125822992307e-06, + "loss": 0.1129, + "step": 16580 + }, + { + "epoch": 433.72549019607845, + "grad_norm": 0.3191837966442108, + "learning_rate": 7.860740589328142e-06, + "loss": 0.1118, + "step": 16590 + }, + { + "epoch": 433.98692810457516, + "grad_norm": 0.31484946608543396, + "learning_rate": 7.796607260880839e-06, + "loss": 0.1124, + "step": 16600 + }, + { + "epoch": 434.2483660130719, + "grad_norm": 0.33817651867866516, + "learning_rate": 7.73272601298851e-06, + "loss": 0.1107, + "step": 16610 + }, + { + "epoch": 434.5098039215686, + "grad_norm": 0.3488980531692505, + "learning_rate": 7.669097020300064e-06, + "loss": 0.1137, + "step": 16620 + }, + { + "epoch": 434.7712418300654, + "grad_norm": 0.378192275762558, + "learning_rate": 7.605720456774701e-06, + "loss": 0.1099, + "step": 16630 + }, + { + "epoch": 435.0326797385621, + "grad_norm": 0.4347194731235504, + "learning_rate": 7.542596495681575e-06, + "loss": 0.1138, + "step": 16640 + }, + { + "epoch": 435.29411764705884, + "grad_norm": 0.314876526594162, + "learning_rate": 7.479725309599117e-06, + "loss": 0.1113, + "step": 16650 + }, + { + "epoch": 435.55555555555554, + "grad_norm": 0.3719530999660492, + "learning_rate": 7.417107070414786e-06, + "loss": 0.1121, + "step": 16660 + }, + { + "epoch": 435.8169934640523, + "grad_norm": 0.31676945090293884, + "learning_rate": 7.354741949324473e-06, + "loss": 0.1122, + "step": 16670 + }, + { + "epoch": 436.078431372549, + "grad_norm": 0.3606952726840973, + "learning_rate": 7.292630116832011e-06, + "loss": 0.1109, + "step": 16680 + }, + { + "epoch": 436.33986928104576, + "grad_norm": 0.36602726578712463, + "learning_rate": 7.23077174274881e-06, + "loss": 0.1135, + "step": 16690 + }, + { + "epoch": 436.60130718954247, + "grad_norm": 0.359847754240036, + "learning_rate": 7.169166996193255e-06, + "loss": 0.108, + "step": 16700 + }, + { + "epoch": 436.8627450980392, + "grad_norm": 0.3841437101364136, + "learning_rate": 7.1078160455903875e-06, + "loss": 0.1148, + "step": 16710 + }, + { + "epoch": 437.12418300653593, + "grad_norm": 0.3049101233482361, + "learning_rate": 7.0467190586713915e-06, + "loss": 0.11, + "step": 16720 + }, + { + "epoch": 437.3856209150327, + "grad_norm": 0.32839474081993103, + "learning_rate": 6.985876202473085e-06, + "loss": 0.1117, + "step": 16730 + }, + { + "epoch": 437.6470588235294, + "grad_norm": 0.3973640501499176, + "learning_rate": 6.925287643337497e-06, + "loss": 0.1119, + "step": 16740 + }, + { + "epoch": 437.90849673202615, + "grad_norm": 0.404038667678833, + "learning_rate": 6.864953546911424e-06, + "loss": 0.1126, + "step": 16750 + }, + { + "epoch": 438.16993464052285, + "grad_norm": 0.44833749532699585, + "learning_rate": 6.8048740781460065e-06, + "loss": 0.1101, + "step": 16760 + }, + { + "epoch": 438.4313725490196, + "grad_norm": 0.35082849860191345, + "learning_rate": 6.7450494012962326e-06, + "loss": 0.1105, + "step": 16770 + }, + { + "epoch": 438.6928104575163, + "grad_norm": 0.3201982080936432, + "learning_rate": 6.685479679920459e-06, + "loss": 0.1132, + "step": 16780 + }, + { + "epoch": 438.9542483660131, + "grad_norm": 0.3554164171218872, + "learning_rate": 6.626165076880031e-06, + "loss": 0.1115, + "step": 16790 + }, + { + "epoch": 439.2156862745098, + "grad_norm": 0.4029858410358429, + "learning_rate": 6.5671057543387985e-06, + "loss": 0.1135, + "step": 16800 + }, + { + "epoch": 439.47712418300654, + "grad_norm": 0.3851124346256256, + "learning_rate": 6.508301873762712e-06, + "loss": 0.1112, + "step": 16810 + }, + { + "epoch": 439.73856209150324, + "grad_norm": 0.2964167594909668, + "learning_rate": 6.449753595919361e-06, + "loss": 0.1131, + "step": 16820 + }, + { + "epoch": 440.0, + "grad_norm": 0.3902610242366791, + "learning_rate": 6.391461080877436e-06, + "loss": 0.1112, + "step": 16830 + }, + { + "epoch": 440.26143790849676, + "grad_norm": 0.3522237241268158, + "learning_rate": 6.333424488006501e-06, + "loss": 0.1104, + "step": 16840 + }, + { + "epoch": 440.52287581699346, + "grad_norm": 0.3239668309688568, + "learning_rate": 6.275643975976353e-06, + "loss": 0.1129, + "step": 16850 + }, + { + "epoch": 440.7843137254902, + "grad_norm": 0.3547097444534302, + "learning_rate": 6.218119702756708e-06, + "loss": 0.1116, + "step": 16860 + }, + { + "epoch": 441.0457516339869, + "grad_norm": 0.29542434215545654, + "learning_rate": 6.160851825616787e-06, + "loss": 0.1101, + "step": 16870 + }, + { + "epoch": 441.3071895424837, + "grad_norm": 0.3131840229034424, + "learning_rate": 6.103840501124702e-06, + "loss": 0.1116, + "step": 16880 + }, + { + "epoch": 441.5686274509804, + "grad_norm": 0.32945704460144043, + "learning_rate": 6.047085885147286e-06, + "loss": 0.1125, + "step": 16890 + }, + { + "epoch": 441.83006535947715, + "grad_norm": 0.3166607618331909, + "learning_rate": 5.990588132849462e-06, + "loss": 0.1084, + "step": 16900 + }, + { + "epoch": 442.09150326797385, + "grad_norm": 0.4009234607219696, + "learning_rate": 5.9343473986939405e-06, + "loss": 0.1174, + "step": 16910 + }, + { + "epoch": 442.3529411764706, + "grad_norm": 0.3329063653945923, + "learning_rate": 5.87836383644077e-06, + "loss": 0.1099, + "step": 16920 + }, + { + "epoch": 442.6143790849673, + "grad_norm": 0.3919806480407715, + "learning_rate": 5.8226375991468294e-06, + "loss": 0.1101, + "step": 16930 + }, + { + "epoch": 442.87581699346407, + "grad_norm": 0.35498499870300293, + "learning_rate": 5.767168839165538e-06, + "loss": 0.1122, + "step": 16940 + }, + { + "epoch": 443.1372549019608, + "grad_norm": 0.3532790243625641, + "learning_rate": 5.711957708146365e-06, + "loss": 0.1116, + "step": 16950 + }, + { + "epoch": 443.39869281045753, + "grad_norm": 0.2659851014614105, + "learning_rate": 5.657004357034445e-06, + "loss": 0.1128, + "step": 16960 + }, + { + "epoch": 443.66013071895424, + "grad_norm": 0.3570290207862854, + "learning_rate": 5.602308936070133e-06, + "loss": 0.1143, + "step": 16970 + }, + { + "epoch": 443.921568627451, + "grad_norm": 0.30066853761672974, + "learning_rate": 5.547871594788611e-06, + "loss": 0.1106, + "step": 16980 + }, + { + "epoch": 444.1830065359477, + "grad_norm": 0.30849123001098633, + "learning_rate": 5.49369248201953e-06, + "loss": 0.1101, + "step": 16990 + }, + { + "epoch": 444.44444444444446, + "grad_norm": 0.30579501390457153, + "learning_rate": 5.4397717458864576e-06, + "loss": 0.1113, + "step": 17000 + }, + { + "epoch": 444.70588235294116, + "grad_norm": 0.30909931659698486, + "learning_rate": 5.3861095338066826e-06, + "loss": 0.1124, + "step": 17010 + }, + { + "epoch": 444.9673202614379, + "grad_norm": 0.3454172909259796, + "learning_rate": 5.332705992490616e-06, + "loss": 0.1124, + "step": 17020 + }, + { + "epoch": 445.2287581699346, + "grad_norm": 0.390906423330307, + "learning_rate": 5.279561267941491e-06, + "loss": 0.1097, + "step": 17030 + }, + { + "epoch": 445.4901960784314, + "grad_norm": 0.3330720365047455, + "learning_rate": 5.226675505454981e-06, + "loss": 0.1113, + "step": 17040 + }, + { + "epoch": 445.7516339869281, + "grad_norm": 0.33850574493408203, + "learning_rate": 5.174048849618718e-06, + "loss": 0.1118, + "step": 17050 + }, + { + "epoch": 446.01307189542484, + "grad_norm": 0.3519850969314575, + "learning_rate": 5.121681444311987e-06, + "loss": 0.1147, + "step": 17060 + }, + { + "epoch": 446.27450980392155, + "grad_norm": 0.3401840031147003, + "learning_rate": 5.069573432705277e-06, + "loss": 0.1116, + "step": 17070 + }, + { + "epoch": 446.5359477124183, + "grad_norm": 0.3529043197631836, + "learning_rate": 5.017724957259873e-06, + "loss": 0.1135, + "step": 17080 + }, + { + "epoch": 446.797385620915, + "grad_norm": 0.3206219971179962, + "learning_rate": 4.966136159727563e-06, + "loss": 0.1118, + "step": 17090 + }, + { + "epoch": 447.05882352941177, + "grad_norm": 0.3134680986404419, + "learning_rate": 4.914807181150139e-06, + "loss": 0.1092, + "step": 17100 + }, + { + "epoch": 447.32026143790847, + "grad_norm": 0.3726840019226074, + "learning_rate": 4.863738161859044e-06, + "loss": 0.1116, + "step": 17110 + }, + { + "epoch": 447.58169934640523, + "grad_norm": 0.41007351875305176, + "learning_rate": 4.812929241475062e-06, + "loss": 0.1136, + "step": 17120 + }, + { + "epoch": 447.84313725490193, + "grad_norm": 0.2826221287250519, + "learning_rate": 4.762380558907798e-06, + "loss": 0.1113, + "step": 17130 + }, + { + "epoch": 448.1045751633987, + "grad_norm": 0.3534366488456726, + "learning_rate": 4.712092252355471e-06, + "loss": 0.1109, + "step": 17140 + }, + { + "epoch": 448.36601307189545, + "grad_norm": 0.28506147861480713, + "learning_rate": 4.662064459304372e-06, + "loss": 0.1134, + "step": 17150 + }, + { + "epoch": 448.62745098039215, + "grad_norm": 0.3237999975681305, + "learning_rate": 4.612297316528547e-06, + "loss": 0.1113, + "step": 17160 + }, + { + "epoch": 448.8888888888889, + "grad_norm": 0.37305569648742676, + "learning_rate": 4.5627909600895026e-06, + "loss": 0.1105, + "step": 17170 + }, + { + "epoch": 449.1503267973856, + "grad_norm": 0.30823808908462524, + "learning_rate": 4.513545525335705e-06, + "loss": 0.1123, + "step": 17180 + }, + { + "epoch": 449.4117647058824, + "grad_norm": 0.3125160336494446, + "learning_rate": 4.464561146902302e-06, + "loss": 0.1104, + "step": 17190 + }, + { + "epoch": 449.6732026143791, + "grad_norm": 0.32170015573501587, + "learning_rate": 4.4158379587107335e-06, + "loss": 0.1118, + "step": 17200 + }, + { + "epoch": 449.93464052287584, + "grad_norm": 0.3177231550216675, + "learning_rate": 4.367376093968278e-06, + "loss": 0.1134, + "step": 17210 + }, + { + "epoch": 450.19607843137254, + "grad_norm": 0.35781294107437134, + "learning_rate": 4.319175685167887e-06, + "loss": 0.1095, + "step": 17220 + }, + { + "epoch": 450.4575163398693, + "grad_norm": 0.3756282925605774, + "learning_rate": 4.2712368640875914e-06, + "loss": 0.1135, + "step": 17230 + }, + { + "epoch": 450.718954248366, + "grad_norm": 0.3352264165878296, + "learning_rate": 4.22355976179033e-06, + "loss": 0.1113, + "step": 17240 + }, + { + "epoch": 450.98039215686276, + "grad_norm": 0.38803133368492126, + "learning_rate": 4.176144508623458e-06, + "loss": 0.1114, + "step": 17250 + }, + { + "epoch": 451.24183006535947, + "grad_norm": 0.3300243020057678, + "learning_rate": 4.128991234218471e-06, + "loss": 0.1115, + "step": 17260 + }, + { + "epoch": 451.5032679738562, + "grad_norm": 0.30495062470436096, + "learning_rate": 4.082100067490635e-06, + "loss": 0.1117, + "step": 17270 + }, + { + "epoch": 451.7647058823529, + "grad_norm": 0.3008281886577606, + "learning_rate": 4.03547113663858e-06, + "loss": 0.1138, + "step": 17280 + }, + { + "epoch": 452.0261437908497, + "grad_norm": 0.3080352246761322, + "learning_rate": 3.989104569144065e-06, + "loss": 0.1098, + "step": 17290 + }, + { + "epoch": 452.2875816993464, + "grad_norm": 0.30234184861183167, + "learning_rate": 3.943000491771487e-06, + "loss": 0.1114, + "step": 17300 + }, + { + "epoch": 452.54901960784315, + "grad_norm": 0.3571633994579315, + "learning_rate": 3.897159030567621e-06, + "loss": 0.1114, + "step": 17310 + }, + { + "epoch": 452.81045751633985, + "grad_norm": 0.3420047163963318, + "learning_rate": 3.8515803108613025e-06, + "loss": 0.1107, + "step": 17320 + }, + { + "epoch": 453.0718954248366, + "grad_norm": 0.38761913776397705, + "learning_rate": 3.806264457262976e-06, + "loss": 0.1129, + "step": 17330 + }, + { + "epoch": 453.3333333333333, + "grad_norm": 0.3283638656139374, + "learning_rate": 3.7612115936644932e-06, + "loss": 0.113, + "step": 17340 + }, + { + "epoch": 453.5947712418301, + "grad_norm": 0.38328078389167786, + "learning_rate": 3.716421843238649e-06, + "loss": 0.1128, + "step": 17350 + }, + { + "epoch": 453.8562091503268, + "grad_norm": 0.2841828763484955, + "learning_rate": 3.67189532843889e-06, + "loss": 0.1106, + "step": 17360 + }, + { + "epoch": 454.11764705882354, + "grad_norm": 0.39247363805770874, + "learning_rate": 3.627632170999029e-06, + "loss": 0.1132, + "step": 17370 + }, + { + "epoch": 454.37908496732024, + "grad_norm": 0.38083165884017944, + "learning_rate": 3.5836324919328536e-06, + "loss": 0.1143, + "step": 17380 + }, + { + "epoch": 454.640522875817, + "grad_norm": 0.3211730420589447, + "learning_rate": 3.5398964115337828e-06, + "loss": 0.1088, + "step": 17390 + }, + { + "epoch": 454.9019607843137, + "grad_norm": 0.347988486289978, + "learning_rate": 3.496424049374614e-06, + "loss": 0.1084, + "step": 17400 + }, + { + "epoch": 455.16339869281046, + "grad_norm": 0.331668496131897, + "learning_rate": 3.4532155243070963e-06, + "loss": 0.1148, + "step": 17410 + }, + { + "epoch": 455.42483660130716, + "grad_norm": 0.3547741174697876, + "learning_rate": 3.410270954461725e-06, + "loss": 0.1101, + "step": 17420 + }, + { + "epoch": 455.6862745098039, + "grad_norm": 0.3205719292163849, + "learning_rate": 3.3675904572472825e-06, + "loss": 0.1102, + "step": 17430 + }, + { + "epoch": 455.9477124183006, + "grad_norm": 0.3358950912952423, + "learning_rate": 3.3251741493506294e-06, + "loss": 0.1109, + "step": 17440 + }, + { + "epoch": 456.2091503267974, + "grad_norm": 0.3523581027984619, + "learning_rate": 3.2830221467363476e-06, + "loss": 0.1134, + "step": 17450 + }, + { + "epoch": 456.47058823529414, + "grad_norm": 0.36160987615585327, + "learning_rate": 3.2411345646463643e-06, + "loss": 0.1101, + "step": 17460 + }, + { + "epoch": 456.73202614379085, + "grad_norm": 0.2866756319999695, + "learning_rate": 3.1995115175997736e-06, + "loss": 0.1121, + "step": 17470 + }, + { + "epoch": 456.9934640522876, + "grad_norm": 0.31712275743484497, + "learning_rate": 3.1581531193923706e-06, + "loss": 0.1127, + "step": 17480 + }, + { + "epoch": 457.2549019607843, + "grad_norm": 0.3979257345199585, + "learning_rate": 3.1170594830964405e-06, + "loss": 0.1103, + "step": 17490 + }, + { + "epoch": 457.51633986928107, + "grad_norm": 0.3742763102054596, + "learning_rate": 3.0762307210604246e-06, + "loss": 0.1115, + "step": 17500 + }, + { + "epoch": 457.77777777777777, + "grad_norm": 0.3208019435405731, + "learning_rate": 3.0356669449085775e-06, + "loss": 0.1157, + "step": 17510 + }, + { + "epoch": 458.03921568627453, + "grad_norm": 0.31606632471084595, + "learning_rate": 2.9953682655407434e-06, + "loss": 0.1093, + "step": 17520 + }, + { + "epoch": 458.30065359477123, + "grad_norm": 0.36023128032684326, + "learning_rate": 2.9553347931319586e-06, + "loss": 0.114, + "step": 17530 + }, + { + "epoch": 458.562091503268, + "grad_norm": 0.39027029275894165, + "learning_rate": 2.9155666371321944e-06, + "loss": 0.1123, + "step": 17540 + }, + { + "epoch": 458.8235294117647, + "grad_norm": 0.28587546944618225, + "learning_rate": 2.876063906266102e-06, + "loss": 0.1114, + "step": 17550 + }, + { + "epoch": 459.08496732026146, + "grad_norm": 0.33159536123275757, + "learning_rate": 2.836826708532603e-06, + "loss": 0.1084, + "step": 17560 + }, + { + "epoch": 459.34640522875816, + "grad_norm": 0.37588268518447876, + "learning_rate": 2.7978551512047312e-06, + "loss": 0.1136, + "step": 17570 + }, + { + "epoch": 459.6078431372549, + "grad_norm": 0.309316486120224, + "learning_rate": 2.7591493408292256e-06, + "loss": 0.1107, + "step": 17580 + }, + { + "epoch": 459.8692810457516, + "grad_norm": 0.34601840376853943, + "learning_rate": 2.720709383226272e-06, + "loss": 0.1104, + "step": 17590 + }, + { + "epoch": 460.1307189542484, + "grad_norm": 0.28702256083488464, + "learning_rate": 2.682535383489282e-06, + "loss": 0.1126, + "step": 17600 + }, + { + "epoch": 460.3921568627451, + "grad_norm": 0.3772827088832855, + "learning_rate": 2.6446274459844712e-06, + "loss": 0.111, + "step": 17610 + }, + { + "epoch": 460.65359477124184, + "grad_norm": 0.3219127357006073, + "learning_rate": 2.606985674350737e-06, + "loss": 0.1089, + "step": 17620 + }, + { + "epoch": 460.91503267973854, + "grad_norm": 0.36690565943717957, + "learning_rate": 2.569610171499226e-06, + "loss": 0.1134, + "step": 17630 + }, + { + "epoch": 461.1764705882353, + "grad_norm": 0.3363436162471771, + "learning_rate": 2.5325010396131332e-06, + "loss": 0.1111, + "step": 17640 + }, + { + "epoch": 461.437908496732, + "grad_norm": 0.3152006268501282, + "learning_rate": 2.495658380147414e-06, + "loss": 0.111, + "step": 17650 + }, + { + "epoch": 461.69934640522877, + "grad_norm": 0.30644363164901733, + "learning_rate": 2.4590822938284854e-06, + "loss": 0.1126, + "step": 17660 + }, + { + "epoch": 461.96078431372547, + "grad_norm": 0.4446253776550293, + "learning_rate": 2.4227728806539672e-06, + "loss": 0.1135, + "step": 17670 + }, + { + "epoch": 462.22222222222223, + "grad_norm": 0.323368638753891, + "learning_rate": 2.386730239892432e-06, + "loss": 0.1088, + "step": 17680 + }, + { + "epoch": 462.48366013071893, + "grad_norm": 0.37691059708595276, + "learning_rate": 2.3509544700830556e-06, + "loss": 0.1113, + "step": 17690 + }, + { + "epoch": 462.7450980392157, + "grad_norm": 0.32642048597335815, + "learning_rate": 2.318984532773427e-06, + "loss": 0.1136, + "step": 17700 + }, + { + "epoch": 463.0065359477124, + "grad_norm": 0.37580016255378723, + "learning_rate": 2.283716086635357e-06, + "loss": 0.1124, + "step": 17710 + }, + { + "epoch": 463.26797385620915, + "grad_norm": 0.3199895918369293, + "learning_rate": 2.248714793086215e-06, + "loss": 0.112, + "step": 17720 + }, + { + "epoch": 463.52941176470586, + "grad_norm": 0.2862391471862793, + "learning_rate": 2.213980747818201e-06, + "loss": 0.111, + "step": 17730 + }, + { + "epoch": 463.7908496732026, + "grad_norm": 0.3182966411113739, + "learning_rate": 2.179514045792885e-06, + "loss": 0.111, + "step": 17740 + }, + { + "epoch": 464.0522875816994, + "grad_norm": 0.4070954918861389, + "learning_rate": 2.1453147812408925e-06, + "loss": 0.1133, + "step": 17750 + }, + { + "epoch": 464.3137254901961, + "grad_norm": 0.39886951446533203, + "learning_rate": 2.1113830476617193e-06, + "loss": 0.1129, + "step": 17760 + }, + { + "epoch": 464.57516339869284, + "grad_norm": 0.3158631920814514, + "learning_rate": 2.0777189378234143e-06, + "loss": 0.1114, + "step": 17770 + }, + { + "epoch": 464.83660130718954, + "grad_norm": 0.3350854814052582, + "learning_rate": 2.0443225437624e-06, + "loss": 0.1103, + "step": 17780 + }, + { + "epoch": 465.0980392156863, + "grad_norm": 0.3238007128238678, + "learning_rate": 2.0111939567831197e-06, + "loss": 0.11, + "step": 17790 + }, + { + "epoch": 465.359477124183, + "grad_norm": 0.3874686360359192, + "learning_rate": 1.9783332674578546e-06, + "loss": 0.1122, + "step": 17800 + }, + { + "epoch": 465.62091503267976, + "grad_norm": 0.3172845244407654, + "learning_rate": 1.9457405656264973e-06, + "loss": 0.1108, + "step": 17810 + }, + { + "epoch": 465.88235294117646, + "grad_norm": 0.3653365969657898, + "learning_rate": 1.913415940396246e-06, + "loss": 0.1124, + "step": 17820 + }, + { + "epoch": 466.1437908496732, + "grad_norm": 0.33903956413269043, + "learning_rate": 1.8813594801413758e-06, + "loss": 0.1124, + "step": 17830 + }, + { + "epoch": 466.4052287581699, + "grad_norm": 0.3369342088699341, + "learning_rate": 1.8495712725030478e-06, + "loss": 0.1157, + "step": 17840 + }, + { + "epoch": 466.6666666666667, + "grad_norm": 0.36298859119415283, + "learning_rate": 1.8180514043889763e-06, + "loss": 0.1097, + "step": 17850 + }, + { + "epoch": 466.9281045751634, + "grad_norm": 0.3302154541015625, + "learning_rate": 1.7867999619733179e-06, + "loss": 0.1104, + "step": 17860 + }, + { + "epoch": 467.18954248366015, + "grad_norm": 0.42841556668281555, + "learning_rate": 1.755817030696294e-06, + "loss": 0.1101, + "step": 17870 + }, + { + "epoch": 467.45098039215685, + "grad_norm": 0.33162766695022583, + "learning_rate": 1.725102695264058e-06, + "loss": 0.1095, + "step": 17880 + }, + { + "epoch": 467.7124183006536, + "grad_norm": 0.33728310465812683, + "learning_rate": 1.6946570396484507e-06, + "loss": 0.11, + "step": 17890 + }, + { + "epoch": 467.9738562091503, + "grad_norm": 0.4027140438556671, + "learning_rate": 1.6644801470867e-06, + "loss": 0.1145, + "step": 17900 + }, + { + "epoch": 468.2352941176471, + "grad_norm": 0.33868324756622314, + "learning_rate": 1.6345721000812997e-06, + "loss": 0.1111, + "step": 17910 + }, + { + "epoch": 468.4967320261438, + "grad_norm": 0.30771663784980774, + "learning_rate": 1.6049329803997092e-06, + "loss": 0.1131, + "step": 17920 + }, + { + "epoch": 468.75816993464053, + "grad_norm": 0.32284727692604065, + "learning_rate": 1.575562869074143e-06, + "loss": 0.1076, + "step": 17930 + }, + { + "epoch": 469.01960784313724, + "grad_norm": 0.3255426287651062, + "learning_rate": 1.5464618464013592e-06, + "loss": 0.1151, + "step": 17940 + }, + { + "epoch": 469.281045751634, + "grad_norm": 0.3239709436893463, + "learning_rate": 1.5176299919424486e-06, + "loss": 0.1113, + "step": 17950 + }, + { + "epoch": 469.5424836601307, + "grad_norm": 0.39039456844329834, + "learning_rate": 1.4890673845226133e-06, + "loss": 0.1123, + "step": 17960 + }, + { + "epoch": 469.80392156862746, + "grad_norm": 0.35394954681396484, + "learning_rate": 1.4607741022309106e-06, + "loss": 0.1157, + "step": 17970 + }, + { + "epoch": 470.06535947712416, + "grad_norm": 0.33994433283805847, + "learning_rate": 1.4327502224200872e-06, + "loss": 0.106, + "step": 17980 + }, + { + "epoch": 470.3267973856209, + "grad_norm": 0.36443567276000977, + "learning_rate": 1.4049958217063896e-06, + "loss": 0.1097, + "step": 17990 + }, + { + "epoch": 470.5882352941176, + "grad_norm": 0.28790152072906494, + "learning_rate": 1.3775109759692651e-06, + "loss": 0.1127, + "step": 18000 + }, + { + "epoch": 470.8496732026144, + "grad_norm": 0.31258511543273926, + "learning_rate": 1.350295760351261e-06, + "loss": 0.112, + "step": 18010 + }, + { + "epoch": 471.1111111111111, + "grad_norm": 0.35889947414398193, + "learning_rate": 1.3233502492577044e-06, + "loss": 0.1139, + "step": 18020 + }, + { + "epoch": 471.37254901960785, + "grad_norm": 0.31674671173095703, + "learning_rate": 1.2966745163566107e-06, + "loss": 0.1113, + "step": 18030 + }, + { + "epoch": 471.63398692810455, + "grad_norm": 0.3120609521865845, + "learning_rate": 1.2702686345784088e-06, + "loss": 0.1133, + "step": 18040 + }, + { + "epoch": 471.8954248366013, + "grad_norm": 0.4035533666610718, + "learning_rate": 1.2441326761157723e-06, + "loss": 0.1112, + "step": 18050 + }, + { + "epoch": 472.15686274509807, + "grad_norm": 0.3225405812263489, + "learning_rate": 1.2182667124234326e-06, + "loss": 0.1085, + "step": 18060 + }, + { + "epoch": 472.41830065359477, + "grad_norm": 0.3946692645549774, + "learning_rate": 1.1926708142179111e-06, + "loss": 0.1124, + "step": 18070 + }, + { + "epoch": 472.67973856209153, + "grad_norm": 0.34986528754234314, + "learning_rate": 1.167345051477442e-06, + "loss": 0.1111, + "step": 18080 + }, + { + "epoch": 472.94117647058823, + "grad_norm": 0.32098639011383057, + "learning_rate": 1.1422894934416838e-06, + "loss": 0.1115, + "step": 18090 + }, + { + "epoch": 473.202614379085, + "grad_norm": 0.33079513907432556, + "learning_rate": 1.117504208611586e-06, + "loss": 0.1116, + "step": 18100 + }, + { + "epoch": 473.4640522875817, + "grad_norm": 0.332214891910553, + "learning_rate": 1.092989264749167e-06, + "loss": 0.1106, + "step": 18110 + }, + { + "epoch": 473.72549019607845, + "grad_norm": 0.2994788885116577, + "learning_rate": 1.0687447288773244e-06, + "loss": 0.1122, + "step": 18120 + }, + { + "epoch": 473.98692810457516, + "grad_norm": 0.32751554250717163, + "learning_rate": 1.0447706672797264e-06, + "loss": 0.1126, + "step": 18130 + }, + { + "epoch": 474.2483660130719, + "grad_norm": 0.40659329295158386, + "learning_rate": 1.0210671455005204e-06, + "loss": 0.1122, + "step": 18140 + }, + { + "epoch": 474.5098039215686, + "grad_norm": 0.3787976801395416, + "learning_rate": 9.976342283442463e-07, + "loss": 0.1132, + "step": 18150 + }, + { + "epoch": 474.7712418300654, + "grad_norm": 0.32952606678009033, + "learning_rate": 9.744719798755907e-07, + "loss": 0.1134, + "step": 18160 + }, + { + "epoch": 475.0326797385621, + "grad_norm": 0.36884790658950806, + "learning_rate": 9.515804634192659e-07, + "loss": 0.1089, + "step": 18170 + }, + { + "epoch": 475.29411764705884, + "grad_norm": 0.3276387155056, + "learning_rate": 9.289597415597872e-07, + "loss": 0.1081, + "step": 18180 + }, + { + "epoch": 475.55555555555554, + "grad_norm": 0.34189870953559875, + "learning_rate": 9.066098761413733e-07, + "loss": 0.113, + "step": 18190 + }, + { + "epoch": 475.8169934640523, + "grad_norm": 0.32616567611694336, + "learning_rate": 8.845309282676795e-07, + "loss": 0.112, + "step": 18200 + }, + { + "epoch": 476.078431372549, + "grad_norm": 0.3290955126285553, + "learning_rate": 8.627229583017204e-07, + "loss": 0.1112, + "step": 18210 + }, + { + "epoch": 476.33986928104576, + "grad_norm": 0.34466055035591125, + "learning_rate": 8.411860258656256e-07, + "loss": 0.1096, + "step": 18220 + }, + { + "epoch": 476.60130718954247, + "grad_norm": 0.30991095304489136, + "learning_rate": 8.199201898405839e-07, + "loss": 0.1115, + "step": 18230 + }, + { + "epoch": 476.8627450980392, + "grad_norm": 0.34044432640075684, + "learning_rate": 7.989255083665659e-07, + "loss": 0.1145, + "step": 18240 + }, + { + "epoch": 477.12418300653593, + "grad_norm": 0.34200185537338257, + "learning_rate": 7.782020388422018e-07, + "loss": 0.1114, + "step": 18250 + }, + { + "epoch": 477.3856209150327, + "grad_norm": 0.30193594098091125, + "learning_rate": 7.577498379247039e-07, + "loss": 0.1109, + "step": 18260 + }, + { + "epoch": 477.6470588235294, + "grad_norm": 0.3074875473976135, + "learning_rate": 7.375689615295889e-07, + "loss": 0.1111, + "step": 18270 + }, + { + "epoch": 477.90849673202615, + "grad_norm": 0.3469603359699249, + "learning_rate": 7.176594648306111e-07, + "loss": 0.114, + "step": 18280 + }, + { + "epoch": 478.16993464052285, + "grad_norm": 0.2915016710758209, + "learning_rate": 6.980214022595744e-07, + "loss": 0.1096, + "step": 18290 + }, + { + "epoch": 478.4313725490196, + "grad_norm": 0.32458066940307617, + "learning_rate": 6.786548275061754e-07, + "loss": 0.1124, + "step": 18300 + }, + { + "epoch": 478.6928104575163, + "grad_norm": 0.5039242506027222, + "learning_rate": 6.595597935179166e-07, + "loss": 0.111, + "step": 18310 + }, + { + "epoch": 478.9542483660131, + "grad_norm": 0.3211175501346588, + "learning_rate": 6.4073635249986e-07, + "loss": 0.1138, + "step": 18320 + }, + { + "epoch": 479.2156862745098, + "grad_norm": 0.32088157534599304, + "learning_rate": 6.221845559146066e-07, + "loss": 0.1104, + "step": 18330 + }, + { + "epoch": 479.47712418300654, + "grad_norm": 0.38157036900520325, + "learning_rate": 6.039044544820404e-07, + "loss": 0.1124, + "step": 18340 + }, + { + "epoch": 479.73856209150324, + "grad_norm": 0.3110623061656952, + "learning_rate": 5.858960981792505e-07, + "loss": 0.1127, + "step": 18350 + }, + { + "epoch": 480.0, + "grad_norm": 0.39009323716163635, + "learning_rate": 5.681595362404312e-07, + "loss": 0.1107, + "step": 18360 + }, + { + "epoch": 480.26143790849676, + "grad_norm": 0.3596895635128021, + "learning_rate": 5.506948171566273e-07, + "loss": 0.1122, + "step": 18370 + }, + { + "epoch": 480.52287581699346, + "grad_norm": 0.3319242298603058, + "learning_rate": 5.335019886757442e-07, + "loss": 0.1116, + "step": 18380 + }, + { + "epoch": 480.7843137254902, + "grad_norm": 0.34595364332199097, + "learning_rate": 5.165810978023044e-07, + "loss": 0.1109, + "step": 18390 + }, + { + "epoch": 481.0457516339869, + "grad_norm": 0.3366650640964508, + "learning_rate": 4.999321907973698e-07, + "loss": 0.1103, + "step": 18400 + }, + { + "epoch": 481.3071895424837, + "grad_norm": 0.31628909707069397, + "learning_rate": 4.835553131784298e-07, + "loss": 0.1128, + "step": 18410 + }, + { + "epoch": 481.5686274509804, + "grad_norm": 0.33149656653404236, + "learning_rate": 4.6745050971923607e-07, + "loss": 0.1119, + "step": 18420 + }, + { + "epoch": 481.83006535947715, + "grad_norm": 0.30861008167266846, + "learning_rate": 4.5161782444971267e-07, + "loss": 0.1106, + "step": 18430 + }, + { + "epoch": 482.09150326797385, + "grad_norm": 0.357360303401947, + "learning_rate": 4.360573006558122e-07, + "loss": 0.1112, + "step": 18440 + }, + { + "epoch": 482.3529411764706, + "grad_norm": 0.3232167065143585, + "learning_rate": 4.207689808794046e-07, + "loss": 0.1088, + "step": 18450 + }, + { + "epoch": 482.6143790849673, + "grad_norm": 0.31159424781799316, + "learning_rate": 4.0575290691817757e-07, + "loss": 0.1121, + "step": 18460 + }, + { + "epoch": 482.87581699346407, + "grad_norm": 0.4624285101890564, + "learning_rate": 3.91009119825525e-07, + "loss": 0.1121, + "step": 18470 + }, + { + "epoch": 483.1372549019608, + "grad_norm": 0.34504663944244385, + "learning_rate": 3.765376599103587e-07, + "loss": 0.1135, + "step": 18480 + }, + { + "epoch": 483.39869281045753, + "grad_norm": 0.3002839982509613, + "learning_rate": 3.623385667371304e-07, + "loss": 0.1101, + "step": 18490 + }, + { + "epoch": 483.66013071895424, + "grad_norm": 0.37612277269363403, + "learning_rate": 3.484118791255986e-07, + "loss": 0.112, + "step": 18500 + }, + { + "epoch": 483.921568627451, + "grad_norm": 0.4303232431411743, + "learning_rate": 3.347576351508064e-07, + "loss": 0.1127, + "step": 18510 + }, + { + "epoch": 484.1830065359477, + "grad_norm": 0.4048657715320587, + "learning_rate": 3.2137587214293717e-07, + "loss": 0.1125, + "step": 18520 + }, + { + "epoch": 484.44444444444446, + "grad_norm": 0.2931053638458252, + "learning_rate": 3.0826662668720364e-07, + "loss": 0.1138, + "step": 18530 + }, + { + "epoch": 484.70588235294116, + "grad_norm": 0.29800963401794434, + "learning_rate": 2.954299346238032e-07, + "loss": 0.1096, + "step": 18540 + }, + { + "epoch": 484.9673202614379, + "grad_norm": 0.34082716703414917, + "learning_rate": 2.828658310477406e-07, + "loss": 0.1086, + "step": 18550 + }, + { + "epoch": 485.2287581699346, + "grad_norm": 0.38244274258613586, + "learning_rate": 2.705743503088165e-07, + "loss": 0.1134, + "step": 18560 + }, + { + "epoch": 485.4901960784314, + "grad_norm": 0.3117094039916992, + "learning_rate": 2.585555260114614e-07, + "loss": 0.1078, + "step": 18570 + }, + { + "epoch": 485.7516339869281, + "grad_norm": 0.32655006647109985, + "learning_rate": 2.468093910146685e-07, + "loss": 0.1124, + "step": 18580 + }, + { + "epoch": 486.01307189542484, + "grad_norm": 0.4254554510116577, + "learning_rate": 2.3533597743194967e-07, + "loss": 0.1127, + "step": 18590 + }, + { + "epoch": 486.27450980392155, + "grad_norm": 0.3003155291080475, + "learning_rate": 2.2413531663115773e-07, + "loss": 0.1097, + "step": 18600 + }, + { + "epoch": 486.5359477124183, + "grad_norm": 0.3139316439628601, + "learning_rate": 2.1320743923447517e-07, + "loss": 0.1114, + "step": 18610 + }, + { + "epoch": 486.797385620915, + "grad_norm": 0.38288217782974243, + "learning_rate": 2.0255237511830338e-07, + "loss": 0.1145, + "step": 18620 + }, + { + "epoch": 487.05882352941177, + "grad_norm": 0.3423730134963989, + "learning_rate": 1.921701534131848e-07, + "loss": 0.1115, + "step": 18630 + }, + { + "epoch": 487.32026143790847, + "grad_norm": 0.40306052565574646, + "learning_rate": 1.8206080250372515e-07, + "loss": 0.1129, + "step": 18640 + }, + { + "epoch": 487.58169934640523, + "grad_norm": 0.30820974707603455, + "learning_rate": 1.7222435002847147e-07, + "loss": 0.1108, + "step": 18650 + }, + { + "epoch": 487.84313725490193, + "grad_norm": 0.3316936790943146, + "learning_rate": 1.6266082287994533e-07, + "loss": 0.1087, + "step": 18660 + }, + { + "epoch": 488.1045751633987, + "grad_norm": 0.32804399728775024, + "learning_rate": 1.5337024720445403e-07, + "loss": 0.1149, + "step": 18670 + }, + { + "epoch": 488.36601307189545, + "grad_norm": 0.36528879404067993, + "learning_rate": 1.443526484020574e-07, + "loss": 0.111, + "step": 18680 + }, + { + "epoch": 488.62745098039215, + "grad_norm": 0.336240291595459, + "learning_rate": 1.3560805112655673e-07, + "loss": 0.1123, + "step": 18690 + }, + { + "epoch": 488.8888888888889, + "grad_norm": 0.4585813283920288, + "learning_rate": 1.2713647928532802e-07, + "loss": 0.1106, + "step": 18700 + }, + { + "epoch": 489.1503267973856, + "grad_norm": 0.3581952452659607, + "learning_rate": 1.1893795603932222e-07, + "loss": 0.1135, + "step": 18710 + }, + { + "epoch": 489.4117647058824, + "grad_norm": 0.3902879059314728, + "learning_rate": 1.1101250380300965e-07, + "loss": 0.1083, + "step": 18720 + }, + { + "epoch": 489.6732026143791, + "grad_norm": 0.2964220345020294, + "learning_rate": 1.0336014424424668e-07, + "loss": 0.1116, + "step": 18730 + }, + { + "epoch": 489.93464052287584, + "grad_norm": 0.39717039465904236, + "learning_rate": 9.598089828430911e-08, + "loss": 0.1142, + "step": 18740 + }, + { + "epoch": 490.19607843137254, + "grad_norm": 0.3992186486721039, + "learning_rate": 8.887478609777011e-08, + "loss": 0.112, + "step": 18750 + }, + { + "epoch": 490.4575163398693, + "grad_norm": 0.34660351276397705, + "learning_rate": 8.204182711246677e-08, + "loss": 0.1128, + "step": 18760 + }, + { + "epoch": 490.718954248366, + "grad_norm": 0.29538923501968384, + "learning_rate": 7.54820400094558e-08, + "loss": 0.1075, + "step": 18770 + }, + { + "epoch": 490.98039215686276, + "grad_norm": 0.43791237473487854, + "learning_rate": 6.919544272293577e-08, + "loss": 0.1137, + "step": 18780 + }, + { + "epoch": 491.24183006535947, + "grad_norm": 0.35819703340530396, + "learning_rate": 6.318205244023601e-08, + "loss": 0.1113, + "step": 18790 + }, + { + "epoch": 491.5032679738562, + "grad_norm": 0.31988316774368286, + "learning_rate": 5.7441885601716707e-08, + "loss": 0.1108, + "step": 18800 + }, + { + "epoch": 491.7647058823529, + "grad_norm": 0.3159491717815399, + "learning_rate": 5.19749579007911e-08, + "loss": 0.114, + "step": 18810 + }, + { + "epoch": 492.0261437908497, + "grad_norm": 0.3267784118652344, + "learning_rate": 4.678128428382555e-08, + "loss": 0.1088, + "step": 18820 + }, + { + "epoch": 492.2875816993464, + "grad_norm": 0.3536956012248993, + "learning_rate": 4.186087895011737e-08, + "loss": 0.1136, + "step": 18830 + }, + { + "epoch": 492.54901960784315, + "grad_norm": 0.3293076455593109, + "learning_rate": 3.721375535188365e-08, + "loss": 0.1121, + "step": 18840 + }, + { + "epoch": 492.81045751633985, + "grad_norm": 0.2942824065685272, + "learning_rate": 3.283992619416143e-08, + "loss": 0.1117, + "step": 18850 + }, + { + "epoch": 493.0718954248366, + "grad_norm": 0.34250175952911377, + "learning_rate": 2.873940343485204e-08, + "loss": 0.1103, + "step": 18860 + }, + { + "epoch": 493.3333333333333, + "grad_norm": 0.3269750475883484, + "learning_rate": 2.4912198284621214e-08, + "loss": 0.11, + "step": 18870 + }, + { + "epoch": 493.5947712418301, + "grad_norm": 0.339807391166687, + "learning_rate": 2.135832120689907e-08, + "loss": 0.1081, + "step": 18880 + }, + { + "epoch": 493.8562091503268, + "grad_norm": 0.32826676964759827, + "learning_rate": 1.8077781917846815e-08, + "loss": 0.1127, + "step": 18890 + }, + { + "epoch": 494.11764705882354, + "grad_norm": 0.29220259189605713, + "learning_rate": 1.5070589386345645e-08, + "loss": 0.1146, + "step": 18900 + }, + { + "epoch": 494.37908496732024, + "grad_norm": 0.3818574547767639, + "learning_rate": 1.2336751833941229e-08, + "loss": 0.1145, + "step": 18910 + }, + { + "epoch": 494.640522875817, + "grad_norm": 0.3570195138454437, + "learning_rate": 9.876276734832601e-09, + "loss": 0.1106, + "step": 18920 + }, + { + "epoch": 494.9019607843137, + "grad_norm": 0.2886011600494385, + "learning_rate": 7.689170815872172e-09, + "loss": 0.1096, + "step": 18930 + }, + { + "epoch": 495.16339869281046, + "grad_norm": 0.36767634749412537, + "learning_rate": 5.775440056521308e-09, + "loss": 0.1102, + "step": 18940 + }, + { + "epoch": 495.42483660130716, + "grad_norm": 0.30554234981536865, + "learning_rate": 4.1350896888503464e-09, + "loss": 0.112, + "step": 18950 + }, + { + "epoch": 495.6862745098039, + "grad_norm": 0.35083529353141785, + "learning_rate": 2.768124197505273e-09, + "loss": 0.1123, + "step": 18960 + }, + { + "epoch": 495.9477124183006, + "grad_norm": 0.37509220838546753, + "learning_rate": 1.6745473197188333e-09, + "loss": 0.1122, + "step": 18970 + }, + { + "epoch": 496.2091503267974, + "grad_norm": 0.3241898715496063, + "learning_rate": 8.543620453105306e-10, + "loss": 0.1103, + "step": 18980 + }, + { + "epoch": 496.47058823529414, + "grad_norm": 0.40990370512008667, + "learning_rate": 3.075706166089098e-10, + "loss": 0.1103, + "step": 18990 + }, + { + "epoch": 496.73202614379085, + "grad_norm": 0.401498407125473, + "learning_rate": 3.417452852927383e-11, + "loss": 0.1131, + "step": 19000 + } + ], + "logging_steps": 10, + "max_steps": 19000, + "num_input_tokens_seen": 0, + "num_train_epochs": 500, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.3677835109083546e+17, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}