diff --git "a/Luminia-8B-RP/trainer_state.json" "b/Luminia-8B-RP/trainer_state.json" --- "a/Luminia-8B-RP/trainer_state.json" +++ "b/Luminia-8B-RP/trainer_state.json" @@ -10,39 +10,39 @@ "log_history": [ { "epoch": 0.0004801920768307323, - "grad_norm": 0.7035335898399353, + "grad_norm": 0.6976720094680786, "learning_rate": 5.000000000000001e-07, - "loss": 1.3448, + "loss": 1.3449, "num_input_tokens_seen": 81920, "step": 10 }, { "epoch": 0.0009603841536614646, - "grad_norm": 0.9122879505157471, + "grad_norm": 0.9117342233657837, "learning_rate": 1.0000000000000002e-06, - "loss": 1.3391, + "loss": 1.339, "num_input_tokens_seen": 163840, "step": 20 }, { "epoch": 0.0014405762304921968, - "grad_norm": 0.8231492638587952, + "grad_norm": 0.8280390501022339, "learning_rate": 1.5e-06, - "loss": 1.3258, + "loss": 1.3256, "num_input_tokens_seen": 245760, "step": 30 }, { "epoch": 0.0019207683073229293, - "grad_norm": 0.928112804889679, + "grad_norm": 0.9282752871513367, "learning_rate": 2.0000000000000003e-06, - "loss": 1.4027, + "loss": 1.4026, "num_input_tokens_seen": 327680, "step": 40 }, { "epoch": 0.0024009603841536613, - "grad_norm": 0.6168057918548584, + "grad_norm": 0.6142542362213135, "learning_rate": 2.5e-06, "loss": 1.1825, "num_input_tokens_seen": 409600, @@ -50,7 +50,7 @@ }, { "epoch": 0.0028811524609843936, - "grad_norm": 0.9915170073509216, + "grad_norm": 3.0495574474334717, "learning_rate": 3e-06, "loss": 1.0617, "num_input_tokens_seen": 491520, @@ -58,55 +58,55 @@ }, { "epoch": 0.0033613445378151263, - "grad_norm": 1.3197886943817139, + "grad_norm": 1.313432216644287, "learning_rate": 3.5000000000000004e-06, - "loss": 1.2087, + "loss": 1.2085, "num_input_tokens_seen": 573440, "step": 70 }, { "epoch": 0.0038415366146458585, - "grad_norm": 0.9965929985046387, + "grad_norm": 0.9920127391815186, "learning_rate": 4.000000000000001e-06, - "loss": 1.2009, + "loss": 1.2014, "num_input_tokens_seen": 655360, "step": 80 }, { "epoch": 0.004321728691476591, - "grad_norm": 1.3831547498703003, + "grad_norm": 1.3916656970977783, "learning_rate": 4.5e-06, - "loss": 1.2692, + "loss": 1.2691, "num_input_tokens_seen": 737280, "step": 90 }, { "epoch": 0.004801920768307323, - "grad_norm": 1.1937536001205444, + "grad_norm": 1.2180395126342773, "learning_rate": 5e-06, - "loss": 1.187, + "loss": 1.1872, "num_input_tokens_seen": 819200, "step": 100 }, { "epoch": 0.005282112845138055, - "grad_norm": 1.3311412334442139, + "grad_norm": 1.3460369110107422, "learning_rate": 5.500000000000001e-06, - "loss": 1.21, + "loss": 1.2103, "num_input_tokens_seen": 901120, "step": 110 }, { "epoch": 0.005762304921968787, - "grad_norm": 1.3286609649658203, + "grad_norm": 1.3270734548568726, "learning_rate": 6e-06, - "loss": 1.4533, + "loss": 1.4538, "num_input_tokens_seen": 983040, "step": 120 }, { "epoch": 0.00624249699879952, - "grad_norm": 1.8026959896087646, + "grad_norm": 1.7999413013458252, "learning_rate": 6.5000000000000004e-06, "loss": 1.3365, "num_input_tokens_seen": 1064960, @@ -114,287 +114,287 @@ }, { "epoch": 0.0067226890756302525, - "grad_norm": 1.4594871997833252, + "grad_norm": 1.4551628828048706, "learning_rate": 7.000000000000001e-06, - "loss": 1.0839, + "loss": 1.0837, "num_input_tokens_seen": 1146880, "step": 140 }, { "epoch": 0.007202881152460984, - "grad_norm": 1.2625036239624023, + "grad_norm": 1.265320062637329, "learning_rate": 7.5e-06, - "loss": 1.0646, + "loss": 1.0649, "num_input_tokens_seen": 1228800, "step": 150 }, { "epoch": 0.007683073229291717, - "grad_norm": 1.7347521781921387, + "grad_norm": 1.7354531288146973, "learning_rate": 8.000000000000001e-06, - "loss": 1.366, + "loss": 1.3666, "num_input_tokens_seen": 1310720, "step": 160 }, { "epoch": 0.00816326530612245, - "grad_norm": 1.4462337493896484, + "grad_norm": 1.464363932609558, "learning_rate": 8.500000000000002e-06, - "loss": 1.2509, + "loss": 1.2506, "num_input_tokens_seen": 1392640, "step": 170 }, { "epoch": 0.008643457382953182, - "grad_norm": 1.3763086795806885, + "grad_norm": 1.3760008811950684, "learning_rate": 9e-06, - "loss": 1.0235, + "loss": 1.0234, "num_input_tokens_seen": 1474560, "step": 180 }, { "epoch": 0.009123649459783913, - "grad_norm": 1.7035270929336548, + "grad_norm": 1.6841614246368408, "learning_rate": 9.5e-06, - "loss": 0.9992, + "loss": 0.9989, "num_input_tokens_seen": 1556480, "step": 190 }, { "epoch": 0.009603841536614645, - "grad_norm": 1.4041584730148315, + "grad_norm": 1.3923676013946533, "learning_rate": 1e-05, - "loss": 0.8424, + "loss": 0.8422, "num_input_tokens_seen": 1638400, "step": 200 }, { "epoch": 0.010084033613445379, - "grad_norm": 1.4125291109085083, + "grad_norm": 1.4108831882476807, "learning_rate": 1.05e-05, - "loss": 1.0181, + "loss": 1.0182, "num_input_tokens_seen": 1720320, "step": 210 }, { "epoch": 0.01056422569027611, - "grad_norm": 1.3839924335479736, + "grad_norm": 1.378661036491394, "learning_rate": 1.1000000000000001e-05, - "loss": 0.8846, + "loss": 0.8848, "num_input_tokens_seen": 1802240, "step": 220 }, { "epoch": 0.011044417767106842, - "grad_norm": 1.4403977394104004, + "grad_norm": 1.4397894144058228, "learning_rate": 1.1500000000000002e-05, - "loss": 1.1465, + "loss": 1.1464, "num_input_tokens_seen": 1884160, "step": 230 }, { "epoch": 0.011524609843937574, - "grad_norm": 1.1681145429611206, + "grad_norm": 1.1656988859176636, "learning_rate": 1.2e-05, - "loss": 1.0085, + "loss": 1.0084, "num_input_tokens_seen": 1966080, "step": 240 }, { "epoch": 0.012004801920768308, - "grad_norm": 2.7823386192321777, + "grad_norm": 2.9114668369293213, "learning_rate": 1.25e-05, - "loss": 1.0097, + "loss": 1.0096, "num_input_tokens_seen": 2048000, "step": 250 }, { "epoch": 0.01248499399759904, - "grad_norm": 1.2464739084243774, + "grad_norm": 1.2588117122650146, "learning_rate": 1.3000000000000001e-05, - "loss": 0.968, + "loss": 0.9685, "num_input_tokens_seen": 2129920, "step": 260 }, { "epoch": 0.012965186074429771, - "grad_norm": 2.1030478477478027, + "grad_norm": 2.1231954097747803, "learning_rate": 1.3500000000000001e-05, - "loss": 1.1094, + "loss": 1.109, "num_input_tokens_seen": 2211840, "step": 270 }, { "epoch": 0.013445378151260505, - "grad_norm": 1.006523609161377, + "grad_norm": 1.0035645961761475, "learning_rate": 1.4000000000000001e-05, - "loss": 1.0791, + "loss": 1.079, "num_input_tokens_seen": 2293760, "step": 280 }, { "epoch": 0.013925570228091237, - "grad_norm": 2.395143508911133, + "grad_norm": 2.3785393238067627, "learning_rate": 1.45e-05, - "loss": 1.0844, + "loss": 1.0842, "num_input_tokens_seen": 2375680, "step": 290 }, { "epoch": 0.014405762304921969, - "grad_norm": 1.1959385871887207, + "grad_norm": 1.2208366394042969, "learning_rate": 1.5e-05, - "loss": 1.0286, + "loss": 1.0287, "num_input_tokens_seen": 2457600, "step": 300 }, { "epoch": 0.0148859543817527, - "grad_norm": 2.8415608406066895, + "grad_norm": 2.929558038711548, "learning_rate": 1.55e-05, - "loss": 1.1747, + "loss": 1.1743, "num_input_tokens_seen": 2539520, "step": 310 }, { "epoch": 0.015366146458583434, - "grad_norm": 1.2111806869506836, + "grad_norm": 1.2139575481414795, "learning_rate": 1.6000000000000003e-05, - "loss": 1.1556, + "loss": 1.1558, "num_input_tokens_seen": 2621440, "step": 320 }, { "epoch": 0.015846338535414166, - "grad_norm": 1.1664150953292847, + "grad_norm": 1.1486806869506836, "learning_rate": 1.65e-05, - "loss": 1.0936, + "loss": 1.0934, "num_input_tokens_seen": 2703360, "step": 330 }, { "epoch": 0.0163265306122449, - "grad_norm": 1.121214747428894, + "grad_norm": 1.1211186647415161, "learning_rate": 1.7000000000000003e-05, - "loss": 1.0638, + "loss": 1.0637, "num_input_tokens_seen": 2785280, "step": 340 }, { "epoch": 0.01680672268907563, - "grad_norm": 1.1795384883880615, + "grad_norm": 1.207706093788147, "learning_rate": 1.75e-05, - "loss": 1.171, + "loss": 1.1708, "num_input_tokens_seen": 2867200, "step": 350 }, { "epoch": 0.017286914765906363, - "grad_norm": 1.1597260236740112, + "grad_norm": 1.1747910976409912, "learning_rate": 1.8e-05, - "loss": 0.9427, + "loss": 0.9425, "num_input_tokens_seen": 2949120, "step": 360 }, { "epoch": 0.017767106842737093, - "grad_norm": 0.9535462856292725, + "grad_norm": 0.9403015971183777, "learning_rate": 1.85e-05, - "loss": 0.9715, + "loss": 0.9716, "num_input_tokens_seen": 3031040, "step": 370 }, { "epoch": 0.018247298919567827, - "grad_norm": 0.946639895439148, + "grad_norm": 0.9536274075508118, "learning_rate": 1.9e-05, - "loss": 1.0749, + "loss": 1.0748, "num_input_tokens_seen": 3112960, "step": 380 }, { "epoch": 0.01872749099639856, - "grad_norm": 0.9363052845001221, + "grad_norm": 0.9436410069465637, "learning_rate": 1.9500000000000003e-05, - "loss": 0.9876, + "loss": 0.9879, "num_input_tokens_seen": 3194880, "step": 390 }, { "epoch": 0.01920768307322929, - "grad_norm": 1.0535475015640259, + "grad_norm": 1.080519199371338, "learning_rate": 2e-05, - "loss": 1.105, + "loss": 1.1051, "num_input_tokens_seen": 3276800, "step": 400 }, { "epoch": 0.019687875150060024, - "grad_norm": 2.634197950363159, + "grad_norm": 2.6441071033477783, "learning_rate": 2.05e-05, - "loss": 1.1199, + "loss": 1.1197, "num_input_tokens_seen": 3358720, "step": 410 }, { "epoch": 0.020168067226890758, - "grad_norm": 1.0603679418563843, + "grad_norm": 1.0869808197021484, "learning_rate": 2.1e-05, - "loss": 1.0458, + "loss": 1.046, "num_input_tokens_seen": 3440640, "step": 420 }, { "epoch": 0.020648259303721488, - "grad_norm": 1.0156742334365845, + "grad_norm": 1.0044586658477783, "learning_rate": 2.15e-05, - "loss": 1.023, + "loss": 1.0229, "num_input_tokens_seen": 3522560, "step": 430 }, { "epoch": 0.02112845138055222, - "grad_norm": 0.9424085021018982, + "grad_norm": 0.9314221143722534, "learning_rate": 2.2000000000000003e-05, - "loss": 1.1381, + "loss": 1.138, "num_input_tokens_seen": 3604480, "step": 440 }, { "epoch": 0.021608643457382955, - "grad_norm": 0.843125581741333, + "grad_norm": 0.846156895160675, "learning_rate": 2.25e-05, - "loss": 0.9443, + "loss": 0.9448, "num_input_tokens_seen": 3686400, "step": 450 }, { "epoch": 0.022088835534213685, - "grad_norm": 0.7700474262237549, + "grad_norm": 0.7679842114448547, "learning_rate": 2.3000000000000003e-05, - "loss": 0.9857, + "loss": 0.9858, "num_input_tokens_seen": 3768320, "step": 460 }, { "epoch": 0.02256902761104442, - "grad_norm": 0.8095147013664246, + "grad_norm": 0.8113387227058411, "learning_rate": 2.35e-05, - "loss": 1.2423, + "loss": 1.242, "num_input_tokens_seen": 3850240, "step": 470 }, { "epoch": 0.02304921968787515, - "grad_norm": 0.7245414853096008, + "grad_norm": 0.7278649210929871, "learning_rate": 2.4e-05, - "loss": 1.0458, + "loss": 1.046, "num_input_tokens_seen": 3932160, "step": 480 }, { "epoch": 0.023529411764705882, - "grad_norm": 0.7768824100494385, + "grad_norm": 0.765138566493988, "learning_rate": 2.45e-05, "loss": 1.0759, "num_input_tokens_seen": 4014080, @@ -402,87 +402,87 @@ }, { "epoch": 0.024009603841536616, - "grad_norm": 0.6364564895629883, + "grad_norm": 0.6442419290542603, "learning_rate": 2.5e-05, - "loss": 1.0799, + "loss": 1.0796, "num_input_tokens_seen": 4096000, "step": 500 }, { "epoch": 0.024489795918367346, - "grad_norm": 0.7478921413421631, + "grad_norm": 0.7583214640617371, "learning_rate": 2.5500000000000003e-05, - "loss": 1.0056, + "loss": 1.0057, "num_input_tokens_seen": 4177920, "step": 510 }, { "epoch": 0.02496998799519808, - "grad_norm": 0.5253103375434875, + "grad_norm": 0.5175316333770752, "learning_rate": 2.6000000000000002e-05, - "loss": 0.9158, + "loss": 0.9161, "num_input_tokens_seen": 4259840, "step": 520 }, { "epoch": 0.025450180072028813, - "grad_norm": 0.7591320872306824, + "grad_norm": 0.7704029679298401, "learning_rate": 2.6500000000000004e-05, - "loss": 1.0108, + "loss": 1.0109, "num_input_tokens_seen": 4341760, "step": 530 }, { "epoch": 0.025930372148859543, - "grad_norm": 0.7796912789344788, + "grad_norm": 0.7691550850868225, "learning_rate": 2.7000000000000002e-05, - "loss": 0.9604, + "loss": 0.9605, "num_input_tokens_seen": 4423680, "step": 540 }, { "epoch": 0.026410564225690276, - "grad_norm": 0.7369773983955383, + "grad_norm": 0.7306321263313293, "learning_rate": 2.7500000000000004e-05, - "loss": 1.0828, + "loss": 1.0826, "num_input_tokens_seen": 4505600, "step": 550 }, { "epoch": 0.02689075630252101, - "grad_norm": 0.7800397276878357, + "grad_norm": 0.7934399843215942, "learning_rate": 2.8000000000000003e-05, - "loss": 1.2004, + "loss": 1.2013, "num_input_tokens_seen": 4587520, "step": 560 }, { "epoch": 0.02737094837935174, - "grad_norm": 1.0041433572769165, + "grad_norm": 0.9972386360168457, "learning_rate": 2.8499999999999998e-05, - "loss": 1.0362, + "loss": 1.0361, "num_input_tokens_seen": 4669440, "step": 570 }, { "epoch": 0.027851140456182474, - "grad_norm": 0.7719876170158386, + "grad_norm": 0.7794674038887024, "learning_rate": 2.9e-05, - "loss": 1.1366, + "loss": 1.1374, "num_input_tokens_seen": 4751360, "step": 580 }, { "epoch": 0.028331332533013204, - "grad_norm": 0.7241286039352417, + "grad_norm": 0.7303441762924194, "learning_rate": 2.95e-05, - "loss": 1.1366, + "loss": 1.1363, "num_input_tokens_seen": 4833280, "step": 590 }, { "epoch": 0.028811524609843937, - "grad_norm": 1.1306092739105225, + "grad_norm": 1.1157978773117065, "learning_rate": 3e-05, "loss": 0.8576, "num_input_tokens_seen": 4915200, @@ -490,15 +490,15 @@ }, { "epoch": 0.02929171668667467, - "grad_norm": 0.6735727190971375, + "grad_norm": 0.6717653274536133, "learning_rate": 3.05e-05, - "loss": 1.2392, + "loss": 1.2395, "num_input_tokens_seen": 4997120, "step": 610 }, { "epoch": 0.0297719087635054, - "grad_norm": 1.3449808359146118, + "grad_norm": 1.3313277959823608, "learning_rate": 3.1e-05, "loss": 0.9339, "num_input_tokens_seen": 5079040, @@ -506,151 +506,151 @@ }, { "epoch": 0.030252100840336135, - "grad_norm": 0.7179974317550659, + "grad_norm": 0.7310436367988586, "learning_rate": 3.15e-05, - "loss": 0.9704, + "loss": 0.9699, "num_input_tokens_seen": 5160960, "step": 630 }, { "epoch": 0.030732292917166868, - "grad_norm": 0.7083514928817749, + "grad_norm": 0.708554208278656, "learning_rate": 3.2000000000000005e-05, - "loss": 0.9718, + "loss": 0.9722, "num_input_tokens_seen": 5242880, "step": 640 }, { "epoch": 0.031212484993997598, - "grad_norm": 1.1827001571655273, + "grad_norm": 1.094826102256775, "learning_rate": 3.2500000000000004e-05, - "loss": 0.9997, + "loss": 0.9991, "num_input_tokens_seen": 5324800, "step": 650 }, { "epoch": 0.03169267707082833, - "grad_norm": 0.553023099899292, + "grad_norm": 0.5501854419708252, "learning_rate": 3.3e-05, - "loss": 0.8748, + "loss": 0.8745, "num_input_tokens_seen": 5406720, "step": 660 }, { "epoch": 0.032172869147659065, - "grad_norm": 0.7951409816741943, + "grad_norm": 0.7503916025161743, "learning_rate": 3.35e-05, - "loss": 0.8658, + "loss": 0.8664, "num_input_tokens_seen": 5488640, "step": 670 }, { "epoch": 0.0326530612244898, - "grad_norm": 0.7278563976287842, + "grad_norm": 0.6931608319282532, "learning_rate": 3.4000000000000007e-05, - "loss": 0.8799, + "loss": 0.88, "num_input_tokens_seen": 5570560, "step": 680 }, { "epoch": 0.033133253301320525, - "grad_norm": 0.4969690442085266, + "grad_norm": 0.48539698123931885, "learning_rate": 3.45e-05, - "loss": 1.0723, + "loss": 1.0718, "num_input_tokens_seen": 5652480, "step": 690 }, { "epoch": 0.03361344537815126, - "grad_norm": 0.621137797832489, + "grad_norm": 0.6209288239479065, "learning_rate": 3.5e-05, - "loss": 1.0756, + "loss": 1.0753, "num_input_tokens_seen": 5734400, "step": 700 }, { "epoch": 0.03409363745498199, - "grad_norm": 0.6362459063529968, + "grad_norm": 0.6421590447425842, "learning_rate": 3.55e-05, - "loss": 0.967, + "loss": 0.9675, "num_input_tokens_seen": 5816320, "step": 710 }, { "epoch": 0.034573829531812726, - "grad_norm": 0.7109155654907227, + "grad_norm": 0.7048439979553223, "learning_rate": 3.6e-05, - "loss": 0.9767, + "loss": 0.9765, "num_input_tokens_seen": 5898240, "step": 720 }, { "epoch": 0.03505402160864346, - "grad_norm": 0.6260324120521545, + "grad_norm": 0.6135404706001282, "learning_rate": 3.65e-05, - "loss": 0.9352, + "loss": 0.9347, "num_input_tokens_seen": 5980160, "step": 730 }, { "epoch": 0.035534213685474186, - "grad_norm": 0.6420788168907166, + "grad_norm": 0.6425515413284302, "learning_rate": 3.7e-05, - "loss": 0.9214, + "loss": 0.9209, "num_input_tokens_seen": 6062080, "step": 740 }, { "epoch": 0.03601440576230492, - "grad_norm": 0.6354939937591553, + "grad_norm": 0.6536908149719238, "learning_rate": 3.7500000000000003e-05, - "loss": 1.0334, + "loss": 1.0337, "num_input_tokens_seen": 6144000, "step": 750 }, { "epoch": 0.036494597839135653, - "grad_norm": 0.5470936298370361, + "grad_norm": 0.5051663517951965, "learning_rate": 3.8e-05, - "loss": 1.0507, + "loss": 1.051, "num_input_tokens_seen": 6225920, "step": 760 }, { "epoch": 0.03697478991596639, - "grad_norm": 0.8862899541854858, + "grad_norm": 0.8759781122207642, "learning_rate": 3.85e-05, - "loss": 1.0, + "loss": 0.9995, "num_input_tokens_seen": 6307840, "step": 770 }, { "epoch": 0.03745498199279712, - "grad_norm": 0.6771734952926636, + "grad_norm": 0.7036904096603394, "learning_rate": 3.9000000000000006e-05, - "loss": 1.0441, + "loss": 1.0438, "num_input_tokens_seen": 6389760, "step": 780 }, { "epoch": 0.037935174069627854, - "grad_norm": 0.6661419868469238, + "grad_norm": 0.6532811522483826, "learning_rate": 3.9500000000000005e-05, - "loss": 1.0698, + "loss": 1.0693, "num_input_tokens_seen": 6471680, "step": 790 }, { "epoch": 0.03841536614645858, - "grad_norm": 0.6281525492668152, + "grad_norm": 0.6256266832351685, "learning_rate": 4e-05, - "loss": 0.9719, + "loss": 0.9723, "num_input_tokens_seen": 6553600, "step": 800 }, { "epoch": 0.038895558223289314, - "grad_norm": 0.8006066083908081, + "grad_norm": 0.8010514974594116, "learning_rate": 4.05e-05, "loss": 1.1387, "num_input_tokens_seen": 6635520, @@ -658,223 +658,223 @@ }, { "epoch": 0.03937575030012005, - "grad_norm": 0.6581395268440247, + "grad_norm": 0.6819067001342773, "learning_rate": 4.1e-05, - "loss": 1.0304, + "loss": 1.0296, "num_input_tokens_seen": 6717440, "step": 820 }, { "epoch": 0.03985594237695078, - "grad_norm": 0.5860041379928589, + "grad_norm": 0.6031268239021301, "learning_rate": 4.15e-05, - "loss": 0.9735, + "loss": 0.9731, "num_input_tokens_seen": 6799360, "step": 830 }, { "epoch": 0.040336134453781515, - "grad_norm": 0.6659871339797974, + "grad_norm": 0.6935213804244995, "learning_rate": 4.2e-05, - "loss": 0.9434, + "loss": 0.9446, "num_input_tokens_seen": 6881280, "step": 840 }, { "epoch": 0.04081632653061224, - "grad_norm": 0.6158783435821533, + "grad_norm": 0.6307564377784729, "learning_rate": 4.25e-05, - "loss": 0.9919, + "loss": 0.9918, "num_input_tokens_seen": 6963200, "step": 850 }, { "epoch": 0.041296518607442975, - "grad_norm": 1.7883119583129883, + "grad_norm": 1.976820468902588, "learning_rate": 4.3e-05, - "loss": 0.9973, + "loss": 0.9964, "num_input_tokens_seen": 7045120, "step": 860 }, { "epoch": 0.04177671068427371, - "grad_norm": 0.6660032272338867, + "grad_norm": 0.6267603635787964, "learning_rate": 4.35e-05, - "loss": 1.0275, + "loss": 1.0334, "num_input_tokens_seen": 7127040, "step": 870 }, { "epoch": 0.04225690276110444, - "grad_norm": 0.7275106906890869, + "grad_norm": 0.717856228351593, "learning_rate": 4.4000000000000006e-05, - "loss": 0.8301, + "loss": 0.8306, "num_input_tokens_seen": 7208960, "step": 880 }, { "epoch": 0.042737094837935176, - "grad_norm": 0.7131916880607605, + "grad_norm": 0.677495002746582, "learning_rate": 4.4500000000000004e-05, - "loss": 1.3488, + "loss": 1.3483, "num_input_tokens_seen": 7290880, "step": 890 }, { "epoch": 0.04321728691476591, - "grad_norm": 0.6134268045425415, + "grad_norm": 0.6296849250793457, "learning_rate": 4.5e-05, - "loss": 1.1191, + "loss": 1.1189, "num_input_tokens_seen": 7372800, "step": 900 }, { "epoch": 0.043697478991596636, - "grad_norm": 0.6196214556694031, + "grad_norm": 0.5991678833961487, "learning_rate": 4.55e-05, - "loss": 1.179, + "loss": 1.1789, "num_input_tokens_seen": 7454720, "step": 910 }, { "epoch": 0.04417767106842737, - "grad_norm": 0.626262903213501, + "grad_norm": 0.6427794098854065, "learning_rate": 4.600000000000001e-05, - "loss": 1.0239, + "loss": 1.0259, "num_input_tokens_seen": 7536640, "step": 920 }, { "epoch": 0.0446578631452581, - "grad_norm": 0.7423630952835083, + "grad_norm": 0.7176418900489807, "learning_rate": 4.6500000000000005e-05, - "loss": 1.0599, + "loss": 1.0611, "num_input_tokens_seen": 7618560, "step": 930 }, { "epoch": 0.04513805522208884, - "grad_norm": 0.6767206788063049, + "grad_norm": 0.6509612798690796, "learning_rate": 4.7e-05, - "loss": 0.9898, + "loss": 0.9896, "num_input_tokens_seen": 7700480, "step": 940 }, { "epoch": 0.04561824729891957, - "grad_norm": 0.8346232175827026, + "grad_norm": 0.7317907214164734, "learning_rate": 4.75e-05, - "loss": 0.9133, + "loss": 0.9137, "num_input_tokens_seen": 7782400, "step": 950 }, { "epoch": 0.0460984393757503, - "grad_norm": 0.5462753176689148, + "grad_norm": 0.567057728767395, "learning_rate": 4.8e-05, - "loss": 0.8747, + "loss": 0.875, "num_input_tokens_seen": 7864320, "step": 960 }, { "epoch": 0.04657863145258103, - "grad_norm": 0.6095471978187561, + "grad_norm": 0.6136640310287476, "learning_rate": 4.85e-05, - "loss": 1.0154, + "loss": 1.0156, "num_input_tokens_seen": 7946240, "step": 970 }, { "epoch": 0.047058823529411764, - "grad_norm": 0.555296778678894, + "grad_norm": 0.5537669658660889, "learning_rate": 4.9e-05, - "loss": 0.8588, + "loss": 0.8589, "num_input_tokens_seen": 8028160, "step": 980 }, { "epoch": 0.0475390156062425, - "grad_norm": 0.6341934204101562, + "grad_norm": 0.6746324300765991, "learning_rate": 4.9500000000000004e-05, - "loss": 0.8407, + "loss": 0.8402, "num_input_tokens_seen": 8110080, "step": 990 }, { "epoch": 0.04801920768307323, - "grad_norm": 0.6310316324234009, + "grad_norm": 0.5586228370666504, "learning_rate": 5e-05, - "loss": 0.8889, + "loss": 0.8892, "num_input_tokens_seen": 8192000, "step": 1000 }, { "epoch": 0.048499399759903965, - "grad_norm": 0.6559818983078003, + "grad_norm": 0.7242605686187744, "learning_rate": 4.9999968610581127e-05, - "loss": 1.0226, + "loss": 1.0213, "num_input_tokens_seen": 8273920, "step": 1010 }, { "epoch": 0.04897959183673469, - "grad_norm": 0.6584986448287964, + "grad_norm": 0.6588850021362305, "learning_rate": 4.9999874442403314e-05, - "loss": 1.0976, + "loss": 1.0984, "num_input_tokens_seen": 8355840, "step": 1020 }, { "epoch": 0.049459783913565425, - "grad_norm": 0.9989052414894104, + "grad_norm": 1.087486982345581, "learning_rate": 4.999971749570305e-05, - "loss": 1.1658, + "loss": 1.167, "num_input_tokens_seen": 8437760, "step": 1030 }, { "epoch": 0.04993997599039616, - "grad_norm": 0.6315760612487793, + "grad_norm": 0.6291378736495972, "learning_rate": 4.999949777087444e-05, - "loss": 0.9763, + "loss": 0.9767, "num_input_tokens_seen": 8519680, "step": 1040 }, { "epoch": 0.05042016806722689, - "grad_norm": 0.6868481636047363, + "grad_norm": 0.711715579032898, "learning_rate": 4.999921526846925e-05, - "loss": 0.8407, + "loss": 0.8419, "num_input_tokens_seen": 8601600, "step": 1050 }, { "epoch": 0.050900360144057626, - "grad_norm": 0.6122430562973022, + "grad_norm": 0.6777693629264832, "learning_rate": 4.9998869989196885e-05, - "loss": 0.9688, + "loss": 0.9681, "num_input_tokens_seen": 8683520, "step": 1060 }, { "epoch": 0.05138055222088835, - "grad_norm": 0.6200319528579712, + "grad_norm": 0.6924950480461121, "learning_rate": 4.99984619339244e-05, - "loss": 0.9459, + "loss": 0.9462, "num_input_tokens_seen": 8765440, "step": 1070 }, { "epoch": 0.051860744297719086, - "grad_norm": 0.6406499147415161, + "grad_norm": 0.6903403997421265, "learning_rate": 4.999799110367648e-05, - "loss": 1.0055, + "loss": 1.0044, "num_input_tokens_seen": 8847360, "step": 1080 }, { "epoch": 0.05234093637454982, - "grad_norm": 0.5609921813011169, + "grad_norm": 0.5596834421157837, "learning_rate": 4.999745749963545e-05, "loss": 1.0218, "num_input_tokens_seen": 8929280, @@ -882,79 +882,79 @@ }, { "epoch": 0.05282112845138055, - "grad_norm": 0.6337623596191406, + "grad_norm": 0.7687702775001526, "learning_rate": 4.999686112314127e-05, - "loss": 0.8797, + "loss": 0.8808, "num_input_tokens_seen": 9011200, "step": 1100 }, { "epoch": 0.053301320528211286, - "grad_norm": 0.6383864879608154, + "grad_norm": 0.633491575717926, "learning_rate": 4.999620197569155e-05, - "loss": 1.1579, + "loss": 1.1594, "num_input_tokens_seen": 9093120, "step": 1110 }, { "epoch": 0.05378151260504202, - "grad_norm": 0.575879693031311, + "grad_norm": 0.6057960391044617, "learning_rate": 4.9995480058941483e-05, - "loss": 0.8893, + "loss": 0.8905, "num_input_tokens_seen": 9175040, "step": 1120 }, { "epoch": 0.05426170468187275, - "grad_norm": 0.6337655186653137, + "grad_norm": 0.6258108019828796, "learning_rate": 4.9994695374703934e-05, - "loss": 0.987, + "loss": 0.9875, "num_input_tokens_seen": 9256960, "step": 1130 }, { "epoch": 0.05474189675870348, - "grad_norm": 0.6570917367935181, + "grad_norm": 0.6195672154426575, "learning_rate": 4.9993847924949364e-05, - "loss": 0.8846, + "loss": 0.8852, "num_input_tokens_seen": 9338880, "step": 1140 }, { "epoch": 0.055222088835534214, - "grad_norm": 0.6110508441925049, + "grad_norm": 0.5793142914772034, "learning_rate": 4.999293771180584e-05, - "loss": 0.9093, + "loss": 0.91, "num_input_tokens_seen": 9420800, "step": 1150 }, { "epoch": 0.05570228091236495, - "grad_norm": 0.7141037583351135, + "grad_norm": 0.6474392414093018, "learning_rate": 4.999196473755905e-05, - "loss": 1.0216, + "loss": 1.0214, "num_input_tokens_seen": 9502720, "step": 1160 }, { "epoch": 0.05618247298919568, - "grad_norm": 0.5640223026275635, + "grad_norm": 0.5729678273200989, "learning_rate": 4.9990929004652287e-05, - "loss": 1.0765, + "loss": 1.0768, "num_input_tokens_seen": 9584640, "step": 1170 }, { "epoch": 0.05666266506602641, - "grad_norm": 0.4741804301738739, + "grad_norm": 0.4522230923175812, "learning_rate": 4.9989830515686434e-05, - "loss": 0.9421, + "loss": 0.9418, "num_input_tokens_seen": 9666560, "step": 1180 }, { "epoch": 0.05714285714285714, - "grad_norm": 0.4780801236629486, + "grad_norm": 0.4965748190879822, "learning_rate": 4.998866927341995e-05, "loss": 1.0168, "num_input_tokens_seen": 9748480, @@ -962,303 +962,303 @@ }, { "epoch": 0.057623049219687875, - "grad_norm": 0.6580250859260559, + "grad_norm": 0.6100621223449707, "learning_rate": 4.9987445280768916e-05, - "loss": 0.9398, + "loss": 0.9405, "num_input_tokens_seen": 9830400, "step": 1200 }, { "epoch": 0.05810324129651861, - "grad_norm": 0.571909487247467, + "grad_norm": 0.5823714733123779, "learning_rate": 4.998615854080695e-05, - "loss": 1.0907, + "loss": 1.0905, "num_input_tokens_seen": 9912320, "step": 1210 }, { "epoch": 0.05858343337334934, - "grad_norm": 0.6233389973640442, + "grad_norm": 0.6303861141204834, "learning_rate": 4.998480905676527e-05, - "loss": 0.9021, + "loss": 0.9024, "num_input_tokens_seen": 9994240, "step": 1220 }, { "epoch": 0.059063625450180075, - "grad_norm": 1.2051422595977783, + "grad_norm": 1.1989893913269043, "learning_rate": 4.998339683203261e-05, - "loss": 1.2031, + "loss": 1.2022, "num_input_tokens_seen": 10076160, "step": 1230 }, { "epoch": 0.0595438175270108, - "grad_norm": 0.6003074645996094, + "grad_norm": 0.6013513803482056, "learning_rate": 4.9981921870155314e-05, - "loss": 1.0543, + "loss": 1.0539, "num_input_tokens_seen": 10158080, "step": 1240 }, { "epoch": 0.060024009603841535, - "grad_norm": 0.5767567753791809, + "grad_norm": 0.6614658236503601, "learning_rate": 4.998038417483721e-05, - "loss": 1.0474, + "loss": 1.0482, "num_input_tokens_seen": 10240000, "step": 1250 }, { "epoch": 0.06050420168067227, - "grad_norm": 0.6394206285476685, + "grad_norm": 0.7018217444419861, "learning_rate": 4.9978783749939705e-05, - "loss": 1.0161, + "loss": 1.0175, "num_input_tokens_seen": 10321920, "step": 1260 }, { "epoch": 0.060984393757503, - "grad_norm": 0.6625099778175354, + "grad_norm": 0.6370293498039246, "learning_rate": 4.997712059948171e-05, - "loss": 0.9022, + "loss": 0.902, "num_input_tokens_seen": 10403840, "step": 1270 }, { "epoch": 0.061464585834333736, - "grad_norm": 0.6047447323799133, + "grad_norm": 0.5794985890388489, "learning_rate": 4.997539472763964e-05, - "loss": 0.8682, + "loss": 0.8692, "num_input_tokens_seen": 10485760, "step": 1280 }, { "epoch": 0.06194477791116446, - "grad_norm": 0.6154036521911621, + "grad_norm": 0.5841014385223389, "learning_rate": 4.9973606138747434e-05, - "loss": 1.147, + "loss": 1.1472, "num_input_tokens_seen": 10567680, "step": 1290 }, { "epoch": 0.062424969987995196, - "grad_norm": 0.8171849250793457, + "grad_norm": 0.7204465866088867, "learning_rate": 4.9971754837296516e-05, - "loss": 0.8778, + "loss": 0.8769, "num_input_tokens_seen": 10649600, "step": 1300 }, { "epoch": 0.06290516206482594, - "grad_norm": 0.8324936628341675, + "grad_norm": 0.794895350933075, "learning_rate": 4.9969840827935776e-05, - "loss": 1.249, + "loss": 1.2477, "num_input_tokens_seen": 10731520, "step": 1310 }, { "epoch": 0.06338535414165666, - "grad_norm": 0.5857701897621155, + "grad_norm": 0.6169483661651611, "learning_rate": 4.99678641154716e-05, - "loss": 0.6743, + "loss": 0.674, "num_input_tokens_seen": 10813440, "step": 1320 }, { "epoch": 0.06386554621848739, - "grad_norm": 0.6449385285377502, + "grad_norm": 0.6169522404670715, "learning_rate": 4.9965824704867806e-05, - "loss": 1.0631, + "loss": 1.063, "num_input_tokens_seen": 10895360, "step": 1330 }, { "epoch": 0.06434573829531813, - "grad_norm": 0.5496826767921448, + "grad_norm": 0.5324148535728455, "learning_rate": 4.996372260124567e-05, - "loss": 0.8721, + "loss": 0.8696, "num_input_tokens_seen": 10977280, "step": 1340 }, { "epoch": 0.06482593037214886, - "grad_norm": 0.6244034767150879, + "grad_norm": 0.66362065076828, "learning_rate": 4.996155780988389e-05, - "loss": 0.7503, + "loss": 0.7504, "num_input_tokens_seen": 11059200, "step": 1350 }, { "epoch": 0.0653061224489796, - "grad_norm": 0.5835392475128174, + "grad_norm": 0.5489897727966309, "learning_rate": 4.9959330336218605e-05, - "loss": 0.8589, + "loss": 0.8579, "num_input_tokens_seen": 11141120, "step": 1360 }, { "epoch": 0.06578631452581032, - "grad_norm": 0.5886926054954529, + "grad_norm": 0.5780734419822693, "learning_rate": 4.995704018584334e-05, - "loss": 0.9261, + "loss": 0.9259, "num_input_tokens_seen": 11223040, "step": 1370 }, { "epoch": 0.06626650660264105, - "grad_norm": 0.6225190758705139, + "grad_norm": 0.596081018447876, "learning_rate": 4.9954687364508996e-05, - "loss": 0.7139, + "loss": 0.7141, "num_input_tokens_seen": 11304960, "step": 1380 }, { "epoch": 0.06674669867947179, - "grad_norm": 0.7748022079467773, + "grad_norm": 0.6452916264533997, "learning_rate": 4.995227187812389e-05, - "loss": 0.8474, + "loss": 0.8471, "num_input_tokens_seen": 11386880, "step": 1390 }, { "epoch": 0.06722689075630252, - "grad_norm": 0.625622570514679, + "grad_norm": 0.63129723072052, "learning_rate": 4.9949793732753656e-05, - "loss": 1.0527, + "loss": 1.0521, "num_input_tokens_seen": 11468800, "step": 1400 }, { "epoch": 0.06770708283313326, - "grad_norm": 0.5124455094337463, + "grad_norm": 0.48341453075408936, "learning_rate": 4.994725293462132e-05, - "loss": 0.9405, + "loss": 0.9402, "num_input_tokens_seen": 11550720, "step": 1410 }, { "epoch": 0.06818727490996399, - "grad_norm": 0.5305024981498718, + "grad_norm": 0.5027806758880615, "learning_rate": 4.994464949010722e-05, - "loss": 0.9588, + "loss": 0.9594, "num_input_tokens_seen": 11632640, "step": 1420 }, { "epoch": 0.06866746698679471, - "grad_norm": 0.5525708198547363, + "grad_norm": 0.5715320706367493, "learning_rate": 4.994198340574898e-05, - "loss": 0.9032, + "loss": 0.9039, "num_input_tokens_seen": 11714560, "step": 1430 }, { "epoch": 0.06914765906362545, - "grad_norm": 0.5921324491500854, + "grad_norm": 0.5546749234199524, "learning_rate": 4.993925468824156e-05, - "loss": 0.8397, + "loss": 0.8399, "num_input_tokens_seen": 11796480, "step": 1440 }, { "epoch": 0.06962785114045618, - "grad_norm": 0.5875818729400635, + "grad_norm": 0.5861151814460754, "learning_rate": 4.99364633444372e-05, - "loss": 0.9019, + "loss": 0.9013, "num_input_tokens_seen": 11878400, "step": 1450 }, { "epoch": 0.07010804321728692, - "grad_norm": 0.49930045008659363, + "grad_norm": 0.4911801815032959, "learning_rate": 4.993360938134537e-05, - "loss": 0.999, + "loss": 0.9985, "num_input_tokens_seen": 11960320, "step": 1460 }, { "epoch": 0.07058823529411765, - "grad_norm": 0.5186768770217896, + "grad_norm": 0.4994650185108185, "learning_rate": 4.993069280613282e-05, - "loss": 1.0207, + "loss": 1.0197, "num_input_tokens_seen": 12042240, "step": 1470 }, { "epoch": 0.07106842737094837, - "grad_norm": 0.6106103658676147, + "grad_norm": 0.5857521295547485, "learning_rate": 4.9927713626123524e-05, - "loss": 0.9636, + "loss": 0.964, "num_input_tokens_seen": 12124160, "step": 1480 }, { "epoch": 0.07154861944777911, - "grad_norm": 0.5551833510398865, + "grad_norm": 0.522347092628479, "learning_rate": 4.992467184879865e-05, - "loss": 0.9386, + "loss": 0.9385, "num_input_tokens_seen": 12206080, "step": 1490 }, { "epoch": 0.07202881152460984, - "grad_norm": 0.7018815279006958, + "grad_norm": 0.6849465370178223, "learning_rate": 4.9921567481796585e-05, - "loss": 1.0065, + "loss": 1.0057, "num_input_tokens_seen": 12288000, "step": 1500 }, { "epoch": 0.07250900360144058, - "grad_norm": 0.5908045768737793, + "grad_norm": 0.6455889940261841, "learning_rate": 4.9918400532912845e-05, - "loss": 0.9552, + "loss": 0.955, "num_input_tokens_seen": 12369920, "step": 1510 }, { "epoch": 0.07298919567827131, - "grad_norm": 0.9950196146965027, + "grad_norm": 0.8505333065986633, "learning_rate": 4.991517101010015e-05, - "loss": 0.9035, + "loss": 0.9024, "num_input_tokens_seen": 12451840, "step": 1520 }, { "epoch": 0.07346938775510205, - "grad_norm": 0.5551639199256897, + "grad_norm": 0.6128160953521729, "learning_rate": 4.9911878921468304e-05, - "loss": 1.0423, + "loss": 1.0432, "num_input_tokens_seen": 12533760, "step": 1530 }, { "epoch": 0.07394957983193277, - "grad_norm": 0.5617671012878418, + "grad_norm": 0.587476372718811, "learning_rate": 4.990852427528427e-05, - "loss": 0.9216, + "loss": 0.9204, "num_input_tokens_seen": 12615680, "step": 1540 }, { "epoch": 0.0744297719087635, - "grad_norm": 0.6270405650138855, + "grad_norm": 0.6319882273674011, "learning_rate": 4.9905107079972064e-05, - "loss": 0.9563, + "loss": 0.9548, "num_input_tokens_seen": 12697600, "step": 1550 }, { "epoch": 0.07490996398559424, - "grad_norm": 0.5399815440177917, + "grad_norm": 0.5549477934837341, "learning_rate": 4.990162734411279e-05, - "loss": 0.9665, + "loss": 0.9657, "num_input_tokens_seen": 12779520, "step": 1560 }, { "epoch": 0.07539015606242497, - "grad_norm": 0.55797278881073, + "grad_norm": 0.531639575958252, "learning_rate": 4.989808507644461e-05, "loss": 1.0059, "num_input_tokens_seen": 12861440, @@ -1266,7 +1266,7 @@ }, { "epoch": 0.07587034813925571, - "grad_norm": 0.5673569440841675, + "grad_norm": 0.5933917164802551, "learning_rate": 4.989448028586269e-05, "loss": 0.9245, "num_input_tokens_seen": 12943360, @@ -1274,55 +1274,55 @@ }, { "epoch": 0.07635054021608643, - "grad_norm": 0.5120587348937988, + "grad_norm": 0.5222373604774475, "learning_rate": 4.989081298141921e-05, - "loss": 0.8285, + "loss": 0.8293, "num_input_tokens_seen": 13025280, "step": 1590 }, { "epoch": 0.07683073229291716, - "grad_norm": 0.6352382898330688, + "grad_norm": 0.60689777135849, "learning_rate": 4.988708317232334e-05, - "loss": 0.8067, + "loss": 0.8068, "num_input_tokens_seen": 13107200, "step": 1600 }, { "epoch": 0.0773109243697479, - "grad_norm": 0.6657270789146423, + "grad_norm": 0.7311990261077881, "learning_rate": 4.988329086794122e-05, - "loss": 0.8029, + "loss": 0.8037, "num_input_tokens_seen": 13189120, "step": 1610 }, { "epoch": 0.07779111644657863, - "grad_norm": 0.5678449869155884, + "grad_norm": 0.5419260263442993, "learning_rate": 4.9879436077795884e-05, - "loss": 1.1687, + "loss": 1.1678, "num_input_tokens_seen": 13271040, "step": 1620 }, { "epoch": 0.07827130852340937, - "grad_norm": 0.5785603523254395, + "grad_norm": 0.6668097972869873, "learning_rate": 4.98755188115673e-05, - "loss": 0.9474, + "loss": 0.946, "num_input_tokens_seen": 13352960, "step": 1630 }, { "epoch": 0.0787515006002401, - "grad_norm": 0.5578638315200806, + "grad_norm": 0.5447852611541748, "learning_rate": 4.9871539079092344e-05, - "loss": 0.9421, + "loss": 0.9426, "num_input_tokens_seen": 13434880, "step": 1640 }, { "epoch": 0.07923169267707082, - "grad_norm": 0.9270123243331909, + "grad_norm": 0.9077025055885315, "learning_rate": 4.9867496890364726e-05, "loss": 1.0174, "num_input_tokens_seen": 13516800, @@ -1330,127 +1330,127 @@ }, { "epoch": 0.07971188475390156, - "grad_norm": 0.4947386085987091, + "grad_norm": 0.5434586405754089, "learning_rate": 4.9863392255535e-05, - "loss": 1.1363, + "loss": 1.1374, "num_input_tokens_seen": 13598720, "step": 1660 }, { "epoch": 0.08019207683073229, - "grad_norm": 1.040479063987732, + "grad_norm": 1.1956346035003662, "learning_rate": 4.985922518491054e-05, - "loss": 0.9505, + "loss": 0.9507, "num_input_tokens_seen": 13680640, "step": 1670 }, { "epoch": 0.08067226890756303, - "grad_norm": 0.6103275418281555, + "grad_norm": 0.6139404773712158, "learning_rate": 4.9854995688955494e-05, - "loss": 1.0416, + "loss": 1.0419, "num_input_tokens_seen": 13762560, "step": 1680 }, { "epoch": 0.08115246098439376, - "grad_norm": 0.5951912999153137, + "grad_norm": 0.5825724601745605, "learning_rate": 4.9850703778290784e-05, - "loss": 1.164, + "loss": 1.1634, "num_input_tokens_seen": 13844480, "step": 1690 }, { "epoch": 0.08163265306122448, - "grad_norm": 0.5379506945610046, + "grad_norm": 0.5449261665344238, "learning_rate": 4.984634946369404e-05, - "loss": 1.0884, + "loss": 1.0925, "num_input_tokens_seen": 13926400, "step": 1700 }, { "epoch": 0.08211284513805522, - "grad_norm": 0.6716327667236328, + "grad_norm": 0.6387315988540649, "learning_rate": 4.984193275609964e-05, - "loss": 1.2702, + "loss": 1.2723, "num_input_tokens_seen": 14008320, "step": 1710 }, { "epoch": 0.08259303721488595, - "grad_norm": 0.555696964263916, + "grad_norm": 0.5872774720191956, "learning_rate": 4.983745366659859e-05, - "loss": 0.9906, + "loss": 0.9903, "num_input_tokens_seen": 14090240, "step": 1720 }, { "epoch": 0.08307322929171669, - "grad_norm": 0.5970556735992432, + "grad_norm": 0.6030364632606506, "learning_rate": 4.983291220643858e-05, - "loss": 0.9924, + "loss": 0.9931, "num_input_tokens_seen": 14172160, "step": 1730 }, { "epoch": 0.08355342136854742, - "grad_norm": 0.5664453506469727, + "grad_norm": 0.5457348227500916, "learning_rate": 4.982830838702392e-05, - "loss": 1.0358, + "loss": 1.0356, "num_input_tokens_seen": 14254080, "step": 1740 }, { "epoch": 0.08403361344537816, - "grad_norm": 1.6528098583221436, + "grad_norm": 1.6807262897491455, "learning_rate": 4.98236422199155e-05, - "loss": 0.9054, + "loss": 0.9048, "num_input_tokens_seen": 14336000, "step": 1750 }, { "epoch": 0.08451380552220888, - "grad_norm": 0.5737955570220947, + "grad_norm": 0.5328593850135803, "learning_rate": 4.9818913716830784e-05, - "loss": 0.985, + "loss": 0.9876, "num_input_tokens_seen": 14417920, "step": 1760 }, { "epoch": 0.08499399759903961, - "grad_norm": 0.5684435963630676, + "grad_norm": 0.5556053519248962, "learning_rate": 4.981412288964377e-05, - "loss": 1.0279, + "loss": 1.0288, "num_input_tokens_seen": 14499840, "step": 1770 }, { "epoch": 0.08547418967587035, - "grad_norm": 0.5723880529403687, + "grad_norm": 0.5741177797317505, "learning_rate": 4.9809269750384956e-05, - "loss": 1.01, + "loss": 1.0098, "num_input_tokens_seen": 14581760, "step": 1780 }, { "epoch": 0.08595438175270108, - "grad_norm": 0.5838521122932434, + "grad_norm": 0.5520077347755432, "learning_rate": 4.980435431124133e-05, - "loss": 1.1156, + "loss": 1.1153, "num_input_tokens_seen": 14663680, "step": 1790 }, { "epoch": 0.08643457382953182, - "grad_norm": 0.45137259364128113, + "grad_norm": 0.4301939010620117, "learning_rate": 4.97993765845563e-05, - "loss": 0.9487, + "loss": 0.9484, "num_input_tokens_seen": 14745600, "step": 1800 }, { "epoch": 0.08691476590636255, - "grad_norm": 0.5664239525794983, + "grad_norm": 0.5945805311203003, "learning_rate": 4.9794336582829714e-05, "loss": 0.9613, "num_input_tokens_seen": 14827520, @@ -1458,55 +1458,55 @@ }, { "epoch": 0.08739495798319327, - "grad_norm": 0.7043583989143372, + "grad_norm": 0.7423526644706726, "learning_rate": 4.9789234318717784e-05, - "loss": 1.0248, + "loss": 1.0271, "num_input_tokens_seen": 14909440, "step": 1820 }, { "epoch": 0.08787515006002401, - "grad_norm": 0.5338053703308105, + "grad_norm": 0.5542834401130676, "learning_rate": 4.978406980503308e-05, - "loss": 1.0187, + "loss": 1.0182, "num_input_tokens_seen": 14991360, "step": 1830 }, { "epoch": 0.08835534213685474, - "grad_norm": 0.5779690146446228, + "grad_norm": 0.5887225270271301, "learning_rate": 4.9778843054744494e-05, - "loss": 0.9926, + "loss": 0.9922, "num_input_tokens_seen": 15073280, "step": 1840 }, { "epoch": 0.08883553421368548, - "grad_norm": 0.5544894337654114, + "grad_norm": 0.5537407398223877, "learning_rate": 4.977355408097719e-05, - "loss": 0.9672, + "loss": 0.9673, "num_input_tokens_seen": 15155200, "step": 1850 }, { "epoch": 0.0893157262905162, - "grad_norm": 0.7703062891960144, + "grad_norm": 0.709483802318573, "learning_rate": 4.9768202897012595e-05, - "loss": 0.8497, + "loss": 0.8494, "num_input_tokens_seen": 15237120, "step": 1860 }, { "epoch": 0.08979591836734693, - "grad_norm": 0.5235300660133362, + "grad_norm": 0.5220293402671814, "learning_rate": 4.9762789516288354e-05, - "loss": 1.0258, + "loss": 1.0266, "num_input_tokens_seen": 15319040, "step": 1870 }, { "epoch": 0.09027611044417767, - "grad_norm": 0.5461897253990173, + "grad_norm": 0.5254554748535156, "learning_rate": 4.97573139523983e-05, "loss": 0.9799, "num_input_tokens_seen": 15400960, @@ -1514,271 +1514,271 @@ }, { "epoch": 0.0907563025210084, - "grad_norm": 2.8185510635375977, + "grad_norm": 2.654348373413086, "learning_rate": 4.9751776219092405e-05, - "loss": 1.0616, + "loss": 1.0606, "num_input_tokens_seen": 15482880, "step": 1890 }, { "epoch": 0.09123649459783914, - "grad_norm": 0.5489822030067444, + "grad_norm": 0.5544585585594177, "learning_rate": 4.9746176330276783e-05, - "loss": 0.894, + "loss": 0.8944, "num_input_tokens_seen": 15564800, "step": 1900 }, { "epoch": 0.09171668667466987, - "grad_norm": 0.5419192910194397, + "grad_norm": 0.49567174911499023, "learning_rate": 4.97405143000136e-05, - "loss": 1.069, + "loss": 1.0702, "num_input_tokens_seen": 15646720, "step": 1910 }, { "epoch": 0.0921968787515006, - "grad_norm": 0.5786951184272766, + "grad_norm": 0.582963228225708, "learning_rate": 4.9734790142521096e-05, - "loss": 1.0716, + "loss": 1.0714, "num_input_tokens_seen": 15728640, "step": 1920 }, { "epoch": 0.09267707082833133, - "grad_norm": 0.5193120837211609, + "grad_norm": 0.48466384410858154, "learning_rate": 4.9729003872173494e-05, - "loss": 1.1271, + "loss": 1.1282, "num_input_tokens_seen": 15810560, "step": 1930 }, { "epoch": 0.09315726290516206, - "grad_norm": 0.5859541296958923, + "grad_norm": 0.6081234216690063, "learning_rate": 4.972315550350102e-05, - "loss": 0.9171, + "loss": 0.9172, "num_input_tokens_seen": 15892480, "step": 1940 }, { "epoch": 0.0936374549819928, - "grad_norm": 1.0095491409301758, + "grad_norm": 1.0088104009628296, "learning_rate": 4.971724505118982e-05, - "loss": 0.9881, + "loss": 0.9888, "num_input_tokens_seen": 15974400, "step": 1950 }, { "epoch": 0.09411764705882353, - "grad_norm": 0.5563061237335205, + "grad_norm": 0.5905680060386658, "learning_rate": 4.971127253008194e-05, - "loss": 1.0533, + "loss": 1.052, "num_input_tokens_seen": 16056320, "step": 1960 }, { "epoch": 0.09459783913565427, - "grad_norm": 0.5516453385353088, + "grad_norm": 0.5327790975570679, "learning_rate": 4.970523795517532e-05, - "loss": 1.0697, + "loss": 1.0703, "num_input_tokens_seen": 16138240, "step": 1970 }, { "epoch": 0.095078031212485, - "grad_norm": 0.6930283308029175, + "grad_norm": 0.6335835456848145, "learning_rate": 4.969914134162368e-05, - "loss": 0.8598, + "loss": 0.8602, "num_input_tokens_seen": 16220160, "step": 1980 }, { "epoch": 0.09555822328931572, - "grad_norm": 0.5376786589622498, + "grad_norm": 0.5414095520973206, "learning_rate": 4.9692982704736566e-05, - "loss": 0.9143, + "loss": 0.914, "num_input_tokens_seen": 16302080, "step": 1990 }, { "epoch": 0.09603841536614646, - "grad_norm": 0.849349856376648, + "grad_norm": 0.864175021648407, "learning_rate": 4.968676205997925e-05, - "loss": 0.8093, + "loss": 0.8096, "num_input_tokens_seen": 16384000, "step": 2000 }, { "epoch": 0.09651860744297719, - "grad_norm": 0.560719907283783, + "grad_norm": 0.5518527626991272, "learning_rate": 4.9680479422972735e-05, - "loss": 1.0241, + "loss": 1.0233, "num_input_tokens_seen": 16465920, "step": 2010 }, { "epoch": 0.09699879951980793, - "grad_norm": 0.5490632653236389, + "grad_norm": 0.567816972732544, "learning_rate": 4.9674134809493686e-05, - "loss": 0.9488, + "loss": 0.9492, "num_input_tokens_seen": 16547840, "step": 2020 }, { "epoch": 0.09747899159663866, - "grad_norm": 0.45328429341316223, + "grad_norm": 0.4478549063205719, "learning_rate": 4.9667728235474396e-05, - "loss": 0.9864, + "loss": 0.9868, "num_input_tokens_seen": 16629760, "step": 2030 }, { "epoch": 0.09795918367346938, - "grad_norm": 0.5597791075706482, + "grad_norm": 0.5483748316764832, "learning_rate": 4.9661259717002764e-05, - "loss": 0.8736, + "loss": 0.8727, "num_input_tokens_seen": 16711680, "step": 2040 }, { "epoch": 0.09843937575030012, - "grad_norm": 0.6936686038970947, + "grad_norm": 0.6592475175857544, "learning_rate": 4.9654729270322234e-05, - "loss": 1.0361, + "loss": 1.0359, "num_input_tokens_seen": 16793600, "step": 2050 }, { "epoch": 0.09891956782713085, - "grad_norm": 0.5862544178962708, + "grad_norm": 0.6033288240432739, "learning_rate": 4.964813691183174e-05, - "loss": 0.9958, + "loss": 0.9957, "num_input_tokens_seen": 16875520, "step": 2060 }, { "epoch": 0.09939975990396159, - "grad_norm": 2.3423945903778076, + "grad_norm": 2.248500108718872, "learning_rate": 4.964148265808573e-05, - "loss": 0.8843, + "loss": 0.8838, "num_input_tokens_seen": 16957440, "step": 2070 }, { "epoch": 0.09987995198079232, - "grad_norm": 0.5327481031417847, + "grad_norm": 0.5395046472549438, "learning_rate": 4.963476652579404e-05, - "loss": 0.8698, + "loss": 0.8697, "num_input_tokens_seen": 17039360, "step": 2080 }, { "epoch": 0.10036014405762304, - "grad_norm": 0.5698966979980469, + "grad_norm": 0.5363647937774658, "learning_rate": 4.962798853182192e-05, - "loss": 1.1505, + "loss": 1.1534, "num_input_tokens_seen": 17121280, "step": 2090 }, { "epoch": 0.10084033613445378, - "grad_norm": 1.078123688697815, + "grad_norm": 0.936784565448761, "learning_rate": 4.9621148693189954e-05, - "loss": 1.131, + "loss": 1.1309, "num_input_tokens_seen": 17203200, "step": 2100 }, { "epoch": 0.10132052821128451, - "grad_norm": 0.4983321726322174, + "grad_norm": 0.4972730576992035, "learning_rate": 4.9614247027074024e-05, - "loss": 1.2084, + "loss": 1.208, "num_input_tokens_seen": 17285120, "step": 2110 }, { "epoch": 0.10180072028811525, - "grad_norm": 0.5719578266143799, + "grad_norm": 0.5752937197685242, "learning_rate": 4.960728355080527e-05, - "loss": 0.822, + "loss": 0.8207, "num_input_tokens_seen": 17367040, "step": 2120 }, { "epoch": 0.10228091236494598, - "grad_norm": 0.6987754106521606, + "grad_norm": 0.642749547958374, "learning_rate": 4.9600258281870046e-05, - "loss": 0.9156, + "loss": 0.914, "num_input_tokens_seen": 17448960, "step": 2130 }, { "epoch": 0.1027611044417767, - "grad_norm": 0.9254079461097717, + "grad_norm": 0.9153047204017639, "learning_rate": 4.959317123790988e-05, - "loss": 0.9211, + "loss": 0.9198, "num_input_tokens_seen": 17530880, "step": 2140 }, { "epoch": 0.10324129651860744, - "grad_norm": 0.5085683465003967, + "grad_norm": 0.52935391664505, "learning_rate": 4.958602243672145e-05, - "loss": 0.9979, + "loss": 0.9969, "num_input_tokens_seen": 17612800, "step": 2150 }, { "epoch": 0.10372148859543817, - "grad_norm": 0.6418617963790894, + "grad_norm": 0.5792514681816101, "learning_rate": 4.9578811896256475e-05, - "loss": 0.9711, + "loss": 0.9702, "num_input_tokens_seen": 17694720, "step": 2160 }, { "epoch": 0.10420168067226891, - "grad_norm": 0.5556450486183167, + "grad_norm": 0.5800937414169312, "learning_rate": 4.957153963462172e-05, - "loss": 0.9847, + "loss": 0.9843, "num_input_tokens_seen": 17776640, "step": 2170 }, { "epoch": 0.10468187274909964, - "grad_norm": 0.5734561085700989, + "grad_norm": 0.5817973613739014, "learning_rate": 4.9564205670078965e-05, - "loss": 0.9521, + "loss": 0.9514, "num_input_tokens_seen": 17858560, "step": 2180 }, { "epoch": 0.10516206482593037, - "grad_norm": 0.5195381045341492, + "grad_norm": 0.5222535729408264, "learning_rate": 4.955681002104492e-05, - "loss": 0.9161, + "loss": 0.9166, "num_input_tokens_seen": 17940480, "step": 2190 }, { "epoch": 0.1056422569027611, - "grad_norm": 0.5360829830169678, + "grad_norm": 0.5431318879127502, "learning_rate": 4.954935270609119e-05, - "loss": 0.944, + "loss": 0.9447, "num_input_tokens_seen": 18022400, "step": 2200 }, { "epoch": 0.10612244897959183, - "grad_norm": 0.5251927375793457, + "grad_norm": 0.5681139826774597, "learning_rate": 4.9541833743944244e-05, - "loss": 1.0233, + "loss": 1.0235, "num_input_tokens_seen": 18104320, "step": 2210 }, { "epoch": 0.10660264105642257, - "grad_norm": 0.5330286622047424, + "grad_norm": 0.5424164533615112, "learning_rate": 4.953425315348534e-05, "loss": 0.9329, "num_input_tokens_seen": 18186240, @@ -1786,207 +1786,207 @@ }, { "epoch": 0.1070828331332533, - "grad_norm": 0.5502789616584778, + "grad_norm": 0.5445407032966614, "learning_rate": 4.952661095375051e-05, - "loss": 0.9987, + "loss": 1.0003, "num_input_tokens_seen": 18268160, "step": 2230 }, { "epoch": 0.10756302521008404, - "grad_norm": 0.6187983751296997, + "grad_norm": 0.611050009727478, "learning_rate": 4.95189071639305e-05, - "loss": 0.8519, + "loss": 0.852, "num_input_tokens_seen": 18350080, "step": 2240 }, { "epoch": 0.10804321728691477, - "grad_norm": 0.5929238796234131, + "grad_norm": 0.5931539535522461, "learning_rate": 4.951114180337069e-05, - "loss": 0.8418, + "loss": 0.8421, "num_input_tokens_seen": 18432000, "step": 2250 }, { "epoch": 0.1085234093637455, - "grad_norm": 0.4955889880657196, + "grad_norm": 0.4846673309803009, "learning_rate": 4.95033148915711e-05, - "loss": 0.7989, + "loss": 0.7993, "num_input_tokens_seen": 18513920, "step": 2260 }, { "epoch": 0.10900360144057623, - "grad_norm": 0.5626336336135864, + "grad_norm": 0.5406256914138794, "learning_rate": 4.949542644818631e-05, - "loss": 0.8756, + "loss": 0.8758, "num_input_tokens_seen": 18595840, "step": 2270 }, { "epoch": 0.10948379351740696, - "grad_norm": 1.048539638519287, + "grad_norm": 0.9688089489936829, "learning_rate": 4.948747649302542e-05, - "loss": 1.0697, + "loss": 1.0688, "num_input_tokens_seen": 18677760, "step": 2280 }, { "epoch": 0.1099639855942377, - "grad_norm": 0.49436965584754944, + "grad_norm": 0.4601312577724457, "learning_rate": 4.947946504605198e-05, - "loss": 0.9839, + "loss": 0.9846, "num_input_tokens_seen": 18759680, "step": 2290 }, { "epoch": 0.11044417767106843, - "grad_norm": 0.5616737008094788, + "grad_norm": 0.5519710779190063, "learning_rate": 4.947139212738395e-05, - "loss": 0.9213, + "loss": 0.9205, "num_input_tokens_seen": 18841600, "step": 2300 }, { "epoch": 0.11092436974789915, - "grad_norm": 0.5444278120994568, + "grad_norm": 0.5775902271270752, "learning_rate": 4.946325775729368e-05, - "loss": 1.0463, + "loss": 1.0461, "num_input_tokens_seen": 18923520, "step": 2310 }, { "epoch": 0.1114045618247299, - "grad_norm": 0.521336555480957, + "grad_norm": 0.5453388690948486, "learning_rate": 4.945506195620784e-05, - "loss": 0.9103, + "loss": 0.9106, "num_input_tokens_seen": 19005440, "step": 2320 }, { "epoch": 0.11188475390156062, - "grad_norm": 0.5483266115188599, + "grad_norm": 0.5399138927459717, "learning_rate": 4.944680474470731e-05, - "loss": 0.9041, + "loss": 0.9038, "num_input_tokens_seen": 19087360, "step": 2330 }, { "epoch": 0.11236494597839136, - "grad_norm": 0.5167329907417297, + "grad_norm": 0.5245352983474731, "learning_rate": 4.943848614352724e-05, - "loss": 1.1012, + "loss": 1.1003, "num_input_tokens_seen": 19169280, "step": 2340 }, { "epoch": 0.11284513805522209, - "grad_norm": 0.539591372013092, + "grad_norm": 0.5824449062347412, "learning_rate": 4.943010617355691e-05, - "loss": 0.8855, + "loss": 0.8852, "num_input_tokens_seen": 19251200, "step": 2350 }, { "epoch": 0.11332533013205282, - "grad_norm": 1.1016952991485596, + "grad_norm": 1.141217827796936, "learning_rate": 4.94216648558397e-05, - "loss": 0.9905, + "loss": 0.9907, "num_input_tokens_seen": 19333120, "step": 2360 }, { "epoch": 0.11380552220888356, - "grad_norm": 0.5307420492172241, + "grad_norm": 0.5422025322914124, "learning_rate": 4.9413162211573075e-05, - "loss": 0.8465, + "loss": 0.8463, "num_input_tokens_seen": 19415040, "step": 2370 }, { "epoch": 0.11428571428571428, - "grad_norm": 0.5067051649093628, + "grad_norm": 0.5168782472610474, "learning_rate": 4.9404598262108456e-05, - "loss": 1.3091, + "loss": 1.3157, "num_input_tokens_seen": 19496960, "step": 2380 }, { "epoch": 0.11476590636254502, - "grad_norm": 0.5648155212402344, + "grad_norm": 0.5548056364059448, "learning_rate": 4.939597302895125e-05, - "loss": 1.0505, + "loss": 1.0501, "num_input_tokens_seen": 19578880, "step": 2390 }, { "epoch": 0.11524609843937575, - "grad_norm": 0.5425604581832886, + "grad_norm": 0.5792363286018372, "learning_rate": 4.938728653376075e-05, - "loss": 0.9624, + "loss": 0.9628, "num_input_tokens_seen": 19660800, "step": 2400 }, { "epoch": 0.11572629051620648, - "grad_norm": 0.6649373173713684, + "grad_norm": 0.6419684886932373, "learning_rate": 4.9378538798350046e-05, - "loss": 0.882, + "loss": 0.8814, "num_input_tokens_seen": 19742720, "step": 2410 }, { "epoch": 0.11620648259303722, - "grad_norm": 0.4412856698036194, + "grad_norm": 0.39345890283584595, "learning_rate": 4.936972984468608e-05, - "loss": 0.802, + "loss": 0.8009, "num_input_tokens_seen": 19824640, "step": 2420 }, { "epoch": 0.11668667466986794, - "grad_norm": 0.5264973044395447, + "grad_norm": 0.5162023305892944, "learning_rate": 4.936085969488947e-05, - "loss": 0.8687, + "loss": 0.8686, "num_input_tokens_seen": 19906560, "step": 2430 }, { "epoch": 0.11716686674669868, - "grad_norm": 0.5381867289543152, + "grad_norm": 0.5232594013214111, "learning_rate": 4.9351928371234525e-05, - "loss": 0.7774, + "loss": 0.7769, "num_input_tokens_seen": 19988480, "step": 2440 }, { "epoch": 0.11764705882352941, - "grad_norm": 0.5331231355667114, + "grad_norm": 0.5508478283882141, "learning_rate": 4.934293589614917e-05, - "loss": 0.8777, + "loss": 0.8771, "num_input_tokens_seen": 20070400, "step": 2450 }, { "epoch": 0.11812725090036015, - "grad_norm": 0.5481956601142883, + "grad_norm": 0.5470507740974426, "learning_rate": 4.93338822922149e-05, - "loss": 1.0353, + "loss": 1.0355, "num_input_tokens_seen": 20152320, "step": 2460 }, { "epoch": 0.11860744297719088, - "grad_norm": 0.5417446494102478, + "grad_norm": 0.5525845885276794, "learning_rate": 4.932476758216669e-05, - "loss": 0.9866, + "loss": 0.9881, "num_input_tokens_seen": 20234240, "step": 2470 }, { "epoch": 0.1190876350540216, - "grad_norm": 0.5857323408126831, + "grad_norm": 0.5766599178314209, "learning_rate": 4.931559178889297e-05, "loss": 0.9443, "num_input_tokens_seen": 20316160, @@ -1994,199 +1994,199 @@ }, { "epoch": 0.11956782713085234, - "grad_norm": 0.7107313871383667, + "grad_norm": 0.7670407295227051, "learning_rate": 4.9306354935435594e-05, - "loss": 1.0362, + "loss": 1.0363, "num_input_tokens_seen": 20398080, "step": 2490 }, { "epoch": 0.12004801920768307, - "grad_norm": 0.5205745100975037, + "grad_norm": 0.5062960386276245, "learning_rate": 4.929705704498969e-05, - "loss": 1.0075, + "loss": 1.0079, "num_input_tokens_seen": 20480000, "step": 2500 }, { "epoch": 0.12052821128451381, - "grad_norm": 0.5517631769180298, + "grad_norm": 0.5576232671737671, "learning_rate": 4.928769814090371e-05, - "loss": 0.9074, + "loss": 0.9076, "num_input_tokens_seen": 20561920, "step": 2510 }, { "epoch": 0.12100840336134454, - "grad_norm": 0.5010194778442383, + "grad_norm": 0.5333806872367859, "learning_rate": 4.927827824667929e-05, - "loss": 1.0916, + "loss": 1.0913, "num_input_tokens_seen": 20643840, "step": 2520 }, { "epoch": 0.12148859543817526, - "grad_norm": 0.5098908543586731, + "grad_norm": 0.525405764579773, "learning_rate": 4.926879738597122e-05, - "loss": 1.0548, + "loss": 1.0558, "num_input_tokens_seen": 20725760, "step": 2530 }, { "epoch": 0.121968787515006, - "grad_norm": 0.8620312213897705, + "grad_norm": 0.835304319858551, "learning_rate": 4.925925558258741e-05, - "loss": 1.1967, + "loss": 1.1955, "num_input_tokens_seen": 20807680, "step": 2540 }, { "epoch": 0.12244897959183673, - "grad_norm": 0.5866915583610535, + "grad_norm": 0.5637728571891785, "learning_rate": 4.924965286048879e-05, - "loss": 0.9201, + "loss": 0.9205, "num_input_tokens_seen": 20889600, "step": 2550 }, { "epoch": 0.12292917166866747, - "grad_norm": 1.4908480644226074, + "grad_norm": 1.5548111200332642, "learning_rate": 4.9239989243789275e-05, - "loss": 1.0407, + "loss": 1.0418, "num_input_tokens_seen": 20971520, "step": 2560 }, { "epoch": 0.1234093637454982, - "grad_norm": 0.483049213886261, + "grad_norm": 0.5124868750572205, "learning_rate": 4.9230264756755685e-05, - "loss": 1.0231, + "loss": 1.0242, "num_input_tokens_seen": 21053440, "step": 2570 }, { "epoch": 0.12388955582232893, - "grad_norm": 2.7187938690185547, + "grad_norm": 2.6235156059265137, "learning_rate": 4.9220479423807694e-05, - "loss": 0.9814, + "loss": 0.9804, "num_input_tokens_seen": 21135360, "step": 2580 }, { "epoch": 0.12436974789915967, - "grad_norm": 0.5453813672065735, + "grad_norm": 0.5854002237319946, "learning_rate": 4.9210633269517776e-05, - "loss": 1.0074, + "loss": 1.0083, "num_input_tokens_seen": 21217280, "step": 2590 }, { "epoch": 0.12484993997599039, - "grad_norm": 0.5155650973320007, + "grad_norm": 0.5365314483642578, "learning_rate": 4.920072631861115e-05, - "loss": 0.9252, + "loss": 0.9283, "num_input_tokens_seen": 21299200, "step": 2600 }, { "epoch": 0.12533013205282112, - "grad_norm": 0.510693371295929, + "grad_norm": 0.5210400819778442, "learning_rate": 4.919075859596567e-05, - "loss": 1.1132, + "loss": 1.1134, "num_input_tokens_seen": 21381120, "step": 2610 }, { "epoch": 0.12581032412965187, - "grad_norm": 0.5092229843139648, + "grad_norm": 0.5012139678001404, "learning_rate": 4.918073012661183e-05, - "loss": 1.1313, + "loss": 1.1302, "num_input_tokens_seen": 21463040, "step": 2620 }, { "epoch": 0.1262905162064826, - "grad_norm": 0.5227720737457275, + "grad_norm": 0.522561252117157, "learning_rate": 4.9170640935732654e-05, - "loss": 1.0631, + "loss": 1.0637, "num_input_tokens_seen": 21544960, "step": 2630 }, { "epoch": 0.12677070828331333, - "grad_norm": 0.5651267766952515, + "grad_norm": 0.5822430849075317, "learning_rate": 4.916049104866365e-05, - "loss": 0.8485, + "loss": 0.8478, "num_input_tokens_seen": 21626880, "step": 2640 }, { "epoch": 0.12725090036014405, - "grad_norm": 0.8087652921676636, + "grad_norm": 0.8148403763771057, "learning_rate": 4.915028049089274e-05, - "loss": 0.9922, + "loss": 0.9934, "num_input_tokens_seen": 21708800, "step": 2650 }, { "epoch": 0.12773109243697478, - "grad_norm": 0.5781594514846802, + "grad_norm": 0.5749411582946777, "learning_rate": 4.914000928806021e-05, - "loss": 0.9331, + "loss": 0.9332, "num_input_tokens_seen": 21790720, "step": 2660 }, { "epoch": 0.12821128451380553, - "grad_norm": 0.5171418786048889, + "grad_norm": 0.5240984559059143, "learning_rate": 4.912967746595861e-05, - "loss": 1.0793, + "loss": 1.0782, "num_input_tokens_seen": 21872640, "step": 2670 }, { "epoch": 0.12869147659063626, - "grad_norm": 0.5301877856254578, + "grad_norm": 0.5066481828689575, "learning_rate": 4.911928505053275e-05, - "loss": 1.1306, + "loss": 1.1305, "num_input_tokens_seen": 21954560, "step": 2680 }, { "epoch": 0.129171668667467, - "grad_norm": 0.5109881162643433, + "grad_norm": 0.5011911392211914, "learning_rate": 4.9108832067879574e-05, - "loss": 0.9962, + "loss": 0.9965, "num_input_tokens_seen": 22036480, "step": 2690 }, { "epoch": 0.12965186074429771, - "grad_norm": 0.5722172856330872, + "grad_norm": 0.5225823521614075, "learning_rate": 4.909831854424812e-05, - "loss": 0.8064, + "loss": 0.8051, "num_input_tokens_seen": 22118400, "step": 2700 }, { "epoch": 0.13013205282112844, - "grad_norm": 0.5787602663040161, + "grad_norm": 0.5528846979141235, "learning_rate": 4.908774450603946e-05, - "loss": 0.9561, + "loss": 0.9556, "num_input_tokens_seen": 22200320, "step": 2710 }, { "epoch": 0.1306122448979592, - "grad_norm": 0.5278475284576416, + "grad_norm": 0.5039660930633545, "learning_rate": 4.907710997980664e-05, - "loss": 1.0554, + "loss": 1.0562, "num_input_tokens_seen": 22282240, "step": 2720 }, { "epoch": 0.13109243697478992, - "grad_norm": 0.5114937424659729, + "grad_norm": 0.5411632061004639, "learning_rate": 4.906641499225457e-05, "loss": 0.9203, "num_input_tokens_seen": 22364160, @@ -2194,111 +2194,111 @@ }, { "epoch": 0.13157262905162065, - "grad_norm": 0.5476466417312622, + "grad_norm": 0.5488880276679993, "learning_rate": 4.905565957024003e-05, - "loss": 0.9199, + "loss": 0.9196, "num_input_tokens_seen": 22446080, "step": 2740 }, { "epoch": 0.13205282112845138, - "grad_norm": 0.5202212929725647, + "grad_norm": 0.5375452637672424, "learning_rate": 4.9044843740771505e-05, - "loss": 0.9683, + "loss": 0.9674, "num_input_tokens_seen": 22528000, "step": 2750 }, { "epoch": 0.1325330132052821, - "grad_norm": 0.5196512341499329, + "grad_norm": 0.5489015579223633, "learning_rate": 4.9033967531009225e-05, - "loss": 1.1265, + "loss": 1.1255, "num_input_tokens_seen": 22609920, "step": 2760 }, { "epoch": 0.13301320528211286, - "grad_norm": 0.5623713731765747, + "grad_norm": 0.5846374034881592, "learning_rate": 4.902303096826502e-05, - "loss": 1.1676, + "loss": 1.1679, "num_input_tokens_seen": 22691840, "step": 2770 }, { "epoch": 0.13349339735894358, - "grad_norm": 0.6298018097877502, + "grad_norm": 0.5768160223960876, "learning_rate": 4.901203408000227e-05, - "loss": 0.8947, + "loss": 0.8948, "num_input_tokens_seen": 22773760, "step": 2780 }, { "epoch": 0.1339735894357743, - "grad_norm": 0.6308296322822571, + "grad_norm": 0.5849971771240234, "learning_rate": 4.9000976893835856e-05, - "loss": 1.1018, + "loss": 1.1007, "num_input_tokens_seen": 22855680, "step": 2790 }, { "epoch": 0.13445378151260504, - "grad_norm": 1.6783674955368042, + "grad_norm": 1.5311932563781738, "learning_rate": 4.898985943753207e-05, - "loss": 0.9954, + "loss": 0.9984, "num_input_tokens_seen": 22937600, "step": 2800 }, { "epoch": 0.13493397358943576, - "grad_norm": 0.5339183211326599, + "grad_norm": 0.5359712243080139, "learning_rate": 4.897868173900854e-05, - "loss": 0.7595, + "loss": 0.7598, "num_input_tokens_seen": 23019520, "step": 2810 }, { "epoch": 0.13541416566626652, - "grad_norm": 0.5435370802879333, + "grad_norm": 0.5451187491416931, "learning_rate": 4.89674438263342e-05, - "loss": 1.0, + "loss": 1.0037, "num_input_tokens_seen": 23101440, "step": 2820 }, { "epoch": 0.13589435774309724, - "grad_norm": 0.5289290547370911, + "grad_norm": 0.5637598633766174, "learning_rate": 4.8956145727729156e-05, - "loss": 0.9093, + "loss": 0.9097, "num_input_tokens_seen": 23183360, "step": 2830 }, { "epoch": 0.13637454981992797, - "grad_norm": 0.644011914730072, + "grad_norm": 0.7073699235916138, "learning_rate": 4.8944787471564686e-05, - "loss": 0.8391, + "loss": 0.8392, "num_input_tokens_seen": 23265280, "step": 2840 }, { "epoch": 0.1368547418967587, - "grad_norm": 0.5689689517021179, + "grad_norm": 0.558293879032135, "learning_rate": 4.89333690863631e-05, - "loss": 0.8903, + "loss": 0.8909, "num_input_tokens_seen": 23347200, "step": 2850 }, { "epoch": 0.13733493397358942, - "grad_norm": 0.5509852766990662, + "grad_norm": 0.5416772961616516, "learning_rate": 4.892189060079773e-05, - "loss": 0.9087, + "loss": 0.908, "num_input_tokens_seen": 23429120, "step": 2860 }, { "epoch": 0.13781512605042018, - "grad_norm": 0.6888849139213562, + "grad_norm": 0.6885920166969299, "learning_rate": 4.8910352043692806e-05, "loss": 0.8255, "num_input_tokens_seen": 23511040, @@ -2306,55 +2306,55 @@ }, { "epoch": 0.1382953181272509, - "grad_norm": 0.5926205515861511, + "grad_norm": 0.5959227085113525, "learning_rate": 4.889875344402342e-05, - "loss": 0.9486, + "loss": 0.9484, "num_input_tokens_seen": 23592960, "step": 2880 }, { "epoch": 0.13877551020408163, - "grad_norm": 0.5350954532623291, + "grad_norm": 0.5337544083595276, "learning_rate": 4.8887094830915427e-05, - "loss": 1.0803, + "loss": 1.0808, "num_input_tokens_seen": 23674880, "step": 2890 }, { "epoch": 0.13925570228091236, - "grad_norm": 0.5506545305252075, + "grad_norm": 0.5348219871520996, "learning_rate": 4.8875376233645396e-05, - "loss": 0.9572, + "loss": 0.9569, "num_input_tokens_seen": 23756800, "step": 2900 }, { "epoch": 0.13973589435774308, - "grad_norm": 0.552919328212738, + "grad_norm": 0.5439090132713318, "learning_rate": 4.886359768164054e-05, - "loss": 0.9725, + "loss": 0.9723, "num_input_tokens_seen": 23838720, "step": 2910 }, { "epoch": 0.14021608643457384, - "grad_norm": 0.5363306403160095, + "grad_norm": 0.5317606329917908, "learning_rate": 4.88517592044786e-05, - "loss": 0.9, + "loss": 0.9003, "num_input_tokens_seen": 23920640, "step": 2920 }, { "epoch": 0.14069627851140457, - "grad_norm": 0.5855774879455566, + "grad_norm": 0.560670018196106, "learning_rate": 4.8839860831887805e-05, - "loss": 1.0477, + "loss": 1.0471, "num_input_tokens_seen": 24002560, "step": 2930 }, { "epoch": 0.1411764705882353, - "grad_norm": 0.5440022349357605, + "grad_norm": 0.5462415218353271, "learning_rate": 4.882790259374681e-05, "loss": 0.8685, "num_input_tokens_seen": 24084480, @@ -2362,79 +2362,79 @@ }, { "epoch": 0.14165666266506602, - "grad_norm": 0.5618158578872681, + "grad_norm": 0.5607855319976807, "learning_rate": 4.881588452008456e-05, - "loss": 0.9956, + "loss": 0.9955, "num_input_tokens_seen": 24166400, "step": 2950 }, { "epoch": 0.14213685474189675, - "grad_norm": 0.5429749488830566, + "grad_norm": 0.536926805973053, "learning_rate": 4.880380664108032e-05, - "loss": 1.0358, + "loss": 1.0356, "num_input_tokens_seen": 24248320, "step": 2960 }, { "epoch": 0.1426170468187275, - "grad_norm": 0.5229565501213074, + "grad_norm": 0.5230121612548828, "learning_rate": 4.879166898706347e-05, - "loss": 0.8954, + "loss": 0.8951, "num_input_tokens_seen": 24330240, "step": 2970 }, { "epoch": 0.14309723889555823, - "grad_norm": 0.5274274349212646, + "grad_norm": 0.5175470113754272, "learning_rate": 4.877947158851352e-05, - "loss": 0.9882, + "loss": 0.9876, "num_input_tokens_seen": 24412160, "step": 2980 }, { "epoch": 0.14357743097238895, - "grad_norm": 0.5633429884910583, + "grad_norm": 0.5634691119194031, "learning_rate": 4.876721447606002e-05, - "loss": 0.8396, + "loss": 0.8405, "num_input_tokens_seen": 24494080, "step": 2990 }, { "epoch": 0.14405762304921968, - "grad_norm": 0.5823813676834106, + "grad_norm": 0.5656299591064453, "learning_rate": 4.875489768048247e-05, - "loss": 1.0854, + "loss": 1.0846, "num_input_tokens_seen": 24576000, "step": 3000 }, { "epoch": 0.14453781512605043, - "grad_norm": 0.5569640398025513, + "grad_norm": 0.5407316088676453, "learning_rate": 4.8742521232710234e-05, - "loss": 0.9147, + "loss": 0.9156, "num_input_tokens_seen": 24657920, "step": 3010 }, { "epoch": 0.14501800720288116, - "grad_norm": 0.5742693543434143, + "grad_norm": 0.5185397863388062, "learning_rate": 4.873008516382245e-05, - "loss": 0.956, + "loss": 0.9555, "num_input_tokens_seen": 24739840, "step": 3020 }, { "epoch": 0.1454981992797119, - "grad_norm": 0.5225424766540527, + "grad_norm": 0.5350617170333862, "learning_rate": 4.871758950504801e-05, - "loss": 0.9568, + "loss": 0.9566, "num_input_tokens_seen": 24821760, "step": 3030 }, { "epoch": 0.14597839135654261, - "grad_norm": 0.42165303230285645, + "grad_norm": 0.3936846852302551, "learning_rate": 4.870503428776544e-05, "loss": 0.95, "num_input_tokens_seen": 24903680, @@ -2442,255 +2442,255 @@ }, { "epoch": 0.14645858343337334, - "grad_norm": 0.5314249992370605, + "grad_norm": 0.5534759759902954, "learning_rate": 4.869241954350281e-05, - "loss": 1.0183, + "loss": 1.0182, "num_input_tokens_seen": 24985600, "step": 3050 }, { "epoch": 0.1469387755102041, - "grad_norm": 0.5332600474357605, + "grad_norm": 0.5339453220367432, "learning_rate": 4.867974530393767e-05, - "loss": 0.9581, + "loss": 0.9629, "num_input_tokens_seen": 25067520, "step": 3060 }, { "epoch": 0.14741896758703482, - "grad_norm": 0.5184004306793213, + "grad_norm": 0.5030773282051086, "learning_rate": 4.8667011600896994e-05, - "loss": 0.8628, + "loss": 0.8619, "num_input_tokens_seen": 25149440, "step": 3070 }, { "epoch": 0.14789915966386555, - "grad_norm": 0.524926483631134, + "grad_norm": 0.551304042339325, "learning_rate": 4.8654218466357064e-05, - "loss": 0.8973, + "loss": 0.8969, "num_input_tokens_seen": 25231360, "step": 3080 }, { "epoch": 0.14837935174069627, - "grad_norm": 0.4728389382362366, + "grad_norm": 0.5021383762359619, "learning_rate": 4.86413659324434e-05, - "loss": 0.9561, + "loss": 0.9565, "num_input_tokens_seen": 25313280, "step": 3090 }, { "epoch": 0.148859543817527, - "grad_norm": 0.4952199459075928, + "grad_norm": 0.5079641342163086, "learning_rate": 4.8628454031430694e-05, - "loss": 0.8549, + "loss": 0.8552, "num_input_tokens_seen": 25395200, "step": 3100 }, { "epoch": 0.14933973589435776, - "grad_norm": 0.6020041704177856, + "grad_norm": 0.5812205076217651, "learning_rate": 4.8615482795742696e-05, - "loss": 1.1022, + "loss": 1.1013, "num_input_tokens_seen": 25477120, "step": 3110 }, { "epoch": 0.14981992797118848, - "grad_norm": 1.6901748180389404, + "grad_norm": 1.72549307346344, "learning_rate": 4.860245225795219e-05, - "loss": 1.108, + "loss": 1.1029, "num_input_tokens_seen": 25559040, "step": 3120 }, { "epoch": 0.1503001200480192, - "grad_norm": 0.5366313457489014, + "grad_norm": 0.5684882998466492, "learning_rate": 4.858936245078084e-05, - "loss": 1.1467, + "loss": 1.162, "num_input_tokens_seen": 25640960, "step": 3130 }, { "epoch": 0.15078031212484994, - "grad_norm": 0.5097447633743286, + "grad_norm": 0.5000339150428772, "learning_rate": 4.857621340709917e-05, - "loss": 1.0484, + "loss": 1.0493, "num_input_tokens_seen": 25722880, "step": 3140 }, { "epoch": 0.15126050420168066, - "grad_norm": 0.5351747870445251, + "grad_norm": 0.5555292963981628, "learning_rate": 4.856300515992646e-05, - "loss": 0.8896, + "loss": 0.8903, "num_input_tokens_seen": 25804800, "step": 3150 }, { "epoch": 0.15174069627851142, - "grad_norm": 0.5735185742378235, + "grad_norm": 0.5889732837677002, "learning_rate": 4.854973774243062e-05, - "loss": 0.9816, + "loss": 0.9819, "num_input_tokens_seen": 25886720, "step": 3160 }, { "epoch": 0.15222088835534214, - "grad_norm": 0.5087327361106873, + "grad_norm": 0.5141785144805908, "learning_rate": 4.8536411187928186e-05, - "loss": 0.974, + "loss": 0.9732, "num_input_tokens_seen": 25968640, "step": 3170 }, { "epoch": 0.15270108043217287, - "grad_norm": 0.5174838900566101, + "grad_norm": 0.5038670897483826, "learning_rate": 4.852302552988418e-05, - "loss": 0.9798, + "loss": 0.9799, "num_input_tokens_seen": 26050560, "step": 3180 }, { "epoch": 0.1531812725090036, - "grad_norm": 0.5763252973556519, + "grad_norm": 0.5643836855888367, "learning_rate": 4.850958080191205e-05, - "loss": 1.0298, + "loss": 1.0295, "num_input_tokens_seen": 26132480, "step": 3190 }, { "epoch": 0.15366146458583432, - "grad_norm": 0.565152108669281, + "grad_norm": 0.5722659826278687, "learning_rate": 4.849607703777356e-05, - "loss": 0.9407, + "loss": 0.942, "num_input_tokens_seen": 26214400, "step": 3200 }, { "epoch": 0.15414165666266508, - "grad_norm": 0.5854922533035278, + "grad_norm": 0.5935901403427124, "learning_rate": 4.8482514271378745e-05, - "loss": 0.7755, + "loss": 0.7757, "num_input_tokens_seen": 26296320, "step": 3210 }, { "epoch": 0.1546218487394958, - "grad_norm": 0.5151272416114807, + "grad_norm": 0.5209060311317444, "learning_rate": 4.846889253678578e-05, - "loss": 0.8748, + "loss": 0.8739, "num_input_tokens_seen": 26378240, "step": 3220 }, { "epoch": 0.15510204081632653, - "grad_norm": 0.5427525043487549, + "grad_norm": 0.5480584502220154, "learning_rate": 4.845521186820096e-05, - "loss": 0.9308, + "loss": 0.931, "num_input_tokens_seen": 26460160, "step": 3230 }, { "epoch": 0.15558223289315726, - "grad_norm": 0.5123739242553711, + "grad_norm": 0.524978518486023, "learning_rate": 4.8441472299978504e-05, - "loss": 0.9556, + "loss": 0.955, "num_input_tokens_seen": 26542080, "step": 3240 }, { "epoch": 0.15606242496998798, - "grad_norm": 0.6060318946838379, + "grad_norm": 0.5937045812606812, "learning_rate": 4.8427673866620615e-05, - "loss": 0.9465, + "loss": 0.9456, "num_input_tokens_seen": 26624000, "step": 3250 }, { "epoch": 0.15654261704681874, - "grad_norm": 0.5624286532402039, + "grad_norm": 0.5852760076522827, "learning_rate": 4.841381660277725e-05, - "loss": 0.8611, + "loss": 0.8613, "num_input_tokens_seen": 26705920, "step": 3260 }, { "epoch": 0.15702280912364946, - "grad_norm": 0.5059430003166199, + "grad_norm": 0.5103029012680054, "learning_rate": 4.839990054324614e-05, - "loss": 0.9294, + "loss": 0.9297, "num_input_tokens_seen": 26787840, "step": 3270 }, { "epoch": 0.1575030012004802, - "grad_norm": 0.5408222675323486, + "grad_norm": 0.5465720295906067, "learning_rate": 4.838592572297265e-05, - "loss": 1.0522, + "loss": 1.0529, "num_input_tokens_seen": 26869760, "step": 3280 }, { "epoch": 0.15798319327731092, - "grad_norm": 0.5456782579421997, + "grad_norm": 0.5594531297683716, "learning_rate": 4.837189217704968e-05, - "loss": 0.9068, + "loss": 0.9072, "num_input_tokens_seen": 26951680, "step": 3290 }, { "epoch": 0.15846338535414164, - "grad_norm": 0.5521805882453918, + "grad_norm": 0.5584396719932556, "learning_rate": 4.835779994071764e-05, - "loss": 0.8502, + "loss": 0.8515, "num_input_tokens_seen": 27033600, "step": 3300 }, { "epoch": 0.1589435774309724, - "grad_norm": 0.525742769241333, + "grad_norm": 0.5218715071678162, "learning_rate": 4.8343649049364284e-05, - "loss": 0.9537, + "loss": 0.9548, "num_input_tokens_seen": 27115520, "step": 3310 }, { "epoch": 0.15942376950780313, - "grad_norm": 0.7998719215393066, + "grad_norm": 0.7685995697975159, "learning_rate": 4.832943953852468e-05, - "loss": 0.9564, + "loss": 0.9565, "num_input_tokens_seen": 27197440, "step": 3320 }, { "epoch": 0.15990396158463385, - "grad_norm": 0.41986241936683655, + "grad_norm": 0.42694056034088135, "learning_rate": 4.831517144388109e-05, - "loss": 0.8895, + "loss": 0.8896, "num_input_tokens_seen": 27279360, "step": 3330 }, { "epoch": 0.16038415366146458, - "grad_norm": 0.5009111166000366, + "grad_norm": 0.4931061267852783, "learning_rate": 4.830084480126288e-05, - "loss": 0.765, + "loss": 0.7648, "num_input_tokens_seen": 27361280, "step": 3340 }, { "epoch": 0.1608643457382953, - "grad_norm": 0.9933325052261353, + "grad_norm": 1.0382879972457886, "learning_rate": 4.828645964664647e-05, - "loss": 0.9538, + "loss": 0.9537, "num_input_tokens_seen": 27443200, "step": 3350 }, { "epoch": 0.16134453781512606, - "grad_norm": 0.6056552529335022, + "grad_norm": 1.5314464569091797, "learning_rate": 4.8272016016155166e-05, "loss": 1.0036, "num_input_tokens_seen": 27525120, @@ -2698,431 +2698,431 @@ }, { "epoch": 0.1618247298919568, - "grad_norm": 0.6024153828620911, + "grad_norm": 0.61847323179245, "learning_rate": 4.825751394605916e-05, - "loss": 1.1447, + "loss": 1.1458, "num_input_tokens_seen": 27607040, "step": 3370 }, { "epoch": 0.1623049219687875, - "grad_norm": 0.5308077335357666, + "grad_norm": 0.5093514323234558, "learning_rate": 4.824295347277537e-05, - "loss": 0.8392, + "loss": 0.8404, "num_input_tokens_seen": 27688960, "step": 3380 }, { "epoch": 0.16278511404561824, - "grad_norm": 0.5333995223045349, + "grad_norm": 0.5439700484275818, "learning_rate": 4.8228334632867375e-05, - "loss": 1.1125, + "loss": 1.1109, "num_input_tokens_seen": 27770880, "step": 3390 }, { "epoch": 0.16326530612244897, - "grad_norm": 0.5561527013778687, + "grad_norm": 0.5555248856544495, "learning_rate": 4.8213657463045344e-05, - "loss": 1.059, + "loss": 1.0581, "num_input_tokens_seen": 27852800, "step": 3400 }, { "epoch": 0.16374549819927972, - "grad_norm": 0.6268987655639648, + "grad_norm": 0.6565396785736084, "learning_rate": 4.819892200016588e-05, - "loss": 1.0134, + "loss": 1.0135, "num_input_tokens_seen": 27934720, "step": 3410 }, { "epoch": 0.16422569027611045, - "grad_norm": 0.5519012808799744, + "grad_norm": 0.5589664578437805, "learning_rate": 4.818412828123201e-05, - "loss": 0.8942, + "loss": 0.8944, "num_input_tokens_seen": 28016640, "step": 3420 }, { "epoch": 0.16470588235294117, - "grad_norm": 0.5266446471214294, + "grad_norm": 0.5469520688056946, "learning_rate": 4.816927634339302e-05, - "loss": 0.8664, + "loss": 0.8662, "num_input_tokens_seen": 28098560, "step": 3430 }, { "epoch": 0.1651860744297719, - "grad_norm": 0.5828010439872742, + "grad_norm": 0.5610776543617249, "learning_rate": 4.815436622394441e-05, - "loss": 0.7622, + "loss": 0.7626, "num_input_tokens_seen": 28180480, "step": 3440 }, { "epoch": 0.16566626650660263, - "grad_norm": 0.7593029141426086, + "grad_norm": 0.7478719353675842, "learning_rate": 4.813939796032779e-05, - "loss": 1.0249, + "loss": 1.0245, "num_input_tokens_seen": 28262400, "step": 3450 }, { "epoch": 0.16614645858343338, - "grad_norm": 0.5545393228530884, + "grad_norm": 0.5465115904808044, "learning_rate": 4.812437159013076e-05, - "loss": 0.9852, + "loss": 0.9836, "num_input_tokens_seen": 28344320, "step": 3460 }, { "epoch": 0.1666266506602641, - "grad_norm": 0.5645286440849304, + "grad_norm": 0.5847524404525757, "learning_rate": 4.810928715108683e-05, - "loss": 0.8945, + "loss": 0.8955, "num_input_tokens_seen": 28426240, "step": 3470 }, { "epoch": 0.16710684273709484, - "grad_norm": 0.6866888999938965, + "grad_norm": 0.7089729309082031, "learning_rate": 4.809414468107536e-05, - "loss": 0.8606, + "loss": 0.8607, "num_input_tokens_seen": 28508160, "step": 3480 }, { "epoch": 0.16758703481392556, - "grad_norm": 0.5912758111953735, + "grad_norm": 0.6048206090927124, "learning_rate": 4.8078944218121404e-05, - "loss": 0.7668, + "loss": 0.7663, "num_input_tokens_seen": 28590080, "step": 3490 }, { "epoch": 0.16806722689075632, - "grad_norm": 0.5021241903305054, + "grad_norm": 0.5017691850662231, "learning_rate": 4.806368580039566e-05, - "loss": 1.1072, + "loss": 1.1082, "num_input_tokens_seen": 28672000, "step": 3500 }, { "epoch": 0.16854741896758704, - "grad_norm": 0.5017940402030945, + "grad_norm": 0.5294102430343628, "learning_rate": 4.804836946621437e-05, - "loss": 0.7905, + "loss": 0.7908, "num_input_tokens_seen": 28753920, "step": 3510 }, { "epoch": 0.16902761104441777, - "grad_norm": 0.5461972951889038, + "grad_norm": 0.5386155247688293, "learning_rate": 4.803299525403919e-05, - "loss": 0.9979, + "loss": 0.9984, "num_input_tokens_seen": 28835840, "step": 3520 }, { "epoch": 0.1695078031212485, - "grad_norm": 0.5111615061759949, + "grad_norm": 0.5093580484390259, "learning_rate": 4.801756320247713e-05, - "loss": 1.1211, + "loss": 1.1216, "num_input_tokens_seen": 28917760, "step": 3530 }, { "epoch": 0.16998799519807922, - "grad_norm": 0.5483148097991943, + "grad_norm": 0.5753290057182312, "learning_rate": 4.800207335028044e-05, - "loss": 0.9346, + "loss": 0.9355, "num_input_tokens_seen": 28999680, "step": 3540 }, { "epoch": 0.17046818727490998, - "grad_norm": 0.6430408358573914, + "grad_norm": 0.6612030863761902, "learning_rate": 4.798652573634651e-05, - "loss": 0.8605, + "loss": 0.8603, "num_input_tokens_seen": 29081600, "step": 3550 }, { "epoch": 0.1709483793517407, - "grad_norm": 0.41809263825416565, + "grad_norm": 0.44161081314086914, "learning_rate": 4.797092039971779e-05, - "loss": 0.631, + "loss": 0.6316, "num_input_tokens_seen": 29163520, "step": 3560 }, { "epoch": 0.17142857142857143, - "grad_norm": 0.5095425844192505, + "grad_norm": 0.5154605507850647, "learning_rate": 4.7955257379581675e-05, - "loss": 0.8789, + "loss": 0.8794, "num_input_tokens_seen": 29245440, "step": 3570 }, { "epoch": 0.17190876350540216, - "grad_norm": 0.5259044766426086, + "grad_norm": 0.520716667175293, "learning_rate": 4.7939536715270415e-05, - "loss": 1.2783, + "loss": 1.2806, "num_input_tokens_seen": 29327360, "step": 3580 }, { "epoch": 0.17238895558223288, - "grad_norm": 0.5708450675010681, + "grad_norm": 0.5714126825332642, "learning_rate": 4.792375844626101e-05, - "loss": 0.7012, + "loss": 0.7021, "num_input_tokens_seen": 29409280, "step": 3590 }, { "epoch": 0.17286914765906364, - "grad_norm": 0.5291116833686829, + "grad_norm": 0.49484771490097046, "learning_rate": 4.790792261217512e-05, - "loss": 0.976, + "loss": 0.9755, "num_input_tokens_seen": 29491200, "step": 3600 }, { "epoch": 0.17334933973589436, - "grad_norm": 0.4987095296382904, + "grad_norm": 0.4960525929927826, "learning_rate": 4.789202925277895e-05, - "loss": 0.9292, + "loss": 0.929, "num_input_tokens_seen": 29573120, "step": 3610 }, { "epoch": 0.1738295318127251, - "grad_norm": 0.6108640432357788, + "grad_norm": 0.5834500789642334, "learning_rate": 4.787607840798317e-05, - "loss": 0.8974, + "loss": 0.8966, "num_input_tokens_seen": 29655040, "step": 3620 }, { "epoch": 0.17430972388955582, - "grad_norm": 0.42809537053108215, + "grad_norm": 0.4365158677101135, "learning_rate": 4.786007011784279e-05, - "loss": 0.9049, + "loss": 0.9054, "num_input_tokens_seen": 29736960, "step": 3630 }, { "epoch": 0.17478991596638654, - "grad_norm": 0.4873834252357483, + "grad_norm": 0.48445218801498413, "learning_rate": 4.78440044225571e-05, - "loss": 0.8813, + "loss": 0.881, "num_input_tokens_seen": 29818880, "step": 3640 }, { "epoch": 0.1752701080432173, - "grad_norm": 0.6227244138717651, + "grad_norm": 0.6551578640937805, "learning_rate": 4.7827881362469506e-05, - "loss": 0.7923, + "loss": 0.7921, "num_input_tokens_seen": 29900800, "step": 3650 }, { "epoch": 0.17575030012004803, - "grad_norm": 0.4629347324371338, + "grad_norm": 0.45337924361228943, "learning_rate": 4.781170097806751e-05, - "loss": 0.9563, + "loss": 0.956, "num_input_tokens_seen": 29982720, "step": 3660 }, { "epoch": 0.17623049219687875, - "grad_norm": 0.5306552648544312, + "grad_norm": 0.521186888217926, "learning_rate": 4.779546330998253e-05, - "loss": 0.9327, + "loss": 0.9339, "num_input_tokens_seen": 30064640, "step": 3670 }, { "epoch": 0.17671068427370948, - "grad_norm": 0.5401423573493958, + "grad_norm": 0.552125096321106, "learning_rate": 4.7779168398989826e-05, - "loss": 1.2167, + "loss": 1.2112, "num_input_tokens_seen": 30146560, "step": 3680 }, { "epoch": 0.1771908763505402, - "grad_norm": 0.5693386197090149, + "grad_norm": 0.6073199510574341, "learning_rate": 4.7762816286008454e-05, - "loss": 0.9795, + "loss": 0.9783, "num_input_tokens_seen": 30228480, "step": 3690 }, { "epoch": 0.17767106842737096, - "grad_norm": 0.5739344954490662, + "grad_norm": 0.5938025712966919, "learning_rate": 4.774640701210106e-05, - "loss": 1.108, + "loss": 1.1071, "num_input_tokens_seen": 30310400, "step": 3700 }, { "epoch": 0.1781512605042017, - "grad_norm": 0.5333011150360107, + "grad_norm": 0.5386204719543457, "learning_rate": 4.7729940618473854e-05, - "loss": 1.0156, + "loss": 1.0155, "num_input_tokens_seen": 30392320, "step": 3710 }, { "epoch": 0.1786314525810324, - "grad_norm": 0.6173415780067444, + "grad_norm": 0.6245675683021545, "learning_rate": 4.771341714647648e-05, - "loss": 0.9543, + "loss": 0.9545, "num_input_tokens_seen": 30474240, "step": 3720 }, { "epoch": 0.17911164465786314, - "grad_norm": 0.5696475505828857, + "grad_norm": 0.5611715912818909, "learning_rate": 4.76968366376019e-05, - "loss": 0.875, + "loss": 0.8737, "num_input_tokens_seen": 30556160, "step": 3730 }, { "epoch": 0.17959183673469387, - "grad_norm": 0.5061577558517456, + "grad_norm": 0.4791863262653351, "learning_rate": 4.768019913348634e-05, - "loss": 0.9798, + "loss": 0.9804, "num_input_tokens_seen": 30638080, "step": 3740 }, { "epoch": 0.18007202881152462, - "grad_norm": 0.454387903213501, + "grad_norm": 0.46026119589805603, "learning_rate": 4.766350467590911e-05, - "loss": 0.8985, + "loss": 0.8976, "num_input_tokens_seen": 30720000, "step": 3750 }, { "epoch": 0.18055222088835535, - "grad_norm": 0.5241227746009827, + "grad_norm": 0.5252475738525391, "learning_rate": 4.764675330679256e-05, - "loss": 0.9849, + "loss": 0.9886, "num_input_tokens_seen": 30801920, "step": 3760 }, { "epoch": 0.18103241296518607, - "grad_norm": 0.5448564887046814, + "grad_norm": 0.5292924642562866, "learning_rate": 4.7629945068201954e-05, - "loss": 0.9484, + "loss": 0.9472, "num_input_tokens_seen": 30883840, "step": 3770 }, { "epoch": 0.1815126050420168, - "grad_norm": 0.5225489139556885, + "grad_norm": 0.5213156938552856, "learning_rate": 4.7613080002345345e-05, - "loss": 0.8183, + "loss": 0.8187, "num_input_tokens_seen": 30965760, "step": 3780 }, { "epoch": 0.18199279711884753, - "grad_norm": 0.5782943367958069, + "grad_norm": 0.5824140906333923, "learning_rate": 4.759615815157352e-05, - "loss": 1.0653, + "loss": 1.0672, "num_input_tokens_seen": 31047680, "step": 3790 }, { "epoch": 0.18247298919567828, - "grad_norm": 0.5790293216705322, + "grad_norm": 0.6037180423736572, "learning_rate": 4.7579179558379836e-05, - "loss": 0.8532, + "loss": 0.854, "num_input_tokens_seen": 31129600, "step": 3800 }, { "epoch": 0.182953181272509, - "grad_norm": 0.49173110723495483, + "grad_norm": 0.47562384605407715, "learning_rate": 4.7562144265400146e-05, - "loss": 0.7594, + "loss": 0.759, "num_input_tokens_seen": 31211520, "step": 3810 }, { "epoch": 0.18343337334933973, - "grad_norm": 0.5832033157348633, + "grad_norm": 0.5508514642715454, "learning_rate": 4.754505231541268e-05, - "loss": 1.0458, + "loss": 1.0452, "num_input_tokens_seen": 31293440, "step": 3820 }, { "epoch": 0.18391356542617046, - "grad_norm": 0.539718508720398, + "grad_norm": 0.5534719824790955, "learning_rate": 4.752790375133797e-05, - "loss": 0.8403, + "loss": 0.8398, "num_input_tokens_seen": 31375360, "step": 3830 }, { "epoch": 0.1843937575030012, - "grad_norm": 0.5238791704177856, + "grad_norm": 0.5246814489364624, "learning_rate": 4.751069861623867e-05, - "loss": 0.9284, + "loss": 0.9282, "num_input_tokens_seen": 31457280, "step": 3840 }, { "epoch": 0.18487394957983194, - "grad_norm": 0.5033566355705261, + "grad_norm": 0.5262382626533508, "learning_rate": 4.749343695331952e-05, - "loss": 1.0164, + "loss": 1.0174, "num_input_tokens_seen": 31539200, "step": 3850 }, { "epoch": 0.18535414165666267, - "grad_norm": 0.6581845879554749, + "grad_norm": 0.6730586290359497, "learning_rate": 4.747611880592721e-05, - "loss": 0.8328, + "loss": 0.8329, "num_input_tokens_seen": 31621120, "step": 3860 }, { "epoch": 0.1858343337334934, - "grad_norm": 0.5361823439598083, + "grad_norm": 0.5353675484657288, "learning_rate": 4.745874421755027e-05, - "loss": 1.0115, + "loss": 1.0112, "num_input_tokens_seen": 31703040, "step": 3870 }, { "epoch": 0.18631452581032412, - "grad_norm": 0.5217710733413696, + "grad_norm": 0.517441987991333, "learning_rate": 4.744131323181895e-05, - "loss": 0.8417, + "loss": 0.8418, "num_input_tokens_seen": 31784960, "step": 3880 }, { "epoch": 0.18679471788715485, - "grad_norm": 0.5840950608253479, + "grad_norm": 0.5716066956520081, "learning_rate": 4.742382589250514e-05, - "loss": 0.9902, + "loss": 0.9913, "num_input_tokens_seen": 31866880, "step": 3890 }, { "epoch": 0.1872749099639856, - "grad_norm": 0.5277583599090576, + "grad_norm": 0.5103275179862976, "learning_rate": 4.740628224352222e-05, "loss": 1.0409, "num_input_tokens_seen": 31948800, @@ -3130,151 +3130,151 @@ }, { "epoch": 0.18775510204081633, - "grad_norm": 0.5681836605072021, + "grad_norm": 0.5536543726921082, "learning_rate": 4.7388682328925e-05, - "loss": 0.9343, + "loss": 0.9342, "num_input_tokens_seen": 32030720, "step": 3910 }, { "epoch": 0.18823529411764706, - "grad_norm": 0.4854600131511688, + "grad_norm": 0.5033223032951355, "learning_rate": 4.737102619290956e-05, - "loss": 0.9198, + "loss": 0.9206, "num_input_tokens_seen": 32112640, "step": 3920 }, { "epoch": 0.18871548619447778, - "grad_norm": 0.527630627155304, + "grad_norm": 0.5292133688926697, "learning_rate": 4.7353313879813165e-05, - "loss": 0.9707, + "loss": 0.9706, "num_input_tokens_seen": 32194560, "step": 3930 }, { "epoch": 0.18919567827130854, - "grad_norm": 0.5316260457038879, + "grad_norm": 0.527191698551178, "learning_rate": 4.733554543411417e-05, - "loss": 1.0691, + "loss": 1.0686, "num_input_tokens_seen": 32276480, "step": 3940 }, { "epoch": 0.18967587034813926, - "grad_norm": 0.5303869843482971, + "grad_norm": 0.534002959728241, "learning_rate": 4.731772090043184e-05, - "loss": 0.9361, + "loss": 0.9365, "num_input_tokens_seen": 32358400, "step": 3950 }, { "epoch": 0.19015606242497, - "grad_norm": 0.5066818594932556, + "grad_norm": 0.5045952796936035, "learning_rate": 4.729984032352635e-05, - "loss": 0.8927, + "loss": 0.8931, "num_input_tokens_seen": 32440320, "step": 3960 }, { "epoch": 0.19063625450180072, - "grad_norm": 0.5547256469726562, + "grad_norm": 0.5647931694984436, "learning_rate": 4.728190374829854e-05, - "loss": 0.9308, + "loss": 0.9301, "num_input_tokens_seen": 32522240, "step": 3970 }, { "epoch": 0.19111644657863144, - "grad_norm": 0.46649324893951416, + "grad_norm": 0.48447665572166443, "learning_rate": 4.726391121978992e-05, - "loss": 0.9229, + "loss": 0.9234, "num_input_tokens_seen": 32604160, "step": 3980 }, { "epoch": 0.1915966386554622, - "grad_norm": 0.5034019351005554, + "grad_norm": 0.5057299137115479, "learning_rate": 4.7245862783182496e-05, - "loss": 0.9033, + "loss": 0.9042, "num_input_tokens_seen": 32686080, "step": 3990 }, { "epoch": 0.19207683073229292, - "grad_norm": 0.5081968307495117, + "grad_norm": 0.5080722570419312, "learning_rate": 4.722775848379866e-05, - "loss": 1.0315, + "loss": 1.0318, "num_input_tokens_seen": 32768000, "step": 4000 }, { "epoch": 0.19255702280912365, - "grad_norm": 0.4671435058116913, + "grad_norm": 0.4755553901195526, "learning_rate": 4.720959836710107e-05, - "loss": 0.8195, + "loss": 0.8199, "num_input_tokens_seen": 32849920, "step": 4010 }, { "epoch": 0.19303721488595438, - "grad_norm": 0.4735753536224365, + "grad_norm": 0.47308361530303955, "learning_rate": 4.7191382478692594e-05, - "loss": 0.9489, + "loss": 0.9483, "num_input_tokens_seen": 32931840, "step": 4020 }, { "epoch": 0.1935174069627851, - "grad_norm": 0.5032724142074585, + "grad_norm": 0.5154343843460083, "learning_rate": 4.7173110864316104e-05, - "loss": 0.814, + "loss": 0.8141, "num_input_tokens_seen": 33013760, "step": 4030 }, { "epoch": 0.19399759903961586, - "grad_norm": 0.645969033241272, + "grad_norm": 0.6429656744003296, "learning_rate": 4.7154783569854444e-05, - "loss": 0.811, + "loss": 0.8107, "num_input_tokens_seen": 33095680, "step": 4040 }, { "epoch": 0.19447779111644659, - "grad_norm": 0.5401909947395325, + "grad_norm": 0.5589474439620972, "learning_rate": 4.713640064133025e-05, - "loss": 0.9777, + "loss": 0.9762, "num_input_tokens_seen": 33177600, "step": 4050 }, { "epoch": 0.1949579831932773, - "grad_norm": 0.5023765563964844, + "grad_norm": 0.5128633379936218, "learning_rate": 4.7117962124905885e-05, - "loss": 0.8785, + "loss": 0.8789, "num_input_tokens_seen": 33259520, "step": 4060 }, { "epoch": 0.19543817527010804, - "grad_norm": 0.5897220373153687, + "grad_norm": 0.5557809472084045, "learning_rate": 4.709946806688329e-05, - "loss": 0.9837, + "loss": 0.9835, "num_input_tokens_seen": 33341440, "step": 4070 }, { "epoch": 0.19591836734693877, - "grad_norm": 0.6706360578536987, + "grad_norm": 0.6408475637435913, "learning_rate": 4.708091851370389e-05, - "loss": 0.8769, + "loss": 0.8732, "num_input_tokens_seen": 33423360, "step": 4080 }, { "epoch": 0.19639855942376952, - "grad_norm": 0.5493477582931519, + "grad_norm": 0.5550397634506226, "learning_rate": 4.706231351194845e-05, "loss": 0.9604, "num_input_tokens_seen": 33505280, @@ -3282,39 +3282,39 @@ }, { "epoch": 0.19687875150060025, - "grad_norm": 0.5341002345085144, + "grad_norm": 0.5453352928161621, "learning_rate": 4.7043653108336994e-05, - "loss": 0.8028, + "loss": 0.8032, "num_input_tokens_seen": 33587200, "step": 4100 }, { "epoch": 0.19735894357743097, - "grad_norm": 0.5551480054855347, + "grad_norm": 0.5484746098518372, "learning_rate": 4.702493734972866e-05, - "loss": 0.9994, + "loss": 0.9989, "num_input_tokens_seen": 33669120, "step": 4110 }, { "epoch": 0.1978391356542617, - "grad_norm": 1.2934221029281616, + "grad_norm": 1.165541648864746, "learning_rate": 4.700616628312158e-05, - "loss": 1.0073, + "loss": 1.0064, "num_input_tokens_seen": 33751040, "step": 4120 }, { "epoch": 0.19831932773109243, - "grad_norm": 0.5070963501930237, + "grad_norm": 0.5196086168289185, "learning_rate": 4.69873399556528e-05, - "loss": 0.8428, + "loss": 0.8425, "num_input_tokens_seen": 33832960, "step": 4130 }, { "epoch": 0.19879951980792318, - "grad_norm": 0.49348321557044983, + "grad_norm": 0.506844162940979, "learning_rate": 4.696845841459811e-05, "loss": 1.0941, "num_input_tokens_seen": 33914880, @@ -3322,551 +3322,551 @@ }, { "epoch": 0.1992797118847539, - "grad_norm": 0.48639488220214844, + "grad_norm": 0.46458375453948975, "learning_rate": 4.6949521707371965e-05, - "loss": 0.8673, + "loss": 0.8666, "num_input_tokens_seen": 33996800, "step": 4150 }, { "epoch": 0.19975990396158463, - "grad_norm": 0.5290814638137817, + "grad_norm": 0.5106430649757385, "learning_rate": 4.693052988152733e-05, - "loss": 0.8488, + "loss": 0.8477, "num_input_tokens_seen": 34078720, "step": 4160 }, { "epoch": 0.20024009603841536, - "grad_norm": 0.7159754037857056, + "grad_norm": 0.7916086316108704, "learning_rate": 4.691148298475561e-05, - "loss": 1.0891, + "loss": 1.0889, "num_input_tokens_seen": 34160640, "step": 4170 }, { "epoch": 0.2007202881152461, - "grad_norm": 0.5923986434936523, + "grad_norm": 0.5892463326454163, "learning_rate": 4.689238106488647e-05, - "loss": 0.904, + "loss": 0.9039, "num_input_tokens_seen": 34242560, "step": 4180 }, { "epoch": 0.20120048019207684, - "grad_norm": 0.4993061125278473, + "grad_norm": 0.521371603012085, "learning_rate": 4.687322416988779e-05, - "loss": 0.8948, + "loss": 0.8954, "num_input_tokens_seen": 34324480, "step": 4190 }, { "epoch": 0.20168067226890757, - "grad_norm": 0.48761218786239624, + "grad_norm": 0.49719128012657166, "learning_rate": 4.685401234786544e-05, - "loss": 0.9203, + "loss": 0.92, "num_input_tokens_seen": 34406400, "step": 4200 }, { "epoch": 0.2021608643457383, - "grad_norm": 0.4988939166069031, + "grad_norm": 0.5170416235923767, "learning_rate": 4.683474564706327e-05, - "loss": 0.9092, + "loss": 0.9093, "num_input_tokens_seen": 34488320, "step": 4210 }, { "epoch": 0.20264105642256902, - "grad_norm": 0.6328974366188049, + "grad_norm": 0.6528691053390503, "learning_rate": 4.681542411586294e-05, - "loss": 0.8973, + "loss": 0.8968, "num_input_tokens_seen": 34570240, "step": 4220 }, { "epoch": 0.20312124849939975, - "grad_norm": 0.5057374835014343, + "grad_norm": 0.5039422512054443, "learning_rate": 4.6796047802783755e-05, - "loss": 1.046, + "loss": 1.0459, "num_input_tokens_seen": 34652160, "step": 4230 }, { "epoch": 0.2036014405762305, - "grad_norm": 0.562879741191864, + "grad_norm": 0.547264039516449, "learning_rate": 4.6776616756482624e-05, - "loss": 0.8767, + "loss": 0.8744, "num_input_tokens_seen": 34734080, "step": 4240 }, { "epoch": 0.20408163265306123, - "grad_norm": 0.9288103580474854, + "grad_norm": 0.9196895360946655, "learning_rate": 4.6757131025753886e-05, - "loss": 0.849, + "loss": 0.8492, "num_input_tokens_seen": 34816000, "step": 4250 }, { "epoch": 0.20456182472989196, - "grad_norm": 0.5365880727767944, + "grad_norm": 0.5697296857833862, "learning_rate": 4.67375906595292e-05, - "loss": 1.1241, + "loss": 1.1252, "num_input_tokens_seen": 34897920, "step": 4260 }, { "epoch": 0.20504201680672268, - "grad_norm": 0.5342133641242981, + "grad_norm": 0.5439584851264954, "learning_rate": 4.671799570687743e-05, - "loss": 1.1088, + "loss": 1.1081, "num_input_tokens_seen": 34979840, "step": 4270 }, { "epoch": 0.2055222088835534, - "grad_norm": 0.5288349390029907, + "grad_norm": 0.5510857105255127, "learning_rate": 4.6698346217004494e-05, - "loss": 0.9175, + "loss": 0.917, "num_input_tokens_seen": 35061760, "step": 4280 }, { "epoch": 0.20600240096038416, - "grad_norm": 0.7718984484672546, + "grad_norm": 0.7729096412658691, "learning_rate": 4.66786422392533e-05, - "loss": 1.146, + "loss": 1.1476, "num_input_tokens_seen": 35143680, "step": 4290 }, { "epoch": 0.2064825930372149, - "grad_norm": 0.5124546885490417, + "grad_norm": 0.5175178050994873, "learning_rate": 4.665888382310356e-05, - "loss": 0.9771, + "loss": 0.9777, "num_input_tokens_seen": 35225600, "step": 4300 }, { "epoch": 0.20696278511404562, - "grad_norm": 0.52974534034729, + "grad_norm": 0.5331414341926575, "learning_rate": 4.663907101817167e-05, - "loss": 0.9299, + "loss": 0.9304, "num_input_tokens_seen": 35307520, "step": 4310 }, { "epoch": 0.20744297719087634, - "grad_norm": 0.50569087266922, + "grad_norm": 0.5062457919120789, "learning_rate": 4.661920387421064e-05, - "loss": 1.0124, + "loss": 1.0128, "num_input_tokens_seen": 35389440, "step": 4320 }, { "epoch": 0.20792316926770707, - "grad_norm": 0.5128062963485718, + "grad_norm": 0.5168769955635071, "learning_rate": 4.65992824411099e-05, - "loss": 0.9051, + "loss": 0.904, "num_input_tokens_seen": 35471360, "step": 4330 }, { "epoch": 0.20840336134453782, - "grad_norm": 0.549404501914978, + "grad_norm": 0.5486623644828796, "learning_rate": 4.657930676889526e-05, - "loss": 1.11, + "loss": 1.1092, "num_input_tokens_seen": 35553280, "step": 4340 }, { "epoch": 0.20888355342136855, - "grad_norm": 0.5476168394088745, + "grad_norm": 0.5371941924095154, "learning_rate": 4.655927690772868e-05, - "loss": 0.7549, + "loss": 0.7555, "num_input_tokens_seen": 35635200, "step": 4350 }, { "epoch": 0.20936374549819928, - "grad_norm": 0.5444672107696533, + "grad_norm": 0.5360463857650757, "learning_rate": 4.6539192907908204e-05, - "loss": 0.798, + "loss": 0.7976, "num_input_tokens_seen": 35717120, "step": 4360 }, { "epoch": 0.20984393757503, - "grad_norm": 0.7407582998275757, + "grad_norm": 0.735427975654602, "learning_rate": 4.6519054819867856e-05, - "loss": 1.0632, + "loss": 1.0636, "num_input_tokens_seen": 35799040, "step": 4370 }, { "epoch": 0.21032412965186073, - "grad_norm": 0.5673499703407288, + "grad_norm": 0.5705459117889404, "learning_rate": 4.649886269417746e-05, - "loss": 0.9968, + "loss": 0.9967, "num_input_tokens_seen": 35880960, "step": 4380 }, { "epoch": 0.21080432172869148, - "grad_norm": 0.5538674592971802, + "grad_norm": 0.5621838569641113, "learning_rate": 4.647861658154254e-05, - "loss": 0.8916, + "loss": 0.8918, "num_input_tokens_seen": 35962880, "step": 4390 }, { "epoch": 0.2112845138055222, - "grad_norm": 0.4925515949726105, + "grad_norm": 0.4890127182006836, "learning_rate": 4.6458316532804214e-05, - "loss": 1.0407, + "loss": 1.0406, "num_input_tokens_seen": 36044800, "step": 4400 }, { "epoch": 0.21176470588235294, - "grad_norm": 0.4931751787662506, + "grad_norm": 0.4952712655067444, "learning_rate": 4.643796259893899e-05, - "loss": 0.9456, + "loss": 0.9447, "num_input_tokens_seen": 36126720, "step": 4410 }, { "epoch": 0.21224489795918366, - "grad_norm": 0.5209604501724243, + "grad_norm": 0.5150204300880432, "learning_rate": 4.641755483105874e-05, - "loss": 0.9378, + "loss": 0.9381, "num_input_tokens_seen": 36208640, "step": 4420 }, { "epoch": 0.21272509003601442, - "grad_norm": 1.0745376348495483, + "grad_norm": 1.1830569505691528, "learning_rate": 4.63970932804105e-05, - "loss": 0.8205, + "loss": 0.8216, "num_input_tokens_seen": 36290560, "step": 4430 }, { "epoch": 0.21320528211284515, - "grad_norm": 0.5311976671218872, + "grad_norm": 0.5193588733673096, "learning_rate": 4.637657799837635e-05, - "loss": 0.8998, + "loss": 0.9, "num_input_tokens_seen": 36372480, "step": 4440 }, { "epoch": 0.21368547418967587, - "grad_norm": 0.5121796727180481, + "grad_norm": 0.5316673517227173, "learning_rate": 4.635600903647333e-05, - "loss": 0.8201, + "loss": 0.8199, "num_input_tokens_seen": 36454400, "step": 4450 }, { "epoch": 0.2141656662665066, - "grad_norm": 0.5421672463417053, + "grad_norm": 0.5320562124252319, "learning_rate": 4.633538644635326e-05, - "loss": 0.879, + "loss": 0.8799, "num_input_tokens_seen": 36536320, "step": 4460 }, { "epoch": 0.21464585834333733, - "grad_norm": 1.2252622842788696, + "grad_norm": 1.1617580652236938, "learning_rate": 4.631471027980262e-05, - "loss": 0.9865, + "loss": 0.9863, "num_input_tokens_seen": 36618240, "step": 4470 }, { "epoch": 0.21512605042016808, - "grad_norm": 0.5252002477645874, + "grad_norm": 0.5181916952133179, "learning_rate": 4.629398058874245e-05, - "loss": 0.8658, + "loss": 0.8663, "num_input_tokens_seen": 36700160, "step": 4480 }, { "epoch": 0.2156062424969988, - "grad_norm": 0.5939603447914124, + "grad_norm": 0.5579994916915894, "learning_rate": 4.6273197425228166e-05, - "loss": 0.9567, + "loss": 0.9576, "num_input_tokens_seen": 36782080, "step": 4490 }, { "epoch": 0.21608643457382953, - "grad_norm": 0.5345758199691772, + "grad_norm": 0.5332381725311279, "learning_rate": 4.62523608414495e-05, - "loss": 0.9567, + "loss": 0.9573, "num_input_tokens_seen": 36864000, "step": 4500 }, { "epoch": 0.21656662665066026, - "grad_norm": 1.0143563747406006, + "grad_norm": 1.0233074426651, "learning_rate": 4.623147088973031e-05, - "loss": 0.7952, + "loss": 0.795, "num_input_tokens_seen": 36945920, "step": 4510 }, { "epoch": 0.217046818727491, - "grad_norm": 0.5424322485923767, + "grad_norm": 0.5333688855171204, "learning_rate": 4.6210527622528465e-05, - "loss": 1.1468, + "loss": 1.1473, "num_input_tokens_seen": 37027840, "step": 4520 }, { "epoch": 0.21752701080432174, - "grad_norm": 0.5119237303733826, + "grad_norm": 0.5092252492904663, "learning_rate": 4.618953109243573e-05, - "loss": 0.8657, + "loss": 0.8676, "num_input_tokens_seen": 37109760, "step": 4530 }, { "epoch": 0.21800720288115247, - "grad_norm": 0.5329012274742126, + "grad_norm": 0.539064347743988, "learning_rate": 4.616848135217761e-05, - "loss": 0.9235, + "loss": 0.9233, "num_input_tokens_seen": 37191680, "step": 4540 }, { "epoch": 0.2184873949579832, - "grad_norm": 0.5090308785438538, + "grad_norm": 0.509436309337616, "learning_rate": 4.6147378454613246e-05, - "loss": 0.8308, + "loss": 0.831, "num_input_tokens_seen": 37273600, "step": 4550 }, { "epoch": 0.21896758703481392, - "grad_norm": 0.7635847926139832, + "grad_norm": 0.780189573764801, "learning_rate": 4.6126222452735233e-05, - "loss": 0.8878, + "loss": 0.8871, "num_input_tokens_seen": 37355520, "step": 4560 }, { "epoch": 0.21944777911164465, - "grad_norm": 0.48617926239967346, + "grad_norm": 0.5022001266479492, "learning_rate": 4.6105013399669564e-05, - "loss": 0.9695, + "loss": 0.9688, "num_input_tokens_seen": 37437440, "step": 4570 }, { "epoch": 0.2199279711884754, - "grad_norm": 0.6429051756858826, + "grad_norm": 0.5899694561958313, "learning_rate": 4.608375134867541e-05, - "loss": 0.8531, + "loss": 0.8533, "num_input_tokens_seen": 37519360, "step": 4580 }, { "epoch": 0.22040816326530613, - "grad_norm": 0.5649603605270386, + "grad_norm": 0.62712562084198, "learning_rate": 4.6062436353145044e-05, - "loss": 0.8998, + "loss": 0.8995, "num_input_tokens_seen": 37601280, "step": 4590 }, { "epoch": 0.22088835534213686, - "grad_norm": 0.5439668297767639, + "grad_norm": 0.5399054884910583, "learning_rate": 4.60410684666037e-05, - "loss": 1.2532, + "loss": 1.2521, "num_input_tokens_seen": 37683200, "step": 4600 }, { "epoch": 0.22136854741896758, - "grad_norm": 0.5839661359786987, + "grad_norm": 0.5870562195777893, "learning_rate": 4.601964774270941e-05, - "loss": 0.9984, + "loss": 0.998, "num_input_tokens_seen": 37765120, "step": 4610 }, { "epoch": 0.2218487394957983, - "grad_norm": 0.5015205144882202, + "grad_norm": 0.49380698800086975, "learning_rate": 4.599817423525292e-05, - "loss": 1.0361, + "loss": 1.036, "num_input_tokens_seen": 37847040, "step": 4620 }, { "epoch": 0.22232893157262906, - "grad_norm": 0.5025255084037781, + "grad_norm": 0.501021146774292, "learning_rate": 4.597664799815749e-05, - "loss": 1.0169, + "loss": 1.0166, "num_input_tokens_seen": 37928960, "step": 4630 }, { "epoch": 0.2228091236494598, - "grad_norm": 0.4889910817146301, + "grad_norm": 0.49086204171180725, "learning_rate": 4.595506908547881e-05, - "loss": 0.9112, + "loss": 0.9113, "num_input_tokens_seen": 38010880, "step": 4640 }, { "epoch": 0.22328931572629052, - "grad_norm": 0.5253650546073914, + "grad_norm": 0.5220022797584534, "learning_rate": 4.593343755140483e-05, - "loss": 0.9424, + "loss": 0.9429, "num_input_tokens_seen": 38092800, "step": 4650 }, { "epoch": 0.22376950780312124, - "grad_norm": 0.4819222688674927, + "grad_norm": 0.48921534419059753, "learning_rate": 4.5911753450255665e-05, - "loss": 1.0131, + "loss": 1.0135, "num_input_tokens_seen": 38174720, "step": 4660 }, { "epoch": 0.22424969987995197, - "grad_norm": 0.5012921690940857, + "grad_norm": 0.4981484115123749, "learning_rate": 4.589001683648343e-05, - "loss": 0.7262, + "loss": 0.7263, "num_input_tokens_seen": 38256640, "step": 4670 }, { "epoch": 0.22472989195678272, - "grad_norm": 0.5600073337554932, + "grad_norm": 0.5662268400192261, "learning_rate": 4.586822776467208e-05, - "loss": 0.9312, + "loss": 0.9288, "num_input_tokens_seen": 38338560, "step": 4680 }, { "epoch": 0.22521008403361345, - "grad_norm": 0.5474848747253418, + "grad_norm": 0.5699325203895569, "learning_rate": 4.584638628953733e-05, - "loss": 0.9062, + "loss": 0.9058, "num_input_tokens_seen": 38420480, "step": 4690 }, { "epoch": 0.22569027611044418, - "grad_norm": 0.614006519317627, + "grad_norm": 0.6078118681907654, "learning_rate": 4.582449246592647e-05, - "loss": 0.8918, + "loss": 0.8911, "num_input_tokens_seen": 38502400, "step": 4700 }, { "epoch": 0.2261704681872749, - "grad_norm": 0.5042563080787659, + "grad_norm": 0.5221937894821167, "learning_rate": 4.5802546348818264e-05, - "loss": 0.8067, + "loss": 0.8064, "num_input_tokens_seen": 38584320, "step": 4710 }, { "epoch": 0.22665066026410563, - "grad_norm": 0.5609814524650574, + "grad_norm": 0.5463407635688782, "learning_rate": 4.578054799332277e-05, - "loss": 0.9678, + "loss": 0.9667, "num_input_tokens_seen": 38666240, "step": 4720 }, { "epoch": 0.22713085234093638, - "grad_norm": 0.4998452663421631, + "grad_norm": 0.49847063422203064, "learning_rate": 4.575849745468124e-05, - "loss": 1.0305, + "loss": 1.0298, "num_input_tokens_seen": 38748160, "step": 4730 }, { "epoch": 0.2276110444177671, - "grad_norm": 0.5019862651824951, + "grad_norm": 0.4975888729095459, "learning_rate": 4.573639478826596e-05, - "loss": 0.8708, + "loss": 0.87, "num_input_tokens_seen": 38830080, "step": 4740 }, { "epoch": 0.22809123649459784, - "grad_norm": 0.5069782733917236, + "grad_norm": 0.5155524611473083, "learning_rate": 4.571424004958012e-05, - "loss": 1.0366, + "loss": 1.0355, "num_input_tokens_seen": 38912000, "step": 4750 }, { "epoch": 0.22857142857142856, - "grad_norm": 0.5219393372535706, + "grad_norm": 0.5229736566543579, "learning_rate": 4.5692033294257666e-05, - "loss": 1.1274, + "loss": 1.1281, "num_input_tokens_seen": 38993920, "step": 4760 }, { "epoch": 0.2290516206482593, - "grad_norm": 0.5168918967247009, + "grad_norm": 0.5159292817115784, "learning_rate": 4.5669774578063174e-05, - "loss": 0.906, + "loss": 0.9067, "num_input_tokens_seen": 39075840, "step": 4770 }, { "epoch": 0.22953181272509005, - "grad_norm": 0.554415762424469, + "grad_norm": 0.5559686422348022, "learning_rate": 4.56474639568917e-05, - "loss": 0.9868, + "loss": 0.9853, "num_input_tokens_seen": 39157760, "step": 4780 }, { "epoch": 0.23001200480192077, - "grad_norm": 0.5875802040100098, + "grad_norm": 0.6078135371208191, "learning_rate": 4.5625101486768626e-05, - "loss": 0.9674, + "loss": 0.9676, "num_input_tokens_seen": 39239680, "step": 4790 }, { "epoch": 0.2304921968787515, - "grad_norm": 0.524535596370697, + "grad_norm": 0.5246491432189941, "learning_rate": 4.560268722384956e-05, - "loss": 0.944, + "loss": 0.9445, "num_input_tokens_seen": 39321600, "step": 4800 }, { "epoch": 0.23097238895558223, - "grad_norm": 0.5096153020858765, + "grad_norm": 0.4907456338405609, "learning_rate": 4.558022122442016e-05, - "loss": 0.9393, + "loss": 0.939, "num_input_tokens_seen": 39403520, "step": 4810 }, { "epoch": 0.23145258103241295, - "grad_norm": 0.527031421661377, + "grad_norm": 0.5344095230102539, "learning_rate": 4.555770354489598e-05, - "loss": 0.9572, + "loss": 0.9579, "num_input_tokens_seen": 39485440, "step": 4820 }, { "epoch": 0.2319327731092437, - "grad_norm": 0.4810847342014313, + "grad_norm": 0.4841211140155792, "learning_rate": 4.5535134241822394e-05, "loss": 0.8217, "num_input_tokens_seen": 39567360, @@ -3874,551 +3874,551 @@ }, { "epoch": 0.23241296518607443, - "grad_norm": 0.4917358458042145, + "grad_norm": 0.4927588105201721, "learning_rate": 4.551251337187436e-05, - "loss": 0.8829, + "loss": 0.8828, "num_input_tokens_seen": 39649280, "step": 4840 }, { "epoch": 0.23289315726290516, - "grad_norm": 0.4697302281856537, + "grad_norm": 0.47400715947151184, "learning_rate": 4.548984099185638e-05, - "loss": 1.0275, + "loss": 1.0273, "num_input_tokens_seen": 39731200, "step": 4850 }, { "epoch": 0.23337334933973589, - "grad_norm": 0.7084998488426208, + "grad_norm": 0.5200524926185608, "learning_rate": 4.546711715870227e-05, - "loss": 0.8638, + "loss": 0.8631, "num_input_tokens_seen": 39813120, "step": 4860 }, { "epoch": 0.23385354141656664, - "grad_norm": 0.6179664134979248, + "grad_norm": 0.6111912131309509, "learning_rate": 4.5444341929475064e-05, - "loss": 0.9251, + "loss": 0.9249, "num_input_tokens_seen": 39895040, "step": 4870 }, { "epoch": 0.23433373349339737, - "grad_norm": 0.5251472592353821, + "grad_norm": 0.5096678733825684, "learning_rate": 4.5421515361366854e-05, - "loss": 1.362, + "loss": 1.3656, "num_input_tokens_seen": 39976960, "step": 4880 }, { "epoch": 0.2348139255702281, - "grad_norm": 0.566771924495697, + "grad_norm": 0.6361328959465027, "learning_rate": 4.5398637511698665e-05, - "loss": 0.6601, + "loss": 0.6605, "num_input_tokens_seen": 40058880, "step": 4890 }, { "epoch": 0.23529411764705882, - "grad_norm": 0.5199572443962097, + "grad_norm": 0.505113422870636, "learning_rate": 4.5375708437920284e-05, - "loss": 0.9545, + "loss": 0.9551, "num_input_tokens_seen": 40140800, "step": 4900 }, { "epoch": 0.23577430972388955, - "grad_norm": 0.4832773506641388, + "grad_norm": 0.47399622201919556, "learning_rate": 4.535272819761014e-05, - "loss": 1.0709, + "loss": 1.0701, "num_input_tokens_seen": 40222720, "step": 4910 }, { "epoch": 0.2362545018007203, - "grad_norm": 0.575834333896637, + "grad_norm": 0.5620723366737366, "learning_rate": 4.532969684847514e-05, - "loss": 1.0263, + "loss": 1.026, "num_input_tokens_seen": 40304640, "step": 4920 }, { "epoch": 0.23673469387755103, - "grad_norm": 0.6102591753005981, + "grad_norm": 0.5764886736869812, "learning_rate": 4.530661444835054e-05, - "loss": 0.9399, + "loss": 0.9408, "num_input_tokens_seen": 40386560, "step": 4930 }, { "epoch": 0.23721488595438175, - "grad_norm": 0.4413188099861145, + "grad_norm": 0.4281923174858093, "learning_rate": 4.5283481055199784e-05, - "loss": 1.0041, + "loss": 1.0037, "num_input_tokens_seen": 40468480, "step": 4940 }, { "epoch": 0.23769507803121248, - "grad_norm": 0.5925584435462952, + "grad_norm": 0.5836651921272278, "learning_rate": 4.526029672711437e-05, - "loss": 1.1321, + "loss": 1.1308, "num_input_tokens_seen": 40550400, "step": 4950 }, { "epoch": 0.2381752701080432, - "grad_norm": 1.5264989137649536, + "grad_norm": 1.5391772985458374, "learning_rate": 4.523706152231373e-05, - "loss": 0.9726, + "loss": 0.9735, "num_input_tokens_seen": 40632320, "step": 4960 }, { "epoch": 0.23865546218487396, - "grad_norm": 0.5782877206802368, + "grad_norm": 0.5564135909080505, "learning_rate": 4.5213775499145e-05, - "loss": 1.0489, + "loss": 1.0495, "num_input_tokens_seen": 40714240, "step": 4970 }, { "epoch": 0.2391356542617047, - "grad_norm": 0.48922982811927795, + "grad_norm": 0.49914756417274475, "learning_rate": 4.519043871608297e-05, - "loss": 0.7904, + "loss": 0.7898, "num_input_tokens_seen": 40796160, "step": 4980 }, { "epoch": 0.23961584633853542, - "grad_norm": 0.5028977990150452, + "grad_norm": 0.4968912899494171, "learning_rate": 4.5167051231729894e-05, - "loss": 0.8851, + "loss": 0.8855, "num_input_tokens_seen": 40878080, "step": 4990 }, { "epoch": 0.24009603841536614, - "grad_norm": 0.4978225529193878, + "grad_norm": 0.49146732687950134, "learning_rate": 4.514361310481533e-05, - "loss": 0.9244, + "loss": 0.9249, "num_input_tokens_seen": 40960000, "step": 5000 }, { "epoch": 0.24057623049219687, - "grad_norm": 0.6003936529159546, + "grad_norm": 0.597633957862854, "learning_rate": 4.512012439419601e-05, - "loss": 0.9562, + "loss": 0.9561, "num_input_tokens_seen": 41041920, "step": 5010 }, { "epoch": 0.24105642256902762, - "grad_norm": 0.5342166423797607, + "grad_norm": 0.5319405794143677, "learning_rate": 4.509658515885568e-05, - "loss": 1.0229, + "loss": 1.0238, "num_input_tokens_seen": 41123840, "step": 5020 }, { "epoch": 0.24153661464585835, - "grad_norm": 0.5202224850654602, + "grad_norm": 0.5182055234909058, "learning_rate": 4.5072995457904995e-05, - "loss": 0.8586, + "loss": 0.8584, "num_input_tokens_seen": 41205760, "step": 5030 }, { "epoch": 0.24201680672268908, - "grad_norm": 0.5373672842979431, + "grad_norm": 0.5302344560623169, "learning_rate": 4.50493553505813e-05, - "loss": 0.9188, + "loss": 0.9193, "num_input_tokens_seen": 41287680, "step": 5040 }, { "epoch": 0.2424969987995198, - "grad_norm": 0.5604919195175171, + "grad_norm": 0.5817127823829651, "learning_rate": 4.502566489624855e-05, - "loss": 1.041, + "loss": 1.0409, "num_input_tokens_seen": 41369600, "step": 5050 }, { "epoch": 0.24297719087635053, - "grad_norm": 0.4886492192745209, + "grad_norm": 0.48699209094047546, "learning_rate": 4.50019241543971e-05, - "loss": 0.9901, + "loss": 0.9899, "num_input_tokens_seen": 41451520, "step": 5060 }, { "epoch": 0.24345738295318128, - "grad_norm": 0.5549675822257996, + "grad_norm": 0.5494588017463684, "learning_rate": 4.4978133184643586e-05, - "loss": 0.9472, + "loss": 0.9473, "num_input_tokens_seen": 41533440, "step": 5070 }, { "epoch": 0.243937575030012, - "grad_norm": 0.4997415244579315, + "grad_norm": 0.49519428610801697, "learning_rate": 4.495429204673081e-05, - "loss": 0.9372, + "loss": 0.937, "num_input_tokens_seen": 41615360, "step": 5080 }, { "epoch": 0.24441776710684274, - "grad_norm": 0.5093867182731628, + "grad_norm": 0.5150887966156006, "learning_rate": 4.493040080052752e-05, - "loss": 0.8477, + "loss": 0.8483, "num_input_tokens_seen": 41697280, "step": 5090 }, { "epoch": 0.24489795918367346, - "grad_norm": 0.5322418808937073, + "grad_norm": 0.5311129093170166, "learning_rate": 4.49064595060283e-05, - "loss": 1.0359, + "loss": 1.0361, "num_input_tokens_seen": 41779200, "step": 5100 }, { "epoch": 0.2453781512605042, - "grad_norm": 0.485977441072464, + "grad_norm": 0.4963180124759674, "learning_rate": 4.488246822335341e-05, - "loss": 1.027, + "loss": 1.0274, "num_input_tokens_seen": 41861120, "step": 5110 }, { "epoch": 0.24585834333733494, - "grad_norm": 0.6202290058135986, + "grad_norm": 0.6075377464294434, "learning_rate": 4.485842701274865e-05, - "loss": 1.4544, + "loss": 1.4537, "num_input_tokens_seen": 41943040, "step": 5120 }, { "epoch": 0.24633853541416567, - "grad_norm": 0.5418204665184021, + "grad_norm": 0.48977187275886536, "learning_rate": 4.4834335934585194e-05, - "loss": 0.8381, + "loss": 0.8386, "num_input_tokens_seen": 42024960, "step": 5130 }, { "epoch": 0.2468187274909964, - "grad_norm": 0.5167099237442017, + "grad_norm": 0.5131157040596008, "learning_rate": 4.4810195049359435e-05, - "loss": 0.8446, + "loss": 0.8449, "num_input_tokens_seen": 42106880, "step": 5140 }, { "epoch": 0.24729891956782712, - "grad_norm": 0.5339356064796448, + "grad_norm": 0.5210369229316711, "learning_rate": 4.4786004417692836e-05, - "loss": 0.9213, + "loss": 0.9216, "num_input_tokens_seen": 42188800, "step": 5150 }, { "epoch": 0.24777911164465785, - "grad_norm": 0.5605190396308899, + "grad_norm": 0.575391411781311, "learning_rate": 4.4761764100331795e-05, - "loss": 1.0777, + "loss": 1.0769, "num_input_tokens_seen": 42270720, "step": 5160 }, { "epoch": 0.2482593037214886, - "grad_norm": 0.5921156406402588, + "grad_norm": 0.6365089416503906, "learning_rate": 4.473747415814747e-05, - "loss": 0.8808, + "loss": 0.8806, "num_input_tokens_seen": 42352640, "step": 5170 }, { "epoch": 0.24873949579831933, - "grad_norm": 0.6122276186943054, + "grad_norm": 0.6410951018333435, "learning_rate": 4.471313465213562e-05, - "loss": 1.0989, + "loss": 1.0999, "num_input_tokens_seen": 42434560, "step": 5180 }, { "epoch": 0.24921968787515006, - "grad_norm": 0.618924617767334, + "grad_norm": 0.5982769727706909, "learning_rate": 4.46887456434165e-05, - "loss": 0.8832, + "loss": 0.8831, "num_input_tokens_seen": 42516480, "step": 5190 }, { "epoch": 0.24969987995198079, - "grad_norm": 0.5166974067687988, + "grad_norm": 0.5119848251342773, "learning_rate": 4.466430719323465e-05, - "loss": 0.843, + "loss": 0.8431, "num_input_tokens_seen": 42598400, "step": 5200 }, { "epoch": 0.25018007202881154, - "grad_norm": 0.6116747856140137, + "grad_norm": 0.6475332975387573, "learning_rate": 4.463981936295876e-05, - "loss": 1.096, + "loss": 1.0951, "num_input_tokens_seen": 42680320, "step": 5210 }, { "epoch": 0.25066026410564224, - "grad_norm": 0.4793613851070404, + "grad_norm": 0.4759005010128021, "learning_rate": 4.461528221408153e-05, - "loss": 0.8411, + "loss": 0.8407, "num_input_tokens_seen": 42762240, "step": 5220 }, { "epoch": 0.251140456182473, - "grad_norm": 0.4967063069343567, + "grad_norm": 0.5037009716033936, "learning_rate": 4.459069580821953e-05, - "loss": 0.9483, + "loss": 0.9482, "num_input_tokens_seen": 42844160, "step": 5230 }, { "epoch": 0.25162064825930375, - "grad_norm": 0.4823471009731293, + "grad_norm": 0.4769095778465271, "learning_rate": 4.4566060207112983e-05, - "loss": 0.7803, + "loss": 0.7797, "num_input_tokens_seen": 42926080, "step": 5240 }, { "epoch": 0.25210084033613445, - "grad_norm": 0.5162050127983093, + "grad_norm": 0.5239166021347046, "learning_rate": 4.454137547262566e-05, - "loss": 0.9638, + "loss": 0.9646, "num_input_tokens_seen": 43008000, "step": 5250 }, { "epoch": 0.2525810324129652, - "grad_norm": 0.6449117660522461, + "grad_norm": 0.6174294352531433, "learning_rate": 4.451664166674472e-05, - "loss": 0.9499, + "loss": 0.9503, "num_input_tokens_seen": 43089920, "step": 5260 }, { "epoch": 0.2530612244897959, - "grad_norm": 0.5185222625732422, + "grad_norm": 0.5097090005874634, "learning_rate": 4.449185885158056e-05, - "loss": 0.8689, + "loss": 0.8684, "num_input_tokens_seen": 43171840, "step": 5270 }, { "epoch": 0.25354141656662665, - "grad_norm": 0.5826950669288635, + "grad_norm": 0.5547761917114258, "learning_rate": 4.4467027089366625e-05, - "loss": 0.9125, + "loss": 0.9119, "num_input_tokens_seen": 43253760, "step": 5280 }, { "epoch": 0.2540216086434574, - "grad_norm": 0.6201736330986023, + "grad_norm": 0.6190077662467957, "learning_rate": 4.444214644245928e-05, - "loss": 0.9452, + "loss": 0.9446, "num_input_tokens_seen": 43335680, "step": 5290 }, { "epoch": 0.2545018007202881, - "grad_norm": 0.5122745633125305, + "grad_norm": 0.5137811303138733, "learning_rate": 4.441721697333765e-05, - "loss": 0.8625, + "loss": 0.8618, "num_input_tokens_seen": 43417600, "step": 5300 }, { "epoch": 0.25498199279711886, - "grad_norm": 0.5290250182151794, + "grad_norm": 0.5213572382926941, "learning_rate": 4.4392238744603464e-05, - "loss": 0.8692, + "loss": 0.8695, "num_input_tokens_seen": 43499520, "step": 5310 }, { "epoch": 0.25546218487394956, - "grad_norm": 0.719904363155365, + "grad_norm": 0.7135751843452454, "learning_rate": 4.436721181898088e-05, - "loss": 1.0077, + "loss": 1.0076, "num_input_tokens_seen": 43581440, "step": 5320 }, { "epoch": 0.2559423769507803, - "grad_norm": 0.5093972086906433, + "grad_norm": 0.4903344511985779, "learning_rate": 4.434213625931636e-05, - "loss": 0.9048, + "loss": 0.9059, "num_input_tokens_seen": 43663360, "step": 5330 }, { "epoch": 0.25642256902761107, - "grad_norm": 0.5186138153076172, + "grad_norm": 0.5091985464096069, "learning_rate": 4.431701212857847e-05, - "loss": 0.9174, + "loss": 0.9168, "num_input_tokens_seen": 43745280, "step": 5340 }, { "epoch": 0.25690276110444177, - "grad_norm": 0.5094157457351685, + "grad_norm": 0.5019556879997253, "learning_rate": 4.429183948985777e-05, - "loss": 0.7608, + "loss": 0.7605, "num_input_tokens_seen": 43827200, "step": 5350 }, { "epoch": 0.2573829531812725, - "grad_norm": 0.5557414889335632, + "grad_norm": 0.5584927201271057, "learning_rate": 4.426661840636662e-05, - "loss": 0.8841, + "loss": 0.8832, "num_input_tokens_seen": 43909120, "step": 5360 }, { "epoch": 0.2578631452581032, - "grad_norm": 0.49979162216186523, + "grad_norm": 0.5135828852653503, "learning_rate": 4.424134894143903e-05, - "loss": 1.0389, + "loss": 1.0383, "num_input_tokens_seen": 43991040, "step": 5370 }, { "epoch": 0.258343337334934, - "grad_norm": 0.6658138036727905, + "grad_norm": 0.6649297475814819, "learning_rate": 4.42160311585305e-05, - "loss": 1.0486, + "loss": 1.0498, "num_input_tokens_seen": 44072960, "step": 5380 }, { "epoch": 0.25882352941176473, - "grad_norm": 0.5858299732208252, + "grad_norm": 0.6021849513053894, "learning_rate": 4.419066512121788e-05, - "loss": 0.6945, + "loss": 0.6946, "num_input_tokens_seen": 44154880, "step": 5390 }, { "epoch": 0.25930372148859543, - "grad_norm": 0.5159766674041748, + "grad_norm": 0.5068604946136475, "learning_rate": 4.4165250893199176e-05, - "loss": 0.8972, + "loss": 0.898, "num_input_tokens_seen": 44236800, "step": 5400 }, { "epoch": 0.2597839135654262, - "grad_norm": 0.48362815380096436, + "grad_norm": 0.4720013439655304, "learning_rate": 4.413978853829342e-05, - "loss": 0.8574, + "loss": 0.8575, "num_input_tokens_seen": 44318720, "step": 5410 }, { "epoch": 0.2602641056422569, - "grad_norm": 0.5605273842811584, + "grad_norm": 0.5253759622573853, "learning_rate": 4.411427812044049e-05, - "loss": 0.9521, + "loss": 0.953, "num_input_tokens_seen": 44400640, "step": 5420 }, { "epoch": 0.26074429771908764, - "grad_norm": 0.5811429619789124, + "grad_norm": 0.5874719023704529, "learning_rate": 4.408871970370096e-05, - "loss": 1.0423, + "loss": 1.0408, "num_input_tokens_seen": 44482560, "step": 5430 }, { "epoch": 0.2612244897959184, - "grad_norm": 0.7415308952331543, + "grad_norm": 0.8202329874038696, "learning_rate": 4.406311335225595e-05, - "loss": 0.9938, + "loss": 0.9933, "num_input_tokens_seen": 44564480, "step": 5440 }, { "epoch": 0.2617046818727491, - "grad_norm": 0.6823397874832153, + "grad_norm": 0.6459318995475769, "learning_rate": 4.4037459130406923e-05, - "loss": 0.8559, + "loss": 0.8562, "num_input_tokens_seen": 44646400, "step": 5450 }, { "epoch": 0.26218487394957984, - "grad_norm": 0.5368736386299133, + "grad_norm": 0.5401067733764648, "learning_rate": 4.401175710257558e-05, - "loss": 0.9533, + "loss": 0.9538, "num_input_tokens_seen": 44728320, "step": 5460 }, { "epoch": 0.26266506602641054, - "grad_norm": 0.5195080637931824, + "grad_norm": 0.515424370765686, "learning_rate": 4.398600733330365e-05, - "loss": 0.8958, + "loss": 0.8955, "num_input_tokens_seen": 44810240, "step": 5470 }, { "epoch": 0.2631452581032413, - "grad_norm": 0.5098795890808105, + "grad_norm": 0.5169644951820374, "learning_rate": 4.3960209887252766e-05, - "loss": 1.1406, + "loss": 1.1404, "num_input_tokens_seen": 44892160, "step": 5480 }, { "epoch": 0.26362545018007205, - "grad_norm": 0.5225921273231506, + "grad_norm": 0.5176170468330383, "learning_rate": 4.3934364829204265e-05, - "loss": 1.0363, + "loss": 1.0361, "num_input_tokens_seen": 44974080, "step": 5490 }, { "epoch": 0.26410564225690275, - "grad_norm": 0.5075384378433228, + "grad_norm": 0.520937442779541, "learning_rate": 4.3908472224059064e-05, - "loss": 0.876, + "loss": 0.8763, "num_input_tokens_seen": 45056000, "step": 5500 }, { "epoch": 0.2645858343337335, - "grad_norm": 0.5186338424682617, + "grad_norm": 0.5109109878540039, "learning_rate": 4.388253213683747e-05, - "loss": 0.8846, + "loss": 0.8841, "num_input_tokens_seen": 45137920, "step": 5510 }, { "epoch": 0.2650660264105642, - "grad_norm": 0.5157744884490967, + "grad_norm": 0.525878369808197, "learning_rate": 4.385654463267901e-05, "loss": 0.841, "num_input_tokens_seen": 45219840, @@ -4426,127 +4426,127 @@ }, { "epoch": 0.26554621848739496, - "grad_norm": 0.49746039509773254, + "grad_norm": 0.49905362725257874, "learning_rate": 4.383050977684231e-05, - "loss": 1.1015, + "loss": 1.1014, "num_input_tokens_seen": 45301760, "step": 5530 }, { "epoch": 0.2660264105642257, - "grad_norm": 0.5038164258003235, + "grad_norm": 0.5063333511352539, "learning_rate": 4.3804427634704885e-05, - "loss": 0.7614, + "loss": 0.7611, "num_input_tokens_seen": 45383680, "step": 5540 }, { "epoch": 0.2665066026410564, - "grad_norm": 0.4780644476413727, + "grad_norm": 0.48698627948760986, "learning_rate": 4.3778298271762995e-05, - "loss": 0.8541, + "loss": 0.854, "num_input_tokens_seen": 45465600, "step": 5550 }, { "epoch": 0.26698679471788717, - "grad_norm": 0.49261558055877686, + "grad_norm": 0.4989083409309387, "learning_rate": 4.375212175363149e-05, - "loss": 0.9464, + "loss": 0.9457, "num_input_tokens_seen": 45547520, "step": 5560 }, { "epoch": 0.26746698679471786, - "grad_norm": 0.256661593914032, + "grad_norm": 0.2386409342288971, "learning_rate": 4.372589814604362e-05, - "loss": 0.8231, + "loss": 0.8236, "num_input_tokens_seen": 45629440, "step": 5570 }, { "epoch": 0.2679471788715486, - "grad_norm": 0.7495452761650085, + "grad_norm": 0.7750229239463806, "learning_rate": 4.369962751485089e-05, - "loss": 0.8247, + "loss": 0.8256, "num_input_tokens_seen": 45711360, "step": 5580 }, { "epoch": 0.2684273709483794, - "grad_norm": 0.6870756149291992, + "grad_norm": 0.8927270174026489, "learning_rate": 4.367330992602289e-05, - "loss": 0.7793, + "loss": 0.7786, "num_input_tokens_seen": 45793280, "step": 5590 }, { "epoch": 0.2689075630252101, - "grad_norm": 0.5133636593818665, + "grad_norm": 0.5122426748275757, "learning_rate": 4.3646945445647114e-05, - "loss": 0.7808, + "loss": 0.7805, "num_input_tokens_seen": 45875200, "step": 5600 }, { "epoch": 0.2693877551020408, - "grad_norm": 0.48444893956184387, + "grad_norm": 0.47932419180870056, "learning_rate": 4.362053413992883e-05, - "loss": 1.023, + "loss": 1.0227, "num_input_tokens_seen": 45957120, "step": 5610 }, { "epoch": 0.2698679471788715, - "grad_norm": 0.4823382794857025, + "grad_norm": 0.47927728295326233, "learning_rate": 4.359407607519088e-05, - "loss": 0.7344, + "loss": 0.7345, "num_input_tokens_seen": 46039040, "step": 5620 }, { "epoch": 0.2703481392557023, - "grad_norm": 0.4958154261112213, + "grad_norm": 0.5311817526817322, "learning_rate": 4.356757131787353e-05, - "loss": 0.9653, + "loss": 0.9656, "num_input_tokens_seen": 46120960, "step": 5630 }, { "epoch": 0.27082833133253303, - "grad_norm": 2.4843976497650146, + "grad_norm": 2.499256134033203, "learning_rate": 4.354101993453429e-05, - "loss": 0.8851, + "loss": 0.8838, "num_input_tokens_seen": 46202880, "step": 5640 }, { "epoch": 0.27130852340936373, - "grad_norm": 0.5063963532447815, + "grad_norm": 0.5061020851135254, "learning_rate": 4.3514421991847746e-05, - "loss": 0.8417, + "loss": 0.8414, "num_input_tokens_seen": 46284800, "step": 5650 }, { "epoch": 0.2717887154861945, - "grad_norm": 0.5009209513664246, + "grad_norm": 0.49775904417037964, "learning_rate": 4.3487777556605446e-05, - "loss": 0.8548, + "loss": 0.8546, "num_input_tokens_seen": 46366720, "step": 5660 }, { "epoch": 0.2722689075630252, - "grad_norm": 0.7282830476760864, + "grad_norm": 0.7386242747306824, "learning_rate": 4.3461086695715625e-05, - "loss": 1.0355, + "loss": 1.0363, "num_input_tokens_seen": 46448640, "step": 5670 }, { "epoch": 0.27274909963985594, - "grad_norm": 0.5339224934577942, + "grad_norm": 0.5468425750732422, "learning_rate": 4.343434947620315e-05, "loss": 0.799, "num_input_tokens_seen": 46530560, @@ -4554,15 +4554,15 @@ }, { "epoch": 0.2732292917166867, - "grad_norm": 0.5233862400054932, + "grad_norm": 0.5276323556900024, "learning_rate": 4.340756596520929e-05, - "loss": 0.9427, + "loss": 0.9422, "num_input_tokens_seen": 46612480, "step": 5690 }, { "epoch": 0.2737094837935174, - "grad_norm": 0.487689733505249, + "grad_norm": 0.4901086688041687, "learning_rate": 4.338073622999154e-05, "loss": 0.904, "num_input_tokens_seen": 46694400, @@ -4570,31 +4570,31 @@ }, { "epoch": 0.27418967587034815, - "grad_norm": 0.5029451251029968, + "grad_norm": 0.5071278214454651, "learning_rate": 4.335386033792347e-05, - "loss": 1.2054, + "loss": 1.2078, "num_input_tokens_seen": 46776320, "step": 5710 }, { "epoch": 0.27466986794717885, - "grad_norm": 0.5146957635879517, + "grad_norm": 0.5222769379615784, "learning_rate": 4.332693835649461e-05, - "loss": 0.8869, + "loss": 0.887, "num_input_tokens_seen": 46858240, "step": 5720 }, { "epoch": 0.2751500600240096, - "grad_norm": 0.5206642150878906, + "grad_norm": 0.5225254893302917, "learning_rate": 4.329997035331015e-05, - "loss": 0.9947, + "loss": 0.9954, "num_input_tokens_seen": 46940160, "step": 5730 }, { "epoch": 0.27563025210084036, - "grad_norm": 0.5068042278289795, + "grad_norm": 0.5147578716278076, "learning_rate": 4.3272956396090906e-05, "loss": 0.8496, "num_input_tokens_seen": 47022080, @@ -4602,31 +4602,31 @@ }, { "epoch": 0.27611044417767105, - "grad_norm": 0.5032111406326294, + "grad_norm": 0.5081062316894531, "learning_rate": 4.324589655267306e-05, - "loss": 0.9618, + "loss": 0.9616, "num_input_tokens_seen": 47104000, "step": 5750 }, { "epoch": 0.2765906362545018, - "grad_norm": 0.6475026607513428, + "grad_norm": 0.6242536306381226, "learning_rate": 4.321879089100805e-05, - "loss": 1.065, + "loss": 1.0649, "num_input_tokens_seen": 47185920, "step": 5760 }, { "epoch": 0.2770708283313325, - "grad_norm": 0.9474783539772034, + "grad_norm": 0.9209274053573608, "learning_rate": 4.319163947916234e-05, - "loss": 0.9001, + "loss": 0.9002, "num_input_tokens_seen": 47267840, "step": 5770 }, { "epoch": 0.27755102040816326, - "grad_norm": 0.48900583386421204, + "grad_norm": 0.49128785729408264, "learning_rate": 4.316444238531729e-05, "loss": 0.9649, "num_input_tokens_seen": 47349760, @@ -4634,263 +4634,263 @@ }, { "epoch": 0.278031212484994, - "grad_norm": 0.5249919891357422, + "grad_norm": 0.5176426768302917, "learning_rate": 4.313719967776899e-05, - "loss": 0.9746, + "loss": 0.9747, "num_input_tokens_seen": 47431680, "step": 5790 }, { "epoch": 0.2785114045618247, - "grad_norm": 0.5116479992866516, + "grad_norm": 0.5055992603302002, "learning_rate": 4.310991142492805e-05, - "loss": 1.0193, + "loss": 1.0223, "num_input_tokens_seen": 47513600, "step": 5800 }, { "epoch": 0.27899159663865547, - "grad_norm": 0.4895400404930115, + "grad_norm": 0.4892292022705078, "learning_rate": 4.308257769531947e-05, - "loss": 0.9313, + "loss": 0.9318, "num_input_tokens_seen": 47595520, "step": 5810 }, { "epoch": 0.27947178871548617, - "grad_norm": 0.5116353034973145, + "grad_norm": 0.51258385181427, "learning_rate": 4.3055198557582445e-05, - "loss": 1.0058, + "loss": 1.0057, "num_input_tokens_seen": 47677440, "step": 5820 }, { "epoch": 0.2799519807923169, - "grad_norm": 0.6602368354797363, + "grad_norm": 0.6507702469825745, "learning_rate": 4.3027774080470174e-05, - "loss": 1.1261, + "loss": 1.126, "num_input_tokens_seen": 47759360, "step": 5830 }, { "epoch": 0.2804321728691477, - "grad_norm": 0.5150447487831116, + "grad_norm": 0.5292423367500305, "learning_rate": 4.300030433284974e-05, - "loss": 0.8111, + "loss": 0.8118, "num_input_tokens_seen": 47841280, "step": 5840 }, { "epoch": 0.2809123649459784, - "grad_norm": 0.49057701230049133, + "grad_norm": 0.499051958322525, "learning_rate": 4.29727893837019e-05, - "loss": 0.7262, + "loss": 0.7263, "num_input_tokens_seen": 47923200, "step": 5850 }, { "epoch": 0.28139255702280913, - "grad_norm": 0.5138354897499084, + "grad_norm": 0.5214549899101257, "learning_rate": 4.294522930212091e-05, - "loss": 0.809, + "loss": 0.8095, "num_input_tokens_seen": 48005120, "step": 5860 }, { "epoch": 0.28187274909963983, - "grad_norm": 0.5236837267875671, + "grad_norm": 0.5287784337997437, "learning_rate": 4.291762415731437e-05, - "loss": 0.9162, + "loss": 0.9164, "num_input_tokens_seen": 48087040, "step": 5870 }, { "epoch": 0.2823529411764706, - "grad_norm": 0.5018298625946045, + "grad_norm": 0.5037810802459717, "learning_rate": 4.288997401860303e-05, - "loss": 1.0343, + "loss": 1.0349, "num_input_tokens_seen": 48168960, "step": 5880 }, { "epoch": 0.28283313325330134, - "grad_norm": 0.4813729524612427, + "grad_norm": 0.48096582293510437, "learning_rate": 4.286227895542064e-05, - "loss": 1.36, + "loss": 1.3606, "num_input_tokens_seen": 48250880, "step": 5890 }, { "epoch": 0.28331332533013204, - "grad_norm": 0.44931960105895996, + "grad_norm": 0.4305453300476074, "learning_rate": 4.283453903731375e-05, - "loss": 0.8472, + "loss": 0.8476, "num_input_tokens_seen": 48332800, "step": 5900 }, { "epoch": 0.2837935174069628, - "grad_norm": 0.5551528930664062, + "grad_norm": 0.5652824640274048, "learning_rate": 4.2806754333941546e-05, - "loss": 1.1295, + "loss": 1.1286, "num_input_tokens_seen": 48414720, "step": 5910 }, { "epoch": 0.2842737094837935, - "grad_norm": 0.6064106822013855, + "grad_norm": 0.587988018989563, "learning_rate": 4.2778924915075704e-05, - "loss": 0.9477, + "loss": 0.9471, "num_input_tokens_seen": 48496640, "step": 5920 }, { "epoch": 0.28475390156062425, - "grad_norm": 0.48876288533210754, + "grad_norm": 0.48125171661376953, "learning_rate": 4.275105085060014e-05, - "loss": 0.93, + "loss": 0.9296, "num_input_tokens_seen": 48578560, "step": 5930 }, { "epoch": 0.285234093637455, - "grad_norm": 0.4730078876018524, + "grad_norm": 0.4742906093597412, "learning_rate": 4.272313221051094e-05, - "loss": 1.0398, + "loss": 1.0404, "num_input_tokens_seen": 48660480, "step": 5940 }, { "epoch": 0.2857142857142857, - "grad_norm": 0.40384843945503235, + "grad_norm": 0.4079858064651489, "learning_rate": 4.269516906491607e-05, - "loss": 0.9514, + "loss": 0.9517, "num_input_tokens_seen": 48742400, "step": 5950 }, { "epoch": 0.28619447779111645, - "grad_norm": 0.5116029381752014, + "grad_norm": 0.5222288966178894, "learning_rate": 4.266716148403529e-05, - "loss": 0.7283, + "loss": 0.7284, "num_input_tokens_seen": 48824320, "step": 5960 }, { "epoch": 0.28667466986794715, - "grad_norm": 0.5653484463691711, + "grad_norm": 0.5701093673706055, "learning_rate": 4.263910953819993e-05, - "loss": 0.9808, + "loss": 0.98, "num_input_tokens_seen": 48906240, "step": 5970 }, { "epoch": 0.2871548619447779, - "grad_norm": 0.4929262697696686, + "grad_norm": 0.48784807324409485, "learning_rate": 4.2611013297852744e-05, - "loss": 0.9758, + "loss": 0.9762, "num_input_tokens_seen": 48988160, "step": 5980 }, { "epoch": 0.28763505402160866, - "grad_norm": 0.5140427947044373, + "grad_norm": 0.5211770534515381, "learning_rate": 4.2582872833547693e-05, - "loss": 0.9226, + "loss": 0.9233, "num_input_tokens_seen": 49070080, "step": 5990 }, { "epoch": 0.28811524609843936, - "grad_norm": 0.4957669973373413, + "grad_norm": 0.49390462040901184, "learning_rate": 4.255468821594981e-05, - "loss": 1.0878, + "loss": 1.0877, "num_input_tokens_seen": 49152000, "step": 6000 }, { "epoch": 0.2885954381752701, - "grad_norm": 0.5024486184120178, + "grad_norm": 0.5107914209365845, "learning_rate": 4.2526459515834996e-05, - "loss": 0.917, + "loss": 0.9164, "num_input_tokens_seen": 49233920, "step": 6010 }, { "epoch": 0.28907563025210087, - "grad_norm": 0.5399232506752014, + "grad_norm": 0.5360170006752014, "learning_rate": 4.249818680408984e-05, - "loss": 1.0098, + "loss": 1.01, "num_input_tokens_seen": 49315840, "step": 6020 }, { "epoch": 0.28955582232893157, - "grad_norm": 0.5091607570648193, + "grad_norm": 0.5045222043991089, "learning_rate": 4.246987015171148e-05, - "loss": 1.0573, + "loss": 1.057, "num_input_tokens_seen": 49397760, "step": 6030 }, { "epoch": 0.2900360144057623, - "grad_norm": 0.48257771134376526, + "grad_norm": 0.48786187171936035, "learning_rate": 4.244150962980735e-05, - "loss": 1.1285, + "loss": 1.1284, "num_input_tokens_seen": 49479680, "step": 6040 }, { "epoch": 0.290516206482593, - "grad_norm": 0.7823185324668884, + "grad_norm": 0.7572475075721741, "learning_rate": 4.2413105309595105e-05, - "loss": 0.9014, + "loss": 0.9011, "num_input_tokens_seen": 49561600, "step": 6050 }, { "epoch": 0.2909963985594238, - "grad_norm": 0.49261143803596497, + "grad_norm": 0.48458683490753174, "learning_rate": 4.238465726240233e-05, - "loss": 0.8398, + "loss": 0.8397, "num_input_tokens_seen": 49643520, "step": 6060 }, { "epoch": 0.29147659063625453, - "grad_norm": 0.5321176648139954, + "grad_norm": 0.5217098593711853, "learning_rate": 4.235616555966645e-05, - "loss": 0.879, + "loss": 0.8792, "num_input_tokens_seen": 49725440, "step": 6070 }, { "epoch": 0.29195678271308523, - "grad_norm": 0.5565729737281799, + "grad_norm": 0.5514656901359558, "learning_rate": 4.232763027293451e-05, - "loss": 0.7561, + "loss": 0.7543, "num_input_tokens_seen": 49807360, "step": 6080 }, { "epoch": 0.292436974789916, - "grad_norm": 0.6039222478866577, + "grad_norm": 0.6352956891059875, "learning_rate": 4.2299051473862976e-05, - "loss": 1.0367, + "loss": 1.0364, "num_input_tokens_seen": 49889280, "step": 6090 }, { "epoch": 0.2929171668667467, - "grad_norm": 0.38333648443222046, + "grad_norm": 0.42271411418914795, "learning_rate": 4.227042923421762e-05, - "loss": 0.979, + "loss": 0.9792, "num_input_tokens_seen": 49971200, "step": 6100 }, { "epoch": 0.29339735894357744, - "grad_norm": 0.5529670119285583, + "grad_norm": 0.5425634980201721, "learning_rate": 4.224176362587326e-05, "loss": 0.9951, "num_input_tokens_seen": 50053120, @@ -4898,31 +4898,31 @@ }, { "epoch": 0.2938775510204082, - "grad_norm": 0.6076828241348267, + "grad_norm": 0.6409218907356262, "learning_rate": 4.221305472081365e-05, - "loss": 0.8707, + "loss": 0.8709, "num_input_tokens_seen": 50135040, "step": 6120 }, { "epoch": 0.2943577430972389, - "grad_norm": 0.49703189730644226, + "grad_norm": 0.4920530617237091, "learning_rate": 4.2184302591131264e-05, - "loss": 0.9571, + "loss": 0.9572, "num_input_tokens_seen": 50216960, "step": 6130 }, { "epoch": 0.29483793517406964, - "grad_norm": 0.5342078804969788, + "grad_norm": 0.5330979228019714, "learning_rate": 4.21555073090271e-05, - "loss": 1.0041, + "loss": 1.0038, "num_input_tokens_seen": 50298880, "step": 6140 }, { "epoch": 0.29531812725090034, - "grad_norm": 0.6617572903633118, + "grad_norm": 0.5953214168548584, "learning_rate": 4.2126668946810545e-05, "loss": 1.1052, "num_input_tokens_seen": 50380800, @@ -4930,55 +4930,55 @@ }, { "epoch": 0.2957983193277311, - "grad_norm": 0.5139254331588745, + "grad_norm": 0.5212881565093994, "learning_rate": 4.2097787576899144e-05, - "loss": 0.7878, + "loss": 0.788, "num_input_tokens_seen": 50462720, "step": 6160 }, { "epoch": 0.29627851140456185, - "grad_norm": 0.5757784247398376, + "grad_norm": 0.5825909972190857, "learning_rate": 4.2068863271818455e-05, - "loss": 0.8359, + "loss": 0.8364, "num_input_tokens_seen": 50544640, "step": 6170 }, { "epoch": 0.29675870348139255, - "grad_norm": 0.6105610132217407, + "grad_norm": 0.5860311985015869, "learning_rate": 4.2039896104201844e-05, - "loss": 0.7906, + "loss": 0.7908, "num_input_tokens_seen": 50626560, "step": 6180 }, { "epoch": 0.2972388955582233, - "grad_norm": 0.5291628241539001, + "grad_norm": 0.5145933032035828, "learning_rate": 4.201088614679032e-05, - "loss": 0.8226, + "loss": 0.8223, "num_input_tokens_seen": 50708480, "step": 6190 }, { "epoch": 0.297719087635054, - "grad_norm": 0.5184271335601807, + "grad_norm": 0.520693302154541, "learning_rate": 4.198183347243233e-05, - "loss": 0.922, + "loss": 0.9225, "num_input_tokens_seen": 50790400, "step": 6200 }, { "epoch": 0.29819927971188476, - "grad_norm": 0.5568735599517822, + "grad_norm": 0.5607156157493591, "learning_rate": 4.1952738154083614e-05, - "loss": 0.8489, + "loss": 0.8484, "num_input_tokens_seen": 50872320, "step": 6210 }, { "epoch": 0.2986794717887155, - "grad_norm": 0.5584334135055542, + "grad_norm": 0.5793859362602234, "learning_rate": 4.1923600264806975e-05, "loss": 0.9279, "num_input_tokens_seen": 50954240, @@ -4986,7 +4986,7 @@ }, { "epoch": 0.2991596638655462, - "grad_norm": 1.1234581470489502, + "grad_norm": 0.9440059065818787, "learning_rate": 4.189441987777212e-05, "loss": 0.8793, "num_input_tokens_seen": 51036160, @@ -4994,111 +4994,111 @@ }, { "epoch": 0.29963985594237696, - "grad_norm": 0.5137265920639038, + "grad_norm": 0.542341947555542, "learning_rate": 4.186519706625549e-05, - "loss": 0.8057, + "loss": 0.809, "num_input_tokens_seen": 51118080, "step": 6240 }, { "epoch": 0.30012004801920766, - "grad_norm": 0.5186243057250977, + "grad_norm": 0.5198069214820862, "learning_rate": 4.1835931903640046e-05, - "loss": 1.0145, + "loss": 1.015, "num_input_tokens_seen": 51200000, "step": 6250 }, { "epoch": 0.3006002400960384, - "grad_norm": 0.5995609164237976, + "grad_norm": 0.5908818244934082, "learning_rate": 4.180662446341511e-05, - "loss": 0.9712, + "loss": 0.971, "num_input_tokens_seen": 51281920, "step": 6260 }, { "epoch": 0.3010804321728692, - "grad_norm": 0.5310963988304138, + "grad_norm": 0.5370185971260071, "learning_rate": 4.1777274819176154e-05, - "loss": 0.9031, + "loss": 0.9027, "num_input_tokens_seen": 51363840, "step": 6270 }, { "epoch": 0.30156062424969987, - "grad_norm": 0.536044716835022, + "grad_norm": 0.5164900422096252, "learning_rate": 4.1747883044624644e-05, - "loss": 0.9543, + "loss": 0.9546, "num_input_tokens_seen": 51445760, "step": 6280 }, { "epoch": 0.3020408163265306, - "grad_norm": 0.5187882781028748, + "grad_norm": 0.5132122039794922, "learning_rate": 4.171844921356784e-05, - "loss": 1.0411, + "loss": 1.041, "num_input_tokens_seen": 51527680, "step": 6290 }, { "epoch": 0.3025210084033613, - "grad_norm": 0.526749312877655, + "grad_norm": 0.5190037488937378, "learning_rate": 4.168897339991861e-05, - "loss": 1.0103, + "loss": 1.0105, "num_input_tokens_seen": 51609600, "step": 6300 }, { "epoch": 0.3030012004801921, - "grad_norm": 0.5082546472549438, + "grad_norm": 0.5217801332473755, "learning_rate": 4.1659455677695245e-05, - "loss": 1.0368, + "loss": 1.0367, "num_input_tokens_seen": 51691520, "step": 6310 }, { "epoch": 0.30348139255702283, - "grad_norm": 0.513116180896759, + "grad_norm": 0.4742312729358673, "learning_rate": 4.162989612102128e-05, - "loss": 0.6589, + "loss": 0.6583, "num_input_tokens_seen": 51773440, "step": 6320 }, { "epoch": 0.30396158463385353, - "grad_norm": 0.5046403408050537, + "grad_norm": 0.5095861554145813, "learning_rate": 4.160029480412529e-05, - "loss": 0.9936, + "loss": 0.9942, "num_input_tokens_seen": 51855360, "step": 6330 }, { "epoch": 0.3044417767106843, - "grad_norm": 0.6154318451881409, + "grad_norm": 0.6195290088653564, "learning_rate": 4.1570651801340735e-05, - "loss": 0.8154, + "loss": 0.8157, "num_input_tokens_seen": 51937280, "step": 6340 }, { "epoch": 0.304921968787515, - "grad_norm": 0.5283302664756775, + "grad_norm": 0.5015403032302856, "learning_rate": 4.1540967187105753e-05, - "loss": 1.0798, + "loss": 1.0806, "num_input_tokens_seen": 52019200, "step": 6350 }, { "epoch": 0.30540216086434574, - "grad_norm": 0.5067933797836304, + "grad_norm": 0.5029851794242859, "learning_rate": 4.151124103596295e-05, - "loss": 0.8155, + "loss": 0.815, "num_input_tokens_seen": 52101120, "step": 6360 }, { "epoch": 0.3058823529411765, - "grad_norm": 0.4994664788246155, + "grad_norm": 0.4925341308116913, "learning_rate": 4.148147342255926e-05, "loss": 0.9705, "num_input_tokens_seen": 52183040, @@ -5106,95 +5106,95 @@ }, { "epoch": 0.3063625450180072, - "grad_norm": 0.4871218800544739, + "grad_norm": 0.4862985610961914, "learning_rate": 4.145166442164573e-05, - "loss": 0.849, + "loss": 0.8492, "num_input_tokens_seen": 52264960, "step": 6380 }, { "epoch": 0.30684273709483795, - "grad_norm": 0.5769742727279663, + "grad_norm": 0.5730807781219482, "learning_rate": 4.142181410807735e-05, - "loss": 0.9561, + "loss": 0.9553, "num_input_tokens_seen": 52346880, "step": 6390 }, { "epoch": 0.30732292917166865, - "grad_norm": 0.8338456153869629, + "grad_norm": 0.8465107083320618, "learning_rate": 4.1391922556812815e-05, - "loss": 0.9117, + "loss": 0.9121, "num_input_tokens_seen": 52428800, "step": 6400 }, { "epoch": 0.3078031212484994, - "grad_norm": 0.521043062210083, + "grad_norm": 0.5357550382614136, "learning_rate": 4.136198984291442e-05, - "loss": 0.8657, + "loss": 0.8654, "num_input_tokens_seen": 52510720, "step": 6410 }, { "epoch": 0.30828331332533015, - "grad_norm": 0.4783630073070526, + "grad_norm": 0.47538501024246216, "learning_rate": 4.133201604154779e-05, - "loss": 0.9356, + "loss": 0.9387, "num_input_tokens_seen": 52592640, "step": 6420 }, { "epoch": 0.30876350540216085, - "grad_norm": 0.6084679961204529, + "grad_norm": 0.6129695773124695, "learning_rate": 4.1302001227981765e-05, - "loss": 1.0838, + "loss": 1.084, "num_input_tokens_seen": 52674560, "step": 6430 }, { "epoch": 0.3092436974789916, - "grad_norm": 0.5003706812858582, + "grad_norm": 0.5037487149238586, "learning_rate": 4.1271945477588126e-05, - "loss": 0.7471, + "loss": 0.7468, "num_input_tokens_seen": 52756480, "step": 6440 }, { "epoch": 0.3097238895558223, - "grad_norm": 0.5811577439308167, + "grad_norm": 0.5857306122779846, "learning_rate": 4.124184886584148e-05, - "loss": 0.9374, + "loss": 0.9378, "num_input_tokens_seen": 52838400, "step": 6450 }, { "epoch": 0.31020408163265306, - "grad_norm": 0.5085806250572205, + "grad_norm": 0.5044699311256409, "learning_rate": 4.121171146831905e-05, - "loss": 1.0186, + "loss": 1.0187, "num_input_tokens_seen": 52920320, "step": 6460 }, { "epoch": 0.3106842737094838, - "grad_norm": 0.5559006333351135, + "grad_norm": 0.5601541996002197, "learning_rate": 4.118153336070045e-05, - "loss": 1.1422, + "loss": 1.1429, "num_input_tokens_seen": 53002240, "step": 6470 }, { "epoch": 0.3111644657863145, - "grad_norm": 0.7885730266571045, + "grad_norm": 0.7909272909164429, "learning_rate": 4.115131461876756e-05, - "loss": 1.0219, + "loss": 1.0201, "num_input_tokens_seen": 53084160, "step": 6480 }, { "epoch": 0.31164465786314527, - "grad_norm": 0.5197670459747314, + "grad_norm": 0.5243809223175049, "learning_rate": 4.1121055318404264e-05, "loss": 0.9806, "num_input_tokens_seen": 53166080, @@ -5202,455 +5202,455 @@ }, { "epoch": 0.31212484993997597, - "grad_norm": 0.4977589249610901, + "grad_norm": 0.4948495328426361, "learning_rate": 4.109075553559633e-05, - "loss": 1.1074, + "loss": 1.1109, "num_input_tokens_seen": 53248000, "step": 6500 }, { "epoch": 0.3126050420168067, - "grad_norm": 0.46243059635162354, + "grad_norm": 0.4655405879020691, "learning_rate": 4.1060415346431134e-05, - "loss": 0.93, + "loss": 0.9303, "num_input_tokens_seen": 53329920, "step": 6510 }, { "epoch": 0.3130852340936375, - "grad_norm": 0.6155077219009399, + "grad_norm": 0.621785581111908, "learning_rate": 4.103003482709758e-05, - "loss": 0.9669, + "loss": 0.9677, "num_input_tokens_seen": 53411840, "step": 6520 }, { "epoch": 0.3135654261704682, - "grad_norm": 0.6652863025665283, + "grad_norm": 0.6690912246704102, "learning_rate": 4.0999614053885795e-05, - "loss": 0.7738, + "loss": 0.7746, "num_input_tokens_seen": 53493760, "step": 6530 }, { "epoch": 0.31404561824729893, - "grad_norm": 0.538733184337616, + "grad_norm": 0.5284923911094666, "learning_rate": 4.096915310318702e-05, - "loss": 0.8493, + "loss": 0.8491, "num_input_tokens_seen": 53575680, "step": 6540 }, { "epoch": 0.31452581032412963, - "grad_norm": 0.5088375210762024, + "grad_norm": 0.5016783475875854, "learning_rate": 4.093865205149337e-05, - "loss": 0.9464, + "loss": 0.9473, "num_input_tokens_seen": 53657600, "step": 6550 }, { "epoch": 0.3150060024009604, - "grad_norm": 0.5455616116523743, + "grad_norm": 0.5532281398773193, "learning_rate": 4.090811097539768e-05, - "loss": 0.7853, + "loss": 0.7859, "num_input_tokens_seen": 53739520, "step": 6560 }, { "epoch": 0.31548619447779114, - "grad_norm": 0.5154474377632141, + "grad_norm": 0.47948014736175537, "learning_rate": 4.087752995159327e-05, - "loss": 0.764, + "loss": 0.7647, "num_input_tokens_seen": 53821440, "step": 6570 }, { "epoch": 0.31596638655462184, - "grad_norm": 0.5074660181999207, + "grad_norm": 0.5162191390991211, "learning_rate": 4.084690905687379e-05, - "loss": 0.9871, + "loss": 0.988, "num_input_tokens_seen": 53903360, "step": 6580 }, { "epoch": 0.3164465786314526, - "grad_norm": 0.510628879070282, + "grad_norm": 0.529799222946167, "learning_rate": 4.0816248368133016e-05, - "loss": 1.1464, + "loss": 1.148, "num_input_tokens_seen": 53985280, "step": 6590 }, { "epoch": 0.3169267707082833, - "grad_norm": 0.6401832699775696, + "grad_norm": 0.701400876045227, "learning_rate": 4.078554796236462e-05, - "loss": 0.7681, + "loss": 0.7685, "num_input_tokens_seen": 54067200, "step": 6600 }, { "epoch": 0.31740696278511404, - "grad_norm": 0.5684659481048584, + "grad_norm": 0.5586780309677124, "learning_rate": 4.0754807916662055e-05, - "loss": 1.0146, + "loss": 1.0151, "num_input_tokens_seen": 54149120, "step": 6610 }, { "epoch": 0.3178871548619448, - "grad_norm": 0.5699182152748108, + "grad_norm": 0.5554243922233582, "learning_rate": 4.072402830821829e-05, - "loss": 0.9229, + "loss": 0.9232, "num_input_tokens_seen": 54231040, "step": 6620 }, { "epoch": 0.3183673469387755, - "grad_norm": 0.5109874606132507, + "grad_norm": 0.5143104195594788, "learning_rate": 4.069320921432564e-05, - "loss": 0.9067, + "loss": 0.907, "num_input_tokens_seen": 54312960, "step": 6630 }, { "epoch": 0.31884753901560625, - "grad_norm": 0.498054563999176, + "grad_norm": 0.49600183963775635, "learning_rate": 4.066235071237559e-05, - "loss": 0.8371, + "loss": 0.837, "num_input_tokens_seen": 54394880, "step": 6640 }, { "epoch": 0.31932773109243695, - "grad_norm": 0.4968014061450958, + "grad_norm": 0.5070446729660034, "learning_rate": 4.0631452879858565e-05, - "loss": 0.9809, + "loss": 0.9814, "num_input_tokens_seen": 54476800, "step": 6650 }, { "epoch": 0.3198079231692677, - "grad_norm": 0.49730050563812256, + "grad_norm": 0.5054400563240051, "learning_rate": 4.0600515794363774e-05, - "loss": 0.9101, + "loss": 0.9107, "num_input_tokens_seen": 54558720, "step": 6660 }, { "epoch": 0.32028811524609846, - "grad_norm": 0.5172863006591797, + "grad_norm": 0.5172276496887207, "learning_rate": 4.0569539533578985e-05, - "loss": 0.9802, + "loss": 0.9817, "num_input_tokens_seen": 54640640, "step": 6670 }, { "epoch": 0.32076830732292916, - "grad_norm": 0.5139868259429932, + "grad_norm": 0.5226202607154846, "learning_rate": 4.053852417529035e-05, - "loss": 0.7294, + "loss": 0.7322, "num_input_tokens_seen": 54722560, "step": 6680 }, { "epoch": 0.3212484993997599, - "grad_norm": 0.5264905691146851, + "grad_norm": 0.5340245962142944, "learning_rate": 4.050746979738218e-05, - "loss": 0.9096, + "loss": 0.9097, "num_input_tokens_seen": 54804480, "step": 6690 }, { "epoch": 0.3217286914765906, - "grad_norm": 0.5231399536132812, + "grad_norm": 0.5202445387840271, "learning_rate": 4.047637647783681e-05, - "loss": 1.0167, + "loss": 1.0164, "num_input_tokens_seen": 54886400, "step": 6700 }, { "epoch": 0.32220888355342137, - "grad_norm": 0.6286957859992981, + "grad_norm": 0.5725679993629456, "learning_rate": 4.044524429473431e-05, - "loss": 0.8882, + "loss": 0.888, "num_input_tokens_seen": 54968320, "step": 6710 }, { "epoch": 0.3226890756302521, - "grad_norm": 0.5430505275726318, + "grad_norm": 0.5457696914672852, "learning_rate": 4.041407332625238e-05, - "loss": 0.8349, + "loss": 0.8352, "num_input_tokens_seen": 55050240, "step": 6720 }, { "epoch": 0.3231692677070828, - "grad_norm": 0.5100399851799011, + "grad_norm": 0.5226444602012634, "learning_rate": 4.038286365066613e-05, - "loss": 0.8427, + "loss": 0.843, "num_input_tokens_seen": 55132160, "step": 6730 }, { "epoch": 0.3236494597839136, - "grad_norm": 0.48778000473976135, + "grad_norm": 0.48937085270881653, "learning_rate": 4.0351615346347804e-05, - "loss": 0.9561, + "loss": 0.9569, "num_input_tokens_seen": 55214080, "step": 6740 }, { "epoch": 0.3241296518607443, - "grad_norm": 0.5565239191055298, + "grad_norm": 0.5608055591583252, "learning_rate": 4.032032849176673e-05, - "loss": 0.9102, + "loss": 0.912, "num_input_tokens_seen": 55296000, "step": 6750 }, { "epoch": 0.324609843937575, - "grad_norm": 0.486905962228775, + "grad_norm": 0.524291455745697, "learning_rate": 4.0289003165488976e-05, - "loss": 0.8668, + "loss": 0.8669, "num_input_tokens_seen": 55377920, "step": 6760 }, { "epoch": 0.3250900360144058, - "grad_norm": 0.5659222602844238, + "grad_norm": 0.570642352104187, "learning_rate": 4.025763944617727e-05, - "loss": 0.9355, + "loss": 0.9358, "num_input_tokens_seen": 55459840, "step": 6770 }, { "epoch": 0.3255702280912365, - "grad_norm": 0.4450863301753998, + "grad_norm": 0.4495159387588501, "learning_rate": 4.0226237412590696e-05, - "loss": 0.9062, + "loss": 0.9055, "num_input_tokens_seen": 55541760, "step": 6780 }, { "epoch": 0.32605042016806723, - "grad_norm": 0.4924563467502594, + "grad_norm": 0.414616197347641, "learning_rate": 4.019479714358461e-05, - "loss": 0.8195, + "loss": 0.8201, "num_input_tokens_seen": 55623680, "step": 6790 }, { "epoch": 0.32653061224489793, - "grad_norm": 0.5010157227516174, + "grad_norm": 0.49962684512138367, "learning_rate": 4.016331871811033e-05, - "loss": 0.7515, + "loss": 0.7518, "num_input_tokens_seen": 55705600, "step": 6800 }, { "epoch": 0.3270108043217287, - "grad_norm": 0.49570614099502563, + "grad_norm": 0.4985042214393616, "learning_rate": 4.0131802215215025e-05, - "loss": 1.0522, + "loss": 1.0517, "num_input_tokens_seen": 55787520, "step": 6810 }, { "epoch": 0.32749099639855944, - "grad_norm": 0.7191053628921509, + "grad_norm": 0.7347798347473145, "learning_rate": 4.010024771404147e-05, - "loss": 0.9683, + "loss": 0.9678, "num_input_tokens_seen": 55869440, "step": 6820 }, { "epoch": 0.32797118847539014, - "grad_norm": 0.5185818672180176, + "grad_norm": 0.5248961448669434, "learning_rate": 4.006865529382787e-05, - "loss": 0.8884, + "loss": 0.8882, "num_input_tokens_seen": 55951360, "step": 6830 }, { "epoch": 0.3284513805522209, - "grad_norm": 0.6259596347808838, + "grad_norm": 0.6377902030944824, "learning_rate": 4.0037025033907635e-05, - "loss": 0.9416, + "loss": 0.9419, "num_input_tokens_seen": 56033280, "step": 6840 }, { "epoch": 0.3289315726290516, - "grad_norm": 0.5207152962684631, + "grad_norm": 0.5339779853820801, "learning_rate": 4.000535701370921e-05, - "loss": 0.9646, + "loss": 0.9656, "num_input_tokens_seen": 56115200, "step": 6850 }, { "epoch": 0.32941176470588235, - "grad_norm": 0.6173118948936462, + "grad_norm": 0.624208390712738, "learning_rate": 3.997365131275584e-05, - "loss": 0.9449, + "loss": 0.9448, "num_input_tokens_seen": 56197120, "step": 6860 }, { "epoch": 0.3298919567827131, - "grad_norm": 0.5023009777069092, + "grad_norm": 0.5037301778793335, "learning_rate": 3.994190801066542e-05, - "loss": 1.1115, + "loss": 1.1117, "num_input_tokens_seen": 56279040, "step": 6870 }, { "epoch": 0.3303721488595438, - "grad_norm": 0.7239778637886047, + "grad_norm": 0.7410263419151306, "learning_rate": 3.9910127187150246e-05, - "loss": 0.996, + "loss": 0.9973, "num_input_tokens_seen": 56360960, "step": 6880 }, { "epoch": 0.33085234093637456, - "grad_norm": 0.45797932147979736, + "grad_norm": 0.4599323868751526, "learning_rate": 3.9878308922016846e-05, - "loss": 0.887, + "loss": 0.8873, "num_input_tokens_seen": 56442880, "step": 6890 }, { "epoch": 0.33133253301320525, - "grad_norm": 0.594767689704895, + "grad_norm": 0.6016721129417419, "learning_rate": 3.984645329516578e-05, - "loss": 0.803, + "loss": 0.8034, "num_input_tokens_seen": 56524800, "step": 6900 }, { "epoch": 0.331812725090036, - "grad_norm": 0.502420961856842, + "grad_norm": 0.5103434324264526, "learning_rate": 3.98145603865914e-05, - "loss": 0.9339, + "loss": 0.9337, "num_input_tokens_seen": 56606720, "step": 6910 }, { "epoch": 0.33229291716686676, - "grad_norm": 0.4855603873729706, + "grad_norm": 0.4790404438972473, "learning_rate": 3.978263027638171e-05, - "loss": 1.0942, + "loss": 1.0943, "num_input_tokens_seen": 56688640, "step": 6920 }, { "epoch": 0.33277310924369746, - "grad_norm": 0.561661422252655, + "grad_norm": 0.5801796913146973, "learning_rate": 3.975066304471811e-05, - "loss": 0.9728, + "loss": 0.9727, "num_input_tokens_seen": 56770560, "step": 6930 }, { "epoch": 0.3332533013205282, - "grad_norm": 0.5083943009376526, + "grad_norm": 0.506883442401886, "learning_rate": 3.971865877187523e-05, - "loss": 0.8131, + "loss": 0.8136, "num_input_tokens_seen": 56852480, "step": 6940 }, { "epoch": 0.33373349339735897, - "grad_norm": 0.5498092770576477, + "grad_norm": 0.5405831336975098, "learning_rate": 3.968661753822071e-05, - "loss": 0.8142, + "loss": 0.8136, "num_input_tokens_seen": 56934400, "step": 6950 }, { "epoch": 0.33421368547418967, - "grad_norm": 0.5083803534507751, + "grad_norm": 0.5131149888038635, "learning_rate": 3.9654539424214996e-05, - "loss": 0.9086, + "loss": 0.9095, "num_input_tokens_seen": 57016320, "step": 6960 }, { "epoch": 0.3346938775510204, - "grad_norm": 0.4920550286769867, + "grad_norm": 0.4872545897960663, "learning_rate": 3.962242451041118e-05, - "loss": 0.8183, + "loss": 0.8184, "num_input_tokens_seen": 57098240, "step": 6970 }, { "epoch": 0.3351740696278511, - "grad_norm": 0.5124842524528503, + "grad_norm": 0.5294336080551147, "learning_rate": 3.9590272877454714e-05, - "loss": 0.9817, + "loss": 0.9821, "num_input_tokens_seen": 57180160, "step": 6980 }, { "epoch": 0.3356542617046819, - "grad_norm": 0.4871028959751129, + "grad_norm": 0.500049352645874, "learning_rate": 3.955808460608331e-05, - "loss": 1.0946, + "loss": 1.0956, "num_input_tokens_seen": 57262080, "step": 6990 }, { "epoch": 0.33613445378151263, - "grad_norm": 0.49150577187538147, + "grad_norm": 0.5052493810653687, "learning_rate": 3.952585977712664e-05, - "loss": 1.0473, + "loss": 1.0477, "num_input_tokens_seen": 57344000, "step": 7000 }, { "epoch": 0.33661464585834333, - "grad_norm": 0.5065931677818298, + "grad_norm": 0.49708953499794006, "learning_rate": 3.94935984715062e-05, - "loss": 0.9402, + "loss": 0.9404, "num_input_tokens_seen": 57425920, "step": 7010 }, { "epoch": 0.3370948379351741, - "grad_norm": 0.5077062249183655, + "grad_norm": 0.5013074278831482, "learning_rate": 3.9461300770235093e-05, - "loss": 0.678, + "loss": 0.6775, "num_input_tokens_seen": 57507840, "step": 7020 }, { "epoch": 0.3375750300120048, - "grad_norm": 0.4949451684951782, + "grad_norm": 0.4998069405555725, "learning_rate": 3.942896675441779e-05, - "loss": 0.9907, + "loss": 0.9909, "num_input_tokens_seen": 57589760, "step": 7030 }, { "epoch": 0.33805522208883554, - "grad_norm": 0.7275571823120117, + "grad_norm": 0.7399186491966248, "learning_rate": 3.939659650524997e-05, - "loss": 0.9791, + "loss": 0.9797, "num_input_tokens_seen": 57671680, "step": 7040 }, { "epoch": 0.3385354141656663, - "grad_norm": 0.4286993145942688, + "grad_norm": 0.4175257980823517, "learning_rate": 3.9364190104018307e-05, - "loss": 0.8087, + "loss": 0.8085, "num_input_tokens_seen": 57753600, "step": 7050 }, { "epoch": 0.339015606242497, - "grad_norm": 0.4871419668197632, + "grad_norm": 0.47881293296813965, "learning_rate": 3.933174763210024e-05, "loss": 0.9574, "num_input_tokens_seen": 57835520, @@ -5658,303 +5658,303 @@ }, { "epoch": 0.33949579831932775, - "grad_norm": 0.5074589252471924, + "grad_norm": 0.49739906191825867, "learning_rate": 3.9299269170963795e-05, - "loss": 0.866, + "loss": 0.8655, "num_input_tokens_seen": 57917440, "step": 7070 }, { "epoch": 0.33997599039615845, - "grad_norm": 0.5097850561141968, + "grad_norm": 0.5103303790092468, "learning_rate": 3.926675480216738e-05, - "loss": 0.9768, + "loss": 0.9769, "num_input_tokens_seen": 57999360, "step": 7080 }, { "epoch": 0.3404561824729892, - "grad_norm": 0.4989878833293915, + "grad_norm": 0.4884282350540161, "learning_rate": 3.923420460735957e-05, - "loss": 0.9659, + "loss": 0.9666, "num_input_tokens_seen": 58081280, "step": 7090 }, { "epoch": 0.34093637454981995, - "grad_norm": 0.4922942817211151, + "grad_norm": 0.4890122711658478, "learning_rate": 3.920161866827889e-05, - "loss": 0.7939, + "loss": 0.7937, "num_input_tokens_seen": 58163200, "step": 7100 }, { "epoch": 0.34141656662665065, - "grad_norm": 0.5052926540374756, + "grad_norm": 0.5014382600784302, "learning_rate": 3.916899706675365e-05, - "loss": 1.1513, + "loss": 1.1521, "num_input_tokens_seen": 58245120, "step": 7110 }, { "epoch": 0.3418967587034814, - "grad_norm": 0.5977290272712708, + "grad_norm": 0.6726990342140198, "learning_rate": 3.913633988470169e-05, - "loss": 1.0614, + "loss": 1.0604, "num_input_tokens_seen": 58327040, "step": 7120 }, { "epoch": 0.3423769507803121, - "grad_norm": 0.45560044050216675, + "grad_norm": 0.4604361653327942, "learning_rate": 3.91036472041302e-05, - "loss": 1.0288, + "loss": 1.0293, "num_input_tokens_seen": 58408960, "step": 7130 }, { "epoch": 0.34285714285714286, - "grad_norm": 0.5187392234802246, + "grad_norm": 0.5115752816200256, "learning_rate": 3.907091910713553e-05, - "loss": 0.9384, + "loss": 0.9386, "num_input_tokens_seen": 58490880, "step": 7140 }, { "epoch": 0.3433373349339736, - "grad_norm": 0.49635642766952515, + "grad_norm": 0.5059329867362976, "learning_rate": 3.9038155675902956e-05, - "loss": 0.9873, + "loss": 0.9874, "num_input_tokens_seen": 58572800, "step": 7150 }, { "epoch": 0.3438175270108043, - "grad_norm": 0.47836464643478394, + "grad_norm": 0.47620999813079834, "learning_rate": 3.900535699270647e-05, - "loss": 0.9565, + "loss": 0.9545, "num_input_tokens_seen": 58654720, "step": 7160 }, { "epoch": 0.34429771908763507, - "grad_norm": 0.49842721223831177, + "grad_norm": 0.49813467264175415, "learning_rate": 3.8972523139908616e-05, - "loss": 0.9251, + "loss": 0.9244, "num_input_tokens_seen": 58736640, "step": 7170 }, { "epoch": 0.34477791116446577, - "grad_norm": 0.5168570876121521, + "grad_norm": 0.5196046829223633, "learning_rate": 3.8939654199960244e-05, - "loss": 0.9024, + "loss": 0.9025, "num_input_tokens_seen": 58818560, "step": 7180 }, { "epoch": 0.3452581032412965, - "grad_norm": 0.508305549621582, + "grad_norm": 0.5014814734458923, "learning_rate": 3.890675025540028e-05, - "loss": 0.8464, + "loss": 0.8459, "num_input_tokens_seen": 58900480, "step": 7190 }, { "epoch": 0.3457382953181273, - "grad_norm": 0.5206389427185059, + "grad_norm": 0.5205827355384827, "learning_rate": 3.8873811388855605e-05, - "loss": 0.909, + "loss": 0.9075, "num_input_tokens_seen": 58982400, "step": 7200 }, { "epoch": 0.346218487394958, - "grad_norm": 0.5759819149971008, + "grad_norm": 0.555486798286438, "learning_rate": 3.8840837683040766e-05, - "loss": 1.0237, + "loss": 1.0231, "num_input_tokens_seen": 59064320, "step": 7210 }, { "epoch": 0.34669867947178873, - "grad_norm": 0.5078734755516052, + "grad_norm": 0.5133519768714905, "learning_rate": 3.880782922075778e-05, - "loss": 0.8262, + "loss": 0.8264, "num_input_tokens_seen": 59146240, "step": 7220 }, { "epoch": 0.3471788715486194, - "grad_norm": 0.9097881317138672, + "grad_norm": 0.9706603288650513, "learning_rate": 3.8774786084896e-05, - "loss": 0.9193, + "loss": 0.9188, "num_input_tokens_seen": 59228160, "step": 7230 }, { "epoch": 0.3476590636254502, - "grad_norm": 0.4915110468864441, + "grad_norm": 0.5008965730667114, "learning_rate": 3.8741708358431774e-05, - "loss": 0.8414, + "loss": 0.841, "num_input_tokens_seen": 59310080, "step": 7240 }, { "epoch": 0.34813925570228094, - "grad_norm": 0.4847305119037628, + "grad_norm": 0.4682840406894684, "learning_rate": 3.870859612442837e-05, - "loss": 1.138, + "loss": 1.1382, "num_input_tokens_seen": 59392000, "step": 7250 }, { "epoch": 0.34861944777911164, - "grad_norm": 0.5575107336044312, + "grad_norm": 0.5504831075668335, "learning_rate": 3.86754494660357e-05, - "loss": 0.8703, + "loss": 0.8716, "num_input_tokens_seen": 59473920, "step": 7260 }, { "epoch": 0.3490996398559424, - "grad_norm": 0.5898226499557495, + "grad_norm": 0.5717173218727112, "learning_rate": 3.864226846649008e-05, - "loss": 0.8408, + "loss": 0.8413, "num_input_tokens_seen": 59555840, "step": 7270 }, { "epoch": 0.3495798319327731, - "grad_norm": 0.5211881995201111, + "grad_norm": 0.5430110692977905, "learning_rate": 3.860905320911413e-05, - "loss": 0.9104, + "loss": 0.9106, "num_input_tokens_seen": 59637760, "step": 7280 }, { "epoch": 0.35006002400960384, - "grad_norm": 0.5230681896209717, + "grad_norm": 0.5178146958351135, "learning_rate": 3.857580377731644e-05, - "loss": 0.7804, + "loss": 0.7808, "num_input_tokens_seen": 59719680, "step": 7290 }, { "epoch": 0.3505402160864346, - "grad_norm": 0.5010928511619568, + "grad_norm": 0.5012259483337402, "learning_rate": 3.854252025459144e-05, - "loss": 0.868, + "loss": 0.8681, "num_input_tokens_seen": 59801600, "step": 7300 }, { "epoch": 0.3510204081632653, - "grad_norm": 0.4810640811920166, + "grad_norm": 0.5083165764808655, "learning_rate": 3.8509202724519165e-05, - "loss": 0.9105, + "loss": 0.9087, "num_input_tokens_seen": 59883520, "step": 7310 }, { "epoch": 0.35150060024009605, - "grad_norm": 0.5282043814659119, + "grad_norm": 0.5158397555351257, "learning_rate": 3.8475851270765054e-05, - "loss": 0.8274, + "loss": 0.8279, "num_input_tokens_seen": 59965440, "step": 7320 }, { "epoch": 0.35198079231692675, - "grad_norm": 0.5089632868766785, + "grad_norm": 0.5039446949958801, "learning_rate": 3.844246597707972e-05, - "loss": 0.9016, + "loss": 0.9013, "num_input_tokens_seen": 60047360, "step": 7330 }, { "epoch": 0.3524609843937575, - "grad_norm": 0.4965154230594635, + "grad_norm": 0.5005216598510742, "learning_rate": 3.8409046927298755e-05, - "loss": 0.8791, + "loss": 0.879, "num_input_tokens_seen": 60129280, "step": 7340 }, { "epoch": 0.35294117647058826, - "grad_norm": 0.5418551564216614, + "grad_norm": 0.5384024977684021, "learning_rate": 3.8375594205342534e-05, - "loss": 1.1453, + "loss": 1.1444, "num_input_tokens_seen": 60211200, "step": 7350 }, { "epoch": 0.35342136854741896, - "grad_norm": 0.5690187215805054, + "grad_norm": 0.5492764711380005, "learning_rate": 3.834210789521598e-05, - "loss": 1.0104, + "loss": 1.0091, "num_input_tokens_seen": 60293120, "step": 7360 }, { "epoch": 0.3539015606242497, - "grad_norm": 0.5610761046409607, + "grad_norm": 0.5844592452049255, "learning_rate": 3.830858808100834e-05, - "loss": 1.0196, + "loss": 1.0194, "num_input_tokens_seen": 60375040, "step": 7370 }, { "epoch": 0.3543817527010804, - "grad_norm": 0.501844584941864, + "grad_norm": 0.49856579303741455, "learning_rate": 3.8275034846893046e-05, - "loss": 0.8126, + "loss": 0.8125, "num_input_tokens_seen": 60456960, "step": 7380 }, { "epoch": 0.35486194477791116, - "grad_norm": 0.5217064023017883, + "grad_norm": 0.5153807997703552, "learning_rate": 3.824144827712738e-05, - "loss": 1.3003, + "loss": 1.3012, "num_input_tokens_seen": 60538880, "step": 7390 }, { "epoch": 0.3553421368547419, - "grad_norm": 0.5244282484054565, + "grad_norm": 0.5248746275901794, "learning_rate": 3.82078284560524e-05, - "loss": 1.0134, + "loss": 1.0136, "num_input_tokens_seen": 60620800, "step": 7400 }, { "epoch": 0.3558223289315726, - "grad_norm": 0.5655602216720581, + "grad_norm": 0.5390773415565491, "learning_rate": 3.817417546809263e-05, - "loss": 0.9036, + "loss": 0.9035, "num_input_tokens_seen": 60702720, "step": 7410 }, { "epoch": 0.3563025210084034, - "grad_norm": 0.5483632683753967, + "grad_norm": 0.5516919493675232, "learning_rate": 3.8140489397755886e-05, - "loss": 1.0055, + "loss": 1.0054, "num_input_tokens_seen": 60784640, "step": 7420 }, { "epoch": 0.35678271308523407, - "grad_norm": 0.5369768738746643, + "grad_norm": 0.5377995371818542, "learning_rate": 3.810677032963307e-05, - "loss": 0.9698, + "loss": 0.9705, "num_input_tokens_seen": 60866560, "step": 7430 }, { "epoch": 0.3572629051620648, - "grad_norm": 0.4874258041381836, + "grad_norm": 0.5012087821960449, "learning_rate": 3.807301834839793e-05, "loss": 0.8928, "num_input_tokens_seen": 60948480, @@ -5962,23 +5962,23 @@ }, { "epoch": 0.3577430972388956, - "grad_norm": 0.4989224076271057, + "grad_norm": 0.5082871913909912, "learning_rate": 3.803923353880687e-05, - "loss": 0.7671, + "loss": 0.7673, "num_input_tokens_seen": 61030400, "step": 7450 }, { "epoch": 0.3582232893157263, - "grad_norm": 0.5844639539718628, + "grad_norm": 0.5730777382850647, "learning_rate": 3.8005415985698754e-05, - "loss": 0.9805, + "loss": 0.9807, "num_input_tokens_seen": 61112320, "step": 7460 }, { "epoch": 0.35870348139255703, - "grad_norm": 0.5072601437568665, + "grad_norm": 0.4978314936161041, "learning_rate": 3.797156577399462e-05, "loss": 0.9943, "num_input_tokens_seen": 61194240, @@ -5986,127 +5986,127 @@ }, { "epoch": 0.35918367346938773, - "grad_norm": 0.5117142200469971, + "grad_norm": 0.5063808560371399, "learning_rate": 3.7937682988697566e-05, - "loss": 0.8656, + "loss": 0.8648, "num_input_tokens_seen": 61276160, "step": 7480 }, { "epoch": 0.3596638655462185, - "grad_norm": 0.5001174211502075, + "grad_norm": 0.5042293667793274, "learning_rate": 3.790376771489247e-05, - "loss": 1.0121, + "loss": 1.0123, "num_input_tokens_seen": 61358080, "step": 7490 }, { "epoch": 0.36014405762304924, - "grad_norm": 0.5205296277999878, + "grad_norm": 0.5322972536087036, "learning_rate": 3.7869820037745776e-05, - "loss": 0.8441, + "loss": 0.8446, "num_input_tokens_seen": 61440000, "step": 7500 }, { "epoch": 0.36062424969987994, - "grad_norm": 0.5183282494544983, + "grad_norm": 0.5157181024551392, "learning_rate": 3.783584004250531e-05, - "loss": 0.7604, + "loss": 0.7602, "num_input_tokens_seen": 61521920, "step": 7510 }, { "epoch": 0.3611044417767107, - "grad_norm": 0.3707723319530487, + "grad_norm": 0.3682163655757904, "learning_rate": 3.7801827814500074e-05, - "loss": 0.8183, + "loss": 0.8187, "num_input_tokens_seen": 61603840, "step": 7520 }, { "epoch": 0.3615846338535414, - "grad_norm": 0.5028261542320251, + "grad_norm": 0.49639391899108887, "learning_rate": 3.7767783439139984e-05, - "loss": 0.8522, + "loss": 0.8516, "num_input_tokens_seen": 61685760, "step": 7530 }, { "epoch": 0.36206482593037215, - "grad_norm": 0.5016160011291504, + "grad_norm": 0.48996445536613464, "learning_rate": 3.77337070019157e-05, - "loss": 0.8296, + "loss": 0.8289, "num_input_tokens_seen": 61767680, "step": 7540 }, { "epoch": 0.3625450180072029, - "grad_norm": 0.6660007238388062, + "grad_norm": 0.6653403043746948, "learning_rate": 3.7699598588398364e-05, - "loss": 0.9405, + "loss": 0.9404, "num_input_tokens_seen": 61849600, "step": 7550 }, { "epoch": 0.3630252100840336, - "grad_norm": 0.4924418032169342, + "grad_norm": 0.48587802052497864, "learning_rate": 3.766545828423946e-05, - "loss": 0.9701, + "loss": 0.9699, "num_input_tokens_seen": 61931520, "step": 7560 }, { "epoch": 0.36350540216086435, - "grad_norm": 0.5335708260536194, + "grad_norm": 0.5049532055854797, "learning_rate": 3.7631286175170535e-05, - "loss": 0.8598, + "loss": 0.8596, "num_input_tokens_seen": 62013440, "step": 7570 }, { "epoch": 0.36398559423769505, - "grad_norm": 0.4859420955181122, + "grad_norm": 0.4853578507900238, "learning_rate": 3.7597082347003e-05, - "loss": 1.0999, + "loss": 1.0988, "num_input_tokens_seen": 62095360, "step": 7580 }, { "epoch": 0.3644657863145258, - "grad_norm": 0.4671091139316559, + "grad_norm": 0.47532176971435547, "learning_rate": 3.75628468856279e-05, - "loss": 0.8699, + "loss": 0.8688, "num_input_tokens_seen": 62177280, "step": 7590 }, { "epoch": 0.36494597839135656, - "grad_norm": 0.5766321420669556, + "grad_norm": 0.6255443692207336, "learning_rate": 3.7528579877015746e-05, - "loss": 0.8688, + "loss": 0.8713, "num_input_tokens_seen": 62259200, "step": 7600 }, { "epoch": 0.36542617046818726, - "grad_norm": 0.8806756138801575, + "grad_norm": 1.008660912513733, "learning_rate": 3.749428140721626e-05, - "loss": 1.0697, + "loss": 1.0696, "num_input_tokens_seen": 62341120, "step": 7610 }, { "epoch": 0.365906362545018, - "grad_norm": 0.5065363049507141, + "grad_norm": 0.5324845910072327, "learning_rate": 3.745995156235815e-05, - "loss": 0.8349, + "loss": 0.8351, "num_input_tokens_seen": 62423040, "step": 7620 }, { "epoch": 0.3663865546218487, - "grad_norm": 0.5004618763923645, + "grad_norm": 0.5093852281570435, "learning_rate": 3.742559042864895e-05, "loss": 0.9453, "num_input_tokens_seen": 62504960, @@ -6114,175 +6114,175 @@ }, { "epoch": 0.36686674669867947, - "grad_norm": 0.46816208958625793, + "grad_norm": 0.4843917787075043, "learning_rate": 3.7391198092374726e-05, - "loss": 1.0237, + "loss": 1.0241, "num_input_tokens_seen": 62586880, "step": 7640 }, { "epoch": 0.3673469387755102, - "grad_norm": 0.5166922211647034, + "grad_norm": 0.5122877955436707, "learning_rate": 3.7356774639899914e-05, - "loss": 0.8499, + "loss": 0.8502, "num_input_tokens_seen": 62668800, "step": 7650 }, { "epoch": 0.3678271308523409, - "grad_norm": 0.5414100289344788, + "grad_norm": 0.525092363357544, "learning_rate": 3.7322320157667094e-05, - "loss": 0.8653, + "loss": 0.8658, "num_input_tokens_seen": 62750720, "step": 7660 }, { "epoch": 0.3683073229291717, - "grad_norm": 0.4967511296272278, + "grad_norm": 0.5073078274726868, "learning_rate": 3.728783473219676e-05, - "loss": 0.7981, + "loss": 0.7977, "num_input_tokens_seen": 62832640, "step": 7670 }, { "epoch": 0.3687875150060024, - "grad_norm": 0.5025008320808411, + "grad_norm": 0.49645790457725525, "learning_rate": 3.72533184500871e-05, - "loss": 0.7632, + "loss": 0.7638, "num_input_tokens_seen": 62914560, "step": 7680 }, { "epoch": 0.36926770708283313, - "grad_norm": 0.4785637855529785, + "grad_norm": 0.4900324046611786, "learning_rate": 3.7218771398013807e-05, - "loss": 0.9014, + "loss": 0.9013, "num_input_tokens_seen": 62996480, "step": 7690 }, { "epoch": 0.3697478991596639, - "grad_norm": 1.0697089433670044, + "grad_norm": 1.146216630935669, "learning_rate": 3.718419366272982e-05, - "loss": 0.828, + "loss": 0.8292, "num_input_tokens_seen": 63078400, "step": 7700 }, { "epoch": 0.3702280912364946, - "grad_norm": 0.4909783899784088, + "grad_norm": 0.4886285662651062, "learning_rate": 3.714958533106515e-05, - "loss": 0.7715, + "loss": 0.7722, "num_input_tokens_seen": 63160320, "step": 7710 }, { "epoch": 0.37070828331332534, - "grad_norm": 0.5369053483009338, + "grad_norm": 0.5365391969680786, "learning_rate": 3.7114946489926633e-05, - "loss": 0.9035, + "loss": 0.9032, "num_input_tokens_seen": 63242240, "step": 7720 }, { "epoch": 0.37118847539015604, - "grad_norm": 0.4767155647277832, + "grad_norm": 0.47952884435653687, "learning_rate": 3.708027722629772e-05, - "loss": 1.0931, + "loss": 1.0929, "num_input_tokens_seen": 63324160, "step": 7730 }, { "epoch": 0.3716686674669868, - "grad_norm": 0.4803867042064667, + "grad_norm": 0.4813827574253082, "learning_rate": 3.704557762723823e-05, - "loss": 0.8443, + "loss": 0.8444, "num_input_tokens_seen": 63406080, "step": 7740 }, { "epoch": 0.37214885954381755, - "grad_norm": 0.5255789756774902, + "grad_norm": 0.521245539188385, "learning_rate": 3.7010847779884204e-05, - "loss": 0.9003, + "loss": 0.8989, "num_input_tokens_seen": 63488000, "step": 7750 }, { "epoch": 0.37262905162064824, - "grad_norm": 0.8208502531051636, + "grad_norm": 0.8104525208473206, "learning_rate": 3.697608777144762e-05, - "loss": 0.9216, + "loss": 0.9214, "num_input_tokens_seen": 63569920, "step": 7760 }, { "epoch": 0.373109243697479, - "grad_norm": 0.5180743932723999, + "grad_norm": 0.5063729882240295, "learning_rate": 3.694129768921619e-05, - "loss": 0.966, + "loss": 0.9651, "num_input_tokens_seen": 63651840, "step": 7770 }, { "epoch": 0.3735894357743097, - "grad_norm": 0.5045250058174133, + "grad_norm": 0.4979282319545746, "learning_rate": 3.6906477620553156e-05, - "loss": 0.947, + "loss": 0.946, "num_input_tokens_seen": 63733760, "step": 7780 }, { "epoch": 0.37406962785114045, - "grad_norm": 0.4700779616832733, + "grad_norm": 0.4596295952796936, "learning_rate": 3.687162765289704e-05, - "loss": 0.9151, + "loss": 0.9149, "num_input_tokens_seen": 63815680, "step": 7790 }, { "epoch": 0.3745498199279712, - "grad_norm": 0.5987953543663025, + "grad_norm": 0.5804716348648071, "learning_rate": 3.683674787376148e-05, - "loss": 0.8509, + "loss": 0.8513, "num_input_tokens_seen": 63897600, "step": 7800 }, { "epoch": 0.3750300120048019, - "grad_norm": 0.4968700408935547, + "grad_norm": 0.4932875633239746, "learning_rate": 3.6801838370734945e-05, - "loss": 0.7108, + "loss": 0.7109, "num_input_tokens_seen": 63979520, "step": 7810 }, { "epoch": 0.37551020408163266, - "grad_norm": 0.4701189398765564, + "grad_norm": 0.4756897985935211, "learning_rate": 3.676689923148056e-05, - "loss": 0.8534, + "loss": 0.8537, "num_input_tokens_seen": 64061440, "step": 7820 }, { "epoch": 0.37599039615846336, - "grad_norm": 0.5217798948287964, + "grad_norm": 0.5182291865348816, "learning_rate": 3.673193054373587e-05, - "loss": 0.9249, + "loss": 0.9253, "num_input_tokens_seen": 64143360, "step": 7830 }, { "epoch": 0.3764705882352941, - "grad_norm": 0.6249185800552368, + "grad_norm": 0.610496461391449, "learning_rate": 3.6696932395312606e-05, - "loss": 0.945, + "loss": 0.9448, "num_input_tokens_seen": 64225280, "step": 7840 }, { "epoch": 0.37695078031212487, - "grad_norm": 0.48508864641189575, + "grad_norm": 0.4692385792732239, "learning_rate": 3.6661904874096503e-05, "loss": 0.8673, "num_input_tokens_seen": 64307200, @@ -6290,39 +6290,39 @@ }, { "epoch": 0.37743097238895557, - "grad_norm": 0.5447950959205627, + "grad_norm": 0.5275558233261108, "learning_rate": 3.662684806804704e-05, - "loss": 0.817, + "loss": 0.8175, "num_input_tokens_seen": 64389120, "step": 7860 }, { "epoch": 0.3779111644657863, - "grad_norm": 0.48486828804016113, + "grad_norm": 0.4853339195251465, "learning_rate": 3.659176206519724e-05, - "loss": 0.9525, + "loss": 0.9513, "num_input_tokens_seen": 64471040, "step": 7870 }, { "epoch": 0.3783913565426171, - "grad_norm": 0.4893752932548523, + "grad_norm": 0.4804559051990509, "learning_rate": 3.655664695365344e-05, - "loss": 0.8771, + "loss": 0.8763, "num_input_tokens_seen": 64552960, "step": 7880 }, { "epoch": 0.3788715486194478, - "grad_norm": 0.490164190530777, + "grad_norm": 0.4826927185058594, "learning_rate": 3.652150282159507e-05, - "loss": 0.895, + "loss": 0.8944, "num_input_tokens_seen": 64634880, "step": 7890 }, { "epoch": 0.3793517406962785, - "grad_norm": 0.5217410326004028, + "grad_norm": 0.5171544551849365, "learning_rate": 3.6486329757274454e-05, "loss": 0.8366, "num_input_tokens_seen": 64716800, @@ -6330,311 +6330,311 @@ }, { "epoch": 0.3798319327731092, - "grad_norm": 0.49266862869262695, + "grad_norm": 0.49717050790786743, "learning_rate": 3.645112784901655e-05, - "loss": 0.9177, + "loss": 0.9178, "num_input_tokens_seen": 64798720, "step": 7910 }, { "epoch": 0.38031212484994, - "grad_norm": 0.47938328981399536, + "grad_norm": 0.47939014434814453, "learning_rate": 3.641589718521875e-05, - "loss": 0.8592, + "loss": 0.8597, "num_input_tokens_seen": 64880640, "step": 7920 }, { "epoch": 0.38079231692677074, - "grad_norm": 1.3006699085235596, + "grad_norm": 1.3471354246139526, "learning_rate": 3.6380637854350665e-05, - "loss": 0.9027, + "loss": 0.9038, "num_input_tokens_seen": 64962560, "step": 7930 }, { "epoch": 0.38127250900360143, - "grad_norm": 0.6962760090827942, + "grad_norm": 0.6851658821105957, "learning_rate": 3.634534994495387e-05, - "loss": 0.8225, + "loss": 0.8226, "num_input_tokens_seen": 65044480, "step": 7940 }, { "epoch": 0.3817527010804322, - "grad_norm": 0.5089699625968933, + "grad_norm": 0.5071682333946228, "learning_rate": 3.631003354564175e-05, - "loss": 0.8546, + "loss": 0.8539, "num_input_tokens_seen": 65126400, "step": 7950 }, { "epoch": 0.3822328931572629, - "grad_norm": 0.48060354590415955, + "grad_norm": 0.48892542719841003, "learning_rate": 3.6274688745099194e-05, - "loss": 1.0001, + "loss": 1.0, "num_input_tokens_seen": 65208320, "step": 7960 }, { "epoch": 0.38271308523409364, - "grad_norm": 0.5190022587776184, + "grad_norm": 0.5070081949234009, "learning_rate": 3.623931563208241e-05, - "loss": 0.9504, + "loss": 0.9514, "num_input_tokens_seen": 65290240, "step": 7970 }, { "epoch": 0.3831932773109244, - "grad_norm": 0.48244509100914, + "grad_norm": 0.4804380536079407, "learning_rate": 3.620391429541873e-05, - "loss": 0.8261, + "loss": 0.8269, "num_input_tokens_seen": 65372160, "step": 7980 }, { "epoch": 0.3836734693877551, - "grad_norm": 0.5255333185195923, + "grad_norm": 0.5210065841674805, "learning_rate": 3.616848482400634e-05, - "loss": 0.9785, + "loss": 0.9784, "num_input_tokens_seen": 65454080, "step": 7990 }, { "epoch": 0.38415366146458585, - "grad_norm": 0.507675290107727, + "grad_norm": 0.5196698307991028, "learning_rate": 3.6133027306814085e-05, - "loss": 0.9941, + "loss": 0.994, "num_input_tokens_seen": 65536000, "step": 8000 }, { "epoch": 0.38463385354141655, - "grad_norm": 0.5132362842559814, + "grad_norm": 0.5133892893791199, "learning_rate": 3.609754183288122e-05, - "loss": 1.0458, + "loss": 1.046, "num_input_tokens_seen": 65617920, "step": 8010 }, { "epoch": 0.3851140456182473, - "grad_norm": 0.5268590450286865, + "grad_norm": 0.5304073691368103, "learning_rate": 3.606202849131723e-05, - "loss": 0.9985, + "loss": 0.9983, "num_input_tokens_seen": 65699840, "step": 8020 }, { "epoch": 0.38559423769507806, - "grad_norm": 0.4958067834377289, + "grad_norm": 0.48315533995628357, "learning_rate": 3.6026487371301564e-05, - "loss": 0.9213, + "loss": 0.9211, "num_input_tokens_seen": 65781760, "step": 8030 }, { "epoch": 0.38607442977190876, - "grad_norm": 0.5100277662277222, + "grad_norm": 0.5015775561332703, "learning_rate": 3.599091856208343e-05, - "loss": 0.8843, + "loss": 0.8849, "num_input_tokens_seen": 65863680, "step": 8040 }, { "epoch": 0.3865546218487395, - "grad_norm": 0.49061518907546997, + "grad_norm": 0.48933330178260803, "learning_rate": 3.5955322152981575e-05, - "loss": 0.9195, + "loss": 0.9194, "num_input_tokens_seen": 65945600, "step": 8050 }, { "epoch": 0.3870348139255702, - "grad_norm": 0.531670093536377, + "grad_norm": 0.5391829013824463, "learning_rate": 3.5919698233384034e-05, - "loss": 1.0418, + "loss": 1.0414, "num_input_tokens_seen": 66027520, "step": 8060 }, { "epoch": 0.38751500600240096, - "grad_norm": 0.5200238823890686, + "grad_norm": 0.5336183309555054, "learning_rate": 3.588404689274795e-05, - "loss": 0.8457, + "loss": 0.8462, "num_input_tokens_seen": 66109440, "step": 8070 }, { "epoch": 0.3879951980792317, - "grad_norm": 0.5855520963668823, + "grad_norm": 0.5864033699035645, "learning_rate": 3.58483682205993e-05, - "loss": 0.7761, + "loss": 0.7765, "num_input_tokens_seen": 66191360, "step": 8080 }, { "epoch": 0.3884753901560624, - "grad_norm": 0.5065504908561707, + "grad_norm": 0.5033870339393616, "learning_rate": 3.581266230653271e-05, - "loss": 0.86, + "loss": 0.8597, "num_input_tokens_seen": 66273280, "step": 8090 }, { "epoch": 0.38895558223289317, - "grad_norm": 0.5135564804077148, + "grad_norm": 0.5161807537078857, "learning_rate": 3.5776929240211224e-05, - "loss": 0.8299, + "loss": 0.8295, "num_input_tokens_seen": 66355200, "step": 8100 }, { "epoch": 0.38943577430972387, - "grad_norm": 0.5159543752670288, + "grad_norm": 0.5269225239753723, "learning_rate": 3.5741169111366047e-05, - "loss": 1.0024, + "loss": 1.0028, "num_input_tokens_seen": 66437120, "step": 8110 }, { "epoch": 0.3899159663865546, - "grad_norm": 0.659740149974823, + "grad_norm": 0.5889651775360107, "learning_rate": 3.570538200979635e-05, - "loss": 1.0171, + "loss": 1.0164, "num_input_tokens_seen": 66519040, "step": 8120 }, { "epoch": 0.3903961584633854, - "grad_norm": 0.6677471995353699, + "grad_norm": 0.6744776964187622, "learning_rate": 3.566956802536904e-05, - "loss": 0.9044, + "loss": 0.9047, "num_input_tokens_seen": 66600960, "step": 8130 }, { "epoch": 0.3908763505402161, - "grad_norm": 0.5268581509590149, + "grad_norm": 0.5208677053451538, "learning_rate": 3.5633727248018536e-05, - "loss": 0.9338, + "loss": 0.9341, "num_input_tokens_seen": 66682880, "step": 8140 }, { "epoch": 0.39135654261704683, - "grad_norm": 0.7781187891960144, + "grad_norm": 0.7580982446670532, "learning_rate": 3.5597859767746524e-05, - "loss": 0.8946, + "loss": 0.8941, "num_input_tokens_seen": 66764800, "step": 8150 }, { "epoch": 0.39183673469387753, - "grad_norm": 0.40642163157463074, + "grad_norm": 0.4247629642486572, "learning_rate": 3.556196567462175e-05, - "loss": 0.8835, + "loss": 0.8836, "num_input_tokens_seen": 66846720, "step": 8160 }, { "epoch": 0.3923169267707083, - "grad_norm": 0.5072639584541321, + "grad_norm": 0.49652138352394104, "learning_rate": 3.5526045058779805e-05, - "loss": 0.9559, + "loss": 0.9574, "num_input_tokens_seen": 66928640, "step": 8170 }, { "epoch": 0.39279711884753904, - "grad_norm": 0.5224815011024475, + "grad_norm": 0.5123445391654968, "learning_rate": 3.549009801042286e-05, - "loss": 0.8609, + "loss": 0.8612, "num_input_tokens_seen": 67010560, "step": 8180 }, { "epoch": 0.39327731092436974, - "grad_norm": 0.5318178534507751, + "grad_norm": 0.5425747632980347, "learning_rate": 3.545412461981947e-05, - "loss": 0.9237, + "loss": 0.9242, "num_input_tokens_seen": 67092480, "step": 8190 }, { "epoch": 0.3937575030012005, - "grad_norm": 0.5438302755355835, + "grad_norm": 0.49756717681884766, "learning_rate": 3.541812497730435e-05, - "loss": 0.992, + "loss": 0.9924, "num_input_tokens_seen": 67174400, "step": 8200 }, { "epoch": 0.3942376950780312, - "grad_norm": 0.48217347264289856, + "grad_norm": 0.48790714144706726, "learning_rate": 3.5382099173278125e-05, - "loss": 0.9067, + "loss": 0.9064, "num_input_tokens_seen": 67256320, "step": 8210 }, { "epoch": 0.39471788715486195, - "grad_norm": 0.6276324391365051, + "grad_norm": 0.6320737600326538, "learning_rate": 3.5346047298207116e-05, - "loss": 0.891, + "loss": 0.8907, "num_input_tokens_seen": 67338240, "step": 8220 }, { "epoch": 0.3951980792316927, - "grad_norm": 0.5074647665023804, + "grad_norm": 0.500048816204071, "learning_rate": 3.530996944262312e-05, - "loss": 0.7989, + "loss": 0.7992, "num_input_tokens_seen": 67420160, "step": 8230 }, { "epoch": 0.3956782713085234, - "grad_norm": 0.48775187134742737, + "grad_norm": 0.4895772337913513, "learning_rate": 3.5273865697123164e-05, - "loss": 0.8951, + "loss": 0.8946, "num_input_tokens_seen": 67502080, "step": 8240 }, { "epoch": 0.39615846338535415, - "grad_norm": 0.5741081237792969, + "grad_norm": 0.5737354159355164, "learning_rate": 3.52377361523693e-05, - "loss": 1.0857, + "loss": 1.0861, "num_input_tokens_seen": 67584000, "step": 8250 }, { "epoch": 0.39663865546218485, - "grad_norm": 0.5743753910064697, + "grad_norm": 0.5739513039588928, "learning_rate": 3.520158089908836e-05, - "loss": 1.0151, + "loss": 1.0155, "num_input_tokens_seen": 67665920, "step": 8260 }, { "epoch": 0.3971188475390156, - "grad_norm": 0.5247855186462402, + "grad_norm": 0.519338846206665, "learning_rate": 3.516540002807174e-05, - "loss": 0.765, + "loss": 0.7652, "num_input_tokens_seen": 67747840, "step": 8270 }, { "epoch": 0.39759903961584636, - "grad_norm": 0.5025756359100342, + "grad_norm": 0.505893349647522, "learning_rate": 3.512919363017516e-05, - "loss": 0.881, + "loss": 0.8809, "num_input_tokens_seen": 67829760, "step": 8280 }, { "epoch": 0.39807923169267706, - "grad_norm": 0.46168726682662964, + "grad_norm": 0.47453415393829346, "learning_rate": 3.509296179631843e-05, "loss": 0.8293, "num_input_tokens_seen": 67911680, @@ -6642,39 +6642,39 @@ }, { "epoch": 0.3985594237695078, - "grad_norm": 0.556755542755127, + "grad_norm": 0.545673131942749, "learning_rate": 3.505670461748527e-05, - "loss": 0.8953, + "loss": 0.8956, "num_input_tokens_seen": 67993600, "step": 8300 }, { "epoch": 0.3990396158463385, - "grad_norm": 0.5389664173126221, + "grad_norm": 0.5385898351669312, "learning_rate": 3.5020422184723e-05, - "loss": 1.19, + "loss": 1.1892, "num_input_tokens_seen": 68075520, "step": 8310 }, { "epoch": 0.39951980792316927, - "grad_norm": 0.5176492929458618, + "grad_norm": 0.5242102742195129, "learning_rate": 3.498411458914238e-05, - "loss": 0.8479, + "loss": 0.8476, "num_input_tokens_seen": 68157440, "step": 8320 }, { "epoch": 0.4, - "grad_norm": 0.5035036206245422, + "grad_norm": 0.5002279877662659, "learning_rate": 3.494778192191739e-05, - "loss": 0.8593, + "loss": 0.859, "num_input_tokens_seen": 68239360, "step": 8330 }, { "epoch": 0.4004801920768307, - "grad_norm": 0.49024298787117004, + "grad_norm": 0.4706880748271942, "learning_rate": 3.4911424274284886e-05, "loss": 0.771, "num_input_tokens_seen": 68321280, @@ -6682,47 +6682,47 @@ }, { "epoch": 0.4009603841536615, - "grad_norm": 0.4582843482494354, + "grad_norm": 0.4681709408760071, "learning_rate": 3.4875041737544526e-05, - "loss": 0.9203, + "loss": 0.9161, "num_input_tokens_seen": 68403200, "step": 8350 }, { "epoch": 0.4014405762304922, - "grad_norm": 0.5551020503044128, + "grad_norm": 0.5591146349906921, "learning_rate": 3.483863440305845e-05, - "loss": 0.8853, + "loss": 0.8845, "num_input_tokens_seen": 68485120, "step": 8360 }, { "epoch": 0.40192076830732293, - "grad_norm": 1.1716420650482178, + "grad_norm": 1.1396244764328003, "learning_rate": 3.480220236225106e-05, - "loss": 0.9757, + "loss": 0.9752, "num_input_tokens_seen": 68567040, "step": 8370 }, { "epoch": 0.4024009603841537, - "grad_norm": 0.4597180485725403, + "grad_norm": 0.46549364924430847, "learning_rate": 3.476574570660879e-05, - "loss": 0.8719, + "loss": 0.8717, "num_input_tokens_seen": 68648960, "step": 8380 }, { "epoch": 0.4028811524609844, - "grad_norm": 0.5064864754676819, + "grad_norm": 0.5010836720466614, "learning_rate": 3.472926452767992e-05, - "loss": 0.7306, + "loss": 0.7304, "num_input_tokens_seen": 68730880, "step": 8390 }, { "epoch": 0.40336134453781514, - "grad_norm": 0.4826078414916992, + "grad_norm": 0.4840982258319855, "learning_rate": 3.469275891707428e-05, "loss": 0.8629, "num_input_tokens_seen": 68812800, @@ -6730,79 +6730,79 @@ }, { "epoch": 0.40384153661464584, - "grad_norm": 0.4941999316215515, + "grad_norm": 0.5047776699066162, "learning_rate": 3.465622896646305e-05, - "loss": 0.8746, + "loss": 0.8738, "num_input_tokens_seen": 68894720, "step": 8410 }, { "epoch": 0.4043217286914766, - "grad_norm": 0.5197045207023621, + "grad_norm": 0.5110125541687012, "learning_rate": 3.461967476757857e-05, - "loss": 0.8477, + "loss": 0.8472, "num_input_tokens_seen": 68976640, "step": 8420 }, { "epoch": 0.40480192076830734, - "grad_norm": 0.508140504360199, + "grad_norm": 0.5032282471656799, "learning_rate": 3.4583096412214025e-05, - "loss": 1.0445, + "loss": 1.0451, "num_input_tokens_seen": 69058560, "step": 8430 }, { "epoch": 0.40528211284513804, - "grad_norm": 0.5302277207374573, + "grad_norm": 0.5277020931243896, "learning_rate": 3.454649399222328e-05, - "loss": 0.846, + "loss": 0.8458, "num_input_tokens_seen": 69140480, "step": 8440 }, { "epoch": 0.4057623049219688, - "grad_norm": 0.4965344965457916, + "grad_norm": 0.4992177486419678, "learning_rate": 3.450986759952064e-05, - "loss": 0.8504, + "loss": 0.8503, "num_input_tokens_seen": 69222400, "step": 8450 }, { "epoch": 0.4062424969987995, - "grad_norm": 0.6701764464378357, + "grad_norm": 0.6769206523895264, "learning_rate": 3.44732173260806e-05, - "loss": 0.8774, + "loss": 0.8767, "num_input_tokens_seen": 69304320, "step": 8460 }, { "epoch": 0.40672268907563025, - "grad_norm": 0.5118209719657898, + "grad_norm": 0.5169435143470764, "learning_rate": 3.4436543263937613e-05, - "loss": 0.9147, + "loss": 0.9149, "num_input_tokens_seen": 69386240, "step": 8470 }, { "epoch": 0.407202881152461, - "grad_norm": 0.5040360689163208, + "grad_norm": 0.4996021091938019, "learning_rate": 3.439984550518589e-05, - "loss": 0.8726, + "loss": 0.8728, "num_input_tokens_seen": 69468160, "step": 8480 }, { "epoch": 0.4076830732292917, - "grad_norm": 0.5248441696166992, + "grad_norm": 0.5270659327507019, "learning_rate": 3.436312414197913e-05, - "loss": 0.9237, + "loss": 0.9235, "num_input_tokens_seen": 69550080, "step": 8490 }, { "epoch": 0.40816326530612246, - "grad_norm": 0.47410309314727783, + "grad_norm": 0.4720569849014282, "learning_rate": 3.4326379266530314e-05, "loss": 1.0674, "num_input_tokens_seen": 69632000, @@ -6810,167 +6810,167 @@ }, { "epoch": 0.40864345738295316, - "grad_norm": 0.49306410551071167, + "grad_norm": 0.497772753238678, "learning_rate": 3.428961097111146e-05, - "loss": 0.8397, + "loss": 0.8399, "num_input_tokens_seen": 69713920, "step": 8510 }, { "epoch": 0.4091236494597839, - "grad_norm": 0.4915395677089691, + "grad_norm": 0.49216228723526, "learning_rate": 3.4252819348053424e-05, - "loss": 0.909, + "loss": 0.9089, "num_input_tokens_seen": 69795840, "step": 8520 }, { "epoch": 0.40960384153661467, - "grad_norm": 0.4925106167793274, + "grad_norm": 0.49164268374443054, "learning_rate": 3.421600448974559e-05, - "loss": 0.9302, + "loss": 0.9305, "num_input_tokens_seen": 69877760, "step": 8530 }, { "epoch": 0.41008403361344536, - "grad_norm": 0.49705174565315247, + "grad_norm": 0.49229446053504944, "learning_rate": 3.4179166488635736e-05, - "loss": 0.9918, + "loss": 0.9921, "num_input_tokens_seen": 69959680, "step": 8540 }, { "epoch": 0.4105642256902761, - "grad_norm": 0.49341046810150146, + "grad_norm": 0.484153151512146, "learning_rate": 3.414230543722973e-05, - "loss": 0.9864, + "loss": 0.9862, "num_input_tokens_seen": 70041600, "step": 8550 }, { "epoch": 0.4110444177671068, - "grad_norm": 0.5298373699188232, + "grad_norm": 0.5301636457443237, "learning_rate": 3.410542142809134e-05, - "loss": 0.872, + "loss": 0.8721, "num_input_tokens_seen": 70123520, "step": 8560 }, { "epoch": 0.41152460984393757, - "grad_norm": 0.48551279306411743, + "grad_norm": 0.484598308801651, "learning_rate": 3.4068514553841965e-05, - "loss": 1.0904, + "loss": 1.0909, "num_input_tokens_seen": 70205440, "step": 8570 }, { "epoch": 0.4120048019207683, - "grad_norm": 0.5113745331764221, + "grad_norm": 0.4904525578022003, "learning_rate": 3.403158490716043e-05, - "loss": 0.8943, + "loss": 0.8935, "num_input_tokens_seen": 70287360, "step": 8580 }, { "epoch": 0.412484993997599, - "grad_norm": 0.5106720924377441, + "grad_norm": 0.5105434060096741, "learning_rate": 3.3994632580782766e-05, - "loss": 0.8691, + "loss": 0.8689, "num_input_tokens_seen": 70369280, "step": 8590 }, { "epoch": 0.4129651860744298, - "grad_norm": 0.5008137822151184, + "grad_norm": 0.5011836886405945, "learning_rate": 3.395765766750192e-05, - "loss": 0.8594, + "loss": 0.8591, "num_input_tokens_seen": 70451200, "step": 8600 }, { "epoch": 0.4134453781512605, - "grad_norm": 0.47841960191726685, + "grad_norm": 0.48391351103782654, "learning_rate": 3.392066026016757e-05, - "loss": 0.8132, + "loss": 0.8133, "num_input_tokens_seen": 70533120, "step": 8610 }, { "epoch": 0.41392557022809123, - "grad_norm": 0.8451586365699768, + "grad_norm": 0.8519997596740723, "learning_rate": 3.388364045168591e-05, - "loss": 0.8289, + "loss": 0.8302, "num_input_tokens_seen": 70615040, "step": 8620 }, { "epoch": 0.414405762304922, - "grad_norm": 0.47416937351226807, + "grad_norm": 0.48617443442344666, "learning_rate": 3.3846598335019335e-05, - "loss": 0.8435, + "loss": 0.8433, "num_input_tokens_seen": 70696960, "step": 8630 }, { "epoch": 0.4148859543817527, - "grad_norm": 0.8883240818977356, + "grad_norm": 0.8815626502037048, "learning_rate": 3.38095340031863e-05, - "loss": 0.9588, + "loss": 0.959, "num_input_tokens_seen": 70778880, "step": 8640 }, { "epoch": 0.41536614645858344, - "grad_norm": 0.5661002993583679, + "grad_norm": 0.5593860149383545, "learning_rate": 3.377244754926104e-05, - "loss": 1.0117, + "loss": 1.0122, "num_input_tokens_seen": 70860800, "step": 8650 }, { "epoch": 0.41584633853541414, - "grad_norm": 0.5038189888000488, + "grad_norm": 0.5052839517593384, "learning_rate": 3.3735339066373314e-05, - "loss": 0.8984, + "loss": 0.8981, "num_input_tokens_seen": 70942720, "step": 8660 }, { "epoch": 0.4163265306122449, - "grad_norm": 1.053067922592163, + "grad_norm": 1.1593550443649292, "learning_rate": 3.369820864770822e-05, - "loss": 0.852, + "loss": 0.8525, "num_input_tokens_seen": 71024640, "step": 8670 }, { "epoch": 0.41680672268907565, - "grad_norm": 0.4945598840713501, + "grad_norm": 0.49418941140174866, "learning_rate": 3.366105638650596e-05, - "loss": 0.8927, + "loss": 0.8923, "num_input_tokens_seen": 71106560, "step": 8680 }, { "epoch": 0.41728691476590635, - "grad_norm": 0.5233136415481567, + "grad_norm": 0.5242265462875366, "learning_rate": 3.3623882376061554e-05, - "loss": 0.8245, + "loss": 0.8246, "num_input_tokens_seen": 71188480, "step": 8690 }, { "epoch": 0.4177671068427371, - "grad_norm": 0.46826255321502686, + "grad_norm": 0.4658726155757904, "learning_rate": 3.358668670972465e-05, - "loss": 0.8433, + "loss": 0.8436, "num_input_tokens_seen": 71270400, "step": 8700 }, { "epoch": 0.4182472989195678, - "grad_norm": 0.5887758731842041, + "grad_norm": 0.5847034454345703, "learning_rate": 3.354946948089927e-05, "loss": 0.9426, "num_input_tokens_seen": 71352320, @@ -6978,279 +6978,279 @@ }, { "epoch": 0.41872749099639855, - "grad_norm": 0.5028591752052307, + "grad_norm": 0.4962436556816101, "learning_rate": 3.351223078304359e-05, - "loss": 0.8831, + "loss": 0.8829, "num_input_tokens_seen": 71434240, "step": 8720 }, { "epoch": 0.4192076830732293, - "grad_norm": 0.5117266178131104, + "grad_norm": 0.5134901404380798, "learning_rate": 3.34749707096697e-05, - "loss": 0.9697, + "loss": 0.9692, "num_input_tokens_seen": 71516160, "step": 8730 }, { "epoch": 0.41968787515006, - "grad_norm": 0.5058220028877258, + "grad_norm": 0.5016522407531738, "learning_rate": 3.343768935434337e-05, - "loss": 0.8492, + "loss": 0.8491, "num_input_tokens_seen": 71598080, "step": 8740 }, { "epoch": 0.42016806722689076, - "grad_norm": 0.7977713346481323, + "grad_norm": 0.7968730330467224, "learning_rate": 3.34003868106838e-05, - "loss": 0.9056, + "loss": 0.905, "num_input_tokens_seen": 71680000, "step": 8750 }, { "epoch": 0.42064825930372146, - "grad_norm": 0.5445558428764343, + "grad_norm": 0.5383213758468628, "learning_rate": 3.3363063172363396e-05, - "loss": 0.9732, + "loss": 0.9733, "num_input_tokens_seen": 71761920, "step": 8760 }, { "epoch": 0.4211284513805522, - "grad_norm": 0.5837395787239075, + "grad_norm": 0.5921952128410339, "learning_rate": 3.3325718533107556e-05, - "loss": 1.1805, + "loss": 1.1804, "num_input_tokens_seen": 71843840, "step": 8770 }, { "epoch": 0.42160864345738297, - "grad_norm": 0.6139930486679077, + "grad_norm": 0.6270814538002014, "learning_rate": 3.3288352986694396e-05, - "loss": 0.9309, + "loss": 0.9318, "num_input_tokens_seen": 71925760, "step": 8780 }, { "epoch": 0.42208883553421367, - "grad_norm": 1.7015244960784912, + "grad_norm": 1.9248117208480835, "learning_rate": 3.325096662695454e-05, - "loss": 1.0555, + "loss": 1.0563, "num_input_tokens_seen": 72007680, "step": 8790 }, { "epoch": 0.4225690276110444, - "grad_norm": 0.49051469564437866, + "grad_norm": 0.491634339094162, "learning_rate": 3.321355954777087e-05, - "loss": 0.831, + "loss": 0.83, "num_input_tokens_seen": 72089600, "step": 8800 }, { "epoch": 0.4230492196878752, - "grad_norm": 0.5128316283226013, + "grad_norm": 0.4981280267238617, "learning_rate": 3.317613184307832e-05, - "loss": 0.9081, + "loss": 0.906, "num_input_tokens_seen": 72171520, "step": 8810 }, { "epoch": 0.4235294117647059, - "grad_norm": 0.5852062106132507, + "grad_norm": 0.5684440732002258, "learning_rate": 3.313868360686359e-05, - "loss": 0.9385, + "loss": 0.938, "num_input_tokens_seen": 72253440, "step": 8820 }, { "epoch": 0.42400960384153663, - "grad_norm": 0.4833785891532898, + "grad_norm": 0.4744153618812561, "learning_rate": 3.310121493316495e-05, - "loss": 0.8992, + "loss": 0.8989, "num_input_tokens_seen": 72335360, "step": 8830 }, { "epoch": 0.42448979591836733, - "grad_norm": 0.49650639295578003, + "grad_norm": 0.501400887966156, "learning_rate": 3.306372591607199e-05, - "loss": 0.8763, + "loss": 0.876, "num_input_tokens_seen": 72417280, "step": 8840 }, { "epoch": 0.4249699879951981, - "grad_norm": 0.8988882303237915, + "grad_norm": 0.9911435842514038, "learning_rate": 3.30262166497254e-05, - "loss": 0.8636, + "loss": 0.8632, "num_input_tokens_seen": 72499200, "step": 8850 }, { "epoch": 0.42545018007202884, - "grad_norm": 0.5129944086074829, + "grad_norm": 0.50812828540802, "learning_rate": 3.29886872283167e-05, - "loss": 1.1187, + "loss": 1.1161, "num_input_tokens_seen": 72581120, "step": 8860 }, { "epoch": 0.42593037214885954, - "grad_norm": 0.4534735381603241, + "grad_norm": 0.4761914610862732, "learning_rate": 3.2951137746088004e-05, - "loss": 0.9403, + "loss": 0.94, "num_input_tokens_seen": 72663040, "step": 8870 }, { "epoch": 0.4264105642256903, - "grad_norm": 0.48410260677337646, + "grad_norm": 0.4863661527633667, "learning_rate": 3.291356829733186e-05, - "loss": 0.8392, + "loss": 0.8393, "num_input_tokens_seen": 72744960, "step": 8880 }, { "epoch": 0.426890756302521, - "grad_norm": 0.9801385998725891, + "grad_norm": 1.054153323173523, "learning_rate": 3.287597897639092e-05, - "loss": 1.0537, + "loss": 1.0528, "num_input_tokens_seen": 72826880, "step": 8890 }, { "epoch": 0.42737094837935174, - "grad_norm": 0.7022339105606079, + "grad_norm": 0.677579402923584, "learning_rate": 3.283836987765771e-05, - "loss": 0.8691, + "loss": 0.8697, "num_input_tokens_seen": 72908800, "step": 8900 }, { "epoch": 0.4278511404561825, - "grad_norm": 0.4977160096168518, + "grad_norm": 0.48590415716171265, "learning_rate": 3.280074109557447e-05, - "loss": 0.8148, + "loss": 0.8158, "num_input_tokens_seen": 72990720, "step": 8910 }, { "epoch": 0.4283313325330132, - "grad_norm": 0.5575817823410034, + "grad_norm": 0.5039668679237366, "learning_rate": 3.2763092724632854e-05, - "loss": 0.8026, + "loss": 0.8016, "num_input_tokens_seen": 73072640, "step": 8920 }, { "epoch": 0.42881152460984395, - "grad_norm": 0.6442136764526367, + "grad_norm": 0.7368101477622986, "learning_rate": 3.272542485937369e-05, - "loss": 0.9775, + "loss": 0.9782, "num_input_tokens_seen": 73154560, "step": 8930 }, { "epoch": 0.42929171668667465, - "grad_norm": 0.4888969957828522, + "grad_norm": 0.4872770309448242, "learning_rate": 3.2687737594386766e-05, - "loss": 0.7946, + "loss": 0.7977, "num_input_tokens_seen": 73236480, "step": 8940 }, { "epoch": 0.4297719087635054, - "grad_norm": 0.9433819055557251, + "grad_norm": 1.1870628595352173, "learning_rate": 3.2650031024310603e-05, - "loss": 1.0062, + "loss": 1.0063, "num_input_tokens_seen": 73318400, "step": 8950 }, { "epoch": 0.43025210084033616, - "grad_norm": 2.151608943939209, + "grad_norm": 2.081188678741455, "learning_rate": 3.2612305243832176e-05, - "loss": 1.0847, + "loss": 1.0871, "num_input_tokens_seen": 73400320, "step": 8960 }, { "epoch": 0.43073229291716686, - "grad_norm": 0.5902963876724243, + "grad_norm": 0.5893439650535583, "learning_rate": 3.2574560347686725e-05, - "loss": 0.9699, + "loss": 0.9702, "num_input_tokens_seen": 73482240, "step": 8970 }, { "epoch": 0.4312124849939976, - "grad_norm": 0.5295826196670532, + "grad_norm": 0.5349632501602173, "learning_rate": 3.253679643065747e-05, - "loss": 0.9454, + "loss": 0.9451, "num_input_tokens_seen": 73564160, "step": 8980 }, { "epoch": 0.4316926770708283, - "grad_norm": 0.5086777210235596, + "grad_norm": 0.49540823698043823, "learning_rate": 3.249901358757538e-05, - "loss": 0.9417, + "loss": 0.9423, "num_input_tokens_seen": 73646080, "step": 8990 }, { "epoch": 0.43217286914765907, - "grad_norm": 0.5244941711425781, + "grad_norm": 0.5253874063491821, "learning_rate": 3.246121191331902e-05, - "loss": 0.9839, + "loss": 0.9845, "num_input_tokens_seen": 73728000, "step": 9000 }, { "epoch": 0.4326530612244898, - "grad_norm": 0.4932080805301666, + "grad_norm": 0.5032656788825989, "learning_rate": 3.242339150281417e-05, - "loss": 0.8661, + "loss": 0.8655, "num_input_tokens_seen": 73809920, "step": 9010 }, { "epoch": 0.4331332533013205, - "grad_norm": 0.47911617159843445, + "grad_norm": 0.48877930641174316, "learning_rate": 3.238555245103368e-05, - "loss": 0.8686, + "loss": 0.869, "num_input_tokens_seen": 73891840, "step": 9020 }, { "epoch": 0.4336134453781513, - "grad_norm": 0.5290552377700806, + "grad_norm": 0.564937949180603, "learning_rate": 3.234769485299724e-05, - "loss": 0.8504, + "loss": 0.8508, "num_input_tokens_seen": 73973760, "step": 9030 }, { "epoch": 0.434093637454982, - "grad_norm": 0.4913333058357239, + "grad_norm": 0.5002710819244385, "learning_rate": 3.230981880377107e-05, - "loss": 1.0346, + "loss": 1.035, "num_input_tokens_seen": 74055680, "step": 9040 }, { "epoch": 0.4345738295318127, - "grad_norm": 0.8791585564613342, + "grad_norm": 0.8834782242774963, "learning_rate": 3.227192439846775e-05, - "loss": 0.9299, + "loss": 0.9306, "num_input_tokens_seen": 74137600, "step": 9050 }, { "epoch": 0.4350540216086435, - "grad_norm": 0.4976731240749359, + "grad_norm": 0.4867245554924011, "learning_rate": 3.223401173224595e-05, "loss": 0.9369, "num_input_tokens_seen": 74219520, @@ -7258,183 +7258,183 @@ }, { "epoch": 0.4355342136854742, - "grad_norm": 0.49430936574935913, + "grad_norm": 0.49409911036491394, "learning_rate": 3.219608090031021e-05, - "loss": 0.8562, + "loss": 0.8561, "num_input_tokens_seen": 74301440, "step": 9070 }, { "epoch": 0.43601440576230494, - "grad_norm": 0.4978736937046051, + "grad_norm": 0.4928935766220093, "learning_rate": 3.215813199791065e-05, - "loss": 1.0076, + "loss": 1.0078, "num_input_tokens_seen": 74383360, "step": 9080 }, { "epoch": 0.43649459783913563, - "grad_norm": 0.5289158225059509, + "grad_norm": 0.5236163139343262, "learning_rate": 3.212016512034279e-05, - "loss": 0.8394, + "loss": 0.8402, "num_input_tokens_seen": 74465280, "step": 9090 }, { "epoch": 0.4369747899159664, - "grad_norm": 0.4874771535396576, + "grad_norm": 0.4845563471317291, "learning_rate": 3.2082180362947304e-05, - "loss": 0.9406, + "loss": 0.9409, "num_input_tokens_seen": 74547200, "step": 9100 }, { "epoch": 0.43745498199279714, - "grad_norm": 0.5047212243080139, + "grad_norm": 0.5000022649765015, "learning_rate": 3.204417782110974e-05, - "loss": 1.0131, + "loss": 1.0121, "num_input_tokens_seen": 74629120, "step": 9110 }, { "epoch": 0.43793517406962784, - "grad_norm": 0.5818035006523132, + "grad_norm": 0.5826824307441711, "learning_rate": 3.200615759026031e-05, - "loss": 1.0524, + "loss": 1.0525, "num_input_tokens_seen": 74711040, "step": 9120 }, { "epoch": 0.4384153661464586, - "grad_norm": 0.5106756687164307, + "grad_norm": 0.5062296390533447, "learning_rate": 3.1968119765873654e-05, - "loss": 0.8463, + "loss": 0.8456, "num_input_tokens_seen": 74792960, "step": 9130 }, { "epoch": 0.4388955582232893, - "grad_norm": 1.4386917352676392, + "grad_norm": 1.3879985809326172, "learning_rate": 3.193006444346859e-05, - "loss": 0.8177, + "loss": 0.8187, "num_input_tokens_seen": 74874880, "step": 9140 }, { "epoch": 0.43937575030012005, - "grad_norm": 0.5303124785423279, + "grad_norm": 0.5301570892333984, "learning_rate": 3.189199171860787e-05, - "loss": 1.1675, + "loss": 1.1672, "num_input_tokens_seen": 74956800, "step": 9150 }, { "epoch": 0.4398559423769508, - "grad_norm": 0.5052891969680786, + "grad_norm": 0.5018822550773621, "learning_rate": 3.185390168689796e-05, - "loss": 0.7782, + "loss": 0.7784, "num_input_tokens_seen": 75038720, "step": 9160 }, { "epoch": 0.4403361344537815, - "grad_norm": 0.5097294449806213, + "grad_norm": 0.5099103450775146, "learning_rate": 3.1815794443988763e-05, - "loss": 0.74, + "loss": 0.7409, "num_input_tokens_seen": 75120640, "step": 9170 }, { "epoch": 0.44081632653061226, - "grad_norm": 1.9583684206008911, + "grad_norm": 1.997220516204834, "learning_rate": 3.177767008557343e-05, - "loss": 0.9299, + "loss": 0.9283, "num_input_tokens_seen": 75202560, "step": 9180 }, { "epoch": 0.44129651860744296, - "grad_norm": 0.5055694580078125, + "grad_norm": 0.4805624186992645, "learning_rate": 3.1739528707388066e-05, - "loss": 0.8375, + "loss": 0.8384, "num_input_tokens_seen": 75284480, "step": 9190 }, { "epoch": 0.4417767106842737, - "grad_norm": 0.5287858247756958, + "grad_norm": 0.5211551785469055, "learning_rate": 3.1701370405211535e-05, - "loss": 0.8294, + "loss": 0.8281, "num_input_tokens_seen": 75366400, "step": 9200 }, { "epoch": 0.44225690276110446, - "grad_norm": 0.5322756171226501, + "grad_norm": 0.5320281386375427, "learning_rate": 3.166319527486519e-05, - "loss": 0.965, + "loss": 0.9655, "num_input_tokens_seen": 75448320, "step": 9210 }, { "epoch": 0.44273709483793516, - "grad_norm": 0.475495845079422, + "grad_norm": 0.4732847511768341, "learning_rate": 3.162500341221264e-05, - "loss": 0.9839, + "loss": 0.9852, "num_input_tokens_seen": 75530240, "step": 9220 }, { "epoch": 0.4432172869147659, - "grad_norm": 0.5287263989448547, + "grad_norm": 0.5219468474388123, "learning_rate": 3.158679491315951e-05, - "loss": 0.9622, + "loss": 0.9623, "num_input_tokens_seen": 75612160, "step": 9230 }, { "epoch": 0.4436974789915966, - "grad_norm": 0.9584441184997559, + "grad_norm": 0.9582969546318054, "learning_rate": 3.154856987365322e-05, - "loss": 0.9561, + "loss": 0.9564, "num_input_tokens_seen": 75694080, "step": 9240 }, { "epoch": 0.44417767106842737, - "grad_norm": 0.5105222463607788, + "grad_norm": 0.49347057938575745, "learning_rate": 3.151032838968271e-05, - "loss": 0.794, + "loss": 0.7948, "num_input_tokens_seen": 75776000, "step": 9250 }, { "epoch": 0.4446578631452581, - "grad_norm": 0.5271983742713928, + "grad_norm": 0.5148050785064697, "learning_rate": 3.14720705572782e-05, - "loss": 1.0292, + "loss": 1.0294, "num_input_tokens_seen": 75857920, "step": 9260 }, { "epoch": 0.4451380552220888, - "grad_norm": 0.6865978240966797, + "grad_norm": 0.6848868727684021, "learning_rate": 3.1433796472511e-05, - "loss": 0.865, + "loss": 0.8652, "num_input_tokens_seen": 75939840, "step": 9270 }, { "epoch": 0.4456182472989196, - "grad_norm": 0.5002774596214294, + "grad_norm": 0.5023311972618103, "learning_rate": 3.13955062314932e-05, - "loss": 1.0853, + "loss": 1.0851, "num_input_tokens_seen": 76021760, "step": 9280 }, { "epoch": 0.4460984393757503, - "grad_norm": 0.5186232924461365, + "grad_norm": 0.5274611711502075, "learning_rate": 3.135719993037748e-05, "loss": 0.9558, "num_input_tokens_seen": 76103680, @@ -7442,7 +7442,7 @@ }, { "epoch": 0.44657863145258103, - "grad_norm": 0.5334696769714355, + "grad_norm": 0.535510241985321, "learning_rate": 3.131887766535684e-05, "loss": 0.9276, "num_input_tokens_seen": 76185600, @@ -7450,143 +7450,143 @@ }, { "epoch": 0.4470588235294118, - "grad_norm": 0.49361923336982727, + "grad_norm": 0.48562875390052795, "learning_rate": 3.1280539532664366e-05, - "loss": 0.8235, + "loss": 0.824, "num_input_tokens_seen": 76267520, "step": 9310 }, { "epoch": 0.4475390156062425, - "grad_norm": 0.4922276735305786, + "grad_norm": 0.49112045764923096, "learning_rate": 3.1242185628573e-05, - "loss": 0.8764, + "loss": 0.876, "num_input_tokens_seen": 76349440, "step": 9320 }, { "epoch": 0.44801920768307324, - "grad_norm": 0.5455789566040039, + "grad_norm": 0.5443177819252014, "learning_rate": 3.120381604939529e-05, - "loss": 0.8174, + "loss": 0.8179, "num_input_tokens_seen": 76431360, "step": 9330 }, { "epoch": 0.44849939975990394, - "grad_norm": 0.5186362862586975, + "grad_norm": 0.53428053855896, "learning_rate": 3.116543089148312e-05, - "loss": 0.9815, + "loss": 0.9818, "num_input_tokens_seen": 76513280, "step": 9340 }, { "epoch": 0.4489795918367347, - "grad_norm": 0.5068888664245605, + "grad_norm": 0.5094349980354309, "learning_rate": 3.112703025122754e-05, - "loss": 0.7665, + "loss": 0.767, "num_input_tokens_seen": 76595200, "step": 9350 }, { "epoch": 0.44945978391356545, - "grad_norm": 0.5032262206077576, + "grad_norm": 0.5008413791656494, "learning_rate": 3.108861422505842e-05, - "loss": 0.7452, + "loss": 0.7457, "num_input_tokens_seen": 76677120, "step": 9360 }, { "epoch": 0.44993997599039615, - "grad_norm": 0.4863367974758148, + "grad_norm": 0.4887971580028534, "learning_rate": 3.105018290944432e-05, - "loss": 0.9044, + "loss": 0.9048, "num_input_tokens_seen": 76759040, "step": 9370 }, { "epoch": 0.4504201680672269, - "grad_norm": 0.49691611528396606, + "grad_norm": 0.5146111845970154, "learning_rate": 3.1011736400892175e-05, - "loss": 0.8608, + "loss": 0.86, "num_input_tokens_seen": 76840960, "step": 9380 }, { "epoch": 0.4509003601440576, - "grad_norm": 0.40895968675613403, + "grad_norm": 0.4209975600242615, "learning_rate": 3.097327479594707e-05, - "loss": 0.7893, + "loss": 0.793, "num_input_tokens_seen": 76922880, "step": 9390 }, { "epoch": 0.45138055222088835, - "grad_norm": 0.4914085865020752, + "grad_norm": 0.4876270294189453, "learning_rate": 3.093479819119198e-05, - "loss": 0.935, + "loss": 0.9349, "num_input_tokens_seen": 77004800, "step": 9400 }, { "epoch": 0.4518607442977191, - "grad_norm": 0.4737802743911743, + "grad_norm": 0.47249627113342285, "learning_rate": 3.089630668324759e-05, - "loss": 0.92, + "loss": 0.9205, "num_input_tokens_seen": 77086720, "step": 9410 }, { "epoch": 0.4523409363745498, - "grad_norm": 0.4802420437335968, + "grad_norm": 0.4846927225589752, "learning_rate": 3.085780036877197e-05, - "loss": 0.8323, + "loss": 0.8324, "num_input_tokens_seen": 77168640, "step": 9420 }, { "epoch": 0.45282112845138056, - "grad_norm": 0.8194192051887512, + "grad_norm": 0.8228050470352173, "learning_rate": 3.0819279344460396e-05, - "loss": 0.8059, + "loss": 0.8051, "num_input_tokens_seen": 77250560, "step": 9430 }, { "epoch": 0.45330132052821126, - "grad_norm": 0.500428318977356, + "grad_norm": 0.5003042221069336, "learning_rate": 3.078074370704507e-05, - "loss": 0.9285, + "loss": 0.9294, "num_input_tokens_seen": 77332480, "step": 9440 }, { "epoch": 0.453781512605042, - "grad_norm": 0.6494525671005249, + "grad_norm": 0.6587066054344177, "learning_rate": 3.07421935532949e-05, - "loss": 0.8807, + "loss": 0.8818, "num_input_tokens_seen": 77414400, "step": 9450 }, { "epoch": 0.45426170468187277, - "grad_norm": 0.6894603967666626, + "grad_norm": 0.6838945746421814, "learning_rate": 3.0703628980015214e-05, - "loss": 0.9211, + "loss": 0.9213, "num_input_tokens_seen": 77496320, "step": 9460 }, { "epoch": 0.45474189675870347, - "grad_norm": 0.4977949261665344, + "grad_norm": 0.5055971145629883, "learning_rate": 3.0665050084047605e-05, - "loss": 0.8221, + "loss": 0.8218, "num_input_tokens_seen": 77578240, "step": 9470 }, { "epoch": 0.4552220888355342, - "grad_norm": 0.49541839957237244, + "grad_norm": 0.5079997181892395, "learning_rate": 3.062645696226959e-05, "loss": 0.7467, "num_input_tokens_seen": 77660160, @@ -7594,135 +7594,135 @@ }, { "epoch": 0.4557022809123649, - "grad_norm": 0.48292815685272217, + "grad_norm": 0.487131267786026, "learning_rate": 3.0587849711594425e-05, - "loss": 0.982, + "loss": 0.9819, "num_input_tokens_seen": 77742080, "step": 9490 }, { "epoch": 0.4561824729891957, - "grad_norm": 0.5033644437789917, + "grad_norm": 0.5070083737373352, "learning_rate": 3.054922842897084e-05, - "loss": 0.863, + "loss": 0.8633, "num_input_tokens_seen": 77824000, "step": 9500 }, { "epoch": 0.45666266506602643, - "grad_norm": 0.5042487978935242, + "grad_norm": 0.508251965045929, "learning_rate": 3.051059321138281e-05, - "loss": 1.0322, + "loss": 1.0312, "num_input_tokens_seen": 77905920, "step": 9510 }, { "epoch": 0.45714285714285713, - "grad_norm": 0.49270498752593994, + "grad_norm": 0.4875255823135376, "learning_rate": 3.047194415584929e-05, - "loss": 0.9702, + "loss": 0.9672, "num_input_tokens_seen": 77987840, "step": 9520 }, { "epoch": 0.4576230492196879, - "grad_norm": 0.5053675174713135, + "grad_norm": 0.5202212333679199, "learning_rate": 3.0433281359424008e-05, - "loss": 0.9051, + "loss": 0.9048, "num_input_tokens_seen": 78069760, "step": 9530 }, { "epoch": 0.4581032412965186, - "grad_norm": 0.4840089976787567, + "grad_norm": 0.484787255525589, "learning_rate": 3.0394604919195156e-05, - "loss": 0.785, + "loss": 0.7858, "num_input_tokens_seen": 78151680, "step": 9540 }, { "epoch": 0.45858343337334934, - "grad_norm": 0.4885089099407196, + "grad_norm": 0.4984806776046753, "learning_rate": 3.0355914932285228e-05, - "loss": 0.9501, + "loss": 0.9508, "num_input_tokens_seen": 78233600, "step": 9550 }, { "epoch": 0.4590636254501801, - "grad_norm": 0.5052339434623718, + "grad_norm": 0.5085755586624146, "learning_rate": 3.0317211495850717e-05, - "loss": 0.8565, + "loss": 0.8558, "num_input_tokens_seen": 78315520, "step": 9560 }, { "epoch": 0.4595438175270108, - "grad_norm": 0.49368199706077576, + "grad_norm": 0.4945325553417206, "learning_rate": 3.02784947070819e-05, - "loss": 0.7823, + "loss": 0.7818, "num_input_tokens_seen": 78397440, "step": 9570 }, { "epoch": 0.46002400960384154, - "grad_norm": 0.48532119393348694, + "grad_norm": 0.4864473044872284, "learning_rate": 3.0239764663202562e-05, - "loss": 0.9049, + "loss": 0.9045, "num_input_tokens_seen": 78479360, "step": 9580 }, { "epoch": 0.46050420168067224, - "grad_norm": 0.48147672414779663, + "grad_norm": 0.4791228175163269, "learning_rate": 3.0201021461469803e-05, - "loss": 0.9584, + "loss": 0.9581, "num_input_tokens_seen": 78561280, "step": 9590 }, { "epoch": 0.460984393757503, - "grad_norm": 0.49447527527809143, + "grad_norm": 0.48919251561164856, "learning_rate": 3.0162265199173738e-05, - "loss": 0.8946, + "loss": 0.8943, "num_input_tokens_seen": 78643200, "step": 9600 }, { "epoch": 0.46146458583433375, - "grad_norm": 0.44538614153862, + "grad_norm": 0.4210091531276703, "learning_rate": 3.0123495973637305e-05, - "loss": 0.973, + "loss": 0.9728, "num_input_tokens_seen": 78725120, "step": 9610 }, { "epoch": 0.46194477791116445, - "grad_norm": 0.4778103530406952, + "grad_norm": 0.4873916208744049, "learning_rate": 3.008471388221597e-05, - "loss": 0.8953, + "loss": 0.8959, "num_input_tokens_seen": 78807040, "step": 9620 }, { "epoch": 0.4624249699879952, - "grad_norm": 0.5161569714546204, + "grad_norm": 0.5187123417854309, "learning_rate": 3.0045919022297524e-05, - "loss": 0.7189, + "loss": 0.7175, "num_input_tokens_seen": 78888960, "step": 9630 }, { "epoch": 0.4629051620648259, - "grad_norm": 0.47274622321128845, + "grad_norm": 0.46600180864334106, "learning_rate": 3.0007111491301816e-05, - "loss": 0.8521, + "loss": 0.8525, "num_input_tokens_seen": 78970880, "step": 9640 }, { "epoch": 0.46338535414165666, - "grad_norm": 0.4988880753517151, + "grad_norm": 0.5010501146316528, "learning_rate": 2.9968291386680503e-05, "loss": 0.941, "num_input_tokens_seen": 79052800, @@ -7730,231 +7730,231 @@ }, { "epoch": 0.4638655462184874, - "grad_norm": 0.6255693435668945, + "grad_norm": 0.6152305006980896, "learning_rate": 2.9929458805916837e-05, - "loss": 1.0642, + "loss": 1.066, "num_input_tokens_seen": 79134720, "step": 9660 }, { "epoch": 0.4643457382953181, - "grad_norm": 0.39187878370285034, + "grad_norm": 0.40477296710014343, "learning_rate": 2.9890613846525395e-05, - "loss": 0.7306, + "loss": 0.7304, "num_input_tokens_seen": 79216640, "step": 9670 }, { "epoch": 0.46482593037214887, - "grad_norm": 1.9925369024276733, + "grad_norm": 2.6527504920959473, "learning_rate": 2.9851756606051817e-05, - "loss": 0.8926, + "loss": 0.8932, "num_input_tokens_seen": 79298560, "step": 9680 }, { "epoch": 0.46530612244897956, - "grad_norm": 0.48940789699554443, + "grad_norm": 0.4877612292766571, "learning_rate": 2.9812887182072607e-05, - "loss": 0.9238, + "loss": 0.9239, "num_input_tokens_seen": 79380480, "step": 9690 }, { "epoch": 0.4657863145258103, - "grad_norm": 0.5379921197891235, + "grad_norm": 0.5364747643470764, "learning_rate": 2.9774005672194854e-05, - "loss": 0.8351, + "loss": 0.8346, "num_input_tokens_seen": 79462400, "step": 9700 }, { "epoch": 0.4662665066026411, - "grad_norm": 0.5129654407501221, + "grad_norm": 0.5188019871711731, "learning_rate": 2.9735112174056006e-05, - "loss": 0.8861, + "loss": 0.886, "num_input_tokens_seen": 79544320, "step": 9710 }, { "epoch": 0.46674669867947177, - "grad_norm": 0.4713475704193115, + "grad_norm": 0.47986412048339844, "learning_rate": 2.96962067853236e-05, - "loss": 0.8593, + "loss": 0.8596, "num_input_tokens_seen": 79626240, "step": 9720 }, { "epoch": 0.4672268907563025, - "grad_norm": 4.391692161560059, + "grad_norm": 4.085470199584961, "learning_rate": 2.9657289603695037e-05, - "loss": 0.8369, + "loss": 0.8394, "num_input_tokens_seen": 79708160, "step": 9730 }, { "epoch": 0.4677070828331333, - "grad_norm": 0.5324821472167969, + "grad_norm": 0.5369517803192139, "learning_rate": 2.9618360726897344e-05, - "loss": 1.3, + "loss": 1.2984, "num_input_tokens_seen": 79790080, "step": 9740 }, { "epoch": 0.468187274909964, - "grad_norm": 0.5004756450653076, + "grad_norm": 0.4899786114692688, "learning_rate": 2.957942025268689e-05, - "loss": 0.9522, + "loss": 0.9511, "num_input_tokens_seen": 79872000, "step": 9750 }, { "epoch": 0.46866746698679473, - "grad_norm": 0.5157467722892761, + "grad_norm": 0.5062848329544067, "learning_rate": 2.9540468278849208e-05, - "loss": 0.8728, + "loss": 0.8727, "num_input_tokens_seen": 79953920, "step": 9760 }, { "epoch": 0.46914765906362543, - "grad_norm": 0.4873751699924469, + "grad_norm": 0.48712435364723206, "learning_rate": 2.950150490319866e-05, - "loss": 0.8798, + "loss": 0.8799, "num_input_tokens_seen": 80035840, "step": 9770 }, { "epoch": 0.4696278511404562, - "grad_norm": 0.9361140727996826, + "grad_norm": 0.9658442735671997, "learning_rate": 2.9462530223578273e-05, - "loss": 0.9698, + "loss": 0.9699, "num_input_tokens_seen": 80117760, "step": 9780 }, { "epoch": 0.47010804321728694, - "grad_norm": 0.5102428197860718, + "grad_norm": 0.5106319785118103, "learning_rate": 2.9423544337859454e-05, - "loss": 0.8272, + "loss": 0.8275, "num_input_tokens_seen": 80199680, "step": 9790 }, { "epoch": 0.47058823529411764, - "grad_norm": 0.5207327008247375, + "grad_norm": 0.519828736782074, "learning_rate": 2.938454734394174e-05, - "loss": 0.8527, + "loss": 0.8525, "num_input_tokens_seen": 80281600, "step": 9800 }, { "epoch": 0.4710684273709484, - "grad_norm": 0.5059601068496704, + "grad_norm": 0.5150085687637329, "learning_rate": 2.9345539339752575e-05, - "loss": 0.9777, + "loss": 0.9778, "num_input_tokens_seen": 80363520, "step": 9810 }, { "epoch": 0.4715486194477791, - "grad_norm": 0.5397816896438599, + "grad_norm": 0.5509699583053589, "learning_rate": 2.9306520423247045e-05, - "loss": 0.8295, + "loss": 0.829, "num_input_tokens_seen": 80445440, "step": 9820 }, { "epoch": 0.47202881152460985, - "grad_norm": 0.4623885750770569, + "grad_norm": 0.4511142671108246, "learning_rate": 2.9267490692407635e-05, - "loss": 0.9394, + "loss": 0.9411, "num_input_tokens_seen": 80527360, "step": 9830 }, { "epoch": 0.4725090036014406, - "grad_norm": 0.48748502135276794, + "grad_norm": 0.48220759630203247, "learning_rate": 2.9228450245243993e-05, - "loss": 0.9649, + "loss": 0.9641, "num_input_tokens_seen": 80609280, "step": 9840 }, { "epoch": 0.4729891956782713, - "grad_norm": 0.493064820766449, + "grad_norm": 0.4902176856994629, "learning_rate": 2.9189399179792676e-05, - "loss": 1.1218, + "loss": 1.1205, "num_input_tokens_seen": 80691200, "step": 9850 }, { "epoch": 0.47346938775510206, - "grad_norm": 0.5036744475364685, + "grad_norm": 0.49037599563598633, "learning_rate": 2.9150337594116904e-05, - "loss": 0.9087, + "loss": 0.9089, "num_input_tokens_seen": 80773120, "step": 9860 }, { "epoch": 0.47394957983193275, - "grad_norm": 0.5360898971557617, + "grad_norm": 0.525600016117096, "learning_rate": 2.9111265586306314e-05, - "loss": 0.849, + "loss": 0.8489, "num_input_tokens_seen": 80855040, "step": 9870 }, { "epoch": 0.4744297719087635, - "grad_norm": 0.521198034286499, + "grad_norm": 0.5158500075340271, "learning_rate": 2.9072183254476713e-05, - "loss": 0.8285, + "loss": 0.8287, "num_input_tokens_seen": 80936960, "step": 9880 }, { "epoch": 0.47490996398559426, - "grad_norm": 0.4821554720401764, + "grad_norm": 0.4754113554954529, "learning_rate": 2.903309069676984e-05, - "loss": 0.9502, + "loss": 0.9506, "num_input_tokens_seen": 81018880, "step": 9890 }, { "epoch": 0.47539015606242496, - "grad_norm": 0.483500212430954, + "grad_norm": 0.48330146074295044, "learning_rate": 2.8993988011353112e-05, - "loss": 0.8596, + "loss": 0.8601, "num_input_tokens_seen": 81100800, "step": 9900 }, { "epoch": 0.4758703481392557, - "grad_norm": 0.5084524750709534, + "grad_norm": 0.4962432086467743, "learning_rate": 2.8954875296419364e-05, - "loss": 0.8992, + "loss": 0.9003, "num_input_tokens_seen": 81182720, "step": 9910 }, { "epoch": 0.4763505402160864, - "grad_norm": 0.5362197160720825, + "grad_norm": 0.5256210565567017, "learning_rate": 2.8915752650186635e-05, - "loss": 0.9792, + "loss": 0.9797, "num_input_tokens_seen": 81264640, "step": 9920 }, { "epoch": 0.47683073229291717, - "grad_norm": 0.5296033024787903, + "grad_norm": 0.529623806476593, "learning_rate": 2.8876620170897895e-05, - "loss": 0.7938, + "loss": 0.7935, "num_input_tokens_seen": 81346560, "step": 9930 }, { "epoch": 0.4773109243697479, - "grad_norm": 0.5047898888587952, + "grad_norm": 0.4984219968318939, "learning_rate": 2.88374779568208e-05, "loss": 0.8632, "num_input_tokens_seen": 81428480, @@ -7962,63 +7962,63 @@ }, { "epoch": 0.4777911164465786, - "grad_norm": 0.5180176496505737, + "grad_norm": 0.5229691863059998, "learning_rate": 2.879832610624747e-05, - "loss": 0.8698, + "loss": 0.8712, "num_input_tokens_seen": 81510400, "step": 9950 }, { "epoch": 0.4782713085234094, - "grad_norm": 0.4948786795139313, + "grad_norm": 0.49954238533973694, "learning_rate": 2.8759164717494202e-05, - "loss": 0.8159, + "loss": 0.8157, "num_input_tokens_seen": 81592320, "step": 9960 }, { "epoch": 0.4787515006002401, - "grad_norm": 0.5000596046447754, + "grad_norm": 0.5026626586914062, "learning_rate": 2.8719993888901258e-05, - "loss": 1.0499, + "loss": 1.05, "num_input_tokens_seen": 81674240, "step": 9970 }, { "epoch": 0.47923169267707083, - "grad_norm": 0.5099364519119263, + "grad_norm": 0.5073908567428589, "learning_rate": 2.86808137188326e-05, - "loss": 0.8724, + "loss": 0.8731, "num_input_tokens_seen": 81756160, "step": 9980 }, { "epoch": 0.4797118847539016, - "grad_norm": 0.48814287781715393, + "grad_norm": 0.48957353830337524, "learning_rate": 2.8641624305675657e-05, - "loss": 0.9881, + "loss": 0.9884, "num_input_tokens_seen": 81838080, "step": 9990 }, { "epoch": 0.4801920768307323, - "grad_norm": 0.46351805329322815, + "grad_norm": 0.46365344524383545, "learning_rate": 2.8602425747841057e-05, - "loss": 0.7335, + "loss": 0.7331, "num_input_tokens_seen": 81920000, "step": 10000 }, { "epoch": 0.48067226890756304, - "grad_norm": 0.4863956868648529, + "grad_norm": 0.4974704682826996, "learning_rate": 2.8563218143762383e-05, - "loss": 0.8356, + "loss": 0.8353, "num_input_tokens_seen": 82001920, "step": 10010 }, { "epoch": 0.48115246098439374, - "grad_norm": 0.5032678246498108, + "grad_norm": 0.5029409527778625, "learning_rate": 2.852400159189597e-05, "loss": 0.8458, "num_input_tokens_seen": 82083840, @@ -8026,39 +8026,39 @@ }, { "epoch": 0.4816326530612245, - "grad_norm": 0.4915202558040619, + "grad_norm": 0.4853820502758026, "learning_rate": 2.848477619072059e-05, - "loss": 0.995, + "loss": 0.9951, "num_input_tokens_seen": 82165760, "step": 10030 }, { "epoch": 0.48211284513805525, - "grad_norm": 0.9962053894996643, + "grad_norm": 0.5146056413650513, "learning_rate": 2.8445542038737245e-05, - "loss": 1.0218, + "loss": 1.0225, "num_input_tokens_seen": 82247680, "step": 10040 }, { "epoch": 0.48259303721488594, - "grad_norm": 0.49556922912597656, + "grad_norm": 0.4969925284385681, "learning_rate": 2.8406299234468915e-05, - "loss": 0.9613, + "loss": 0.9611, "num_input_tokens_seen": 82329600, "step": 10050 }, { "epoch": 0.4830732292917167, - "grad_norm": 0.5691222548484802, + "grad_norm": 0.5693634152412415, "learning_rate": 2.8367047876460305e-05, - "loss": 1.0402, + "loss": 1.0389, "num_input_tokens_seen": 82411520, "step": 10060 }, { "epoch": 0.4835534213685474, - "grad_norm": 0.4867800176143646, + "grad_norm": 0.48920491337776184, "learning_rate": 2.8327788063277594e-05, "loss": 0.9294, "num_input_tokens_seen": 82493440, @@ -8066,7 +8066,7 @@ }, { "epoch": 0.48403361344537815, - "grad_norm": 0.5041534304618835, + "grad_norm": 0.4980778098106384, "learning_rate": 2.8288519893508212e-05, "loss": 0.8672, "num_input_tokens_seen": 82575360, @@ -8074,15 +8074,15 @@ }, { "epoch": 0.4845138055222089, - "grad_norm": 0.44031843543052673, + "grad_norm": 0.45296671986579895, "learning_rate": 2.8249243465760566e-05, - "loss": 0.8052, + "loss": 0.805, "num_input_tokens_seen": 82657280, "step": 10090 }, { "epoch": 0.4849939975990396, - "grad_norm": 0.48101985454559326, + "grad_norm": 0.4847956895828247, "learning_rate": 2.8209958878663778e-05, "loss": 0.8885, "num_input_tokens_seen": 82739200, @@ -8090,15 +8090,15 @@ }, { "epoch": 0.48547418967587036, - "grad_norm": 1.5555214881896973, + "grad_norm": 1.503799319267273, "learning_rate": 2.817066623086748e-05, - "loss": 0.8561, + "loss": 0.855, "num_input_tokens_seen": 82821120, "step": 10110 }, { "epoch": 0.48595438175270106, - "grad_norm": 0.48857659101486206, + "grad_norm": 0.4808594286441803, "learning_rate": 2.813136562104155e-05, "loss": 0.8841, "num_input_tokens_seen": 82903040, @@ -8106,79 +8106,79 @@ }, { "epoch": 0.4864345738295318, - "grad_norm": 0.49165698885917664, + "grad_norm": 0.4852427542209625, "learning_rate": 2.8092057147875856e-05, - "loss": 0.9224, + "loss": 0.9228, "num_input_tokens_seen": 82984960, "step": 10130 }, { "epoch": 0.48691476590636257, - "grad_norm": 0.5225831270217896, + "grad_norm": 0.5168618559837341, "learning_rate": 2.8052740910079994e-05, - "loss": 0.9768, + "loss": 0.9771, "num_input_tokens_seen": 83066880, "step": 10140 }, { "epoch": 0.48739495798319327, - "grad_norm": 0.5045091509819031, + "grad_norm": 0.4985884428024292, "learning_rate": 2.8013417006383076e-05, - "loss": 0.9255, + "loss": 0.9318, "num_input_tokens_seen": 83148800, "step": 10150 }, { "epoch": 0.487875150060024, - "grad_norm": 0.5114516615867615, + "grad_norm": 0.49745380878448486, "learning_rate": 2.7974085535533473e-05, - "loss": 0.8565, + "loss": 0.8569, "num_input_tokens_seen": 83230720, "step": 10160 }, { "epoch": 0.4883553421368547, - "grad_norm": 0.5040556788444519, + "grad_norm": 0.48418742418289185, "learning_rate": 2.7934746596298535e-05, - "loss": 1.0491, + "loss": 1.0495, "num_input_tokens_seen": 83312640, "step": 10170 }, { "epoch": 0.4888355342136855, - "grad_norm": 1.6331427097320557, + "grad_norm": 1.714831829071045, "learning_rate": 2.789540028746438e-05, - "loss": 0.8835, + "loss": 0.8839, "num_input_tokens_seen": 83394560, "step": 10180 }, { "epoch": 0.48931572629051623, - "grad_norm": 0.6383754014968872, + "grad_norm": 0.6471736431121826, "learning_rate": 2.785604670783563e-05, - "loss": 0.9588, + "loss": 0.9594, "num_input_tokens_seen": 83476480, "step": 10190 }, { "epoch": 0.4897959183673469, - "grad_norm": 0.5155805349349976, + "grad_norm": 0.5089517831802368, "learning_rate": 2.7816685956235165e-05, - "loss": 0.7832, + "loss": 0.7835, "num_input_tokens_seen": 83558400, "step": 10200 }, { "epoch": 0.4902761104441777, - "grad_norm": 0.5443750619888306, + "grad_norm": 0.5460789203643799, "learning_rate": 2.7777318131503873e-05, - "loss": 0.9374, + "loss": 0.9379, "num_input_tokens_seen": 83640320, "step": 10210 }, { "epoch": 0.4907563025210084, - "grad_norm": 0.4726339280605316, + "grad_norm": 0.4767736792564392, "learning_rate": 2.773794333250041e-05, "loss": 0.767, "num_input_tokens_seen": 83722240, @@ -8186,15 +8186,15 @@ }, { "epoch": 0.49123649459783914, - "grad_norm": 0.4907895028591156, + "grad_norm": 0.5004221200942993, "learning_rate": 2.769856165810093e-05, - "loss": 0.9588, + "loss": 0.9591, "num_input_tokens_seen": 83804160, "step": 10230 }, { "epoch": 0.4917166866746699, - "grad_norm": 0.5048187375068665, + "grad_norm": 0.5016967058181763, "learning_rate": 2.765917320719887e-05, "loss": 0.929, "num_input_tokens_seen": 83886080, @@ -8202,23 +8202,23 @@ }, { "epoch": 0.4921968787515006, - "grad_norm": 0.4887201189994812, + "grad_norm": 0.4898025393486023, "learning_rate": 2.7619778078704685e-05, - "loss": 0.8619, + "loss": 0.8609, "num_input_tokens_seen": 83968000, "step": 10250 }, { "epoch": 0.49267707082833134, - "grad_norm": 0.5038937926292419, + "grad_norm": 0.4977264106273651, "learning_rate": 2.7580376371545573e-05, - "loss": 0.7223, + "loss": 0.7207, "num_input_tokens_seen": 84049920, "step": 10260 }, { "epoch": 0.49315726290516204, - "grad_norm": 0.505540668964386, + "grad_norm": 0.5083197951316833, "learning_rate": 2.7540968184665283e-05, "loss": 0.8415, "num_input_tokens_seen": 84131840, @@ -8226,79 +8226,79 @@ }, { "epoch": 0.4936374549819928, - "grad_norm": 0.4871721565723419, + "grad_norm": 0.4886813461780548, "learning_rate": 2.7501553617023816e-05, - "loss": 0.9001, + "loss": 0.9007, "num_input_tokens_seen": 84213760, "step": 10280 }, { "epoch": 0.49411764705882355, - "grad_norm": 0.4944019019603729, + "grad_norm": 0.48880535364151, "learning_rate": 2.7462132767597205e-05, - "loss": 0.8124, + "loss": 0.8125, "num_input_tokens_seen": 84295680, "step": 10290 }, { "epoch": 0.49459783913565425, - "grad_norm": 0.5085113644599915, + "grad_norm": 0.5156389474868774, "learning_rate": 2.742270573537724e-05, - "loss": 0.8611, + "loss": 0.8609, "num_input_tokens_seen": 84377600, "step": 10300 }, { "epoch": 0.495078031212485, - "grad_norm": 0.4947553277015686, + "grad_norm": 0.5027367472648621, "learning_rate": 2.7383272619371276e-05, - "loss": 0.9993, + "loss": 0.9997, "num_input_tokens_seen": 84459520, "step": 10310 }, { "epoch": 0.4955582232893157, - "grad_norm": 0.4945763945579529, + "grad_norm": 0.49897968769073486, "learning_rate": 2.7343833518601913e-05, - "loss": 0.8757, + "loss": 0.8754, "num_input_tokens_seen": 84541440, "step": 10320 }, { "epoch": 0.49603841536614646, - "grad_norm": 0.47624143958091736, + "grad_norm": 0.4910091459751129, "learning_rate": 2.7304388532106768e-05, - "loss": 1.1208, + "loss": 1.121, "num_input_tokens_seen": 84623360, "step": 10330 }, { "epoch": 0.4965186074429772, - "grad_norm": 0.5344643592834473, + "grad_norm": 0.5233538150787354, "learning_rate": 2.726493775893828e-05, - "loss": 0.7873, + "loss": 0.7872, "num_input_tokens_seen": 84705280, "step": 10340 }, { "epoch": 0.4969987995198079, - "grad_norm": 0.5042626261711121, + "grad_norm": 0.5136865377426147, "learning_rate": 2.7225481298163387e-05, - "loss": 0.9001, + "loss": 0.9003, "num_input_tokens_seen": 84787200, "step": 10350 }, { "epoch": 0.49747899159663866, - "grad_norm": 0.4883025884628296, + "grad_norm": 0.49212074279785156, "learning_rate": 2.718601924886332e-05, - "loss": 1.0793, + "loss": 1.0785, "num_input_tokens_seen": 84869120, "step": 10360 }, { "epoch": 0.49795918367346936, - "grad_norm": 0.512522280216217, + "grad_norm": 0.5100103616714478, "learning_rate": 2.7146551710133346e-05, "loss": 0.8888, "num_input_tokens_seen": 84951040, @@ -8306,303 +8306,303 @@ }, { "epoch": 0.4984393757503001, - "grad_norm": 0.47969046235084534, + "grad_norm": 0.4852333962917328, "learning_rate": 2.7107078781082508e-05, - "loss": 0.8732, + "loss": 0.8726, "num_input_tokens_seen": 85032960, "step": 10380 }, { "epoch": 0.49891956782713087, - "grad_norm": 0.5086472034454346, + "grad_norm": 0.5148525238037109, "learning_rate": 2.7067600560833384e-05, - "loss": 0.9422, + "loss": 0.942, "num_input_tokens_seen": 85114880, "step": 10390 }, { "epoch": 0.49939975990396157, - "grad_norm": 0.5234156250953674, + "grad_norm": 0.5078142285346985, "learning_rate": 2.7028117148521863e-05, - "loss": 0.8577, + "loss": 0.8575, "num_input_tokens_seen": 85196800, "step": 10400 }, { "epoch": 0.4998799519807923, - "grad_norm": 0.8362540006637573, + "grad_norm": 0.8435948491096497, "learning_rate": 2.698862864329685e-05, - "loss": 0.9081, + "loss": 0.9079, "num_input_tokens_seen": 85278720, "step": 10410 }, { "epoch": 0.5003601440576231, - "grad_norm": 0.5078486800193787, + "grad_norm": 0.5046183466911316, "learning_rate": 2.6949135144320026e-05, - "loss": 0.9876, + "loss": 0.9877, "num_input_tokens_seen": 85360640, "step": 10420 }, { "epoch": 0.5008403361344538, - "grad_norm": 0.49120739102363586, + "grad_norm": 0.4874359369277954, "learning_rate": 2.6909636750765653e-05, - "loss": 0.8621, + "loss": 0.8622, "num_input_tokens_seen": 85442560, "step": 10430 }, { "epoch": 0.5013205282112845, - "grad_norm": 0.5183668732643127, + "grad_norm": 0.504699170589447, "learning_rate": 2.6870133561820243e-05, - "loss": 0.9355, + "loss": 0.9352, "num_input_tokens_seen": 85524480, "step": 10440 }, { "epoch": 0.5018007202881153, - "grad_norm": 0.8437354564666748, + "grad_norm": 0.8512853384017944, "learning_rate": 2.683062567668238e-05, - "loss": 1.0307, + "loss": 1.031, "num_input_tokens_seen": 85606400, "step": 10450 }, { "epoch": 0.502280912364946, - "grad_norm": 0.4748769700527191, + "grad_norm": 0.4725627303123474, "learning_rate": 2.679111319456242e-05, - "loss": 0.8886, + "loss": 0.8878, "num_input_tokens_seen": 85688320, "step": 10460 }, { "epoch": 0.5027611044417767, - "grad_norm": 0.4411871135234833, + "grad_norm": 0.4424259066581726, "learning_rate": 2.6751596214682278e-05, - "loss": 0.6568, + "loss": 0.6566, "num_input_tokens_seen": 85770240, "step": 10470 }, { "epoch": 0.5032412965186075, - "grad_norm": 0.3230482041835785, + "grad_norm": 0.3668357729911804, "learning_rate": 2.671207483627515e-05, - "loss": 0.9178, + "loss": 0.9173, "num_input_tokens_seen": 85852160, "step": 10480 }, { "epoch": 0.5037214885954382, - "grad_norm": 0.6780135631561279, + "grad_norm": 0.6287544369697571, "learning_rate": 2.6672549158585293e-05, - "loss": 0.832, + "loss": 0.8325, "num_input_tokens_seen": 85934080, "step": 10490 }, { "epoch": 0.5042016806722689, - "grad_norm": 0.47888603806495667, + "grad_norm": 0.4674912989139557, "learning_rate": 2.663301928086774e-05, - "loss": 0.7443, + "loss": 0.7438, "num_input_tokens_seen": 86016000, "step": 10500 }, { "epoch": 0.5046818727490996, - "grad_norm": 0.4726419448852539, + "grad_norm": 0.46498844027519226, "learning_rate": 2.6593485302388087e-05, - "loss": 0.8535, + "loss": 0.8527, "num_input_tokens_seen": 86097920, "step": 10510 }, { "epoch": 0.5051620648259304, - "grad_norm": 0.4687502980232239, + "grad_norm": 0.45717716217041016, "learning_rate": 2.6553947322422223e-05, - "loss": 0.9447, + "loss": 0.9448, "num_input_tokens_seen": 86179840, "step": 10520 }, { "epoch": 0.5056422569027611, - "grad_norm": 0.5022156834602356, + "grad_norm": 0.4967118501663208, "learning_rate": 2.6514405440256086e-05, - "loss": 0.8657, + "loss": 0.8656, "num_input_tokens_seen": 86261760, "step": 10530 }, { "epoch": 0.5061224489795918, - "grad_norm": 0.49966397881507874, + "grad_norm": 0.49645423889160156, "learning_rate": 2.6474859755185415e-05, - "loss": 0.9218, + "loss": 0.9212, "num_input_tokens_seen": 86343680, "step": 10540 }, { "epoch": 0.5066026410564226, - "grad_norm": 0.4754197597503662, + "grad_norm": 0.47995299100875854, "learning_rate": 2.6435310366515498e-05, - "loss": 0.9897, + "loss": 0.9893, "num_input_tokens_seen": 86425600, "step": 10550 }, { "epoch": 0.5070828331332533, - "grad_norm": 0.4898022711277008, + "grad_norm": 0.49421054124832153, "learning_rate": 2.6395757373560904e-05, - "loss": 1.0327, + "loss": 1.0323, "num_input_tokens_seen": 86507520, "step": 10560 }, { "epoch": 0.507563025210084, - "grad_norm": 0.49128857254981995, + "grad_norm": 0.48955073952674866, "learning_rate": 2.6356200875645287e-05, - "loss": 0.9704, + "loss": 0.9709, "num_input_tokens_seen": 86589440, "step": 10570 }, { "epoch": 0.5080432172869148, - "grad_norm": 0.49930843710899353, + "grad_norm": 0.4894655644893646, "learning_rate": 2.631664097210108e-05, - "loss": 0.7818, + "loss": 0.7823, "num_input_tokens_seen": 86671360, "step": 10580 }, { "epoch": 0.5085234093637455, - "grad_norm": 2.420948028564453, + "grad_norm": 2.337101697921753, "learning_rate": 2.6277077762269276e-05, - "loss": 0.717, + "loss": 0.7174, "num_input_tokens_seen": 86753280, "step": 10590 }, { "epoch": 0.5090036014405762, - "grad_norm": 0.49312740564346313, + "grad_norm": 0.4962919056415558, "learning_rate": 2.6237511345499167e-05, - "loss": 0.8514, + "loss": 0.8513, "num_input_tokens_seen": 86835200, "step": 10600 }, { "epoch": 0.5094837935174069, - "grad_norm": 0.5390229225158691, + "grad_norm": 0.5572180151939392, "learning_rate": 2.61979418211481e-05, - "loss": 0.8897, + "loss": 0.8895, "num_input_tokens_seen": 86917120, "step": 10610 }, { "epoch": 0.5099639855942377, - "grad_norm": 0.5236647129058838, + "grad_norm": 0.5322186350822449, "learning_rate": 2.615836928858122e-05, - "loss": 0.9499, + "loss": 0.9501, "num_input_tokens_seen": 86999040, "step": 10620 }, { "epoch": 0.5104441776710684, - "grad_norm": 0.48277348279953003, + "grad_norm": 0.4891637861728668, "learning_rate": 2.6118793847171236e-05, - "loss": 0.7749, + "loss": 0.7757, "num_input_tokens_seen": 87080960, "step": 10630 }, { "epoch": 0.5109243697478991, - "grad_norm": 0.5363568067550659, + "grad_norm": 0.5661851167678833, "learning_rate": 2.607921559629816e-05, - "loss": 1.0117, + "loss": 1.0115, "num_input_tokens_seen": 87162880, "step": 10640 }, { "epoch": 0.5114045618247299, - "grad_norm": 0.47179368138313293, + "grad_norm": 0.4733878970146179, "learning_rate": 2.6039634635349043e-05, - "loss": 0.9039, + "loss": 0.9044, "num_input_tokens_seen": 87244800, "step": 10650 }, { "epoch": 0.5118847539015606, - "grad_norm": 0.4801013171672821, + "grad_norm": 0.4792388677597046, "learning_rate": 2.6000051063717767e-05, - "loss": 0.8813, + "loss": 0.8815, "num_input_tokens_seen": 87326720, "step": 10660 }, { "epoch": 0.5123649459783913, - "grad_norm": 0.48771876096725464, + "grad_norm": 0.49188944697380066, "learning_rate": 2.596046498080475e-05, - "loss": 0.856, + "loss": 0.8555, "num_input_tokens_seen": 87408640, "step": 10670 }, { "epoch": 0.5128451380552221, - "grad_norm": 0.4822755455970764, + "grad_norm": 0.4500071108341217, "learning_rate": 2.5920876486016726e-05, - "loss": 0.9129, + "loss": 0.9126, "num_input_tokens_seen": 87490560, "step": 10680 }, { "epoch": 0.5133253301320528, - "grad_norm": 0.5060322880744934, + "grad_norm": 0.5068156719207764, "learning_rate": 2.5881285678766482e-05, - "loss": 0.8935, + "loss": 0.8932, "num_input_tokens_seen": 87572480, "step": 10690 }, { "epoch": 0.5138055222088835, - "grad_norm": 0.7022219300270081, + "grad_norm": 0.6936764121055603, "learning_rate": 2.5841692658472617e-05, - "loss": 0.8172, + "loss": 0.8166, "num_input_tokens_seen": 87654400, "step": 10700 }, { "epoch": 0.5142857142857142, - "grad_norm": 0.5739632844924927, + "grad_norm": 0.5786249041557312, "learning_rate": 2.5802097524559264e-05, - "loss": 1.0935, + "loss": 1.0928, "num_input_tokens_seen": 87736320, "step": 10710 }, { "epoch": 0.514765906362545, - "grad_norm": 0.478157103061676, + "grad_norm": 0.481042742729187, "learning_rate": 2.5762500376455912e-05, - "loss": 0.8728, + "loss": 0.8714, "num_input_tokens_seen": 87818240, "step": 10720 }, { "epoch": 0.5152460984393757, - "grad_norm": 0.5317462086677551, + "grad_norm": 0.5307723879814148, "learning_rate": 2.5722901313597052e-05, - "loss": 0.8416, + "loss": 0.8432, "num_input_tokens_seen": 87900160, "step": 10730 }, { "epoch": 0.5157262905162064, - "grad_norm": 0.4968595802783966, + "grad_norm": 0.49768760800361633, "learning_rate": 2.5683300435422032e-05, - "loss": 0.8828, + "loss": 0.8825, "num_input_tokens_seen": 87982080, "step": 10740 }, { "epoch": 0.5162064825930373, - "grad_norm": 0.5193814039230347, + "grad_norm": 0.5388449430465698, "learning_rate": 2.564369784137472e-05, "loss": 0.8147, "num_input_tokens_seen": 88064000, @@ -8610,23 +8610,23 @@ }, { "epoch": 0.516686674669868, - "grad_norm": 0.5038419365882874, + "grad_norm": 0.5082977414131165, "learning_rate": 2.5604093630903307e-05, - "loss": 0.8245, + "loss": 0.8252, "num_input_tokens_seen": 88145920, "step": 10760 }, { "epoch": 0.5171668667466987, - "grad_norm": 0.4722188413143158, + "grad_norm": 0.47542208433151245, "learning_rate": 2.556448790346006e-05, - "loss": 0.7478, + "loss": 0.7476, "num_input_tokens_seen": 88227840, "step": 10770 }, { "epoch": 0.5176470588235295, - "grad_norm": 0.48475295305252075, + "grad_norm": 0.47928953170776367, "learning_rate": 2.5524880758501035e-05, "loss": 1.0129, "num_input_tokens_seen": 88309760, @@ -8634,535 +8634,535 @@ }, { "epoch": 0.5181272509003602, - "grad_norm": 1.4112732410430908, + "grad_norm": 1.2826298475265503, "learning_rate": 2.5485272295485846e-05, - "loss": 0.9362, + "loss": 0.9356, "num_input_tokens_seen": 88391680, "step": 10790 }, { "epoch": 0.5186074429771909, - "grad_norm": 0.484065979719162, + "grad_norm": 0.48119160532951355, "learning_rate": 2.544566261387743e-05, - "loss": 0.756, + "loss": 0.7554, "num_input_tokens_seen": 88473600, "step": 10800 }, { "epoch": 0.5190876350540216, - "grad_norm": 0.2988108992576599, + "grad_norm": 0.2859084904193878, "learning_rate": 2.5406051813141773e-05, - "loss": 0.843, + "loss": 0.8429, "num_input_tokens_seen": 88555520, "step": 10810 }, { "epoch": 0.5195678271308524, - "grad_norm": 0.4882756471633911, + "grad_norm": 0.49243417382240295, "learning_rate": 2.5366439992747688e-05, - "loss": 0.8411, + "loss": 0.8418, "num_input_tokens_seen": 88637440, "step": 10820 }, { "epoch": 0.5200480192076831, - "grad_norm": 0.4603646695613861, + "grad_norm": 0.4552639424800873, "learning_rate": 2.5326827252166523e-05, - "loss": 0.8732, + "loss": 0.873, "num_input_tokens_seen": 88719360, "step": 10830 }, { "epoch": 0.5205282112845138, - "grad_norm": 0.5258098244667053, + "grad_norm": 0.5165627002716064, "learning_rate": 2.5287213690871957e-05, - "loss": 0.8446, + "loss": 0.8453, "num_input_tokens_seen": 88801280, "step": 10840 }, { "epoch": 0.5210084033613446, - "grad_norm": 0.5720292925834656, + "grad_norm": 0.5611492991447449, "learning_rate": 2.5247599408339723e-05, - "loss": 0.8941, + "loss": 0.8937, "num_input_tokens_seen": 88883200, "step": 10850 }, { "epoch": 0.5214885954381753, - "grad_norm": 0.517301619052887, + "grad_norm": 0.5230381488800049, "learning_rate": 2.5207984504047365e-05, - "loss": 0.8391, + "loss": 0.8389, "num_input_tokens_seen": 88965120, "step": 10860 }, { "epoch": 0.521968787515006, - "grad_norm": 0.569545567035675, + "grad_norm": 0.5715019702911377, "learning_rate": 2.5168369077474004e-05, - "loss": 0.7663, + "loss": 0.7679, "num_input_tokens_seen": 89047040, "step": 10870 }, { "epoch": 0.5224489795918368, - "grad_norm": 0.5130712389945984, + "grad_norm": 0.5099117159843445, "learning_rate": 2.512875322810002e-05, - "loss": 0.9436, + "loss": 0.9431, "num_input_tokens_seen": 89128960, "step": 10880 }, { "epoch": 0.5229291716686675, - "grad_norm": 0.531822144985199, + "grad_norm": 0.5225771069526672, "learning_rate": 2.508913705540693e-05, - "loss": 0.8914, + "loss": 0.8919, "num_input_tokens_seen": 89210880, "step": 10890 }, { "epoch": 0.5234093637454982, - "grad_norm": 0.6732114553451538, + "grad_norm": 0.6697996854782104, "learning_rate": 2.504952065887701e-05, - "loss": 0.8412, + "loss": 0.841, "num_input_tokens_seen": 89292800, "step": 10900 }, { "epoch": 0.5238895558223289, - "grad_norm": 0.5019482970237732, + "grad_norm": 0.5064327120780945, "learning_rate": 2.5009904137993106e-05, - "loss": 0.9038, + "loss": 0.904, "num_input_tokens_seen": 89374720, "step": 10910 }, { "epoch": 0.5243697478991597, - "grad_norm": 2.033942699432373, + "grad_norm": 2.08994722366333, "learning_rate": 2.497028759223839e-05, - "loss": 1.0436, + "loss": 1.042, "num_input_tokens_seen": 89456640, "step": 10920 }, { "epoch": 0.5248499399759904, - "grad_norm": 0.5160328149795532, + "grad_norm": 0.49806416034698486, "learning_rate": 2.4930671121096105e-05, - "loss": 0.9828, + "loss": 0.9832, "num_input_tokens_seen": 89538560, "step": 10930 }, { "epoch": 0.5253301320528211, - "grad_norm": 0.5123900175094604, + "grad_norm": 0.5063926577568054, "learning_rate": 2.4891054824049264e-05, - "loss": 0.7117, + "loss": 0.7118, "num_input_tokens_seen": 89620480, "step": 10940 }, { "epoch": 0.5258103241296519, - "grad_norm": 0.5120553970336914, + "grad_norm": 0.5132240056991577, "learning_rate": 2.485143880058049e-05, - "loss": 0.8639, + "loss": 0.864, "num_input_tokens_seen": 89702400, "step": 10950 }, { "epoch": 0.5262905162064826, - "grad_norm": 0.5253350734710693, + "grad_norm": 0.5221575498580933, "learning_rate": 2.4811823150171692e-05, - "loss": 0.8927, + "loss": 0.8928, "num_input_tokens_seen": 89784320, "step": 10960 }, { "epoch": 0.5267707082833133, - "grad_norm": 0.7501769661903381, + "grad_norm": 0.7482132911682129, "learning_rate": 2.477220797230385e-05, - "loss": 1.0442, + "loss": 1.0436, "num_input_tokens_seen": 89866240, "step": 10970 }, { "epoch": 0.5272509003601441, - "grad_norm": 1.2171481847763062, + "grad_norm": 1.2887660264968872, "learning_rate": 2.4732593366456755e-05, - "loss": 1.1357, + "loss": 1.1344, "num_input_tokens_seen": 89948160, "step": 10980 }, { "epoch": 0.5277310924369748, - "grad_norm": 0.5957512259483337, + "grad_norm": 0.6092213988304138, "learning_rate": 2.4692979432108777e-05, - "loss": 0.9067, + "loss": 0.9068, "num_input_tokens_seen": 90030080, "step": 10990 }, { "epoch": 0.5282112845138055, - "grad_norm": 0.5080304741859436, + "grad_norm": 0.5136516690254211, "learning_rate": 2.4653366268736565e-05, - "loss": 1.0397, + "loss": 1.0396, "num_input_tokens_seen": 90112000, "step": 11000 }, { "epoch": 0.5286914765906362, - "grad_norm": 0.46417123079299927, + "grad_norm": 0.4663499593734741, "learning_rate": 2.461375397581487e-05, - "loss": 0.7986, + "loss": 0.7987, "num_input_tokens_seen": 90193920, "step": 11010 }, { "epoch": 0.529171668667467, - "grad_norm": 0.5016286373138428, + "grad_norm": 0.4954150915145874, "learning_rate": 2.4574142652816238e-05, - "loss": 0.8494, + "loss": 0.8519, "num_input_tokens_seen": 90275840, "step": 11020 }, { "epoch": 0.5296518607442977, - "grad_norm": 0.6602762937545776, + "grad_norm": 0.644432544708252, "learning_rate": 2.453453239921077e-05, - "loss": 0.75, + "loss": 0.7499, "num_input_tokens_seen": 90357760, "step": 11030 }, { "epoch": 0.5301320528211284, - "grad_norm": 0.520213782787323, + "grad_norm": 0.5186659693717957, "learning_rate": 2.44949233144659e-05, - "loss": 0.7564, + "loss": 0.7552, "num_input_tokens_seen": 90439680, "step": 11040 }, { "epoch": 0.5306122448979592, - "grad_norm": 0.49561575055122375, + "grad_norm": 0.49851685762405396, "learning_rate": 2.4455315498046134e-05, - "loss": 0.8193, + "loss": 0.8187, "num_input_tokens_seen": 90521600, "step": 11050 }, { "epoch": 0.5310924369747899, - "grad_norm": 0.5517764687538147, + "grad_norm": 0.5538609027862549, "learning_rate": 2.4415709049412757e-05, - "loss": 0.8752, + "loss": 0.8756, "num_input_tokens_seen": 90603520, "step": 11060 }, { "epoch": 0.5315726290516206, - "grad_norm": 0.5008801817893982, + "grad_norm": 0.5045083165168762, "learning_rate": 2.437610406802365e-05, - "loss": 0.8828, + "loss": 0.8821, "num_input_tokens_seen": 90685440, "step": 11070 }, { "epoch": 0.5320528211284514, - "grad_norm": 0.46303442120552063, + "grad_norm": 0.46333587169647217, "learning_rate": 2.4336500653333012e-05, - "loss": 0.9908, + "loss": 0.9912, "num_input_tokens_seen": 90767360, "step": 11080 }, { "epoch": 0.5325330132052821, - "grad_norm": 0.49256837368011475, + "grad_norm": 0.48697561025619507, "learning_rate": 2.4296898904791076e-05, - "loss": 0.9681, + "loss": 0.9691, "num_input_tokens_seen": 90849280, "step": 11090 }, { "epoch": 0.5330132052821128, - "grad_norm": 0.5212172269821167, + "grad_norm": 0.5088743567466736, "learning_rate": 2.425729892184393e-05, - "loss": 0.8629, + "loss": 0.8621, "num_input_tokens_seen": 90931200, "step": 11100 }, { "epoch": 0.5334933973589436, - "grad_norm": 0.49792659282684326, + "grad_norm": 0.49558478593826294, "learning_rate": 2.421770080393321e-05, - "loss": 1.1662, + "loss": 1.1661, "num_input_tokens_seen": 91013120, "step": 11110 }, { "epoch": 0.5339735894357743, - "grad_norm": 0.5083081126213074, + "grad_norm": 0.5269922614097595, "learning_rate": 2.417810465049585e-05, - "loss": 0.9811, + "loss": 0.9819, "num_input_tokens_seen": 91095040, "step": 11120 }, { "epoch": 0.534453781512605, - "grad_norm": 1.1411739587783813, + "grad_norm": 1.1876569986343384, "learning_rate": 2.413851056096388e-05, - "loss": 0.7801, + "loss": 0.7812, "num_input_tokens_seen": 91176960, "step": 11130 }, { "epoch": 0.5349339735894357, - "grad_norm": 0.4804481863975525, + "grad_norm": 0.4851647615432739, "learning_rate": 2.4098918634764153e-05, - "loss": 0.8704, + "loss": 0.8706, "num_input_tokens_seen": 91258880, "step": 11140 }, { "epoch": 0.5354141656662665, - "grad_norm": 0.47054970264434814, + "grad_norm": 0.4763019382953644, "learning_rate": 2.4059328971318053e-05, - "loss": 1.0149, + "loss": 1.015, "num_input_tokens_seen": 91340800, "step": 11150 }, { "epoch": 0.5358943577430972, - "grad_norm": 1.7846838235855103, + "grad_norm": 1.7785102128982544, "learning_rate": 2.4019741670041305e-05, - "loss": 0.9053, + "loss": 0.9073, "num_input_tokens_seen": 91422720, "step": 11160 }, { "epoch": 0.5363745498199279, - "grad_norm": 0.48844608664512634, + "grad_norm": 0.48900002241134644, "learning_rate": 2.398015683034371e-05, - "loss": 0.966, + "loss": 0.9643, "num_input_tokens_seen": 91504640, "step": 11170 }, { "epoch": 0.5368547418967587, - "grad_norm": 0.49595731496810913, + "grad_norm": 0.5039075613021851, "learning_rate": 2.394057455162886e-05, - "loss": 0.696, + "loss": 0.6964, "num_input_tokens_seen": 91586560, "step": 11180 }, { "epoch": 0.5373349339735894, - "grad_norm": 0.49253153800964355, + "grad_norm": 0.49732545018196106, "learning_rate": 2.3900994933293953e-05, - "loss": 0.7898, + "loss": 0.7903, "num_input_tokens_seen": 91668480, "step": 11190 }, { "epoch": 0.5378151260504201, - "grad_norm": 0.5204420685768127, + "grad_norm": 0.525292694568634, "learning_rate": 2.3861418074729476e-05, - "loss": 0.9175, + "loss": 0.918, "num_input_tokens_seen": 91750400, "step": 11200 }, { "epoch": 0.538295318127251, - "grad_norm": 0.4966830015182495, + "grad_norm": 0.5006028413772583, "learning_rate": 2.3821844075318993e-05, - "loss": 0.8275, + "loss": 0.8276, "num_input_tokens_seen": 91832320, "step": 11210 }, { "epoch": 0.5387755102040817, - "grad_norm": 0.41914886236190796, + "grad_norm": 0.42340755462646484, "learning_rate": 2.378227303443889e-05, - "loss": 0.9653, + "loss": 0.9665, "num_input_tokens_seen": 91914240, "step": 11220 }, { "epoch": 0.5392557022809124, - "grad_norm": 0.4765452444553375, + "grad_norm": 0.4761850833892822, "learning_rate": 2.3742705051458145e-05, - "loss": 0.8119, + "loss": 0.8118, "num_input_tokens_seen": 91996160, "step": 11230 }, { "epoch": 0.539735894357743, - "grad_norm": 0.4974367022514343, + "grad_norm": 0.515305757522583, "learning_rate": 2.3703140225738017e-05, - "loss": 0.9283, + "loss": 0.9285, "num_input_tokens_seen": 92078080, "step": 11240 }, { "epoch": 0.5402160864345739, - "grad_norm": 0.48964783549308777, + "grad_norm": 0.4948907792568207, "learning_rate": 2.3663578656631858e-05, - "loss": 0.8144, + "loss": 0.814, "num_input_tokens_seen": 92160000, "step": 11250 }, { "epoch": 0.5406962785114046, - "grad_norm": 0.5229817628860474, + "grad_norm": 0.5262484550476074, "learning_rate": 2.362402044348486e-05, - "loss": 0.8041, + "loss": 0.8016, "num_input_tokens_seen": 92241920, "step": 11260 }, { "epoch": 0.5411764705882353, - "grad_norm": 0.5138763785362244, + "grad_norm": 0.4996432662010193, "learning_rate": 2.3584465685633738e-05, - "loss": 0.877, + "loss": 0.8771, "num_input_tokens_seen": 92323840, "step": 11270 }, { "epoch": 0.5416566626650661, - "grad_norm": 0.6102971434593201, + "grad_norm": 0.5967109203338623, "learning_rate": 2.3544914482406592e-05, - "loss": 0.8796, + "loss": 0.8791, "num_input_tokens_seen": 92405760, "step": 11280 }, { "epoch": 0.5421368547418968, - "grad_norm": 0.4886428713798523, + "grad_norm": 0.4963090419769287, "learning_rate": 2.350536693312255e-05, - "loss": 0.8823, + "loss": 0.8822, "num_input_tokens_seen": 92487680, "step": 11290 }, { "epoch": 0.5426170468187275, - "grad_norm": 0.4848209321498871, + "grad_norm": 0.48664769530296326, "learning_rate": 2.3465823137091572e-05, - "loss": 0.8294, + "loss": 0.8298, "num_input_tokens_seen": 92569600, "step": 11300 }, { "epoch": 0.5430972388955583, - "grad_norm": 0.4850007891654968, + "grad_norm": 0.48569774627685547, "learning_rate": 2.3426283193614208e-05, - "loss": 0.8574, + "loss": 0.8567, "num_input_tokens_seen": 92651520, "step": 11310 }, { "epoch": 0.543577430972389, - "grad_norm": 0.47127023339271545, + "grad_norm": 0.4557991623878479, "learning_rate": 2.3386747201981338e-05, - "loss": 1.143, + "loss": 1.1435, "num_input_tokens_seen": 92733440, "step": 11320 }, { "epoch": 0.5440576230492197, - "grad_norm": 0.4775993227958679, + "grad_norm": 0.47304612398147583, "learning_rate": 2.3347215261473887e-05, - "loss": 0.8502, + "loss": 0.8503, "num_input_tokens_seen": 92815360, "step": 11330 }, { "epoch": 0.5445378151260504, - "grad_norm": 0.4856424629688263, + "grad_norm": 0.4795781970024109, "learning_rate": 2.330768747136263e-05, - "loss": 0.9998, + "loss": 0.9992, "num_input_tokens_seen": 92897280, "step": 11340 }, { "epoch": 0.5450180072028812, - "grad_norm": 0.4987284541130066, + "grad_norm": 0.49423906207084656, "learning_rate": 2.3268163930907933e-05, - "loss": 0.8627, + "loss": 0.8629, "num_input_tokens_seen": 92979200, "step": 11350 }, { "epoch": 0.5454981992797119, - "grad_norm": 0.9060471653938293, + "grad_norm": 0.8439264893531799, "learning_rate": 2.3228644739359444e-05, - "loss": 0.8115, + "loss": 0.8107, "num_input_tokens_seen": 93061120, "step": 11360 }, { "epoch": 0.5459783913565426, - "grad_norm": 0.5042136907577515, + "grad_norm": 0.501241147518158, "learning_rate": 2.3189129995955943e-05, - "loss": 0.9262, + "loss": 0.9255, "num_input_tokens_seen": 93143040, "step": 11370 }, { "epoch": 0.5464585834333734, - "grad_norm": 0.6296837329864502, + "grad_norm": 0.5781614780426025, "learning_rate": 2.314961979992501e-05, - "loss": 0.9588, + "loss": 0.9589, "num_input_tokens_seen": 93224960, "step": 11380 }, { "epoch": 0.5469387755102041, - "grad_norm": 0.40785011649131775, + "grad_norm": 0.4057072103023529, "learning_rate": 2.311011425048281e-05, - "loss": 1.0475, + "loss": 1.0476, "num_input_tokens_seen": 93306880, "step": 11390 }, { "epoch": 0.5474189675870348, - "grad_norm": 0.5426297187805176, + "grad_norm": 0.5158141255378723, "learning_rate": 2.3070613446833842e-05, - "loss": 0.9962, + "loss": 0.9967, "num_input_tokens_seen": 93388800, "step": 11400 }, { "epoch": 0.5478991596638656, - "grad_norm": 0.49344369769096375, + "grad_norm": 0.49406811594963074, "learning_rate": 2.30311174881707e-05, - "loss": 0.9423, + "loss": 0.9432, "num_input_tokens_seen": 93470720, "step": 11410 }, { "epoch": 0.5483793517406963, - "grad_norm": 0.5111265182495117, + "grad_norm": 0.5071375370025635, "learning_rate": 2.2991626473673773e-05, - "loss": 0.78, + "loss": 0.7797, "num_input_tokens_seen": 93552640, "step": 11420 }, { "epoch": 0.548859543817527, - "grad_norm": 0.49911582469940186, + "grad_norm": 0.4987107217311859, "learning_rate": 2.295214050251108e-05, - "loss": 0.9205, + "loss": 0.9207, "num_input_tokens_seen": 93634560, "step": 11430 }, { "epoch": 0.5493397358943577, - "grad_norm": 0.5109173059463501, + "grad_norm": 0.5108968019485474, "learning_rate": 2.2912659673837965e-05, - "loss": 0.7834, + "loss": 0.7837, "num_input_tokens_seen": 93716480, "step": 11440 }, { "epoch": 0.5498199279711885, - "grad_norm": 0.49743011593818665, + "grad_norm": 0.5007959604263306, "learning_rate": 2.2873184086796824e-05, "loss": 0.8649, "num_input_tokens_seen": 93798400, @@ -9170,271 +9170,271 @@ }, { "epoch": 0.5503001200480192, - "grad_norm": 0.5043100118637085, + "grad_norm": 0.49213898181915283, "learning_rate": 2.283371384051693e-05, - "loss": 1.2097, + "loss": 1.2096, "num_input_tokens_seen": 93880320, "step": 11460 }, { "epoch": 0.5507803121248499, - "grad_norm": 0.5047153830528259, + "grad_norm": 0.5077102780342102, "learning_rate": 2.2794249034114137e-05, - "loss": 0.9085, + "loss": 0.9088, "num_input_tokens_seen": 93962240, "step": 11470 }, { "epoch": 0.5512605042016807, - "grad_norm": 1.0863653421401978, + "grad_norm": 1.1485230922698975, "learning_rate": 2.275478976669062e-05, - "loss": 1.0061, + "loss": 1.0068, "num_input_tokens_seen": 94044160, "step": 11480 }, { "epoch": 0.5517406962785114, - "grad_norm": 0.5065763592720032, + "grad_norm": 0.5125216245651245, "learning_rate": 2.2715336137334657e-05, - "loss": 0.9816, + "loss": 0.9817, "num_input_tokens_seen": 94126080, "step": 11490 }, { "epoch": 0.5522208883553421, - "grad_norm": 0.5079005360603333, + "grad_norm": 0.5075448751449585, "learning_rate": 2.2675888245120382e-05, - "loss": 1.0597, + "loss": 1.0594, "num_input_tokens_seen": 94208000, "step": 11500 }, { "epoch": 0.5527010804321729, - "grad_norm": 0.5147351622581482, + "grad_norm": 0.5110220313072205, "learning_rate": 2.263644618910749e-05, - "loss": 0.9222, + "loss": 0.9223, "num_input_tokens_seen": 94289920, "step": 11510 }, { "epoch": 0.5531812725090036, - "grad_norm": 0.4940924048423767, + "grad_norm": 0.5087609887123108, "learning_rate": 2.2597010068341052e-05, - "loss": 0.6929, + "loss": 0.6925, "num_input_tokens_seen": 94371840, "step": 11520 }, { "epoch": 0.5536614645858343, - "grad_norm": 0.49539023637771606, + "grad_norm": 0.49689438939094543, "learning_rate": 2.255757998185122e-05, - "loss": 0.943, + "loss": 0.9426, "num_input_tokens_seen": 94453760, "step": 11530 }, { "epoch": 0.554141656662665, - "grad_norm": 0.7505642771720886, + "grad_norm": 0.7527957558631897, "learning_rate": 2.2518156028652977e-05, - "loss": 0.9255, + "loss": 0.9247, "num_input_tokens_seen": 94535680, "step": 11540 }, { "epoch": 0.5546218487394958, - "grad_norm": 0.2930028438568115, + "grad_norm": 0.30175384879112244, "learning_rate": 2.2478738307745938e-05, - "loss": 0.7929, + "loss": 0.7931, "num_input_tokens_seen": 94617600, "step": 11550 }, { "epoch": 0.5551020408163265, - "grad_norm": 0.4832445979118347, + "grad_norm": 0.482919305562973, "learning_rate": 2.243932691811405e-05, - "loss": 0.9169, + "loss": 0.9164, "num_input_tokens_seen": 94699520, "step": 11560 }, { "epoch": 0.5555822328931572, - "grad_norm": 0.4929827153682709, + "grad_norm": 0.4989522695541382, "learning_rate": 2.2399921958725364e-05, - "loss": 0.9749, + "loss": 0.9756, "num_input_tokens_seen": 94781440, "step": 11570 }, { "epoch": 0.556062424969988, - "grad_norm": 1.4206510782241821, + "grad_norm": 1.4328854084014893, "learning_rate": 2.236052352853177e-05, - "loss": 1.0515, + "loss": 1.0529, "num_input_tokens_seen": 94863360, "step": 11580 }, { "epoch": 0.5565426170468187, - "grad_norm": 0.5904713273048401, + "grad_norm": 0.554436206817627, "learning_rate": 2.232113172646878e-05, - "loss": 0.9887, + "loss": 0.9882, "num_input_tokens_seen": 94945280, "step": 11590 }, { "epoch": 0.5570228091236494, - "grad_norm": 0.49482420086860657, + "grad_norm": 0.49005597829818726, "learning_rate": 2.2281746651455272e-05, - "loss": 0.8613, + "loss": 0.8608, "num_input_tokens_seen": 95027200, "step": 11600 }, { "epoch": 0.5575030012004802, - "grad_norm": 0.5033355951309204, + "grad_norm": 0.502778947353363, "learning_rate": 2.2242368402393198e-05, - "loss": 1.061, + "loss": 1.0607, "num_input_tokens_seen": 95109120, "step": 11610 }, { "epoch": 0.5579831932773109, - "grad_norm": 1.082341194152832, + "grad_norm": 0.964520275592804, "learning_rate": 2.220299707816738e-05, - "loss": 0.8245, + "loss": 0.8243, "num_input_tokens_seen": 95191040, "step": 11620 }, { "epoch": 0.5584633853541416, - "grad_norm": 0.49979743361473083, + "grad_norm": 0.4970017969608307, "learning_rate": 2.2163632777645282e-05, - "loss": 0.8227, + "loss": 0.8224, "num_input_tokens_seen": 95272960, "step": 11630 }, { "epoch": 0.5589435774309723, - "grad_norm": 0.5040926933288574, + "grad_norm": 0.5044746994972229, "learning_rate": 2.2124275599676676e-05, - "loss": 0.9013, + "loss": 0.9011, "num_input_tokens_seen": 95354880, "step": 11640 }, { "epoch": 0.5594237695078031, - "grad_norm": 0.4717329144477844, + "grad_norm": 0.4746025800704956, "learning_rate": 2.20849256430935e-05, - "loss": 0.9034, + "loss": 0.9039, "num_input_tokens_seen": 95436800, "step": 11650 }, { "epoch": 0.5599039615846338, - "grad_norm": 0.5357513427734375, + "grad_norm": 0.5340638756752014, "learning_rate": 2.2045583006709536e-05, - "loss": 0.919, + "loss": 0.9187, "num_input_tokens_seen": 95518720, "step": 11660 }, { "epoch": 0.5603841536614645, - "grad_norm": 0.6526145935058594, + "grad_norm": 0.6348832845687866, "learning_rate": 2.2006247789320162e-05, - "loss": 0.868, + "loss": 0.8694, "num_input_tokens_seen": 95600640, "step": 11670 }, { "epoch": 0.5608643457382954, - "grad_norm": 0.527145266532898, + "grad_norm": 0.544661819934845, "learning_rate": 2.1966920089702157e-05, - "loss": 0.9287, + "loss": 0.9286, "num_input_tokens_seen": 95682560, "step": 11680 }, { "epoch": 0.561344537815126, - "grad_norm": 0.4794023931026459, + "grad_norm": 0.48837342858314514, "learning_rate": 2.192760000661343e-05, - "loss": 0.8022, + "loss": 0.8019, "num_input_tokens_seen": 95764480, "step": 11690 }, { "epoch": 0.5618247298919568, - "grad_norm": 0.4974648058414459, + "grad_norm": 0.5073260068893433, "learning_rate": 2.1888287638792722e-05, - "loss": 0.8273, + "loss": 0.8267, "num_input_tokens_seen": 95846400, "step": 11700 }, { "epoch": 0.5623049219687876, - "grad_norm": 0.48735299706459045, + "grad_norm": 0.4896014928817749, "learning_rate": 2.184898308495943e-05, - "loss": 0.8902, + "loss": 0.8884, "num_input_tokens_seen": 95928320, "step": 11710 }, { "epoch": 0.5627851140456183, - "grad_norm": 0.4717083275318146, + "grad_norm": 0.473751425743103, "learning_rate": 2.180968644381334e-05, - "loss": 0.9926, + "loss": 0.992, "num_input_tokens_seen": 96010240, "step": 11720 }, { "epoch": 0.563265306122449, - "grad_norm": 0.5048367977142334, + "grad_norm": 0.5091336965560913, "learning_rate": 2.1770397814034315e-05, - "loss": 0.8164, + "loss": 0.8166, "num_input_tokens_seen": 96092160, "step": 11730 }, { "epoch": 0.5637454981992797, - "grad_norm": 0.5213897228240967, + "grad_norm": 0.5212090015411377, "learning_rate": 2.1731117294282166e-05, - "loss": 0.9192, + "loss": 0.9194, "num_input_tokens_seen": 96174080, "step": 11740 }, { "epoch": 0.5642256902761105, - "grad_norm": 0.47659391164779663, + "grad_norm": 0.47561123967170715, "learning_rate": 2.16918449831963e-05, - "loss": 0.841, + "loss": 0.8408, "num_input_tokens_seen": 96256000, "step": 11750 }, { "epoch": 0.5647058823529412, - "grad_norm": 0.49390923976898193, + "grad_norm": 0.49135398864746094, "learning_rate": 2.165258097939551e-05, - "loss": 0.8303, + "loss": 0.8308, "num_input_tokens_seen": 96337920, "step": 11760 }, { "epoch": 0.5651860744297719, - "grad_norm": 0.4540766775608063, + "grad_norm": 0.4511423110961914, "learning_rate": 2.1613325381477744e-05, - "loss": 0.6811, + "loss": 0.6813, "num_input_tokens_seen": 96419840, "step": 11770 }, { "epoch": 0.5656662665066027, - "grad_norm": 0.5054943561553955, + "grad_norm": 0.5014552474021912, "learning_rate": 2.1574078288019846e-05, - "loss": 0.9841, + "loss": 0.9842, "num_input_tokens_seen": 96501760, "step": 11780 }, { "epoch": 0.5661464585834334, - "grad_norm": 0.5268092751502991, + "grad_norm": 0.5226877927780151, "learning_rate": 2.1534839797577268e-05, "loss": 1.0156, "num_input_tokens_seen": 96583680, @@ -9442,135 +9442,135 @@ }, { "epoch": 0.5666266506602641, - "grad_norm": 1.0047301054000854, + "grad_norm": 1.0016844272613525, "learning_rate": 2.14956100086839e-05, - "loss": 0.813, + "loss": 0.8119, "num_input_tokens_seen": 96665600, "step": 11800 }, { "epoch": 0.5671068427370949, - "grad_norm": 0.489765465259552, + "grad_norm": 0.487522155046463, "learning_rate": 2.1456389019851762e-05, - "loss": 0.8036, + "loss": 0.8039, "num_input_tokens_seen": 96747520, "step": 11810 }, { "epoch": 0.5675870348139256, - "grad_norm": 0.4869178831577301, + "grad_norm": 0.4897004961967468, "learning_rate": 2.1417176929570768e-05, - "loss": 0.8429, + "loss": 0.8425, "num_input_tokens_seen": 96829440, "step": 11820 }, { "epoch": 0.5680672268907563, - "grad_norm": 0.6066574454307556, + "grad_norm": 0.6142635941505432, "learning_rate": 2.137797383630851e-05, - "loss": 0.8213, + "loss": 0.8214, "num_input_tokens_seen": 96911360, "step": 11830 }, { "epoch": 0.568547418967587, - "grad_norm": 0.48497194051742554, + "grad_norm": 0.48310551047325134, "learning_rate": 2.1338779838509965e-05, - "loss": 0.8338, + "loss": 0.8337, "num_input_tokens_seen": 96993280, "step": 11840 }, { "epoch": 0.5690276110444178, - "grad_norm": 0.49360740184783936, + "grad_norm": 0.4890468418598175, "learning_rate": 2.129959503459728e-05, - "loss": 1.0187, + "loss": 1.018, "num_input_tokens_seen": 97075200, "step": 11850 }, { "epoch": 0.5695078031212485, - "grad_norm": 0.4896343946456909, + "grad_norm": 0.49132439494132996, "learning_rate": 2.126041952296951e-05, - "loss": 0.8479, + "loss": 0.8476, "num_input_tokens_seen": 97157120, "step": 11860 }, { "epoch": 0.5699879951980792, - "grad_norm": 0.626423180103302, + "grad_norm": 0.6127716302871704, "learning_rate": 2.122125340200239e-05, - "loss": 1.0873, + "loss": 1.086, "num_input_tokens_seen": 97239040, "step": 11870 }, { "epoch": 0.57046818727491, - "grad_norm": 0.5536409616470337, + "grad_norm": 0.5488608479499817, "learning_rate": 2.1182096770048045e-05, - "loss": 1.1087, + "loss": 1.1085, "num_input_tokens_seen": 97320960, "step": 11880 }, { "epoch": 0.5709483793517407, - "grad_norm": 0.4912538528442383, + "grad_norm": 0.4914509356021881, "learning_rate": 2.11429497254348e-05, - "loss": 0.872, + "loss": 0.8716, "num_input_tokens_seen": 97402880, "step": 11890 }, { "epoch": 0.5714285714285714, - "grad_norm": 0.5036265254020691, + "grad_norm": 0.5030828714370728, "learning_rate": 2.1103812366466896e-05, - "loss": 0.9375, + "loss": 0.9373, "num_input_tokens_seen": 97484800, "step": 11900 }, { "epoch": 0.5719087635054022, - "grad_norm": 0.47547534108161926, + "grad_norm": 0.4771164059638977, "learning_rate": 2.1064684791424236e-05, - "loss": 0.9402, + "loss": 0.9405, "num_input_tokens_seen": 97566720, "step": 11910 }, { "epoch": 0.5723889555822329, - "grad_norm": 0.6462324857711792, + "grad_norm": 0.8517586588859558, "learning_rate": 2.1025567098562177e-05, - "loss": 0.83, + "loss": 0.8314, "num_input_tokens_seen": 97648640, "step": 11920 }, { "epoch": 0.5728691476590636, - "grad_norm": 0.6137294769287109, + "grad_norm": 0.6191906332969666, "learning_rate": 2.0986459386111256e-05, - "loss": 0.8851, + "loss": 0.8852, "num_input_tokens_seen": 97730560, "step": 11930 }, { "epoch": 0.5733493397358943, - "grad_norm": 0.474915087223053, + "grad_norm": 0.4799683690071106, "learning_rate": 2.0947361752276935e-05, - "loss": 0.8404, + "loss": 0.8412, "num_input_tokens_seen": 97812480, "step": 11940 }, { "epoch": 0.5738295318127251, - "grad_norm": 0.5123166441917419, + "grad_norm": 0.5109384059906006, "learning_rate": 2.0908274295239365e-05, - "loss": 1.0064, + "loss": 1.007, "num_input_tokens_seen": 97894400, "step": 11950 }, { "epoch": 0.5743097238895558, - "grad_norm": 1.154371738433838, + "grad_norm": 1.1497302055358887, "learning_rate": 2.0869197113153175e-05, "loss": 0.882, "num_input_tokens_seen": 97976320, @@ -9578,15 +9578,15 @@ }, { "epoch": 0.5747899159663865, - "grad_norm": 0.48168784379959106, + "grad_norm": 0.4946820139884949, "learning_rate": 2.083013030414714e-05, - "loss": 0.7295, + "loss": 0.7298, "num_input_tokens_seen": 98058240, "step": 11970 }, { "epoch": 0.5752701080432173, - "grad_norm": 0.5147429704666138, + "grad_norm": 0.5023751258850098, "learning_rate": 2.0791073966324037e-05, "loss": 0.808, "num_input_tokens_seen": 98140160, @@ -9594,55 +9594,55 @@ }, { "epoch": 0.575750300120048, - "grad_norm": 0.47852545976638794, + "grad_norm": 0.47996094822883606, "learning_rate": 2.0752028197760323e-05, - "loss": 0.7292, + "loss": 0.729, "num_input_tokens_seen": 98222080, "step": 11990 }, { "epoch": 0.5762304921968787, - "grad_norm": 0.48357313871383667, + "grad_norm": 0.482053279876709, "learning_rate": 2.07129930965059e-05, - "loss": 0.9465, + "loss": 0.9467, "num_input_tokens_seen": 98304000, "step": 12000 }, { "epoch": 0.5767106842737095, - "grad_norm": 1.0818113088607788, + "grad_norm": 1.1011282205581665, "learning_rate": 2.0673968760583912e-05, - "loss": 0.8273, + "loss": 0.8261, "num_input_tokens_seen": 98385920, "step": 12010 }, { "epoch": 0.5771908763505402, - "grad_norm": 0.551762580871582, + "grad_norm": 0.5430877804756165, "learning_rate": 2.0634955287990465e-05, - "loss": 1.0037, + "loss": 1.0051, "num_input_tokens_seen": 98467840, "step": 12020 }, { "epoch": 0.5776710684273709, - "grad_norm": 0.5415941476821899, + "grad_norm": 0.5249661207199097, "learning_rate": 2.059595277669436e-05, - "loss": 0.7684, + "loss": 0.7679, "num_input_tokens_seen": 98549760, "step": 12030 }, { "epoch": 0.5781512605042017, - "grad_norm": 0.5173623561859131, + "grad_norm": 0.5203903317451477, "learning_rate": 2.0556961324636903e-05, - "loss": 0.9755, + "loss": 0.9756, "num_input_tokens_seen": 98631680, "step": 12040 }, { "epoch": 0.5786314525810324, - "grad_norm": 0.4724039137363434, + "grad_norm": 0.47762370109558105, "learning_rate": 2.0517981029731616e-05, "loss": 0.9699, "num_input_tokens_seen": 98713600, @@ -9650,119 +9650,119 @@ }, { "epoch": 0.5791116446578631, - "grad_norm": 0.5577803254127502, + "grad_norm": 0.5500503182411194, "learning_rate": 2.0479011989863988e-05, - "loss": 1.0823, + "loss": 1.0828, "num_input_tokens_seen": 98795520, "step": 12060 }, { "epoch": 0.5795918367346938, - "grad_norm": 0.9037268757820129, + "grad_norm": 0.8895681500434875, "learning_rate": 2.0440054302891276e-05, - "loss": 0.8871, + "loss": 0.8873, "num_input_tokens_seen": 98877440, "step": 12070 }, { "epoch": 0.5800720288115246, - "grad_norm": 0.4992835223674774, + "grad_norm": 0.49974870681762695, "learning_rate": 2.0401108066642217e-05, - "loss": 0.7901, + "loss": 0.7899, "num_input_tokens_seen": 98959360, "step": 12080 }, { "epoch": 0.5805522208883553, - "grad_norm": 0.5037127733230591, + "grad_norm": 0.5034114122390747, "learning_rate": 2.0362173378916763e-05, - "loss": 0.844, + "loss": 0.843, "num_input_tokens_seen": 99041280, "step": 12090 }, { "epoch": 0.581032412965186, - "grad_norm": 0.5936850905418396, + "grad_norm": 0.5820488929748535, "learning_rate": 2.032325033748591e-05, - "loss": 0.8851, + "loss": 0.8847, "num_input_tokens_seen": 99123200, "step": 12100 }, { "epoch": 0.5815126050420169, - "grad_norm": 0.6421457529067993, + "grad_norm": 0.6207419037818909, "learning_rate": 2.0284339040091403e-05, - "loss": 0.9621, + "loss": 0.9604, "num_input_tokens_seen": 99205120, "step": 12110 }, { "epoch": 0.5819927971188475, - "grad_norm": 0.49321863055229187, + "grad_norm": 0.49734240770339966, "learning_rate": 2.0245439584445457e-05, - "loss": 0.7169, + "loss": 0.716, "num_input_tokens_seen": 99287040, "step": 12120 }, { "epoch": 0.5824729891956782, - "grad_norm": 0.4810725152492523, + "grad_norm": 0.48361942172050476, "learning_rate": 2.0206552068230587e-05, - "loss": 0.9376, + "loss": 0.9379, "num_input_tokens_seen": 99368960, "step": 12130 }, { "epoch": 0.5829531812725091, - "grad_norm": 0.4972326159477234, + "grad_norm": 0.49501538276672363, "learning_rate": 2.0167676589099324e-05, - "loss": 0.9773, + "loss": 0.9769, "num_input_tokens_seen": 99450880, "step": 12140 }, { "epoch": 0.5834333733493398, - "grad_norm": 0.4891018569469452, + "grad_norm": 0.4937068819999695, "learning_rate": 2.0128813244673946e-05, - "loss": 0.8787, + "loss": 0.8786, "num_input_tokens_seen": 99532800, "step": 12150 }, { "epoch": 0.5839135654261705, - "grad_norm": 0.562864363193512, + "grad_norm": 0.5697944164276123, "learning_rate": 2.0089962132546296e-05, - "loss": 0.9362, + "loss": 0.9365, "num_input_tokens_seen": 99614720, "step": 12160 }, { "epoch": 0.5843937575030012, - "grad_norm": 0.5169605016708374, + "grad_norm": 0.5267488360404968, "learning_rate": 2.0051123350277477e-05, - "loss": 0.8472, + "loss": 0.8467, "num_input_tokens_seen": 99696640, "step": 12170 }, { "epoch": 0.584873949579832, - "grad_norm": 0.47625747323036194, + "grad_norm": 0.4827499985694885, "learning_rate": 2.0012296995397613e-05, - "loss": 0.6606, + "loss": 0.6612, "num_input_tokens_seen": 99778560, "step": 12180 }, { "epoch": 0.5853541416566627, - "grad_norm": 0.34957268834114075, + "grad_norm": 0.34603819251060486, "learning_rate": 1.997348316540566e-05, - "loss": 0.7587, + "loss": 0.7583, "num_input_tokens_seen": 99860480, "step": 12190 }, { "epoch": 0.5858343337334934, - "grad_norm": 0.5261297821998596, + "grad_norm": 0.5291000604629517, "learning_rate": 1.9934681957769107e-05, "loss": 0.7988, "num_input_tokens_seen": 99942400, @@ -9770,87 +9770,87 @@ }, { "epoch": 0.5863145258103242, - "grad_norm": 0.5246568322181702, + "grad_norm": 0.5321553349494934, "learning_rate": 1.9895893469923736e-05, - "loss": 0.9584, + "loss": 0.9576, "num_input_tokens_seen": 100024320, "step": 12210 }, { "epoch": 0.5867947178871549, - "grad_norm": 0.45888885855674744, + "grad_norm": 0.4535011947154999, "learning_rate": 1.985711779927339e-05, - "loss": 1.1023, + "loss": 1.1025, "num_input_tokens_seen": 100106240, "step": 12220 }, { "epoch": 0.5872749099639856, - "grad_norm": 0.6073553562164307, + "grad_norm": 0.600887656211853, "learning_rate": 1.9818355043189732e-05, - "loss": 0.8485, + "loss": 0.8477, "num_input_tokens_seen": 100188160, "step": 12230 }, { "epoch": 0.5877551020408164, - "grad_norm": 0.8071925640106201, + "grad_norm": 0.8248658180236816, "learning_rate": 1.9779605299012005e-05, - "loss": 0.9277, + "loss": 0.9275, "num_input_tokens_seen": 100270080, "step": 12240 }, { "epoch": 0.5882352941176471, - "grad_norm": 0.5476227402687073, + "grad_norm": 0.5473348498344421, "learning_rate": 1.974086866404675e-05, - "loss": 0.9011, + "loss": 0.9013, "num_input_tokens_seen": 100352000, "step": 12250 }, { "epoch": 0.5887154861944778, - "grad_norm": 0.4784226417541504, + "grad_norm": 0.47622087597846985, "learning_rate": 1.970214523556761e-05, - "loss": 1.0077, + "loss": 1.0079, "num_input_tokens_seen": 100433920, "step": 12260 }, { "epoch": 0.5891956782713085, - "grad_norm": 0.49654924869537354, + "grad_norm": 0.493226021528244, "learning_rate": 1.9663435110815065e-05, - "loss": 0.9088, + "loss": 0.909, "num_input_tokens_seen": 100515840, "step": 12270 }, { "epoch": 0.5896758703481393, - "grad_norm": 0.4656108617782593, + "grad_norm": 0.46058568358421326, "learning_rate": 1.9624738386996163e-05, - "loss": 0.8294, + "loss": 0.8296, "num_input_tokens_seen": 100597760, "step": 12280 }, { "epoch": 0.59015606242497, - "grad_norm": 0.5135471820831299, + "grad_norm": 0.5092949867248535, "learning_rate": 1.9586055161284332e-05, - "loss": 0.914, + "loss": 0.9134, "num_input_tokens_seen": 100679680, "step": 12290 }, { "epoch": 0.5906362545018007, - "grad_norm": 0.5345959663391113, + "grad_norm": 0.536564826965332, "learning_rate": 1.954738553081909e-05, - "loss": 1.1152, + "loss": 1.115, "num_input_tokens_seen": 100761600, "step": 12300 }, { "epoch": 0.5911164465786315, - "grad_norm": 0.5068685412406921, + "grad_norm": 0.5003234148025513, "learning_rate": 1.950872959270581e-05, "loss": 0.8391, "num_input_tokens_seen": 100843520, @@ -9858,135 +9858,135 @@ }, { "epoch": 0.5915966386554622, - "grad_norm": 0.5310745239257812, + "grad_norm": 0.529944658279419, "learning_rate": 1.9470087444015484e-05, - "loss": 0.8974, + "loss": 0.897, "num_input_tokens_seen": 100925440, "step": 12320 }, { "epoch": 0.5920768307322929, - "grad_norm": 0.4799753725528717, + "grad_norm": 0.48205164074897766, "learning_rate": 1.9431459181784495e-05, - "loss": 1.1511, + "loss": 1.1514, "num_input_tokens_seen": 101007360, "step": 12330 }, { "epoch": 0.5925570228091237, - "grad_norm": 0.51557856798172, + "grad_norm": 0.5071780681610107, "learning_rate": 1.939284490301432e-05, - "loss": 0.8286, + "loss": 0.8289, "num_input_tokens_seen": 101089280, "step": 12340 }, { "epoch": 0.5930372148859544, - "grad_norm": 0.49833816289901733, + "grad_norm": 0.49877625703811646, "learning_rate": 1.935424470467135e-05, - "loss": 0.7323, + "loss": 0.7324, "num_input_tokens_seen": 101171200, "step": 12350 }, { "epoch": 0.5935174069627851, - "grad_norm": 0.5022624731063843, + "grad_norm": 0.5038672089576721, "learning_rate": 1.9315658683686615e-05, - "loss": 0.9878, + "loss": 0.9871, "num_input_tokens_seen": 101253120, "step": 12360 }, { "epoch": 0.5939975990396158, - "grad_norm": 0.5500155091285706, + "grad_norm": 0.5478134155273438, "learning_rate": 1.9277086936955517e-05, - "loss": 0.8682, + "loss": 0.8688, "num_input_tokens_seen": 101335040, "step": 12370 }, { "epoch": 0.5944777911164466, - "grad_norm": 0.4885292053222656, + "grad_norm": 0.4918793737888336, "learning_rate": 1.9238529561337646e-05, - "loss": 0.8724, + "loss": 0.8722, "num_input_tokens_seen": 101416960, "step": 12380 }, { "epoch": 0.5949579831932773, - "grad_norm": 0.4912605583667755, + "grad_norm": 0.48617425560951233, "learning_rate": 1.9199986653656502e-05, - "loss": 0.9832, + "loss": 0.9837, "num_input_tokens_seen": 101498880, "step": 12390 }, { "epoch": 0.595438175270108, - "grad_norm": 0.6492443084716797, + "grad_norm": 0.6774265766143799, "learning_rate": 1.9161458310699225e-05, - "loss": 0.8047, + "loss": 0.8041, "num_input_tokens_seen": 101580800, "step": 12400 }, { "epoch": 0.5959183673469388, - "grad_norm": 0.5103248953819275, + "grad_norm": 0.5218595266342163, "learning_rate": 1.9122944629216402e-05, - "loss": 0.9766, + "loss": 0.9768, "num_input_tokens_seen": 101662720, "step": 12410 }, { "epoch": 0.5963985594237695, - "grad_norm": 0.4565691351890564, + "grad_norm": 0.4586848020553589, "learning_rate": 1.9084445705921815e-05, - "loss": 0.8339, + "loss": 0.8343, "num_input_tokens_seen": 101744640, "step": 12420 }, { "epoch": 0.5968787515006002, - "grad_norm": 0.47968733310699463, + "grad_norm": 0.47861307859420776, "learning_rate": 1.9045961637492145e-05, - "loss": 1.0445, + "loss": 1.0443, "num_input_tokens_seen": 101826560, "step": 12430 }, { "epoch": 0.597358943577431, - "grad_norm": 0.49476364254951477, + "grad_norm": 0.4952894151210785, "learning_rate": 1.9007492520566814e-05, - "loss": 0.9611, + "loss": 0.9608, "num_input_tokens_seen": 101908480, "step": 12440 }, { "epoch": 0.5978391356542617, - "grad_norm": 0.5539293885231018, + "grad_norm": 0.5489120483398438, "learning_rate": 1.896903845174768e-05, - "loss": 0.9063, + "loss": 0.9062, "num_input_tokens_seen": 101990400, "step": 12450 }, { "epoch": 0.5983193277310924, - "grad_norm": 0.49969589710235596, + "grad_norm": 0.49838995933532715, "learning_rate": 1.8930599527598797e-05, - "loss": 0.8149, + "loss": 0.815, "num_input_tokens_seen": 102072320, "step": 12460 }, { "epoch": 0.5987995198079231, - "grad_norm": 0.6476652026176453, + "grad_norm": 0.6173511743545532, "learning_rate": 1.8892175844646215e-05, - "loss": 1.05, + "loss": 1.0498, "num_input_tokens_seen": 102154240, "step": 12470 }, { "epoch": 0.5992797118847539, - "grad_norm": 0.485588401556015, + "grad_norm": 0.48503515124320984, "learning_rate": 1.8853767499377712e-05, "loss": 0.7995, "num_input_tokens_seen": 102236160, @@ -9994,47 +9994,47 @@ }, { "epoch": 0.5997599039615846, - "grad_norm": 0.5099250674247742, + "grad_norm": 0.5097170472145081, "learning_rate": 1.8815374588242523e-05, - "loss": 0.7582, + "loss": 0.7581, "num_input_tokens_seen": 102318080, "step": 12490 }, { "epoch": 0.6002400960384153, - "grad_norm": 0.49604761600494385, + "grad_norm": 0.5017048716545105, "learning_rate": 1.877699720765114e-05, - "loss": 0.8494, + "loss": 0.8486, "num_input_tokens_seen": 102400000, "step": 12500 }, { "epoch": 0.6007202881152461, - "grad_norm": 0.5131434798240662, + "grad_norm": 0.5360931158065796, "learning_rate": 1.873863545397507e-05, - "loss": 0.7685, + "loss": 0.7697, "num_input_tokens_seen": 102481920, "step": 12510 }, { "epoch": 0.6012004801920768, - "grad_norm": 0.45130103826522827, + "grad_norm": 0.45245033502578735, "learning_rate": 1.870028942354655e-05, - "loss": 0.9137, + "loss": 0.9144, "num_input_tokens_seen": 102563840, "step": 12520 }, { "epoch": 0.6016806722689075, - "grad_norm": 0.4828559160232544, + "grad_norm": 0.48523399233818054, "learning_rate": 1.8661959212658365e-05, - "loss": 0.8186, + "loss": 0.8182, "num_input_tokens_seen": 102645760, "step": 12530 }, { "epoch": 0.6021608643457383, - "grad_norm": 0.5113935470581055, + "grad_norm": 0.5199438333511353, "learning_rate": 1.862364491756355e-05, "loss": 0.8923, "num_input_tokens_seen": 102727680, @@ -10042,31 +10042,31 @@ }, { "epoch": 0.602641056422569, - "grad_norm": 0.5698245167732239, + "grad_norm": 0.5562946200370789, "learning_rate": 1.8585346634475175e-05, - "loss": 0.9799, + "loss": 0.9803, "num_input_tokens_seen": 102809600, "step": 12550 }, { "epoch": 0.6031212484993997, - "grad_norm": 0.4895305335521698, + "grad_norm": 0.48669108748435974, "learning_rate": 1.8547064459566117e-05, - "loss": 0.9472, + "loss": 0.9476, "num_input_tokens_seen": 102891520, "step": 12560 }, { "epoch": 0.6036014405762304, - "grad_norm": 0.4899882376194, + "grad_norm": 0.4891805052757263, "learning_rate": 1.8508798488968803e-05, - "loss": 0.7548, + "loss": 0.7547, "num_input_tokens_seen": 102973440, "step": 12570 }, { "epoch": 0.6040816326530613, - "grad_norm": 0.48758333921432495, + "grad_norm": 0.4860639274120331, "learning_rate": 1.8470548818774942e-05, "loss": 0.9321, "num_input_tokens_seen": 103055360, @@ -10074,87 +10074,87 @@ }, { "epoch": 0.604561824729892, - "grad_norm": 0.4785870313644409, + "grad_norm": 0.47964203357696533, "learning_rate": 1.8432315545035328e-05, - "loss": 0.8554, + "loss": 0.8547, "num_input_tokens_seen": 103137280, "step": 12590 }, { "epoch": 0.6050420168067226, - "grad_norm": 0.46478036046028137, + "grad_norm": 0.47002238035202026, "learning_rate": 1.83940987637596e-05, - "loss": 0.9034, + "loss": 0.9036, "num_input_tokens_seen": 103219200, "step": 12600 }, { "epoch": 0.6055222088835535, - "grad_norm": 0.5116038918495178, + "grad_norm": 0.5217668414115906, "learning_rate": 1.8355898570915937e-05, - "loss": 1.0032, + "loss": 1.0021, "num_input_tokens_seen": 103301120, "step": 12610 }, { "epoch": 0.6060024009603842, - "grad_norm": 0.4858674108982086, + "grad_norm": 0.48160508275032043, "learning_rate": 1.8317715062430902e-05, - "loss": 0.9037, + "loss": 0.9036, "num_input_tokens_seen": 103383040, "step": 12620 }, { "epoch": 0.6064825930372149, - "grad_norm": 0.49994757771492004, + "grad_norm": 0.4867957532405853, "learning_rate": 1.8279548334189146e-05, - "loss": 0.9669, + "loss": 0.9671, "num_input_tokens_seen": 103464960, "step": 12630 }, { "epoch": 0.6069627851140457, - "grad_norm": 0.5256997346878052, + "grad_norm": 0.5135723948478699, "learning_rate": 1.8241398482033185e-05, - "loss": 0.9426, + "loss": 0.943, "num_input_tokens_seen": 103546880, "step": 12640 }, { "epoch": 0.6074429771908764, - "grad_norm": 0.48835039138793945, + "grad_norm": 0.48528727889060974, "learning_rate": 1.8203265601763136e-05, - "loss": 0.9299, + "loss": 0.9297, "num_input_tokens_seen": 103628800, "step": 12650 }, { "epoch": 0.6079231692677071, - "grad_norm": 0.4843783676624298, + "grad_norm": 0.4840488135814667, "learning_rate": 1.816514978913655e-05, - "loss": 0.8059, + "loss": 0.8064, "num_input_tokens_seen": 103710720, "step": 12660 }, { "epoch": 0.6084033613445378, - "grad_norm": 0.6175418496131897, + "grad_norm": 0.615665853023529, "learning_rate": 1.8127051139868044e-05, - "loss": 0.963, + "loss": 0.9639, "num_input_tokens_seen": 103792640, "step": 12670 }, { "epoch": 0.6088835534213686, - "grad_norm": 0.503555178642273, + "grad_norm": 0.5075173377990723, "learning_rate": 1.8088969749629197e-05, - "loss": 0.9894, + "loss": 0.9895, "num_input_tokens_seen": 103874560, "step": 12680 }, { "epoch": 0.6093637454981993, - "grad_norm": 0.48050084710121155, + "grad_norm": 0.47513309121131897, "learning_rate": 1.8050905714048233e-05, "loss": 1.0683, "num_input_tokens_seen": 103956480, @@ -10162,231 +10162,231 @@ }, { "epoch": 0.60984393757503, - "grad_norm": 0.5039294362068176, + "grad_norm": 0.5024633407592773, "learning_rate": 1.8012859128709766e-05, - "loss": 0.7611, + "loss": 0.7629, "num_input_tokens_seen": 104038400, "step": 12700 }, { "epoch": 0.6103241296518608, - "grad_norm": 0.6034661531448364, + "grad_norm": 0.5909437537193298, "learning_rate": 1.7974830089154624e-05, - "loss": 0.9293, + "loss": 0.9292, "num_input_tokens_seen": 104120320, "step": 12710 }, { "epoch": 0.6108043217286915, - "grad_norm": 0.5101216435432434, + "grad_norm": 0.5102412104606628, "learning_rate": 1.7936818690879574e-05, - "loss": 0.9849, + "loss": 0.9859, "num_input_tokens_seen": 104202240, "step": 12720 }, { "epoch": 0.6112845138055222, - "grad_norm": 0.49056023359298706, + "grad_norm": 0.4822942018508911, "learning_rate": 1.7898825029337054e-05, - "loss": 0.9917, + "loss": 0.9921, "num_input_tokens_seen": 104284160, "step": 12730 }, { "epoch": 0.611764705882353, - "grad_norm": 0.5179559588432312, + "grad_norm": 0.5105669498443604, "learning_rate": 1.7860849199934983e-05, - "loss": 0.8664, + "loss": 0.8653, "num_input_tokens_seen": 104366080, "step": 12740 }, { "epoch": 0.6122448979591837, - "grad_norm": 0.5554147958755493, + "grad_norm": 0.5741724967956543, "learning_rate": 1.7822891298036515e-05, - "loss": 0.9739, + "loss": 0.9741, "num_input_tokens_seen": 104448000, "step": 12750 }, { "epoch": 0.6127250900360144, - "grad_norm": 0.5510497689247131, + "grad_norm": 0.5551435351371765, "learning_rate": 1.7784951418959747e-05, - "loss": 0.9441, + "loss": 0.9431, "num_input_tokens_seen": 104529920, "step": 12760 }, { "epoch": 0.6132052821128451, - "grad_norm": 0.5032241344451904, + "grad_norm": 0.49893906712532043, "learning_rate": 1.7747029657977556e-05, - "loss": 0.9443, + "loss": 0.9441, "num_input_tokens_seen": 104611840, "step": 12770 }, { "epoch": 0.6136854741896759, - "grad_norm": 0.48411932587623596, + "grad_norm": 0.4885694682598114, "learning_rate": 1.7709126110317297e-05, - "loss": 1.0327, + "loss": 1.0325, "num_input_tokens_seen": 104693760, "step": 12780 }, { "epoch": 0.6141656662665066, - "grad_norm": 0.4949854016304016, + "grad_norm": 0.49348217248916626, "learning_rate": 1.7671240871160593e-05, - "loss": 0.9103, + "loss": 0.9102, "num_input_tokens_seen": 104775680, "step": 12790 }, { "epoch": 0.6146458583433373, - "grad_norm": 0.47453951835632324, + "grad_norm": 0.4734002351760864, "learning_rate": 1.76333740356431e-05, - "loss": 0.8165, + "loss": 0.817, "num_input_tokens_seen": 104857600, "step": 12800 }, { "epoch": 0.6151260504201681, - "grad_norm": 0.4928891658782959, + "grad_norm": 0.49383237957954407, "learning_rate": 1.7595525698854263e-05, - "loss": 0.8398, + "loss": 0.8403, "num_input_tokens_seen": 104939520, "step": 12810 }, { "epoch": 0.6156062424969988, - "grad_norm": 0.6075853109359741, + "grad_norm": 0.603259265422821, "learning_rate": 1.7557695955837063e-05, - "loss": 0.8922, + "loss": 0.8923, "num_input_tokens_seen": 105021440, "step": 12820 }, { "epoch": 0.6160864345738295, - "grad_norm": 0.6649598479270935, + "grad_norm": 0.6551387310028076, "learning_rate": 1.7519884901587772e-05, - "loss": 0.8345, + "loss": 0.8347, "num_input_tokens_seen": 105103360, "step": 12830 }, { "epoch": 0.6165666266506603, - "grad_norm": 0.5287781357765198, + "grad_norm": 0.5301949977874756, "learning_rate": 1.748209263105577e-05, - "loss": 1.0156, + "loss": 1.0152, "num_input_tokens_seen": 105185280, "step": 12840 }, { "epoch": 0.617046818727491, - "grad_norm": 0.48797744512557983, + "grad_norm": 0.4785909354686737, "learning_rate": 1.744431923914326e-05, - "loss": 0.9688, + "loss": 0.9691, "num_input_tokens_seen": 105267200, "step": 12850 }, { "epoch": 0.6175270108043217, - "grad_norm": 0.4974718391895294, + "grad_norm": 0.4949907660484314, "learning_rate": 1.7406564820705e-05, - "loss": 0.9532, + "loss": 0.9528, "num_input_tokens_seen": 105349120, "step": 12860 }, { "epoch": 0.6180072028811524, - "grad_norm": 1.6541121006011963, + "grad_norm": 1.5191551446914673, "learning_rate": 1.736882947054815e-05, - "loss": 0.818, + "loss": 0.8165, "num_input_tokens_seen": 105431040, "step": 12870 }, { "epoch": 0.6184873949579832, - "grad_norm": 0.5184734463691711, + "grad_norm": 0.5097967386245728, "learning_rate": 1.7331113283431966e-05, - "loss": 0.994, + "loss": 0.9941, "num_input_tokens_seen": 105512960, "step": 12880 }, { "epoch": 0.6189675870348139, - "grad_norm": 0.5024128556251526, + "grad_norm": 0.5027289390563965, "learning_rate": 1.729341635406757e-05, - "loss": 0.7879, + "loss": 0.7875, "num_input_tokens_seen": 105594880, "step": 12890 }, { "epoch": 0.6194477791116446, - "grad_norm": 0.48197275400161743, + "grad_norm": 0.4796588718891144, "learning_rate": 1.725573877711776e-05, - "loss": 0.9374, + "loss": 0.9376, "num_input_tokens_seen": 105676800, "step": 12900 }, { "epoch": 0.6199279711884754, - "grad_norm": 0.5016830563545227, + "grad_norm": 0.4965690076351166, "learning_rate": 1.7218080647196698e-05, - "loss": 0.9342, + "loss": 0.9336, "num_input_tokens_seen": 105758720, "step": 12910 }, { "epoch": 0.6204081632653061, - "grad_norm": 0.48292943835258484, + "grad_norm": 0.4842563271522522, "learning_rate": 1.7180442058869732e-05, - "loss": 0.8143, + "loss": 0.8139, "num_input_tokens_seen": 105840640, "step": 12920 }, { "epoch": 0.6208883553421368, - "grad_norm": 0.4773729145526886, + "grad_norm": 0.4824802577495575, "learning_rate": 1.7142823106653135e-05, - "loss": 0.8115, + "loss": 0.8121, "num_input_tokens_seen": 105922560, "step": 12930 }, { "epoch": 0.6213685474189676, - "grad_norm": 0.5274072885513306, + "grad_norm": 0.5284069776535034, "learning_rate": 1.7105223885013884e-05, - "loss": 0.8874, + "loss": 0.8867, "num_input_tokens_seen": 106004480, "step": 12940 }, { "epoch": 0.6218487394957983, - "grad_norm": 1.052327036857605, + "grad_norm": 1.0092517137527466, "learning_rate": 1.706764448836938e-05, - "loss": 0.9164, + "loss": 0.9163, "num_input_tokens_seen": 106086400, "step": 12950 }, { "epoch": 0.622328931572629, - "grad_norm": 0.5385840535163879, + "grad_norm": 0.5435160994529724, "learning_rate": 1.703008501108726e-05, - "loss": 0.9694, + "loss": 0.969, "num_input_tokens_seen": 106168320, "step": 12960 }, { "epoch": 0.6228091236494598, - "grad_norm": 0.4652009904384613, + "grad_norm": 0.463956356048584, "learning_rate": 1.699254554748515e-05, - "loss": 0.9424, + "loss": 0.9426, "num_input_tokens_seen": 106250240, "step": 12970 }, { "epoch": 0.6232893157262905, - "grad_norm": 0.5203952193260193, + "grad_norm": 0.5222662687301636, "learning_rate": 1.6955026191830385e-05, "loss": 0.8038, "num_input_tokens_seen": 106332160, @@ -10394,175 +10394,175 @@ }, { "epoch": 0.6237695078031212, - "grad_norm": 0.47074759006500244, + "grad_norm": 0.47449785470962524, "learning_rate": 1.691752703833984e-05, - "loss": 0.9074, + "loss": 0.9071, "num_input_tokens_seen": 106414080, "step": 12990 }, { "epoch": 0.6242496998799519, - "grad_norm": 0.5040807723999023, + "grad_norm": 0.5060712099075317, "learning_rate": 1.6880048181179652e-05, - "loss": 0.8315, + "loss": 0.8317, "num_input_tokens_seen": 106496000, "step": 13000 }, { "epoch": 0.6247298919567827, - "grad_norm": 0.5343396663665771, + "grad_norm": 0.5310168266296387, "learning_rate": 1.684258971446497e-05, - "loss": 0.9378, + "loss": 0.938, "num_input_tokens_seen": 106577920, "step": 13010 }, { "epoch": 0.6252100840336134, - "grad_norm": 0.48654720187187195, + "grad_norm": 0.4854814410209656, "learning_rate": 1.6805151732259755e-05, - "loss": 0.8081, + "loss": 0.8077, "num_input_tokens_seen": 106659840, "step": 13020 }, { "epoch": 0.6256902761104441, - "grad_norm": 0.4875001907348633, + "grad_norm": 0.4907275140285492, "learning_rate": 1.6767734328576544e-05, - "loss": 0.8395, + "loss": 0.8399, "num_input_tokens_seen": 106741760, "step": 13030 }, { "epoch": 0.626170468187275, - "grad_norm": 0.4956626296043396, + "grad_norm": 0.5009474754333496, "learning_rate": 1.6730337597376165e-05, - "loss": 0.8904, + "loss": 0.8906, "num_input_tokens_seen": 106823680, "step": 13040 }, { "epoch": 0.6266506602641057, - "grad_norm": 0.49269092082977295, + "grad_norm": 0.4950611889362335, "learning_rate": 1.669296163256755e-05, - "loss": 0.7173, + "loss": 0.7171, "num_input_tokens_seen": 106905600, "step": 13050 }, { "epoch": 0.6271308523409364, - "grad_norm": 0.7931904196739197, + "grad_norm": 0.7820413708686829, "learning_rate": 1.6655606528007505e-05, - "loss": 1.0009, + "loss": 1.0005, "num_input_tokens_seen": 106987520, "step": 13060 }, { "epoch": 0.6276110444177672, - "grad_norm": 0.5141395926475525, + "grad_norm": 0.5212737917900085, "learning_rate": 1.6618272377500406e-05, - "loss": 0.9796, + "loss": 0.9795, "num_input_tokens_seen": 107069440, "step": 13070 }, { "epoch": 0.6280912364945979, - "grad_norm": 0.49574312567710876, + "grad_norm": 0.494011253118515, "learning_rate": 1.658095927479805e-05, - "loss": 0.7448, + "loss": 0.7443, "num_input_tokens_seen": 107151360, "step": 13080 }, { "epoch": 0.6285714285714286, - "grad_norm": 0.548512876033783, + "grad_norm": 0.5520787835121155, "learning_rate": 1.6543667313599365e-05, - "loss": 1.1406, + "loss": 1.141, "num_input_tokens_seen": 107233280, "step": 13090 }, { "epoch": 0.6290516206482593, - "grad_norm": 0.49996235966682434, + "grad_norm": 0.5004755854606628, "learning_rate": 1.6506396587550188e-05, - "loss": 0.8865, + "loss": 0.8862, "num_input_tokens_seen": 107315200, "step": 13100 }, { "epoch": 0.6295318127250901, - "grad_norm": 0.5021194815635681, + "grad_norm": 0.5015995502471924, "learning_rate": 1.6469147190243027e-05, - "loss": 0.8561, + "loss": 0.8555, "num_input_tokens_seen": 107397120, "step": 13110 }, { "epoch": 0.6300120048019208, - "grad_norm": 0.4927319884300232, + "grad_norm": 0.49403059482574463, "learning_rate": 1.6431919215216862e-05, - "loss": 0.8364, + "loss": 0.8369, "num_input_tokens_seen": 107479040, "step": 13120 }, { "epoch": 0.6304921968787515, - "grad_norm": 0.5446639060974121, + "grad_norm": 0.6032249927520752, "learning_rate": 1.6394712755956817e-05, - "loss": 0.996, + "loss": 0.9955, "num_input_tokens_seen": 107560960, "step": 13130 }, { "epoch": 0.6309723889555823, - "grad_norm": 0.5035328269004822, + "grad_norm": 0.5043690204620361, "learning_rate": 1.635752790589405e-05, - "loss": 0.8958, + "loss": 0.8961, "num_input_tokens_seen": 107642880, "step": 13140 }, { "epoch": 0.631452581032413, - "grad_norm": 0.4967860281467438, + "grad_norm": 0.49665483832359314, "learning_rate": 1.6320364758405422e-05, - "loss": 0.9864, + "loss": 0.987, "num_input_tokens_seen": 107724800, "step": 13150 }, { "epoch": 0.6319327731092437, - "grad_norm": 0.554397463798523, + "grad_norm": 0.5483250617980957, "learning_rate": 1.628322340681329e-05, - "loss": 0.9916, + "loss": 0.9919, "num_input_tokens_seen": 107806720, "step": 13160 }, { "epoch": 0.6324129651860745, - "grad_norm": 0.5113590955734253, + "grad_norm": 0.5074891448020935, "learning_rate": 1.6246103944385295e-05, - "loss": 1.0281, + "loss": 1.0279, "num_input_tokens_seen": 107888640, "step": 13170 }, { "epoch": 0.6328931572629052, - "grad_norm": 0.49097153544425964, + "grad_norm": 0.4931067228317261, "learning_rate": 1.620900646433412e-05, - "loss": 0.8902, + "loss": 0.8903, "num_input_tokens_seen": 107970560, "step": 13180 }, { "epoch": 0.6333733493397359, - "grad_norm": 0.523250937461853, + "grad_norm": 0.5287465453147888, "learning_rate": 1.6171931059817214e-05, - "loss": 0.9968, + "loss": 0.9969, "num_input_tokens_seen": 108052480, "step": 13190 }, { "epoch": 0.6338535414165666, - "grad_norm": 0.4726887345314026, + "grad_norm": 0.47930577397346497, "learning_rate": 1.613487782393661e-05, "loss": 0.6298, "num_input_tokens_seen": 108134400, @@ -10570,7 +10570,7 @@ }, { "epoch": 0.6343337334933974, - "grad_norm": 0.5005057454109192, + "grad_norm": 0.5008885860443115, "learning_rate": 1.6097846849738685e-05, "loss": 0.7394, "num_input_tokens_seen": 108216320, @@ -10578,127 +10578,127 @@ }, { "epoch": 0.6348139255702281, - "grad_norm": 0.4948345124721527, + "grad_norm": 0.4981120228767395, "learning_rate": 1.6060838230213883e-05, - "loss": 0.9741, + "loss": 0.9739, "num_input_tokens_seen": 108298240, "step": 13220 }, { "epoch": 0.6352941176470588, - "grad_norm": 0.6280222535133362, + "grad_norm": 0.6545652151107788, "learning_rate": 1.6023852058296544e-05, - "loss": 1.0429, + "loss": 1.0424, "num_input_tokens_seen": 108380160, "step": 13230 }, { "epoch": 0.6357743097238896, - "grad_norm": 0.553558349609375, + "grad_norm": 0.5533519387245178, "learning_rate": 1.5986888426864617e-05, - "loss": 0.8975, + "loss": 0.897, "num_input_tokens_seen": 108462080, "step": 13240 }, { "epoch": 0.6362545018007203, - "grad_norm": 0.47756895422935486, + "grad_norm": 0.47119852900505066, "learning_rate": 1.5949947428739448e-05, - "loss": 0.8076, + "loss": 0.8078, "num_input_tokens_seen": 108544000, "step": 13250 }, { "epoch": 0.636734693877551, - "grad_norm": 0.8511344194412231, + "grad_norm": 0.8498918414115906, "learning_rate": 1.591302915668556e-05, - "loss": 0.747, + "loss": 0.7465, "num_input_tokens_seen": 108625920, "step": 13260 }, { "epoch": 0.6372148859543818, - "grad_norm": 0.4981195032596588, + "grad_norm": 0.49918386340141296, "learning_rate": 1.5876133703410412e-05, - "loss": 0.9432, + "loss": 0.9428, "num_input_tokens_seen": 108707840, "step": 13270 }, { "epoch": 0.6376950780312125, - "grad_norm": 0.4754570722579956, + "grad_norm": 0.48045846819877625, "learning_rate": 1.5839261161564138e-05, - "loss": 1.0365, + "loss": 1.0362, "num_input_tokens_seen": 108789760, "step": 13280 }, { "epoch": 0.6381752701080432, - "grad_norm": 1.2110140323638916, + "grad_norm": 1.4066901206970215, "learning_rate": 1.5802411623739345e-05, - "loss": 0.9743, + "loss": 0.9749, "num_input_tokens_seen": 108871680, "step": 13290 }, { "epoch": 0.6386554621848739, - "grad_norm": 0.4123174250125885, + "grad_norm": 0.4242941737174988, "learning_rate": 1.57655851824709e-05, - "loss": 0.8376, + "loss": 0.8378, "num_input_tokens_seen": 108953600, "step": 13300 }, { "epoch": 0.6391356542617047, - "grad_norm": 0.5139141082763672, + "grad_norm": 0.5066592693328857, "learning_rate": 1.5728781930235627e-05, - "loss": 0.9858, + "loss": 0.9856, "num_input_tokens_seen": 109035520, "step": 13310 }, { "epoch": 0.6396158463385354, - "grad_norm": 0.49317967891693115, + "grad_norm": 0.4904208183288574, "learning_rate": 1.5692001959452164e-05, - "loss": 1.0486, + "loss": 1.0489, "num_input_tokens_seen": 109117440, "step": 13320 }, { "epoch": 0.6400960384153661, - "grad_norm": 0.48599720001220703, + "grad_norm": 0.48512765765190125, "learning_rate": 1.5655245362480654e-05, - "loss": 0.8965, + "loss": 0.8971, "num_input_tokens_seen": 109199360, "step": 13330 }, { "epoch": 0.6405762304921969, - "grad_norm": 0.4471222162246704, + "grad_norm": 0.43509966135025024, "learning_rate": 1.561851223162254e-05, - "loss": 0.8839, + "loss": 0.8842, "num_input_tokens_seen": 109281280, "step": 13340 }, { "epoch": 0.6410564225690276, - "grad_norm": 0.4929245412349701, + "grad_norm": 0.49229124188423157, "learning_rate": 1.558180265912037e-05, - "loss": 0.9363, + "loss": 0.9354, "num_input_tokens_seen": 109363200, "step": 13350 }, { "epoch": 0.6415366146458583, - "grad_norm": 0.4899829626083374, + "grad_norm": 0.492304265499115, "learning_rate": 1.5545116737157522e-05, - "loss": 0.9724, + "loss": 0.9725, "num_input_tokens_seen": 109445120, "step": 13360 }, { "epoch": 0.6420168067226891, - "grad_norm": 0.4964812994003296, + "grad_norm": 0.4966655373573303, "learning_rate": 1.5508454557857966e-05, "loss": 0.7562, "num_input_tokens_seen": 109527040, @@ -10706,103 +10706,103 @@ }, { "epoch": 0.6424969987995198, - "grad_norm": 0.44805675745010376, + "grad_norm": 0.44081437587738037, "learning_rate": 1.5471816213286054e-05, - "loss": 0.8147, + "loss": 0.8151, "num_input_tokens_seen": 109608960, "step": 13380 }, { "epoch": 0.6429771908763505, - "grad_norm": 0.8485963940620422, + "grad_norm": 0.8634435534477234, "learning_rate": 1.5435201795446317e-05, - "loss": 0.9489, + "loss": 0.9497, "num_input_tokens_seen": 109690880, "step": 13390 }, { "epoch": 0.6434573829531812, - "grad_norm": 0.4873722195625305, + "grad_norm": 0.4802958071231842, "learning_rate": 1.5398611396283153e-05, - "loss": 0.8655, + "loss": 0.8651, "num_input_tokens_seen": 109772800, "step": 13400 }, { "epoch": 0.643937575030012, - "grad_norm": 0.5201349258422852, + "grad_norm": 0.5202709436416626, "learning_rate": 1.536204510768069e-05, - "loss": 0.8621, + "loss": 0.8622, "num_input_tokens_seen": 109854720, "step": 13410 }, { "epoch": 0.6444177671068427, - "grad_norm": 0.4951586425304413, + "grad_norm": 0.4949398934841156, "learning_rate": 1.532550302146249e-05, - "loss": 0.8371, + "loss": 0.8377, "num_input_tokens_seen": 109936640, "step": 13420 }, { "epoch": 0.6448979591836734, - "grad_norm": 0.4836975336074829, + "grad_norm": 0.4846876859664917, "learning_rate": 1.528898522939133e-05, - "loss": 0.7762, + "loss": 0.7773, "num_input_tokens_seen": 110018560, "step": 13430 }, { "epoch": 0.6453781512605042, - "grad_norm": 1.549204707145691, + "grad_norm": 1.569688081741333, "learning_rate": 1.5252491823168994e-05, - "loss": 0.9059, + "loss": 0.9058, "num_input_tokens_seen": 110100480, "step": 13440 }, { "epoch": 0.6458583433373349, - "grad_norm": 0.39099404215812683, + "grad_norm": 0.39199429750442505, "learning_rate": 1.5216022894436043e-05, - "loss": 0.9104, + "loss": 0.9108, "num_input_tokens_seen": 110182400, "step": 13450 }, { "epoch": 0.6463385354141656, - "grad_norm": 0.4727102816104889, + "grad_norm": 0.47286975383758545, "learning_rate": 1.517957853477154e-05, - "loss": 0.9764, + "loss": 0.9771, "num_input_tokens_seen": 110264320, "step": 13460 }, { "epoch": 0.6468187274909964, - "grad_norm": 0.5066577792167664, + "grad_norm": 0.5029356479644775, "learning_rate": 1.5143158835692866e-05, - "loss": 0.8985, + "loss": 0.8984, "num_input_tokens_seen": 110346240, "step": 13470 }, { "epoch": 0.6472989195678271, - "grad_norm": 0.5499137043952942, + "grad_norm": 0.5453057289123535, "learning_rate": 1.5106763888655478e-05, - "loss": 1.1936, + "loss": 1.1943, "num_input_tokens_seen": 110428160, "step": 13480 }, { "epoch": 0.6477791116446578, - "grad_norm": 0.4742192029953003, + "grad_norm": 0.47629615664482117, "learning_rate": 1.5070393785052695e-05, - "loss": 0.9241, + "loss": 0.9233, "num_input_tokens_seen": 110510080, "step": 13490 }, { "epoch": 0.6482593037214885, - "grad_norm": 0.4930437207221985, + "grad_norm": 0.49545878171920776, "learning_rate": 1.5034048616215402e-05, "loss": 0.7714, "num_input_tokens_seen": 110592000, @@ -10810,31 +10810,31 @@ }, { "epoch": 0.6487394957983194, - "grad_norm": 0.48428675532341003, + "grad_norm": 0.4852381646633148, "learning_rate": 1.4997728473411903e-05, - "loss": 1.0863, + "loss": 1.0867, "num_input_tokens_seen": 110673920, "step": 13510 }, { "epoch": 0.64921968787515, - "grad_norm": 0.49991798400878906, + "grad_norm": 0.4980316758155823, "learning_rate": 1.4961433447847672e-05, - "loss": 0.8677, + "loss": 0.8682, "num_input_tokens_seen": 110755840, "step": 13520 }, { "epoch": 0.6496998799519808, - "grad_norm": 0.4799818694591522, + "grad_norm": 0.48034679889678955, "learning_rate": 1.4925163630665065e-05, - "loss": 0.8387, + "loss": 0.8386, "num_input_tokens_seen": 110837760, "step": 13530 }, { "epoch": 0.6501800720288116, - "grad_norm": 0.4553060531616211, + "grad_norm": 0.45721235871315, "learning_rate": 1.4888919112943173e-05, "loss": 0.9952, "num_input_tokens_seen": 110919680, @@ -10842,23 +10842,23 @@ }, { "epoch": 0.6506602641056423, - "grad_norm": 0.8867954611778259, + "grad_norm": 0.8776998519897461, "learning_rate": 1.4852699985697546e-05, - "loss": 0.8378, + "loss": 0.8368, "num_input_tokens_seen": 111001600, "step": 13550 }, { "epoch": 0.651140456182473, - "grad_norm": 0.49487099051475525, + "grad_norm": 0.4943079352378845, "learning_rate": 1.4816506339879965e-05, - "loss": 0.8531, + "loss": 0.8539, "num_input_tokens_seen": 111083520, "step": 13560 }, { "epoch": 0.6516206482593038, - "grad_norm": 0.5083696246147156, + "grad_norm": 0.49719420075416565, "learning_rate": 1.4780338266378232e-05, "loss": 0.8573, "num_input_tokens_seen": 111165440, @@ -10866,23 +10866,23 @@ }, { "epoch": 0.6521008403361345, - "grad_norm": 0.5299503207206726, + "grad_norm": 0.5202630758285522, "learning_rate": 1.4744195856015947e-05, - "loss": 0.6841, + "loss": 0.6835, "num_input_tokens_seen": 111247360, "step": 13580 }, { "epoch": 0.6525810324129652, - "grad_norm": 0.49826177954673767, + "grad_norm": 0.4931369423866272, "learning_rate": 1.4708079199552221e-05, - "loss": 0.8537, + "loss": 0.8532, "num_input_tokens_seen": 111329280, "step": 13590 }, { "epoch": 0.6530612244897959, - "grad_norm": 0.5107062458992004, + "grad_norm": 0.5132030248641968, "learning_rate": 1.4671988387681549e-05, "loss": 1.038, "num_input_tokens_seen": 111411200, @@ -10890,79 +10890,79 @@ }, { "epoch": 0.6535414165666267, - "grad_norm": 0.4928327202796936, + "grad_norm": 0.4891904592514038, "learning_rate": 1.4635923511033494e-05, - "loss": 0.9946, + "loss": 0.9945, "num_input_tokens_seen": 111493120, "step": 13610 }, { "epoch": 0.6540216086434574, - "grad_norm": 0.4964877963066101, + "grad_norm": 0.503991425037384, "learning_rate": 1.4599884660172485e-05, - "loss": 1.8003, + "loss": 1.7508, "num_input_tokens_seen": 111575040, "step": 13620 }, { "epoch": 0.6545018007202881, - "grad_norm": 0.530273973941803, + "grad_norm": 0.5447900891304016, "learning_rate": 1.4563871925597622e-05, - "loss": 0.8789, + "loss": 0.8786, "num_input_tokens_seen": 111656960, "step": 13630 }, { "epoch": 0.6549819927971189, - "grad_norm": 0.48265501856803894, + "grad_norm": 0.49095600843429565, "learning_rate": 1.452788539774241e-05, - "loss": 0.6229, + "loss": 0.6227, "num_input_tokens_seen": 111738880, "step": 13640 }, { "epoch": 0.6554621848739496, - "grad_norm": 0.4895990788936615, + "grad_norm": 0.49171656370162964, "learning_rate": 1.4491925166974532e-05, - "loss": 0.8647, + "loss": 0.8651, "num_input_tokens_seen": 111820800, "step": 13650 }, { "epoch": 0.6559423769507803, - "grad_norm": 0.534498929977417, + "grad_norm": 0.5414605736732483, "learning_rate": 1.4455991323595655e-05, - "loss": 0.8938, + "loss": 0.8945, "num_input_tokens_seen": 111902720, "step": 13660 }, { "epoch": 0.6564225690276111, - "grad_norm": 0.48878028988838196, + "grad_norm": 0.49501684308052063, "learning_rate": 1.4420083957841185e-05, - "loss": 0.8419, + "loss": 0.8423, "num_input_tokens_seen": 111984640, "step": 13670 }, { "epoch": 0.6569027611044418, - "grad_norm": 0.49631527066230774, + "grad_norm": 0.4984915256500244, "learning_rate": 1.4384203159880017e-05, - "loss": 0.8683, + "loss": 0.8682, "num_input_tokens_seen": 112066560, "step": 13680 }, { "epoch": 0.6573829531812725, - "grad_norm": 0.5460849404335022, + "grad_norm": 0.5721986293792725, "learning_rate": 1.4348349019814344e-05, - "loss": 1.1068, + "loss": 1.1063, "num_input_tokens_seen": 112148480, "step": 13690 }, { "epoch": 0.6578631452581032, - "grad_norm": 0.4952901601791382, + "grad_norm": 0.5022817850112915, "learning_rate": 1.4312521627679428e-05, "loss": 0.9632, "num_input_tokens_seen": 112230400, @@ -10970,47 +10970,47 @@ }, { "epoch": 0.658343337334934, - "grad_norm": 0.5365872979164124, + "grad_norm": 0.5426187515258789, "learning_rate": 1.4276721073443344e-05, - "loss": 0.8043, + "loss": 0.804, "num_input_tokens_seen": 112312320, "step": 13710 }, { "epoch": 0.6588235294117647, - "grad_norm": 0.5443843007087708, + "grad_norm": 0.5450629591941833, "learning_rate": 1.4240947447006764e-05, - "loss": 1.025, + "loss": 1.0239, "num_input_tokens_seen": 112394240, "step": 13720 }, { "epoch": 0.6593037214885954, - "grad_norm": 0.49245166778564453, + "grad_norm": 0.4963448643684387, "learning_rate": 1.4205200838202782e-05, - "loss": 1.0484, + "loss": 1.049, "num_input_tokens_seen": 112476160, "step": 13730 }, { "epoch": 0.6597839135654262, - "grad_norm": 0.5047618746757507, + "grad_norm": 0.5081272125244141, "learning_rate": 1.4169481336796597e-05, - "loss": 1.0862, + "loss": 1.0863, "num_input_tokens_seen": 112558080, "step": 13740 }, { "epoch": 0.6602641056422569, - "grad_norm": 0.5078234076499939, + "grad_norm": 0.5098510384559631, "learning_rate": 1.4133789032485367e-05, - "loss": 0.9006, + "loss": 0.9013, "num_input_tokens_seen": 112640000, "step": 13750 }, { "epoch": 0.6607442977190876, - "grad_norm": 1.923171877861023, + "grad_norm": 1.9367784261703491, "learning_rate": 1.4098124014897961e-05, "loss": 0.8773, "num_input_tokens_seen": 112721920, @@ -11018,119 +11018,119 @@ }, { "epoch": 0.6612244897959184, - "grad_norm": 0.5198791027069092, + "grad_norm": 0.514941930770874, "learning_rate": 1.4062486373594694e-05, - "loss": 0.802, + "loss": 0.7995, "num_input_tokens_seen": 112803840, "step": 13770 }, { "epoch": 0.6617046818727491, - "grad_norm": 0.4909871518611908, + "grad_norm": 0.48932793736457825, "learning_rate": 1.4026876198067163e-05, - "loss": 1.0097, + "loss": 1.0093, "num_input_tokens_seen": 112885760, "step": 13780 }, { "epoch": 0.6621848739495798, - "grad_norm": 0.4735969305038452, + "grad_norm": 0.4777870774269104, "learning_rate": 1.399129357773799e-05, - "loss": 0.9333, + "loss": 0.9332, "num_input_tokens_seen": 112967680, "step": 13790 }, { "epoch": 0.6626650660264105, - "grad_norm": 0.52108234167099, + "grad_norm": 0.5338255763053894, "learning_rate": 1.3955738601960588e-05, - "loss": 0.8832, + "loss": 0.8836, "num_input_tokens_seen": 113049600, "step": 13800 }, { "epoch": 0.6631452581032413, - "grad_norm": 0.49739062786102295, + "grad_norm": 0.5129244923591614, "learning_rate": 1.392021136001897e-05, - "loss": 0.7867, + "loss": 0.7864, "num_input_tokens_seen": 113131520, "step": 13810 }, { "epoch": 0.663625450180072, - "grad_norm": 0.5152223706245422, + "grad_norm": 0.502555251121521, "learning_rate": 1.3884711941127487e-05, - "loss": 0.9924, + "loss": 0.9925, "num_input_tokens_seen": 113213440, "step": 13820 }, { "epoch": 0.6641056422569027, - "grad_norm": 0.4832291901111603, + "grad_norm": 0.4870724678039551, "learning_rate": 1.384924043443062e-05, - "loss": 0.7811, + "loss": 0.7822, "num_input_tokens_seen": 113295360, "step": 13830 }, { "epoch": 0.6645858343337335, - "grad_norm": 0.49378257989883423, + "grad_norm": 0.49208691716194153, "learning_rate": 1.3813796929002779e-05, - "loss": 0.7269, + "loss": 0.7278, "num_input_tokens_seen": 113377280, "step": 13840 }, { "epoch": 0.6650660264105642, - "grad_norm": 0.4792208969593048, + "grad_norm": 0.4737597107887268, "learning_rate": 1.3778381513848055e-05, - "loss": 0.7797, + "loss": 0.7792, "num_input_tokens_seen": 113459200, "step": 13850 }, { "epoch": 0.6655462184873949, - "grad_norm": 0.5114631056785583, + "grad_norm": 0.5034937858581543, "learning_rate": 1.3742994277899967e-05, - "loss": 0.7852, + "loss": 0.7843, "num_input_tokens_seen": 113541120, "step": 13860 }, { "epoch": 0.6660264105642257, - "grad_norm": 0.49299582839012146, + "grad_norm": 0.49161359667778015, "learning_rate": 1.370763531002132e-05, - "loss": 1.2018, + "loss": 1.2019, "num_input_tokens_seen": 113623040, "step": 13870 }, { "epoch": 0.6665066026410564, - "grad_norm": 0.48771587014198303, + "grad_norm": 0.487804114818573, "learning_rate": 1.3672304699003908e-05, - "loss": 0.7667, + "loss": 0.7672, "num_input_tokens_seen": 113704960, "step": 13880 }, { "epoch": 0.6669867947178871, - "grad_norm": 0.6052321791648865, + "grad_norm": 0.6117998957633972, "learning_rate": 1.3637002533568302e-05, - "loss": 0.8471, + "loss": 0.847, "num_input_tokens_seen": 113786880, "step": 13890 }, { "epoch": 0.6674669867947179, - "grad_norm": 0.47869783639907837, + "grad_norm": 0.47483766078948975, "learning_rate": 1.3601728902363681e-05, - "loss": 1.1894, + "loss": 1.1898, "num_input_tokens_seen": 113868800, "step": 13900 }, { "epoch": 0.6679471788715486, - "grad_norm": 0.4907095432281494, + "grad_norm": 0.48828792572021484, "learning_rate": 1.356648389396754e-05, "loss": 0.9134, "num_input_tokens_seen": 113950720, @@ -11138,119 +11138,119 @@ }, { "epoch": 0.6684273709483793, - "grad_norm": 0.4225086569786072, + "grad_norm": 0.4222254455089569, "learning_rate": 1.3531267596885488e-05, - "loss": 1.1423, + "loss": 1.1422, "num_input_tokens_seen": 114032640, "step": 13920 }, { "epoch": 0.66890756302521, - "grad_norm": 0.4796990156173706, + "grad_norm": 0.47872963547706604, "learning_rate": 1.349608009955107e-05, - "loss": 0.7921, + "loss": 0.7922, "num_input_tokens_seen": 114114560, "step": 13930 }, { "epoch": 0.6693877551020408, - "grad_norm": 0.4856274724006653, + "grad_norm": 0.48117169737815857, "learning_rate": 1.34609214903255e-05, - "loss": 0.8029, + "loss": 0.8027, "num_input_tokens_seen": 114196480, "step": 13940 }, { "epoch": 0.6698679471788715, - "grad_norm": 0.4970095753669739, + "grad_norm": 0.49767807126045227, "learning_rate": 1.3425791857497422e-05, - "loss": 0.9638, + "loss": 0.9642, "num_input_tokens_seen": 114278400, "step": 13950 }, { "epoch": 0.6703481392557022, - "grad_norm": 0.47488856315612793, + "grad_norm": 0.4757895767688751, "learning_rate": 1.3390691289282754e-05, - "loss": 0.8415, + "loss": 0.8414, "num_input_tokens_seen": 114360320, "step": 13960 }, { "epoch": 0.6708283313325331, - "grad_norm": 0.4611085057258606, + "grad_norm": 0.4622432291507721, "learning_rate": 1.335561987382441e-05, - "loss": 0.9274, + "loss": 0.9272, "num_input_tokens_seen": 114442240, "step": 13970 }, { "epoch": 0.6713085234093638, - "grad_norm": 0.4784678518772125, + "grad_norm": 0.465688019990921, "learning_rate": 1.3320577699192086e-05, - "loss": 0.7943, + "loss": 0.7937, "num_input_tokens_seen": 114524160, "step": 13980 }, { "epoch": 0.6717887154861945, - "grad_norm": 0.49473994970321655, + "grad_norm": 0.4986606240272522, "learning_rate": 1.3285564853382076e-05, - "loss": 0.9644, + "loss": 0.9645, "num_input_tokens_seen": 114606080, "step": 13990 }, { "epoch": 0.6722689075630253, - "grad_norm": 0.45741796493530273, + "grad_norm": 0.46979689598083496, "learning_rate": 1.325058142431701e-05, - "loss": 0.8024, + "loss": 0.8027, "num_input_tokens_seen": 114688000, "step": 14000 }, { "epoch": 0.672749099639856, - "grad_norm": 0.4712899625301361, + "grad_norm": 0.47202202677726746, "learning_rate": 1.321562749984563e-05, - "loss": 0.7327, + "loss": 0.7335, "num_input_tokens_seen": 114769920, "step": 14010 }, { "epoch": 0.6732292917166867, - "grad_norm": 0.4954095780849457, + "grad_norm": 0.49672776460647583, "learning_rate": 1.318070316774262e-05, - "loss": 0.9788, + "loss": 0.9804, "num_input_tokens_seen": 114851840, "step": 14020 }, { "epoch": 0.6737094837935174, - "grad_norm": 0.5219026207923889, + "grad_norm": 0.5184418559074402, "learning_rate": 1.3145808515708347e-05, - "loss": 0.7022, + "loss": 0.7016, "num_input_tokens_seen": 114933760, "step": 14030 }, { "epoch": 0.6741896758703482, - "grad_norm": 0.4924544394016266, + "grad_norm": 0.491964727640152, "learning_rate": 1.3110943631368616e-05, - "loss": 0.9829, + "loss": 0.9825, "num_input_tokens_seen": 115015680, "step": 14040 }, { "epoch": 0.6746698679471789, - "grad_norm": 0.49134406447410583, + "grad_norm": 0.4879056215286255, "learning_rate": 1.3076108602274522e-05, - "loss": 0.7022, + "loss": 0.7027, "num_input_tokens_seen": 115097600, "step": 14050 }, { "epoch": 0.6751500600240096, - "grad_norm": 0.4494248926639557, + "grad_norm": 0.4461091160774231, "learning_rate": 1.3041303515902179e-05, "loss": 0.8251, "num_input_tokens_seen": 115179520, @@ -11258,15 +11258,15 @@ }, { "epoch": 0.6756302521008404, - "grad_norm": 0.46173524856567383, + "grad_norm": 0.45616379380226135, "learning_rate": 1.3006528459652476e-05, - "loss": 1.0901, + "loss": 1.0897, "num_input_tokens_seen": 115261440, "step": 14070 }, { "epoch": 0.6761104441776711, - "grad_norm": 0.5183840394020081, + "grad_norm": 0.5146764516830444, "learning_rate": 1.2971783520850939e-05, "loss": 0.9864, "num_input_tokens_seen": 115343360, @@ -11274,151 +11274,151 @@ }, { "epoch": 0.6765906362545018, - "grad_norm": 0.4756566286087036, + "grad_norm": 0.4778116047382355, "learning_rate": 1.2937068786747438e-05, - "loss": 0.9465, + "loss": 0.9467, "num_input_tokens_seen": 115425280, "step": 14090 }, { "epoch": 0.6770708283313326, - "grad_norm": 0.47903919219970703, + "grad_norm": 0.4811628758907318, "learning_rate": 1.2902384344515986e-05, - "loss": 0.9708, + "loss": 0.971, "num_input_tokens_seen": 115507200, "step": 14100 }, { "epoch": 0.6775510204081633, - "grad_norm": 0.6484615206718445, + "grad_norm": 0.6663933396339417, "learning_rate": 1.286773028125455e-05, - "loss": 0.7833, + "loss": 0.7835, "num_input_tokens_seen": 115589120, "step": 14110 }, { "epoch": 0.678031212484994, - "grad_norm": 0.48011910915374756, + "grad_norm": 0.47772663831710815, "learning_rate": 1.2833106683984808e-05, - "loss": 0.9278, + "loss": 0.9291, "num_input_tokens_seen": 115671040, "step": 14120 }, { "epoch": 0.6785114045618247, - "grad_norm": 0.4931375980377197, + "grad_norm": 0.48866212368011475, "learning_rate": 1.279851363965193e-05, - "loss": 0.8275, + "loss": 0.8278, "num_input_tokens_seen": 115752960, "step": 14130 }, { "epoch": 0.6789915966386555, - "grad_norm": 0.5052535533905029, + "grad_norm": 0.504115879535675, "learning_rate": 1.2763951235124346e-05, - "loss": 0.8764, + "loss": 0.8757, "num_input_tokens_seen": 115834880, "step": 14140 }, { "epoch": 0.6794717887154862, - "grad_norm": 0.49252551794052124, + "grad_norm": 0.4960152804851532, "learning_rate": 1.2729419557193573e-05, - "loss": 0.9236, + "loss": 0.9235, "num_input_tokens_seen": 115916800, "step": 14150 }, { "epoch": 0.6799519807923169, - "grad_norm": 0.45454323291778564, + "grad_norm": 0.4555470049381256, "learning_rate": 1.2694918692573954e-05, - "loss": 0.7963, + "loss": 0.7965, "num_input_tokens_seen": 115998720, "step": 14160 }, { "epoch": 0.6804321728691477, - "grad_norm": 0.47249314188957214, + "grad_norm": 0.475454181432724, "learning_rate": 1.2660448727902457e-05, - "loss": 0.7971, + "loss": 0.7966, "num_input_tokens_seen": 116080640, "step": 14170 }, { "epoch": 0.6809123649459784, - "grad_norm": 0.4945741891860962, + "grad_norm": 0.49502047896385193, "learning_rate": 1.2626009749738444e-05, - "loss": 0.7678, + "loss": 0.7681, "num_input_tokens_seen": 116162560, "step": 14180 }, { "epoch": 0.6813925570228091, - "grad_norm": 0.4679727852344513, + "grad_norm": 0.4328559339046478, "learning_rate": 1.2591601844563488e-05, - "loss": 0.8935, + "loss": 0.8936, "num_input_tokens_seen": 116244480, "step": 14190 }, { "epoch": 0.6818727490996399, - "grad_norm": 0.49965664744377136, + "grad_norm": 0.5078925490379333, "learning_rate": 1.2557225098781105e-05, - "loss": 0.9236, + "loss": 0.9239, "num_input_tokens_seen": 116326400, "step": 14200 }, { "epoch": 0.6823529411764706, - "grad_norm": 0.46757179498672485, + "grad_norm": 0.46798259019851685, "learning_rate": 1.2522879598716595e-05, - "loss": 0.8667, + "loss": 0.8666, "num_input_tokens_seen": 116408320, "step": 14210 }, { "epoch": 0.6828331332533013, - "grad_norm": 0.6531784534454346, + "grad_norm": 0.6590942740440369, "learning_rate": 1.2488565430616785e-05, - "loss": 0.8728, + "loss": 0.873, "num_input_tokens_seen": 116490240, "step": 14220 }, { "epoch": 0.683313325330132, - "grad_norm": 0.4773021340370178, + "grad_norm": 0.4791242778301239, "learning_rate": 1.2454282680649804e-05, - "loss": 0.6747, + "loss": 0.6743, "num_input_tokens_seen": 116572160, "step": 14230 }, { "epoch": 0.6837935174069628, - "grad_norm": 0.6445793509483337, + "grad_norm": 0.656548023223877, "learning_rate": 1.2420031434904906e-05, - "loss": 0.864, + "loss": 0.8642, "num_input_tokens_seen": 116654080, "step": 14240 }, { "epoch": 0.6842737094837935, - "grad_norm": 0.479495644569397, + "grad_norm": 0.4806419909000397, "learning_rate": 1.2385811779392236e-05, - "loss": 0.8979, + "loss": 0.8987, "num_input_tokens_seen": 116736000, "step": 14250 }, { "epoch": 0.6847539015606242, - "grad_norm": 0.8227359652519226, + "grad_norm": 0.8275937438011169, "learning_rate": 1.2351623800042587e-05, - "loss": 0.9371, + "loss": 0.9379, "num_input_tokens_seen": 116817920, "step": 14260 }, { "epoch": 0.685234093637455, - "grad_norm": 0.41954296827316284, + "grad_norm": 0.44159382581710815, "learning_rate": 1.2317467582707238e-05, "loss": 0.9432, "num_input_tokens_seen": 116899840, @@ -11426,79 +11426,79 @@ }, { "epoch": 0.6857142857142857, - "grad_norm": 0.5001243948936462, + "grad_norm": 0.5025272965431213, "learning_rate": 1.2283343213157688e-05, - "loss": 0.8542, + "loss": 0.854, "num_input_tokens_seen": 116981760, "step": 14280 }, { "epoch": 0.6861944777911164, - "grad_norm": 0.5058388113975525, + "grad_norm": 0.5092597603797913, "learning_rate": 1.2249250777085456e-05, - "loss": 0.897, + "loss": 0.8973, "num_input_tokens_seen": 117063680, "step": 14290 }, { "epoch": 0.6866746698679472, - "grad_norm": 0.5120940804481506, + "grad_norm": 0.47980546951293945, "learning_rate": 1.221519036010189e-05, - "loss": 0.9898, + "loss": 0.9893, "num_input_tokens_seen": 117145600, "step": 14300 }, { "epoch": 0.6871548619447779, - "grad_norm": 0.5334838032722473, + "grad_norm": 0.5349389910697937, "learning_rate": 1.2181162047737942e-05, - "loss": 0.7581, + "loss": 0.7582, "num_input_tokens_seen": 117227520, "step": 14310 }, { "epoch": 0.6876350540216086, - "grad_norm": 0.4675463140010834, + "grad_norm": 0.467484712600708, "learning_rate": 1.2147165925443904e-05, - "loss": 1.0274, + "loss": 1.0281, "num_input_tokens_seen": 117309440, "step": 14320 }, { "epoch": 0.6881152460984393, - "grad_norm": 0.7059617638587952, + "grad_norm": 0.6963463425636292, "learning_rate": 1.2113202078589267e-05, - "loss": 0.8403, + "loss": 0.84, "num_input_tokens_seen": 117391360, "step": 14330 }, { "epoch": 0.6885954381752701, - "grad_norm": 0.5068497061729431, + "grad_norm": 0.507810652256012, "learning_rate": 1.2079270592462475e-05, - "loss": 0.8582, + "loss": 0.8581, "num_input_tokens_seen": 117473280, "step": 14340 }, { "epoch": 0.6890756302521008, - "grad_norm": 0.5074156522750854, + "grad_norm": 0.5044135451316833, "learning_rate": 1.204537155227068e-05, - "loss": 0.876, + "loss": 0.8766, "num_input_tokens_seen": 117555200, "step": 14350 }, { "epoch": 0.6895558223289315, - "grad_norm": 0.5661312341690063, + "grad_norm": 0.5582197904586792, "learning_rate": 1.201150504313959e-05, - "loss": 0.9847, + "loss": 0.985, "num_input_tokens_seen": 117637120, "step": 14360 }, { "epoch": 0.6900360144057623, - "grad_norm": 0.48914408683776855, + "grad_norm": 0.4891442358493805, "learning_rate": 1.1977671150113206e-05, "loss": 0.8981, "num_input_tokens_seen": 117719040, @@ -11506,23 +11506,23 @@ }, { "epoch": 0.690516206482593, - "grad_norm": 0.5188919305801392, + "grad_norm": 0.5189415812492371, "learning_rate": 1.1943869958153613e-05, - "loss": 0.929, + "loss": 0.9291, "num_input_tokens_seen": 117800960, "step": 14380 }, { "epoch": 0.6909963985594237, - "grad_norm": 0.4931916892528534, + "grad_norm": 0.49603569507598877, "learning_rate": 1.1910101552140806e-05, - "loss": 0.9613, + "loss": 0.9612, "num_input_tokens_seen": 117882880, "step": 14390 }, { "epoch": 0.6914765906362546, - "grad_norm": 0.5177183747291565, + "grad_norm": 0.5161275267601013, "learning_rate": 1.1876366016872445e-05, "loss": 0.9258, "num_input_tokens_seen": 117964800, @@ -11530,151 +11530,151 @@ }, { "epoch": 0.6919567827130852, - "grad_norm": 0.48315808176994324, + "grad_norm": 0.48330655694007874, "learning_rate": 1.1842663437063613e-05, - "loss": 0.7828, + "loss": 0.783, "num_input_tokens_seen": 118046720, "step": 14410 }, { "epoch": 0.692436974789916, - "grad_norm": 0.4908153712749481, + "grad_norm": 0.4942614436149597, "learning_rate": 1.180899389734668e-05, - "loss": 0.9457, + "loss": 0.9465, "num_input_tokens_seen": 118128640, "step": 14420 }, { "epoch": 0.6929171668667466, - "grad_norm": 0.47637316584587097, + "grad_norm": 0.47883716225624084, "learning_rate": 1.1775357482271032e-05, - "loss": 0.872, + "loss": 0.8717, "num_input_tokens_seen": 118210560, "step": 14430 }, { "epoch": 0.6933973589435775, - "grad_norm": 0.5091587901115417, + "grad_norm": 0.5108613967895508, "learning_rate": 1.1741754276302851e-05, - "loss": 0.8554, + "loss": 0.8557, "num_input_tokens_seen": 118292480, "step": 14440 }, { "epoch": 0.6938775510204082, - "grad_norm": 0.49093228578567505, + "grad_norm": 0.4875372350215912, "learning_rate": 1.170818436382497e-05, - "loss": 0.9142, + "loss": 0.9129, "num_input_tokens_seen": 118374400, "step": 14450 }, { "epoch": 0.6943577430972389, - "grad_norm": 0.47567296028137207, + "grad_norm": 0.4819384217262268, "learning_rate": 1.1674647829136581e-05, - "loss": 0.9356, + "loss": 0.9353, "num_input_tokens_seen": 118456320, "step": 14460 }, { "epoch": 0.6948379351740697, - "grad_norm": 0.49377357959747314, + "grad_norm": 0.49324291944503784, "learning_rate": 1.164114475645306e-05, - "loss": 0.9423, + "loss": 0.9419, "num_input_tokens_seen": 118538240, "step": 14470 }, { "epoch": 0.6953181272509004, - "grad_norm": 0.49795305728912354, + "grad_norm": 0.49606454372406006, "learning_rate": 1.1607675229905776e-05, - "loss": 0.8267, + "loss": 0.8269, "num_input_tokens_seen": 118620160, "step": 14480 }, { "epoch": 0.6957983193277311, - "grad_norm": 0.42667868733406067, + "grad_norm": 0.4274294674396515, "learning_rate": 1.1574239333541856e-05, - "loss": 0.8272, + "loss": 0.8265, "num_input_tokens_seen": 118702080, "step": 14490 }, { "epoch": 0.6962785114045619, - "grad_norm": 0.4863702356815338, + "grad_norm": 0.4905083179473877, "learning_rate": 1.1540837151323951e-05, - "loss": 1.0219, + "loss": 1.0225, "num_input_tokens_seen": 118784000, "step": 14500 }, { "epoch": 0.6967587034813926, - "grad_norm": 0.42491790652275085, + "grad_norm": 0.42654213309288025, "learning_rate": 1.150746876713008e-05, - "loss": 1.0269, + "loss": 1.0267, "num_input_tokens_seen": 118865920, "step": 14510 }, { "epoch": 0.6972388955582233, - "grad_norm": 1.2376126050949097, + "grad_norm": 1.1795427799224854, "learning_rate": 1.1474134264753384e-05, - "loss": 0.9424, + "loss": 0.9422, "num_input_tokens_seen": 118947840, "step": 14520 }, { "epoch": 0.697719087635054, - "grad_norm": 0.4999983310699463, + "grad_norm": 0.4983981251716614, "learning_rate": 1.1440833727901894e-05, - "loss": 0.8436, + "loss": 0.8435, "num_input_tokens_seen": 119029760, "step": 14530 }, { "epoch": 0.6981992797118848, - "grad_norm": 1.1919395923614502, + "grad_norm": 1.133950114250183, "learning_rate": 1.1407567240198397e-05, - "loss": 0.7876, + "loss": 0.7872, "num_input_tokens_seen": 119111680, "step": 14540 }, { "epoch": 0.6986794717887155, - "grad_norm": 0.4805420935153961, + "grad_norm": 0.4824429750442505, "learning_rate": 1.1374334885180135e-05, - "loss": 0.8579, + "loss": 0.8584, "num_input_tokens_seen": 119193600, "step": 14550 }, { "epoch": 0.6991596638655462, - "grad_norm": 0.505160391330719, + "grad_norm": 0.5034725666046143, "learning_rate": 1.1341136746298647e-05, - "loss": 0.9189, + "loss": 0.9188, "num_input_tokens_seen": 119275520, "step": 14560 }, { "epoch": 0.699639855942377, - "grad_norm": 0.369730681180954, + "grad_norm": 0.3501124978065491, "learning_rate": 1.1307972906919562e-05, - "loss": 0.834, + "loss": 0.8331, "num_input_tokens_seen": 119357440, "step": 14570 }, { "epoch": 0.7001200480192077, - "grad_norm": 0.493691086769104, + "grad_norm": 0.4934571087360382, "learning_rate": 1.1274843450322381e-05, - "loss": 0.8542, + "loss": 0.8539, "num_input_tokens_seen": 119439360, "step": 14580 }, { "epoch": 0.7006002400960384, - "grad_norm": 0.49764561653137207, + "grad_norm": 0.49862003326416016, "learning_rate": 1.1241748459700241e-05, "loss": 0.8792, "num_input_tokens_seen": 119521280, @@ -11682,63 +11682,63 @@ }, { "epoch": 0.7010804321728692, - "grad_norm": 0.6448412537574768, + "grad_norm": 0.6504396796226501, "learning_rate": 1.1208688018159746e-05, - "loss": 0.9641, + "loss": 0.9642, "num_input_tokens_seen": 119603200, "step": 14600 }, { "epoch": 0.7015606242496999, - "grad_norm": 0.499411016702652, + "grad_norm": 0.4994623363018036, "learning_rate": 1.1175662208720758e-05, - "loss": 1.0869, + "loss": 1.0876, "num_input_tokens_seen": 119685120, "step": 14610 }, { "epoch": 0.7020408163265306, - "grad_norm": 0.5112809538841248, + "grad_norm": 0.5138194561004639, "learning_rate": 1.1142671114316127e-05, - "loss": 0.9409, + "loss": 0.941, "num_input_tokens_seen": 119767040, "step": 14620 }, { "epoch": 0.7025210084033613, - "grad_norm": 0.5502108931541443, + "grad_norm": 0.5315556526184082, "learning_rate": 1.1109714817791584e-05, - "loss": 0.9631, + "loss": 0.9637, "num_input_tokens_seen": 119848960, "step": 14630 }, { "epoch": 0.7030012004801921, - "grad_norm": 0.4769909381866455, + "grad_norm": 0.48073628544807434, "learning_rate": 1.1076793401905419e-05, - "loss": 0.8025, + "loss": 0.8024, "num_input_tokens_seen": 119930880, "step": 14640 }, { "epoch": 0.7034813925570228, - "grad_norm": 0.5704587697982788, + "grad_norm": 0.5759013295173645, "learning_rate": 1.1043906949328387e-05, - "loss": 0.8847, + "loss": 0.8844, "num_input_tokens_seen": 120012800, "step": 14650 }, { "epoch": 0.7039615846338535, - "grad_norm": 0.49126431345939636, + "grad_norm": 0.4914475977420807, "learning_rate": 1.1011055542643398e-05, - "loss": 0.8116, + "loss": 0.8125, "num_input_tokens_seen": 120094720, "step": 14660 }, { "epoch": 0.7044417767106843, - "grad_norm": 0.5111186504364014, + "grad_norm": 0.5116990208625793, "learning_rate": 1.0978239264345397e-05, "loss": 0.9473, "num_input_tokens_seen": 120176640, @@ -11746,71 +11746,71 @@ }, { "epoch": 0.704921968787515, - "grad_norm": 0.48010018467903137, + "grad_norm": 0.48121222853660583, "learning_rate": 1.0945458196841078e-05, - "loss": 0.9452, + "loss": 0.9443, "num_input_tokens_seen": 120258560, "step": 14680 }, { "epoch": 0.7054021608643457, - "grad_norm": 0.4862194061279297, + "grad_norm": 0.48448559641838074, "learning_rate": 1.0912712422448737e-05, - "loss": 1.089, + "loss": 1.0888, "num_input_tokens_seen": 120340480, "step": 14690 }, { "epoch": 0.7058823529411765, - "grad_norm": 0.5020564198493958, + "grad_norm": 0.49979427456855774, "learning_rate": 1.0880002023398058e-05, - "loss": 0.8222, + "loss": 0.8219, "num_input_tokens_seen": 120422400, "step": 14700 }, { "epoch": 0.7063625450180072, - "grad_norm": 0.5132585167884827, + "grad_norm": 0.5122401118278503, "learning_rate": 1.0847327081829853e-05, - "loss": 0.8349, + "loss": 0.8348, "num_input_tokens_seen": 120504320, "step": 14710 }, { "epoch": 0.7068427370948379, - "grad_norm": 0.5931466221809387, + "grad_norm": 0.6076194643974304, "learning_rate": 1.0814687679795924e-05, - "loss": 1.4205, + "loss": 1.4222, "num_input_tokens_seen": 120586240, "step": 14720 }, { "epoch": 0.7073229291716686, - "grad_norm": 0.529240608215332, + "grad_norm": 0.5290340185165405, "learning_rate": 1.0782083899258827e-05, - "loss": 0.9358, + "loss": 0.9349, "num_input_tokens_seen": 120668160, "step": 14730 }, { "epoch": 0.7078031212484994, - "grad_norm": 0.5132381319999695, + "grad_norm": 0.510225772857666, "learning_rate": 1.0749515822091658e-05, - "loss": 0.8712, + "loss": 0.871, "num_input_tokens_seen": 120750080, "step": 14740 }, { "epoch": 0.7082833133253301, - "grad_norm": 0.5172890424728394, + "grad_norm": 0.5205603241920471, "learning_rate": 1.0716983530077843e-05, - "loss": 1.2485, + "loss": 1.2482, "num_input_tokens_seen": 120832000, "step": 14750 }, { "epoch": 0.7087635054021608, - "grad_norm": 0.4927189350128174, + "grad_norm": 0.4906553030014038, "learning_rate": 1.0684487104910974e-05, "loss": 0.7965, "num_input_tokens_seen": 120913920, @@ -11818,7 +11818,7 @@ }, { "epoch": 0.7092436974789916, - "grad_norm": 0.3909131586551666, + "grad_norm": 0.4048673212528229, "learning_rate": 1.0652026628194567e-05, "loss": 0.8365, "num_input_tokens_seen": 120995840, @@ -11826,111 +11826,111 @@ }, { "epoch": 0.7097238895558223, - "grad_norm": 0.5766429901123047, + "grad_norm": 0.5668584108352661, "learning_rate": 1.0619602181441848e-05, - "loss": 0.8742, + "loss": 0.8743, "num_input_tokens_seen": 121077760, "step": 14780 }, { "epoch": 0.710204081632653, - "grad_norm": 0.5812046527862549, + "grad_norm": 0.5875443816184998, "learning_rate": 1.0587213846075591e-05, - "loss": 0.7952, + "loss": 0.7954, "num_input_tokens_seen": 121159680, "step": 14790 }, { "epoch": 0.7106842737094838, - "grad_norm": 0.5952445864677429, + "grad_norm": 0.5778575539588928, "learning_rate": 1.0554861703427884e-05, - "loss": 0.9441, + "loss": 0.9451, "num_input_tokens_seen": 121241600, "step": 14800 }, { "epoch": 0.7111644657863145, - "grad_norm": 1.0399448871612549, + "grad_norm": 1.0493206977844238, "learning_rate": 1.0522545834739908e-05, - "loss": 1.0587, + "loss": 1.0584, "num_input_tokens_seen": 121323520, "step": 14810 }, { "epoch": 0.7116446578631452, - "grad_norm": 0.519305944442749, + "grad_norm": 0.5271633267402649, "learning_rate": 1.0490266321161785e-05, - "loss": 0.9414, + "loss": 0.9405, "num_input_tokens_seen": 121405440, "step": 14820 }, { "epoch": 0.712124849939976, - "grad_norm": 0.517493724822998, + "grad_norm": 0.5183375477790833, "learning_rate": 1.0458023243752321e-05, - "loss": 0.9523, + "loss": 0.9526, "num_input_tokens_seen": 121487360, "step": 14830 }, { "epoch": 0.7126050420168067, - "grad_norm": 0.4983735978603363, + "grad_norm": 0.4991316497325897, "learning_rate": 1.0425816683478823e-05, - "loss": 1.145, + "loss": 1.1452, "num_input_tokens_seen": 121569280, "step": 14840 }, { "epoch": 0.7130852340936374, - "grad_norm": 1.5348645448684692, + "grad_norm": 1.4794950485229492, "learning_rate": 1.039364672121692e-05, - "loss": 0.9529, + "loss": 0.9537, "num_input_tokens_seen": 121651200, "step": 14850 }, { "epoch": 0.7135654261704681, - "grad_norm": 0.44925153255462646, + "grad_norm": 0.4501538574695587, "learning_rate": 1.0361513437750333e-05, - "loss": 1.0087, + "loss": 1.0094, "num_input_tokens_seen": 121733120, "step": 14860 }, { "epoch": 0.714045618247299, - "grad_norm": 0.28816720843315125, + "grad_norm": 0.31150001287460327, "learning_rate": 1.0329416913770651e-05, - "loss": 0.8125, + "loss": 0.8121, "num_input_tokens_seen": 121815040, "step": 14870 }, { "epoch": 0.7145258103241297, - "grad_norm": 0.5006898045539856, + "grad_norm": 0.5036375522613525, "learning_rate": 1.0297357229877183e-05, - "loss": 0.8591, + "loss": 0.8583, "num_input_tokens_seen": 121896960, "step": 14880 }, { "epoch": 0.7150060024009603, - "grad_norm": 0.4817348122596741, + "grad_norm": 0.4911898374557495, "learning_rate": 1.0265334466576723e-05, - "loss": 0.9232, + "loss": 0.9234, "num_input_tokens_seen": 121978880, "step": 14890 }, { "epoch": 0.7154861944777912, - "grad_norm": 0.511573851108551, + "grad_norm": 0.5137236714363098, "learning_rate": 1.0233348704283332e-05, - "loss": 0.7655, + "loss": 0.7661, "num_input_tokens_seen": 122060800, "step": 14900 }, { "epoch": 0.7159663865546219, - "grad_norm": 0.49658405780792236, + "grad_norm": 0.49499648809432983, "learning_rate": 1.0201400023318184e-05, "loss": 0.8836, "num_input_tokens_seen": 122142720, @@ -11938,79 +11938,79 @@ }, { "epoch": 0.7164465786314526, - "grad_norm": 0.49688369035720825, + "grad_norm": 0.49660295248031616, "learning_rate": 1.0169488503909313e-05, - "loss": 0.9663, + "loss": 0.9656, "num_input_tokens_seen": 122224640, "step": 14920 }, { "epoch": 0.7169267707082834, - "grad_norm": 0.4753476679325104, + "grad_norm": 0.47622841596603394, "learning_rate": 1.0137614226191434e-05, - "loss": 1.1426, + "loss": 1.1411, "num_input_tokens_seen": 122306560, "step": 14930 }, { "epoch": 0.7174069627851141, - "grad_norm": 0.496444433927536, + "grad_norm": 0.5041408538818359, "learning_rate": 1.010577727020576e-05, - "loss": 1.0298, + "loss": 1.0296, "num_input_tokens_seen": 122388480, "step": 14940 }, { "epoch": 0.7178871548619448, - "grad_norm": 0.5547921657562256, + "grad_norm": 0.5578935146331787, "learning_rate": 1.0073977715899785e-05, - "loss": 1.0564, + "loss": 1.0557, "num_input_tokens_seen": 122470400, "step": 14950 }, { "epoch": 0.7183673469387755, - "grad_norm": 0.49256107211112976, + "grad_norm": 0.4937760829925537, "learning_rate": 1.0042215643127051e-05, - "loss": 0.8569, + "loss": 0.857, "num_input_tokens_seen": 122552320, "step": 14960 }, { "epoch": 0.7188475390156063, - "grad_norm": 0.48709073662757874, + "grad_norm": 0.48638951778411865, "learning_rate": 1.0010491131647013e-05, - "loss": 0.9653, + "loss": 0.9649, "num_input_tokens_seen": 122634240, "step": 14970 }, { "epoch": 0.719327731092437, - "grad_norm": 0.9637261629104614, + "grad_norm": 1.094740390777588, "learning_rate": 9.978804261124792e-06, - "loss": 1.0368, + "loss": 1.0374, "num_input_tokens_seen": 122716160, "step": 14980 }, { "epoch": 0.7198079231692677, - "grad_norm": 0.5114941000938416, + "grad_norm": 0.5187696814537048, "learning_rate": 9.947155111130969e-06, - "loss": 0.8155, + "loss": 0.8151, "num_input_tokens_seen": 122798080, "step": 14990 }, { "epoch": 0.7202881152460985, - "grad_norm": 0.4979882538318634, + "grad_norm": 0.5019155144691467, "learning_rate": 9.915543761141432e-06, - "loss": 0.9462, + "loss": 0.9461, "num_input_tokens_seen": 122880000, "step": 15000 }, { "epoch": 0.7207683073229292, - "grad_norm": 0.49206554889678955, + "grad_norm": 0.49210861325263977, "learning_rate": 9.883970290537134e-06, "loss": 0.7584, "num_input_tokens_seen": 122961920, @@ -12018,79 +12018,79 @@ }, { "epoch": 0.7212484993997599, - "grad_norm": 0.5425018668174744, + "grad_norm": 0.541955828666687, "learning_rate": 9.852434778603888e-06, - "loss": 0.9354, + "loss": 0.9357, "num_input_tokens_seen": 123043840, "step": 15020 }, { "epoch": 0.7217286914765907, - "grad_norm": 0.311642587184906, + "grad_norm": 0.2484675943851471, "learning_rate": 9.820937304532221e-06, - "loss": 0.8208, + "loss": 0.8206, "num_input_tokens_seen": 123125760, "step": 15030 }, { "epoch": 0.7222088835534214, - "grad_norm": 0.5740408301353455, + "grad_norm": 0.5742604732513428, "learning_rate": 9.789477947417131e-06, - "loss": 1.0232, + "loss": 1.0253, "num_input_tokens_seen": 123207680, "step": 15040 }, { "epoch": 0.7226890756302521, - "grad_norm": 0.48182716965675354, + "grad_norm": 0.48535361886024475, "learning_rate": 9.758056786257874e-06, - "loss": 0.8119, + "loss": 0.8122, "num_input_tokens_seen": 123289600, "step": 15050 }, { "epoch": 0.7231692677070828, - "grad_norm": 0.6770291328430176, + "grad_norm": 0.7094045281410217, "learning_rate": 9.726673899957823e-06, - "loss": 0.9681, + "loss": 0.9684, "num_input_tokens_seen": 123371520, "step": 15060 }, { "epoch": 0.7236494597839136, - "grad_norm": 0.5068685412406921, + "grad_norm": 0.5078888535499573, "learning_rate": 9.695329367324226e-06, - "loss": 0.831, + "loss": 0.8309, "num_input_tokens_seen": 123453440, "step": 15070 }, { "epoch": 0.7241296518607443, - "grad_norm": 0.5525927543640137, + "grad_norm": 0.5580435395240784, "learning_rate": 9.664023267068007e-06, - "loss": 0.7507, + "loss": 0.7509, "num_input_tokens_seen": 123535360, "step": 15080 }, { "epoch": 0.724609843937575, - "grad_norm": 0.5521044731140137, + "grad_norm": 0.5548977255821228, "learning_rate": 9.632755677803595e-06, - "loss": 0.9068, + "loss": 0.9072, "num_input_tokens_seen": 123617280, "step": 15090 }, { "epoch": 0.7250900360144058, - "grad_norm": 0.628960907459259, + "grad_norm": 0.633141815662384, "learning_rate": 9.601526678048701e-06, - "loss": 0.9601, + "loss": 0.9596, "num_input_tokens_seen": 123699200, "step": 15100 }, { "epoch": 0.7255702280912365, - "grad_norm": 0.49019432067871094, + "grad_norm": 0.49129536747932434, "learning_rate": 9.570336346224145e-06, "loss": 0.8658, "num_input_tokens_seen": 123781120, @@ -12098,127 +12098,127 @@ }, { "epoch": 0.7260504201680672, - "grad_norm": 0.49973979592323303, + "grad_norm": 0.49976715445518494, "learning_rate": 9.53918476065363e-06, - "loss": 0.9606, + "loss": 0.9609, "num_input_tokens_seen": 123863040, "step": 15120 }, { "epoch": 0.726530612244898, - "grad_norm": 0.5277695655822754, + "grad_norm": 0.5346314311027527, "learning_rate": 9.508071999563578e-06, - "loss": 0.8749, + "loss": 0.8759, "num_input_tokens_seen": 123944960, "step": 15130 }, { "epoch": 0.7270108043217287, - "grad_norm": 0.6320548057556152, + "grad_norm": 0.6391932368278503, "learning_rate": 9.476998141082896e-06, - "loss": 0.8534, + "loss": 0.8535, "num_input_tokens_seen": 124026880, "step": 15140 }, { "epoch": 0.7274909963985594, - "grad_norm": 0.4708552658557892, + "grad_norm": 0.4688946306705475, "learning_rate": 9.445963263242822e-06, - "loss": 0.8115, + "loss": 0.8117, "num_input_tokens_seen": 124108800, "step": 15150 }, { "epoch": 0.7279711884753901, - "grad_norm": 0.49595674872398376, + "grad_norm": 0.4958951771259308, "learning_rate": 9.414967443976705e-06, - "loss": 0.8173, + "loss": 0.8172, "num_input_tokens_seen": 124190720, "step": 15160 }, { "epoch": 0.7284513805522209, - "grad_norm": 0.49820470809936523, + "grad_norm": 0.4968552887439728, "learning_rate": 9.384010761119787e-06, - "loss": 0.7552, + "loss": 0.7557, "num_input_tokens_seen": 124272640, "step": 15170 }, { "epoch": 0.7289315726290516, - "grad_norm": 0.47693219780921936, + "grad_norm": 0.4845529794692993, "learning_rate": 9.353093292409063e-06, - "loss": 1.0986, + "loss": 1.0988, "num_input_tokens_seen": 124354560, "step": 15180 }, { "epoch": 0.7294117647058823, - "grad_norm": 0.5159751176834106, + "grad_norm": 0.5118656158447266, "learning_rate": 9.322215115483049e-06, - "loss": 0.9024, + "loss": 0.9023, "num_input_tokens_seen": 124436480, "step": 15190 }, { "epoch": 0.7298919567827131, - "grad_norm": 0.4883824586868286, + "grad_norm": 0.4848543703556061, "learning_rate": 9.291376307881577e-06, - "loss": 0.7029, + "loss": 0.7027, "num_input_tokens_seen": 124518400, "step": 15200 }, { "epoch": 0.7303721488595438, - "grad_norm": 0.4944133162498474, + "grad_norm": 0.49403074383735657, "learning_rate": 9.260576947045624e-06, - "loss": 0.7564, + "loss": 0.7578, "num_input_tokens_seen": 124600320, "step": 15210 }, { "epoch": 0.7308523409363745, - "grad_norm": 0.5287705063819885, + "grad_norm": 0.5141822099685669, "learning_rate": 9.229817110317126e-06, - "loss": 0.8904, + "loss": 0.8909, "num_input_tokens_seen": 124682240, "step": 15220 }, { "epoch": 0.7313325330132053, - "grad_norm": 0.5056361556053162, + "grad_norm": 0.5016796588897705, "learning_rate": 9.19909687493874e-06, - "loss": 0.8753, + "loss": 0.8752, "num_input_tokens_seen": 124764160, "step": 15230 }, { "epoch": 0.731812725090036, - "grad_norm": 0.49271291494369507, + "grad_norm": 0.48871156573295593, "learning_rate": 9.168416318053701e-06, - "loss": 0.7468, + "loss": 0.7467, "num_input_tokens_seen": 124846080, "step": 15240 }, { "epoch": 0.7322929171668667, - "grad_norm": 0.4896814823150635, + "grad_norm": 0.4882807731628418, "learning_rate": 9.137775516705604e-06, - "loss": 0.7828, + "loss": 0.7833, "num_input_tokens_seen": 124928000, "step": 15250 }, { "epoch": 0.7327731092436974, - "grad_norm": 0.5445641279220581, + "grad_norm": 2.4790802001953125, "learning_rate": 9.107174547838188e-06, - "loss": 0.978, + "loss": 0.9774, "num_input_tokens_seen": 125009920, "step": 15260 }, { "epoch": 0.7332533013205282, - "grad_norm": 0.4939377009868622, + "grad_norm": 0.5495137572288513, "learning_rate": 9.076613488295193e-06, "loss": 0.9886, "num_input_tokens_seen": 125091840, @@ -12226,159 +12226,159 @@ }, { "epoch": 0.7337334933973589, - "grad_norm": 0.5052213668823242, + "grad_norm": 0.5097706913948059, "learning_rate": 9.04609241482014e-06, - "loss": 0.9165, + "loss": 0.9161, "num_input_tokens_seen": 125173760, "step": 15280 }, { "epoch": 0.7342136854741896, - "grad_norm": 0.5136955976486206, + "grad_norm": 0.5205005407333374, "learning_rate": 9.015611404056121e-06, - "loss": 0.9454, + "loss": 0.9453, "num_input_tokens_seen": 125255680, "step": 15290 }, { "epoch": 0.7346938775510204, - "grad_norm": 0.49934035539627075, + "grad_norm": 0.5136666893959045, "learning_rate": 8.985170532545622e-06, - "loss": 0.9743, + "loss": 0.9736, "num_input_tokens_seen": 125337600, "step": 15300 }, { "epoch": 0.7351740696278511, - "grad_norm": 0.47998034954071045, + "grad_norm": 0.48384732007980347, "learning_rate": 8.954769876730368e-06, - "loss": 1.0288, + "loss": 1.0289, "num_input_tokens_seen": 125419520, "step": 15310 }, { "epoch": 0.7356542617046818, - "grad_norm": 0.512691080570221, + "grad_norm": 0.5137535929679871, "learning_rate": 8.924409512951045e-06, - "loss": 0.8448, + "loss": 0.8446, "num_input_tokens_seen": 125501440, "step": 15320 }, { "epoch": 0.7361344537815127, - "grad_norm": 0.5149726271629333, + "grad_norm": 0.5144986510276794, "learning_rate": 8.894089517447206e-06, - "loss": 1.0112, + "loss": 1.0116, "num_input_tokens_seen": 125583360, "step": 15330 }, { "epoch": 0.7366146458583434, - "grad_norm": 0.47288787364959717, + "grad_norm": 0.474317729473114, "learning_rate": 8.863809966357017e-06, - "loss": 0.9288, + "loss": 0.9291, "num_input_tokens_seen": 125665280, "step": 15340 }, { "epoch": 0.737094837935174, - "grad_norm": 0.4858817756175995, + "grad_norm": 0.4877590835094452, "learning_rate": 8.833570935717064e-06, - "loss": 0.8124, + "loss": 0.8133, "num_input_tokens_seen": 125747200, "step": 15350 }, { "epoch": 0.7375750300120048, - "grad_norm": 0.496822714805603, + "grad_norm": 0.49466702342033386, "learning_rate": 8.803372501462203e-06, - "loss": 0.8461, + "loss": 0.845, "num_input_tokens_seen": 125829120, "step": 15360 }, { "epoch": 0.7380552220888356, - "grad_norm": 0.510150134563446, + "grad_norm": 0.5105023384094238, "learning_rate": 8.773214739425346e-06, - "loss": 0.7163, + "loss": 0.7169, "num_input_tokens_seen": 125911040, "step": 15370 }, { "epoch": 0.7385354141656663, - "grad_norm": 0.4825844466686249, + "grad_norm": 0.47850966453552246, "learning_rate": 8.743097725337255e-06, - "loss": 0.5987, + "loss": 0.599, "num_input_tokens_seen": 125992960, "step": 15380 }, { "epoch": 0.739015606242497, - "grad_norm": 0.47801604866981506, + "grad_norm": 0.47564321756362915, "learning_rate": 8.713021534826366e-06, - "loss": 0.8509, + "loss": 0.8508, "num_input_tokens_seen": 126074880, "step": 15390 }, { "epoch": 0.7394957983193278, - "grad_norm": 0.5060662031173706, + "grad_norm": 0.5005056262016296, "learning_rate": 8.68298624341862e-06, - "loss": 1.0267, + "loss": 1.0262, "num_input_tokens_seen": 126156800, "step": 15400 }, { "epoch": 0.7399759903961585, - "grad_norm": 0.46689069271087646, + "grad_norm": 0.46413904428482056, "learning_rate": 8.652991926537254e-06, - "loss": 0.913, + "loss": 0.9126, "num_input_tokens_seen": 126238720, "step": 15410 }, { "epoch": 0.7404561824729892, - "grad_norm": 0.4775540828704834, + "grad_norm": 0.47946876287460327, "learning_rate": 8.623038659502583e-06, - "loss": 0.9146, + "loss": 0.9147, "num_input_tokens_seen": 126320640, "step": 15420 }, { "epoch": 0.74093637454982, - "grad_norm": 0.5078407526016235, + "grad_norm": 0.5045376420021057, "learning_rate": 8.59312651753187e-06, - "loss": 0.7079, + "loss": 0.7083, "num_input_tokens_seen": 126402560, "step": 15430 }, { "epoch": 0.7414165666266507, - "grad_norm": 0.6031984090805054, + "grad_norm": 0.5812355875968933, "learning_rate": 8.5632555757391e-06, - "loss": 0.9282, + "loss": 0.9287, "num_input_tokens_seen": 126484480, "step": 15440 }, { "epoch": 0.7418967587034814, - "grad_norm": 0.48151716589927673, + "grad_norm": 0.4817546606063843, "learning_rate": 8.53342590913478e-06, - "loss": 0.8451, + "loss": 0.845, "num_input_tokens_seen": 126566400, "step": 15450 }, { "epoch": 0.7423769507803121, - "grad_norm": 0.4922950863838196, + "grad_norm": 0.4910758137702942, "learning_rate": 8.503637592625796e-06, - "loss": 0.8644, + "loss": 0.8641, "num_input_tokens_seen": 126648320, "step": 15460 }, { "epoch": 0.7428571428571429, - "grad_norm": 0.48294711112976074, + "grad_norm": 0.47712764143943787, "learning_rate": 8.473890701015177e-06, "loss": 0.9529, "num_input_tokens_seen": 126730240, @@ -12386,79 +12386,79 @@ }, { "epoch": 0.7433373349339736, - "grad_norm": 1.1124995946884155, + "grad_norm": 1.1171008348464966, "learning_rate": 8.444185309001926e-06, - "loss": 1.0753, + "loss": 1.0765, "num_input_tokens_seen": 126812160, "step": 15480 }, { "epoch": 0.7438175270108043, - "grad_norm": 0.6511945128440857, + "grad_norm": 0.635892927646637, "learning_rate": 8.41452149118085e-06, - "loss": 0.8009, + "loss": 0.7988, "num_input_tokens_seen": 126894080, "step": 15490 }, { "epoch": 0.7442977190876351, - "grad_norm": 0.47648462653160095, + "grad_norm": 0.46706971526145935, "learning_rate": 8.384899322042356e-06, - "loss": 0.8622, + "loss": 0.8623, "num_input_tokens_seen": 126976000, "step": 15500 }, { "epoch": 0.7447779111644658, - "grad_norm": 0.48163020610809326, + "grad_norm": 0.4782085418701172, "learning_rate": 8.355318875972243e-06, - "loss": 0.8215, + "loss": 0.8205, "num_input_tokens_seen": 127057920, "step": 15510 }, { "epoch": 0.7452581032412965, - "grad_norm": 0.4855590760707855, + "grad_norm": 0.4798336625099182, "learning_rate": 8.325780227251562e-06, - "loss": 0.7382, + "loss": 0.7383, "num_input_tokens_seen": 127139840, "step": 15520 }, { "epoch": 0.7457382953181273, - "grad_norm": 0.4971849024295807, + "grad_norm": 0.49591222405433655, "learning_rate": 8.2962834500564e-06, - "loss": 0.9809, + "loss": 0.9802, "num_input_tokens_seen": 127221760, "step": 15530 }, { "epoch": 0.746218487394958, - "grad_norm": 0.4910465478897095, + "grad_norm": 0.4897187054157257, "learning_rate": 8.266828618457678e-06, - "loss": 0.9193, + "loss": 0.9196, "num_input_tokens_seen": 127303680, "step": 15540 }, { "epoch": 0.7466986794717887, - "grad_norm": 0.5851929187774658, + "grad_norm": 0.5829177498817444, "learning_rate": 8.237415806421015e-06, - "loss": 1.1018, + "loss": 1.1014, "num_input_tokens_seen": 127385600, "step": 15550 }, { "epoch": 0.7471788715486194, - "grad_norm": 0.5044286847114563, + "grad_norm": 0.5015119910240173, "learning_rate": 8.20804508780648e-06, - "loss": 0.7888, + "loss": 0.7883, "num_input_tokens_seen": 127467520, "step": 15560 }, { "epoch": 0.7476590636254502, - "grad_norm": 0.48159295320510864, + "grad_norm": 0.48306459188461304, "learning_rate": 8.178716536368475e-06, "loss": 1.0093, "num_input_tokens_seen": 127549440, @@ -12466,39 +12466,39 @@ }, { "epoch": 0.7481392557022809, - "grad_norm": 0.5040597319602966, + "grad_norm": 0.5044358372688293, "learning_rate": 8.149430225755476e-06, - "loss": 0.9785, + "loss": 0.978, "num_input_tokens_seen": 127631360, "step": 15580 }, { "epoch": 0.7486194477791116, - "grad_norm": 0.4986848831176758, + "grad_norm": 0.5006290674209595, "learning_rate": 8.120186229509922e-06, - "loss": 0.8937, + "loss": 0.8938, "num_input_tokens_seen": 127713280, "step": 15590 }, { "epoch": 0.7490996398559424, - "grad_norm": 0.5762357711791992, + "grad_norm": 0.5820693969726562, "learning_rate": 8.090984621067963e-06, - "loss": 1.0423, + "loss": 1.0422, "num_input_tokens_seen": 127795200, "step": 15600 }, { "epoch": 0.7495798319327731, - "grad_norm": 0.4688274562358856, + "grad_norm": 0.471348375082016, "learning_rate": 8.061825473759324e-06, - "loss": 0.8387, + "loss": 0.8388, "num_input_tokens_seen": 127877120, "step": 15610 }, { "epoch": 0.7500600240096038, - "grad_norm": 0.49585649371147156, + "grad_norm": 0.49735504388809204, "learning_rate": 8.032708860807111e-06, "loss": 0.8558, "num_input_tokens_seen": 127959040, @@ -12506,175 +12506,175 @@ }, { "epoch": 0.7505402160864346, - "grad_norm": 0.501162052154541, + "grad_norm": 0.49957790970802307, "learning_rate": 8.003634855327594e-06, - "loss": 0.7608, + "loss": 0.7606, "num_input_tokens_seen": 128040960, "step": 15630 }, { "epoch": 0.7510204081632653, - "grad_norm": 0.8193572759628296, + "grad_norm": 0.8058281540870667, "learning_rate": 7.974603530330069e-06, - "loss": 0.7416, + "loss": 0.7422, "num_input_tokens_seen": 128122880, "step": 15640 }, { "epoch": 0.751500600240096, - "grad_norm": 0.5035424828529358, + "grad_norm": 0.49757471680641174, "learning_rate": 7.945614958716658e-06, - "loss": 0.7773, + "loss": 0.7774, "num_input_tokens_seen": 128204800, "step": 15650 }, { "epoch": 0.7519807923169267, - "grad_norm": 0.48581522703170776, + "grad_norm": 0.48207026720046997, "learning_rate": 7.916669213282107e-06, - "loss": 0.8304, + "loss": 0.831, "num_input_tokens_seen": 128286720, "step": 15660 }, { "epoch": 0.7524609843937575, - "grad_norm": 0.5436628460884094, + "grad_norm": 0.5513833165168762, "learning_rate": 7.88776636671362e-06, - "loss": 1.1004, + "loss": 1.1006, "num_input_tokens_seen": 128368640, "step": 15670 }, { "epoch": 0.7529411764705882, - "grad_norm": 0.4647444784641266, + "grad_norm": 0.46379712224006653, "learning_rate": 7.858906491590697e-06, - "loss": 0.9029, + "loss": 0.903, "num_input_tokens_seen": 128450560, "step": 15680 }, { "epoch": 0.7534213685474189, - "grad_norm": 0.4686352014541626, + "grad_norm": 0.4677363336086273, "learning_rate": 7.830089660384895e-06, - "loss": 0.8456, + "loss": 0.8458, "num_input_tokens_seen": 128532480, "step": 15690 }, { "epoch": 0.7539015606242497, - "grad_norm": 0.4896450340747833, + "grad_norm": 0.49112623929977417, "learning_rate": 7.801315945459714e-06, - "loss": 1.3139, + "loss": 1.3148, "num_input_tokens_seen": 128614400, "step": 15700 }, { "epoch": 0.7543817527010804, - "grad_norm": 0.5118237137794495, + "grad_norm": 0.5155120491981506, "learning_rate": 7.772585419070374e-06, - "loss": 0.931, + "loss": 0.9315, "num_input_tokens_seen": 128696320, "step": 15710 }, { "epoch": 0.7548619447779111, - "grad_norm": 0.4990551471710205, + "grad_norm": 0.5028974413871765, "learning_rate": 7.743898153363625e-06, - "loss": 0.7754, + "loss": 0.7756, "num_input_tokens_seen": 128778240, "step": 15720 }, { "epoch": 0.7553421368547419, - "grad_norm": 0.47710755467414856, + "grad_norm": 0.3549368679523468, "learning_rate": 7.715254220377596e-06, - "loss": 0.8358, + "loss": 0.8351, "num_input_tokens_seen": 128860160, "step": 15730 }, { "epoch": 0.7558223289315726, - "grad_norm": 1.072224497795105, + "grad_norm": 1.1180391311645508, "learning_rate": 7.686653692041615e-06, - "loss": 0.7976, + "loss": 0.7978, "num_input_tokens_seen": 128942080, "step": 15740 }, { "epoch": 0.7563025210084033, - "grad_norm": 0.48185160756111145, + "grad_norm": 0.48073238134384155, "learning_rate": 7.658096640175988e-06, - "loss": 0.834, + "loss": 0.8335, "num_input_tokens_seen": 129024000, "step": 15750 }, { "epoch": 0.7567827130852341, - "grad_norm": 0.5147521495819092, + "grad_norm": 0.5103764533996582, "learning_rate": 7.629583136491844e-06, - "loss": 0.78, + "loss": 0.7803, "num_input_tokens_seen": 129105920, "step": 15760 }, { "epoch": 0.7572629051620648, - "grad_norm": 0.5172405242919922, + "grad_norm": 0.5050859451293945, "learning_rate": 7.601113252590991e-06, - "loss": 0.855, + "loss": 0.8551, "num_input_tokens_seen": 129187840, "step": 15770 }, { "epoch": 0.7577430972388955, - "grad_norm": 0.5000312328338623, + "grad_norm": 0.4922633469104767, "learning_rate": 7.572687059965661e-06, - "loss": 1.0542, + "loss": 1.0544, "num_input_tokens_seen": 129269760, "step": 15780 }, { "epoch": 0.7582232893157262, - "grad_norm": 0.5540167689323425, + "grad_norm": 0.5366300344467163, "learning_rate": 7.544304629998389e-06, - "loss": 0.9999, + "loss": 0.9997, "num_input_tokens_seen": 129351680, "step": 15790 }, { "epoch": 0.758703481392557, - "grad_norm": 0.4820740222930908, + "grad_norm": 0.4798520505428314, "learning_rate": 7.51596603396183e-06, - "loss": 0.8808, + "loss": 0.8806, "num_input_tokens_seen": 129433600, "step": 15800 }, { "epoch": 0.7591836734693878, - "grad_norm": 0.5026702880859375, + "grad_norm": 0.49928078055381775, "learning_rate": 7.4876713430185265e-06, - "loss": 0.9648, + "loss": 0.9649, "num_input_tokens_seen": 129515520, "step": 15810 }, { "epoch": 0.7596638655462185, - "grad_norm": 0.4794592261314392, + "grad_norm": 0.48000165820121765, "learning_rate": 7.4594206282208e-06, - "loss": 0.9263, + "loss": 0.926, "num_input_tokens_seen": 129597440, "step": 15820 }, { "epoch": 0.7601440576230493, - "grad_norm": 0.4958498477935791, + "grad_norm": 0.49403661489486694, "learning_rate": 7.431213960510544e-06, - "loss": 0.8364, + "loss": 0.8361, "num_input_tokens_seen": 129679360, "step": 15830 }, { "epoch": 0.76062424969988, - "grad_norm": 0.4976142644882202, + "grad_norm": 0.49990224838256836, "learning_rate": 7.40305141071902e-06, "loss": 1.068, "num_input_tokens_seen": 129761280, @@ -12682,15 +12682,15 @@ }, { "epoch": 0.7611044417767107, - "grad_norm": 0.4213819205760956, + "grad_norm": 0.4160490930080414, "learning_rate": 7.374933049566704e-06, - "loss": 0.7245, + "loss": 0.724, "num_input_tokens_seen": 129843200, "step": 15850 }, { "epoch": 0.7615846338535415, - "grad_norm": 0.49244338274002075, + "grad_norm": 0.49283355474472046, "learning_rate": 7.346858947663138e-06, "loss": 0.8833, "num_input_tokens_seen": 129925120, @@ -12698,87 +12698,87 @@ }, { "epoch": 0.7620648259303722, - "grad_norm": 0.385324090719223, + "grad_norm": 0.39053088426589966, "learning_rate": 7.318829175506684e-06, - "loss": 1.0079, + "loss": 1.0074, "num_input_tokens_seen": 130007040, "step": 15870 }, { "epoch": 0.7625450180072029, - "grad_norm": 0.6321762800216675, + "grad_norm": 0.6311637163162231, "learning_rate": 7.290843803484409e-06, - "loss": 0.9743, + "loss": 0.9735, "num_input_tokens_seen": 130088960, "step": 15880 }, { "epoch": 0.7630252100840336, - "grad_norm": 0.5179278254508972, + "grad_norm": 0.5157874226570129, "learning_rate": 7.262902901871885e-06, - "loss": 0.923, + "loss": 0.9231, "num_input_tokens_seen": 130170880, "step": 15890 }, { "epoch": 0.7635054021608644, - "grad_norm": 0.48879900574684143, + "grad_norm": 0.4867972135543823, "learning_rate": 7.235006540832995e-06, - "loss": 0.947, + "loss": 0.9472, "num_input_tokens_seen": 130252800, "step": 15900 }, { "epoch": 0.7639855942376951, - "grad_norm": 0.4765695631504059, + "grad_norm": 0.4728543758392334, "learning_rate": 7.207154790419784e-06, - "loss": 0.8526, + "loss": 0.8525, "num_input_tokens_seen": 130334720, "step": 15910 }, { "epoch": 0.7644657863145258, - "grad_norm": 0.5195481181144714, + "grad_norm": 0.5194888114929199, "learning_rate": 7.179347720572288e-06, - "loss": 1.0454, + "loss": 1.0461, "num_input_tokens_seen": 130416640, "step": 15920 }, { "epoch": 0.7649459783913566, - "grad_norm": 0.48631834983825684, + "grad_norm": 0.48327717185020447, "learning_rate": 7.151585401118316e-06, - "loss": 0.7704, + "loss": 0.77, "num_input_tokens_seen": 130498560, "step": 15930 }, { "epoch": 0.7654261704681873, - "grad_norm": 0.4827914237976074, + "grad_norm": 0.4789976179599762, "learning_rate": 7.12386790177331e-06, - "loss": 0.8247, + "loss": 0.8252, "num_input_tokens_seen": 130580480, "step": 15940 }, { "epoch": 0.765906362545018, - "grad_norm": 0.21842628717422485, + "grad_norm": 0.2211165875196457, "learning_rate": 7.096195292140173e-06, - "loss": 0.8725, + "loss": 0.8723, "num_input_tokens_seen": 130662400, "step": 15950 }, { "epoch": 0.7663865546218488, - "grad_norm": 0.46142032742500305, + "grad_norm": 0.45980551838874817, "learning_rate": 7.06856764170907e-06, - "loss": 0.7432, + "loss": 0.7436, "num_input_tokens_seen": 130744320, "step": 15960 }, { "epoch": 0.7668667466986795, - "grad_norm": 0.5066096186637878, + "grad_norm": 0.5044180154800415, "learning_rate": 7.040985019857274e-06, "loss": 0.959, "num_input_tokens_seen": 130826240, @@ -12786,23 +12786,23 @@ }, { "epoch": 0.7673469387755102, - "grad_norm": 0.4842053949832916, + "grad_norm": 0.48258551955223083, "learning_rate": 7.013447495848996e-06, - "loss": 1.1146, + "loss": 1.1143, "num_input_tokens_seen": 130908160, "step": 15980 }, { "epoch": 0.7678271308523409, - "grad_norm": 0.5671928524971008, + "grad_norm": 0.5670498609542847, "learning_rate": 6.985955138835162e-06, - "loss": 0.9096, + "loss": 0.9102, "num_input_tokens_seen": 130990080, "step": 15990 }, { "epoch": 0.7683073229291717, - "grad_norm": 0.4887046813964844, + "grad_norm": 0.48566895723342896, "learning_rate": 6.958508017853319e-06, "loss": 0.9424, "num_input_tokens_seen": 131072000, @@ -12810,7 +12810,7 @@ }, { "epoch": 0.7687875150060024, - "grad_norm": 0.5106115341186523, + "grad_norm": 0.5094760656356812, "learning_rate": 6.931106201827397e-06, "loss": 0.905, "num_input_tokens_seen": 131153920, @@ -12818,7 +12818,7 @@ }, { "epoch": 0.7692677070828331, - "grad_norm": 0.4807291626930237, + "grad_norm": 0.48024067282676697, "learning_rate": 6.903749759567557e-06, "loss": 0.8972, "num_input_tokens_seen": 131235840, @@ -12826,15 +12826,15 @@ }, { "epoch": 0.7697478991596639, - "grad_norm": 0.4951104521751404, + "grad_norm": 0.4950743019580841, "learning_rate": 6.876438759770037e-06, - "loss": 0.8672, + "loss": 0.8669, "num_input_tokens_seen": 131317760, "step": 16030 }, { "epoch": 0.7702280912364946, - "grad_norm": 0.5364571213722229, + "grad_norm": 0.5339250564575195, "learning_rate": 6.8491732710169344e-06, "loss": 1.0239, "num_input_tokens_seen": 131399680, @@ -12842,47 +12842,47 @@ }, { "epoch": 0.7707082833133253, - "grad_norm": 0.4739181399345398, + "grad_norm": 0.4749922454357147, "learning_rate": 6.821953361776093e-06, - "loss": 0.7971, + "loss": 0.7977, "num_input_tokens_seen": 131481600, "step": 16050 }, { "epoch": 0.7711884753901561, - "grad_norm": 1.0601301193237305, + "grad_norm": 1.0639270544052124, "learning_rate": 6.7947791004008665e-06, - "loss": 0.8697, + "loss": 0.8704, "num_input_tokens_seen": 131563520, "step": 16060 }, { "epoch": 0.7716686674669868, - "grad_norm": 0.3752616047859192, + "grad_norm": 0.3700399100780487, "learning_rate": 6.767650555130009e-06, - "loss": 0.8022, + "loss": 0.8018, "num_input_tokens_seen": 131645440, "step": 16070 }, { "epoch": 0.7721488595438175, - "grad_norm": 0.5150649547576904, + "grad_norm": 0.5122857689857483, "learning_rate": 6.740567794087463e-06, - "loss": 0.9051, + "loss": 0.9043, "num_input_tokens_seen": 131727360, "step": 16080 }, { "epoch": 0.7726290516206482, - "grad_norm": 0.5848947763442993, + "grad_norm": 0.5689642429351807, "learning_rate": 6.713530885282188e-06, - "loss": 1.1109, + "loss": 1.1105, "num_input_tokens_seen": 131809280, "step": 16090 }, { "epoch": 0.773109243697479, - "grad_norm": 0.5849442481994629, + "grad_norm": 0.5890742540359497, "learning_rate": 6.686539896608016e-06, "loss": 0.781, "num_input_tokens_seen": 131891200, @@ -12890,63 +12890,63 @@ }, { "epoch": 0.7735894357743097, - "grad_norm": 0.47524645924568176, + "grad_norm": 0.4772266149520874, "learning_rate": 6.659594895843477e-06, - "loss": 0.7884, + "loss": 0.7886, "num_input_tokens_seen": 131973120, "step": 16110 }, { "epoch": 0.7740696278511404, - "grad_norm": 0.46588289737701416, + "grad_norm": 0.4759899377822876, "learning_rate": 6.632695950651594e-06, - "loss": 0.8852, + "loss": 0.8849, "num_input_tokens_seen": 132055040, "step": 16120 }, { "epoch": 0.7745498199279712, - "grad_norm": 0.6264036893844604, + "grad_norm": 0.6416568160057068, "learning_rate": 6.605843128579739e-06, - "loss": 0.9598, + "loss": 0.9594, "num_input_tokens_seen": 132136960, "step": 16130 }, { "epoch": 0.7750300120048019, - "grad_norm": 0.47402629256248474, + "grad_norm": 0.47457703948020935, "learning_rate": 6.579036497059482e-06, - "loss": 0.8386, + "loss": 0.8384, "num_input_tokens_seen": 132218880, "step": 16140 }, { "epoch": 0.7755102040816326, - "grad_norm": 0.4720049500465393, + "grad_norm": 0.47239863872528076, "learning_rate": 6.552276123406384e-06, - "loss": 0.8359, + "loss": 0.8364, "num_input_tokens_seen": 132300800, "step": 16150 }, { "epoch": 0.7759903961584634, - "grad_norm": 0.4943426847457886, + "grad_norm": 0.4923716187477112, "learning_rate": 6.525562074819852e-06, - "loss": 1.043, + "loss": 1.0425, "num_input_tokens_seen": 132382720, "step": 16160 }, { "epoch": 0.7764705882352941, - "grad_norm": 0.5186018347740173, + "grad_norm": 0.5167580246925354, "learning_rate": 6.4988944183829695e-06, - "loss": 0.9455, + "loss": 0.9449, "num_input_tokens_seen": 132464640, "step": 16170 }, { "epoch": 0.7769507803121248, - "grad_norm": 0.5227720141410828, + "grad_norm": 0.5179154276847839, "learning_rate": 6.472273221062305e-06, "loss": 1.1211, "num_input_tokens_seen": 132546560, @@ -12954,423 +12954,423 @@ }, { "epoch": 0.7774309723889555, - "grad_norm": 0.491787850856781, + "grad_norm": 0.49237915873527527, "learning_rate": 6.445698549707776e-06, - "loss": 1.0617, + "loss": 1.0615, "num_input_tokens_seen": 132628480, "step": 16190 }, { "epoch": 0.7779111644657863, - "grad_norm": 0.5019481778144836, + "grad_norm": 0.5016152262687683, "learning_rate": 6.419170471052472e-06, - "loss": 0.7963, + "loss": 0.7958, "num_input_tokens_seen": 132710400, "step": 16200 }, { "epoch": 0.778391356542617, - "grad_norm": 0.5009409189224243, + "grad_norm": 0.5024312734603882, "learning_rate": 6.392689051712458e-06, - "loss": 0.9802, + "loss": 0.9789, "num_input_tokens_seen": 132792320, "step": 16210 }, { "epoch": 0.7788715486194477, - "grad_norm": 0.5145575404167175, + "grad_norm": 0.5140400528907776, "learning_rate": 6.3662543581866405e-06, - "loss": 1.0569, + "loss": 1.0557, "num_input_tokens_seen": 132874240, "step": 16220 }, { "epoch": 0.7793517406962785, - "grad_norm": 0.4958726167678833, + "grad_norm": 0.49644625186920166, "learning_rate": 6.339866456856608e-06, - "loss": 0.7891, + "loss": 0.7892, "num_input_tokens_seen": 132956160, "step": 16230 }, { "epoch": 0.7798319327731092, - "grad_norm": 0.49058905243873596, + "grad_norm": 0.4914844036102295, "learning_rate": 6.313525413986415e-06, - "loss": 0.9846, + "loss": 0.9853, "num_input_tokens_seen": 133038080, "step": 16240 }, { "epoch": 0.78031212484994, - "grad_norm": 0.48392897844314575, + "grad_norm": 0.48480865359306335, "learning_rate": 6.28723129572247e-06, - "loss": 0.8808, + "loss": 0.8806, "num_input_tokens_seen": 133120000, "step": 16250 }, { "epoch": 0.7807923169267708, - "grad_norm": 0.5041011571884155, + "grad_norm": 0.504016637802124, "learning_rate": 6.260984168093353e-06, - "loss": 0.8577, + "loss": 0.8576, "num_input_tokens_seen": 133201920, "step": 16260 }, { "epoch": 0.7812725090036015, - "grad_norm": 0.49976569414138794, + "grad_norm": 0.5014110803604126, "learning_rate": 6.234784097009608e-06, - "loss": 0.9177, + "loss": 0.9178, "num_input_tokens_seen": 133283840, "step": 16270 }, { "epoch": 0.7817527010804322, - "grad_norm": 1.3650128841400146, + "grad_norm": 1.3052635192871094, "learning_rate": 6.208631148263649e-06, - "loss": 0.8475, + "loss": 0.8479, "num_input_tokens_seen": 133365760, "step": 16280 }, { "epoch": 0.7822328931572629, - "grad_norm": 1.7946758270263672, + "grad_norm": 1.8097127676010132, "learning_rate": 6.18252538752955e-06, - "loss": 0.6443, + "loss": 0.6448, "num_input_tokens_seen": 133447680, "step": 16290 }, { "epoch": 0.7827130852340937, - "grad_norm": 0.47387173771858215, + "grad_norm": 0.5451312065124512, "learning_rate": 6.156466880362877e-06, - "loss": 0.7558, + "loss": 0.7555, "num_input_tokens_seen": 133529600, "step": 16300 }, { "epoch": 0.7831932773109244, - "grad_norm": 0.47302481532096863, + "grad_norm": 0.47235605120658875, "learning_rate": 6.1304556922005315e-06, - "loss": 0.8476, + "loss": 0.8473, "num_input_tokens_seen": 133611520, "step": 16310 }, { "epoch": 0.7836734693877551, - "grad_norm": 0.6125491261482239, + "grad_norm": 0.5912697911262512, "learning_rate": 6.1044918883606225e-06, - "loss": 0.8751, + "loss": 0.8754, "num_input_tokens_seen": 133693440, "step": 16320 }, { "epoch": 0.7841536614645859, - "grad_norm": 0.6477288007736206, + "grad_norm": 0.629851222038269, "learning_rate": 6.078575534042222e-06, - "loss": 0.9734, + "loss": 0.9731, "num_input_tokens_seen": 133775360, "step": 16330 }, { "epoch": 0.7846338535414166, - "grad_norm": 1.2066618204116821, + "grad_norm": 1.2804758548736572, "learning_rate": 6.052706694325292e-06, - "loss": 0.9515, + "loss": 0.952, "num_input_tokens_seen": 133857280, "step": 16340 }, { "epoch": 0.7851140456182473, - "grad_norm": 0.5165063142776489, + "grad_norm": 0.5203343629837036, "learning_rate": 6.026885434170457e-06, - "loss": 0.9599, + "loss": 0.9592, "num_input_tokens_seen": 133939200, "step": 16350 }, { "epoch": 0.7855942376950781, - "grad_norm": 1.0127323865890503, + "grad_norm": 1.0266387462615967, "learning_rate": 6.001111818418859e-06, - "loss": 0.975, + "loss": 0.9753, "num_input_tokens_seen": 134021120, "step": 16360 }, { "epoch": 0.7860744297719088, - "grad_norm": 0.5036996603012085, + "grad_norm": 0.5004794001579285, "learning_rate": 5.975385911792006e-06, - "loss": 0.9928, + "loss": 0.9931, "num_input_tokens_seen": 134103040, "step": 16370 }, { "epoch": 0.7865546218487395, - "grad_norm": 0.49564144015312195, + "grad_norm": 0.4966083765029907, "learning_rate": 5.9497077788916055e-06, - "loss": 1.0484, + "loss": 1.0476, "num_input_tokens_seen": 134184960, "step": 16380 }, { "epoch": 0.7870348139255702, - "grad_norm": 1.0678465366363525, + "grad_norm": 1.0581490993499756, "learning_rate": 5.924077484199389e-06, - "loss": 0.832, + "loss": 0.8325, "num_input_tokens_seen": 134266880, "step": 16390 }, { "epoch": 0.787515006002401, - "grad_norm": 0.4710373878479004, + "grad_norm": 0.4584285616874695, "learning_rate": 5.89849509207695e-06, - "loss": 0.7669, + "loss": 0.7674, "num_input_tokens_seen": 134348800, "step": 16400 }, { "epoch": 0.7879951980792317, - "grad_norm": 0.48212987184524536, + "grad_norm": 0.48210394382476807, "learning_rate": 5.872960666765618e-06, - "loss": 0.9089, + "loss": 0.909, "num_input_tokens_seen": 134430720, "step": 16410 }, { "epoch": 0.7884753901560624, - "grad_norm": 1.6406066417694092, + "grad_norm": 1.591680884361267, "learning_rate": 5.847474272386239e-06, - "loss": 1.5337, + "loss": 1.5853, "num_input_tokens_seen": 134512640, "step": 16420 }, { "epoch": 0.7889555822328932, - "grad_norm": 0.5122446417808533, + "grad_norm": 0.5121796727180481, "learning_rate": 5.822035972939069e-06, - "loss": 0.8218, + "loss": 0.8213, "num_input_tokens_seen": 134594560, "step": 16430 }, { "epoch": 0.7894357743097239, - "grad_norm": 0.4671303927898407, + "grad_norm": 0.4680531322956085, "learning_rate": 5.79664583230359e-06, - "loss": 0.8714, + "loss": 0.8711, "num_input_tokens_seen": 134676480, "step": 16440 }, { "epoch": 0.7899159663865546, - "grad_norm": 0.5007370710372925, + "grad_norm": 0.5005565285682678, "learning_rate": 5.771303914238333e-06, - "loss": 0.7859, + "loss": 0.7858, "num_input_tokens_seen": 134758400, "step": 16450 }, { "epoch": 0.7903961584633854, - "grad_norm": 0.4834369122982025, + "grad_norm": 0.48560014367103577, "learning_rate": 5.746010282380745e-06, - "loss": 0.8862, + "loss": 0.8857, "num_input_tokens_seen": 134840320, "step": 16460 }, { "epoch": 0.7908763505402161, - "grad_norm": 0.3713008761405945, + "grad_norm": 0.3735353946685791, "learning_rate": 5.7207650002470274e-06, - "loss": 0.7945, + "loss": 0.7948, "num_input_tokens_seen": 134922240, "step": 16470 }, { "epoch": 0.7913565426170468, - "grad_norm": 0.6575520634651184, + "grad_norm": 0.6581428050994873, "learning_rate": 5.695568131231949e-06, - "loss": 0.8114, + "loss": 0.8112, "num_input_tokens_seen": 135004160, "step": 16480 }, { "epoch": 0.7918367346938775, - "grad_norm": 0.5816577672958374, + "grad_norm": 0.5894924402236938, "learning_rate": 5.670419738608723e-06, - "loss": 0.7961, + "loss": 0.7962, "num_input_tokens_seen": 135086080, "step": 16490 }, { "epoch": 0.7923169267707083, - "grad_norm": 0.7607868313789368, + "grad_norm": 0.7610780596733093, "learning_rate": 5.645319885528824e-06, - "loss": 0.867, + "loss": 0.8665, "num_input_tokens_seen": 135168000, "step": 16500 }, { "epoch": 0.792797118847539, - "grad_norm": 0.49401038885116577, + "grad_norm": 0.4935015141963959, "learning_rate": 5.620268635021825e-06, - "loss": 0.8139, + "loss": 0.8142, "num_input_tokens_seen": 135249920, "step": 16510 }, { "epoch": 0.7932773109243697, - "grad_norm": 0.5071017742156982, + "grad_norm": 0.5041617155075073, "learning_rate": 5.595266049995268e-06, - "loss": 0.9577, + "loss": 0.9578, "num_input_tokens_seen": 135331840, "step": 16520 }, { "epoch": 0.7937575030012005, - "grad_norm": 0.5186456441879272, + "grad_norm": 0.524131715297699, "learning_rate": 5.5703121932344896e-06, - "loss": 0.9578, + "loss": 0.9576, "num_input_tokens_seen": 135413760, "step": 16530 }, { "epoch": 0.7942376950780312, - "grad_norm": 0.5309847593307495, + "grad_norm": 0.5360773205757141, "learning_rate": 5.5454071274024436e-06, - "loss": 0.8885, + "loss": 0.8884, "num_input_tokens_seen": 135495680, "step": 16540 }, { "epoch": 0.7947178871548619, - "grad_norm": 0.5009894967079163, + "grad_norm": 0.5002300143241882, "learning_rate": 5.520550915039579e-06, - "loss": 0.9609, + "loss": 0.961, "num_input_tokens_seen": 135577600, "step": 16550 }, { "epoch": 0.7951980792316927, - "grad_norm": 0.6387041211128235, + "grad_norm": 0.5231886506080627, "learning_rate": 5.495743618563668e-06, - "loss": 0.8386, + "loss": 0.838, "num_input_tokens_seen": 135659520, "step": 16560 }, { "epoch": 0.7956782713085234, - "grad_norm": 0.4808179438114166, + "grad_norm": 0.4792463779449463, "learning_rate": 5.4709853002696236e-06, - "loss": 0.8747, + "loss": 0.8735, "num_input_tokens_seen": 135741440, "step": 16570 }, { "epoch": 0.7961584633853541, - "grad_norm": 0.49355682730674744, + "grad_norm": 0.4909564256668091, "learning_rate": 5.4462760223294e-06, - "loss": 0.8467, + "loss": 0.8468, "num_input_tokens_seen": 135823360, "step": 16580 }, { "epoch": 0.7966386554621848, - "grad_norm": 0.4921320378780365, + "grad_norm": 0.4909401535987854, "learning_rate": 5.42161584679178e-06, - "loss": 0.8295, + "loss": 0.8294, "num_input_tokens_seen": 135905280, "step": 16590 }, { "epoch": 0.7971188475390156, - "grad_norm": 0.4814390242099762, + "grad_norm": 0.4839801490306854, "learning_rate": 5.397004835582242e-06, - "loss": 0.8218, + "loss": 0.8217, "num_input_tokens_seen": 135987200, "step": 16600 }, { "epoch": 0.7975990396158463, - "grad_norm": 0.5022986531257629, + "grad_norm": 0.5041458606719971, "learning_rate": 5.372443050502823e-06, - "loss": 0.8522, + "loss": 0.8524, "num_input_tokens_seen": 136069120, "step": 16610 }, { "epoch": 0.798079231692677, - "grad_norm": 0.48835310339927673, + "grad_norm": 0.4912269711494446, "learning_rate": 5.347930553231942e-06, - "loss": 0.8879, + "loss": 0.8883, "num_input_tokens_seen": 136151040, "step": 16620 }, { "epoch": 0.7985594237695078, - "grad_norm": 0.48353898525238037, + "grad_norm": 0.48488718271255493, "learning_rate": 5.323467405324226e-06, - "loss": 1.0158, + "loss": 1.0163, "num_input_tokens_seen": 136232960, "step": 16630 }, { "epoch": 0.7990396158463385, - "grad_norm": 0.5025900602340698, + "grad_norm": 0.4990125596523285, "learning_rate": 5.299053668210402e-06, - "loss": 1.0689, + "loss": 1.069, "num_input_tokens_seen": 136314880, "step": 16640 }, { "epoch": 0.7995198079231692, - "grad_norm": 0.4869900345802307, + "grad_norm": 0.4874297082424164, "learning_rate": 5.274689403197119e-06, - "loss": 0.7811, + "loss": 0.7804, "num_input_tokens_seen": 136396800, "step": 16650 }, { "epoch": 0.8, - "grad_norm": 0.5168715119361877, + "grad_norm": 0.5153248310089111, "learning_rate": 5.250374671466776e-06, - "loss": 0.8692, + "loss": 0.8694, "num_input_tokens_seen": 136478720, "step": 16660 }, { "epoch": 0.8004801920768307, - "grad_norm": 0.5069206357002258, + "grad_norm": 0.5061455965042114, "learning_rate": 5.2261095340774085e-06, - "loss": 1.1452, + "loss": 1.145, "num_input_tokens_seen": 136560640, "step": 16670 }, { "epoch": 0.8009603841536614, - "grad_norm": 0.4853163957595825, + "grad_norm": 0.4891180098056793, "learning_rate": 5.201894051962486e-06, - "loss": 1.0006, + "loss": 0.9997, "num_input_tokens_seen": 136642560, "step": 16680 }, { "epoch": 0.8014405762304923, - "grad_norm": 0.4831332862377167, + "grad_norm": 0.48567453026771545, "learning_rate": 5.177728285930816e-06, - "loss": 0.897, + "loss": 0.8972, "num_input_tokens_seen": 136724480, "step": 16690 }, { "epoch": 0.801920768307323, - "grad_norm": 0.5170623660087585, + "grad_norm": 0.5174875259399414, "learning_rate": 5.153612296666335e-06, - "loss": 0.7989, + "loss": 0.7987, "num_input_tokens_seen": 136806400, "step": 16700 }, { "epoch": 0.8024009603841536, - "grad_norm": 0.47998228669166565, + "grad_norm": 0.4791352152824402, "learning_rate": 5.129546144727998e-06, "loss": 0.9656, "num_input_tokens_seen": 136888320, @@ -13378,47 +13378,47 @@ }, { "epoch": 0.8028811524609843, - "grad_norm": 0.4863240420818329, + "grad_norm": 0.4892936050891876, "learning_rate": 5.105529890549618e-06, - "loss": 0.8726, + "loss": 0.873, "num_input_tokens_seen": 136970240, "step": 16720 }, { "epoch": 0.8033613445378152, - "grad_norm": 0.4388894736766815, + "grad_norm": 0.4297828674316406, "learning_rate": 5.081563594439676e-06, - "loss": 0.7868, + "loss": 0.7871, "num_input_tokens_seen": 137052160, "step": 16730 }, { "epoch": 0.8038415366146459, - "grad_norm": 0.4876529276371002, + "grad_norm": 0.4883136749267578, "learning_rate": 5.057647316581232e-06, - "loss": 1.0533, + "loss": 1.0537, "num_input_tokens_seen": 137134080, "step": 16740 }, { "epoch": 0.8043217286914766, - "grad_norm": 0.47916775941848755, + "grad_norm": 0.4795672297477722, "learning_rate": 5.033781117031738e-06, - "loss": 0.9322, + "loss": 0.9321, "num_input_tokens_seen": 137216000, "step": 16750 }, { "epoch": 0.8048019207683074, - "grad_norm": 0.47834011912345886, + "grad_norm": 0.4753381013870239, "learning_rate": 5.0099650557228785e-06, - "loss": 1.0616, + "loss": 1.0611, "num_input_tokens_seen": 137297920, "step": 16760 }, { "epoch": 0.8052821128451381, - "grad_norm": 0.5335723757743835, + "grad_norm": 0.5331737399101257, "learning_rate": 4.986199192460428e-06, "loss": 0.8815, "num_input_tokens_seen": 137379840, @@ -13426,47 +13426,47 @@ }, { "epoch": 0.8057623049219688, - "grad_norm": 0.5083367228507996, + "grad_norm": 0.5075027346611023, "learning_rate": 4.962483586924136e-06, - "loss": 0.7737, + "loss": 0.7744, "num_input_tokens_seen": 137461760, "step": 16780 }, { "epoch": 0.8062424969987996, - "grad_norm": 0.4924291968345642, + "grad_norm": 0.490078866481781, "learning_rate": 4.93881829866751e-06, - "loss": 0.8263, + "loss": 0.8282, "num_input_tokens_seen": 137543680, "step": 16790 }, { "epoch": 0.8067226890756303, - "grad_norm": 0.5008875131607056, + "grad_norm": 0.5007233023643494, "learning_rate": 4.915203387117736e-06, - "loss": 0.9976, + "loss": 0.9975, "num_input_tokens_seen": 137625600, "step": 16800 }, { "epoch": 0.807202881152461, - "grad_norm": 0.5046831369400024, + "grad_norm": 0.5057447552680969, "learning_rate": 4.891638911575483e-06, - "loss": 0.8821, + "loss": 0.882, "num_input_tokens_seen": 137707520, "step": 16810 }, { "epoch": 0.8076830732292917, - "grad_norm": 0.5597212910652161, + "grad_norm": 0.5599965453147888, "learning_rate": 4.868124931214752e-06, - "loss": 0.9083, + "loss": 0.9084, "num_input_tokens_seen": 137789440, "step": 16820 }, { "epoch": 0.8081632653061225, - "grad_norm": 0.5007030367851257, + "grad_norm": 0.5017980933189392, "learning_rate": 4.844661505082768e-06, "loss": 0.8356, "num_input_tokens_seen": 137871360, @@ -13474,23 +13474,23 @@ }, { "epoch": 0.8086434573829532, - "grad_norm": 0.3486262857913971, + "grad_norm": 0.3630382716655731, "learning_rate": 4.8212486920998005e-06, - "loss": 0.7684, + "loss": 0.7691, "num_input_tokens_seen": 137953280, "step": 16840 }, { "epoch": 0.8091236494597839, - "grad_norm": 0.4673330783843994, + "grad_norm": 0.46930795907974243, "learning_rate": 4.797886551059011e-06, - "loss": 0.9145, + "loss": 0.915, "num_input_tokens_seen": 138035200, "step": 16850 }, { "epoch": 0.8096038415366147, - "grad_norm": 0.4685009717941284, + "grad_norm": 0.46570199728012085, "learning_rate": 4.7745751406263165e-06, "loss": 0.9692, "num_input_tokens_seen": 138117120, @@ -13498,207 +13498,207 @@ }, { "epoch": 0.8100840336134454, - "grad_norm": 0.4861032962799072, + "grad_norm": 0.48233890533447266, "learning_rate": 4.751314519340258e-06, - "loss": 0.8577, + "loss": 0.8575, "num_input_tokens_seen": 138199040, "step": 16870 }, { "epoch": 0.8105642256902761, - "grad_norm": 0.5741161108016968, + "grad_norm": 0.5723303556442261, "learning_rate": 4.728104745611814e-06, - "loss": 0.8875, + "loss": 0.8863, "num_input_tokens_seen": 138280960, "step": 16880 }, { "epoch": 0.8110444177671069, - "grad_norm": 0.5122044682502747, + "grad_norm": 0.5114380121231079, "learning_rate": 4.704945877724295e-06, - "loss": 1.0989, + "loss": 1.099, "num_input_tokens_seen": 138362880, "step": 16890 }, { "epoch": 0.8115246098439376, - "grad_norm": 0.8650029897689819, + "grad_norm": 0.8760389089584351, "learning_rate": 4.681837973833181e-06, - "loss": 0.9019, + "loss": 0.9003, "num_input_tokens_seen": 138444800, "step": 16900 }, { "epoch": 0.8120048019207683, - "grad_norm": 0.49754172563552856, + "grad_norm": 0.49562937021255493, "learning_rate": 4.658781091965955e-06, - "loss": 0.8082, + "loss": 0.8087, "num_input_tokens_seen": 138526720, "step": 16910 }, { "epoch": 0.812484993997599, - "grad_norm": 0.45871591567993164, + "grad_norm": 0.4572142958641052, "learning_rate": 4.635775290021988e-06, - "loss": 0.8978, + "loss": 0.8977, "num_input_tokens_seen": 138608640, "step": 16920 }, { "epoch": 0.8129651860744298, - "grad_norm": 0.3866370916366577, + "grad_norm": 0.39572256803512573, "learning_rate": 4.612820625772391e-06, - "loss": 0.6356, + "loss": 0.6357, "num_input_tokens_seen": 138690560, "step": 16930 }, { "epoch": 0.8134453781512605, - "grad_norm": 0.6949254870414734, + "grad_norm": 0.6943681240081787, "learning_rate": 4.589917156859838e-06, - "loss": 0.8645, + "loss": 0.8636, "num_input_tokens_seen": 138772480, "step": 16940 }, { "epoch": 0.8139255702280912, - "grad_norm": 0.6086997985839844, + "grad_norm": 0.6021678447723389, "learning_rate": 4.5670649407984625e-06, - "loss": 0.8896, + "loss": 0.8897, "num_input_tokens_seen": 138854400, "step": 16950 }, { "epoch": 0.814405762304922, - "grad_norm": 0.4606131613254547, + "grad_norm": 0.4605492651462555, "learning_rate": 4.544264034973686e-06, - "loss": 0.7794, + "loss": 0.7806, "num_input_tokens_seen": 138936320, "step": 16960 }, { "epoch": 0.8148859543817527, - "grad_norm": 0.5047131180763245, + "grad_norm": 0.5034934282302856, "learning_rate": 4.521514496642074e-06, - "loss": 0.9631, + "loss": 0.9632, "num_input_tokens_seen": 139018240, "step": 16970 }, { "epoch": 0.8153661464585834, - "grad_norm": 0.5257802605628967, + "grad_norm": 0.5086869597434998, "learning_rate": 4.498816382931217e-06, - "loss": 0.9246, + "loss": 0.9245, "num_input_tokens_seen": 139100160, "step": 16980 }, { "epoch": 0.8158463385354142, - "grad_norm": 0.5585389733314514, + "grad_norm": 0.5553309321403503, "learning_rate": 4.476169750839571e-06, - "loss": 1.0151, + "loss": 1.015, "num_input_tokens_seen": 139182080, "step": 16990 }, { "epoch": 0.8163265306122449, - "grad_norm": 0.47912609577178955, + "grad_norm": 0.4763419032096863, "learning_rate": 4.45357465723629e-06, - "loss": 0.8939, + "loss": 0.8941, "num_input_tokens_seen": 139264000, "step": 17000 }, { "epoch": 0.8168067226890756, - "grad_norm": 0.4979766309261322, + "grad_norm": 0.49482813477516174, "learning_rate": 4.4310311588611294e-06, - "loss": 0.8846, + "loss": 0.8841, "num_input_tokens_seen": 139345920, "step": 17010 }, { "epoch": 0.8172869147659063, - "grad_norm": 0.4919578731060028, + "grad_norm": 0.4898833930492401, "learning_rate": 4.408539312324281e-06, - "loss": 0.8446, + "loss": 0.8452, "num_input_tokens_seen": 139427840, "step": 17020 }, { "epoch": 0.8177671068427371, - "grad_norm": 0.510263204574585, + "grad_norm": 0.5128147006034851, "learning_rate": 4.386099174106212e-06, - "loss": 0.8381, + "loss": 0.8382, "num_input_tokens_seen": 139509760, "step": 17030 }, { "epoch": 0.8182472989195678, - "grad_norm": 0.5191063284873962, + "grad_norm": 0.5220133066177368, "learning_rate": 4.363710800557566e-06, - "loss": 0.9471, + "loss": 0.9468, "num_input_tokens_seen": 139591680, "step": 17040 }, { "epoch": 0.8187274909963985, - "grad_norm": 0.5044844150543213, + "grad_norm": 0.5058349967002869, "learning_rate": 4.341374247898983e-06, - "loss": 0.7787, + "loss": 0.7798, "num_input_tokens_seen": 139673600, "step": 17050 }, { "epoch": 0.8192076830732293, - "grad_norm": 0.48283785581588745, + "grad_norm": 0.4795903265476227, "learning_rate": 4.3190895722209635e-06, - "loss": 0.9487, + "loss": 0.949, "num_input_tokens_seen": 139755520, "step": 17060 }, { "epoch": 0.81968787515006, - "grad_norm": 0.36586451530456543, + "grad_norm": 0.3831540048122406, "learning_rate": 4.296856829483759e-06, - "loss": 0.884, + "loss": 0.8836, "num_input_tokens_seen": 139837440, "step": 17070 }, { "epoch": 0.8201680672268907, - "grad_norm": 0.49161645770072937, + "grad_norm": 0.4896925687789917, "learning_rate": 4.274676075517206e-06, - "loss": 0.741, + "loss": 0.7409, "num_input_tokens_seen": 139919360, "step": 17080 }, { "epoch": 0.8206482593037215, - "grad_norm": 0.47819775342941284, + "grad_norm": 0.47725582122802734, "learning_rate": 4.252547366020568e-06, - "loss": 0.8349, + "loss": 0.8346, "num_input_tokens_seen": 140001280, "step": 17090 }, { "epoch": 0.8211284513805522, - "grad_norm": 0.48540183901786804, + "grad_norm": 0.4832172989845276, "learning_rate": 4.230470756562438e-06, - "loss": 0.9832, + "loss": 0.983, "num_input_tokens_seen": 140083200, "step": 17100 }, { "epoch": 0.8216086434573829, - "grad_norm": 0.7266910672187805, + "grad_norm": 0.7230331897735596, "learning_rate": 4.208446302580582e-06, - "loss": 0.7601, + "loss": 0.7598, "num_input_tokens_seen": 140165120, "step": 17110 }, { "epoch": 0.8220888355342136, - "grad_norm": 0.48397764563560486, + "grad_norm": 0.48454123735427856, "learning_rate": 4.186474059381768e-06, "loss": 0.7198, "num_input_tokens_seen": 140247040, @@ -13706,23 +13706,23 @@ }, { "epoch": 0.8225690276110444, - "grad_norm": 0.5694714188575745, + "grad_norm": 0.5540962815284729, "learning_rate": 4.164554082141683e-06, - "loss": 0.8547, + "loss": 0.8544, "num_input_tokens_seen": 140328960, "step": 17130 }, { "epoch": 0.8230492196878751, - "grad_norm": 1.3133982419967651, + "grad_norm": 1.901149034500122, "learning_rate": 4.142686425904752e-06, - "loss": 0.8511, + "loss": 0.8519, "num_input_tokens_seen": 140410880, "step": 17140 }, { "epoch": 0.8235294117647058, - "grad_norm": 0.48804813623428345, + "grad_norm": 0.48827600479125977, "learning_rate": 4.12087114558401e-06, "loss": 0.8076, "num_input_tokens_seen": 140492800, @@ -13730,23 +13730,23 @@ }, { "epoch": 0.8240096038415367, - "grad_norm": 0.4581037163734436, + "grad_norm": 0.4572823941707611, "learning_rate": 4.099108295960977e-06, - "loss": 0.7348, + "loss": 0.7344, "num_input_tokens_seen": 140574720, "step": 17160 }, { "epoch": 0.8244897959183674, - "grad_norm": 0.48801615834236145, + "grad_norm": 0.4875430464744568, "learning_rate": 4.077397931685523e-06, - "loss": 0.8413, + "loss": 0.8416, "num_input_tokens_seen": 140656640, "step": 17170 }, { "epoch": 0.824969987995198, - "grad_norm": 0.46194446086883545, + "grad_norm": 0.46101564168930054, "learning_rate": 4.055740107275685e-06, "loss": 1.1131, "num_input_tokens_seen": 140738560, @@ -13754,23 +13754,23 @@ }, { "epoch": 0.8254501800720289, - "grad_norm": 0.5051575899124146, + "grad_norm": 0.5040237903594971, "learning_rate": 4.0341348771175955e-06, - "loss": 0.9822, + "loss": 0.9831, "num_input_tokens_seen": 140820480, "step": 17190 }, { "epoch": 0.8259303721488596, - "grad_norm": 0.4934830367565155, + "grad_norm": 0.492987722158432, "learning_rate": 4.012582295465308e-06, - "loss": 0.834, + "loss": 0.8343, "num_input_tokens_seen": 140902400, "step": 17200 }, { "epoch": 0.8264105642256903, - "grad_norm": 0.5021478533744812, + "grad_norm": 0.499546080827713, "learning_rate": 3.991082416440656e-06, "loss": 0.8526, "num_input_tokens_seen": 140984320, @@ -13778,39 +13778,39 @@ }, { "epoch": 0.826890756302521, - "grad_norm": 0.4906045198440552, + "grad_norm": 0.4928174316883087, "learning_rate": 3.969635294033144e-06, - "loss": 0.794, + "loss": 0.7942, "num_input_tokens_seen": 141066240, "step": 17220 }, { "epoch": 0.8273709483793518, - "grad_norm": 0.49712374806404114, + "grad_norm": 0.49705326557159424, "learning_rate": 3.9482409820997826e-06, - "loss": 0.8676, + "loss": 0.8681, "num_input_tokens_seen": 141148160, "step": 17230 }, { "epoch": 0.8278511404561825, - "grad_norm": 0.4694906771183014, + "grad_norm": 0.4672960937023163, "learning_rate": 3.926899534364969e-06, - "loss": 0.9085, + "loss": 0.9081, "num_input_tokens_seen": 141230080, "step": 17240 }, { "epoch": 0.8283313325330132, - "grad_norm": 0.4903012216091156, + "grad_norm": 0.4908684492111206, "learning_rate": 3.90561100442036e-06, - "loss": 0.8688, + "loss": 0.8689, "num_input_tokens_seen": 141312000, "step": 17250 }, { "epoch": 0.828811524609844, - "grad_norm": 0.5216225385665894, + "grad_norm": 0.5210193395614624, "learning_rate": 3.8843754457247275e-06, "loss": 1.0517, "num_input_tokens_seen": 141393920, @@ -13818,7 +13818,7 @@ }, { "epoch": 0.8292917166866747, - "grad_norm": 0.48557716608047485, + "grad_norm": 0.48606058955192566, "learning_rate": 3.863192911603808e-06, "loss": 0.9196, "num_input_tokens_seen": 141475840, @@ -13826,95 +13826,95 @@ }, { "epoch": 0.8297719087635054, - "grad_norm": 0.4983423352241516, + "grad_norm": 0.49873122572898865, "learning_rate": 3.842063455250203e-06, - "loss": 0.9045, + "loss": 0.9042, "num_input_tokens_seen": 141557760, "step": 17280 }, { "epoch": 0.8302521008403362, - "grad_norm": 0.5038923621177673, + "grad_norm": 0.5023019313812256, "learning_rate": 3.820987129723228e-06, - "loss": 0.8666, + "loss": 0.867, "num_input_tokens_seen": 141639680, "step": 17290 }, { "epoch": 0.8307322929171669, - "grad_norm": 0.4999052584171295, + "grad_norm": 0.4990661144256592, "learning_rate": 3.799963987948757e-06, - "loss": 0.788, + "loss": 0.7882, "num_input_tokens_seen": 141721600, "step": 17300 }, { "epoch": 0.8312124849939976, - "grad_norm": 0.5127232074737549, + "grad_norm": 0.5133486986160278, "learning_rate": 3.7789940827191395e-06, - "loss": 0.8889, + "loss": 0.8897, "num_input_tokens_seen": 141803520, "step": 17310 }, { "epoch": 0.8316926770708283, - "grad_norm": 0.5200673937797546, + "grad_norm": 0.5203906297683716, "learning_rate": 3.7580774666930134e-06, - "loss": 0.8877, + "loss": 0.8876, "num_input_tokens_seen": 141885440, "step": 17320 }, { "epoch": 0.8321728691476591, - "grad_norm": 0.4389039874076843, + "grad_norm": 0.46600568294525146, "learning_rate": 3.737214192395225e-06, - "loss": 0.8935, + "loss": 0.8931, "num_input_tokens_seen": 141967360, "step": 17330 }, { "epoch": 0.8326530612244898, - "grad_norm": 0.481927752494812, + "grad_norm": 0.48274463415145874, "learning_rate": 3.7164043122166508e-06, - "loss": 1.0336, + "loss": 1.0343, "num_input_tokens_seen": 142049280, "step": 17340 }, { "epoch": 0.8331332533013205, - "grad_norm": 0.5148651599884033, + "grad_norm": 0.5128066539764404, "learning_rate": 3.6956478784140937e-06, - "loss": 1.0014, + "loss": 1.0005, "num_input_tokens_seen": 142131200, "step": 17350 }, { "epoch": 0.8336134453781513, - "grad_norm": 0.48852846026420593, + "grad_norm": 0.48960596323013306, "learning_rate": 3.674944943110156e-06, - "loss": 0.9075, + "loss": 0.9079, "num_input_tokens_seen": 142213120, "step": 17360 }, { "epoch": 0.834093637454982, - "grad_norm": 0.4977552592754364, + "grad_norm": 0.49447503685951233, "learning_rate": 3.6542955582930748e-06, - "loss": 0.7461, + "loss": 0.7459, "num_input_tokens_seen": 142295040, "step": 17370 }, { "epoch": 0.8345738295318127, - "grad_norm": 0.4385221600532532, + "grad_norm": 0.43031996488571167, "learning_rate": 3.6336997758166263e-06, - "loss": 0.9869, + "loss": 0.9866, "num_input_tokens_seen": 142376960, "step": 17380 }, { "epoch": 0.8350540216086435, - "grad_norm": 0.4688054919242859, + "grad_norm": 0.46145930886268616, "learning_rate": 3.6131576473999924e-06, "loss": 0.775, "num_input_tokens_seen": 142458880, @@ -13922,7 +13922,7 @@ }, { "epoch": 0.8355342136854742, - "grad_norm": 0.4902810752391815, + "grad_norm": 0.49166035652160645, "learning_rate": 3.592669224627601e-06, "loss": 0.916, "num_input_tokens_seen": 142540800, @@ -13930,63 +13930,63 @@ }, { "epoch": 0.8360144057623049, - "grad_norm": 0.49315813183784485, + "grad_norm": 0.4930817782878876, "learning_rate": 3.5722345589490306e-06, - "loss": 1.0457, + "loss": 1.046, "num_input_tokens_seen": 142622720, "step": 17410 }, { "epoch": 0.8364945978391356, - "grad_norm": 0.8642867207527161, + "grad_norm": 0.7804808616638184, "learning_rate": 3.5518537016788646e-06, - "loss": 1.0179, + "loss": 1.0183, "num_input_tokens_seen": 142704640, "step": 17420 }, { "epoch": 0.8369747899159664, - "grad_norm": 0.487728476524353, + "grad_norm": 0.48990410566329956, "learning_rate": 3.531526703996557e-06, - "loss": 0.8232, + "loss": 0.8231, "num_input_tokens_seen": 142786560, "step": 17430 }, { "epoch": 0.8374549819927971, - "grad_norm": 0.48335549235343933, + "grad_norm": 0.4804694354534149, "learning_rate": 3.511253616946325e-06, - "loss": 0.9279, + "loss": 0.9277, "num_input_tokens_seen": 142868480, "step": 17440 }, { "epoch": 0.8379351740696278, - "grad_norm": 0.5654048323631287, + "grad_norm": 0.5617939829826355, "learning_rate": 3.4910344914370093e-06, - "loss": 0.8721, + "loss": 0.8723, "num_input_tokens_seen": 142950400, "step": 17450 }, { "epoch": 0.8384153661464586, - "grad_norm": 0.6163344383239746, + "grad_norm": 0.6097043752670288, "learning_rate": 3.4708693782419225e-06, - "loss": 0.8941, + "loss": 0.8936, "num_input_tokens_seen": 143032320, "step": 17460 }, { "epoch": 0.8388955582232893, - "grad_norm": 0.5046043992042542, + "grad_norm": 0.5014716982841492, "learning_rate": 3.450758327998768e-06, - "loss": 0.8403, + "loss": 0.8404, "num_input_tokens_seen": 143114240, "step": 17470 }, { "epoch": 0.83937575030012, - "grad_norm": 0.44665244221687317, + "grad_norm": 0.4464024603366852, "learning_rate": 3.4307013912094845e-06, "loss": 0.9405, "num_input_tokens_seen": 143196160, @@ -13994,63 +13994,63 @@ }, { "epoch": 0.8398559423769508, - "grad_norm": 0.5446330904960632, + "grad_norm": 0.5696042776107788, "learning_rate": 3.41069861824011e-06, - "loss": 0.6972, + "loss": 0.6978, "num_input_tokens_seen": 143278080, "step": 17490 }, { "epoch": 0.8403361344537815, - "grad_norm": 0.5121775269508362, + "grad_norm": 0.510014533996582, "learning_rate": 3.390750059320688e-06, - "loss": 0.9488, + "loss": 0.9487, "num_input_tokens_seen": 143360000, "step": 17500 }, { "epoch": 0.8408163265306122, - "grad_norm": 0.42379680275917053, + "grad_norm": 0.41205736994743347, "learning_rate": 3.3708557645451053e-06, - "loss": 0.7979, + "loss": 0.7978, "num_input_tokens_seen": 143441920, "step": 17510 }, { "epoch": 0.8412965186074429, - "grad_norm": 0.915107250213623, + "grad_norm": 0.8583078980445862, "learning_rate": 3.3510157838709895e-06, - "loss": 0.9223, + "loss": 0.9229, "num_input_tokens_seen": 143523840, "step": 17520 }, { "epoch": 0.8417767106842737, - "grad_norm": 0.4954970180988312, + "grad_norm": 0.4940420687198639, "learning_rate": 3.3312301671195784e-06, - "loss": 1.0832, + "loss": 1.0837, "num_input_tokens_seen": 143605760, "step": 17530 }, { "epoch": 0.8422569027611044, - "grad_norm": 0.37159988284111023, + "grad_norm": 0.3699828088283539, "learning_rate": 3.3114989639755983e-06, - "loss": 0.8365, + "loss": 0.8367, "num_input_tokens_seen": 143687680, "step": 17540 }, { "epoch": 0.8427370948379351, - "grad_norm": 0.4709157943725586, + "grad_norm": 0.47020527720451355, "learning_rate": 3.2918222239871206e-06, - "loss": 0.8637, + "loss": 0.863, "num_input_tokens_seen": 143769600, "step": 17550 }, { "epoch": 0.8432172869147659, - "grad_norm": 0.4602372646331787, + "grad_norm": 0.4599441885948181, "learning_rate": 3.272199996565464e-06, "loss": 0.8599, "num_input_tokens_seen": 143851520, @@ -14058,15 +14058,15 @@ }, { "epoch": 0.8436974789915966, - "grad_norm": 0.4694511890411377, + "grad_norm": 0.46887126564979553, "learning_rate": 3.252632330985059e-06, - "loss": 0.8833, + "loss": 0.8835, "num_input_tokens_seen": 143933440, "step": 17570 }, { "epoch": 0.8441776710684273, - "grad_norm": 0.4874444603919983, + "grad_norm": 0.4868754744529724, "learning_rate": 3.233119276383309e-06, "loss": 0.8098, "num_input_tokens_seen": 144015360, @@ -14074,71 +14074,71 @@ }, { "epoch": 0.8446578631452581, - "grad_norm": 0.7956154346466064, + "grad_norm": 0.7933685183525085, "learning_rate": 3.2136608817604998e-06, - "loss": 0.8137, + "loss": 0.8136, "num_input_tokens_seen": 144097280, "step": 17590 }, { "epoch": 0.8451380552220888, - "grad_norm": 0.5142585039138794, + "grad_norm": 0.5126639008522034, "learning_rate": 3.1942571959796414e-06, - "loss": 0.9244, + "loss": 0.9242, "num_input_tokens_seen": 144179200, "step": 17600 }, { "epoch": 0.8456182472989195, - "grad_norm": 0.49832209944725037, + "grad_norm": 0.49685657024383545, "learning_rate": 3.1749082677663606e-06, - "loss": 0.9445, + "loss": 0.9447, "num_input_tokens_seen": 144261120, "step": 17610 }, { "epoch": 0.8460984393757504, - "grad_norm": 0.49295270442962646, + "grad_norm": 0.493713915348053, "learning_rate": 3.1556141457087932e-06, - "loss": 1.0138, + "loss": 1.0145, "num_input_tokens_seen": 144343040, "step": 17620 }, { "epoch": 0.846578631452581, - "grad_norm": 0.49073633551597595, + "grad_norm": 0.4875103533267975, "learning_rate": 3.1363748782574475e-06, - "loss": 0.8728, + "loss": 0.8731, "num_input_tokens_seen": 144424960, "step": 17630 }, { "epoch": 0.8470588235294118, - "grad_norm": 0.48808997869491577, + "grad_norm": 0.48699912428855896, "learning_rate": 3.1171905137250655e-06, - "loss": 0.8881, + "loss": 0.8883, "num_input_tokens_seen": 144506880, "step": 17640 }, { "epoch": 0.8475390156062425, - "grad_norm": 0.5085486769676208, + "grad_norm": 0.5091184973716736, "learning_rate": 3.098061100286537e-06, - "loss": 0.8107, + "loss": 0.8102, "num_input_tokens_seen": 144588800, "step": 17650 }, { "epoch": 0.8480192076830733, - "grad_norm": 0.4130302965641022, + "grad_norm": 0.41908127069473267, "learning_rate": 3.078986685978763e-06, - "loss": 1.0014, + "loss": 1.0013, "num_input_tokens_seen": 144670720, "step": 17660 }, { "epoch": 0.848499399759904, - "grad_norm": 0.4925101697444916, + "grad_norm": 0.49007028341293335, "learning_rate": 3.059967318700513e-06, "loss": 0.7933, "num_input_tokens_seen": 144752640, @@ -14146,71 +14146,71 @@ }, { "epoch": 0.8489795918367347, - "grad_norm": 0.5227538347244263, + "grad_norm": 0.5263189077377319, "learning_rate": 3.0410030462123486e-06, - "loss": 0.8734, + "loss": 0.8733, "num_input_tokens_seen": 144834560, "step": 17680 }, { "epoch": 0.8494597839135655, - "grad_norm": 0.48874667286872864, + "grad_norm": 0.4886823892593384, "learning_rate": 3.022093916136465e-06, - "loss": 1.1692, + "loss": 1.1693, "num_input_tokens_seen": 144916480, "step": 17690 }, { "epoch": 0.8499399759903962, - "grad_norm": 0.5069953799247742, + "grad_norm": 0.507895290851593, "learning_rate": 3.0032399759565845e-06, - "loss": 0.8841, + "loss": 0.8838, "num_input_tokens_seen": 144998400, "step": 17700 }, { "epoch": 0.8504201680672269, - "grad_norm": 0.49537745118141174, + "grad_norm": 0.49439266324043274, "learning_rate": 2.9844412730178515e-06, - "loss": 0.9305, + "loss": 0.93, "num_input_tokens_seen": 145080320, "step": 17710 }, { "epoch": 0.8509003601440577, - "grad_norm": 0.48820367455482483, + "grad_norm": 0.48874273896217346, "learning_rate": 2.9656978545267002e-06, - "loss": 0.9277, + "loss": 0.9282, "num_input_tokens_seen": 145162240, "step": 17720 }, { "epoch": 0.8513805522208884, - "grad_norm": 0.5144213438034058, + "grad_norm": 0.49839329719543457, "learning_rate": 2.947009767550718e-06, - "loss": 0.9099, + "loss": 0.9096, "num_input_tokens_seen": 145244160, "step": 17730 }, { "epoch": 0.8518607442977191, - "grad_norm": 0.5397959351539612, + "grad_norm": 0.5408912301063538, "learning_rate": 2.9283770590185696e-06, - "loss": 1.0301, + "loss": 1.0297, "num_input_tokens_seen": 145326080, "step": 17740 }, { "epoch": 0.8523409363745498, - "grad_norm": 0.8778235912322998, + "grad_norm": 0.8705450892448425, "learning_rate": 2.9097997757198516e-06, - "loss": 1.1378, + "loss": 1.1381, "num_input_tokens_seen": 145408000, "step": 17750 }, { "epoch": 0.8528211284513806, - "grad_norm": 0.4958626329898834, + "grad_norm": 0.4960118532180786, "learning_rate": 2.891277964304959e-06, "loss": 0.8219, "num_input_tokens_seen": 145489920, @@ -14218,111 +14218,111 @@ }, { "epoch": 0.8533013205282113, - "grad_norm": 0.5561241507530212, + "grad_norm": 0.5603978037834167, "learning_rate": 2.8728116712850193e-06, - "loss": 0.8583, + "loss": 0.8588, "num_input_tokens_seen": 145571840, "step": 17770 }, { "epoch": 0.853781512605042, - "grad_norm": 0.5028048753738403, + "grad_norm": 0.5049429535865784, "learning_rate": 2.8544009430317153e-06, - "loss": 0.9945, + "loss": 0.9943, "num_input_tokens_seen": 145653760, "step": 17780 }, { "epoch": 0.8542617046818728, - "grad_norm": 0.4903087615966797, + "grad_norm": 0.49399763345718384, "learning_rate": 2.8360458257772228e-06, - "loss": 0.9056, + "loss": 0.9051, "num_input_tokens_seen": 145735680, "step": 17790 }, { "epoch": 0.8547418967587035, - "grad_norm": 0.4912867844104767, + "grad_norm": 0.4904550015926361, "learning_rate": 2.817746365614049e-06, - "loss": 0.8998, + "loss": 0.8999, "num_input_tokens_seen": 145817600, "step": 17800 }, { "epoch": 0.8552220888355342, - "grad_norm": 0.5079895853996277, + "grad_norm": 0.5089582800865173, "learning_rate": 2.7995026084949584e-06, - "loss": 0.931, + "loss": 0.9312, "num_input_tokens_seen": 145899520, "step": 17810 }, { "epoch": 0.855702280912365, - "grad_norm": 0.547659695148468, + "grad_norm": 0.540856659412384, "learning_rate": 2.781314600232815e-06, - "loss": 1.0527, + "loss": 1.0525, "num_input_tokens_seen": 145981440, "step": 17820 }, { "epoch": 0.8561824729891957, - "grad_norm": 0.4735318124294281, + "grad_norm": 0.4742536246776581, "learning_rate": 2.763182386500504e-06, - "loss": 0.8695, + "loss": 0.8697, "num_input_tokens_seen": 146063360, "step": 17830 }, { "epoch": 0.8566626650660264, - "grad_norm": 0.4886067509651184, + "grad_norm": 0.4883480966091156, "learning_rate": 2.745106012830806e-06, - "loss": 0.9396, + "loss": 0.9403, "num_input_tokens_seen": 146145280, "step": 17840 }, { "epoch": 0.8571428571428571, - "grad_norm": 0.48998895287513733, + "grad_norm": 0.4900871515274048, "learning_rate": 2.7270855246162547e-06, - "loss": 1.0026, + "loss": 1.0035, "num_input_tokens_seen": 146227200, "step": 17850 }, { "epoch": 0.8576230492196879, - "grad_norm": 0.4987630844116211, + "grad_norm": 0.4983166456222534, "learning_rate": 2.7091209671090715e-06, - "loss": 0.9145, + "loss": 0.9146, "num_input_tokens_seen": 146309120, "step": 17860 }, { "epoch": 0.8581032412965186, - "grad_norm": 0.5029290318489075, + "grad_norm": 0.5034369826316833, "learning_rate": 2.6912123854210212e-06, - "loss": 0.8782, + "loss": 0.8785, "num_input_tokens_seen": 146391040, "step": 17870 }, { "epoch": 0.8585834333733493, - "grad_norm": 0.5902127027511597, + "grad_norm": 0.5906243324279785, "learning_rate": 2.673359824523297e-06, - "loss": 0.9368, + "loss": 0.9372, "num_input_tokens_seen": 146472960, "step": 17880 }, { "epoch": 0.8590636254501801, - "grad_norm": 0.6036652326583862, + "grad_norm": 0.6103810667991638, "learning_rate": 2.655563329246413e-06, - "loss": 0.946, + "loss": 0.9457, "num_input_tokens_seen": 146554880, "step": 17890 }, { "epoch": 0.8595438175270108, - "grad_norm": 0.4699823558330536, + "grad_norm": 0.469939261674881, "learning_rate": 2.637822944280116e-06, "loss": 0.8075, "num_input_tokens_seen": 146636800, @@ -14330,7 +14330,7 @@ }, { "epoch": 0.8600240096038415, - "grad_norm": 0.5656408071517944, + "grad_norm": 0.567516028881073, "learning_rate": 2.6201387141732205e-06, "loss": 0.9612, "num_input_tokens_seen": 146718720, @@ -14338,151 +14338,151 @@ }, { "epoch": 0.8605042016806723, - "grad_norm": 0.4844336211681366, + "grad_norm": 0.4764900803565979, "learning_rate": 2.6025106833335505e-06, - "loss": 1.0511, + "loss": 1.0508, "num_input_tokens_seen": 146800640, "step": 17920 }, { "epoch": 0.860984393757503, - "grad_norm": 0.7261025309562683, + "grad_norm": 0.7253336906433105, "learning_rate": 2.5849388960277997e-06, - "loss": 0.7721, + "loss": 0.7723, "num_input_tokens_seen": 146882560, "step": 17930 }, { "epoch": 0.8614645858343337, - "grad_norm": 0.5041067600250244, + "grad_norm": 0.5117282867431641, "learning_rate": 2.567423396381419e-06, - "loss": 0.7826, + "loss": 0.783, "num_input_tokens_seen": 146964480, "step": 17940 }, { "epoch": 0.8619447779111644, - "grad_norm": 0.28633078932762146, + "grad_norm": 0.28951549530029297, "learning_rate": 2.549964228378518e-06, - "loss": 0.7408, + "loss": 0.7405, "num_input_tokens_seen": 147046400, "step": 17950 }, { "epoch": 0.8624249699879952, - "grad_norm": 0.8320537209510803, + "grad_norm": 0.8366937637329102, "learning_rate": 2.532561435861755e-06, - "loss": 0.906, + "loss": 0.9065, "num_input_tokens_seen": 147128320, "step": 17960 }, { "epoch": 0.8629051620648259, - "grad_norm": 0.4853600561618805, + "grad_norm": 0.4839094579219818, "learning_rate": 2.515215062532206e-06, - "loss": 0.8907, + "loss": 0.8906, "num_input_tokens_seen": 147210240, "step": 17970 }, { "epoch": 0.8633853541416566, - "grad_norm": 0.5016003251075745, + "grad_norm": 0.5016359090805054, "learning_rate": 2.497925151949271e-06, - "loss": 0.9341, + "loss": 0.9354, "num_input_tokens_seen": 147292160, "step": 17980 }, { "epoch": 0.8638655462184874, - "grad_norm": 0.4873434603214264, + "grad_norm": 0.4866850674152374, "learning_rate": 2.4806917475305806e-06, - "loss": 0.9855, + "loss": 0.985, "num_input_tokens_seen": 147374080, "step": 17990 }, { "epoch": 0.8643457382953181, - "grad_norm": 0.4814073443412781, + "grad_norm": 0.4811359643936157, "learning_rate": 2.4635148925518577e-06, - "loss": 0.8596, + "loss": 0.8599, "num_input_tokens_seen": 147456000, "step": 18000 }, { "epoch": 0.8648259303721488, - "grad_norm": 0.4790041446685791, + "grad_norm": 0.476534903049469, "learning_rate": 2.4463946301468143e-06, - "loss": 0.8384, + "loss": 0.8387, "num_input_tokens_seen": 147537920, "step": 18010 }, { "epoch": 0.8653061224489796, - "grad_norm": 0.4911400377750397, + "grad_norm": 0.4927884340286255, "learning_rate": 2.4293310033070614e-06, - "loss": 0.9121, + "loss": 0.9122, "num_input_tokens_seen": 147619840, "step": 18020 }, { "epoch": 0.8657863145258103, - "grad_norm": 0.5386258363723755, + "grad_norm": 0.539406955242157, "learning_rate": 2.4123240548819955e-06, - "loss": 0.9519, + "loss": 0.9507, "num_input_tokens_seen": 147701760, "step": 18030 }, { "epoch": 0.866266506602641, - "grad_norm": 0.4828698933124542, + "grad_norm": 0.48301905393600464, "learning_rate": 2.3953738275786565e-06, - "loss": 0.7976, + "loss": 0.7972, "num_input_tokens_seen": 147783680, "step": 18040 }, { "epoch": 0.8667466986794717, - "grad_norm": 0.4941945970058441, + "grad_norm": 0.49446576833724976, "learning_rate": 2.3784803639616854e-06, - "loss": 0.8773, + "loss": 0.8777, "num_input_tokens_seen": 147865600, "step": 18050 }, { "epoch": 0.8672268907563025, - "grad_norm": 0.4941422939300537, + "grad_norm": 0.49398505687713623, "learning_rate": 2.361643706453151e-06, - "loss": 0.8026, + "loss": 0.8028, "num_input_tokens_seen": 147947520, "step": 18060 }, { "epoch": 0.8677070828331332, - "grad_norm": 0.5104964971542358, + "grad_norm": 0.5100568532943726, "learning_rate": 2.3448638973324833e-06, - "loss": 0.8714, + "loss": 0.872, "num_input_tokens_seen": 148029440, "step": 18070 }, { "epoch": 0.868187274909964, - "grad_norm": 0.4910539984703064, + "grad_norm": 0.4940645694732666, "learning_rate": 2.328140978736365e-06, - "loss": 1.224, + "loss": 1.2244, "num_input_tokens_seen": 148111360, "step": 18080 }, { "epoch": 0.8686674669867948, - "grad_norm": 0.6451656222343445, + "grad_norm": 0.6448796391487122, "learning_rate": 2.311474992658613e-06, - "loss": 0.8576, + "loss": 0.8573, "num_input_tokens_seen": 148193280, "step": 18090 }, { "epoch": 0.8691476590636255, - "grad_norm": 0.47667035460472107, + "grad_norm": 0.4770622253417969, "learning_rate": 2.29486598095007e-06, "loss": 0.7746, "num_input_tokens_seen": 148275200, @@ -14490,103 +14490,103 @@ }, { "epoch": 0.8696278511404562, - "grad_norm": 0.4710484743118286, + "grad_norm": 0.47341033816337585, "learning_rate": 2.278313985318517e-06, - "loss": 0.9388, + "loss": 0.9389, "num_input_tokens_seen": 148357120, "step": 18110 }, { "epoch": 0.870108043217287, - "grad_norm": 0.5291144251823425, + "grad_norm": 0.5365563631057739, "learning_rate": 2.261819047328562e-06, - "loss": 0.8536, + "loss": 0.8541, "num_input_tokens_seen": 148439040, "step": 18120 }, { "epoch": 0.8705882352941177, - "grad_norm": 0.5111077427864075, + "grad_norm": 0.5126563906669617, "learning_rate": 2.2453812084015175e-06, - "loss": 0.9051, + "loss": 0.9053, "num_input_tokens_seen": 148520960, "step": 18130 }, { "epoch": 0.8710684273709484, - "grad_norm": 0.4752892553806305, + "grad_norm": 0.4737769365310669, "learning_rate": 2.2290005098153296e-06, - "loss": 1.2451, + "loss": 1.2452, "num_input_tokens_seen": 148602880, "step": 18140 }, { "epoch": 0.8715486194477791, - "grad_norm": 0.501112163066864, + "grad_norm": 0.5032677054405212, "learning_rate": 2.212676992704435e-06, - "loss": 0.9271, + "loss": 0.9268, "num_input_tokens_seen": 148684800, "step": 18150 }, { "epoch": 0.8720288115246099, - "grad_norm": 0.46904444694519043, + "grad_norm": 0.4705372750759125, "learning_rate": 2.1964106980597034e-06, - "loss": 0.6553, + "loss": 0.6561, "num_input_tokens_seen": 148766720, "step": 18160 }, { "epoch": 0.8725090036014406, - "grad_norm": 0.5374292135238647, + "grad_norm": 0.5352415442466736, "learning_rate": 2.1802016667282847e-06, - "loss": 0.8742, + "loss": 0.8747, "num_input_tokens_seen": 148848640, "step": 18170 }, { "epoch": 0.8729891956782713, - "grad_norm": 0.5054563283920288, + "grad_norm": 0.5062108635902405, "learning_rate": 2.1640499394135595e-06, - "loss": 0.8704, + "loss": 0.8711, "num_input_tokens_seen": 148930560, "step": 18180 }, { "epoch": 0.8734693877551021, - "grad_norm": 0.48471224308013916, + "grad_norm": 0.48544827103614807, "learning_rate": 2.1479555566749825e-06, - "loss": 0.9272, + "loss": 0.9279, "num_input_tokens_seen": 149012480, "step": 18190 }, { "epoch": 0.8739495798319328, - "grad_norm": 0.5023584961891174, + "grad_norm": 0.5034095048904419, "learning_rate": 2.131918558928023e-06, - "loss": 0.9053, + "loss": 0.9047, "num_input_tokens_seen": 149094400, "step": 18200 }, { "epoch": 0.8744297719087635, - "grad_norm": 0.46324488520622253, + "grad_norm": 0.4514198899269104, "learning_rate": 2.1159389864440495e-06, - "loss": 0.8871, + "loss": 0.8876, "num_input_tokens_seen": 149176320, "step": 18210 }, { "epoch": 0.8749099639855943, - "grad_norm": 0.5278415083885193, + "grad_norm": 0.5257483124732971, "learning_rate": 2.100016879350214e-06, - "loss": 0.7591, + "loss": 0.7592, "num_input_tokens_seen": 149258240, "step": 18220 }, { "epoch": 0.875390156062425, - "grad_norm": 0.4872019290924072, + "grad_norm": 0.48762884736061096, "learning_rate": 2.0841522776293725e-06, "loss": 0.8473, "num_input_tokens_seen": 149340160, @@ -14594,55 +14594,55 @@ }, { "epoch": 0.8758703481392557, - "grad_norm": 0.5031896233558655, + "grad_norm": 0.5084396600723267, "learning_rate": 2.0683452211199854e-06, - "loss": 1.118, + "loss": 1.1182, "num_input_tokens_seen": 149422080, "step": 18240 }, { "epoch": 0.8763505402160864, - "grad_norm": 0.48833465576171875, + "grad_norm": 0.48876795172691345, "learning_rate": 2.052595749515987e-06, - "loss": 0.8301, + "loss": 0.8305, "num_input_tokens_seen": 149504000, "step": 18250 }, { "epoch": 0.8768307322929172, - "grad_norm": 0.5221076607704163, + "grad_norm": 0.5128353238105774, "learning_rate": 2.0369039023667215e-06, - "loss": 0.9347, + "loss": 0.9346, "num_input_tokens_seen": 149585920, "step": 18260 }, { "epoch": 0.8773109243697479, - "grad_norm": 0.476140558719635, + "grad_norm": 0.4771345555782318, "learning_rate": 2.0212697190768263e-06, - "loss": 0.9084, + "loss": 0.9082, "num_input_tokens_seen": 149667840, "step": 18270 }, { "epoch": 0.8777911164465786, - "grad_norm": 0.49334678053855896, + "grad_norm": 0.4934534430503845, "learning_rate": 2.0056932389061338e-06, - "loss": 0.8173, + "loss": 0.8169, "num_input_tokens_seen": 149749760, "step": 18280 }, { "epoch": 0.8782713085234094, - "grad_norm": 0.8516491055488586, + "grad_norm": 0.8390567302703857, "learning_rate": 1.9901745009695773e-06, - "loss": 0.9435, + "loss": 0.9433, "num_input_tokens_seen": 149831680, "step": 18290 }, { "epoch": 0.8787515006002401, - "grad_norm": 0.5442166328430176, + "grad_norm": 0.5534536242485046, "learning_rate": 1.9747135442370946e-06, "loss": 0.8976, "num_input_tokens_seen": 149913600, @@ -14650,55 +14650,55 @@ }, { "epoch": 0.8792316926770708, - "grad_norm": 0.3513561189174652, + "grad_norm": 0.35009366273880005, "learning_rate": 1.9593104075335158e-06, - "loss": 0.762, + "loss": 0.7622, "num_input_tokens_seen": 149995520, "step": 18310 }, { "epoch": 0.8797118847539016, - "grad_norm": 0.48064127564430237, + "grad_norm": 0.47988003492355347, "learning_rate": 1.943965129538483e-06, - "loss": 0.8791, + "loss": 0.8789, "num_input_tokens_seen": 150077440, "step": 18320 }, { "epoch": 0.8801920768307323, - "grad_norm": 0.48252299427986145, + "grad_norm": 0.48413175344467163, "learning_rate": 1.9286777487863477e-06, - "loss": 0.8332, + "loss": 0.8331, "num_input_tokens_seen": 150159360, "step": 18330 }, { "epoch": 0.880672268907563, - "grad_norm": 0.45519593358039856, + "grad_norm": 0.46263861656188965, "learning_rate": 1.913448303666071e-06, - "loss": 0.999, + "loss": 0.9994, "num_input_tokens_seen": 150241280, "step": 18340 }, { "epoch": 0.8811524609843937, - "grad_norm": 0.47915107011795044, + "grad_norm": 0.4799620509147644, "learning_rate": 1.8982768324211197e-06, - "loss": 0.9294, + "loss": 0.9293, "num_input_tokens_seen": 150323200, "step": 18350 }, { "epoch": 0.8816326530612245, - "grad_norm": 0.5818130373954773, + "grad_norm": 0.583249568939209, "learning_rate": 1.8831633731493963e-06, - "loss": 0.9785, + "loss": 0.9782, "num_input_tokens_seen": 150405120, "step": 18360 }, { "epoch": 0.8821128451380552, - "grad_norm": 0.4887988865375519, + "grad_norm": 0.4899525046348572, "learning_rate": 1.8681079638031062e-06, "loss": 0.8177, "num_input_tokens_seen": 150487040, @@ -14706,191 +14706,191 @@ }, { "epoch": 0.8825930372148859, - "grad_norm": 0.6993192434310913, + "grad_norm": 0.696358859539032, "learning_rate": 1.8531106421887017e-06, - "loss": 0.8862, + "loss": 0.8863, "num_input_tokens_seen": 150568960, "step": 18380 }, { "epoch": 0.8830732292917167, - "grad_norm": 0.486240416765213, + "grad_norm": 0.4858860969543457, "learning_rate": 1.8381714459667603e-06, - "loss": 0.7221, + "loss": 0.7228, "num_input_tokens_seen": 150650880, "step": 18390 }, { "epoch": 0.8835534213685474, - "grad_norm": 0.7017090916633606, + "grad_norm": 0.695509135723114, "learning_rate": 1.823290412651893e-06, - "loss": 1.0007, + "loss": 1.0011, "num_input_tokens_seen": 150732800, "step": 18400 }, { "epoch": 0.8840336134453781, - "grad_norm": 0.46545252203941345, + "grad_norm": 0.46307533979415894, "learning_rate": 1.8084675796126576e-06, - "loss": 1.1891, + "loss": 1.1894, "num_input_tokens_seen": 150814720, "step": 18410 }, { "epoch": 0.8845138055222089, - "grad_norm": 0.5387531518936157, + "grad_norm": 0.5278788208961487, "learning_rate": 1.7937029840714715e-06, - "loss": 1.0123, + "loss": 1.0122, "num_input_tokens_seen": 150896640, "step": 18420 }, { "epoch": 0.8849939975990396, - "grad_norm": 0.4813983142375946, + "grad_norm": 0.48216572403907776, "learning_rate": 1.778996663104493e-06, - "loss": 0.8358, + "loss": 0.836, "num_input_tokens_seen": 150978560, "step": 18430 }, { "epoch": 0.8854741896758703, - "grad_norm": 0.4829707443714142, + "grad_norm": 0.4845614433288574, "learning_rate": 1.7643486536415537e-06, - "loss": 0.9354, + "loss": 0.9355, "num_input_tokens_seen": 151060480, "step": 18440 }, { "epoch": 0.885954381752701, - "grad_norm": 0.4897553622722626, + "grad_norm": 0.4892086088657379, "learning_rate": 1.7497589924660552e-06, - "loss": 0.8802, + "loss": 0.8795, "num_input_tokens_seen": 151142400, "step": 18450 }, { "epoch": 0.8864345738295318, - "grad_norm": 0.826809823513031, + "grad_norm": 0.8257756233215332, "learning_rate": 1.7352277162148712e-06, - "loss": 0.8142, + "loss": 0.814, "num_input_tokens_seen": 151224320, "step": 18460 }, { "epoch": 0.8869147659063625, - "grad_norm": 0.4941288232803345, + "grad_norm": 0.49343836307525635, "learning_rate": 1.7207548613782709e-06, - "loss": 0.8752, + "loss": 0.8757, "num_input_tokens_seen": 151306240, "step": 18470 }, { "epoch": 0.8873949579831932, - "grad_norm": 0.4736701548099518, + "grad_norm": 0.4691942632198334, "learning_rate": 1.7063404642998186e-06, - "loss": 1.0158, + "loss": 1.0152, "num_input_tokens_seen": 151388160, "step": 18480 }, { "epoch": 0.887875150060024, - "grad_norm": 0.5565969944000244, + "grad_norm": 0.549484133720398, "learning_rate": 1.6919845611762714e-06, - "loss": 0.9463, + "loss": 0.9454, "num_input_tokens_seen": 151470080, "step": 18490 }, { "epoch": 0.8883553421368547, - "grad_norm": 0.523489773273468, + "grad_norm": 0.522466242313385, "learning_rate": 1.6776871880575084e-06, - "loss": 1.0312, + "loss": 1.0313, "num_input_tokens_seen": 151552000, "step": 18500 }, { "epoch": 0.8888355342136854, - "grad_norm": 0.5504932403564453, + "grad_norm": 0.5475363731384277, "learning_rate": 1.663448380846433e-06, - "loss": 0.841, + "loss": 0.8417, "num_input_tokens_seen": 151633920, "step": 18510 }, { "epoch": 0.8893157262905163, - "grad_norm": 2.449960470199585, + "grad_norm": 2.4417154788970947, "learning_rate": 1.649268175298868e-06, - "loss": 0.819, + "loss": 0.8187, "num_input_tokens_seen": 151715840, "step": 18520 }, { "epoch": 0.889795918367347, - "grad_norm": 0.6405714750289917, + "grad_norm": 0.643785297870636, "learning_rate": 1.6351466070234882e-06, - "loss": 1.1654, + "loss": 1.1659, "num_input_tokens_seen": 151797760, "step": 18530 }, { "epoch": 0.8902761104441776, - "grad_norm": 0.5734189748764038, + "grad_norm": 0.5771599411964417, "learning_rate": 1.6210837114817272e-06, - "loss": 0.8883, + "loss": 0.8884, "num_input_tokens_seen": 151879680, "step": 18540 }, { "epoch": 0.8907563025210085, - "grad_norm": 0.4893549382686615, + "grad_norm": 0.490934282541275, "learning_rate": 1.6070795239876618e-06, - "loss": 0.785, + "loss": 0.7856, "num_input_tokens_seen": 151961600, "step": 18550 }, { "epoch": 0.8912364945978392, - "grad_norm": 0.48380154371261597, + "grad_norm": 0.4851411283016205, "learning_rate": 1.5931340797079613e-06, - "loss": 0.7937, + "loss": 0.7931, "num_input_tokens_seen": 152043520, "step": 18560 }, { "epoch": 0.8917166866746699, - "grad_norm": 0.44979286193847656, + "grad_norm": 0.4503311514854431, "learning_rate": 1.5792474136617858e-06, - "loss": 0.8338, + "loss": 0.8341, "num_input_tokens_seen": 152125440, "step": 18570 }, { "epoch": 0.8921968787515006, - "grad_norm": 0.48284098505973816, + "grad_norm": 0.4835943281650543, "learning_rate": 1.5654195607206712e-06, - "loss": 0.8318, + "loss": 0.8317, "num_input_tokens_seen": 152207360, "step": 18580 }, { "epoch": 0.8926770708283314, - "grad_norm": 0.5251838564872742, + "grad_norm": 0.5173797011375427, "learning_rate": 1.5516505556084888e-06, - "loss": 0.8587, + "loss": 0.859, "num_input_tokens_seen": 152289280, "step": 18590 }, { "epoch": 0.8931572629051621, - "grad_norm": 0.6006345152854919, + "grad_norm": 0.5888597369194031, "learning_rate": 1.5379404329013246e-06, - "loss": 1.0141, + "loss": 1.0143, "num_input_tokens_seen": 152371200, "step": 18600 }, { "epoch": 0.8936374549819928, - "grad_norm": 0.48716527223587036, + "grad_norm": 0.46301090717315674, "learning_rate": 1.5242892270273951e-06, "loss": 0.8301, "num_input_tokens_seen": 152453120, @@ -14898,87 +14898,87 @@ }, { "epoch": 0.8941176470588236, - "grad_norm": 0.4787551462650299, + "grad_norm": 0.479010671377182, "learning_rate": 1.5106969722669812e-06, - "loss": 0.9144, + "loss": 0.9141, "num_input_tokens_seen": 152535040, "step": 18620 }, { "epoch": 0.8945978391356543, - "grad_norm": 0.48084962368011475, + "grad_norm": 0.47869157791137695, "learning_rate": 1.4971637027523106e-06, - "loss": 1.1305, + "loss": 1.1308, "num_input_tokens_seen": 152616960, "step": 18630 }, { "epoch": 0.895078031212485, - "grad_norm": 0.5110411643981934, + "grad_norm": 0.5103761553764343, "learning_rate": 1.4836894524675126e-06, - "loss": 0.9489, + "loss": 0.9492, "num_input_tokens_seen": 152698880, "step": 18640 }, { "epoch": 0.8955582232893158, - "grad_norm": 0.5295985341072083, + "grad_norm": 0.5287526249885559, "learning_rate": 1.4702742552484884e-06, - "loss": 1.1124, + "loss": 1.1131, "num_input_tokens_seen": 152780800, "step": 18650 }, { "epoch": 0.8960384153661465, - "grad_norm": 0.47781914472579956, + "grad_norm": 0.4778541922569275, "learning_rate": 1.4569181447828623e-06, - "loss": 0.836, + "loss": 0.8355, "num_input_tokens_seen": 152862720, "step": 18660 }, { "epoch": 0.8965186074429772, - "grad_norm": 0.523032546043396, + "grad_norm": 0.5224360823631287, "learning_rate": 1.4436211546098782e-06, - "loss": 1.0544, + "loss": 1.055, "num_input_tokens_seen": 152944640, "step": 18670 }, { "epoch": 0.8969987995198079, - "grad_norm": 0.5448631644248962, + "grad_norm": 0.5333329439163208, "learning_rate": 1.430383318120318e-06, - "loss": 0.9155, + "loss": 0.9159, "num_input_tokens_seen": 153026560, "step": 18680 }, { "epoch": 0.8974789915966387, - "grad_norm": 0.4606798589229584, + "grad_norm": 0.4066697061061859, "learning_rate": 1.4172046685564212e-06, - "loss": 0.862, + "loss": 0.8634, "num_input_tokens_seen": 153108480, "step": 18690 }, { "epoch": 0.8979591836734694, - "grad_norm": 0.44669726490974426, + "grad_norm": 0.4478163719177246, "learning_rate": 1.4040852390118042e-06, - "loss": 0.9023, + "loss": 0.9027, "num_input_tokens_seen": 153190400, "step": 18700 }, { "epoch": 0.8984393757503001, - "grad_norm": 0.4901637136936188, + "grad_norm": 0.48882734775543213, "learning_rate": 1.3910250624313642e-06, - "loss": 1.0013, + "loss": 1.0014, "num_input_tokens_seen": 153272320, "step": 18710 }, { "epoch": 0.8989195678271309, - "grad_norm": 0.4382149875164032, + "grad_norm": 0.4382336139678955, "learning_rate": 1.3780241716112057e-06, "loss": 0.8953, "num_input_tokens_seen": 153354240, @@ -14986,159 +14986,159 @@ }, { "epoch": 0.8993997599039616, - "grad_norm": 0.5043530464172363, + "grad_norm": 0.5193009972572327, "learning_rate": 1.3650825991985722e-06, - "loss": 0.8403, + "loss": 0.8398, "num_input_tokens_seen": 153436160, "step": 18730 }, { "epoch": 0.8998799519807923, - "grad_norm": 0.8724451065063477, + "grad_norm": 0.8743091225624084, "learning_rate": 1.3522003776917285e-06, - "loss": 0.7954, + "loss": 0.7956, "num_input_tokens_seen": 153518080, "step": 18740 }, { "epoch": 0.9003601440576231, - "grad_norm": 0.4843882620334625, + "grad_norm": 0.4847816228866577, "learning_rate": 1.3393775394399123e-06, - "loss": 0.8046, + "loss": 0.8044, "num_input_tokens_seen": 153600000, "step": 18750 }, { "epoch": 0.9008403361344538, - "grad_norm": 0.5170858502388, + "grad_norm": 0.4940853714942932, "learning_rate": 1.326614116643246e-06, - "loss": 0.712, + "loss": 0.7121, "num_input_tokens_seen": 153681920, "step": 18760 }, { "epoch": 0.9013205282112845, - "grad_norm": 0.45620083808898926, + "grad_norm": 0.45956119894981384, "learning_rate": 1.3139101413526339e-06, - "loss": 0.8778, + "loss": 0.8775, "num_input_tokens_seen": 153763840, "step": 18770 }, { "epoch": 0.9018007202881152, - "grad_norm": 0.710594654083252, + "grad_norm": 0.7106777429580688, "learning_rate": 1.3012656454697125e-06, - "loss": 1.0348, + "loss": 1.0345, "num_input_tokens_seen": 153845760, "step": 18780 }, { "epoch": 0.902280912364946, - "grad_norm": 0.5861712098121643, + "grad_norm": 0.5692814588546753, "learning_rate": 1.2886806607467578e-06, - "loss": 0.9589, + "loss": 0.9592, "num_input_tokens_seen": 153927680, "step": 18790 }, { "epoch": 0.9027611044417767, - "grad_norm": 0.48728203773498535, + "grad_norm": 0.4884362816810608, "learning_rate": 1.2761552187865899e-06, - "loss": 0.9268, + "loss": 0.927, "num_input_tokens_seen": 154009600, "step": 18800 }, { "epoch": 0.9032412965186074, - "grad_norm": 0.49788719415664673, + "grad_norm": 0.4982493817806244, "learning_rate": 1.2636893510425186e-06, - "loss": 0.8947, + "loss": 0.8942, "num_input_tokens_seen": 154091520, "step": 18810 }, { "epoch": 0.9037214885954382, - "grad_norm": 0.5064159035682678, + "grad_norm": 0.5100504159927368, "learning_rate": 1.2512830888182531e-06, - "loss": 0.9701, + "loss": 0.97, "num_input_tokens_seen": 154173440, "step": 18820 }, { "epoch": 0.9042016806722689, - "grad_norm": 0.5245488286018372, + "grad_norm": 0.5252644419670105, "learning_rate": 1.23893646326782e-06, - "loss": 0.9968, + "loss": 0.9964, "num_input_tokens_seen": 154255360, "step": 18830 }, { "epoch": 0.9046818727490996, - "grad_norm": 0.49203112721443176, + "grad_norm": 0.4914807975292206, "learning_rate": 1.2266495053954913e-06, - "loss": 0.7056, + "loss": 0.7055, "num_input_tokens_seen": 154337280, "step": 18840 }, { "epoch": 0.9051620648259304, - "grad_norm": 0.47168827056884766, + "grad_norm": 0.47101646661758423, "learning_rate": 1.2144222460557074e-06, - "loss": 0.8708, + "loss": 0.8709, "num_input_tokens_seen": 154419200, "step": 18850 }, { "epoch": 0.9056422569027611, - "grad_norm": 0.5028705596923828, + "grad_norm": 0.502715528011322, "learning_rate": 1.2022547159529911e-06, - "loss": 1.0057, + "loss": 1.0051, "num_input_tokens_seen": 154501120, "step": 18860 }, { "epoch": 0.9061224489795918, - "grad_norm": 0.4962250888347626, + "grad_norm": 0.4965539276599884, "learning_rate": 1.190146945641879e-06, - "loss": 0.8827, + "loss": 0.8824, "num_input_tokens_seen": 154583040, "step": 18870 }, { "epoch": 0.9066026410564225, - "grad_norm": 0.5061412453651428, + "grad_norm": 0.5059521794319153, "learning_rate": 1.1780989655268415e-06, - "loss": 0.9975, + "loss": 0.997, "num_input_tokens_seen": 154664960, "step": 18880 }, { "epoch": 0.9070828331332533, - "grad_norm": 0.4892423152923584, + "grad_norm": 0.48782065510749817, "learning_rate": 1.1661108058622082e-06, - "loss": 0.8246, + "loss": 0.824, "num_input_tokens_seen": 154746880, "step": 18890 }, { "epoch": 0.907563025210084, - "grad_norm": 0.48785221576690674, + "grad_norm": 0.49019843339920044, "learning_rate": 1.154182496752082e-06, - "loss": 0.7727, + "loss": 0.7726, "num_input_tokens_seen": 154828800, "step": 18900 }, { "epoch": 0.9080432172869147, - "grad_norm": 0.5420461297035217, + "grad_norm": 0.5387908816337585, "learning_rate": 1.142314068150288e-06, - "loss": 1.0012, + "loss": 1.003, "num_input_tokens_seen": 154910720, "step": 18910 }, { "epoch": 0.9085234093637455, - "grad_norm": 0.4156251549720764, + "grad_norm": 0.4145044684410095, "learning_rate": 1.1305055498602584e-06, "loss": 0.8584, "num_input_tokens_seen": 154992640, @@ -15146,103 +15146,103 @@ }, { "epoch": 0.9090036014405762, - "grad_norm": 0.4756897985935211, + "grad_norm": 0.44272181391716003, "learning_rate": 1.1187569715350066e-06, - "loss": 0.8211, + "loss": 0.8205, "num_input_tokens_seen": 155074560, "step": 18930 }, { "epoch": 0.9094837935174069, - "grad_norm": 0.47108086943626404, + "grad_norm": 0.4699978828430176, "learning_rate": 1.1070683626770162e-06, - "loss": 0.7993, + "loss": 0.7995, "num_input_tokens_seen": 155156480, "step": 18940 }, { "epoch": 0.9099639855942377, - "grad_norm": 0.4969809949398041, + "grad_norm": 0.49818235635757446, "learning_rate": 1.0954397526381694e-06, - "loss": 0.88, + "loss": 0.8801, "num_input_tokens_seen": 155238400, "step": 18950 }, { "epoch": 0.9104441776710684, - "grad_norm": 1.2800873517990112, + "grad_norm": 1.2911748886108398, "learning_rate": 1.0838711706196992e-06, - "loss": 0.9003, + "loss": 0.9004, "num_input_tokens_seen": 155320320, "step": 18960 }, { "epoch": 0.9109243697478991, - "grad_norm": 0.5365675687789917, + "grad_norm": 0.5381184220314026, "learning_rate": 1.0723626456720925e-06, - "loss": 0.9935, + "loss": 0.9954, "num_input_tokens_seen": 155402240, "step": 18970 }, { "epoch": 0.9114045618247298, - "grad_norm": 0.516943097114563, + "grad_norm": 0.5187402963638306, "learning_rate": 1.0609142066950157e-06, - "loss": 1.0318, + "loss": 1.032, "num_input_tokens_seen": 155484160, "step": 18980 }, { "epoch": 0.9118847539015607, - "grad_norm": 0.5084674954414368, + "grad_norm": 0.5067307949066162, "learning_rate": 1.0495258824372578e-06, - "loss": 0.837, + "loss": 0.8363, "num_input_tokens_seen": 155566080, "step": 18990 }, { "epoch": 0.9123649459783914, - "grad_norm": 0.5022624135017395, + "grad_norm": 0.5005121827125549, "learning_rate": 1.0381977014966543e-06, - "loss": 0.7732, + "loss": 0.7728, "num_input_tokens_seen": 155648000, "step": 19000 }, { "epoch": 0.912845138055222, - "grad_norm": 0.47421613335609436, + "grad_norm": 0.4743783473968506, "learning_rate": 1.0269296923199972e-06, - "loss": 1.0176, + "loss": 1.018, "num_input_tokens_seen": 155729920, "step": 19010 }, { "epoch": 0.9133253301320529, - "grad_norm": 0.4957895278930664, + "grad_norm": 0.49608567357063293, "learning_rate": 1.0157218832029969e-06, - "loss": 1.012, + "loss": 1.0119, "num_input_tokens_seen": 155811840, "step": 19020 }, { "epoch": 0.9138055222088836, - "grad_norm": 0.5938453078269958, + "grad_norm": 0.5876844525337219, "learning_rate": 1.0045743022901787e-06, - "loss": 1.009, + "loss": 1.0094, "num_input_tokens_seen": 155893760, "step": 19030 }, { "epoch": 0.9142857142857143, - "grad_norm": 0.46902185678482056, + "grad_norm": 0.4514719247817993, "learning_rate": 9.934869775748258e-07, - "loss": 0.7293, + "loss": 0.7286, "num_input_tokens_seen": 155975680, "step": 19040 }, { "epoch": 0.9147659063625451, - "grad_norm": 0.4982973337173462, + "grad_norm": 0.4981219470500946, "learning_rate": 9.824599368989163e-07, "loss": 0.8924, "num_input_tokens_seen": 156057600, @@ -15250,103 +15250,103 @@ }, { "epoch": 0.9152460984393758, - "grad_norm": 0.47380825877189636, + "grad_norm": 0.4736618101596832, "learning_rate": 9.714932079530476e-07, - "loss": 0.8136, + "loss": 0.814, "num_input_tokens_seen": 156139520, "step": 19060 }, { "epoch": 0.9157262905162065, - "grad_norm": 0.43160074949264526, + "grad_norm": 0.4343636929988861, "learning_rate": 9.60586818276349e-07, - "loss": 0.7444, + "loss": 0.7436, "num_input_tokens_seen": 156221440, "step": 19070 }, { "epoch": 0.9162064825930372, - "grad_norm": 0.5274019837379456, + "grad_norm": 0.37545332312583923, "learning_rate": 9.497407952564485e-07, - "loss": 0.7502, + "loss": 0.7512, "num_input_tokens_seen": 156303360, "step": 19080 }, { "epoch": 0.916686674669868, - "grad_norm": 0.5028293132781982, + "grad_norm": 0.5027362108230591, "learning_rate": 9.389551661293683e-07, - "loss": 0.9223, + "loss": 0.9225, "num_input_tokens_seen": 156385280, "step": 19090 }, { "epoch": 0.9171668667466987, - "grad_norm": 0.4965846538543701, + "grad_norm": 0.4961453676223755, "learning_rate": 9.282299579794789e-07, - "loss": 0.8431, + "loss": 0.8426, "num_input_tokens_seen": 156467200, "step": 19100 }, { "epoch": 0.9176470588235294, - "grad_norm": 1.1068954467773438, + "grad_norm": 1.0757991075515747, "learning_rate": 9.175651977394284e-07, - "loss": 0.8672, + "loss": 0.867, "num_input_tokens_seen": 156549120, "step": 19110 }, { "epoch": 0.9181272509003602, - "grad_norm": 0.4944973886013031, + "grad_norm": 0.49529650807380676, "learning_rate": 9.069609121900663e-07, - "loss": 0.8328, + "loss": 0.8332, "num_input_tokens_seen": 156631040, "step": 19120 }, { "epoch": 0.9186074429771909, - "grad_norm": 0.47309666872024536, + "grad_norm": 0.4741530120372772, "learning_rate": 8.964171279603778e-07, - "loss": 0.7688, + "loss": 0.7689, "num_input_tokens_seen": 156712960, "step": 19130 }, { "epoch": 0.9190876350540216, - "grad_norm": 0.4812490940093994, + "grad_norm": 0.48308661580085754, "learning_rate": 8.859338715274279e-07, - "loss": 0.7694, + "loss": 0.7697, "num_input_tokens_seen": 156794880, "step": 19140 }, { "epoch": 0.9195678271308524, - "grad_norm": 0.7439998984336853, + "grad_norm": 0.7624333500862122, "learning_rate": 8.755111692162837e-07, - "loss": 0.9128, + "loss": 0.9131, "num_input_tokens_seen": 156876800, "step": 19150 }, { "epoch": 0.9200480192076831, - "grad_norm": 1.6029683351516724, + "grad_norm": 1.5886449813842773, "learning_rate": 8.651490471999424e-07, - "loss": 0.9739, + "loss": 0.9745, "num_input_tokens_seen": 156958720, "step": 19160 }, { "epoch": 0.9205282112845138, - "grad_norm": 0.5646688938140869, + "grad_norm": 0.569049596786499, "learning_rate": 8.548475314992949e-07, - "loss": 0.9776, + "loss": 0.9774, "num_input_tokens_seen": 157040640, "step": 19170 }, { "epoch": 0.9210084033613445, - "grad_norm": 0.4853648841381073, + "grad_norm": 0.4898834824562073, "learning_rate": 8.446066479830206e-07, "loss": 0.9048, "num_input_tokens_seen": 157122560, @@ -15354,63 +15354,63 @@ }, { "epoch": 0.9214885954381753, - "grad_norm": 0.5098863840103149, + "grad_norm": 0.5096047520637512, "learning_rate": 8.344264223675485e-07, - "loss": 0.8928, + "loss": 0.8926, "num_input_tokens_seen": 157204480, "step": 19190 }, { "epoch": 0.921968787515006, - "grad_norm": 0.4745608866214752, + "grad_norm": 0.46778738498687744, "learning_rate": 8.243068802169906e-07, - "loss": 0.8414, + "loss": 0.8408, "num_input_tokens_seen": 157286400, "step": 19200 }, { "epoch": 0.9224489795918367, - "grad_norm": 0.555088996887207, + "grad_norm": 0.5187940001487732, "learning_rate": 8.14248046943078e-07, - "loss": 0.9802, + "loss": 0.9799, "num_input_tokens_seen": 157368320, "step": 19210 }, { "epoch": 0.9229291716686675, - "grad_norm": 0.5188843011856079, + "grad_norm": 0.5219123959541321, "learning_rate": 8.042499478050719e-07, - "loss": 0.7226, + "loss": 0.7224, "num_input_tokens_seen": 157450240, "step": 19220 }, { "epoch": 0.9234093637454982, - "grad_norm": 0.4765109121799469, + "grad_norm": 0.47635209560394287, "learning_rate": 7.943126079097418e-07, - "loss": 0.8491, + "loss": 0.8495, "num_input_tokens_seen": 157532160, "step": 19230 }, { "epoch": 0.9238895558223289, - "grad_norm": 1.0354602336883545, + "grad_norm": 1.093017339706421, "learning_rate": 7.844360522112737e-07, - "loss": 0.9703, + "loss": 0.9701, "num_input_tokens_seen": 157614080, "step": 19240 }, { "epoch": 0.9243697478991597, - "grad_norm": 0.48967981338500977, + "grad_norm": 0.4886617660522461, "learning_rate": 7.746203055112145e-07, - "loss": 0.8683, + "loss": 0.8682, "num_input_tokens_seen": 157696000, "step": 19250 }, { "epoch": 0.9248499399759904, - "grad_norm": 0.553525984287262, + "grad_norm": 0.5579150319099426, "learning_rate": 7.648653924584137e-07, "loss": 0.9395, "num_input_tokens_seen": 157777920, @@ -15418,7 +15418,7 @@ }, { "epoch": 0.9253301320528211, - "grad_norm": 0.47739177942276, + "grad_norm": 0.458242803812027, "learning_rate": 7.55171337548946e-07, "loss": 0.84, "num_input_tokens_seen": 157859840, @@ -15426,7 +15426,7 @@ }, { "epoch": 0.9258103241296518, - "grad_norm": 0.47741973400115967, + "grad_norm": 0.475402295589447, "learning_rate": 7.455381651260807e-07, "loss": 0.7814, "num_input_tokens_seen": 157941760, @@ -15434,15 +15434,15 @@ }, { "epoch": 0.9262905162064826, - "grad_norm": 0.49380743503570557, + "grad_norm": 0.4926086962223053, "learning_rate": 7.359658993801894e-07, - "loss": 0.954, + "loss": 0.9542, "num_input_tokens_seen": 158023680, "step": 19290 }, { "epoch": 0.9267707082833133, - "grad_norm": 0.5180758833885193, + "grad_norm": 0.5198042392730713, "learning_rate": 7.264545643486997e-07, "loss": 0.9109, "num_input_tokens_seen": 158105600, @@ -15450,95 +15450,95 @@ }, { "epoch": 0.927250900360144, - "grad_norm": 0.7896413803100586, + "grad_norm": 0.7973329424858093, "learning_rate": 7.170041839160368e-07, - "loss": 0.9779, + "loss": 0.9783, "num_input_tokens_seen": 158187520, "step": 19310 }, { "epoch": 0.9277310924369748, - "grad_norm": 0.48943135142326355, + "grad_norm": 0.4901650547981262, "learning_rate": 7.076147818135537e-07, - "loss": 0.8145, + "loss": 0.8147, "num_input_tokens_seen": 158269440, "step": 19320 }, { "epoch": 0.9282112845138055, - "grad_norm": 1.7075966596603394, + "grad_norm": 1.725703477859497, "learning_rate": 6.982863816194812e-07, - "loss": 0.7036, + "loss": 0.7043, "num_input_tokens_seen": 158351360, "step": 19330 }, { "epoch": 0.9286914765906362, - "grad_norm": 0.48135149478912354, + "grad_norm": 0.46620166301727295, "learning_rate": 6.890190067588648e-07, - "loss": 0.8395, + "loss": 0.8402, "num_input_tokens_seen": 158433280, "step": 19340 }, { "epoch": 0.929171668667467, - "grad_norm": 0.43532514572143555, + "grad_norm": 0.4360329806804657, "learning_rate": 6.798126805035082e-07, - "loss": 0.9633, + "loss": 0.9634, "num_input_tokens_seen": 158515200, "step": 19350 }, { "epoch": 0.9296518607442977, - "grad_norm": 0.5122432708740234, + "grad_norm": 0.5116370916366577, "learning_rate": 6.706674259719048e-07, - "loss": 0.9551, + "loss": 0.9554, "num_input_tokens_seen": 158597120, "step": 19360 }, { "epoch": 0.9301320528211284, - "grad_norm": 0.4708029627799988, + "grad_norm": 0.4693267345428467, "learning_rate": 6.615832661291954e-07, - "loss": 0.895, + "loss": 0.8942, "num_input_tokens_seen": 158679040, "step": 19370 }, { "epoch": 0.9306122448979591, - "grad_norm": 0.5076829195022583, + "grad_norm": 0.5132554769515991, "learning_rate": 6.525602237870993e-07, - "loss": 0.7502, + "loss": 0.7503, "num_input_tokens_seen": 158760960, "step": 19380 }, { "epoch": 0.9310924369747899, - "grad_norm": 0.5408411622047424, + "grad_norm": 0.5355455279350281, "learning_rate": 6.435983216038583e-07, - "loss": 0.693, + "loss": 0.6928, "num_input_tokens_seen": 158842880, "step": 19390 }, { "epoch": 0.9315726290516206, - "grad_norm": 0.5284355282783508, + "grad_norm": 0.5428586006164551, "learning_rate": 6.346975820841927e-07, - "loss": 0.8507, + "loss": 0.8504, "num_input_tokens_seen": 158924800, "step": 19400 }, { "epoch": 0.9320528211284513, - "grad_norm": 0.47476431727409363, + "grad_norm": 0.474017471075058, "learning_rate": 6.258580275792153e-07, - "loss": 0.9174, + "loss": 0.9175, "num_input_tokens_seen": 159006720, "step": 19410 }, { "epoch": 0.9325330132052821, - "grad_norm": 0.4776346981525421, + "grad_norm": 0.47815394401550293, "learning_rate": 6.170796802864115e-07, "loss": 0.7601, "num_input_tokens_seen": 159088640, @@ -15546,183 +15546,183 @@ }, { "epoch": 0.9330132052821128, - "grad_norm": 0.5064809322357178, + "grad_norm": 0.5085130333900452, "learning_rate": 6.083625622495565e-07, - "loss": 0.8922, + "loss": 0.8923, "num_input_tokens_seen": 159170560, "step": 19430 }, { "epoch": 0.9334933973589435, - "grad_norm": 0.48573869466781616, + "grad_norm": 0.4867092967033386, "learning_rate": 5.997066953586761e-07, - "loss": 0.8506, + "loss": 0.8514, "num_input_tokens_seen": 159252480, "step": 19440 }, { "epoch": 0.9339735894357744, - "grad_norm": 0.48475074768066406, + "grad_norm": 0.48415231704711914, "learning_rate": 5.911121013499721e-07, - "loss": 0.801, + "loss": 0.8006, "num_input_tokens_seen": 159334400, "step": 19450 }, { "epoch": 0.934453781512605, - "grad_norm": 0.5202433466911316, + "grad_norm": 0.4926071763038635, "learning_rate": 5.825788018057971e-07, - "loss": 0.8358, + "loss": 0.8361, "num_input_tokens_seen": 159416320, "step": 19460 }, { "epoch": 0.9349339735894358, - "grad_norm": 0.48488256335258484, + "grad_norm": 0.485619455575943, "learning_rate": 5.741068181545684e-07, - "loss": 1.0555, + "loss": 1.0557, "num_input_tokens_seen": 159498240, "step": 19470 }, { "epoch": 0.9354141656662666, - "grad_norm": 0.467579185962677, + "grad_norm": 0.4691498875617981, "learning_rate": 5.656961716707459e-07, - "loss": 1.184, + "loss": 1.1845, "num_input_tokens_seen": 159580160, "step": 19480 }, { "epoch": 0.9358943577430973, - "grad_norm": 0.47693246603012085, + "grad_norm": 0.48349183797836304, "learning_rate": 5.57346883474752e-07, - "loss": 0.8181, + "loss": 0.818, "num_input_tokens_seen": 159662080, "step": 19490 }, { "epoch": 0.936374549819928, - "grad_norm": 0.3317853808403015, + "grad_norm": 0.3223581314086914, "learning_rate": 5.490589745329261e-07, - "loss": 0.9217, + "loss": 0.9216, "num_input_tokens_seen": 159744000, "step": 19500 }, { "epoch": 0.9368547418967587, - "grad_norm": 0.48513829708099365, + "grad_norm": 0.4837208688259125, "learning_rate": 5.408324656574842e-07, - "loss": 1.0929, + "loss": 1.0933, "num_input_tokens_seen": 159825920, "step": 19510 }, { "epoch": 0.9373349339735895, - "grad_norm": 0.5054202675819397, + "grad_norm": 0.5018269419670105, "learning_rate": 5.326673775064545e-07, - "loss": 0.9711, + "loss": 0.9709, "num_input_tokens_seen": 159907840, "step": 19520 }, { "epoch": 0.9378151260504202, - "grad_norm": 0.48606500029563904, + "grad_norm": 0.4863048791885376, "learning_rate": 5.245637305836243e-07, - "loss": 0.872, + "loss": 0.8722, "num_input_tokens_seen": 159989760, "step": 19530 }, { "epoch": 0.9382953181272509, - "grad_norm": 0.47018781304359436, + "grad_norm": 0.4706777036190033, "learning_rate": 5.165215452384936e-07, - "loss": 0.8849, + "loss": 0.8854, "num_input_tokens_seen": 160071680, "step": 19540 }, { "epoch": 0.9387755102040817, - "grad_norm": 0.5236411094665527, + "grad_norm": 0.5222617983818054, "learning_rate": 5.085408416662274e-07, - "loss": 0.8509, + "loss": 0.8518, "num_input_tokens_seen": 160153600, "step": 19550 }, { "epoch": 0.9392557022809124, - "grad_norm": 0.45992419123649597, + "grad_norm": 0.46015259623527527, "learning_rate": 5.006216399075947e-07, - "loss": 0.9798, + "loss": 0.9801, "num_input_tokens_seen": 160235520, "step": 19560 }, { "epoch": 0.9397358943577431, - "grad_norm": 0.5223960280418396, + "grad_norm": 0.522504448890686, "learning_rate": 4.92763959848927e-07, - "loss": 0.8332, + "loss": 0.8329, "num_input_tokens_seen": 160317440, "step": 19570 }, { "epoch": 0.9402160864345739, - "grad_norm": 0.4571020305156708, + "grad_norm": 0.45878833532333374, "learning_rate": 4.849678212220682e-07, - "loss": 0.9557, + "loss": 0.9552, "num_input_tokens_seen": 160399360, "step": 19580 }, { "epoch": 0.9406962785114046, - "grad_norm": 0.4470067620277405, + "grad_norm": 0.44788938760757446, "learning_rate": 4.772332436043165e-07, - "loss": 0.7306, + "loss": 0.7305, "num_input_tokens_seen": 160481280, "step": 19590 }, { "epoch": 0.9411764705882353, - "grad_norm": 0.43297266960144043, + "grad_norm": 0.4315415322780609, "learning_rate": 4.6956024641838237e-07, - "loss": 0.9514, + "loss": 0.9516, "num_input_tokens_seen": 160563200, "step": 19600 }, { "epoch": 0.941656662665066, - "grad_norm": 0.49218934774398804, + "grad_norm": 0.49236106872558594, "learning_rate": 4.619488489323448e-07, - "loss": 0.9364, + "loss": 0.9362, "num_input_tokens_seen": 160645120, "step": 19610 }, { "epoch": 0.9421368547418968, - "grad_norm": 0.440889447927475, + "grad_norm": 0.44332748651504517, "learning_rate": 4.5439907025958405e-07, - "loss": 1.0067, + "loss": 1.006, "num_input_tokens_seen": 160727040, "step": 19620 }, { "epoch": 0.9426170468187275, - "grad_norm": 0.4658232033252716, + "grad_norm": 0.46615105867385864, "learning_rate": 4.4691092935876256e-07, - "loss": 0.985, + "loss": 0.9851, "num_input_tokens_seen": 160808960, "step": 19630 }, { "epoch": 0.9430972388955582, - "grad_norm": 0.4865157902240753, + "grad_norm": 0.48708418011665344, "learning_rate": 4.394844450337443e-07, - "loss": 1.1212, + "loss": 1.1214, "num_input_tokens_seen": 160890880, "step": 19640 }, { "epoch": 0.943577430972389, - "grad_norm": 0.782720685005188, + "grad_norm": 0.7900726199150085, "learning_rate": 4.3211963593357275e-07, "loss": 1.0098, "num_input_tokens_seen": 160972800, @@ -15730,143 +15730,143 @@ }, { "epoch": 0.9440576230492197, - "grad_norm": 0.4879094660282135, + "grad_norm": 0.48821115493774414, "learning_rate": 4.248165205524152e-07, - "loss": 0.8137, + "loss": 0.8139, "num_input_tokens_seen": 161054720, "step": 19660 }, { "epoch": 0.9445378151260504, - "grad_norm": 0.45209428668022156, + "grad_norm": 0.44911932945251465, "learning_rate": 4.175751172295156e-07, - "loss": 0.9042, + "loss": 0.9041, "num_input_tokens_seen": 161136640, "step": 19670 }, { "epoch": 0.9450180072028812, - "grad_norm": 0.4872719347476959, + "grad_norm": 0.4880870580673218, "learning_rate": 4.1039544414914753e-07, - "loss": 0.8619, + "loss": 0.8617, "num_input_tokens_seen": 161218560, "step": 19680 }, { "epoch": 0.9454981992797119, - "grad_norm": 0.7091497182846069, + "grad_norm": 0.7131670117378235, "learning_rate": 4.032775193405752e-07, - "loss": 0.8915, + "loss": 0.8918, "num_input_tokens_seen": 161300480, "step": 19690 }, { "epoch": 0.9459783913565426, - "grad_norm": 0.45862463116645813, + "grad_norm": 0.4578578770160675, "learning_rate": 3.96221360677998e-07, - "loss": 0.815, + "loss": 0.8151, "num_input_tokens_seen": 161382400, "step": 19700 }, { "epoch": 0.9464585834333733, - "grad_norm": 0.5041652321815491, + "grad_norm": 0.5045084953308105, "learning_rate": 3.892269858805142e-07, - "loss": 0.9076, + "loss": 0.9074, "num_input_tokens_seen": 161464320, "step": 19710 }, { "epoch": 0.9469387755102041, - "grad_norm": 0.4937167763710022, + "grad_norm": 0.4939689040184021, "learning_rate": 3.82294412512077e-07, - "loss": 0.8012, + "loss": 0.8005, "num_input_tokens_seen": 161546240, "step": 19720 }, { "epoch": 0.9474189675870348, - "grad_norm": 0.44221341609954834, + "grad_norm": 0.44345298409461975, "learning_rate": 3.7542365798143573e-07, - "loss": 0.8382, + "loss": 0.8381, "num_input_tokens_seen": 161628160, "step": 19730 }, { "epoch": 0.9478991596638655, - "grad_norm": 0.4632039964199066, + "grad_norm": 0.4660447835922241, "learning_rate": 3.6861473954210855e-07, - "loss": 1.0418, + "loss": 1.0417, "num_input_tokens_seen": 161710080, "step": 19740 }, { "epoch": 0.9483793517406963, - "grad_norm": 0.4950004518032074, + "grad_norm": 0.49638691544532776, "learning_rate": 3.6186767429234323e-07, - "loss": 0.7393, + "loss": 0.7395, "num_input_tokens_seen": 161792000, "step": 19750 }, { "epoch": 0.948859543817527, - "grad_norm": 0.4772696793079376, + "grad_norm": 0.4776581823825836, "learning_rate": 3.5518247917505077e-07, - "loss": 0.8615, + "loss": 0.8611, "num_input_tokens_seen": 161873920, "step": 19760 }, { "epoch": 0.9493397358943577, - "grad_norm": 0.4546825885772705, + "grad_norm": 0.4587015211582184, "learning_rate": 3.485591709777802e-07, - "loss": 1.041, + "loss": 1.0409, "num_input_tokens_seen": 161955840, "step": 19770 }, { "epoch": 0.9498199279711885, - "grad_norm": 0.4924812912940979, + "grad_norm": 0.4919114112854004, "learning_rate": 3.419977663326801e-07, - "loss": 0.813, + "loss": 0.8133, "num_input_tokens_seen": 162037760, "step": 19780 }, { "epoch": 0.9503001200480192, - "grad_norm": 0.5705674290657043, + "grad_norm": 0.571787416934967, "learning_rate": 3.3549828171644537e-07, - "loss": 1.2698, + "loss": 1.2694, "num_input_tokens_seen": 162119680, "step": 19790 }, { "epoch": 0.9507803121248499, - "grad_norm": 0.48878228664398193, + "grad_norm": 0.48567476868629456, "learning_rate": 3.29060733450276e-07, - "loss": 0.8896, + "loss": 0.8892, "num_input_tokens_seen": 162201600, "step": 19800 }, { "epoch": 0.9512605042016806, - "grad_norm": 0.4292643070220947, + "grad_norm": 0.42919716238975525, "learning_rate": 3.2268513769984634e-07, - "loss": 1.0284, + "loss": 1.028, "num_input_tokens_seen": 162283520, "step": 19810 }, { "epoch": 0.9517406962785114, - "grad_norm": 0.715688169002533, + "grad_norm": 0.7574768662452698, "learning_rate": 3.163715104752524e-07, - "loss": 0.9543, + "loss": 0.9548, "num_input_tokens_seen": 162365440, "step": 19820 }, { "epoch": 0.9522208883553421, - "grad_norm": 0.46790245175361633, + "grad_norm": 0.46974584460258484, "learning_rate": 3.101198676309841e-07, "loss": 0.9059, "num_input_tokens_seen": 162447360, @@ -15874,87 +15874,87 @@ }, { "epoch": 0.9527010804321728, - "grad_norm": 0.48006242513656616, + "grad_norm": 0.48091816902160645, "learning_rate": 3.039302248658754e-07, - "loss": 0.9259, + "loss": 0.9254, "num_input_tokens_seen": 162529280, "step": 19840 }, { "epoch": 0.9531812725090036, - "grad_norm": 0.3433704376220703, + "grad_norm": 0.35266348719596863, "learning_rate": 2.978025977230736e-07, - "loss": 1.0082, + "loss": 1.0085, "num_input_tokens_seen": 162611200, "step": 19850 }, { "epoch": 0.9536614645858343, - "grad_norm": 0.4852867126464844, + "grad_norm": 0.48670434951782227, "learning_rate": 2.91737001589984e-07, - "loss": 0.9153, + "loss": 0.9163, "num_input_tokens_seen": 162693120, "step": 19860 }, { "epoch": 0.954141656662665, - "grad_norm": 0.5154809951782227, + "grad_norm": 0.5381135940551758, "learning_rate": 2.8573345169825296e-07, - "loss": 0.9446, + "loss": 0.9455, "num_input_tokens_seen": 162775040, "step": 19870 }, { "epoch": 0.9546218487394958, - "grad_norm": 0.707761287689209, + "grad_norm": 0.7383513450622559, "learning_rate": 2.797919631237156e-07, - "loss": 0.7736, + "loss": 0.774, "num_input_tokens_seen": 162856960, "step": 19880 }, { "epoch": 0.9551020408163265, - "grad_norm": 0.3873317837715149, + "grad_norm": 0.37961041927337646, "learning_rate": 2.739125507863649e-07, - "loss": 0.9343, + "loss": 0.935, "num_input_tokens_seen": 162938880, "step": 19890 }, { "epoch": 0.9555822328931572, - "grad_norm": 0.48126840591430664, + "grad_norm": 0.48009783029556274, "learning_rate": 2.680952294503075e-07, - "loss": 0.9554, + "loss": 0.9552, "num_input_tokens_seen": 163020800, "step": 19900 }, { "epoch": 0.956062424969988, - "grad_norm": 0.4585091471672058, + "grad_norm": 0.45567837357521057, "learning_rate": 2.6234001372372194e-07, - "loss": 0.9323, + "loss": 0.9321, "num_input_tokens_seen": 163102720, "step": 19910 }, { "epoch": 0.9565426170468188, - "grad_norm": 0.5132349729537964, + "grad_norm": 0.5141881108283997, "learning_rate": 2.5664691805884767e-07, - "loss": 0.9947, + "loss": 0.9948, "num_input_tokens_seen": 163184640, "step": 19920 }, { "epoch": 0.9570228091236495, - "grad_norm": 0.5150689482688904, + "grad_norm": 0.5147581100463867, "learning_rate": 2.5101595675191827e-07, - "loss": 0.9797, + "loss": 0.9796, "num_input_tokens_seen": 163266560, "step": 19930 }, { "epoch": 0.9575030012004802, - "grad_norm": 0.46773561835289, + "grad_norm": 0.4667784571647644, "learning_rate": 2.4544714394314215e-07, "loss": 0.9892, "num_input_tokens_seen": 163348480, @@ -15962,15 +15962,15 @@ }, { "epoch": 0.957983193277311, - "grad_norm": 0.7742196917533875, + "grad_norm": 0.7636064887046814, "learning_rate": 2.399404936166638e-07, - "loss": 1.171, + "loss": 1.1706, "num_input_tokens_seen": 163430400, "step": 19950 }, { "epoch": 0.9584633853541417, - "grad_norm": 0.4795765280723572, + "grad_norm": 0.4796493947505951, "learning_rate": 2.3449601960052746e-07, "loss": 0.7551, "num_input_tokens_seen": 163512320, @@ -15978,15 +15978,15 @@ }, { "epoch": 0.9589435774309724, - "grad_norm": 0.46741360425949097, + "grad_norm": 0.46821102499961853, "learning_rate": 2.2911373556664118e-07, - "loss": 0.7974, + "loss": 0.7976, "num_input_tokens_seen": 163594240, "step": 19970 }, { "epoch": 0.9594237695078032, - "grad_norm": 0.47377344965934753, + "grad_norm": 0.4732038378715515, "learning_rate": 2.23793655030749e-07, "loss": 0.8963, "num_input_tokens_seen": 163676160, @@ -15994,39 +15994,39 @@ }, { "epoch": 0.9599039615846339, - "grad_norm": 0.5991731286048889, + "grad_norm": 0.5945155620574951, "learning_rate": 2.1853579135238667e-07, - "loss": 0.9377, + "loss": 0.9375, "num_input_tokens_seen": 163758080, "step": 19990 }, { "epoch": 0.9603841536614646, - "grad_norm": 0.4813549816608429, + "grad_norm": 0.48151063919067383, "learning_rate": 2.1334015773486203e-07, - "loss": 1.0792, + "loss": 1.0805, "num_input_tokens_seen": 163840000, "step": 20000 }, { "epoch": 0.9608643457382953, - "grad_norm": 0.5007798075675964, + "grad_norm": 0.5012336373329163, "learning_rate": 2.0820676722520526e-07, - "loss": 0.7725, + "loss": 0.7723, "num_input_tokens_seen": 163921920, "step": 20010 }, { "epoch": 0.9613445378151261, - "grad_norm": 0.49032649397850037, + "grad_norm": 0.489702433347702, "learning_rate": 2.0313563271414927e-07, - "loss": 0.8982, + "loss": 0.8981, "num_input_tokens_seen": 164003840, "step": 20020 }, { "epoch": 0.9618247298919568, - "grad_norm": 0.49168848991394043, + "grad_norm": 0.4924655854701996, "learning_rate": 1.9812676693608812e-07, "loss": 0.7376, "num_input_tokens_seen": 164085760, @@ -16034,47 +16034,47 @@ }, { "epoch": 0.9623049219687875, - "grad_norm": 0.47681140899658203, + "grad_norm": 0.47769734263420105, "learning_rate": 1.9318018246905488e-07, - "loss": 0.8378, + "loss": 0.8382, "num_input_tokens_seen": 164167680, "step": 20040 }, { "epoch": 0.9627851140456183, - "grad_norm": 0.4583689272403717, + "grad_norm": 0.4559698700904846, "learning_rate": 1.8829589173468552e-07, - "loss": 0.8883, + "loss": 0.8877, "num_input_tokens_seen": 164249600, "step": 20050 }, { "epoch": 0.963265306122449, - "grad_norm": 0.3779040575027466, + "grad_norm": 0.36575424671173096, "learning_rate": 1.8347390699817724e-07, - "loss": 0.7442, + "loss": 0.7448, "num_input_tokens_seen": 164331520, "step": 20060 }, { "epoch": 0.9637454981992797, - "grad_norm": 0.571790874004364, + "grad_norm": 0.5725572109222412, "learning_rate": 1.7871424036828288e-07, - "loss": 0.9274, + "loss": 0.9267, "num_input_tokens_seen": 164413440, "step": 20070 }, { "epoch": 0.9642256902761105, - "grad_norm": 0.5354510545730591, + "grad_norm": 0.5294426083564758, "learning_rate": 1.7401690379724722e-07, - "loss": 0.9755, + "loss": 0.9753, "num_input_tokens_seen": 164495360, "step": 20080 }, { "epoch": 0.9647058823529412, - "grad_norm": 0.5626926422119141, + "grad_norm": 0.5594549179077148, "learning_rate": 1.6938190908080688e-07, "loss": 0.8417, "num_input_tokens_seen": 164577280, @@ -16082,31 +16082,31 @@ }, { "epoch": 0.9651860744297719, - "grad_norm": 0.5134274363517761, + "grad_norm": 0.5328065752983093, "learning_rate": 1.6480926785814866e-07, - "loss": 0.7835, + "loss": 0.7834, "num_input_tokens_seen": 164659200, "step": 20100 }, { "epoch": 0.9656662665066026, - "grad_norm": 0.5222331881523132, + "grad_norm": 0.5204812288284302, "learning_rate": 1.6029899161187079e-07, - "loss": 1.0117, + "loss": 1.0118, "num_input_tokens_seen": 164741120, "step": 20110 }, { "epoch": 0.9661464585834334, - "grad_norm": 0.5277162194252014, + "grad_norm": 0.5157402753829956, "learning_rate": 1.5585109166796896e-07, - "loss": 0.7976, + "loss": 0.7969, "num_input_tokens_seen": 164823040, "step": 20120 }, { "epoch": 0.9666266506602641, - "grad_norm": 0.5840759873390198, + "grad_norm": 0.5858813524246216, "learning_rate": 1.5146557919581138e-07, "loss": 0.9373, "num_input_tokens_seen": 164904960, @@ -16114,7 +16114,7 @@ }, { "epoch": 0.9671068427370948, - "grad_norm": 0.4930008053779602, + "grad_norm": 0.4909372627735138, "learning_rate": 1.4714246520808328e-07, "loss": 0.717, "num_input_tokens_seen": 164986880, @@ -16122,31 +16122,31 @@ }, { "epoch": 0.9675870348139256, - "grad_norm": 0.49782538414001465, + "grad_norm": 0.4969266653060913, "learning_rate": 1.4288176056079238e-07, - "loss": 0.8526, + "loss": 0.8529, "num_input_tokens_seen": 165068800, "step": 20150 }, { "epoch": 0.9680672268907563, - "grad_norm": 0.48981910943984985, + "grad_norm": 0.4890909194946289, "learning_rate": 1.3868347595322184e-07, - "loss": 0.8959, + "loss": 0.8958, "num_input_tokens_seen": 165150720, "step": 20160 }, { "epoch": 0.968547418967587, - "grad_norm": 0.46395954489707947, + "grad_norm": 0.4637242257595062, "learning_rate": 1.3454762192790794e-07, - "loss": 1.1636, + "loss": 1.1627, "num_input_tokens_seen": 165232640, "step": 20170 }, { "epoch": 0.9690276110444178, - "grad_norm": 0.4878246784210205, + "grad_norm": 0.48529329895973206, "learning_rate": 1.3047420887061513e-07, "loss": 1.0033, "num_input_tokens_seen": 165314560, @@ -16154,79 +16154,79 @@ }, { "epoch": 0.9695078031212485, - "grad_norm": 0.45499855279922485, + "grad_norm": 0.4544067084789276, "learning_rate": 1.264632470103111e-07, - "loss": 0.8926, + "loss": 0.8922, "num_input_tokens_seen": 165396480, "step": 20190 }, { "epoch": 0.9699879951980792, - "grad_norm": 0.4934712052345276, + "grad_norm": 0.49211665987968445, "learning_rate": 1.225147464191334e-07, - "loss": 0.9251, + "loss": 0.9253, "num_input_tokens_seen": 165478400, "step": 20200 }, { "epoch": 0.9704681872749099, - "grad_norm": 0.48829373717308044, + "grad_norm": 0.4904029667377472, "learning_rate": 1.1862871701237288e-07, - "loss": 0.754, + "loss": 0.7544, "num_input_tokens_seen": 165560320, "step": 20210 }, { "epoch": 0.9709483793517407, - "grad_norm": 0.4913751184940338, + "grad_norm": 0.490373432636261, "learning_rate": 1.1480516854844858e-07, - "loss": 0.799, + "loss": 0.7983, "num_input_tokens_seen": 165642240, "step": 20220 }, { "epoch": 0.9714285714285714, - "grad_norm": 0.5303522944450378, + "grad_norm": 0.5297425985336304, "learning_rate": 1.1104411062887732e-07, - "loss": 0.797, + "loss": 0.7969, "num_input_tokens_seen": 165724160, "step": 20230 }, { "epoch": 0.9719087635054021, - "grad_norm": 0.4915572702884674, + "grad_norm": 0.49064329266548157, "learning_rate": 1.0734555269825141e-07, - "loss": 0.7708, + "loss": 0.7713, "num_input_tokens_seen": 165806080, "step": 20240 }, { "epoch": 0.9723889555822329, - "grad_norm": 0.5113864541053772, + "grad_norm": 0.5096157789230347, "learning_rate": 1.0370950404421931e-07, - "loss": 0.7448, + "loss": 0.7452, "num_input_tokens_seen": 165888000, "step": 20250 }, { "epoch": 0.9728691476590636, - "grad_norm": 0.49266868829727173, + "grad_norm": 0.4939180314540863, "learning_rate": 1.0013597379745776e-07, - "loss": 0.8983, + "loss": 0.8992, "num_input_tokens_seen": 165969920, "step": 20260 }, { "epoch": 0.9733493397358943, - "grad_norm": 0.9062735438346863, + "grad_norm": 0.9096865653991699, "learning_rate": 9.662497093164691e-08, - "loss": 0.7648, + "loss": 0.7645, "num_input_tokens_seen": 166051840, "step": 20270 }, { "epoch": 0.9738295318127251, - "grad_norm": 0.5166149139404297, + "grad_norm": 0.5157402157783508, "learning_rate": 9.317650426345637e-08, "loss": 0.9752, "num_input_tokens_seen": 166133760, @@ -16234,39 +16234,39 @@ }, { "epoch": 0.9743097238895558, - "grad_norm": 0.4873685836791992, + "grad_norm": 0.4877472221851349, "learning_rate": 8.979058245251193e-08, - "loss": 0.8632, + "loss": 0.8634, "num_input_tokens_seen": 166215680, "step": 20290 }, { "epoch": 0.9747899159663865, - "grad_norm": 0.48087936639785767, + "grad_norm": 0.4800505042076111, "learning_rate": 8.646721400138724e-08, - "loss": 1.0177, + "loss": 1.0172, "num_input_tokens_seen": 166297600, "step": 20300 }, { "epoch": 0.9752701080432172, - "grad_norm": 0.4865896999835968, + "grad_norm": 0.4879034161567688, "learning_rate": 8.320640725556773e-08, - "loss": 0.9075, + "loss": 0.9079, "num_input_tokens_seen": 166379520, "step": 20310 }, { "epoch": 0.975750300120048, - "grad_norm": 0.4811105728149414, + "grad_norm": 0.4818404018878937, "learning_rate": 8.000817040344222e-08, - "loss": 0.9434, + "loss": 0.9436, "num_input_tokens_seen": 166461440, "step": 20320 }, { "epoch": 0.9762304921968787, - "grad_norm": 0.47913140058517456, + "grad_norm": 0.4712151288986206, "learning_rate": 7.687251147627251e-08, "loss": 0.8307, "num_input_tokens_seen": 166543360, @@ -16274,127 +16274,127 @@ }, { "epoch": 0.9767106842737094, - "grad_norm": 0.4741958677768707, + "grad_norm": 0.4749000072479248, "learning_rate": 7.379943834818214e-08, - "loss": 0.7942, + "loss": 0.7941, "num_input_tokens_seen": 166625280, "step": 20340 }, { "epoch": 0.9771908763505402, - "grad_norm": 0.492371529340744, + "grad_norm": 0.4928759038448334, "learning_rate": 7.0788958736126e-08, - "loss": 0.6887, + "loss": 0.6886, "num_input_tokens_seen": 166707200, "step": 20350 }, { "epoch": 0.977671068427371, - "grad_norm": 0.5118972659111023, + "grad_norm": 0.5131922960281372, "learning_rate": 6.784108019988189e-08, - "loss": 0.8574, + "loss": 0.8575, "num_input_tokens_seen": 166789120, "step": 20360 }, { "epoch": 0.9781512605042016, - "grad_norm": 0.387319952249527, + "grad_norm": 0.3878309726715088, "learning_rate": 6.495581014202556e-08, - "loss": 0.7726, + "loss": 0.7712, "num_input_tokens_seen": 166871040, "step": 20370 }, { "epoch": 0.9786314525810325, - "grad_norm": 0.494081050157547, + "grad_norm": 0.4928576350212097, "learning_rate": 6.213315580791135e-08, - "loss": 0.9247, + "loss": 0.9243, "num_input_tokens_seen": 166952960, "step": 20380 }, { "epoch": 0.9791116446578632, - "grad_norm": 0.5668483376502991, + "grad_norm": 0.5688450336456299, "learning_rate": 5.9373124285661e-08, - "loss": 0.8177, + "loss": 0.8171, "num_input_tokens_seen": 167034880, "step": 20390 }, { "epoch": 0.9795918367346939, - "grad_norm": 0.5243094563484192, + "grad_norm": 0.5241358876228333, "learning_rate": 5.6675722506135956e-08, - "loss": 0.8603, + "loss": 0.86, "num_input_tokens_seen": 167116800, "step": 20400 }, { "epoch": 0.9800720288115247, - "grad_norm": 0.5012032389640808, + "grad_norm": 0.5009602308273315, "learning_rate": 5.404095724292346e-08, - "loss": 0.9751, + "loss": 0.9734, "num_input_tokens_seen": 167198720, "step": 20410 }, { "epoch": 0.9805522208883554, - "grad_norm": 0.48027274012565613, + "grad_norm": 0.4785056710243225, "learning_rate": 5.146883511232825e-08, - "loss": 0.9047, + "loss": 0.9059, "num_input_tokens_seen": 167280640, "step": 20420 }, { "epoch": 0.9810324129651861, - "grad_norm": 0.9182277321815491, + "grad_norm": 0.9318637251853943, "learning_rate": 4.8959362573341995e-08, - "loss": 0.8816, + "loss": 0.8805, "num_input_tokens_seen": 167362560, "step": 20430 }, { "epoch": 0.9815126050420168, - "grad_norm": 0.47748062014579773, + "grad_norm": 0.4752718508243561, "learning_rate": 4.6512545927632213e-08, - "loss": 0.8182, + "loss": 0.8186, "num_input_tokens_seen": 167444480, "step": 20440 }, { "epoch": 0.9819927971188476, - "grad_norm": 0.5384604930877686, + "grad_norm": 0.5406351685523987, "learning_rate": 4.412839131953395e-08, - "loss": 0.8999, + "loss": 0.9007, "num_input_tokens_seen": 167526400, "step": 20450 }, { "epoch": 0.9824729891956783, - "grad_norm": 0.49782511591911316, + "grad_norm": 0.4987322688102722, "learning_rate": 4.180690473602755e-08, - "loss": 0.9972, + "loss": 1.0011, "num_input_tokens_seen": 167608320, "step": 20460 }, { "epoch": 0.982953181272509, - "grad_norm": 0.46127596497535706, + "grad_norm": 0.4613893926143646, "learning_rate": 3.9548092006719275e-08, - "loss": 0.7583, + "loss": 0.7579, "num_input_tokens_seen": 167690240, "step": 20470 }, { "epoch": 0.9834333733493398, - "grad_norm": 0.8296525478363037, + "grad_norm": 0.8392557501792908, "learning_rate": 3.7351958803835685e-08, - "loss": 1.0416, + "loss": 1.0422, "num_input_tokens_seen": 167772160, "step": 20480 }, { "epoch": 0.9839135654261705, - "grad_norm": 0.48525142669677734, + "grad_norm": 0.4861963391304016, "learning_rate": 3.5218510642201496e-08, "loss": 1.1318, "num_input_tokens_seen": 167854080, @@ -16402,23 +16402,23 @@ }, { "epoch": 0.9843937575030012, - "grad_norm": 0.656819224357605, + "grad_norm": 0.6637359857559204, "learning_rate": 3.314775287923677e-08, - "loss": 1.0568, + "loss": 1.0579, "num_input_tokens_seen": 167936000, "step": 20500 }, { "epoch": 0.984873949579832, - "grad_norm": 0.49924975633621216, + "grad_norm": 0.4990644156932831, "learning_rate": 3.1139690714931945e-08, - "loss": 0.9061, + "loss": 0.9065, "num_input_tokens_seen": 168017920, "step": 20510 }, { "epoch": 0.9853541416566627, - "grad_norm": 0.47021421790122986, + "grad_norm": 0.4722457528114319, "learning_rate": 2.919432919183396e-08, "loss": 0.7413, "num_input_tokens_seen": 168099840, @@ -16426,151 +16426,151 @@ }, { "epoch": 0.9858343337334934, - "grad_norm": 0.4875717759132385, + "grad_norm": 0.48728999495506287, "learning_rate": 2.731167319505179e-08, - "loss": 0.7303, + "loss": 0.7307, "num_input_tokens_seen": 168181760, "step": 20530 }, { "epoch": 0.9863145258103241, - "grad_norm": 0.47753021121025085, + "grad_norm": 0.477203905582428, "learning_rate": 2.5491727452217616e-08, - "loss": 0.763, + "loss": 0.7627, "num_input_tokens_seen": 168263680, "step": 20540 }, { "epoch": 0.9867947178871549, - "grad_norm": 1.1384245157241821, + "grad_norm": 1.1573189496994019, "learning_rate": 2.3734496533497907e-08, - "loss": 0.9044, + "loss": 0.9042, "num_input_tokens_seen": 168345600, "step": 20550 }, { "epoch": 0.9872749099639856, - "grad_norm": 0.47026336193084717, + "grad_norm": 0.44929322600364685, "learning_rate": 2.203998485156844e-08, - "loss": 0.9609, + "loss": 0.9611, "num_input_tokens_seen": 168427520, "step": 20560 }, { "epoch": 0.9877551020408163, - "grad_norm": 0.48212888836860657, + "grad_norm": 0.48382875323295593, "learning_rate": 2.040819666160876e-08, - "loss": 1.0561, + "loss": 1.0542, "num_input_tokens_seen": 168509440, "step": 20570 }, { "epoch": 0.9882352941176471, - "grad_norm": 0.5447530150413513, + "grad_norm": 0.5462098717689514, "learning_rate": 1.8839136061288288e-08, - "loss": 1.0402, + "loss": 1.0403, "num_input_tokens_seen": 168591360, "step": 20580 }, { "epoch": 0.9887154861944778, - "grad_norm": 0.5337615609169006, + "grad_norm": 0.5324848890304565, "learning_rate": 1.7332806990758012e-08, - "loss": 1.1063, + "loss": 1.1069, "num_input_tokens_seen": 168673280, "step": 20590 }, { "epoch": 0.9891956782713085, - "grad_norm": 0.4831818640232086, + "grad_norm": 0.4525868892669678, "learning_rate": 1.5889213232644917e-08, - "loss": 0.8762, + "loss": 0.8757, "num_input_tokens_seen": 168755200, "step": 20600 }, { "epoch": 0.9896758703481393, - "grad_norm": 0.5846928954124451, + "grad_norm": 0.5697076320648193, "learning_rate": 1.4508358412032575e-08, - "loss": 1.2618, + "loss": 1.2612, "num_input_tokens_seen": 168837120, "step": 20610 }, { "epoch": 0.99015606242497, - "grad_norm": 0.4968603253364563, + "grad_norm": 0.49757134914398193, "learning_rate": 1.319024599645835e-08, - "loss": 0.959, + "loss": 0.9593, "num_input_tokens_seen": 168919040, "step": 20620 }, { "epoch": 0.9906362545018007, - "grad_norm": 0.5029283165931702, + "grad_norm": 0.5038264989852905, "learning_rate": 1.1934879295905089e-08, - "loss": 0.7467, + "loss": 0.7466, "num_input_tokens_seen": 169000960, "step": 20630 }, { "epoch": 0.9911164465786314, - "grad_norm": 0.5078738331794739, + "grad_norm": 0.5087673664093018, "learning_rate": 1.074226146279278e-08, - "loss": 1.0644, + "loss": 1.0652, "num_input_tokens_seen": 169082880, "step": 20640 }, { "epoch": 0.9915966386554622, - "grad_norm": 0.44298240542411804, + "grad_norm": 0.44474107027053833, "learning_rate": 9.612395491970239e-09, - "loss": 1.036, + "loss": 1.0358, "num_input_tokens_seen": 169164800, "step": 20650 }, { "epoch": 0.9920768307322929, - "grad_norm": 0.480905681848526, + "grad_norm": 0.482793927192688, "learning_rate": 8.545284220698446e-09, - "loss": 0.8798, + "loss": 0.8794, "num_input_tokens_seen": 169246720, "step": 20660 }, { "epoch": 0.9925570228091236, - "grad_norm": 0.4742453396320343, + "grad_norm": 0.4741480350494385, "learning_rate": 7.540930328658879e-09, - "loss": 0.848, + "loss": 0.8482, "num_input_tokens_seen": 169328640, "step": 20670 }, { "epoch": 0.9930372148859544, - "grad_norm": 0.4936705529689789, + "grad_norm": 0.4936019480228424, "learning_rate": 6.599336337942408e-09, - "loss": 1.0114, + "loss": 1.0116, "num_input_tokens_seen": 169410560, "step": 20680 }, { "epoch": 0.9935174069627851, - "grad_norm": 0.3984171152114868, + "grad_norm": 0.4134100377559662, "learning_rate": 5.720504613035416e-09, - "loss": 0.7468, + "loss": 0.7456, "num_input_tokens_seen": 169492480, "step": 20690 }, { "epoch": 0.9939975990396158, - "grad_norm": 0.5437182188034058, + "grad_norm": 0.5405466556549072, "learning_rate": 4.904437360814252e-09, - "loss": 0.8409, + "loss": 0.8412, "num_input_tokens_seen": 169574400, "step": 20700 }, { "epoch": 0.9944777911164466, - "grad_norm": 0.4836597144603729, + "grad_norm": 0.48422014713287354, "learning_rate": 4.151136630553554e-09, "loss": 1.024, "num_input_tokens_seen": 169656320, @@ -16578,89 +16578,89 @@ }, { "epoch": 0.9949579831932773, - "grad_norm": 0.4973140358924866, + "grad_norm": 0.49525994062423706, "learning_rate": 3.4606043139068234e-09, - "loss": 1.1108, + "loss": 1.1113, "num_input_tokens_seen": 169738240, "step": 20720 }, { "epoch": 0.995438175270108, - "grad_norm": 0.4446418583393097, + "grad_norm": 0.44537219405174255, "learning_rate": 2.832842144903647e-09, - "loss": 0.9724, + "loss": 0.9721, "num_input_tokens_seen": 169820160, "step": 20730 }, { "epoch": 0.9959183673469387, - "grad_norm": 0.9471971392631531, + "grad_norm": 1.051952838897705, "learning_rate": 2.2678516999552478e-09, - "loss": 0.7679, + "loss": 0.7684, "num_input_tokens_seen": 169902080, "step": 20740 }, { "epoch": 0.9963985594237695, - "grad_norm": 0.5114956498146057, + "grad_norm": 0.5115790963172913, "learning_rate": 1.7656343978378342e-09, - "loss": 0.9723, + "loss": 0.9727, "num_input_tokens_seen": 169984000, "step": 20750 }, { "epoch": 0.9968787515006002, - "grad_norm": 0.4692475497722626, + "grad_norm": 0.46845543384552, "learning_rate": 1.3261914996953728e-09, - "loss": 1.1212, + "loss": 1.121, "num_input_tokens_seen": 170065920, "step": 20760 }, { "epoch": 0.9973589435774309, - "grad_norm": 0.49446624517440796, + "grad_norm": 0.4868726432323456, "learning_rate": 9.49524109034039e-10, - "loss": 0.9877, + "loss": 0.9879, "num_input_tokens_seen": 170147840, "step": 20770 }, { "epoch": 0.9978391356542617, - "grad_norm": 0.47214624285697937, + "grad_norm": 0.47099846601486206, "learning_rate": 6.356331717305431e-10, - "loss": 0.8818, + "loss": 0.8828, "num_input_tokens_seen": 170229760, "step": 20780 }, { "epoch": 0.9983193277310924, - "grad_norm": 0.5106399655342102, + "grad_norm": 0.5169798135757446, "learning_rate": 3.8451947600437466e-10, - "loss": 0.9168, + "loss": 0.9171, "num_input_tokens_seen": 170311680, "step": 20790 }, { "epoch": 0.9987995198079231, - "grad_norm": 0.4835795760154724, + "grad_norm": 0.48473504185676575, "learning_rate": 1.9618365244833404e-10, - "loss": 0.8026, + "loss": 0.803, "num_input_tokens_seen": 170393600, "step": 20800 }, { "epoch": 0.999279711884754, - "grad_norm": 0.5011347532272339, + "grad_norm": 0.5023286938667297, "learning_rate": 7.062617399800075e-11, - "loss": 0.9025, + "loss": 0.9032, "num_input_tokens_seen": 170475520, "step": 20810 }, { "epoch": 0.9997599039615847, - "grad_norm": 0.48915189504623413, + "grad_norm": 0.4894420802593231, "learning_rate": 7.847355951162705e-12, - "loss": 0.9373, + "loss": 0.9376, "num_input_tokens_seen": 170557440, "step": 20820 }, @@ -16669,10 +16669,10 @@ "num_input_tokens_seen": 170598400, "step": 20825, "total_flos": 7.767833833163981e+18, - "train_loss": 0.9246458945440359, - "train_runtime": 106837.984, - "train_samples_per_second": 0.195, - "train_steps_per_second": 0.195 + "train_loss": 0.2516314516834566, + "train_runtime": 26805.2364, + "train_samples_per_second": 0.777, + "train_steps_per_second": 0.777 } ], "logging_steps": 10,