diff --git "a/Luminia-8B-RP/trainer_state.json" "b/Luminia-8B-RP/trainer_state.json" --- "a/Luminia-8B-RP/trainer_state.json" +++ "b/Luminia-8B-RP/trainer_state.json" @@ -3,7009 +3,16681 @@ "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, - "global_step": 8739, + "global_step": 20825, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { - "epoch": 0.0011442956860052637, - "grad_norm": 0.6716632843017578, + "epoch": 0.0004801920768307323, + "grad_norm": 0.7035335898399353, "learning_rate": 5.000000000000001e-07, - "loss": 1.5075, + "loss": 1.3448, "num_input_tokens_seen": 81920, "step": 10 }, { - "epoch": 0.0022885913720105274, - "grad_norm": 0.8028005957603455, + "epoch": 0.0009603841536614646, + "grad_norm": 0.9122879505157471, "learning_rate": 1.0000000000000002e-06, - "loss": 1.379, + "loss": 1.3391, "num_input_tokens_seen": 163840, "step": 20 }, { - "epoch": 0.0034328870580157913, - "grad_norm": 0.6309388279914856, + "epoch": 0.0014405762304921968, + "grad_norm": 0.8231492638587952, "learning_rate": 1.5e-06, - "loss": 1.6386, + "loss": 1.3258, "num_input_tokens_seen": 245760, "step": 30 }, { - "epoch": 0.004577182744021055, - "grad_norm": 2.590484380722046, + "epoch": 0.0019207683073229293, + "grad_norm": 0.928112804889679, "learning_rate": 2.0000000000000003e-06, - "loss": 1.5525, + "loss": 1.4027, "num_input_tokens_seen": 327680, "step": 40 }, { - "epoch": 0.005721478430026319, - "grad_norm": 0.8970260620117188, + "epoch": 0.0024009603841536613, + "grad_norm": 0.6168057918548584, "learning_rate": 2.5e-06, - "loss": 1.651, + "loss": 1.1825, "num_input_tokens_seen": 409600, "step": 50 }, { - "epoch": 0.006865774116031583, - "grad_norm": 0.7371882796287537, + "epoch": 0.0028811524609843936, + "grad_norm": 0.9915170073509216, "learning_rate": 3e-06, - "loss": 1.5067, + "loss": 1.0617, "num_input_tokens_seen": 491520, "step": 60 }, { - "epoch": 0.008010069802036847, - "grad_norm": 2.1576790809631348, + "epoch": 0.0033613445378151263, + "grad_norm": 1.3197886943817139, "learning_rate": 3.5000000000000004e-06, - "loss": 1.3367, + "loss": 1.2087, "num_input_tokens_seen": 573440, "step": 70 }, { - "epoch": 0.00915436548804211, - "grad_norm": 0.8213880658149719, + "epoch": 0.0038415366146458585, + "grad_norm": 0.9965929985046387, "learning_rate": 4.000000000000001e-06, - "loss": 1.6071, + "loss": 1.2009, "num_input_tokens_seen": 655360, "step": 80 }, { - "epoch": 0.010298661174047374, - "grad_norm": 1.3453713655471802, + "epoch": 0.004321728691476591, + "grad_norm": 1.3831547498703003, "learning_rate": 4.5e-06, - "loss": 1.522, + "loss": 1.2692, "num_input_tokens_seen": 737280, "step": 90 }, { - "epoch": 0.011442956860052637, - "grad_norm": 1.1660873889923096, + "epoch": 0.004801920768307323, + "grad_norm": 1.1937536001205444, "learning_rate": 5e-06, - "loss": 1.5853, + "loss": 1.187, "num_input_tokens_seen": 819200, "step": 100 }, { - "epoch": 0.012587252546057902, - "grad_norm": 1.4795037508010864, + "epoch": 0.005282112845138055, + "grad_norm": 1.3311412334442139, "learning_rate": 5.500000000000001e-06, - "loss": 1.4081, + "loss": 1.21, "num_input_tokens_seen": 901120, "step": 110 }, { - "epoch": 0.013731548232063165, - "grad_norm": 2.582639455795288, + "epoch": 0.005762304921968787, + "grad_norm": 1.3286609649658203, "learning_rate": 6e-06, - "loss": 1.5405, + "loss": 1.4533, "num_input_tokens_seen": 983040, "step": 120 }, { - "epoch": 0.014875843918068428, - "grad_norm": 1.1922495365142822, + "epoch": 0.00624249699879952, + "grad_norm": 1.8026959896087646, "learning_rate": 6.5000000000000004e-06, - "loss": 1.3632, + "loss": 1.3365, "num_input_tokens_seen": 1064960, "step": 130 }, { - "epoch": 0.016020139604073693, - "grad_norm": 1.249202847480774, + "epoch": 0.0067226890756302525, + "grad_norm": 1.4594871997833252, "learning_rate": 7.000000000000001e-06, - "loss": 1.128, + "loss": 1.0839, "num_input_tokens_seen": 1146880, "step": 140 }, { - "epoch": 0.017164435290078956, - "grad_norm": 2.9147748947143555, + "epoch": 0.007202881152460984, + "grad_norm": 1.2625036239624023, "learning_rate": 7.5e-06, - "loss": 1.3543, + "loss": 1.0646, "num_input_tokens_seen": 1228800, "step": 150 }, { - "epoch": 0.01830873097608422, - "grad_norm": 1.3228706121444702, + "epoch": 0.007683073229291717, + "grad_norm": 1.7347521781921387, "learning_rate": 8.000000000000001e-06, - "loss": 1.3541, + "loss": 1.366, "num_input_tokens_seen": 1310720, "step": 160 }, { - "epoch": 0.019453026662089482, - "grad_norm": 1.2447155714035034, + "epoch": 0.00816326530612245, + "grad_norm": 1.4462337493896484, "learning_rate": 8.500000000000002e-06, - "loss": 1.4476, + "loss": 1.2509, "num_input_tokens_seen": 1392640, "step": 170 }, { - "epoch": 0.02059732234809475, - "grad_norm": 1.2924906015396118, + "epoch": 0.008643457382953182, + "grad_norm": 1.3763086795806885, "learning_rate": 9e-06, - "loss": 1.212, + "loss": 1.0235, "num_input_tokens_seen": 1474560, "step": 180 }, { - "epoch": 0.021741618034100012, - "grad_norm": 1.352099061012268, + "epoch": 0.009123649459783913, + "grad_norm": 1.7035270929336548, "learning_rate": 9.5e-06, - "loss": 1.3312, + "loss": 0.9992, "num_input_tokens_seen": 1556480, "step": 190 }, { - "epoch": 0.022885913720105275, - "grad_norm": 2.618453025817871, + "epoch": 0.009603841536614645, + "grad_norm": 1.4041584730148315, "learning_rate": 1e-05, - "loss": 1.3266, + "loss": 0.8424, "num_input_tokens_seen": 1638400, "step": 200 }, { - "epoch": 0.024030209406110538, - "grad_norm": 1.2627308368682861, + "epoch": 0.010084033613445379, + "grad_norm": 1.4125291109085083, "learning_rate": 1.05e-05, - "loss": 1.3106, + "loss": 1.0181, "num_input_tokens_seen": 1720320, "step": 210 }, { - "epoch": 0.025174505092115804, - "grad_norm": 0.9320095181465149, + "epoch": 0.01056422569027611, + "grad_norm": 1.3839924335479736, "learning_rate": 1.1000000000000001e-05, - "loss": 1.4398, + "loss": 0.8846, "num_input_tokens_seen": 1802240, "step": 220 }, { - "epoch": 0.026318800778121067, - "grad_norm": 1.7634745836257935, + "epoch": 0.011044417767106842, + "grad_norm": 1.4403977394104004, "learning_rate": 1.1500000000000002e-05, - "loss": 1.1946, + "loss": 1.1465, "num_input_tokens_seen": 1884160, "step": 230 }, { - "epoch": 0.02746309646412633, - "grad_norm": 1.1597224473953247, + "epoch": 0.011524609843937574, + "grad_norm": 1.1681145429611206, "learning_rate": 1.2e-05, - "loss": 1.2912, + "loss": 1.0085, "num_input_tokens_seen": 1966080, "step": 240 }, { - "epoch": 0.028607392150131594, - "grad_norm": 1.1823091506958008, + "epoch": 0.012004801920768308, + "grad_norm": 2.7823386192321777, "learning_rate": 1.25e-05, - "loss": 1.5374, + "loss": 1.0097, "num_input_tokens_seen": 2048000, "step": 250 }, { - "epoch": 0.029751687836136857, - "grad_norm": 0.88273024559021, + "epoch": 0.01248499399759904, + "grad_norm": 1.2464739084243774, "learning_rate": 1.3000000000000001e-05, - "loss": 1.4559, + "loss": 0.968, "num_input_tokens_seen": 2129920, "step": 260 }, { - "epoch": 0.030895983522142123, - "grad_norm": 1.2335752248764038, + "epoch": 0.012965186074429771, + "grad_norm": 2.1030478477478027, "learning_rate": 1.3500000000000001e-05, - "loss": 1.4361, + "loss": 1.1094, "num_input_tokens_seen": 2211840, "step": 270 }, { - "epoch": 0.032040279208147386, - "grad_norm": 0.8680986166000366, + "epoch": 0.013445378151260505, + "grad_norm": 1.006523609161377, "learning_rate": 1.4000000000000001e-05, - "loss": 1.7506, + "loss": 1.0791, "num_input_tokens_seen": 2293760, "step": 280 }, { - "epoch": 0.03318457489415265, - "grad_norm": 0.8861328363418579, + "epoch": 0.013925570228091237, + "grad_norm": 2.395143508911133, "learning_rate": 1.45e-05, - "loss": 1.5946, + "loss": 1.0844, "num_input_tokens_seen": 2375680, "step": 290 }, { - "epoch": 0.03432887058015791, - "grad_norm": 0.8510925769805908, + "epoch": 0.014405762304921969, + "grad_norm": 1.1959385871887207, "learning_rate": 1.5e-05, - "loss": 1.386, + "loss": 1.0286, "num_input_tokens_seen": 2457600, "step": 300 }, { - "epoch": 0.035473166266163175, - "grad_norm": 0.961478054523468, + "epoch": 0.0148859543817527, + "grad_norm": 2.8415608406066895, "learning_rate": 1.55e-05, - "loss": 1.1091, + "loss": 1.1747, "num_input_tokens_seen": 2539520, "step": 310 }, { - "epoch": 0.03661746195216844, - "grad_norm": 0.8009458780288696, + "epoch": 0.015366146458583434, + "grad_norm": 1.2111806869506836, "learning_rate": 1.6000000000000003e-05, - "loss": 1.3287, + "loss": 1.1556, "num_input_tokens_seen": 2621440, "step": 320 }, { - "epoch": 0.0377617576381737, - "grad_norm": 1.1145437955856323, + "epoch": 0.015846338535414166, + "grad_norm": 1.1664150953292847, "learning_rate": 1.65e-05, - "loss": 1.2537, + "loss": 1.0936, "num_input_tokens_seen": 2703360, "step": 330 }, { - "epoch": 0.038906053324178964, - "grad_norm": 0.7063050270080566, + "epoch": 0.0163265306122449, + "grad_norm": 1.121214747428894, "learning_rate": 1.7000000000000003e-05, - "loss": 1.4166, + "loss": 1.0638, "num_input_tokens_seen": 2785280, "step": 340 }, { - "epoch": 0.040050349010184234, - "grad_norm": 0.8494574427604675, + "epoch": 0.01680672268907563, + "grad_norm": 1.1795384883880615, "learning_rate": 1.75e-05, - "loss": 1.298, + "loss": 1.171, "num_input_tokens_seen": 2867200, "step": 350 }, { - "epoch": 0.0411946446961895, - "grad_norm": 0.8032189011573792, + "epoch": 0.017286914765906363, + "grad_norm": 1.1597260236740112, "learning_rate": 1.8e-05, - "loss": 1.4127, + "loss": 0.9427, "num_input_tokens_seen": 2949120, "step": 360 }, { - "epoch": 0.04233894038219476, - "grad_norm": 0.6479071974754333, + "epoch": 0.017767106842737093, + "grad_norm": 0.9535462856292725, "learning_rate": 1.85e-05, - "loss": 1.1462, + "loss": 0.9715, "num_input_tokens_seen": 3031040, "step": 370 }, { - "epoch": 0.043483236068200024, - "grad_norm": 1.165413737297058, + "epoch": 0.018247298919567827, + "grad_norm": 0.946639895439148, "learning_rate": 1.9e-05, - "loss": 1.5957, + "loss": 1.0749, "num_input_tokens_seen": 3112960, "step": 380 }, { - "epoch": 0.04462753175420529, - "grad_norm": 0.8543304800987244, + "epoch": 0.01872749099639856, + "grad_norm": 0.9363052845001221, "learning_rate": 1.9500000000000003e-05, - "loss": 1.3672, + "loss": 0.9876, "num_input_tokens_seen": 3194880, "step": 390 }, { - "epoch": 0.04577182744021055, - "grad_norm": 0.8080208897590637, + "epoch": 0.01920768307322929, + "grad_norm": 1.0535475015640259, "learning_rate": 2e-05, - "loss": 1.2616, + "loss": 1.105, "num_input_tokens_seen": 3276800, "step": 400 }, { - "epoch": 0.04691612312621581, - "grad_norm": 0.810095489025116, + "epoch": 0.019687875150060024, + "grad_norm": 2.634197950363159, "learning_rate": 2.05e-05, - "loss": 1.1038, + "loss": 1.1199, "num_input_tokens_seen": 3358720, "step": 410 }, { - "epoch": 0.048060418812221076, - "grad_norm": 0.6553860902786255, + "epoch": 0.020168067226890758, + "grad_norm": 1.0603679418563843, "learning_rate": 2.1e-05, - "loss": 1.3898, + "loss": 1.0458, "num_input_tokens_seen": 3440640, "step": 420 }, { - "epoch": 0.04920471449822634, - "grad_norm": 3.40867018699646, + "epoch": 0.020648259303721488, + "grad_norm": 1.0156742334365845, "learning_rate": 2.15e-05, - "loss": 1.2665, + "loss": 1.023, "num_input_tokens_seen": 3522560, "step": 430 }, { - "epoch": 0.05034901018423161, - "grad_norm": 0.6455990076065063, + "epoch": 0.02112845138055222, + "grad_norm": 0.9424085021018982, "learning_rate": 2.2000000000000003e-05, - "loss": 1.188, + "loss": 1.1381, "num_input_tokens_seen": 3604480, "step": 440 }, { - "epoch": 0.05149330587023687, - "grad_norm": 0.5759983658790588, + "epoch": 0.021608643457382955, + "grad_norm": 0.843125581741333, "learning_rate": 2.25e-05, - "loss": 1.4607, + "loss": 0.9443, "num_input_tokens_seen": 3686400, "step": 450 }, { - "epoch": 0.052637601556242135, - "grad_norm": 1.409822940826416, + "epoch": 0.022088835534213685, + "grad_norm": 0.7700474262237549, "learning_rate": 2.3000000000000003e-05, - "loss": 1.1622, + "loss": 0.9857, "num_input_tokens_seen": 3768320, "step": 460 }, { - "epoch": 0.0537818972422474, - "grad_norm": 0.6812942028045654, + "epoch": 0.02256902761104442, + "grad_norm": 0.8095147013664246, "learning_rate": 2.35e-05, - "loss": 1.3681, + "loss": 1.2423, "num_input_tokens_seen": 3850240, "step": 470 }, { - "epoch": 0.05492619292825266, - "grad_norm": 1.1562724113464355, + "epoch": 0.02304921968787515, + "grad_norm": 0.7245414853096008, "learning_rate": 2.4e-05, - "loss": 1.341, + "loss": 1.0458, "num_input_tokens_seen": 3932160, "step": 480 }, { - "epoch": 0.056070488614257924, - "grad_norm": 1.1834927797317505, + "epoch": 0.023529411764705882, + "grad_norm": 0.7768824100494385, "learning_rate": 2.45e-05, - "loss": 1.0104, + "loss": 1.0759, "num_input_tokens_seen": 4014080, "step": 490 }, { - "epoch": 0.05721478430026319, - "grad_norm": 0.6955065727233887, + "epoch": 0.024009603841536616, + "grad_norm": 0.6364564895629883, "learning_rate": 2.5e-05, - "loss": 1.4473, + "loss": 1.0799, "num_input_tokens_seen": 4096000, "step": 500 }, { - "epoch": 0.05835907998626845, - "grad_norm": 5.183798313140869, + "epoch": 0.024489795918367346, + "grad_norm": 0.7478921413421631, "learning_rate": 2.5500000000000003e-05, - "loss": 1.5255, + "loss": 1.0056, "num_input_tokens_seen": 4177920, "step": 510 }, { - "epoch": 0.05950337567227371, - "grad_norm": 0.6789785027503967, + "epoch": 0.02496998799519808, + "grad_norm": 0.5253103375434875, "learning_rate": 2.6000000000000002e-05, - "loss": 1.3778, + "loss": 0.9158, "num_input_tokens_seen": 4259840, "step": 520 }, { - "epoch": 0.060647671358278976, - "grad_norm": 0.5913729667663574, + "epoch": 0.025450180072028813, + "grad_norm": 0.7591320872306824, "learning_rate": 2.6500000000000004e-05, - "loss": 1.1477, + "loss": 1.0108, "num_input_tokens_seen": 4341760, "step": 530 }, { - "epoch": 0.061791967044284246, - "grad_norm": 1.2051905393600464, + "epoch": 0.025930372148859543, + "grad_norm": 0.7796912789344788, "learning_rate": 2.7000000000000002e-05, - "loss": 1.4827, + "loss": 0.9604, "num_input_tokens_seen": 4423680, "step": 540 }, { - "epoch": 0.0629362627302895, - "grad_norm": 0.5383505821228027, + "epoch": 0.026410564225690276, + "grad_norm": 0.7369773983955383, "learning_rate": 2.7500000000000004e-05, - "loss": 1.635, + "loss": 1.0828, "num_input_tokens_seen": 4505600, "step": 550 }, { - "epoch": 0.06408055841629477, - "grad_norm": 0.6724018454551697, + "epoch": 0.02689075630252101, + "grad_norm": 0.7800397276878357, "learning_rate": 2.8000000000000003e-05, - "loss": 1.2354, + "loss": 1.2004, "num_input_tokens_seen": 4587520, "step": 560 }, { - "epoch": 0.06522485410230003, - "grad_norm": 0.8346174359321594, + "epoch": 0.02737094837935174, + "grad_norm": 1.0041433572769165, "learning_rate": 2.8499999999999998e-05, - "loss": 1.3294, + "loss": 1.0362, "num_input_tokens_seen": 4669440, "step": 570 }, { - "epoch": 0.0663691497883053, - "grad_norm": 0.624640703201294, + "epoch": 0.027851140456182474, + "grad_norm": 0.7719876170158386, "learning_rate": 2.9e-05, - "loss": 1.3841, + "loss": 1.1366, "num_input_tokens_seen": 4751360, "step": 580 }, { - "epoch": 0.06751344547431057, - "grad_norm": 0.5437624454498291, + "epoch": 0.028331332533013204, + "grad_norm": 0.7241286039352417, "learning_rate": 2.95e-05, - "loss": 1.3379, + "loss": 1.1366, "num_input_tokens_seen": 4833280, "step": 590 }, { - "epoch": 0.06865774116031582, - "grad_norm": 0.5260199308395386, + "epoch": 0.028811524609843937, + "grad_norm": 1.1306092739105225, "learning_rate": 3e-05, - "loss": 1.187, + "loss": 0.8576, "num_input_tokens_seen": 4915200, "step": 600 }, { - "epoch": 0.0698020368463211, - "grad_norm": 0.6440827250480652, + "epoch": 0.02929171668667467, + "grad_norm": 0.6735727190971375, "learning_rate": 3.05e-05, - "loss": 1.5078, + "loss": 1.2392, "num_input_tokens_seen": 4997120, "step": 610 }, { - "epoch": 0.07094633253232635, - "grad_norm": 0.8612675666809082, + "epoch": 0.0297719087635054, + "grad_norm": 1.3449808359146118, "learning_rate": 3.1e-05, - "loss": 1.4624, + "loss": 0.9339, "num_input_tokens_seen": 5079040, "step": 620 }, { - "epoch": 0.07209062821833162, - "grad_norm": 0.5070405602455139, + "epoch": 0.030252100840336135, + "grad_norm": 0.7179974317550659, "learning_rate": 3.15e-05, - "loss": 1.2244, + "loss": 0.9704, "num_input_tokens_seen": 5160960, "step": 630 }, { - "epoch": 0.07323492390433688, - "grad_norm": 0.6587413549423218, + "epoch": 0.030732292917166868, + "grad_norm": 0.7083514928817749, "learning_rate": 3.2000000000000005e-05, - "loss": 1.4688, + "loss": 0.9718, "num_input_tokens_seen": 5242880, "step": 640 }, { - "epoch": 0.07437921959034215, - "grad_norm": 0.6212195754051208, + "epoch": 0.031212484993997598, + "grad_norm": 1.1827001571655273, "learning_rate": 3.2500000000000004e-05, - "loss": 1.1677, + "loss": 0.9997, "num_input_tokens_seen": 5324800, "step": 650 }, { - "epoch": 0.0755235152763474, - "grad_norm": 1.031207799911499, + "epoch": 0.03169267707082833, + "grad_norm": 0.553023099899292, "learning_rate": 3.3e-05, - "loss": 1.2562, + "loss": 0.8748, "num_input_tokens_seen": 5406720, "step": 660 }, { - "epoch": 0.07666781096235267, - "grad_norm": 0.5637765526771545, + "epoch": 0.032172869147659065, + "grad_norm": 0.7951409816741943, "learning_rate": 3.35e-05, - "loss": 1.3111, + "loss": 0.8658, "num_input_tokens_seen": 5488640, "step": 670 }, { - "epoch": 0.07781210664835793, - "grad_norm": 1.6246665716171265, + "epoch": 0.0326530612244898, + "grad_norm": 0.7278563976287842, "learning_rate": 3.4000000000000007e-05, - "loss": 1.2953, + "loss": 0.8799, "num_input_tokens_seen": 5570560, "step": 680 }, { - "epoch": 0.0789564023343632, - "grad_norm": 0.5538053512573242, + "epoch": 0.033133253301320525, + "grad_norm": 0.4969690442085266, "learning_rate": 3.45e-05, - "loss": 1.2493, + "loss": 1.0723, "num_input_tokens_seen": 5652480, "step": 690 }, { - "epoch": 0.08010069802036847, - "grad_norm": 0.566648542881012, + "epoch": 0.03361344537815126, + "grad_norm": 0.621137797832489, "learning_rate": 3.5e-05, - "loss": 1.3966, + "loss": 1.0756, "num_input_tokens_seen": 5734400, "step": 700 }, { - "epoch": 0.08124499370637372, - "grad_norm": 0.5915348529815674, + "epoch": 0.03409363745498199, + "grad_norm": 0.6362459063529968, "learning_rate": 3.55e-05, - "loss": 1.1898, + "loss": 0.967, "num_input_tokens_seen": 5816320, "step": 710 }, { - "epoch": 0.082389289392379, - "grad_norm": 0.5097474455833435, + "epoch": 0.034573829531812726, + "grad_norm": 0.7109155654907227, "learning_rate": 3.6e-05, - "loss": 1.3558, + "loss": 0.9767, "num_input_tokens_seen": 5898240, "step": 720 }, { - "epoch": 0.08353358507838425, - "grad_norm": 0.6351694464683533, + "epoch": 0.03505402160864346, + "grad_norm": 0.6260324120521545, "learning_rate": 3.65e-05, - "loss": 1.6577, + "loss": 0.9352, "num_input_tokens_seen": 5980160, "step": 730 }, { - "epoch": 0.08467788076438952, - "grad_norm": 0.5961860418319702, + "epoch": 0.035534213685474186, + "grad_norm": 0.6420788168907166, "learning_rate": 3.7e-05, - "loss": 1.2571, + "loss": 0.9214, "num_input_tokens_seen": 6062080, "step": 740 }, { - "epoch": 0.08582217645039478, - "grad_norm": 0.7016868591308594, + "epoch": 0.03601440576230492, + "grad_norm": 0.6354939937591553, "learning_rate": 3.7500000000000003e-05, - "loss": 1.3735, + "loss": 1.0334, "num_input_tokens_seen": 6144000, "step": 750 }, { - "epoch": 0.08696647213640005, - "grad_norm": 0.5575875043869019, + "epoch": 0.036494597839135653, + "grad_norm": 0.5470936298370361, "learning_rate": 3.8e-05, - "loss": 1.2731, + "loss": 1.0507, "num_input_tokens_seen": 6225920, "step": 760 }, { - "epoch": 0.0881107678224053, - "grad_norm": 1.047059416770935, + "epoch": 0.03697478991596639, + "grad_norm": 0.8862899541854858, "learning_rate": 3.85e-05, - "loss": 1.543, + "loss": 1.0, "num_input_tokens_seen": 6307840, "step": 770 }, { - "epoch": 0.08925506350841057, - "grad_norm": 0.8251460790634155, + "epoch": 0.03745498199279712, + "grad_norm": 0.6771734952926636, "learning_rate": 3.9000000000000006e-05, - "loss": 1.1869, + "loss": 1.0441, "num_input_tokens_seen": 6389760, "step": 780 }, { - "epoch": 0.09039935919441584, - "grad_norm": 0.6270453333854675, + "epoch": 0.037935174069627854, + "grad_norm": 0.6661419868469238, "learning_rate": 3.9500000000000005e-05, - "loss": 1.6793, + "loss": 1.0698, "num_input_tokens_seen": 6471680, "step": 790 }, { - "epoch": 0.0915436548804211, - "grad_norm": 0.6033393740653992, + "epoch": 0.03841536614645858, + "grad_norm": 0.6281525492668152, "learning_rate": 4e-05, - "loss": 1.7236, + "loss": 0.9719, "num_input_tokens_seen": 6553600, "step": 800 }, { - "epoch": 0.09268795056642637, - "grad_norm": 1.2078917026519775, + "epoch": 0.038895558223289314, + "grad_norm": 0.8006066083908081, "learning_rate": 4.05e-05, - "loss": 1.1466, + "loss": 1.1387, "num_input_tokens_seen": 6635520, "step": 810 }, { - "epoch": 0.09383224625243163, - "grad_norm": 0.6712398529052734, + "epoch": 0.03937575030012005, + "grad_norm": 0.6581395268440247, "learning_rate": 4.1e-05, - "loss": 1.308, + "loss": 1.0304, "num_input_tokens_seen": 6717440, "step": 820 }, { - "epoch": 0.0949765419384369, - "grad_norm": 0.5450591444969177, + "epoch": 0.03985594237695078, + "grad_norm": 0.5860041379928589, "learning_rate": 4.15e-05, - "loss": 1.1448, + "loss": 0.9735, "num_input_tokens_seen": 6799360, "step": 830 }, { - "epoch": 0.09612083762444215, - "grad_norm": 0.6224645376205444, + "epoch": 0.040336134453781515, + "grad_norm": 0.6659871339797974, "learning_rate": 4.2e-05, - "loss": 1.1618, + "loss": 0.9434, "num_input_tokens_seen": 6881280, "step": 840 }, { - "epoch": 0.09726513331044742, - "grad_norm": 0.8741360306739807, + "epoch": 0.04081632653061224, + "grad_norm": 0.6158783435821533, "learning_rate": 4.25e-05, - "loss": 1.3667, + "loss": 0.9919, "num_input_tokens_seen": 6963200, "step": 850 }, { - "epoch": 0.09840942899645268, - "grad_norm": 0.6075563430786133, + "epoch": 0.041296518607442975, + "grad_norm": 1.7883119583129883, "learning_rate": 4.3e-05, - "loss": 1.3752, + "loss": 0.9973, "num_input_tokens_seen": 7045120, "step": 860 }, { - "epoch": 0.09955372468245795, - "grad_norm": 0.6345754265785217, + "epoch": 0.04177671068427371, + "grad_norm": 0.6660032272338867, "learning_rate": 4.35e-05, - "loss": 1.3973, + "loss": 1.0275, "num_input_tokens_seen": 7127040, "step": 870 }, { - "epoch": 0.10069802036846322, - "grad_norm": 0.7039983868598938, + "epoch": 0.04225690276110444, + "grad_norm": 0.7275106906890869, "learning_rate": 4.4000000000000006e-05, - "loss": 1.3936, + "loss": 0.8301, "num_input_tokens_seen": 7208960, "step": 880 }, { - "epoch": 0.10184231605446847, - "grad_norm": 0.6462425589561462, + "epoch": 0.042737094837935176, + "grad_norm": 0.7131916880607605, "learning_rate": 4.4500000000000004e-05, - "loss": 1.2224, + "loss": 1.3488, "num_input_tokens_seen": 7290880, "step": 890 }, { - "epoch": 0.10298661174047374, - "grad_norm": 0.6138895153999329, + "epoch": 0.04321728691476591, + "grad_norm": 0.6134268045425415, "learning_rate": 4.5e-05, - "loss": 1.0594, + "loss": 1.1191, "num_input_tokens_seen": 7372800, "step": 900 }, { - "epoch": 0.104130907426479, - "grad_norm": 0.5026350021362305, + "epoch": 0.043697478991596636, + "grad_norm": 0.6196214556694031, "learning_rate": 4.55e-05, - "loss": 1.122, + "loss": 1.179, "num_input_tokens_seen": 7454720, "step": 910 }, { - "epoch": 0.10527520311248427, - "grad_norm": 0.6174338459968567, + "epoch": 0.04417767106842737, + "grad_norm": 0.626262903213501, "learning_rate": 4.600000000000001e-05, - "loss": 1.2711, + "loss": 1.0239, "num_input_tokens_seen": 7536640, "step": 920 }, { - "epoch": 0.10641949879848953, - "grad_norm": 0.736929178237915, + "epoch": 0.0446578631452581, + "grad_norm": 0.7423630952835083, "learning_rate": 4.6500000000000005e-05, - "loss": 1.3302, + "loss": 1.0599, "num_input_tokens_seen": 7618560, "step": 930 }, { - "epoch": 0.1075637944844948, - "grad_norm": 0.6089776158332825, + "epoch": 0.04513805522208884, + "grad_norm": 0.6767206788063049, "learning_rate": 4.7e-05, - "loss": 1.2869, + "loss": 0.9898, "num_input_tokens_seen": 7700480, "step": 940 }, { - "epoch": 0.10870809017050005, - "grad_norm": 0.7438560724258423, + "epoch": 0.04561824729891957, + "grad_norm": 0.8346232175827026, "learning_rate": 4.75e-05, - "loss": 1.2435, + "loss": 0.9133, "num_input_tokens_seen": 7782400, "step": 950 }, { - "epoch": 0.10985238585650532, - "grad_norm": 0.5268089175224304, + "epoch": 0.0460984393757503, + "grad_norm": 0.5462753176689148, "learning_rate": 4.8e-05, - "loss": 1.0464, + "loss": 0.8747, "num_input_tokens_seen": 7864320, "step": 960 }, { - "epoch": 0.11099668154251058, - "grad_norm": 0.6583835482597351, + "epoch": 0.04657863145258103, + "grad_norm": 0.6095471978187561, "learning_rate": 4.85e-05, - "loss": 1.2715, + "loss": 1.0154, "num_input_tokens_seen": 7946240, "step": 970 }, { - "epoch": 0.11214097722851585, - "grad_norm": 1.4557218551635742, + "epoch": 0.047058823529411764, + "grad_norm": 0.555296778678894, "learning_rate": 4.9e-05, - "loss": 1.4746, + "loss": 0.8588, "num_input_tokens_seen": 8028160, "step": 980 }, { - "epoch": 0.11328527291452112, - "grad_norm": 1.0798985958099365, + "epoch": 0.0475390156062425, + "grad_norm": 0.6341934204101562, "learning_rate": 4.9500000000000004e-05, - "loss": 1.1464, + "loss": 0.8407, "num_input_tokens_seen": 8110080, "step": 990 }, { - "epoch": 0.11442956860052637, - "grad_norm": 0.5683252811431885, + "epoch": 0.04801920768307323, + "grad_norm": 0.6310316324234009, "learning_rate": 5e-05, - "loss": 1.3901, + "loss": 0.8889, "num_input_tokens_seen": 8192000, "step": 1000 }, { - "epoch": 0.11557386428653164, - "grad_norm": 0.6159707903862, - "learning_rate": 4.999979401316311e-05, - "loss": 1.3986, + "epoch": 0.048499399759903965, + "grad_norm": 0.6559818983078003, + "learning_rate": 4.9999968610581127e-05, + "loss": 1.0226, "num_input_tokens_seen": 8273920, "step": 1010 }, { - "epoch": 0.1167181599725369, - "grad_norm": 0.6029934883117676, - "learning_rate": 4.999917605604688e-05, - "loss": 1.3841, + "epoch": 0.04897959183673469, + "grad_norm": 0.6584986448287964, + "learning_rate": 4.9999874442403314e-05, + "loss": 1.0976, "num_input_tokens_seen": 8355840, "step": 1020 }, { - "epoch": 0.11786245565854217, - "grad_norm": 0.6061372756958008, - "learning_rate": 4.999814613883459e-05, - "loss": 1.6034, + "epoch": 0.049459783913565425, + "grad_norm": 0.9989052414894104, + "learning_rate": 4.999971749570305e-05, + "loss": 1.1658, "num_input_tokens_seen": 8437760, "step": 1030 }, { - "epoch": 0.11900675134454743, - "grad_norm": 0.5895953178405762, - "learning_rate": 4.9996704278498185e-05, - "loss": 1.5685, + "epoch": 0.04993997599039616, + "grad_norm": 0.6315760612487793, + "learning_rate": 4.999949777087444e-05, + "loss": 0.9763, "num_input_tokens_seen": 8519680, "step": 1040 }, { - "epoch": 0.1201510470305527, - "grad_norm": 0.6715599894523621, - "learning_rate": 4.9994850498798026e-05, - "loss": 1.4407, + "epoch": 0.05042016806722689, + "grad_norm": 0.6868481636047363, + "learning_rate": 4.999921526846925e-05, + "loss": 0.8407, "num_input_tokens_seen": 8601600, "step": 1050 }, { - "epoch": 0.12129534271655795, - "grad_norm": 0.6397563815116882, - "learning_rate": 4.999258483028243e-05, - "loss": 1.2713, + "epoch": 0.050900360144057626, + "grad_norm": 0.6122430562973022, + "learning_rate": 4.9998869989196885e-05, + "loss": 0.9688, "num_input_tokens_seen": 8683520, "step": 1060 }, { - "epoch": 0.12243963840256322, - "grad_norm": 0.5121895670890808, - "learning_rate": 4.9989907310287243e-05, - "loss": 1.2696, + "epoch": 0.05138055222088835, + "grad_norm": 0.6200319528579712, + "learning_rate": 4.99984619339244e-05, + "loss": 0.9459, "num_input_tokens_seen": 8765440, "step": 1070 }, { - "epoch": 0.12358393408856849, - "grad_norm": 0.533311665058136, - "learning_rate": 4.998681798293516e-05, - "loss": 1.1279, + "epoch": 0.051860744297719086, + "grad_norm": 0.6406499147415161, + "learning_rate": 4.999799110367648e-05, + "loss": 1.0055, "num_input_tokens_seen": 8847360, "step": 1080 }, { - "epoch": 0.12472822977457375, - "grad_norm": 0.4866422414779663, - "learning_rate": 4.998331689913506e-05, - "loss": 1.3104, + "epoch": 0.05234093637454982, + "grad_norm": 0.5609921813011169, + "learning_rate": 4.999745749963545e-05, + "loss": 1.0218, "num_input_tokens_seen": 8929280, "step": 1090 }, { - "epoch": 0.125872525460579, - "grad_norm": 0.6534491181373596, - "learning_rate": 4.9979404116581104e-05, - "loss": 1.2877, + "epoch": 0.05282112845138055, + "grad_norm": 0.6337623596191406, + "learning_rate": 4.999686112314127e-05, + "loss": 0.8797, "num_input_tokens_seen": 9011200, "step": 1100 }, { - "epoch": 0.12701682114658427, - "grad_norm": 0.5954148173332214, - "learning_rate": 4.9975079699751825e-05, - "loss": 1.2768, + "epoch": 0.053301320528211286, + "grad_norm": 0.6383864879608154, + "learning_rate": 4.999620197569155e-05, + "loss": 1.1579, "num_input_tokens_seen": 9093120, "step": 1110 }, { - "epoch": 0.12816111683258954, - "grad_norm": 0.49169713258743286, - "learning_rate": 4.997034371990907e-05, - "loss": 1.5502, + "epoch": 0.05378151260504202, + "grad_norm": 0.575879693031311, + "learning_rate": 4.9995480058941483e-05, + "loss": 0.8893, "num_input_tokens_seen": 9175040, "step": 1120 }, { - "epoch": 0.12930541251859481, - "grad_norm": 0.6432734131813049, - "learning_rate": 4.99651962550968e-05, - "loss": 1.3732, + "epoch": 0.05426170468187275, + "grad_norm": 0.6337655186653137, + "learning_rate": 4.9994695374703934e-05, + "loss": 0.987, "num_input_tokens_seen": 9256960, "step": 1130 }, { - "epoch": 0.13044970820460006, - "grad_norm": 0.6305902600288391, - "learning_rate": 4.9959637390139814e-05, - "loss": 1.1296, + "epoch": 0.05474189675870348, + "grad_norm": 0.6570917367935181, + "learning_rate": 4.9993847924949364e-05, + "loss": 0.8846, "num_input_tokens_seen": 9338880, "step": 1140 }, { - "epoch": 0.13159400389060533, - "grad_norm": 0.5763765573501587, - "learning_rate": 4.995366721664234e-05, - "loss": 1.1064, + "epoch": 0.055222088835534214, + "grad_norm": 0.6110508441925049, + "learning_rate": 4.999293771180584e-05, + "loss": 0.9093, "num_input_tokens_seen": 9420800, "step": 1150 }, { - "epoch": 0.1327382995766106, - "grad_norm": 0.5058463215827942, - "learning_rate": 4.9947285832986553e-05, - "loss": 1.3228, + "epoch": 0.05570228091236495, + "grad_norm": 0.7141037583351135, + "learning_rate": 4.999196473755905e-05, + "loss": 1.0216, "num_input_tokens_seen": 9502720, "step": 1160 }, { - "epoch": 0.13388259526261587, - "grad_norm": 0.6652387380599976, - "learning_rate": 4.994049334433095e-05, - "loss": 1.3161, + "epoch": 0.05618247298919568, + "grad_norm": 0.5640223026275635, + "learning_rate": 4.9990929004652287e-05, + "loss": 1.0765, "num_input_tokens_seen": 9584640, "step": 1170 }, { - "epoch": 0.13502689094862114, - "grad_norm": 0.5453552007675171, - "learning_rate": 4.9933289862608584e-05, - "loss": 1.4789, + "epoch": 0.05666266506602641, + "grad_norm": 0.4741804301738739, + "learning_rate": 4.9989830515686434e-05, + "loss": 0.9421, "num_input_tokens_seen": 9666560, "step": 1180 }, { - "epoch": 0.13617118663462638, - "grad_norm": 0.5469970107078552, - "learning_rate": 4.992567550652525e-05, - "loss": 1.3033, + "epoch": 0.05714285714285714, + "grad_norm": 0.4780801236629486, + "learning_rate": 4.998866927341995e-05, + "loss": 1.0168, "num_input_tokens_seen": 9748480, "step": 1190 }, { - "epoch": 0.13731548232063165, - "grad_norm": 0.7196422219276428, - "learning_rate": 4.9917650401557505e-05, - "loss": 1.1661, + "epoch": 0.057623049219687875, + "grad_norm": 0.6580250859260559, + "learning_rate": 4.9987445280768916e-05, + "loss": 0.9398, "num_input_tokens_seen": 9830400, "step": 1200 }, { - "epoch": 0.13845977800663692, - "grad_norm": 0.7473416328430176, - "learning_rate": 4.990921467995064e-05, - "loss": 1.3148, + "epoch": 0.05810324129651861, + "grad_norm": 0.571909487247467, + "learning_rate": 4.998615854080695e-05, + "loss": 1.0907, "num_input_tokens_seen": 9912320, "step": 1210 }, { - "epoch": 0.1396040736926422, - "grad_norm": 0.6073607206344604, - "learning_rate": 4.9900368480716466e-05, - "loss": 1.186, + "epoch": 0.05858343337334934, + "grad_norm": 0.6233389973640442, + "learning_rate": 4.998480905676527e-05, + "loss": 0.9021, "num_input_tokens_seen": 9994240, "step": 1220 }, { - "epoch": 0.14074836937864743, - "grad_norm": 0.49970775842666626, - "learning_rate": 4.9891111949631023e-05, - "loss": 1.0854, + "epoch": 0.059063625450180075, + "grad_norm": 1.2051422595977783, + "learning_rate": 4.998339683203261e-05, + "loss": 1.2031, "num_input_tokens_seen": 10076160, "step": 1230 }, { - "epoch": 0.1418926650646527, - "grad_norm": 0.5494070053100586, - "learning_rate": 4.988144523923221e-05, - "loss": 1.2775, + "epoch": 0.0595438175270108, + "grad_norm": 0.6003074645996094, + "learning_rate": 4.9981921870155314e-05, + "loss": 1.0543, "num_input_tokens_seen": 10158080, "step": 1240 }, { - "epoch": 0.14303696075065797, - "grad_norm": 0.49916988611221313, - "learning_rate": 4.987136850881721e-05, - "loss": 1.3006, + "epoch": 0.060024009603841535, + "grad_norm": 0.5767567753791809, + "learning_rate": 4.998038417483721e-05, + "loss": 1.0474, "num_input_tokens_seen": 10240000, "step": 1250 }, { - "epoch": 0.14418125643666324, - "grad_norm": 0.46677911281585693, - "learning_rate": 4.986088192443995e-05, - "loss": 1.0796, + "epoch": 0.06050420168067227, + "grad_norm": 0.6394206285476685, + "learning_rate": 4.9978783749939705e-05, + "loss": 1.0161, "num_input_tokens_seen": 10321920, "step": 1260 }, { - "epoch": 0.1453255521226685, - "grad_norm": 0.520656943321228, - "learning_rate": 4.9849985658908296e-05, - "loss": 1.2881, + "epoch": 0.060984393757503, + "grad_norm": 0.6625099778175354, + "learning_rate": 4.997712059948171e-05, + "loss": 0.9022, "num_input_tokens_seen": 10403840, "step": 1270 }, { - "epoch": 0.14646984780867375, - "grad_norm": 0.5875335335731506, - "learning_rate": 4.9838679891781214e-05, - "loss": 1.2889, + "epoch": 0.061464585834333736, + "grad_norm": 0.6047447323799133, + "learning_rate": 4.997539472763964e-05, + "loss": 0.8682, "num_input_tokens_seen": 10485760, "step": 1280 }, { - "epoch": 0.14761414349467902, - "grad_norm": 0.431538850069046, - "learning_rate": 4.982696480936586e-05, - "loss": 1.0391, + "epoch": 0.06194477791116446, + "grad_norm": 0.6154036521911621, + "learning_rate": 4.9973606138747434e-05, + "loss": 1.147, "num_input_tokens_seen": 10567680, "step": 1290 }, { - "epoch": 0.1487584391806843, - "grad_norm": 0.7769544124603271, - "learning_rate": 4.981484060471444e-05, - "loss": 1.0809, + "epoch": 0.062424969987995196, + "grad_norm": 0.8171849250793457, + "learning_rate": 4.9971754837296516e-05, + "loss": 0.8778, "num_input_tokens_seen": 10649600, "step": 1300 }, { - "epoch": 0.14990273486668956, - "grad_norm": 0.5716103315353394, - "learning_rate": 4.9802307477621084e-05, - "loss": 1.1128, + "epoch": 0.06290516206482594, + "grad_norm": 0.8324936628341675, + "learning_rate": 4.9969840827935776e-05, + "loss": 1.249, "num_input_tokens_seen": 10731520, "step": 1310 }, { - "epoch": 0.1510470305526948, - "grad_norm": 0.546258807182312, - "learning_rate": 4.978936563461854e-05, - "loss": 1.2888, + "epoch": 0.06338535414165666, + "grad_norm": 0.5857701897621155, + "learning_rate": 4.99678641154716e-05, + "loss": 0.6743, "num_input_tokens_seen": 10813440, "step": 1320 }, { - "epoch": 0.15219132623870008, - "grad_norm": 0.9458239674568176, - "learning_rate": 4.9776015288974736e-05, - "loss": 1.3032, + "epoch": 0.06386554621848739, + "grad_norm": 0.6449385285377502, + "learning_rate": 4.9965824704867806e-05, + "loss": 1.0631, "num_input_tokens_seen": 10895360, "step": 1330 }, { - "epoch": 0.15333562192470535, - "grad_norm": 0.43678519129753113, - "learning_rate": 4.976225666068932e-05, - "loss": 0.9843, + "epoch": 0.06434573829531813, + "grad_norm": 0.5496826767921448, + "learning_rate": 4.996372260124567e-05, + "loss": 0.8721, "num_input_tokens_seen": 10977280, "step": 1340 }, { - "epoch": 0.15447991761071062, - "grad_norm": 0.5537676811218262, - "learning_rate": 4.9748089976489996e-05, - "loss": 1.3003, + "epoch": 0.06482593037214886, + "grad_norm": 0.6244034767150879, + "learning_rate": 4.996155780988389e-05, + "loss": 0.7503, "num_input_tokens_seen": 11059200, "step": 1350 }, { - "epoch": 0.15562421329671586, - "grad_norm": 0.5468762516975403, - "learning_rate": 4.9733515469828795e-05, - "loss": 1.4718, + "epoch": 0.0653061224489796, + "grad_norm": 0.5835392475128174, + "learning_rate": 4.9959330336218605e-05, + "loss": 0.8589, "num_input_tokens_seen": 11141120, "step": 1360 }, { - "epoch": 0.15676850898272113, - "grad_norm": 0.5907567143440247, - "learning_rate": 4.971853338087825e-05, - "loss": 1.3703, + "epoch": 0.06578631452581032, + "grad_norm": 0.5886926054954529, + "learning_rate": 4.995704018584334e-05, + "loss": 0.9261, "num_input_tokens_seen": 11223040, "step": 1370 }, { - "epoch": 0.1579128046687264, - "grad_norm": 0.6227702498435974, - "learning_rate": 4.97031439565274e-05, - "loss": 1.0876, + "epoch": 0.06626650660264105, + "grad_norm": 0.6225190758705139, + "learning_rate": 4.9954687364508996e-05, + "loss": 0.7139, "num_input_tokens_seen": 11304960, "step": 1380 }, { - "epoch": 0.15905710035473167, - "grad_norm": 0.5486903786659241, - "learning_rate": 4.9687347450377755e-05, - "loss": 1.2873, + "epoch": 0.06674669867947179, + "grad_norm": 0.7748022079467773, + "learning_rate": 4.995227187812389e-05, + "loss": 0.8474, "num_input_tokens_seen": 11386880, "step": 1390 }, { - "epoch": 0.16020139604073694, - "grad_norm": 1.0276037454605103, - "learning_rate": 4.9671144122739106e-05, - "loss": 1.4622, + "epoch": 0.06722689075630252, + "grad_norm": 0.625622570514679, + "learning_rate": 4.9949793732753656e-05, + "loss": 1.0527, "num_input_tokens_seen": 11468800, "step": 1400 }, { - "epoch": 0.16134569172674218, - "grad_norm": 0.5728926658630371, - "learning_rate": 4.9654534240625225e-05, - "loss": 1.0993, + "epoch": 0.06770708283313326, + "grad_norm": 0.5124455094337463, + "learning_rate": 4.994725293462132e-05, + "loss": 0.9405, "num_input_tokens_seen": 11550720, "step": 1410 }, { - "epoch": 0.16248998741274745, - "grad_norm": 0.5300357341766357, - "learning_rate": 4.9637518077749476e-05, - "loss": 1.156, + "epoch": 0.06818727490996399, + "grad_norm": 0.5305024981498718, + "learning_rate": 4.994464949010722e-05, + "loss": 0.9588, "num_input_tokens_seen": 11632640, "step": 1420 }, { - "epoch": 0.16363428309875272, - "grad_norm": 0.9682191610336304, - "learning_rate": 4.962009591452032e-05, - "loss": 1.266, + "epoch": 0.06866746698679471, + "grad_norm": 0.5525708198547363, + "learning_rate": 4.994198340574898e-05, + "loss": 0.9032, "num_input_tokens_seen": 11714560, "step": 1430 }, { - "epoch": 0.164778578784758, - "grad_norm": 0.5290905237197876, - "learning_rate": 4.960226803803664e-05, - "loss": 1.2794, + "epoch": 0.06914765906362545, + "grad_norm": 0.5921324491500854, + "learning_rate": 4.993925468824156e-05, + "loss": 0.8397, "num_input_tokens_seen": 11796480, "step": 1440 }, { - "epoch": 0.16592287447076323, - "grad_norm": 0.8597230315208435, - "learning_rate": 4.958403474208308e-05, - "loss": 1.2099, + "epoch": 0.06962785114045618, + "grad_norm": 0.5875818729400635, + "learning_rate": 4.99364633444372e-05, + "loss": 0.9019, "num_input_tokens_seen": 11878400, "step": 1450 }, { - "epoch": 0.1670671701567685, - "grad_norm": 0.6521583795547485, - "learning_rate": 4.9565396327125155e-05, - "loss": 1.3631, + "epoch": 0.07010804321728692, + "grad_norm": 0.49930045008659363, + "learning_rate": 4.993360938134537e-05, + "loss": 0.999, "num_input_tokens_seen": 11960320, "step": 1460 }, { - "epoch": 0.16821146584277377, - "grad_norm": 0.6258875727653503, - "learning_rate": 4.95463531003043e-05, - "loss": 1.1778, + "epoch": 0.07058823529411765, + "grad_norm": 0.5186768770217896, + "learning_rate": 4.993069280613282e-05, + "loss": 1.0207, "num_input_tokens_seen": 12042240, "step": 1470 }, { - "epoch": 0.16935576152877904, - "grad_norm": 3.744122266769409, - "learning_rate": 4.952690537543287e-05, - "loss": 1.3796, + "epoch": 0.07106842737094837, + "grad_norm": 0.6106103658676147, + "learning_rate": 4.9927713626123524e-05, + "loss": 0.9636, "num_input_tokens_seen": 12124160, "step": 1480 }, { - "epoch": 0.1705000572147843, - "grad_norm": 1.4813406467437744, - "learning_rate": 4.9507053472988867e-05, - "loss": 1.2806, + "epoch": 0.07154861944777911, + "grad_norm": 0.5551833510398865, + "learning_rate": 4.992467184879865e-05, + "loss": 0.9386, "num_input_tokens_seen": 12206080, "step": 1490 }, { - "epoch": 0.17164435290078955, - "grad_norm": 0.5667291283607483, - "learning_rate": 4.9486797720110746e-05, - "loss": 1.3344, + "epoch": 0.07202881152460984, + "grad_norm": 0.7018815279006958, + "learning_rate": 4.9921567481796585e-05, + "loss": 1.0065, "num_input_tokens_seen": 12288000, "step": 1500 }, { - "epoch": 0.17278864858679482, - "grad_norm": 1.1978493928909302, - "learning_rate": 4.946613845059199e-05, - "loss": 1.2448, + "epoch": 0.07250900360144058, + "grad_norm": 0.5908045768737793, + "learning_rate": 4.9918400532912845e-05, + "loss": 0.9552, "num_input_tokens_seen": 12369920, "step": 1510 }, { - "epoch": 0.1739329442728001, - "grad_norm": 1.018848180770874, - "learning_rate": 4.9445076004875596e-05, - "loss": 1.2916, + "epoch": 0.07298919567827131, + "grad_norm": 0.9950196146965027, + "learning_rate": 4.991517101010015e-05, + "loss": 0.9035, "num_input_tokens_seen": 12451840, "step": 1520 }, { - "epoch": 0.17507723995880536, - "grad_norm": 0.5812198519706726, - "learning_rate": 4.9423610730048495e-05, - "loss": 1.3731, + "epoch": 0.07346938775510205, + "grad_norm": 0.5551639199256897, + "learning_rate": 4.9911878921468304e-05, + "loss": 1.0423, "num_input_tokens_seen": 12533760, "step": 1530 }, { - "epoch": 0.1762215356448106, - "grad_norm": 0.5890350341796875, - "learning_rate": 4.940174297983581e-05, - "loss": 1.3654, + "epoch": 0.07394957983193277, + "grad_norm": 0.5617671012878418, + "learning_rate": 4.990852427528427e-05, + "loss": 0.9216, "num_input_tokens_seen": 12615680, "step": 1540 }, { - "epoch": 0.17736583133081588, - "grad_norm": 0.5180538892745972, - "learning_rate": 4.937947311459503e-05, - "loss": 1.2584, + "epoch": 0.0744297719087635, + "grad_norm": 0.6270405650138855, + "learning_rate": 4.9905107079972064e-05, + "loss": 0.9563, "num_input_tokens_seen": 12697600, "step": 1550 }, { - "epoch": 0.17851012701682115, - "grad_norm": 0.6072027087211609, - "learning_rate": 4.9356801501310105e-05, - "loss": 1.372, + "epoch": 0.07490996398559424, + "grad_norm": 0.5399815440177917, + "learning_rate": 4.990162734411279e-05, + "loss": 0.9665, "num_input_tokens_seen": 12779520, "step": 1560 }, { - "epoch": 0.17965442270282642, - "grad_norm": 0.5989131331443787, - "learning_rate": 4.933372851358532e-05, - "loss": 1.562, + "epoch": 0.07539015606242497, + "grad_norm": 0.55797278881073, + "learning_rate": 4.989808507644461e-05, + "loss": 1.0059, "num_input_tokens_seen": 12861440, "step": 1570 }, { - "epoch": 0.1807987183888317, - "grad_norm": 0.6137218475341797, - "learning_rate": 4.9310254531639235e-05, - "loss": 1.2423, + "epoch": 0.07587034813925571, + "grad_norm": 0.5673569440841675, + "learning_rate": 4.989448028586269e-05, + "loss": 0.9245, "num_input_tokens_seen": 12943360, "step": 1580 }, { - "epoch": 0.18194301407483693, - "grad_norm": 0.5463627576828003, - "learning_rate": 4.928637994229834e-05, - "loss": 1.3979, + "epoch": 0.07635054021608643, + "grad_norm": 0.5120587348937988, + "learning_rate": 4.989081298141921e-05, + "loss": 0.8285, "num_input_tokens_seen": 13025280, "step": 1590 }, { - "epoch": 0.1830873097608422, - "grad_norm": 0.5558052062988281, - "learning_rate": 4.9262105138990745e-05, - "loss": 1.3811, + "epoch": 0.07683073229291716, + "grad_norm": 0.6352382898330688, + "learning_rate": 4.988708317232334e-05, + "loss": 0.8067, "num_input_tokens_seen": 13107200, "step": 1600 }, { - "epoch": 0.18423160544684747, - "grad_norm": 0.6902844905853271, - "learning_rate": 4.9237430521739626e-05, - "loss": 1.4006, + "epoch": 0.0773109243697479, + "grad_norm": 0.6657270789146423, + "learning_rate": 4.988329086794122e-05, + "loss": 0.8029, "num_input_tokens_seen": 13189120, "step": 1610 }, { - "epoch": 0.18537590113285274, - "grad_norm": 1.180159568786621, - "learning_rate": 4.92123564971567e-05, - "loss": 1.2214, + "epoch": 0.07779111644657863, + "grad_norm": 0.5678449869155884, + "learning_rate": 4.9879436077795884e-05, + "loss": 1.1687, "num_input_tokens_seen": 13271040, "step": 1620 }, { - "epoch": 0.18652019681885798, - "grad_norm": 0.6135653853416443, - "learning_rate": 4.918688347843549e-05, - "loss": 1.2192, + "epoch": 0.07827130852340937, + "grad_norm": 0.5785603523254395, + "learning_rate": 4.98755188115673e-05, + "loss": 0.9474, "num_input_tokens_seen": 13352960, "step": 1630 }, { - "epoch": 0.18766449250486325, - "grad_norm": 0.5236485004425049, - "learning_rate": 4.916101188534452e-05, - "loss": 1.4384, + "epoch": 0.0787515006002401, + "grad_norm": 0.5578638315200806, + "learning_rate": 4.9871539079092344e-05, + "loss": 0.9421, "num_input_tokens_seen": 13434880, "step": 1640 }, { - "epoch": 0.18880878819086852, - "grad_norm": 0.6157830357551575, - "learning_rate": 4.9134742144220394e-05, - "loss": 1.2473, + "epoch": 0.07923169267707082, + "grad_norm": 0.9270123243331909, + "learning_rate": 4.9867496890364726e-05, + "loss": 1.0174, "num_input_tokens_seen": 13516800, "step": 1650 }, { - "epoch": 0.1899530838768738, - "grad_norm": 1.5510542392730713, - "learning_rate": 4.910807468796079e-05, - "loss": 1.277, + "epoch": 0.07971188475390156, + "grad_norm": 0.4947386085987091, + "learning_rate": 4.9863392255535e-05, + "loss": 1.1363, "num_input_tokens_seen": 13598720, "step": 1660 }, { - "epoch": 0.19109737956287906, - "grad_norm": 0.5472043752670288, - "learning_rate": 4.90810099560173e-05, - "loss": 1.31, + "epoch": 0.08019207683073229, + "grad_norm": 1.040479063987732, + "learning_rate": 4.985922518491054e-05, + "loss": 0.9505, "num_input_tokens_seen": 13680640, "step": 1670 }, { - "epoch": 0.1922416752488843, - "grad_norm": 0.5276287794113159, - "learning_rate": 4.90535483943882e-05, - "loss": 1.2949, + "epoch": 0.08067226890756303, + "grad_norm": 0.6103275418281555, + "learning_rate": 4.9854995688955494e-05, + "loss": 1.0416, "num_input_tokens_seen": 13762560, "step": 1680 }, { - "epoch": 0.19338597093488957, - "grad_norm": 0.5264617800712585, - "learning_rate": 4.902569045561113e-05, - "loss": 1.3735, + "epoch": 0.08115246098439376, + "grad_norm": 0.5951912999153137, + "learning_rate": 4.9850703778290784e-05, + "loss": 1.164, "num_input_tokens_seen": 13844480, "step": 1690 }, { - "epoch": 0.19453026662089484, - "grad_norm": 0.5804072022438049, - "learning_rate": 4.899743659875556e-05, - "loss": 1.2904, + "epoch": 0.08163265306122448, + "grad_norm": 0.5379506945610046, + "learning_rate": 4.984634946369404e-05, + "loss": 1.0884, "num_input_tokens_seen": 13926400, "step": 1700 }, { - "epoch": 0.1956745623069001, - "grad_norm": 0.5250833630561829, - "learning_rate": 4.896878728941531e-05, - "loss": 1.3387, + "epoch": 0.08211284513805522, + "grad_norm": 0.6716327667236328, + "learning_rate": 4.984193275609964e-05, + "loss": 1.2702, "num_input_tokens_seen": 14008320, "step": 1710 }, { - "epoch": 0.19681885799290535, - "grad_norm": 0.6384326815605164, - "learning_rate": 4.893974299970082e-05, - "loss": 1.1596, + "epoch": 0.08259303721488595, + "grad_norm": 0.555696964263916, + "learning_rate": 4.983745366659859e-05, + "loss": 0.9906, "num_input_tokens_seen": 14090240, "step": 1720 }, { - "epoch": 0.19796315367891062, - "grad_norm": 0.5223713517189026, - "learning_rate": 4.891030420823142e-05, - "loss": 1.296, + "epoch": 0.08307322929171669, + "grad_norm": 0.5970556735992432, + "learning_rate": 4.983291220643858e-05, + "loss": 0.9924, "num_input_tokens_seen": 14172160, "step": 1730 }, { - "epoch": 0.1991074493649159, - "grad_norm": 0.5310291051864624, - "learning_rate": 4.888047140012737e-05, - "loss": 1.3196, + "epoch": 0.08355342136854742, + "grad_norm": 0.5664453506469727, + "learning_rate": 4.982830838702392e-05, + "loss": 1.0358, "num_input_tokens_seen": 14254080, "step": 1740 }, { - "epoch": 0.20025174505092116, - "grad_norm": 0.6313158869743347, - "learning_rate": 4.885024506700195e-05, - "loss": 1.1085, + "epoch": 0.08403361344537816, + "grad_norm": 1.6528098583221436, + "learning_rate": 4.98236422199155e-05, + "loss": 0.9054, "num_input_tokens_seen": 14336000, "step": 1750 }, { - "epoch": 0.20139604073692643, - "grad_norm": 0.9189643263816833, - "learning_rate": 4.8819625706953286e-05, - "loss": 1.3757, + "epoch": 0.08451380552220888, + "grad_norm": 0.5737955570220947, + "learning_rate": 4.9818913716830784e-05, + "loss": 0.985, "num_input_tokens_seen": 14417920, "step": 1760 }, { - "epoch": 0.20254033642293168, - "grad_norm": 0.5741713047027588, - "learning_rate": 4.8788613824556194e-05, - "loss": 1.1697, + "epoch": 0.08499399759903961, + "grad_norm": 0.5684435963630676, + "learning_rate": 4.981412288964377e-05, + "loss": 1.0279, "num_input_tokens_seen": 14499840, "step": 1770 }, { - "epoch": 0.20368463210893695, - "grad_norm": 0.5382892489433289, - "learning_rate": 4.875720993085384e-05, - "loss": 1.2916, + "epoch": 0.08547418967587035, + "grad_norm": 0.5723880529403687, + "learning_rate": 4.9809269750384956e-05, + "loss": 1.01, "num_input_tokens_seen": 14581760, "step": 1780 }, { - "epoch": 0.20482892779494222, - "grad_norm": 0.5520409941673279, - "learning_rate": 4.8725414543349326e-05, - "loss": 1.1496, + "epoch": 0.08595438175270108, + "grad_norm": 0.5838521122932434, + "learning_rate": 4.980435431124133e-05, + "loss": 1.1156, "num_input_tokens_seen": 14663680, "step": 1790 }, { - "epoch": 0.2059732234809475, - "grad_norm": 0.5150988101959229, - "learning_rate": 4.869322818599714e-05, - "loss": 1.265, + "epoch": 0.08643457382953182, + "grad_norm": 0.45137259364128113, + "learning_rate": 4.97993765845563e-05, + "loss": 0.9487, "num_input_tokens_seen": 14745600, "step": 1800 }, { - "epoch": 0.20711751916695273, - "grad_norm": 0.5061900019645691, - "learning_rate": 4.8660651389194576e-05, - "loss": 1.293, + "epoch": 0.08691476590636255, + "grad_norm": 0.5664239525794983, + "learning_rate": 4.9794336582829714e-05, + "loss": 0.9613, "num_input_tokens_seen": 14827520, "step": 1810 }, { - "epoch": 0.208261814852958, - "grad_norm": 0.5306046009063721, - "learning_rate": 4.862768468977293e-05, - "loss": 1.3073, + "epoch": 0.08739495798319327, + "grad_norm": 0.7043583989143372, + "learning_rate": 4.9789234318717784e-05, + "loss": 1.0248, "num_input_tokens_seen": 14909440, "step": 1820 }, { - "epoch": 0.20940611053896327, - "grad_norm": 0.5865935683250427, - "learning_rate": 4.8594328630988696e-05, - "loss": 1.171, + "epoch": 0.08787515006002401, + "grad_norm": 0.5338053703308105, + "learning_rate": 4.978406980503308e-05, + "loss": 1.0187, "num_input_tokens_seen": 14991360, "step": 1830 }, { - "epoch": 0.21055040622496854, - "grad_norm": 0.5565701723098755, - "learning_rate": 4.8560583762514594e-05, - "loss": 1.2818, + "epoch": 0.08835534213685474, + "grad_norm": 0.5779690146446228, + "learning_rate": 4.9778843054744494e-05, + "loss": 0.9926, "num_input_tokens_seen": 15073280, "step": 1840 }, { - "epoch": 0.2116947019109738, - "grad_norm": 0.504626452922821, - "learning_rate": 4.852645064043053e-05, - "loss": 1.0901, + "epoch": 0.08883553421368548, + "grad_norm": 0.5544894337654114, + "learning_rate": 4.977355408097719e-05, + "loss": 0.9672, "num_input_tokens_seen": 15155200, "step": 1850 }, { - "epoch": 0.21283899759697905, - "grad_norm": 0.49429330229759216, - "learning_rate": 4.84919298272144e-05, - "loss": 1.2341, + "epoch": 0.0893157262905162, + "grad_norm": 0.7703062891960144, + "learning_rate": 4.9768202897012595e-05, + "loss": 0.8497, "num_input_tokens_seen": 15237120, "step": 1860 }, { - "epoch": 0.21398329328298432, - "grad_norm": 0.6143152117729187, - "learning_rate": 4.8457021891732866e-05, - "loss": 1.415, + "epoch": 0.08979591836734693, + "grad_norm": 0.5235300660133362, + "learning_rate": 4.9762789516288354e-05, + "loss": 1.0258, "num_input_tokens_seen": 15319040, "step": 1870 }, { - "epoch": 0.2151275889689896, - "grad_norm": 0.5530739426612854, - "learning_rate": 4.842172740923194e-05, - "loss": 1.5628, + "epoch": 0.09027611044417767, + "grad_norm": 0.5461897253990173, + "learning_rate": 4.97573139523983e-05, + "loss": 0.9799, "num_input_tokens_seen": 15400960, "step": 1880 }, { - "epoch": 0.21627188465499486, - "grad_norm": 0.5200616717338562, - "learning_rate": 4.838604696132753e-05, - "loss": 1.4214, + "epoch": 0.0907563025210084, + "grad_norm": 2.8185510635375977, + "learning_rate": 4.9751776219092405e-05, + "loss": 1.0616, "num_input_tokens_seen": 15482880, "step": 1890 }, { - "epoch": 0.2174161803410001, - "grad_norm": 0.9391474723815918, - "learning_rate": 4.8349981135995826e-05, - "loss": 1.2436, + "epoch": 0.09123649459783914, + "grad_norm": 0.5489822030067444, + "learning_rate": 4.9746176330276783e-05, + "loss": 0.894, "num_input_tokens_seen": 15564800, "step": 1900 }, { - "epoch": 0.21856047602700537, - "grad_norm": 0.49321290850639343, - "learning_rate": 4.831353052756367e-05, - "loss": 1.3363, + "epoch": 0.09171668667466987, + "grad_norm": 0.5419192910194397, + "learning_rate": 4.97405143000136e-05, + "loss": 1.069, "num_input_tokens_seen": 15646720, "step": 1910 }, { - "epoch": 0.21970477171301064, - "grad_norm": 0.5570663213729858, - "learning_rate": 4.8276695736698704e-05, - "loss": 1.3984, + "epoch": 0.0921968787515006, + "grad_norm": 0.5786951184272766, + "learning_rate": 4.9734790142521096e-05, + "loss": 1.0716, "num_input_tokens_seen": 15728640, "step": 1920 }, { - "epoch": 0.2208490673990159, - "grad_norm": 0.5171666145324707, - "learning_rate": 4.823947737039948e-05, - "loss": 1.2282, + "epoch": 0.09267707082833133, + "grad_norm": 0.5193120837211609, + "learning_rate": 4.9729003872173494e-05, + "loss": 1.1271, "num_input_tokens_seen": 15810560, "step": 1930 }, { - "epoch": 0.22199336308502116, - "grad_norm": 0.6550266742706299, - "learning_rate": 4.8201876041985496e-05, - "loss": 1.32, + "epoch": 0.09315726290516206, + "grad_norm": 0.5859541296958923, + "learning_rate": 4.972315550350102e-05, + "loss": 0.9171, "num_input_tokens_seen": 15892480, "step": 1940 }, { - "epoch": 0.22313765877102643, - "grad_norm": 0.6058536171913147, - "learning_rate": 4.8163892371087045e-05, - "loss": 1.2447, + "epoch": 0.0936374549819928, + "grad_norm": 1.0095491409301758, + "learning_rate": 4.971724505118982e-05, + "loss": 0.9881, "num_input_tokens_seen": 15974400, "step": 1950 }, { - "epoch": 0.2242819544570317, - "grad_norm": 0.7001304626464844, - "learning_rate": 4.812552698363502e-05, - "loss": 1.1607, + "epoch": 0.09411764705882353, + "grad_norm": 0.5563061237335205, + "learning_rate": 4.971127253008194e-05, + "loss": 1.0533, "num_input_tokens_seen": 16056320, "step": 1960 }, { - "epoch": 0.22542625014303697, - "grad_norm": 0.8715665340423584, - "learning_rate": 4.8086780511850606e-05, - "loss": 1.3617, + "epoch": 0.09459783913565427, + "grad_norm": 0.5516453385353088, + "learning_rate": 4.970523795517532e-05, + "loss": 1.0697, "num_input_tokens_seen": 16138240, "step": 1970 }, { - "epoch": 0.22657054582904224, - "grad_norm": 0.5489494800567627, - "learning_rate": 4.8047653594234855e-05, - "loss": 1.1251, + "epoch": 0.095078031212485, + "grad_norm": 0.6930283308029175, + "learning_rate": 4.969914134162368e-05, + "loss": 0.8598, "num_input_tokens_seen": 16220160, "step": 1980 }, { - "epoch": 0.22771484151504748, - "grad_norm": 0.5537709593772888, - "learning_rate": 4.800814687555817e-05, - "loss": 1.193, + "epoch": 0.09555822328931572, + "grad_norm": 0.5376786589622498, + "learning_rate": 4.9692982704736566e-05, + "loss": 0.9143, "num_input_tokens_seen": 16302080, "step": 1990 }, { - "epoch": 0.22885913720105275, - "grad_norm": 0.9744300842285156, - "learning_rate": 4.796826100684967e-05, - "loss": 1.1411, + "epoch": 0.09603841536614646, + "grad_norm": 0.849349856376648, + "learning_rate": 4.968676205997925e-05, + "loss": 0.8093, "num_input_tokens_seen": 16384000, "step": 2000 }, { - "epoch": 0.23000343288705802, - "grad_norm": 0.6724236607551575, - "learning_rate": 4.7927996645386476e-05, - "loss": 1.2578, + "epoch": 0.09651860744297719, + "grad_norm": 0.560719907283783, + "learning_rate": 4.9680479422972735e-05, + "loss": 1.0241, "num_input_tokens_seen": 16465920, "step": 2010 }, { - "epoch": 0.2311477285730633, - "grad_norm": 0.6202824711799622, - "learning_rate": 4.7887354454682854e-05, - "loss": 1.1249, + "epoch": 0.09699879951980793, + "grad_norm": 0.5490632653236389, + "learning_rate": 4.9674134809493686e-05, + "loss": 0.9488, "num_input_tokens_seen": 16547840, "step": 2020 }, { - "epoch": 0.23229202425906853, - "grad_norm": 0.5438826680183411, - "learning_rate": 4.784633510447932e-05, - "loss": 1.1754, + "epoch": 0.09747899159663866, + "grad_norm": 0.45328429341316223, + "learning_rate": 4.9667728235474396e-05, + "loss": 0.9864, "num_input_tokens_seen": 16629760, "step": 2030 }, { - "epoch": 0.2334363199450738, - "grad_norm": 0.5820137858390808, - "learning_rate": 4.7804939270731564e-05, - "loss": 1.2965, + "epoch": 0.09795918367346938, + "grad_norm": 0.5597791075706482, + "learning_rate": 4.9661259717002764e-05, + "loss": 0.8736, "num_input_tokens_seen": 16711680, "step": 2040 }, { - "epoch": 0.23458061563107907, - "grad_norm": 0.5191706418991089, - "learning_rate": 4.776316763559933e-05, - "loss": 1.2211, + "epoch": 0.09843937575030012, + "grad_norm": 0.6936686038970947, + "learning_rate": 4.9654729270322234e-05, + "loss": 1.0361, "num_input_tokens_seen": 16793600, "step": 2050 }, { - "epoch": 0.23572491131708434, - "grad_norm": 0.9790087342262268, - "learning_rate": 4.7721020887435186e-05, - "loss": 1.0741, + "epoch": 0.09891956782713085, + "grad_norm": 0.5862544178962708, + "learning_rate": 4.964813691183174e-05, + "loss": 0.9958, "num_input_tokens_seen": 16875520, "step": 2060 }, { - "epoch": 0.2368692070030896, - "grad_norm": 0.48102623224258423, - "learning_rate": 4.767849972077315e-05, - "loss": 1.1755, + "epoch": 0.09939975990396159, + "grad_norm": 2.3423945903778076, + "learning_rate": 4.964148265808573e-05, + "loss": 0.8843, "num_input_tokens_seen": 16957440, "step": 2070 }, { - "epoch": 0.23801350268909485, - "grad_norm": 0.5206550359725952, - "learning_rate": 4.763560483631728e-05, - "loss": 1.2918, + "epoch": 0.09987995198079232, + "grad_norm": 0.5327481031417847, + "learning_rate": 4.963476652579404e-05, + "loss": 0.8698, "num_input_tokens_seen": 17039360, "step": 2080 }, { - "epoch": 0.23915779837510012, - "grad_norm": 0.620953381061554, - "learning_rate": 4.75923369409301e-05, - "loss": 1.2861, + "epoch": 0.10036014405762304, + "grad_norm": 0.5698966979980469, + "learning_rate": 4.962798853182192e-05, + "loss": 1.1505, "num_input_tokens_seen": 17121280, "step": 2090 }, { - "epoch": 0.2403020940611054, - "grad_norm": 0.5213440656661987, - "learning_rate": 4.7548696747620956e-05, - "loss": 1.2797, + "epoch": 0.10084033613445378, + "grad_norm": 1.078123688697815, + "learning_rate": 4.9621148693189954e-05, + "loss": 1.131, "num_input_tokens_seen": 17203200, "step": 2100 }, { - "epoch": 0.24144638974711066, - "grad_norm": 0.5252777934074402, - "learning_rate": 4.750468497553429e-05, - "loss": 1.2802, + "epoch": 0.10132052821128451, + "grad_norm": 0.4983321726322174, + "learning_rate": 4.9614247027074024e-05, + "loss": 1.2084, "num_input_tokens_seen": 17285120, "step": 2110 }, { - "epoch": 0.2425906854331159, - "grad_norm": 0.553130030632019, - "learning_rate": 4.746030234993775e-05, - "loss": 1.2026, + "epoch": 0.10180072028811525, + "grad_norm": 0.5719578266143799, + "learning_rate": 4.960728355080527e-05, + "loss": 0.822, "num_input_tokens_seen": 17367040, "step": 2120 }, { - "epoch": 0.24373498111912117, - "grad_norm": 0.5378229022026062, - "learning_rate": 4.741554960221027e-05, - "loss": 1.4968, + "epoch": 0.10228091236494598, + "grad_norm": 0.6987754106521606, + "learning_rate": 4.9600258281870046e-05, + "loss": 0.9156, "num_input_tokens_seen": 17448960, "step": 2130 }, { - "epoch": 0.24487927680512644, - "grad_norm": 2.1330316066741943, - "learning_rate": 4.7370427469830016e-05, - "loss": 1.4157, + "epoch": 0.1027611044417767, + "grad_norm": 0.9254079461097717, + "learning_rate": 4.959317123790988e-05, + "loss": 0.9211, "num_input_tokens_seen": 17530880, "step": 2140 }, { - "epoch": 0.24602357249113171, - "grad_norm": 0.555316150188446, - "learning_rate": 4.73249366963622e-05, - "loss": 1.2478, + "epoch": 0.10324129651860744, + "grad_norm": 0.5085683465003967, + "learning_rate": 4.958602243672145e-05, + "loss": 0.9979, "num_input_tokens_seen": 17612800, "step": 2150 }, { - "epoch": 0.24716786817713698, - "grad_norm": 0.5266692638397217, - "learning_rate": 4.727907803144686e-05, - "loss": 1.4627, + "epoch": 0.10372148859543817, + "grad_norm": 0.6418617963790894, + "learning_rate": 4.9578811896256475e-05, + "loss": 0.9711, "num_input_tokens_seen": 17694720, "step": 2160 }, { - "epoch": 0.24831216386314223, - "grad_norm": 0.5252066254615784, - "learning_rate": 4.723285223078653e-05, - "loss": 1.2206, + "epoch": 0.10420168067226891, + "grad_norm": 0.5556450486183167, + "learning_rate": 4.957153963462172e-05, + "loss": 0.9847, "num_input_tokens_seen": 17776640, "step": 2170 }, { - "epoch": 0.2494564595491475, - "grad_norm": 0.6599643230438232, - "learning_rate": 4.71862600561337e-05, - "loss": 1.1268, + "epoch": 0.10468187274909964, + "grad_norm": 0.5734561085700989, + "learning_rate": 4.9564205670078965e-05, + "loss": 0.9521, "num_input_tokens_seen": 17858560, "step": 2180 }, { - "epoch": 0.25060075523515274, - "grad_norm": 1.4253127574920654, - "learning_rate": 4.713930227527836e-05, - "loss": 1.0187, + "epoch": 0.10516206482593037, + "grad_norm": 0.5195381045341492, + "learning_rate": 4.955681002104492e-05, + "loss": 0.9161, "num_input_tokens_seen": 17940480, "step": 2190 }, { - "epoch": 0.251745050921158, - "grad_norm": 0.9951733946800232, - "learning_rate": 4.709197966203528e-05, - "loss": 0.875, + "epoch": 0.1056422569027611, + "grad_norm": 0.5360829830169678, + "learning_rate": 4.954935270609119e-05, + "loss": 0.944, "num_input_tokens_seen": 18022400, "step": 2200 }, { - "epoch": 0.2528893466071633, - "grad_norm": 0.552880585193634, - "learning_rate": 4.704429299623129e-05, - "loss": 1.2277, + "epoch": 0.10612244897959183, + "grad_norm": 0.5251927375793457, + "learning_rate": 4.9541833743944244e-05, + "loss": 1.0233, "num_input_tokens_seen": 18104320, "step": 2210 }, { - "epoch": 0.25403364229316855, - "grad_norm": 0.47009891271591187, - "learning_rate": 4.6996243063692446e-05, - "loss": 1.4153, + "epoch": 0.10660264105642257, + "grad_norm": 0.5330286622047424, + "learning_rate": 4.953425315348534e-05, + "loss": 0.9329, "num_input_tokens_seen": 18186240, "step": 2220 }, { - "epoch": 0.2551779379791738, - "grad_norm": 0.5296884179115295, - "learning_rate": 4.694783065623102e-05, - "loss": 1.1905, + "epoch": 0.1070828331332533, + "grad_norm": 0.5502789616584778, + "learning_rate": 4.952661095375051e-05, + "loss": 0.9987, "num_input_tokens_seen": 18268160, "step": 2230 }, { - "epoch": 0.2563222336651791, - "grad_norm": 0.561241865158081, - "learning_rate": 4.68990565716325e-05, - "loss": 1.363, + "epoch": 0.10756302521008404, + "grad_norm": 0.6187983751296997, + "learning_rate": 4.95189071639305e-05, + "loss": 0.8519, "num_input_tokens_seen": 18350080, "step": 2240 }, { - "epoch": 0.25746652935118436, - "grad_norm": 0.4963686466217041, - "learning_rate": 4.6849921613642456e-05, - "loss": 1.3028, + "epoch": 0.10804321728691477, + "grad_norm": 0.5929238796234131, + "learning_rate": 4.951114180337069e-05, + "loss": 0.8418, "num_input_tokens_seen": 18432000, "step": 2250 }, { - "epoch": 0.25861082503718963, - "grad_norm": 0.5019493103027344, - "learning_rate": 4.680042659195325e-05, - "loss": 1.0479, + "epoch": 0.1085234093637455, + "grad_norm": 0.4955889880657196, + "learning_rate": 4.95033148915711e-05, + "loss": 0.7989, "num_input_tokens_seen": 18513920, "step": 2260 }, { - "epoch": 0.2597551207231949, - "grad_norm": 0.5507912039756775, - "learning_rate": 4.6750572322190716e-05, - "loss": 1.4381, + "epoch": 0.10900360144057623, + "grad_norm": 0.5626336336135864, + "learning_rate": 4.949542644818631e-05, + "loss": 0.8756, "num_input_tokens_seen": 18595840, "step": 2270 }, { - "epoch": 0.2608994164092001, - "grad_norm": 0.45838436484336853, - "learning_rate": 4.6700359625900724e-05, - "loss": 1.1211, + "epoch": 0.10948379351740696, + "grad_norm": 1.048539638519287, + "learning_rate": 4.948747649302542e-05, + "loss": 1.0697, "num_input_tokens_seen": 18677760, "step": 2280 }, { - "epoch": 0.2620437120952054, - "grad_norm": 0.49079829454421997, - "learning_rate": 4.664978933053562e-05, - "loss": 1.1206, + "epoch": 0.1099639855942377, + "grad_norm": 0.49436965584754944, + "learning_rate": 4.947946504605198e-05, + "loss": 0.9839, "num_input_tokens_seen": 18759680, "step": 2290 }, { - "epoch": 0.26318800778121065, - "grad_norm": 1.0304409265518188, - "learning_rate": 4.659886226944063e-05, - "loss": 1.2446, + "epoch": 0.11044417767106843, + "grad_norm": 0.5616737008094788, + "learning_rate": 4.947139212738395e-05, + "loss": 0.9213, "num_input_tokens_seen": 18841600, "step": 2300 }, { - "epoch": 0.2643323034672159, - "grad_norm": 0.6165328025817871, - "learning_rate": 4.65475792818401e-05, - "loss": 1.1047, + "epoch": 0.11092436974789915, + "grad_norm": 0.5444278120994568, + "learning_rate": 4.946325775729368e-05, + "loss": 1.0463, "num_input_tokens_seen": 18923520, "step": 2310 }, { - "epoch": 0.2654765991532212, - "grad_norm": 1.5652453899383545, - "learning_rate": 4.6495941212823644e-05, - "loss": 1.175, + "epoch": 0.1114045618247299, + "grad_norm": 0.521336555480957, + "learning_rate": 4.945506195620784e-05, + "loss": 0.9103, "num_input_tokens_seen": 19005440, "step": 2320 }, { - "epoch": 0.26662089483922646, - "grad_norm": 0.5280311107635498, - "learning_rate": 4.644394891333227e-05, - "loss": 1.1473, + "epoch": 0.11188475390156062, + "grad_norm": 0.5483266115188599, + "learning_rate": 4.944680474470731e-05, + "loss": 0.9041, "num_input_tokens_seen": 19087360, "step": 2330 }, { - "epoch": 0.26776519052523173, - "grad_norm": 1.034646987915039, - "learning_rate": 4.639160324014433e-05, - "loss": 1.1003, + "epoch": 0.11236494597839136, + "grad_norm": 0.5167329907417297, + "learning_rate": 4.943848614352724e-05, + "loss": 1.1012, "num_input_tokens_seen": 19169280, "step": 2340 }, { - "epoch": 0.268909486211237, - "grad_norm": 0.4900410771369934, - "learning_rate": 4.633890505586139e-05, - "loss": 1.5043, + "epoch": 0.11284513805522209, + "grad_norm": 0.539591372013092, + "learning_rate": 4.943010617355691e-05, + "loss": 0.8855, "num_input_tokens_seen": 19251200, "step": 2350 }, { - "epoch": 0.2700537818972423, - "grad_norm": 1.9830961227416992, - "learning_rate": 4.6285855228894025e-05, - "loss": 1.3875, + "epoch": 0.11332533013205282, + "grad_norm": 1.1016952991485596, + "learning_rate": 4.94216648558397e-05, + "loss": 0.9905, "num_input_tokens_seen": 19333120, "step": 2360 }, { - "epoch": 0.2711980775832475, - "grad_norm": 0.5432376265525818, - "learning_rate": 4.623245463344753e-05, - "loss": 1.2485, + "epoch": 0.11380552220888356, + "grad_norm": 0.5307420492172241, + "learning_rate": 4.9413162211573075e-05, + "loss": 0.8465, "num_input_tokens_seen": 19415040, "step": 2370 }, { - "epoch": 0.27234237326925276, - "grad_norm": 0.6268254518508911, - "learning_rate": 4.617870414950748e-05, - "loss": 1.6017, + "epoch": 0.11428571428571428, + "grad_norm": 0.5067051649093628, + "learning_rate": 4.9404598262108456e-05, + "loss": 1.3091, "num_input_tokens_seen": 19496960, "step": 2380 }, { - "epoch": 0.273486668955258, - "grad_norm": 0.6118723750114441, - "learning_rate": 4.612460466282525e-05, - "loss": 1.1282, + "epoch": 0.11476590636254502, + "grad_norm": 0.5648155212402344, + "learning_rate": 4.939597302895125e-05, + "loss": 1.0505, "num_input_tokens_seen": 19578880, "step": 2390 }, { - "epoch": 0.2746309646412633, - "grad_norm": 0.6583623290061951, - "learning_rate": 4.607015706490341e-05, - "loss": 1.1795, + "epoch": 0.11524609843937575, + "grad_norm": 0.5425604581832886, + "learning_rate": 4.938728653376075e-05, + "loss": 0.9624, "num_input_tokens_seen": 19660800, "step": 2400 }, { - "epoch": 0.27577526032726857, - "grad_norm": 0.6352823376655579, - "learning_rate": 4.601536225298104e-05, - "loss": 1.111, + "epoch": 0.11572629051620648, + "grad_norm": 0.6649373173713684, + "learning_rate": 4.9378538798350046e-05, + "loss": 0.882, "num_input_tokens_seen": 19742720, "step": 2410 }, { - "epoch": 0.27691955601327384, - "grad_norm": 1.524483561515808, - "learning_rate": 4.5960221130018946e-05, - "loss": 1.1325, + "epoch": 0.11620648259303722, + "grad_norm": 0.4412856698036194, + "learning_rate": 4.936972984468608e-05, + "loss": 0.802, "num_input_tokens_seen": 19824640, "step": 2420 }, { - "epoch": 0.2780638516992791, - "grad_norm": 0.41133037209510803, - "learning_rate": 4.590473460468475e-05, - "loss": 1.2953, + "epoch": 0.11668667466986794, + "grad_norm": 0.5264973044395447, + "learning_rate": 4.936085969488947e-05, + "loss": 0.8687, "num_input_tokens_seen": 19906560, "step": 2430 }, { - "epoch": 0.2792081473852844, - "grad_norm": 0.8059386610984802, - "learning_rate": 4.584890359133797e-05, - "loss": 1.168, + "epoch": 0.11716686674669868, + "grad_norm": 0.5381867289543152, + "learning_rate": 4.9351928371234525e-05, + "loss": 0.7774, "num_input_tokens_seen": 19988480, "step": 2440 }, { - "epoch": 0.28035244307128965, - "grad_norm": 1.0323024988174438, - "learning_rate": 4.579272901001491e-05, - "loss": 1.3607, + "epoch": 0.11764705882352941, + "grad_norm": 0.5331231355667114, + "learning_rate": 4.934293589614917e-05, + "loss": 0.8777, "num_input_tokens_seen": 20070400, "step": 2450 }, { - "epoch": 0.28149673875729486, - "grad_norm": 0.5809211134910583, - "learning_rate": 4.5736211786413524e-05, - "loss": 1.293, + "epoch": 0.11812725090036015, + "grad_norm": 0.5481956601142883, + "learning_rate": 4.93338822922149e-05, + "loss": 1.0353, "num_input_tokens_seen": 20152320, "step": 2460 }, { - "epoch": 0.28264103444330013, - "grad_norm": 0.7068034410476685, - "learning_rate": 4.5679352851878135e-05, - "loss": 1.3167, + "epoch": 0.11860744297719088, + "grad_norm": 0.5417446494102478, + "learning_rate": 4.932476758216669e-05, + "loss": 0.9866, "num_input_tokens_seen": 20234240, "step": 2470 }, { - "epoch": 0.2837853301293054, - "grad_norm": 0.5605278611183167, - "learning_rate": 4.562215314338411e-05, - "loss": 1.3806, + "epoch": 0.1190876350540216, + "grad_norm": 0.5857323408126831, + "learning_rate": 4.931559178889297e-05, + "loss": 0.9443, "num_input_tokens_seen": 20316160, "step": 2480 }, { - "epoch": 0.28492962581531067, - "grad_norm": 0.5452982187271118, - "learning_rate": 4.556461360352241e-05, - "loss": 1.0428, + "epoch": 0.11956782713085234, + "grad_norm": 0.7107313871383667, + "learning_rate": 4.9306354935435594e-05, + "loss": 1.0362, "num_input_tokens_seen": 20398080, "step": 2490 }, { - "epoch": 0.28607392150131594, - "grad_norm": 0.5816521048545837, - "learning_rate": 4.550673518048405e-05, - "loss": 1.0979, + "epoch": 0.12004801920768307, + "grad_norm": 0.5205745100975037, + "learning_rate": 4.929705704498969e-05, + "loss": 1.0075, "num_input_tokens_seen": 20480000, "step": 2500 }, { - "epoch": 0.2872182171873212, - "grad_norm": 0.6600253582000732, - "learning_rate": 4.5448518828044515e-05, - "loss": 1.2309, + "epoch": 0.12052821128451381, + "grad_norm": 0.5517631769180298, + "learning_rate": 4.928769814090371e-05, + "loss": 0.9074, "num_input_tokens_seen": 20561920, "step": 2510 }, { - "epoch": 0.2883625128733265, - "grad_norm": 0.5788952708244324, - "learning_rate": 4.538996550554798e-05, - "loss": 1.1428, + "epoch": 0.12100840336134454, + "grad_norm": 0.5010194778442383, + "learning_rate": 4.927827824667929e-05, + "loss": 1.0916, "num_input_tokens_seen": 20643840, "step": 2520 }, { - "epoch": 0.28950680855933175, - "grad_norm": 0.5803161263465881, - "learning_rate": 4.5331076177891527e-05, - "loss": 1.5423, + "epoch": 0.12148859543817526, + "grad_norm": 0.5098908543586731, + "learning_rate": 4.926879738597122e-05, + "loss": 1.0548, "num_input_tokens_seen": 20725760, "step": 2530 }, { - "epoch": 0.290651104245337, - "grad_norm": 0.5454373955726624, - "learning_rate": 4.527185181550928e-05, - "loss": 1.31, + "epoch": 0.121968787515006, + "grad_norm": 0.8620312213897705, + "learning_rate": 4.925925558258741e-05, + "loss": 1.1967, "num_input_tokens_seen": 20807680, "step": 2540 }, { - "epoch": 0.29179539993134224, - "grad_norm": 1.4661908149719238, - "learning_rate": 4.5212293394356356e-05, - "loss": 1.0873, + "epoch": 0.12244897959183673, + "grad_norm": 0.5866915583610535, + "learning_rate": 4.924965286048879e-05, + "loss": 0.9201, "num_input_tokens_seen": 20889600, "step": 2550 }, { - "epoch": 0.2929396956173475, - "grad_norm": 0.6312928199768066, - "learning_rate": 4.515240189589282e-05, - "loss": 1.1553, + "epoch": 0.12292917166866747, + "grad_norm": 1.4908480644226074, + "learning_rate": 4.9239989243789275e-05, + "loss": 1.0407, "num_input_tokens_seen": 20971520, "step": 2560 }, { - "epoch": 0.2940839913033528, - "grad_norm": 0.5782475471496582, - "learning_rate": 4.509217830706749e-05, - "loss": 1.267, + "epoch": 0.1234093637454982, + "grad_norm": 0.483049213886261, + "learning_rate": 4.9230264756755685e-05, + "loss": 1.0231, "num_input_tokens_seen": 21053440, "step": 2570 }, { - "epoch": 0.29522828698935805, - "grad_norm": 0.5917658805847168, - "learning_rate": 4.50316236203017e-05, - "loss": 1.074, + "epoch": 0.12388955582232893, + "grad_norm": 2.7187938690185547, + "learning_rate": 4.9220479423807694e-05, + "loss": 0.9814, "num_input_tokens_seen": 21135360, "step": 2580 }, { - "epoch": 0.2963725826753633, - "grad_norm": 0.5403671264648438, - "learning_rate": 4.497073883347293e-05, - "loss": 1.238, + "epoch": 0.12436974789915967, + "grad_norm": 0.5453813672065735, + "learning_rate": 4.9210633269517776e-05, + "loss": 1.0074, "num_input_tokens_seen": 21217280, "step": 2590 }, { - "epoch": 0.2975168783613686, - "grad_norm": 0.5732063055038452, - "learning_rate": 4.490952494989834e-05, - "loss": 1.2223, + "epoch": 0.12484993997599039, + "grad_norm": 0.5155650973320007, + "learning_rate": 4.920072631861115e-05, + "loss": 0.9252, "num_input_tokens_seen": 21299200, "step": 2600 }, { - "epoch": 0.29866117404737386, - "grad_norm": 0.5230866074562073, - "learning_rate": 4.484798297831826e-05, - "loss": 1.2797, + "epoch": 0.12533013205282112, + "grad_norm": 0.510693371295929, + "learning_rate": 4.919075859596567e-05, + "loss": 1.1132, "num_input_tokens_seen": 21381120, "step": 2610 }, { - "epoch": 0.2998054697333791, - "grad_norm": 0.5615763664245605, - "learning_rate": 4.4786113932879605e-05, - "loss": 1.1556, + "epoch": 0.12581032412965187, + "grad_norm": 0.5092229843139648, + "learning_rate": 4.918073012661183e-05, + "loss": 1.1313, "num_input_tokens_seen": 21463040, "step": 2620 }, { - "epoch": 0.3009497654193844, - "grad_norm": 0.8755237460136414, - "learning_rate": 4.472391883311906e-05, - "loss": 1.156, + "epoch": 0.1262905162064826, + "grad_norm": 0.5227720737457275, + "learning_rate": 4.9170640935732654e-05, + "loss": 1.0631, "num_input_tokens_seen": 21544960, "step": 2630 }, { - "epoch": 0.3020940611053896, - "grad_norm": 0.5666770935058594, - "learning_rate": 4.4661398703946396e-05, - "loss": 1.1363, + "epoch": 0.12677070828331333, + "grad_norm": 0.5651267766952515, + "learning_rate": 4.916049104866365e-05, + "loss": 0.8485, "num_input_tokens_seen": 21626880, "step": 2640 }, { - "epoch": 0.3032383567913949, - "grad_norm": 0.5907579064369202, - "learning_rate": 4.4598554575627495e-05, - "loss": 1.4523, + "epoch": 0.12725090036014405, + "grad_norm": 0.8087652921676636, + "learning_rate": 4.915028049089274e-05, + "loss": 0.9922, "num_input_tokens_seen": 21708800, "step": 2650 }, { - "epoch": 0.30438265247740015, - "grad_norm": 0.5281969904899597, - "learning_rate": 4.453538748376742e-05, - "loss": 0.9537, + "epoch": 0.12773109243697478, + "grad_norm": 0.5781594514846802, + "learning_rate": 4.914000928806021e-05, + "loss": 0.9331, "num_input_tokens_seen": 21790720, "step": 2660 }, { - "epoch": 0.3055269481634054, - "grad_norm": 0.6245942115783691, - "learning_rate": 4.4471898469293324e-05, - "loss": 1.1622, + "epoch": 0.12821128451380553, + "grad_norm": 0.5171418786048889, + "learning_rate": 4.912967746595861e-05, + "loss": 1.0793, "num_input_tokens_seen": 21872640, "step": 2670 }, { - "epoch": 0.3066712438494107, - "grad_norm": 0.5124683976173401, - "learning_rate": 4.44080885784373e-05, - "loss": 1.3041, + "epoch": 0.12869147659063626, + "grad_norm": 0.5301877856254578, + "learning_rate": 4.911928505053275e-05, + "loss": 1.1306, "num_input_tokens_seen": 21954560, "step": 2680 }, { - "epoch": 0.30781553953541596, - "grad_norm": 0.536454439163208, - "learning_rate": 4.434395886271917e-05, - "loss": 1.3722, + "epoch": 0.129171668667467, + "grad_norm": 0.5109881162643433, + "learning_rate": 4.9108832067879574e-05, + "loss": 0.9962, "num_input_tokens_seen": 22036480, "step": 2690 }, { - "epoch": 0.30895983522142123, - "grad_norm": 0.5136358737945557, - "learning_rate": 4.427951037892911e-05, - "loss": 1.4421, + "epoch": 0.12965186074429771, + "grad_norm": 0.5722172856330872, + "learning_rate": 4.909831854424812e-05, + "loss": 0.8064, "num_input_tokens_seen": 22118400, "step": 2700 }, { - "epoch": 0.3101041309074265, - "grad_norm": 0.4972691237926483, - "learning_rate": 4.4214744189110266e-05, - "loss": 1.5259, + "epoch": 0.13013205282112844, + "grad_norm": 0.5787602663040161, + "learning_rate": 4.908774450603946e-05, + "loss": 0.9561, "num_input_tokens_seen": 22200320, "step": 2710 }, { - "epoch": 0.3112484265934317, - "grad_norm": 0.5017683506011963, - "learning_rate": 4.414966136054125e-05, - "loss": 1.1897, + "epoch": 0.1306122448979592, + "grad_norm": 0.5278475284576416, + "learning_rate": 4.907710997980664e-05, + "loss": 1.0554, "num_input_tokens_seen": 22282240, "step": 2720 }, { - "epoch": 0.312392722279437, - "grad_norm": 0.468337744474411, - "learning_rate": 4.408426296571852e-05, - "loss": 1.0431, + "epoch": 0.13109243697478992, + "grad_norm": 0.5114937424659729, + "learning_rate": 4.906641499225457e-05, + "loss": 0.9203, "num_input_tokens_seen": 22364160, "step": 2730 }, { - "epoch": 0.31353701796544226, - "grad_norm": 0.6088995337486267, - "learning_rate": 4.401855008233879e-05, - "loss": 1.3437, + "epoch": 0.13157262905162065, + "grad_norm": 0.5476466417312622, + "learning_rate": 4.905565957024003e-05, + "loss": 0.9199, "num_input_tokens_seen": 22446080, "step": 2740 }, { - "epoch": 0.3146813136514475, - "grad_norm": 0.6361683011054993, - "learning_rate": 4.395252379328115e-05, - "loss": 1.4596, + "epoch": 0.13205282112845138, + "grad_norm": 0.5202212929725647, + "learning_rate": 4.9044843740771505e-05, + "loss": 0.9683, "num_input_tokens_seen": 22528000, "step": 2750 }, { - "epoch": 0.3158256093374528, - "grad_norm": 0.4672735631465912, - "learning_rate": 4.388618518658932e-05, - "loss": 1.1596, + "epoch": 0.1325330132052821, + "grad_norm": 0.5196512341499329, + "learning_rate": 4.9033967531009225e-05, + "loss": 1.1265, "num_input_tokens_seen": 22609920, "step": 2760 }, { - "epoch": 0.31696990502345807, - "grad_norm": 1.0056240558624268, - "learning_rate": 4.381953535545369e-05, - "loss": 1.2776, + "epoch": 0.13301320528211286, + "grad_norm": 0.5623713731765747, + "learning_rate": 4.902303096826502e-05, + "loss": 1.1676, "num_input_tokens_seen": 22691840, "step": 2770 }, { - "epoch": 0.31811420070946334, - "grad_norm": 0.8526020646095276, - "learning_rate": 4.375257539819328e-05, - "loss": 1.1147, + "epoch": 0.13349339735894358, + "grad_norm": 0.6298018097877502, + "learning_rate": 4.901203408000227e-05, + "loss": 0.8947, "num_input_tokens_seen": 22773760, "step": 2780 }, { - "epoch": 0.3192584963954686, - "grad_norm": 0.527472972869873, - "learning_rate": 4.368530641823769e-05, - "loss": 1.1802, + "epoch": 0.1339735894357743, + "grad_norm": 0.6308296322822571, + "learning_rate": 4.9000976893835856e-05, + "loss": 1.1018, "num_input_tokens_seen": 22855680, "step": 2790 }, { - "epoch": 0.3204027920814739, - "grad_norm": 0.5480664372444153, - "learning_rate": 4.361772952410886e-05, - "loss": 1.0881, + "epoch": 0.13445378151260504, + "grad_norm": 1.6783674955368042, + "learning_rate": 4.898985943753207e-05, + "loss": 0.9954, "num_input_tokens_seen": 22937600, "step": 2800 }, { - "epoch": 0.3215470877674791, - "grad_norm": 0.6834565997123718, - "learning_rate": 4.354984582940285e-05, - "loss": 1.1945, + "epoch": 0.13493397358943576, + "grad_norm": 0.5339183211326599, + "learning_rate": 4.897868173900854e-05, + "loss": 0.7595, "num_input_tokens_seen": 23019520, "step": 2810 }, { - "epoch": 0.32269138345348436, - "grad_norm": 0.5412240028381348, - "learning_rate": 4.348165645277145e-05, - "loss": 1.3629, + "epoch": 0.13541416566626652, + "grad_norm": 0.5435370802879333, + "learning_rate": 4.89674438263342e-05, + "loss": 1.0, "num_input_tokens_seen": 23101440, "step": 2820 }, { - "epoch": 0.32383567913948963, - "grad_norm": 0.5615849494934082, - "learning_rate": 4.34131625179038e-05, - "loss": 1.3362, + "epoch": 0.13589435774309724, + "grad_norm": 0.5289290547370911, + "learning_rate": 4.8956145727729156e-05, + "loss": 0.9093, "num_input_tokens_seen": 23183360, "step": 2830 }, { - "epoch": 0.3249799748254949, - "grad_norm": 0.5076019763946533, - "learning_rate": 4.334436515350779e-05, - "loss": 1.2316, + "epoch": 0.13637454981992797, + "grad_norm": 0.644011914730072, + "learning_rate": 4.8944787471564686e-05, + "loss": 0.8391, "num_input_tokens_seen": 23265280, "step": 2840 }, { - "epoch": 0.32612427051150017, - "grad_norm": 0.568336546421051, - "learning_rate": 4.327526549329157e-05, - "loss": 1.7573, + "epoch": 0.1368547418967587, + "grad_norm": 0.5689689517021179, + "learning_rate": 4.89333690863631e-05, + "loss": 0.8903, "num_input_tokens_seen": 23347200, "step": 2850 }, { - "epoch": 0.32726856619750544, - "grad_norm": 0.6184702515602112, - "learning_rate": 4.320586467594476e-05, - "loss": 1.4037, + "epoch": 0.13733493397358942, + "grad_norm": 0.5509852766990662, + "learning_rate": 4.892189060079773e-05, + "loss": 0.9087, "num_input_tokens_seen": 23429120, "step": 2860 }, { - "epoch": 0.3284128618835107, - "grad_norm": 0.8441096544265747, - "learning_rate": 4.313616384511976e-05, - "loss": 1.4623, + "epoch": 0.13781512605042018, + "grad_norm": 0.6888849139213562, + "learning_rate": 4.8910352043692806e-05, + "loss": 0.8255, "num_input_tokens_seen": 23511040, "step": 2870 }, { - "epoch": 0.329557157569516, - "grad_norm": 0.41614028811454773, - "learning_rate": 4.3066164149412844e-05, - "loss": 1.1027, + "epoch": 0.1382953181272509, + "grad_norm": 0.5926205515861511, + "learning_rate": 4.889875344402342e-05, + "loss": 0.9486, "num_input_tokens_seen": 23592960, "step": 2880 }, { - "epoch": 0.33070145325552125, - "grad_norm": 0.6045880317687988, - "learning_rate": 4.299586674234529e-05, - "loss": 1.2341, + "epoch": 0.13877551020408163, + "grad_norm": 0.5350954532623291, + "learning_rate": 4.8887094830915427e-05, + "loss": 1.0803, "num_input_tokens_seen": 23674880, "step": 2890 }, { - "epoch": 0.33184574894152646, - "grad_norm": 0.5018451809883118, - "learning_rate": 4.292527278234435e-05, - "loss": 1.2661, + "epoch": 0.13925570228091236, + "grad_norm": 0.5506545305252075, + "learning_rate": 4.8875376233645396e-05, + "loss": 0.9572, "num_input_tokens_seen": 23756800, "step": 2900 }, { - "epoch": 0.33299004462753173, - "grad_norm": 0.6029537320137024, - "learning_rate": 4.285438343272414e-05, - "loss": 1.375, + "epoch": 0.13973589435774308, + "grad_norm": 0.552919328212738, + "learning_rate": 4.886359768164054e-05, + "loss": 0.9725, "num_input_tokens_seen": 23838720, "step": 2910 }, { - "epoch": 0.334134340313537, - "grad_norm": 0.7714540958404541, - "learning_rate": 4.278319986166649e-05, - "loss": 1.2119, + "epoch": 0.14021608643457384, + "grad_norm": 0.5363306403160095, + "learning_rate": 4.88517592044786e-05, + "loss": 0.9, "num_input_tokens_seen": 23920640, "step": 2920 }, { - "epoch": 0.3352786359995423, - "grad_norm": 0.5025473237037659, - "learning_rate": 4.2711723242201695e-05, - "loss": 1.322, + "epoch": 0.14069627851140457, + "grad_norm": 0.5855774879455566, + "learning_rate": 4.8839860831887805e-05, + "loss": 1.0477, "num_input_tokens_seen": 24002560, "step": 2930 }, { - "epoch": 0.33642293168554754, - "grad_norm": 0.5495464205741882, - "learning_rate": 4.263995475218917e-05, - "loss": 1.4806, + "epoch": 0.1411764705882353, + "grad_norm": 0.5440022349357605, + "learning_rate": 4.882790259374681e-05, + "loss": 0.8685, "num_input_tokens_seen": 24084480, "step": 2940 }, { - "epoch": 0.3375672273715528, - "grad_norm": 0.5496026277542114, - "learning_rate": 4.256789557429806e-05, - "loss": 1.6319, + "epoch": 0.14165666266506602, + "grad_norm": 0.5618158578872681, + "learning_rate": 4.881588452008456e-05, + "loss": 0.9956, "num_input_tokens_seen": 24166400, "step": 2950 }, { - "epoch": 0.3387115230575581, - "grad_norm": 2.653693199157715, - "learning_rate": 4.2495546895987724e-05, - "loss": 1.2252, + "epoch": 0.14213685474189675, + "grad_norm": 0.5429749488830566, + "learning_rate": 4.880380664108032e-05, + "loss": 1.0358, "num_input_tokens_seen": 24248320, "step": 2960 }, { - "epoch": 0.33985581874356335, - "grad_norm": 0.7369899153709412, - "learning_rate": 4.242290990948821e-05, - "loss": 1.1941, + "epoch": 0.1426170468187275, + "grad_norm": 0.5229565501213074, + "learning_rate": 4.879166898706347e-05, + "loss": 0.8954, "num_input_tokens_seen": 24330240, "step": 2970 }, { - "epoch": 0.3410001144295686, - "grad_norm": 0.6210314631462097, - "learning_rate": 4.234998581178056e-05, - "loss": 1.2853, + "epoch": 0.14309723889555823, + "grad_norm": 0.5274274349212646, + "learning_rate": 4.877947158851352e-05, + "loss": 0.9882, "num_input_tokens_seen": 24412160, "step": 2980 }, { - "epoch": 0.34214441011557384, - "grad_norm": 0.4932264983654022, - "learning_rate": 4.227677580457711e-05, - "loss": 1.157, + "epoch": 0.14357743097238895, + "grad_norm": 0.5633429884910583, + "learning_rate": 4.876721447606002e-05, + "loss": 0.8396, "num_input_tokens_seen": 24494080, "step": 2990 }, { - "epoch": 0.3432887058015791, - "grad_norm": 0.7470729947090149, - "learning_rate": 4.220328109430167e-05, - "loss": 1.1064, + "epoch": 0.14405762304921968, + "grad_norm": 0.5823813676834106, + "learning_rate": 4.875489768048247e-05, + "loss": 1.0854, "num_input_tokens_seen": 24576000, "step": 3000 }, { - "epoch": 0.3444330014875844, - "grad_norm": 0.5054774880409241, - "learning_rate": 4.21295028920697e-05, - "loss": 1.3768, + "epoch": 0.14453781512605043, + "grad_norm": 0.5569640398025513, + "learning_rate": 4.8742521232710234e-05, + "loss": 0.9147, "num_input_tokens_seen": 24657920, "step": 3010 }, { - "epoch": 0.34557729717358965, - "grad_norm": 0.5471848845481873, - "learning_rate": 4.2055442413668264e-05, - "loss": 1.1968, + "epoch": 0.14501800720288116, + "grad_norm": 0.5742693543434143, + "learning_rate": 4.873008516382245e-05, + "loss": 0.956, "num_input_tokens_seen": 24739840, "step": 3020 }, { - "epoch": 0.3467215928595949, - "grad_norm": 0.48451676964759827, - "learning_rate": 4.198110087953606e-05, - "loss": 1.3736, + "epoch": 0.1454981992797119, + "grad_norm": 0.5225424766540527, + "learning_rate": 4.871758950504801e-05, + "loss": 0.9568, "num_input_tokens_seen": 24821760, "step": 3030 }, { - "epoch": 0.3478658885456002, - "grad_norm": 0.4831438958644867, - "learning_rate": 4.190647951474328e-05, - "loss": 1.2485, + "epoch": 0.14597839135654261, + "grad_norm": 0.42165303230285645, + "learning_rate": 4.870503428776544e-05, + "loss": 0.95, "num_input_tokens_seen": 24903680, "step": 3040 }, { - "epoch": 0.34901018423160546, - "grad_norm": 0.5406004190444946, - "learning_rate": 4.183157954897144e-05, - "loss": 1.1501, + "epoch": 0.14645858343337334, + "grad_norm": 0.5314249992370605, + "learning_rate": 4.869241954350281e-05, + "loss": 1.0183, "num_input_tokens_seen": 24985600, "step": 3050 }, { - "epoch": 0.35015447991761073, - "grad_norm": 0.5474951863288879, - "learning_rate": 4.1756402216493115e-05, - "loss": 1.1709, + "epoch": 0.1469387755102041, + "grad_norm": 0.5332600474357605, + "learning_rate": 4.867974530393767e-05, + "loss": 0.9581, "num_input_tokens_seen": 25067520, "step": 3060 }, { - "epoch": 0.351298775603616, - "grad_norm": 0.6155632138252258, - "learning_rate": 4.1680948756151564e-05, - "loss": 1.2692, + "epoch": 0.14741896758703482, + "grad_norm": 0.5184004306793213, + "learning_rate": 4.8667011600896994e-05, + "loss": 0.8628, "num_input_tokens_seen": 25149440, "step": 3070 }, { - "epoch": 0.3524430712896212, - "grad_norm": 0.47120699286460876, - "learning_rate": 4.160522041134035e-05, - "loss": 1.2124, + "epoch": 0.14789915966386555, + "grad_norm": 0.524926483631134, + "learning_rate": 4.8654218466357064e-05, + "loss": 0.8973, "num_input_tokens_seen": 25231360, "step": 3080 }, { - "epoch": 0.3535873669756265, - "grad_norm": 0.5330422520637512, - "learning_rate": 4.152921842998287e-05, - "loss": 1.1728, + "epoch": 0.14837935174069627, + "grad_norm": 0.4728389382362366, + "learning_rate": 4.86413659324434e-05, + "loss": 0.9561, "num_input_tokens_seen": 25313280, "step": 3090 }, { - "epoch": 0.35473166266163175, - "grad_norm": 0.48196467757225037, - "learning_rate": 4.145294406451173e-05, - "loss": 1.309, + "epoch": 0.148859543817527, + "grad_norm": 0.4952199459075928, + "learning_rate": 4.8628454031430694e-05, + "loss": 0.8549, "num_input_tokens_seen": 25395200, "step": 3100 }, { - "epoch": 0.355875958347637, - "grad_norm": 0.5000414252281189, - "learning_rate": 4.137639857184815e-05, - "loss": 1.3163, + "epoch": 0.14933973589435776, + "grad_norm": 0.6020041704177856, + "learning_rate": 4.8615482795742696e-05, + "loss": 1.1022, "num_input_tokens_seen": 25477120, "step": 3110 }, { - "epoch": 0.3570202540336423, - "grad_norm": 0.5418947339057922, - "learning_rate": 4.129958321338127e-05, - "loss": 1.1725, + "epoch": 0.14981992797118848, + "grad_norm": 1.6901748180389404, + "learning_rate": 4.860245225795219e-05, + "loss": 1.108, "num_input_tokens_seen": 25559040, "step": 3120 }, { - "epoch": 0.35816454971964756, - "grad_norm": 0.5317056775093079, - "learning_rate": 4.122249925494726e-05, - "loss": 1.1192, + "epoch": 0.1503001200480192, + "grad_norm": 0.5366313457489014, + "learning_rate": 4.858936245078084e-05, + "loss": 1.1467, "num_input_tokens_seen": 25640960, "step": 3130 }, { - "epoch": 0.35930884540565283, - "grad_norm": 0.6244747638702393, - "learning_rate": 4.114514796680862e-05, - "loss": 1.2959, + "epoch": 0.15078031212484994, + "grad_norm": 0.5097447633743286, + "learning_rate": 4.857621340709917e-05, + "loss": 1.0484, "num_input_tokens_seen": 25722880, "step": 3140 }, { - "epoch": 0.3604531410916581, - "grad_norm": 0.5483636856079102, - "learning_rate": 4.106753062363311e-05, - "loss": 1.2905, + "epoch": 0.15126050420168066, + "grad_norm": 0.5351747870445251, + "learning_rate": 4.856300515992646e-05, + "loss": 0.8896, "num_input_tokens_seen": 25804800, "step": 3150 }, { - "epoch": 0.3615974367776634, - "grad_norm": 0.5433777570724487, - "learning_rate": 4.098964850447281e-05, - "loss": 1.315, + "epoch": 0.15174069627851142, + "grad_norm": 0.5735185742378235, + "learning_rate": 4.854973774243062e-05, + "loss": 0.9816, "num_input_tokens_seen": 25886720, "step": 3160 }, { - "epoch": 0.3627417324636686, - "grad_norm": 0.90435391664505, - "learning_rate": 4.0911502892743035e-05, - "loss": 1.28, + "epoch": 0.15222088835534214, + "grad_norm": 0.5087327361106873, + "learning_rate": 4.8536411187928186e-05, + "loss": 0.974, "num_input_tokens_seen": 25968640, "step": 3170 }, { - "epoch": 0.36388602814967386, - "grad_norm": 0.5195088386535645, - "learning_rate": 4.083309507620118e-05, - "loss": 1.2515, + "epoch": 0.15270108043217287, + "grad_norm": 0.5174838900566101, + "learning_rate": 4.852302552988418e-05, + "loss": 0.9798, "num_input_tokens_seen": 26050560, "step": 3180 }, { - "epoch": 0.3650303238356791, - "grad_norm": 0.5890297293663025, - "learning_rate": 4.075442634692548e-05, - "loss": 1.1768, + "epoch": 0.1531812725090036, + "grad_norm": 0.5763252973556519, + "learning_rate": 4.850958080191205e-05, + "loss": 1.0298, "num_input_tokens_seen": 26132480, "step": 3190 }, { - "epoch": 0.3661746195216844, - "grad_norm": 0.6505255103111267, - "learning_rate": 4.067549800129375e-05, - "loss": 1.2891, + "epoch": 0.15366146458583432, + "grad_norm": 0.565152108669281, + "learning_rate": 4.849607703777356e-05, + "loss": 0.9407, "num_input_tokens_seen": 26214400, "step": 3200 }, { - "epoch": 0.36731891520768967, - "grad_norm": 0.6695213317871094, - "learning_rate": 4.059631133996203e-05, - "loss": 1.3888, + "epoch": 0.15414165666266508, + "grad_norm": 0.5854922533035278, + "learning_rate": 4.8482514271378745e-05, + "loss": 0.7755, "num_input_tokens_seen": 26296320, "step": 3210 }, { - "epoch": 0.36846321089369494, - "grad_norm": 0.4889095723628998, - "learning_rate": 4.05168676678431e-05, - "loss": 0.9946, + "epoch": 0.1546218487394958, + "grad_norm": 0.5151272416114807, + "learning_rate": 4.846889253678578e-05, + "loss": 0.8748, "num_input_tokens_seen": 26378240, "step": 3220 }, { - "epoch": 0.3696075065797002, - "grad_norm": 2.1291022300720215, - "learning_rate": 4.0437168294085013e-05, - "loss": 1.4232, + "epoch": 0.15510204081632653, + "grad_norm": 0.5427525043487549, + "learning_rate": 4.845521186820096e-05, + "loss": 0.9308, "num_input_tokens_seen": 26460160, "step": 3230 }, { - "epoch": 0.3707518022657055, - "grad_norm": 0.4844609797000885, - "learning_rate": 4.0357214532049535e-05, - "loss": 1.1119, + "epoch": 0.15558223289315726, + "grad_norm": 0.5123739242553711, + "learning_rate": 4.8441472299978504e-05, + "loss": 0.9556, "num_input_tokens_seen": 26542080, "step": 3240 }, { - "epoch": 0.37189609795171075, - "grad_norm": 0.509472668170929, - "learning_rate": 4.027700769929046e-05, - "loss": 1.1302, + "epoch": 0.15606242496998798, + "grad_norm": 0.6060318946838379, + "learning_rate": 4.8427673866620615e-05, + "loss": 0.9465, "num_input_tokens_seen": 26624000, "step": 3250 }, { - "epoch": 0.37304039363771596, - "grad_norm": 0.5184939503669739, - "learning_rate": 4.019654911753193e-05, - "loss": 1.2648, + "epoch": 0.15654261704681874, + "grad_norm": 0.5624286532402039, + "learning_rate": 4.841381660277725e-05, + "loss": 0.8611, "num_input_tokens_seen": 26705920, "step": 3260 }, { - "epoch": 0.37418468932372123, - "grad_norm": 0.5771918892860413, - "learning_rate": 4.011584011264665e-05, - "loss": 1.204, + "epoch": 0.15702280912364946, + "grad_norm": 0.5059430003166199, + "learning_rate": 4.839990054324614e-05, + "loss": 0.9294, "num_input_tokens_seen": 26787840, "step": 3270 }, { - "epoch": 0.3753289850097265, - "grad_norm": 0.5364697575569153, - "learning_rate": 4.0034882014634015e-05, - "loss": 1.3242, + "epoch": 0.1575030012004802, + "grad_norm": 0.5408222675323486, + "learning_rate": 4.838592572297265e-05, + "loss": 1.0522, "num_input_tokens_seen": 26869760, "step": 3280 }, { - "epoch": 0.37647328069573177, - "grad_norm": 0.5383545160293579, - "learning_rate": 3.995367615759825e-05, - "loss": 1.235, + "epoch": 0.15798319327731092, + "grad_norm": 0.5456782579421997, + "learning_rate": 4.837189217704968e-05, + "loss": 0.9068, "num_input_tokens_seen": 26951680, "step": 3290 }, { - "epoch": 0.37761757638173704, - "grad_norm": 0.5549062490463257, - "learning_rate": 3.9872223879726356e-05, - "loss": 1.1119, + "epoch": 0.15846338535414164, + "grad_norm": 0.5521805882453918, + "learning_rate": 4.835779994071764e-05, + "loss": 0.8502, "num_input_tokens_seen": 27033600, "step": 3300 }, { - "epoch": 0.3787618720677423, - "grad_norm": 0.8855923414230347, - "learning_rate": 3.979052652326609e-05, - "loss": 1.2295, + "epoch": 0.1589435774309724, + "grad_norm": 0.525742769241333, + "learning_rate": 4.8343649049364284e-05, + "loss": 0.9537, "num_input_tokens_seen": 27115520, "step": 3310 }, { - "epoch": 0.3799061677537476, - "grad_norm": 0.5798035264015198, - "learning_rate": 3.970858543450387e-05, - "loss": 1.4858, + "epoch": 0.15942376950780313, + "grad_norm": 0.7998719215393066, + "learning_rate": 4.832943953852468e-05, + "loss": 0.9564, "num_input_tokens_seen": 27197440, "step": 3320 }, { - "epoch": 0.38105046343975285, - "grad_norm": 0.9078068137168884, - "learning_rate": 3.962640196374254e-05, - "loss": 1.1919, + "epoch": 0.15990396158463385, + "grad_norm": 0.41986241936683655, + "learning_rate": 4.831517144388109e-05, + "loss": 0.8895, "num_input_tokens_seen": 27279360, "step": 3330 }, { - "epoch": 0.3821947591257581, - "grad_norm": 0.5055866241455078, - "learning_rate": 3.954397746527916e-05, - "loss": 1.1531, + "epoch": 0.16038415366146458, + "grad_norm": 0.5009111166000366, + "learning_rate": 4.830084480126288e-05, + "loss": 0.765, "num_input_tokens_seen": 27361280, "step": 3340 }, { - "epoch": 0.38333905481176334, - "grad_norm": 0.5842207670211792, - "learning_rate": 3.9461313297382666e-05, - "loss": 1.1539, + "epoch": 0.1608643457382953, + "grad_norm": 0.9933325052261353, + "learning_rate": 4.828645964664647e-05, + "loss": 0.9538, "num_input_tokens_seen": 27443200, "step": 3350 }, { - "epoch": 0.3844833504977686, - "grad_norm": 0.5523999929428101, - "learning_rate": 3.93784108222715e-05, - "loss": 1.6298, + "epoch": 0.16134453781512606, + "grad_norm": 0.6056552529335022, + "learning_rate": 4.8272016016155166e-05, + "loss": 1.0036, "num_input_tokens_seen": 27525120, "step": 3360 }, { - "epoch": 0.3856276461837739, - "grad_norm": 0.5067601799964905, - "learning_rate": 3.929527140609115e-05, - "loss": 1.2188, + "epoch": 0.1618247298919568, + "grad_norm": 0.6024153828620911, + "learning_rate": 4.825751394605916e-05, + "loss": 1.1447, "num_input_tokens_seen": 27607040, "step": 3370 }, { - "epoch": 0.38677194186977915, - "grad_norm": 0.5155417919158936, - "learning_rate": 3.921189641889163e-05, - "loss": 1.2925, + "epoch": 0.1623049219687875, + "grad_norm": 0.5308077335357666, + "learning_rate": 4.824295347277537e-05, + "loss": 0.8392, "num_input_tokens_seen": 27688960, "step": 3380 }, { - "epoch": 0.3879162375557844, - "grad_norm": 0.5797560811042786, - "learning_rate": 3.912828723460495e-05, - "loss": 1.3113, + "epoch": 0.16278511404561824, + "grad_norm": 0.5333995223045349, + "learning_rate": 4.8228334632867375e-05, + "loss": 1.1125, "num_input_tokens_seen": 27770880, "step": 3390 }, { - "epoch": 0.3890605332417897, - "grad_norm": 0.5342340469360352, - "learning_rate": 3.904444523102242e-05, - "loss": 1.2155, + "epoch": 0.16326530612244897, + "grad_norm": 0.5561527013778687, + "learning_rate": 4.8213657463045344e-05, + "loss": 1.059, "num_input_tokens_seen": 27852800, "step": 3400 }, { - "epoch": 0.39020482892779496, - "grad_norm": 0.867396354675293, - "learning_rate": 3.896037178977196e-05, - "loss": 1.4058, + "epoch": 0.16374549819927972, + "grad_norm": 0.6268987655639648, + "learning_rate": 4.819892200016588e-05, + "loss": 1.0134, "num_input_tokens_seen": 27934720, "step": 3410 }, { - "epoch": 0.3913491246138002, - "grad_norm": 0.904750645160675, - "learning_rate": 3.887606829629536e-05, - "loss": 1.2582, + "epoch": 0.16422569027611045, + "grad_norm": 0.5519012808799744, + "learning_rate": 4.818412828123201e-05, + "loss": 0.8942, "num_input_tokens_seen": 28016640, "step": 3420 }, { - "epoch": 0.3924934202998055, - "grad_norm": 0.49150246381759644, - "learning_rate": 3.87915361398254e-05, - "loss": 1.2471, + "epoch": 0.16470588235294117, + "grad_norm": 0.5266446471214294, + "learning_rate": 4.816927634339302e-05, + "loss": 0.8664, "num_input_tokens_seen": 28098560, "step": 3430 }, { - "epoch": 0.3936377159858107, - "grad_norm": 1.6605616807937622, - "learning_rate": 3.8706776713363025e-05, - "loss": 1.2815, + "epoch": 0.1651860744297719, + "grad_norm": 0.5828010439872742, + "learning_rate": 4.815436622394441e-05, + "loss": 0.7622, "num_input_tokens_seen": 28180480, "step": 3440 }, { - "epoch": 0.394782011671816, - "grad_norm": 0.5466508865356445, - "learning_rate": 3.862179141365431e-05, - "loss": 1.4604, + "epoch": 0.16566626650660263, + "grad_norm": 0.7593029141426086, + "learning_rate": 4.813939796032779e-05, + "loss": 1.0249, "num_input_tokens_seen": 28262400, "step": 3450 }, { - "epoch": 0.39592630735782125, - "grad_norm": 0.8647962808609009, - "learning_rate": 3.8536581641167506e-05, - "loss": 1.2577, + "epoch": 0.16614645858343338, + "grad_norm": 0.5545393228530884, + "learning_rate": 4.812437159013076e-05, + "loss": 0.9852, "num_input_tokens_seen": 28344320, "step": 3460 }, { - "epoch": 0.3970706030438265, - "grad_norm": 2.0921196937561035, - "learning_rate": 3.845114880006994e-05, - "loss": 1.2211, + "epoch": 0.1666266506602641, + "grad_norm": 0.5645286440849304, + "learning_rate": 4.810928715108683e-05, + "loss": 0.8945, "num_input_tokens_seen": 28426240, "step": 3470 }, { - "epoch": 0.3982148987298318, - "grad_norm": 1.0453864336013794, - "learning_rate": 3.836549429820485e-05, - "loss": 1.2543, + "epoch": 0.16710684273709484, + "grad_norm": 0.6866888999938965, + "learning_rate": 4.809414468107536e-05, + "loss": 0.8606, "num_input_tokens_seen": 28508160, "step": 3480 }, { - "epoch": 0.39935919441583706, - "grad_norm": 1.259315013885498, - "learning_rate": 3.827961954706825e-05, - "loss": 1.2716, + "epoch": 0.16758703481392556, + "grad_norm": 0.5912758111953735, + "learning_rate": 4.8078944218121404e-05, + "loss": 0.7668, "num_input_tokens_seen": 28590080, "step": 3490 }, { - "epoch": 0.40050349010184233, - "grad_norm": 0.49976375699043274, - "learning_rate": 3.8193525961785584e-05, - "loss": 0.9854, + "epoch": 0.16806722689075632, + "grad_norm": 0.5021241903305054, + "learning_rate": 4.806368580039566e-05, + "loss": 1.1072, "num_input_tokens_seen": 28672000, "step": 3500 }, { - "epoch": 0.4016477857878476, - "grad_norm": 0.4869581162929535, - "learning_rate": 3.81072149610885e-05, - "loss": 1.1416, + "epoch": 0.16854741896758704, + "grad_norm": 0.5017940402030945, + "learning_rate": 4.804836946621437e-05, + "loss": 0.7905, "num_input_tokens_seen": 28753920, "step": 3510 }, { - "epoch": 0.40279208147385287, - "grad_norm": 0.5591604113578796, - "learning_rate": 3.802068796729139e-05, - "loss": 1.3587, + "epoch": 0.16902761104441777, + "grad_norm": 0.5461972951889038, + "learning_rate": 4.803299525403919e-05, + "loss": 0.9979, "num_input_tokens_seen": 28835840, "step": 3520 }, { - "epoch": 0.4039363771598581, - "grad_norm": 0.5056213736534119, - "learning_rate": 3.7933946406268e-05, - "loss": 1.2926, + "epoch": 0.1695078031212485, + "grad_norm": 0.5111615061759949, + "learning_rate": 4.801756320247713e-05, + "loss": 1.1211, "num_input_tokens_seen": 28917760, "step": 3530 }, { - "epoch": 0.40508067284586335, - "grad_norm": 0.4954633414745331, - "learning_rate": 3.7846991707427905e-05, - "loss": 1.2945, + "epoch": 0.16998799519807922, + "grad_norm": 0.5483148097991943, + "learning_rate": 4.800207335028044e-05, + "loss": 0.9346, "num_input_tokens_seen": 28999680, "step": 3540 }, { - "epoch": 0.4062249685318686, - "grad_norm": 0.823973536491394, - "learning_rate": 3.775982530369298e-05, - "loss": 1.2348, + "epoch": 0.17046818727490998, + "grad_norm": 0.6430408358573914, + "learning_rate": 4.798652573634651e-05, + "loss": 0.8605, "num_input_tokens_seen": 29081600, "step": 3550 }, { - "epoch": 0.4073692642178739, - "grad_norm": 0.5769624710083008, - "learning_rate": 3.767244863147377e-05, - "loss": 1.2022, + "epoch": 0.1709483793517407, + "grad_norm": 0.41809263825416565, + "learning_rate": 4.797092039971779e-05, + "loss": 0.631, "num_input_tokens_seen": 29163520, "step": 3560 }, { - "epoch": 0.40851355990387916, - "grad_norm": 0.5166818499565125, - "learning_rate": 3.75848631306458e-05, - "loss": 1.5009, + "epoch": 0.17142857142857143, + "grad_norm": 0.5095425844192505, + "learning_rate": 4.7955257379581675e-05, + "loss": 0.8789, "num_input_tokens_seen": 29245440, "step": 3570 }, { - "epoch": 0.40965785558988443, - "grad_norm": 0.8839643001556396, - "learning_rate": 3.7497070244525925e-05, - "loss": 1.2186, + "epoch": 0.17190876350540216, + "grad_norm": 0.5259044766426086, + "learning_rate": 4.7939536715270415e-05, + "loss": 1.2783, "num_input_tokens_seen": 29327360, "step": 3580 }, { - "epoch": 0.4108021512758897, - "grad_norm": 0.5029737949371338, - "learning_rate": 3.7409071419848436e-05, - "loss": 1.3161, + "epoch": 0.17238895558223288, + "grad_norm": 0.5708450675010681, + "learning_rate": 4.792375844626101e-05, + "loss": 0.7012, "num_input_tokens_seen": 29409280, "step": 3590 }, { - "epoch": 0.411946446961895, - "grad_norm": 0.46834880113601685, - "learning_rate": 3.73208681067413e-05, - "loss": 1.0534, + "epoch": 0.17286914765906364, + "grad_norm": 0.5291116833686829, + "learning_rate": 4.790792261217512e-05, + "loss": 0.976, "num_input_tokens_seen": 29491200, "step": 3600 }, { - "epoch": 0.41309074264790024, - "grad_norm": 0.6240310668945312, - "learning_rate": 3.7232461758702244e-05, - "loss": 1.3398, + "epoch": 0.17334933973589436, + "grad_norm": 0.4987095296382904, + "learning_rate": 4.789202925277895e-05, + "loss": 0.9292, "num_input_tokens_seen": 29573120, "step": 3610 }, { - "epoch": 0.41423503833390546, - "grad_norm": 0.5298212766647339, - "learning_rate": 3.714385383257477e-05, - "loss": 1.1538, + "epoch": 0.1738295318127251, + "grad_norm": 0.6108640432357788, + "learning_rate": 4.787607840798317e-05, + "loss": 0.8974, "num_input_tokens_seen": 29655040, "step": 3620 }, { - "epoch": 0.41537933401991073, - "grad_norm": 0.46233418583869934, - "learning_rate": 3.7055045788524214e-05, - "loss": 1.0701, + "epoch": 0.17430972388955582, + "grad_norm": 0.42809537053108215, + "learning_rate": 4.786007011784279e-05, + "loss": 0.9049, "num_input_tokens_seen": 29736960, "step": 3630 }, { - "epoch": 0.416523629705916, - "grad_norm": 0.5719970464706421, - "learning_rate": 3.696603909001361e-05, - "loss": 1.4487, + "epoch": 0.17478991596638654, + "grad_norm": 0.4873834252357483, + "learning_rate": 4.78440044225571e-05, + "loss": 0.8813, "num_input_tokens_seen": 29818880, "step": 3640 }, { - "epoch": 0.41766792539192127, - "grad_norm": 0.5453386902809143, - "learning_rate": 3.6876835203779615e-05, - "loss": 1.3053, + "epoch": 0.1752701080432173, + "grad_norm": 0.6227244138717651, + "learning_rate": 4.7827881362469506e-05, + "loss": 0.7923, "num_input_tokens_seen": 29900800, "step": 3650 }, { - "epoch": 0.41881222107792654, - "grad_norm": 0.4228511452674866, - "learning_rate": 3.678743559980835e-05, - "loss": 1.4515, + "epoch": 0.17575030012004803, + "grad_norm": 0.4629347324371338, + "learning_rate": 4.781170097806751e-05, + "loss": 0.9563, "num_input_tokens_seen": 29982720, "step": 3660 }, { - "epoch": 0.4199565167639318, - "grad_norm": 0.5006650686264038, - "learning_rate": 3.669784175131115e-05, - "loss": 1.4384, + "epoch": 0.17623049219687875, + "grad_norm": 0.5306552648544312, + "learning_rate": 4.779546330998253e-05, + "loss": 0.9327, "num_input_tokens_seen": 30064640, "step": 3670 }, { - "epoch": 0.4211008124499371, - "grad_norm": 0.49229928851127625, - "learning_rate": 3.660805513470027e-05, - "loss": 1.4419, + "epoch": 0.17671068427370948, + "grad_norm": 0.5401423573493958, + "learning_rate": 4.7779168398989826e-05, + "loss": 1.2167, "num_input_tokens_seen": 30146560, "step": 3680 }, { - "epoch": 0.42224510813594235, - "grad_norm": 0.5263097286224365, - "learning_rate": 3.651807722956462e-05, - "loss": 1.2246, + "epoch": 0.1771908763505402, + "grad_norm": 0.5693386197090149, + "learning_rate": 4.7762816286008454e-05, + "loss": 0.9795, "num_input_tokens_seen": 30228480, "step": 3690 }, { - "epoch": 0.4233894038219476, - "grad_norm": 0.48281192779541016, - "learning_rate": 3.642790951864532e-05, - "loss": 1.0901, + "epoch": 0.17767106842737096, + "grad_norm": 0.5739344954490662, + "learning_rate": 4.774640701210106e-05, + "loss": 1.108, "num_input_tokens_seen": 30310400, "step": 3700 }, { - "epoch": 0.42453369950795283, - "grad_norm": 0.6877309083938599, - "learning_rate": 3.63375534878113e-05, - "loss": 1.2148, + "epoch": 0.1781512605042017, + "grad_norm": 0.5333011150360107, + "learning_rate": 4.7729940618473854e-05, + "loss": 1.0156, "num_input_tokens_seen": 30392320, "step": 3710 }, { - "epoch": 0.4256779951939581, - "grad_norm": 2.1429624557495117, - "learning_rate": 3.6247010626034795e-05, - "loss": 1.3764, + "epoch": 0.1786314525810324, + "grad_norm": 0.6173415780067444, + "learning_rate": 4.771341714647648e-05, + "loss": 0.9543, "num_input_tokens_seen": 30474240, "step": 3720 }, { - "epoch": 0.4268222908799634, - "grad_norm": 0.6136846542358398, - "learning_rate": 3.615628242536682e-05, - "loss": 1.216, + "epoch": 0.17911164465786314, + "grad_norm": 0.5696475505828857, + "learning_rate": 4.76968366376019e-05, + "loss": 0.875, "num_input_tokens_seen": 30556160, "step": 3730 }, { - "epoch": 0.42796658656596864, - "grad_norm": 0.5359023809432983, - "learning_rate": 3.6065370380912587e-05, - "loss": 1.2571, + "epoch": 0.17959183673469387, + "grad_norm": 0.5061577558517456, + "learning_rate": 4.768019913348634e-05, + "loss": 0.9798, "num_input_tokens_seen": 30638080, "step": 3740 }, { - "epoch": 0.4291108822519739, - "grad_norm": 1.0357657670974731, - "learning_rate": 3.5974275990806846e-05, - "loss": 1.1939, + "epoch": 0.18007202881152462, + "grad_norm": 0.454387903213501, + "learning_rate": 4.766350467590911e-05, + "loss": 0.8985, "num_input_tokens_seen": 30720000, "step": 3750 }, { - "epoch": 0.4302551779379792, - "grad_norm": 0.5274159908294678, - "learning_rate": 3.588300075618922e-05, - "loss": 1.1014, + "epoch": 0.18055222088835535, + "grad_norm": 0.5241227746009827, + "learning_rate": 4.764675330679256e-05, + "loss": 0.9849, "num_input_tokens_seen": 30801920, "step": 3760 }, { - "epoch": 0.43139947362398445, - "grad_norm": 0.48430922627449036, - "learning_rate": 3.579154618117946e-05, - "loss": 1.1029, + "epoch": 0.18103241296518607, + "grad_norm": 0.5448564887046814, + "learning_rate": 4.7629945068201954e-05, + "loss": 0.9484, "num_input_tokens_seen": 30883840, "step": 3770 }, { - "epoch": 0.4325437693099897, - "grad_norm": 0.5668869018554688, - "learning_rate": 3.5699913772852664e-05, - "loss": 1.3753, + "epoch": 0.1815126050420168, + "grad_norm": 0.5225489139556885, + "learning_rate": 4.7613080002345345e-05, + "loss": 0.8183, "num_input_tokens_seen": 30965760, "step": 3780 }, { - "epoch": 0.43368806499599494, - "grad_norm": 1.2524999380111694, - "learning_rate": 3.560810504121441e-05, - "loss": 1.405, + "epoch": 0.18199279711884753, + "grad_norm": 0.5782943367958069, + "learning_rate": 4.759615815157352e-05, + "loss": 1.0653, "num_input_tokens_seen": 31047680, "step": 3790 }, { - "epoch": 0.4348323606820002, - "grad_norm": 0.49741896986961365, - "learning_rate": 3.551612149917593e-05, - "loss": 1.1393, + "epoch": 0.18247298919567828, + "grad_norm": 0.5790293216705322, + "learning_rate": 4.7579179558379836e-05, + "loss": 0.8532, "num_input_tokens_seen": 31129600, "step": 3800 }, { - "epoch": 0.4359766563680055, - "grad_norm": 0.5412270426750183, - "learning_rate": 3.542396466252913e-05, - "loss": 1.3569, + "epoch": 0.182953181272509, + "grad_norm": 0.49173110723495483, + "learning_rate": 4.7562144265400146e-05, + "loss": 0.7594, "num_input_tokens_seen": 31211520, "step": 3810 }, { - "epoch": 0.43712095205401075, - "grad_norm": 2.5141255855560303, - "learning_rate": 3.533163604992163e-05, - "loss": 1.4194, + "epoch": 0.18343337334933973, + "grad_norm": 0.5832033157348633, + "learning_rate": 4.754505231541268e-05, + "loss": 1.0458, "num_input_tokens_seen": 31293440, "step": 3820 }, { - "epoch": 0.438265247740016, - "grad_norm": 0.5513648390769958, - "learning_rate": 3.523913718283175e-05, - "loss": 1.2899, + "epoch": 0.18391356542617046, + "grad_norm": 0.539718508720398, + "learning_rate": 4.752790375133797e-05, + "loss": 0.8403, "num_input_tokens_seen": 31375360, "step": 3830 }, { - "epoch": 0.4394095434260213, - "grad_norm": 0.583692729473114, - "learning_rate": 3.514646958554339e-05, - "loss": 1.3624, + "epoch": 0.1843937575030012, + "grad_norm": 0.5238791704177856, + "learning_rate": 4.751069861623867e-05, + "loss": 0.9284, "num_input_tokens_seen": 31457280, "step": 3840 }, { - "epoch": 0.44055383911202656, - "grad_norm": 0.5681483149528503, - "learning_rate": 3.5053634785121e-05, - "loss": 1.1517, + "epoch": 0.18487394957983194, + "grad_norm": 0.5033566355705261, + "learning_rate": 4.749343695331952e-05, + "loss": 1.0164, "num_input_tokens_seen": 31539200, "step": 3850 }, { - "epoch": 0.4416981347980318, - "grad_norm": 0.4962402880191803, - "learning_rate": 3.496063431138431e-05, - "loss": 1.2125, + "epoch": 0.18535414165666267, + "grad_norm": 0.6581845879554749, + "learning_rate": 4.747611880592721e-05, + "loss": 0.8328, "num_input_tokens_seen": 31621120, "step": 3860 }, { - "epoch": 0.4428424304840371, - "grad_norm": 0.4531221389770508, - "learning_rate": 3.4867469696883204e-05, - "loss": 1.287, + "epoch": 0.1858343337334934, + "grad_norm": 0.5361823439598083, + "learning_rate": 4.745874421755027e-05, + "loss": 1.0115, "num_input_tokens_seen": 31703040, "step": 3870 }, { - "epoch": 0.4439867261700423, - "grad_norm": 0.4687143564224243, - "learning_rate": 3.477414247687241e-05, - "loss": 1.0745, + "epoch": 0.18631452581032412, + "grad_norm": 0.5217710733413696, + "learning_rate": 4.744131323181895e-05, + "loss": 0.8417, "num_input_tokens_seen": 31784960, "step": 3880 }, { - "epoch": 0.4451310218560476, - "grad_norm": 0.5784918665885925, - "learning_rate": 3.468065418928625e-05, - "loss": 1.146, + "epoch": 0.18679471788715485, + "grad_norm": 0.5840950608253479, + "learning_rate": 4.742382589250514e-05, + "loss": 0.9902, "num_input_tokens_seen": 31866880, "step": 3890 }, { - "epoch": 0.44627531754205285, - "grad_norm": 0.4997071623802185, - "learning_rate": 3.458700637471325e-05, - "loss": 1.0793, + "epoch": 0.1872749099639856, + "grad_norm": 0.5277583599090576, + "learning_rate": 4.740628224352222e-05, + "loss": 1.0409, "num_input_tokens_seen": 31948800, "step": 3900 }, { - "epoch": 0.4474196132280581, - "grad_norm": 0.5425757765769958, - "learning_rate": 3.4493200576370776e-05, - "loss": 1.1415, + "epoch": 0.18775510204081633, + "grad_norm": 0.5681836605072021, + "learning_rate": 4.7388682328925e-05, + "loss": 0.9343, "num_input_tokens_seen": 32030720, "step": 3910 }, { - "epoch": 0.4485639089140634, - "grad_norm": 0.5466029644012451, - "learning_rate": 3.4399238340079607e-05, - "loss": 1.2075, + "epoch": 0.18823529411764706, + "grad_norm": 0.4854600131511688, + "learning_rate": 4.737102619290956e-05, + "loss": 0.9198, "num_input_tokens_seen": 32112640, "step": 3920 }, { - "epoch": 0.44970820460006866, - "grad_norm": 0.5251049995422363, - "learning_rate": 3.4305121214238446e-05, - "loss": 1.1582, + "epoch": 0.18871548619447778, + "grad_norm": 0.527630627155304, + "learning_rate": 4.7353313879813165e-05, + "loss": 0.9707, "num_input_tokens_seen": 32194560, "step": 3930 }, { - "epoch": 0.45085250028607393, - "grad_norm": 0.5477258563041687, - "learning_rate": 3.4210850749798415e-05, - "loss": 1.3309, + "epoch": 0.18919567827130854, + "grad_norm": 0.5316260457038879, + "learning_rate": 4.733554543411417e-05, + "loss": 1.0691, "num_input_tokens_seen": 32276480, "step": 3940 }, { - "epoch": 0.4519967959720792, - "grad_norm": 0.621071457862854, - "learning_rate": 3.411642850023751e-05, - "loss": 1.2072, + "epoch": 0.18967587034813926, + "grad_norm": 0.5303869843482971, + "learning_rate": 4.731772090043184e-05, + "loss": 0.9361, "num_input_tokens_seen": 32358400, "step": 3950 }, { - "epoch": 0.45314109165808447, - "grad_norm": 0.5287960767745972, - "learning_rate": 3.402185602153495e-05, - "loss": 1.1731, + "epoch": 0.19015606242497, + "grad_norm": 0.5066818594932556, + "learning_rate": 4.729984032352635e-05, + "loss": 0.8927, "num_input_tokens_seen": 32440320, "step": 3960 }, { - "epoch": 0.4542853873440897, - "grad_norm": 0.5123234987258911, - "learning_rate": 3.392713487214561e-05, - "loss": 1.1238, + "epoch": 0.19063625450180072, + "grad_norm": 0.5547256469726562, + "learning_rate": 4.728190374829854e-05, + "loss": 0.9308, "num_input_tokens_seen": 32522240, "step": 3970 }, { - "epoch": 0.45542968303009496, - "grad_norm": 0.5226893424987793, - "learning_rate": 3.38322666129743e-05, - "loss": 1.1262, + "epoch": 0.19111644657863144, + "grad_norm": 0.46649324893951416, + "learning_rate": 4.726391121978992e-05, + "loss": 0.9229, "num_input_tokens_seen": 32604160, "step": 3980 }, { - "epoch": 0.4565739787161002, - "grad_norm": 0.6006979942321777, - "learning_rate": 3.373725280735e-05, - "loss": 1.2084, + "epoch": 0.1915966386554622, + "grad_norm": 0.5034019351005554, + "learning_rate": 4.7245862783182496e-05, + "loss": 0.9033, "num_input_tokens_seen": 32686080, "step": 3990 }, { - "epoch": 0.4577182744021055, - "grad_norm": 0.475813627243042, - "learning_rate": 3.3642095021000184e-05, - "loss": 1.2836, + "epoch": 0.19207683073229292, + "grad_norm": 0.5081968307495117, + "learning_rate": 4.722775848379866e-05, + "loss": 1.0315, "num_input_tokens_seen": 32768000, "step": 4000 }, { - "epoch": 0.45886257008811077, - "grad_norm": 0.5431310534477234, - "learning_rate": 3.3546794822024976e-05, - "loss": 1.295, + "epoch": 0.19255702280912365, + "grad_norm": 0.4671435058116913, + "learning_rate": 4.720959836710107e-05, + "loss": 0.8195, "num_input_tokens_seen": 32849920, "step": 4010 }, { - "epoch": 0.46000686577411604, - "grad_norm": 0.48410120606422424, - "learning_rate": 3.3451353780871286e-05, - "loss": 1.3341, + "epoch": 0.19303721488595438, + "grad_norm": 0.4735753536224365, + "learning_rate": 4.7191382478692594e-05, + "loss": 0.9489, "num_input_tokens_seen": 32931840, "step": 4020 }, { - "epoch": 0.4611511614601213, - "grad_norm": 0.525595486164093, - "learning_rate": 3.335577347030697e-05, - "loss": 1.2892, + "epoch": 0.1935174069627851, + "grad_norm": 0.5032724142074585, + "learning_rate": 4.7173110864316104e-05, + "loss": 0.814, "num_input_tokens_seen": 33013760, "step": 4030 }, { - "epoch": 0.4622954571461266, - "grad_norm": 0.5761798620223999, - "learning_rate": 3.32600554653949e-05, - "loss": 1.5631, + "epoch": 0.19399759903961586, + "grad_norm": 0.645969033241272, + "learning_rate": 4.7154783569854444e-05, + "loss": 0.811, "num_input_tokens_seen": 33095680, "step": 4040 }, { - "epoch": 0.46343975283213185, - "grad_norm": 0.48856690526008606, - "learning_rate": 3.316420134346701e-05, - "loss": 0.9705, + "epoch": 0.19447779111644659, + "grad_norm": 0.5401909947395325, + "learning_rate": 4.713640064133025e-05, + "loss": 0.9777, "num_input_tokens_seen": 33177600, "step": 4050 }, { - "epoch": 0.46458404851813706, - "grad_norm": 0.7112519145011902, - "learning_rate": 3.306821268409827e-05, - "loss": 1.3763, + "epoch": 0.1949579831932773, + "grad_norm": 0.5023765563964844, + "learning_rate": 4.7117962124905885e-05, + "loss": 0.8785, "num_input_tokens_seen": 33259520, "step": 4060 }, { - "epoch": 0.46572834420414233, - "grad_norm": 0.9292272925376892, - "learning_rate": 3.297209106908072e-05, - "loss": 1.2818, + "epoch": 0.19543817527010804, + "grad_norm": 0.5897220373153687, + "learning_rate": 4.709946806688329e-05, + "loss": 0.9837, "num_input_tokens_seen": 33341440, "step": 4070 }, { - "epoch": 0.4668726398901476, - "grad_norm": 0.7249779105186462, - "learning_rate": 3.287583808239735e-05, - "loss": 1.1623, + "epoch": 0.19591836734693877, + "grad_norm": 0.6706360578536987, + "learning_rate": 4.708091851370389e-05, + "loss": 0.8769, "num_input_tokens_seen": 33423360, "step": 4080 }, { - "epoch": 0.46801693557615287, - "grad_norm": 0.5191545486450195, - "learning_rate": 3.277945531019601e-05, - "loss": 1.5152, + "epoch": 0.19639855942376952, + "grad_norm": 0.5493477582931519, + "learning_rate": 4.706231351194845e-05, + "loss": 0.9604, "num_input_tokens_seen": 33505280, "step": 4090 }, { - "epoch": 0.46916123126215814, - "grad_norm": 0.5034803748130798, - "learning_rate": 3.268294434076332e-05, - "loss": 1.5291, + "epoch": 0.19687875150060025, + "grad_norm": 0.5341002345085144, + "learning_rate": 4.7043653108336994e-05, + "loss": 0.8028, "num_input_tokens_seen": 33587200, "step": 4100 }, { - "epoch": 0.4703055269481634, - "grad_norm": 0.5027021765708923, - "learning_rate": 3.2586306764498395e-05, - "loss": 1.1453, + "epoch": 0.19735894357743097, + "grad_norm": 0.5551480054855347, + "learning_rate": 4.702493734972866e-05, + "loss": 0.9994, "num_input_tokens_seen": 33669120, "step": 4110 }, { - "epoch": 0.4714498226341687, - "grad_norm": 0.6401455998420715, - "learning_rate": 3.2489544173886745e-05, - "loss": 1.398, + "epoch": 0.1978391356542617, + "grad_norm": 1.2934221029281616, + "learning_rate": 4.700616628312158e-05, + "loss": 1.0073, "num_input_tokens_seen": 33751040, "step": 4120 }, { - "epoch": 0.47259411832017395, - "grad_norm": 0.34582868218421936, - "learning_rate": 3.239265816347397e-05, - "loss": 1.0819, + "epoch": 0.19831932773109243, + "grad_norm": 0.5070963501930237, + "learning_rate": 4.69873399556528e-05, + "loss": 0.8428, "num_input_tokens_seen": 33832960, "step": 4130 }, { - "epoch": 0.4737384140061792, - "grad_norm": 0.5344799757003784, - "learning_rate": 3.2295650329839474e-05, - "loss": 1.3502, + "epoch": 0.19879951980792318, + "grad_norm": 0.49348321557044983, + "learning_rate": 4.696845841459811e-05, + "loss": 1.0941, "num_input_tokens_seen": 33914880, "step": 4140 }, { - "epoch": 0.47488270969218443, - "grad_norm": 0.48560017347335815, - "learning_rate": 3.219852227157022e-05, - "loss": 1.1419, + "epoch": 0.1992797118847539, + "grad_norm": 0.48639488220214844, + "learning_rate": 4.6949521707371965e-05, + "loss": 0.8673, "num_input_tokens_seen": 33996800, "step": 4150 }, { - "epoch": 0.4760270053781897, - "grad_norm": 0.6051056981086731, - "learning_rate": 3.210127558923434e-05, - "loss": 1.3278, + "epoch": 0.19975990396158463, + "grad_norm": 0.5290814638137817, + "learning_rate": 4.693052988152733e-05, + "loss": 0.8488, "num_input_tokens_seen": 34078720, "step": 4160 }, { - "epoch": 0.477171301064195, - "grad_norm": 0.47563403844833374, - "learning_rate": 3.200391188535472e-05, - "loss": 1.2711, + "epoch": 0.20024009603841536, + "grad_norm": 0.7159754037857056, + "learning_rate": 4.691148298475561e-05, + "loss": 1.0891, "num_input_tokens_seen": 34160640, "step": 4170 }, { - "epoch": 0.47831559675020024, - "grad_norm": 0.5107710361480713, - "learning_rate": 3.1906432764382695e-05, - "loss": 1.4901, + "epoch": 0.2007202881152461, + "grad_norm": 0.5923986434936523, + "learning_rate": 4.689238106488647e-05, + "loss": 0.904, "num_input_tokens_seen": 34242560, "step": 4180 }, { - "epoch": 0.4794598924362055, - "grad_norm": 0.5463585257530212, - "learning_rate": 3.1808839832671523e-05, - "loss": 1.4351, + "epoch": 0.20120048019207684, + "grad_norm": 0.4993061125278473, + "learning_rate": 4.687322416988779e-05, + "loss": 0.8948, "num_input_tokens_seen": 34324480, "step": 4190 }, { - "epoch": 0.4806041881222108, - "grad_norm": 0.4871821403503418, - "learning_rate": 3.1711134698449946e-05, - "loss": 1.1299, + "epoch": 0.20168067226890757, + "grad_norm": 0.48761218786239624, + "learning_rate": 4.685401234786544e-05, + "loss": 0.9203, "num_input_tokens_seen": 34406400, "step": 4200 }, { - "epoch": 0.48174848380821605, - "grad_norm": 0.5575620532035828, - "learning_rate": 3.161331897179568e-05, - "loss": 1.3597, + "epoch": 0.2021608643457383, + "grad_norm": 0.4988939166069031, + "learning_rate": 4.683474564706327e-05, + "loss": 0.9092, "num_input_tokens_seen": 34488320, "step": 4210 }, { - "epoch": 0.4828927794942213, - "grad_norm": 0.5072765350341797, - "learning_rate": 3.151539426460892e-05, - "loss": 1.4977, + "epoch": 0.20264105642256902, + "grad_norm": 0.6328974366188049, + "learning_rate": 4.681542411586294e-05, + "loss": 0.8973, "num_input_tokens_seen": 34570240, "step": 4220 }, { - "epoch": 0.4840370751802266, - "grad_norm": 0.517631471157074, - "learning_rate": 3.14173621905857e-05, - "loss": 1.1672, + "epoch": 0.20312124849939975, + "grad_norm": 0.5057374835014343, + "learning_rate": 4.6796047802783755e-05, + "loss": 1.046, "num_input_tokens_seen": 34652160, "step": 4230 }, { - "epoch": 0.4851813708662318, - "grad_norm": 0.5261096954345703, - "learning_rate": 3.1319224365191366e-05, - "loss": 1.495, + "epoch": 0.2036014405762305, + "grad_norm": 0.562879741191864, + "learning_rate": 4.6776616756482624e-05, + "loss": 0.8767, "num_input_tokens_seen": 34734080, "step": 4240 }, { - "epoch": 0.4863256665522371, - "grad_norm": 0.5559542179107666, - "learning_rate": 3.122098240563396e-05, - "loss": 1.6386, + "epoch": 0.20408163265306123, + "grad_norm": 0.9288103580474854, + "learning_rate": 4.6757131025753886e-05, + "loss": 0.849, "num_input_tokens_seen": 34816000, "step": 4250 }, { - "epoch": 0.48746996223824235, - "grad_norm": 0.8691073060035706, - "learning_rate": 3.1122637930837486e-05, - "loss": 1.2742, + "epoch": 0.20456182472989196, + "grad_norm": 0.5365880727767944, + "learning_rate": 4.67375906595292e-05, + "loss": 1.1241, "num_input_tokens_seen": 34897920, "step": 4260 }, { - "epoch": 0.4886142579242476, - "grad_norm": 0.5249228477478027, - "learning_rate": 3.102419256141536e-05, - "loss": 1.2173, + "epoch": 0.20504201680672268, + "grad_norm": 0.5342133641242981, + "learning_rate": 4.671799570687743e-05, + "loss": 1.1088, "num_input_tokens_seen": 34979840, "step": 4270 }, { - "epoch": 0.4897585536102529, - "grad_norm": 0.5527949333190918, - "learning_rate": 3.092564791964358e-05, - "loss": 1.187, + "epoch": 0.2055222088835534, + "grad_norm": 0.5288349390029907, + "learning_rate": 4.6698346217004494e-05, + "loss": 0.9175, "num_input_tokens_seen": 35061760, "step": 4280 }, { - "epoch": 0.49090284929625816, - "grad_norm": 0.48992666602134705, - "learning_rate": 3.082700562943409e-05, - "loss": 1.5849, + "epoch": 0.20600240096038416, + "grad_norm": 0.7718984484672546, + "learning_rate": 4.66786422392533e-05, + "loss": 1.146, "num_input_tokens_seen": 35143680, "step": 4290 }, { - "epoch": 0.49204714498226343, - "grad_norm": 0.4628704786300659, - "learning_rate": 3.0728267316307945e-05, - "loss": 1.2716, + "epoch": 0.2064825930372149, + "grad_norm": 0.5124546885490417, + "learning_rate": 4.665888382310356e-05, + "loss": 0.9771, "num_input_tokens_seen": 35225600, "step": 4300 }, { - "epoch": 0.4931914406682687, - "grad_norm": 1.0824905633926392, - "learning_rate": 3.062943460736857e-05, - "loss": 1.2538, + "epoch": 0.20696278511404562, + "grad_norm": 0.52974534034729, + "learning_rate": 4.663907101817167e-05, + "loss": 0.9299, "num_input_tokens_seen": 35307520, "step": 4310 }, { - "epoch": 0.49433573635427397, - "grad_norm": 0.5195997953414917, - "learning_rate": 3.0530509131274935e-05, - "loss": 1.1509, + "epoch": 0.20744297719087634, + "grad_norm": 0.50569087266922, + "learning_rate": 4.661920387421064e-05, + "loss": 1.0124, "num_input_tokens_seen": 35389440, "step": 4320 }, { - "epoch": 0.4954800320402792, - "grad_norm": 0.5249128937721252, - "learning_rate": 3.04314925182147e-05, - "loss": 1.1693, + "epoch": 0.20792316926770707, + "grad_norm": 0.5128062963485718, + "learning_rate": 4.65992824411099e-05, + "loss": 0.9051, "num_input_tokens_seen": 35471360, "step": 4330 }, { - "epoch": 0.49662432772628445, - "grad_norm": 0.5838843584060669, - "learning_rate": 3.03323863998774e-05, - "loss": 1.2192, + "epoch": 0.20840336134453782, + "grad_norm": 0.549404501914978, + "learning_rate": 4.657930676889526e-05, + "loss": 1.11, "num_input_tokens_seen": 35553280, "step": 4340 }, { - "epoch": 0.4977686234122897, - "grad_norm": 0.5815695524215698, - "learning_rate": 3.0233192409427492e-05, - "loss": 1.2383, + "epoch": 0.20888355342136855, + "grad_norm": 0.5476168394088745, + "learning_rate": 4.655927690772868e-05, + "loss": 0.7549, "num_input_tokens_seen": 35635200, "step": 4350 }, { - "epoch": 0.498912919098295, - "grad_norm": 0.47992345690727234, - "learning_rate": 3.0133912181477475e-05, - "loss": 1.1951, + "epoch": 0.20936374549819928, + "grad_norm": 0.5444672107696533, + "learning_rate": 4.6539192907908204e-05, + "loss": 0.798, "num_input_tokens_seen": 35717120, "step": 4360 }, { - "epoch": 0.5000572147843002, - "grad_norm": 0.5813825726509094, - "learning_rate": 3.003454735206097e-05, - "loss": 1.4106, + "epoch": 0.20984393757503, + "grad_norm": 0.7407582998275757, + "learning_rate": 4.6519054819867856e-05, + "loss": 1.0632, "num_input_tokens_seen": 35799040, "step": 4370 }, { - "epoch": 0.5012015104703055, - "grad_norm": 0.5316118597984314, - "learning_rate": 2.9935099558605728e-05, - "loss": 1.2978, + "epoch": 0.21032412965186073, + "grad_norm": 0.5673499703407288, + "learning_rate": 4.649886269417746e-05, + "loss": 0.9968, "num_input_tokens_seen": 35880960, "step": 4380 }, { - "epoch": 0.5023458061563107, - "grad_norm": 0.5098001956939697, - "learning_rate": 2.9835570439906657e-05, - "loss": 1.1646, + "epoch": 0.21080432172869148, + "grad_norm": 0.5538674592971802, + "learning_rate": 4.647861658154254e-05, + "loss": 0.8916, "num_input_tokens_seen": 35962880, "step": 4390 }, { - "epoch": 0.503490101842316, - "grad_norm": 0.482377290725708, - "learning_rate": 2.973596163609883e-05, - "loss": 1.2874, + "epoch": 0.2112845138055222, + "grad_norm": 0.4925515949726105, + "learning_rate": 4.6458316532804214e-05, + "loss": 1.0407, "num_input_tokens_seen": 36044800, "step": 4400 }, { - "epoch": 0.5046343975283213, - "grad_norm": 0.5183489322662354, - "learning_rate": 2.9636274788630437e-05, - "loss": 1.2198, + "epoch": 0.21176470588235294, + "grad_norm": 0.4931751787662506, + "learning_rate": 4.643796259893899e-05, + "loss": 0.9456, "num_input_tokens_seen": 36126720, "step": 4410 }, { - "epoch": 0.5057786932143266, - "grad_norm": 0.4759490191936493, - "learning_rate": 2.9536511540235744e-05, - "loss": 1.2435, + "epoch": 0.21224489795918366, + "grad_norm": 0.5209604501724243, + "learning_rate": 4.641755483105874e-05, + "loss": 0.9378, "num_input_tokens_seen": 36208640, "step": 4420 }, { - "epoch": 0.5069229889003318, - "grad_norm": 0.5005761384963989, - "learning_rate": 2.9436673534908044e-05, - "loss": 1.367, + "epoch": 0.21272509003601442, + "grad_norm": 1.0745376348495483, + "learning_rate": 4.63970932804105e-05, + "loss": 0.8205, "num_input_tokens_seen": 36290560, "step": 4430 }, { - "epoch": 0.5080672845863371, - "grad_norm": 0.48932281136512756, - "learning_rate": 2.9336762417872516e-05, - "loss": 1.1908, + "epoch": 0.21320528211284515, + "grad_norm": 0.5311976671218872, + "learning_rate": 4.637657799837635e-05, + "loss": 0.8998, "num_input_tokens_seen": 36372480, "step": 4440 }, { - "epoch": 0.5092115802723424, - "grad_norm": 0.49353188276290894, - "learning_rate": 2.9236779835559165e-05, - "loss": 1.2709, + "epoch": 0.21368547418967587, + "grad_norm": 0.5121796727180481, + "learning_rate": 4.635600903647333e-05, + "loss": 0.8201, "num_input_tokens_seen": 36454400, "step": 4450 }, { - "epoch": 0.5103558759583476, - "grad_norm": 0.5292254686355591, - "learning_rate": 2.913672743557565e-05, - "loss": 1.1411, + "epoch": 0.2141656662665066, + "grad_norm": 0.5421672463417053, + "learning_rate": 4.633538644635326e-05, + "loss": 0.879, "num_input_tokens_seen": 36536320, "step": 4460 }, { - "epoch": 0.5115001716443529, - "grad_norm": 0.4894375801086426, - "learning_rate": 2.9036606866680187e-05, - "loss": 1.1795, + "epoch": 0.21464585834333733, + "grad_norm": 1.2252622842788696, + "learning_rate": 4.631471027980262e-05, + "loss": 0.9865, "num_input_tokens_seen": 36618240, "step": 4470 }, { - "epoch": 0.5126444673303582, - "grad_norm": 0.7328277826309204, - "learning_rate": 2.8936419778754294e-05, - "loss": 1.4518, + "epoch": 0.21512605042016808, + "grad_norm": 0.5252002477645874, + "learning_rate": 4.629398058874245e-05, + "loss": 0.8658, "num_input_tokens_seen": 36700160, "step": 4480 }, { - "epoch": 0.5137887630163634, - "grad_norm": 0.5245153903961182, - "learning_rate": 2.883616782277569e-05, - "loss": 1.1565, + "epoch": 0.2156062424969988, + "grad_norm": 0.5939603447914124, + "learning_rate": 4.6273197425228166e-05, + "loss": 0.9567, "num_input_tokens_seen": 36782080, "step": 4490 }, { - "epoch": 0.5149330587023687, - "grad_norm": 0.5825649499893188, - "learning_rate": 2.8735852650791035e-05, - "loss": 1.1245, + "epoch": 0.21608643457382953, + "grad_norm": 0.5345758199691772, + "learning_rate": 4.62523608414495e-05, + "loss": 0.9567, "num_input_tokens_seen": 36864000, "step": 4500 }, { - "epoch": 0.516077354388374, - "grad_norm": 0.4824535548686981, - "learning_rate": 2.8635475915888732e-05, - "loss": 1.2498, + "epoch": 0.21656662665066026, + "grad_norm": 1.0143563747406006, + "learning_rate": 4.623147088973031e-05, + "loss": 0.7952, "num_input_tokens_seen": 36945920, "step": 4510 }, { - "epoch": 0.5172216500743793, - "grad_norm": 0.51859050989151, - "learning_rate": 2.853503927217167e-05, - "loss": 1.222, + "epoch": 0.217046818727491, + "grad_norm": 0.5424322485923767, + "learning_rate": 4.6210527622528465e-05, + "loss": 1.1468, "num_input_tokens_seen": 37027840, "step": 4520 }, { - "epoch": 0.5183659457603845, - "grad_norm": 0.4794926345348358, - "learning_rate": 2.8434544374729965e-05, - "loss": 1.3003, + "epoch": 0.21752701080432174, + "grad_norm": 0.5119237303733826, + "learning_rate": 4.618953109243573e-05, + "loss": 0.8657, "num_input_tokens_seen": 37109760, "step": 4530 }, { - "epoch": 0.5195102414463898, - "grad_norm": 0.8347005248069763, - "learning_rate": 2.8333992879613712e-05, - "loss": 1.1049, + "epoch": 0.21800720288115247, + "grad_norm": 0.5329012274742126, + "learning_rate": 4.616848135217761e-05, + "loss": 0.9235, "num_input_tokens_seen": 37191680, "step": 4540 }, { - "epoch": 0.520654537132395, - "grad_norm": 0.5049195289611816, - "learning_rate": 2.823338644380566e-05, - "loss": 1.2885, + "epoch": 0.2184873949579832, + "grad_norm": 0.5090308785438538, + "learning_rate": 4.6147378454613246e-05, + "loss": 0.8308, "num_input_tokens_seen": 37273600, "step": 4550 }, { - "epoch": 0.5217988328184002, - "grad_norm": 0.6773023009300232, - "learning_rate": 2.8132726725193926e-05, - "loss": 1.3466, + "epoch": 0.21896758703481392, + "grad_norm": 0.7635847926139832, + "learning_rate": 4.6126222452735233e-05, + "loss": 0.8878, "num_input_tokens_seen": 37355520, "step": 4560 }, { - "epoch": 0.5229431285044055, - "grad_norm": 0.4892396628856659, - "learning_rate": 2.803201538254467e-05, - "loss": 1.3192, + "epoch": 0.21944777911164465, + "grad_norm": 0.48617926239967346, + "learning_rate": 4.6105013399669564e-05, + "loss": 0.9695, "num_input_tokens_seen": 37437440, "step": 4570 }, { - "epoch": 0.5240874241904108, - "grad_norm": 0.5198492407798767, - "learning_rate": 2.7931254075474768e-05, - "loss": 1.2215, + "epoch": 0.2199279711884754, + "grad_norm": 0.6429051756858826, + "learning_rate": 4.608375134867541e-05, + "loss": 0.8531, "num_input_tokens_seen": 37519360, "step": 4580 }, { - "epoch": 0.525231719876416, - "grad_norm": 0.4889214336872101, - "learning_rate": 2.7830444464424466e-05, - "loss": 1.1609, + "epoch": 0.22040816326530613, + "grad_norm": 0.5649603605270386, + "learning_rate": 4.6062436353145044e-05, + "loss": 0.8998, "num_input_tokens_seen": 37601280, "step": 4590 }, { - "epoch": 0.5263760155624213, - "grad_norm": 0.512380838394165, - "learning_rate": 2.772958821062997e-05, - "loss": 1.0642, + "epoch": 0.22088835534213686, + "grad_norm": 0.5439668297767639, + "learning_rate": 4.60410684666037e-05, + "loss": 1.2532, "num_input_tokens_seen": 37683200, "step": 4600 }, { - "epoch": 0.5275203112484266, - "grad_norm": 0.792269766330719, - "learning_rate": 2.7628686976096164e-05, - "loss": 1.1454, + "epoch": 0.22136854741896758, + "grad_norm": 0.5839661359786987, + "learning_rate": 4.601964774270941e-05, + "loss": 0.9984, "num_input_tokens_seen": 37765120, "step": 4610 }, { - "epoch": 0.5286646069344318, - "grad_norm": 0.5080438256263733, - "learning_rate": 2.7527742423569124e-05, - "loss": 1.1747, + "epoch": 0.2218487394957983, + "grad_norm": 0.5015205144882202, + "learning_rate": 4.599817423525292e-05, + "loss": 1.0361, "num_input_tokens_seen": 37847040, "step": 4620 }, { - "epoch": 0.5298089026204371, - "grad_norm": 0.494386225938797, - "learning_rate": 2.7426756216508776e-05, - "loss": 1.1861, + "epoch": 0.22232893157262906, + "grad_norm": 0.5025255084037781, + "learning_rate": 4.597664799815749e-05, + "loss": 1.0169, "num_input_tokens_seen": 37928960, "step": 4630 }, { - "epoch": 0.5309531983064424, - "grad_norm": 0.7930368185043335, - "learning_rate": 2.7325730019061474e-05, - "loss": 1.3794, + "epoch": 0.2228091236494598, + "grad_norm": 0.4889910817146301, + "learning_rate": 4.595506908547881e-05, + "loss": 0.9112, "num_input_tokens_seen": 38010880, "step": 4640 }, { - "epoch": 0.5320974939924477, - "grad_norm": 0.5523006319999695, - "learning_rate": 2.7224665496032565e-05, - "loss": 1.1395, + "epoch": 0.22328931572629052, + "grad_norm": 0.5253650546073914, + "learning_rate": 4.593343755140483e-05, + "loss": 0.9424, "num_input_tokens_seen": 38092800, "step": 4650 }, { - "epoch": 0.5332417896784529, - "grad_norm": 0.6003334522247314, - "learning_rate": 2.712356431285896e-05, - "loss": 1.1517, + "epoch": 0.22376950780312124, + "grad_norm": 0.4819222688674927, + "learning_rate": 4.5911753450255665e-05, + "loss": 1.0131, "num_input_tokens_seen": 38174720, "step": 4660 }, { - "epoch": 0.5343860853644582, - "grad_norm": 0.5045771598815918, - "learning_rate": 2.70224281355817e-05, - "loss": 1.2753, + "epoch": 0.22424969987995197, + "grad_norm": 0.5012921690940857, + "learning_rate": 4.589001683648343e-05, + "loss": 0.7262, "num_input_tokens_seen": 38256640, "step": 4670 }, { - "epoch": 0.5355303810504635, - "grad_norm": 0.5754336714744568, - "learning_rate": 2.6921258630818475e-05, - "loss": 1.1533, + "epoch": 0.22472989195678272, + "grad_norm": 0.5600073337554932, + "learning_rate": 4.586822776467208e-05, + "loss": 0.9312, "num_input_tokens_seen": 38338560, "step": 4680 }, { - "epoch": 0.5366746767364687, - "grad_norm": 0.5257157683372498, - "learning_rate": 2.6820057465736197e-05, - "loss": 1.1615, + "epoch": 0.22521008403361345, + "grad_norm": 0.5474848747253418, + "learning_rate": 4.584638628953733e-05, + "loss": 0.9062, "num_input_tokens_seen": 38420480, "step": 4690 }, { - "epoch": 0.537818972422474, - "grad_norm": 0.4963821768760681, - "learning_rate": 2.6718826308023487e-05, - "loss": 1.1366, + "epoch": 0.22569027611044418, + "grad_norm": 0.614006519317627, + "learning_rate": 4.582449246592647e-05, + "loss": 0.8918, "num_input_tokens_seen": 38502400, "step": 4700 }, { - "epoch": 0.5389632681084793, - "grad_norm": 0.5854734778404236, - "learning_rate": 2.6617566825863237e-05, - "loss": 1.129, + "epoch": 0.2261704681872749, + "grad_norm": 0.5042563080787659, + "learning_rate": 4.5802546348818264e-05, + "loss": 0.8067, "num_input_tokens_seen": 38584320, "step": 4710 }, { - "epoch": 0.5401075637944845, - "grad_norm": 0.5400436520576477, - "learning_rate": 2.651628068790507e-05, - "loss": 1.0386, + "epoch": 0.22665066026410563, + "grad_norm": 0.5609814524650574, + "learning_rate": 4.578054799332277e-05, + "loss": 0.9678, "num_input_tokens_seen": 38666240, "step": 4720 }, { - "epoch": 0.5412518594804897, - "grad_norm": 1.165490984916687, - "learning_rate": 2.6414969563237874e-05, - "loss": 1.0903, + "epoch": 0.22713085234093638, + "grad_norm": 0.4998452663421631, + "learning_rate": 4.575849745468124e-05, + "loss": 1.0305, "num_input_tokens_seen": 38748160, "step": 4730 }, { - "epoch": 0.542396155166495, - "grad_norm": 1.1871403455734253, - "learning_rate": 2.6313635121362322e-05, - "loss": 1.2361, + "epoch": 0.2276110444177671, + "grad_norm": 0.5019862651824951, + "learning_rate": 4.573639478826596e-05, + "loss": 0.8708, "num_input_tokens_seen": 38830080, "step": 4740 }, { - "epoch": 0.5435404508525002, - "grad_norm": 0.5504677295684814, - "learning_rate": 2.6212279032163283e-05, - "loss": 1.5279, + "epoch": 0.22809123649459784, + "grad_norm": 0.5069782733917236, + "learning_rate": 4.571424004958012e-05, + "loss": 1.0366, "num_input_tokens_seen": 38912000, "step": 4750 }, { - "epoch": 0.5446847465385055, - "grad_norm": 0.9438173174858093, - "learning_rate": 2.6110902965882383e-05, - "loss": 1.3035, + "epoch": 0.22857142857142856, + "grad_norm": 0.5219393372535706, + "learning_rate": 4.5692033294257666e-05, + "loss": 1.1274, "num_input_tokens_seen": 38993920, "step": 4760 }, { - "epoch": 0.5458290422245108, - "grad_norm": 0.5121588110923767, - "learning_rate": 2.6009508593090448e-05, - "loss": 1.2656, + "epoch": 0.2290516206482593, + "grad_norm": 0.5168918967247009, + "learning_rate": 4.5669774578063174e-05, + "loss": 0.906, "num_input_tokens_seen": 39075840, "step": 4770 }, { - "epoch": 0.546973337910516, - "grad_norm": 0.5496011972427368, - "learning_rate": 2.590809758465995e-05, - "loss": 1.3347, + "epoch": 0.22953181272509005, + "grad_norm": 0.554415762424469, + "learning_rate": 4.56474639568917e-05, + "loss": 0.9868, "num_input_tokens_seen": 39157760, "step": 4780 }, { - "epoch": 0.5481176335965213, - "grad_norm": 0.5163994431495667, - "learning_rate": 2.580667161173753e-05, - "loss": 1.0816, + "epoch": 0.23001200480192077, + "grad_norm": 0.5875802040100098, + "learning_rate": 4.5625101486768626e-05, + "loss": 0.9674, "num_input_tokens_seen": 39239680, "step": 4790 }, { - "epoch": 0.5492619292825266, - "grad_norm": 0.547443687915802, - "learning_rate": 2.570523234571642e-05, - "loss": 1.072, + "epoch": 0.2304921968787515, + "grad_norm": 0.524535596370697, + "learning_rate": 4.560268722384956e-05, + "loss": 0.944, "num_input_tokens_seen": 39321600, "step": 4800 }, { - "epoch": 0.5504062249685319, - "grad_norm": 0.49614182114601135, - "learning_rate": 2.5603781458208885e-05, - "loss": 1.105, + "epoch": 0.23097238895558223, + "grad_norm": 0.5096153020858765, + "learning_rate": 4.558022122442016e-05, + "loss": 0.9393, "num_input_tokens_seen": 39403520, "step": 4810 }, { - "epoch": 0.5515505206545371, - "grad_norm": 0.6012304425239563, - "learning_rate": 2.5502320621018732e-05, - "loss": 1.1652, + "epoch": 0.23145258103241295, + "grad_norm": 0.527031421661377, + "learning_rate": 4.555770354489598e-05, + "loss": 0.9572, "num_input_tokens_seen": 39485440, "step": 4820 }, { - "epoch": 0.5526948163405424, - "grad_norm": 1.310507893562317, - "learning_rate": 2.5400851506113728e-05, - "loss": 1.2073, + "epoch": 0.2319327731092437, + "grad_norm": 0.4810847342014313, + "learning_rate": 4.5535134241822394e-05, + "loss": 0.8217, "num_input_tokens_seen": 39567360, "step": 4830 }, { - "epoch": 0.5538391120265477, - "grad_norm": 0.5216242671012878, - "learning_rate": 2.5299375785598005e-05, - "loss": 1.2457, + "epoch": 0.23241296518607443, + "grad_norm": 0.4917358458042145, + "learning_rate": 4.551251337187436e-05, + "loss": 0.8829, "num_input_tokens_seen": 39649280, "step": 4840 }, { - "epoch": 0.554983407712553, - "grad_norm": 0.4636791944503784, - "learning_rate": 2.519789513168459e-05, - "loss": 1.3313, + "epoch": 0.23289315726290516, + "grad_norm": 0.4697302281856537, + "learning_rate": 4.548984099185638e-05, + "loss": 1.0275, "num_input_tokens_seen": 39731200, "step": 4850 }, { - "epoch": 0.5561277033985582, - "grad_norm": 0.4918244779109955, - "learning_rate": 2.509641121666781e-05, - "loss": 1.1312, + "epoch": 0.23337334933973589, + "grad_norm": 0.7084998488426208, + "learning_rate": 4.546711715870227e-05, + "loss": 0.8638, "num_input_tokens_seen": 39813120, "step": 4860 }, { - "epoch": 0.5572719990845635, - "grad_norm": 0.5367238521575928, - "learning_rate": 2.4994925712895697e-05, - "loss": 1.1891, + "epoch": 0.23385354141656664, + "grad_norm": 0.6179664134979248, + "learning_rate": 4.5444341929475064e-05, + "loss": 0.9251, "num_input_tokens_seen": 39895040, "step": 4870 }, { - "epoch": 0.5584162947705688, - "grad_norm": 0.45426082611083984, - "learning_rate": 2.489344029274249e-05, - "loss": 1.4017, + "epoch": 0.23433373349339737, + "grad_norm": 0.5251472592353821, + "learning_rate": 4.5421515361366854e-05, + "loss": 1.362, "num_input_tokens_seen": 39976960, "step": 4880 }, { - "epoch": 0.559560590456574, - "grad_norm": 1.55329167842865, - "learning_rate": 2.479195662858105e-05, - "loss": 1.1554, + "epoch": 0.2348139255702281, + "grad_norm": 0.566771924495697, + "learning_rate": 4.5398637511698665e-05, + "loss": 0.6601, "num_input_tokens_seen": 40058880, "step": 4890 }, { - "epoch": 0.5607048861425793, - "grad_norm": 0.5056139230728149, - "learning_rate": 2.4690476392755298e-05, - "loss": 1.3402, + "epoch": 0.23529411764705882, + "grad_norm": 0.5199572443962097, + "learning_rate": 4.5375708437920284e-05, + "loss": 0.9545, "num_input_tokens_seen": 40140800, "step": 4900 }, { - "epoch": 0.5618491818285845, - "grad_norm": 0.45648014545440674, - "learning_rate": 2.4589001257552637e-05, - "loss": 1.288, + "epoch": 0.23577430972388955, + "grad_norm": 0.4832773506641388, + "learning_rate": 4.535272819761014e-05, + "loss": 1.0709, "num_input_tokens_seen": 40222720, "step": 4910 }, { - "epoch": 0.5629934775145897, - "grad_norm": 0.5516757369041443, - "learning_rate": 2.4487532895176457e-05, - "loss": 1.2969, + "epoch": 0.2362545018007203, + "grad_norm": 0.575834333896637, + "learning_rate": 4.532969684847514e-05, + "loss": 1.0263, "num_input_tokens_seen": 40304640, "step": 4920 }, { - "epoch": 0.564137773200595, - "grad_norm": 0.5544970631599426, - "learning_rate": 2.4386072977718503e-05, - "loss": 1.346, + "epoch": 0.23673469387755103, + "grad_norm": 0.6102591753005981, + "learning_rate": 4.530661444835054e-05, + "loss": 0.9399, "num_input_tokens_seen": 40386560, "step": 4930 }, { - "epoch": 0.5652820688866003, - "grad_norm": 0.48032233119010925, - "learning_rate": 2.4284623177131395e-05, - "loss": 1.2043, + "epoch": 0.23721488595438175, + "grad_norm": 0.4413188099861145, + "learning_rate": 4.5283481055199784e-05, + "loss": 1.0041, "num_input_tokens_seen": 40468480, "step": 4940 }, { - "epoch": 0.5664263645726055, - "grad_norm": 1.1765650510787964, - "learning_rate": 2.4183185165200998e-05, - "loss": 1.1418, + "epoch": 0.23769507803121248, + "grad_norm": 0.5925584435462952, + "learning_rate": 4.526029672711437e-05, + "loss": 1.1321, "num_input_tokens_seen": 40550400, "step": 4950 }, { - "epoch": 0.5675706602586108, - "grad_norm": 0.5061176419258118, - "learning_rate": 2.4081760613518924e-05, - "loss": 1.0886, + "epoch": 0.2381752701080432, + "grad_norm": 1.5264989137649536, + "learning_rate": 4.523706152231373e-05, + "loss": 0.9726, "num_input_tokens_seen": 40632320, "step": 4960 }, { - "epoch": 0.5687149559446161, - "grad_norm": 1.4397417306900024, - "learning_rate": 2.3980351193455e-05, - "loss": 1.3159, + "epoch": 0.23865546218487396, + "grad_norm": 0.5782877206802368, + "learning_rate": 4.5213775499145e-05, + "loss": 1.0489, "num_input_tokens_seen": 40714240, "step": 4970 }, { - "epoch": 0.5698592516306213, - "grad_norm": 0.46491971611976624, - "learning_rate": 2.3878958576129664e-05, - "loss": 1.2942, + "epoch": 0.2391356542617047, + "grad_norm": 0.48922982811927795, + "learning_rate": 4.519043871608297e-05, + "loss": 0.7904, "num_input_tokens_seen": 40796160, "step": 4980 }, { - "epoch": 0.5710035473166266, - "grad_norm": 0.5516128540039062, - "learning_rate": 2.3777584432386474e-05, - "loss": 1.3472, + "epoch": 0.23961584633853542, + "grad_norm": 0.5028977990150452, + "learning_rate": 4.5167051231729894e-05, + "loss": 0.8851, "num_input_tokens_seen": 40878080, "step": 4990 }, { - "epoch": 0.5721478430026319, - "grad_norm": 0.4662507176399231, - "learning_rate": 2.367623043276459e-05, - "loss": 1.1461, + "epoch": 0.24009603841536614, + "grad_norm": 0.4978225529193878, + "learning_rate": 4.514361310481533e-05, + "loss": 0.9244, "num_input_tokens_seen": 40960000, "step": 5000 }, { - "epoch": 0.5732921386886372, - "grad_norm": 0.4610395133495331, - "learning_rate": 2.3574898247471167e-05, - "loss": 0.997, + "epoch": 0.24057623049219687, + "grad_norm": 0.6003936529159546, + "learning_rate": 4.512012439419601e-05, + "loss": 0.9562, "num_input_tokens_seen": 41041920, "step": 5010 }, { - "epoch": 0.5744364343746424, - "grad_norm": 0.6839539408683777, - "learning_rate": 2.347358954635393e-05, - "loss": 1.2608, + "epoch": 0.24105642256902762, + "grad_norm": 0.5342166423797607, + "learning_rate": 4.509658515885568e-05, + "loss": 1.0229, "num_input_tokens_seen": 41123840, "step": 5020 }, { - "epoch": 0.5755807300606477, - "grad_norm": 0.5418996810913086, - "learning_rate": 2.337230599887358e-05, - "loss": 1.1886, + "epoch": 0.24153661464585835, + "grad_norm": 0.5202224850654602, + "learning_rate": 4.5072995457904995e-05, + "loss": 0.8586, "num_input_tokens_seen": 41205760, "step": 5030 }, { - "epoch": 0.576725025746653, - "grad_norm": 0.5929667949676514, - "learning_rate": 2.327104927407634e-05, - "loss": 1.2545, + "epoch": 0.24201680672268908, + "grad_norm": 0.5373672842979431, + "learning_rate": 4.50493553505813e-05, + "loss": 0.9188, "num_input_tokens_seen": 41287680, "step": 5040 }, { - "epoch": 0.5778693214326582, - "grad_norm": 0.49478694796562195, - "learning_rate": 2.3169821040566387e-05, - "loss": 1.0687, + "epoch": 0.2424969987995198, + "grad_norm": 0.5604919195175171, + "learning_rate": 4.502566489624855e-05, + "loss": 1.041, "num_input_tokens_seen": 41369600, "step": 5050 }, { - "epoch": 0.5790136171186635, - "grad_norm": 0.48581549525260925, - "learning_rate": 2.306862296647841e-05, - "loss": 1.1071, + "epoch": 0.24297719087635053, + "grad_norm": 0.4886492192745209, + "learning_rate": 4.50019241543971e-05, + "loss": 0.9901, "num_input_tokens_seen": 41451520, "step": 5060 }, { - "epoch": 0.5801579128046688, - "grad_norm": 1.5436527729034424, - "learning_rate": 2.2967456719450127e-05, - "loss": 1.0824, + "epoch": 0.24345738295318128, + "grad_norm": 0.5549675822257996, + "learning_rate": 4.4978133184643586e-05, + "loss": 0.9472, "num_input_tokens_seen": 41533440, "step": 5070 }, { - "epoch": 0.581302208490674, - "grad_norm": 0.49023711681365967, - "learning_rate": 2.2866323966594736e-05, - "loss": 1.451, + "epoch": 0.243937575030012, + "grad_norm": 0.4997415244579315, + "learning_rate": 4.495429204673081e-05, + "loss": 0.9372, "num_input_tokens_seen": 41615360, "step": 5080 }, { - "epoch": 0.5824465041766792, - "grad_norm": 0.48039644956588745, - "learning_rate": 2.2765226374473504e-05, - "loss": 1.2207, + "epoch": 0.24441776710684274, + "grad_norm": 0.5093867182731628, + "learning_rate": 4.493040080052752e-05, + "loss": 0.8477, "num_input_tokens_seen": 41697280, "step": 5090 }, { - "epoch": 0.5835907998626845, - "grad_norm": 0.46668538451194763, - "learning_rate": 2.2664165609068304e-05, - "loss": 1.1504, + "epoch": 0.24489795918367346, + "grad_norm": 0.5322418808937073, + "learning_rate": 4.49064595060283e-05, + "loss": 1.0359, "num_input_tokens_seen": 41779200, "step": 5100 }, { - "epoch": 0.5847350955486897, - "grad_norm": 0.418855220079422, - "learning_rate": 2.2563143335754118e-05, - "loss": 1.1436, + "epoch": 0.2453781512605042, + "grad_norm": 0.485977441072464, + "learning_rate": 4.488246822335341e-05, + "loss": 1.027, "num_input_tokens_seen": 41861120, "step": 5110 }, { - "epoch": 0.585879391234695, - "grad_norm": 0.6656162142753601, - "learning_rate": 2.2462161219271622e-05, - "loss": 1.5087, + "epoch": 0.24585834333733494, + "grad_norm": 0.6202290058135986, + "learning_rate": 4.485842701274865e-05, + "loss": 1.4544, "num_input_tokens_seen": 41943040, "step": 5120 }, { - "epoch": 0.5870236869207003, - "grad_norm": 0.5571346282958984, - "learning_rate": 2.236122092369977e-05, - "loss": 1.2635, + "epoch": 0.24633853541416567, + "grad_norm": 0.5418204665184021, + "learning_rate": 4.4834335934585194e-05, + "loss": 0.8381, "num_input_tokens_seen": 42024960, "step": 5130 }, { - "epoch": 0.5881679826067056, - "grad_norm": 0.5325060486793518, - "learning_rate": 2.2260324112428336e-05, - "loss": 1.4867, + "epoch": 0.2468187274909964, + "grad_norm": 0.5167099237442017, + "learning_rate": 4.4810195049359435e-05, + "loss": 0.8446, "num_input_tokens_seen": 42106880, "step": 5140 }, { - "epoch": 0.5893122782927108, - "grad_norm": 0.48673689365386963, - "learning_rate": 2.2159472448130513e-05, - "loss": 1.2188, + "epoch": 0.24729891956782712, + "grad_norm": 0.5339356064796448, + "learning_rate": 4.4786004417692836e-05, + "loss": 0.9213, "num_input_tokens_seen": 42188800, "step": 5150 }, { - "epoch": 0.5904565739787161, - "grad_norm": 0.5229732394218445, - "learning_rate": 2.2058667592735532e-05, - "loss": 1.3696, + "epoch": 0.24777911164465785, + "grad_norm": 0.5605190396308899, + "learning_rate": 4.4761764100331795e-05, + "loss": 1.0777, "num_input_tokens_seen": 42270720, "step": 5160 }, { - "epoch": 0.5916008696647214, - "grad_norm": 0.6097173094749451, - "learning_rate": 2.1957911207401267e-05, - "loss": 1.6279, + "epoch": 0.2482593037214886, + "grad_norm": 0.5921156406402588, + "learning_rate": 4.473747415814747e-05, + "loss": 0.8808, "num_input_tokens_seen": 42352640, "step": 5170 }, { - "epoch": 0.5927451653507266, - "grad_norm": 0.5959046483039856, - "learning_rate": 2.1857204952486824e-05, - "loss": 1.521, + "epoch": 0.24873949579831933, + "grad_norm": 0.6122276186943054, + "learning_rate": 4.471313465213562e-05, + "loss": 1.0989, "num_input_tokens_seen": 42434560, "step": 5180 }, { - "epoch": 0.5938894610367319, - "grad_norm": 0.7557221055030823, - "learning_rate": 2.1756550487525247e-05, - "loss": 1.3385, + "epoch": 0.24921968787515006, + "grad_norm": 0.618924617767334, + "learning_rate": 4.46887456434165e-05, + "loss": 0.8832, "num_input_tokens_seen": 42516480, "step": 5190 }, { - "epoch": 0.5950337567227372, - "grad_norm": 0.48483288288116455, - "learning_rate": 2.165594947119613e-05, - "loss": 1.1232, + "epoch": 0.24969987995198079, + "grad_norm": 0.5166974067687988, + "learning_rate": 4.466430719323465e-05, + "loss": 0.843, "num_input_tokens_seen": 42598400, "step": 5200 }, { - "epoch": 0.5961780524087424, - "grad_norm": 0.48301464319229126, - "learning_rate": 2.1555403561298287e-05, - "loss": 1.3692, + "epoch": 0.25018007202881154, + "grad_norm": 0.6116747856140137, + "learning_rate": 4.463981936295876e-05, + "loss": 1.096, "num_input_tokens_seen": 42680320, "step": 5210 }, { - "epoch": 0.5973223480947477, - "grad_norm": 0.4715411961078644, - "learning_rate": 2.1454914414722417e-05, - "loss": 1.2656, + "epoch": 0.25066026410564224, + "grad_norm": 0.4793613851070404, + "learning_rate": 4.461528221408153e-05, + "loss": 0.8411, "num_input_tokens_seen": 42762240, "step": 5220 }, { - "epoch": 0.598466643780753, - "grad_norm": 0.4841952323913574, - "learning_rate": 2.135448368742385e-05, - "loss": 1.2838, + "epoch": 0.251140456182473, + "grad_norm": 0.4967063069343567, + "learning_rate": 4.459069580821953e-05, + "loss": 0.9483, "num_input_tokens_seen": 42844160, "step": 5230 }, { - "epoch": 0.5996109394667583, - "grad_norm": 0.4453716278076172, - "learning_rate": 2.1254113034395212e-05, - "loss": 1.2797, + "epoch": 0.25162064825930375, + "grad_norm": 0.4823471009731293, + "learning_rate": 4.4566060207112983e-05, + "loss": 0.7803, "num_input_tokens_seen": 42926080, "step": 5240 }, { - "epoch": 0.6007552351527635, - "grad_norm": 1.0583953857421875, - "learning_rate": 2.1153804109639157e-05, - "loss": 1.2836, + "epoch": 0.25210084033613445, + "grad_norm": 0.5162050127983093, + "learning_rate": 4.454137547262566e-05, + "loss": 0.9638, "num_input_tokens_seen": 43008000, "step": 5250 }, { - "epoch": 0.6018995308387688, - "grad_norm": 0.5487982630729675, - "learning_rate": 2.105355856614115e-05, - "loss": 1.1154, + "epoch": 0.2525810324129652, + "grad_norm": 0.6449117660522461, + "learning_rate": 4.451664166674472e-05, + "loss": 0.9499, "num_input_tokens_seen": 43089920, "step": 5260 }, { - "epoch": 0.603043826524774, - "grad_norm": 0.5370051264762878, - "learning_rate": 2.0953378055842183e-05, - "loss": 1.3152, + "epoch": 0.2530612244897959, + "grad_norm": 0.5185222625732422, + "learning_rate": 4.449185885158056e-05, + "loss": 0.8689, "num_input_tokens_seen": 43171840, "step": 5270 }, { - "epoch": 0.6041881222107792, - "grad_norm": 0.574317455291748, - "learning_rate": 2.0853264229611557e-05, - "loss": 0.9663, + "epoch": 0.25354141656662665, + "grad_norm": 0.5826950669288635, + "learning_rate": 4.4467027089366625e-05, + "loss": 0.9125, "num_input_tokens_seen": 43253760, "step": 5280 }, { - "epoch": 0.6053324178967845, - "grad_norm": 0.48023343086242676, - "learning_rate": 2.075321873721972e-05, - "loss": 1.0387, + "epoch": 0.2540216086434574, + "grad_norm": 0.6201736330986023, + "learning_rate": 4.444214644245928e-05, + "loss": 0.9452, "num_input_tokens_seen": 43335680, "step": 5290 }, { - "epoch": 0.6064767135827898, - "grad_norm": 0.5434787273406982, - "learning_rate": 2.0653243227311014e-05, - "loss": 1.3762, + "epoch": 0.2545018007202881, + "grad_norm": 0.5122745633125305, + "learning_rate": 4.441721697333765e-05, + "loss": 0.8625, "num_input_tokens_seen": 43417600, "step": 5300 }, { - "epoch": 0.607621009268795, - "grad_norm": 0.5317404866218567, - "learning_rate": 2.0553339347376592e-05, - "loss": 1.187, + "epoch": 0.25498199279711886, + "grad_norm": 0.5290250182151794, + "learning_rate": 4.4392238744603464e-05, + "loss": 0.8692, "num_input_tokens_seen": 43499520, "step": 5310 }, { - "epoch": 0.6087653049548003, - "grad_norm": 0.4761092960834503, - "learning_rate": 2.045350874372717e-05, - "loss": 1.0539, + "epoch": 0.25546218487394956, + "grad_norm": 0.719904363155365, + "learning_rate": 4.436721181898088e-05, + "loss": 1.0077, "num_input_tokens_seen": 43581440, "step": 5320 }, { - "epoch": 0.6099096006408056, - "grad_norm": 0.45552459359169006, - "learning_rate": 2.0353753061465972e-05, - "loss": 1.3189, + "epoch": 0.2559423769507803, + "grad_norm": 0.5093972086906433, + "learning_rate": 4.434213625931636e-05, + "loss": 0.9048, "num_input_tokens_seen": 43663360, "step": 5330 }, { - "epoch": 0.6110538963268108, - "grad_norm": 0.6579371690750122, - "learning_rate": 2.0254073944461603e-05, - "loss": 1.2721, + "epoch": 0.25642256902761107, + "grad_norm": 0.5186138153076172, + "learning_rate": 4.431701212857847e-05, + "loss": 0.9174, "num_input_tokens_seen": 43745280, "step": 5340 }, { - "epoch": 0.6121981920128161, - "grad_norm": 0.5771154761314392, - "learning_rate": 2.0154473035320936e-05, - "loss": 1.1475, + "epoch": 0.25690276110444177, + "grad_norm": 0.5094157457351685, + "learning_rate": 4.429183948985777e-05, + "loss": 0.7608, "num_input_tokens_seen": 43827200, "step": 5350 }, { - "epoch": 0.6133424876988214, - "grad_norm": 0.500095009803772, - "learning_rate": 2.0054951975362067e-05, - "loss": 1.2035, + "epoch": 0.2573829531812725, + "grad_norm": 0.5557414889335632, + "learning_rate": 4.426661840636662e-05, + "loss": 0.8841, "num_input_tokens_seen": 43909120, "step": 5360 }, { - "epoch": 0.6144867833848267, - "grad_norm": 0.46520867943763733, - "learning_rate": 1.995551240458728e-05, - "loss": 1.1699, + "epoch": 0.2578631452581032, + "grad_norm": 0.49979162216186523, + "learning_rate": 4.424134894143903e-05, + "loss": 1.0389, "num_input_tokens_seen": 43991040, "step": 5370 }, { - "epoch": 0.6156310790708319, - "grad_norm": 0.46139946579933167, - "learning_rate": 1.985615596165597e-05, - "loss": 1.2418, + "epoch": 0.258343337334934, + "grad_norm": 0.6658138036727905, + "learning_rate": 4.42160311585305e-05, + "loss": 1.0486, "num_input_tokens_seen": 44072960, "step": 5380 }, { - "epoch": 0.6167753747568372, - "grad_norm": 0.49758806824684143, - "learning_rate": 1.9756884283857685e-05, - "loss": 1.4331, + "epoch": 0.25882352941176473, + "grad_norm": 0.5858299732208252, + "learning_rate": 4.419066512121788e-05, + "loss": 0.6945, "num_input_tokens_seen": 44154880, "step": 5390 }, { - "epoch": 0.6179196704428425, - "grad_norm": 0.48119744658470154, - "learning_rate": 1.965769900708515e-05, - "loss": 1.3412, + "epoch": 0.25930372148859543, + "grad_norm": 0.5159766674041748, + "learning_rate": 4.4165250893199176e-05, + "loss": 0.8972, "num_input_tokens_seen": 44236800, "step": 5400 }, { - "epoch": 0.6190639661288477, - "grad_norm": 0.5523199439048767, - "learning_rate": 1.955860176580729e-05, - "loss": 1.1733, + "epoch": 0.2597839135654262, + "grad_norm": 0.48362815380096436, + "learning_rate": 4.413978853829342e-05, + "loss": 0.8574, "num_input_tokens_seen": 44318720, "step": 5410 }, { - "epoch": 0.620208261814853, - "grad_norm": 0.5241261124610901, - "learning_rate": 1.945959419304226e-05, - "loss": 1.2493, + "epoch": 0.2602641056422569, + "grad_norm": 0.5605273842811584, + "learning_rate": 4.411427812044049e-05, + "loss": 0.9521, "num_input_tokens_seen": 44400640, "step": 5420 }, { - "epoch": 0.6213525575008583, - "grad_norm": 0.5371418595314026, - "learning_rate": 1.936067792033061e-05, - "loss": 1.2361, + "epoch": 0.26074429771908764, + "grad_norm": 0.5811429619789124, + "learning_rate": 4.408871970370096e-05, + "loss": 1.0423, "num_input_tokens_seen": 44482560, "step": 5430 }, { - "epoch": 0.6224968531868634, - "grad_norm": 0.5102439522743225, - "learning_rate": 1.9261854577708366e-05, - "loss": 1.2999, + "epoch": 0.2612244897959184, + "grad_norm": 0.7415308952331543, + "learning_rate": 4.406311335225595e-05, + "loss": 0.9938, "num_input_tokens_seen": 44564480, "step": 5440 }, { - "epoch": 0.6236411488728687, - "grad_norm": 0.9786542654037476, - "learning_rate": 1.9163125793680125e-05, - "loss": 1.4174, + "epoch": 0.2617046818727491, + "grad_norm": 0.6823397874832153, + "learning_rate": 4.4037459130406923e-05, + "loss": 0.8559, "num_input_tokens_seen": 44646400, "step": 5450 }, { - "epoch": 0.624785444558874, - "grad_norm": 0.5264445543289185, - "learning_rate": 1.9064493195192293e-05, - "loss": 1.0868, + "epoch": 0.26218487394957984, + "grad_norm": 0.5368736386299133, + "learning_rate": 4.401175710257558e-05, + "loss": 0.9533, "num_input_tokens_seen": 44728320, "step": 5460 }, { - "epoch": 0.6259297402448792, - "grad_norm": 0.5219578742980957, - "learning_rate": 1.8965958407606236e-05, - "loss": 1.1703, + "epoch": 0.26266506602641054, + "grad_norm": 0.5195080637931824, + "learning_rate": 4.398600733330365e-05, + "loss": 0.8958, "num_input_tokens_seen": 44810240, "step": 5470 }, { - "epoch": 0.6270740359308845, - "grad_norm": 0.4690404534339905, - "learning_rate": 1.8867523054671475e-05, - "loss": 1.2942, + "epoch": 0.2631452581032413, + "grad_norm": 0.5098795890808105, + "learning_rate": 4.3960209887252766e-05, + "loss": 1.1406, "num_input_tokens_seen": 44892160, "step": 5480 }, { - "epoch": 0.6282183316168898, - "grad_norm": 0.5699681043624878, - "learning_rate": 1.8769188758498973e-05, - "loss": 1.1778, + "epoch": 0.26362545018007205, + "grad_norm": 0.5225921273231506, + "learning_rate": 4.3934364829204265e-05, + "loss": 1.0363, "num_input_tokens_seen": 44974080, "step": 5490 }, { - "epoch": 0.629362627302895, - "grad_norm": 0.4668191075325012, - "learning_rate": 1.867095713953439e-05, - "loss": 1.2899, + "epoch": 0.26410564225690275, + "grad_norm": 0.5075384378433228, + "learning_rate": 4.3908472224059064e-05, + "loss": 0.876, "num_input_tokens_seen": 45056000, "step": 5500 }, { - "epoch": 0.6305069229889003, - "grad_norm": 1.0437618494033813, - "learning_rate": 1.8572829816531364e-05, - "loss": 1.2486, + "epoch": 0.2645858343337335, + "grad_norm": 0.5186338424682617, + "learning_rate": 4.388253213683747e-05, + "loss": 0.8846, "num_input_tokens_seen": 45137920, "step": 5510 }, { - "epoch": 0.6316512186749056, - "grad_norm": 0.4803614318370819, - "learning_rate": 1.847480840652483e-05, - "loss": 1.2433, + "epoch": 0.2650660264105642, + "grad_norm": 0.5157744884490967, + "learning_rate": 4.385654463267901e-05, + "loss": 0.841, "num_input_tokens_seen": 45219840, "step": 5520 }, { - "epoch": 0.6327955143609109, - "grad_norm": 0.5359634160995483, - "learning_rate": 1.8376894524804416e-05, - "loss": 1.2503, + "epoch": 0.26554621848739496, + "grad_norm": 0.49746039509773254, + "learning_rate": 4.383050977684231e-05, + "loss": 1.1015, "num_input_tokens_seen": 45301760, "step": 5530 }, { - "epoch": 0.6339398100469161, - "grad_norm": 0.4710131883621216, - "learning_rate": 1.827908978488779e-05, - "loss": 1.3593, + "epoch": 0.2660264105642257, + "grad_norm": 0.5038164258003235, + "learning_rate": 4.3804427634704885e-05, + "loss": 0.7614, "num_input_tokens_seen": 45383680, "step": 5540 }, { - "epoch": 0.6350841057329214, - "grad_norm": 0.4845348298549652, - "learning_rate": 1.8181395798494048e-05, - "loss": 1.3471, + "epoch": 0.2665066026410564, + "grad_norm": 0.4780644476413727, + "learning_rate": 4.3778298271762995e-05, + "loss": 0.8541, "num_input_tokens_seen": 45465600, "step": 5550 }, { - "epoch": 0.6362284014189267, - "grad_norm": 0.6170366406440735, - "learning_rate": 1.8083814175517234e-05, - "loss": 1.3265, + "epoch": 0.26698679471788717, + "grad_norm": 0.49261558055877686, + "learning_rate": 4.375212175363149e-05, + "loss": 0.9464, "num_input_tokens_seen": 45547520, "step": 5560 }, { - "epoch": 0.6373726971049319, - "grad_norm": 0.5501654148101807, - "learning_rate": 1.798634652399972e-05, - "loss": 1.6024, + "epoch": 0.26746698679471786, + "grad_norm": 0.256661593914032, + "learning_rate": 4.372589814604362e-05, + "loss": 0.8231, "num_input_tokens_seen": 45629440, "step": 5570 }, { - "epoch": 0.6385169927909372, - "grad_norm": 1.0594714879989624, - "learning_rate": 1.7888994450105788e-05, - "loss": 1.2416, + "epoch": 0.2679471788715486, + "grad_norm": 0.7495452761650085, + "learning_rate": 4.369962751485089e-05, + "loss": 0.8247, "num_input_tokens_seen": 45711360, "step": 5580 }, { - "epoch": 0.6396612884769425, - "grad_norm": 0.4413740634918213, - "learning_rate": 1.7791759558095077e-05, - "loss": 1.3582, + "epoch": 0.2684273709483794, + "grad_norm": 0.6870756149291992, + "learning_rate": 4.367330992602289e-05, + "loss": 0.7793, "num_input_tokens_seen": 45793280, "step": 5590 }, { - "epoch": 0.6408055841629478, - "grad_norm": 0.6288554668426514, - "learning_rate": 1.7694643450296216e-05, - "loss": 1.4037, + "epoch": 0.2689075630252101, + "grad_norm": 0.5133636593818665, + "learning_rate": 4.3646945445647114e-05, + "loss": 0.7808, "num_input_tokens_seen": 45875200, "step": 5600 }, { - "epoch": 0.641949879848953, - "grad_norm": 0.5537555813789368, - "learning_rate": 1.7597647727080408e-05, - "loss": 1.3859, + "epoch": 0.2693877551020408, + "grad_norm": 0.48444893956184387, + "learning_rate": 4.362053413992883e-05, + "loss": 1.023, "num_input_tokens_seen": 45957120, "step": 5610 }, { - "epoch": 0.6430941755349582, - "grad_norm": 0.5998051762580872, - "learning_rate": 1.7500773986835013e-05, - "loss": 1.0984, + "epoch": 0.2698679471788715, + "grad_norm": 0.4823382794857025, + "learning_rate": 4.359407607519088e-05, + "loss": 0.7344, "num_input_tokens_seen": 46039040, "step": 5620 }, { - "epoch": 0.6442384712209634, - "grad_norm": 0.8084359765052795, - "learning_rate": 1.740402382593727e-05, - "loss": 1.3061, + "epoch": 0.2703481392557023, + "grad_norm": 0.4958154261112213, + "learning_rate": 4.356757131787353e-05, + "loss": 0.9653, "num_input_tokens_seen": 46120960, "step": 5630 }, { - "epoch": 0.6453827669069687, - "grad_norm": 0.8041592240333557, - "learning_rate": 1.730739883872795e-05, - "loss": 1.0178, + "epoch": 0.27082833133253303, + "grad_norm": 2.4843976497650146, + "learning_rate": 4.354101993453429e-05, + "loss": 0.8851, "num_input_tokens_seen": 46202880, "step": 5640 }, { - "epoch": 0.646527062592974, - "grad_norm": 0.5870161652565002, - "learning_rate": 1.7210900617485075e-05, - "loss": 1.2927, + "epoch": 0.27130852340936373, + "grad_norm": 0.5063963532447815, + "learning_rate": 4.3514421991847746e-05, + "loss": 0.8417, "num_input_tokens_seen": 46284800, "step": 5650 }, { - "epoch": 0.6476713582789793, - "grad_norm": 0.4723074734210968, - "learning_rate": 1.711453075239773e-05, - "loss": 1.205, + "epoch": 0.2717887154861945, + "grad_norm": 0.5009209513664246, + "learning_rate": 4.3487777556605446e-05, + "loss": 0.8548, "num_input_tokens_seen": 46366720, "step": 5660 }, { - "epoch": 0.6488156539649845, - "grad_norm": 0.9023962020874023, - "learning_rate": 1.7018290831539795e-05, - "loss": 1.378, + "epoch": 0.2722689075630252, + "grad_norm": 0.7282830476760864, + "learning_rate": 4.3461086695715625e-05, + "loss": 1.0355, "num_input_tokens_seen": 46448640, "step": 5670 }, { - "epoch": 0.6499599496509898, - "grad_norm": 0.7521951794624329, - "learning_rate": 1.6922182440843843e-05, - "loss": 1.0102, + "epoch": 0.27274909963985594, + "grad_norm": 0.5339224934577942, + "learning_rate": 4.343434947620315e-05, + "loss": 0.799, "num_input_tokens_seen": 46530560, "step": 5680 }, { - "epoch": 0.6511042453369951, - "grad_norm": 0.4786534309387207, - "learning_rate": 1.6826207164074924e-05, - "loss": 1.2101, + "epoch": 0.2732292917166867, + "grad_norm": 0.5233862400054932, + "learning_rate": 4.340756596520929e-05, + "loss": 0.9427, "num_input_tokens_seen": 46612480, "step": 5690 }, { - "epoch": 0.6522485410230003, - "grad_norm": 1.202210783958435, - "learning_rate": 1.6730366582804535e-05, - "loss": 1.1996, + "epoch": 0.2737094837935174, + "grad_norm": 0.487689733505249, + "learning_rate": 4.338073622999154e-05, + "loss": 0.904, "num_input_tokens_seen": 46694400, "step": 5700 }, { - "epoch": 0.6533928367090056, - "grad_norm": 0.4971697926521301, - "learning_rate": 1.6634662276384548e-05, - "loss": 1.2356, + "epoch": 0.27418967587034815, + "grad_norm": 0.5029451251029968, + "learning_rate": 4.335386033792347e-05, + "loss": 1.2054, "num_input_tokens_seen": 46776320, "step": 5710 }, { - "epoch": 0.6545371323950109, - "grad_norm": 0.5253583788871765, - "learning_rate": 1.6539095821921136e-05, - "loss": 1.2006, + "epoch": 0.27466986794717885, + "grad_norm": 0.5146957635879517, + "learning_rate": 4.332693835649461e-05, + "loss": 0.8869, "num_input_tokens_seen": 46858240, "step": 5720 }, { - "epoch": 0.6556814280810161, - "grad_norm": 0.558186411857605, - "learning_rate": 1.6443668794248828e-05, - "loss": 1.2621, + "epoch": 0.2751500600240096, + "grad_norm": 0.5206642150878906, + "learning_rate": 4.329997035331015e-05, + "loss": 0.9947, "num_input_tokens_seen": 46940160, "step": 5730 }, { - "epoch": 0.6568257237670214, - "grad_norm": 1.3204737901687622, - "learning_rate": 1.6348382765904567e-05, - "loss": 1.0727, + "epoch": 0.27563025210084036, + "grad_norm": 0.5068042278289795, + "learning_rate": 4.3272956396090906e-05, + "loss": 0.8496, "num_input_tokens_seen": 47022080, "step": 5740 }, { - "epoch": 0.6579700194530267, - "grad_norm": 0.5247931480407715, - "learning_rate": 1.6253239307101748e-05, - "loss": 1.212, + "epoch": 0.27611044417767105, + "grad_norm": 0.5032111406326294, + "learning_rate": 4.324589655267306e-05, + "loss": 0.9618, "num_input_tokens_seen": 47104000, "step": 5750 }, { - "epoch": 0.659114315139032, - "grad_norm": 0.4864051043987274, - "learning_rate": 1.6158239985704378e-05, - "loss": 1.3306, + "epoch": 0.2765906362545018, + "grad_norm": 0.6475026607513428, + "learning_rate": 4.321879089100805e-05, + "loss": 1.065, "num_input_tokens_seen": 47185920, "step": 5760 }, { - "epoch": 0.6602586108250372, - "grad_norm": 0.8225966691970825, - "learning_rate": 1.606338636720125e-05, - "loss": 1.0573, + "epoch": 0.2770708283313325, + "grad_norm": 0.9474783539772034, + "learning_rate": 4.319163947916234e-05, + "loss": 0.9001, "num_input_tokens_seen": 47267840, "step": 5770 }, { - "epoch": 0.6614029065110425, - "grad_norm": 1.1698306798934937, - "learning_rate": 1.5968680014680105e-05, - "loss": 1.4235, + "epoch": 0.27755102040816326, + "grad_norm": 0.48900583386421204, + "learning_rate": 4.316444238531729e-05, + "loss": 0.9649, "num_input_tokens_seen": 47349760, "step": 5780 }, { - "epoch": 0.6625472021970478, - "grad_norm": 0.4029783010482788, - "learning_rate": 1.5874122488801888e-05, - "loss": 1.1993, + "epoch": 0.278031212484994, + "grad_norm": 0.5249919891357422, + "learning_rate": 4.313719967776899e-05, + "loss": 0.9746, "num_input_tokens_seen": 47431680, "step": 5790 }, { - "epoch": 0.6636914978830529, - "grad_norm": 0.4945356249809265, - "learning_rate": 1.577971534777507e-05, - "loss": 1.1437, + "epoch": 0.2785114045618247, + "grad_norm": 0.5116479992866516, + "learning_rate": 4.310991142492805e-05, + "loss": 1.0193, "num_input_tokens_seen": 47513600, "step": 5800 }, { - "epoch": 0.6648357935690582, - "grad_norm": 0.5037471652030945, - "learning_rate": 1.5685460147329917e-05, - "loss": 1.2579, + "epoch": 0.27899159663865547, + "grad_norm": 0.4895400404930115, + "learning_rate": 4.308257769531947e-05, + "loss": 0.9313, "num_input_tokens_seen": 47595520, "step": 5810 }, { - "epoch": 0.6659800892550635, - "grad_norm": 0.5378947257995605, - "learning_rate": 1.5591358440692865e-05, - "loss": 1.2693, + "epoch": 0.27947178871548617, + "grad_norm": 0.5116353034973145, + "learning_rate": 4.3055198557582445e-05, + "loss": 1.0058, "num_input_tokens_seen": 47677440, "step": 5820 }, { - "epoch": 0.6671243849410687, - "grad_norm": 0.49818259477615356, - "learning_rate": 1.5497411778560954e-05, - "loss": 1.1159, + "epoch": 0.2799519807923169, + "grad_norm": 0.6602368354797363, + "learning_rate": 4.3027774080470174e-05, + "loss": 1.1261, "num_input_tokens_seen": 47759360, "step": 5830 }, { - "epoch": 0.668268680627074, - "grad_norm": 0.4517885744571686, - "learning_rate": 1.5403621709076247e-05, - "loss": 1.1901, + "epoch": 0.2804321728691477, + "grad_norm": 0.5150447487831116, + "learning_rate": 4.300030433284974e-05, + "loss": 0.8111, "num_input_tokens_seen": 47841280, "step": 5840 }, { - "epoch": 0.6694129763130793, - "grad_norm": 0.8571302890777588, - "learning_rate": 1.530998977780033e-05, - "loss": 1.4804, + "epoch": 0.2809123649459784, + "grad_norm": 0.49057701230049133, + "learning_rate": 4.29727893837019e-05, + "loss": 0.7262, "num_input_tokens_seen": 47923200, "step": 5850 }, { - "epoch": 0.6705572719990845, - "grad_norm": 0.8022615313529968, - "learning_rate": 1.5216517527688818e-05, - "loss": 1.1893, + "epoch": 0.28139255702280913, + "grad_norm": 0.5138354897499084, + "learning_rate": 4.294522930212091e-05, + "loss": 0.809, "num_input_tokens_seen": 48005120, "step": 5860 }, { - "epoch": 0.6717015676850898, - "grad_norm": 0.630002498626709, - "learning_rate": 1.5123206499065967e-05, - "loss": 1.085, + "epoch": 0.28187274909963983, + "grad_norm": 0.5236837267875671, + "learning_rate": 4.291762415731437e-05, + "loss": 0.9162, "num_input_tokens_seen": 48087040, "step": 5870 }, { - "epoch": 0.6728458633710951, - "grad_norm": 0.49356332421302795, - "learning_rate": 1.5030058229599275e-05, - "loss": 1.0382, + "epoch": 0.2823529411764706, + "grad_norm": 0.5018298625946045, + "learning_rate": 4.288997401860303e-05, + "loss": 1.0343, "num_input_tokens_seen": 48168960, "step": 5880 }, { - "epoch": 0.6739901590571004, - "grad_norm": 0.6504026055335999, - "learning_rate": 1.4937074254274117e-05, - "loss": 1.3036, + "epoch": 0.28283313325330134, + "grad_norm": 0.4813729524612427, + "learning_rate": 4.286227895542064e-05, + "loss": 1.36, "num_input_tokens_seen": 48250880, "step": 5890 }, { - "epoch": 0.6751344547431056, - "grad_norm": 0.4629366993904114, - "learning_rate": 1.4844256105368504e-05, - "loss": 1.3465, + "epoch": 0.28331332533013204, + "grad_norm": 0.44931960105895996, + "learning_rate": 4.283453903731375e-05, + "loss": 0.8472, "num_input_tokens_seen": 48332800, "step": 5900 }, { - "epoch": 0.6762787504291109, - "grad_norm": 0.4517896771430969, - "learning_rate": 1.4751605312427786e-05, - "loss": 1.4707, + "epoch": 0.2837935174069628, + "grad_norm": 0.5551528930664062, + "learning_rate": 4.2806754333941546e-05, + "loss": 1.1295, "num_input_tokens_seen": 48414720, "step": 5910 }, { - "epoch": 0.6774230461151162, - "grad_norm": 1.4166773557662964, - "learning_rate": 1.4659123402239454e-05, - "loss": 1.1061, + "epoch": 0.2842737094837935, + "grad_norm": 0.6064106822013855, + "learning_rate": 4.2778924915075704e-05, + "loss": 0.9477, "num_input_tokens_seen": 48496640, "step": 5920 }, { - "epoch": 0.6785673418011214, - "grad_norm": 0.7780046463012695, - "learning_rate": 1.4566811898808013e-05, - "loss": 1.2819, + "epoch": 0.28475390156062425, + "grad_norm": 0.48876288533210754, + "learning_rate": 4.275105085060014e-05, + "loss": 0.93, "num_input_tokens_seen": 48578560, "step": 5930 }, { - "epoch": 0.6797116374871267, - "grad_norm": 0.55788654088974, - "learning_rate": 1.4474672323329819e-05, - "loss": 1.4707, + "epoch": 0.285234093637455, + "grad_norm": 0.4730078876018524, + "learning_rate": 4.272313221051094e-05, + "loss": 1.0398, "num_input_tokens_seen": 48660480, "step": 5940 }, { - "epoch": 0.680855933173132, - "grad_norm": 0.5268608331680298, - "learning_rate": 1.4382706194168066e-05, - "loss": 1.4235, + "epoch": 0.2857142857142857, + "grad_norm": 0.40384843945503235, + "learning_rate": 4.269516906491607e-05, + "loss": 0.9514, "num_input_tokens_seen": 48742400, "step": 5950 }, { - "epoch": 0.6820002288591372, - "grad_norm": 0.48586344718933105, - "learning_rate": 1.42909150268277e-05, - "loss": 1.2991, + "epoch": 0.28619447779111645, + "grad_norm": 0.5116029381752014, + "learning_rate": 4.266716148403529e-05, + "loss": 0.7283, "num_input_tokens_seen": 48824320, "step": 5960 }, { - "epoch": 0.6831445245451425, - "grad_norm": 0.5041683912277222, - "learning_rate": 1.4199300333930515e-05, - "loss": 1.0258, + "epoch": 0.28667466986794715, + "grad_norm": 0.5653484463691711, + "learning_rate": 4.263910953819993e-05, + "loss": 0.9808, "num_input_tokens_seen": 48906240, "step": 5970 }, { - "epoch": 0.6842888202311477, - "grad_norm": 0.5228765606880188, - "learning_rate": 1.4107863625190163e-05, - "loss": 1.2446, + "epoch": 0.2871548619447779, + "grad_norm": 0.4929262697696686, + "learning_rate": 4.2611013297852744e-05, + "loss": 0.9758, "num_input_tokens_seen": 48988160, "step": 5980 }, { - "epoch": 0.685433115917153, - "grad_norm": 0.4611828029155731, - "learning_rate": 1.4016606407387312e-05, - "loss": 1.2735, + "epoch": 0.28763505402160866, + "grad_norm": 0.5140427947044373, + "learning_rate": 4.2582872833547693e-05, + "loss": 0.9226, "num_input_tokens_seen": 49070080, "step": 5990 }, { - "epoch": 0.6865774116031582, - "grad_norm": 0.543980598449707, - "learning_rate": 1.3925530184344818e-05, - "loss": 1.3767, + "epoch": 0.28811524609843936, + "grad_norm": 0.4957669973373413, + "learning_rate": 4.255468821594981e-05, + "loss": 1.0878, "num_input_tokens_seen": 49152000, "step": 6000 }, { - "epoch": 0.6877217072891635, - "grad_norm": 0.5816802978515625, - "learning_rate": 1.3834636456902944e-05, - "loss": 1.4241, + "epoch": 0.2885954381752701, + "grad_norm": 0.5024486184120178, + "learning_rate": 4.2526459515834996e-05, + "loss": 0.917, "num_input_tokens_seen": 49233920, "step": 6010 }, { - "epoch": 0.6888660029751688, - "grad_norm": 0.4795803725719452, - "learning_rate": 1.3743926722894579e-05, - "loss": 1.1986, + "epoch": 0.28907563025210087, + "grad_norm": 0.5399232506752014, + "learning_rate": 4.249818680408984e-05, + "loss": 1.0098, "num_input_tokens_seen": 49315840, "step": 6020 }, { - "epoch": 0.690010298661174, - "grad_norm": 0.4873317778110504, - "learning_rate": 1.365340247712064e-05, - "loss": 0.9731, + "epoch": 0.28955582232893157, + "grad_norm": 0.5091607570648193, + "learning_rate": 4.246987015171148e-05, + "loss": 1.0573, "num_input_tokens_seen": 49397760, "step": 6030 }, { - "epoch": 0.6911545943471793, - "grad_norm": 0.5345495939254761, - "learning_rate": 1.3563065211325349e-05, - "loss": 1.2101, + "epoch": 0.2900360144057623, + "grad_norm": 0.48257771134376526, + "learning_rate": 4.244150962980735e-05, + "loss": 1.1285, "num_input_tokens_seen": 49479680, "step": 6040 }, { - "epoch": 0.6922988900331846, - "grad_norm": 0.6827751398086548, - "learning_rate": 1.3472916414171738e-05, - "loss": 1.0661, + "epoch": 0.290516206482593, + "grad_norm": 0.7823185324668884, + "learning_rate": 4.2413105309595105e-05, + "loss": 0.9014, "num_input_tokens_seen": 49561600, "step": 6050 }, { - "epoch": 0.6934431857191898, - "grad_norm": 0.533750057220459, - "learning_rate": 1.338295757121703e-05, - "loss": 1.2239, + "epoch": 0.2909963985594238, + "grad_norm": 0.49261143803596497, + "learning_rate": 4.238465726240233e-05, + "loss": 0.8398, "num_input_tokens_seen": 49643520, "step": 6060 }, { - "epoch": 0.6945874814051951, - "grad_norm": 0.47301891446113586, - "learning_rate": 1.3293190164888242e-05, - "loss": 1.3588, + "epoch": 0.29147659063625453, + "grad_norm": 0.5321176648139954, + "learning_rate": 4.235616555966645e-05, + "loss": 0.879, "num_input_tokens_seen": 49725440, "step": 6070 }, { - "epoch": 0.6957317770912004, - "grad_norm": 0.4108991324901581, - "learning_rate": 1.3203615674457709e-05, - "loss": 1.1447, + "epoch": 0.29195678271308523, + "grad_norm": 0.5565729737281799, + "learning_rate": 4.232763027293451e-05, + "loss": 0.7561, "num_input_tokens_seen": 49807360, "step": 6080 }, { - "epoch": 0.6968760727772056, - "grad_norm": 0.5564691424369812, - "learning_rate": 1.3114235576018686e-05, - "loss": 1.3927, + "epoch": 0.292436974789916, + "grad_norm": 0.6039222478866577, + "learning_rate": 4.2299051473862976e-05, + "loss": 1.0367, "num_input_tokens_seen": 49889280, "step": 6090 }, { - "epoch": 0.6980203684632109, - "grad_norm": 0.55669766664505, - "learning_rate": 1.3025051342461087e-05, - "loss": 1.2068, + "epoch": 0.2929171668667467, + "grad_norm": 0.38333648443222046, + "learning_rate": 4.227042923421762e-05, + "loss": 0.979, "num_input_tokens_seen": 49971200, "step": 6100 }, { - "epoch": 0.6991646641492162, - "grad_norm": 0.47335195541381836, - "learning_rate": 1.2936064443447157e-05, - "loss": 1.5996, + "epoch": 0.29339735894357744, + "grad_norm": 0.5529670119285583, + "learning_rate": 4.224176362587326e-05, + "loss": 0.9951, "num_input_tokens_seen": 50053120, "step": 6110 }, { - "epoch": 0.7003089598352215, - "grad_norm": 0.5122705698013306, - "learning_rate": 1.2847276345387299e-05, - "loss": 1.0368, + "epoch": 0.2938775510204082, + "grad_norm": 0.6076828241348267, + "learning_rate": 4.221305472081365e-05, + "loss": 0.8707, "num_input_tokens_seen": 50135040, "step": 6120 }, { - "epoch": 0.7014532555212267, - "grad_norm": 0.47498294711112976, - "learning_rate": 1.2758688511415848e-05, - "loss": 1.3925, + "epoch": 0.2943577430972389, + "grad_norm": 0.49703189730644226, + "learning_rate": 4.2184302591131264e-05, + "loss": 0.9571, "num_input_tokens_seen": 50216960, "step": 6130 }, { - "epoch": 0.702597551207232, - "grad_norm": 0.4896914064884186, - "learning_rate": 1.2670302401367035e-05, - "loss": 1.1923, + "epoch": 0.29483793517406964, + "grad_norm": 0.5342078804969788, + "learning_rate": 4.21555073090271e-05, + "loss": 1.0041, "num_input_tokens_seen": 50298880, "step": 6140 }, { - "epoch": 0.7037418468932373, - "grad_norm": 0.7128376960754395, - "learning_rate": 1.2582119471750888e-05, - "loss": 1.3158, + "epoch": 0.29531812725090034, + "grad_norm": 0.6617572903633118, + "learning_rate": 4.2126668946810545e-05, + "loss": 1.1052, "num_input_tokens_seen": 50380800, "step": 6150 }, { - "epoch": 0.7048861425792424, - "grad_norm": 0.4625462293624878, - "learning_rate": 1.2494141175729216e-05, - "loss": 1.1663, + "epoch": 0.2957983193277311, + "grad_norm": 0.5139254331588745, + "learning_rate": 4.2097787576899144e-05, + "loss": 0.7878, "num_input_tokens_seen": 50462720, "step": 6160 }, { - "epoch": 0.7060304382652477, - "grad_norm": 0.5610952377319336, - "learning_rate": 1.240636896309168e-05, - "loss": 1.1836, + "epoch": 0.29627851140456185, + "grad_norm": 0.5757784247398376, + "learning_rate": 4.2068863271818455e-05, + "loss": 0.8359, "num_input_tokens_seen": 50544640, "step": 6170 }, { - "epoch": 0.707174733951253, - "grad_norm": 0.8366256952285767, - "learning_rate": 1.2318804280231939e-05, - "loss": 1.3311, + "epoch": 0.29675870348139255, + "grad_norm": 0.6105610132217407, + "learning_rate": 4.2039896104201844e-05, + "loss": 0.7906, "num_input_tokens_seen": 50626560, "step": 6180 }, { - "epoch": 0.7083190296372582, - "grad_norm": 0.6703715324401855, - "learning_rate": 1.2231448570123732e-05, - "loss": 1.1546, + "epoch": 0.2972388955582233, + "grad_norm": 0.5291628241539001, + "learning_rate": 4.201088614679032e-05, + "loss": 0.8226, "num_input_tokens_seen": 50708480, "step": 6190 }, { - "epoch": 0.7094633253232635, - "grad_norm": 0.41081807017326355, - "learning_rate": 1.2144303272297186e-05, - "loss": 1.1482, + "epoch": 0.297719087635054, + "grad_norm": 0.5184271335601807, + "learning_rate": 4.198183347243233e-05, + "loss": 0.922, "num_input_tokens_seen": 50790400, "step": 6200 }, { - "epoch": 0.7106076210092688, - "grad_norm": 0.6010390520095825, - "learning_rate": 1.2057369822815051e-05, - "loss": 1.2103, + "epoch": 0.29819927971188476, + "grad_norm": 0.5568735599517822, + "learning_rate": 4.1952738154083614e-05, + "loss": 0.8489, "num_input_tokens_seen": 50872320, "step": 6210 }, { - "epoch": 0.711751916695274, - "grad_norm": 0.5093455910682678, - "learning_rate": 1.1970649654249017e-05, - "loss": 1.5064, + "epoch": 0.2986794717887155, + "grad_norm": 0.5584334135055542, + "learning_rate": 4.1923600264806975e-05, + "loss": 0.9279, "num_input_tokens_seen": 50954240, "step": 6220 }, { - "epoch": 0.7128962123812793, - "grad_norm": 0.45238783955574036, - "learning_rate": 1.1884144195656133e-05, - "loss": 1.3582, + "epoch": 0.2991596638655462, + "grad_norm": 1.1234581470489502, + "learning_rate": 4.189441987777212e-05, + "loss": 0.8793, "num_input_tokens_seen": 51036160, "step": 6230 }, { - "epoch": 0.7140405080672846, - "grad_norm": 0.48569175601005554, - "learning_rate": 1.1797854872555272e-05, - "loss": 1.3876, + "epoch": 0.29963985594237696, + "grad_norm": 0.5137265920639038, + "learning_rate": 4.186519706625549e-05, + "loss": 0.8057, "num_input_tokens_seen": 51118080, "step": 6240 }, { - "epoch": 0.7151848037532899, - "grad_norm": 0.5373899340629578, - "learning_rate": 1.171178310690362e-05, - "loss": 1.07, + "epoch": 0.30012004801920766, + "grad_norm": 0.5186243057250977, + "learning_rate": 4.1835931903640046e-05, + "loss": 1.0145, "num_input_tokens_seen": 51200000, "step": 6250 }, { - "epoch": 0.7163290994392951, - "grad_norm": 0.568634569644928, - "learning_rate": 1.1625930317073221e-05, - "loss": 1.2047, + "epoch": 0.3006002400960384, + "grad_norm": 0.5995609164237976, + "learning_rate": 4.180662446341511e-05, + "loss": 0.9712, "num_input_tokens_seen": 51281920, "step": 6260 }, { - "epoch": 0.7174733951253004, - "grad_norm": 0.49342796206474304, - "learning_rate": 1.154029791782765e-05, - "loss": 1.2819, + "epoch": 0.3010804321728692, + "grad_norm": 0.5310963988304138, + "learning_rate": 4.1777274819176154e-05, + "loss": 0.9031, "num_input_tokens_seen": 51363840, "step": 6270 }, { - "epoch": 0.7186176908113057, - "grad_norm": 0.8063413500785828, - "learning_rate": 1.1454887320298686e-05, - "loss": 1.2803, + "epoch": 0.30156062424969987, + "grad_norm": 0.536044716835022, + "learning_rate": 4.1747883044624644e-05, + "loss": 0.9543, "num_input_tokens_seen": 51445760, "step": 6280 }, { - "epoch": 0.7197619864973109, - "grad_norm": 0.5260705947875977, - "learning_rate": 1.1369699931963018e-05, - "loss": 1.1276, + "epoch": 0.3020408163265306, + "grad_norm": 0.5187882781028748, + "learning_rate": 4.171844921356784e-05, + "loss": 1.0411, "num_input_tokens_seen": 51527680, "step": 6290 }, { - "epoch": 0.7209062821833162, - "grad_norm": 0.7325502038002014, - "learning_rate": 1.1284737156619096e-05, - "loss": 1.1392, + "epoch": 0.3025210084033613, + "grad_norm": 0.526749312877655, + "learning_rate": 4.168897339991861e-05, + "loss": 1.0103, "num_input_tokens_seen": 51609600, "step": 6300 }, { - "epoch": 0.7220505778693215, - "grad_norm": 0.3689436614513397, - "learning_rate": 1.1200000394363996e-05, - "loss": 1.1951, + "epoch": 0.3030012004801921, + "grad_norm": 0.5082546472549438, + "learning_rate": 4.1659455677695245e-05, + "loss": 1.0368, "num_input_tokens_seen": 51691520, "step": 6310 }, { - "epoch": 0.7231948735553267, - "grad_norm": 0.5486378073692322, - "learning_rate": 1.1115491041570337e-05, - "loss": 1.0797, + "epoch": 0.30348139255702283, + "grad_norm": 0.513116180896759, + "learning_rate": 4.162989612102128e-05, + "loss": 0.6589, "num_input_tokens_seen": 51773440, "step": 6320 }, { - "epoch": 0.724339169241332, - "grad_norm": 0.4265560209751129, - "learning_rate": 1.103121049086324e-05, - "loss": 1.3091, + "epoch": 0.30396158463385353, + "grad_norm": 0.5046403408050537, + "learning_rate": 4.160029480412529e-05, + "loss": 0.9936, "num_input_tokens_seen": 51855360, "step": 6330 }, { - "epoch": 0.7254834649273372, - "grad_norm": 0.5804295539855957, - "learning_rate": 1.094716013109745e-05, - "loss": 1.1386, + "epoch": 0.3044417767106843, + "grad_norm": 0.6154318451881409, + "learning_rate": 4.1570651801340735e-05, + "loss": 0.8154, "num_input_tokens_seen": 51937280, "step": 6340 }, { - "epoch": 0.7266277606133424, - "grad_norm": 0.5425553321838379, - "learning_rate": 1.0863341347334376e-05, - "loss": 1.2296, + "epoch": 0.304921968787515, + "grad_norm": 0.5283302664756775, + "learning_rate": 4.1540967187105753e-05, + "loss": 1.0798, "num_input_tokens_seen": 52019200, "step": 6350 }, { - "epoch": 0.7277720562993477, - "grad_norm": 0.584921658039093, - "learning_rate": 1.0779755520819302e-05, - "loss": 1.2027, + "epoch": 0.30540216086434574, + "grad_norm": 0.5067933797836304, + "learning_rate": 4.151124103596295e-05, + "loss": 0.8155, "num_input_tokens_seen": 52101120, "step": 6360 }, { - "epoch": 0.728916351985353, - "grad_norm": 0.6812318563461304, - "learning_rate": 1.0696404028958634e-05, - "loss": 1.1153, + "epoch": 0.3058823529411765, + "grad_norm": 0.4994664788246155, + "learning_rate": 4.148147342255926e-05, + "loss": 0.9705, "num_input_tokens_seen": 52183040, "step": 6370 }, { - "epoch": 0.7300606476713583, - "grad_norm": 0.4892879128456116, - "learning_rate": 1.0613288245297193e-05, - "loss": 1.097, + "epoch": 0.3063625450180072, + "grad_norm": 0.4871218800544739, + "learning_rate": 4.145166442164573e-05, + "loss": 0.849, "num_input_tokens_seen": 52264960, "step": 6380 }, { - "epoch": 0.7312049433573635, - "grad_norm": 0.4659218490123749, - "learning_rate": 1.053040953949557e-05, - "loss": 1.2608, + "epoch": 0.30684273709483795, + "grad_norm": 0.5769742727279663, + "learning_rate": 4.142181410807735e-05, + "loss": 0.9561, "num_input_tokens_seen": 52346880, "step": 6390 }, { - "epoch": 0.7323492390433688, - "grad_norm": 0.5382397770881653, - "learning_rate": 1.0447769277307554e-05, - "loss": 1.0397, + "epoch": 0.30732292917166865, + "grad_norm": 0.8338456153869629, + "learning_rate": 4.1391922556812815e-05, + "loss": 0.9117, "num_input_tokens_seen": 52428800, "step": 6400 }, { - "epoch": 0.7334935347293741, - "grad_norm": 0.5254513621330261, - "learning_rate": 1.0365368820557633e-05, - "loss": 1.0589, + "epoch": 0.3078031212484994, + "grad_norm": 0.521043062210083, + "learning_rate": 4.136198984291442e-05, + "loss": 0.8657, "num_input_tokens_seen": 52510720, "step": 6410 }, { - "epoch": 0.7346378304153793, - "grad_norm": 1.027398943901062, - "learning_rate": 1.0283209527118584e-05, - "loss": 1.0307, + "epoch": 0.30828331332533015, + "grad_norm": 0.4783630073070526, + "learning_rate": 4.133201604154779e-05, + "loss": 0.9356, "num_input_tokens_seen": 52592640, "step": 6420 }, { - "epoch": 0.7357821261013846, - "grad_norm": 0.5271130800247192, - "learning_rate": 1.0201292750889022e-05, - "loss": 1.3241, + "epoch": 0.30876350540216085, + "grad_norm": 0.6084679961204529, + "learning_rate": 4.1302001227981765e-05, + "loss": 1.0838, "num_input_tokens_seen": 52674560, "step": 6430 }, { - "epoch": 0.7369264217873899, - "grad_norm": 0.45055344700813293, - "learning_rate": 1.011961984177117e-05, - "loss": 1.1761, + "epoch": 0.3092436974789916, + "grad_norm": 0.5003706812858582, + "learning_rate": 4.1271945477588126e-05, + "loss": 0.7471, "num_input_tokens_seen": 52756480, "step": 6440 }, { - "epoch": 0.7380707174733951, - "grad_norm": 0.5612509846687317, - "learning_rate": 1.0038192145648567e-05, - "loss": 1.0847, + "epoch": 0.3097238895558223, + "grad_norm": 0.5811577439308167, + "learning_rate": 4.124184886584148e-05, + "loss": 0.9374, "num_input_tokens_seen": 52838400, "step": 6450 }, { - "epoch": 0.7392150131594004, - "grad_norm": 0.5806179642677307, - "learning_rate": 9.95701100436389e-06, - "loss": 1.1278, + "epoch": 0.31020408163265306, + "grad_norm": 0.5085806250572205, + "learning_rate": 4.121171146831905e-05, + "loss": 1.0186, "num_input_tokens_seen": 52920320, "step": 6460 }, { - "epoch": 0.7403593088454057, - "grad_norm": 0.5459727048873901, - "learning_rate": 9.876077755696868e-06, - "loss": 1.2579, + "epoch": 0.3106842737094838, + "grad_norm": 0.5559006333351135, + "learning_rate": 4.118153336070045e-05, + "loss": 1.1422, "num_input_tokens_seen": 53002240, "step": 6470 }, { - "epoch": 0.741503604531411, - "grad_norm": 0.4346022307872772, - "learning_rate": 9.795393733342203e-06, - "loss": 1.3319, + "epoch": 0.3111644657863145, + "grad_norm": 0.7885730266571045, + "learning_rate": 4.115131461876756e-05, + "loss": 1.0219, "num_input_tokens_seen": 53084160, "step": 6480 }, { - "epoch": 0.7426479002174162, - "grad_norm": 1.9029626846313477, - "learning_rate": 9.71496026688763e-06, - "loss": 1.4099, + "epoch": 0.31164465786314527, + "grad_norm": 0.5197670459747314, + "learning_rate": 4.1121055318404264e-05, + "loss": 0.9806, "num_input_tokens_seen": 53166080, "step": 6490 }, { - "epoch": 0.7437921959034215, - "grad_norm": 0.5004605054855347, - "learning_rate": 9.634778681791962e-06, - "loss": 1.2894, + "epoch": 0.31212484993997597, + "grad_norm": 0.4977589249610901, + "learning_rate": 4.109075553559633e-05, + "loss": 1.1074, "num_input_tokens_seen": 53248000, "step": 6500 }, { - "epoch": 0.7449364915894267, - "grad_norm": 0.7425548434257507, - "learning_rate": 9.554850299363294e-06, - "loss": 1.4331, + "epoch": 0.3126050420168067, + "grad_norm": 0.46243059635162354, + "learning_rate": 4.1060415346431134e-05, + "loss": 0.93, "num_input_tokens_seen": 53329920, "step": 6510 }, { - "epoch": 0.7460807872754319, - "grad_norm": 0.507979691028595, - "learning_rate": 9.47517643673721e-06, - "loss": 1.1623, + "epoch": 0.3130852340936375, + "grad_norm": 0.6155077219009399, + "learning_rate": 4.103003482709758e-05, + "loss": 0.9669, "num_input_tokens_seen": 53411840, "step": 6520 }, { - "epoch": 0.7472250829614372, - "grad_norm": 0.47209110856056213, - "learning_rate": 9.395758406855053e-06, - "loss": 1.195, + "epoch": 0.3135654261704682, + "grad_norm": 0.6652863025665283, + "learning_rate": 4.0999614053885795e-05, + "loss": 0.7738, "num_input_tokens_seen": 53493760, "step": 6530 }, { - "epoch": 0.7483693786474425, - "grad_norm": 0.4794630706310272, - "learning_rate": 9.31659751844232e-06, - "loss": 1.2136, + "epoch": 0.31404561824729893, + "grad_norm": 0.538733184337616, + "learning_rate": 4.096915310318702e-05, + "loss": 0.8493, "num_input_tokens_seen": 53575680, "step": 6540 }, { - "epoch": 0.7495136743334477, - "grad_norm": 0.48791974782943726, - "learning_rate": 9.237695075987106e-06, - "loss": 1.476, + "epoch": 0.31452581032412963, + "grad_norm": 0.5088375210762024, + "learning_rate": 4.093865205149337e-05, + "loss": 0.9464, "num_input_tokens_seen": 53657600, "step": 6550 }, { - "epoch": 0.750657970019453, - "grad_norm": 0.4857141971588135, - "learning_rate": 9.15905237971856e-06, - "loss": 1.3678, + "epoch": 0.3150060024009604, + "grad_norm": 0.5455616116523743, + "learning_rate": 4.090811097539768e-05, + "loss": 0.7853, "num_input_tokens_seen": 53739520, "step": 6560 }, { - "epoch": 0.7518022657054583, - "grad_norm": 0.5270280241966248, - "learning_rate": 9.080670725585511e-06, - "loss": 1.4367, + "epoch": 0.31548619447779114, + "grad_norm": 0.5154474377632141, + "learning_rate": 4.087752995159327e-05, + "loss": 0.764, "num_input_tokens_seen": 53821440, "step": 6570 }, { - "epoch": 0.7529465613914635, - "grad_norm": 0.48381784558296204, - "learning_rate": 9.002551405235082e-06, - "loss": 1.2801, + "epoch": 0.31596638655462184, + "grad_norm": 0.5074660181999207, + "learning_rate": 4.084690905687379e-05, + "loss": 0.9871, "num_input_tokens_seen": 53903360, "step": 6580 }, { - "epoch": 0.7540908570774688, - "grad_norm": 0.7849867343902588, - "learning_rate": 8.924695705991407e-06, - "loss": 1.4574, + "epoch": 0.3164465786314526, + "grad_norm": 0.510628879070282, + "learning_rate": 4.0816248368133016e-05, + "loss": 1.1464, "num_input_tokens_seen": 53985280, "step": 6590 }, { - "epoch": 0.7552351527634741, - "grad_norm": 0.45584842562675476, - "learning_rate": 8.847104910834414e-06, - "loss": 1.4978, + "epoch": 0.3169267707082833, + "grad_norm": 0.6401832699775696, + "learning_rate": 4.078554796236462e-05, + "loss": 0.7681, "num_input_tokens_seen": 54067200, "step": 6600 }, { - "epoch": 0.7563794484494794, - "grad_norm": 0.5474436283111572, - "learning_rate": 8.769780298378705e-06, - "loss": 1.277, + "epoch": 0.31740696278511404, + "grad_norm": 0.5684659481048584, + "learning_rate": 4.0754807916662055e-05, + "loss": 1.0146, "num_input_tokens_seen": 54149120, "step": 6610 }, { - "epoch": 0.7575237441354846, - "grad_norm": 0.49009135365486145, - "learning_rate": 8.69272314285248e-06, - "loss": 1.0243, + "epoch": 0.3178871548619448, + "grad_norm": 0.5699182152748108, + "learning_rate": 4.072402830821829e-05, + "loss": 0.9229, "num_input_tokens_seen": 54231040, "step": 6620 }, { - "epoch": 0.7586680398214899, - "grad_norm": 0.4982384443283081, - "learning_rate": 8.6159347140765e-06, - "loss": 1.2217, + "epoch": 0.3183673469387755, + "grad_norm": 0.5109874606132507, + "learning_rate": 4.069320921432564e-05, + "loss": 0.9067, "num_input_tokens_seen": 54312960, "step": 6630 }, { - "epoch": 0.7598123355074952, - "grad_norm": 0.5559042096138, - "learning_rate": 8.539416277443218e-06, - "loss": 1.2161, + "epoch": 0.31884753901560625, + "grad_norm": 0.498054563999176, + "learning_rate": 4.066235071237559e-05, + "loss": 0.8371, "num_input_tokens_seen": 54394880, "step": 6640 }, { - "epoch": 0.7609566311935004, - "grad_norm": 0.4743431508541107, - "learning_rate": 8.463169093895887e-06, - "loss": 1.069, + "epoch": 0.31932773109243695, + "grad_norm": 0.4968014061450958, + "learning_rate": 4.0631452879858565e-05, + "loss": 0.9809, "num_input_tokens_seen": 54476800, "step": 6650 }, { - "epoch": 0.7621009268795057, - "grad_norm": 0.5154614448547363, - "learning_rate": 8.38719441990781e-06, - "loss": 1.5262, + "epoch": 0.3198079231692677, + "grad_norm": 0.49730050563812256, + "learning_rate": 4.0600515794363774e-05, + "loss": 0.9101, "num_input_tokens_seen": 54558720, "step": 6660 }, { - "epoch": 0.763245222565511, - "grad_norm": 0.5313172340393066, - "learning_rate": 8.311493507461593e-06, - "loss": 1.3362, + "epoch": 0.32028811524609846, + "grad_norm": 0.5172863006591797, + "learning_rate": 4.0569539533578985e-05, + "loss": 0.9802, "num_input_tokens_seen": 54640640, "step": 6670 }, { - "epoch": 0.7643895182515162, - "grad_norm": 0.4694889187812805, - "learning_rate": 8.236067604028563e-06, - "loss": 1.2781, + "epoch": 0.32076830732292916, + "grad_norm": 0.5139868259429932, + "learning_rate": 4.053852417529035e-05, + "loss": 0.7294, "num_input_tokens_seen": 54722560, "step": 6680 }, { - "epoch": 0.7655338139375214, - "grad_norm": 0.5564500689506531, - "learning_rate": 8.160917952548197e-06, - "loss": 1.4282, + "epoch": 0.3212484993997599, + "grad_norm": 0.5264905691146851, + "learning_rate": 4.050746979738218e-05, + "loss": 0.9096, "num_input_tokens_seen": 54804480, "step": 6690 }, { - "epoch": 0.7666781096235267, - "grad_norm": 0.49310502409935, - "learning_rate": 8.08604579140759e-06, - "loss": 1.2766, + "epoch": 0.3217286914765906, + "grad_norm": 0.5231399536132812, + "learning_rate": 4.047637647783681e-05, + "loss": 1.0167, "num_input_tokens_seen": 54886400, "step": 6700 }, { - "epoch": 0.7678224053095319, - "grad_norm": 0.4908527731895447, - "learning_rate": 8.011452354421136e-06, - "loss": 1.4154, + "epoch": 0.32220888355342137, + "grad_norm": 0.6286957859992981, + "learning_rate": 4.044524429473431e-05, + "loss": 0.8882, "num_input_tokens_seen": 54968320, "step": 6710 }, { - "epoch": 0.7689667009955372, - "grad_norm": 0.47903621196746826, - "learning_rate": 7.937138870810115e-06, - "loss": 0.9568, + "epoch": 0.3226890756302521, + "grad_norm": 0.5430505275726318, + "learning_rate": 4.041407332625238e-05, + "loss": 0.8349, "num_input_tokens_seen": 55050240, "step": 6720 }, { - "epoch": 0.7701109966815425, - "grad_norm": 0.820591926574707, - "learning_rate": 7.863106565182474e-06, - "loss": 1.4689, + "epoch": 0.3231692677070828, + "grad_norm": 0.5100399851799011, + "learning_rate": 4.038286365066613e-05, + "loss": 0.8427, "num_input_tokens_seen": 55132160, "step": 6730 }, { - "epoch": 0.7712552923675478, - "grad_norm": 0.5931017994880676, - "learning_rate": 7.78935665751266e-06, - "loss": 1.3219, + "epoch": 0.3236494597839136, + "grad_norm": 0.48778000473976135, + "learning_rate": 4.0351615346347804e-05, + "loss": 0.9561, "num_input_tokens_seen": 55214080, "step": 6740 }, { - "epoch": 0.772399588053553, - "grad_norm": 0.5245538949966431, - "learning_rate": 7.715890363121484e-06, - "loss": 1.1528, + "epoch": 0.3241296518607443, + "grad_norm": 0.5565239191055298, + "learning_rate": 4.032032849176673e-05, + "loss": 0.9102, "num_input_tokens_seen": 55296000, "step": 6750 }, { - "epoch": 0.7735438837395583, - "grad_norm": 0.44471973180770874, - "learning_rate": 7.642708892656125e-06, - "loss": 1.129, + "epoch": 0.324609843937575, + "grad_norm": 0.486905962228775, + "learning_rate": 4.0289003165488976e-05, + "loss": 0.8668, "num_input_tokens_seen": 55377920, "step": 6760 }, { - "epoch": 0.7746881794255636, - "grad_norm": 0.5023803114891052, - "learning_rate": 7.569813452070146e-06, - "loss": 1.0879, + "epoch": 0.3250900360144058, + "grad_norm": 0.5659222602844238, + "learning_rate": 4.025763944617727e-05, + "loss": 0.9355, "num_input_tokens_seen": 55459840, "step": 6770 }, { - "epoch": 0.7758324751115688, - "grad_norm": 0.4996906518936157, - "learning_rate": 7.497205242603636e-06, - "loss": 1.2824, + "epoch": 0.3255702280912365, + "grad_norm": 0.4450863301753998, + "learning_rate": 4.0226237412590696e-05, + "loss": 0.9062, "num_input_tokens_seen": 55541760, "step": 6780 }, { - "epoch": 0.7769767707975741, - "grad_norm": 0.4397946298122406, - "learning_rate": 7.424885460763442e-06, - "loss": 1.2056, + "epoch": 0.32605042016806723, + "grad_norm": 0.4924563467502594, + "learning_rate": 4.019479714358461e-05, + "loss": 0.8195, "num_input_tokens_seen": 55623680, "step": 6790 }, { - "epoch": 0.7781210664835794, - "grad_norm": 0.5805292725563049, - "learning_rate": 7.3528552983033985e-06, - "loss": 1.2665, + "epoch": 0.32653061224489793, + "grad_norm": 0.5010157227516174, + "learning_rate": 4.016331871811033e-05, + "loss": 0.7515, "num_input_tokens_seen": 55705600, "step": 6800 }, { - "epoch": 0.7792653621695846, - "grad_norm": 0.4549366533756256, - "learning_rate": 7.281115942204739e-06, - "loss": 1.0662, + "epoch": 0.3270108043217287, + "grad_norm": 0.49570614099502563, + "learning_rate": 4.0131802215215025e-05, + "loss": 1.0522, "num_input_tokens_seen": 55787520, "step": 6810 }, { - "epoch": 0.7804096578555899, - "grad_norm": 0.547260046005249, - "learning_rate": 7.209668574656514e-06, - "loss": 1.1699, + "epoch": 0.32749099639855944, + "grad_norm": 0.7191053628921509, + "learning_rate": 4.010024771404147e-05, + "loss": 0.9683, "num_input_tokens_seen": 55869440, "step": 6820 }, { - "epoch": 0.7815539535415952, - "grad_norm": 0.5504161715507507, - "learning_rate": 7.138514373036098e-06, - "loss": 1.3507, + "epoch": 0.32797118847539014, + "grad_norm": 0.5185818672180176, + "learning_rate": 4.006865529382787e-05, + "loss": 0.8884, "num_input_tokens_seen": 55951360, "step": 6830 }, { - "epoch": 0.7826982492276005, - "grad_norm": 0.5131449699401855, - "learning_rate": 7.0676545098897956e-06, - "loss": 1.3175, + "epoch": 0.3284513805522209, + "grad_norm": 0.6259596347808838, + "learning_rate": 4.0037025033907635e-05, + "loss": 0.9416, "num_input_tokens_seen": 56033280, "step": 6840 }, { - "epoch": 0.7838425449136057, - "grad_norm": 0.4970407783985138, - "learning_rate": 6.997090152913535e-06, - "loss": 1.1614, + "epoch": 0.3289315726290516, + "grad_norm": 0.5207152962684631, + "learning_rate": 4.000535701370921e-05, + "loss": 0.9646, "num_input_tokens_seen": 56115200, "step": 6850 }, { - "epoch": 0.784986840599611, - "grad_norm": 1.6976333856582642, - "learning_rate": 6.92682246493363e-06, - "loss": 1.0802, + "epoch": 0.32941176470588235, + "grad_norm": 0.6173118948936462, + "learning_rate": 3.997365131275584e-05, + "loss": 0.9449, "num_input_tokens_seen": 56197120, "step": 6860 }, { - "epoch": 0.7861311362856161, - "grad_norm": 2.0605082511901855, - "learning_rate": 6.856852603887556e-06, - "loss": 1.2859, + "epoch": 0.3298919567827131, + "grad_norm": 0.5023009777069092, + "learning_rate": 3.994190801066542e-05, + "loss": 1.1115, "num_input_tokens_seen": 56279040, "step": 6870 }, { - "epoch": 0.7872754319716214, - "grad_norm": 0.4618769586086273, - "learning_rate": 6.787181722804959e-06, - "loss": 1.0909, + "epoch": 0.3303721488595438, + "grad_norm": 0.7239778637886047, + "learning_rate": 3.9910127187150246e-05, + "loss": 0.996, "num_input_tokens_seen": 56360960, "step": 6880 }, { - "epoch": 0.7884197276576267, - "grad_norm": 0.5229918360710144, - "learning_rate": 6.717810969788596e-06, - "loss": 1.2816, + "epoch": 0.33085234093637456, + "grad_norm": 0.45797932147979736, + "learning_rate": 3.9878308922016846e-05, + "loss": 0.887, "num_input_tokens_seen": 56442880, "step": 6890 }, { - "epoch": 0.789564023343632, - "grad_norm": 0.54283607006073, - "learning_rate": 6.648741487995416e-06, - "loss": 1.036, + "epoch": 0.33133253301320525, + "grad_norm": 0.594767689704895, + "learning_rate": 3.984645329516578e-05, + "loss": 0.803, "num_input_tokens_seen": 56524800, "step": 6900 }, { - "epoch": 0.7907083190296372, - "grad_norm": 0.6073585748672485, - "learning_rate": 6.57997441561774e-06, - "loss": 0.9993, + "epoch": 0.331812725090036, + "grad_norm": 0.502420961856842, + "learning_rate": 3.98145603865914e-05, + "loss": 0.9339, "num_input_tokens_seen": 56606720, "step": 6910 }, { - "epoch": 0.7918526147156425, - "grad_norm": 0.4236718416213989, - "learning_rate": 6.511510885864516e-06, - "loss": 1.1192, + "epoch": 0.33229291716686676, + "grad_norm": 0.4855603873729706, + "learning_rate": 3.978263027638171e-05, + "loss": 1.0942, "num_input_tokens_seen": 56688640, "step": 6920 }, { - "epoch": 0.7929969104016478, - "grad_norm": 0.5184228420257568, - "learning_rate": 6.44335202694262e-06, - "loss": 1.3028, + "epoch": 0.33277310924369746, + "grad_norm": 0.561661422252655, + "learning_rate": 3.975066304471811e-05, + "loss": 0.9728, "num_input_tokens_seen": 56770560, "step": 6930 }, { - "epoch": 0.794141206087653, - "grad_norm": 0.6947381496429443, - "learning_rate": 6.375498962038265e-06, - "loss": 1.4295, + "epoch": 0.3332533013205282, + "grad_norm": 0.5083943009376526, + "learning_rate": 3.971865877187523e-05, + "loss": 0.8131, "num_input_tokens_seen": 56852480, "step": 6940 }, { - "epoch": 0.7952855017736583, - "grad_norm": 0.5223413705825806, - "learning_rate": 6.307952809298517e-06, - "loss": 1.503, + "epoch": 0.33373349339735897, + "grad_norm": 0.5498092770576477, + "learning_rate": 3.968661753822071e-05, + "loss": 0.8142, "num_input_tokens_seen": 56934400, "step": 6950 }, { - "epoch": 0.7964297974596636, - "grad_norm": 0.5551154613494873, - "learning_rate": 6.240714681812837e-06, - "loss": 1.1341, + "epoch": 0.33421368547418967, + "grad_norm": 0.5083803534507751, + "learning_rate": 3.9654539424214996e-05, + "loss": 0.9086, "num_input_tokens_seen": 57016320, "step": 6960 }, { - "epoch": 0.7975740931456688, - "grad_norm": 0.5838407278060913, - "learning_rate": 6.173785687594761e-06, - "loss": 1.14, + "epoch": 0.3346938775510204, + "grad_norm": 0.4920550286769867, + "learning_rate": 3.962242451041118e-05, + "loss": 0.8183, "num_input_tokens_seen": 57098240, "step": 6970 }, { - "epoch": 0.7987183888316741, - "grad_norm": 0.4963679611682892, - "learning_rate": 6.107166929563629e-06, - "loss": 1.2954, + "epoch": 0.3351740696278511, + "grad_norm": 0.5124842524528503, + "learning_rate": 3.9590272877454714e-05, + "loss": 0.9817, "num_input_tokens_seen": 57180160, "step": 6980 }, { - "epoch": 0.7998626845176794, - "grad_norm": 0.5126423835754395, - "learning_rate": 6.040859505526439e-06, - "loss": 1.3446, + "epoch": 0.3356542617046819, + "grad_norm": 0.4871028959751129, + "learning_rate": 3.955808460608331e-05, + "loss": 1.0946, "num_input_tokens_seen": 57262080, "step": 6990 }, { - "epoch": 0.8010069802036847, - "grad_norm": 0.48465293645858765, - "learning_rate": 5.974864508159692e-06, - "loss": 1.0721, + "epoch": 0.33613445378151263, + "grad_norm": 0.49150577187538147, + "learning_rate": 3.952585977712664e-05, + "loss": 1.0473, "num_input_tokens_seen": 57344000, "step": 7000 }, { - "epoch": 0.8021512758896899, - "grad_norm": 0.5144678950309753, - "learning_rate": 5.9091830249914685e-06, - "loss": 1.3073, + "epoch": 0.33661464585834333, + "grad_norm": 0.5065931677818298, + "learning_rate": 3.94935984715062e-05, + "loss": 0.9402, "num_input_tokens_seen": 57425920, "step": 7010 }, { - "epoch": 0.8032955715756952, - "grad_norm": 0.4844573140144348, - "learning_rate": 5.843816138383429e-06, - "loss": 1.3409, + "epoch": 0.3370948379351741, + "grad_norm": 0.5077062249183655, + "learning_rate": 3.9461300770235093e-05, + "loss": 0.678, "num_input_tokens_seen": 57507840, "step": 7020 }, { - "epoch": 0.8044398672617005, - "grad_norm": 0.538421094417572, - "learning_rate": 5.778764925513045e-06, - "loss": 1.3687, + "epoch": 0.3375750300120048, + "grad_norm": 0.4949451684951782, + "learning_rate": 3.942896675441779e-05, + "loss": 0.9907, "num_input_tokens_seen": 57589760, "step": 7030 }, { - "epoch": 0.8055841629477057, - "grad_norm": 0.49097201228141785, - "learning_rate": 5.714030458355784e-06, - "loss": 1.099, + "epoch": 0.33805522208883554, + "grad_norm": 0.7275571823120117, + "learning_rate": 3.939659650524997e-05, + "loss": 0.9791, "num_input_tokens_seen": 57671680, "step": 7040 }, { - "epoch": 0.8067284586337109, - "grad_norm": 0.5372655391693115, - "learning_rate": 5.649613803667511e-06, - "loss": 1.3109, + "epoch": 0.3385354141656663, + "grad_norm": 0.4286993145942688, + "learning_rate": 3.9364190104018307e-05, + "loss": 0.8087, "num_input_tokens_seen": 57753600, "step": 7050 }, { - "epoch": 0.8078727543197162, - "grad_norm": 0.5266016721725464, - "learning_rate": 5.5855160229668636e-06, - "loss": 1.2747, + "epoch": 0.339015606242497, + "grad_norm": 0.4871419668197632, + "learning_rate": 3.933174763210024e-05, + "loss": 0.9574, "num_input_tokens_seen": 57835520, "step": 7060 }, { - "epoch": 0.8090170500057214, - "grad_norm": 0.5975900888442993, - "learning_rate": 5.5217381725177624e-06, - "loss": 1.2696, + "epoch": 0.33949579831932775, + "grad_norm": 0.5074589252471924, + "learning_rate": 3.9299269170963795e-05, + "loss": 0.866, "num_input_tokens_seen": 57917440, "step": 7070 }, { - "epoch": 0.8101613456917267, - "grad_norm": 0.5652705430984497, - "learning_rate": 5.458281303312016e-06, - "loss": 1.1509, + "epoch": 0.33997599039615845, + "grad_norm": 0.5097850561141968, + "learning_rate": 3.926675480216738e-05, + "loss": 0.9768, "num_input_tokens_seen": 57999360, "step": 7080 }, { - "epoch": 0.811305641377732, - "grad_norm": 0.7816336154937744, - "learning_rate": 5.39514646105202e-06, - "loss": 1.2708, + "epoch": 0.3404561824729892, + "grad_norm": 0.4989878833293915, + "learning_rate": 3.923420460735957e-05, + "loss": 0.9659, "num_input_tokens_seen": 58081280, "step": 7090 }, { - "epoch": 0.8124499370637372, - "grad_norm": 0.48780134320259094, - "learning_rate": 5.332334686133475e-06, - "loss": 1.4797, + "epoch": 0.34093637454981995, + "grad_norm": 0.4922942817211151, + "learning_rate": 3.920161866827889e-05, + "loss": 0.7939, "num_input_tokens_seen": 58163200, "step": 7100 }, { - "epoch": 0.8135942327497425, - "grad_norm": 0.5095590949058533, - "learning_rate": 5.269847013628299e-06, - "loss": 1.192, + "epoch": 0.34141656662665065, + "grad_norm": 0.5052926540374756, + "learning_rate": 3.916899706675365e-05, + "loss": 1.1513, "num_input_tokens_seen": 58245120, "step": 7110 }, { - "epoch": 0.8147385284357478, - "grad_norm": 0.4638192057609558, - "learning_rate": 5.207684473267527e-06, - "loss": 1.5395, + "epoch": 0.3418967587034814, + "grad_norm": 0.5977290272712708, + "learning_rate": 3.913633988470169e-05, + "loss": 1.0614, "num_input_tokens_seen": 58327040, "step": 7120 }, { - "epoch": 0.8158828241217531, - "grad_norm": 0.538078784942627, - "learning_rate": 5.145848089424374e-06, - "loss": 1.2233, + "epoch": 0.3423769507803121, + "grad_norm": 0.45560044050216675, + "learning_rate": 3.91036472041302e-05, + "loss": 1.0288, "num_input_tokens_seen": 58408960, "step": 7130 }, { - "epoch": 0.8170271198077583, - "grad_norm": 0.544783353805542, - "learning_rate": 5.0843388810973195e-06, - "loss": 1.4877, + "epoch": 0.34285714285714286, + "grad_norm": 0.5187392234802246, + "learning_rate": 3.907091910713553e-05, + "loss": 0.9384, "num_input_tokens_seen": 58490880, "step": 7140 }, { - "epoch": 0.8181714154937636, - "grad_norm": 0.9503467679023743, - "learning_rate": 5.02315786189334e-06, - "loss": 1.1406, + "epoch": 0.3433373349339736, + "grad_norm": 0.49635642766952515, + "learning_rate": 3.9038155675902956e-05, + "loss": 0.9873, "num_input_tokens_seen": 58572800, "step": 7150 }, { - "epoch": 0.8193157111797689, - "grad_norm": 0.5432871580123901, - "learning_rate": 4.962306040011222e-06, - "loss": 1.1193, + "epoch": 0.3438175270108043, + "grad_norm": 0.47836464643478394, + "learning_rate": 3.900535699270647e-05, + "loss": 0.9565, "num_input_tokens_seen": 58654720, "step": 7160 }, { - "epoch": 0.8204600068657741, - "grad_norm": 0.6386141777038574, - "learning_rate": 4.901784418224892e-06, - "loss": 1.0802, + "epoch": 0.34429771908763507, + "grad_norm": 0.49842721223831177, + "learning_rate": 3.8972523139908616e-05, + "loss": 0.9251, "num_input_tokens_seen": 58736640, "step": 7170 }, { - "epoch": 0.8216043025517794, - "grad_norm": 0.5047373175621033, - "learning_rate": 4.841593993866949e-06, - "loss": 1.0499, + "epoch": 0.34477791116446577, + "grad_norm": 0.5168570876121521, + "learning_rate": 3.8939654199960244e-05, + "loss": 0.9024, "num_input_tokens_seen": 58818560, "step": 7180 }, { - "epoch": 0.8227485982377847, - "grad_norm": 0.4890262484550476, - "learning_rate": 4.781735758812217e-06, - "loss": 1.2981, + "epoch": 0.3452581032412965, + "grad_norm": 0.508305549621582, + "learning_rate": 3.890675025540028e-05, + "loss": 0.8464, "num_input_tokens_seen": 58900480, "step": 7190 }, { - "epoch": 0.82389289392379, - "grad_norm": 0.744850754737854, - "learning_rate": 4.7222106994613655e-06, - "loss": 1.1535, + "epoch": 0.3457382953181273, + "grad_norm": 0.5206389427185059, + "learning_rate": 3.8873811388855605e-05, + "loss": 0.909, "num_input_tokens_seen": 58982400, "step": 7200 }, { - "epoch": 0.8250371896097952, - "grad_norm": 0.4771808087825775, - "learning_rate": 4.663019796724685e-06, - "loss": 1.2276, + "epoch": 0.346218487394958, + "grad_norm": 0.5759819149971008, + "learning_rate": 3.8840837683040766e-05, + "loss": 1.0237, "num_input_tokens_seen": 59064320, "step": 7210 }, { - "epoch": 0.8261814852958005, - "grad_norm": 0.463220477104187, - "learning_rate": 4.604164026005925e-06, - "loss": 1.3646, + "epoch": 0.34669867947178873, + "grad_norm": 0.5078734755516052, + "learning_rate": 3.880782922075778e-05, + "loss": 0.8262, "num_input_tokens_seen": 59146240, "step": 7220 }, { - "epoch": 0.8273257809818056, - "grad_norm": 0.46109485626220703, - "learning_rate": 4.5456443571862185e-06, - "loss": 1.3941, + "epoch": 0.3471788715486194, + "grad_norm": 0.9097881317138672, + "learning_rate": 3.8774786084896e-05, + "loss": 0.9193, "num_input_tokens_seen": 59228160, "step": 7230 }, { - "epoch": 0.8284700766678109, - "grad_norm": 0.4565638303756714, - "learning_rate": 4.487461754608066e-06, - "loss": 1.102, + "epoch": 0.3476590636254502, + "grad_norm": 0.4915110468864441, + "learning_rate": 3.8741708358431774e-05, + "loss": 0.8414, "num_input_tokens_seen": 59310080, "step": 7240 }, { - "epoch": 0.8296143723538162, - "grad_norm": 0.5542830228805542, - "learning_rate": 4.429617177059508e-06, - "loss": 1.167, + "epoch": 0.34813925570228094, + "grad_norm": 0.4847305119037628, + "learning_rate": 3.870859612442837e-05, + "loss": 1.138, "num_input_tokens_seen": 59392000, "step": 7250 }, { - "epoch": 0.8307586680398215, - "grad_norm": 0.44131019711494446, - "learning_rate": 4.372111577758261e-06, - "loss": 1.0619, + "epoch": 0.34861944777911164, + "grad_norm": 0.5575107336044312, + "learning_rate": 3.86754494660357e-05, + "loss": 0.8703, "num_input_tokens_seen": 59473920, "step": 7260 }, { - "epoch": 0.8319029637258267, - "grad_norm": 0.523678183555603, - "learning_rate": 4.314945904336037e-06, - "loss": 1.2679, + "epoch": 0.3490996398559424, + "grad_norm": 0.5898226499557495, + "learning_rate": 3.864226846649008e-05, + "loss": 0.8408, "num_input_tokens_seen": 59555840, "step": 7270 }, { - "epoch": 0.833047259411832, - "grad_norm": 0.485726535320282, - "learning_rate": 4.258121098822945e-06, - "loss": 1.2982, + "epoch": 0.3495798319327731, + "grad_norm": 0.5211881995201111, + "learning_rate": 3.860905320911413e-05, + "loss": 0.9104, "num_input_tokens_seen": 59637760, "step": 7280 }, { - "epoch": 0.8341915550978373, - "grad_norm": 0.49582865834236145, - "learning_rate": 4.201638097631938e-06, - "loss": 1.2368, + "epoch": 0.35006002400960384, + "grad_norm": 0.5230681896209717, + "learning_rate": 3.857580377731644e-05, + "loss": 0.7804, "num_input_tokens_seen": 59719680, "step": 7290 }, { - "epoch": 0.8353358507838425, - "grad_norm": 0.44791966676712036, - "learning_rate": 4.145497831543402e-06, - "loss": 0.9904, + "epoch": 0.3505402160864346, + "grad_norm": 0.5010928511619568, + "learning_rate": 3.854252025459144e-05, + "loss": 0.868, "num_input_tokens_seen": 59801600, "step": 7300 }, { - "epoch": 0.8364801464698478, - "grad_norm": 0.4786857068538666, - "learning_rate": 4.089701225689793e-06, - "loss": 1.186, + "epoch": 0.3510204081632653, + "grad_norm": 0.4810640811920166, + "learning_rate": 3.8509202724519165e-05, + "loss": 0.9105, "num_input_tokens_seen": 59883520, "step": 7310 }, { - "epoch": 0.8376244421558531, - "grad_norm": 0.48226872086524963, - "learning_rate": 4.034249199540432e-06, - "loss": 1.359, + "epoch": 0.35150060024009605, + "grad_norm": 0.5282043814659119, + "learning_rate": 3.8475851270765054e-05, + "loss": 0.8274, "num_input_tokens_seen": 59965440, "step": 7320 }, { - "epoch": 0.8387687378418583, - "grad_norm": 0.5494146347045898, - "learning_rate": 3.97914266688631e-06, - "loss": 1.1773, + "epoch": 0.35198079231692675, + "grad_norm": 0.5089632868766785, + "learning_rate": 3.844246597707972e-05, + "loss": 0.9016, "num_input_tokens_seen": 60047360, "step": 7330 }, { - "epoch": 0.8399130335278636, - "grad_norm": 0.4615972638130188, - "learning_rate": 3.924382535825047e-06, - "loss": 1.2996, + "epoch": 0.3524609843937575, + "grad_norm": 0.4965154230594635, + "learning_rate": 3.8409046927298755e-05, + "loss": 0.8791, "num_input_tokens_seen": 60129280, "step": 7340 }, { - "epoch": 0.8410573292138689, - "grad_norm": 1.4144350290298462, - "learning_rate": 3.869969708745946e-06, - "loss": 1.1155, + "epoch": 0.35294117647058826, + "grad_norm": 0.5418551564216614, + "learning_rate": 3.8375594205342534e-05, + "loss": 1.1453, "num_input_tokens_seen": 60211200, "step": 7350 }, { - "epoch": 0.8422016248998742, - "grad_norm": 0.5016247034072876, - "learning_rate": 3.815905082315102e-06, - "loss": 1.058, + "epoch": 0.35342136854741896, + "grad_norm": 0.5690187215805054, + "learning_rate": 3.834210789521598e-05, + "loss": 1.0104, "num_input_tokens_seen": 60293120, "step": 7360 }, { - "epoch": 0.8433459205858794, - "grad_norm": 0.4713045358657837, - "learning_rate": 3.762189547460615e-06, - "loss": 1.2502, + "epoch": 0.3539015606242497, + "grad_norm": 0.5610761046409607, + "learning_rate": 3.830858808100834e-05, + "loss": 1.0196, "num_input_tokens_seen": 60375040, "step": 7370 }, { - "epoch": 0.8444902162718847, - "grad_norm": 0.8532644510269165, - "learning_rate": 3.7088239893579456e-06, - "loss": 1.2315, + "epoch": 0.3543817527010804, + "grad_norm": 0.501844584941864, + "learning_rate": 3.8275034846893046e-05, + "loss": 0.8126, "num_input_tokens_seen": 60456960, "step": 7380 }, { - "epoch": 0.84563451195789, - "grad_norm": 0.5714083313941956, - "learning_rate": 3.655809287415285e-06, - "loss": 1.46, + "epoch": 0.35486194477791116, + "grad_norm": 0.5217064023017883, + "learning_rate": 3.824144827712738e-05, + "loss": 1.3003, "num_input_tokens_seen": 60538880, "step": 7390 }, { - "epoch": 0.8467788076438952, - "grad_norm": 0.5227552652359009, - "learning_rate": 3.603146315259104e-06, - "loss": 1.2398, + "epoch": 0.3553421368547419, + "grad_norm": 0.5244282484054565, + "learning_rate": 3.82078284560524e-05, + "loss": 1.0134, "num_input_tokens_seen": 60620800, "step": 7400 }, { - "epoch": 0.8479231033299004, - "grad_norm": 0.5611023306846619, - "learning_rate": 3.5508359407197157e-06, - "loss": 1.2431, + "epoch": 0.3558223289315726, + "grad_norm": 0.5655602216720581, + "learning_rate": 3.817417546809263e-05, + "loss": 0.9036, "num_input_tokens_seen": 60702720, "step": 7410 }, { - "epoch": 0.8490673990159057, - "grad_norm": 0.8136146068572998, - "learning_rate": 3.4988790258170146e-06, - "loss": 0.9851, + "epoch": 0.3563025210084034, + "grad_norm": 0.5483632683753967, + "learning_rate": 3.8140489397755886e-05, + "loss": 1.0055, "num_input_tokens_seen": 60784640, "step": 7420 }, { - "epoch": 0.8502116947019109, - "grad_norm": 0.5456047058105469, - "learning_rate": 3.4472764267462486e-06, - "loss": 1.1814, + "epoch": 0.35678271308523407, + "grad_norm": 0.5369768738746643, + "learning_rate": 3.810677032963307e-05, + "loss": 0.9698, "num_input_tokens_seen": 60866560, "step": 7430 }, { - "epoch": 0.8513559903879162, - "grad_norm": 0.4470462501049042, - "learning_rate": 3.396028993863906e-06, - "loss": 1.0398, + "epoch": 0.3572629051620648, + "grad_norm": 0.4874258041381836, + "learning_rate": 3.807301834839793e-05, + "loss": 0.8928, "num_input_tokens_seen": 60948480, "step": 7440 }, { - "epoch": 0.8525002860739215, - "grad_norm": 0.4927099049091339, - "learning_rate": 3.3451375716737067e-06, - "loss": 1.1797, + "epoch": 0.3577430972388956, + "grad_norm": 0.4989224076271057, + "learning_rate": 3.803923353880687e-05, + "loss": 0.7671, "num_input_tokens_seen": 61030400, "step": 7450 }, { - "epoch": 0.8536445817599267, - "grad_norm": 0.6371917724609375, - "learning_rate": 3.2946029988127068e-06, - "loss": 1.3581, + "epoch": 0.3582232893157263, + "grad_norm": 0.5844639539718628, + "learning_rate": 3.8005415985698754e-05, + "loss": 0.9805, "num_input_tokens_seen": 61112320, "step": 7460 }, { - "epoch": 0.854788877445932, - "grad_norm": 0.4422919452190399, - "learning_rate": 3.2444261080374546e-06, - "loss": 1.1539, + "epoch": 0.35870348139255703, + "grad_norm": 0.5072601437568665, + "learning_rate": 3.797156577399462e-05, + "loss": 0.9943, "num_input_tokens_seen": 61194240, "step": 7470 }, { - "epoch": 0.8559331731319373, - "grad_norm": 0.4536419212818146, - "learning_rate": 3.194607726210261e-06, - "loss": 1.0741, + "epoch": 0.35918367346938773, + "grad_norm": 0.5117142200469971, + "learning_rate": 3.7937682988697566e-05, + "loss": 0.8656, "num_input_tokens_seen": 61276160, "step": 7480 }, { - "epoch": 0.8570774688179426, - "grad_norm": 0.5793977379798889, - "learning_rate": 3.1451486742856055e-06, - "loss": 1.2566, + "epoch": 0.3596638655462185, + "grad_norm": 0.5001174211502075, + "learning_rate": 3.790376771489247e-05, + "loss": 1.0121, "num_input_tokens_seen": 61358080, "step": 7490 }, { - "epoch": 0.8582217645039478, - "grad_norm": 0.6592020392417908, - "learning_rate": 3.0960497672965825e-06, - "loss": 1.2257, + "epoch": 0.36014405762304924, + "grad_norm": 0.5205296277999878, + "learning_rate": 3.7869820037745776e-05, + "loss": 0.8441, "num_input_tokens_seen": 61440000, "step": 7500 }, { - "epoch": 0.8593660601899531, - "grad_norm": 0.49061110615730286, - "learning_rate": 3.0473118143414634e-06, - "loss": 1.4217, + "epoch": 0.36062424969987994, + "grad_norm": 0.5183282494544983, + "learning_rate": 3.783584004250531e-05, + "loss": 0.7604, "num_input_tokens_seen": 61521920, "step": 7510 }, { - "epoch": 0.8605103558759584, - "grad_norm": 0.4329964518547058, - "learning_rate": 2.9989356185703975e-06, - "loss": 1.4657, + "epoch": 0.3611044417767107, + "grad_norm": 0.3707723319530487, + "learning_rate": 3.7801827814500074e-05, + "loss": 0.8183, "num_input_tokens_seen": 61603840, "step": 7520 }, { - "epoch": 0.8616546515619636, - "grad_norm": 0.5590507388114929, - "learning_rate": 2.950921977172155e-06, - "loss": 1.0984, + "epoch": 0.3615846338535414, + "grad_norm": 0.5028261542320251, + "learning_rate": 3.7767783439139984e-05, + "loss": 0.8522, "num_input_tokens_seen": 61685760, "step": 7530 }, { - "epoch": 0.8627989472479689, - "grad_norm": 0.7304471135139465, - "learning_rate": 2.9032716813609723e-06, - "loss": 1.1865, + "epoch": 0.36206482593037215, + "grad_norm": 0.5016160011291504, + "learning_rate": 3.77337070019157e-05, + "loss": 0.8296, "num_input_tokens_seen": 61767680, "step": 7540 }, { - "epoch": 0.8639432429339742, - "grad_norm": 0.48121222853660583, - "learning_rate": 2.8559855163635544e-06, - "loss": 1.1777, + "epoch": 0.3625450180072029, + "grad_norm": 0.6660007238388062, + "learning_rate": 3.7699598588398364e-05, + "loss": 0.9405, "num_input_tokens_seen": 61849600, "step": 7550 }, { - "epoch": 0.8650875386199794, - "grad_norm": 0.469186007976532, - "learning_rate": 2.809064261406111e-06, - "loss": 1.5473, + "epoch": 0.3630252100840336, + "grad_norm": 0.4924418032169342, + "learning_rate": 3.766545828423946e-05, + "loss": 0.9701, "num_input_tokens_seen": 61931520, "step": 7560 }, { - "epoch": 0.8662318343059847, - "grad_norm": 0.5229854583740234, - "learning_rate": 2.762508689701504e-06, - "loss": 1.1929, + "epoch": 0.36350540216086435, + "grad_norm": 0.5335708260536194, + "learning_rate": 3.7631286175170535e-05, + "loss": 0.8598, "num_input_tokens_seen": 62013440, "step": 7570 }, { - "epoch": 0.8673761299919899, - "grad_norm": 0.5912187099456787, - "learning_rate": 2.716319568436529e-06, - "loss": 1.0352, + "epoch": 0.36398559423769505, + "grad_norm": 0.4859420955181122, + "learning_rate": 3.7597082347003e-05, + "loss": 1.0999, "num_input_tokens_seen": 62095360, "step": 7580 }, { - "epoch": 0.8685204256779951, - "grad_norm": 0.5109303593635559, - "learning_rate": 2.6704976587592688e-06, - "loss": 1.0181, + "epoch": 0.3644657863145258, + "grad_norm": 0.4671091139316559, + "learning_rate": 3.75628468856279e-05, + "loss": 0.8699, "num_input_tokens_seen": 62177280, "step": 7590 }, { - "epoch": 0.8696647213640004, - "grad_norm": 0.7814948558807373, - "learning_rate": 2.6250437157665455e-06, - "loss": 1.2663, + "epoch": 0.36494597839135656, + "grad_norm": 0.5766321420669556, + "learning_rate": 3.7528579877015746e-05, + "loss": 0.8688, "num_input_tokens_seen": 62259200, "step": 7600 }, { - "epoch": 0.8708090170500057, - "grad_norm": 1.977430820465088, - "learning_rate": 2.5799584884914685e-06, - "loss": 1.0863, + "epoch": 0.36542617046818726, + "grad_norm": 0.8806756138801575, + "learning_rate": 3.749428140721626e-05, + "loss": 1.0697, "num_input_tokens_seen": 62341120, "step": 7610 }, { - "epoch": 0.871953312736011, - "grad_norm": 0.4965369701385498, - "learning_rate": 2.535242719891112e-06, - "loss": 1.1277, + "epoch": 0.365906362545018, + "grad_norm": 0.5065363049507141, + "learning_rate": 3.745995156235815e-05, + "loss": 0.8349, "num_input_tokens_seen": 62423040, "step": 7620 }, { - "epoch": 0.8730976084220162, - "grad_norm": 1.2752573490142822, - "learning_rate": 2.4908971468342535e-06, - "loss": 1.067, + "epoch": 0.3663865546218487, + "grad_norm": 0.5004618763923645, + "learning_rate": 3.742559042864895e-05, + "loss": 0.9453, "num_input_tokens_seen": 62504960, "step": 7630 }, { - "epoch": 0.8742419041080215, - "grad_norm": 0.5822865962982178, - "learning_rate": 2.44692250008923e-06, - "loss": 1.2818, + "epoch": 0.36686674669867947, + "grad_norm": 0.46816208958625793, + "learning_rate": 3.7391198092374726e-05, + "loss": 1.0237, "num_input_tokens_seen": 62586880, "step": 7640 }, { - "epoch": 0.8753861997940268, - "grad_norm": 0.5638536214828491, - "learning_rate": 2.403319504311921e-06, - "loss": 1.1506, + "epoch": 0.3673469387755102, + "grad_norm": 0.5166922211647034, + "learning_rate": 3.7356774639899914e-05, + "loss": 0.8499, "num_input_tokens_seen": 62668800, "step": 7650 }, { - "epoch": 0.876530495480032, - "grad_norm": 0.48504772782325745, - "learning_rate": 2.360088878033778e-06, - "loss": 1.5572, + "epoch": 0.3678271308523409, + "grad_norm": 0.5414100289344788, + "learning_rate": 3.7322320157667094e-05, + "loss": 0.8653, "num_input_tokens_seen": 62750720, "step": 7660 }, { - "epoch": 0.8776747911660373, - "grad_norm": 0.4426518976688385, - "learning_rate": 2.317231333650005e-06, - "loss": 1.2879, + "epoch": 0.3683073229291717, + "grad_norm": 0.4967511296272278, + "learning_rate": 3.728783473219676e-05, + "loss": 0.7981, "num_input_tokens_seen": 62832640, "step": 7670 }, { - "epoch": 0.8788190868520426, - "grad_norm": 0.5259247422218323, - "learning_rate": 2.2747475774077986e-06, - "loss": 1.1017, + "epoch": 0.3687875150060024, + "grad_norm": 0.5025008320808411, + "learning_rate": 3.72533184500871e-05, + "loss": 0.7632, "num_input_tokens_seen": 62914560, "step": 7680 }, { - "epoch": 0.8799633825380478, - "grad_norm": 0.4538893401622772, - "learning_rate": 2.2326383093947135e-06, - "loss": 1.2244, + "epoch": 0.36926770708283313, + "grad_norm": 0.4785637855529785, + "learning_rate": 3.7218771398013807e-05, + "loss": 0.9014, "num_input_tokens_seen": 62996480, "step": 7690 }, { - "epoch": 0.8811076782240531, - "grad_norm": 0.6097177267074585, - "learning_rate": 2.1909042235271597e-06, - "loss": 1.3507, + "epoch": 0.3697478991596639, + "grad_norm": 1.0697089433670044, + "learning_rate": 3.718419366272982e-05, + "loss": 0.828, "num_input_tokens_seen": 63078400, "step": 7700 }, { - "epoch": 0.8822519739100584, - "grad_norm": 0.5675943493843079, - "learning_rate": 2.1495460075389133e-06, - "loss": 1.0351, + "epoch": 0.3702280912364946, + "grad_norm": 0.4909783899784088, + "learning_rate": 3.714958533106515e-05, + "loss": 0.7715, "num_input_tokens_seen": 63160320, "step": 7710 }, { - "epoch": 0.8833962695960637, - "grad_norm": 0.4782324731349945, - "learning_rate": 2.1085643429698236e-06, - "loss": 1.1214, + "epoch": 0.37070828331332534, + "grad_norm": 0.5369053483009338, + "learning_rate": 3.7114946489926633e-05, + "loss": 0.9035, "num_input_tokens_seen": 63242240, "step": 7720 }, { - "epoch": 0.8845405652820689, - "grad_norm": 0.5161313414573669, - "learning_rate": 2.067959905154568e-06, - "loss": 1.1543, + "epoch": 0.37118847539015604, + "grad_norm": 0.4767155647277832, + "learning_rate": 3.708027722629772e-05, + "loss": 1.0931, "num_input_tokens_seen": 63324160, "step": 7730 }, { - "epoch": 0.8856848609680742, - "grad_norm": 0.4714735150337219, - "learning_rate": 2.0277333632115288e-06, - "loss": 0.9881, + "epoch": 0.3716686674669868, + "grad_norm": 0.4803867042064667, + "learning_rate": 3.704557762723823e-05, + "loss": 0.8443, "num_input_tokens_seen": 63406080, "step": 7740 }, { - "epoch": 0.8868291566540795, - "grad_norm": 0.5167281031608582, - "learning_rate": 1.9878853800317535e-06, - "loss": 1.4237, + "epoch": 0.37214885954381755, + "grad_norm": 0.5255789756774902, + "learning_rate": 3.7010847779884204e-05, + "loss": 0.9003, "num_input_tokens_seen": 63488000, "step": 7750 }, { - "epoch": 0.8879734523400846, - "grad_norm": 0.47065913677215576, - "learning_rate": 1.948416612268034e-06, - "loss": 1.1573, + "epoch": 0.37262905162064824, + "grad_norm": 0.8208502531051636, + "learning_rate": 3.697608777144762e-05, + "loss": 0.9216, "num_input_tokens_seen": 63569920, "step": 7760 }, { - "epoch": 0.8891177480260899, - "grad_norm": 0.6462358236312866, - "learning_rate": 1.909327710324116e-06, - "loss": 1.303, + "epoch": 0.373109243697479, + "grad_norm": 0.5180743932723999, + "learning_rate": 3.694129768921619e-05, + "loss": 0.966, "num_input_tokens_seen": 63651840, "step": 7770 }, { - "epoch": 0.8902620437120952, - "grad_norm": 0.4849631190299988, - "learning_rate": 1.8706193183439247e-06, - "loss": 1.2495, + "epoch": 0.3735894357743097, + "grad_norm": 0.5045250058174133, + "learning_rate": 3.6906477620553156e-05, + "loss": 0.947, "num_input_tokens_seen": 63733760, "step": 7780 }, { - "epoch": 0.8914063393981004, - "grad_norm": 0.4865788519382477, - "learning_rate": 1.8322920742010086e-06, - "loss": 1.1877, + "epoch": 0.37406962785114045, + "grad_norm": 0.4700779616832733, + "learning_rate": 3.687162765289704e-05, + "loss": 0.9151, "num_input_tokens_seen": 63815680, "step": 7790 }, { - "epoch": 0.8925506350841057, - "grad_norm": 0.48363032937049866, - "learning_rate": 1.7943466094879902e-06, - "loss": 1.1676, + "epoch": 0.3745498199279712, + "grad_norm": 0.5987953543663025, + "learning_rate": 3.683674787376148e-05, + "loss": 0.8509, "num_input_tokens_seen": 63897600, "step": 7800 }, { - "epoch": 0.893694930770111, - "grad_norm": 0.6026335954666138, - "learning_rate": 1.7567835495061718e-06, - "loss": 1.1575, + "epoch": 0.3750300120048019, + "grad_norm": 0.4968700408935547, + "learning_rate": 3.6801838370734945e-05, + "loss": 0.7108, "num_input_tokens_seen": 63979520, "step": 7810 }, { - "epoch": 0.8948392264561162, - "grad_norm": 0.49204021692276, - "learning_rate": 1.7196035132552135e-06, - "loss": 1.0517, + "epoch": 0.37551020408163266, + "grad_norm": 0.4701189398765564, + "learning_rate": 3.676689923148056e-05, + "loss": 0.8534, "num_input_tokens_seen": 64061440, "step": 7820 }, { - "epoch": 0.8959835221421215, - "grad_norm": 0.5136172771453857, - "learning_rate": 1.682807113422971e-06, - "loss": 1.4192, + "epoch": 0.37599039615846336, + "grad_norm": 0.5217798948287964, + "learning_rate": 3.673193054373587e-05, + "loss": 0.9249, "num_input_tokens_seen": 64143360, "step": 7830 }, { - "epoch": 0.8971278178281268, - "grad_norm": 0.516801655292511, - "learning_rate": 1.646394956375369e-06, - "loss": 1.5286, + "epoch": 0.3764705882352941, + "grad_norm": 0.6249185800552368, + "learning_rate": 3.6696932395312606e-05, + "loss": 0.945, "num_input_tokens_seen": 64225280, "step": 7840 }, { - "epoch": 0.898272113514132, - "grad_norm": 1.405717134475708, - "learning_rate": 1.6103676421463986e-06, - "loss": 0.9688, + "epoch": 0.37695078031212487, + "grad_norm": 0.48508864641189575, + "learning_rate": 3.6661904874096503e-05, + "loss": 0.8673, "num_input_tokens_seen": 64307200, "step": 7850 }, { - "epoch": 0.8994164092001373, - "grad_norm": 0.4660581350326538, - "learning_rate": 1.5747257644282726e-06, - "loss": 1.1801, + "epoch": 0.37743097238895557, + "grad_norm": 0.5447950959205627, + "learning_rate": 3.662684806804704e-05, + "loss": 0.817, "num_input_tokens_seen": 64389120, "step": 7860 }, { - "epoch": 0.9005607048861426, - "grad_norm": 0.5325869917869568, - "learning_rate": 1.5394699105616002e-06, - "loss": 1.0305, + "epoch": 0.3779111644657863, + "grad_norm": 0.48486828804016113, + "learning_rate": 3.659176206519724e-05, + "loss": 0.9525, "num_input_tokens_seen": 64471040, "step": 7870 }, { - "epoch": 0.9017050005721479, - "grad_norm": 0.5679177641868591, - "learning_rate": 1.504600661525718e-06, - "loss": 1.1399, + "epoch": 0.3783913565426171, + "grad_norm": 0.4893752932548523, + "learning_rate": 3.655664695365344e-05, + "loss": 0.8771, "num_input_tokens_seen": 64552960, "step": 7880 }, { - "epoch": 0.9028492962581531, - "grad_norm": 0.49606427550315857, - "learning_rate": 1.4701185919291372e-06, - "loss": 1.4673, + "epoch": 0.3788715486194478, + "grad_norm": 0.490164190530777, + "learning_rate": 3.652150282159507e-05, + "loss": 0.895, "num_input_tokens_seen": 64634880, "step": 7890 }, { - "epoch": 0.9039935919441584, - "grad_norm": 0.49286141991615295, - "learning_rate": 1.436024270000058e-06, - "loss": 1.0752, + "epoch": 0.3793517406962785, + "grad_norm": 0.5217410326004028, + "learning_rate": 3.6486329757274454e-05, + "loss": 0.8366, "num_input_tokens_seen": 64716800, "step": 7900 }, { - "epoch": 0.9051378876301637, - "grad_norm": 0.5626493096351624, - "learning_rate": 1.4023182575769956e-06, - "loss": 1.1824, + "epoch": 0.3798319327731092, + "grad_norm": 0.49266862869262695, + "learning_rate": 3.645112784901655e-05, + "loss": 0.9177, "num_input_tokens_seen": 64798720, "step": 7910 }, { - "epoch": 0.9062821833161689, - "grad_norm": 0.513041079044342, - "learning_rate": 1.3690011100995437e-06, - "loss": 1.0409, + "epoch": 0.38031212484994, + "grad_norm": 0.47938328981399536, + "learning_rate": 3.641589718521875e-05, + "loss": 0.8592, "num_input_tokens_seen": 64880640, "step": 7920 }, { - "epoch": 0.9074264790021742, - "grad_norm": 0.46281594038009644, - "learning_rate": 1.3360733765992116e-06, - "loss": 1.2628, + "epoch": 0.38079231692677074, + "grad_norm": 1.3006699085235596, + "learning_rate": 3.6380637854350665e-05, + "loss": 0.9027, "num_input_tokens_seen": 64962560, "step": 7930 }, { - "epoch": 0.9085707746881794, - "grad_norm": 0.626043975353241, - "learning_rate": 1.3035355996903697e-06, - "loss": 1.307, + "epoch": 0.38127250900360143, + "grad_norm": 0.6962760090827942, + "learning_rate": 3.634534994495387e-05, + "loss": 0.8225, "num_input_tokens_seen": 65044480, "step": 7940 }, { - "epoch": 0.9097150703741846, - "grad_norm": 0.6638424396514893, - "learning_rate": 1.2713883155613144e-06, - "loss": 1.061, + "epoch": 0.3817527010804322, + "grad_norm": 0.5089699625968933, + "learning_rate": 3.631003354564175e-05, + "loss": 0.8546, "num_input_tokens_seen": 65126400, "step": 7950 }, { - "epoch": 0.9108593660601899, - "grad_norm": 0.8179208040237427, - "learning_rate": 1.2396320539654366e-06, - "loss": 1.178, + "epoch": 0.3822328931572629, + "grad_norm": 0.48060354590415955, + "learning_rate": 3.6274688745099194e-05, + "loss": 1.0001, "num_input_tokens_seen": 65208320, "step": 7960 }, { - "epoch": 0.9120036617461952, - "grad_norm": 0.44587135314941406, - "learning_rate": 1.208267338212493e-06, - "loss": 1.3368, + "epoch": 0.38271308523409364, + "grad_norm": 0.5190022587776184, + "learning_rate": 3.623931563208241e-05, + "loss": 0.9504, "num_input_tokens_seen": 65290240, "step": 7970 }, { - "epoch": 0.9131479574322005, - "grad_norm": 0.4682336449623108, - "learning_rate": 1.177294685159963e-06, - "loss": 1.3091, + "epoch": 0.3831932773109244, + "grad_norm": 0.48244509100914, + "learning_rate": 3.620391429541873e-05, + "loss": 0.8261, "num_input_tokens_seen": 65372160, "step": 7980 }, { - "epoch": 0.9142922531182057, - "grad_norm": 0.4648427367210388, - "learning_rate": 1.1467146052045603e-06, - "loss": 1.2932, + "epoch": 0.3836734693877551, + "grad_norm": 0.5255333185195923, + "learning_rate": 3.616848482400634e-05, + "loss": 0.9785, "num_input_tokens_seen": 65454080, "step": 7990 }, { - "epoch": 0.915436548804211, - "grad_norm": 0.5007482171058655, - "learning_rate": 1.1165276022737926e-06, - "loss": 1.1777, + "epoch": 0.38415366146458585, + "grad_norm": 0.507675290107727, + "learning_rate": 3.6133027306814085e-05, + "loss": 0.9941, "num_input_tokens_seen": 65536000, "step": 8000 }, { - "epoch": 0.9165808444902163, - "grad_norm": 0.5155405402183533, - "learning_rate": 1.0867341738176857e-06, - "loss": 1.3184, + "epoch": 0.38463385354141655, + "grad_norm": 0.5132362842559814, + "learning_rate": 3.609754183288122e-05, + "loss": 1.0458, "num_input_tokens_seen": 65617920, "step": 8010 }, { - "epoch": 0.9177251401762215, - "grad_norm": 0.6284940242767334, - "learning_rate": 1.0573348108005614e-06, - "loss": 1.2283, + "epoch": 0.3851140456182473, + "grad_norm": 0.5268590450286865, + "learning_rate": 3.606202849131723e-05, + "loss": 0.9985, "num_input_tokens_seen": 65699840, "step": 8020 }, { - "epoch": 0.9188694358622268, - "grad_norm": 4.340449333190918, - "learning_rate": 1.0283299976929672e-06, - "loss": 1.4513, + "epoch": 0.38559423769507806, + "grad_norm": 0.4958067834377289, + "learning_rate": 3.6026487371301564e-05, + "loss": 0.9213, "num_input_tokens_seen": 65781760, "step": 8030 }, { - "epoch": 0.9200137315482321, - "grad_norm": 0.5452487468719482, - "learning_rate": 9.997202124636785e-07, - "loss": 1.2401, + "epoch": 0.38607442977190876, + "grad_norm": 0.5100277662277222, + "learning_rate": 3.599091856208343e-05, + "loss": 0.8843, "num_input_tokens_seen": 65863680, "step": 8040 }, { - "epoch": 0.9211580272342373, - "grad_norm": 0.5035956501960754, - "learning_rate": 9.715059265718335e-07, - "loss": 1.0646, + "epoch": 0.3865546218487395, + "grad_norm": 0.49061518907546997, + "learning_rate": 3.5955322152981575e-05, + "loss": 0.9195, "num_input_tokens_seen": 65945600, "step": 8050 }, { - "epoch": 0.9223023229202426, - "grad_norm": 0.4889717996120453, - "learning_rate": 9.436876049591398e-07, - "loss": 1.4221, + "epoch": 0.3870348139255702, + "grad_norm": 0.531670093536377, + "learning_rate": 3.5919698233384034e-05, + "loss": 1.0418, "num_input_tokens_seen": 66027520, "step": 8060 }, { - "epoch": 0.9234466186062479, - "grad_norm": 0.48875346779823303, - "learning_rate": 9.162657060422574e-07, - "loss": 1.2108, + "epoch": 0.38751500600240096, + "grad_norm": 0.5200238823890686, + "learning_rate": 3.588404689274795e-05, + "loss": 0.8457, "num_input_tokens_seen": 66109440, "step": 8070 }, { - "epoch": 0.9245909142922532, - "grad_norm": 0.5173611044883728, - "learning_rate": 8.892406817051946e-07, - "loss": 1.4522, + "epoch": 0.3879951980792317, + "grad_norm": 0.5855520963668823, + "learning_rate": 3.58483682205993e-05, + "loss": 0.7761, "num_input_tokens_seen": 66191360, "step": 8080 }, { - "epoch": 0.9257352099782584, - "grad_norm": 0.598610520362854, - "learning_rate": 8.626129772918962e-07, - "loss": 1.2966, + "epoch": 0.3884753901560624, + "grad_norm": 0.5065504908561707, + "learning_rate": 3.581266230653271e-05, + "loss": 0.86, "num_input_tokens_seen": 66273280, "step": 8090 }, { - "epoch": 0.9268795056642637, - "grad_norm": 0.5002040863037109, - "learning_rate": 8.363830315988947e-07, - "loss": 1.2018, + "epoch": 0.38895558223289317, + "grad_norm": 0.5135564804077148, + "learning_rate": 3.5776929240211224e-05, + "loss": 0.8299, "num_input_tokens_seen": 66355200, "step": 8100 }, { - "epoch": 0.928023801350269, - "grad_norm": 0.42786675691604614, - "learning_rate": 8.105512768680712e-07, - "loss": 1.1427, + "epoch": 0.38943577430972387, + "grad_norm": 0.5159543752670288, + "learning_rate": 3.5741169111366047e-05, + "loss": 1.0024, "num_input_tokens_seen": 66437120, "step": 8110 }, { - "epoch": 0.9291680970362741, - "grad_norm": 0.5229976177215576, - "learning_rate": 7.851181387795392e-07, - "loss": 1.1591, + "epoch": 0.3899159663865546, + "grad_norm": 0.659740149974823, + "learning_rate": 3.570538200979635e-05, + "loss": 1.0171, "num_input_tokens_seen": 66519040, "step": 8120 }, { - "epoch": 0.9303123927222794, - "grad_norm": 0.469088613986969, - "learning_rate": 7.600840364446333e-07, - "loss": 1.4109, + "epoch": 0.3903961584633854, + "grad_norm": 0.6677471995353699, + "learning_rate": 3.566956802536904e-05, + "loss": 0.9044, "num_input_tokens_seen": 66600960, "step": 8130 }, { - "epoch": 0.9314566884082847, - "grad_norm": 0.46560269594192505, - "learning_rate": 7.354493823990006e-07, - "loss": 1.4194, + "epoch": 0.3908763505402161, + "grad_norm": 0.5268581509590149, + "learning_rate": 3.5633727248018536e-05, + "loss": 0.9338, "num_input_tokens_seen": 66682880, "step": 8140 }, { - "epoch": 0.9326009840942899, - "grad_norm": 0.6745224595069885, - "learning_rate": 7.112145825957927e-07, - "loss": 1.1635, + "epoch": 0.39135654261704683, + "grad_norm": 0.7781187891960144, + "learning_rate": 3.5597859767746524e-05, + "loss": 0.8946, "num_input_tokens_seen": 66764800, "step": 8150 }, { - "epoch": 0.9337452797802952, - "grad_norm": 0.6367943286895752, - "learning_rate": 6.873800363989935e-07, - "loss": 1.2245, + "epoch": 0.39183673469387753, + "grad_norm": 0.40642163157463074, + "learning_rate": 3.556196567462175e-05, + "loss": 0.8835, "num_input_tokens_seen": 66846720, "step": 8160 }, { - "epoch": 0.9348895754663005, - "grad_norm": 0.9506150484085083, - "learning_rate": 6.63946136576829e-07, - "loss": 1.4055, + "epoch": 0.3923169267707083, + "grad_norm": 0.5072639584541321, + "learning_rate": 3.5526045058779805e-05, + "loss": 0.9559, "num_input_tokens_seen": 66928640, "step": 8170 }, { - "epoch": 0.9360338711523057, - "grad_norm": 0.5516346096992493, - "learning_rate": 6.409132692952874e-07, - "loss": 1.134, + "epoch": 0.39279711884753904, + "grad_norm": 0.5224815011024475, + "learning_rate": 3.549009801042286e-05, + "loss": 0.8609, "num_input_tokens_seen": 67010560, "step": 8180 }, { - "epoch": 0.937178166838311, - "grad_norm": 0.48264941573143005, - "learning_rate": 6.182818141117625e-07, - "loss": 1.1915, + "epoch": 0.39327731092436974, + "grad_norm": 0.5318178534507751, + "learning_rate": 3.545412461981947e-05, + "loss": 0.9237, "num_input_tokens_seen": 67092480, "step": 8190 }, { - "epoch": 0.9383224625243163, - "grad_norm": 0.5029363632202148, - "learning_rate": 5.960521439688088e-07, - "loss": 1.0641, + "epoch": 0.3937575030012005, + "grad_norm": 0.5438302755355835, + "learning_rate": 3.541812497730435e-05, + "loss": 0.992, "num_input_tokens_seen": 67174400, "step": 8200 }, { - "epoch": 0.9394667582103216, - "grad_norm": 0.5256086587905884, - "learning_rate": 5.742246251879829e-07, - "loss": 1.1629, + "epoch": 0.3942376950780312, + "grad_norm": 0.48217347264289856, + "learning_rate": 3.5382099173278125e-05, + "loss": 0.9067, "num_input_tokens_seen": 67256320, "step": 8210 }, { - "epoch": 0.9406110538963268, - "grad_norm": 0.5969172120094299, - "learning_rate": 5.527996174638061e-07, - "loss": 1.1297, + "epoch": 0.39471788715486195, + "grad_norm": 0.6276324391365051, + "learning_rate": 3.5346047298207116e-05, + "loss": 0.891, "num_input_tokens_seen": 67338240, "step": 8220 }, { - "epoch": 0.9417553495823321, - "grad_norm": 0.6032472252845764, - "learning_rate": 5.317774738578446e-07, - "loss": 1.269, + "epoch": 0.3951980792316927, + "grad_norm": 0.5074647665023804, + "learning_rate": 3.530996944262312e-05, + "loss": 0.7989, "num_input_tokens_seen": 67420160, "step": 8230 }, { - "epoch": 0.9428996452683374, - "grad_norm": 0.4904973804950714, - "learning_rate": 5.111585407928887e-07, - "loss": 1.0976, + "epoch": 0.3956782713085234, + "grad_norm": 0.48775187134742737, + "learning_rate": 3.5273865697123164e-05, + "loss": 0.8951, "num_input_tokens_seen": 67502080, "step": 8240 }, { - "epoch": 0.9440439409543426, - "grad_norm": 0.49423882365226746, - "learning_rate": 4.909431580472385e-07, - "loss": 1.2052, + "epoch": 0.39615846338535415, + "grad_norm": 0.5741081237792969, + "learning_rate": 3.52377361523693e-05, + "loss": 1.0857, "num_input_tokens_seen": 67584000, "step": 8250 }, { - "epoch": 0.9451882366403479, - "grad_norm": 0.5731542110443115, - "learning_rate": 4.711316587491188e-07, - "loss": 1.1078, + "epoch": 0.39663865546218485, + "grad_norm": 0.5743753910064697, + "learning_rate": 3.520158089908836e-05, + "loss": 1.0151, "num_input_tokens_seen": 67665920, "step": 8260 }, { - "epoch": 0.9463325323263532, - "grad_norm": 0.6199080348014832, - "learning_rate": 4.5172436937117036e-07, - "loss": 1.2543, + "epoch": 0.3971188475390156, + "grad_norm": 0.5247855186462402, + "learning_rate": 3.516540002807174e-05, + "loss": 0.765, "num_input_tokens_seen": 67747840, "step": 8270 }, { - "epoch": 0.9474768280123584, - "grad_norm": 0.5399270057678223, - "learning_rate": 4.3272160972509524e-07, - "loss": 1.1164, + "epoch": 0.39759903961584636, + "grad_norm": 0.5025756359100342, + "learning_rate": 3.512919363017516e-05, + "loss": 0.881, "num_input_tokens_seen": 67829760, "step": 8280 }, { - "epoch": 0.9486211236983637, - "grad_norm": 0.49898287653923035, - "learning_rate": 4.1412369295635023e-07, - "loss": 1.1804, + "epoch": 0.39807923169267706, + "grad_norm": 0.46168726682662964, + "learning_rate": 3.509296179631843e-05, + "loss": 0.8293, "num_input_tokens_seen": 67911680, "step": 8290 }, { - "epoch": 0.9497654193843689, - "grad_norm": 0.5566830039024353, - "learning_rate": 3.9593092553902587e-07, - "loss": 1.1059, + "epoch": 0.3985594237695078, + "grad_norm": 0.556755542755127, + "learning_rate": 3.505670461748527e-05, + "loss": 0.8953, "num_input_tokens_seen": 67993600, "step": 8300 }, { - "epoch": 0.9509097150703741, - "grad_norm": 0.5725921988487244, - "learning_rate": 3.7814360727076724e-07, - "loss": 1.1965, + "epoch": 0.3990396158463385, + "grad_norm": 0.5389664173126221, + "learning_rate": 3.5020422184723e-05, + "loss": 1.19, "num_input_tokens_seen": 68075520, "step": 8310 }, { - "epoch": 0.9520540107563794, - "grad_norm": 0.5144397616386414, - "learning_rate": 3.607620312678528e-07, - "loss": 1.3606, + "epoch": 0.39951980792316927, + "grad_norm": 0.5176492929458618, + "learning_rate": 3.498411458914238e-05, + "loss": 0.8479, "num_input_tokens_seen": 68157440, "step": 8320 }, { - "epoch": 0.9531983064423847, - "grad_norm": 0.5176523327827454, - "learning_rate": 3.437864839603455e-07, - "loss": 1.0969, + "epoch": 0.4, + "grad_norm": 0.5035036206245422, + "learning_rate": 3.494778192191739e-05, + "loss": 0.8593, "num_input_tokens_seen": 68239360, "step": 8330 }, { - "epoch": 0.95434260212839, - "grad_norm": 0.8841016888618469, - "learning_rate": 3.272172450873967e-07, - "loss": 1.6294, + "epoch": 0.4004801920768307, + "grad_norm": 0.49024298787117004, + "learning_rate": 3.4911424274284886e-05, + "loss": 0.771, "num_input_tokens_seen": 68321280, "step": 8340 }, { - "epoch": 0.9554868978143952, - "grad_norm": 0.5609331727027893, - "learning_rate": 3.11054587692608e-07, - "loss": 1.0976, + "epoch": 0.4009603841536615, + "grad_norm": 0.4582843482494354, + "learning_rate": 3.4875041737544526e-05, + "loss": 0.9203, "num_input_tokens_seen": 68403200, "step": 8350 }, { - "epoch": 0.9566311935004005, - "grad_norm": 0.5448135137557983, - "learning_rate": 2.952987781195599e-07, - "loss": 1.1881, + "epoch": 0.4014405762304922, + "grad_norm": 0.5551020503044128, + "learning_rate": 3.483863440305845e-05, + "loss": 0.8853, "num_input_tokens_seen": 68485120, "step": 8360 }, { - "epoch": 0.9577754891864058, - "grad_norm": 0.5337486863136292, - "learning_rate": 2.799500760073931e-07, - "loss": 1.1826, + "epoch": 0.40192076830732293, + "grad_norm": 1.1716420650482178, + "learning_rate": 3.480220236225106e-05, + "loss": 0.9757, "num_input_tokens_seen": 68567040, "step": 8370 }, { - "epoch": 0.958919784872411, - "grad_norm": 0.5066606402397156, - "learning_rate": 2.6500873428656483e-07, - "loss": 1.1567, + "epoch": 0.4024009603841537, + "grad_norm": 0.4597180485725403, + "learning_rate": 3.476574570660879e-05, + "loss": 0.8719, "num_input_tokens_seen": 68648960, "step": 8380 }, { - "epoch": 0.9600640805584163, - "grad_norm": 0.5534819960594177, - "learning_rate": 2.5047499917464636e-07, - "loss": 0.9999, + "epoch": 0.4028811524609844, + "grad_norm": 0.5064864754676819, + "learning_rate": 3.472926452767992e-05, + "loss": 0.7306, "num_input_tokens_seen": 68730880, "step": 8390 }, { - "epoch": 0.9612083762444216, - "grad_norm": 0.5017804503440857, - "learning_rate": 2.3634911017229034e-07, - "loss": 1.347, + "epoch": 0.40336134453781514, + "grad_norm": 0.4826078414916992, + "learning_rate": 3.469275891707428e-05, + "loss": 0.8629, "num_input_tokens_seen": 68812800, "step": 8400 }, { - "epoch": 0.9623526719304268, - "grad_norm": 0.4598214328289032, - "learning_rate": 2.2263130005927558e-07, - "loss": 1.0357, + "epoch": 0.40384153661464584, + "grad_norm": 0.4941999316215515, + "learning_rate": 3.465622896646305e-05, + "loss": 0.8746, "num_input_tokens_seen": 68894720, "step": 8410 }, { - "epoch": 0.9634969676164321, - "grad_norm": 0.5540017485618591, - "learning_rate": 2.0932179489066006e-07, - "loss": 1.3101, + "epoch": 0.4043217286914766, + "grad_norm": 0.5197045207023621, + "learning_rate": 3.461967476757857e-05, + "loss": 0.8477, "num_input_tokens_seen": 68976640, "step": 8420 }, { - "epoch": 0.9646412633024374, - "grad_norm": 0.7891622185707092, - "learning_rate": 1.9642081399307844e-07, - "loss": 1.4347, + "epoch": 0.40480192076830734, + "grad_norm": 0.508140504360199, + "learning_rate": 3.4583096412214025e-05, + "loss": 1.0445, "num_input_tokens_seen": 69058560, "step": 8430 }, { - "epoch": 0.9657855589884426, - "grad_norm": 0.8638253211975098, - "learning_rate": 1.8392856996110875e-07, - "loss": 0.9711, + "epoch": 0.40528211284513804, + "grad_norm": 0.5302277207374573, + "learning_rate": 3.454649399222328e-05, + "loss": 0.846, "num_input_tokens_seen": 69140480, "step": 8440 }, { - "epoch": 0.9669298546744479, - "grad_norm": 0.49595341086387634, - "learning_rate": 1.7184526865377805e-07, - "loss": 1.1246, + "epoch": 0.4057623049219688, + "grad_norm": 0.4965344965457916, + "learning_rate": 3.450986759952064e-05, + "loss": 0.8504, "num_input_tokens_seen": 69222400, "step": 8450 }, { - "epoch": 0.9680741503604532, - "grad_norm": 0.5145378708839417, - "learning_rate": 1.6017110919116786e-07, - "loss": 1.182, + "epoch": 0.4062424969987995, + "grad_norm": 0.6701764464378357, + "learning_rate": 3.44732173260806e-05, + "loss": 0.8774, "num_input_tokens_seen": 69304320, "step": 8460 }, { - "epoch": 0.9692184460464585, - "grad_norm": 0.5098930597305298, - "learning_rate": 1.4890628395113072e-07, - "loss": 1.2913, + "epoch": 0.40672268907563025, + "grad_norm": 0.5118209719657898, + "learning_rate": 3.4436543263937613e-05, + "loss": 0.9147, "num_input_tokens_seen": 69386240, "step": 8470 }, { - "epoch": 0.9703627417324636, - "grad_norm": 0.5276376008987427, - "learning_rate": 1.380509785661288e-07, - "loss": 1.1669, + "epoch": 0.407202881152461, + "grad_norm": 0.5040360689163208, + "learning_rate": 3.439984550518589e-05, + "loss": 0.8726, "num_input_tokens_seen": 69468160, "step": 8480 }, { - "epoch": 0.9715070374184689, - "grad_norm": 0.5354599952697754, - "learning_rate": 1.2760537192015866e-07, - "loss": 1.2136, + "epoch": 0.4076830732292917, + "grad_norm": 0.5248441696166992, + "learning_rate": 3.436312414197913e-05, + "loss": 0.9237, "num_input_tokens_seen": 69550080, "step": 8490 }, { - "epoch": 0.9726513331044742, - "grad_norm": 0.43569329380989075, - "learning_rate": 1.1756963614582006e-07, - "loss": 1.4041, + "epoch": 0.40816326530612246, + "grad_norm": 0.47410309314727783, + "learning_rate": 3.4326379266530314e-05, + "loss": 1.0674, "num_input_tokens_seen": 69632000, "step": 8500 }, { - "epoch": 0.9737956287904794, - "grad_norm": 0.4199809432029724, - "learning_rate": 1.0794393662147129e-07, - "loss": 1.4145, + "epoch": 0.40864345738295316, + "grad_norm": 0.49306410551071167, + "learning_rate": 3.428961097111146e-05, + "loss": 0.8397, "num_input_tokens_seen": 69713920, "step": 8510 }, { - "epoch": 0.9749399244764847, - "grad_norm": 0.45975956320762634, - "learning_rate": 9.872843196850057e-08, - "loss": 1.1534, + "epoch": 0.4091236494597839, + "grad_norm": 0.4915395677089691, + "learning_rate": 3.4252819348053424e-05, + "loss": 0.909, "num_input_tokens_seen": 69795840, "step": 8520 }, { - "epoch": 0.97608422016249, - "grad_norm": 0.47319695353507996, - "learning_rate": 8.992327404872825e-08, - "loss": 1.1406, + "epoch": 0.40960384153661467, + "grad_norm": 0.4925106167793274, + "learning_rate": 3.421600448974559e-05, + "loss": 0.9302, "num_input_tokens_seen": 69877760, "step": 8530 }, { - "epoch": 0.9772285158484952, - "grad_norm": 0.6402721405029297, - "learning_rate": 8.152860796187545e-08, - "loss": 1.4171, + "epoch": 0.41008403361344536, + "grad_norm": 0.49705174565315247, + "learning_rate": 3.4179166488635736e-05, + "loss": 0.9918, "num_input_tokens_seen": 69959680, "step": 8540 }, { - "epoch": 0.9783728115345005, - "grad_norm": 0.5459085702896118, - "learning_rate": 7.354457204320486e-08, - "loss": 1.2596, + "epoch": 0.4105642256902761, + "grad_norm": 0.49341046810150146, + "learning_rate": 3.414230543722973e-05, + "loss": 0.9864, "num_input_tokens_seen": 70041600, "step": 8550 }, { - "epoch": 0.9795171072205058, - "grad_norm": 0.49747464060783386, - "learning_rate": 6.59712978612198e-08, - "loss": 1.4422, + "epoch": 0.4110444177671068, + "grad_norm": 0.5298373699188232, + "learning_rate": 3.410542142809134e-05, + "loss": 0.872, "num_input_tokens_seen": 70123520, "step": 8560 }, { - "epoch": 0.980661402906511, - "grad_norm": 0.5260741114616394, - "learning_rate": 5.880891021549928e-08, - "loss": 1.1456, + "epoch": 0.41152460984393757, + "grad_norm": 0.48551279306411743, + "learning_rate": 3.4068514553841965e-05, + "loss": 1.0904, "num_input_tokens_seen": 70205440, "step": 8570 }, { - "epoch": 0.9818056985925163, - "grad_norm": 0.5129789113998413, - "learning_rate": 5.205752713465794e-08, - "loss": 1.235, + "epoch": 0.4120048019207683, + "grad_norm": 0.5113745331764221, + "learning_rate": 3.403158490716043e-05, + "loss": 0.8943, "num_input_tokens_seen": 70287360, "step": 8580 }, { - "epoch": 0.9829499942785216, - "grad_norm": 0.44936448335647583, - "learning_rate": 4.57172598743727e-08, - "loss": 1.2076, + "epoch": 0.412484993997599, + "grad_norm": 0.5106720924377441, + "learning_rate": 3.3994632580782766e-05, + "loss": 0.8691, "num_input_tokens_seen": 70369280, "step": 8590 }, { - "epoch": 0.9840942899645269, - "grad_norm": 0.5832532644271851, - "learning_rate": 3.9788212915573e-08, - "loss": 1.1558, + "epoch": 0.4129651860744298, + "grad_norm": 0.5008137822151184, + "learning_rate": 3.395765766750192e-05, + "loss": 0.8594, "num_input_tokens_seen": 70451200, "step": 8600 }, { - "epoch": 0.9852385856505321, - "grad_norm": 0.44010186195373535, - "learning_rate": 3.427048396271171e-08, - "loss": 1.2666, + "epoch": 0.4134453781512605, + "grad_norm": 0.47841960191726685, + "learning_rate": 3.392066026016757e-05, + "loss": 0.8132, "num_input_tokens_seen": 70533120, "step": 8610 }, { - "epoch": 0.9863828813365374, - "grad_norm": 0.479427695274353, - "learning_rate": 2.9164163942146937e-08, - "loss": 1.4532, + "epoch": 0.41392557022809123, + "grad_norm": 0.8451586365699768, + "learning_rate": 3.388364045168591e-05, + "loss": 0.8289, "num_input_tokens_seen": 70615040, "step": 8620 }, { - "epoch": 0.9875271770225427, - "grad_norm": 0.5471181273460388, - "learning_rate": 2.44693370006599e-08, - "loss": 1.4154, + "epoch": 0.414405762304922, + "grad_norm": 0.47416937351226807, + "learning_rate": 3.3846598335019335e-05, + "loss": 0.8435, "num_input_tokens_seen": 70696960, "step": 8630 }, { - "epoch": 0.9886714727085479, - "grad_norm": 0.5122755765914917, - "learning_rate": 2.0186080504050466e-08, - "loss": 1.062, + "epoch": 0.4148859543817527, + "grad_norm": 0.8883240818977356, + "learning_rate": 3.38095340031863e-05, + "loss": 0.9588, "num_input_tokens_seen": 70778880, "step": 8640 }, { - "epoch": 0.9898157683945531, - "grad_norm": 0.5013795495033264, - "learning_rate": 1.6314465035879855e-08, - "loss": 1.0299, + "epoch": 0.41536614645858344, + "grad_norm": 0.5661002993583679, + "learning_rate": 3.377244754926104e-05, + "loss": 1.0117, "num_input_tokens_seen": 70860800, "step": 8650 }, { - "epoch": 0.9909600640805584, - "grad_norm": 0.4794003963470459, - "learning_rate": 1.2854554396291018e-08, - "loss": 1.1402, + "epoch": 0.41584633853541414, + "grad_norm": 0.5038189888000488, + "learning_rate": 3.3735339066373314e-05, + "loss": 0.8984, "num_input_tokens_seen": 70942720, "step": 8660 }, { - "epoch": 0.9921043597665636, - "grad_norm": 1.2419332265853882, - "learning_rate": 9.806405600967794e-09, - "loss": 1.3485, + "epoch": 0.4163265306122449, + "grad_norm": 1.053067922592163, + "learning_rate": 3.369820864770822e-05, + "loss": 0.852, "num_input_tokens_seen": 71024640, "step": 8670 }, { - "epoch": 0.9932486554525689, - "grad_norm": 0.5120067596435547, - "learning_rate": 7.1700688801940034e-09, - "loss": 1.26, + "epoch": 0.41680672268907565, + "grad_norm": 0.4945598840713501, + "learning_rate": 3.366105638650596e-05, + "loss": 0.8927, "num_input_tokens_seen": 71106560, "step": 8680 }, { - "epoch": 0.9943929511385742, - "grad_norm": 0.5355421900749207, - "learning_rate": 4.94558767802078e-09, - "loss": 1.1129, + "epoch": 0.41728691476590635, + "grad_norm": 0.5233136415481567, + "learning_rate": 3.3623882376061554e-05, + "loss": 0.8245, "num_input_tokens_seen": 71188480, "step": 8690 }, { - "epoch": 0.9955372468245794, - "grad_norm": 0.6079626083374023, - "learning_rate": 3.1329986515560295e-09, - "loss": 1.2407, + "epoch": 0.4177671068427371, + "grad_norm": 0.46826255321502686, + "learning_rate": 3.358668670972465e-05, + "loss": 0.8433, "num_input_tokens_seen": 71270400, "step": 8700 }, { - "epoch": 0.9966815425105847, - "grad_norm": 0.48402678966522217, - "learning_rate": 1.7323316703621305e-09, - "loss": 1.2159, + "epoch": 0.4182472989195678, + "grad_norm": 0.5887758731842041, + "learning_rate": 3.354946948089927e-05, + "loss": 0.9426, "num_input_tokens_seen": 71352320, "step": 8710 }, { - "epoch": 0.99782583819659, - "grad_norm": 0.45963913202285767, - "learning_rate": 7.436098159480099e-10, - "loss": 1.0369, + "epoch": 0.41872749099639855, + "grad_norm": 0.5028591752052307, + "learning_rate": 3.351223078304359e-05, + "loss": 0.8831, "num_input_tokens_seen": 71434240, "step": 8720 }, { - "epoch": 0.9989701338825953, - "grad_norm": 0.5231521725654602, - "learning_rate": 1.6684938141664498e-10, - "loss": 1.1446, + "epoch": 0.4192076830732293, + "grad_norm": 0.5117266178131104, + "learning_rate": 3.34749707096697e-05, + "loss": 0.9697, "num_input_tokens_seen": 71516160, "step": 8730 }, + { + "epoch": 0.41968787515006, + "grad_norm": 0.5058220028877258, + "learning_rate": 3.343768935434337e-05, + "loss": 0.8492, + "num_input_tokens_seen": 71598080, + "step": 8740 + }, + { + "epoch": 0.42016806722689076, + "grad_norm": 0.7977713346481323, + "learning_rate": 3.34003868106838e-05, + "loss": 0.9056, + "num_input_tokens_seen": 71680000, + "step": 8750 + }, + { + "epoch": 0.42064825930372146, + "grad_norm": 0.5445558428764343, + "learning_rate": 3.3363063172363396e-05, + "loss": 0.9732, + "num_input_tokens_seen": 71761920, + "step": 8760 + }, + { + "epoch": 0.4211284513805522, + "grad_norm": 0.5837395787239075, + "learning_rate": 3.3325718533107556e-05, + "loss": 1.1805, + "num_input_tokens_seen": 71843840, + "step": 8770 + }, + { + "epoch": 0.42160864345738297, + "grad_norm": 0.6139930486679077, + "learning_rate": 3.3288352986694396e-05, + "loss": 0.9309, + "num_input_tokens_seen": 71925760, + "step": 8780 + }, + { + "epoch": 0.42208883553421367, + "grad_norm": 1.7015244960784912, + "learning_rate": 3.325096662695454e-05, + "loss": 1.0555, + "num_input_tokens_seen": 72007680, + "step": 8790 + }, + { + "epoch": 0.4225690276110444, + "grad_norm": 0.49051469564437866, + "learning_rate": 3.321355954777087e-05, + "loss": 0.831, + "num_input_tokens_seen": 72089600, + "step": 8800 + }, + { + "epoch": 0.4230492196878752, + "grad_norm": 0.5128316283226013, + "learning_rate": 3.317613184307832e-05, + "loss": 0.9081, + "num_input_tokens_seen": 72171520, + "step": 8810 + }, + { + "epoch": 0.4235294117647059, + "grad_norm": 0.5852062106132507, + "learning_rate": 3.313868360686359e-05, + "loss": 0.9385, + "num_input_tokens_seen": 72253440, + "step": 8820 + }, + { + "epoch": 0.42400960384153663, + "grad_norm": 0.4833785891532898, + "learning_rate": 3.310121493316495e-05, + "loss": 0.8992, + "num_input_tokens_seen": 72335360, + "step": 8830 + }, + { + "epoch": 0.42448979591836733, + "grad_norm": 0.49650639295578003, + "learning_rate": 3.306372591607199e-05, + "loss": 0.8763, + "num_input_tokens_seen": 72417280, + "step": 8840 + }, + { + "epoch": 0.4249699879951981, + "grad_norm": 0.8988882303237915, + "learning_rate": 3.30262166497254e-05, + "loss": 0.8636, + "num_input_tokens_seen": 72499200, + "step": 8850 + }, + { + "epoch": 0.42545018007202884, + "grad_norm": 0.5129944086074829, + "learning_rate": 3.29886872283167e-05, + "loss": 1.1187, + "num_input_tokens_seen": 72581120, + "step": 8860 + }, + { + "epoch": 0.42593037214885954, + "grad_norm": 0.4534735381603241, + "learning_rate": 3.2951137746088004e-05, + "loss": 0.9403, + "num_input_tokens_seen": 72663040, + "step": 8870 + }, + { + "epoch": 0.4264105642256903, + "grad_norm": 0.48410260677337646, + "learning_rate": 3.291356829733186e-05, + "loss": 0.8392, + "num_input_tokens_seen": 72744960, + "step": 8880 + }, + { + "epoch": 0.426890756302521, + "grad_norm": 0.9801385998725891, + "learning_rate": 3.287597897639092e-05, + "loss": 1.0537, + "num_input_tokens_seen": 72826880, + "step": 8890 + }, + { + "epoch": 0.42737094837935174, + "grad_norm": 0.7022339105606079, + "learning_rate": 3.283836987765771e-05, + "loss": 0.8691, + "num_input_tokens_seen": 72908800, + "step": 8900 + }, + { + "epoch": 0.4278511404561825, + "grad_norm": 0.4977160096168518, + "learning_rate": 3.280074109557447e-05, + "loss": 0.8148, + "num_input_tokens_seen": 72990720, + "step": 8910 + }, + { + "epoch": 0.4283313325330132, + "grad_norm": 0.5575817823410034, + "learning_rate": 3.2763092724632854e-05, + "loss": 0.8026, + "num_input_tokens_seen": 73072640, + "step": 8920 + }, + { + "epoch": 0.42881152460984395, + "grad_norm": 0.6442136764526367, + "learning_rate": 3.272542485937369e-05, + "loss": 0.9775, + "num_input_tokens_seen": 73154560, + "step": 8930 + }, + { + "epoch": 0.42929171668667465, + "grad_norm": 0.4888969957828522, + "learning_rate": 3.2687737594386766e-05, + "loss": 0.7946, + "num_input_tokens_seen": 73236480, + "step": 8940 + }, + { + "epoch": 0.4297719087635054, + "grad_norm": 0.9433819055557251, + "learning_rate": 3.2650031024310603e-05, + "loss": 1.0062, + "num_input_tokens_seen": 73318400, + "step": 8950 + }, + { + "epoch": 0.43025210084033616, + "grad_norm": 2.151608943939209, + "learning_rate": 3.2612305243832176e-05, + "loss": 1.0847, + "num_input_tokens_seen": 73400320, + "step": 8960 + }, + { + "epoch": 0.43073229291716686, + "grad_norm": 0.5902963876724243, + "learning_rate": 3.2574560347686725e-05, + "loss": 0.9699, + "num_input_tokens_seen": 73482240, + "step": 8970 + }, + { + "epoch": 0.4312124849939976, + "grad_norm": 0.5295826196670532, + "learning_rate": 3.253679643065747e-05, + "loss": 0.9454, + "num_input_tokens_seen": 73564160, + "step": 8980 + }, + { + "epoch": 0.4316926770708283, + "grad_norm": 0.5086777210235596, + "learning_rate": 3.249901358757538e-05, + "loss": 0.9417, + "num_input_tokens_seen": 73646080, + "step": 8990 + }, + { + "epoch": 0.43217286914765907, + "grad_norm": 0.5244941711425781, + "learning_rate": 3.246121191331902e-05, + "loss": 0.9839, + "num_input_tokens_seen": 73728000, + "step": 9000 + }, + { + "epoch": 0.4326530612244898, + "grad_norm": 0.4932080805301666, + "learning_rate": 3.242339150281417e-05, + "loss": 0.8661, + "num_input_tokens_seen": 73809920, + "step": 9010 + }, + { + "epoch": 0.4331332533013205, + "grad_norm": 0.47911617159843445, + "learning_rate": 3.238555245103368e-05, + "loss": 0.8686, + "num_input_tokens_seen": 73891840, + "step": 9020 + }, + { + "epoch": 0.4336134453781513, + "grad_norm": 0.5290552377700806, + "learning_rate": 3.234769485299724e-05, + "loss": 0.8504, + "num_input_tokens_seen": 73973760, + "step": 9030 + }, + { + "epoch": 0.434093637454982, + "grad_norm": 0.4913333058357239, + "learning_rate": 3.230981880377107e-05, + "loss": 1.0346, + "num_input_tokens_seen": 74055680, + "step": 9040 + }, + { + "epoch": 0.4345738295318127, + "grad_norm": 0.8791585564613342, + "learning_rate": 3.227192439846775e-05, + "loss": 0.9299, + "num_input_tokens_seen": 74137600, + "step": 9050 + }, + { + "epoch": 0.4350540216086435, + "grad_norm": 0.4976731240749359, + "learning_rate": 3.223401173224595e-05, + "loss": 0.9369, + "num_input_tokens_seen": 74219520, + "step": 9060 + }, + { + "epoch": 0.4355342136854742, + "grad_norm": 0.49430936574935913, + "learning_rate": 3.219608090031021e-05, + "loss": 0.8562, + "num_input_tokens_seen": 74301440, + "step": 9070 + }, + { + "epoch": 0.43601440576230494, + "grad_norm": 0.4978736937046051, + "learning_rate": 3.215813199791065e-05, + "loss": 1.0076, + "num_input_tokens_seen": 74383360, + "step": 9080 + }, + { + "epoch": 0.43649459783913563, + "grad_norm": 0.5289158225059509, + "learning_rate": 3.212016512034279e-05, + "loss": 0.8394, + "num_input_tokens_seen": 74465280, + "step": 9090 + }, + { + "epoch": 0.4369747899159664, + "grad_norm": 0.4874771535396576, + "learning_rate": 3.2082180362947304e-05, + "loss": 0.9406, + "num_input_tokens_seen": 74547200, + "step": 9100 + }, + { + "epoch": 0.43745498199279714, + "grad_norm": 0.5047212243080139, + "learning_rate": 3.204417782110974e-05, + "loss": 1.0131, + "num_input_tokens_seen": 74629120, + "step": 9110 + }, + { + "epoch": 0.43793517406962784, + "grad_norm": 0.5818035006523132, + "learning_rate": 3.200615759026031e-05, + "loss": 1.0524, + "num_input_tokens_seen": 74711040, + "step": 9120 + }, + { + "epoch": 0.4384153661464586, + "grad_norm": 0.5106756687164307, + "learning_rate": 3.1968119765873654e-05, + "loss": 0.8463, + "num_input_tokens_seen": 74792960, + "step": 9130 + }, + { + "epoch": 0.4388955582232893, + "grad_norm": 1.4386917352676392, + "learning_rate": 3.193006444346859e-05, + "loss": 0.8177, + "num_input_tokens_seen": 74874880, + "step": 9140 + }, + { + "epoch": 0.43937575030012005, + "grad_norm": 0.5303124785423279, + "learning_rate": 3.189199171860787e-05, + "loss": 1.1675, + "num_input_tokens_seen": 74956800, + "step": 9150 + }, + { + "epoch": 0.4398559423769508, + "grad_norm": 0.5052891969680786, + "learning_rate": 3.185390168689796e-05, + "loss": 0.7782, + "num_input_tokens_seen": 75038720, + "step": 9160 + }, + { + "epoch": 0.4403361344537815, + "grad_norm": 0.5097294449806213, + "learning_rate": 3.1815794443988763e-05, + "loss": 0.74, + "num_input_tokens_seen": 75120640, + "step": 9170 + }, + { + "epoch": 0.44081632653061226, + "grad_norm": 1.9583684206008911, + "learning_rate": 3.177767008557343e-05, + "loss": 0.9299, + "num_input_tokens_seen": 75202560, + "step": 9180 + }, + { + "epoch": 0.44129651860744296, + "grad_norm": 0.5055694580078125, + "learning_rate": 3.1739528707388066e-05, + "loss": 0.8375, + "num_input_tokens_seen": 75284480, + "step": 9190 + }, + { + "epoch": 0.4417767106842737, + "grad_norm": 0.5287858247756958, + "learning_rate": 3.1701370405211535e-05, + "loss": 0.8294, + "num_input_tokens_seen": 75366400, + "step": 9200 + }, + { + "epoch": 0.44225690276110446, + "grad_norm": 0.5322756171226501, + "learning_rate": 3.166319527486519e-05, + "loss": 0.965, + "num_input_tokens_seen": 75448320, + "step": 9210 + }, + { + "epoch": 0.44273709483793516, + "grad_norm": 0.475495845079422, + "learning_rate": 3.162500341221264e-05, + "loss": 0.9839, + "num_input_tokens_seen": 75530240, + "step": 9220 + }, + { + "epoch": 0.4432172869147659, + "grad_norm": 0.5287263989448547, + "learning_rate": 3.158679491315951e-05, + "loss": 0.9622, + "num_input_tokens_seen": 75612160, + "step": 9230 + }, + { + "epoch": 0.4436974789915966, + "grad_norm": 0.9584441184997559, + "learning_rate": 3.154856987365322e-05, + "loss": 0.9561, + "num_input_tokens_seen": 75694080, + "step": 9240 + }, + { + "epoch": 0.44417767106842737, + "grad_norm": 0.5105222463607788, + "learning_rate": 3.151032838968271e-05, + "loss": 0.794, + "num_input_tokens_seen": 75776000, + "step": 9250 + }, + { + "epoch": 0.4446578631452581, + "grad_norm": 0.5271983742713928, + "learning_rate": 3.14720705572782e-05, + "loss": 1.0292, + "num_input_tokens_seen": 75857920, + "step": 9260 + }, + { + "epoch": 0.4451380552220888, + "grad_norm": 0.6865978240966797, + "learning_rate": 3.1433796472511e-05, + "loss": 0.865, + "num_input_tokens_seen": 75939840, + "step": 9270 + }, + { + "epoch": 0.4456182472989196, + "grad_norm": 0.5002774596214294, + "learning_rate": 3.13955062314932e-05, + "loss": 1.0853, + "num_input_tokens_seen": 76021760, + "step": 9280 + }, + { + "epoch": 0.4460984393757503, + "grad_norm": 0.5186232924461365, + "learning_rate": 3.135719993037748e-05, + "loss": 0.9558, + "num_input_tokens_seen": 76103680, + "step": 9290 + }, + { + "epoch": 0.44657863145258103, + "grad_norm": 0.5334696769714355, + "learning_rate": 3.131887766535684e-05, + "loss": 0.9276, + "num_input_tokens_seen": 76185600, + "step": 9300 + }, + { + "epoch": 0.4470588235294118, + "grad_norm": 0.49361923336982727, + "learning_rate": 3.1280539532664366e-05, + "loss": 0.8235, + "num_input_tokens_seen": 76267520, + "step": 9310 + }, + { + "epoch": 0.4475390156062425, + "grad_norm": 0.4922276735305786, + "learning_rate": 3.1242185628573e-05, + "loss": 0.8764, + "num_input_tokens_seen": 76349440, + "step": 9320 + }, + { + "epoch": 0.44801920768307324, + "grad_norm": 0.5455789566040039, + "learning_rate": 3.120381604939529e-05, + "loss": 0.8174, + "num_input_tokens_seen": 76431360, + "step": 9330 + }, + { + "epoch": 0.44849939975990394, + "grad_norm": 0.5186362862586975, + "learning_rate": 3.116543089148312e-05, + "loss": 0.9815, + "num_input_tokens_seen": 76513280, + "step": 9340 + }, + { + "epoch": 0.4489795918367347, + "grad_norm": 0.5068888664245605, + "learning_rate": 3.112703025122754e-05, + "loss": 0.7665, + "num_input_tokens_seen": 76595200, + "step": 9350 + }, + { + "epoch": 0.44945978391356545, + "grad_norm": 0.5032262206077576, + "learning_rate": 3.108861422505842e-05, + "loss": 0.7452, + "num_input_tokens_seen": 76677120, + "step": 9360 + }, + { + "epoch": 0.44993997599039615, + "grad_norm": 0.4863367974758148, + "learning_rate": 3.105018290944432e-05, + "loss": 0.9044, + "num_input_tokens_seen": 76759040, + "step": 9370 + }, + { + "epoch": 0.4504201680672269, + "grad_norm": 0.49691611528396606, + "learning_rate": 3.1011736400892175e-05, + "loss": 0.8608, + "num_input_tokens_seen": 76840960, + "step": 9380 + }, + { + "epoch": 0.4509003601440576, + "grad_norm": 0.40895968675613403, + "learning_rate": 3.097327479594707e-05, + "loss": 0.7893, + "num_input_tokens_seen": 76922880, + "step": 9390 + }, + { + "epoch": 0.45138055222088835, + "grad_norm": 0.4914085865020752, + "learning_rate": 3.093479819119198e-05, + "loss": 0.935, + "num_input_tokens_seen": 77004800, + "step": 9400 + }, + { + "epoch": 0.4518607442977191, + "grad_norm": 0.4737802743911743, + "learning_rate": 3.089630668324759e-05, + "loss": 0.92, + "num_input_tokens_seen": 77086720, + "step": 9410 + }, + { + "epoch": 0.4523409363745498, + "grad_norm": 0.4802420437335968, + "learning_rate": 3.085780036877197e-05, + "loss": 0.8323, + "num_input_tokens_seen": 77168640, + "step": 9420 + }, + { + "epoch": 0.45282112845138056, + "grad_norm": 0.8194192051887512, + "learning_rate": 3.0819279344460396e-05, + "loss": 0.8059, + "num_input_tokens_seen": 77250560, + "step": 9430 + }, + { + "epoch": 0.45330132052821126, + "grad_norm": 0.500428318977356, + "learning_rate": 3.078074370704507e-05, + "loss": 0.9285, + "num_input_tokens_seen": 77332480, + "step": 9440 + }, + { + "epoch": 0.453781512605042, + "grad_norm": 0.6494525671005249, + "learning_rate": 3.07421935532949e-05, + "loss": 0.8807, + "num_input_tokens_seen": 77414400, + "step": 9450 + }, + { + "epoch": 0.45426170468187277, + "grad_norm": 0.6894603967666626, + "learning_rate": 3.0703628980015214e-05, + "loss": 0.9211, + "num_input_tokens_seen": 77496320, + "step": 9460 + }, + { + "epoch": 0.45474189675870347, + "grad_norm": 0.4977949261665344, + "learning_rate": 3.0665050084047605e-05, + "loss": 0.8221, + "num_input_tokens_seen": 77578240, + "step": 9470 + }, + { + "epoch": 0.4552220888355342, + "grad_norm": 0.49541839957237244, + "learning_rate": 3.062645696226959e-05, + "loss": 0.7467, + "num_input_tokens_seen": 77660160, + "step": 9480 + }, + { + "epoch": 0.4557022809123649, + "grad_norm": 0.48292815685272217, + "learning_rate": 3.0587849711594425e-05, + "loss": 0.982, + "num_input_tokens_seen": 77742080, + "step": 9490 + }, + { + "epoch": 0.4561824729891957, + "grad_norm": 0.5033644437789917, + "learning_rate": 3.054922842897084e-05, + "loss": 0.863, + "num_input_tokens_seen": 77824000, + "step": 9500 + }, + { + "epoch": 0.45666266506602643, + "grad_norm": 0.5042487978935242, + "learning_rate": 3.051059321138281e-05, + "loss": 1.0322, + "num_input_tokens_seen": 77905920, + "step": 9510 + }, + { + "epoch": 0.45714285714285713, + "grad_norm": 0.49270498752593994, + "learning_rate": 3.047194415584929e-05, + "loss": 0.9702, + "num_input_tokens_seen": 77987840, + "step": 9520 + }, + { + "epoch": 0.4576230492196879, + "grad_norm": 0.5053675174713135, + "learning_rate": 3.0433281359424008e-05, + "loss": 0.9051, + "num_input_tokens_seen": 78069760, + "step": 9530 + }, + { + "epoch": 0.4581032412965186, + "grad_norm": 0.4840089976787567, + "learning_rate": 3.0394604919195156e-05, + "loss": 0.785, + "num_input_tokens_seen": 78151680, + "step": 9540 + }, + { + "epoch": 0.45858343337334934, + "grad_norm": 0.4885089099407196, + "learning_rate": 3.0355914932285228e-05, + "loss": 0.9501, + "num_input_tokens_seen": 78233600, + "step": 9550 + }, + { + "epoch": 0.4590636254501801, + "grad_norm": 0.5052339434623718, + "learning_rate": 3.0317211495850717e-05, + "loss": 0.8565, + "num_input_tokens_seen": 78315520, + "step": 9560 + }, + { + "epoch": 0.4595438175270108, + "grad_norm": 0.49368199706077576, + "learning_rate": 3.02784947070819e-05, + "loss": 0.7823, + "num_input_tokens_seen": 78397440, + "step": 9570 + }, + { + "epoch": 0.46002400960384154, + "grad_norm": 0.48532119393348694, + "learning_rate": 3.0239764663202562e-05, + "loss": 0.9049, + "num_input_tokens_seen": 78479360, + "step": 9580 + }, + { + "epoch": 0.46050420168067224, + "grad_norm": 0.48147672414779663, + "learning_rate": 3.0201021461469803e-05, + "loss": 0.9584, + "num_input_tokens_seen": 78561280, + "step": 9590 + }, + { + "epoch": 0.460984393757503, + "grad_norm": 0.49447527527809143, + "learning_rate": 3.0162265199173738e-05, + "loss": 0.8946, + "num_input_tokens_seen": 78643200, + "step": 9600 + }, + { + "epoch": 0.46146458583433375, + "grad_norm": 0.44538614153862, + "learning_rate": 3.0123495973637305e-05, + "loss": 0.973, + "num_input_tokens_seen": 78725120, + "step": 9610 + }, + { + "epoch": 0.46194477791116445, + "grad_norm": 0.4778103530406952, + "learning_rate": 3.008471388221597e-05, + "loss": 0.8953, + "num_input_tokens_seen": 78807040, + "step": 9620 + }, + { + "epoch": 0.4624249699879952, + "grad_norm": 0.5161569714546204, + "learning_rate": 3.0045919022297524e-05, + "loss": 0.7189, + "num_input_tokens_seen": 78888960, + "step": 9630 + }, + { + "epoch": 0.4629051620648259, + "grad_norm": 0.47274622321128845, + "learning_rate": 3.0007111491301816e-05, + "loss": 0.8521, + "num_input_tokens_seen": 78970880, + "step": 9640 + }, + { + "epoch": 0.46338535414165666, + "grad_norm": 0.4988880753517151, + "learning_rate": 2.9968291386680503e-05, + "loss": 0.941, + "num_input_tokens_seen": 79052800, + "step": 9650 + }, + { + "epoch": 0.4638655462184874, + "grad_norm": 0.6255693435668945, + "learning_rate": 2.9929458805916837e-05, + "loss": 1.0642, + "num_input_tokens_seen": 79134720, + "step": 9660 + }, + { + "epoch": 0.4643457382953181, + "grad_norm": 0.39187878370285034, + "learning_rate": 2.9890613846525395e-05, + "loss": 0.7306, + "num_input_tokens_seen": 79216640, + "step": 9670 + }, + { + "epoch": 0.46482593037214887, + "grad_norm": 1.9925369024276733, + "learning_rate": 2.9851756606051817e-05, + "loss": 0.8926, + "num_input_tokens_seen": 79298560, + "step": 9680 + }, + { + "epoch": 0.46530612244897956, + "grad_norm": 0.48940789699554443, + "learning_rate": 2.9812887182072607e-05, + "loss": 0.9238, + "num_input_tokens_seen": 79380480, + "step": 9690 + }, + { + "epoch": 0.4657863145258103, + "grad_norm": 0.5379921197891235, + "learning_rate": 2.9774005672194854e-05, + "loss": 0.8351, + "num_input_tokens_seen": 79462400, + "step": 9700 + }, + { + "epoch": 0.4662665066026411, + "grad_norm": 0.5129654407501221, + "learning_rate": 2.9735112174056006e-05, + "loss": 0.8861, + "num_input_tokens_seen": 79544320, + "step": 9710 + }, + { + "epoch": 0.46674669867947177, + "grad_norm": 0.4713475704193115, + "learning_rate": 2.96962067853236e-05, + "loss": 0.8593, + "num_input_tokens_seen": 79626240, + "step": 9720 + }, + { + "epoch": 0.4672268907563025, + "grad_norm": 4.391692161560059, + "learning_rate": 2.9657289603695037e-05, + "loss": 0.8369, + "num_input_tokens_seen": 79708160, + "step": 9730 + }, + { + "epoch": 0.4677070828331333, + "grad_norm": 0.5324821472167969, + "learning_rate": 2.9618360726897344e-05, + "loss": 1.3, + "num_input_tokens_seen": 79790080, + "step": 9740 + }, + { + "epoch": 0.468187274909964, + "grad_norm": 0.5004756450653076, + "learning_rate": 2.957942025268689e-05, + "loss": 0.9522, + "num_input_tokens_seen": 79872000, + "step": 9750 + }, + { + "epoch": 0.46866746698679473, + "grad_norm": 0.5157467722892761, + "learning_rate": 2.9540468278849208e-05, + "loss": 0.8728, + "num_input_tokens_seen": 79953920, + "step": 9760 + }, + { + "epoch": 0.46914765906362543, + "grad_norm": 0.4873751699924469, + "learning_rate": 2.950150490319866e-05, + "loss": 0.8798, + "num_input_tokens_seen": 80035840, + "step": 9770 + }, + { + "epoch": 0.4696278511404562, + "grad_norm": 0.9361140727996826, + "learning_rate": 2.9462530223578273e-05, + "loss": 0.9698, + "num_input_tokens_seen": 80117760, + "step": 9780 + }, + { + "epoch": 0.47010804321728694, + "grad_norm": 0.5102428197860718, + "learning_rate": 2.9423544337859454e-05, + "loss": 0.8272, + "num_input_tokens_seen": 80199680, + "step": 9790 + }, + { + "epoch": 0.47058823529411764, + "grad_norm": 0.5207327008247375, + "learning_rate": 2.938454734394174e-05, + "loss": 0.8527, + "num_input_tokens_seen": 80281600, + "step": 9800 + }, + { + "epoch": 0.4710684273709484, + "grad_norm": 0.5059601068496704, + "learning_rate": 2.9345539339752575e-05, + "loss": 0.9777, + "num_input_tokens_seen": 80363520, + "step": 9810 + }, + { + "epoch": 0.4715486194477791, + "grad_norm": 0.5397816896438599, + "learning_rate": 2.9306520423247045e-05, + "loss": 0.8295, + "num_input_tokens_seen": 80445440, + "step": 9820 + }, + { + "epoch": 0.47202881152460985, + "grad_norm": 0.4623885750770569, + "learning_rate": 2.9267490692407635e-05, + "loss": 0.9394, + "num_input_tokens_seen": 80527360, + "step": 9830 + }, + { + "epoch": 0.4725090036014406, + "grad_norm": 0.48748502135276794, + "learning_rate": 2.9228450245243993e-05, + "loss": 0.9649, + "num_input_tokens_seen": 80609280, + "step": 9840 + }, + { + "epoch": 0.4729891956782713, + "grad_norm": 0.493064820766449, + "learning_rate": 2.9189399179792676e-05, + "loss": 1.1218, + "num_input_tokens_seen": 80691200, + "step": 9850 + }, + { + "epoch": 0.47346938775510206, + "grad_norm": 0.5036744475364685, + "learning_rate": 2.9150337594116904e-05, + "loss": 0.9087, + "num_input_tokens_seen": 80773120, + "step": 9860 + }, + { + "epoch": 0.47394957983193275, + "grad_norm": 0.5360898971557617, + "learning_rate": 2.9111265586306314e-05, + "loss": 0.849, + "num_input_tokens_seen": 80855040, + "step": 9870 + }, + { + "epoch": 0.4744297719087635, + "grad_norm": 0.521198034286499, + "learning_rate": 2.9072183254476713e-05, + "loss": 0.8285, + "num_input_tokens_seen": 80936960, + "step": 9880 + }, + { + "epoch": 0.47490996398559426, + "grad_norm": 0.4821554720401764, + "learning_rate": 2.903309069676984e-05, + "loss": 0.9502, + "num_input_tokens_seen": 81018880, + "step": 9890 + }, + { + "epoch": 0.47539015606242496, + "grad_norm": 0.483500212430954, + "learning_rate": 2.8993988011353112e-05, + "loss": 0.8596, + "num_input_tokens_seen": 81100800, + "step": 9900 + }, + { + "epoch": 0.4758703481392557, + "grad_norm": 0.5084524750709534, + "learning_rate": 2.8954875296419364e-05, + "loss": 0.8992, + "num_input_tokens_seen": 81182720, + "step": 9910 + }, + { + "epoch": 0.4763505402160864, + "grad_norm": 0.5362197160720825, + "learning_rate": 2.8915752650186635e-05, + "loss": 0.9792, + "num_input_tokens_seen": 81264640, + "step": 9920 + }, + { + "epoch": 0.47683073229291717, + "grad_norm": 0.5296033024787903, + "learning_rate": 2.8876620170897895e-05, + "loss": 0.7938, + "num_input_tokens_seen": 81346560, + "step": 9930 + }, + { + "epoch": 0.4773109243697479, + "grad_norm": 0.5047898888587952, + "learning_rate": 2.88374779568208e-05, + "loss": 0.8632, + "num_input_tokens_seen": 81428480, + "step": 9940 + }, + { + "epoch": 0.4777911164465786, + "grad_norm": 0.5180176496505737, + "learning_rate": 2.879832610624747e-05, + "loss": 0.8698, + "num_input_tokens_seen": 81510400, + "step": 9950 + }, + { + "epoch": 0.4782713085234094, + "grad_norm": 0.4948786795139313, + "learning_rate": 2.8759164717494202e-05, + "loss": 0.8159, + "num_input_tokens_seen": 81592320, + "step": 9960 + }, + { + "epoch": 0.4787515006002401, + "grad_norm": 0.5000596046447754, + "learning_rate": 2.8719993888901258e-05, + "loss": 1.0499, + "num_input_tokens_seen": 81674240, + "step": 9970 + }, + { + "epoch": 0.47923169267707083, + "grad_norm": 0.5099364519119263, + "learning_rate": 2.86808137188326e-05, + "loss": 0.8724, + "num_input_tokens_seen": 81756160, + "step": 9980 + }, + { + "epoch": 0.4797118847539016, + "grad_norm": 0.48814287781715393, + "learning_rate": 2.8641624305675657e-05, + "loss": 0.9881, + "num_input_tokens_seen": 81838080, + "step": 9990 + }, + { + "epoch": 0.4801920768307323, + "grad_norm": 0.46351805329322815, + "learning_rate": 2.8602425747841057e-05, + "loss": 0.7335, + "num_input_tokens_seen": 81920000, + "step": 10000 + }, + { + "epoch": 0.48067226890756304, + "grad_norm": 0.4863956868648529, + "learning_rate": 2.8563218143762383e-05, + "loss": 0.8356, + "num_input_tokens_seen": 82001920, + "step": 10010 + }, + { + "epoch": 0.48115246098439374, + "grad_norm": 0.5032678246498108, + "learning_rate": 2.852400159189597e-05, + "loss": 0.8458, + "num_input_tokens_seen": 82083840, + "step": 10020 + }, + { + "epoch": 0.4816326530612245, + "grad_norm": 0.4915202558040619, + "learning_rate": 2.848477619072059e-05, + "loss": 0.995, + "num_input_tokens_seen": 82165760, + "step": 10030 + }, + { + "epoch": 0.48211284513805525, + "grad_norm": 0.9962053894996643, + "learning_rate": 2.8445542038737245e-05, + "loss": 1.0218, + "num_input_tokens_seen": 82247680, + "step": 10040 + }, + { + "epoch": 0.48259303721488594, + "grad_norm": 0.49556922912597656, + "learning_rate": 2.8406299234468915e-05, + "loss": 0.9613, + "num_input_tokens_seen": 82329600, + "step": 10050 + }, + { + "epoch": 0.4830732292917167, + "grad_norm": 0.5691222548484802, + "learning_rate": 2.8367047876460305e-05, + "loss": 1.0402, + "num_input_tokens_seen": 82411520, + "step": 10060 + }, + { + "epoch": 0.4835534213685474, + "grad_norm": 0.4867800176143646, + "learning_rate": 2.8327788063277594e-05, + "loss": 0.9294, + "num_input_tokens_seen": 82493440, + "step": 10070 + }, + { + "epoch": 0.48403361344537815, + "grad_norm": 0.5041534304618835, + "learning_rate": 2.8288519893508212e-05, + "loss": 0.8672, + "num_input_tokens_seen": 82575360, + "step": 10080 + }, + { + "epoch": 0.4845138055222089, + "grad_norm": 0.44031843543052673, + "learning_rate": 2.8249243465760566e-05, + "loss": 0.8052, + "num_input_tokens_seen": 82657280, + "step": 10090 + }, + { + "epoch": 0.4849939975990396, + "grad_norm": 0.48101985454559326, + "learning_rate": 2.8209958878663778e-05, + "loss": 0.8885, + "num_input_tokens_seen": 82739200, + "step": 10100 + }, + { + "epoch": 0.48547418967587036, + "grad_norm": 1.5555214881896973, + "learning_rate": 2.817066623086748e-05, + "loss": 0.8561, + "num_input_tokens_seen": 82821120, + "step": 10110 + }, + { + "epoch": 0.48595438175270106, + "grad_norm": 0.48857659101486206, + "learning_rate": 2.813136562104155e-05, + "loss": 0.8841, + "num_input_tokens_seen": 82903040, + "step": 10120 + }, + { + "epoch": 0.4864345738295318, + "grad_norm": 0.49165698885917664, + "learning_rate": 2.8092057147875856e-05, + "loss": 0.9224, + "num_input_tokens_seen": 82984960, + "step": 10130 + }, + { + "epoch": 0.48691476590636257, + "grad_norm": 0.5225831270217896, + "learning_rate": 2.8052740910079994e-05, + "loss": 0.9768, + "num_input_tokens_seen": 83066880, + "step": 10140 + }, + { + "epoch": 0.48739495798319327, + "grad_norm": 0.5045091509819031, + "learning_rate": 2.8013417006383076e-05, + "loss": 0.9255, + "num_input_tokens_seen": 83148800, + "step": 10150 + }, + { + "epoch": 0.487875150060024, + "grad_norm": 0.5114516615867615, + "learning_rate": 2.7974085535533473e-05, + "loss": 0.8565, + "num_input_tokens_seen": 83230720, + "step": 10160 + }, + { + "epoch": 0.4883553421368547, + "grad_norm": 0.5040556788444519, + "learning_rate": 2.7934746596298535e-05, + "loss": 1.0491, + "num_input_tokens_seen": 83312640, + "step": 10170 + }, + { + "epoch": 0.4888355342136855, + "grad_norm": 1.6331427097320557, + "learning_rate": 2.789540028746438e-05, + "loss": 0.8835, + "num_input_tokens_seen": 83394560, + "step": 10180 + }, + { + "epoch": 0.48931572629051623, + "grad_norm": 0.6383754014968872, + "learning_rate": 2.785604670783563e-05, + "loss": 0.9588, + "num_input_tokens_seen": 83476480, + "step": 10190 + }, + { + "epoch": 0.4897959183673469, + "grad_norm": 0.5155805349349976, + "learning_rate": 2.7816685956235165e-05, + "loss": 0.7832, + "num_input_tokens_seen": 83558400, + "step": 10200 + }, + { + "epoch": 0.4902761104441777, + "grad_norm": 0.5443750619888306, + "learning_rate": 2.7777318131503873e-05, + "loss": 0.9374, + "num_input_tokens_seen": 83640320, + "step": 10210 + }, + { + "epoch": 0.4907563025210084, + "grad_norm": 0.4726339280605316, + "learning_rate": 2.773794333250041e-05, + "loss": 0.767, + "num_input_tokens_seen": 83722240, + "step": 10220 + }, + { + "epoch": 0.49123649459783914, + "grad_norm": 0.4907895028591156, + "learning_rate": 2.769856165810093e-05, + "loss": 0.9588, + "num_input_tokens_seen": 83804160, + "step": 10230 + }, + { + "epoch": 0.4917166866746699, + "grad_norm": 0.5048187375068665, + "learning_rate": 2.765917320719887e-05, + "loss": 0.929, + "num_input_tokens_seen": 83886080, + "step": 10240 + }, + { + "epoch": 0.4921968787515006, + "grad_norm": 0.4887201189994812, + "learning_rate": 2.7619778078704685e-05, + "loss": 0.8619, + "num_input_tokens_seen": 83968000, + "step": 10250 + }, + { + "epoch": 0.49267707082833134, + "grad_norm": 0.5038937926292419, + "learning_rate": 2.7580376371545573e-05, + "loss": 0.7223, + "num_input_tokens_seen": 84049920, + "step": 10260 + }, + { + "epoch": 0.49315726290516204, + "grad_norm": 0.505540668964386, + "learning_rate": 2.7540968184665283e-05, + "loss": 0.8415, + "num_input_tokens_seen": 84131840, + "step": 10270 + }, + { + "epoch": 0.4936374549819928, + "grad_norm": 0.4871721565723419, + "learning_rate": 2.7501553617023816e-05, + "loss": 0.9001, + "num_input_tokens_seen": 84213760, + "step": 10280 + }, + { + "epoch": 0.49411764705882355, + "grad_norm": 0.4944019019603729, + "learning_rate": 2.7462132767597205e-05, + "loss": 0.8124, + "num_input_tokens_seen": 84295680, + "step": 10290 + }, + { + "epoch": 0.49459783913565425, + "grad_norm": 0.5085113644599915, + "learning_rate": 2.742270573537724e-05, + "loss": 0.8611, + "num_input_tokens_seen": 84377600, + "step": 10300 + }, + { + "epoch": 0.495078031212485, + "grad_norm": 0.4947553277015686, + "learning_rate": 2.7383272619371276e-05, + "loss": 0.9993, + "num_input_tokens_seen": 84459520, + "step": 10310 + }, + { + "epoch": 0.4955582232893157, + "grad_norm": 0.4945763945579529, + "learning_rate": 2.7343833518601913e-05, + "loss": 0.8757, + "num_input_tokens_seen": 84541440, + "step": 10320 + }, + { + "epoch": 0.49603841536614646, + "grad_norm": 0.47624143958091736, + "learning_rate": 2.7304388532106768e-05, + "loss": 1.1208, + "num_input_tokens_seen": 84623360, + "step": 10330 + }, + { + "epoch": 0.4965186074429772, + "grad_norm": 0.5344643592834473, + "learning_rate": 2.726493775893828e-05, + "loss": 0.7873, + "num_input_tokens_seen": 84705280, + "step": 10340 + }, + { + "epoch": 0.4969987995198079, + "grad_norm": 0.5042626261711121, + "learning_rate": 2.7225481298163387e-05, + "loss": 0.9001, + "num_input_tokens_seen": 84787200, + "step": 10350 + }, + { + "epoch": 0.49747899159663866, + "grad_norm": 0.4883025884628296, + "learning_rate": 2.718601924886332e-05, + "loss": 1.0793, + "num_input_tokens_seen": 84869120, + "step": 10360 + }, + { + "epoch": 0.49795918367346936, + "grad_norm": 0.512522280216217, + "learning_rate": 2.7146551710133346e-05, + "loss": 0.8888, + "num_input_tokens_seen": 84951040, + "step": 10370 + }, + { + "epoch": 0.4984393757503001, + "grad_norm": 0.47969046235084534, + "learning_rate": 2.7107078781082508e-05, + "loss": 0.8732, + "num_input_tokens_seen": 85032960, + "step": 10380 + }, + { + "epoch": 0.49891956782713087, + "grad_norm": 0.5086472034454346, + "learning_rate": 2.7067600560833384e-05, + "loss": 0.9422, + "num_input_tokens_seen": 85114880, + "step": 10390 + }, + { + "epoch": 0.49939975990396157, + "grad_norm": 0.5234156250953674, + "learning_rate": 2.7028117148521863e-05, + "loss": 0.8577, + "num_input_tokens_seen": 85196800, + "step": 10400 + }, + { + "epoch": 0.4998799519807923, + "grad_norm": 0.8362540006637573, + "learning_rate": 2.698862864329685e-05, + "loss": 0.9081, + "num_input_tokens_seen": 85278720, + "step": 10410 + }, + { + "epoch": 0.5003601440576231, + "grad_norm": 0.5078486800193787, + "learning_rate": 2.6949135144320026e-05, + "loss": 0.9876, + "num_input_tokens_seen": 85360640, + "step": 10420 + }, + { + "epoch": 0.5008403361344538, + "grad_norm": 0.49120739102363586, + "learning_rate": 2.6909636750765653e-05, + "loss": 0.8621, + "num_input_tokens_seen": 85442560, + "step": 10430 + }, + { + "epoch": 0.5013205282112845, + "grad_norm": 0.5183668732643127, + "learning_rate": 2.6870133561820243e-05, + "loss": 0.9355, + "num_input_tokens_seen": 85524480, + "step": 10440 + }, + { + "epoch": 0.5018007202881153, + "grad_norm": 0.8437354564666748, + "learning_rate": 2.683062567668238e-05, + "loss": 1.0307, + "num_input_tokens_seen": 85606400, + "step": 10450 + }, + { + "epoch": 0.502280912364946, + "grad_norm": 0.4748769700527191, + "learning_rate": 2.679111319456242e-05, + "loss": 0.8886, + "num_input_tokens_seen": 85688320, + "step": 10460 + }, + { + "epoch": 0.5027611044417767, + "grad_norm": 0.4411871135234833, + "learning_rate": 2.6751596214682278e-05, + "loss": 0.6568, + "num_input_tokens_seen": 85770240, + "step": 10470 + }, + { + "epoch": 0.5032412965186075, + "grad_norm": 0.3230482041835785, + "learning_rate": 2.671207483627515e-05, + "loss": 0.9178, + "num_input_tokens_seen": 85852160, + "step": 10480 + }, + { + "epoch": 0.5037214885954382, + "grad_norm": 0.6780135631561279, + "learning_rate": 2.6672549158585293e-05, + "loss": 0.832, + "num_input_tokens_seen": 85934080, + "step": 10490 + }, + { + "epoch": 0.5042016806722689, + "grad_norm": 0.47888603806495667, + "learning_rate": 2.663301928086774e-05, + "loss": 0.7443, + "num_input_tokens_seen": 86016000, + "step": 10500 + }, + { + "epoch": 0.5046818727490996, + "grad_norm": 0.4726419448852539, + "learning_rate": 2.6593485302388087e-05, + "loss": 0.8535, + "num_input_tokens_seen": 86097920, + "step": 10510 + }, + { + "epoch": 0.5051620648259304, + "grad_norm": 0.4687502980232239, + "learning_rate": 2.6553947322422223e-05, + "loss": 0.9447, + "num_input_tokens_seen": 86179840, + "step": 10520 + }, + { + "epoch": 0.5056422569027611, + "grad_norm": 0.5022156834602356, + "learning_rate": 2.6514405440256086e-05, + "loss": 0.8657, + "num_input_tokens_seen": 86261760, + "step": 10530 + }, + { + "epoch": 0.5061224489795918, + "grad_norm": 0.49966397881507874, + "learning_rate": 2.6474859755185415e-05, + "loss": 0.9218, + "num_input_tokens_seen": 86343680, + "step": 10540 + }, + { + "epoch": 0.5066026410564226, + "grad_norm": 0.4754197597503662, + "learning_rate": 2.6435310366515498e-05, + "loss": 0.9897, + "num_input_tokens_seen": 86425600, + "step": 10550 + }, + { + "epoch": 0.5070828331332533, + "grad_norm": 0.4898022711277008, + "learning_rate": 2.6395757373560904e-05, + "loss": 1.0327, + "num_input_tokens_seen": 86507520, + "step": 10560 + }, + { + "epoch": 0.507563025210084, + "grad_norm": 0.49128857254981995, + "learning_rate": 2.6356200875645287e-05, + "loss": 0.9704, + "num_input_tokens_seen": 86589440, + "step": 10570 + }, + { + "epoch": 0.5080432172869148, + "grad_norm": 0.49930843710899353, + "learning_rate": 2.631664097210108e-05, + "loss": 0.7818, + "num_input_tokens_seen": 86671360, + "step": 10580 + }, + { + "epoch": 0.5085234093637455, + "grad_norm": 2.420948028564453, + "learning_rate": 2.6277077762269276e-05, + "loss": 0.717, + "num_input_tokens_seen": 86753280, + "step": 10590 + }, + { + "epoch": 0.5090036014405762, + "grad_norm": 0.49312740564346313, + "learning_rate": 2.6237511345499167e-05, + "loss": 0.8514, + "num_input_tokens_seen": 86835200, + "step": 10600 + }, + { + "epoch": 0.5094837935174069, + "grad_norm": 0.5390229225158691, + "learning_rate": 2.61979418211481e-05, + "loss": 0.8897, + "num_input_tokens_seen": 86917120, + "step": 10610 + }, + { + "epoch": 0.5099639855942377, + "grad_norm": 0.5236647129058838, + "learning_rate": 2.615836928858122e-05, + "loss": 0.9499, + "num_input_tokens_seen": 86999040, + "step": 10620 + }, + { + "epoch": 0.5104441776710684, + "grad_norm": 0.48277348279953003, + "learning_rate": 2.6118793847171236e-05, + "loss": 0.7749, + "num_input_tokens_seen": 87080960, + "step": 10630 + }, + { + "epoch": 0.5109243697478991, + "grad_norm": 0.5363568067550659, + "learning_rate": 2.607921559629816e-05, + "loss": 1.0117, + "num_input_tokens_seen": 87162880, + "step": 10640 + }, + { + "epoch": 0.5114045618247299, + "grad_norm": 0.47179368138313293, + "learning_rate": 2.6039634635349043e-05, + "loss": 0.9039, + "num_input_tokens_seen": 87244800, + "step": 10650 + }, + { + "epoch": 0.5118847539015606, + "grad_norm": 0.4801013171672821, + "learning_rate": 2.6000051063717767e-05, + "loss": 0.8813, + "num_input_tokens_seen": 87326720, + "step": 10660 + }, + { + "epoch": 0.5123649459783913, + "grad_norm": 0.48771876096725464, + "learning_rate": 2.596046498080475e-05, + "loss": 0.856, + "num_input_tokens_seen": 87408640, + "step": 10670 + }, + { + "epoch": 0.5128451380552221, + "grad_norm": 0.4822755455970764, + "learning_rate": 2.5920876486016726e-05, + "loss": 0.9129, + "num_input_tokens_seen": 87490560, + "step": 10680 + }, + { + "epoch": 0.5133253301320528, + "grad_norm": 0.5060322880744934, + "learning_rate": 2.5881285678766482e-05, + "loss": 0.8935, + "num_input_tokens_seen": 87572480, + "step": 10690 + }, + { + "epoch": 0.5138055222088835, + "grad_norm": 0.7022219300270081, + "learning_rate": 2.5841692658472617e-05, + "loss": 0.8172, + "num_input_tokens_seen": 87654400, + "step": 10700 + }, + { + "epoch": 0.5142857142857142, + "grad_norm": 0.5739632844924927, + "learning_rate": 2.5802097524559264e-05, + "loss": 1.0935, + "num_input_tokens_seen": 87736320, + "step": 10710 + }, + { + "epoch": 0.514765906362545, + "grad_norm": 0.478157103061676, + "learning_rate": 2.5762500376455912e-05, + "loss": 0.8728, + "num_input_tokens_seen": 87818240, + "step": 10720 + }, + { + "epoch": 0.5152460984393757, + "grad_norm": 0.5317462086677551, + "learning_rate": 2.5722901313597052e-05, + "loss": 0.8416, + "num_input_tokens_seen": 87900160, + "step": 10730 + }, + { + "epoch": 0.5157262905162064, + "grad_norm": 0.4968595802783966, + "learning_rate": 2.5683300435422032e-05, + "loss": 0.8828, + "num_input_tokens_seen": 87982080, + "step": 10740 + }, + { + "epoch": 0.5162064825930373, + "grad_norm": 0.5193814039230347, + "learning_rate": 2.564369784137472e-05, + "loss": 0.8147, + "num_input_tokens_seen": 88064000, + "step": 10750 + }, + { + "epoch": 0.516686674669868, + "grad_norm": 0.5038419365882874, + "learning_rate": 2.5604093630903307e-05, + "loss": 0.8245, + "num_input_tokens_seen": 88145920, + "step": 10760 + }, + { + "epoch": 0.5171668667466987, + "grad_norm": 0.4722188413143158, + "learning_rate": 2.556448790346006e-05, + "loss": 0.7478, + "num_input_tokens_seen": 88227840, + "step": 10770 + }, + { + "epoch": 0.5176470588235295, + "grad_norm": 0.48475295305252075, + "learning_rate": 2.5524880758501035e-05, + "loss": 1.0129, + "num_input_tokens_seen": 88309760, + "step": 10780 + }, + { + "epoch": 0.5181272509003602, + "grad_norm": 1.4112732410430908, + "learning_rate": 2.5485272295485846e-05, + "loss": 0.9362, + "num_input_tokens_seen": 88391680, + "step": 10790 + }, + { + "epoch": 0.5186074429771909, + "grad_norm": 0.484065979719162, + "learning_rate": 2.544566261387743e-05, + "loss": 0.756, + "num_input_tokens_seen": 88473600, + "step": 10800 + }, + { + "epoch": 0.5190876350540216, + "grad_norm": 0.2988108992576599, + "learning_rate": 2.5406051813141773e-05, + "loss": 0.843, + "num_input_tokens_seen": 88555520, + "step": 10810 + }, + { + "epoch": 0.5195678271308524, + "grad_norm": 0.4882756471633911, + "learning_rate": 2.5366439992747688e-05, + "loss": 0.8411, + "num_input_tokens_seen": 88637440, + "step": 10820 + }, + { + "epoch": 0.5200480192076831, + "grad_norm": 0.4603646695613861, + "learning_rate": 2.5326827252166523e-05, + "loss": 0.8732, + "num_input_tokens_seen": 88719360, + "step": 10830 + }, + { + "epoch": 0.5205282112845138, + "grad_norm": 0.5258098244667053, + "learning_rate": 2.5287213690871957e-05, + "loss": 0.8446, + "num_input_tokens_seen": 88801280, + "step": 10840 + }, + { + "epoch": 0.5210084033613446, + "grad_norm": 0.5720292925834656, + "learning_rate": 2.5247599408339723e-05, + "loss": 0.8941, + "num_input_tokens_seen": 88883200, + "step": 10850 + }, + { + "epoch": 0.5214885954381753, + "grad_norm": 0.517301619052887, + "learning_rate": 2.5207984504047365e-05, + "loss": 0.8391, + "num_input_tokens_seen": 88965120, + "step": 10860 + }, + { + "epoch": 0.521968787515006, + "grad_norm": 0.569545567035675, + "learning_rate": 2.5168369077474004e-05, + "loss": 0.7663, + "num_input_tokens_seen": 89047040, + "step": 10870 + }, + { + "epoch": 0.5224489795918368, + "grad_norm": 0.5130712389945984, + "learning_rate": 2.512875322810002e-05, + "loss": 0.9436, + "num_input_tokens_seen": 89128960, + "step": 10880 + }, + { + "epoch": 0.5229291716686675, + "grad_norm": 0.531822144985199, + "learning_rate": 2.508913705540693e-05, + "loss": 0.8914, + "num_input_tokens_seen": 89210880, + "step": 10890 + }, + { + "epoch": 0.5234093637454982, + "grad_norm": 0.6732114553451538, + "learning_rate": 2.504952065887701e-05, + "loss": 0.8412, + "num_input_tokens_seen": 89292800, + "step": 10900 + }, + { + "epoch": 0.5238895558223289, + "grad_norm": 0.5019482970237732, + "learning_rate": 2.5009904137993106e-05, + "loss": 0.9038, + "num_input_tokens_seen": 89374720, + "step": 10910 + }, + { + "epoch": 0.5243697478991597, + "grad_norm": 2.033942699432373, + "learning_rate": 2.497028759223839e-05, + "loss": 1.0436, + "num_input_tokens_seen": 89456640, + "step": 10920 + }, + { + "epoch": 0.5248499399759904, + "grad_norm": 0.5160328149795532, + "learning_rate": 2.4930671121096105e-05, + "loss": 0.9828, + "num_input_tokens_seen": 89538560, + "step": 10930 + }, + { + "epoch": 0.5253301320528211, + "grad_norm": 0.5123900175094604, + "learning_rate": 2.4891054824049264e-05, + "loss": 0.7117, + "num_input_tokens_seen": 89620480, + "step": 10940 + }, + { + "epoch": 0.5258103241296519, + "grad_norm": 0.5120553970336914, + "learning_rate": 2.485143880058049e-05, + "loss": 0.8639, + "num_input_tokens_seen": 89702400, + "step": 10950 + }, + { + "epoch": 0.5262905162064826, + "grad_norm": 0.5253350734710693, + "learning_rate": 2.4811823150171692e-05, + "loss": 0.8927, + "num_input_tokens_seen": 89784320, + "step": 10960 + }, + { + "epoch": 0.5267707082833133, + "grad_norm": 0.7501769661903381, + "learning_rate": 2.477220797230385e-05, + "loss": 1.0442, + "num_input_tokens_seen": 89866240, + "step": 10970 + }, + { + "epoch": 0.5272509003601441, + "grad_norm": 1.2171481847763062, + "learning_rate": 2.4732593366456755e-05, + "loss": 1.1357, + "num_input_tokens_seen": 89948160, + "step": 10980 + }, + { + "epoch": 0.5277310924369748, + "grad_norm": 0.5957512259483337, + "learning_rate": 2.4692979432108777e-05, + "loss": 0.9067, + "num_input_tokens_seen": 90030080, + "step": 10990 + }, + { + "epoch": 0.5282112845138055, + "grad_norm": 0.5080304741859436, + "learning_rate": 2.4653366268736565e-05, + "loss": 1.0397, + "num_input_tokens_seen": 90112000, + "step": 11000 + }, + { + "epoch": 0.5286914765906362, + "grad_norm": 0.46417123079299927, + "learning_rate": 2.461375397581487e-05, + "loss": 0.7986, + "num_input_tokens_seen": 90193920, + "step": 11010 + }, + { + "epoch": 0.529171668667467, + "grad_norm": 0.5016286373138428, + "learning_rate": 2.4574142652816238e-05, + "loss": 0.8494, + "num_input_tokens_seen": 90275840, + "step": 11020 + }, + { + "epoch": 0.5296518607442977, + "grad_norm": 0.6602762937545776, + "learning_rate": 2.453453239921077e-05, + "loss": 0.75, + "num_input_tokens_seen": 90357760, + "step": 11030 + }, + { + "epoch": 0.5301320528211284, + "grad_norm": 0.520213782787323, + "learning_rate": 2.44949233144659e-05, + "loss": 0.7564, + "num_input_tokens_seen": 90439680, + "step": 11040 + }, + { + "epoch": 0.5306122448979592, + "grad_norm": 0.49561575055122375, + "learning_rate": 2.4455315498046134e-05, + "loss": 0.8193, + "num_input_tokens_seen": 90521600, + "step": 11050 + }, + { + "epoch": 0.5310924369747899, + "grad_norm": 0.5517764687538147, + "learning_rate": 2.4415709049412757e-05, + "loss": 0.8752, + "num_input_tokens_seen": 90603520, + "step": 11060 + }, + { + "epoch": 0.5315726290516206, + "grad_norm": 0.5008801817893982, + "learning_rate": 2.437610406802365e-05, + "loss": 0.8828, + "num_input_tokens_seen": 90685440, + "step": 11070 + }, + { + "epoch": 0.5320528211284514, + "grad_norm": 0.46303442120552063, + "learning_rate": 2.4336500653333012e-05, + "loss": 0.9908, + "num_input_tokens_seen": 90767360, + "step": 11080 + }, + { + "epoch": 0.5325330132052821, + "grad_norm": 0.49256837368011475, + "learning_rate": 2.4296898904791076e-05, + "loss": 0.9681, + "num_input_tokens_seen": 90849280, + "step": 11090 + }, + { + "epoch": 0.5330132052821128, + "grad_norm": 0.5212172269821167, + "learning_rate": 2.425729892184393e-05, + "loss": 0.8629, + "num_input_tokens_seen": 90931200, + "step": 11100 + }, + { + "epoch": 0.5334933973589436, + "grad_norm": 0.49792659282684326, + "learning_rate": 2.421770080393321e-05, + "loss": 1.1662, + "num_input_tokens_seen": 91013120, + "step": 11110 + }, + { + "epoch": 0.5339735894357743, + "grad_norm": 0.5083081126213074, + "learning_rate": 2.417810465049585e-05, + "loss": 0.9811, + "num_input_tokens_seen": 91095040, + "step": 11120 + }, + { + "epoch": 0.534453781512605, + "grad_norm": 1.1411739587783813, + "learning_rate": 2.413851056096388e-05, + "loss": 0.7801, + "num_input_tokens_seen": 91176960, + "step": 11130 + }, + { + "epoch": 0.5349339735894357, + "grad_norm": 0.4804481863975525, + "learning_rate": 2.4098918634764153e-05, + "loss": 0.8704, + "num_input_tokens_seen": 91258880, + "step": 11140 + }, + { + "epoch": 0.5354141656662665, + "grad_norm": 0.47054970264434814, + "learning_rate": 2.4059328971318053e-05, + "loss": 1.0149, + "num_input_tokens_seen": 91340800, + "step": 11150 + }, + { + "epoch": 0.5358943577430972, + "grad_norm": 1.7846838235855103, + "learning_rate": 2.4019741670041305e-05, + "loss": 0.9053, + "num_input_tokens_seen": 91422720, + "step": 11160 + }, + { + "epoch": 0.5363745498199279, + "grad_norm": 0.48844608664512634, + "learning_rate": 2.398015683034371e-05, + "loss": 0.966, + "num_input_tokens_seen": 91504640, + "step": 11170 + }, + { + "epoch": 0.5368547418967587, + "grad_norm": 0.49595731496810913, + "learning_rate": 2.394057455162886e-05, + "loss": 0.696, + "num_input_tokens_seen": 91586560, + "step": 11180 + }, + { + "epoch": 0.5373349339735894, + "grad_norm": 0.49253153800964355, + "learning_rate": 2.3900994933293953e-05, + "loss": 0.7898, + "num_input_tokens_seen": 91668480, + "step": 11190 + }, + { + "epoch": 0.5378151260504201, + "grad_norm": 0.5204420685768127, + "learning_rate": 2.3861418074729476e-05, + "loss": 0.9175, + "num_input_tokens_seen": 91750400, + "step": 11200 + }, + { + "epoch": 0.538295318127251, + "grad_norm": 0.4966830015182495, + "learning_rate": 2.3821844075318993e-05, + "loss": 0.8275, + "num_input_tokens_seen": 91832320, + "step": 11210 + }, + { + "epoch": 0.5387755102040817, + "grad_norm": 0.41914886236190796, + "learning_rate": 2.378227303443889e-05, + "loss": 0.9653, + "num_input_tokens_seen": 91914240, + "step": 11220 + }, + { + "epoch": 0.5392557022809124, + "grad_norm": 0.4765452444553375, + "learning_rate": 2.3742705051458145e-05, + "loss": 0.8119, + "num_input_tokens_seen": 91996160, + "step": 11230 + }, + { + "epoch": 0.539735894357743, + "grad_norm": 0.4974367022514343, + "learning_rate": 2.3703140225738017e-05, + "loss": 0.9283, + "num_input_tokens_seen": 92078080, + "step": 11240 + }, + { + "epoch": 0.5402160864345739, + "grad_norm": 0.48964783549308777, + "learning_rate": 2.3663578656631858e-05, + "loss": 0.8144, + "num_input_tokens_seen": 92160000, + "step": 11250 + }, + { + "epoch": 0.5406962785114046, + "grad_norm": 0.5229817628860474, + "learning_rate": 2.362402044348486e-05, + "loss": 0.8041, + "num_input_tokens_seen": 92241920, + "step": 11260 + }, + { + "epoch": 0.5411764705882353, + "grad_norm": 0.5138763785362244, + "learning_rate": 2.3584465685633738e-05, + "loss": 0.877, + "num_input_tokens_seen": 92323840, + "step": 11270 + }, + { + "epoch": 0.5416566626650661, + "grad_norm": 0.6102971434593201, + "learning_rate": 2.3544914482406592e-05, + "loss": 0.8796, + "num_input_tokens_seen": 92405760, + "step": 11280 + }, + { + "epoch": 0.5421368547418968, + "grad_norm": 0.4886428713798523, + "learning_rate": 2.350536693312255e-05, + "loss": 0.8823, + "num_input_tokens_seen": 92487680, + "step": 11290 + }, + { + "epoch": 0.5426170468187275, + "grad_norm": 0.4848209321498871, + "learning_rate": 2.3465823137091572e-05, + "loss": 0.8294, + "num_input_tokens_seen": 92569600, + "step": 11300 + }, + { + "epoch": 0.5430972388955583, + "grad_norm": 0.4850007891654968, + "learning_rate": 2.3426283193614208e-05, + "loss": 0.8574, + "num_input_tokens_seen": 92651520, + "step": 11310 + }, + { + "epoch": 0.543577430972389, + "grad_norm": 0.47127023339271545, + "learning_rate": 2.3386747201981338e-05, + "loss": 1.143, + "num_input_tokens_seen": 92733440, + "step": 11320 + }, + { + "epoch": 0.5440576230492197, + "grad_norm": 0.4775993227958679, + "learning_rate": 2.3347215261473887e-05, + "loss": 0.8502, + "num_input_tokens_seen": 92815360, + "step": 11330 + }, + { + "epoch": 0.5445378151260504, + "grad_norm": 0.4856424629688263, + "learning_rate": 2.330768747136263e-05, + "loss": 0.9998, + "num_input_tokens_seen": 92897280, + "step": 11340 + }, + { + "epoch": 0.5450180072028812, + "grad_norm": 0.4987284541130066, + "learning_rate": 2.3268163930907933e-05, + "loss": 0.8627, + "num_input_tokens_seen": 92979200, + "step": 11350 + }, + { + "epoch": 0.5454981992797119, + "grad_norm": 0.9060471653938293, + "learning_rate": 2.3228644739359444e-05, + "loss": 0.8115, + "num_input_tokens_seen": 93061120, + "step": 11360 + }, + { + "epoch": 0.5459783913565426, + "grad_norm": 0.5042136907577515, + "learning_rate": 2.3189129995955943e-05, + "loss": 0.9262, + "num_input_tokens_seen": 93143040, + "step": 11370 + }, + { + "epoch": 0.5464585834333734, + "grad_norm": 0.6296837329864502, + "learning_rate": 2.314961979992501e-05, + "loss": 0.9588, + "num_input_tokens_seen": 93224960, + "step": 11380 + }, + { + "epoch": 0.5469387755102041, + "grad_norm": 0.40785011649131775, + "learning_rate": 2.311011425048281e-05, + "loss": 1.0475, + "num_input_tokens_seen": 93306880, + "step": 11390 + }, + { + "epoch": 0.5474189675870348, + "grad_norm": 0.5426297187805176, + "learning_rate": 2.3070613446833842e-05, + "loss": 0.9962, + "num_input_tokens_seen": 93388800, + "step": 11400 + }, + { + "epoch": 0.5478991596638656, + "grad_norm": 0.49344369769096375, + "learning_rate": 2.30311174881707e-05, + "loss": 0.9423, + "num_input_tokens_seen": 93470720, + "step": 11410 + }, + { + "epoch": 0.5483793517406963, + "grad_norm": 0.5111265182495117, + "learning_rate": 2.2991626473673773e-05, + "loss": 0.78, + "num_input_tokens_seen": 93552640, + "step": 11420 + }, + { + "epoch": 0.548859543817527, + "grad_norm": 0.49911582469940186, + "learning_rate": 2.295214050251108e-05, + "loss": 0.9205, + "num_input_tokens_seen": 93634560, + "step": 11430 + }, + { + "epoch": 0.5493397358943577, + "grad_norm": 0.5109173059463501, + "learning_rate": 2.2912659673837965e-05, + "loss": 0.7834, + "num_input_tokens_seen": 93716480, + "step": 11440 + }, + { + "epoch": 0.5498199279711885, + "grad_norm": 0.49743011593818665, + "learning_rate": 2.2873184086796824e-05, + "loss": 0.8649, + "num_input_tokens_seen": 93798400, + "step": 11450 + }, + { + "epoch": 0.5503001200480192, + "grad_norm": 0.5043100118637085, + "learning_rate": 2.283371384051693e-05, + "loss": 1.2097, + "num_input_tokens_seen": 93880320, + "step": 11460 + }, + { + "epoch": 0.5507803121248499, + "grad_norm": 0.5047153830528259, + "learning_rate": 2.2794249034114137e-05, + "loss": 0.9085, + "num_input_tokens_seen": 93962240, + "step": 11470 + }, + { + "epoch": 0.5512605042016807, + "grad_norm": 1.0863653421401978, + "learning_rate": 2.275478976669062e-05, + "loss": 1.0061, + "num_input_tokens_seen": 94044160, + "step": 11480 + }, + { + "epoch": 0.5517406962785114, + "grad_norm": 0.5065763592720032, + "learning_rate": 2.2715336137334657e-05, + "loss": 0.9816, + "num_input_tokens_seen": 94126080, + "step": 11490 + }, + { + "epoch": 0.5522208883553421, + "grad_norm": 0.5079005360603333, + "learning_rate": 2.2675888245120382e-05, + "loss": 1.0597, + "num_input_tokens_seen": 94208000, + "step": 11500 + }, + { + "epoch": 0.5527010804321729, + "grad_norm": 0.5147351622581482, + "learning_rate": 2.263644618910749e-05, + "loss": 0.9222, + "num_input_tokens_seen": 94289920, + "step": 11510 + }, + { + "epoch": 0.5531812725090036, + "grad_norm": 0.4940924048423767, + "learning_rate": 2.2597010068341052e-05, + "loss": 0.6929, + "num_input_tokens_seen": 94371840, + "step": 11520 + }, + { + "epoch": 0.5536614645858343, + "grad_norm": 0.49539023637771606, + "learning_rate": 2.255757998185122e-05, + "loss": 0.943, + "num_input_tokens_seen": 94453760, + "step": 11530 + }, + { + "epoch": 0.554141656662665, + "grad_norm": 0.7505642771720886, + "learning_rate": 2.2518156028652977e-05, + "loss": 0.9255, + "num_input_tokens_seen": 94535680, + "step": 11540 + }, + { + "epoch": 0.5546218487394958, + "grad_norm": 0.2930028438568115, + "learning_rate": 2.2478738307745938e-05, + "loss": 0.7929, + "num_input_tokens_seen": 94617600, + "step": 11550 + }, + { + "epoch": 0.5551020408163265, + "grad_norm": 0.4832445979118347, + "learning_rate": 2.243932691811405e-05, + "loss": 0.9169, + "num_input_tokens_seen": 94699520, + "step": 11560 + }, + { + "epoch": 0.5555822328931572, + "grad_norm": 0.4929827153682709, + "learning_rate": 2.2399921958725364e-05, + "loss": 0.9749, + "num_input_tokens_seen": 94781440, + "step": 11570 + }, + { + "epoch": 0.556062424969988, + "grad_norm": 1.4206510782241821, + "learning_rate": 2.236052352853177e-05, + "loss": 1.0515, + "num_input_tokens_seen": 94863360, + "step": 11580 + }, + { + "epoch": 0.5565426170468187, + "grad_norm": 0.5904713273048401, + "learning_rate": 2.232113172646878e-05, + "loss": 0.9887, + "num_input_tokens_seen": 94945280, + "step": 11590 + }, + { + "epoch": 0.5570228091236494, + "grad_norm": 0.49482420086860657, + "learning_rate": 2.2281746651455272e-05, + "loss": 0.8613, + "num_input_tokens_seen": 95027200, + "step": 11600 + }, + { + "epoch": 0.5575030012004802, + "grad_norm": 0.5033355951309204, + "learning_rate": 2.2242368402393198e-05, + "loss": 1.061, + "num_input_tokens_seen": 95109120, + "step": 11610 + }, + { + "epoch": 0.5579831932773109, + "grad_norm": 1.082341194152832, + "learning_rate": 2.220299707816738e-05, + "loss": 0.8245, + "num_input_tokens_seen": 95191040, + "step": 11620 + }, + { + "epoch": 0.5584633853541416, + "grad_norm": 0.49979743361473083, + "learning_rate": 2.2163632777645282e-05, + "loss": 0.8227, + "num_input_tokens_seen": 95272960, + "step": 11630 + }, + { + "epoch": 0.5589435774309723, + "grad_norm": 0.5040926933288574, + "learning_rate": 2.2124275599676676e-05, + "loss": 0.9013, + "num_input_tokens_seen": 95354880, + "step": 11640 + }, + { + "epoch": 0.5594237695078031, + "grad_norm": 0.4717329144477844, + "learning_rate": 2.20849256430935e-05, + "loss": 0.9034, + "num_input_tokens_seen": 95436800, + "step": 11650 + }, + { + "epoch": 0.5599039615846338, + "grad_norm": 0.5357513427734375, + "learning_rate": 2.2045583006709536e-05, + "loss": 0.919, + "num_input_tokens_seen": 95518720, + "step": 11660 + }, + { + "epoch": 0.5603841536614645, + "grad_norm": 0.6526145935058594, + "learning_rate": 2.2006247789320162e-05, + "loss": 0.868, + "num_input_tokens_seen": 95600640, + "step": 11670 + }, + { + "epoch": 0.5608643457382954, + "grad_norm": 0.527145266532898, + "learning_rate": 2.1966920089702157e-05, + "loss": 0.9287, + "num_input_tokens_seen": 95682560, + "step": 11680 + }, + { + "epoch": 0.561344537815126, + "grad_norm": 0.4794023931026459, + "learning_rate": 2.192760000661343e-05, + "loss": 0.8022, + "num_input_tokens_seen": 95764480, + "step": 11690 + }, + { + "epoch": 0.5618247298919568, + "grad_norm": 0.4974648058414459, + "learning_rate": 2.1888287638792722e-05, + "loss": 0.8273, + "num_input_tokens_seen": 95846400, + "step": 11700 + }, + { + "epoch": 0.5623049219687876, + "grad_norm": 0.48735299706459045, + "learning_rate": 2.184898308495943e-05, + "loss": 0.8902, + "num_input_tokens_seen": 95928320, + "step": 11710 + }, + { + "epoch": 0.5627851140456183, + "grad_norm": 0.4717083275318146, + "learning_rate": 2.180968644381334e-05, + "loss": 0.9926, + "num_input_tokens_seen": 96010240, + "step": 11720 + }, + { + "epoch": 0.563265306122449, + "grad_norm": 0.5048367977142334, + "learning_rate": 2.1770397814034315e-05, + "loss": 0.8164, + "num_input_tokens_seen": 96092160, + "step": 11730 + }, + { + "epoch": 0.5637454981992797, + "grad_norm": 0.5213897228240967, + "learning_rate": 2.1731117294282166e-05, + "loss": 0.9192, + "num_input_tokens_seen": 96174080, + "step": 11740 + }, + { + "epoch": 0.5642256902761105, + "grad_norm": 0.47659391164779663, + "learning_rate": 2.16918449831963e-05, + "loss": 0.841, + "num_input_tokens_seen": 96256000, + "step": 11750 + }, + { + "epoch": 0.5647058823529412, + "grad_norm": 0.49390923976898193, + "learning_rate": 2.165258097939551e-05, + "loss": 0.8303, + "num_input_tokens_seen": 96337920, + "step": 11760 + }, + { + "epoch": 0.5651860744297719, + "grad_norm": 0.4540766775608063, + "learning_rate": 2.1613325381477744e-05, + "loss": 0.6811, + "num_input_tokens_seen": 96419840, + "step": 11770 + }, + { + "epoch": 0.5656662665066027, + "grad_norm": 0.5054943561553955, + "learning_rate": 2.1574078288019846e-05, + "loss": 0.9841, + "num_input_tokens_seen": 96501760, + "step": 11780 + }, + { + "epoch": 0.5661464585834334, + "grad_norm": 0.5268092751502991, + "learning_rate": 2.1534839797577268e-05, + "loss": 1.0156, + "num_input_tokens_seen": 96583680, + "step": 11790 + }, + { + "epoch": 0.5666266506602641, + "grad_norm": 1.0047301054000854, + "learning_rate": 2.14956100086839e-05, + "loss": 0.813, + "num_input_tokens_seen": 96665600, + "step": 11800 + }, + { + "epoch": 0.5671068427370949, + "grad_norm": 0.489765465259552, + "learning_rate": 2.1456389019851762e-05, + "loss": 0.8036, + "num_input_tokens_seen": 96747520, + "step": 11810 + }, + { + "epoch": 0.5675870348139256, + "grad_norm": 0.4869178831577301, + "learning_rate": 2.1417176929570768e-05, + "loss": 0.8429, + "num_input_tokens_seen": 96829440, + "step": 11820 + }, + { + "epoch": 0.5680672268907563, + "grad_norm": 0.6066574454307556, + "learning_rate": 2.137797383630851e-05, + "loss": 0.8213, + "num_input_tokens_seen": 96911360, + "step": 11830 + }, + { + "epoch": 0.568547418967587, + "grad_norm": 0.48497194051742554, + "learning_rate": 2.1338779838509965e-05, + "loss": 0.8338, + "num_input_tokens_seen": 96993280, + "step": 11840 + }, + { + "epoch": 0.5690276110444178, + "grad_norm": 0.49360740184783936, + "learning_rate": 2.129959503459728e-05, + "loss": 1.0187, + "num_input_tokens_seen": 97075200, + "step": 11850 + }, + { + "epoch": 0.5695078031212485, + "grad_norm": 0.4896343946456909, + "learning_rate": 2.126041952296951e-05, + "loss": 0.8479, + "num_input_tokens_seen": 97157120, + "step": 11860 + }, + { + "epoch": 0.5699879951980792, + "grad_norm": 0.626423180103302, + "learning_rate": 2.122125340200239e-05, + "loss": 1.0873, + "num_input_tokens_seen": 97239040, + "step": 11870 + }, + { + "epoch": 0.57046818727491, + "grad_norm": 0.5536409616470337, + "learning_rate": 2.1182096770048045e-05, + "loss": 1.1087, + "num_input_tokens_seen": 97320960, + "step": 11880 + }, + { + "epoch": 0.5709483793517407, + "grad_norm": 0.4912538528442383, + "learning_rate": 2.11429497254348e-05, + "loss": 0.872, + "num_input_tokens_seen": 97402880, + "step": 11890 + }, + { + "epoch": 0.5714285714285714, + "grad_norm": 0.5036265254020691, + "learning_rate": 2.1103812366466896e-05, + "loss": 0.9375, + "num_input_tokens_seen": 97484800, + "step": 11900 + }, + { + "epoch": 0.5719087635054022, + "grad_norm": 0.47547534108161926, + "learning_rate": 2.1064684791424236e-05, + "loss": 0.9402, + "num_input_tokens_seen": 97566720, + "step": 11910 + }, + { + "epoch": 0.5723889555822329, + "grad_norm": 0.6462324857711792, + "learning_rate": 2.1025567098562177e-05, + "loss": 0.83, + "num_input_tokens_seen": 97648640, + "step": 11920 + }, + { + "epoch": 0.5728691476590636, + "grad_norm": 0.6137294769287109, + "learning_rate": 2.0986459386111256e-05, + "loss": 0.8851, + "num_input_tokens_seen": 97730560, + "step": 11930 + }, + { + "epoch": 0.5733493397358943, + "grad_norm": 0.474915087223053, + "learning_rate": 2.0947361752276935e-05, + "loss": 0.8404, + "num_input_tokens_seen": 97812480, + "step": 11940 + }, + { + "epoch": 0.5738295318127251, + "grad_norm": 0.5123166441917419, + "learning_rate": 2.0908274295239365e-05, + "loss": 1.0064, + "num_input_tokens_seen": 97894400, + "step": 11950 + }, + { + "epoch": 0.5743097238895558, + "grad_norm": 1.154371738433838, + "learning_rate": 2.0869197113153175e-05, + "loss": 0.882, + "num_input_tokens_seen": 97976320, + "step": 11960 + }, + { + "epoch": 0.5747899159663865, + "grad_norm": 0.48168784379959106, + "learning_rate": 2.083013030414714e-05, + "loss": 0.7295, + "num_input_tokens_seen": 98058240, + "step": 11970 + }, + { + "epoch": 0.5752701080432173, + "grad_norm": 0.5147429704666138, + "learning_rate": 2.0791073966324037e-05, + "loss": 0.808, + "num_input_tokens_seen": 98140160, + "step": 11980 + }, + { + "epoch": 0.575750300120048, + "grad_norm": 0.47852545976638794, + "learning_rate": 2.0752028197760323e-05, + "loss": 0.7292, + "num_input_tokens_seen": 98222080, + "step": 11990 + }, + { + "epoch": 0.5762304921968787, + "grad_norm": 0.48357313871383667, + "learning_rate": 2.07129930965059e-05, + "loss": 0.9465, + "num_input_tokens_seen": 98304000, + "step": 12000 + }, + { + "epoch": 0.5767106842737095, + "grad_norm": 1.0818113088607788, + "learning_rate": 2.0673968760583912e-05, + "loss": 0.8273, + "num_input_tokens_seen": 98385920, + "step": 12010 + }, + { + "epoch": 0.5771908763505402, + "grad_norm": 0.551762580871582, + "learning_rate": 2.0634955287990465e-05, + "loss": 1.0037, + "num_input_tokens_seen": 98467840, + "step": 12020 + }, + { + "epoch": 0.5776710684273709, + "grad_norm": 0.5415941476821899, + "learning_rate": 2.059595277669436e-05, + "loss": 0.7684, + "num_input_tokens_seen": 98549760, + "step": 12030 + }, + { + "epoch": 0.5781512605042017, + "grad_norm": 0.5173623561859131, + "learning_rate": 2.0556961324636903e-05, + "loss": 0.9755, + "num_input_tokens_seen": 98631680, + "step": 12040 + }, + { + "epoch": 0.5786314525810324, + "grad_norm": 0.4724039137363434, + "learning_rate": 2.0517981029731616e-05, + "loss": 0.9699, + "num_input_tokens_seen": 98713600, + "step": 12050 + }, + { + "epoch": 0.5791116446578631, + "grad_norm": 0.5577803254127502, + "learning_rate": 2.0479011989863988e-05, + "loss": 1.0823, + "num_input_tokens_seen": 98795520, + "step": 12060 + }, + { + "epoch": 0.5795918367346938, + "grad_norm": 0.9037268757820129, + "learning_rate": 2.0440054302891276e-05, + "loss": 0.8871, + "num_input_tokens_seen": 98877440, + "step": 12070 + }, + { + "epoch": 0.5800720288115246, + "grad_norm": 0.4992835223674774, + "learning_rate": 2.0401108066642217e-05, + "loss": 0.7901, + "num_input_tokens_seen": 98959360, + "step": 12080 + }, + { + "epoch": 0.5805522208883553, + "grad_norm": 0.5037127733230591, + "learning_rate": 2.0362173378916763e-05, + "loss": 0.844, + "num_input_tokens_seen": 99041280, + "step": 12090 + }, + { + "epoch": 0.581032412965186, + "grad_norm": 0.5936850905418396, + "learning_rate": 2.032325033748591e-05, + "loss": 0.8851, + "num_input_tokens_seen": 99123200, + "step": 12100 + }, + { + "epoch": 0.5815126050420169, + "grad_norm": 0.6421457529067993, + "learning_rate": 2.0284339040091403e-05, + "loss": 0.9621, + "num_input_tokens_seen": 99205120, + "step": 12110 + }, + { + "epoch": 0.5819927971188475, + "grad_norm": 0.49321863055229187, + "learning_rate": 2.0245439584445457e-05, + "loss": 0.7169, + "num_input_tokens_seen": 99287040, + "step": 12120 + }, + { + "epoch": 0.5824729891956782, + "grad_norm": 0.4810725152492523, + "learning_rate": 2.0206552068230587e-05, + "loss": 0.9376, + "num_input_tokens_seen": 99368960, + "step": 12130 + }, + { + "epoch": 0.5829531812725091, + "grad_norm": 0.4972326159477234, + "learning_rate": 2.0167676589099324e-05, + "loss": 0.9773, + "num_input_tokens_seen": 99450880, + "step": 12140 + }, + { + "epoch": 0.5834333733493398, + "grad_norm": 0.4891018569469452, + "learning_rate": 2.0128813244673946e-05, + "loss": 0.8787, + "num_input_tokens_seen": 99532800, + "step": 12150 + }, + { + "epoch": 0.5839135654261705, + "grad_norm": 0.562864363193512, + "learning_rate": 2.0089962132546296e-05, + "loss": 0.9362, + "num_input_tokens_seen": 99614720, + "step": 12160 + }, + { + "epoch": 0.5843937575030012, + "grad_norm": 0.5169605016708374, + "learning_rate": 2.0051123350277477e-05, + "loss": 0.8472, + "num_input_tokens_seen": 99696640, + "step": 12170 + }, + { + "epoch": 0.584873949579832, + "grad_norm": 0.47625747323036194, + "learning_rate": 2.0012296995397613e-05, + "loss": 0.6606, + "num_input_tokens_seen": 99778560, + "step": 12180 + }, + { + "epoch": 0.5853541416566627, + "grad_norm": 0.34957268834114075, + "learning_rate": 1.997348316540566e-05, + "loss": 0.7587, + "num_input_tokens_seen": 99860480, + "step": 12190 + }, + { + "epoch": 0.5858343337334934, + "grad_norm": 0.5261297821998596, + "learning_rate": 1.9934681957769107e-05, + "loss": 0.7988, + "num_input_tokens_seen": 99942400, + "step": 12200 + }, + { + "epoch": 0.5863145258103242, + "grad_norm": 0.5246568322181702, + "learning_rate": 1.9895893469923736e-05, + "loss": 0.9584, + "num_input_tokens_seen": 100024320, + "step": 12210 + }, + { + "epoch": 0.5867947178871549, + "grad_norm": 0.45888885855674744, + "learning_rate": 1.985711779927339e-05, + "loss": 1.1023, + "num_input_tokens_seen": 100106240, + "step": 12220 + }, + { + "epoch": 0.5872749099639856, + "grad_norm": 0.6073553562164307, + "learning_rate": 1.9818355043189732e-05, + "loss": 0.8485, + "num_input_tokens_seen": 100188160, + "step": 12230 + }, + { + "epoch": 0.5877551020408164, + "grad_norm": 0.8071925640106201, + "learning_rate": 1.9779605299012005e-05, + "loss": 0.9277, + "num_input_tokens_seen": 100270080, + "step": 12240 + }, + { + "epoch": 0.5882352941176471, + "grad_norm": 0.5476227402687073, + "learning_rate": 1.974086866404675e-05, + "loss": 0.9011, + "num_input_tokens_seen": 100352000, + "step": 12250 + }, + { + "epoch": 0.5887154861944778, + "grad_norm": 0.4784226417541504, + "learning_rate": 1.970214523556761e-05, + "loss": 1.0077, + "num_input_tokens_seen": 100433920, + "step": 12260 + }, + { + "epoch": 0.5891956782713085, + "grad_norm": 0.49654924869537354, + "learning_rate": 1.9663435110815065e-05, + "loss": 0.9088, + "num_input_tokens_seen": 100515840, + "step": 12270 + }, + { + "epoch": 0.5896758703481393, + "grad_norm": 0.4656108617782593, + "learning_rate": 1.9624738386996163e-05, + "loss": 0.8294, + "num_input_tokens_seen": 100597760, + "step": 12280 + }, + { + "epoch": 0.59015606242497, + "grad_norm": 0.5135471820831299, + "learning_rate": 1.9586055161284332e-05, + "loss": 0.914, + "num_input_tokens_seen": 100679680, + "step": 12290 + }, + { + "epoch": 0.5906362545018007, + "grad_norm": 0.5345959663391113, + "learning_rate": 1.954738553081909e-05, + "loss": 1.1152, + "num_input_tokens_seen": 100761600, + "step": 12300 + }, + { + "epoch": 0.5911164465786315, + "grad_norm": 0.5068685412406921, + "learning_rate": 1.950872959270581e-05, + "loss": 0.8391, + "num_input_tokens_seen": 100843520, + "step": 12310 + }, + { + "epoch": 0.5915966386554622, + "grad_norm": 0.5310745239257812, + "learning_rate": 1.9470087444015484e-05, + "loss": 0.8974, + "num_input_tokens_seen": 100925440, + "step": 12320 + }, + { + "epoch": 0.5920768307322929, + "grad_norm": 0.4799753725528717, + "learning_rate": 1.9431459181784495e-05, + "loss": 1.1511, + "num_input_tokens_seen": 101007360, + "step": 12330 + }, + { + "epoch": 0.5925570228091237, + "grad_norm": 0.51557856798172, + "learning_rate": 1.939284490301432e-05, + "loss": 0.8286, + "num_input_tokens_seen": 101089280, + "step": 12340 + }, + { + "epoch": 0.5930372148859544, + "grad_norm": 0.49833816289901733, + "learning_rate": 1.935424470467135e-05, + "loss": 0.7323, + "num_input_tokens_seen": 101171200, + "step": 12350 + }, + { + "epoch": 0.5935174069627851, + "grad_norm": 0.5022624731063843, + "learning_rate": 1.9315658683686615e-05, + "loss": 0.9878, + "num_input_tokens_seen": 101253120, + "step": 12360 + }, + { + "epoch": 0.5939975990396158, + "grad_norm": 0.5500155091285706, + "learning_rate": 1.9277086936955517e-05, + "loss": 0.8682, + "num_input_tokens_seen": 101335040, + "step": 12370 + }, + { + "epoch": 0.5944777911164466, + "grad_norm": 0.4885292053222656, + "learning_rate": 1.9238529561337646e-05, + "loss": 0.8724, + "num_input_tokens_seen": 101416960, + "step": 12380 + }, + { + "epoch": 0.5949579831932773, + "grad_norm": 0.4912605583667755, + "learning_rate": 1.9199986653656502e-05, + "loss": 0.9832, + "num_input_tokens_seen": 101498880, + "step": 12390 + }, + { + "epoch": 0.595438175270108, + "grad_norm": 0.6492443084716797, + "learning_rate": 1.9161458310699225e-05, + "loss": 0.8047, + "num_input_tokens_seen": 101580800, + "step": 12400 + }, + { + "epoch": 0.5959183673469388, + "grad_norm": 0.5103248953819275, + "learning_rate": 1.9122944629216402e-05, + "loss": 0.9766, + "num_input_tokens_seen": 101662720, + "step": 12410 + }, + { + "epoch": 0.5963985594237695, + "grad_norm": 0.4565691351890564, + "learning_rate": 1.9084445705921815e-05, + "loss": 0.8339, + "num_input_tokens_seen": 101744640, + "step": 12420 + }, + { + "epoch": 0.5968787515006002, + "grad_norm": 0.47968733310699463, + "learning_rate": 1.9045961637492145e-05, + "loss": 1.0445, + "num_input_tokens_seen": 101826560, + "step": 12430 + }, + { + "epoch": 0.597358943577431, + "grad_norm": 0.49476364254951477, + "learning_rate": 1.9007492520566814e-05, + "loss": 0.9611, + "num_input_tokens_seen": 101908480, + "step": 12440 + }, + { + "epoch": 0.5978391356542617, + "grad_norm": 0.5539293885231018, + "learning_rate": 1.896903845174768e-05, + "loss": 0.9063, + "num_input_tokens_seen": 101990400, + "step": 12450 + }, + { + "epoch": 0.5983193277310924, + "grad_norm": 0.49969589710235596, + "learning_rate": 1.8930599527598797e-05, + "loss": 0.8149, + "num_input_tokens_seen": 102072320, + "step": 12460 + }, + { + "epoch": 0.5987995198079231, + "grad_norm": 0.6476652026176453, + "learning_rate": 1.8892175844646215e-05, + "loss": 1.05, + "num_input_tokens_seen": 102154240, + "step": 12470 + }, + { + "epoch": 0.5992797118847539, + "grad_norm": 0.485588401556015, + "learning_rate": 1.8853767499377712e-05, + "loss": 0.7995, + "num_input_tokens_seen": 102236160, + "step": 12480 + }, + { + "epoch": 0.5997599039615846, + "grad_norm": 0.5099250674247742, + "learning_rate": 1.8815374588242523e-05, + "loss": 0.7582, + "num_input_tokens_seen": 102318080, + "step": 12490 + }, + { + "epoch": 0.6002400960384153, + "grad_norm": 0.49604761600494385, + "learning_rate": 1.877699720765114e-05, + "loss": 0.8494, + "num_input_tokens_seen": 102400000, + "step": 12500 + }, + { + "epoch": 0.6007202881152461, + "grad_norm": 0.5131434798240662, + "learning_rate": 1.873863545397507e-05, + "loss": 0.7685, + "num_input_tokens_seen": 102481920, + "step": 12510 + }, + { + "epoch": 0.6012004801920768, + "grad_norm": 0.45130103826522827, + "learning_rate": 1.870028942354655e-05, + "loss": 0.9137, + "num_input_tokens_seen": 102563840, + "step": 12520 + }, + { + "epoch": 0.6016806722689075, + "grad_norm": 0.4828559160232544, + "learning_rate": 1.8661959212658365e-05, + "loss": 0.8186, + "num_input_tokens_seen": 102645760, + "step": 12530 + }, + { + "epoch": 0.6021608643457383, + "grad_norm": 0.5113935470581055, + "learning_rate": 1.862364491756355e-05, + "loss": 0.8923, + "num_input_tokens_seen": 102727680, + "step": 12540 + }, + { + "epoch": 0.602641056422569, + "grad_norm": 0.5698245167732239, + "learning_rate": 1.8585346634475175e-05, + "loss": 0.9799, + "num_input_tokens_seen": 102809600, + "step": 12550 + }, + { + "epoch": 0.6031212484993997, + "grad_norm": 0.4895305335521698, + "learning_rate": 1.8547064459566117e-05, + "loss": 0.9472, + "num_input_tokens_seen": 102891520, + "step": 12560 + }, + { + "epoch": 0.6036014405762304, + "grad_norm": 0.4899882376194, + "learning_rate": 1.8508798488968803e-05, + "loss": 0.7548, + "num_input_tokens_seen": 102973440, + "step": 12570 + }, + { + "epoch": 0.6040816326530613, + "grad_norm": 0.48758333921432495, + "learning_rate": 1.8470548818774942e-05, + "loss": 0.9321, + "num_input_tokens_seen": 103055360, + "step": 12580 + }, + { + "epoch": 0.604561824729892, + "grad_norm": 0.4785870313644409, + "learning_rate": 1.8432315545035328e-05, + "loss": 0.8554, + "num_input_tokens_seen": 103137280, + "step": 12590 + }, + { + "epoch": 0.6050420168067226, + "grad_norm": 0.46478036046028137, + "learning_rate": 1.83940987637596e-05, + "loss": 0.9034, + "num_input_tokens_seen": 103219200, + "step": 12600 + }, + { + "epoch": 0.6055222088835535, + "grad_norm": 0.5116038918495178, + "learning_rate": 1.8355898570915937e-05, + "loss": 1.0032, + "num_input_tokens_seen": 103301120, + "step": 12610 + }, + { + "epoch": 0.6060024009603842, + "grad_norm": 0.4858674108982086, + "learning_rate": 1.8317715062430902e-05, + "loss": 0.9037, + "num_input_tokens_seen": 103383040, + "step": 12620 + }, + { + "epoch": 0.6064825930372149, + "grad_norm": 0.49994757771492004, + "learning_rate": 1.8279548334189146e-05, + "loss": 0.9669, + "num_input_tokens_seen": 103464960, + "step": 12630 + }, + { + "epoch": 0.6069627851140457, + "grad_norm": 0.5256997346878052, + "learning_rate": 1.8241398482033185e-05, + "loss": 0.9426, + "num_input_tokens_seen": 103546880, + "step": 12640 + }, + { + "epoch": 0.6074429771908764, + "grad_norm": 0.48835039138793945, + "learning_rate": 1.8203265601763136e-05, + "loss": 0.9299, + "num_input_tokens_seen": 103628800, + "step": 12650 + }, + { + "epoch": 0.6079231692677071, + "grad_norm": 0.4843783676624298, + "learning_rate": 1.816514978913655e-05, + "loss": 0.8059, + "num_input_tokens_seen": 103710720, + "step": 12660 + }, + { + "epoch": 0.6084033613445378, + "grad_norm": 0.6175418496131897, + "learning_rate": 1.8127051139868044e-05, + "loss": 0.963, + "num_input_tokens_seen": 103792640, + "step": 12670 + }, + { + "epoch": 0.6088835534213686, + "grad_norm": 0.503555178642273, + "learning_rate": 1.8088969749629197e-05, + "loss": 0.9894, + "num_input_tokens_seen": 103874560, + "step": 12680 + }, + { + "epoch": 0.6093637454981993, + "grad_norm": 0.48050084710121155, + "learning_rate": 1.8050905714048233e-05, + "loss": 1.0683, + "num_input_tokens_seen": 103956480, + "step": 12690 + }, + { + "epoch": 0.60984393757503, + "grad_norm": 0.5039294362068176, + "learning_rate": 1.8012859128709766e-05, + "loss": 0.7611, + "num_input_tokens_seen": 104038400, + "step": 12700 + }, + { + "epoch": 0.6103241296518608, + "grad_norm": 0.6034661531448364, + "learning_rate": 1.7974830089154624e-05, + "loss": 0.9293, + "num_input_tokens_seen": 104120320, + "step": 12710 + }, + { + "epoch": 0.6108043217286915, + "grad_norm": 0.5101216435432434, + "learning_rate": 1.7936818690879574e-05, + "loss": 0.9849, + "num_input_tokens_seen": 104202240, + "step": 12720 + }, + { + "epoch": 0.6112845138055222, + "grad_norm": 0.49056023359298706, + "learning_rate": 1.7898825029337054e-05, + "loss": 0.9917, + "num_input_tokens_seen": 104284160, + "step": 12730 + }, + { + "epoch": 0.611764705882353, + "grad_norm": 0.5179559588432312, + "learning_rate": 1.7860849199934983e-05, + "loss": 0.8664, + "num_input_tokens_seen": 104366080, + "step": 12740 + }, + { + "epoch": 0.6122448979591837, + "grad_norm": 0.5554147958755493, + "learning_rate": 1.7822891298036515e-05, + "loss": 0.9739, + "num_input_tokens_seen": 104448000, + "step": 12750 + }, + { + "epoch": 0.6127250900360144, + "grad_norm": 0.5510497689247131, + "learning_rate": 1.7784951418959747e-05, + "loss": 0.9441, + "num_input_tokens_seen": 104529920, + "step": 12760 + }, + { + "epoch": 0.6132052821128451, + "grad_norm": 0.5032241344451904, + "learning_rate": 1.7747029657977556e-05, + "loss": 0.9443, + "num_input_tokens_seen": 104611840, + "step": 12770 + }, + { + "epoch": 0.6136854741896759, + "grad_norm": 0.48411932587623596, + "learning_rate": 1.7709126110317297e-05, + "loss": 1.0327, + "num_input_tokens_seen": 104693760, + "step": 12780 + }, + { + "epoch": 0.6141656662665066, + "grad_norm": 0.4949854016304016, + "learning_rate": 1.7671240871160593e-05, + "loss": 0.9103, + "num_input_tokens_seen": 104775680, + "step": 12790 + }, + { + "epoch": 0.6146458583433373, + "grad_norm": 0.47453951835632324, + "learning_rate": 1.76333740356431e-05, + "loss": 0.8165, + "num_input_tokens_seen": 104857600, + "step": 12800 + }, + { + "epoch": 0.6151260504201681, + "grad_norm": 0.4928891658782959, + "learning_rate": 1.7595525698854263e-05, + "loss": 0.8398, + "num_input_tokens_seen": 104939520, + "step": 12810 + }, + { + "epoch": 0.6156062424969988, + "grad_norm": 0.6075853109359741, + "learning_rate": 1.7557695955837063e-05, + "loss": 0.8922, + "num_input_tokens_seen": 105021440, + "step": 12820 + }, + { + "epoch": 0.6160864345738295, + "grad_norm": 0.6649598479270935, + "learning_rate": 1.7519884901587772e-05, + "loss": 0.8345, + "num_input_tokens_seen": 105103360, + "step": 12830 + }, + { + "epoch": 0.6165666266506603, + "grad_norm": 0.5287781357765198, + "learning_rate": 1.748209263105577e-05, + "loss": 1.0156, + "num_input_tokens_seen": 105185280, + "step": 12840 + }, + { + "epoch": 0.617046818727491, + "grad_norm": 0.48797744512557983, + "learning_rate": 1.744431923914326e-05, + "loss": 0.9688, + "num_input_tokens_seen": 105267200, + "step": 12850 + }, + { + "epoch": 0.6175270108043217, + "grad_norm": 0.4974718391895294, + "learning_rate": 1.7406564820705e-05, + "loss": 0.9532, + "num_input_tokens_seen": 105349120, + "step": 12860 + }, + { + "epoch": 0.6180072028811524, + "grad_norm": 1.6541121006011963, + "learning_rate": 1.736882947054815e-05, + "loss": 0.818, + "num_input_tokens_seen": 105431040, + "step": 12870 + }, + { + "epoch": 0.6184873949579832, + "grad_norm": 0.5184734463691711, + "learning_rate": 1.7331113283431966e-05, + "loss": 0.994, + "num_input_tokens_seen": 105512960, + "step": 12880 + }, + { + "epoch": 0.6189675870348139, + "grad_norm": 0.5024128556251526, + "learning_rate": 1.729341635406757e-05, + "loss": 0.7879, + "num_input_tokens_seen": 105594880, + "step": 12890 + }, + { + "epoch": 0.6194477791116446, + "grad_norm": 0.48197275400161743, + "learning_rate": 1.725573877711776e-05, + "loss": 0.9374, + "num_input_tokens_seen": 105676800, + "step": 12900 + }, + { + "epoch": 0.6199279711884754, + "grad_norm": 0.5016830563545227, + "learning_rate": 1.7218080647196698e-05, + "loss": 0.9342, + "num_input_tokens_seen": 105758720, + "step": 12910 + }, + { + "epoch": 0.6204081632653061, + "grad_norm": 0.48292943835258484, + "learning_rate": 1.7180442058869732e-05, + "loss": 0.8143, + "num_input_tokens_seen": 105840640, + "step": 12920 + }, + { + "epoch": 0.6208883553421368, + "grad_norm": 0.4773729145526886, + "learning_rate": 1.7142823106653135e-05, + "loss": 0.8115, + "num_input_tokens_seen": 105922560, + "step": 12930 + }, + { + "epoch": 0.6213685474189676, + "grad_norm": 0.5274072885513306, + "learning_rate": 1.7105223885013884e-05, + "loss": 0.8874, + "num_input_tokens_seen": 106004480, + "step": 12940 + }, + { + "epoch": 0.6218487394957983, + "grad_norm": 1.052327036857605, + "learning_rate": 1.706764448836938e-05, + "loss": 0.9164, + "num_input_tokens_seen": 106086400, + "step": 12950 + }, + { + "epoch": 0.622328931572629, + "grad_norm": 0.5385840535163879, + "learning_rate": 1.703008501108726e-05, + "loss": 0.9694, + "num_input_tokens_seen": 106168320, + "step": 12960 + }, + { + "epoch": 0.6228091236494598, + "grad_norm": 0.4652009904384613, + "learning_rate": 1.699254554748515e-05, + "loss": 0.9424, + "num_input_tokens_seen": 106250240, + "step": 12970 + }, + { + "epoch": 0.6232893157262905, + "grad_norm": 0.5203952193260193, + "learning_rate": 1.6955026191830385e-05, + "loss": 0.8038, + "num_input_tokens_seen": 106332160, + "step": 12980 + }, + { + "epoch": 0.6237695078031212, + "grad_norm": 0.47074759006500244, + "learning_rate": 1.691752703833984e-05, + "loss": 0.9074, + "num_input_tokens_seen": 106414080, + "step": 12990 + }, + { + "epoch": 0.6242496998799519, + "grad_norm": 0.5040807723999023, + "learning_rate": 1.6880048181179652e-05, + "loss": 0.8315, + "num_input_tokens_seen": 106496000, + "step": 13000 + }, + { + "epoch": 0.6247298919567827, + "grad_norm": 0.5343396663665771, + "learning_rate": 1.684258971446497e-05, + "loss": 0.9378, + "num_input_tokens_seen": 106577920, + "step": 13010 + }, + { + "epoch": 0.6252100840336134, + "grad_norm": 0.48654720187187195, + "learning_rate": 1.6805151732259755e-05, + "loss": 0.8081, + "num_input_tokens_seen": 106659840, + "step": 13020 + }, + { + "epoch": 0.6256902761104441, + "grad_norm": 0.4875001907348633, + "learning_rate": 1.6767734328576544e-05, + "loss": 0.8395, + "num_input_tokens_seen": 106741760, + "step": 13030 + }, + { + "epoch": 0.626170468187275, + "grad_norm": 0.4956626296043396, + "learning_rate": 1.6730337597376165e-05, + "loss": 0.8904, + "num_input_tokens_seen": 106823680, + "step": 13040 + }, + { + "epoch": 0.6266506602641057, + "grad_norm": 0.49269092082977295, + "learning_rate": 1.669296163256755e-05, + "loss": 0.7173, + "num_input_tokens_seen": 106905600, + "step": 13050 + }, + { + "epoch": 0.6271308523409364, + "grad_norm": 0.7931904196739197, + "learning_rate": 1.6655606528007505e-05, + "loss": 1.0009, + "num_input_tokens_seen": 106987520, + "step": 13060 + }, + { + "epoch": 0.6276110444177672, + "grad_norm": 0.5141395926475525, + "learning_rate": 1.6618272377500406e-05, + "loss": 0.9796, + "num_input_tokens_seen": 107069440, + "step": 13070 + }, + { + "epoch": 0.6280912364945979, + "grad_norm": 0.49574312567710876, + "learning_rate": 1.658095927479805e-05, + "loss": 0.7448, + "num_input_tokens_seen": 107151360, + "step": 13080 + }, + { + "epoch": 0.6285714285714286, + "grad_norm": 0.548512876033783, + "learning_rate": 1.6543667313599365e-05, + "loss": 1.1406, + "num_input_tokens_seen": 107233280, + "step": 13090 + }, + { + "epoch": 0.6290516206482593, + "grad_norm": 0.49996235966682434, + "learning_rate": 1.6506396587550188e-05, + "loss": 0.8865, + "num_input_tokens_seen": 107315200, + "step": 13100 + }, + { + "epoch": 0.6295318127250901, + "grad_norm": 0.5021194815635681, + "learning_rate": 1.6469147190243027e-05, + "loss": 0.8561, + "num_input_tokens_seen": 107397120, + "step": 13110 + }, + { + "epoch": 0.6300120048019208, + "grad_norm": 0.4927319884300232, + "learning_rate": 1.6431919215216862e-05, + "loss": 0.8364, + "num_input_tokens_seen": 107479040, + "step": 13120 + }, + { + "epoch": 0.6304921968787515, + "grad_norm": 0.5446639060974121, + "learning_rate": 1.6394712755956817e-05, + "loss": 0.996, + "num_input_tokens_seen": 107560960, + "step": 13130 + }, + { + "epoch": 0.6309723889555823, + "grad_norm": 0.5035328269004822, + "learning_rate": 1.635752790589405e-05, + "loss": 0.8958, + "num_input_tokens_seen": 107642880, + "step": 13140 + }, + { + "epoch": 0.631452581032413, + "grad_norm": 0.4967860281467438, + "learning_rate": 1.6320364758405422e-05, + "loss": 0.9864, + "num_input_tokens_seen": 107724800, + "step": 13150 + }, + { + "epoch": 0.6319327731092437, + "grad_norm": 0.554397463798523, + "learning_rate": 1.628322340681329e-05, + "loss": 0.9916, + "num_input_tokens_seen": 107806720, + "step": 13160 + }, + { + "epoch": 0.6324129651860745, + "grad_norm": 0.5113590955734253, + "learning_rate": 1.6246103944385295e-05, + "loss": 1.0281, + "num_input_tokens_seen": 107888640, + "step": 13170 + }, + { + "epoch": 0.6328931572629052, + "grad_norm": 0.49097153544425964, + "learning_rate": 1.620900646433412e-05, + "loss": 0.8902, + "num_input_tokens_seen": 107970560, + "step": 13180 + }, + { + "epoch": 0.6333733493397359, + "grad_norm": 0.523250937461853, + "learning_rate": 1.6171931059817214e-05, + "loss": 0.9968, + "num_input_tokens_seen": 108052480, + "step": 13190 + }, + { + "epoch": 0.6338535414165666, + "grad_norm": 0.4726887345314026, + "learning_rate": 1.613487782393661e-05, + "loss": 0.6298, + "num_input_tokens_seen": 108134400, + "step": 13200 + }, + { + "epoch": 0.6343337334933974, + "grad_norm": 0.5005057454109192, + "learning_rate": 1.6097846849738685e-05, + "loss": 0.7394, + "num_input_tokens_seen": 108216320, + "step": 13210 + }, + { + "epoch": 0.6348139255702281, + "grad_norm": 0.4948345124721527, + "learning_rate": 1.6060838230213883e-05, + "loss": 0.9741, + "num_input_tokens_seen": 108298240, + "step": 13220 + }, + { + "epoch": 0.6352941176470588, + "grad_norm": 0.6280222535133362, + "learning_rate": 1.6023852058296544e-05, + "loss": 1.0429, + "num_input_tokens_seen": 108380160, + "step": 13230 + }, + { + "epoch": 0.6357743097238896, + "grad_norm": 0.553558349609375, + "learning_rate": 1.5986888426864617e-05, + "loss": 0.8975, + "num_input_tokens_seen": 108462080, + "step": 13240 + }, + { + "epoch": 0.6362545018007203, + "grad_norm": 0.47756895422935486, + "learning_rate": 1.5949947428739448e-05, + "loss": 0.8076, + "num_input_tokens_seen": 108544000, + "step": 13250 + }, + { + "epoch": 0.636734693877551, + "grad_norm": 0.8511344194412231, + "learning_rate": 1.591302915668556e-05, + "loss": 0.747, + "num_input_tokens_seen": 108625920, + "step": 13260 + }, + { + "epoch": 0.6372148859543818, + "grad_norm": 0.4981195032596588, + "learning_rate": 1.5876133703410412e-05, + "loss": 0.9432, + "num_input_tokens_seen": 108707840, + "step": 13270 + }, + { + "epoch": 0.6376950780312125, + "grad_norm": 0.4754570722579956, + "learning_rate": 1.5839261161564138e-05, + "loss": 1.0365, + "num_input_tokens_seen": 108789760, + "step": 13280 + }, + { + "epoch": 0.6381752701080432, + "grad_norm": 1.2110140323638916, + "learning_rate": 1.5802411623739345e-05, + "loss": 0.9743, + "num_input_tokens_seen": 108871680, + "step": 13290 + }, + { + "epoch": 0.6386554621848739, + "grad_norm": 0.4123174250125885, + "learning_rate": 1.57655851824709e-05, + "loss": 0.8376, + "num_input_tokens_seen": 108953600, + "step": 13300 + }, + { + "epoch": 0.6391356542617047, + "grad_norm": 0.5139141082763672, + "learning_rate": 1.5728781930235627e-05, + "loss": 0.9858, + "num_input_tokens_seen": 109035520, + "step": 13310 + }, + { + "epoch": 0.6396158463385354, + "grad_norm": 0.49317967891693115, + "learning_rate": 1.5692001959452164e-05, + "loss": 1.0486, + "num_input_tokens_seen": 109117440, + "step": 13320 + }, + { + "epoch": 0.6400960384153661, + "grad_norm": 0.48599720001220703, + "learning_rate": 1.5655245362480654e-05, + "loss": 0.8965, + "num_input_tokens_seen": 109199360, + "step": 13330 + }, + { + "epoch": 0.6405762304921969, + "grad_norm": 0.4471222162246704, + "learning_rate": 1.561851223162254e-05, + "loss": 0.8839, + "num_input_tokens_seen": 109281280, + "step": 13340 + }, + { + "epoch": 0.6410564225690276, + "grad_norm": 0.4929245412349701, + "learning_rate": 1.558180265912037e-05, + "loss": 0.9363, + "num_input_tokens_seen": 109363200, + "step": 13350 + }, + { + "epoch": 0.6415366146458583, + "grad_norm": 0.4899829626083374, + "learning_rate": 1.5545116737157522e-05, + "loss": 0.9724, + "num_input_tokens_seen": 109445120, + "step": 13360 + }, + { + "epoch": 0.6420168067226891, + "grad_norm": 0.4964812994003296, + "learning_rate": 1.5508454557857966e-05, + "loss": 0.7562, + "num_input_tokens_seen": 109527040, + "step": 13370 + }, + { + "epoch": 0.6424969987995198, + "grad_norm": 0.44805675745010376, + "learning_rate": 1.5471816213286054e-05, + "loss": 0.8147, + "num_input_tokens_seen": 109608960, + "step": 13380 + }, + { + "epoch": 0.6429771908763505, + "grad_norm": 0.8485963940620422, + "learning_rate": 1.5435201795446317e-05, + "loss": 0.9489, + "num_input_tokens_seen": 109690880, + "step": 13390 + }, + { + "epoch": 0.6434573829531812, + "grad_norm": 0.4873722195625305, + "learning_rate": 1.5398611396283153e-05, + "loss": 0.8655, + "num_input_tokens_seen": 109772800, + "step": 13400 + }, + { + "epoch": 0.643937575030012, + "grad_norm": 0.5201349258422852, + "learning_rate": 1.536204510768069e-05, + "loss": 0.8621, + "num_input_tokens_seen": 109854720, + "step": 13410 + }, + { + "epoch": 0.6444177671068427, + "grad_norm": 0.4951586425304413, + "learning_rate": 1.532550302146249e-05, + "loss": 0.8371, + "num_input_tokens_seen": 109936640, + "step": 13420 + }, + { + "epoch": 0.6448979591836734, + "grad_norm": 0.4836975336074829, + "learning_rate": 1.528898522939133e-05, + "loss": 0.7762, + "num_input_tokens_seen": 110018560, + "step": 13430 + }, + { + "epoch": 0.6453781512605042, + "grad_norm": 1.549204707145691, + "learning_rate": 1.5252491823168994e-05, + "loss": 0.9059, + "num_input_tokens_seen": 110100480, + "step": 13440 + }, + { + "epoch": 0.6458583433373349, + "grad_norm": 0.39099404215812683, + "learning_rate": 1.5216022894436043e-05, + "loss": 0.9104, + "num_input_tokens_seen": 110182400, + "step": 13450 + }, + { + "epoch": 0.6463385354141656, + "grad_norm": 0.4727102816104889, + "learning_rate": 1.517957853477154e-05, + "loss": 0.9764, + "num_input_tokens_seen": 110264320, + "step": 13460 + }, + { + "epoch": 0.6468187274909964, + "grad_norm": 0.5066577792167664, + "learning_rate": 1.5143158835692866e-05, + "loss": 0.8985, + "num_input_tokens_seen": 110346240, + "step": 13470 + }, + { + "epoch": 0.6472989195678271, + "grad_norm": 0.5499137043952942, + "learning_rate": 1.5106763888655478e-05, + "loss": 1.1936, + "num_input_tokens_seen": 110428160, + "step": 13480 + }, + { + "epoch": 0.6477791116446578, + "grad_norm": 0.4742192029953003, + "learning_rate": 1.5070393785052695e-05, + "loss": 0.9241, + "num_input_tokens_seen": 110510080, + "step": 13490 + }, + { + "epoch": 0.6482593037214885, + "grad_norm": 0.4930437207221985, + "learning_rate": 1.5034048616215402e-05, + "loss": 0.7714, + "num_input_tokens_seen": 110592000, + "step": 13500 + }, + { + "epoch": 0.6487394957983194, + "grad_norm": 0.48428675532341003, + "learning_rate": 1.4997728473411903e-05, + "loss": 1.0863, + "num_input_tokens_seen": 110673920, + "step": 13510 + }, + { + "epoch": 0.64921968787515, + "grad_norm": 0.49991798400878906, + "learning_rate": 1.4961433447847672e-05, + "loss": 0.8677, + "num_input_tokens_seen": 110755840, + "step": 13520 + }, + { + "epoch": 0.6496998799519808, + "grad_norm": 0.4799818694591522, + "learning_rate": 1.4925163630665065e-05, + "loss": 0.8387, + "num_input_tokens_seen": 110837760, + "step": 13530 + }, + { + "epoch": 0.6501800720288116, + "grad_norm": 0.4553060531616211, + "learning_rate": 1.4888919112943173e-05, + "loss": 0.9952, + "num_input_tokens_seen": 110919680, + "step": 13540 + }, + { + "epoch": 0.6506602641056423, + "grad_norm": 0.8867954611778259, + "learning_rate": 1.4852699985697546e-05, + "loss": 0.8378, + "num_input_tokens_seen": 111001600, + "step": 13550 + }, + { + "epoch": 0.651140456182473, + "grad_norm": 0.49487099051475525, + "learning_rate": 1.4816506339879965e-05, + "loss": 0.8531, + "num_input_tokens_seen": 111083520, + "step": 13560 + }, + { + "epoch": 0.6516206482593038, + "grad_norm": 0.5083696246147156, + "learning_rate": 1.4780338266378232e-05, + "loss": 0.8573, + "num_input_tokens_seen": 111165440, + "step": 13570 + }, + { + "epoch": 0.6521008403361345, + "grad_norm": 0.5299503207206726, + "learning_rate": 1.4744195856015947e-05, + "loss": 0.6841, + "num_input_tokens_seen": 111247360, + "step": 13580 + }, + { + "epoch": 0.6525810324129652, + "grad_norm": 0.49826177954673767, + "learning_rate": 1.4708079199552221e-05, + "loss": 0.8537, + "num_input_tokens_seen": 111329280, + "step": 13590 + }, + { + "epoch": 0.6530612244897959, + "grad_norm": 0.5107062458992004, + "learning_rate": 1.4671988387681549e-05, + "loss": 1.038, + "num_input_tokens_seen": 111411200, + "step": 13600 + }, + { + "epoch": 0.6535414165666267, + "grad_norm": 0.4928327202796936, + "learning_rate": 1.4635923511033494e-05, + "loss": 0.9946, + "num_input_tokens_seen": 111493120, + "step": 13610 + }, + { + "epoch": 0.6540216086434574, + "grad_norm": 0.4964877963066101, + "learning_rate": 1.4599884660172485e-05, + "loss": 1.8003, + "num_input_tokens_seen": 111575040, + "step": 13620 + }, + { + "epoch": 0.6545018007202881, + "grad_norm": 0.530273973941803, + "learning_rate": 1.4563871925597622e-05, + "loss": 0.8789, + "num_input_tokens_seen": 111656960, + "step": 13630 + }, + { + "epoch": 0.6549819927971189, + "grad_norm": 0.48265501856803894, + "learning_rate": 1.452788539774241e-05, + "loss": 0.6229, + "num_input_tokens_seen": 111738880, + "step": 13640 + }, + { + "epoch": 0.6554621848739496, + "grad_norm": 0.4895990788936615, + "learning_rate": 1.4491925166974532e-05, + "loss": 0.8647, + "num_input_tokens_seen": 111820800, + "step": 13650 + }, + { + "epoch": 0.6559423769507803, + "grad_norm": 0.534498929977417, + "learning_rate": 1.4455991323595655e-05, + "loss": 0.8938, + "num_input_tokens_seen": 111902720, + "step": 13660 + }, + { + "epoch": 0.6564225690276111, + "grad_norm": 0.48878028988838196, + "learning_rate": 1.4420083957841185e-05, + "loss": 0.8419, + "num_input_tokens_seen": 111984640, + "step": 13670 + }, + { + "epoch": 0.6569027611044418, + "grad_norm": 0.49631527066230774, + "learning_rate": 1.4384203159880017e-05, + "loss": 0.8683, + "num_input_tokens_seen": 112066560, + "step": 13680 + }, + { + "epoch": 0.6573829531812725, + "grad_norm": 0.5460849404335022, + "learning_rate": 1.4348349019814344e-05, + "loss": 1.1068, + "num_input_tokens_seen": 112148480, + "step": 13690 + }, + { + "epoch": 0.6578631452581032, + "grad_norm": 0.4952901601791382, + "learning_rate": 1.4312521627679428e-05, + "loss": 0.9632, + "num_input_tokens_seen": 112230400, + "step": 13700 + }, + { + "epoch": 0.658343337334934, + "grad_norm": 0.5365872979164124, + "learning_rate": 1.4276721073443344e-05, + "loss": 0.8043, + "num_input_tokens_seen": 112312320, + "step": 13710 + }, + { + "epoch": 0.6588235294117647, + "grad_norm": 0.5443843007087708, + "learning_rate": 1.4240947447006764e-05, + "loss": 1.025, + "num_input_tokens_seen": 112394240, + "step": 13720 + }, + { + "epoch": 0.6593037214885954, + "grad_norm": 0.49245166778564453, + "learning_rate": 1.4205200838202782e-05, + "loss": 1.0484, + "num_input_tokens_seen": 112476160, + "step": 13730 + }, + { + "epoch": 0.6597839135654262, + "grad_norm": 0.5047618746757507, + "learning_rate": 1.4169481336796597e-05, + "loss": 1.0862, + "num_input_tokens_seen": 112558080, + "step": 13740 + }, + { + "epoch": 0.6602641056422569, + "grad_norm": 0.5078234076499939, + "learning_rate": 1.4133789032485367e-05, + "loss": 0.9006, + "num_input_tokens_seen": 112640000, + "step": 13750 + }, + { + "epoch": 0.6607442977190876, + "grad_norm": 1.923171877861023, + "learning_rate": 1.4098124014897961e-05, + "loss": 0.8773, + "num_input_tokens_seen": 112721920, + "step": 13760 + }, + { + "epoch": 0.6612244897959184, + "grad_norm": 0.5198791027069092, + "learning_rate": 1.4062486373594694e-05, + "loss": 0.802, + "num_input_tokens_seen": 112803840, + "step": 13770 + }, + { + "epoch": 0.6617046818727491, + "grad_norm": 0.4909871518611908, + "learning_rate": 1.4026876198067163e-05, + "loss": 1.0097, + "num_input_tokens_seen": 112885760, + "step": 13780 + }, + { + "epoch": 0.6621848739495798, + "grad_norm": 0.4735969305038452, + "learning_rate": 1.399129357773799e-05, + "loss": 0.9333, + "num_input_tokens_seen": 112967680, + "step": 13790 + }, + { + "epoch": 0.6626650660264105, + "grad_norm": 0.52108234167099, + "learning_rate": 1.3955738601960588e-05, + "loss": 0.8832, + "num_input_tokens_seen": 113049600, + "step": 13800 + }, + { + "epoch": 0.6631452581032413, + "grad_norm": 0.49739062786102295, + "learning_rate": 1.392021136001897e-05, + "loss": 0.7867, + "num_input_tokens_seen": 113131520, + "step": 13810 + }, + { + "epoch": 0.663625450180072, + "grad_norm": 0.5152223706245422, + "learning_rate": 1.3884711941127487e-05, + "loss": 0.9924, + "num_input_tokens_seen": 113213440, + "step": 13820 + }, + { + "epoch": 0.6641056422569027, + "grad_norm": 0.4832291901111603, + "learning_rate": 1.384924043443062e-05, + "loss": 0.7811, + "num_input_tokens_seen": 113295360, + "step": 13830 + }, + { + "epoch": 0.6645858343337335, + "grad_norm": 0.49378257989883423, + "learning_rate": 1.3813796929002779e-05, + "loss": 0.7269, + "num_input_tokens_seen": 113377280, + "step": 13840 + }, + { + "epoch": 0.6650660264105642, + "grad_norm": 0.4792208969593048, + "learning_rate": 1.3778381513848055e-05, + "loss": 0.7797, + "num_input_tokens_seen": 113459200, + "step": 13850 + }, + { + "epoch": 0.6655462184873949, + "grad_norm": 0.5114631056785583, + "learning_rate": 1.3742994277899967e-05, + "loss": 0.7852, + "num_input_tokens_seen": 113541120, + "step": 13860 + }, + { + "epoch": 0.6660264105642257, + "grad_norm": 0.49299582839012146, + "learning_rate": 1.370763531002132e-05, + "loss": 1.2018, + "num_input_tokens_seen": 113623040, + "step": 13870 + }, + { + "epoch": 0.6665066026410564, + "grad_norm": 0.48771587014198303, + "learning_rate": 1.3672304699003908e-05, + "loss": 0.7667, + "num_input_tokens_seen": 113704960, + "step": 13880 + }, + { + "epoch": 0.6669867947178871, + "grad_norm": 0.6052321791648865, + "learning_rate": 1.3637002533568302e-05, + "loss": 0.8471, + "num_input_tokens_seen": 113786880, + "step": 13890 + }, + { + "epoch": 0.6674669867947179, + "grad_norm": 0.47869783639907837, + "learning_rate": 1.3601728902363681e-05, + "loss": 1.1894, + "num_input_tokens_seen": 113868800, + "step": 13900 + }, + { + "epoch": 0.6679471788715486, + "grad_norm": 0.4907095432281494, + "learning_rate": 1.356648389396754e-05, + "loss": 0.9134, + "num_input_tokens_seen": 113950720, + "step": 13910 + }, + { + "epoch": 0.6684273709483793, + "grad_norm": 0.4225086569786072, + "learning_rate": 1.3531267596885488e-05, + "loss": 1.1423, + "num_input_tokens_seen": 114032640, + "step": 13920 + }, + { + "epoch": 0.66890756302521, + "grad_norm": 0.4796990156173706, + "learning_rate": 1.349608009955107e-05, + "loss": 0.7921, + "num_input_tokens_seen": 114114560, + "step": 13930 + }, + { + "epoch": 0.6693877551020408, + "grad_norm": 0.4856274724006653, + "learning_rate": 1.34609214903255e-05, + "loss": 0.8029, + "num_input_tokens_seen": 114196480, + "step": 13940 + }, + { + "epoch": 0.6698679471788715, + "grad_norm": 0.4970095753669739, + "learning_rate": 1.3425791857497422e-05, + "loss": 0.9638, + "num_input_tokens_seen": 114278400, + "step": 13950 + }, + { + "epoch": 0.6703481392557022, + "grad_norm": 0.47488856315612793, + "learning_rate": 1.3390691289282754e-05, + "loss": 0.8415, + "num_input_tokens_seen": 114360320, + "step": 13960 + }, + { + "epoch": 0.6708283313325331, + "grad_norm": 0.4611085057258606, + "learning_rate": 1.335561987382441e-05, + "loss": 0.9274, + "num_input_tokens_seen": 114442240, + "step": 13970 + }, + { + "epoch": 0.6713085234093638, + "grad_norm": 0.4784678518772125, + "learning_rate": 1.3320577699192086e-05, + "loss": 0.7943, + "num_input_tokens_seen": 114524160, + "step": 13980 + }, + { + "epoch": 0.6717887154861945, + "grad_norm": 0.49473994970321655, + "learning_rate": 1.3285564853382076e-05, + "loss": 0.9644, + "num_input_tokens_seen": 114606080, + "step": 13990 + }, + { + "epoch": 0.6722689075630253, + "grad_norm": 0.45741796493530273, + "learning_rate": 1.325058142431701e-05, + "loss": 0.8024, + "num_input_tokens_seen": 114688000, + "step": 14000 + }, + { + "epoch": 0.672749099639856, + "grad_norm": 0.4712899625301361, + "learning_rate": 1.321562749984563e-05, + "loss": 0.7327, + "num_input_tokens_seen": 114769920, + "step": 14010 + }, + { + "epoch": 0.6732292917166867, + "grad_norm": 0.4954095780849457, + "learning_rate": 1.318070316774262e-05, + "loss": 0.9788, + "num_input_tokens_seen": 114851840, + "step": 14020 + }, + { + "epoch": 0.6737094837935174, + "grad_norm": 0.5219026207923889, + "learning_rate": 1.3145808515708347e-05, + "loss": 0.7022, + "num_input_tokens_seen": 114933760, + "step": 14030 + }, + { + "epoch": 0.6741896758703482, + "grad_norm": 0.4924544394016266, + "learning_rate": 1.3110943631368616e-05, + "loss": 0.9829, + "num_input_tokens_seen": 115015680, + "step": 14040 + }, + { + "epoch": 0.6746698679471789, + "grad_norm": 0.49134406447410583, + "learning_rate": 1.3076108602274522e-05, + "loss": 0.7022, + "num_input_tokens_seen": 115097600, + "step": 14050 + }, + { + "epoch": 0.6751500600240096, + "grad_norm": 0.4494248926639557, + "learning_rate": 1.3041303515902179e-05, + "loss": 0.8251, + "num_input_tokens_seen": 115179520, + "step": 14060 + }, + { + "epoch": 0.6756302521008404, + "grad_norm": 0.46173524856567383, + "learning_rate": 1.3006528459652476e-05, + "loss": 1.0901, + "num_input_tokens_seen": 115261440, + "step": 14070 + }, + { + "epoch": 0.6761104441776711, + "grad_norm": 0.5183840394020081, + "learning_rate": 1.2971783520850939e-05, + "loss": 0.9864, + "num_input_tokens_seen": 115343360, + "step": 14080 + }, + { + "epoch": 0.6765906362545018, + "grad_norm": 0.4756566286087036, + "learning_rate": 1.2937068786747438e-05, + "loss": 0.9465, + "num_input_tokens_seen": 115425280, + "step": 14090 + }, + { + "epoch": 0.6770708283313326, + "grad_norm": 0.47903919219970703, + "learning_rate": 1.2902384344515986e-05, + "loss": 0.9708, + "num_input_tokens_seen": 115507200, + "step": 14100 + }, + { + "epoch": 0.6775510204081633, + "grad_norm": 0.6484615206718445, + "learning_rate": 1.286773028125455e-05, + "loss": 0.7833, + "num_input_tokens_seen": 115589120, + "step": 14110 + }, + { + "epoch": 0.678031212484994, + "grad_norm": 0.48011910915374756, + "learning_rate": 1.2833106683984808e-05, + "loss": 0.9278, + "num_input_tokens_seen": 115671040, + "step": 14120 + }, + { + "epoch": 0.6785114045618247, + "grad_norm": 0.4931375980377197, + "learning_rate": 1.279851363965193e-05, + "loss": 0.8275, + "num_input_tokens_seen": 115752960, + "step": 14130 + }, + { + "epoch": 0.6789915966386555, + "grad_norm": 0.5052535533905029, + "learning_rate": 1.2763951235124346e-05, + "loss": 0.8764, + "num_input_tokens_seen": 115834880, + "step": 14140 + }, + { + "epoch": 0.6794717887154862, + "grad_norm": 0.49252551794052124, + "learning_rate": 1.2729419557193573e-05, + "loss": 0.9236, + "num_input_tokens_seen": 115916800, + "step": 14150 + }, + { + "epoch": 0.6799519807923169, + "grad_norm": 0.45454323291778564, + "learning_rate": 1.2694918692573954e-05, + "loss": 0.7963, + "num_input_tokens_seen": 115998720, + "step": 14160 + }, + { + "epoch": 0.6804321728691477, + "grad_norm": 0.47249314188957214, + "learning_rate": 1.2660448727902457e-05, + "loss": 0.7971, + "num_input_tokens_seen": 116080640, + "step": 14170 + }, + { + "epoch": 0.6809123649459784, + "grad_norm": 0.4945741891860962, + "learning_rate": 1.2626009749738444e-05, + "loss": 0.7678, + "num_input_tokens_seen": 116162560, + "step": 14180 + }, + { + "epoch": 0.6813925570228091, + "grad_norm": 0.4679727852344513, + "learning_rate": 1.2591601844563488e-05, + "loss": 0.8935, + "num_input_tokens_seen": 116244480, + "step": 14190 + }, + { + "epoch": 0.6818727490996399, + "grad_norm": 0.49965664744377136, + "learning_rate": 1.2557225098781105e-05, + "loss": 0.9236, + "num_input_tokens_seen": 116326400, + "step": 14200 + }, + { + "epoch": 0.6823529411764706, + "grad_norm": 0.46757179498672485, + "learning_rate": 1.2522879598716595e-05, + "loss": 0.8667, + "num_input_tokens_seen": 116408320, + "step": 14210 + }, + { + "epoch": 0.6828331332533013, + "grad_norm": 0.6531784534454346, + "learning_rate": 1.2488565430616785e-05, + "loss": 0.8728, + "num_input_tokens_seen": 116490240, + "step": 14220 + }, + { + "epoch": 0.683313325330132, + "grad_norm": 0.4773021340370178, + "learning_rate": 1.2454282680649804e-05, + "loss": 0.6747, + "num_input_tokens_seen": 116572160, + "step": 14230 + }, + { + "epoch": 0.6837935174069628, + "grad_norm": 0.6445793509483337, + "learning_rate": 1.2420031434904906e-05, + "loss": 0.864, + "num_input_tokens_seen": 116654080, + "step": 14240 + }, + { + "epoch": 0.6842737094837935, + "grad_norm": 0.479495644569397, + "learning_rate": 1.2385811779392236e-05, + "loss": 0.8979, + "num_input_tokens_seen": 116736000, + "step": 14250 + }, + { + "epoch": 0.6847539015606242, + "grad_norm": 0.8227359652519226, + "learning_rate": 1.2351623800042587e-05, + "loss": 0.9371, + "num_input_tokens_seen": 116817920, + "step": 14260 + }, + { + "epoch": 0.685234093637455, + "grad_norm": 0.41954296827316284, + "learning_rate": 1.2317467582707238e-05, + "loss": 0.9432, + "num_input_tokens_seen": 116899840, + "step": 14270 + }, + { + "epoch": 0.6857142857142857, + "grad_norm": 0.5001243948936462, + "learning_rate": 1.2283343213157688e-05, + "loss": 0.8542, + "num_input_tokens_seen": 116981760, + "step": 14280 + }, + { + "epoch": 0.6861944777911164, + "grad_norm": 0.5058388113975525, + "learning_rate": 1.2249250777085456e-05, + "loss": 0.897, + "num_input_tokens_seen": 117063680, + "step": 14290 + }, + { + "epoch": 0.6866746698679472, + "grad_norm": 0.5120940804481506, + "learning_rate": 1.221519036010189e-05, + "loss": 0.9898, + "num_input_tokens_seen": 117145600, + "step": 14300 + }, + { + "epoch": 0.6871548619447779, + "grad_norm": 0.5334838032722473, + "learning_rate": 1.2181162047737942e-05, + "loss": 0.7581, + "num_input_tokens_seen": 117227520, + "step": 14310 + }, + { + "epoch": 0.6876350540216086, + "grad_norm": 0.4675463140010834, + "learning_rate": 1.2147165925443904e-05, + "loss": 1.0274, + "num_input_tokens_seen": 117309440, + "step": 14320 + }, + { + "epoch": 0.6881152460984393, + "grad_norm": 0.7059617638587952, + "learning_rate": 1.2113202078589267e-05, + "loss": 0.8403, + "num_input_tokens_seen": 117391360, + "step": 14330 + }, + { + "epoch": 0.6885954381752701, + "grad_norm": 0.5068497061729431, + "learning_rate": 1.2079270592462475e-05, + "loss": 0.8582, + "num_input_tokens_seen": 117473280, + "step": 14340 + }, + { + "epoch": 0.6890756302521008, + "grad_norm": 0.5074156522750854, + "learning_rate": 1.204537155227068e-05, + "loss": 0.876, + "num_input_tokens_seen": 117555200, + "step": 14350 + }, + { + "epoch": 0.6895558223289315, + "grad_norm": 0.5661312341690063, + "learning_rate": 1.201150504313959e-05, + "loss": 0.9847, + "num_input_tokens_seen": 117637120, + "step": 14360 + }, + { + "epoch": 0.6900360144057623, + "grad_norm": 0.48914408683776855, + "learning_rate": 1.1977671150113206e-05, + "loss": 0.8981, + "num_input_tokens_seen": 117719040, + "step": 14370 + }, + { + "epoch": 0.690516206482593, + "grad_norm": 0.5188919305801392, + "learning_rate": 1.1943869958153613e-05, + "loss": 0.929, + "num_input_tokens_seen": 117800960, + "step": 14380 + }, + { + "epoch": 0.6909963985594237, + "grad_norm": 0.4931916892528534, + "learning_rate": 1.1910101552140806e-05, + "loss": 0.9613, + "num_input_tokens_seen": 117882880, + "step": 14390 + }, + { + "epoch": 0.6914765906362546, + "grad_norm": 0.5177183747291565, + "learning_rate": 1.1876366016872445e-05, + "loss": 0.9258, + "num_input_tokens_seen": 117964800, + "step": 14400 + }, + { + "epoch": 0.6919567827130852, + "grad_norm": 0.48315808176994324, + "learning_rate": 1.1842663437063613e-05, + "loss": 0.7828, + "num_input_tokens_seen": 118046720, + "step": 14410 + }, + { + "epoch": 0.692436974789916, + "grad_norm": 0.4908153712749481, + "learning_rate": 1.180899389734668e-05, + "loss": 0.9457, + "num_input_tokens_seen": 118128640, + "step": 14420 + }, + { + "epoch": 0.6929171668667466, + "grad_norm": 0.47637316584587097, + "learning_rate": 1.1775357482271032e-05, + "loss": 0.872, + "num_input_tokens_seen": 118210560, + "step": 14430 + }, + { + "epoch": 0.6933973589435775, + "grad_norm": 0.5091587901115417, + "learning_rate": 1.1741754276302851e-05, + "loss": 0.8554, + "num_input_tokens_seen": 118292480, + "step": 14440 + }, + { + "epoch": 0.6938775510204082, + "grad_norm": 0.49093228578567505, + "learning_rate": 1.170818436382497e-05, + "loss": 0.9142, + "num_input_tokens_seen": 118374400, + "step": 14450 + }, + { + "epoch": 0.6943577430972389, + "grad_norm": 0.47567296028137207, + "learning_rate": 1.1674647829136581e-05, + "loss": 0.9356, + "num_input_tokens_seen": 118456320, + "step": 14460 + }, + { + "epoch": 0.6948379351740697, + "grad_norm": 0.49377357959747314, + "learning_rate": 1.164114475645306e-05, + "loss": 0.9423, + "num_input_tokens_seen": 118538240, + "step": 14470 + }, + { + "epoch": 0.6953181272509004, + "grad_norm": 0.49795305728912354, + "learning_rate": 1.1607675229905776e-05, + "loss": 0.8267, + "num_input_tokens_seen": 118620160, + "step": 14480 + }, + { + "epoch": 0.6957983193277311, + "grad_norm": 0.42667868733406067, + "learning_rate": 1.1574239333541856e-05, + "loss": 0.8272, + "num_input_tokens_seen": 118702080, + "step": 14490 + }, + { + "epoch": 0.6962785114045619, + "grad_norm": 0.4863702356815338, + "learning_rate": 1.1540837151323951e-05, + "loss": 1.0219, + "num_input_tokens_seen": 118784000, + "step": 14500 + }, + { + "epoch": 0.6967587034813926, + "grad_norm": 0.42491790652275085, + "learning_rate": 1.150746876713008e-05, + "loss": 1.0269, + "num_input_tokens_seen": 118865920, + "step": 14510 + }, + { + "epoch": 0.6972388955582233, + "grad_norm": 1.2376126050949097, + "learning_rate": 1.1474134264753384e-05, + "loss": 0.9424, + "num_input_tokens_seen": 118947840, + "step": 14520 + }, + { + "epoch": 0.697719087635054, + "grad_norm": 0.4999983310699463, + "learning_rate": 1.1440833727901894e-05, + "loss": 0.8436, + "num_input_tokens_seen": 119029760, + "step": 14530 + }, + { + "epoch": 0.6981992797118848, + "grad_norm": 1.1919395923614502, + "learning_rate": 1.1407567240198397e-05, + "loss": 0.7876, + "num_input_tokens_seen": 119111680, + "step": 14540 + }, + { + "epoch": 0.6986794717887155, + "grad_norm": 0.4805420935153961, + "learning_rate": 1.1374334885180135e-05, + "loss": 0.8579, + "num_input_tokens_seen": 119193600, + "step": 14550 + }, + { + "epoch": 0.6991596638655462, + "grad_norm": 0.505160391330719, + "learning_rate": 1.1341136746298647e-05, + "loss": 0.9189, + "num_input_tokens_seen": 119275520, + "step": 14560 + }, + { + "epoch": 0.699639855942377, + "grad_norm": 0.369730681180954, + "learning_rate": 1.1307972906919562e-05, + "loss": 0.834, + "num_input_tokens_seen": 119357440, + "step": 14570 + }, + { + "epoch": 0.7001200480192077, + "grad_norm": 0.493691086769104, + "learning_rate": 1.1274843450322381e-05, + "loss": 0.8542, + "num_input_tokens_seen": 119439360, + "step": 14580 + }, + { + "epoch": 0.7006002400960384, + "grad_norm": 0.49764561653137207, + "learning_rate": 1.1241748459700241e-05, + "loss": 0.8792, + "num_input_tokens_seen": 119521280, + "step": 14590 + }, + { + "epoch": 0.7010804321728692, + "grad_norm": 0.6448412537574768, + "learning_rate": 1.1208688018159746e-05, + "loss": 0.9641, + "num_input_tokens_seen": 119603200, + "step": 14600 + }, + { + "epoch": 0.7015606242496999, + "grad_norm": 0.499411016702652, + "learning_rate": 1.1175662208720758e-05, + "loss": 1.0869, + "num_input_tokens_seen": 119685120, + "step": 14610 + }, + { + "epoch": 0.7020408163265306, + "grad_norm": 0.5112809538841248, + "learning_rate": 1.1142671114316127e-05, + "loss": 0.9409, + "num_input_tokens_seen": 119767040, + "step": 14620 + }, + { + "epoch": 0.7025210084033613, + "grad_norm": 0.5502108931541443, + "learning_rate": 1.1109714817791584e-05, + "loss": 0.9631, + "num_input_tokens_seen": 119848960, + "step": 14630 + }, + { + "epoch": 0.7030012004801921, + "grad_norm": 0.4769909381866455, + "learning_rate": 1.1076793401905419e-05, + "loss": 0.8025, + "num_input_tokens_seen": 119930880, + "step": 14640 + }, + { + "epoch": 0.7034813925570228, + "grad_norm": 0.5704587697982788, + "learning_rate": 1.1043906949328387e-05, + "loss": 0.8847, + "num_input_tokens_seen": 120012800, + "step": 14650 + }, + { + "epoch": 0.7039615846338535, + "grad_norm": 0.49126431345939636, + "learning_rate": 1.1011055542643398e-05, + "loss": 0.8116, + "num_input_tokens_seen": 120094720, + "step": 14660 + }, + { + "epoch": 0.7044417767106843, + "grad_norm": 0.5111186504364014, + "learning_rate": 1.0978239264345397e-05, + "loss": 0.9473, + "num_input_tokens_seen": 120176640, + "step": 14670 + }, + { + "epoch": 0.704921968787515, + "grad_norm": 0.48010018467903137, + "learning_rate": 1.0945458196841078e-05, + "loss": 0.9452, + "num_input_tokens_seen": 120258560, + "step": 14680 + }, + { + "epoch": 0.7054021608643457, + "grad_norm": 0.4862194061279297, + "learning_rate": 1.0912712422448737e-05, + "loss": 1.089, + "num_input_tokens_seen": 120340480, + "step": 14690 + }, + { + "epoch": 0.7058823529411765, + "grad_norm": 0.5020564198493958, + "learning_rate": 1.0880002023398058e-05, + "loss": 0.8222, + "num_input_tokens_seen": 120422400, + "step": 14700 + }, + { + "epoch": 0.7063625450180072, + "grad_norm": 0.5132585167884827, + "learning_rate": 1.0847327081829853e-05, + "loss": 0.8349, + "num_input_tokens_seen": 120504320, + "step": 14710 + }, + { + "epoch": 0.7068427370948379, + "grad_norm": 0.5931466221809387, + "learning_rate": 1.0814687679795924e-05, + "loss": 1.4205, + "num_input_tokens_seen": 120586240, + "step": 14720 + }, + { + "epoch": 0.7073229291716686, + "grad_norm": 0.529240608215332, + "learning_rate": 1.0782083899258827e-05, + "loss": 0.9358, + "num_input_tokens_seen": 120668160, + "step": 14730 + }, + { + "epoch": 0.7078031212484994, + "grad_norm": 0.5132381319999695, + "learning_rate": 1.0749515822091658e-05, + "loss": 0.8712, + "num_input_tokens_seen": 120750080, + "step": 14740 + }, + { + "epoch": 0.7082833133253301, + "grad_norm": 0.5172890424728394, + "learning_rate": 1.0716983530077843e-05, + "loss": 1.2485, + "num_input_tokens_seen": 120832000, + "step": 14750 + }, + { + "epoch": 0.7087635054021608, + "grad_norm": 0.4927189350128174, + "learning_rate": 1.0684487104910974e-05, + "loss": 0.7965, + "num_input_tokens_seen": 120913920, + "step": 14760 + }, + { + "epoch": 0.7092436974789916, + "grad_norm": 0.3909131586551666, + "learning_rate": 1.0652026628194567e-05, + "loss": 0.8365, + "num_input_tokens_seen": 120995840, + "step": 14770 + }, + { + "epoch": 0.7097238895558223, + "grad_norm": 0.5766429901123047, + "learning_rate": 1.0619602181441848e-05, + "loss": 0.8742, + "num_input_tokens_seen": 121077760, + "step": 14780 + }, + { + "epoch": 0.710204081632653, + "grad_norm": 0.5812046527862549, + "learning_rate": 1.0587213846075591e-05, + "loss": 0.7952, + "num_input_tokens_seen": 121159680, + "step": 14790 + }, + { + "epoch": 0.7106842737094838, + "grad_norm": 0.5952445864677429, + "learning_rate": 1.0554861703427884e-05, + "loss": 0.9441, + "num_input_tokens_seen": 121241600, + "step": 14800 + }, + { + "epoch": 0.7111644657863145, + "grad_norm": 1.0399448871612549, + "learning_rate": 1.0522545834739908e-05, + "loss": 1.0587, + "num_input_tokens_seen": 121323520, + "step": 14810 + }, + { + "epoch": 0.7116446578631452, + "grad_norm": 0.519305944442749, + "learning_rate": 1.0490266321161785e-05, + "loss": 0.9414, + "num_input_tokens_seen": 121405440, + "step": 14820 + }, + { + "epoch": 0.712124849939976, + "grad_norm": 0.517493724822998, + "learning_rate": 1.0458023243752321e-05, + "loss": 0.9523, + "num_input_tokens_seen": 121487360, + "step": 14830 + }, + { + "epoch": 0.7126050420168067, + "grad_norm": 0.4983735978603363, + "learning_rate": 1.0425816683478823e-05, + "loss": 1.145, + "num_input_tokens_seen": 121569280, + "step": 14840 + }, + { + "epoch": 0.7130852340936374, + "grad_norm": 1.5348645448684692, + "learning_rate": 1.039364672121692e-05, + "loss": 0.9529, + "num_input_tokens_seen": 121651200, + "step": 14850 + }, + { + "epoch": 0.7135654261704681, + "grad_norm": 0.44925153255462646, + "learning_rate": 1.0361513437750333e-05, + "loss": 1.0087, + "num_input_tokens_seen": 121733120, + "step": 14860 + }, + { + "epoch": 0.714045618247299, + "grad_norm": 0.28816720843315125, + "learning_rate": 1.0329416913770651e-05, + "loss": 0.8125, + "num_input_tokens_seen": 121815040, + "step": 14870 + }, + { + "epoch": 0.7145258103241297, + "grad_norm": 0.5006898045539856, + "learning_rate": 1.0297357229877183e-05, + "loss": 0.8591, + "num_input_tokens_seen": 121896960, + "step": 14880 + }, + { + "epoch": 0.7150060024009603, + "grad_norm": 0.4817348122596741, + "learning_rate": 1.0265334466576723e-05, + "loss": 0.9232, + "num_input_tokens_seen": 121978880, + "step": 14890 + }, + { + "epoch": 0.7154861944777912, + "grad_norm": 0.511573851108551, + "learning_rate": 1.0233348704283332e-05, + "loss": 0.7655, + "num_input_tokens_seen": 122060800, + "step": 14900 + }, + { + "epoch": 0.7159663865546219, + "grad_norm": 0.49658405780792236, + "learning_rate": 1.0201400023318184e-05, + "loss": 0.8836, + "num_input_tokens_seen": 122142720, + "step": 14910 + }, + { + "epoch": 0.7164465786314526, + "grad_norm": 0.49688369035720825, + "learning_rate": 1.0169488503909313e-05, + "loss": 0.9663, + "num_input_tokens_seen": 122224640, + "step": 14920 + }, + { + "epoch": 0.7169267707082834, + "grad_norm": 0.4753476679325104, + "learning_rate": 1.0137614226191434e-05, + "loss": 1.1426, + "num_input_tokens_seen": 122306560, + "step": 14930 + }, + { + "epoch": 0.7174069627851141, + "grad_norm": 0.496444433927536, + "learning_rate": 1.010577727020576e-05, + "loss": 1.0298, + "num_input_tokens_seen": 122388480, + "step": 14940 + }, + { + "epoch": 0.7178871548619448, + "grad_norm": 0.5547921657562256, + "learning_rate": 1.0073977715899785e-05, + "loss": 1.0564, + "num_input_tokens_seen": 122470400, + "step": 14950 + }, + { + "epoch": 0.7183673469387755, + "grad_norm": 0.49256107211112976, + "learning_rate": 1.0042215643127051e-05, + "loss": 0.8569, + "num_input_tokens_seen": 122552320, + "step": 14960 + }, + { + "epoch": 0.7188475390156063, + "grad_norm": 0.48709073662757874, + "learning_rate": 1.0010491131647013e-05, + "loss": 0.9653, + "num_input_tokens_seen": 122634240, + "step": 14970 + }, + { + "epoch": 0.719327731092437, + "grad_norm": 0.9637261629104614, + "learning_rate": 9.978804261124792e-06, + "loss": 1.0368, + "num_input_tokens_seen": 122716160, + "step": 14980 + }, + { + "epoch": 0.7198079231692677, + "grad_norm": 0.5114941000938416, + "learning_rate": 9.947155111130969e-06, + "loss": 0.8155, + "num_input_tokens_seen": 122798080, + "step": 14990 + }, + { + "epoch": 0.7202881152460985, + "grad_norm": 0.4979882538318634, + "learning_rate": 9.915543761141432e-06, + "loss": 0.9462, + "num_input_tokens_seen": 122880000, + "step": 15000 + }, + { + "epoch": 0.7207683073229292, + "grad_norm": 0.49206554889678955, + "learning_rate": 9.883970290537134e-06, + "loss": 0.7584, + "num_input_tokens_seen": 122961920, + "step": 15010 + }, + { + "epoch": 0.7212484993997599, + "grad_norm": 0.5425018668174744, + "learning_rate": 9.852434778603888e-06, + "loss": 0.9354, + "num_input_tokens_seen": 123043840, + "step": 15020 + }, + { + "epoch": 0.7217286914765907, + "grad_norm": 0.311642587184906, + "learning_rate": 9.820937304532221e-06, + "loss": 0.8208, + "num_input_tokens_seen": 123125760, + "step": 15030 + }, + { + "epoch": 0.7222088835534214, + "grad_norm": 0.5740408301353455, + "learning_rate": 9.789477947417131e-06, + "loss": 1.0232, + "num_input_tokens_seen": 123207680, + "step": 15040 + }, + { + "epoch": 0.7226890756302521, + "grad_norm": 0.48182716965675354, + "learning_rate": 9.758056786257874e-06, + "loss": 0.8119, + "num_input_tokens_seen": 123289600, + "step": 15050 + }, + { + "epoch": 0.7231692677070828, + "grad_norm": 0.6770291328430176, + "learning_rate": 9.726673899957823e-06, + "loss": 0.9681, + "num_input_tokens_seen": 123371520, + "step": 15060 + }, + { + "epoch": 0.7236494597839136, + "grad_norm": 0.5068685412406921, + "learning_rate": 9.695329367324226e-06, + "loss": 0.831, + "num_input_tokens_seen": 123453440, + "step": 15070 + }, + { + "epoch": 0.7241296518607443, + "grad_norm": 0.5525927543640137, + "learning_rate": 9.664023267068007e-06, + "loss": 0.7507, + "num_input_tokens_seen": 123535360, + "step": 15080 + }, + { + "epoch": 0.724609843937575, + "grad_norm": 0.5521044731140137, + "learning_rate": 9.632755677803595e-06, + "loss": 0.9068, + "num_input_tokens_seen": 123617280, + "step": 15090 + }, + { + "epoch": 0.7250900360144058, + "grad_norm": 0.628960907459259, + "learning_rate": 9.601526678048701e-06, + "loss": 0.9601, + "num_input_tokens_seen": 123699200, + "step": 15100 + }, + { + "epoch": 0.7255702280912365, + "grad_norm": 0.49019432067871094, + "learning_rate": 9.570336346224145e-06, + "loss": 0.8658, + "num_input_tokens_seen": 123781120, + "step": 15110 + }, + { + "epoch": 0.7260504201680672, + "grad_norm": 0.49973979592323303, + "learning_rate": 9.53918476065363e-06, + "loss": 0.9606, + "num_input_tokens_seen": 123863040, + "step": 15120 + }, + { + "epoch": 0.726530612244898, + "grad_norm": 0.5277695655822754, + "learning_rate": 9.508071999563578e-06, + "loss": 0.8749, + "num_input_tokens_seen": 123944960, + "step": 15130 + }, + { + "epoch": 0.7270108043217287, + "grad_norm": 0.6320548057556152, + "learning_rate": 9.476998141082896e-06, + "loss": 0.8534, + "num_input_tokens_seen": 124026880, + "step": 15140 + }, + { + "epoch": 0.7274909963985594, + "grad_norm": 0.4708552658557892, + "learning_rate": 9.445963263242822e-06, + "loss": 0.8115, + "num_input_tokens_seen": 124108800, + "step": 15150 + }, + { + "epoch": 0.7279711884753901, + "grad_norm": 0.49595674872398376, + "learning_rate": 9.414967443976705e-06, + "loss": 0.8173, + "num_input_tokens_seen": 124190720, + "step": 15160 + }, + { + "epoch": 0.7284513805522209, + "grad_norm": 0.49820470809936523, + "learning_rate": 9.384010761119787e-06, + "loss": 0.7552, + "num_input_tokens_seen": 124272640, + "step": 15170 + }, + { + "epoch": 0.7289315726290516, + "grad_norm": 0.47693219780921936, + "learning_rate": 9.353093292409063e-06, + "loss": 1.0986, + "num_input_tokens_seen": 124354560, + "step": 15180 + }, + { + "epoch": 0.7294117647058823, + "grad_norm": 0.5159751176834106, + "learning_rate": 9.322215115483049e-06, + "loss": 0.9024, + "num_input_tokens_seen": 124436480, + "step": 15190 + }, + { + "epoch": 0.7298919567827131, + "grad_norm": 0.4883824586868286, + "learning_rate": 9.291376307881577e-06, + "loss": 0.7029, + "num_input_tokens_seen": 124518400, + "step": 15200 + }, + { + "epoch": 0.7303721488595438, + "grad_norm": 0.4944133162498474, + "learning_rate": 9.260576947045624e-06, + "loss": 0.7564, + "num_input_tokens_seen": 124600320, + "step": 15210 + }, + { + "epoch": 0.7308523409363745, + "grad_norm": 0.5287705063819885, + "learning_rate": 9.229817110317126e-06, + "loss": 0.8904, + "num_input_tokens_seen": 124682240, + "step": 15220 + }, + { + "epoch": 0.7313325330132053, + "grad_norm": 0.5056361556053162, + "learning_rate": 9.19909687493874e-06, + "loss": 0.8753, + "num_input_tokens_seen": 124764160, + "step": 15230 + }, + { + "epoch": 0.731812725090036, + "grad_norm": 0.49271291494369507, + "learning_rate": 9.168416318053701e-06, + "loss": 0.7468, + "num_input_tokens_seen": 124846080, + "step": 15240 + }, + { + "epoch": 0.7322929171668667, + "grad_norm": 0.4896814823150635, + "learning_rate": 9.137775516705604e-06, + "loss": 0.7828, + "num_input_tokens_seen": 124928000, + "step": 15250 + }, + { + "epoch": 0.7327731092436974, + "grad_norm": 0.5445641279220581, + "learning_rate": 9.107174547838188e-06, + "loss": 0.978, + "num_input_tokens_seen": 125009920, + "step": 15260 + }, + { + "epoch": 0.7332533013205282, + "grad_norm": 0.4939377009868622, + "learning_rate": 9.076613488295193e-06, + "loss": 0.9886, + "num_input_tokens_seen": 125091840, + "step": 15270 + }, + { + "epoch": 0.7337334933973589, + "grad_norm": 0.5052213668823242, + "learning_rate": 9.04609241482014e-06, + "loss": 0.9165, + "num_input_tokens_seen": 125173760, + "step": 15280 + }, + { + "epoch": 0.7342136854741896, + "grad_norm": 0.5136955976486206, + "learning_rate": 9.015611404056121e-06, + "loss": 0.9454, + "num_input_tokens_seen": 125255680, + "step": 15290 + }, + { + "epoch": 0.7346938775510204, + "grad_norm": 0.49934035539627075, + "learning_rate": 8.985170532545622e-06, + "loss": 0.9743, + "num_input_tokens_seen": 125337600, + "step": 15300 + }, + { + "epoch": 0.7351740696278511, + "grad_norm": 0.47998034954071045, + "learning_rate": 8.954769876730368e-06, + "loss": 1.0288, + "num_input_tokens_seen": 125419520, + "step": 15310 + }, + { + "epoch": 0.7356542617046818, + "grad_norm": 0.512691080570221, + "learning_rate": 8.924409512951045e-06, + "loss": 0.8448, + "num_input_tokens_seen": 125501440, + "step": 15320 + }, + { + "epoch": 0.7361344537815127, + "grad_norm": 0.5149726271629333, + "learning_rate": 8.894089517447206e-06, + "loss": 1.0112, + "num_input_tokens_seen": 125583360, + "step": 15330 + }, + { + "epoch": 0.7366146458583434, + "grad_norm": 0.47288787364959717, + "learning_rate": 8.863809966357017e-06, + "loss": 0.9288, + "num_input_tokens_seen": 125665280, + "step": 15340 + }, + { + "epoch": 0.737094837935174, + "grad_norm": 0.4858817756175995, + "learning_rate": 8.833570935717064e-06, + "loss": 0.8124, + "num_input_tokens_seen": 125747200, + "step": 15350 + }, + { + "epoch": 0.7375750300120048, + "grad_norm": 0.496822714805603, + "learning_rate": 8.803372501462203e-06, + "loss": 0.8461, + "num_input_tokens_seen": 125829120, + "step": 15360 + }, + { + "epoch": 0.7380552220888356, + "grad_norm": 0.510150134563446, + "learning_rate": 8.773214739425346e-06, + "loss": 0.7163, + "num_input_tokens_seen": 125911040, + "step": 15370 + }, + { + "epoch": 0.7385354141656663, + "grad_norm": 0.4825844466686249, + "learning_rate": 8.743097725337255e-06, + "loss": 0.5987, + "num_input_tokens_seen": 125992960, + "step": 15380 + }, + { + "epoch": 0.739015606242497, + "grad_norm": 0.47801604866981506, + "learning_rate": 8.713021534826366e-06, + "loss": 0.8509, + "num_input_tokens_seen": 126074880, + "step": 15390 + }, + { + "epoch": 0.7394957983193278, + "grad_norm": 0.5060662031173706, + "learning_rate": 8.68298624341862e-06, + "loss": 1.0267, + "num_input_tokens_seen": 126156800, + "step": 15400 + }, + { + "epoch": 0.7399759903961585, + "grad_norm": 0.46689069271087646, + "learning_rate": 8.652991926537254e-06, + "loss": 0.913, + "num_input_tokens_seen": 126238720, + "step": 15410 + }, + { + "epoch": 0.7404561824729892, + "grad_norm": 0.4775540828704834, + "learning_rate": 8.623038659502583e-06, + "loss": 0.9146, + "num_input_tokens_seen": 126320640, + "step": 15420 + }, + { + "epoch": 0.74093637454982, + "grad_norm": 0.5078407526016235, + "learning_rate": 8.59312651753187e-06, + "loss": 0.7079, + "num_input_tokens_seen": 126402560, + "step": 15430 + }, + { + "epoch": 0.7414165666266507, + "grad_norm": 0.6031984090805054, + "learning_rate": 8.5632555757391e-06, + "loss": 0.9282, + "num_input_tokens_seen": 126484480, + "step": 15440 + }, + { + "epoch": 0.7418967587034814, + "grad_norm": 0.48151716589927673, + "learning_rate": 8.53342590913478e-06, + "loss": 0.8451, + "num_input_tokens_seen": 126566400, + "step": 15450 + }, + { + "epoch": 0.7423769507803121, + "grad_norm": 0.4922950863838196, + "learning_rate": 8.503637592625796e-06, + "loss": 0.8644, + "num_input_tokens_seen": 126648320, + "step": 15460 + }, + { + "epoch": 0.7428571428571429, + "grad_norm": 0.48294711112976074, + "learning_rate": 8.473890701015177e-06, + "loss": 0.9529, + "num_input_tokens_seen": 126730240, + "step": 15470 + }, + { + "epoch": 0.7433373349339736, + "grad_norm": 1.1124995946884155, + "learning_rate": 8.444185309001926e-06, + "loss": 1.0753, + "num_input_tokens_seen": 126812160, + "step": 15480 + }, + { + "epoch": 0.7438175270108043, + "grad_norm": 0.6511945128440857, + "learning_rate": 8.41452149118085e-06, + "loss": 0.8009, + "num_input_tokens_seen": 126894080, + "step": 15490 + }, + { + "epoch": 0.7442977190876351, + "grad_norm": 0.47648462653160095, + "learning_rate": 8.384899322042356e-06, + "loss": 0.8622, + "num_input_tokens_seen": 126976000, + "step": 15500 + }, + { + "epoch": 0.7447779111644658, + "grad_norm": 0.48163020610809326, + "learning_rate": 8.355318875972243e-06, + "loss": 0.8215, + "num_input_tokens_seen": 127057920, + "step": 15510 + }, + { + "epoch": 0.7452581032412965, + "grad_norm": 0.4855590760707855, + "learning_rate": 8.325780227251562e-06, + "loss": 0.7382, + "num_input_tokens_seen": 127139840, + "step": 15520 + }, + { + "epoch": 0.7457382953181273, + "grad_norm": 0.4971849024295807, + "learning_rate": 8.2962834500564e-06, + "loss": 0.9809, + "num_input_tokens_seen": 127221760, + "step": 15530 + }, + { + "epoch": 0.746218487394958, + "grad_norm": 0.4910465478897095, + "learning_rate": 8.266828618457678e-06, + "loss": 0.9193, + "num_input_tokens_seen": 127303680, + "step": 15540 + }, + { + "epoch": 0.7466986794717887, + "grad_norm": 0.5851929187774658, + "learning_rate": 8.237415806421015e-06, + "loss": 1.1018, + "num_input_tokens_seen": 127385600, + "step": 15550 + }, + { + "epoch": 0.7471788715486194, + "grad_norm": 0.5044286847114563, + "learning_rate": 8.20804508780648e-06, + "loss": 0.7888, + "num_input_tokens_seen": 127467520, + "step": 15560 + }, + { + "epoch": 0.7476590636254502, + "grad_norm": 0.48159295320510864, + "learning_rate": 8.178716536368475e-06, + "loss": 1.0093, + "num_input_tokens_seen": 127549440, + "step": 15570 + }, + { + "epoch": 0.7481392557022809, + "grad_norm": 0.5040597319602966, + "learning_rate": 8.149430225755476e-06, + "loss": 0.9785, + "num_input_tokens_seen": 127631360, + "step": 15580 + }, + { + "epoch": 0.7486194477791116, + "grad_norm": 0.4986848831176758, + "learning_rate": 8.120186229509922e-06, + "loss": 0.8937, + "num_input_tokens_seen": 127713280, + "step": 15590 + }, + { + "epoch": 0.7490996398559424, + "grad_norm": 0.5762357711791992, + "learning_rate": 8.090984621067963e-06, + "loss": 1.0423, + "num_input_tokens_seen": 127795200, + "step": 15600 + }, + { + "epoch": 0.7495798319327731, + "grad_norm": 0.4688274562358856, + "learning_rate": 8.061825473759324e-06, + "loss": 0.8387, + "num_input_tokens_seen": 127877120, + "step": 15610 + }, + { + "epoch": 0.7500600240096038, + "grad_norm": 0.49585649371147156, + "learning_rate": 8.032708860807111e-06, + "loss": 0.8558, + "num_input_tokens_seen": 127959040, + "step": 15620 + }, + { + "epoch": 0.7505402160864346, + "grad_norm": 0.501162052154541, + "learning_rate": 8.003634855327594e-06, + "loss": 0.7608, + "num_input_tokens_seen": 128040960, + "step": 15630 + }, + { + "epoch": 0.7510204081632653, + "grad_norm": 0.8193572759628296, + "learning_rate": 7.974603530330069e-06, + "loss": 0.7416, + "num_input_tokens_seen": 128122880, + "step": 15640 + }, + { + "epoch": 0.751500600240096, + "grad_norm": 0.5035424828529358, + "learning_rate": 7.945614958716658e-06, + "loss": 0.7773, + "num_input_tokens_seen": 128204800, + "step": 15650 + }, + { + "epoch": 0.7519807923169267, + "grad_norm": 0.48581522703170776, + "learning_rate": 7.916669213282107e-06, + "loss": 0.8304, + "num_input_tokens_seen": 128286720, + "step": 15660 + }, + { + "epoch": 0.7524609843937575, + "grad_norm": 0.5436628460884094, + "learning_rate": 7.88776636671362e-06, + "loss": 1.1004, + "num_input_tokens_seen": 128368640, + "step": 15670 + }, + { + "epoch": 0.7529411764705882, + "grad_norm": 0.4647444784641266, + "learning_rate": 7.858906491590697e-06, + "loss": 0.9029, + "num_input_tokens_seen": 128450560, + "step": 15680 + }, + { + "epoch": 0.7534213685474189, + "grad_norm": 0.4686352014541626, + "learning_rate": 7.830089660384895e-06, + "loss": 0.8456, + "num_input_tokens_seen": 128532480, + "step": 15690 + }, + { + "epoch": 0.7539015606242497, + "grad_norm": 0.4896450340747833, + "learning_rate": 7.801315945459714e-06, + "loss": 1.3139, + "num_input_tokens_seen": 128614400, + "step": 15700 + }, + { + "epoch": 0.7543817527010804, + "grad_norm": 0.5118237137794495, + "learning_rate": 7.772585419070374e-06, + "loss": 0.931, + "num_input_tokens_seen": 128696320, + "step": 15710 + }, + { + "epoch": 0.7548619447779111, + "grad_norm": 0.4990551471710205, + "learning_rate": 7.743898153363625e-06, + "loss": 0.7754, + "num_input_tokens_seen": 128778240, + "step": 15720 + }, + { + "epoch": 0.7553421368547419, + "grad_norm": 0.47710755467414856, + "learning_rate": 7.715254220377596e-06, + "loss": 0.8358, + "num_input_tokens_seen": 128860160, + "step": 15730 + }, + { + "epoch": 0.7558223289315726, + "grad_norm": 1.072224497795105, + "learning_rate": 7.686653692041615e-06, + "loss": 0.7976, + "num_input_tokens_seen": 128942080, + "step": 15740 + }, + { + "epoch": 0.7563025210084033, + "grad_norm": 0.48185160756111145, + "learning_rate": 7.658096640175988e-06, + "loss": 0.834, + "num_input_tokens_seen": 129024000, + "step": 15750 + }, + { + "epoch": 0.7567827130852341, + "grad_norm": 0.5147521495819092, + "learning_rate": 7.629583136491844e-06, + "loss": 0.78, + "num_input_tokens_seen": 129105920, + "step": 15760 + }, + { + "epoch": 0.7572629051620648, + "grad_norm": 0.5172405242919922, + "learning_rate": 7.601113252590991e-06, + "loss": 0.855, + "num_input_tokens_seen": 129187840, + "step": 15770 + }, + { + "epoch": 0.7577430972388955, + "grad_norm": 0.5000312328338623, + "learning_rate": 7.572687059965661e-06, + "loss": 1.0542, + "num_input_tokens_seen": 129269760, + "step": 15780 + }, + { + "epoch": 0.7582232893157262, + "grad_norm": 0.5540167689323425, + "learning_rate": 7.544304629998389e-06, + "loss": 0.9999, + "num_input_tokens_seen": 129351680, + "step": 15790 + }, + { + "epoch": 0.758703481392557, + "grad_norm": 0.4820740222930908, + "learning_rate": 7.51596603396183e-06, + "loss": 0.8808, + "num_input_tokens_seen": 129433600, + "step": 15800 + }, + { + "epoch": 0.7591836734693878, + "grad_norm": 0.5026702880859375, + "learning_rate": 7.4876713430185265e-06, + "loss": 0.9648, + "num_input_tokens_seen": 129515520, + "step": 15810 + }, + { + "epoch": 0.7596638655462185, + "grad_norm": 0.4794592261314392, + "learning_rate": 7.4594206282208e-06, + "loss": 0.9263, + "num_input_tokens_seen": 129597440, + "step": 15820 + }, + { + "epoch": 0.7601440576230493, + "grad_norm": 0.4958498477935791, + "learning_rate": 7.431213960510544e-06, + "loss": 0.8364, + "num_input_tokens_seen": 129679360, + "step": 15830 + }, + { + "epoch": 0.76062424969988, + "grad_norm": 0.4976142644882202, + "learning_rate": 7.40305141071902e-06, + "loss": 1.068, + "num_input_tokens_seen": 129761280, + "step": 15840 + }, + { + "epoch": 0.7611044417767107, + "grad_norm": 0.4213819205760956, + "learning_rate": 7.374933049566704e-06, + "loss": 0.7245, + "num_input_tokens_seen": 129843200, + "step": 15850 + }, + { + "epoch": 0.7615846338535415, + "grad_norm": 0.49244338274002075, + "learning_rate": 7.346858947663138e-06, + "loss": 0.8833, + "num_input_tokens_seen": 129925120, + "step": 15860 + }, + { + "epoch": 0.7620648259303722, + "grad_norm": 0.385324090719223, + "learning_rate": 7.318829175506684e-06, + "loss": 1.0079, + "num_input_tokens_seen": 130007040, + "step": 15870 + }, + { + "epoch": 0.7625450180072029, + "grad_norm": 0.6321762800216675, + "learning_rate": 7.290843803484409e-06, + "loss": 0.9743, + "num_input_tokens_seen": 130088960, + "step": 15880 + }, + { + "epoch": 0.7630252100840336, + "grad_norm": 0.5179278254508972, + "learning_rate": 7.262902901871885e-06, + "loss": 0.923, + "num_input_tokens_seen": 130170880, + "step": 15890 + }, + { + "epoch": 0.7635054021608644, + "grad_norm": 0.48879900574684143, + "learning_rate": 7.235006540832995e-06, + "loss": 0.947, + "num_input_tokens_seen": 130252800, + "step": 15900 + }, + { + "epoch": 0.7639855942376951, + "grad_norm": 0.4765695631504059, + "learning_rate": 7.207154790419784e-06, + "loss": 0.8526, + "num_input_tokens_seen": 130334720, + "step": 15910 + }, + { + "epoch": 0.7644657863145258, + "grad_norm": 0.5195481181144714, + "learning_rate": 7.179347720572288e-06, + "loss": 1.0454, + "num_input_tokens_seen": 130416640, + "step": 15920 + }, + { + "epoch": 0.7649459783913566, + "grad_norm": 0.48631834983825684, + "learning_rate": 7.151585401118316e-06, + "loss": 0.7704, + "num_input_tokens_seen": 130498560, + "step": 15930 + }, + { + "epoch": 0.7654261704681873, + "grad_norm": 0.4827914237976074, + "learning_rate": 7.12386790177331e-06, + "loss": 0.8247, + "num_input_tokens_seen": 130580480, + "step": 15940 + }, + { + "epoch": 0.765906362545018, + "grad_norm": 0.21842628717422485, + "learning_rate": 7.096195292140173e-06, + "loss": 0.8725, + "num_input_tokens_seen": 130662400, + "step": 15950 + }, + { + "epoch": 0.7663865546218488, + "grad_norm": 0.46142032742500305, + "learning_rate": 7.06856764170907e-06, + "loss": 0.7432, + "num_input_tokens_seen": 130744320, + "step": 15960 + }, + { + "epoch": 0.7668667466986795, + "grad_norm": 0.5066096186637878, + "learning_rate": 7.040985019857274e-06, + "loss": 0.959, + "num_input_tokens_seen": 130826240, + "step": 15970 + }, + { + "epoch": 0.7673469387755102, + "grad_norm": 0.4842053949832916, + "learning_rate": 7.013447495848996e-06, + "loss": 1.1146, + "num_input_tokens_seen": 130908160, + "step": 15980 + }, + { + "epoch": 0.7678271308523409, + "grad_norm": 0.5671928524971008, + "learning_rate": 6.985955138835162e-06, + "loss": 0.9096, + "num_input_tokens_seen": 130990080, + "step": 15990 + }, + { + "epoch": 0.7683073229291717, + "grad_norm": 0.4887046813964844, + "learning_rate": 6.958508017853319e-06, + "loss": 0.9424, + "num_input_tokens_seen": 131072000, + "step": 16000 + }, + { + "epoch": 0.7687875150060024, + "grad_norm": 0.5106115341186523, + "learning_rate": 6.931106201827397e-06, + "loss": 0.905, + "num_input_tokens_seen": 131153920, + "step": 16010 + }, + { + "epoch": 0.7692677070828331, + "grad_norm": 0.4807291626930237, + "learning_rate": 6.903749759567557e-06, + "loss": 0.8972, + "num_input_tokens_seen": 131235840, + "step": 16020 + }, + { + "epoch": 0.7697478991596639, + "grad_norm": 0.4951104521751404, + "learning_rate": 6.876438759770037e-06, + "loss": 0.8672, + "num_input_tokens_seen": 131317760, + "step": 16030 + }, + { + "epoch": 0.7702280912364946, + "grad_norm": 0.5364571213722229, + "learning_rate": 6.8491732710169344e-06, + "loss": 1.0239, + "num_input_tokens_seen": 131399680, + "step": 16040 + }, + { + "epoch": 0.7707082833133253, + "grad_norm": 0.4739181399345398, + "learning_rate": 6.821953361776093e-06, + "loss": 0.7971, + "num_input_tokens_seen": 131481600, + "step": 16050 + }, + { + "epoch": 0.7711884753901561, + "grad_norm": 1.0601301193237305, + "learning_rate": 6.7947791004008665e-06, + "loss": 0.8697, + "num_input_tokens_seen": 131563520, + "step": 16060 + }, + { + "epoch": 0.7716686674669868, + "grad_norm": 0.3752616047859192, + "learning_rate": 6.767650555130009e-06, + "loss": 0.8022, + "num_input_tokens_seen": 131645440, + "step": 16070 + }, + { + "epoch": 0.7721488595438175, + "grad_norm": 0.5150649547576904, + "learning_rate": 6.740567794087463e-06, + "loss": 0.9051, + "num_input_tokens_seen": 131727360, + "step": 16080 + }, + { + "epoch": 0.7726290516206482, + "grad_norm": 0.5848947763442993, + "learning_rate": 6.713530885282188e-06, + "loss": 1.1109, + "num_input_tokens_seen": 131809280, + "step": 16090 + }, + { + "epoch": 0.773109243697479, + "grad_norm": 0.5849442481994629, + "learning_rate": 6.686539896608016e-06, + "loss": 0.781, + "num_input_tokens_seen": 131891200, + "step": 16100 + }, + { + "epoch": 0.7735894357743097, + "grad_norm": 0.47524645924568176, + "learning_rate": 6.659594895843477e-06, + "loss": 0.7884, + "num_input_tokens_seen": 131973120, + "step": 16110 + }, + { + "epoch": 0.7740696278511404, + "grad_norm": 0.46588289737701416, + "learning_rate": 6.632695950651594e-06, + "loss": 0.8852, + "num_input_tokens_seen": 132055040, + "step": 16120 + }, + { + "epoch": 0.7745498199279712, + "grad_norm": 0.6264036893844604, + "learning_rate": 6.605843128579739e-06, + "loss": 0.9598, + "num_input_tokens_seen": 132136960, + "step": 16130 + }, + { + "epoch": 0.7750300120048019, + "grad_norm": 0.47402629256248474, + "learning_rate": 6.579036497059482e-06, + "loss": 0.8386, + "num_input_tokens_seen": 132218880, + "step": 16140 + }, + { + "epoch": 0.7755102040816326, + "grad_norm": 0.4720049500465393, + "learning_rate": 6.552276123406384e-06, + "loss": 0.8359, + "num_input_tokens_seen": 132300800, + "step": 16150 + }, + { + "epoch": 0.7759903961584634, + "grad_norm": 0.4943426847457886, + "learning_rate": 6.525562074819852e-06, + "loss": 1.043, + "num_input_tokens_seen": 132382720, + "step": 16160 + }, + { + "epoch": 0.7764705882352941, + "grad_norm": 0.5186018347740173, + "learning_rate": 6.4988944183829695e-06, + "loss": 0.9455, + "num_input_tokens_seen": 132464640, + "step": 16170 + }, + { + "epoch": 0.7769507803121248, + "grad_norm": 0.5227720141410828, + "learning_rate": 6.472273221062305e-06, + "loss": 1.1211, + "num_input_tokens_seen": 132546560, + "step": 16180 + }, + { + "epoch": 0.7774309723889555, + "grad_norm": 0.491787850856781, + "learning_rate": 6.445698549707776e-06, + "loss": 1.0617, + "num_input_tokens_seen": 132628480, + "step": 16190 + }, + { + "epoch": 0.7779111644657863, + "grad_norm": 0.5019481778144836, + "learning_rate": 6.419170471052472e-06, + "loss": 0.7963, + "num_input_tokens_seen": 132710400, + "step": 16200 + }, + { + "epoch": 0.778391356542617, + "grad_norm": 0.5009409189224243, + "learning_rate": 6.392689051712458e-06, + "loss": 0.9802, + "num_input_tokens_seen": 132792320, + "step": 16210 + }, + { + "epoch": 0.7788715486194477, + "grad_norm": 0.5145575404167175, + "learning_rate": 6.3662543581866405e-06, + "loss": 1.0569, + "num_input_tokens_seen": 132874240, + "step": 16220 + }, + { + "epoch": 0.7793517406962785, + "grad_norm": 0.4958726167678833, + "learning_rate": 6.339866456856608e-06, + "loss": 0.7891, + "num_input_tokens_seen": 132956160, + "step": 16230 + }, + { + "epoch": 0.7798319327731092, + "grad_norm": 0.49058905243873596, + "learning_rate": 6.313525413986415e-06, + "loss": 0.9846, + "num_input_tokens_seen": 133038080, + "step": 16240 + }, + { + "epoch": 0.78031212484994, + "grad_norm": 0.48392897844314575, + "learning_rate": 6.28723129572247e-06, + "loss": 0.8808, + "num_input_tokens_seen": 133120000, + "step": 16250 + }, + { + "epoch": 0.7807923169267708, + "grad_norm": 0.5041011571884155, + "learning_rate": 6.260984168093353e-06, + "loss": 0.8577, + "num_input_tokens_seen": 133201920, + "step": 16260 + }, + { + "epoch": 0.7812725090036015, + "grad_norm": 0.49976569414138794, + "learning_rate": 6.234784097009608e-06, + "loss": 0.9177, + "num_input_tokens_seen": 133283840, + "step": 16270 + }, + { + "epoch": 0.7817527010804322, + "grad_norm": 1.3650128841400146, + "learning_rate": 6.208631148263649e-06, + "loss": 0.8475, + "num_input_tokens_seen": 133365760, + "step": 16280 + }, + { + "epoch": 0.7822328931572629, + "grad_norm": 1.7946758270263672, + "learning_rate": 6.18252538752955e-06, + "loss": 0.6443, + "num_input_tokens_seen": 133447680, + "step": 16290 + }, + { + "epoch": 0.7827130852340937, + "grad_norm": 0.47387173771858215, + "learning_rate": 6.156466880362877e-06, + "loss": 0.7558, + "num_input_tokens_seen": 133529600, + "step": 16300 + }, + { + "epoch": 0.7831932773109244, + "grad_norm": 0.47302481532096863, + "learning_rate": 6.1304556922005315e-06, + "loss": 0.8476, + "num_input_tokens_seen": 133611520, + "step": 16310 + }, + { + "epoch": 0.7836734693877551, + "grad_norm": 0.6125491261482239, + "learning_rate": 6.1044918883606225e-06, + "loss": 0.8751, + "num_input_tokens_seen": 133693440, + "step": 16320 + }, + { + "epoch": 0.7841536614645859, + "grad_norm": 0.6477288007736206, + "learning_rate": 6.078575534042222e-06, + "loss": 0.9734, + "num_input_tokens_seen": 133775360, + "step": 16330 + }, + { + "epoch": 0.7846338535414166, + "grad_norm": 1.2066618204116821, + "learning_rate": 6.052706694325292e-06, + "loss": 0.9515, + "num_input_tokens_seen": 133857280, + "step": 16340 + }, + { + "epoch": 0.7851140456182473, + "grad_norm": 0.5165063142776489, + "learning_rate": 6.026885434170457e-06, + "loss": 0.9599, + "num_input_tokens_seen": 133939200, + "step": 16350 + }, + { + "epoch": 0.7855942376950781, + "grad_norm": 1.0127323865890503, + "learning_rate": 6.001111818418859e-06, + "loss": 0.975, + "num_input_tokens_seen": 134021120, + "step": 16360 + }, + { + "epoch": 0.7860744297719088, + "grad_norm": 0.5036996603012085, + "learning_rate": 5.975385911792006e-06, + "loss": 0.9928, + "num_input_tokens_seen": 134103040, + "step": 16370 + }, + { + "epoch": 0.7865546218487395, + "grad_norm": 0.49564144015312195, + "learning_rate": 5.9497077788916055e-06, + "loss": 1.0484, + "num_input_tokens_seen": 134184960, + "step": 16380 + }, + { + "epoch": 0.7870348139255702, + "grad_norm": 1.0678465366363525, + "learning_rate": 5.924077484199389e-06, + "loss": 0.832, + "num_input_tokens_seen": 134266880, + "step": 16390 + }, + { + "epoch": 0.787515006002401, + "grad_norm": 0.4710373878479004, + "learning_rate": 5.89849509207695e-06, + "loss": 0.7669, + "num_input_tokens_seen": 134348800, + "step": 16400 + }, + { + "epoch": 0.7879951980792317, + "grad_norm": 0.48212987184524536, + "learning_rate": 5.872960666765618e-06, + "loss": 0.9089, + "num_input_tokens_seen": 134430720, + "step": 16410 + }, + { + "epoch": 0.7884753901560624, + "grad_norm": 1.6406066417694092, + "learning_rate": 5.847474272386239e-06, + "loss": 1.5337, + "num_input_tokens_seen": 134512640, + "step": 16420 + }, + { + "epoch": 0.7889555822328932, + "grad_norm": 0.5122446417808533, + "learning_rate": 5.822035972939069e-06, + "loss": 0.8218, + "num_input_tokens_seen": 134594560, + "step": 16430 + }, + { + "epoch": 0.7894357743097239, + "grad_norm": 0.4671303927898407, + "learning_rate": 5.79664583230359e-06, + "loss": 0.8714, + "num_input_tokens_seen": 134676480, + "step": 16440 + }, + { + "epoch": 0.7899159663865546, + "grad_norm": 0.5007370710372925, + "learning_rate": 5.771303914238333e-06, + "loss": 0.7859, + "num_input_tokens_seen": 134758400, + "step": 16450 + }, + { + "epoch": 0.7903961584633854, + "grad_norm": 0.4834369122982025, + "learning_rate": 5.746010282380745e-06, + "loss": 0.8862, + "num_input_tokens_seen": 134840320, + "step": 16460 + }, + { + "epoch": 0.7908763505402161, + "grad_norm": 0.3713008761405945, + "learning_rate": 5.7207650002470274e-06, + "loss": 0.7945, + "num_input_tokens_seen": 134922240, + "step": 16470 + }, + { + "epoch": 0.7913565426170468, + "grad_norm": 0.6575520634651184, + "learning_rate": 5.695568131231949e-06, + "loss": 0.8114, + "num_input_tokens_seen": 135004160, + "step": 16480 + }, + { + "epoch": 0.7918367346938775, + "grad_norm": 0.5816577672958374, + "learning_rate": 5.670419738608723e-06, + "loss": 0.7961, + "num_input_tokens_seen": 135086080, + "step": 16490 + }, + { + "epoch": 0.7923169267707083, + "grad_norm": 0.7607868313789368, + "learning_rate": 5.645319885528824e-06, + "loss": 0.867, + "num_input_tokens_seen": 135168000, + "step": 16500 + }, + { + "epoch": 0.792797118847539, + "grad_norm": 0.49401038885116577, + "learning_rate": 5.620268635021825e-06, + "loss": 0.8139, + "num_input_tokens_seen": 135249920, + "step": 16510 + }, + { + "epoch": 0.7932773109243697, + "grad_norm": 0.5071017742156982, + "learning_rate": 5.595266049995268e-06, + "loss": 0.9577, + "num_input_tokens_seen": 135331840, + "step": 16520 + }, + { + "epoch": 0.7937575030012005, + "grad_norm": 0.5186456441879272, + "learning_rate": 5.5703121932344896e-06, + "loss": 0.9578, + "num_input_tokens_seen": 135413760, + "step": 16530 + }, + { + "epoch": 0.7942376950780312, + "grad_norm": 0.5309847593307495, + "learning_rate": 5.5454071274024436e-06, + "loss": 0.8885, + "num_input_tokens_seen": 135495680, + "step": 16540 + }, + { + "epoch": 0.7947178871548619, + "grad_norm": 0.5009894967079163, + "learning_rate": 5.520550915039579e-06, + "loss": 0.9609, + "num_input_tokens_seen": 135577600, + "step": 16550 + }, + { + "epoch": 0.7951980792316927, + "grad_norm": 0.6387041211128235, + "learning_rate": 5.495743618563668e-06, + "loss": 0.8386, + "num_input_tokens_seen": 135659520, + "step": 16560 + }, + { + "epoch": 0.7956782713085234, + "grad_norm": 0.4808179438114166, + "learning_rate": 5.4709853002696236e-06, + "loss": 0.8747, + "num_input_tokens_seen": 135741440, + "step": 16570 + }, + { + "epoch": 0.7961584633853541, + "grad_norm": 0.49355682730674744, + "learning_rate": 5.4462760223294e-06, + "loss": 0.8467, + "num_input_tokens_seen": 135823360, + "step": 16580 + }, + { + "epoch": 0.7966386554621848, + "grad_norm": 0.4921320378780365, + "learning_rate": 5.42161584679178e-06, + "loss": 0.8295, + "num_input_tokens_seen": 135905280, + "step": 16590 + }, + { + "epoch": 0.7971188475390156, + "grad_norm": 0.4814390242099762, + "learning_rate": 5.397004835582242e-06, + "loss": 0.8218, + "num_input_tokens_seen": 135987200, + "step": 16600 + }, + { + "epoch": 0.7975990396158463, + "grad_norm": 0.5022986531257629, + "learning_rate": 5.372443050502823e-06, + "loss": 0.8522, + "num_input_tokens_seen": 136069120, + "step": 16610 + }, + { + "epoch": 0.798079231692677, + "grad_norm": 0.48835310339927673, + "learning_rate": 5.347930553231942e-06, + "loss": 0.8879, + "num_input_tokens_seen": 136151040, + "step": 16620 + }, + { + "epoch": 0.7985594237695078, + "grad_norm": 0.48353898525238037, + "learning_rate": 5.323467405324226e-06, + "loss": 1.0158, + "num_input_tokens_seen": 136232960, + "step": 16630 + }, + { + "epoch": 0.7990396158463385, + "grad_norm": 0.5025900602340698, + "learning_rate": 5.299053668210402e-06, + "loss": 1.0689, + "num_input_tokens_seen": 136314880, + "step": 16640 + }, + { + "epoch": 0.7995198079231692, + "grad_norm": 0.4869900345802307, + "learning_rate": 5.274689403197119e-06, + "loss": 0.7811, + "num_input_tokens_seen": 136396800, + "step": 16650 + }, + { + "epoch": 0.8, + "grad_norm": 0.5168715119361877, + "learning_rate": 5.250374671466776e-06, + "loss": 0.8692, + "num_input_tokens_seen": 136478720, + "step": 16660 + }, + { + "epoch": 0.8004801920768307, + "grad_norm": 0.5069206357002258, + "learning_rate": 5.2261095340774085e-06, + "loss": 1.1452, + "num_input_tokens_seen": 136560640, + "step": 16670 + }, + { + "epoch": 0.8009603841536614, + "grad_norm": 0.4853163957595825, + "learning_rate": 5.201894051962486e-06, + "loss": 1.0006, + "num_input_tokens_seen": 136642560, + "step": 16680 + }, + { + "epoch": 0.8014405762304923, + "grad_norm": 0.4831332862377167, + "learning_rate": 5.177728285930816e-06, + "loss": 0.897, + "num_input_tokens_seen": 136724480, + "step": 16690 + }, + { + "epoch": 0.801920768307323, + "grad_norm": 0.5170623660087585, + "learning_rate": 5.153612296666335e-06, + "loss": 0.7989, + "num_input_tokens_seen": 136806400, + "step": 16700 + }, + { + "epoch": 0.8024009603841536, + "grad_norm": 0.47998228669166565, + "learning_rate": 5.129546144727998e-06, + "loss": 0.9656, + "num_input_tokens_seen": 136888320, + "step": 16710 + }, + { + "epoch": 0.8028811524609843, + "grad_norm": 0.4863240420818329, + "learning_rate": 5.105529890549618e-06, + "loss": 0.8726, + "num_input_tokens_seen": 136970240, + "step": 16720 + }, + { + "epoch": 0.8033613445378152, + "grad_norm": 0.4388894736766815, + "learning_rate": 5.081563594439676e-06, + "loss": 0.7868, + "num_input_tokens_seen": 137052160, + "step": 16730 + }, + { + "epoch": 0.8038415366146459, + "grad_norm": 0.4876529276371002, + "learning_rate": 5.057647316581232e-06, + "loss": 1.0533, + "num_input_tokens_seen": 137134080, + "step": 16740 + }, + { + "epoch": 0.8043217286914766, + "grad_norm": 0.47916775941848755, + "learning_rate": 5.033781117031738e-06, + "loss": 0.9322, + "num_input_tokens_seen": 137216000, + "step": 16750 + }, + { + "epoch": 0.8048019207683074, + "grad_norm": 0.47834011912345886, + "learning_rate": 5.0099650557228785e-06, + "loss": 1.0616, + "num_input_tokens_seen": 137297920, + "step": 16760 + }, + { + "epoch": 0.8052821128451381, + "grad_norm": 0.5335723757743835, + "learning_rate": 4.986199192460428e-06, + "loss": 0.8815, + "num_input_tokens_seen": 137379840, + "step": 16770 + }, + { + "epoch": 0.8057623049219688, + "grad_norm": 0.5083367228507996, + "learning_rate": 4.962483586924136e-06, + "loss": 0.7737, + "num_input_tokens_seen": 137461760, + "step": 16780 + }, + { + "epoch": 0.8062424969987996, + "grad_norm": 0.4924291968345642, + "learning_rate": 4.93881829866751e-06, + "loss": 0.8263, + "num_input_tokens_seen": 137543680, + "step": 16790 + }, + { + "epoch": 0.8067226890756303, + "grad_norm": 0.5008875131607056, + "learning_rate": 4.915203387117736e-06, + "loss": 0.9976, + "num_input_tokens_seen": 137625600, + "step": 16800 + }, + { + "epoch": 0.807202881152461, + "grad_norm": 0.5046831369400024, + "learning_rate": 4.891638911575483e-06, + "loss": 0.8821, + "num_input_tokens_seen": 137707520, + "step": 16810 + }, + { + "epoch": 0.8076830732292917, + "grad_norm": 0.5597212910652161, + "learning_rate": 4.868124931214752e-06, + "loss": 0.9083, + "num_input_tokens_seen": 137789440, + "step": 16820 + }, + { + "epoch": 0.8081632653061225, + "grad_norm": 0.5007030367851257, + "learning_rate": 4.844661505082768e-06, + "loss": 0.8356, + "num_input_tokens_seen": 137871360, + "step": 16830 + }, + { + "epoch": 0.8086434573829532, + "grad_norm": 0.3486262857913971, + "learning_rate": 4.8212486920998005e-06, + "loss": 0.7684, + "num_input_tokens_seen": 137953280, + "step": 16840 + }, + { + "epoch": 0.8091236494597839, + "grad_norm": 0.4673330783843994, + "learning_rate": 4.797886551059011e-06, + "loss": 0.9145, + "num_input_tokens_seen": 138035200, + "step": 16850 + }, + { + "epoch": 0.8096038415366147, + "grad_norm": 0.4685009717941284, + "learning_rate": 4.7745751406263165e-06, + "loss": 0.9692, + "num_input_tokens_seen": 138117120, + "step": 16860 + }, + { + "epoch": 0.8100840336134454, + "grad_norm": 0.4861032962799072, + "learning_rate": 4.751314519340258e-06, + "loss": 0.8577, + "num_input_tokens_seen": 138199040, + "step": 16870 + }, + { + "epoch": 0.8105642256902761, + "grad_norm": 0.5741161108016968, + "learning_rate": 4.728104745611814e-06, + "loss": 0.8875, + "num_input_tokens_seen": 138280960, + "step": 16880 + }, + { + "epoch": 0.8110444177671069, + "grad_norm": 0.5122044682502747, + "learning_rate": 4.704945877724295e-06, + "loss": 1.0989, + "num_input_tokens_seen": 138362880, + "step": 16890 + }, + { + "epoch": 0.8115246098439376, + "grad_norm": 0.8650029897689819, + "learning_rate": 4.681837973833181e-06, + "loss": 0.9019, + "num_input_tokens_seen": 138444800, + "step": 16900 + }, + { + "epoch": 0.8120048019207683, + "grad_norm": 0.49754172563552856, + "learning_rate": 4.658781091965955e-06, + "loss": 0.8082, + "num_input_tokens_seen": 138526720, + "step": 16910 + }, + { + "epoch": 0.812484993997599, + "grad_norm": 0.45871591567993164, + "learning_rate": 4.635775290021988e-06, + "loss": 0.8978, + "num_input_tokens_seen": 138608640, + "step": 16920 + }, + { + "epoch": 0.8129651860744298, + "grad_norm": 0.3866370916366577, + "learning_rate": 4.612820625772391e-06, + "loss": 0.6356, + "num_input_tokens_seen": 138690560, + "step": 16930 + }, + { + "epoch": 0.8134453781512605, + "grad_norm": 0.6949254870414734, + "learning_rate": 4.589917156859838e-06, + "loss": 0.8645, + "num_input_tokens_seen": 138772480, + "step": 16940 + }, + { + "epoch": 0.8139255702280912, + "grad_norm": 0.6086997985839844, + "learning_rate": 4.5670649407984625e-06, + "loss": 0.8896, + "num_input_tokens_seen": 138854400, + "step": 16950 + }, + { + "epoch": 0.814405762304922, + "grad_norm": 0.4606131613254547, + "learning_rate": 4.544264034973686e-06, + "loss": 0.7794, + "num_input_tokens_seen": 138936320, + "step": 16960 + }, + { + "epoch": 0.8148859543817527, + "grad_norm": 0.5047131180763245, + "learning_rate": 4.521514496642074e-06, + "loss": 0.9631, + "num_input_tokens_seen": 139018240, + "step": 16970 + }, + { + "epoch": 0.8153661464585834, + "grad_norm": 0.5257802605628967, + "learning_rate": 4.498816382931217e-06, + "loss": 0.9246, + "num_input_tokens_seen": 139100160, + "step": 16980 + }, + { + "epoch": 0.8158463385354142, + "grad_norm": 0.5585389733314514, + "learning_rate": 4.476169750839571e-06, + "loss": 1.0151, + "num_input_tokens_seen": 139182080, + "step": 16990 + }, + { + "epoch": 0.8163265306122449, + "grad_norm": 0.47912609577178955, + "learning_rate": 4.45357465723629e-06, + "loss": 0.8939, + "num_input_tokens_seen": 139264000, + "step": 17000 + }, + { + "epoch": 0.8168067226890756, + "grad_norm": 0.4979766309261322, + "learning_rate": 4.4310311588611294e-06, + "loss": 0.8846, + "num_input_tokens_seen": 139345920, + "step": 17010 + }, + { + "epoch": 0.8172869147659063, + "grad_norm": 0.4919578731060028, + "learning_rate": 4.408539312324281e-06, + "loss": 0.8446, + "num_input_tokens_seen": 139427840, + "step": 17020 + }, + { + "epoch": 0.8177671068427371, + "grad_norm": 0.510263204574585, + "learning_rate": 4.386099174106212e-06, + "loss": 0.8381, + "num_input_tokens_seen": 139509760, + "step": 17030 + }, + { + "epoch": 0.8182472989195678, + "grad_norm": 0.5191063284873962, + "learning_rate": 4.363710800557566e-06, + "loss": 0.9471, + "num_input_tokens_seen": 139591680, + "step": 17040 + }, + { + "epoch": 0.8187274909963985, + "grad_norm": 0.5044844150543213, + "learning_rate": 4.341374247898983e-06, + "loss": 0.7787, + "num_input_tokens_seen": 139673600, + "step": 17050 + }, + { + "epoch": 0.8192076830732293, + "grad_norm": 0.48283785581588745, + "learning_rate": 4.3190895722209635e-06, + "loss": 0.9487, + "num_input_tokens_seen": 139755520, + "step": 17060 + }, + { + "epoch": 0.81968787515006, + "grad_norm": 0.36586451530456543, + "learning_rate": 4.296856829483759e-06, + "loss": 0.884, + "num_input_tokens_seen": 139837440, + "step": 17070 + }, + { + "epoch": 0.8201680672268907, + "grad_norm": 0.49161645770072937, + "learning_rate": 4.274676075517206e-06, + "loss": 0.741, + "num_input_tokens_seen": 139919360, + "step": 17080 + }, + { + "epoch": 0.8206482593037215, + "grad_norm": 0.47819775342941284, + "learning_rate": 4.252547366020568e-06, + "loss": 0.8349, + "num_input_tokens_seen": 140001280, + "step": 17090 + }, + { + "epoch": 0.8211284513805522, + "grad_norm": 0.48540183901786804, + "learning_rate": 4.230470756562438e-06, + "loss": 0.9832, + "num_input_tokens_seen": 140083200, + "step": 17100 + }, + { + "epoch": 0.8216086434573829, + "grad_norm": 0.7266910672187805, + "learning_rate": 4.208446302580582e-06, + "loss": 0.7601, + "num_input_tokens_seen": 140165120, + "step": 17110 + }, + { + "epoch": 0.8220888355342136, + "grad_norm": 0.48397764563560486, + "learning_rate": 4.186474059381768e-06, + "loss": 0.7198, + "num_input_tokens_seen": 140247040, + "step": 17120 + }, + { + "epoch": 0.8225690276110444, + "grad_norm": 0.5694714188575745, + "learning_rate": 4.164554082141683e-06, + "loss": 0.8547, + "num_input_tokens_seen": 140328960, + "step": 17130 + }, + { + "epoch": 0.8230492196878751, + "grad_norm": 1.3133982419967651, + "learning_rate": 4.142686425904752e-06, + "loss": 0.8511, + "num_input_tokens_seen": 140410880, + "step": 17140 + }, + { + "epoch": 0.8235294117647058, + "grad_norm": 0.48804813623428345, + "learning_rate": 4.12087114558401e-06, + "loss": 0.8076, + "num_input_tokens_seen": 140492800, + "step": 17150 + }, + { + "epoch": 0.8240096038415367, + "grad_norm": 0.4581037163734436, + "learning_rate": 4.099108295960977e-06, + "loss": 0.7348, + "num_input_tokens_seen": 140574720, + "step": 17160 + }, + { + "epoch": 0.8244897959183674, + "grad_norm": 0.48801615834236145, + "learning_rate": 4.077397931685523e-06, + "loss": 0.8413, + "num_input_tokens_seen": 140656640, + "step": 17170 + }, + { + "epoch": 0.824969987995198, + "grad_norm": 0.46194446086883545, + "learning_rate": 4.055740107275685e-06, + "loss": 1.1131, + "num_input_tokens_seen": 140738560, + "step": 17180 + }, + { + "epoch": 0.8254501800720289, + "grad_norm": 0.5051575899124146, + "learning_rate": 4.0341348771175955e-06, + "loss": 0.9822, + "num_input_tokens_seen": 140820480, + "step": 17190 + }, + { + "epoch": 0.8259303721488596, + "grad_norm": 0.4934830367565155, + "learning_rate": 4.012582295465308e-06, + "loss": 0.834, + "num_input_tokens_seen": 140902400, + "step": 17200 + }, + { + "epoch": 0.8264105642256903, + "grad_norm": 0.5021478533744812, + "learning_rate": 3.991082416440656e-06, + "loss": 0.8526, + "num_input_tokens_seen": 140984320, + "step": 17210 + }, + { + "epoch": 0.826890756302521, + "grad_norm": 0.4906045198440552, + "learning_rate": 3.969635294033144e-06, + "loss": 0.794, + "num_input_tokens_seen": 141066240, + "step": 17220 + }, + { + "epoch": 0.8273709483793518, + "grad_norm": 0.49712374806404114, + "learning_rate": 3.9482409820997826e-06, + "loss": 0.8676, + "num_input_tokens_seen": 141148160, + "step": 17230 + }, + { + "epoch": 0.8278511404561825, + "grad_norm": 0.4694906771183014, + "learning_rate": 3.926899534364969e-06, + "loss": 0.9085, + "num_input_tokens_seen": 141230080, + "step": 17240 + }, + { + "epoch": 0.8283313325330132, + "grad_norm": 0.4903012216091156, + "learning_rate": 3.90561100442036e-06, + "loss": 0.8688, + "num_input_tokens_seen": 141312000, + "step": 17250 + }, + { + "epoch": 0.828811524609844, + "grad_norm": 0.5216225385665894, + "learning_rate": 3.8843754457247275e-06, + "loss": 1.0517, + "num_input_tokens_seen": 141393920, + "step": 17260 + }, + { + "epoch": 0.8292917166866747, + "grad_norm": 0.48557716608047485, + "learning_rate": 3.863192911603808e-06, + "loss": 0.9196, + "num_input_tokens_seen": 141475840, + "step": 17270 + }, + { + "epoch": 0.8297719087635054, + "grad_norm": 0.4983423352241516, + "learning_rate": 3.842063455250203e-06, + "loss": 0.9045, + "num_input_tokens_seen": 141557760, + "step": 17280 + }, + { + "epoch": 0.8302521008403362, + "grad_norm": 0.5038923621177673, + "learning_rate": 3.820987129723228e-06, + "loss": 0.8666, + "num_input_tokens_seen": 141639680, + "step": 17290 + }, + { + "epoch": 0.8307322929171669, + "grad_norm": 0.4999052584171295, + "learning_rate": 3.799963987948757e-06, + "loss": 0.788, + "num_input_tokens_seen": 141721600, + "step": 17300 + }, + { + "epoch": 0.8312124849939976, + "grad_norm": 0.5127232074737549, + "learning_rate": 3.7789940827191395e-06, + "loss": 0.8889, + "num_input_tokens_seen": 141803520, + "step": 17310 + }, + { + "epoch": 0.8316926770708283, + "grad_norm": 0.5200673937797546, + "learning_rate": 3.7580774666930134e-06, + "loss": 0.8877, + "num_input_tokens_seen": 141885440, + "step": 17320 + }, + { + "epoch": 0.8321728691476591, + "grad_norm": 0.4389039874076843, + "learning_rate": 3.737214192395225e-06, + "loss": 0.8935, + "num_input_tokens_seen": 141967360, + "step": 17330 + }, + { + "epoch": 0.8326530612244898, + "grad_norm": 0.481927752494812, + "learning_rate": 3.7164043122166508e-06, + "loss": 1.0336, + "num_input_tokens_seen": 142049280, + "step": 17340 + }, + { + "epoch": 0.8331332533013205, + "grad_norm": 0.5148651599884033, + "learning_rate": 3.6956478784140937e-06, + "loss": 1.0014, + "num_input_tokens_seen": 142131200, + "step": 17350 + }, + { + "epoch": 0.8336134453781513, + "grad_norm": 0.48852846026420593, + "learning_rate": 3.674944943110156e-06, + "loss": 0.9075, + "num_input_tokens_seen": 142213120, + "step": 17360 + }, + { + "epoch": 0.834093637454982, + "grad_norm": 0.4977552592754364, + "learning_rate": 3.6542955582930748e-06, + "loss": 0.7461, + "num_input_tokens_seen": 142295040, + "step": 17370 + }, + { + "epoch": 0.8345738295318127, + "grad_norm": 0.4385221600532532, + "learning_rate": 3.6336997758166263e-06, + "loss": 0.9869, + "num_input_tokens_seen": 142376960, + "step": 17380 + }, + { + "epoch": 0.8350540216086435, + "grad_norm": 0.4688054919242859, + "learning_rate": 3.6131576473999924e-06, + "loss": 0.775, + "num_input_tokens_seen": 142458880, + "step": 17390 + }, + { + "epoch": 0.8355342136854742, + "grad_norm": 0.4902810752391815, + "learning_rate": 3.592669224627601e-06, + "loss": 0.916, + "num_input_tokens_seen": 142540800, + "step": 17400 + }, + { + "epoch": 0.8360144057623049, + "grad_norm": 0.49315813183784485, + "learning_rate": 3.5722345589490306e-06, + "loss": 1.0457, + "num_input_tokens_seen": 142622720, + "step": 17410 + }, + { + "epoch": 0.8364945978391356, + "grad_norm": 0.8642867207527161, + "learning_rate": 3.5518537016788646e-06, + "loss": 1.0179, + "num_input_tokens_seen": 142704640, + "step": 17420 + }, + { + "epoch": 0.8369747899159664, + "grad_norm": 0.487728476524353, + "learning_rate": 3.531526703996557e-06, + "loss": 0.8232, + "num_input_tokens_seen": 142786560, + "step": 17430 + }, + { + "epoch": 0.8374549819927971, + "grad_norm": 0.48335549235343933, + "learning_rate": 3.511253616946325e-06, + "loss": 0.9279, + "num_input_tokens_seen": 142868480, + "step": 17440 + }, + { + "epoch": 0.8379351740696278, + "grad_norm": 0.5654048323631287, + "learning_rate": 3.4910344914370093e-06, + "loss": 0.8721, + "num_input_tokens_seen": 142950400, + "step": 17450 + }, + { + "epoch": 0.8384153661464586, + "grad_norm": 0.6163344383239746, + "learning_rate": 3.4708693782419225e-06, + "loss": 0.8941, + "num_input_tokens_seen": 143032320, + "step": 17460 + }, + { + "epoch": 0.8388955582232893, + "grad_norm": 0.5046043992042542, + "learning_rate": 3.450758327998768e-06, + "loss": 0.8403, + "num_input_tokens_seen": 143114240, + "step": 17470 + }, + { + "epoch": 0.83937575030012, + "grad_norm": 0.44665244221687317, + "learning_rate": 3.4307013912094845e-06, + "loss": 0.9405, + "num_input_tokens_seen": 143196160, + "step": 17480 + }, + { + "epoch": 0.8398559423769508, + "grad_norm": 0.5446330904960632, + "learning_rate": 3.41069861824011e-06, + "loss": 0.6972, + "num_input_tokens_seen": 143278080, + "step": 17490 + }, + { + "epoch": 0.8403361344537815, + "grad_norm": 0.5121775269508362, + "learning_rate": 3.390750059320688e-06, + "loss": 0.9488, + "num_input_tokens_seen": 143360000, + "step": 17500 + }, + { + "epoch": 0.8408163265306122, + "grad_norm": 0.42379680275917053, + "learning_rate": 3.3708557645451053e-06, + "loss": 0.7979, + "num_input_tokens_seen": 143441920, + "step": 17510 + }, + { + "epoch": 0.8412965186074429, + "grad_norm": 0.915107250213623, + "learning_rate": 3.3510157838709895e-06, + "loss": 0.9223, + "num_input_tokens_seen": 143523840, + "step": 17520 + }, + { + "epoch": 0.8417767106842737, + "grad_norm": 0.4954970180988312, + "learning_rate": 3.3312301671195784e-06, + "loss": 1.0832, + "num_input_tokens_seen": 143605760, + "step": 17530 + }, + { + "epoch": 0.8422569027611044, + "grad_norm": 0.37159988284111023, + "learning_rate": 3.3114989639755983e-06, + "loss": 0.8365, + "num_input_tokens_seen": 143687680, + "step": 17540 + }, + { + "epoch": 0.8427370948379351, + "grad_norm": 0.4709157943725586, + "learning_rate": 3.2918222239871206e-06, + "loss": 0.8637, + "num_input_tokens_seen": 143769600, + "step": 17550 + }, + { + "epoch": 0.8432172869147659, + "grad_norm": 0.4602372646331787, + "learning_rate": 3.272199996565464e-06, + "loss": 0.8599, + "num_input_tokens_seen": 143851520, + "step": 17560 + }, + { + "epoch": 0.8436974789915966, + "grad_norm": 0.4694511890411377, + "learning_rate": 3.252632330985059e-06, + "loss": 0.8833, + "num_input_tokens_seen": 143933440, + "step": 17570 + }, + { + "epoch": 0.8441776710684273, + "grad_norm": 0.4874444603919983, + "learning_rate": 3.233119276383309e-06, + "loss": 0.8098, + "num_input_tokens_seen": 144015360, + "step": 17580 + }, + { + "epoch": 0.8446578631452581, + "grad_norm": 0.7956154346466064, + "learning_rate": 3.2136608817604998e-06, + "loss": 0.8137, + "num_input_tokens_seen": 144097280, + "step": 17590 + }, + { + "epoch": 0.8451380552220888, + "grad_norm": 0.5142585039138794, + "learning_rate": 3.1942571959796414e-06, + "loss": 0.9244, + "num_input_tokens_seen": 144179200, + "step": 17600 + }, + { + "epoch": 0.8456182472989195, + "grad_norm": 0.49832209944725037, + "learning_rate": 3.1749082677663606e-06, + "loss": 0.9445, + "num_input_tokens_seen": 144261120, + "step": 17610 + }, + { + "epoch": 0.8460984393757504, + "grad_norm": 0.49295270442962646, + "learning_rate": 3.1556141457087932e-06, + "loss": 1.0138, + "num_input_tokens_seen": 144343040, + "step": 17620 + }, + { + "epoch": 0.846578631452581, + "grad_norm": 0.49073633551597595, + "learning_rate": 3.1363748782574475e-06, + "loss": 0.8728, + "num_input_tokens_seen": 144424960, + "step": 17630 + }, + { + "epoch": 0.8470588235294118, + "grad_norm": 0.48808997869491577, + "learning_rate": 3.1171905137250655e-06, + "loss": 0.8881, + "num_input_tokens_seen": 144506880, + "step": 17640 + }, + { + "epoch": 0.8475390156062425, + "grad_norm": 0.5085486769676208, + "learning_rate": 3.098061100286537e-06, + "loss": 0.8107, + "num_input_tokens_seen": 144588800, + "step": 17650 + }, + { + "epoch": 0.8480192076830733, + "grad_norm": 0.4130302965641022, + "learning_rate": 3.078986685978763e-06, + "loss": 1.0014, + "num_input_tokens_seen": 144670720, + "step": 17660 + }, + { + "epoch": 0.848499399759904, + "grad_norm": 0.4925101697444916, + "learning_rate": 3.059967318700513e-06, + "loss": 0.7933, + "num_input_tokens_seen": 144752640, + "step": 17670 + }, + { + "epoch": 0.8489795918367347, + "grad_norm": 0.5227538347244263, + "learning_rate": 3.0410030462123486e-06, + "loss": 0.8734, + "num_input_tokens_seen": 144834560, + "step": 17680 + }, + { + "epoch": 0.8494597839135655, + "grad_norm": 0.48874667286872864, + "learning_rate": 3.022093916136465e-06, + "loss": 1.1692, + "num_input_tokens_seen": 144916480, + "step": 17690 + }, + { + "epoch": 0.8499399759903962, + "grad_norm": 0.5069953799247742, + "learning_rate": 3.0032399759565845e-06, + "loss": 0.8841, + "num_input_tokens_seen": 144998400, + "step": 17700 + }, + { + "epoch": 0.8504201680672269, + "grad_norm": 0.49537745118141174, + "learning_rate": 2.9844412730178515e-06, + "loss": 0.9305, + "num_input_tokens_seen": 145080320, + "step": 17710 + }, + { + "epoch": 0.8509003601440577, + "grad_norm": 0.48820367455482483, + "learning_rate": 2.9656978545267002e-06, + "loss": 0.9277, + "num_input_tokens_seen": 145162240, + "step": 17720 + }, + { + "epoch": 0.8513805522208884, + "grad_norm": 0.5144213438034058, + "learning_rate": 2.947009767550718e-06, + "loss": 0.9099, + "num_input_tokens_seen": 145244160, + "step": 17730 + }, + { + "epoch": 0.8518607442977191, + "grad_norm": 0.5397959351539612, + "learning_rate": 2.9283770590185696e-06, + "loss": 1.0301, + "num_input_tokens_seen": 145326080, + "step": 17740 + }, + { + "epoch": 0.8523409363745498, + "grad_norm": 0.8778235912322998, + "learning_rate": 2.9097997757198516e-06, + "loss": 1.1378, + "num_input_tokens_seen": 145408000, + "step": 17750 + }, + { + "epoch": 0.8528211284513806, + "grad_norm": 0.4958626329898834, + "learning_rate": 2.891277964304959e-06, + "loss": 0.8219, + "num_input_tokens_seen": 145489920, + "step": 17760 + }, + { + "epoch": 0.8533013205282113, + "grad_norm": 0.5561241507530212, + "learning_rate": 2.8728116712850193e-06, + "loss": 0.8583, + "num_input_tokens_seen": 145571840, + "step": 17770 + }, + { + "epoch": 0.853781512605042, + "grad_norm": 0.5028048753738403, + "learning_rate": 2.8544009430317153e-06, + "loss": 0.9945, + "num_input_tokens_seen": 145653760, + "step": 17780 + }, + { + "epoch": 0.8542617046818728, + "grad_norm": 0.4903087615966797, + "learning_rate": 2.8360458257772228e-06, + "loss": 0.9056, + "num_input_tokens_seen": 145735680, + "step": 17790 + }, + { + "epoch": 0.8547418967587035, + "grad_norm": 0.4912867844104767, + "learning_rate": 2.817746365614049e-06, + "loss": 0.8998, + "num_input_tokens_seen": 145817600, + "step": 17800 + }, + { + "epoch": 0.8552220888355342, + "grad_norm": 0.5079895853996277, + "learning_rate": 2.7995026084949584e-06, + "loss": 0.931, + "num_input_tokens_seen": 145899520, + "step": 17810 + }, + { + "epoch": 0.855702280912365, + "grad_norm": 0.547659695148468, + "learning_rate": 2.781314600232815e-06, + "loss": 1.0527, + "num_input_tokens_seen": 145981440, + "step": 17820 + }, + { + "epoch": 0.8561824729891957, + "grad_norm": 0.4735318124294281, + "learning_rate": 2.763182386500504e-06, + "loss": 0.8695, + "num_input_tokens_seen": 146063360, + "step": 17830 + }, + { + "epoch": 0.8566626650660264, + "grad_norm": 0.4886067509651184, + "learning_rate": 2.745106012830806e-06, + "loss": 0.9396, + "num_input_tokens_seen": 146145280, + "step": 17840 + }, + { + "epoch": 0.8571428571428571, + "grad_norm": 0.48998895287513733, + "learning_rate": 2.7270855246162547e-06, + "loss": 1.0026, + "num_input_tokens_seen": 146227200, + "step": 17850 + }, + { + "epoch": 0.8576230492196879, + "grad_norm": 0.4987630844116211, + "learning_rate": 2.7091209671090715e-06, + "loss": 0.9145, + "num_input_tokens_seen": 146309120, + "step": 17860 + }, + { + "epoch": 0.8581032412965186, + "grad_norm": 0.5029290318489075, + "learning_rate": 2.6912123854210212e-06, + "loss": 0.8782, + "num_input_tokens_seen": 146391040, + "step": 17870 + }, + { + "epoch": 0.8585834333733493, + "grad_norm": 0.5902127027511597, + "learning_rate": 2.673359824523297e-06, + "loss": 0.9368, + "num_input_tokens_seen": 146472960, + "step": 17880 + }, + { + "epoch": 0.8590636254501801, + "grad_norm": 0.6036652326583862, + "learning_rate": 2.655563329246413e-06, + "loss": 0.946, + "num_input_tokens_seen": 146554880, + "step": 17890 + }, + { + "epoch": 0.8595438175270108, + "grad_norm": 0.4699823558330536, + "learning_rate": 2.637822944280116e-06, + "loss": 0.8075, + "num_input_tokens_seen": 146636800, + "step": 17900 + }, + { + "epoch": 0.8600240096038415, + "grad_norm": 0.5656408071517944, + "learning_rate": 2.6201387141732205e-06, + "loss": 0.9612, + "num_input_tokens_seen": 146718720, + "step": 17910 + }, + { + "epoch": 0.8605042016806723, + "grad_norm": 0.4844336211681366, + "learning_rate": 2.6025106833335505e-06, + "loss": 1.0511, + "num_input_tokens_seen": 146800640, + "step": 17920 + }, + { + "epoch": 0.860984393757503, + "grad_norm": 0.7261025309562683, + "learning_rate": 2.5849388960277997e-06, + "loss": 0.7721, + "num_input_tokens_seen": 146882560, + "step": 17930 + }, + { + "epoch": 0.8614645858343337, + "grad_norm": 0.5041067600250244, + "learning_rate": 2.567423396381419e-06, + "loss": 0.7826, + "num_input_tokens_seen": 146964480, + "step": 17940 + }, + { + "epoch": 0.8619447779111644, + "grad_norm": 0.28633078932762146, + "learning_rate": 2.549964228378518e-06, + "loss": 0.7408, + "num_input_tokens_seen": 147046400, + "step": 17950 + }, + { + "epoch": 0.8624249699879952, + "grad_norm": 0.8320537209510803, + "learning_rate": 2.532561435861755e-06, + "loss": 0.906, + "num_input_tokens_seen": 147128320, + "step": 17960 + }, + { + "epoch": 0.8629051620648259, + "grad_norm": 0.4853600561618805, + "learning_rate": 2.515215062532206e-06, + "loss": 0.8907, + "num_input_tokens_seen": 147210240, + "step": 17970 + }, + { + "epoch": 0.8633853541416566, + "grad_norm": 0.5016003251075745, + "learning_rate": 2.497925151949271e-06, + "loss": 0.9341, + "num_input_tokens_seen": 147292160, + "step": 17980 + }, + { + "epoch": 0.8638655462184874, + "grad_norm": 0.4873434603214264, + "learning_rate": 2.4806917475305806e-06, + "loss": 0.9855, + "num_input_tokens_seen": 147374080, + "step": 17990 + }, + { + "epoch": 0.8643457382953181, + "grad_norm": 0.4814073443412781, + "learning_rate": 2.4635148925518577e-06, + "loss": 0.8596, + "num_input_tokens_seen": 147456000, + "step": 18000 + }, + { + "epoch": 0.8648259303721488, + "grad_norm": 0.4790041446685791, + "learning_rate": 2.4463946301468143e-06, + "loss": 0.8384, + "num_input_tokens_seen": 147537920, + "step": 18010 + }, + { + "epoch": 0.8653061224489796, + "grad_norm": 0.4911400377750397, + "learning_rate": 2.4293310033070614e-06, + "loss": 0.9121, + "num_input_tokens_seen": 147619840, + "step": 18020 + }, + { + "epoch": 0.8657863145258103, + "grad_norm": 0.5386258363723755, + "learning_rate": 2.4123240548819955e-06, + "loss": 0.9519, + "num_input_tokens_seen": 147701760, + "step": 18030 + }, + { + "epoch": 0.866266506602641, + "grad_norm": 0.4828698933124542, + "learning_rate": 2.3953738275786565e-06, + "loss": 0.7976, + "num_input_tokens_seen": 147783680, + "step": 18040 + }, + { + "epoch": 0.8667466986794717, + "grad_norm": 0.4941945970058441, + "learning_rate": 2.3784803639616854e-06, + "loss": 0.8773, + "num_input_tokens_seen": 147865600, + "step": 18050 + }, + { + "epoch": 0.8672268907563025, + "grad_norm": 0.4941422939300537, + "learning_rate": 2.361643706453151e-06, + "loss": 0.8026, + "num_input_tokens_seen": 147947520, + "step": 18060 + }, + { + "epoch": 0.8677070828331332, + "grad_norm": 0.5104964971542358, + "learning_rate": 2.3448638973324833e-06, + "loss": 0.8714, + "num_input_tokens_seen": 148029440, + "step": 18070 + }, + { + "epoch": 0.868187274909964, + "grad_norm": 0.4910539984703064, + "learning_rate": 2.328140978736365e-06, + "loss": 1.224, + "num_input_tokens_seen": 148111360, + "step": 18080 + }, + { + "epoch": 0.8686674669867948, + "grad_norm": 0.6451656222343445, + "learning_rate": 2.311474992658613e-06, + "loss": 0.8576, + "num_input_tokens_seen": 148193280, + "step": 18090 + }, + { + "epoch": 0.8691476590636255, + "grad_norm": 0.47667035460472107, + "learning_rate": 2.29486598095007e-06, + "loss": 0.7746, + "num_input_tokens_seen": 148275200, + "step": 18100 + }, + { + "epoch": 0.8696278511404562, + "grad_norm": 0.4710484743118286, + "learning_rate": 2.278313985318517e-06, + "loss": 0.9388, + "num_input_tokens_seen": 148357120, + "step": 18110 + }, + { + "epoch": 0.870108043217287, + "grad_norm": 0.5291144251823425, + "learning_rate": 2.261819047328562e-06, + "loss": 0.8536, + "num_input_tokens_seen": 148439040, + "step": 18120 + }, + { + "epoch": 0.8705882352941177, + "grad_norm": 0.5111077427864075, + "learning_rate": 2.2453812084015175e-06, + "loss": 0.9051, + "num_input_tokens_seen": 148520960, + "step": 18130 + }, + { + "epoch": 0.8710684273709484, + "grad_norm": 0.4752892553806305, + "learning_rate": 2.2290005098153296e-06, + "loss": 1.2451, + "num_input_tokens_seen": 148602880, + "step": 18140 + }, + { + "epoch": 0.8715486194477791, + "grad_norm": 0.501112163066864, + "learning_rate": 2.212676992704435e-06, + "loss": 0.9271, + "num_input_tokens_seen": 148684800, + "step": 18150 + }, + { + "epoch": 0.8720288115246099, + "grad_norm": 0.46904444694519043, + "learning_rate": 2.1964106980597034e-06, + "loss": 0.6553, + "num_input_tokens_seen": 148766720, + "step": 18160 + }, + { + "epoch": 0.8725090036014406, + "grad_norm": 0.5374292135238647, + "learning_rate": 2.1802016667282847e-06, + "loss": 0.8742, + "num_input_tokens_seen": 148848640, + "step": 18170 + }, + { + "epoch": 0.8729891956782713, + "grad_norm": 0.5054563283920288, + "learning_rate": 2.1640499394135595e-06, + "loss": 0.8704, + "num_input_tokens_seen": 148930560, + "step": 18180 + }, + { + "epoch": 0.8734693877551021, + "grad_norm": 0.48471224308013916, + "learning_rate": 2.1479555566749825e-06, + "loss": 0.9272, + "num_input_tokens_seen": 149012480, + "step": 18190 + }, + { + "epoch": 0.8739495798319328, + "grad_norm": 0.5023584961891174, + "learning_rate": 2.131918558928023e-06, + "loss": 0.9053, + "num_input_tokens_seen": 149094400, + "step": 18200 + }, + { + "epoch": 0.8744297719087635, + "grad_norm": 0.46324488520622253, + "learning_rate": 2.1159389864440495e-06, + "loss": 0.8871, + "num_input_tokens_seen": 149176320, + "step": 18210 + }, + { + "epoch": 0.8749099639855943, + "grad_norm": 0.5278415083885193, + "learning_rate": 2.100016879350214e-06, + "loss": 0.7591, + "num_input_tokens_seen": 149258240, + "step": 18220 + }, + { + "epoch": 0.875390156062425, + "grad_norm": 0.4872019290924072, + "learning_rate": 2.0841522776293725e-06, + "loss": 0.8473, + "num_input_tokens_seen": 149340160, + "step": 18230 + }, + { + "epoch": 0.8758703481392557, + "grad_norm": 0.5031896233558655, + "learning_rate": 2.0683452211199854e-06, + "loss": 1.118, + "num_input_tokens_seen": 149422080, + "step": 18240 + }, + { + "epoch": 0.8763505402160864, + "grad_norm": 0.48833465576171875, + "learning_rate": 2.052595749515987e-06, + "loss": 0.8301, + "num_input_tokens_seen": 149504000, + "step": 18250 + }, + { + "epoch": 0.8768307322929172, + "grad_norm": 0.5221076607704163, + "learning_rate": 2.0369039023667215e-06, + "loss": 0.9347, + "num_input_tokens_seen": 149585920, + "step": 18260 + }, + { + "epoch": 0.8773109243697479, + "grad_norm": 0.476140558719635, + "learning_rate": 2.0212697190768263e-06, + "loss": 0.9084, + "num_input_tokens_seen": 149667840, + "step": 18270 + }, + { + "epoch": 0.8777911164465786, + "grad_norm": 0.49334678053855896, + "learning_rate": 2.0056932389061338e-06, + "loss": 0.8173, + "num_input_tokens_seen": 149749760, + "step": 18280 + }, + { + "epoch": 0.8782713085234094, + "grad_norm": 0.8516491055488586, + "learning_rate": 1.9901745009695773e-06, + "loss": 0.9435, + "num_input_tokens_seen": 149831680, + "step": 18290 + }, + { + "epoch": 0.8787515006002401, + "grad_norm": 0.5442166328430176, + "learning_rate": 1.9747135442370946e-06, + "loss": 0.8976, + "num_input_tokens_seen": 149913600, + "step": 18300 + }, + { + "epoch": 0.8792316926770708, + "grad_norm": 0.3513561189174652, + "learning_rate": 1.9593104075335158e-06, + "loss": 0.762, + "num_input_tokens_seen": 149995520, + "step": 18310 + }, + { + "epoch": 0.8797118847539016, + "grad_norm": 0.48064127564430237, + "learning_rate": 1.943965129538483e-06, + "loss": 0.8791, + "num_input_tokens_seen": 150077440, + "step": 18320 + }, + { + "epoch": 0.8801920768307323, + "grad_norm": 0.48252299427986145, + "learning_rate": 1.9286777487863477e-06, + "loss": 0.8332, + "num_input_tokens_seen": 150159360, + "step": 18330 + }, + { + "epoch": 0.880672268907563, + "grad_norm": 0.45519593358039856, + "learning_rate": 1.913448303666071e-06, + "loss": 0.999, + "num_input_tokens_seen": 150241280, + "step": 18340 + }, + { + "epoch": 0.8811524609843937, + "grad_norm": 0.47915107011795044, + "learning_rate": 1.8982768324211197e-06, + "loss": 0.9294, + "num_input_tokens_seen": 150323200, + "step": 18350 + }, + { + "epoch": 0.8816326530612245, + "grad_norm": 0.5818130373954773, + "learning_rate": 1.8831633731493963e-06, + "loss": 0.9785, + "num_input_tokens_seen": 150405120, + "step": 18360 + }, + { + "epoch": 0.8821128451380552, + "grad_norm": 0.4887988865375519, + "learning_rate": 1.8681079638031062e-06, + "loss": 0.8177, + "num_input_tokens_seen": 150487040, + "step": 18370 + }, + { + "epoch": 0.8825930372148859, + "grad_norm": 0.6993192434310913, + "learning_rate": 1.8531106421887017e-06, + "loss": 0.8862, + "num_input_tokens_seen": 150568960, + "step": 18380 + }, + { + "epoch": 0.8830732292917167, + "grad_norm": 0.486240416765213, + "learning_rate": 1.8381714459667603e-06, + "loss": 0.7221, + "num_input_tokens_seen": 150650880, + "step": 18390 + }, + { + "epoch": 0.8835534213685474, + "grad_norm": 0.7017090916633606, + "learning_rate": 1.823290412651893e-06, + "loss": 1.0007, + "num_input_tokens_seen": 150732800, + "step": 18400 + }, + { + "epoch": 0.8840336134453781, + "grad_norm": 0.46545252203941345, + "learning_rate": 1.8084675796126576e-06, + "loss": 1.1891, + "num_input_tokens_seen": 150814720, + "step": 18410 + }, + { + "epoch": 0.8845138055222089, + "grad_norm": 0.5387531518936157, + "learning_rate": 1.7937029840714715e-06, + "loss": 1.0123, + "num_input_tokens_seen": 150896640, + "step": 18420 + }, + { + "epoch": 0.8849939975990396, + "grad_norm": 0.4813983142375946, + "learning_rate": 1.778996663104493e-06, + "loss": 0.8358, + "num_input_tokens_seen": 150978560, + "step": 18430 + }, + { + "epoch": 0.8854741896758703, + "grad_norm": 0.4829707443714142, + "learning_rate": 1.7643486536415537e-06, + "loss": 0.9354, + "num_input_tokens_seen": 151060480, + "step": 18440 + }, + { + "epoch": 0.885954381752701, + "grad_norm": 0.4897553622722626, + "learning_rate": 1.7497589924660552e-06, + "loss": 0.8802, + "num_input_tokens_seen": 151142400, + "step": 18450 + }, + { + "epoch": 0.8864345738295318, + "grad_norm": 0.826809823513031, + "learning_rate": 1.7352277162148712e-06, + "loss": 0.8142, + "num_input_tokens_seen": 151224320, + "step": 18460 + }, + { + "epoch": 0.8869147659063625, + "grad_norm": 0.4941288232803345, + "learning_rate": 1.7207548613782709e-06, + "loss": 0.8752, + "num_input_tokens_seen": 151306240, + "step": 18470 + }, + { + "epoch": 0.8873949579831932, + "grad_norm": 0.4736701548099518, + "learning_rate": 1.7063404642998186e-06, + "loss": 1.0158, + "num_input_tokens_seen": 151388160, + "step": 18480 + }, + { + "epoch": 0.887875150060024, + "grad_norm": 0.5565969944000244, + "learning_rate": 1.6919845611762714e-06, + "loss": 0.9463, + "num_input_tokens_seen": 151470080, + "step": 18490 + }, + { + "epoch": 0.8883553421368547, + "grad_norm": 0.523489773273468, + "learning_rate": 1.6776871880575084e-06, + "loss": 1.0312, + "num_input_tokens_seen": 151552000, + "step": 18500 + }, + { + "epoch": 0.8888355342136854, + "grad_norm": 0.5504932403564453, + "learning_rate": 1.663448380846433e-06, + "loss": 0.841, + "num_input_tokens_seen": 151633920, + "step": 18510 + }, + { + "epoch": 0.8893157262905163, + "grad_norm": 2.449960470199585, + "learning_rate": 1.649268175298868e-06, + "loss": 0.819, + "num_input_tokens_seen": 151715840, + "step": 18520 + }, + { + "epoch": 0.889795918367347, + "grad_norm": 0.6405714750289917, + "learning_rate": 1.6351466070234882e-06, + "loss": 1.1654, + "num_input_tokens_seen": 151797760, + "step": 18530 + }, + { + "epoch": 0.8902761104441776, + "grad_norm": 0.5734189748764038, + "learning_rate": 1.6210837114817272e-06, + "loss": 0.8883, + "num_input_tokens_seen": 151879680, + "step": 18540 + }, + { + "epoch": 0.8907563025210085, + "grad_norm": 0.4893549382686615, + "learning_rate": 1.6070795239876618e-06, + "loss": 0.785, + "num_input_tokens_seen": 151961600, + "step": 18550 + }, + { + "epoch": 0.8912364945978392, + "grad_norm": 0.48380154371261597, + "learning_rate": 1.5931340797079613e-06, + "loss": 0.7937, + "num_input_tokens_seen": 152043520, + "step": 18560 + }, + { + "epoch": 0.8917166866746699, + "grad_norm": 0.44979286193847656, + "learning_rate": 1.5792474136617858e-06, + "loss": 0.8338, + "num_input_tokens_seen": 152125440, + "step": 18570 + }, + { + "epoch": 0.8921968787515006, + "grad_norm": 0.48284098505973816, + "learning_rate": 1.5654195607206712e-06, + "loss": 0.8318, + "num_input_tokens_seen": 152207360, + "step": 18580 + }, + { + "epoch": 0.8926770708283314, + "grad_norm": 0.5251838564872742, + "learning_rate": 1.5516505556084888e-06, + "loss": 0.8587, + "num_input_tokens_seen": 152289280, + "step": 18590 + }, + { + "epoch": 0.8931572629051621, + "grad_norm": 0.6006345152854919, + "learning_rate": 1.5379404329013246e-06, + "loss": 1.0141, + "num_input_tokens_seen": 152371200, + "step": 18600 + }, + { + "epoch": 0.8936374549819928, + "grad_norm": 0.48716527223587036, + "learning_rate": 1.5242892270273951e-06, + "loss": 0.8301, + "num_input_tokens_seen": 152453120, + "step": 18610 + }, + { + "epoch": 0.8941176470588236, + "grad_norm": 0.4787551462650299, + "learning_rate": 1.5106969722669812e-06, + "loss": 0.9144, + "num_input_tokens_seen": 152535040, + "step": 18620 + }, + { + "epoch": 0.8945978391356543, + "grad_norm": 0.48084962368011475, + "learning_rate": 1.4971637027523106e-06, + "loss": 1.1305, + "num_input_tokens_seen": 152616960, + "step": 18630 + }, + { + "epoch": 0.895078031212485, + "grad_norm": 0.5110411643981934, + "learning_rate": 1.4836894524675126e-06, + "loss": 0.9489, + "num_input_tokens_seen": 152698880, + "step": 18640 + }, + { + "epoch": 0.8955582232893158, + "grad_norm": 0.5295985341072083, + "learning_rate": 1.4702742552484884e-06, + "loss": 1.1124, + "num_input_tokens_seen": 152780800, + "step": 18650 + }, + { + "epoch": 0.8960384153661465, + "grad_norm": 0.47781914472579956, + "learning_rate": 1.4569181447828623e-06, + "loss": 0.836, + "num_input_tokens_seen": 152862720, + "step": 18660 + }, + { + "epoch": 0.8965186074429772, + "grad_norm": 0.523032546043396, + "learning_rate": 1.4436211546098782e-06, + "loss": 1.0544, + "num_input_tokens_seen": 152944640, + "step": 18670 + }, + { + "epoch": 0.8969987995198079, + "grad_norm": 0.5448631644248962, + "learning_rate": 1.430383318120318e-06, + "loss": 0.9155, + "num_input_tokens_seen": 153026560, + "step": 18680 + }, + { + "epoch": 0.8974789915966387, + "grad_norm": 0.4606798589229584, + "learning_rate": 1.4172046685564212e-06, + "loss": 0.862, + "num_input_tokens_seen": 153108480, + "step": 18690 + }, + { + "epoch": 0.8979591836734694, + "grad_norm": 0.44669726490974426, + "learning_rate": 1.4040852390118042e-06, + "loss": 0.9023, + "num_input_tokens_seen": 153190400, + "step": 18700 + }, + { + "epoch": 0.8984393757503001, + "grad_norm": 0.4901637136936188, + "learning_rate": 1.3910250624313642e-06, + "loss": 1.0013, + "num_input_tokens_seen": 153272320, + "step": 18710 + }, + { + "epoch": 0.8989195678271309, + "grad_norm": 0.4382149875164032, + "learning_rate": 1.3780241716112057e-06, + "loss": 0.8953, + "num_input_tokens_seen": 153354240, + "step": 18720 + }, + { + "epoch": 0.8993997599039616, + "grad_norm": 0.5043530464172363, + "learning_rate": 1.3650825991985722e-06, + "loss": 0.8403, + "num_input_tokens_seen": 153436160, + "step": 18730 + }, + { + "epoch": 0.8998799519807923, + "grad_norm": 0.8724451065063477, + "learning_rate": 1.3522003776917285e-06, + "loss": 0.7954, + "num_input_tokens_seen": 153518080, + "step": 18740 + }, + { + "epoch": 0.9003601440576231, + "grad_norm": 0.4843882620334625, + "learning_rate": 1.3393775394399123e-06, + "loss": 0.8046, + "num_input_tokens_seen": 153600000, + "step": 18750 + }, + { + "epoch": 0.9008403361344538, + "grad_norm": 0.5170858502388, + "learning_rate": 1.326614116643246e-06, + "loss": 0.712, + "num_input_tokens_seen": 153681920, + "step": 18760 + }, + { + "epoch": 0.9013205282112845, + "grad_norm": 0.45620083808898926, + "learning_rate": 1.3139101413526339e-06, + "loss": 0.8778, + "num_input_tokens_seen": 153763840, + "step": 18770 + }, + { + "epoch": 0.9018007202881152, + "grad_norm": 0.710594654083252, + "learning_rate": 1.3012656454697125e-06, + "loss": 1.0348, + "num_input_tokens_seen": 153845760, + "step": 18780 + }, + { + "epoch": 0.902280912364946, + "grad_norm": 0.5861712098121643, + "learning_rate": 1.2886806607467578e-06, + "loss": 0.9589, + "num_input_tokens_seen": 153927680, + "step": 18790 + }, + { + "epoch": 0.9027611044417767, + "grad_norm": 0.48728203773498535, + "learning_rate": 1.2761552187865899e-06, + "loss": 0.9268, + "num_input_tokens_seen": 154009600, + "step": 18800 + }, + { + "epoch": 0.9032412965186074, + "grad_norm": 0.49788719415664673, + "learning_rate": 1.2636893510425186e-06, + "loss": 0.8947, + "num_input_tokens_seen": 154091520, + "step": 18810 + }, + { + "epoch": 0.9037214885954382, + "grad_norm": 0.5064159035682678, + "learning_rate": 1.2512830888182531e-06, + "loss": 0.9701, + "num_input_tokens_seen": 154173440, + "step": 18820 + }, + { + "epoch": 0.9042016806722689, + "grad_norm": 0.5245488286018372, + "learning_rate": 1.23893646326782e-06, + "loss": 0.9968, + "num_input_tokens_seen": 154255360, + "step": 18830 + }, + { + "epoch": 0.9046818727490996, + "grad_norm": 0.49203112721443176, + "learning_rate": 1.2266495053954913e-06, + "loss": 0.7056, + "num_input_tokens_seen": 154337280, + "step": 18840 + }, + { + "epoch": 0.9051620648259304, + "grad_norm": 0.47168827056884766, + "learning_rate": 1.2144222460557074e-06, + "loss": 0.8708, + "num_input_tokens_seen": 154419200, + "step": 18850 + }, + { + "epoch": 0.9056422569027611, + "grad_norm": 0.5028705596923828, + "learning_rate": 1.2022547159529911e-06, + "loss": 1.0057, + "num_input_tokens_seen": 154501120, + "step": 18860 + }, + { + "epoch": 0.9061224489795918, + "grad_norm": 0.4962250888347626, + "learning_rate": 1.190146945641879e-06, + "loss": 0.8827, + "num_input_tokens_seen": 154583040, + "step": 18870 + }, + { + "epoch": 0.9066026410564225, + "grad_norm": 0.5061412453651428, + "learning_rate": 1.1780989655268415e-06, + "loss": 0.9975, + "num_input_tokens_seen": 154664960, + "step": 18880 + }, + { + "epoch": 0.9070828331332533, + "grad_norm": 0.4892423152923584, + "learning_rate": 1.1661108058622082e-06, + "loss": 0.8246, + "num_input_tokens_seen": 154746880, + "step": 18890 + }, + { + "epoch": 0.907563025210084, + "grad_norm": 0.48785221576690674, + "learning_rate": 1.154182496752082e-06, + "loss": 0.7727, + "num_input_tokens_seen": 154828800, + "step": 18900 + }, + { + "epoch": 0.9080432172869147, + "grad_norm": 0.5420461297035217, + "learning_rate": 1.142314068150288e-06, + "loss": 1.0012, + "num_input_tokens_seen": 154910720, + "step": 18910 + }, + { + "epoch": 0.9085234093637455, + "grad_norm": 0.4156251549720764, + "learning_rate": 1.1305055498602584e-06, + "loss": 0.8584, + "num_input_tokens_seen": 154992640, + "step": 18920 + }, + { + "epoch": 0.9090036014405762, + "grad_norm": 0.4756897985935211, + "learning_rate": 1.1187569715350066e-06, + "loss": 0.8211, + "num_input_tokens_seen": 155074560, + "step": 18930 + }, + { + "epoch": 0.9094837935174069, + "grad_norm": 0.47108086943626404, + "learning_rate": 1.1070683626770162e-06, + "loss": 0.7993, + "num_input_tokens_seen": 155156480, + "step": 18940 + }, + { + "epoch": 0.9099639855942377, + "grad_norm": 0.4969809949398041, + "learning_rate": 1.0954397526381694e-06, + "loss": 0.88, + "num_input_tokens_seen": 155238400, + "step": 18950 + }, + { + "epoch": 0.9104441776710684, + "grad_norm": 1.2800873517990112, + "learning_rate": 1.0838711706196992e-06, + "loss": 0.9003, + "num_input_tokens_seen": 155320320, + "step": 18960 + }, + { + "epoch": 0.9109243697478991, + "grad_norm": 0.5365675687789917, + "learning_rate": 1.0723626456720925e-06, + "loss": 0.9935, + "num_input_tokens_seen": 155402240, + "step": 18970 + }, + { + "epoch": 0.9114045618247298, + "grad_norm": 0.516943097114563, + "learning_rate": 1.0609142066950157e-06, + "loss": 1.0318, + "num_input_tokens_seen": 155484160, + "step": 18980 + }, + { + "epoch": 0.9118847539015607, + "grad_norm": 0.5084674954414368, + "learning_rate": 1.0495258824372578e-06, + "loss": 0.837, + "num_input_tokens_seen": 155566080, + "step": 18990 + }, + { + "epoch": 0.9123649459783914, + "grad_norm": 0.5022624135017395, + "learning_rate": 1.0381977014966543e-06, + "loss": 0.7732, + "num_input_tokens_seen": 155648000, + "step": 19000 + }, + { + "epoch": 0.912845138055222, + "grad_norm": 0.47421613335609436, + "learning_rate": 1.0269296923199972e-06, + "loss": 1.0176, + "num_input_tokens_seen": 155729920, + "step": 19010 + }, + { + "epoch": 0.9133253301320529, + "grad_norm": 0.4957895278930664, + "learning_rate": 1.0157218832029969e-06, + "loss": 1.012, + "num_input_tokens_seen": 155811840, + "step": 19020 + }, + { + "epoch": 0.9138055222088836, + "grad_norm": 0.5938453078269958, + "learning_rate": 1.0045743022901787e-06, + "loss": 1.009, + "num_input_tokens_seen": 155893760, + "step": 19030 + }, + { + "epoch": 0.9142857142857143, + "grad_norm": 0.46902185678482056, + "learning_rate": 9.934869775748258e-07, + "loss": 0.7293, + "num_input_tokens_seen": 155975680, + "step": 19040 + }, + { + "epoch": 0.9147659063625451, + "grad_norm": 0.4982973337173462, + "learning_rate": 9.824599368989163e-07, + "loss": 0.8924, + "num_input_tokens_seen": 156057600, + "step": 19050 + }, + { + "epoch": 0.9152460984393758, + "grad_norm": 0.47380825877189636, + "learning_rate": 9.714932079530476e-07, + "loss": 0.8136, + "num_input_tokens_seen": 156139520, + "step": 19060 + }, + { + "epoch": 0.9157262905162065, + "grad_norm": 0.43160074949264526, + "learning_rate": 9.60586818276349e-07, + "loss": 0.7444, + "num_input_tokens_seen": 156221440, + "step": 19070 + }, + { + "epoch": 0.9162064825930372, + "grad_norm": 0.5274019837379456, + "learning_rate": 9.497407952564485e-07, + "loss": 0.7502, + "num_input_tokens_seen": 156303360, + "step": 19080 + }, + { + "epoch": 0.916686674669868, + "grad_norm": 0.5028293132781982, + "learning_rate": 9.389551661293683e-07, + "loss": 0.9223, + "num_input_tokens_seen": 156385280, + "step": 19090 + }, + { + "epoch": 0.9171668667466987, + "grad_norm": 0.4965846538543701, + "learning_rate": 9.282299579794789e-07, + "loss": 0.8431, + "num_input_tokens_seen": 156467200, + "step": 19100 + }, + { + "epoch": 0.9176470588235294, + "grad_norm": 1.1068954467773438, + "learning_rate": 9.175651977394284e-07, + "loss": 0.8672, + "num_input_tokens_seen": 156549120, + "step": 19110 + }, + { + "epoch": 0.9181272509003602, + "grad_norm": 0.4944973886013031, + "learning_rate": 9.069609121900663e-07, + "loss": 0.8328, + "num_input_tokens_seen": 156631040, + "step": 19120 + }, + { + "epoch": 0.9186074429771909, + "grad_norm": 0.47309666872024536, + "learning_rate": 8.964171279603778e-07, + "loss": 0.7688, + "num_input_tokens_seen": 156712960, + "step": 19130 + }, + { + "epoch": 0.9190876350540216, + "grad_norm": 0.4812490940093994, + "learning_rate": 8.859338715274279e-07, + "loss": 0.7694, + "num_input_tokens_seen": 156794880, + "step": 19140 + }, + { + "epoch": 0.9195678271308524, + "grad_norm": 0.7439998984336853, + "learning_rate": 8.755111692162837e-07, + "loss": 0.9128, + "num_input_tokens_seen": 156876800, + "step": 19150 + }, + { + "epoch": 0.9200480192076831, + "grad_norm": 1.6029683351516724, + "learning_rate": 8.651490471999424e-07, + "loss": 0.9739, + "num_input_tokens_seen": 156958720, + "step": 19160 + }, + { + "epoch": 0.9205282112845138, + "grad_norm": 0.5646688938140869, + "learning_rate": 8.548475314992949e-07, + "loss": 0.9776, + "num_input_tokens_seen": 157040640, + "step": 19170 + }, + { + "epoch": 0.9210084033613445, + "grad_norm": 0.4853648841381073, + "learning_rate": 8.446066479830206e-07, + "loss": 0.9048, + "num_input_tokens_seen": 157122560, + "step": 19180 + }, + { + "epoch": 0.9214885954381753, + "grad_norm": 0.5098863840103149, + "learning_rate": 8.344264223675485e-07, + "loss": 0.8928, + "num_input_tokens_seen": 157204480, + "step": 19190 + }, + { + "epoch": 0.921968787515006, + "grad_norm": 0.4745608866214752, + "learning_rate": 8.243068802169906e-07, + "loss": 0.8414, + "num_input_tokens_seen": 157286400, + "step": 19200 + }, + { + "epoch": 0.9224489795918367, + "grad_norm": 0.555088996887207, + "learning_rate": 8.14248046943078e-07, + "loss": 0.9802, + "num_input_tokens_seen": 157368320, + "step": 19210 + }, + { + "epoch": 0.9229291716686675, + "grad_norm": 0.5188843011856079, + "learning_rate": 8.042499478050719e-07, + "loss": 0.7226, + "num_input_tokens_seen": 157450240, + "step": 19220 + }, + { + "epoch": 0.9234093637454982, + "grad_norm": 0.4765109121799469, + "learning_rate": 7.943126079097418e-07, + "loss": 0.8491, + "num_input_tokens_seen": 157532160, + "step": 19230 + }, + { + "epoch": 0.9238895558223289, + "grad_norm": 1.0354602336883545, + "learning_rate": 7.844360522112737e-07, + "loss": 0.9703, + "num_input_tokens_seen": 157614080, + "step": 19240 + }, + { + "epoch": 0.9243697478991597, + "grad_norm": 0.48967981338500977, + "learning_rate": 7.746203055112145e-07, + "loss": 0.8683, + "num_input_tokens_seen": 157696000, + "step": 19250 + }, + { + "epoch": 0.9248499399759904, + "grad_norm": 0.553525984287262, + "learning_rate": 7.648653924584137e-07, + "loss": 0.9395, + "num_input_tokens_seen": 157777920, + "step": 19260 + }, + { + "epoch": 0.9253301320528211, + "grad_norm": 0.47739177942276, + "learning_rate": 7.55171337548946e-07, + "loss": 0.84, + "num_input_tokens_seen": 157859840, + "step": 19270 + }, + { + "epoch": 0.9258103241296518, + "grad_norm": 0.47741973400115967, + "learning_rate": 7.455381651260807e-07, + "loss": 0.7814, + "num_input_tokens_seen": 157941760, + "step": 19280 + }, + { + "epoch": 0.9262905162064826, + "grad_norm": 0.49380743503570557, + "learning_rate": 7.359658993801894e-07, + "loss": 0.954, + "num_input_tokens_seen": 158023680, + "step": 19290 + }, + { + "epoch": 0.9267707082833133, + "grad_norm": 0.5180758833885193, + "learning_rate": 7.264545643486997e-07, + "loss": 0.9109, + "num_input_tokens_seen": 158105600, + "step": 19300 + }, + { + "epoch": 0.927250900360144, + "grad_norm": 0.7896413803100586, + "learning_rate": 7.170041839160368e-07, + "loss": 0.9779, + "num_input_tokens_seen": 158187520, + "step": 19310 + }, + { + "epoch": 0.9277310924369748, + "grad_norm": 0.48943135142326355, + "learning_rate": 7.076147818135537e-07, + "loss": 0.8145, + "num_input_tokens_seen": 158269440, + "step": 19320 + }, + { + "epoch": 0.9282112845138055, + "grad_norm": 1.7075966596603394, + "learning_rate": 6.982863816194812e-07, + "loss": 0.7036, + "num_input_tokens_seen": 158351360, + "step": 19330 + }, + { + "epoch": 0.9286914765906362, + "grad_norm": 0.48135149478912354, + "learning_rate": 6.890190067588648e-07, + "loss": 0.8395, + "num_input_tokens_seen": 158433280, + "step": 19340 + }, + { + "epoch": 0.929171668667467, + "grad_norm": 0.43532514572143555, + "learning_rate": 6.798126805035082e-07, + "loss": 0.9633, + "num_input_tokens_seen": 158515200, + "step": 19350 + }, + { + "epoch": 0.9296518607442977, + "grad_norm": 0.5122432708740234, + "learning_rate": 6.706674259719048e-07, + "loss": 0.9551, + "num_input_tokens_seen": 158597120, + "step": 19360 + }, + { + "epoch": 0.9301320528211284, + "grad_norm": 0.4708029627799988, + "learning_rate": 6.615832661291954e-07, + "loss": 0.895, + "num_input_tokens_seen": 158679040, + "step": 19370 + }, + { + "epoch": 0.9306122448979591, + "grad_norm": 0.5076829195022583, + "learning_rate": 6.525602237870993e-07, + "loss": 0.7502, + "num_input_tokens_seen": 158760960, + "step": 19380 + }, + { + "epoch": 0.9310924369747899, + "grad_norm": 0.5408411622047424, + "learning_rate": 6.435983216038583e-07, + "loss": 0.693, + "num_input_tokens_seen": 158842880, + "step": 19390 + }, + { + "epoch": 0.9315726290516206, + "grad_norm": 0.5284355282783508, + "learning_rate": 6.346975820841927e-07, + "loss": 0.8507, + "num_input_tokens_seen": 158924800, + "step": 19400 + }, + { + "epoch": 0.9320528211284513, + "grad_norm": 0.47476431727409363, + "learning_rate": 6.258580275792153e-07, + "loss": 0.9174, + "num_input_tokens_seen": 159006720, + "step": 19410 + }, + { + "epoch": 0.9325330132052821, + "grad_norm": 0.4776346981525421, + "learning_rate": 6.170796802864115e-07, + "loss": 0.7601, + "num_input_tokens_seen": 159088640, + "step": 19420 + }, + { + "epoch": 0.9330132052821128, + "grad_norm": 0.5064809322357178, + "learning_rate": 6.083625622495565e-07, + "loss": 0.8922, + "num_input_tokens_seen": 159170560, + "step": 19430 + }, + { + "epoch": 0.9334933973589435, + "grad_norm": 0.48573869466781616, + "learning_rate": 5.997066953586761e-07, + "loss": 0.8506, + "num_input_tokens_seen": 159252480, + "step": 19440 + }, + { + "epoch": 0.9339735894357744, + "grad_norm": 0.48475074768066406, + "learning_rate": 5.911121013499721e-07, + "loss": 0.801, + "num_input_tokens_seen": 159334400, + "step": 19450 + }, + { + "epoch": 0.934453781512605, + "grad_norm": 0.5202433466911316, + "learning_rate": 5.825788018057971e-07, + "loss": 0.8358, + "num_input_tokens_seen": 159416320, + "step": 19460 + }, + { + "epoch": 0.9349339735894358, + "grad_norm": 0.48488256335258484, + "learning_rate": 5.741068181545684e-07, + "loss": 1.0555, + "num_input_tokens_seen": 159498240, + "step": 19470 + }, + { + "epoch": 0.9354141656662666, + "grad_norm": 0.467579185962677, + "learning_rate": 5.656961716707459e-07, + "loss": 1.184, + "num_input_tokens_seen": 159580160, + "step": 19480 + }, + { + "epoch": 0.9358943577430973, + "grad_norm": 0.47693246603012085, + "learning_rate": 5.57346883474752e-07, + "loss": 0.8181, + "num_input_tokens_seen": 159662080, + "step": 19490 + }, + { + "epoch": 0.936374549819928, + "grad_norm": 0.3317853808403015, + "learning_rate": 5.490589745329261e-07, + "loss": 0.9217, + "num_input_tokens_seen": 159744000, + "step": 19500 + }, + { + "epoch": 0.9368547418967587, + "grad_norm": 0.48513829708099365, + "learning_rate": 5.408324656574842e-07, + "loss": 1.0929, + "num_input_tokens_seen": 159825920, + "step": 19510 + }, + { + "epoch": 0.9373349339735895, + "grad_norm": 0.5054202675819397, + "learning_rate": 5.326673775064545e-07, + "loss": 0.9711, + "num_input_tokens_seen": 159907840, + "step": 19520 + }, + { + "epoch": 0.9378151260504202, + "grad_norm": 0.48606500029563904, + "learning_rate": 5.245637305836243e-07, + "loss": 0.872, + "num_input_tokens_seen": 159989760, + "step": 19530 + }, + { + "epoch": 0.9382953181272509, + "grad_norm": 0.47018781304359436, + "learning_rate": 5.165215452384936e-07, + "loss": 0.8849, + "num_input_tokens_seen": 160071680, + "step": 19540 + }, + { + "epoch": 0.9387755102040817, + "grad_norm": 0.5236411094665527, + "learning_rate": 5.085408416662274e-07, + "loss": 0.8509, + "num_input_tokens_seen": 160153600, + "step": 19550 + }, + { + "epoch": 0.9392557022809124, + "grad_norm": 0.45992419123649597, + "learning_rate": 5.006216399075947e-07, + "loss": 0.9798, + "num_input_tokens_seen": 160235520, + "step": 19560 + }, + { + "epoch": 0.9397358943577431, + "grad_norm": 0.5223960280418396, + "learning_rate": 4.92763959848927e-07, + "loss": 0.8332, + "num_input_tokens_seen": 160317440, + "step": 19570 + }, + { + "epoch": 0.9402160864345739, + "grad_norm": 0.4571020305156708, + "learning_rate": 4.849678212220682e-07, + "loss": 0.9557, + "num_input_tokens_seen": 160399360, + "step": 19580 + }, + { + "epoch": 0.9406962785114046, + "grad_norm": 0.4470067620277405, + "learning_rate": 4.772332436043165e-07, + "loss": 0.7306, + "num_input_tokens_seen": 160481280, + "step": 19590 + }, + { + "epoch": 0.9411764705882353, + "grad_norm": 0.43297266960144043, + "learning_rate": 4.6956024641838237e-07, + "loss": 0.9514, + "num_input_tokens_seen": 160563200, + "step": 19600 + }, + { + "epoch": 0.941656662665066, + "grad_norm": 0.49218934774398804, + "learning_rate": 4.619488489323448e-07, + "loss": 0.9364, + "num_input_tokens_seen": 160645120, + "step": 19610 + }, + { + "epoch": 0.9421368547418968, + "grad_norm": 0.440889447927475, + "learning_rate": 4.5439907025958405e-07, + "loss": 1.0067, + "num_input_tokens_seen": 160727040, + "step": 19620 + }, + { + "epoch": 0.9426170468187275, + "grad_norm": 0.4658232033252716, + "learning_rate": 4.4691092935876256e-07, + "loss": 0.985, + "num_input_tokens_seen": 160808960, + "step": 19630 + }, + { + "epoch": 0.9430972388955582, + "grad_norm": 0.4865157902240753, + "learning_rate": 4.394844450337443e-07, + "loss": 1.1212, + "num_input_tokens_seen": 160890880, + "step": 19640 + }, + { + "epoch": 0.943577430972389, + "grad_norm": 0.782720685005188, + "learning_rate": 4.3211963593357275e-07, + "loss": 1.0098, + "num_input_tokens_seen": 160972800, + "step": 19650 + }, + { + "epoch": 0.9440576230492197, + "grad_norm": 0.4879094660282135, + "learning_rate": 4.248165205524152e-07, + "loss": 0.8137, + "num_input_tokens_seen": 161054720, + "step": 19660 + }, + { + "epoch": 0.9445378151260504, + "grad_norm": 0.45209428668022156, + "learning_rate": 4.175751172295156e-07, + "loss": 0.9042, + "num_input_tokens_seen": 161136640, + "step": 19670 + }, + { + "epoch": 0.9450180072028812, + "grad_norm": 0.4872719347476959, + "learning_rate": 4.1039544414914753e-07, + "loss": 0.8619, + "num_input_tokens_seen": 161218560, + "step": 19680 + }, + { + "epoch": 0.9454981992797119, + "grad_norm": 0.7091497182846069, + "learning_rate": 4.032775193405752e-07, + "loss": 0.8915, + "num_input_tokens_seen": 161300480, + "step": 19690 + }, + { + "epoch": 0.9459783913565426, + "grad_norm": 0.45862463116645813, + "learning_rate": 3.96221360677998e-07, + "loss": 0.815, + "num_input_tokens_seen": 161382400, + "step": 19700 + }, + { + "epoch": 0.9464585834333733, + "grad_norm": 0.5041652321815491, + "learning_rate": 3.892269858805142e-07, + "loss": 0.9076, + "num_input_tokens_seen": 161464320, + "step": 19710 + }, + { + "epoch": 0.9469387755102041, + "grad_norm": 0.4937167763710022, + "learning_rate": 3.82294412512077e-07, + "loss": 0.8012, + "num_input_tokens_seen": 161546240, + "step": 19720 + }, + { + "epoch": 0.9474189675870348, + "grad_norm": 0.44221341609954834, + "learning_rate": 3.7542365798143573e-07, + "loss": 0.8382, + "num_input_tokens_seen": 161628160, + "step": 19730 + }, + { + "epoch": 0.9478991596638655, + "grad_norm": 0.4632039964199066, + "learning_rate": 3.6861473954210855e-07, + "loss": 1.0418, + "num_input_tokens_seen": 161710080, + "step": 19740 + }, + { + "epoch": 0.9483793517406963, + "grad_norm": 0.4950004518032074, + "learning_rate": 3.6186767429234323e-07, + "loss": 0.7393, + "num_input_tokens_seen": 161792000, + "step": 19750 + }, + { + "epoch": 0.948859543817527, + "grad_norm": 0.4772696793079376, + "learning_rate": 3.5518247917505077e-07, + "loss": 0.8615, + "num_input_tokens_seen": 161873920, + "step": 19760 + }, + { + "epoch": 0.9493397358943577, + "grad_norm": 0.4546825885772705, + "learning_rate": 3.485591709777802e-07, + "loss": 1.041, + "num_input_tokens_seen": 161955840, + "step": 19770 + }, + { + "epoch": 0.9498199279711885, + "grad_norm": 0.4924812912940979, + "learning_rate": 3.419977663326801e-07, + "loss": 0.813, + "num_input_tokens_seen": 162037760, + "step": 19780 + }, + { + "epoch": 0.9503001200480192, + "grad_norm": 0.5705674290657043, + "learning_rate": 3.3549828171644537e-07, + "loss": 1.2698, + "num_input_tokens_seen": 162119680, + "step": 19790 + }, + { + "epoch": 0.9507803121248499, + "grad_norm": 0.48878228664398193, + "learning_rate": 3.29060733450276e-07, + "loss": 0.8896, + "num_input_tokens_seen": 162201600, + "step": 19800 + }, + { + "epoch": 0.9512605042016806, + "grad_norm": 0.4292643070220947, + "learning_rate": 3.2268513769984634e-07, + "loss": 1.0284, + "num_input_tokens_seen": 162283520, + "step": 19810 + }, + { + "epoch": 0.9517406962785114, + "grad_norm": 0.715688169002533, + "learning_rate": 3.163715104752524e-07, + "loss": 0.9543, + "num_input_tokens_seen": 162365440, + "step": 19820 + }, + { + "epoch": 0.9522208883553421, + "grad_norm": 0.46790245175361633, + "learning_rate": 3.101198676309841e-07, + "loss": 0.9059, + "num_input_tokens_seen": 162447360, + "step": 19830 + }, + { + "epoch": 0.9527010804321728, + "grad_norm": 0.48006242513656616, + "learning_rate": 3.039302248658754e-07, + "loss": 0.9259, + "num_input_tokens_seen": 162529280, + "step": 19840 + }, + { + "epoch": 0.9531812725090036, + "grad_norm": 0.3433704376220703, + "learning_rate": 2.978025977230736e-07, + "loss": 1.0082, + "num_input_tokens_seen": 162611200, + "step": 19850 + }, + { + "epoch": 0.9536614645858343, + "grad_norm": 0.4852867126464844, + "learning_rate": 2.91737001589984e-07, + "loss": 0.9153, + "num_input_tokens_seen": 162693120, + "step": 19860 + }, + { + "epoch": 0.954141656662665, + "grad_norm": 0.5154809951782227, + "learning_rate": 2.8573345169825296e-07, + "loss": 0.9446, + "num_input_tokens_seen": 162775040, + "step": 19870 + }, + { + "epoch": 0.9546218487394958, + "grad_norm": 0.707761287689209, + "learning_rate": 2.797919631237156e-07, + "loss": 0.7736, + "num_input_tokens_seen": 162856960, + "step": 19880 + }, + { + "epoch": 0.9551020408163265, + "grad_norm": 0.3873317837715149, + "learning_rate": 2.739125507863649e-07, + "loss": 0.9343, + "num_input_tokens_seen": 162938880, + "step": 19890 + }, + { + "epoch": 0.9555822328931572, + "grad_norm": 0.48126840591430664, + "learning_rate": 2.680952294503075e-07, + "loss": 0.9554, + "num_input_tokens_seen": 163020800, + "step": 19900 + }, + { + "epoch": 0.956062424969988, + "grad_norm": 0.4585091471672058, + "learning_rate": 2.6234001372372194e-07, + "loss": 0.9323, + "num_input_tokens_seen": 163102720, + "step": 19910 + }, + { + "epoch": 0.9565426170468188, + "grad_norm": 0.5132349729537964, + "learning_rate": 2.5664691805884767e-07, + "loss": 0.9947, + "num_input_tokens_seen": 163184640, + "step": 19920 + }, + { + "epoch": 0.9570228091236495, + "grad_norm": 0.5150689482688904, + "learning_rate": 2.5101595675191827e-07, + "loss": 0.9797, + "num_input_tokens_seen": 163266560, + "step": 19930 + }, + { + "epoch": 0.9575030012004802, + "grad_norm": 0.46773561835289, + "learning_rate": 2.4544714394314215e-07, + "loss": 0.9892, + "num_input_tokens_seen": 163348480, + "step": 19940 + }, + { + "epoch": 0.957983193277311, + "grad_norm": 0.7742196917533875, + "learning_rate": 2.399404936166638e-07, + "loss": 1.171, + "num_input_tokens_seen": 163430400, + "step": 19950 + }, + { + "epoch": 0.9584633853541417, + "grad_norm": 0.4795765280723572, + "learning_rate": 2.3449601960052746e-07, + "loss": 0.7551, + "num_input_tokens_seen": 163512320, + "step": 19960 + }, + { + "epoch": 0.9589435774309724, + "grad_norm": 0.46741360425949097, + "learning_rate": 2.2911373556664118e-07, + "loss": 0.7974, + "num_input_tokens_seen": 163594240, + "step": 19970 + }, + { + "epoch": 0.9594237695078032, + "grad_norm": 0.47377344965934753, + "learning_rate": 2.23793655030749e-07, + "loss": 0.8963, + "num_input_tokens_seen": 163676160, + "step": 19980 + }, + { + "epoch": 0.9599039615846339, + "grad_norm": 0.5991731286048889, + "learning_rate": 2.1853579135238667e-07, + "loss": 0.9377, + "num_input_tokens_seen": 163758080, + "step": 19990 + }, + { + "epoch": 0.9603841536614646, + "grad_norm": 0.4813549816608429, + "learning_rate": 2.1334015773486203e-07, + "loss": 1.0792, + "num_input_tokens_seen": 163840000, + "step": 20000 + }, + { + "epoch": 0.9608643457382953, + "grad_norm": 0.5007798075675964, + "learning_rate": 2.0820676722520526e-07, + "loss": 0.7725, + "num_input_tokens_seen": 163921920, + "step": 20010 + }, + { + "epoch": 0.9613445378151261, + "grad_norm": 0.49032649397850037, + "learning_rate": 2.0313563271414927e-07, + "loss": 0.8982, + "num_input_tokens_seen": 164003840, + "step": 20020 + }, + { + "epoch": 0.9618247298919568, + "grad_norm": 0.49168848991394043, + "learning_rate": 1.9812676693608812e-07, + "loss": 0.7376, + "num_input_tokens_seen": 164085760, + "step": 20030 + }, + { + "epoch": 0.9623049219687875, + "grad_norm": 0.47681140899658203, + "learning_rate": 1.9318018246905488e-07, + "loss": 0.8378, + "num_input_tokens_seen": 164167680, + "step": 20040 + }, + { + "epoch": 0.9627851140456183, + "grad_norm": 0.4583689272403717, + "learning_rate": 1.8829589173468552e-07, + "loss": 0.8883, + "num_input_tokens_seen": 164249600, + "step": 20050 + }, + { + "epoch": 0.963265306122449, + "grad_norm": 0.3779040575027466, + "learning_rate": 1.8347390699817724e-07, + "loss": 0.7442, + "num_input_tokens_seen": 164331520, + "step": 20060 + }, + { + "epoch": 0.9637454981992797, + "grad_norm": 0.571790874004364, + "learning_rate": 1.7871424036828288e-07, + "loss": 0.9274, + "num_input_tokens_seen": 164413440, + "step": 20070 + }, + { + "epoch": 0.9642256902761105, + "grad_norm": 0.5354510545730591, + "learning_rate": 1.7401690379724722e-07, + "loss": 0.9755, + "num_input_tokens_seen": 164495360, + "step": 20080 + }, + { + "epoch": 0.9647058823529412, + "grad_norm": 0.5626926422119141, + "learning_rate": 1.6938190908080688e-07, + "loss": 0.8417, + "num_input_tokens_seen": 164577280, + "step": 20090 + }, + { + "epoch": 0.9651860744297719, + "grad_norm": 0.5134274363517761, + "learning_rate": 1.6480926785814866e-07, + "loss": 0.7835, + "num_input_tokens_seen": 164659200, + "step": 20100 + }, + { + "epoch": 0.9656662665066026, + "grad_norm": 0.5222331881523132, + "learning_rate": 1.6029899161187079e-07, + "loss": 1.0117, + "num_input_tokens_seen": 164741120, + "step": 20110 + }, + { + "epoch": 0.9661464585834334, + "grad_norm": 0.5277162194252014, + "learning_rate": 1.5585109166796896e-07, + "loss": 0.7976, + "num_input_tokens_seen": 164823040, + "step": 20120 + }, + { + "epoch": 0.9666266506602641, + "grad_norm": 0.5840759873390198, + "learning_rate": 1.5146557919581138e-07, + "loss": 0.9373, + "num_input_tokens_seen": 164904960, + "step": 20130 + }, + { + "epoch": 0.9671068427370948, + "grad_norm": 0.4930008053779602, + "learning_rate": 1.4714246520808328e-07, + "loss": 0.717, + "num_input_tokens_seen": 164986880, + "step": 20140 + }, + { + "epoch": 0.9675870348139256, + "grad_norm": 0.49782538414001465, + "learning_rate": 1.4288176056079238e-07, + "loss": 0.8526, + "num_input_tokens_seen": 165068800, + "step": 20150 + }, + { + "epoch": 0.9680672268907563, + "grad_norm": 0.48981910943984985, + "learning_rate": 1.3868347595322184e-07, + "loss": 0.8959, + "num_input_tokens_seen": 165150720, + "step": 20160 + }, + { + "epoch": 0.968547418967587, + "grad_norm": 0.46395954489707947, + "learning_rate": 1.3454762192790794e-07, + "loss": 1.1636, + "num_input_tokens_seen": 165232640, + "step": 20170 + }, + { + "epoch": 0.9690276110444178, + "grad_norm": 0.4878246784210205, + "learning_rate": 1.3047420887061513e-07, + "loss": 1.0033, + "num_input_tokens_seen": 165314560, + "step": 20180 + }, + { + "epoch": 0.9695078031212485, + "grad_norm": 0.45499855279922485, + "learning_rate": 1.264632470103111e-07, + "loss": 0.8926, + "num_input_tokens_seen": 165396480, + "step": 20190 + }, + { + "epoch": 0.9699879951980792, + "grad_norm": 0.4934712052345276, + "learning_rate": 1.225147464191334e-07, + "loss": 0.9251, + "num_input_tokens_seen": 165478400, + "step": 20200 + }, + { + "epoch": 0.9704681872749099, + "grad_norm": 0.48829373717308044, + "learning_rate": 1.1862871701237288e-07, + "loss": 0.754, + "num_input_tokens_seen": 165560320, + "step": 20210 + }, + { + "epoch": 0.9709483793517407, + "grad_norm": 0.4913751184940338, + "learning_rate": 1.1480516854844858e-07, + "loss": 0.799, + "num_input_tokens_seen": 165642240, + "step": 20220 + }, + { + "epoch": 0.9714285714285714, + "grad_norm": 0.5303522944450378, + "learning_rate": 1.1104411062887732e-07, + "loss": 0.797, + "num_input_tokens_seen": 165724160, + "step": 20230 + }, + { + "epoch": 0.9719087635054021, + "grad_norm": 0.4915572702884674, + "learning_rate": 1.0734555269825141e-07, + "loss": 0.7708, + "num_input_tokens_seen": 165806080, + "step": 20240 + }, + { + "epoch": 0.9723889555822329, + "grad_norm": 0.5113864541053772, + "learning_rate": 1.0370950404421931e-07, + "loss": 0.7448, + "num_input_tokens_seen": 165888000, + "step": 20250 + }, + { + "epoch": 0.9728691476590636, + "grad_norm": 0.49266868829727173, + "learning_rate": 1.0013597379745776e-07, + "loss": 0.8983, + "num_input_tokens_seen": 165969920, + "step": 20260 + }, + { + "epoch": 0.9733493397358943, + "grad_norm": 0.9062735438346863, + "learning_rate": 9.662497093164691e-08, + "loss": 0.7648, + "num_input_tokens_seen": 166051840, + "step": 20270 + }, + { + "epoch": 0.9738295318127251, + "grad_norm": 0.5166149139404297, + "learning_rate": 9.317650426345637e-08, + "loss": 0.9752, + "num_input_tokens_seen": 166133760, + "step": 20280 + }, + { + "epoch": 0.9743097238895558, + "grad_norm": 0.4873685836791992, + "learning_rate": 8.979058245251193e-08, + "loss": 0.8632, + "num_input_tokens_seen": 166215680, + "step": 20290 + }, + { + "epoch": 0.9747899159663865, + "grad_norm": 0.48087936639785767, + "learning_rate": 8.646721400138724e-08, + "loss": 1.0177, + "num_input_tokens_seen": 166297600, + "step": 20300 + }, + { + "epoch": 0.9752701080432172, + "grad_norm": 0.4865896999835968, + "learning_rate": 8.320640725556773e-08, + "loss": 0.9075, + "num_input_tokens_seen": 166379520, + "step": 20310 + }, + { + "epoch": 0.975750300120048, + "grad_norm": 0.4811105728149414, + "learning_rate": 8.000817040344222e-08, + "loss": 0.9434, + "num_input_tokens_seen": 166461440, + "step": 20320 + }, + { + "epoch": 0.9762304921968787, + "grad_norm": 0.47913140058517456, + "learning_rate": 7.687251147627251e-08, + "loss": 0.8307, + "num_input_tokens_seen": 166543360, + "step": 20330 + }, + { + "epoch": 0.9767106842737094, + "grad_norm": 0.4741958677768707, + "learning_rate": 7.379943834818214e-08, + "loss": 0.7942, + "num_input_tokens_seen": 166625280, + "step": 20340 + }, + { + "epoch": 0.9771908763505402, + "grad_norm": 0.492371529340744, + "learning_rate": 7.0788958736126e-08, + "loss": 0.6887, + "num_input_tokens_seen": 166707200, + "step": 20350 + }, + { + "epoch": 0.977671068427371, + "grad_norm": 0.5118972659111023, + "learning_rate": 6.784108019988189e-08, + "loss": 0.8574, + "num_input_tokens_seen": 166789120, + "step": 20360 + }, + { + "epoch": 0.9781512605042016, + "grad_norm": 0.387319952249527, + "learning_rate": 6.495581014202556e-08, + "loss": 0.7726, + "num_input_tokens_seen": 166871040, + "step": 20370 + }, + { + "epoch": 0.9786314525810325, + "grad_norm": 0.494081050157547, + "learning_rate": 6.213315580791135e-08, + "loss": 0.9247, + "num_input_tokens_seen": 166952960, + "step": 20380 + }, + { + "epoch": 0.9791116446578632, + "grad_norm": 0.5668483376502991, + "learning_rate": 5.9373124285661e-08, + "loss": 0.8177, + "num_input_tokens_seen": 167034880, + "step": 20390 + }, + { + "epoch": 0.9795918367346939, + "grad_norm": 0.5243094563484192, + "learning_rate": 5.6675722506135956e-08, + "loss": 0.8603, + "num_input_tokens_seen": 167116800, + "step": 20400 + }, + { + "epoch": 0.9800720288115247, + "grad_norm": 0.5012032389640808, + "learning_rate": 5.404095724292346e-08, + "loss": 0.9751, + "num_input_tokens_seen": 167198720, + "step": 20410 + }, + { + "epoch": 0.9805522208883554, + "grad_norm": 0.48027274012565613, + "learning_rate": 5.146883511232825e-08, + "loss": 0.9047, + "num_input_tokens_seen": 167280640, + "step": 20420 + }, + { + "epoch": 0.9810324129651861, + "grad_norm": 0.9182277321815491, + "learning_rate": 4.8959362573341995e-08, + "loss": 0.8816, + "num_input_tokens_seen": 167362560, + "step": 20430 + }, + { + "epoch": 0.9815126050420168, + "grad_norm": 0.47748062014579773, + "learning_rate": 4.6512545927632213e-08, + "loss": 0.8182, + "num_input_tokens_seen": 167444480, + "step": 20440 + }, + { + "epoch": 0.9819927971188476, + "grad_norm": 0.5384604930877686, + "learning_rate": 4.412839131953395e-08, + "loss": 0.8999, + "num_input_tokens_seen": 167526400, + "step": 20450 + }, + { + "epoch": 0.9824729891956783, + "grad_norm": 0.49782511591911316, + "learning_rate": 4.180690473602755e-08, + "loss": 0.9972, + "num_input_tokens_seen": 167608320, + "step": 20460 + }, + { + "epoch": 0.982953181272509, + "grad_norm": 0.46127596497535706, + "learning_rate": 3.9548092006719275e-08, + "loss": 0.7583, + "num_input_tokens_seen": 167690240, + "step": 20470 + }, + { + "epoch": 0.9834333733493398, + "grad_norm": 0.8296525478363037, + "learning_rate": 3.7351958803835685e-08, + "loss": 1.0416, + "num_input_tokens_seen": 167772160, + "step": 20480 + }, + { + "epoch": 0.9839135654261705, + "grad_norm": 0.48525142669677734, + "learning_rate": 3.5218510642201496e-08, + "loss": 1.1318, + "num_input_tokens_seen": 167854080, + "step": 20490 + }, + { + "epoch": 0.9843937575030012, + "grad_norm": 0.656819224357605, + "learning_rate": 3.314775287923677e-08, + "loss": 1.0568, + "num_input_tokens_seen": 167936000, + "step": 20500 + }, + { + "epoch": 0.984873949579832, + "grad_norm": 0.49924975633621216, + "learning_rate": 3.1139690714931945e-08, + "loss": 0.9061, + "num_input_tokens_seen": 168017920, + "step": 20510 + }, + { + "epoch": 0.9853541416566627, + "grad_norm": 0.47021421790122986, + "learning_rate": 2.919432919183396e-08, + "loss": 0.7413, + "num_input_tokens_seen": 168099840, + "step": 20520 + }, + { + "epoch": 0.9858343337334934, + "grad_norm": 0.4875717759132385, + "learning_rate": 2.731167319505179e-08, + "loss": 0.7303, + "num_input_tokens_seen": 168181760, + "step": 20530 + }, + { + "epoch": 0.9863145258103241, + "grad_norm": 0.47753021121025085, + "learning_rate": 2.5491727452217616e-08, + "loss": 0.763, + "num_input_tokens_seen": 168263680, + "step": 20540 + }, + { + "epoch": 0.9867947178871549, + "grad_norm": 1.1384245157241821, + "learning_rate": 2.3734496533497907e-08, + "loss": 0.9044, + "num_input_tokens_seen": 168345600, + "step": 20550 + }, + { + "epoch": 0.9872749099639856, + "grad_norm": 0.47026336193084717, + "learning_rate": 2.203998485156844e-08, + "loss": 0.9609, + "num_input_tokens_seen": 168427520, + "step": 20560 + }, + { + "epoch": 0.9877551020408163, + "grad_norm": 0.48212888836860657, + "learning_rate": 2.040819666160876e-08, + "loss": 1.0561, + "num_input_tokens_seen": 168509440, + "step": 20570 + }, + { + "epoch": 0.9882352941176471, + "grad_norm": 0.5447530150413513, + "learning_rate": 1.8839136061288288e-08, + "loss": 1.0402, + "num_input_tokens_seen": 168591360, + "step": 20580 + }, + { + "epoch": 0.9887154861944778, + "grad_norm": 0.5337615609169006, + "learning_rate": 1.7332806990758012e-08, + "loss": 1.1063, + "num_input_tokens_seen": 168673280, + "step": 20590 + }, + { + "epoch": 0.9891956782713085, + "grad_norm": 0.4831818640232086, + "learning_rate": 1.5889213232644917e-08, + "loss": 0.8762, + "num_input_tokens_seen": 168755200, + "step": 20600 + }, + { + "epoch": 0.9896758703481393, + "grad_norm": 0.5846928954124451, + "learning_rate": 1.4508358412032575e-08, + "loss": 1.2618, + "num_input_tokens_seen": 168837120, + "step": 20610 + }, + { + "epoch": 0.99015606242497, + "grad_norm": 0.4968603253364563, + "learning_rate": 1.319024599645835e-08, + "loss": 0.959, + "num_input_tokens_seen": 168919040, + "step": 20620 + }, + { + "epoch": 0.9906362545018007, + "grad_norm": 0.5029283165931702, + "learning_rate": 1.1934879295905089e-08, + "loss": 0.7467, + "num_input_tokens_seen": 169000960, + "step": 20630 + }, + { + "epoch": 0.9911164465786314, + "grad_norm": 0.5078738331794739, + "learning_rate": 1.074226146279278e-08, + "loss": 1.0644, + "num_input_tokens_seen": 169082880, + "step": 20640 + }, + { + "epoch": 0.9915966386554622, + "grad_norm": 0.44298240542411804, + "learning_rate": 9.612395491970239e-09, + "loss": 1.036, + "num_input_tokens_seen": 169164800, + "step": 20650 + }, + { + "epoch": 0.9920768307322929, + "grad_norm": 0.480905681848526, + "learning_rate": 8.545284220698446e-09, + "loss": 0.8798, + "num_input_tokens_seen": 169246720, + "step": 20660 + }, + { + "epoch": 0.9925570228091236, + "grad_norm": 0.4742453396320343, + "learning_rate": 7.540930328658879e-09, + "loss": 0.848, + "num_input_tokens_seen": 169328640, + "step": 20670 + }, + { + "epoch": 0.9930372148859544, + "grad_norm": 0.4936705529689789, + "learning_rate": 6.599336337942408e-09, + "loss": 1.0114, + "num_input_tokens_seen": 169410560, + "step": 20680 + }, + { + "epoch": 0.9935174069627851, + "grad_norm": 0.3984171152114868, + "learning_rate": 5.720504613035416e-09, + "loss": 0.7468, + "num_input_tokens_seen": 169492480, + "step": 20690 + }, + { + "epoch": 0.9939975990396158, + "grad_norm": 0.5437182188034058, + "learning_rate": 4.904437360814252e-09, + "loss": 0.8409, + "num_input_tokens_seen": 169574400, + "step": 20700 + }, + { + "epoch": 0.9944777911164466, + "grad_norm": 0.4836597144603729, + "learning_rate": 4.151136630553554e-09, + "loss": 1.024, + "num_input_tokens_seen": 169656320, + "step": 20710 + }, + { + "epoch": 0.9949579831932773, + "grad_norm": 0.4973140358924866, + "learning_rate": 3.4606043139068234e-09, + "loss": 1.1108, + "num_input_tokens_seen": 169738240, + "step": 20720 + }, + { + "epoch": 0.995438175270108, + "grad_norm": 0.4446418583393097, + "learning_rate": 2.832842144903647e-09, + "loss": 0.9724, + "num_input_tokens_seen": 169820160, + "step": 20730 + }, + { + "epoch": 0.9959183673469387, + "grad_norm": 0.9471971392631531, + "learning_rate": 2.2678516999552478e-09, + "loss": 0.7679, + "num_input_tokens_seen": 169902080, + "step": 20740 + }, + { + "epoch": 0.9963985594237695, + "grad_norm": 0.5114956498146057, + "learning_rate": 1.7656343978378342e-09, + "loss": 0.9723, + "num_input_tokens_seen": 169984000, + "step": 20750 + }, + { + "epoch": 0.9968787515006002, + "grad_norm": 0.4692475497722626, + "learning_rate": 1.3261914996953728e-09, + "loss": 1.1212, + "num_input_tokens_seen": 170065920, + "step": 20760 + }, + { + "epoch": 0.9973589435774309, + "grad_norm": 0.49446624517440796, + "learning_rate": 9.49524109034039e-10, + "loss": 0.9877, + "num_input_tokens_seen": 170147840, + "step": 20770 + }, + { + "epoch": 0.9978391356542617, + "grad_norm": 0.47214624285697937, + "learning_rate": 6.356331717305431e-10, + "loss": 0.8818, + "num_input_tokens_seen": 170229760, + "step": 20780 + }, + { + "epoch": 0.9983193277310924, + "grad_norm": 0.5106399655342102, + "learning_rate": 3.8451947600437466e-10, + "loss": 0.9168, + "num_input_tokens_seen": 170311680, + "step": 20790 + }, + { + "epoch": 0.9987995198079231, + "grad_norm": 0.4835795760154724, + "learning_rate": 1.9618365244833404e-10, + "loss": 0.8026, + "num_input_tokens_seen": 170393600, + "step": 20800 + }, + { + "epoch": 0.999279711884754, + "grad_norm": 0.5011347532272339, + "learning_rate": 7.062617399800075e-11, + "loss": 0.9025, + "num_input_tokens_seen": 170475520, + "step": 20810 + }, + { + "epoch": 0.9997599039615847, + "grad_norm": 0.48915189504623413, + "learning_rate": 7.847355951162705e-12, + "loss": 0.9373, + "num_input_tokens_seen": 170557440, + "step": 20820 + }, { "epoch": 1.0, - "num_input_tokens_seen": 71589888, - "step": 8739, - "total_flos": 3.2596926707332547e+18, - "train_loss": 1.260579330722832, - "train_runtime": 44987.2731, - "train_samples_per_second": 0.194, - "train_steps_per_second": 0.194 + "num_input_tokens_seen": 170598400, + "step": 20825, + "total_flos": 7.767833833163981e+18, + "train_loss": 0.9246458945440359, + "train_runtime": 106837.984, + "train_samples_per_second": 0.195, + "train_steps_per_second": 0.195 } ], "logging_steps": 10, - "max_steps": 8739, - "num_input_tokens_seen": 71589888, + "max_steps": 20825, + "num_input_tokens_seen": 170598400, "num_train_epochs": 1, "save_steps": 1000, "stateful_callbacks": { @@ -7020,7 +16692,7 @@ "attributes": {} } }, - "total_flos": 3.2596926707332547e+18, + "total_flos": 7.767833833163981e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null