diff --git "a/Luminia-8B-RP/trainer_state.json" "b/Luminia-8B-RP/trainer_state.json" new file mode 100644--- /dev/null +++ "b/Luminia-8B-RP/trainer_state.json" @@ -0,0 +1,7027 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 8739, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0011442956860052637, + "grad_norm": 0.6716632843017578, + "learning_rate": 5.000000000000001e-07, + "loss": 1.5075, + "num_input_tokens_seen": 81920, + "step": 10 + }, + { + "epoch": 0.0022885913720105274, + "grad_norm": 0.8028005957603455, + "learning_rate": 1.0000000000000002e-06, + "loss": 1.379, + "num_input_tokens_seen": 163840, + "step": 20 + }, + { + "epoch": 0.0034328870580157913, + "grad_norm": 0.6309388279914856, + "learning_rate": 1.5e-06, + "loss": 1.6386, + "num_input_tokens_seen": 245760, + "step": 30 + }, + { + "epoch": 0.004577182744021055, + "grad_norm": 2.590484380722046, + "learning_rate": 2.0000000000000003e-06, + "loss": 1.5525, + "num_input_tokens_seen": 327680, + "step": 40 + }, + { + "epoch": 0.005721478430026319, + "grad_norm": 0.8970260620117188, + "learning_rate": 2.5e-06, + "loss": 1.651, + "num_input_tokens_seen": 409600, + "step": 50 + }, + { + "epoch": 0.006865774116031583, + "grad_norm": 0.7371882796287537, + "learning_rate": 3e-06, + "loss": 1.5067, + "num_input_tokens_seen": 491520, + "step": 60 + }, + { + "epoch": 0.008010069802036847, + "grad_norm": 2.1576790809631348, + "learning_rate": 3.5000000000000004e-06, + "loss": 1.3367, + "num_input_tokens_seen": 573440, + "step": 70 + }, + { + "epoch": 0.00915436548804211, + "grad_norm": 0.8213880658149719, + "learning_rate": 4.000000000000001e-06, + "loss": 1.6071, + "num_input_tokens_seen": 655360, + "step": 80 + }, + { + "epoch": 0.010298661174047374, + "grad_norm": 1.3453713655471802, + "learning_rate": 4.5e-06, + "loss": 1.522, + "num_input_tokens_seen": 737280, + "step": 90 + }, + { + "epoch": 0.011442956860052637, + "grad_norm": 1.1660873889923096, + "learning_rate": 5e-06, + "loss": 1.5853, + "num_input_tokens_seen": 819200, + "step": 100 + }, + { + "epoch": 0.012587252546057902, + "grad_norm": 1.4795037508010864, + "learning_rate": 5.500000000000001e-06, + "loss": 1.4081, + "num_input_tokens_seen": 901120, + "step": 110 + }, + { + "epoch": 0.013731548232063165, + "grad_norm": 2.582639455795288, + "learning_rate": 6e-06, + "loss": 1.5405, + "num_input_tokens_seen": 983040, + "step": 120 + }, + { + "epoch": 0.014875843918068428, + "grad_norm": 1.1922495365142822, + "learning_rate": 6.5000000000000004e-06, + "loss": 1.3632, + "num_input_tokens_seen": 1064960, + "step": 130 + }, + { + "epoch": 0.016020139604073693, + "grad_norm": 1.249202847480774, + "learning_rate": 7.000000000000001e-06, + "loss": 1.128, + "num_input_tokens_seen": 1146880, + "step": 140 + }, + { + "epoch": 0.017164435290078956, + "grad_norm": 2.9147748947143555, + "learning_rate": 7.5e-06, + "loss": 1.3543, + "num_input_tokens_seen": 1228800, + "step": 150 + }, + { + "epoch": 0.01830873097608422, + "grad_norm": 1.3228706121444702, + "learning_rate": 8.000000000000001e-06, + "loss": 1.3541, + "num_input_tokens_seen": 1310720, + "step": 160 + }, + { + "epoch": 0.019453026662089482, + "grad_norm": 1.2447155714035034, + "learning_rate": 8.500000000000002e-06, + "loss": 1.4476, + "num_input_tokens_seen": 1392640, + "step": 170 + }, + { + "epoch": 0.02059732234809475, + "grad_norm": 1.2924906015396118, + "learning_rate": 9e-06, + "loss": 1.212, + "num_input_tokens_seen": 1474560, + "step": 180 + }, + { + "epoch": 0.021741618034100012, + "grad_norm": 1.352099061012268, + "learning_rate": 9.5e-06, + "loss": 1.3312, + "num_input_tokens_seen": 1556480, + "step": 190 + }, + { + "epoch": 0.022885913720105275, + "grad_norm": 2.618453025817871, + "learning_rate": 1e-05, + "loss": 1.3266, + "num_input_tokens_seen": 1638400, + "step": 200 + }, + { + "epoch": 0.024030209406110538, + "grad_norm": 1.2627308368682861, + "learning_rate": 1.05e-05, + "loss": 1.3106, + "num_input_tokens_seen": 1720320, + "step": 210 + }, + { + "epoch": 0.025174505092115804, + "grad_norm": 0.9320095181465149, + "learning_rate": 1.1000000000000001e-05, + "loss": 1.4398, + "num_input_tokens_seen": 1802240, + "step": 220 + }, + { + "epoch": 0.026318800778121067, + "grad_norm": 1.7634745836257935, + "learning_rate": 1.1500000000000002e-05, + "loss": 1.1946, + "num_input_tokens_seen": 1884160, + "step": 230 + }, + { + "epoch": 0.02746309646412633, + "grad_norm": 1.1597224473953247, + "learning_rate": 1.2e-05, + "loss": 1.2912, + "num_input_tokens_seen": 1966080, + "step": 240 + }, + { + "epoch": 0.028607392150131594, + "grad_norm": 1.1823091506958008, + "learning_rate": 1.25e-05, + "loss": 1.5374, + "num_input_tokens_seen": 2048000, + "step": 250 + }, + { + "epoch": 0.029751687836136857, + "grad_norm": 0.88273024559021, + "learning_rate": 1.3000000000000001e-05, + "loss": 1.4559, + "num_input_tokens_seen": 2129920, + "step": 260 + }, + { + "epoch": 0.030895983522142123, + "grad_norm": 1.2335752248764038, + "learning_rate": 1.3500000000000001e-05, + "loss": 1.4361, + "num_input_tokens_seen": 2211840, + "step": 270 + }, + { + "epoch": 0.032040279208147386, + "grad_norm": 0.8680986166000366, + "learning_rate": 1.4000000000000001e-05, + "loss": 1.7506, + "num_input_tokens_seen": 2293760, + "step": 280 + }, + { + "epoch": 0.03318457489415265, + "grad_norm": 0.8861328363418579, + "learning_rate": 1.45e-05, + "loss": 1.5946, + "num_input_tokens_seen": 2375680, + "step": 290 + }, + { + "epoch": 0.03432887058015791, + "grad_norm": 0.8510925769805908, + "learning_rate": 1.5e-05, + "loss": 1.386, + "num_input_tokens_seen": 2457600, + "step": 300 + }, + { + "epoch": 0.035473166266163175, + "grad_norm": 0.961478054523468, + "learning_rate": 1.55e-05, + "loss": 1.1091, + "num_input_tokens_seen": 2539520, + "step": 310 + }, + { + "epoch": 0.03661746195216844, + "grad_norm": 0.8009458780288696, + "learning_rate": 1.6000000000000003e-05, + "loss": 1.3287, + "num_input_tokens_seen": 2621440, + "step": 320 + }, + { + "epoch": 0.0377617576381737, + "grad_norm": 1.1145437955856323, + "learning_rate": 1.65e-05, + "loss": 1.2537, + "num_input_tokens_seen": 2703360, + "step": 330 + }, + { + "epoch": 0.038906053324178964, + "grad_norm": 0.7063050270080566, + "learning_rate": 1.7000000000000003e-05, + "loss": 1.4166, + "num_input_tokens_seen": 2785280, + "step": 340 + }, + { + "epoch": 0.040050349010184234, + "grad_norm": 0.8494574427604675, + "learning_rate": 1.75e-05, + "loss": 1.298, + "num_input_tokens_seen": 2867200, + "step": 350 + }, + { + "epoch": 0.0411946446961895, + "grad_norm": 0.8032189011573792, + "learning_rate": 1.8e-05, + "loss": 1.4127, + "num_input_tokens_seen": 2949120, + "step": 360 + }, + { + "epoch": 0.04233894038219476, + "grad_norm": 0.6479071974754333, + "learning_rate": 1.85e-05, + "loss": 1.1462, + "num_input_tokens_seen": 3031040, + "step": 370 + }, + { + "epoch": 0.043483236068200024, + "grad_norm": 1.165413737297058, + "learning_rate": 1.9e-05, + "loss": 1.5957, + "num_input_tokens_seen": 3112960, + "step": 380 + }, + { + "epoch": 0.04462753175420529, + "grad_norm": 0.8543304800987244, + "learning_rate": 1.9500000000000003e-05, + "loss": 1.3672, + "num_input_tokens_seen": 3194880, + "step": 390 + }, + { + "epoch": 0.04577182744021055, + "grad_norm": 0.8080208897590637, + "learning_rate": 2e-05, + "loss": 1.2616, + "num_input_tokens_seen": 3276800, + "step": 400 + }, + { + "epoch": 0.04691612312621581, + "grad_norm": 0.810095489025116, + "learning_rate": 2.05e-05, + "loss": 1.1038, + "num_input_tokens_seen": 3358720, + "step": 410 + }, + { + "epoch": 0.048060418812221076, + "grad_norm": 0.6553860902786255, + "learning_rate": 2.1e-05, + "loss": 1.3898, + "num_input_tokens_seen": 3440640, + "step": 420 + }, + { + "epoch": 0.04920471449822634, + "grad_norm": 3.40867018699646, + "learning_rate": 2.15e-05, + "loss": 1.2665, + "num_input_tokens_seen": 3522560, + "step": 430 + }, + { + "epoch": 0.05034901018423161, + "grad_norm": 0.6455990076065063, + "learning_rate": 2.2000000000000003e-05, + "loss": 1.188, + "num_input_tokens_seen": 3604480, + "step": 440 + }, + { + "epoch": 0.05149330587023687, + "grad_norm": 0.5759983658790588, + "learning_rate": 2.25e-05, + "loss": 1.4607, + "num_input_tokens_seen": 3686400, + "step": 450 + }, + { + "epoch": 0.052637601556242135, + "grad_norm": 1.409822940826416, + "learning_rate": 2.3000000000000003e-05, + "loss": 1.1622, + "num_input_tokens_seen": 3768320, + "step": 460 + }, + { + "epoch": 0.0537818972422474, + "grad_norm": 0.6812942028045654, + "learning_rate": 2.35e-05, + "loss": 1.3681, + "num_input_tokens_seen": 3850240, + "step": 470 + }, + { + "epoch": 0.05492619292825266, + "grad_norm": 1.1562724113464355, + "learning_rate": 2.4e-05, + "loss": 1.341, + "num_input_tokens_seen": 3932160, + "step": 480 + }, + { + "epoch": 0.056070488614257924, + "grad_norm": 1.1834927797317505, + "learning_rate": 2.45e-05, + "loss": 1.0104, + "num_input_tokens_seen": 4014080, + "step": 490 + }, + { + "epoch": 0.05721478430026319, + "grad_norm": 0.6955065727233887, + "learning_rate": 2.5e-05, + "loss": 1.4473, + "num_input_tokens_seen": 4096000, + "step": 500 + }, + { + "epoch": 0.05835907998626845, + "grad_norm": 5.183798313140869, + "learning_rate": 2.5500000000000003e-05, + "loss": 1.5255, + "num_input_tokens_seen": 4177920, + "step": 510 + }, + { + "epoch": 0.05950337567227371, + "grad_norm": 0.6789785027503967, + "learning_rate": 2.6000000000000002e-05, + "loss": 1.3778, + "num_input_tokens_seen": 4259840, + "step": 520 + }, + { + "epoch": 0.060647671358278976, + "grad_norm": 0.5913729667663574, + "learning_rate": 2.6500000000000004e-05, + "loss": 1.1477, + "num_input_tokens_seen": 4341760, + "step": 530 + }, + { + "epoch": 0.061791967044284246, + "grad_norm": 1.2051905393600464, + "learning_rate": 2.7000000000000002e-05, + "loss": 1.4827, + "num_input_tokens_seen": 4423680, + "step": 540 + }, + { + "epoch": 0.0629362627302895, + "grad_norm": 0.5383505821228027, + "learning_rate": 2.7500000000000004e-05, + "loss": 1.635, + "num_input_tokens_seen": 4505600, + "step": 550 + }, + { + "epoch": 0.06408055841629477, + "grad_norm": 0.6724018454551697, + "learning_rate": 2.8000000000000003e-05, + "loss": 1.2354, + "num_input_tokens_seen": 4587520, + "step": 560 + }, + { + "epoch": 0.06522485410230003, + "grad_norm": 0.8346174359321594, + "learning_rate": 2.8499999999999998e-05, + "loss": 1.3294, + "num_input_tokens_seen": 4669440, + "step": 570 + }, + { + "epoch": 0.0663691497883053, + "grad_norm": 0.624640703201294, + "learning_rate": 2.9e-05, + "loss": 1.3841, + "num_input_tokens_seen": 4751360, + "step": 580 + }, + { + "epoch": 0.06751344547431057, + "grad_norm": 0.5437624454498291, + "learning_rate": 2.95e-05, + "loss": 1.3379, + "num_input_tokens_seen": 4833280, + "step": 590 + }, + { + "epoch": 0.06865774116031582, + "grad_norm": 0.5260199308395386, + "learning_rate": 3e-05, + "loss": 1.187, + "num_input_tokens_seen": 4915200, + "step": 600 + }, + { + "epoch": 0.0698020368463211, + "grad_norm": 0.6440827250480652, + "learning_rate": 3.05e-05, + "loss": 1.5078, + "num_input_tokens_seen": 4997120, + "step": 610 + }, + { + "epoch": 0.07094633253232635, + "grad_norm": 0.8612675666809082, + "learning_rate": 3.1e-05, + "loss": 1.4624, + "num_input_tokens_seen": 5079040, + "step": 620 + }, + { + "epoch": 0.07209062821833162, + "grad_norm": 0.5070405602455139, + "learning_rate": 3.15e-05, + "loss": 1.2244, + "num_input_tokens_seen": 5160960, + "step": 630 + }, + { + "epoch": 0.07323492390433688, + "grad_norm": 0.6587413549423218, + "learning_rate": 3.2000000000000005e-05, + "loss": 1.4688, + "num_input_tokens_seen": 5242880, + "step": 640 + }, + { + "epoch": 0.07437921959034215, + "grad_norm": 0.6212195754051208, + "learning_rate": 3.2500000000000004e-05, + "loss": 1.1677, + "num_input_tokens_seen": 5324800, + "step": 650 + }, + { + "epoch": 0.0755235152763474, + "grad_norm": 1.031207799911499, + "learning_rate": 3.3e-05, + "loss": 1.2562, + "num_input_tokens_seen": 5406720, + "step": 660 + }, + { + "epoch": 0.07666781096235267, + "grad_norm": 0.5637765526771545, + "learning_rate": 3.35e-05, + "loss": 1.3111, + "num_input_tokens_seen": 5488640, + "step": 670 + }, + { + "epoch": 0.07781210664835793, + "grad_norm": 1.6246665716171265, + "learning_rate": 3.4000000000000007e-05, + "loss": 1.2953, + "num_input_tokens_seen": 5570560, + "step": 680 + }, + { + "epoch": 0.0789564023343632, + "grad_norm": 0.5538053512573242, + "learning_rate": 3.45e-05, + "loss": 1.2493, + "num_input_tokens_seen": 5652480, + "step": 690 + }, + { + "epoch": 0.08010069802036847, + "grad_norm": 0.566648542881012, + "learning_rate": 3.5e-05, + "loss": 1.3966, + "num_input_tokens_seen": 5734400, + "step": 700 + }, + { + "epoch": 0.08124499370637372, + "grad_norm": 0.5915348529815674, + "learning_rate": 3.55e-05, + "loss": 1.1898, + "num_input_tokens_seen": 5816320, + "step": 710 + }, + { + "epoch": 0.082389289392379, + "grad_norm": 0.5097474455833435, + "learning_rate": 3.6e-05, + "loss": 1.3558, + "num_input_tokens_seen": 5898240, + "step": 720 + }, + { + "epoch": 0.08353358507838425, + "grad_norm": 0.6351694464683533, + "learning_rate": 3.65e-05, + "loss": 1.6577, + "num_input_tokens_seen": 5980160, + "step": 730 + }, + { + "epoch": 0.08467788076438952, + "grad_norm": 0.5961860418319702, + "learning_rate": 3.7e-05, + "loss": 1.2571, + "num_input_tokens_seen": 6062080, + "step": 740 + }, + { + "epoch": 0.08582217645039478, + "grad_norm": 0.7016868591308594, + "learning_rate": 3.7500000000000003e-05, + "loss": 1.3735, + "num_input_tokens_seen": 6144000, + "step": 750 + }, + { + "epoch": 0.08696647213640005, + "grad_norm": 0.5575875043869019, + "learning_rate": 3.8e-05, + "loss": 1.2731, + "num_input_tokens_seen": 6225920, + "step": 760 + }, + { + "epoch": 0.0881107678224053, + "grad_norm": 1.047059416770935, + "learning_rate": 3.85e-05, + "loss": 1.543, + "num_input_tokens_seen": 6307840, + "step": 770 + }, + { + "epoch": 0.08925506350841057, + "grad_norm": 0.8251460790634155, + "learning_rate": 3.9000000000000006e-05, + "loss": 1.1869, + "num_input_tokens_seen": 6389760, + "step": 780 + }, + { + "epoch": 0.09039935919441584, + "grad_norm": 0.6270453333854675, + "learning_rate": 3.9500000000000005e-05, + "loss": 1.6793, + "num_input_tokens_seen": 6471680, + "step": 790 + }, + { + "epoch": 0.0915436548804211, + "grad_norm": 0.6033393740653992, + "learning_rate": 4e-05, + "loss": 1.7236, + "num_input_tokens_seen": 6553600, + "step": 800 + }, + { + "epoch": 0.09268795056642637, + "grad_norm": 1.2078917026519775, + "learning_rate": 4.05e-05, + "loss": 1.1466, + "num_input_tokens_seen": 6635520, + "step": 810 + }, + { + "epoch": 0.09383224625243163, + "grad_norm": 0.6712398529052734, + "learning_rate": 4.1e-05, + "loss": 1.308, + "num_input_tokens_seen": 6717440, + "step": 820 + }, + { + "epoch": 0.0949765419384369, + "grad_norm": 0.5450591444969177, + "learning_rate": 4.15e-05, + "loss": 1.1448, + "num_input_tokens_seen": 6799360, + "step": 830 + }, + { + "epoch": 0.09612083762444215, + "grad_norm": 0.6224645376205444, + "learning_rate": 4.2e-05, + "loss": 1.1618, + "num_input_tokens_seen": 6881280, + "step": 840 + }, + { + "epoch": 0.09726513331044742, + "grad_norm": 0.8741360306739807, + "learning_rate": 4.25e-05, + "loss": 1.3667, + "num_input_tokens_seen": 6963200, + "step": 850 + }, + { + "epoch": 0.09840942899645268, + "grad_norm": 0.6075563430786133, + "learning_rate": 4.3e-05, + "loss": 1.3752, + "num_input_tokens_seen": 7045120, + "step": 860 + }, + { + "epoch": 0.09955372468245795, + "grad_norm": 0.6345754265785217, + "learning_rate": 4.35e-05, + "loss": 1.3973, + "num_input_tokens_seen": 7127040, + "step": 870 + }, + { + "epoch": 0.10069802036846322, + "grad_norm": 0.7039983868598938, + "learning_rate": 4.4000000000000006e-05, + "loss": 1.3936, + "num_input_tokens_seen": 7208960, + "step": 880 + }, + { + "epoch": 0.10184231605446847, + "grad_norm": 0.6462425589561462, + "learning_rate": 4.4500000000000004e-05, + "loss": 1.2224, + "num_input_tokens_seen": 7290880, + "step": 890 + }, + { + "epoch": 0.10298661174047374, + "grad_norm": 0.6138895153999329, + "learning_rate": 4.5e-05, + "loss": 1.0594, + "num_input_tokens_seen": 7372800, + "step": 900 + }, + { + "epoch": 0.104130907426479, + "grad_norm": 0.5026350021362305, + "learning_rate": 4.55e-05, + "loss": 1.122, + "num_input_tokens_seen": 7454720, + "step": 910 + }, + { + "epoch": 0.10527520311248427, + "grad_norm": 0.6174338459968567, + "learning_rate": 4.600000000000001e-05, + "loss": 1.2711, + "num_input_tokens_seen": 7536640, + "step": 920 + }, + { + "epoch": 0.10641949879848953, + "grad_norm": 0.736929178237915, + "learning_rate": 4.6500000000000005e-05, + "loss": 1.3302, + "num_input_tokens_seen": 7618560, + "step": 930 + }, + { + "epoch": 0.1075637944844948, + "grad_norm": 0.6089776158332825, + "learning_rate": 4.7e-05, + "loss": 1.2869, + "num_input_tokens_seen": 7700480, + "step": 940 + }, + { + "epoch": 0.10870809017050005, + "grad_norm": 0.7438560724258423, + "learning_rate": 4.75e-05, + "loss": 1.2435, + "num_input_tokens_seen": 7782400, + "step": 950 + }, + { + "epoch": 0.10985238585650532, + "grad_norm": 0.5268089175224304, + "learning_rate": 4.8e-05, + "loss": 1.0464, + "num_input_tokens_seen": 7864320, + "step": 960 + }, + { + "epoch": 0.11099668154251058, + "grad_norm": 0.6583835482597351, + "learning_rate": 4.85e-05, + "loss": 1.2715, + "num_input_tokens_seen": 7946240, + "step": 970 + }, + { + "epoch": 0.11214097722851585, + "grad_norm": 1.4557218551635742, + "learning_rate": 4.9e-05, + "loss": 1.4746, + "num_input_tokens_seen": 8028160, + "step": 980 + }, + { + "epoch": 0.11328527291452112, + "grad_norm": 1.0798985958099365, + "learning_rate": 4.9500000000000004e-05, + "loss": 1.1464, + "num_input_tokens_seen": 8110080, + "step": 990 + }, + { + "epoch": 0.11442956860052637, + "grad_norm": 0.5683252811431885, + "learning_rate": 5e-05, + "loss": 1.3901, + "num_input_tokens_seen": 8192000, + "step": 1000 + }, + { + "epoch": 0.11557386428653164, + "grad_norm": 0.6159707903862, + "learning_rate": 4.999979401316311e-05, + "loss": 1.3986, + "num_input_tokens_seen": 8273920, + "step": 1010 + }, + { + "epoch": 0.1167181599725369, + "grad_norm": 0.6029934883117676, + "learning_rate": 4.999917605604688e-05, + "loss": 1.3841, + "num_input_tokens_seen": 8355840, + "step": 1020 + }, + { + "epoch": 0.11786245565854217, + "grad_norm": 0.6061372756958008, + "learning_rate": 4.999814613883459e-05, + "loss": 1.6034, + "num_input_tokens_seen": 8437760, + "step": 1030 + }, + { + "epoch": 0.11900675134454743, + "grad_norm": 0.5895953178405762, + "learning_rate": 4.9996704278498185e-05, + "loss": 1.5685, + "num_input_tokens_seen": 8519680, + "step": 1040 + }, + { + "epoch": 0.1201510470305527, + "grad_norm": 0.6715599894523621, + "learning_rate": 4.9994850498798026e-05, + "loss": 1.4407, + "num_input_tokens_seen": 8601600, + "step": 1050 + }, + { + "epoch": 0.12129534271655795, + "grad_norm": 0.6397563815116882, + "learning_rate": 4.999258483028243e-05, + "loss": 1.2713, + "num_input_tokens_seen": 8683520, + "step": 1060 + }, + { + "epoch": 0.12243963840256322, + "grad_norm": 0.5121895670890808, + "learning_rate": 4.9989907310287243e-05, + "loss": 1.2696, + "num_input_tokens_seen": 8765440, + "step": 1070 + }, + { + "epoch": 0.12358393408856849, + "grad_norm": 0.533311665058136, + "learning_rate": 4.998681798293516e-05, + "loss": 1.1279, + "num_input_tokens_seen": 8847360, + "step": 1080 + }, + { + "epoch": 0.12472822977457375, + "grad_norm": 0.4866422414779663, + "learning_rate": 4.998331689913506e-05, + "loss": 1.3104, + "num_input_tokens_seen": 8929280, + "step": 1090 + }, + { + "epoch": 0.125872525460579, + "grad_norm": 0.6534491181373596, + "learning_rate": 4.9979404116581104e-05, + "loss": 1.2877, + "num_input_tokens_seen": 9011200, + "step": 1100 + }, + { + "epoch": 0.12701682114658427, + "grad_norm": 0.5954148173332214, + "learning_rate": 4.9975079699751825e-05, + "loss": 1.2768, + "num_input_tokens_seen": 9093120, + "step": 1110 + }, + { + "epoch": 0.12816111683258954, + "grad_norm": 0.49169713258743286, + "learning_rate": 4.997034371990907e-05, + "loss": 1.5502, + "num_input_tokens_seen": 9175040, + "step": 1120 + }, + { + "epoch": 0.12930541251859481, + "grad_norm": 0.6432734131813049, + "learning_rate": 4.99651962550968e-05, + "loss": 1.3732, + "num_input_tokens_seen": 9256960, + "step": 1130 + }, + { + "epoch": 0.13044970820460006, + "grad_norm": 0.6305902600288391, + "learning_rate": 4.9959637390139814e-05, + "loss": 1.1296, + "num_input_tokens_seen": 9338880, + "step": 1140 + }, + { + "epoch": 0.13159400389060533, + "grad_norm": 0.5763765573501587, + "learning_rate": 4.995366721664234e-05, + "loss": 1.1064, + "num_input_tokens_seen": 9420800, + "step": 1150 + }, + { + "epoch": 0.1327382995766106, + "grad_norm": 0.5058463215827942, + "learning_rate": 4.9947285832986553e-05, + "loss": 1.3228, + "num_input_tokens_seen": 9502720, + "step": 1160 + }, + { + "epoch": 0.13388259526261587, + "grad_norm": 0.6652387380599976, + "learning_rate": 4.994049334433095e-05, + "loss": 1.3161, + "num_input_tokens_seen": 9584640, + "step": 1170 + }, + { + "epoch": 0.13502689094862114, + "grad_norm": 0.5453552007675171, + "learning_rate": 4.9933289862608584e-05, + "loss": 1.4789, + "num_input_tokens_seen": 9666560, + "step": 1180 + }, + { + "epoch": 0.13617118663462638, + "grad_norm": 0.5469970107078552, + "learning_rate": 4.992567550652525e-05, + "loss": 1.3033, + "num_input_tokens_seen": 9748480, + "step": 1190 + }, + { + "epoch": 0.13731548232063165, + "grad_norm": 0.7196422219276428, + "learning_rate": 4.9917650401557505e-05, + "loss": 1.1661, + "num_input_tokens_seen": 9830400, + "step": 1200 + }, + { + "epoch": 0.13845977800663692, + "grad_norm": 0.7473416328430176, + "learning_rate": 4.990921467995064e-05, + "loss": 1.3148, + "num_input_tokens_seen": 9912320, + "step": 1210 + }, + { + "epoch": 0.1396040736926422, + "grad_norm": 0.6073607206344604, + "learning_rate": 4.9900368480716466e-05, + "loss": 1.186, + "num_input_tokens_seen": 9994240, + "step": 1220 + }, + { + "epoch": 0.14074836937864743, + "grad_norm": 0.49970775842666626, + "learning_rate": 4.9891111949631023e-05, + "loss": 1.0854, + "num_input_tokens_seen": 10076160, + "step": 1230 + }, + { + "epoch": 0.1418926650646527, + "grad_norm": 0.5494070053100586, + "learning_rate": 4.988144523923221e-05, + "loss": 1.2775, + "num_input_tokens_seen": 10158080, + "step": 1240 + }, + { + "epoch": 0.14303696075065797, + "grad_norm": 0.49916988611221313, + "learning_rate": 4.987136850881721e-05, + "loss": 1.3006, + "num_input_tokens_seen": 10240000, + "step": 1250 + }, + { + "epoch": 0.14418125643666324, + "grad_norm": 0.46677911281585693, + "learning_rate": 4.986088192443995e-05, + "loss": 1.0796, + "num_input_tokens_seen": 10321920, + "step": 1260 + }, + { + "epoch": 0.1453255521226685, + "grad_norm": 0.520656943321228, + "learning_rate": 4.9849985658908296e-05, + "loss": 1.2881, + "num_input_tokens_seen": 10403840, + "step": 1270 + }, + { + "epoch": 0.14646984780867375, + "grad_norm": 0.5875335335731506, + "learning_rate": 4.9838679891781214e-05, + "loss": 1.2889, + "num_input_tokens_seen": 10485760, + "step": 1280 + }, + { + "epoch": 0.14761414349467902, + "grad_norm": 0.431538850069046, + "learning_rate": 4.982696480936586e-05, + "loss": 1.0391, + "num_input_tokens_seen": 10567680, + "step": 1290 + }, + { + "epoch": 0.1487584391806843, + "grad_norm": 0.7769544124603271, + "learning_rate": 4.981484060471444e-05, + "loss": 1.0809, + "num_input_tokens_seen": 10649600, + "step": 1300 + }, + { + "epoch": 0.14990273486668956, + "grad_norm": 0.5716103315353394, + "learning_rate": 4.9802307477621084e-05, + "loss": 1.1128, + "num_input_tokens_seen": 10731520, + "step": 1310 + }, + { + "epoch": 0.1510470305526948, + "grad_norm": 0.546258807182312, + "learning_rate": 4.978936563461854e-05, + "loss": 1.2888, + "num_input_tokens_seen": 10813440, + "step": 1320 + }, + { + "epoch": 0.15219132623870008, + "grad_norm": 0.9458239674568176, + "learning_rate": 4.9776015288974736e-05, + "loss": 1.3032, + "num_input_tokens_seen": 10895360, + "step": 1330 + }, + { + "epoch": 0.15333562192470535, + "grad_norm": 0.43678519129753113, + "learning_rate": 4.976225666068932e-05, + "loss": 0.9843, + "num_input_tokens_seen": 10977280, + "step": 1340 + }, + { + "epoch": 0.15447991761071062, + "grad_norm": 0.5537676811218262, + "learning_rate": 4.9748089976489996e-05, + "loss": 1.3003, + "num_input_tokens_seen": 11059200, + "step": 1350 + }, + { + "epoch": 0.15562421329671586, + "grad_norm": 0.5468762516975403, + "learning_rate": 4.9733515469828795e-05, + "loss": 1.4718, + "num_input_tokens_seen": 11141120, + "step": 1360 + }, + { + "epoch": 0.15676850898272113, + "grad_norm": 0.5907567143440247, + "learning_rate": 4.971853338087825e-05, + "loss": 1.3703, + "num_input_tokens_seen": 11223040, + "step": 1370 + }, + { + "epoch": 0.1579128046687264, + "grad_norm": 0.6227702498435974, + "learning_rate": 4.97031439565274e-05, + "loss": 1.0876, + "num_input_tokens_seen": 11304960, + "step": 1380 + }, + { + "epoch": 0.15905710035473167, + "grad_norm": 0.5486903786659241, + "learning_rate": 4.9687347450377755e-05, + "loss": 1.2873, + "num_input_tokens_seen": 11386880, + "step": 1390 + }, + { + "epoch": 0.16020139604073694, + "grad_norm": 1.0276037454605103, + "learning_rate": 4.9671144122739106e-05, + "loss": 1.4622, + "num_input_tokens_seen": 11468800, + "step": 1400 + }, + { + "epoch": 0.16134569172674218, + "grad_norm": 0.5728926658630371, + "learning_rate": 4.9654534240625225e-05, + "loss": 1.0993, + "num_input_tokens_seen": 11550720, + "step": 1410 + }, + { + "epoch": 0.16248998741274745, + "grad_norm": 0.5300357341766357, + "learning_rate": 4.9637518077749476e-05, + "loss": 1.156, + "num_input_tokens_seen": 11632640, + "step": 1420 + }, + { + "epoch": 0.16363428309875272, + "grad_norm": 0.9682191610336304, + "learning_rate": 4.962009591452032e-05, + "loss": 1.266, + "num_input_tokens_seen": 11714560, + "step": 1430 + }, + { + "epoch": 0.164778578784758, + "grad_norm": 0.5290905237197876, + "learning_rate": 4.960226803803664e-05, + "loss": 1.2794, + "num_input_tokens_seen": 11796480, + "step": 1440 + }, + { + "epoch": 0.16592287447076323, + "grad_norm": 0.8597230315208435, + "learning_rate": 4.958403474208308e-05, + "loss": 1.2099, + "num_input_tokens_seen": 11878400, + "step": 1450 + }, + { + "epoch": 0.1670671701567685, + "grad_norm": 0.6521583795547485, + "learning_rate": 4.9565396327125155e-05, + "loss": 1.3631, + "num_input_tokens_seen": 11960320, + "step": 1460 + }, + { + "epoch": 0.16821146584277377, + "grad_norm": 0.6258875727653503, + "learning_rate": 4.95463531003043e-05, + "loss": 1.1778, + "num_input_tokens_seen": 12042240, + "step": 1470 + }, + { + "epoch": 0.16935576152877904, + "grad_norm": 3.744122266769409, + "learning_rate": 4.952690537543287e-05, + "loss": 1.3796, + "num_input_tokens_seen": 12124160, + "step": 1480 + }, + { + "epoch": 0.1705000572147843, + "grad_norm": 1.4813406467437744, + "learning_rate": 4.9507053472988867e-05, + "loss": 1.2806, + "num_input_tokens_seen": 12206080, + "step": 1490 + }, + { + "epoch": 0.17164435290078955, + "grad_norm": 0.5667291283607483, + "learning_rate": 4.9486797720110746e-05, + "loss": 1.3344, + "num_input_tokens_seen": 12288000, + "step": 1500 + }, + { + "epoch": 0.17278864858679482, + "grad_norm": 1.1978493928909302, + "learning_rate": 4.946613845059199e-05, + "loss": 1.2448, + "num_input_tokens_seen": 12369920, + "step": 1510 + }, + { + "epoch": 0.1739329442728001, + "grad_norm": 1.018848180770874, + "learning_rate": 4.9445076004875596e-05, + "loss": 1.2916, + "num_input_tokens_seen": 12451840, + "step": 1520 + }, + { + "epoch": 0.17507723995880536, + "grad_norm": 0.5812198519706726, + "learning_rate": 4.9423610730048495e-05, + "loss": 1.3731, + "num_input_tokens_seen": 12533760, + "step": 1530 + }, + { + "epoch": 0.1762215356448106, + "grad_norm": 0.5890350341796875, + "learning_rate": 4.940174297983581e-05, + "loss": 1.3654, + "num_input_tokens_seen": 12615680, + "step": 1540 + }, + { + "epoch": 0.17736583133081588, + "grad_norm": 0.5180538892745972, + "learning_rate": 4.937947311459503e-05, + "loss": 1.2584, + "num_input_tokens_seen": 12697600, + "step": 1550 + }, + { + "epoch": 0.17851012701682115, + "grad_norm": 0.6072027087211609, + "learning_rate": 4.9356801501310105e-05, + "loss": 1.372, + "num_input_tokens_seen": 12779520, + "step": 1560 + }, + { + "epoch": 0.17965442270282642, + "grad_norm": 0.5989131331443787, + "learning_rate": 4.933372851358532e-05, + "loss": 1.562, + "num_input_tokens_seen": 12861440, + "step": 1570 + }, + { + "epoch": 0.1807987183888317, + "grad_norm": 0.6137218475341797, + "learning_rate": 4.9310254531639235e-05, + "loss": 1.2423, + "num_input_tokens_seen": 12943360, + "step": 1580 + }, + { + "epoch": 0.18194301407483693, + "grad_norm": 0.5463627576828003, + "learning_rate": 4.928637994229834e-05, + "loss": 1.3979, + "num_input_tokens_seen": 13025280, + "step": 1590 + }, + { + "epoch": 0.1830873097608422, + "grad_norm": 0.5558052062988281, + "learning_rate": 4.9262105138990745e-05, + "loss": 1.3811, + "num_input_tokens_seen": 13107200, + "step": 1600 + }, + { + "epoch": 0.18423160544684747, + "grad_norm": 0.6902844905853271, + "learning_rate": 4.9237430521739626e-05, + "loss": 1.4006, + "num_input_tokens_seen": 13189120, + "step": 1610 + }, + { + "epoch": 0.18537590113285274, + "grad_norm": 1.180159568786621, + "learning_rate": 4.92123564971567e-05, + "loss": 1.2214, + "num_input_tokens_seen": 13271040, + "step": 1620 + }, + { + "epoch": 0.18652019681885798, + "grad_norm": 0.6135653853416443, + "learning_rate": 4.918688347843549e-05, + "loss": 1.2192, + "num_input_tokens_seen": 13352960, + "step": 1630 + }, + { + "epoch": 0.18766449250486325, + "grad_norm": 0.5236485004425049, + "learning_rate": 4.916101188534452e-05, + "loss": 1.4384, + "num_input_tokens_seen": 13434880, + "step": 1640 + }, + { + "epoch": 0.18880878819086852, + "grad_norm": 0.6157830357551575, + "learning_rate": 4.9134742144220394e-05, + "loss": 1.2473, + "num_input_tokens_seen": 13516800, + "step": 1650 + }, + { + "epoch": 0.1899530838768738, + "grad_norm": 1.5510542392730713, + "learning_rate": 4.910807468796079e-05, + "loss": 1.277, + "num_input_tokens_seen": 13598720, + "step": 1660 + }, + { + "epoch": 0.19109737956287906, + "grad_norm": 0.5472043752670288, + "learning_rate": 4.90810099560173e-05, + "loss": 1.31, + "num_input_tokens_seen": 13680640, + "step": 1670 + }, + { + "epoch": 0.1922416752488843, + "grad_norm": 0.5276287794113159, + "learning_rate": 4.90535483943882e-05, + "loss": 1.2949, + "num_input_tokens_seen": 13762560, + "step": 1680 + }, + { + "epoch": 0.19338597093488957, + "grad_norm": 0.5264617800712585, + "learning_rate": 4.902569045561113e-05, + "loss": 1.3735, + "num_input_tokens_seen": 13844480, + "step": 1690 + }, + { + "epoch": 0.19453026662089484, + "grad_norm": 0.5804072022438049, + "learning_rate": 4.899743659875556e-05, + "loss": 1.2904, + "num_input_tokens_seen": 13926400, + "step": 1700 + }, + { + "epoch": 0.1956745623069001, + "grad_norm": 0.5250833630561829, + "learning_rate": 4.896878728941531e-05, + "loss": 1.3387, + "num_input_tokens_seen": 14008320, + "step": 1710 + }, + { + "epoch": 0.19681885799290535, + "grad_norm": 0.6384326815605164, + "learning_rate": 4.893974299970082e-05, + "loss": 1.1596, + "num_input_tokens_seen": 14090240, + "step": 1720 + }, + { + "epoch": 0.19796315367891062, + "grad_norm": 0.5223713517189026, + "learning_rate": 4.891030420823142e-05, + "loss": 1.296, + "num_input_tokens_seen": 14172160, + "step": 1730 + }, + { + "epoch": 0.1991074493649159, + "grad_norm": 0.5310291051864624, + "learning_rate": 4.888047140012737e-05, + "loss": 1.3196, + "num_input_tokens_seen": 14254080, + "step": 1740 + }, + { + "epoch": 0.20025174505092116, + "grad_norm": 0.6313158869743347, + "learning_rate": 4.885024506700195e-05, + "loss": 1.1085, + "num_input_tokens_seen": 14336000, + "step": 1750 + }, + { + "epoch": 0.20139604073692643, + "grad_norm": 0.9189643263816833, + "learning_rate": 4.8819625706953286e-05, + "loss": 1.3757, + "num_input_tokens_seen": 14417920, + "step": 1760 + }, + { + "epoch": 0.20254033642293168, + "grad_norm": 0.5741713047027588, + "learning_rate": 4.8788613824556194e-05, + "loss": 1.1697, + "num_input_tokens_seen": 14499840, + "step": 1770 + }, + { + "epoch": 0.20368463210893695, + "grad_norm": 0.5382892489433289, + "learning_rate": 4.875720993085384e-05, + "loss": 1.2916, + "num_input_tokens_seen": 14581760, + "step": 1780 + }, + { + "epoch": 0.20482892779494222, + "grad_norm": 0.5520409941673279, + "learning_rate": 4.8725414543349326e-05, + "loss": 1.1496, + "num_input_tokens_seen": 14663680, + "step": 1790 + }, + { + "epoch": 0.2059732234809475, + "grad_norm": 0.5150988101959229, + "learning_rate": 4.869322818599714e-05, + "loss": 1.265, + "num_input_tokens_seen": 14745600, + "step": 1800 + }, + { + "epoch": 0.20711751916695273, + "grad_norm": 0.5061900019645691, + "learning_rate": 4.8660651389194576e-05, + "loss": 1.293, + "num_input_tokens_seen": 14827520, + "step": 1810 + }, + { + "epoch": 0.208261814852958, + "grad_norm": 0.5306046009063721, + "learning_rate": 4.862768468977293e-05, + "loss": 1.3073, + "num_input_tokens_seen": 14909440, + "step": 1820 + }, + { + "epoch": 0.20940611053896327, + "grad_norm": 0.5865935683250427, + "learning_rate": 4.8594328630988696e-05, + "loss": 1.171, + "num_input_tokens_seen": 14991360, + "step": 1830 + }, + { + "epoch": 0.21055040622496854, + "grad_norm": 0.5565701723098755, + "learning_rate": 4.8560583762514594e-05, + "loss": 1.2818, + "num_input_tokens_seen": 15073280, + "step": 1840 + }, + { + "epoch": 0.2116947019109738, + "grad_norm": 0.504626452922821, + "learning_rate": 4.852645064043053e-05, + "loss": 1.0901, + "num_input_tokens_seen": 15155200, + "step": 1850 + }, + { + "epoch": 0.21283899759697905, + "grad_norm": 0.49429330229759216, + "learning_rate": 4.84919298272144e-05, + "loss": 1.2341, + "num_input_tokens_seen": 15237120, + "step": 1860 + }, + { + "epoch": 0.21398329328298432, + "grad_norm": 0.6143152117729187, + "learning_rate": 4.8457021891732866e-05, + "loss": 1.415, + "num_input_tokens_seen": 15319040, + "step": 1870 + }, + { + "epoch": 0.2151275889689896, + "grad_norm": 0.5530739426612854, + "learning_rate": 4.842172740923194e-05, + "loss": 1.5628, + "num_input_tokens_seen": 15400960, + "step": 1880 + }, + { + "epoch": 0.21627188465499486, + "grad_norm": 0.5200616717338562, + "learning_rate": 4.838604696132753e-05, + "loss": 1.4214, + "num_input_tokens_seen": 15482880, + "step": 1890 + }, + { + "epoch": 0.2174161803410001, + "grad_norm": 0.9391474723815918, + "learning_rate": 4.8349981135995826e-05, + "loss": 1.2436, + "num_input_tokens_seen": 15564800, + "step": 1900 + }, + { + "epoch": 0.21856047602700537, + "grad_norm": 0.49321290850639343, + "learning_rate": 4.831353052756367e-05, + "loss": 1.3363, + "num_input_tokens_seen": 15646720, + "step": 1910 + }, + { + "epoch": 0.21970477171301064, + "grad_norm": 0.5570663213729858, + "learning_rate": 4.8276695736698704e-05, + "loss": 1.3984, + "num_input_tokens_seen": 15728640, + "step": 1920 + }, + { + "epoch": 0.2208490673990159, + "grad_norm": 0.5171666145324707, + "learning_rate": 4.823947737039948e-05, + "loss": 1.2282, + "num_input_tokens_seen": 15810560, + "step": 1930 + }, + { + "epoch": 0.22199336308502116, + "grad_norm": 0.6550266742706299, + "learning_rate": 4.8201876041985496e-05, + "loss": 1.32, + "num_input_tokens_seen": 15892480, + "step": 1940 + }, + { + "epoch": 0.22313765877102643, + "grad_norm": 0.6058536171913147, + "learning_rate": 4.8163892371087045e-05, + "loss": 1.2447, + "num_input_tokens_seen": 15974400, + "step": 1950 + }, + { + "epoch": 0.2242819544570317, + "grad_norm": 0.7001304626464844, + "learning_rate": 4.812552698363502e-05, + "loss": 1.1607, + "num_input_tokens_seen": 16056320, + "step": 1960 + }, + { + "epoch": 0.22542625014303697, + "grad_norm": 0.8715665340423584, + "learning_rate": 4.8086780511850606e-05, + "loss": 1.3617, + "num_input_tokens_seen": 16138240, + "step": 1970 + }, + { + "epoch": 0.22657054582904224, + "grad_norm": 0.5489494800567627, + "learning_rate": 4.8047653594234855e-05, + "loss": 1.1251, + "num_input_tokens_seen": 16220160, + "step": 1980 + }, + { + "epoch": 0.22771484151504748, + "grad_norm": 0.5537709593772888, + "learning_rate": 4.800814687555817e-05, + "loss": 1.193, + "num_input_tokens_seen": 16302080, + "step": 1990 + }, + { + "epoch": 0.22885913720105275, + "grad_norm": 0.9744300842285156, + "learning_rate": 4.796826100684967e-05, + "loss": 1.1411, + "num_input_tokens_seen": 16384000, + "step": 2000 + }, + { + "epoch": 0.23000343288705802, + "grad_norm": 0.6724236607551575, + "learning_rate": 4.7927996645386476e-05, + "loss": 1.2578, + "num_input_tokens_seen": 16465920, + "step": 2010 + }, + { + "epoch": 0.2311477285730633, + "grad_norm": 0.6202824711799622, + "learning_rate": 4.7887354454682854e-05, + "loss": 1.1249, + "num_input_tokens_seen": 16547840, + "step": 2020 + }, + { + "epoch": 0.23229202425906853, + "grad_norm": 0.5438826680183411, + "learning_rate": 4.784633510447932e-05, + "loss": 1.1754, + "num_input_tokens_seen": 16629760, + "step": 2030 + }, + { + "epoch": 0.2334363199450738, + "grad_norm": 0.5820137858390808, + "learning_rate": 4.7804939270731564e-05, + "loss": 1.2965, + "num_input_tokens_seen": 16711680, + "step": 2040 + }, + { + "epoch": 0.23458061563107907, + "grad_norm": 0.5191706418991089, + "learning_rate": 4.776316763559933e-05, + "loss": 1.2211, + "num_input_tokens_seen": 16793600, + "step": 2050 + }, + { + "epoch": 0.23572491131708434, + "grad_norm": 0.9790087342262268, + "learning_rate": 4.7721020887435186e-05, + "loss": 1.0741, + "num_input_tokens_seen": 16875520, + "step": 2060 + }, + { + "epoch": 0.2368692070030896, + "grad_norm": 0.48102623224258423, + "learning_rate": 4.767849972077315e-05, + "loss": 1.1755, + "num_input_tokens_seen": 16957440, + "step": 2070 + }, + { + "epoch": 0.23801350268909485, + "grad_norm": 0.5206550359725952, + "learning_rate": 4.763560483631728e-05, + "loss": 1.2918, + "num_input_tokens_seen": 17039360, + "step": 2080 + }, + { + "epoch": 0.23915779837510012, + "grad_norm": 0.620953381061554, + "learning_rate": 4.75923369409301e-05, + "loss": 1.2861, + "num_input_tokens_seen": 17121280, + "step": 2090 + }, + { + "epoch": 0.2403020940611054, + "grad_norm": 0.5213440656661987, + "learning_rate": 4.7548696747620956e-05, + "loss": 1.2797, + "num_input_tokens_seen": 17203200, + "step": 2100 + }, + { + "epoch": 0.24144638974711066, + "grad_norm": 0.5252777934074402, + "learning_rate": 4.750468497553429e-05, + "loss": 1.2802, + "num_input_tokens_seen": 17285120, + "step": 2110 + }, + { + "epoch": 0.2425906854331159, + "grad_norm": 0.553130030632019, + "learning_rate": 4.746030234993775e-05, + "loss": 1.2026, + "num_input_tokens_seen": 17367040, + "step": 2120 + }, + { + "epoch": 0.24373498111912117, + "grad_norm": 0.5378229022026062, + "learning_rate": 4.741554960221027e-05, + "loss": 1.4968, + "num_input_tokens_seen": 17448960, + "step": 2130 + }, + { + "epoch": 0.24487927680512644, + "grad_norm": 2.1330316066741943, + "learning_rate": 4.7370427469830016e-05, + "loss": 1.4157, + "num_input_tokens_seen": 17530880, + "step": 2140 + }, + { + "epoch": 0.24602357249113171, + "grad_norm": 0.555316150188446, + "learning_rate": 4.73249366963622e-05, + "loss": 1.2478, + "num_input_tokens_seen": 17612800, + "step": 2150 + }, + { + "epoch": 0.24716786817713698, + "grad_norm": 0.5266692638397217, + "learning_rate": 4.727907803144686e-05, + "loss": 1.4627, + "num_input_tokens_seen": 17694720, + "step": 2160 + }, + { + "epoch": 0.24831216386314223, + "grad_norm": 0.5252066254615784, + "learning_rate": 4.723285223078653e-05, + "loss": 1.2206, + "num_input_tokens_seen": 17776640, + "step": 2170 + }, + { + "epoch": 0.2494564595491475, + "grad_norm": 0.6599643230438232, + "learning_rate": 4.71862600561337e-05, + "loss": 1.1268, + "num_input_tokens_seen": 17858560, + "step": 2180 + }, + { + "epoch": 0.25060075523515274, + "grad_norm": 1.4253127574920654, + "learning_rate": 4.713930227527836e-05, + "loss": 1.0187, + "num_input_tokens_seen": 17940480, + "step": 2190 + }, + { + "epoch": 0.251745050921158, + "grad_norm": 0.9951733946800232, + "learning_rate": 4.709197966203528e-05, + "loss": 0.875, + "num_input_tokens_seen": 18022400, + "step": 2200 + }, + { + "epoch": 0.2528893466071633, + "grad_norm": 0.552880585193634, + "learning_rate": 4.704429299623129e-05, + "loss": 1.2277, + "num_input_tokens_seen": 18104320, + "step": 2210 + }, + { + "epoch": 0.25403364229316855, + "grad_norm": 0.47009891271591187, + "learning_rate": 4.6996243063692446e-05, + "loss": 1.4153, + "num_input_tokens_seen": 18186240, + "step": 2220 + }, + { + "epoch": 0.2551779379791738, + "grad_norm": 0.5296884179115295, + "learning_rate": 4.694783065623102e-05, + "loss": 1.1905, + "num_input_tokens_seen": 18268160, + "step": 2230 + }, + { + "epoch": 0.2563222336651791, + "grad_norm": 0.561241865158081, + "learning_rate": 4.68990565716325e-05, + "loss": 1.363, + "num_input_tokens_seen": 18350080, + "step": 2240 + }, + { + "epoch": 0.25746652935118436, + "grad_norm": 0.4963686466217041, + "learning_rate": 4.6849921613642456e-05, + "loss": 1.3028, + "num_input_tokens_seen": 18432000, + "step": 2250 + }, + { + "epoch": 0.25861082503718963, + "grad_norm": 0.5019493103027344, + "learning_rate": 4.680042659195325e-05, + "loss": 1.0479, + "num_input_tokens_seen": 18513920, + "step": 2260 + }, + { + "epoch": 0.2597551207231949, + "grad_norm": 0.5507912039756775, + "learning_rate": 4.6750572322190716e-05, + "loss": 1.4381, + "num_input_tokens_seen": 18595840, + "step": 2270 + }, + { + "epoch": 0.2608994164092001, + "grad_norm": 0.45838436484336853, + "learning_rate": 4.6700359625900724e-05, + "loss": 1.1211, + "num_input_tokens_seen": 18677760, + "step": 2280 + }, + { + "epoch": 0.2620437120952054, + "grad_norm": 0.49079829454421997, + "learning_rate": 4.664978933053562e-05, + "loss": 1.1206, + "num_input_tokens_seen": 18759680, + "step": 2290 + }, + { + "epoch": 0.26318800778121065, + "grad_norm": 1.0304409265518188, + "learning_rate": 4.659886226944063e-05, + "loss": 1.2446, + "num_input_tokens_seen": 18841600, + "step": 2300 + }, + { + "epoch": 0.2643323034672159, + "grad_norm": 0.6165328025817871, + "learning_rate": 4.65475792818401e-05, + "loss": 1.1047, + "num_input_tokens_seen": 18923520, + "step": 2310 + }, + { + "epoch": 0.2654765991532212, + "grad_norm": 1.5652453899383545, + "learning_rate": 4.6495941212823644e-05, + "loss": 1.175, + "num_input_tokens_seen": 19005440, + "step": 2320 + }, + { + "epoch": 0.26662089483922646, + "grad_norm": 0.5280311107635498, + "learning_rate": 4.644394891333227e-05, + "loss": 1.1473, + "num_input_tokens_seen": 19087360, + "step": 2330 + }, + { + "epoch": 0.26776519052523173, + "grad_norm": 1.034646987915039, + "learning_rate": 4.639160324014433e-05, + "loss": 1.1003, + "num_input_tokens_seen": 19169280, + "step": 2340 + }, + { + "epoch": 0.268909486211237, + "grad_norm": 0.4900410771369934, + "learning_rate": 4.633890505586139e-05, + "loss": 1.5043, + "num_input_tokens_seen": 19251200, + "step": 2350 + }, + { + "epoch": 0.2700537818972423, + "grad_norm": 1.9830961227416992, + "learning_rate": 4.6285855228894025e-05, + "loss": 1.3875, + "num_input_tokens_seen": 19333120, + "step": 2360 + }, + { + "epoch": 0.2711980775832475, + "grad_norm": 0.5432376265525818, + "learning_rate": 4.623245463344753e-05, + "loss": 1.2485, + "num_input_tokens_seen": 19415040, + "step": 2370 + }, + { + "epoch": 0.27234237326925276, + "grad_norm": 0.6268254518508911, + "learning_rate": 4.617870414950748e-05, + "loss": 1.6017, + "num_input_tokens_seen": 19496960, + "step": 2380 + }, + { + "epoch": 0.273486668955258, + "grad_norm": 0.6118723750114441, + "learning_rate": 4.612460466282525e-05, + "loss": 1.1282, + "num_input_tokens_seen": 19578880, + "step": 2390 + }, + { + "epoch": 0.2746309646412633, + "grad_norm": 0.6583623290061951, + "learning_rate": 4.607015706490341e-05, + "loss": 1.1795, + "num_input_tokens_seen": 19660800, + "step": 2400 + }, + { + "epoch": 0.27577526032726857, + "grad_norm": 0.6352823376655579, + "learning_rate": 4.601536225298104e-05, + "loss": 1.111, + "num_input_tokens_seen": 19742720, + "step": 2410 + }, + { + "epoch": 0.27691955601327384, + "grad_norm": 1.524483561515808, + "learning_rate": 4.5960221130018946e-05, + "loss": 1.1325, + "num_input_tokens_seen": 19824640, + "step": 2420 + }, + { + "epoch": 0.2780638516992791, + "grad_norm": 0.41133037209510803, + "learning_rate": 4.590473460468475e-05, + "loss": 1.2953, + "num_input_tokens_seen": 19906560, + "step": 2430 + }, + { + "epoch": 0.2792081473852844, + "grad_norm": 0.8059386610984802, + "learning_rate": 4.584890359133797e-05, + "loss": 1.168, + "num_input_tokens_seen": 19988480, + "step": 2440 + }, + { + "epoch": 0.28035244307128965, + "grad_norm": 1.0323024988174438, + "learning_rate": 4.579272901001491e-05, + "loss": 1.3607, + "num_input_tokens_seen": 20070400, + "step": 2450 + }, + { + "epoch": 0.28149673875729486, + "grad_norm": 0.5809211134910583, + "learning_rate": 4.5736211786413524e-05, + "loss": 1.293, + "num_input_tokens_seen": 20152320, + "step": 2460 + }, + { + "epoch": 0.28264103444330013, + "grad_norm": 0.7068034410476685, + "learning_rate": 4.5679352851878135e-05, + "loss": 1.3167, + "num_input_tokens_seen": 20234240, + "step": 2470 + }, + { + "epoch": 0.2837853301293054, + "grad_norm": 0.5605278611183167, + "learning_rate": 4.562215314338411e-05, + "loss": 1.3806, + "num_input_tokens_seen": 20316160, + "step": 2480 + }, + { + "epoch": 0.28492962581531067, + "grad_norm": 0.5452982187271118, + "learning_rate": 4.556461360352241e-05, + "loss": 1.0428, + "num_input_tokens_seen": 20398080, + "step": 2490 + }, + { + "epoch": 0.28607392150131594, + "grad_norm": 0.5816521048545837, + "learning_rate": 4.550673518048405e-05, + "loss": 1.0979, + "num_input_tokens_seen": 20480000, + "step": 2500 + }, + { + "epoch": 0.2872182171873212, + "grad_norm": 0.6600253582000732, + "learning_rate": 4.5448518828044515e-05, + "loss": 1.2309, + "num_input_tokens_seen": 20561920, + "step": 2510 + }, + { + "epoch": 0.2883625128733265, + "grad_norm": 0.5788952708244324, + "learning_rate": 4.538996550554798e-05, + "loss": 1.1428, + "num_input_tokens_seen": 20643840, + "step": 2520 + }, + { + "epoch": 0.28950680855933175, + "grad_norm": 0.5803161263465881, + "learning_rate": 4.5331076177891527e-05, + "loss": 1.5423, + "num_input_tokens_seen": 20725760, + "step": 2530 + }, + { + "epoch": 0.290651104245337, + "grad_norm": 0.5454373955726624, + "learning_rate": 4.527185181550928e-05, + "loss": 1.31, + "num_input_tokens_seen": 20807680, + "step": 2540 + }, + { + "epoch": 0.29179539993134224, + "grad_norm": 1.4661908149719238, + "learning_rate": 4.5212293394356356e-05, + "loss": 1.0873, + "num_input_tokens_seen": 20889600, + "step": 2550 + }, + { + "epoch": 0.2929396956173475, + "grad_norm": 0.6312928199768066, + "learning_rate": 4.515240189589282e-05, + "loss": 1.1553, + "num_input_tokens_seen": 20971520, + "step": 2560 + }, + { + "epoch": 0.2940839913033528, + "grad_norm": 0.5782475471496582, + "learning_rate": 4.509217830706749e-05, + "loss": 1.267, + "num_input_tokens_seen": 21053440, + "step": 2570 + }, + { + "epoch": 0.29522828698935805, + "grad_norm": 0.5917658805847168, + "learning_rate": 4.50316236203017e-05, + "loss": 1.074, + "num_input_tokens_seen": 21135360, + "step": 2580 + }, + { + "epoch": 0.2963725826753633, + "grad_norm": 0.5403671264648438, + "learning_rate": 4.497073883347293e-05, + "loss": 1.238, + "num_input_tokens_seen": 21217280, + "step": 2590 + }, + { + "epoch": 0.2975168783613686, + "grad_norm": 0.5732063055038452, + "learning_rate": 4.490952494989834e-05, + "loss": 1.2223, + "num_input_tokens_seen": 21299200, + "step": 2600 + }, + { + "epoch": 0.29866117404737386, + "grad_norm": 0.5230866074562073, + "learning_rate": 4.484798297831826e-05, + "loss": 1.2797, + "num_input_tokens_seen": 21381120, + "step": 2610 + }, + { + "epoch": 0.2998054697333791, + "grad_norm": 0.5615763664245605, + "learning_rate": 4.4786113932879605e-05, + "loss": 1.1556, + "num_input_tokens_seen": 21463040, + "step": 2620 + }, + { + "epoch": 0.3009497654193844, + "grad_norm": 0.8755237460136414, + "learning_rate": 4.472391883311906e-05, + "loss": 1.156, + "num_input_tokens_seen": 21544960, + "step": 2630 + }, + { + "epoch": 0.3020940611053896, + "grad_norm": 0.5666770935058594, + "learning_rate": 4.4661398703946396e-05, + "loss": 1.1363, + "num_input_tokens_seen": 21626880, + "step": 2640 + }, + { + "epoch": 0.3032383567913949, + "grad_norm": 0.5907579064369202, + "learning_rate": 4.4598554575627495e-05, + "loss": 1.4523, + "num_input_tokens_seen": 21708800, + "step": 2650 + }, + { + "epoch": 0.30438265247740015, + "grad_norm": 0.5281969904899597, + "learning_rate": 4.453538748376742e-05, + "loss": 0.9537, + "num_input_tokens_seen": 21790720, + "step": 2660 + }, + { + "epoch": 0.3055269481634054, + "grad_norm": 0.6245942115783691, + "learning_rate": 4.4471898469293324e-05, + "loss": 1.1622, + "num_input_tokens_seen": 21872640, + "step": 2670 + }, + { + "epoch": 0.3066712438494107, + "grad_norm": 0.5124683976173401, + "learning_rate": 4.44080885784373e-05, + "loss": 1.3041, + "num_input_tokens_seen": 21954560, + "step": 2680 + }, + { + "epoch": 0.30781553953541596, + "grad_norm": 0.536454439163208, + "learning_rate": 4.434395886271917e-05, + "loss": 1.3722, + "num_input_tokens_seen": 22036480, + "step": 2690 + }, + { + "epoch": 0.30895983522142123, + "grad_norm": 0.5136358737945557, + "learning_rate": 4.427951037892911e-05, + "loss": 1.4421, + "num_input_tokens_seen": 22118400, + "step": 2700 + }, + { + "epoch": 0.3101041309074265, + "grad_norm": 0.4972691237926483, + "learning_rate": 4.4214744189110266e-05, + "loss": 1.5259, + "num_input_tokens_seen": 22200320, + "step": 2710 + }, + { + "epoch": 0.3112484265934317, + "grad_norm": 0.5017683506011963, + "learning_rate": 4.414966136054125e-05, + "loss": 1.1897, + "num_input_tokens_seen": 22282240, + "step": 2720 + }, + { + "epoch": 0.312392722279437, + "grad_norm": 0.468337744474411, + "learning_rate": 4.408426296571852e-05, + "loss": 1.0431, + "num_input_tokens_seen": 22364160, + "step": 2730 + }, + { + "epoch": 0.31353701796544226, + "grad_norm": 0.6088995337486267, + "learning_rate": 4.401855008233879e-05, + "loss": 1.3437, + "num_input_tokens_seen": 22446080, + "step": 2740 + }, + { + "epoch": 0.3146813136514475, + "grad_norm": 0.6361683011054993, + "learning_rate": 4.395252379328115e-05, + "loss": 1.4596, + "num_input_tokens_seen": 22528000, + "step": 2750 + }, + { + "epoch": 0.3158256093374528, + "grad_norm": 0.4672735631465912, + "learning_rate": 4.388618518658932e-05, + "loss": 1.1596, + "num_input_tokens_seen": 22609920, + "step": 2760 + }, + { + "epoch": 0.31696990502345807, + "grad_norm": 1.0056240558624268, + "learning_rate": 4.381953535545369e-05, + "loss": 1.2776, + "num_input_tokens_seen": 22691840, + "step": 2770 + }, + { + "epoch": 0.31811420070946334, + "grad_norm": 0.8526020646095276, + "learning_rate": 4.375257539819328e-05, + "loss": 1.1147, + "num_input_tokens_seen": 22773760, + "step": 2780 + }, + { + "epoch": 0.3192584963954686, + "grad_norm": 0.527472972869873, + "learning_rate": 4.368530641823769e-05, + "loss": 1.1802, + "num_input_tokens_seen": 22855680, + "step": 2790 + }, + { + "epoch": 0.3204027920814739, + "grad_norm": 0.5480664372444153, + "learning_rate": 4.361772952410886e-05, + "loss": 1.0881, + "num_input_tokens_seen": 22937600, + "step": 2800 + }, + { + "epoch": 0.3215470877674791, + "grad_norm": 0.6834565997123718, + "learning_rate": 4.354984582940285e-05, + "loss": 1.1945, + "num_input_tokens_seen": 23019520, + "step": 2810 + }, + { + "epoch": 0.32269138345348436, + "grad_norm": 0.5412240028381348, + "learning_rate": 4.348165645277145e-05, + "loss": 1.3629, + "num_input_tokens_seen": 23101440, + "step": 2820 + }, + { + "epoch": 0.32383567913948963, + "grad_norm": 0.5615849494934082, + "learning_rate": 4.34131625179038e-05, + "loss": 1.3362, + "num_input_tokens_seen": 23183360, + "step": 2830 + }, + { + "epoch": 0.3249799748254949, + "grad_norm": 0.5076019763946533, + "learning_rate": 4.334436515350779e-05, + "loss": 1.2316, + "num_input_tokens_seen": 23265280, + "step": 2840 + }, + { + "epoch": 0.32612427051150017, + "grad_norm": 0.568336546421051, + "learning_rate": 4.327526549329157e-05, + "loss": 1.7573, + "num_input_tokens_seen": 23347200, + "step": 2850 + }, + { + "epoch": 0.32726856619750544, + "grad_norm": 0.6184702515602112, + "learning_rate": 4.320586467594476e-05, + "loss": 1.4037, + "num_input_tokens_seen": 23429120, + "step": 2860 + }, + { + "epoch": 0.3284128618835107, + "grad_norm": 0.8441096544265747, + "learning_rate": 4.313616384511976e-05, + "loss": 1.4623, + "num_input_tokens_seen": 23511040, + "step": 2870 + }, + { + "epoch": 0.329557157569516, + "grad_norm": 0.41614028811454773, + "learning_rate": 4.3066164149412844e-05, + "loss": 1.1027, + "num_input_tokens_seen": 23592960, + "step": 2880 + }, + { + "epoch": 0.33070145325552125, + "grad_norm": 0.6045880317687988, + "learning_rate": 4.299586674234529e-05, + "loss": 1.2341, + "num_input_tokens_seen": 23674880, + "step": 2890 + }, + { + "epoch": 0.33184574894152646, + "grad_norm": 0.5018451809883118, + "learning_rate": 4.292527278234435e-05, + "loss": 1.2661, + "num_input_tokens_seen": 23756800, + "step": 2900 + }, + { + "epoch": 0.33299004462753173, + "grad_norm": 0.6029537320137024, + "learning_rate": 4.285438343272414e-05, + "loss": 1.375, + "num_input_tokens_seen": 23838720, + "step": 2910 + }, + { + "epoch": 0.334134340313537, + "grad_norm": 0.7714540958404541, + "learning_rate": 4.278319986166649e-05, + "loss": 1.2119, + "num_input_tokens_seen": 23920640, + "step": 2920 + }, + { + "epoch": 0.3352786359995423, + "grad_norm": 0.5025473237037659, + "learning_rate": 4.2711723242201695e-05, + "loss": 1.322, + "num_input_tokens_seen": 24002560, + "step": 2930 + }, + { + "epoch": 0.33642293168554754, + "grad_norm": 0.5495464205741882, + "learning_rate": 4.263995475218917e-05, + "loss": 1.4806, + "num_input_tokens_seen": 24084480, + "step": 2940 + }, + { + "epoch": 0.3375672273715528, + "grad_norm": 0.5496026277542114, + "learning_rate": 4.256789557429806e-05, + "loss": 1.6319, + "num_input_tokens_seen": 24166400, + "step": 2950 + }, + { + "epoch": 0.3387115230575581, + "grad_norm": 2.653693199157715, + "learning_rate": 4.2495546895987724e-05, + "loss": 1.2252, + "num_input_tokens_seen": 24248320, + "step": 2960 + }, + { + "epoch": 0.33985581874356335, + "grad_norm": 0.7369899153709412, + "learning_rate": 4.242290990948821e-05, + "loss": 1.1941, + "num_input_tokens_seen": 24330240, + "step": 2970 + }, + { + "epoch": 0.3410001144295686, + "grad_norm": 0.6210314631462097, + "learning_rate": 4.234998581178056e-05, + "loss": 1.2853, + "num_input_tokens_seen": 24412160, + "step": 2980 + }, + { + "epoch": 0.34214441011557384, + "grad_norm": 0.4932264983654022, + "learning_rate": 4.227677580457711e-05, + "loss": 1.157, + "num_input_tokens_seen": 24494080, + "step": 2990 + }, + { + "epoch": 0.3432887058015791, + "grad_norm": 0.7470729947090149, + "learning_rate": 4.220328109430167e-05, + "loss": 1.1064, + "num_input_tokens_seen": 24576000, + "step": 3000 + }, + { + "epoch": 0.3444330014875844, + "grad_norm": 0.5054774880409241, + "learning_rate": 4.21295028920697e-05, + "loss": 1.3768, + "num_input_tokens_seen": 24657920, + "step": 3010 + }, + { + "epoch": 0.34557729717358965, + "grad_norm": 0.5471848845481873, + "learning_rate": 4.2055442413668264e-05, + "loss": 1.1968, + "num_input_tokens_seen": 24739840, + "step": 3020 + }, + { + "epoch": 0.3467215928595949, + "grad_norm": 0.48451676964759827, + "learning_rate": 4.198110087953606e-05, + "loss": 1.3736, + "num_input_tokens_seen": 24821760, + "step": 3030 + }, + { + "epoch": 0.3478658885456002, + "grad_norm": 0.4831438958644867, + "learning_rate": 4.190647951474328e-05, + "loss": 1.2485, + "num_input_tokens_seen": 24903680, + "step": 3040 + }, + { + "epoch": 0.34901018423160546, + "grad_norm": 0.5406004190444946, + "learning_rate": 4.183157954897144e-05, + "loss": 1.1501, + "num_input_tokens_seen": 24985600, + "step": 3050 + }, + { + "epoch": 0.35015447991761073, + "grad_norm": 0.5474951863288879, + "learning_rate": 4.1756402216493115e-05, + "loss": 1.1709, + "num_input_tokens_seen": 25067520, + "step": 3060 + }, + { + "epoch": 0.351298775603616, + "grad_norm": 0.6155632138252258, + "learning_rate": 4.1680948756151564e-05, + "loss": 1.2692, + "num_input_tokens_seen": 25149440, + "step": 3070 + }, + { + "epoch": 0.3524430712896212, + "grad_norm": 0.47120699286460876, + "learning_rate": 4.160522041134035e-05, + "loss": 1.2124, + "num_input_tokens_seen": 25231360, + "step": 3080 + }, + { + "epoch": 0.3535873669756265, + "grad_norm": 0.5330422520637512, + "learning_rate": 4.152921842998287e-05, + "loss": 1.1728, + "num_input_tokens_seen": 25313280, + "step": 3090 + }, + { + "epoch": 0.35473166266163175, + "grad_norm": 0.48196467757225037, + "learning_rate": 4.145294406451173e-05, + "loss": 1.309, + "num_input_tokens_seen": 25395200, + "step": 3100 + }, + { + "epoch": 0.355875958347637, + "grad_norm": 0.5000414252281189, + "learning_rate": 4.137639857184815e-05, + "loss": 1.3163, + "num_input_tokens_seen": 25477120, + "step": 3110 + }, + { + "epoch": 0.3570202540336423, + "grad_norm": 0.5418947339057922, + "learning_rate": 4.129958321338127e-05, + "loss": 1.1725, + "num_input_tokens_seen": 25559040, + "step": 3120 + }, + { + "epoch": 0.35816454971964756, + "grad_norm": 0.5317056775093079, + "learning_rate": 4.122249925494726e-05, + "loss": 1.1192, + "num_input_tokens_seen": 25640960, + "step": 3130 + }, + { + "epoch": 0.35930884540565283, + "grad_norm": 0.6244747638702393, + "learning_rate": 4.114514796680862e-05, + "loss": 1.2959, + "num_input_tokens_seen": 25722880, + "step": 3140 + }, + { + "epoch": 0.3604531410916581, + "grad_norm": 0.5483636856079102, + "learning_rate": 4.106753062363311e-05, + "loss": 1.2905, + "num_input_tokens_seen": 25804800, + "step": 3150 + }, + { + "epoch": 0.3615974367776634, + "grad_norm": 0.5433777570724487, + "learning_rate": 4.098964850447281e-05, + "loss": 1.315, + "num_input_tokens_seen": 25886720, + "step": 3160 + }, + { + "epoch": 0.3627417324636686, + "grad_norm": 0.90435391664505, + "learning_rate": 4.0911502892743035e-05, + "loss": 1.28, + "num_input_tokens_seen": 25968640, + "step": 3170 + }, + { + "epoch": 0.36388602814967386, + "grad_norm": 0.5195088386535645, + "learning_rate": 4.083309507620118e-05, + "loss": 1.2515, + "num_input_tokens_seen": 26050560, + "step": 3180 + }, + { + "epoch": 0.3650303238356791, + "grad_norm": 0.5890297293663025, + "learning_rate": 4.075442634692548e-05, + "loss": 1.1768, + "num_input_tokens_seen": 26132480, + "step": 3190 + }, + { + "epoch": 0.3661746195216844, + "grad_norm": 0.6505255103111267, + "learning_rate": 4.067549800129375e-05, + "loss": 1.2891, + "num_input_tokens_seen": 26214400, + "step": 3200 + }, + { + "epoch": 0.36731891520768967, + "grad_norm": 0.6695213317871094, + "learning_rate": 4.059631133996203e-05, + "loss": 1.3888, + "num_input_tokens_seen": 26296320, + "step": 3210 + }, + { + "epoch": 0.36846321089369494, + "grad_norm": 0.4889095723628998, + "learning_rate": 4.05168676678431e-05, + "loss": 0.9946, + "num_input_tokens_seen": 26378240, + "step": 3220 + }, + { + "epoch": 0.3696075065797002, + "grad_norm": 2.1291022300720215, + "learning_rate": 4.0437168294085013e-05, + "loss": 1.4232, + "num_input_tokens_seen": 26460160, + "step": 3230 + }, + { + "epoch": 0.3707518022657055, + "grad_norm": 0.4844609797000885, + "learning_rate": 4.0357214532049535e-05, + "loss": 1.1119, + "num_input_tokens_seen": 26542080, + "step": 3240 + }, + { + "epoch": 0.37189609795171075, + "grad_norm": 0.509472668170929, + "learning_rate": 4.027700769929046e-05, + "loss": 1.1302, + "num_input_tokens_seen": 26624000, + "step": 3250 + }, + { + "epoch": 0.37304039363771596, + "grad_norm": 0.5184939503669739, + "learning_rate": 4.019654911753193e-05, + "loss": 1.2648, + "num_input_tokens_seen": 26705920, + "step": 3260 + }, + { + "epoch": 0.37418468932372123, + "grad_norm": 0.5771918892860413, + "learning_rate": 4.011584011264665e-05, + "loss": 1.204, + "num_input_tokens_seen": 26787840, + "step": 3270 + }, + { + "epoch": 0.3753289850097265, + "grad_norm": 0.5364697575569153, + "learning_rate": 4.0034882014634015e-05, + "loss": 1.3242, + "num_input_tokens_seen": 26869760, + "step": 3280 + }, + { + "epoch": 0.37647328069573177, + "grad_norm": 0.5383545160293579, + "learning_rate": 3.995367615759825e-05, + "loss": 1.235, + "num_input_tokens_seen": 26951680, + "step": 3290 + }, + { + "epoch": 0.37761757638173704, + "grad_norm": 0.5549062490463257, + "learning_rate": 3.9872223879726356e-05, + "loss": 1.1119, + "num_input_tokens_seen": 27033600, + "step": 3300 + }, + { + "epoch": 0.3787618720677423, + "grad_norm": 0.8855923414230347, + "learning_rate": 3.979052652326609e-05, + "loss": 1.2295, + "num_input_tokens_seen": 27115520, + "step": 3310 + }, + { + "epoch": 0.3799061677537476, + "grad_norm": 0.5798035264015198, + "learning_rate": 3.970858543450387e-05, + "loss": 1.4858, + "num_input_tokens_seen": 27197440, + "step": 3320 + }, + { + "epoch": 0.38105046343975285, + "grad_norm": 0.9078068137168884, + "learning_rate": 3.962640196374254e-05, + "loss": 1.1919, + "num_input_tokens_seen": 27279360, + "step": 3330 + }, + { + "epoch": 0.3821947591257581, + "grad_norm": 0.5055866241455078, + "learning_rate": 3.954397746527916e-05, + "loss": 1.1531, + "num_input_tokens_seen": 27361280, + "step": 3340 + }, + { + "epoch": 0.38333905481176334, + "grad_norm": 0.5842207670211792, + "learning_rate": 3.9461313297382666e-05, + "loss": 1.1539, + "num_input_tokens_seen": 27443200, + "step": 3350 + }, + { + "epoch": 0.3844833504977686, + "grad_norm": 0.5523999929428101, + "learning_rate": 3.93784108222715e-05, + "loss": 1.6298, + "num_input_tokens_seen": 27525120, + "step": 3360 + }, + { + "epoch": 0.3856276461837739, + "grad_norm": 0.5067601799964905, + "learning_rate": 3.929527140609115e-05, + "loss": 1.2188, + "num_input_tokens_seen": 27607040, + "step": 3370 + }, + { + "epoch": 0.38677194186977915, + "grad_norm": 0.5155417919158936, + "learning_rate": 3.921189641889163e-05, + "loss": 1.2925, + "num_input_tokens_seen": 27688960, + "step": 3380 + }, + { + "epoch": 0.3879162375557844, + "grad_norm": 0.5797560811042786, + "learning_rate": 3.912828723460495e-05, + "loss": 1.3113, + "num_input_tokens_seen": 27770880, + "step": 3390 + }, + { + "epoch": 0.3890605332417897, + "grad_norm": 0.5342340469360352, + "learning_rate": 3.904444523102242e-05, + "loss": 1.2155, + "num_input_tokens_seen": 27852800, + "step": 3400 + }, + { + "epoch": 0.39020482892779496, + "grad_norm": 0.867396354675293, + "learning_rate": 3.896037178977196e-05, + "loss": 1.4058, + "num_input_tokens_seen": 27934720, + "step": 3410 + }, + { + "epoch": 0.3913491246138002, + "grad_norm": 0.904750645160675, + "learning_rate": 3.887606829629536e-05, + "loss": 1.2582, + "num_input_tokens_seen": 28016640, + "step": 3420 + }, + { + "epoch": 0.3924934202998055, + "grad_norm": 0.49150246381759644, + "learning_rate": 3.87915361398254e-05, + "loss": 1.2471, + "num_input_tokens_seen": 28098560, + "step": 3430 + }, + { + "epoch": 0.3936377159858107, + "grad_norm": 1.6605616807937622, + "learning_rate": 3.8706776713363025e-05, + "loss": 1.2815, + "num_input_tokens_seen": 28180480, + "step": 3440 + }, + { + "epoch": 0.394782011671816, + "grad_norm": 0.5466508865356445, + "learning_rate": 3.862179141365431e-05, + "loss": 1.4604, + "num_input_tokens_seen": 28262400, + "step": 3450 + }, + { + "epoch": 0.39592630735782125, + "grad_norm": 0.8647962808609009, + "learning_rate": 3.8536581641167506e-05, + "loss": 1.2577, + "num_input_tokens_seen": 28344320, + "step": 3460 + }, + { + "epoch": 0.3970706030438265, + "grad_norm": 2.0921196937561035, + "learning_rate": 3.845114880006994e-05, + "loss": 1.2211, + "num_input_tokens_seen": 28426240, + "step": 3470 + }, + { + "epoch": 0.3982148987298318, + "grad_norm": 1.0453864336013794, + "learning_rate": 3.836549429820485e-05, + "loss": 1.2543, + "num_input_tokens_seen": 28508160, + "step": 3480 + }, + { + "epoch": 0.39935919441583706, + "grad_norm": 1.259315013885498, + "learning_rate": 3.827961954706825e-05, + "loss": 1.2716, + "num_input_tokens_seen": 28590080, + "step": 3490 + }, + { + "epoch": 0.40050349010184233, + "grad_norm": 0.49976375699043274, + "learning_rate": 3.8193525961785584e-05, + "loss": 0.9854, + "num_input_tokens_seen": 28672000, + "step": 3500 + }, + { + "epoch": 0.4016477857878476, + "grad_norm": 0.4869581162929535, + "learning_rate": 3.81072149610885e-05, + "loss": 1.1416, + "num_input_tokens_seen": 28753920, + "step": 3510 + }, + { + "epoch": 0.40279208147385287, + "grad_norm": 0.5591604113578796, + "learning_rate": 3.802068796729139e-05, + "loss": 1.3587, + "num_input_tokens_seen": 28835840, + "step": 3520 + }, + { + "epoch": 0.4039363771598581, + "grad_norm": 0.5056213736534119, + "learning_rate": 3.7933946406268e-05, + "loss": 1.2926, + "num_input_tokens_seen": 28917760, + "step": 3530 + }, + { + "epoch": 0.40508067284586335, + "grad_norm": 0.4954633414745331, + "learning_rate": 3.7846991707427905e-05, + "loss": 1.2945, + "num_input_tokens_seen": 28999680, + "step": 3540 + }, + { + "epoch": 0.4062249685318686, + "grad_norm": 0.823973536491394, + "learning_rate": 3.775982530369298e-05, + "loss": 1.2348, + "num_input_tokens_seen": 29081600, + "step": 3550 + }, + { + "epoch": 0.4073692642178739, + "grad_norm": 0.5769624710083008, + "learning_rate": 3.767244863147377e-05, + "loss": 1.2022, + "num_input_tokens_seen": 29163520, + "step": 3560 + }, + { + "epoch": 0.40851355990387916, + "grad_norm": 0.5166818499565125, + "learning_rate": 3.75848631306458e-05, + "loss": 1.5009, + "num_input_tokens_seen": 29245440, + "step": 3570 + }, + { + "epoch": 0.40965785558988443, + "grad_norm": 0.8839643001556396, + "learning_rate": 3.7497070244525925e-05, + "loss": 1.2186, + "num_input_tokens_seen": 29327360, + "step": 3580 + }, + { + "epoch": 0.4108021512758897, + "grad_norm": 0.5029737949371338, + "learning_rate": 3.7409071419848436e-05, + "loss": 1.3161, + "num_input_tokens_seen": 29409280, + "step": 3590 + }, + { + "epoch": 0.411946446961895, + "grad_norm": 0.46834880113601685, + "learning_rate": 3.73208681067413e-05, + "loss": 1.0534, + "num_input_tokens_seen": 29491200, + "step": 3600 + }, + { + "epoch": 0.41309074264790024, + "grad_norm": 0.6240310668945312, + "learning_rate": 3.7232461758702244e-05, + "loss": 1.3398, + "num_input_tokens_seen": 29573120, + "step": 3610 + }, + { + "epoch": 0.41423503833390546, + "grad_norm": 0.5298212766647339, + "learning_rate": 3.714385383257477e-05, + "loss": 1.1538, + "num_input_tokens_seen": 29655040, + "step": 3620 + }, + { + "epoch": 0.41537933401991073, + "grad_norm": 0.46233418583869934, + "learning_rate": 3.7055045788524214e-05, + "loss": 1.0701, + "num_input_tokens_seen": 29736960, + "step": 3630 + }, + { + "epoch": 0.416523629705916, + "grad_norm": 0.5719970464706421, + "learning_rate": 3.696603909001361e-05, + "loss": 1.4487, + "num_input_tokens_seen": 29818880, + "step": 3640 + }, + { + "epoch": 0.41766792539192127, + "grad_norm": 0.5453386902809143, + "learning_rate": 3.6876835203779615e-05, + "loss": 1.3053, + "num_input_tokens_seen": 29900800, + "step": 3650 + }, + { + "epoch": 0.41881222107792654, + "grad_norm": 0.4228511452674866, + "learning_rate": 3.678743559980835e-05, + "loss": 1.4515, + "num_input_tokens_seen": 29982720, + "step": 3660 + }, + { + "epoch": 0.4199565167639318, + "grad_norm": 0.5006650686264038, + "learning_rate": 3.669784175131115e-05, + "loss": 1.4384, + "num_input_tokens_seen": 30064640, + "step": 3670 + }, + { + "epoch": 0.4211008124499371, + "grad_norm": 0.49229928851127625, + "learning_rate": 3.660805513470027e-05, + "loss": 1.4419, + "num_input_tokens_seen": 30146560, + "step": 3680 + }, + { + "epoch": 0.42224510813594235, + "grad_norm": 0.5263097286224365, + "learning_rate": 3.651807722956462e-05, + "loss": 1.2246, + "num_input_tokens_seen": 30228480, + "step": 3690 + }, + { + "epoch": 0.4233894038219476, + "grad_norm": 0.48281192779541016, + "learning_rate": 3.642790951864532e-05, + "loss": 1.0901, + "num_input_tokens_seen": 30310400, + "step": 3700 + }, + { + "epoch": 0.42453369950795283, + "grad_norm": 0.6877309083938599, + "learning_rate": 3.63375534878113e-05, + "loss": 1.2148, + "num_input_tokens_seen": 30392320, + "step": 3710 + }, + { + "epoch": 0.4256779951939581, + "grad_norm": 2.1429624557495117, + "learning_rate": 3.6247010626034795e-05, + "loss": 1.3764, + "num_input_tokens_seen": 30474240, + "step": 3720 + }, + { + "epoch": 0.4268222908799634, + "grad_norm": 0.6136846542358398, + "learning_rate": 3.615628242536682e-05, + "loss": 1.216, + "num_input_tokens_seen": 30556160, + "step": 3730 + }, + { + "epoch": 0.42796658656596864, + "grad_norm": 0.5359023809432983, + "learning_rate": 3.6065370380912587e-05, + "loss": 1.2571, + "num_input_tokens_seen": 30638080, + "step": 3740 + }, + { + "epoch": 0.4291108822519739, + "grad_norm": 1.0357657670974731, + "learning_rate": 3.5974275990806846e-05, + "loss": 1.1939, + "num_input_tokens_seen": 30720000, + "step": 3750 + }, + { + "epoch": 0.4302551779379792, + "grad_norm": 0.5274159908294678, + "learning_rate": 3.588300075618922e-05, + "loss": 1.1014, + "num_input_tokens_seen": 30801920, + "step": 3760 + }, + { + "epoch": 0.43139947362398445, + "grad_norm": 0.48430922627449036, + "learning_rate": 3.579154618117946e-05, + "loss": 1.1029, + "num_input_tokens_seen": 30883840, + "step": 3770 + }, + { + "epoch": 0.4325437693099897, + "grad_norm": 0.5668869018554688, + "learning_rate": 3.5699913772852664e-05, + "loss": 1.3753, + "num_input_tokens_seen": 30965760, + "step": 3780 + }, + { + "epoch": 0.43368806499599494, + "grad_norm": 1.2524999380111694, + "learning_rate": 3.560810504121441e-05, + "loss": 1.405, + "num_input_tokens_seen": 31047680, + "step": 3790 + }, + { + "epoch": 0.4348323606820002, + "grad_norm": 0.49741896986961365, + "learning_rate": 3.551612149917593e-05, + "loss": 1.1393, + "num_input_tokens_seen": 31129600, + "step": 3800 + }, + { + "epoch": 0.4359766563680055, + "grad_norm": 0.5412270426750183, + "learning_rate": 3.542396466252913e-05, + "loss": 1.3569, + "num_input_tokens_seen": 31211520, + "step": 3810 + }, + { + "epoch": 0.43712095205401075, + "grad_norm": 2.5141255855560303, + "learning_rate": 3.533163604992163e-05, + "loss": 1.4194, + "num_input_tokens_seen": 31293440, + "step": 3820 + }, + { + "epoch": 0.438265247740016, + "grad_norm": 0.5513648390769958, + "learning_rate": 3.523913718283175e-05, + "loss": 1.2899, + "num_input_tokens_seen": 31375360, + "step": 3830 + }, + { + "epoch": 0.4394095434260213, + "grad_norm": 0.583692729473114, + "learning_rate": 3.514646958554339e-05, + "loss": 1.3624, + "num_input_tokens_seen": 31457280, + "step": 3840 + }, + { + "epoch": 0.44055383911202656, + "grad_norm": 0.5681483149528503, + "learning_rate": 3.5053634785121e-05, + "loss": 1.1517, + "num_input_tokens_seen": 31539200, + "step": 3850 + }, + { + "epoch": 0.4416981347980318, + "grad_norm": 0.4962402880191803, + "learning_rate": 3.496063431138431e-05, + "loss": 1.2125, + "num_input_tokens_seen": 31621120, + "step": 3860 + }, + { + "epoch": 0.4428424304840371, + "grad_norm": 0.4531221389770508, + "learning_rate": 3.4867469696883204e-05, + "loss": 1.287, + "num_input_tokens_seen": 31703040, + "step": 3870 + }, + { + "epoch": 0.4439867261700423, + "grad_norm": 0.4687143564224243, + "learning_rate": 3.477414247687241e-05, + "loss": 1.0745, + "num_input_tokens_seen": 31784960, + "step": 3880 + }, + { + "epoch": 0.4451310218560476, + "grad_norm": 0.5784918665885925, + "learning_rate": 3.468065418928625e-05, + "loss": 1.146, + "num_input_tokens_seen": 31866880, + "step": 3890 + }, + { + "epoch": 0.44627531754205285, + "grad_norm": 0.4997071623802185, + "learning_rate": 3.458700637471325e-05, + "loss": 1.0793, + "num_input_tokens_seen": 31948800, + "step": 3900 + }, + { + "epoch": 0.4474196132280581, + "grad_norm": 0.5425757765769958, + "learning_rate": 3.4493200576370776e-05, + "loss": 1.1415, + "num_input_tokens_seen": 32030720, + "step": 3910 + }, + { + "epoch": 0.4485639089140634, + "grad_norm": 0.5466029644012451, + "learning_rate": 3.4399238340079607e-05, + "loss": 1.2075, + "num_input_tokens_seen": 32112640, + "step": 3920 + }, + { + "epoch": 0.44970820460006866, + "grad_norm": 0.5251049995422363, + "learning_rate": 3.4305121214238446e-05, + "loss": 1.1582, + "num_input_tokens_seen": 32194560, + "step": 3930 + }, + { + "epoch": 0.45085250028607393, + "grad_norm": 0.5477258563041687, + "learning_rate": 3.4210850749798415e-05, + "loss": 1.3309, + "num_input_tokens_seen": 32276480, + "step": 3940 + }, + { + "epoch": 0.4519967959720792, + "grad_norm": 0.621071457862854, + "learning_rate": 3.411642850023751e-05, + "loss": 1.2072, + "num_input_tokens_seen": 32358400, + "step": 3950 + }, + { + "epoch": 0.45314109165808447, + "grad_norm": 0.5287960767745972, + "learning_rate": 3.402185602153495e-05, + "loss": 1.1731, + "num_input_tokens_seen": 32440320, + "step": 3960 + }, + { + "epoch": 0.4542853873440897, + "grad_norm": 0.5123234987258911, + "learning_rate": 3.392713487214561e-05, + "loss": 1.1238, + "num_input_tokens_seen": 32522240, + "step": 3970 + }, + { + "epoch": 0.45542968303009496, + "grad_norm": 0.5226893424987793, + "learning_rate": 3.38322666129743e-05, + "loss": 1.1262, + "num_input_tokens_seen": 32604160, + "step": 3980 + }, + { + "epoch": 0.4565739787161002, + "grad_norm": 0.6006979942321777, + "learning_rate": 3.373725280735e-05, + "loss": 1.2084, + "num_input_tokens_seen": 32686080, + "step": 3990 + }, + { + "epoch": 0.4577182744021055, + "grad_norm": 0.475813627243042, + "learning_rate": 3.3642095021000184e-05, + "loss": 1.2836, + "num_input_tokens_seen": 32768000, + "step": 4000 + }, + { + "epoch": 0.45886257008811077, + "grad_norm": 0.5431310534477234, + "learning_rate": 3.3546794822024976e-05, + "loss": 1.295, + "num_input_tokens_seen": 32849920, + "step": 4010 + }, + { + "epoch": 0.46000686577411604, + "grad_norm": 0.48410120606422424, + "learning_rate": 3.3451353780871286e-05, + "loss": 1.3341, + "num_input_tokens_seen": 32931840, + "step": 4020 + }, + { + "epoch": 0.4611511614601213, + "grad_norm": 0.525595486164093, + "learning_rate": 3.335577347030697e-05, + "loss": 1.2892, + "num_input_tokens_seen": 33013760, + "step": 4030 + }, + { + "epoch": 0.4622954571461266, + "grad_norm": 0.5761798620223999, + "learning_rate": 3.32600554653949e-05, + "loss": 1.5631, + "num_input_tokens_seen": 33095680, + "step": 4040 + }, + { + "epoch": 0.46343975283213185, + "grad_norm": 0.48856690526008606, + "learning_rate": 3.316420134346701e-05, + "loss": 0.9705, + "num_input_tokens_seen": 33177600, + "step": 4050 + }, + { + "epoch": 0.46458404851813706, + "grad_norm": 0.7112519145011902, + "learning_rate": 3.306821268409827e-05, + "loss": 1.3763, + "num_input_tokens_seen": 33259520, + "step": 4060 + }, + { + "epoch": 0.46572834420414233, + "grad_norm": 0.9292272925376892, + "learning_rate": 3.297209106908072e-05, + "loss": 1.2818, + "num_input_tokens_seen": 33341440, + "step": 4070 + }, + { + "epoch": 0.4668726398901476, + "grad_norm": 0.7249779105186462, + "learning_rate": 3.287583808239735e-05, + "loss": 1.1623, + "num_input_tokens_seen": 33423360, + "step": 4080 + }, + { + "epoch": 0.46801693557615287, + "grad_norm": 0.5191545486450195, + "learning_rate": 3.277945531019601e-05, + "loss": 1.5152, + "num_input_tokens_seen": 33505280, + "step": 4090 + }, + { + "epoch": 0.46916123126215814, + "grad_norm": 0.5034803748130798, + "learning_rate": 3.268294434076332e-05, + "loss": 1.5291, + "num_input_tokens_seen": 33587200, + "step": 4100 + }, + { + "epoch": 0.4703055269481634, + "grad_norm": 0.5027021765708923, + "learning_rate": 3.2586306764498395e-05, + "loss": 1.1453, + "num_input_tokens_seen": 33669120, + "step": 4110 + }, + { + "epoch": 0.4714498226341687, + "grad_norm": 0.6401455998420715, + "learning_rate": 3.2489544173886745e-05, + "loss": 1.398, + "num_input_tokens_seen": 33751040, + "step": 4120 + }, + { + "epoch": 0.47259411832017395, + "grad_norm": 0.34582868218421936, + "learning_rate": 3.239265816347397e-05, + "loss": 1.0819, + "num_input_tokens_seen": 33832960, + "step": 4130 + }, + { + "epoch": 0.4737384140061792, + "grad_norm": 0.5344799757003784, + "learning_rate": 3.2295650329839474e-05, + "loss": 1.3502, + "num_input_tokens_seen": 33914880, + "step": 4140 + }, + { + "epoch": 0.47488270969218443, + "grad_norm": 0.48560017347335815, + "learning_rate": 3.219852227157022e-05, + "loss": 1.1419, + "num_input_tokens_seen": 33996800, + "step": 4150 + }, + { + "epoch": 0.4760270053781897, + "grad_norm": 0.6051056981086731, + "learning_rate": 3.210127558923434e-05, + "loss": 1.3278, + "num_input_tokens_seen": 34078720, + "step": 4160 + }, + { + "epoch": 0.477171301064195, + "grad_norm": 0.47563403844833374, + "learning_rate": 3.200391188535472e-05, + "loss": 1.2711, + "num_input_tokens_seen": 34160640, + "step": 4170 + }, + { + "epoch": 0.47831559675020024, + "grad_norm": 0.5107710361480713, + "learning_rate": 3.1906432764382695e-05, + "loss": 1.4901, + "num_input_tokens_seen": 34242560, + "step": 4180 + }, + { + "epoch": 0.4794598924362055, + "grad_norm": 0.5463585257530212, + "learning_rate": 3.1808839832671523e-05, + "loss": 1.4351, + "num_input_tokens_seen": 34324480, + "step": 4190 + }, + { + "epoch": 0.4806041881222108, + "grad_norm": 0.4871821403503418, + "learning_rate": 3.1711134698449946e-05, + "loss": 1.1299, + "num_input_tokens_seen": 34406400, + "step": 4200 + }, + { + "epoch": 0.48174848380821605, + "grad_norm": 0.5575620532035828, + "learning_rate": 3.161331897179568e-05, + "loss": 1.3597, + "num_input_tokens_seen": 34488320, + "step": 4210 + }, + { + "epoch": 0.4828927794942213, + "grad_norm": 0.5072765350341797, + "learning_rate": 3.151539426460892e-05, + "loss": 1.4977, + "num_input_tokens_seen": 34570240, + "step": 4220 + }, + { + "epoch": 0.4840370751802266, + "grad_norm": 0.517631471157074, + "learning_rate": 3.14173621905857e-05, + "loss": 1.1672, + "num_input_tokens_seen": 34652160, + "step": 4230 + }, + { + "epoch": 0.4851813708662318, + "grad_norm": 0.5261096954345703, + "learning_rate": 3.1319224365191366e-05, + "loss": 1.495, + "num_input_tokens_seen": 34734080, + "step": 4240 + }, + { + "epoch": 0.4863256665522371, + "grad_norm": 0.5559542179107666, + "learning_rate": 3.122098240563396e-05, + "loss": 1.6386, + "num_input_tokens_seen": 34816000, + "step": 4250 + }, + { + "epoch": 0.48746996223824235, + "grad_norm": 0.8691073060035706, + "learning_rate": 3.1122637930837486e-05, + "loss": 1.2742, + "num_input_tokens_seen": 34897920, + "step": 4260 + }, + { + "epoch": 0.4886142579242476, + "grad_norm": 0.5249228477478027, + "learning_rate": 3.102419256141536e-05, + "loss": 1.2173, + "num_input_tokens_seen": 34979840, + "step": 4270 + }, + { + "epoch": 0.4897585536102529, + "grad_norm": 0.5527949333190918, + "learning_rate": 3.092564791964358e-05, + "loss": 1.187, + "num_input_tokens_seen": 35061760, + "step": 4280 + }, + { + "epoch": 0.49090284929625816, + "grad_norm": 0.48992666602134705, + "learning_rate": 3.082700562943409e-05, + "loss": 1.5849, + "num_input_tokens_seen": 35143680, + "step": 4290 + }, + { + "epoch": 0.49204714498226343, + "grad_norm": 0.4628704786300659, + "learning_rate": 3.0728267316307945e-05, + "loss": 1.2716, + "num_input_tokens_seen": 35225600, + "step": 4300 + }, + { + "epoch": 0.4931914406682687, + "grad_norm": 1.0824905633926392, + "learning_rate": 3.062943460736857e-05, + "loss": 1.2538, + "num_input_tokens_seen": 35307520, + "step": 4310 + }, + { + "epoch": 0.49433573635427397, + "grad_norm": 0.5195997953414917, + "learning_rate": 3.0530509131274935e-05, + "loss": 1.1509, + "num_input_tokens_seen": 35389440, + "step": 4320 + }, + { + "epoch": 0.4954800320402792, + "grad_norm": 0.5249128937721252, + "learning_rate": 3.04314925182147e-05, + "loss": 1.1693, + "num_input_tokens_seen": 35471360, + "step": 4330 + }, + { + "epoch": 0.49662432772628445, + "grad_norm": 0.5838843584060669, + "learning_rate": 3.03323863998774e-05, + "loss": 1.2192, + "num_input_tokens_seen": 35553280, + "step": 4340 + }, + { + "epoch": 0.4977686234122897, + "grad_norm": 0.5815695524215698, + "learning_rate": 3.0233192409427492e-05, + "loss": 1.2383, + "num_input_tokens_seen": 35635200, + "step": 4350 + }, + { + "epoch": 0.498912919098295, + "grad_norm": 0.47992345690727234, + "learning_rate": 3.0133912181477475e-05, + "loss": 1.1951, + "num_input_tokens_seen": 35717120, + "step": 4360 + }, + { + "epoch": 0.5000572147843002, + "grad_norm": 0.5813825726509094, + "learning_rate": 3.003454735206097e-05, + "loss": 1.4106, + "num_input_tokens_seen": 35799040, + "step": 4370 + }, + { + "epoch": 0.5012015104703055, + "grad_norm": 0.5316118597984314, + "learning_rate": 2.9935099558605728e-05, + "loss": 1.2978, + "num_input_tokens_seen": 35880960, + "step": 4380 + }, + { + "epoch": 0.5023458061563107, + "grad_norm": 0.5098001956939697, + "learning_rate": 2.9835570439906657e-05, + "loss": 1.1646, + "num_input_tokens_seen": 35962880, + "step": 4390 + }, + { + "epoch": 0.503490101842316, + "grad_norm": 0.482377290725708, + "learning_rate": 2.973596163609883e-05, + "loss": 1.2874, + "num_input_tokens_seen": 36044800, + "step": 4400 + }, + { + "epoch": 0.5046343975283213, + "grad_norm": 0.5183489322662354, + "learning_rate": 2.9636274788630437e-05, + "loss": 1.2198, + "num_input_tokens_seen": 36126720, + "step": 4410 + }, + { + "epoch": 0.5057786932143266, + "grad_norm": 0.4759490191936493, + "learning_rate": 2.9536511540235744e-05, + "loss": 1.2435, + "num_input_tokens_seen": 36208640, + "step": 4420 + }, + { + "epoch": 0.5069229889003318, + "grad_norm": 0.5005761384963989, + "learning_rate": 2.9436673534908044e-05, + "loss": 1.367, + "num_input_tokens_seen": 36290560, + "step": 4430 + }, + { + "epoch": 0.5080672845863371, + "grad_norm": 0.48932281136512756, + "learning_rate": 2.9336762417872516e-05, + "loss": 1.1908, + "num_input_tokens_seen": 36372480, + "step": 4440 + }, + { + "epoch": 0.5092115802723424, + "grad_norm": 0.49353188276290894, + "learning_rate": 2.9236779835559165e-05, + "loss": 1.2709, + "num_input_tokens_seen": 36454400, + "step": 4450 + }, + { + "epoch": 0.5103558759583476, + "grad_norm": 0.5292254686355591, + "learning_rate": 2.913672743557565e-05, + "loss": 1.1411, + "num_input_tokens_seen": 36536320, + "step": 4460 + }, + { + "epoch": 0.5115001716443529, + "grad_norm": 0.4894375801086426, + "learning_rate": 2.9036606866680187e-05, + "loss": 1.1795, + "num_input_tokens_seen": 36618240, + "step": 4470 + }, + { + "epoch": 0.5126444673303582, + "grad_norm": 0.7328277826309204, + "learning_rate": 2.8936419778754294e-05, + "loss": 1.4518, + "num_input_tokens_seen": 36700160, + "step": 4480 + }, + { + "epoch": 0.5137887630163634, + "grad_norm": 0.5245153903961182, + "learning_rate": 2.883616782277569e-05, + "loss": 1.1565, + "num_input_tokens_seen": 36782080, + "step": 4490 + }, + { + "epoch": 0.5149330587023687, + "grad_norm": 0.5825649499893188, + "learning_rate": 2.8735852650791035e-05, + "loss": 1.1245, + "num_input_tokens_seen": 36864000, + "step": 4500 + }, + { + "epoch": 0.516077354388374, + "grad_norm": 0.4824535548686981, + "learning_rate": 2.8635475915888732e-05, + "loss": 1.2498, + "num_input_tokens_seen": 36945920, + "step": 4510 + }, + { + "epoch": 0.5172216500743793, + "grad_norm": 0.51859050989151, + "learning_rate": 2.853503927217167e-05, + "loss": 1.222, + "num_input_tokens_seen": 37027840, + "step": 4520 + }, + { + "epoch": 0.5183659457603845, + "grad_norm": 0.4794926345348358, + "learning_rate": 2.8434544374729965e-05, + "loss": 1.3003, + "num_input_tokens_seen": 37109760, + "step": 4530 + }, + { + "epoch": 0.5195102414463898, + "grad_norm": 0.8347005248069763, + "learning_rate": 2.8333992879613712e-05, + "loss": 1.1049, + "num_input_tokens_seen": 37191680, + "step": 4540 + }, + { + "epoch": 0.520654537132395, + "grad_norm": 0.5049195289611816, + "learning_rate": 2.823338644380566e-05, + "loss": 1.2885, + "num_input_tokens_seen": 37273600, + "step": 4550 + }, + { + "epoch": 0.5217988328184002, + "grad_norm": 0.6773023009300232, + "learning_rate": 2.8132726725193926e-05, + "loss": 1.3466, + "num_input_tokens_seen": 37355520, + "step": 4560 + }, + { + "epoch": 0.5229431285044055, + "grad_norm": 0.4892396628856659, + "learning_rate": 2.803201538254467e-05, + "loss": 1.3192, + "num_input_tokens_seen": 37437440, + "step": 4570 + }, + { + "epoch": 0.5240874241904108, + "grad_norm": 0.5198492407798767, + "learning_rate": 2.7931254075474768e-05, + "loss": 1.2215, + "num_input_tokens_seen": 37519360, + "step": 4580 + }, + { + "epoch": 0.525231719876416, + "grad_norm": 0.4889214336872101, + "learning_rate": 2.7830444464424466e-05, + "loss": 1.1609, + "num_input_tokens_seen": 37601280, + "step": 4590 + }, + { + "epoch": 0.5263760155624213, + "grad_norm": 0.512380838394165, + "learning_rate": 2.772958821062997e-05, + "loss": 1.0642, + "num_input_tokens_seen": 37683200, + "step": 4600 + }, + { + "epoch": 0.5275203112484266, + "grad_norm": 0.792269766330719, + "learning_rate": 2.7628686976096164e-05, + "loss": 1.1454, + "num_input_tokens_seen": 37765120, + "step": 4610 + }, + { + "epoch": 0.5286646069344318, + "grad_norm": 0.5080438256263733, + "learning_rate": 2.7527742423569124e-05, + "loss": 1.1747, + "num_input_tokens_seen": 37847040, + "step": 4620 + }, + { + "epoch": 0.5298089026204371, + "grad_norm": 0.494386225938797, + "learning_rate": 2.7426756216508776e-05, + "loss": 1.1861, + "num_input_tokens_seen": 37928960, + "step": 4630 + }, + { + "epoch": 0.5309531983064424, + "grad_norm": 0.7930368185043335, + "learning_rate": 2.7325730019061474e-05, + "loss": 1.3794, + "num_input_tokens_seen": 38010880, + "step": 4640 + }, + { + "epoch": 0.5320974939924477, + "grad_norm": 0.5523006319999695, + "learning_rate": 2.7224665496032565e-05, + "loss": 1.1395, + "num_input_tokens_seen": 38092800, + "step": 4650 + }, + { + "epoch": 0.5332417896784529, + "grad_norm": 0.6003334522247314, + "learning_rate": 2.712356431285896e-05, + "loss": 1.1517, + "num_input_tokens_seen": 38174720, + "step": 4660 + }, + { + "epoch": 0.5343860853644582, + "grad_norm": 0.5045771598815918, + "learning_rate": 2.70224281355817e-05, + "loss": 1.2753, + "num_input_tokens_seen": 38256640, + "step": 4670 + }, + { + "epoch": 0.5355303810504635, + "grad_norm": 0.5754336714744568, + "learning_rate": 2.6921258630818475e-05, + "loss": 1.1533, + "num_input_tokens_seen": 38338560, + "step": 4680 + }, + { + "epoch": 0.5366746767364687, + "grad_norm": 0.5257157683372498, + "learning_rate": 2.6820057465736197e-05, + "loss": 1.1615, + "num_input_tokens_seen": 38420480, + "step": 4690 + }, + { + "epoch": 0.537818972422474, + "grad_norm": 0.4963821768760681, + "learning_rate": 2.6718826308023487e-05, + "loss": 1.1366, + "num_input_tokens_seen": 38502400, + "step": 4700 + }, + { + "epoch": 0.5389632681084793, + "grad_norm": 0.5854734778404236, + "learning_rate": 2.6617566825863237e-05, + "loss": 1.129, + "num_input_tokens_seen": 38584320, + "step": 4710 + }, + { + "epoch": 0.5401075637944845, + "grad_norm": 0.5400436520576477, + "learning_rate": 2.651628068790507e-05, + "loss": 1.0386, + "num_input_tokens_seen": 38666240, + "step": 4720 + }, + { + "epoch": 0.5412518594804897, + "grad_norm": 1.165490984916687, + "learning_rate": 2.6414969563237874e-05, + "loss": 1.0903, + "num_input_tokens_seen": 38748160, + "step": 4730 + }, + { + "epoch": 0.542396155166495, + "grad_norm": 1.1871403455734253, + "learning_rate": 2.6313635121362322e-05, + "loss": 1.2361, + "num_input_tokens_seen": 38830080, + "step": 4740 + }, + { + "epoch": 0.5435404508525002, + "grad_norm": 0.5504677295684814, + "learning_rate": 2.6212279032163283e-05, + "loss": 1.5279, + "num_input_tokens_seen": 38912000, + "step": 4750 + }, + { + "epoch": 0.5446847465385055, + "grad_norm": 0.9438173174858093, + "learning_rate": 2.6110902965882383e-05, + "loss": 1.3035, + "num_input_tokens_seen": 38993920, + "step": 4760 + }, + { + "epoch": 0.5458290422245108, + "grad_norm": 0.5121588110923767, + "learning_rate": 2.6009508593090448e-05, + "loss": 1.2656, + "num_input_tokens_seen": 39075840, + "step": 4770 + }, + { + "epoch": 0.546973337910516, + "grad_norm": 0.5496011972427368, + "learning_rate": 2.590809758465995e-05, + "loss": 1.3347, + "num_input_tokens_seen": 39157760, + "step": 4780 + }, + { + "epoch": 0.5481176335965213, + "grad_norm": 0.5163994431495667, + "learning_rate": 2.580667161173753e-05, + "loss": 1.0816, + "num_input_tokens_seen": 39239680, + "step": 4790 + }, + { + "epoch": 0.5492619292825266, + "grad_norm": 0.547443687915802, + "learning_rate": 2.570523234571642e-05, + "loss": 1.072, + "num_input_tokens_seen": 39321600, + "step": 4800 + }, + { + "epoch": 0.5504062249685319, + "grad_norm": 0.49614182114601135, + "learning_rate": 2.5603781458208885e-05, + "loss": 1.105, + "num_input_tokens_seen": 39403520, + "step": 4810 + }, + { + "epoch": 0.5515505206545371, + "grad_norm": 0.6012304425239563, + "learning_rate": 2.5502320621018732e-05, + "loss": 1.1652, + "num_input_tokens_seen": 39485440, + "step": 4820 + }, + { + "epoch": 0.5526948163405424, + "grad_norm": 1.310507893562317, + "learning_rate": 2.5400851506113728e-05, + "loss": 1.2073, + "num_input_tokens_seen": 39567360, + "step": 4830 + }, + { + "epoch": 0.5538391120265477, + "grad_norm": 0.5216242671012878, + "learning_rate": 2.5299375785598005e-05, + "loss": 1.2457, + "num_input_tokens_seen": 39649280, + "step": 4840 + }, + { + "epoch": 0.554983407712553, + "grad_norm": 0.4636791944503784, + "learning_rate": 2.519789513168459e-05, + "loss": 1.3313, + "num_input_tokens_seen": 39731200, + "step": 4850 + }, + { + "epoch": 0.5561277033985582, + "grad_norm": 0.4918244779109955, + "learning_rate": 2.509641121666781e-05, + "loss": 1.1312, + "num_input_tokens_seen": 39813120, + "step": 4860 + }, + { + "epoch": 0.5572719990845635, + "grad_norm": 0.5367238521575928, + "learning_rate": 2.4994925712895697e-05, + "loss": 1.1891, + "num_input_tokens_seen": 39895040, + "step": 4870 + }, + { + "epoch": 0.5584162947705688, + "grad_norm": 0.45426082611083984, + "learning_rate": 2.489344029274249e-05, + "loss": 1.4017, + "num_input_tokens_seen": 39976960, + "step": 4880 + }, + { + "epoch": 0.559560590456574, + "grad_norm": 1.55329167842865, + "learning_rate": 2.479195662858105e-05, + "loss": 1.1554, + "num_input_tokens_seen": 40058880, + "step": 4890 + }, + { + "epoch": 0.5607048861425793, + "grad_norm": 0.5056139230728149, + "learning_rate": 2.4690476392755298e-05, + "loss": 1.3402, + "num_input_tokens_seen": 40140800, + "step": 4900 + }, + { + "epoch": 0.5618491818285845, + "grad_norm": 0.45648014545440674, + "learning_rate": 2.4589001257552637e-05, + "loss": 1.288, + "num_input_tokens_seen": 40222720, + "step": 4910 + }, + { + "epoch": 0.5629934775145897, + "grad_norm": 0.5516757369041443, + "learning_rate": 2.4487532895176457e-05, + "loss": 1.2969, + "num_input_tokens_seen": 40304640, + "step": 4920 + }, + { + "epoch": 0.564137773200595, + "grad_norm": 0.5544970631599426, + "learning_rate": 2.4386072977718503e-05, + "loss": 1.346, + "num_input_tokens_seen": 40386560, + "step": 4930 + }, + { + "epoch": 0.5652820688866003, + "grad_norm": 0.48032233119010925, + "learning_rate": 2.4284623177131395e-05, + "loss": 1.2043, + "num_input_tokens_seen": 40468480, + "step": 4940 + }, + { + "epoch": 0.5664263645726055, + "grad_norm": 1.1765650510787964, + "learning_rate": 2.4183185165200998e-05, + "loss": 1.1418, + "num_input_tokens_seen": 40550400, + "step": 4950 + }, + { + "epoch": 0.5675706602586108, + "grad_norm": 0.5061176419258118, + "learning_rate": 2.4081760613518924e-05, + "loss": 1.0886, + "num_input_tokens_seen": 40632320, + "step": 4960 + }, + { + "epoch": 0.5687149559446161, + "grad_norm": 1.4397417306900024, + "learning_rate": 2.3980351193455e-05, + "loss": 1.3159, + "num_input_tokens_seen": 40714240, + "step": 4970 + }, + { + "epoch": 0.5698592516306213, + "grad_norm": 0.46491971611976624, + "learning_rate": 2.3878958576129664e-05, + "loss": 1.2942, + "num_input_tokens_seen": 40796160, + "step": 4980 + }, + { + "epoch": 0.5710035473166266, + "grad_norm": 0.5516128540039062, + "learning_rate": 2.3777584432386474e-05, + "loss": 1.3472, + "num_input_tokens_seen": 40878080, + "step": 4990 + }, + { + "epoch": 0.5721478430026319, + "grad_norm": 0.4662507176399231, + "learning_rate": 2.367623043276459e-05, + "loss": 1.1461, + "num_input_tokens_seen": 40960000, + "step": 5000 + }, + { + "epoch": 0.5732921386886372, + "grad_norm": 0.4610395133495331, + "learning_rate": 2.3574898247471167e-05, + "loss": 0.997, + "num_input_tokens_seen": 41041920, + "step": 5010 + }, + { + "epoch": 0.5744364343746424, + "grad_norm": 0.6839539408683777, + "learning_rate": 2.347358954635393e-05, + "loss": 1.2608, + "num_input_tokens_seen": 41123840, + "step": 5020 + }, + { + "epoch": 0.5755807300606477, + "grad_norm": 0.5418996810913086, + "learning_rate": 2.337230599887358e-05, + "loss": 1.1886, + "num_input_tokens_seen": 41205760, + "step": 5030 + }, + { + "epoch": 0.576725025746653, + "grad_norm": 0.5929667949676514, + "learning_rate": 2.327104927407634e-05, + "loss": 1.2545, + "num_input_tokens_seen": 41287680, + "step": 5040 + }, + { + "epoch": 0.5778693214326582, + "grad_norm": 0.49478694796562195, + "learning_rate": 2.3169821040566387e-05, + "loss": 1.0687, + "num_input_tokens_seen": 41369600, + "step": 5050 + }, + { + "epoch": 0.5790136171186635, + "grad_norm": 0.48581549525260925, + "learning_rate": 2.306862296647841e-05, + "loss": 1.1071, + "num_input_tokens_seen": 41451520, + "step": 5060 + }, + { + "epoch": 0.5801579128046688, + "grad_norm": 1.5436527729034424, + "learning_rate": 2.2967456719450127e-05, + "loss": 1.0824, + "num_input_tokens_seen": 41533440, + "step": 5070 + }, + { + "epoch": 0.581302208490674, + "grad_norm": 0.49023711681365967, + "learning_rate": 2.2866323966594736e-05, + "loss": 1.451, + "num_input_tokens_seen": 41615360, + "step": 5080 + }, + { + "epoch": 0.5824465041766792, + "grad_norm": 0.48039644956588745, + "learning_rate": 2.2765226374473504e-05, + "loss": 1.2207, + "num_input_tokens_seen": 41697280, + "step": 5090 + }, + { + "epoch": 0.5835907998626845, + "grad_norm": 0.46668538451194763, + "learning_rate": 2.2664165609068304e-05, + "loss": 1.1504, + "num_input_tokens_seen": 41779200, + "step": 5100 + }, + { + "epoch": 0.5847350955486897, + "grad_norm": 0.418855220079422, + "learning_rate": 2.2563143335754118e-05, + "loss": 1.1436, + "num_input_tokens_seen": 41861120, + "step": 5110 + }, + { + "epoch": 0.585879391234695, + "grad_norm": 0.6656162142753601, + "learning_rate": 2.2462161219271622e-05, + "loss": 1.5087, + "num_input_tokens_seen": 41943040, + "step": 5120 + }, + { + "epoch": 0.5870236869207003, + "grad_norm": 0.5571346282958984, + "learning_rate": 2.236122092369977e-05, + "loss": 1.2635, + "num_input_tokens_seen": 42024960, + "step": 5130 + }, + { + "epoch": 0.5881679826067056, + "grad_norm": 0.5325060486793518, + "learning_rate": 2.2260324112428336e-05, + "loss": 1.4867, + "num_input_tokens_seen": 42106880, + "step": 5140 + }, + { + "epoch": 0.5893122782927108, + "grad_norm": 0.48673689365386963, + "learning_rate": 2.2159472448130513e-05, + "loss": 1.2188, + "num_input_tokens_seen": 42188800, + "step": 5150 + }, + { + "epoch": 0.5904565739787161, + "grad_norm": 0.5229732394218445, + "learning_rate": 2.2058667592735532e-05, + "loss": 1.3696, + "num_input_tokens_seen": 42270720, + "step": 5160 + }, + { + "epoch": 0.5916008696647214, + "grad_norm": 0.6097173094749451, + "learning_rate": 2.1957911207401267e-05, + "loss": 1.6279, + "num_input_tokens_seen": 42352640, + "step": 5170 + }, + { + "epoch": 0.5927451653507266, + "grad_norm": 0.5959046483039856, + "learning_rate": 2.1857204952486824e-05, + "loss": 1.521, + "num_input_tokens_seen": 42434560, + "step": 5180 + }, + { + "epoch": 0.5938894610367319, + "grad_norm": 0.7557221055030823, + "learning_rate": 2.1756550487525247e-05, + "loss": 1.3385, + "num_input_tokens_seen": 42516480, + "step": 5190 + }, + { + "epoch": 0.5950337567227372, + "grad_norm": 0.48483288288116455, + "learning_rate": 2.165594947119613e-05, + "loss": 1.1232, + "num_input_tokens_seen": 42598400, + "step": 5200 + }, + { + "epoch": 0.5961780524087424, + "grad_norm": 0.48301464319229126, + "learning_rate": 2.1555403561298287e-05, + "loss": 1.3692, + "num_input_tokens_seen": 42680320, + "step": 5210 + }, + { + "epoch": 0.5973223480947477, + "grad_norm": 0.4715411961078644, + "learning_rate": 2.1454914414722417e-05, + "loss": 1.2656, + "num_input_tokens_seen": 42762240, + "step": 5220 + }, + { + "epoch": 0.598466643780753, + "grad_norm": 0.4841952323913574, + "learning_rate": 2.135448368742385e-05, + "loss": 1.2838, + "num_input_tokens_seen": 42844160, + "step": 5230 + }, + { + "epoch": 0.5996109394667583, + "grad_norm": 0.4453716278076172, + "learning_rate": 2.1254113034395212e-05, + "loss": 1.2797, + "num_input_tokens_seen": 42926080, + "step": 5240 + }, + { + "epoch": 0.6007552351527635, + "grad_norm": 1.0583953857421875, + "learning_rate": 2.1153804109639157e-05, + "loss": 1.2836, + "num_input_tokens_seen": 43008000, + "step": 5250 + }, + { + "epoch": 0.6018995308387688, + "grad_norm": 0.5487982630729675, + "learning_rate": 2.105355856614115e-05, + "loss": 1.1154, + "num_input_tokens_seen": 43089920, + "step": 5260 + }, + { + "epoch": 0.603043826524774, + "grad_norm": 0.5370051264762878, + "learning_rate": 2.0953378055842183e-05, + "loss": 1.3152, + "num_input_tokens_seen": 43171840, + "step": 5270 + }, + { + "epoch": 0.6041881222107792, + "grad_norm": 0.574317455291748, + "learning_rate": 2.0853264229611557e-05, + "loss": 0.9663, + "num_input_tokens_seen": 43253760, + "step": 5280 + }, + { + "epoch": 0.6053324178967845, + "grad_norm": 0.48023343086242676, + "learning_rate": 2.075321873721972e-05, + "loss": 1.0387, + "num_input_tokens_seen": 43335680, + "step": 5290 + }, + { + "epoch": 0.6064767135827898, + "grad_norm": 0.5434787273406982, + "learning_rate": 2.0653243227311014e-05, + "loss": 1.3762, + "num_input_tokens_seen": 43417600, + "step": 5300 + }, + { + "epoch": 0.607621009268795, + "grad_norm": 0.5317404866218567, + "learning_rate": 2.0553339347376592e-05, + "loss": 1.187, + "num_input_tokens_seen": 43499520, + "step": 5310 + }, + { + "epoch": 0.6087653049548003, + "grad_norm": 0.4761092960834503, + "learning_rate": 2.045350874372717e-05, + "loss": 1.0539, + "num_input_tokens_seen": 43581440, + "step": 5320 + }, + { + "epoch": 0.6099096006408056, + "grad_norm": 0.45552459359169006, + "learning_rate": 2.0353753061465972e-05, + "loss": 1.3189, + "num_input_tokens_seen": 43663360, + "step": 5330 + }, + { + "epoch": 0.6110538963268108, + "grad_norm": 0.6579371690750122, + "learning_rate": 2.0254073944461603e-05, + "loss": 1.2721, + "num_input_tokens_seen": 43745280, + "step": 5340 + }, + { + "epoch": 0.6121981920128161, + "grad_norm": 0.5771154761314392, + "learning_rate": 2.0154473035320936e-05, + "loss": 1.1475, + "num_input_tokens_seen": 43827200, + "step": 5350 + }, + { + "epoch": 0.6133424876988214, + "grad_norm": 0.500095009803772, + "learning_rate": 2.0054951975362067e-05, + "loss": 1.2035, + "num_input_tokens_seen": 43909120, + "step": 5360 + }, + { + "epoch": 0.6144867833848267, + "grad_norm": 0.46520867943763733, + "learning_rate": 1.995551240458728e-05, + "loss": 1.1699, + "num_input_tokens_seen": 43991040, + "step": 5370 + }, + { + "epoch": 0.6156310790708319, + "grad_norm": 0.46139946579933167, + "learning_rate": 1.985615596165597e-05, + "loss": 1.2418, + "num_input_tokens_seen": 44072960, + "step": 5380 + }, + { + "epoch": 0.6167753747568372, + "grad_norm": 0.49758806824684143, + "learning_rate": 1.9756884283857685e-05, + "loss": 1.4331, + "num_input_tokens_seen": 44154880, + "step": 5390 + }, + { + "epoch": 0.6179196704428425, + "grad_norm": 0.48119744658470154, + "learning_rate": 1.965769900708515e-05, + "loss": 1.3412, + "num_input_tokens_seen": 44236800, + "step": 5400 + }, + { + "epoch": 0.6190639661288477, + "grad_norm": 0.5523199439048767, + "learning_rate": 1.955860176580729e-05, + "loss": 1.1733, + "num_input_tokens_seen": 44318720, + "step": 5410 + }, + { + "epoch": 0.620208261814853, + "grad_norm": 0.5241261124610901, + "learning_rate": 1.945959419304226e-05, + "loss": 1.2493, + "num_input_tokens_seen": 44400640, + "step": 5420 + }, + { + "epoch": 0.6213525575008583, + "grad_norm": 0.5371418595314026, + "learning_rate": 1.936067792033061e-05, + "loss": 1.2361, + "num_input_tokens_seen": 44482560, + "step": 5430 + }, + { + "epoch": 0.6224968531868634, + "grad_norm": 0.5102439522743225, + "learning_rate": 1.9261854577708366e-05, + "loss": 1.2999, + "num_input_tokens_seen": 44564480, + "step": 5440 + }, + { + "epoch": 0.6236411488728687, + "grad_norm": 0.9786542654037476, + "learning_rate": 1.9163125793680125e-05, + "loss": 1.4174, + "num_input_tokens_seen": 44646400, + "step": 5450 + }, + { + "epoch": 0.624785444558874, + "grad_norm": 0.5264445543289185, + "learning_rate": 1.9064493195192293e-05, + "loss": 1.0868, + "num_input_tokens_seen": 44728320, + "step": 5460 + }, + { + "epoch": 0.6259297402448792, + "grad_norm": 0.5219578742980957, + "learning_rate": 1.8965958407606236e-05, + "loss": 1.1703, + "num_input_tokens_seen": 44810240, + "step": 5470 + }, + { + "epoch": 0.6270740359308845, + "grad_norm": 0.4690404534339905, + "learning_rate": 1.8867523054671475e-05, + "loss": 1.2942, + "num_input_tokens_seen": 44892160, + "step": 5480 + }, + { + "epoch": 0.6282183316168898, + "grad_norm": 0.5699681043624878, + "learning_rate": 1.8769188758498973e-05, + "loss": 1.1778, + "num_input_tokens_seen": 44974080, + "step": 5490 + }, + { + "epoch": 0.629362627302895, + "grad_norm": 0.4668191075325012, + "learning_rate": 1.867095713953439e-05, + "loss": 1.2899, + "num_input_tokens_seen": 45056000, + "step": 5500 + }, + { + "epoch": 0.6305069229889003, + "grad_norm": 1.0437618494033813, + "learning_rate": 1.8572829816531364e-05, + "loss": 1.2486, + "num_input_tokens_seen": 45137920, + "step": 5510 + }, + { + "epoch": 0.6316512186749056, + "grad_norm": 0.4803614318370819, + "learning_rate": 1.847480840652483e-05, + "loss": 1.2433, + "num_input_tokens_seen": 45219840, + "step": 5520 + }, + { + "epoch": 0.6327955143609109, + "grad_norm": 0.5359634160995483, + "learning_rate": 1.8376894524804416e-05, + "loss": 1.2503, + "num_input_tokens_seen": 45301760, + "step": 5530 + }, + { + "epoch": 0.6339398100469161, + "grad_norm": 0.4710131883621216, + "learning_rate": 1.827908978488779e-05, + "loss": 1.3593, + "num_input_tokens_seen": 45383680, + "step": 5540 + }, + { + "epoch": 0.6350841057329214, + "grad_norm": 0.4845348298549652, + "learning_rate": 1.8181395798494048e-05, + "loss": 1.3471, + "num_input_tokens_seen": 45465600, + "step": 5550 + }, + { + "epoch": 0.6362284014189267, + "grad_norm": 0.6170366406440735, + "learning_rate": 1.8083814175517234e-05, + "loss": 1.3265, + "num_input_tokens_seen": 45547520, + "step": 5560 + }, + { + "epoch": 0.6373726971049319, + "grad_norm": 0.5501654148101807, + "learning_rate": 1.798634652399972e-05, + "loss": 1.6024, + "num_input_tokens_seen": 45629440, + "step": 5570 + }, + { + "epoch": 0.6385169927909372, + "grad_norm": 1.0594714879989624, + "learning_rate": 1.7888994450105788e-05, + "loss": 1.2416, + "num_input_tokens_seen": 45711360, + "step": 5580 + }, + { + "epoch": 0.6396612884769425, + "grad_norm": 0.4413740634918213, + "learning_rate": 1.7791759558095077e-05, + "loss": 1.3582, + "num_input_tokens_seen": 45793280, + "step": 5590 + }, + { + "epoch": 0.6408055841629478, + "grad_norm": 0.6288554668426514, + "learning_rate": 1.7694643450296216e-05, + "loss": 1.4037, + "num_input_tokens_seen": 45875200, + "step": 5600 + }, + { + "epoch": 0.641949879848953, + "grad_norm": 0.5537555813789368, + "learning_rate": 1.7597647727080408e-05, + "loss": 1.3859, + "num_input_tokens_seen": 45957120, + "step": 5610 + }, + { + "epoch": 0.6430941755349582, + "grad_norm": 0.5998051762580872, + "learning_rate": 1.7500773986835013e-05, + "loss": 1.0984, + "num_input_tokens_seen": 46039040, + "step": 5620 + }, + { + "epoch": 0.6442384712209634, + "grad_norm": 0.8084359765052795, + "learning_rate": 1.740402382593727e-05, + "loss": 1.3061, + "num_input_tokens_seen": 46120960, + "step": 5630 + }, + { + "epoch": 0.6453827669069687, + "grad_norm": 0.8041592240333557, + "learning_rate": 1.730739883872795e-05, + "loss": 1.0178, + "num_input_tokens_seen": 46202880, + "step": 5640 + }, + { + "epoch": 0.646527062592974, + "grad_norm": 0.5870161652565002, + "learning_rate": 1.7210900617485075e-05, + "loss": 1.2927, + "num_input_tokens_seen": 46284800, + "step": 5650 + }, + { + "epoch": 0.6476713582789793, + "grad_norm": 0.4723074734210968, + "learning_rate": 1.711453075239773e-05, + "loss": 1.205, + "num_input_tokens_seen": 46366720, + "step": 5660 + }, + { + "epoch": 0.6488156539649845, + "grad_norm": 0.9023962020874023, + "learning_rate": 1.7018290831539795e-05, + "loss": 1.378, + "num_input_tokens_seen": 46448640, + "step": 5670 + }, + { + "epoch": 0.6499599496509898, + "grad_norm": 0.7521951794624329, + "learning_rate": 1.6922182440843843e-05, + "loss": 1.0102, + "num_input_tokens_seen": 46530560, + "step": 5680 + }, + { + "epoch": 0.6511042453369951, + "grad_norm": 0.4786534309387207, + "learning_rate": 1.6826207164074924e-05, + "loss": 1.2101, + "num_input_tokens_seen": 46612480, + "step": 5690 + }, + { + "epoch": 0.6522485410230003, + "grad_norm": 1.202210783958435, + "learning_rate": 1.6730366582804535e-05, + "loss": 1.1996, + "num_input_tokens_seen": 46694400, + "step": 5700 + }, + { + "epoch": 0.6533928367090056, + "grad_norm": 0.4971697926521301, + "learning_rate": 1.6634662276384548e-05, + "loss": 1.2356, + "num_input_tokens_seen": 46776320, + "step": 5710 + }, + { + "epoch": 0.6545371323950109, + "grad_norm": 0.5253583788871765, + "learning_rate": 1.6539095821921136e-05, + "loss": 1.2006, + "num_input_tokens_seen": 46858240, + "step": 5720 + }, + { + "epoch": 0.6556814280810161, + "grad_norm": 0.558186411857605, + "learning_rate": 1.6443668794248828e-05, + "loss": 1.2621, + "num_input_tokens_seen": 46940160, + "step": 5730 + }, + { + "epoch": 0.6568257237670214, + "grad_norm": 1.3204737901687622, + "learning_rate": 1.6348382765904567e-05, + "loss": 1.0727, + "num_input_tokens_seen": 47022080, + "step": 5740 + }, + { + "epoch": 0.6579700194530267, + "grad_norm": 0.5247931480407715, + "learning_rate": 1.6253239307101748e-05, + "loss": 1.212, + "num_input_tokens_seen": 47104000, + "step": 5750 + }, + { + "epoch": 0.659114315139032, + "grad_norm": 0.4864051043987274, + "learning_rate": 1.6158239985704378e-05, + "loss": 1.3306, + "num_input_tokens_seen": 47185920, + "step": 5760 + }, + { + "epoch": 0.6602586108250372, + "grad_norm": 0.8225966691970825, + "learning_rate": 1.606338636720125e-05, + "loss": 1.0573, + "num_input_tokens_seen": 47267840, + "step": 5770 + }, + { + "epoch": 0.6614029065110425, + "grad_norm": 1.1698306798934937, + "learning_rate": 1.5968680014680105e-05, + "loss": 1.4235, + "num_input_tokens_seen": 47349760, + "step": 5780 + }, + { + "epoch": 0.6625472021970478, + "grad_norm": 0.4029783010482788, + "learning_rate": 1.5874122488801888e-05, + "loss": 1.1993, + "num_input_tokens_seen": 47431680, + "step": 5790 + }, + { + "epoch": 0.6636914978830529, + "grad_norm": 0.4945356249809265, + "learning_rate": 1.577971534777507e-05, + "loss": 1.1437, + "num_input_tokens_seen": 47513600, + "step": 5800 + }, + { + "epoch": 0.6648357935690582, + "grad_norm": 0.5037471652030945, + "learning_rate": 1.5685460147329917e-05, + "loss": 1.2579, + "num_input_tokens_seen": 47595520, + "step": 5810 + }, + { + "epoch": 0.6659800892550635, + "grad_norm": 0.5378947257995605, + "learning_rate": 1.5591358440692865e-05, + "loss": 1.2693, + "num_input_tokens_seen": 47677440, + "step": 5820 + }, + { + "epoch": 0.6671243849410687, + "grad_norm": 0.49818259477615356, + "learning_rate": 1.5497411778560954e-05, + "loss": 1.1159, + "num_input_tokens_seen": 47759360, + "step": 5830 + }, + { + "epoch": 0.668268680627074, + "grad_norm": 0.4517885744571686, + "learning_rate": 1.5403621709076247e-05, + "loss": 1.1901, + "num_input_tokens_seen": 47841280, + "step": 5840 + }, + { + "epoch": 0.6694129763130793, + "grad_norm": 0.8571302890777588, + "learning_rate": 1.530998977780033e-05, + "loss": 1.4804, + "num_input_tokens_seen": 47923200, + "step": 5850 + }, + { + "epoch": 0.6705572719990845, + "grad_norm": 0.8022615313529968, + "learning_rate": 1.5216517527688818e-05, + "loss": 1.1893, + "num_input_tokens_seen": 48005120, + "step": 5860 + }, + { + "epoch": 0.6717015676850898, + "grad_norm": 0.630002498626709, + "learning_rate": 1.5123206499065967e-05, + "loss": 1.085, + "num_input_tokens_seen": 48087040, + "step": 5870 + }, + { + "epoch": 0.6728458633710951, + "grad_norm": 0.49356332421302795, + "learning_rate": 1.5030058229599275e-05, + "loss": 1.0382, + "num_input_tokens_seen": 48168960, + "step": 5880 + }, + { + "epoch": 0.6739901590571004, + "grad_norm": 0.6504026055335999, + "learning_rate": 1.4937074254274117e-05, + "loss": 1.3036, + "num_input_tokens_seen": 48250880, + "step": 5890 + }, + { + "epoch": 0.6751344547431056, + "grad_norm": 0.4629366993904114, + "learning_rate": 1.4844256105368504e-05, + "loss": 1.3465, + "num_input_tokens_seen": 48332800, + "step": 5900 + }, + { + "epoch": 0.6762787504291109, + "grad_norm": 0.4517896771430969, + "learning_rate": 1.4751605312427786e-05, + "loss": 1.4707, + "num_input_tokens_seen": 48414720, + "step": 5910 + }, + { + "epoch": 0.6774230461151162, + "grad_norm": 1.4166773557662964, + "learning_rate": 1.4659123402239454e-05, + "loss": 1.1061, + "num_input_tokens_seen": 48496640, + "step": 5920 + }, + { + "epoch": 0.6785673418011214, + "grad_norm": 0.7780046463012695, + "learning_rate": 1.4566811898808013e-05, + "loss": 1.2819, + "num_input_tokens_seen": 48578560, + "step": 5930 + }, + { + "epoch": 0.6797116374871267, + "grad_norm": 0.55788654088974, + "learning_rate": 1.4474672323329819e-05, + "loss": 1.4707, + "num_input_tokens_seen": 48660480, + "step": 5940 + }, + { + "epoch": 0.680855933173132, + "grad_norm": 0.5268608331680298, + "learning_rate": 1.4382706194168066e-05, + "loss": 1.4235, + "num_input_tokens_seen": 48742400, + "step": 5950 + }, + { + "epoch": 0.6820002288591372, + "grad_norm": 0.48586344718933105, + "learning_rate": 1.42909150268277e-05, + "loss": 1.2991, + "num_input_tokens_seen": 48824320, + "step": 5960 + }, + { + "epoch": 0.6831445245451425, + "grad_norm": 0.5041683912277222, + "learning_rate": 1.4199300333930515e-05, + "loss": 1.0258, + "num_input_tokens_seen": 48906240, + "step": 5970 + }, + { + "epoch": 0.6842888202311477, + "grad_norm": 0.5228765606880188, + "learning_rate": 1.4107863625190163e-05, + "loss": 1.2446, + "num_input_tokens_seen": 48988160, + "step": 5980 + }, + { + "epoch": 0.685433115917153, + "grad_norm": 0.4611828029155731, + "learning_rate": 1.4016606407387312e-05, + "loss": 1.2735, + "num_input_tokens_seen": 49070080, + "step": 5990 + }, + { + "epoch": 0.6865774116031582, + "grad_norm": 0.543980598449707, + "learning_rate": 1.3925530184344818e-05, + "loss": 1.3767, + "num_input_tokens_seen": 49152000, + "step": 6000 + }, + { + "epoch": 0.6877217072891635, + "grad_norm": 0.5816802978515625, + "learning_rate": 1.3834636456902944e-05, + "loss": 1.4241, + "num_input_tokens_seen": 49233920, + "step": 6010 + }, + { + "epoch": 0.6888660029751688, + "grad_norm": 0.4795803725719452, + "learning_rate": 1.3743926722894579e-05, + "loss": 1.1986, + "num_input_tokens_seen": 49315840, + "step": 6020 + }, + { + "epoch": 0.690010298661174, + "grad_norm": 0.4873317778110504, + "learning_rate": 1.365340247712064e-05, + "loss": 0.9731, + "num_input_tokens_seen": 49397760, + "step": 6030 + }, + { + "epoch": 0.6911545943471793, + "grad_norm": 0.5345495939254761, + "learning_rate": 1.3563065211325349e-05, + "loss": 1.2101, + "num_input_tokens_seen": 49479680, + "step": 6040 + }, + { + "epoch": 0.6922988900331846, + "grad_norm": 0.6827751398086548, + "learning_rate": 1.3472916414171738e-05, + "loss": 1.0661, + "num_input_tokens_seen": 49561600, + "step": 6050 + }, + { + "epoch": 0.6934431857191898, + "grad_norm": 0.533750057220459, + "learning_rate": 1.338295757121703e-05, + "loss": 1.2239, + "num_input_tokens_seen": 49643520, + "step": 6060 + }, + { + "epoch": 0.6945874814051951, + "grad_norm": 0.47301891446113586, + "learning_rate": 1.3293190164888242e-05, + "loss": 1.3588, + "num_input_tokens_seen": 49725440, + "step": 6070 + }, + { + "epoch": 0.6957317770912004, + "grad_norm": 0.4108991324901581, + "learning_rate": 1.3203615674457709e-05, + "loss": 1.1447, + "num_input_tokens_seen": 49807360, + "step": 6080 + }, + { + "epoch": 0.6968760727772056, + "grad_norm": 0.5564691424369812, + "learning_rate": 1.3114235576018686e-05, + "loss": 1.3927, + "num_input_tokens_seen": 49889280, + "step": 6090 + }, + { + "epoch": 0.6980203684632109, + "grad_norm": 0.55669766664505, + "learning_rate": 1.3025051342461087e-05, + "loss": 1.2068, + "num_input_tokens_seen": 49971200, + "step": 6100 + }, + { + "epoch": 0.6991646641492162, + "grad_norm": 0.47335195541381836, + "learning_rate": 1.2936064443447157e-05, + "loss": 1.5996, + "num_input_tokens_seen": 50053120, + "step": 6110 + }, + { + "epoch": 0.7003089598352215, + "grad_norm": 0.5122705698013306, + "learning_rate": 1.2847276345387299e-05, + "loss": 1.0368, + "num_input_tokens_seen": 50135040, + "step": 6120 + }, + { + "epoch": 0.7014532555212267, + "grad_norm": 0.47498294711112976, + "learning_rate": 1.2758688511415848e-05, + "loss": 1.3925, + "num_input_tokens_seen": 50216960, + "step": 6130 + }, + { + "epoch": 0.702597551207232, + "grad_norm": 0.4896914064884186, + "learning_rate": 1.2670302401367035e-05, + "loss": 1.1923, + "num_input_tokens_seen": 50298880, + "step": 6140 + }, + { + "epoch": 0.7037418468932373, + "grad_norm": 0.7128376960754395, + "learning_rate": 1.2582119471750888e-05, + "loss": 1.3158, + "num_input_tokens_seen": 50380800, + "step": 6150 + }, + { + "epoch": 0.7048861425792424, + "grad_norm": 0.4625462293624878, + "learning_rate": 1.2494141175729216e-05, + "loss": 1.1663, + "num_input_tokens_seen": 50462720, + "step": 6160 + }, + { + "epoch": 0.7060304382652477, + "grad_norm": 0.5610952377319336, + "learning_rate": 1.240636896309168e-05, + "loss": 1.1836, + "num_input_tokens_seen": 50544640, + "step": 6170 + }, + { + "epoch": 0.707174733951253, + "grad_norm": 0.8366256952285767, + "learning_rate": 1.2318804280231939e-05, + "loss": 1.3311, + "num_input_tokens_seen": 50626560, + "step": 6180 + }, + { + "epoch": 0.7083190296372582, + "grad_norm": 0.6703715324401855, + "learning_rate": 1.2231448570123732e-05, + "loss": 1.1546, + "num_input_tokens_seen": 50708480, + "step": 6190 + }, + { + "epoch": 0.7094633253232635, + "grad_norm": 0.41081807017326355, + "learning_rate": 1.2144303272297186e-05, + "loss": 1.1482, + "num_input_tokens_seen": 50790400, + "step": 6200 + }, + { + "epoch": 0.7106076210092688, + "grad_norm": 0.6010390520095825, + "learning_rate": 1.2057369822815051e-05, + "loss": 1.2103, + "num_input_tokens_seen": 50872320, + "step": 6210 + }, + { + "epoch": 0.711751916695274, + "grad_norm": 0.5093455910682678, + "learning_rate": 1.1970649654249017e-05, + "loss": 1.5064, + "num_input_tokens_seen": 50954240, + "step": 6220 + }, + { + "epoch": 0.7128962123812793, + "grad_norm": 0.45238783955574036, + "learning_rate": 1.1884144195656133e-05, + "loss": 1.3582, + "num_input_tokens_seen": 51036160, + "step": 6230 + }, + { + "epoch": 0.7140405080672846, + "grad_norm": 0.48569175601005554, + "learning_rate": 1.1797854872555272e-05, + "loss": 1.3876, + "num_input_tokens_seen": 51118080, + "step": 6240 + }, + { + "epoch": 0.7151848037532899, + "grad_norm": 0.5373899340629578, + "learning_rate": 1.171178310690362e-05, + "loss": 1.07, + "num_input_tokens_seen": 51200000, + "step": 6250 + }, + { + "epoch": 0.7163290994392951, + "grad_norm": 0.568634569644928, + "learning_rate": 1.1625930317073221e-05, + "loss": 1.2047, + "num_input_tokens_seen": 51281920, + "step": 6260 + }, + { + "epoch": 0.7174733951253004, + "grad_norm": 0.49342796206474304, + "learning_rate": 1.154029791782765e-05, + "loss": 1.2819, + "num_input_tokens_seen": 51363840, + "step": 6270 + }, + { + "epoch": 0.7186176908113057, + "grad_norm": 0.8063413500785828, + "learning_rate": 1.1454887320298686e-05, + "loss": 1.2803, + "num_input_tokens_seen": 51445760, + "step": 6280 + }, + { + "epoch": 0.7197619864973109, + "grad_norm": 0.5260705947875977, + "learning_rate": 1.1369699931963018e-05, + "loss": 1.1276, + "num_input_tokens_seen": 51527680, + "step": 6290 + }, + { + "epoch": 0.7209062821833162, + "grad_norm": 0.7325502038002014, + "learning_rate": 1.1284737156619096e-05, + "loss": 1.1392, + "num_input_tokens_seen": 51609600, + "step": 6300 + }, + { + "epoch": 0.7220505778693215, + "grad_norm": 0.3689436614513397, + "learning_rate": 1.1200000394363996e-05, + "loss": 1.1951, + "num_input_tokens_seen": 51691520, + "step": 6310 + }, + { + "epoch": 0.7231948735553267, + "grad_norm": 0.5486378073692322, + "learning_rate": 1.1115491041570337e-05, + "loss": 1.0797, + "num_input_tokens_seen": 51773440, + "step": 6320 + }, + { + "epoch": 0.724339169241332, + "grad_norm": 0.4265560209751129, + "learning_rate": 1.103121049086324e-05, + "loss": 1.3091, + "num_input_tokens_seen": 51855360, + "step": 6330 + }, + { + "epoch": 0.7254834649273372, + "grad_norm": 0.5804295539855957, + "learning_rate": 1.094716013109745e-05, + "loss": 1.1386, + "num_input_tokens_seen": 51937280, + "step": 6340 + }, + { + "epoch": 0.7266277606133424, + "grad_norm": 0.5425553321838379, + "learning_rate": 1.0863341347334376e-05, + "loss": 1.2296, + "num_input_tokens_seen": 52019200, + "step": 6350 + }, + { + "epoch": 0.7277720562993477, + "grad_norm": 0.584921658039093, + "learning_rate": 1.0779755520819302e-05, + "loss": 1.2027, + "num_input_tokens_seen": 52101120, + "step": 6360 + }, + { + "epoch": 0.728916351985353, + "grad_norm": 0.6812318563461304, + "learning_rate": 1.0696404028958634e-05, + "loss": 1.1153, + "num_input_tokens_seen": 52183040, + "step": 6370 + }, + { + "epoch": 0.7300606476713583, + "grad_norm": 0.4892879128456116, + "learning_rate": 1.0613288245297193e-05, + "loss": 1.097, + "num_input_tokens_seen": 52264960, + "step": 6380 + }, + { + "epoch": 0.7312049433573635, + "grad_norm": 0.4659218490123749, + "learning_rate": 1.053040953949557e-05, + "loss": 1.2608, + "num_input_tokens_seen": 52346880, + "step": 6390 + }, + { + "epoch": 0.7323492390433688, + "grad_norm": 0.5382397770881653, + "learning_rate": 1.0447769277307554e-05, + "loss": 1.0397, + "num_input_tokens_seen": 52428800, + "step": 6400 + }, + { + "epoch": 0.7334935347293741, + "grad_norm": 0.5254513621330261, + "learning_rate": 1.0365368820557633e-05, + "loss": 1.0589, + "num_input_tokens_seen": 52510720, + "step": 6410 + }, + { + "epoch": 0.7346378304153793, + "grad_norm": 1.027398943901062, + "learning_rate": 1.0283209527118584e-05, + "loss": 1.0307, + "num_input_tokens_seen": 52592640, + "step": 6420 + }, + { + "epoch": 0.7357821261013846, + "grad_norm": 0.5271130800247192, + "learning_rate": 1.0201292750889022e-05, + "loss": 1.3241, + "num_input_tokens_seen": 52674560, + "step": 6430 + }, + { + "epoch": 0.7369264217873899, + "grad_norm": 0.45055344700813293, + "learning_rate": 1.011961984177117e-05, + "loss": 1.1761, + "num_input_tokens_seen": 52756480, + "step": 6440 + }, + { + "epoch": 0.7380707174733951, + "grad_norm": 0.5612509846687317, + "learning_rate": 1.0038192145648567e-05, + "loss": 1.0847, + "num_input_tokens_seen": 52838400, + "step": 6450 + }, + { + "epoch": 0.7392150131594004, + "grad_norm": 0.5806179642677307, + "learning_rate": 9.95701100436389e-06, + "loss": 1.1278, + "num_input_tokens_seen": 52920320, + "step": 6460 + }, + { + "epoch": 0.7403593088454057, + "grad_norm": 0.5459727048873901, + "learning_rate": 9.876077755696868e-06, + "loss": 1.2579, + "num_input_tokens_seen": 53002240, + "step": 6470 + }, + { + "epoch": 0.741503604531411, + "grad_norm": 0.4346022307872772, + "learning_rate": 9.795393733342203e-06, + "loss": 1.3319, + "num_input_tokens_seen": 53084160, + "step": 6480 + }, + { + "epoch": 0.7426479002174162, + "grad_norm": 1.9029626846313477, + "learning_rate": 9.71496026688763e-06, + "loss": 1.4099, + "num_input_tokens_seen": 53166080, + "step": 6490 + }, + { + "epoch": 0.7437921959034215, + "grad_norm": 0.5004605054855347, + "learning_rate": 9.634778681791962e-06, + "loss": 1.2894, + "num_input_tokens_seen": 53248000, + "step": 6500 + }, + { + "epoch": 0.7449364915894267, + "grad_norm": 0.7425548434257507, + "learning_rate": 9.554850299363294e-06, + "loss": 1.4331, + "num_input_tokens_seen": 53329920, + "step": 6510 + }, + { + "epoch": 0.7460807872754319, + "grad_norm": 0.507979691028595, + "learning_rate": 9.47517643673721e-06, + "loss": 1.1623, + "num_input_tokens_seen": 53411840, + "step": 6520 + }, + { + "epoch": 0.7472250829614372, + "grad_norm": 0.47209110856056213, + "learning_rate": 9.395758406855053e-06, + "loss": 1.195, + "num_input_tokens_seen": 53493760, + "step": 6530 + }, + { + "epoch": 0.7483693786474425, + "grad_norm": 0.4794630706310272, + "learning_rate": 9.31659751844232e-06, + "loss": 1.2136, + "num_input_tokens_seen": 53575680, + "step": 6540 + }, + { + "epoch": 0.7495136743334477, + "grad_norm": 0.48791974782943726, + "learning_rate": 9.237695075987106e-06, + "loss": 1.476, + "num_input_tokens_seen": 53657600, + "step": 6550 + }, + { + "epoch": 0.750657970019453, + "grad_norm": 0.4857141971588135, + "learning_rate": 9.15905237971856e-06, + "loss": 1.3678, + "num_input_tokens_seen": 53739520, + "step": 6560 + }, + { + "epoch": 0.7518022657054583, + "grad_norm": 0.5270280241966248, + "learning_rate": 9.080670725585511e-06, + "loss": 1.4367, + "num_input_tokens_seen": 53821440, + "step": 6570 + }, + { + "epoch": 0.7529465613914635, + "grad_norm": 0.48381784558296204, + "learning_rate": 9.002551405235082e-06, + "loss": 1.2801, + "num_input_tokens_seen": 53903360, + "step": 6580 + }, + { + "epoch": 0.7540908570774688, + "grad_norm": 0.7849867343902588, + "learning_rate": 8.924695705991407e-06, + "loss": 1.4574, + "num_input_tokens_seen": 53985280, + "step": 6590 + }, + { + "epoch": 0.7552351527634741, + "grad_norm": 0.45584842562675476, + "learning_rate": 8.847104910834414e-06, + "loss": 1.4978, + "num_input_tokens_seen": 54067200, + "step": 6600 + }, + { + "epoch": 0.7563794484494794, + "grad_norm": 0.5474436283111572, + "learning_rate": 8.769780298378705e-06, + "loss": 1.277, + "num_input_tokens_seen": 54149120, + "step": 6610 + }, + { + "epoch": 0.7575237441354846, + "grad_norm": 0.49009135365486145, + "learning_rate": 8.69272314285248e-06, + "loss": 1.0243, + "num_input_tokens_seen": 54231040, + "step": 6620 + }, + { + "epoch": 0.7586680398214899, + "grad_norm": 0.4982384443283081, + "learning_rate": 8.6159347140765e-06, + "loss": 1.2217, + "num_input_tokens_seen": 54312960, + "step": 6630 + }, + { + "epoch": 0.7598123355074952, + "grad_norm": 0.5559042096138, + "learning_rate": 8.539416277443218e-06, + "loss": 1.2161, + "num_input_tokens_seen": 54394880, + "step": 6640 + }, + { + "epoch": 0.7609566311935004, + "grad_norm": 0.4743431508541107, + "learning_rate": 8.463169093895887e-06, + "loss": 1.069, + "num_input_tokens_seen": 54476800, + "step": 6650 + }, + { + "epoch": 0.7621009268795057, + "grad_norm": 0.5154614448547363, + "learning_rate": 8.38719441990781e-06, + "loss": 1.5262, + "num_input_tokens_seen": 54558720, + "step": 6660 + }, + { + "epoch": 0.763245222565511, + "grad_norm": 0.5313172340393066, + "learning_rate": 8.311493507461593e-06, + "loss": 1.3362, + "num_input_tokens_seen": 54640640, + "step": 6670 + }, + { + "epoch": 0.7643895182515162, + "grad_norm": 0.4694889187812805, + "learning_rate": 8.236067604028563e-06, + "loss": 1.2781, + "num_input_tokens_seen": 54722560, + "step": 6680 + }, + { + "epoch": 0.7655338139375214, + "grad_norm": 0.5564500689506531, + "learning_rate": 8.160917952548197e-06, + "loss": 1.4282, + "num_input_tokens_seen": 54804480, + "step": 6690 + }, + { + "epoch": 0.7666781096235267, + "grad_norm": 0.49310502409935, + "learning_rate": 8.08604579140759e-06, + "loss": 1.2766, + "num_input_tokens_seen": 54886400, + "step": 6700 + }, + { + "epoch": 0.7678224053095319, + "grad_norm": 0.4908527731895447, + "learning_rate": 8.011452354421136e-06, + "loss": 1.4154, + "num_input_tokens_seen": 54968320, + "step": 6710 + }, + { + "epoch": 0.7689667009955372, + "grad_norm": 0.47903621196746826, + "learning_rate": 7.937138870810115e-06, + "loss": 0.9568, + "num_input_tokens_seen": 55050240, + "step": 6720 + }, + { + "epoch": 0.7701109966815425, + "grad_norm": 0.820591926574707, + "learning_rate": 7.863106565182474e-06, + "loss": 1.4689, + "num_input_tokens_seen": 55132160, + "step": 6730 + }, + { + "epoch": 0.7712552923675478, + "grad_norm": 0.5931017994880676, + "learning_rate": 7.78935665751266e-06, + "loss": 1.3219, + "num_input_tokens_seen": 55214080, + "step": 6740 + }, + { + "epoch": 0.772399588053553, + "grad_norm": 0.5245538949966431, + "learning_rate": 7.715890363121484e-06, + "loss": 1.1528, + "num_input_tokens_seen": 55296000, + "step": 6750 + }, + { + "epoch": 0.7735438837395583, + "grad_norm": 0.44471973180770874, + "learning_rate": 7.642708892656125e-06, + "loss": 1.129, + "num_input_tokens_seen": 55377920, + "step": 6760 + }, + { + "epoch": 0.7746881794255636, + "grad_norm": 0.5023803114891052, + "learning_rate": 7.569813452070146e-06, + "loss": 1.0879, + "num_input_tokens_seen": 55459840, + "step": 6770 + }, + { + "epoch": 0.7758324751115688, + "grad_norm": 0.4996906518936157, + "learning_rate": 7.497205242603636e-06, + "loss": 1.2824, + "num_input_tokens_seen": 55541760, + "step": 6780 + }, + { + "epoch": 0.7769767707975741, + "grad_norm": 0.4397946298122406, + "learning_rate": 7.424885460763442e-06, + "loss": 1.2056, + "num_input_tokens_seen": 55623680, + "step": 6790 + }, + { + "epoch": 0.7781210664835794, + "grad_norm": 0.5805292725563049, + "learning_rate": 7.3528552983033985e-06, + "loss": 1.2665, + "num_input_tokens_seen": 55705600, + "step": 6800 + }, + { + "epoch": 0.7792653621695846, + "grad_norm": 0.4549366533756256, + "learning_rate": 7.281115942204739e-06, + "loss": 1.0662, + "num_input_tokens_seen": 55787520, + "step": 6810 + }, + { + "epoch": 0.7804096578555899, + "grad_norm": 0.547260046005249, + "learning_rate": 7.209668574656514e-06, + "loss": 1.1699, + "num_input_tokens_seen": 55869440, + "step": 6820 + }, + { + "epoch": 0.7815539535415952, + "grad_norm": 0.5504161715507507, + "learning_rate": 7.138514373036098e-06, + "loss": 1.3507, + "num_input_tokens_seen": 55951360, + "step": 6830 + }, + { + "epoch": 0.7826982492276005, + "grad_norm": 0.5131449699401855, + "learning_rate": 7.0676545098897956e-06, + "loss": 1.3175, + "num_input_tokens_seen": 56033280, + "step": 6840 + }, + { + "epoch": 0.7838425449136057, + "grad_norm": 0.4970407783985138, + "learning_rate": 6.997090152913535e-06, + "loss": 1.1614, + "num_input_tokens_seen": 56115200, + "step": 6850 + }, + { + "epoch": 0.784986840599611, + "grad_norm": 1.6976333856582642, + "learning_rate": 6.92682246493363e-06, + "loss": 1.0802, + "num_input_tokens_seen": 56197120, + "step": 6860 + }, + { + "epoch": 0.7861311362856161, + "grad_norm": 2.0605082511901855, + "learning_rate": 6.856852603887556e-06, + "loss": 1.2859, + "num_input_tokens_seen": 56279040, + "step": 6870 + }, + { + "epoch": 0.7872754319716214, + "grad_norm": 0.4618769586086273, + "learning_rate": 6.787181722804959e-06, + "loss": 1.0909, + "num_input_tokens_seen": 56360960, + "step": 6880 + }, + { + "epoch": 0.7884197276576267, + "grad_norm": 0.5229918360710144, + "learning_rate": 6.717810969788596e-06, + "loss": 1.2816, + "num_input_tokens_seen": 56442880, + "step": 6890 + }, + { + "epoch": 0.789564023343632, + "grad_norm": 0.54283607006073, + "learning_rate": 6.648741487995416e-06, + "loss": 1.036, + "num_input_tokens_seen": 56524800, + "step": 6900 + }, + { + "epoch": 0.7907083190296372, + "grad_norm": 0.6073585748672485, + "learning_rate": 6.57997441561774e-06, + "loss": 0.9993, + "num_input_tokens_seen": 56606720, + "step": 6910 + }, + { + "epoch": 0.7918526147156425, + "grad_norm": 0.4236718416213989, + "learning_rate": 6.511510885864516e-06, + "loss": 1.1192, + "num_input_tokens_seen": 56688640, + "step": 6920 + }, + { + "epoch": 0.7929969104016478, + "grad_norm": 0.5184228420257568, + "learning_rate": 6.44335202694262e-06, + "loss": 1.3028, + "num_input_tokens_seen": 56770560, + "step": 6930 + }, + { + "epoch": 0.794141206087653, + "grad_norm": 0.6947381496429443, + "learning_rate": 6.375498962038265e-06, + "loss": 1.4295, + "num_input_tokens_seen": 56852480, + "step": 6940 + }, + { + "epoch": 0.7952855017736583, + "grad_norm": 0.5223413705825806, + "learning_rate": 6.307952809298517e-06, + "loss": 1.503, + "num_input_tokens_seen": 56934400, + "step": 6950 + }, + { + "epoch": 0.7964297974596636, + "grad_norm": 0.5551154613494873, + "learning_rate": 6.240714681812837e-06, + "loss": 1.1341, + "num_input_tokens_seen": 57016320, + "step": 6960 + }, + { + "epoch": 0.7975740931456688, + "grad_norm": 0.5838407278060913, + "learning_rate": 6.173785687594761e-06, + "loss": 1.14, + "num_input_tokens_seen": 57098240, + "step": 6970 + }, + { + "epoch": 0.7987183888316741, + "grad_norm": 0.4963679611682892, + "learning_rate": 6.107166929563629e-06, + "loss": 1.2954, + "num_input_tokens_seen": 57180160, + "step": 6980 + }, + { + "epoch": 0.7998626845176794, + "grad_norm": 0.5126423835754395, + "learning_rate": 6.040859505526439e-06, + "loss": 1.3446, + "num_input_tokens_seen": 57262080, + "step": 6990 + }, + { + "epoch": 0.8010069802036847, + "grad_norm": 0.48465293645858765, + "learning_rate": 5.974864508159692e-06, + "loss": 1.0721, + "num_input_tokens_seen": 57344000, + "step": 7000 + }, + { + "epoch": 0.8021512758896899, + "grad_norm": 0.5144678950309753, + "learning_rate": 5.9091830249914685e-06, + "loss": 1.3073, + "num_input_tokens_seen": 57425920, + "step": 7010 + }, + { + "epoch": 0.8032955715756952, + "grad_norm": 0.4844573140144348, + "learning_rate": 5.843816138383429e-06, + "loss": 1.3409, + "num_input_tokens_seen": 57507840, + "step": 7020 + }, + { + "epoch": 0.8044398672617005, + "grad_norm": 0.538421094417572, + "learning_rate": 5.778764925513045e-06, + "loss": 1.3687, + "num_input_tokens_seen": 57589760, + "step": 7030 + }, + { + "epoch": 0.8055841629477057, + "grad_norm": 0.49097201228141785, + "learning_rate": 5.714030458355784e-06, + "loss": 1.099, + "num_input_tokens_seen": 57671680, + "step": 7040 + }, + { + "epoch": 0.8067284586337109, + "grad_norm": 0.5372655391693115, + "learning_rate": 5.649613803667511e-06, + "loss": 1.3109, + "num_input_tokens_seen": 57753600, + "step": 7050 + }, + { + "epoch": 0.8078727543197162, + "grad_norm": 0.5266016721725464, + "learning_rate": 5.5855160229668636e-06, + "loss": 1.2747, + "num_input_tokens_seen": 57835520, + "step": 7060 + }, + { + "epoch": 0.8090170500057214, + "grad_norm": 0.5975900888442993, + "learning_rate": 5.5217381725177624e-06, + "loss": 1.2696, + "num_input_tokens_seen": 57917440, + "step": 7070 + }, + { + "epoch": 0.8101613456917267, + "grad_norm": 0.5652705430984497, + "learning_rate": 5.458281303312016e-06, + "loss": 1.1509, + "num_input_tokens_seen": 57999360, + "step": 7080 + }, + { + "epoch": 0.811305641377732, + "grad_norm": 0.7816336154937744, + "learning_rate": 5.39514646105202e-06, + "loss": 1.2708, + "num_input_tokens_seen": 58081280, + "step": 7090 + }, + { + "epoch": 0.8124499370637372, + "grad_norm": 0.48780134320259094, + "learning_rate": 5.332334686133475e-06, + "loss": 1.4797, + "num_input_tokens_seen": 58163200, + "step": 7100 + }, + { + "epoch": 0.8135942327497425, + "grad_norm": 0.5095590949058533, + "learning_rate": 5.269847013628299e-06, + "loss": 1.192, + "num_input_tokens_seen": 58245120, + "step": 7110 + }, + { + "epoch": 0.8147385284357478, + "grad_norm": 0.4638192057609558, + "learning_rate": 5.207684473267527e-06, + "loss": 1.5395, + "num_input_tokens_seen": 58327040, + "step": 7120 + }, + { + "epoch": 0.8158828241217531, + "grad_norm": 0.538078784942627, + "learning_rate": 5.145848089424374e-06, + "loss": 1.2233, + "num_input_tokens_seen": 58408960, + "step": 7130 + }, + { + "epoch": 0.8170271198077583, + "grad_norm": 0.544783353805542, + "learning_rate": 5.0843388810973195e-06, + "loss": 1.4877, + "num_input_tokens_seen": 58490880, + "step": 7140 + }, + { + "epoch": 0.8181714154937636, + "grad_norm": 0.9503467679023743, + "learning_rate": 5.02315786189334e-06, + "loss": 1.1406, + "num_input_tokens_seen": 58572800, + "step": 7150 + }, + { + "epoch": 0.8193157111797689, + "grad_norm": 0.5432871580123901, + "learning_rate": 4.962306040011222e-06, + "loss": 1.1193, + "num_input_tokens_seen": 58654720, + "step": 7160 + }, + { + "epoch": 0.8204600068657741, + "grad_norm": 0.6386141777038574, + "learning_rate": 4.901784418224892e-06, + "loss": 1.0802, + "num_input_tokens_seen": 58736640, + "step": 7170 + }, + { + "epoch": 0.8216043025517794, + "grad_norm": 0.5047373175621033, + "learning_rate": 4.841593993866949e-06, + "loss": 1.0499, + "num_input_tokens_seen": 58818560, + "step": 7180 + }, + { + "epoch": 0.8227485982377847, + "grad_norm": 0.4890262484550476, + "learning_rate": 4.781735758812217e-06, + "loss": 1.2981, + "num_input_tokens_seen": 58900480, + "step": 7190 + }, + { + "epoch": 0.82389289392379, + "grad_norm": 0.744850754737854, + "learning_rate": 4.7222106994613655e-06, + "loss": 1.1535, + "num_input_tokens_seen": 58982400, + "step": 7200 + }, + { + "epoch": 0.8250371896097952, + "grad_norm": 0.4771808087825775, + "learning_rate": 4.663019796724685e-06, + "loss": 1.2276, + "num_input_tokens_seen": 59064320, + "step": 7210 + }, + { + "epoch": 0.8261814852958005, + "grad_norm": 0.463220477104187, + "learning_rate": 4.604164026005925e-06, + "loss": 1.3646, + "num_input_tokens_seen": 59146240, + "step": 7220 + }, + { + "epoch": 0.8273257809818056, + "grad_norm": 0.46109485626220703, + "learning_rate": 4.5456443571862185e-06, + "loss": 1.3941, + "num_input_tokens_seen": 59228160, + "step": 7230 + }, + { + "epoch": 0.8284700766678109, + "grad_norm": 0.4565638303756714, + "learning_rate": 4.487461754608066e-06, + "loss": 1.102, + "num_input_tokens_seen": 59310080, + "step": 7240 + }, + { + "epoch": 0.8296143723538162, + "grad_norm": 0.5542830228805542, + "learning_rate": 4.429617177059508e-06, + "loss": 1.167, + "num_input_tokens_seen": 59392000, + "step": 7250 + }, + { + "epoch": 0.8307586680398215, + "grad_norm": 0.44131019711494446, + "learning_rate": 4.372111577758261e-06, + "loss": 1.0619, + "num_input_tokens_seen": 59473920, + "step": 7260 + }, + { + "epoch": 0.8319029637258267, + "grad_norm": 0.523678183555603, + "learning_rate": 4.314945904336037e-06, + "loss": 1.2679, + "num_input_tokens_seen": 59555840, + "step": 7270 + }, + { + "epoch": 0.833047259411832, + "grad_norm": 0.485726535320282, + "learning_rate": 4.258121098822945e-06, + "loss": 1.2982, + "num_input_tokens_seen": 59637760, + "step": 7280 + }, + { + "epoch": 0.8341915550978373, + "grad_norm": 0.49582865834236145, + "learning_rate": 4.201638097631938e-06, + "loss": 1.2368, + "num_input_tokens_seen": 59719680, + "step": 7290 + }, + { + "epoch": 0.8353358507838425, + "grad_norm": 0.44791966676712036, + "learning_rate": 4.145497831543402e-06, + "loss": 0.9904, + "num_input_tokens_seen": 59801600, + "step": 7300 + }, + { + "epoch": 0.8364801464698478, + "grad_norm": 0.4786857068538666, + "learning_rate": 4.089701225689793e-06, + "loss": 1.186, + "num_input_tokens_seen": 59883520, + "step": 7310 + }, + { + "epoch": 0.8376244421558531, + "grad_norm": 0.48226872086524963, + "learning_rate": 4.034249199540432e-06, + "loss": 1.359, + "num_input_tokens_seen": 59965440, + "step": 7320 + }, + { + "epoch": 0.8387687378418583, + "grad_norm": 0.5494146347045898, + "learning_rate": 3.97914266688631e-06, + "loss": 1.1773, + "num_input_tokens_seen": 60047360, + "step": 7330 + }, + { + "epoch": 0.8399130335278636, + "grad_norm": 0.4615972638130188, + "learning_rate": 3.924382535825047e-06, + "loss": 1.2996, + "num_input_tokens_seen": 60129280, + "step": 7340 + }, + { + "epoch": 0.8410573292138689, + "grad_norm": 1.4144350290298462, + "learning_rate": 3.869969708745946e-06, + "loss": 1.1155, + "num_input_tokens_seen": 60211200, + "step": 7350 + }, + { + "epoch": 0.8422016248998742, + "grad_norm": 0.5016247034072876, + "learning_rate": 3.815905082315102e-06, + "loss": 1.058, + "num_input_tokens_seen": 60293120, + "step": 7360 + }, + { + "epoch": 0.8433459205858794, + "grad_norm": 0.4713045358657837, + "learning_rate": 3.762189547460615e-06, + "loss": 1.2502, + "num_input_tokens_seen": 60375040, + "step": 7370 + }, + { + "epoch": 0.8444902162718847, + "grad_norm": 0.8532644510269165, + "learning_rate": 3.7088239893579456e-06, + "loss": 1.2315, + "num_input_tokens_seen": 60456960, + "step": 7380 + }, + { + "epoch": 0.84563451195789, + "grad_norm": 0.5714083313941956, + "learning_rate": 3.655809287415285e-06, + "loss": 1.46, + "num_input_tokens_seen": 60538880, + "step": 7390 + }, + { + "epoch": 0.8467788076438952, + "grad_norm": 0.5227552652359009, + "learning_rate": 3.603146315259104e-06, + "loss": 1.2398, + "num_input_tokens_seen": 60620800, + "step": 7400 + }, + { + "epoch": 0.8479231033299004, + "grad_norm": 0.5611023306846619, + "learning_rate": 3.5508359407197157e-06, + "loss": 1.2431, + "num_input_tokens_seen": 60702720, + "step": 7410 + }, + { + "epoch": 0.8490673990159057, + "grad_norm": 0.8136146068572998, + "learning_rate": 3.4988790258170146e-06, + "loss": 0.9851, + "num_input_tokens_seen": 60784640, + "step": 7420 + }, + { + "epoch": 0.8502116947019109, + "grad_norm": 0.5456047058105469, + "learning_rate": 3.4472764267462486e-06, + "loss": 1.1814, + "num_input_tokens_seen": 60866560, + "step": 7430 + }, + { + "epoch": 0.8513559903879162, + "grad_norm": 0.4470462501049042, + "learning_rate": 3.396028993863906e-06, + "loss": 1.0398, + "num_input_tokens_seen": 60948480, + "step": 7440 + }, + { + "epoch": 0.8525002860739215, + "grad_norm": 0.4927099049091339, + "learning_rate": 3.3451375716737067e-06, + "loss": 1.1797, + "num_input_tokens_seen": 61030400, + "step": 7450 + }, + { + "epoch": 0.8536445817599267, + "grad_norm": 0.6371917724609375, + "learning_rate": 3.2946029988127068e-06, + "loss": 1.3581, + "num_input_tokens_seen": 61112320, + "step": 7460 + }, + { + "epoch": 0.854788877445932, + "grad_norm": 0.4422919452190399, + "learning_rate": 3.2444261080374546e-06, + "loss": 1.1539, + "num_input_tokens_seen": 61194240, + "step": 7470 + }, + { + "epoch": 0.8559331731319373, + "grad_norm": 0.4536419212818146, + "learning_rate": 3.194607726210261e-06, + "loss": 1.0741, + "num_input_tokens_seen": 61276160, + "step": 7480 + }, + { + "epoch": 0.8570774688179426, + "grad_norm": 0.5793977379798889, + "learning_rate": 3.1451486742856055e-06, + "loss": 1.2566, + "num_input_tokens_seen": 61358080, + "step": 7490 + }, + { + "epoch": 0.8582217645039478, + "grad_norm": 0.6592020392417908, + "learning_rate": 3.0960497672965825e-06, + "loss": 1.2257, + "num_input_tokens_seen": 61440000, + "step": 7500 + }, + { + "epoch": 0.8593660601899531, + "grad_norm": 0.49061110615730286, + "learning_rate": 3.0473118143414634e-06, + "loss": 1.4217, + "num_input_tokens_seen": 61521920, + "step": 7510 + }, + { + "epoch": 0.8605103558759584, + "grad_norm": 0.4329964518547058, + "learning_rate": 2.9989356185703975e-06, + "loss": 1.4657, + "num_input_tokens_seen": 61603840, + "step": 7520 + }, + { + "epoch": 0.8616546515619636, + "grad_norm": 0.5590507388114929, + "learning_rate": 2.950921977172155e-06, + "loss": 1.0984, + "num_input_tokens_seen": 61685760, + "step": 7530 + }, + { + "epoch": 0.8627989472479689, + "grad_norm": 0.7304471135139465, + "learning_rate": 2.9032716813609723e-06, + "loss": 1.1865, + "num_input_tokens_seen": 61767680, + "step": 7540 + }, + { + "epoch": 0.8639432429339742, + "grad_norm": 0.48121222853660583, + "learning_rate": 2.8559855163635544e-06, + "loss": 1.1777, + "num_input_tokens_seen": 61849600, + "step": 7550 + }, + { + "epoch": 0.8650875386199794, + "grad_norm": 0.469186007976532, + "learning_rate": 2.809064261406111e-06, + "loss": 1.5473, + "num_input_tokens_seen": 61931520, + "step": 7560 + }, + { + "epoch": 0.8662318343059847, + "grad_norm": 0.5229854583740234, + "learning_rate": 2.762508689701504e-06, + "loss": 1.1929, + "num_input_tokens_seen": 62013440, + "step": 7570 + }, + { + "epoch": 0.8673761299919899, + "grad_norm": 0.5912187099456787, + "learning_rate": 2.716319568436529e-06, + "loss": 1.0352, + "num_input_tokens_seen": 62095360, + "step": 7580 + }, + { + "epoch": 0.8685204256779951, + "grad_norm": 0.5109303593635559, + "learning_rate": 2.6704976587592688e-06, + "loss": 1.0181, + "num_input_tokens_seen": 62177280, + "step": 7590 + }, + { + "epoch": 0.8696647213640004, + "grad_norm": 0.7814948558807373, + "learning_rate": 2.6250437157665455e-06, + "loss": 1.2663, + "num_input_tokens_seen": 62259200, + "step": 7600 + }, + { + "epoch": 0.8708090170500057, + "grad_norm": 1.977430820465088, + "learning_rate": 2.5799584884914685e-06, + "loss": 1.0863, + "num_input_tokens_seen": 62341120, + "step": 7610 + }, + { + "epoch": 0.871953312736011, + "grad_norm": 0.4965369701385498, + "learning_rate": 2.535242719891112e-06, + "loss": 1.1277, + "num_input_tokens_seen": 62423040, + "step": 7620 + }, + { + "epoch": 0.8730976084220162, + "grad_norm": 1.2752573490142822, + "learning_rate": 2.4908971468342535e-06, + "loss": 1.067, + "num_input_tokens_seen": 62504960, + "step": 7630 + }, + { + "epoch": 0.8742419041080215, + "grad_norm": 0.5822865962982178, + "learning_rate": 2.44692250008923e-06, + "loss": 1.2818, + "num_input_tokens_seen": 62586880, + "step": 7640 + }, + { + "epoch": 0.8753861997940268, + "grad_norm": 0.5638536214828491, + "learning_rate": 2.403319504311921e-06, + "loss": 1.1506, + "num_input_tokens_seen": 62668800, + "step": 7650 + }, + { + "epoch": 0.876530495480032, + "grad_norm": 0.48504772782325745, + "learning_rate": 2.360088878033778e-06, + "loss": 1.5572, + "num_input_tokens_seen": 62750720, + "step": 7660 + }, + { + "epoch": 0.8776747911660373, + "grad_norm": 0.4426518976688385, + "learning_rate": 2.317231333650005e-06, + "loss": 1.2879, + "num_input_tokens_seen": 62832640, + "step": 7670 + }, + { + "epoch": 0.8788190868520426, + "grad_norm": 0.5259247422218323, + "learning_rate": 2.2747475774077986e-06, + "loss": 1.1017, + "num_input_tokens_seen": 62914560, + "step": 7680 + }, + { + "epoch": 0.8799633825380478, + "grad_norm": 0.4538893401622772, + "learning_rate": 2.2326383093947135e-06, + "loss": 1.2244, + "num_input_tokens_seen": 62996480, + "step": 7690 + }, + { + "epoch": 0.8811076782240531, + "grad_norm": 0.6097177267074585, + "learning_rate": 2.1909042235271597e-06, + "loss": 1.3507, + "num_input_tokens_seen": 63078400, + "step": 7700 + }, + { + "epoch": 0.8822519739100584, + "grad_norm": 0.5675943493843079, + "learning_rate": 2.1495460075389133e-06, + "loss": 1.0351, + "num_input_tokens_seen": 63160320, + "step": 7710 + }, + { + "epoch": 0.8833962695960637, + "grad_norm": 0.4782324731349945, + "learning_rate": 2.1085643429698236e-06, + "loss": 1.1214, + "num_input_tokens_seen": 63242240, + "step": 7720 + }, + { + "epoch": 0.8845405652820689, + "grad_norm": 0.5161313414573669, + "learning_rate": 2.067959905154568e-06, + "loss": 1.1543, + "num_input_tokens_seen": 63324160, + "step": 7730 + }, + { + "epoch": 0.8856848609680742, + "grad_norm": 0.4714735150337219, + "learning_rate": 2.0277333632115288e-06, + "loss": 0.9881, + "num_input_tokens_seen": 63406080, + "step": 7740 + }, + { + "epoch": 0.8868291566540795, + "grad_norm": 0.5167281031608582, + "learning_rate": 1.9878853800317535e-06, + "loss": 1.4237, + "num_input_tokens_seen": 63488000, + "step": 7750 + }, + { + "epoch": 0.8879734523400846, + "grad_norm": 0.47065913677215576, + "learning_rate": 1.948416612268034e-06, + "loss": 1.1573, + "num_input_tokens_seen": 63569920, + "step": 7760 + }, + { + "epoch": 0.8891177480260899, + "grad_norm": 0.6462358236312866, + "learning_rate": 1.909327710324116e-06, + "loss": 1.303, + "num_input_tokens_seen": 63651840, + "step": 7770 + }, + { + "epoch": 0.8902620437120952, + "grad_norm": 0.4849631190299988, + "learning_rate": 1.8706193183439247e-06, + "loss": 1.2495, + "num_input_tokens_seen": 63733760, + "step": 7780 + }, + { + "epoch": 0.8914063393981004, + "grad_norm": 0.4865788519382477, + "learning_rate": 1.8322920742010086e-06, + "loss": 1.1877, + "num_input_tokens_seen": 63815680, + "step": 7790 + }, + { + "epoch": 0.8925506350841057, + "grad_norm": 0.48363032937049866, + "learning_rate": 1.7943466094879902e-06, + "loss": 1.1676, + "num_input_tokens_seen": 63897600, + "step": 7800 + }, + { + "epoch": 0.893694930770111, + "grad_norm": 0.6026335954666138, + "learning_rate": 1.7567835495061718e-06, + "loss": 1.1575, + "num_input_tokens_seen": 63979520, + "step": 7810 + }, + { + "epoch": 0.8948392264561162, + "grad_norm": 0.49204021692276, + "learning_rate": 1.7196035132552135e-06, + "loss": 1.0517, + "num_input_tokens_seen": 64061440, + "step": 7820 + }, + { + "epoch": 0.8959835221421215, + "grad_norm": 0.5136172771453857, + "learning_rate": 1.682807113422971e-06, + "loss": 1.4192, + "num_input_tokens_seen": 64143360, + "step": 7830 + }, + { + "epoch": 0.8971278178281268, + "grad_norm": 0.516801655292511, + "learning_rate": 1.646394956375369e-06, + "loss": 1.5286, + "num_input_tokens_seen": 64225280, + "step": 7840 + }, + { + "epoch": 0.898272113514132, + "grad_norm": 1.405717134475708, + "learning_rate": 1.6103676421463986e-06, + "loss": 0.9688, + "num_input_tokens_seen": 64307200, + "step": 7850 + }, + { + "epoch": 0.8994164092001373, + "grad_norm": 0.4660581350326538, + "learning_rate": 1.5747257644282726e-06, + "loss": 1.1801, + "num_input_tokens_seen": 64389120, + "step": 7860 + }, + { + "epoch": 0.9005607048861426, + "grad_norm": 0.5325869917869568, + "learning_rate": 1.5394699105616002e-06, + "loss": 1.0305, + "num_input_tokens_seen": 64471040, + "step": 7870 + }, + { + "epoch": 0.9017050005721479, + "grad_norm": 0.5679177641868591, + "learning_rate": 1.504600661525718e-06, + "loss": 1.1399, + "num_input_tokens_seen": 64552960, + "step": 7880 + }, + { + "epoch": 0.9028492962581531, + "grad_norm": 0.49606427550315857, + "learning_rate": 1.4701185919291372e-06, + "loss": 1.4673, + "num_input_tokens_seen": 64634880, + "step": 7890 + }, + { + "epoch": 0.9039935919441584, + "grad_norm": 0.49286141991615295, + "learning_rate": 1.436024270000058e-06, + "loss": 1.0752, + "num_input_tokens_seen": 64716800, + "step": 7900 + }, + { + "epoch": 0.9051378876301637, + "grad_norm": 0.5626493096351624, + "learning_rate": 1.4023182575769956e-06, + "loss": 1.1824, + "num_input_tokens_seen": 64798720, + "step": 7910 + }, + { + "epoch": 0.9062821833161689, + "grad_norm": 0.513041079044342, + "learning_rate": 1.3690011100995437e-06, + "loss": 1.0409, + "num_input_tokens_seen": 64880640, + "step": 7920 + }, + { + "epoch": 0.9074264790021742, + "grad_norm": 0.46281594038009644, + "learning_rate": 1.3360733765992116e-06, + "loss": 1.2628, + "num_input_tokens_seen": 64962560, + "step": 7930 + }, + { + "epoch": 0.9085707746881794, + "grad_norm": 0.626043975353241, + "learning_rate": 1.3035355996903697e-06, + "loss": 1.307, + "num_input_tokens_seen": 65044480, + "step": 7940 + }, + { + "epoch": 0.9097150703741846, + "grad_norm": 0.6638424396514893, + "learning_rate": 1.2713883155613144e-06, + "loss": 1.061, + "num_input_tokens_seen": 65126400, + "step": 7950 + }, + { + "epoch": 0.9108593660601899, + "grad_norm": 0.8179208040237427, + "learning_rate": 1.2396320539654366e-06, + "loss": 1.178, + "num_input_tokens_seen": 65208320, + "step": 7960 + }, + { + "epoch": 0.9120036617461952, + "grad_norm": 0.44587135314941406, + "learning_rate": 1.208267338212493e-06, + "loss": 1.3368, + "num_input_tokens_seen": 65290240, + "step": 7970 + }, + { + "epoch": 0.9131479574322005, + "grad_norm": 0.4682336449623108, + "learning_rate": 1.177294685159963e-06, + "loss": 1.3091, + "num_input_tokens_seen": 65372160, + "step": 7980 + }, + { + "epoch": 0.9142922531182057, + "grad_norm": 0.4648427367210388, + "learning_rate": 1.1467146052045603e-06, + "loss": 1.2932, + "num_input_tokens_seen": 65454080, + "step": 7990 + }, + { + "epoch": 0.915436548804211, + "grad_norm": 0.5007482171058655, + "learning_rate": 1.1165276022737926e-06, + "loss": 1.1777, + "num_input_tokens_seen": 65536000, + "step": 8000 + }, + { + "epoch": 0.9165808444902163, + "grad_norm": 0.5155405402183533, + "learning_rate": 1.0867341738176857e-06, + "loss": 1.3184, + "num_input_tokens_seen": 65617920, + "step": 8010 + }, + { + "epoch": 0.9177251401762215, + "grad_norm": 0.6284940242767334, + "learning_rate": 1.0573348108005614e-06, + "loss": 1.2283, + "num_input_tokens_seen": 65699840, + "step": 8020 + }, + { + "epoch": 0.9188694358622268, + "grad_norm": 4.340449333190918, + "learning_rate": 1.0283299976929672e-06, + "loss": 1.4513, + "num_input_tokens_seen": 65781760, + "step": 8030 + }, + { + "epoch": 0.9200137315482321, + "grad_norm": 0.5452487468719482, + "learning_rate": 9.997202124636785e-07, + "loss": 1.2401, + "num_input_tokens_seen": 65863680, + "step": 8040 + }, + { + "epoch": 0.9211580272342373, + "grad_norm": 0.5035956501960754, + "learning_rate": 9.715059265718335e-07, + "loss": 1.0646, + "num_input_tokens_seen": 65945600, + "step": 8050 + }, + { + "epoch": 0.9223023229202426, + "grad_norm": 0.4889717996120453, + "learning_rate": 9.436876049591398e-07, + "loss": 1.4221, + "num_input_tokens_seen": 66027520, + "step": 8060 + }, + { + "epoch": 0.9234466186062479, + "grad_norm": 0.48875346779823303, + "learning_rate": 9.162657060422574e-07, + "loss": 1.2108, + "num_input_tokens_seen": 66109440, + "step": 8070 + }, + { + "epoch": 0.9245909142922532, + "grad_norm": 0.5173611044883728, + "learning_rate": 8.892406817051946e-07, + "loss": 1.4522, + "num_input_tokens_seen": 66191360, + "step": 8080 + }, + { + "epoch": 0.9257352099782584, + "grad_norm": 0.598610520362854, + "learning_rate": 8.626129772918962e-07, + "loss": 1.2966, + "num_input_tokens_seen": 66273280, + "step": 8090 + }, + { + "epoch": 0.9268795056642637, + "grad_norm": 0.5002040863037109, + "learning_rate": 8.363830315988947e-07, + "loss": 1.2018, + "num_input_tokens_seen": 66355200, + "step": 8100 + }, + { + "epoch": 0.928023801350269, + "grad_norm": 0.42786675691604614, + "learning_rate": 8.105512768680712e-07, + "loss": 1.1427, + "num_input_tokens_seen": 66437120, + "step": 8110 + }, + { + "epoch": 0.9291680970362741, + "grad_norm": 0.5229976177215576, + "learning_rate": 7.851181387795392e-07, + "loss": 1.1591, + "num_input_tokens_seen": 66519040, + "step": 8120 + }, + { + "epoch": 0.9303123927222794, + "grad_norm": 0.469088613986969, + "learning_rate": 7.600840364446333e-07, + "loss": 1.4109, + "num_input_tokens_seen": 66600960, + "step": 8130 + }, + { + "epoch": 0.9314566884082847, + "grad_norm": 0.46560269594192505, + "learning_rate": 7.354493823990006e-07, + "loss": 1.4194, + "num_input_tokens_seen": 66682880, + "step": 8140 + }, + { + "epoch": 0.9326009840942899, + "grad_norm": 0.6745224595069885, + "learning_rate": 7.112145825957927e-07, + "loss": 1.1635, + "num_input_tokens_seen": 66764800, + "step": 8150 + }, + { + "epoch": 0.9337452797802952, + "grad_norm": 0.6367943286895752, + "learning_rate": 6.873800363989935e-07, + "loss": 1.2245, + "num_input_tokens_seen": 66846720, + "step": 8160 + }, + { + "epoch": 0.9348895754663005, + "grad_norm": 0.9506150484085083, + "learning_rate": 6.63946136576829e-07, + "loss": 1.4055, + "num_input_tokens_seen": 66928640, + "step": 8170 + }, + { + "epoch": 0.9360338711523057, + "grad_norm": 0.5516346096992493, + "learning_rate": 6.409132692952874e-07, + "loss": 1.134, + "num_input_tokens_seen": 67010560, + "step": 8180 + }, + { + "epoch": 0.937178166838311, + "grad_norm": 0.48264941573143005, + "learning_rate": 6.182818141117625e-07, + "loss": 1.1915, + "num_input_tokens_seen": 67092480, + "step": 8190 + }, + { + "epoch": 0.9383224625243163, + "grad_norm": 0.5029363632202148, + "learning_rate": 5.960521439688088e-07, + "loss": 1.0641, + "num_input_tokens_seen": 67174400, + "step": 8200 + }, + { + "epoch": 0.9394667582103216, + "grad_norm": 0.5256086587905884, + "learning_rate": 5.742246251879829e-07, + "loss": 1.1629, + "num_input_tokens_seen": 67256320, + "step": 8210 + }, + { + "epoch": 0.9406110538963268, + "grad_norm": 0.5969172120094299, + "learning_rate": 5.527996174638061e-07, + "loss": 1.1297, + "num_input_tokens_seen": 67338240, + "step": 8220 + }, + { + "epoch": 0.9417553495823321, + "grad_norm": 0.6032472252845764, + "learning_rate": 5.317774738578446e-07, + "loss": 1.269, + "num_input_tokens_seen": 67420160, + "step": 8230 + }, + { + "epoch": 0.9428996452683374, + "grad_norm": 0.4904973804950714, + "learning_rate": 5.111585407928887e-07, + "loss": 1.0976, + "num_input_tokens_seen": 67502080, + "step": 8240 + }, + { + "epoch": 0.9440439409543426, + "grad_norm": 0.49423882365226746, + "learning_rate": 4.909431580472385e-07, + "loss": 1.2052, + "num_input_tokens_seen": 67584000, + "step": 8250 + }, + { + "epoch": 0.9451882366403479, + "grad_norm": 0.5731542110443115, + "learning_rate": 4.711316587491188e-07, + "loss": 1.1078, + "num_input_tokens_seen": 67665920, + "step": 8260 + }, + { + "epoch": 0.9463325323263532, + "grad_norm": 0.6199080348014832, + "learning_rate": 4.5172436937117036e-07, + "loss": 1.2543, + "num_input_tokens_seen": 67747840, + "step": 8270 + }, + { + "epoch": 0.9474768280123584, + "grad_norm": 0.5399270057678223, + "learning_rate": 4.3272160972509524e-07, + "loss": 1.1164, + "num_input_tokens_seen": 67829760, + "step": 8280 + }, + { + "epoch": 0.9486211236983637, + "grad_norm": 0.49898287653923035, + "learning_rate": 4.1412369295635023e-07, + "loss": 1.1804, + "num_input_tokens_seen": 67911680, + "step": 8290 + }, + { + "epoch": 0.9497654193843689, + "grad_norm": 0.5566830039024353, + "learning_rate": 3.9593092553902587e-07, + "loss": 1.1059, + "num_input_tokens_seen": 67993600, + "step": 8300 + }, + { + "epoch": 0.9509097150703741, + "grad_norm": 0.5725921988487244, + "learning_rate": 3.7814360727076724e-07, + "loss": 1.1965, + "num_input_tokens_seen": 68075520, + "step": 8310 + }, + { + "epoch": 0.9520540107563794, + "grad_norm": 0.5144397616386414, + "learning_rate": 3.607620312678528e-07, + "loss": 1.3606, + "num_input_tokens_seen": 68157440, + "step": 8320 + }, + { + "epoch": 0.9531983064423847, + "grad_norm": 0.5176523327827454, + "learning_rate": 3.437864839603455e-07, + "loss": 1.0969, + "num_input_tokens_seen": 68239360, + "step": 8330 + }, + { + "epoch": 0.95434260212839, + "grad_norm": 0.8841016888618469, + "learning_rate": 3.272172450873967e-07, + "loss": 1.6294, + "num_input_tokens_seen": 68321280, + "step": 8340 + }, + { + "epoch": 0.9554868978143952, + "grad_norm": 0.5609331727027893, + "learning_rate": 3.11054587692608e-07, + "loss": 1.0976, + "num_input_tokens_seen": 68403200, + "step": 8350 + }, + { + "epoch": 0.9566311935004005, + "grad_norm": 0.5448135137557983, + "learning_rate": 2.952987781195599e-07, + "loss": 1.1881, + "num_input_tokens_seen": 68485120, + "step": 8360 + }, + { + "epoch": 0.9577754891864058, + "grad_norm": 0.5337486863136292, + "learning_rate": 2.799500760073931e-07, + "loss": 1.1826, + "num_input_tokens_seen": 68567040, + "step": 8370 + }, + { + "epoch": 0.958919784872411, + "grad_norm": 0.5066606402397156, + "learning_rate": 2.6500873428656483e-07, + "loss": 1.1567, + "num_input_tokens_seen": 68648960, + "step": 8380 + }, + { + "epoch": 0.9600640805584163, + "grad_norm": 0.5534819960594177, + "learning_rate": 2.5047499917464636e-07, + "loss": 0.9999, + "num_input_tokens_seen": 68730880, + "step": 8390 + }, + { + "epoch": 0.9612083762444216, + "grad_norm": 0.5017804503440857, + "learning_rate": 2.3634911017229034e-07, + "loss": 1.347, + "num_input_tokens_seen": 68812800, + "step": 8400 + }, + { + "epoch": 0.9623526719304268, + "grad_norm": 0.4598214328289032, + "learning_rate": 2.2263130005927558e-07, + "loss": 1.0357, + "num_input_tokens_seen": 68894720, + "step": 8410 + }, + { + "epoch": 0.9634969676164321, + "grad_norm": 0.5540017485618591, + "learning_rate": 2.0932179489066006e-07, + "loss": 1.3101, + "num_input_tokens_seen": 68976640, + "step": 8420 + }, + { + "epoch": 0.9646412633024374, + "grad_norm": 0.7891622185707092, + "learning_rate": 1.9642081399307844e-07, + "loss": 1.4347, + "num_input_tokens_seen": 69058560, + "step": 8430 + }, + { + "epoch": 0.9657855589884426, + "grad_norm": 0.8638253211975098, + "learning_rate": 1.8392856996110875e-07, + "loss": 0.9711, + "num_input_tokens_seen": 69140480, + "step": 8440 + }, + { + "epoch": 0.9669298546744479, + "grad_norm": 0.49595341086387634, + "learning_rate": 1.7184526865377805e-07, + "loss": 1.1246, + "num_input_tokens_seen": 69222400, + "step": 8450 + }, + { + "epoch": 0.9680741503604532, + "grad_norm": 0.5145378708839417, + "learning_rate": 1.6017110919116786e-07, + "loss": 1.182, + "num_input_tokens_seen": 69304320, + "step": 8460 + }, + { + "epoch": 0.9692184460464585, + "grad_norm": 0.5098930597305298, + "learning_rate": 1.4890628395113072e-07, + "loss": 1.2913, + "num_input_tokens_seen": 69386240, + "step": 8470 + }, + { + "epoch": 0.9703627417324636, + "grad_norm": 0.5276376008987427, + "learning_rate": 1.380509785661288e-07, + "loss": 1.1669, + "num_input_tokens_seen": 69468160, + "step": 8480 + }, + { + "epoch": 0.9715070374184689, + "grad_norm": 0.5354599952697754, + "learning_rate": 1.2760537192015866e-07, + "loss": 1.2136, + "num_input_tokens_seen": 69550080, + "step": 8490 + }, + { + "epoch": 0.9726513331044742, + "grad_norm": 0.43569329380989075, + "learning_rate": 1.1756963614582006e-07, + "loss": 1.4041, + "num_input_tokens_seen": 69632000, + "step": 8500 + }, + { + "epoch": 0.9737956287904794, + "grad_norm": 0.4199809432029724, + "learning_rate": 1.0794393662147129e-07, + "loss": 1.4145, + "num_input_tokens_seen": 69713920, + "step": 8510 + }, + { + "epoch": 0.9749399244764847, + "grad_norm": 0.45975956320762634, + "learning_rate": 9.872843196850057e-08, + "loss": 1.1534, + "num_input_tokens_seen": 69795840, + "step": 8520 + }, + { + "epoch": 0.97608422016249, + "grad_norm": 0.47319695353507996, + "learning_rate": 8.992327404872825e-08, + "loss": 1.1406, + "num_input_tokens_seen": 69877760, + "step": 8530 + }, + { + "epoch": 0.9772285158484952, + "grad_norm": 0.6402721405029297, + "learning_rate": 8.152860796187545e-08, + "loss": 1.4171, + "num_input_tokens_seen": 69959680, + "step": 8540 + }, + { + "epoch": 0.9783728115345005, + "grad_norm": 0.5459085702896118, + "learning_rate": 7.354457204320486e-08, + "loss": 1.2596, + "num_input_tokens_seen": 70041600, + "step": 8550 + }, + { + "epoch": 0.9795171072205058, + "grad_norm": 0.49747464060783386, + "learning_rate": 6.59712978612198e-08, + "loss": 1.4422, + "num_input_tokens_seen": 70123520, + "step": 8560 + }, + { + "epoch": 0.980661402906511, + "grad_norm": 0.5260741114616394, + "learning_rate": 5.880891021549928e-08, + "loss": 1.1456, + "num_input_tokens_seen": 70205440, + "step": 8570 + }, + { + "epoch": 0.9818056985925163, + "grad_norm": 0.5129789113998413, + "learning_rate": 5.205752713465794e-08, + "loss": 1.235, + "num_input_tokens_seen": 70287360, + "step": 8580 + }, + { + "epoch": 0.9829499942785216, + "grad_norm": 0.44936448335647583, + "learning_rate": 4.57172598743727e-08, + "loss": 1.2076, + "num_input_tokens_seen": 70369280, + "step": 8590 + }, + { + "epoch": 0.9840942899645269, + "grad_norm": 0.5832532644271851, + "learning_rate": 3.9788212915573e-08, + "loss": 1.1558, + "num_input_tokens_seen": 70451200, + "step": 8600 + }, + { + "epoch": 0.9852385856505321, + "grad_norm": 0.44010186195373535, + "learning_rate": 3.427048396271171e-08, + "loss": 1.2666, + "num_input_tokens_seen": 70533120, + "step": 8610 + }, + { + "epoch": 0.9863828813365374, + "grad_norm": 0.479427695274353, + "learning_rate": 2.9164163942146937e-08, + "loss": 1.4532, + "num_input_tokens_seen": 70615040, + "step": 8620 + }, + { + "epoch": 0.9875271770225427, + "grad_norm": 0.5471181273460388, + "learning_rate": 2.44693370006599e-08, + "loss": 1.4154, + "num_input_tokens_seen": 70696960, + "step": 8630 + }, + { + "epoch": 0.9886714727085479, + "grad_norm": 0.5122755765914917, + "learning_rate": 2.0186080504050466e-08, + "loss": 1.062, + "num_input_tokens_seen": 70778880, + "step": 8640 + }, + { + "epoch": 0.9898157683945531, + "grad_norm": 0.5013795495033264, + "learning_rate": 1.6314465035879855e-08, + "loss": 1.0299, + "num_input_tokens_seen": 70860800, + "step": 8650 + }, + { + "epoch": 0.9909600640805584, + "grad_norm": 0.4794003963470459, + "learning_rate": 1.2854554396291018e-08, + "loss": 1.1402, + "num_input_tokens_seen": 70942720, + "step": 8660 + }, + { + "epoch": 0.9921043597665636, + "grad_norm": 1.2419332265853882, + "learning_rate": 9.806405600967794e-09, + "loss": 1.3485, + "num_input_tokens_seen": 71024640, + "step": 8670 + }, + { + "epoch": 0.9932486554525689, + "grad_norm": 0.5120067596435547, + "learning_rate": 7.1700688801940034e-09, + "loss": 1.26, + "num_input_tokens_seen": 71106560, + "step": 8680 + }, + { + "epoch": 0.9943929511385742, + "grad_norm": 0.5355421900749207, + "learning_rate": 4.94558767802078e-09, + "loss": 1.1129, + "num_input_tokens_seen": 71188480, + "step": 8690 + }, + { + "epoch": 0.9955372468245794, + "grad_norm": 0.6079626083374023, + "learning_rate": 3.1329986515560295e-09, + "loss": 1.2407, + "num_input_tokens_seen": 71270400, + "step": 8700 + }, + { + "epoch": 0.9966815425105847, + "grad_norm": 0.48402678966522217, + "learning_rate": 1.7323316703621305e-09, + "loss": 1.2159, + "num_input_tokens_seen": 71352320, + "step": 8710 + }, + { + "epoch": 0.99782583819659, + "grad_norm": 0.45963913202285767, + "learning_rate": 7.436098159480099e-10, + "loss": 1.0369, + "num_input_tokens_seen": 71434240, + "step": 8720 + }, + { + "epoch": 0.9989701338825953, + "grad_norm": 0.5231521725654602, + "learning_rate": 1.6684938141664498e-10, + "loss": 1.1446, + "num_input_tokens_seen": 71516160, + "step": 8730 + }, + { + "epoch": 1.0, + "num_input_tokens_seen": 71589888, + "step": 8739, + "total_flos": 3.2596926707332547e+18, + "train_loss": 1.260579330722832, + "train_runtime": 44987.2731, + "train_samples_per_second": 0.194, + "train_steps_per_second": 0.194 + } + ], + "logging_steps": 10, + "max_steps": 8739, + "num_input_tokens_seen": 71589888, + "num_train_epochs": 1, + "save_steps": 1000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 3.2596926707332547e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}