{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.49997953419835456, "eval_steps": 4886, "global_step": 24430, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0009823584789816217, "grad_norm": 640.0, "learning_rate": 0.001, "loss": 11885.6341, "step": 48 }, { "epoch": 0.0019647169579632435, "grad_norm": 454.0, "learning_rate": 0.001, "loss": 8966.3691, "step": 96 }, { "epoch": 0.0029470754369448652, "grad_norm": 572.0, "learning_rate": 0.001, "loss": 7738.5072, "step": 144 }, { "epoch": 0.003929433915926487, "grad_norm": 636.0, "learning_rate": 0.001, "loss": 7036.127, "step": 192 }, { "epoch": 0.004911792394908109, "grad_norm": 748.0, "learning_rate": 0.001, "loss": 6540.5202, "step": 240 }, { "epoch": 0.0058941508738897305, "grad_norm": 556.0, "learning_rate": 0.001, "loss": 6263.0488, "step": 288 }, { "epoch": 0.006876509352871352, "grad_norm": 532.0, "learning_rate": 0.001, "loss": 5950.1823, "step": 336 }, { "epoch": 0.007858867831852974, "grad_norm": 928.0, "learning_rate": 0.001, "loss": 5705.2292, "step": 384 }, { "epoch": 0.008841226310834595, "grad_norm": 444.0, "learning_rate": 0.001, "loss": 5496.4583, "step": 432 }, { "epoch": 0.009823584789816217, "grad_norm": 656.0, "learning_rate": 0.001, "loss": 5272.5752, "step": 480 }, { "epoch": 0.010805943268797838, "grad_norm": 612.0, "learning_rate": 0.001, "loss": 5051.2663, "step": 528 }, { "epoch": 0.011788301747779461, "grad_norm": 608.0, "learning_rate": 0.001, "loss": 4938.0895, "step": 576 }, { "epoch": 0.012770660226761082, "grad_norm": 466.0, "learning_rate": 0.001, "loss": 4740.762, "step": 624 }, { "epoch": 0.013753018705742704, "grad_norm": 438.0, "learning_rate": 0.001, "loss": 4573.4443, "step": 672 }, { "epoch": 0.014735377184724325, "grad_norm": 696.0, "learning_rate": 0.001, "loss": 4539.9521, "step": 720 }, { "epoch": 0.015717735663705948, "grad_norm": 892.0, "learning_rate": 0.001, "loss": 4435.4001, "step": 768 }, { "epoch": 0.01670009414268757, "grad_norm": 704.0, "learning_rate": 0.001, "loss": 4239.0426, "step": 816 }, { "epoch": 0.01768245262166919, "grad_norm": 548.0, "learning_rate": 0.001, "loss": 4189.9281, "step": 864 }, { "epoch": 0.018664811100650814, "grad_norm": 540.0, "learning_rate": 0.001, "loss": 4104.7835, "step": 912 }, { "epoch": 0.019647169579632435, "grad_norm": 592.0, "learning_rate": 0.001, "loss": 4044.3369, "step": 960 }, { "epoch": 0.020629528058614056, "grad_norm": 536.0, "learning_rate": 0.001, "loss": 3936.5283, "step": 1008 }, { "epoch": 0.021611886537595677, "grad_norm": 604.0, "learning_rate": 0.001, "loss": 3915.6911, "step": 1056 }, { "epoch": 0.0225942450165773, "grad_norm": 458.0, "learning_rate": 0.001, "loss": 3759.7747, "step": 1104 }, { "epoch": 0.023576603495558922, "grad_norm": 636.0, "learning_rate": 0.001, "loss": 3760.4476, "step": 1152 }, { "epoch": 0.024558961974540543, "grad_norm": 672.0, "learning_rate": 0.001, "loss": 3672.9059, "step": 1200 }, { "epoch": 0.025541320453522164, "grad_norm": 592.0, "learning_rate": 0.001, "loss": 3645.0697, "step": 1248 }, { "epoch": 0.026523678932503784, "grad_norm": 552.0, "learning_rate": 0.001, "loss": 3528.9896, "step": 1296 }, { "epoch": 0.02750603741148541, "grad_norm": 470.0, "learning_rate": 0.001, "loss": 3488.8187, "step": 1344 }, { "epoch": 0.02848839589046703, "grad_norm": 580.0, "learning_rate": 0.001, "loss": 3466.627, "step": 1392 }, { "epoch": 0.02947075436944865, "grad_norm": 584.0, "learning_rate": 0.001, "loss": 3399.1475, "step": 1440 }, { "epoch": 0.03045311284843027, "grad_norm": 732.0, "learning_rate": 0.001, "loss": 3363.9762, "step": 1488 }, { "epoch": 0.031435471327411896, "grad_norm": 616.0, "learning_rate": 0.001, "loss": 3337.4564, "step": 1536 }, { "epoch": 0.03241782980639352, "grad_norm": 540.0, "learning_rate": 0.001, "loss": 3298.4583, "step": 1584 }, { "epoch": 0.03340018828537514, "grad_norm": 512.0, "learning_rate": 0.001, "loss": 3212.2949, "step": 1632 }, { "epoch": 0.03438254676435676, "grad_norm": 512.0, "learning_rate": 0.001, "loss": 3217.6631, "step": 1680 }, { "epoch": 0.03536490524333838, "grad_norm": 624.0, "learning_rate": 0.001, "loss": 3175.7318, "step": 1728 }, { "epoch": 0.03634726372232, "grad_norm": 520.0, "learning_rate": 0.001, "loss": 3140.3923, "step": 1776 }, { "epoch": 0.03732962220130163, "grad_norm": 664.0, "learning_rate": 0.001, "loss": 3099.8044, "step": 1824 }, { "epoch": 0.03831198068028325, "grad_norm": 604.0, "learning_rate": 0.001, "loss": 3114.8079, "step": 1872 }, { "epoch": 0.03929433915926487, "grad_norm": 478.0, "learning_rate": 0.001, "loss": 3048.9001, "step": 1920 }, { "epoch": 0.04027669763824649, "grad_norm": 520.0, "learning_rate": 0.001, "loss": 3018.8714, "step": 1968 }, { "epoch": 0.04125905611722811, "grad_norm": 456.0, "learning_rate": 0.001, "loss": 2981.1152, "step": 2016 }, { "epoch": 0.04224141459620973, "grad_norm": 628.0, "learning_rate": 0.001, "loss": 2999.249, "step": 2064 }, { "epoch": 0.04322377307519135, "grad_norm": 804.0, "learning_rate": 0.001, "loss": 2942.3376, "step": 2112 }, { "epoch": 0.044206131554172974, "grad_norm": 600.0, "learning_rate": 0.001, "loss": 2890.7354, "step": 2160 }, { "epoch": 0.0451884900331546, "grad_norm": 632.0, "learning_rate": 0.001, "loss": 2896.4242, "step": 2208 }, { "epoch": 0.04617084851213622, "grad_norm": 536.0, "learning_rate": 0.001, "loss": 2874.8643, "step": 2256 }, { "epoch": 0.047153206991117844, "grad_norm": 494.0, "learning_rate": 0.001, "loss": 2807.6911, "step": 2304 }, { "epoch": 0.048135565470099465, "grad_norm": 548.0, "learning_rate": 0.001, "loss": 2820.04, "step": 2352 }, { "epoch": 0.049117923949081085, "grad_norm": 636.0, "learning_rate": 0.001, "loss": 2787.0247, "step": 2400 }, { "epoch": 0.050100282428062706, "grad_norm": 624.0, "learning_rate": 0.001, "loss": 2782.2428, "step": 2448 }, { "epoch": 0.05108264090704433, "grad_norm": 628.0, "learning_rate": 0.001, "loss": 2725.3781, "step": 2496 }, { "epoch": 0.05206499938602595, "grad_norm": 540.0, "learning_rate": 0.001, "loss": 2755.7458, "step": 2544 }, { "epoch": 0.05304735786500757, "grad_norm": 568.0, "learning_rate": 0.001, "loss": 2699.16, "step": 2592 }, { "epoch": 0.0540297163439892, "grad_norm": 808.0, "learning_rate": 0.001, "loss": 2680.3232, "step": 2640 }, { "epoch": 0.05501207482297082, "grad_norm": 564.0, "learning_rate": 0.001, "loss": 2669.6646, "step": 2688 }, { "epoch": 0.05599443330195244, "grad_norm": 552.0, "learning_rate": 0.001, "loss": 2683.8433, "step": 2736 }, { "epoch": 0.05697679178093406, "grad_norm": 636.0, "learning_rate": 0.001, "loss": 2643.8172, "step": 2784 }, { "epoch": 0.05795915025991568, "grad_norm": 580.0, "learning_rate": 0.001, "loss": 2649.0441, "step": 2832 }, { "epoch": 0.0589415087388973, "grad_norm": 512.0, "learning_rate": 0.001, "loss": 2615.9657, "step": 2880 }, { "epoch": 0.05992386721787892, "grad_norm": 688.0, "learning_rate": 0.001, "loss": 2608.1457, "step": 2928 }, { "epoch": 0.06090622569686054, "grad_norm": 696.0, "learning_rate": 0.001, "loss": 2590.1567, "step": 2976 }, { "epoch": 0.06188858417584217, "grad_norm": 704.0, "learning_rate": 0.001, "loss": 2627.8358, "step": 3024 }, { "epoch": 0.06287094265482379, "grad_norm": 704.0, "learning_rate": 0.001, "loss": 2538.9543, "step": 3072 }, { "epoch": 0.06385330113380541, "grad_norm": 568.0, "learning_rate": 0.001, "loss": 2482.9673, "step": 3120 }, { "epoch": 0.06483565961278703, "grad_norm": 548.0, "learning_rate": 0.001, "loss": 2530.4771, "step": 3168 }, { "epoch": 0.06581801809176865, "grad_norm": 456.0, "learning_rate": 0.001, "loss": 2496.41, "step": 3216 }, { "epoch": 0.06680037657075028, "grad_norm": 684.0, "learning_rate": 0.001, "loss": 2518.8866, "step": 3264 }, { "epoch": 0.0677827350497319, "grad_norm": 664.0, "learning_rate": 0.001, "loss": 2475.0793, "step": 3312 }, { "epoch": 0.06876509352871352, "grad_norm": 812.0, "learning_rate": 0.001, "loss": 2461.3527, "step": 3360 }, { "epoch": 0.06974745200769514, "grad_norm": 490.0, "learning_rate": 0.001, "loss": 2467.4508, "step": 3408 }, { "epoch": 0.07072981048667676, "grad_norm": 648.0, "learning_rate": 0.001, "loss": 2443.8037, "step": 3456 }, { "epoch": 0.07171216896565838, "grad_norm": 664.0, "learning_rate": 0.001, "loss": 2445.9336, "step": 3504 }, { "epoch": 0.07269452744464, "grad_norm": 524.0, "learning_rate": 0.001, "loss": 2411.4482, "step": 3552 }, { "epoch": 0.07367688592362164, "grad_norm": 608.0, "learning_rate": 0.001, "loss": 2417.4673, "step": 3600 }, { "epoch": 0.07465924440260326, "grad_norm": 504.0, "learning_rate": 0.001, "loss": 2420.4196, "step": 3648 }, { "epoch": 0.07564160288158488, "grad_norm": 564.0, "learning_rate": 0.001, "loss": 2390.8983, "step": 3696 }, { "epoch": 0.0766239613605665, "grad_norm": 664.0, "learning_rate": 0.001, "loss": 2377.8607, "step": 3744 }, { "epoch": 0.07760631983954812, "grad_norm": 616.0, "learning_rate": 0.001, "loss": 2359.1242, "step": 3792 }, { "epoch": 0.07858867831852974, "grad_norm": 636.0, "learning_rate": 0.001, "loss": 2385.3102, "step": 3840 }, { "epoch": 0.07957103679751136, "grad_norm": 454.0, "learning_rate": 0.001, "loss": 2373.0225, "step": 3888 }, { "epoch": 0.08055339527649298, "grad_norm": 502.0, "learning_rate": 0.001, "loss": 2361.2386, "step": 3936 }, { "epoch": 0.0815357537554746, "grad_norm": 506.0, "learning_rate": 0.001, "loss": 2341.1328, "step": 3984 }, { "epoch": 0.08251811223445622, "grad_norm": 472.0, "learning_rate": 0.001, "loss": 2308.069, "step": 4032 }, { "epoch": 0.08350047071343784, "grad_norm": 472.0, "learning_rate": 0.001, "loss": 2305.2542, "step": 4080 }, { "epoch": 0.08448282919241946, "grad_norm": 502.0, "learning_rate": 0.001, "loss": 2338.4048, "step": 4128 }, { "epoch": 0.08546518767140109, "grad_norm": 628.0, "learning_rate": 0.001, "loss": 2307.96, "step": 4176 }, { "epoch": 0.0864475461503827, "grad_norm": 516.0, "learning_rate": 0.001, "loss": 2314.313, "step": 4224 }, { "epoch": 0.08742990462936433, "grad_norm": 748.0, "learning_rate": 0.001, "loss": 2270.4917, "step": 4272 }, { "epoch": 0.08841226310834595, "grad_norm": 576.0, "learning_rate": 0.001, "loss": 2292.9497, "step": 4320 }, { "epoch": 0.08939462158732757, "grad_norm": 604.0, "learning_rate": 0.001, "loss": 2274.6584, "step": 4368 }, { "epoch": 0.0903769800663092, "grad_norm": 580.0, "learning_rate": 0.001, "loss": 2275.2266, "step": 4416 }, { "epoch": 0.09135933854529082, "grad_norm": 548.0, "learning_rate": 0.001, "loss": 2262.757, "step": 4464 }, { "epoch": 0.09234169702427245, "grad_norm": 628.0, "learning_rate": 0.001, "loss": 2257.687, "step": 4512 }, { "epoch": 0.09332405550325407, "grad_norm": 544.0, "learning_rate": 0.001, "loss": 2259.9118, "step": 4560 }, { "epoch": 0.09430641398223569, "grad_norm": 580.0, "learning_rate": 0.001, "loss": 2224.4427, "step": 4608 }, { "epoch": 0.09528877246121731, "grad_norm": 628.0, "learning_rate": 0.001, "loss": 2248.397, "step": 4656 }, { "epoch": 0.09627113094019893, "grad_norm": 600.0, "learning_rate": 0.001, "loss": 2203.2843, "step": 4704 }, { "epoch": 0.09725348941918055, "grad_norm": 780.0, "learning_rate": 0.001, "loss": 2223.5656, "step": 4752 }, { "epoch": 0.09823584789816217, "grad_norm": 536.0, "learning_rate": 0.001, "loss": 2169.4321, "step": 4800 }, { "epoch": 0.09921820637714379, "grad_norm": 516.0, "learning_rate": 0.001, "loss": 2183.4987, "step": 4848 }, { "epoch": 0.09999590683967091, "eval_loss": 2087.82763671875, "eval_runtime": 9.0001, "eval_samples_per_second": 111.11, "eval_steps_per_second": 1.444, "step": 4886 }, { "epoch": 0.10020056485612541, "grad_norm": 620.0, "learning_rate": 0.001, "loss": 2210.3151, "step": 4896 }, { "epoch": 0.10118292333510703, "grad_norm": 616.0, "learning_rate": 0.001, "loss": 2208.1779, "step": 4944 }, { "epoch": 0.10216528181408865, "grad_norm": 592.0, "learning_rate": 0.001, "loss": 2166.7116, "step": 4992 }, { "epoch": 0.10314764029307028, "grad_norm": 596.0, "learning_rate": 0.001, "loss": 2191.4295, "step": 5040 }, { "epoch": 0.1041299987720519, "grad_norm": 684.0, "learning_rate": 0.001, "loss": 2155.1141, "step": 5088 }, { "epoch": 0.10511235725103352, "grad_norm": 512.0, "learning_rate": 0.001, "loss": 2135.7635, "step": 5136 }, { "epoch": 0.10609471573001514, "grad_norm": 506.0, "learning_rate": 0.001, "loss": 2155.5701, "step": 5184 }, { "epoch": 0.10707707420899677, "grad_norm": 480.0, "learning_rate": 0.001, "loss": 2150.0086, "step": 5232 }, { "epoch": 0.1080594326879784, "grad_norm": 540.0, "learning_rate": 0.001, "loss": 2142.4181, "step": 5280 }, { "epoch": 0.10904179116696001, "grad_norm": 572.0, "learning_rate": 0.001, "loss": 2116.3011, "step": 5328 }, { "epoch": 0.11002414964594164, "grad_norm": 548.0, "learning_rate": 0.001, "loss": 2141.0239, "step": 5376 }, { "epoch": 0.11100650812492326, "grad_norm": 676.0, "learning_rate": 0.001, "loss": 2119.1307, "step": 5424 }, { "epoch": 0.11198886660390488, "grad_norm": 656.0, "learning_rate": 0.001, "loss": 2137.8016, "step": 5472 }, { "epoch": 0.1129712250828865, "grad_norm": 676.0, "learning_rate": 0.001, "loss": 2119.2923, "step": 5520 }, { "epoch": 0.11395358356186812, "grad_norm": 588.0, "learning_rate": 0.001, "loss": 2120.9912, "step": 5568 }, { "epoch": 0.11493594204084974, "grad_norm": 612.0, "learning_rate": 0.001, "loss": 2111.5037, "step": 5616 }, { "epoch": 0.11591830051983136, "grad_norm": 588.0, "learning_rate": 0.001, "loss": 2119.6444, "step": 5664 }, { "epoch": 0.11690065899881298, "grad_norm": 700.0, "learning_rate": 0.001, "loss": 2078.1807, "step": 5712 }, { "epoch": 0.1178830174777946, "grad_norm": 564.0, "learning_rate": 0.001, "loss": 2095.8706, "step": 5760 }, { "epoch": 0.11886537595677622, "grad_norm": 552.0, "learning_rate": 0.001, "loss": 2080.8527, "step": 5808 }, { "epoch": 0.11984773443575784, "grad_norm": 488.0, "learning_rate": 0.001, "loss": 2062.9159, "step": 5856 }, { "epoch": 0.12083009291473946, "grad_norm": 616.0, "learning_rate": 0.001, "loss": 2060.964, "step": 5904 }, { "epoch": 0.12181245139372109, "grad_norm": 648.0, "learning_rate": 0.001, "loss": 2088.8507, "step": 5952 }, { "epoch": 0.12279480987270272, "grad_norm": 604.0, "learning_rate": 0.001, "loss": 2052.1393, "step": 6000 }, { "epoch": 0.12377716835168434, "grad_norm": 720.0, "learning_rate": 0.001, "loss": 2043.2277, "step": 6048 }, { "epoch": 0.12475952683066596, "grad_norm": 616.0, "learning_rate": 0.001, "loss": 2043.3983, "step": 6096 }, { "epoch": 0.12574188530964758, "grad_norm": 668.0, "learning_rate": 0.001, "loss": 2080.6297, "step": 6144 }, { "epoch": 0.1267242437886292, "grad_norm": 532.0, "learning_rate": 0.001, "loss": 2059.5207, "step": 6192 }, { "epoch": 0.12770660226761082, "grad_norm": 568.0, "learning_rate": 0.001, "loss": 2030.5203, "step": 6240 }, { "epoch": 0.12868896074659245, "grad_norm": 560.0, "learning_rate": 0.001, "loss": 2047.7404, "step": 6288 }, { "epoch": 0.12967131922557407, "grad_norm": 624.0, "learning_rate": 0.001, "loss": 2043.3193, "step": 6336 }, { "epoch": 0.1306536777045557, "grad_norm": 592.0, "learning_rate": 0.001, "loss": 2051.0589, "step": 6384 }, { "epoch": 0.1316360361835373, "grad_norm": 876.0, "learning_rate": 0.001, "loss": 2054.3232, "step": 6432 }, { "epoch": 0.13261839466251893, "grad_norm": 544.0, "learning_rate": 0.001, "loss": 2047.3159, "step": 6480 }, { "epoch": 0.13360075314150055, "grad_norm": 648.0, "learning_rate": 0.001, "loss": 2029.021, "step": 6528 }, { "epoch": 0.13458311162048217, "grad_norm": 556.0, "learning_rate": 0.001, "loss": 2027.506, "step": 6576 }, { "epoch": 0.1355654700994638, "grad_norm": 672.0, "learning_rate": 0.001, "loss": 2034.3325, "step": 6624 }, { "epoch": 0.1365478285784454, "grad_norm": 648.0, "learning_rate": 0.001, "loss": 1988.6841, "step": 6672 }, { "epoch": 0.13753018705742703, "grad_norm": 592.0, "learning_rate": 0.001, "loss": 1998.0236, "step": 6720 }, { "epoch": 0.13851254553640865, "grad_norm": 552.0, "learning_rate": 0.001, "loss": 2008.8337, "step": 6768 }, { "epoch": 0.13949490401539028, "grad_norm": 780.0, "learning_rate": 0.001, "loss": 2008.4787, "step": 6816 }, { "epoch": 0.1404772624943719, "grad_norm": 716.0, "learning_rate": 0.001, "loss": 1995.237, "step": 6864 }, { "epoch": 0.14145962097335352, "grad_norm": 804.0, "learning_rate": 0.001, "loss": 1996.2018, "step": 6912 }, { "epoch": 0.14244197945233514, "grad_norm": 652.0, "learning_rate": 0.001, "loss": 1992.167, "step": 6960 }, { "epoch": 0.14342433793131676, "grad_norm": 544.0, "learning_rate": 0.001, "loss": 1985.2515, "step": 7008 }, { "epoch": 0.14440669641029838, "grad_norm": 600.0, "learning_rate": 0.001, "loss": 1989.0208, "step": 7056 }, { "epoch": 0.14538905488928, "grad_norm": 712.0, "learning_rate": 0.001, "loss": 1993.743, "step": 7104 }, { "epoch": 0.14637141336826162, "grad_norm": 580.0, "learning_rate": 0.001, "loss": 1986.2668, "step": 7152 }, { "epoch": 0.14735377184724327, "grad_norm": 716.0, "learning_rate": 0.001, "loss": 1988.6514, "step": 7200 }, { "epoch": 0.1483361303262249, "grad_norm": 452.0, "learning_rate": 0.001, "loss": 1971.7622, "step": 7248 }, { "epoch": 0.1493184888052065, "grad_norm": 576.0, "learning_rate": 0.001, "loss": 1977.0863, "step": 7296 }, { "epoch": 0.15030084728418813, "grad_norm": 708.0, "learning_rate": 0.001, "loss": 1968.3294, "step": 7344 }, { "epoch": 0.15128320576316975, "grad_norm": 572.0, "learning_rate": 0.001, "loss": 1981.1888, "step": 7392 }, { "epoch": 0.15226556424215137, "grad_norm": 748.0, "learning_rate": 0.001, "loss": 1937.5469, "step": 7440 }, { "epoch": 0.153247922721133, "grad_norm": 672.0, "learning_rate": 0.001, "loss": 1944.2785, "step": 7488 }, { "epoch": 0.15423028120011462, "grad_norm": 816.0, "learning_rate": 0.001, "loss": 1934.2336, "step": 7536 }, { "epoch": 0.15521263967909624, "grad_norm": 740.0, "learning_rate": 0.001, "loss": 1953.3698, "step": 7584 }, { "epoch": 0.15619499815807786, "grad_norm": 652.0, "learning_rate": 0.001, "loss": 1951.9084, "step": 7632 }, { "epoch": 0.15717735663705948, "grad_norm": 652.0, "learning_rate": 0.001, "loss": 1934.8753, "step": 7680 }, { "epoch": 0.1581597151160411, "grad_norm": 652.0, "learning_rate": 0.001, "loss": 1923.8843, "step": 7728 }, { "epoch": 0.15914207359502272, "grad_norm": 612.0, "learning_rate": 0.001, "loss": 1935.5955, "step": 7776 }, { "epoch": 0.16012443207400434, "grad_norm": 724.0, "learning_rate": 0.001, "loss": 1962.8574, "step": 7824 }, { "epoch": 0.16110679055298596, "grad_norm": 540.0, "learning_rate": 0.001, "loss": 1955.3468, "step": 7872 }, { "epoch": 0.16208914903196758, "grad_norm": 752.0, "learning_rate": 0.001, "loss": 1915.3901, "step": 7920 }, { "epoch": 0.1630715075109492, "grad_norm": 572.0, "learning_rate": 0.001, "loss": 1944.2292, "step": 7968 }, { "epoch": 0.16405386598993082, "grad_norm": 668.0, "learning_rate": 0.001, "loss": 1926.0425, "step": 8016 }, { "epoch": 0.16503622446891245, "grad_norm": 556.0, "learning_rate": 0.001, "loss": 1938.1131, "step": 8064 }, { "epoch": 0.16601858294789407, "grad_norm": 756.0, "learning_rate": 0.001, "loss": 1925.4678, "step": 8112 }, { "epoch": 0.1670009414268757, "grad_norm": 848.0, "learning_rate": 0.001, "loss": 1921.8462, "step": 8160 }, { "epoch": 0.1679832999058573, "grad_norm": 588.0, "learning_rate": 0.001, "loss": 1890.1263, "step": 8208 }, { "epoch": 0.16896565838483893, "grad_norm": 580.0, "learning_rate": 0.001, "loss": 1923.7113, "step": 8256 }, { "epoch": 0.16994801686382055, "grad_norm": 712.0, "learning_rate": 0.001, "loss": 1902.661, "step": 8304 }, { "epoch": 0.17093037534280217, "grad_norm": 676.0, "learning_rate": 0.001, "loss": 1898.4054, "step": 8352 }, { "epoch": 0.1719127338217838, "grad_norm": 604.0, "learning_rate": 0.001, "loss": 1899.0542, "step": 8400 }, { "epoch": 0.1728950923007654, "grad_norm": 700.0, "learning_rate": 0.001, "loss": 1906.8057, "step": 8448 }, { "epoch": 0.17387745077974703, "grad_norm": 716.0, "learning_rate": 0.001, "loss": 1908.1032, "step": 8496 }, { "epoch": 0.17485980925872865, "grad_norm": 628.0, "learning_rate": 0.001, "loss": 1928.3221, "step": 8544 }, { "epoch": 0.17584216773771028, "grad_norm": 664.0, "learning_rate": 0.001, "loss": 1890.7321, "step": 8592 }, { "epoch": 0.1768245262166919, "grad_norm": 556.0, "learning_rate": 0.001, "loss": 1910.8001, "step": 8640 }, { "epoch": 0.17780688469567352, "grad_norm": 648.0, "learning_rate": 0.001, "loss": 1908.4972, "step": 8688 }, { "epoch": 0.17878924317465514, "grad_norm": 608.0, "learning_rate": 0.001, "loss": 1870.7344, "step": 8736 }, { "epoch": 0.1797716016536368, "grad_norm": 644.0, "learning_rate": 0.001, "loss": 1901.4289, "step": 8784 }, { "epoch": 0.1807539601326184, "grad_norm": 580.0, "learning_rate": 0.001, "loss": 1883.8433, "step": 8832 }, { "epoch": 0.18173631861160003, "grad_norm": 828.0, "learning_rate": 0.001, "loss": 1869.978, "step": 8880 }, { "epoch": 0.18271867709058165, "grad_norm": 652.0, "learning_rate": 0.001, "loss": 1895.2178, "step": 8928 }, { "epoch": 0.18370103556956327, "grad_norm": 680.0, "learning_rate": 0.001, "loss": 1857.217, "step": 8976 }, { "epoch": 0.1846833940485449, "grad_norm": 608.0, "learning_rate": 0.001, "loss": 1880.6992, "step": 9024 }, { "epoch": 0.1856657525275265, "grad_norm": 664.0, "learning_rate": 0.001, "loss": 1869.5422, "step": 9072 }, { "epoch": 0.18664811100650813, "grad_norm": 720.0, "learning_rate": 0.001, "loss": 1898.1034, "step": 9120 }, { "epoch": 0.18763046948548975, "grad_norm": 604.0, "learning_rate": 0.001, "loss": 1887.1818, "step": 9168 }, { "epoch": 0.18861282796447137, "grad_norm": 672.0, "learning_rate": 0.001, "loss": 1869.6294, "step": 9216 }, { "epoch": 0.189595186443453, "grad_norm": 572.0, "learning_rate": 0.001, "loss": 1857.5962, "step": 9264 }, { "epoch": 0.19057754492243462, "grad_norm": 732.0, "learning_rate": 0.001, "loss": 1869.6444, "step": 9312 }, { "epoch": 0.19155990340141624, "grad_norm": 848.0, "learning_rate": 0.001, "loss": 1869.8807, "step": 9360 }, { "epoch": 0.19254226188039786, "grad_norm": 740.0, "learning_rate": 0.001, "loss": 1857.5882, "step": 9408 }, { "epoch": 0.19352462035937948, "grad_norm": 652.0, "learning_rate": 0.001, "loss": 1835.6294, "step": 9456 }, { "epoch": 0.1945069788383611, "grad_norm": 608.0, "learning_rate": 0.001, "loss": 1853.5081, "step": 9504 }, { "epoch": 0.19548933731734272, "grad_norm": 648.0, "learning_rate": 0.001, "loss": 1866.897, "step": 9552 }, { "epoch": 0.19647169579632434, "grad_norm": 724.0, "learning_rate": 0.001, "loss": 1848.0703, "step": 9600 }, { "epoch": 0.19745405427530596, "grad_norm": 628.0, "learning_rate": 0.001, "loss": 1867.1585, "step": 9648 }, { "epoch": 0.19843641275428758, "grad_norm": 612.0, "learning_rate": 0.001, "loss": 1864.8203, "step": 9696 }, { "epoch": 0.1994187712332692, "grad_norm": 572.0, "learning_rate": 0.001, "loss": 1827.0848, "step": 9744 }, { "epoch": 0.19999181367934182, "eval_loss": 1771.5172119140625, "eval_runtime": 9.0052, "eval_samples_per_second": 111.047, "eval_steps_per_second": 1.444, "step": 9772 }, { "epoch": 0.20040112971225083, "grad_norm": 784.0, "learning_rate": 0.001, "loss": 1850.5506, "step": 9792 }, { "epoch": 0.20138348819123245, "grad_norm": 728.0, "learning_rate": 0.001, "loss": 1853.3254, "step": 9840 }, { "epoch": 0.20236584667021407, "grad_norm": 744.0, "learning_rate": 0.001, "loss": 1884.8763, "step": 9888 }, { "epoch": 0.2033482051491957, "grad_norm": 740.0, "learning_rate": 0.001, "loss": 1852.5361, "step": 9936 }, { "epoch": 0.2043305636281773, "grad_norm": 780.0, "learning_rate": 0.001, "loss": 1840.8996, "step": 9984 }, { "epoch": 0.20531292210715893, "grad_norm": 676.0, "learning_rate": 0.001, "loss": 1848.5868, "step": 10032 }, { "epoch": 0.20629528058614055, "grad_norm": 764.0, "learning_rate": 0.001, "loss": 1848.7498, "step": 10080 }, { "epoch": 0.20727763906512217, "grad_norm": 856.0, "learning_rate": 0.001, "loss": 1845.7381, "step": 10128 }, { "epoch": 0.2082599975441038, "grad_norm": 740.0, "learning_rate": 0.001, "loss": 1851.979, "step": 10176 }, { "epoch": 0.2092423560230854, "grad_norm": 644.0, "learning_rate": 0.001, "loss": 1850.7716, "step": 10224 }, { "epoch": 0.21022471450206703, "grad_norm": 828.0, "learning_rate": 0.001, "loss": 1844.3057, "step": 10272 }, { "epoch": 0.21120707298104865, "grad_norm": 688.0, "learning_rate": 0.001, "loss": 1820.6678, "step": 10320 }, { "epoch": 0.21218943146003028, "grad_norm": 832.0, "learning_rate": 0.001, "loss": 1793.4041, "step": 10368 }, { "epoch": 0.21317178993901192, "grad_norm": 648.0, "learning_rate": 0.001, "loss": 1853.5828, "step": 10416 }, { "epoch": 0.21415414841799355, "grad_norm": 780.0, "learning_rate": 0.001, "loss": 1816.429, "step": 10464 }, { "epoch": 0.21513650689697517, "grad_norm": 708.0, "learning_rate": 0.001, "loss": 1827.7533, "step": 10512 }, { "epoch": 0.2161188653759568, "grad_norm": 812.0, "learning_rate": 0.001, "loss": 1807.555, "step": 10560 }, { "epoch": 0.2171012238549384, "grad_norm": 728.0, "learning_rate": 0.001, "loss": 1824.4678, "step": 10608 }, { "epoch": 0.21808358233392003, "grad_norm": 760.0, "learning_rate": 0.001, "loss": 1832.056, "step": 10656 }, { "epoch": 0.21906594081290165, "grad_norm": 672.0, "learning_rate": 0.001, "loss": 1819.7812, "step": 10704 }, { "epoch": 0.22004829929188327, "grad_norm": 808.0, "learning_rate": 0.001, "loss": 1837.8351, "step": 10752 }, { "epoch": 0.2210306577708649, "grad_norm": 716.0, "learning_rate": 0.001, "loss": 1823.1432, "step": 10800 }, { "epoch": 0.2220130162498465, "grad_norm": 768.0, "learning_rate": 0.001, "loss": 1810.9959, "step": 10848 }, { "epoch": 0.22299537472882813, "grad_norm": 756.0, "learning_rate": 0.001, "loss": 1848.2907, "step": 10896 }, { "epoch": 0.22397773320780975, "grad_norm": 744.0, "learning_rate": 0.001, "loss": 1786.6442, "step": 10944 }, { "epoch": 0.22496009168679137, "grad_norm": 740.0, "learning_rate": 0.001, "loss": 1804.0133, "step": 10992 }, { "epoch": 0.225942450165773, "grad_norm": 640.0, "learning_rate": 0.001, "loss": 1813.6567, "step": 11040 }, { "epoch": 0.22692480864475462, "grad_norm": 696.0, "learning_rate": 0.001, "loss": 1812.9946, "step": 11088 }, { "epoch": 0.22790716712373624, "grad_norm": 696.0, "learning_rate": 0.001, "loss": 1816.8553, "step": 11136 }, { "epoch": 0.22888952560271786, "grad_norm": 640.0, "learning_rate": 0.001, "loss": 1801.8009, "step": 11184 }, { "epoch": 0.22987188408169948, "grad_norm": 680.0, "learning_rate": 0.001, "loss": 1816.7332, "step": 11232 }, { "epoch": 0.2308542425606811, "grad_norm": 692.0, "learning_rate": 0.001, "loss": 1799.8708, "step": 11280 }, { "epoch": 0.23183660103966272, "grad_norm": 680.0, "learning_rate": 0.001, "loss": 1791.1471, "step": 11328 }, { "epoch": 0.23281895951864434, "grad_norm": 700.0, "learning_rate": 0.001, "loss": 1812.3979, "step": 11376 }, { "epoch": 0.23380131799762596, "grad_norm": 748.0, "learning_rate": 0.001, "loss": 1807.806, "step": 11424 }, { "epoch": 0.23478367647660758, "grad_norm": 716.0, "learning_rate": 0.001, "loss": 1811.6502, "step": 11472 }, { "epoch": 0.2357660349555892, "grad_norm": 736.0, "learning_rate": 0.001, "loss": 1821.8501, "step": 11520 }, { "epoch": 0.23674839343457083, "grad_norm": 648.0, "learning_rate": 0.001, "loss": 1777.6597, "step": 11568 }, { "epoch": 0.23773075191355245, "grad_norm": 768.0, "learning_rate": 0.001, "loss": 1804.2365, "step": 11616 }, { "epoch": 0.23871311039253407, "grad_norm": 672.0, "learning_rate": 0.001, "loss": 1794.9201, "step": 11664 }, { "epoch": 0.2396954688715157, "grad_norm": 696.0, "learning_rate": 0.001, "loss": 1775.2284, "step": 11712 }, { "epoch": 0.2406778273504973, "grad_norm": 700.0, "learning_rate": 0.001, "loss": 1785.5417, "step": 11760 }, { "epoch": 0.24166018582947893, "grad_norm": 840.0, "learning_rate": 0.001, "loss": 1792.2282, "step": 11808 }, { "epoch": 0.24264254430846055, "grad_norm": 992.0, "learning_rate": 0.001, "loss": 1799.9831, "step": 11856 }, { "epoch": 0.24362490278744217, "grad_norm": 872.0, "learning_rate": 0.001, "loss": 1804.1024, "step": 11904 }, { "epoch": 0.2446072612664238, "grad_norm": 668.0, "learning_rate": 0.001, "loss": 1785.5889, "step": 11952 }, { "epoch": 0.24558961974540544, "grad_norm": 872.0, "learning_rate": 0.001, "loss": 1785.6185, "step": 12000 }, { "epoch": 0.24657197822438706, "grad_norm": 784.0, "learning_rate": 0.001, "loss": 1785.6107, "step": 12048 }, { "epoch": 0.24755433670336868, "grad_norm": 644.0, "learning_rate": 0.001, "loss": 1789.2995, "step": 12096 }, { "epoch": 0.2485366951823503, "grad_norm": 772.0, "learning_rate": 0.001, "loss": 1780.3151, "step": 12144 }, { "epoch": 0.24951905366133192, "grad_norm": 728.0, "learning_rate": 0.001, "loss": 1769.0786, "step": 12192 }, { "epoch": 0.2505014121403135, "grad_norm": 752.0, "learning_rate": 0.001, "loss": 1801.0431, "step": 12240 }, { "epoch": 0.25148377061929517, "grad_norm": 844.0, "learning_rate": 0.001, "loss": 1796.9209, "step": 12288 }, { "epoch": 0.25246612909827676, "grad_norm": 1088.0, "learning_rate": 0.001, "loss": 1795.411, "step": 12336 }, { "epoch": 0.2534484875772584, "grad_norm": 1320.0, "learning_rate": 0.001, "loss": 1802.1553, "step": 12384 }, { "epoch": 0.25443084605624, "grad_norm": 716.0, "learning_rate": 0.001, "loss": 1772.7713, "step": 12432 }, { "epoch": 0.25541320453522165, "grad_norm": 744.0, "learning_rate": 0.001, "loss": 1787.7516, "step": 12480 }, { "epoch": 0.25639556301420324, "grad_norm": 744.0, "learning_rate": 0.001, "loss": 1785.005, "step": 12528 }, { "epoch": 0.2573779214931849, "grad_norm": 716.0, "learning_rate": 0.001, "loss": 1763.7612, "step": 12576 }, { "epoch": 0.2583602799721665, "grad_norm": 752.0, "learning_rate": 0.001, "loss": 1777.0677, "step": 12624 }, { "epoch": 0.25934263845114813, "grad_norm": 712.0, "learning_rate": 0.001, "loss": 1761.1235, "step": 12672 }, { "epoch": 0.2603249969301297, "grad_norm": 696.0, "learning_rate": 0.001, "loss": 1795.2174, "step": 12720 }, { "epoch": 0.2613073554091114, "grad_norm": 816.0, "learning_rate": 0.001, "loss": 1763.8905, "step": 12768 }, { "epoch": 0.262289713888093, "grad_norm": 852.0, "learning_rate": 0.001, "loss": 1761.4404, "step": 12816 }, { "epoch": 0.2632720723670746, "grad_norm": 608.0, "learning_rate": 0.001, "loss": 1762.6668, "step": 12864 }, { "epoch": 0.26425443084605627, "grad_norm": 892.0, "learning_rate": 0.001, "loss": 1725.5638, "step": 12912 }, { "epoch": 0.26523678932503786, "grad_norm": 724.0, "learning_rate": 0.001, "loss": 1762.1764, "step": 12960 }, { "epoch": 0.2662191478040195, "grad_norm": 728.0, "learning_rate": 0.001, "loss": 1764.0163, "step": 13008 }, { "epoch": 0.2672015062830011, "grad_norm": 664.0, "learning_rate": 0.001, "loss": 1754.1729, "step": 13056 }, { "epoch": 0.26818386476198275, "grad_norm": 724.0, "learning_rate": 0.001, "loss": 1772.8592, "step": 13104 }, { "epoch": 0.26916622324096434, "grad_norm": 704.0, "learning_rate": 0.001, "loss": 1780.2349, "step": 13152 }, { "epoch": 0.270148581719946, "grad_norm": 720.0, "learning_rate": 0.001, "loss": 1764.6678, "step": 13200 }, { "epoch": 0.2711309401989276, "grad_norm": 772.0, "learning_rate": 0.001, "loss": 1757.6209, "step": 13248 }, { "epoch": 0.27211329867790923, "grad_norm": 748.0, "learning_rate": 0.001, "loss": 1749.5741, "step": 13296 }, { "epoch": 0.2730956571568908, "grad_norm": 804.0, "learning_rate": 0.001, "loss": 1758.4183, "step": 13344 }, { "epoch": 0.2740780156358725, "grad_norm": 712.0, "learning_rate": 0.001, "loss": 1770.6115, "step": 13392 }, { "epoch": 0.27506037411485407, "grad_norm": 784.0, "learning_rate": 0.001, "loss": 1752.6141, "step": 13440 }, { "epoch": 0.2760427325938357, "grad_norm": 700.0, "learning_rate": 0.001, "loss": 1732.4147, "step": 13488 }, { "epoch": 0.2770250910728173, "grad_norm": 780.0, "learning_rate": 0.001, "loss": 1757.4318, "step": 13536 }, { "epoch": 0.27800744955179896, "grad_norm": 764.0, "learning_rate": 0.001, "loss": 1746.452, "step": 13584 }, { "epoch": 0.27898980803078055, "grad_norm": 696.0, "learning_rate": 0.001, "loss": 1733.8742, "step": 13632 }, { "epoch": 0.2799721665097622, "grad_norm": 824.0, "learning_rate": 0.001, "loss": 1761.4808, "step": 13680 }, { "epoch": 0.2809545249887438, "grad_norm": 664.0, "learning_rate": 0.001, "loss": 1749.8506, "step": 13728 }, { "epoch": 0.28193688346772544, "grad_norm": 776.0, "learning_rate": 0.001, "loss": 1734.1479, "step": 13776 }, { "epoch": 0.28291924194670703, "grad_norm": 708.0, "learning_rate": 0.001, "loss": 1721.951, "step": 13824 }, { "epoch": 0.2839016004256887, "grad_norm": 748.0, "learning_rate": 0.001, "loss": 1741.5046, "step": 13872 }, { "epoch": 0.2848839589046703, "grad_norm": 764.0, "learning_rate": 0.001, "loss": 1743.1763, "step": 13920 }, { "epoch": 0.2858663173836519, "grad_norm": 808.0, "learning_rate": 0.001, "loss": 1767.7448, "step": 13968 }, { "epoch": 0.2868486758626335, "grad_norm": 696.0, "learning_rate": 0.001, "loss": 1717.0291, "step": 14016 }, { "epoch": 0.28783103434161517, "grad_norm": 804.0, "learning_rate": 0.001, "loss": 1731.1566, "step": 14064 }, { "epoch": 0.28881339282059676, "grad_norm": 788.0, "learning_rate": 0.001, "loss": 1720.1551, "step": 14112 }, { "epoch": 0.2897957512995784, "grad_norm": 732.0, "learning_rate": 0.001, "loss": 1743.5467, "step": 14160 }, { "epoch": 0.29077810977856, "grad_norm": 680.0, "learning_rate": 0.001, "loss": 1736.5747, "step": 14208 }, { "epoch": 0.29176046825754165, "grad_norm": 740.0, "learning_rate": 0.001, "loss": 1737.3779, "step": 14256 }, { "epoch": 0.29274282673652324, "grad_norm": 760.0, "learning_rate": 0.001, "loss": 1718.3322, "step": 14304 }, { "epoch": 0.2937251852155049, "grad_norm": 760.0, "learning_rate": 0.001, "loss": 1736.3989, "step": 14352 }, { "epoch": 0.29470754369448654, "grad_norm": 832.0, "learning_rate": 0.001, "loss": 1738.8551, "step": 14400 }, { "epoch": 0.29568990217346813, "grad_norm": 900.0, "learning_rate": 0.001, "loss": 1711.0072, "step": 14448 }, { "epoch": 0.2966722606524498, "grad_norm": 748.0, "learning_rate": 0.001, "loss": 1709.6022, "step": 14496 }, { "epoch": 0.2976546191314314, "grad_norm": 704.0, "learning_rate": 0.001, "loss": 1741.5581, "step": 14544 }, { "epoch": 0.298636977610413, "grad_norm": 684.0, "learning_rate": 0.001, "loss": 1715.0571, "step": 14592 }, { "epoch": 0.2996193360893946, "grad_norm": 728.0, "learning_rate": 0.001, "loss": 1733.5199, "step": 14640 }, { "epoch": 0.29998772051901273, "eval_loss": 1650.6409912109375, "eval_runtime": 9.0148, "eval_samples_per_second": 110.929, "eval_steps_per_second": 1.442, "step": 14658 }, { "epoch": 0.30060169456837627, "grad_norm": 708.0, "learning_rate": 0.001, "loss": 1719.6375, "step": 14688 }, { "epoch": 0.30158405304735786, "grad_norm": 616.0, "learning_rate": 0.001, "loss": 1728.5174, "step": 14736 }, { "epoch": 0.3025664115263395, "grad_norm": 624.0, "learning_rate": 0.001, "loss": 1724.9813, "step": 14784 }, { "epoch": 0.3035487700053211, "grad_norm": 716.0, "learning_rate": 0.001, "loss": 1704.8024, "step": 14832 }, { "epoch": 0.30453112848430275, "grad_norm": 748.0, "learning_rate": 0.001, "loss": 1714.6235, "step": 14880 }, { "epoch": 0.30551348696328434, "grad_norm": 1048.0, "learning_rate": 0.001, "loss": 1734.4709, "step": 14928 }, { "epoch": 0.306495845442266, "grad_norm": 688.0, "learning_rate": 0.001, "loss": 1721.2712, "step": 14976 }, { "epoch": 0.3074782039212476, "grad_norm": 724.0, "learning_rate": 0.001, "loss": 1752.32, "step": 15024 }, { "epoch": 0.30846056240022923, "grad_norm": 792.0, "learning_rate": 0.001, "loss": 1711.9393, "step": 15072 }, { "epoch": 0.3094429208792108, "grad_norm": 904.0, "learning_rate": 0.001, "loss": 1722.3177, "step": 15120 }, { "epoch": 0.3104252793581925, "grad_norm": 768.0, "learning_rate": 0.001, "loss": 1737.7088, "step": 15168 }, { "epoch": 0.31140763783717407, "grad_norm": 768.0, "learning_rate": 0.001, "loss": 1728.0853, "step": 15216 }, { "epoch": 0.3123899963161557, "grad_norm": 776.0, "learning_rate": 0.001, "loss": 1711.749, "step": 15264 }, { "epoch": 0.3133723547951373, "grad_norm": 840.0, "learning_rate": 0.001, "loss": 1717.5446, "step": 15312 }, { "epoch": 0.31435471327411896, "grad_norm": 696.0, "learning_rate": 0.001, "loss": 1718.4888, "step": 15360 }, { "epoch": 0.31533707175310055, "grad_norm": 692.0, "learning_rate": 0.001, "loss": 1722.6672, "step": 15408 }, { "epoch": 0.3163194302320822, "grad_norm": 864.0, "learning_rate": 0.001, "loss": 1715.826, "step": 15456 }, { "epoch": 0.3173017887110638, "grad_norm": 712.0, "learning_rate": 0.001, "loss": 1714.7765, "step": 15504 }, { "epoch": 0.31828414719004544, "grad_norm": 712.0, "learning_rate": 0.001, "loss": 1718.0269, "step": 15552 }, { "epoch": 0.31926650566902703, "grad_norm": 748.0, "learning_rate": 0.001, "loss": 1691.7855, "step": 15600 }, { "epoch": 0.3202488641480087, "grad_norm": 888.0, "learning_rate": 0.001, "loss": 1717.6808, "step": 15648 }, { "epoch": 0.3212312226269903, "grad_norm": 680.0, "learning_rate": 0.001, "loss": 1719.3472, "step": 15696 }, { "epoch": 0.3222135811059719, "grad_norm": 868.0, "learning_rate": 0.001, "loss": 1691.0158, "step": 15744 }, { "epoch": 0.3231959395849535, "grad_norm": 756.0, "learning_rate": 0.001, "loss": 1704.2186, "step": 15792 }, { "epoch": 0.32417829806393517, "grad_norm": 888.0, "learning_rate": 0.001, "loss": 1723.0382, "step": 15840 }, { "epoch": 0.32516065654291676, "grad_norm": 912.0, "learning_rate": 0.001, "loss": 1702.2889, "step": 15888 }, { "epoch": 0.3261430150218984, "grad_norm": 820.0, "learning_rate": 0.001, "loss": 1728.0734, "step": 15936 }, { "epoch": 0.32712537350088006, "grad_norm": 788.0, "learning_rate": 0.001, "loss": 1720.2152, "step": 15984 }, { "epoch": 0.32810773197986165, "grad_norm": 808.0, "learning_rate": 0.001, "loss": 1702.1133, "step": 16032 }, { "epoch": 0.3290900904588433, "grad_norm": 836.0, "learning_rate": 0.001, "loss": 1720.4746, "step": 16080 }, { "epoch": 0.3300724489378249, "grad_norm": 836.0, "learning_rate": 0.001, "loss": 1689.6606, "step": 16128 }, { "epoch": 0.33105480741680654, "grad_norm": 728.0, "learning_rate": 0.001, "loss": 1689.0417, "step": 16176 }, { "epoch": 0.33203716589578813, "grad_norm": 848.0, "learning_rate": 0.001, "loss": 1703.012, "step": 16224 }, { "epoch": 0.3330195243747698, "grad_norm": 756.0, "learning_rate": 0.001, "loss": 1700.2785, "step": 16272 }, { "epoch": 0.3340018828537514, "grad_norm": 756.0, "learning_rate": 0.001, "loss": 1709.3231, "step": 16320 }, { "epoch": 0.334984241332733, "grad_norm": 960.0, "learning_rate": 0.001, "loss": 1715.8831, "step": 16368 }, { "epoch": 0.3359665998117146, "grad_norm": 692.0, "learning_rate": 0.001, "loss": 1695.813, "step": 16416 }, { "epoch": 0.33694895829069627, "grad_norm": 688.0, "learning_rate": 0.001, "loss": 1685.9803, "step": 16464 }, { "epoch": 0.33793131676967786, "grad_norm": 876.0, "learning_rate": 0.001, "loss": 1704.5868, "step": 16512 }, { "epoch": 0.3389136752486595, "grad_norm": 732.0, "learning_rate": 0.001, "loss": 1681.2751, "step": 16560 }, { "epoch": 0.3398960337276411, "grad_norm": 748.0, "learning_rate": 0.001, "loss": 1690.0252, "step": 16608 }, { "epoch": 0.34087839220662275, "grad_norm": 780.0, "learning_rate": 0.001, "loss": 1698.1932, "step": 16656 }, { "epoch": 0.34186075068560434, "grad_norm": 856.0, "learning_rate": 0.001, "loss": 1692.6128, "step": 16704 }, { "epoch": 0.342843109164586, "grad_norm": 880.0, "learning_rate": 0.001, "loss": 1696.6901, "step": 16752 }, { "epoch": 0.3438254676435676, "grad_norm": 688.0, "learning_rate": 0.001, "loss": 1693.9344, "step": 16800 }, { "epoch": 0.34480782612254923, "grad_norm": 688.0, "learning_rate": 0.001, "loss": 1704.4855, "step": 16848 }, { "epoch": 0.3457901846015308, "grad_norm": 692.0, "learning_rate": 0.001, "loss": 1705.7817, "step": 16896 }, { "epoch": 0.3467725430805125, "grad_norm": 760.0, "learning_rate": 0.001, "loss": 1690.8944, "step": 16944 }, { "epoch": 0.34775490155949407, "grad_norm": 868.0, "learning_rate": 0.001, "loss": 1685.9479, "step": 16992 }, { "epoch": 0.3487372600384757, "grad_norm": 744.0, "learning_rate": 0.001, "loss": 1698.2961, "step": 17040 }, { "epoch": 0.3497196185174573, "grad_norm": 688.0, "learning_rate": 0.001, "loss": 1693.7596, "step": 17088 }, { "epoch": 0.35070197699643896, "grad_norm": 760.0, "learning_rate": 0.001, "loss": 1702.9092, "step": 17136 }, { "epoch": 0.35168433547542055, "grad_norm": 788.0, "learning_rate": 0.001, "loss": 1672.0039, "step": 17184 }, { "epoch": 0.3526666939544022, "grad_norm": 772.0, "learning_rate": 0.001, "loss": 1673.5811, "step": 17232 }, { "epoch": 0.3536490524333838, "grad_norm": 976.0, "learning_rate": 0.001, "loss": 1697.0251, "step": 17280 }, { "epoch": 0.35463141091236544, "grad_norm": 740.0, "learning_rate": 0.001, "loss": 1699.7508, "step": 17328 }, { "epoch": 0.35561376939134703, "grad_norm": 704.0, "learning_rate": 0.001, "loss": 1708.7798, "step": 17376 }, { "epoch": 0.3565961278703287, "grad_norm": 716.0, "learning_rate": 0.001, "loss": 1693.451, "step": 17424 }, { "epoch": 0.3575784863493103, "grad_norm": 904.0, "learning_rate": 0.001, "loss": 1676.6382, "step": 17472 }, { "epoch": 0.3585608448282919, "grad_norm": 700.0, "learning_rate": 0.001, "loss": 1691.7266, "step": 17520 }, { "epoch": 0.3595432033072736, "grad_norm": 1012.0, "learning_rate": 0.001, "loss": 1666.9458, "step": 17568 }, { "epoch": 0.36052556178625517, "grad_norm": 804.0, "learning_rate": 0.001, "loss": 1675.5584, "step": 17616 }, { "epoch": 0.3615079202652368, "grad_norm": 856.0, "learning_rate": 0.001, "loss": 1685.9595, "step": 17664 }, { "epoch": 0.3624902787442184, "grad_norm": 836.0, "learning_rate": 0.001, "loss": 1695.4741, "step": 17712 }, { "epoch": 0.36347263722320006, "grad_norm": 696.0, "learning_rate": 0.001, "loss": 1689.256, "step": 17760 }, { "epoch": 0.36445499570218165, "grad_norm": 732.0, "learning_rate": 0.001, "loss": 1689.8599, "step": 17808 }, { "epoch": 0.3654373541811633, "grad_norm": 876.0, "learning_rate": 0.001, "loss": 1668.894, "step": 17856 }, { "epoch": 0.3664197126601449, "grad_norm": 780.0, "learning_rate": 0.001, "loss": 1684.5539, "step": 17904 }, { "epoch": 0.36740207113912654, "grad_norm": 952.0, "learning_rate": 0.001, "loss": 1683.5539, "step": 17952 }, { "epoch": 0.36838442961810813, "grad_norm": 860.0, "learning_rate": 0.001, "loss": 1695.5282, "step": 18000 }, { "epoch": 0.3693667880970898, "grad_norm": 844.0, "learning_rate": 0.001, "loss": 1667.8563, "step": 18048 }, { "epoch": 0.3703491465760714, "grad_norm": 800.0, "learning_rate": 0.001, "loss": 1671.2471, "step": 18096 }, { "epoch": 0.371331505055053, "grad_norm": 760.0, "learning_rate": 0.001, "loss": 1664.4082, "step": 18144 }, { "epoch": 0.3723138635340346, "grad_norm": 1120.0, "learning_rate": 0.001, "loss": 1666.0448, "step": 18192 }, { "epoch": 0.37329622201301627, "grad_norm": 824.0, "learning_rate": 0.001, "loss": 1665.9009, "step": 18240 }, { "epoch": 0.37427858049199786, "grad_norm": 872.0, "learning_rate": 0.001, "loss": 1663.6131, "step": 18288 }, { "epoch": 0.3752609389709795, "grad_norm": 804.0, "learning_rate": 0.001, "loss": 1665.7214, "step": 18336 }, { "epoch": 0.3762432974499611, "grad_norm": 768.0, "learning_rate": 0.001, "loss": 1663.0591, "step": 18384 }, { "epoch": 0.37722565592894275, "grad_norm": 988.0, "learning_rate": 0.001, "loss": 1683.5985, "step": 18432 }, { "epoch": 0.37820801440792434, "grad_norm": 804.0, "learning_rate": 0.001, "loss": 1661.8081, "step": 18480 }, { "epoch": 0.379190372886906, "grad_norm": 776.0, "learning_rate": 0.001, "loss": 1685.9769, "step": 18528 }, { "epoch": 0.3801727313658876, "grad_norm": 920.0, "learning_rate": 0.001, "loss": 1676.7816, "step": 18576 }, { "epoch": 0.38115508984486923, "grad_norm": 800.0, "learning_rate": 0.001, "loss": 1669.9821, "step": 18624 }, { "epoch": 0.3821374483238508, "grad_norm": 828.0, "learning_rate": 0.001, "loss": 1654.353, "step": 18672 }, { "epoch": 0.3831198068028325, "grad_norm": 756.0, "learning_rate": 0.001, "loss": 1641.133, "step": 18720 }, { "epoch": 0.38410216528181407, "grad_norm": 772.0, "learning_rate": 0.001, "loss": 1654.8257, "step": 18768 }, { "epoch": 0.3850845237607957, "grad_norm": 856.0, "learning_rate": 0.001, "loss": 1674.0685, "step": 18816 }, { "epoch": 0.3860668822397773, "grad_norm": 980.0, "learning_rate": 0.001, "loss": 1663.0804, "step": 18864 }, { "epoch": 0.38704924071875896, "grad_norm": 852.0, "learning_rate": 0.001, "loss": 1676.6375, "step": 18912 }, { "epoch": 0.38803159919774055, "grad_norm": 788.0, "learning_rate": 0.001, "loss": 1681.3968, "step": 18960 }, { "epoch": 0.3890139576767222, "grad_norm": 740.0, "learning_rate": 0.001, "loss": 1658.5428, "step": 19008 }, { "epoch": 0.3899963161557038, "grad_norm": 772.0, "learning_rate": 0.001, "loss": 1693.6683, "step": 19056 }, { "epoch": 0.39097867463468544, "grad_norm": 1224.0, "learning_rate": 0.001, "loss": 1637.9217, "step": 19104 }, { "epoch": 0.3919610331136671, "grad_norm": 896.0, "learning_rate": 0.001, "loss": 1680.59, "step": 19152 }, { "epoch": 0.3929433915926487, "grad_norm": 716.0, "learning_rate": 0.001, "loss": 1663.9777, "step": 19200 }, { "epoch": 0.39392575007163033, "grad_norm": 940.0, "learning_rate": 0.001, "loss": 1672.4491, "step": 19248 }, { "epoch": 0.3949081085506119, "grad_norm": 696.0, "learning_rate": 0.001, "loss": 1668.8294, "step": 19296 }, { "epoch": 0.3958904670295936, "grad_norm": 740.0, "learning_rate": 0.001, "loss": 1652.0028, "step": 19344 }, { "epoch": 0.39687282550857517, "grad_norm": 900.0, "learning_rate": 0.001, "loss": 1670.4543, "step": 19392 }, { "epoch": 0.3978551839875568, "grad_norm": 832.0, "learning_rate": 0.001, "loss": 1674.2799, "step": 19440 }, { "epoch": 0.3988375424665384, "grad_norm": 896.0, "learning_rate": 0.001, "loss": 1637.7557, "step": 19488 }, { "epoch": 0.39981990094552006, "grad_norm": 772.0, "learning_rate": 0.001, "loss": 1659.3999, "step": 19536 }, { "epoch": 0.39998362735868365, "eval_loss": 1588.186767578125, "eval_runtime": 9.0185, "eval_samples_per_second": 110.884, "eval_steps_per_second": 1.441, "step": 19544 }, { "epoch": 0.40080225942450165, "grad_norm": 812.0, "learning_rate": 0.001, "loss": 1663.3135, "step": 19584 }, { "epoch": 0.4017846179034833, "grad_norm": 820.0, "learning_rate": 0.001, "loss": 1648.4126, "step": 19632 }, { "epoch": 0.4027669763824649, "grad_norm": 828.0, "learning_rate": 0.001, "loss": 1629.137, "step": 19680 }, { "epoch": 0.40374933486144654, "grad_norm": 840.0, "learning_rate": 0.001, "loss": 1675.1574, "step": 19728 }, { "epoch": 0.40473169334042813, "grad_norm": 892.0, "learning_rate": 0.001, "loss": 1651.4735, "step": 19776 }, { "epoch": 0.4057140518194098, "grad_norm": 732.0, "learning_rate": 0.001, "loss": 1659.3854, "step": 19824 }, { "epoch": 0.4066964102983914, "grad_norm": 808.0, "learning_rate": 0.001, "loss": 1676.7249, "step": 19872 }, { "epoch": 0.407678768777373, "grad_norm": 1088.0, "learning_rate": 0.001, "loss": 1658.8581, "step": 19920 }, { "epoch": 0.4086611272563546, "grad_norm": 772.0, "learning_rate": 0.001, "loss": 1663.8711, "step": 19968 }, { "epoch": 0.40964348573533627, "grad_norm": 724.0, "learning_rate": 0.001, "loss": 1657.056, "step": 20016 }, { "epoch": 0.41062584421431786, "grad_norm": 920.0, "learning_rate": 0.001, "loss": 1637.993, "step": 20064 }, { "epoch": 0.4116082026932995, "grad_norm": 636.0, "learning_rate": 0.001, "loss": 1644.8833, "step": 20112 }, { "epoch": 0.4125905611722811, "grad_norm": 956.0, "learning_rate": 0.001, "loss": 1656.4339, "step": 20160 }, { "epoch": 0.41357291965126275, "grad_norm": 792.0, "learning_rate": 0.001, "loss": 1652.4888, "step": 20208 }, { "epoch": 0.41455527813024434, "grad_norm": 832.0, "learning_rate": 0.001, "loss": 1642.5142, "step": 20256 }, { "epoch": 0.415537636609226, "grad_norm": 724.0, "learning_rate": 0.001, "loss": 1668.1183, "step": 20304 }, { "epoch": 0.4165199950882076, "grad_norm": 796.0, "learning_rate": 0.001, "loss": 1660.486, "step": 20352 }, { "epoch": 0.41750235356718923, "grad_norm": 788.0, "learning_rate": 0.001, "loss": 1655.4614, "step": 20400 }, { "epoch": 0.4184847120461708, "grad_norm": 844.0, "learning_rate": 0.001, "loss": 1650.6449, "step": 20448 }, { "epoch": 0.4194670705251525, "grad_norm": 1080.0, "learning_rate": 0.001, "loss": 1639.0168, "step": 20496 }, { "epoch": 0.42044942900413407, "grad_norm": 944.0, "learning_rate": 0.001, "loss": 1655.6951, "step": 20544 }, { "epoch": 0.4214317874831157, "grad_norm": 716.0, "learning_rate": 0.001, "loss": 1644.007, "step": 20592 }, { "epoch": 0.4224141459620973, "grad_norm": 896.0, "learning_rate": 0.001, "loss": 1614.1253, "step": 20640 }, { "epoch": 0.42339650444107896, "grad_norm": 736.0, "learning_rate": 0.001, "loss": 1637.6761, "step": 20688 }, { "epoch": 0.42437886292006055, "grad_norm": 752.0, "learning_rate": 0.001, "loss": 1635.8405, "step": 20736 }, { "epoch": 0.4253612213990422, "grad_norm": 768.0, "learning_rate": 0.001, "loss": 1653.1647, "step": 20784 }, { "epoch": 0.42634357987802385, "grad_norm": 832.0, "learning_rate": 0.001, "loss": 1657.5373, "step": 20832 }, { "epoch": 0.42732593835700544, "grad_norm": 832.0, "learning_rate": 0.001, "loss": 1649.7998, "step": 20880 }, { "epoch": 0.4283082968359871, "grad_norm": 848.0, "learning_rate": 0.001, "loss": 1639.043, "step": 20928 }, { "epoch": 0.4292906553149687, "grad_norm": 716.0, "learning_rate": 0.001, "loss": 1656.0706, "step": 20976 }, { "epoch": 0.43027301379395033, "grad_norm": 924.0, "learning_rate": 0.001, "loss": 1640.4744, "step": 21024 }, { "epoch": 0.4312553722729319, "grad_norm": 832.0, "learning_rate": 0.001, "loss": 1640.8403, "step": 21072 }, { "epoch": 0.4322377307519136, "grad_norm": 828.0, "learning_rate": 0.001, "loss": 1638.4172, "step": 21120 }, { "epoch": 0.43322008923089517, "grad_norm": 732.0, "learning_rate": 0.001, "loss": 1662.0506, "step": 21168 }, { "epoch": 0.4342024477098768, "grad_norm": 752.0, "learning_rate": 0.001, "loss": 1634.0928, "step": 21216 }, { "epoch": 0.4351848061888584, "grad_norm": 976.0, "learning_rate": 0.001, "loss": 1650.1844, "step": 21264 }, { "epoch": 0.43616716466784006, "grad_norm": 864.0, "learning_rate": 0.001, "loss": 1637.9504, "step": 21312 }, { "epoch": 0.43714952314682165, "grad_norm": 660.0, "learning_rate": 0.001, "loss": 1651.0807, "step": 21360 }, { "epoch": 0.4381318816258033, "grad_norm": 884.0, "learning_rate": 0.001, "loss": 1639.6099, "step": 21408 }, { "epoch": 0.4391142401047849, "grad_norm": 752.0, "learning_rate": 0.001, "loss": 1636.7961, "step": 21456 }, { "epoch": 0.44009659858376654, "grad_norm": 704.0, "learning_rate": 0.001, "loss": 1630.2891, "step": 21504 }, { "epoch": 0.44107895706274813, "grad_norm": 868.0, "learning_rate": 0.001, "loss": 1651.1029, "step": 21552 }, { "epoch": 0.4420613155417298, "grad_norm": 1200.0, "learning_rate": 0.001, "loss": 1658.1079, "step": 21600 }, { "epoch": 0.4430436740207114, "grad_norm": 820.0, "learning_rate": 0.001, "loss": 1650.035, "step": 21648 }, { "epoch": 0.444026032499693, "grad_norm": 628.0, "learning_rate": 0.001, "loss": 1651.446, "step": 21696 }, { "epoch": 0.4450083909786746, "grad_norm": 936.0, "learning_rate": 0.001, "loss": 1651.8545, "step": 21744 }, { "epoch": 0.44599074945765627, "grad_norm": 724.0, "learning_rate": 0.001, "loss": 1633.7931, "step": 21792 }, { "epoch": 0.44697310793663786, "grad_norm": 868.0, "learning_rate": 0.001, "loss": 1643.6271, "step": 21840 }, { "epoch": 0.4479554664156195, "grad_norm": 884.0, "learning_rate": 0.001, "loss": 1628.9087, "step": 21888 }, { "epoch": 0.4489378248946011, "grad_norm": 808.0, "learning_rate": 0.001, "loss": 1633.3311, "step": 21936 }, { "epoch": 0.44992018337358275, "grad_norm": 776.0, "learning_rate": 0.001, "loss": 1636.3483, "step": 21984 }, { "epoch": 0.45090254185256434, "grad_norm": 740.0, "learning_rate": 0.001, "loss": 1627.1842, "step": 22032 }, { "epoch": 0.451884900331546, "grad_norm": 832.0, "learning_rate": 0.001, "loss": 1632.6536, "step": 22080 }, { "epoch": 0.4528672588105276, "grad_norm": 920.0, "learning_rate": 0.001, "loss": 1663.3418, "step": 22128 }, { "epoch": 0.45384961728950923, "grad_norm": 752.0, "learning_rate": 0.001, "loss": 1635.7738, "step": 22176 }, { "epoch": 0.4548319757684908, "grad_norm": 752.0, "learning_rate": 0.001, "loss": 1636.0459, "step": 22224 }, { "epoch": 0.4558143342474725, "grad_norm": 892.0, "learning_rate": 0.001, "loss": 1627.4956, "step": 22272 }, { "epoch": 0.45679669272645407, "grad_norm": 860.0, "learning_rate": 0.001, "loss": 1628.1626, "step": 22320 }, { "epoch": 0.4577790512054357, "grad_norm": 776.0, "learning_rate": 0.001, "loss": 1628.0701, "step": 22368 }, { "epoch": 0.45876140968441737, "grad_norm": 792.0, "learning_rate": 0.001, "loss": 1644.1922, "step": 22416 }, { "epoch": 0.45974376816339896, "grad_norm": 776.0, "learning_rate": 0.001, "loss": 1608.5988, "step": 22464 }, { "epoch": 0.4607261266423806, "grad_norm": 860.0, "learning_rate": 0.001, "loss": 1637.166, "step": 22512 }, { "epoch": 0.4617084851213622, "grad_norm": 924.0, "learning_rate": 0.001, "loss": 1626.6854, "step": 22560 }, { "epoch": 0.46269084360034385, "grad_norm": 876.0, "learning_rate": 0.001, "loss": 1639.1245, "step": 22608 }, { "epoch": 0.46367320207932544, "grad_norm": 936.0, "learning_rate": 0.001, "loss": 1634.3815, "step": 22656 }, { "epoch": 0.4646555605583071, "grad_norm": 912.0, "learning_rate": 0.001, "loss": 1606.1912, "step": 22704 }, { "epoch": 0.4656379190372887, "grad_norm": 952.0, "learning_rate": 0.001, "loss": 1620.5391, "step": 22752 }, { "epoch": 0.46662027751627033, "grad_norm": 960.0, "learning_rate": 0.001, "loss": 1612.0667, "step": 22800 }, { "epoch": 0.4676026359952519, "grad_norm": 832.0, "learning_rate": 0.001, "loss": 1651.9868, "step": 22848 }, { "epoch": 0.4685849944742336, "grad_norm": 712.0, "learning_rate": 0.001, "loss": 1629.237, "step": 22896 }, { "epoch": 0.46956735295321517, "grad_norm": 864.0, "learning_rate": 0.001, "loss": 1618.2004, "step": 22944 }, { "epoch": 0.4705497114321968, "grad_norm": 728.0, "learning_rate": 0.001, "loss": 1625.5379, "step": 22992 }, { "epoch": 0.4715320699111784, "grad_norm": 836.0, "learning_rate": 0.001, "loss": 1622.8146, "step": 23040 }, { "epoch": 0.47251442839016006, "grad_norm": 1064.0, "learning_rate": 0.001, "loss": 1623.9705, "step": 23088 }, { "epoch": 0.47349678686914165, "grad_norm": 860.0, "learning_rate": 0.001, "loss": 1626.2383, "step": 23136 }, { "epoch": 0.4744791453481233, "grad_norm": 1120.0, "learning_rate": 0.001, "loss": 1634.2668, "step": 23184 }, { "epoch": 0.4754615038271049, "grad_norm": 796.0, "learning_rate": 0.001, "loss": 1642.5649, "step": 23232 }, { "epoch": 0.47644386230608654, "grad_norm": 1072.0, "learning_rate": 0.001, "loss": 1633.4873, "step": 23280 }, { "epoch": 0.47742622078506813, "grad_norm": 904.0, "learning_rate": 0.001, "loss": 1604.186, "step": 23328 }, { "epoch": 0.4784085792640498, "grad_norm": 852.0, "learning_rate": 0.001, "loss": 1608.5158, "step": 23376 }, { "epoch": 0.4793909377430314, "grad_norm": 860.0, "learning_rate": 0.001, "loss": 1624.3991, "step": 23424 }, { "epoch": 0.480373296222013, "grad_norm": 820.0, "learning_rate": 0.001, "loss": 1640.4948, "step": 23472 }, { "epoch": 0.4813556547009946, "grad_norm": 700.0, "learning_rate": 0.001, "loss": 1593.3931, "step": 23520 }, { "epoch": 0.48233801317997627, "grad_norm": 824.0, "learning_rate": 0.001, "loss": 1622.2378, "step": 23568 }, { "epoch": 0.48332037165895786, "grad_norm": 936.0, "learning_rate": 0.001, "loss": 1612.6034, "step": 23616 }, { "epoch": 0.4843027301379395, "grad_norm": 732.0, "learning_rate": 0.001, "loss": 1624.4165, "step": 23664 }, { "epoch": 0.4852850886169211, "grad_norm": 820.0, "learning_rate": 0.001, "loss": 1634.2424, "step": 23712 }, { "epoch": 0.48626744709590275, "grad_norm": 896.0, "learning_rate": 0.001, "loss": 1616.617, "step": 23760 }, { "epoch": 0.48724980557488434, "grad_norm": 732.0, "learning_rate": 0.001, "loss": 1629.8065, "step": 23808 }, { "epoch": 0.488232164053866, "grad_norm": 876.0, "learning_rate": 0.001, "loss": 1611.0832, "step": 23856 }, { "epoch": 0.4892145225328476, "grad_norm": 1004.0, "learning_rate": 0.001, "loss": 1596.9705, "step": 23904 }, { "epoch": 0.49019688101182923, "grad_norm": 812.0, "learning_rate": 0.001, "loss": 1612.6437, "step": 23952 }, { "epoch": 0.4911792394908109, "grad_norm": 764.0, "learning_rate": 0.001, "loss": 1612.0643, "step": 24000 }, { "epoch": 0.4921615979697925, "grad_norm": 944.0, "learning_rate": 0.001, "loss": 1620.1268, "step": 24048 }, { "epoch": 0.4931439564487741, "grad_norm": 920.0, "learning_rate": 0.001, "loss": 1619.8875, "step": 24096 }, { "epoch": 0.4941263149277557, "grad_norm": 808.0, "learning_rate": 0.001, "loss": 1608.6463, "step": 24144 }, { "epoch": 0.49510867340673737, "grad_norm": 784.0, "learning_rate": 0.001, "loss": 1609.1462, "step": 24192 }, { "epoch": 0.49609103188571896, "grad_norm": 796.0, "learning_rate": 0.001, "loss": 1610.4935, "step": 24240 }, { "epoch": 0.4970733903647006, "grad_norm": 896.0, "learning_rate": 0.001, "loss": 1622.8371, "step": 24288 }, { "epoch": 0.4980557488436822, "grad_norm": 768.0, "learning_rate": 0.001, "loss": 1617.0732, "step": 24336 }, { "epoch": 0.49903810732266385, "grad_norm": 920.0, "learning_rate": 0.001, "loss": 1615.2142, "step": 24384 }, { "epoch": 0.49997953419835456, "eval_loss": 1546.7662353515625, "eval_runtime": 9.0072, "eval_samples_per_second": 111.022, "eval_steps_per_second": 1.443, "step": 24430 } ], "logging_steps": 48, "max_steps": 48862, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 4886, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.7859053283033743e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null }