{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.7007369822924607, "eval_steps": 500, "global_step": 1520000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "learning_rate": 4.9999270159623114e-05, "loss": 0.1271, "step": 5000 }, { "epoch": 0.02, "learning_rate": 4.999707892753202e-05, "loss": 0.0859, "step": 10000 }, { "epoch": 0.04, "learning_rate": 4.9993426431557147e-05, "loss": 0.0768, "step": 15000 }, { "epoch": 0.05, "learning_rate": 4.9988311715921104e-05, "loss": 0.0727, "step": 20000 }, { "epoch": 0.06, "learning_rate": 4.9981738587426694e-05, "loss": 0.0693, "step": 25000 }, { "epoch": 0.07, "learning_rate": 4.997370216884145e-05, "loss": 0.0677, "step": 30000 }, { "epoch": 0.09, "learning_rate": 4.9964207314440955e-05, "loss": 0.0659, "step": 35000 }, { "epoch": 0.1, "learning_rate": 4.995325311774524e-05, "loss": 0.0657, "step": 40000 }, { "epoch": 0.11, "learning_rate": 4.994084284853358e-05, "loss": 0.0638, "step": 45000 }, { "epoch": 0.12, "learning_rate": 4.992697226521413e-05, "loss": 0.0623, "step": 50000 }, { "epoch": 0.13, "learning_rate": 4.991164130374091e-05, "loss": 0.0615, "step": 55000 }, { "epoch": 0.15, "learning_rate": 4.9894856993903285e-05, "loss": 0.0608, "step": 60000 }, { "epoch": 0.16, "learning_rate": 4.987661739580889e-05, "loss": 0.0604, "step": 65000 }, { "epoch": 0.17, "learning_rate": 4.985691948992265e-05, "loss": 0.0599, "step": 70000 }, { "epoch": 0.18, "learning_rate": 4.983577230843278e-05, "loss": 0.0591, "step": 75000 }, { "epoch": 0.19, "learning_rate": 4.981317329262237e-05, "loss": 0.0586, "step": 80000 }, { "epoch": 0.21, "learning_rate": 4.9789118806694247e-05, "loss": 0.0587, "step": 85000 }, { "epoch": 0.22, "learning_rate": 4.976361988048301e-05, "loss": 0.0576, "step": 90000 }, { "epoch": 0.23, "learning_rate": 4.9736673337793535e-05, "loss": 0.0576, "step": 95000 }, { "epoch": 0.24, "learning_rate": 4.970828075385029e-05, "loss": 0.0566, "step": 100000 }, { "epoch": 0.26, "learning_rate": 4.967844378840947e-05, "loss": 0.0567, "step": 105000 }, { "epoch": 0.27, "learning_rate": 4.964715778300843e-05, "loss": 0.0567, "step": 110000 }, { "epoch": 0.28, "learning_rate": 4.961443708338981e-05, "loss": 0.0568, "step": 115000 }, { "epoch": 0.29, "learning_rate": 4.9580277488129266e-05, "loss": 0.057, "step": 120000 }, { "epoch": 0.3, "learning_rate": 4.954467372832107e-05, "loss": 0.0551, "step": 125000 }, { "epoch": 0.32, "learning_rate": 4.9507634576843017e-05, "loss": 0.0549, "step": 130000 }, { "epoch": 0.33, "learning_rate": 4.9469185717167566e-05, "loss": 0.0563, "step": 135000 }, { "epoch": 0.34, "learning_rate": 4.942928322246712e-05, "loss": 0.0546, "step": 140000 }, { "epoch": 0.35, "learning_rate": 4.938796049480949e-05, "loss": 0.0538, "step": 145000 }, { "epoch": 0.37, "learning_rate": 4.934520341515526e-05, "loss": 0.0548, "step": 150000 }, { "epoch": 0.38, "learning_rate": 4.930104056921508e-05, "loss": 0.0541, "step": 155000 }, { "epoch": 0.39, "learning_rate": 4.925543918835136e-05, "loss": 0.0543, "step": 160000 }, { "epoch": 0.4, "learning_rate": 4.920843842545973e-05, "loss": 0.0538, "step": 165000 }, { "epoch": 0.41, "learning_rate": 4.9160003406180696e-05, "loss": 0.0539, "step": 170000 }, { "epoch": 0.43, "learning_rate": 4.9110165603726345e-05, "loss": 0.0536, "step": 175000 }, { "epoch": 0.44, "learning_rate": 4.9058918384022446e-05, "loss": 0.0537, "step": 180000 }, { "epoch": 0.45, "learning_rate": 4.900625406738267e-05, "loss": 0.0535, "step": 185000 }, { "epoch": 0.46, "learning_rate": 4.895219680226333e-05, "loss": 0.0527, "step": 190000 }, { "epoch": 0.47, "learning_rate": 4.889673935433758e-05, "loss": 0.0538, "step": 195000 }, { "epoch": 0.49, "learning_rate": 4.8839896480432604e-05, "loss": 0.0531, "step": 200000 }, { "epoch": 0.5, "learning_rate": 4.878163695931812e-05, "loss": 0.0527, "step": 205000 }, { "epoch": 0.51, "learning_rate": 4.872198666951938e-05, "loss": 0.0531, "step": 210000 }, { "epoch": 0.52, "learning_rate": 4.8660961447879297e-05, "loss": 0.0524, "step": 215000 }, { "epoch": 0.54, "learning_rate": 4.859854044346287e-05, "loss": 0.0527, "step": 220000 }, { "epoch": 0.55, "learning_rate": 4.853475227851366e-05, "loss": 0.0523, "step": 225000 }, { "epoch": 0.56, "learning_rate": 4.8469588333473586e-05, "loss": 0.0524, "step": 230000 }, { "epoch": 0.57, "learning_rate": 4.8403052417656516e-05, "loss": 0.0521, "step": 235000 }, { "epoch": 0.58, "learning_rate": 4.833514842057813e-05, "loss": 0.0518, "step": 240000 }, { "epoch": 0.6, "learning_rate": 4.826586631633889e-05, "loss": 0.052, "step": 245000 }, { "epoch": 0.61, "learning_rate": 4.8195237873239866e-05, "loss": 0.0525, "step": 250000 }, { "epoch": 0.62, "learning_rate": 4.8123253497174505e-05, "loss": 0.0514, "step": 255000 }, { "epoch": 0.63, "learning_rate": 4.804991739616124e-05, "loss": 0.0515, "step": 260000 }, { "epoch": 0.65, "learning_rate": 4.7975233857236826e-05, "loss": 0.0524, "step": 265000 }, { "epoch": 0.66, "learning_rate": 4.789920724620569e-05, "loss": 0.0506, "step": 270000 }, { "epoch": 0.67, "learning_rate": 4.7821857620213536e-05, "loss": 0.0513, "step": 275000 }, { "epoch": 0.68, "learning_rate": 4.774314266334363e-05, "loss": 0.0513, "step": 280000 }, { "epoch": 0.69, "learning_rate": 4.7663097669746146e-05, "loss": 0.0512, "step": 285000 }, { "epoch": 0.71, "learning_rate": 4.75817437300597e-05, "loss": 0.0508, "step": 290000 }, { "epoch": 0.72, "learning_rate": 4.749905304748714e-05, "loss": 0.0515, "step": 295000 }, { "epoch": 0.73, "learning_rate": 4.741504660273818e-05, "loss": 0.0504, "step": 300000 }, { "epoch": 0.74, "learning_rate": 4.732974650620106e-05, "loss": 0.0504, "step": 305000 }, { "epoch": 0.75, "learning_rate": 4.724315852930218e-05, "loss": 0.0506, "step": 310000 }, { "epoch": 0.77, "learning_rate": 4.715523536004309e-05, "loss": 0.0496, "step": 315000 }, { "epoch": 0.78, "learning_rate": 4.706605248838939e-05, "loss": 0.0503, "step": 320000 }, { "epoch": 0.79, "learning_rate": 4.697554374104222e-05, "loss": 0.0504, "step": 325000 }, { "epoch": 0.8, "learning_rate": 4.6883749846751347e-05, "loss": 0.05, "step": 330000 }, { "epoch": 0.82, "learning_rate": 4.679069491976989e-05, "loss": 0.0497, "step": 335000 }, { "epoch": 0.83, "learning_rate": 4.6696347165338586e-05, "loss": 0.0503, "step": 340000 }, { "epoch": 0.84, "learning_rate": 4.660074984524773e-05, "loss": 0.0503, "step": 345000 }, { "epoch": 0.85, "learning_rate": 4.650388980179216e-05, "loss": 0.0493, "step": 350000 }, { "epoch": 0.86, "learning_rate": 4.6405752940483196e-05, "loss": 0.0503, "step": 355000 }, { "epoch": 0.88, "learning_rate": 4.6306384260551005e-05, "loss": 0.0502, "step": 360000 }, { "epoch": 0.89, "learning_rate": 4.620577006509662e-05, "loss": 0.0493, "step": 365000 }, { "epoch": 0.9, "learning_rate": 4.6103916235762854e-05, "loss": 0.05, "step": 370000 }, { "epoch": 0.91, "learning_rate": 4.600082872665831e-05, "loss": 0.0492, "step": 375000 }, { "epoch": 0.93, "learning_rate": 4.589651356400925e-05, "loss": 0.0499, "step": 380000 }, { "epoch": 0.94, "learning_rate": 4.5790976845807375e-05, "loss": 0.0492, "step": 385000 }, { "epoch": 0.95, "learning_rate": 4.568422474145333e-05, "loss": 0.0496, "step": 390000 }, { "epoch": 0.96, "learning_rate": 4.5576241769938385e-05, "loss": 0.0496, "step": 395000 }, { "epoch": 0.97, "learning_rate": 4.5467099406767963e-05, "loss": 0.0491, "step": 400000 }, { "epoch": 0.99, "learning_rate": 4.535671666878825e-05, "loss": 0.0492, "step": 405000 }, { "epoch": 1.0, "learning_rate": 4.52451883295289e-05, "loss": 0.049, "step": 410000 }, { "epoch": 1.01, "learning_rate": 4.513245430925934e-05, "loss": 0.048, "step": 415000 }, { "epoch": 1.02, "learning_rate": 4.5018543398343515e-05, "loss": 0.0472, "step": 420000 }, { "epoch": 1.03, "learning_rate": 4.4903415971499975e-05, "loss": 0.0463, "step": 425000 }, { "epoch": 1.05, "learning_rate": 4.47871708602701e-05, "loss": 0.047, "step": 430000 }, { "epoch": 1.06, "learning_rate": 4.46697926477584e-05, "loss": 0.0472, "step": 435000 }, { "epoch": 1.07, "learning_rate": 4.455119354756587e-05, "loss": 0.0466, "step": 440000 }, { "epoch": 1.08, "learning_rate": 4.443154687048235e-05, "loss": 0.047, "step": 445000 }, { "epoch": 1.1, "learning_rate": 4.4310692318277604e-05, "loss": 0.047, "step": 450000 }, { "epoch": 1.11, "learning_rate": 4.418870846171951e-05, "loss": 0.047, "step": 455000 }, { "epoch": 1.12, "learning_rate": 4.406562717238809e-05, "loss": 0.0462, "step": 460000 }, { "epoch": 1.13, "learning_rate": 4.394145631643063e-05, "loss": 0.0471, "step": 465000 }, { "epoch": 1.14, "learning_rate": 4.381615345463161e-05, "loss": 0.0475, "step": 470000 }, { "epoch": 1.16, "learning_rate": 4.368972524944734e-05, "loss": 0.0467, "step": 475000 }, { "epoch": 1.17, "learning_rate": 4.3562229672692154e-05, "loss": 0.0467, "step": 480000 }, { "epoch": 1.18, "learning_rate": 4.3433674830975235e-05, "loss": 0.0463, "step": 485000 }, { "epoch": 1.19, "learning_rate": 4.3303990737414704e-05, "loss": 0.0463, "step": 490000 }, { "epoch": 1.21, "learning_rate": 4.317326247486547e-05, "loss": 0.0466, "step": 495000 }, { "epoch": 1.22, "learning_rate": 4.304147185090266e-05, "loss": 0.0463, "step": 500000 }, { "epoch": 1.23, "learning_rate": 4.290859988496148e-05, "loss": 0.0466, "step": 505000 }, { "epoch": 1.24, "learning_rate": 4.277470750354905e-05, "loss": 0.0461, "step": 510000 }, { "epoch": 1.25, "learning_rate": 4.263980315969459e-05, "loss": 0.0463, "step": 515000 }, { "epoch": 1.27, "learning_rate": 4.250381343966794e-05, "loss": 0.0463, "step": 520000 }, { "epoch": 1.28, "learning_rate": 4.236680008297452e-05, "loss": 0.0467, "step": 525000 }, { "epoch": 1.29, "learning_rate": 4.222879881460605e-05, "loss": 0.046, "step": 530000 }, { "epoch": 1.3, "learning_rate": 4.2089790395700444e-05, "loss": 0.0465, "step": 535000 }, { "epoch": 1.31, "learning_rate": 4.194978295232646e-05, "loss": 0.0471, "step": 540000 }, { "epoch": 1.33, "learning_rate": 4.180875635941759e-05, "loss": 0.046, "step": 545000 }, { "epoch": 1.34, "learning_rate": 4.1666746777322316e-05, "loss": 0.0463, "step": 550000 }, { "epoch": 1.35, "learning_rate": 4.1523819909804684e-05, "loss": 0.0464, "step": 555000 }, { "epoch": 1.36, "learning_rate": 4.1379869705757123e-05, "loss": 0.0462, "step": 560000 }, { "epoch": 1.38, "learning_rate": 4.123501975940446e-05, "loss": 0.0466, "step": 565000 }, { "epoch": 1.39, "learning_rate": 4.108916259272307e-05, "loss": 0.0462, "step": 570000 }, { "epoch": 1.4, "learning_rate": 4.094239397826731e-05, "loss": 0.0471, "step": 575000 }, { "epoch": 1.41, "learning_rate": 4.079466376852837e-05, "loss": 0.0456, "step": 580000 }, { "epoch": 1.42, "learning_rate": 4.064606953152425e-05, "loss": 0.0459, "step": 585000 }, { "epoch": 1.44, "learning_rate": 4.049653102158943e-05, "loss": 0.0464, "step": 590000 }, { "epoch": 1.45, "learning_rate": 4.034608662496395e-05, "loss": 0.0453, "step": 595000 }, { "epoch": 1.46, "learning_rate": 4.019474513623393e-05, "loss": 0.0466, "step": 600000 }, { "epoch": 1.47, "learning_rate": 4.004248485601213e-05, "loss": 0.046, "step": 605000 }, { "epoch": 1.49, "learning_rate": 3.9889375601035114e-05, "loss": 0.0454, "step": 610000 }, { "epoch": 1.5, "learning_rate": 3.973536505734094e-05, "loss": 0.0456, "step": 615000 }, { "epoch": 1.51, "learning_rate": 3.958049277800142e-05, "loss": 0.045, "step": 620000 }, { "epoch": 1.52, "learning_rate": 3.94248302918014e-05, "loss": 0.0452, "step": 625000 }, { "epoch": 1.53, "learning_rate": 3.9268262097860184e-05, "loss": 0.0458, "step": 630000 }, { "epoch": 1.55, "learning_rate": 3.911089105446822e-05, "loss": 0.0455, "step": 635000 }, { "epoch": 1.56, "learning_rate": 3.895269512527556e-05, "loss": 0.0453, "step": 640000 }, { "epoch": 1.57, "learning_rate": 3.879368355800267e-05, "loss": 0.0459, "step": 645000 }, { "epoch": 1.58, "learning_rate": 3.863386564805006e-05, "loss": 0.0458, "step": 650000 }, { "epoch": 1.59, "learning_rate": 3.8473250737954924e-05, "loss": 0.0452, "step": 655000 }, { "epoch": 1.61, "learning_rate": 3.831181584524374e-05, "loss": 0.046, "step": 660000 }, { "epoch": 1.62, "learning_rate": 3.8149602467155784e-05, "loss": 0.0454, "step": 665000 }, { "epoch": 1.63, "learning_rate": 3.7986652769317283e-05, "loss": 0.0454, "step": 670000 }, { "epoch": 1.64, "learning_rate": 3.782291107551002e-05, "loss": 0.0455, "step": 675000 }, { "epoch": 1.66, "learning_rate": 3.7658452467105766e-05, "loss": 0.0453, "step": 680000 }, { "epoch": 1.67, "learning_rate": 3.7493287004972016e-05, "loss": 0.0461, "step": 685000 }, { "epoch": 1.68, "learning_rate": 3.7327324967260834e-05, "loss": 0.045, "step": 690000 }, { "epoch": 1.69, "learning_rate": 3.716067543266813e-05, "loss": 0.0453, "step": 695000 }, { "epoch": 1.7, "learning_rate": 3.6993281460825346e-05, "loss": 0.0454, "step": 700000 }, { "epoch": 1.72, "learning_rate": 3.682521980780319e-05, "loss": 0.0455, "step": 705000 }, { "epoch": 1.73, "learning_rate": 3.665646688334596e-05, "loss": 0.0445, "step": 710000 }, { "epoch": 1.74, "learning_rate": 3.648706652022939e-05, "loss": 0.0448, "step": 715000 }, { "epoch": 1.75, "learning_rate": 3.631692671937868e-05, "loss": 0.0444, "step": 720000 }, { "epoch": 1.77, "learning_rate": 3.6146125095823744e-05, "loss": 0.0449, "step": 725000 }, { "epoch": 1.78, "learning_rate": 3.597474036228449e-05, "loss": 0.0453, "step": 730000 }, { "epoch": 1.79, "learning_rate": 3.580264535204181e-05, "loss": 0.0447, "step": 735000 }, { "epoch": 1.8, "learning_rate": 3.562995320943442e-05, "loss": 0.0441, "step": 740000 }, { "epoch": 1.81, "learning_rate": 3.545660492947155e-05, "loss": 0.0448, "step": 745000 }, { "epoch": 1.83, "learning_rate": 3.52826799984853e-05, "loss": 0.0442, "step": 750000 }, { "epoch": 1.84, "learning_rate": 3.510818894752436e-05, "loss": 0.044, "step": 755000 }, { "epoch": 1.85, "learning_rate": 3.49330721388963e-05, "loss": 0.044, "step": 760000 }, { "epoch": 1.86, "learning_rate": 3.4757339457867414e-05, "loss": 0.0444, "step": 765000 }, { "epoch": 1.87, "learning_rate": 3.4581071484102234e-05, "loss": 0.0435, "step": 770000 }, { "epoch": 1.89, "learning_rate": 3.440424342594606e-05, "loss": 0.0436, "step": 775000 }, { "epoch": 1.9, "learning_rate": 3.4226865620304894e-05, "loss": 0.0444, "step": 780000 }, { "epoch": 1.91, "learning_rate": 3.404891278526875e-05, "loss": 0.0442, "step": 785000 }, { "epoch": 1.92, "learning_rate": 3.387046651852148e-05, "loss": 0.0446, "step": 790000 }, { "epoch": 1.94, "learning_rate": 3.369146584901852e-05, "loss": 0.045, "step": 795000 }, { "epoch": 1.95, "learning_rate": 3.3511992854912975e-05, "loss": 0.044, "step": 800000 }, { "epoch": 1.96, "learning_rate": 3.333205832954606e-05, "loss": 0.0439, "step": 805000 }, { "epoch": 1.97, "learning_rate": 3.315152846675856e-05, "loss": 0.0447, "step": 810000 }, { "epoch": 1.98, "learning_rate": 3.2970630450412766e-05, "loss": 0.0439, "step": 815000 }, { "epoch": 2.0, "learning_rate": 3.2789230337843214e-05, "loss": 0.0438, "step": 820000 }, { "epoch": 2.01, "learning_rate": 3.2607374887231645e-05, "loss": 0.0414, "step": 825000 }, { "epoch": 2.02, "learning_rate": 3.24251112477632e-05, "loss": 0.0411, "step": 830000 }, { "epoch": 2.03, "learning_rate": 3.22423405210557e-05, "loss": 0.0409, "step": 835000 }, { "epoch": 2.05, "learning_rate": 3.2059146256623924e-05, "loss": 0.0403, "step": 840000 }, { "epoch": 2.06, "learning_rate": 3.187561270645646e-05, "loss": 0.0407, "step": 845000 }, { "epoch": 2.07, "learning_rate": 3.169164053812308e-05, "loss": 0.0406, "step": 850000 }, { "epoch": 2.08, "learning_rate": 3.1507240268131666e-05, "loss": 0.0412, "step": 855000 }, { "epoch": 2.09, "learning_rate": 3.1322459449652304e-05, "loss": 0.0406, "step": 860000 }, { "epoch": 2.11, "learning_rate": 3.113734596258391e-05, "loss": 0.0407, "step": 865000 }, { "epoch": 2.12, "learning_rate": 3.095187370231551e-05, "loss": 0.0406, "step": 870000 }, { "epoch": 2.13, "learning_rate": 3.076605351107057e-05, "loss": 0.0412, "step": 875000 }, { "epoch": 2.14, "learning_rate": 3.057989625141167e-05, "loss": 0.0407, "step": 880000 }, { "epoch": 2.15, "learning_rate": 3.0393412805605544e-05, "loss": 0.0414, "step": 885000 }, { "epoch": 2.17, "learning_rate": 3.0206576669471674e-05, "loss": 0.0409, "step": 890000 }, { "epoch": 2.18, "learning_rate": 3.0019473514002417e-05, "loss": 0.0411, "step": 895000 }, { "epoch": 2.19, "learning_rate": 2.983207693323402e-05, "loss": 0.0405, "step": 900000 }, { "epoch": 2.2, "learning_rate": 2.9644397881880708e-05, "loss": 0.0409, "step": 905000 }, { "epoch": 2.22, "learning_rate": 2.9456447331169147e-05, "loss": 0.0408, "step": 910000 }, { "epoch": 2.23, "learning_rate": 2.9268236268197174e-05, "loss": 0.041, "step": 915000 }, { "epoch": 2.24, "learning_rate": 2.90797756952915e-05, "loss": 0.0413, "step": 920000 }, { "epoch": 2.25, "learning_rate": 2.8891038851321416e-05, "loss": 0.0412, "step": 925000 }, { "epoch": 2.26, "learning_rate": 2.8702150101270274e-05, "loss": 0.041, "step": 930000 }, { "epoch": 2.28, "learning_rate": 2.8512969290515646e-05, "loss": 0.0408, "step": 935000 }, { "epoch": 2.29, "learning_rate": 2.832358303857603e-05, "loss": 0.041, "step": 940000 }, { "epoch": 2.3, "learning_rate": 2.8134040363313168e-05, "loss": 0.0409, "step": 945000 }, { "epoch": 2.31, "learning_rate": 2.7944276502291256e-05, "loss": 0.041, "step": 950000 }, { "epoch": 2.33, "learning_rate": 2.775437846896827e-05, "loss": 0.04, "step": 955000 }, { "epoch": 2.34, "learning_rate": 2.756435746426503e-05, "loss": 0.04, "step": 960000 }, { "epoch": 2.35, "learning_rate": 2.73741104713839e-05, "loss": 0.0406, "step": 965000 }, { "epoch": 2.36, "learning_rate": 2.718376273659552e-05, "loss": 0.0401, "step": 970000 }, { "epoch": 2.37, "learning_rate": 2.699332546709509e-05, "loss": 0.0401, "step": 975000 }, { "epoch": 2.39, "learning_rate": 2.680269543053059e-05, "loss": 0.0391, "step": 980000 }, { "epoch": 2.4, "learning_rate": 2.6612036299970488e-05, "loss": 0.0402, "step": 985000 }, { "epoch": 2.41, "learning_rate": 2.6421168424917686e-05, "loss": 0.0403, "step": 990000 }, { "epoch": 2.42, "learning_rate": 2.623033199888022e-05, "loss": 0.0401, "step": 995000 }, { "epoch": 2.43, "learning_rate": 2.6039385468590504e-05, "loss": 0.0396, "step": 1000000 }, { "epoch": 2.45, "learning_rate": 2.5848339956464096e-05, "loss": 0.0398, "step": 1005000 }, { "epoch": 2.46, "learning_rate": 2.5657283063641474e-05, "loss": 0.0401, "step": 1010000 }, { "epoch": 2.47, "learning_rate": 2.5466225985224508e-05, "loss": 0.04, "step": 1015000 }, { "epoch": 2.48, "learning_rate": 2.5275103421570534e-05, "loss": 0.0398, "step": 1020000 }, { "epoch": 2.5, "learning_rate": 2.5083926532199688e-05, "loss": 0.0398, "step": 1025000 }, { "epoch": 2.51, "learning_rate": 2.4892821222247636e-05, "loss": 0.04, "step": 1030000 }, { "epoch": 2.52, "learning_rate": 2.4701607451032485e-05, "loss": 0.0402, "step": 1035000 }, { "epoch": 2.53, "learning_rate": 2.4510525847347732e-05, "loss": 0.0398, "step": 1040000 }, { "epoch": 2.54, "learning_rate": 2.4319434615660547e-05, "loss": 0.0404, "step": 1045000 }, { "epoch": 2.56, "learning_rate": 2.4128383168046513e-05, "loss": 0.0402, "step": 1050000 }, { "epoch": 2.57, "learning_rate": 2.3937306253792384e-05, "loss": 0.0397, "step": 1055000 }, { "epoch": 2.58, "learning_rate": 2.3746367903542062e-05, "loss": 0.0402, "step": 1060000 }, { "epoch": 2.59, "learning_rate": 2.355550283734671e-05, "loss": 0.0393, "step": 1065000 }, { "epoch": 2.61, "learning_rate": 2.3364684050478952e-05, "loss": 0.0397, "step": 1070000 }, { "epoch": 2.62, "learning_rate": 2.3173999040117696e-05, "loss": 0.0394, "step": 1075000 }, { "epoch": 2.63, "learning_rate": 2.29834207729988e-05, "loss": 0.039, "step": 1080000 }, { "epoch": 2.64, "learning_rate": 2.2792922295048335e-05, "loss": 0.0393, "step": 1085000 }, { "epoch": 2.65, "learning_rate": 2.2602629024425966e-05, "loss": 0.039, "step": 1090000 }, { "epoch": 2.67, "learning_rate": 2.2412437803062146e-05, "loss": 0.0394, "step": 1095000 }, { "epoch": 2.68, "learning_rate": 2.2222359836495102e-05, "loss": 0.0389, "step": 1100000 }, { "epoch": 2.69, "learning_rate": 2.2032520255937674e-05, "loss": 0.0392, "step": 1105000 }, { "epoch": 2.7, "learning_rate": 2.1842816139163587e-05, "loss": 0.039, "step": 1110000 }, { "epoch": 2.71, "learning_rate": 2.1653258683231724e-05, "loss": 0.0392, "step": 1115000 }, { "epoch": 2.73, "learning_rate": 2.1463934806956023e-05, "loss": 0.039, "step": 1120000 }, { "epoch": 2.74, "learning_rate": 2.1274817639807107e-05, "loss": 0.0393, "step": 1125000 }, { "epoch": 2.75, "learning_rate": 2.1085918237080158e-05, "loss": 0.0393, "step": 1130000 }, { "epoch": 2.76, "learning_rate": 2.0897285366938386e-05, "loss": 0.0389, "step": 1135000 }, { "epoch": 2.78, "learning_rate": 2.0708816881777654e-05, "loss": 0.0386, "step": 1140000 }, { "epoch": 2.79, "learning_rate": 2.052059934835373e-05, "loss": 0.0387, "step": 1145000 }, { "epoch": 2.8, "learning_rate": 2.0332718917197323e-05, "loss": 0.0387, "step": 1150000 }, { "epoch": 2.81, "learning_rate": 2.0145036181854185e-05, "loss": 0.0386, "step": 1155000 }, { "epoch": 2.82, "learning_rate": 1.9957712284869015e-05, "loss": 0.0388, "step": 1160000 }, { "epoch": 2.84, "learning_rate": 1.9770645630812195e-05, "loss": 0.0385, "step": 1165000 }, { "epoch": 2.85, "learning_rate": 1.958384733531826e-05, "loss": 0.0387, "step": 1170000 }, { "epoch": 2.86, "learning_rate": 1.939744032341169e-05, "loss": 0.038, "step": 1175000 }, { "epoch": 2.87, "learning_rate": 1.921132348667458e-05, "loss": 0.0389, "step": 1180000 }, { "epoch": 2.89, "learning_rate": 1.9025545040833008e-05, "loss": 0.0388, "step": 1185000 }, { "epoch": 2.9, "learning_rate": 1.8840078780999552e-05, "loss": 0.0383, "step": 1190000 }, { "epoch": 2.91, "learning_rate": 1.8655009750029695e-05, "loss": 0.0388, "step": 1195000 }, { "epoch": 2.92, "learning_rate": 1.8470311630602035e-05, "loss": 0.0379, "step": 1200000 }, { "epoch": 2.93, "learning_rate": 1.8285995219685757e-05, "loss": 0.0378, "step": 1205000 }, { "epoch": 2.95, "learning_rate": 1.8102071291936395e-05, "loss": 0.0381, "step": 1210000 }, { "epoch": 2.96, "learning_rate": 1.7918587276844793e-05, "loss": 0.0382, "step": 1215000 }, { "epoch": 2.97, "learning_rate": 1.7735443869214267e-05, "loss": 0.0385, "step": 1220000 }, { "epoch": 2.98, "learning_rate": 1.7552725298494208e-05, "loss": 0.038, "step": 1225000 }, { "epoch": 2.99, "learning_rate": 1.7370515089505386e-05, "loss": 0.0381, "step": 1230000 }, { "epoch": 3.01, "learning_rate": 1.718875070180597e-05, "loss": 0.0362, "step": 1235000 }, { "epoch": 3.02, "learning_rate": 1.7007370282692398e-05, "loss": 0.0339, "step": 1240000 }, { "epoch": 3.03, "learning_rate": 1.682649342119219e-05, "loss": 0.0349, "step": 1245000 }, { "epoch": 3.04, "learning_rate": 1.664609436151844e-05, "loss": 0.0345, "step": 1250000 }, { "epoch": 3.06, "learning_rate": 1.6466183649328544e-05, "loss": 0.0348, "step": 1255000 }, { "epoch": 3.07, "learning_rate": 1.628673595562077e-05, "loss": 0.0346, "step": 1260000 }, { "epoch": 3.08, "learning_rate": 1.6107833563524666e-05, "loss": 0.0348, "step": 1265000 }, { "epoch": 3.09, "learning_rate": 1.5929450984231475e-05, "loss": 0.0341, "step": 1270000 }, { "epoch": 3.1, "learning_rate": 1.575159864552045e-05, "loss": 0.0346, "step": 1275000 }, { "epoch": 3.12, "learning_rate": 1.5574286944174337e-05, "loss": 0.0348, "step": 1280000 }, { "epoch": 3.13, "learning_rate": 1.5397526245371656e-05, "loss": 0.0347, "step": 1285000 }, { "epoch": 3.14, "learning_rate": 1.5221362079222911e-05, "loss": 0.0346, "step": 1290000 }, { "epoch": 3.15, "learning_rate": 1.5045699154455748e-05, "loss": 0.035, "step": 1295000 }, { "epoch": 3.17, "learning_rate": 1.4870653329234462e-05, "loss": 0.0343, "step": 1300000 }, { "epoch": 3.18, "learning_rate": 1.469619963913822e-05, "loss": 0.0344, "step": 1305000 }, { "epoch": 3.19, "learning_rate": 1.4522348282273651e-05, "loss": 0.0346, "step": 1310000 }, { "epoch": 3.2, "learning_rate": 1.434910942153659e-05, "loss": 0.0341, "step": 1315000 }, { "epoch": 3.21, "learning_rate": 1.4176493184017924e-05, "loss": 0.0338, "step": 1320000 }, { "epoch": 3.23, "learning_rate": 1.4004509660411627e-05, "loss": 0.0345, "step": 1325000 }, { "epoch": 3.24, "learning_rate": 1.3833134687545127e-05, "loss": 0.0343, "step": 1330000 }, { "epoch": 3.25, "learning_rate": 1.3662480932190311e-05, "loss": 0.0341, "step": 1335000 }, { "epoch": 3.26, "learning_rate": 1.3492421770010699e-05, "loss": 0.0336, "step": 1340000 }, { "epoch": 3.27, "learning_rate": 1.3323069396041015e-05, "loss": 0.0343, "step": 1345000 }, { "epoch": 3.29, "learning_rate": 1.3154399624907232e-05, "loss": 0.0342, "step": 1350000 }, { "epoch": 3.3, "learning_rate": 1.2986422316603203e-05, "loss": 0.0346, "step": 1355000 }, { "epoch": 3.31, "learning_rate": 1.2819147290643238e-05, "loss": 0.0341, "step": 1360000 }, { "epoch": 3.32, "learning_rate": 1.2652584325488027e-05, "loss": 0.0339, "step": 1365000 }, { "epoch": 3.34, "learning_rate": 1.2486743157973069e-05, "loss": 0.0342, "step": 1370000 }, { "epoch": 3.35, "learning_rate": 1.2321666444080471e-05, "loss": 0.0344, "step": 1375000 }, { "epoch": 3.36, "learning_rate": 1.2157264951667166e-05, "loss": 0.0338, "step": 1380000 }, { "epoch": 3.37, "learning_rate": 1.1993647173310798e-05, "loss": 0.0338, "step": 1385000 }, { "epoch": 3.38, "learning_rate": 1.183078971233793e-05, "loss": 0.0334, "step": 1390000 }, { "epoch": 3.4, "learning_rate": 1.1668669736135962e-05, "loss": 0.0347, "step": 1395000 }, { "epoch": 3.41, "learning_rate": 1.1507361582461623e-05, "loss": 0.034, "step": 1400000 }, { "epoch": 3.42, "learning_rate": 1.134684217315512e-05, "loss": 0.0332, "step": 1405000 }, { "epoch": 3.43, "learning_rate": 1.1187089015206759e-05, "loss": 0.0336, "step": 1410000 }, { "epoch": 3.45, "learning_rate": 1.1028175361103207e-05, "loss": 0.0332, "step": 1415000 }, { "epoch": 3.46, "learning_rate": 1.0870078463341248e-05, "loss": 0.0336, "step": 1420000 }, { "epoch": 3.47, "learning_rate": 1.0712838947388687e-05, "loss": 0.0334, "step": 1425000 }, { "epoch": 3.48, "learning_rate": 1.0556371856281719e-05, "loss": 0.0338, "step": 1430000 }, { "epoch": 3.49, "learning_rate": 1.0400780485452265e-05, "loss": 0.0337, "step": 1435000 }, { "epoch": 3.51, "learning_rate": 1.0246042546828628e-05, "loss": 0.0335, "step": 1440000 }, { "epoch": 3.52, "learning_rate": 1.009216708598616e-05, "loss": 0.0332, "step": 1445000 }, { "epoch": 3.53, "learning_rate": 9.939163098082024e-06, "loss": 0.0324, "step": 1450000 }, { "epoch": 3.54, "learning_rate": 9.787039527329362e-06, "loss": 0.0333, "step": 1455000 }, { "epoch": 3.55, "learning_rate": 9.635805266474399e-06, "loss": 0.0333, "step": 1460000 }, { "epoch": 3.57, "learning_rate": 9.485439167479077e-06, "loss": 0.0335, "step": 1465000 }, { "epoch": 3.58, "learning_rate": 9.335980372105996e-06, "loss": 0.0333, "step": 1470000 }, { "epoch": 3.59, "learning_rate": 9.187496865476697e-06, "loss": 0.033, "step": 1475000 }, { "epoch": 3.6, "learning_rate": 9.03987847348179e-06, "loss": 0.033, "step": 1480000 }, { "epoch": 3.62, "learning_rate": 8.893222690812272e-06, "loss": 0.033, "step": 1485000 }, { "epoch": 3.63, "learning_rate": 8.747537527998633e-06, "loss": 0.0331, "step": 1490000 }, { "epoch": 3.64, "learning_rate": 8.602773192648179e-06, "loss": 0.0329, "step": 1495000 }, { "epoch": 3.65, "learning_rate": 8.45893872011418e-06, "loss": 0.0329, "step": 1500000 }, { "epoch": 3.66, "learning_rate": 8.316100063601678e-06, "loss": 0.0323, "step": 1505000 }, { "epoch": 3.68, "learning_rate": 8.174208418379433e-06, "loss": 0.0329, "step": 1510000 }, { "epoch": 3.69, "learning_rate": 8.03332884679727e-06, "loss": 0.033, "step": 1515000 }, { "epoch": 3.7, "learning_rate": 7.893441102454437e-06, "loss": 0.0328, "step": 1520000 } ], "logging_steps": 5000, "max_steps": 2053645, "num_train_epochs": 5, "save_steps": 40000, "total_flos": 3.8313184144529346e+20, "trial_name": null, "trial_params": null }