{ "best_metric": null, "best_model_checkpoint": null, "epoch": 18.846153846153847, "eval_steps": 500, "global_step": 14040, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.2, "learning_rate": 3.2763532763532763e-06, "loss": 0.819, "step": 46 }, { "epoch": 0.39, "learning_rate": 6.5527065527065525e-06, "loss": 0.8698, "step": 92 }, { "epoch": 0.59, "learning_rate": 9.82905982905983e-06, "loss": 0.7667, "step": 138 }, { "epoch": 0.79, "learning_rate": 1.3105413105413105e-05, "loss": 0.6645, "step": 184 }, { "epoch": 0.98, "learning_rate": 1.6381766381766382e-05, "loss": 0.5342, "step": 230 }, { "epoch": 1.18, "learning_rate": 1.965811965811966e-05, "loss": 0.5067, "step": 276 }, { "epoch": 1.38, "learning_rate": 2.2934472934472936e-05, "loss": 0.434, "step": 322 }, { "epoch": 1.57, "learning_rate": 2.621082621082621e-05, "loss": 0.4473, "step": 368 }, { "epoch": 1.77, "learning_rate": 2.948717948717949e-05, "loss": 0.3901, "step": 414 }, { "epoch": 1.97, "learning_rate": 3.2763532763532764e-05, "loss": 0.3859, "step": 460 }, { "epoch": 2.16, "learning_rate": 3.603988603988604e-05, "loss": 0.3526, "step": 506 }, { "epoch": 2.36, "learning_rate": 3.931623931623932e-05, "loss": 0.3339, "step": 552 }, { "epoch": 2.56, "learning_rate": 4.259259259259259e-05, "loss": 0.343, "step": 598 }, { "epoch": 2.75, "learning_rate": 4.586894586894587e-05, "loss": 0.2998, "step": 644 }, { "epoch": 2.95, "learning_rate": 4.9145299145299147e-05, "loss": 0.2778, "step": 690 }, { "epoch": 3.15, "learning_rate": 5.242165242165242e-05, "loss": 0.3029, "step": 736 }, { "epoch": 1.06, "learning_rate": 5.5698005698005694e-05, "loss": 0.2557, "step": 782 }, { "epoch": 1.25, "learning_rate": 5.897435897435898e-05, "loss": 0.2605, "step": 828 }, { "epoch": 1.45, "learning_rate": 6.225071225071225e-05, "loss": 0.2566, "step": 874 }, { "epoch": 1.65, "learning_rate": 6.552706552706553e-05, "loss": 0.2409, "step": 920 }, { "epoch": 1.84, "learning_rate": 6.880341880341881e-05, "loss": 0.2518, "step": 966 }, { "epoch": 2.04, "learning_rate": 7.207977207977208e-05, "loss": 0.279, "step": 1012 }, { "epoch": 2.24, "learning_rate": 7.535612535612536e-05, "loss": 0.2421, "step": 1058 }, { "epoch": 2.43, "learning_rate": 7.863247863247864e-05, "loss": 0.2237, "step": 1104 }, { "epoch": 2.63, "learning_rate": 8.190883190883192e-05, "loss": 0.2057, "step": 1150 }, { "epoch": 2.82, "learning_rate": 8.518518518518518e-05, "loss": 0.2496, "step": 1196 }, { "epoch": 3.02, "learning_rate": 8.846153846153847e-05, "loss": 0.2397, "step": 1242 }, { "epoch": 3.22, "learning_rate": 9.173789173789175e-05, "loss": 0.2322, "step": 1288 }, { "epoch": 3.41, "learning_rate": 9.501424501424501e-05, "loss": 0.2046, "step": 1334 }, { "epoch": 3.61, "learning_rate": 9.829059829059829e-05, "loss": 0.2308, "step": 1380 }, { "epoch": 3.81, "learning_rate": 9.982589427033872e-05, "loss": 0.2052, "step": 1426 }, { "epoch": 4.0, "learning_rate": 9.946185501741058e-05, "loss": 0.1759, "step": 1472 }, { "epoch": 4.2, "learning_rate": 9.909781576448244e-05, "loss": 0.2116, "step": 1518 }, { "epoch": 2.11, "learning_rate": 9.87337765115543e-05, "loss": 0.1603, "step": 1564 }, { "epoch": 2.31, "learning_rate": 9.836973725862614e-05, "loss": 0.1892, "step": 1610 }, { "epoch": 2.5, "learning_rate": 9.800569800569801e-05, "loss": 0.1656, "step": 1656 }, { "epoch": 2.7, "learning_rate": 9.764165875276987e-05, "loss": 0.181, "step": 1702 }, { "epoch": 2.9, "learning_rate": 9.727761949984172e-05, "loss": 0.1727, "step": 1748 }, { "epoch": 3.09, "learning_rate": 9.691358024691359e-05, "loss": 0.1555, "step": 1794 }, { "epoch": 3.29, "learning_rate": 9.654954099398545e-05, "loss": 0.1152, "step": 1840 }, { "epoch": 3.49, "learning_rate": 9.61855017410573e-05, "loss": 0.1404, "step": 1886 }, { "epoch": 3.68, "learning_rate": 9.582146248812915e-05, "loss": 0.1635, "step": 1932 }, { "epoch": 3.88, "learning_rate": 9.545742323520101e-05, "loss": 0.1429, "step": 1978 }, { "epoch": 4.08, "learning_rate": 9.509338398227288e-05, "loss": 0.1369, "step": 2024 }, { "epoch": 4.27, "learning_rate": 9.472934472934474e-05, "loss": 0.1482, "step": 2070 }, { "epoch": 4.47, "learning_rate": 9.436530547641659e-05, "loss": 0.1388, "step": 2116 }, { "epoch": 4.67, "learning_rate": 9.400126622348845e-05, "loss": 0.1602, "step": 2162 }, { "epoch": 4.86, "learning_rate": 9.36372269705603e-05, "loss": 0.1764, "step": 2208 }, { "epoch": 5.06, "learning_rate": 9.327318771763217e-05, "loss": 0.16, "step": 2254 }, { "epoch": 5.26, "learning_rate": 9.290914846470402e-05, "loss": 0.133, "step": 2300 }, { "epoch": 3.17, "learning_rate": 9.254510921177588e-05, "loss": 0.1226, "step": 2346 }, { "epoch": 3.36, "learning_rate": 9.218106995884775e-05, "loss": 0.1062, "step": 2392 }, { "epoch": 3.56, "learning_rate": 9.181703070591961e-05, "loss": 0.1184, "step": 2438 }, { "epoch": 3.76, "learning_rate": 9.145299145299146e-05, "loss": 0.107, "step": 2484 }, { "epoch": 3.95, "learning_rate": 9.108895220006331e-05, "loss": 0.1113, "step": 2530 }, { "epoch": 4.15, "learning_rate": 9.072491294713517e-05, "loss": 0.13, "step": 2576 }, { "epoch": 4.35, "learning_rate": 9.036087369420702e-05, "loss": 0.1066, "step": 2622 }, { "epoch": 4.54, "learning_rate": 8.999683444127889e-05, "loss": 0.1131, "step": 2668 }, { "epoch": 4.74, "learning_rate": 8.963279518835075e-05, "loss": 0.1117, "step": 2714 }, { "epoch": 4.94, "learning_rate": 8.926875593542262e-05, "loss": 0.1143, "step": 2760 }, { "epoch": 5.13, "learning_rate": 8.890471668249447e-05, "loss": 0.1041, "step": 2806 }, { "epoch": 5.33, "learning_rate": 8.854067742956632e-05, "loss": 0.1041, "step": 2852 }, { "epoch": 5.53, "learning_rate": 8.817663817663818e-05, "loss": 0.1205, "step": 2898 }, { "epoch": 5.72, "learning_rate": 8.781259892371004e-05, "loss": 0.1214, "step": 2944 }, { "epoch": 5.92, "learning_rate": 8.74485596707819e-05, "loss": 0.117, "step": 2990 }, { "epoch": 6.12, "learning_rate": 8.708452041785376e-05, "loss": 0.1115, "step": 3036 }, { "epoch": 4.03, "learning_rate": 8.672048116492562e-05, "loss": 0.106, "step": 3082 }, { "epoch": 4.22, "learning_rate": 8.635644191199747e-05, "loss": 0.0876, "step": 3128 }, { "epoch": 4.42, "learning_rate": 8.599240265906932e-05, "loss": 0.0827, "step": 3174 }, { "epoch": 4.62, "learning_rate": 8.562836340614119e-05, "loss": 0.0842, "step": 3220 }, { "epoch": 4.81, "learning_rate": 8.526432415321305e-05, "loss": 0.0933, "step": 3266 }, { "epoch": 5.01, "learning_rate": 8.490028490028491e-05, "loss": 0.0908, "step": 3312 }, { "epoch": 5.21, "learning_rate": 8.453624564735676e-05, "loss": 0.089, "step": 3358 }, { "epoch": 5.4, "learning_rate": 8.417220639442863e-05, "loss": 0.084, "step": 3404 }, { "epoch": 5.6, "learning_rate": 8.380816714150048e-05, "loss": 0.0745, "step": 3450 }, { "epoch": 5.79, "learning_rate": 8.344412788857233e-05, "loss": 0.0786, "step": 3496 }, { "epoch": 5.99, "learning_rate": 8.308008863564419e-05, "loss": 0.077, "step": 3542 }, { "epoch": 6.19, "learning_rate": 8.271604938271605e-05, "loss": 0.0882, "step": 3588 }, { "epoch": 6.38, "learning_rate": 8.235201012978792e-05, "loss": 0.0902, "step": 3634 }, { "epoch": 6.58, "learning_rate": 8.198797087685977e-05, "loss": 0.0879, "step": 3680 }, { "epoch": 6.78, "learning_rate": 8.162393162393163e-05, "loss": 0.0881, "step": 3726 }, { "epoch": 6.97, "learning_rate": 8.125989237100348e-05, "loss": 0.0955, "step": 3772 }, { "epoch": 7.17, "learning_rate": 8.089585311807535e-05, "loss": 0.0917, "step": 3818 }, { "epoch": 5.08, "learning_rate": 8.05318138651472e-05, "loss": 0.0851, "step": 3864 }, { "epoch": 5.28, "learning_rate": 8.016777461221906e-05, "loss": 0.0638, "step": 3910 }, { "epoch": 5.47, "learning_rate": 7.980373535929092e-05, "loss": 0.0795, "step": 3956 }, { "epoch": 5.67, "learning_rate": 7.943969610636279e-05, "loss": 0.0671, "step": 4002 }, { "epoch": 5.87, "learning_rate": 7.907565685343464e-05, "loss": 0.0779, "step": 4048 }, { "epoch": 6.06, "learning_rate": 7.871161760050649e-05, "loss": 0.0722, "step": 4094 }, { "epoch": 6.26, "learning_rate": 7.834757834757835e-05, "loss": 0.0594, "step": 4140 }, { "epoch": 6.46, "learning_rate": 7.79835390946502e-05, "loss": 0.0774, "step": 4186 }, { "epoch": 6.65, "learning_rate": 7.761949984172207e-05, "loss": 0.0727, "step": 4232 }, { "epoch": 6.85, "learning_rate": 7.725546058879393e-05, "loss": 0.0621, "step": 4278 }, { "epoch": 7.05, "learning_rate": 7.68914213358658e-05, "loss": 0.0619, "step": 4324 }, { "epoch": 7.24, "learning_rate": 7.652738208293764e-05, "loss": 0.0633, "step": 4370 }, { "epoch": 7.44, "learning_rate": 7.61633428300095e-05, "loss": 0.0585, "step": 4416 }, { "epoch": 7.64, "learning_rate": 7.579930357708136e-05, "loss": 0.0743, "step": 4462 }, { "epoch": 7.83, "learning_rate": 7.543526432415322e-05, "loss": 0.0751, "step": 4508 }, { "epoch": 8.03, "learning_rate": 7.507122507122507e-05, "loss": 0.0597, "step": 4554 }, { "epoch": 8.23, "learning_rate": 7.470718581829694e-05, "loss": 0.0692, "step": 4600 }, { "epoch": 6.14, "learning_rate": 7.43431465653688e-05, "loss": 0.0595, "step": 4646 }, { "epoch": 6.33, "learning_rate": 7.397910731244065e-05, "loss": 0.0505, "step": 4692 }, { "epoch": 6.53, "learning_rate": 7.36150680595125e-05, "loss": 0.0516, "step": 4738 }, { "epoch": 6.73, "learning_rate": 7.325102880658436e-05, "loss": 0.0577, "step": 4784 }, { "epoch": 6.92, "learning_rate": 7.288698955365623e-05, "loss": 0.0546, "step": 4830 }, { "epoch": 7.12, "learning_rate": 7.252295030072809e-05, "loss": 0.0472, "step": 4876 }, { "epoch": 7.32, "learning_rate": 7.215891104779994e-05, "loss": 0.0565, "step": 4922 }, { "epoch": 7.51, "learning_rate": 7.17948717948718e-05, "loss": 0.0545, "step": 4968 }, { "epoch": 7.71, "learning_rate": 7.143083254194365e-05, "loss": 0.0399, "step": 5014 }, { "epoch": 7.91, "learning_rate": 7.10667932890155e-05, "loss": 0.0536, "step": 5060 }, { "epoch": 8.1, "learning_rate": 7.070275403608737e-05, "loss": 0.0567, "step": 5106 }, { "epoch": 8.3, "learning_rate": 7.033871478315923e-05, "loss": 0.0608, "step": 5152 }, { "epoch": 8.5, "learning_rate": 6.99746755302311e-05, "loss": 0.0517, "step": 5198 }, { "epoch": 8.69, "learning_rate": 6.961063627730295e-05, "loss": 0.0514, "step": 5244 }, { "epoch": 8.89, "learning_rate": 6.924659702437481e-05, "loss": 0.0525, "step": 5290 }, { "epoch": 9.09, "learning_rate": 6.888255777144666e-05, "loss": 0.0554, "step": 5336 }, { "epoch": 9.28, "learning_rate": 6.851851851851852e-05, "loss": 0.0696, "step": 5382 }, { "epoch": 7.19, "learning_rate": 6.815447926559037e-05, "loss": 0.0358, "step": 5428 }, { "epoch": 7.39, "learning_rate": 6.779044001266224e-05, "loss": 0.0413, "step": 5474 }, { "epoch": 7.59, "learning_rate": 6.74264007597341e-05, "loss": 0.0358, "step": 5520 }, { "epoch": 7.78, "learning_rate": 6.706236150680597e-05, "loss": 0.0483, "step": 5566 }, { "epoch": 7.98, "learning_rate": 6.669832225387782e-05, "loss": 0.039, "step": 5612 }, { "epoch": 8.18, "learning_rate": 6.633428300094967e-05, "loss": 0.0381, "step": 5658 }, { "epoch": 8.37, "learning_rate": 6.597024374802153e-05, "loss": 0.0502, "step": 5704 }, { "epoch": 8.57, "learning_rate": 6.560620449509338e-05, "loss": 0.0421, "step": 5750 }, { "epoch": 8.76, "learning_rate": 6.524216524216524e-05, "loss": 0.0447, "step": 5796 }, { "epoch": 8.96, "learning_rate": 6.487812598923711e-05, "loss": 0.0406, "step": 5842 }, { "epoch": 9.16, "learning_rate": 6.451408673630897e-05, "loss": 0.0446, "step": 5888 }, { "epoch": 9.35, "learning_rate": 6.415004748338082e-05, "loss": 0.0483, "step": 5934 }, { "epoch": 9.55, "learning_rate": 6.378600823045267e-05, "loss": 0.0432, "step": 5980 }, { "epoch": 9.75, "learning_rate": 6.342196897752454e-05, "loss": 0.0483, "step": 6026 }, { "epoch": 9.94, "learning_rate": 6.30579297245964e-05, "loss": 0.0493, "step": 6072 }, { "epoch": 10.14, "learning_rate": 6.269389047166825e-05, "loss": 0.0448, "step": 6118 }, { "epoch": 8.05, "learning_rate": 6.232985121874011e-05, "loss": 0.037, "step": 6164 }, { "epoch": 8.25, "learning_rate": 6.196581196581198e-05, "loss": 0.0319, "step": 6210 }, { "epoch": 8.44, "learning_rate": 6.160177271288383e-05, "loss": 0.0341, "step": 6256 }, { "epoch": 8.64, "learning_rate": 6.123773345995568e-05, "loss": 0.0322, "step": 6302 }, { "epoch": 8.84, "learning_rate": 6.087369420702754e-05, "loss": 0.0375, "step": 6348 }, { "epoch": 9.03, "learning_rate": 6.0509654954099404e-05, "loss": 0.0291, "step": 6394 }, { "epoch": 9.23, "learning_rate": 6.014561570117127e-05, "loss": 0.0322, "step": 6440 }, { "epoch": 9.43, "learning_rate": 5.978157644824312e-05, "loss": 0.0315, "step": 6486 }, { "epoch": 9.62, "learning_rate": 5.9417537195314975e-05, "loss": 0.0341, "step": 6532 }, { "epoch": 9.82, "learning_rate": 5.905349794238684e-05, "loss": 0.0314, "step": 6578 }, { "epoch": 10.02, "learning_rate": 5.868945868945869e-05, "loss": 0.0338, "step": 6624 }, { "epoch": 10.21, "learning_rate": 5.8325419436530546e-05, "loss": 0.0311, "step": 6670 }, { "epoch": 10.41, "learning_rate": 5.796138018360241e-05, "loss": 0.0335, "step": 6716 }, { "epoch": 10.61, "learning_rate": 5.7597340930674274e-05, "loss": 0.0409, "step": 6762 }, { "epoch": 10.8, "learning_rate": 5.7233301677746124e-05, "loss": 0.0332, "step": 6808 }, { "epoch": 11.0, "learning_rate": 5.686926242481798e-05, "loss": 0.0456, "step": 6854 }, { "epoch": 11.2, "learning_rate": 5.6505223171889845e-05, "loss": 0.0311, "step": 6900 }, { "epoch": 9.11, "learning_rate": 5.61411839189617e-05, "loss": 0.0331, "step": 6946 }, { "epoch": 9.3, "learning_rate": 5.577714466603355e-05, "loss": 0.0234, "step": 6992 }, { "epoch": 9.5, "learning_rate": 5.5413105413105416e-05, "loss": 0.0214, "step": 7038 }, { "epoch": 9.7, "learning_rate": 5.504906616017728e-05, "loss": 0.0266, "step": 7084 }, { "epoch": 9.89, "learning_rate": 5.4685026907249136e-05, "loss": 0.0279, "step": 7130 }, { "epoch": 10.09, "learning_rate": 5.4320987654320986e-05, "loss": 0.0256, "step": 7176 }, { "epoch": 10.29, "learning_rate": 5.395694840139285e-05, "loss": 0.0256, "step": 7222 }, { "epoch": 10.48, "learning_rate": 5.359290914846471e-05, "loss": 0.0277, "step": 7268 }, { "epoch": 10.68, "learning_rate": 5.322886989553656e-05, "loss": 0.0237, "step": 7314 }, { "epoch": 10.88, "learning_rate": 5.286483064260842e-05, "loss": 0.0286, "step": 7360 }, { "epoch": 11.07, "learning_rate": 5.2500791389680285e-05, "loss": 0.0285, "step": 7406 }, { "epoch": 11.27, "learning_rate": 5.213675213675214e-05, "loss": 0.0325, "step": 7452 }, { "epoch": 11.47, "learning_rate": 5.177271288382399e-05, "loss": 0.0345, "step": 7498 }, { "epoch": 11.66, "learning_rate": 5.1408673630895856e-05, "loss": 0.0254, "step": 7544 }, { "epoch": 11.86, "learning_rate": 5.104463437796771e-05, "loss": 0.0277, "step": 7590 }, { "epoch": 12.06, "learning_rate": 5.0680595125039577e-05, "loss": 0.0242, "step": 7636 }, { "epoch": 12.25, "learning_rate": 5.031655587211143e-05, "loss": 0.0277, "step": 7682 }, { "epoch": 10.16, "learning_rate": 4.995251661918329e-05, "loss": 0.0188, "step": 7728 }, { "epoch": 10.36, "learning_rate": 4.958847736625515e-05, "loss": 0.0218, "step": 7774 }, { "epoch": 10.56, "learning_rate": 4.9224438113327004e-05, "loss": 0.0244, "step": 7820 }, { "epoch": 10.75, "learning_rate": 4.886039886039887e-05, "loss": 0.0204, "step": 7866 }, { "epoch": 10.95, "learning_rate": 4.849635960747072e-05, "loss": 0.0224, "step": 7912 }, { "epoch": 11.15, "learning_rate": 4.813232035454258e-05, "loss": 0.0205, "step": 7958 }, { "epoch": 11.34, "learning_rate": 4.776828110161444e-05, "loss": 0.027, "step": 8004 }, { "epoch": 11.54, "learning_rate": 4.7404241848686296e-05, "loss": 0.0183, "step": 8050 }, { "epoch": 11.74, "learning_rate": 4.704020259575815e-05, "loss": 0.0199, "step": 8096 }, { "epoch": 11.93, "learning_rate": 4.667616334283001e-05, "loss": 0.0227, "step": 8142 }, { "epoch": 12.13, "learning_rate": 4.6312124089901874e-05, "loss": 0.0232, "step": 8188 }, { "epoch": 12.32, "learning_rate": 4.5948084836973724e-05, "loss": 0.0221, "step": 8234 }, { "epoch": 12.52, "learning_rate": 4.558404558404559e-05, "loss": 0.0207, "step": 8280 }, { "epoch": 12.72, "learning_rate": 4.5220006331117445e-05, "loss": 0.0213, "step": 8326 }, { "epoch": 12.91, "learning_rate": 4.48559670781893e-05, "loss": 0.0201, "step": 8372 }, { "epoch": 13.11, "learning_rate": 4.449192782526116e-05, "loss": 0.0196, "step": 8418 }, { "epoch": 11.02, "learning_rate": 4.412788857233302e-05, "loss": 0.0197, "step": 8464 }, { "epoch": 11.22, "learning_rate": 4.376384931940488e-05, "loss": 0.0154, "step": 8510 }, { "epoch": 11.41, "learning_rate": 4.3399810066476736e-05, "loss": 0.0177, "step": 8556 }, { "epoch": 11.61, "learning_rate": 4.303577081354859e-05, "loss": 0.0158, "step": 8602 }, { "epoch": 11.81, "learning_rate": 4.267173156062046e-05, "loss": 0.0158, "step": 8648 }, { "epoch": 12.0, "learning_rate": 4.230769230769231e-05, "loss": 0.0163, "step": 8694 }, { "epoch": 12.2, "learning_rate": 4.194365305476417e-05, "loss": 0.0184, "step": 8740 }, { "epoch": 12.4, "learning_rate": 4.157961380183603e-05, "loss": 0.016, "step": 8786 }, { "epoch": 12.59, "learning_rate": 4.1215574548907885e-05, "loss": 0.0154, "step": 8832 }, { "epoch": 12.79, "learning_rate": 4.085153529597974e-05, "loss": 0.0168, "step": 8878 }, { "epoch": 12.99, "learning_rate": 4.04874960430516e-05, "loss": 0.0165, "step": 8924 }, { "epoch": 13.18, "learning_rate": 4.012345679012346e-05, "loss": 0.0143, "step": 8970 }, { "epoch": 13.38, "learning_rate": 3.975941753719531e-05, "loss": 0.0188, "step": 9016 }, { "epoch": 13.58, "learning_rate": 3.9395378284267176e-05, "loss": 0.0159, "step": 9062 }, { "epoch": 13.77, "learning_rate": 3.903133903133903e-05, "loss": 0.0192, "step": 9108 }, { "epoch": 13.97, "learning_rate": 3.866729977841089e-05, "loss": 0.018, "step": 9154 }, { "epoch": 14.17, "learning_rate": 3.830326052548275e-05, "loss": 0.0194, "step": 9200 }, { "epoch": 12.08, "learning_rate": 3.793922127255461e-05, "loss": 0.0139, "step": 9246 }, { "epoch": 12.27, "learning_rate": 3.757518201962647e-05, "loss": 0.0122, "step": 9292 }, { "epoch": 12.47, "learning_rate": 3.7211142766698325e-05, "loss": 0.015, "step": 9338 }, { "epoch": 12.67, "learning_rate": 3.684710351377018e-05, "loss": 0.0112, "step": 9384 }, { "epoch": 12.86, "learning_rate": 3.6483064260842046e-05, "loss": 0.0134, "step": 9430 }, { "epoch": 13.06, "learning_rate": 3.6119025007913896e-05, "loss": 0.0139, "step": 9476 }, { "epoch": 13.26, "learning_rate": 3.575498575498576e-05, "loss": 0.0129, "step": 9522 }, { "epoch": 13.45, "learning_rate": 3.539094650205762e-05, "loss": 0.0147, "step": 9568 }, { "epoch": 13.65, "learning_rate": 3.5026907249129474e-05, "loss": 0.0133, "step": 9614 }, { "epoch": 13.85, "learning_rate": 3.466286799620133e-05, "loss": 0.0135, "step": 9660 }, { "epoch": 14.04, "learning_rate": 3.429882874327319e-05, "loss": 0.015, "step": 9706 }, { "epoch": 14.24, "learning_rate": 3.393478949034505e-05, "loss": 0.0128, "step": 9752 }, { "epoch": 14.44, "learning_rate": 3.35707502374169e-05, "loss": 0.0135, "step": 9798 }, { "epoch": 14.63, "learning_rate": 3.3206710984488765e-05, "loss": 0.0137, "step": 9844 }, { "epoch": 14.83, "learning_rate": 3.284267173156062e-05, "loss": 0.013, "step": 9890 }, { "epoch": 15.03, "learning_rate": 3.247863247863248e-05, "loss": 0.0157, "step": 9936 }, { "epoch": 15.22, "learning_rate": 3.2114593225704336e-05, "loss": 0.0139, "step": 9982 }, { "epoch": 13.13, "learning_rate": 3.17505539727762e-05, "loss": 0.0103, "step": 10028 }, { "epoch": 13.33, "learning_rate": 3.138651471984806e-05, "loss": 0.0095, "step": 10074 }, { "epoch": 13.53, "learning_rate": 3.1022475466919914e-05, "loss": 0.0114, "step": 10120 }, { "epoch": 13.72, "learning_rate": 3.065843621399177e-05, "loss": 0.0114, "step": 10166 }, { "epoch": 13.92, "learning_rate": 3.029439696106363e-05, "loss": 0.0097, "step": 10212 }, { "epoch": 14.12, "learning_rate": 2.9930357708135488e-05, "loss": 0.0094, "step": 10258 }, { "epoch": 14.31, "learning_rate": 2.956631845520735e-05, "loss": 0.0109, "step": 10304 }, { "epoch": 14.51, "learning_rate": 2.9202279202279202e-05, "loss": 0.0105, "step": 10350 }, { "epoch": 14.71, "learning_rate": 2.883823994935106e-05, "loss": 0.0103, "step": 10396 }, { "epoch": 14.9, "learning_rate": 2.847420069642292e-05, "loss": 0.0111, "step": 10442 }, { "epoch": 15.1, "learning_rate": 2.8110161443494776e-05, "loss": 0.012, "step": 10488 }, { "epoch": 15.29, "learning_rate": 2.7746122190566637e-05, "loss": 0.0104, "step": 10534 }, { "epoch": 15.49, "learning_rate": 2.7382082937638494e-05, "loss": 0.0105, "step": 10580 }, { "epoch": 15.69, "learning_rate": 2.7018043684710354e-05, "loss": 0.0096, "step": 10626 }, { "epoch": 15.88, "learning_rate": 2.6654004431782208e-05, "loss": 0.0126, "step": 10672 }, { "epoch": 16.08, "learning_rate": 2.628996517885407e-05, "loss": 0.012, "step": 10718 }, { "epoch": 16.28, "learning_rate": 2.5925925925925925e-05, "loss": 0.0091, "step": 10764 }, { "epoch": 14.19, "learning_rate": 2.5561886672997785e-05, "loss": 0.0086, "step": 10810 }, { "epoch": 14.38, "learning_rate": 2.5197847420069642e-05, "loss": 0.0095, "step": 10856 }, { "epoch": 14.58, "learning_rate": 2.48338081671415e-05, "loss": 0.0074, "step": 10902 }, { "epoch": 14.78, "learning_rate": 2.446976891421336e-05, "loss": 0.0085, "step": 10948 }, { "epoch": 14.97, "learning_rate": 2.4105729661285217e-05, "loss": 0.0092, "step": 10994 }, { "epoch": 15.17, "learning_rate": 2.3741690408357077e-05, "loss": 0.0078, "step": 11040 }, { "epoch": 15.37, "learning_rate": 2.3377651155428934e-05, "loss": 0.0089, "step": 11086 }, { "epoch": 15.56, "learning_rate": 2.301361190250079e-05, "loss": 0.0083, "step": 11132 }, { "epoch": 15.76, "learning_rate": 2.264957264957265e-05, "loss": 0.0079, "step": 11178 }, { "epoch": 15.96, "learning_rate": 2.2285533396644508e-05, "loss": 0.0091, "step": 11224 }, { "epoch": 16.15, "learning_rate": 2.192149414371637e-05, "loss": 0.0082, "step": 11270 }, { "epoch": 16.35, "learning_rate": 2.1557454890788225e-05, "loss": 0.0084, "step": 11316 }, { "epoch": 16.55, "learning_rate": 2.1193415637860082e-05, "loss": 0.0087, "step": 11362 }, { "epoch": 16.74, "learning_rate": 2.0829376384931943e-05, "loss": 0.0088, "step": 11408 }, { "epoch": 16.94, "learning_rate": 2.04653371320038e-05, "loss": 0.0098, "step": 11454 }, { "epoch": 17.14, "learning_rate": 2.010129787907566e-05, "loss": 0.0087, "step": 11500 }, { "epoch": 15.05, "learning_rate": 1.9737258626147517e-05, "loss": 0.0087, "step": 11546 }, { "epoch": 15.24, "learning_rate": 1.9373219373219374e-05, "loss": 0.0066, "step": 11592 }, { "epoch": 15.44, "learning_rate": 1.900918012029123e-05, "loss": 0.0071, "step": 11638 }, { "epoch": 15.64, "learning_rate": 1.8645140867363088e-05, "loss": 0.0079, "step": 11684 }, { "epoch": 15.83, "learning_rate": 1.828110161443495e-05, "loss": 0.0074, "step": 11730 }, { "epoch": 16.03, "learning_rate": 1.7917062361506805e-05, "loss": 0.0069, "step": 11776 }, { "epoch": 16.23, "learning_rate": 1.7553023108578666e-05, "loss": 0.0063, "step": 11822 }, { "epoch": 16.42, "learning_rate": 1.7188983855650523e-05, "loss": 0.0075, "step": 11868 }, { "epoch": 16.62, "learning_rate": 1.682494460272238e-05, "loss": 0.0074, "step": 11914 }, { "epoch": 16.82, "learning_rate": 1.646090534979424e-05, "loss": 0.0067, "step": 11960 }, { "epoch": 17.01, "learning_rate": 1.6096866096866097e-05, "loss": 0.0064, "step": 12006 }, { "epoch": 17.21, "learning_rate": 1.5732826843937957e-05, "loss": 0.0068, "step": 12052 }, { "epoch": 17.41, "learning_rate": 1.5368787591009814e-05, "loss": 0.0076, "step": 12098 }, { "epoch": 17.6, "learning_rate": 1.5004748338081673e-05, "loss": 0.0069, "step": 12144 }, { "epoch": 17.8, "learning_rate": 1.4640709085153532e-05, "loss": 0.0075, "step": 12190 }, { "epoch": 18.0, "learning_rate": 1.427666983222539e-05, "loss": 0.0072, "step": 12236 }, { "epoch": 18.19, "learning_rate": 1.3912630579297247e-05, "loss": 0.0066, "step": 12282 }, { "epoch": 16.1, "learning_rate": 1.3548591326369106e-05, "loss": 0.0069, "step": 12328 }, { "epoch": 16.3, "learning_rate": 1.3184552073440961e-05, "loss": 0.0052, "step": 12374 }, { "epoch": 16.5, "learning_rate": 1.282051282051282e-05, "loss": 0.0064, "step": 12420 }, { "epoch": 16.69, "learning_rate": 1.245647356758468e-05, "loss": 0.0063, "step": 12466 }, { "epoch": 16.89, "learning_rate": 1.2092434314656539e-05, "loss": 0.006, "step": 12512 }, { "epoch": 17.09, "learning_rate": 1.1728395061728396e-05, "loss": 0.0057, "step": 12558 }, { "epoch": 17.28, "learning_rate": 1.1364355808800253e-05, "loss": 0.0061, "step": 12604 }, { "epoch": 17.48, "learning_rate": 1.1000316555872111e-05, "loss": 0.0055, "step": 12650 }, { "epoch": 17.68, "learning_rate": 1.063627730294397e-05, "loss": 0.0053, "step": 12696 }, { "epoch": 17.87, "learning_rate": 1.0272238050015829e-05, "loss": 0.0051, "step": 12742 }, { "epoch": 18.07, "learning_rate": 9.908198797087687e-06, "loss": 0.0064, "step": 12788 }, { "epoch": 18.26, "learning_rate": 9.544159544159544e-06, "loss": 0.0056, "step": 12834 }, { "epoch": 18.46, "learning_rate": 9.180120291231403e-06, "loss": 0.0062, "step": 12880 }, { "epoch": 18.66, "learning_rate": 8.816081038303262e-06, "loss": 0.0065, "step": 12926 }, { "epoch": 18.85, "learning_rate": 8.452041785375119e-06, "loss": 0.0065, "step": 12972 }, { "epoch": 19.05, "learning_rate": 8.088002532446977e-06, "loss": 0.0064, "step": 13018 }, { "epoch": 19.25, "learning_rate": 7.723963279518836e-06, "loss": 0.0062, "step": 13064 }, { "epoch": 17.16, "learning_rate": 7.359924026590694e-06, "loss": 0.0055, "step": 13110 }, { "epoch": 17.35, "learning_rate": 6.995884773662552e-06, "loss": 0.0046, "step": 13156 }, { "epoch": 17.55, "learning_rate": 6.63184552073441e-06, "loss": 0.0049, "step": 13202 }, { "epoch": 17.75, "learning_rate": 6.267806267806268e-06, "loss": 0.0044, "step": 13248 }, { "epoch": 17.94, "learning_rate": 5.903767014878126e-06, "loss": 0.0057, "step": 13294 }, { "epoch": 18.14, "learning_rate": 5.539727761949985e-06, "loss": 0.0047, "step": 13340 }, { "epoch": 18.34, "learning_rate": 5.175688509021842e-06, "loss": 0.0048, "step": 13386 }, { "epoch": 18.53, "learning_rate": 4.8116492560937e-06, "loss": 0.0051, "step": 13432 }, { "epoch": 18.73, "learning_rate": 4.447610003165559e-06, "loss": 0.0056, "step": 13478 }, { "epoch": 18.93, "learning_rate": 4.083570750237417e-06, "loss": 0.0051, "step": 13524 }, { "epoch": 19.12, "learning_rate": 3.7195314973092754e-06, "loss": 0.0052, "step": 13570 }, { "epoch": 19.32, "learning_rate": 3.3554922443811336e-06, "loss": 0.0068, "step": 13616 }, { "epoch": 19.52, "learning_rate": 2.991452991452992e-06, "loss": 0.0064, "step": 13662 }, { "epoch": 19.71, "learning_rate": 2.6274137385248497e-06, "loss": 0.0055, "step": 13708 }, { "epoch": 19.91, "learning_rate": 2.263374485596708e-06, "loss": 0.0056, "step": 13754 }, { "epoch": 20.11, "learning_rate": 1.899335232668566e-06, "loss": 0.0052, "step": 13800 }, { "epoch": 18.02, "learning_rate": 1.5352959797404244e-06, "loss": 0.0051, "step": 13846 }, { "epoch": 18.21, "learning_rate": 1.1712567268122824e-06, "loss": 0.0052, "step": 13892 }, { "epoch": 18.41, "learning_rate": 8.072174738841407e-07, "loss": 0.0053, "step": 13938 }, { "epoch": 18.61, "learning_rate": 4.4317822095599874e-07, "loss": 0.0043, "step": 13984 }, { "epoch": 18.8, "learning_rate": 7.913896802785692e-08, "loss": 0.0046, "step": 14030 } ], "logging_steps": 46, "max_steps": 14040, "num_train_epochs": 60, "save_steps": 500, "total_flos": 6.139659927158784e+17, "trial_name": null, "trial_params": null }