{ "best_metric": 0.47572794556617737, "best_model_checkpoint": "./qlora-out/checkpoint-78500", "epoch": 2.9827374072555086, "eval_steps": 500, "global_step": 80000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 0.00019999938245325715, "loss": 0.9023, "step": 100 }, { "epoch": 0.01, "learning_rate": 0.00019999724773356797, "loss": 0.8027, "step": 200 }, { "epoch": 0.01, "learning_rate": 0.0001999935882494411, "loss": 0.8041, "step": 300 }, { "epoch": 0.01, "learning_rate": 0.00019998840405667672, "loss": 0.7944, "step": 400 }, { "epoch": 0.02, "learning_rate": 0.00019998169523432365, "loss": 0.81, "step": 500 }, { "epoch": 0.02, "learning_rate": 0.0001999734618846785, "loss": 0.7855, "step": 600 }, { "epoch": 0.03, "learning_rate": 0.00019996370413328385, "loss": 0.7849, "step": 700 }, { "epoch": 0.03, "learning_rate": 0.00019995242212892653, "loss": 0.7564, "step": 800 }, { "epoch": 0.03, "learning_rate": 0.00019993961604363532, "loss": 0.7724, "step": 900 }, { "epoch": 0.04, "learning_rate": 0.00019992528607267815, "loss": 0.7308, "step": 1000 }, { "epoch": 0.04, "eval_loss": 0.7677998542785645, "eval_runtime": 1774.3517, "eval_samples_per_second": 0.305, "eval_steps_per_second": 0.305, "step": 1000 }, { "epoch": 0.04, "learning_rate": 0.0001999094324345594, "loss": 0.7844, "step": 1100 }, { "epoch": 0.04, "learning_rate": 0.00019989205537101633, "loss": 0.7668, "step": 1200 }, { "epoch": 0.05, "learning_rate": 0.00019987315514701553, "loss": 0.7727, "step": 1300 }, { "epoch": 0.05, "learning_rate": 0.00019985273205074878, "loss": 0.7467, "step": 1400 }, { "epoch": 0.06, "learning_rate": 0.00019983078639362883, "loss": 0.7516, "step": 1500 }, { "epoch": 0.06, "learning_rate": 0.00019980731851028445, "loss": 0.7267, "step": 1600 }, { "epoch": 0.06, "learning_rate": 0.0001997823287585554, "loss": 0.7632, "step": 1700 }, { "epoch": 0.07, "learning_rate": 0.000199755817519487, "loss": 0.7392, "step": 1800 }, { "epoch": 0.07, "learning_rate": 0.00019972778519732436, "loss": 0.7528, "step": 1900 }, { "epoch": 0.07, "learning_rate": 0.0001996982322195061, "loss": 0.725, "step": 2000 }, { "epoch": 0.07, "eval_loss": 0.7452704310417175, "eval_runtime": 1787.7554, "eval_samples_per_second": 0.303, "eval_steps_per_second": 0.303, "step": 2000 }, { "epoch": 0.08, "learning_rate": 0.00019966715903665795, "loss": 0.7234, "step": 2100 }, { "epoch": 0.08, "learning_rate": 0.00019963456612258576, "loss": 0.754, "step": 2200 }, { "epoch": 0.09, "learning_rate": 0.00019960045397426841, "loss": 0.7856, "step": 2300 }, { "epoch": 0.09, "learning_rate": 0.00019956482311185006, "loss": 0.7387, "step": 2400 }, { "epoch": 0.09, "learning_rate": 0.00019952767407863245, "loss": 0.7309, "step": 2500 }, { "epoch": 0.1, "learning_rate": 0.00019948900744106633, "loss": 0.7232, "step": 2600 }, { "epoch": 0.1, "learning_rate": 0.00019944882378874316, "loss": 0.7406, "step": 2700 }, { "epoch": 0.1, "learning_rate": 0.0001994071237343858, "loss": 0.7166, "step": 2800 }, { "epoch": 0.11, "learning_rate": 0.00019936390791383936, "loss": 0.7308, "step": 2900 }, { "epoch": 0.11, "learning_rate": 0.00019931917698606143, "loss": 0.7288, "step": 3000 }, { "epoch": 0.11, "eval_loss": 0.7343490123748779, "eval_runtime": 1770.9966, "eval_samples_per_second": 0.306, "eval_steps_per_second": 0.306, "step": 3000 }, { "epoch": 0.12, "learning_rate": 0.00019927293163311206, "loss": 0.7236, "step": 3100 }, { "epoch": 0.12, "learning_rate": 0.00019922517256014337, "loss": 0.716, "step": 3200 }, { "epoch": 0.12, "learning_rate": 0.00019917590049538874, "loss": 0.7564, "step": 3300 }, { "epoch": 0.13, "learning_rate": 0.00019912511619015177, "loss": 0.7082, "step": 3400 }, { "epoch": 0.13, "learning_rate": 0.00019907282041879484, "loss": 0.7103, "step": 3500 }, { "epoch": 0.13, "learning_rate": 0.00019901901397872715, "loss": 0.7457, "step": 3600 }, { "epoch": 0.14, "learning_rate": 0.0001989636976903928, "loss": 0.7076, "step": 3700 }, { "epoch": 0.14, "learning_rate": 0.0001989068723972581, "loss": 0.7217, "step": 3800 }, { "epoch": 0.15, "learning_rate": 0.00019884853896579873, "loss": 0.7175, "step": 3900 }, { "epoch": 0.15, "learning_rate": 0.0001987886982854866, "loss": 0.7083, "step": 4000 }, { "epoch": 0.15, "eval_loss": 0.726176917552948, "eval_runtime": 1765.3933, "eval_samples_per_second": 0.307, "eval_steps_per_second": 0.307, "step": 4000 }, { "epoch": 0.15, "learning_rate": 0.00019872735126877622, "loss": 0.7228, "step": 4100 }, { "epoch": 0.16, "learning_rate": 0.0001986644988510909, "loss": 0.7133, "step": 4200 }, { "epoch": 0.16, "learning_rate": 0.00019860014199080822, "loss": 0.7243, "step": 4300 }, { "epoch": 0.16, "learning_rate": 0.00019853428166924576, "loss": 0.6929, "step": 4400 }, { "epoch": 0.17, "learning_rate": 0.00019846691889064593, "loss": 0.7392, "step": 4500 }, { "epoch": 0.17, "learning_rate": 0.0001983980546821607, "loss": 0.7247, "step": 4600 }, { "epoch": 0.18, "learning_rate": 0.0001983276900938359, "loss": 0.7258, "step": 4700 }, { "epoch": 0.18, "learning_rate": 0.00019825582619859532, "loss": 0.7197, "step": 4800 }, { "epoch": 0.18, "learning_rate": 0.0001981824640922242, "loss": 0.6906, "step": 4900 }, { "epoch": 0.19, "learning_rate": 0.00019810760489335266, "loss": 0.7274, "step": 5000 }, { "epoch": 0.19, "eval_loss": 0.7171670794487, "eval_runtime": 1812.7597, "eval_samples_per_second": 0.299, "eval_steps_per_second": 0.299, "step": 5000 }, { "epoch": 0.19, "learning_rate": 0.0001980312497434385, "loss": 0.7105, "step": 5100 }, { "epoch": 0.19, "learning_rate": 0.00019795339980675002, "loss": 0.7091, "step": 5200 }, { "epoch": 0.2, "learning_rate": 0.00019787405627034804, "loss": 0.7102, "step": 5300 }, { "epoch": 0.2, "learning_rate": 0.0001977932203440678, "loss": 0.7314, "step": 5400 }, { "epoch": 0.21, "learning_rate": 0.00019771089326050075, "loss": 0.6945, "step": 5500 }, { "epoch": 0.21, "learning_rate": 0.0001976270762749755, "loss": 0.7048, "step": 5600 }, { "epoch": 0.21, "learning_rate": 0.00019754177066553882, "loss": 0.6963, "step": 5700 }, { "epoch": 0.22, "learning_rate": 0.00019745497773293613, "loss": 0.711, "step": 5800 }, { "epoch": 0.22, "learning_rate": 0.0001973666988005916, "loss": 0.7017, "step": 5900 }, { "epoch": 0.22, "learning_rate": 0.00019727693521458806, "loss": 0.7287, "step": 6000 }, { "epoch": 0.22, "eval_loss": 0.710155725479126, "eval_runtime": 1786.7467, "eval_samples_per_second": 0.303, "eval_steps_per_second": 0.303, "step": 6000 }, { "epoch": 0.23, "learning_rate": 0.00019718568834364638, "loss": 0.6894, "step": 6100 }, { "epoch": 0.23, "learning_rate": 0.00019709295957910476, "loss": 0.7061, "step": 6200 }, { "epoch": 0.23, "learning_rate": 0.00019699875033489728, "loss": 0.7063, "step": 6300 }, { "epoch": 0.24, "learning_rate": 0.00019690306204753254, "loss": 0.6872, "step": 6400 }, { "epoch": 0.24, "learning_rate": 0.0001968058961760717, "loss": 0.7095, "step": 6500 }, { "epoch": 0.25, "learning_rate": 0.00019670725420210618, "loss": 0.695, "step": 6600 }, { "epoch": 0.25, "learning_rate": 0.0001966071376297351, "loss": 0.674, "step": 6700 }, { "epoch": 0.25, "learning_rate": 0.00019650554798554236, "loss": 0.7225, "step": 6800 }, { "epoch": 0.26, "learning_rate": 0.00019640248681857342, "loss": 0.6845, "step": 6900 }, { "epoch": 0.26, "learning_rate": 0.00019629795570031149, "loss": 0.6891, "step": 7000 }, { "epoch": 0.26, "eval_loss": 0.703677773475647, "eval_runtime": 1767.3593, "eval_samples_per_second": 0.307, "eval_steps_per_second": 0.307, "step": 7000 }, { "epoch": 0.26, "learning_rate": 0.00019619195622465379, "loss": 0.6962, "step": 7100 }, { "epoch": 0.27, "learning_rate": 0.0001960844900078871, "loss": 0.6779, "step": 7200 }, { "epoch": 0.27, "learning_rate": 0.00019597555868866318, "loss": 0.7354, "step": 7300 }, { "epoch": 0.28, "learning_rate": 0.00019586516392797374, "loss": 0.7196, "step": 7400 }, { "epoch": 0.28, "learning_rate": 0.0001957533074091252, "loss": 0.682, "step": 7500 }, { "epoch": 0.28, "learning_rate": 0.0001956399908377129, "loss": 0.6938, "step": 7600 }, { "epoch": 0.29, "learning_rate": 0.0001955252159415952, "loss": 0.6912, "step": 7700 }, { "epoch": 0.29, "learning_rate": 0.00019540898447086705, "loss": 0.7048, "step": 7800 }, { "epoch": 0.29, "learning_rate": 0.00019529129819783334, "loss": 0.7007, "step": 7900 }, { "epoch": 0.3, "learning_rate": 0.00019517215891698192, "loss": 0.6969, "step": 8000 }, { "epoch": 0.3, "eval_loss": 0.6973471641540527, "eval_runtime": 1793.2355, "eval_samples_per_second": 0.302, "eval_steps_per_second": 0.302, "step": 8000 }, { "epoch": 0.3, "learning_rate": 0.00019505156844495619, "loss": 0.6894, "step": 8100 }, { "epoch": 0.31, "learning_rate": 0.00019492952862052733, "loss": 0.6971, "step": 8200 }, { "epoch": 0.31, "learning_rate": 0.0001948060413045665, "loss": 0.7135, "step": 8300 }, { "epoch": 0.31, "learning_rate": 0.0001946811083800161, "loss": 0.6794, "step": 8400 }, { "epoch": 0.32, "learning_rate": 0.0001945547317518614, "loss": 0.7086, "step": 8500 }, { "epoch": 0.32, "learning_rate": 0.00019442691334710136, "loss": 0.7042, "step": 8600 }, { "epoch": 0.32, "learning_rate": 0.00019429765511471916, "loss": 0.6822, "step": 8700 }, { "epoch": 0.33, "learning_rate": 0.0001941669590256526, "loss": 0.7016, "step": 8800 }, { "epoch": 0.33, "learning_rate": 0.00019403482707276406, "loss": 0.705, "step": 8900 }, { "epoch": 0.34, "learning_rate": 0.00019390126127080999, "loss": 0.698, "step": 9000 }, { "epoch": 0.34, "eval_loss": 0.6910382509231567, "eval_runtime": 1782.1661, "eval_samples_per_second": 0.304, "eval_steps_per_second": 0.304, "step": 9000 }, { "epoch": 0.34, "learning_rate": 0.00019376626365641026, "loss": 0.6926, "step": 9100 }, { "epoch": 0.34, "learning_rate": 0.0001936298362880172, "loss": 0.6871, "step": 9200 }, { "epoch": 0.35, "learning_rate": 0.00019349198124588403, "loss": 0.6894, "step": 9300 }, { "epoch": 0.35, "learning_rate": 0.00019335270063203325, "loss": 0.6894, "step": 9400 }, { "epoch": 0.35, "learning_rate": 0.00019321199657022464, "loss": 0.7057, "step": 9500 }, { "epoch": 0.36, "learning_rate": 0.00019306987120592265, "loss": 0.6682, "step": 9600 }, { "epoch": 0.36, "learning_rate": 0.00019292632670626401, "loss": 0.6931, "step": 9700 }, { "epoch": 0.37, "learning_rate": 0.00019278136526002443, "loss": 0.7244, "step": 9800 }, { "epoch": 0.37, "learning_rate": 0.0001926349890775853, "loss": 0.6881, "step": 9900 }, { "epoch": 0.37, "learning_rate": 0.00019248720039090006, "loss": 0.6839, "step": 10000 }, { "epoch": 0.37, "eval_loss": 0.6857322454452515, "eval_runtime": 1760.8664, "eval_samples_per_second": 0.308, "eval_steps_per_second": 0.308, "step": 10000 }, { "epoch": 0.38, "learning_rate": 0.00019233800145346006, "loss": 0.6917, "step": 10100 }, { "epoch": 0.38, "learning_rate": 0.0001921873945402602, "loss": 0.6672, "step": 10200 }, { "epoch": 0.38, "learning_rate": 0.00019203538194776442, "loss": 0.6873, "step": 10300 }, { "epoch": 0.39, "learning_rate": 0.00019188196599387043, "loss": 0.6733, "step": 10400 }, { "epoch": 0.39, "learning_rate": 0.00019172714901787453, "loss": 0.706, "step": 10500 }, { "epoch": 0.4, "learning_rate": 0.00019157093338043583, "loss": 0.6848, "step": 10600 }, { "epoch": 0.4, "learning_rate": 0.00019141332146354042, "loss": 0.6728, "step": 10700 }, { "epoch": 0.4, "learning_rate": 0.00019125431567046494, "loss": 0.686, "step": 10800 }, { "epoch": 0.41, "learning_rate": 0.00019109391842573987, "loss": 0.6992, "step": 10900 }, { "epoch": 0.41, "learning_rate": 0.00019093213217511265, "loss": 0.6675, "step": 11000 }, { "epoch": 0.41, "eval_loss": 0.6794907450675964, "eval_runtime": 1782.5413, "eval_samples_per_second": 0.304, "eval_steps_per_second": 0.304, "step": 11000 }, { "epoch": 0.41, "learning_rate": 0.0001907689593855104, "loss": 0.6721, "step": 11100 }, { "epoch": 0.42, "learning_rate": 0.00019060440254500228, "loss": 0.6353, "step": 11200 }, { "epoch": 0.42, "learning_rate": 0.00019043846416276155, "loss": 0.6449, "step": 11300 }, { "epoch": 0.43, "learning_rate": 0.0001902711467690272, "loss": 0.6451, "step": 11400 }, { "epoch": 0.43, "learning_rate": 0.00019010245291506569, "loss": 0.6421, "step": 11500 }, { "epoch": 0.43, "learning_rate": 0.00018993238517313167, "loss": 0.6352, "step": 11600 }, { "epoch": 0.44, "learning_rate": 0.0001897609461364289, "loss": 0.6371, "step": 11700 }, { "epoch": 0.44, "learning_rate": 0.00018958813841907083, "loss": 0.623, "step": 11800 }, { "epoch": 0.44, "learning_rate": 0.00018941396465604063, "loss": 0.6533, "step": 11900 }, { "epoch": 0.45, "learning_rate": 0.00018923842750315095, "loss": 0.6371, "step": 12000 }, { "epoch": 0.45, "eval_loss": 0.6759930849075317, "eval_runtime": 1304.1351, "eval_samples_per_second": 0.416, "eval_steps_per_second": 0.416, "step": 12000 }, { "epoch": 0.45, "learning_rate": 0.00018906152963700358, "loss": 0.6664, "step": 12100 }, { "epoch": 0.45, "learning_rate": 0.00018888327375494847, "loss": 0.6644, "step": 12200 }, { "epoch": 0.46, "learning_rate": 0.00018870366257504274, "loss": 0.623, "step": 12300 }, { "epoch": 0.46, "learning_rate": 0.00018852269883600918, "loss": 0.6242, "step": 12400 }, { "epoch": 0.47, "learning_rate": 0.00018834038529719446, "loss": 0.6318, "step": 12500 }, { "epoch": 0.47, "learning_rate": 0.0001881567247385271, "loss": 0.6359, "step": 12600 }, { "epoch": 0.47, "learning_rate": 0.00018797171996047505, "loss": 0.6449, "step": 12700 }, { "epoch": 0.48, "learning_rate": 0.00018778537378400304, "loss": 0.6434, "step": 12800 }, { "epoch": 0.48, "learning_rate": 0.00018759768905052946, "loss": 0.6323, "step": 12900 }, { "epoch": 0.48, "learning_rate": 0.00018740866862188317, "loss": 0.6377, "step": 13000 }, { "epoch": 0.48, "eval_loss": 0.6696639060974121, "eval_runtime": 1238.6847, "eval_samples_per_second": 0.438, "eval_steps_per_second": 0.438, "step": 13000 }, { "epoch": 0.49, "learning_rate": 0.0001872183153802598, "loss": 0.6232, "step": 13100 }, { "epoch": 0.49, "learning_rate": 0.00018702663222817774, "loss": 0.6236, "step": 13200 }, { "epoch": 0.5, "learning_rate": 0.00018683362208843395, "loss": 0.6331, "step": 13300 }, { "epoch": 0.5, "learning_rate": 0.00018663928790405945, "loss": 0.6528, "step": 13400 }, { "epoch": 0.5, "learning_rate": 0.00018644363263827426, "loss": 0.6362, "step": 13500 }, { "epoch": 0.51, "learning_rate": 0.00018624665927444248, "loss": 0.6308, "step": 13600 }, { "epoch": 0.51, "learning_rate": 0.00018604837081602656, "loss": 0.6107, "step": 13700 }, { "epoch": 0.51, "learning_rate": 0.00018584877028654154, "loss": 0.6418, "step": 13800 }, { "epoch": 0.52, "learning_rate": 0.00018564786072950917, "loss": 0.6276, "step": 13900 }, { "epoch": 0.52, "learning_rate": 0.00018544564520841118, "loss": 0.6296, "step": 14000 }, { "epoch": 0.52, "eval_loss": 0.6651941537857056, "eval_runtime": 1277.2835, "eval_samples_per_second": 0.424, "eval_steps_per_second": 0.424, "step": 14000 }, { "epoch": 0.53, "learning_rate": 0.00018524212680664286, "loss": 0.636, "step": 14100 }, { "epoch": 0.53, "learning_rate": 0.00018503730862746574, "loss": 0.6643, "step": 14200 }, { "epoch": 0.53, "learning_rate": 0.00018483119379396058, "loss": 0.6282, "step": 14300 }, { "epoch": 0.54, "learning_rate": 0.0001846237854489796, "loss": 0.6381, "step": 14400 }, { "epoch": 0.54, "learning_rate": 0.00018441508675509844, "loss": 0.6692, "step": 14500 }, { "epoch": 0.54, "learning_rate": 0.00018420510089456823, "loss": 0.6478, "step": 14600 }, { "epoch": 0.55, "learning_rate": 0.00018399383106926676, "loss": 0.6293, "step": 14700 }, { "epoch": 0.55, "learning_rate": 0.00018378128050064988, "loss": 0.6406, "step": 14800 }, { "epoch": 0.56, "learning_rate": 0.0001835674524297023, "loss": 0.6407, "step": 14900 }, { "epoch": 0.56, "learning_rate": 0.0001833523501168881, "loss": 0.633, "step": 15000 }, { "epoch": 0.56, "eval_loss": 0.6601429581642151, "eval_runtime": 1275.6071, "eval_samples_per_second": 0.425, "eval_steps_per_second": 0.425, "step": 15000 }, { "epoch": 0.56, "learning_rate": 0.00018313597684210115, "loss": 0.6198, "step": 15100 }, { "epoch": 0.57, "learning_rate": 0.00018291833590461498, "loss": 0.6345, "step": 15200 }, { "epoch": 0.57, "learning_rate": 0.00018269943062303257, "loss": 0.6554, "step": 15300 }, { "epoch": 0.57, "learning_rate": 0.00018247926433523562, "loss": 0.6151, "step": 15400 }, { "epoch": 0.58, "learning_rate": 0.00018225784039833386, "loss": 0.6331, "step": 15500 }, { "epoch": 0.58, "learning_rate": 0.0001820351621886136, "loss": 0.6256, "step": 15600 }, { "epoch": 0.59, "learning_rate": 0.0001818112331014865, "loss": 0.6263, "step": 15700 }, { "epoch": 0.59, "learning_rate": 0.00018158605655143757, "loss": 0.6015, "step": 15800 }, { "epoch": 0.59, "learning_rate": 0.00018135963597197327, "loss": 0.6144, "step": 15900 }, { "epoch": 0.6, "learning_rate": 0.00018113197481556912, "loss": 0.613, "step": 16000 }, { "epoch": 0.6, "eval_loss": 0.6547831892967224, "eval_runtime": 1305.9645, "eval_samples_per_second": 0.415, "eval_steps_per_second": 0.415, "step": 16000 }, { "epoch": 0.6, "learning_rate": 0.00018090307655361701, "loss": 0.6354, "step": 16100 }, { "epoch": 0.6, "learning_rate": 0.00018067294467637228, "loss": 0.6349, "step": 16200 }, { "epoch": 0.61, "learning_rate": 0.00018044158269290054, "loss": 0.6127, "step": 16300 }, { "epoch": 0.61, "learning_rate": 0.00018020899413102412, "loss": 0.5977, "step": 16400 }, { "epoch": 0.62, "learning_rate": 0.00017997518253726834, "loss": 0.6213, "step": 16500 }, { "epoch": 0.62, "learning_rate": 0.00017974015147680734, "loss": 0.6168, "step": 16600 }, { "epoch": 0.62, "learning_rate": 0.00017950390453340978, "loss": 0.5978, "step": 16700 }, { "epoch": 0.63, "learning_rate": 0.0001792664453093842, "loss": 0.6201, "step": 16800 }, { "epoch": 0.63, "learning_rate": 0.000179027777425524, "loss": 0.6141, "step": 16900 }, { "epoch": 0.63, "learning_rate": 0.00017878790452105245, "loss": 0.6135, "step": 17000 }, { "epoch": 0.63, "eval_loss": 0.6480616927146912, "eval_runtime": 1347.9883, "eval_samples_per_second": 0.402, "eval_steps_per_second": 0.402, "step": 17000 }, { "epoch": 0.64, "learning_rate": 0.0001785468302535669, "loss": 0.6363, "step": 17100 }, { "epoch": 0.64, "learning_rate": 0.00017830455829898317, "loss": 0.6076, "step": 17200 }, { "epoch": 0.65, "learning_rate": 0.00017806109235147963, "loss": 0.609, "step": 17300 }, { "epoch": 0.65, "learning_rate": 0.00017781643612344058, "loss": 0.6044, "step": 17400 }, { "epoch": 0.65, "learning_rate": 0.00017757059334539994, "loss": 0.6262, "step": 17500 }, { "epoch": 0.66, "learning_rate": 0.00017732356776598403, "loss": 0.6195, "step": 17600 }, { "epoch": 0.66, "learning_rate": 0.0001770753631518548, "loss": 0.6328, "step": 17700 }, { "epoch": 0.66, "learning_rate": 0.000176825983287652, "loss": 0.6028, "step": 17800 }, { "epoch": 0.67, "learning_rate": 0.0001765754319759358, "loss": 0.6159, "step": 17900 }, { "epoch": 0.67, "learning_rate": 0.0001763237130371287, "loss": 0.6169, "step": 18000 }, { "epoch": 0.67, "eval_loss": 0.6444052457809448, "eval_runtime": 1304.3701, "eval_samples_per_second": 0.416, "eval_steps_per_second": 0.416, "step": 18000 }, { "epoch": 0.67, "learning_rate": 0.0001760708303094572, "loss": 0.6183, "step": 18100 }, { "epoch": 0.68, "learning_rate": 0.00017581678764889324, "loss": 0.6116, "step": 18200 }, { "epoch": 0.68, "learning_rate": 0.00017556158892909567, "loss": 0.6406, "step": 18300 }, { "epoch": 0.69, "learning_rate": 0.00017530523804135085, "loss": 0.6223, "step": 18400 }, { "epoch": 0.69, "learning_rate": 0.00017504773889451361, "loss": 0.628, "step": 18500 }, { "epoch": 0.69, "learning_rate": 0.00017478909541494736, "loss": 0.6173, "step": 18600 }, { "epoch": 0.7, "learning_rate": 0.00017452931154646444, "loss": 0.61, "step": 18700 }, { "epoch": 0.7, "learning_rate": 0.00017426839125026598, "loss": 0.5959, "step": 18800 }, { "epoch": 0.7, "learning_rate": 0.00017400633850488128, "loss": 0.5979, "step": 18900 }, { "epoch": 0.71, "learning_rate": 0.00017374315730610745, "loss": 0.6161, "step": 19000 }, { "epoch": 0.71, "eval_loss": 0.6378119587898254, "eval_runtime": 1283.5987, "eval_samples_per_second": 0.422, "eval_steps_per_second": 0.422, "step": 19000 }, { "epoch": 0.71, "learning_rate": 0.00017347885166694825, "loss": 0.6213, "step": 19100 }, { "epoch": 0.72, "learning_rate": 0.00017321342561755297, "loss": 0.6217, "step": 19200 }, { "epoch": 0.72, "learning_rate": 0.00017294688320515506, "loss": 0.6127, "step": 19300 }, { "epoch": 0.72, "learning_rate": 0.00017267922849401024, "loss": 0.6145, "step": 19400 }, { "epoch": 0.73, "learning_rate": 0.00017241046556533472, "loss": 0.5936, "step": 19500 }, { "epoch": 0.73, "learning_rate": 0.0001721405985172428, "loss": 0.6273, "step": 19600 }, { "epoch": 0.73, "learning_rate": 0.0001718696314646846, "loss": 0.6059, "step": 19700 }, { "epoch": 0.74, "learning_rate": 0.000171597568539383, "loss": 0.5934, "step": 19800 }, { "epoch": 0.74, "learning_rate": 0.000171324413889771, "loss": 0.6243, "step": 19900 }, { "epoch": 0.75, "learning_rate": 0.00017105017168092808, "loss": 0.6164, "step": 20000 }, { "epoch": 0.75, "eval_loss": 0.6324757933616638, "eval_runtime": 1266.6769, "eval_samples_per_second": 0.428, "eval_steps_per_second": 0.428, "step": 20000 }, { "epoch": 0.75, "learning_rate": 0.0001707748460945171, "loss": 0.5953, "step": 20100 }, { "epoch": 0.75, "learning_rate": 0.0001704984413287202, "loss": 0.6329, "step": 20200 }, { "epoch": 0.76, "learning_rate": 0.00017022096159817493, "loss": 0.6227, "step": 20300 }, { "epoch": 0.76, "learning_rate": 0.00016994241113391003, "loss": 0.6022, "step": 20400 }, { "epoch": 0.76, "learning_rate": 0.0001696627941832808, "loss": 0.604, "step": 20500 }, { "epoch": 0.77, "learning_rate": 0.0001693821150099044, "loss": 0.6101, "step": 20600 }, { "epoch": 0.77, "learning_rate": 0.00016910037789359485, "loss": 0.6242, "step": 20700 }, { "epoch": 0.78, "learning_rate": 0.00016881758713029776, "loss": 0.6096, "step": 20800 }, { "epoch": 0.78, "learning_rate": 0.0001685337470320248, "loss": 0.5948, "step": 20900 }, { "epoch": 0.78, "learning_rate": 0.0001682488619267879, "loss": 0.5911, "step": 21000 }, { "epoch": 0.78, "eval_loss": 0.6282580494880676, "eval_runtime": 1313.1215, "eval_samples_per_second": 0.413, "eval_steps_per_second": 0.413, "step": 21000 }, { "epoch": 0.79, "learning_rate": 0.0001679629361585335, "loss": 0.5716, "step": 21100 }, { "epoch": 0.79, "learning_rate": 0.00016767597408707594, "loss": 0.5957, "step": 21200 }, { "epoch": 0.79, "learning_rate": 0.00016738798008803128, "loss": 0.6308, "step": 21300 }, { "epoch": 0.8, "learning_rate": 0.00016709895855275048, "loss": 0.5891, "step": 21400 }, { "epoch": 0.8, "learning_rate": 0.00016680891388825243, "loss": 0.6104, "step": 21500 }, { "epoch": 0.81, "learning_rate": 0.00016651785051715674, "loss": 0.6344, "step": 21600 }, { "epoch": 0.81, "learning_rate": 0.0001662257728776163, "loss": 0.604, "step": 21700 }, { "epoch": 0.81, "learning_rate": 0.0001659326854232497, "loss": 0.6066, "step": 21800 }, { "epoch": 0.82, "learning_rate": 0.0001656385926230732, "loss": 0.6324, "step": 21900 }, { "epoch": 0.82, "learning_rate": 0.00016534349896143264, "loss": 0.5819, "step": 22000 }, { "epoch": 0.82, "eval_loss": 0.6218891143798828, "eval_runtime": 1296.6038, "eval_samples_per_second": 0.418, "eval_steps_per_second": 0.418, "step": 22000 }, { "epoch": 0.82, "learning_rate": 0.00016504740893793512, "loss": 0.6145, "step": 22100 }, { "epoch": 0.83, "learning_rate": 0.00016475032706738023, "loss": 0.6109, "step": 22200 }, { "epoch": 0.83, "learning_rate": 0.0001644522578796914, "loss": 0.608, "step": 22300 }, { "epoch": 0.84, "learning_rate": 0.0001641532059198466, "loss": 0.565, "step": 22400 }, { "epoch": 0.84, "learning_rate": 0.00016385317574780942, "loss": 0.6139, "step": 22500 }, { "epoch": 0.84, "learning_rate": 0.000163552171938459, "loss": 0.5888, "step": 22600 }, { "epoch": 0.85, "learning_rate": 0.00016325019908152078, "loss": 0.6065, "step": 22700 }, { "epoch": 0.85, "learning_rate": 0.0001629472617814962, "loss": 0.5959, "step": 22800 }, { "epoch": 0.85, "learning_rate": 0.00016264336465759258, "loss": 0.5918, "step": 22900 }, { "epoch": 0.86, "learning_rate": 0.0001623385123436528, "loss": 0.6083, "step": 23000 }, { "epoch": 0.86, "eval_loss": 0.6180054545402527, "eval_runtime": 1278.5639, "eval_samples_per_second": 0.424, "eval_steps_per_second": 0.424, "step": 23000 }, { "epoch": 0.86, "learning_rate": 0.0001620327094880844, "loss": 0.5795, "step": 23100 }, { "epoch": 0.86, "learning_rate": 0.00016172596075378893, "loss": 0.6025, "step": 23200 }, { "epoch": 0.87, "learning_rate": 0.00016141827081809075, "loss": 0.5669, "step": 23300 }, { "epoch": 0.87, "learning_rate": 0.00016110964437266568, "loss": 0.6172, "step": 23400 }, { "epoch": 0.88, "learning_rate": 0.00016080008612346955, "loss": 0.5899, "step": 23500 }, { "epoch": 0.88, "learning_rate": 0.00016048960079066636, "loss": 0.5889, "step": 23600 }, { "epoch": 0.88, "learning_rate": 0.00016017819310855632, "loss": 0.5893, "step": 23700 }, { "epoch": 0.89, "learning_rate": 0.00015986586782550376, "loss": 0.6363, "step": 23800 }, { "epoch": 0.89, "learning_rate": 0.00015955262970386458, "loss": 0.5876, "step": 23900 }, { "epoch": 0.89, "learning_rate": 0.00015923848351991372, "loss": 0.5964, "step": 24000 }, { "epoch": 0.89, "eval_loss": 0.6122664213180542, "eval_runtime": 1255.2341, "eval_samples_per_second": 0.432, "eval_steps_per_second": 0.432, "step": 24000 }, { "epoch": 0.9, "learning_rate": 0.00015892343406377225, "loss": 0.5943, "step": 24100 }, { "epoch": 0.9, "learning_rate": 0.00015860748613933455, "loss": 0.6008, "step": 24200 }, { "epoch": 0.91, "learning_rate": 0.00015829064456419477, "loss": 0.6123, "step": 24300 }, { "epoch": 0.91, "learning_rate": 0.00015797291416957355, "loss": 0.5819, "step": 24400 }, { "epoch": 0.91, "learning_rate": 0.00015765429980024425, "loss": 0.5731, "step": 24500 }, { "epoch": 0.92, "learning_rate": 0.00015733480631445926, "loss": 0.593, "step": 24600 }, { "epoch": 0.92, "learning_rate": 0.00015701443858387562, "loss": 0.5764, "step": 24700 }, { "epoch": 0.92, "learning_rate": 0.00015669320149348104, "loss": 0.6037, "step": 24800 }, { "epoch": 0.93, "learning_rate": 0.0001563710999415193, "loss": 0.5958, "step": 24900 }, { "epoch": 0.93, "learning_rate": 0.00015604813883941535, "loss": 0.6186, "step": 25000 }, { "epoch": 0.93, "eval_loss": 0.6086174249649048, "eval_runtime": 1260.3923, "eval_samples_per_second": 0.43, "eval_steps_per_second": 0.43, "step": 25000 }, { "epoch": 0.94, "learning_rate": 0.00015572432311170096, "loss": 0.597, "step": 25100 }, { "epoch": 0.94, "learning_rate": 0.00015539965769593894, "loss": 0.5657, "step": 25200 }, { "epoch": 0.94, "learning_rate": 0.0001550741475426484, "loss": 0.6081, "step": 25300 }, { "epoch": 0.95, "learning_rate": 0.00015474779761522894, "loss": 0.5957, "step": 25400 }, { "epoch": 0.95, "learning_rate": 0.00015442061288988525, "loss": 0.6032, "step": 25500 }, { "epoch": 0.95, "learning_rate": 0.00015409259835555089, "loss": 0.5662, "step": 25600 }, { "epoch": 0.96, "learning_rate": 0.00015376375901381256, "loss": 0.5607, "step": 25700 }, { "epoch": 0.96, "learning_rate": 0.00015343409987883354, "loss": 0.5727, "step": 25800 }, { "epoch": 0.97, "learning_rate": 0.00015310362597727747, "loss": 0.5762, "step": 25900 }, { "epoch": 0.97, "learning_rate": 0.00015277234234823154, "loss": 0.5841, "step": 26000 }, { "epoch": 0.97, "eval_loss": 0.6026987433433533, "eval_runtime": 1292.1515, "eval_samples_per_second": 0.419, "eval_steps_per_second": 0.419, "step": 26000 }, { "epoch": 0.97, "learning_rate": 0.00015244025404312974, "loss": 0.6015, "step": 26100 }, { "epoch": 0.98, "learning_rate": 0.00015210736612567588, "loss": 0.5914, "step": 26200 }, { "epoch": 0.98, "learning_rate": 0.00015177368367176616, "loss": 0.5799, "step": 26300 }, { "epoch": 0.98, "learning_rate": 0.00015143921176941205, "loss": 0.6037, "step": 26400 }, { "epoch": 0.99, "learning_rate": 0.00015110395551866255, "loss": 0.5876, "step": 26500 }, { "epoch": 0.99, "learning_rate": 0.0001507679200315264, "loss": 0.5973, "step": 26600 }, { "epoch": 1.0, "learning_rate": 0.00015043111043189423, "loss": 0.5957, "step": 26700 }, { "epoch": 1.0, "learning_rate": 0.00015009353185546046, "loss": 0.5696, "step": 26800 }, { "epoch": 1.0, "learning_rate": 0.00014975518944964478, "loss": 0.5523, "step": 26900 }, { "epoch": 1.01, "learning_rate": 0.0001494160883735139, "loss": 0.5144, "step": 27000 }, { "epoch": 1.01, "eval_loss": 0.5985096096992493, "eval_runtime": 1314.8131, "eval_samples_per_second": 0.412, "eval_steps_per_second": 0.412, "step": 27000 }, { "epoch": 1.01, "learning_rate": 0.00014907623379770263, "loss": 0.5743, "step": 27100 }, { "epoch": 1.01, "learning_rate": 0.00014873563090433547, "loss": 0.5095, "step": 27200 }, { "epoch": 1.02, "learning_rate": 0.00014839428488694706, "loss": 0.5391, "step": 27300 }, { "epoch": 1.02, "learning_rate": 0.00014805220095040334, "loss": 0.5532, "step": 27400 }, { "epoch": 1.03, "learning_rate": 0.00014770938431082212, "loss": 0.536, "step": 27500 }, { "epoch": 1.03, "learning_rate": 0.00014736584019549342, "loss": 0.5204, "step": 27600 }, { "epoch": 1.03, "learning_rate": 0.00014702157384279997, "loss": 0.5026, "step": 27700 }, { "epoch": 1.04, "learning_rate": 0.0001466765905021371, "loss": 0.5319, "step": 27800 }, { "epoch": 1.04, "learning_rate": 0.00014633089543383295, "loss": 0.5112, "step": 27900 }, { "epoch": 1.04, "learning_rate": 0.00014598449390906804, "loss": 0.5146, "step": 28000 }, { "epoch": 1.04, "eval_loss": 0.5959522128105164, "eval_runtime": 1288.6066, "eval_samples_per_second": 0.421, "eval_steps_per_second": 0.421, "step": 28000 }, { "epoch": 1.05, "learning_rate": 0.00014563739120979497, "loss": 0.5262, "step": 28100 }, { "epoch": 1.05, "learning_rate": 0.00014528959262865798, "loss": 0.5082, "step": 28200 }, { "epoch": 1.06, "learning_rate": 0.00014494110346891206, "loss": 0.5094, "step": 28300 }, { "epoch": 1.06, "learning_rate": 0.00014459192904434226, "loss": 0.5012, "step": 28400 }, { "epoch": 1.06, "learning_rate": 0.0001442420746791826, "loss": 0.4946, "step": 28500 }, { "epoch": 1.07, "learning_rate": 0.00014389154570803477, "loss": 0.5138, "step": 28600 }, { "epoch": 1.07, "learning_rate": 0.000143540347475787, "loss": 0.5082, "step": 28700 }, { "epoch": 1.07, "learning_rate": 0.0001431884853375325, "loss": 0.4842, "step": 28800 }, { "epoch": 1.08, "learning_rate": 0.0001428359646584876, "loss": 0.5143, "step": 28900 }, { "epoch": 1.08, "learning_rate": 0.00014248279081391022, "loss": 0.5029, "step": 29000 }, { "epoch": 1.08, "eval_loss": 0.5910914540290833, "eval_runtime": 1278.8257, "eval_samples_per_second": 0.424, "eval_steps_per_second": 0.424, "step": 29000 }, { "epoch": 1.08, "learning_rate": 0.00014212896918901774, "loss": 0.5003, "step": 29100 }, { "epoch": 1.09, "learning_rate": 0.00014177450517890503, "loss": 0.5102, "step": 29200 }, { "epoch": 1.09, "learning_rate": 0.0001414194041884619, "loss": 0.524, "step": 29300 }, { "epoch": 1.1, "learning_rate": 0.0001410636716322911, "loss": 0.5168, "step": 29400 }, { "epoch": 1.1, "learning_rate": 0.0001407073129346254, "loss": 0.514, "step": 29500 }, { "epoch": 1.1, "learning_rate": 0.00014035033352924502, "loss": 0.5084, "step": 29600 }, { "epoch": 1.11, "learning_rate": 0.0001399927388593948, "loss": 0.5203, "step": 29700 }, { "epoch": 1.11, "learning_rate": 0.00013963453437770119, "loss": 0.5226, "step": 29800 }, { "epoch": 1.11, "learning_rate": 0.000139275725546089, "loss": 0.5055, "step": 29900 }, { "epoch": 1.12, "learning_rate": 0.00013891631783569838, "loss": 0.5303, "step": 30000 }, { "epoch": 1.12, "eval_loss": 0.5869857668876648, "eval_runtime": 1272.0282, "eval_samples_per_second": 0.426, "eval_steps_per_second": 0.426, "step": 30000 }, { "epoch": 1.12, "learning_rate": 0.00013855631672680106, "loss": 0.5243, "step": 30100 }, { "epoch": 1.13, "learning_rate": 0.00013819572770871702, "loss": 0.5148, "step": 30200 }, { "epoch": 1.13, "learning_rate": 0.00013783455627973062, "loss": 0.522, "step": 30300 }, { "epoch": 1.13, "learning_rate": 0.00013747280794700707, "loss": 0.5289, "step": 30400 }, { "epoch": 1.14, "learning_rate": 0.00013711048822650802, "loss": 0.4996, "step": 30500 }, { "epoch": 1.14, "learning_rate": 0.00013674760264290785, "loss": 0.5099, "step": 30600 }, { "epoch": 1.14, "learning_rate": 0.0001363841567295091, "loss": 0.5219, "step": 30700 }, { "epoch": 1.15, "learning_rate": 0.00013602015602815837, "loss": 0.5297, "step": 30800 }, { "epoch": 1.15, "learning_rate": 0.00013565560608916165, "loss": 0.5029, "step": 30900 }, { "epoch": 1.16, "learning_rate": 0.0001352905124711998, "loss": 0.5266, "step": 31000 }, { "epoch": 1.16, "eval_loss": 0.5811149477958679, "eval_runtime": 1300.1475, "eval_samples_per_second": 0.417, "eval_steps_per_second": 0.417, "step": 31000 }, { "epoch": 1.16, "learning_rate": 0.00013492488074124366, "loss": 0.5295, "step": 31100 }, { "epoch": 1.16, "learning_rate": 0.00013455871647446923, "loss": 0.539, "step": 31200 }, { "epoch": 1.17, "learning_rate": 0.00013419202525417277, "loss": 0.5217, "step": 31300 }, { "epoch": 1.17, "learning_rate": 0.0001338248126716854, "loss": 0.5197, "step": 31400 }, { "epoch": 1.17, "learning_rate": 0.00013345708432628824, "loss": 0.4991, "step": 31500 }, { "epoch": 1.18, "learning_rate": 0.00013308884582512647, "loss": 0.5239, "step": 31600 }, { "epoch": 1.18, "learning_rate": 0.00013272010278312453, "loss": 0.4899, "step": 31700 }, { "epoch": 1.19, "learning_rate": 0.00013235086082289977, "loss": 0.5088, "step": 31800 }, { "epoch": 1.19, "learning_rate": 0.00013198112557467732, "loss": 0.5497, "step": 31900 }, { "epoch": 1.19, "learning_rate": 0.00013161090267620396, "loss": 0.5024, "step": 32000 }, { "epoch": 1.19, "eval_loss": 0.5758991241455078, "eval_runtime": 1292.3362, "eval_samples_per_second": 0.419, "eval_steps_per_second": 0.419, "step": 32000 }, { "epoch": 1.2, "learning_rate": 0.0001312401977726621, "loss": 0.534, "step": 32100 }, { "epoch": 1.2, "learning_rate": 0.0001308690165165839, "loss": 0.4936, "step": 32200 }, { "epoch": 1.2, "learning_rate": 0.00013049736456776485, "loss": 0.4999, "step": 32300 }, { "epoch": 1.21, "learning_rate": 0.00013012524759317774, "loss": 0.5238, "step": 32400 }, { "epoch": 1.21, "learning_rate": 0.000129752671266886, "loss": 0.4959, "step": 32500 }, { "epoch": 1.22, "learning_rate": 0.00012937964126995727, "loss": 0.514, "step": 32600 }, { "epoch": 1.22, "learning_rate": 0.00012900616329037694, "loss": 0.4964, "step": 32700 }, { "epoch": 1.22, "learning_rate": 0.00012863224302296107, "loss": 0.5054, "step": 32800 }, { "epoch": 1.23, "learning_rate": 0.0001282578861692699, "loss": 0.5079, "step": 32900 }, { "epoch": 1.23, "learning_rate": 0.0001278830984375206, "loss": 0.4929, "step": 33000 }, { "epoch": 1.23, "eval_loss": 0.5719351172447205, "eval_runtime": 1267.7603, "eval_samples_per_second": 0.428, "eval_steps_per_second": 0.428, "step": 33000 }, { "epoch": 1.23, "learning_rate": 0.0001275078855425007, "loss": 0.4971, "step": 33100 }, { "epoch": 1.24, "learning_rate": 0.0001271322532054803, "loss": 0.4977, "step": 33200 }, { "epoch": 1.24, "learning_rate": 0.0001267562071541254, "loss": 0.499, "step": 33300 }, { "epoch": 1.25, "learning_rate": 0.00012637975312241022, "loss": 0.5044, "step": 33400 }, { "epoch": 1.25, "learning_rate": 0.00012600289685052996, "loss": 0.5019, "step": 33500 }, { "epoch": 1.25, "learning_rate": 0.00012562564408481327, "loss": 0.5225, "step": 33600 }, { "epoch": 1.26, "learning_rate": 0.00012524800057763438, "loss": 0.5503, "step": 33700 }, { "epoch": 1.26, "learning_rate": 0.00012486997208732573, "loss": 0.5025, "step": 33800 }, { "epoch": 1.26, "learning_rate": 0.0001244915643780899, "loss": 0.5187, "step": 33900 }, { "epoch": 1.27, "learning_rate": 0.00012411278321991195, "loss": 0.5199, "step": 34000 }, { "epoch": 1.27, "eval_loss": 0.5665221810340881, "eval_runtime": 1263.0264, "eval_samples_per_second": 0.429, "eval_steps_per_second": 0.429, "step": 34000 }, { "epoch": 1.27, "learning_rate": 0.00012373363438847117, "loss": 0.5135, "step": 34100 }, { "epoch": 1.28, "learning_rate": 0.00012335412366505324, "loss": 0.5065, "step": 34200 }, { "epoch": 1.28, "learning_rate": 0.000122974256836462, "loss": 0.5223, "step": 34300 }, { "epoch": 1.28, "learning_rate": 0.00012259403969493114, "loss": 0.4946, "step": 34400 }, { "epoch": 1.29, "learning_rate": 0.00012221347803803605, "loss": 0.5105, "step": 34500 }, { "epoch": 1.29, "learning_rate": 0.00012183257766860514, "loss": 0.4812, "step": 34600 }, { "epoch": 1.29, "learning_rate": 0.00012145134439463178, "loss": 0.4981, "step": 34700 }, { "epoch": 1.3, "learning_rate": 0.0001210697840291852, "loss": 0.5038, "step": 34800 }, { "epoch": 1.3, "learning_rate": 0.00012068790239032241, "loss": 0.5551, "step": 34900 }, { "epoch": 1.3, "learning_rate": 0.00012030570530099902, "loss": 0.4964, "step": 35000 }, { "epoch": 1.3, "eval_loss": 0.562954843044281, "eval_runtime": 1252.1434, "eval_samples_per_second": 0.433, "eval_steps_per_second": 0.433, "step": 35000 }, { "epoch": 1.31, "learning_rate": 0.00011992319858898077, "loss": 0.4952, "step": 35100 }, { "epoch": 1.31, "learning_rate": 0.0001195403880867545, "loss": 0.5157, "step": 35200 }, { "epoch": 1.32, "learning_rate": 0.00011915727963143922, "loss": 0.4973, "step": 35300 }, { "epoch": 1.32, "learning_rate": 0.00011877387906469721, "loss": 0.4884, "step": 35400 }, { "epoch": 1.32, "learning_rate": 0.00011839019223264489, "loss": 0.5017, "step": 35500 }, { "epoch": 1.33, "learning_rate": 0.00011800622498576363, "loss": 0.5157, "step": 35600 }, { "epoch": 1.33, "learning_rate": 0.00011762198317881059, "loss": 0.4774, "step": 35700 }, { "epoch": 1.33, "learning_rate": 0.0001172374726707295, "loss": 0.4855, "step": 35800 }, { "epoch": 1.34, "learning_rate": 0.00011685269932456115, "loss": 0.5134, "step": 35900 }, { "epoch": 1.34, "learning_rate": 0.00011646766900735422, "loss": 0.5143, "step": 36000 }, { "epoch": 1.34, "eval_loss": 0.5594063997268677, "eval_runtime": 1270.0722, "eval_samples_per_second": 0.427, "eval_steps_per_second": 0.427, "step": 36000 }, { "epoch": 1.35, "learning_rate": 0.00011608238759007561, "loss": 0.5268, "step": 36100 }, { "epoch": 1.35, "learning_rate": 0.00011569686094752101, "loss": 0.5179, "step": 36200 }, { "epoch": 1.35, "learning_rate": 0.00011531109495822545, "loss": 0.5236, "step": 36300 }, { "epoch": 1.36, "learning_rate": 0.00011492509550437339, "loss": 0.5197, "step": 36400 }, { "epoch": 1.36, "learning_rate": 0.0001145388684717092, "loss": 0.5109, "step": 36500 }, { "epoch": 1.36, "learning_rate": 0.00011415241974944744, "loss": 0.5126, "step": 36600 }, { "epoch": 1.37, "learning_rate": 0.00011376575523018296, "loss": 0.501, "step": 36700 }, { "epoch": 1.37, "learning_rate": 0.00011337888080980115, "loss": 0.4888, "step": 36800 }, { "epoch": 1.38, "learning_rate": 0.00011299180238738789, "loss": 0.5324, "step": 36900 }, { "epoch": 1.38, "learning_rate": 0.00011260452586513981, "loss": 0.5053, "step": 37000 }, { "epoch": 1.38, "eval_loss": 0.5555862188339233, "eval_runtime": 1324.5375, "eval_samples_per_second": 0.409, "eval_steps_per_second": 0.409, "step": 37000 }, { "epoch": 1.38, "learning_rate": 0.00011221705714827408, "loss": 0.4994, "step": 37100 }, { "epoch": 1.39, "learning_rate": 0.00011182940214493858, "loss": 0.4826, "step": 37200 }, { "epoch": 1.39, "learning_rate": 0.0001114415667661215, "loss": 0.5173, "step": 37300 }, { "epoch": 1.39, "learning_rate": 0.00011105355692556165, "loss": 0.4992, "step": 37400 }, { "epoch": 1.4, "learning_rate": 0.00011066537853965788, "loss": 0.5155, "step": 37500 }, { "epoch": 1.4, "learning_rate": 0.00011027703752737913, "loss": 0.5118, "step": 37600 }, { "epoch": 1.41, "learning_rate": 0.00010988853981017393, "loss": 0.5078, "step": 37700 }, { "epoch": 1.41, "learning_rate": 0.00010949989131188043, "loss": 0.506, "step": 37800 }, { "epoch": 1.41, "learning_rate": 0.00010911109795863581, "loss": 0.5074, "step": 37900 }, { "epoch": 1.42, "learning_rate": 0.00010872216567878599, "loss": 0.4837, "step": 38000 }, { "epoch": 1.42, "eval_loss": 0.5529844164848328, "eval_runtime": 1281.6978, "eval_samples_per_second": 0.423, "eval_steps_per_second": 0.423, "step": 38000 }, { "epoch": 1.42, "learning_rate": 0.00010833310040279531, "loss": 0.5161, "step": 38100 }, { "epoch": 1.42, "learning_rate": 0.00010794390806315602, "loss": 0.4929, "step": 38200 }, { "epoch": 1.43, "learning_rate": 0.0001075545945942978, "loss": 0.4819, "step": 38300 }, { "epoch": 1.43, "learning_rate": 0.00010716516593249742, "loss": 0.4594, "step": 38400 }, { "epoch": 1.44, "learning_rate": 0.00010677562801578798, "loss": 0.4802, "step": 38500 }, { "epoch": 1.44, "learning_rate": 0.00010638598678386864, "loss": 0.4793, "step": 38600 }, { "epoch": 1.44, "learning_rate": 0.00010599624817801383, "loss": 0.4912, "step": 38700 }, { "epoch": 1.45, "learning_rate": 0.0001056064181409828, "loss": 0.519, "step": 38800 }, { "epoch": 1.45, "learning_rate": 0.00010521650261692886, "loss": 0.4993, "step": 38900 }, { "epoch": 1.45, "learning_rate": 0.00010482650755130898, "loss": 0.4688, "step": 39000 }, { "epoch": 1.45, "eval_loss": 0.5486682057380676, "eval_runtime": 1265.4476, "eval_samples_per_second": 0.428, "eval_steps_per_second": 0.428, "step": 39000 }, { "epoch": 1.46, "learning_rate": 0.00010443643889079282, "loss": 0.4901, "step": 39100 }, { "epoch": 1.46, "learning_rate": 0.00010404630258317236, "loss": 0.4783, "step": 39200 }, { "epoch": 1.47, "learning_rate": 0.00010365610457727095, "loss": 0.5026, "step": 39300 }, { "epoch": 1.47, "learning_rate": 0.00010326585082285279, "loss": 0.4602, "step": 39400 }, { "epoch": 1.47, "learning_rate": 0.00010287554727053215, "loss": 0.5299, "step": 39500 }, { "epoch": 1.48, "learning_rate": 0.00010248519987168252, "loss": 0.5135, "step": 39600 }, { "epoch": 1.48, "learning_rate": 0.00010209481457834616, "loss": 0.4792, "step": 39700 }, { "epoch": 1.48, "learning_rate": 0.0001017043973431429, "loss": 0.5004, "step": 39800 }, { "epoch": 1.49, "learning_rate": 0.00010131395411917979, "loss": 0.5013, "step": 39900 }, { "epoch": 1.49, "learning_rate": 0.00010092349085996011, "loss": 0.501, "step": 40000 }, { "epoch": 1.49, "eval_loss": 0.5438262820243835, "eval_runtime": 1262.0322, "eval_samples_per_second": 0.429, "eval_steps_per_second": 0.429, "step": 40000 }, { "epoch": 1.5, "learning_rate": 0.0001005330135192927, "loss": 0.5043, "step": 40100 }, { "epoch": 1.5, "learning_rate": 0.000100142528051201, "loss": 0.4888, "step": 40200 }, { "epoch": 1.5, "learning_rate": 9.975204040983244e-05, "loss": 0.4866, "step": 40300 }, { "epoch": 1.51, "learning_rate": 9.936155654936761e-05, "loss": 0.4845, "step": 40400 }, { "epoch": 1.51, "learning_rate": 9.897108242392937e-05, "loss": 0.5027, "step": 40500 }, { "epoch": 1.51, "learning_rate": 9.858062398749225e-05, "loss": 0.5048, "step": 40600 }, { "epoch": 1.52, "learning_rate": 9.819018719379143e-05, "loss": 0.4985, "step": 40700 }, { "epoch": 1.52, "learning_rate": 9.77997779962322e-05, "loss": 0.4861, "step": 40800 }, { "epoch": 1.52, "learning_rate": 9.740940234779903e-05, "loss": 0.4743, "step": 40900 }, { "epoch": 1.53, "learning_rate": 9.701906620096474e-05, "loss": 0.5036, "step": 41000 }, { "epoch": 1.53, "eval_loss": 0.539782702922821, "eval_runtime": 1250.1524, "eval_samples_per_second": 0.434, "eval_steps_per_second": 0.434, "step": 41000 }, { "epoch": 1.53, "learning_rate": 9.662877550759995e-05, "loss": 0.4902, "step": 41100 }, { "epoch": 1.54, "learning_rate": 9.62385362188821e-05, "loss": 0.4568, "step": 41200 }, { "epoch": 1.54, "learning_rate": 9.584835428520491e-05, "loss": 0.491, "step": 41300 }, { "epoch": 1.54, "learning_rate": 9.545823565608745e-05, "loss": 0.5098, "step": 41400 }, { "epoch": 1.55, "learning_rate": 9.506818628008358e-05, "loss": 0.4964, "step": 41500 }, { "epoch": 1.55, "learning_rate": 9.467821210469116e-05, "loss": 0.4855, "step": 41600 }, { "epoch": 1.55, "learning_rate": 9.42883190762614e-05, "loss": 0.49, "step": 41700 }, { "epoch": 1.56, "learning_rate": 9.389851313990813e-05, "loss": 0.4981, "step": 41800 }, { "epoch": 1.56, "learning_rate": 9.350880023941727e-05, "loss": 0.4825, "step": 41900 }, { "epoch": 1.57, "learning_rate": 9.311918631715612e-05, "loss": 0.4751, "step": 42000 }, { "epoch": 1.57, "eval_loss": 0.5362206697463989, "eval_runtime": 1229.3972, "eval_samples_per_second": 0.441, "eval_steps_per_second": 0.441, "step": 42000 }, { "epoch": 1.57, "learning_rate": 9.272967731398264e-05, "loss": 0.4789, "step": 42100 }, { "epoch": 1.57, "learning_rate": 9.234027916915512e-05, "loss": 0.4644, "step": 42200 }, { "epoch": 1.58, "learning_rate": 9.195099782024136e-05, "loss": 0.4517, "step": 42300 }, { "epoch": 1.58, "learning_rate": 9.156183920302836e-05, "loss": 0.4768, "step": 42400 }, { "epoch": 1.58, "learning_rate": 9.117280925143156e-05, "loss": 0.4472, "step": 42500 }, { "epoch": 1.59, "learning_rate": 9.078391389740465e-05, "loss": 0.5002, "step": 42600 }, { "epoch": 1.59, "learning_rate": 9.039515907084884e-05, "loss": 0.4674, "step": 42700 }, { "epoch": 1.6, "learning_rate": 9.000655069952262e-05, "loss": 0.4704, "step": 42800 }, { "epoch": 1.6, "learning_rate": 8.961809470895141e-05, "loss": 0.4671, "step": 42900 }, { "epoch": 1.6, "learning_rate": 8.922979702233692e-05, "loss": 0.5007, "step": 43000 }, { "epoch": 1.6, "eval_loss": 0.5325181484222412, "eval_runtime": 1297.5192, "eval_samples_per_second": 0.418, "eval_steps_per_second": 0.418, "step": 43000 }, { "epoch": 1.61, "learning_rate": 8.884166356046725e-05, "loss": 0.4767, "step": 43100 }, { "epoch": 1.61, "learning_rate": 8.845370024162619e-05, "loss": 0.517, "step": 43200 }, { "epoch": 1.61, "learning_rate": 8.806591298150332e-05, "loss": 0.4742, "step": 43300 }, { "epoch": 1.62, "learning_rate": 8.767830769310362e-05, "loss": 0.4953, "step": 43400 }, { "epoch": 1.62, "learning_rate": 8.729089028665733e-05, "loss": 0.473, "step": 43500 }, { "epoch": 1.63, "learning_rate": 8.690366666952989e-05, "loss": 0.4695, "step": 43600 }, { "epoch": 1.63, "learning_rate": 8.651664274613183e-05, "loss": 0.4948, "step": 43700 }, { "epoch": 1.63, "learning_rate": 8.612982441782866e-05, "loss": 0.499, "step": 43800 }, { "epoch": 1.64, "learning_rate": 8.574321758285104e-05, "loss": 0.48, "step": 43900 }, { "epoch": 1.64, "learning_rate": 8.535682813620482e-05, "loss": 0.4672, "step": 44000 }, { "epoch": 1.64, "eval_loss": 0.5285405516624451, "eval_runtime": 1272.6033, "eval_samples_per_second": 0.426, "eval_steps_per_second": 0.426, "step": 44000 }, { "epoch": 1.64, "learning_rate": 8.497066196958097e-05, "loss": 0.4684, "step": 44100 }, { "epoch": 1.65, "learning_rate": 8.458472497126595e-05, "loss": 0.4923, "step": 44200 }, { "epoch": 1.65, "learning_rate": 8.41990230260518e-05, "loss": 0.492, "step": 44300 }, { "epoch": 1.66, "learning_rate": 8.38135620151465e-05, "loss": 0.4814, "step": 44400 }, { "epoch": 1.66, "learning_rate": 8.342834781608424e-05, "loss": 0.4753, "step": 44500 }, { "epoch": 1.66, "learning_rate": 8.304338630263579e-05, "loss": 0.5032, "step": 44600 }, { "epoch": 1.67, "learning_rate": 8.265868334471895e-05, "loss": 0.4838, "step": 44700 }, { "epoch": 1.67, "learning_rate": 8.227424480830907e-05, "loss": 0.4463, "step": 44800 }, { "epoch": 1.67, "learning_rate": 8.189007655534959e-05, "loss": 0.458, "step": 44900 }, { "epoch": 1.68, "learning_rate": 8.150618444366255e-05, "loss": 0.4812, "step": 45000 }, { "epoch": 1.68, "eval_loss": 0.5250784754753113, "eval_runtime": 1265.3534, "eval_samples_per_second": 0.428, "eval_steps_per_second": 0.428, "step": 45000 }, { "epoch": 1.68, "learning_rate": 8.112257432685958e-05, "loss": 0.4743, "step": 45100 }, { "epoch": 1.69, "learning_rate": 8.073925205425225e-05, "loss": 0.4925, "step": 45200 }, { "epoch": 1.69, "learning_rate": 8.035622347076312e-05, "loss": 0.4748, "step": 45300 }, { "epoch": 1.69, "learning_rate": 7.997349441683657e-05, "loss": 0.4745, "step": 45400 }, { "epoch": 1.7, "learning_rate": 7.959107072834971e-05, "loss": 0.449, "step": 45500 }, { "epoch": 1.7, "learning_rate": 7.920895823652346e-05, "loss": 0.4753, "step": 45600 }, { "epoch": 1.7, "learning_rate": 7.882716276783352e-05, "loss": 0.457, "step": 45700 }, { "epoch": 1.71, "learning_rate": 7.844569014392172e-05, "loss": 0.488, "step": 45800 }, { "epoch": 1.71, "learning_rate": 7.806454618150698e-05, "loss": 0.4782, "step": 45900 }, { "epoch": 1.72, "learning_rate": 7.768373669229688e-05, "loss": 0.4751, "step": 46000 }, { "epoch": 1.72, "eval_loss": 0.5225542187690735, "eval_runtime": 1263.3928, "eval_samples_per_second": 0.429, "eval_steps_per_second": 0.429, "step": 46000 }, { "epoch": 1.72, "learning_rate": 7.730326748289895e-05, "loss": 0.4525, "step": 46100 }, { "epoch": 1.72, "learning_rate": 7.6923144354732e-05, "loss": 0.483, "step": 46200 }, { "epoch": 1.73, "learning_rate": 7.654337310393787e-05, "loss": 0.4941, "step": 46300 }, { "epoch": 1.73, "learning_rate": 7.616395952129287e-05, "loss": 0.4637, "step": 46400 }, { "epoch": 1.73, "learning_rate": 7.578490939211965e-05, "loss": 0.4874, "step": 46500 }, { "epoch": 1.74, "learning_rate": 7.540622849619883e-05, "loss": 0.4815, "step": 46600 }, { "epoch": 1.74, "learning_rate": 7.5027922607681e-05, "loss": 0.4973, "step": 46700 }, { "epoch": 1.74, "learning_rate": 7.46499974949985e-05, "loss": 0.5003, "step": 46800 }, { "epoch": 1.75, "learning_rate": 7.427245892077775e-05, "loss": 0.4331, "step": 46900 }, { "epoch": 1.75, "learning_rate": 7.389531264175103e-05, "loss": 0.4691, "step": 47000 }, { "epoch": 1.75, "eval_loss": 0.5184645652770996, "eval_runtime": 1306.5029, "eval_samples_per_second": 0.415, "eval_steps_per_second": 0.415, "step": 47000 }, { "epoch": 1.76, "learning_rate": 7.351856440866895e-05, "loss": 0.4875, "step": 47100 }, { "epoch": 1.76, "learning_rate": 7.314221996621279e-05, "loss": 0.4615, "step": 47200 }, { "epoch": 1.76, "learning_rate": 7.276628505290663e-05, "loss": 0.4522, "step": 47300 }, { "epoch": 1.77, "learning_rate": 7.239076540103013e-05, "loss": 0.4861, "step": 47400 }, { "epoch": 1.77, "learning_rate": 7.2015666736531e-05, "loss": 0.4676, "step": 47500 }, { "epoch": 1.77, "learning_rate": 7.164099477893768e-05, "loss": 0.4451, "step": 47600 }, { "epoch": 1.78, "learning_rate": 7.126675524127217e-05, "loss": 0.456, "step": 47700 }, { "epoch": 1.78, "learning_rate": 7.089295382996294e-05, "loss": 0.4676, "step": 47800 }, { "epoch": 1.79, "learning_rate": 7.05195962447578e-05, "loss": 0.4687, "step": 47900 }, { "epoch": 1.79, "learning_rate": 7.014668817863719e-05, "loss": 0.4415, "step": 48000 }, { "epoch": 1.79, "eval_loss": 0.5149570107460022, "eval_runtime": 1238.5109, "eval_samples_per_second": 0.438, "eval_steps_per_second": 0.438, "step": 48000 }, { "epoch": 1.79, "learning_rate": 6.977423531772711e-05, "loss": 0.5093, "step": 48100 }, { "epoch": 1.8, "learning_rate": 6.940224334121264e-05, "loss": 0.4556, "step": 48200 }, { "epoch": 1.8, "learning_rate": 6.903071792125136e-05, "loss": 0.4554, "step": 48300 }, { "epoch": 1.8, "learning_rate": 6.865966472288655e-05, "loss": 0.4442, "step": 48400 }, { "epoch": 1.81, "learning_rate": 6.828908940396123e-05, "loss": 0.4583, "step": 48500 }, { "epoch": 1.81, "learning_rate": 6.791899761503153e-05, "loss": 0.4598, "step": 48600 }, { "epoch": 1.82, "learning_rate": 6.754939499928079e-05, "loss": 0.4541, "step": 48700 }, { "epoch": 1.82, "learning_rate": 6.718028719243335e-05, "loss": 0.4474, "step": 48800 }, { "epoch": 1.82, "learning_rate": 6.68116798226687e-05, "loss": 0.4531, "step": 48900 }, { "epoch": 1.83, "learning_rate": 6.644357851053562e-05, "loss": 0.4748, "step": 49000 }, { "epoch": 1.83, "eval_loss": 0.5114006400108337, "eval_runtime": 1274.2814, "eval_samples_per_second": 0.425, "eval_steps_per_second": 0.425, "step": 49000 }, { "epoch": 1.83, "learning_rate": 6.607598886886645e-05, "loss": 0.4803, "step": 49100 }, { "epoch": 1.83, "learning_rate": 6.57089165026916e-05, "loss": 0.4703, "step": 49200 }, { "epoch": 1.84, "learning_rate": 6.534236700915406e-05, "loss": 0.4963, "step": 49300 }, { "epoch": 1.84, "learning_rate": 6.497634597742399e-05, "loss": 0.4694, "step": 49400 }, { "epoch": 1.85, "learning_rate": 6.46108589886135e-05, "loss": 0.4612, "step": 49500 }, { "epoch": 1.85, "learning_rate": 6.424591161569158e-05, "loss": 0.4361, "step": 49600 }, { "epoch": 1.85, "learning_rate": 6.388150942339917e-05, "loss": 0.4652, "step": 49700 }, { "epoch": 1.86, "learning_rate": 6.35176579681642e-05, "loss": 0.4741, "step": 49800 }, { "epoch": 1.86, "learning_rate": 6.315436279801704e-05, "loss": 0.4447, "step": 49900 }, { "epoch": 1.86, "learning_rate": 6.279162945250561e-05, "loss": 0.4542, "step": 50000 }, { "epoch": 1.86, "eval_loss": 0.507805347442627, "eval_runtime": 1229.0064, "eval_samples_per_second": 0.441, "eval_steps_per_second": 0.441, "step": 50000 }, { "epoch": 1.87, "learning_rate": 6.242946346261127e-05, "loss": 0.4431, "step": 50100 }, { "epoch": 1.87, "learning_rate": 6.20678703506642e-05, "loss": 0.4631, "step": 50200 }, { "epoch": 1.88, "learning_rate": 6.170685563025928e-05, "loss": 0.4655, "step": 50300 }, { "epoch": 1.88, "learning_rate": 6.13464248061722e-05, "loss": 0.4741, "step": 50400 }, { "epoch": 1.88, "learning_rate": 6.098658337427517e-05, "loss": 0.4362, "step": 50500 }, { "epoch": 1.89, "learning_rate": 6.0627336821453426e-05, "loss": 0.4501, "step": 50600 }, { "epoch": 1.89, "learning_rate": 6.0268690625521364e-05, "loss": 0.4638, "step": 50700 }, { "epoch": 1.89, "learning_rate": 5.99106502551392e-05, "loss": 0.4499, "step": 50800 }, { "epoch": 1.9, "learning_rate": 5.955322116972939e-05, "loss": 0.4655, "step": 50900 }, { "epoch": 1.9, "learning_rate": 5.919640881939357e-05, "loss": 0.4688, "step": 51000 }, { "epoch": 1.9, "eval_loss": 0.50446617603302, "eval_runtime": 1222.4661, "eval_samples_per_second": 0.443, "eval_steps_per_second": 0.443, "step": 51000 }, { "epoch": 1.91, "learning_rate": 5.884021864482925e-05, "loss": 0.4476, "step": 51100 }, { "epoch": 1.91, "learning_rate": 5.8484656077247066e-05, "loss": 0.4693, "step": 51200 }, { "epoch": 1.91, "learning_rate": 5.812972653828779e-05, "loss": 0.4592, "step": 51300 }, { "epoch": 1.92, "learning_rate": 5.777543543993975e-05, "loss": 0.4863, "step": 51400 }, { "epoch": 1.92, "learning_rate": 5.742178818445638e-05, "loss": 0.4615, "step": 51500 }, { "epoch": 1.92, "learning_rate": 5.706879016427364e-05, "loss": 0.4625, "step": 51600 }, { "epoch": 1.93, "learning_rate": 5.6716446761927885e-05, "loss": 0.4534, "step": 51700 }, { "epoch": 1.93, "learning_rate": 5.6364763349973995e-05, "loss": 0.4881, "step": 51800 }, { "epoch": 1.94, "learning_rate": 5.601374529090308e-05, "loss": 0.4474, "step": 51900 }, { "epoch": 1.94, "learning_rate": 5.566339793706102e-05, "loss": 0.4301, "step": 52000 }, { "epoch": 1.94, "eval_loss": 0.5012514591217041, "eval_runtime": 1223.7216, "eval_samples_per_second": 0.443, "eval_steps_per_second": 0.443, "step": 52000 }, { "epoch": 1.94, "learning_rate": 5.531372663056664e-05, "loss": 0.4523, "step": 52100 }, { "epoch": 1.95, "learning_rate": 5.496473670323052e-05, "loss": 0.4434, "step": 52200 }, { "epoch": 1.95, "learning_rate": 5.461643347647335e-05, "loss": 0.4239, "step": 52300 }, { "epoch": 1.95, "learning_rate": 5.4268822261245023e-05, "loss": 0.4581, "step": 52400 }, { "epoch": 1.96, "learning_rate": 5.392190835794369e-05, "loss": 0.4297, "step": 52500 }, { "epoch": 1.96, "learning_rate": 5.357569705633465e-05, "loss": 0.4431, "step": 52600 }, { "epoch": 1.96, "learning_rate": 5.3230193635470136e-05, "loss": 0.4406, "step": 52700 }, { "epoch": 1.97, "learning_rate": 5.288540336360836e-05, "loss": 0.4734, "step": 52800 }, { "epoch": 1.97, "learning_rate": 5.254133149813349e-05, "loss": 0.4449, "step": 52900 }, { "epoch": 1.98, "learning_rate": 5.2197983285475315e-05, "loss": 0.4519, "step": 53000 }, { "epoch": 1.98, "eval_loss": 0.4982399046421051, "eval_runtime": 1234.2642, "eval_samples_per_second": 0.439, "eval_steps_per_second": 0.439, "step": 53000 }, { "epoch": 1.98, "learning_rate": 5.185536396102946e-05, "loss": 0.4352, "step": 53100 }, { "epoch": 1.98, "learning_rate": 5.1513478749077274e-05, "loss": 0.4801, "step": 53200 }, { "epoch": 1.99, "learning_rate": 5.1172332862706376e-05, "loss": 0.4315, "step": 53300 }, { "epoch": 1.99, "learning_rate": 5.0831931503731065e-05, "loss": 0.4443, "step": 53400 }, { "epoch": 1.99, "learning_rate": 5.049227986261302e-05, "loss": 0.4597, "step": 53500 }, { "epoch": 2.0, "learning_rate": 5.0153383118382355e-05, "loss": 0.4519, "step": 53600 }, { "epoch": 2.0, "learning_rate": 4.9815246438558264e-05, "loss": 0.4314, "step": 53700 }, { "epoch": 2.01, "learning_rate": 4.9477874979070474e-05, "loss": 0.3999, "step": 53800 }, { "epoch": 2.01, "learning_rate": 4.914127388418062e-05, "loss": 0.3937, "step": 53900 }, { "epoch": 2.01, "learning_rate": 4.880544828640372e-05, "loss": 0.3597, "step": 54000 }, { "epoch": 2.01, "eval_loss": 0.49871668219566345, "eval_runtime": 1230.6045, "eval_samples_per_second": 0.44, "eval_steps_per_second": 0.44, "step": 54000 }, { "epoch": 2.02, "learning_rate": 4.8470403306430056e-05, "loss": 0.4075, "step": 54100 }, { "epoch": 2.02, "learning_rate": 4.813614405304693e-05, "loss": 0.383, "step": 54200 }, { "epoch": 2.02, "learning_rate": 4.780267562306081e-05, "loss": 0.3833, "step": 54300 }, { "epoch": 2.03, "learning_rate": 4.7470003101219664e-05, "loss": 0.3828, "step": 54400 }, { "epoch": 2.03, "learning_rate": 4.713813156013548e-05, "loss": 0.3864, "step": 54500 }, { "epoch": 2.04, "learning_rate": 4.680706606020668e-05, "loss": 0.3805, "step": 54600 }, { "epoch": 2.04, "learning_rate": 4.64768116495413e-05, "loss": 0.3934, "step": 54700 }, { "epoch": 2.04, "learning_rate": 4.61473733638797e-05, "loss": 0.3842, "step": 54800 }, { "epoch": 2.05, "learning_rate": 4.5818756226517924e-05, "loss": 0.3783, "step": 54900 }, { "epoch": 2.05, "learning_rate": 4.54909652482312e-05, "loss": 0.3912, "step": 55000 }, { "epoch": 2.05, "eval_loss": 0.49695634841918945, "eval_runtime": 1220.9853, "eval_samples_per_second": 0.444, "eval_steps_per_second": 0.444, "step": 55000 }, { "epoch": 2.05, "learning_rate": 4.516400542719733e-05, "loss": 0.3941, "step": 55100 }, { "epoch": 2.06, "learning_rate": 4.4837881748920594e-05, "loss": 0.366, "step": 55200 }, { "epoch": 2.06, "learning_rate": 4.451259918615569e-05, "loss": 0.4203, "step": 55300 }, { "epoch": 2.07, "learning_rate": 4.418816269883204e-05, "loss": 0.3613, "step": 55400 }, { "epoch": 2.07, "learning_rate": 4.386457723397794e-05, "loss": 0.3825, "step": 55500 }, { "epoch": 2.07, "learning_rate": 4.354184772564526e-05, "loss": 0.4147, "step": 55600 }, { "epoch": 2.08, "learning_rate": 4.3219979094834275e-05, "loss": 0.3812, "step": 55700 }, { "epoch": 2.08, "learning_rate": 4.289897624941841e-05, "loss": 0.3926, "step": 55800 }, { "epoch": 2.08, "learning_rate": 4.257884408406968e-05, "loss": 0.4103, "step": 55900 }, { "epoch": 2.09, "learning_rate": 4.225958748018381e-05, "loss": 0.4009, "step": 56000 }, { "epoch": 2.09, "eval_loss": 0.49594032764434814, "eval_runtime": 1218.3341, "eval_samples_per_second": 0.445, "eval_steps_per_second": 0.445, "step": 56000 }, { "epoch": 2.09, "learning_rate": 4.194121130580594e-05, "loss": 0.3779, "step": 56100 }, { "epoch": 2.1, "learning_rate": 4.1623720415556336e-05, "loss": 0.3651, "step": 56200 }, { "epoch": 2.1, "learning_rate": 4.1307119650556494e-05, "loss": 0.3754, "step": 56300 }, { "epoch": 2.1, "learning_rate": 4.099141383835512e-05, "loss": 0.3887, "step": 56400 }, { "epoch": 2.11, "learning_rate": 4.067660779285465e-05, "loss": 0.3739, "step": 56500 }, { "epoch": 2.11, "learning_rate": 4.036270631423781e-05, "loss": 0.3842, "step": 56600 }, { "epoch": 2.11, "learning_rate": 4.004971418889447e-05, "loss": 0.3723, "step": 56700 }, { "epoch": 2.12, "learning_rate": 3.9737636189348634e-05, "loss": 0.3889, "step": 56800 }, { "epoch": 2.12, "learning_rate": 3.942647707418561e-05, "loss": 0.3897, "step": 56900 }, { "epoch": 2.13, "learning_rate": 3.9116241587979496e-05, "loss": 0.3592, "step": 57000 }, { "epoch": 2.13, "eval_loss": 0.49361398816108704, "eval_runtime": 1208.1063, "eval_samples_per_second": 0.449, "eval_steps_per_second": 0.449, "step": 57000 }, { "epoch": 2.13, "learning_rate": 3.8806934461220826e-05, "loss": 0.3512, "step": 57100 }, { "epoch": 2.13, "learning_rate": 3.8498560410244546e-05, "loss": 0.3715, "step": 57200 }, { "epoch": 2.14, "learning_rate": 3.819112413715791e-05, "loss": 0.3803, "step": 57300 }, { "epoch": 2.14, "learning_rate": 3.7884630329768875e-05, "loss": 0.3785, "step": 57400 }, { "epoch": 2.14, "learning_rate": 3.757908366151463e-05, "loss": 0.3626, "step": 57500 }, { "epoch": 2.15, "learning_rate": 3.72744887913904e-05, "loss": 0.3981, "step": 57600 }, { "epoch": 2.15, "learning_rate": 3.697085036387822e-05, "loss": 0.3918, "step": 57700 }, { "epoch": 2.16, "learning_rate": 3.6668173008876324e-05, "loss": 0.3876, "step": 57800 }, { "epoch": 2.16, "learning_rate": 3.6366461341628396e-05, "loss": 0.3878, "step": 57900 }, { "epoch": 2.16, "learning_rate": 3.606571996265321e-05, "loss": 0.3674, "step": 58000 }, { "epoch": 2.16, "eval_loss": 0.4916069805622101, "eval_runtime": 1244.109, "eval_samples_per_second": 0.436, "eval_steps_per_second": 0.436, "step": 58000 }, { "epoch": 2.17, "learning_rate": 3.576595345767464e-05, "loss": 0.3759, "step": 58100 }, { "epoch": 2.17, "learning_rate": 3.5467166397551524e-05, "loss": 0.3987, "step": 58200 }, { "epoch": 2.17, "learning_rate": 3.5169363338208094e-05, "loss": 0.3809, "step": 58300 }, { "epoch": 2.18, "learning_rate": 3.4872548820564455e-05, "loss": 0.3851, "step": 58400 }, { "epoch": 2.18, "learning_rate": 3.457672737046737e-05, "loss": 0.3832, "step": 58500 }, { "epoch": 2.18, "learning_rate": 3.42819034986213e-05, "loss": 0.3923, "step": 58600 }, { "epoch": 2.19, "learning_rate": 3.398808170051951e-05, "loss": 0.3609, "step": 58700 }, { "epoch": 2.19, "learning_rate": 3.369526645637556e-05, "loss": 0.3538, "step": 58800 }, { "epoch": 2.2, "learning_rate": 3.3403462231055107e-05, "loss": 0.3941, "step": 58900 }, { "epoch": 2.2, "learning_rate": 3.3112673474007584e-05, "loss": 0.3984, "step": 59000 }, { "epoch": 2.2, "eval_loss": 0.4893116354942322, "eval_runtime": 1243.7748, "eval_samples_per_second": 0.436, "eval_steps_per_second": 0.436, "step": 59000 }, { "epoch": 2.22, "learning_rate": 3.167411635594364e-05, "loss": 0.3867, "step": 59500 }, { "epoch": 2.22, "eval_loss": 0.48985520005226135, "eval_runtime": 1240.4608, "eval_samples_per_second": 0.437, "eval_steps_per_second": 0.437, "step": 59500 }, { "epoch": 2.24, "learning_rate": 3.0261604379828834e-05, "loss": 0.3736, "step": 60000 }, { "epoch": 2.24, "eval_loss": 0.489548921585083, "eval_runtime": 1234.7527, "eval_samples_per_second": 0.439, "eval_steps_per_second": 0.439, "step": 60000 }, { "epoch": 2.26, "learning_rate": 2.887567598106955e-05, "loss": 0.361, "step": 60500 }, { "epoch": 2.26, "eval_loss": 0.4885287582874298, "eval_runtime": 1231.4045, "eval_samples_per_second": 0.44, "eval_steps_per_second": 0.44, "step": 60500 }, { "epoch": 2.27, "learning_rate": 2.7516859461678857e-05, "loss": 0.3778, "step": 61000 }, { "epoch": 2.27, "eval_loss": 0.4883672893047333, "eval_runtime": 1235.8497, "eval_samples_per_second": 0.439, "eval_steps_per_second": 0.439, "step": 61000 }, { "epoch": 2.29, "learning_rate": 2.618567278889328e-05, "loss": 0.3791, "step": 61500 }, { "epoch": 2.29, "eval_loss": 0.4874744415283203, "eval_runtime": 1231.8195, "eval_samples_per_second": 0.44, "eval_steps_per_second": 0.44, "step": 61500 }, { "epoch": 2.31, "learning_rate": 2.4882623397728655e-05, "loss": 0.3705, "step": 62000 }, { "epoch": 2.31, "eval_loss": 0.486933171749115, "eval_runtime": 1227.5583, "eval_samples_per_second": 0.442, "eval_steps_per_second": 0.442, "step": 62000 }, { "epoch": 2.33, "learning_rate": 2.3608207997551255e-05, "loss": 0.3698, "step": 62500 }, { "epoch": 2.33, "eval_loss": 0.48592954874038696, "eval_runtime": 1282.2531, "eval_samples_per_second": 0.423, "eval_steps_per_second": 0.423, "step": 62500 }, { "epoch": 2.35, "learning_rate": 2.2362912382736857e-05, "loss": 0.381, "step": 63000 }, { "epoch": 2.35, "eval_loss": 0.4852922856807709, "eval_runtime": 1229.4457, "eval_samples_per_second": 0.441, "eval_steps_per_second": 0.441, "step": 63000 }, { "epoch": 2.37, "learning_rate": 2.1147211247491084e-05, "loss": 0.3728, "step": 63500 }, { "epoch": 2.37, "eval_loss": 0.484967440366745, "eval_runtime": 1296.2845, "eval_samples_per_second": 0.418, "eval_steps_per_second": 0.418, "step": 63500 }, { "epoch": 2.39, "learning_rate": 1.9961568004900565e-05, "loss": 0.3695, "step": 64000 }, { "epoch": 2.39, "eval_loss": 0.4844016432762146, "eval_runtime": 1317.5418, "eval_samples_per_second": 0.411, "eval_steps_per_second": 0.411, "step": 64000 }, { "epoch": 2.4, "learning_rate": 1.8806434610284497e-05, "loss": 0.3682, "step": 64500 }, { "epoch": 2.4, "eval_loss": 0.4838670790195465, "eval_runtime": 1337.5922, "eval_samples_per_second": 0.405, "eval_steps_per_second": 0.405, "step": 64500 }, { "epoch": 2.42, "learning_rate": 1.768225138891393e-05, "loss": 0.3594, "step": 65000 }, { "epoch": 2.42, "eval_loss": 0.48305046558380127, "eval_runtime": 1317.2888, "eval_samples_per_second": 0.411, "eval_steps_per_second": 0.411, "step": 65000 }, { "epoch": 2.44, "learning_rate": 1.6589446868164037e-05, "loss": 0.367, "step": 65500 }, { "epoch": 2.44, "eval_loss": 0.48225167393684387, "eval_runtime": 1315.9763, "eval_samples_per_second": 0.412, "eval_steps_per_second": 0.412, "step": 65500 }, { "epoch": 2.46, "learning_rate": 1.552843761416395e-05, "loss": 0.3781, "step": 66000 }, { "epoch": 2.46, "eval_loss": 0.48182958364486694, "eval_runtime": 1298.0711, "eval_samples_per_second": 0.418, "eval_steps_per_second": 0.418, "step": 66000 }, { "epoch": 2.48, "learning_rate": 1.4499628073005733e-05, "loss": 0.3632, "step": 66500 }, { "epoch": 2.48, "eval_loss": 0.48136985301971436, "eval_runtime": 1295.6256, "eval_samples_per_second": 0.418, "eval_steps_per_second": 0.418, "step": 66500 }, { "epoch": 2.5, "learning_rate": 1.350341041657378e-05, "loss": 0.3707, "step": 67000 }, { "epoch": 2.5, "eval_loss": 0.48081424832344055, "eval_runtime": 1297.8801, "eval_samples_per_second": 0.418, "eval_steps_per_second": 0.418, "step": 67000 }, { "epoch": 2.52, "learning_rate": 1.2540164393052622e-05, "loss": 0.3657, "step": 67500 }, { "epoch": 2.52, "eval_loss": 0.48031187057495117, "eval_runtime": 1299.2471, "eval_samples_per_second": 0.417, "eval_steps_per_second": 0.417, "step": 67500 }, { "epoch": 2.54, "learning_rate": 1.1610257182170914e-05, "loss": 0.3742, "step": 68000 }, { "epoch": 2.54, "eval_loss": 0.479922354221344, "eval_runtime": 1275.2567, "eval_samples_per_second": 0.425, "eval_steps_per_second": 0.425, "step": 68000 }, { "epoch": 2.55, "learning_rate": 1.0714043255236094e-05, "loss": 0.3761, "step": 68500 }, { "epoch": 2.55, "eval_loss": 0.4795922338962555, "eval_runtime": 1321.5276, "eval_samples_per_second": 0.41, "eval_steps_per_second": 0.41, "step": 68500 }, { "epoch": 2.57, "learning_rate": 9.851864240013509e-06, "loss": 0.3754, "step": 69000 }, { "epoch": 2.57, "eval_loss": 0.4789520502090454, "eval_runtime": 1345.4528, "eval_samples_per_second": 0.403, "eval_steps_per_second": 0.403, "step": 69000 }, { "epoch": 2.59, "learning_rate": 9.024048790501272e-06, "loss": 0.3594, "step": 69500 }, { "epoch": 2.59, "eval_loss": 0.47866225242614746, "eval_runtime": 1316.9883, "eval_samples_per_second": 0.412, "eval_steps_per_second": 0.412, "step": 69500 }, { "epoch": 2.61, "learning_rate": 8.230912461650797e-06, "loss": 0.3601, "step": 70000 }, { "epoch": 2.61, "eval_loss": 0.47838443517684937, "eval_runtime": 1306.7325, "eval_samples_per_second": 0.415, "eval_steps_per_second": 0.415, "step": 70000 }, { "epoch": 2.63, "learning_rate": 7.472757589080226e-06, "loss": 0.3614, "step": 70500 }, { "epoch": 2.63, "eval_loss": 0.4780386686325073, "eval_runtime": 1290.4017, "eval_samples_per_second": 0.42, "eval_steps_per_second": 0.42, "step": 70500 }, { "epoch": 2.65, "learning_rate": 6.749873173827314e-06, "loss": 0.3746, "step": 71000 }, { "epoch": 2.65, "eval_loss": 0.47773027420043945, "eval_runtime": 1293.7698, "eval_samples_per_second": 0.419, "eval_steps_per_second": 0.419, "step": 71000 }, { "epoch": 2.67, "learning_rate": 6.0625347721849805e-06, "loss": 0.365, "step": 71500 }, { "epoch": 2.67, "eval_loss": 0.47759953141212463, "eval_runtime": 1287.2533, "eval_samples_per_second": 0.421, "eval_steps_per_second": 0.421, "step": 71500 }, { "epoch": 2.68, "learning_rate": 5.411004390662034e-06, "loss": 0.3614, "step": 72000 }, { "epoch": 2.68, "eval_loss": 0.4774133861064911, "eval_runtime": 1290.2562, "eval_samples_per_second": 0.42, "eval_steps_per_second": 0.42, "step": 72000 }, { "epoch": 2.7, "learning_rate": 4.795530386109038e-06, "loss": 0.3672, "step": 72500 }, { "epoch": 2.7, "eval_loss": 0.4771479070186615, "eval_runtime": 1313.3814, "eval_samples_per_second": 0.413, "eval_steps_per_second": 0.413, "step": 72500 }, { "epoch": 2.72, "learning_rate": 4.2163473710470355e-06, "loss": 0.3536, "step": 73000 }, { "epoch": 2.72, "eval_loss": 0.4770236909389496, "eval_runtime": 1301.9499, "eval_samples_per_second": 0.416, "eval_steps_per_second": 0.416, "step": 73000 }, { "epoch": 2.74, "learning_rate": 3.67367612423567e-06, "loss": 0.3693, "step": 73500 }, { "epoch": 2.74, "eval_loss": 0.4766899645328522, "eval_runtime": 1310.5415, "eval_samples_per_second": 0.414, "eval_steps_per_second": 0.414, "step": 73500 }, { "epoch": 2.76, "learning_rate": 3.1677235065144862e-06, "loss": 0.358, "step": 74000 }, { "epoch": 2.76, "eval_loss": 0.47646036744117737, "eval_runtime": 1327.3256, "eval_samples_per_second": 0.408, "eval_steps_per_second": 0.408, "step": 74000 }, { "epoch": 2.78, "learning_rate": 2.6986823819497353e-06, "loss": 0.3653, "step": 74500 }, { "epoch": 2.78, "eval_loss": 0.47627386450767517, "eval_runtime": 1332.1149, "eval_samples_per_second": 0.407, "eval_steps_per_second": 0.407, "step": 74500 }, { "epoch": 2.8, "learning_rate": 2.266731544316425e-06, "loss": 0.3743, "step": 75000 }, { "epoch": 2.8, "eval_loss": 0.47608959674835205, "eval_runtime": 1305.4101, "eval_samples_per_second": 0.415, "eval_steps_per_second": 0.415, "step": 75000 }, { "epoch": 2.81, "learning_rate": 1.872035648944026e-06, "loss": 0.3659, "step": 75500 }, { "epoch": 2.81, "eval_loss": 0.476179838180542, "eval_runtime": 1301.8331, "eval_samples_per_second": 0.416, "eval_steps_per_second": 0.416, "step": 75500 }, { "epoch": 2.83, "learning_rate": 1.5147451499514353e-06, "loss": 0.3678, "step": 76000 }, { "epoch": 2.83, "eval_loss": 0.4760454595088959, "eval_runtime": 1297.73, "eval_samples_per_second": 0.418, "eval_steps_per_second": 0.418, "step": 76000 }, { "epoch": 2.85, "learning_rate": 1.1949962428953965e-06, "loss": 0.3672, "step": 76500 }, { "epoch": 2.85, "eval_loss": 0.4760077893733978, "eval_runtime": 1293.9854, "eval_samples_per_second": 0.419, "eval_steps_per_second": 0.419, "step": 76500 }, { "epoch": 2.87, "learning_rate": 9.129108128541176e-07, "loss": 0.3658, "step": 77000 }, { "epoch": 2.87, "eval_loss": 0.47582224011421204, "eval_runtime": 1293.6591, "eval_samples_per_second": 0.419, "eval_steps_per_second": 0.419, "step": 77000 }, { "epoch": 2.89, "learning_rate": 6.685963879659362e-07, "loss": 0.3675, "step": 77500 }, { "epoch": 2.89, "eval_loss": 0.4758478105068207, "eval_runtime": 1311.0096, "eval_samples_per_second": 0.413, "eval_steps_per_second": 0.413, "step": 77500 }, { "epoch": 2.91, "learning_rate": 4.6214609844061894e-07, "loss": 0.3696, "step": 78000 }, { "epoch": 2.91, "eval_loss": 0.4757947325706482, "eval_runtime": 1268.9631, "eval_samples_per_second": 0.427, "eval_steps_per_second": 0.427, "step": 78000 }, { "epoch": 2.93, "learning_rate": 2.9363864105907967e-07, "loss": 0.3633, "step": 78500 }, { "epoch": 2.93, "eval_loss": 0.47572794556617737, "eval_runtime": 1284.2805, "eval_samples_per_second": 0.422, "eval_steps_per_second": 0.422, "step": 78500 }, { "epoch": 2.95, "learning_rate": 1.6313824917496555e-07, "loss": 0.3712, "step": 79000 }, { "epoch": 2.95, "eval_loss": 0.47579219937324524, "eval_runtime": 1333.9827, "eval_samples_per_second": 0.406, "eval_steps_per_second": 0.406, "step": 79000 }, { "epoch": 2.96, "learning_rate": 7.069466822952065e-08, "loss": 0.37, "step": 79500 }, { "epoch": 2.96, "eval_loss": 0.47579482197761536, "eval_runtime": 1343.7136, "eval_samples_per_second": 0.403, "eval_steps_per_second": 0.403, "step": 79500 }, { "epoch": 2.98, "learning_rate": 1.6343136789165324e-08, "loss": 0.3647, "step": 80000 }, { "epoch": 2.98, "eval_loss": 0.475759357213974, "eval_runtime": 1306.8248, "eval_samples_per_second": 0.415, "eval_steps_per_second": 0.415, "step": 80000 } ], "logging_steps": 500, "max_steps": 80463, "num_train_epochs": 3, "save_steps": 500, "total_flos": 2.2475568675952804e+19, "trial_name": null, "trial_params": null }