{ "best_metric": 0.5594063997268677, "best_model_checkpoint": "./qlora-out/checkpoint-36000", "epoch": 1.342231833264979, "global_step": 36000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 0.00019999938245325715, "loss": 0.9023, "step": 100 }, { "epoch": 0.01, "learning_rate": 0.00019999724773356797, "loss": 0.8027, "step": 200 }, { "epoch": 0.01, "learning_rate": 0.0001999935882494411, "loss": 0.8041, "step": 300 }, { "epoch": 0.01, "learning_rate": 0.00019998840405667672, "loss": 0.7944, "step": 400 }, { "epoch": 0.02, "learning_rate": 0.00019998169523432365, "loss": 0.81, "step": 500 }, { "epoch": 0.02, "learning_rate": 0.0001999734618846785, "loss": 0.7855, "step": 600 }, { "epoch": 0.03, "learning_rate": 0.00019996370413328385, "loss": 0.7849, "step": 700 }, { "epoch": 0.03, "learning_rate": 0.00019995242212892653, "loss": 0.7564, "step": 800 }, { "epoch": 0.03, "learning_rate": 0.00019993961604363532, "loss": 0.7724, "step": 900 }, { "epoch": 0.04, "learning_rate": 0.00019992528607267815, "loss": 0.7308, "step": 1000 }, { "epoch": 0.04, "eval_loss": 0.7677998542785645, "eval_runtime": 1774.3517, "eval_samples_per_second": 0.305, "eval_steps_per_second": 0.305, "step": 1000 }, { "epoch": 0.04, "learning_rate": 0.0001999094324345594, "loss": 0.7844, "step": 1100 }, { "epoch": 0.04, "learning_rate": 0.00019989205537101633, "loss": 0.7668, "step": 1200 }, { "epoch": 0.05, "learning_rate": 0.00019987315514701553, "loss": 0.7727, "step": 1300 }, { "epoch": 0.05, "learning_rate": 0.00019985273205074878, "loss": 0.7467, "step": 1400 }, { "epoch": 0.06, "learning_rate": 0.00019983078639362883, "loss": 0.7516, "step": 1500 }, { "epoch": 0.06, "learning_rate": 0.00019980731851028445, "loss": 0.7267, "step": 1600 }, { "epoch": 0.06, "learning_rate": 0.0001997823287585554, "loss": 0.7632, "step": 1700 }, { "epoch": 0.07, "learning_rate": 0.000199755817519487, "loss": 0.7392, "step": 1800 }, { "epoch": 0.07, "learning_rate": 0.00019972778519732436, "loss": 0.7528, "step": 1900 }, { "epoch": 0.07, "learning_rate": 0.0001996982322195061, "loss": 0.725, "step": 2000 }, { "epoch": 0.07, "eval_loss": 0.7452704310417175, "eval_runtime": 1787.7554, "eval_samples_per_second": 0.303, "eval_steps_per_second": 0.303, "step": 2000 }, { "epoch": 0.08, "learning_rate": 0.00019966715903665795, "loss": 0.7234, "step": 2100 }, { "epoch": 0.08, "learning_rate": 0.00019963456612258576, "loss": 0.754, "step": 2200 }, { "epoch": 0.09, "learning_rate": 0.00019960045397426841, "loss": 0.7856, "step": 2300 }, { "epoch": 0.09, "learning_rate": 0.00019956482311185006, "loss": 0.7387, "step": 2400 }, { "epoch": 0.09, "learning_rate": 0.00019952767407863245, "loss": 0.7309, "step": 2500 }, { "epoch": 0.1, "learning_rate": 0.00019948900744106633, "loss": 0.7232, "step": 2600 }, { "epoch": 0.1, "learning_rate": 0.00019944882378874316, "loss": 0.7406, "step": 2700 }, { "epoch": 0.1, "learning_rate": 0.0001994071237343858, "loss": 0.7166, "step": 2800 }, { "epoch": 0.11, "learning_rate": 0.00019936390791383936, "loss": 0.7308, "step": 2900 }, { "epoch": 0.11, "learning_rate": 0.00019931917698606143, "loss": 0.7288, "step": 3000 }, { "epoch": 0.11, "eval_loss": 0.7343490123748779, "eval_runtime": 1770.9966, "eval_samples_per_second": 0.306, "eval_steps_per_second": 0.306, "step": 3000 }, { "epoch": 0.12, "learning_rate": 0.00019927293163311206, "loss": 0.7236, "step": 3100 }, { "epoch": 0.12, "learning_rate": 0.00019922517256014337, "loss": 0.716, "step": 3200 }, { "epoch": 0.12, "learning_rate": 0.00019917590049538874, "loss": 0.7564, "step": 3300 }, { "epoch": 0.13, "learning_rate": 0.00019912511619015177, "loss": 0.7082, "step": 3400 }, { "epoch": 0.13, "learning_rate": 0.00019907282041879484, "loss": 0.7103, "step": 3500 }, { "epoch": 0.13, "learning_rate": 0.00019901901397872715, "loss": 0.7457, "step": 3600 }, { "epoch": 0.14, "learning_rate": 0.0001989636976903928, "loss": 0.7076, "step": 3700 }, { "epoch": 0.14, "learning_rate": 0.0001989068723972581, "loss": 0.7217, "step": 3800 }, { "epoch": 0.15, "learning_rate": 0.00019884853896579873, "loss": 0.7175, "step": 3900 }, { "epoch": 0.15, "learning_rate": 0.0001987886982854866, "loss": 0.7083, "step": 4000 }, { "epoch": 0.15, "eval_loss": 0.726176917552948, "eval_runtime": 1765.3933, "eval_samples_per_second": 0.307, "eval_steps_per_second": 0.307, "step": 4000 }, { "epoch": 0.15, "learning_rate": 0.00019872735126877622, "loss": 0.7228, "step": 4100 }, { "epoch": 0.16, "learning_rate": 0.0001986644988510909, "loss": 0.7133, "step": 4200 }, { "epoch": 0.16, "learning_rate": 0.00019860014199080822, "loss": 0.7243, "step": 4300 }, { "epoch": 0.16, "learning_rate": 0.00019853428166924576, "loss": 0.6929, "step": 4400 }, { "epoch": 0.17, "learning_rate": 0.00019846691889064593, "loss": 0.7392, "step": 4500 }, { "epoch": 0.17, "learning_rate": 0.0001983980546821607, "loss": 0.7247, "step": 4600 }, { "epoch": 0.18, "learning_rate": 0.0001983276900938359, "loss": 0.7258, "step": 4700 }, { "epoch": 0.18, "learning_rate": 0.00019825582619859532, "loss": 0.7197, "step": 4800 }, { "epoch": 0.18, "learning_rate": 0.0001981824640922242, "loss": 0.6906, "step": 4900 }, { "epoch": 0.19, "learning_rate": 0.00019810760489335266, "loss": 0.7274, "step": 5000 }, { "epoch": 0.19, "eval_loss": 0.7171670794487, "eval_runtime": 1812.7597, "eval_samples_per_second": 0.299, "eval_steps_per_second": 0.299, "step": 5000 }, { "epoch": 0.19, "learning_rate": 0.0001980312497434385, "loss": 0.7105, "step": 5100 }, { "epoch": 0.19, "learning_rate": 0.00019795339980675002, "loss": 0.7091, "step": 5200 }, { "epoch": 0.2, "learning_rate": 0.00019787405627034804, "loss": 0.7102, "step": 5300 }, { "epoch": 0.2, "learning_rate": 0.0001977932203440678, "loss": 0.7314, "step": 5400 }, { "epoch": 0.21, "learning_rate": 0.00019771089326050075, "loss": 0.6945, "step": 5500 }, { "epoch": 0.21, "learning_rate": 0.0001976270762749755, "loss": 0.7048, "step": 5600 }, { "epoch": 0.21, "learning_rate": 0.00019754177066553882, "loss": 0.6963, "step": 5700 }, { "epoch": 0.22, "learning_rate": 0.00019745497773293613, "loss": 0.711, "step": 5800 }, { "epoch": 0.22, "learning_rate": 0.0001973666988005916, "loss": 0.7017, "step": 5900 }, { "epoch": 0.22, "learning_rate": 0.00019727693521458806, "loss": 0.7287, "step": 6000 }, { "epoch": 0.22, "eval_loss": 0.710155725479126, "eval_runtime": 1786.7467, "eval_samples_per_second": 0.303, "eval_steps_per_second": 0.303, "step": 6000 }, { "epoch": 0.23, "learning_rate": 0.00019718568834364638, "loss": 0.6894, "step": 6100 }, { "epoch": 0.23, "learning_rate": 0.00019709295957910476, "loss": 0.7061, "step": 6200 }, { "epoch": 0.23, "learning_rate": 0.00019699875033489728, "loss": 0.7063, "step": 6300 }, { "epoch": 0.24, "learning_rate": 0.00019690306204753254, "loss": 0.6872, "step": 6400 }, { "epoch": 0.24, "learning_rate": 0.0001968058961760717, "loss": 0.7095, "step": 6500 }, { "epoch": 0.25, "learning_rate": 0.00019670725420210618, "loss": 0.695, "step": 6600 }, { "epoch": 0.25, "learning_rate": 0.0001966071376297351, "loss": 0.674, "step": 6700 }, { "epoch": 0.25, "learning_rate": 0.00019650554798554236, "loss": 0.7225, "step": 6800 }, { "epoch": 0.26, "learning_rate": 0.00019640248681857342, "loss": 0.6845, "step": 6900 }, { "epoch": 0.26, "learning_rate": 0.00019629795570031149, "loss": 0.6891, "step": 7000 }, { "epoch": 0.26, "eval_loss": 0.703677773475647, "eval_runtime": 1767.3593, "eval_samples_per_second": 0.307, "eval_steps_per_second": 0.307, "step": 7000 }, { "epoch": 0.26, "learning_rate": 0.00019619195622465379, "loss": 0.6962, "step": 7100 }, { "epoch": 0.27, "learning_rate": 0.0001960844900078871, "loss": 0.6779, "step": 7200 }, { "epoch": 0.27, "learning_rate": 0.00019597555868866318, "loss": 0.7354, "step": 7300 }, { "epoch": 0.28, "learning_rate": 0.00019586516392797374, "loss": 0.7196, "step": 7400 }, { "epoch": 0.28, "learning_rate": 0.0001957533074091252, "loss": 0.682, "step": 7500 }, { "epoch": 0.28, "learning_rate": 0.0001956399908377129, "loss": 0.6938, "step": 7600 }, { "epoch": 0.29, "learning_rate": 0.0001955252159415952, "loss": 0.6912, "step": 7700 }, { "epoch": 0.29, "learning_rate": 0.00019540898447086705, "loss": 0.7048, "step": 7800 }, { "epoch": 0.29, "learning_rate": 0.00019529129819783334, "loss": 0.7007, "step": 7900 }, { "epoch": 0.3, "learning_rate": 0.00019517215891698192, "loss": 0.6969, "step": 8000 }, { "epoch": 0.3, "eval_loss": 0.6973471641540527, "eval_runtime": 1793.2355, "eval_samples_per_second": 0.302, "eval_steps_per_second": 0.302, "step": 8000 }, { "epoch": 0.3, "learning_rate": 0.00019505156844495619, "loss": 0.6894, "step": 8100 }, { "epoch": 0.31, "learning_rate": 0.00019492952862052733, "loss": 0.6971, "step": 8200 }, { "epoch": 0.31, "learning_rate": 0.0001948060413045665, "loss": 0.7135, "step": 8300 }, { "epoch": 0.31, "learning_rate": 0.0001946811083800161, "loss": 0.6794, "step": 8400 }, { "epoch": 0.32, "learning_rate": 0.0001945547317518614, "loss": 0.7086, "step": 8500 }, { "epoch": 0.32, "learning_rate": 0.00019442691334710136, "loss": 0.7042, "step": 8600 }, { "epoch": 0.32, "learning_rate": 0.00019429765511471916, "loss": 0.6822, "step": 8700 }, { "epoch": 0.33, "learning_rate": 0.0001941669590256526, "loss": 0.7016, "step": 8800 }, { "epoch": 0.33, "learning_rate": 0.00019403482707276406, "loss": 0.705, "step": 8900 }, { "epoch": 0.34, "learning_rate": 0.00019390126127080999, "loss": 0.698, "step": 9000 }, { "epoch": 0.34, "eval_loss": 0.6910382509231567, "eval_runtime": 1782.1661, "eval_samples_per_second": 0.304, "eval_steps_per_second": 0.304, "step": 9000 }, { "epoch": 0.34, "learning_rate": 0.00019376626365641026, "loss": 0.6926, "step": 9100 }, { "epoch": 0.34, "learning_rate": 0.0001936298362880172, "loss": 0.6871, "step": 9200 }, { "epoch": 0.35, "learning_rate": 0.00019349198124588403, "loss": 0.6894, "step": 9300 }, { "epoch": 0.35, "learning_rate": 0.00019335270063203325, "loss": 0.6894, "step": 9400 }, { "epoch": 0.35, "learning_rate": 0.00019321199657022464, "loss": 0.7057, "step": 9500 }, { "epoch": 0.36, "learning_rate": 0.00019306987120592265, "loss": 0.6682, "step": 9600 }, { "epoch": 0.36, "learning_rate": 0.00019292632670626401, "loss": 0.6931, "step": 9700 }, { "epoch": 0.37, "learning_rate": 0.00019278136526002443, "loss": 0.7244, "step": 9800 }, { "epoch": 0.37, "learning_rate": 0.0001926349890775853, "loss": 0.6881, "step": 9900 }, { "epoch": 0.37, "learning_rate": 0.00019248720039090006, "loss": 0.6839, "step": 10000 }, { "epoch": 0.37, "eval_loss": 0.6857322454452515, "eval_runtime": 1760.8664, "eval_samples_per_second": 0.308, "eval_steps_per_second": 0.308, "step": 10000 }, { "epoch": 0.38, "learning_rate": 0.00019233800145346006, "loss": 0.6917, "step": 10100 }, { "epoch": 0.38, "learning_rate": 0.0001921873945402602, "loss": 0.6672, "step": 10200 }, { "epoch": 0.38, "learning_rate": 0.00019203538194776442, "loss": 0.6873, "step": 10300 }, { "epoch": 0.39, "learning_rate": 0.00019188196599387043, "loss": 0.6733, "step": 10400 }, { "epoch": 0.39, "learning_rate": 0.00019172714901787453, "loss": 0.706, "step": 10500 }, { "epoch": 0.4, "learning_rate": 0.00019157093338043583, "loss": 0.6848, "step": 10600 }, { "epoch": 0.4, "learning_rate": 0.00019141332146354042, "loss": 0.6728, "step": 10700 }, { "epoch": 0.4, "learning_rate": 0.00019125431567046494, "loss": 0.686, "step": 10800 }, { "epoch": 0.41, "learning_rate": 0.00019109391842573987, "loss": 0.6992, "step": 10900 }, { "epoch": 0.41, "learning_rate": 0.00019093213217511265, "loss": 0.6675, "step": 11000 }, { "epoch": 0.41, "eval_loss": 0.6794907450675964, "eval_runtime": 1782.5413, "eval_samples_per_second": 0.304, "eval_steps_per_second": 0.304, "step": 11000 }, { "epoch": 0.41, "learning_rate": 0.0001907689593855104, "loss": 0.6721, "step": 11100 }, { "epoch": 0.42, "learning_rate": 0.00019060440254500228, "loss": 0.6353, "step": 11200 }, { "epoch": 0.42, "learning_rate": 0.00019043846416276155, "loss": 0.6449, "step": 11300 }, { "epoch": 0.43, "learning_rate": 0.0001902711467690272, "loss": 0.6451, "step": 11400 }, { "epoch": 0.43, "learning_rate": 0.00019010245291506569, "loss": 0.6421, "step": 11500 }, { "epoch": 0.43, "learning_rate": 0.00018993238517313167, "loss": 0.6352, "step": 11600 }, { "epoch": 0.44, "learning_rate": 0.0001897609461364289, "loss": 0.6371, "step": 11700 }, { "epoch": 0.44, "learning_rate": 0.00018958813841907083, "loss": 0.623, "step": 11800 }, { "epoch": 0.44, "learning_rate": 0.00018941396465604063, "loss": 0.6533, "step": 11900 }, { "epoch": 0.45, "learning_rate": 0.00018923842750315095, "loss": 0.6371, "step": 12000 }, { "epoch": 0.45, "eval_loss": 0.6759930849075317, "eval_runtime": 1304.1351, "eval_samples_per_second": 0.416, "eval_steps_per_second": 0.416, "step": 12000 }, { "epoch": 0.45, "learning_rate": 0.00018906152963700358, "loss": 0.6664, "step": 12100 }, { "epoch": 0.45, "learning_rate": 0.00018888327375494847, "loss": 0.6644, "step": 12200 }, { "epoch": 0.46, "learning_rate": 0.00018870366257504274, "loss": 0.623, "step": 12300 }, { "epoch": 0.46, "learning_rate": 0.00018852269883600918, "loss": 0.6242, "step": 12400 }, { "epoch": 0.47, "learning_rate": 0.00018834038529719446, "loss": 0.6318, "step": 12500 }, { "epoch": 0.47, "learning_rate": 0.0001881567247385271, "loss": 0.6359, "step": 12600 }, { "epoch": 0.47, "learning_rate": 0.00018797171996047505, "loss": 0.6449, "step": 12700 }, { "epoch": 0.48, "learning_rate": 0.00018778537378400304, "loss": 0.6434, "step": 12800 }, { "epoch": 0.48, "learning_rate": 0.00018759768905052946, "loss": 0.6323, "step": 12900 }, { "epoch": 0.48, "learning_rate": 0.00018740866862188317, "loss": 0.6377, "step": 13000 }, { "epoch": 0.48, "eval_loss": 0.6696639060974121, "eval_runtime": 1238.6847, "eval_samples_per_second": 0.438, "eval_steps_per_second": 0.438, "step": 13000 }, { "epoch": 0.49, "learning_rate": 0.0001872183153802598, "loss": 0.6232, "step": 13100 }, { "epoch": 0.49, "learning_rate": 0.00018702663222817774, "loss": 0.6236, "step": 13200 }, { "epoch": 0.5, "learning_rate": 0.00018683362208843395, "loss": 0.6331, "step": 13300 }, { "epoch": 0.5, "learning_rate": 0.00018663928790405945, "loss": 0.6528, "step": 13400 }, { "epoch": 0.5, "learning_rate": 0.00018644363263827426, "loss": 0.6362, "step": 13500 }, { "epoch": 0.51, "learning_rate": 0.00018624665927444248, "loss": 0.6308, "step": 13600 }, { "epoch": 0.51, "learning_rate": 0.00018604837081602656, "loss": 0.6107, "step": 13700 }, { "epoch": 0.51, "learning_rate": 0.00018584877028654154, "loss": 0.6418, "step": 13800 }, { "epoch": 0.52, "learning_rate": 0.00018564786072950917, "loss": 0.6276, "step": 13900 }, { "epoch": 0.52, "learning_rate": 0.00018544564520841118, "loss": 0.6296, "step": 14000 }, { "epoch": 0.52, "eval_loss": 0.6651941537857056, "eval_runtime": 1277.2835, "eval_samples_per_second": 0.424, "eval_steps_per_second": 0.424, "step": 14000 }, { "epoch": 0.53, "learning_rate": 0.00018524212680664286, "loss": 0.636, "step": 14100 }, { "epoch": 0.53, "learning_rate": 0.00018503730862746574, "loss": 0.6643, "step": 14200 }, { "epoch": 0.53, "learning_rate": 0.00018483119379396058, "loss": 0.6282, "step": 14300 }, { "epoch": 0.54, "learning_rate": 0.0001846237854489796, "loss": 0.6381, "step": 14400 }, { "epoch": 0.54, "learning_rate": 0.00018441508675509844, "loss": 0.6692, "step": 14500 }, { "epoch": 0.54, "learning_rate": 0.00018420510089456823, "loss": 0.6478, "step": 14600 }, { "epoch": 0.55, "learning_rate": 0.00018399383106926676, "loss": 0.6293, "step": 14700 }, { "epoch": 0.55, "learning_rate": 0.00018378128050064988, "loss": 0.6406, "step": 14800 }, { "epoch": 0.56, "learning_rate": 0.0001835674524297023, "loss": 0.6407, "step": 14900 }, { "epoch": 0.56, "learning_rate": 0.0001833523501168881, "loss": 0.633, "step": 15000 }, { "epoch": 0.56, "eval_loss": 0.6601429581642151, "eval_runtime": 1275.6071, "eval_samples_per_second": 0.425, "eval_steps_per_second": 0.425, "step": 15000 }, { "epoch": 0.56, "learning_rate": 0.00018313597684210115, "loss": 0.6198, "step": 15100 }, { "epoch": 0.57, "learning_rate": 0.00018291833590461498, "loss": 0.6345, "step": 15200 }, { "epoch": 0.57, "learning_rate": 0.00018269943062303257, "loss": 0.6554, "step": 15300 }, { "epoch": 0.57, "learning_rate": 0.00018247926433523562, "loss": 0.6151, "step": 15400 }, { "epoch": 0.58, "learning_rate": 0.00018225784039833386, "loss": 0.6331, "step": 15500 }, { "epoch": 0.58, "learning_rate": 0.0001820351621886136, "loss": 0.6256, "step": 15600 }, { "epoch": 0.59, "learning_rate": 0.0001818112331014865, "loss": 0.6263, "step": 15700 }, { "epoch": 0.59, "learning_rate": 0.00018158605655143757, "loss": 0.6015, "step": 15800 }, { "epoch": 0.59, "learning_rate": 0.00018135963597197327, "loss": 0.6144, "step": 15900 }, { "epoch": 0.6, "learning_rate": 0.00018113197481556912, "loss": 0.613, "step": 16000 }, { "epoch": 0.6, "eval_loss": 0.6547831892967224, "eval_runtime": 1305.9645, "eval_samples_per_second": 0.415, "eval_steps_per_second": 0.415, "step": 16000 }, { "epoch": 0.6, "learning_rate": 0.00018090307655361701, "loss": 0.6354, "step": 16100 }, { "epoch": 0.6, "learning_rate": 0.00018067294467637228, "loss": 0.6349, "step": 16200 }, { "epoch": 0.61, "learning_rate": 0.00018044158269290054, "loss": 0.6127, "step": 16300 }, { "epoch": 0.61, "learning_rate": 0.00018020899413102412, "loss": 0.5977, "step": 16400 }, { "epoch": 0.62, "learning_rate": 0.00017997518253726834, "loss": 0.6213, "step": 16500 }, { "epoch": 0.62, "learning_rate": 0.00017974015147680734, "loss": 0.6168, "step": 16600 }, { "epoch": 0.62, "learning_rate": 0.00017950390453340978, "loss": 0.5978, "step": 16700 }, { "epoch": 0.63, "learning_rate": 0.0001792664453093842, "loss": 0.6201, "step": 16800 }, { "epoch": 0.63, "learning_rate": 0.000179027777425524, "loss": 0.6141, "step": 16900 }, { "epoch": 0.63, "learning_rate": 0.00017878790452105245, "loss": 0.6135, "step": 17000 }, { "epoch": 0.63, "eval_loss": 0.6480616927146912, "eval_runtime": 1347.9883, "eval_samples_per_second": 0.402, "eval_steps_per_second": 0.402, "step": 17000 }, { "epoch": 0.64, "learning_rate": 0.0001785468302535669, "loss": 0.6363, "step": 17100 }, { "epoch": 0.64, "learning_rate": 0.00017830455829898317, "loss": 0.6076, "step": 17200 }, { "epoch": 0.65, "learning_rate": 0.00017806109235147963, "loss": 0.609, "step": 17300 }, { "epoch": 0.65, "learning_rate": 0.00017781643612344058, "loss": 0.6044, "step": 17400 }, { "epoch": 0.65, "learning_rate": 0.00017757059334539994, "loss": 0.6262, "step": 17500 }, { "epoch": 0.66, "learning_rate": 0.00017732356776598403, "loss": 0.6195, "step": 17600 }, { "epoch": 0.66, "learning_rate": 0.0001770753631518548, "loss": 0.6328, "step": 17700 }, { "epoch": 0.66, "learning_rate": 0.000176825983287652, "loss": 0.6028, "step": 17800 }, { "epoch": 0.67, "learning_rate": 0.0001765754319759358, "loss": 0.6159, "step": 17900 }, { "epoch": 0.67, "learning_rate": 0.0001763237130371287, "loss": 0.6169, "step": 18000 }, { "epoch": 0.67, "eval_loss": 0.6444052457809448, "eval_runtime": 1304.3701, "eval_samples_per_second": 0.416, "eval_steps_per_second": 0.416, "step": 18000 }, { "epoch": 0.67, "learning_rate": 0.0001760708303094572, "loss": 0.6183, "step": 18100 }, { "epoch": 0.68, "learning_rate": 0.00017581678764889324, "loss": 0.6116, "step": 18200 }, { "epoch": 0.68, "learning_rate": 0.00017556158892909567, "loss": 0.6406, "step": 18300 }, { "epoch": 0.69, "learning_rate": 0.00017530523804135085, "loss": 0.6223, "step": 18400 }, { "epoch": 0.69, "learning_rate": 0.00017504773889451361, "loss": 0.628, "step": 18500 }, { "epoch": 0.69, "learning_rate": 0.00017478909541494736, "loss": 0.6173, "step": 18600 }, { "epoch": 0.7, "learning_rate": 0.00017452931154646444, "loss": 0.61, "step": 18700 }, { "epoch": 0.7, "learning_rate": 0.00017426839125026598, "loss": 0.5959, "step": 18800 }, { "epoch": 0.7, "learning_rate": 0.00017400633850488128, "loss": 0.5979, "step": 18900 }, { "epoch": 0.71, "learning_rate": 0.00017374315730610745, "loss": 0.6161, "step": 19000 }, { "epoch": 0.71, "eval_loss": 0.6378119587898254, "eval_runtime": 1283.5987, "eval_samples_per_second": 0.422, "eval_steps_per_second": 0.422, "step": 19000 }, { "epoch": 0.71, "learning_rate": 0.00017347885166694825, "loss": 0.6213, "step": 19100 }, { "epoch": 0.72, "learning_rate": 0.00017321342561755297, "loss": 0.6217, "step": 19200 }, { "epoch": 0.72, "learning_rate": 0.00017294688320515506, "loss": 0.6127, "step": 19300 }, { "epoch": 0.72, "learning_rate": 0.00017267922849401024, "loss": 0.6145, "step": 19400 }, { "epoch": 0.73, "learning_rate": 0.00017241046556533472, "loss": 0.5936, "step": 19500 }, { "epoch": 0.73, "learning_rate": 0.0001721405985172428, "loss": 0.6273, "step": 19600 }, { "epoch": 0.73, "learning_rate": 0.0001718696314646846, "loss": 0.6059, "step": 19700 }, { "epoch": 0.74, "learning_rate": 0.000171597568539383, "loss": 0.5934, "step": 19800 }, { "epoch": 0.74, "learning_rate": 0.000171324413889771, "loss": 0.6243, "step": 19900 }, { "epoch": 0.75, "learning_rate": 0.00017105017168092808, "loss": 0.6164, "step": 20000 }, { "epoch": 0.75, "eval_loss": 0.6324757933616638, "eval_runtime": 1266.6769, "eval_samples_per_second": 0.428, "eval_steps_per_second": 0.428, "step": 20000 }, { "epoch": 0.75, "learning_rate": 0.0001707748460945171, "loss": 0.5953, "step": 20100 }, { "epoch": 0.75, "learning_rate": 0.0001704984413287202, "loss": 0.6329, "step": 20200 }, { "epoch": 0.76, "learning_rate": 0.00017022096159817493, "loss": 0.6227, "step": 20300 }, { "epoch": 0.76, "learning_rate": 0.00016994241113391003, "loss": 0.6022, "step": 20400 }, { "epoch": 0.76, "learning_rate": 0.0001696627941832808, "loss": 0.604, "step": 20500 }, { "epoch": 0.77, "learning_rate": 0.0001693821150099044, "loss": 0.6101, "step": 20600 }, { "epoch": 0.77, "learning_rate": 0.00016910037789359485, "loss": 0.6242, "step": 20700 }, { "epoch": 0.78, "learning_rate": 0.00016881758713029776, "loss": 0.6096, "step": 20800 }, { "epoch": 0.78, "learning_rate": 0.0001685337470320248, "loss": 0.5948, "step": 20900 }, { "epoch": 0.78, "learning_rate": 0.0001682488619267879, "loss": 0.5911, "step": 21000 }, { "epoch": 0.78, "eval_loss": 0.6282580494880676, "eval_runtime": 1313.1215, "eval_samples_per_second": 0.413, "eval_steps_per_second": 0.413, "step": 21000 }, { "epoch": 0.79, "learning_rate": 0.0001679629361585335, "loss": 0.5716, "step": 21100 }, { "epoch": 0.79, "learning_rate": 0.00016767597408707594, "loss": 0.5957, "step": 21200 }, { "epoch": 0.79, "learning_rate": 0.00016738798008803128, "loss": 0.6308, "step": 21300 }, { "epoch": 0.8, "learning_rate": 0.00016709895855275048, "loss": 0.5891, "step": 21400 }, { "epoch": 0.8, "learning_rate": 0.00016680891388825243, "loss": 0.6104, "step": 21500 }, { "epoch": 0.81, "learning_rate": 0.00016651785051715674, "loss": 0.6344, "step": 21600 }, { "epoch": 0.81, "learning_rate": 0.0001662257728776163, "loss": 0.604, "step": 21700 }, { "epoch": 0.81, "learning_rate": 0.0001659326854232497, "loss": 0.6066, "step": 21800 }, { "epoch": 0.82, "learning_rate": 0.0001656385926230732, "loss": 0.6324, "step": 21900 }, { "epoch": 0.82, "learning_rate": 0.00016534349896143264, "loss": 0.5819, "step": 22000 }, { "epoch": 0.82, "eval_loss": 0.6218891143798828, "eval_runtime": 1296.6038, "eval_samples_per_second": 0.418, "eval_steps_per_second": 0.418, "step": 22000 }, { "epoch": 0.82, "learning_rate": 0.00016504740893793512, "loss": 0.6145, "step": 22100 }, { "epoch": 0.83, "learning_rate": 0.00016475032706738023, "loss": 0.6109, "step": 22200 }, { "epoch": 0.83, "learning_rate": 0.0001644522578796914, "loss": 0.608, "step": 22300 }, { "epoch": 0.84, "learning_rate": 0.0001641532059198466, "loss": 0.565, "step": 22400 }, { "epoch": 0.84, "learning_rate": 0.00016385317574780942, "loss": 0.6139, "step": 22500 }, { "epoch": 0.84, "learning_rate": 0.000163552171938459, "loss": 0.5888, "step": 22600 }, { "epoch": 0.85, "learning_rate": 0.00016325019908152078, "loss": 0.6065, "step": 22700 }, { "epoch": 0.85, "learning_rate": 0.0001629472617814962, "loss": 0.5959, "step": 22800 }, { "epoch": 0.85, "learning_rate": 0.00016264336465759258, "loss": 0.5918, "step": 22900 }, { "epoch": 0.86, "learning_rate": 0.0001623385123436528, "loss": 0.6083, "step": 23000 }, { "epoch": 0.86, "eval_loss": 0.6180054545402527, "eval_runtime": 1278.5639, "eval_samples_per_second": 0.424, "eval_steps_per_second": 0.424, "step": 23000 }, { "epoch": 0.86, "learning_rate": 0.0001620327094880844, "loss": 0.5795, "step": 23100 }, { "epoch": 0.86, "learning_rate": 0.00016172596075378893, "loss": 0.6025, "step": 23200 }, { "epoch": 0.87, "learning_rate": 0.00016141827081809075, "loss": 0.5669, "step": 23300 }, { "epoch": 0.87, "learning_rate": 0.00016110964437266568, "loss": 0.6172, "step": 23400 }, { "epoch": 0.88, "learning_rate": 0.00016080008612346955, "loss": 0.5899, "step": 23500 }, { "epoch": 0.88, "learning_rate": 0.00016048960079066636, "loss": 0.5889, "step": 23600 }, { "epoch": 0.88, "learning_rate": 0.00016017819310855632, "loss": 0.5893, "step": 23700 }, { "epoch": 0.89, "learning_rate": 0.00015986586782550376, "loss": 0.6363, "step": 23800 }, { "epoch": 0.89, "learning_rate": 0.00015955262970386458, "loss": 0.5876, "step": 23900 }, { "epoch": 0.89, "learning_rate": 0.00015923848351991372, "loss": 0.5964, "step": 24000 }, { "epoch": 0.89, "eval_loss": 0.6122664213180542, "eval_runtime": 1255.2341, "eval_samples_per_second": 0.432, "eval_steps_per_second": 0.432, "step": 24000 }, { "epoch": 0.9, "learning_rate": 0.00015892343406377225, "loss": 0.5943, "step": 24100 }, { "epoch": 0.9, "learning_rate": 0.00015860748613933455, "loss": 0.6008, "step": 24200 }, { "epoch": 0.91, "learning_rate": 0.00015829064456419477, "loss": 0.6123, "step": 24300 }, { "epoch": 0.91, "learning_rate": 0.00015797291416957355, "loss": 0.5819, "step": 24400 }, { "epoch": 0.91, "learning_rate": 0.00015765429980024425, "loss": 0.5731, "step": 24500 }, { "epoch": 0.92, "learning_rate": 0.00015733480631445926, "loss": 0.593, "step": 24600 }, { "epoch": 0.92, "learning_rate": 0.00015701443858387562, "loss": 0.5764, "step": 24700 }, { "epoch": 0.92, "learning_rate": 0.00015669320149348104, "loss": 0.6037, "step": 24800 }, { "epoch": 0.93, "learning_rate": 0.0001563710999415193, "loss": 0.5958, "step": 24900 }, { "epoch": 0.93, "learning_rate": 0.00015604813883941535, "loss": 0.6186, "step": 25000 }, { "epoch": 0.93, "eval_loss": 0.6086174249649048, "eval_runtime": 1260.3923, "eval_samples_per_second": 0.43, "eval_steps_per_second": 0.43, "step": 25000 }, { "epoch": 0.94, "learning_rate": 0.00015572432311170096, "loss": 0.597, "step": 25100 }, { "epoch": 0.94, "learning_rate": 0.00015539965769593894, "loss": 0.5657, "step": 25200 }, { "epoch": 0.94, "learning_rate": 0.0001550741475426484, "loss": 0.6081, "step": 25300 }, { "epoch": 0.95, "learning_rate": 0.00015474779761522894, "loss": 0.5957, "step": 25400 }, { "epoch": 0.95, "learning_rate": 0.00015442061288988525, "loss": 0.6032, "step": 25500 }, { "epoch": 0.95, "learning_rate": 0.00015409259835555089, "loss": 0.5662, "step": 25600 }, { "epoch": 0.96, "learning_rate": 0.00015376375901381256, "loss": 0.5607, "step": 25700 }, { "epoch": 0.96, "learning_rate": 0.00015343409987883354, "loss": 0.5727, "step": 25800 }, { "epoch": 0.97, "learning_rate": 0.00015310362597727747, "loss": 0.5762, "step": 25900 }, { "epoch": 0.97, "learning_rate": 0.00015277234234823154, "loss": 0.5841, "step": 26000 }, { "epoch": 0.97, "eval_loss": 0.6026987433433533, "eval_runtime": 1292.1515, "eval_samples_per_second": 0.419, "eval_steps_per_second": 0.419, "step": 26000 }, { "epoch": 0.97, "learning_rate": 0.00015244025404312974, "loss": 0.6015, "step": 26100 }, { "epoch": 0.98, "learning_rate": 0.00015210736612567588, "loss": 0.5914, "step": 26200 }, { "epoch": 0.98, "learning_rate": 0.00015177368367176616, "loss": 0.5799, "step": 26300 }, { "epoch": 0.98, "learning_rate": 0.00015143921176941205, "loss": 0.6037, "step": 26400 }, { "epoch": 0.99, "learning_rate": 0.00015110395551866255, "loss": 0.5876, "step": 26500 }, { "epoch": 0.99, "learning_rate": 0.0001507679200315264, "loss": 0.5973, "step": 26600 }, { "epoch": 1.0, "learning_rate": 0.00015043111043189423, "loss": 0.5957, "step": 26700 }, { "epoch": 1.0, "learning_rate": 0.00015009353185546046, "loss": 0.5696, "step": 26800 }, { "epoch": 1.0, "learning_rate": 0.00014975518944964478, "loss": 0.5523, "step": 26900 }, { "epoch": 1.01, "learning_rate": 0.0001494160883735139, "loss": 0.5144, "step": 27000 }, { "epoch": 1.01, "eval_loss": 0.5985096096992493, "eval_runtime": 1314.8131, "eval_samples_per_second": 0.412, "eval_steps_per_second": 0.412, "step": 27000 }, { "epoch": 1.01, "learning_rate": 0.00014907623379770263, "loss": 0.5743, "step": 27100 }, { "epoch": 1.01, "learning_rate": 0.00014873563090433547, "loss": 0.5095, "step": 27200 }, { "epoch": 1.02, "learning_rate": 0.00014839428488694706, "loss": 0.5391, "step": 27300 }, { "epoch": 1.02, "learning_rate": 0.00014805220095040334, "loss": 0.5532, "step": 27400 }, { "epoch": 1.03, "learning_rate": 0.00014770938431082212, "loss": 0.536, "step": 27500 }, { "epoch": 1.03, "learning_rate": 0.00014736584019549342, "loss": 0.5204, "step": 27600 }, { "epoch": 1.03, "learning_rate": 0.00014702157384279997, "loss": 0.5026, "step": 27700 }, { "epoch": 1.04, "learning_rate": 0.0001466765905021371, "loss": 0.5319, "step": 27800 }, { "epoch": 1.04, "learning_rate": 0.00014633089543383295, "loss": 0.5112, "step": 27900 }, { "epoch": 1.04, "learning_rate": 0.00014598449390906804, "loss": 0.5146, "step": 28000 }, { "epoch": 1.04, "eval_loss": 0.5959522128105164, "eval_runtime": 1288.6066, "eval_samples_per_second": 0.421, "eval_steps_per_second": 0.421, "step": 28000 }, { "epoch": 1.05, "learning_rate": 0.00014563739120979497, "loss": 0.5262, "step": 28100 }, { "epoch": 1.05, "learning_rate": 0.00014528959262865798, "loss": 0.5082, "step": 28200 }, { "epoch": 1.06, "learning_rate": 0.00014494110346891206, "loss": 0.5094, "step": 28300 }, { "epoch": 1.06, "learning_rate": 0.00014459192904434226, "loss": 0.5012, "step": 28400 }, { "epoch": 1.06, "learning_rate": 0.0001442420746791826, "loss": 0.4946, "step": 28500 }, { "epoch": 1.07, "learning_rate": 0.00014389154570803477, "loss": 0.5138, "step": 28600 }, { "epoch": 1.07, "learning_rate": 0.000143540347475787, "loss": 0.5082, "step": 28700 }, { "epoch": 1.07, "learning_rate": 0.0001431884853375325, "loss": 0.4842, "step": 28800 }, { "epoch": 1.08, "learning_rate": 0.0001428359646584876, "loss": 0.5143, "step": 28900 }, { "epoch": 1.08, "learning_rate": 0.00014248279081391022, "loss": 0.5029, "step": 29000 }, { "epoch": 1.08, "eval_loss": 0.5910914540290833, "eval_runtime": 1278.8257, "eval_samples_per_second": 0.424, "eval_steps_per_second": 0.424, "step": 29000 }, { "epoch": 1.08, "learning_rate": 0.00014212896918901774, "loss": 0.5003, "step": 29100 }, { "epoch": 1.09, "learning_rate": 0.00014177450517890503, "loss": 0.5102, "step": 29200 }, { "epoch": 1.09, "learning_rate": 0.0001414194041884619, "loss": 0.524, "step": 29300 }, { "epoch": 1.1, "learning_rate": 0.0001410636716322911, "loss": 0.5168, "step": 29400 }, { "epoch": 1.1, "learning_rate": 0.0001407073129346254, "loss": 0.514, "step": 29500 }, { "epoch": 1.1, "learning_rate": 0.00014035033352924502, "loss": 0.5084, "step": 29600 }, { "epoch": 1.11, "learning_rate": 0.0001399927388593948, "loss": 0.5203, "step": 29700 }, { "epoch": 1.11, "learning_rate": 0.00013963453437770119, "loss": 0.5226, "step": 29800 }, { "epoch": 1.11, "learning_rate": 0.000139275725546089, "loss": 0.5055, "step": 29900 }, { "epoch": 1.12, "learning_rate": 0.00013891631783569838, "loss": 0.5303, "step": 30000 }, { "epoch": 1.12, "eval_loss": 0.5869857668876648, "eval_runtime": 1272.0282, "eval_samples_per_second": 0.426, "eval_steps_per_second": 0.426, "step": 30000 }, { "epoch": 1.12, "learning_rate": 0.00013855631672680106, "loss": 0.5243, "step": 30100 }, { "epoch": 1.13, "learning_rate": 0.00013819572770871702, "loss": 0.5148, "step": 30200 }, { "epoch": 1.13, "learning_rate": 0.00013783455627973062, "loss": 0.522, "step": 30300 }, { "epoch": 1.13, "learning_rate": 0.00013747280794700707, "loss": 0.5289, "step": 30400 }, { "epoch": 1.14, "learning_rate": 0.00013711048822650802, "loss": 0.4996, "step": 30500 }, { "epoch": 1.14, "learning_rate": 0.00013674760264290785, "loss": 0.5099, "step": 30600 }, { "epoch": 1.14, "learning_rate": 0.0001363841567295091, "loss": 0.5219, "step": 30700 }, { "epoch": 1.15, "learning_rate": 0.00013602015602815837, "loss": 0.5297, "step": 30800 }, { "epoch": 1.15, "learning_rate": 0.00013565560608916165, "loss": 0.5029, "step": 30900 }, { "epoch": 1.16, "learning_rate": 0.0001352905124711998, "loss": 0.5266, "step": 31000 }, { "epoch": 1.16, "eval_loss": 0.5811149477958679, "eval_runtime": 1300.1475, "eval_samples_per_second": 0.417, "eval_steps_per_second": 0.417, "step": 31000 }, { "epoch": 1.16, "learning_rate": 0.00013492488074124366, "loss": 0.5295, "step": 31100 }, { "epoch": 1.16, "learning_rate": 0.00013455871647446923, "loss": 0.539, "step": 31200 }, { "epoch": 1.17, "learning_rate": 0.00013419202525417277, "loss": 0.5217, "step": 31300 }, { "epoch": 1.17, "learning_rate": 0.0001338248126716854, "loss": 0.5197, "step": 31400 }, { "epoch": 1.17, "learning_rate": 0.00013345708432628824, "loss": 0.4991, "step": 31500 }, { "epoch": 1.18, "learning_rate": 0.00013308884582512647, "loss": 0.5239, "step": 31600 }, { "epoch": 1.18, "learning_rate": 0.00013272010278312453, "loss": 0.4899, "step": 31700 }, { "epoch": 1.19, "learning_rate": 0.00013235086082289977, "loss": 0.5088, "step": 31800 }, { "epoch": 1.19, "learning_rate": 0.00013198112557467732, "loss": 0.5497, "step": 31900 }, { "epoch": 1.19, "learning_rate": 0.00013161090267620396, "loss": 0.5024, "step": 32000 }, { "epoch": 1.19, "eval_loss": 0.5758991241455078, "eval_runtime": 1292.3362, "eval_samples_per_second": 0.419, "eval_steps_per_second": 0.419, "step": 32000 }, { "epoch": 1.2, "learning_rate": 0.0001312401977726621, "loss": 0.534, "step": 32100 }, { "epoch": 1.2, "learning_rate": 0.0001308690165165839, "loss": 0.4936, "step": 32200 }, { "epoch": 1.2, "learning_rate": 0.00013049736456776485, "loss": 0.4999, "step": 32300 }, { "epoch": 1.21, "learning_rate": 0.00013012524759317774, "loss": 0.5238, "step": 32400 }, { "epoch": 1.21, "learning_rate": 0.000129752671266886, "loss": 0.4959, "step": 32500 }, { "epoch": 1.22, "learning_rate": 0.00012937964126995727, "loss": 0.514, "step": 32600 }, { "epoch": 1.22, "learning_rate": 0.00012900616329037694, "loss": 0.4964, "step": 32700 }, { "epoch": 1.22, "learning_rate": 0.00012863224302296107, "loss": 0.5054, "step": 32800 }, { "epoch": 1.23, "learning_rate": 0.0001282578861692699, "loss": 0.5079, "step": 32900 }, { "epoch": 1.23, "learning_rate": 0.0001278830984375206, "loss": 0.4929, "step": 33000 }, { "epoch": 1.23, "eval_loss": 0.5719351172447205, "eval_runtime": 1267.7603, "eval_samples_per_second": 0.428, "eval_steps_per_second": 0.428, "step": 33000 }, { "epoch": 1.23, "learning_rate": 0.0001275078855425007, "loss": 0.4971, "step": 33100 }, { "epoch": 1.24, "learning_rate": 0.0001271322532054803, "loss": 0.4977, "step": 33200 }, { "epoch": 1.24, "learning_rate": 0.0001267562071541254, "loss": 0.499, "step": 33300 }, { "epoch": 1.25, "learning_rate": 0.00012637975312241022, "loss": 0.5044, "step": 33400 }, { "epoch": 1.25, "learning_rate": 0.00012600289685052996, "loss": 0.5019, "step": 33500 }, { "epoch": 1.25, "learning_rate": 0.00012562564408481327, "loss": 0.5225, "step": 33600 }, { "epoch": 1.26, "learning_rate": 0.00012524800057763438, "loss": 0.5503, "step": 33700 }, { "epoch": 1.26, "learning_rate": 0.00012486997208732573, "loss": 0.5025, "step": 33800 }, { "epoch": 1.26, "learning_rate": 0.0001244915643780899, "loss": 0.5187, "step": 33900 }, { "epoch": 1.27, "learning_rate": 0.00012411278321991195, "loss": 0.5199, "step": 34000 }, { "epoch": 1.27, "eval_loss": 0.5665221810340881, "eval_runtime": 1263.0264, "eval_samples_per_second": 0.429, "eval_steps_per_second": 0.429, "step": 34000 }, { "epoch": 1.27, "learning_rate": 0.00012373363438847117, "loss": 0.5135, "step": 34100 }, { "epoch": 1.28, "learning_rate": 0.00012335412366505324, "loss": 0.5065, "step": 34200 }, { "epoch": 1.28, "learning_rate": 0.000122974256836462, "loss": 0.5223, "step": 34300 }, { "epoch": 1.28, "learning_rate": 0.00012259403969493114, "loss": 0.4946, "step": 34400 }, { "epoch": 1.29, "learning_rate": 0.00012221347803803605, "loss": 0.5105, "step": 34500 }, { "epoch": 1.29, "learning_rate": 0.00012183257766860514, "loss": 0.4812, "step": 34600 }, { "epoch": 1.29, "learning_rate": 0.00012145134439463178, "loss": 0.4981, "step": 34700 }, { "epoch": 1.3, "learning_rate": 0.0001210697840291852, "loss": 0.5038, "step": 34800 }, { "epoch": 1.3, "learning_rate": 0.00012068790239032241, "loss": 0.5551, "step": 34900 }, { "epoch": 1.3, "learning_rate": 0.00012030570530099902, "loss": 0.4964, "step": 35000 }, { "epoch": 1.3, "eval_loss": 0.562954843044281, "eval_runtime": 1252.1434, "eval_samples_per_second": 0.433, "eval_steps_per_second": 0.433, "step": 35000 }, { "epoch": 1.31, "learning_rate": 0.00011992319858898077, "loss": 0.4952, "step": 35100 }, { "epoch": 1.31, "learning_rate": 0.0001195403880867545, "loss": 0.5157, "step": 35200 }, { "epoch": 1.32, "learning_rate": 0.00011915727963143922, "loss": 0.4973, "step": 35300 }, { "epoch": 1.32, "learning_rate": 0.00011877387906469721, "loss": 0.4884, "step": 35400 }, { "epoch": 1.32, "learning_rate": 0.00011839019223264489, "loss": 0.5017, "step": 35500 }, { "epoch": 1.33, "learning_rate": 0.00011800622498576363, "loss": 0.5157, "step": 35600 }, { "epoch": 1.33, "learning_rate": 0.00011762198317881059, "loss": 0.4774, "step": 35700 }, { "epoch": 1.33, "learning_rate": 0.0001172374726707295, "loss": 0.4855, "step": 35800 }, { "epoch": 1.34, "learning_rate": 0.00011685269932456115, "loss": 0.5134, "step": 35900 }, { "epoch": 1.34, "learning_rate": 0.00011646766900735422, "loss": 0.5143, "step": 36000 }, { "epoch": 1.34, "eval_loss": 0.5594063997268677, "eval_runtime": 1270.0722, "eval_samples_per_second": 0.427, "eval_steps_per_second": 0.427, "step": 36000 } ], "max_steps": 80463, "num_train_epochs": 3, "total_flos": 1.009532895096152e+19, "trial_name": null, "trial_params": null }