diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,6 +1,6 @@ { - "best_metric": 2.7100777626037598, - "best_model_checkpoint": "./model_tweets_2020_Q2/checkpoint-192000", + "best_metric": 2.671285629272461, + "best_model_checkpoint": "./model_tweets_2020_Q2/checkpoint-32000", "epoch": 19.569471624266146, "eval_steps": 8000, "global_step": 2400000, @@ -10,3319 +10,3319 @@ "log_history": [ { "epoch": 0.07, - "eval_loss": 2.977957248687744, - "eval_runtime": 125.5213, - "eval_samples_per_second": 822.769, - "eval_steps_per_second": 51.426, + "eval_loss": 2.635031223297119, + "eval_runtime": 126.0563, + "eval_samples_per_second": 819.277, + "eval_steps_per_second": 51.207, "step": 8000 }, { "epoch": 0.13, "learning_rate": 9.939131159843243e-06, - "loss": 3.1296, + "loss": 2.7848, "step": 16000 }, { "epoch": 0.13, - "eval_loss": 2.894831418991089, - "eval_runtime": 126.0129, - "eval_samples_per_second": 819.559, - "eval_steps_per_second": 51.225, + "eval_loss": 2.6555588245391846, + "eval_runtime": 126.7417, + "eval_samples_per_second": 814.846, + "eval_steps_per_second": 50.93, "step": 16000 }, { "epoch": 0.2, - "eval_loss": 2.8589611053466797, - "eval_runtime": 125.8909, - "eval_samples_per_second": 820.353, - "eval_steps_per_second": 51.275, + "eval_loss": 2.6695027351379395, + "eval_runtime": 125.9524, + "eval_samples_per_second": 819.953, + "eval_steps_per_second": 51.25, "step": 24000 }, { "epoch": 0.26, "learning_rate": 9.872425581589261e-06, - "loss": 2.9018, + "loss": 2.7545, "step": 32000 }, { "epoch": 0.26, - "eval_loss": 2.8033480644226074, - "eval_runtime": 125.7264, - "eval_samples_per_second": 821.427, - "eval_steps_per_second": 51.342, + "eval_loss": 2.671285629272461, + "eval_runtime": 126.9886, + "eval_samples_per_second": 813.262, + "eval_steps_per_second": 50.831, "step": 32000 }, { "epoch": 0.33, - "eval_loss": 2.7938032150268555, - "eval_runtime": 125.7192, - "eval_samples_per_second": 821.474, - "eval_steps_per_second": 51.345, + "eval_loss": 2.708911895751953, + "eval_runtime": 126.0433, + "eval_samples_per_second": 819.361, + "eval_steps_per_second": 51.213, "step": 40000 }, { "epoch": 0.39, "learning_rate": 9.80572000333528e-06, - "loss": 2.8331, + "loss": 2.7717, "step": 48000 }, { "epoch": 0.39, - "eval_loss": 2.7694976329803467, - "eval_runtime": 127.0405, - "eval_samples_per_second": 812.93, - "eval_steps_per_second": 50.811, + "eval_loss": 2.7143805027008057, + "eval_runtime": 126.2378, + "eval_samples_per_second": 818.099, + "eval_steps_per_second": 51.134, "step": 48000 }, { "epoch": 0.46, - "eval_loss": 2.7614457607269287, - "eval_runtime": 125.7185, - "eval_samples_per_second": 821.478, - "eval_steps_per_second": 51.345, + "eval_loss": 2.7240307331085205, + "eval_runtime": 125.5002, + "eval_samples_per_second": 822.907, + "eval_steps_per_second": 51.434, "step": 56000 }, { "epoch": 0.52, "learning_rate": 9.739014425081299e-06, - "loss": 2.7723, + "loss": 2.8043, "step": 64000 }, { "epoch": 0.52, - "eval_loss": 2.7416625022888184, - "eval_runtime": 126.1624, - "eval_samples_per_second": 818.588, - "eval_steps_per_second": 51.164, + "eval_loss": 2.749925374984741, + "eval_runtime": 126.3275, + "eval_samples_per_second": 817.518, + "eval_steps_per_second": 51.097, "step": 64000 }, { "epoch": 0.59, - "eval_loss": 2.7248806953430176, - "eval_runtime": 126.0454, - "eval_samples_per_second": 819.348, - "eval_steps_per_second": 51.212, + "eval_loss": 2.770448684692383, + "eval_runtime": 115.1543, + "eval_samples_per_second": 896.84, + "eval_steps_per_second": 56.055, "step": 72000 }, { "epoch": 0.65, "learning_rate": 9.672308846827316e-06, - "loss": 2.75, + "loss": 2.8401, "step": 80000 }, { "epoch": 0.65, - "eval_loss": 2.7202229499816895, - "eval_runtime": 126.948, - "eval_samples_per_second": 813.522, - "eval_steps_per_second": 50.848, + "eval_loss": 2.782008409500122, + "eval_runtime": 116.1441, + "eval_samples_per_second": 889.197, + "eval_steps_per_second": 55.578, "step": 80000 }, { "epoch": 0.72, - "eval_loss": 2.7112038135528564, - "eval_runtime": 126.8524, - "eval_samples_per_second": 814.135, - "eval_steps_per_second": 50.886, + "eval_loss": 2.8068478107452393, + "eval_runtime": 116.2984, + "eval_samples_per_second": 888.017, + "eval_steps_per_second": 55.504, "step": 88000 }, { "epoch": 0.78, "learning_rate": 9.605603268573334e-06, - "loss": 2.735, + "loss": 2.8723, "step": 96000 }, { "epoch": 0.78, - "eval_loss": 2.7228710651397705, - "eval_runtime": 126.981, - "eval_samples_per_second": 813.311, - "eval_steps_per_second": 50.834, + "eval_loss": 2.8150370121002197, + "eval_runtime": 116.0456, + "eval_samples_per_second": 889.952, + "eval_steps_per_second": 55.625, "step": 96000 }, { "epoch": 0.85, - "eval_loss": 2.7370951175689697, - "eval_runtime": 126.6893, - "eval_samples_per_second": 815.183, - "eval_steps_per_second": 50.951, + "eval_loss": 2.8410351276397705, + "eval_runtime": 114.7666, + "eval_samples_per_second": 899.87, + "eval_steps_per_second": 56.245, "step": 104000 }, { "epoch": 0.91, "learning_rate": 9.538897690319354e-06, - "loss": 2.7137, + "loss": 2.9004, "step": 112000 }, { "epoch": 0.91, - "eval_loss": 2.7059037685394287, - "eval_runtime": 126.3306, - "eval_samples_per_second": 817.498, - "eval_steps_per_second": 51.096, + "eval_loss": 2.865703582763672, + "eval_runtime": 115.4628, + "eval_samples_per_second": 894.444, + "eval_steps_per_second": 55.905, "step": 112000 }, { "epoch": 0.98, - "eval_loss": 2.7120730876922607, - "eval_runtime": 126.2744, - "eval_samples_per_second": 817.862, - "eval_steps_per_second": 51.119, + "eval_loss": 2.882617950439453, + "eval_runtime": 116.5627, + "eval_samples_per_second": 886.004, + "eval_steps_per_second": 55.378, "step": 120000 }, { "epoch": 1.04, "learning_rate": 9.472192112065373e-06, - "loss": 2.7155, + "loss": 2.9396, "step": 128000 }, { "epoch": 1.04, - "eval_loss": 2.7248668670654297, - "eval_runtime": 126.8126, - "eval_samples_per_second": 814.391, - "eval_steps_per_second": 50.902, + "eval_loss": 2.9071033000946045, + "eval_runtime": 116.4638, + "eval_samples_per_second": 886.756, + "eval_steps_per_second": 55.425, "step": 128000 }, { "epoch": 1.11, - "eval_loss": 2.7130985260009766, - "eval_runtime": 126.6262, - "eval_samples_per_second": 815.589, - "eval_steps_per_second": 50.977, + "eval_loss": 2.949030876159668, + "eval_runtime": 115.1354, + "eval_samples_per_second": 896.987, + "eval_steps_per_second": 56.064, "step": 136000 }, { "epoch": 1.17, "learning_rate": 9.405486533811392e-06, - "loss": 2.7152, + "loss": 2.9801, "step": 144000 }, { "epoch": 1.17, - "eval_loss": 2.6999881267547607, - "eval_runtime": 126.4279, - "eval_samples_per_second": 816.868, - "eval_steps_per_second": 51.057, + "eval_loss": 2.951450824737549, + "eval_runtime": 114.8755, + "eval_samples_per_second": 899.017, + "eval_steps_per_second": 56.191, "step": 144000 }, { "epoch": 1.24, - "eval_loss": 2.703012704849243, - "eval_runtime": 126.2932, - "eval_samples_per_second": 817.74, - "eval_steps_per_second": 51.111, + "eval_loss": 2.9862585067749023, + "eval_runtime": 116.1529, + "eval_samples_per_second": 889.129, + "eval_steps_per_second": 55.573, "step": 152000 }, { "epoch": 1.3, "learning_rate": 9.338780955557409e-06, - "loss": 2.7151, + "loss": 3.0173, "step": 160000 }, { "epoch": 1.3, - "eval_loss": 2.721385955810547, - "eval_runtime": 126.566, - "eval_samples_per_second": 815.977, - "eval_steps_per_second": 51.001, + "eval_loss": 2.991586685180664, + "eval_runtime": 116.0798, + "eval_samples_per_second": 889.69, + "eval_steps_per_second": 55.608, "step": 160000 }, { "epoch": 1.37, - "eval_loss": 2.707641839981079, - "eval_runtime": 126.5896, - "eval_samples_per_second": 815.826, - "eval_steps_per_second": 50.992, + "eval_loss": 3.0230655670166016, + "eval_runtime": 115.2701, + "eval_samples_per_second": 895.939, + "eval_steps_per_second": 55.999, "step": 168000 }, { "epoch": 1.44, "learning_rate": 9.272075377303427e-06, - "loss": 2.7166, + "loss": 3.0674, "step": 176000 }, { "epoch": 1.44, - "eval_loss": 2.7106387615203857, - "eval_runtime": 126.9356, - "eval_samples_per_second": 813.602, - "eval_steps_per_second": 50.853, + "eval_loss": 3.0447049140930176, + "eval_runtime": 115.1489, + "eval_samples_per_second": 896.882, + "eval_steps_per_second": 56.058, "step": 176000 }, { "epoch": 1.5, - "eval_loss": 2.719717025756836, - "eval_runtime": 127.5317, - "eval_samples_per_second": 809.798, - "eval_steps_per_second": 50.615, + "eval_loss": 3.0638155937194824, + "eval_runtime": 116.1134, + "eval_samples_per_second": 889.432, + "eval_steps_per_second": 55.592, "step": 184000 }, { "epoch": 1.57, "learning_rate": 9.205369799049446e-06, - "loss": 2.7144, + "loss": 3.1059, "step": 192000 }, { "epoch": 1.57, - "eval_loss": 2.7100777626037598, - "eval_runtime": 126.3318, - "eval_samples_per_second": 817.49, - "eval_steps_per_second": 51.096, + "eval_loss": 3.094524383544922, + "eval_runtime": 114.9725, + "eval_samples_per_second": 898.258, + "eval_steps_per_second": 56.144, "step": 192000 }, { "epoch": 1.63, - "eval_loss": 2.723472833633423, - "eval_runtime": 127.1568, - "eval_samples_per_second": 812.186, - "eval_steps_per_second": 50.764, + "eval_loss": 3.1008002758026123, + "eval_runtime": 116.6453, + "eval_samples_per_second": 885.377, + "eval_steps_per_second": 55.339, "step": 200000 }, { "epoch": 1.7, "learning_rate": 9.138664220795464e-06, - "loss": 2.7179, + "loss": 3.1283, "step": 208000 }, { "epoch": 1.7, - "eval_loss": 2.706564426422119, - "eval_runtime": 127.089, - "eval_samples_per_second": 812.62, - "eval_steps_per_second": 50.791, + "eval_loss": 3.1256680488586426, + "eval_runtime": 115.0624, + "eval_samples_per_second": 897.556, + "eval_steps_per_second": 56.1, "step": 208000 }, { "epoch": 1.76, - "eval_loss": 2.7282984256744385, - "eval_runtime": 127.4927, - "eval_samples_per_second": 810.047, - "eval_steps_per_second": 50.63, + "eval_loss": 3.1262004375457764, + "eval_runtime": 114.9392, + "eval_samples_per_second": 898.518, + "eval_steps_per_second": 56.16, "step": 216000 }, { "epoch": 1.83, "learning_rate": 9.071958642541483e-06, - "loss": 2.7231, + "loss": 3.1684, "step": 224000 }, { "epoch": 1.83, - "eval_loss": 2.7203216552734375, - "eval_runtime": 127.4298, - "eval_samples_per_second": 810.446, - "eval_steps_per_second": 50.655, + "eval_loss": 3.152285099029541, + "eval_runtime": 115.5854, + "eval_samples_per_second": 893.495, + "eval_steps_per_second": 55.846, "step": 224000 }, { "epoch": 1.89, - "eval_loss": 2.711085319519043, - "eval_runtime": 126.4739, - "eval_samples_per_second": 816.571, - "eval_steps_per_second": 51.038, + "eval_loss": 3.1842401027679443, + "eval_runtime": 114.9021, + "eval_samples_per_second": 898.809, + "eval_steps_per_second": 56.178, "step": 232000 }, { "epoch": 1.96, "learning_rate": 9.005253064287502e-06, - "loss": 2.7284, + "loss": 3.1966, "step": 240000 }, { "epoch": 1.96, - "eval_loss": 2.721714973449707, - "eval_runtime": 126.401, - "eval_samples_per_second": 817.043, - "eval_steps_per_second": 51.068, + "eval_loss": 3.1820068359375, + "eval_runtime": 117.5401, + "eval_samples_per_second": 878.637, + "eval_steps_per_second": 54.917, "step": 240000 }, { "epoch": 2.02, - "eval_loss": 2.725090265274048, - "eval_runtime": 127.0199, - "eval_samples_per_second": 813.061, - "eval_steps_per_second": 50.819, + "eval_loss": 3.197575569152832, + "eval_runtime": 119.2185, + "eval_samples_per_second": 866.266, + "eval_steps_per_second": 54.144, "step": 248000 }, { "epoch": 2.09, "learning_rate": 8.93854748603352e-06, - "loss": 2.7242, + "loss": 3.2055, "step": 256000 }, { "epoch": 2.09, - "eval_loss": 2.718090057373047, - "eval_runtime": 127.9402, - "eval_samples_per_second": 807.213, - "eval_steps_per_second": 50.453, + "eval_loss": 3.2012782096862793, + "eval_runtime": 116.0617, + "eval_samples_per_second": 889.829, + "eval_steps_per_second": 55.617, "step": 256000 }, { "epoch": 2.15, - "eval_loss": 2.723750591278076, - "eval_runtime": 127.1287, - "eval_samples_per_second": 812.366, - "eval_steps_per_second": 50.775, + "eval_loss": 3.219731092453003, + "eval_runtime": 115.3459, + "eval_samples_per_second": 895.351, + "eval_steps_per_second": 55.962, "step": 264000 }, { "epoch": 2.22, "learning_rate": 8.871841907779539e-06, - "loss": 2.7171, + "loss": 3.2186, "step": 272000 }, { "epoch": 2.22, - "eval_loss": 2.748772144317627, - "eval_runtime": 128.6406, - "eval_samples_per_second": 802.818, - "eval_steps_per_second": 50.179, + "eval_loss": 3.2258596420288086, + "eval_runtime": 117.0782, + "eval_samples_per_second": 882.102, + "eval_steps_per_second": 55.134, "step": 272000 }, { "epoch": 2.28, - "eval_loss": 2.731541633605957, - "eval_runtime": 127.5368, - "eval_samples_per_second": 809.766, - "eval_steps_per_second": 50.613, + "eval_loss": 3.2410128116607666, + "eval_runtime": 115.7081, + "eval_samples_per_second": 892.547, + "eval_steps_per_second": 55.787, "step": 280000 }, { "epoch": 2.35, "learning_rate": 8.805136329525557e-06, - "loss": 2.7312, + "loss": 3.2518, "step": 288000 }, { "epoch": 2.35, - "eval_loss": 2.746854305267334, - "eval_runtime": 127.6128, - "eval_samples_per_second": 809.284, - "eval_steps_per_second": 50.583, + "eval_loss": 3.2448806762695312, + "eval_runtime": 116.2706, + "eval_samples_per_second": 888.23, + "eval_steps_per_second": 55.517, "step": 288000 }, { "epoch": 2.41, - "eval_loss": 2.7363078594207764, - "eval_runtime": 127.9259, - "eval_samples_per_second": 807.303, - "eval_steps_per_second": 50.459, + "eval_loss": 3.2685933113098145, + "eval_runtime": 117.0296, + "eval_samples_per_second": 882.469, + "eval_steps_per_second": 55.157, "step": 296000 }, { "epoch": 2.48, "learning_rate": 8.738430751271576e-06, - "loss": 2.7386, + "loss": 3.2705, "step": 304000 }, { "epoch": 2.48, - "eval_loss": 2.7398250102996826, - "eval_runtime": 127.3013, - "eval_samples_per_second": 811.264, - "eval_steps_per_second": 50.706, + "eval_loss": 3.270232915878296, + "eval_runtime": 115.7748, + "eval_samples_per_second": 892.034, + "eval_steps_per_second": 55.755, "step": 304000 }, { "epoch": 2.54, - "eval_loss": 2.747743844985962, - "eval_runtime": 127.9865, - "eval_samples_per_second": 806.921, - "eval_steps_per_second": 50.435, + "eval_loss": 3.271563768386841, + "eval_runtime": 114.7956, + "eval_samples_per_second": 899.643, + "eval_steps_per_second": 56.23, "step": 312000 }, { "epoch": 2.61, "learning_rate": 8.671725173017595e-06, - "loss": 2.7457, + "loss": 3.2677, "step": 320000 }, { "epoch": 2.61, - "eval_loss": 2.753558397293091, - "eval_runtime": 128.9208, - "eval_samples_per_second": 801.073, - "eval_steps_per_second": 50.069, + "eval_loss": 3.2934534549713135, + "eval_runtime": 116.4472, + "eval_samples_per_second": 886.883, + "eval_steps_per_second": 55.433, "step": 320000 }, { "epoch": 2.67, - "eval_loss": 2.748337984085083, - "eval_runtime": 128.758, - "eval_samples_per_second": 802.086, - "eval_steps_per_second": 50.133, + "eval_loss": 3.2941575050354004, + "eval_runtime": 115.658, + "eval_samples_per_second": 892.934, + "eval_steps_per_second": 55.811, "step": 328000 }, { "epoch": 2.74, "learning_rate": 8.605019594763613e-06, - "loss": 2.7496, + "loss": 3.2955, "step": 336000 }, { "epoch": 2.74, - "eval_loss": 2.752856969833374, - "eval_runtime": 128.3684, - "eval_samples_per_second": 804.521, - "eval_steps_per_second": 50.285, + "eval_loss": 3.304429054260254, + "eval_runtime": 115.4488, + "eval_samples_per_second": 894.552, + "eval_steps_per_second": 55.912, "step": 336000 }, { "epoch": 2.8, - "eval_loss": 2.749178171157837, - "eval_runtime": 129.8422, - "eval_samples_per_second": 795.388, - "eval_steps_per_second": 49.714, + "eval_loss": 3.3109662532806396, + "eval_runtime": 114.8039, + "eval_samples_per_second": 899.577, + "eval_steps_per_second": 56.226, "step": 344000 }, { "epoch": 2.87, "learning_rate": 8.538314016509632e-06, - "loss": 2.7521, + "loss": 3.2966, "step": 352000 }, { "epoch": 2.87, - "eval_loss": 2.761200189590454, - "eval_runtime": 127.6309, - "eval_samples_per_second": 809.169, - "eval_steps_per_second": 50.576, + "eval_loss": 3.3053431510925293, + "eval_runtime": 115.0477, + "eval_samples_per_second": 897.671, + "eval_steps_per_second": 56.107, "step": 352000 }, { "epoch": 2.94, - "eval_loss": 2.7700963020324707, - "eval_runtime": 128.3946, - "eval_samples_per_second": 804.356, - "eval_steps_per_second": 50.275, + "eval_loss": 3.3276007175445557, + "eval_runtime": 115.8876, + "eval_samples_per_second": 891.165, + "eval_steps_per_second": 55.701, "step": 360000 }, { "epoch": 3.0, "learning_rate": 8.471608438255649e-06, - "loss": 2.7649, + "loss": 3.311, "step": 368000 }, { "epoch": 3.0, - "eval_loss": 2.7705161571502686, - "eval_runtime": 128.8577, - "eval_samples_per_second": 801.466, - "eval_steps_per_second": 50.094, + "eval_loss": 3.3256120681762695, + "eval_runtime": 117.3196, + "eval_samples_per_second": 880.288, + "eval_steps_per_second": 55.021, "step": 368000 }, { "epoch": 3.07, - "eval_loss": 2.782761335372925, - "eval_runtime": 129.17, - "eval_samples_per_second": 799.528, - "eval_steps_per_second": 49.973, + "eval_loss": 3.3292236328125, + "eval_runtime": 117.5646, + "eval_samples_per_second": 878.453, + "eval_steps_per_second": 54.906, "step": 376000 }, { "epoch": 3.13, "learning_rate": 8.404902860001667e-06, - "loss": 2.7516, + "loss": 3.3217, "step": 384000 }, { "epoch": 3.13, - "eval_loss": 2.7680482864379883, - "eval_runtime": 128.9028, - "eval_samples_per_second": 801.185, - "eval_steps_per_second": 50.077, + "eval_loss": 3.333477258682251, + "eval_runtime": 116.7284, + "eval_samples_per_second": 884.746, + "eval_steps_per_second": 55.299, "step": 384000 }, { "epoch": 3.2, - "eval_loss": 2.784294605255127, - "eval_runtime": 128.4737, - "eval_samples_per_second": 803.861, - "eval_steps_per_second": 50.244, + "eval_loss": 3.316025972366333, + "eval_runtime": 118.1544, + "eval_samples_per_second": 874.068, + "eval_steps_per_second": 54.632, "step": 392000 }, { "epoch": 3.26, "learning_rate": 8.338197281747686e-06, - "loss": 2.762, + "loss": 3.3145, "step": 400000 }, { "epoch": 3.26, - "eval_loss": 2.7915961742401123, - "eval_runtime": 128.2651, - "eval_samples_per_second": 805.168, - "eval_steps_per_second": 50.325, + "eval_loss": 3.337838649749756, + "eval_runtime": 116.066, + "eval_samples_per_second": 889.796, + "eval_steps_per_second": 55.615, "step": 400000 }, { "epoch": 3.33, - "eval_loss": 2.7691826820373535, - "eval_runtime": 128.6705, - "eval_samples_per_second": 802.632, - "eval_steps_per_second": 50.167, + "eval_loss": 3.3306798934936523, + "eval_runtime": 117.4533, + "eval_samples_per_second": 879.285, + "eval_steps_per_second": 54.958, "step": 408000 }, { "epoch": 3.39, "learning_rate": 8.271491703493705e-06, - "loss": 2.7789, + "loss": 3.3246, "step": 416000 }, { "epoch": 3.39, - "eval_loss": 2.783369302749634, - "eval_runtime": 128.6603, - "eval_samples_per_second": 802.695, - "eval_steps_per_second": 50.171, + "eval_loss": 3.342693567276001, + "eval_runtime": 115.6289, + "eval_samples_per_second": 893.159, + "eval_steps_per_second": 55.825, "step": 416000 }, { "epoch": 3.46, - "eval_loss": 2.7788405418395996, - "eval_runtime": 129.7209, - "eval_samples_per_second": 796.132, - "eval_steps_per_second": 49.761, + "eval_loss": 3.3543155193328857, + "eval_runtime": 115.7056, + "eval_samples_per_second": 892.567, + "eval_steps_per_second": 55.788, "step": 424000 }, { "epoch": 3.52, "learning_rate": 8.204786125239725e-06, - "loss": 2.7879, + "loss": 3.3131, "step": 432000 }, { "epoch": 3.52, - "eval_loss": 2.803699493408203, - "eval_runtime": 128.2575, - "eval_samples_per_second": 805.216, - "eval_steps_per_second": 50.328, + "eval_loss": 3.340524196624756, + "eval_runtime": 116.2105, + "eval_samples_per_second": 888.689, + "eval_steps_per_second": 55.546, "step": 432000 }, { "epoch": 3.59, - "eval_loss": 2.791905403137207, - "eval_runtime": 129.4159, - "eval_samples_per_second": 798.009, - "eval_steps_per_second": 49.878, + "eval_loss": 3.336106777191162, + "eval_runtime": 114.9141, + "eval_samples_per_second": 898.714, + "eval_steps_per_second": 56.172, "step": 440000 }, { "epoch": 3.65, "learning_rate": 8.138080546985743e-06, - "loss": 2.7853, + "loss": 3.3266, "step": 448000 }, { "epoch": 3.65, - "eval_loss": 2.8077127933502197, - "eval_runtime": 127.9753, - "eval_samples_per_second": 806.992, - "eval_steps_per_second": 50.439, + "eval_loss": 3.370443344116211, + "eval_runtime": 115.193, + "eval_samples_per_second": 896.539, + "eval_steps_per_second": 56.036, "step": 448000 }, { "epoch": 3.72, - "eval_loss": 2.7903032302856445, - "eval_runtime": 128.9005, - "eval_samples_per_second": 801.2, - "eval_steps_per_second": 50.077, + "eval_loss": 3.354923963546753, + "eval_runtime": 115.5245, + "eval_samples_per_second": 893.967, + "eval_steps_per_second": 55.876, "step": 456000 }, { "epoch": 3.78, "learning_rate": 8.07137496873176e-06, - "loss": 2.7976, + "loss": 3.3358, "step": 464000 }, { "epoch": 3.78, - "eval_loss": 2.810896158218384, - "eval_runtime": 129.0626, - "eval_samples_per_second": 800.193, - "eval_steps_per_second": 50.014, + "eval_loss": 3.360276937484741, + "eval_runtime": 116.1443, + "eval_samples_per_second": 889.196, + "eval_steps_per_second": 55.577, "step": 464000 }, { "epoch": 3.85, - "eval_loss": 2.795713424682617, - "eval_runtime": 128.0638, - "eval_samples_per_second": 806.434, - "eval_steps_per_second": 50.405, + "eval_loss": 3.3641881942749023, + "eval_runtime": 115.4508, + "eval_samples_per_second": 894.537, + "eval_steps_per_second": 55.911, "step": 472000 }, { "epoch": 3.91, "learning_rate": 8.004669390477779e-06, - "loss": 2.789, + "loss": 3.3385, "step": 480000 }, { "epoch": 3.91, - "eval_loss": 2.8023178577423096, - "eval_runtime": 128.1962, - "eval_samples_per_second": 805.601, - "eval_steps_per_second": 50.353, + "eval_loss": 3.3572633266448975, + "eval_runtime": 114.9449, + "eval_samples_per_second": 898.474, + "eval_steps_per_second": 56.157, "step": 480000 }, { "epoch": 3.98, - "eval_loss": 2.8125839233398438, - "eval_runtime": 128.7992, - "eval_samples_per_second": 801.83, - "eval_steps_per_second": 50.117, + "eval_loss": 3.3658275604248047, + "eval_runtime": 115.0066, + "eval_samples_per_second": 897.992, + "eval_steps_per_second": 56.127, "step": 488000 }, { "epoch": 4.04, "learning_rate": 7.937963812223798e-06, - "loss": 2.8089, + "loss": 3.3375, "step": 496000 }, { "epoch": 4.04, - "eval_loss": 2.815424919128418, - "eval_runtime": 128.7985, - "eval_samples_per_second": 801.834, - "eval_steps_per_second": 50.117, + "eval_loss": 3.345881700515747, + "eval_runtime": 115.316, + "eval_samples_per_second": 895.583, + "eval_steps_per_second": 55.977, "step": 496000 }, { "epoch": 4.11, - "eval_loss": 2.8122923374176025, - "eval_runtime": 127.4092, - "eval_samples_per_second": 810.577, - "eval_steps_per_second": 50.664, + "eval_loss": 3.3702762126922607, + "eval_runtime": 114.9631, + "eval_samples_per_second": 898.331, + "eval_steps_per_second": 56.148, "step": 504000 }, { "epoch": 4.17, "learning_rate": 7.871258233969816e-06, - "loss": 2.7915, + "loss": 3.3237, "step": 512000 }, { "epoch": 4.17, - "eval_loss": 2.8145976066589355, - "eval_runtime": 128.9266, - "eval_samples_per_second": 801.037, - "eval_steps_per_second": 50.067, + "eval_loss": 3.3564202785491943, + "eval_runtime": 116.3254, + "eval_samples_per_second": 887.811, + "eval_steps_per_second": 55.491, "step": 512000 }, { "epoch": 4.24, - "eval_loss": 2.8249683380126953, - "eval_runtime": 129.1348, - "eval_samples_per_second": 799.746, - "eval_steps_per_second": 49.987, + "eval_loss": 3.3553359508514404, + "eval_runtime": 115.6968, + "eval_samples_per_second": 892.635, + "eval_steps_per_second": 55.792, "step": 520000 }, { "epoch": 4.31, "learning_rate": 7.804552655715835e-06, - "loss": 2.8094, + "loss": 3.34, "step": 528000 }, { "epoch": 4.31, - "eval_loss": 2.820560932159424, - "eval_runtime": 129.6096, - "eval_samples_per_second": 796.816, - "eval_steps_per_second": 49.803, + "eval_loss": 3.35756778717041, + "eval_runtime": 114.9307, + "eval_samples_per_second": 898.585, + "eval_steps_per_second": 56.164, "step": 528000 }, { "epoch": 4.37, - "eval_loss": 2.818159341812134, - "eval_runtime": 128.5096, - "eval_samples_per_second": 803.637, - "eval_steps_per_second": 50.23, + "eval_loss": 3.3548436164855957, + "eval_runtime": 116.9698, + "eval_samples_per_second": 882.92, + "eval_steps_per_second": 55.185, "step": 536000 }, { "epoch": 4.44, "learning_rate": 7.737847077461853e-06, - "loss": 2.8196, + "loss": 3.3247, "step": 544000 }, { "epoch": 4.44, - "eval_loss": 2.8351361751556396, - "eval_runtime": 129.1287, - "eval_samples_per_second": 799.783, - "eval_steps_per_second": 49.989, + "eval_loss": 3.3525540828704834, + "eval_runtime": 114.951, + "eval_samples_per_second": 898.427, + "eval_steps_per_second": 56.154, "step": 544000 }, { "epoch": 4.5, - "eval_loss": 2.839430570602417, - "eval_runtime": 129.5203, - "eval_samples_per_second": 797.365, - "eval_steps_per_second": 49.838, + "eval_loss": 3.367372512817383, + "eval_runtime": 116.891, + "eval_samples_per_second": 883.515, + "eval_steps_per_second": 55.222, "step": 552000 }, { "epoch": 4.57, "learning_rate": 7.671141499207872e-06, - "loss": 2.8316, + "loss": 3.318, "step": 560000 }, { "epoch": 4.57, - "eval_loss": 2.8396623134613037, - "eval_runtime": 128.6713, - "eval_samples_per_second": 802.627, - "eval_steps_per_second": 50.167, + "eval_loss": 3.3607981204986572, + "eval_runtime": 115.5047, + "eval_samples_per_second": 894.12, + "eval_steps_per_second": 55.885, "step": 560000 }, { "epoch": 4.63, - "eval_loss": 2.8402562141418457, - "eval_runtime": 128.654, - "eval_samples_per_second": 802.735, - "eval_steps_per_second": 50.173, + "eval_loss": 3.3527328968048096, + "eval_runtime": 116.278, + "eval_samples_per_second": 888.173, + "eval_steps_per_second": 55.514, "step": 568000 }, { "epoch": 4.7, "learning_rate": 7.604435920953891e-06, - "loss": 2.8444, + "loss": 3.3318, "step": 576000 }, { "epoch": 4.7, - "eval_loss": 2.8350980281829834, - "eval_runtime": 129.3424, - "eval_samples_per_second": 798.462, - "eval_steps_per_second": 49.906, + "eval_loss": 3.3600049018859863, + "eval_runtime": 115.0864, + "eval_samples_per_second": 897.369, + "eval_steps_per_second": 56.088, "step": 576000 }, { "epoch": 4.76, - "eval_loss": 2.8574254512786865, - "eval_runtime": 129.6206, - "eval_samples_per_second": 796.748, - "eval_steps_per_second": 49.799, + "eval_loss": 3.366177797317505, + "eval_runtime": 116.1802, + "eval_samples_per_second": 888.921, + "eval_steps_per_second": 55.56, "step": 584000 }, { "epoch": 4.83, "learning_rate": 7.537730342699909e-06, - "loss": 2.833, + "loss": 3.3211, "step": 592000 }, { "epoch": 4.83, - "eval_loss": 2.86171293258667, - "eval_runtime": 129.2684, - "eval_samples_per_second": 798.919, - "eval_steps_per_second": 49.935, + "eval_loss": 3.36027193069458, + "eval_runtime": 115.5036, + "eval_samples_per_second": 894.128, + "eval_steps_per_second": 55.886, "step": 592000 }, { "epoch": 4.89, - "eval_loss": 2.857750654220581, - "eval_runtime": 128.5027, - "eval_samples_per_second": 803.679, - "eval_steps_per_second": 50.232, + "eval_loss": 3.364029884338379, + "eval_runtime": 114.9019, + "eval_samples_per_second": 898.81, + "eval_steps_per_second": 56.178, "step": 600000 }, { "epoch": 4.96, "learning_rate": 7.471024764445928e-06, - "loss": 2.839, + "loss": 3.3344, "step": 608000 }, { "epoch": 4.96, - "eval_loss": 2.8577184677124023, - "eval_runtime": 128.7081, - "eval_samples_per_second": 802.397, - "eval_steps_per_second": 50.152, + "eval_loss": 3.376020669937134, + "eval_runtime": 115.5882, + "eval_samples_per_second": 893.473, + "eval_steps_per_second": 55.845, "step": 608000 }, { "epoch": 5.02, - "eval_loss": 2.8726649284362793, - "eval_runtime": 128.6474, - "eval_samples_per_second": 802.776, - "eval_steps_per_second": 50.176, + "eval_loss": 3.3876428604125977, + "eval_runtime": 115.0301, + "eval_samples_per_second": 897.809, + "eval_steps_per_second": 56.116, "step": 616000 }, { "epoch": 5.09, "learning_rate": 7.4043191861919465e-06, - "loss": 2.8427, + "loss": 3.331, "step": 624000 }, { "epoch": 5.09, - "eval_loss": 2.858550786972046, - "eval_runtime": 129.0947, - "eval_samples_per_second": 799.994, - "eval_steps_per_second": 50.002, + "eval_loss": 3.351862668991089, + "eval_runtime": 115.49, + "eval_samples_per_second": 894.233, + "eval_steps_per_second": 55.892, "step": 624000 }, { "epoch": 5.15, - "eval_loss": 2.880849599838257, - "eval_runtime": 128.221, - "eval_samples_per_second": 805.445, - "eval_steps_per_second": 50.343, + "eval_loss": 3.373405933380127, + "eval_runtime": 115.9525, + "eval_samples_per_second": 890.666, + "eval_steps_per_second": 55.669, "step": 632000 }, { "epoch": 5.22, "learning_rate": 7.337613607937964e-06, - "loss": 2.8599, + "loss": 3.3293, "step": 640000 }, { "epoch": 5.22, - "eval_loss": 2.8959789276123047, - "eval_runtime": 129.9831, - "eval_samples_per_second": 794.527, - "eval_steps_per_second": 49.66, + "eval_loss": 3.373460531234741, + "eval_runtime": 115.1854, + "eval_samples_per_second": 896.598, + "eval_steps_per_second": 56.04, "step": 640000 }, { "epoch": 5.28, - "eval_loss": 2.8883421421051025, - "eval_runtime": 129.4941, - "eval_samples_per_second": 797.527, - "eval_steps_per_second": 49.848, + "eval_loss": 3.3703157901763916, + "eval_runtime": 115.0036, + "eval_samples_per_second": 898.016, + "eval_steps_per_second": 56.129, "step": 648000 }, { "epoch": 5.35, "learning_rate": 7.270908029683983e-06, - "loss": 2.8694, + "loss": 3.3317, "step": 656000 }, { "epoch": 5.35, - "eval_loss": 2.8884825706481934, - "eval_runtime": 129.3172, - "eval_samples_per_second": 798.618, - "eval_steps_per_second": 49.916, + "eval_loss": 3.382647752761841, + "eval_runtime": 115.8086, + "eval_samples_per_second": 891.773, + "eval_steps_per_second": 55.739, "step": 656000 }, { "epoch": 5.41, - "eval_loss": 2.887291431427002, - "eval_runtime": 129.1298, - "eval_samples_per_second": 799.777, - "eval_steps_per_second": 49.988, + "eval_loss": 3.3825886249542236, + "eval_runtime": 115.3628, + "eval_samples_per_second": 895.219, + "eval_steps_per_second": 55.954, "step": 664000 }, { "epoch": 5.48, "learning_rate": 7.2042024514300015e-06, - "loss": 2.8626, + "loss": 3.3291, "step": 672000 }, { "epoch": 5.48, - "eval_loss": 2.8929550647735596, - "eval_runtime": 129.4886, - "eval_samples_per_second": 797.56, - "eval_steps_per_second": 49.85, + "eval_loss": 3.391868829727173, + "eval_runtime": 115.4028, + "eval_samples_per_second": 894.909, + "eval_steps_per_second": 55.935, "step": 672000 }, { "epoch": 5.54, - "eval_loss": 2.8987772464752197, - "eval_runtime": 129.8683, - "eval_samples_per_second": 795.229, - "eval_steps_per_second": 49.704, + "eval_loss": 3.378626585006714, + "eval_runtime": 115.4498, + "eval_samples_per_second": 894.545, + "eval_steps_per_second": 55.912, "step": 680000 }, { "epoch": 5.61, "learning_rate": 7.13749687317602e-06, - "loss": 2.8921, + "loss": 3.3423, "step": 688000 }, { "epoch": 5.61, - "eval_loss": 2.9117259979248047, - "eval_runtime": 128.3205, - "eval_samples_per_second": 804.821, - "eval_steps_per_second": 50.304, + "eval_loss": 3.377542734146118, + "eval_runtime": 115.2629, + "eval_samples_per_second": 895.995, + "eval_steps_per_second": 56.002, "step": 688000 }, { "epoch": 5.68, - "eval_loss": 2.912231206893921, - "eval_runtime": 128.7871, - "eval_samples_per_second": 801.905, - "eval_steps_per_second": 50.121, + "eval_loss": 3.373429298400879, + "eval_runtime": 115.5205, + "eval_samples_per_second": 893.997, + "eval_steps_per_second": 55.878, "step": 696000 }, { "epoch": 5.74, "learning_rate": 7.070791294922038e-06, - "loss": 2.8884, + "loss": 3.3364, "step": 704000 }, { "epoch": 5.74, - "eval_loss": 2.900118827819824, - "eval_runtime": 130.1834, - "eval_samples_per_second": 793.304, - "eval_steps_per_second": 49.584, + "eval_loss": 3.372532367706299, + "eval_runtime": 115.5543, + "eval_samples_per_second": 893.735, + "eval_steps_per_second": 55.861, "step": 704000 }, { "epoch": 5.81, - "eval_loss": 2.9093644618988037, - "eval_runtime": 129.4918, - "eval_samples_per_second": 797.541, - "eval_steps_per_second": 49.849, + "eval_loss": 3.3855302333831787, + "eval_runtime": 115.9379, + "eval_samples_per_second": 890.778, + "eval_steps_per_second": 55.676, "step": 712000 }, { "epoch": 5.87, "learning_rate": 7.0040857166680564e-06, - "loss": 2.8974, + "loss": 3.347, "step": 720000 }, { "epoch": 5.87, - "eval_loss": 2.9110264778137207, - "eval_runtime": 129.9051, - "eval_samples_per_second": 795.003, - "eval_steps_per_second": 49.69, + "eval_loss": 3.3774046897888184, + "eval_runtime": 114.6511, + "eval_samples_per_second": 900.776, + "eval_steps_per_second": 56.301, "step": 720000 }, { "epoch": 5.94, - "eval_loss": 2.9044594764709473, - "eval_runtime": 129.2324, - "eval_samples_per_second": 799.141, - "eval_steps_per_second": 49.949, + "eval_loss": 3.3717195987701416, + "eval_runtime": 115.9173, + "eval_samples_per_second": 890.937, + "eval_steps_per_second": 55.686, "step": 728000 }, { "epoch": 6.0, "learning_rate": 6.937380138414076e-06, - "loss": 2.903, + "loss": 3.3311, "step": 736000 }, { "epoch": 6.0, - "eval_loss": 2.933678388595581, - "eval_runtime": 130.3644, - "eval_samples_per_second": 792.202, - "eval_steps_per_second": 49.515, + "eval_loss": 3.392944097518921, + "eval_runtime": 115.7013, + "eval_samples_per_second": 892.6, + "eval_steps_per_second": 55.79, "step": 736000 }, { "epoch": 6.07, - "eval_loss": 2.931581735610962, - "eval_runtime": 128.3976, - "eval_samples_per_second": 804.337, - "eval_steps_per_second": 50.274, + "eval_loss": 3.389941930770874, + "eval_runtime": 117.4363, + "eval_samples_per_second": 879.413, + "eval_steps_per_second": 54.966, "step": 744000 }, { "epoch": 6.13, "learning_rate": 6.8706745601600945e-06, - "loss": 2.9057, + "loss": 3.3445, "step": 752000 }, { "epoch": 6.13, - "eval_loss": 2.944746971130371, - "eval_runtime": 128.9912, - "eval_samples_per_second": 800.636, - "eval_steps_per_second": 50.042, + "eval_loss": 3.3985016345977783, + "eval_runtime": 115.5779, + "eval_samples_per_second": 893.553, + "eval_steps_per_second": 55.85, "step": 752000 }, { "epoch": 6.2, - "eval_loss": 2.936281681060791, - "eval_runtime": 129.9533, - "eval_samples_per_second": 794.709, - "eval_steps_per_second": 49.672, + "eval_loss": 3.3865506649017334, + "eval_runtime": 114.8487, + "eval_samples_per_second": 899.227, + "eval_steps_per_second": 56.204, "step": 760000 }, { "epoch": 6.26, "learning_rate": 6.803968981906113e-06, - "loss": 2.9146, + "loss": 3.345, "step": 768000 }, { "epoch": 6.26, - "eval_loss": 2.943751096725464, - "eval_runtime": 129.9494, - "eval_samples_per_second": 794.732, - "eval_steps_per_second": 49.673, + "eval_loss": 3.3942770957946777, + "eval_runtime": 115.533, + "eval_samples_per_second": 893.901, + "eval_steps_per_second": 55.871, "step": 768000 }, { "epoch": 6.33, - "eval_loss": 2.9474806785583496, - "eval_runtime": 130.0993, - "eval_samples_per_second": 793.817, - "eval_steps_per_second": 49.616, + "eval_loss": 3.373379945755005, + "eval_runtime": 115.2598, + "eval_samples_per_second": 896.019, + "eval_steps_per_second": 56.004, "step": 776000 }, { "epoch": 6.39, "learning_rate": 6.737263403652131e-06, - "loss": 2.9221, + "loss": 3.3427, "step": 784000 }, { "epoch": 6.39, - "eval_loss": 2.9394171237945557, - "eval_runtime": 129.1928, - "eval_samples_per_second": 799.387, - "eval_steps_per_second": 49.964, + "eval_loss": 3.383202314376831, + "eval_runtime": 114.9199, + "eval_samples_per_second": 898.669, + "eval_steps_per_second": 56.17, "step": 784000 }, { "epoch": 6.46, - "eval_loss": 2.937087297439575, - "eval_runtime": 129.9118, - "eval_samples_per_second": 794.963, - "eval_steps_per_second": 49.688, + "eval_loss": 3.3966336250305176, + "eval_runtime": 115.6206, + "eval_samples_per_second": 893.223, + "eval_steps_per_second": 55.829, "step": 792000 }, { "epoch": 6.52, "learning_rate": 6.6705578253981495e-06, - "loss": 2.9316, + "loss": 3.3406, "step": 800000 }, { "epoch": 6.52, - "eval_loss": 2.949429512023926, - "eval_runtime": 129.8602, - "eval_samples_per_second": 795.278, - "eval_steps_per_second": 49.707, + "eval_loss": 3.3891854286193848, + "eval_runtime": 115.5059, + "eval_samples_per_second": 894.11, + "eval_steps_per_second": 55.885, "step": 800000 }, { "epoch": 6.59, - "eval_loss": 2.9727399349212646, - "eval_runtime": 130.9441, - "eval_samples_per_second": 788.695, - "eval_steps_per_second": 49.296, + "eval_loss": 3.390401601791382, + "eval_runtime": 116.1612, + "eval_samples_per_second": 889.066, + "eval_steps_per_second": 55.569, "step": 808000 }, { "epoch": 6.65, "learning_rate": 6.603852247144168e-06, - "loss": 2.9421, + "loss": 3.3406, "step": 816000 }, { "epoch": 6.65, - "eval_loss": 2.9758830070495605, - "eval_runtime": 129.8861, - "eval_samples_per_second": 795.12, - "eval_steps_per_second": 49.697, + "eval_loss": 3.386686086654663, + "eval_runtime": 115.3671, + "eval_samples_per_second": 895.186, + "eval_steps_per_second": 55.952, "step": 816000 }, { "epoch": 6.72, - "eval_loss": 2.966480016708374, - "eval_runtime": 129.44, - "eval_samples_per_second": 797.86, - "eval_steps_per_second": 49.869, + "eval_loss": 3.390192747116089, + "eval_runtime": 114.8586, + "eval_samples_per_second": 899.149, + "eval_steps_per_second": 56.2, "step": 824000 }, { "epoch": 6.78, "learning_rate": 6.537146668890187e-06, - "loss": 2.9538, + "loss": 3.3354, "step": 832000 }, { "epoch": 6.78, - "eval_loss": 2.9650251865386963, - "eval_runtime": 129.4919, - "eval_samples_per_second": 797.54, - "eval_steps_per_second": 49.849, + "eval_loss": 3.371840000152588, + "eval_runtime": 115.0229, + "eval_samples_per_second": 897.865, + "eval_steps_per_second": 56.119, "step": 832000 }, { "epoch": 6.85, - "eval_loss": 2.976144313812256, - "eval_runtime": 129.8294, - "eval_samples_per_second": 795.467, - "eval_steps_per_second": 49.719, + "eval_loss": 3.383141279220581, + "eval_runtime": 115.453, + "eval_samples_per_second": 894.52, + "eval_steps_per_second": 55.91, "step": 840000 }, { "epoch": 6.91, "learning_rate": 6.4704410906362044e-06, - "loss": 2.9594, + "loss": 3.3521, "step": 848000 }, { "epoch": 6.91, - "eval_loss": 2.990086317062378, - "eval_runtime": 129.827, - "eval_samples_per_second": 795.482, - "eval_steps_per_second": 49.72, + "eval_loss": 3.3909192085266113, + "eval_runtime": 115.5241, + "eval_samples_per_second": 893.97, + "eval_steps_per_second": 55.876, "step": 848000 }, { "epoch": 6.98, - "eval_loss": 2.973181962966919, - "eval_runtime": 131.5126, - "eval_samples_per_second": 785.286, - "eval_steps_per_second": 49.083, + "eval_loss": 3.3798959255218506, + "eval_runtime": 115.2184, + "eval_samples_per_second": 896.342, + "eval_steps_per_second": 56.024, "step": 856000 }, { "epoch": 7.05, "learning_rate": 6.403735512382223e-06, - "loss": 2.9564, + "loss": 3.3538, "step": 864000 }, { "epoch": 7.05, - "eval_loss": 2.9896528720855713, - "eval_runtime": 129.878, - "eval_samples_per_second": 795.169, - "eval_steps_per_second": 49.7, + "eval_loss": 3.3828136920928955, + "eval_runtime": 115.3784, + "eval_samples_per_second": 895.098, + "eval_steps_per_second": 55.946, "step": 864000 }, { "epoch": 7.11, - "eval_loss": 2.980059862136841, - "eval_runtime": 129.5351, - "eval_samples_per_second": 797.274, - "eval_steps_per_second": 49.832, + "eval_loss": 3.378514051437378, + "eval_runtime": 115.0377, + "eval_samples_per_second": 897.749, + "eval_steps_per_second": 56.112, "step": 872000 }, { "epoch": 7.18, "learning_rate": 6.337029934128242e-06, - "loss": 2.9561, + "loss": 3.3363, "step": 880000 }, { "epoch": 7.18, - "eval_loss": 2.983869791030884, - "eval_runtime": 130.0357, - "eval_samples_per_second": 794.205, - "eval_steps_per_second": 49.64, + "eval_loss": 3.3993334770202637, + "eval_runtime": 115.5145, + "eval_samples_per_second": 894.043, + "eval_steps_per_second": 55.88, "step": 880000 }, { "epoch": 7.24, - "eval_loss": 2.9887585639953613, - "eval_runtime": 130.015, - "eval_samples_per_second": 794.331, - "eval_steps_per_second": 49.648, + "eval_loss": 3.3849687576293945, + "eval_runtime": 114.7628, + "eval_samples_per_second": 899.9, + "eval_steps_per_second": 56.246, "step": 888000 }, { "epoch": 7.31, "learning_rate": 6.270324355874261e-06, - "loss": 2.9669, + "loss": 3.3341, "step": 896000 }, { "epoch": 7.31, - "eval_loss": 2.99999737739563, - "eval_runtime": 130.6345, - "eval_samples_per_second": 790.564, - "eval_steps_per_second": 49.413, + "eval_loss": 3.3932485580444336, + "eval_runtime": 115.0217, + "eval_samples_per_second": 897.874, + "eval_steps_per_second": 56.12, "step": 896000 }, { "epoch": 7.37, - "eval_loss": 2.9786183834075928, - "eval_runtime": 129.9739, - "eval_samples_per_second": 794.582, - "eval_steps_per_second": 49.664, + "eval_loss": 3.398083209991455, + "eval_runtime": 115.1782, + "eval_samples_per_second": 896.654, + "eval_steps_per_second": 56.044, "step": 904000 }, { "epoch": 7.44, "learning_rate": 6.20361877762028e-06, - "loss": 2.9649, + "loss": 3.3458, "step": 912000 }, { "epoch": 7.44, - "eval_loss": 2.994581460952759, - "eval_runtime": 131.0156, - "eval_samples_per_second": 788.265, - "eval_steps_per_second": 49.269, + "eval_loss": 3.393594741821289, + "eval_runtime": 116.8302, + "eval_samples_per_second": 883.975, + "eval_steps_per_second": 55.251, "step": 912000 }, { "epoch": 7.5, - "eval_loss": 3.0002031326293945, - "eval_runtime": 131.7355, - "eval_samples_per_second": 783.957, - "eval_steps_per_second": 49.0, + "eval_loss": 3.4032301902770996, + "eval_runtime": 115.4692, + "eval_samples_per_second": 894.394, + "eval_steps_per_second": 55.902, "step": 920000 }, { "epoch": 7.57, "learning_rate": 6.1369131993662975e-06, - "loss": 2.9665, + "loss": 3.3327, "step": 928000 }, { "epoch": 7.57, - "eval_loss": 2.9960474967956543, - "eval_runtime": 131.6559, - "eval_samples_per_second": 784.431, - "eval_steps_per_second": 49.029, + "eval_loss": 3.385192394256592, + "eval_runtime": 115.7558, + "eval_samples_per_second": 892.18, + "eval_steps_per_second": 55.764, "step": 928000 }, { "epoch": 7.63, - "eval_loss": 3.0067989826202393, - "eval_runtime": 131.8152, - "eval_samples_per_second": 783.483, - "eval_steps_per_second": 48.97, + "eval_loss": 3.38653826713562, + "eval_runtime": 116.1964, + "eval_samples_per_second": 888.797, + "eval_steps_per_second": 55.553, "step": 936000 }, { "epoch": 7.7, "learning_rate": 6.070207621112316e-06, - "loss": 2.9708, + "loss": 3.3507, "step": 944000 }, { "epoch": 7.7, - "eval_loss": 2.993788242340088, - "eval_runtime": 130.3799, - "eval_samples_per_second": 792.108, - "eval_steps_per_second": 49.509, + "eval_loss": 3.390004873275757, + "eval_runtime": 115.6497, + "eval_samples_per_second": 892.999, + "eval_steps_per_second": 55.815, "step": 944000 }, { "epoch": 7.76, - "eval_loss": 3.0126230716705322, - "eval_runtime": 130.4447, - "eval_samples_per_second": 791.715, - "eval_steps_per_second": 49.485, + "eval_loss": 3.3772072792053223, + "eval_runtime": 115.4517, + "eval_samples_per_second": 894.53, + "eval_steps_per_second": 55.911, "step": 952000 }, { "epoch": 7.83, "learning_rate": 6.003502042858335e-06, - "loss": 2.981, + "loss": 3.3493, "step": 960000 }, { "epoch": 7.83, - "eval_loss": 2.9959194660186768, - "eval_runtime": 132.0738, - "eval_samples_per_second": 781.949, - "eval_steps_per_second": 48.874, + "eval_loss": 3.388688802719116, + "eval_runtime": 115.7986, + "eval_samples_per_second": 891.85, + "eval_steps_per_second": 55.743, "step": 960000 }, { "epoch": 7.89, - "eval_loss": 2.995976448059082, - "eval_runtime": 130.9412, - "eval_samples_per_second": 788.713, - "eval_steps_per_second": 49.297, + "eval_loss": 3.395124912261963, + "eval_runtime": 115.4739, + "eval_samples_per_second": 894.358, + "eval_steps_per_second": 55.9, "step": 968000 }, { "epoch": 7.96, "learning_rate": 5.936796464604353e-06, - "loss": 2.9805, + "loss": 3.3412, "step": 976000 }, { "epoch": 7.96, - "eval_loss": 2.991947889328003, - "eval_runtime": 130.0819, - "eval_samples_per_second": 793.923, - "eval_steps_per_second": 49.623, + "eval_loss": 3.3833136558532715, + "eval_runtime": 114.7504, + "eval_samples_per_second": 899.997, + "eval_steps_per_second": 56.253, "step": 976000 }, { "epoch": 8.02, - "eval_loss": 3.0058255195617676, - "eval_runtime": 130.7007, - "eval_samples_per_second": 790.164, - "eval_steps_per_second": 49.388, + "eval_loss": 3.381627321243286, + "eval_runtime": 115.0253, + "eval_samples_per_second": 897.846, + "eval_steps_per_second": 56.118, "step": 984000 }, { "epoch": 8.09, "learning_rate": 5.870090886350371e-06, - "loss": 2.9705, + "loss": 3.3232, "step": 992000 }, { "epoch": 8.09, - "eval_loss": 3.0232017040252686, - "eval_runtime": 129.9163, - "eval_samples_per_second": 794.935, - "eval_steps_per_second": 49.686, + "eval_loss": 3.37522292137146, + "eval_runtime": 114.2933, + "eval_samples_per_second": 903.597, + "eval_steps_per_second": 56.478, "step": 992000 }, { "epoch": 8.15, - "eval_loss": 3.0046939849853516, - "eval_runtime": 130.7903, - "eval_samples_per_second": 789.623, - "eval_steps_per_second": 49.354, + "eval_loss": 3.384525775909424, + "eval_runtime": 115.119, + "eval_samples_per_second": 897.115, + "eval_steps_per_second": 56.072, "step": 1000000 }, { "epoch": 8.22, "learning_rate": 5.80338530809639e-06, - "loss": 2.9715, + "loss": 3.333, "step": 1008000 }, { "epoch": 8.22, - "eval_loss": 3.0068600177764893, - "eval_runtime": 131.6119, - "eval_samples_per_second": 784.693, - "eval_steps_per_second": 49.046, + "eval_loss": 3.3906686305999756, + "eval_runtime": 115.1127, + "eval_samples_per_second": 897.164, + "eval_steps_per_second": 56.075, "step": 1008000 }, { "epoch": 8.28, - "eval_loss": 3.0018742084503174, - "eval_runtime": 131.7567, - "eval_samples_per_second": 783.831, - "eval_steps_per_second": 48.992, + "eval_loss": 3.3822684288024902, + "eval_runtime": 114.8049, + "eval_samples_per_second": 899.569, + "eval_steps_per_second": 56.226, "step": 1016000 }, { "epoch": 8.35, "learning_rate": 5.736679729842408e-06, - "loss": 2.9695, + "loss": 3.3449, "step": 1024000 }, { "epoch": 8.35, - "eval_loss": 3.021596670150757, - "eval_runtime": 131.2334, - "eval_samples_per_second": 786.956, - "eval_steps_per_second": 49.187, + "eval_loss": 3.3724589347839355, + "eval_runtime": 114.8265, + "eval_samples_per_second": 899.4, + "eval_steps_per_second": 56.215, "step": 1024000 }, { "epoch": 8.41, - "eval_loss": 3.0219063758850098, - "eval_runtime": 131.6228, - "eval_samples_per_second": 784.629, - "eval_steps_per_second": 49.042, + "eval_loss": 3.37973952293396, + "eval_runtime": 115.0872, + "eval_samples_per_second": 897.363, + "eval_steps_per_second": 56.088, "step": 1032000 }, { "epoch": 8.48, "learning_rate": 5.669974151588427e-06, - "loss": 2.9762, + "loss": 3.3336, "step": 1040000 }, { "epoch": 8.48, - "eval_loss": 3.018242597579956, - "eval_runtime": 131.898, - "eval_samples_per_second": 782.991, - "eval_steps_per_second": 48.939, + "eval_loss": 3.38781476020813, + "eval_runtime": 116.3835, + "eval_samples_per_second": 887.368, + "eval_steps_per_second": 55.463, "step": 1040000 }, { "epoch": 8.55, - "eval_loss": 3.0332210063934326, - "eval_runtime": 132.3771, - "eval_samples_per_second": 780.158, - "eval_steps_per_second": 48.762, + "eval_loss": 3.384516716003418, + "eval_runtime": 115.2938, + "eval_samples_per_second": 895.755, + "eval_steps_per_second": 55.987, "step": 1048000 }, { "epoch": 8.61, "learning_rate": 5.603268573334446e-06, - "loss": 2.9786, + "loss": 3.3307, "step": 1056000 }, { "epoch": 8.61, - "eval_loss": 3.001666307449341, - "eval_runtime": 131.4368, - "eval_samples_per_second": 785.739, - "eval_steps_per_second": 49.111, + "eval_loss": 3.390652894973755, + "eval_runtime": 116.7145, + "eval_samples_per_second": 884.851, + "eval_steps_per_second": 55.306, "step": 1056000 }, { "epoch": 8.68, - "eval_loss": 3.0236458778381348, - "eval_runtime": 130.9562, - "eval_samples_per_second": 788.622, - "eval_steps_per_second": 49.291, + "eval_loss": 3.3857922554016113, + "eval_runtime": 115.6915, + "eval_samples_per_second": 892.676, + "eval_steps_per_second": 55.795, "step": 1064000 }, { "epoch": 8.74, "learning_rate": 5.536562995080464e-06, - "loss": 2.9889, + "loss": 3.3267, "step": 1072000 }, { "epoch": 8.74, - "eval_loss": 3.0273077487945557, - "eval_runtime": 131.9047, - "eval_samples_per_second": 782.952, - "eval_steps_per_second": 48.937, + "eval_loss": 3.3951947689056396, + "eval_runtime": 115.1111, + "eval_samples_per_second": 897.177, + "eval_steps_per_second": 56.076, "step": 1072000 }, { "epoch": 8.81, - "eval_loss": 3.01967191696167, - "eval_runtime": 131.9615, - "eval_samples_per_second": 782.615, - "eval_steps_per_second": 48.916, + "eval_loss": 3.391402006149292, + "eval_runtime": 114.8898, + "eval_samples_per_second": 898.905, + "eval_steps_per_second": 56.184, "step": 1080000 }, { "epoch": 8.87, "learning_rate": 5.469857416826483e-06, - "loss": 2.9842, + "loss": 3.335, "step": 1088000 }, { "epoch": 8.87, - "eval_loss": 3.037600040435791, - "eval_runtime": 131.9507, - "eval_samples_per_second": 782.679, - "eval_steps_per_second": 48.92, + "eval_loss": 3.3904380798339844, + "eval_runtime": 116.7468, + "eval_samples_per_second": 884.607, + "eval_steps_per_second": 55.291, "step": 1088000 }, { "epoch": 8.94, - "eval_loss": 3.032285213470459, - "eval_runtime": 131.7234, - "eval_samples_per_second": 784.029, - "eval_steps_per_second": 49.004, + "eval_loss": 3.3894879817962646, + "eval_runtime": 115.0778, + "eval_samples_per_second": 897.437, + "eval_steps_per_second": 56.092, "step": 1096000 }, { "epoch": 9.0, "learning_rate": 5.403151838572501e-06, - "loss": 2.9912, + "loss": 3.3411, "step": 1104000 }, { "epoch": 9.0, - "eval_loss": 3.031731367111206, - "eval_runtime": 131.8868, - "eval_samples_per_second": 783.058, - "eval_steps_per_second": 48.944, + "eval_loss": 3.395911455154419, + "eval_runtime": 116.3802, + "eval_samples_per_second": 887.393, + "eval_steps_per_second": 55.465, "step": 1104000 }, { "epoch": 9.07, - "eval_loss": 3.022475481033325, - "eval_runtime": 131.0568, - "eval_samples_per_second": 788.017, - "eval_steps_per_second": 49.253, + "eval_loss": 3.391462802886963, + "eval_runtime": 115.5689, + "eval_samples_per_second": 893.623, + "eval_steps_per_second": 55.854, "step": 1112000 }, { "epoch": 9.13, "learning_rate": 5.33644626031852e-06, - "loss": 2.9919, + "loss": 3.3324, "step": 1120000 }, { "epoch": 9.13, - "eval_loss": 3.036106824874878, - "eval_runtime": 132.2182, - "eval_samples_per_second": 781.095, - "eval_steps_per_second": 48.821, + "eval_loss": 3.4030401706695557, + "eval_runtime": 115.7261, + "eval_samples_per_second": 892.409, + "eval_steps_per_second": 55.778, "step": 1120000 }, { "epoch": 9.2, - "eval_loss": 3.0432300567626953, - "eval_runtime": 131.9088, - "eval_samples_per_second": 782.927, - "eval_steps_per_second": 48.935, + "eval_loss": 3.4083750247955322, + "eval_runtime": 118.5809, + "eval_samples_per_second": 870.924, + "eval_steps_per_second": 54.435, "step": 1128000 }, { "epoch": 9.26, "learning_rate": 5.269740682064538e-06, - "loss": 2.9872, + "loss": 3.3297, "step": 1136000 }, { "epoch": 9.26, - "eval_loss": 3.0306613445281982, - "eval_runtime": 131.2348, - "eval_samples_per_second": 786.948, - "eval_steps_per_second": 49.187, + "eval_loss": 3.402348518371582, + "eval_runtime": 115.6049, + "eval_samples_per_second": 893.344, + "eval_steps_per_second": 55.837, "step": 1136000 }, { "epoch": 9.33, - "eval_loss": 3.0481879711151123, - "eval_runtime": 131.7205, - "eval_samples_per_second": 784.046, - "eval_steps_per_second": 49.005, + "eval_loss": 3.3967323303222656, + "eval_runtime": 115.5344, + "eval_samples_per_second": 893.889, + "eval_steps_per_second": 55.871, "step": 1144000 }, { "epoch": 9.39, "learning_rate": 5.203035103810556e-06, - "loss": 2.9823, + "loss": 3.3492, "step": 1152000 }, { "epoch": 9.39, - "eval_loss": 3.035399913787842, - "eval_runtime": 131.2188, - "eval_samples_per_second": 787.044, - "eval_steps_per_second": 49.193, + "eval_loss": 3.393101215362549, + "eval_runtime": 115.5769, + "eval_samples_per_second": 893.561, + "eval_steps_per_second": 55.85, "step": 1152000 }, { "epoch": 9.46, - "eval_loss": 3.0419015884399414, - "eval_runtime": 131.8024, - "eval_samples_per_second": 783.559, - "eval_steps_per_second": 48.975, + "eval_loss": 3.4064693450927734, + "eval_runtime": 114.7523, + "eval_samples_per_second": 899.982, + "eval_steps_per_second": 56.252, "step": 1160000 }, { "epoch": 9.52, "learning_rate": 5.136329525556575e-06, - "loss": 2.9882, + "loss": 3.3317, "step": 1168000 }, { "epoch": 9.52, - "eval_loss": 3.0567431449890137, - "eval_runtime": 132.7773, - "eval_samples_per_second": 777.806, - "eval_steps_per_second": 48.615, + "eval_loss": 3.3905270099639893, + "eval_runtime": 115.5534, + "eval_samples_per_second": 893.743, + "eval_steps_per_second": 55.862, "step": 1168000 }, { "epoch": 9.59, - "eval_loss": 3.0395400524139404, - "eval_runtime": 131.6554, - "eval_samples_per_second": 784.434, - "eval_steps_per_second": 49.03, + "eval_loss": 3.402090072631836, + "eval_runtime": 114.6435, + "eval_samples_per_second": 900.836, + "eval_steps_per_second": 56.305, "step": 1176000 }, { "epoch": 9.65, "learning_rate": 5.0696239473025935e-06, - "loss": 3.0079, + "loss": 3.3447, "step": 1184000 }, { "epoch": 9.65, - "eval_loss": 3.0572261810302734, - "eval_runtime": 132.0184, - "eval_samples_per_second": 782.278, - "eval_steps_per_second": 48.895, + "eval_loss": 3.400120735168457, + "eval_runtime": 116.0858, + "eval_samples_per_second": 889.643, + "eval_steps_per_second": 55.605, "step": 1184000 }, { "epoch": 9.72, - "eval_loss": 3.04028058052063, - "eval_runtime": 131.8056, - "eval_samples_per_second": 783.54, - "eval_steps_per_second": 48.974, + "eval_loss": 3.3942949771881104, + "eval_runtime": 114.8922, + "eval_samples_per_second": 898.886, + "eval_steps_per_second": 56.183, "step": 1192000 }, { "epoch": 9.78, "learning_rate": 5.002918369048611e-06, - "loss": 3.0243, + "loss": 3.3377, "step": 1200000 }, { "epoch": 9.78, - "eval_loss": 3.047227621078491, - "eval_runtime": 131.9863, - "eval_samples_per_second": 782.467, - "eval_steps_per_second": 48.907, + "eval_loss": 3.3970954418182373, + "eval_runtime": 114.8942, + "eval_samples_per_second": 898.871, + "eval_steps_per_second": 56.182, "step": 1200000 }, { "epoch": 9.85, - "eval_loss": 3.052279472351074, - "eval_runtime": 132.2017, - "eval_samples_per_second": 781.193, - "eval_steps_per_second": 48.827, + "eval_loss": 3.3946433067321777, + "eval_runtime": 114.9828, + "eval_samples_per_second": 898.178, + "eval_steps_per_second": 56.139, "step": 1208000 }, { "epoch": 9.92, "learning_rate": 4.936212790794631e-06, - "loss": 3.0127, + "loss": 3.3486, "step": 1216000 }, { "epoch": 9.92, - "eval_loss": 3.053439140319824, - "eval_runtime": 131.3363, - "eval_samples_per_second": 786.34, - "eval_steps_per_second": 49.149, + "eval_loss": 3.392373561859131, + "eval_runtime": 115.6846, + "eval_samples_per_second": 892.729, + "eval_steps_per_second": 55.798, "step": 1216000 }, { "epoch": 9.98, - "eval_loss": 3.0434141159057617, - "eval_runtime": 131.7363, - "eval_samples_per_second": 783.952, - "eval_steps_per_second": 48.999, + "eval_loss": 3.398346424102783, + "eval_runtime": 115.4236, + "eval_samples_per_second": 894.747, + "eval_steps_per_second": 55.924, "step": 1224000 }, { "epoch": 10.05, "learning_rate": 4.869507212540649e-06, - "loss": 3.0106, + "loss": 3.3471, "step": 1232000 }, { "epoch": 10.05, - "eval_loss": 3.0687036514282227, - "eval_runtime": 131.4287, - "eval_samples_per_second": 785.788, - "eval_steps_per_second": 49.114, + "eval_loss": 3.414100408554077, + "eval_runtime": 115.0455, + "eval_samples_per_second": 897.689, + "eval_steps_per_second": 56.108, "step": 1232000 }, { "epoch": 10.11, - "eval_loss": 3.0677733421325684, - "eval_runtime": 132.6312, - "eval_samples_per_second": 778.663, - "eval_steps_per_second": 48.669, + "eval_loss": 3.4220006465911865, + "eval_runtime": 115.4764, + "eval_samples_per_second": 894.339, + "eval_steps_per_second": 55.899, "step": 1240000 }, { "epoch": 10.18, "learning_rate": 4.802801634286667e-06, - "loss": 3.0063, + "loss": 3.3457, "step": 1248000 }, { "epoch": 10.18, - "eval_loss": 3.0652401447296143, - "eval_runtime": 132.5035, - "eval_samples_per_second": 779.413, - "eval_steps_per_second": 48.716, + "eval_loss": 3.4085357189178467, + "eval_runtime": 115.0154, + "eval_samples_per_second": 897.923, + "eval_steps_per_second": 56.123, "step": 1248000 }, { "epoch": 10.24, - "eval_loss": 3.0768234729766846, - "eval_runtime": 131.7104, - "eval_samples_per_second": 784.107, - "eval_steps_per_second": 49.009, + "eval_loss": 3.424273729324341, + "eval_runtime": 114.96, + "eval_samples_per_second": 898.356, + "eval_steps_per_second": 56.15, "step": 1256000 }, { "epoch": 10.31, "learning_rate": 4.7360960560326865e-06, - "loss": 3.0187, + "loss": 3.3278, "step": 1264000 }, { "epoch": 10.31, - "eval_loss": 3.069179058074951, - "eval_runtime": 132.7895, - "eval_samples_per_second": 777.735, - "eval_steps_per_second": 48.611, + "eval_loss": 3.4058358669281006, + "eval_runtime": 115.4303, + "eval_samples_per_second": 894.696, + "eval_steps_per_second": 55.921, "step": 1264000 }, { "epoch": 10.37, - "eval_loss": 3.0621213912963867, - "eval_runtime": 132.041, - "eval_samples_per_second": 782.144, - "eval_steps_per_second": 48.886, + "eval_loss": 3.403254985809326, + "eval_runtime": 114.783, + "eval_samples_per_second": 899.741, + "eval_steps_per_second": 56.237, "step": 1272000 }, { "epoch": 10.44, "learning_rate": 4.669390477778704e-06, - "loss": 3.0202, + "loss": 3.325, "step": 1280000 }, { "epoch": 10.44, - "eval_loss": 3.0663187503814697, - "eval_runtime": 132.2635, - "eval_samples_per_second": 780.828, - "eval_steps_per_second": 48.804, + "eval_loss": 3.3866589069366455, + "eval_runtime": 115.6771, + "eval_samples_per_second": 892.787, + "eval_steps_per_second": 55.802, "step": 1280000 }, { "epoch": 10.5, - "eval_loss": 3.0537171363830566, - "eval_runtime": 132.2536, - "eval_samples_per_second": 780.886, - "eval_steps_per_second": 48.808, + "eval_loss": 3.3878674507141113, + "eval_runtime": 114.7924, + "eval_samples_per_second": 899.667, + "eval_steps_per_second": 56.232, "step": 1288000 }, { "epoch": 10.57, "learning_rate": 4.602684899524723e-06, - "loss": 3.0219, + "loss": 3.3248, "step": 1296000 }, { "epoch": 10.57, - "eval_loss": 3.072500705718994, - "eval_runtime": 132.0295, - "eval_samples_per_second": 782.212, - "eval_steps_per_second": 48.891, + "eval_loss": 3.380067825317383, + "eval_runtime": 115.2061, + "eval_samples_per_second": 896.437, + "eval_steps_per_second": 56.03, "step": 1296000 }, { "epoch": 10.63, - "eval_loss": 3.0664169788360596, - "eval_runtime": 131.9651, - "eval_samples_per_second": 782.593, - "eval_steps_per_second": 48.914, + "eval_loss": 3.4026682376861572, + "eval_runtime": 117.5473, + "eval_samples_per_second": 878.583, + "eval_steps_per_second": 54.914, "step": 1304000 }, { "epoch": 10.7, "learning_rate": 4.5359793212707415e-06, - "loss": 3.0232, + "loss": 3.3217, "step": 1312000 }, { "epoch": 10.7, - "eval_loss": 3.0724074840545654, - "eval_runtime": 133.2104, - "eval_samples_per_second": 775.277, - "eval_steps_per_second": 48.457, + "eval_loss": 3.3781392574310303, + "eval_runtime": 116.9837, + "eval_samples_per_second": 882.816, + "eval_steps_per_second": 55.179, "step": 1312000 }, { "epoch": 10.76, - "eval_loss": 3.0476126670837402, - "eval_runtime": 132.7171, - "eval_samples_per_second": 778.159, - "eval_steps_per_second": 48.637, + "eval_loss": 3.38712477684021, + "eval_runtime": 116.1554, + "eval_samples_per_second": 889.111, + "eval_steps_per_second": 55.572, "step": 1320000 }, { "epoch": 10.83, "learning_rate": 4.46927374301676e-06, - "loss": 3.0247, + "loss": 3.3227, "step": 1328000 }, { "epoch": 10.83, - "eval_loss": 3.0729353427886963, - "eval_runtime": 132.4018, - "eval_samples_per_second": 780.012, - "eval_steps_per_second": 48.753, + "eval_loss": 3.386099338531494, + "eval_runtime": 116.8959, + "eval_samples_per_second": 883.478, + "eval_steps_per_second": 55.22, "step": 1328000 }, { "epoch": 10.89, - "eval_loss": 3.0645902156829834, - "eval_runtime": 133.3334, - "eval_samples_per_second": 774.562, - "eval_steps_per_second": 48.412, + "eval_loss": 3.378852605819702, + "eval_runtime": 116.5746, + "eval_samples_per_second": 885.913, + "eval_steps_per_second": 55.372, "step": 1336000 }, { "epoch": 10.96, "learning_rate": 4.402568164762779e-06, - "loss": 3.0335, + "loss": 3.3259, "step": 1344000 }, { "epoch": 10.96, - "eval_loss": 3.0603559017181396, - "eval_runtime": 131.9232, - "eval_samples_per_second": 782.842, - "eval_steps_per_second": 48.93, + "eval_loss": 3.386458158493042, + "eval_runtime": 116.5428, + "eval_samples_per_second": 886.155, + "eval_steps_per_second": 55.387, "step": 1344000 }, { "epoch": 11.02, - "eval_loss": 3.0630509853363037, - "eval_runtime": 132.4502, - "eval_samples_per_second": 779.727, - "eval_steps_per_second": 48.735, + "eval_loss": 3.386268377304077, + "eval_runtime": 115.7105, + "eval_samples_per_second": 892.529, + "eval_steps_per_second": 55.786, "step": 1352000 }, { "epoch": 11.09, "learning_rate": 4.335862586508797e-06, - "loss": 3.0182, + "loss": 3.3094, "step": 1360000 }, { "epoch": 11.09, - "eval_loss": 3.0669026374816895, - "eval_runtime": 133.3499, - "eval_samples_per_second": 774.466, - "eval_steps_per_second": 48.406, + "eval_loss": 3.3826916217803955, + "eval_runtime": 118.0068, + "eval_samples_per_second": 875.161, + "eval_steps_per_second": 54.7, "step": 1360000 }, { "epoch": 11.15, - "eval_loss": 3.0626471042633057, - "eval_runtime": 133.0041, - "eval_samples_per_second": 776.48, - "eval_steps_per_second": 48.532, + "eval_loss": 3.3880295753479004, + "eval_runtime": 115.413, + "eval_samples_per_second": 894.83, + "eval_steps_per_second": 55.93, "step": 1368000 }, { "epoch": 11.22, "learning_rate": 4.269157008254816e-06, - "loss": 3.0124, + "loss": 3.3128, "step": 1376000 }, { "epoch": 11.22, - "eval_loss": 3.053469181060791, - "eval_runtime": 133.5969, - "eval_samples_per_second": 773.034, - "eval_steps_per_second": 48.317, + "eval_loss": 3.365227460861206, + "eval_runtime": 116.1062, + "eval_samples_per_second": 889.487, + "eval_steps_per_second": 55.596, "step": 1376000 }, { "epoch": 11.29, - "eval_loss": 3.076792001724243, - "eval_runtime": 133.0672, - "eval_samples_per_second": 776.112, - "eval_steps_per_second": 48.509, + "eval_loss": 3.381347179412842, + "eval_runtime": 119.0899, + "eval_samples_per_second": 867.202, + "eval_steps_per_second": 54.203, "step": 1384000 }, { "epoch": 11.35, "learning_rate": 4.202451430000834e-06, - "loss": 3.016, + "loss": 3.3088, "step": 1392000 }, { "epoch": 11.35, - "eval_loss": 3.0615081787109375, - "eval_runtime": 133.9693, - "eval_samples_per_second": 770.886, - "eval_steps_per_second": 48.183, + "eval_loss": 3.385295867919922, + "eval_runtime": 115.9391, + "eval_samples_per_second": 890.769, + "eval_steps_per_second": 55.676, "step": 1392000 }, { "epoch": 11.42, - "eval_loss": 3.0689148902893066, - "eval_runtime": 134.418, - "eval_samples_per_second": 768.312, - "eval_steps_per_second": 48.022, + "eval_loss": 3.3708653450012207, + "eval_runtime": 116.9766, + "eval_samples_per_second": 882.869, + "eval_steps_per_second": 55.182, "step": 1400000 }, { "epoch": 11.48, "learning_rate": 4.135745851746852e-06, - "loss": 3.0133, + "loss": 3.3067, "step": 1408000 }, { "epoch": 11.48, - "eval_loss": 3.069943428039551, - "eval_runtime": 133.7409, - "eval_samples_per_second": 772.202, - "eval_steps_per_second": 48.265, + "eval_loss": 3.3830504417419434, + "eval_runtime": 115.9272, + "eval_samples_per_second": 890.861, + "eval_steps_per_second": 55.682, "step": 1408000 }, { "epoch": 11.55, - "eval_loss": 3.0647213459014893, - "eval_runtime": 134.5422, - "eval_samples_per_second": 767.603, - "eval_steps_per_second": 47.977, + "eval_loss": 3.370314598083496, + "eval_runtime": 117.2105, + "eval_samples_per_second": 881.107, + "eval_steps_per_second": 55.072, "step": 1416000 }, { "epoch": 11.61, "learning_rate": 4.069040273492872e-06, - "loss": 3.0227, + "loss": 3.311, "step": 1424000 }, { "epoch": 11.61, - "eval_loss": 3.0704684257507324, - "eval_runtime": 135.8934, - "eval_samples_per_second": 759.97, - "eval_steps_per_second": 47.5, + "eval_loss": 3.369617223739624, + "eval_runtime": 116.4339, + "eval_samples_per_second": 886.984, + "eval_steps_per_second": 55.439, "step": 1424000 }, { "epoch": 11.68, - "eval_loss": 3.0705504417419434, - "eval_runtime": 133.4155, - "eval_samples_per_second": 774.086, - "eval_steps_per_second": 48.383, + "eval_loss": 3.3768646717071533, + "eval_runtime": 118.1326, + "eval_samples_per_second": 874.23, + "eval_steps_per_second": 54.642, "step": 1432000 }, { "epoch": 11.74, "learning_rate": 4.0023346952388895e-06, - "loss": 3.0267, + "loss": 3.3048, "step": 1440000 }, { "epoch": 11.74, - "eval_loss": 3.069384813308716, - "eval_runtime": 133.2021, - "eval_samples_per_second": 775.326, - "eval_steps_per_second": 48.46, + "eval_loss": 3.373983860015869, + "eval_runtime": 118.2179, + "eval_samples_per_second": 873.598, + "eval_steps_per_second": 54.603, "step": 1440000 }, { "epoch": 11.81, - "eval_loss": 3.0720527172088623, - "eval_runtime": 133.9349, - "eval_samples_per_second": 771.083, - "eval_steps_per_second": 48.195, + "eval_loss": 3.3731493949890137, + "eval_runtime": 116.9055, + "eval_samples_per_second": 883.406, + "eval_steps_per_second": 55.216, "step": 1448000 }, { "epoch": 11.87, "learning_rate": 3.935629116984908e-06, - "loss": 3.021, + "loss": 3.3055, "step": 1456000 }, { "epoch": 11.87, - "eval_loss": 3.068966865539551, - "eval_runtime": 132.597, - "eval_samples_per_second": 778.864, - "eval_steps_per_second": 48.681, + "eval_loss": 3.365483283996582, + "eval_runtime": 117.1876, + "eval_samples_per_second": 881.279, + "eval_steps_per_second": 55.083, "step": 1456000 }, { "epoch": 11.94, - "eval_loss": 3.060349702835083, - "eval_runtime": 134.1972, - "eval_samples_per_second": 769.576, - "eval_steps_per_second": 48.101, + "eval_loss": 3.3697094917297363, + "eval_runtime": 117.1788, + "eval_samples_per_second": 881.346, + "eval_steps_per_second": 55.087, "step": 1464000 }, { "epoch": 12.0, "learning_rate": 3.868923538730927e-06, - "loss": 3.0144, + "loss": 3.3105, "step": 1472000 }, { "epoch": 12.0, - "eval_loss": 3.065760374069214, - "eval_runtime": 134.4544, - "eval_samples_per_second": 768.104, - "eval_steps_per_second": 48.009, + "eval_loss": 3.3741800785064697, + "eval_runtime": 116.7081, + "eval_samples_per_second": 884.9, + "eval_steps_per_second": 55.309, "step": 1472000 }, { "epoch": 12.07, - "eval_loss": 3.0719916820526123, - "eval_runtime": 133.6199, - "eval_samples_per_second": 772.902, - "eval_steps_per_second": 48.309, + "eval_loss": 3.3614203929901123, + "eval_runtime": 118.1522, + "eval_samples_per_second": 874.084, + "eval_steps_per_second": 54.633, "step": 1480000 }, { "epoch": 12.13, "learning_rate": 3.8022179604769453e-06, - "loss": 3.0204, + "loss": 3.2977, "step": 1488000 }, { "epoch": 12.13, - "eval_loss": 3.066779851913452, - "eval_runtime": 133.3793, - "eval_samples_per_second": 774.296, - "eval_steps_per_second": 48.396, + "eval_loss": 3.370495319366455, + "eval_runtime": 117.0737, + "eval_samples_per_second": 882.137, + "eval_steps_per_second": 55.136, "step": 1488000 }, { "epoch": 12.2, - "eval_loss": 3.0773117542266846, - "eval_runtime": 135.2249, - "eval_samples_per_second": 763.728, - "eval_steps_per_second": 47.735, + "eval_loss": 3.3746001720428467, + "eval_runtime": 117.4262, + "eval_samples_per_second": 879.489, + "eval_steps_per_second": 54.971, "step": 1496000 }, { "epoch": 12.26, "learning_rate": 3.735512382222964e-06, - "loss": 3.0085, + "loss": 3.2999, "step": 1504000 }, { "epoch": 12.26, - "eval_loss": 3.0847675800323486, - "eval_runtime": 133.4406, - "eval_samples_per_second": 773.94, - "eval_steps_per_second": 48.374, + "eval_loss": 3.3690757751464844, + "eval_runtime": 114.9601, + "eval_samples_per_second": 898.355, + "eval_steps_per_second": 56.15, "step": 1504000 }, { "epoch": 12.33, - "eval_loss": 3.0567853450775146, - "eval_runtime": 136.5184, - "eval_samples_per_second": 756.492, - "eval_steps_per_second": 47.283, + "eval_loss": 3.374530792236328, + "eval_runtime": 115.3595, + "eval_samples_per_second": 895.245, + "eval_steps_per_second": 55.955, "step": 1512000 }, { "epoch": 12.39, "learning_rate": 3.668806803968982e-06, - "loss": 3.0146, + "loss": 3.2983, "step": 1520000 }, { "epoch": 12.39, - "eval_loss": 3.0783281326293945, - "eval_runtime": 134.8805, - "eval_samples_per_second": 765.678, - "eval_steps_per_second": 47.857, + "eval_loss": 3.3717198371887207, + "eval_runtime": 114.9666, + "eval_samples_per_second": 898.304, + "eval_steps_per_second": 56.147, "step": 1520000 }, { "epoch": 12.46, - "eval_loss": 3.073552370071411, - "eval_runtime": 133.8542, - "eval_samples_per_second": 771.549, - "eval_steps_per_second": 48.224, + "eval_loss": 3.368246555328369, + "eval_runtime": 115.591, + "eval_samples_per_second": 893.452, + "eval_steps_per_second": 55.843, "step": 1528000 }, { "epoch": 12.52, "learning_rate": 3.6021012257150007e-06, - "loss": 3.02, + "loss": 3.2957, "step": 1536000 }, { "epoch": 12.52, - "eval_loss": 3.0533952713012695, - "eval_runtime": 133.5934, - "eval_samples_per_second": 773.055, - "eval_steps_per_second": 48.318, + "eval_loss": 3.369278907775879, + "eval_runtime": 116.1156, + "eval_samples_per_second": 889.416, + "eval_steps_per_second": 55.591, "step": 1536000 }, { "epoch": 12.59, - "eval_loss": 3.0684494972229004, - "eval_runtime": 133.2901, - "eval_samples_per_second": 774.814, - "eval_steps_per_second": 48.428, + "eval_loss": 3.376443386077881, + "eval_runtime": 114.7209, + "eval_samples_per_second": 900.228, + "eval_steps_per_second": 56.267, "step": 1544000 }, { "epoch": 12.65, "learning_rate": 3.535395647461019e-06, - "loss": 3.0229, + "loss": 3.293, "step": 1552000 }, { "epoch": 12.65, - "eval_loss": 3.07673978805542, - "eval_runtime": 134.115, - "eval_samples_per_second": 770.048, - "eval_steps_per_second": 48.13, + "eval_loss": 3.3690662384033203, + "eval_runtime": 114.9457, + "eval_samples_per_second": 898.468, + "eval_steps_per_second": 56.157, "step": 1552000 }, { "epoch": 12.72, - "eval_loss": 3.0568747520446777, - "eval_runtime": 134.3484, - "eval_samples_per_second": 768.71, - "eval_steps_per_second": 48.047, + "eval_loss": 3.380187511444092, + "eval_runtime": 115.2975, + "eval_samples_per_second": 895.726, + "eval_steps_per_second": 55.986, "step": 1560000 }, { "epoch": 12.79, "learning_rate": 3.468690069207038e-06, - "loss": 3.0152, + "loss": 3.2919, "step": 1568000 }, { "epoch": 12.79, - "eval_loss": 3.0787863731384277, - "eval_runtime": 133.764, - "eval_samples_per_second": 772.068, - "eval_steps_per_second": 48.257, + "eval_loss": 3.3626480102539062, + "eval_runtime": 115.0018, + "eval_samples_per_second": 898.03, + "eval_steps_per_second": 56.13, "step": 1568000 }, { "epoch": 12.85, - "eval_loss": 3.066344738006592, - "eval_runtime": 133.9216, - "eval_samples_per_second": 771.16, - "eval_steps_per_second": 48.2, + "eval_loss": 3.3604438304901123, + "eval_runtime": 116.2394, + "eval_samples_per_second": 888.468, + "eval_steps_per_second": 55.532, "step": 1576000 }, { "epoch": 12.92, "learning_rate": 3.4019844909530565e-06, - "loss": 3.02, + "loss": 3.3023, "step": 1584000 }, { "epoch": 12.92, - "eval_loss": 3.067016839981079, - "eval_runtime": 133.9971, - "eval_samples_per_second": 770.725, - "eval_steps_per_second": 48.173, + "eval_loss": 3.374943971633911, + "eval_runtime": 115.4828, + "eval_samples_per_second": 894.289, + "eval_steps_per_second": 55.896, "step": 1584000 }, { "epoch": 12.98, - "eval_loss": 3.0683343410491943, - "eval_runtime": 134.2208, - "eval_samples_per_second": 769.441, - "eval_steps_per_second": 48.092, + "eval_loss": 3.368828773498535, + "eval_runtime": 114.8626, + "eval_samples_per_second": 899.118, + "eval_steps_per_second": 56.198, "step": 1592000 }, { "epoch": 13.05, "learning_rate": 3.3352789126990747e-06, - "loss": 3.0128, + "loss": 3.2988, "step": 1600000 }, { "epoch": 13.05, - "eval_loss": 3.071779489517212, - "eval_runtime": 134.2033, - "eval_samples_per_second": 769.541, - "eval_steps_per_second": 48.099, + "eval_loss": 3.3666255474090576, + "eval_runtime": 115.7226, + "eval_samples_per_second": 892.436, + "eval_steps_per_second": 55.78, "step": 1600000 }, { "epoch": 13.11, - "eval_loss": 3.0846707820892334, - "eval_runtime": 134.6625, - "eval_samples_per_second": 766.917, - "eval_steps_per_second": 47.935, + "eval_loss": 3.369481325149536, + "eval_runtime": 116.2492, + "eval_samples_per_second": 888.393, + "eval_steps_per_second": 55.527, "step": 1608000 }, { "epoch": 13.18, "learning_rate": 3.2685733344450933e-06, - "loss": 3.016, + "loss": 3.2924, "step": 1616000 }, { "epoch": 13.18, - "eval_loss": 3.066356897354126, - "eval_runtime": 134.4556, - "eval_samples_per_second": 768.097, - "eval_steps_per_second": 48.008, + "eval_loss": 3.364980697631836, + "eval_runtime": 114.892, + "eval_samples_per_second": 898.887, + "eval_steps_per_second": 56.183, "step": 1616000 }, { "epoch": 13.24, - "eval_loss": 3.0687520503997803, - "eval_runtime": 134.3299, - "eval_samples_per_second": 768.816, - "eval_steps_per_second": 48.053, + "eval_loss": 3.3651351928710938, + "eval_runtime": 114.7414, + "eval_samples_per_second": 900.068, + "eval_steps_per_second": 56.257, "step": 1624000 }, { "epoch": 13.31, "learning_rate": 3.2018677561911115e-06, - "loss": 3.0007, + "loss": 3.2958, "step": 1632000 }, { "epoch": 13.31, - "eval_loss": 3.0740671157836914, - "eval_runtime": 134.4424, - "eval_samples_per_second": 768.173, - "eval_steps_per_second": 48.013, + "eval_loss": 3.369225263595581, + "eval_runtime": 115.9526, + "eval_samples_per_second": 890.666, + "eval_steps_per_second": 55.669, "step": 1632000 }, { "epoch": 13.37, - "eval_loss": 3.0663323402404785, - "eval_runtime": 134.2383, - "eval_samples_per_second": 769.341, - "eval_steps_per_second": 48.086, + "eval_loss": 3.3855459690093994, + "eval_runtime": 114.8307, + "eval_samples_per_second": 899.367, + "eval_steps_per_second": 56.213, "step": 1640000 }, { "epoch": 13.44, "learning_rate": 3.1351621779371306e-06, - "loss": 3.0241, + "loss": 3.2918, "step": 1648000 }, { "epoch": 13.44, - "eval_loss": 3.0607213973999023, - "eval_runtime": 134.0502, - "eval_samples_per_second": 770.42, - "eval_steps_per_second": 48.154, + "eval_loss": 3.3706300258636475, + "eval_runtime": 115.344, + "eval_samples_per_second": 895.365, + "eval_steps_per_second": 55.963, "step": 1648000 }, { "epoch": 13.5, - "eval_loss": 3.0635085105895996, - "eval_runtime": 133.9453, - "eval_samples_per_second": 771.024, - "eval_steps_per_second": 48.191, + "eval_loss": 3.3680288791656494, + "eval_runtime": 114.7321, + "eval_samples_per_second": 900.14, + "eval_steps_per_second": 56.261, "step": 1656000 }, { "epoch": 13.57, "learning_rate": 3.0684565996831487e-06, - "loss": 3.0103, + "loss": 3.2948, "step": 1664000 }, { "epoch": 13.57, - "eval_loss": 3.0730724334716797, - "eval_runtime": 135.0683, - "eval_samples_per_second": 764.613, - "eval_steps_per_second": 47.791, + "eval_loss": 3.353415012359619, + "eval_runtime": 116.4266, + "eval_samples_per_second": 887.039, + "eval_steps_per_second": 55.443, "step": 1664000 }, { "epoch": 13.63, - "eval_loss": 3.0649466514587402, - "eval_runtime": 134.138, - "eval_samples_per_second": 769.916, - "eval_steps_per_second": 48.122, + "eval_loss": 3.369929790496826, + "eval_runtime": 114.8306, + "eval_samples_per_second": 899.369, + "eval_steps_per_second": 56.213, "step": 1672000 }, { "epoch": 13.7, "learning_rate": 3.0017510214291673e-06, - "loss": 3.0188, + "loss": 3.2996, "step": 1680000 }, { "epoch": 13.7, - "eval_loss": 3.058675765991211, - "eval_runtime": 134.4659, - "eval_samples_per_second": 768.039, - "eval_steps_per_second": 48.005, + "eval_loss": 3.3732664585113525, + "eval_runtime": 115.7005, + "eval_samples_per_second": 892.607, + "eval_steps_per_second": 55.791, "step": 1680000 }, { "epoch": 13.76, - "eval_loss": 3.0703861713409424, - "eval_runtime": 134.1628, - "eval_samples_per_second": 769.773, - "eval_steps_per_second": 48.113, + "eval_loss": 3.3764214515686035, + "eval_runtime": 115.4981, + "eval_samples_per_second": 894.171, + "eval_steps_per_second": 55.888, "step": 1688000 }, { "epoch": 13.83, "learning_rate": 2.9350454431751855e-06, - "loss": 3.0217, + "loss": 3.2999, "step": 1696000 }, { "epoch": 13.83, - "eval_loss": 3.066443920135498, - "eval_runtime": 135.8944, - "eval_samples_per_second": 759.965, - "eval_steps_per_second": 47.5, + "eval_loss": 3.3792943954467773, + "eval_runtime": 116.0913, + "eval_samples_per_second": 889.602, + "eval_steps_per_second": 55.603, "step": 1696000 }, { "epoch": 13.89, - "eval_loss": 3.0626626014709473, - "eval_runtime": 135.45, - "eval_samples_per_second": 762.458, - "eval_steps_per_second": 47.656, + "eval_loss": 3.368272304534912, + "eval_runtime": 116.0753, + "eval_samples_per_second": 889.724, + "eval_steps_per_second": 55.61, "step": 1704000 }, { "epoch": 13.96, "learning_rate": 2.868339864921204e-06, - "loss": 3.0282, + "loss": 3.291, "step": 1712000 }, { "epoch": 13.96, - "eval_loss": 3.071357488632202, - "eval_runtime": 134.3182, - "eval_samples_per_second": 768.883, - "eval_steps_per_second": 48.058, + "eval_loss": 3.3653597831726074, + "eval_runtime": 115.5031, + "eval_samples_per_second": 894.132, + "eval_steps_per_second": 55.886, "step": 1712000 }, { "epoch": 14.02, - "eval_loss": 3.0688371658325195, - "eval_runtime": 135.2782, - "eval_samples_per_second": 763.427, - "eval_steps_per_second": 47.716, + "eval_loss": 3.372131109237671, + "eval_runtime": 115.6199, + "eval_samples_per_second": 893.228, + "eval_steps_per_second": 55.829, "step": 1720000 }, { "epoch": 14.09, "learning_rate": 2.801634286667223e-06, - "loss": 3.0166, + "loss": 3.2952, "step": 1728000 }, { "epoch": 14.09, - "eval_loss": 3.05212664604187, - "eval_runtime": 135.0648, - "eval_samples_per_second": 764.633, - "eval_steps_per_second": 47.792, + "eval_loss": 3.367438316345215, + "eval_runtime": 115.0009, + "eval_samples_per_second": 898.037, + "eval_steps_per_second": 56.13, "step": 1728000 }, { "epoch": 14.16, - "eval_loss": 3.0538179874420166, - "eval_runtime": 134.2844, - "eval_samples_per_second": 769.076, - "eval_steps_per_second": 48.07, + "eval_loss": 3.3762009143829346, + "eval_runtime": 115.4616, + "eval_samples_per_second": 894.453, + "eval_steps_per_second": 55.906, "step": 1736000 }, { "epoch": 14.22, "learning_rate": 2.7349287084132413e-06, - "loss": 3.0134, + "loss": 3.2866, "step": 1744000 }, { "epoch": 14.22, - "eval_loss": 3.064086437225342, - "eval_runtime": 135.4053, - "eval_samples_per_second": 762.71, - "eval_steps_per_second": 47.672, + "eval_loss": 3.3699355125427246, + "eval_runtime": 114.9346, + "eval_samples_per_second": 898.554, + "eval_steps_per_second": 56.162, "step": 1744000 }, { "epoch": 14.29, - "eval_loss": 3.063884735107422, - "eval_runtime": 134.2537, - "eval_samples_per_second": 769.253, - "eval_steps_per_second": 48.081, + "eval_loss": 3.3690149784088135, + "eval_runtime": 115.9293, + "eval_samples_per_second": 890.845, + "eval_steps_per_second": 55.681, "step": 1752000 }, { "epoch": 14.35, "learning_rate": 2.66822313015926e-06, - "loss": 3.0032, + "loss": 3.2825, "step": 1760000 }, { "epoch": 14.35, - "eval_loss": 3.0587823390960693, - "eval_runtime": 135.0451, - "eval_samples_per_second": 764.745, - "eval_steps_per_second": 47.799, + "eval_loss": 3.365321636199951, + "eval_runtime": 114.9037, + "eval_samples_per_second": 898.796, + "eval_steps_per_second": 56.177, "step": 1760000 }, { "epoch": 14.42, - "eval_loss": 3.064620018005371, - "eval_runtime": 134.9837, - "eval_samples_per_second": 765.092, - "eval_steps_per_second": 47.821, + "eval_loss": 3.368727207183838, + "eval_runtime": 115.3436, + "eval_samples_per_second": 895.369, + "eval_steps_per_second": 55.963, "step": 1768000 }, { "epoch": 14.48, "learning_rate": 2.601517551905278e-06, - "loss": 3.0136, + "loss": 3.2825, "step": 1776000 }, { "epoch": 14.48, - "eval_loss": 3.062889337539673, - "eval_runtime": 134.9119, - "eval_samples_per_second": 765.499, - "eval_steps_per_second": 47.846, + "eval_loss": 3.3617701530456543, + "eval_runtime": 115.7714, + "eval_samples_per_second": 892.06, + "eval_steps_per_second": 55.756, "step": 1776000 }, { "epoch": 14.55, - "eval_loss": 3.0578110218048096, - "eval_runtime": 136.5221, - "eval_samples_per_second": 756.471, - "eval_steps_per_second": 47.282, + "eval_loss": 3.3609282970428467, + "eval_runtime": 114.879, + "eval_samples_per_second": 898.989, + "eval_steps_per_second": 56.19, "step": 1784000 }, { "epoch": 14.61, "learning_rate": 2.5348119736512967e-06, - "loss": 3.0086, + "loss": 3.2744, "step": 1792000 }, { "epoch": 14.61, - "eval_loss": 3.0528934001922607, - "eval_runtime": 135.6145, - "eval_samples_per_second": 761.534, - "eval_steps_per_second": 47.598, + "eval_loss": 3.3552184104919434, + "eval_runtime": 114.6789, + "eval_samples_per_second": 900.558, + "eval_steps_per_second": 56.288, "step": 1792000 }, { "epoch": 14.68, - "eval_loss": 3.0615251064300537, - "eval_runtime": 135.3281, - "eval_samples_per_second": 763.145, - "eval_steps_per_second": 47.699, + "eval_loss": 3.3549087047576904, + "eval_runtime": 116.3921, + "eval_samples_per_second": 887.303, + "eval_steps_per_second": 55.459, "step": 1800000 }, { "epoch": 14.74, "learning_rate": 2.4681063953973154e-06, - "loss": 3.019, + "loss": 3.2811, "step": 1808000 }, { "epoch": 14.74, - "eval_loss": 3.0565857887268066, - "eval_runtime": 134.9377, - "eval_samples_per_second": 765.353, - "eval_steps_per_second": 47.837, + "eval_loss": 3.3504152297973633, + "eval_runtime": 115.0014, + "eval_samples_per_second": 898.032, + "eval_steps_per_second": 56.13, "step": 1808000 }, { "epoch": 14.81, - "eval_loss": 3.0658679008483887, - "eval_runtime": 135.2159, - "eval_samples_per_second": 763.778, - "eval_steps_per_second": 47.738, + "eval_loss": 3.3574647903442383, + "eval_runtime": 115.1236, + "eval_samples_per_second": 897.079, + "eval_steps_per_second": 56.07, "step": 1816000 }, { "epoch": 14.87, "learning_rate": 2.4014008171433335e-06, - "loss": 3.024, + "loss": 3.2672, "step": 1824000 }, { "epoch": 14.87, - "eval_loss": 3.061464786529541, - "eval_runtime": 135.2789, - "eval_samples_per_second": 763.423, - "eval_steps_per_second": 47.716, + "eval_loss": 3.3587796688079834, + "eval_runtime": 116.6416, + "eval_samples_per_second": 885.404, + "eval_steps_per_second": 55.34, "step": 1824000 }, { "epoch": 14.94, - "eval_loss": 3.0530033111572266, - "eval_runtime": 135.9081, - "eval_samples_per_second": 759.889, - "eval_steps_per_second": 47.495, + "eval_loss": 3.3559627532958984, + "eval_runtime": 116.2457, + "eval_samples_per_second": 888.42, + "eval_steps_per_second": 55.529, "step": 1832000 }, { "epoch": 15.0, "learning_rate": 2.334695238889352e-06, - "loss": 3.0089, + "loss": 3.2919, "step": 1840000 }, { "epoch": 15.0, - "eval_loss": 3.0796985626220703, - "eval_runtime": 135.2715, - "eval_samples_per_second": 763.465, - "eval_steps_per_second": 47.719, + "eval_loss": 3.359805107116699, + "eval_runtime": 115.5497, + "eval_samples_per_second": 893.771, + "eval_steps_per_second": 55.863, "step": 1840000 }, { "epoch": 15.07, - "eval_loss": 3.0700411796569824, - "eval_runtime": 136.6273, - "eval_samples_per_second": 755.888, - "eval_steps_per_second": 47.245, + "eval_loss": 3.344524383544922, + "eval_runtime": 115.5133, + "eval_samples_per_second": 894.053, + "eval_steps_per_second": 55.881, "step": 1848000 }, { "epoch": 15.13, "learning_rate": 2.2679896606353707e-06, - "loss": 3.0174, + "loss": 3.2724, "step": 1856000 }, { "epoch": 15.13, - "eval_loss": 3.0748071670532227, - "eval_runtime": 136.44, - "eval_samples_per_second": 756.926, - "eval_steps_per_second": 47.31, + "eval_loss": 3.3516576290130615, + "eval_runtime": 115.2664, + "eval_samples_per_second": 895.968, + "eval_steps_per_second": 56.001, "step": 1856000 }, { "epoch": 15.2, - "eval_loss": 3.064267635345459, - "eval_runtime": 135.3728, - "eval_samples_per_second": 762.894, - "eval_steps_per_second": 47.683, + "eval_loss": 3.359280824661255, + "eval_runtime": 116.0103, + "eval_samples_per_second": 890.223, + "eval_steps_per_second": 55.642, "step": 1864000 }, { "epoch": 15.26, "learning_rate": 2.2012840823813894e-06, - "loss": 3.0176, + "loss": 3.277, "step": 1872000 }, { "epoch": 15.26, - "eval_loss": 3.0627517700195312, - "eval_runtime": 135.5713, - "eval_samples_per_second": 761.776, - "eval_steps_per_second": 47.613, + "eval_loss": 3.3597874641418457, + "eval_runtime": 114.9804, + "eval_samples_per_second": 898.197, + "eval_steps_per_second": 56.14, "step": 1872000 }, { "epoch": 15.33, - "eval_loss": 3.0629563331604004, - "eval_runtime": 135.894, - "eval_samples_per_second": 759.967, - "eval_steps_per_second": 47.5, + "eval_loss": 3.345801591873169, + "eval_runtime": 116.1901, + "eval_samples_per_second": 888.845, + "eval_steps_per_second": 55.555, "step": 1880000 }, { "epoch": 15.39, "learning_rate": 2.134578504127408e-06, - "loss": 3.0164, + "loss": 3.2842, "step": 1888000 }, { "epoch": 15.39, - "eval_loss": 3.0721538066864014, - "eval_runtime": 135.9329, - "eval_samples_per_second": 759.75, - "eval_steps_per_second": 47.487, + "eval_loss": 3.3583106994628906, + "eval_runtime": 114.8266, + "eval_samples_per_second": 899.399, + "eval_steps_per_second": 56.215, "step": 1888000 }, { "epoch": 15.46, - "eval_loss": 3.0744197368621826, - "eval_runtime": 135.4506, - "eval_samples_per_second": 762.455, - "eval_steps_per_second": 47.656, + "eval_loss": 3.3447749614715576, + "eval_runtime": 114.9801, + "eval_samples_per_second": 898.199, + "eval_steps_per_second": 56.14, "step": 1896000 }, { "epoch": 15.53, "learning_rate": 2.067872925873426e-06, - "loss": 3.0302, + "loss": 3.2758, "step": 1904000 }, { "epoch": 15.53, - "eval_loss": 3.0739452838897705, - "eval_runtime": 135.8281, - "eval_samples_per_second": 760.336, - "eval_steps_per_second": 47.523, + "eval_loss": 3.3593051433563232, + "eval_runtime": 114.9092, + "eval_samples_per_second": 898.753, + "eval_steps_per_second": 56.175, "step": 1904000 }, { "epoch": 15.59, - "eval_loss": 3.0700225830078125, - "eval_runtime": 136.0724, - "eval_samples_per_second": 758.971, - "eval_steps_per_second": 47.438, + "eval_loss": 3.3551743030548096, + "eval_runtime": 115.5179, + "eval_samples_per_second": 894.017, + "eval_steps_per_second": 55.879, "step": 1912000 }, { "epoch": 15.66, "learning_rate": 2.0011673476194448e-06, - "loss": 3.0204, + "loss": 3.2684, "step": 1920000 }, { "epoch": 15.66, - "eval_loss": 3.0751476287841797, - "eval_runtime": 136.119, - "eval_samples_per_second": 758.711, - "eval_steps_per_second": 47.422, + "eval_loss": 3.371454954147339, + "eval_runtime": 114.8944, + "eval_samples_per_second": 898.869, + "eval_steps_per_second": 56.182, "step": 1920000 }, { "epoch": 15.72, - "eval_loss": 3.0597870349884033, - "eval_runtime": 136.3427, - "eval_samples_per_second": 757.466, - "eval_steps_per_second": 47.344, + "eval_loss": 3.3543806076049805, + "eval_runtime": 115.4862, + "eval_samples_per_second": 894.263, + "eval_steps_per_second": 55.894, "step": 1928000 }, { "epoch": 15.79, "learning_rate": 1.9344617693654634e-06, - "loss": 3.0147, + "loss": 3.2924, "step": 1936000 }, { "epoch": 15.79, - "eval_loss": 3.0522122383117676, - "eval_runtime": 136.0082, - "eval_samples_per_second": 759.329, - "eval_steps_per_second": 47.46, + "eval_loss": 3.3514981269836426, + "eval_runtime": 115.0356, + "eval_samples_per_second": 897.766, + "eval_steps_per_second": 56.113, "step": 1936000 }, { "epoch": 15.85, - "eval_loss": 3.065509557723999, - "eval_runtime": 136.1421, - "eval_samples_per_second": 758.582, - "eval_steps_per_second": 47.414, + "eval_loss": 3.36460018157959, + "eval_runtime": 115.4242, + "eval_samples_per_second": 894.743, + "eval_steps_per_second": 55.924, "step": 1944000 }, { "epoch": 15.92, "learning_rate": 1.867756191111482e-06, - "loss": 3.0245, + "loss": 3.2673, "step": 1952000 }, { "epoch": 15.92, - "eval_loss": 3.0568597316741943, - "eval_runtime": 136.6776, - "eval_samples_per_second": 755.61, - "eval_steps_per_second": 47.228, + "eval_loss": 3.353806495666504, + "eval_runtime": 115.3905, + "eval_samples_per_second": 895.004, + "eval_steps_per_second": 55.94, "step": 1952000 }, { "epoch": 15.98, - "eval_loss": 3.062300205230713, - "eval_runtime": 136.0258, - "eval_samples_per_second": 759.231, - "eval_steps_per_second": 47.454, + "eval_loss": 3.3436896800994873, + "eval_runtime": 114.7945, + "eval_samples_per_second": 899.651, + "eval_steps_per_second": 56.231, "step": 1960000 }, { "epoch": 16.05, "learning_rate": 1.8010506128575004e-06, - "loss": 3.0069, + "loss": 3.2833, "step": 1968000 }, { "epoch": 16.05, - "eval_loss": 3.059983730316162, - "eval_runtime": 136.4638, - "eval_samples_per_second": 756.794, - "eval_steps_per_second": 47.302, + "eval_loss": 3.3442821502685547, + "eval_runtime": 116.1629, + "eval_samples_per_second": 889.053, + "eval_steps_per_second": 55.569, "step": 1968000 }, { "epoch": 16.11, - "eval_loss": 3.0638678073883057, - "eval_runtime": 137.569, - "eval_samples_per_second": 750.714, - "eval_steps_per_second": 46.922, + "eval_loss": 3.361924886703491, + "eval_runtime": 116.4426, + "eval_samples_per_second": 886.917, + "eval_steps_per_second": 55.435, "step": 1976000 }, { "epoch": 16.18, "learning_rate": 1.734345034603519e-06, - "loss": 3.0068, + "loss": 3.2636, "step": 1984000 }, { "epoch": 16.18, - "eval_loss": 3.077465534210205, - "eval_runtime": 136.0507, - "eval_samples_per_second": 759.092, - "eval_steps_per_second": 47.446, + "eval_loss": 3.3510515689849854, + "eval_runtime": 115.8529, + "eval_samples_per_second": 891.432, + "eval_steps_per_second": 55.717, "step": 1984000 }, { "epoch": 16.24, - "eval_loss": 3.0668864250183105, - "eval_runtime": 136.2552, - "eval_samples_per_second": 757.953, - "eval_steps_per_second": 47.374, + "eval_loss": 3.3447539806365967, + "eval_runtime": 114.926, + "eval_samples_per_second": 898.622, + "eval_steps_per_second": 56.167, "step": 1992000 }, { "epoch": 16.31, "learning_rate": 1.6676394563495374e-06, - "loss": 3.0275, + "loss": 3.2753, "step": 2000000 }, { "epoch": 16.31, - "eval_loss": 3.062725782394409, - "eval_runtime": 136.3436, - "eval_samples_per_second": 757.461, - "eval_steps_per_second": 47.344, + "eval_loss": 3.355980396270752, + "eval_runtime": 115.4649, + "eval_samples_per_second": 894.427, + "eval_steps_per_second": 55.904, "step": 2000000 }, { "epoch": 16.37, - "eval_loss": 3.0644514560699463, - "eval_runtime": 137.752, - "eval_samples_per_second": 749.717, - "eval_steps_per_second": 46.86, + "eval_loss": 3.3524882793426514, + "eval_runtime": 118.2786, + "eval_samples_per_second": 873.151, + "eval_steps_per_second": 54.575, "step": 2008000 }, { "epoch": 16.44, "learning_rate": 1.6009338780955558e-06, - "loss": 3.0164, + "loss": 3.2701, "step": 2016000 }, { "epoch": 16.44, - "eval_loss": 3.0666866302490234, - "eval_runtime": 135.9171, - "eval_samples_per_second": 759.838, - "eval_steps_per_second": 47.492, + "eval_loss": 3.355792760848999, + "eval_runtime": 115.0046, + "eval_samples_per_second": 898.008, + "eval_steps_per_second": 56.128, "step": 2016000 }, { "epoch": 16.5, - "eval_loss": 3.048987627029419, - "eval_runtime": 136.0156, - "eval_samples_per_second": 759.288, - "eval_steps_per_second": 47.458, + "eval_loss": 3.3558590412139893, + "eval_runtime": 115.5093, + "eval_samples_per_second": 894.084, + "eval_steps_per_second": 55.883, "step": 2024000 }, { "epoch": 16.57, "learning_rate": 1.5342282998415744e-06, - "loss": 3.0148, + "loss": 3.2761, "step": 2032000 }, { "epoch": 16.57, - "eval_loss": 3.061800003051758, - "eval_runtime": 137.187, - "eval_samples_per_second": 752.805, - "eval_steps_per_second": 47.053, + "eval_loss": 3.3439648151397705, + "eval_runtime": 114.8803, + "eval_samples_per_second": 898.979, + "eval_steps_per_second": 56.189, "step": 2032000 }, { "epoch": 16.63, - "eval_loss": 3.0544731616973877, - "eval_runtime": 137.5014, - "eval_samples_per_second": 751.083, - "eval_steps_per_second": 46.945, + "eval_loss": 3.3505825996398926, + "eval_runtime": 115.5177, + "eval_samples_per_second": 894.019, + "eval_steps_per_second": 55.879, "step": 2040000 }, { "epoch": 16.7, "learning_rate": 1.4675227215875928e-06, - "loss": 3.022, + "loss": 3.2677, "step": 2048000 }, { "epoch": 16.7, - "eval_loss": 3.0651352405548096, - "eval_runtime": 137.0124, - "eval_samples_per_second": 753.764, - "eval_steps_per_second": 47.113, + "eval_loss": 3.3473587036132812, + "eval_runtime": 115.2604, + "eval_samples_per_second": 896.014, + "eval_steps_per_second": 56.004, "step": 2048000 }, { "epoch": 16.76, - "eval_loss": 3.068650484085083, - "eval_runtime": 137.324, - "eval_samples_per_second": 752.053, - "eval_steps_per_second": 47.006, + "eval_loss": 3.3614845275878906, + "eval_runtime": 114.7851, + "eval_samples_per_second": 899.724, + "eval_steps_per_second": 56.236, "step": 2056000 }, { "epoch": 16.83, "learning_rate": 1.4008171433336116e-06, - "loss": 3.0235, + "loss": 3.2614, "step": 2064000 }, { "epoch": 16.83, - "eval_loss": 3.0515873432159424, - "eval_runtime": 137.8405, - "eval_samples_per_second": 749.235, - "eval_steps_per_second": 46.829, + "eval_loss": 3.350660562515259, + "eval_runtime": 116.1258, + "eval_samples_per_second": 889.337, + "eval_steps_per_second": 55.586, "step": 2064000 }, { "epoch": 16.89, - "eval_loss": 3.0761473178863525, - "eval_runtime": 137.435, - "eval_samples_per_second": 751.446, - "eval_steps_per_second": 46.968, + "eval_loss": 3.34436297416687, + "eval_runtime": 114.7641, + "eval_samples_per_second": 899.89, + "eval_steps_per_second": 56.246, "step": 2072000 }, { "epoch": 16.96, "learning_rate": 1.33411156507963e-06, - "loss": 3.0194, + "loss": 3.2608, "step": 2080000 }, { "epoch": 16.96, - "eval_loss": 3.0807414054870605, - "eval_runtime": 136.8928, - "eval_samples_per_second": 754.423, - "eval_steps_per_second": 47.154, + "eval_loss": 3.352665901184082, + "eval_runtime": 114.9595, + "eval_samples_per_second": 898.36, + "eval_steps_per_second": 56.15, "step": 2080000 }, { "epoch": 17.03, - "eval_loss": 3.060075283050537, - "eval_runtime": 136.6441, - "eval_samples_per_second": 755.796, - "eval_steps_per_second": 47.24, + "eval_loss": 3.3398256301879883, + "eval_runtime": 114.8716, + "eval_samples_per_second": 899.047, + "eval_steps_per_second": 56.193, "step": 2088000 }, { "epoch": 17.09, "learning_rate": 1.2674059868256484e-06, - "loss": 3.0142, + "loss": 3.2643, "step": 2096000 }, { "epoch": 17.09, - "eval_loss": 3.0721395015716553, - "eval_runtime": 136.5201, - "eval_samples_per_second": 756.482, - "eval_steps_per_second": 47.282, + "eval_loss": 3.3497581481933594, + "eval_runtime": 115.3741, + "eval_samples_per_second": 895.132, + "eval_steps_per_second": 55.948, "step": 2096000 }, { "epoch": 17.16, - "eval_loss": 3.0653316974639893, - "eval_runtime": 138.2812, - "eval_samples_per_second": 746.848, - "eval_steps_per_second": 46.68, + "eval_loss": 3.3348639011383057, + "eval_runtime": 114.8223, + "eval_samples_per_second": 899.434, + "eval_steps_per_second": 56.217, "step": 2104000 }, { "epoch": 17.22, "learning_rate": 1.2007004085716668e-06, - "loss": 3.0183, + "loss": 3.2721, "step": 2112000 }, { "epoch": 17.22, - "eval_loss": 3.061683416366577, - "eval_runtime": 136.6654, - "eval_samples_per_second": 755.678, - "eval_steps_per_second": 47.232, + "eval_loss": 3.356008291244507, + "eval_runtime": 115.5116, + "eval_samples_per_second": 894.066, + "eval_steps_per_second": 55.882, "step": 2112000 }, { "epoch": 17.29, - "eval_loss": 3.062178373336792, - "eval_runtime": 137.9621, - "eval_samples_per_second": 748.575, - "eval_steps_per_second": 46.788, + "eval_loss": 3.3421435356140137, + "eval_runtime": 115.5912, + "eval_samples_per_second": 893.45, + "eval_steps_per_second": 55.843, "step": 2120000 }, { "epoch": 17.35, "learning_rate": 1.1339948303176854e-06, - "loss": 3.0092, + "loss": 3.266, "step": 2128000 }, { "epoch": 17.35, - "eval_loss": 3.068242311477661, - "eval_runtime": 137.4752, - "eval_samples_per_second": 751.227, - "eval_steps_per_second": 46.954, + "eval_loss": 3.342872142791748, + "eval_runtime": 115.0319, + "eval_samples_per_second": 897.794, + "eval_steps_per_second": 56.115, "step": 2128000 }, { "epoch": 17.42, - "eval_loss": 3.073157787322998, - "eval_runtime": 136.5003, - "eval_samples_per_second": 756.592, - "eval_steps_per_second": 47.289, + "eval_loss": 3.337078809738159, + "eval_runtime": 114.7057, + "eval_samples_per_second": 900.347, + "eval_steps_per_second": 56.274, "step": 2136000 }, { "epoch": 17.48, "learning_rate": 1.067289252063704e-06, - "loss": 3.0071, + "loss": 3.2551, "step": 2144000 }, { "epoch": 17.48, - "eval_loss": 3.0763022899627686, - "eval_runtime": 137.95, - "eval_samples_per_second": 748.641, - "eval_steps_per_second": 46.792, + "eval_loss": 3.340388774871826, + "eval_runtime": 115.5719, + "eval_samples_per_second": 893.599, + "eval_steps_per_second": 55.853, "step": 2144000 }, { "epoch": 17.55, - "eval_loss": 3.0674524307250977, - "eval_runtime": 137.3106, - "eval_samples_per_second": 752.127, - "eval_steps_per_second": 47.01, + "eval_loss": 3.349374771118164, + "eval_runtime": 116.2218, + "eval_samples_per_second": 888.603, + "eval_steps_per_second": 55.54, "step": 2152000 }, { "epoch": 17.61, "learning_rate": 1.0005836738097224e-06, - "loss": 3.0272, + "loss": 3.26, "step": 2160000 }, { "epoch": 17.61, - "eval_loss": 3.0671498775482178, - "eval_runtime": 138.0717, - "eval_samples_per_second": 747.981, - "eval_steps_per_second": 46.751, + "eval_loss": 3.3389031887054443, + "eval_runtime": 115.0165, + "eval_samples_per_second": 897.915, + "eval_steps_per_second": 56.122, "step": 2160000 }, { "epoch": 17.68, - "eval_loss": 3.062239170074463, - "eval_runtime": 138.0499, - "eval_samples_per_second": 748.099, - "eval_steps_per_second": 46.758, + "eval_loss": 3.345613718032837, + "eval_runtime": 114.2481, + "eval_samples_per_second": 903.954, + "eval_steps_per_second": 56.5, "step": 2168000 }, { "epoch": 17.74, "learning_rate": 9.33878095555741e-07, - "loss": 3.0235, + "loss": 3.2528, "step": 2176000 }, { "epoch": 17.74, - "eval_loss": 3.0789263248443604, - "eval_runtime": 137.5626, - "eval_samples_per_second": 750.749, - "eval_steps_per_second": 46.924, + "eval_loss": 3.3248987197875977, + "eval_runtime": 115.0558, + "eval_samples_per_second": 897.608, + "eval_steps_per_second": 56.103, "step": 2176000 }, { "epoch": 17.81, - "eval_loss": 3.062295436859131, - "eval_runtime": 138.8694, - "eval_samples_per_second": 743.684, - "eval_steps_per_second": 46.483, + "eval_loss": 3.3452157974243164, + "eval_runtime": 116.2164, + "eval_samples_per_second": 888.644, + "eval_steps_per_second": 55.543, "step": 2184000 }, { "epoch": 17.87, "learning_rate": 8.671725173017595e-07, - "loss": 3.0179, + "loss": 3.2602, "step": 2192000 }, { "epoch": 17.87, - "eval_loss": 3.078376054763794, - "eval_runtime": 136.985, - "eval_samples_per_second": 753.914, - "eval_steps_per_second": 47.122, + "eval_loss": 3.33760929107666, + "eval_runtime": 116.1157, + "eval_samples_per_second": 889.414, + "eval_steps_per_second": 55.591, "step": 2192000 }, { "epoch": 17.94, - "eval_loss": 3.062905788421631, - "eval_runtime": 137.7472, - "eval_samples_per_second": 749.743, - "eval_steps_per_second": 46.861, + "eval_loss": 3.351128101348877, + "eval_runtime": 114.6575, + "eval_samples_per_second": 900.726, + "eval_steps_per_second": 56.298, "step": 2200000 }, { "epoch": 18.0, "learning_rate": 8.004669390477779e-07, - "loss": 3.0209, + "loss": 3.2492, "step": 2208000 }, { "epoch": 18.0, - "eval_loss": 3.0731070041656494, - "eval_runtime": 138.0906, - "eval_samples_per_second": 747.878, - "eval_steps_per_second": 46.745, + "eval_loss": 3.347473621368408, + "eval_runtime": 115.2092, + "eval_samples_per_second": 896.413, + "eval_steps_per_second": 56.029, "step": 2208000 }, { "epoch": 18.07, - "eval_loss": 3.0945563316345215, - "eval_runtime": 137.4959, - "eval_samples_per_second": 751.113, - "eval_steps_per_second": 46.947, + "eval_loss": 3.349674940109253, + "eval_runtime": 115.6497, + "eval_samples_per_second": 892.998, + "eval_steps_per_second": 55.815, "step": 2216000 }, { "epoch": 18.13, "learning_rate": 7.337613607937964e-07, - "loss": 3.0237, + "loss": 3.2469, "step": 2224000 }, { "epoch": 18.13, - "eval_loss": 3.065315008163452, - "eval_runtime": 138.0159, - "eval_samples_per_second": 748.283, - "eval_steps_per_second": 46.77, + "eval_loss": 3.3378491401672363, + "eval_runtime": 114.9296, + "eval_samples_per_second": 898.594, + "eval_steps_per_second": 56.165, "step": 2224000 }, { "epoch": 18.2, - "eval_loss": 3.0589962005615234, - "eval_runtime": 137.6387, - "eval_samples_per_second": 750.334, - "eval_steps_per_second": 46.898, + "eval_loss": 3.332571029663086, + "eval_runtime": 115.4244, + "eval_samples_per_second": 894.742, + "eval_steps_per_second": 55.924, "step": 2232000 }, { "epoch": 18.26, "learning_rate": 6.67055782539815e-07, - "loss": 3.0164, + "loss": 3.2589, "step": 2240000 }, { "epoch": 18.26, - "eval_loss": 3.070741891860962, - "eval_runtime": 138.4523, - "eval_samples_per_second": 745.925, - "eval_steps_per_second": 46.623, + "eval_loss": 3.3277342319488525, + "eval_runtime": 114.9762, + "eval_samples_per_second": 898.229, + "eval_steps_per_second": 56.142, "step": 2240000 }, { "epoch": 18.33, - "eval_loss": 3.0545763969421387, - "eval_runtime": 138.1194, - "eval_samples_per_second": 747.723, - "eval_steps_per_second": 46.735, + "eval_loss": 3.3456978797912598, + "eval_runtime": 116.0675, + "eval_samples_per_second": 889.784, + "eval_steps_per_second": 55.614, "step": 2248000 }, { "epoch": 18.4, "learning_rate": 6.003502042858334e-07, - "loss": 3.0206, + "loss": 3.2548, "step": 2256000 }, { "epoch": 18.4, - "eval_loss": 3.0741806030273438, - "eval_runtime": 138.8634, - "eval_samples_per_second": 743.717, - "eval_steps_per_second": 46.485, + "eval_loss": 3.334270715713501, + "eval_runtime": 115.7666, + "eval_samples_per_second": 892.097, + "eval_steps_per_second": 55.759, "step": 2256000 }, { "epoch": 18.46, - "eval_loss": 3.0793333053588867, - "eval_runtime": 138.6181, - "eval_samples_per_second": 745.032, - "eval_steps_per_second": 46.567, + "eval_loss": 3.3362197875976562, + "eval_runtime": 115.5031, + "eval_samples_per_second": 894.132, + "eval_steps_per_second": 55.886, "step": 2264000 }, { "epoch": 18.53, "learning_rate": 5.33644626031852e-07, - "loss": 3.0138, + "loss": 3.2589, "step": 2272000 }, { "epoch": 18.53, - "eval_loss": 3.05604887008667, - "eval_runtime": 139.1325, - "eval_samples_per_second": 742.278, - "eval_steps_per_second": 46.395, + "eval_loss": 3.343080997467041, + "eval_runtime": 115.3187, + "eval_samples_per_second": 895.561, + "eval_steps_per_second": 55.975, "step": 2272000 }, { "epoch": 18.59, - "eval_loss": 3.086977958679199, - "eval_runtime": 137.8163, - "eval_samples_per_second": 749.367, - "eval_steps_per_second": 46.838, + "eval_loss": 3.3428003787994385, + "eval_runtime": 115.3186, + "eval_samples_per_second": 895.563, + "eval_steps_per_second": 55.975, "step": 2280000 }, { "epoch": 18.66, "learning_rate": 4.669390477778705e-07, - "loss": 3.0377, + "loss": 3.2674, "step": 2288000 }, { "epoch": 18.66, - "eval_loss": 3.07423996925354, - "eval_runtime": 137.4738, - "eval_samples_per_second": 751.234, - "eval_steps_per_second": 46.954, + "eval_loss": 3.3400795459747314, + "eval_runtime": 114.7905, + "eval_samples_per_second": 899.682, + "eval_steps_per_second": 56.233, "step": 2288000 }, { "epoch": 18.72, - "eval_loss": 3.0675508975982666, - "eval_runtime": 138.0596, - "eval_samples_per_second": 748.046, - "eval_steps_per_second": 46.755, + "eval_loss": 3.337498903274536, + "eval_runtime": 114.9489, + "eval_samples_per_second": 898.443, + "eval_steps_per_second": 56.155, "step": 2296000 }, { "epoch": 18.79, "learning_rate": 4.0023346952388894e-07, - "loss": 3.0227, + "loss": 3.2561, "step": 2304000 }, { "epoch": 18.79, - "eval_loss": 3.06254506111145, - "eval_runtime": 139.3504, - "eval_samples_per_second": 741.117, - "eval_steps_per_second": 46.322, + "eval_loss": 3.3333868980407715, + "eval_runtime": 114.8393, + "eval_samples_per_second": 899.3, + "eval_steps_per_second": 56.209, "step": 2304000 }, { "epoch": 18.85, - "eval_loss": 3.0736207962036133, - "eval_runtime": 139.5433, - "eval_samples_per_second": 740.093, - "eval_steps_per_second": 46.258, + "eval_loss": 3.3320717811584473, + "eval_runtime": 115.0159, + "eval_samples_per_second": 897.919, + "eval_steps_per_second": 56.123, "step": 2312000 }, { "epoch": 18.92, "learning_rate": 3.335278912699075e-07, - "loss": 3.0359, + "loss": 3.2452, "step": 2320000 }, { "epoch": 18.92, - "eval_loss": 3.0800607204437256, - "eval_runtime": 138.2846, - "eval_samples_per_second": 746.829, - "eval_steps_per_second": 46.679, + "eval_loss": 3.3445632457733154, + "eval_runtime": 114.9617, + "eval_samples_per_second": 898.342, + "eval_steps_per_second": 56.149, "step": 2320000 }, { "epoch": 18.98, - "eval_loss": 3.0709972381591797, - "eval_runtime": 139.0505, - "eval_samples_per_second": 742.716, - "eval_steps_per_second": 46.422, + "eval_loss": 3.3525032997131348, + "eval_runtime": 116.2145, + "eval_samples_per_second": 888.659, + "eval_steps_per_second": 55.544, "step": 2328000 }, { "epoch": 19.05, "learning_rate": 2.66822313015926e-07, - "loss": 3.0248, + "loss": 3.259, "step": 2336000 }, { "epoch": 19.05, - "eval_loss": 3.069218158721924, - "eval_runtime": 138.9779, - "eval_samples_per_second": 743.104, - "eval_steps_per_second": 46.446, + "eval_loss": 3.331772804260254, + "eval_runtime": 115.4929, + "eval_samples_per_second": 894.211, + "eval_steps_per_second": 55.891, "step": 2336000 }, { "epoch": 19.11, - "eval_loss": 3.067660331726074, - "eval_runtime": 138.4099, - "eval_samples_per_second": 746.154, - "eval_steps_per_second": 46.637, + "eval_loss": 3.3451852798461914, + "eval_runtime": 115.1546, + "eval_samples_per_second": 896.838, + "eval_steps_per_second": 56.055, "step": 2344000 }, { "epoch": 19.18, "learning_rate": 2.0011673476194447e-07, - "loss": 3.0235, + "loss": 3.2494, "step": 2352000 }, { "epoch": 19.18, - "eval_loss": 3.089552879333496, - "eval_runtime": 138.573, - "eval_samples_per_second": 745.275, - "eval_steps_per_second": 46.582, + "eval_loss": 3.335479497909546, + "eval_runtime": 114.4583, + "eval_samples_per_second": 902.293, + "eval_steps_per_second": 56.396, "step": 2352000 }, { "epoch": 19.24, - "eval_loss": 3.0777699947357178, - "eval_runtime": 140.4362, - "eval_samples_per_second": 735.388, - "eval_steps_per_second": 45.964, + "eval_loss": 3.3322434425354004, + "eval_runtime": 116.1476, + "eval_samples_per_second": 889.17, + "eval_steps_per_second": 55.576, "step": 2360000 }, { "epoch": 19.31, "learning_rate": 1.33411156507963e-07, - "loss": 3.0187, + "loss": 3.2558, "step": 2368000 }, { "epoch": 19.31, - "eval_loss": 3.069951295852661, - "eval_runtime": 140.0319, - "eval_samples_per_second": 737.511, - "eval_steps_per_second": 46.097, + "eval_loss": 3.325453281402588, + "eval_runtime": 114.8662, + "eval_samples_per_second": 899.089, + "eval_steps_per_second": 56.196, "step": 2368000 }, { "epoch": 19.37, - "eval_loss": 3.0742506980895996, - "eval_runtime": 139.3192, - "eval_samples_per_second": 741.283, - "eval_steps_per_second": 46.332, + "eval_loss": 3.3329989910125732, + "eval_runtime": 117.9929, + "eval_samples_per_second": 875.265, + "eval_steps_per_second": 54.707, "step": 2376000 }, { "epoch": 19.44, "learning_rate": 6.67055782539815e-08, - "loss": 3.0189, + "loss": 3.2436, "step": 2384000 }, { "epoch": 19.44, - "eval_loss": 3.0780065059661865, - "eval_runtime": 138.4114, - "eval_samples_per_second": 746.145, - "eval_steps_per_second": 46.636, + "eval_loss": 3.3357789516448975, + "eval_runtime": 117.7235, + "eval_samples_per_second": 877.268, + "eval_steps_per_second": 54.832, "step": 2384000 }, { "epoch": 19.5, - "eval_loss": 3.0866599082946777, - "eval_runtime": 138.3665, - "eval_samples_per_second": 746.387, - "eval_steps_per_second": 46.651, + "eval_loss": 3.3287487030029297, + "eval_runtime": 115.6745, + "eval_samples_per_second": 892.807, + "eval_steps_per_second": 55.803, "step": 2392000 }, { "epoch": 19.57, "learning_rate": 0.0, - "loss": 3.0184, + "loss": 3.2545, "step": 2400000 }, { "epoch": 19.57, - "eval_loss": 3.079288959503174, - "eval_runtime": 138.4519, - "eval_samples_per_second": 745.927, - "eval_steps_per_second": 46.623, + "eval_loss": 3.3321266174316406, + "eval_runtime": 115.8716, + "eval_samples_per_second": 891.289, + "eval_steps_per_second": 55.708, "step": 2400000 }, { "epoch": 19.57, "step": 2400000, - "total_flos": 7.178820925216543e+17, - "train_loss": 2.9400340771484377, - "train_runtime": 198144.865, - "train_samples_per_second": 193.798, - "train_steps_per_second": 12.112 + "total_flos": 6.9600759359113e+17, + "train_loss": 3.268406458333333, + "train_runtime": 194422.9949, + "train_samples_per_second": 197.508, + "train_steps_per_second": 12.344 } ], "logging_steps": 16000, "max_steps": 2400000, "num_train_epochs": 20, "save_steps": 32000, - "total_flos": 7.178820925216543e+17, + "total_flos": 6.9600759359113e+17, "trial_name": null, "trial_params": null }