{ "best_metric": 0.9769449234008789, "best_model_checkpoint": "./checkpoints/meta-baseline-t5-small/checkpoint-70000", "epoch": 8.968516798513916, "global_step": 70000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03, "learning_rate": 0.001, "loss": 2.3828, "step": 200 }, { "epoch": 0.05, "learning_rate": 0.001, "loss": 1.6006, "step": 400 }, { "epoch": 0.08, "learning_rate": 0.001, "loss": 1.4131, "step": 600 }, { "epoch": 0.1, "learning_rate": 0.001, "loss": 1.3124, "step": 800 }, { "epoch": 0.13, "learning_rate": 0.001, "loss": 1.2434, "step": 1000 }, { "epoch": 0.15, "learning_rate": 0.001, "loss": 1.1786, "step": 1200 }, { "epoch": 0.18, "learning_rate": 0.001, "loss": 1.1193, "step": 1400 }, { "epoch": 0.2, "learning_rate": 0.001, "loss": 1.1045, "step": 1600 }, { "epoch": 0.23, "learning_rate": 0.001, "loss": 1.0873, "step": 1800 }, { "epoch": 0.26, "learning_rate": 0.001, "loss": 1.0495, "step": 2000 }, { "epoch": 0.28, "learning_rate": 0.001, "loss": 1.0346, "step": 2200 }, { "epoch": 0.31, "learning_rate": 0.001, "loss": 1.0245, "step": 2400 }, { "epoch": 0.33, "learning_rate": 0.001, "loss": 0.9973, "step": 2600 }, { "epoch": 0.36, "learning_rate": 0.001, "loss": 0.9785, "step": 2800 }, { "epoch": 0.38, "learning_rate": 0.001, "loss": 0.9696, "step": 3000 }, { "epoch": 0.41, "learning_rate": 0.001, "loss": 0.9684, "step": 3200 }, { "epoch": 0.44, "learning_rate": 0.001, "loss": 0.9425, "step": 3400 }, { "epoch": 0.46, "learning_rate": 0.001, "loss": 0.9437, "step": 3600 }, { "epoch": 0.49, "learning_rate": 0.001, "loss": 0.9376, "step": 3800 }, { "epoch": 0.51, "learning_rate": 0.001, "loss": 0.9247, "step": 4000 }, { "epoch": 0.54, "learning_rate": 0.001, "loss": 0.9127, "step": 4200 }, { "epoch": 0.56, "learning_rate": 0.001, "loss": 0.9133, "step": 4400 }, { "epoch": 0.59, "learning_rate": 0.001, "loss": 0.9021, "step": 4600 }, { "epoch": 0.61, "learning_rate": 0.001, "loss": 0.9021, "step": 4800 }, { "epoch": 0.64, "learning_rate": 0.001, "loss": 0.8922, "step": 5000 }, { "epoch": 0.64, "eval_loss": 1.1090599298477173, "eval_runtime": 226.6186, "eval_samples_per_second": 539.567, "eval_steps_per_second": 2.109, "step": 5000 }, { "epoch": 0.67, "learning_rate": 0.001, "loss": 0.884, "step": 5200 }, { "epoch": 0.69, "learning_rate": 0.001, "loss": 0.8681, "step": 5400 }, { "epoch": 0.72, "learning_rate": 0.001, "loss": 0.8732, "step": 5600 }, { "epoch": 0.74, "learning_rate": 0.001, "loss": 0.869, "step": 5800 }, { "epoch": 0.77, "learning_rate": 0.001, "loss": 0.8467, "step": 6000 }, { "epoch": 0.79, "learning_rate": 0.001, "loss": 0.8578, "step": 6200 }, { "epoch": 0.82, "learning_rate": 0.001, "loss": 0.8621, "step": 6400 }, { "epoch": 0.85, "learning_rate": 0.001, "loss": 0.8484, "step": 6600 }, { "epoch": 0.87, "learning_rate": 0.001, "loss": 0.8449, "step": 6800 }, { "epoch": 0.9, "learning_rate": 0.001, "loss": 0.8432, "step": 7000 }, { "epoch": 0.92, "learning_rate": 0.001, "loss": 0.8481, "step": 7200 }, { "epoch": 0.95, "learning_rate": 0.001, "loss": 0.8388, "step": 7400 }, { "epoch": 0.97, "learning_rate": 0.001, "loss": 0.8402, "step": 7600 }, { "epoch": 1.0, "learning_rate": 0.001, "loss": 0.8175, "step": 7800 }, { "epoch": 1.02, "learning_rate": 0.001, "loss": 0.7903, "step": 8000 }, { "epoch": 1.05, "learning_rate": 0.001, "loss": 0.7977, "step": 8200 }, { "epoch": 1.08, "learning_rate": 0.001, "loss": 0.8082, "step": 8400 }, { "epoch": 1.1, "learning_rate": 0.001, "loss": 0.7975, "step": 8600 }, { "epoch": 1.13, "learning_rate": 0.001, "loss": 0.8046, "step": 8800 }, { "epoch": 1.15, "learning_rate": 0.001, "loss": 0.7912, "step": 9000 }, { "epoch": 1.18, "learning_rate": 0.001, "loss": 0.7875, "step": 9200 }, { "epoch": 1.2, "learning_rate": 0.001, "loss": 0.7894, "step": 9400 }, { "epoch": 1.23, "learning_rate": 0.001, "loss": 0.7886, "step": 9600 }, { "epoch": 1.26, "learning_rate": 0.001, "loss": 0.7847, "step": 9800 }, { "epoch": 1.28, "learning_rate": 0.001, "loss": 0.7906, "step": 10000 }, { "epoch": 1.28, "eval_loss": 1.0378576517105103, "eval_runtime": 226.2892, "eval_samples_per_second": 540.353, "eval_steps_per_second": 2.112, "step": 10000 }, { "epoch": 1.31, "learning_rate": 0.001, "loss": 0.7796, "step": 10200 }, { "epoch": 1.33, "learning_rate": 0.001, "loss": 0.7873, "step": 10400 }, { "epoch": 1.36, "learning_rate": 0.001, "loss": 0.7796, "step": 10600 }, { "epoch": 1.38, "learning_rate": 0.001, "loss": 0.7661, "step": 10800 }, { "epoch": 1.41, "learning_rate": 0.001, "loss": 0.7901, "step": 11000 }, { "epoch": 1.43, "learning_rate": 0.001, "loss": 0.7852, "step": 11200 }, { "epoch": 1.46, "learning_rate": 0.001, "loss": 0.7722, "step": 11400 }, { "epoch": 1.49, "learning_rate": 0.001, "loss": 0.7879, "step": 11600 }, { "epoch": 1.51, "learning_rate": 0.001, "loss": 0.7616, "step": 11800 }, { "epoch": 1.54, "learning_rate": 0.001, "loss": 0.7645, "step": 12000 }, { "epoch": 1.56, "learning_rate": 0.001, "loss": 0.7783, "step": 12200 }, { "epoch": 1.59, "learning_rate": 0.001, "loss": 0.7725, "step": 12400 }, { "epoch": 1.61, "learning_rate": 0.001, "loss": 0.7664, "step": 12600 }, { "epoch": 1.64, "learning_rate": 0.001, "loss": 0.7632, "step": 12800 }, { "epoch": 1.67, "learning_rate": 0.001, "loss": 0.7702, "step": 13000 }, { "epoch": 1.69, "learning_rate": 0.001, "loss": 0.7695, "step": 13200 }, { "epoch": 1.72, "learning_rate": 0.001, "loss": 0.7498, "step": 13400 }, { "epoch": 1.74, "learning_rate": 0.001, "loss": 0.7583, "step": 13600 }, { "epoch": 1.77, "learning_rate": 0.001, "loss": 0.7556, "step": 13800 }, { "epoch": 1.79, "learning_rate": 0.001, "loss": 0.7401, "step": 14000 }, { "epoch": 1.82, "learning_rate": 0.001, "loss": 0.7408, "step": 14200 }, { "epoch": 1.84, "learning_rate": 0.001, "loss": 0.7474, "step": 14400 }, { "epoch": 1.87, "learning_rate": 0.001, "loss": 0.7509, "step": 14600 }, { "epoch": 1.9, "learning_rate": 0.001, "loss": 0.7588, "step": 14800 }, { "epoch": 1.92, "learning_rate": 0.001, "loss": 0.7486, "step": 15000 }, { "epoch": 1.92, "eval_loss": 1.012812852859497, "eval_runtime": 226.2966, "eval_samples_per_second": 540.335, "eval_steps_per_second": 2.112, "step": 15000 }, { "epoch": 1.95, "learning_rate": 0.001, "loss": 0.7529, "step": 15200 }, { "epoch": 1.97, "learning_rate": 0.001, "loss": 0.7405, "step": 15400 }, { "epoch": 2.0, "learning_rate": 0.001, "loss": 0.7443, "step": 15600 }, { "epoch": 2.02, "learning_rate": 0.001, "loss": 0.7358, "step": 15800 }, { "epoch": 2.05, "learning_rate": 0.001, "loss": 0.7161, "step": 16000 }, { "epoch": 2.08, "learning_rate": 0.001, "loss": 0.7292, "step": 16200 }, { "epoch": 2.1, "learning_rate": 0.001, "loss": 0.7187, "step": 16400 }, { "epoch": 2.13, "learning_rate": 0.001, "loss": 0.7228, "step": 16600 }, { "epoch": 2.15, "learning_rate": 0.001, "loss": 0.7212, "step": 16800 }, { "epoch": 2.18, "learning_rate": 0.001, "loss": 0.7183, "step": 17000 }, { "epoch": 2.2, "learning_rate": 0.001, "loss": 0.7293, "step": 17200 }, { "epoch": 2.23, "learning_rate": 0.001, "loss": 0.7164, "step": 17400 }, { "epoch": 2.25, "learning_rate": 0.001, "loss": 0.7169, "step": 17600 }, { "epoch": 2.28, "learning_rate": 0.001, "loss": 0.7262, "step": 17800 }, { "epoch": 2.31, "learning_rate": 0.001, "loss": 0.7206, "step": 18000 }, { "epoch": 2.33, "learning_rate": 0.001, "loss": 0.7075, "step": 18200 }, { "epoch": 2.36, "learning_rate": 0.001, "loss": 0.7111, "step": 18400 }, { "epoch": 2.38, "learning_rate": 0.001, "loss": 0.708, "step": 18600 }, { "epoch": 2.41, "learning_rate": 0.001, "loss": 0.7205, "step": 18800 }, { "epoch": 2.43, "learning_rate": 0.001, "loss": 0.716, "step": 19000 }, { "epoch": 2.46, "learning_rate": 0.001, "loss": 0.7092, "step": 19200 }, { "epoch": 2.49, "learning_rate": 0.001, "loss": 0.7145, "step": 19400 }, { "epoch": 2.51, "learning_rate": 0.001, "loss": 0.7079, "step": 19600 }, { "epoch": 2.54, "learning_rate": 0.001, "loss": 0.7091, "step": 19800 }, { "epoch": 2.56, "learning_rate": 0.001, "loss": 0.7069, "step": 20000 }, { "epoch": 2.56, "eval_loss": 1.0125058889389038, "eval_runtime": 226.3642, "eval_samples_per_second": 540.174, "eval_steps_per_second": 2.112, "step": 20000 }, { "epoch": 2.59, "learning_rate": 0.001, "loss": 0.7227, "step": 20200 }, { "epoch": 2.61, "learning_rate": 0.001, "loss": 0.7131, "step": 20400 }, { "epoch": 2.64, "learning_rate": 0.001, "loss": 0.718, "step": 20600 }, { "epoch": 2.66, "learning_rate": 0.001, "loss": 0.7079, "step": 20800 }, { "epoch": 2.69, "learning_rate": 0.001, "loss": 0.7029, "step": 21000 }, { "epoch": 2.72, "learning_rate": 0.001, "loss": 0.7052, "step": 21200 }, { "epoch": 2.74, "learning_rate": 0.001, "loss": 0.7023, "step": 21400 }, { "epoch": 2.77, "learning_rate": 0.001, "loss": 0.6946, "step": 21600 }, { "epoch": 2.79, "learning_rate": 0.001, "loss": 0.7007, "step": 21800 }, { "epoch": 2.82, "learning_rate": 0.001, "loss": 0.7065, "step": 22000 }, { "epoch": 2.84, "learning_rate": 0.001, "loss": 0.7129, "step": 22200 }, { "epoch": 2.87, "learning_rate": 0.001, "loss": 0.7115, "step": 22400 }, { "epoch": 2.9, "learning_rate": 0.001, "loss": 0.7002, "step": 22600 }, { "epoch": 2.92, "learning_rate": 0.001, "loss": 0.7048, "step": 22800 }, { "epoch": 2.95, "learning_rate": 0.001, "loss": 0.7051, "step": 23000 }, { "epoch": 2.97, "learning_rate": 0.001, "loss": 0.6826, "step": 23200 }, { "epoch": 3.0, "learning_rate": 0.001, "loss": 0.707, "step": 23400 }, { "epoch": 3.02, "learning_rate": 0.001, "loss": 0.6763, "step": 23600 }, { "epoch": 3.05, "learning_rate": 0.001, "loss": 0.6645, "step": 23800 }, { "epoch": 3.07, "learning_rate": 0.001, "loss": 0.6786, "step": 24000 }, { "epoch": 3.1, "learning_rate": 0.001, "loss": 0.6786, "step": 24200 }, { "epoch": 3.13, "learning_rate": 0.001, "loss": 0.6867, "step": 24400 }, { "epoch": 3.15, "learning_rate": 0.001, "loss": 0.6748, "step": 24600 }, { "epoch": 3.18, "learning_rate": 0.001, "loss": 0.6754, "step": 24800 }, { "epoch": 3.2, "learning_rate": 0.001, "loss": 0.6766, "step": 25000 }, { "epoch": 3.2, "eval_loss": 1.0152596235275269, "eval_runtime": 226.4668, "eval_samples_per_second": 539.929, "eval_steps_per_second": 2.111, "step": 25000 }, { "epoch": 3.23, "learning_rate": 0.001, "loss": 0.6852, "step": 25200 }, { "epoch": 3.25, "learning_rate": 0.001, "loss": 0.6736, "step": 25400 }, { "epoch": 3.28, "learning_rate": 0.001, "loss": 0.6859, "step": 25600 }, { "epoch": 3.31, "learning_rate": 0.001, "loss": 0.6754, "step": 25800 }, { "epoch": 3.33, "learning_rate": 0.001, "loss": 0.6789, "step": 26000 }, { "epoch": 3.36, "learning_rate": 0.001, "loss": 0.6725, "step": 26200 }, { "epoch": 3.38, "learning_rate": 0.001, "loss": 0.6826, "step": 26400 }, { "epoch": 3.41, "learning_rate": 0.001, "loss": 0.6852, "step": 26600 }, { "epoch": 3.43, "learning_rate": 0.001, "loss": 0.6767, "step": 26800 }, { "epoch": 3.46, "learning_rate": 0.001, "loss": 0.6584, "step": 27000 }, { "epoch": 3.48, "learning_rate": 0.001, "loss": 0.6642, "step": 27200 }, { "epoch": 3.51, "learning_rate": 0.001, "loss": 0.6732, "step": 27400 }, { "epoch": 3.54, "learning_rate": 0.001, "loss": 0.6832, "step": 27600 }, { "epoch": 3.56, "learning_rate": 0.001, "loss": 0.6746, "step": 27800 }, { "epoch": 3.59, "learning_rate": 0.001, "loss": 0.6803, "step": 28000 }, { "epoch": 3.61, "learning_rate": 0.001, "loss": 0.6753, "step": 28200 }, { "epoch": 3.64, "learning_rate": 0.001, "loss": 0.6901, "step": 28400 }, { "epoch": 3.66, "learning_rate": 0.001, "loss": 0.6717, "step": 28600 }, { "epoch": 3.69, "learning_rate": 0.001, "loss": 0.6761, "step": 28800 }, { "epoch": 3.72, "learning_rate": 0.001, "loss": 0.6698, "step": 29000 }, { "epoch": 3.74, "learning_rate": 0.001, "loss": 0.6737, "step": 29200 }, { "epoch": 3.77, "learning_rate": 0.001, "loss": 0.6832, "step": 29400 }, { "epoch": 3.79, "learning_rate": 0.001, "loss": 0.6699, "step": 29600 }, { "epoch": 3.82, "learning_rate": 0.001, "loss": 0.6686, "step": 29800 }, { "epoch": 3.84, "learning_rate": 0.001, "loss": 0.6624, "step": 30000 }, { "epoch": 3.84, "eval_loss": 0.9890710711479187, "eval_runtime": 226.6603, "eval_samples_per_second": 539.468, "eval_steps_per_second": 2.109, "step": 30000 }, { "epoch": 3.87, "learning_rate": 0.001, "loss": 0.6739, "step": 30200 }, { "epoch": 3.89, "learning_rate": 0.001, "loss": 0.672, "step": 30400 }, { "epoch": 3.92, "learning_rate": 0.001, "loss": 0.6728, "step": 30600 }, { "epoch": 3.95, "learning_rate": 0.001, "loss": 0.6767, "step": 30800 }, { "epoch": 3.97, "learning_rate": 0.001, "loss": 0.6808, "step": 31000 }, { "epoch": 4.0, "learning_rate": 0.001, "loss": 0.6644, "step": 31200 }, { "epoch": 4.02, "learning_rate": 0.001, "loss": 0.6459, "step": 31400 }, { "epoch": 4.05, "learning_rate": 0.001, "loss": 0.6548, "step": 31600 }, { "epoch": 4.07, "learning_rate": 0.001, "loss": 0.6486, "step": 31800 }, { "epoch": 4.1, "learning_rate": 0.001, "loss": 0.6516, "step": 32000 }, { "epoch": 4.13, "learning_rate": 0.001, "loss": 0.6533, "step": 32200 }, { "epoch": 4.15, "learning_rate": 0.001, "loss": 0.6496, "step": 32400 }, { "epoch": 4.18, "learning_rate": 0.001, "loss": 0.6478, "step": 32600 }, { "epoch": 4.2, "learning_rate": 0.001, "loss": 0.6498, "step": 32800 }, { "epoch": 4.23, "learning_rate": 0.001, "loss": 0.6345, "step": 33000 }, { "epoch": 4.25, "learning_rate": 0.001, "loss": 0.6451, "step": 33200 }, { "epoch": 4.28, "learning_rate": 0.001, "loss": 0.6465, "step": 33400 }, { "epoch": 4.3, "learning_rate": 0.001, "loss": 0.643, "step": 33600 }, { "epoch": 4.33, "learning_rate": 0.001, "loss": 0.6573, "step": 33800 }, { "epoch": 4.36, "learning_rate": 0.001, "loss": 0.6473, "step": 34000 }, { "epoch": 4.38, "learning_rate": 0.001, "loss": 0.6523, "step": 34200 }, { "epoch": 4.41, "learning_rate": 0.001, "loss": 0.6507, "step": 34400 }, { "epoch": 4.43, "learning_rate": 0.001, "loss": 0.6478, "step": 34600 }, { "epoch": 4.46, "learning_rate": 0.001, "loss": 0.658, "step": 34800 }, { "epoch": 4.48, "learning_rate": 0.001, "loss": 0.6458, "step": 35000 }, { "epoch": 4.48, "eval_loss": 0.9963102340698242, "eval_runtime": 226.814, "eval_samples_per_second": 539.103, "eval_steps_per_second": 2.107, "step": 35000 }, { "epoch": 4.51, "learning_rate": 0.001, "loss": 0.6567, "step": 35200 }, { "epoch": 4.54, "learning_rate": 0.001, "loss": 0.6474, "step": 35400 }, { "epoch": 4.56, "learning_rate": 0.001, "loss": 0.6518, "step": 35600 }, { "epoch": 4.59, "learning_rate": 0.001, "loss": 0.6509, "step": 35800 }, { "epoch": 4.61, "learning_rate": 0.001, "loss": 0.6512, "step": 36000 }, { "epoch": 4.64, "learning_rate": 0.001, "loss": 0.6501, "step": 36200 }, { "epoch": 4.66, "learning_rate": 0.001, "loss": 0.65, "step": 36400 }, { "epoch": 4.69, "learning_rate": 0.001, "loss": 0.6505, "step": 36600 }, { "epoch": 4.71, "learning_rate": 0.001, "loss": 0.6426, "step": 36800 }, { "epoch": 4.74, "learning_rate": 0.001, "loss": 0.6593, "step": 37000 }, { "epoch": 4.77, "learning_rate": 0.001, "loss": 0.6488, "step": 37200 }, { "epoch": 4.79, "learning_rate": 0.001, "loss": 0.6402, "step": 37400 }, { "epoch": 4.82, "learning_rate": 0.001, "loss": 0.6515, "step": 37600 }, { "epoch": 4.84, "learning_rate": 0.001, "loss": 0.6453, "step": 37800 }, { "epoch": 4.87, "learning_rate": 0.001, "loss": 0.6581, "step": 38000 }, { "epoch": 4.89, "learning_rate": 0.001, "loss": 0.6408, "step": 38200 }, { "epoch": 4.92, "learning_rate": 0.001, "loss": 0.6452, "step": 38400 }, { "epoch": 4.95, "learning_rate": 0.001, "loss": 0.6445, "step": 38600 }, { "epoch": 4.97, "learning_rate": 0.001, "loss": 0.6455, "step": 38800 }, { "epoch": 5.0, "learning_rate": 0.001, "loss": 0.6505, "step": 39000 }, { "epoch": 5.02, "learning_rate": 0.001, "loss": 0.6246, "step": 39200 }, { "epoch": 5.05, "learning_rate": 0.001, "loss": 0.6119, "step": 39400 }, { "epoch": 5.07, "learning_rate": 0.001, "loss": 0.622, "step": 39600 }, { "epoch": 5.1, "learning_rate": 0.001, "loss": 0.6278, "step": 39800 }, { "epoch": 5.12, "learning_rate": 0.001, "loss": 0.6233, "step": 40000 }, { "epoch": 5.12, "eval_loss": 0.9902318716049194, "eval_runtime": 227.3177, "eval_samples_per_second": 537.908, "eval_steps_per_second": 2.103, "step": 40000 }, { "epoch": 5.15, "learning_rate": 0.001, "loss": 0.6309, "step": 40200 }, { "epoch": 5.18, "learning_rate": 0.001, "loss": 0.6308, "step": 40400 }, { "epoch": 5.2, "learning_rate": 0.001, "loss": 0.6173, "step": 40600 }, { "epoch": 5.23, "learning_rate": 0.001, "loss": 0.6188, "step": 40800 }, { "epoch": 5.25, "learning_rate": 0.001, "loss": 0.6412, "step": 41000 }, { "epoch": 5.28, "learning_rate": 0.001, "loss": 0.6232, "step": 41200 }, { "epoch": 5.3, "learning_rate": 0.001, "loss": 0.6393, "step": 41400 }, { "epoch": 5.33, "learning_rate": 0.001, "loss": 0.6305, "step": 41600 }, { "epoch": 5.36, "learning_rate": 0.001, "loss": 0.632, "step": 41800 }, { "epoch": 5.38, "learning_rate": 0.001, "loss": 0.6308, "step": 42000 }, { "epoch": 5.41, "learning_rate": 0.001, "loss": 0.6307, "step": 42200 }, { "epoch": 5.43, "learning_rate": 0.001, "loss": 0.6321, "step": 42400 }, { "epoch": 5.46, "learning_rate": 0.001, "loss": 0.6337, "step": 42600 }, { "epoch": 5.48, "learning_rate": 0.001, "loss": 0.6281, "step": 42800 }, { "epoch": 5.51, "learning_rate": 0.001, "loss": 0.633, "step": 43000 }, { "epoch": 5.53, "learning_rate": 0.001, "loss": 0.6208, "step": 43200 }, { "epoch": 5.56, "learning_rate": 0.001, "loss": 0.6217, "step": 43400 }, { "epoch": 5.59, "learning_rate": 0.001, "loss": 0.6229, "step": 43600 }, { "epoch": 5.61, "learning_rate": 0.001, "loss": 0.6253, "step": 43800 }, { "epoch": 5.64, "learning_rate": 0.001, "loss": 0.6315, "step": 44000 }, { "epoch": 5.66, "learning_rate": 0.001, "loss": 0.6354, "step": 44200 }, { "epoch": 5.69, "learning_rate": 0.001, "loss": 0.6314, "step": 44400 }, { "epoch": 5.71, "learning_rate": 0.001, "loss": 0.6346, "step": 44600 }, { "epoch": 5.74, "learning_rate": 0.001, "loss": 0.6293, "step": 44800 }, { "epoch": 5.77, "learning_rate": 0.001, "loss": 0.6393, "step": 45000 }, { "epoch": 5.77, "eval_loss": 0.9893815517425537, "eval_runtime": 226.9791, "eval_samples_per_second": 538.71, "eval_steps_per_second": 2.106, "step": 45000 }, { "epoch": 5.79, "learning_rate": 0.001, "loss": 0.629, "step": 45200 }, { "epoch": 5.82, "learning_rate": 0.001, "loss": 0.6381, "step": 45400 }, { "epoch": 5.84, "learning_rate": 0.001, "loss": 0.6327, "step": 45600 }, { "epoch": 5.87, "learning_rate": 0.001, "loss": 0.6272, "step": 45800 }, { "epoch": 5.89, "learning_rate": 0.001, "loss": 0.6266, "step": 46000 }, { "epoch": 5.92, "learning_rate": 0.001, "loss": 0.628, "step": 46200 }, { "epoch": 5.94, "learning_rate": 0.001, "loss": 0.6211, "step": 46400 }, { "epoch": 5.97, "learning_rate": 0.001, "loss": 0.628, "step": 46600 }, { "epoch": 6.0, "learning_rate": 0.001, "loss": 0.6351, "step": 46800 }, { "epoch": 6.02, "learning_rate": 0.001, "loss": 0.6172, "step": 47000 }, { "epoch": 6.05, "learning_rate": 0.001, "loss": 0.5967, "step": 47200 }, { "epoch": 6.07, "learning_rate": 0.001, "loss": 0.603, "step": 47400 }, { "epoch": 6.1, "learning_rate": 0.001, "loss": 0.614, "step": 47600 }, { "epoch": 6.12, "learning_rate": 0.001, "loss": 0.6204, "step": 47800 }, { "epoch": 6.15, "learning_rate": 0.001, "loss": 0.6075, "step": 48000 }, { "epoch": 6.18, "learning_rate": 0.001, "loss": 0.612, "step": 48200 }, { "epoch": 6.2, "learning_rate": 0.001, "loss": 0.6184, "step": 48400 }, { "epoch": 6.23, "learning_rate": 0.001, "loss": 0.6062, "step": 48600 }, { "epoch": 6.25, "learning_rate": 0.001, "loss": 0.6146, "step": 48800 }, { "epoch": 6.28, "learning_rate": 0.001, "loss": 0.6172, "step": 49000 }, { "epoch": 6.3, "learning_rate": 0.001, "loss": 0.6066, "step": 49200 }, { "epoch": 6.33, "learning_rate": 0.001, "loss": 0.6185, "step": 49400 }, { "epoch": 6.35, "learning_rate": 0.001, "loss": 0.6121, "step": 49600 }, { "epoch": 6.38, "learning_rate": 0.001, "loss": 0.6054, "step": 49800 }, { "epoch": 6.41, "learning_rate": 0.001, "loss": 0.6035, "step": 50000 }, { "epoch": 6.41, "eval_loss": 0.9930422902107239, "eval_runtime": 227.4872, "eval_samples_per_second": 537.507, "eval_steps_per_second": 2.101, "step": 50000 }, { "epoch": 6.43, "learning_rate": 0.001, "loss": 0.6172, "step": 50200 }, { "epoch": 6.46, "learning_rate": 0.001, "loss": 0.6118, "step": 50400 }, { "epoch": 6.48, "learning_rate": 0.001, "loss": 0.6193, "step": 50600 }, { "epoch": 6.51, "learning_rate": 0.001, "loss": 0.6235, "step": 50800 }, { "epoch": 6.53, "learning_rate": 0.001, "loss": 0.6267, "step": 51000 }, { "epoch": 6.56, "learning_rate": 0.001, "loss": 0.6125, "step": 51200 }, { "epoch": 6.59, "learning_rate": 0.001, "loss": 0.6075, "step": 51400 }, { "epoch": 6.61, "learning_rate": 0.001, "loss": 0.6147, "step": 51600 }, { "epoch": 6.64, "learning_rate": 0.001, "loss": 0.6022, "step": 51800 }, { "epoch": 6.66, "learning_rate": 0.001, "loss": 0.6039, "step": 52000 }, { "epoch": 6.69, "learning_rate": 0.001, "loss": 0.6087, "step": 52200 }, { "epoch": 6.71, "learning_rate": 0.001, "loss": 0.611, "step": 52400 }, { "epoch": 6.74, "learning_rate": 0.001, "loss": 0.6122, "step": 52600 }, { "epoch": 6.76, "learning_rate": 0.001, "loss": 0.6038, "step": 52800 }, { "epoch": 6.79, "learning_rate": 0.001, "loss": 0.617, "step": 53000 }, { "epoch": 6.82, "learning_rate": 0.001, "loss": 0.6068, "step": 53200 }, { "epoch": 6.84, "learning_rate": 0.001, "loss": 0.6186, "step": 53400 }, { "epoch": 6.87, "learning_rate": 0.001, "loss": 0.6131, "step": 53600 }, { "epoch": 6.89, "learning_rate": 0.001, "loss": 0.612, "step": 53800 }, { "epoch": 6.92, "learning_rate": 0.001, "loss": 0.6101, "step": 54000 }, { "epoch": 6.94, "learning_rate": 0.001, "loss": 0.6087, "step": 54200 }, { "epoch": 6.97, "learning_rate": 0.001, "loss": 0.6093, "step": 54400 }, { "epoch": 7.0, "learning_rate": 0.001, "loss": 0.6139, "step": 54600 }, { "epoch": 7.02, "learning_rate": 0.001, "loss": 0.6001, "step": 54800 }, { "epoch": 7.05, "learning_rate": 0.001, "loss": 0.5916, "step": 55000 }, { "epoch": 7.05, "eval_loss": 0.9893601536750793, "eval_runtime": 227.4715, "eval_samples_per_second": 537.544, "eval_steps_per_second": 2.101, "step": 55000 }, { "epoch": 7.07, "learning_rate": 0.001, "loss": 0.5944, "step": 55200 }, { "epoch": 7.1, "learning_rate": 0.001, "loss": 0.59, "step": 55400 }, { "epoch": 7.12, "learning_rate": 0.001, "loss": 0.5999, "step": 55600 }, { "epoch": 7.15, "learning_rate": 0.001, "loss": 0.5887, "step": 55800 }, { "epoch": 7.17, "learning_rate": 0.001, "loss": 0.5939, "step": 56000 }, { "epoch": 7.2, "learning_rate": 0.001, "loss": 0.5941, "step": 56200 }, { "epoch": 7.23, "learning_rate": 0.001, "loss": 0.6025, "step": 56400 }, { "epoch": 7.25, "learning_rate": 0.001, "loss": 0.5955, "step": 56600 }, { "epoch": 7.28, "learning_rate": 0.001, "loss": 0.6038, "step": 56800 }, { "epoch": 7.3, "learning_rate": 0.001, "loss": 0.5878, "step": 57000 }, { "epoch": 7.33, "learning_rate": 0.001, "loss": 0.5974, "step": 57200 }, { "epoch": 7.35, "learning_rate": 0.001, "loss": 0.6144, "step": 57400 }, { "epoch": 7.38, "learning_rate": 0.001, "loss": 0.5916, "step": 57600 }, { "epoch": 7.41, "learning_rate": 0.001, "loss": 0.5897, "step": 57800 }, { "epoch": 7.43, "learning_rate": 0.001, "loss": 0.5954, "step": 58000 }, { "epoch": 7.46, "learning_rate": 0.001, "loss": 0.5975, "step": 58200 }, { "epoch": 7.48, "learning_rate": 0.001, "loss": 0.5953, "step": 58400 }, { "epoch": 7.51, "learning_rate": 0.001, "loss": 0.5967, "step": 58600 }, { "epoch": 7.53, "learning_rate": 0.001, "loss": 0.5988, "step": 58800 }, { "epoch": 7.56, "learning_rate": 0.001, "loss": 0.6058, "step": 59000 }, { "epoch": 7.58, "learning_rate": 0.001, "loss": 0.6007, "step": 59200 }, { "epoch": 7.61, "learning_rate": 0.001, "loss": 0.606, "step": 59400 }, { "epoch": 7.64, "learning_rate": 0.001, "loss": 0.5885, "step": 59600 }, { "epoch": 7.66, "learning_rate": 0.001, "loss": 0.5968, "step": 59800 }, { "epoch": 7.69, "learning_rate": 0.001, "loss": 0.6073, "step": 60000 }, { "epoch": 7.69, "eval_loss": 0.9817278981208801, "eval_runtime": 227.2904, "eval_samples_per_second": 537.973, "eval_steps_per_second": 2.103, "step": 60000 }, { "epoch": 7.71, "learning_rate": 0.001, "loss": 0.5998, "step": 60200 }, { "epoch": 7.74, "learning_rate": 0.001, "loss": 0.5959, "step": 60400 }, { "epoch": 7.76, "learning_rate": 0.001, "loss": 0.5946, "step": 60600 }, { "epoch": 7.79, "learning_rate": 0.001, "loss": 0.601, "step": 60800 }, { "epoch": 7.82, "learning_rate": 0.001, "loss": 0.5963, "step": 61000 }, { "epoch": 7.84, "learning_rate": 0.001, "loss": 0.5996, "step": 61200 }, { "epoch": 7.87, "learning_rate": 0.001, "loss": 0.5981, "step": 61400 }, { "epoch": 7.89, "learning_rate": 0.001, "loss": 0.5945, "step": 61600 }, { "epoch": 7.92, "learning_rate": 0.001, "loss": 0.6057, "step": 61800 }, { "epoch": 7.94, "learning_rate": 0.001, "loss": 0.5952, "step": 62000 }, { "epoch": 7.97, "learning_rate": 0.001, "loss": 0.6012, "step": 62200 }, { "epoch": 7.99, "learning_rate": 0.001, "loss": 0.5979, "step": 62400 }, { "epoch": 8.02, "learning_rate": 0.001, "loss": 0.5965, "step": 62600 }, { "epoch": 8.05, "learning_rate": 0.001, "loss": 0.5767, "step": 62800 }, { "epoch": 8.07, "learning_rate": 0.001, "loss": 0.5697, "step": 63000 }, { "epoch": 8.1, "learning_rate": 0.001, "loss": 0.5761, "step": 63200 }, { "epoch": 8.12, "learning_rate": 0.001, "loss": 0.5808, "step": 63400 }, { "epoch": 8.15, "learning_rate": 0.001, "loss": 0.5734, "step": 63600 }, { "epoch": 8.17, "learning_rate": 0.001, "loss": 0.5767, "step": 63800 }, { "epoch": 8.2, "learning_rate": 0.001, "loss": 0.5874, "step": 64000 }, { "epoch": 8.23, "learning_rate": 0.001, "loss": 0.5773, "step": 64200 }, { "epoch": 8.25, "learning_rate": 0.001, "loss": 0.587, "step": 64400 }, { "epoch": 8.28, "learning_rate": 0.001, "loss": 0.5882, "step": 64600 }, { "epoch": 8.3, "learning_rate": 0.001, "loss": 0.5815, "step": 64800 }, { "epoch": 8.33, "learning_rate": 0.001, "loss": 0.5928, "step": 65000 }, { "epoch": 8.33, "eval_loss": 0.9951310753822327, "eval_runtime": 227.2886, "eval_samples_per_second": 537.977, "eval_steps_per_second": 2.103, "step": 65000 }, { "epoch": 8.35, "learning_rate": 0.001, "loss": 0.5827, "step": 65200 }, { "epoch": 8.38, "learning_rate": 0.001, "loss": 0.5733, "step": 65400 }, { "epoch": 8.4, "learning_rate": 0.001, "loss": 0.5889, "step": 65600 }, { "epoch": 8.43, "learning_rate": 0.001, "loss": 0.589, "step": 65800 }, { "epoch": 8.46, "learning_rate": 0.001, "loss": 0.5902, "step": 66000 }, { "epoch": 8.48, "learning_rate": 0.001, "loss": 0.5899, "step": 66200 }, { "epoch": 8.51, "learning_rate": 0.001, "loss": 0.586, "step": 66400 }, { "epoch": 8.53, "learning_rate": 0.001, "loss": 0.585, "step": 66600 }, { "epoch": 8.56, "learning_rate": 0.001, "loss": 0.5968, "step": 66800 }, { "epoch": 8.58, "learning_rate": 0.001, "loss": 0.5916, "step": 67000 }, { "epoch": 8.61, "learning_rate": 0.001, "loss": 0.5773, "step": 67200 }, { "epoch": 8.64, "learning_rate": 0.001, "loss": 0.5737, "step": 67400 }, { "epoch": 8.66, "learning_rate": 0.001, "loss": 0.5806, "step": 67600 }, { "epoch": 8.69, "learning_rate": 0.001, "loss": 0.5891, "step": 67800 }, { "epoch": 8.71, "learning_rate": 0.001, "loss": 0.5984, "step": 68000 }, { "epoch": 8.74, "learning_rate": 0.001, "loss": 0.599, "step": 68200 }, { "epoch": 8.76, "learning_rate": 0.001, "loss": 0.5892, "step": 68400 }, { "epoch": 8.79, "learning_rate": 0.001, "loss": 0.5785, "step": 68600 }, { "epoch": 8.81, "learning_rate": 0.001, "loss": 0.5892, "step": 68800 }, { "epoch": 8.84, "learning_rate": 0.001, "loss": 0.5812, "step": 69000 }, { "epoch": 8.87, "learning_rate": 0.001, "loss": 0.5943, "step": 69200 }, { "epoch": 8.89, "learning_rate": 0.001, "loss": 0.5926, "step": 69400 }, { "epoch": 8.92, "learning_rate": 0.001, "loss": 0.5798, "step": 69600 }, { "epoch": 8.94, "learning_rate": 0.001, "loss": 0.5911, "step": 69800 }, { "epoch": 8.97, "learning_rate": 0.001, "loss": 0.5846, "step": 70000 }, { "epoch": 8.97, "eval_loss": 0.9769449234008789, "eval_runtime": 227.0836, "eval_samples_per_second": 538.462, "eval_steps_per_second": 2.105, "step": 70000 } ], "max_steps": 78050, "num_train_epochs": 10, "total_flos": 9.84005748927529e+17, "trial_name": null, "trial_params": null }