{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.976, "eval_steps": 100, "global_step": 6100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.016, "grad_norm": 203.1111602783203, "learning_rate": 7.520000000000001e-06, "loss": 3.4547, "step": 100 }, { "epoch": 0.016, "eval_all-nli-dev_cosine_accuracy": 0.737, "eval_all-nli-dev_dot_accuracy": 0.319, "eval_all-nli-dev_euclidean_accuracy": 0.737, "eval_all-nli-dev_manhattan_accuracy": 0.801, "eval_all-nli-dev_max_accuracy": 0.801, "eval_loss": 2.2853124141693115, "eval_runtime": 4.7392, "eval_samples_per_second": 211.008, "eval_steps_per_second": 13.294, "step": 100 }, { "epoch": 0.032, "grad_norm": 38.57483673095703, "learning_rate": 1.552e-05, "loss": 1.6761, "step": 200 }, { "epoch": 0.032, "eval_all-nli-dev_cosine_accuracy": 0.826, "eval_all-nli-dev_dot_accuracy": 0.267, "eval_all-nli-dev_euclidean_accuracy": 0.83, "eval_all-nli-dev_manhattan_accuracy": 0.856, "eval_all-nli-dev_max_accuracy": 0.856, "eval_loss": 1.3493391275405884, "eval_runtime": 4.734, "eval_samples_per_second": 211.236, "eval_steps_per_second": 13.308, "step": 200 }, { "epoch": 0.048, "grad_norm": 36.92991638183594, "learning_rate": 2.3520000000000002e-05, "loss": 1.5528, "step": 300 }, { "epoch": 0.048, "eval_all-nli-dev_cosine_accuracy": 0.805, "eval_all-nli-dev_dot_accuracy": 0.29, "eval_all-nli-dev_euclidean_accuracy": 0.818, "eval_all-nli-dev_manhattan_accuracy": 0.83, "eval_all-nli-dev_max_accuracy": 0.83, "eval_loss": 1.4180811643600464, "eval_runtime": 4.629, "eval_samples_per_second": 216.028, "eval_steps_per_second": 13.61, "step": 300 }, { "epoch": 0.064, "grad_norm": 35.267967224121094, "learning_rate": 3.1519999999999996e-05, "loss": 1.0069, "step": 400 }, { "epoch": 0.064, "eval_all-nli-dev_cosine_accuracy": 0.819, "eval_all-nli-dev_dot_accuracy": 0.244, "eval_all-nli-dev_euclidean_accuracy": 0.824, "eval_all-nli-dev_manhattan_accuracy": 0.835, "eval_all-nli-dev_max_accuracy": 0.835, "eval_loss": 1.3276922702789307, "eval_runtime": 4.5852, "eval_samples_per_second": 218.095, "eval_steps_per_second": 13.74, "step": 400 }, { "epoch": 0.08, "grad_norm": 25.50597381591797, "learning_rate": 3.944e-05, "loss": 1.0611, "step": 500 }, { "epoch": 0.08, "eval_all-nli-dev_cosine_accuracy": 0.814, "eval_all-nli-dev_dot_accuracy": 0.216, "eval_all-nli-dev_euclidean_accuracy": 0.804, "eval_all-nli-dev_manhattan_accuracy": 0.847, "eval_all-nli-dev_max_accuracy": 0.847, "eval_loss": 1.4609754085540771, "eval_runtime": 4.6833, "eval_samples_per_second": 213.524, "eval_steps_per_second": 13.452, "step": 500 }, { "epoch": 0.096, "grad_norm": 55.69437026977539, "learning_rate": 4.744e-05, "loss": 1.1424, "step": 600 }, { "epoch": 0.096, "eval_all-nli-dev_cosine_accuracy": 0.776, "eval_all-nli-dev_dot_accuracy": 0.319, "eval_all-nli-dev_euclidean_accuracy": 0.786, "eval_all-nli-dev_manhattan_accuracy": 0.805, "eval_all-nli-dev_max_accuracy": 0.805, "eval_loss": 1.7394046783447266, "eval_runtime": 4.6079, "eval_samples_per_second": 217.017, "eval_steps_per_second": 13.672, "step": 600 }, { "epoch": 0.112, "grad_norm": 24.330310821533203, "learning_rate": 4.9404444444444447e-05, "loss": 1.3545, "step": 700 }, { "epoch": 0.112, "eval_all-nli-dev_cosine_accuracy": 0.825, "eval_all-nli-dev_dot_accuracy": 0.211, "eval_all-nli-dev_euclidean_accuracy": 0.825, "eval_all-nli-dev_manhattan_accuracy": 0.83, "eval_all-nli-dev_max_accuracy": 0.83, "eval_loss": 1.417900562286377, "eval_runtime": 4.6918, "eval_samples_per_second": 213.136, "eval_steps_per_second": 13.428, "step": 700 }, { "epoch": 0.128, "grad_norm": 18.1214599609375, "learning_rate": 4.852444444444444e-05, "loss": 1.3587, "step": 800 }, { "epoch": 0.128, "eval_all-nli-dev_cosine_accuracy": 0.834, "eval_all-nli-dev_dot_accuracy": 0.175, "eval_all-nli-dev_euclidean_accuracy": 0.832, "eval_all-nli-dev_manhattan_accuracy": 0.84, "eval_all-nli-dev_max_accuracy": 0.84, "eval_loss": 1.6350008249282837, "eval_runtime": 4.57, "eval_samples_per_second": 218.818, "eval_steps_per_second": 13.786, "step": 800 }, { "epoch": 0.144, "grad_norm": 28.469261169433594, "learning_rate": 4.763555555555555e-05, "loss": 1.237, "step": 900 }, { "epoch": 0.144, "eval_all-nli-dev_cosine_accuracy": 0.778, "eval_all-nli-dev_dot_accuracy": 0.235, "eval_all-nli-dev_euclidean_accuracy": 0.778, "eval_all-nli-dev_manhattan_accuracy": 0.801, "eval_all-nli-dev_max_accuracy": 0.801, "eval_loss": 1.6793839931488037, "eval_runtime": 4.5941, "eval_samples_per_second": 217.673, "eval_steps_per_second": 13.713, "step": 900 }, { "epoch": 0.16, "grad_norm": 33.614253997802734, "learning_rate": 4.6746666666666664e-05, "loss": 1.2029, "step": 1000 }, { "epoch": 0.16, "eval_all-nli-dev_cosine_accuracy": 0.799, "eval_all-nli-dev_dot_accuracy": 0.24, "eval_all-nli-dev_euclidean_accuracy": 0.808, "eval_all-nli-dev_manhattan_accuracy": 0.811, "eval_all-nli-dev_max_accuracy": 0.811, "eval_loss": 1.673274040222168, "eval_runtime": 4.6683, "eval_samples_per_second": 214.212, "eval_steps_per_second": 13.495, "step": 1000 }, { "epoch": 0.176, "grad_norm": 26.691509246826172, "learning_rate": 4.5857777777777775e-05, "loss": 1.2748, "step": 1100 }, { "epoch": 0.176, "eval_all-nli-dev_cosine_accuracy": 0.8, "eval_all-nli-dev_dot_accuracy": 0.213, "eval_all-nli-dev_euclidean_accuracy": 0.802, "eval_all-nli-dev_manhattan_accuracy": 0.818, "eval_all-nli-dev_max_accuracy": 0.818, "eval_loss": 1.6359915733337402, "eval_runtime": 4.6977, "eval_samples_per_second": 212.869, "eval_steps_per_second": 13.411, "step": 1100 }, { "epoch": 0.192, "grad_norm": 23.69953727722168, "learning_rate": 4.4968888888888894e-05, "loss": 1.1433, "step": 1200 }, { "epoch": 0.192, "eval_all-nli-dev_cosine_accuracy": 0.786, "eval_all-nli-dev_dot_accuracy": 0.215, "eval_all-nli-dev_euclidean_accuracy": 0.785, "eval_all-nli-dev_manhattan_accuracy": 0.806, "eval_all-nli-dev_max_accuracy": 0.806, "eval_loss": 1.7951678037643433, "eval_runtime": 4.6338, "eval_samples_per_second": 215.807, "eval_steps_per_second": 13.596, "step": 1200 }, { "epoch": 0.208, "grad_norm": 17.910472869873047, "learning_rate": 4.4080000000000005e-05, "loss": 1.0113, "step": 1300 }, { "epoch": 0.208, "eval_all-nli-dev_cosine_accuracy": 0.817, "eval_all-nli-dev_dot_accuracy": 0.178, "eval_all-nli-dev_euclidean_accuracy": 0.815, "eval_all-nli-dev_manhattan_accuracy": 0.817, "eval_all-nli-dev_max_accuracy": 0.817, "eval_loss": 1.4315475225448608, "eval_runtime": 4.6753, "eval_samples_per_second": 213.892, "eval_steps_per_second": 13.475, "step": 1300 }, { "epoch": 0.224, "grad_norm": 43.645694732666016, "learning_rate": 4.3191111111111116e-05, "loss": 0.8216, "step": 1400 }, { "epoch": 0.224, "eval_all-nli-dev_cosine_accuracy": 0.771, "eval_all-nli-dev_dot_accuracy": 0.243, "eval_all-nli-dev_euclidean_accuracy": 0.776, "eval_all-nli-dev_manhattan_accuracy": 0.774, "eval_all-nli-dev_max_accuracy": 0.776, "eval_loss": 1.6300010681152344, "eval_runtime": 4.6418, "eval_samples_per_second": 215.435, "eval_steps_per_second": 13.572, "step": 1400 }, { "epoch": 0.24, "grad_norm": 31.21487045288086, "learning_rate": 4.231111111111111e-05, "loss": 1.3451, "step": 1500 }, { "epoch": 0.24, "eval_all-nli-dev_cosine_accuracy": 0.845, "eval_all-nli-dev_dot_accuracy": 0.186, "eval_all-nli-dev_euclidean_accuracy": 0.85, "eval_all-nli-dev_manhattan_accuracy": 0.856, "eval_all-nli-dev_max_accuracy": 0.856, "eval_loss": 1.1566354036331177, "eval_runtime": 4.6716, "eval_samples_per_second": 214.061, "eval_steps_per_second": 13.486, "step": 1500 }, { "epoch": 0.256, "grad_norm": 25.364757537841797, "learning_rate": 4.142222222222222e-05, "loss": 0.8745, "step": 1600 }, { "epoch": 0.256, "eval_all-nli-dev_cosine_accuracy": 0.825, "eval_all-nli-dev_dot_accuracy": 0.175, "eval_all-nli-dev_euclidean_accuracy": 0.833, "eval_all-nli-dev_manhattan_accuracy": 0.838, "eval_all-nli-dev_max_accuracy": 0.838, "eval_loss": 1.2074507474899292, "eval_runtime": 4.7951, "eval_samples_per_second": 208.546, "eval_steps_per_second": 13.138, "step": 1600 }, { "epoch": 0.272, "grad_norm": 16.138595581054688, "learning_rate": 4.0533333333333334e-05, "loss": 0.9945, "step": 1700 }, { "epoch": 0.272, "eval_all-nli-dev_cosine_accuracy": 0.822, "eval_all-nli-dev_dot_accuracy": 0.191, "eval_all-nli-dev_euclidean_accuracy": 0.824, "eval_all-nli-dev_manhattan_accuracy": 0.831, "eval_all-nli-dev_max_accuracy": 0.831, "eval_loss": 1.3295574188232422, "eval_runtime": 4.7422, "eval_samples_per_second": 210.873, "eval_steps_per_second": 13.285, "step": 1700 }, { "epoch": 0.288, "grad_norm": 21.44277572631836, "learning_rate": 3.9644444444444445e-05, "loss": 0.9827, "step": 1800 }, { "epoch": 0.288, "eval_all-nli-dev_cosine_accuracy": 0.844, "eval_all-nli-dev_dot_accuracy": 0.163, "eval_all-nli-dev_euclidean_accuracy": 0.839, "eval_all-nli-dev_manhattan_accuracy": 0.844, "eval_all-nli-dev_max_accuracy": 0.844, "eval_loss": 1.3051831722259521, "eval_runtime": 4.6529, "eval_samples_per_second": 214.919, "eval_steps_per_second": 13.54, "step": 1800 }, { "epoch": 0.304, "grad_norm": 25.058795928955078, "learning_rate": 3.8755555555555556e-05, "loss": 0.974, "step": 1900 }, { "epoch": 0.304, "eval_all-nli-dev_cosine_accuracy": 0.837, "eval_all-nli-dev_dot_accuracy": 0.161, "eval_all-nli-dev_euclidean_accuracy": 0.838, "eval_all-nli-dev_manhattan_accuracy": 0.85, "eval_all-nli-dev_max_accuracy": 0.85, "eval_loss": 1.164267897605896, "eval_runtime": 4.6376, "eval_samples_per_second": 215.626, "eval_steps_per_second": 13.584, "step": 1900 }, { "epoch": 0.32, "grad_norm": 17.524301528930664, "learning_rate": 3.786666666666667e-05, "loss": 0.7555, "step": 2000 }, { "epoch": 0.32, "eval_all-nli-dev_cosine_accuracy": 0.855, "eval_all-nli-dev_dot_accuracy": 0.147, "eval_all-nli-dev_euclidean_accuracy": 0.856, "eval_all-nli-dev_manhattan_accuracy": 0.869, "eval_all-nli-dev_max_accuracy": 0.869, "eval_loss": 1.2737869024276733, "eval_runtime": 4.6503, "eval_samples_per_second": 215.038, "eval_steps_per_second": 13.547, "step": 2000 }, { "epoch": 0.336, "grad_norm": 10.828136444091797, "learning_rate": 3.697777777777778e-05, "loss": 0.7176, "step": 2100 }, { "epoch": 0.336, "eval_all-nli-dev_cosine_accuracy": 0.832, "eval_all-nli-dev_dot_accuracy": 0.183, "eval_all-nli-dev_euclidean_accuracy": 0.832, "eval_all-nli-dev_manhattan_accuracy": 0.829, "eval_all-nli-dev_max_accuracy": 0.832, "eval_loss": 1.374898910522461, "eval_runtime": 4.6209, "eval_samples_per_second": 216.408, "eval_steps_per_second": 13.634, "step": 2100 }, { "epoch": 0.352, "grad_norm": 11.03420639038086, "learning_rate": 3.608888888888889e-05, "loss": 0.834, "step": 2200 }, { "epoch": 0.352, "eval_all-nli-dev_cosine_accuracy": 0.875, "eval_all-nli-dev_dot_accuracy": 0.147, "eval_all-nli-dev_euclidean_accuracy": 0.879, "eval_all-nli-dev_manhattan_accuracy": 0.874, "eval_all-nli-dev_max_accuracy": 0.879, "eval_loss": 1.071208119392395, "eval_runtime": 4.6829, "eval_samples_per_second": 213.542, "eval_steps_per_second": 13.453, "step": 2200 }, { "epoch": 0.368, "grad_norm": 15.635822296142578, "learning_rate": 3.52e-05, "loss": 1.0819, "step": 2300 }, { "epoch": 0.368, "eval_all-nli-dev_cosine_accuracy": 0.849, "eval_all-nli-dev_dot_accuracy": 0.162, "eval_all-nli-dev_euclidean_accuracy": 0.849, "eval_all-nli-dev_manhattan_accuracy": 0.848, "eval_all-nli-dev_max_accuracy": 0.849, "eval_loss": 1.27626633644104, "eval_runtime": 4.5515, "eval_samples_per_second": 219.707, "eval_steps_per_second": 13.842, "step": 2300 }, { "epoch": 0.384, "grad_norm": 15.611441612243652, "learning_rate": 3.431111111111111e-05, "loss": 0.9515, "step": 2400 }, { "epoch": 0.384, "eval_all-nli-dev_cosine_accuracy": 0.845, "eval_all-nli-dev_dot_accuracy": 0.153, "eval_all-nli-dev_euclidean_accuracy": 0.847, "eval_all-nli-dev_manhattan_accuracy": 0.848, "eval_all-nli-dev_max_accuracy": 0.848, "eval_loss": 1.1383966207504272, "eval_runtime": 4.5335, "eval_samples_per_second": 220.582, "eval_steps_per_second": 13.897, "step": 2400 }, { "epoch": 0.4, "grad_norm": 23.901636123657227, "learning_rate": 3.3422222222222224e-05, "loss": 0.7828, "step": 2500 }, { "epoch": 0.4, "eval_all-nli-dev_cosine_accuracy": 0.859, "eval_all-nli-dev_dot_accuracy": 0.142, "eval_all-nli-dev_euclidean_accuracy": 0.861, "eval_all-nli-dev_manhattan_accuracy": 0.861, "eval_all-nli-dev_max_accuracy": 0.861, "eval_loss": 1.0878574848175049, "eval_runtime": 4.6563, "eval_samples_per_second": 214.765, "eval_steps_per_second": 13.53, "step": 2500 }, { "epoch": 0.416, "grad_norm": 20.29927635192871, "learning_rate": 3.253333333333333e-05, "loss": 0.7268, "step": 2600 }, { "epoch": 0.416, "eval_all-nli-dev_cosine_accuracy": 0.868, "eval_all-nli-dev_dot_accuracy": 0.128, "eval_all-nli-dev_euclidean_accuracy": 0.864, "eval_all-nli-dev_manhattan_accuracy": 0.867, "eval_all-nli-dev_max_accuracy": 0.868, "eval_loss": 0.9835022687911987, "eval_runtime": 4.6005, "eval_samples_per_second": 217.367, "eval_steps_per_second": 13.694, "step": 2600 }, { "epoch": 0.432, "grad_norm": 39.60092544555664, "learning_rate": 3.164444444444444e-05, "loss": 0.9228, "step": 2700 }, { "epoch": 0.432, "eval_all-nli-dev_cosine_accuracy": 0.851, "eval_all-nli-dev_dot_accuracy": 0.15, "eval_all-nli-dev_euclidean_accuracy": 0.848, "eval_all-nli-dev_manhattan_accuracy": 0.848, "eval_all-nli-dev_max_accuracy": 0.851, "eval_loss": 1.1840057373046875, "eval_runtime": 4.6302, "eval_samples_per_second": 215.972, "eval_steps_per_second": 13.606, "step": 2700 }, { "epoch": 0.448, "grad_norm": 26.71760368347168, "learning_rate": 3.075555555555556e-05, "loss": 1.0017, "step": 2800 }, { "epoch": 0.448, "eval_all-nli-dev_cosine_accuracy": 0.85, "eval_all-nli-dev_dot_accuracy": 0.138, "eval_all-nli-dev_euclidean_accuracy": 0.846, "eval_all-nli-dev_manhattan_accuracy": 0.853, "eval_all-nli-dev_max_accuracy": 0.853, "eval_loss": 1.1967850923538208, "eval_runtime": 4.6908, "eval_samples_per_second": 213.184, "eval_steps_per_second": 13.431, "step": 2800 }, { "epoch": 0.464, "grad_norm": 25.688671112060547, "learning_rate": 2.986666666666667e-05, "loss": 0.9138, "step": 2900 }, { "epoch": 0.464, "eval_all-nli-dev_cosine_accuracy": 0.861, "eval_all-nli-dev_dot_accuracy": 0.14, "eval_all-nli-dev_euclidean_accuracy": 0.86, "eval_all-nli-dev_manhattan_accuracy": 0.869, "eval_all-nli-dev_max_accuracy": 0.869, "eval_loss": 0.9930791854858398, "eval_runtime": 4.7105, "eval_samples_per_second": 212.29, "eval_steps_per_second": 13.374, "step": 2900 }, { "epoch": 0.48, "grad_norm": 13.824788093566895, "learning_rate": 2.897777777777778e-05, "loss": 0.8498, "step": 3000 }, { "epoch": 0.48, "eval_all-nli-dev_cosine_accuracy": 0.872, "eval_all-nli-dev_dot_accuracy": 0.129, "eval_all-nli-dev_euclidean_accuracy": 0.871, "eval_all-nli-dev_manhattan_accuracy": 0.876, "eval_all-nli-dev_max_accuracy": 0.876, "eval_loss": 0.9925669431686401, "eval_runtime": 4.67, "eval_samples_per_second": 214.134, "eval_steps_per_second": 13.49, "step": 3000 }, { "epoch": 0.496, "grad_norm": 24.699886322021484, "learning_rate": 2.8088888888888893e-05, "loss": 0.9682, "step": 3100 }, { "epoch": 0.496, "eval_all-nli-dev_cosine_accuracy": 0.863, "eval_all-nli-dev_dot_accuracy": 0.132, "eval_all-nli-dev_euclidean_accuracy": 0.86, "eval_all-nli-dev_manhattan_accuracy": 0.866, "eval_all-nli-dev_max_accuracy": 0.866, "eval_loss": 1.0003857612609863, "eval_runtime": 4.6701, "eval_samples_per_second": 214.128, "eval_steps_per_second": 13.49, "step": 3100 }, { "epoch": 0.512, "grad_norm": 33.25304412841797, "learning_rate": 2.7200000000000004e-05, "loss": 0.7227, "step": 3200 }, { "epoch": 0.512, "eval_all-nli-dev_cosine_accuracy": 0.882, "eval_all-nli-dev_dot_accuracy": 0.118, "eval_all-nli-dev_euclidean_accuracy": 0.883, "eval_all-nli-dev_manhattan_accuracy": 0.88, "eval_all-nli-dev_max_accuracy": 0.883, "eval_loss": 0.8489543199539185, "eval_runtime": 4.5888, "eval_samples_per_second": 217.92, "eval_steps_per_second": 13.729, "step": 3200 }, { "epoch": 0.528, "grad_norm": 12.450774192810059, "learning_rate": 2.6311111111111115e-05, "loss": 0.7134, "step": 3300 }, { "epoch": 0.528, "eval_all-nli-dev_cosine_accuracy": 0.882, "eval_all-nli-dev_dot_accuracy": 0.121, "eval_all-nli-dev_euclidean_accuracy": 0.877, "eval_all-nli-dev_manhattan_accuracy": 0.884, "eval_all-nli-dev_max_accuracy": 0.884, "eval_loss": 0.8214895725250244, "eval_runtime": 4.6881, "eval_samples_per_second": 213.307, "eval_steps_per_second": 13.438, "step": 3300 }, { "epoch": 0.544, "grad_norm": 13.840750694274902, "learning_rate": 2.5422222222222227e-05, "loss": 0.6645, "step": 3400 }, { "epoch": 0.544, "eval_all-nli-dev_cosine_accuracy": 0.873, "eval_all-nli-dev_dot_accuracy": 0.136, "eval_all-nli-dev_euclidean_accuracy": 0.874, "eval_all-nli-dev_manhattan_accuracy": 0.877, "eval_all-nli-dev_max_accuracy": 0.877, "eval_loss": 0.8888874053955078, "eval_runtime": 4.6792, "eval_samples_per_second": 213.711, "eval_steps_per_second": 13.464, "step": 3400 }, { "epoch": 0.56, "grad_norm": 9.944676399230957, "learning_rate": 2.4533333333333334e-05, "loss": 0.7073, "step": 3500 }, { "epoch": 0.56, "eval_all-nli-dev_cosine_accuracy": 0.884, "eval_all-nli-dev_dot_accuracy": 0.108, "eval_all-nli-dev_euclidean_accuracy": 0.884, "eval_all-nli-dev_manhattan_accuracy": 0.888, "eval_all-nli-dev_max_accuracy": 0.888, "eval_loss": 0.8373873829841614, "eval_runtime": 4.6888, "eval_samples_per_second": 213.273, "eval_steps_per_second": 13.436, "step": 3500 }, { "epoch": 0.576, "grad_norm": 23.035810470581055, "learning_rate": 2.3644444444444446e-05, "loss": 0.6679, "step": 3600 }, { "epoch": 0.576, "eval_all-nli-dev_cosine_accuracy": 0.905, "eval_all-nli-dev_dot_accuracy": 0.094, "eval_all-nli-dev_euclidean_accuracy": 0.903, "eval_all-nli-dev_manhattan_accuracy": 0.911, "eval_all-nli-dev_max_accuracy": 0.911, "eval_loss": 0.7780482172966003, "eval_runtime": 4.6847, "eval_samples_per_second": 213.461, "eval_steps_per_second": 13.448, "step": 3600 }, { "epoch": 0.592, "grad_norm": 12.241792678833008, "learning_rate": 2.2755555555555557e-05, "loss": 0.6609, "step": 3700 }, { "epoch": 0.592, "eval_all-nli-dev_cosine_accuracy": 0.893, "eval_all-nli-dev_dot_accuracy": 0.102, "eval_all-nli-dev_euclidean_accuracy": 0.893, "eval_all-nli-dev_manhattan_accuracy": 0.896, "eval_all-nli-dev_max_accuracy": 0.896, "eval_loss": 0.812877357006073, "eval_runtime": 4.685, "eval_samples_per_second": 213.449, "eval_steps_per_second": 13.447, "step": 3700 }, { "epoch": 0.608, "grad_norm": 23.00703239440918, "learning_rate": 2.186666666666667e-05, "loss": 0.687, "step": 3800 }, { "epoch": 0.608, "eval_all-nli-dev_cosine_accuracy": 0.913, "eval_all-nli-dev_dot_accuracy": 0.085, "eval_all-nli-dev_euclidean_accuracy": 0.905, "eval_all-nli-dev_manhattan_accuracy": 0.905, "eval_all-nli-dev_max_accuracy": 0.913, "eval_loss": 0.7215772271156311, "eval_runtime": 4.6566, "eval_samples_per_second": 214.751, "eval_steps_per_second": 13.529, "step": 3800 }, { "epoch": 0.624, "grad_norm": 13.749407768249512, "learning_rate": 2.097777777777778e-05, "loss": 0.5725, "step": 3900 }, { "epoch": 0.624, "eval_all-nli-dev_cosine_accuracy": 0.912, "eval_all-nli-dev_dot_accuracy": 0.09, "eval_all-nli-dev_euclidean_accuracy": 0.908, "eval_all-nli-dev_manhattan_accuracy": 0.92, "eval_all-nli-dev_max_accuracy": 0.92, "eval_loss": 0.7618492841720581, "eval_runtime": 4.6929, "eval_samples_per_second": 213.087, "eval_steps_per_second": 13.424, "step": 3900 }, { "epoch": 0.64, "grad_norm": 14.7637300491333, "learning_rate": 2.008888888888889e-05, "loss": 0.87, "step": 4000 }, { "epoch": 0.64, "eval_all-nli-dev_cosine_accuracy": 0.909, "eval_all-nli-dev_dot_accuracy": 0.086, "eval_all-nli-dev_euclidean_accuracy": 0.909, "eval_all-nli-dev_manhattan_accuracy": 0.909, "eval_all-nli-dev_max_accuracy": 0.909, "eval_loss": 0.706980288028717, "eval_runtime": 4.6894, "eval_samples_per_second": 213.247, "eval_steps_per_second": 13.435, "step": 4000 }, { "epoch": 0.656, "grad_norm": 16.75938606262207, "learning_rate": 1.9200000000000003e-05, "loss": 1.0892, "step": 4100 }, { "epoch": 0.656, "eval_all-nli-dev_cosine_accuracy": 0.901, "eval_all-nli-dev_dot_accuracy": 0.094, "eval_all-nli-dev_euclidean_accuracy": 0.897, "eval_all-nli-dev_manhattan_accuracy": 0.899, "eval_all-nli-dev_max_accuracy": 0.901, "eval_loss": 0.7424288392066956, "eval_runtime": 4.6286, "eval_samples_per_second": 216.046, "eval_steps_per_second": 13.611, "step": 4100 }, { "epoch": 0.672, "grad_norm": 17.46446418762207, "learning_rate": 1.8311111111111114e-05, "loss": 1.048, "step": 4200 }, { "epoch": 0.672, "eval_all-nli-dev_cosine_accuracy": 0.908, "eval_all-nli-dev_dot_accuracy": 0.093, "eval_all-nli-dev_euclidean_accuracy": 0.908, "eval_all-nli-dev_manhattan_accuracy": 0.909, "eval_all-nli-dev_max_accuracy": 0.909, "eval_loss": 0.6750496029853821, "eval_runtime": 4.641, "eval_samples_per_second": 215.47, "eval_steps_per_second": 13.575, "step": 4200 }, { "epoch": 0.688, "grad_norm": 18.15054702758789, "learning_rate": 1.7422222222222222e-05, "loss": 0.8571, "step": 4300 }, { "epoch": 0.688, "eval_all-nli-dev_cosine_accuracy": 0.903, "eval_all-nli-dev_dot_accuracy": 0.095, "eval_all-nli-dev_euclidean_accuracy": 0.901, "eval_all-nli-dev_manhattan_accuracy": 0.902, "eval_all-nli-dev_max_accuracy": 0.903, "eval_loss": 0.6474354863166809, "eval_runtime": 4.7822, "eval_samples_per_second": 209.111, "eval_steps_per_second": 13.174, "step": 4300 }, { "epoch": 0.704, "grad_norm": 15.00839614868164, "learning_rate": 1.6533333333333333e-05, "loss": 0.7945, "step": 4400 }, { "epoch": 0.704, "eval_all-nli-dev_cosine_accuracy": 0.908, "eval_all-nli-dev_dot_accuracy": 0.089, "eval_all-nli-dev_euclidean_accuracy": 0.91, "eval_all-nli-dev_manhattan_accuracy": 0.911, "eval_all-nli-dev_max_accuracy": 0.911, "eval_loss": 0.6094924211502075, "eval_runtime": 4.6578, "eval_samples_per_second": 214.695, "eval_steps_per_second": 13.526, "step": 4400 }, { "epoch": 0.72, "grad_norm": 24.833444595336914, "learning_rate": 1.5644444444444444e-05, "loss": 0.6717, "step": 4500 }, { "epoch": 0.72, "eval_all-nli-dev_cosine_accuracy": 0.93, "eval_all-nli-dev_dot_accuracy": 0.066, "eval_all-nli-dev_euclidean_accuracy": 0.923, "eval_all-nli-dev_manhattan_accuracy": 0.921, "eval_all-nli-dev_max_accuracy": 0.93, "eval_loss": 0.5663518309593201, "eval_runtime": 4.7483, "eval_samples_per_second": 210.6, "eval_steps_per_second": 13.268, "step": 4500 }, { "epoch": 0.736, "grad_norm": 1504.730224609375, "learning_rate": 1.4755555555555556e-05, "loss": 0.8161, "step": 4600 }, { "epoch": 0.736, "eval_all-nli-dev_cosine_accuracy": 0.919, "eval_all-nli-dev_dot_accuracy": 0.07, "eval_all-nli-dev_euclidean_accuracy": 0.916, "eval_all-nli-dev_manhattan_accuracy": 0.918, "eval_all-nli-dev_max_accuracy": 0.919, "eval_loss": 0.5479408502578735, "eval_runtime": 4.6248, "eval_samples_per_second": 216.224, "eval_steps_per_second": 13.622, "step": 4600 }, { "epoch": 0.752, "grad_norm": 30.87934684753418, "learning_rate": 1.3866666666666667e-05, "loss": 0.7917, "step": 4700 }, { "epoch": 0.752, "eval_all-nli-dev_cosine_accuracy": 0.911, "eval_all-nli-dev_dot_accuracy": 0.083, "eval_all-nli-dev_euclidean_accuracy": 0.909, "eval_all-nli-dev_manhattan_accuracy": 0.907, "eval_all-nli-dev_max_accuracy": 0.911, "eval_loss": 0.6419683694839478, "eval_runtime": 4.5916, "eval_samples_per_second": 217.788, "eval_steps_per_second": 13.721, "step": 4700 }, { "epoch": 0.768, "grad_norm": 23.224735260009766, "learning_rate": 1.2977777777777777e-05, "loss": 0.7711, "step": 4800 }, { "epoch": 0.768, "eval_all-nli-dev_cosine_accuracy": 0.916, "eval_all-nli-dev_dot_accuracy": 0.078, "eval_all-nli-dev_euclidean_accuracy": 0.914, "eval_all-nli-dev_manhattan_accuracy": 0.913, "eval_all-nli-dev_max_accuracy": 0.916, "eval_loss": 0.5856308341026306, "eval_runtime": 4.5755, "eval_samples_per_second": 218.553, "eval_steps_per_second": 13.769, "step": 4800 }, { "epoch": 0.784, "grad_norm": 13.215950012207031, "learning_rate": 1.208888888888889e-05, "loss": 0.6441, "step": 4900 }, { "epoch": 0.784, "eval_all-nli-dev_cosine_accuracy": 0.916, "eval_all-nli-dev_dot_accuracy": 0.079, "eval_all-nli-dev_euclidean_accuracy": 0.913, "eval_all-nli-dev_manhattan_accuracy": 0.913, "eval_all-nli-dev_max_accuracy": 0.916, "eval_loss": 0.5775041580200195, "eval_runtime": 4.5681, "eval_samples_per_second": 218.91, "eval_steps_per_second": 13.791, "step": 4900 }, { "epoch": 0.8, "grad_norm": 18.222217559814453, "learning_rate": 1.1200000000000001e-05, "loss": 0.7766, "step": 5000 }, { "epoch": 0.8, "eval_all-nli-dev_cosine_accuracy": 0.922, "eval_all-nli-dev_dot_accuracy": 0.077, "eval_all-nli-dev_euclidean_accuracy": 0.92, "eval_all-nli-dev_manhattan_accuracy": 0.917, "eval_all-nli-dev_max_accuracy": 0.922, "eval_loss": 0.5785014629364014, "eval_runtime": 4.6158, "eval_samples_per_second": 216.645, "eval_steps_per_second": 13.649, "step": 5000 }, { "epoch": 0.816, "grad_norm": 10.330174446105957, "learning_rate": 1.031111111111111e-05, "loss": 0.6009, "step": 5100 }, { "epoch": 0.816, "eval_all-nli-dev_cosine_accuracy": 0.921, "eval_all-nli-dev_dot_accuracy": 0.081, "eval_all-nli-dev_euclidean_accuracy": 0.917, "eval_all-nli-dev_manhattan_accuracy": 0.919, "eval_all-nli-dev_max_accuracy": 0.921, "eval_loss": 0.5679826140403748, "eval_runtime": 4.5803, "eval_samples_per_second": 218.325, "eval_steps_per_second": 13.754, "step": 5100 }, { "epoch": 0.832, "grad_norm": 14.917418479919434, "learning_rate": 9.422222222222222e-06, "loss": 0.6711, "step": 5200 }, { "epoch": 0.832, "eval_all-nli-dev_cosine_accuracy": 0.921, "eval_all-nli-dev_dot_accuracy": 0.074, "eval_all-nli-dev_euclidean_accuracy": 0.917, "eval_all-nli-dev_manhattan_accuracy": 0.92, "eval_all-nli-dev_max_accuracy": 0.921, "eval_loss": 0.5487431883811951, "eval_runtime": 4.6796, "eval_samples_per_second": 213.694, "eval_steps_per_second": 13.463, "step": 5200 }, { "epoch": 0.848, "grad_norm": 32.70161056518555, "learning_rate": 8.533333333333334e-06, "loss": 0.618, "step": 5300 }, { "epoch": 0.848, "eval_all-nli-dev_cosine_accuracy": 0.926, "eval_all-nli-dev_dot_accuracy": 0.074, "eval_all-nli-dev_euclidean_accuracy": 0.921, "eval_all-nli-dev_manhattan_accuracy": 0.922, "eval_all-nli-dev_max_accuracy": 0.926, "eval_loss": 0.5450394749641418, "eval_runtime": 4.662, "eval_samples_per_second": 214.5, "eval_steps_per_second": 13.513, "step": 5300 }, { "epoch": 0.864, "grad_norm": 13.764747619628906, "learning_rate": 7.644444444444445e-06, "loss": 0.6702, "step": 5400 }, { "epoch": 0.864, "eval_all-nli-dev_cosine_accuracy": 0.926, "eval_all-nli-dev_dot_accuracy": 0.073, "eval_all-nli-dev_euclidean_accuracy": 0.921, "eval_all-nli-dev_manhattan_accuracy": 0.919, "eval_all-nli-dev_max_accuracy": 0.926, "eval_loss": 0.5497583150863647, "eval_runtime": 4.5535, "eval_samples_per_second": 219.613, "eval_steps_per_second": 13.836, "step": 5400 }, { "epoch": 0.88, "grad_norm": 29.501554489135742, "learning_rate": 6.755555555555555e-06, "loss": 0.7039, "step": 5500 }, { "epoch": 0.88, "eval_all-nli-dev_cosine_accuracy": 0.927, "eval_all-nli-dev_dot_accuracy": 0.07, "eval_all-nli-dev_euclidean_accuracy": 0.926, "eval_all-nli-dev_manhattan_accuracy": 0.922, "eval_all-nli-dev_max_accuracy": 0.927, "eval_loss": 0.5191856622695923, "eval_runtime": 4.5392, "eval_samples_per_second": 220.305, "eval_steps_per_second": 13.879, "step": 5500 }, { "epoch": 0.896, "grad_norm": 19.63087272644043, "learning_rate": 5.866666666666667e-06, "loss": 0.6114, "step": 5600 }, { "epoch": 0.896, "eval_all-nli-dev_cosine_accuracy": 0.932, "eval_all-nli-dev_dot_accuracy": 0.067, "eval_all-nli-dev_euclidean_accuracy": 0.931, "eval_all-nli-dev_manhattan_accuracy": 0.93, "eval_all-nli-dev_max_accuracy": 0.932, "eval_loss": 0.5045494437217712, "eval_runtime": 4.6367, "eval_samples_per_second": 215.672, "eval_steps_per_second": 13.587, "step": 5600 }, { "epoch": 0.912, "grad_norm": 57.868019104003906, "learning_rate": 4.977777777777778e-06, "loss": 0.7761, "step": 5700 }, { "epoch": 0.912, "eval_all-nli-dev_cosine_accuracy": 0.934, "eval_all-nli-dev_dot_accuracy": 0.061, "eval_all-nli-dev_euclidean_accuracy": 0.931, "eval_all-nli-dev_manhattan_accuracy": 0.928, "eval_all-nli-dev_max_accuracy": 0.934, "eval_loss": 0.5033252835273743, "eval_runtime": 4.6312, "eval_samples_per_second": 215.928, "eval_steps_per_second": 13.603, "step": 5700 }, { "epoch": 0.928, "grad_norm": 11.63007640838623, "learning_rate": 4.088888888888889e-06, "loss": 0.6248, "step": 5800 }, { "epoch": 0.928, "eval_all-nli-dev_cosine_accuracy": 0.932, "eval_all-nli-dev_dot_accuracy": 0.068, "eval_all-nli-dev_euclidean_accuracy": 0.926, "eval_all-nli-dev_manhattan_accuracy": 0.926, "eval_all-nli-dev_max_accuracy": 0.932, "eval_loss": 0.5013440251350403, "eval_runtime": 4.6162, "eval_samples_per_second": 216.629, "eval_steps_per_second": 13.648, "step": 5800 }, { "epoch": 0.944, "grad_norm": 16.12616539001465, "learning_rate": 3.2000000000000003e-06, "loss": 0.8359, "step": 5900 }, { "epoch": 0.944, "eval_all-nli-dev_cosine_accuracy": 0.93, "eval_all-nli-dev_dot_accuracy": 0.07, "eval_all-nli-dev_euclidean_accuracy": 0.926, "eval_all-nli-dev_manhattan_accuracy": 0.923, "eval_all-nli-dev_max_accuracy": 0.93, "eval_loss": 0.49764034152030945, "eval_runtime": 4.6817, "eval_samples_per_second": 213.6, "eval_steps_per_second": 13.457, "step": 5900 }, { "epoch": 0.96, "grad_norm": 16.06106185913086, "learning_rate": 2.311111111111111e-06, "loss": 0.8764, "step": 6000 }, { "epoch": 0.96, "eval_all-nli-dev_cosine_accuracy": 0.936, "eval_all-nli-dev_dot_accuracy": 0.062, "eval_all-nli-dev_euclidean_accuracy": 0.928, "eval_all-nli-dev_manhattan_accuracy": 0.928, "eval_all-nli-dev_max_accuracy": 0.936, "eval_loss": 0.49757900834083557, "eval_runtime": 4.6133, "eval_samples_per_second": 216.765, "eval_steps_per_second": 13.656, "step": 6000 }, { "epoch": 0.976, "grad_norm": 0.0003809410845860839, "learning_rate": 1.4222222222222223e-06, "loss": 0.763, "step": 6100 }, { "epoch": 0.976, "eval_all-nli-dev_cosine_accuracy": 0.935, "eval_all-nli-dev_dot_accuracy": 0.061, "eval_all-nli-dev_euclidean_accuracy": 0.93, "eval_all-nli-dev_manhattan_accuracy": 0.929, "eval_all-nli-dev_max_accuracy": 0.935, "eval_loss": 0.48454272747039795, "eval_runtime": 4.737, "eval_samples_per_second": 211.106, "eval_steps_per_second": 13.3, "step": 6100 } ], "logging_steps": 100, "max_steps": 6250, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }