|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.976, |
|
"eval_steps": 100, |
|
"global_step": 6100, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.016, |
|
"grad_norm": 203.1111602783203, |
|
"learning_rate": 7.520000000000001e-06, |
|
"loss": 3.4547, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.016, |
|
"eval_all-nli-dev_cosine_accuracy": 0.737, |
|
"eval_all-nli-dev_dot_accuracy": 0.319, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.737, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.801, |
|
"eval_all-nli-dev_max_accuracy": 0.801, |
|
"eval_loss": 2.2853124141693115, |
|
"eval_runtime": 4.7392, |
|
"eval_samples_per_second": 211.008, |
|
"eval_steps_per_second": 13.294, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.032, |
|
"grad_norm": 38.57483673095703, |
|
"learning_rate": 1.552e-05, |
|
"loss": 1.6761, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.032, |
|
"eval_all-nli-dev_cosine_accuracy": 0.826, |
|
"eval_all-nli-dev_dot_accuracy": 0.267, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.83, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.856, |
|
"eval_all-nli-dev_max_accuracy": 0.856, |
|
"eval_loss": 1.3493391275405884, |
|
"eval_runtime": 4.734, |
|
"eval_samples_per_second": 211.236, |
|
"eval_steps_per_second": 13.308, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.048, |
|
"grad_norm": 36.92991638183594, |
|
"learning_rate": 2.3520000000000002e-05, |
|
"loss": 1.5528, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.048, |
|
"eval_all-nli-dev_cosine_accuracy": 0.805, |
|
"eval_all-nli-dev_dot_accuracy": 0.29, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.818, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.83, |
|
"eval_all-nli-dev_max_accuracy": 0.83, |
|
"eval_loss": 1.4180811643600464, |
|
"eval_runtime": 4.629, |
|
"eval_samples_per_second": 216.028, |
|
"eval_steps_per_second": 13.61, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.064, |
|
"grad_norm": 35.267967224121094, |
|
"learning_rate": 3.1519999999999996e-05, |
|
"loss": 1.0069, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.064, |
|
"eval_all-nli-dev_cosine_accuracy": 0.819, |
|
"eval_all-nli-dev_dot_accuracy": 0.244, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.824, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.835, |
|
"eval_all-nli-dev_max_accuracy": 0.835, |
|
"eval_loss": 1.3276922702789307, |
|
"eval_runtime": 4.5852, |
|
"eval_samples_per_second": 218.095, |
|
"eval_steps_per_second": 13.74, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 25.50597381591797, |
|
"learning_rate": 3.944e-05, |
|
"loss": 1.0611, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"eval_all-nli-dev_cosine_accuracy": 0.814, |
|
"eval_all-nli-dev_dot_accuracy": 0.216, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.804, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.847, |
|
"eval_all-nli-dev_max_accuracy": 0.847, |
|
"eval_loss": 1.4609754085540771, |
|
"eval_runtime": 4.6833, |
|
"eval_samples_per_second": 213.524, |
|
"eval_steps_per_second": 13.452, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.096, |
|
"grad_norm": 55.69437026977539, |
|
"learning_rate": 4.744e-05, |
|
"loss": 1.1424, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.096, |
|
"eval_all-nli-dev_cosine_accuracy": 0.776, |
|
"eval_all-nli-dev_dot_accuracy": 0.319, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.786, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.805, |
|
"eval_all-nli-dev_max_accuracy": 0.805, |
|
"eval_loss": 1.7394046783447266, |
|
"eval_runtime": 4.6079, |
|
"eval_samples_per_second": 217.017, |
|
"eval_steps_per_second": 13.672, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.112, |
|
"grad_norm": 24.330310821533203, |
|
"learning_rate": 4.9404444444444447e-05, |
|
"loss": 1.3545, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.112, |
|
"eval_all-nli-dev_cosine_accuracy": 0.825, |
|
"eval_all-nli-dev_dot_accuracy": 0.211, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.825, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.83, |
|
"eval_all-nli-dev_max_accuracy": 0.83, |
|
"eval_loss": 1.417900562286377, |
|
"eval_runtime": 4.6918, |
|
"eval_samples_per_second": 213.136, |
|
"eval_steps_per_second": 13.428, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.128, |
|
"grad_norm": 18.1214599609375, |
|
"learning_rate": 4.852444444444444e-05, |
|
"loss": 1.3587, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.128, |
|
"eval_all-nli-dev_cosine_accuracy": 0.834, |
|
"eval_all-nli-dev_dot_accuracy": 0.175, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.832, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.84, |
|
"eval_all-nli-dev_max_accuracy": 0.84, |
|
"eval_loss": 1.6350008249282837, |
|
"eval_runtime": 4.57, |
|
"eval_samples_per_second": 218.818, |
|
"eval_steps_per_second": 13.786, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.144, |
|
"grad_norm": 28.469261169433594, |
|
"learning_rate": 4.763555555555555e-05, |
|
"loss": 1.237, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.144, |
|
"eval_all-nli-dev_cosine_accuracy": 0.778, |
|
"eval_all-nli-dev_dot_accuracy": 0.235, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.778, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.801, |
|
"eval_all-nli-dev_max_accuracy": 0.801, |
|
"eval_loss": 1.6793839931488037, |
|
"eval_runtime": 4.5941, |
|
"eval_samples_per_second": 217.673, |
|
"eval_steps_per_second": 13.713, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 33.614253997802734, |
|
"learning_rate": 4.6746666666666664e-05, |
|
"loss": 1.2029, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"eval_all-nli-dev_cosine_accuracy": 0.799, |
|
"eval_all-nli-dev_dot_accuracy": 0.24, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.808, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.811, |
|
"eval_all-nli-dev_max_accuracy": 0.811, |
|
"eval_loss": 1.673274040222168, |
|
"eval_runtime": 4.6683, |
|
"eval_samples_per_second": 214.212, |
|
"eval_steps_per_second": 13.495, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.176, |
|
"grad_norm": 26.691509246826172, |
|
"learning_rate": 4.5857777777777775e-05, |
|
"loss": 1.2748, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.176, |
|
"eval_all-nli-dev_cosine_accuracy": 0.8, |
|
"eval_all-nli-dev_dot_accuracy": 0.213, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.802, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.818, |
|
"eval_all-nli-dev_max_accuracy": 0.818, |
|
"eval_loss": 1.6359915733337402, |
|
"eval_runtime": 4.6977, |
|
"eval_samples_per_second": 212.869, |
|
"eval_steps_per_second": 13.411, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.192, |
|
"grad_norm": 23.69953727722168, |
|
"learning_rate": 4.4968888888888894e-05, |
|
"loss": 1.1433, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.192, |
|
"eval_all-nli-dev_cosine_accuracy": 0.786, |
|
"eval_all-nli-dev_dot_accuracy": 0.215, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.785, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.806, |
|
"eval_all-nli-dev_max_accuracy": 0.806, |
|
"eval_loss": 1.7951678037643433, |
|
"eval_runtime": 4.6338, |
|
"eval_samples_per_second": 215.807, |
|
"eval_steps_per_second": 13.596, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.208, |
|
"grad_norm": 17.910472869873047, |
|
"learning_rate": 4.4080000000000005e-05, |
|
"loss": 1.0113, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.208, |
|
"eval_all-nli-dev_cosine_accuracy": 0.817, |
|
"eval_all-nli-dev_dot_accuracy": 0.178, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.815, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.817, |
|
"eval_all-nli-dev_max_accuracy": 0.817, |
|
"eval_loss": 1.4315475225448608, |
|
"eval_runtime": 4.6753, |
|
"eval_samples_per_second": 213.892, |
|
"eval_steps_per_second": 13.475, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.224, |
|
"grad_norm": 43.645694732666016, |
|
"learning_rate": 4.3191111111111116e-05, |
|
"loss": 0.8216, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.224, |
|
"eval_all-nli-dev_cosine_accuracy": 0.771, |
|
"eval_all-nli-dev_dot_accuracy": 0.243, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.776, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.774, |
|
"eval_all-nli-dev_max_accuracy": 0.776, |
|
"eval_loss": 1.6300010681152344, |
|
"eval_runtime": 4.6418, |
|
"eval_samples_per_second": 215.435, |
|
"eval_steps_per_second": 13.572, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 31.21487045288086, |
|
"learning_rate": 4.231111111111111e-05, |
|
"loss": 1.3451, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"eval_all-nli-dev_cosine_accuracy": 0.845, |
|
"eval_all-nli-dev_dot_accuracy": 0.186, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.85, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.856, |
|
"eval_all-nli-dev_max_accuracy": 0.856, |
|
"eval_loss": 1.1566354036331177, |
|
"eval_runtime": 4.6716, |
|
"eval_samples_per_second": 214.061, |
|
"eval_steps_per_second": 13.486, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.256, |
|
"grad_norm": 25.364757537841797, |
|
"learning_rate": 4.142222222222222e-05, |
|
"loss": 0.8745, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.256, |
|
"eval_all-nli-dev_cosine_accuracy": 0.825, |
|
"eval_all-nli-dev_dot_accuracy": 0.175, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.833, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.838, |
|
"eval_all-nli-dev_max_accuracy": 0.838, |
|
"eval_loss": 1.2074507474899292, |
|
"eval_runtime": 4.7951, |
|
"eval_samples_per_second": 208.546, |
|
"eval_steps_per_second": 13.138, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.272, |
|
"grad_norm": 16.138595581054688, |
|
"learning_rate": 4.0533333333333334e-05, |
|
"loss": 0.9945, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.272, |
|
"eval_all-nli-dev_cosine_accuracy": 0.822, |
|
"eval_all-nli-dev_dot_accuracy": 0.191, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.824, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.831, |
|
"eval_all-nli-dev_max_accuracy": 0.831, |
|
"eval_loss": 1.3295574188232422, |
|
"eval_runtime": 4.7422, |
|
"eval_samples_per_second": 210.873, |
|
"eval_steps_per_second": 13.285, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.288, |
|
"grad_norm": 21.44277572631836, |
|
"learning_rate": 3.9644444444444445e-05, |
|
"loss": 0.9827, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.288, |
|
"eval_all-nli-dev_cosine_accuracy": 0.844, |
|
"eval_all-nli-dev_dot_accuracy": 0.163, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.839, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.844, |
|
"eval_all-nli-dev_max_accuracy": 0.844, |
|
"eval_loss": 1.3051831722259521, |
|
"eval_runtime": 4.6529, |
|
"eval_samples_per_second": 214.919, |
|
"eval_steps_per_second": 13.54, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.304, |
|
"grad_norm": 25.058795928955078, |
|
"learning_rate": 3.8755555555555556e-05, |
|
"loss": 0.974, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.304, |
|
"eval_all-nli-dev_cosine_accuracy": 0.837, |
|
"eval_all-nli-dev_dot_accuracy": 0.161, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.838, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.85, |
|
"eval_all-nli-dev_max_accuracy": 0.85, |
|
"eval_loss": 1.164267897605896, |
|
"eval_runtime": 4.6376, |
|
"eval_samples_per_second": 215.626, |
|
"eval_steps_per_second": 13.584, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 17.524301528930664, |
|
"learning_rate": 3.786666666666667e-05, |
|
"loss": 0.7555, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"eval_all-nli-dev_cosine_accuracy": 0.855, |
|
"eval_all-nli-dev_dot_accuracy": 0.147, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.856, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.869, |
|
"eval_all-nli-dev_max_accuracy": 0.869, |
|
"eval_loss": 1.2737869024276733, |
|
"eval_runtime": 4.6503, |
|
"eval_samples_per_second": 215.038, |
|
"eval_steps_per_second": 13.547, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.336, |
|
"grad_norm": 10.828136444091797, |
|
"learning_rate": 3.697777777777778e-05, |
|
"loss": 0.7176, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.336, |
|
"eval_all-nli-dev_cosine_accuracy": 0.832, |
|
"eval_all-nli-dev_dot_accuracy": 0.183, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.832, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.829, |
|
"eval_all-nli-dev_max_accuracy": 0.832, |
|
"eval_loss": 1.374898910522461, |
|
"eval_runtime": 4.6209, |
|
"eval_samples_per_second": 216.408, |
|
"eval_steps_per_second": 13.634, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.352, |
|
"grad_norm": 11.03420639038086, |
|
"learning_rate": 3.608888888888889e-05, |
|
"loss": 0.834, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.352, |
|
"eval_all-nli-dev_cosine_accuracy": 0.875, |
|
"eval_all-nli-dev_dot_accuracy": 0.147, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.879, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.874, |
|
"eval_all-nli-dev_max_accuracy": 0.879, |
|
"eval_loss": 1.071208119392395, |
|
"eval_runtime": 4.6829, |
|
"eval_samples_per_second": 213.542, |
|
"eval_steps_per_second": 13.453, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.368, |
|
"grad_norm": 15.635822296142578, |
|
"learning_rate": 3.52e-05, |
|
"loss": 1.0819, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.368, |
|
"eval_all-nli-dev_cosine_accuracy": 0.849, |
|
"eval_all-nli-dev_dot_accuracy": 0.162, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.849, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.848, |
|
"eval_all-nli-dev_max_accuracy": 0.849, |
|
"eval_loss": 1.27626633644104, |
|
"eval_runtime": 4.5515, |
|
"eval_samples_per_second": 219.707, |
|
"eval_steps_per_second": 13.842, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.384, |
|
"grad_norm": 15.611441612243652, |
|
"learning_rate": 3.431111111111111e-05, |
|
"loss": 0.9515, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.384, |
|
"eval_all-nli-dev_cosine_accuracy": 0.845, |
|
"eval_all-nli-dev_dot_accuracy": 0.153, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.847, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.848, |
|
"eval_all-nli-dev_max_accuracy": 0.848, |
|
"eval_loss": 1.1383966207504272, |
|
"eval_runtime": 4.5335, |
|
"eval_samples_per_second": 220.582, |
|
"eval_steps_per_second": 13.897, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 23.901636123657227, |
|
"learning_rate": 3.3422222222222224e-05, |
|
"loss": 0.7828, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"eval_all-nli-dev_cosine_accuracy": 0.859, |
|
"eval_all-nli-dev_dot_accuracy": 0.142, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.861, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.861, |
|
"eval_all-nli-dev_max_accuracy": 0.861, |
|
"eval_loss": 1.0878574848175049, |
|
"eval_runtime": 4.6563, |
|
"eval_samples_per_second": 214.765, |
|
"eval_steps_per_second": 13.53, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.416, |
|
"grad_norm": 20.29927635192871, |
|
"learning_rate": 3.253333333333333e-05, |
|
"loss": 0.7268, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.416, |
|
"eval_all-nli-dev_cosine_accuracy": 0.868, |
|
"eval_all-nli-dev_dot_accuracy": 0.128, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.864, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.867, |
|
"eval_all-nli-dev_max_accuracy": 0.868, |
|
"eval_loss": 0.9835022687911987, |
|
"eval_runtime": 4.6005, |
|
"eval_samples_per_second": 217.367, |
|
"eval_steps_per_second": 13.694, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.432, |
|
"grad_norm": 39.60092544555664, |
|
"learning_rate": 3.164444444444444e-05, |
|
"loss": 0.9228, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.432, |
|
"eval_all-nli-dev_cosine_accuracy": 0.851, |
|
"eval_all-nli-dev_dot_accuracy": 0.15, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.848, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.848, |
|
"eval_all-nli-dev_max_accuracy": 0.851, |
|
"eval_loss": 1.1840057373046875, |
|
"eval_runtime": 4.6302, |
|
"eval_samples_per_second": 215.972, |
|
"eval_steps_per_second": 13.606, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.448, |
|
"grad_norm": 26.71760368347168, |
|
"learning_rate": 3.075555555555556e-05, |
|
"loss": 1.0017, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.448, |
|
"eval_all-nli-dev_cosine_accuracy": 0.85, |
|
"eval_all-nli-dev_dot_accuracy": 0.138, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.846, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.853, |
|
"eval_all-nli-dev_max_accuracy": 0.853, |
|
"eval_loss": 1.1967850923538208, |
|
"eval_runtime": 4.6908, |
|
"eval_samples_per_second": 213.184, |
|
"eval_steps_per_second": 13.431, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.464, |
|
"grad_norm": 25.688671112060547, |
|
"learning_rate": 2.986666666666667e-05, |
|
"loss": 0.9138, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.464, |
|
"eval_all-nli-dev_cosine_accuracy": 0.861, |
|
"eval_all-nli-dev_dot_accuracy": 0.14, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.86, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.869, |
|
"eval_all-nli-dev_max_accuracy": 0.869, |
|
"eval_loss": 0.9930791854858398, |
|
"eval_runtime": 4.7105, |
|
"eval_samples_per_second": 212.29, |
|
"eval_steps_per_second": 13.374, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 13.824788093566895, |
|
"learning_rate": 2.897777777777778e-05, |
|
"loss": 0.8498, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"eval_all-nli-dev_cosine_accuracy": 0.872, |
|
"eval_all-nli-dev_dot_accuracy": 0.129, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.871, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.876, |
|
"eval_all-nli-dev_max_accuracy": 0.876, |
|
"eval_loss": 0.9925669431686401, |
|
"eval_runtime": 4.67, |
|
"eval_samples_per_second": 214.134, |
|
"eval_steps_per_second": 13.49, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.496, |
|
"grad_norm": 24.699886322021484, |
|
"learning_rate": 2.8088888888888893e-05, |
|
"loss": 0.9682, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.496, |
|
"eval_all-nli-dev_cosine_accuracy": 0.863, |
|
"eval_all-nli-dev_dot_accuracy": 0.132, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.86, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.866, |
|
"eval_all-nli-dev_max_accuracy": 0.866, |
|
"eval_loss": 1.0003857612609863, |
|
"eval_runtime": 4.6701, |
|
"eval_samples_per_second": 214.128, |
|
"eval_steps_per_second": 13.49, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.512, |
|
"grad_norm": 33.25304412841797, |
|
"learning_rate": 2.7200000000000004e-05, |
|
"loss": 0.7227, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.512, |
|
"eval_all-nli-dev_cosine_accuracy": 0.882, |
|
"eval_all-nli-dev_dot_accuracy": 0.118, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.883, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.88, |
|
"eval_all-nli-dev_max_accuracy": 0.883, |
|
"eval_loss": 0.8489543199539185, |
|
"eval_runtime": 4.5888, |
|
"eval_samples_per_second": 217.92, |
|
"eval_steps_per_second": 13.729, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.528, |
|
"grad_norm": 12.450774192810059, |
|
"learning_rate": 2.6311111111111115e-05, |
|
"loss": 0.7134, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.528, |
|
"eval_all-nli-dev_cosine_accuracy": 0.882, |
|
"eval_all-nli-dev_dot_accuracy": 0.121, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.877, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.884, |
|
"eval_all-nli-dev_max_accuracy": 0.884, |
|
"eval_loss": 0.8214895725250244, |
|
"eval_runtime": 4.6881, |
|
"eval_samples_per_second": 213.307, |
|
"eval_steps_per_second": 13.438, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.544, |
|
"grad_norm": 13.840750694274902, |
|
"learning_rate": 2.5422222222222227e-05, |
|
"loss": 0.6645, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.544, |
|
"eval_all-nli-dev_cosine_accuracy": 0.873, |
|
"eval_all-nli-dev_dot_accuracy": 0.136, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.874, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.877, |
|
"eval_all-nli-dev_max_accuracy": 0.877, |
|
"eval_loss": 0.8888874053955078, |
|
"eval_runtime": 4.6792, |
|
"eval_samples_per_second": 213.711, |
|
"eval_steps_per_second": 13.464, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 9.944676399230957, |
|
"learning_rate": 2.4533333333333334e-05, |
|
"loss": 0.7073, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"eval_all-nli-dev_cosine_accuracy": 0.884, |
|
"eval_all-nli-dev_dot_accuracy": 0.108, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.884, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.888, |
|
"eval_all-nli-dev_max_accuracy": 0.888, |
|
"eval_loss": 0.8373873829841614, |
|
"eval_runtime": 4.6888, |
|
"eval_samples_per_second": 213.273, |
|
"eval_steps_per_second": 13.436, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.576, |
|
"grad_norm": 23.035810470581055, |
|
"learning_rate": 2.3644444444444446e-05, |
|
"loss": 0.6679, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.576, |
|
"eval_all-nli-dev_cosine_accuracy": 0.905, |
|
"eval_all-nli-dev_dot_accuracy": 0.094, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.903, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.911, |
|
"eval_all-nli-dev_max_accuracy": 0.911, |
|
"eval_loss": 0.7780482172966003, |
|
"eval_runtime": 4.6847, |
|
"eval_samples_per_second": 213.461, |
|
"eval_steps_per_second": 13.448, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.592, |
|
"grad_norm": 12.241792678833008, |
|
"learning_rate": 2.2755555555555557e-05, |
|
"loss": 0.6609, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 0.592, |
|
"eval_all-nli-dev_cosine_accuracy": 0.893, |
|
"eval_all-nli-dev_dot_accuracy": 0.102, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.893, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.896, |
|
"eval_all-nli-dev_max_accuracy": 0.896, |
|
"eval_loss": 0.812877357006073, |
|
"eval_runtime": 4.685, |
|
"eval_samples_per_second": 213.449, |
|
"eval_steps_per_second": 13.447, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 0.608, |
|
"grad_norm": 23.00703239440918, |
|
"learning_rate": 2.186666666666667e-05, |
|
"loss": 0.687, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 0.608, |
|
"eval_all-nli-dev_cosine_accuracy": 0.913, |
|
"eval_all-nli-dev_dot_accuracy": 0.085, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.905, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.905, |
|
"eval_all-nli-dev_max_accuracy": 0.913, |
|
"eval_loss": 0.7215772271156311, |
|
"eval_runtime": 4.6566, |
|
"eval_samples_per_second": 214.751, |
|
"eval_steps_per_second": 13.529, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 0.624, |
|
"grad_norm": 13.749407768249512, |
|
"learning_rate": 2.097777777777778e-05, |
|
"loss": 0.5725, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 0.624, |
|
"eval_all-nli-dev_cosine_accuracy": 0.912, |
|
"eval_all-nli-dev_dot_accuracy": 0.09, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.908, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.92, |
|
"eval_all-nli-dev_max_accuracy": 0.92, |
|
"eval_loss": 0.7618492841720581, |
|
"eval_runtime": 4.6929, |
|
"eval_samples_per_second": 213.087, |
|
"eval_steps_per_second": 13.424, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 14.7637300491333, |
|
"learning_rate": 2.008888888888889e-05, |
|
"loss": 0.87, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"eval_all-nli-dev_cosine_accuracy": 0.909, |
|
"eval_all-nli-dev_dot_accuracy": 0.086, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.909, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.909, |
|
"eval_all-nli-dev_max_accuracy": 0.909, |
|
"eval_loss": 0.706980288028717, |
|
"eval_runtime": 4.6894, |
|
"eval_samples_per_second": 213.247, |
|
"eval_steps_per_second": 13.435, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.656, |
|
"grad_norm": 16.75938606262207, |
|
"learning_rate": 1.9200000000000003e-05, |
|
"loss": 1.0892, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 0.656, |
|
"eval_all-nli-dev_cosine_accuracy": 0.901, |
|
"eval_all-nli-dev_dot_accuracy": 0.094, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.897, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.899, |
|
"eval_all-nli-dev_max_accuracy": 0.901, |
|
"eval_loss": 0.7424288392066956, |
|
"eval_runtime": 4.6286, |
|
"eval_samples_per_second": 216.046, |
|
"eval_steps_per_second": 13.611, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 0.672, |
|
"grad_norm": 17.46446418762207, |
|
"learning_rate": 1.8311111111111114e-05, |
|
"loss": 1.048, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 0.672, |
|
"eval_all-nli-dev_cosine_accuracy": 0.908, |
|
"eval_all-nli-dev_dot_accuracy": 0.093, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.908, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.909, |
|
"eval_all-nli-dev_max_accuracy": 0.909, |
|
"eval_loss": 0.6750496029853821, |
|
"eval_runtime": 4.641, |
|
"eval_samples_per_second": 215.47, |
|
"eval_steps_per_second": 13.575, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 0.688, |
|
"grad_norm": 18.15054702758789, |
|
"learning_rate": 1.7422222222222222e-05, |
|
"loss": 0.8571, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 0.688, |
|
"eval_all-nli-dev_cosine_accuracy": 0.903, |
|
"eval_all-nli-dev_dot_accuracy": 0.095, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.901, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.902, |
|
"eval_all-nli-dev_max_accuracy": 0.903, |
|
"eval_loss": 0.6474354863166809, |
|
"eval_runtime": 4.7822, |
|
"eval_samples_per_second": 209.111, |
|
"eval_steps_per_second": 13.174, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 0.704, |
|
"grad_norm": 15.00839614868164, |
|
"learning_rate": 1.6533333333333333e-05, |
|
"loss": 0.7945, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 0.704, |
|
"eval_all-nli-dev_cosine_accuracy": 0.908, |
|
"eval_all-nli-dev_dot_accuracy": 0.089, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.91, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.911, |
|
"eval_all-nli-dev_max_accuracy": 0.911, |
|
"eval_loss": 0.6094924211502075, |
|
"eval_runtime": 4.6578, |
|
"eval_samples_per_second": 214.695, |
|
"eval_steps_per_second": 13.526, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 24.833444595336914, |
|
"learning_rate": 1.5644444444444444e-05, |
|
"loss": 0.6717, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"eval_all-nli-dev_cosine_accuracy": 0.93, |
|
"eval_all-nli-dev_dot_accuracy": 0.066, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.923, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.921, |
|
"eval_all-nli-dev_max_accuracy": 0.93, |
|
"eval_loss": 0.5663518309593201, |
|
"eval_runtime": 4.7483, |
|
"eval_samples_per_second": 210.6, |
|
"eval_steps_per_second": 13.268, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.736, |
|
"grad_norm": 1504.730224609375, |
|
"learning_rate": 1.4755555555555556e-05, |
|
"loss": 0.8161, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 0.736, |
|
"eval_all-nli-dev_cosine_accuracy": 0.919, |
|
"eval_all-nli-dev_dot_accuracy": 0.07, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.916, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.918, |
|
"eval_all-nli-dev_max_accuracy": 0.919, |
|
"eval_loss": 0.5479408502578735, |
|
"eval_runtime": 4.6248, |
|
"eval_samples_per_second": 216.224, |
|
"eval_steps_per_second": 13.622, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 0.752, |
|
"grad_norm": 30.87934684753418, |
|
"learning_rate": 1.3866666666666667e-05, |
|
"loss": 0.7917, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 0.752, |
|
"eval_all-nli-dev_cosine_accuracy": 0.911, |
|
"eval_all-nli-dev_dot_accuracy": 0.083, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.909, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.907, |
|
"eval_all-nli-dev_max_accuracy": 0.911, |
|
"eval_loss": 0.6419683694839478, |
|
"eval_runtime": 4.5916, |
|
"eval_samples_per_second": 217.788, |
|
"eval_steps_per_second": 13.721, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 0.768, |
|
"grad_norm": 23.224735260009766, |
|
"learning_rate": 1.2977777777777777e-05, |
|
"loss": 0.7711, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 0.768, |
|
"eval_all-nli-dev_cosine_accuracy": 0.916, |
|
"eval_all-nli-dev_dot_accuracy": 0.078, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.914, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.913, |
|
"eval_all-nli-dev_max_accuracy": 0.916, |
|
"eval_loss": 0.5856308341026306, |
|
"eval_runtime": 4.5755, |
|
"eval_samples_per_second": 218.553, |
|
"eval_steps_per_second": 13.769, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 0.784, |
|
"grad_norm": 13.215950012207031, |
|
"learning_rate": 1.208888888888889e-05, |
|
"loss": 0.6441, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 0.784, |
|
"eval_all-nli-dev_cosine_accuracy": 0.916, |
|
"eval_all-nli-dev_dot_accuracy": 0.079, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.913, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.913, |
|
"eval_all-nli-dev_max_accuracy": 0.916, |
|
"eval_loss": 0.5775041580200195, |
|
"eval_runtime": 4.5681, |
|
"eval_samples_per_second": 218.91, |
|
"eval_steps_per_second": 13.791, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 18.222217559814453, |
|
"learning_rate": 1.1200000000000001e-05, |
|
"loss": 0.7766, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"eval_all-nli-dev_cosine_accuracy": 0.922, |
|
"eval_all-nli-dev_dot_accuracy": 0.077, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.92, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.917, |
|
"eval_all-nli-dev_max_accuracy": 0.922, |
|
"eval_loss": 0.5785014629364014, |
|
"eval_runtime": 4.6158, |
|
"eval_samples_per_second": 216.645, |
|
"eval_steps_per_second": 13.649, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.816, |
|
"grad_norm": 10.330174446105957, |
|
"learning_rate": 1.031111111111111e-05, |
|
"loss": 0.6009, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 0.816, |
|
"eval_all-nli-dev_cosine_accuracy": 0.921, |
|
"eval_all-nli-dev_dot_accuracy": 0.081, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.917, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.919, |
|
"eval_all-nli-dev_max_accuracy": 0.921, |
|
"eval_loss": 0.5679826140403748, |
|
"eval_runtime": 4.5803, |
|
"eval_samples_per_second": 218.325, |
|
"eval_steps_per_second": 13.754, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 0.832, |
|
"grad_norm": 14.917418479919434, |
|
"learning_rate": 9.422222222222222e-06, |
|
"loss": 0.6711, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 0.832, |
|
"eval_all-nli-dev_cosine_accuracy": 0.921, |
|
"eval_all-nli-dev_dot_accuracy": 0.074, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.917, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.92, |
|
"eval_all-nli-dev_max_accuracy": 0.921, |
|
"eval_loss": 0.5487431883811951, |
|
"eval_runtime": 4.6796, |
|
"eval_samples_per_second": 213.694, |
|
"eval_steps_per_second": 13.463, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 0.848, |
|
"grad_norm": 32.70161056518555, |
|
"learning_rate": 8.533333333333334e-06, |
|
"loss": 0.618, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 0.848, |
|
"eval_all-nli-dev_cosine_accuracy": 0.926, |
|
"eval_all-nli-dev_dot_accuracy": 0.074, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.921, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.922, |
|
"eval_all-nli-dev_max_accuracy": 0.926, |
|
"eval_loss": 0.5450394749641418, |
|
"eval_runtime": 4.662, |
|
"eval_samples_per_second": 214.5, |
|
"eval_steps_per_second": 13.513, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 0.864, |
|
"grad_norm": 13.764747619628906, |
|
"learning_rate": 7.644444444444445e-06, |
|
"loss": 0.6702, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 0.864, |
|
"eval_all-nli-dev_cosine_accuracy": 0.926, |
|
"eval_all-nli-dev_dot_accuracy": 0.073, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.921, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.919, |
|
"eval_all-nli-dev_max_accuracy": 0.926, |
|
"eval_loss": 0.5497583150863647, |
|
"eval_runtime": 4.5535, |
|
"eval_samples_per_second": 219.613, |
|
"eval_steps_per_second": 13.836, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 29.501554489135742, |
|
"learning_rate": 6.755555555555555e-06, |
|
"loss": 0.7039, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"eval_all-nli-dev_cosine_accuracy": 0.927, |
|
"eval_all-nli-dev_dot_accuracy": 0.07, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.926, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.922, |
|
"eval_all-nli-dev_max_accuracy": 0.927, |
|
"eval_loss": 0.5191856622695923, |
|
"eval_runtime": 4.5392, |
|
"eval_samples_per_second": 220.305, |
|
"eval_steps_per_second": 13.879, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.896, |
|
"grad_norm": 19.63087272644043, |
|
"learning_rate": 5.866666666666667e-06, |
|
"loss": 0.6114, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 0.896, |
|
"eval_all-nli-dev_cosine_accuracy": 0.932, |
|
"eval_all-nli-dev_dot_accuracy": 0.067, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.931, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.93, |
|
"eval_all-nli-dev_max_accuracy": 0.932, |
|
"eval_loss": 0.5045494437217712, |
|
"eval_runtime": 4.6367, |
|
"eval_samples_per_second": 215.672, |
|
"eval_steps_per_second": 13.587, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 0.912, |
|
"grad_norm": 57.868019104003906, |
|
"learning_rate": 4.977777777777778e-06, |
|
"loss": 0.7761, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 0.912, |
|
"eval_all-nli-dev_cosine_accuracy": 0.934, |
|
"eval_all-nli-dev_dot_accuracy": 0.061, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.931, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.928, |
|
"eval_all-nli-dev_max_accuracy": 0.934, |
|
"eval_loss": 0.5033252835273743, |
|
"eval_runtime": 4.6312, |
|
"eval_samples_per_second": 215.928, |
|
"eval_steps_per_second": 13.603, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 0.928, |
|
"grad_norm": 11.63007640838623, |
|
"learning_rate": 4.088888888888889e-06, |
|
"loss": 0.6248, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 0.928, |
|
"eval_all-nli-dev_cosine_accuracy": 0.932, |
|
"eval_all-nli-dev_dot_accuracy": 0.068, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.926, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.926, |
|
"eval_all-nli-dev_max_accuracy": 0.932, |
|
"eval_loss": 0.5013440251350403, |
|
"eval_runtime": 4.6162, |
|
"eval_samples_per_second": 216.629, |
|
"eval_steps_per_second": 13.648, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 0.944, |
|
"grad_norm": 16.12616539001465, |
|
"learning_rate": 3.2000000000000003e-06, |
|
"loss": 0.8359, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 0.944, |
|
"eval_all-nli-dev_cosine_accuracy": 0.93, |
|
"eval_all-nli-dev_dot_accuracy": 0.07, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.926, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.923, |
|
"eval_all-nli-dev_max_accuracy": 0.93, |
|
"eval_loss": 0.49764034152030945, |
|
"eval_runtime": 4.6817, |
|
"eval_samples_per_second": 213.6, |
|
"eval_steps_per_second": 13.457, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 16.06106185913086, |
|
"learning_rate": 2.311111111111111e-06, |
|
"loss": 0.8764, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"eval_all-nli-dev_cosine_accuracy": 0.936, |
|
"eval_all-nli-dev_dot_accuracy": 0.062, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.928, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.928, |
|
"eval_all-nli-dev_max_accuracy": 0.936, |
|
"eval_loss": 0.49757900834083557, |
|
"eval_runtime": 4.6133, |
|
"eval_samples_per_second": 216.765, |
|
"eval_steps_per_second": 13.656, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.976, |
|
"grad_norm": 0.0003809410845860839, |
|
"learning_rate": 1.4222222222222223e-06, |
|
"loss": 0.763, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 0.976, |
|
"eval_all-nli-dev_cosine_accuracy": 0.935, |
|
"eval_all-nli-dev_dot_accuracy": 0.061, |
|
"eval_all-nli-dev_euclidean_accuracy": 0.93, |
|
"eval_all-nli-dev_manhattan_accuracy": 0.929, |
|
"eval_all-nli-dev_max_accuracy": 0.935, |
|
"eval_loss": 0.48454272747039795, |
|
"eval_runtime": 4.737, |
|
"eval_samples_per_second": 211.106, |
|
"eval_steps_per_second": 13.3, |
|
"step": 6100 |
|
} |
|
], |
|
"logging_steps": 100, |
|
"max_steps": 6250, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 0.0, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|