|
{ |
|
"best_metric": 0.4553995728492737, |
|
"best_model_checkpoint": "./beans_outputs/checkpoint-69", |
|
"epoch": 200.0, |
|
"eval_steps": 500, |
|
"global_step": 4600, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.43478260869565216, |
|
"grad_norm": 9.284521102905273, |
|
"learning_rate": 1.9956521739130435e-05, |
|
"loss": 0.6169, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.8695652173913043, |
|
"grad_norm": 6.047142028808594, |
|
"learning_rate": 1.9913043478260872e-05, |
|
"loss": 0.5242, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_accuracy": 0.7559055118110236, |
|
"eval_loss": 0.4960778057575226, |
|
"eval_runtime": 5.6891, |
|
"eval_samples_per_second": 44.647, |
|
"eval_steps_per_second": 0.703, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 1.3043478260869565, |
|
"grad_norm": 27.046829223632812, |
|
"learning_rate": 1.9869565217391305e-05, |
|
"loss": 0.5153, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 1.7391304347826086, |
|
"grad_norm": 9.23794937133789, |
|
"learning_rate": 1.9826086956521742e-05, |
|
"loss": 0.459, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_accuracy": 0.7637795275590551, |
|
"eval_loss": 0.5001373887062073, |
|
"eval_runtime": 5.3498, |
|
"eval_samples_per_second": 47.478, |
|
"eval_steps_per_second": 0.748, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 2.1739130434782608, |
|
"grad_norm": 13.100432395935059, |
|
"learning_rate": 1.9782608695652176e-05, |
|
"loss": 0.4273, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 2.608695652173913, |
|
"grad_norm": 8.970976829528809, |
|
"learning_rate": 1.973913043478261e-05, |
|
"loss": 0.4429, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_accuracy": 0.7874015748031497, |
|
"eval_loss": 0.4553995728492737, |
|
"eval_runtime": 5.921, |
|
"eval_samples_per_second": 42.899, |
|
"eval_steps_per_second": 0.676, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 3.0434782608695654, |
|
"grad_norm": 18.094818115234375, |
|
"learning_rate": 1.9695652173913046e-05, |
|
"loss": 0.4676, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 3.4782608695652173, |
|
"grad_norm": 23.408973693847656, |
|
"learning_rate": 1.965217391304348e-05, |
|
"loss": 0.4298, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 3.9130434782608696, |
|
"grad_norm": 8.188302993774414, |
|
"learning_rate": 1.9608695652173916e-05, |
|
"loss": 0.4308, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_accuracy": 0.7637795275590551, |
|
"eval_loss": 0.49241384863853455, |
|
"eval_runtime": 5.0789, |
|
"eval_samples_per_second": 50.011, |
|
"eval_steps_per_second": 0.788, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 4.3478260869565215, |
|
"grad_norm": 24.4296932220459, |
|
"learning_rate": 1.956521739130435e-05, |
|
"loss": 0.4158, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 4.782608695652174, |
|
"grad_norm": 6.616312503814697, |
|
"learning_rate": 1.9521739130434786e-05, |
|
"loss": 0.4319, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_accuracy": 0.7874015748031497, |
|
"eval_loss": 0.46732431650161743, |
|
"eval_runtime": 5.2746, |
|
"eval_samples_per_second": 48.155, |
|
"eval_steps_per_second": 0.758, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 5.217391304347826, |
|
"grad_norm": 11.640789031982422, |
|
"learning_rate": 1.947826086956522e-05, |
|
"loss": 0.4422, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 5.6521739130434785, |
|
"grad_norm": 14.8034029006958, |
|
"learning_rate": 1.9434782608695653e-05, |
|
"loss": 0.4047, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_accuracy": 0.7755905511811023, |
|
"eval_loss": 0.4929516315460205, |
|
"eval_runtime": 5.4996, |
|
"eval_samples_per_second": 46.185, |
|
"eval_steps_per_second": 0.727, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 6.086956521739131, |
|
"grad_norm": 17.933513641357422, |
|
"learning_rate": 1.9391304347826087e-05, |
|
"loss": 0.419, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 6.521739130434782, |
|
"grad_norm": 11.223569869995117, |
|
"learning_rate": 1.9347826086956523e-05, |
|
"loss": 0.4235, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 6.956521739130435, |
|
"grad_norm": 9.359394073486328, |
|
"learning_rate": 1.9304347826086957e-05, |
|
"loss": 0.425, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"eval_accuracy": 0.7795275590551181, |
|
"eval_loss": 0.4738819897174835, |
|
"eval_runtime": 5.686, |
|
"eval_samples_per_second": 44.671, |
|
"eval_steps_per_second": 0.703, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 7.391304347826087, |
|
"grad_norm": 13.405791282653809, |
|
"learning_rate": 1.9260869565217394e-05, |
|
"loss": 0.4076, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 7.826086956521739, |
|
"grad_norm": 7.356156826019287, |
|
"learning_rate": 1.9217391304347827e-05, |
|
"loss": 0.4102, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_accuracy": 0.7598425196850394, |
|
"eval_loss": 0.5118146538734436, |
|
"eval_runtime": 5.9498, |
|
"eval_samples_per_second": 42.691, |
|
"eval_steps_per_second": 0.672, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 8.26086956521739, |
|
"grad_norm": 9.282608032226562, |
|
"learning_rate": 1.9173913043478264e-05, |
|
"loss": 0.3981, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 8.695652173913043, |
|
"grad_norm": 8.834388732910156, |
|
"learning_rate": 1.9130434782608697e-05, |
|
"loss": 0.3959, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"eval_accuracy": 0.7480314960629921, |
|
"eval_loss": 0.5490437746047974, |
|
"eval_runtime": 6.1005, |
|
"eval_samples_per_second": 41.636, |
|
"eval_steps_per_second": 0.656, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 9.130434782608695, |
|
"grad_norm": 6.02927303314209, |
|
"learning_rate": 1.9086956521739134e-05, |
|
"loss": 0.4189, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 9.565217391304348, |
|
"grad_norm": 11.13778305053711, |
|
"learning_rate": 1.9043478260869568e-05, |
|
"loss": 0.3864, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"grad_norm": 18.004470825195312, |
|
"learning_rate": 1.9e-05, |
|
"loss": 0.365, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"eval_accuracy": 0.7637795275590551, |
|
"eval_loss": 0.5260568261146545, |
|
"eval_runtime": 6.2538, |
|
"eval_samples_per_second": 40.615, |
|
"eval_steps_per_second": 0.64, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 10.434782608695652, |
|
"grad_norm": 12.497597694396973, |
|
"learning_rate": 1.8956521739130434e-05, |
|
"loss": 0.3614, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 10.869565217391305, |
|
"grad_norm": 17.154403686523438, |
|
"learning_rate": 1.891304347826087e-05, |
|
"loss": 0.4214, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 11.0, |
|
"eval_accuracy": 0.7795275590551181, |
|
"eval_loss": 0.5089420676231384, |
|
"eval_runtime": 5.9748, |
|
"eval_samples_per_second": 42.512, |
|
"eval_steps_per_second": 0.669, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 11.304347826086957, |
|
"grad_norm": 11.148750305175781, |
|
"learning_rate": 1.8869565217391305e-05, |
|
"loss": 0.3476, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 11.73913043478261, |
|
"grad_norm": 18.083158493041992, |
|
"learning_rate": 1.882608695652174e-05, |
|
"loss": 0.3798, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"eval_accuracy": 0.7992125984251969, |
|
"eval_loss": 0.47105538845062256, |
|
"eval_runtime": 6.2283, |
|
"eval_samples_per_second": 40.782, |
|
"eval_steps_per_second": 0.642, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 12.173913043478262, |
|
"grad_norm": 19.7681941986084, |
|
"learning_rate": 1.8782608695652175e-05, |
|
"loss": 0.3819, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 12.608695652173914, |
|
"grad_norm": 16.852922439575195, |
|
"learning_rate": 1.8739130434782612e-05, |
|
"loss": 0.3906, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 13.0, |
|
"eval_accuracy": 0.7913385826771654, |
|
"eval_loss": 0.5035075545310974, |
|
"eval_runtime": 6.1171, |
|
"eval_samples_per_second": 41.523, |
|
"eval_steps_per_second": 0.654, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 13.043478260869565, |
|
"grad_norm": 11.871081352233887, |
|
"learning_rate": 1.8695652173913045e-05, |
|
"loss": 0.3421, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 13.478260869565217, |
|
"grad_norm": 12.804304122924805, |
|
"learning_rate": 1.865217391304348e-05, |
|
"loss": 0.3452, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 13.91304347826087, |
|
"grad_norm": 13.583141326904297, |
|
"learning_rate": 1.8608695652173912e-05, |
|
"loss": 0.3706, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 14.0, |
|
"eval_accuracy": 0.7952755905511811, |
|
"eval_loss": 0.49332255125045776, |
|
"eval_runtime": 6.0462, |
|
"eval_samples_per_second": 42.01, |
|
"eval_steps_per_second": 0.662, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 14.347826086956522, |
|
"grad_norm": 10.307147979736328, |
|
"learning_rate": 1.856521739130435e-05, |
|
"loss": 0.3242, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 14.782608695652174, |
|
"grad_norm": 19.720266342163086, |
|
"learning_rate": 1.8521739130434782e-05, |
|
"loss": 0.3766, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 15.0, |
|
"eval_accuracy": 0.7992125984251969, |
|
"eval_loss": 0.4972522258758545, |
|
"eval_runtime": 5.4928, |
|
"eval_samples_per_second": 46.242, |
|
"eval_steps_per_second": 0.728, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 15.217391304347826, |
|
"grad_norm": 9.444562911987305, |
|
"learning_rate": 1.847826086956522e-05, |
|
"loss": 0.3569, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 15.652173913043478, |
|
"grad_norm": 14.776025772094727, |
|
"learning_rate": 1.8434782608695653e-05, |
|
"loss": 0.3213, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 16.0, |
|
"eval_accuracy": 0.7874015748031497, |
|
"eval_loss": 0.5220504403114319, |
|
"eval_runtime": 5.825, |
|
"eval_samples_per_second": 43.605, |
|
"eval_steps_per_second": 0.687, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 16.08695652173913, |
|
"grad_norm": 35.97469711303711, |
|
"learning_rate": 1.839130434782609e-05, |
|
"loss": 0.3719, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 16.52173913043478, |
|
"grad_norm": 15.621981620788574, |
|
"learning_rate": 1.8347826086956523e-05, |
|
"loss": 0.3303, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 16.956521739130434, |
|
"grad_norm": 12.130711555480957, |
|
"learning_rate": 1.830434782608696e-05, |
|
"loss": 0.329, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 17.0, |
|
"eval_accuracy": 0.7834645669291339, |
|
"eval_loss": 0.5400053262710571, |
|
"eval_runtime": 6.058, |
|
"eval_samples_per_second": 41.928, |
|
"eval_steps_per_second": 0.66, |
|
"step": 391 |
|
}, |
|
{ |
|
"epoch": 17.391304347826086, |
|
"grad_norm": 28.593765258789062, |
|
"learning_rate": 1.8260869565217393e-05, |
|
"loss": 0.3275, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 17.82608695652174, |
|
"grad_norm": 10.756831169128418, |
|
"learning_rate": 1.8217391304347827e-05, |
|
"loss": 0.3427, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 18.0, |
|
"eval_accuracy": 0.7913385826771654, |
|
"eval_loss": 0.5252194404602051, |
|
"eval_runtime": 5.781, |
|
"eval_samples_per_second": 43.937, |
|
"eval_steps_per_second": 0.692, |
|
"step": 414 |
|
}, |
|
{ |
|
"epoch": 18.26086956521739, |
|
"grad_norm": 9.596463203430176, |
|
"learning_rate": 1.8173913043478263e-05, |
|
"loss": 0.3363, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 18.695652173913043, |
|
"grad_norm": 12.830471992492676, |
|
"learning_rate": 1.8130434782608697e-05, |
|
"loss": 0.3472, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 19.0, |
|
"eval_accuracy": 0.7440944881889764, |
|
"eval_loss": 0.6207993626594543, |
|
"eval_runtime": 5.8687, |
|
"eval_samples_per_second": 43.281, |
|
"eval_steps_per_second": 0.682, |
|
"step": 437 |
|
}, |
|
{ |
|
"epoch": 19.130434782608695, |
|
"grad_norm": 11.963924407958984, |
|
"learning_rate": 1.808695652173913e-05, |
|
"loss": 0.3509, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 19.565217391304348, |
|
"grad_norm": 9.393329620361328, |
|
"learning_rate": 1.8043478260869567e-05, |
|
"loss": 0.3149, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"grad_norm": 13.855999946594238, |
|
"learning_rate": 1.8e-05, |
|
"loss": 0.3424, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"eval_accuracy": 0.7795275590551181, |
|
"eval_loss": 0.5319867730140686, |
|
"eval_runtime": 5.8322, |
|
"eval_samples_per_second": 43.551, |
|
"eval_steps_per_second": 0.686, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 20.434782608695652, |
|
"grad_norm": 12.548827171325684, |
|
"learning_rate": 1.7956521739130437e-05, |
|
"loss": 0.322, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 20.869565217391305, |
|
"grad_norm": 12.916594505310059, |
|
"learning_rate": 1.791304347826087e-05, |
|
"loss": 0.3016, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 21.0, |
|
"eval_accuracy": 0.7795275590551181, |
|
"eval_loss": 0.5488443970680237, |
|
"eval_runtime": 6.0683, |
|
"eval_samples_per_second": 41.857, |
|
"eval_steps_per_second": 0.659, |
|
"step": 483 |
|
}, |
|
{ |
|
"epoch": 21.304347826086957, |
|
"grad_norm": 10.366270065307617, |
|
"learning_rate": 1.7869565217391304e-05, |
|
"loss": 0.2733, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 21.73913043478261, |
|
"grad_norm": 11.919926643371582, |
|
"learning_rate": 1.782608695652174e-05, |
|
"loss": 0.3033, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 22.0, |
|
"eval_accuracy": 0.7480314960629921, |
|
"eval_loss": 0.5889012813568115, |
|
"eval_runtime": 6.0978, |
|
"eval_samples_per_second": 41.654, |
|
"eval_steps_per_second": 0.656, |
|
"step": 506 |
|
}, |
|
{ |
|
"epoch": 22.17391304347826, |
|
"grad_norm": 13.522591590881348, |
|
"learning_rate": 1.7782608695652174e-05, |
|
"loss": 0.26, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 22.608695652173914, |
|
"grad_norm": 15.789691925048828, |
|
"learning_rate": 1.773913043478261e-05, |
|
"loss": 0.3083, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 23.0, |
|
"eval_accuracy": 0.7637795275590551, |
|
"eval_loss": 0.6107717156410217, |
|
"eval_runtime": 5.8735, |
|
"eval_samples_per_second": 43.245, |
|
"eval_steps_per_second": 0.681, |
|
"step": 529 |
|
}, |
|
{ |
|
"epoch": 23.043478260869566, |
|
"grad_norm": 13.747518539428711, |
|
"learning_rate": 1.7695652173913045e-05, |
|
"loss": 0.2794, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 23.47826086956522, |
|
"grad_norm": 14.642496109008789, |
|
"learning_rate": 1.765217391304348e-05, |
|
"loss": 0.2862, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 23.91304347826087, |
|
"grad_norm": 16.773733139038086, |
|
"learning_rate": 1.7608695652173915e-05, |
|
"loss": 0.2772, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 24.0, |
|
"eval_accuracy": 0.7480314960629921, |
|
"eval_loss": 0.5844786763191223, |
|
"eval_runtime": 6.1158, |
|
"eval_samples_per_second": 41.532, |
|
"eval_steps_per_second": 0.654, |
|
"step": 552 |
|
}, |
|
{ |
|
"epoch": 24.347826086956523, |
|
"grad_norm": 11.864821434020996, |
|
"learning_rate": 1.756521739130435e-05, |
|
"loss": 0.2693, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 24.782608695652176, |
|
"grad_norm": 19.129663467407227, |
|
"learning_rate": 1.7521739130434785e-05, |
|
"loss": 0.287, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 25.0, |
|
"eval_accuracy": 0.8070866141732284, |
|
"eval_loss": 0.5241626501083374, |
|
"eval_runtime": 6.0464, |
|
"eval_samples_per_second": 42.008, |
|
"eval_steps_per_second": 0.662, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 25.217391304347824, |
|
"grad_norm": 9.217809677124023, |
|
"learning_rate": 1.747826086956522e-05, |
|
"loss": 0.309, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 25.652173913043477, |
|
"grad_norm": 20.24026870727539, |
|
"learning_rate": 1.7434782608695652e-05, |
|
"loss": 0.2651, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 26.0, |
|
"eval_accuracy": 0.7598425196850394, |
|
"eval_loss": 0.6275522112846375, |
|
"eval_runtime": 5.2184, |
|
"eval_samples_per_second": 48.674, |
|
"eval_steps_per_second": 0.767, |
|
"step": 598 |
|
}, |
|
{ |
|
"epoch": 26.08695652173913, |
|
"grad_norm": 14.04963207244873, |
|
"learning_rate": 1.739130434782609e-05, |
|
"loss": 0.2763, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 26.52173913043478, |
|
"grad_norm": 18.499040603637695, |
|
"learning_rate": 1.7347826086956522e-05, |
|
"loss": 0.2944, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 26.956521739130434, |
|
"grad_norm": 14.989178657531738, |
|
"learning_rate": 1.730434782608696e-05, |
|
"loss": 0.2696, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 27.0, |
|
"eval_accuracy": 0.7834645669291339, |
|
"eval_loss": 0.5648530125617981, |
|
"eval_runtime": 5.345, |
|
"eval_samples_per_second": 47.521, |
|
"eval_steps_per_second": 0.748, |
|
"step": 621 |
|
}, |
|
{ |
|
"epoch": 27.391304347826086, |
|
"grad_norm": 24.154279708862305, |
|
"learning_rate": 1.7260869565217393e-05, |
|
"loss": 0.2638, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 27.82608695652174, |
|
"grad_norm": 21.489530563354492, |
|
"learning_rate": 1.721739130434783e-05, |
|
"loss": 0.2701, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 28.0, |
|
"eval_accuracy": 0.7755905511811023, |
|
"eval_loss": 0.6103212237358093, |
|
"eval_runtime": 5.2378, |
|
"eval_samples_per_second": 48.494, |
|
"eval_steps_per_second": 0.764, |
|
"step": 644 |
|
}, |
|
{ |
|
"epoch": 28.26086956521739, |
|
"grad_norm": 7.044363975524902, |
|
"learning_rate": 1.7173913043478263e-05, |
|
"loss": 0.2929, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 28.695652173913043, |
|
"grad_norm": 8.331864356994629, |
|
"learning_rate": 1.71304347826087e-05, |
|
"loss": 0.2451, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 29.0, |
|
"eval_accuracy": 0.7637795275590551, |
|
"eval_loss": 0.6206949949264526, |
|
"eval_runtime": 5.2583, |
|
"eval_samples_per_second": 48.305, |
|
"eval_steps_per_second": 0.761, |
|
"step": 667 |
|
}, |
|
{ |
|
"epoch": 29.130434782608695, |
|
"grad_norm": 14.339310646057129, |
|
"learning_rate": 1.708695652173913e-05, |
|
"loss": 0.238, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 29.565217391304348, |
|
"grad_norm": 17.397079467773438, |
|
"learning_rate": 1.7043478260869566e-05, |
|
"loss": 0.2621, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 30.0, |
|
"grad_norm": 49.991676330566406, |
|
"learning_rate": 1.7e-05, |
|
"loss": 0.2705, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 30.0, |
|
"eval_accuracy": 0.7755905511811023, |
|
"eval_loss": 0.5990407466888428, |
|
"eval_runtime": 5.3203, |
|
"eval_samples_per_second": 47.741, |
|
"eval_steps_per_second": 0.752, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 30.434782608695652, |
|
"grad_norm": 10.006631851196289, |
|
"learning_rate": 1.6956521739130437e-05, |
|
"loss": 0.2313, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 30.869565217391305, |
|
"grad_norm": 19.18036651611328, |
|
"learning_rate": 1.691304347826087e-05, |
|
"loss": 0.2553, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 31.0, |
|
"eval_accuracy": 0.7834645669291339, |
|
"eval_loss": 0.5962309241294861, |
|
"eval_runtime": 5.2875, |
|
"eval_samples_per_second": 48.038, |
|
"eval_steps_per_second": 0.757, |
|
"step": 713 |
|
}, |
|
{ |
|
"epoch": 31.304347826086957, |
|
"grad_norm": 11.844828605651855, |
|
"learning_rate": 1.6869565217391307e-05, |
|
"loss": 0.2392, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 31.73913043478261, |
|
"grad_norm": 28.340103149414062, |
|
"learning_rate": 1.682608695652174e-05, |
|
"loss": 0.2559, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 32.0, |
|
"eval_accuracy": 0.7716535433070866, |
|
"eval_loss": 0.6681433916091919, |
|
"eval_runtime": 5.5265, |
|
"eval_samples_per_second": 45.961, |
|
"eval_steps_per_second": 0.724, |
|
"step": 736 |
|
}, |
|
{ |
|
"epoch": 32.17391304347826, |
|
"grad_norm": 19.246511459350586, |
|
"learning_rate": 1.6782608695652177e-05, |
|
"loss": 0.2441, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 32.608695652173914, |
|
"grad_norm": 14.579903602600098, |
|
"learning_rate": 1.673913043478261e-05, |
|
"loss": 0.2405, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 33.0, |
|
"eval_accuracy": 0.7637795275590551, |
|
"eval_loss": 0.5917338132858276, |
|
"eval_runtime": 5.8004, |
|
"eval_samples_per_second": 43.79, |
|
"eval_steps_per_second": 0.69, |
|
"step": 759 |
|
}, |
|
{ |
|
"epoch": 33.04347826086956, |
|
"grad_norm": 18.550579071044922, |
|
"learning_rate": 1.6695652173913044e-05, |
|
"loss": 0.2983, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 33.47826086956522, |
|
"grad_norm": 12.869200706481934, |
|
"learning_rate": 1.6652173913043477e-05, |
|
"loss": 0.2653, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 33.91304347826087, |
|
"grad_norm": 21.49362564086914, |
|
"learning_rate": 1.6608695652173914e-05, |
|
"loss": 0.2707, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 34.0, |
|
"eval_accuracy": 0.7637795275590551, |
|
"eval_loss": 0.5905888080596924, |
|
"eval_runtime": 6.1194, |
|
"eval_samples_per_second": 41.508, |
|
"eval_steps_per_second": 0.654, |
|
"step": 782 |
|
}, |
|
{ |
|
"epoch": 34.34782608695652, |
|
"grad_norm": 20.466447830200195, |
|
"learning_rate": 1.6565217391304348e-05, |
|
"loss": 0.2314, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 34.78260869565217, |
|
"grad_norm": 14.096879005432129, |
|
"learning_rate": 1.6521739130434785e-05, |
|
"loss": 0.3004, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 35.0, |
|
"eval_accuracy": 0.7874015748031497, |
|
"eval_loss": 0.5905265212059021, |
|
"eval_runtime": 5.7562, |
|
"eval_samples_per_second": 44.126, |
|
"eval_steps_per_second": 0.695, |
|
"step": 805 |
|
}, |
|
{ |
|
"epoch": 35.21739130434783, |
|
"grad_norm": 12.822495460510254, |
|
"learning_rate": 1.6478260869565218e-05, |
|
"loss": 0.2325, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 35.65217391304348, |
|
"grad_norm": 15.958830833435059, |
|
"learning_rate": 1.6434782608695655e-05, |
|
"loss": 0.2404, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 36.0, |
|
"eval_accuracy": 0.7677165354330708, |
|
"eval_loss": 0.5914427042007446, |
|
"eval_runtime": 5.657, |
|
"eval_samples_per_second": 44.9, |
|
"eval_steps_per_second": 0.707, |
|
"step": 828 |
|
}, |
|
{ |
|
"epoch": 36.08695652173913, |
|
"grad_norm": 14.542938232421875, |
|
"learning_rate": 1.6391304347826088e-05, |
|
"loss": 0.235, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 36.52173913043478, |
|
"grad_norm": 36.17627716064453, |
|
"learning_rate": 1.6347826086956525e-05, |
|
"loss": 0.258, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 36.95652173913044, |
|
"grad_norm": 16.92486572265625, |
|
"learning_rate": 1.630434782608696e-05, |
|
"loss": 0.242, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 37.0, |
|
"eval_accuracy": 0.7637795275590551, |
|
"eval_loss": 0.7637115716934204, |
|
"eval_runtime": 5.8801, |
|
"eval_samples_per_second": 43.196, |
|
"eval_steps_per_second": 0.68, |
|
"step": 851 |
|
}, |
|
{ |
|
"epoch": 37.391304347826086, |
|
"grad_norm": 16.343120574951172, |
|
"learning_rate": 1.6260869565217392e-05, |
|
"loss": 0.2075, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 37.82608695652174, |
|
"grad_norm": 26.078245162963867, |
|
"learning_rate": 1.621739130434783e-05, |
|
"loss": 0.2221, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 38.0, |
|
"eval_accuracy": 0.7598425196850394, |
|
"eval_loss": 0.7117211222648621, |
|
"eval_runtime": 5.1769, |
|
"eval_samples_per_second": 49.064, |
|
"eval_steps_per_second": 0.773, |
|
"step": 874 |
|
}, |
|
{ |
|
"epoch": 38.26086956521739, |
|
"grad_norm": 9.526216506958008, |
|
"learning_rate": 1.6173913043478262e-05, |
|
"loss": 0.2285, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 38.69565217391305, |
|
"grad_norm": 14.653572082519531, |
|
"learning_rate": 1.6130434782608696e-05, |
|
"loss": 0.2196, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 39.0, |
|
"eval_accuracy": 0.7834645669291339, |
|
"eval_loss": 0.6442176699638367, |
|
"eval_runtime": 6.1843, |
|
"eval_samples_per_second": 41.072, |
|
"eval_steps_per_second": 0.647, |
|
"step": 897 |
|
}, |
|
{ |
|
"epoch": 39.130434782608695, |
|
"grad_norm": 6.992471694946289, |
|
"learning_rate": 1.6086956521739132e-05, |
|
"loss": 0.2301, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 39.56521739130435, |
|
"grad_norm": 15.566644668579102, |
|
"learning_rate": 1.6043478260869566e-05, |
|
"loss": 0.234, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 40.0, |
|
"grad_norm": 21.968212127685547, |
|
"learning_rate": 1.6000000000000003e-05, |
|
"loss": 0.23, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 40.0, |
|
"eval_accuracy": 0.7716535433070866, |
|
"eval_loss": 0.7011120915412903, |
|
"eval_runtime": 6.1863, |
|
"eval_samples_per_second": 41.059, |
|
"eval_steps_per_second": 0.647, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 40.43478260869565, |
|
"grad_norm": 7.890230655670166, |
|
"learning_rate": 1.5956521739130436e-05, |
|
"loss": 0.2143, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 40.869565217391305, |
|
"grad_norm": 13.847317695617676, |
|
"learning_rate": 1.591304347826087e-05, |
|
"loss": 0.2045, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 41.0, |
|
"eval_accuracy": 0.7598425196850394, |
|
"eval_loss": 0.7821894288063049, |
|
"eval_runtime": 6.1565, |
|
"eval_samples_per_second": 41.257, |
|
"eval_steps_per_second": 0.65, |
|
"step": 943 |
|
}, |
|
{ |
|
"epoch": 41.30434782608695, |
|
"grad_norm": 27.132835388183594, |
|
"learning_rate": 1.5869565217391306e-05, |
|
"loss": 0.2094, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 41.73913043478261, |
|
"grad_norm": 33.14794921875, |
|
"learning_rate": 1.582608695652174e-05, |
|
"loss": 0.2043, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 42.0, |
|
"eval_accuracy": 0.7519685039370079, |
|
"eval_loss": 0.7339490056037903, |
|
"eval_runtime": 6.1427, |
|
"eval_samples_per_second": 41.35, |
|
"eval_steps_per_second": 0.651, |
|
"step": 966 |
|
}, |
|
{ |
|
"epoch": 42.17391304347826, |
|
"grad_norm": 9.211811065673828, |
|
"learning_rate": 1.5782608695652177e-05, |
|
"loss": 0.2468, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 42.608695652173914, |
|
"grad_norm": 18.167051315307617, |
|
"learning_rate": 1.573913043478261e-05, |
|
"loss": 0.2413, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 43.0, |
|
"eval_accuracy": 0.7677165354330708, |
|
"eval_loss": 0.6917413473129272, |
|
"eval_runtime": 6.1303, |
|
"eval_samples_per_second": 41.434, |
|
"eval_steps_per_second": 0.652, |
|
"step": 989 |
|
}, |
|
{ |
|
"epoch": 43.04347826086956, |
|
"grad_norm": 16.584321975708008, |
|
"learning_rate": 1.5695652173913047e-05, |
|
"loss": 0.2255, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 43.47826086956522, |
|
"grad_norm": 13.140791893005371, |
|
"learning_rate": 1.565217391304348e-05, |
|
"loss": 0.216, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 43.91304347826087, |
|
"grad_norm": 16.6044921875, |
|
"learning_rate": 1.5608695652173914e-05, |
|
"loss": 0.2135, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 44.0, |
|
"eval_accuracy": 0.7716535433070866, |
|
"eval_loss": 0.695396363735199, |
|
"eval_runtime": 6.1325, |
|
"eval_samples_per_second": 41.419, |
|
"eval_steps_per_second": 0.652, |
|
"step": 1012 |
|
}, |
|
{ |
|
"epoch": 44.34782608695652, |
|
"grad_norm": 7.823153495788574, |
|
"learning_rate": 1.5565217391304347e-05, |
|
"loss": 0.2036, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 44.78260869565217, |
|
"grad_norm": 26.390613555908203, |
|
"learning_rate": 1.5521739130434784e-05, |
|
"loss": 0.2194, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 45.0, |
|
"eval_accuracy": 0.7795275590551181, |
|
"eval_loss": 0.6728869080543518, |
|
"eval_runtime": 6.0671, |
|
"eval_samples_per_second": 41.865, |
|
"eval_steps_per_second": 0.659, |
|
"step": 1035 |
|
}, |
|
{ |
|
"epoch": 45.21739130434783, |
|
"grad_norm": 22.04627227783203, |
|
"learning_rate": 1.5478260869565217e-05, |
|
"loss": 0.2219, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 45.65217391304348, |
|
"grad_norm": 9.552602767944336, |
|
"learning_rate": 1.5434782608695654e-05, |
|
"loss": 0.211, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 46.0, |
|
"eval_accuracy": 0.7834645669291339, |
|
"eval_loss": 0.6840518712997437, |
|
"eval_runtime": 6.1887, |
|
"eval_samples_per_second": 41.043, |
|
"eval_steps_per_second": 0.646, |
|
"step": 1058 |
|
}, |
|
{ |
|
"epoch": 46.08695652173913, |
|
"grad_norm": 17.527860641479492, |
|
"learning_rate": 1.5391304347826088e-05, |
|
"loss": 0.1984, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 46.52173913043478, |
|
"grad_norm": 8.503457069396973, |
|
"learning_rate": 1.5347826086956524e-05, |
|
"loss": 0.1849, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 46.95652173913044, |
|
"grad_norm": 27.90496826171875, |
|
"learning_rate": 1.5304347826086958e-05, |
|
"loss": 0.2155, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 47.0, |
|
"eval_accuracy": 0.7677165354330708, |
|
"eval_loss": 0.7107857465744019, |
|
"eval_runtime": 7.3514, |
|
"eval_samples_per_second": 34.551, |
|
"eval_steps_per_second": 0.544, |
|
"step": 1081 |
|
}, |
|
{ |
|
"epoch": 47.391304347826086, |
|
"grad_norm": 16.541719436645508, |
|
"learning_rate": 1.5260869565217395e-05, |
|
"loss": 0.2054, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 47.82608695652174, |
|
"grad_norm": 10.804960250854492, |
|
"learning_rate": 1.5217391304347828e-05, |
|
"loss": 0.2231, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 48.0, |
|
"eval_accuracy": 0.7677165354330708, |
|
"eval_loss": 0.6758210062980652, |
|
"eval_runtime": 5.136, |
|
"eval_samples_per_second": 49.455, |
|
"eval_steps_per_second": 0.779, |
|
"step": 1104 |
|
}, |
|
{ |
|
"epoch": 48.26086956521739, |
|
"grad_norm": 13.976753234863281, |
|
"learning_rate": 1.5173913043478262e-05, |
|
"loss": 0.2362, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 48.69565217391305, |
|
"grad_norm": 37.785911560058594, |
|
"learning_rate": 1.5130434782608697e-05, |
|
"loss": 0.2364, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 49.0, |
|
"eval_accuracy": 0.7519685039370079, |
|
"eval_loss": 0.7746602892875671, |
|
"eval_runtime": 5.5686, |
|
"eval_samples_per_second": 45.613, |
|
"eval_steps_per_second": 0.718, |
|
"step": 1127 |
|
}, |
|
{ |
|
"epoch": 49.130434782608695, |
|
"grad_norm": 17.750612258911133, |
|
"learning_rate": 1.5086956521739132e-05, |
|
"loss": 0.2044, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 49.56521739130435, |
|
"grad_norm": 29.463176727294922, |
|
"learning_rate": 1.5043478260869567e-05, |
|
"loss": 0.2272, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 50.0, |
|
"grad_norm": 21.283435821533203, |
|
"learning_rate": 1.5000000000000002e-05, |
|
"loss": 0.222, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 50.0, |
|
"eval_accuracy": 0.7637795275590551, |
|
"eval_loss": 0.7104395627975464, |
|
"eval_runtime": 5.8427, |
|
"eval_samples_per_second": 43.473, |
|
"eval_steps_per_second": 0.685, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 50.43478260869565, |
|
"grad_norm": 13.56606674194336, |
|
"learning_rate": 1.4956521739130436e-05, |
|
"loss": 0.2, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 50.869565217391305, |
|
"grad_norm": 19.158952713012695, |
|
"learning_rate": 1.491304347826087e-05, |
|
"loss": 0.2018, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 51.0, |
|
"eval_accuracy": 0.7952755905511811, |
|
"eval_loss": 0.6884562969207764, |
|
"eval_runtime": 6.9894, |
|
"eval_samples_per_second": 36.341, |
|
"eval_steps_per_second": 0.572, |
|
"step": 1173 |
|
}, |
|
{ |
|
"epoch": 51.30434782608695, |
|
"grad_norm": 15.846772193908691, |
|
"learning_rate": 1.4869565217391306e-05, |
|
"loss": 0.2231, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 51.73913043478261, |
|
"grad_norm": 19.628162384033203, |
|
"learning_rate": 1.4826086956521741e-05, |
|
"loss": 0.219, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 52.0, |
|
"eval_accuracy": 0.7519685039370079, |
|
"eval_loss": 0.7609480023384094, |
|
"eval_runtime": 5.6933, |
|
"eval_samples_per_second": 44.614, |
|
"eval_steps_per_second": 0.703, |
|
"step": 1196 |
|
}, |
|
{ |
|
"epoch": 52.17391304347826, |
|
"grad_norm": 13.08530044555664, |
|
"learning_rate": 1.4782608695652174e-05, |
|
"loss": 0.1924, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 52.608695652173914, |
|
"grad_norm": 18.541603088378906, |
|
"learning_rate": 1.473913043478261e-05, |
|
"loss": 0.1916, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 53.0, |
|
"eval_accuracy": 0.7677165354330708, |
|
"eval_loss": 0.83939129114151, |
|
"eval_runtime": 5.8715, |
|
"eval_samples_per_second": 43.26, |
|
"eval_steps_per_second": 0.681, |
|
"step": 1219 |
|
}, |
|
{ |
|
"epoch": 53.04347826086956, |
|
"grad_norm": 25.077634811401367, |
|
"learning_rate": 1.4695652173913045e-05, |
|
"loss": 0.2072, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 53.47826086956522, |
|
"grad_norm": 13.322381973266602, |
|
"learning_rate": 1.465217391304348e-05, |
|
"loss": 0.1981, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 53.91304347826087, |
|
"grad_norm": 37.27448654174805, |
|
"learning_rate": 1.4608695652173915e-05, |
|
"loss": 0.1767, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 54.0, |
|
"eval_accuracy": 0.7716535433070866, |
|
"eval_loss": 0.7909868955612183, |
|
"eval_runtime": 5.8932, |
|
"eval_samples_per_second": 43.101, |
|
"eval_steps_per_second": 0.679, |
|
"step": 1242 |
|
}, |
|
{ |
|
"epoch": 54.34782608695652, |
|
"grad_norm": 7.898425579071045, |
|
"learning_rate": 1.456521739130435e-05, |
|
"loss": 0.1524, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 54.78260869565217, |
|
"grad_norm": 23.582290649414062, |
|
"learning_rate": 1.4521739130434785e-05, |
|
"loss": 0.236, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 55.0, |
|
"eval_accuracy": 0.7755905511811023, |
|
"eval_loss": 0.7601104974746704, |
|
"eval_runtime": 5.1002, |
|
"eval_samples_per_second": 49.802, |
|
"eval_steps_per_second": 0.784, |
|
"step": 1265 |
|
}, |
|
{ |
|
"epoch": 55.21739130434783, |
|
"grad_norm": 18.55949592590332, |
|
"learning_rate": 1.447826086956522e-05, |
|
"loss": 0.2053, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 55.65217391304348, |
|
"grad_norm": 11.066632270812988, |
|
"learning_rate": 1.4434782608695654e-05, |
|
"loss": 0.1898, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 56.0, |
|
"eval_accuracy": 0.7716535433070866, |
|
"eval_loss": 0.7501403093338013, |
|
"eval_runtime": 6.1846, |
|
"eval_samples_per_second": 41.07, |
|
"eval_steps_per_second": 0.647, |
|
"step": 1288 |
|
}, |
|
{ |
|
"epoch": 56.08695652173913, |
|
"grad_norm": 13.702197074890137, |
|
"learning_rate": 1.4391304347826087e-05, |
|
"loss": 0.1817, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 56.52173913043478, |
|
"grad_norm": 12.48442268371582, |
|
"learning_rate": 1.4347826086956522e-05, |
|
"loss": 0.2184, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 56.95652173913044, |
|
"grad_norm": 12.403263092041016, |
|
"learning_rate": 1.4304347826086957e-05, |
|
"loss": 0.1876, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 57.0, |
|
"eval_accuracy": 0.7755905511811023, |
|
"eval_loss": 0.7492194175720215, |
|
"eval_runtime": 5.4019, |
|
"eval_samples_per_second": 47.02, |
|
"eval_steps_per_second": 0.74, |
|
"step": 1311 |
|
}, |
|
{ |
|
"epoch": 57.391304347826086, |
|
"grad_norm": 46.26993179321289, |
|
"learning_rate": 1.4260869565217392e-05, |
|
"loss": 0.2015, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 57.82608695652174, |
|
"grad_norm": 18.550594329833984, |
|
"learning_rate": 1.4217391304347828e-05, |
|
"loss": 0.1592, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 58.0, |
|
"eval_accuracy": 0.7637795275590551, |
|
"eval_loss": 0.7905071377754211, |
|
"eval_runtime": 5.5787, |
|
"eval_samples_per_second": 45.531, |
|
"eval_steps_per_second": 0.717, |
|
"step": 1334 |
|
}, |
|
{ |
|
"epoch": 58.26086956521739, |
|
"grad_norm": 5.595022201538086, |
|
"learning_rate": 1.4173913043478263e-05, |
|
"loss": 0.1851, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 58.69565217391305, |
|
"grad_norm": 12.843214988708496, |
|
"learning_rate": 1.4130434782608698e-05, |
|
"loss": 0.1772, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 59.0, |
|
"eval_accuracy": 0.7716535433070866, |
|
"eval_loss": 0.7410995364189148, |
|
"eval_runtime": 5.5315, |
|
"eval_samples_per_second": 45.919, |
|
"eval_steps_per_second": 0.723, |
|
"step": 1357 |
|
}, |
|
{ |
|
"epoch": 59.130434782608695, |
|
"grad_norm": 15.449782371520996, |
|
"learning_rate": 1.4086956521739133e-05, |
|
"loss": 0.2286, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 59.56521739130435, |
|
"grad_norm": 13.496415138244629, |
|
"learning_rate": 1.4043478260869568e-05, |
|
"loss": 0.1659, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 60.0, |
|
"grad_norm": 18.721599578857422, |
|
"learning_rate": 1.4e-05, |
|
"loss": 0.1787, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 60.0, |
|
"eval_accuracy": 0.7795275590551181, |
|
"eval_loss": 0.8145360946655273, |
|
"eval_runtime": 5.7016, |
|
"eval_samples_per_second": 44.549, |
|
"eval_steps_per_second": 0.702, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 60.43478260869565, |
|
"grad_norm": 10.89456558227539, |
|
"learning_rate": 1.3956521739130435e-05, |
|
"loss": 0.1598, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 60.869565217391305, |
|
"grad_norm": 13.629638671875, |
|
"learning_rate": 1.391304347826087e-05, |
|
"loss": 0.1782, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 61.0, |
|
"eval_accuracy": 0.7795275590551181, |
|
"eval_loss": 0.7721127271652222, |
|
"eval_runtime": 5.8504, |
|
"eval_samples_per_second": 43.416, |
|
"eval_steps_per_second": 0.684, |
|
"step": 1403 |
|
}, |
|
{ |
|
"epoch": 61.30434782608695, |
|
"grad_norm": 17.057193756103516, |
|
"learning_rate": 1.3869565217391305e-05, |
|
"loss": 0.1892, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 61.73913043478261, |
|
"grad_norm": 18.26725196838379, |
|
"learning_rate": 1.382608695652174e-05, |
|
"loss": 0.1781, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 62.0, |
|
"eval_accuracy": 0.7834645669291339, |
|
"eval_loss": 0.8022087216377258, |
|
"eval_runtime": 5.9987, |
|
"eval_samples_per_second": 42.342, |
|
"eval_steps_per_second": 0.667, |
|
"step": 1426 |
|
}, |
|
{ |
|
"epoch": 62.17391304347826, |
|
"grad_norm": 23.26697540283203, |
|
"learning_rate": 1.3782608695652175e-05, |
|
"loss": 0.1886, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 62.608695652173914, |
|
"grad_norm": 16.03859519958496, |
|
"learning_rate": 1.373913043478261e-05, |
|
"loss": 0.1884, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 63.0, |
|
"eval_accuracy": 0.7755905511811023, |
|
"eval_loss": 0.8629626035690308, |
|
"eval_runtime": 5.976, |
|
"eval_samples_per_second": 42.504, |
|
"eval_steps_per_second": 0.669, |
|
"step": 1449 |
|
}, |
|
{ |
|
"epoch": 63.04347826086956, |
|
"grad_norm": 14.661111831665039, |
|
"learning_rate": 1.3695652173913046e-05, |
|
"loss": 0.1762, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 63.47826086956522, |
|
"grad_norm": 11.116896629333496, |
|
"learning_rate": 1.3652173913043479e-05, |
|
"loss": 0.162, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 63.91304347826087, |
|
"grad_norm": 17.106279373168945, |
|
"learning_rate": 1.3608695652173913e-05, |
|
"loss": 0.1905, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 64.0, |
|
"eval_accuracy": 0.7952755905511811, |
|
"eval_loss": 0.7472063899040222, |
|
"eval_runtime": 5.0119, |
|
"eval_samples_per_second": 50.68, |
|
"eval_steps_per_second": 0.798, |
|
"step": 1472 |
|
}, |
|
{ |
|
"epoch": 64.34782608695652, |
|
"grad_norm": 17.491132736206055, |
|
"learning_rate": 1.3565217391304348e-05, |
|
"loss": 0.1824, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 64.78260869565217, |
|
"grad_norm": 14.771021842956543, |
|
"learning_rate": 1.3521739130434783e-05, |
|
"loss": 0.16, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 65.0, |
|
"eval_accuracy": 0.7874015748031497, |
|
"eval_loss": 0.7761121988296509, |
|
"eval_runtime": 6.264, |
|
"eval_samples_per_second": 40.549, |
|
"eval_steps_per_second": 0.639, |
|
"step": 1495 |
|
}, |
|
{ |
|
"epoch": 65.21739130434783, |
|
"grad_norm": 11.443580627441406, |
|
"learning_rate": 1.3478260869565218e-05, |
|
"loss": 0.1691, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 65.65217391304348, |
|
"grad_norm": 9.551299095153809, |
|
"learning_rate": 1.3434782608695653e-05, |
|
"loss": 0.1619, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 66.0, |
|
"eval_accuracy": 0.7795275590551181, |
|
"eval_loss": 0.858647882938385, |
|
"eval_runtime": 5.0805, |
|
"eval_samples_per_second": 49.995, |
|
"eval_steps_per_second": 0.787, |
|
"step": 1518 |
|
}, |
|
{ |
|
"epoch": 66.08695652173913, |
|
"grad_norm": 6.219148635864258, |
|
"learning_rate": 1.3391304347826088e-05, |
|
"loss": 0.1517, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 66.52173913043478, |
|
"grad_norm": 18.751920700073242, |
|
"learning_rate": 1.3347826086956523e-05, |
|
"loss": 0.1635, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 66.95652173913044, |
|
"grad_norm": 8.722993850708008, |
|
"learning_rate": 1.3304347826086958e-05, |
|
"loss": 0.1768, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 67.0, |
|
"eval_accuracy": 0.7834645669291339, |
|
"eval_loss": 0.7699764966964722, |
|
"eval_runtime": 5.8289, |
|
"eval_samples_per_second": 43.576, |
|
"eval_steps_per_second": 0.686, |
|
"step": 1541 |
|
}, |
|
{ |
|
"epoch": 67.3913043478261, |
|
"grad_norm": 16.816171646118164, |
|
"learning_rate": 1.3260869565217392e-05, |
|
"loss": 0.184, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 67.82608695652173, |
|
"grad_norm": 10.644329071044922, |
|
"learning_rate": 1.3217391304347827e-05, |
|
"loss": 0.1395, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 68.0, |
|
"eval_accuracy": 0.7716535433070866, |
|
"eval_loss": 0.8326212763786316, |
|
"eval_runtime": 6.0684, |
|
"eval_samples_per_second": 41.856, |
|
"eval_steps_per_second": 0.659, |
|
"step": 1564 |
|
}, |
|
{ |
|
"epoch": 68.26086956521739, |
|
"grad_norm": 13.280193328857422, |
|
"learning_rate": 1.3173913043478262e-05, |
|
"loss": 0.1954, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 68.69565217391305, |
|
"grad_norm": 15.933572769165039, |
|
"learning_rate": 1.3130434782608697e-05, |
|
"loss": 0.1536, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 69.0, |
|
"eval_accuracy": 0.7755905511811023, |
|
"eval_loss": 0.8442131280899048, |
|
"eval_runtime": 5.1134, |
|
"eval_samples_per_second": 49.673, |
|
"eval_steps_per_second": 0.782, |
|
"step": 1587 |
|
}, |
|
{ |
|
"epoch": 69.1304347826087, |
|
"grad_norm": 8.563169479370117, |
|
"learning_rate": 1.308695652173913e-05, |
|
"loss": 0.1859, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 69.56521739130434, |
|
"grad_norm": 52.772857666015625, |
|
"learning_rate": 1.3043478260869566e-05, |
|
"loss": 0.1987, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 70.0, |
|
"grad_norm": 40.454124450683594, |
|
"learning_rate": 1.3000000000000001e-05, |
|
"loss": 0.208, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 70.0, |
|
"eval_accuracy": 0.7677165354330708, |
|
"eval_loss": 0.9288918972015381, |
|
"eval_runtime": 6.0792, |
|
"eval_samples_per_second": 41.782, |
|
"eval_steps_per_second": 0.658, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 70.43478260869566, |
|
"grad_norm": 6.047575950622559, |
|
"learning_rate": 1.2956521739130436e-05, |
|
"loss": 0.1494, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 70.8695652173913, |
|
"grad_norm": 24.65239143371582, |
|
"learning_rate": 1.2913043478260871e-05, |
|
"loss": 0.1783, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 71.0, |
|
"eval_accuracy": 0.7637795275590551, |
|
"eval_loss": 0.9022064805030823, |
|
"eval_runtime": 5.9906, |
|
"eval_samples_per_second": 42.4, |
|
"eval_steps_per_second": 0.668, |
|
"step": 1633 |
|
}, |
|
{ |
|
"epoch": 71.30434782608695, |
|
"grad_norm": 10.55053424835205, |
|
"learning_rate": 1.2869565217391305e-05, |
|
"loss": 0.1737, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 71.73913043478261, |
|
"grad_norm": 10.525084495544434, |
|
"learning_rate": 1.282608695652174e-05, |
|
"loss": 0.1572, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 72.0, |
|
"eval_accuracy": 0.7677165354330708, |
|
"eval_loss": 0.8510112166404724, |
|
"eval_runtime": 5.7557, |
|
"eval_samples_per_second": 44.13, |
|
"eval_steps_per_second": 0.695, |
|
"step": 1656 |
|
}, |
|
{ |
|
"epoch": 72.17391304347827, |
|
"grad_norm": 8.981342315673828, |
|
"learning_rate": 1.2782608695652175e-05, |
|
"loss": 0.1407, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 72.6086956521739, |
|
"grad_norm": 10.919659614562988, |
|
"learning_rate": 1.273913043478261e-05, |
|
"loss": 0.1349, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 73.0, |
|
"eval_accuracy": 0.7677165354330708, |
|
"eval_loss": 0.7962220907211304, |
|
"eval_runtime": 6.421, |
|
"eval_samples_per_second": 39.558, |
|
"eval_steps_per_second": 0.623, |
|
"step": 1679 |
|
}, |
|
{ |
|
"epoch": 73.04347826086956, |
|
"grad_norm": 16.983463287353516, |
|
"learning_rate": 1.2695652173913045e-05, |
|
"loss": 0.1981, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 73.47826086956522, |
|
"grad_norm": 4.827481269836426, |
|
"learning_rate": 1.265217391304348e-05, |
|
"loss": 0.1354, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 73.91304347826087, |
|
"grad_norm": 7.88926887512207, |
|
"learning_rate": 1.2608695652173915e-05, |
|
"loss": 0.148, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 74.0, |
|
"eval_accuracy": 0.7755905511811023, |
|
"eval_loss": 0.8641188144683838, |
|
"eval_runtime": 6.5823, |
|
"eval_samples_per_second": 38.588, |
|
"eval_steps_per_second": 0.608, |
|
"step": 1702 |
|
}, |
|
{ |
|
"epoch": 74.34782608695652, |
|
"grad_norm": 15.213589668273926, |
|
"learning_rate": 1.2565217391304349e-05, |
|
"loss": 0.1763, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 74.78260869565217, |
|
"grad_norm": 10.891641616821289, |
|
"learning_rate": 1.2521739130434784e-05, |
|
"loss": 0.1768, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 75.0, |
|
"eval_accuracy": 0.7677165354330708, |
|
"eval_loss": 0.9277058839797974, |
|
"eval_runtime": 5.6008, |
|
"eval_samples_per_second": 45.351, |
|
"eval_steps_per_second": 0.714, |
|
"step": 1725 |
|
}, |
|
{ |
|
"epoch": 75.21739130434783, |
|
"grad_norm": 17.890514373779297, |
|
"learning_rate": 1.2478260869565217e-05, |
|
"loss": 0.1467, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 75.65217391304348, |
|
"grad_norm": 19.329357147216797, |
|
"learning_rate": 1.2434782608695652e-05, |
|
"loss": 0.1833, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 76.0, |
|
"eval_accuracy": 0.7637795275590551, |
|
"eval_loss": 0.8663110136985779, |
|
"eval_runtime": 5.6029, |
|
"eval_samples_per_second": 45.334, |
|
"eval_steps_per_second": 0.714, |
|
"step": 1748 |
|
}, |
|
{ |
|
"epoch": 76.08695652173913, |
|
"grad_norm": 21.218746185302734, |
|
"learning_rate": 1.2391304347826088e-05, |
|
"loss": 0.1451, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 76.52173913043478, |
|
"grad_norm": 14.454313278198242, |
|
"learning_rate": 1.2347826086956523e-05, |
|
"loss": 0.1568, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 76.95652173913044, |
|
"grad_norm": 17.544715881347656, |
|
"learning_rate": 1.2304347826086958e-05, |
|
"loss": 0.1696, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 77.0, |
|
"eval_accuracy": 0.7755905511811023, |
|
"eval_loss": 0.8302040696144104, |
|
"eval_runtime": 5.0767, |
|
"eval_samples_per_second": 50.032, |
|
"eval_steps_per_second": 0.788, |
|
"step": 1771 |
|
}, |
|
{ |
|
"epoch": 77.3913043478261, |
|
"grad_norm": 23.46592903137207, |
|
"learning_rate": 1.2260869565217393e-05, |
|
"loss": 0.1504, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 77.82608695652173, |
|
"grad_norm": 10.731809616088867, |
|
"learning_rate": 1.2217391304347828e-05, |
|
"loss": 0.1577, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 78.0, |
|
"eval_accuracy": 0.7637795275590551, |
|
"eval_loss": 0.8575807213783264, |
|
"eval_runtime": 4.9965, |
|
"eval_samples_per_second": 50.836, |
|
"eval_steps_per_second": 0.801, |
|
"step": 1794 |
|
}, |
|
{ |
|
"epoch": 78.26086956521739, |
|
"grad_norm": 13.635187149047852, |
|
"learning_rate": 1.2173913043478263e-05, |
|
"loss": 0.1316, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 78.69565217391305, |
|
"grad_norm": 13.30216121673584, |
|
"learning_rate": 1.2130434782608698e-05, |
|
"loss": 0.1724, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 79.0, |
|
"eval_accuracy": 0.7598425196850394, |
|
"eval_loss": 0.8651734590530396, |
|
"eval_runtime": 5.1165, |
|
"eval_samples_per_second": 49.643, |
|
"eval_steps_per_second": 0.782, |
|
"step": 1817 |
|
}, |
|
{ |
|
"epoch": 79.1304347826087, |
|
"grad_norm": 12.134591102600098, |
|
"learning_rate": 1.208695652173913e-05, |
|
"loss": 0.1608, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 79.56521739130434, |
|
"grad_norm": 11.177396774291992, |
|
"learning_rate": 1.2043478260869565e-05, |
|
"loss": 0.1615, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 80.0, |
|
"grad_norm": 9.49387264251709, |
|
"learning_rate": 1.2e-05, |
|
"loss": 0.1525, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 80.0, |
|
"eval_accuracy": 0.7716535433070866, |
|
"eval_loss": 0.8567054271697998, |
|
"eval_runtime": 5.5937, |
|
"eval_samples_per_second": 45.408, |
|
"eval_steps_per_second": 0.715, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 80.43478260869566, |
|
"grad_norm": 7.956330299377441, |
|
"learning_rate": 1.1956521739130435e-05, |
|
"loss": 0.1478, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 80.8695652173913, |
|
"grad_norm": 14.717869758605957, |
|
"learning_rate": 1.191304347826087e-05, |
|
"loss": 0.158, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 81.0, |
|
"eval_accuracy": 0.7598425196850394, |
|
"eval_loss": 0.9139416217803955, |
|
"eval_runtime": 5.5012, |
|
"eval_samples_per_second": 46.172, |
|
"eval_steps_per_second": 0.727, |
|
"step": 1863 |
|
}, |
|
{ |
|
"epoch": 81.30434782608695, |
|
"grad_norm": 9.091207504272461, |
|
"learning_rate": 1.1869565217391306e-05, |
|
"loss": 0.1738, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 81.73913043478261, |
|
"grad_norm": 13.691263198852539, |
|
"learning_rate": 1.182608695652174e-05, |
|
"loss": 0.1639, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 82.0, |
|
"eval_accuracy": 0.7519685039370079, |
|
"eval_loss": 0.9688799977302551, |
|
"eval_runtime": 5.0902, |
|
"eval_samples_per_second": 49.9, |
|
"eval_steps_per_second": 0.786, |
|
"step": 1886 |
|
}, |
|
{ |
|
"epoch": 82.17391304347827, |
|
"grad_norm": 17.235044479370117, |
|
"learning_rate": 1.1782608695652176e-05, |
|
"loss": 0.1849, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 82.6086956521739, |
|
"grad_norm": 22.74346923828125, |
|
"learning_rate": 1.1739130434782611e-05, |
|
"loss": 0.1424, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 83.0, |
|
"eval_accuracy": 0.7637795275590551, |
|
"eval_loss": 0.9698485136032104, |
|
"eval_runtime": 4.9865, |
|
"eval_samples_per_second": 50.938, |
|
"eval_steps_per_second": 0.802, |
|
"step": 1909 |
|
}, |
|
{ |
|
"epoch": 83.04347826086956, |
|
"grad_norm": 16.867551803588867, |
|
"learning_rate": 1.1695652173913043e-05, |
|
"loss": 0.1463, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 83.47826086956522, |
|
"grad_norm": 5.856212615966797, |
|
"learning_rate": 1.1652173913043478e-05, |
|
"loss": 0.1382, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 83.91304347826087, |
|
"grad_norm": 14.235991477966309, |
|
"learning_rate": 1.1608695652173913e-05, |
|
"loss": 0.1224, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 84.0, |
|
"eval_accuracy": 0.7716535433070866, |
|
"eval_loss": 1.0238784551620483, |
|
"eval_runtime": 5.1148, |
|
"eval_samples_per_second": 49.66, |
|
"eval_steps_per_second": 0.782, |
|
"step": 1932 |
|
}, |
|
{ |
|
"epoch": 84.34782608695652, |
|
"grad_norm": 5.4066925048828125, |
|
"learning_rate": 1.1565217391304348e-05, |
|
"loss": 0.1361, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 84.78260869565217, |
|
"grad_norm": 9.836471557617188, |
|
"learning_rate": 1.1521739130434783e-05, |
|
"loss": 0.1765, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 85.0, |
|
"eval_accuracy": 0.7795275590551181, |
|
"eval_loss": 0.9071526527404785, |
|
"eval_runtime": 6.2721, |
|
"eval_samples_per_second": 40.497, |
|
"eval_steps_per_second": 0.638, |
|
"step": 1955 |
|
}, |
|
{ |
|
"epoch": 85.21739130434783, |
|
"grad_norm": 17.78875160217285, |
|
"learning_rate": 1.1478260869565218e-05, |
|
"loss": 0.1696, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 85.65217391304348, |
|
"grad_norm": 12.22424602508545, |
|
"learning_rate": 1.1434782608695654e-05, |
|
"loss": 0.1726, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 86.0, |
|
"eval_accuracy": 0.7519685039370079, |
|
"eval_loss": 0.9436205625534058, |
|
"eval_runtime": 5.3198, |
|
"eval_samples_per_second": 47.746, |
|
"eval_steps_per_second": 0.752, |
|
"step": 1978 |
|
}, |
|
{ |
|
"epoch": 86.08695652173913, |
|
"grad_norm": 18.984779357910156, |
|
"learning_rate": 1.1391304347826089e-05, |
|
"loss": 0.1716, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 86.52173913043478, |
|
"grad_norm": 12.640647888183594, |
|
"learning_rate": 1.1347826086956524e-05, |
|
"loss": 0.153, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 86.95652173913044, |
|
"grad_norm": 20.94854736328125, |
|
"learning_rate": 1.1304347826086957e-05, |
|
"loss": 0.1584, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 87.0, |
|
"eval_accuracy": 0.7637795275590551, |
|
"eval_loss": 0.8775471448898315, |
|
"eval_runtime": 5.2147, |
|
"eval_samples_per_second": 48.708, |
|
"eval_steps_per_second": 0.767, |
|
"step": 2001 |
|
}, |
|
{ |
|
"epoch": 87.3913043478261, |
|
"grad_norm": 16.81257438659668, |
|
"learning_rate": 1.1260869565217392e-05, |
|
"loss": 0.1945, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 87.82608695652173, |
|
"grad_norm": 7.593163013458252, |
|
"learning_rate": 1.1217391304347827e-05, |
|
"loss": 0.164, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 88.0, |
|
"eval_accuracy": 0.7716535433070866, |
|
"eval_loss": 0.8591587543487549, |
|
"eval_runtime": 6.3339, |
|
"eval_samples_per_second": 40.102, |
|
"eval_steps_per_second": 0.632, |
|
"step": 2024 |
|
}, |
|
{ |
|
"epoch": 88.26086956521739, |
|
"grad_norm": 27.010921478271484, |
|
"learning_rate": 1.1173913043478261e-05, |
|
"loss": 0.1514, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 88.69565217391305, |
|
"grad_norm": 11.01710033416748, |
|
"learning_rate": 1.1130434782608696e-05, |
|
"loss": 0.1682, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 89.0, |
|
"eval_accuracy": 0.7637795275590551, |
|
"eval_loss": 0.9051375389099121, |
|
"eval_runtime": 6.2969, |
|
"eval_samples_per_second": 40.337, |
|
"eval_steps_per_second": 0.635, |
|
"step": 2047 |
|
}, |
|
{ |
|
"epoch": 89.1304347826087, |
|
"grad_norm": 7.84626579284668, |
|
"learning_rate": 1.1086956521739131e-05, |
|
"loss": 0.1364, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 89.56521739130434, |
|
"grad_norm": 10.766637802124023, |
|
"learning_rate": 1.1043478260869566e-05, |
|
"loss": 0.2003, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 90.0, |
|
"grad_norm": 16.38211441040039, |
|
"learning_rate": 1.1000000000000001e-05, |
|
"loss": 0.1455, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 90.0, |
|
"eval_accuracy": 0.7716535433070866, |
|
"eval_loss": 1.0020496845245361, |
|
"eval_runtime": 6.3353, |
|
"eval_samples_per_second": 40.093, |
|
"eval_steps_per_second": 0.631, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 90.43478260869566, |
|
"grad_norm": 8.20938491821289, |
|
"learning_rate": 1.0956521739130435e-05, |
|
"loss": 0.1372, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 90.8695652173913, |
|
"grad_norm": 29.13675880432129, |
|
"learning_rate": 1.091304347826087e-05, |
|
"loss": 0.1596, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 91.0, |
|
"eval_accuracy": 0.7677165354330708, |
|
"eval_loss": 0.9422626495361328, |
|
"eval_runtime": 5.3345, |
|
"eval_samples_per_second": 47.614, |
|
"eval_steps_per_second": 0.75, |
|
"step": 2093 |
|
}, |
|
{ |
|
"epoch": 91.30434782608695, |
|
"grad_norm": 16.11428451538086, |
|
"learning_rate": 1.0869565217391305e-05, |
|
"loss": 0.1704, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 91.73913043478261, |
|
"grad_norm": 11.314476013183594, |
|
"learning_rate": 1.082608695652174e-05, |
|
"loss": 0.1667, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 92.0, |
|
"eval_accuracy": 0.7637795275590551, |
|
"eval_loss": 0.9586116671562195, |
|
"eval_runtime": 6.2772, |
|
"eval_samples_per_second": 40.464, |
|
"eval_steps_per_second": 0.637, |
|
"step": 2116 |
|
}, |
|
{ |
|
"epoch": 92.17391304347827, |
|
"grad_norm": 24.264659881591797, |
|
"learning_rate": 1.0782608695652175e-05, |
|
"loss": 0.1439, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 92.6086956521739, |
|
"grad_norm": 13.49873161315918, |
|
"learning_rate": 1.073913043478261e-05, |
|
"loss": 0.132, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 93.0, |
|
"eval_accuracy": 0.7637795275590551, |
|
"eval_loss": 0.9890474081039429, |
|
"eval_runtime": 6.376, |
|
"eval_samples_per_second": 39.837, |
|
"eval_steps_per_second": 0.627, |
|
"step": 2139 |
|
}, |
|
{ |
|
"epoch": 93.04347826086956, |
|
"grad_norm": 14.934507369995117, |
|
"learning_rate": 1.0695652173913046e-05, |
|
"loss": 0.1715, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 93.47826086956522, |
|
"grad_norm": 7.096430778503418, |
|
"learning_rate": 1.0652173913043479e-05, |
|
"loss": 0.1454, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 93.91304347826087, |
|
"grad_norm": 15.230493545532227, |
|
"learning_rate": 1.0608695652173914e-05, |
|
"loss": 0.1335, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 94.0, |
|
"eval_accuracy": 0.7716535433070866, |
|
"eval_loss": 0.9922319054603577, |
|
"eval_runtime": 5.2755, |
|
"eval_samples_per_second": 48.147, |
|
"eval_steps_per_second": 0.758, |
|
"step": 2162 |
|
}, |
|
{ |
|
"epoch": 94.34782608695652, |
|
"grad_norm": 17.714771270751953, |
|
"learning_rate": 1.0565217391304348e-05, |
|
"loss": 0.1593, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 94.78260869565217, |
|
"grad_norm": 33.39093780517578, |
|
"learning_rate": 1.0521739130434783e-05, |
|
"loss": 0.1538, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 95.0, |
|
"eval_accuracy": 0.7519685039370079, |
|
"eval_loss": 0.9534251093864441, |
|
"eval_runtime": 5.5144, |
|
"eval_samples_per_second": 46.061, |
|
"eval_steps_per_second": 0.725, |
|
"step": 2185 |
|
}, |
|
{ |
|
"epoch": 95.21739130434783, |
|
"grad_norm": 17.226415634155273, |
|
"learning_rate": 1.0478260869565218e-05, |
|
"loss": 0.1422, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 95.65217391304348, |
|
"grad_norm": 17.667068481445312, |
|
"learning_rate": 1.0434782608695653e-05, |
|
"loss": 0.1288, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 96.0, |
|
"eval_accuracy": 0.7480314960629921, |
|
"eval_loss": 1.0713515281677246, |
|
"eval_runtime": 5.5478, |
|
"eval_samples_per_second": 45.784, |
|
"eval_steps_per_second": 0.721, |
|
"step": 2208 |
|
}, |
|
{ |
|
"epoch": 96.08695652173913, |
|
"grad_norm": 10.762154579162598, |
|
"learning_rate": 1.0391304347826088e-05, |
|
"loss": 0.1204, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 96.52173913043478, |
|
"grad_norm": 31.42228126525879, |
|
"learning_rate": 1.0347826086956523e-05, |
|
"loss": 0.1401, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 96.95652173913044, |
|
"grad_norm": 10.302103996276855, |
|
"learning_rate": 1.0304347826086958e-05, |
|
"loss": 0.1661, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 97.0, |
|
"eval_accuracy": 0.7598425196850394, |
|
"eval_loss": 0.9949864745140076, |
|
"eval_runtime": 6.453, |
|
"eval_samples_per_second": 39.361, |
|
"eval_steps_per_second": 0.62, |
|
"step": 2231 |
|
}, |
|
{ |
|
"epoch": 97.3913043478261, |
|
"grad_norm": 13.885024070739746, |
|
"learning_rate": 1.0260869565217393e-05, |
|
"loss": 0.171, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 97.82608695652173, |
|
"grad_norm": 20.33379364013672, |
|
"learning_rate": 1.0217391304347829e-05, |
|
"loss": 0.1392, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 98.0, |
|
"eval_accuracy": 0.7519685039370079, |
|
"eval_loss": 0.9865681529045105, |
|
"eval_runtime": 5.2827, |
|
"eval_samples_per_second": 48.081, |
|
"eval_steps_per_second": 0.757, |
|
"step": 2254 |
|
}, |
|
{ |
|
"epoch": 98.26086956521739, |
|
"grad_norm": 22.363279342651367, |
|
"learning_rate": 1.017391304347826e-05, |
|
"loss": 0.1669, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 98.69565217391305, |
|
"grad_norm": 27.860618591308594, |
|
"learning_rate": 1.0130434782608695e-05, |
|
"loss": 0.1413, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 99.0, |
|
"eval_accuracy": 0.7598425196850394, |
|
"eval_loss": 1.0637905597686768, |
|
"eval_runtime": 5.2324, |
|
"eval_samples_per_second": 48.544, |
|
"eval_steps_per_second": 0.764, |
|
"step": 2277 |
|
}, |
|
{ |
|
"epoch": 99.1304347826087, |
|
"grad_norm": 15.204286575317383, |
|
"learning_rate": 1.008695652173913e-05, |
|
"loss": 0.1153, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 99.56521739130434, |
|
"grad_norm": 14.230497360229492, |
|
"learning_rate": 1.0043478260869566e-05, |
|
"loss": 0.1309, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 100.0, |
|
"grad_norm": 14.642973899841309, |
|
"learning_rate": 1e-05, |
|
"loss": 0.1619, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 100.0, |
|
"eval_accuracy": 0.7598425196850394, |
|
"eval_loss": 1.0178251266479492, |
|
"eval_runtime": 5.2064, |
|
"eval_samples_per_second": 48.787, |
|
"eval_steps_per_second": 0.768, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 100.43478260869566, |
|
"grad_norm": 64.10249328613281, |
|
"learning_rate": 9.956521739130436e-06, |
|
"loss": 0.1249, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 100.8695652173913, |
|
"grad_norm": 8.547202110290527, |
|
"learning_rate": 9.913043478260871e-06, |
|
"loss": 0.1537, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 101.0, |
|
"eval_accuracy": 0.7637795275590551, |
|
"eval_loss": 0.9891794919967651, |
|
"eval_runtime": 5.2872, |
|
"eval_samples_per_second": 48.04, |
|
"eval_steps_per_second": 0.757, |
|
"step": 2323 |
|
}, |
|
{ |
|
"epoch": 101.30434782608695, |
|
"grad_norm": 21.414203643798828, |
|
"learning_rate": 9.869565217391304e-06, |
|
"loss": 0.1626, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 101.73913043478261, |
|
"grad_norm": 7.053102970123291, |
|
"learning_rate": 9.82608695652174e-06, |
|
"loss": 0.137, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 102.0, |
|
"eval_accuracy": 0.7559055118110236, |
|
"eval_loss": 0.9523509740829468, |
|
"eval_runtime": 5.9665, |
|
"eval_samples_per_second": 42.571, |
|
"eval_steps_per_second": 0.67, |
|
"step": 2346 |
|
}, |
|
{ |
|
"epoch": 102.17391304347827, |
|
"grad_norm": 18.426395416259766, |
|
"learning_rate": 9.782608695652175e-06, |
|
"loss": 0.1623, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 102.6086956521739, |
|
"grad_norm": 7.689826011657715, |
|
"learning_rate": 9.73913043478261e-06, |
|
"loss": 0.1416, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 103.0, |
|
"eval_accuracy": 0.7401574803149606, |
|
"eval_loss": 1.0539218187332153, |
|
"eval_runtime": 6.2316, |
|
"eval_samples_per_second": 40.76, |
|
"eval_steps_per_second": 0.642, |
|
"step": 2369 |
|
}, |
|
{ |
|
"epoch": 103.04347826086956, |
|
"grad_norm": 7.151523590087891, |
|
"learning_rate": 9.695652173913043e-06, |
|
"loss": 0.1245, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 103.47826086956522, |
|
"grad_norm": 15.233275413513184, |
|
"learning_rate": 9.652173913043478e-06, |
|
"loss": 0.1347, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 103.91304347826087, |
|
"grad_norm": 6.604703903198242, |
|
"learning_rate": 9.608695652173914e-06, |
|
"loss": 0.1477, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 104.0, |
|
"eval_accuracy": 0.7283464566929134, |
|
"eval_loss": 1.0824700593948364, |
|
"eval_runtime": 5.2926, |
|
"eval_samples_per_second": 47.992, |
|
"eval_steps_per_second": 0.756, |
|
"step": 2392 |
|
}, |
|
{ |
|
"epoch": 104.34782608695652, |
|
"grad_norm": 16.267160415649414, |
|
"learning_rate": 9.565217391304349e-06, |
|
"loss": 0.1353, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 104.78260869565217, |
|
"grad_norm": 14.780786514282227, |
|
"learning_rate": 9.521739130434784e-06, |
|
"loss": 0.1283, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 105.0, |
|
"eval_accuracy": 0.7519685039370079, |
|
"eval_loss": 1.0008113384246826, |
|
"eval_runtime": 5.335, |
|
"eval_samples_per_second": 47.61, |
|
"eval_steps_per_second": 0.75, |
|
"step": 2415 |
|
}, |
|
{ |
|
"epoch": 105.21739130434783, |
|
"grad_norm": 18.90896224975586, |
|
"learning_rate": 9.478260869565217e-06, |
|
"loss": 0.1235, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 105.65217391304348, |
|
"grad_norm": 13.357507705688477, |
|
"learning_rate": 9.434782608695652e-06, |
|
"loss": 0.1498, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 106.0, |
|
"eval_accuracy": 0.7637795275590551, |
|
"eval_loss": 0.9702416658401489, |
|
"eval_runtime": 5.3141, |
|
"eval_samples_per_second": 47.797, |
|
"eval_steps_per_second": 0.753, |
|
"step": 2438 |
|
}, |
|
{ |
|
"epoch": 106.08695652173913, |
|
"grad_norm": 15.780552864074707, |
|
"learning_rate": 9.391304347826087e-06, |
|
"loss": 0.1228, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 106.52173913043478, |
|
"grad_norm": 17.9113826751709, |
|
"learning_rate": 9.347826086956523e-06, |
|
"loss": 0.1394, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 106.95652173913044, |
|
"grad_norm": 9.112041473388672, |
|
"learning_rate": 9.304347826086956e-06, |
|
"loss": 0.1576, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 107.0, |
|
"eval_accuracy": 0.7677165354330708, |
|
"eval_loss": 1.0144392251968384, |
|
"eval_runtime": 5.3399, |
|
"eval_samples_per_second": 47.567, |
|
"eval_steps_per_second": 0.749, |
|
"step": 2461 |
|
}, |
|
{ |
|
"epoch": 107.3913043478261, |
|
"grad_norm": 19.214317321777344, |
|
"learning_rate": 9.260869565217391e-06, |
|
"loss": 0.1488, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 107.82608695652173, |
|
"grad_norm": 12.886323928833008, |
|
"learning_rate": 9.217391304347826e-06, |
|
"loss": 0.1433, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 108.0, |
|
"eval_accuracy": 0.7637795275590551, |
|
"eval_loss": 0.945662260055542, |
|
"eval_runtime": 5.4883, |
|
"eval_samples_per_second": 46.28, |
|
"eval_steps_per_second": 0.729, |
|
"step": 2484 |
|
}, |
|
{ |
|
"epoch": 108.26086956521739, |
|
"grad_norm": 6.37844181060791, |
|
"learning_rate": 9.173913043478261e-06, |
|
"loss": 0.1411, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 108.69565217391305, |
|
"grad_norm": 8.28097152709961, |
|
"learning_rate": 9.130434782608697e-06, |
|
"loss": 0.1377, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 109.0, |
|
"eval_accuracy": 0.7677165354330708, |
|
"eval_loss": 0.9770110249519348, |
|
"eval_runtime": 5.6903, |
|
"eval_samples_per_second": 44.638, |
|
"eval_steps_per_second": 0.703, |
|
"step": 2507 |
|
}, |
|
{ |
|
"epoch": 109.1304347826087, |
|
"grad_norm": 14.171891212463379, |
|
"learning_rate": 9.086956521739132e-06, |
|
"loss": 0.1449, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 109.56521739130434, |
|
"grad_norm": 7.651449203491211, |
|
"learning_rate": 9.043478260869565e-06, |
|
"loss": 0.1068, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 110.0, |
|
"grad_norm": 8.761197090148926, |
|
"learning_rate": 9e-06, |
|
"loss": 0.1163, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 110.0, |
|
"eval_accuracy": 0.7559055118110236, |
|
"eval_loss": 1.1386067867279053, |
|
"eval_runtime": 5.7019, |
|
"eval_samples_per_second": 44.546, |
|
"eval_steps_per_second": 0.702, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 110.43478260869566, |
|
"grad_norm": 15.50783634185791, |
|
"learning_rate": 8.956521739130435e-06, |
|
"loss": 0.1379, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 110.8695652173913, |
|
"grad_norm": 19.807313919067383, |
|
"learning_rate": 8.91304347826087e-06, |
|
"loss": 0.1449, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 111.0, |
|
"eval_accuracy": 0.7559055118110236, |
|
"eval_loss": 1.0589202642440796, |
|
"eval_runtime": 6.1192, |
|
"eval_samples_per_second": 41.509, |
|
"eval_steps_per_second": 0.654, |
|
"step": 2553 |
|
}, |
|
{ |
|
"epoch": 111.30434782608695, |
|
"grad_norm": 15.006197929382324, |
|
"learning_rate": 8.869565217391306e-06, |
|
"loss": 0.1397, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 111.73913043478261, |
|
"grad_norm": 24.991825103759766, |
|
"learning_rate": 8.82608695652174e-06, |
|
"loss": 0.1475, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 112.0, |
|
"eval_accuracy": 0.7480314960629921, |
|
"eval_loss": 1.0109583139419556, |
|
"eval_runtime": 6.251, |
|
"eval_samples_per_second": 40.633, |
|
"eval_steps_per_second": 0.64, |
|
"step": 2576 |
|
}, |
|
{ |
|
"epoch": 112.17391304347827, |
|
"grad_norm": 15.339753150939941, |
|
"learning_rate": 8.782608695652174e-06, |
|
"loss": 0.1204, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 112.6086956521739, |
|
"grad_norm": 17.208599090576172, |
|
"learning_rate": 8.73913043478261e-06, |
|
"loss": 0.1582, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 113.0, |
|
"eval_accuracy": 0.7677165354330708, |
|
"eval_loss": 0.9657435417175293, |
|
"eval_runtime": 5.3503, |
|
"eval_samples_per_second": 47.474, |
|
"eval_steps_per_second": 0.748, |
|
"step": 2599 |
|
}, |
|
{ |
|
"epoch": 113.04347826086956, |
|
"grad_norm": 16.048084259033203, |
|
"learning_rate": 8.695652173913044e-06, |
|
"loss": 0.1543, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 113.47826086956522, |
|
"grad_norm": 10.532240867614746, |
|
"learning_rate": 8.65217391304348e-06, |
|
"loss": 0.1468, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 113.91304347826087, |
|
"grad_norm": 10.69536018371582, |
|
"learning_rate": 8.608695652173915e-06, |
|
"loss": 0.1291, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 114.0, |
|
"eval_accuracy": 0.7755905511811023, |
|
"eval_loss": 0.9563351273536682, |
|
"eval_runtime": 5.5866, |
|
"eval_samples_per_second": 45.466, |
|
"eval_steps_per_second": 0.716, |
|
"step": 2622 |
|
}, |
|
{ |
|
"epoch": 114.34782608695652, |
|
"grad_norm": 14.370439529418945, |
|
"learning_rate": 8.56521739130435e-06, |
|
"loss": 0.1491, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 114.78260869565217, |
|
"grad_norm": 6.434089660644531, |
|
"learning_rate": 8.521739130434783e-06, |
|
"loss": 0.1106, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 115.0, |
|
"eval_accuracy": 0.7480314960629921, |
|
"eval_loss": 1.1004494428634644, |
|
"eval_runtime": 5.299, |
|
"eval_samples_per_second": 47.934, |
|
"eval_steps_per_second": 0.755, |
|
"step": 2645 |
|
}, |
|
{ |
|
"epoch": 115.21739130434783, |
|
"grad_norm": 16.293176651000977, |
|
"learning_rate": 8.478260869565218e-06, |
|
"loss": 0.1276, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 115.65217391304348, |
|
"grad_norm": 6.88930082321167, |
|
"learning_rate": 8.434782608695653e-06, |
|
"loss": 0.1339, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 116.0, |
|
"eval_accuracy": 0.7519685039370079, |
|
"eval_loss": 1.0326673984527588, |
|
"eval_runtime": 5.6971, |
|
"eval_samples_per_second": 44.584, |
|
"eval_steps_per_second": 0.702, |
|
"step": 2668 |
|
}, |
|
{ |
|
"epoch": 116.08695652173913, |
|
"grad_norm": 13.512252807617188, |
|
"learning_rate": 8.391304347826089e-06, |
|
"loss": 0.1317, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 116.52173913043478, |
|
"grad_norm": 6.733714580535889, |
|
"learning_rate": 8.347826086956522e-06, |
|
"loss": 0.1209, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 116.95652173913044, |
|
"grad_norm": 15.30413818359375, |
|
"learning_rate": 8.304347826086957e-06, |
|
"loss": 0.1344, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 117.0, |
|
"eval_accuracy": 0.7519685039370079, |
|
"eval_loss": 1.0160647630691528, |
|
"eval_runtime": 5.8565, |
|
"eval_samples_per_second": 43.371, |
|
"eval_steps_per_second": 0.683, |
|
"step": 2691 |
|
}, |
|
{ |
|
"epoch": 117.3913043478261, |
|
"grad_norm": 15.041736602783203, |
|
"learning_rate": 8.260869565217392e-06, |
|
"loss": 0.1674, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 117.82608695652173, |
|
"grad_norm": 23.318103790283203, |
|
"learning_rate": 8.217391304347827e-06, |
|
"loss": 0.1433, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 118.0, |
|
"eval_accuracy": 0.7559055118110236, |
|
"eval_loss": 1.0311795473098755, |
|
"eval_runtime": 5.6699, |
|
"eval_samples_per_second": 44.798, |
|
"eval_steps_per_second": 0.705, |
|
"step": 2714 |
|
}, |
|
{ |
|
"epoch": 118.26086956521739, |
|
"grad_norm": 9.980913162231445, |
|
"learning_rate": 8.173913043478263e-06, |
|
"loss": 0.1347, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 118.69565217391305, |
|
"grad_norm": 7.0901408195495605, |
|
"learning_rate": 8.130434782608696e-06, |
|
"loss": 0.1271, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 119.0, |
|
"eval_accuracy": 0.7598425196850394, |
|
"eval_loss": 1.0266056060791016, |
|
"eval_runtime": 5.2706, |
|
"eval_samples_per_second": 48.192, |
|
"eval_steps_per_second": 0.759, |
|
"step": 2737 |
|
}, |
|
{ |
|
"epoch": 119.1304347826087, |
|
"grad_norm": 43.82113265991211, |
|
"learning_rate": 8.086956521739131e-06, |
|
"loss": 0.1068, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 119.56521739130434, |
|
"grad_norm": 7.058877468109131, |
|
"learning_rate": 8.043478260869566e-06, |
|
"loss": 0.1144, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 120.0, |
|
"grad_norm": 4.748641014099121, |
|
"learning_rate": 8.000000000000001e-06, |
|
"loss": 0.1222, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 120.0, |
|
"eval_accuracy": 0.7637795275590551, |
|
"eval_loss": 1.0119496583938599, |
|
"eval_runtime": 6.2061, |
|
"eval_samples_per_second": 40.928, |
|
"eval_steps_per_second": 0.645, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 120.43478260869566, |
|
"grad_norm": 15.84010124206543, |
|
"learning_rate": 7.956521739130435e-06, |
|
"loss": 0.1193, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 120.8695652173913, |
|
"grad_norm": 13.664319038391113, |
|
"learning_rate": 7.91304347826087e-06, |
|
"loss": 0.1235, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 121.0, |
|
"eval_accuracy": 0.7519685039370079, |
|
"eval_loss": 1.0808361768722534, |
|
"eval_runtime": 5.23, |
|
"eval_samples_per_second": 48.566, |
|
"eval_steps_per_second": 0.765, |
|
"step": 2783 |
|
}, |
|
{ |
|
"epoch": 121.30434782608695, |
|
"grad_norm": 12.779178619384766, |
|
"learning_rate": 7.869565217391305e-06, |
|
"loss": 0.1322, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 121.73913043478261, |
|
"grad_norm": 13.491189002990723, |
|
"learning_rate": 7.82608695652174e-06, |
|
"loss": 0.1311, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 122.0, |
|
"eval_accuracy": 0.7519685039370079, |
|
"eval_loss": 1.0612245798110962, |
|
"eval_runtime": 5.2527, |
|
"eval_samples_per_second": 48.356, |
|
"eval_steps_per_second": 0.762, |
|
"step": 2806 |
|
}, |
|
{ |
|
"epoch": 122.17391304347827, |
|
"grad_norm": 12.711108207702637, |
|
"learning_rate": 7.782608695652174e-06, |
|
"loss": 0.1415, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 122.6086956521739, |
|
"grad_norm": 21.42779541015625, |
|
"learning_rate": 7.739130434782609e-06, |
|
"loss": 0.1219, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 123.0, |
|
"eval_accuracy": 0.7519685039370079, |
|
"eval_loss": 1.1412242650985718, |
|
"eval_runtime": 5.1306, |
|
"eval_samples_per_second": 49.507, |
|
"eval_steps_per_second": 0.78, |
|
"step": 2829 |
|
}, |
|
{ |
|
"epoch": 123.04347826086956, |
|
"grad_norm": 23.719886779785156, |
|
"learning_rate": 7.695652173913044e-06, |
|
"loss": 0.138, |
|
"step": 2830 |
|
}, |
|
{ |
|
"epoch": 123.47826086956522, |
|
"grad_norm": 9.191445350646973, |
|
"learning_rate": 7.652173913043479e-06, |
|
"loss": 0.111, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 123.91304347826087, |
|
"grad_norm": 29.234426498413086, |
|
"learning_rate": 7.608695652173914e-06, |
|
"loss": 0.148, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 124.0, |
|
"eval_accuracy": 0.7401574803149606, |
|
"eval_loss": 1.0836260318756104, |
|
"eval_runtime": 5.2645, |
|
"eval_samples_per_second": 48.248, |
|
"eval_steps_per_second": 0.76, |
|
"step": 2852 |
|
}, |
|
{ |
|
"epoch": 124.34782608695652, |
|
"grad_norm": 11.18213939666748, |
|
"learning_rate": 7.565217391304348e-06, |
|
"loss": 0.13, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 124.78260869565217, |
|
"grad_norm": 15.413270950317383, |
|
"learning_rate": 7.5217391304347835e-06, |
|
"loss": 0.1076, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 125.0, |
|
"eval_accuracy": 0.7559055118110236, |
|
"eval_loss": 1.0629363059997559, |
|
"eval_runtime": 5.8955, |
|
"eval_samples_per_second": 43.084, |
|
"eval_steps_per_second": 0.678, |
|
"step": 2875 |
|
}, |
|
{ |
|
"epoch": 125.21739130434783, |
|
"grad_norm": 8.161211013793945, |
|
"learning_rate": 7.478260869565218e-06, |
|
"loss": 0.1678, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 125.65217391304348, |
|
"grad_norm": 8.079183578491211, |
|
"learning_rate": 7.434782608695653e-06, |
|
"loss": 0.1306, |
|
"step": 2890 |
|
}, |
|
{ |
|
"epoch": 126.0, |
|
"eval_accuracy": 0.7362204724409449, |
|
"eval_loss": 1.0791066884994507, |
|
"eval_runtime": 5.794, |
|
"eval_samples_per_second": 43.839, |
|
"eval_steps_per_second": 0.69, |
|
"step": 2898 |
|
}, |
|
{ |
|
"epoch": 126.08695652173913, |
|
"grad_norm": 20.435319900512695, |
|
"learning_rate": 7.391304347826087e-06, |
|
"loss": 0.1227, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 126.52173913043478, |
|
"grad_norm": 5.470503807067871, |
|
"learning_rate": 7.347826086956522e-06, |
|
"loss": 0.1258, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 126.95652173913044, |
|
"grad_norm": 21.71735191345215, |
|
"learning_rate": 7.304347826086957e-06, |
|
"loss": 0.1153, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 127.0, |
|
"eval_accuracy": 0.7401574803149606, |
|
"eval_loss": 1.149482250213623, |
|
"eval_runtime": 5.2212, |
|
"eval_samples_per_second": 48.648, |
|
"eval_steps_per_second": 0.766, |
|
"step": 2921 |
|
}, |
|
{ |
|
"epoch": 127.3913043478261, |
|
"grad_norm": 7.169867038726807, |
|
"learning_rate": 7.2608695652173925e-06, |
|
"loss": 0.1023, |
|
"step": 2930 |
|
}, |
|
{ |
|
"epoch": 127.82608695652173, |
|
"grad_norm": 5.137673377990723, |
|
"learning_rate": 7.217391304347827e-06, |
|
"loss": 0.1239, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 128.0, |
|
"eval_accuracy": 0.7519685039370079, |
|
"eval_loss": 1.1446442604064941, |
|
"eval_runtime": 5.8267, |
|
"eval_samples_per_second": 43.592, |
|
"eval_steps_per_second": 0.686, |
|
"step": 2944 |
|
}, |
|
{ |
|
"epoch": 128.2608695652174, |
|
"grad_norm": 46.98186492919922, |
|
"learning_rate": 7.173913043478261e-06, |
|
"loss": 0.1256, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 128.69565217391303, |
|
"grad_norm": 18.51282501220703, |
|
"learning_rate": 7.130434782608696e-06, |
|
"loss": 0.1533, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 129.0, |
|
"eval_accuracy": 0.7440944881889764, |
|
"eval_loss": 1.0817540884017944, |
|
"eval_runtime": 5.9648, |
|
"eval_samples_per_second": 42.583, |
|
"eval_steps_per_second": 0.671, |
|
"step": 2967 |
|
}, |
|
{ |
|
"epoch": 129.1304347826087, |
|
"grad_norm": 30.390338897705078, |
|
"learning_rate": 7.086956521739131e-06, |
|
"loss": 0.1534, |
|
"step": 2970 |
|
}, |
|
{ |
|
"epoch": 129.56521739130434, |
|
"grad_norm": 15.933720588684082, |
|
"learning_rate": 7.0434782608695665e-06, |
|
"loss": 0.1542, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 130.0, |
|
"grad_norm": 28.77271270751953, |
|
"learning_rate": 7e-06, |
|
"loss": 0.136, |
|
"step": 2990 |
|
}, |
|
{ |
|
"epoch": 130.0, |
|
"eval_accuracy": 0.7519685039370079, |
|
"eval_loss": 1.0557951927185059, |
|
"eval_runtime": 5.2305, |
|
"eval_samples_per_second": 48.561, |
|
"eval_steps_per_second": 0.765, |
|
"step": 2990 |
|
}, |
|
{ |
|
"epoch": 130.43478260869566, |
|
"grad_norm": 13.821724891662598, |
|
"learning_rate": 6.956521739130435e-06, |
|
"loss": 0.1428, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 130.8695652173913, |
|
"grad_norm": 12.773445129394531, |
|
"learning_rate": 6.91304347826087e-06, |
|
"loss": 0.1189, |
|
"step": 3010 |
|
}, |
|
{ |
|
"epoch": 131.0, |
|
"eval_accuracy": 0.7519685039370079, |
|
"eval_loss": 1.0423293113708496, |
|
"eval_runtime": 5.2446, |
|
"eval_samples_per_second": 48.431, |
|
"eval_steps_per_second": 0.763, |
|
"step": 3013 |
|
}, |
|
{ |
|
"epoch": 131.30434782608697, |
|
"grad_norm": 12.808055877685547, |
|
"learning_rate": 6.869565217391305e-06, |
|
"loss": 0.1204, |
|
"step": 3020 |
|
}, |
|
{ |
|
"epoch": 131.7391304347826, |
|
"grad_norm": 12.246822357177734, |
|
"learning_rate": 6.8260869565217395e-06, |
|
"loss": 0.1247, |
|
"step": 3030 |
|
}, |
|
{ |
|
"epoch": 132.0, |
|
"eval_accuracy": 0.7637795275590551, |
|
"eval_loss": 1.0581423044204712, |
|
"eval_runtime": 5.6958, |
|
"eval_samples_per_second": 44.594, |
|
"eval_steps_per_second": 0.702, |
|
"step": 3036 |
|
}, |
|
{ |
|
"epoch": 132.17391304347825, |
|
"grad_norm": 8.1224946975708, |
|
"learning_rate": 6.782608695652174e-06, |
|
"loss": 0.1259, |
|
"step": 3040 |
|
}, |
|
{ |
|
"epoch": 132.6086956521739, |
|
"grad_norm": 9.494956016540527, |
|
"learning_rate": 6.739130434782609e-06, |
|
"loss": 0.1136, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 133.0, |
|
"eval_accuracy": 0.7716535433070866, |
|
"eval_loss": 1.0132337808609009, |
|
"eval_runtime": 5.341, |
|
"eval_samples_per_second": 47.556, |
|
"eval_steps_per_second": 0.749, |
|
"step": 3059 |
|
}, |
|
{ |
|
"epoch": 133.04347826086956, |
|
"grad_norm": 6.921084880828857, |
|
"learning_rate": 6.695652173913044e-06, |
|
"loss": 0.1295, |
|
"step": 3060 |
|
}, |
|
{ |
|
"epoch": 133.47826086956522, |
|
"grad_norm": 11.8038330078125, |
|
"learning_rate": 6.652173913043479e-06, |
|
"loss": 0.094, |
|
"step": 3070 |
|
}, |
|
{ |
|
"epoch": 133.91304347826087, |
|
"grad_norm": 14.180817604064941, |
|
"learning_rate": 6.6086956521739135e-06, |
|
"loss": 0.1492, |
|
"step": 3080 |
|
}, |
|
{ |
|
"epoch": 134.0, |
|
"eval_accuracy": 0.7440944881889764, |
|
"eval_loss": 1.1127182245254517, |
|
"eval_runtime": 5.2916, |
|
"eval_samples_per_second": 48.0, |
|
"eval_steps_per_second": 0.756, |
|
"step": 3082 |
|
}, |
|
{ |
|
"epoch": 134.34782608695653, |
|
"grad_norm": 35.78520202636719, |
|
"learning_rate": 6.565217391304349e-06, |
|
"loss": 0.1522, |
|
"step": 3090 |
|
}, |
|
{ |
|
"epoch": 134.7826086956522, |
|
"grad_norm": 16.837764739990234, |
|
"learning_rate": 6.521739130434783e-06, |
|
"loss": 0.1184, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 135.0, |
|
"eval_accuracy": 0.7401574803149606, |
|
"eval_loss": 1.1449700593948364, |
|
"eval_runtime": 5.1179, |
|
"eval_samples_per_second": 49.629, |
|
"eval_steps_per_second": 0.782, |
|
"step": 3105 |
|
}, |
|
{ |
|
"epoch": 135.2173913043478, |
|
"grad_norm": 18.732372283935547, |
|
"learning_rate": 6.478260869565218e-06, |
|
"loss": 0.1284, |
|
"step": 3110 |
|
}, |
|
{ |
|
"epoch": 135.65217391304347, |
|
"grad_norm": 47.60773468017578, |
|
"learning_rate": 6.434782608695652e-06, |
|
"loss": 0.1122, |
|
"step": 3120 |
|
}, |
|
{ |
|
"epoch": 136.0, |
|
"eval_accuracy": 0.7519685039370079, |
|
"eval_loss": 1.1063224077224731, |
|
"eval_runtime": 5.1054, |
|
"eval_samples_per_second": 49.751, |
|
"eval_steps_per_second": 0.783, |
|
"step": 3128 |
|
}, |
|
{ |
|
"epoch": 136.08695652173913, |
|
"grad_norm": 13.677179336547852, |
|
"learning_rate": 6.391304347826087e-06, |
|
"loss": 0.1027, |
|
"step": 3130 |
|
}, |
|
{ |
|
"epoch": 136.52173913043478, |
|
"grad_norm": 25.119609832763672, |
|
"learning_rate": 6.3478260869565225e-06, |
|
"loss": 0.1485, |
|
"step": 3140 |
|
}, |
|
{ |
|
"epoch": 136.95652173913044, |
|
"grad_norm": 7.572585105895996, |
|
"learning_rate": 6.304347826086958e-06, |
|
"loss": 0.1047, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 137.0, |
|
"eval_accuracy": 0.7440944881889764, |
|
"eval_loss": 1.1029216051101685, |
|
"eval_runtime": 5.1837, |
|
"eval_samples_per_second": 48.999, |
|
"eval_steps_per_second": 0.772, |
|
"step": 3151 |
|
}, |
|
{ |
|
"epoch": 137.3913043478261, |
|
"grad_norm": 15.767362594604492, |
|
"learning_rate": 6.260869565217392e-06, |
|
"loss": 0.1319, |
|
"step": 3160 |
|
}, |
|
{ |
|
"epoch": 137.82608695652175, |
|
"grad_norm": 7.367244243621826, |
|
"learning_rate": 6.217391304347826e-06, |
|
"loss": 0.1285, |
|
"step": 3170 |
|
}, |
|
{ |
|
"epoch": 138.0, |
|
"eval_accuracy": 0.7401574803149606, |
|
"eval_loss": 1.156273603439331, |
|
"eval_runtime": 6.0904, |
|
"eval_samples_per_second": 41.705, |
|
"eval_steps_per_second": 0.657, |
|
"step": 3174 |
|
}, |
|
{ |
|
"epoch": 138.2608695652174, |
|
"grad_norm": 17.1787166595459, |
|
"learning_rate": 6.173913043478261e-06, |
|
"loss": 0.1134, |
|
"step": 3180 |
|
}, |
|
{ |
|
"epoch": 138.69565217391303, |
|
"grad_norm": 8.421507835388184, |
|
"learning_rate": 6.1304347826086965e-06, |
|
"loss": 0.1004, |
|
"step": 3190 |
|
}, |
|
{ |
|
"epoch": 139.0, |
|
"eval_accuracy": 0.7362204724409449, |
|
"eval_loss": 1.1551874876022339, |
|
"eval_runtime": 5.6544, |
|
"eval_samples_per_second": 44.921, |
|
"eval_steps_per_second": 0.707, |
|
"step": 3197 |
|
}, |
|
{ |
|
"epoch": 139.1304347826087, |
|
"grad_norm": 13.920239448547363, |
|
"learning_rate": 6.086956521739132e-06, |
|
"loss": 0.1237, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 139.56521739130434, |
|
"grad_norm": 11.163459777832031, |
|
"learning_rate": 6.043478260869565e-06, |
|
"loss": 0.1278, |
|
"step": 3210 |
|
}, |
|
{ |
|
"epoch": 140.0, |
|
"grad_norm": 38.89981460571289, |
|
"learning_rate": 6e-06, |
|
"loss": 0.1285, |
|
"step": 3220 |
|
}, |
|
{ |
|
"epoch": 140.0, |
|
"eval_accuracy": 0.7480314960629921, |
|
"eval_loss": 1.109745979309082, |
|
"eval_runtime": 5.999, |
|
"eval_samples_per_second": 42.34, |
|
"eval_steps_per_second": 0.667, |
|
"step": 3220 |
|
}, |
|
{ |
|
"epoch": 140.43478260869566, |
|
"grad_norm": 7.645209312438965, |
|
"learning_rate": 5.956521739130435e-06, |
|
"loss": 0.1422, |
|
"step": 3230 |
|
}, |
|
{ |
|
"epoch": 140.8695652173913, |
|
"grad_norm": 20.615463256835938, |
|
"learning_rate": 5.91304347826087e-06, |
|
"loss": 0.1257, |
|
"step": 3240 |
|
}, |
|
{ |
|
"epoch": 141.0, |
|
"eval_accuracy": 0.7401574803149606, |
|
"eval_loss": 1.1602002382278442, |
|
"eval_runtime": 6.0238, |
|
"eval_samples_per_second": 42.166, |
|
"eval_steps_per_second": 0.664, |
|
"step": 3243 |
|
}, |
|
{ |
|
"epoch": 141.30434782608697, |
|
"grad_norm": 13.762594223022461, |
|
"learning_rate": 5.8695652173913055e-06, |
|
"loss": 0.0992, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 141.7391304347826, |
|
"grad_norm": 5.704482555389404, |
|
"learning_rate": 5.826086956521739e-06, |
|
"loss": 0.1075, |
|
"step": 3260 |
|
}, |
|
{ |
|
"epoch": 142.0, |
|
"eval_accuracy": 0.7559055118110236, |
|
"eval_loss": 1.1911606788635254, |
|
"eval_runtime": 6.0583, |
|
"eval_samples_per_second": 41.926, |
|
"eval_steps_per_second": 0.66, |
|
"step": 3266 |
|
}, |
|
{ |
|
"epoch": 142.17391304347825, |
|
"grad_norm": 18.972148895263672, |
|
"learning_rate": 5.782608695652174e-06, |
|
"loss": 0.1112, |
|
"step": 3270 |
|
}, |
|
{ |
|
"epoch": 142.6086956521739, |
|
"grad_norm": 21.352266311645508, |
|
"learning_rate": 5.739130434782609e-06, |
|
"loss": 0.1098, |
|
"step": 3280 |
|
}, |
|
{ |
|
"epoch": 143.0, |
|
"eval_accuracy": 0.7519685039370079, |
|
"eval_loss": 1.1894166469573975, |
|
"eval_runtime": 5.6001, |
|
"eval_samples_per_second": 45.357, |
|
"eval_steps_per_second": 0.714, |
|
"step": 3289 |
|
}, |
|
{ |
|
"epoch": 143.04347826086956, |
|
"grad_norm": 12.431015014648438, |
|
"learning_rate": 5.695652173913044e-06, |
|
"loss": 0.0992, |
|
"step": 3290 |
|
}, |
|
{ |
|
"epoch": 143.47826086956522, |
|
"grad_norm": 17.39335823059082, |
|
"learning_rate": 5.652173913043479e-06, |
|
"loss": 0.1301, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 143.91304347826087, |
|
"grad_norm": 3.1455869674682617, |
|
"learning_rate": 5.608695652173914e-06, |
|
"loss": 0.1148, |
|
"step": 3310 |
|
}, |
|
{ |
|
"epoch": 144.0, |
|
"eval_accuracy": 0.7440944881889764, |
|
"eval_loss": 1.155055046081543, |
|
"eval_runtime": 6.0411, |
|
"eval_samples_per_second": 42.045, |
|
"eval_steps_per_second": 0.662, |
|
"step": 3312 |
|
}, |
|
{ |
|
"epoch": 144.34782608695653, |
|
"grad_norm": 19.077762603759766, |
|
"learning_rate": 5.565217391304348e-06, |
|
"loss": 0.1051, |
|
"step": 3320 |
|
}, |
|
{ |
|
"epoch": 144.7826086956522, |
|
"grad_norm": 22.914405822753906, |
|
"learning_rate": 5.521739130434783e-06, |
|
"loss": 0.1489, |
|
"step": 3330 |
|
}, |
|
{ |
|
"epoch": 145.0, |
|
"eval_accuracy": 0.7440944881889764, |
|
"eval_loss": 1.137885332107544, |
|
"eval_runtime": 5.3417, |
|
"eval_samples_per_second": 47.55, |
|
"eval_steps_per_second": 0.749, |
|
"step": 3335 |
|
}, |
|
{ |
|
"epoch": 145.2173913043478, |
|
"grad_norm": 33.728397369384766, |
|
"learning_rate": 5.478260869565217e-06, |
|
"loss": 0.1242, |
|
"step": 3340 |
|
}, |
|
{ |
|
"epoch": 145.65217391304347, |
|
"grad_norm": 22.441125869750977, |
|
"learning_rate": 5.4347826086956525e-06, |
|
"loss": 0.1461, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 146.0, |
|
"eval_accuracy": 0.7480314960629921, |
|
"eval_loss": 1.1726444959640503, |
|
"eval_runtime": 5.1056, |
|
"eval_samples_per_second": 49.749, |
|
"eval_steps_per_second": 0.783, |
|
"step": 3358 |
|
}, |
|
{ |
|
"epoch": 146.08695652173913, |
|
"grad_norm": 23.851354598999023, |
|
"learning_rate": 5.391304347826088e-06, |
|
"loss": 0.1157, |
|
"step": 3360 |
|
}, |
|
{ |
|
"epoch": 146.52173913043478, |
|
"grad_norm": 11.18212604522705, |
|
"learning_rate": 5.347826086956523e-06, |
|
"loss": 0.1238, |
|
"step": 3370 |
|
}, |
|
{ |
|
"epoch": 146.95652173913044, |
|
"grad_norm": 10.293829917907715, |
|
"learning_rate": 5.304347826086957e-06, |
|
"loss": 0.1171, |
|
"step": 3380 |
|
}, |
|
{ |
|
"epoch": 147.0, |
|
"eval_accuracy": 0.7440944881889764, |
|
"eval_loss": 1.1190757751464844, |
|
"eval_runtime": 5.1299, |
|
"eval_samples_per_second": 49.513, |
|
"eval_steps_per_second": 0.78, |
|
"step": 3381 |
|
}, |
|
{ |
|
"epoch": 147.3913043478261, |
|
"grad_norm": 12.49338436126709, |
|
"learning_rate": 5.260869565217391e-06, |
|
"loss": 0.1363, |
|
"step": 3390 |
|
}, |
|
{ |
|
"epoch": 147.82608695652175, |
|
"grad_norm": 16.56803321838379, |
|
"learning_rate": 5.2173913043478265e-06, |
|
"loss": 0.1262, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 148.0, |
|
"eval_accuracy": 0.7440944881889764, |
|
"eval_loss": 1.1661558151245117, |
|
"eval_runtime": 5.1432, |
|
"eval_samples_per_second": 49.386, |
|
"eval_steps_per_second": 0.778, |
|
"step": 3404 |
|
}, |
|
{ |
|
"epoch": 148.2608695652174, |
|
"grad_norm": 13.47645092010498, |
|
"learning_rate": 5.173913043478262e-06, |
|
"loss": 0.1236, |
|
"step": 3410 |
|
}, |
|
{ |
|
"epoch": 148.69565217391303, |
|
"grad_norm": 6.461868762969971, |
|
"learning_rate": 5.130434782608697e-06, |
|
"loss": 0.1137, |
|
"step": 3420 |
|
}, |
|
{ |
|
"epoch": 149.0, |
|
"eval_accuracy": 0.7480314960629921, |
|
"eval_loss": 1.1282668113708496, |
|
"eval_runtime": 5.2173, |
|
"eval_samples_per_second": 48.684, |
|
"eval_steps_per_second": 0.767, |
|
"step": 3427 |
|
}, |
|
{ |
|
"epoch": 149.1304347826087, |
|
"grad_norm": 9.386457443237305, |
|
"learning_rate": 5.08695652173913e-06, |
|
"loss": 0.0944, |
|
"step": 3430 |
|
}, |
|
{ |
|
"epoch": 149.56521739130434, |
|
"grad_norm": 17.175273895263672, |
|
"learning_rate": 5.043478260869565e-06, |
|
"loss": 0.1179, |
|
"step": 3440 |
|
}, |
|
{ |
|
"epoch": 150.0, |
|
"grad_norm": 3.21197247505188, |
|
"learning_rate": 5e-06, |
|
"loss": 0.1118, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 150.0, |
|
"eval_accuracy": 0.7480314960629921, |
|
"eval_loss": 1.1388078927993774, |
|
"eval_runtime": 5.2199, |
|
"eval_samples_per_second": 48.66, |
|
"eval_steps_per_second": 0.766, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 150.43478260869566, |
|
"grad_norm": 13.216156005859375, |
|
"learning_rate": 4.9565217391304355e-06, |
|
"loss": 0.132, |
|
"step": 3460 |
|
}, |
|
{ |
|
"epoch": 150.8695652173913, |
|
"grad_norm": 17.5484561920166, |
|
"learning_rate": 4.91304347826087e-06, |
|
"loss": 0.1169, |
|
"step": 3470 |
|
}, |
|
{ |
|
"epoch": 151.0, |
|
"eval_accuracy": 0.7519685039370079, |
|
"eval_loss": 1.162711501121521, |
|
"eval_runtime": 6.0191, |
|
"eval_samples_per_second": 42.199, |
|
"eval_steps_per_second": 0.665, |
|
"step": 3473 |
|
}, |
|
{ |
|
"epoch": 151.30434782608697, |
|
"grad_norm": 12.96456527709961, |
|
"learning_rate": 4.869565217391305e-06, |
|
"loss": 0.0991, |
|
"step": 3480 |
|
}, |
|
{ |
|
"epoch": 151.7391304347826, |
|
"grad_norm": 16.27593994140625, |
|
"learning_rate": 4.826086956521739e-06, |
|
"loss": 0.1021, |
|
"step": 3490 |
|
}, |
|
{ |
|
"epoch": 152.0, |
|
"eval_accuracy": 0.7322834645669292, |
|
"eval_loss": 1.182125210762024, |
|
"eval_runtime": 6.074, |
|
"eval_samples_per_second": 41.817, |
|
"eval_steps_per_second": 0.659, |
|
"step": 3496 |
|
}, |
|
{ |
|
"epoch": 152.17391304347825, |
|
"grad_norm": 17.580163955688477, |
|
"learning_rate": 4.782608695652174e-06, |
|
"loss": 0.1196, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 152.6086956521739, |
|
"grad_norm": 16.174192428588867, |
|
"learning_rate": 4.739130434782609e-06, |
|
"loss": 0.1392, |
|
"step": 3510 |
|
}, |
|
{ |
|
"epoch": 153.0, |
|
"eval_accuracy": 0.7322834645669292, |
|
"eval_loss": 1.1671814918518066, |
|
"eval_runtime": 5.2176, |
|
"eval_samples_per_second": 48.681, |
|
"eval_steps_per_second": 0.767, |
|
"step": 3519 |
|
}, |
|
{ |
|
"epoch": 153.04347826086956, |
|
"grad_norm": 34.79978561401367, |
|
"learning_rate": 4.695652173913044e-06, |
|
"loss": 0.1162, |
|
"step": 3520 |
|
}, |
|
{ |
|
"epoch": 153.47826086956522, |
|
"grad_norm": 17.405305862426758, |
|
"learning_rate": 4.652173913043478e-06, |
|
"loss": 0.1022, |
|
"step": 3530 |
|
}, |
|
{ |
|
"epoch": 153.91304347826087, |
|
"grad_norm": 10.909713745117188, |
|
"learning_rate": 4.608695652173913e-06, |
|
"loss": 0.1111, |
|
"step": 3540 |
|
}, |
|
{ |
|
"epoch": 154.0, |
|
"eval_accuracy": 0.7401574803149606, |
|
"eval_loss": 1.2136441469192505, |
|
"eval_runtime": 5.568, |
|
"eval_samples_per_second": 45.618, |
|
"eval_steps_per_second": 0.718, |
|
"step": 3542 |
|
}, |
|
{ |
|
"epoch": 154.34782608695653, |
|
"grad_norm": 21.353759765625, |
|
"learning_rate": 4.565217391304348e-06, |
|
"loss": 0.143, |
|
"step": 3550 |
|
}, |
|
{ |
|
"epoch": 154.7826086956522, |
|
"grad_norm": 17.216047286987305, |
|
"learning_rate": 4.5217391304347826e-06, |
|
"loss": 0.1298, |
|
"step": 3560 |
|
}, |
|
{ |
|
"epoch": 155.0, |
|
"eval_accuracy": 0.7401574803149606, |
|
"eval_loss": 1.1966124773025513, |
|
"eval_runtime": 6.0928, |
|
"eval_samples_per_second": 41.689, |
|
"eval_steps_per_second": 0.657, |
|
"step": 3565 |
|
}, |
|
{ |
|
"epoch": 155.2173913043478, |
|
"grad_norm": 8.084007263183594, |
|
"learning_rate": 4.478260869565218e-06, |
|
"loss": 0.0944, |
|
"step": 3570 |
|
}, |
|
{ |
|
"epoch": 155.65217391304347, |
|
"grad_norm": 13.08774471282959, |
|
"learning_rate": 4.434782608695653e-06, |
|
"loss": 0.1114, |
|
"step": 3580 |
|
}, |
|
{ |
|
"epoch": 156.0, |
|
"eval_accuracy": 0.7362204724409449, |
|
"eval_loss": 1.138235092163086, |
|
"eval_runtime": 5.2162, |
|
"eval_samples_per_second": 48.694, |
|
"eval_steps_per_second": 0.767, |
|
"step": 3588 |
|
}, |
|
{ |
|
"epoch": 156.08695652173913, |
|
"grad_norm": 9.64503002166748, |
|
"learning_rate": 4.391304347826087e-06, |
|
"loss": 0.1216, |
|
"step": 3590 |
|
}, |
|
{ |
|
"epoch": 156.52173913043478, |
|
"grad_norm": 20.011825561523438, |
|
"learning_rate": 4.347826086956522e-06, |
|
"loss": 0.1352, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 156.95652173913044, |
|
"grad_norm": 9.31452751159668, |
|
"learning_rate": 4.304347826086957e-06, |
|
"loss": 0.09, |
|
"step": 3610 |
|
}, |
|
{ |
|
"epoch": 157.0, |
|
"eval_accuracy": 0.7322834645669292, |
|
"eval_loss": 1.145975947380066, |
|
"eval_runtime": 6.2926, |
|
"eval_samples_per_second": 40.365, |
|
"eval_steps_per_second": 0.636, |
|
"step": 3611 |
|
}, |
|
{ |
|
"epoch": 157.3913043478261, |
|
"grad_norm": 22.55306625366211, |
|
"learning_rate": 4.260869565217392e-06, |
|
"loss": 0.0952, |
|
"step": 3620 |
|
}, |
|
{ |
|
"epoch": 157.82608695652175, |
|
"grad_norm": 11.786699295043945, |
|
"learning_rate": 4.217391304347827e-06, |
|
"loss": 0.1294, |
|
"step": 3630 |
|
}, |
|
{ |
|
"epoch": 158.0, |
|
"eval_accuracy": 0.7440944881889764, |
|
"eval_loss": 1.1612186431884766, |
|
"eval_runtime": 5.5409, |
|
"eval_samples_per_second": 45.841, |
|
"eval_steps_per_second": 0.722, |
|
"step": 3634 |
|
}, |
|
{ |
|
"epoch": 158.2608695652174, |
|
"grad_norm": 10.745357513427734, |
|
"learning_rate": 4.173913043478261e-06, |
|
"loss": 0.0985, |
|
"step": 3640 |
|
}, |
|
{ |
|
"epoch": 158.69565217391303, |
|
"grad_norm": 12.405003547668457, |
|
"learning_rate": 4.130434782608696e-06, |
|
"loss": 0.1186, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 159.0, |
|
"eval_accuracy": 0.7401574803149606, |
|
"eval_loss": 1.2204416990280151, |
|
"eval_runtime": 5.9711, |
|
"eval_samples_per_second": 42.538, |
|
"eval_steps_per_second": 0.67, |
|
"step": 3657 |
|
}, |
|
{ |
|
"epoch": 159.1304347826087, |
|
"grad_norm": 20.51667022705078, |
|
"learning_rate": 4.086956521739131e-06, |
|
"loss": 0.1335, |
|
"step": 3660 |
|
}, |
|
{ |
|
"epoch": 159.56521739130434, |
|
"grad_norm": 12.348188400268555, |
|
"learning_rate": 4.0434782608695655e-06, |
|
"loss": 0.1106, |
|
"step": 3670 |
|
}, |
|
{ |
|
"epoch": 160.0, |
|
"grad_norm": 24.83527183532715, |
|
"learning_rate": 4.000000000000001e-06, |
|
"loss": 0.1096, |
|
"step": 3680 |
|
}, |
|
{ |
|
"epoch": 160.0, |
|
"eval_accuracy": 0.7440944881889764, |
|
"eval_loss": 1.2095911502838135, |
|
"eval_runtime": 6.1017, |
|
"eval_samples_per_second": 41.628, |
|
"eval_steps_per_second": 0.656, |
|
"step": 3680 |
|
}, |
|
{ |
|
"epoch": 160.43478260869566, |
|
"grad_norm": 21.34354019165039, |
|
"learning_rate": 3.956521739130435e-06, |
|
"loss": 0.0921, |
|
"step": 3690 |
|
}, |
|
{ |
|
"epoch": 160.8695652173913, |
|
"grad_norm": 9.070474624633789, |
|
"learning_rate": 3.91304347826087e-06, |
|
"loss": 0.1107, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 161.0, |
|
"eval_accuracy": 0.7480314960629921, |
|
"eval_loss": 1.182215690612793, |
|
"eval_runtime": 6.0899, |
|
"eval_samples_per_second": 41.709, |
|
"eval_steps_per_second": 0.657, |
|
"step": 3703 |
|
}, |
|
{ |
|
"epoch": 161.30434782608697, |
|
"grad_norm": 17.007827758789062, |
|
"learning_rate": 3.869565217391304e-06, |
|
"loss": 0.1148, |
|
"step": 3710 |
|
}, |
|
{ |
|
"epoch": 161.7391304347826, |
|
"grad_norm": 15.50957202911377, |
|
"learning_rate": 3.8260869565217395e-06, |
|
"loss": 0.1094, |
|
"step": 3720 |
|
}, |
|
{ |
|
"epoch": 162.0, |
|
"eval_accuracy": 0.7480314960629921, |
|
"eval_loss": 1.1907768249511719, |
|
"eval_runtime": 6.0337, |
|
"eval_samples_per_second": 42.097, |
|
"eval_steps_per_second": 0.663, |
|
"step": 3726 |
|
}, |
|
{ |
|
"epoch": 162.17391304347825, |
|
"grad_norm": 22.703414916992188, |
|
"learning_rate": 3.782608695652174e-06, |
|
"loss": 0.1123, |
|
"step": 3730 |
|
}, |
|
{ |
|
"epoch": 162.6086956521739, |
|
"grad_norm": 11.196767807006836, |
|
"learning_rate": 3.739130434782609e-06, |
|
"loss": 0.1112, |
|
"step": 3740 |
|
}, |
|
{ |
|
"epoch": 163.0, |
|
"eval_accuracy": 0.7401574803149606, |
|
"eval_loss": 1.164740800857544, |
|
"eval_runtime": 5.9944, |
|
"eval_samples_per_second": 42.373, |
|
"eval_steps_per_second": 0.667, |
|
"step": 3749 |
|
}, |
|
{ |
|
"epoch": 163.04347826086956, |
|
"grad_norm": 8.430397987365723, |
|
"learning_rate": 3.6956521739130436e-06, |
|
"loss": 0.1113, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 163.47826086956522, |
|
"grad_norm": 4.709465980529785, |
|
"learning_rate": 3.6521739130434787e-06, |
|
"loss": 0.1078, |
|
"step": 3760 |
|
}, |
|
{ |
|
"epoch": 163.91304347826087, |
|
"grad_norm": 13.146780014038086, |
|
"learning_rate": 3.6086956521739134e-06, |
|
"loss": 0.1042, |
|
"step": 3770 |
|
}, |
|
{ |
|
"epoch": 164.0, |
|
"eval_accuracy": 0.7440944881889764, |
|
"eval_loss": 1.252306342124939, |
|
"eval_runtime": 5.2297, |
|
"eval_samples_per_second": 48.569, |
|
"eval_steps_per_second": 0.765, |
|
"step": 3772 |
|
}, |
|
{ |
|
"epoch": 164.34782608695653, |
|
"grad_norm": 10.450764656066895, |
|
"learning_rate": 3.565217391304348e-06, |
|
"loss": 0.1549, |
|
"step": 3780 |
|
}, |
|
{ |
|
"epoch": 164.7826086956522, |
|
"grad_norm": 3.6140904426574707, |
|
"learning_rate": 3.5217391304347832e-06, |
|
"loss": 0.0993, |
|
"step": 3790 |
|
}, |
|
{ |
|
"epoch": 165.0, |
|
"eval_accuracy": 0.7401574803149606, |
|
"eval_loss": 1.2039833068847656, |
|
"eval_runtime": 6.3786, |
|
"eval_samples_per_second": 39.821, |
|
"eval_steps_per_second": 0.627, |
|
"step": 3795 |
|
}, |
|
{ |
|
"epoch": 165.2173913043478, |
|
"grad_norm": 12.08646297454834, |
|
"learning_rate": 3.4782608695652175e-06, |
|
"loss": 0.1, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 165.65217391304347, |
|
"grad_norm": 19.647462844848633, |
|
"learning_rate": 3.4347826086956526e-06, |
|
"loss": 0.105, |
|
"step": 3810 |
|
}, |
|
{ |
|
"epoch": 166.0, |
|
"eval_accuracy": 0.7401574803149606, |
|
"eval_loss": 1.2296301126480103, |
|
"eval_runtime": 6.2193, |
|
"eval_samples_per_second": 40.84, |
|
"eval_steps_per_second": 0.643, |
|
"step": 3818 |
|
}, |
|
{ |
|
"epoch": 166.08695652173913, |
|
"grad_norm": 14.596138000488281, |
|
"learning_rate": 3.391304347826087e-06, |
|
"loss": 0.0947, |
|
"step": 3820 |
|
}, |
|
{ |
|
"epoch": 166.52173913043478, |
|
"grad_norm": 12.743565559387207, |
|
"learning_rate": 3.347826086956522e-06, |
|
"loss": 0.1144, |
|
"step": 3830 |
|
}, |
|
{ |
|
"epoch": 166.95652173913044, |
|
"grad_norm": 2.257753610610962, |
|
"learning_rate": 3.3043478260869567e-06, |
|
"loss": 0.1071, |
|
"step": 3840 |
|
}, |
|
{ |
|
"epoch": 167.0, |
|
"eval_accuracy": 0.7480314960629921, |
|
"eval_loss": 1.286328673362732, |
|
"eval_runtime": 6.2909, |
|
"eval_samples_per_second": 40.376, |
|
"eval_steps_per_second": 0.636, |
|
"step": 3841 |
|
}, |
|
{ |
|
"epoch": 167.3913043478261, |
|
"grad_norm": 9.273666381835938, |
|
"learning_rate": 3.2608695652173914e-06, |
|
"loss": 0.11, |
|
"step": 3850 |
|
}, |
|
{ |
|
"epoch": 167.82608695652175, |
|
"grad_norm": 9.474180221557617, |
|
"learning_rate": 3.217391304347826e-06, |
|
"loss": 0.108, |
|
"step": 3860 |
|
}, |
|
{ |
|
"epoch": 168.0, |
|
"eval_accuracy": 0.7440944881889764, |
|
"eval_loss": 1.2372475862503052, |
|
"eval_runtime": 6.1145, |
|
"eval_samples_per_second": 41.541, |
|
"eval_steps_per_second": 0.654, |
|
"step": 3864 |
|
}, |
|
{ |
|
"epoch": 168.2608695652174, |
|
"grad_norm": 9.455063819885254, |
|
"learning_rate": 3.1739130434782613e-06, |
|
"loss": 0.0999, |
|
"step": 3870 |
|
}, |
|
{ |
|
"epoch": 168.69565217391303, |
|
"grad_norm": 15.079169273376465, |
|
"learning_rate": 3.130434782608696e-06, |
|
"loss": 0.1076, |
|
"step": 3880 |
|
}, |
|
{ |
|
"epoch": 169.0, |
|
"eval_accuracy": 0.7480314960629921, |
|
"eval_loss": 1.1871860027313232, |
|
"eval_runtime": 5.0604, |
|
"eval_samples_per_second": 50.193, |
|
"eval_steps_per_second": 0.79, |
|
"step": 3887 |
|
}, |
|
{ |
|
"epoch": 169.1304347826087, |
|
"grad_norm": 12.766210556030273, |
|
"learning_rate": 3.0869565217391307e-06, |
|
"loss": 0.1003, |
|
"step": 3890 |
|
}, |
|
{ |
|
"epoch": 169.56521739130434, |
|
"grad_norm": 7.451639175415039, |
|
"learning_rate": 3.043478260869566e-06, |
|
"loss": 0.1141, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 170.0, |
|
"grad_norm": 23.210697174072266, |
|
"learning_rate": 3e-06, |
|
"loss": 0.1107, |
|
"step": 3910 |
|
}, |
|
{ |
|
"epoch": 170.0, |
|
"eval_accuracy": 0.7322834645669292, |
|
"eval_loss": 1.2354222536087036, |
|
"eval_runtime": 6.0785, |
|
"eval_samples_per_second": 41.787, |
|
"eval_steps_per_second": 0.658, |
|
"step": 3910 |
|
}, |
|
{ |
|
"epoch": 170.43478260869566, |
|
"grad_norm": 13.231768608093262, |
|
"learning_rate": 2.956521739130435e-06, |
|
"loss": 0.1264, |
|
"step": 3920 |
|
}, |
|
{ |
|
"epoch": 170.8695652173913, |
|
"grad_norm": 8.206528663635254, |
|
"learning_rate": 2.9130434782608695e-06, |
|
"loss": 0.1012, |
|
"step": 3930 |
|
}, |
|
{ |
|
"epoch": 171.0, |
|
"eval_accuracy": 0.7440944881889764, |
|
"eval_loss": 1.2104681730270386, |
|
"eval_runtime": 6.0246, |
|
"eval_samples_per_second": 42.161, |
|
"eval_steps_per_second": 0.664, |
|
"step": 3933 |
|
}, |
|
{ |
|
"epoch": 171.30434782608697, |
|
"grad_norm": 5.853540420532227, |
|
"learning_rate": 2.8695652173913046e-06, |
|
"loss": 0.0994, |
|
"step": 3940 |
|
}, |
|
{ |
|
"epoch": 171.7391304347826, |
|
"grad_norm": 23.807804107666016, |
|
"learning_rate": 2.8260869565217393e-06, |
|
"loss": 0.0918, |
|
"step": 3950 |
|
}, |
|
{ |
|
"epoch": 172.0, |
|
"eval_accuracy": 0.7440944881889764, |
|
"eval_loss": 1.2026124000549316, |
|
"eval_runtime": 6.0268, |
|
"eval_samples_per_second": 42.145, |
|
"eval_steps_per_second": 0.664, |
|
"step": 3956 |
|
}, |
|
{ |
|
"epoch": 172.17391304347825, |
|
"grad_norm": 21.178815841674805, |
|
"learning_rate": 2.782608695652174e-06, |
|
"loss": 0.1558, |
|
"step": 3960 |
|
}, |
|
{ |
|
"epoch": 172.6086956521739, |
|
"grad_norm": 16.011613845825195, |
|
"learning_rate": 2.7391304347826087e-06, |
|
"loss": 0.1043, |
|
"step": 3970 |
|
}, |
|
{ |
|
"epoch": 173.0, |
|
"eval_accuracy": 0.7559055118110236, |
|
"eval_loss": 1.2925167083740234, |
|
"eval_runtime": 6.0458, |
|
"eval_samples_per_second": 42.012, |
|
"eval_steps_per_second": 0.662, |
|
"step": 3979 |
|
}, |
|
{ |
|
"epoch": 173.04347826086956, |
|
"grad_norm": 11.591164588928223, |
|
"learning_rate": 2.695652173913044e-06, |
|
"loss": 0.1244, |
|
"step": 3980 |
|
}, |
|
{ |
|
"epoch": 173.47826086956522, |
|
"grad_norm": 4.560230255126953, |
|
"learning_rate": 2.6521739130434785e-06, |
|
"loss": 0.1179, |
|
"step": 3990 |
|
}, |
|
{ |
|
"epoch": 173.91304347826087, |
|
"grad_norm": 4.175049304962158, |
|
"learning_rate": 2.6086956521739132e-06, |
|
"loss": 0.1035, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 174.0, |
|
"eval_accuracy": 0.7401574803149606, |
|
"eval_loss": 1.2313941717147827, |
|
"eval_runtime": 5.123, |
|
"eval_samples_per_second": 49.58, |
|
"eval_steps_per_second": 0.781, |
|
"step": 4002 |
|
}, |
|
{ |
|
"epoch": 174.34782608695653, |
|
"grad_norm": 23.1351375579834, |
|
"learning_rate": 2.5652173913043484e-06, |
|
"loss": 0.1087, |
|
"step": 4010 |
|
}, |
|
{ |
|
"epoch": 174.7826086956522, |
|
"grad_norm": 16.3070068359375, |
|
"learning_rate": 2.5217391304347826e-06, |
|
"loss": 0.1101, |
|
"step": 4020 |
|
}, |
|
{ |
|
"epoch": 175.0, |
|
"eval_accuracy": 0.7440944881889764, |
|
"eval_loss": 1.194298267364502, |
|
"eval_runtime": 5.4034, |
|
"eval_samples_per_second": 47.008, |
|
"eval_steps_per_second": 0.74, |
|
"step": 4025 |
|
}, |
|
{ |
|
"epoch": 175.2173913043478, |
|
"grad_norm": 24.940147399902344, |
|
"learning_rate": 2.4782608695652178e-06, |
|
"loss": 0.1229, |
|
"step": 4030 |
|
}, |
|
{ |
|
"epoch": 175.65217391304347, |
|
"grad_norm": 12.233097076416016, |
|
"learning_rate": 2.4347826086956525e-06, |
|
"loss": 0.1084, |
|
"step": 4040 |
|
}, |
|
{ |
|
"epoch": 176.0, |
|
"eval_accuracy": 0.7362204724409449, |
|
"eval_loss": 1.2069385051727295, |
|
"eval_runtime": 5.5007, |
|
"eval_samples_per_second": 46.176, |
|
"eval_steps_per_second": 0.727, |
|
"step": 4048 |
|
}, |
|
{ |
|
"epoch": 176.08695652173913, |
|
"grad_norm": 2.12101149559021, |
|
"learning_rate": 2.391304347826087e-06, |
|
"loss": 0.1072, |
|
"step": 4050 |
|
}, |
|
{ |
|
"epoch": 176.52173913043478, |
|
"grad_norm": 6.768868446350098, |
|
"learning_rate": 2.347826086956522e-06, |
|
"loss": 0.1174, |
|
"step": 4060 |
|
}, |
|
{ |
|
"epoch": 176.95652173913044, |
|
"grad_norm": 15.918899536132812, |
|
"learning_rate": 2.3043478260869566e-06, |
|
"loss": 0.1247, |
|
"step": 4070 |
|
}, |
|
{ |
|
"epoch": 177.0, |
|
"eval_accuracy": 0.7519685039370079, |
|
"eval_loss": 1.2303253412246704, |
|
"eval_runtime": 5.7, |
|
"eval_samples_per_second": 44.561, |
|
"eval_steps_per_second": 0.702, |
|
"step": 4071 |
|
}, |
|
{ |
|
"epoch": 177.3913043478261, |
|
"grad_norm": 19.0693416595459, |
|
"learning_rate": 2.2608695652173913e-06, |
|
"loss": 0.1032, |
|
"step": 4080 |
|
}, |
|
{ |
|
"epoch": 177.82608695652175, |
|
"grad_norm": 21.971101760864258, |
|
"learning_rate": 2.2173913043478264e-06, |
|
"loss": 0.1278, |
|
"step": 4090 |
|
}, |
|
{ |
|
"epoch": 178.0, |
|
"eval_accuracy": 0.7480314960629921, |
|
"eval_loss": 1.2118239402770996, |
|
"eval_runtime": 5.8876, |
|
"eval_samples_per_second": 43.142, |
|
"eval_steps_per_second": 0.679, |
|
"step": 4094 |
|
}, |
|
{ |
|
"epoch": 178.2608695652174, |
|
"grad_norm": 9.593711853027344, |
|
"learning_rate": 2.173913043478261e-06, |
|
"loss": 0.1073, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 178.69565217391303, |
|
"grad_norm": 17.482452392578125, |
|
"learning_rate": 2.130434782608696e-06, |
|
"loss": 0.1117, |
|
"step": 4110 |
|
}, |
|
{ |
|
"epoch": 179.0, |
|
"eval_accuracy": 0.7480314960629921, |
|
"eval_loss": 1.2212954759597778, |
|
"eval_runtime": 6.0371, |
|
"eval_samples_per_second": 42.073, |
|
"eval_steps_per_second": 0.663, |
|
"step": 4117 |
|
}, |
|
{ |
|
"epoch": 179.1304347826087, |
|
"grad_norm": 11.529203414916992, |
|
"learning_rate": 2.0869565217391305e-06, |
|
"loss": 0.0907, |
|
"step": 4120 |
|
}, |
|
{ |
|
"epoch": 179.56521739130434, |
|
"grad_norm": 14.801724433898926, |
|
"learning_rate": 2.0434782608695656e-06, |
|
"loss": 0.1028, |
|
"step": 4130 |
|
}, |
|
{ |
|
"epoch": 180.0, |
|
"grad_norm": 12.524520874023438, |
|
"learning_rate": 2.0000000000000003e-06, |
|
"loss": 0.1123, |
|
"step": 4140 |
|
}, |
|
{ |
|
"epoch": 180.0, |
|
"eval_accuracy": 0.7480314960629921, |
|
"eval_loss": 1.2403359413146973, |
|
"eval_runtime": 5.0925, |
|
"eval_samples_per_second": 49.877, |
|
"eval_steps_per_second": 0.785, |
|
"step": 4140 |
|
}, |
|
{ |
|
"epoch": 180.43478260869566, |
|
"grad_norm": 16.31502914428711, |
|
"learning_rate": 1.956521739130435e-06, |
|
"loss": 0.1072, |
|
"step": 4150 |
|
}, |
|
{ |
|
"epoch": 180.8695652173913, |
|
"grad_norm": 10.127395629882812, |
|
"learning_rate": 1.9130434782608697e-06, |
|
"loss": 0.0918, |
|
"step": 4160 |
|
}, |
|
{ |
|
"epoch": 181.0, |
|
"eval_accuracy": 0.7440944881889764, |
|
"eval_loss": 1.1987248659133911, |
|
"eval_runtime": 6.0905, |
|
"eval_samples_per_second": 41.705, |
|
"eval_steps_per_second": 0.657, |
|
"step": 4163 |
|
}, |
|
{ |
|
"epoch": 181.30434782608697, |
|
"grad_norm": 9.865519523620605, |
|
"learning_rate": 1.8695652173913044e-06, |
|
"loss": 0.0797, |
|
"step": 4170 |
|
}, |
|
{ |
|
"epoch": 181.7391304347826, |
|
"grad_norm": 13.656352043151855, |
|
"learning_rate": 1.8260869565217394e-06, |
|
"loss": 0.0827, |
|
"step": 4180 |
|
}, |
|
{ |
|
"epoch": 182.0, |
|
"eval_accuracy": 0.7440944881889764, |
|
"eval_loss": 1.2357696294784546, |
|
"eval_runtime": 6.0801, |
|
"eval_samples_per_second": 41.776, |
|
"eval_steps_per_second": 0.658, |
|
"step": 4186 |
|
}, |
|
{ |
|
"epoch": 182.17391304347825, |
|
"grad_norm": 5.673104763031006, |
|
"learning_rate": 1.782608695652174e-06, |
|
"loss": 0.0948, |
|
"step": 4190 |
|
}, |
|
{ |
|
"epoch": 182.6086956521739, |
|
"grad_norm": 4.035088539123535, |
|
"learning_rate": 1.7391304347826088e-06, |
|
"loss": 0.0814, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 183.0, |
|
"eval_accuracy": 0.7440944881889764, |
|
"eval_loss": 1.260762333869934, |
|
"eval_runtime": 5.5795, |
|
"eval_samples_per_second": 45.523, |
|
"eval_steps_per_second": 0.717, |
|
"step": 4209 |
|
}, |
|
{ |
|
"epoch": 183.04347826086956, |
|
"grad_norm": 23.201988220214844, |
|
"learning_rate": 1.6956521739130435e-06, |
|
"loss": 0.1265, |
|
"step": 4210 |
|
}, |
|
{ |
|
"epoch": 183.47826086956522, |
|
"grad_norm": 16.737319946289062, |
|
"learning_rate": 1.6521739130434784e-06, |
|
"loss": 0.108, |
|
"step": 4220 |
|
}, |
|
{ |
|
"epoch": 183.91304347826087, |
|
"grad_norm": 10.299909591674805, |
|
"learning_rate": 1.608695652173913e-06, |
|
"loss": 0.0897, |
|
"step": 4230 |
|
}, |
|
{ |
|
"epoch": 184.0, |
|
"eval_accuracy": 0.7440944881889764, |
|
"eval_loss": 1.2369588613510132, |
|
"eval_runtime": 5.5508, |
|
"eval_samples_per_second": 45.759, |
|
"eval_steps_per_second": 0.721, |
|
"step": 4232 |
|
}, |
|
{ |
|
"epoch": 184.34782608695653, |
|
"grad_norm": 7.6501641273498535, |
|
"learning_rate": 1.565217391304348e-06, |
|
"loss": 0.0951, |
|
"step": 4240 |
|
}, |
|
{ |
|
"epoch": 184.7826086956522, |
|
"grad_norm": 12.67705249786377, |
|
"learning_rate": 1.521739130434783e-06, |
|
"loss": 0.1321, |
|
"step": 4250 |
|
}, |
|
{ |
|
"epoch": 185.0, |
|
"eval_accuracy": 0.7480314960629921, |
|
"eval_loss": 1.231702446937561, |
|
"eval_runtime": 5.603, |
|
"eval_samples_per_second": 45.333, |
|
"eval_steps_per_second": 0.714, |
|
"step": 4255 |
|
}, |
|
{ |
|
"epoch": 185.2173913043478, |
|
"grad_norm": 13.189390182495117, |
|
"learning_rate": 1.4782608695652176e-06, |
|
"loss": 0.1084, |
|
"step": 4260 |
|
}, |
|
{ |
|
"epoch": 185.65217391304347, |
|
"grad_norm": 14.240042686462402, |
|
"learning_rate": 1.4347826086956523e-06, |
|
"loss": 0.1194, |
|
"step": 4270 |
|
}, |
|
{ |
|
"epoch": 186.0, |
|
"eval_accuracy": 0.7440944881889764, |
|
"eval_loss": 1.228926658630371, |
|
"eval_runtime": 5.7824, |
|
"eval_samples_per_second": 43.927, |
|
"eval_steps_per_second": 0.692, |
|
"step": 4278 |
|
}, |
|
{ |
|
"epoch": 186.08695652173913, |
|
"grad_norm": 12.107095718383789, |
|
"learning_rate": 1.391304347826087e-06, |
|
"loss": 0.104, |
|
"step": 4280 |
|
}, |
|
{ |
|
"epoch": 186.52173913043478, |
|
"grad_norm": 11.538783073425293, |
|
"learning_rate": 1.347826086956522e-06, |
|
"loss": 0.1249, |
|
"step": 4290 |
|
}, |
|
{ |
|
"epoch": 186.95652173913044, |
|
"grad_norm": 12.4583740234375, |
|
"learning_rate": 1.3043478260869566e-06, |
|
"loss": 0.1154, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 187.0, |
|
"eval_accuracy": 0.7440944881889764, |
|
"eval_loss": 1.1963870525360107, |
|
"eval_runtime": 5.0691, |
|
"eval_samples_per_second": 50.107, |
|
"eval_steps_per_second": 0.789, |
|
"step": 4301 |
|
}, |
|
{ |
|
"epoch": 187.3913043478261, |
|
"grad_norm": 9.538721084594727, |
|
"learning_rate": 1.2608695652173913e-06, |
|
"loss": 0.1291, |
|
"step": 4310 |
|
}, |
|
{ |
|
"epoch": 187.82608695652175, |
|
"grad_norm": 12.623342514038086, |
|
"learning_rate": 1.2173913043478262e-06, |
|
"loss": 0.0964, |
|
"step": 4320 |
|
}, |
|
{ |
|
"epoch": 188.0, |
|
"eval_accuracy": 0.7440944881889764, |
|
"eval_loss": 1.200947642326355, |
|
"eval_runtime": 6.199, |
|
"eval_samples_per_second": 40.975, |
|
"eval_steps_per_second": 0.645, |
|
"step": 4324 |
|
}, |
|
{ |
|
"epoch": 188.2608695652174, |
|
"grad_norm": 12.05130386352539, |
|
"learning_rate": 1.173913043478261e-06, |
|
"loss": 0.0965, |
|
"step": 4330 |
|
}, |
|
{ |
|
"epoch": 188.69565217391303, |
|
"grad_norm": 9.74976921081543, |
|
"learning_rate": 1.1304347826086956e-06, |
|
"loss": 0.0903, |
|
"step": 4340 |
|
}, |
|
{ |
|
"epoch": 189.0, |
|
"eval_accuracy": 0.7440944881889764, |
|
"eval_loss": 1.212328314781189, |
|
"eval_runtime": 6.2152, |
|
"eval_samples_per_second": 40.868, |
|
"eval_steps_per_second": 0.644, |
|
"step": 4347 |
|
}, |
|
{ |
|
"epoch": 189.1304347826087, |
|
"grad_norm": 23.216026306152344, |
|
"learning_rate": 1.0869565217391306e-06, |
|
"loss": 0.114, |
|
"step": 4350 |
|
}, |
|
{ |
|
"epoch": 189.56521739130434, |
|
"grad_norm": 9.926918029785156, |
|
"learning_rate": 1.0434782608695653e-06, |
|
"loss": 0.0896, |
|
"step": 4360 |
|
}, |
|
{ |
|
"epoch": 190.0, |
|
"grad_norm": 21.126604080200195, |
|
"learning_rate": 1.0000000000000002e-06, |
|
"loss": 0.1174, |
|
"step": 4370 |
|
}, |
|
{ |
|
"epoch": 190.0, |
|
"eval_accuracy": 0.7440944881889764, |
|
"eval_loss": 1.233533263206482, |
|
"eval_runtime": 6.2079, |
|
"eval_samples_per_second": 40.916, |
|
"eval_steps_per_second": 0.644, |
|
"step": 4370 |
|
}, |
|
{ |
|
"epoch": 190.43478260869566, |
|
"grad_norm": 24.496295928955078, |
|
"learning_rate": 9.565217391304349e-07, |
|
"loss": 0.1433, |
|
"step": 4380 |
|
}, |
|
{ |
|
"epoch": 190.8695652173913, |
|
"grad_norm": 10.999076843261719, |
|
"learning_rate": 9.130434782608697e-07, |
|
"loss": 0.0846, |
|
"step": 4390 |
|
}, |
|
{ |
|
"epoch": 191.0, |
|
"eval_accuracy": 0.7440944881889764, |
|
"eval_loss": 1.2399306297302246, |
|
"eval_runtime": 5.1624, |
|
"eval_samples_per_second": 49.202, |
|
"eval_steps_per_second": 0.775, |
|
"step": 4393 |
|
}, |
|
{ |
|
"epoch": 191.30434782608697, |
|
"grad_norm": 16.923410415649414, |
|
"learning_rate": 8.695652173913044e-07, |
|
"loss": 0.0901, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 191.7391304347826, |
|
"grad_norm": 10.834136962890625, |
|
"learning_rate": 8.260869565217392e-07, |
|
"loss": 0.1073, |
|
"step": 4410 |
|
}, |
|
{ |
|
"epoch": 192.0, |
|
"eval_accuracy": 0.7440944881889764, |
|
"eval_loss": 1.243245244026184, |
|
"eval_runtime": 5.0755, |
|
"eval_samples_per_second": 50.044, |
|
"eval_steps_per_second": 0.788, |
|
"step": 4416 |
|
}, |
|
{ |
|
"epoch": 192.17391304347825, |
|
"grad_norm": 9.724759101867676, |
|
"learning_rate": 7.82608695652174e-07, |
|
"loss": 0.126, |
|
"step": 4420 |
|
}, |
|
{ |
|
"epoch": 192.6086956521739, |
|
"grad_norm": 6.699430465698242, |
|
"learning_rate": 7.391304347826088e-07, |
|
"loss": 0.0892, |
|
"step": 4430 |
|
}, |
|
{ |
|
"epoch": 193.0, |
|
"eval_accuracy": 0.7480314960629921, |
|
"eval_loss": 1.2603920698165894, |
|
"eval_runtime": 5.3314, |
|
"eval_samples_per_second": 47.643, |
|
"eval_steps_per_second": 0.75, |
|
"step": 4439 |
|
}, |
|
{ |
|
"epoch": 193.04347826086956, |
|
"grad_norm": 14.250825881958008, |
|
"learning_rate": 6.956521739130435e-07, |
|
"loss": 0.1211, |
|
"step": 4440 |
|
}, |
|
{ |
|
"epoch": 193.47826086956522, |
|
"grad_norm": 9.508411407470703, |
|
"learning_rate": 6.521739130434783e-07, |
|
"loss": 0.0994, |
|
"step": 4450 |
|
}, |
|
{ |
|
"epoch": 193.91304347826087, |
|
"grad_norm": 13.035717010498047, |
|
"learning_rate": 6.086956521739131e-07, |
|
"loss": 0.1158, |
|
"step": 4460 |
|
}, |
|
{ |
|
"epoch": 194.0, |
|
"eval_accuracy": 0.7480314960629921, |
|
"eval_loss": 1.2473300695419312, |
|
"eval_runtime": 5.0663, |
|
"eval_samples_per_second": 50.135, |
|
"eval_steps_per_second": 0.79, |
|
"step": 4462 |
|
}, |
|
{ |
|
"epoch": 194.34782608695653, |
|
"grad_norm": 15.420454978942871, |
|
"learning_rate": 5.652173913043478e-07, |
|
"loss": 0.1159, |
|
"step": 4470 |
|
}, |
|
{ |
|
"epoch": 194.7826086956522, |
|
"grad_norm": 26.569854736328125, |
|
"learning_rate": 5.217391304347826e-07, |
|
"loss": 0.1153, |
|
"step": 4480 |
|
}, |
|
{ |
|
"epoch": 195.0, |
|
"eval_accuracy": 0.7440944881889764, |
|
"eval_loss": 1.2267494201660156, |
|
"eval_runtime": 7.2318, |
|
"eval_samples_per_second": 35.123, |
|
"eval_steps_per_second": 0.553, |
|
"step": 4485 |
|
}, |
|
{ |
|
"epoch": 195.2173913043478, |
|
"grad_norm": 22.089872360229492, |
|
"learning_rate": 4.782608695652174e-07, |
|
"loss": 0.1009, |
|
"step": 4490 |
|
}, |
|
{ |
|
"epoch": 195.65217391304347, |
|
"grad_norm": 18.216550827026367, |
|
"learning_rate": 4.347826086956522e-07, |
|
"loss": 0.1208, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 196.0, |
|
"eval_accuracy": 0.7440944881889764, |
|
"eval_loss": 1.2177777290344238, |
|
"eval_runtime": 6.0473, |
|
"eval_samples_per_second": 42.002, |
|
"eval_steps_per_second": 0.661, |
|
"step": 4508 |
|
}, |
|
{ |
|
"epoch": 196.08695652173913, |
|
"grad_norm": 8.497892379760742, |
|
"learning_rate": 3.91304347826087e-07, |
|
"loss": 0.0903, |
|
"step": 4510 |
|
}, |
|
{ |
|
"epoch": 196.52173913043478, |
|
"grad_norm": 8.363734245300293, |
|
"learning_rate": 3.4782608695652175e-07, |
|
"loss": 0.0976, |
|
"step": 4520 |
|
}, |
|
{ |
|
"epoch": 196.95652173913044, |
|
"grad_norm": 11.241954803466797, |
|
"learning_rate": 3.0434782608695656e-07, |
|
"loss": 0.083, |
|
"step": 4530 |
|
}, |
|
{ |
|
"epoch": 197.0, |
|
"eval_accuracy": 0.7480314960629921, |
|
"eval_loss": 1.2145416736602783, |
|
"eval_runtime": 5.0766, |
|
"eval_samples_per_second": 50.034, |
|
"eval_steps_per_second": 0.788, |
|
"step": 4531 |
|
}, |
|
{ |
|
"epoch": 197.3913043478261, |
|
"grad_norm": 5.664416313171387, |
|
"learning_rate": 2.608695652173913e-07, |
|
"loss": 0.0773, |
|
"step": 4540 |
|
}, |
|
{ |
|
"epoch": 197.82608695652175, |
|
"grad_norm": 6.778021335601807, |
|
"learning_rate": 2.173913043478261e-07, |
|
"loss": 0.1331, |
|
"step": 4550 |
|
}, |
|
{ |
|
"epoch": 198.0, |
|
"eval_accuracy": 0.7440944881889764, |
|
"eval_loss": 1.2214672565460205, |
|
"eval_runtime": 5.8006, |
|
"eval_samples_per_second": 43.789, |
|
"eval_steps_per_second": 0.69, |
|
"step": 4554 |
|
}, |
|
{ |
|
"epoch": 198.2608695652174, |
|
"grad_norm": 19.4536075592041, |
|
"learning_rate": 1.7391304347826088e-07, |
|
"loss": 0.1095, |
|
"step": 4560 |
|
}, |
|
{ |
|
"epoch": 198.69565217391303, |
|
"grad_norm": 11.348213195800781, |
|
"learning_rate": 1.3043478260869566e-07, |
|
"loss": 0.0943, |
|
"step": 4570 |
|
}, |
|
{ |
|
"epoch": 199.0, |
|
"eval_accuracy": 0.7440944881889764, |
|
"eval_loss": 1.2238408327102661, |
|
"eval_runtime": 5.0996, |
|
"eval_samples_per_second": 49.808, |
|
"eval_steps_per_second": 0.784, |
|
"step": 4577 |
|
}, |
|
{ |
|
"epoch": 199.1304347826087, |
|
"grad_norm": 17.247846603393555, |
|
"learning_rate": 8.695652173913044e-08, |
|
"loss": 0.1049, |
|
"step": 4580 |
|
}, |
|
{ |
|
"epoch": 199.56521739130434, |
|
"grad_norm": 35.16756820678711, |
|
"learning_rate": 4.347826086956522e-08, |
|
"loss": 0.1164, |
|
"step": 4590 |
|
}, |
|
{ |
|
"epoch": 200.0, |
|
"grad_norm": 3.6230416297912598, |
|
"learning_rate": 0.0, |
|
"loss": 0.0926, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 200.0, |
|
"eval_accuracy": 0.7440944881889764, |
|
"eval_loss": 1.2236244678497314, |
|
"eval_runtime": 6.0673, |
|
"eval_samples_per_second": 41.864, |
|
"eval_steps_per_second": 0.659, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 200.0, |
|
"step": 4600, |
|
"total_flos": 7.223651244601344e+18, |
|
"train_loss": 0.17802202463150024, |
|
"train_runtime": 11399.934, |
|
"train_samples_per_second": 25.176, |
|
"train_steps_per_second": 0.404 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 4600, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 200, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 7.223651244601344e+18, |
|
"train_batch_size": 64, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|