|
{
|
|
"best_metric": 0.7724867724867724,
|
|
"best_model_checkpoint": "cont-vvt-gs-rot-flip-wtoken-f198-4.4-h768-t8.16.16\\checkpoint-1326",
|
|
"epoch": 24.035636363636364,
|
|
"eval_steps": 500,
|
|
"global_step": 5500,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"epoch": 0.0018181818181818182,
|
|
"grad_norm": 49.002803802490234,
|
|
"learning_rate": 3.636363636363636e-08,
|
|
"loss": 0.6813,
|
|
"step": 10
|
|
},
|
|
{
|
|
"epoch": 0.0036363636363636364,
|
|
"grad_norm": 13.134154319763184,
|
|
"learning_rate": 7.272727272727273e-08,
|
|
"loss": 0.6513,
|
|
"step": 20
|
|
},
|
|
{
|
|
"epoch": 0.005454545454545455,
|
|
"grad_norm": 59.23023223876953,
|
|
"learning_rate": 1.0909090909090908e-07,
|
|
"loss": 0.8467,
|
|
"step": 30
|
|
},
|
|
{
|
|
"epoch": 0.007272727272727273,
|
|
"grad_norm": 67.50173950195312,
|
|
"learning_rate": 1.4545454545454545e-07,
|
|
"loss": 0.9694,
|
|
"step": 40
|
|
},
|
|
{
|
|
"epoch": 0.00909090909090909,
|
|
"grad_norm": 16.99021339416504,
|
|
"learning_rate": 1.818181818181818e-07,
|
|
"loss": 0.4877,
|
|
"step": 50
|
|
},
|
|
{
|
|
"epoch": 0.01090909090909091,
|
|
"grad_norm": 25.76717758178711,
|
|
"learning_rate": 2.1818181818181815e-07,
|
|
"loss": 0.6456,
|
|
"step": 60
|
|
},
|
|
{
|
|
"epoch": 0.012727272727272728,
|
|
"grad_norm": 13.1780424118042,
|
|
"learning_rate": 2.5454545454545453e-07,
|
|
"loss": 0.7477,
|
|
"step": 70
|
|
},
|
|
{
|
|
"epoch": 0.014545454545454545,
|
|
"grad_norm": 16.92459487915039,
|
|
"learning_rate": 2.909090909090909e-07,
|
|
"loss": 0.5054,
|
|
"step": 80
|
|
},
|
|
{
|
|
"epoch": 0.016363636363636365,
|
|
"grad_norm": 13.13257122039795,
|
|
"learning_rate": 3.272727272727273e-07,
|
|
"loss": 0.8151,
|
|
"step": 90
|
|
},
|
|
{
|
|
"epoch": 0.01818181818181818,
|
|
"grad_norm": 31.874305725097656,
|
|
"learning_rate": 3.636363636363636e-07,
|
|
"loss": 0.5125,
|
|
"step": 100
|
|
},
|
|
{
|
|
"epoch": 0.02,
|
|
"grad_norm": 53.636810302734375,
|
|
"learning_rate": 4e-07,
|
|
"loss": 0.6853,
|
|
"step": 110
|
|
},
|
|
{
|
|
"epoch": 0.02181818181818182,
|
|
"grad_norm": 14.061793327331543,
|
|
"learning_rate": 4.363636363636363e-07,
|
|
"loss": 0.7581,
|
|
"step": 120
|
|
},
|
|
{
|
|
"epoch": 0.023636363636363636,
|
|
"grad_norm": 78.736572265625,
|
|
"learning_rate": 4.727272727272727e-07,
|
|
"loss": 0.7613,
|
|
"step": 130
|
|
},
|
|
{
|
|
"epoch": 0.025454545454545455,
|
|
"grad_norm": 93.31689453125,
|
|
"learning_rate": 5.090909090909091e-07,
|
|
"loss": 0.7684,
|
|
"step": 140
|
|
},
|
|
{
|
|
"epoch": 0.02727272727272727,
|
|
"grad_norm": 18.66179847717285,
|
|
"learning_rate": 5.454545454545454e-07,
|
|
"loss": 0.605,
|
|
"step": 150
|
|
},
|
|
{
|
|
"epoch": 0.02909090909090909,
|
|
"grad_norm": 90.2517318725586,
|
|
"learning_rate": 5.818181818181818e-07,
|
|
"loss": 0.8696,
|
|
"step": 160
|
|
},
|
|
{
|
|
"epoch": 0.03090909090909091,
|
|
"grad_norm": 7.189067840576172,
|
|
"learning_rate": 6.181818181818181e-07,
|
|
"loss": 0.6846,
|
|
"step": 170
|
|
},
|
|
{
|
|
"epoch": 0.03272727272727273,
|
|
"grad_norm": 16.438549041748047,
|
|
"learning_rate": 6.545454545454546e-07,
|
|
"loss": 0.9541,
|
|
"step": 180
|
|
},
|
|
{
|
|
"epoch": 0.034545454545454546,
|
|
"grad_norm": 16.495033264160156,
|
|
"learning_rate": 6.909090909090909e-07,
|
|
"loss": 0.4966,
|
|
"step": 190
|
|
},
|
|
{
|
|
"epoch": 0.03636363636363636,
|
|
"grad_norm": 37.456295013427734,
|
|
"learning_rate": 7.272727272727272e-07,
|
|
"loss": 0.7683,
|
|
"step": 200
|
|
},
|
|
{
|
|
"epoch": 0.038181818181818185,
|
|
"grad_norm": 25.14327049255371,
|
|
"learning_rate": 7.636363636363636e-07,
|
|
"loss": 0.6227,
|
|
"step": 210
|
|
},
|
|
{
|
|
"epoch": 0.04,
|
|
"grad_norm": 33.33948516845703,
|
|
"learning_rate": 8e-07,
|
|
"loss": 0.868,
|
|
"step": 220
|
|
},
|
|
{
|
|
"epoch": 0.04018181818181818,
|
|
"eval_accuracy": 0.6931216931216931,
|
|
"eval_loss": 0.7216398119926453,
|
|
"eval_runtime": 363.6588,
|
|
"eval_samples_per_second": 0.52,
|
|
"eval_steps_per_second": 0.132,
|
|
"step": 221
|
|
},
|
|
{
|
|
"epoch": 1.0016363636363637,
|
|
"grad_norm": 34.226806640625,
|
|
"learning_rate": 8.363636363636363e-07,
|
|
"loss": 0.6464,
|
|
"step": 230
|
|
},
|
|
{
|
|
"epoch": 1.0034545454545454,
|
|
"grad_norm": 20.59316635131836,
|
|
"learning_rate": 8.727272727272726e-07,
|
|
"loss": 0.5077,
|
|
"step": 240
|
|
},
|
|
{
|
|
"epoch": 1.0052727272727273,
|
|
"grad_norm": 50.398475646972656,
|
|
"learning_rate": 9.09090909090909e-07,
|
|
"loss": 0.6622,
|
|
"step": 250
|
|
},
|
|
{
|
|
"epoch": 1.007090909090909,
|
|
"grad_norm": 39.305477142333984,
|
|
"learning_rate": 9.454545454545454e-07,
|
|
"loss": 0.7465,
|
|
"step": 260
|
|
},
|
|
{
|
|
"epoch": 1.008909090909091,
|
|
"grad_norm": 64.7542495727539,
|
|
"learning_rate": 9.818181818181818e-07,
|
|
"loss": 0.8929,
|
|
"step": 270
|
|
},
|
|
{
|
|
"epoch": 1.0107272727272727,
|
|
"grad_norm": 19.077871322631836,
|
|
"learning_rate": 1.0181818181818181e-06,
|
|
"loss": 0.536,
|
|
"step": 280
|
|
},
|
|
{
|
|
"epoch": 1.0125454545454546,
|
|
"grad_norm": 15.371084213256836,
|
|
"learning_rate": 1.0545454545454544e-06,
|
|
"loss": 0.7061,
|
|
"step": 290
|
|
},
|
|
{
|
|
"epoch": 1.0143636363636364,
|
|
"grad_norm": 159.2889862060547,
|
|
"learning_rate": 1.0909090909090908e-06,
|
|
"loss": 0.707,
|
|
"step": 300
|
|
},
|
|
{
|
|
"epoch": 1.016181818181818,
|
|
"grad_norm": 41.80310821533203,
|
|
"learning_rate": 1.127272727272727e-06,
|
|
"loss": 0.7716,
|
|
"step": 310
|
|
},
|
|
{
|
|
"epoch": 1.018,
|
|
"grad_norm": 12.123551368713379,
|
|
"learning_rate": 1.1636363636363636e-06,
|
|
"loss": 0.576,
|
|
"step": 320
|
|
},
|
|
{
|
|
"epoch": 1.0198181818181817,
|
|
"grad_norm": 50.106441497802734,
|
|
"learning_rate": 1.2e-06,
|
|
"loss": 0.6453,
|
|
"step": 330
|
|
},
|
|
{
|
|
"epoch": 1.0216363636363637,
|
|
"grad_norm": 24.299083709716797,
|
|
"learning_rate": 1.2363636363636363e-06,
|
|
"loss": 0.4867,
|
|
"step": 340
|
|
},
|
|
{
|
|
"epoch": 1.0234545454545454,
|
|
"grad_norm": 14.6795654296875,
|
|
"learning_rate": 1.2727272727272726e-06,
|
|
"loss": 0.7023,
|
|
"step": 350
|
|
},
|
|
{
|
|
"epoch": 1.0252727272727273,
|
|
"grad_norm": 13.851099014282227,
|
|
"learning_rate": 1.3090909090909091e-06,
|
|
"loss": 0.4368,
|
|
"step": 360
|
|
},
|
|
{
|
|
"epoch": 1.027090909090909,
|
|
"grad_norm": 48.12266540527344,
|
|
"learning_rate": 1.3454545454545455e-06,
|
|
"loss": 0.6924,
|
|
"step": 370
|
|
},
|
|
{
|
|
"epoch": 1.028909090909091,
|
|
"grad_norm": 43.901878356933594,
|
|
"learning_rate": 1.3818181818181818e-06,
|
|
"loss": 0.6913,
|
|
"step": 380
|
|
},
|
|
{
|
|
"epoch": 1.0307272727272727,
|
|
"grad_norm": 26.986379623413086,
|
|
"learning_rate": 1.418181818181818e-06,
|
|
"loss": 0.5203,
|
|
"step": 390
|
|
},
|
|
{
|
|
"epoch": 1.0325454545454544,
|
|
"grad_norm": 15.282154083251953,
|
|
"learning_rate": 1.4545454545454544e-06,
|
|
"loss": 0.8971,
|
|
"step": 400
|
|
},
|
|
{
|
|
"epoch": 1.0343636363636364,
|
|
"grad_norm": 7.374248504638672,
|
|
"learning_rate": 1.490909090909091e-06,
|
|
"loss": 0.5316,
|
|
"step": 410
|
|
},
|
|
{
|
|
"epoch": 1.036181818181818,
|
|
"grad_norm": 101.63275909423828,
|
|
"learning_rate": 1.5272727272727273e-06,
|
|
"loss": 0.9083,
|
|
"step": 420
|
|
},
|
|
{
|
|
"epoch": 1.038,
|
|
"grad_norm": 84.98663330078125,
|
|
"learning_rate": 1.5636363636363636e-06,
|
|
"loss": 0.8096,
|
|
"step": 430
|
|
},
|
|
{
|
|
"epoch": 1.0398181818181818,
|
|
"grad_norm": 47.578224182128906,
|
|
"learning_rate": 1.6e-06,
|
|
"loss": 0.715,
|
|
"step": 440
|
|
},
|
|
{
|
|
"epoch": 1.040181818181818,
|
|
"eval_accuracy": 0.7037037037037037,
|
|
"eval_loss": 0.6954008340835571,
|
|
"eval_runtime": 327.8952,
|
|
"eval_samples_per_second": 0.576,
|
|
"eval_steps_per_second": 0.146,
|
|
"step": 442
|
|
},
|
|
{
|
|
"epoch": 2.0014545454545454,
|
|
"grad_norm": 15.013214111328125,
|
|
"learning_rate": 1.6363636363636365e-06,
|
|
"loss": 0.6216,
|
|
"step": 450
|
|
},
|
|
{
|
|
"epoch": 2.0032727272727273,
|
|
"grad_norm": 24.56602668762207,
|
|
"learning_rate": 1.6727272727272726e-06,
|
|
"loss": 0.7069,
|
|
"step": 460
|
|
},
|
|
{
|
|
"epoch": 2.0050909090909093,
|
|
"grad_norm": 11.838835716247559,
|
|
"learning_rate": 1.709090909090909e-06,
|
|
"loss": 0.6106,
|
|
"step": 470
|
|
},
|
|
{
|
|
"epoch": 2.0069090909090908,
|
|
"grad_norm": 30.001951217651367,
|
|
"learning_rate": 1.7454545454545452e-06,
|
|
"loss": 0.7656,
|
|
"step": 480
|
|
},
|
|
{
|
|
"epoch": 2.0087272727272727,
|
|
"grad_norm": 112.91593170166016,
|
|
"learning_rate": 1.7818181818181818e-06,
|
|
"loss": 0.5663,
|
|
"step": 490
|
|
},
|
|
{
|
|
"epoch": 2.0105454545454546,
|
|
"grad_norm": 54.21142578125,
|
|
"learning_rate": 1.818181818181818e-06,
|
|
"loss": 0.6317,
|
|
"step": 500
|
|
},
|
|
{
|
|
"epoch": 2.0123636363636366,
|
|
"grad_norm": 30.909801483154297,
|
|
"learning_rate": 1.8545454545454544e-06,
|
|
"loss": 0.5805,
|
|
"step": 510
|
|
},
|
|
{
|
|
"epoch": 2.014181818181818,
|
|
"grad_norm": 15.627546310424805,
|
|
"learning_rate": 1.8909090909090907e-06,
|
|
"loss": 0.626,
|
|
"step": 520
|
|
},
|
|
{
|
|
"epoch": 2.016,
|
|
"grad_norm": 29.413007736206055,
|
|
"learning_rate": 1.9272727272727273e-06,
|
|
"loss": 0.7046,
|
|
"step": 530
|
|
},
|
|
{
|
|
"epoch": 2.017818181818182,
|
|
"grad_norm": 51.5157356262207,
|
|
"learning_rate": 1.9636363636363636e-06,
|
|
"loss": 0.6777,
|
|
"step": 540
|
|
},
|
|
{
|
|
"epoch": 2.0196363636363635,
|
|
"grad_norm": 22.87698745727539,
|
|
"learning_rate": 2e-06,
|
|
"loss": 1.012,
|
|
"step": 550
|
|
},
|
|
{
|
|
"epoch": 2.0214545454545454,
|
|
"grad_norm": 36.929908752441406,
|
|
"learning_rate": 1.9959595959595957e-06,
|
|
"loss": 0.6515,
|
|
"step": 560
|
|
},
|
|
{
|
|
"epoch": 2.0232727272727273,
|
|
"grad_norm": 49.41072082519531,
|
|
"learning_rate": 1.991919191919192e-06,
|
|
"loss": 0.7108,
|
|
"step": 570
|
|
},
|
|
{
|
|
"epoch": 2.0250909090909093,
|
|
"grad_norm": 64.81523895263672,
|
|
"learning_rate": 1.9878787878787877e-06,
|
|
"loss": 0.5875,
|
|
"step": 580
|
|
},
|
|
{
|
|
"epoch": 2.0269090909090908,
|
|
"grad_norm": 30.34855842590332,
|
|
"learning_rate": 1.9838383838383834e-06,
|
|
"loss": 0.8011,
|
|
"step": 590
|
|
},
|
|
{
|
|
"epoch": 2.0287272727272727,
|
|
"grad_norm": 128.04660034179688,
|
|
"learning_rate": 1.9797979797979796e-06,
|
|
"loss": 0.8472,
|
|
"step": 600
|
|
},
|
|
{
|
|
"epoch": 2.0305454545454547,
|
|
"grad_norm": 21.108116149902344,
|
|
"learning_rate": 1.975757575757576e-06,
|
|
"loss": 0.6098,
|
|
"step": 610
|
|
},
|
|
{
|
|
"epoch": 2.032363636363636,
|
|
"grad_norm": 9.428853034973145,
|
|
"learning_rate": 1.9717171717171716e-06,
|
|
"loss": 0.6362,
|
|
"step": 620
|
|
},
|
|
{
|
|
"epoch": 2.034181818181818,
|
|
"grad_norm": 78.94214630126953,
|
|
"learning_rate": 1.967676767676768e-06,
|
|
"loss": 0.6845,
|
|
"step": 630
|
|
},
|
|
{
|
|
"epoch": 2.036,
|
|
"grad_norm": 66.98030090332031,
|
|
"learning_rate": 1.9636363636363636e-06,
|
|
"loss": 0.5751,
|
|
"step": 640
|
|
},
|
|
{
|
|
"epoch": 2.037818181818182,
|
|
"grad_norm": 9.882628440856934,
|
|
"learning_rate": 1.9595959595959594e-06,
|
|
"loss": 0.4772,
|
|
"step": 650
|
|
},
|
|
{
|
|
"epoch": 2.0396363636363635,
|
|
"grad_norm": 33.11411666870117,
|
|
"learning_rate": 1.9555555555555556e-06,
|
|
"loss": 0.7041,
|
|
"step": 660
|
|
},
|
|
{
|
|
"epoch": 2.0401818181818183,
|
|
"eval_accuracy": 0.746031746031746,
|
|
"eval_loss": 0.7268804907798767,
|
|
"eval_runtime": 273.7511,
|
|
"eval_samples_per_second": 0.69,
|
|
"eval_steps_per_second": 0.175,
|
|
"step": 663
|
|
},
|
|
{
|
|
"epoch": 3.001272727272727,
|
|
"grad_norm": 12.610950469970703,
|
|
"learning_rate": 1.9515151515151513e-06,
|
|
"loss": 0.752,
|
|
"step": 670
|
|
},
|
|
{
|
|
"epoch": 3.003090909090909,
|
|
"grad_norm": 17.624067306518555,
|
|
"learning_rate": 1.9474747474747475e-06,
|
|
"loss": 0.5567,
|
|
"step": 680
|
|
},
|
|
{
|
|
"epoch": 3.004909090909091,
|
|
"grad_norm": 73.55494689941406,
|
|
"learning_rate": 1.9434343434343433e-06,
|
|
"loss": 0.6787,
|
|
"step": 690
|
|
},
|
|
{
|
|
"epoch": 3.006727272727273,
|
|
"grad_norm": 17.460477828979492,
|
|
"learning_rate": 1.9393939393939395e-06,
|
|
"loss": 0.6622,
|
|
"step": 700
|
|
},
|
|
{
|
|
"epoch": 3.0085454545454544,
|
|
"grad_norm": 29.736553192138672,
|
|
"learning_rate": 1.9353535353535353e-06,
|
|
"loss": 0.5926,
|
|
"step": 710
|
|
},
|
|
{
|
|
"epoch": 3.0103636363636364,
|
|
"grad_norm": 15.828145027160645,
|
|
"learning_rate": 1.931313131313131e-06,
|
|
"loss": 0.615,
|
|
"step": 720
|
|
},
|
|
{
|
|
"epoch": 3.0121818181818183,
|
|
"grad_norm": 20.407047271728516,
|
|
"learning_rate": 1.9272727272727273e-06,
|
|
"loss": 0.8663,
|
|
"step": 730
|
|
},
|
|
{
|
|
"epoch": 3.014,
|
|
"grad_norm": 23.92081642150879,
|
|
"learning_rate": 1.923232323232323e-06,
|
|
"loss": 0.7223,
|
|
"step": 740
|
|
},
|
|
{
|
|
"epoch": 3.0158181818181817,
|
|
"grad_norm": 16.4814395904541,
|
|
"learning_rate": 1.9191919191919192e-06,
|
|
"loss": 0.6063,
|
|
"step": 750
|
|
},
|
|
{
|
|
"epoch": 3.0176363636363637,
|
|
"grad_norm": 28.583906173706055,
|
|
"learning_rate": 1.915151515151515e-06,
|
|
"loss": 0.6109,
|
|
"step": 760
|
|
},
|
|
{
|
|
"epoch": 3.0194545454545456,
|
|
"grad_norm": 83.84841918945312,
|
|
"learning_rate": 1.9111111111111112e-06,
|
|
"loss": 0.8352,
|
|
"step": 770
|
|
},
|
|
{
|
|
"epoch": 3.021272727272727,
|
|
"grad_norm": 158.23809814453125,
|
|
"learning_rate": 1.907070707070707e-06,
|
|
"loss": 0.7769,
|
|
"step": 780
|
|
},
|
|
{
|
|
"epoch": 3.023090909090909,
|
|
"grad_norm": 13.608480453491211,
|
|
"learning_rate": 1.903030303030303e-06,
|
|
"loss": 0.5668,
|
|
"step": 790
|
|
},
|
|
{
|
|
"epoch": 3.024909090909091,
|
|
"grad_norm": 22.877429962158203,
|
|
"learning_rate": 1.898989898989899e-06,
|
|
"loss": 0.797,
|
|
"step": 800
|
|
},
|
|
{
|
|
"epoch": 3.026727272727273,
|
|
"grad_norm": 25.291210174560547,
|
|
"learning_rate": 1.894949494949495e-06,
|
|
"loss": 0.5308,
|
|
"step": 810
|
|
},
|
|
{
|
|
"epoch": 3.0285454545454544,
|
|
"grad_norm": 37.03635025024414,
|
|
"learning_rate": 1.8909090909090907e-06,
|
|
"loss": 0.5151,
|
|
"step": 820
|
|
},
|
|
{
|
|
"epoch": 3.0303636363636364,
|
|
"grad_norm": 42.18269729614258,
|
|
"learning_rate": 1.8868686868686867e-06,
|
|
"loss": 0.4294,
|
|
"step": 830
|
|
},
|
|
{
|
|
"epoch": 3.0321818181818183,
|
|
"grad_norm": 93.38839721679688,
|
|
"learning_rate": 1.8828282828282827e-06,
|
|
"loss": 0.694,
|
|
"step": 840
|
|
},
|
|
{
|
|
"epoch": 3.034,
|
|
"grad_norm": 16.274932861328125,
|
|
"learning_rate": 1.878787878787879e-06,
|
|
"loss": 0.5724,
|
|
"step": 850
|
|
},
|
|
{
|
|
"epoch": 3.0358181818181817,
|
|
"grad_norm": 33.26854705810547,
|
|
"learning_rate": 1.8747474747474747e-06,
|
|
"loss": 0.724,
|
|
"step": 860
|
|
},
|
|
{
|
|
"epoch": 3.0376363636363637,
|
|
"grad_norm": 57.42934799194336,
|
|
"learning_rate": 1.8707070707070707e-06,
|
|
"loss": 0.722,
|
|
"step": 870
|
|
},
|
|
{
|
|
"epoch": 3.0394545454545456,
|
|
"grad_norm": 35.24587631225586,
|
|
"learning_rate": 1.8666666666666667e-06,
|
|
"loss": 0.5228,
|
|
"step": 880
|
|
},
|
|
{
|
|
"epoch": 3.0401818181818183,
|
|
"eval_accuracy": 0.7407407407407407,
|
|
"eval_loss": 0.7158430814743042,
|
|
"eval_runtime": 248.3189,
|
|
"eval_samples_per_second": 0.761,
|
|
"eval_steps_per_second": 0.193,
|
|
"step": 884
|
|
},
|
|
{
|
|
"epoch": 4.001090909090909,
|
|
"grad_norm": 95.05870056152344,
|
|
"learning_rate": 1.8626262626262626e-06,
|
|
"loss": 0.6107,
|
|
"step": 890
|
|
},
|
|
{
|
|
"epoch": 4.002909090909091,
|
|
"grad_norm": 18.396896362304688,
|
|
"learning_rate": 1.8585858585858584e-06,
|
|
"loss": 0.7477,
|
|
"step": 900
|
|
},
|
|
{
|
|
"epoch": 4.004727272727273,
|
|
"grad_norm": 137.47760009765625,
|
|
"learning_rate": 1.8545454545454544e-06,
|
|
"loss": 0.5227,
|
|
"step": 910
|
|
},
|
|
{
|
|
"epoch": 4.006545454545455,
|
|
"grad_norm": 89.4850082397461,
|
|
"learning_rate": 1.8505050505050504e-06,
|
|
"loss": 0.615,
|
|
"step": 920
|
|
},
|
|
{
|
|
"epoch": 4.008363636363637,
|
|
"grad_norm": 23.125720977783203,
|
|
"learning_rate": 1.8464646464646464e-06,
|
|
"loss": 0.6965,
|
|
"step": 930
|
|
},
|
|
{
|
|
"epoch": 4.0101818181818185,
|
|
"grad_norm": 32.78513717651367,
|
|
"learning_rate": 1.8424242424242424e-06,
|
|
"loss": 0.6532,
|
|
"step": 940
|
|
},
|
|
{
|
|
"epoch": 4.012,
|
|
"grad_norm": 31.5473575592041,
|
|
"learning_rate": 1.8383838383838384e-06,
|
|
"loss": 0.7502,
|
|
"step": 950
|
|
},
|
|
{
|
|
"epoch": 4.0138181818181815,
|
|
"grad_norm": 38.10697555541992,
|
|
"learning_rate": 1.8343434343434343e-06,
|
|
"loss": 0.7254,
|
|
"step": 960
|
|
},
|
|
{
|
|
"epoch": 4.0156363636363634,
|
|
"grad_norm": 67.04145812988281,
|
|
"learning_rate": 1.8303030303030303e-06,
|
|
"loss": 0.6275,
|
|
"step": 970
|
|
},
|
|
{
|
|
"epoch": 4.017454545454545,
|
|
"grad_norm": 16.42119026184082,
|
|
"learning_rate": 1.826262626262626e-06,
|
|
"loss": 0.7814,
|
|
"step": 980
|
|
},
|
|
{
|
|
"epoch": 4.019272727272727,
|
|
"grad_norm": 9.567808151245117,
|
|
"learning_rate": 1.822222222222222e-06,
|
|
"loss": 0.9277,
|
|
"step": 990
|
|
},
|
|
{
|
|
"epoch": 4.021090909090909,
|
|
"grad_norm": 3.3725850582122803,
|
|
"learning_rate": 1.818181818181818e-06,
|
|
"loss": 0.5962,
|
|
"step": 1000
|
|
},
|
|
{
|
|
"epoch": 4.022909090909091,
|
|
"grad_norm": 159.82363891601562,
|
|
"learning_rate": 1.814141414141414e-06,
|
|
"loss": 0.7084,
|
|
"step": 1010
|
|
},
|
|
{
|
|
"epoch": 4.024727272727273,
|
|
"grad_norm": 39.41032028198242,
|
|
"learning_rate": 1.81010101010101e-06,
|
|
"loss": 0.7031,
|
|
"step": 1020
|
|
},
|
|
{
|
|
"epoch": 4.026545454545454,
|
|
"grad_norm": 63.62370681762695,
|
|
"learning_rate": 1.806060606060606e-06,
|
|
"loss": 0.6561,
|
|
"step": 1030
|
|
},
|
|
{
|
|
"epoch": 4.028363636363636,
|
|
"grad_norm": 124.5676498413086,
|
|
"learning_rate": 1.802020202020202e-06,
|
|
"loss": 0.7327,
|
|
"step": 1040
|
|
},
|
|
{
|
|
"epoch": 4.030181818181818,
|
|
"grad_norm": 56.569496154785156,
|
|
"learning_rate": 1.797979797979798e-06,
|
|
"loss": 0.5588,
|
|
"step": 1050
|
|
},
|
|
{
|
|
"epoch": 4.032,
|
|
"grad_norm": 70.69527435302734,
|
|
"learning_rate": 1.7939393939393938e-06,
|
|
"loss": 0.7602,
|
|
"step": 1060
|
|
},
|
|
{
|
|
"epoch": 4.033818181818182,
|
|
"grad_norm": 25.240571975708008,
|
|
"learning_rate": 1.7898989898989898e-06,
|
|
"loss": 0.5979,
|
|
"step": 1070
|
|
},
|
|
{
|
|
"epoch": 4.035636363636364,
|
|
"grad_norm": 48.645660400390625,
|
|
"learning_rate": 1.7858585858585858e-06,
|
|
"loss": 0.6254,
|
|
"step": 1080
|
|
},
|
|
{
|
|
"epoch": 4.037454545454546,
|
|
"grad_norm": 98.81543731689453,
|
|
"learning_rate": 1.7818181818181818e-06,
|
|
"loss": 0.8137,
|
|
"step": 1090
|
|
},
|
|
{
|
|
"epoch": 4.039272727272727,
|
|
"grad_norm": 74.22100067138672,
|
|
"learning_rate": 1.7777777777777775e-06,
|
|
"loss": 0.7336,
|
|
"step": 1100
|
|
},
|
|
{
|
|
"epoch": 4.040181818181818,
|
|
"eval_accuracy": 0.7195767195767195,
|
|
"eval_loss": 0.6832817792892456,
|
|
"eval_runtime": 222.8187,
|
|
"eval_samples_per_second": 0.848,
|
|
"eval_steps_per_second": 0.215,
|
|
"step": 1105
|
|
},
|
|
{
|
|
"epoch": 5.000909090909091,
|
|
"grad_norm": 20.993064880371094,
|
|
"learning_rate": 1.7737373737373737e-06,
|
|
"loss": 0.632,
|
|
"step": 1110
|
|
},
|
|
{
|
|
"epoch": 5.002727272727273,
|
|
"grad_norm": 177.10865783691406,
|
|
"learning_rate": 1.7696969696969697e-06,
|
|
"loss": 0.6758,
|
|
"step": 1120
|
|
},
|
|
{
|
|
"epoch": 5.004545454545455,
|
|
"grad_norm": 93.40233612060547,
|
|
"learning_rate": 1.7656565656565657e-06,
|
|
"loss": 0.7403,
|
|
"step": 1130
|
|
},
|
|
{
|
|
"epoch": 5.006363636363637,
|
|
"grad_norm": 81.70173645019531,
|
|
"learning_rate": 1.7616161616161615e-06,
|
|
"loss": 0.7051,
|
|
"step": 1140
|
|
},
|
|
{
|
|
"epoch": 5.008181818181818,
|
|
"grad_norm": 13.730420112609863,
|
|
"learning_rate": 1.7575757575757575e-06,
|
|
"loss": 0.6191,
|
|
"step": 1150
|
|
},
|
|
{
|
|
"epoch": 5.01,
|
|
"grad_norm": 130.99063110351562,
|
|
"learning_rate": 1.7535353535353535e-06,
|
|
"loss": 0.7902,
|
|
"step": 1160
|
|
},
|
|
{
|
|
"epoch": 5.011818181818182,
|
|
"grad_norm": 24.49544906616211,
|
|
"learning_rate": 1.7494949494949494e-06,
|
|
"loss": 0.6352,
|
|
"step": 1170
|
|
},
|
|
{
|
|
"epoch": 5.013636363636364,
|
|
"grad_norm": 42.5463981628418,
|
|
"learning_rate": 1.7454545454545452e-06,
|
|
"loss": 0.6941,
|
|
"step": 1180
|
|
},
|
|
{
|
|
"epoch": 5.015454545454546,
|
|
"grad_norm": 22.315645217895508,
|
|
"learning_rate": 1.7414141414141412e-06,
|
|
"loss": 0.6475,
|
|
"step": 1190
|
|
},
|
|
{
|
|
"epoch": 5.0172727272727276,
|
|
"grad_norm": 5.3459320068359375,
|
|
"learning_rate": 1.7373737373737374e-06,
|
|
"loss": 0.6624,
|
|
"step": 1200
|
|
},
|
|
{
|
|
"epoch": 5.0190909090909095,
|
|
"grad_norm": 26.51961898803711,
|
|
"learning_rate": 1.7333333333333334e-06,
|
|
"loss": 0.5881,
|
|
"step": 1210
|
|
},
|
|
{
|
|
"epoch": 5.0209090909090905,
|
|
"grad_norm": 20.37279510498047,
|
|
"learning_rate": 1.7292929292929292e-06,
|
|
"loss": 0.663,
|
|
"step": 1220
|
|
},
|
|
{
|
|
"epoch": 5.0227272727272725,
|
|
"grad_norm": 14.729677200317383,
|
|
"learning_rate": 1.7252525252525252e-06,
|
|
"loss": 0.7646,
|
|
"step": 1230
|
|
},
|
|
{
|
|
"epoch": 5.024545454545454,
|
|
"grad_norm": 48.52347183227539,
|
|
"learning_rate": 1.7212121212121211e-06,
|
|
"loss": 0.5498,
|
|
"step": 1240
|
|
},
|
|
{
|
|
"epoch": 5.026363636363636,
|
|
"grad_norm": 46.98148727416992,
|
|
"learning_rate": 1.7171717171717171e-06,
|
|
"loss": 0.3787,
|
|
"step": 1250
|
|
},
|
|
{
|
|
"epoch": 5.028181818181818,
|
|
"grad_norm": 57.40213394165039,
|
|
"learning_rate": 1.713131313131313e-06,
|
|
"loss": 0.6491,
|
|
"step": 1260
|
|
},
|
|
{
|
|
"epoch": 5.03,
|
|
"grad_norm": 43.548336029052734,
|
|
"learning_rate": 1.709090909090909e-06,
|
|
"loss": 0.7185,
|
|
"step": 1270
|
|
},
|
|
{
|
|
"epoch": 5.031818181818182,
|
|
"grad_norm": 12.414336204528809,
|
|
"learning_rate": 1.705050505050505e-06,
|
|
"loss": 0.6259,
|
|
"step": 1280
|
|
},
|
|
{
|
|
"epoch": 5.033636363636363,
|
|
"grad_norm": 55.376976013183594,
|
|
"learning_rate": 1.701010101010101e-06,
|
|
"loss": 0.5425,
|
|
"step": 1290
|
|
},
|
|
{
|
|
"epoch": 5.035454545454545,
|
|
"grad_norm": 34.820465087890625,
|
|
"learning_rate": 1.6969696969696969e-06,
|
|
"loss": 0.6265,
|
|
"step": 1300
|
|
},
|
|
{
|
|
"epoch": 5.037272727272727,
|
|
"grad_norm": 38.091888427734375,
|
|
"learning_rate": 1.6929292929292928e-06,
|
|
"loss": 0.484,
|
|
"step": 1310
|
|
},
|
|
{
|
|
"epoch": 5.039090909090909,
|
|
"grad_norm": 30.870681762695312,
|
|
"learning_rate": 1.6888888888888888e-06,
|
|
"loss": 0.5987,
|
|
"step": 1320
|
|
},
|
|
{
|
|
"epoch": 5.040181818181818,
|
|
"eval_accuracy": 0.7724867724867724,
|
|
"eval_loss": 0.6155162453651428,
|
|
"eval_runtime": 226.4639,
|
|
"eval_samples_per_second": 0.835,
|
|
"eval_steps_per_second": 0.212,
|
|
"step": 1326
|
|
},
|
|
{
|
|
"epoch": 6.000727272727273,
|
|
"grad_norm": 105.08300018310547,
|
|
"learning_rate": 1.6848484848484848e-06,
|
|
"loss": 0.8706,
|
|
"step": 1330
|
|
},
|
|
{
|
|
"epoch": 6.002545454545454,
|
|
"grad_norm": 47.137516021728516,
|
|
"learning_rate": 1.6808080808080806e-06,
|
|
"loss": 0.6063,
|
|
"step": 1340
|
|
},
|
|
{
|
|
"epoch": 6.004363636363636,
|
|
"grad_norm": 50.43033218383789,
|
|
"learning_rate": 1.6767676767676766e-06,
|
|
"loss": 0.6395,
|
|
"step": 1350
|
|
},
|
|
{
|
|
"epoch": 6.006181818181818,
|
|
"grad_norm": 13.420127868652344,
|
|
"learning_rate": 1.6727272727272726e-06,
|
|
"loss": 0.4272,
|
|
"step": 1360
|
|
},
|
|
{
|
|
"epoch": 6.008,
|
|
"grad_norm": 16.943889617919922,
|
|
"learning_rate": 1.6686868686868688e-06,
|
|
"loss": 0.4848,
|
|
"step": 1370
|
|
},
|
|
{
|
|
"epoch": 6.009818181818182,
|
|
"grad_norm": 31.666305541992188,
|
|
"learning_rate": 1.6646464646464645e-06,
|
|
"loss": 0.7149,
|
|
"step": 1380
|
|
},
|
|
{
|
|
"epoch": 6.011636363636364,
|
|
"grad_norm": 34.230838775634766,
|
|
"learning_rate": 1.6606060606060605e-06,
|
|
"loss": 0.6049,
|
|
"step": 1390
|
|
},
|
|
{
|
|
"epoch": 6.013454545454546,
|
|
"grad_norm": 36.60803985595703,
|
|
"learning_rate": 1.6565656565656565e-06,
|
|
"loss": 0.5049,
|
|
"step": 1400
|
|
},
|
|
{
|
|
"epoch": 6.015272727272727,
|
|
"grad_norm": 23.06334686279297,
|
|
"learning_rate": 1.6525252525252525e-06,
|
|
"loss": 0.4035,
|
|
"step": 1410
|
|
},
|
|
{
|
|
"epoch": 6.017090909090909,
|
|
"grad_norm": 78.13117218017578,
|
|
"learning_rate": 1.6484848484848483e-06,
|
|
"loss": 0.8025,
|
|
"step": 1420
|
|
},
|
|
{
|
|
"epoch": 6.018909090909091,
|
|
"grad_norm": 1.755617618560791,
|
|
"learning_rate": 1.6444444444444443e-06,
|
|
"loss": 0.7053,
|
|
"step": 1430
|
|
},
|
|
{
|
|
"epoch": 6.020727272727273,
|
|
"grad_norm": 30.082414627075195,
|
|
"learning_rate": 1.6404040404040403e-06,
|
|
"loss": 0.8521,
|
|
"step": 1440
|
|
},
|
|
{
|
|
"epoch": 6.022545454545455,
|
|
"grad_norm": 25.789148330688477,
|
|
"learning_rate": 1.6363636363636365e-06,
|
|
"loss": 0.4931,
|
|
"step": 1450
|
|
},
|
|
{
|
|
"epoch": 6.024363636363637,
|
|
"grad_norm": 49.93053436279297,
|
|
"learning_rate": 1.6323232323232322e-06,
|
|
"loss": 0.7615,
|
|
"step": 1460
|
|
},
|
|
{
|
|
"epoch": 6.0261818181818185,
|
|
"grad_norm": 7.271900177001953,
|
|
"learning_rate": 1.6282828282828282e-06,
|
|
"loss": 0.5762,
|
|
"step": 1470
|
|
},
|
|
{
|
|
"epoch": 6.028,
|
|
"grad_norm": 16.332393646240234,
|
|
"learning_rate": 1.6242424242424242e-06,
|
|
"loss": 0.8811,
|
|
"step": 1480
|
|
},
|
|
{
|
|
"epoch": 6.0298181818181815,
|
|
"grad_norm": 140.82994079589844,
|
|
"learning_rate": 1.6202020202020202e-06,
|
|
"loss": 0.6377,
|
|
"step": 1490
|
|
},
|
|
{
|
|
"epoch": 6.0316363636363635,
|
|
"grad_norm": 33.967838287353516,
|
|
"learning_rate": 1.616161616161616e-06,
|
|
"loss": 0.6096,
|
|
"step": 1500
|
|
},
|
|
{
|
|
"epoch": 6.033454545454545,
|
|
"grad_norm": 21.380468368530273,
|
|
"learning_rate": 1.612121212121212e-06,
|
|
"loss": 0.6112,
|
|
"step": 1510
|
|
},
|
|
{
|
|
"epoch": 6.035272727272727,
|
|
"grad_norm": 18.80341339111328,
|
|
"learning_rate": 1.608080808080808e-06,
|
|
"loss": 0.6322,
|
|
"step": 1520
|
|
},
|
|
{
|
|
"epoch": 6.037090909090909,
|
|
"grad_norm": 98.22761535644531,
|
|
"learning_rate": 1.604040404040404e-06,
|
|
"loss": 0.5789,
|
|
"step": 1530
|
|
},
|
|
{
|
|
"epoch": 6.038909090909091,
|
|
"grad_norm": 55.355403900146484,
|
|
"learning_rate": 1.6e-06,
|
|
"loss": 0.8574,
|
|
"step": 1540
|
|
},
|
|
{
|
|
"epoch": 6.040181818181818,
|
|
"eval_accuracy": 0.7301587301587301,
|
|
"eval_loss": 0.6600926518440247,
|
|
"eval_runtime": 224.4313,
|
|
"eval_samples_per_second": 0.842,
|
|
"eval_steps_per_second": 0.214,
|
|
"step": 1547
|
|
},
|
|
{
|
|
"epoch": 7.000545454545454,
|
|
"grad_norm": 45.622276306152344,
|
|
"learning_rate": 1.595959595959596e-06,
|
|
"loss": 0.4224,
|
|
"step": 1550
|
|
},
|
|
{
|
|
"epoch": 7.002363636363636,
|
|
"grad_norm": 61.306129455566406,
|
|
"learning_rate": 1.591919191919192e-06,
|
|
"loss": 0.6735,
|
|
"step": 1560
|
|
},
|
|
{
|
|
"epoch": 7.004181818181818,
|
|
"grad_norm": 22.80328941345215,
|
|
"learning_rate": 1.5878787878787879e-06,
|
|
"loss": 0.4944,
|
|
"step": 1570
|
|
},
|
|
{
|
|
"epoch": 7.006,
|
|
"grad_norm": 89.7556381225586,
|
|
"learning_rate": 1.5838383838383837e-06,
|
|
"loss": 0.5678,
|
|
"step": 1580
|
|
},
|
|
{
|
|
"epoch": 7.007818181818182,
|
|
"grad_norm": 28.243221282958984,
|
|
"learning_rate": 1.5797979797979797e-06,
|
|
"loss": 0.8799,
|
|
"step": 1590
|
|
},
|
|
{
|
|
"epoch": 7.009636363636363,
|
|
"grad_norm": 143.52423095703125,
|
|
"learning_rate": 1.5757575757575756e-06,
|
|
"loss": 0.4891,
|
|
"step": 1600
|
|
},
|
|
{
|
|
"epoch": 7.011454545454545,
|
|
"grad_norm": 43.01397705078125,
|
|
"learning_rate": 1.5717171717171716e-06,
|
|
"loss": 0.9218,
|
|
"step": 1610
|
|
},
|
|
{
|
|
"epoch": 7.013272727272727,
|
|
"grad_norm": 36.67337417602539,
|
|
"learning_rate": 1.5676767676767676e-06,
|
|
"loss": 0.7018,
|
|
"step": 1620
|
|
},
|
|
{
|
|
"epoch": 7.015090909090909,
|
|
"grad_norm": 96.1873779296875,
|
|
"learning_rate": 1.5636363636363636e-06,
|
|
"loss": 0.6009,
|
|
"step": 1630
|
|
},
|
|
{
|
|
"epoch": 7.016909090909091,
|
|
"grad_norm": 124.63884735107422,
|
|
"learning_rate": 1.5595959595959596e-06,
|
|
"loss": 0.9792,
|
|
"step": 1640
|
|
},
|
|
{
|
|
"epoch": 7.018727272727273,
|
|
"grad_norm": 75.55404663085938,
|
|
"learning_rate": 1.5555555555555556e-06,
|
|
"loss": 0.6503,
|
|
"step": 1650
|
|
},
|
|
{
|
|
"epoch": 7.020545454545455,
|
|
"grad_norm": 52.23248291015625,
|
|
"learning_rate": 1.5515151515151514e-06,
|
|
"loss": 0.9012,
|
|
"step": 1660
|
|
},
|
|
{
|
|
"epoch": 7.022363636363636,
|
|
"grad_norm": 16.753812789916992,
|
|
"learning_rate": 1.5474747474747473e-06,
|
|
"loss": 0.4862,
|
|
"step": 1670
|
|
},
|
|
{
|
|
"epoch": 7.024181818181818,
|
|
"grad_norm": 38.484703063964844,
|
|
"learning_rate": 1.5434343434343433e-06,
|
|
"loss": 0.5439,
|
|
"step": 1680
|
|
},
|
|
{
|
|
"epoch": 7.026,
|
|
"grad_norm": 11.493254661560059,
|
|
"learning_rate": 1.5393939393939393e-06,
|
|
"loss": 0.4938,
|
|
"step": 1690
|
|
},
|
|
{
|
|
"epoch": 7.027818181818182,
|
|
"grad_norm": 67.05599975585938,
|
|
"learning_rate": 1.535353535353535e-06,
|
|
"loss": 0.6632,
|
|
"step": 1700
|
|
},
|
|
{
|
|
"epoch": 7.029636363636364,
|
|
"grad_norm": 76.45126342773438,
|
|
"learning_rate": 1.5313131313131313e-06,
|
|
"loss": 0.6297,
|
|
"step": 1710
|
|
},
|
|
{
|
|
"epoch": 7.031454545454546,
|
|
"grad_norm": 99.13848876953125,
|
|
"learning_rate": 1.5272727272727273e-06,
|
|
"loss": 0.4693,
|
|
"step": 1720
|
|
},
|
|
{
|
|
"epoch": 7.033272727272728,
|
|
"grad_norm": 63.05826950073242,
|
|
"learning_rate": 1.5232323232323233e-06,
|
|
"loss": 0.7851,
|
|
"step": 1730
|
|
},
|
|
{
|
|
"epoch": 7.0350909090909095,
|
|
"grad_norm": 169.03659057617188,
|
|
"learning_rate": 1.519191919191919e-06,
|
|
"loss": 0.5195,
|
|
"step": 1740
|
|
},
|
|
{
|
|
"epoch": 7.036909090909091,
|
|
"grad_norm": 31.675315856933594,
|
|
"learning_rate": 1.515151515151515e-06,
|
|
"loss": 0.8175,
|
|
"step": 1750
|
|
},
|
|
{
|
|
"epoch": 7.0387272727272725,
|
|
"grad_norm": 15.309126853942871,
|
|
"learning_rate": 1.511111111111111e-06,
|
|
"loss": 0.6805,
|
|
"step": 1760
|
|
},
|
|
{
|
|
"epoch": 7.040181818181818,
|
|
"eval_accuracy": 0.746031746031746,
|
|
"eval_loss": 0.6374004483222961,
|
|
"eval_runtime": 259.6967,
|
|
"eval_samples_per_second": 0.728,
|
|
"eval_steps_per_second": 0.185,
|
|
"step": 1768
|
|
},
|
|
{
|
|
"epoch": 8.000363636363636,
|
|
"grad_norm": 28.315685272216797,
|
|
"learning_rate": 1.507070707070707e-06,
|
|
"loss": 0.7962,
|
|
"step": 1770
|
|
},
|
|
{
|
|
"epoch": 8.002181818181818,
|
|
"grad_norm": 52.250003814697266,
|
|
"learning_rate": 1.5030303030303028e-06,
|
|
"loss": 0.4635,
|
|
"step": 1780
|
|
},
|
|
{
|
|
"epoch": 8.004,
|
|
"grad_norm": 84.7441177368164,
|
|
"learning_rate": 1.498989898989899e-06,
|
|
"loss": 0.7817,
|
|
"step": 1790
|
|
},
|
|
{
|
|
"epoch": 8.005818181818182,
|
|
"grad_norm": 34.26324462890625,
|
|
"learning_rate": 1.494949494949495e-06,
|
|
"loss": 0.823,
|
|
"step": 1800
|
|
},
|
|
{
|
|
"epoch": 8.007636363636363,
|
|
"grad_norm": 39.26140594482422,
|
|
"learning_rate": 1.490909090909091e-06,
|
|
"loss": 0.5855,
|
|
"step": 1810
|
|
},
|
|
{
|
|
"epoch": 8.009454545454545,
|
|
"grad_norm": 15.307453155517578,
|
|
"learning_rate": 1.4868686868686867e-06,
|
|
"loss": 0.6542,
|
|
"step": 1820
|
|
},
|
|
{
|
|
"epoch": 8.011272727272727,
|
|
"grad_norm": 12.368428230285645,
|
|
"learning_rate": 1.4828282828282827e-06,
|
|
"loss": 0.64,
|
|
"step": 1830
|
|
},
|
|
{
|
|
"epoch": 8.01309090909091,
|
|
"grad_norm": 58.16361999511719,
|
|
"learning_rate": 1.4787878787878787e-06,
|
|
"loss": 0.6901,
|
|
"step": 1840
|
|
},
|
|
{
|
|
"epoch": 8.014909090909091,
|
|
"grad_norm": 78.3253402709961,
|
|
"learning_rate": 1.4747474747474747e-06,
|
|
"loss": 0.4596,
|
|
"step": 1850
|
|
},
|
|
{
|
|
"epoch": 8.016727272727273,
|
|
"grad_norm": 220.34986877441406,
|
|
"learning_rate": 1.4707070707070705e-06,
|
|
"loss": 0.6546,
|
|
"step": 1860
|
|
},
|
|
{
|
|
"epoch": 8.018545454545455,
|
|
"grad_norm": 4.556402683258057,
|
|
"learning_rate": 1.4666666666666665e-06,
|
|
"loss": 0.4136,
|
|
"step": 1870
|
|
},
|
|
{
|
|
"epoch": 8.020363636363637,
|
|
"grad_norm": 8.888373374938965,
|
|
"learning_rate": 1.4626262626262627e-06,
|
|
"loss": 0.6416,
|
|
"step": 1880
|
|
},
|
|
{
|
|
"epoch": 8.022181818181819,
|
|
"grad_norm": 58.61665344238281,
|
|
"learning_rate": 1.4585858585858586e-06,
|
|
"loss": 0.5725,
|
|
"step": 1890
|
|
},
|
|
{
|
|
"epoch": 8.024,
|
|
"grad_norm": 33.83262252807617,
|
|
"learning_rate": 1.4545454545454544e-06,
|
|
"loss": 0.6366,
|
|
"step": 1900
|
|
},
|
|
{
|
|
"epoch": 8.025818181818181,
|
|
"grad_norm": 53.01886749267578,
|
|
"learning_rate": 1.4505050505050504e-06,
|
|
"loss": 0.6309,
|
|
"step": 1910
|
|
},
|
|
{
|
|
"epoch": 8.027636363636363,
|
|
"grad_norm": 31.379945755004883,
|
|
"learning_rate": 1.4464646464646464e-06,
|
|
"loss": 0.7393,
|
|
"step": 1920
|
|
},
|
|
{
|
|
"epoch": 8.029454545454545,
|
|
"grad_norm": 15.79307746887207,
|
|
"learning_rate": 1.4424242424242424e-06,
|
|
"loss": 0.5232,
|
|
"step": 1930
|
|
},
|
|
{
|
|
"epoch": 8.031272727272727,
|
|
"grad_norm": 50.681663513183594,
|
|
"learning_rate": 1.4383838383838382e-06,
|
|
"loss": 0.6813,
|
|
"step": 1940
|
|
},
|
|
{
|
|
"epoch": 8.033090909090909,
|
|
"grad_norm": 109.0439682006836,
|
|
"learning_rate": 1.4343434343434341e-06,
|
|
"loss": 0.6733,
|
|
"step": 1950
|
|
},
|
|
{
|
|
"epoch": 8.03490909090909,
|
|
"grad_norm": 34.641143798828125,
|
|
"learning_rate": 1.4303030303030303e-06,
|
|
"loss": 0.7974,
|
|
"step": 1960
|
|
},
|
|
{
|
|
"epoch": 8.036727272727273,
|
|
"grad_norm": 66.88762664794922,
|
|
"learning_rate": 1.4262626262626263e-06,
|
|
"loss": 0.7551,
|
|
"step": 1970
|
|
},
|
|
{
|
|
"epoch": 8.038545454545455,
|
|
"grad_norm": 27.06598663330078,
|
|
"learning_rate": 1.4222222222222221e-06,
|
|
"loss": 0.8086,
|
|
"step": 1980
|
|
},
|
|
{
|
|
"epoch": 8.040181818181818,
|
|
"eval_accuracy": 0.6984126984126984,
|
|
"eval_loss": 0.689642608165741,
|
|
"eval_runtime": 235.3873,
|
|
"eval_samples_per_second": 0.803,
|
|
"eval_steps_per_second": 0.204,
|
|
"step": 1989
|
|
},
|
|
{
|
|
"epoch": 9.000181818181819,
|
|
"grad_norm": 68.73373413085938,
|
|
"learning_rate": 1.418181818181818e-06,
|
|
"loss": 0.7848,
|
|
"step": 1990
|
|
},
|
|
{
|
|
"epoch": 9.002,
|
|
"grad_norm": 187.6933135986328,
|
|
"learning_rate": 1.414141414141414e-06,
|
|
"loss": 0.4443,
|
|
"step": 2000
|
|
},
|
|
{
|
|
"epoch": 9.003818181818183,
|
|
"grad_norm": 99.57637786865234,
|
|
"learning_rate": 1.41010101010101e-06,
|
|
"loss": 0.5716,
|
|
"step": 2010
|
|
},
|
|
{
|
|
"epoch": 9.005636363636363,
|
|
"grad_norm": 8.004471778869629,
|
|
"learning_rate": 1.4060606060606058e-06,
|
|
"loss": 0.8678,
|
|
"step": 2020
|
|
},
|
|
{
|
|
"epoch": 9.007454545454545,
|
|
"grad_norm": 29.812477111816406,
|
|
"learning_rate": 1.4020202020202018e-06,
|
|
"loss": 0.6717,
|
|
"step": 2030
|
|
},
|
|
{
|
|
"epoch": 9.009272727272727,
|
|
"grad_norm": 38.03961944580078,
|
|
"learning_rate": 1.3979797979797978e-06,
|
|
"loss": 0.6911,
|
|
"step": 2040
|
|
},
|
|
{
|
|
"epoch": 9.011090909090909,
|
|
"grad_norm": 35.46464538574219,
|
|
"learning_rate": 1.393939393939394e-06,
|
|
"loss": 0.7812,
|
|
"step": 2050
|
|
},
|
|
{
|
|
"epoch": 9.01290909090909,
|
|
"grad_norm": 244.00631713867188,
|
|
"learning_rate": 1.3898989898989898e-06,
|
|
"loss": 0.4884,
|
|
"step": 2060
|
|
},
|
|
{
|
|
"epoch": 9.014727272727272,
|
|
"grad_norm": 78.77344512939453,
|
|
"learning_rate": 1.3858585858585858e-06,
|
|
"loss": 0.8511,
|
|
"step": 2070
|
|
},
|
|
{
|
|
"epoch": 9.016545454545454,
|
|
"grad_norm": 84.41845703125,
|
|
"learning_rate": 1.3818181818181818e-06,
|
|
"loss": 0.8896,
|
|
"step": 2080
|
|
},
|
|
{
|
|
"epoch": 9.018363636363636,
|
|
"grad_norm": 8.826834678649902,
|
|
"learning_rate": 1.3777777777777778e-06,
|
|
"loss": 0.4289,
|
|
"step": 2090
|
|
},
|
|
{
|
|
"epoch": 9.020181818181818,
|
|
"grad_norm": 69.07925415039062,
|
|
"learning_rate": 1.3737373737373735e-06,
|
|
"loss": 0.6058,
|
|
"step": 2100
|
|
},
|
|
{
|
|
"epoch": 9.022,
|
|
"grad_norm": 23.774072647094727,
|
|
"learning_rate": 1.3696969696969695e-06,
|
|
"loss": 0.4198,
|
|
"step": 2110
|
|
},
|
|
{
|
|
"epoch": 9.023818181818182,
|
|
"grad_norm": 39.78314971923828,
|
|
"learning_rate": 1.3656565656565655e-06,
|
|
"loss": 0.4463,
|
|
"step": 2120
|
|
},
|
|
{
|
|
"epoch": 9.025636363636364,
|
|
"grad_norm": 14.34803581237793,
|
|
"learning_rate": 1.3616161616161617e-06,
|
|
"loss": 0.6533,
|
|
"step": 2130
|
|
},
|
|
{
|
|
"epoch": 9.027454545454546,
|
|
"grad_norm": 38.20193099975586,
|
|
"learning_rate": 1.3575757575757577e-06,
|
|
"loss": 0.5529,
|
|
"step": 2140
|
|
},
|
|
{
|
|
"epoch": 9.029272727272728,
|
|
"grad_norm": 70.79397583007812,
|
|
"learning_rate": 1.3535353535353535e-06,
|
|
"loss": 0.7498,
|
|
"step": 2150
|
|
},
|
|
{
|
|
"epoch": 9.03109090909091,
|
|
"grad_norm": 7.273447036743164,
|
|
"learning_rate": 1.3494949494949495e-06,
|
|
"loss": 0.5858,
|
|
"step": 2160
|
|
},
|
|
{
|
|
"epoch": 9.03290909090909,
|
|
"grad_norm": 23.218799591064453,
|
|
"learning_rate": 1.3454545454545455e-06,
|
|
"loss": 0.66,
|
|
"step": 2170
|
|
},
|
|
{
|
|
"epoch": 9.034727272727272,
|
|
"grad_norm": 37.04141616821289,
|
|
"learning_rate": 1.3414141414141412e-06,
|
|
"loss": 0.5211,
|
|
"step": 2180
|
|
},
|
|
{
|
|
"epoch": 9.036545454545454,
|
|
"grad_norm": 103.82246398925781,
|
|
"learning_rate": 1.3373737373737372e-06,
|
|
"loss": 0.5164,
|
|
"step": 2190
|
|
},
|
|
{
|
|
"epoch": 9.038363636363636,
|
|
"grad_norm": 24.58924102783203,
|
|
"learning_rate": 1.3333333333333332e-06,
|
|
"loss": 0.6191,
|
|
"step": 2200
|
|
},
|
|
{
|
|
"epoch": 9.040181818181818,
|
|
"grad_norm": 13.224066734313965,
|
|
"learning_rate": 1.3292929292929292e-06,
|
|
"loss": 0.6552,
|
|
"step": 2210
|
|
},
|
|
{
|
|
"epoch": 9.040181818181818,
|
|
"eval_accuracy": 0.708994708994709,
|
|
"eval_loss": 0.6534519791603088,
|
|
"eval_runtime": 244.9159,
|
|
"eval_samples_per_second": 0.772,
|
|
"eval_steps_per_second": 0.196,
|
|
"step": 2210
|
|
},
|
|
{
|
|
"epoch": 10.001818181818182,
|
|
"grad_norm": 21.723186492919922,
|
|
"learning_rate": 1.3252525252525254e-06,
|
|
"loss": 0.6787,
|
|
"step": 2220
|
|
},
|
|
{
|
|
"epoch": 10.003636363636364,
|
|
"grad_norm": 27.678010940551758,
|
|
"learning_rate": 1.3212121212121212e-06,
|
|
"loss": 0.7535,
|
|
"step": 2230
|
|
},
|
|
{
|
|
"epoch": 10.005454545454546,
|
|
"grad_norm": 46.93525314331055,
|
|
"learning_rate": 1.3171717171717172e-06,
|
|
"loss": 0.6038,
|
|
"step": 2240
|
|
},
|
|
{
|
|
"epoch": 10.007272727272728,
|
|
"grad_norm": 76.83301544189453,
|
|
"learning_rate": 1.3131313131313131e-06,
|
|
"loss": 0.9054,
|
|
"step": 2250
|
|
},
|
|
{
|
|
"epoch": 10.00909090909091,
|
|
"grad_norm": 14.832221984863281,
|
|
"learning_rate": 1.3090909090909091e-06,
|
|
"loss": 0.5424,
|
|
"step": 2260
|
|
},
|
|
{
|
|
"epoch": 10.010909090909092,
|
|
"grad_norm": 73.36185455322266,
|
|
"learning_rate": 1.305050505050505e-06,
|
|
"loss": 0.4393,
|
|
"step": 2270
|
|
},
|
|
{
|
|
"epoch": 10.012727272727274,
|
|
"grad_norm": 94.55217742919922,
|
|
"learning_rate": 1.3010101010101009e-06,
|
|
"loss": 0.6569,
|
|
"step": 2280
|
|
},
|
|
{
|
|
"epoch": 10.014545454545454,
|
|
"grad_norm": 76.41736602783203,
|
|
"learning_rate": 1.2969696969696969e-06,
|
|
"loss": 0.8187,
|
|
"step": 2290
|
|
},
|
|
{
|
|
"epoch": 10.016363636363636,
|
|
"grad_norm": 39.47346496582031,
|
|
"learning_rate": 1.292929292929293e-06,
|
|
"loss": 0.5843,
|
|
"step": 2300
|
|
},
|
|
{
|
|
"epoch": 10.018181818181818,
|
|
"grad_norm": 80.64500427246094,
|
|
"learning_rate": 1.2888888888888889e-06,
|
|
"loss": 0.3888,
|
|
"step": 2310
|
|
},
|
|
{
|
|
"epoch": 10.02,
|
|
"grad_norm": 25.766429901123047,
|
|
"learning_rate": 1.2848484848484848e-06,
|
|
"loss": 1.0704,
|
|
"step": 2320
|
|
},
|
|
{
|
|
"epoch": 10.021818181818182,
|
|
"grad_norm": 24.26360321044922,
|
|
"learning_rate": 1.2808080808080808e-06,
|
|
"loss": 0.6087,
|
|
"step": 2330
|
|
},
|
|
{
|
|
"epoch": 10.023636363636363,
|
|
"grad_norm": 19.62248420715332,
|
|
"learning_rate": 1.2767676767676768e-06,
|
|
"loss": 0.5757,
|
|
"step": 2340
|
|
},
|
|
{
|
|
"epoch": 10.025454545454545,
|
|
"grad_norm": 39.63312530517578,
|
|
"learning_rate": 1.2727272727272726e-06,
|
|
"loss": 0.5182,
|
|
"step": 2350
|
|
},
|
|
{
|
|
"epoch": 10.027272727272727,
|
|
"grad_norm": 73.8565902709961,
|
|
"learning_rate": 1.2686868686868686e-06,
|
|
"loss": 0.819,
|
|
"step": 2360
|
|
},
|
|
{
|
|
"epoch": 10.02909090909091,
|
|
"grad_norm": 18.63445472717285,
|
|
"learning_rate": 1.2646464646464646e-06,
|
|
"loss": 0.6963,
|
|
"step": 2370
|
|
},
|
|
{
|
|
"epoch": 10.030909090909091,
|
|
"grad_norm": 67.73011779785156,
|
|
"learning_rate": 1.2606060606060606e-06,
|
|
"loss": 0.6698,
|
|
"step": 2380
|
|
},
|
|
{
|
|
"epoch": 10.032727272727273,
|
|
"grad_norm": 65.65953826904297,
|
|
"learning_rate": 1.2565656565656565e-06,
|
|
"loss": 0.762,
|
|
"step": 2390
|
|
},
|
|
{
|
|
"epoch": 10.034545454545455,
|
|
"grad_norm": 54.61469650268555,
|
|
"learning_rate": 1.2525252525252525e-06,
|
|
"loss": 0.617,
|
|
"step": 2400
|
|
},
|
|
{
|
|
"epoch": 10.036363636363637,
|
|
"grad_norm": 31.209501266479492,
|
|
"learning_rate": 1.2484848484848485e-06,
|
|
"loss": 0.5484,
|
|
"step": 2410
|
|
},
|
|
{
|
|
"epoch": 10.038181818181819,
|
|
"grad_norm": 30.616729736328125,
|
|
"learning_rate": 1.2444444444444445e-06,
|
|
"loss": 0.8238,
|
|
"step": 2420
|
|
},
|
|
{
|
|
"epoch": 10.04,
|
|
"grad_norm": 4.046667575836182,
|
|
"learning_rate": 1.2404040404040403e-06,
|
|
"loss": 0.7846,
|
|
"step": 2430
|
|
},
|
|
{
|
|
"epoch": 10.040181818181818,
|
|
"eval_accuracy": 0.7354497354497355,
|
|
"eval_loss": 0.6645750999450684,
|
|
"eval_runtime": 227.1176,
|
|
"eval_samples_per_second": 0.832,
|
|
"eval_steps_per_second": 0.211,
|
|
"step": 2431
|
|
},
|
|
{
|
|
"epoch": 11.001636363636363,
|
|
"grad_norm": 33.93498229980469,
|
|
"learning_rate": 1.2363636363636363e-06,
|
|
"loss": 0.8073,
|
|
"step": 2440
|
|
},
|
|
{
|
|
"epoch": 11.003454545454545,
|
|
"grad_norm": 32.35393524169922,
|
|
"learning_rate": 1.2323232323232323e-06,
|
|
"loss": 0.6641,
|
|
"step": 2450
|
|
},
|
|
{
|
|
"epoch": 11.005272727272727,
|
|
"grad_norm": 14.06725788116455,
|
|
"learning_rate": 1.2282828282828282e-06,
|
|
"loss": 0.7601,
|
|
"step": 2460
|
|
},
|
|
{
|
|
"epoch": 11.007090909090909,
|
|
"grad_norm": 17.507535934448242,
|
|
"learning_rate": 1.224242424242424e-06,
|
|
"loss": 0.5481,
|
|
"step": 2470
|
|
},
|
|
{
|
|
"epoch": 11.008909090909091,
|
|
"grad_norm": 74.39139556884766,
|
|
"learning_rate": 1.2202020202020202e-06,
|
|
"loss": 0.8024,
|
|
"step": 2480
|
|
},
|
|
{
|
|
"epoch": 11.010727272727273,
|
|
"grad_norm": 38.75892639160156,
|
|
"learning_rate": 1.2161616161616162e-06,
|
|
"loss": 0.5361,
|
|
"step": 2490
|
|
},
|
|
{
|
|
"epoch": 11.012545454545455,
|
|
"grad_norm": 54.3512077331543,
|
|
"learning_rate": 1.2121212121212122e-06,
|
|
"loss": 0.5242,
|
|
"step": 2500
|
|
},
|
|
{
|
|
"epoch": 11.014363636363637,
|
|
"grad_norm": 53.02770233154297,
|
|
"learning_rate": 1.208080808080808e-06,
|
|
"loss": 0.7339,
|
|
"step": 2510
|
|
},
|
|
{
|
|
"epoch": 11.016181818181819,
|
|
"grad_norm": 37.49360275268555,
|
|
"learning_rate": 1.204040404040404e-06,
|
|
"loss": 0.5388,
|
|
"step": 2520
|
|
},
|
|
{
|
|
"epoch": 11.018,
|
|
"grad_norm": 13.653200149536133,
|
|
"learning_rate": 1.2e-06,
|
|
"loss": 0.9184,
|
|
"step": 2530
|
|
},
|
|
{
|
|
"epoch": 11.019818181818183,
|
|
"grad_norm": 19.727231979370117,
|
|
"learning_rate": 1.195959595959596e-06,
|
|
"loss": 0.8236,
|
|
"step": 2540
|
|
},
|
|
{
|
|
"epoch": 11.021636363636363,
|
|
"grad_norm": 12.978816032409668,
|
|
"learning_rate": 1.1919191919191917e-06,
|
|
"loss": 0.5905,
|
|
"step": 2550
|
|
},
|
|
{
|
|
"epoch": 11.023454545454545,
|
|
"grad_norm": 45.52843475341797,
|
|
"learning_rate": 1.187878787878788e-06,
|
|
"loss": 0.4894,
|
|
"step": 2560
|
|
},
|
|
{
|
|
"epoch": 11.025272727272727,
|
|
"grad_norm": 51.29554748535156,
|
|
"learning_rate": 1.183838383838384e-06,
|
|
"loss": 0.5392,
|
|
"step": 2570
|
|
},
|
|
{
|
|
"epoch": 11.027090909090909,
|
|
"grad_norm": 24.330947875976562,
|
|
"learning_rate": 1.1797979797979799e-06,
|
|
"loss": 0.4959,
|
|
"step": 2580
|
|
},
|
|
{
|
|
"epoch": 11.02890909090909,
|
|
"grad_norm": 76.4671630859375,
|
|
"learning_rate": 1.1757575757575757e-06,
|
|
"loss": 0.5878,
|
|
"step": 2590
|
|
},
|
|
{
|
|
"epoch": 11.030727272727272,
|
|
"grad_norm": 30.416536331176758,
|
|
"learning_rate": 1.1717171717171716e-06,
|
|
"loss": 0.7717,
|
|
"step": 2600
|
|
},
|
|
{
|
|
"epoch": 11.032545454545454,
|
|
"grad_norm": 29.47783660888672,
|
|
"learning_rate": 1.1676767676767676e-06,
|
|
"loss": 0.6918,
|
|
"step": 2610
|
|
},
|
|
{
|
|
"epoch": 11.034363636363636,
|
|
"grad_norm": 41.43596649169922,
|
|
"learning_rate": 1.1636363636363636e-06,
|
|
"loss": 0.5184,
|
|
"step": 2620
|
|
},
|
|
{
|
|
"epoch": 11.036181818181818,
|
|
"grad_norm": 19.705320358276367,
|
|
"learning_rate": 1.1595959595959594e-06,
|
|
"loss": 0.618,
|
|
"step": 2630
|
|
},
|
|
{
|
|
"epoch": 11.038,
|
|
"grad_norm": 31.757631301879883,
|
|
"learning_rate": 1.1555555555555554e-06,
|
|
"loss": 0.6758,
|
|
"step": 2640
|
|
},
|
|
{
|
|
"epoch": 11.039818181818182,
|
|
"grad_norm": 28.794036865234375,
|
|
"learning_rate": 1.1515151515151516e-06,
|
|
"loss": 0.6114,
|
|
"step": 2650
|
|
},
|
|
{
|
|
"epoch": 11.040181818181818,
|
|
"eval_accuracy": 0.7619047619047619,
|
|
"eval_loss": 0.6110672950744629,
|
|
"eval_runtime": 232.9858,
|
|
"eval_samples_per_second": 0.811,
|
|
"eval_steps_per_second": 0.206,
|
|
"step": 2652
|
|
},
|
|
{
|
|
"epoch": 12.001454545454546,
|
|
"grad_norm": 20.952579498291016,
|
|
"learning_rate": 1.1474747474747476e-06,
|
|
"loss": 0.5381,
|
|
"step": 2660
|
|
},
|
|
{
|
|
"epoch": 12.003272727272726,
|
|
"grad_norm": 189.48483276367188,
|
|
"learning_rate": 1.1434343434343433e-06,
|
|
"loss": 0.6503,
|
|
"step": 2670
|
|
},
|
|
{
|
|
"epoch": 12.005090909090908,
|
|
"grad_norm": 58.9918098449707,
|
|
"learning_rate": 1.1393939393939393e-06,
|
|
"loss": 0.7624,
|
|
"step": 2680
|
|
},
|
|
{
|
|
"epoch": 12.00690909090909,
|
|
"grad_norm": 62.609825134277344,
|
|
"learning_rate": 1.1353535353535353e-06,
|
|
"loss": 0.6399,
|
|
"step": 2690
|
|
},
|
|
{
|
|
"epoch": 12.008727272727272,
|
|
"grad_norm": 39.57916259765625,
|
|
"learning_rate": 1.1313131313131313e-06,
|
|
"loss": 0.6646,
|
|
"step": 2700
|
|
},
|
|
{
|
|
"epoch": 12.010545454545454,
|
|
"grad_norm": 70.50138854980469,
|
|
"learning_rate": 1.127272727272727e-06,
|
|
"loss": 0.9518,
|
|
"step": 2710
|
|
},
|
|
{
|
|
"epoch": 12.012363636363636,
|
|
"grad_norm": 5.138758182525635,
|
|
"learning_rate": 1.123232323232323e-06,
|
|
"loss": 0.4584,
|
|
"step": 2720
|
|
},
|
|
{
|
|
"epoch": 12.014181818181818,
|
|
"grad_norm": 13.266735076904297,
|
|
"learning_rate": 1.1191919191919193e-06,
|
|
"loss": 0.422,
|
|
"step": 2730
|
|
},
|
|
{
|
|
"epoch": 12.016,
|
|
"grad_norm": 62.55598449707031,
|
|
"learning_rate": 1.1151515151515153e-06,
|
|
"loss": 0.5691,
|
|
"step": 2740
|
|
},
|
|
{
|
|
"epoch": 12.017818181818182,
|
|
"grad_norm": 60.93293380737305,
|
|
"learning_rate": 1.111111111111111e-06,
|
|
"loss": 0.6327,
|
|
"step": 2750
|
|
},
|
|
{
|
|
"epoch": 12.019636363636364,
|
|
"grad_norm": 95.45480346679688,
|
|
"learning_rate": 1.107070707070707e-06,
|
|
"loss": 0.8369,
|
|
"step": 2760
|
|
},
|
|
{
|
|
"epoch": 12.021454545454546,
|
|
"grad_norm": 5.98721170425415,
|
|
"learning_rate": 1.103030303030303e-06,
|
|
"loss": 0.4232,
|
|
"step": 2770
|
|
},
|
|
{
|
|
"epoch": 12.023272727272728,
|
|
"grad_norm": 27.14994239807129,
|
|
"learning_rate": 1.098989898989899e-06,
|
|
"loss": 0.6074,
|
|
"step": 2780
|
|
},
|
|
{
|
|
"epoch": 12.02509090909091,
|
|
"grad_norm": 31.834257125854492,
|
|
"learning_rate": 1.0949494949494948e-06,
|
|
"loss": 0.4744,
|
|
"step": 2790
|
|
},
|
|
{
|
|
"epoch": 12.026909090909092,
|
|
"grad_norm": 77.2571792602539,
|
|
"learning_rate": 1.0909090909090908e-06,
|
|
"loss": 0.399,
|
|
"step": 2800
|
|
},
|
|
{
|
|
"epoch": 12.028727272727274,
|
|
"grad_norm": 32.394676208496094,
|
|
"learning_rate": 1.0868686868686868e-06,
|
|
"loss": 0.604,
|
|
"step": 2810
|
|
},
|
|
{
|
|
"epoch": 12.030545454545454,
|
|
"grad_norm": 37.564598083496094,
|
|
"learning_rate": 1.082828282828283e-06,
|
|
"loss": 0.6693,
|
|
"step": 2820
|
|
},
|
|
{
|
|
"epoch": 12.032363636363636,
|
|
"grad_norm": 6.141965866088867,
|
|
"learning_rate": 1.0787878787878787e-06,
|
|
"loss": 0.776,
|
|
"step": 2830
|
|
},
|
|
{
|
|
"epoch": 12.034181818181818,
|
|
"grad_norm": 24.58110237121582,
|
|
"learning_rate": 1.0747474747474747e-06,
|
|
"loss": 0.6817,
|
|
"step": 2840
|
|
},
|
|
{
|
|
"epoch": 12.036,
|
|
"grad_norm": 31.536121368408203,
|
|
"learning_rate": 1.0707070707070707e-06,
|
|
"loss": 0.8944,
|
|
"step": 2850
|
|
},
|
|
{
|
|
"epoch": 12.037818181818182,
|
|
"grad_norm": 73.18550109863281,
|
|
"learning_rate": 1.0666666666666667e-06,
|
|
"loss": 0.6303,
|
|
"step": 2860
|
|
},
|
|
{
|
|
"epoch": 12.039636363636363,
|
|
"grad_norm": 65.80399322509766,
|
|
"learning_rate": 1.0626262626262625e-06,
|
|
"loss": 0.7435,
|
|
"step": 2870
|
|
},
|
|
{
|
|
"epoch": 12.040181818181818,
|
|
"eval_accuracy": 0.7354497354497355,
|
|
"eval_loss": 0.6779099106788635,
|
|
"eval_runtime": 234.9391,
|
|
"eval_samples_per_second": 0.804,
|
|
"eval_steps_per_second": 0.204,
|
|
"step": 2873
|
|
},
|
|
{
|
|
"epoch": 13.001272727272728,
|
|
"grad_norm": 33.63642883300781,
|
|
"learning_rate": 1.0585858585858585e-06,
|
|
"loss": 0.7628,
|
|
"step": 2880
|
|
},
|
|
{
|
|
"epoch": 13.00309090909091,
|
|
"grad_norm": 179.81727600097656,
|
|
"learning_rate": 1.0545454545454544e-06,
|
|
"loss": 0.7977,
|
|
"step": 2890
|
|
},
|
|
{
|
|
"epoch": 13.004909090909091,
|
|
"grad_norm": 38.72228240966797,
|
|
"learning_rate": 1.0505050505050506e-06,
|
|
"loss": 0.6483,
|
|
"step": 2900
|
|
},
|
|
{
|
|
"epoch": 13.006727272727273,
|
|
"grad_norm": 108.29838562011719,
|
|
"learning_rate": 1.0464646464646464e-06,
|
|
"loss": 0.5479,
|
|
"step": 2910
|
|
},
|
|
{
|
|
"epoch": 13.008545454545455,
|
|
"grad_norm": 37.4640998840332,
|
|
"learning_rate": 1.0424242424242424e-06,
|
|
"loss": 0.5578,
|
|
"step": 2920
|
|
},
|
|
{
|
|
"epoch": 13.010363636363637,
|
|
"grad_norm": 90.47122955322266,
|
|
"learning_rate": 1.0383838383838384e-06,
|
|
"loss": 0.7558,
|
|
"step": 2930
|
|
},
|
|
{
|
|
"epoch": 13.012181818181817,
|
|
"grad_norm": 7.663338661193848,
|
|
"learning_rate": 1.0343434343434344e-06,
|
|
"loss": 0.4233,
|
|
"step": 2940
|
|
},
|
|
{
|
|
"epoch": 13.014,
|
|
"grad_norm": 17.095075607299805,
|
|
"learning_rate": 1.0303030303030302e-06,
|
|
"loss": 0.8717,
|
|
"step": 2950
|
|
},
|
|
{
|
|
"epoch": 13.015818181818181,
|
|
"grad_norm": 15.45673656463623,
|
|
"learning_rate": 1.0262626262626261e-06,
|
|
"loss": 0.7572,
|
|
"step": 2960
|
|
},
|
|
{
|
|
"epoch": 13.017636363636363,
|
|
"grad_norm": 12.907837867736816,
|
|
"learning_rate": 1.0222222222222221e-06,
|
|
"loss": 0.6123,
|
|
"step": 2970
|
|
},
|
|
{
|
|
"epoch": 13.019454545454545,
|
|
"grad_norm": 244.34100341796875,
|
|
"learning_rate": 1.0181818181818181e-06,
|
|
"loss": 0.5477,
|
|
"step": 2980
|
|
},
|
|
{
|
|
"epoch": 13.021272727272727,
|
|
"grad_norm": 3.5488739013671875,
|
|
"learning_rate": 1.014141414141414e-06,
|
|
"loss": 0.5386,
|
|
"step": 2990
|
|
},
|
|
{
|
|
"epoch": 13.023090909090909,
|
|
"grad_norm": 69.64865112304688,
|
|
"learning_rate": 1.01010101010101e-06,
|
|
"loss": 0.615,
|
|
"step": 3000
|
|
},
|
|
{
|
|
"epoch": 13.024909090909091,
|
|
"grad_norm": 145.5615997314453,
|
|
"learning_rate": 1.006060606060606e-06,
|
|
"loss": 0.7061,
|
|
"step": 3010
|
|
},
|
|
{
|
|
"epoch": 13.026727272727273,
|
|
"grad_norm": 59.10872268676758,
|
|
"learning_rate": 1.002020202020202e-06,
|
|
"loss": 0.9779,
|
|
"step": 3020
|
|
},
|
|
{
|
|
"epoch": 13.028545454545455,
|
|
"grad_norm": 37.77117919921875,
|
|
"learning_rate": 9.979797979797978e-07,
|
|
"loss": 0.5565,
|
|
"step": 3030
|
|
},
|
|
{
|
|
"epoch": 13.030363636363637,
|
|
"grad_norm": 45.658267974853516,
|
|
"learning_rate": 9.939393939393938e-07,
|
|
"loss": 0.3069,
|
|
"step": 3040
|
|
},
|
|
{
|
|
"epoch": 13.032181818181819,
|
|
"grad_norm": 22.27031898498535,
|
|
"learning_rate": 9.898989898989898e-07,
|
|
"loss": 0.5964,
|
|
"step": 3050
|
|
},
|
|
{
|
|
"epoch": 13.034,
|
|
"grad_norm": 90.87818908691406,
|
|
"learning_rate": 9.858585858585858e-07,
|
|
"loss": 0.4515,
|
|
"step": 3060
|
|
},
|
|
{
|
|
"epoch": 13.035818181818183,
|
|
"grad_norm": 85.48793029785156,
|
|
"learning_rate": 9.818181818181818e-07,
|
|
"loss": 0.7309,
|
|
"step": 3070
|
|
},
|
|
{
|
|
"epoch": 13.037636363636363,
|
|
"grad_norm": 55.71916580200195,
|
|
"learning_rate": 9.777777777777778e-07,
|
|
"loss": 0.5142,
|
|
"step": 3080
|
|
},
|
|
{
|
|
"epoch": 13.039454545454545,
|
|
"grad_norm": 6.492376804351807,
|
|
"learning_rate": 9.737373737373738e-07,
|
|
"loss": 0.7742,
|
|
"step": 3090
|
|
},
|
|
{
|
|
"epoch": 13.040181818181818,
|
|
"eval_accuracy": 0.6878306878306878,
|
|
"eval_loss": 0.7390046119689941,
|
|
"eval_runtime": 245.0564,
|
|
"eval_samples_per_second": 0.771,
|
|
"eval_steps_per_second": 0.196,
|
|
"step": 3094
|
|
},
|
|
{
|
|
"epoch": 14.001090909090909,
|
|
"grad_norm": 173.43133544921875,
|
|
"learning_rate": 9.696969696969698e-07,
|
|
"loss": 0.8383,
|
|
"step": 3100
|
|
},
|
|
{
|
|
"epoch": 14.00290909090909,
|
|
"grad_norm": 59.899837493896484,
|
|
"learning_rate": 9.656565656565655e-07,
|
|
"loss": 0.7716,
|
|
"step": 3110
|
|
},
|
|
{
|
|
"epoch": 14.004727272727273,
|
|
"grad_norm": 6.925207138061523,
|
|
"learning_rate": 9.616161616161615e-07,
|
|
"loss": 0.6463,
|
|
"step": 3120
|
|
},
|
|
{
|
|
"epoch": 14.006545454545455,
|
|
"grad_norm": 250.12135314941406,
|
|
"learning_rate": 9.575757575757575e-07,
|
|
"loss": 0.7163,
|
|
"step": 3130
|
|
},
|
|
{
|
|
"epoch": 14.008363636363637,
|
|
"grad_norm": 34.77422332763672,
|
|
"learning_rate": 9.535353535353535e-07,
|
|
"loss": 0.5649,
|
|
"step": 3140
|
|
},
|
|
{
|
|
"epoch": 14.010181818181819,
|
|
"grad_norm": 119.9849624633789,
|
|
"learning_rate": 9.494949494949495e-07,
|
|
"loss": 0.8199,
|
|
"step": 3150
|
|
},
|
|
{
|
|
"epoch": 14.012,
|
|
"grad_norm": 5.420638561248779,
|
|
"learning_rate": 9.454545454545454e-07,
|
|
"loss": 0.5755,
|
|
"step": 3160
|
|
},
|
|
{
|
|
"epoch": 14.013818181818182,
|
|
"grad_norm": 61.58526611328125,
|
|
"learning_rate": 9.414141414141414e-07,
|
|
"loss": 0.5871,
|
|
"step": 3170
|
|
},
|
|
{
|
|
"epoch": 14.015636363636364,
|
|
"grad_norm": 131.88272094726562,
|
|
"learning_rate": 9.373737373737373e-07,
|
|
"loss": 0.4615,
|
|
"step": 3180
|
|
},
|
|
{
|
|
"epoch": 14.017454545454546,
|
|
"grad_norm": 6.7005157470703125,
|
|
"learning_rate": 9.333333333333333e-07,
|
|
"loss": 0.5038,
|
|
"step": 3190
|
|
},
|
|
{
|
|
"epoch": 14.019272727272726,
|
|
"grad_norm": 155.5514373779297,
|
|
"learning_rate": 9.292929292929292e-07,
|
|
"loss": 0.8315,
|
|
"step": 3200
|
|
},
|
|
{
|
|
"epoch": 14.021090909090908,
|
|
"grad_norm": 21.234600067138672,
|
|
"learning_rate": 9.252525252525252e-07,
|
|
"loss": 0.4145,
|
|
"step": 3210
|
|
},
|
|
{
|
|
"epoch": 14.02290909090909,
|
|
"grad_norm": 51.35615158081055,
|
|
"learning_rate": 9.212121212121212e-07,
|
|
"loss": 0.9987,
|
|
"step": 3220
|
|
},
|
|
{
|
|
"epoch": 14.024727272727272,
|
|
"grad_norm": 22.481821060180664,
|
|
"learning_rate": 9.171717171717172e-07,
|
|
"loss": 0.4922,
|
|
"step": 3230
|
|
},
|
|
{
|
|
"epoch": 14.026545454545454,
|
|
"grad_norm": 98.58537292480469,
|
|
"learning_rate": 9.13131313131313e-07,
|
|
"loss": 0.53,
|
|
"step": 3240
|
|
},
|
|
{
|
|
"epoch": 14.028363636363636,
|
|
"grad_norm": 46.49958419799805,
|
|
"learning_rate": 9.09090909090909e-07,
|
|
"loss": 0.77,
|
|
"step": 3250
|
|
},
|
|
{
|
|
"epoch": 14.030181818181818,
|
|
"grad_norm": 38.402347564697266,
|
|
"learning_rate": 9.05050505050505e-07,
|
|
"loss": 0.5998,
|
|
"step": 3260
|
|
},
|
|
{
|
|
"epoch": 14.032,
|
|
"grad_norm": 79.69766235351562,
|
|
"learning_rate": 9.01010101010101e-07,
|
|
"loss": 0.6367,
|
|
"step": 3270
|
|
},
|
|
{
|
|
"epoch": 14.033818181818182,
|
|
"grad_norm": 115.86608123779297,
|
|
"learning_rate": 8.969696969696969e-07,
|
|
"loss": 0.6104,
|
|
"step": 3280
|
|
},
|
|
{
|
|
"epoch": 14.035636363636364,
|
|
"grad_norm": 60.4298095703125,
|
|
"learning_rate": 8.929292929292929e-07,
|
|
"loss": 0.8602,
|
|
"step": 3290
|
|
},
|
|
{
|
|
"epoch": 14.037454545454546,
|
|
"grad_norm": 2.805131196975708,
|
|
"learning_rate": 8.888888888888888e-07,
|
|
"loss": 0.5112,
|
|
"step": 3300
|
|
},
|
|
{
|
|
"epoch": 14.039272727272728,
|
|
"grad_norm": 27.735536575317383,
|
|
"learning_rate": 8.848484848484849e-07,
|
|
"loss": 0.6558,
|
|
"step": 3310
|
|
},
|
|
{
|
|
"epoch": 14.040181818181818,
|
|
"eval_accuracy": 0.7354497354497355,
|
|
"eval_loss": 0.6284019947052002,
|
|
"eval_runtime": 253.1012,
|
|
"eval_samples_per_second": 0.747,
|
|
"eval_steps_per_second": 0.19,
|
|
"step": 3315
|
|
},
|
|
{
|
|
"epoch": 15.00090909090909,
|
|
"grad_norm": 116.87984466552734,
|
|
"learning_rate": 8.808080808080807e-07,
|
|
"loss": 0.7683,
|
|
"step": 3320
|
|
},
|
|
{
|
|
"epoch": 15.002727272727272,
|
|
"grad_norm": 24.447603225708008,
|
|
"learning_rate": 8.767676767676767e-07,
|
|
"loss": 0.546,
|
|
"step": 3330
|
|
},
|
|
{
|
|
"epoch": 15.004545454545454,
|
|
"grad_norm": 36.200927734375,
|
|
"learning_rate": 8.727272727272726e-07,
|
|
"loss": 0.6664,
|
|
"step": 3340
|
|
},
|
|
{
|
|
"epoch": 15.006363636363636,
|
|
"grad_norm": 22.57868003845215,
|
|
"learning_rate": 8.686868686868687e-07,
|
|
"loss": 0.6843,
|
|
"step": 3350
|
|
},
|
|
{
|
|
"epoch": 15.008181818181818,
|
|
"grad_norm": 33.96149826049805,
|
|
"learning_rate": 8.646464646464646e-07,
|
|
"loss": 0.5763,
|
|
"step": 3360
|
|
},
|
|
{
|
|
"epoch": 15.01,
|
|
"grad_norm": 107.21601104736328,
|
|
"learning_rate": 8.606060606060606e-07,
|
|
"loss": 1.1283,
|
|
"step": 3370
|
|
},
|
|
{
|
|
"epoch": 15.011818181818182,
|
|
"grad_norm": 37.38850402832031,
|
|
"learning_rate": 8.565656565656565e-07,
|
|
"loss": 0.5104,
|
|
"step": 3380
|
|
},
|
|
{
|
|
"epoch": 15.013636363636364,
|
|
"grad_norm": 9.89079761505127,
|
|
"learning_rate": 8.525252525252525e-07,
|
|
"loss": 0.564,
|
|
"step": 3390
|
|
},
|
|
{
|
|
"epoch": 15.015454545454546,
|
|
"grad_norm": 51.0941162109375,
|
|
"learning_rate": 8.484848484848484e-07,
|
|
"loss": 0.9413,
|
|
"step": 3400
|
|
},
|
|
{
|
|
"epoch": 15.017272727272728,
|
|
"grad_norm": 44.05881118774414,
|
|
"learning_rate": 8.444444444444444e-07,
|
|
"loss": 0.7765,
|
|
"step": 3410
|
|
},
|
|
{
|
|
"epoch": 15.01909090909091,
|
|
"grad_norm": 182.90931701660156,
|
|
"learning_rate": 8.404040404040403e-07,
|
|
"loss": 0.9487,
|
|
"step": 3420
|
|
},
|
|
{
|
|
"epoch": 15.020909090909091,
|
|
"grad_norm": 10.030019760131836,
|
|
"learning_rate": 8.363636363636363e-07,
|
|
"loss": 0.7775,
|
|
"step": 3430
|
|
},
|
|
{
|
|
"epoch": 15.022727272727273,
|
|
"grad_norm": 140.7605438232422,
|
|
"learning_rate": 8.323232323232323e-07,
|
|
"loss": 0.3844,
|
|
"step": 3440
|
|
},
|
|
{
|
|
"epoch": 15.024545454545455,
|
|
"grad_norm": 8.101646423339844,
|
|
"learning_rate": 8.282828282828283e-07,
|
|
"loss": 0.8147,
|
|
"step": 3450
|
|
},
|
|
{
|
|
"epoch": 15.026363636363635,
|
|
"grad_norm": 57.96697235107422,
|
|
"learning_rate": 8.242424242424241e-07,
|
|
"loss": 1.0415,
|
|
"step": 3460
|
|
},
|
|
{
|
|
"epoch": 15.028181818181817,
|
|
"grad_norm": 29.711713790893555,
|
|
"learning_rate": 8.202020202020201e-07,
|
|
"loss": 0.5816,
|
|
"step": 3470
|
|
},
|
|
{
|
|
"epoch": 15.03,
|
|
"grad_norm": 77.7359848022461,
|
|
"learning_rate": 8.161616161616161e-07,
|
|
"loss": 0.597,
|
|
"step": 3480
|
|
},
|
|
{
|
|
"epoch": 15.031818181818181,
|
|
"grad_norm": 47.72003936767578,
|
|
"learning_rate": 8.121212121212121e-07,
|
|
"loss": 0.4437,
|
|
"step": 3490
|
|
},
|
|
{
|
|
"epoch": 15.033636363636363,
|
|
"grad_norm": 44.92646408081055,
|
|
"learning_rate": 8.08080808080808e-07,
|
|
"loss": 0.5099,
|
|
"step": 3500
|
|
},
|
|
{
|
|
"epoch": 15.035454545454545,
|
|
"grad_norm": 20.403213500976562,
|
|
"learning_rate": 8.04040404040404e-07,
|
|
"loss": 0.4642,
|
|
"step": 3510
|
|
},
|
|
{
|
|
"epoch": 15.037272727272727,
|
|
"grad_norm": 63.10896682739258,
|
|
"learning_rate": 8e-07,
|
|
"loss": 0.5385,
|
|
"step": 3520
|
|
},
|
|
{
|
|
"epoch": 15.039090909090909,
|
|
"grad_norm": 32.233154296875,
|
|
"learning_rate": 7.95959595959596e-07,
|
|
"loss": 0.4822,
|
|
"step": 3530
|
|
},
|
|
{
|
|
"epoch": 15.040181818181818,
|
|
"eval_accuracy": 0.7195767195767195,
|
|
"eval_loss": 0.7071319222450256,
|
|
"eval_runtime": 259.3857,
|
|
"eval_samples_per_second": 0.729,
|
|
"eval_steps_per_second": 0.185,
|
|
"step": 3536
|
|
},
|
|
{
|
|
"epoch": 16.00072727272727,
|
|
"grad_norm": 16.31338119506836,
|
|
"learning_rate": 7.919191919191918e-07,
|
|
"loss": 0.7441,
|
|
"step": 3540
|
|
},
|
|
{
|
|
"epoch": 16.002545454545455,
|
|
"grad_norm": 50.943363189697266,
|
|
"learning_rate": 7.878787878787878e-07,
|
|
"loss": 0.4436,
|
|
"step": 3550
|
|
},
|
|
{
|
|
"epoch": 16.004363636363635,
|
|
"grad_norm": 103.13694763183594,
|
|
"learning_rate": 7.838383838383838e-07,
|
|
"loss": 0.3391,
|
|
"step": 3560
|
|
},
|
|
{
|
|
"epoch": 16.00618181818182,
|
|
"grad_norm": 33.13505172729492,
|
|
"learning_rate": 7.797979797979798e-07,
|
|
"loss": 0.4457,
|
|
"step": 3570
|
|
},
|
|
{
|
|
"epoch": 16.008,
|
|
"grad_norm": 87.32142639160156,
|
|
"learning_rate": 7.757575757575757e-07,
|
|
"loss": 0.8192,
|
|
"step": 3580
|
|
},
|
|
{
|
|
"epoch": 16.009818181818183,
|
|
"grad_norm": 40.58958435058594,
|
|
"learning_rate": 7.717171717171717e-07,
|
|
"loss": 0.5037,
|
|
"step": 3590
|
|
},
|
|
{
|
|
"epoch": 16.011636363636363,
|
|
"grad_norm": 88.63417053222656,
|
|
"learning_rate": 7.676767676767675e-07,
|
|
"loss": 0.3936,
|
|
"step": 3600
|
|
},
|
|
{
|
|
"epoch": 16.013454545454547,
|
|
"grad_norm": 197.7357940673828,
|
|
"learning_rate": 7.636363636363636e-07,
|
|
"loss": 0.9558,
|
|
"step": 3610
|
|
},
|
|
{
|
|
"epoch": 16.015272727272727,
|
|
"grad_norm": 75.30937957763672,
|
|
"learning_rate": 7.595959595959595e-07,
|
|
"loss": 0.654,
|
|
"step": 3620
|
|
},
|
|
{
|
|
"epoch": 16.01709090909091,
|
|
"grad_norm": 67.23373413085938,
|
|
"learning_rate": 7.555555555555555e-07,
|
|
"loss": 0.58,
|
|
"step": 3630
|
|
},
|
|
{
|
|
"epoch": 16.01890909090909,
|
|
"grad_norm": 8.349831581115723,
|
|
"learning_rate": 7.515151515151514e-07,
|
|
"loss": 0.543,
|
|
"step": 3640
|
|
},
|
|
{
|
|
"epoch": 16.020727272727274,
|
|
"grad_norm": 52.83699035644531,
|
|
"learning_rate": 7.474747474747475e-07,
|
|
"loss": 0.6872,
|
|
"step": 3650
|
|
},
|
|
{
|
|
"epoch": 16.022545454545455,
|
|
"grad_norm": 32.91685104370117,
|
|
"learning_rate": 7.434343434343434e-07,
|
|
"loss": 0.6798,
|
|
"step": 3660
|
|
},
|
|
{
|
|
"epoch": 16.024363636363635,
|
|
"grad_norm": 130.88951110839844,
|
|
"learning_rate": 7.393939393939394e-07,
|
|
"loss": 0.5446,
|
|
"step": 3670
|
|
},
|
|
{
|
|
"epoch": 16.02618181818182,
|
|
"grad_norm": 44.42277908325195,
|
|
"learning_rate": 7.353535353535352e-07,
|
|
"loss": 0.8754,
|
|
"step": 3680
|
|
},
|
|
{
|
|
"epoch": 16.028,
|
|
"grad_norm": 285.5379943847656,
|
|
"learning_rate": 7.313131313131313e-07,
|
|
"loss": 0.8222,
|
|
"step": 3690
|
|
},
|
|
{
|
|
"epoch": 16.029818181818182,
|
|
"grad_norm": 42.74455261230469,
|
|
"learning_rate": 7.272727272727272e-07,
|
|
"loss": 0.6664,
|
|
"step": 3700
|
|
},
|
|
{
|
|
"epoch": 16.031636363636363,
|
|
"grad_norm": 101.43460083007812,
|
|
"learning_rate": 7.232323232323232e-07,
|
|
"loss": 0.7851,
|
|
"step": 3710
|
|
},
|
|
{
|
|
"epoch": 16.033454545454546,
|
|
"grad_norm": 60.82951736450195,
|
|
"learning_rate": 7.191919191919191e-07,
|
|
"loss": 0.6222,
|
|
"step": 3720
|
|
},
|
|
{
|
|
"epoch": 16.035272727272726,
|
|
"grad_norm": 211.50241088867188,
|
|
"learning_rate": 7.151515151515152e-07,
|
|
"loss": 0.7885,
|
|
"step": 3730
|
|
},
|
|
{
|
|
"epoch": 16.03709090909091,
|
|
"grad_norm": 32.02504348754883,
|
|
"learning_rate": 7.111111111111111e-07,
|
|
"loss": 0.7612,
|
|
"step": 3740
|
|
},
|
|
{
|
|
"epoch": 16.03890909090909,
|
|
"grad_norm": 9.265863418579102,
|
|
"learning_rate": 7.07070707070707e-07,
|
|
"loss": 0.7686,
|
|
"step": 3750
|
|
},
|
|
{
|
|
"epoch": 16.040181818181818,
|
|
"eval_accuracy": 0.7301587301587301,
|
|
"eval_loss": 0.6982029676437378,
|
|
"eval_runtime": 243.7878,
|
|
"eval_samples_per_second": 0.775,
|
|
"eval_steps_per_second": 0.197,
|
|
"step": 3757
|
|
},
|
|
{
|
|
"epoch": 17.000545454545456,
|
|
"grad_norm": 4.142579555511475,
|
|
"learning_rate": 7.030303030303029e-07,
|
|
"loss": 0.3824,
|
|
"step": 3760
|
|
},
|
|
{
|
|
"epoch": 17.002363636363636,
|
|
"grad_norm": 113.9232406616211,
|
|
"learning_rate": 6.989898989898989e-07,
|
|
"loss": 0.795,
|
|
"step": 3770
|
|
},
|
|
{
|
|
"epoch": 17.004181818181817,
|
|
"grad_norm": 61.50730895996094,
|
|
"learning_rate": 6.949494949494949e-07,
|
|
"loss": 0.4804,
|
|
"step": 3780
|
|
},
|
|
{
|
|
"epoch": 17.006,
|
|
"grad_norm": 69.38822937011719,
|
|
"learning_rate": 6.909090909090909e-07,
|
|
"loss": 0.6002,
|
|
"step": 3790
|
|
},
|
|
{
|
|
"epoch": 17.00781818181818,
|
|
"grad_norm": 177.55372619628906,
|
|
"learning_rate": 6.868686868686868e-07,
|
|
"loss": 0.7921,
|
|
"step": 3800
|
|
},
|
|
{
|
|
"epoch": 17.009636363636364,
|
|
"grad_norm": 45.08247756958008,
|
|
"learning_rate": 6.828282828282828e-07,
|
|
"loss": 0.6971,
|
|
"step": 3810
|
|
},
|
|
{
|
|
"epoch": 17.011454545454544,
|
|
"grad_norm": 45.8038330078125,
|
|
"learning_rate": 6.787878787878789e-07,
|
|
"loss": 0.6781,
|
|
"step": 3820
|
|
},
|
|
{
|
|
"epoch": 17.013272727272728,
|
|
"grad_norm": 102.8902816772461,
|
|
"learning_rate": 6.747474747474747e-07,
|
|
"loss": 0.6378,
|
|
"step": 3830
|
|
},
|
|
{
|
|
"epoch": 17.015090909090908,
|
|
"grad_norm": 80.0373764038086,
|
|
"learning_rate": 6.707070707070706e-07,
|
|
"loss": 0.684,
|
|
"step": 3840
|
|
},
|
|
{
|
|
"epoch": 17.016909090909092,
|
|
"grad_norm": 50.4844856262207,
|
|
"learning_rate": 6.666666666666666e-07,
|
|
"loss": 0.776,
|
|
"step": 3850
|
|
},
|
|
{
|
|
"epoch": 17.018727272727272,
|
|
"grad_norm": 56.95082092285156,
|
|
"learning_rate": 6.626262626262627e-07,
|
|
"loss": 0.8147,
|
|
"step": 3860
|
|
},
|
|
{
|
|
"epoch": 17.020545454545456,
|
|
"grad_norm": 35.694496154785156,
|
|
"learning_rate": 6.585858585858586e-07,
|
|
"loss": 0.6381,
|
|
"step": 3870
|
|
},
|
|
{
|
|
"epoch": 17.022363636363636,
|
|
"grad_norm": 31.442394256591797,
|
|
"learning_rate": 6.545454545454546e-07,
|
|
"loss": 0.9323,
|
|
"step": 3880
|
|
},
|
|
{
|
|
"epoch": 17.02418181818182,
|
|
"grad_norm": 153.8267364501953,
|
|
"learning_rate": 6.505050505050504e-07,
|
|
"loss": 0.6939,
|
|
"step": 3890
|
|
},
|
|
{
|
|
"epoch": 17.026,
|
|
"grad_norm": 8.434920310974121,
|
|
"learning_rate": 6.464646464646465e-07,
|
|
"loss": 0.8409,
|
|
"step": 3900
|
|
},
|
|
{
|
|
"epoch": 17.027818181818184,
|
|
"grad_norm": 95.91763305664062,
|
|
"learning_rate": 6.424242424242424e-07,
|
|
"loss": 0.6106,
|
|
"step": 3910
|
|
},
|
|
{
|
|
"epoch": 17.029636363636364,
|
|
"grad_norm": 61.757469177246094,
|
|
"learning_rate": 6.383838383838384e-07,
|
|
"loss": 0.7541,
|
|
"step": 3920
|
|
},
|
|
{
|
|
"epoch": 17.031454545454544,
|
|
"grad_norm": 98.56493377685547,
|
|
"learning_rate": 6.343434343434343e-07,
|
|
"loss": 0.6284,
|
|
"step": 3930
|
|
},
|
|
{
|
|
"epoch": 17.033272727272728,
|
|
"grad_norm": 5.220396995544434,
|
|
"learning_rate": 6.303030303030303e-07,
|
|
"loss": 0.5994,
|
|
"step": 3940
|
|
},
|
|
{
|
|
"epoch": 17.035090909090908,
|
|
"grad_norm": 50.64840316772461,
|
|
"learning_rate": 6.262626262626263e-07,
|
|
"loss": 0.3801,
|
|
"step": 3950
|
|
},
|
|
{
|
|
"epoch": 17.03690909090909,
|
|
"grad_norm": 56.643348693847656,
|
|
"learning_rate": 6.222222222222223e-07,
|
|
"loss": 0.5423,
|
|
"step": 3960
|
|
},
|
|
{
|
|
"epoch": 17.03872727272727,
|
|
"grad_norm": 45.90037155151367,
|
|
"learning_rate": 6.181818181818181e-07,
|
|
"loss": 0.7945,
|
|
"step": 3970
|
|
},
|
|
{
|
|
"epoch": 17.040181818181818,
|
|
"eval_accuracy": 0.7566137566137566,
|
|
"eval_loss": 0.6335788369178772,
|
|
"eval_runtime": 239.4457,
|
|
"eval_samples_per_second": 0.789,
|
|
"eval_steps_per_second": 0.2,
|
|
"step": 3978
|
|
},
|
|
{
|
|
"epoch": 18.000363636363637,
|
|
"grad_norm": 36.48537826538086,
|
|
"learning_rate": 6.141414141414141e-07,
|
|
"loss": 0.5005,
|
|
"step": 3980
|
|
},
|
|
{
|
|
"epoch": 18.002181818181818,
|
|
"grad_norm": 22.286909103393555,
|
|
"learning_rate": 6.101010101010101e-07,
|
|
"loss": 0.6037,
|
|
"step": 3990
|
|
},
|
|
{
|
|
"epoch": 18.004,
|
|
"grad_norm": 104.05951690673828,
|
|
"learning_rate": 6.060606060606061e-07,
|
|
"loss": 0.6384,
|
|
"step": 4000
|
|
},
|
|
{
|
|
"epoch": 18.00581818181818,
|
|
"grad_norm": 89.45072937011719,
|
|
"learning_rate": 6.02020202020202e-07,
|
|
"loss": 0.8027,
|
|
"step": 4010
|
|
},
|
|
{
|
|
"epoch": 18.007636363636365,
|
|
"grad_norm": 27.79203224182129,
|
|
"learning_rate": 5.97979797979798e-07,
|
|
"loss": 0.8247,
|
|
"step": 4020
|
|
},
|
|
{
|
|
"epoch": 18.009454545454545,
|
|
"grad_norm": 24.849225997924805,
|
|
"learning_rate": 5.93939393939394e-07,
|
|
"loss": 0.4399,
|
|
"step": 4030
|
|
},
|
|
{
|
|
"epoch": 18.011272727272726,
|
|
"grad_norm": 169.05714416503906,
|
|
"learning_rate": 5.898989898989899e-07,
|
|
"loss": 0.6759,
|
|
"step": 4040
|
|
},
|
|
{
|
|
"epoch": 18.01309090909091,
|
|
"grad_norm": 42.4780158996582,
|
|
"learning_rate": 5.858585858585858e-07,
|
|
"loss": 0.7287,
|
|
"step": 4050
|
|
},
|
|
{
|
|
"epoch": 18.01490909090909,
|
|
"grad_norm": 51.804603576660156,
|
|
"learning_rate": 5.818181818181818e-07,
|
|
"loss": 0.9201,
|
|
"step": 4060
|
|
},
|
|
{
|
|
"epoch": 18.016727272727273,
|
|
"grad_norm": 60.65522003173828,
|
|
"learning_rate": 5.777777777777777e-07,
|
|
"loss": 0.7215,
|
|
"step": 4070
|
|
},
|
|
{
|
|
"epoch": 18.018545454545453,
|
|
"grad_norm": 43.81229782104492,
|
|
"learning_rate": 5.737373737373738e-07,
|
|
"loss": 0.4151,
|
|
"step": 4080
|
|
},
|
|
{
|
|
"epoch": 18.020363636363637,
|
|
"grad_norm": 57.046974182128906,
|
|
"learning_rate": 5.696969696969697e-07,
|
|
"loss": 0.833,
|
|
"step": 4090
|
|
},
|
|
{
|
|
"epoch": 18.022181818181817,
|
|
"grad_norm": 40.3692626953125,
|
|
"learning_rate": 5.656565656565657e-07,
|
|
"loss": 0.5183,
|
|
"step": 4100
|
|
},
|
|
{
|
|
"epoch": 18.024,
|
|
"grad_norm": 17.42075538635254,
|
|
"learning_rate": 5.616161616161615e-07,
|
|
"loss": 0.3364,
|
|
"step": 4110
|
|
},
|
|
{
|
|
"epoch": 18.02581818181818,
|
|
"grad_norm": 61.59687805175781,
|
|
"learning_rate": 5.575757575757576e-07,
|
|
"loss": 0.7532,
|
|
"step": 4120
|
|
},
|
|
{
|
|
"epoch": 18.027636363636365,
|
|
"grad_norm": 19.387409210205078,
|
|
"learning_rate": 5.535353535353535e-07,
|
|
"loss": 0.6432,
|
|
"step": 4130
|
|
},
|
|
{
|
|
"epoch": 18.029454545454545,
|
|
"grad_norm": 48.39297103881836,
|
|
"learning_rate": 5.494949494949495e-07,
|
|
"loss": 0.6288,
|
|
"step": 4140
|
|
},
|
|
{
|
|
"epoch": 18.03127272727273,
|
|
"grad_norm": 7.193654537200928,
|
|
"learning_rate": 5.454545454545454e-07,
|
|
"loss": 0.7595,
|
|
"step": 4150
|
|
},
|
|
{
|
|
"epoch": 18.03309090909091,
|
|
"grad_norm": 83.67570495605469,
|
|
"learning_rate": 5.414141414141415e-07,
|
|
"loss": 0.9217,
|
|
"step": 4160
|
|
},
|
|
{
|
|
"epoch": 18.034909090909093,
|
|
"grad_norm": 38.86811447143555,
|
|
"learning_rate": 5.373737373737374e-07,
|
|
"loss": 0.5732,
|
|
"step": 4170
|
|
},
|
|
{
|
|
"epoch": 18.036727272727273,
|
|
"grad_norm": 38.445377349853516,
|
|
"learning_rate": 5.333333333333333e-07,
|
|
"loss": 0.4312,
|
|
"step": 4180
|
|
},
|
|
{
|
|
"epoch": 18.038545454545453,
|
|
"grad_norm": 43.40776062011719,
|
|
"learning_rate": 5.292929292929292e-07,
|
|
"loss": 0.5755,
|
|
"step": 4190
|
|
},
|
|
{
|
|
"epoch": 18.040181818181818,
|
|
"eval_accuracy": 0.746031746031746,
|
|
"eval_loss": 0.592440664768219,
|
|
"eval_runtime": 242.8528,
|
|
"eval_samples_per_second": 0.778,
|
|
"eval_steps_per_second": 0.198,
|
|
"step": 4199
|
|
},
|
|
{
|
|
"epoch": 19.00018181818182,
|
|
"grad_norm": 32.5756721496582,
|
|
"learning_rate": 5.252525252525253e-07,
|
|
"loss": 0.6466,
|
|
"step": 4200
|
|
},
|
|
{
|
|
"epoch": 19.002,
|
|
"grad_norm": 140.96115112304688,
|
|
"learning_rate": 5.212121212121212e-07,
|
|
"loss": 0.5735,
|
|
"step": 4210
|
|
},
|
|
{
|
|
"epoch": 19.003818181818183,
|
|
"grad_norm": 70.41220092773438,
|
|
"learning_rate": 5.171717171717172e-07,
|
|
"loss": 0.8348,
|
|
"step": 4220
|
|
},
|
|
{
|
|
"epoch": 19.005636363636363,
|
|
"grad_norm": 63.176944732666016,
|
|
"learning_rate": 5.131313131313131e-07,
|
|
"loss": 0.801,
|
|
"step": 4230
|
|
},
|
|
{
|
|
"epoch": 19.007454545454546,
|
|
"grad_norm": 208.63140869140625,
|
|
"learning_rate": 5.090909090909091e-07,
|
|
"loss": 0.8776,
|
|
"step": 4240
|
|
},
|
|
{
|
|
"epoch": 19.009272727272727,
|
|
"grad_norm": 45.957584381103516,
|
|
"learning_rate": 5.05050505050505e-07,
|
|
"loss": 0.5692,
|
|
"step": 4250
|
|
},
|
|
{
|
|
"epoch": 19.01109090909091,
|
|
"grad_norm": 10.686644554138184,
|
|
"learning_rate": 5.01010101010101e-07,
|
|
"loss": 0.6621,
|
|
"step": 4260
|
|
},
|
|
{
|
|
"epoch": 19.01290909090909,
|
|
"grad_norm": 19.4654541015625,
|
|
"learning_rate": 4.969696969696969e-07,
|
|
"loss": 0.4417,
|
|
"step": 4270
|
|
},
|
|
{
|
|
"epoch": 19.014727272727274,
|
|
"grad_norm": 20.204891204833984,
|
|
"learning_rate": 4.929292929292929e-07,
|
|
"loss": 0.7671,
|
|
"step": 4280
|
|
},
|
|
{
|
|
"epoch": 19.016545454545454,
|
|
"grad_norm": 72.33363342285156,
|
|
"learning_rate": 4.888888888888889e-07,
|
|
"loss": 0.5588,
|
|
"step": 4290
|
|
},
|
|
{
|
|
"epoch": 19.018363636363638,
|
|
"grad_norm": 203.05560302734375,
|
|
"learning_rate": 4.848484848484849e-07,
|
|
"loss": 0.71,
|
|
"step": 4300
|
|
},
|
|
{
|
|
"epoch": 19.02018181818182,
|
|
"grad_norm": 69.61200714111328,
|
|
"learning_rate": 4.808080808080808e-07,
|
|
"loss": 0.8894,
|
|
"step": 4310
|
|
},
|
|
{
|
|
"epoch": 19.022,
|
|
"grad_norm": 40.70662307739258,
|
|
"learning_rate": 4.7676767676767675e-07,
|
|
"loss": 0.4335,
|
|
"step": 4320
|
|
},
|
|
{
|
|
"epoch": 19.023818181818182,
|
|
"grad_norm": 24.170921325683594,
|
|
"learning_rate": 4.727272727272727e-07,
|
|
"loss": 0.4808,
|
|
"step": 4330
|
|
},
|
|
{
|
|
"epoch": 19.025636363636362,
|
|
"grad_norm": 109.3941421508789,
|
|
"learning_rate": 4.6868686868686867e-07,
|
|
"loss": 0.4816,
|
|
"step": 4340
|
|
},
|
|
{
|
|
"epoch": 19.027454545454546,
|
|
"grad_norm": 8.513394355773926,
|
|
"learning_rate": 4.646464646464646e-07,
|
|
"loss": 0.4561,
|
|
"step": 4350
|
|
},
|
|
{
|
|
"epoch": 19.029272727272726,
|
|
"grad_norm": 163.6996612548828,
|
|
"learning_rate": 4.606060606060606e-07,
|
|
"loss": 0.3983,
|
|
"step": 4360
|
|
},
|
|
{
|
|
"epoch": 19.03109090909091,
|
|
"grad_norm": 5.505578994750977,
|
|
"learning_rate": 4.565656565656565e-07,
|
|
"loss": 0.4375,
|
|
"step": 4370
|
|
},
|
|
{
|
|
"epoch": 19.03290909090909,
|
|
"grad_norm": 33.43965148925781,
|
|
"learning_rate": 4.525252525252525e-07,
|
|
"loss": 0.6939,
|
|
"step": 4380
|
|
},
|
|
{
|
|
"epoch": 19.034727272727274,
|
|
"grad_norm": 1.7440011501312256,
|
|
"learning_rate": 4.4848484848484845e-07,
|
|
"loss": 0.4278,
|
|
"step": 4390
|
|
},
|
|
{
|
|
"epoch": 19.036545454545454,
|
|
"grad_norm": 40.648921966552734,
|
|
"learning_rate": 4.444444444444444e-07,
|
|
"loss": 0.7796,
|
|
"step": 4400
|
|
},
|
|
{
|
|
"epoch": 19.038363636363638,
|
|
"grad_norm": 196.90296936035156,
|
|
"learning_rate": 4.4040404040404037e-07,
|
|
"loss": 0.6882,
|
|
"step": 4410
|
|
},
|
|
{
|
|
"epoch": 19.040181818181818,
|
|
"grad_norm": 256.42742919921875,
|
|
"learning_rate": 4.363636363636363e-07,
|
|
"loss": 0.6895,
|
|
"step": 4420
|
|
},
|
|
{
|
|
"epoch": 19.040181818181818,
|
|
"eval_accuracy": 0.7513227513227513,
|
|
"eval_loss": 0.6226804852485657,
|
|
"eval_runtime": 235.265,
|
|
"eval_samples_per_second": 0.803,
|
|
"eval_steps_per_second": 0.204,
|
|
"step": 4420
|
|
},
|
|
{
|
|
"epoch": 20.00181818181818,
|
|
"grad_norm": 41.40798568725586,
|
|
"learning_rate": 4.323232323232323e-07,
|
|
"loss": 0.7991,
|
|
"step": 4430
|
|
},
|
|
{
|
|
"epoch": 20.003636363636364,
|
|
"grad_norm": 92.54600524902344,
|
|
"learning_rate": 4.2828282828282823e-07,
|
|
"loss": 0.8162,
|
|
"step": 4440
|
|
},
|
|
{
|
|
"epoch": 20.005454545454544,
|
|
"grad_norm": 36.51567077636719,
|
|
"learning_rate": 4.242424242424242e-07,
|
|
"loss": 0.6839,
|
|
"step": 4450
|
|
},
|
|
{
|
|
"epoch": 20.007272727272728,
|
|
"grad_norm": 273.98358154296875,
|
|
"learning_rate": 4.2020202020202015e-07,
|
|
"loss": 0.5207,
|
|
"step": 4460
|
|
},
|
|
{
|
|
"epoch": 20.009090909090908,
|
|
"grad_norm": 11.173810958862305,
|
|
"learning_rate": 4.1616161616161614e-07,
|
|
"loss": 0.4805,
|
|
"step": 4470
|
|
},
|
|
{
|
|
"epoch": 20.01090909090909,
|
|
"grad_norm": 57.060791015625,
|
|
"learning_rate": 4.1212121212121207e-07,
|
|
"loss": 0.7987,
|
|
"step": 4480
|
|
},
|
|
{
|
|
"epoch": 20.012727272727272,
|
|
"grad_norm": 37.16933059692383,
|
|
"learning_rate": 4.0808080808080806e-07,
|
|
"loss": 0.5239,
|
|
"step": 4490
|
|
},
|
|
{
|
|
"epoch": 20.014545454545456,
|
|
"grad_norm": 57.254329681396484,
|
|
"learning_rate": 4.04040404040404e-07,
|
|
"loss": 1.0652,
|
|
"step": 4500
|
|
},
|
|
{
|
|
"epoch": 20.016363636363636,
|
|
"grad_norm": 2.1833977699279785,
|
|
"learning_rate": 4e-07,
|
|
"loss": 0.5412,
|
|
"step": 4510
|
|
},
|
|
{
|
|
"epoch": 20.01818181818182,
|
|
"grad_norm": 87.27790832519531,
|
|
"learning_rate": 3.959595959595959e-07,
|
|
"loss": 0.5355,
|
|
"step": 4520
|
|
},
|
|
{
|
|
"epoch": 20.02,
|
|
"grad_norm": 31.686355590820312,
|
|
"learning_rate": 3.919191919191919e-07,
|
|
"loss": 0.6888,
|
|
"step": 4530
|
|
},
|
|
{
|
|
"epoch": 20.021818181818183,
|
|
"grad_norm": 201.38262939453125,
|
|
"learning_rate": 3.8787878787878784e-07,
|
|
"loss": 0.708,
|
|
"step": 4540
|
|
},
|
|
{
|
|
"epoch": 20.023636363636363,
|
|
"grad_norm": 4.3072638511657715,
|
|
"learning_rate": 3.8383838383838377e-07,
|
|
"loss": 0.7019,
|
|
"step": 4550
|
|
},
|
|
{
|
|
"epoch": 20.025454545454547,
|
|
"grad_norm": 66.07865142822266,
|
|
"learning_rate": 3.7979797979797976e-07,
|
|
"loss": 0.5985,
|
|
"step": 4560
|
|
},
|
|
{
|
|
"epoch": 20.027272727272727,
|
|
"grad_norm": 22.882381439208984,
|
|
"learning_rate": 3.757575757575757e-07,
|
|
"loss": 0.9093,
|
|
"step": 4570
|
|
},
|
|
{
|
|
"epoch": 20.029090909090908,
|
|
"grad_norm": 126.09075927734375,
|
|
"learning_rate": 3.717171717171717e-07,
|
|
"loss": 0.8264,
|
|
"step": 4580
|
|
},
|
|
{
|
|
"epoch": 20.03090909090909,
|
|
"grad_norm": 21.116567611694336,
|
|
"learning_rate": 3.676767676767676e-07,
|
|
"loss": 0.6192,
|
|
"step": 4590
|
|
},
|
|
{
|
|
"epoch": 20.03272727272727,
|
|
"grad_norm": 38.860923767089844,
|
|
"learning_rate": 3.636363636363636e-07,
|
|
"loss": 0.4868,
|
|
"step": 4600
|
|
},
|
|
{
|
|
"epoch": 20.034545454545455,
|
|
"grad_norm": 45.6485710144043,
|
|
"learning_rate": 3.5959595959595954e-07,
|
|
"loss": 0.6272,
|
|
"step": 4610
|
|
},
|
|
{
|
|
"epoch": 20.036363636363635,
|
|
"grad_norm": 62.035587310791016,
|
|
"learning_rate": 3.5555555555555553e-07,
|
|
"loss": 0.5511,
|
|
"step": 4620
|
|
},
|
|
{
|
|
"epoch": 20.03818181818182,
|
|
"grad_norm": 30.594026565551758,
|
|
"learning_rate": 3.5151515151515146e-07,
|
|
"loss": 0.3039,
|
|
"step": 4630
|
|
},
|
|
{
|
|
"epoch": 20.04,
|
|
"grad_norm": 66.38878631591797,
|
|
"learning_rate": 3.4747474747474745e-07,
|
|
"loss": 0.4775,
|
|
"step": 4640
|
|
},
|
|
{
|
|
"epoch": 20.040181818181818,
|
|
"eval_accuracy": 0.7671957671957672,
|
|
"eval_loss": 0.5846399664878845,
|
|
"eval_runtime": 300.8509,
|
|
"eval_samples_per_second": 0.628,
|
|
"eval_steps_per_second": 0.16,
|
|
"step": 4641
|
|
},
|
|
{
|
|
"epoch": 21.001636363636365,
|
|
"grad_norm": 44.0944709777832,
|
|
"learning_rate": 3.434343434343434e-07,
|
|
"loss": 0.417,
|
|
"step": 4650
|
|
},
|
|
{
|
|
"epoch": 21.003454545454545,
|
|
"grad_norm": 40.17851257324219,
|
|
"learning_rate": 3.393939393939394e-07,
|
|
"loss": 0.9459,
|
|
"step": 4660
|
|
},
|
|
{
|
|
"epoch": 21.00527272727273,
|
|
"grad_norm": 25.39684295654297,
|
|
"learning_rate": 3.353535353535353e-07,
|
|
"loss": 0.5275,
|
|
"step": 4670
|
|
},
|
|
{
|
|
"epoch": 21.00709090909091,
|
|
"grad_norm": 67.4314193725586,
|
|
"learning_rate": 3.3131313131313135e-07,
|
|
"loss": 0.6135,
|
|
"step": 4680
|
|
},
|
|
{
|
|
"epoch": 21.00890909090909,
|
|
"grad_norm": 175.815673828125,
|
|
"learning_rate": 3.272727272727273e-07,
|
|
"loss": 0.4335,
|
|
"step": 4690
|
|
},
|
|
{
|
|
"epoch": 21.010727272727273,
|
|
"grad_norm": 9.580012321472168,
|
|
"learning_rate": 3.2323232323232327e-07,
|
|
"loss": 0.2622,
|
|
"step": 4700
|
|
},
|
|
{
|
|
"epoch": 21.012545454545453,
|
|
"grad_norm": 119.75627899169922,
|
|
"learning_rate": 3.191919191919192e-07,
|
|
"loss": 0.7259,
|
|
"step": 4710
|
|
},
|
|
{
|
|
"epoch": 21.014363636363637,
|
|
"grad_norm": 32.749778747558594,
|
|
"learning_rate": 3.1515151515151514e-07,
|
|
"loss": 0.4853,
|
|
"step": 4720
|
|
},
|
|
{
|
|
"epoch": 21.016181818181817,
|
|
"grad_norm": 101.70720672607422,
|
|
"learning_rate": 3.111111111111111e-07,
|
|
"loss": 0.7484,
|
|
"step": 4730
|
|
},
|
|
{
|
|
"epoch": 21.018,
|
|
"grad_norm": 70.83300018310547,
|
|
"learning_rate": 3.0707070707070706e-07,
|
|
"loss": 0.3837,
|
|
"step": 4740
|
|
},
|
|
{
|
|
"epoch": 21.01981818181818,
|
|
"grad_norm": 87.50294494628906,
|
|
"learning_rate": 3.0303030303030305e-07,
|
|
"loss": 0.416,
|
|
"step": 4750
|
|
},
|
|
{
|
|
"epoch": 21.021636363636365,
|
|
"grad_norm": 86.63851928710938,
|
|
"learning_rate": 2.98989898989899e-07,
|
|
"loss": 0.4393,
|
|
"step": 4760
|
|
},
|
|
{
|
|
"epoch": 21.023454545454545,
|
|
"grad_norm": 27.838314056396484,
|
|
"learning_rate": 2.9494949494949497e-07,
|
|
"loss": 0.5648,
|
|
"step": 4770
|
|
},
|
|
{
|
|
"epoch": 21.02527272727273,
|
|
"grad_norm": 25.77659797668457,
|
|
"learning_rate": 2.909090909090909e-07,
|
|
"loss": 0.9765,
|
|
"step": 4780
|
|
},
|
|
{
|
|
"epoch": 21.02709090909091,
|
|
"grad_norm": 56.5269775390625,
|
|
"learning_rate": 2.868686868686869e-07,
|
|
"loss": 0.8689,
|
|
"step": 4790
|
|
},
|
|
{
|
|
"epoch": 21.028909090909092,
|
|
"grad_norm": 41.66890335083008,
|
|
"learning_rate": 2.8282828282828283e-07,
|
|
"loss": 0.5995,
|
|
"step": 4800
|
|
},
|
|
{
|
|
"epoch": 21.030727272727272,
|
|
"grad_norm": 99.37694549560547,
|
|
"learning_rate": 2.787878787878788e-07,
|
|
"loss": 0.4791,
|
|
"step": 4810
|
|
},
|
|
{
|
|
"epoch": 21.032545454545456,
|
|
"grad_norm": 28.070980072021484,
|
|
"learning_rate": 2.7474747474747475e-07,
|
|
"loss": 0.7004,
|
|
"step": 4820
|
|
},
|
|
{
|
|
"epoch": 21.034363636363636,
|
|
"grad_norm": 21.109210968017578,
|
|
"learning_rate": 2.7070707070707074e-07,
|
|
"loss": 0.7725,
|
|
"step": 4830
|
|
},
|
|
{
|
|
"epoch": 21.036181818181817,
|
|
"grad_norm": 69.2674560546875,
|
|
"learning_rate": 2.6666666666666667e-07,
|
|
"loss": 0.8903,
|
|
"step": 4840
|
|
},
|
|
{
|
|
"epoch": 21.038,
|
|
"grad_norm": 62.15687942504883,
|
|
"learning_rate": 2.6262626262626266e-07,
|
|
"loss": 0.7883,
|
|
"step": 4850
|
|
},
|
|
{
|
|
"epoch": 21.03981818181818,
|
|
"grad_norm": 60.2054557800293,
|
|
"learning_rate": 2.585858585858586e-07,
|
|
"loss": 0.8137,
|
|
"step": 4860
|
|
},
|
|
{
|
|
"epoch": 21.040181818181818,
|
|
"eval_accuracy": 0.7354497354497355,
|
|
"eval_loss": 0.6724361777305603,
|
|
"eval_runtime": 329.4345,
|
|
"eval_samples_per_second": 0.574,
|
|
"eval_steps_per_second": 0.146,
|
|
"step": 4862
|
|
},
|
|
{
|
|
"epoch": 22.001454545454546,
|
|
"grad_norm": 45.40814208984375,
|
|
"learning_rate": 2.5454545454545453e-07,
|
|
"loss": 0.6947,
|
|
"step": 4870
|
|
},
|
|
{
|
|
"epoch": 22.003272727272726,
|
|
"grad_norm": 241.23184204101562,
|
|
"learning_rate": 2.505050505050505e-07,
|
|
"loss": 0.6354,
|
|
"step": 4880
|
|
},
|
|
{
|
|
"epoch": 22.00509090909091,
|
|
"grad_norm": 101.31082916259766,
|
|
"learning_rate": 2.4646464646464645e-07,
|
|
"loss": 0.5303,
|
|
"step": 4890
|
|
},
|
|
{
|
|
"epoch": 22.00690909090909,
|
|
"grad_norm": 25.400516510009766,
|
|
"learning_rate": 2.4242424242424244e-07,
|
|
"loss": 0.8661,
|
|
"step": 4900
|
|
},
|
|
{
|
|
"epoch": 22.008727272727274,
|
|
"grad_norm": 144.255615234375,
|
|
"learning_rate": 2.3838383838383837e-07,
|
|
"loss": 0.6776,
|
|
"step": 4910
|
|
},
|
|
{
|
|
"epoch": 22.010545454545454,
|
|
"grad_norm": 26.609569549560547,
|
|
"learning_rate": 2.3434343434343433e-07,
|
|
"loss": 0.4781,
|
|
"step": 4920
|
|
},
|
|
{
|
|
"epoch": 22.012363636363638,
|
|
"grad_norm": 42.74632263183594,
|
|
"learning_rate": 2.303030303030303e-07,
|
|
"loss": 0.6413,
|
|
"step": 4930
|
|
},
|
|
{
|
|
"epoch": 22.014181818181818,
|
|
"grad_norm": 87.13990783691406,
|
|
"learning_rate": 2.2626262626262626e-07,
|
|
"loss": 0.6047,
|
|
"step": 4940
|
|
},
|
|
{
|
|
"epoch": 22.016,
|
|
"grad_norm": 34.36100387573242,
|
|
"learning_rate": 2.222222222222222e-07,
|
|
"loss": 0.6049,
|
|
"step": 4950
|
|
},
|
|
{
|
|
"epoch": 22.017818181818182,
|
|
"grad_norm": 26.21919059753418,
|
|
"learning_rate": 2.1818181818181815e-07,
|
|
"loss": 0.5679,
|
|
"step": 4960
|
|
},
|
|
{
|
|
"epoch": 22.019636363636362,
|
|
"grad_norm": 40.07568359375,
|
|
"learning_rate": 2.1414141414141411e-07,
|
|
"loss": 0.6462,
|
|
"step": 4970
|
|
},
|
|
{
|
|
"epoch": 22.021454545454546,
|
|
"grad_norm": 60.2342643737793,
|
|
"learning_rate": 2.1010101010101007e-07,
|
|
"loss": 0.7231,
|
|
"step": 4980
|
|
},
|
|
{
|
|
"epoch": 22.023272727272726,
|
|
"grad_norm": 112.45123291015625,
|
|
"learning_rate": 2.0606060606060604e-07,
|
|
"loss": 0.6436,
|
|
"step": 4990
|
|
},
|
|
{
|
|
"epoch": 22.02509090909091,
|
|
"grad_norm": 19.947439193725586,
|
|
"learning_rate": 2.02020202020202e-07,
|
|
"loss": 0.4875,
|
|
"step": 5000
|
|
},
|
|
{
|
|
"epoch": 22.02690909090909,
|
|
"grad_norm": 98.87757110595703,
|
|
"learning_rate": 1.9797979797979796e-07,
|
|
"loss": 0.456,
|
|
"step": 5010
|
|
},
|
|
{
|
|
"epoch": 22.028727272727274,
|
|
"grad_norm": 35.51746368408203,
|
|
"learning_rate": 1.9393939393939392e-07,
|
|
"loss": 0.7567,
|
|
"step": 5020
|
|
},
|
|
{
|
|
"epoch": 22.030545454545454,
|
|
"grad_norm": 18.169660568237305,
|
|
"learning_rate": 1.8989898989898988e-07,
|
|
"loss": 0.6013,
|
|
"step": 5030
|
|
},
|
|
{
|
|
"epoch": 22.032363636363637,
|
|
"grad_norm": 128.1077423095703,
|
|
"learning_rate": 1.8585858585858584e-07,
|
|
"loss": 0.8739,
|
|
"step": 5040
|
|
},
|
|
{
|
|
"epoch": 22.034181818181818,
|
|
"grad_norm": 84.50912475585938,
|
|
"learning_rate": 1.818181818181818e-07,
|
|
"loss": 0.9112,
|
|
"step": 5050
|
|
},
|
|
{
|
|
"epoch": 22.036,
|
|
"grad_norm": 82.26988220214844,
|
|
"learning_rate": 1.7777777777777776e-07,
|
|
"loss": 0.5877,
|
|
"step": 5060
|
|
},
|
|
{
|
|
"epoch": 22.03781818181818,
|
|
"grad_norm": 252.0263214111328,
|
|
"learning_rate": 1.7373737373737372e-07,
|
|
"loss": 0.5726,
|
|
"step": 5070
|
|
},
|
|
{
|
|
"epoch": 22.039636363636365,
|
|
"grad_norm": 17.127042770385742,
|
|
"learning_rate": 1.696969696969697e-07,
|
|
"loss": 0.4226,
|
|
"step": 5080
|
|
},
|
|
{
|
|
"epoch": 22.040181818181818,
|
|
"eval_accuracy": 0.746031746031746,
|
|
"eval_loss": 0.6772251129150391,
|
|
"eval_runtime": 278.3106,
|
|
"eval_samples_per_second": 0.679,
|
|
"eval_steps_per_second": 0.172,
|
|
"step": 5083
|
|
},
|
|
{
|
|
"epoch": 23.001272727272728,
|
|
"grad_norm": 3.078542947769165,
|
|
"learning_rate": 1.6565656565656567e-07,
|
|
"loss": 0.5202,
|
|
"step": 5090
|
|
},
|
|
{
|
|
"epoch": 23.003090909090908,
|
|
"grad_norm": 20.412845611572266,
|
|
"learning_rate": 1.6161616161616163e-07,
|
|
"loss": 0.6778,
|
|
"step": 5100
|
|
},
|
|
{
|
|
"epoch": 23.00490909090909,
|
|
"grad_norm": 22.000837326049805,
|
|
"learning_rate": 1.5757575757575757e-07,
|
|
"loss": 0.4499,
|
|
"step": 5110
|
|
},
|
|
{
|
|
"epoch": 23.00672727272727,
|
|
"grad_norm": 52.1019401550293,
|
|
"learning_rate": 1.5353535353535353e-07,
|
|
"loss": 0.9384,
|
|
"step": 5120
|
|
},
|
|
{
|
|
"epoch": 23.008545454545455,
|
|
"grad_norm": 19.972681045532227,
|
|
"learning_rate": 1.494949494949495e-07,
|
|
"loss": 0.5696,
|
|
"step": 5130
|
|
},
|
|
{
|
|
"epoch": 23.010363636363635,
|
|
"grad_norm": 54.88703155517578,
|
|
"learning_rate": 1.4545454545454545e-07,
|
|
"loss": 0.4899,
|
|
"step": 5140
|
|
},
|
|
{
|
|
"epoch": 23.01218181818182,
|
|
"grad_norm": 82.5994873046875,
|
|
"learning_rate": 1.4141414141414141e-07,
|
|
"loss": 0.4649,
|
|
"step": 5150
|
|
},
|
|
{
|
|
"epoch": 23.014,
|
|
"grad_norm": 109.95027160644531,
|
|
"learning_rate": 1.3737373737373738e-07,
|
|
"loss": 0.8847,
|
|
"step": 5160
|
|
},
|
|
{
|
|
"epoch": 23.015818181818183,
|
|
"grad_norm": 107.37582397460938,
|
|
"learning_rate": 1.3333333333333334e-07,
|
|
"loss": 0.3493,
|
|
"step": 5170
|
|
},
|
|
{
|
|
"epoch": 23.017636363636363,
|
|
"grad_norm": 71.61345672607422,
|
|
"learning_rate": 1.292929292929293e-07,
|
|
"loss": 0.8436,
|
|
"step": 5180
|
|
},
|
|
{
|
|
"epoch": 23.019454545454547,
|
|
"grad_norm": 105.52105712890625,
|
|
"learning_rate": 1.2525252525252526e-07,
|
|
"loss": 1.107,
|
|
"step": 5190
|
|
},
|
|
{
|
|
"epoch": 23.021272727272727,
|
|
"grad_norm": 39.01383590698242,
|
|
"learning_rate": 1.2121212121212122e-07,
|
|
"loss": 0.4364,
|
|
"step": 5200
|
|
},
|
|
{
|
|
"epoch": 23.02309090909091,
|
|
"grad_norm": 21.590635299682617,
|
|
"learning_rate": 1.1717171717171717e-07,
|
|
"loss": 0.4454,
|
|
"step": 5210
|
|
},
|
|
{
|
|
"epoch": 23.02490909090909,
|
|
"grad_norm": 57.09373474121094,
|
|
"learning_rate": 1.1313131313131313e-07,
|
|
"loss": 0.5412,
|
|
"step": 5220
|
|
},
|
|
{
|
|
"epoch": 23.02672727272727,
|
|
"grad_norm": 72.85536193847656,
|
|
"learning_rate": 1.0909090909090908e-07,
|
|
"loss": 0.7742,
|
|
"step": 5230
|
|
},
|
|
{
|
|
"epoch": 23.028545454545455,
|
|
"grad_norm": 41.63391876220703,
|
|
"learning_rate": 1.0505050505050504e-07,
|
|
"loss": 0.5171,
|
|
"step": 5240
|
|
},
|
|
{
|
|
"epoch": 23.030363636363635,
|
|
"grad_norm": 45.07915496826172,
|
|
"learning_rate": 1.01010101010101e-07,
|
|
"loss": 0.6507,
|
|
"step": 5250
|
|
},
|
|
{
|
|
"epoch": 23.03218181818182,
|
|
"grad_norm": 54.700439453125,
|
|
"learning_rate": 9.696969696969696e-08,
|
|
"loss": 0.74,
|
|
"step": 5260
|
|
},
|
|
{
|
|
"epoch": 23.034,
|
|
"grad_norm": 38.23246383666992,
|
|
"learning_rate": 9.292929292929292e-08,
|
|
"loss": 0.6672,
|
|
"step": 5270
|
|
},
|
|
{
|
|
"epoch": 23.035818181818183,
|
|
"grad_norm": 99.04003143310547,
|
|
"learning_rate": 8.888888888888888e-08,
|
|
"loss": 0.6738,
|
|
"step": 5280
|
|
},
|
|
{
|
|
"epoch": 23.037636363636363,
|
|
"grad_norm": 83.21279907226562,
|
|
"learning_rate": 8.484848484848486e-08,
|
|
"loss": 0.3389,
|
|
"step": 5290
|
|
},
|
|
{
|
|
"epoch": 23.039454545454547,
|
|
"grad_norm": 50.007049560546875,
|
|
"learning_rate": 8.080808080808082e-08,
|
|
"loss": 0.6616,
|
|
"step": 5300
|
|
},
|
|
{
|
|
"epoch": 23.040181818181818,
|
|
"eval_accuracy": 0.7407407407407407,
|
|
"eval_loss": 0.6855835318565369,
|
|
"eval_runtime": 233.4207,
|
|
"eval_samples_per_second": 0.81,
|
|
"eval_steps_per_second": 0.206,
|
|
"step": 5304
|
|
},
|
|
{
|
|
"epoch": 24.00109090909091,
|
|
"grad_norm": 45.41238784790039,
|
|
"learning_rate": 7.676767676767677e-08,
|
|
"loss": 0.6613,
|
|
"step": 5310
|
|
},
|
|
{
|
|
"epoch": 24.002909090909093,
|
|
"grad_norm": 27.93305015563965,
|
|
"learning_rate": 7.272727272727273e-08,
|
|
"loss": 0.8375,
|
|
"step": 5320
|
|
},
|
|
{
|
|
"epoch": 24.004727272727273,
|
|
"grad_norm": 77.27233123779297,
|
|
"learning_rate": 6.868686868686869e-08,
|
|
"loss": 0.7925,
|
|
"step": 5330
|
|
},
|
|
{
|
|
"epoch": 24.006545454545453,
|
|
"grad_norm": 0.9629121422767639,
|
|
"learning_rate": 6.464646464646465e-08,
|
|
"loss": 0.8575,
|
|
"step": 5340
|
|
},
|
|
{
|
|
"epoch": 24.008363636363637,
|
|
"grad_norm": 2.4619650840759277,
|
|
"learning_rate": 6.060606060606061e-08,
|
|
"loss": 0.6424,
|
|
"step": 5350
|
|
},
|
|
{
|
|
"epoch": 24.010181818181817,
|
|
"grad_norm": 49.1207160949707,
|
|
"learning_rate": 5.6565656565656564e-08,
|
|
"loss": 0.5061,
|
|
"step": 5360
|
|
},
|
|
{
|
|
"epoch": 24.012,
|
|
"grad_norm": 260.5978698730469,
|
|
"learning_rate": 5.252525252525252e-08,
|
|
"loss": 0.5873,
|
|
"step": 5370
|
|
},
|
|
{
|
|
"epoch": 24.01381818181818,
|
|
"grad_norm": 115.65216064453125,
|
|
"learning_rate": 4.848484848484848e-08,
|
|
"loss": 0.5979,
|
|
"step": 5380
|
|
},
|
|
{
|
|
"epoch": 24.015636363636364,
|
|
"grad_norm": 32.967918395996094,
|
|
"learning_rate": 4.444444444444444e-08,
|
|
"loss": 0.7006,
|
|
"step": 5390
|
|
},
|
|
{
|
|
"epoch": 24.017454545454545,
|
|
"grad_norm": 3.882549285888672,
|
|
"learning_rate": 4.040404040404041e-08,
|
|
"loss": 0.3618,
|
|
"step": 5400
|
|
},
|
|
{
|
|
"epoch": 24.019272727272728,
|
|
"grad_norm": 24.57760238647461,
|
|
"learning_rate": 3.636363636363636e-08,
|
|
"loss": 0.5466,
|
|
"step": 5410
|
|
},
|
|
{
|
|
"epoch": 24.02109090909091,
|
|
"grad_norm": 95.3873519897461,
|
|
"learning_rate": 3.2323232323232324e-08,
|
|
"loss": 0.9054,
|
|
"step": 5420
|
|
},
|
|
{
|
|
"epoch": 24.022909090909092,
|
|
"grad_norm": 57.160274505615234,
|
|
"learning_rate": 2.8282828282828282e-08,
|
|
"loss": 0.4994,
|
|
"step": 5430
|
|
},
|
|
{
|
|
"epoch": 24.024727272727272,
|
|
"grad_norm": 71.60919952392578,
|
|
"learning_rate": 2.424242424242424e-08,
|
|
"loss": 0.6256,
|
|
"step": 5440
|
|
},
|
|
{
|
|
"epoch": 24.026545454545456,
|
|
"grad_norm": 96.06597137451172,
|
|
"learning_rate": 2.0202020202020204e-08,
|
|
"loss": 0.9133,
|
|
"step": 5450
|
|
},
|
|
{
|
|
"epoch": 24.028363636363636,
|
|
"grad_norm": 68.16241455078125,
|
|
"learning_rate": 1.6161616161616162e-08,
|
|
"loss": 0.4041,
|
|
"step": 5460
|
|
},
|
|
{
|
|
"epoch": 24.03018181818182,
|
|
"grad_norm": 75.95323944091797,
|
|
"learning_rate": 1.212121212121212e-08,
|
|
"loss": 0.3785,
|
|
"step": 5470
|
|
},
|
|
{
|
|
"epoch": 24.032,
|
|
"grad_norm": 21.103824615478516,
|
|
"learning_rate": 8.080808080808081e-09,
|
|
"loss": 0.5002,
|
|
"step": 5480
|
|
},
|
|
{
|
|
"epoch": 24.03381818181818,
|
|
"grad_norm": 22.679424285888672,
|
|
"learning_rate": 4.0404040404040405e-09,
|
|
"loss": 0.4925,
|
|
"step": 5490
|
|
},
|
|
{
|
|
"epoch": 24.035636363636364,
|
|
"grad_norm": 219.1442108154297,
|
|
"learning_rate": 0.0,
|
|
"loss": 0.5246,
|
|
"step": 5500
|
|
},
|
|
{
|
|
"epoch": 24.035636363636364,
|
|
"eval_accuracy": 0.7407407407407407,
|
|
"eval_loss": 0.6568745374679565,
|
|
"eval_runtime": 252.0189,
|
|
"eval_samples_per_second": 0.75,
|
|
"eval_steps_per_second": 0.19,
|
|
"step": 5500
|
|
},
|
|
{
|
|
"epoch": 24.035636363636364,
|
|
"step": 5500,
|
|
"total_flos": 4.389253736670403e+19,
|
|
"train_loss": 0.6518208643306386,
|
|
"train_runtime": 40508.4799,
|
|
"train_samples_per_second": 0.543,
|
|
"train_steps_per_second": 0.136
|
|
}
|
|
],
|
|
"logging_steps": 10,
|
|
"max_steps": 5500,
|
|
"num_input_tokens_seen": 0,
|
|
"num_train_epochs": 9223372036854775807,
|
|
"save_steps": 500,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": true
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 4.389253736670403e+19,
|
|
"train_batch_size": 4,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|
|
|