ft-wmt14-5 / trainer_state.json
lilferrit's picture
End of training
e188f5a verified
{
"best_metric": 20.7584,
"best_model_checkpoint": "/local1/hfs/gs_stuff/ft-wmt14-5/checkpoint-90000",
"epoch": 2.7777777777777777,
"eval_steps": 10000,
"global_step": 100000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.027777777777777776,
"grad_norm": 1.9314790964126587,
"learning_rate": 0.0005,
"loss": 3.3589,
"step": 1000
},
{
"epoch": 0.05555555555555555,
"grad_norm": 1.7348469495773315,
"learning_rate": 0.0005,
"loss": 2.5263,
"step": 2000
},
{
"epoch": 0.08333333333333333,
"grad_norm": 1.9181748628616333,
"learning_rate": 0.0005,
"loss": 2.3365,
"step": 3000
},
{
"epoch": 0.1111111111111111,
"grad_norm": 1.6642646789550781,
"learning_rate": 0.0005,
"loss": 2.2207,
"step": 4000
},
{
"epoch": 0.1388888888888889,
"grad_norm": 1.1876742839813232,
"learning_rate": 0.0005,
"loss": 2.1363,
"step": 5000
},
{
"epoch": 0.16666666666666666,
"grad_norm": 1.567658543586731,
"learning_rate": 0.0005,
"loss": 2.0733,
"step": 6000
},
{
"epoch": 0.19444444444444445,
"grad_norm": 1.2552471160888672,
"learning_rate": 0.0005,
"loss": 2.0262,
"step": 7000
},
{
"epoch": 0.2222222222222222,
"grad_norm": 1.049357533454895,
"learning_rate": 0.0005,
"loss": 1.9775,
"step": 8000
},
{
"epoch": 0.25,
"grad_norm": 1.303145170211792,
"learning_rate": 0.0005,
"loss": 1.9412,
"step": 9000
},
{
"epoch": 0.2777777777777778,
"grad_norm": 1.0213723182678223,
"learning_rate": 0.0005,
"loss": 1.9166,
"step": 10000
},
{
"epoch": 0.2777777777777778,
"eval_bleu": 15.8119,
"eval_gen_len": 32.097,
"eval_loss": 2.31050968170166,
"eval_runtime": 410.6001,
"eval_samples_per_second": 7.306,
"eval_steps_per_second": 0.913,
"step": 10000
},
{
"epoch": 0.3055555555555556,
"grad_norm": 1.2851905822753906,
"learning_rate": 0.0005,
"loss": 1.8878,
"step": 11000
},
{
"epoch": 0.3333333333333333,
"grad_norm": 0.8447160720825195,
"learning_rate": 0.0005,
"loss": 1.8492,
"step": 12000
},
{
"epoch": 0.3611111111111111,
"grad_norm": 1.1516064405441284,
"learning_rate": 0.0005,
"loss": 1.8309,
"step": 13000
},
{
"epoch": 0.3888888888888889,
"grad_norm": 1.0370670557022095,
"learning_rate": 0.0005,
"loss": 1.8057,
"step": 14000
},
{
"epoch": 0.4166666666666667,
"grad_norm": 1.1649495363235474,
"learning_rate": 0.0005,
"loss": 1.7867,
"step": 15000
},
{
"epoch": 0.4444444444444444,
"grad_norm": 1.2666045427322388,
"learning_rate": 0.0005,
"loss": 1.7679,
"step": 16000
},
{
"epoch": 0.4722222222222222,
"grad_norm": 1.0923264026641846,
"learning_rate": 0.0005,
"loss": 1.7563,
"step": 17000
},
{
"epoch": 0.5,
"grad_norm": 1.560994029045105,
"learning_rate": 0.0005,
"loss": 1.7342,
"step": 18000
},
{
"epoch": 0.5277777777777778,
"grad_norm": 0.9684827327728271,
"learning_rate": 0.0005,
"loss": 1.7228,
"step": 19000
},
{
"epoch": 0.5555555555555556,
"grad_norm": 0.9182453751564026,
"learning_rate": 0.0005,
"loss": 1.7184,
"step": 20000
},
{
"epoch": 0.5555555555555556,
"eval_bleu": 17.5903,
"eval_gen_len": 31.1153,
"eval_loss": 2.19934344291687,
"eval_runtime": 393.3017,
"eval_samples_per_second": 7.628,
"eval_steps_per_second": 0.953,
"step": 20000
},
{
"epoch": 0.5833333333333334,
"grad_norm": 0.8953577280044556,
"learning_rate": 0.0005,
"loss": 1.7042,
"step": 21000
},
{
"epoch": 0.6111111111111112,
"grad_norm": 0.9418250918388367,
"learning_rate": 0.0005,
"loss": 1.683,
"step": 22000
},
{
"epoch": 0.6388888888888888,
"grad_norm": 0.8577601909637451,
"learning_rate": 0.0005,
"loss": 1.6799,
"step": 23000
},
{
"epoch": 0.6666666666666666,
"grad_norm": 0.9786076545715332,
"learning_rate": 0.0005,
"loss": 1.6675,
"step": 24000
},
{
"epoch": 0.6944444444444444,
"grad_norm": 0.9262654781341553,
"learning_rate": 0.0005,
"loss": 1.6499,
"step": 25000
},
{
"epoch": 0.7222222222222222,
"grad_norm": 0.8759564757347107,
"learning_rate": 0.0005,
"loss": 1.6468,
"step": 26000
},
{
"epoch": 0.75,
"grad_norm": 1.0495752096176147,
"learning_rate": 0.0005,
"loss": 1.6285,
"step": 27000
},
{
"epoch": 0.7777777777777778,
"grad_norm": 1.092642068862915,
"learning_rate": 0.0005,
"loss": 1.6276,
"step": 28000
},
{
"epoch": 0.8055555555555556,
"grad_norm": 0.8775661587715149,
"learning_rate": 0.0005,
"loss": 1.6172,
"step": 29000
},
{
"epoch": 0.8333333333333334,
"grad_norm": 0.8970679044723511,
"learning_rate": 0.0005,
"loss": 1.6061,
"step": 30000
},
{
"epoch": 0.8333333333333334,
"eval_bleu": 18.9604,
"eval_gen_len": 30.327,
"eval_loss": 2.1379551887512207,
"eval_runtime": 380.095,
"eval_samples_per_second": 7.893,
"eval_steps_per_second": 0.987,
"step": 30000
},
{
"epoch": 0.8611111111111112,
"grad_norm": 0.9657310247421265,
"learning_rate": 0.0005,
"loss": 1.5959,
"step": 31000
},
{
"epoch": 0.8888888888888888,
"grad_norm": 0.8748376369476318,
"learning_rate": 0.0005,
"loss": 1.5908,
"step": 32000
},
{
"epoch": 0.9166666666666666,
"grad_norm": 0.8462302088737488,
"learning_rate": 0.0005,
"loss": 1.5845,
"step": 33000
},
{
"epoch": 0.9444444444444444,
"grad_norm": 0.9005241394042969,
"learning_rate": 0.0005,
"loss": 1.5699,
"step": 34000
},
{
"epoch": 0.9722222222222222,
"grad_norm": 0.9596630930900574,
"learning_rate": 0.0005,
"loss": 1.5752,
"step": 35000
},
{
"epoch": 1.0,
"grad_norm": 0.8307533860206604,
"learning_rate": 0.0005,
"loss": 1.5634,
"step": 36000
},
{
"epoch": 1.0277777777777777,
"grad_norm": 0.9918788075447083,
"learning_rate": 0.0005,
"loss": 1.5117,
"step": 37000
},
{
"epoch": 1.0555555555555556,
"grad_norm": 0.9118058085441589,
"learning_rate": 0.0005,
"loss": 1.5023,
"step": 38000
},
{
"epoch": 1.0833333333333333,
"grad_norm": 0.7213552594184875,
"learning_rate": 0.0005,
"loss": 1.5087,
"step": 39000
},
{
"epoch": 1.1111111111111112,
"grad_norm": 1.0255305767059326,
"learning_rate": 0.0005,
"loss": 1.516,
"step": 40000
},
{
"epoch": 1.1111111111111112,
"eval_bleu": 19.1444,
"eval_gen_len": 30.2727,
"eval_loss": 2.1365692615509033,
"eval_runtime": 377.1737,
"eval_samples_per_second": 7.954,
"eval_steps_per_second": 0.994,
"step": 40000
},
{
"epoch": 1.1388888888888888,
"grad_norm": 0.8766499161720276,
"learning_rate": 0.0005,
"loss": 1.5096,
"step": 41000
},
{
"epoch": 1.1666666666666667,
"grad_norm": 1.1786612272262573,
"learning_rate": 0.0005,
"loss": 1.4982,
"step": 42000
},
{
"epoch": 1.1944444444444444,
"grad_norm": 1.011268973350525,
"learning_rate": 0.0005,
"loss": 1.5013,
"step": 43000
},
{
"epoch": 1.2222222222222223,
"grad_norm": 1.0863969326019287,
"learning_rate": 0.0005,
"loss": 1.4878,
"step": 44000
},
{
"epoch": 1.25,
"grad_norm": 0.9729832410812378,
"learning_rate": 0.0005,
"loss": 1.4922,
"step": 45000
},
{
"epoch": 1.2777777777777777,
"grad_norm": 1.3476896286010742,
"learning_rate": 0.0005,
"loss": 1.4876,
"step": 46000
},
{
"epoch": 1.3055555555555556,
"grad_norm": 0.8493963479995728,
"learning_rate": 0.0005,
"loss": 1.4823,
"step": 47000
},
{
"epoch": 1.3333333333333333,
"grad_norm": 1.0311123132705688,
"learning_rate": 0.0005,
"loss": 1.4739,
"step": 48000
},
{
"epoch": 1.3611111111111112,
"grad_norm": 1.259581446647644,
"learning_rate": 0.0005,
"loss": 1.4747,
"step": 49000
},
{
"epoch": 1.3888888888888888,
"grad_norm": 1.1934195756912231,
"learning_rate": 0.0005,
"loss": 1.4675,
"step": 50000
},
{
"epoch": 1.3888888888888888,
"eval_bleu": 19.7588,
"eval_gen_len": 30.1127,
"eval_loss": 2.120835781097412,
"eval_runtime": 372.4281,
"eval_samples_per_second": 8.055,
"eval_steps_per_second": 1.007,
"step": 50000
},
{
"epoch": 1.4166666666666667,
"grad_norm": 1.1824595928192139,
"learning_rate": 0.0005,
"loss": 1.4659,
"step": 51000
},
{
"epoch": 1.4444444444444444,
"grad_norm": 1.1661032438278198,
"learning_rate": 0.0005,
"loss": 1.4737,
"step": 52000
},
{
"epoch": 1.4722222222222223,
"grad_norm": 0.7856634259223938,
"learning_rate": 0.0005,
"loss": 1.4595,
"step": 53000
},
{
"epoch": 1.5,
"grad_norm": 0.9908609986305237,
"learning_rate": 0.0005,
"loss": 1.4656,
"step": 54000
},
{
"epoch": 1.5277777777777777,
"grad_norm": 0.9270644187927246,
"learning_rate": 0.0005,
"loss": 1.4524,
"step": 55000
},
{
"epoch": 1.5555555555555556,
"grad_norm": 0.9910904169082642,
"learning_rate": 0.0005,
"loss": 1.4453,
"step": 56000
},
{
"epoch": 1.5833333333333335,
"grad_norm": 1.0300639867782593,
"learning_rate": 0.0005,
"loss": 1.451,
"step": 57000
},
{
"epoch": 1.6111111111111112,
"grad_norm": 0.809105396270752,
"learning_rate": 0.0005,
"loss": 1.444,
"step": 58000
},
{
"epoch": 1.6388888888888888,
"grad_norm": 0.7915866374969482,
"learning_rate": 0.0005,
"loss": 1.4421,
"step": 59000
},
{
"epoch": 1.6666666666666665,
"grad_norm": 0.9778928756713867,
"learning_rate": 0.0005,
"loss": 1.4416,
"step": 60000
},
{
"epoch": 1.6666666666666665,
"eval_bleu": 19.9263,
"eval_gen_len": 30.4463,
"eval_loss": 2.088862657546997,
"eval_runtime": 383.2772,
"eval_samples_per_second": 7.827,
"eval_steps_per_second": 0.978,
"step": 60000
},
{
"epoch": 1.6944444444444444,
"grad_norm": 0.8484209775924683,
"learning_rate": 0.0005,
"loss": 1.4313,
"step": 61000
},
{
"epoch": 1.7222222222222223,
"grad_norm": 0.8703031539916992,
"learning_rate": 0.0005,
"loss": 1.4405,
"step": 62000
},
{
"epoch": 1.75,
"grad_norm": 1.4096006155014038,
"learning_rate": 0.0005,
"loss": 1.4375,
"step": 63000
},
{
"epoch": 1.7777777777777777,
"grad_norm": 0.9177774786949158,
"learning_rate": 0.0005,
"loss": 1.4262,
"step": 64000
},
{
"epoch": 1.8055555555555556,
"grad_norm": 1.2332441806793213,
"learning_rate": 0.0005,
"loss": 1.4233,
"step": 65000
},
{
"epoch": 1.8333333333333335,
"grad_norm": 0.8750177621841431,
"learning_rate": 0.0005,
"loss": 1.4287,
"step": 66000
},
{
"epoch": 1.8611111111111112,
"grad_norm": 0.6736052632331848,
"learning_rate": 0.0005,
"loss": 1.4231,
"step": 67000
},
{
"epoch": 1.8888888888888888,
"grad_norm": 0.7802408933639526,
"learning_rate": 0.0005,
"loss": 1.4106,
"step": 68000
},
{
"epoch": 1.9166666666666665,
"grad_norm": 1.1860034465789795,
"learning_rate": 0.0005,
"loss": 1.4121,
"step": 69000
},
{
"epoch": 1.9444444444444444,
"grad_norm": 0.926054835319519,
"learning_rate": 0.0005,
"loss": 1.4111,
"step": 70000
},
{
"epoch": 1.9444444444444444,
"eval_bleu": 20.3323,
"eval_gen_len": 30.1207,
"eval_loss": 2.079472541809082,
"eval_runtime": 371.9755,
"eval_samples_per_second": 8.065,
"eval_steps_per_second": 1.008,
"step": 70000
},
{
"epoch": 1.9722222222222223,
"grad_norm": 1.1691533327102661,
"learning_rate": 0.0005,
"loss": 1.407,
"step": 71000
},
{
"epoch": 2.0,
"grad_norm": 0.9077666997909546,
"learning_rate": 0.0005,
"loss": 1.4051,
"step": 72000
},
{
"epoch": 2.0277777777777777,
"grad_norm": 0.9149623513221741,
"learning_rate": 0.0005,
"loss": 1.3517,
"step": 73000
},
{
"epoch": 2.0555555555555554,
"grad_norm": 1.0772947072982788,
"learning_rate": 0.0005,
"loss": 1.3624,
"step": 74000
},
{
"epoch": 2.0833333333333335,
"grad_norm": 0.7283540964126587,
"learning_rate": 0.0005,
"loss": 1.355,
"step": 75000
},
{
"epoch": 2.111111111111111,
"grad_norm": 0.7279065847396851,
"learning_rate": 0.0005,
"loss": 1.3526,
"step": 76000
},
{
"epoch": 2.138888888888889,
"grad_norm": 1.2707905769348145,
"learning_rate": 0.0005,
"loss": 1.3535,
"step": 77000
},
{
"epoch": 2.1666666666666665,
"grad_norm": 0.9000493288040161,
"learning_rate": 0.0005,
"loss": 1.3519,
"step": 78000
},
{
"epoch": 2.1944444444444446,
"grad_norm": 1.043967843055725,
"learning_rate": 0.0005,
"loss": 1.3567,
"step": 79000
},
{
"epoch": 2.2222222222222223,
"grad_norm": 1.1248853206634521,
"learning_rate": 0.0005,
"loss": 1.3603,
"step": 80000
},
{
"epoch": 2.2222222222222223,
"eval_bleu": 20.5373,
"eval_gen_len": 30.5943,
"eval_loss": 2.085047960281372,
"eval_runtime": 373.0705,
"eval_samples_per_second": 8.041,
"eval_steps_per_second": 1.005,
"step": 80000
},
{
"epoch": 2.25,
"grad_norm": 1.056221842765808,
"learning_rate": 0.0005,
"loss": 1.3657,
"step": 81000
},
{
"epoch": 2.2777777777777777,
"grad_norm": 0.9176587462425232,
"learning_rate": 0.0005,
"loss": 1.3572,
"step": 82000
},
{
"epoch": 2.3055555555555554,
"grad_norm": 1.0105085372924805,
"learning_rate": 0.0005,
"loss": 1.3498,
"step": 83000
},
{
"epoch": 2.3333333333333335,
"grad_norm": 1.1589380502700806,
"learning_rate": 0.0005,
"loss": 1.3567,
"step": 84000
},
{
"epoch": 2.361111111111111,
"grad_norm": 0.7733587622642517,
"learning_rate": 0.0005,
"loss": 1.3533,
"step": 85000
},
{
"epoch": 2.388888888888889,
"grad_norm": 1.036777138710022,
"learning_rate": 0.0005,
"loss": 1.3469,
"step": 86000
},
{
"epoch": 2.4166666666666665,
"grad_norm": 1.4935026168823242,
"learning_rate": 0.0005,
"loss": 1.3469,
"step": 87000
},
{
"epoch": 2.4444444444444446,
"grad_norm": 0.864630937576294,
"learning_rate": 0.0005,
"loss": 1.3506,
"step": 88000
},
{
"epoch": 2.4722222222222223,
"grad_norm": 0.8495751619338989,
"learning_rate": 0.0005,
"loss": 1.3408,
"step": 89000
},
{
"epoch": 2.5,
"grad_norm": 1.0840762853622437,
"learning_rate": 0.0005,
"loss": 1.3378,
"step": 90000
},
{
"epoch": 2.5,
"eval_bleu": 20.7584,
"eval_gen_len": 30.499,
"eval_loss": 2.0603742599487305,
"eval_runtime": 368.0992,
"eval_samples_per_second": 8.15,
"eval_steps_per_second": 1.019,
"step": 90000
},
{
"epoch": 2.5277777777777777,
"grad_norm": 0.7769622802734375,
"learning_rate": 0.0005,
"loss": 1.3409,
"step": 91000
},
{
"epoch": 2.5555555555555554,
"grad_norm": 1.049972414970398,
"learning_rate": 0.0005,
"loss": 1.3443,
"step": 92000
},
{
"epoch": 2.5833333333333335,
"grad_norm": 0.965621292591095,
"learning_rate": 0.0005,
"loss": 1.342,
"step": 93000
},
{
"epoch": 2.611111111111111,
"grad_norm": 0.8234182000160217,
"learning_rate": 0.0005,
"loss": 1.3297,
"step": 94000
},
{
"epoch": 2.638888888888889,
"grad_norm": 0.9464855790138245,
"learning_rate": 0.0005,
"loss": 1.3345,
"step": 95000
},
{
"epoch": 2.6666666666666665,
"grad_norm": 0.987382709980011,
"learning_rate": 0.0005,
"loss": 1.3284,
"step": 96000
},
{
"epoch": 2.6944444444444446,
"grad_norm": 0.6439863443374634,
"learning_rate": 0.0005,
"loss": 1.3285,
"step": 97000
},
{
"epoch": 2.7222222222222223,
"grad_norm": 0.8853390216827393,
"learning_rate": 0.0005,
"loss": 1.3339,
"step": 98000
},
{
"epoch": 2.75,
"grad_norm": 0.7582658529281616,
"learning_rate": 0.0005,
"loss": 1.3281,
"step": 99000
},
{
"epoch": 2.7777777777777777,
"grad_norm": 0.9061763882637024,
"learning_rate": 0.0005,
"loss": 1.3381,
"step": 100000
},
{
"epoch": 2.7777777777777777,
"eval_bleu": 20.6113,
"eval_gen_len": 30.701,
"eval_loss": 2.059664726257324,
"eval_runtime": 371.2241,
"eval_samples_per_second": 8.081,
"eval_steps_per_second": 1.01,
"step": 100000
},
{
"epoch": 2.7777777777777777,
"step": 100000,
"total_flos": 1.4240580791795712e+17,
"train_loss": 0.5475473999023438,
"train_runtime": 14821.2356,
"train_samples_per_second": 107.953,
"train_steps_per_second": 6.747
}
],
"logging_steps": 1000,
"max_steps": 100000,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 10000,
"total_flos": 1.4240580791795712e+17,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}