|
{ |
|
"best_metric": 20.7584, |
|
"best_model_checkpoint": "/local1/hfs/gs_stuff/ft-wmt14-5/checkpoint-90000", |
|
"epoch": 2.7777777777777777, |
|
"eval_steps": 10000, |
|
"global_step": 100000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.027777777777777776, |
|
"grad_norm": 1.9314790964126587, |
|
"learning_rate": 0.0005, |
|
"loss": 3.3589, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.05555555555555555, |
|
"grad_norm": 1.7348469495773315, |
|
"learning_rate": 0.0005, |
|
"loss": 2.5263, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.08333333333333333, |
|
"grad_norm": 1.9181748628616333, |
|
"learning_rate": 0.0005, |
|
"loss": 2.3365, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.1111111111111111, |
|
"grad_norm": 1.6642646789550781, |
|
"learning_rate": 0.0005, |
|
"loss": 2.2207, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.1388888888888889, |
|
"grad_norm": 1.1876742839813232, |
|
"learning_rate": 0.0005, |
|
"loss": 2.1363, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.16666666666666666, |
|
"grad_norm": 1.567658543586731, |
|
"learning_rate": 0.0005, |
|
"loss": 2.0733, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.19444444444444445, |
|
"grad_norm": 1.2552471160888672, |
|
"learning_rate": 0.0005, |
|
"loss": 2.0262, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.2222222222222222, |
|
"grad_norm": 1.049357533454895, |
|
"learning_rate": 0.0005, |
|
"loss": 1.9775, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 1.303145170211792, |
|
"learning_rate": 0.0005, |
|
"loss": 1.9412, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.2777777777777778, |
|
"grad_norm": 1.0213723182678223, |
|
"learning_rate": 0.0005, |
|
"loss": 1.9166, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.2777777777777778, |
|
"eval_bleu": 15.8119, |
|
"eval_gen_len": 32.097, |
|
"eval_loss": 2.31050968170166, |
|
"eval_runtime": 410.6001, |
|
"eval_samples_per_second": 7.306, |
|
"eval_steps_per_second": 0.913, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.3055555555555556, |
|
"grad_norm": 1.2851905822753906, |
|
"learning_rate": 0.0005, |
|
"loss": 1.8878, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 0.3333333333333333, |
|
"grad_norm": 0.8447160720825195, |
|
"learning_rate": 0.0005, |
|
"loss": 1.8492, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 0.3611111111111111, |
|
"grad_norm": 1.1516064405441284, |
|
"learning_rate": 0.0005, |
|
"loss": 1.8309, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 0.3888888888888889, |
|
"grad_norm": 1.0370670557022095, |
|
"learning_rate": 0.0005, |
|
"loss": 1.8057, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 0.4166666666666667, |
|
"grad_norm": 1.1649495363235474, |
|
"learning_rate": 0.0005, |
|
"loss": 1.7867, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 0.4444444444444444, |
|
"grad_norm": 1.2666045427322388, |
|
"learning_rate": 0.0005, |
|
"loss": 1.7679, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 0.4722222222222222, |
|
"grad_norm": 1.0923264026641846, |
|
"learning_rate": 0.0005, |
|
"loss": 1.7563, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 1.560994029045105, |
|
"learning_rate": 0.0005, |
|
"loss": 1.7342, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 0.5277777777777778, |
|
"grad_norm": 0.9684827327728271, |
|
"learning_rate": 0.0005, |
|
"loss": 1.7228, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 0.5555555555555556, |
|
"grad_norm": 0.9182453751564026, |
|
"learning_rate": 0.0005, |
|
"loss": 1.7184, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 0.5555555555555556, |
|
"eval_bleu": 17.5903, |
|
"eval_gen_len": 31.1153, |
|
"eval_loss": 2.19934344291687, |
|
"eval_runtime": 393.3017, |
|
"eval_samples_per_second": 7.628, |
|
"eval_steps_per_second": 0.953, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 0.5833333333333334, |
|
"grad_norm": 0.8953577280044556, |
|
"learning_rate": 0.0005, |
|
"loss": 1.7042, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 0.6111111111111112, |
|
"grad_norm": 0.9418250918388367, |
|
"learning_rate": 0.0005, |
|
"loss": 1.683, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 0.6388888888888888, |
|
"grad_norm": 0.8577601909637451, |
|
"learning_rate": 0.0005, |
|
"loss": 1.6799, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 0.6666666666666666, |
|
"grad_norm": 0.9786076545715332, |
|
"learning_rate": 0.0005, |
|
"loss": 1.6675, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 0.6944444444444444, |
|
"grad_norm": 0.9262654781341553, |
|
"learning_rate": 0.0005, |
|
"loss": 1.6499, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 0.7222222222222222, |
|
"grad_norm": 0.8759564757347107, |
|
"learning_rate": 0.0005, |
|
"loss": 1.6468, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 1.0495752096176147, |
|
"learning_rate": 0.0005, |
|
"loss": 1.6285, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 0.7777777777777778, |
|
"grad_norm": 1.092642068862915, |
|
"learning_rate": 0.0005, |
|
"loss": 1.6276, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 0.8055555555555556, |
|
"grad_norm": 0.8775661587715149, |
|
"learning_rate": 0.0005, |
|
"loss": 1.6172, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 0.8333333333333334, |
|
"grad_norm": 0.8970679044723511, |
|
"learning_rate": 0.0005, |
|
"loss": 1.6061, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 0.8333333333333334, |
|
"eval_bleu": 18.9604, |
|
"eval_gen_len": 30.327, |
|
"eval_loss": 2.1379551887512207, |
|
"eval_runtime": 380.095, |
|
"eval_samples_per_second": 7.893, |
|
"eval_steps_per_second": 0.987, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 0.8611111111111112, |
|
"grad_norm": 0.9657310247421265, |
|
"learning_rate": 0.0005, |
|
"loss": 1.5959, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 0.8888888888888888, |
|
"grad_norm": 0.8748376369476318, |
|
"learning_rate": 0.0005, |
|
"loss": 1.5908, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 0.9166666666666666, |
|
"grad_norm": 0.8462302088737488, |
|
"learning_rate": 0.0005, |
|
"loss": 1.5845, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 0.9444444444444444, |
|
"grad_norm": 0.9005241394042969, |
|
"learning_rate": 0.0005, |
|
"loss": 1.5699, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 0.9722222222222222, |
|
"grad_norm": 0.9596630930900574, |
|
"learning_rate": 0.0005, |
|
"loss": 1.5752, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.8307533860206604, |
|
"learning_rate": 0.0005, |
|
"loss": 1.5634, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 1.0277777777777777, |
|
"grad_norm": 0.9918788075447083, |
|
"learning_rate": 0.0005, |
|
"loss": 1.5117, |
|
"step": 37000 |
|
}, |
|
{ |
|
"epoch": 1.0555555555555556, |
|
"grad_norm": 0.9118058085441589, |
|
"learning_rate": 0.0005, |
|
"loss": 1.5023, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 1.0833333333333333, |
|
"grad_norm": 0.7213552594184875, |
|
"learning_rate": 0.0005, |
|
"loss": 1.5087, |
|
"step": 39000 |
|
}, |
|
{ |
|
"epoch": 1.1111111111111112, |
|
"grad_norm": 1.0255305767059326, |
|
"learning_rate": 0.0005, |
|
"loss": 1.516, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 1.1111111111111112, |
|
"eval_bleu": 19.1444, |
|
"eval_gen_len": 30.2727, |
|
"eval_loss": 2.1365692615509033, |
|
"eval_runtime": 377.1737, |
|
"eval_samples_per_second": 7.954, |
|
"eval_steps_per_second": 0.994, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 1.1388888888888888, |
|
"grad_norm": 0.8766499161720276, |
|
"learning_rate": 0.0005, |
|
"loss": 1.5096, |
|
"step": 41000 |
|
}, |
|
{ |
|
"epoch": 1.1666666666666667, |
|
"grad_norm": 1.1786612272262573, |
|
"learning_rate": 0.0005, |
|
"loss": 1.4982, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 1.1944444444444444, |
|
"grad_norm": 1.011268973350525, |
|
"learning_rate": 0.0005, |
|
"loss": 1.5013, |
|
"step": 43000 |
|
}, |
|
{ |
|
"epoch": 1.2222222222222223, |
|
"grad_norm": 1.0863969326019287, |
|
"learning_rate": 0.0005, |
|
"loss": 1.4878, |
|
"step": 44000 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"grad_norm": 0.9729832410812378, |
|
"learning_rate": 0.0005, |
|
"loss": 1.4922, |
|
"step": 45000 |
|
}, |
|
{ |
|
"epoch": 1.2777777777777777, |
|
"grad_norm": 1.3476896286010742, |
|
"learning_rate": 0.0005, |
|
"loss": 1.4876, |
|
"step": 46000 |
|
}, |
|
{ |
|
"epoch": 1.3055555555555556, |
|
"grad_norm": 0.8493963479995728, |
|
"learning_rate": 0.0005, |
|
"loss": 1.4823, |
|
"step": 47000 |
|
}, |
|
{ |
|
"epoch": 1.3333333333333333, |
|
"grad_norm": 1.0311123132705688, |
|
"learning_rate": 0.0005, |
|
"loss": 1.4739, |
|
"step": 48000 |
|
}, |
|
{ |
|
"epoch": 1.3611111111111112, |
|
"grad_norm": 1.259581446647644, |
|
"learning_rate": 0.0005, |
|
"loss": 1.4747, |
|
"step": 49000 |
|
}, |
|
{ |
|
"epoch": 1.3888888888888888, |
|
"grad_norm": 1.1934195756912231, |
|
"learning_rate": 0.0005, |
|
"loss": 1.4675, |
|
"step": 50000 |
|
}, |
|
{ |
|
"epoch": 1.3888888888888888, |
|
"eval_bleu": 19.7588, |
|
"eval_gen_len": 30.1127, |
|
"eval_loss": 2.120835781097412, |
|
"eval_runtime": 372.4281, |
|
"eval_samples_per_second": 8.055, |
|
"eval_steps_per_second": 1.007, |
|
"step": 50000 |
|
}, |
|
{ |
|
"epoch": 1.4166666666666667, |
|
"grad_norm": 1.1824595928192139, |
|
"learning_rate": 0.0005, |
|
"loss": 1.4659, |
|
"step": 51000 |
|
}, |
|
{ |
|
"epoch": 1.4444444444444444, |
|
"grad_norm": 1.1661032438278198, |
|
"learning_rate": 0.0005, |
|
"loss": 1.4737, |
|
"step": 52000 |
|
}, |
|
{ |
|
"epoch": 1.4722222222222223, |
|
"grad_norm": 0.7856634259223938, |
|
"learning_rate": 0.0005, |
|
"loss": 1.4595, |
|
"step": 53000 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 0.9908609986305237, |
|
"learning_rate": 0.0005, |
|
"loss": 1.4656, |
|
"step": 54000 |
|
}, |
|
{ |
|
"epoch": 1.5277777777777777, |
|
"grad_norm": 0.9270644187927246, |
|
"learning_rate": 0.0005, |
|
"loss": 1.4524, |
|
"step": 55000 |
|
}, |
|
{ |
|
"epoch": 1.5555555555555556, |
|
"grad_norm": 0.9910904169082642, |
|
"learning_rate": 0.0005, |
|
"loss": 1.4453, |
|
"step": 56000 |
|
}, |
|
{ |
|
"epoch": 1.5833333333333335, |
|
"grad_norm": 1.0300639867782593, |
|
"learning_rate": 0.0005, |
|
"loss": 1.451, |
|
"step": 57000 |
|
}, |
|
{ |
|
"epoch": 1.6111111111111112, |
|
"grad_norm": 0.809105396270752, |
|
"learning_rate": 0.0005, |
|
"loss": 1.444, |
|
"step": 58000 |
|
}, |
|
{ |
|
"epoch": 1.6388888888888888, |
|
"grad_norm": 0.7915866374969482, |
|
"learning_rate": 0.0005, |
|
"loss": 1.4421, |
|
"step": 59000 |
|
}, |
|
{ |
|
"epoch": 1.6666666666666665, |
|
"grad_norm": 0.9778928756713867, |
|
"learning_rate": 0.0005, |
|
"loss": 1.4416, |
|
"step": 60000 |
|
}, |
|
{ |
|
"epoch": 1.6666666666666665, |
|
"eval_bleu": 19.9263, |
|
"eval_gen_len": 30.4463, |
|
"eval_loss": 2.088862657546997, |
|
"eval_runtime": 383.2772, |
|
"eval_samples_per_second": 7.827, |
|
"eval_steps_per_second": 0.978, |
|
"step": 60000 |
|
}, |
|
{ |
|
"epoch": 1.6944444444444444, |
|
"grad_norm": 0.8484209775924683, |
|
"learning_rate": 0.0005, |
|
"loss": 1.4313, |
|
"step": 61000 |
|
}, |
|
{ |
|
"epoch": 1.7222222222222223, |
|
"grad_norm": 0.8703031539916992, |
|
"learning_rate": 0.0005, |
|
"loss": 1.4405, |
|
"step": 62000 |
|
}, |
|
{ |
|
"epoch": 1.75, |
|
"grad_norm": 1.4096006155014038, |
|
"learning_rate": 0.0005, |
|
"loss": 1.4375, |
|
"step": 63000 |
|
}, |
|
{ |
|
"epoch": 1.7777777777777777, |
|
"grad_norm": 0.9177774786949158, |
|
"learning_rate": 0.0005, |
|
"loss": 1.4262, |
|
"step": 64000 |
|
}, |
|
{ |
|
"epoch": 1.8055555555555556, |
|
"grad_norm": 1.2332441806793213, |
|
"learning_rate": 0.0005, |
|
"loss": 1.4233, |
|
"step": 65000 |
|
}, |
|
{ |
|
"epoch": 1.8333333333333335, |
|
"grad_norm": 0.8750177621841431, |
|
"learning_rate": 0.0005, |
|
"loss": 1.4287, |
|
"step": 66000 |
|
}, |
|
{ |
|
"epoch": 1.8611111111111112, |
|
"grad_norm": 0.6736052632331848, |
|
"learning_rate": 0.0005, |
|
"loss": 1.4231, |
|
"step": 67000 |
|
}, |
|
{ |
|
"epoch": 1.8888888888888888, |
|
"grad_norm": 0.7802408933639526, |
|
"learning_rate": 0.0005, |
|
"loss": 1.4106, |
|
"step": 68000 |
|
}, |
|
{ |
|
"epoch": 1.9166666666666665, |
|
"grad_norm": 1.1860034465789795, |
|
"learning_rate": 0.0005, |
|
"loss": 1.4121, |
|
"step": 69000 |
|
}, |
|
{ |
|
"epoch": 1.9444444444444444, |
|
"grad_norm": 0.926054835319519, |
|
"learning_rate": 0.0005, |
|
"loss": 1.4111, |
|
"step": 70000 |
|
}, |
|
{ |
|
"epoch": 1.9444444444444444, |
|
"eval_bleu": 20.3323, |
|
"eval_gen_len": 30.1207, |
|
"eval_loss": 2.079472541809082, |
|
"eval_runtime": 371.9755, |
|
"eval_samples_per_second": 8.065, |
|
"eval_steps_per_second": 1.008, |
|
"step": 70000 |
|
}, |
|
{ |
|
"epoch": 1.9722222222222223, |
|
"grad_norm": 1.1691533327102661, |
|
"learning_rate": 0.0005, |
|
"loss": 1.407, |
|
"step": 71000 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 0.9077666997909546, |
|
"learning_rate": 0.0005, |
|
"loss": 1.4051, |
|
"step": 72000 |
|
}, |
|
{ |
|
"epoch": 2.0277777777777777, |
|
"grad_norm": 0.9149623513221741, |
|
"learning_rate": 0.0005, |
|
"loss": 1.3517, |
|
"step": 73000 |
|
}, |
|
{ |
|
"epoch": 2.0555555555555554, |
|
"grad_norm": 1.0772947072982788, |
|
"learning_rate": 0.0005, |
|
"loss": 1.3624, |
|
"step": 74000 |
|
}, |
|
{ |
|
"epoch": 2.0833333333333335, |
|
"grad_norm": 0.7283540964126587, |
|
"learning_rate": 0.0005, |
|
"loss": 1.355, |
|
"step": 75000 |
|
}, |
|
{ |
|
"epoch": 2.111111111111111, |
|
"grad_norm": 0.7279065847396851, |
|
"learning_rate": 0.0005, |
|
"loss": 1.3526, |
|
"step": 76000 |
|
}, |
|
{ |
|
"epoch": 2.138888888888889, |
|
"grad_norm": 1.2707905769348145, |
|
"learning_rate": 0.0005, |
|
"loss": 1.3535, |
|
"step": 77000 |
|
}, |
|
{ |
|
"epoch": 2.1666666666666665, |
|
"grad_norm": 0.9000493288040161, |
|
"learning_rate": 0.0005, |
|
"loss": 1.3519, |
|
"step": 78000 |
|
}, |
|
{ |
|
"epoch": 2.1944444444444446, |
|
"grad_norm": 1.043967843055725, |
|
"learning_rate": 0.0005, |
|
"loss": 1.3567, |
|
"step": 79000 |
|
}, |
|
{ |
|
"epoch": 2.2222222222222223, |
|
"grad_norm": 1.1248853206634521, |
|
"learning_rate": 0.0005, |
|
"loss": 1.3603, |
|
"step": 80000 |
|
}, |
|
{ |
|
"epoch": 2.2222222222222223, |
|
"eval_bleu": 20.5373, |
|
"eval_gen_len": 30.5943, |
|
"eval_loss": 2.085047960281372, |
|
"eval_runtime": 373.0705, |
|
"eval_samples_per_second": 8.041, |
|
"eval_steps_per_second": 1.005, |
|
"step": 80000 |
|
}, |
|
{ |
|
"epoch": 2.25, |
|
"grad_norm": 1.056221842765808, |
|
"learning_rate": 0.0005, |
|
"loss": 1.3657, |
|
"step": 81000 |
|
}, |
|
{ |
|
"epoch": 2.2777777777777777, |
|
"grad_norm": 0.9176587462425232, |
|
"learning_rate": 0.0005, |
|
"loss": 1.3572, |
|
"step": 82000 |
|
}, |
|
{ |
|
"epoch": 2.3055555555555554, |
|
"grad_norm": 1.0105085372924805, |
|
"learning_rate": 0.0005, |
|
"loss": 1.3498, |
|
"step": 83000 |
|
}, |
|
{ |
|
"epoch": 2.3333333333333335, |
|
"grad_norm": 1.1589380502700806, |
|
"learning_rate": 0.0005, |
|
"loss": 1.3567, |
|
"step": 84000 |
|
}, |
|
{ |
|
"epoch": 2.361111111111111, |
|
"grad_norm": 0.7733587622642517, |
|
"learning_rate": 0.0005, |
|
"loss": 1.3533, |
|
"step": 85000 |
|
}, |
|
{ |
|
"epoch": 2.388888888888889, |
|
"grad_norm": 1.036777138710022, |
|
"learning_rate": 0.0005, |
|
"loss": 1.3469, |
|
"step": 86000 |
|
}, |
|
{ |
|
"epoch": 2.4166666666666665, |
|
"grad_norm": 1.4935026168823242, |
|
"learning_rate": 0.0005, |
|
"loss": 1.3469, |
|
"step": 87000 |
|
}, |
|
{ |
|
"epoch": 2.4444444444444446, |
|
"grad_norm": 0.864630937576294, |
|
"learning_rate": 0.0005, |
|
"loss": 1.3506, |
|
"step": 88000 |
|
}, |
|
{ |
|
"epoch": 2.4722222222222223, |
|
"grad_norm": 0.8495751619338989, |
|
"learning_rate": 0.0005, |
|
"loss": 1.3408, |
|
"step": 89000 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"grad_norm": 1.0840762853622437, |
|
"learning_rate": 0.0005, |
|
"loss": 1.3378, |
|
"step": 90000 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"eval_bleu": 20.7584, |
|
"eval_gen_len": 30.499, |
|
"eval_loss": 2.0603742599487305, |
|
"eval_runtime": 368.0992, |
|
"eval_samples_per_second": 8.15, |
|
"eval_steps_per_second": 1.019, |
|
"step": 90000 |
|
}, |
|
{ |
|
"epoch": 2.5277777777777777, |
|
"grad_norm": 0.7769622802734375, |
|
"learning_rate": 0.0005, |
|
"loss": 1.3409, |
|
"step": 91000 |
|
}, |
|
{ |
|
"epoch": 2.5555555555555554, |
|
"grad_norm": 1.049972414970398, |
|
"learning_rate": 0.0005, |
|
"loss": 1.3443, |
|
"step": 92000 |
|
}, |
|
{ |
|
"epoch": 2.5833333333333335, |
|
"grad_norm": 0.965621292591095, |
|
"learning_rate": 0.0005, |
|
"loss": 1.342, |
|
"step": 93000 |
|
}, |
|
{ |
|
"epoch": 2.611111111111111, |
|
"grad_norm": 0.8234182000160217, |
|
"learning_rate": 0.0005, |
|
"loss": 1.3297, |
|
"step": 94000 |
|
}, |
|
{ |
|
"epoch": 2.638888888888889, |
|
"grad_norm": 0.9464855790138245, |
|
"learning_rate": 0.0005, |
|
"loss": 1.3345, |
|
"step": 95000 |
|
}, |
|
{ |
|
"epoch": 2.6666666666666665, |
|
"grad_norm": 0.987382709980011, |
|
"learning_rate": 0.0005, |
|
"loss": 1.3284, |
|
"step": 96000 |
|
}, |
|
{ |
|
"epoch": 2.6944444444444446, |
|
"grad_norm": 0.6439863443374634, |
|
"learning_rate": 0.0005, |
|
"loss": 1.3285, |
|
"step": 97000 |
|
}, |
|
{ |
|
"epoch": 2.7222222222222223, |
|
"grad_norm": 0.8853390216827393, |
|
"learning_rate": 0.0005, |
|
"loss": 1.3339, |
|
"step": 98000 |
|
}, |
|
{ |
|
"epoch": 2.75, |
|
"grad_norm": 0.7582658529281616, |
|
"learning_rate": 0.0005, |
|
"loss": 1.3281, |
|
"step": 99000 |
|
}, |
|
{ |
|
"epoch": 2.7777777777777777, |
|
"grad_norm": 0.9061763882637024, |
|
"learning_rate": 0.0005, |
|
"loss": 1.3381, |
|
"step": 100000 |
|
}, |
|
{ |
|
"epoch": 2.7777777777777777, |
|
"eval_bleu": 20.6113, |
|
"eval_gen_len": 30.701, |
|
"eval_loss": 2.059664726257324, |
|
"eval_runtime": 371.2241, |
|
"eval_samples_per_second": 8.081, |
|
"eval_steps_per_second": 1.01, |
|
"step": 100000 |
|
}, |
|
{ |
|
"epoch": 2.7777777777777777, |
|
"step": 100000, |
|
"total_flos": 1.4240580791795712e+17, |
|
"train_loss": 0.5475473999023438, |
|
"train_runtime": 14821.2356, |
|
"train_samples_per_second": 107.953, |
|
"train_steps_per_second": 6.747 |
|
} |
|
], |
|
"logging_steps": 1000, |
|
"max_steps": 100000, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 10000, |
|
"total_flos": 1.4240580791795712e+17, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|