{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 10.0,
  "eval_steps": 500,
  "global_step": 112500,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    { "epoch": 0.044444444444444446, "grad_norm": 2.7033166885375977, "learning_rate": 1.9911644444444447e-05, "loss": 2.2268, "step": 500 },
    { "epoch": 0.08888888888888889, "grad_norm": 2.958282232284546, "learning_rate": 1.9822755555555557e-05, "loss": 2.0732, "step": 1000 },
    { "epoch": 0.13333333333333333, "grad_norm": 3.01822829246521, "learning_rate": 1.9733866666666668e-05, "loss": 2.0296, "step": 1500 },
    { "epoch": 0.17777777777777778, "grad_norm": 2.8041861057281494, "learning_rate": 1.9644977777777778e-05, "loss": 2.0176, "step": 2000 },
    { "epoch": 0.2222222222222222, "grad_norm": 2.194178342819214, "learning_rate": 1.9556088888888892e-05, "loss": 2.0181, "step": 2500 },
    { "epoch": 0.26666666666666666, "grad_norm": 2.7587132453918457, "learning_rate": 1.9467200000000002e-05, "loss": 1.9864, "step": 3000 },
    { "epoch": 0.3111111111111111, "grad_norm": 2.8509016036987305, "learning_rate": 1.9378311111111113e-05, "loss": 1.9701, "step": 3500 },
    { "epoch": 0.35555555555555557, "grad_norm": 2.180856704711914, "learning_rate": 1.9289422222222223e-05, "loss": 1.936, "step": 4000 },
    { "epoch": 0.4, "grad_norm": 2.2034873962402344, "learning_rate": 1.9200533333333337e-05, "loss": 1.9455, "step": 4500 },
    { "epoch": 0.4444444444444444, "grad_norm": 3.1408488750457764, "learning_rate": 1.9111644444444447e-05, "loss": 1.9438, "step": 5000 },
    { "epoch": 0.4888888888888889, "grad_norm": 3.070190191268921, "learning_rate": 1.9022755555555558e-05, "loss": 1.9491, "step": 5500 },
    { "epoch": 0.5333333333333333, "grad_norm": 2.0312423706054688, "learning_rate": 1.893386666666667e-05, "loss": 1.9236, "step": 6000 },
    { "epoch": 0.5777777777777777, "grad_norm": 2.183950424194336, "learning_rate": 1.884497777777778e-05, "loss": 1.9111, "step": 6500 },
    { "epoch": 0.6222222222222222, "grad_norm": 2.13474702835083, "learning_rate": 1.875608888888889e-05, "loss": 1.9158, "step": 7000 },
    { "epoch": 0.6666666666666666, "grad_norm": 2.055859327316284, "learning_rate": 1.866737777777778e-05, "loss": 1.8952, "step": 7500 },
    { "epoch": 0.7111111111111111, "grad_norm": 2.280942916870117, "learning_rate": 1.857848888888889e-05, "loss": 1.9023, "step": 8000 },
    { "epoch": 0.7555555555555555, "grad_norm": 3.20082426071167, "learning_rate": 1.84896e-05, "loss": 1.8711, "step": 8500 },
    { "epoch": 0.8, "grad_norm": 2.5550222396850586, "learning_rate": 1.8400711111111114e-05, "loss": 1.8865, "step": 9000 },
    { "epoch": 0.8444444444444444, "grad_norm": 3.101032018661499, "learning_rate": 1.8311822222222224e-05, "loss": 1.8942, "step": 9500 },
    { "epoch": 0.8888888888888888, "grad_norm": 1.9913034439086914, "learning_rate": 1.8222933333333335e-05, "loss": 1.8823, "step": 10000 },
    { "epoch": 0.9333333333333333, "grad_norm": 1.8786649703979492, "learning_rate": 1.8134044444444445e-05, "loss": 1.8903, "step": 10500 },
    { "epoch": 0.9777777777777777, "grad_norm": 2.1240804195404053, "learning_rate": 1.804515555555556e-05, "loss": 1.8759, "step": 11000 },
    { "epoch": 1.0, "eval_Sacrebleu": 10.927683654987915, "eval_loss": 1.7730144262313843, "eval_runtime": 5155.9444, "eval_samples_per_second": 1.94, "eval_steps_per_second": 0.242, "step": 11250 },
    { "epoch": 1.0222222222222221, "grad_norm": 3.4717421531677246, "learning_rate": 1.795626666666667e-05, "loss": 1.809, "step": 11500 },
    { "epoch": 1.0666666666666667, "grad_norm": 2.8455264568328857, "learning_rate": 1.786755555555556e-05, "loss": 1.7839, "step": 12000 },
    { "epoch": 1.1111111111111112, "grad_norm": 2.3160150051116943, "learning_rate": 1.7778844444444446e-05, "loss": 1.7898, "step": 12500 },
    { "epoch": 1.1555555555555554, "grad_norm": 2.7126030921936035, "learning_rate": 1.768995555555556e-05, "loss": 1.7653, "step": 13000 },
    { "epoch": 1.2, "grad_norm": 2.833963394165039, "learning_rate": 1.760106666666667e-05, "loss": 1.7937, "step": 13500 },
    { "epoch": 1.2444444444444445, "grad_norm": 2.582491159439087, "learning_rate": 1.751217777777778e-05, "loss": 1.7672, "step": 14000 },
    { "epoch": 1.2888888888888888, "grad_norm": 2.6020431518554688, "learning_rate": 1.742346666666667e-05, "loss": 1.7788, "step": 14500 },
    { "epoch": 1.3333333333333333, "grad_norm": 2.6280031204223633, "learning_rate": 1.733457777777778e-05, "loss": 1.7617, "step": 15000 },
    { "epoch": 1.3777777777777778, "grad_norm": 2.258315324783325, "learning_rate": 1.724568888888889e-05, "loss": 1.7603, "step": 15500 },
    { "epoch": 1.4222222222222223, "grad_norm": 2.546867847442627, "learning_rate": 1.71568e-05, "loss": 1.7639, "step": 16000 },
    { "epoch": 1.4666666666666668, "grad_norm": 2.596524715423584, "learning_rate": 1.7067911111111112e-05, "loss": 1.7787, "step": 16500 },
    { "epoch": 1.511111111111111, "grad_norm": 2.728980541229248, "learning_rate": 1.6979022222222222e-05, "loss": 1.7728, "step": 17000 },
    { "epoch": 1.5555555555555556, "grad_norm": 1.9743603467941284, "learning_rate": 1.6890133333333333e-05, "loss": 1.7436, "step": 17500 },
    { "epoch": 1.6, "grad_norm": 2.306102991104126, "learning_rate": 1.6801422222222223e-05, "loss": 1.7593, "step": 18000 },
    { "epoch": 1.6444444444444444, "grad_norm": 2.595468282699585, "learning_rate": 1.6712533333333333e-05, "loss": 1.7587, "step": 18500 },
    { "epoch": 1.6888888888888889, "grad_norm": 1.8225383758544922, "learning_rate": 1.6623644444444447e-05, "loss": 1.7852, "step": 19000 },
    { "epoch": 1.7333333333333334, "grad_norm": 2.581843852996826, "learning_rate": 1.6534755555555557e-05, "loss": 1.7582, "step": 19500 },
    { "epoch": 1.7777777777777777, "grad_norm": 1.9384685754776, "learning_rate": 1.6445866666666668e-05, "loss": 1.7544, "step": 20000 },
    { "epoch": 1.8222222222222222, "grad_norm": 2.542980670928955, "learning_rate": 1.6356977777777778e-05, "loss": 1.7592, "step": 20500 },
    { "epoch": 1.8666666666666667, "grad_norm": 2.4207804203033447, "learning_rate": 1.6268088888888892e-05, "loss": 1.7685, "step": 21000 },
    { "epoch": 1.911111111111111, "grad_norm": 2.7720110416412354, "learning_rate": 1.617937777777778e-05, "loss": 1.7654, "step": 21500 },
    { "epoch": 1.9555555555555557, "grad_norm": 2.8182454109191895, "learning_rate": 1.6090488888888892e-05, "loss": 1.7577, "step": 22000 },
    { "epoch": 2.0, "grad_norm": 2.3921427726745605, "learning_rate": 1.6001600000000003e-05, "loss": 1.7666, "step": 22500 },
    { "epoch": 2.0, "eval_Sacrebleu": 11.504795316238502, "eval_loss": 1.7359962463378906, "eval_runtime": 4723.0089, "eval_samples_per_second": 2.117, "eval_steps_per_second": 0.265, "step": 22500 },
    { "epoch": 2.0444444444444443, "grad_norm": 2.5132415294647217, "learning_rate": 1.5912711111111113e-05, "loss": 1.6699, "step": 23000 },
    { "epoch": 2.088888888888889, "grad_norm": 2.244852066040039, "learning_rate": 1.5823822222222224e-05, "loss": 1.6636, "step": 23500 },
    { "epoch": 2.1333333333333333, "grad_norm": 2.456984281539917, "learning_rate": 1.5734933333333334e-05, "loss": 1.6812, "step": 24000 },
    { "epoch": 2.1777777777777776, "grad_norm": 2.611400842666626, "learning_rate": 1.5646044444444445e-05, "loss": 1.6813, "step": 24500 },
    { "epoch": 2.2222222222222223, "grad_norm": 2.504777431488037, "learning_rate": 1.5557155555555555e-05, "loss": 1.6698, "step": 25000 },
    { "epoch": 2.2666666666666666, "grad_norm": 1.9886395931243896, "learning_rate": 1.5468444444444445e-05, "loss": 1.6649, "step": 25500 },
    { "epoch": 2.311111111111111, "grad_norm": 2.4218804836273193, "learning_rate": 1.5379555555555555e-05, "loss": 1.6827, "step": 26000 },
    { "epoch": 2.3555555555555556, "grad_norm": 2.380363941192627, "learning_rate": 1.5290666666666666e-05, "loss": 1.6821, "step": 26500 },
    { "epoch": 2.4, "grad_norm": 2.9390811920166016, "learning_rate": 1.520177777777778e-05, "loss": 1.6517, "step": 27000 },
    { "epoch": 2.4444444444444446, "grad_norm": 2.278156042098999, "learning_rate": 1.511288888888889e-05, "loss": 1.6551, "step": 27500 },
    { "epoch": 2.488888888888889, "grad_norm": 2.5583651065826416, "learning_rate": 1.5024e-05, "loss": 1.6386, "step": 28000 },
    { "epoch": 2.533333333333333, "grad_norm": 2.4227840900421143, "learning_rate": 1.493528888888889e-05, "loss": 1.6808, "step": 28500 },
    { "epoch": 2.5777777777777775, "grad_norm": 2.4636645317077637, "learning_rate": 1.48464e-05, "loss": 1.6675, "step": 29000 },
    { "epoch": 2.6222222222222222, "grad_norm": 1.932691216468811, "learning_rate": 1.4757511111111111e-05, "loss": 1.6649, "step": 29500 },
    { "epoch": 2.6666666666666665, "grad_norm": 2.324248790740967, "learning_rate": 1.4668622222222223e-05, "loss": 1.641, "step": 30000 },
    { "epoch": 2.7111111111111112, "grad_norm": 2.5001227855682373, "learning_rate": 1.4579733333333335e-05, "loss": 1.6593, "step": 30500 },
    { "epoch": 2.7555555555555555, "grad_norm": 2.053122043609619, "learning_rate": 1.4490844444444446e-05, "loss": 1.6679, "step": 31000 },
    { "epoch": 2.8, "grad_norm": 2.3404734134674072, "learning_rate": 1.4401955555555556e-05, "loss": 1.6705, "step": 31500 },
    { "epoch": 2.8444444444444446, "grad_norm": 1.811047911643982, "learning_rate": 1.4313066666666669e-05, "loss": 1.6573, "step": 32000 },
    { "epoch": 2.888888888888889, "grad_norm": 2.5269010066986084, "learning_rate": 1.4224355555555555e-05, "loss": 1.6659, "step": 32500 },
    { "epoch": 2.9333333333333336, "grad_norm": 3.2324156761169434, "learning_rate": 1.4135466666666669e-05, "loss": 1.6509, "step": 33000 },
    { "epoch": 2.977777777777778, "grad_norm": 2.910116195678711, "learning_rate": 1.404657777777778e-05, "loss": 1.6799, "step": 33500 },
    { "epoch": 3.0, "eval_Sacrebleu": 11.461615098924428, "eval_loss": 1.7215642929077148, "eval_runtime": 4737.4626, "eval_samples_per_second": 2.111, "eval_steps_per_second": 0.264, "step": 33750 },
    { "epoch": 3.022222222222222, "grad_norm": 2.388887643814087, "learning_rate": 1.395768888888889e-05, "loss": 1.6205, "step": 34000 },
    { "epoch": 3.066666666666667, "grad_norm": 2.422229290008545, "learning_rate": 1.38688e-05, "loss": 1.5935, "step": 34500 },
    { "epoch": 3.111111111111111, "grad_norm": 2.689824104309082, "learning_rate": 1.378008888888889e-05, "loss": 1.5677, "step": 35000 },
    { "epoch": 3.1555555555555554, "grad_norm": 2.5040597915649414, "learning_rate": 1.369137777777778e-05, "loss": 1.5982, "step": 35500 },
    { "epoch": 3.2, "grad_norm": 2.0994420051574707, "learning_rate": 1.360248888888889e-05, "loss": 1.6006, "step": 36000 },
    { "epoch": 3.2444444444444445, "grad_norm": 2.4700512886047363, "learning_rate": 1.35136e-05, "loss": 1.594, "step": 36500 },
    { "epoch": 3.2888888888888888, "grad_norm": 2.679499864578247, "learning_rate": 1.3424711111111113e-05, "loss": 1.5901, "step": 37000 },
    { "epoch": 3.3333333333333335, "grad_norm": 2.6343369483947754, "learning_rate": 1.3335822222222223e-05, "loss": 1.5875, "step": 37500 },
    { "epoch": 3.3777777777777778, "grad_norm": 2.5335209369659424, "learning_rate": 1.3247111111111113e-05, "loss": 1.6139, "step": 38000 },
    { "epoch": 3.422222222222222, "grad_norm": 2.9107964038848877, "learning_rate": 1.3158222222222223e-05, "loss": 1.6033, "step": 38500 },
    { "epoch": 3.466666666666667, "grad_norm": 2.534843921661377, "learning_rate": 1.3069333333333334e-05, "loss": 1.584, "step": 39000 },
    { "epoch": 3.511111111111111, "grad_norm": 2.409266710281372, "learning_rate": 1.2980444444444444e-05, "loss": 1.5854, "step": 39500 },
    { "epoch": 3.5555555555555554, "grad_norm": 2.509253740310669, "learning_rate": 1.2891555555555556e-05, "loss": 1.5809, "step": 40000 },
    { "epoch": 3.6, "grad_norm": 1.9488080739974976, "learning_rate": 1.2802666666666667e-05, "loss": 1.5889, "step": 40500 },
    { "epoch": 3.6444444444444444, "grad_norm": 2.8996152877807617, "learning_rate": 1.2713777777777779e-05, "loss": 1.5807, "step": 41000 },
    { "epoch": 3.688888888888889, "grad_norm": 2.444199800491333, "learning_rate": 1.2624888888888891e-05, "loss": 1.6047, "step": 41500 },
    { "epoch": 3.7333333333333334, "grad_norm": 2.627521276473999, "learning_rate": 1.2536000000000002e-05, "loss": 1.582, "step": 42000 },
    { "epoch": 3.7777777777777777, "grad_norm": 2.4339983463287354, "learning_rate": 1.2447288888888891e-05, "loss": 1.5887, "step": 42500 },
    { "epoch": 3.822222222222222, "grad_norm": 2.3496522903442383, "learning_rate": 1.2358577777777778e-05, "loss": 1.5878, "step": 43000 },
    { "epoch": 3.8666666666666667, "grad_norm": 2.325495719909668, "learning_rate": 1.2269688888888892e-05, "loss": 1.5826, "step": 43500 },
    { "epoch": 3.911111111111111, "grad_norm": 2.1297905445098877, "learning_rate": 1.2180800000000002e-05, "loss": 1.6244, "step": 44000 },
    { "epoch": 3.9555555555555557, "grad_norm": 2.5512731075286865, "learning_rate": 1.2091911111111112e-05, "loss": 1.6075, "step": 44500 },
    { "epoch": 4.0, "grad_norm": 1.9816246032714844, "learning_rate": 1.2003200000000002e-05, "loss": 1.5647, "step": 45000 },
    { "epoch": 4.0, "eval_Sacrebleu": 11.693143227743814, "eval_loss": 1.7151726484298706, "eval_runtime": 4558.3443, "eval_samples_per_second": 2.194, "eval_steps_per_second": 0.274, "step": 45000 },
    { "epoch": 4.044444444444444, "grad_norm": 2.9890973567962646, "learning_rate": 1.1914311111111113e-05, "loss": 1.5201, "step": 45500 },
    { "epoch": 4.088888888888889, "grad_norm": 2.128161668777466, "learning_rate": 1.1825422222222223e-05, "loss": 1.5305, "step": 46000 },
    { "epoch": 4.133333333333334, "grad_norm": 2.3265063762664795, "learning_rate": 1.1736533333333335e-05, "loss": 1.5326, "step": 46500 },
    { "epoch": 4.177777777777778, "grad_norm": 2.2641632556915283, "learning_rate": 1.1647644444444446e-05, "loss": 1.5413, "step": 47000 },
    { "epoch": 4.222222222222222, "grad_norm": 2.4255621433258057, "learning_rate": 1.1558755555555556e-05, "loss": 1.5391, "step": 47500 },
    { "epoch": 4.266666666666667, "grad_norm": 2.454061269760132, "learning_rate": 1.1469866666666667e-05, "loss": 1.5202, "step": 48000 },
    { "epoch": 4.311111111111111, "grad_norm": 2.373842477798462, "learning_rate": 1.138097777777778e-05, "loss": 1.5369, "step": 48500 },
    { "epoch": 4.355555555555555, "grad_norm": 2.1090219020843506, "learning_rate": 1.1292088888888891e-05, "loss": 1.541, "step": 49000 },
    { "epoch": 4.4, "grad_norm": 2.7172622680664062, "learning_rate": 1.1203200000000001e-05, "loss": 1.5172, "step": 49500 },
    { "epoch": 4.444444444444445, "grad_norm": 2.557300329208374, "learning_rate": 1.1114311111111112e-05, "loss": 1.5325, "step": 50000 },
    { "epoch": 4.488888888888889, "grad_norm": 2.321272850036621, "learning_rate": 1.1025422222222224e-05, "loss": 1.5424, "step": 50500 },
    { "epoch": 4.533333333333333, "grad_norm": 2.1814475059509277, "learning_rate": 1.0936533333333334e-05, "loss": 1.5219, "step": 51000 },
    { "epoch": 4.5777777777777775, "grad_norm": 2.5891165733337402, "learning_rate": 1.0847822222222224e-05, "loss": 1.5294, "step": 51500 },
    { "epoch": 4.622222222222222, "grad_norm": 2.655404567718506, "learning_rate": 1.0758933333333335e-05, "loss": 1.5272, "step": 52000 },
    { "epoch": 4.666666666666667, "grad_norm": 2.0418145656585693, "learning_rate": 1.0670044444444445e-05, "loss": 1.5323, "step": 52500 },
    { "epoch": 4.711111111111111, "grad_norm": 2.315140962600708, "learning_rate": 1.0581155555555556e-05, "loss": 1.5201, "step": 53000 },
    { "epoch": 4.7555555555555555, "grad_norm": 2.4892210960388184, "learning_rate": 1.0492266666666668e-05, "loss": 1.5065, "step": 53500 },
    { "epoch": 4.8, "grad_norm": 3.2457361221313477, "learning_rate": 1.0403555555555556e-05, "loss": 1.531, "step": 54000 },
    { "epoch": 4.844444444444444, "grad_norm": 2.579188346862793, "learning_rate": 1.0314666666666668e-05, "loss": 1.5093, "step": 54500 },
    { "epoch": 4.888888888888889, "grad_norm": 2.9643566608428955, "learning_rate": 1.0225777777777778e-05, "loss": 1.5394, "step": 55000 },
    { "epoch": 4.933333333333334, "grad_norm": 2.824070930480957, "learning_rate": 1.0136888888888889e-05, "loss": 1.5352, "step": 55500 },
    { "epoch": 4.977777777777778, "grad_norm": 2.714317560195923, "learning_rate": 1.0048e-05, "loss": 1.5055, "step": 56000 },
    { "epoch": 5.0, "eval_Sacrebleu": 11.699486858073387, "eval_loss": 1.7135735750198364, "eval_runtime": 6965.0506, "eval_samples_per_second": 1.436, "eval_steps_per_second": 0.179, "step": 56250 },
    { "epoch": 5.022222222222222, "grad_norm": 2.3869211673736572, "learning_rate": 9.959288888888889e-06, "loss": 1.5236, "step": 56500 },
    { "epoch": 5.066666666666666, "grad_norm": 2.824490785598755, "learning_rate": 9.870400000000001e-06, "loss": 1.4628, "step": 57000 },
    { "epoch": 5.111111111111111, "grad_norm": 2.378420829772949, "learning_rate": 9.781511111111112e-06, "loss": 1.4761, "step": 57500 },
    { "epoch": 5.155555555555556, "grad_norm": 2.934887647628784, "learning_rate": 9.692622222222224e-06, "loss": 1.4744, "step": 58000 },
    { "epoch": 5.2, "grad_norm": 2.5306599140167236, "learning_rate": 9.603733333333334e-06, "loss": 1.4719, "step": 58500 },
    { "epoch": 5.2444444444444445, "grad_norm": 2.27069354057312, "learning_rate": 9.515022222222224e-06, "loss": 1.4717, "step": 59000 },
    { "epoch": 5.288888888888889, "grad_norm": 2.79362154006958, "learning_rate": 9.426133333333335e-06, "loss": 1.4671, "step": 59500 },
    { "epoch": 5.333333333333333, "grad_norm": 2.4642298221588135, "learning_rate": 9.337244444444445e-06, "loss": 1.4962, "step": 60000 },
    { "epoch": 5.377777777777778, "grad_norm": 2.8472516536712646, "learning_rate": 9.248355555555555e-06, "loss": 1.4997, "step": 60500 },
    { "epoch": 5.4222222222222225, "grad_norm": 2.2296738624572754, "learning_rate": 9.159644444444445e-06, "loss": 1.4713, "step": 61000 },
    { "epoch": 5.466666666666667, "grad_norm": 2.8742563724517822, "learning_rate": 9.070755555555556e-06, "loss": 1.4629, "step": 61500 },
    { "epoch": 5.511111111111111, "grad_norm": 2.6657145023345947, "learning_rate": 8.982044444444445e-06, "loss": 1.4653, "step": 62000 },
    { "epoch": 5.555555555555555, "grad_norm": 2.7092838287353516, "learning_rate": 8.893155555555556e-06, "loss": 1.4888, "step": 62500 },
    { "epoch": 5.6, "grad_norm": 2.277348041534424, "learning_rate": 8.804266666666668e-06, "loss": 1.4786, "step": 63000 },
    { "epoch": 5.644444444444445, "grad_norm": 1.995211124420166, "learning_rate": 8.715377777777778e-06, "loss": 1.4685, "step": 63500 },
    { "epoch": 5.688888888888889, "grad_norm": 2.569850444793701, "learning_rate": 8.62648888888889e-06, "loss": 1.4687, "step": 64000 },
    { "epoch": 5.733333333333333, "grad_norm": 2.3745999336242676, "learning_rate": 8.537600000000001e-06, "loss": 1.4949, "step": 64500 },
    { "epoch": 5.777777777777778, "grad_norm": 2.3172736167907715, "learning_rate": 8.448711111111112e-06, "loss": 1.4807, "step": 65000 },
    { "epoch": 5.822222222222222, "grad_norm": 2.316258192062378, "learning_rate": 8.359822222222222e-06, "loss": 1.4994, "step": 65500 },
    { "epoch": 5.866666666666667, "grad_norm": 2.303201913833618, "learning_rate": 8.270933333333334e-06, "loss": 1.4887, "step": 66000 },
    { "epoch": 5.911111111111111, "grad_norm": 2.1658709049224854, "learning_rate": 8.182222222222222e-06, "loss": 1.4751, "step": 66500 },
    { "epoch": 5.955555555555556, "grad_norm": 2.177354574203491, "learning_rate": 8.093333333333334e-06, "loss": 1.4751, "step": 67000 },
    { "epoch": 6.0, "grad_norm": 2.559067726135254, "learning_rate": 8.004444444444445e-06, "loss": 1.4748, "step": 67500 },
    { "epoch": 6.0, "eval_Sacrebleu": 11.844348456691092, "eval_loss": 1.7136952877044678, "eval_runtime": 4383.3868, "eval_samples_per_second": 2.281, "eval_steps_per_second": 0.285, "step": 67500 },
    { "epoch": 6.044444444444444, "grad_norm": 2.618431329727173, "learning_rate": 7.915555555555557e-06, "loss": 1.4243, "step": 68000 },
    { "epoch": 6.088888888888889, "grad_norm": 2.2536802291870117, "learning_rate": 7.826666666666667e-06, "loss": 1.4415, "step": 68500 },
    { "epoch": 6.133333333333334, "grad_norm": 2.866265058517456, "learning_rate": 7.737777777777778e-06, "loss": 1.429, "step": 69000 },
    { "epoch": 6.177777777777778, "grad_norm": 2.799807071685791, "learning_rate": 7.648888888888888e-06, "loss": 1.4456, "step": 69500 },
    { "epoch": 6.222222222222222, "grad_norm": 2.6799798011779785, "learning_rate": 7.5600000000000005e-06, "loss": 1.4302, "step": 70000 },
    { "epoch": 6.266666666666667, "grad_norm": 2.8705008029937744, "learning_rate": 7.471288888888889e-06, "loss": 1.4334, "step": 70500 },
    { "epoch": 6.311111111111111, "grad_norm": 2.6502320766448975, "learning_rate": 7.382400000000001e-06, "loss": 1.4338, "step": 71000 },
    { "epoch": 6.355555555555555, "grad_norm": 2.9277572631835938, "learning_rate": 7.293511111111111e-06, "loss": 1.4364, "step": 71500 },
    { "epoch": 6.4, "grad_norm": 2.3213584423065186, "learning_rate": 7.2046222222222224e-06, "loss": 1.4258, "step": 72000 },
    { "epoch": 6.444444444444445, "grad_norm": 3.05753231048584, "learning_rate": 7.115911111111111e-06, "loss": 1.4159, "step": 72500 },
    { "epoch": 6.488888888888889, "grad_norm": 2.879302978515625, "learning_rate": 7.027022222222223e-06, "loss": 1.4575, "step": 73000 },
    { "epoch": 6.533333333333333, "grad_norm": 2.283719539642334, "learning_rate": 6.938133333333333e-06, "loss": 1.4355, "step": 73500 },
    { "epoch": 6.5777777777777775, "grad_norm": 2.5252952575683594, "learning_rate": 6.849422222222223e-06, "loss": 1.4346, "step": 74000 },
    { "epoch": 6.622222222222222, "grad_norm": 2.6116600036621094, "learning_rate": 6.760533333333333e-06, "loss": 1.4646, "step": 74500 },
    { "epoch": 6.666666666666667, "grad_norm": 2.494943380355835, "learning_rate": 6.671644444444445e-06, "loss": 1.4385, "step": 75000 },
    { "epoch": 6.711111111111111, "grad_norm": 2.3194327354431152, "learning_rate": 6.582755555555556e-06, "loss": 1.4508, "step": 75500 },
    { "epoch": 6.7555555555555555, "grad_norm": 2.8426125049591064, "learning_rate": 6.494044444444445e-06, "loss": 1.4412, "step": 76000 },
    { "epoch": 6.8, "grad_norm": 3.0324361324310303, "learning_rate": 6.405155555555555e-06, "loss": 1.423, "step": 76500 },
    { "epoch": 6.844444444444444, "grad_norm": 2.176151990890503, "learning_rate": 6.3162666666666674e-06, "loss": 1.4474, "step": 77000 },
    { "epoch": 6.888888888888889, "grad_norm": 2.6893413066864014, "learning_rate": 6.227377777777778e-06, "loss": 1.4415, "step": 77500 },
    { "epoch": 6.933333333333334, "grad_norm": 2.657773017883301, "learning_rate": 6.138488888888889e-06, "loss": 1.4257, "step": 78000 },
    { "epoch": 6.977777777777778, "grad_norm": 2.4668519496917725, "learning_rate": 6.049600000000001e-06, "loss": 1.424, "step": 78500 },
    { "epoch": 7.0, "eval_Sacrebleu": 11.861485151142872, "eval_loss": 1.7204111814498901, "eval_runtime": 4008.7534, "eval_samples_per_second": 2.495, "eval_steps_per_second": 0.312, "step": 78750 },
    { "epoch": 7.022222222222222, "grad_norm": 3.0832223892211914, "learning_rate": 5.960711111111112e-06, "loss": 1.4192, "step": 79000 },
    { "epoch": 7.066666666666666, "grad_norm": 2.553757667541504, "learning_rate": 5.871822222222223e-06, "loss": 1.4091, "step": 79500 },
    { "epoch": 7.111111111111111, "grad_norm": 2.9473392963409424, "learning_rate": 5.7829333333333336e-06, "loss": 1.4289, "step": 80000 },
    { "epoch": 7.155555555555556, "grad_norm": 3.1997127532958984, "learning_rate": 5.694044444444446e-06, "loss": 1.3862, "step": 80500 },
    { "epoch": 7.2, "grad_norm": 3.077010154724121, "learning_rate": 5.605155555555556e-06, "loss": 1.4042, "step": 81000 },
    { "epoch": 7.2444444444444445, "grad_norm": 3.0450375080108643, "learning_rate": 5.5162666666666675e-06, "loss": 1.4109, "step": 81500 },
    { "epoch": 7.288888888888889, "grad_norm": 2.3486075401306152, "learning_rate": 5.4275555555555555e-06, "loss": 1.3827, "step": 82000 },
    { "epoch": 7.333333333333333, "grad_norm": 1.9607141017913818, "learning_rate": 5.338666666666668e-06, "loss": 1.4181, "step": 82500 },
    { "epoch": 7.377777777777778, "grad_norm": 2.3723082542419434, "learning_rate": 5.249777777777778e-06, "loss": 1.386, "step": 83000 },
    { "epoch": 7.4222222222222225, "grad_norm": 3.01910138130188, "learning_rate": 5.1608888888888894e-06, "loss": 1.4034, "step": 83500 },
    { "epoch": 7.466666666666667, "grad_norm": 3.99428129196167, "learning_rate": 5.072e-06, "loss": 1.4088, "step": 84000 },
    { "epoch": 7.511111111111111, "grad_norm": 2.6704437732696533, "learning_rate": 4.983111111111111e-06, "loss": 1.4215, "step": 84500 },
    { "epoch": 7.555555555555555, "grad_norm": 2.2187724113464355, "learning_rate": 4.8944e-06, "loss": 1.4019, "step": 85000 },
    { "epoch": 7.6, "grad_norm": 2.6798274517059326, "learning_rate": 4.805511111111111e-06, "loss": 1.4033, "step": 85500 },
    { "epoch": 7.644444444444445, "grad_norm": 2.2254135608673096, "learning_rate": 4.716622222222223e-06, "loss": 1.3873, "step": 86000 },
    { "epoch": 7.688888888888889, "grad_norm": 3.0042669773101807, "learning_rate": 4.627733333333333e-06, "loss": 1.3814, "step": 86500 },
    { "epoch": 7.733333333333333, "grad_norm": 2.9878363609313965, "learning_rate": 4.539022222222222e-06, "loss": 1.4171, "step": 87000 },
    { "epoch": 7.777777777777778, "grad_norm": 2.8947293758392334, "learning_rate": 4.450133333333333e-06, "loss": 1.3992, "step": 87500 },
    { "epoch": 7.822222222222222, "grad_norm": 2.1072092056274414, "learning_rate": 4.361422222222222e-06, "loss": 1.4128, "step": 88000 },
    { "epoch": 7.866666666666667, "grad_norm": 2.7761893272399902, "learning_rate": 4.272533333333334e-06, "loss": 1.4229, "step": 88500 },
    { "epoch": 7.911111111111111, "grad_norm": 2.724802255630493, "learning_rate": 4.183644444444445e-06, "loss": 1.4147, "step": 89000 },
    { "epoch": 7.955555555555556, "grad_norm": 2.1179940700531006, "learning_rate": 4.094755555555555e-06, "loss": 1.4154, "step": 89500 },
    { "epoch": 8.0, "grad_norm": 2.6914615631103516, "learning_rate": 4.005866666666667e-06, "loss": 1.4058, "step": 90000 },
    { "epoch": 8.0, "eval_Sacrebleu": 11.669051917717454, "eval_loss": 1.7239112854003906, "eval_runtime": 3810.4584, "eval_samples_per_second": 2.624, "eval_steps_per_second": 0.328, "step": 90000 },
    { "epoch": 8.044444444444444, "grad_norm": 2.054410457611084, "learning_rate": 3.916977777777778e-06, "loss": 1.3687, "step": 90500 },
    { "epoch": 8.088888888888889, "grad_norm": 3.7907662391662598, "learning_rate": 3.828088888888889e-06, "loss": 1.3839, "step": 91000 },
    { "epoch": 8.133333333333333, "grad_norm": 2.704850435256958, "learning_rate": 3.7392e-06, "loss": 1.3677, "step": 91500 },
    { "epoch": 8.177777777777777, "grad_norm": 2.888054132461548, "learning_rate": 3.650311111111111e-06, "loss": 1.382, "step": 92000 },
    { "epoch": 8.222222222222221, "grad_norm": 3.2732253074645996, "learning_rate": 3.5616e-06, "loss": 1.4076, "step": 92500 },
    { "epoch": 8.266666666666667, "grad_norm": 2.5939693450927734, "learning_rate": 3.4727111111111112e-06, "loss": 1.376, "step": 93000 },
    { "epoch": 8.311111111111112, "grad_norm": 3.650665760040283, "learning_rate": 3.383822222222222e-06, "loss": 1.3869, "step": 93500 },
    { "epoch": 8.355555555555556, "grad_norm": 2.4687626361846924, "learning_rate": 3.294933333333334e-06, "loss": 1.3643, "step": 94000 },
    { "epoch": 8.4, "grad_norm": 2.497544765472412, "learning_rate": 3.206044444444445e-06, "loss": 1.3783, "step": 94500 },
    { "epoch": 8.444444444444445, "grad_norm": 2.620575189590454, "learning_rate": 3.117333333333333e-06, "loss": 1.3929, "step": 95000 },
    { "epoch": 8.488888888888889, "grad_norm": 1.9510403871536255, "learning_rate": 3.028444444444445e-06, "loss": 1.3757, "step": 95500 },
    { "epoch": 8.533333333333333, "grad_norm": 2.253645896911621, "learning_rate": 2.9395555555555562e-06, "loss": 1.3843, "step": 96000 },
    { "epoch": 8.577777777777778, "grad_norm": 2.5692851543426514, "learning_rate": 2.850666666666667e-06, "loss": 1.3617, "step": 96500 },
    { "epoch": 8.622222222222222, "grad_norm": 2.7318949699401855, "learning_rate": 2.761955555555556e-06, "loss": 1.3965, "step": 97000 },
    { "epoch": 8.666666666666666, "grad_norm": 2.397948741912842, "learning_rate": 2.6730666666666673e-06, "loss": 1.3901, "step": 97500 },
    { "epoch": 8.71111111111111, "grad_norm": 2.227858781814575, "learning_rate": 2.584177777777778e-06, "loss": 1.3631, "step": 98000 },
    { "epoch": 8.755555555555556, "grad_norm": 2.7916066646575928, "learning_rate": 2.495288888888889e-06, "loss": 1.3957, "step": 98500 },
    { "epoch": 8.8, "grad_norm": 3.0856597423553467, "learning_rate": 2.4064e-06, "loss": 1.3941, "step": 99000 },
    { "epoch": 8.844444444444445, "grad_norm": 2.253394603729248, "learning_rate": 2.317688888888889e-06, "loss": 1.3731, "step": 99500 },
    { "epoch": 8.88888888888889, "grad_norm": 2.375483274459839, "learning_rate": 2.2288e-06, "loss": 1.4075, "step": 100000 },
    { "epoch": 8.933333333333334, "grad_norm": 2.5208873748779297, "learning_rate": 2.139911111111111e-06, "loss": 1.36, "step": 100500 },
    { "epoch": 8.977777777777778, "grad_norm": 2.9750773906707764, "learning_rate": 2.0510222222222223e-06, "loss": 1.3803, "step": 101000 },
    { "epoch": 9.0, "eval_Sacrebleu": 11.812893236671785, "eval_loss": 1.7298730611801147, "eval_runtime": 3842.5674, "eval_samples_per_second": 2.602, "eval_steps_per_second": 0.325, "step": 101250 },
    { "epoch": 9.022222222222222, "grad_norm": 2.893308162689209, "learning_rate": 1.9621333333333332e-06, "loss": 1.3477, "step": 101500 },
    { "epoch": 9.066666666666666, "grad_norm": 2.4525198936462402, "learning_rate": 1.8732444444444445e-06, "loss": 1.3894, "step": 102000 },
    { "epoch": 9.11111111111111, "grad_norm": 2.138702392578125, "learning_rate": 1.7843555555555556e-06, "loss": 1.3871, "step": 102500 },
    { "epoch": 9.155555555555555, "grad_norm": 2.6823806762695312, "learning_rate": 1.6956444444444445e-06, "loss": 1.3678, "step": 103000 },
    { "epoch": 9.2, "grad_norm": 3.6477270126342773, "learning_rate": 1.6067555555555556e-06, "loss": 1.3709, "step": 103500 },
    { "epoch": 9.244444444444444, "grad_norm": 2.2082371711730957, "learning_rate": 1.5182222222222223e-06, "loss": 1.376, "step": 104000 },
    { "epoch": 9.28888888888889, "grad_norm": 2.7231674194335938, "learning_rate": 1.4293333333333334e-06, "loss": 1.3372, "step": 104500 },
    { "epoch": 9.333333333333334, "grad_norm": 2.383312940597534, "learning_rate": 1.3404444444444445e-06, "loss": 1.3507, "step": 105000 },
    { "epoch": 9.377777777777778, "grad_norm": 2.908069610595703, "learning_rate": 1.2515555555555556e-06, "loss": 1.371, "step": 105500 },
    { "epoch": 9.422222222222222, "grad_norm": 2.7813098430633545, "learning_rate": 1.1626666666666667e-06, "loss": 1.355, "step": 106000 },
    { "epoch": 9.466666666666667, "grad_norm": 2.4911601543426514, "learning_rate": 1.0737777777777778e-06, "loss": 1.3662, "step": 106500 },
    { "epoch": 9.511111111111111, "grad_norm": 2.429072856903076, "learning_rate": 9.85066666666667e-07, "loss": 1.3643, "step": 107000 },
    { "epoch": 9.555555555555555, "grad_norm": 2.832702875137329, "learning_rate": 8.961777777777779e-07, "loss": 1.3664, "step": 107500 },
    { "epoch": 9.6, "grad_norm": 2.4679746627807617, "learning_rate": 8.07288888888889e-07, "loss": 1.3599, "step": 108000 },
    { "epoch": 9.644444444444444, "grad_norm": 2.522490978240967, "learning_rate": 7.184000000000001e-07, "loss": 1.3691, "step": 108500 },
    { "epoch": 9.688888888888888, "grad_norm": 2.7089273929595947, "learning_rate": 6.295111111111112e-07, "loss": 1.373, "step": 109000 },
    { "epoch": 9.733333333333333, "grad_norm": 2.0823116302490234, "learning_rate": 5.406222222222223e-07, "loss": 1.369, "step": 109500 },
    { "epoch": 9.777777777777779, "grad_norm": 1.9741628170013428, "learning_rate": 4.517333333333334e-07, "loss": 1.3559, "step": 110000 },
    { "epoch": 9.822222222222223, "grad_norm": 2.4756247997283936, "learning_rate": 3.628444444444445e-07, "loss": 1.379, "step": 110500 },
    { "epoch": 9.866666666666667, "grad_norm": 2.6208088397979736, "learning_rate": 2.739555555555556e-07, "loss": 1.3585, "step": 111000 },
    { "epoch": 9.911111111111111, "grad_norm": 3.2721610069274902, "learning_rate": 1.8506666666666668e-07, "loss": 1.3772, "step": 111500 },
    { "epoch": 9.955555555555556, "grad_norm": 2.5334367752075195, "learning_rate": 9.617777777777777e-08, "loss": 1.3576, "step": 112000 },
    { "epoch": 10.0, "grad_norm": 2.59663987159729, "learning_rate": 7.288888888888889e-09, "loss": 1.3678, "step": 112500 }
  ],
  "logging_steps": 500,
  "max_steps": 112500,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 10,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 2.437992677376e+17,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}