|
{ |
|
"best_metric": 0.20909027755260468, |
|
"best_model_checkpoint": "/root/pretrain_utg4java_02/checkpoint-91689", |
|
"epoch": 40.0, |
|
"eval_steps": 500, |
|
"global_step": 94040, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 1.1109575033187866, |
|
"learning_rate": 9.98e-05, |
|
"loss": 0.4973, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 1.1452946662902832, |
|
"learning_rate": 0.0001998, |
|
"loss": 0.4793, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.9271925091743469, |
|
"learning_rate": 0.0001989273430782459, |
|
"loss": 0.4999, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 0.9055812358856201, |
|
"learning_rate": 0.0001978525365434222, |
|
"loss": 0.4934, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_loss": 0.4301895797252655, |
|
"eval_runtime": 56.7204, |
|
"eval_samples_per_second": 331.733, |
|
"eval_steps_per_second": 2.592, |
|
"step": 2351 |
|
}, |
|
{ |
|
"epoch": 1.06, |
|
"grad_norm": 0.8988948464393616, |
|
"learning_rate": 0.00019677773000859847, |
|
"loss": 0.4818, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 1.28, |
|
"grad_norm": 0.9011508822441101, |
|
"learning_rate": 0.00019570292347377474, |
|
"loss": 0.4651, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 1.49, |
|
"grad_norm": 0.905346691608429, |
|
"learning_rate": 0.00019463026655202063, |
|
"loss": 0.467, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 1.7, |
|
"grad_norm": 0.7411425709724426, |
|
"learning_rate": 0.0001935597592433362, |
|
"loss": 0.4625, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 1.91, |
|
"grad_norm": 0.8985588550567627, |
|
"learning_rate": 0.00019248495270851249, |
|
"loss": 0.4572, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_loss": 0.4058227837085724, |
|
"eval_runtime": 56.7252, |
|
"eval_samples_per_second": 331.705, |
|
"eval_steps_per_second": 2.591, |
|
"step": 4702 |
|
}, |
|
{ |
|
"epoch": 2.13, |
|
"grad_norm": 0.827357292175293, |
|
"learning_rate": 0.00019141014617368875, |
|
"loss": 0.4472, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 2.34, |
|
"grad_norm": 0.736473023891449, |
|
"learning_rate": 0.00019033533963886502, |
|
"loss": 0.4372, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 2.55, |
|
"grad_norm": 0.7558779120445251, |
|
"learning_rate": 0.00018926053310404129, |
|
"loss": 0.4362, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 2.76, |
|
"grad_norm": 0.7970576882362366, |
|
"learning_rate": 0.00018818572656921755, |
|
"loss": 0.4332, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 2.98, |
|
"grad_norm": 0.7692726850509644, |
|
"learning_rate": 0.00018711092003439382, |
|
"loss": 0.43, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_loss": 0.38832882046699524, |
|
"eval_runtime": 56.9476, |
|
"eval_samples_per_second": 330.409, |
|
"eval_steps_per_second": 2.581, |
|
"step": 7053 |
|
}, |
|
{ |
|
"epoch": 3.19, |
|
"grad_norm": 0.7109490633010864, |
|
"learning_rate": 0.00018603611349957009, |
|
"loss": 0.4217, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 3.4, |
|
"grad_norm": 0.7125234603881836, |
|
"learning_rate": 0.00018496130696474635, |
|
"loss": 0.4141, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 3.62, |
|
"grad_norm": 0.6914758682250977, |
|
"learning_rate": 0.00018388650042992262, |
|
"loss": 0.4169, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 3.83, |
|
"grad_norm": 0.7215930223464966, |
|
"learning_rate": 0.00018281169389509889, |
|
"loss": 0.4132, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_loss": 0.3813731372356415, |
|
"eval_runtime": 56.8812, |
|
"eval_samples_per_second": 330.794, |
|
"eval_steps_per_second": 2.584, |
|
"step": 9404 |
|
}, |
|
{ |
|
"epoch": 4.04, |
|
"grad_norm": 0.7283121943473816, |
|
"learning_rate": 0.00018173688736027515, |
|
"loss": 0.4131, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 4.25, |
|
"grad_norm": 0.6947267055511475, |
|
"learning_rate": 0.00018066208082545142, |
|
"loss": 0.4059, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 4.47, |
|
"grad_norm": 0.7544755339622498, |
|
"learning_rate": 0.0001795872742906277, |
|
"loss": 0.4023, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 4.68, |
|
"grad_norm": 0.6204919815063477, |
|
"learning_rate": 0.00017851246775580398, |
|
"loss": 0.4038, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 4.89, |
|
"grad_norm": 0.7036948800086975, |
|
"learning_rate": 0.00017743766122098022, |
|
"loss": 0.4017, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_loss": 0.3638169765472412, |
|
"eval_runtime": 56.8241, |
|
"eval_samples_per_second": 331.127, |
|
"eval_steps_per_second": 2.587, |
|
"step": 11755 |
|
}, |
|
{ |
|
"epoch": 5.1, |
|
"grad_norm": 0.62198805809021, |
|
"learning_rate": 0.0001763628546861565, |
|
"loss": 0.3921, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 5.32, |
|
"grad_norm": 0.6095067858695984, |
|
"learning_rate": 0.00017528804815133278, |
|
"loss": 0.3901, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 5.53, |
|
"grad_norm": 1.0518782138824463, |
|
"learning_rate": 0.00017421324161650902, |
|
"loss": 0.3888, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 5.74, |
|
"grad_norm": 0.6801586151123047, |
|
"learning_rate": 0.00017313843508168531, |
|
"loss": 0.3859, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 5.95, |
|
"grad_norm": 0.6489918828010559, |
|
"learning_rate": 0.00017206362854686158, |
|
"loss": 0.3839, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_loss": 0.35733696818351746, |
|
"eval_runtime": 56.779, |
|
"eval_samples_per_second": 331.39, |
|
"eval_steps_per_second": 2.589, |
|
"step": 14106 |
|
}, |
|
{ |
|
"epoch": 6.17, |
|
"grad_norm": 0.6174113154411316, |
|
"learning_rate": 0.00017098882201203782, |
|
"loss": 0.3782, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 6.38, |
|
"grad_norm": 0.641315221786499, |
|
"learning_rate": 0.00016991401547721411, |
|
"loss": 0.3792, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 6.59, |
|
"grad_norm": 0.6454831957817078, |
|
"learning_rate": 0.00016883920894239038, |
|
"loss": 0.3745, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 6.81, |
|
"grad_norm": 0.6489464640617371, |
|
"learning_rate": 0.00016776440240756665, |
|
"loss": 0.3729, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"eval_loss": 0.3489345610141754, |
|
"eval_runtime": 57.0727, |
|
"eval_samples_per_second": 329.685, |
|
"eval_steps_per_second": 2.576, |
|
"step": 16457 |
|
}, |
|
{ |
|
"epoch": 7.02, |
|
"grad_norm": 0.6929277181625366, |
|
"learning_rate": 0.0001666938950988822, |
|
"loss": 0.3718, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 7.23, |
|
"grad_norm": 0.5726625323295593, |
|
"learning_rate": 0.00016562123817712813, |
|
"loss": 0.3624, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 7.44, |
|
"grad_norm": 0.603529155254364, |
|
"learning_rate": 0.0001645464316423044, |
|
"loss": 0.3664, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 7.66, |
|
"grad_norm": 0.7436888813972473, |
|
"learning_rate": 0.00016347162510748066, |
|
"loss": 0.3614, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 7.87, |
|
"grad_norm": 0.6405676007270813, |
|
"learning_rate": 0.00016239681857265693, |
|
"loss": 0.3646, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_loss": 0.3384884297847748, |
|
"eval_runtime": 56.7342, |
|
"eval_samples_per_second": 331.652, |
|
"eval_steps_per_second": 2.591, |
|
"step": 18808 |
|
}, |
|
{ |
|
"epoch": 8.08, |
|
"grad_norm": 0.660517156124115, |
|
"learning_rate": 0.0001613220120378332, |
|
"loss": 0.3623, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 8.29, |
|
"grad_norm": 0.6352570056915283, |
|
"learning_rate": 0.0001602493551160791, |
|
"loss": 0.3518, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 8.51, |
|
"grad_norm": 0.6094992756843567, |
|
"learning_rate": 0.0001591745485812554, |
|
"loss": 0.3569, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 8.72, |
|
"grad_norm": 0.6646167635917664, |
|
"learning_rate": 0.00015809974204643166, |
|
"loss": 0.3567, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 8.93, |
|
"grad_norm": 0.6240518093109131, |
|
"learning_rate": 0.00015702493551160792, |
|
"loss": 0.353, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"eval_loss": 0.3339434862136841, |
|
"eval_runtime": 56.979, |
|
"eval_samples_per_second": 330.227, |
|
"eval_steps_per_second": 2.58, |
|
"step": 21159 |
|
}, |
|
{ |
|
"epoch": 9.15, |
|
"grad_norm": 0.6990819573402405, |
|
"learning_rate": 0.0001559501289767842, |
|
"loss": 0.3491, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 9.36, |
|
"grad_norm": 0.6751831769943237, |
|
"learning_rate": 0.00015487532244196046, |
|
"loss": 0.346, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 9.57, |
|
"grad_norm": 0.615511417388916, |
|
"learning_rate": 0.00015380051590713672, |
|
"loss": 0.3488, |
|
"step": 22500 |
|
}, |
|
{ |
|
"epoch": 9.78, |
|
"grad_norm": 0.5277218222618103, |
|
"learning_rate": 0.000152725709372313, |
|
"loss": 0.3466, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"grad_norm": 0.5809108018875122, |
|
"learning_rate": 0.00015165090283748926, |
|
"loss": 0.3441, |
|
"step": 23500 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"eval_loss": 0.324989378452301, |
|
"eval_runtime": 56.9322, |
|
"eval_samples_per_second": 330.499, |
|
"eval_steps_per_second": 2.582, |
|
"step": 23510 |
|
}, |
|
{ |
|
"epoch": 10.21, |
|
"grad_norm": 0.5285485982894897, |
|
"learning_rate": 0.00015058039552880484, |
|
"loss": 0.3407, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 10.42, |
|
"grad_norm": 0.6268051862716675, |
|
"learning_rate": 0.00014950558899398108, |
|
"loss": 0.3389, |
|
"step": 24500 |
|
}, |
|
{ |
|
"epoch": 10.63, |
|
"grad_norm": 0.5879684090614319, |
|
"learning_rate": 0.00014843078245915735, |
|
"loss": 0.3387, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 10.85, |
|
"grad_norm": 0.5434576869010925, |
|
"learning_rate": 0.00014735597592433364, |
|
"loss": 0.3379, |
|
"step": 25500 |
|
}, |
|
{ |
|
"epoch": 11.0, |
|
"eval_loss": 0.31874018907546997, |
|
"eval_runtime": 57.0459, |
|
"eval_samples_per_second": 329.84, |
|
"eval_steps_per_second": 2.577, |
|
"step": 25861 |
|
}, |
|
{ |
|
"epoch": 11.06, |
|
"grad_norm": 0.6547732949256897, |
|
"learning_rate": 0.00014628116938950988, |
|
"loss": 0.3369, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 11.27, |
|
"grad_norm": 0.648601233959198, |
|
"learning_rate": 0.00014520636285468615, |
|
"loss": 0.3308, |
|
"step": 26500 |
|
}, |
|
{ |
|
"epoch": 11.48, |
|
"grad_norm": 0.6311826109886169, |
|
"learning_rate": 0.00014413155631986244, |
|
"loss": 0.3298, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 11.7, |
|
"grad_norm": 0.5571895837783813, |
|
"learning_rate": 0.00014305674978503868, |
|
"loss": 0.3292, |
|
"step": 27500 |
|
}, |
|
{ |
|
"epoch": 11.91, |
|
"grad_norm": 0.5373964309692383, |
|
"learning_rate": 0.00014198194325021498, |
|
"loss": 0.3284, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"eval_loss": 0.3123379051685333, |
|
"eval_runtime": 56.7858, |
|
"eval_samples_per_second": 331.351, |
|
"eval_steps_per_second": 2.589, |
|
"step": 28212 |
|
}, |
|
{ |
|
"epoch": 12.12, |
|
"grad_norm": 0.5307437777519226, |
|
"learning_rate": 0.00014090928632846088, |
|
"loss": 0.3247, |
|
"step": 28500 |
|
}, |
|
{ |
|
"epoch": 12.34, |
|
"grad_norm": 0.6422705054283142, |
|
"learning_rate": 0.00013983447979363714, |
|
"loss": 0.3223, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 12.55, |
|
"grad_norm": 0.6680347323417664, |
|
"learning_rate": 0.00013875967325881344, |
|
"loss": 0.3223, |
|
"step": 29500 |
|
}, |
|
{ |
|
"epoch": 12.76, |
|
"grad_norm": 0.5851155519485474, |
|
"learning_rate": 0.00013768701633705933, |
|
"loss": 0.3245, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 12.97, |
|
"grad_norm": 0.544624924659729, |
|
"learning_rate": 0.0001366122098022356, |
|
"loss": 0.3252, |
|
"step": 30500 |
|
}, |
|
{ |
|
"epoch": 13.0, |
|
"eval_loss": 0.3076089322566986, |
|
"eval_runtime": 57.0752, |
|
"eval_samples_per_second": 329.67, |
|
"eval_steps_per_second": 2.576, |
|
"step": 30563 |
|
}, |
|
{ |
|
"epoch": 13.19, |
|
"grad_norm": 0.5343388319015503, |
|
"learning_rate": 0.00013553740326741187, |
|
"loss": 0.3185, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 13.4, |
|
"grad_norm": 0.6074999570846558, |
|
"learning_rate": 0.00013446259673258814, |
|
"loss": 0.3191, |
|
"step": 31500 |
|
}, |
|
{ |
|
"epoch": 13.61, |
|
"grad_norm": 0.6202041506767273, |
|
"learning_rate": 0.00013338779019776443, |
|
"loss": 0.3186, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 13.82, |
|
"grad_norm": 0.6058717966079712, |
|
"learning_rate": 0.00013231298366294067, |
|
"loss": 0.3134, |
|
"step": 32500 |
|
}, |
|
{ |
|
"epoch": 14.0, |
|
"eval_loss": 0.3018127679824829, |
|
"eval_runtime": 57.0557, |
|
"eval_samples_per_second": 329.783, |
|
"eval_steps_per_second": 2.576, |
|
"step": 32914 |
|
}, |
|
{ |
|
"epoch": 14.04, |
|
"grad_norm": 0.6439224481582642, |
|
"learning_rate": 0.00013123817712811694, |
|
"loss": 0.3161, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 14.25, |
|
"grad_norm": 0.5888046622276306, |
|
"learning_rate": 0.00013016337059329323, |
|
"loss": 0.3094, |
|
"step": 33500 |
|
}, |
|
{ |
|
"epoch": 14.46, |
|
"grad_norm": 0.6644863486289978, |
|
"learning_rate": 0.00012908856405846947, |
|
"loss": 0.3092, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 14.67, |
|
"grad_norm": 0.5649863481521606, |
|
"learning_rate": 0.00012801375752364576, |
|
"loss": 0.3117, |
|
"step": 34500 |
|
}, |
|
{ |
|
"epoch": 14.89, |
|
"grad_norm": 0.4947267174720764, |
|
"learning_rate": 0.00012693895098882203, |
|
"loss": 0.3072, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 15.0, |
|
"eval_loss": 0.29680636525154114, |
|
"eval_runtime": 57.044, |
|
"eval_samples_per_second": 329.851, |
|
"eval_steps_per_second": 2.577, |
|
"step": 35265 |
|
}, |
|
{ |
|
"epoch": 15.1, |
|
"grad_norm": 0.6015808582305908, |
|
"learning_rate": 0.00012586414445399827, |
|
"loss": 0.3067, |
|
"step": 35500 |
|
}, |
|
{ |
|
"epoch": 15.31, |
|
"grad_norm": 0.6003320217132568, |
|
"learning_rate": 0.00012478933791917456, |
|
"loss": 0.3027, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 15.53, |
|
"grad_norm": 0.5643883347511292, |
|
"learning_rate": 0.00012371453138435083, |
|
"loss": 0.3012, |
|
"step": 36500 |
|
}, |
|
{ |
|
"epoch": 15.74, |
|
"grad_norm": 0.5768193602561951, |
|
"learning_rate": 0.00012264187446259673, |
|
"loss": 0.303, |
|
"step": 37000 |
|
}, |
|
{ |
|
"epoch": 15.95, |
|
"grad_norm": 0.5599430799484253, |
|
"learning_rate": 0.00012156921754084265, |
|
"loss": 0.3034, |
|
"step": 37500 |
|
}, |
|
{ |
|
"epoch": 16.0, |
|
"eval_loss": 0.2938060164451599, |
|
"eval_runtime": 56.7422, |
|
"eval_samples_per_second": 331.605, |
|
"eval_steps_per_second": 2.591, |
|
"step": 37616 |
|
}, |
|
{ |
|
"epoch": 16.16, |
|
"grad_norm": 0.6380453705787659, |
|
"learning_rate": 0.00012049441100601892, |
|
"loss": 0.2987, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 16.38, |
|
"grad_norm": 0.5312320590019226, |
|
"learning_rate": 0.0001194196044711952, |
|
"loss": 0.2975, |
|
"step": 38500 |
|
}, |
|
{ |
|
"epoch": 16.59, |
|
"grad_norm": 0.7027547955513, |
|
"learning_rate": 0.00011834479793637145, |
|
"loss": 0.2934, |
|
"step": 39000 |
|
}, |
|
{ |
|
"epoch": 16.8, |
|
"grad_norm": 0.5144539475440979, |
|
"learning_rate": 0.00011726999140154773, |
|
"loss": 0.2959, |
|
"step": 39500 |
|
}, |
|
{ |
|
"epoch": 17.0, |
|
"eval_loss": 0.28754693269729614, |
|
"eval_runtime": 56.9555, |
|
"eval_samples_per_second": 330.363, |
|
"eval_steps_per_second": 2.581, |
|
"step": 39967 |
|
}, |
|
{ |
|
"epoch": 17.01, |
|
"grad_norm": 0.5759513974189758, |
|
"learning_rate": 0.00011619733447979365, |
|
"loss": 0.2997, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 17.23, |
|
"grad_norm": 0.593640148639679, |
|
"learning_rate": 0.00011512467755803955, |
|
"loss": 0.294, |
|
"step": 40500 |
|
}, |
|
{ |
|
"epoch": 17.44, |
|
"grad_norm": 0.6821351647377014, |
|
"learning_rate": 0.00011404987102321583, |
|
"loss": 0.2904, |
|
"step": 41000 |
|
}, |
|
{ |
|
"epoch": 17.65, |
|
"grad_norm": 0.5297681093215942, |
|
"learning_rate": 0.0001129750644883921, |
|
"loss": 0.2896, |
|
"step": 41500 |
|
}, |
|
{ |
|
"epoch": 17.86, |
|
"grad_norm": 0.5864290595054626, |
|
"learning_rate": 0.00011190025795356836, |
|
"loss": 0.2884, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 18.0, |
|
"eval_loss": 0.28015270829200745, |
|
"eval_runtime": 56.8879, |
|
"eval_samples_per_second": 330.756, |
|
"eval_steps_per_second": 2.584, |
|
"step": 42318 |
|
}, |
|
{ |
|
"epoch": 18.08, |
|
"grad_norm": 0.5764068961143494, |
|
"learning_rate": 0.00011082545141874463, |
|
"loss": 0.2874, |
|
"step": 42500 |
|
}, |
|
{ |
|
"epoch": 18.29, |
|
"grad_norm": 0.6073163747787476, |
|
"learning_rate": 0.0001097506448839209, |
|
"loss": 0.2828, |
|
"step": 43000 |
|
}, |
|
{ |
|
"epoch": 18.5, |
|
"grad_norm": 0.5691092610359192, |
|
"learning_rate": 0.00010867583834909716, |
|
"loss": 0.2848, |
|
"step": 43500 |
|
}, |
|
{ |
|
"epoch": 18.72, |
|
"grad_norm": 0.5399264097213745, |
|
"learning_rate": 0.00010760103181427344, |
|
"loss": 0.2861, |
|
"step": 44000 |
|
}, |
|
{ |
|
"epoch": 18.93, |
|
"grad_norm": 0.6221365928649902, |
|
"learning_rate": 0.0001065262252794497, |
|
"loss": 0.2839, |
|
"step": 44500 |
|
}, |
|
{ |
|
"epoch": 19.0, |
|
"eval_loss": 0.27618157863616943, |
|
"eval_runtime": 56.9107, |
|
"eval_samples_per_second": 330.623, |
|
"eval_steps_per_second": 2.583, |
|
"step": 44669 |
|
}, |
|
{ |
|
"epoch": 19.14, |
|
"grad_norm": 0.5080223679542542, |
|
"learning_rate": 0.00010545141874462596, |
|
"loss": 0.2791, |
|
"step": 45000 |
|
}, |
|
{ |
|
"epoch": 19.35, |
|
"grad_norm": 0.567764163017273, |
|
"learning_rate": 0.00010437661220980224, |
|
"loss": 0.2792, |
|
"step": 45500 |
|
}, |
|
{ |
|
"epoch": 19.57, |
|
"grad_norm": 0.5838685035705566, |
|
"learning_rate": 0.00010330180567497851, |
|
"loss": 0.2768, |
|
"step": 46000 |
|
}, |
|
{ |
|
"epoch": 19.78, |
|
"grad_norm": 0.5778998136520386, |
|
"learning_rate": 0.00010222699914015476, |
|
"loss": 0.2776, |
|
"step": 46500 |
|
}, |
|
{ |
|
"epoch": 19.99, |
|
"grad_norm": 0.615337073802948, |
|
"learning_rate": 0.0001011543422184007, |
|
"loss": 0.2785, |
|
"step": 47000 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"eval_loss": 0.2724262773990631, |
|
"eval_runtime": 56.852, |
|
"eval_samples_per_second": 330.965, |
|
"eval_steps_per_second": 2.586, |
|
"step": 47020 |
|
}, |
|
{ |
|
"epoch": 20.2, |
|
"grad_norm": 0.6153652667999268, |
|
"learning_rate": 0.00010007953568357695, |
|
"loss": 0.2712, |
|
"step": 47500 |
|
}, |
|
{ |
|
"epoch": 20.42, |
|
"grad_norm": 0.6126906275749207, |
|
"learning_rate": 9.900472914875323e-05, |
|
"loss": 0.2718, |
|
"step": 48000 |
|
}, |
|
{ |
|
"epoch": 20.63, |
|
"grad_norm": 0.5799471139907837, |
|
"learning_rate": 9.792992261392949e-05, |
|
"loss": 0.2701, |
|
"step": 48500 |
|
}, |
|
{ |
|
"epoch": 20.84, |
|
"grad_norm": 0.5679476261138916, |
|
"learning_rate": 9.685511607910577e-05, |
|
"loss": 0.2727, |
|
"step": 49000 |
|
}, |
|
{ |
|
"epoch": 21.0, |
|
"eval_loss": 0.26610955595970154, |
|
"eval_runtime": 56.7263, |
|
"eval_samples_per_second": 331.698, |
|
"eval_steps_per_second": 2.591, |
|
"step": 49371 |
|
}, |
|
{ |
|
"epoch": 21.05, |
|
"grad_norm": 0.5923852324485779, |
|
"learning_rate": 9.578245915735168e-05, |
|
"loss": 0.2694, |
|
"step": 49500 |
|
}, |
|
{ |
|
"epoch": 21.27, |
|
"grad_norm": 0.6184679269790649, |
|
"learning_rate": 9.470765262252796e-05, |
|
"loss": 0.2649, |
|
"step": 50000 |
|
}, |
|
{ |
|
"epoch": 21.48, |
|
"grad_norm": 0.5939807295799255, |
|
"learning_rate": 9.363284608770421e-05, |
|
"loss": 0.2697, |
|
"step": 50500 |
|
}, |
|
{ |
|
"epoch": 21.69, |
|
"grad_norm": 0.5925255417823792, |
|
"learning_rate": 9.255803955288048e-05, |
|
"loss": 0.2652, |
|
"step": 51000 |
|
}, |
|
{ |
|
"epoch": 21.91, |
|
"grad_norm": 0.5666438341140747, |
|
"learning_rate": 9.148323301805676e-05, |
|
"loss": 0.266, |
|
"step": 51500 |
|
}, |
|
{ |
|
"epoch": 22.0, |
|
"eval_loss": 0.266626238822937, |
|
"eval_runtime": 56.6361, |
|
"eval_samples_per_second": 332.226, |
|
"eval_steps_per_second": 2.596, |
|
"step": 51722 |
|
}, |
|
{ |
|
"epoch": 22.12, |
|
"grad_norm": 0.5490565299987793, |
|
"learning_rate": 9.040842648323303e-05, |
|
"loss": 0.2613, |
|
"step": 52000 |
|
}, |
|
{ |
|
"epoch": 22.33, |
|
"grad_norm": 0.6534095406532288, |
|
"learning_rate": 8.933361994840929e-05, |
|
"loss": 0.2592, |
|
"step": 52500 |
|
}, |
|
{ |
|
"epoch": 22.54, |
|
"grad_norm": 0.568953812122345, |
|
"learning_rate": 8.82609630266552e-05, |
|
"loss": 0.2604, |
|
"step": 53000 |
|
}, |
|
{ |
|
"epoch": 22.76, |
|
"grad_norm": 0.5968701243400574, |
|
"learning_rate": 8.718615649183147e-05, |
|
"loss": 0.2602, |
|
"step": 53500 |
|
}, |
|
{ |
|
"epoch": 22.97, |
|
"grad_norm": 0.5995833277702332, |
|
"learning_rate": 8.61134995700774e-05, |
|
"loss": 0.2614, |
|
"step": 54000 |
|
}, |
|
{ |
|
"epoch": 23.0, |
|
"eval_loss": 0.26461803913116455, |
|
"eval_runtime": 56.8736, |
|
"eval_samples_per_second": 330.839, |
|
"eval_steps_per_second": 2.585, |
|
"step": 54073 |
|
}, |
|
{ |
|
"epoch": 23.18, |
|
"grad_norm": 0.577324390411377, |
|
"learning_rate": 8.503869303525366e-05, |
|
"loss": 0.2502, |
|
"step": 54500 |
|
}, |
|
{ |
|
"epoch": 23.39, |
|
"grad_norm": 0.631363034248352, |
|
"learning_rate": 8.396603611349956e-05, |
|
"loss": 0.256, |
|
"step": 55000 |
|
}, |
|
{ |
|
"epoch": 23.61, |
|
"grad_norm": 0.5862709879875183, |
|
"learning_rate": 8.289122957867584e-05, |
|
"loss": 0.2559, |
|
"step": 55500 |
|
}, |
|
{ |
|
"epoch": 23.82, |
|
"grad_norm": 0.5620314478874207, |
|
"learning_rate": 8.181857265692176e-05, |
|
"loss": 0.2549, |
|
"step": 56000 |
|
}, |
|
{ |
|
"epoch": 24.0, |
|
"eval_loss": 0.2593855559825897, |
|
"eval_runtime": 57.337, |
|
"eval_samples_per_second": 328.165, |
|
"eval_steps_per_second": 2.564, |
|
"step": 56424 |
|
}, |
|
{ |
|
"epoch": 24.03, |
|
"grad_norm": 0.5514592528343201, |
|
"learning_rate": 8.074376612209804e-05, |
|
"loss": 0.2542, |
|
"step": 56500 |
|
}, |
|
{ |
|
"epoch": 24.25, |
|
"grad_norm": 0.6351886987686157, |
|
"learning_rate": 7.966895958727429e-05, |
|
"loss": 0.2483, |
|
"step": 57000 |
|
}, |
|
{ |
|
"epoch": 24.46, |
|
"grad_norm": 0.6075631380081177, |
|
"learning_rate": 7.859415305245056e-05, |
|
"loss": 0.2503, |
|
"step": 57500 |
|
}, |
|
{ |
|
"epoch": 24.67, |
|
"grad_norm": 0.5592435002326965, |
|
"learning_rate": 7.751934651762684e-05, |
|
"loss": 0.249, |
|
"step": 58000 |
|
}, |
|
{ |
|
"epoch": 24.88, |
|
"grad_norm": 0.5757043957710266, |
|
"learning_rate": 7.64445399828031e-05, |
|
"loss": 0.2497, |
|
"step": 58500 |
|
}, |
|
{ |
|
"epoch": 25.0, |
|
"eval_loss": 0.2535327076911926, |
|
"eval_runtime": 56.789, |
|
"eval_samples_per_second": 331.332, |
|
"eval_steps_per_second": 2.589, |
|
"step": 58775 |
|
}, |
|
{ |
|
"epoch": 25.1, |
|
"grad_norm": 0.5995878577232361, |
|
"learning_rate": 7.536973344797937e-05, |
|
"loss": 0.245, |
|
"step": 59000 |
|
}, |
|
{ |
|
"epoch": 25.31, |
|
"grad_norm": 0.6077148914337158, |
|
"learning_rate": 7.429492691315564e-05, |
|
"loss": 0.2461, |
|
"step": 59500 |
|
}, |
|
{ |
|
"epoch": 25.52, |
|
"grad_norm": 0.5592058300971985, |
|
"learning_rate": 7.32201203783319e-05, |
|
"loss": 0.2433, |
|
"step": 60000 |
|
}, |
|
{ |
|
"epoch": 25.73, |
|
"grad_norm": 0.6243628859519958, |
|
"learning_rate": 7.214531384350817e-05, |
|
"loss": 0.2444, |
|
"step": 60500 |
|
}, |
|
{ |
|
"epoch": 25.95, |
|
"grad_norm": 0.6848371624946594, |
|
"learning_rate": 7.107050730868444e-05, |
|
"loss": 0.243, |
|
"step": 61000 |
|
}, |
|
{ |
|
"epoch": 26.0, |
|
"eval_loss": 0.250180721282959, |
|
"eval_runtime": 56.9862, |
|
"eval_samples_per_second": 330.185, |
|
"eval_steps_per_second": 2.58, |
|
"step": 61126 |
|
}, |
|
{ |
|
"epoch": 26.16, |
|
"grad_norm": 0.5177180171012878, |
|
"learning_rate": 6.99957007738607e-05, |
|
"loss": 0.2382, |
|
"step": 61500 |
|
}, |
|
{ |
|
"epoch": 26.37, |
|
"grad_norm": 0.6144821047782898, |
|
"learning_rate": 6.892304385210663e-05, |
|
"loss": 0.2349, |
|
"step": 62000 |
|
}, |
|
{ |
|
"epoch": 26.58, |
|
"grad_norm": 0.5862034559249878, |
|
"learning_rate": 6.78482373172829e-05, |
|
"loss": 0.2385, |
|
"step": 62500 |
|
}, |
|
{ |
|
"epoch": 26.8, |
|
"grad_norm": 0.5799028277397156, |
|
"learning_rate": 6.677343078245916e-05, |
|
"loss": 0.2383, |
|
"step": 63000 |
|
}, |
|
{ |
|
"epoch": 27.0, |
|
"eval_loss": 0.24638937413692474, |
|
"eval_runtime": 56.9276, |
|
"eval_samples_per_second": 330.525, |
|
"eval_steps_per_second": 2.582, |
|
"step": 63477 |
|
}, |
|
{ |
|
"epoch": 27.01, |
|
"grad_norm": 0.6466034054756165, |
|
"learning_rate": 6.569862424763543e-05, |
|
"loss": 0.2375, |
|
"step": 63500 |
|
}, |
|
{ |
|
"epoch": 27.22, |
|
"grad_norm": 0.6161571741104126, |
|
"learning_rate": 6.46238177128117e-05, |
|
"loss": 0.2337, |
|
"step": 64000 |
|
}, |
|
{ |
|
"epoch": 27.44, |
|
"grad_norm": 0.6162907481193542, |
|
"learning_rate": 6.354901117798796e-05, |
|
"loss": 0.2328, |
|
"step": 64500 |
|
}, |
|
{ |
|
"epoch": 27.65, |
|
"grad_norm": 0.6643475890159607, |
|
"learning_rate": 6.247420464316423e-05, |
|
"loss": 0.2316, |
|
"step": 65000 |
|
}, |
|
{ |
|
"epoch": 27.86, |
|
"grad_norm": 0.5605382323265076, |
|
"learning_rate": 6.13993981083405e-05, |
|
"loss": 0.2325, |
|
"step": 65500 |
|
}, |
|
{ |
|
"epoch": 28.0, |
|
"eval_loss": 0.24465161561965942, |
|
"eval_runtime": 57.0032, |
|
"eval_samples_per_second": 330.087, |
|
"eval_steps_per_second": 2.579, |
|
"step": 65828 |
|
}, |
|
{ |
|
"epoch": 28.07, |
|
"grad_norm": 0.6108579635620117, |
|
"learning_rate": 6.032459157351676e-05, |
|
"loss": 0.231, |
|
"step": 66000 |
|
}, |
|
{ |
|
"epoch": 28.29, |
|
"grad_norm": 0.5990898013114929, |
|
"learning_rate": 5.924978503869304e-05, |
|
"loss": 0.2269, |
|
"step": 66500 |
|
}, |
|
{ |
|
"epoch": 28.5, |
|
"grad_norm": 0.5171389579772949, |
|
"learning_rate": 5.817712811693895e-05, |
|
"loss": 0.2281, |
|
"step": 67000 |
|
}, |
|
{ |
|
"epoch": 28.71, |
|
"grad_norm": 0.6478968262672424, |
|
"learning_rate": 5.710232158211523e-05, |
|
"loss": 0.23, |
|
"step": 67500 |
|
}, |
|
{ |
|
"epoch": 28.92, |
|
"grad_norm": 0.5454473495483398, |
|
"learning_rate": 5.602751504729149e-05, |
|
"loss": 0.224, |
|
"step": 68000 |
|
}, |
|
{ |
|
"epoch": 29.0, |
|
"eval_loss": 0.23835687339305878, |
|
"eval_runtime": 56.7685, |
|
"eval_samples_per_second": 331.452, |
|
"eval_steps_per_second": 2.589, |
|
"step": 68179 |
|
}, |
|
{ |
|
"epoch": 29.14, |
|
"grad_norm": 0.5909966826438904, |
|
"learning_rate": 5.4952708512467756e-05, |
|
"loss": 0.2237, |
|
"step": 68500 |
|
}, |
|
{ |
|
"epoch": 29.35, |
|
"grad_norm": 0.7489617466926575, |
|
"learning_rate": 5.387790197764403e-05, |
|
"loss": 0.2218, |
|
"step": 69000 |
|
}, |
|
{ |
|
"epoch": 29.56, |
|
"grad_norm": 0.5880895853042603, |
|
"learning_rate": 5.2803095442820296e-05, |
|
"loss": 0.2224, |
|
"step": 69500 |
|
}, |
|
{ |
|
"epoch": 29.77, |
|
"grad_norm": 0.6126420497894287, |
|
"learning_rate": 5.1728288907996556e-05, |
|
"loss": 0.2235, |
|
"step": 70000 |
|
}, |
|
{ |
|
"epoch": 29.99, |
|
"grad_norm": 0.7013407349586487, |
|
"learning_rate": 5.0653482373172836e-05, |
|
"loss": 0.2189, |
|
"step": 70500 |
|
}, |
|
{ |
|
"epoch": 30.0, |
|
"eval_loss": 0.23514553904533386, |
|
"eval_runtime": 56.6615, |
|
"eval_samples_per_second": 332.077, |
|
"eval_steps_per_second": 2.594, |
|
"step": 70530 |
|
}, |
|
{ |
|
"epoch": 30.2, |
|
"grad_norm": 0.6700057983398438, |
|
"learning_rate": 4.9578675838349096e-05, |
|
"loss": 0.2175, |
|
"step": 71000 |
|
}, |
|
{ |
|
"epoch": 30.41, |
|
"grad_norm": 0.5728330016136169, |
|
"learning_rate": 4.8506018916595015e-05, |
|
"loss": 0.2169, |
|
"step": 71500 |
|
}, |
|
{ |
|
"epoch": 30.63, |
|
"grad_norm": 0.5460176467895508, |
|
"learning_rate": 4.743121238177128e-05, |
|
"loss": 0.2159, |
|
"step": 72000 |
|
}, |
|
{ |
|
"epoch": 30.84, |
|
"grad_norm": 0.5714312791824341, |
|
"learning_rate": 4.63585554600172e-05, |
|
"loss": 0.2131, |
|
"step": 72500 |
|
}, |
|
{ |
|
"epoch": 31.0, |
|
"eval_loss": 0.23221717774868011, |
|
"eval_runtime": 56.8018, |
|
"eval_samples_per_second": 331.257, |
|
"eval_steps_per_second": 2.588, |
|
"step": 72881 |
|
}, |
|
{ |
|
"epoch": 31.05, |
|
"grad_norm": 0.5165086984634399, |
|
"learning_rate": 4.528374892519347e-05, |
|
"loss": 0.2139, |
|
"step": 73000 |
|
}, |
|
{ |
|
"epoch": 31.26, |
|
"grad_norm": 0.6647119522094727, |
|
"learning_rate": 4.4208942390369734e-05, |
|
"loss": 0.2098, |
|
"step": 73500 |
|
}, |
|
{ |
|
"epoch": 31.48, |
|
"grad_norm": 0.6132521629333496, |
|
"learning_rate": 4.3134135855546e-05, |
|
"loss": 0.213, |
|
"step": 74000 |
|
}, |
|
{ |
|
"epoch": 31.69, |
|
"grad_norm": 0.5501160025596619, |
|
"learning_rate": 4.2059329320722275e-05, |
|
"loss": 0.2111, |
|
"step": 74500 |
|
}, |
|
{ |
|
"epoch": 31.9, |
|
"grad_norm": 0.6153804659843445, |
|
"learning_rate": 4.098667239896819e-05, |
|
"loss": 0.2095, |
|
"step": 75000 |
|
}, |
|
{ |
|
"epoch": 32.0, |
|
"eval_loss": 0.22537025809288025, |
|
"eval_runtime": 56.6384, |
|
"eval_samples_per_second": 332.213, |
|
"eval_steps_per_second": 2.595, |
|
"step": 75232 |
|
}, |
|
{ |
|
"epoch": 32.11, |
|
"grad_norm": 0.5587669014930725, |
|
"learning_rate": 3.9911865864144454e-05, |
|
"loss": 0.2102, |
|
"step": 75500 |
|
}, |
|
{ |
|
"epoch": 32.33, |
|
"grad_norm": 0.6313532590866089, |
|
"learning_rate": 3.883705932932073e-05, |
|
"loss": 0.2066, |
|
"step": 76000 |
|
}, |
|
{ |
|
"epoch": 32.54, |
|
"grad_norm": 0.6371473670005798, |
|
"learning_rate": 3.7762252794496994e-05, |
|
"loss": 0.2067, |
|
"step": 76500 |
|
}, |
|
{ |
|
"epoch": 32.75, |
|
"grad_norm": 0.6398050785064697, |
|
"learning_rate": 3.668744625967326e-05, |
|
"loss": 0.2049, |
|
"step": 77000 |
|
}, |
|
{ |
|
"epoch": 32.96, |
|
"grad_norm": 0.6136685013771057, |
|
"learning_rate": 3.561263972484953e-05, |
|
"loss": 0.2072, |
|
"step": 77500 |
|
}, |
|
{ |
|
"epoch": 33.0, |
|
"eval_loss": 0.22324109077453613, |
|
"eval_runtime": 56.9404, |
|
"eval_samples_per_second": 330.451, |
|
"eval_steps_per_second": 2.582, |
|
"step": 77583 |
|
}, |
|
{ |
|
"epoch": 33.18, |
|
"grad_norm": 0.652153730392456, |
|
"learning_rate": 3.45378331900258e-05, |
|
"loss": 0.2024, |
|
"step": 78000 |
|
}, |
|
{ |
|
"epoch": 33.39, |
|
"grad_norm": 0.5828467011451721, |
|
"learning_rate": 3.346302665520207e-05, |
|
"loss": 0.2008, |
|
"step": 78500 |
|
}, |
|
{ |
|
"epoch": 33.6, |
|
"grad_norm": 0.5484930872917175, |
|
"learning_rate": 3.239036973344798e-05, |
|
"loss": 0.2032, |
|
"step": 79000 |
|
}, |
|
{ |
|
"epoch": 33.82, |
|
"grad_norm": 0.5094757676124573, |
|
"learning_rate": 3.1315563198624247e-05, |
|
"loss": 0.2012, |
|
"step": 79500 |
|
}, |
|
{ |
|
"epoch": 34.0, |
|
"eval_loss": 0.22044067084789276, |
|
"eval_runtime": 56.7734, |
|
"eval_samples_per_second": 331.423, |
|
"eval_steps_per_second": 2.589, |
|
"step": 79934 |
|
}, |
|
{ |
|
"epoch": 34.03, |
|
"grad_norm": 0.6236255764961243, |
|
"learning_rate": 3.0240756663800517e-05, |
|
"loss": 0.1984, |
|
"step": 80000 |
|
}, |
|
{ |
|
"epoch": 34.24, |
|
"grad_norm": 0.6019966006278992, |
|
"learning_rate": 2.9165950128976787e-05, |
|
"loss": 0.1958, |
|
"step": 80500 |
|
}, |
|
{ |
|
"epoch": 34.45, |
|
"grad_norm": 0.6366199851036072, |
|
"learning_rate": 2.8093293207222702e-05, |
|
"loss": 0.1988, |
|
"step": 81000 |
|
}, |
|
{ |
|
"epoch": 34.67, |
|
"grad_norm": 0.6756080389022827, |
|
"learning_rate": 2.701848667239897e-05, |
|
"loss": 0.1975, |
|
"step": 81500 |
|
}, |
|
{ |
|
"epoch": 34.88, |
|
"grad_norm": 0.587624728679657, |
|
"learning_rate": 2.594368013757524e-05, |
|
"loss": 0.1938, |
|
"step": 82000 |
|
}, |
|
{ |
|
"epoch": 35.0, |
|
"eval_loss": 0.21603567898273468, |
|
"eval_runtime": 56.8324, |
|
"eval_samples_per_second": 331.079, |
|
"eval_steps_per_second": 2.587, |
|
"step": 82285 |
|
}, |
|
{ |
|
"epoch": 35.09, |
|
"grad_norm": 0.5856329798698425, |
|
"learning_rate": 2.4868873602751506e-05, |
|
"loss": 0.193, |
|
"step": 82500 |
|
}, |
|
{ |
|
"epoch": 35.3, |
|
"grad_norm": 0.6045832633972168, |
|
"learning_rate": 2.3794067067927776e-05, |
|
"loss": 0.1936, |
|
"step": 83000 |
|
}, |
|
{ |
|
"epoch": 35.52, |
|
"grad_norm": 0.5652228593826294, |
|
"learning_rate": 2.2719260533104043e-05, |
|
"loss": 0.194, |
|
"step": 83500 |
|
}, |
|
{ |
|
"epoch": 35.73, |
|
"grad_norm": 0.6424867510795593, |
|
"learning_rate": 2.1644453998280313e-05, |
|
"loss": 0.1896, |
|
"step": 84000 |
|
}, |
|
{ |
|
"epoch": 35.94, |
|
"grad_norm": 0.5543941855430603, |
|
"learning_rate": 2.0571797076526225e-05, |
|
"loss": 0.1921, |
|
"step": 84500 |
|
}, |
|
{ |
|
"epoch": 36.0, |
|
"eval_loss": 0.21675606071949005, |
|
"eval_runtime": 56.9718, |
|
"eval_samples_per_second": 330.269, |
|
"eval_steps_per_second": 2.58, |
|
"step": 84636 |
|
}, |
|
{ |
|
"epoch": 36.15, |
|
"grad_norm": 0.6019295454025269, |
|
"learning_rate": 1.9496990541702495e-05, |
|
"loss": 0.1884, |
|
"step": 85000 |
|
}, |
|
{ |
|
"epoch": 36.37, |
|
"grad_norm": 0.645598292350769, |
|
"learning_rate": 1.8422184006878762e-05, |
|
"loss": 0.1888, |
|
"step": 85500 |
|
}, |
|
{ |
|
"epoch": 36.58, |
|
"grad_norm": 0.6292818188667297, |
|
"learning_rate": 1.7347377472055032e-05, |
|
"loss": 0.1886, |
|
"step": 86000 |
|
}, |
|
{ |
|
"epoch": 36.79, |
|
"grad_norm": 0.5169576406478882, |
|
"learning_rate": 1.62725709372313e-05, |
|
"loss": 0.1898, |
|
"step": 86500 |
|
}, |
|
{ |
|
"epoch": 37.0, |
|
"eval_loss": 0.21486429870128632, |
|
"eval_runtime": 56.8955, |
|
"eval_samples_per_second": 330.711, |
|
"eval_steps_per_second": 2.584, |
|
"step": 86987 |
|
}, |
|
{ |
|
"epoch": 37.01, |
|
"grad_norm": 0.680117130279541, |
|
"learning_rate": 1.5197764402407566e-05, |
|
"loss": 0.187, |
|
"step": 87000 |
|
}, |
|
{ |
|
"epoch": 37.22, |
|
"grad_norm": 0.5357288718223572, |
|
"learning_rate": 1.4122957867583836e-05, |
|
"loss": 0.1841, |
|
"step": 87500 |
|
}, |
|
{ |
|
"epoch": 37.43, |
|
"grad_norm": 0.5753670334815979, |
|
"learning_rate": 1.3048151332760103e-05, |
|
"loss": 0.1829, |
|
"step": 88000 |
|
}, |
|
{ |
|
"epoch": 37.64, |
|
"grad_norm": 0.654707133769989, |
|
"learning_rate": 1.1975494411006018e-05, |
|
"loss": 0.1857, |
|
"step": 88500 |
|
}, |
|
{ |
|
"epoch": 37.86, |
|
"grad_norm": 0.5702338218688965, |
|
"learning_rate": 1.0900687876182287e-05, |
|
"loss": 0.1829, |
|
"step": 89000 |
|
}, |
|
{ |
|
"epoch": 38.0, |
|
"eval_loss": 0.21089451014995575, |
|
"eval_runtime": 56.6335, |
|
"eval_samples_per_second": 332.241, |
|
"eval_steps_per_second": 2.596, |
|
"step": 89338 |
|
}, |
|
{ |
|
"epoch": 38.07, |
|
"grad_norm": 0.6671079993247986, |
|
"learning_rate": 9.825881341358555e-06, |
|
"loss": 0.1846, |
|
"step": 89500 |
|
}, |
|
{ |
|
"epoch": 38.28, |
|
"grad_norm": 0.5165778398513794, |
|
"learning_rate": 8.751074806534824e-06, |
|
"loss": 0.1809, |
|
"step": 90000 |
|
}, |
|
{ |
|
"epoch": 38.49, |
|
"grad_norm": 0.6374748945236206, |
|
"learning_rate": 7.676268271711092e-06, |
|
"loss": 0.1793, |
|
"step": 90500 |
|
}, |
|
{ |
|
"epoch": 38.71, |
|
"grad_norm": 0.5679296851158142, |
|
"learning_rate": 6.60146173688736e-06, |
|
"loss": 0.1804, |
|
"step": 91000 |
|
}, |
|
{ |
|
"epoch": 38.92, |
|
"grad_norm": 0.5970684885978699, |
|
"learning_rate": 5.526655202063629e-06, |
|
"loss": 0.1804, |
|
"step": 91500 |
|
}, |
|
{ |
|
"epoch": 39.0, |
|
"eval_loss": 0.20909027755260468, |
|
"eval_runtime": 56.346, |
|
"eval_samples_per_second": 333.937, |
|
"eval_steps_per_second": 2.609, |
|
"step": 91689 |
|
}, |
|
{ |
|
"epoch": 39.13, |
|
"grad_norm": 0.5444430112838745, |
|
"learning_rate": 4.4539982803095445e-06, |
|
"loss": 0.1804, |
|
"step": 92000 |
|
}, |
|
{ |
|
"epoch": 39.34, |
|
"grad_norm": 0.6086540818214417, |
|
"learning_rate": 3.379191745485813e-06, |
|
"loss": 0.1786, |
|
"step": 92500 |
|
}, |
|
{ |
|
"epoch": 39.56, |
|
"grad_norm": 0.5558965802192688, |
|
"learning_rate": 2.304385210662081e-06, |
|
"loss": 0.1758, |
|
"step": 93000 |
|
}, |
|
{ |
|
"epoch": 39.77, |
|
"grad_norm": 0.5531004667282104, |
|
"learning_rate": 1.2295786758383493e-06, |
|
"loss": 0.1788, |
|
"step": 93500 |
|
}, |
|
{ |
|
"epoch": 39.98, |
|
"grad_norm": 0.5611669421195984, |
|
"learning_rate": 1.5477214101461737e-07, |
|
"loss": 0.1773, |
|
"step": 94000 |
|
}, |
|
{ |
|
"epoch": 40.0, |
|
"eval_loss": 0.20976465940475464, |
|
"eval_runtime": 56.6405, |
|
"eval_samples_per_second": 332.2, |
|
"eval_steps_per_second": 2.595, |
|
"step": 94040 |
|
}, |
|
{ |
|
"epoch": 40.0, |
|
"step": 94040, |
|
"total_flos": 3.6650496018087936e+18, |
|
"train_loss": 0.2880888063569923, |
|
"train_runtime": 54838.0441, |
|
"train_samples_per_second": 109.795, |
|
"train_steps_per_second": 1.715 |
|
} |
|
], |
|
"logging_steps": 500, |
|
"max_steps": 94040, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 40, |
|
"save_steps": 500, |
|
"total_flos": 3.6650496018087936e+18, |
|
"train_batch_size": 64, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|