|
{ |
|
"best_metric": 0.23903648555278778, |
|
"best_model_checkpoint": "/root/pretrain_executions/pretrain_utg4java_220m_seq1024/checkpoint-38422", |
|
"epoch": 49.992029332058024, |
|
"eval_steps": 500, |
|
"global_step": 39200, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.3953451299218875, |
|
"grad_norm": 0.6701709628105164, |
|
"learning_rate": 7.908163265306123e-05, |
|
"loss": 0.5421, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.790690259843775, |
|
"grad_norm": 0.7041985988616943, |
|
"learning_rate": 0.00015816326530612246, |
|
"loss": 0.517, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.9998405866411605, |
|
"eval_loss": 0.4792475402355194, |
|
"eval_runtime": 146.6334, |
|
"eval_samples_per_second": 85.567, |
|
"eval_steps_per_second": 2.68, |
|
"step": 784 |
|
}, |
|
{ |
|
"epoch": 1.1860353897656624, |
|
"grad_norm": 0.7499191164970398, |
|
"learning_rate": 0.00019924510620574761, |
|
"loss": 0.5394, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 1.5813805196875497, |
|
"grad_norm": 0.6494282484054565, |
|
"learning_rate": 0.00019763119533527698, |
|
"loss": 0.5327, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 1.9767256496094374, |
|
"grad_norm": 0.5610156059265137, |
|
"learning_rate": 0.00019601728446480633, |
|
"loss": 0.518, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 1.9996811732823212, |
|
"eval_loss": 0.45565617084503174, |
|
"eval_runtime": 146.6648, |
|
"eval_samples_per_second": 85.549, |
|
"eval_steps_per_second": 2.68, |
|
"step": 1568 |
|
}, |
|
{ |
|
"epoch": 2.3720707795313247, |
|
"grad_norm": 0.6261674761772156, |
|
"learning_rate": 0.00019440337359433573, |
|
"loss": 0.5076, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 2.767415909453212, |
|
"grad_norm": 0.5601200461387634, |
|
"learning_rate": 0.00019278946272386507, |
|
"loss": 0.4972, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 2.9995217599234816, |
|
"eval_loss": 0.4429556429386139, |
|
"eval_runtime": 146.6472, |
|
"eval_samples_per_second": 85.559, |
|
"eval_steps_per_second": 2.68, |
|
"step": 2352 |
|
}, |
|
{ |
|
"epoch": 3.1627610393751, |
|
"grad_norm": 0.6102643609046936, |
|
"learning_rate": 0.00019117555185339441, |
|
"loss": 0.4957, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 3.558106169296987, |
|
"grad_norm": 0.6246281266212463, |
|
"learning_rate": 0.00018956164098292379, |
|
"loss": 0.4827, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 3.9534512992188744, |
|
"grad_norm": 0.6781056523323059, |
|
"learning_rate": 0.00018794773011245316, |
|
"loss": 0.4736, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 3.999362346564642, |
|
"eval_loss": 0.42357587814331055, |
|
"eval_runtime": 146.7171, |
|
"eval_samples_per_second": 85.518, |
|
"eval_steps_per_second": 2.679, |
|
"step": 3136 |
|
}, |
|
{ |
|
"epoch": 4.348796429140762, |
|
"grad_norm": 0.5225201845169067, |
|
"learning_rate": 0.00018633381924198253, |
|
"loss": 0.4686, |
|
"step": 3410 |
|
}, |
|
{ |
|
"epoch": 4.7441415590626494, |
|
"grad_norm": 0.5367516279220581, |
|
"learning_rate": 0.00018471990837151187, |
|
"loss": 0.4575, |
|
"step": 3720 |
|
}, |
|
{ |
|
"epoch": 4.999202933205803, |
|
"eval_loss": 0.4168592095375061, |
|
"eval_runtime": 146.7474, |
|
"eval_samples_per_second": 85.501, |
|
"eval_steps_per_second": 2.678, |
|
"step": 3920 |
|
}, |
|
{ |
|
"epoch": 5.139486688984537, |
|
"grad_norm": 0.5979415774345398, |
|
"learning_rate": 0.00018310599750104124, |
|
"loss": 0.4591, |
|
"step": 4030 |
|
}, |
|
{ |
|
"epoch": 5.534831818906424, |
|
"grad_norm": 0.6041168570518494, |
|
"learning_rate": 0.0001814920866305706, |
|
"loss": 0.4506, |
|
"step": 4340 |
|
}, |
|
{ |
|
"epoch": 5.930176948828311, |
|
"grad_norm": 0.5398473739624023, |
|
"learning_rate": 0.00017987817576009998, |
|
"loss": 0.4499, |
|
"step": 4650 |
|
}, |
|
{ |
|
"epoch": 5.999043519846963, |
|
"eval_loss": 0.3998393714427948, |
|
"eval_runtime": 146.7067, |
|
"eval_samples_per_second": 85.524, |
|
"eval_steps_per_second": 2.679, |
|
"step": 4704 |
|
}, |
|
{ |
|
"epoch": 6.3255220787502, |
|
"grad_norm": 0.5446251630783081, |
|
"learning_rate": 0.00017826426488962933, |
|
"loss": 0.4444, |
|
"step": 4960 |
|
}, |
|
{ |
|
"epoch": 6.720867208672087, |
|
"grad_norm": 0.564083993434906, |
|
"learning_rate": 0.00017665556018325697, |
|
"loss": 0.4401, |
|
"step": 5270 |
|
}, |
|
{ |
|
"epoch": 6.9988841064881235, |
|
"eval_loss": 0.3943786323070526, |
|
"eval_runtime": 146.7223, |
|
"eval_samples_per_second": 85.515, |
|
"eval_steps_per_second": 2.679, |
|
"step": 5488 |
|
}, |
|
{ |
|
"epoch": 7.116212338593974, |
|
"grad_norm": 0.5197238326072693, |
|
"learning_rate": 0.00017504164931278634, |
|
"loss": 0.4349, |
|
"step": 5580 |
|
}, |
|
{ |
|
"epoch": 7.5115574685158615, |
|
"grad_norm": 0.5063862204551697, |
|
"learning_rate": 0.0001734277384423157, |
|
"loss": 0.4274, |
|
"step": 5890 |
|
}, |
|
{ |
|
"epoch": 7.906902598437749, |
|
"grad_norm": 0.8238950371742249, |
|
"learning_rate": 0.00017181382757184508, |
|
"loss": 0.4275, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_loss": 0.38801178336143494, |
|
"eval_runtime": 146.7154, |
|
"eval_samples_per_second": 85.519, |
|
"eval_steps_per_second": 2.679, |
|
"step": 6273 |
|
}, |
|
{ |
|
"epoch": 8.302247728359637, |
|
"grad_norm": 0.4785802662372589, |
|
"learning_rate": 0.00017019991670137442, |
|
"loss": 0.4218, |
|
"step": 6510 |
|
}, |
|
{ |
|
"epoch": 8.697592858281524, |
|
"grad_norm": 0.5460196137428284, |
|
"learning_rate": 0.0001685860058309038, |
|
"loss": 0.4165, |
|
"step": 6820 |
|
}, |
|
{ |
|
"epoch": 8.999840586641161, |
|
"eval_loss": 0.3786679804325104, |
|
"eval_runtime": 146.6889, |
|
"eval_samples_per_second": 85.535, |
|
"eval_steps_per_second": 2.679, |
|
"step": 7057 |
|
}, |
|
{ |
|
"epoch": 9.092937988203412, |
|
"grad_norm": 0.5532106161117554, |
|
"learning_rate": 0.00016697209496043317, |
|
"loss": 0.4147, |
|
"step": 7130 |
|
}, |
|
{ |
|
"epoch": 9.488283118125299, |
|
"grad_norm": 0.5270036458969116, |
|
"learning_rate": 0.00016535818408996254, |
|
"loss": 0.4054, |
|
"step": 7440 |
|
}, |
|
{ |
|
"epoch": 9.883628248047186, |
|
"grad_norm": 0.5107512474060059, |
|
"learning_rate": 0.00016374427321949188, |
|
"loss": 0.407, |
|
"step": 7750 |
|
}, |
|
{ |
|
"epoch": 9.99968117328232, |
|
"eval_loss": 0.3678111732006073, |
|
"eval_runtime": 146.7168, |
|
"eval_samples_per_second": 85.519, |
|
"eval_steps_per_second": 2.679, |
|
"step": 7841 |
|
}, |
|
{ |
|
"epoch": 10.278973377969074, |
|
"grad_norm": 0.4663056433200836, |
|
"learning_rate": 0.00016213036234902125, |
|
"loss": 0.4001, |
|
"step": 8060 |
|
}, |
|
{ |
|
"epoch": 10.67431850789096, |
|
"grad_norm": 0.5166866183280945, |
|
"learning_rate": 0.00016051645147855062, |
|
"loss": 0.4012, |
|
"step": 8370 |
|
}, |
|
{ |
|
"epoch": 10.999521759923482, |
|
"eval_loss": 0.36857831478118896, |
|
"eval_runtime": 146.752, |
|
"eval_samples_per_second": 85.498, |
|
"eval_steps_per_second": 2.678, |
|
"step": 8625 |
|
}, |
|
{ |
|
"epoch": 11.069663637812848, |
|
"grad_norm": 0.5623896718025208, |
|
"learning_rate": 0.00015890254060807997, |
|
"loss": 0.3967, |
|
"step": 8680 |
|
}, |
|
{ |
|
"epoch": 11.465008767734735, |
|
"grad_norm": 0.4826233983039856, |
|
"learning_rate": 0.00015728862973760934, |
|
"loss": 0.3902, |
|
"step": 8990 |
|
}, |
|
{ |
|
"epoch": 11.860353897656623, |
|
"grad_norm": 0.5024587512016296, |
|
"learning_rate": 0.00015567471886713868, |
|
"loss": 0.3889, |
|
"step": 9300 |
|
}, |
|
{ |
|
"epoch": 11.999362346564642, |
|
"eval_loss": 0.35674843192100525, |
|
"eval_runtime": 146.7093, |
|
"eval_samples_per_second": 85.523, |
|
"eval_steps_per_second": 2.679, |
|
"step": 9409 |
|
}, |
|
{ |
|
"epoch": 12.255699027578512, |
|
"grad_norm": 0.4992258846759796, |
|
"learning_rate": 0.00015406080799666805, |
|
"loss": 0.3838, |
|
"step": 9610 |
|
}, |
|
{ |
|
"epoch": 12.6510441575004, |
|
"grad_norm": 0.4781612455844879, |
|
"learning_rate": 0.00015244689712619742, |
|
"loss": 0.3789, |
|
"step": 9920 |
|
}, |
|
{ |
|
"epoch": 12.999202933205803, |
|
"eval_loss": 0.35254529118537903, |
|
"eval_runtime": 146.7424, |
|
"eval_samples_per_second": 85.504, |
|
"eval_steps_per_second": 2.678, |
|
"step": 10193 |
|
}, |
|
{ |
|
"epoch": 13.046389287422286, |
|
"grad_norm": 0.49535173177719116, |
|
"learning_rate": 0.0001508329862557268, |
|
"loss": 0.3775, |
|
"step": 10230 |
|
}, |
|
{ |
|
"epoch": 13.441734417344174, |
|
"grad_norm": 0.5237115621566772, |
|
"learning_rate": 0.00014922428154935443, |
|
"loss": 0.3734, |
|
"step": 10540 |
|
}, |
|
{ |
|
"epoch": 13.837079547266061, |
|
"grad_norm": 0.4549529552459717, |
|
"learning_rate": 0.0001476155768429821, |
|
"loss": 0.37, |
|
"step": 10850 |
|
}, |
|
{ |
|
"epoch": 13.999043519846964, |
|
"eval_loss": 0.3443816006183624, |
|
"eval_runtime": 146.6954, |
|
"eval_samples_per_second": 85.531, |
|
"eval_steps_per_second": 2.679, |
|
"step": 10977 |
|
}, |
|
{ |
|
"epoch": 14.232424677187948, |
|
"grad_norm": 0.520125150680542, |
|
"learning_rate": 0.00014600166597251147, |
|
"loss": 0.3647, |
|
"step": 11160 |
|
}, |
|
{ |
|
"epoch": 14.627769807109836, |
|
"grad_norm": 0.5332316160202026, |
|
"learning_rate": 0.00014438775510204084, |
|
"loss": 0.3678, |
|
"step": 11470 |
|
}, |
|
{ |
|
"epoch": 14.998884106488124, |
|
"eval_loss": 0.3436979055404663, |
|
"eval_runtime": 146.7179, |
|
"eval_samples_per_second": 85.518, |
|
"eval_steps_per_second": 2.679, |
|
"step": 11761 |
|
}, |
|
{ |
|
"epoch": 15.023114937031723, |
|
"grad_norm": 0.47955256700515747, |
|
"learning_rate": 0.0001427738442315702, |
|
"loss": 0.3664, |
|
"step": 11780 |
|
}, |
|
{ |
|
"epoch": 15.41846006695361, |
|
"grad_norm": 0.48371678590774536, |
|
"learning_rate": 0.00014115993336109953, |
|
"loss": 0.36, |
|
"step": 12090 |
|
}, |
|
{ |
|
"epoch": 15.813805196875498, |
|
"grad_norm": 0.4756961166858673, |
|
"learning_rate": 0.0001395460224906289, |
|
"loss": 0.3577, |
|
"step": 12400 |
|
}, |
|
{ |
|
"epoch": 16.0, |
|
"eval_loss": 0.3342459499835968, |
|
"eval_runtime": 146.7158, |
|
"eval_samples_per_second": 85.519, |
|
"eval_steps_per_second": 2.679, |
|
"step": 12546 |
|
}, |
|
{ |
|
"epoch": 16.209150326797385, |
|
"grad_norm": 0.5593659281730652, |
|
"learning_rate": 0.00013793211162015827, |
|
"loss": 0.3536, |
|
"step": 12710 |
|
}, |
|
{ |
|
"epoch": 16.604495456719274, |
|
"grad_norm": 0.6009001135826111, |
|
"learning_rate": 0.00013631820074968764, |
|
"loss": 0.3539, |
|
"step": 13020 |
|
}, |
|
{ |
|
"epoch": 16.99984058664116, |
|
"grad_norm": 0.48707565665245056, |
|
"learning_rate": 0.000134704289879217, |
|
"loss": 0.3522, |
|
"step": 13330 |
|
}, |
|
{ |
|
"epoch": 16.99984058664116, |
|
"eval_loss": 0.33101820945739746, |
|
"eval_runtime": 146.73, |
|
"eval_samples_per_second": 85.511, |
|
"eval_steps_per_second": 2.678, |
|
"step": 13330 |
|
}, |
|
{ |
|
"epoch": 17.39518571656305, |
|
"grad_norm": 0.4787095785140991, |
|
"learning_rate": 0.00013309037900874636, |
|
"loss": 0.343, |
|
"step": 13640 |
|
}, |
|
{ |
|
"epoch": 17.790530846484934, |
|
"grad_norm": 0.4643840789794922, |
|
"learning_rate": 0.00013147646813827573, |
|
"loss": 0.3466, |
|
"step": 13950 |
|
}, |
|
{ |
|
"epoch": 17.999681173282323, |
|
"eval_loss": 0.3281005620956421, |
|
"eval_runtime": 146.7108, |
|
"eval_samples_per_second": 85.522, |
|
"eval_steps_per_second": 2.679, |
|
"step": 14114 |
|
}, |
|
{ |
|
"epoch": 18.185875976406823, |
|
"grad_norm": 0.4819445312023163, |
|
"learning_rate": 0.0001298625572678051, |
|
"loss": 0.3415, |
|
"step": 14260 |
|
}, |
|
{ |
|
"epoch": 18.58122110632871, |
|
"grad_norm": 0.46530964970588684, |
|
"learning_rate": 0.00012824864639733444, |
|
"loss": 0.3393, |
|
"step": 14570 |
|
}, |
|
{ |
|
"epoch": 18.976566236250598, |
|
"grad_norm": 0.5159475207328796, |
|
"learning_rate": 0.00012663473552686382, |
|
"loss": 0.3377, |
|
"step": 14880 |
|
}, |
|
{ |
|
"epoch": 18.999521759923482, |
|
"eval_loss": 0.32132235169410706, |
|
"eval_runtime": 146.7396, |
|
"eval_samples_per_second": 85.505, |
|
"eval_steps_per_second": 2.678, |
|
"step": 14898 |
|
}, |
|
{ |
|
"epoch": 19.371911366172487, |
|
"grad_norm": 0.45964986085891724, |
|
"learning_rate": 0.00012502082465639319, |
|
"loss": 0.3348, |
|
"step": 15190 |
|
}, |
|
{ |
|
"epoch": 19.767256496094372, |
|
"grad_norm": 0.49627387523651123, |
|
"learning_rate": 0.00012340691378592253, |
|
"loss": 0.3316, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 19.99936234656464, |
|
"eval_loss": 0.31626757979393005, |
|
"eval_runtime": 146.7396, |
|
"eval_samples_per_second": 85.505, |
|
"eval_steps_per_second": 2.678, |
|
"step": 15682 |
|
}, |
|
{ |
|
"epoch": 20.16260162601626, |
|
"grad_norm": 0.48719242215156555, |
|
"learning_rate": 0.0001217930029154519, |
|
"loss": 0.3294, |
|
"step": 15810 |
|
}, |
|
{ |
|
"epoch": 20.557946755938147, |
|
"grad_norm": 0.5443927049636841, |
|
"learning_rate": 0.00012017909204498126, |
|
"loss": 0.3261, |
|
"step": 16120 |
|
}, |
|
{ |
|
"epoch": 20.953291885860036, |
|
"grad_norm": 0.4637634754180908, |
|
"learning_rate": 0.00011856518117451063, |
|
"loss": 0.3255, |
|
"step": 16430 |
|
}, |
|
{ |
|
"epoch": 20.9992029332058, |
|
"eval_loss": 0.31501948833465576, |
|
"eval_runtime": 146.7591, |
|
"eval_samples_per_second": 85.494, |
|
"eval_steps_per_second": 2.678, |
|
"step": 16466 |
|
}, |
|
{ |
|
"epoch": 21.34863701578192, |
|
"grad_norm": 0.46018585562705994, |
|
"learning_rate": 0.00011695127030403999, |
|
"loss": 0.3198, |
|
"step": 16740 |
|
}, |
|
{ |
|
"epoch": 21.74398214570381, |
|
"grad_norm": 0.5096014738082886, |
|
"learning_rate": 0.00011533735943356936, |
|
"loss": 0.3226, |
|
"step": 17050 |
|
}, |
|
{ |
|
"epoch": 21.999043519846964, |
|
"eval_loss": 0.30657365918159485, |
|
"eval_runtime": 146.7538, |
|
"eval_samples_per_second": 85.497, |
|
"eval_steps_per_second": 2.678, |
|
"step": 17250 |
|
}, |
|
{ |
|
"epoch": 22.139327275625696, |
|
"grad_norm": 0.44816407561302185, |
|
"learning_rate": 0.00011372344856309872, |
|
"loss": 0.3178, |
|
"step": 17360 |
|
}, |
|
{ |
|
"epoch": 22.534672405547585, |
|
"grad_norm": 0.437168151140213, |
|
"learning_rate": 0.00011211474385672638, |
|
"loss": 0.3172, |
|
"step": 17670 |
|
}, |
|
{ |
|
"epoch": 22.93001753546947, |
|
"grad_norm": 0.5836613774299622, |
|
"learning_rate": 0.00011050083298625573, |
|
"loss": 0.3121, |
|
"step": 17980 |
|
}, |
|
{ |
|
"epoch": 22.998884106488124, |
|
"eval_loss": 0.30263882875442505, |
|
"eval_runtime": 146.7108, |
|
"eval_samples_per_second": 85.522, |
|
"eval_steps_per_second": 2.679, |
|
"step": 18034 |
|
}, |
|
{ |
|
"epoch": 23.32536266539136, |
|
"grad_norm": 0.4829230308532715, |
|
"learning_rate": 0.00010888692211578508, |
|
"loss": 0.3079, |
|
"step": 18290 |
|
}, |
|
{ |
|
"epoch": 23.720707795313245, |
|
"grad_norm": 0.4485584497451782, |
|
"learning_rate": 0.00010727821740941275, |
|
"loss": 0.3105, |
|
"step": 18600 |
|
}, |
|
{ |
|
"epoch": 24.0, |
|
"eval_loss": 0.3048921227455139, |
|
"eval_runtime": 146.8103, |
|
"eval_samples_per_second": 85.464, |
|
"eval_steps_per_second": 2.677, |
|
"step": 18819 |
|
}, |
|
{ |
|
"epoch": 24.116052925235135, |
|
"grad_norm": 0.5251662135124207, |
|
"learning_rate": 0.00010566430653894211, |
|
"loss": 0.3052, |
|
"step": 18910 |
|
}, |
|
{ |
|
"epoch": 24.511398055157024, |
|
"grad_norm": 0.4876725971698761, |
|
"learning_rate": 0.00010405039566847148, |
|
"loss": 0.3045, |
|
"step": 19220 |
|
}, |
|
{ |
|
"epoch": 24.90674318507891, |
|
"grad_norm": 0.5600521564483643, |
|
"learning_rate": 0.00010243648479800084, |
|
"loss": 0.3048, |
|
"step": 19530 |
|
}, |
|
{ |
|
"epoch": 24.99984058664116, |
|
"eval_loss": 0.2986990809440613, |
|
"eval_runtime": 146.734, |
|
"eval_samples_per_second": 85.508, |
|
"eval_steps_per_second": 2.678, |
|
"step": 19603 |
|
}, |
|
{ |
|
"epoch": 25.3020883150008, |
|
"grad_norm": 0.5170055627822876, |
|
"learning_rate": 0.00010082257392753021, |
|
"loss": 0.3003, |
|
"step": 19840 |
|
}, |
|
{ |
|
"epoch": 25.697433444922684, |
|
"grad_norm": 0.48347124457359314, |
|
"learning_rate": 9.920866305705956e-05, |
|
"loss": 0.2983, |
|
"step": 20150 |
|
}, |
|
{ |
|
"epoch": 25.999681173282323, |
|
"eval_loss": 0.2916134297847748, |
|
"eval_runtime": 146.7271, |
|
"eval_samples_per_second": 85.512, |
|
"eval_steps_per_second": 2.678, |
|
"step": 20387 |
|
}, |
|
{ |
|
"epoch": 26.092778574844573, |
|
"grad_norm": 0.48907041549682617, |
|
"learning_rate": 9.759475218658892e-05, |
|
"loss": 0.2959, |
|
"step": 20460 |
|
}, |
|
{ |
|
"epoch": 26.48812370476646, |
|
"grad_norm": 0.5060804486274719, |
|
"learning_rate": 9.598084131611829e-05, |
|
"loss": 0.2923, |
|
"step": 20770 |
|
}, |
|
{ |
|
"epoch": 26.883468834688347, |
|
"grad_norm": 0.4843296706676483, |
|
"learning_rate": 9.436693044564765e-05, |
|
"loss": 0.2918, |
|
"step": 21080 |
|
}, |
|
{ |
|
"epoch": 26.999521759923482, |
|
"eval_loss": 0.29019656777381897, |
|
"eval_runtime": 146.6934, |
|
"eval_samples_per_second": 85.532, |
|
"eval_steps_per_second": 2.679, |
|
"step": 21171 |
|
}, |
|
{ |
|
"epoch": 27.278813964610233, |
|
"grad_norm": 0.42266514897346497, |
|
"learning_rate": 9.275301957517701e-05, |
|
"loss": 0.2901, |
|
"step": 21390 |
|
}, |
|
{ |
|
"epoch": 27.674159094532122, |
|
"grad_norm": 0.5161967873573303, |
|
"learning_rate": 9.113910870470638e-05, |
|
"loss": 0.2889, |
|
"step": 21700 |
|
}, |
|
{ |
|
"epoch": 27.99936234656464, |
|
"eval_loss": 0.2833983302116394, |
|
"eval_runtime": 146.7193, |
|
"eval_samples_per_second": 85.517, |
|
"eval_steps_per_second": 2.679, |
|
"step": 21955 |
|
}, |
|
{ |
|
"epoch": 28.069504224454008, |
|
"grad_norm": 0.4523755609989166, |
|
"learning_rate": 8.952519783423574e-05, |
|
"loss": 0.2871, |
|
"step": 22010 |
|
}, |
|
{ |
|
"epoch": 28.464849354375897, |
|
"grad_norm": 0.44348961114883423, |
|
"learning_rate": 8.791128696376511e-05, |
|
"loss": 0.2847, |
|
"step": 22320 |
|
}, |
|
{ |
|
"epoch": 28.860194484297786, |
|
"grad_norm": 0.6467667818069458, |
|
"learning_rate": 8.630258225739276e-05, |
|
"loss": 0.2844, |
|
"step": 22630 |
|
}, |
|
{ |
|
"epoch": 28.9992029332058, |
|
"eval_loss": 0.28629302978515625, |
|
"eval_runtime": 146.7547, |
|
"eval_samples_per_second": 85.496, |
|
"eval_steps_per_second": 2.678, |
|
"step": 22739 |
|
}, |
|
{ |
|
"epoch": 29.25553961421967, |
|
"grad_norm": 0.4734992980957031, |
|
"learning_rate": 8.468867138692213e-05, |
|
"loss": 0.2787, |
|
"step": 22940 |
|
}, |
|
{ |
|
"epoch": 29.65088474414156, |
|
"grad_norm": 0.4827498495578766, |
|
"learning_rate": 8.307476051645148e-05, |
|
"loss": 0.2787, |
|
"step": 23250 |
|
}, |
|
{ |
|
"epoch": 29.999043519846964, |
|
"eval_loss": 0.2794826626777649, |
|
"eval_runtime": 146.9198, |
|
"eval_samples_per_second": 85.4, |
|
"eval_steps_per_second": 2.675, |
|
"step": 23523 |
|
}, |
|
{ |
|
"epoch": 30.046229874063446, |
|
"grad_norm": 0.5005486607551575, |
|
"learning_rate": 8.146084964598085e-05, |
|
"loss": 0.2758, |
|
"step": 23560 |
|
}, |
|
{ |
|
"epoch": 30.441575003985335, |
|
"grad_norm": 0.5253671407699585, |
|
"learning_rate": 7.98469387755102e-05, |
|
"loss": 0.2761, |
|
"step": 23870 |
|
}, |
|
{ |
|
"epoch": 30.83692013390722, |
|
"grad_norm": 0.472740113735199, |
|
"learning_rate": 7.823302790503957e-05, |
|
"loss": 0.2726, |
|
"step": 24180 |
|
}, |
|
{ |
|
"epoch": 30.998884106488124, |
|
"eval_loss": 0.2779182493686676, |
|
"eval_runtime": 146.7777, |
|
"eval_samples_per_second": 85.483, |
|
"eval_steps_per_second": 2.678, |
|
"step": 24307 |
|
}, |
|
{ |
|
"epoch": 31.23226526382911, |
|
"grad_norm": 0.5228144526481628, |
|
"learning_rate": 7.661911703456893e-05, |
|
"loss": 0.2717, |
|
"step": 24490 |
|
}, |
|
{ |
|
"epoch": 31.627610393750995, |
|
"grad_norm": 0.47681719064712524, |
|
"learning_rate": 7.501041232819659e-05, |
|
"loss": 0.2664, |
|
"step": 24800 |
|
}, |
|
{ |
|
"epoch": 32.0, |
|
"eval_loss": 0.27039337158203125, |
|
"eval_runtime": 146.805, |
|
"eval_samples_per_second": 85.467, |
|
"eval_steps_per_second": 2.677, |
|
"step": 25092 |
|
}, |
|
{ |
|
"epoch": 32.022955523672884, |
|
"grad_norm": 0.4973162114620209, |
|
"learning_rate": 7.339650145772596e-05, |
|
"loss": 0.268, |
|
"step": 25110 |
|
}, |
|
{ |
|
"epoch": 32.41830065359477, |
|
"grad_norm": 0.5740240216255188, |
|
"learning_rate": 7.178259058725531e-05, |
|
"loss": 0.2668, |
|
"step": 25420 |
|
}, |
|
{ |
|
"epoch": 32.813645783516655, |
|
"grad_norm": 0.4842962622642517, |
|
"learning_rate": 7.016867971678468e-05, |
|
"loss": 0.2631, |
|
"step": 25730 |
|
}, |
|
{ |
|
"epoch": 32.99984058664116, |
|
"eval_loss": 0.2733234763145447, |
|
"eval_runtime": 146.7109, |
|
"eval_samples_per_second": 85.522, |
|
"eval_steps_per_second": 2.679, |
|
"step": 25876 |
|
}, |
|
{ |
|
"epoch": 33.20899091343855, |
|
"grad_norm": 0.499452143907547, |
|
"learning_rate": 6.855476884631404e-05, |
|
"loss": 0.263, |
|
"step": 26040 |
|
}, |
|
{ |
|
"epoch": 33.60433604336043, |
|
"grad_norm": 0.4541178345680237, |
|
"learning_rate": 6.69408579758434e-05, |
|
"loss": 0.2603, |
|
"step": 26350 |
|
}, |
|
{ |
|
"epoch": 33.99968117328232, |
|
"grad_norm": 0.5029833912849426, |
|
"learning_rate": 6.532694710537276e-05, |
|
"loss": 0.258, |
|
"step": 26660 |
|
}, |
|
{ |
|
"epoch": 33.99968117328232, |
|
"eval_loss": 0.26625362038612366, |
|
"eval_runtime": 146.7319, |
|
"eval_samples_per_second": 85.51, |
|
"eval_steps_per_second": 2.678, |
|
"step": 26660 |
|
}, |
|
{ |
|
"epoch": 34.39502630320421, |
|
"grad_norm": 0.5090352892875671, |
|
"learning_rate": 6.371303623490213e-05, |
|
"loss": 0.2544, |
|
"step": 26970 |
|
}, |
|
{ |
|
"epoch": 34.7903714331261, |
|
"grad_norm": 0.4605717360973358, |
|
"learning_rate": 6.209912536443149e-05, |
|
"loss": 0.254, |
|
"step": 27280 |
|
}, |
|
{ |
|
"epoch": 34.99952175992348, |
|
"eval_loss": 0.26669949293136597, |
|
"eval_runtime": 146.7117, |
|
"eval_samples_per_second": 85.521, |
|
"eval_steps_per_second": 2.679, |
|
"step": 27444 |
|
}, |
|
{ |
|
"epoch": 35.18571656304798, |
|
"grad_norm": 0.46216222643852234, |
|
"learning_rate": 6.048521449396085e-05, |
|
"loss": 0.254, |
|
"step": 27590 |
|
}, |
|
{ |
|
"epoch": 35.58106169296987, |
|
"grad_norm": 0.49629315733909607, |
|
"learning_rate": 5.8871303623490214e-05, |
|
"loss": 0.2521, |
|
"step": 27900 |
|
}, |
|
{ |
|
"epoch": 35.97640682289176, |
|
"grad_norm": 0.48311081528663635, |
|
"learning_rate": 5.725739275301958e-05, |
|
"loss": 0.2493, |
|
"step": 28210 |
|
}, |
|
{ |
|
"epoch": 35.999362346564645, |
|
"eval_loss": 0.26483407616615295, |
|
"eval_runtime": 146.7384, |
|
"eval_samples_per_second": 85.506, |
|
"eval_steps_per_second": 2.678, |
|
"step": 28228 |
|
}, |
|
{ |
|
"epoch": 36.371751952813646, |
|
"grad_norm": 0.43428850173950195, |
|
"learning_rate": 5.564348188254894e-05, |
|
"loss": 0.2455, |
|
"step": 28520 |
|
}, |
|
{ |
|
"epoch": 36.76709708273553, |
|
"grad_norm": 0.4786287844181061, |
|
"learning_rate": 5.4029571012078306e-05, |
|
"loss": 0.2454, |
|
"step": 28830 |
|
}, |
|
{ |
|
"epoch": 36.9992029332058, |
|
"eval_loss": 0.26446378231048584, |
|
"eval_runtime": 146.73, |
|
"eval_samples_per_second": 85.511, |
|
"eval_steps_per_second": 2.678, |
|
"step": 29012 |
|
}, |
|
{ |
|
"epoch": 37.16244221265742, |
|
"grad_norm": 0.5931326746940613, |
|
"learning_rate": 5.241566014160767e-05, |
|
"loss": 0.247, |
|
"step": 29140 |
|
}, |
|
{ |
|
"epoch": 37.55778734257931, |
|
"grad_norm": 0.5031745433807373, |
|
"learning_rate": 5.0801749271137035e-05, |
|
"loss": 0.2425, |
|
"step": 29450 |
|
}, |
|
{ |
|
"epoch": 37.953132472501196, |
|
"grad_norm": 0.5432093739509583, |
|
"learning_rate": 4.918783840066639e-05, |
|
"loss": 0.2416, |
|
"step": 29760 |
|
}, |
|
{ |
|
"epoch": 37.999043519846964, |
|
"eval_loss": 0.2601180672645569, |
|
"eval_runtime": 146.6811, |
|
"eval_samples_per_second": 85.539, |
|
"eval_steps_per_second": 2.679, |
|
"step": 29796 |
|
}, |
|
{ |
|
"epoch": 38.34847760242308, |
|
"grad_norm": 0.5319362878799438, |
|
"learning_rate": 4.7573927530195756e-05, |
|
"loss": 0.2392, |
|
"step": 30070 |
|
}, |
|
{ |
|
"epoch": 38.743822732344974, |
|
"grad_norm": 0.5319586396217346, |
|
"learning_rate": 4.596001665972512e-05, |
|
"loss": 0.2368, |
|
"step": 30380 |
|
}, |
|
{ |
|
"epoch": 38.99888410648813, |
|
"eval_loss": 0.25446435809135437, |
|
"eval_runtime": 146.6972, |
|
"eval_samples_per_second": 85.53, |
|
"eval_steps_per_second": 2.679, |
|
"step": 30580 |
|
}, |
|
{ |
|
"epoch": 39.13916786226686, |
|
"grad_norm": 0.4489250183105469, |
|
"learning_rate": 4.434610578925448e-05, |
|
"loss": 0.2368, |
|
"step": 30690 |
|
}, |
|
{ |
|
"epoch": 39.534512992188745, |
|
"grad_norm": 0.48287880420684814, |
|
"learning_rate": 4.273740108288213e-05, |
|
"loss": 0.2353, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 39.92985812211063, |
|
"grad_norm": 0.49850553274154663, |
|
"learning_rate": 4.1123490212411495e-05, |
|
"loss": 0.2321, |
|
"step": 31310 |
|
}, |
|
{ |
|
"epoch": 40.0, |
|
"eval_loss": 0.24883659183979034, |
|
"eval_runtime": 146.7363, |
|
"eval_samples_per_second": 85.507, |
|
"eval_steps_per_second": 2.678, |
|
"step": 31365 |
|
}, |
|
{ |
|
"epoch": 40.32520325203252, |
|
"grad_norm": 0.4667394161224365, |
|
"learning_rate": 3.9514785506039155e-05, |
|
"loss": 0.2337, |
|
"step": 31620 |
|
}, |
|
{ |
|
"epoch": 40.72054838195441, |
|
"grad_norm": 0.5053902864456177, |
|
"learning_rate": 3.790087463556852e-05, |
|
"loss": 0.2284, |
|
"step": 31930 |
|
}, |
|
{ |
|
"epoch": 40.99984058664116, |
|
"eval_loss": 0.2544113099575043, |
|
"eval_runtime": 146.7257, |
|
"eval_samples_per_second": 85.513, |
|
"eval_steps_per_second": 2.678, |
|
"step": 32149 |
|
}, |
|
{ |
|
"epoch": 41.115893511876294, |
|
"grad_norm": 0.47476327419281006, |
|
"learning_rate": 3.628696376509788e-05, |
|
"loss": 0.2286, |
|
"step": 32240 |
|
}, |
|
{ |
|
"epoch": 41.51123864179818, |
|
"grad_norm": 0.5025794506072998, |
|
"learning_rate": 3.467305289462724e-05, |
|
"loss": 0.2292, |
|
"step": 32550 |
|
}, |
|
{ |
|
"epoch": 41.90658377172007, |
|
"grad_norm": 0.4553293287754059, |
|
"learning_rate": 3.3059142024156605e-05, |
|
"loss": 0.225, |
|
"step": 32860 |
|
}, |
|
{ |
|
"epoch": 41.99968117328232, |
|
"eval_loss": 0.24568869173526764, |
|
"eval_runtime": 146.7316, |
|
"eval_samples_per_second": 85.51, |
|
"eval_steps_per_second": 2.678, |
|
"step": 32933 |
|
}, |
|
{ |
|
"epoch": 42.30192890164196, |
|
"grad_norm": 0.4845215678215027, |
|
"learning_rate": 3.144523115368597e-05, |
|
"loss": 0.2236, |
|
"step": 33170 |
|
}, |
|
{ |
|
"epoch": 42.69727403156384, |
|
"grad_norm": 0.5739601850509644, |
|
"learning_rate": 2.983132028321533e-05, |
|
"loss": 0.2234, |
|
"step": 33480 |
|
}, |
|
{ |
|
"epoch": 42.99952175992348, |
|
"eval_loss": 0.24620206654071808, |
|
"eval_runtime": 146.7264, |
|
"eval_samples_per_second": 85.513, |
|
"eval_steps_per_second": 2.678, |
|
"step": 33717 |
|
}, |
|
{ |
|
"epoch": 43.092619161485736, |
|
"grad_norm": 0.4569677412509918, |
|
"learning_rate": 2.8217409412744688e-05, |
|
"loss": 0.2213, |
|
"step": 33790 |
|
}, |
|
{ |
|
"epoch": 43.48796429140762, |
|
"grad_norm": 0.5146024227142334, |
|
"learning_rate": 2.6603498542274052e-05, |
|
"loss": 0.2188, |
|
"step": 34100 |
|
}, |
|
{ |
|
"epoch": 43.88330942132951, |
|
"grad_norm": 0.47475871443748474, |
|
"learning_rate": 2.4989587671803416e-05, |
|
"loss": 0.2206, |
|
"step": 34410 |
|
}, |
|
{ |
|
"epoch": 43.999362346564645, |
|
"eval_loss": 0.2445935159921646, |
|
"eval_runtime": 146.7897, |
|
"eval_samples_per_second": 85.476, |
|
"eval_steps_per_second": 2.677, |
|
"step": 34501 |
|
}, |
|
{ |
|
"epoch": 44.27865455125139, |
|
"grad_norm": 0.45915085077285767, |
|
"learning_rate": 2.337567680133278e-05, |
|
"loss": 0.217, |
|
"step": 34720 |
|
}, |
|
{ |
|
"epoch": 44.673999681173285, |
|
"grad_norm": 0.4429190456867218, |
|
"learning_rate": 2.176176593086214e-05, |
|
"loss": 0.2165, |
|
"step": 35030 |
|
}, |
|
{ |
|
"epoch": 44.9992029332058, |
|
"eval_loss": 0.24302400648593903, |
|
"eval_runtime": 146.7631, |
|
"eval_samples_per_second": 85.492, |
|
"eval_steps_per_second": 2.678, |
|
"step": 35285 |
|
}, |
|
{ |
|
"epoch": 45.06934481109517, |
|
"grad_norm": 0.5038246512413025, |
|
"learning_rate": 2.0147855060391505e-05, |
|
"loss": 0.217, |
|
"step": 35340 |
|
}, |
|
{ |
|
"epoch": 45.464689941017056, |
|
"grad_norm": 0.4302615523338318, |
|
"learning_rate": 1.8539150354019162e-05, |
|
"loss": 0.2137, |
|
"step": 35650 |
|
}, |
|
{ |
|
"epoch": 45.86003507093894, |
|
"grad_norm": 0.5075607299804688, |
|
"learning_rate": 1.6925239483548523e-05, |
|
"loss": 0.2145, |
|
"step": 35960 |
|
}, |
|
{ |
|
"epoch": 45.999043519846964, |
|
"eval_loss": 0.24222899973392487, |
|
"eval_runtime": 146.735, |
|
"eval_samples_per_second": 85.508, |
|
"eval_steps_per_second": 2.678, |
|
"step": 36069 |
|
}, |
|
{ |
|
"epoch": 46.255380200860834, |
|
"grad_norm": 0.4777955114841461, |
|
"learning_rate": 1.531653477717618e-05, |
|
"loss": 0.2126, |
|
"step": 36270 |
|
}, |
|
{ |
|
"epoch": 46.65072533078272, |
|
"grad_norm": 0.48974084854125977, |
|
"learning_rate": 1.3702623906705539e-05, |
|
"loss": 0.2112, |
|
"step": 36580 |
|
}, |
|
{ |
|
"epoch": 46.99888410648813, |
|
"eval_loss": 0.2432757019996643, |
|
"eval_runtime": 146.7494, |
|
"eval_samples_per_second": 85.499, |
|
"eval_steps_per_second": 2.678, |
|
"step": 36853 |
|
}, |
|
{ |
|
"epoch": 47.046070460704605, |
|
"grad_norm": 0.46624037623405457, |
|
"learning_rate": 1.2088713036234903e-05, |
|
"loss": 0.2089, |
|
"step": 36890 |
|
}, |
|
{ |
|
"epoch": 47.44141559062649, |
|
"grad_norm": 0.4808659553527832, |
|
"learning_rate": 1.0474802165764265e-05, |
|
"loss": 0.2085, |
|
"step": 37200 |
|
}, |
|
{ |
|
"epoch": 47.836760720548384, |
|
"grad_norm": 0.4421006143093109, |
|
"learning_rate": 8.86089129529363e-06, |
|
"loss": 0.2087, |
|
"step": 37510 |
|
}, |
|
{ |
|
"epoch": 48.0, |
|
"eval_loss": 0.24061799049377441, |
|
"eval_runtime": 146.7785, |
|
"eval_samples_per_second": 85.483, |
|
"eval_steps_per_second": 2.678, |
|
"step": 37638 |
|
}, |
|
{ |
|
"epoch": 48.23210585047027, |
|
"grad_norm": 0.4642196297645569, |
|
"learning_rate": 7.246980424822991e-06, |
|
"loss": 0.208, |
|
"step": 37820 |
|
}, |
|
{ |
|
"epoch": 48.627450980392155, |
|
"grad_norm": 0.47141027450561523, |
|
"learning_rate": 5.633069554352354e-06, |
|
"loss": 0.2067, |
|
"step": 38130 |
|
}, |
|
{ |
|
"epoch": 48.99984058664116, |
|
"eval_loss": 0.23903648555278778, |
|
"eval_runtime": 146.7908, |
|
"eval_samples_per_second": 85.475, |
|
"eval_steps_per_second": 2.677, |
|
"step": 38422 |
|
}, |
|
{ |
|
"epoch": 49.02279611031405, |
|
"grad_norm": 0.45030030608177185, |
|
"learning_rate": 4.019158683881716e-06, |
|
"loss": 0.2062, |
|
"step": 38440 |
|
}, |
|
{ |
|
"epoch": 49.41814124023593, |
|
"grad_norm": 0.48792940378189087, |
|
"learning_rate": 2.4052478134110786e-06, |
|
"loss": 0.2062, |
|
"step": 38750 |
|
}, |
|
{ |
|
"epoch": 49.81348637015782, |
|
"grad_norm": 0.40084025263786316, |
|
"learning_rate": 7.913369429404415e-07, |
|
"loss": 0.2055, |
|
"step": 39060 |
|
}, |
|
{ |
|
"epoch": 49.992029332058024, |
|
"eval_loss": 0.23992499709129333, |
|
"eval_runtime": 146.7718, |
|
"eval_samples_per_second": 85.486, |
|
"eval_steps_per_second": 2.678, |
|
"step": 39200 |
|
}, |
|
{ |
|
"epoch": 49.992029332058024, |
|
"step": 39200, |
|
"total_flos": 6.111014223347712e+18, |
|
"train_loss": 0.31974333125717785, |
|
"train_runtime": 145458.5548, |
|
"train_samples_per_second": 34.503, |
|
"train_steps_per_second": 0.269 |
|
} |
|
], |
|
"logging_steps": 310, |
|
"max_steps": 39200, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 50, |
|
"save_steps": 500, |
|
"total_flos": 6.111014223347712e+18, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|