|
{ |
|
"best_metric": 1.0753824710845947, |
|
"best_model_checkpoint": "/root/finetuning_executions/finetuning_02_codet5p_src_fm_fc_ms_ff/checkpoint-17548", |
|
"epoch": 5.0, |
|
"eval_steps": 500, |
|
"global_step": 87740, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 1.5287591218948364, |
|
"learning_rate": 2.4687500000000004e-05, |
|
"loss": 1.4862, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 1.919360876083374, |
|
"learning_rate": 4.96875e-05, |
|
"loss": 1.1714, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 1.1510220766067505, |
|
"learning_rate": 4.977283183804923e-05, |
|
"loss": 1.1326, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 1.1849422454833984, |
|
"learning_rate": 4.9542788129744654e-05, |
|
"loss": 1.1176, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 1.052920937538147, |
|
"learning_rate": 4.931274442144008e-05, |
|
"loss": 1.0981, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 1.174275517463684, |
|
"learning_rate": 4.90827007131355e-05, |
|
"loss": 1.0811, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 1.0344840288162231, |
|
"learning_rate": 4.885265700483092e-05, |
|
"loss": 1.065, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 1.2671674489974976, |
|
"learning_rate": 4.862261329652634e-05, |
|
"loss": 1.0578, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 1.1277002096176147, |
|
"learning_rate": 4.839256958822176e-05, |
|
"loss": 1.0421, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 1.1894861459732056, |
|
"learning_rate": 4.8162525879917186e-05, |
|
"loss": 1.031, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 1.2189041376113892, |
|
"learning_rate": 4.793248217161261e-05, |
|
"loss": 1.0322, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 1.2372210025787354, |
|
"learning_rate": 4.770243846330803e-05, |
|
"loss": 1.0155, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 1.2500073909759521, |
|
"learning_rate": 4.7472394755003454e-05, |
|
"loss": 1.0211, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.9148824214935303, |
|
"learning_rate": 4.724235104669887e-05, |
|
"loss": 1.0001, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 1.1473156213760376, |
|
"learning_rate": 4.7012307338394294e-05, |
|
"loss": 0.9869, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 1.1870834827423096, |
|
"learning_rate": 4.6782263630089717e-05, |
|
"loss": 0.9799, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 1.1499440670013428, |
|
"learning_rate": 4.655221992178514e-05, |
|
"loss": 0.9745, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 1.0729453563690186, |
|
"learning_rate": 4.632217621348056e-05, |
|
"loss": 0.9871, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 1.3007827997207642, |
|
"learning_rate": 4.6092132505175986e-05, |
|
"loss": 0.9612, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 1.1860408782958984, |
|
"learning_rate": 4.586208879687141e-05, |
|
"loss": 0.9636, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 1.0349955558776855, |
|
"learning_rate": 4.5632045088566825e-05, |
|
"loss": 0.9645, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 1.3005322217941284, |
|
"learning_rate": 4.5402001380262254e-05, |
|
"loss": 0.9536, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 1.2307965755462646, |
|
"learning_rate": 4.517195767195768e-05, |
|
"loss": 0.9474, |
|
"step": 9200 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 1.0385469198226929, |
|
"learning_rate": 4.49419139636531e-05, |
|
"loss": 0.9402, |
|
"step": 9600 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 1.1734727621078491, |
|
"learning_rate": 4.471187025534852e-05, |
|
"loss": 0.9321, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 1.3363800048828125, |
|
"learning_rate": 4.448182654704394e-05, |
|
"loss": 0.9192, |
|
"step": 10400 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 1.073585033416748, |
|
"learning_rate": 4.425178283873936e-05, |
|
"loss": 0.9378, |
|
"step": 10800 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 1.0610324144363403, |
|
"learning_rate": 4.4021739130434786e-05, |
|
"loss": 0.9187, |
|
"step": 11200 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 1.039048194885254, |
|
"learning_rate": 4.379169542213021e-05, |
|
"loss": 0.9191, |
|
"step": 11600 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 1.0391401052474976, |
|
"learning_rate": 4.356165171382563e-05, |
|
"loss": 0.91, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 1.082083821296692, |
|
"learning_rate": 4.3331608005521054e-05, |
|
"loss": 0.9166, |
|
"step": 12400 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 1.0464677810668945, |
|
"learning_rate": 4.310156429721647e-05, |
|
"loss": 0.9234, |
|
"step": 12800 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 1.0795680284500122, |
|
"learning_rate": 4.2871520588911894e-05, |
|
"loss": 0.9004, |
|
"step": 13200 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 1.2177696228027344, |
|
"learning_rate": 4.2641476880607317e-05, |
|
"loss": 0.8991, |
|
"step": 13600 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.9279542565345764, |
|
"learning_rate": 4.241143317230274e-05, |
|
"loss": 0.901, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 1.2393149137496948, |
|
"learning_rate": 4.218138946399816e-05, |
|
"loss": 0.8898, |
|
"step": 14400 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 1.2811025381088257, |
|
"learning_rate": 4.1951920864964345e-05, |
|
"loss": 0.8975, |
|
"step": 14800 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 1.0508288145065308, |
|
"learning_rate": 4.172187715665977e-05, |
|
"loss": 0.897, |
|
"step": 15200 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 0.962242066860199, |
|
"learning_rate": 4.149183344835519e-05, |
|
"loss": 0.8776, |
|
"step": 15600 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 0.9615252017974854, |
|
"learning_rate": 4.126178974005061e-05, |
|
"loss": 0.873, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 1.040337324142456, |
|
"learning_rate": 4.103174603174603e-05, |
|
"loss": 0.8831, |
|
"step": 16400 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 1.0600088834762573, |
|
"learning_rate": 4.0801702323441453e-05, |
|
"loss": 0.8759, |
|
"step": 16800 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 0.9814367890357971, |
|
"learning_rate": 4.0572233724407636e-05, |
|
"loss": 0.8732, |
|
"step": 17200 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_loss": 1.0753824710845947, |
|
"eval_runtime": 239.6966, |
|
"eval_samples_per_second": 251.464, |
|
"eval_steps_per_second": 3.93, |
|
"step": 17548 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 1.055283784866333, |
|
"learning_rate": 4.034219001610306e-05, |
|
"loss": 0.8697, |
|
"step": 17600 |
|
}, |
|
{ |
|
"epoch": 1.03, |
|
"grad_norm": 1.1038569211959839, |
|
"learning_rate": 4.011272141706924e-05, |
|
"loss": 0.8246, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 1.05, |
|
"grad_norm": 0.9692428708076477, |
|
"learning_rate": 3.9882677708764665e-05, |
|
"loss": 0.8284, |
|
"step": 18400 |
|
}, |
|
{ |
|
"epoch": 1.07, |
|
"grad_norm": 1.093485951423645, |
|
"learning_rate": 3.965263400046009e-05, |
|
"loss": 0.8271, |
|
"step": 18800 |
|
}, |
|
{ |
|
"epoch": 1.09, |
|
"grad_norm": 1.1435869932174683, |
|
"learning_rate": 3.942259029215551e-05, |
|
"loss": 0.8198, |
|
"step": 19200 |
|
}, |
|
{ |
|
"epoch": 1.12, |
|
"grad_norm": 1.389695644378662, |
|
"learning_rate": 3.9192546583850934e-05, |
|
"loss": 0.8223, |
|
"step": 19600 |
|
}, |
|
{ |
|
"epoch": 1.14, |
|
"grad_norm": 1.081563949584961, |
|
"learning_rate": 3.896307798481712e-05, |
|
"loss": 0.8078, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 1.16, |
|
"grad_norm": 1.20356023311615, |
|
"learning_rate": 3.873303427651253e-05, |
|
"loss": 0.8216, |
|
"step": 20400 |
|
}, |
|
{ |
|
"epoch": 1.19, |
|
"grad_norm": 1.2045621871948242, |
|
"learning_rate": 3.850299056820796e-05, |
|
"loss": 0.8222, |
|
"step": 20800 |
|
}, |
|
{ |
|
"epoch": 1.21, |
|
"grad_norm": 0.969454824924469, |
|
"learning_rate": 3.8272946859903386e-05, |
|
"loss": 0.803, |
|
"step": 21200 |
|
}, |
|
{ |
|
"epoch": 1.23, |
|
"grad_norm": 1.2209794521331787, |
|
"learning_rate": 3.804290315159881e-05, |
|
"loss": 0.8115, |
|
"step": 21600 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"grad_norm": 1.0688341856002808, |
|
"learning_rate": 3.781285944329423e-05, |
|
"loss": 0.8051, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 1.28, |
|
"grad_norm": 1.1031506061553955, |
|
"learning_rate": 3.7582815734989655e-05, |
|
"loss": 0.8059, |
|
"step": 22400 |
|
}, |
|
{ |
|
"epoch": 1.3, |
|
"grad_norm": 0.9878343939781189, |
|
"learning_rate": 3.735277202668507e-05, |
|
"loss": 0.8054, |
|
"step": 22800 |
|
}, |
|
{ |
|
"epoch": 1.32, |
|
"grad_norm": 1.327987790107727, |
|
"learning_rate": 3.7122728318380494e-05, |
|
"loss": 0.8131, |
|
"step": 23200 |
|
}, |
|
{ |
|
"epoch": 1.34, |
|
"grad_norm": 1.0833244323730469, |
|
"learning_rate": 3.689268461007592e-05, |
|
"loss": 0.7936, |
|
"step": 23600 |
|
}, |
|
{ |
|
"epoch": 1.37, |
|
"grad_norm": 1.1618777513504028, |
|
"learning_rate": 3.666264090177134e-05, |
|
"loss": 0.7991, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 1.39, |
|
"grad_norm": 1.022359013557434, |
|
"learning_rate": 3.643259719346676e-05, |
|
"loss": 0.8002, |
|
"step": 24400 |
|
}, |
|
{ |
|
"epoch": 1.41, |
|
"grad_norm": 1.2475693225860596, |
|
"learning_rate": 3.6202553485162186e-05, |
|
"loss": 0.8001, |
|
"step": 24800 |
|
}, |
|
{ |
|
"epoch": 1.44, |
|
"grad_norm": 1.1127784252166748, |
|
"learning_rate": 3.59725097768576e-05, |
|
"loss": 0.7865, |
|
"step": 25200 |
|
}, |
|
{ |
|
"epoch": 1.46, |
|
"grad_norm": 1.2091097831726074, |
|
"learning_rate": 3.5742466068553025e-05, |
|
"loss": 0.7899, |
|
"step": 25600 |
|
}, |
|
{ |
|
"epoch": 1.48, |
|
"grad_norm": 0.9588549733161926, |
|
"learning_rate": 3.551242236024845e-05, |
|
"loss": 0.7942, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 1.195241093635559, |
|
"learning_rate": 3.528237865194387e-05, |
|
"loss": 0.7813, |
|
"step": 26400 |
|
}, |
|
{ |
|
"epoch": 1.53, |
|
"grad_norm": 0.9788525700569153, |
|
"learning_rate": 3.5052334943639294e-05, |
|
"loss": 0.7805, |
|
"step": 26800 |
|
}, |
|
{ |
|
"epoch": 1.55, |
|
"grad_norm": 1.2794181108474731, |
|
"learning_rate": 3.482286634460548e-05, |
|
"loss": 0.7763, |
|
"step": 27200 |
|
}, |
|
{ |
|
"epoch": 1.57, |
|
"grad_norm": 0.9700046181678772, |
|
"learning_rate": 3.45928226363009e-05, |
|
"loss": 0.7801, |
|
"step": 27600 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"grad_norm": 1.2326452732086182, |
|
"learning_rate": 3.436335403726708e-05, |
|
"loss": 0.7864, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 1.62, |
|
"grad_norm": 1.2367639541625977, |
|
"learning_rate": 3.4133310328962506e-05, |
|
"loss": 0.7845, |
|
"step": 28400 |
|
}, |
|
{ |
|
"epoch": 1.64, |
|
"grad_norm": 1.077854871749878, |
|
"learning_rate": 3.390326662065793e-05, |
|
"loss": 0.7869, |
|
"step": 28800 |
|
}, |
|
{ |
|
"epoch": 1.66, |
|
"grad_norm": 1.0575716495513916, |
|
"learning_rate": 3.3673222912353345e-05, |
|
"loss": 0.7838, |
|
"step": 29200 |
|
}, |
|
{ |
|
"epoch": 1.69, |
|
"grad_norm": 1.1674555540084839, |
|
"learning_rate": 3.344317920404877e-05, |
|
"loss": 0.7827, |
|
"step": 29600 |
|
}, |
|
{ |
|
"epoch": 1.71, |
|
"grad_norm": 1.148335337638855, |
|
"learning_rate": 3.321313549574419e-05, |
|
"loss": 0.7781, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 1.73, |
|
"grad_norm": 1.0287448167800903, |
|
"learning_rate": 3.2983091787439614e-05, |
|
"loss": 0.7652, |
|
"step": 30400 |
|
}, |
|
{ |
|
"epoch": 1.76, |
|
"grad_norm": 1.2461556196212769, |
|
"learning_rate": 3.275304807913504e-05, |
|
"loss": 0.7773, |
|
"step": 30800 |
|
}, |
|
{ |
|
"epoch": 1.78, |
|
"grad_norm": 1.1946007013320923, |
|
"learning_rate": 3.252357948010122e-05, |
|
"loss": 0.7694, |
|
"step": 31200 |
|
}, |
|
{ |
|
"epoch": 1.8, |
|
"grad_norm": 1.019499659538269, |
|
"learning_rate": 3.229353577179664e-05, |
|
"loss": 0.7803, |
|
"step": 31600 |
|
}, |
|
{ |
|
"epoch": 1.82, |
|
"grad_norm": 1.3375366926193237, |
|
"learning_rate": 3.2063492063492065e-05, |
|
"loss": 0.7684, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 1.85, |
|
"grad_norm": 1.2477443218231201, |
|
"learning_rate": 3.183344835518749e-05, |
|
"loss": 0.7657, |
|
"step": 32400 |
|
}, |
|
{ |
|
"epoch": 1.87, |
|
"grad_norm": 1.1749552488327026, |
|
"learning_rate": 3.160340464688291e-05, |
|
"loss": 0.767, |
|
"step": 32800 |
|
}, |
|
{ |
|
"epoch": 1.89, |
|
"grad_norm": 1.0863006114959717, |
|
"learning_rate": 3.1373360938578334e-05, |
|
"loss": 0.767, |
|
"step": 33200 |
|
}, |
|
{ |
|
"epoch": 1.91, |
|
"grad_norm": 0.9976168870925903, |
|
"learning_rate": 3.114389233954452e-05, |
|
"loss": 0.7536, |
|
"step": 33600 |
|
}, |
|
{ |
|
"epoch": 1.94, |
|
"grad_norm": 1.1924540996551514, |
|
"learning_rate": 3.09144237405107e-05, |
|
"loss": 0.7622, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 1.96, |
|
"grad_norm": 1.0996850728988647, |
|
"learning_rate": 3.068438003220612e-05, |
|
"loss": 0.7569, |
|
"step": 34400 |
|
}, |
|
{ |
|
"epoch": 1.98, |
|
"grad_norm": 1.2163282632827759, |
|
"learning_rate": 3.0454336323901546e-05, |
|
"loss": 0.7667, |
|
"step": 34800 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_loss": 1.0829898118972778, |
|
"eval_runtime": 239.7954, |
|
"eval_samples_per_second": 251.36, |
|
"eval_steps_per_second": 3.928, |
|
"step": 35096 |
|
}, |
|
{ |
|
"epoch": 2.01, |
|
"grad_norm": 1.1651737689971924, |
|
"learning_rate": 3.0224292615596966e-05, |
|
"loss": 0.7442, |
|
"step": 35200 |
|
}, |
|
{ |
|
"epoch": 2.03, |
|
"grad_norm": 1.1764894723892212, |
|
"learning_rate": 2.999424890729239e-05, |
|
"loss": 0.714, |
|
"step": 35600 |
|
}, |
|
{ |
|
"epoch": 2.05, |
|
"grad_norm": 1.1951353549957275, |
|
"learning_rate": 2.976420519898781e-05, |
|
"loss": 0.7076, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 2.07, |
|
"grad_norm": 1.1282097101211548, |
|
"learning_rate": 2.953416149068323e-05, |
|
"loss": 0.7105, |
|
"step": 36400 |
|
}, |
|
{ |
|
"epoch": 2.1, |
|
"grad_norm": 1.3397319316864014, |
|
"learning_rate": 2.9304117782378654e-05, |
|
"loss": 0.7023, |
|
"step": 36800 |
|
}, |
|
{ |
|
"epoch": 2.12, |
|
"grad_norm": 1.1150188446044922, |
|
"learning_rate": 2.9074074074074077e-05, |
|
"loss": 0.7035, |
|
"step": 37200 |
|
}, |
|
{ |
|
"epoch": 2.14, |
|
"grad_norm": 1.2119678258895874, |
|
"learning_rate": 2.8844030365769497e-05, |
|
"loss": 0.7168, |
|
"step": 37600 |
|
}, |
|
{ |
|
"epoch": 2.17, |
|
"grad_norm": 1.167506456375122, |
|
"learning_rate": 2.861398665746492e-05, |
|
"loss": 0.7125, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 2.19, |
|
"grad_norm": 1.0915708541870117, |
|
"learning_rate": 2.8384518058431102e-05, |
|
"loss": 0.7101, |
|
"step": 38400 |
|
}, |
|
{ |
|
"epoch": 2.21, |
|
"grad_norm": 1.135021686553955, |
|
"learning_rate": 2.8154474350126525e-05, |
|
"loss": 0.7145, |
|
"step": 38800 |
|
}, |
|
{ |
|
"epoch": 2.23, |
|
"grad_norm": 1.3739718198776245, |
|
"learning_rate": 2.792443064182195e-05, |
|
"loss": 0.7096, |
|
"step": 39200 |
|
}, |
|
{ |
|
"epoch": 2.26, |
|
"grad_norm": 1.1629129648208618, |
|
"learning_rate": 2.7694386933517368e-05, |
|
"loss": 0.7053, |
|
"step": 39600 |
|
}, |
|
{ |
|
"epoch": 2.28, |
|
"grad_norm": 0.9963687062263489, |
|
"learning_rate": 2.746434322521279e-05, |
|
"loss": 0.7012, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 2.3, |
|
"grad_norm": 1.0318909883499146, |
|
"learning_rate": 2.7234874626178974e-05, |
|
"loss": 0.713, |
|
"step": 40400 |
|
}, |
|
{ |
|
"epoch": 2.33, |
|
"grad_norm": 1.0613532066345215, |
|
"learning_rate": 2.7004830917874397e-05, |
|
"loss": 0.704, |
|
"step": 40800 |
|
}, |
|
{ |
|
"epoch": 2.35, |
|
"grad_norm": 1.1298637390136719, |
|
"learning_rate": 2.677478720956982e-05, |
|
"loss": 0.708, |
|
"step": 41200 |
|
}, |
|
{ |
|
"epoch": 2.37, |
|
"grad_norm": 1.1079801321029663, |
|
"learning_rate": 2.654474350126524e-05, |
|
"loss": 0.6975, |
|
"step": 41600 |
|
}, |
|
{ |
|
"epoch": 2.39, |
|
"grad_norm": 1.0751113891601562, |
|
"learning_rate": 2.6314699792960662e-05, |
|
"loss": 0.6999, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 2.42, |
|
"grad_norm": 1.1240077018737793, |
|
"learning_rate": 2.6085231193926845e-05, |
|
"loss": 0.7055, |
|
"step": 42400 |
|
}, |
|
{ |
|
"epoch": 2.44, |
|
"grad_norm": 1.0788402557373047, |
|
"learning_rate": 2.5855187485622268e-05, |
|
"loss": 0.7055, |
|
"step": 42800 |
|
}, |
|
{ |
|
"epoch": 2.46, |
|
"grad_norm": 1.00369131565094, |
|
"learning_rate": 2.562514377731769e-05, |
|
"loss": 0.6949, |
|
"step": 43200 |
|
}, |
|
{ |
|
"epoch": 2.48, |
|
"grad_norm": 1.1382017135620117, |
|
"learning_rate": 2.539510006901311e-05, |
|
"loss": 0.7093, |
|
"step": 43600 |
|
}, |
|
{ |
|
"epoch": 2.51, |
|
"grad_norm": 1.0273314714431763, |
|
"learning_rate": 2.5165056360708534e-05, |
|
"loss": 0.7066, |
|
"step": 44000 |
|
}, |
|
{ |
|
"epoch": 2.53, |
|
"grad_norm": 1.331964373588562, |
|
"learning_rate": 2.4935012652403957e-05, |
|
"loss": 0.7037, |
|
"step": 44400 |
|
}, |
|
{ |
|
"epoch": 2.55, |
|
"grad_norm": 1.102133870124817, |
|
"learning_rate": 2.470496894409938e-05, |
|
"loss": 0.7028, |
|
"step": 44800 |
|
}, |
|
{ |
|
"epoch": 2.58, |
|
"grad_norm": 1.131090521812439, |
|
"learning_rate": 2.4474925235794803e-05, |
|
"loss": 0.6871, |
|
"step": 45200 |
|
}, |
|
{ |
|
"epoch": 2.6, |
|
"grad_norm": 1.1939336061477661, |
|
"learning_rate": 2.4244881527490222e-05, |
|
"loss": 0.6966, |
|
"step": 45600 |
|
}, |
|
{ |
|
"epoch": 2.62, |
|
"grad_norm": 1.344831109046936, |
|
"learning_rate": 2.4014837819185645e-05, |
|
"loss": 0.6933, |
|
"step": 46000 |
|
}, |
|
{ |
|
"epoch": 2.64, |
|
"grad_norm": 0.9559622406959534, |
|
"learning_rate": 2.3784794110881068e-05, |
|
"loss": 0.6916, |
|
"step": 46400 |
|
}, |
|
{ |
|
"epoch": 2.67, |
|
"grad_norm": 1.182010293006897, |
|
"learning_rate": 2.355475040257649e-05, |
|
"loss": 0.6903, |
|
"step": 46800 |
|
}, |
|
{ |
|
"epoch": 2.69, |
|
"grad_norm": 1.080712080001831, |
|
"learning_rate": 2.3325281803542674e-05, |
|
"loss": 0.6965, |
|
"step": 47200 |
|
}, |
|
{ |
|
"epoch": 2.71, |
|
"grad_norm": 1.2468616962432861, |
|
"learning_rate": 2.3095238095238097e-05, |
|
"loss": 0.6906, |
|
"step": 47600 |
|
}, |
|
{ |
|
"epoch": 2.74, |
|
"grad_norm": 1.0585706233978271, |
|
"learning_rate": 2.286519438693352e-05, |
|
"loss": 0.6966, |
|
"step": 48000 |
|
}, |
|
{ |
|
"epoch": 2.76, |
|
"grad_norm": 1.2725940942764282, |
|
"learning_rate": 2.2635150678628943e-05, |
|
"loss": 0.6894, |
|
"step": 48400 |
|
}, |
|
{ |
|
"epoch": 2.78, |
|
"grad_norm": 1.1753593683242798, |
|
"learning_rate": 2.2405106970324362e-05, |
|
"loss": 0.6806, |
|
"step": 48800 |
|
}, |
|
{ |
|
"epoch": 2.8, |
|
"grad_norm": 1.117319941520691, |
|
"learning_rate": 2.2175063262019785e-05, |
|
"loss": 0.6879, |
|
"step": 49200 |
|
}, |
|
{ |
|
"epoch": 2.83, |
|
"grad_norm": 1.2521744966506958, |
|
"learning_rate": 2.194501955371521e-05, |
|
"loss": 0.6808, |
|
"step": 49600 |
|
}, |
|
{ |
|
"epoch": 2.85, |
|
"grad_norm": 1.396971344947815, |
|
"learning_rate": 2.1714975845410628e-05, |
|
"loss": 0.6798, |
|
"step": 50000 |
|
}, |
|
{ |
|
"epoch": 2.87, |
|
"grad_norm": 1.0855846405029297, |
|
"learning_rate": 2.148493213710605e-05, |
|
"loss": 0.6978, |
|
"step": 50400 |
|
}, |
|
{ |
|
"epoch": 2.89, |
|
"grad_norm": 1.199013113975525, |
|
"learning_rate": 2.1254888428801474e-05, |
|
"loss": 0.6882, |
|
"step": 50800 |
|
}, |
|
{ |
|
"epoch": 2.92, |
|
"grad_norm": 1.366407871246338, |
|
"learning_rate": 2.1024844720496894e-05, |
|
"loss": 0.6882, |
|
"step": 51200 |
|
}, |
|
{ |
|
"epoch": 2.94, |
|
"grad_norm": 1.1709498167037964, |
|
"learning_rate": 2.0794801012192317e-05, |
|
"loss": 0.6907, |
|
"step": 51600 |
|
}, |
|
{ |
|
"epoch": 2.96, |
|
"grad_norm": 1.1881307363510132, |
|
"learning_rate": 2.05653324131585e-05, |
|
"loss": 0.6883, |
|
"step": 52000 |
|
}, |
|
{ |
|
"epoch": 2.99, |
|
"grad_norm": 1.4105783700942993, |
|
"learning_rate": 2.0335288704853922e-05, |
|
"loss": 0.6833, |
|
"step": 52400 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_loss": 1.0844900608062744, |
|
"eval_runtime": 239.8565, |
|
"eval_samples_per_second": 251.296, |
|
"eval_steps_per_second": 3.927, |
|
"step": 52644 |
|
}, |
|
{ |
|
"epoch": 3.01, |
|
"grad_norm": 1.4675981998443604, |
|
"learning_rate": 2.0105244996549345e-05, |
|
"loss": 0.6679, |
|
"step": 52800 |
|
}, |
|
{ |
|
"epoch": 3.03, |
|
"grad_norm": 1.151491403579712, |
|
"learning_rate": 1.9875201288244768e-05, |
|
"loss": 0.6501, |
|
"step": 53200 |
|
}, |
|
{ |
|
"epoch": 3.05, |
|
"grad_norm": 1.0938260555267334, |
|
"learning_rate": 1.964515757994019e-05, |
|
"loss": 0.6396, |
|
"step": 53600 |
|
}, |
|
{ |
|
"epoch": 3.08, |
|
"grad_norm": 1.055185317993164, |
|
"learning_rate": 1.941511387163561e-05, |
|
"loss": 0.6442, |
|
"step": 54000 |
|
}, |
|
{ |
|
"epoch": 3.1, |
|
"grad_norm": 1.0307785272598267, |
|
"learning_rate": 1.9185645272601797e-05, |
|
"loss": 0.6489, |
|
"step": 54400 |
|
}, |
|
{ |
|
"epoch": 3.12, |
|
"grad_norm": 1.184102177619934, |
|
"learning_rate": 1.8955601564297217e-05, |
|
"loss": 0.6454, |
|
"step": 54800 |
|
}, |
|
{ |
|
"epoch": 3.15, |
|
"grad_norm": 1.1798542737960815, |
|
"learning_rate": 1.872555785599264e-05, |
|
"loss": 0.6552, |
|
"step": 55200 |
|
}, |
|
{ |
|
"epoch": 3.17, |
|
"grad_norm": 1.1375089883804321, |
|
"learning_rate": 1.8496089256958822e-05, |
|
"loss": 0.6359, |
|
"step": 55600 |
|
}, |
|
{ |
|
"epoch": 3.19, |
|
"grad_norm": 1.0475974082946777, |
|
"learning_rate": 1.8266045548654245e-05, |
|
"loss": 0.6374, |
|
"step": 56000 |
|
}, |
|
{ |
|
"epoch": 3.21, |
|
"grad_norm": 1.0948106050491333, |
|
"learning_rate": 1.803600184034967e-05, |
|
"loss": 0.6431, |
|
"step": 56400 |
|
}, |
|
{ |
|
"epoch": 3.24, |
|
"grad_norm": 1.1488378047943115, |
|
"learning_rate": 1.7805958132045088e-05, |
|
"loss": 0.646, |
|
"step": 56800 |
|
}, |
|
{ |
|
"epoch": 3.26, |
|
"grad_norm": 1.1257692575454712, |
|
"learning_rate": 1.757591442374051e-05, |
|
"loss": 0.6408, |
|
"step": 57200 |
|
}, |
|
{ |
|
"epoch": 3.28, |
|
"grad_norm": 1.1101455688476562, |
|
"learning_rate": 1.7345870715435934e-05, |
|
"loss": 0.6389, |
|
"step": 57600 |
|
}, |
|
{ |
|
"epoch": 3.31, |
|
"grad_norm": 1.329904556274414, |
|
"learning_rate": 1.7115827007131354e-05, |
|
"loss": 0.6399, |
|
"step": 58000 |
|
}, |
|
{ |
|
"epoch": 3.33, |
|
"grad_norm": 1.2944815158843994, |
|
"learning_rate": 1.6885783298826777e-05, |
|
"loss": 0.6421, |
|
"step": 58400 |
|
}, |
|
{ |
|
"epoch": 3.35, |
|
"grad_norm": 1.1607027053833008, |
|
"learning_rate": 1.6655739590522203e-05, |
|
"loss": 0.637, |
|
"step": 58800 |
|
}, |
|
{ |
|
"epoch": 3.37, |
|
"grad_norm": 1.0392543077468872, |
|
"learning_rate": 1.6426270991488382e-05, |
|
"loss": 0.6411, |
|
"step": 59200 |
|
}, |
|
{ |
|
"epoch": 3.4, |
|
"grad_norm": 1.3244273662567139, |
|
"learning_rate": 1.6196227283183805e-05, |
|
"loss": 0.6473, |
|
"step": 59600 |
|
}, |
|
{ |
|
"epoch": 3.42, |
|
"grad_norm": 1.1351373195648193, |
|
"learning_rate": 1.5966183574879228e-05, |
|
"loss": 0.6298, |
|
"step": 60000 |
|
}, |
|
{ |
|
"epoch": 3.44, |
|
"grad_norm": 1.1698590517044067, |
|
"learning_rate": 1.573613986657465e-05, |
|
"loss": 0.6355, |
|
"step": 60400 |
|
}, |
|
{ |
|
"epoch": 3.46, |
|
"grad_norm": 1.2005553245544434, |
|
"learning_rate": 1.5506671267540834e-05, |
|
"loss": 0.6395, |
|
"step": 60800 |
|
}, |
|
{ |
|
"epoch": 3.49, |
|
"grad_norm": 0.97503662109375, |
|
"learning_rate": 1.5276627559236257e-05, |
|
"loss": 0.6437, |
|
"step": 61200 |
|
}, |
|
{ |
|
"epoch": 3.51, |
|
"grad_norm": 1.2518908977508545, |
|
"learning_rate": 1.5046583850931678e-05, |
|
"loss": 0.6385, |
|
"step": 61600 |
|
}, |
|
{ |
|
"epoch": 3.53, |
|
"grad_norm": 1.2661454677581787, |
|
"learning_rate": 1.48165401426271e-05, |
|
"loss": 0.6403, |
|
"step": 62000 |
|
}, |
|
{ |
|
"epoch": 3.56, |
|
"grad_norm": 1.2612046003341675, |
|
"learning_rate": 1.4586496434322523e-05, |
|
"loss": 0.6442, |
|
"step": 62400 |
|
}, |
|
{ |
|
"epoch": 3.58, |
|
"grad_norm": 1.1942335367202759, |
|
"learning_rate": 1.4356452726017944e-05, |
|
"loss": 0.6383, |
|
"step": 62800 |
|
}, |
|
{ |
|
"epoch": 3.6, |
|
"grad_norm": 1.1030133962631226, |
|
"learning_rate": 1.4126409017713365e-05, |
|
"loss": 0.6277, |
|
"step": 63200 |
|
}, |
|
{ |
|
"epoch": 3.62, |
|
"grad_norm": 1.2485852241516113, |
|
"learning_rate": 1.3896365309408788e-05, |
|
"loss": 0.6414, |
|
"step": 63600 |
|
}, |
|
{ |
|
"epoch": 3.65, |
|
"grad_norm": 0.9925839900970459, |
|
"learning_rate": 1.366632160110421e-05, |
|
"loss": 0.6337, |
|
"step": 64000 |
|
}, |
|
{ |
|
"epoch": 3.67, |
|
"grad_norm": 1.3896905183792114, |
|
"learning_rate": 1.343627789279963e-05, |
|
"loss": 0.6314, |
|
"step": 64400 |
|
}, |
|
{ |
|
"epoch": 3.69, |
|
"grad_norm": 1.1392475366592407, |
|
"learning_rate": 1.3206809293765815e-05, |
|
"loss": 0.6312, |
|
"step": 64800 |
|
}, |
|
{ |
|
"epoch": 3.72, |
|
"grad_norm": 1.2051880359649658, |
|
"learning_rate": 1.2976765585461237e-05, |
|
"loss": 0.6198, |
|
"step": 65200 |
|
}, |
|
{ |
|
"epoch": 3.74, |
|
"grad_norm": 1.3581410646438599, |
|
"learning_rate": 1.2746721877156661e-05, |
|
"loss": 0.634, |
|
"step": 65600 |
|
}, |
|
{ |
|
"epoch": 3.76, |
|
"grad_norm": 1.4071406126022339, |
|
"learning_rate": 1.2516678168852084e-05, |
|
"loss": 0.633, |
|
"step": 66000 |
|
}, |
|
{ |
|
"epoch": 3.78, |
|
"grad_norm": 1.1921656131744385, |
|
"learning_rate": 1.2286634460547504e-05, |
|
"loss": 0.6206, |
|
"step": 66400 |
|
}, |
|
{ |
|
"epoch": 3.81, |
|
"grad_norm": 1.4039461612701416, |
|
"learning_rate": 1.2056590752242927e-05, |
|
"loss": 0.6341, |
|
"step": 66800 |
|
}, |
|
{ |
|
"epoch": 3.83, |
|
"grad_norm": 1.3369255065917969, |
|
"learning_rate": 1.182654704393835e-05, |
|
"loss": 0.6427, |
|
"step": 67200 |
|
}, |
|
{ |
|
"epoch": 3.85, |
|
"grad_norm": 1.2129446268081665, |
|
"learning_rate": 1.1596503335633771e-05, |
|
"loss": 0.6293, |
|
"step": 67600 |
|
}, |
|
{ |
|
"epoch": 3.88, |
|
"grad_norm": 1.264256238937378, |
|
"learning_rate": 1.1366459627329192e-05, |
|
"loss": 0.6282, |
|
"step": 68000 |
|
}, |
|
{ |
|
"epoch": 3.9, |
|
"grad_norm": 1.1778966188430786, |
|
"learning_rate": 1.1136415919024615e-05, |
|
"loss": 0.6383, |
|
"step": 68400 |
|
}, |
|
{ |
|
"epoch": 3.92, |
|
"grad_norm": 1.045240044593811, |
|
"learning_rate": 1.0906372210720037e-05, |
|
"loss": 0.6315, |
|
"step": 68800 |
|
}, |
|
{ |
|
"epoch": 3.94, |
|
"grad_norm": 1.2942785024642944, |
|
"learning_rate": 1.0676903611686221e-05, |
|
"loss": 0.6276, |
|
"step": 69200 |
|
}, |
|
{ |
|
"epoch": 3.97, |
|
"grad_norm": 1.2519258260726929, |
|
"learning_rate": 1.0446859903381644e-05, |
|
"loss": 0.6228, |
|
"step": 69600 |
|
}, |
|
{ |
|
"epoch": 3.99, |
|
"grad_norm": 1.2884622812271118, |
|
"learning_rate": 1.0216816195077065e-05, |
|
"loss": 0.6234, |
|
"step": 70000 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_loss": 1.09929621219635, |
|
"eval_runtime": 239.9825, |
|
"eval_samples_per_second": 251.164, |
|
"eval_steps_per_second": 3.925, |
|
"step": 70192 |
|
}, |
|
{ |
|
"epoch": 4.01, |
|
"grad_norm": 1.029523253440857, |
|
"learning_rate": 9.986772486772487e-06, |
|
"loss": 0.607, |
|
"step": 70400 |
|
}, |
|
{ |
|
"epoch": 4.03, |
|
"grad_norm": 1.1874665021896362, |
|
"learning_rate": 9.75672877846791e-06, |
|
"loss": 0.5992, |
|
"step": 70800 |
|
}, |
|
{ |
|
"epoch": 4.06, |
|
"grad_norm": 1.3719263076782227, |
|
"learning_rate": 9.526685070163331e-06, |
|
"loss": 0.5932, |
|
"step": 71200 |
|
}, |
|
{ |
|
"epoch": 4.08, |
|
"grad_norm": 1.1106728315353394, |
|
"learning_rate": 9.296641361858754e-06, |
|
"loss": 0.6082, |
|
"step": 71600 |
|
}, |
|
{ |
|
"epoch": 4.1, |
|
"grad_norm": 1.1333997249603271, |
|
"learning_rate": 9.066597653554177e-06, |
|
"loss": 0.5958, |
|
"step": 72000 |
|
}, |
|
{ |
|
"epoch": 4.13, |
|
"grad_norm": 1.2606267929077148, |
|
"learning_rate": 8.837129054520358e-06, |
|
"loss": 0.6015, |
|
"step": 72400 |
|
}, |
|
{ |
|
"epoch": 4.15, |
|
"grad_norm": 1.123744249343872, |
|
"learning_rate": 8.607085346215783e-06, |
|
"loss": 0.6011, |
|
"step": 72800 |
|
}, |
|
{ |
|
"epoch": 4.17, |
|
"grad_norm": 1.155521273612976, |
|
"learning_rate": 8.377041637911204e-06, |
|
"loss": 0.5973, |
|
"step": 73200 |
|
}, |
|
{ |
|
"epoch": 4.19, |
|
"grad_norm": 1.1591954231262207, |
|
"learning_rate": 8.146997929606625e-06, |
|
"loss": 0.5924, |
|
"step": 73600 |
|
}, |
|
{ |
|
"epoch": 4.22, |
|
"grad_norm": 1.3380868434906006, |
|
"learning_rate": 7.916954221302048e-06, |
|
"loss": 0.5983, |
|
"step": 74000 |
|
}, |
|
{ |
|
"epoch": 4.24, |
|
"grad_norm": 1.2216105461120605, |
|
"learning_rate": 7.68691051299747e-06, |
|
"loss": 0.5947, |
|
"step": 74400 |
|
}, |
|
{ |
|
"epoch": 4.26, |
|
"grad_norm": 1.0791873931884766, |
|
"learning_rate": 7.4568668046928916e-06, |
|
"loss": 0.6054, |
|
"step": 74800 |
|
}, |
|
{ |
|
"epoch": 4.29, |
|
"grad_norm": 1.1365481615066528, |
|
"learning_rate": 7.2268230963883145e-06, |
|
"loss": 0.6092, |
|
"step": 75200 |
|
}, |
|
{ |
|
"epoch": 4.31, |
|
"grad_norm": 1.1376712322235107, |
|
"learning_rate": 6.997354497354498e-06, |
|
"loss": 0.5942, |
|
"step": 75600 |
|
}, |
|
{ |
|
"epoch": 4.33, |
|
"grad_norm": 1.1192513704299927, |
|
"learning_rate": 6.76731078904992e-06, |
|
"loss": 0.5955, |
|
"step": 76000 |
|
}, |
|
{ |
|
"epoch": 4.35, |
|
"grad_norm": 1.1927390098571777, |
|
"learning_rate": 6.537267080745342e-06, |
|
"loss": 0.5901, |
|
"step": 76400 |
|
}, |
|
{ |
|
"epoch": 4.38, |
|
"grad_norm": 1.236060619354248, |
|
"learning_rate": 6.307223372440764e-06, |
|
"loss": 0.6, |
|
"step": 76800 |
|
}, |
|
{ |
|
"epoch": 4.4, |
|
"grad_norm": 1.077643871307373, |
|
"learning_rate": 6.077179664136186e-06, |
|
"loss": 0.6088, |
|
"step": 77200 |
|
}, |
|
{ |
|
"epoch": 4.42, |
|
"grad_norm": 1.3172234296798706, |
|
"learning_rate": 5.847135955831608e-06, |
|
"loss": 0.5944, |
|
"step": 77600 |
|
}, |
|
{ |
|
"epoch": 4.44, |
|
"grad_norm": 1.2222837209701538, |
|
"learning_rate": 5.61709224752703e-06, |
|
"loss": 0.5976, |
|
"step": 78000 |
|
}, |
|
{ |
|
"epoch": 4.47, |
|
"grad_norm": 1.2887938022613525, |
|
"learning_rate": 5.387048539222452e-06, |
|
"loss": 0.6023, |
|
"step": 78400 |
|
}, |
|
{ |
|
"epoch": 4.49, |
|
"grad_norm": 1.1380060911178589, |
|
"learning_rate": 5.157579940188636e-06, |
|
"loss": 0.5938, |
|
"step": 78800 |
|
}, |
|
{ |
|
"epoch": 4.51, |
|
"grad_norm": 1.2178806066513062, |
|
"learning_rate": 4.927536231884058e-06, |
|
"loss": 0.5916, |
|
"step": 79200 |
|
}, |
|
{ |
|
"epoch": 4.54, |
|
"grad_norm": 1.2010163068771362, |
|
"learning_rate": 4.69749252357948e-06, |
|
"loss": 0.5891, |
|
"step": 79600 |
|
}, |
|
{ |
|
"epoch": 4.56, |
|
"grad_norm": 1.2172470092773438, |
|
"learning_rate": 4.467448815274902e-06, |
|
"loss": 0.6019, |
|
"step": 80000 |
|
}, |
|
{ |
|
"epoch": 4.58, |
|
"grad_norm": 1.2008330821990967, |
|
"learning_rate": 4.2374051069703245e-06, |
|
"loss": 0.596, |
|
"step": 80400 |
|
}, |
|
{ |
|
"epoch": 4.6, |
|
"grad_norm": 1.3656328916549683, |
|
"learning_rate": 4.007936507936508e-06, |
|
"loss": 0.6001, |
|
"step": 80800 |
|
}, |
|
{ |
|
"epoch": 4.63, |
|
"grad_norm": 1.336308240890503, |
|
"learning_rate": 3.7778927996319303e-06, |
|
"loss": 0.5912, |
|
"step": 81200 |
|
}, |
|
{ |
|
"epoch": 4.65, |
|
"grad_norm": 1.1399625539779663, |
|
"learning_rate": 3.5478490913273524e-06, |
|
"loss": 0.5962, |
|
"step": 81600 |
|
}, |
|
{ |
|
"epoch": 4.67, |
|
"grad_norm": 1.237598180770874, |
|
"learning_rate": 3.317805383022775e-06, |
|
"loss": 0.5869, |
|
"step": 82000 |
|
}, |
|
{ |
|
"epoch": 4.7, |
|
"grad_norm": 1.1215174198150635, |
|
"learning_rate": 3.0877616747181967e-06, |
|
"loss": 0.5927, |
|
"step": 82400 |
|
}, |
|
{ |
|
"epoch": 4.72, |
|
"grad_norm": 1.3274859189987183, |
|
"learning_rate": 2.857717966413619e-06, |
|
"loss": 0.6066, |
|
"step": 82800 |
|
}, |
|
{ |
|
"epoch": 4.74, |
|
"grad_norm": 1.276289463043213, |
|
"learning_rate": 2.628249367379802e-06, |
|
"loss": 0.5994, |
|
"step": 83200 |
|
}, |
|
{ |
|
"epoch": 4.76, |
|
"grad_norm": 1.154296636581421, |
|
"learning_rate": 2.3982056590752246e-06, |
|
"loss": 0.5907, |
|
"step": 83600 |
|
}, |
|
{ |
|
"epoch": 4.79, |
|
"grad_norm": 1.1015737056732178, |
|
"learning_rate": 2.1681619507706463e-06, |
|
"loss": 0.6043, |
|
"step": 84000 |
|
}, |
|
{ |
|
"epoch": 4.81, |
|
"grad_norm": 1.356696367263794, |
|
"learning_rate": 1.9381182424660685e-06, |
|
"loss": 0.5883, |
|
"step": 84400 |
|
}, |
|
{ |
|
"epoch": 4.83, |
|
"grad_norm": 1.1508524417877197, |
|
"learning_rate": 1.7080745341614908e-06, |
|
"loss": 0.5899, |
|
"step": 84800 |
|
}, |
|
{ |
|
"epoch": 4.86, |
|
"grad_norm": 1.1132149696350098, |
|
"learning_rate": 1.478030825856913e-06, |
|
"loss": 0.5994, |
|
"step": 85200 |
|
}, |
|
{ |
|
"epoch": 4.88, |
|
"grad_norm": 1.306624174118042, |
|
"learning_rate": 1.2485622268230964e-06, |
|
"loss": 0.5891, |
|
"step": 85600 |
|
}, |
|
{ |
|
"epoch": 4.9, |
|
"grad_norm": 1.234307050704956, |
|
"learning_rate": 1.0185185185185188e-06, |
|
"loss": 0.5916, |
|
"step": 86000 |
|
}, |
|
{ |
|
"epoch": 4.92, |
|
"grad_norm": 1.0994372367858887, |
|
"learning_rate": 7.884748102139407e-07, |
|
"loss": 0.5908, |
|
"step": 86400 |
|
}, |
|
{ |
|
"epoch": 4.95, |
|
"grad_norm": 1.2712494134902954, |
|
"learning_rate": 5.584311019093628e-07, |
|
"loss": 0.5963, |
|
"step": 86800 |
|
}, |
|
{ |
|
"epoch": 4.97, |
|
"grad_norm": 1.2190104722976685, |
|
"learning_rate": 3.283873936047849e-07, |
|
"loss": 0.5902, |
|
"step": 87200 |
|
}, |
|
{ |
|
"epoch": 4.99, |
|
"grad_norm": 1.3301359415054321, |
|
"learning_rate": 9.891879457096849e-08, |
|
"loss": 0.591, |
|
"step": 87600 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_loss": 1.1109092235565186, |
|
"eval_runtime": 240.0172, |
|
"eval_samples_per_second": 251.128, |
|
"eval_steps_per_second": 3.925, |
|
"step": 87740 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"step": 87740, |
|
"total_flos": 1.7097588901675008e+18, |
|
"train_loss": 0.7401387438351292, |
|
"train_runtime": 31448.3717, |
|
"train_samples_per_second": 89.283, |
|
"train_steps_per_second": 2.79 |
|
} |
|
], |
|
"logging_steps": 400, |
|
"max_steps": 87740, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 5, |
|
"save_steps": 500, |
|
"total_flos": 1.7097588901675008e+18, |
|
"train_batch_size": 32, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|