{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.5152129817444218, "eval_steps": 31, "global_step": 310, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.008113590263691683, "eval_loss": 5.708795547485352, "eval_runtime": 34.9917, "eval_samples_per_second": 5.944, "eval_steps_per_second": 0.743, "step": 1 }, { "epoch": 0.02434077079107505, "grad_norm": 27.649152755737305, "learning_rate": 3e-05, "loss": 22.4374, "step": 3 }, { "epoch": 0.0486815415821501, "grad_norm": 24.12411880493164, "learning_rate": 6e-05, "loss": 20.3477, "step": 6 }, { "epoch": 0.07302231237322515, "grad_norm": 22.957651138305664, "learning_rate": 9e-05, "loss": 13.9237, "step": 9 }, { "epoch": 0.0973630831643002, "grad_norm": 22.14281463623047, "learning_rate": 9.999238475781957e-05, "loss": 8.2192, "step": 12 }, { "epoch": 0.12170385395537525, "grad_norm": 15.811911582946777, "learning_rate": 9.99524110790929e-05, "loss": 5.4619, "step": 15 }, { "epoch": 0.1460446247464503, "grad_norm": 17.17688751220703, "learning_rate": 9.987820251299122e-05, "loss": 4.1998, "step": 18 }, { "epoch": 0.17038539553752535, "grad_norm": 9.06302547454834, "learning_rate": 9.976980991835894e-05, "loss": 3.5102, "step": 21 }, { "epoch": 0.1947261663286004, "grad_norm": 9.071863174438477, "learning_rate": 9.962730758206611e-05, "loss": 2.9501, "step": 24 }, { "epoch": 0.21906693711967545, "grad_norm": 10.715943336486816, "learning_rate": 9.945079316809585e-05, "loss": 2.8262, "step": 27 }, { "epoch": 0.2434077079107505, "grad_norm": 6.811004161834717, "learning_rate": 9.924038765061042e-05, "loss": 2.6217, "step": 30 }, { "epoch": 0.2515212981744422, "eval_loss": 0.6682190299034119, "eval_runtime": 35.2448, "eval_samples_per_second": 5.902, "eval_steps_per_second": 0.738, "step": 31 }, { "epoch": 0.26774847870182555, "grad_norm": 5.427604675292969, "learning_rate": 9.899623523104149e-05, "loss": 2.4006, "step": 33 }, { "epoch": 0.2920892494929006, "grad_norm": 7.537416458129883, "learning_rate": 9.871850323926177e-05, "loss": 2.7397, "step": 36 }, { "epoch": 0.31643002028397565, "grad_norm": 8.674805641174316, "learning_rate": 9.84073820189054e-05, "loss": 2.4828, "step": 39 }, { "epoch": 0.3407707910750507, "grad_norm": 8.167730331420898, "learning_rate": 9.806308479691595e-05, "loss": 2.5302, "step": 42 }, { "epoch": 0.36511156186612576, "grad_norm": 10.937932014465332, "learning_rate": 9.768584753741134e-05, "loss": 2.3383, "step": 45 }, { "epoch": 0.3894523326572008, "grad_norm": 3.8246636390686035, "learning_rate": 9.727592877996585e-05, "loss": 2.212, "step": 48 }, { "epoch": 0.41379310344827586, "grad_norm": 4.667800426483154, "learning_rate": 9.683360946241989e-05, "loss": 2.3763, "step": 51 }, { "epoch": 0.4381338742393509, "grad_norm": 4.990477561950684, "learning_rate": 9.635919272833938e-05, "loss": 2.3995, "step": 54 }, { "epoch": 0.46247464503042596, "grad_norm": 9.09072494506836, "learning_rate": 9.58530037192562e-05, "loss": 2.545, "step": 57 }, { "epoch": 0.486815415821501, "grad_norm": 5.338695049285889, "learning_rate": 9.53153893518325e-05, "loss": 2.3183, "step": 60 }, { "epoch": 0.5030425963488844, "eval_loss": 0.5645254850387573, "eval_runtime": 13.4155, "eval_samples_per_second": 15.504, "eval_steps_per_second": 1.938, "step": 62 }, { "epoch": 0.5111561866125761, "grad_norm": 3.9001667499542236, "learning_rate": 9.474671808010126e-05, "loss": 2.3843, "step": 63 }, { "epoch": 0.5354969574036511, "grad_norm": 6.379414081573486, "learning_rate": 9.414737964294636e-05, "loss": 2.2963, "step": 66 }, { "epoch": 0.5598377281947262, "grad_norm": 4.97390079498291, "learning_rate": 9.351778479699499e-05, "loss": 2.2517, "step": 69 }, { "epoch": 0.5841784989858012, "grad_norm": 4.063937187194824, "learning_rate": 9.285836503510562e-05, "loss": 2.2437, "step": 72 }, { "epoch": 0.6085192697768763, "grad_norm": 3.602717161178589, "learning_rate": 9.21695722906443e-05, "loss": 2.1695, "step": 75 }, { "epoch": 0.6328600405679513, "grad_norm": 4.555128574371338, "learning_rate": 9.145187862775209e-05, "loss": 2.1648, "step": 78 }, { "epoch": 0.6572008113590264, "grad_norm": 5.0544867515563965, "learning_rate": 9.070577591781597e-05, "loss": 2.0922, "step": 81 }, { "epoch": 0.6815415821501014, "grad_norm": 4.92878532409668, "learning_rate": 8.993177550236464e-05, "loss": 2.141, "step": 84 }, { "epoch": 0.7058823529411765, "grad_norm": 3.6681647300720215, "learning_rate": 8.91304078426207e-05, "loss": 2.1638, "step": 87 }, { "epoch": 0.7302231237322515, "grad_norm": 4.451002597808838, "learning_rate": 8.83022221559489e-05, "loss": 2.1766, "step": 90 }, { "epoch": 0.7545638945233266, "grad_norm": 5.902685165405273, "learning_rate": 8.744778603945011e-05, "loss": 2.1005, "step": 93 }, { "epoch": 0.7545638945233266, "eval_loss": 0.5568718314170837, "eval_runtime": 13.4611, "eval_samples_per_second": 15.452, "eval_steps_per_second": 1.931, "step": 93 }, { "epoch": 0.7789046653144016, "grad_norm": 4.811842918395996, "learning_rate": 8.656768508095853e-05, "loss": 2.2144, "step": 96 }, { "epoch": 0.8032454361054767, "grad_norm": 4.506133556365967, "learning_rate": 8.566252245770909e-05, "loss": 2.0727, "step": 99 }, { "epoch": 0.8275862068965517, "grad_norm": 3.898118734359741, "learning_rate": 8.473291852294987e-05, "loss": 1.9802, "step": 102 }, { "epoch": 0.8519269776876268, "grad_norm": 4.056739807128906, "learning_rate": 8.377951038078302e-05, "loss": 2.034, "step": 105 }, { "epoch": 0.8762677484787018, "grad_norm": 4.857217788696289, "learning_rate": 8.280295144952536e-05, "loss": 1.9623, "step": 108 }, { "epoch": 0.9006085192697769, "grad_norm": 4.711864948272705, "learning_rate": 8.18039110138882e-05, "loss": 2.3024, "step": 111 }, { "epoch": 0.9249492900608519, "grad_norm": 4.590112686157227, "learning_rate": 8.07830737662829e-05, "loss": 1.9634, "step": 114 }, { "epoch": 0.949290060851927, "grad_norm": 6.305273056030273, "learning_rate": 7.974113933756707e-05, "loss": 2.0567, "step": 117 }, { "epoch": 0.973630831643002, "grad_norm": 3.979123115539551, "learning_rate": 7.86788218175523e-05, "loss": 2.024, "step": 120 }, { "epoch": 0.9979716024340771, "grad_norm": 3.4800965785980225, "learning_rate": 7.75968492656029e-05, "loss": 2.2663, "step": 123 }, { "epoch": 1.0060851926977687, "eval_loss": 0.5130282640457153, "eval_runtime": 13.4323, "eval_samples_per_second": 15.485, "eval_steps_per_second": 1.936, "step": 124 }, { "epoch": 1.0223123732251522, "grad_norm": 4.823460578918457, "learning_rate": 7.649596321166024e-05, "loss": 1.8243, "step": 126 }, { "epoch": 1.0466531440162272, "grad_norm": 3.5276401042938232, "learning_rate": 7.537691814803521e-05, "loss": 1.8807, "step": 129 }, { "epoch": 1.0709939148073022, "grad_norm": 4.932689189910889, "learning_rate": 7.424048101231686e-05, "loss": 1.6877, "step": 132 }, { "epoch": 1.0953346855983772, "grad_norm": 4.79948091506958, "learning_rate": 7.308743066175172e-05, "loss": 1.675, "step": 135 }, { "epoch": 1.1196754563894524, "grad_norm": 5.320326328277588, "learning_rate": 7.191855733945387e-05, "loss": 1.8877, "step": 138 }, { "epoch": 1.1440162271805274, "grad_norm": 5.259227752685547, "learning_rate": 7.073466213281196e-05, "loss": 1.7584, "step": 141 }, { "epoch": 1.1683569979716024, "grad_norm": 3.7490923404693604, "learning_rate": 6.953655642446368e-05, "loss": 1.6734, "step": 144 }, { "epoch": 1.1926977687626774, "grad_norm": 5.446073532104492, "learning_rate": 6.832506133621487e-05, "loss": 1.9009, "step": 147 }, { "epoch": 1.2170385395537526, "grad_norm": 4.892331600189209, "learning_rate": 6.710100716628344e-05, "loss": 1.7449, "step": 150 }, { "epoch": 1.2413793103448276, "grad_norm": 5.024781227111816, "learning_rate": 6.586523282025462e-05, "loss": 1.7636, "step": 153 }, { "epoch": 1.2576064908722109, "eval_loss": 0.5101395845413208, "eval_runtime": 13.4502, "eval_samples_per_second": 15.464, "eval_steps_per_second": 1.933, "step": 155 }, { "epoch": 1.2657200811359026, "grad_norm": 4.381926536560059, "learning_rate": 6.461858523613684e-05, "loss": 1.8001, "step": 156 }, { "epoch": 1.2900608519269776, "grad_norm": 4.504693031311035, "learning_rate": 6.336191880391284e-05, "loss": 1.7454, "step": 159 }, { "epoch": 1.3144016227180528, "grad_norm": 5.055481910705566, "learning_rate": 6.209609477998338e-05, "loss": 1.7643, "step": 162 }, { "epoch": 1.3387423935091278, "grad_norm": 4.872294902801514, "learning_rate": 6.0821980696905146e-05, "loss": 1.6705, "step": 165 }, { "epoch": 1.3630831643002028, "grad_norm": 4.806439399719238, "learning_rate": 5.9540449768827246e-05, "loss": 1.7039, "step": 168 }, { "epoch": 1.3874239350912778, "grad_norm": 5.403498649597168, "learning_rate": 5.8252380293033884e-05, "loss": 1.7613, "step": 171 }, { "epoch": 1.4117647058823528, "grad_norm": 3.657991647720337, "learning_rate": 5.695865504800327e-05, "loss": 1.8176, "step": 174 }, { "epoch": 1.436105476673428, "grad_norm": 4.227349281311035, "learning_rate": 5.566016068839535e-05, "loss": 1.715, "step": 177 }, { "epoch": 1.460446247464503, "grad_norm": 3.618288278579712, "learning_rate": 5.435778713738292e-05, "loss": 1.6001, "step": 180 }, { "epoch": 1.484787018255578, "grad_norm": 6.720012187957764, "learning_rate": 5.3052426976742855e-05, "loss": 1.7047, "step": 183 }, { "epoch": 1.5091277890466532, "grad_norm": 6.494766712188721, "learning_rate": 5.174497483512506e-05, "loss": 1.7607, "step": 186 }, { "epoch": 1.5091277890466532, "eval_loss": 0.5061110258102417, "eval_runtime": 13.4249, "eval_samples_per_second": 15.494, "eval_steps_per_second": 1.937, "step": 186 }, { "epoch": 1.5334685598377282, "grad_norm": 15.1577730178833, "learning_rate": 5.04363267749187e-05, "loss": 1.6155, "step": 189 }, { "epoch": 1.5578093306288032, "grad_norm": 4.470703125, "learning_rate": 4.912737967813583e-05, "loss": 1.6756, "step": 192 }, { "epoch": 1.5821501014198782, "grad_norm": 5.094776630401611, "learning_rate": 4.781903063173321e-05, "loss": 1.5824, "step": 195 }, { "epoch": 1.6064908722109532, "grad_norm": 3.8165762424468994, "learning_rate": 4.6512176312793736e-05, "loss": 1.5824, "step": 198 }, { "epoch": 1.6308316430020284, "grad_norm": 5.532138347625732, "learning_rate": 4.52077123739888e-05, "loss": 1.744, "step": 201 }, { "epoch": 1.6551724137931034, "grad_norm": 4.430169582366943, "learning_rate": 4.390653282974264e-05, "loss": 1.6626, "step": 204 }, { "epoch": 1.6795131845841786, "grad_norm": 3.9386327266693115, "learning_rate": 4.260952944351947e-05, "loss": 1.4812, "step": 207 }, { "epoch": 1.7038539553752536, "grad_norm": 4.511838436126709, "learning_rate": 4.131759111665349e-05, "loss": 1.7028, "step": 210 }, { "epoch": 1.7281947261663286, "grad_norm": 4.205485820770264, "learning_rate": 4.003160327914015e-05, "loss": 1.761, "step": 213 }, { "epoch": 1.7525354969574036, "grad_norm": 4.336887836456299, "learning_rate": 3.875244728280676e-05, "loss": 1.5719, "step": 216 }, { "epoch": 1.7606490872210954, "eval_loss": 0.49112430214881897, "eval_runtime": 13.4322, "eval_samples_per_second": 15.485, "eval_steps_per_second": 1.936, "step": 217 }, { "epoch": 1.7768762677484786, "grad_norm": 5.897202968597412, "learning_rate": 3.748099979727792e-05, "loss": 1.6029, "step": 219 }, { "epoch": 1.8012170385395536, "grad_norm": 4.379933834075928, "learning_rate": 3.6218132209150045e-05, "loss": 1.6497, "step": 222 }, { "epoch": 1.8255578093306288, "grad_norm": 4.874183177947998, "learning_rate": 3.4964710024786354e-05, "loss": 1.6421, "step": 225 }, { "epoch": 1.8498985801217038, "grad_norm": 4.163862705230713, "learning_rate": 3.372159227714218e-05, "loss": 1.4842, "step": 228 }, { "epoch": 1.874239350912779, "grad_norm": 4.190274238586426, "learning_rate": 3.248963093702663e-05, "loss": 1.6199, "step": 231 }, { "epoch": 1.898580121703854, "grad_norm": 4.762985706329346, "learning_rate": 3.12696703292044e-05, "loss": 1.6452, "step": 234 }, { "epoch": 1.922920892494929, "grad_norm": 5.081422805786133, "learning_rate": 3.006254655373769e-05, "loss": 1.6196, "step": 237 }, { "epoch": 1.947261663286004, "grad_norm": 4.602591037750244, "learning_rate": 2.886908691296504e-05, "loss": 1.6198, "step": 240 }, { "epoch": 1.971602434077079, "grad_norm": 4.777091026306152, "learning_rate": 2.7690109344509563e-05, "loss": 1.6883, "step": 243 }, { "epoch": 1.995943204868154, "grad_norm": 5.713743209838867, "learning_rate": 2.6526421860705473e-05, "loss": 1.5588, "step": 246 }, { "epoch": 2.0121703853955375, "eval_loss": 0.48639488220214844, "eval_runtime": 13.4291, "eval_samples_per_second": 15.489, "eval_steps_per_second": 1.936, "step": 248 }, { "epoch": 2.020283975659229, "grad_norm": 3.395972967147827, "learning_rate": 2.537882199482665e-05, "loss": 1.1378, "step": 249 }, { "epoch": 2.0446247464503045, "grad_norm": 4.442982196807861, "learning_rate": 2.4248096254497288e-05, "loss": 1.253, "step": 252 }, { "epoch": 2.0689655172413794, "grad_norm": 5.2550764083862305, "learning_rate": 2.3135019582658802e-05, "loss": 1.0433, "step": 255 }, { "epoch": 2.0933062880324544, "grad_norm": 4.615274429321289, "learning_rate": 2.2040354826462668e-05, "loss": 1.1078, "step": 258 }, { "epoch": 2.1176470588235294, "grad_norm": 5.723622798919678, "learning_rate": 2.0964852214453013e-05, "loss": 0.9585, "step": 261 }, { "epoch": 2.1419878296146044, "grad_norm": 4.3719587326049805, "learning_rate": 1.9909248842397584e-05, "loss": 0.9587, "step": 264 }, { "epoch": 2.1663286004056794, "grad_norm": 6.265243053436279, "learning_rate": 1.887426816811903e-05, "loss": 0.9681, "step": 267 }, { "epoch": 2.1906693711967544, "grad_norm": 5.796363830566406, "learning_rate": 1.7860619515673033e-05, "loss": 1.0059, "step": 270 }, { "epoch": 2.2150101419878294, "grad_norm": 5.817225456237793, "learning_rate": 1.6868997589213136e-05, "loss": 1.0253, "step": 273 }, { "epoch": 2.239350912778905, "grad_norm": 4.450856685638428, "learning_rate": 1.5900081996875083e-05, "loss": 0.7533, "step": 276 }, { "epoch": 2.26369168356998, "grad_norm": 7.340899467468262, "learning_rate": 1.4954536785007456e-05, "loss": 0.9246, "step": 279 }, { "epoch": 2.26369168356998, "eval_loss": 0.5402039885520935, "eval_runtime": 13.4355, "eval_samples_per_second": 15.481, "eval_steps_per_second": 1.935, "step": 279 }, { "epoch": 2.288032454361055, "grad_norm": 5.454842567443848, "learning_rate": 1.4033009983067452e-05, "loss": 1.0822, "step": 282 }, { "epoch": 2.31237322515213, "grad_norm": 5.592954635620117, "learning_rate": 1.3136133159493802e-05, "loss": 0.9626, "step": 285 }, { "epoch": 2.336713995943205, "grad_norm": 5.249399662017822, "learning_rate": 1.22645209888614e-05, "loss": 0.8161, "step": 288 }, { "epoch": 2.36105476673428, "grad_norm": 5.653781890869141, "learning_rate": 1.1418770830614013e-05, "loss": 0.9639, "step": 291 }, { "epoch": 2.385395537525355, "grad_norm": 5.4389119148254395, "learning_rate": 1.0599462319663905e-05, "loss": 0.868, "step": 294 }, { "epoch": 2.40973630831643, "grad_norm": 5.680983543395996, "learning_rate": 9.807156969139136e-06, "loss": 1.1558, "step": 297 }, { "epoch": 2.4340770791075053, "grad_norm": 8.884838104248047, "learning_rate": 9.042397785550405e-06, "loss": 0.9113, "step": 300 }, { "epoch": 2.4584178498985803, "grad_norm": 6.362790107727051, "learning_rate": 8.305708896641594e-06, "loss": 1.0533, "step": 303 }, { "epoch": 2.4827586206896552, "grad_norm": 6.722659111022949, "learning_rate": 7.597595192178702e-06, "loss": 0.945, "step": 306 }, { "epoch": 2.5070993914807302, "grad_norm": 4.19068717956543, "learning_rate": 6.918541977923709e-06, "loss": 0.8369, "step": 309 }, { "epoch": 2.5152129817444218, "eval_loss": 0.5427043437957764, "eval_runtime": 13.4361, "eval_samples_per_second": 15.481, "eval_steps_per_second": 1.935, "step": 310 } ], "logging_steps": 3, "max_steps": 370, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 31, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.118715981197476e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }