{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.05117543579082041, "eval_steps": 100, "global_step": 400, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00012793858947705101, "eval_loss": 2.7024941444396973, "eval_runtime": 240.6903, "eval_samples_per_second": 13.677, "eval_steps_per_second": 6.839, "step": 1 }, { "epoch": 0.0006396929473852551, "grad_norm": 1.2781825065612793, "learning_rate": 1.6666666666666667e-05, "loss": 1.7362, "step": 5 }, { "epoch": 0.0012793858947705101, "grad_norm": 1.6901952028274536, "learning_rate": 3.3333333333333335e-05, "loss": 2.0195, "step": 10 }, { "epoch": 0.0019190788421557653, "grad_norm": 2.2342793941497803, "learning_rate": 5e-05, "loss": 2.0461, "step": 15 }, { "epoch": 0.0025587717895410203, "grad_norm": 1.6336160898208618, "learning_rate": 6.666666666666667e-05, "loss": 1.6357, "step": 20 }, { "epoch": 0.0031984647369262755, "grad_norm": 2.752976894378662, "learning_rate": 8.333333333333334e-05, "loss": 1.1627, "step": 25 }, { "epoch": 0.0038381576843115306, "grad_norm": 1.8429460525512695, "learning_rate": 0.0001, "loss": 0.6154, "step": 30 }, { "epoch": 0.004477850631696785, "grad_norm": 1.7854329347610474, "learning_rate": 9.995494831023409e-05, "loss": 0.321, "step": 35 }, { "epoch": 0.005117543579082041, "grad_norm": 1.596530556678772, "learning_rate": 9.981987442712633e-05, "loss": 0.2499, "step": 40 }, { "epoch": 0.005757236526467296, "grad_norm": 1.2020143270492554, "learning_rate": 9.959502176294383e-05, "loss": 0.2349, "step": 45 }, { "epoch": 0.006396929473852551, "grad_norm": 1.5480122566223145, "learning_rate": 9.928079551738543e-05, "loss": 0.2652, "step": 50 }, { "epoch": 0.007036622421237806, "grad_norm": 1.2814526557922363, "learning_rate": 9.887776194738432e-05, "loss": 0.1417, "step": 55 }, { "epoch": 0.007676315368623061, "grad_norm": 0.6458909511566162, "learning_rate": 9.838664734667495e-05, "loss": 0.1702, "step": 60 }, { "epoch": 0.008316008316008316, "grad_norm": 0.6655418276786804, "learning_rate": 9.780833673696254e-05, "loss": 0.1657, "step": 65 }, { "epoch": 0.00895570126339357, "grad_norm": 0.7218236327171326, "learning_rate": 9.714387227305422e-05, "loss": 0.1586, "step": 70 }, { "epoch": 0.009595394210778827, "grad_norm": 0.7546175718307495, "learning_rate": 9.639445136482548e-05, "loss": 0.1706, "step": 75 }, { "epoch": 0.010235087158164081, "grad_norm": 0.8237160444259644, "learning_rate": 9.55614245194068e-05, "loss": 0.1836, "step": 80 }, { "epoch": 0.010874780105549335, "grad_norm": 0.7809853553771973, "learning_rate": 9.464629290747842e-05, "loss": 0.1837, "step": 85 }, { "epoch": 0.011514473052934592, "grad_norm": 0.6915144920349121, "learning_rate": 9.365070565805941e-05, "loss": 0.1868, "step": 90 }, { "epoch": 0.012154166000319846, "grad_norm": 0.8375842571258545, "learning_rate": 9.257645688666556e-05, "loss": 0.2455, "step": 95 }, { "epoch": 0.012793858947705102, "grad_norm": 0.9935116767883301, "learning_rate": 9.142548246219212e-05, "loss": 0.2471, "step": 100 }, { "epoch": 0.012793858947705102, "eval_loss": 0.18092785775661469, "eval_runtime": 244.1307, "eval_samples_per_second": 13.485, "eval_steps_per_second": 6.742, "step": 100 }, { "epoch": 0.013433551895090356, "grad_norm": 0.42624446749687195, "learning_rate": 9.019985651834703e-05, "loss": 0.1207, "step": 105 }, { "epoch": 0.014073244842475612, "grad_norm": 0.3908662796020508, "learning_rate": 8.890178771592199e-05, "loss": 0.1382, "step": 110 }, { "epoch": 0.014712937789860867, "grad_norm": 0.524199366569519, "learning_rate": 8.753361526263621e-05, "loss": 0.1309, "step": 115 }, { "epoch": 0.015352630737246123, "grad_norm": 0.5209535360336304, "learning_rate": 8.609780469772623e-05, "loss": 0.1512, "step": 120 }, { "epoch": 0.01599232368463138, "grad_norm": 0.4131312370300293, "learning_rate": 8.459694344887732e-05, "loss": 0.1443, "step": 125 }, { "epoch": 0.016632016632016633, "grad_norm": 0.5860401391983032, "learning_rate": 8.303373616950408e-05, "loss": 0.1718, "step": 130 }, { "epoch": 0.017271709579401887, "grad_norm": 0.5590812563896179, "learning_rate": 8.141099986478212e-05, "loss": 0.1659, "step": 135 }, { "epoch": 0.01791140252678714, "grad_norm": 0.4526077210903168, "learning_rate": 7.973165881521434e-05, "loss": 0.1862, "step": 140 }, { "epoch": 0.018551095474172396, "grad_norm": 0.5904171466827393, "learning_rate": 7.799873930687978e-05, "loss": 0.2245, "step": 145 }, { "epoch": 0.019190788421557654, "grad_norm": 0.8725635409355164, "learning_rate": 7.621536417786159e-05, "loss": 0.2688, "step": 150 }, { "epoch": 0.019830481368942908, "grad_norm": 0.32253244519233704, "learning_rate": 7.438474719068173e-05, "loss": 0.1232, "step": 155 }, { "epoch": 0.020470174316328162, "grad_norm": 0.3636815845966339, "learning_rate": 7.251018724088367e-05, "loss": 0.1412, "step": 160 }, { "epoch": 0.021109867263713417, "grad_norm": 0.42371389269828796, "learning_rate": 7.059506241219965e-05, "loss": 0.1479, "step": 165 }, { "epoch": 0.02174956021109867, "grad_norm": 0.28215959668159485, "learning_rate": 6.864282388901544e-05, "loss": 0.1513, "step": 170 }, { "epoch": 0.02238925315848393, "grad_norm": 0.6038751006126404, "learning_rate": 6.665698973710288e-05, "loss": 0.1708, "step": 175 }, { "epoch": 0.023028946105869183, "grad_norm": 0.3659113943576813, "learning_rate": 6.464113856382752e-05, "loss": 0.1717, "step": 180 }, { "epoch": 0.023668639053254437, "grad_norm": 0.4708649218082428, "learning_rate": 6.259890306925627e-05, "loss": 0.1729, "step": 185 }, { "epoch": 0.02430833200063969, "grad_norm": 0.9622364640235901, "learning_rate": 6.0533963499786314e-05, "loss": 0.2008, "step": 190 }, { "epoch": 0.02494802494802495, "grad_norm": 0.5879449844360352, "learning_rate": 5.8450041016092464e-05, "loss": 0.2236, "step": 195 }, { "epoch": 0.025587717895410204, "grad_norm": 0.7524951696395874, "learning_rate": 5.6350890987343944e-05, "loss": 0.2424, "step": 200 }, { "epoch": 0.025587717895410204, "eval_loss": 0.1732141375541687, "eval_runtime": 244.2514, "eval_samples_per_second": 13.478, "eval_steps_per_second": 6.739, "step": 200 }, { "epoch": 0.026227410842795458, "grad_norm": 0.2855418026447296, "learning_rate": 5.4240296223775465e-05, "loss": 0.0997, "step": 205 }, { "epoch": 0.026867103790180712, "grad_norm": 0.3188900947570801, "learning_rate": 5.212206015980742e-05, "loss": 0.1139, "step": 210 }, { "epoch": 0.027506796737565967, "grad_norm": 0.38831430673599243, "learning_rate": 5e-05, "loss": 0.136, "step": 215 }, { "epoch": 0.028146489684951224, "grad_norm": 0.39263615012168884, "learning_rate": 4.78779398401926e-05, "loss": 0.1322, "step": 220 }, { "epoch": 0.02878618263233648, "grad_norm": 0.6268131136894226, "learning_rate": 4.575970377622456e-05, "loss": 0.1702, "step": 225 }, { "epoch": 0.029425875579721733, "grad_norm": 0.3940875828266144, "learning_rate": 4.364910901265606e-05, "loss": 0.175, "step": 230 }, { "epoch": 0.030065568527106987, "grad_norm": 0.43840473890304565, "learning_rate": 4.1549958983907555e-05, "loss": 0.1645, "step": 235 }, { "epoch": 0.030705261474492245, "grad_norm": 0.5527442693710327, "learning_rate": 3.94660365002137e-05, "loss": 0.1824, "step": 240 }, { "epoch": 0.0313449544218775, "grad_norm": 0.5612766146659851, "learning_rate": 3.740109693074375e-05, "loss": 0.2346, "step": 245 }, { "epoch": 0.03198464736926276, "grad_norm": 0.9959267973899841, "learning_rate": 3.5358861436172485e-05, "loss": 0.2655, "step": 250 }, { "epoch": 0.03262434031664801, "grad_norm": 0.36881548166275024, "learning_rate": 3.334301026289712e-05, "loss": 0.1131, "step": 255 }, { "epoch": 0.033264033264033266, "grad_norm": 0.31683406233787537, "learning_rate": 3.135717611098458e-05, "loss": 0.1326, "step": 260 }, { "epoch": 0.03390372621141852, "grad_norm": 0.3013302683830261, "learning_rate": 2.9404937587800375e-05, "loss": 0.1368, "step": 265 }, { "epoch": 0.034543419158803775, "grad_norm": 0.3135119676589966, "learning_rate": 2.748981275911633e-05, "loss": 0.1464, "step": 270 }, { "epoch": 0.03518311210618903, "grad_norm": 0.40254464745521545, "learning_rate": 2.5615252809318284e-05, "loss": 0.1499, "step": 275 }, { "epoch": 0.03582280505357428, "grad_norm": 0.28640156984329224, "learning_rate": 2.3784635822138424e-05, "loss": 0.1573, "step": 280 }, { "epoch": 0.03646249800095954, "grad_norm": 0.322263240814209, "learning_rate": 2.2001260693120233e-05, "loss": 0.1658, "step": 285 }, { "epoch": 0.03710219094834479, "grad_norm": 0.31009647250175476, "learning_rate": 2.026834118478567e-05, "loss": 0.1563, "step": 290 }, { "epoch": 0.03774188389573005, "grad_norm": 0.5385209918022156, "learning_rate": 1.858900013521788e-05, "loss": 0.2028, "step": 295 }, { "epoch": 0.03838157684311531, "grad_norm": 0.5858637094497681, "learning_rate": 1.6966263830495936e-05, "loss": 0.245, "step": 300 }, { "epoch": 0.03838157684311531, "eval_loss": 0.16857710480690002, "eval_runtime": 244.1346, "eval_samples_per_second": 13.484, "eval_steps_per_second": 6.742, "step": 300 }, { "epoch": 0.03902126979050056, "grad_norm": 0.20234771072864532, "learning_rate": 1.5403056551122697e-05, "loss": 0.1184, "step": 305 }, { "epoch": 0.039660962737885816, "grad_norm": 0.265678346157074, "learning_rate": 1.3902195302273779e-05, "loss": 0.123, "step": 310 }, { "epoch": 0.04030065568527107, "grad_norm": 0.4162317216396332, "learning_rate": 1.246638473736378e-05, "loss": 0.1302, "step": 315 }, { "epoch": 0.040940348632656325, "grad_norm": 0.3373229503631592, "learning_rate": 1.1098212284078036e-05, "loss": 0.1346, "step": 320 }, { "epoch": 0.04158004158004158, "grad_norm": 0.3058888018131256, "learning_rate": 9.800143481652979e-06, "loss": 0.1455, "step": 325 }, { "epoch": 0.04221973452742683, "grad_norm": 0.31154316663742065, "learning_rate": 8.574517537807897e-06, "loss": 0.1691, "step": 330 }, { "epoch": 0.04285942747481209, "grad_norm": 0.47663167119026184, "learning_rate": 7.423543113334436e-06, "loss": 0.1658, "step": 335 }, { "epoch": 0.04349912042219734, "grad_norm": 0.4618123769760132, "learning_rate": 6.349294341940593e-06, "loss": 0.1846, "step": 340 }, { "epoch": 0.0441388133695826, "grad_norm": 0.37711745500564575, "learning_rate": 5.353707092521582e-06, "loss": 0.1828, "step": 345 }, { "epoch": 0.04477850631696786, "grad_norm": 0.541806161403656, "learning_rate": 4.43857548059321e-06, "loss": 0.2395, "step": 350 }, { "epoch": 0.04541819926435311, "grad_norm": 0.2688872218132019, "learning_rate": 3.605548635174533e-06, "loss": 0.1082, "step": 355 }, { "epoch": 0.046057892211738366, "grad_norm": 0.26909637451171875, "learning_rate": 2.85612772694579e-06, "loss": 0.1264, "step": 360 }, { "epoch": 0.046697585159123624, "grad_norm": 0.3528990149497986, "learning_rate": 2.191663263037458e-06, "loss": 0.1372, "step": 365 }, { "epoch": 0.047337278106508875, "grad_norm": 0.3015151619911194, "learning_rate": 1.6133526533250565e-06, "loss": 0.1726, "step": 370 }, { "epoch": 0.04797697105389413, "grad_norm": 0.288101464509964, "learning_rate": 1.1222380526156928e-06, "loss": 0.1504, "step": 375 }, { "epoch": 0.04861666400127938, "grad_norm": 0.45127633213996887, "learning_rate": 7.192044826145771e-07, "loss": 0.1526, "step": 380 }, { "epoch": 0.04925635694866464, "grad_norm": 0.3559752404689789, "learning_rate": 4.049782370561583e-07, "loss": 0.1684, "step": 385 }, { "epoch": 0.0498960498960499, "grad_norm": 0.41711530089378357, "learning_rate": 1.8012557287367392e-07, "loss": 0.1758, "step": 390 }, { "epoch": 0.05053574284343515, "grad_norm": 0.4933544397354126, "learning_rate": 4.5051689765929214e-08, "loss": 0.196, "step": 395 }, { "epoch": 0.05117543579082041, "grad_norm": 1.311772346496582, "learning_rate": 0.0, "loss": 0.2441, "step": 400 }, { "epoch": 0.05117543579082041, "eval_loss": 0.16797442734241486, "eval_runtime": 244.0328, "eval_samples_per_second": 13.49, "eval_steps_per_second": 6.745, "step": 400 } ], "logging_steps": 5, "max_steps": 400, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.394108836872192e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }