|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.05117543579082041, |
|
"eval_steps": 100, |
|
"global_step": 400, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.00012793858947705101, |
|
"eval_loss": 2.7024941444396973, |
|
"eval_runtime": 240.6903, |
|
"eval_samples_per_second": 13.677, |
|
"eval_steps_per_second": 6.839, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0006396929473852551, |
|
"grad_norm": 1.2781825065612793, |
|
"learning_rate": 1.6666666666666667e-05, |
|
"loss": 1.7362, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.0012793858947705101, |
|
"grad_norm": 1.6901952028274536, |
|
"learning_rate": 3.3333333333333335e-05, |
|
"loss": 2.0195, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.0019190788421557653, |
|
"grad_norm": 2.2342793941497803, |
|
"learning_rate": 5e-05, |
|
"loss": 2.0461, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.0025587717895410203, |
|
"grad_norm": 1.6336160898208618, |
|
"learning_rate": 6.666666666666667e-05, |
|
"loss": 1.6357, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.0031984647369262755, |
|
"grad_norm": 2.752976894378662, |
|
"learning_rate": 8.333333333333334e-05, |
|
"loss": 1.1627, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.0038381576843115306, |
|
"grad_norm": 1.8429460525512695, |
|
"learning_rate": 0.0001, |
|
"loss": 0.6154, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.004477850631696785, |
|
"grad_norm": 1.7854329347610474, |
|
"learning_rate": 9.995494831023409e-05, |
|
"loss": 0.321, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.005117543579082041, |
|
"grad_norm": 1.596530556678772, |
|
"learning_rate": 9.981987442712633e-05, |
|
"loss": 0.2499, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.005757236526467296, |
|
"grad_norm": 1.2020143270492554, |
|
"learning_rate": 9.959502176294383e-05, |
|
"loss": 0.2349, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.006396929473852551, |
|
"grad_norm": 1.5480122566223145, |
|
"learning_rate": 9.928079551738543e-05, |
|
"loss": 0.2652, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.007036622421237806, |
|
"grad_norm": 1.2814526557922363, |
|
"learning_rate": 9.887776194738432e-05, |
|
"loss": 0.1417, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.007676315368623061, |
|
"grad_norm": 0.6458909511566162, |
|
"learning_rate": 9.838664734667495e-05, |
|
"loss": 0.1702, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.008316008316008316, |
|
"grad_norm": 0.6655418276786804, |
|
"learning_rate": 9.780833673696254e-05, |
|
"loss": 0.1657, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.00895570126339357, |
|
"grad_norm": 0.7218236327171326, |
|
"learning_rate": 9.714387227305422e-05, |
|
"loss": 0.1586, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.009595394210778827, |
|
"grad_norm": 0.7546175718307495, |
|
"learning_rate": 9.639445136482548e-05, |
|
"loss": 0.1706, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.010235087158164081, |
|
"grad_norm": 0.8237160444259644, |
|
"learning_rate": 9.55614245194068e-05, |
|
"loss": 0.1836, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.010874780105549335, |
|
"grad_norm": 0.7809853553771973, |
|
"learning_rate": 9.464629290747842e-05, |
|
"loss": 0.1837, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.011514473052934592, |
|
"grad_norm": 0.6915144920349121, |
|
"learning_rate": 9.365070565805941e-05, |
|
"loss": 0.1868, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.012154166000319846, |
|
"grad_norm": 0.8375842571258545, |
|
"learning_rate": 9.257645688666556e-05, |
|
"loss": 0.2455, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.012793858947705102, |
|
"grad_norm": 0.9935116767883301, |
|
"learning_rate": 9.142548246219212e-05, |
|
"loss": 0.2471, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.012793858947705102, |
|
"eval_loss": 0.18092785775661469, |
|
"eval_runtime": 244.1307, |
|
"eval_samples_per_second": 13.485, |
|
"eval_steps_per_second": 6.742, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.013433551895090356, |
|
"grad_norm": 0.42624446749687195, |
|
"learning_rate": 9.019985651834703e-05, |
|
"loss": 0.1207, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.014073244842475612, |
|
"grad_norm": 0.3908662796020508, |
|
"learning_rate": 8.890178771592199e-05, |
|
"loss": 0.1382, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.014712937789860867, |
|
"grad_norm": 0.524199366569519, |
|
"learning_rate": 8.753361526263621e-05, |
|
"loss": 0.1309, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.015352630737246123, |
|
"grad_norm": 0.5209535360336304, |
|
"learning_rate": 8.609780469772623e-05, |
|
"loss": 0.1512, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.01599232368463138, |
|
"grad_norm": 0.4131312370300293, |
|
"learning_rate": 8.459694344887732e-05, |
|
"loss": 0.1443, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.016632016632016633, |
|
"grad_norm": 0.5860401391983032, |
|
"learning_rate": 8.303373616950408e-05, |
|
"loss": 0.1718, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.017271709579401887, |
|
"grad_norm": 0.5590812563896179, |
|
"learning_rate": 8.141099986478212e-05, |
|
"loss": 0.1659, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.01791140252678714, |
|
"grad_norm": 0.4526077210903168, |
|
"learning_rate": 7.973165881521434e-05, |
|
"loss": 0.1862, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.018551095474172396, |
|
"grad_norm": 0.5904171466827393, |
|
"learning_rate": 7.799873930687978e-05, |
|
"loss": 0.2245, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.019190788421557654, |
|
"grad_norm": 0.8725635409355164, |
|
"learning_rate": 7.621536417786159e-05, |
|
"loss": 0.2688, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.019830481368942908, |
|
"grad_norm": 0.32253244519233704, |
|
"learning_rate": 7.438474719068173e-05, |
|
"loss": 0.1232, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.020470174316328162, |
|
"grad_norm": 0.3636815845966339, |
|
"learning_rate": 7.251018724088367e-05, |
|
"loss": 0.1412, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.021109867263713417, |
|
"grad_norm": 0.42371389269828796, |
|
"learning_rate": 7.059506241219965e-05, |
|
"loss": 0.1479, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.02174956021109867, |
|
"grad_norm": 0.28215959668159485, |
|
"learning_rate": 6.864282388901544e-05, |
|
"loss": 0.1513, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.02238925315848393, |
|
"grad_norm": 0.6038751006126404, |
|
"learning_rate": 6.665698973710288e-05, |
|
"loss": 0.1708, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.023028946105869183, |
|
"grad_norm": 0.3659113943576813, |
|
"learning_rate": 6.464113856382752e-05, |
|
"loss": 0.1717, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.023668639053254437, |
|
"grad_norm": 0.4708649218082428, |
|
"learning_rate": 6.259890306925627e-05, |
|
"loss": 0.1729, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.02430833200063969, |
|
"grad_norm": 0.9622364640235901, |
|
"learning_rate": 6.0533963499786314e-05, |
|
"loss": 0.2008, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.02494802494802495, |
|
"grad_norm": 0.5879449844360352, |
|
"learning_rate": 5.8450041016092464e-05, |
|
"loss": 0.2236, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.025587717895410204, |
|
"grad_norm": 0.7524951696395874, |
|
"learning_rate": 5.6350890987343944e-05, |
|
"loss": 0.2424, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.025587717895410204, |
|
"eval_loss": 0.1732141375541687, |
|
"eval_runtime": 244.2514, |
|
"eval_samples_per_second": 13.478, |
|
"eval_steps_per_second": 6.739, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.026227410842795458, |
|
"grad_norm": 0.2855418026447296, |
|
"learning_rate": 5.4240296223775465e-05, |
|
"loss": 0.0997, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.026867103790180712, |
|
"grad_norm": 0.3188900947570801, |
|
"learning_rate": 5.212206015980742e-05, |
|
"loss": 0.1139, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.027506796737565967, |
|
"grad_norm": 0.38831430673599243, |
|
"learning_rate": 5e-05, |
|
"loss": 0.136, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.028146489684951224, |
|
"grad_norm": 0.39263615012168884, |
|
"learning_rate": 4.78779398401926e-05, |
|
"loss": 0.1322, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.02878618263233648, |
|
"grad_norm": 0.6268131136894226, |
|
"learning_rate": 4.575970377622456e-05, |
|
"loss": 0.1702, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.029425875579721733, |
|
"grad_norm": 0.3940875828266144, |
|
"learning_rate": 4.364910901265606e-05, |
|
"loss": 0.175, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.030065568527106987, |
|
"grad_norm": 0.43840473890304565, |
|
"learning_rate": 4.1549958983907555e-05, |
|
"loss": 0.1645, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.030705261474492245, |
|
"grad_norm": 0.5527442693710327, |
|
"learning_rate": 3.94660365002137e-05, |
|
"loss": 0.1824, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.0313449544218775, |
|
"grad_norm": 0.5612766146659851, |
|
"learning_rate": 3.740109693074375e-05, |
|
"loss": 0.2346, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.03198464736926276, |
|
"grad_norm": 0.9959267973899841, |
|
"learning_rate": 3.5358861436172485e-05, |
|
"loss": 0.2655, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.03262434031664801, |
|
"grad_norm": 0.36881548166275024, |
|
"learning_rate": 3.334301026289712e-05, |
|
"loss": 0.1131, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.033264033264033266, |
|
"grad_norm": 0.31683406233787537, |
|
"learning_rate": 3.135717611098458e-05, |
|
"loss": 0.1326, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.03390372621141852, |
|
"grad_norm": 0.3013302683830261, |
|
"learning_rate": 2.9404937587800375e-05, |
|
"loss": 0.1368, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.034543419158803775, |
|
"grad_norm": 0.3135119676589966, |
|
"learning_rate": 2.748981275911633e-05, |
|
"loss": 0.1464, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.03518311210618903, |
|
"grad_norm": 0.40254464745521545, |
|
"learning_rate": 2.5615252809318284e-05, |
|
"loss": 0.1499, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.03582280505357428, |
|
"grad_norm": 0.28640156984329224, |
|
"learning_rate": 2.3784635822138424e-05, |
|
"loss": 0.1573, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.03646249800095954, |
|
"grad_norm": 0.322263240814209, |
|
"learning_rate": 2.2001260693120233e-05, |
|
"loss": 0.1658, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.03710219094834479, |
|
"grad_norm": 0.31009647250175476, |
|
"learning_rate": 2.026834118478567e-05, |
|
"loss": 0.1563, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.03774188389573005, |
|
"grad_norm": 0.5385209918022156, |
|
"learning_rate": 1.858900013521788e-05, |
|
"loss": 0.2028, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.03838157684311531, |
|
"grad_norm": 0.5858637094497681, |
|
"learning_rate": 1.6966263830495936e-05, |
|
"loss": 0.245, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.03838157684311531, |
|
"eval_loss": 0.16857710480690002, |
|
"eval_runtime": 244.1346, |
|
"eval_samples_per_second": 13.484, |
|
"eval_steps_per_second": 6.742, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.03902126979050056, |
|
"grad_norm": 0.20234771072864532, |
|
"learning_rate": 1.5403056551122697e-05, |
|
"loss": 0.1184, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.039660962737885816, |
|
"grad_norm": 0.265678346157074, |
|
"learning_rate": 1.3902195302273779e-05, |
|
"loss": 0.123, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.04030065568527107, |
|
"grad_norm": 0.4162317216396332, |
|
"learning_rate": 1.246638473736378e-05, |
|
"loss": 0.1302, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.040940348632656325, |
|
"grad_norm": 0.3373229503631592, |
|
"learning_rate": 1.1098212284078036e-05, |
|
"loss": 0.1346, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.04158004158004158, |
|
"grad_norm": 0.3058888018131256, |
|
"learning_rate": 9.800143481652979e-06, |
|
"loss": 0.1455, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.04221973452742683, |
|
"grad_norm": 0.31154316663742065, |
|
"learning_rate": 8.574517537807897e-06, |
|
"loss": 0.1691, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.04285942747481209, |
|
"grad_norm": 0.47663167119026184, |
|
"learning_rate": 7.423543113334436e-06, |
|
"loss": 0.1658, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.04349912042219734, |
|
"grad_norm": 0.4618123769760132, |
|
"learning_rate": 6.349294341940593e-06, |
|
"loss": 0.1846, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.0441388133695826, |
|
"grad_norm": 0.37711745500564575, |
|
"learning_rate": 5.353707092521582e-06, |
|
"loss": 0.1828, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.04477850631696786, |
|
"grad_norm": 0.541806161403656, |
|
"learning_rate": 4.43857548059321e-06, |
|
"loss": 0.2395, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.04541819926435311, |
|
"grad_norm": 0.2688872218132019, |
|
"learning_rate": 3.605548635174533e-06, |
|
"loss": 0.1082, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.046057892211738366, |
|
"grad_norm": 0.26909637451171875, |
|
"learning_rate": 2.85612772694579e-06, |
|
"loss": 0.1264, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.046697585159123624, |
|
"grad_norm": 0.3528990149497986, |
|
"learning_rate": 2.191663263037458e-06, |
|
"loss": 0.1372, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.047337278106508875, |
|
"grad_norm": 0.3015151619911194, |
|
"learning_rate": 1.6133526533250565e-06, |
|
"loss": 0.1726, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.04797697105389413, |
|
"grad_norm": 0.288101464509964, |
|
"learning_rate": 1.1222380526156928e-06, |
|
"loss": 0.1504, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.04861666400127938, |
|
"grad_norm": 0.45127633213996887, |
|
"learning_rate": 7.192044826145771e-07, |
|
"loss": 0.1526, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.04925635694866464, |
|
"grad_norm": 0.3559752404689789, |
|
"learning_rate": 4.049782370561583e-07, |
|
"loss": 0.1684, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.0498960498960499, |
|
"grad_norm": 0.41711530089378357, |
|
"learning_rate": 1.8012557287367392e-07, |
|
"loss": 0.1758, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.05053574284343515, |
|
"grad_norm": 0.4933544397354126, |
|
"learning_rate": 4.5051689765929214e-08, |
|
"loss": 0.196, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.05117543579082041, |
|
"grad_norm": 1.311772346496582, |
|
"learning_rate": 0.0, |
|
"loss": 0.2441, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.05117543579082041, |
|
"eval_loss": 0.16797442734241486, |
|
"eval_runtime": 244.0328, |
|
"eval_samples_per_second": 13.49, |
|
"eval_steps_per_second": 6.745, |
|
"step": 400 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 400, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.394108836872192e+17, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|