{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.05815644082582146,
  "eval_steps": 25,
  "global_step": 75,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0007754192110109528,
      "grad_norm": 13.743983268737793,
      "learning_rate": 3.3333333333333335e-05,
      "loss": 10.4552,
      "step": 1
    },
    {
      "epoch": 0.0007754192110109528,
      "eval_loss": 10.45703125,
      "eval_runtime": 17.3532,
      "eval_samples_per_second": 62.582,
      "eval_steps_per_second": 31.291,
      "step": 1
    },
    {
      "epoch": 0.0015508384220219056,
      "grad_norm": 15.994314193725586,
      "learning_rate": 6.666666666666667e-05,
      "loss": 10.4156,
      "step": 2
    },
    {
      "epoch": 0.0023262576330328583,
      "grad_norm": 17.251068115234375,
      "learning_rate": 0.0001,
      "loss": 10.3381,
      "step": 3
    },
    {
      "epoch": 0.0031016768440438112,
      "grad_norm": 13.243555068969727,
      "learning_rate": 9.99524110790929e-05,
      "loss": 10.5021,
      "step": 4
    },
    {
      "epoch": 0.003877096055054764,
      "grad_norm": 12.53420352935791,
      "learning_rate": 9.980973490458728e-05,
      "loss": 10.0685,
      "step": 5
    },
    {
      "epoch": 0.004652515266065717,
      "grad_norm": 13.710821151733398,
      "learning_rate": 9.957224306869053e-05,
      "loss": 9.9248,
      "step": 6
    },
    {
      "epoch": 0.0054279344770766695,
      "grad_norm": 14.05363941192627,
      "learning_rate": 9.924038765061042e-05,
      "loss": 9.9161,
      "step": 7
    },
    {
      "epoch": 0.0062033536880876225,
      "grad_norm": 13.404027938842773,
      "learning_rate": 9.881480035599667e-05,
      "loss": 9.5758,
      "step": 8
    },
    {
      "epoch": 0.006978772899098575,
      "grad_norm": 15.494867324829102,
      "learning_rate": 9.829629131445342e-05,
      "loss": 9.7933,
      "step": 9
    },
    {
      "epoch": 0.007754192110109528,
      "grad_norm": 21.381547927856445,
      "learning_rate": 9.768584753741134e-05,
      "loss": 9.1108,
      "step": 10
    },
    {
      "epoch": 0.00852961132112048,
      "grad_norm": 17.526710510253906,
      "learning_rate": 9.698463103929542e-05,
      "loss": 9.1686,
      "step": 11
    },
    {
      "epoch": 0.009305030532131433,
      "grad_norm": 16.56471824645996,
      "learning_rate": 9.619397662556435e-05,
      "loss": 9.0037,
      "step": 12
    },
    {
      "epoch": 0.010080449743142386,
      "grad_norm": 17.765729904174805,
      "learning_rate": 9.53153893518325e-05,
      "loss": 8.9391,
      "step": 13
    },
    {
      "epoch": 0.010855868954153339,
      "grad_norm": 18.965198516845703,
      "learning_rate": 9.435054165891109e-05,
      "loss": 8.794,
      "step": 14
    },
    {
      "epoch": 0.011631288165164292,
      "grad_norm": 20.40072250366211,
      "learning_rate": 9.330127018922194e-05,
      "loss": 8.0026,
      "step": 15
    },
    {
      "epoch": 0.012406707376175245,
      "grad_norm": 15.78911304473877,
      "learning_rate": 9.21695722906443e-05,
      "loss": 8.396,
      "step": 16
    },
    {
      "epoch": 0.013182126587186198,
      "grad_norm": 21.308853149414062,
      "learning_rate": 9.09576022144496e-05,
      "loss": 8.1367,
      "step": 17
    },
    {
      "epoch": 0.01395754579819715,
      "grad_norm": 17.67489242553711,
      "learning_rate": 8.966766701456177e-05,
      "loss": 7.9123,
      "step": 18
    },
    {
      "epoch": 0.014732965009208104,
      "grad_norm": 17.39400291442871,
      "learning_rate": 8.83022221559489e-05,
      "loss": 7.8025,
      "step": 19
    },
    {
      "epoch": 0.015508384220219057,
      "grad_norm": 18.812978744506836,
      "learning_rate": 8.68638668405062e-05,
      "loss": 7.7002,
      "step": 20
    },
    {
      "epoch": 0.01628380343123001,
      "grad_norm": 20.998043060302734,
      "learning_rate": 8.535533905932738e-05,
      "loss": 7.5169,
      "step": 21
    },
    {
      "epoch": 0.01705922264224096,
      "grad_norm": 17.887672424316406,
      "learning_rate": 8.377951038078302e-05,
      "loss": 7.2246,
      "step": 22
    },
    {
      "epoch": 0.017834641853251915,
      "grad_norm": 18.00612449645996,
      "learning_rate": 8.213938048432697e-05,
      "loss": 6.679,
      "step": 23
    },
    {
      "epoch": 0.018610061064262867,
      "grad_norm": 16.111370086669922,
      "learning_rate": 8.043807145043604e-05,
      "loss": 6.2643,
      "step": 24
    },
    {
      "epoch": 0.01938548027527382,
      "grad_norm": 22.258943557739258,
      "learning_rate": 7.86788218175523e-05,
      "loss": 6.6102,
      "step": 25
    },
    {
      "epoch": 0.01938548027527382,
      "eval_loss": 6.311874866485596,
      "eval_runtime": 17.3852,
      "eval_samples_per_second": 62.467,
      "eval_steps_per_second": 31.233,
      "step": 25
    },
    {
      "epoch": 0.020160899486284772,
      "grad_norm": 15.573179244995117,
      "learning_rate": 7.68649804173412e-05,
      "loss": 6.3441,
      "step": 26
    },
    {
      "epoch": 0.020936318697295727,
      "grad_norm": 15.074617385864258,
      "learning_rate": 7.500000000000001e-05,
      "loss": 6.2842,
      "step": 27
    },
    {
      "epoch": 0.021711737908306678,
      "grad_norm": 15.864977836608887,
      "learning_rate": 7.308743066175172e-05,
      "loss": 5.7357,
      "step": 28
    },
    {
      "epoch": 0.02248715711931763,
      "grad_norm": 19.382728576660156,
      "learning_rate": 7.113091308703498e-05,
      "loss": 6.0517,
      "step": 29
    },
    {
      "epoch": 0.023262576330328584,
      "grad_norm": 18.489917755126953,
      "learning_rate": 6.91341716182545e-05,
      "loss": 5.7561,
      "step": 30
    },
    {
      "epoch": 0.024037995541339535,
      "grad_norm": 11.977217674255371,
      "learning_rate": 6.710100716628344e-05,
      "loss": 5.3761,
      "step": 31
    },
    {
      "epoch": 0.02481341475235049,
      "grad_norm": 12.359403610229492,
      "learning_rate": 6.503528997521366e-05,
      "loss": 5.6858,
      "step": 32
    },
    {
      "epoch": 0.02558883396336144,
      "grad_norm": 17.48925018310547,
      "learning_rate": 6.294095225512603e-05,
      "loss": 5.4112,
      "step": 33
    },
    {
      "epoch": 0.026364253174372396,
      "grad_norm": 12.810553550720215,
      "learning_rate": 6.0821980696905146e-05,
      "loss": 5.1038,
      "step": 34
    },
    {
      "epoch": 0.027139672385383347,
      "grad_norm": 11.41131591796875,
      "learning_rate": 5.868240888334653e-05,
      "loss": 5.1019,
      "step": 35
    },
    {
      "epoch": 0.0279150915963943,
      "grad_norm": 11.404109001159668,
      "learning_rate": 5.6526309611002594e-05,
      "loss": 5.0096,
      "step": 36
    },
    {
      "epoch": 0.028690510807405253,
      "grad_norm": 10.934562683105469,
      "learning_rate": 5.435778713738292e-05,
      "loss": 4.8367,
      "step": 37
    },
    {
      "epoch": 0.029465930018416207,
      "grad_norm": 11.160900115966797,
      "learning_rate": 5.218096936826681e-05,
      "loss": 5.009,
      "step": 38
    },
    {
      "epoch": 0.03024134922942716,
      "grad_norm": 9.400223731994629,
      "learning_rate": 5e-05,
      "loss": 5.121,
      "step": 39
    },
    {
      "epoch": 0.031016768440438113,
      "grad_norm": 9.578547477722168,
      "learning_rate": 4.781903063173321e-05,
      "loss": 4.9154,
      "step": 40
    },
    {
      "epoch": 0.03179218765144907,
      "grad_norm": 8.08479118347168,
      "learning_rate": 4.564221286261709e-05,
      "loss": 4.4877,
      "step": 41
    },
    {
      "epoch": 0.03256760686246002,
      "grad_norm": 9.323592185974121,
      "learning_rate": 4.347369038899744e-05,
      "loss": 4.7919,
      "step": 42
    },
    {
      "epoch": 0.03334302607347097,
      "grad_norm": 12.592964172363281,
      "learning_rate": 4.131759111665349e-05,
      "loss": 4.9646,
      "step": 43
    },
    {
      "epoch": 0.03411844528448192,
      "grad_norm": 8.313202857971191,
      "learning_rate": 3.917801930309486e-05,
      "loss": 4.5304,
      "step": 44
    },
    {
      "epoch": 0.03489386449549287,
      "grad_norm": 9.623037338256836,
      "learning_rate": 3.705904774487396e-05,
      "loss": 4.5277,
      "step": 45
    },
    {
      "epoch": 0.03566928370650383,
      "grad_norm": 6.551573753356934,
      "learning_rate": 3.4964710024786354e-05,
      "loss": 4.7194,
      "step": 46
    },
    {
      "epoch": 0.03644470291751478,
      "grad_norm": 9.742829322814941,
      "learning_rate": 3.289899283371657e-05,
      "loss": 4.4445,
      "step": 47
    },
    {
      "epoch": 0.03722012212852573,
      "grad_norm": 6.696950435638428,
      "learning_rate": 3.086582838174551e-05,
      "loss": 4.3009,
      "step": 48
    },
    {
      "epoch": 0.037995541339536684,
      "grad_norm": 6.955257415771484,
      "learning_rate": 2.886908691296504e-05,
      "loss": 4.2683,
      "step": 49
    },
    {
      "epoch": 0.03877096055054764,
      "grad_norm": 11.268085479736328,
      "learning_rate": 2.6912569338248315e-05,
      "loss": 4.3482,
      "step": 50
    },
    {
      "epoch": 0.03877096055054764,
      "eval_loss": 4.515497207641602,
      "eval_runtime": 17.341,
      "eval_samples_per_second": 62.626,
      "eval_steps_per_second": 31.313,
      "step": 50
    },
    {
      "epoch": 0.039546379761558594,
      "grad_norm": 7.341830253601074,
      "learning_rate": 2.500000000000001e-05,
      "loss": 4.3773,
      "step": 51
    },
    {
      "epoch": 0.040321798972569545,
      "grad_norm": 8.497244834899902,
      "learning_rate": 2.3135019582658802e-05,
      "loss": 4.6826,
      "step": 52
    },
    {
      "epoch": 0.041097218183580496,
      "grad_norm": 7.568642616271973,
      "learning_rate": 2.132117818244771e-05,
      "loss": 4.4897,
      "step": 53
    },
    {
      "epoch": 0.041872637394591454,
      "grad_norm": 7.663918495178223,
      "learning_rate": 1.9561928549563968e-05,
      "loss": 5.1256,
      "step": 54
    },
    {
      "epoch": 0.042648056605602405,
      "grad_norm": 6.138592720031738,
      "learning_rate": 1.7860619515673033e-05,
      "loss": 4.3068,
      "step": 55
    },
    {
      "epoch": 0.043423475816613356,
      "grad_norm": 6.359229564666748,
      "learning_rate": 1.622048961921699e-05,
      "loss": 4.6431,
      "step": 56
    },
    {
      "epoch": 0.04419889502762431,
      "grad_norm": 6.404804706573486,
      "learning_rate": 1.4644660940672627e-05,
      "loss": 4.4604,
      "step": 57
    },
    {
      "epoch": 0.04497431423863526,
      "grad_norm": 5.194124221801758,
      "learning_rate": 1.3136133159493802e-05,
      "loss": 4.4148,
      "step": 58
    },
    {
      "epoch": 0.04574973344964622,
      "grad_norm": 6.9789557456970215,
      "learning_rate": 1.1697777844051105e-05,
      "loss": 4.5563,
      "step": 59
    },
    {
      "epoch": 0.04652515266065717,
      "grad_norm": 6.215468406677246,
      "learning_rate": 1.0332332985438248e-05,
      "loss": 4.6267,
      "step": 60
    },
    {
      "epoch": 0.04730057187166812,
      "grad_norm": 5.669643878936768,
      "learning_rate": 9.042397785550405e-06,
      "loss": 4.3564,
      "step": 61
    },
    {
      "epoch": 0.04807599108267907,
      "grad_norm": 6.695188045501709,
      "learning_rate": 7.830427709355725e-06,
      "loss": 4.6728,
      "step": 62
    },
    {
      "epoch": 0.04885141029369003,
      "grad_norm": 6.3594069480896,
      "learning_rate": 6.698729810778065e-06,
      "loss": 4.4725,
      "step": 63
    },
    {
      "epoch": 0.04962682950470098,
      "grad_norm": 6.124025344848633,
      "learning_rate": 5.649458341088915e-06,
      "loss": 4.7726,
      "step": 64
    },
    {
      "epoch": 0.05040224871571193,
      "grad_norm": 6.306771755218506,
      "learning_rate": 4.684610648167503e-06,
      "loss": 4.7565,
      "step": 65
    },
    {
      "epoch": 0.05117766792672288,
      "grad_norm": 6.299499034881592,
      "learning_rate": 3.8060233744356633e-06,
      "loss": 4.1523,
      "step": 66
    },
    {
      "epoch": 0.05195308713773384,
      "grad_norm": 5.291078567504883,
      "learning_rate": 3.0153689607045845e-06,
      "loss": 4.4084,
      "step": 67
    },
    {
      "epoch": 0.05272850634874479,
      "grad_norm": 7.21616268157959,
      "learning_rate": 2.314152462588659e-06,
      "loss": 4.2842,
      "step": 68
    },
    {
      "epoch": 0.05350392555975574,
      "grad_norm": 5.060047626495361,
      "learning_rate": 1.70370868554659e-06,
      "loss": 4.4137,
      "step": 69
    },
    {
      "epoch": 0.054279344770766694,
      "grad_norm": 5.063629627227783,
      "learning_rate": 1.1851996440033319e-06,
      "loss": 4.4392,
      "step": 70
    },
    {
      "epoch": 0.05505476398177765,
      "grad_norm": 4.903884410858154,
      "learning_rate": 7.596123493895991e-07,
      "loss": 4.3334,
      "step": 71
    },
    {
      "epoch": 0.0558301831927886,
      "grad_norm": 6.626821517944336,
      "learning_rate": 4.277569313094809e-07,
      "loss": 4.8866,
      "step": 72
    },
    {
      "epoch": 0.056605602403799554,
      "grad_norm": 5.395899772644043,
      "learning_rate": 1.9026509541272275e-07,
      "loss": 4.1787,
      "step": 73
    },
    {
      "epoch": 0.057381021614810505,
      "grad_norm": 5.96885347366333,
      "learning_rate": 4.7588920907110094e-08,
      "loss": 4.9024,
      "step": 74
    },
    {
      "epoch": 0.05815644082582146,
      "grad_norm": 6.069552898406982,
      "learning_rate": 0.0,
      "loss": 4.4114,
      "step": 75
    },
    {
      "epoch": 0.05815644082582146,
      "eval_loss": 4.3782525062561035,
      "eval_runtime": 17.3208,
      "eval_samples_per_second": 62.699,
      "eval_steps_per_second": 31.35,
      "step": 75
    }
  ],
  "logging_steps": 1,
  "max_steps": 75,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 25,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 43502587084800.0,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}