|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.5067567567567568, |
|
"eval_steps": 9, |
|
"global_step": 75, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.006756756756756757, |
|
"grad_norm": 1.165550708770752, |
|
"learning_rate": 1e-05, |
|
"loss": 3.626, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.006756756756756757, |
|
"eval_loss": 3.7272541522979736, |
|
"eval_runtime": 4.139, |
|
"eval_samples_per_second": 30.2, |
|
"eval_steps_per_second": 3.866, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.013513513513513514, |
|
"grad_norm": 1.0644617080688477, |
|
"learning_rate": 2e-05, |
|
"loss": 3.8724, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.02027027027027027, |
|
"grad_norm": 1.1389989852905273, |
|
"learning_rate": 3e-05, |
|
"loss": 3.5915, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.02702702702702703, |
|
"grad_norm": 1.0900439023971558, |
|
"learning_rate": 4e-05, |
|
"loss": 3.7253, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.033783783783783786, |
|
"grad_norm": 1.1674542427062988, |
|
"learning_rate": 5e-05, |
|
"loss": 3.6544, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.04054054054054054, |
|
"grad_norm": 1.027275800704956, |
|
"learning_rate": 6e-05, |
|
"loss": 3.3359, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.0472972972972973, |
|
"grad_norm": 1.348042607307434, |
|
"learning_rate": 7e-05, |
|
"loss": 3.7027, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.05405405405405406, |
|
"grad_norm": 1.2195662260055542, |
|
"learning_rate": 8e-05, |
|
"loss": 3.4344, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.060810810810810814, |
|
"grad_norm": 1.1674548387527466, |
|
"learning_rate": 9e-05, |
|
"loss": 3.4253, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.060810810810810814, |
|
"eval_loss": 3.543837070465088, |
|
"eval_runtime": 3.5109, |
|
"eval_samples_per_second": 35.603, |
|
"eval_steps_per_second": 4.557, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.06756756756756757, |
|
"grad_norm": 1.3922748565673828, |
|
"learning_rate": 0.0001, |
|
"loss": 3.643, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.07432432432432433, |
|
"grad_norm": 1.216573715209961, |
|
"learning_rate": 9.99695413509548e-05, |
|
"loss": 3.5204, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.08108108108108109, |
|
"grad_norm": 1.1309475898742676, |
|
"learning_rate": 9.987820251299122e-05, |
|
"loss": 3.113, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.08783783783783784, |
|
"grad_norm": 1.4442179203033447, |
|
"learning_rate": 9.972609476841367e-05, |
|
"loss": 3.0735, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.0945945945945946, |
|
"grad_norm": 1.3493503332138062, |
|
"learning_rate": 9.951340343707852e-05, |
|
"loss": 3.2508, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.10135135135135136, |
|
"grad_norm": 1.3219938278198242, |
|
"learning_rate": 9.924038765061042e-05, |
|
"loss": 3.1881, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.10810810810810811, |
|
"grad_norm": 1.606813669204712, |
|
"learning_rate": 9.890738003669029e-05, |
|
"loss": 3.004, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.11486486486486487, |
|
"grad_norm": 1.4070640802383423, |
|
"learning_rate": 9.851478631379982e-05, |
|
"loss": 2.5799, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.12162162162162163, |
|
"grad_norm": 1.4162650108337402, |
|
"learning_rate": 9.806308479691595e-05, |
|
"loss": 2.6839, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.12162162162162163, |
|
"eval_loss": 2.5524935722351074, |
|
"eval_runtime": 3.5211, |
|
"eval_samples_per_second": 35.501, |
|
"eval_steps_per_second": 4.544, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.12837837837837837, |
|
"grad_norm": 1.2180479764938354, |
|
"learning_rate": 9.755282581475769e-05, |
|
"loss": 2.632, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.13513513513513514, |
|
"grad_norm": 1.2392053604125977, |
|
"learning_rate": 9.698463103929542e-05, |
|
"loss": 2.543, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.14189189189189189, |
|
"grad_norm": 1.225856900215149, |
|
"learning_rate": 9.635919272833938e-05, |
|
"loss": 2.3457, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.14864864864864866, |
|
"grad_norm": 1.4014432430267334, |
|
"learning_rate": 9.567727288213005e-05, |
|
"loss": 2.4257, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.1554054054054054, |
|
"grad_norm": 1.1311300992965698, |
|
"learning_rate": 9.493970231495835e-05, |
|
"loss": 1.9219, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.16216216216216217, |
|
"grad_norm": 1.2671700716018677, |
|
"learning_rate": 9.414737964294636e-05, |
|
"loss": 2.3732, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.16891891891891891, |
|
"grad_norm": 0.9614380598068237, |
|
"learning_rate": 9.330127018922194e-05, |
|
"loss": 2.2316, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.17567567567567569, |
|
"grad_norm": 1.0399982929229736, |
|
"learning_rate": 9.24024048078213e-05, |
|
"loss": 2.0554, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.18243243243243243, |
|
"grad_norm": 1.2994896173477173, |
|
"learning_rate": 9.145187862775209e-05, |
|
"loss": 2.1411, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.18243243243243243, |
|
"eval_loss": 2.0132856369018555, |
|
"eval_runtime": 3.5135, |
|
"eval_samples_per_second": 35.577, |
|
"eval_steps_per_second": 4.554, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.1891891891891892, |
|
"grad_norm": 1.0128037929534912, |
|
"learning_rate": 9.045084971874738e-05, |
|
"loss": 1.8349, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.19594594594594594, |
|
"grad_norm": 1.0621966123580933, |
|
"learning_rate": 8.940053768033609e-05, |
|
"loss": 1.7304, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.20270270270270271, |
|
"grad_norm": 0.9444336891174316, |
|
"learning_rate": 8.83022221559489e-05, |
|
"loss": 1.8391, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.20945945945945946, |
|
"grad_norm": 1.013601541519165, |
|
"learning_rate": 8.715724127386972e-05, |
|
"loss": 1.9608, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.21621621621621623, |
|
"grad_norm": 1.1101553440093994, |
|
"learning_rate": 8.596699001693255e-05, |
|
"loss": 1.9638, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.22297297297297297, |
|
"grad_norm": 0.9574841856956482, |
|
"learning_rate": 8.473291852294987e-05, |
|
"loss": 2.0326, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.22972972972972974, |
|
"grad_norm": 0.9780818223953247, |
|
"learning_rate": 8.345653031794292e-05, |
|
"loss": 1.7894, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.23648648648648649, |
|
"grad_norm": 1.0742874145507812, |
|
"learning_rate": 8.213938048432697e-05, |
|
"loss": 1.9416, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.24324324324324326, |
|
"grad_norm": 0.9108137488365173, |
|
"learning_rate": 8.07830737662829e-05, |
|
"loss": 1.861, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.24324324324324326, |
|
"eval_loss": 1.8477528095245361, |
|
"eval_runtime": 3.5122, |
|
"eval_samples_per_second": 35.59, |
|
"eval_steps_per_second": 4.556, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.9335376024246216, |
|
"learning_rate": 7.938926261462366e-05, |
|
"loss": 1.8027, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.25675675675675674, |
|
"grad_norm": 1.0257062911987305, |
|
"learning_rate": 7.795964517353735e-05, |
|
"loss": 1.8306, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.2635135135135135, |
|
"grad_norm": 0.8713502883911133, |
|
"learning_rate": 7.649596321166024e-05, |
|
"loss": 1.5814, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.2702702702702703, |
|
"grad_norm": 1.0091434717178345, |
|
"learning_rate": 7.500000000000001e-05, |
|
"loss": 1.5569, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.27702702702702703, |
|
"grad_norm": 0.9113713502883911, |
|
"learning_rate": 7.347357813929454e-05, |
|
"loss": 1.7839, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.28378378378378377, |
|
"grad_norm": 0.8845016360282898, |
|
"learning_rate": 7.191855733945387e-05, |
|
"loss": 1.6395, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.2905405405405405, |
|
"grad_norm": 0.9751928448677063, |
|
"learning_rate": 7.033683215379002e-05, |
|
"loss": 2.0989, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.2972972972972973, |
|
"grad_norm": 1.0509858131408691, |
|
"learning_rate": 6.873032967079561e-05, |
|
"loss": 1.6493, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.30405405405405406, |
|
"grad_norm": 1.0471868515014648, |
|
"learning_rate": 6.710100716628344e-05, |
|
"loss": 1.6431, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.30405405405405406, |
|
"eval_loss": 1.7883583307266235, |
|
"eval_runtime": 3.5109, |
|
"eval_samples_per_second": 35.604, |
|
"eval_steps_per_second": 4.557, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.3108108108108108, |
|
"grad_norm": 1.315850019454956, |
|
"learning_rate": 6.545084971874738e-05, |
|
"loss": 1.5634, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.31756756756756754, |
|
"grad_norm": 0.9055303335189819, |
|
"learning_rate": 6.378186779084995e-05, |
|
"loss": 1.7563, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.32432432432432434, |
|
"grad_norm": 0.9029581546783447, |
|
"learning_rate": 6.209609477998338e-05, |
|
"loss": 1.822, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.3310810810810811, |
|
"grad_norm": 0.8379126191139221, |
|
"learning_rate": 6.0395584540887963e-05, |
|
"loss": 1.5573, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.33783783783783783, |
|
"grad_norm": 1.1745027303695679, |
|
"learning_rate": 5.868240888334653e-05, |
|
"loss": 2.062, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.34459459459459457, |
|
"grad_norm": 1.1078695058822632, |
|
"learning_rate": 5.695865504800327e-05, |
|
"loss": 2.1678, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.35135135135135137, |
|
"grad_norm": 0.9430006742477417, |
|
"learning_rate": 5.522642316338268e-05, |
|
"loss": 1.8513, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.3581081081081081, |
|
"grad_norm": 0.9669394493103027, |
|
"learning_rate": 5.348782368720626e-05, |
|
"loss": 1.8486, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.36486486486486486, |
|
"grad_norm": 0.9533030390739441, |
|
"learning_rate": 5.174497483512506e-05, |
|
"loss": 1.8129, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.36486486486486486, |
|
"eval_loss": 1.7548061609268188, |
|
"eval_runtime": 3.5366, |
|
"eval_samples_per_second": 35.345, |
|
"eval_steps_per_second": 4.524, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.3716216216216216, |
|
"grad_norm": 0.8269166350364685, |
|
"learning_rate": 5e-05, |
|
"loss": 1.6914, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.3783783783783784, |
|
"grad_norm": 0.9480825662612915, |
|
"learning_rate": 4.825502516487497e-05, |
|
"loss": 1.8577, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.38513513513513514, |
|
"grad_norm": 0.9473639130592346, |
|
"learning_rate": 4.6512176312793736e-05, |
|
"loss": 1.9744, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.3918918918918919, |
|
"grad_norm": 0.9393849968910217, |
|
"learning_rate": 4.477357683661734e-05, |
|
"loss": 1.8944, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.39864864864864863, |
|
"grad_norm": 0.848622739315033, |
|
"learning_rate": 4.3041344951996746e-05, |
|
"loss": 1.619, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.40540540540540543, |
|
"grad_norm": 0.9589544534683228, |
|
"learning_rate": 4.131759111665349e-05, |
|
"loss": 1.8031, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.41216216216216217, |
|
"grad_norm": 0.9193155169487, |
|
"learning_rate": 3.960441545911204e-05, |
|
"loss": 1.57, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.4189189189189189, |
|
"grad_norm": 0.8888735175132751, |
|
"learning_rate": 3.790390522001662e-05, |
|
"loss": 1.5869, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.42567567567567566, |
|
"grad_norm": 0.9404612183570862, |
|
"learning_rate": 3.6218132209150045e-05, |
|
"loss": 1.8206, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.42567567567567566, |
|
"eval_loss": 1.7357913255691528, |
|
"eval_runtime": 3.5129, |
|
"eval_samples_per_second": 35.583, |
|
"eval_steps_per_second": 4.555, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.43243243243243246, |
|
"grad_norm": 0.8742846846580505, |
|
"learning_rate": 3.4549150281252636e-05, |
|
"loss": 1.5211, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.4391891891891892, |
|
"grad_norm": 0.9241499304771423, |
|
"learning_rate": 3.289899283371657e-05, |
|
"loss": 2.082, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.44594594594594594, |
|
"grad_norm": 0.9210823774337769, |
|
"learning_rate": 3.12696703292044e-05, |
|
"loss": 1.7166, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.4527027027027027, |
|
"grad_norm": 0.8455032706260681, |
|
"learning_rate": 2.9663167846209998e-05, |
|
"loss": 1.5984, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.4594594594594595, |
|
"grad_norm": 0.9703152775764465, |
|
"learning_rate": 2.8081442660546125e-05, |
|
"loss": 1.8997, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.46621621621621623, |
|
"grad_norm": 1.0417201519012451, |
|
"learning_rate": 2.6526421860705473e-05, |
|
"loss": 1.6946, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.47297297297297297, |
|
"grad_norm": 0.8623643517494202, |
|
"learning_rate": 2.500000000000001e-05, |
|
"loss": 1.8819, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.4797297297297297, |
|
"grad_norm": 0.8981446027755737, |
|
"learning_rate": 2.350403678833976e-05, |
|
"loss": 1.7159, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.4864864864864865, |
|
"grad_norm": 0.8115476965904236, |
|
"learning_rate": 2.2040354826462668e-05, |
|
"loss": 1.3985, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.4864864864864865, |
|
"eval_loss": 1.719651699066162, |
|
"eval_runtime": 3.5221, |
|
"eval_samples_per_second": 35.49, |
|
"eval_steps_per_second": 4.543, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.49324324324324326, |
|
"grad_norm": 0.8665674328804016, |
|
"learning_rate": 2.061073738537635e-05, |
|
"loss": 1.5179, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.9210397005081177, |
|
"learning_rate": 1.9216926233717085e-05, |
|
"loss": 1.7493, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.5067567567567568, |
|
"grad_norm": 0.8442562222480774, |
|
"learning_rate": 1.7860619515673033e-05, |
|
"loss": 1.8021, |
|
"step": 75 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 100, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 25, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 7771734228860928.0, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|