|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.44642857142857145, |
|
"eval_steps": 9, |
|
"global_step": 75, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.005952380952380952, |
|
"grad_norm": 1.8847510814666748, |
|
"learning_rate": 1e-05, |
|
"loss": 3.4453, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.005952380952380952, |
|
"eval_loss": 3.4847054481506348, |
|
"eval_runtime": 0.7278, |
|
"eval_samples_per_second": 776.315, |
|
"eval_steps_per_second": 24.732, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.011904761904761904, |
|
"grad_norm": 1.8967914581298828, |
|
"learning_rate": 2e-05, |
|
"loss": 3.5233, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.017857142857142856, |
|
"grad_norm": 1.9609869718551636, |
|
"learning_rate": 3e-05, |
|
"loss": 3.4954, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.023809523809523808, |
|
"grad_norm": 1.975792646408081, |
|
"learning_rate": 4e-05, |
|
"loss": 3.4817, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.02976190476190476, |
|
"grad_norm": 1.9651564359664917, |
|
"learning_rate": 5e-05, |
|
"loss": 3.476, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.03571428571428571, |
|
"grad_norm": 2.0456418991088867, |
|
"learning_rate": 6e-05, |
|
"loss": 3.4424, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.041666666666666664, |
|
"grad_norm": 1.925234317779541, |
|
"learning_rate": 7e-05, |
|
"loss": 3.3821, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.047619047619047616, |
|
"grad_norm": 1.7819530963897705, |
|
"learning_rate": 8e-05, |
|
"loss": 3.2534, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.05357142857142857, |
|
"grad_norm": 1.7863258123397827, |
|
"learning_rate": 9e-05, |
|
"loss": 3.1961, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.05357142857142857, |
|
"eval_loss": 3.0999181270599365, |
|
"eval_runtime": 0.729, |
|
"eval_samples_per_second": 775.008, |
|
"eval_steps_per_second": 24.691, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.05952380952380952, |
|
"grad_norm": 1.722019076347351, |
|
"learning_rate": 0.0001, |
|
"loss": 3.1102, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.06547619047619048, |
|
"grad_norm": 1.6802176237106323, |
|
"learning_rate": 9.99695413509548e-05, |
|
"loss": 3.0669, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.07142857142857142, |
|
"grad_norm": 1.68272864818573, |
|
"learning_rate": 9.987820251299122e-05, |
|
"loss": 2.7909, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.07738095238095238, |
|
"grad_norm": 1.5962737798690796, |
|
"learning_rate": 9.972609476841367e-05, |
|
"loss": 2.6373, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.08333333333333333, |
|
"grad_norm": 1.5708441734313965, |
|
"learning_rate": 9.951340343707852e-05, |
|
"loss": 2.5206, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.08928571428571429, |
|
"grad_norm": 1.5052292346954346, |
|
"learning_rate": 9.924038765061042e-05, |
|
"loss": 2.4557, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.09523809523809523, |
|
"grad_norm": 1.5192614793777466, |
|
"learning_rate": 9.890738003669029e-05, |
|
"loss": 2.3104, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.10119047619047619, |
|
"grad_norm": 1.5247867107391357, |
|
"learning_rate": 9.851478631379982e-05, |
|
"loss": 2.275, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.10714285714285714, |
|
"grad_norm": 1.5035266876220703, |
|
"learning_rate": 9.806308479691595e-05, |
|
"loss": 2.1103, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.10714285714285714, |
|
"eval_loss": 2.022083282470703, |
|
"eval_runtime": 0.7263, |
|
"eval_samples_per_second": 777.939, |
|
"eval_steps_per_second": 24.784, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.1130952380952381, |
|
"grad_norm": 1.5212229490280151, |
|
"learning_rate": 9.755282581475769e-05, |
|
"loss": 1.9887, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.11904761904761904, |
|
"grad_norm": 1.5906459093093872, |
|
"learning_rate": 9.698463103929542e-05, |
|
"loss": 1.9476, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.125, |
|
"grad_norm": 1.7577109336853027, |
|
"learning_rate": 9.635919272833938e-05, |
|
"loss": 1.7878, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.13095238095238096, |
|
"grad_norm": 1.9069538116455078, |
|
"learning_rate": 9.567727288213005e-05, |
|
"loss": 1.7624, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.13690476190476192, |
|
"grad_norm": 2.0785834789276123, |
|
"learning_rate": 9.493970231495835e-05, |
|
"loss": 1.6126, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.14285714285714285, |
|
"grad_norm": 2.226637363433838, |
|
"learning_rate": 9.414737964294636e-05, |
|
"loss": 1.4719, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.1488095238095238, |
|
"grad_norm": 3.0561819076538086, |
|
"learning_rate": 9.330127018922194e-05, |
|
"loss": 1.4277, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.15476190476190477, |
|
"grad_norm": 2.925053834915161, |
|
"learning_rate": 9.24024048078213e-05, |
|
"loss": 1.3428, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.16071428571428573, |
|
"grad_norm": 2.3791446685791016, |
|
"learning_rate": 9.145187862775209e-05, |
|
"loss": 1.3196, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.16071428571428573, |
|
"eval_loss": 1.1837644577026367, |
|
"eval_runtime": 0.749, |
|
"eval_samples_per_second": 754.302, |
|
"eval_steps_per_second": 24.031, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.16666666666666666, |
|
"grad_norm": 1.918161392211914, |
|
"learning_rate": 9.045084971874738e-05, |
|
"loss": 1.078, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.17261904761904762, |
|
"grad_norm": 1.787397027015686, |
|
"learning_rate": 8.940053768033609e-05, |
|
"loss": 1.0918, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.17857142857142858, |
|
"grad_norm": 1.718632459640503, |
|
"learning_rate": 8.83022221559489e-05, |
|
"loss": 1.0613, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.18452380952380953, |
|
"grad_norm": 1.6407841444015503, |
|
"learning_rate": 8.715724127386972e-05, |
|
"loss": 0.9426, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.19047619047619047, |
|
"grad_norm": 1.6310867071151733, |
|
"learning_rate": 8.596699001693255e-05, |
|
"loss": 0.9499, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.19642857142857142, |
|
"grad_norm": 1.6535135507583618, |
|
"learning_rate": 8.473291852294987e-05, |
|
"loss": 0.8009, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.20238095238095238, |
|
"grad_norm": 1.7860593795776367, |
|
"learning_rate": 8.345653031794292e-05, |
|
"loss": 0.698, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.20833333333333334, |
|
"grad_norm": 1.6134707927703857, |
|
"learning_rate": 8.213938048432697e-05, |
|
"loss": 0.6973, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.21428571428571427, |
|
"grad_norm": 1.5818922519683838, |
|
"learning_rate": 8.07830737662829e-05, |
|
"loss": 0.6982, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.21428571428571427, |
|
"eval_loss": 0.591748058795929, |
|
"eval_runtime": 0.7267, |
|
"eval_samples_per_second": 777.438, |
|
"eval_steps_per_second": 24.768, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.22023809523809523, |
|
"grad_norm": 1.3319734334945679, |
|
"learning_rate": 7.938926261462366e-05, |
|
"loss": 0.6164, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.2261904761904762, |
|
"grad_norm": 1.2683157920837402, |
|
"learning_rate": 7.795964517353735e-05, |
|
"loss": 0.5391, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.23214285714285715, |
|
"grad_norm": 1.0097771883010864, |
|
"learning_rate": 7.649596321166024e-05, |
|
"loss": 0.49, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.23809523809523808, |
|
"grad_norm": 1.0080583095550537, |
|
"learning_rate": 7.500000000000001e-05, |
|
"loss": 0.4928, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.24404761904761904, |
|
"grad_norm": 0.8241538405418396, |
|
"learning_rate": 7.347357813929454e-05, |
|
"loss": 0.4672, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.8907387852668762, |
|
"learning_rate": 7.191855733945387e-05, |
|
"loss": 0.4678, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.25595238095238093, |
|
"grad_norm": 0.8545511364936829, |
|
"learning_rate": 7.033683215379002e-05, |
|
"loss": 0.3988, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.2619047619047619, |
|
"grad_norm": 0.8012543320655823, |
|
"learning_rate": 6.873032967079561e-05, |
|
"loss": 0.4446, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.26785714285714285, |
|
"grad_norm": 0.6742522716522217, |
|
"learning_rate": 6.710100716628344e-05, |
|
"loss": 0.3976, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.26785714285714285, |
|
"eval_loss": 0.393344908952713, |
|
"eval_runtime": 0.731, |
|
"eval_samples_per_second": 772.887, |
|
"eval_steps_per_second": 24.623, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.27380952380952384, |
|
"grad_norm": 0.6675676107406616, |
|
"learning_rate": 6.545084971874738e-05, |
|
"loss": 0.3466, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.27976190476190477, |
|
"grad_norm": 0.7589141726493835, |
|
"learning_rate": 6.378186779084995e-05, |
|
"loss": 0.3242, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.2857142857142857, |
|
"grad_norm": 0.872429370880127, |
|
"learning_rate": 6.209609477998338e-05, |
|
"loss": 0.3272, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.2916666666666667, |
|
"grad_norm": 0.5737527012825012, |
|
"learning_rate": 6.0395584540887963e-05, |
|
"loss": 0.3232, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.2976190476190476, |
|
"grad_norm": 0.8787867426872253, |
|
"learning_rate": 5.868240888334653e-05, |
|
"loss": 0.3176, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.30357142857142855, |
|
"grad_norm": 0.8264537453651428, |
|
"learning_rate": 5.695865504800327e-05, |
|
"loss": 0.2985, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.30952380952380953, |
|
"grad_norm": 0.6697880029678345, |
|
"learning_rate": 5.522642316338268e-05, |
|
"loss": 0.335, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.31547619047619047, |
|
"grad_norm": 0.6313273310661316, |
|
"learning_rate": 5.348782368720626e-05, |
|
"loss": 0.3323, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.32142857142857145, |
|
"grad_norm": 0.7156944274902344, |
|
"learning_rate": 5.174497483512506e-05, |
|
"loss": 0.2925, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.32142857142857145, |
|
"eval_loss": 0.30814456939697266, |
|
"eval_runtime": 0.7272, |
|
"eval_samples_per_second": 776.911, |
|
"eval_steps_per_second": 24.751, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.3273809523809524, |
|
"grad_norm": 0.8110126852989197, |
|
"learning_rate": 5e-05, |
|
"loss": 0.2934, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.3333333333333333, |
|
"grad_norm": 0.5642712712287903, |
|
"learning_rate": 4.825502516487497e-05, |
|
"loss": 0.3237, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.3392857142857143, |
|
"grad_norm": 0.5082366466522217, |
|
"learning_rate": 4.6512176312793736e-05, |
|
"loss": 0.2284, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.34523809523809523, |
|
"grad_norm": 0.5590605735778809, |
|
"learning_rate": 4.477357683661734e-05, |
|
"loss": 0.2723, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.35119047619047616, |
|
"grad_norm": 0.7834331393241882, |
|
"learning_rate": 4.3041344951996746e-05, |
|
"loss": 0.2732, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.35714285714285715, |
|
"grad_norm": 0.5841456651687622, |
|
"learning_rate": 4.131759111665349e-05, |
|
"loss": 0.2316, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.3630952380952381, |
|
"grad_norm": 0.5144384503364563, |
|
"learning_rate": 3.960441545911204e-05, |
|
"loss": 0.2676, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.36904761904761907, |
|
"grad_norm": 0.5047840476036072, |
|
"learning_rate": 3.790390522001662e-05, |
|
"loss": 0.2311, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.375, |
|
"grad_norm": 0.5553379058837891, |
|
"learning_rate": 3.6218132209150045e-05, |
|
"loss": 0.2693, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.375, |
|
"eval_loss": 0.2591702938079834, |
|
"eval_runtime": 0.7281, |
|
"eval_samples_per_second": 776.028, |
|
"eval_steps_per_second": 24.723, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.38095238095238093, |
|
"grad_norm": 0.5578157901763916, |
|
"learning_rate": 3.4549150281252636e-05, |
|
"loss": 0.2313, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.3869047619047619, |
|
"grad_norm": 0.5519534945487976, |
|
"learning_rate": 3.289899283371657e-05, |
|
"loss": 0.1994, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.39285714285714285, |
|
"grad_norm": 0.5156115889549255, |
|
"learning_rate": 3.12696703292044e-05, |
|
"loss": 0.2438, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.39880952380952384, |
|
"grad_norm": 0.5865421891212463, |
|
"learning_rate": 2.9663167846209998e-05, |
|
"loss": 0.2964, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.40476190476190477, |
|
"grad_norm": 0.4177829921245575, |
|
"learning_rate": 2.8081442660546125e-05, |
|
"loss": 0.2274, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.4107142857142857, |
|
"grad_norm": 0.442807674407959, |
|
"learning_rate": 2.6526421860705473e-05, |
|
"loss": 0.2507, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.4166666666666667, |
|
"grad_norm": 0.4709461033344269, |
|
"learning_rate": 2.500000000000001e-05, |
|
"loss": 0.2241, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.4226190476190476, |
|
"grad_norm": 0.36320897936820984, |
|
"learning_rate": 2.350403678833976e-05, |
|
"loss": 0.203, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.42857142857142855, |
|
"grad_norm": 0.4016166031360626, |
|
"learning_rate": 2.2040354826462668e-05, |
|
"loss": 0.2008, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.42857142857142855, |
|
"eval_loss": 0.2299022376537323, |
|
"eval_runtime": 0.725, |
|
"eval_samples_per_second": 779.359, |
|
"eval_steps_per_second": 24.829, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.43452380952380953, |
|
"grad_norm": 0.5135464668273926, |
|
"learning_rate": 2.061073738537635e-05, |
|
"loss": 0.2505, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.44047619047619047, |
|
"grad_norm": 0.4497532844543457, |
|
"learning_rate": 1.9216926233717085e-05, |
|
"loss": 0.2021, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.44642857142857145, |
|
"grad_norm": 0.44834592938423157, |
|
"learning_rate": 1.7860619515673033e-05, |
|
"loss": 0.1972, |
|
"step": 75 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 100, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 25, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 4165127582515200.0, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|