|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.13531799729364005, |
|
"eval_steps": 9, |
|
"global_step": 100, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0013531799729364006, |
|
"grad_norm": 8.728355407714844, |
|
"learning_rate": 1e-05, |
|
"loss": 1.6889, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0013531799729364006, |
|
"eval_loss": 1.6367640495300293, |
|
"eval_runtime": 33.1413, |
|
"eval_samples_per_second": 18.798, |
|
"eval_steps_per_second": 2.354, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0027063599458728013, |
|
"grad_norm": 8.341392517089844, |
|
"learning_rate": 2e-05, |
|
"loss": 1.7952, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.0040595399188092015, |
|
"grad_norm": 8.935415267944336, |
|
"learning_rate": 3e-05, |
|
"loss": 1.5444, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.005412719891745603, |
|
"grad_norm": 7.361211776733398, |
|
"learning_rate": 4e-05, |
|
"loss": 1.452, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.006765899864682003, |
|
"grad_norm": 5.186268329620361, |
|
"learning_rate": 5e-05, |
|
"loss": 1.3568, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.008119079837618403, |
|
"grad_norm": 4.2360663414001465, |
|
"learning_rate": 6e-05, |
|
"loss": 1.1432, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.009472259810554804, |
|
"grad_norm": 3.4211342334747314, |
|
"learning_rate": 7e-05, |
|
"loss": 0.8908, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.010825439783491205, |
|
"grad_norm": 2.8270256519317627, |
|
"learning_rate": 8e-05, |
|
"loss": 0.7026, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.012178619756427604, |
|
"grad_norm": 4.626265525817871, |
|
"learning_rate": 9e-05, |
|
"loss": 0.7591, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.012178619756427604, |
|
"eval_loss": 0.4093446731567383, |
|
"eval_runtime": 32.5672, |
|
"eval_samples_per_second": 19.13, |
|
"eval_steps_per_second": 2.395, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.013531799729364006, |
|
"grad_norm": 1.9083399772644043, |
|
"learning_rate": 0.0001, |
|
"loss": 0.3278, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.014884979702300407, |
|
"grad_norm": 2.498229503631592, |
|
"learning_rate": 9.99695413509548e-05, |
|
"loss": 0.4199, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.016238159675236806, |
|
"grad_norm": 1.37482750415802, |
|
"learning_rate": 9.987820251299122e-05, |
|
"loss": 0.1413, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.017591339648173207, |
|
"grad_norm": 1.1291427612304688, |
|
"learning_rate": 9.972609476841367e-05, |
|
"loss": 0.137, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.018944519621109608, |
|
"grad_norm": 1.5424270629882812, |
|
"learning_rate": 9.951340343707852e-05, |
|
"loss": 0.1136, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.02029769959404601, |
|
"grad_norm": 1.4762475490570068, |
|
"learning_rate": 9.924038765061042e-05, |
|
"loss": 0.0587, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.02165087956698241, |
|
"grad_norm": 0.367654025554657, |
|
"learning_rate": 9.890738003669029e-05, |
|
"loss": 0.0216, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.023004059539918808, |
|
"grad_norm": 1.203076958656311, |
|
"learning_rate": 9.851478631379982e-05, |
|
"loss": 0.0476, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.02435723951285521, |
|
"grad_norm": 0.837530791759491, |
|
"learning_rate": 9.806308479691595e-05, |
|
"loss": 0.0208, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.02435723951285521, |
|
"eval_loss": 0.03585303574800491, |
|
"eval_runtime": 32.5664, |
|
"eval_samples_per_second": 19.13, |
|
"eval_steps_per_second": 2.395, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.02571041948579161, |
|
"grad_norm": 0.6422433853149414, |
|
"learning_rate": 9.755282581475769e-05, |
|
"loss": 0.0111, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.02706359945872801, |
|
"grad_norm": 1.1846771240234375, |
|
"learning_rate": 9.698463103929542e-05, |
|
"loss": 0.0873, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.028416779431664412, |
|
"grad_norm": 0.19324840605258942, |
|
"learning_rate": 9.635919272833938e-05, |
|
"loss": 0.0102, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.029769959404600813, |
|
"grad_norm": 0.296364426612854, |
|
"learning_rate": 9.567727288213005e-05, |
|
"loss": 0.0174, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.03112313937753721, |
|
"grad_norm": 0.7968310117721558, |
|
"learning_rate": 9.493970231495835e-05, |
|
"loss": 0.047, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.03247631935047361, |
|
"grad_norm": 0.12593711912631989, |
|
"learning_rate": 9.414737964294636e-05, |
|
"loss": 0.0061, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.03382949932341001, |
|
"grad_norm": 0.28613799810409546, |
|
"learning_rate": 9.330127018922194e-05, |
|
"loss": 0.0075, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.035182679296346414, |
|
"grad_norm": 0.23756200075149536, |
|
"learning_rate": 9.24024048078213e-05, |
|
"loss": 0.0165, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.036535859269282815, |
|
"grad_norm": 0.12324630469083786, |
|
"learning_rate": 9.145187862775209e-05, |
|
"loss": 0.0029, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.036535859269282815, |
|
"eval_loss": 0.020997991785407066, |
|
"eval_runtime": 32.6042, |
|
"eval_samples_per_second": 19.108, |
|
"eval_steps_per_second": 2.392, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.037889039242219216, |
|
"grad_norm": 0.08755990862846375, |
|
"learning_rate": 9.045084971874738e-05, |
|
"loss": 0.0074, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.03924221921515562, |
|
"grad_norm": 0.18577858805656433, |
|
"learning_rate": 8.940053768033609e-05, |
|
"loss": 0.0084, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.04059539918809202, |
|
"grad_norm": 0.09318247437477112, |
|
"learning_rate": 8.83022221559489e-05, |
|
"loss": 0.0049, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.04194857916102842, |
|
"grad_norm": 0.2695228159427643, |
|
"learning_rate": 8.715724127386972e-05, |
|
"loss": 0.0159, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.04330175913396482, |
|
"grad_norm": 0.08464854955673218, |
|
"learning_rate": 8.596699001693255e-05, |
|
"loss": 0.0065, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.044654939106901215, |
|
"grad_norm": 0.17591221630573273, |
|
"learning_rate": 8.473291852294987e-05, |
|
"loss": 0.0049, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.046008119079837616, |
|
"grad_norm": 0.2792809009552002, |
|
"learning_rate": 8.345653031794292e-05, |
|
"loss": 0.0171, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.04736129905277402, |
|
"grad_norm": 0.12685370445251465, |
|
"learning_rate": 8.213938048432697e-05, |
|
"loss": 0.0043, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.04871447902571042, |
|
"grad_norm": 0.5813450813293457, |
|
"learning_rate": 8.07830737662829e-05, |
|
"loss": 0.0135, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.04871447902571042, |
|
"eval_loss": 0.019910942763090134, |
|
"eval_runtime": 32.5995, |
|
"eval_samples_per_second": 19.111, |
|
"eval_steps_per_second": 2.393, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.05006765899864682, |
|
"grad_norm": 0.08670137077569962, |
|
"learning_rate": 7.938926261462366e-05, |
|
"loss": 0.0062, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.05142083897158322, |
|
"grad_norm": 0.09463904052972794, |
|
"learning_rate": 7.795964517353735e-05, |
|
"loss": 0.0021, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.05277401894451962, |
|
"grad_norm": 0.05870044231414795, |
|
"learning_rate": 7.649596321166024e-05, |
|
"loss": 0.006, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.05412719891745602, |
|
"grad_norm": 0.09653154015541077, |
|
"learning_rate": 7.500000000000001e-05, |
|
"loss": 0.007, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.05548037889039242, |
|
"grad_norm": 0.06797784566879272, |
|
"learning_rate": 7.347357813929454e-05, |
|
"loss": 0.0094, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.056833558863328824, |
|
"grad_norm": 0.9252069592475891, |
|
"learning_rate": 7.191855733945387e-05, |
|
"loss": 0.0755, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.058186738836265225, |
|
"grad_norm": 0.08743719756603241, |
|
"learning_rate": 7.033683215379002e-05, |
|
"loss": 0.007, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.05953991880920163, |
|
"grad_norm": 0.22028687596321106, |
|
"learning_rate": 6.873032967079561e-05, |
|
"loss": 0.0107, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.06089309878213803, |
|
"grad_norm": 0.24319298565387726, |
|
"learning_rate": 6.710100716628344e-05, |
|
"loss": 0.0168, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.06089309878213803, |
|
"eval_loss": 0.019319333136081696, |
|
"eval_runtime": 32.603, |
|
"eval_samples_per_second": 19.109, |
|
"eval_steps_per_second": 2.392, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.06224627875507442, |
|
"grad_norm": 0.6694943904876709, |
|
"learning_rate": 6.545084971874738e-05, |
|
"loss": 0.081, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.06359945872801083, |
|
"grad_norm": 0.06383314728736877, |
|
"learning_rate": 6.378186779084995e-05, |
|
"loss": 0.0016, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.06495263870094722, |
|
"grad_norm": 0.05941140279173851, |
|
"learning_rate": 6.209609477998338e-05, |
|
"loss": 0.0045, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.06630581867388363, |
|
"grad_norm": 0.19987758994102478, |
|
"learning_rate": 6.0395584540887963e-05, |
|
"loss": 0.0132, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.06765899864682003, |
|
"grad_norm": 0.11729590594768524, |
|
"learning_rate": 5.868240888334653e-05, |
|
"loss": 0.0058, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.06901217861975643, |
|
"grad_norm": 0.0919797345995903, |
|
"learning_rate": 5.695865504800327e-05, |
|
"loss": 0.0045, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.07036535859269283, |
|
"grad_norm": 0.05789254978299141, |
|
"learning_rate": 5.522642316338268e-05, |
|
"loss": 0.0048, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.07171853856562922, |
|
"grad_norm": 0.18244007229804993, |
|
"learning_rate": 5.348782368720626e-05, |
|
"loss": 0.0155, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.07307171853856563, |
|
"grad_norm": 0.6028981804847717, |
|
"learning_rate": 5.174497483512506e-05, |
|
"loss": 0.1107, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.07307171853856563, |
|
"eval_loss": 0.018029918894171715, |
|
"eval_runtime": 32.6237, |
|
"eval_samples_per_second": 19.097, |
|
"eval_steps_per_second": 2.391, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.07442489851150202, |
|
"grad_norm": 0.5009127855300903, |
|
"learning_rate": 5e-05, |
|
"loss": 0.0893, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.07577807848443843, |
|
"grad_norm": 0.04764901474118233, |
|
"learning_rate": 4.825502516487497e-05, |
|
"loss": 0.0064, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.07713125845737483, |
|
"grad_norm": 0.8551574945449829, |
|
"learning_rate": 4.6512176312793736e-05, |
|
"loss": 0.1503, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.07848443843031123, |
|
"grad_norm": 0.0777273029088974, |
|
"learning_rate": 4.477357683661734e-05, |
|
"loss": 0.005, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.07983761840324763, |
|
"grad_norm": 0.6547033786773682, |
|
"learning_rate": 4.3041344951996746e-05, |
|
"loss": 0.1104, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.08119079837618404, |
|
"grad_norm": 0.15890103578567505, |
|
"learning_rate": 4.131759111665349e-05, |
|
"loss": 0.0155, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.08254397834912043, |
|
"grad_norm": 1.0533157587051392, |
|
"learning_rate": 3.960441545911204e-05, |
|
"loss": 0.1344, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.08389715832205684, |
|
"grad_norm": 0.13047480583190918, |
|
"learning_rate": 3.790390522001662e-05, |
|
"loss": 0.0116, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.08525033829499323, |
|
"grad_norm": 0.14830543100833893, |
|
"learning_rate": 3.6218132209150045e-05, |
|
"loss": 0.0111, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.08525033829499323, |
|
"eval_loss": 0.01703736186027527, |
|
"eval_runtime": 32.5933, |
|
"eval_samples_per_second": 19.114, |
|
"eval_steps_per_second": 2.393, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.08660351826792964, |
|
"grad_norm": 0.10914517939090729, |
|
"learning_rate": 3.4549150281252636e-05, |
|
"loss": 0.008, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.08795669824086604, |
|
"grad_norm": 0.13250844180583954, |
|
"learning_rate": 3.289899283371657e-05, |
|
"loss": 0.0078, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.08930987821380243, |
|
"grad_norm": 0.14327938854694366, |
|
"learning_rate": 3.12696703292044e-05, |
|
"loss": 0.0085, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.09066305818673884, |
|
"grad_norm": 0.14970730245113373, |
|
"learning_rate": 2.9663167846209998e-05, |
|
"loss": 0.0042, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.09201623815967523, |
|
"grad_norm": 0.08402867615222931, |
|
"learning_rate": 2.8081442660546125e-05, |
|
"loss": 0.0104, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.09336941813261164, |
|
"grad_norm": 0.08050940930843353, |
|
"learning_rate": 2.6526421860705473e-05, |
|
"loss": 0.005, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.09472259810554803, |
|
"grad_norm": 0.5739089250564575, |
|
"learning_rate": 2.500000000000001e-05, |
|
"loss": 0.0547, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.09607577807848444, |
|
"grad_norm": 0.11337218433618546, |
|
"learning_rate": 2.350403678833976e-05, |
|
"loss": 0.0091, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.09742895805142084, |
|
"grad_norm": 0.06483836472034454, |
|
"learning_rate": 2.2040354826462668e-05, |
|
"loss": 0.0072, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.09742895805142084, |
|
"eval_loss": 0.015148372389376163, |
|
"eval_runtime": 32.5938, |
|
"eval_samples_per_second": 19.114, |
|
"eval_steps_per_second": 2.393, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.09878213802435724, |
|
"grad_norm": 0.12975576519966125, |
|
"learning_rate": 2.061073738537635e-05, |
|
"loss": 0.0114, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.10013531799729364, |
|
"grad_norm": 0.08989189565181732, |
|
"learning_rate": 1.9216926233717085e-05, |
|
"loss": 0.0041, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.10148849797023005, |
|
"grad_norm": 0.6734408140182495, |
|
"learning_rate": 1.7860619515673033e-05, |
|
"loss": 0.0926, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.10284167794316644, |
|
"grad_norm": 0.1009981706738472, |
|
"learning_rate": 1.6543469682057106e-05, |
|
"loss": 0.009, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.10419485791610285, |
|
"grad_norm": 0.044376980513334274, |
|
"learning_rate": 1.526708147705013e-05, |
|
"loss": 0.0047, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.10554803788903924, |
|
"grad_norm": 0.07079390436410904, |
|
"learning_rate": 1.4033009983067452e-05, |
|
"loss": 0.0048, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.10690121786197564, |
|
"grad_norm": 0.1579139083623886, |
|
"learning_rate": 1.2842758726130283e-05, |
|
"loss": 0.006, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.10825439783491204, |
|
"grad_norm": 0.3575461208820343, |
|
"learning_rate": 1.1697777844051105e-05, |
|
"loss": 0.0312, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.10960757780784844, |
|
"grad_norm": 0.054556429386138916, |
|
"learning_rate": 1.0599462319663905e-05, |
|
"loss": 0.0069, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.10960757780784844, |
|
"eval_loss": 0.013904375955462456, |
|
"eval_runtime": 32.5975, |
|
"eval_samples_per_second": 19.112, |
|
"eval_steps_per_second": 2.393, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.11096075778078485, |
|
"grad_norm": 0.059846166521310806, |
|
"learning_rate": 9.549150281252633e-06, |
|
"loss": 0.0073, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.11231393775372124, |
|
"grad_norm": 0.07215581089258194, |
|
"learning_rate": 8.548121372247918e-06, |
|
"loss": 0.0051, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.11366711772665765, |
|
"grad_norm": 0.1450567990541458, |
|
"learning_rate": 7.597595192178702e-06, |
|
"loss": 0.0122, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.11502029769959404, |
|
"grad_norm": 0.03491105139255524, |
|
"learning_rate": 6.698729810778065e-06, |
|
"loss": 0.0043, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.11637347767253045, |
|
"grad_norm": 0.5953935384750366, |
|
"learning_rate": 5.852620357053651e-06, |
|
"loss": 0.0621, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.11772665764546685, |
|
"grad_norm": 0.4230695068836212, |
|
"learning_rate": 5.060297685041659e-06, |
|
"loss": 0.0422, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.11907983761840325, |
|
"grad_norm": 0.033300649374723434, |
|
"learning_rate": 4.322727117869951e-06, |
|
"loss": 0.0045, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.12043301759133965, |
|
"grad_norm": 0.30535688996315, |
|
"learning_rate": 3.6408072716606346e-06, |
|
"loss": 0.0485, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.12178619756427606, |
|
"grad_norm": 0.07103042304515839, |
|
"learning_rate": 3.0153689607045845e-06, |
|
"loss": 0.004, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.12178619756427606, |
|
"eval_loss": 0.013438088819384575, |
|
"eval_runtime": 32.5892, |
|
"eval_samples_per_second": 19.117, |
|
"eval_steps_per_second": 2.393, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.12313937753721245, |
|
"grad_norm": 0.11808881163597107, |
|
"learning_rate": 2.4471741852423237e-06, |
|
"loss": 0.0081, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.12449255751014884, |
|
"grad_norm": 0.033296674489974976, |
|
"learning_rate": 1.9369152030840556e-06, |
|
"loss": 0.0045, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.12584573748308525, |
|
"grad_norm": 0.19024379551410675, |
|
"learning_rate": 1.4852136862001764e-06, |
|
"loss": 0.0157, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.12719891745602166, |
|
"grad_norm": 0.7022133469581604, |
|
"learning_rate": 1.0926199633097157e-06, |
|
"loss": 0.0563, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.12855209742895804, |
|
"grad_norm": 0.110652394592762, |
|
"learning_rate": 7.596123493895991e-07, |
|
"loss": 0.0102, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.12990527740189445, |
|
"grad_norm": 0.110796257853508, |
|
"learning_rate": 4.865965629214819e-07, |
|
"loss": 0.0116, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.13125845737483086, |
|
"grad_norm": 0.03754672780632973, |
|
"learning_rate": 2.7390523158633554e-07, |
|
"loss": 0.0038, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.13261163734776726, |
|
"grad_norm": 0.07318272441625595, |
|
"learning_rate": 1.2179748700879012e-07, |
|
"loss": 0.0019, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.13396481732070364, |
|
"grad_norm": 0.4888281226158142, |
|
"learning_rate": 3.04586490452119e-08, |
|
"loss": 0.0511, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.13396481732070364, |
|
"eval_loss": 0.013342314399778843, |
|
"eval_runtime": 32.607, |
|
"eval_samples_per_second": 19.106, |
|
"eval_steps_per_second": 2.392, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.13531799729364005, |
|
"grad_norm": 0.3825893998146057, |
|
"learning_rate": 0.0, |
|
"loss": 0.0563, |
|
"step": 100 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 100, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 25, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 7.41887283560448e+16, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|