{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.018001800180018002, "eval_steps": 9, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00018001800180018, "grad_norm": 5.87307071685791, "learning_rate": 1e-05, "loss": 11.6144, "step": 1 }, { "epoch": 0.00018001800180018, "eval_loss": 11.61415958404541, "eval_runtime": 41.4656, "eval_samples_per_second": 112.816, "eval_steps_per_second": 14.108, "step": 1 }, { "epoch": 0.00036003600360036, "grad_norm": 6.0188374519348145, "learning_rate": 2e-05, "loss": 11.4565, "step": 2 }, { "epoch": 0.00054005400540054, "grad_norm": 5.906339168548584, "learning_rate": 3e-05, "loss": 11.3091, "step": 3 }, { "epoch": 0.00072007200720072, "grad_norm": 5.721518516540527, "learning_rate": 4e-05, "loss": 11.9668, "step": 4 }, { "epoch": 0.0009000900090009, "grad_norm": 6.276498794555664, "learning_rate": 5e-05, "loss": 11.5562, "step": 5 }, { "epoch": 0.00108010801080108, "grad_norm": 4.8785576820373535, "learning_rate": 6e-05, "loss": 11.1344, "step": 6 }, { "epoch": 0.0012601260126012602, "grad_norm": 5.525317668914795, "learning_rate": 7e-05, "loss": 11.1017, "step": 7 }, { "epoch": 0.00144014401440144, "grad_norm": 7.0762529373168945, "learning_rate": 8e-05, "loss": 11.8841, "step": 8 }, { "epoch": 0.0016201620162016202, "grad_norm": 5.757331371307373, "learning_rate": 9e-05, "loss": 11.6058, "step": 9 }, { "epoch": 0.0016201620162016202, "eval_loss": 11.442190170288086, "eval_runtime": 41.4866, "eval_samples_per_second": 112.759, "eval_steps_per_second": 14.101, "step": 9 }, { "epoch": 0.0018001800180018, "grad_norm": 5.312053203582764, "learning_rate": 0.0001, "loss": 11.3765, "step": 10 }, { "epoch": 0.0019801980198019802, "grad_norm": 5.930967807769775, "learning_rate": 9.99695413509548e-05, "loss": 11.6333, "step": 11 }, { "epoch": 0.00216021602160216, "grad_norm": 6.4748945236206055, "learning_rate": 9.987820251299122e-05, "loss": 11.5855, "step": 12 }, { "epoch": 0.00234023402340234, "grad_norm": 6.37196683883667, "learning_rate": 9.972609476841367e-05, "loss": 11.5042, "step": 13 }, { "epoch": 0.0025202520252025204, "grad_norm": 5.835845947265625, "learning_rate": 9.951340343707852e-05, "loss": 11.6153, "step": 14 }, { "epoch": 0.0027002700270027003, "grad_norm": 5.734114170074463, "learning_rate": 9.924038765061042e-05, "loss": 11.1202, "step": 15 }, { "epoch": 0.00288028802880288, "grad_norm": 5.847100257873535, "learning_rate": 9.890738003669029e-05, "loss": 11.1372, "step": 16 }, { "epoch": 0.0030603060306030605, "grad_norm": 5.214365005493164, "learning_rate": 9.851478631379982e-05, "loss": 11.0652, "step": 17 }, { "epoch": 0.0032403240324032404, "grad_norm": 5.617738723754883, "learning_rate": 9.806308479691595e-05, "loss": 10.7516, "step": 18 }, { "epoch": 0.0032403240324032404, "eval_loss": 10.946746826171875, "eval_runtime": 41.4928, "eval_samples_per_second": 112.743, "eval_steps_per_second": 14.099, "step": 18 }, { "epoch": 0.0034203420342034203, "grad_norm": 6.485275745391846, "learning_rate": 9.755282581475769e-05, "loss": 11.0168, "step": 19 }, { "epoch": 0.0036003600360036, "grad_norm": 5.778016090393066, "learning_rate": 9.698463103929542e-05, "loss": 11.0644, "step": 20 }, { "epoch": 0.0037803780378037805, "grad_norm": 4.897763252258301, "learning_rate": 9.635919272833938e-05, "loss": 10.8183, "step": 21 }, { "epoch": 0.0039603960396039604, "grad_norm": 4.856274127960205, "learning_rate": 9.567727288213005e-05, "loss": 10.8357, "step": 22 }, { "epoch": 0.004140414041404141, "grad_norm": 6.177309989929199, "learning_rate": 9.493970231495835e-05, "loss": 10.8354, "step": 23 }, { "epoch": 0.00432043204320432, "grad_norm": 5.736455917358398, "learning_rate": 9.414737964294636e-05, "loss": 10.4543, "step": 24 }, { "epoch": 0.004500450045004501, "grad_norm": 5.722464084625244, "learning_rate": 9.330127018922194e-05, "loss": 10.7997, "step": 25 }, { "epoch": 0.00468046804680468, "grad_norm": 5.896301746368408, "learning_rate": 9.24024048078213e-05, "loss": 10.8154, "step": 26 }, { "epoch": 0.00486048604860486, "grad_norm": 6.108735084533691, "learning_rate": 9.145187862775209e-05, "loss": 10.477, "step": 27 }, { "epoch": 0.00486048604860486, "eval_loss": 10.479217529296875, "eval_runtime": 41.5514, "eval_samples_per_second": 112.583, "eval_steps_per_second": 14.079, "step": 27 }, { "epoch": 0.005040504050405041, "grad_norm": 6.187192916870117, "learning_rate": 9.045084971874738e-05, "loss": 10.7532, "step": 28 }, { "epoch": 0.00522052205220522, "grad_norm": 6.162825107574463, "learning_rate": 8.940053768033609e-05, "loss": 10.0425, "step": 29 }, { "epoch": 0.0054005400540054005, "grad_norm": 5.06771993637085, "learning_rate": 8.83022221559489e-05, "loss": 10.6456, "step": 30 }, { "epoch": 0.005580558055805581, "grad_norm": 6.633289813995361, "learning_rate": 8.715724127386972e-05, "loss": 10.5557, "step": 31 }, { "epoch": 0.00576057605760576, "grad_norm": 7.135171413421631, "learning_rate": 8.596699001693255e-05, "loss": 10.4076, "step": 32 }, { "epoch": 0.005940594059405941, "grad_norm": 6.247159481048584, "learning_rate": 8.473291852294987e-05, "loss": 10.2683, "step": 33 }, { "epoch": 0.006120612061206121, "grad_norm": 5.888491153717041, "learning_rate": 8.345653031794292e-05, "loss": 10.3905, "step": 34 }, { "epoch": 0.0063006300630063005, "grad_norm": 6.2340407371521, "learning_rate": 8.213938048432697e-05, "loss": 10.0002, "step": 35 }, { "epoch": 0.006480648064806481, "grad_norm": 5.402869701385498, "learning_rate": 8.07830737662829e-05, "loss": 10.0471, "step": 36 }, { "epoch": 0.006480648064806481, "eval_loss": 10.007487297058105, "eval_runtime": 41.5019, "eval_samples_per_second": 112.718, "eval_steps_per_second": 14.096, "step": 36 }, { "epoch": 0.00666066606660666, "grad_norm": 6.251911163330078, "learning_rate": 7.938926261462366e-05, "loss": 10.0034, "step": 37 }, { "epoch": 0.006840684068406841, "grad_norm": 7.18772554397583, "learning_rate": 7.795964517353735e-05, "loss": 10.0799, "step": 38 }, { "epoch": 0.007020702070207021, "grad_norm": 6.769697666168213, "learning_rate": 7.649596321166024e-05, "loss": 9.8849, "step": 39 }, { "epoch": 0.0072007200720072, "grad_norm": 6.6725568771362305, "learning_rate": 7.500000000000001e-05, "loss": 10.18, "step": 40 }, { "epoch": 0.007380738073807381, "grad_norm": 7.166985511779785, "learning_rate": 7.347357813929454e-05, "loss": 9.7688, "step": 41 }, { "epoch": 0.007560756075607561, "grad_norm": 5.779541015625, "learning_rate": 7.191855733945387e-05, "loss": 9.6471, "step": 42 }, { "epoch": 0.0077407740774077406, "grad_norm": 7.53178596496582, "learning_rate": 7.033683215379002e-05, "loss": 9.5978, "step": 43 }, { "epoch": 0.007920792079207921, "grad_norm": 5.810229301452637, "learning_rate": 6.873032967079561e-05, "loss": 9.6907, "step": 44 }, { "epoch": 0.008100810081008101, "grad_norm": 5.271368980407715, "learning_rate": 6.710100716628344e-05, "loss": 9.7006, "step": 45 }, { "epoch": 0.008100810081008101, "eval_loss": 9.594242095947266, "eval_runtime": 41.4612, "eval_samples_per_second": 112.828, "eval_steps_per_second": 14.11, "step": 45 }, { "epoch": 0.008280828082808282, "grad_norm": 6.034334182739258, "learning_rate": 6.545084971874738e-05, "loss": 9.9457, "step": 46 }, { "epoch": 0.00846084608460846, "grad_norm": 6.873132228851318, "learning_rate": 6.378186779084995e-05, "loss": 9.3553, "step": 47 }, { "epoch": 0.00864086408640864, "grad_norm": 5.189530849456787, "learning_rate": 6.209609477998338e-05, "loss": 9.7457, "step": 48 }, { "epoch": 0.00882088208820882, "grad_norm": 4.753223896026611, "learning_rate": 6.0395584540887963e-05, "loss": 9.5327, "step": 49 }, { "epoch": 0.009000900090009001, "grad_norm": 5.407626628875732, "learning_rate": 5.868240888334653e-05, "loss": 9.4796, "step": 50 }, { "epoch": 0.009180918091809182, "grad_norm": 5.1421217918396, "learning_rate": 5.695865504800327e-05, "loss": 9.5437, "step": 51 }, { "epoch": 0.00936093609360936, "grad_norm": 4.592971324920654, "learning_rate": 5.522642316338268e-05, "loss": 9.1944, "step": 52 }, { "epoch": 0.00954095409540954, "grad_norm": 5.622161865234375, "learning_rate": 5.348782368720626e-05, "loss": 9.6394, "step": 53 }, { "epoch": 0.00972097209720972, "grad_norm": 5.27858829498291, "learning_rate": 5.174497483512506e-05, "loss": 9.5195, "step": 54 }, { "epoch": 0.00972097209720972, "eval_loss": 9.32948112487793, "eval_runtime": 41.5183, "eval_samples_per_second": 112.673, "eval_steps_per_second": 14.09, "step": 54 }, { "epoch": 0.009900990099009901, "grad_norm": 4.497775077819824, "learning_rate": 5e-05, "loss": 9.308, "step": 55 }, { "epoch": 0.010081008100810081, "grad_norm": 4.790937900543213, "learning_rate": 4.825502516487497e-05, "loss": 9.3374, "step": 56 }, { "epoch": 0.010261026102610262, "grad_norm": 4.156779766082764, "learning_rate": 4.6512176312793736e-05, "loss": 9.0797, "step": 57 }, { "epoch": 0.01044104410441044, "grad_norm": 5.075934410095215, "learning_rate": 4.477357683661734e-05, "loss": 9.1865, "step": 58 }, { "epoch": 0.01062106210621062, "grad_norm": 5.212141513824463, "learning_rate": 4.3041344951996746e-05, "loss": 9.2446, "step": 59 }, { "epoch": 0.010801080108010801, "grad_norm": 4.7454705238342285, "learning_rate": 4.131759111665349e-05, "loss": 9.1307, "step": 60 }, { "epoch": 0.010981098109810981, "grad_norm": 4.656852722167969, "learning_rate": 3.960441545911204e-05, "loss": 8.9722, "step": 61 }, { "epoch": 0.011161116111611162, "grad_norm": 4.986353397369385, "learning_rate": 3.790390522001662e-05, "loss": 9.6201, "step": 62 }, { "epoch": 0.01134113411341134, "grad_norm": 4.1770830154418945, "learning_rate": 3.6218132209150045e-05, "loss": 9.1031, "step": 63 }, { "epoch": 0.01134113411341134, "eval_loss": 9.168355941772461, "eval_runtime": 41.4788, "eval_samples_per_second": 112.781, "eval_steps_per_second": 14.104, "step": 63 }, { "epoch": 0.01152115211521152, "grad_norm": 5.309531211853027, "learning_rate": 3.4549150281252636e-05, "loss": 9.3599, "step": 64 }, { "epoch": 0.011701170117011701, "grad_norm": 4.383259296417236, "learning_rate": 3.289899283371657e-05, "loss": 9.15, "step": 65 }, { "epoch": 0.011881188118811881, "grad_norm": 4.308704853057861, "learning_rate": 3.12696703292044e-05, "loss": 9.0913, "step": 66 }, { "epoch": 0.012061206120612062, "grad_norm": 4.408690929412842, "learning_rate": 2.9663167846209998e-05, "loss": 9.3145, "step": 67 }, { "epoch": 0.012241224122412242, "grad_norm": 4.576683521270752, "learning_rate": 2.8081442660546125e-05, "loss": 8.8956, "step": 68 }, { "epoch": 0.01242124212421242, "grad_norm": 4.590541839599609, "learning_rate": 2.6526421860705473e-05, "loss": 9.1312, "step": 69 }, { "epoch": 0.012601260126012601, "grad_norm": 4.922686576843262, "learning_rate": 2.500000000000001e-05, "loss": 9.0805, "step": 70 }, { "epoch": 0.012781278127812781, "grad_norm": 4.215728759765625, "learning_rate": 2.350403678833976e-05, "loss": 9.2433, "step": 71 }, { "epoch": 0.012961296129612962, "grad_norm": 4.534681797027588, "learning_rate": 2.2040354826462668e-05, "loss": 8.948, "step": 72 }, { "epoch": 0.012961296129612962, "eval_loss": 9.072084426879883, "eval_runtime": 41.5005, "eval_samples_per_second": 112.721, "eval_steps_per_second": 14.096, "step": 72 }, { "epoch": 0.013141314131413142, "grad_norm": 4.945130348205566, "learning_rate": 2.061073738537635e-05, "loss": 9.4625, "step": 73 }, { "epoch": 0.01332133213321332, "grad_norm": 4.447700500488281, "learning_rate": 1.9216926233717085e-05, "loss": 8.8096, "step": 74 }, { "epoch": 0.013501350135013501, "grad_norm": 4.476735591888428, "learning_rate": 1.7860619515673033e-05, "loss": 8.8277, "step": 75 }, { "epoch": 0.013681368136813681, "grad_norm": 4.766935348510742, "learning_rate": 1.6543469682057106e-05, "loss": 9.1841, "step": 76 }, { "epoch": 0.013861386138613862, "grad_norm": 3.851850986480713, "learning_rate": 1.526708147705013e-05, "loss": 9.1235, "step": 77 }, { "epoch": 0.014041404140414042, "grad_norm": 4.168645858764648, "learning_rate": 1.4033009983067452e-05, "loss": 8.9935, "step": 78 }, { "epoch": 0.014221422142214222, "grad_norm": 4.831826210021973, "learning_rate": 1.2842758726130283e-05, "loss": 8.9272, "step": 79 }, { "epoch": 0.0144014401440144, "grad_norm": 4.024872779846191, "learning_rate": 1.1697777844051105e-05, "loss": 9.0397, "step": 80 }, { "epoch": 0.014581458145814581, "grad_norm": 4.845215797424316, "learning_rate": 1.0599462319663905e-05, "loss": 9.3672, "step": 81 }, { "epoch": 0.014581458145814581, "eval_loss": 9.021185874938965, "eval_runtime": 41.5093, "eval_samples_per_second": 112.698, "eval_steps_per_second": 14.093, "step": 81 }, { "epoch": 0.014761476147614761, "grad_norm": 4.619018077850342, "learning_rate": 9.549150281252633e-06, "loss": 8.9587, "step": 82 }, { "epoch": 0.014941494149414942, "grad_norm": 4.925379276275635, "learning_rate": 8.548121372247918e-06, "loss": 9.0123, "step": 83 }, { "epoch": 0.015121512151215122, "grad_norm": 4.891155242919922, "learning_rate": 7.597595192178702e-06, "loss": 9.1269, "step": 84 }, { "epoch": 0.0153015301530153, "grad_norm": 5.3043365478515625, "learning_rate": 6.698729810778065e-06, "loss": 9.0393, "step": 85 }, { "epoch": 0.015481548154815481, "grad_norm": 4.2021026611328125, "learning_rate": 5.852620357053651e-06, "loss": 9.1264, "step": 86 }, { "epoch": 0.015661566156615663, "grad_norm": 4.361050605773926, "learning_rate": 5.060297685041659e-06, "loss": 8.5332, "step": 87 }, { "epoch": 0.015841584158415842, "grad_norm": 4.473583221435547, "learning_rate": 4.322727117869951e-06, "loss": 9.0751, "step": 88 }, { "epoch": 0.01602160216021602, "grad_norm": 4.596582412719727, "learning_rate": 3.6408072716606346e-06, "loss": 8.8596, "step": 89 }, { "epoch": 0.016201620162016202, "grad_norm": 4.931702613830566, "learning_rate": 3.0153689607045845e-06, "loss": 9.0441, "step": 90 }, { "epoch": 0.016201620162016202, "eval_loss": 9.001091003417969, "eval_runtime": 41.4597, "eval_samples_per_second": 112.832, "eval_steps_per_second": 14.11, "step": 90 }, { "epoch": 0.01638163816381638, "grad_norm": 5.500175952911377, "learning_rate": 2.4471741852423237e-06, "loss": 9.2136, "step": 91 }, { "epoch": 0.016561656165616563, "grad_norm": 5.783007621765137, "learning_rate": 1.9369152030840556e-06, "loss": 9.234, "step": 92 }, { "epoch": 0.01674167416741674, "grad_norm": 4.667166233062744, "learning_rate": 1.4852136862001764e-06, "loss": 9.107, "step": 93 }, { "epoch": 0.01692169216921692, "grad_norm": 4.6480712890625, "learning_rate": 1.0926199633097157e-06, "loss": 8.7428, "step": 94 }, { "epoch": 0.017101710171017102, "grad_norm": 4.516505241394043, "learning_rate": 7.596123493895991e-07, "loss": 8.8254, "step": 95 }, { "epoch": 0.01728172817281728, "grad_norm": 4.1551103591918945, "learning_rate": 4.865965629214819e-07, "loss": 8.9528, "step": 96 }, { "epoch": 0.017461746174617463, "grad_norm": 4.877590656280518, "learning_rate": 2.7390523158633554e-07, "loss": 9.1006, "step": 97 }, { "epoch": 0.01764176417641764, "grad_norm": 5.414852619171143, "learning_rate": 1.2179748700879012e-07, "loss": 9.0552, "step": 98 }, { "epoch": 0.01782178217821782, "grad_norm": 5.404477596282959, "learning_rate": 3.04586490452119e-08, "loss": 9.08, "step": 99 }, { "epoch": 0.01782178217821782, "eval_loss": 8.99763298034668, "eval_runtime": 41.4635, "eval_samples_per_second": 112.822, "eval_steps_per_second": 14.109, "step": 99 }, { "epoch": 0.018001800180018002, "grad_norm": 5.185412406921387, "learning_rate": 0.0, "loss": 9.0785, "step": 100 } ], "logging_steps": 1, "max_steps": 100, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 27149520076800.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }