{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.13711151736745886, "eval_steps": 9, "global_step": 75, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0018281535648994515, "grad_norm": 2.1625711917877197, "learning_rate": 1e-05, "loss": 2.0061, "step": 1 }, { "epoch": 0.0018281535648994515, "eval_loss": 1.9399960041046143, "eval_runtime": 64.9097, "eval_samples_per_second": 7.102, "eval_steps_per_second": 0.894, "step": 1 }, { "epoch": 0.003656307129798903, "grad_norm": 2.010770082473755, "learning_rate": 2e-05, "loss": 1.9237, "step": 2 }, { "epoch": 0.005484460694698354, "grad_norm": 2.0491600036621094, "learning_rate": 3e-05, "loss": 2.0086, "step": 3 }, { "epoch": 0.007312614259597806, "grad_norm": 1.9856997728347778, "learning_rate": 4e-05, "loss": 1.8546, "step": 4 }, { "epoch": 0.009140767824497258, "grad_norm": 1.999695897102356, "learning_rate": 5e-05, "loss": 1.7567, "step": 5 }, { "epoch": 0.010968921389396709, "grad_norm": 2.1339967250823975, "learning_rate": 6e-05, "loss": 1.6694, "step": 6 }, { "epoch": 0.012797074954296161, "grad_norm": 2.18298602104187, "learning_rate": 7e-05, "loss": 1.7307, "step": 7 }, { "epoch": 0.014625228519195612, "grad_norm": 1.8223989009857178, "learning_rate": 8e-05, "loss": 1.399, "step": 8 }, { "epoch": 0.016453382084095063, "grad_norm": 1.704209804534912, "learning_rate": 9e-05, "loss": 1.3331, "step": 9 }, { "epoch": 0.016453382084095063, "eval_loss": 1.2653414011001587, "eval_runtime": 65.9311, "eval_samples_per_second": 6.992, "eval_steps_per_second": 0.88, "step": 9 }, { "epoch": 0.018281535648994516, "grad_norm": 1.8971141576766968, "learning_rate": 0.0001, "loss": 1.3892, "step": 10 }, { "epoch": 0.02010968921389397, "grad_norm": 2.0571422576904297, "learning_rate": 9.99695413509548e-05, "loss": 1.1854, "step": 11 }, { "epoch": 0.021937842778793418, "grad_norm": 2.0925815105438232, "learning_rate": 9.987820251299122e-05, "loss": 1.1601, "step": 12 }, { "epoch": 0.02376599634369287, "grad_norm": 1.8084160089492798, "learning_rate": 9.972609476841367e-05, "loss": 1.0377, "step": 13 }, { "epoch": 0.025594149908592323, "grad_norm": 1.7532470226287842, "learning_rate": 9.951340343707852e-05, "loss": 1.0497, "step": 14 }, { "epoch": 0.027422303473491772, "grad_norm": 1.5659563541412354, "learning_rate": 9.924038765061042e-05, "loss": 0.9577, "step": 15 }, { "epoch": 0.029250457038391225, "grad_norm": 1.5059962272644043, "learning_rate": 9.890738003669029e-05, "loss": 0.9969, "step": 16 }, { "epoch": 0.031078610603290677, "grad_norm": 1.6406954526901245, "learning_rate": 9.851478631379982e-05, "loss": 0.9157, "step": 17 }, { "epoch": 0.03290676416819013, "grad_norm": 1.520993709564209, "learning_rate": 9.806308479691595e-05, "loss": 0.9165, "step": 18 }, { "epoch": 0.03290676416819013, "eval_loss": 0.9396372437477112, "eval_runtime": 65.9735, "eval_samples_per_second": 6.988, "eval_steps_per_second": 0.879, "step": 18 }, { "epoch": 0.03473491773308958, "grad_norm": 1.4247767925262451, "learning_rate": 9.755282581475769e-05, "loss": 0.8283, "step": 19 }, { "epoch": 0.03656307129798903, "grad_norm": 1.5920847654342651, "learning_rate": 9.698463103929542e-05, "loss": 1.0668, "step": 20 }, { "epoch": 0.038391224862888484, "grad_norm": 1.451392650604248, "learning_rate": 9.635919272833938e-05, "loss": 0.9108, "step": 21 }, { "epoch": 0.04021937842778794, "grad_norm": 1.4005733728408813, "learning_rate": 9.567727288213005e-05, "loss": 0.8867, "step": 22 }, { "epoch": 0.04204753199268738, "grad_norm": 1.284644365310669, "learning_rate": 9.493970231495835e-05, "loss": 0.8585, "step": 23 }, { "epoch": 0.043875685557586835, "grad_norm": 1.3989516496658325, "learning_rate": 9.414737964294636e-05, "loss": 0.6688, "step": 24 }, { "epoch": 0.04570383912248629, "grad_norm": 1.6957075595855713, "learning_rate": 9.330127018922194e-05, "loss": 0.7847, "step": 25 }, { "epoch": 0.04753199268738574, "grad_norm": 1.3463910818099976, "learning_rate": 9.24024048078213e-05, "loss": 0.7988, "step": 26 }, { "epoch": 0.04936014625228519, "grad_norm": 1.4402669668197632, "learning_rate": 9.145187862775209e-05, "loss": 0.7279, "step": 27 }, { "epoch": 0.04936014625228519, "eval_loss": 0.8090941309928894, "eval_runtime": 65.9062, "eval_samples_per_second": 6.995, "eval_steps_per_second": 0.88, "step": 27 }, { "epoch": 0.051188299817184646, "grad_norm": 1.5752246379852295, "learning_rate": 9.045084971874738e-05, "loss": 0.784, "step": 28 }, { "epoch": 0.05301645338208409, "grad_norm": 1.5713309049606323, "learning_rate": 8.940053768033609e-05, "loss": 0.8821, "step": 29 }, { "epoch": 0.054844606946983544, "grad_norm": 1.5881538391113281, "learning_rate": 8.83022221559489e-05, "loss": 0.7659, "step": 30 }, { "epoch": 0.056672760511883, "grad_norm": 1.4045054912567139, "learning_rate": 8.715724127386972e-05, "loss": 0.8434, "step": 31 }, { "epoch": 0.05850091407678245, "grad_norm": 1.3920577764511108, "learning_rate": 8.596699001693255e-05, "loss": 0.7623, "step": 32 }, { "epoch": 0.0603290676416819, "grad_norm": 1.535837173461914, "learning_rate": 8.473291852294987e-05, "loss": 0.8262, "step": 33 }, { "epoch": 0.062157221206581355, "grad_norm": 1.3267228603363037, "learning_rate": 8.345653031794292e-05, "loss": 0.7212, "step": 34 }, { "epoch": 0.06398537477148081, "grad_norm": 1.2633198499679565, "learning_rate": 8.213938048432697e-05, "loss": 0.6503, "step": 35 }, { "epoch": 0.06581352833638025, "grad_norm": 1.4025475978851318, "learning_rate": 8.07830737662829e-05, "loss": 0.7201, "step": 36 }, { "epoch": 0.06581352833638025, "eval_loss": 0.7338544130325317, "eval_runtime": 65.937, "eval_samples_per_second": 6.992, "eval_steps_per_second": 0.88, "step": 36 }, { "epoch": 0.06764168190127971, "grad_norm": 1.480459213256836, "learning_rate": 7.938926261462366e-05, "loss": 0.7089, "step": 37 }, { "epoch": 0.06946983546617916, "grad_norm": 1.4058476686477661, "learning_rate": 7.795964517353735e-05, "loss": 0.7587, "step": 38 }, { "epoch": 0.0712979890310786, "grad_norm": 1.5269132852554321, "learning_rate": 7.649596321166024e-05, "loss": 0.7869, "step": 39 }, { "epoch": 0.07312614259597806, "grad_norm": 1.5104089975357056, "learning_rate": 7.500000000000001e-05, "loss": 0.7055, "step": 40 }, { "epoch": 0.07495429616087751, "grad_norm": 1.5070502758026123, "learning_rate": 7.347357813929454e-05, "loss": 0.677, "step": 41 }, { "epoch": 0.07678244972577697, "grad_norm": 1.4173520803451538, "learning_rate": 7.191855733945387e-05, "loss": 0.6873, "step": 42 }, { "epoch": 0.07861060329067641, "grad_norm": 1.5242536067962646, "learning_rate": 7.033683215379002e-05, "loss": 0.707, "step": 43 }, { "epoch": 0.08043875685557587, "grad_norm": 1.484297752380371, "learning_rate": 6.873032967079561e-05, "loss": 0.6689, "step": 44 }, { "epoch": 0.08226691042047532, "grad_norm": 1.4110380411148071, "learning_rate": 6.710100716628344e-05, "loss": 0.7221, "step": 45 }, { "epoch": 0.08226691042047532, "eval_loss": 0.6858494281768799, "eval_runtime": 65.9822, "eval_samples_per_second": 6.987, "eval_steps_per_second": 0.879, "step": 45 }, { "epoch": 0.08409506398537477, "grad_norm": 1.623942494392395, "learning_rate": 6.545084971874738e-05, "loss": 0.6482, "step": 46 }, { "epoch": 0.08592321755027423, "grad_norm": 1.4406014680862427, "learning_rate": 6.378186779084995e-05, "loss": 0.6407, "step": 47 }, { "epoch": 0.08775137111517367, "grad_norm": 1.2649809122085571, "learning_rate": 6.209609477998338e-05, "loss": 0.5991, "step": 48 }, { "epoch": 0.08957952468007313, "grad_norm": 1.4061084985733032, "learning_rate": 6.0395584540887963e-05, "loss": 0.6766, "step": 49 }, { "epoch": 0.09140767824497258, "grad_norm": 1.412961483001709, "learning_rate": 5.868240888334653e-05, "loss": 0.685, "step": 50 }, { "epoch": 0.09323583180987204, "grad_norm": 1.3918739557266235, "learning_rate": 5.695865504800327e-05, "loss": 0.6061, "step": 51 }, { "epoch": 0.09506398537477148, "grad_norm": 1.3472814559936523, "learning_rate": 5.522642316338268e-05, "loss": 0.6214, "step": 52 }, { "epoch": 0.09689213893967093, "grad_norm": 1.3871572017669678, "learning_rate": 5.348782368720626e-05, "loss": 0.6313, "step": 53 }, { "epoch": 0.09872029250457039, "grad_norm": 1.3942546844482422, "learning_rate": 5.174497483512506e-05, "loss": 0.6416, "step": 54 }, { "epoch": 0.09872029250457039, "eval_loss": 0.6498388648033142, "eval_runtime": 65.8779, "eval_samples_per_second": 6.998, "eval_steps_per_second": 0.88, "step": 54 }, { "epoch": 0.10054844606946983, "grad_norm": 1.36981201171875, "learning_rate": 5e-05, "loss": 0.6073, "step": 55 }, { "epoch": 0.10237659963436929, "grad_norm": 1.2767627239227295, "learning_rate": 4.825502516487497e-05, "loss": 0.6272, "step": 56 }, { "epoch": 0.10420475319926874, "grad_norm": 1.4287116527557373, "learning_rate": 4.6512176312793736e-05, "loss": 0.6646, "step": 57 }, { "epoch": 0.10603290676416818, "grad_norm": 1.3621071577072144, "learning_rate": 4.477357683661734e-05, "loss": 0.7029, "step": 58 }, { "epoch": 0.10786106032906764, "grad_norm": 1.4007452726364136, "learning_rate": 4.3041344951996746e-05, "loss": 0.6084, "step": 59 }, { "epoch": 0.10968921389396709, "grad_norm": 1.3651444911956787, "learning_rate": 4.131759111665349e-05, "loss": 0.6416, "step": 60 }, { "epoch": 0.11151736745886655, "grad_norm": 1.371893286705017, "learning_rate": 3.960441545911204e-05, "loss": 0.64, "step": 61 }, { "epoch": 0.113345521023766, "grad_norm": 1.320582628250122, "learning_rate": 3.790390522001662e-05, "loss": 0.5838, "step": 62 }, { "epoch": 0.11517367458866545, "grad_norm": 1.489526391029358, "learning_rate": 3.6218132209150045e-05, "loss": 0.743, "step": 63 }, { "epoch": 0.11517367458866545, "eval_loss": 0.632634162902832, "eval_runtime": 65.9467, "eval_samples_per_second": 6.99, "eval_steps_per_second": 0.879, "step": 63 }, { "epoch": 0.1170018281535649, "grad_norm": 1.4206615686416626, "learning_rate": 3.4549150281252636e-05, "loss": 0.6447, "step": 64 }, { "epoch": 0.11882998171846434, "grad_norm": 1.3722271919250488, "learning_rate": 3.289899283371657e-05, "loss": 0.6066, "step": 65 }, { "epoch": 0.1206581352833638, "grad_norm": 1.4413255453109741, "learning_rate": 3.12696703292044e-05, "loss": 0.567, "step": 66 }, { "epoch": 0.12248628884826325, "grad_norm": 1.2582032680511475, "learning_rate": 2.9663167846209998e-05, "loss": 0.5628, "step": 67 }, { "epoch": 0.12431444241316271, "grad_norm": 1.5137399435043335, "learning_rate": 2.8081442660546125e-05, "loss": 0.615, "step": 68 }, { "epoch": 0.12614259597806216, "grad_norm": 1.4835834503173828, "learning_rate": 2.6526421860705473e-05, "loss": 0.6619, "step": 69 }, { "epoch": 0.12797074954296161, "grad_norm": 1.2490266561508179, "learning_rate": 2.500000000000001e-05, "loss": 0.5641, "step": 70 }, { "epoch": 0.12979890310786105, "grad_norm": 1.3015044927597046, "learning_rate": 2.350403678833976e-05, "loss": 0.5771, "step": 71 }, { "epoch": 0.1316270566727605, "grad_norm": 1.284609317779541, "learning_rate": 2.2040354826462668e-05, "loss": 0.5642, "step": 72 }, { "epoch": 0.1316270566727605, "eval_loss": 0.610695481300354, "eval_runtime": 65.8956, "eval_samples_per_second": 6.996, "eval_steps_per_second": 0.88, "step": 72 }, { "epoch": 0.13345521023765997, "grad_norm": 1.3459687232971191, "learning_rate": 2.061073738537635e-05, "loss": 0.6421, "step": 73 }, { "epoch": 0.13528336380255943, "grad_norm": 1.2567322254180908, "learning_rate": 1.9216926233717085e-05, "loss": 0.5645, "step": 74 }, { "epoch": 0.13711151736745886, "grad_norm": 1.2956007719039917, "learning_rate": 1.7860619515673033e-05, "loss": 0.5018, "step": 75 } ], "logging_steps": 1, "max_steps": 100, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 5.75170771156992e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }