{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.14720314033366044,
  "eval_steps": 9,
  "global_step": 75,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    { "epoch": 0.001962708537782139, "grad_norm": 20.81890106201172, "learning_rate": 1e-05, "loss": 9.4225, "step": 1 },
    { "epoch": 0.001962708537782139, "eval_loss": 8.840479850769043, "eval_runtime": 52.2007, "eval_samples_per_second": 8.218, "eval_steps_per_second": 1.034, "step": 1 },
    { "epoch": 0.003925417075564278, "grad_norm": 16.247425079345703, "learning_rate": 2e-05, "loss": 8.8323, "step": 2 },
    { "epoch": 0.005888125613346418, "grad_norm": 17.129257202148438, "learning_rate": 3e-05, "loss": 9.7792, "step": 3 },
    { "epoch": 0.007850834151128557, "grad_norm": 15.433220863342285, "learning_rate": 4e-05, "loss": 8.5797, "step": 4 },
    { "epoch": 0.009813542688910697, "grad_norm": 13.5704927444458, "learning_rate": 5e-05, "loss": 7.7854, "step": 5 },
    { "epoch": 0.011776251226692836, "grad_norm": 14.451172828674316, "learning_rate": 6e-05, "loss": 7.6687, "step": 6 },
    { "epoch": 0.013738959764474975, "grad_norm": 17.657636642456055, "learning_rate": 7e-05, "loss": 6.804, "step": 7 },
    { "epoch": 0.015701668302257114, "grad_norm": 25.692893981933594, "learning_rate": 8e-05, "loss": 5.6844, "step": 8 },
    { "epoch": 0.017664376840039256, "grad_norm": 16.068389892578125, "learning_rate": 9e-05, "loss": 4.5304, "step": 9 },
    { "epoch": 0.017664376840039256, "eval_loss": 3.219860076904297, "eval_runtime": 51.8093, "eval_samples_per_second": 8.28, "eval_steps_per_second": 1.042, "step": 9 },
    { "epoch": 0.019627085377821395, "grad_norm": 13.496415138244629, "learning_rate": 0.0001, "loss": 3.1004, "step": 10 },
    { "epoch": 0.021589793915603533, "grad_norm": 11.793410301208496, "learning_rate": 9.99695413509548e-05, "loss": 2.1632, "step": 11 },
    { "epoch": 0.023552502453385672, "grad_norm": 12.152811050415039, "learning_rate": 9.987820251299122e-05, "loss": 1.0349, "step": 12 },
    { "epoch": 0.02551521099116781, "grad_norm": 7.9459919929504395, "learning_rate": 9.972609476841367e-05, "loss": 0.422, "step": 13 },
    { "epoch": 0.02747791952894995, "grad_norm": 2.51922869682312, "learning_rate": 9.951340343707852e-05, "loss": 0.14, "step": 14 },
    { "epoch": 0.029440628066732092, "grad_norm": 10.466949462890625, "learning_rate": 9.924038765061042e-05, "loss": 0.3396, "step": 15 },
    { "epoch": 0.03140333660451423, "grad_norm": 4.913854122161865, "learning_rate": 9.890738003669029e-05, "loss": 0.2676, "step": 16 },
    { "epoch": 0.033366045142296366, "grad_norm": 3.1969945430755615, "learning_rate": 9.851478631379982e-05, "loss": 0.151, "step": 17 },
    { "epoch": 0.03532875368007851, "grad_norm": 2.624812602996826, "learning_rate": 9.806308479691595e-05, "loss": 0.1735, "step": 18 },
    { "epoch": 0.03532875368007851, "eval_loss": 0.1688159704208374, "eval_runtime": 51.8215, "eval_samples_per_second": 8.278, "eval_steps_per_second": 1.042, "step": 18 },
    { "epoch": 0.03729146221786065, "grad_norm": 4.066593647003174, "learning_rate": 9.755282581475769e-05, "loss": 0.1576, "step": 19 },
    { "epoch": 0.03925417075564279, "grad_norm": 2.2764923572540283, "learning_rate": 9.698463103929542e-05, "loss": 0.0787, "step": 20 },
    { "epoch": 0.04121687929342493, "grad_norm": 2.3905651569366455, "learning_rate": 9.635919272833938e-05, "loss": 0.1805, "step": 21 },
    { "epoch": 0.04317958783120707, "grad_norm": 1.5698174238204956, "learning_rate": 9.567727288213005e-05, "loss": 0.146, "step": 22 },
    { "epoch": 0.045142296368989206, "grad_norm": 1.9719736576080322, "learning_rate": 9.493970231495835e-05, "loss": 0.1639, "step": 23 },
    { "epoch": 0.047105004906771344, "grad_norm": 1.390785813331604, "learning_rate": 9.414737964294636e-05, "loss": 0.1304, "step": 24 },
    { "epoch": 0.04906771344455348, "grad_norm": 0.9126207232475281, "learning_rate": 9.330127018922194e-05, "loss": 0.156, "step": 25 },
    { "epoch": 0.05103042198233562, "grad_norm": 2.437849283218384, "learning_rate": 9.24024048078213e-05, "loss": 0.1155, "step": 26 },
    { "epoch": 0.05299313052011776, "grad_norm": 0.4617787003517151, "learning_rate": 9.145187862775209e-05, "loss": 0.123, "step": 27 },
    { "epoch": 0.05299313052011776, "eval_loss": 0.1673201620578766, "eval_runtime": 51.8705, "eval_samples_per_second": 8.271, "eval_steps_per_second": 1.041, "step": 27 },
    { "epoch": 0.0549558390578999, "grad_norm": 1.0778998136520386, "learning_rate": 9.045084971874738e-05, "loss": 0.1533, "step": 28 },
    { "epoch": 0.05691854759568204, "grad_norm": 2.6037826538085938, "learning_rate": 8.940053768033609e-05, "loss": 0.2361, "step": 29 },
    { "epoch": 0.058881256133464184, "grad_norm": 1.1255934238433838, "learning_rate": 8.83022221559489e-05, "loss": 0.133, "step": 30 },
    { "epoch": 0.06084396467124632, "grad_norm": 0.5252030491828918, "learning_rate": 8.715724127386972e-05, "loss": 0.0579, "step": 31 },
    { "epoch": 0.06280667320902845, "grad_norm": 0.34210190176963806, "learning_rate": 8.596699001693255e-05, "loss": 0.0365, "step": 32 },
    { "epoch": 0.0647693817468106, "grad_norm": 4.503533840179443, "learning_rate": 8.473291852294987e-05, "loss": 0.3934, "step": 33 },
    { "epoch": 0.06673209028459273, "grad_norm": 1.0601013898849487, "learning_rate": 8.345653031794292e-05, "loss": 0.1153, "step": 34 },
    { "epoch": 0.06869479882237488, "grad_norm": 2.3702797889709473, "learning_rate": 8.213938048432697e-05, "loss": 0.2298, "step": 35 },
    { "epoch": 0.07065750736015702, "grad_norm": 0.9476222991943359, "learning_rate": 8.07830737662829e-05, "loss": 0.1219, "step": 36 },
    { "epoch": 0.07065750736015702, "eval_loss": 0.22275131940841675, "eval_runtime": 51.8318, "eval_samples_per_second": 8.277, "eval_steps_per_second": 1.042, "step": 36 },
    { "epoch": 0.07262021589793916, "grad_norm": 4.955600261688232, "learning_rate": 7.938926261462366e-05, "loss": 0.1983, "step": 37 },
    { "epoch": 0.0745829244357213, "grad_norm": 5.591402053833008, "learning_rate": 7.795964517353735e-05, "loss": 0.2904, "step": 38 },
    { "epoch": 0.07654563297350343, "grad_norm": 5.275705337524414, "learning_rate": 7.649596321166024e-05, "loss": 0.232, "step": 39 },
    { "epoch": 0.07850834151128558, "grad_norm": 1.338252305984497, "learning_rate": 7.500000000000001e-05, "loss": 0.1405, "step": 40 },
    { "epoch": 0.08047105004906771, "grad_norm": 1.6821722984313965, "learning_rate": 7.347357813929454e-05, "loss": 0.1187, "step": 41 },
    { "epoch": 0.08243375858684986, "grad_norm": 1.2487791776657104, "learning_rate": 7.191855733945387e-05, "loss": 0.1329, "step": 42 },
    { "epoch": 0.08439646712463199, "grad_norm": 2.20461368560791, "learning_rate": 7.033683215379002e-05, "loss": 0.1637, "step": 43 },
    { "epoch": 0.08635917566241413, "grad_norm": 0.9710573554039001, "learning_rate": 6.873032967079561e-05, "loss": 0.109, "step": 44 },
    { "epoch": 0.08832188420019627, "grad_norm": 0.7090091705322266, "learning_rate": 6.710100716628344e-05, "loss": 0.0919, "step": 45 },
    { "epoch": 0.08832188420019627, "eval_loss": 0.11113426089286804, "eval_runtime": 51.8528, "eval_samples_per_second": 8.273, "eval_steps_per_second": 1.041, "step": 45 },
    { "epoch": 0.09028459273797841, "grad_norm": 0.7625433206558228, "learning_rate": 6.545084971874738e-05, "loss": 0.1295, "step": 46 },
    { "epoch": 0.09224730127576054, "grad_norm": 1.3009706735610962, "learning_rate": 6.378186779084995e-05, "loss": 0.09, "step": 47 },
    { "epoch": 0.09421000981354269, "grad_norm": 0.7470637559890747, "learning_rate": 6.209609477998338e-05, "loss": 0.0984, "step": 48 },
    { "epoch": 0.09617271835132483, "grad_norm": 0.5593076348304749, "learning_rate": 6.0395584540887963e-05, "loss": 0.0986, "step": 49 },
    { "epoch": 0.09813542688910697, "grad_norm": 0.45251327753067017, "learning_rate": 5.868240888334653e-05, "loss": 0.0453, "step": 50 },
    { "epoch": 0.10009813542688911, "grad_norm": 0.6971419453620911, "learning_rate": 5.695865504800327e-05, "loss": 0.0857, "step": 51 },
    { "epoch": 0.10206084396467124, "grad_norm": 1.4667972326278687, "learning_rate": 5.522642316338268e-05, "loss": 0.1401, "step": 52 },
    { "epoch": 0.10402355250245339, "grad_norm": 0.5915104150772095, "learning_rate": 5.348782368720626e-05, "loss": 0.0572, "step": 53 },
    { "epoch": 0.10598626104023552, "grad_norm": 0.9713770747184753, "learning_rate": 5.174497483512506e-05, "loss": 0.0795, "step": 54 },
    { "epoch": 0.10598626104023552, "eval_loss": 0.07904753088951111, "eval_runtime": 51.8493, "eval_samples_per_second": 8.274, "eval_steps_per_second": 1.041, "step": 54 },
    { "epoch": 0.10794896957801767, "grad_norm": 0.7951352596282959, "learning_rate": 5e-05, "loss": 0.0704, "step": 55 },
    { "epoch": 0.1099116781157998, "grad_norm": 2.8656163215637207, "learning_rate": 4.825502516487497e-05, "loss": 0.1262, "step": 56 },
    { "epoch": 0.11187438665358194, "grad_norm": 0.786283016204834, "learning_rate": 4.6512176312793736e-05, "loss": 0.0959, "step": 57 },
    { "epoch": 0.11383709519136408, "grad_norm": 0.9875638484954834, "learning_rate": 4.477357683661734e-05, "loss": 0.0507, "step": 58 },
    { "epoch": 0.11579980372914622, "grad_norm": 0.8965640068054199, "learning_rate": 4.3041344951996746e-05, "loss": 0.0643, "step": 59 },
    { "epoch": 0.11776251226692837, "grad_norm": 0.4218522608280182, "learning_rate": 4.131759111665349e-05, "loss": 0.0271, "step": 60 },
    { "epoch": 0.1197252208047105, "grad_norm": 1.05173659324646, "learning_rate": 3.960441545911204e-05, "loss": 0.0416, "step": 61 },
    { "epoch": 0.12168792934249265, "grad_norm": 0.9266757965087891, "learning_rate": 3.790390522001662e-05, "loss": 0.0968, "step": 62 },
    { "epoch": 0.12365063788027478, "grad_norm": 2.450507164001465, "learning_rate": 3.6218132209150045e-05, "loss": 0.1366, "step": 63 },
    { "epoch": 0.12365063788027478, "eval_loss": 0.06722546368837357, "eval_runtime": 51.8533, "eval_samples_per_second": 8.273, "eval_steps_per_second": 1.041, "step": 63 },
    { "epoch": 0.1256133464180569, "grad_norm": 0.7940893769264221, "learning_rate": 3.4549150281252636e-05, "loss": 0.0558, "step": 64 },
    { "epoch": 0.12757605495583907, "grad_norm": 1.0147578716278076, "learning_rate": 3.289899283371657e-05, "loss": 0.0655, "step": 65 },
    { "epoch": 0.1295387634936212, "grad_norm": 0.7361927032470703, "learning_rate": 3.12696703292044e-05, "loss": 0.0467, "step": 66 },
    { "epoch": 0.13150147203140333, "grad_norm": 0.8373685479164124, "learning_rate": 2.9663167846209998e-05, "loss": 0.0838, "step": 67 },
    { "epoch": 0.13346418056918546, "grad_norm": 0.41943085193634033, "learning_rate": 2.8081442660546125e-05, "loss": 0.0278, "step": 68 },
    { "epoch": 0.13542688910696762, "grad_norm": 0.35588350892066956, "learning_rate": 2.6526421860705473e-05, "loss": 0.025, "step": 69 },
    { "epoch": 0.13738959764474976, "grad_norm": 0.7751619219779968, "learning_rate": 2.500000000000001e-05, "loss": 0.0516, "step": 70 },
    { "epoch": 0.1393523061825319, "grad_norm": 0.735934853553772, "learning_rate": 2.350403678833976e-05, "loss": 0.0708, "step": 71 },
    { "epoch": 0.14131501472031405, "grad_norm": 0.5799421668052673, "learning_rate": 2.2040354826462668e-05, "loss": 0.0559, "step": 72 },
    { "epoch": 0.14131501472031405, "eval_loss": 0.06862723082304001, "eval_runtime": 51.8315, "eval_samples_per_second": 8.277, "eval_steps_per_second": 1.042, "step": 72 },
    { "epoch": 0.14327772325809618, "grad_norm": 1.0606155395507812, "learning_rate": 2.061073738537635e-05, "loss": 0.0798, "step": 73 },
    { "epoch": 0.1452404317958783, "grad_norm": 0.8018274903297424, "learning_rate": 1.9216926233717085e-05, "loss": 0.0536, "step": 74 },
    { "epoch": 0.14720314033366044, "grad_norm": 0.9818136692047119, "learning_rate": 1.7860619515673033e-05, "loss": 0.076, "step": 75 }
  ],
  "logging_steps": 1,
  "max_steps": 100,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 25,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 5.56415764660224e+16,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}