{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.19860973187686196, "eval_steps": 5, "global_step": 50, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.003972194637537239, "grad_norm": 0.4402098059654236, "learning_rate": 1e-05, "loss": 0.8096, "step": 1 }, { "epoch": 0.003972194637537239, "eval_loss": 0.9886362552642822, "eval_runtime": 26.8876, "eval_samples_per_second": 3.942, "eval_steps_per_second": 1.971, "step": 1 }, { "epoch": 0.007944389275074478, "grad_norm": 0.387863427400589, "learning_rate": 2e-05, "loss": 0.7586, "step": 2 }, { "epoch": 0.011916583912611719, "grad_norm": 0.3501233160495758, "learning_rate": 3e-05, "loss": 0.5331, "step": 3 }, { "epoch": 0.015888778550148957, "grad_norm": 0.48034775257110596, "learning_rate": 4e-05, "loss": 0.621, "step": 4 }, { "epoch": 0.019860973187686197, "grad_norm": 0.4095894694328308, "learning_rate": 5e-05, "loss": 0.7758, "step": 5 }, { "epoch": 0.019860973187686197, "eval_loss": 0.9715360403060913, "eval_runtime": 26.9366, "eval_samples_per_second": 3.935, "eval_steps_per_second": 1.968, "step": 5 }, { "epoch": 0.023833167825223437, "grad_norm": 0.606968343257904, "learning_rate": 6e-05, "loss": 0.7785, "step": 6 }, { "epoch": 0.027805362462760674, "grad_norm": 1.2388267517089844, "learning_rate": 7e-05, "loss": 0.9735, "step": 7 }, { "epoch": 0.031777557100297914, "grad_norm": 1.5921814441680908, "learning_rate": 8e-05, "loss": 0.9913, "step": 8 }, { "epoch": 0.03574975173783516, "grad_norm": 2.2070536613464355, "learning_rate": 9e-05, "loss": 1.5737, "step": 9 }, { "epoch": 0.039721946375372394, "grad_norm": 0.39528149366378784, "learning_rate": 0.0001, "loss": 0.4109, "step": 10 }, { "epoch": 0.039721946375372394, "eval_loss": 0.6905767917633057, "eval_runtime": 27.0421, "eval_samples_per_second": 3.92, "eval_steps_per_second": 1.96, "step": 10 }, { "epoch": 0.04369414101290963, "grad_norm": 0.9849607944488525, "learning_rate": 9.98458666866564e-05, "loss": 0.5394, "step": 11 }, { "epoch": 0.047666335650446874, "grad_norm": 0.7983195185661316, "learning_rate": 9.938441702975689e-05, "loss": 0.5027, "step": 12 }, { "epoch": 0.05163853028798411, "grad_norm": 1.2366907596588135, "learning_rate": 9.861849601988383e-05, "loss": 0.749, "step": 13 }, { "epoch": 0.05561072492552135, "grad_norm": 0.5725651979446411, "learning_rate": 9.755282581475769e-05, "loss": 0.4126, "step": 14 }, { "epoch": 0.05958291956305859, "grad_norm": 0.7858136892318726, "learning_rate": 9.619397662556435e-05, "loss": 0.4712, "step": 15 }, { "epoch": 0.05958291956305859, "eval_loss": 0.43833062052726746, "eval_runtime": 27.0611, "eval_samples_per_second": 3.917, "eval_steps_per_second": 1.959, "step": 15 }, { "epoch": 0.06355511420059583, "grad_norm": 0.487553209066391, "learning_rate": 9.45503262094184e-05, "loss": 0.2656, "step": 16 }, { "epoch": 0.06752730883813307, "grad_norm": 0.596966564655304, "learning_rate": 9.263200821770461e-05, "loss": 0.2773, "step": 17 }, { "epoch": 0.07149950347567031, "grad_norm": 0.7163795828819275, "learning_rate": 9.045084971874738e-05, "loss": 0.3517, "step": 18 }, { "epoch": 0.07547169811320754, "grad_norm": 0.4074513614177704, "learning_rate": 8.802029828000156e-05, "loss": 0.3021, "step": 19 }, { "epoch": 0.07944389275074479, "grad_norm": 0.6424429416656494, "learning_rate": 8.535533905932738e-05, "loss": 0.3123, "step": 20 }, { "epoch": 0.07944389275074479, "eval_loss": 0.35487380623817444, "eval_runtime": 27.0826, "eval_samples_per_second": 3.914, "eval_steps_per_second": 1.957, "step": 20 }, { "epoch": 0.08341608738828203, "grad_norm": 0.6385417580604553, "learning_rate": 8.247240241650918e-05, "loss": 0.3987, "step": 21 }, { "epoch": 0.08738828202581926, "grad_norm": 0.4243467450141907, "learning_rate": 7.938926261462366e-05, "loss": 0.2488, "step": 22 }, { "epoch": 0.0913604766633565, "grad_norm": 0.34767284989356995, "learning_rate": 7.612492823579745e-05, "loss": 0.2428, "step": 23 }, { "epoch": 0.09533267130089375, "grad_norm": 0.6192077994346619, "learning_rate": 7.269952498697734e-05, "loss": 0.3342, "step": 24 }, { "epoch": 0.09930486593843098, "grad_norm": 0.47093358635902405, "learning_rate": 6.91341716182545e-05, "loss": 0.2855, "step": 25 }, { "epoch": 0.09930486593843098, "eval_loss": 0.30741116404533386, "eval_runtime": 27.0911, "eval_samples_per_second": 3.913, "eval_steps_per_second": 1.956, "step": 25 }, { "epoch": 0.10327706057596822, "grad_norm": 0.5661317110061646, "learning_rate": 6.545084971874738e-05, "loss": 0.3159, "step": 26 }, { "epoch": 0.10724925521350546, "grad_norm": 0.3478457033634186, "learning_rate": 6.167226819279528e-05, "loss": 0.2156, "step": 27 }, { "epoch": 0.1112214498510427, "grad_norm": 0.3984580338001251, "learning_rate": 5.782172325201155e-05, "loss": 0.265, "step": 28 }, { "epoch": 0.11519364448857994, "grad_norm": 0.5232352614402771, "learning_rate": 5.392295478639225e-05, "loss": 0.2667, "step": 29 }, { "epoch": 0.11916583912611718, "grad_norm": 0.5363503694534302, "learning_rate": 5e-05, "loss": 0.2837, "step": 30 }, { "epoch": 0.11916583912611718, "eval_loss": 0.2915026843547821, "eval_runtime": 27.0837, "eval_samples_per_second": 3.914, "eval_steps_per_second": 1.957, "step": 30 }, { "epoch": 0.12313803376365443, "grad_norm": 0.3917446434497833, "learning_rate": 4.607704521360776e-05, "loss": 0.2119, "step": 31 }, { "epoch": 0.12711022840119166, "grad_norm": 0.2668308615684509, "learning_rate": 4.2178276747988446e-05, "loss": 0.2285, "step": 32 }, { "epoch": 0.13108242303872888, "grad_norm": 0.32772475481033325, "learning_rate": 3.832773180720475e-05, "loss": 0.2214, "step": 33 }, { "epoch": 0.13505461767626614, "grad_norm": 0.4210818111896515, "learning_rate": 3.4549150281252636e-05, "loss": 0.2204, "step": 34 }, { "epoch": 0.13902681231380337, "grad_norm": 0.470721960067749, "learning_rate": 3.086582838174551e-05, "loss": 0.261, "step": 35 }, { "epoch": 0.13902681231380337, "eval_loss": 0.2774577736854553, "eval_runtime": 27.0865, "eval_samples_per_second": 3.913, "eval_steps_per_second": 1.957, "step": 35 }, { "epoch": 0.14299900695134063, "grad_norm": 0.3494637608528137, "learning_rate": 2.7300475013022663e-05, "loss": 0.2237, "step": 36 }, { "epoch": 0.14697120158887786, "grad_norm": 0.49559351801872253, "learning_rate": 2.3875071764202563e-05, "loss": 0.2608, "step": 37 }, { "epoch": 0.1509433962264151, "grad_norm": 0.4698236882686615, "learning_rate": 2.061073738537635e-05, "loss": 0.2241, "step": 38 }, { "epoch": 0.15491559086395235, "grad_norm": 0.6071957349777222, "learning_rate": 1.7527597583490822e-05, "loss": 0.2778, "step": 39 }, { "epoch": 0.15888778550148958, "grad_norm": 0.302901953458786, "learning_rate": 1.4644660940672627e-05, "loss": 0.2584, "step": 40 }, { "epoch": 0.15888778550148958, "eval_loss": 0.2731941044330597, "eval_runtime": 27.0826, "eval_samples_per_second": 3.914, "eval_steps_per_second": 1.957, "step": 40 }, { "epoch": 0.1628599801390268, "grad_norm": 0.7419318556785583, "learning_rate": 1.1979701719998453e-05, "loss": 0.3068, "step": 41 }, { "epoch": 0.16683217477656406, "grad_norm": 0.5678064227104187, "learning_rate": 9.549150281252633e-06, "loss": 0.2748, "step": 42 }, { "epoch": 0.1708043694141013, "grad_norm": 0.3631245791912079, "learning_rate": 7.367991782295391e-06, "loss": 0.2302, "step": 43 }, { "epoch": 0.17477656405163852, "grad_norm": 0.48080864548683167, "learning_rate": 5.449673790581611e-06, "loss": 0.2325, "step": 44 }, { "epoch": 0.17874875868917578, "grad_norm": 0.3569527566432953, "learning_rate": 3.8060233744356633e-06, "loss": 0.1848, "step": 45 }, { "epoch": 0.17874875868917578, "eval_loss": 0.2712867259979248, "eval_runtime": 27.0882, "eval_samples_per_second": 3.913, "eval_steps_per_second": 1.957, "step": 45 }, { "epoch": 0.182720953326713, "grad_norm": 0.42595577239990234, "learning_rate": 2.4471741852423237e-06, "loss": 0.2049, "step": 46 }, { "epoch": 0.18669314796425024, "grad_norm": 0.5140714645385742, "learning_rate": 1.3815039801161721e-06, "loss": 0.2283, "step": 47 }, { "epoch": 0.1906653426017875, "grad_norm": 0.6511046290397644, "learning_rate": 6.15582970243117e-07, "loss": 0.2613, "step": 48 }, { "epoch": 0.19463753723932473, "grad_norm": 0.6832799911499023, "learning_rate": 1.5413331334360182e-07, "loss": 0.2708, "step": 49 }, { "epoch": 0.19860973187686196, "grad_norm": 0.32798025012016296, "learning_rate": 0.0, "loss": 0.2238, "step": 50 }, { "epoch": 0.19860973187686196, "eval_loss": 0.2707710564136505, "eval_runtime": 27.0853, "eval_samples_per_second": 3.914, "eval_steps_per_second": 1.957, "step": 50 } ], "logging_steps": 1, "max_steps": 50, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.69912849629184e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }