{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.05841974587410545, "eval_steps": 5, "global_step": 50, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0011683949174821088, "grad_norm": 0.5629591941833496, "learning_rate": 2e-05, "loss": 1.6259, "step": 1 }, { "epoch": 0.0011683949174821088, "eval_loss": 1.7091846466064453, "eval_runtime": 32.7497, "eval_samples_per_second": 22.016, "eval_steps_per_second": 11.023, "step": 1 }, { "epoch": 0.0023367898349642177, "grad_norm": 0.4464726150035858, "learning_rate": 4e-05, "loss": 1.9928, "step": 2 }, { "epoch": 0.0035051847524463268, "grad_norm": 0.554092288017273, "learning_rate": 6e-05, "loss": 1.9481, "step": 3 }, { "epoch": 0.004673579669928435, "grad_norm": 0.5418124198913574, "learning_rate": 8e-05, "loss": 2.0157, "step": 4 }, { "epoch": 0.0058419745874105445, "grad_norm": 0.6429816484451294, "learning_rate": 0.0001, "loss": 1.9883, "step": 5 }, { "epoch": 0.0058419745874105445, "eval_loss": 1.6724804639816284, "eval_runtime": 31.1462, "eval_samples_per_second": 23.149, "eval_steps_per_second": 11.59, "step": 5 }, { "epoch": 0.0070103695048926535, "grad_norm": 0.7108197212219238, "learning_rate": 0.00012, "loss": 1.9129, "step": 6 }, { "epoch": 0.008178764422374763, "grad_norm": 0.6660429835319519, "learning_rate": 0.00014, "loss": 1.5668, "step": 7 }, { "epoch": 0.00934715933985687, "grad_norm": 0.7972020506858826, "learning_rate": 0.00016, "loss": 1.6182, "step": 8 }, { "epoch": 0.01051555425733898, "grad_norm": 0.9052525162696838, "learning_rate": 0.00018, "loss": 1.7629, "step": 9 }, { "epoch": 0.011683949174821089, "grad_norm": 0.6085521578788757, "learning_rate": 0.0002, "loss": 1.7345, "step": 10 }, { "epoch": 0.011683949174821089, "eval_loss": 1.4248594045639038, "eval_runtime": 31.1654, "eval_samples_per_second": 23.135, "eval_steps_per_second": 11.583, "step": 10 }, { "epoch": 0.012852344092303198, "grad_norm": 0.6540700793266296, "learning_rate": 0.0001996917333733128, "loss": 1.3627, "step": 11 }, { "epoch": 0.014020739009785307, "grad_norm": 1.1813312768936157, "learning_rate": 0.00019876883405951377, "loss": 1.4923, "step": 12 }, { "epoch": 0.015189133927267416, "grad_norm": 0.9703741669654846, "learning_rate": 0.00019723699203976766, "loss": 1.5092, "step": 13 }, { "epoch": 0.016357528844749527, "grad_norm": 1.16099214553833, "learning_rate": 0.00019510565162951537, "loss": 1.2979, "step": 14 }, { "epoch": 0.017525923762231634, "grad_norm": 1.7793681621551514, "learning_rate": 0.0001923879532511287, "loss": 1.8037, "step": 15 }, { "epoch": 0.017525923762231634, "eval_loss": 1.2205755710601807, "eval_runtime": 31.1965, "eval_samples_per_second": 23.112, "eval_steps_per_second": 11.572, "step": 15 }, { "epoch": 0.01869431867971374, "grad_norm": 1.3740328550338745, "learning_rate": 0.0001891006524188368, "loss": 1.2364, "step": 16 }, { "epoch": 0.019862713597195852, "grad_norm": 0.6489018201828003, "learning_rate": 0.00018526401643540922, "loss": 1.7165, "step": 17 }, { "epoch": 0.02103110851467796, "grad_norm": 1.0925291776657104, "learning_rate": 0.00018090169943749476, "loss": 1.4376, "step": 18 }, { "epoch": 0.02219950343216007, "grad_norm": 0.28496918082237244, "learning_rate": 0.0001760405965600031, "loss": 1.5726, "step": 19 }, { "epoch": 0.023367898349642178, "grad_norm": 0.37671753764152527, "learning_rate": 0.00017071067811865476, "loss": 1.1192, "step": 20 }, { "epoch": 0.023367898349642178, "eval_loss": 1.2007029056549072, "eval_runtime": 31.1968, "eval_samples_per_second": 23.111, "eval_steps_per_second": 11.572, "step": 20 }, { "epoch": 0.02453629326712429, "grad_norm": 0.3938184380531311, "learning_rate": 0.00016494480483301836, "loss": 1.4101, "step": 21 }, { "epoch": 0.025704688184606396, "grad_norm": 0.42164233326911926, "learning_rate": 0.00015877852522924732, "loss": 1.1861, "step": 22 }, { "epoch": 0.026873083102088507, "grad_norm": 0.3876625597476959, "learning_rate": 0.0001522498564715949, "loss": 1.3193, "step": 23 }, { "epoch": 0.028041478019570614, "grad_norm": 0.511690080165863, "learning_rate": 0.00014539904997395468, "loss": 1.1979, "step": 24 }, { "epoch": 0.029209872937052725, "grad_norm": 0.4257306456565857, "learning_rate": 0.000138268343236509, "loss": 1.4168, "step": 25 }, { "epoch": 0.029209872937052725, "eval_loss": 1.1946120262145996, "eval_runtime": 31.1806, "eval_samples_per_second": 23.123, "eval_steps_per_second": 11.578, "step": 25 }, { "epoch": 0.030378267854534832, "grad_norm": 0.3946686387062073, "learning_rate": 0.00013090169943749476, "loss": 1.3165, "step": 26 }, { "epoch": 0.03154666277201694, "grad_norm": 0.37801477313041687, "learning_rate": 0.00012334453638559057, "loss": 1.6097, "step": 27 }, { "epoch": 0.032715057689499054, "grad_norm": 0.3698787987232208, "learning_rate": 0.0001156434465040231, "loss": 1.4314, "step": 28 }, { "epoch": 0.03388345260698116, "grad_norm": 0.36553719639778137, "learning_rate": 0.0001078459095727845, "loss": 1.6258, "step": 29 }, { "epoch": 0.03505184752446327, "grad_norm": 0.3180171847343445, "learning_rate": 0.0001, "loss": 1.8529, "step": 30 }, { "epoch": 0.03505184752446327, "eval_loss": 1.188832402229309, "eval_runtime": 31.1804, "eval_samples_per_second": 23.123, "eval_steps_per_second": 11.578, "step": 30 }, { "epoch": 0.036220242441945376, "grad_norm": 0.3740488886833191, "learning_rate": 9.215409042721552e-05, "loss": 0.7608, "step": 31 }, { "epoch": 0.03738863735942748, "grad_norm": 0.3934629261493683, "learning_rate": 8.435655349597689e-05, "loss": 1.1501, "step": 32 }, { "epoch": 0.0385570322769096, "grad_norm": 0.3714420199394226, "learning_rate": 7.66554636144095e-05, "loss": 1.2233, "step": 33 }, { "epoch": 0.039725427194391705, "grad_norm": 0.3141462206840515, "learning_rate": 6.909830056250527e-05, "loss": 1.3308, "step": 34 }, { "epoch": 0.04089382211187381, "grad_norm": 0.3738103210926056, "learning_rate": 6.173165676349103e-05, "loss": 1.5121, "step": 35 }, { "epoch": 0.04089382211187381, "eval_loss": 1.1856106519699097, "eval_runtime": 31.1551, "eval_samples_per_second": 23.142, "eval_steps_per_second": 11.587, "step": 35 }, { "epoch": 0.04206221702935592, "grad_norm": 0.3965364992618561, "learning_rate": 5.4600950026045326e-05, "loss": 1.6638, "step": 36 }, { "epoch": 0.043230611946838034, "grad_norm": 0.40209704637527466, "learning_rate": 4.7750143528405126e-05, "loss": 1.2929, "step": 37 }, { "epoch": 0.04439900686432014, "grad_norm": 0.4041275084018707, "learning_rate": 4.12214747707527e-05, "loss": 1.3061, "step": 38 }, { "epoch": 0.04556740178180225, "grad_norm": 0.3122788965702057, "learning_rate": 3.5055195166981645e-05, "loss": 1.3712, "step": 39 }, { "epoch": 0.046735796699284356, "grad_norm": 0.2880614697933197, "learning_rate": 2.9289321881345254e-05, "loss": 1.7083, "step": 40 }, { "epoch": 0.046735796699284356, "eval_loss": 1.18394935131073, "eval_runtime": 31.17, "eval_samples_per_second": 23.131, "eval_steps_per_second": 11.582, "step": 40 }, { "epoch": 0.04790419161676647, "grad_norm": 0.4715648293495178, "learning_rate": 2.3959403439996907e-05, "loss": 1.2266, "step": 41 }, { "epoch": 0.04907258653424858, "grad_norm": 0.3626469075679779, "learning_rate": 1.9098300562505266e-05, "loss": 1.3755, "step": 42 }, { "epoch": 0.050240981451730685, "grad_norm": 0.3921823799610138, "learning_rate": 1.4735983564590783e-05, "loss": 1.3125, "step": 43 }, { "epoch": 0.05140937636921279, "grad_norm": 0.35432127118110657, "learning_rate": 1.0899347581163221e-05, "loss": 1.2351, "step": 44 }, { "epoch": 0.052577771286694906, "grad_norm": 0.4417038559913635, "learning_rate": 7.612046748871327e-06, "loss": 1.7417, "step": 45 }, { "epoch": 0.052577771286694906, "eval_loss": 1.1834783554077148, "eval_runtime": 31.2195, "eval_samples_per_second": 23.095, "eval_steps_per_second": 11.563, "step": 45 }, { "epoch": 0.053746166204177014, "grad_norm": 0.3598865866661072, "learning_rate": 4.8943483704846475e-06, "loss": 1.4514, "step": 46 }, { "epoch": 0.05491456112165912, "grad_norm": 0.43709179759025574, "learning_rate": 2.7630079602323442e-06, "loss": 1.5828, "step": 47 }, { "epoch": 0.05608295603914123, "grad_norm": 0.32742375135421753, "learning_rate": 1.231165940486234e-06, "loss": 1.5362, "step": 48 }, { "epoch": 0.057251350956623336, "grad_norm": 0.31574079394340515, "learning_rate": 3.0826662668720364e-07, "loss": 1.5458, "step": 49 }, { "epoch": 0.05841974587410545, "grad_norm": 0.3892696797847748, "learning_rate": 0.0, "loss": 1.5221, "step": 50 }, { "epoch": 0.05841974587410545, "eval_loss": 1.1832040548324585, "eval_runtime": 31.1605, "eval_samples_per_second": 23.138, "eval_steps_per_second": 11.585, "step": 50 } ], "logging_steps": 1, "max_steps": 50, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 10, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.3069219818110976e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }