{ "best_metric": null, "best_model_checkpoint": null, "epoch": 9.770992366412214, "eval_steps": 500, "global_step": 320, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.3053435114503817, "grad_norm": 1.0634765625, "learning_rate": 0.0001995184726672197, "loss": 1.8094, "step": 10 }, { "epoch": 0.6106870229007634, "grad_norm": 0.9521484375, "learning_rate": 0.00019807852804032305, "loss": 1.1712, "step": 20 }, { "epoch": 0.916030534351145, "grad_norm": 2.720703125, "learning_rate": 0.0001956940335732209, "loss": 1.0862, "step": 30 }, { "epoch": 1.2213740458015268, "grad_norm": 2.3515625, "learning_rate": 0.0001923879532511287, "loss": 0.9789, "step": 40 }, { "epoch": 1.5267175572519083, "grad_norm": 0.46435546875, "learning_rate": 0.0001881921264348355, "loss": 0.8284, "step": 50 }, { "epoch": 1.83206106870229, "grad_norm": 2.533203125, "learning_rate": 0.00018314696123025454, "loss": 0.809, "step": 60 }, { "epoch": 2.1374045801526718, "grad_norm": 0.8310546875, "learning_rate": 0.0001773010453362737, "loss": 0.7111, "step": 70 }, { "epoch": 2.4427480916030535, "grad_norm": 4.42578125, "learning_rate": 0.00017071067811865476, "loss": 0.7286, "step": 80 }, { "epoch": 2.7480916030534353, "grad_norm": 0.70947265625, "learning_rate": 0.00016343932841636456, "loss": 0.7382, "step": 90 }, { "epoch": 3.053435114503817, "grad_norm": 1.1298828125, "learning_rate": 0.00015555702330196023, "loss": 0.7349, "step": 100 }, { "epoch": 3.3587786259541983, "grad_norm": 2.6484375, "learning_rate": 0.0001471396736825998, "loss": 0.7027, "step": 110 }, { "epoch": 3.66412213740458, "grad_norm": 0.407958984375, "learning_rate": 0.000138268343236509, "loss": 0.707, "step": 120 }, { "epoch": 3.969465648854962, "grad_norm": 0.427001953125, "learning_rate": 0.00012902846772544624, "loss": 0.6915, "step": 130 }, { "epoch": 4.2748091603053435, "grad_norm": 0.34228515625, "learning_rate": 0.00011950903220161285, "loss": 0.6953, "step": 140 }, { "epoch": 4.580152671755725, "grad_norm": 0.489990234375, "learning_rate": 0.0001098017140329561, "loss": 0.6831, "step": 150 }, { "epoch": 4.885496183206107, "grad_norm": 0.38525390625, "learning_rate": 0.0001, "loss": 0.6754, "step": 160 }, { "epoch": 5.190839694656488, "grad_norm": 0.389892578125, "learning_rate": 9.019828596704394e-05, "loss": 0.6688, "step": 170 }, { "epoch": 5.4961832061068705, "grad_norm": 0.467529296875, "learning_rate": 8.049096779838719e-05, "loss": 0.668, "step": 180 }, { "epoch": 5.801526717557252, "grad_norm": 0.39794921875, "learning_rate": 7.097153227455379e-05, "loss": 0.6651, "step": 190 }, { "epoch": 6.106870229007634, "grad_norm": 0.52294921875, "learning_rate": 6.173165676349103e-05, "loss": 0.662, "step": 200 }, { "epoch": 6.412213740458015, "grad_norm": 0.37109375, "learning_rate": 5.286032631740023e-05, "loss": 0.6603, "step": 210 }, { "epoch": 6.717557251908397, "grad_norm": 0.354248046875, "learning_rate": 4.444297669803981e-05, "loss": 0.6439, "step": 220 }, { "epoch": 7.022900763358779, "grad_norm": 0.43017578125, "learning_rate": 3.6560671583635467e-05, "loss": 0.6696, "step": 230 }, { "epoch": 7.32824427480916, "grad_norm": 0.37109375, "learning_rate": 2.9289321881345254e-05, "loss": 0.6603, "step": 240 }, { "epoch": 7.633587786259542, "grad_norm": 0.4580078125, "learning_rate": 2.26989546637263e-05, "loss": 0.6418, "step": 250 }, { "epoch": 7.938931297709924, "grad_norm": 0.40185546875, "learning_rate": 1.6853038769745467e-05, "loss": 0.6404, "step": 260 }, { "epoch": 8.244274809160306, "grad_norm": 0.391357421875, "learning_rate": 1.1807873565164506e-05, "loss": 0.6574, "step": 270 }, { "epoch": 8.549618320610687, "grad_norm": 0.383544921875, "learning_rate": 7.612046748871327e-06, "loss": 0.6405, "step": 280 }, { "epoch": 8.854961832061068, "grad_norm": 0.358642578125, "learning_rate": 4.305966426779118e-06, "loss": 0.6382, "step": 290 }, { "epoch": 9.16030534351145, "grad_norm": 0.365234375, "learning_rate": 1.921471959676957e-06, "loss": 0.6499, "step": 300 }, { "epoch": 9.465648854961833, "grad_norm": 0.40673828125, "learning_rate": 4.815273327803182e-07, "loss": 0.6562, "step": 310 }, { "epoch": 9.770992366412214, "grad_norm": 0.364013671875, "learning_rate": 0.0, "loss": 0.6581, "step": 320 }, { "epoch": 9.770992366412214, "step": 320, "total_flos": 1.561516427968512e+16, "train_loss": 0.7572343468666076, "train_runtime": 291.1608, "train_samples_per_second": 4.499, "train_steps_per_second": 1.099 } ], "logging_steps": 10, "max_steps": 320, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "total_flos": 1.561516427968512e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }