{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 140, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.007142857142857143, "grad_norm": 286.327392578125, "learning_rate": 0.003333333333333333, "loss": 27.4831, "step": 1 }, { "epoch": 0.03571428571428571, "grad_norm": 564.7811889648438, "learning_rate": 0.016666666666666666, "loss": 180.9505, "step": 5 }, { "epoch": 0.07142857142857142, "grad_norm": 45641.8671875, "learning_rate": 0.029770992366412213, "loss": 193.5063, "step": 10 }, { "epoch": 0.10714285714285714, "grad_norm": 1771723.5, "learning_rate": 0.02862595419847328, "loss": 376.766, "step": 15 }, { "epoch": 0.14285714285714285, "grad_norm": 286214.5, "learning_rate": 0.02748091603053435, "loss": 569.9451, "step": 20 }, { "epoch": 0.17857142857142858, "grad_norm": 1100.7747802734375, "learning_rate": 0.02633587786259542, "loss": 615.8839, "step": 25 }, { "epoch": 0.21428571428571427, "grad_norm": 169022.0, "learning_rate": 0.025190839694656485, "loss": 481.0353, "step": 30 }, { "epoch": 0.25, "grad_norm": 333.40972900390625, "learning_rate": 0.024045801526717557, "loss": 305.1498, "step": 35 }, { "epoch": 0.2857142857142857, "grad_norm": 139948.15625, "learning_rate": 0.022900763358778626, "loss": 241.1332, "step": 40 }, { "epoch": 0.32142857142857145, "grad_norm": 1393742.875, "learning_rate": 0.021755725190839695, "loss": 207.3882, "step": 45 }, { "epoch": 0.35714285714285715, "grad_norm": 12597.32421875, "learning_rate": 0.020610687022900764, "loss": 201.0974, "step": 50 }, { "epoch": 0.39285714285714285, "grad_norm": 1081976.875, "learning_rate": 0.01946564885496183, "loss": 222.2186, "step": 55 }, { "epoch": 0.42857142857142855, "grad_norm": 18104.56640625, "learning_rate": 0.0183206106870229, "loss": 254.4615, "step": 60 }, { "epoch": 0.4642857142857143, "grad_norm": 53967.2265625, "learning_rate": 0.017175572519083967, "loss": 256.7784, "step": 65 }, { "epoch": 0.5, "grad_norm": 36778.24609375, "learning_rate": 0.01603053435114504, "loss": 189.0749, "step": 70 }, { "epoch": 0.5357142857142857, "grad_norm": 1759.3746337890625, "learning_rate": 0.014885496183206106, "loss": 198.0302, "step": 75 }, { "epoch": 0.5714285714285714, "grad_norm": 2888.434326171875, "learning_rate": 0.013740458015267175, "loss": 208.9426, "step": 80 }, { "epoch": 0.6071428571428571, "grad_norm": 3183.7255859375, "learning_rate": 0.012595419847328242, "loss": 237.6604, "step": 85 }, { "epoch": 0.6428571428571429, "grad_norm": 295.5591735839844, "learning_rate": 0.011450381679389313, "loss": 259.345, "step": 90 }, { "epoch": 0.6785714285714286, "grad_norm": 2817.36083984375, "learning_rate": 0.010305343511450382, "loss": 229.8393, "step": 95 }, { "epoch": 0.7142857142857143, "grad_norm": 1439.1002197265625, "learning_rate": 0.00916030534351145, "loss": 206.7085, "step": 100 }, { "epoch": 0.75, "grad_norm": 2063.854248046875, "learning_rate": 0.00801526717557252, "loss": 200.2123, "step": 105 }, { "epoch": 0.7857142857142857, "grad_norm": 3193.01953125, "learning_rate": 0.006870229007633588, "loss": 195.9004, "step": 110 }, { "epoch": 0.8214285714285714, "grad_norm": 1513.8978271484375, "learning_rate": 0.0057251908396946565, "loss": 188.8404, "step": 115 }, { "epoch": 0.8571428571428571, "grad_norm": 5677.81494140625, "learning_rate": 0.004580152671755725, "loss": 176.7594, "step": 120 }, { "epoch": 0.8928571428571429, "grad_norm": 726.0751953125, "learning_rate": 0.003435114503816794, "loss": 167.6159, "step": 125 }, { "epoch": 0.9285714285714286, "grad_norm": 1151.048095703125, "learning_rate": 0.0022900763358778627, "loss": 155.8075, "step": 130 }, { "epoch": 0.9642857142857143, "grad_norm": 3488.1396484375, "learning_rate": 0.0011450381679389313, "loss": 155.3781, "step": 135 }, { "epoch": 1.0, "grad_norm": 718.7391357421875, "learning_rate": 0.0, "loss": 149.2635, "step": 140 }, { "epoch": 1.0, "eval_loss": 149.4047088623047, "eval_runtime": 2.8984, "eval_samples_per_second": 61.413, "eval_steps_per_second": 1.035, "step": 140 }, { "epoch": 1.0, "step": 140, "total_flos": 4.2812236905630925e+17, "train_loss": 249.82138957977295, "train_runtime": 1603.7748, "train_samples_per_second": 11.164, "train_steps_per_second": 0.087 } ], "logging_steps": 5, "max_steps": 140, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.2812236905630925e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }