{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 21249, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.023530519083250975, "grad_norm": 1.6875, "learning_rate": 4.882347404583745e-05, "loss": 0.9427, "step": 500 }, { "epoch": 0.04706103816650195, "grad_norm": 1.75, "learning_rate": 4.76469480916749e-05, "loss": 0.8108, "step": 1000 }, { "epoch": 0.07059155724975293, "grad_norm": 1.8671875, "learning_rate": 4.647042213751235e-05, "loss": 0.7777, "step": 1500 }, { "epoch": 0.0941220763330039, "grad_norm": 1.6171875, "learning_rate": 4.52938961833498e-05, "loss": 0.7551, "step": 2000 }, { "epoch": 0.11765259541625488, "grad_norm": 1.4765625, "learning_rate": 4.411737022918726e-05, "loss": 0.7373, "step": 2500 }, { "epoch": 0.14118311449950585, "grad_norm": 1.6328125, "learning_rate": 4.294084427502471e-05, "loss": 0.727, "step": 3000 }, { "epoch": 0.16471363358275684, "grad_norm": 1.84375, "learning_rate": 4.1764318320862164e-05, "loss": 0.718, "step": 3500 }, { "epoch": 0.1882441526660078, "grad_norm": 1.7265625, "learning_rate": 4.0587792366699614e-05, "loss": 0.7153, "step": 4000 }, { "epoch": 0.2117746717492588, "grad_norm": 1.5625, "learning_rate": 3.9411266412537063e-05, "loss": 0.7067, "step": 4500 }, { "epoch": 0.23530519083250975, "grad_norm": 1.6796875, "learning_rate": 3.823474045837451e-05, "loss": 0.7037, "step": 5000 }, { "epoch": 0.2588357099157607, "grad_norm": 1.8203125, "learning_rate": 3.705821450421196e-05, "loss": 0.6984, "step": 5500 }, { "epoch": 0.2823662289990117, "grad_norm": 1.609375, "learning_rate": 3.588168855004941e-05, "loss": 0.6954, "step": 6000 }, { "epoch": 0.3058967480822627, "grad_norm": 1.5859375, "learning_rate": 3.470516259588686e-05, "loss": 0.6948, "step": 6500 }, { "epoch": 0.3294272671655137, "grad_norm": 1.515625, "learning_rate": 3.352863664172432e-05, "loss": 0.6908, "step": 7000 }, { "epoch": 0.35295778624876467, "grad_norm": 1.6796875, "learning_rate": 3.235211068756177e-05, "loss": 0.6873, "step": 7500 }, { "epoch": 0.3764883053320156, "grad_norm": 1.7109375, "learning_rate": 3.117558473339922e-05, "loss": 0.6876, "step": 8000 }, { "epoch": 0.4000188244152666, "grad_norm": 1.5703125, "learning_rate": 2.999905877923667e-05, "loss": 0.6879, "step": 8500 }, { "epoch": 0.4235493434985176, "grad_norm": 1.5859375, "learning_rate": 2.882253282507412e-05, "loss": 0.6862, "step": 9000 }, { "epoch": 0.4470798625817686, "grad_norm": 1.8359375, "learning_rate": 2.7646006870911574e-05, "loss": 0.6829, "step": 9500 }, { "epoch": 0.4706103816650195, "grad_norm": 1.6640625, "learning_rate": 2.6469480916749024e-05, "loss": 0.6852, "step": 10000 }, { "epoch": 0.4941409007482705, "grad_norm": 1.5, "learning_rate": 2.5292954962586474e-05, "loss": 0.681, "step": 10500 }, { "epoch": 0.5176714198315214, "grad_norm": 1.59375, "learning_rate": 2.4116429008423927e-05, "loss": 0.6798, "step": 11000 }, { "epoch": 0.5412019389147724, "grad_norm": 1.6640625, "learning_rate": 2.2939903054261376e-05, "loss": 0.6803, "step": 11500 }, { "epoch": 0.5647324579980234, "grad_norm": 1.6875, "learning_rate": 2.176337710009883e-05, "loss": 0.6799, "step": 12000 }, { "epoch": 0.5882629770812744, "grad_norm": 1.875, "learning_rate": 2.0586851145936283e-05, "loss": 0.6761, "step": 12500 }, { "epoch": 0.6117934961645254, "grad_norm": 1.578125, "learning_rate": 1.9410325191773732e-05, "loss": 0.675, "step": 13000 }, { "epoch": 0.6353240152477764, "grad_norm": 1.7109375, "learning_rate": 1.8233799237611182e-05, "loss": 0.6785, "step": 13500 }, { "epoch": 0.6588545343310274, "grad_norm": 1.8515625, "learning_rate": 1.7057273283448632e-05, "loss": 0.6782, "step": 14000 }, { "epoch": 0.6823850534142784, "grad_norm": 1.5390625, "learning_rate": 1.5880747329286085e-05, "loss": 0.6761, "step": 14500 }, { "epoch": 0.7059155724975293, "grad_norm": 1.6015625, "learning_rate": 1.4704221375123536e-05, "loss": 0.6795, "step": 15000 }, { "epoch": 0.7294460915807802, "grad_norm": 1.8125, "learning_rate": 1.3527695420960988e-05, "loss": 0.6774, "step": 15500 }, { "epoch": 0.7529766106640312, "grad_norm": 1.5546875, "learning_rate": 1.2351169466798439e-05, "loss": 0.6777, "step": 16000 }, { "epoch": 0.7765071297472822, "grad_norm": 1.5859375, "learning_rate": 1.1174643512635889e-05, "loss": 0.6746, "step": 16500 }, { "epoch": 0.8000376488305332, "grad_norm": 1.5234375, "learning_rate": 9.998117558473342e-06, "loss": 0.6782, "step": 17000 }, { "epoch": 0.8235681679137842, "grad_norm": 1.5546875, "learning_rate": 8.821591604310792e-06, "loss": 0.6776, "step": 17500 }, { "epoch": 0.8470986869970352, "grad_norm": 1.546875, "learning_rate": 7.645065650148241e-06, "loss": 0.6782, "step": 18000 }, { "epoch": 0.8706292060802862, "grad_norm": 1.609375, "learning_rate": 6.468539695985694e-06, "loss": 0.6754, "step": 18500 }, { "epoch": 0.8941597251635371, "grad_norm": 1.515625, "learning_rate": 5.292013741823145e-06, "loss": 0.6757, "step": 19000 }, { "epoch": 0.9176902442467881, "grad_norm": 1.6015625, "learning_rate": 4.1154877876605964e-06, "loss": 0.6765, "step": 19500 }, { "epoch": 0.941220763330039, "grad_norm": 1.5625, "learning_rate": 2.938961833498047e-06, "loss": 0.6747, "step": 20000 }, { "epoch": 0.96475128241329, "grad_norm": 1.6171875, "learning_rate": 1.7624358793354984e-06, "loss": 0.677, "step": 20500 }, { "epoch": 0.988281801496541, "grad_norm": 1.65625, "learning_rate": 5.859099251729494e-07, "loss": 0.6804, "step": 21000 } ], "logging_steps": 500, "max_steps": 21249, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 9.787078596423629e+17, "train_batch_size": 48, "trial_name": null, "trial_params": null }