{ "best_metric": null, "best_model_checkpoint": null, "epoch": 7.0, "eval_steps": 500, "global_step": 11655, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.3003003003003003, "grad_norm": 4.6720170974731445, "learning_rate": 1.9399399399399402e-05, "loss": 0.4843, "step": 500 }, { "epoch": 0.6006006006006006, "grad_norm": 17.49016571044922, "learning_rate": 1.87987987987988e-05, "loss": 0.3955, "step": 1000 }, { "epoch": 0.9009009009009009, "grad_norm": 50.96406936645508, "learning_rate": 1.81981981981982e-05, "loss": 0.3807, "step": 1500 }, { "epoch": 1.0, "eval_loss": 0.2911546230316162, "eval_runtime": 1.0985, "eval_samples_per_second": 1346.393, "eval_steps_per_second": 168.413, "step": 1665 }, { "epoch": 1.2012012012012012, "grad_norm": 41.78978729248047, "learning_rate": 1.7597597597597598e-05, "loss": 0.3176, "step": 2000 }, { "epoch": 1.5015015015015014, "grad_norm": 15.492409706115723, "learning_rate": 1.6996996996997e-05, "loss": 0.2729, "step": 2500 }, { "epoch": 1.8018018018018018, "grad_norm": 15.4843111038208, "learning_rate": 1.6396396396396396e-05, "loss": 0.2802, "step": 3000 }, { "epoch": 2.0, "eval_loss": 0.4967462718486786, "eval_runtime": 1.1211, "eval_samples_per_second": 1319.266, "eval_steps_per_second": 165.02, "step": 3330 }, { "epoch": 2.1021021021021022, "grad_norm": 0.058736126869916916, "learning_rate": 1.5795795795795797e-05, "loss": 0.2095, "step": 3500 }, { "epoch": 2.4024024024024024, "grad_norm": 41.070953369140625, "learning_rate": 1.5195195195195196e-05, "loss": 0.1309, "step": 4000 }, { "epoch": 2.7027027027027026, "grad_norm": 26.841840744018555, "learning_rate": 1.4594594594594596e-05, "loss": 0.169, "step": 4500 }, { "epoch": 3.0, "eval_loss": 0.5553244948387146, "eval_runtime": 1.1109, "eval_samples_per_second": 1331.404, "eval_steps_per_second": 166.538, "step": 4995 }, { "epoch": 3.003003003003003, "grad_norm": 0.2728441655635834, "learning_rate": 1.3993993993993995e-05, "loss": 0.1481, "step": 5000 }, { "epoch": 3.3033033033033035, "grad_norm": 0.013637651689350605, "learning_rate": 1.3393393393393394e-05, "loss": 0.0638, "step": 5500 }, { "epoch": 3.6036036036036037, "grad_norm": 0.5741263031959534, "learning_rate": 1.2792792792792795e-05, "loss": 0.0753, "step": 6000 }, { "epoch": 3.903903903903904, "grad_norm": 0.03777342289686203, "learning_rate": 1.2192192192192194e-05, "loss": 0.0823, "step": 6500 }, { "epoch": 4.0, "eval_loss": 0.7062426805496216, "eval_runtime": 1.1008, "eval_samples_per_second": 1343.529, "eval_steps_per_second": 168.055, "step": 6660 }, { "epoch": 4.2042042042042045, "grad_norm": 0.0037255873903632164, "learning_rate": 1.1591591591591593e-05, "loss": 0.0377, "step": 7000 }, { "epoch": 4.504504504504505, "grad_norm": 0.004647154361009598, "learning_rate": 1.0990990990990992e-05, "loss": 0.037, "step": 7500 }, { "epoch": 4.804804804804805, "grad_norm": 0.00761270709335804, "learning_rate": 1.039039039039039e-05, "loss": 0.0534, "step": 8000 }, { "epoch": 5.0, "eval_loss": 0.7775081396102905, "eval_runtime": 1.0937, "eval_samples_per_second": 1352.232, "eval_steps_per_second": 169.143, "step": 8325 }, { "epoch": 5.105105105105105, "grad_norm": 0.004814255982637405, "learning_rate": 9.78978978978979e-06, "loss": 0.0287, "step": 8500 }, { "epoch": 5.405405405405405, "grad_norm": 0.0026015336625277996, "learning_rate": 9.189189189189191e-06, "loss": 0.025, "step": 9000 }, { "epoch": 5.7057057057057055, "grad_norm": 0.012158718891441822, "learning_rate": 8.588588588588589e-06, "loss": 0.0246, "step": 9500 }, { "epoch": 6.0, "eval_loss": 0.8445501923561096, "eval_runtime": 1.0956, "eval_samples_per_second": 1349.885, "eval_steps_per_second": 168.85, "step": 9990 }, { "epoch": 6.006006006006006, "grad_norm": 0.0015228951815515757, "learning_rate": 7.987987987987988e-06, "loss": 0.0288, "step": 10000 }, { "epoch": 6.306306306306306, "grad_norm": 0.0007786727510392666, "learning_rate": 7.387387387387388e-06, "loss": 0.0152, "step": 10500 }, { "epoch": 6.606606606606607, "grad_norm": 0.0005245794309303164, "learning_rate": 6.786786786786788e-06, "loss": 0.0191, "step": 11000 }, { "epoch": 6.906906906906907, "grad_norm": 0.0009558356832712889, "learning_rate": 6.186186186186187e-06, "loss": 0.0111, "step": 11500 }, { "epoch": 7.0, "eval_loss": 0.9164330363273621, "eval_runtime": 1.0989, "eval_samples_per_second": 1345.937, "eval_steps_per_second": 168.356, "step": 11655 } ], "logging_steps": 500, "max_steps": 16650, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3066329128047360.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }