{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.05859947260474656, "eval_steps": 3, "global_step": 25, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0023439789041898623, "grad_norm": 30.85260009765625, "learning_rate": 5e-05, "loss": 6.6922, "step": 1 }, { "epoch": 0.0023439789041898623, "eval_loss": 7.640185832977295, "eval_runtime": 375.7713, "eval_samples_per_second": 1.913, "eval_steps_per_second": 0.479, "step": 1 }, { "epoch": 0.004687957808379725, "grad_norm": 32.24323272705078, "learning_rate": 0.0001, "loss": 7.3599, "step": 2 }, { "epoch": 0.007031936712569587, "grad_norm": 35.10329818725586, "learning_rate": 9.953429730181653e-05, "loss": 4.3299, "step": 3 }, { "epoch": 0.007031936712569587, "eval_loss": 1.007381796836853, "eval_runtime": 377.1204, "eval_samples_per_second": 1.907, "eval_steps_per_second": 0.477, "step": 3 }, { "epoch": 0.00937591561675945, "grad_norm": 12.174251556396484, "learning_rate": 9.814586436738998e-05, "loss": 1.0876, "step": 4 }, { "epoch": 0.011719894520949311, "grad_norm": 11.167458534240723, "learning_rate": 9.586056507527266e-05, "loss": 0.618, "step": 5 }, { "epoch": 0.014063873425139173, "grad_norm": 16.5855770111084, "learning_rate": 9.272097022732443e-05, "loss": 0.4434, "step": 6 }, { "epoch": 0.014063873425139173, "eval_loss": 0.5613827109336853, "eval_runtime": 377.1269, "eval_samples_per_second": 1.907, "eval_steps_per_second": 0.477, "step": 6 }, { "epoch": 0.016407852329329037, "grad_norm": 2.6942837238311768, "learning_rate": 8.8785564535221e-05, "loss": 0.3475, "step": 7 }, { "epoch": 0.0187518312335189, "grad_norm": 4.204878807067871, "learning_rate": 8.412765716093272e-05, "loss": 0.4269, "step": 8 }, { "epoch": 0.02109581013770876, "grad_norm": 8.653458595275879, "learning_rate": 7.883401610574336e-05, "loss": 0.8488, "step": 9 }, { "epoch": 0.02109581013770876, "eval_loss": 0.46133139729499817, "eval_runtime": 376.6374, "eval_samples_per_second": 1.909, "eval_steps_per_second": 0.478, "step": 9 }, { "epoch": 0.023439789041898623, "grad_norm": 6.0319647789001465, "learning_rate": 7.300325188655761e-05, "loss": 0.5979, "step": 10 }, { "epoch": 0.025783767946088484, "grad_norm": 7.0973124504089355, "learning_rate": 6.674398060854931e-05, "loss": 0.3395, "step": 11 }, { "epoch": 0.028127746850278346, "grad_norm": 3.4368035793304443, "learning_rate": 6.01728006526317e-05, "loss": 0.4039, "step": 12 }, { "epoch": 0.028127746850278346, "eval_loss": 0.3852250874042511, "eval_runtime": 376.4542, "eval_samples_per_second": 1.91, "eval_steps_per_second": 0.478, "step": 12 }, { "epoch": 0.030471725754468208, "grad_norm": 4.774641990661621, "learning_rate": 5.341212066823355e-05, "loss": 0.4755, "step": 13 }, { "epoch": 0.032815704658658074, "grad_norm": 3.387488842010498, "learning_rate": 4.658787933176646e-05, "loss": 0.3168, "step": 14 }, { "epoch": 0.03515968356284793, "grad_norm": 4.079291343688965, "learning_rate": 3.982719934736832e-05, "loss": 0.3634, "step": 15 }, { "epoch": 0.03515968356284793, "eval_loss": 0.3480370044708252, "eval_runtime": 376.8136, "eval_samples_per_second": 1.908, "eval_steps_per_second": 0.478, "step": 15 }, { "epoch": 0.0375036624670378, "grad_norm": 2.9585719108581543, "learning_rate": 3.325601939145069e-05, "loss": 0.2674, "step": 16 }, { "epoch": 0.039847641371227656, "grad_norm": 3.4793038368225098, "learning_rate": 2.6996748113442394e-05, "loss": 0.2099, "step": 17 }, { "epoch": 0.04219162027541752, "grad_norm": 3.6350109577178955, "learning_rate": 2.1165983894256647e-05, "loss": 0.2057, "step": 18 }, { "epoch": 0.04219162027541752, "eval_loss": 0.30665451288223267, "eval_runtime": 376.8336, "eval_samples_per_second": 1.908, "eval_steps_per_second": 0.478, "step": 18 }, { "epoch": 0.04453559917960739, "grad_norm": 2.6633102893829346, "learning_rate": 1.5872342839067306e-05, "loss": 0.2433, "step": 19 }, { "epoch": 0.046879578083797245, "grad_norm": 3.862442970275879, "learning_rate": 1.1214435464779006e-05, "loss": 0.2717, "step": 20 }, { "epoch": 0.04922355698798711, "grad_norm": 4.074159145355225, "learning_rate": 7.2790297726755716e-06, "loss": 0.2596, "step": 21 }, { "epoch": 0.04922355698798711, "eval_loss": 0.32189393043518066, "eval_runtime": 376.6717, "eval_samples_per_second": 1.909, "eval_steps_per_second": 0.478, "step": 21 }, { "epoch": 0.05156753589217697, "grad_norm": 4.781056880950928, "learning_rate": 4.139434924727359e-06, "loss": 0.4503, "step": 22 }, { "epoch": 0.053911514796366834, "grad_norm": 6.350940227508545, "learning_rate": 1.8541356326100433e-06, "loss": 0.3268, "step": 23 }, { "epoch": 0.05625549370055669, "grad_norm": 3.5974953174591064, "learning_rate": 4.6570269818346224e-07, "loss": 0.362, "step": 24 }, { "epoch": 0.05625549370055669, "eval_loss": 0.31786561012268066, "eval_runtime": 376.8204, "eval_samples_per_second": 1.908, "eval_steps_per_second": 0.478, "step": 24 }, { "epoch": 0.05859947260474656, "grad_norm": 5.433014392852783, "learning_rate": 0.0, "loss": 0.2848, "step": 25 } ], "logging_steps": 1, "max_steps": 25, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.508513578745856e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }