{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.853932584269663, "eval_steps": 500, "global_step": 88, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0898876404494382, "grad_norm": NaN, "learning_rate": 0.00019772727272727273, "loss": 9.4365, "step": 2 }, { "epoch": 0.1797752808988764, "grad_norm": 10.061033248901367, "learning_rate": 0.0001931818181818182, "loss": 9.2158, "step": 4 }, { "epoch": 0.2696629213483146, "grad_norm": 15.205401420593262, "learning_rate": 0.00018863636363636364, "loss": 8.8062, "step": 6 }, { "epoch": 0.3595505617977528, "grad_norm": 20.087194442749023, "learning_rate": 0.00018409090909090909, "loss": 8.4059, "step": 8 }, { "epoch": 0.449438202247191, "grad_norm": 12.80905818939209, "learning_rate": 0.00017954545454545456, "loss": 7.8943, "step": 10 }, { "epoch": 0.5393258426966292, "grad_norm": 13.147987365722656, "learning_rate": 0.000175, "loss": 7.5915, "step": 12 }, { "epoch": 0.6292134831460674, "grad_norm": 14.075699806213379, "learning_rate": 0.00017045454545454547, "loss": 7.3797, "step": 14 }, { "epoch": 0.7191011235955056, "grad_norm": 6.000680923461914, "learning_rate": 0.00016590909090909094, "loss": 7.3359, "step": 16 }, { "epoch": 0.8089887640449438, "grad_norm": 7.718422889709473, "learning_rate": 0.00016136363636363635, "loss": 7.3118, "step": 18 }, { "epoch": 0.898876404494382, "grad_norm": 6.30356502532959, "learning_rate": 0.00015681818181818182, "loss": 7.2512, "step": 20 }, { "epoch": 0.9887640449438202, "grad_norm": 11.91089153289795, "learning_rate": 0.00015227272727272727, "loss": 7.1725, "step": 22 }, { "epoch": 1.0449438202247192, "grad_norm": 10.328221321105957, "learning_rate": 0.00014772727272727274, "loss": 4.4411, "step": 24 }, { "epoch": 1.1348314606741572, "grad_norm": 11.233499526977539, "learning_rate": 0.0001431818181818182, "loss": 7.1546, "step": 26 }, { "epoch": 1.2247191011235956, "grad_norm": 12.115806579589844, "learning_rate": 0.00013863636363636365, "loss": 7.146, "step": 28 }, { "epoch": 1.3146067415730336, "grad_norm": 8.608111381530762, "learning_rate": 0.0001340909090909091, "loss": 7.1384, "step": 30 }, { "epoch": 1.404494382022472, "grad_norm": 6.332569599151611, "learning_rate": 0.00012954545454545456, "loss": 7.0654, "step": 32 }, { "epoch": 1.49438202247191, "grad_norm": 8.418745994567871, "learning_rate": 0.000125, "loss": 7.025, "step": 34 }, { "epoch": 1.5842696629213484, "grad_norm": 3.712430000305176, "learning_rate": 0.00012045454545454546, "loss": 7.0801, "step": 36 }, { "epoch": 1.6741573033707864, "grad_norm": 10.121654510498047, "learning_rate": 0.00011590909090909093, "loss": 7.1843, "step": 38 }, { "epoch": 1.7640449438202248, "grad_norm": 4.086280345916748, "learning_rate": 0.00011136363636363636, "loss": 7.085, "step": 40 }, { "epoch": 1.8539325842696628, "grad_norm": 6.2060933113098145, "learning_rate": 0.00010681818181818181, "loss": 7.0862, "step": 42 }, { "epoch": 1.9438202247191012, "grad_norm": 3.9082467555999756, "learning_rate": 0.00010227272727272727, "loss": 7.034, "step": 44 }, { "epoch": 2.0, "grad_norm": 3.6770272254943848, "learning_rate": 9.772727272727274e-05, "loss": 4.2733, "step": 46 }, { "epoch": 2.0898876404494384, "grad_norm": 3.2819325923919678, "learning_rate": 9.318181818181818e-05, "loss": 7.0196, "step": 48 }, { "epoch": 2.1797752808988764, "grad_norm": 6.796480655670166, "learning_rate": 8.863636363636364e-05, "loss": 7.0643, "step": 50 }, { "epoch": 2.2696629213483144, 
"grad_norm": 5.090155124664307, "learning_rate": 8.40909090909091e-05, "loss": 6.9201, "step": 52 }, { "epoch": 2.359550561797753, "grad_norm": 9.731218338012695, "learning_rate": 7.954545454545455e-05, "loss": 7.0715, "step": 54 }, { "epoch": 2.449438202247191, "grad_norm": 5.852891445159912, "learning_rate": 7.500000000000001e-05, "loss": 7.1071, "step": 56 }, { "epoch": 2.539325842696629, "grad_norm": 9.918388366699219, "learning_rate": 7.045454545454546e-05, "loss": 7.0041, "step": 58 }, { "epoch": 2.629213483146067, "grad_norm": 5.5014190673828125, "learning_rate": 6.59090909090909e-05, "loss": 6.982, "step": 60 }, { "epoch": 2.7191011235955056, "grad_norm": 7.316319465637207, "learning_rate": 6.136363636363636e-05, "loss": 6.9352, "step": 62 }, { "epoch": 2.808988764044944, "grad_norm": 13.674769401550293, "learning_rate": 5.6818181818181825e-05, "loss": 6.9549, "step": 64 }, { "epoch": 2.898876404494382, "grad_norm": 6.491578102111816, "learning_rate": 5.2272727272727274e-05, "loss": 7.0187, "step": 66 }, { "epoch": 2.98876404494382, "grad_norm": 5.663166522979736, "learning_rate": 4.772727272727273e-05, "loss": 6.9697, "step": 68 }, { "epoch": 3.044943820224719, "grad_norm": 5.13517427444458, "learning_rate": 4.318181818181819e-05, "loss": 4.3462, "step": 70 }, { "epoch": 3.134831460674157, "grad_norm": 4.548388957977295, "learning_rate": 3.8636363636363636e-05, "loss": 6.8757, "step": 72 }, { "epoch": 3.2247191011235956, "grad_norm": 7.8548431396484375, "learning_rate": 3.409090909090909e-05, "loss": 6.9019, "step": 74 }, { "epoch": 3.3146067415730336, "grad_norm": 18.156105041503906, "learning_rate": 2.954545454545455e-05, "loss": 7.076, "step": 76 }, { "epoch": 3.404494382022472, "grad_norm": 6.122278690338135, "learning_rate": 2.5e-05, "loss": 6.9759, "step": 78 }, { "epoch": 3.49438202247191, "grad_norm": 8.052112579345703, "learning_rate": 2.0454545454545457e-05, "loss": 7.1131, "step": 80 }, { "epoch": 3.5842696629213484, "grad_norm": 18.36809730529785, "learning_rate": 1.8181818181818182e-05, "loss": 6.8889, "step": 82 }, { "epoch": 3.6741573033707864, "grad_norm": 3.2839643955230713, "learning_rate": 1.3636363636363637e-05, "loss": 6.9873, "step": 84 }, { "epoch": 3.764044943820225, "grad_norm": 7.564036846160889, "learning_rate": 9.090909090909091e-06, "loss": 6.9841, "step": 86 }, { "epoch": 3.853932584269663, "grad_norm": 3.23893404006958, "learning_rate": 4.5454545454545455e-06, "loss": 6.975, "step": 88 }, { "epoch": 3.853932584269663, "step": 88, "total_flos": 343326692124408.0, "train_loss": 7.0837731144645, "train_runtime": 380.6612, "train_samples_per_second": 3.73, "train_steps_per_second": 0.231 } ], "logging_steps": 2, "max_steps": 88, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 343326692124408.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }