{ "best_metric": null, "best_model_checkpoint": null, "epoch": 10.0, "eval_steps": 500, "global_step": 5620, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.18, "grad_norm": 48.51668930053711, "learning_rate": 1.98220640569395e-06, "loss": 10.5512, "step": 100 }, { "epoch": 0.36, "grad_norm": 33.89441680908203, "learning_rate": 1.9644128113879e-06, "loss": 6.656, "step": 200 }, { "epoch": 0.53, "grad_norm": 54.47207260131836, "learning_rate": 1.9466192170818503e-06, "loss": 4.8936, "step": 300 }, { "epoch": 0.71, "grad_norm": 37.983856201171875, "learning_rate": 1.9288256227758005e-06, "loss": 3.5277, "step": 400 }, { "epoch": 0.89, "grad_norm": 23.24921417236328, "learning_rate": 1.9110320284697506e-06, "loss": 2.8756, "step": 500 }, { "epoch": 1.07, "grad_norm": 26.32309913635254, "learning_rate": 1.8932384341637008e-06, "loss": 2.4437, "step": 600 }, { "epoch": 1.25, "grad_norm": 154.72117614746094, "learning_rate": 1.8754448398576511e-06, "loss": 2.2582, "step": 700 }, { "epoch": 1.42, "grad_norm": 16.39977264404297, "learning_rate": 1.8576512455516013e-06, "loss": 2.2289, "step": 800 }, { "epoch": 1.6, "grad_norm": 22.533836364746094, "learning_rate": 1.8398576512455514e-06, "loss": 2.091, "step": 900 }, { "epoch": 1.78, "grad_norm": 16.523881912231445, "learning_rate": 1.8220640569395016e-06, "loss": 2.0129, "step": 1000 }, { "epoch": 1.96, "grad_norm": 69.78620147705078, "learning_rate": 1.804270462633452e-06, "loss": 1.926, "step": 1100 }, { "epoch": 2.14, "grad_norm": 21.385257720947266, "learning_rate": 1.786476868327402e-06, "loss": 1.8193, "step": 1200 }, { "epoch": 2.31, "grad_norm": 23.061298370361328, "learning_rate": 1.7686832740213522e-06, "loss": 1.804, "step": 1300 }, { "epoch": 2.49, "grad_norm": 21.625669479370117, "learning_rate": 1.7508896797153024e-06, "loss": 1.7568, "step": 1400 }, { "epoch": 2.67, "grad_norm": 13.98591136932373, "learning_rate": 1.7330960854092527e-06, "loss": 1.7313, "step": 1500 }, { "epoch": 2.85, "grad_norm": 18.47169303894043, "learning_rate": 1.7153024911032029e-06, "loss": 1.733, "step": 1600 }, { "epoch": 3.02, "grad_norm": 20.671327590942383, "learning_rate": 1.697508896797153e-06, "loss": 1.698, "step": 1700 }, { "epoch": 3.2, "grad_norm": 20.78021812438965, "learning_rate": 1.6797153024911032e-06, "loss": 1.6192, "step": 1800 }, { "epoch": 3.38, "grad_norm": 35.38755416870117, "learning_rate": 1.6619217081850533e-06, "loss": 1.5752, "step": 1900 }, { "epoch": 3.56, "grad_norm": 24.105249404907227, "learning_rate": 1.6441281138790034e-06, "loss": 1.5921, "step": 2000 }, { "epoch": 3.74, "grad_norm": 11.530924797058105, "learning_rate": 1.6263345195729536e-06, "loss": 1.5497, "step": 2100 }, { "epoch": 3.91, "grad_norm": 17.551040649414062, "learning_rate": 1.6085409252669037e-06, "loss": 1.5751, "step": 2200 }, { "epoch": 4.09, "grad_norm": 22.44804573059082, "learning_rate": 1.590747330960854e-06, "loss": 1.6072, "step": 2300 }, { "epoch": 4.27, "grad_norm": 14.482297897338867, "learning_rate": 1.5729537366548042e-06, "loss": 1.552, "step": 2400 }, { "epoch": 4.45, "grad_norm": 17.7537899017334, "learning_rate": 1.5551601423487544e-06, "loss": 1.4403, "step": 2500 }, { "epoch": 4.63, "grad_norm": 23.001920700073242, "learning_rate": 1.5373665480427045e-06, "loss": 1.4955, "step": 2600 }, { "epoch": 4.8, "grad_norm": 14.721695899963379, "learning_rate": 1.5195729537366549e-06, "loss": 1.4456, "step": 2700 }, { "epoch": 4.98, "grad_norm": 15.371649742126465, "learning_rate": 1.501779359430605e-06, "loss": 1.4303, "step": 2800 }, { "epoch": 5.16, "grad_norm": 14.734794616699219, "learning_rate": 1.4839857651245552e-06, "loss": 1.4544, "step": 2900 }, { "epoch": 5.34, "grad_norm": 13.686590194702148, "learning_rate": 1.4661921708185053e-06, "loss": 1.401, "step": 3000 }, { "epoch": 5.52, "grad_norm": 18.93415641784668, "learning_rate": 1.4483985765124555e-06, "loss": 1.4612, "step": 3100 }, { "epoch": 5.69, "grad_norm": 9.70661735534668, "learning_rate": 1.4306049822064056e-06, "loss": 1.3558, "step": 3200 }, { "epoch": 5.87, "grad_norm": 16.12574577331543, "learning_rate": 1.4128113879003557e-06, "loss": 1.3686, "step": 3300 }, { "epoch": 6.05, "grad_norm": 29.739870071411133, "learning_rate": 1.3950177935943059e-06, "loss": 1.3703, "step": 3400 }, { "epoch": 6.23, "grad_norm": 22.152677536010742, "learning_rate": 1.377224199288256e-06, "loss": 1.2662, "step": 3500 }, { "epoch": 6.41, "grad_norm": 24.051326751708984, "learning_rate": 1.3594306049822064e-06, "loss": 1.35, "step": 3600 }, { "epoch": 6.58, "grad_norm": 11.552955627441406, "learning_rate": 1.3416370106761565e-06, "loss": 1.3592, "step": 3700 }, { "epoch": 6.76, "grad_norm": 16.08234977722168, "learning_rate": 1.3238434163701067e-06, "loss": 1.3566, "step": 3800 }, { "epoch": 6.94, "grad_norm": 14.58088493347168, "learning_rate": 1.3060498220640568e-06, "loss": 1.3257, "step": 3900 }, { "epoch": 7.12, "grad_norm": 12.278518676757812, "learning_rate": 1.2882562277580072e-06, "loss": 1.3254, "step": 4000 }, { "epoch": 7.3, "grad_norm": 17.330495834350586, "learning_rate": 1.2704626334519573e-06, "loss": 1.2095, "step": 4100 }, { "epoch": 7.47, "grad_norm": 13.842063903808594, "learning_rate": 1.2526690391459075e-06, "loss": 1.3475, "step": 4200 }, { "epoch": 7.65, "grad_norm": 13.967167854309082, "learning_rate": 1.2348754448398574e-06, "loss": 1.2757, "step": 4300 }, { "epoch": 7.83, "grad_norm": 18.25010871887207, "learning_rate": 1.2170818505338078e-06, "loss": 1.2795, "step": 4400 }, { "epoch": 8.01, "grad_norm": 11.46198558807373, "learning_rate": 1.199288256227758e-06, "loss": 1.2648, "step": 4500 }, { "epoch": 8.19, "grad_norm": 18.330867767333984, "learning_rate": 1.181494661921708e-06, "loss": 1.2345, "step": 4600 }, { "epoch": 8.36, "grad_norm": 27.236454010009766, "learning_rate": 1.1637010676156582e-06, "loss": 1.2998, "step": 4700 }, { "epoch": 8.54, "grad_norm": 15.08573055267334, "learning_rate": 1.1459074733096086e-06, "loss": 1.2578, "step": 4800 }, { "epoch": 8.72, "grad_norm": 15.639131546020508, "learning_rate": 1.1281138790035587e-06, "loss": 1.2062, "step": 4900 }, { "epoch": 8.9, "grad_norm": 22.758560180664062, "learning_rate": 1.1103202846975088e-06, "loss": 1.2305, "step": 5000 }, { "epoch": 9.07, "grad_norm": 11.382159233093262, "learning_rate": 1.092526690391459e-06, "loss": 1.2236, "step": 5100 }, { "epoch": 9.25, "grad_norm": 14.228442192077637, "learning_rate": 1.0747330960854093e-06, "loss": 1.1871, "step": 5200 }, { "epoch": 9.43, "grad_norm": 16.07115936279297, "learning_rate": 1.0569395017793595e-06, "loss": 1.2286, "step": 5300 }, { "epoch": 9.61, "grad_norm": 12.73045539855957, "learning_rate": 1.0391459074733096e-06, "loss": 1.2837, "step": 5400 }, { "epoch": 9.79, "grad_norm": 19.55732536315918, "learning_rate": 1.0213523131672596e-06, "loss": 1.1911, "step": 5500 }, { "epoch": 9.96, "grad_norm": 13.422231674194336, "learning_rate": 1.00355871886121e-06, "loss": 1.1532, "step": 5600 } ], "logging_steps": 100, "max_steps": 11240, "num_input_tokens_seen": 0, "num_train_epochs": 20, "save_steps": 500, "total_flos": 1.510200611453952e+19, "train_batch_size": 2, "trial_name": null, "trial_params": null }