{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 10.0,
  "eval_steps": 500,
  "global_step": 5620,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.18,
      "grad_norm": 48.51668930053711,
      "learning_rate": 1.98220640569395e-06,
      "loss": 10.5512,
      "step": 100
    },
    {
      "epoch": 0.36,
      "grad_norm": 33.89441680908203,
      "learning_rate": 1.9644128113879e-06,
      "loss": 6.656,
      "step": 200
    },
    {
      "epoch": 0.53,
      "grad_norm": 54.47207260131836,
      "learning_rate": 1.9466192170818503e-06,
      "loss": 4.8936,
      "step": 300
    },
    {
      "epoch": 0.71,
      "grad_norm": 37.983856201171875,
      "learning_rate": 1.9288256227758005e-06,
      "loss": 3.5277,
      "step": 400
    },
    {
      "epoch": 0.89,
      "grad_norm": 23.24921417236328,
      "learning_rate": 1.9110320284697506e-06,
      "loss": 2.8756,
      "step": 500
    },
    {
      "epoch": 1.07,
      "grad_norm": 26.32309913635254,
      "learning_rate": 1.8932384341637008e-06,
      "loss": 2.4437,
      "step": 600
    },
    {
      "epoch": 1.25,
      "grad_norm": 154.72117614746094,
      "learning_rate": 1.8754448398576511e-06,
      "loss": 2.2582,
      "step": 700
    },
    {
      "epoch": 1.42,
      "grad_norm": 16.39977264404297,
      "learning_rate": 1.8576512455516013e-06,
      "loss": 2.2289,
      "step": 800
    },
    {
      "epoch": 1.6,
      "grad_norm": 22.533836364746094,
      "learning_rate": 1.8398576512455514e-06,
      "loss": 2.091,
      "step": 900
    },
    {
      "epoch": 1.78,
      "grad_norm": 16.523881912231445,
      "learning_rate": 1.8220640569395016e-06,
      "loss": 2.0129,
      "step": 1000
    },
    {
      "epoch": 1.96,
      "grad_norm": 69.78620147705078,
      "learning_rate": 1.804270462633452e-06,
      "loss": 1.926,
      "step": 1100
    },
    {
      "epoch": 2.14,
      "grad_norm": 21.385257720947266,
      "learning_rate": 1.786476868327402e-06,
      "loss": 1.8193,
      "step": 1200
    },
    {
      "epoch": 2.31,
      "grad_norm": 23.061298370361328,
      "learning_rate": 1.7686832740213522e-06,
      "loss": 1.804,
      "step": 1300
    },
    {
      "epoch": 2.49,
      "grad_norm": 21.625669479370117,
      "learning_rate": 1.7508896797153024e-06,
      "loss": 1.7568,
      "step": 1400
    },
    {
      "epoch": 2.67,
      "grad_norm": 13.98591136932373,
      "learning_rate": 1.7330960854092527e-06,
      "loss": 1.7313,
      "step": 1500
    },
    {
      "epoch": 2.85,
      "grad_norm": 18.47169303894043,
      "learning_rate": 1.7153024911032029e-06,
      "loss": 1.733,
      "step": 1600
    },
    {
      "epoch": 3.02,
      "grad_norm": 20.671327590942383,
      "learning_rate": 1.697508896797153e-06,
      "loss": 1.698,
      "step": 1700
    },
    {
      "epoch": 3.2,
      "grad_norm": 20.78021812438965,
      "learning_rate": 1.6797153024911032e-06,
      "loss": 1.6192,
      "step": 1800
    },
    {
      "epoch": 3.38,
      "grad_norm": 35.38755416870117,
      "learning_rate": 1.6619217081850533e-06,
      "loss": 1.5752,
      "step": 1900
    },
    {
      "epoch": 3.56,
      "grad_norm": 24.105249404907227,
      "learning_rate": 1.6441281138790034e-06,
      "loss": 1.5921,
      "step": 2000
    },
    {
      "epoch": 3.74,
      "grad_norm": 11.530924797058105,
      "learning_rate": 1.6263345195729536e-06,
      "loss": 1.5497,
      "step": 2100
    },
    {
      "epoch": 3.91,
      "grad_norm": 17.551040649414062,
      "learning_rate": 1.6085409252669037e-06,
      "loss": 1.5751,
      "step": 2200
    },
    {
      "epoch": 4.09,
      "grad_norm": 22.44804573059082,
      "learning_rate": 1.590747330960854e-06,
      "loss": 1.6072,
      "step": 2300
    },
    {
      "epoch": 4.27,
      "grad_norm": 14.482297897338867,
      "learning_rate": 1.5729537366548042e-06,
      "loss": 1.552,
      "step": 2400
    },
    {
      "epoch": 4.45,
      "grad_norm": 17.7537899017334,
      "learning_rate": 1.5551601423487544e-06,
      "loss": 1.4403,
      "step": 2500
    },
    {
      "epoch": 4.63,
      "grad_norm": 23.001920700073242,
      "learning_rate": 1.5373665480427045e-06,
      "loss": 1.4955,
      "step": 2600
    },
    {
      "epoch": 4.8,
      "grad_norm": 14.721695899963379,
      "learning_rate": 1.5195729537366549e-06,
      "loss": 1.4456,
      "step": 2700
    },
    {
      "epoch": 4.98,
      "grad_norm": 15.371649742126465,
      "learning_rate": 1.501779359430605e-06,
      "loss": 1.4303,
      "step": 2800
    },
    {
      "epoch": 5.16,
      "grad_norm": 14.734794616699219,
      "learning_rate": 1.4839857651245552e-06,
      "loss": 1.4544,
      "step": 2900
    },
    {
      "epoch": 5.34,
      "grad_norm": 13.686590194702148,
      "learning_rate": 1.4661921708185053e-06,
      "loss": 1.401,
      "step": 3000
    },
    {
      "epoch": 5.52,
      "grad_norm": 18.93415641784668,
      "learning_rate": 1.4483985765124555e-06,
      "loss": 1.4612,
      "step": 3100
    },
    {
      "epoch": 5.69,
      "grad_norm": 9.70661735534668,
      "learning_rate": 1.4306049822064056e-06,
      "loss": 1.3558,
      "step": 3200
    },
    {
      "epoch": 5.87,
      "grad_norm": 16.12574577331543,
      "learning_rate": 1.4128113879003557e-06,
      "loss": 1.3686,
      "step": 3300
    },
    {
      "epoch": 6.05,
      "grad_norm": 29.739870071411133,
      "learning_rate": 1.3950177935943059e-06,
      "loss": 1.3703,
      "step": 3400
    },
    {
      "epoch": 6.23,
      "grad_norm": 22.152677536010742,
      "learning_rate": 1.377224199288256e-06,
      "loss": 1.2662,
      "step": 3500
    },
    {
      "epoch": 6.41,
      "grad_norm": 24.051326751708984,
      "learning_rate": 1.3594306049822064e-06,
      "loss": 1.35,
      "step": 3600
    },
    {
      "epoch": 6.58,
      "grad_norm": 11.552955627441406,
      "learning_rate": 1.3416370106761565e-06,
      "loss": 1.3592,
      "step": 3700
    },
    {
      "epoch": 6.76,
      "grad_norm": 16.08234977722168,
      "learning_rate": 1.3238434163701067e-06,
      "loss": 1.3566,
      "step": 3800
    },
    {
      "epoch": 6.94,
      "grad_norm": 14.58088493347168,
      "learning_rate": 1.3060498220640568e-06,
      "loss": 1.3257,
      "step": 3900
    },
    {
      "epoch": 7.12,
      "grad_norm": 12.278518676757812,
      "learning_rate": 1.2882562277580072e-06,
      "loss": 1.3254,
      "step": 4000
    },
    {
      "epoch": 7.3,
      "grad_norm": 17.330495834350586,
      "learning_rate": 1.2704626334519573e-06,
      "loss": 1.2095,
      "step": 4100
    },
    {
      "epoch": 7.47,
      "grad_norm": 13.842063903808594,
      "learning_rate": 1.2526690391459075e-06,
      "loss": 1.3475,
      "step": 4200
    },
    {
      "epoch": 7.65,
      "grad_norm": 13.967167854309082,
      "learning_rate": 1.2348754448398574e-06,
      "loss": 1.2757,
      "step": 4300
    },
    {
      "epoch": 7.83,
      "grad_norm": 18.25010871887207,
      "learning_rate": 1.2170818505338078e-06,
      "loss": 1.2795,
      "step": 4400
    },
    {
      "epoch": 8.01,
      "grad_norm": 11.46198558807373,
      "learning_rate": 1.199288256227758e-06,
      "loss": 1.2648,
      "step": 4500
    },
    {
      "epoch": 8.19,
      "grad_norm": 18.330867767333984,
      "learning_rate": 1.181494661921708e-06,
      "loss": 1.2345,
      "step": 4600
    },
    {
      "epoch": 8.36,
      "grad_norm": 27.236454010009766,
      "learning_rate": 1.1637010676156582e-06,
      "loss": 1.2998,
      "step": 4700
    },
    {
      "epoch": 8.54,
      "grad_norm": 15.08573055267334,
      "learning_rate": 1.1459074733096086e-06,
      "loss": 1.2578,
      "step": 4800
    },
    {
      "epoch": 8.72,
      "grad_norm": 15.639131546020508,
      "learning_rate": 1.1281138790035587e-06,
      "loss": 1.2062,
      "step": 4900
    },
    {
      "epoch": 8.9,
      "grad_norm": 22.758560180664062,
      "learning_rate": 1.1103202846975088e-06,
      "loss": 1.2305,
      "step": 5000
    },
    {
      "epoch": 9.07,
      "grad_norm": 11.382159233093262,
      "learning_rate": 1.092526690391459e-06,
      "loss": 1.2236,
      "step": 5100
    },
    {
      "epoch": 9.25,
      "grad_norm": 14.228442192077637,
      "learning_rate": 1.0747330960854093e-06,
      "loss": 1.1871,
      "step": 5200
    },
    {
      "epoch": 9.43,
      "grad_norm": 16.07115936279297,
      "learning_rate": 1.0569395017793595e-06,
      "loss": 1.2286,
      "step": 5300
    },
    {
      "epoch": 9.61,
      "grad_norm": 12.73045539855957,
      "learning_rate": 1.0391459074733096e-06,
      "loss": 1.2837,
      "step": 5400
    },
    {
      "epoch": 9.79,
      "grad_norm": 19.55732536315918,
      "learning_rate": 1.0213523131672596e-06,
      "loss": 1.1911,
      "step": 5500
    },
    {
      "epoch": 9.96,
      "grad_norm": 13.422231674194336,
      "learning_rate": 1.00355871886121e-06,
      "loss": 1.1532,
      "step": 5600
    }
  ],
  "logging_steps": 100,
  "max_steps": 11240,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 20,
  "save_steps": 500,
  "total_flos": 1.510200611453952e+19,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}