|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 9.949238578680204, |
|
"eval_steps": 500, |
|
"global_step": 490, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.20304568527918782, |
|
"grad_norm": 0.2015380859375, |
|
"learning_rate": 0.00019979453927503364, |
|
"loss": 0.7307, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.40609137055837563, |
|
"grad_norm": 0.255615234375, |
|
"learning_rate": 0.0001991790013823246, |
|
"loss": 0.7211, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.6091370558375635, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 0.00019815591569910654, |
|
"loss": 0.7372, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.8121827411167513, |
|
"grad_norm": 0.359130859375, |
|
"learning_rate": 0.00019672948630390294, |
|
"loss": 0.7826, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 1.015228426395939, |
|
"grad_norm": 0.385986328125, |
|
"learning_rate": 0.00019490557470106686, |
|
"loss": 0.8238, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 1.218274111675127, |
|
"grad_norm": 0.359130859375, |
|
"learning_rate": 0.0001926916757346022, |
|
"loss": 0.7958, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 1.4213197969543148, |
|
"grad_norm": 0.34912109375, |
|
"learning_rate": 0.0001900968867902419, |
|
"loss": 0.7704, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 1.6243654822335025, |
|
"grad_norm": 0.378662109375, |
|
"learning_rate": 0.00018713187041233896, |
|
"loss": 0.7576, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 1.8274111675126905, |
|
"grad_norm": 0.38818359375, |
|
"learning_rate": 0.00018380881048918405, |
|
"loss": 0.7895, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 2.030456852791878, |
|
"grad_norm": 0.3798828125, |
|
"learning_rate": 0.00018014136218679567, |
|
"loss": 0.7953, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 2.233502538071066, |
|
"grad_norm": 0.40869140625, |
|
"learning_rate": 0.00017614459583691346, |
|
"loss": 0.7621, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 2.436548223350254, |
|
"grad_norm": 0.409912109375, |
|
"learning_rate": 0.00017183493500977278, |
|
"loss": 0.7806, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 2.6395939086294415, |
|
"grad_norm": 0.41162109375, |
|
"learning_rate": 0.0001672300890261317, |
|
"loss": 0.7414, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 2.8426395939086295, |
|
"grad_norm": 0.4326171875, |
|
"learning_rate": 0.00016234898018587337, |
|
"loss": 0.7326, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 3.045685279187817, |
|
"grad_norm": 0.468994140625, |
|
"learning_rate": 0.00015721166601221698, |
|
"loss": 0.7233, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 3.248730964467005, |
|
"grad_norm": 0.443359375, |
|
"learning_rate": 0.00015183925683105254, |
|
"loss": 0.7304, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 3.451776649746193, |
|
"grad_norm": 0.47705078125, |
|
"learning_rate": 0.00014625382902408356, |
|
"loss": 0.7311, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 3.6548223350253806, |
|
"grad_norm": 0.44287109375, |
|
"learning_rate": 0.00014047833431223938, |
|
"loss": 0.7204, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 3.8578680203045685, |
|
"grad_norm": 0.490478515625, |
|
"learning_rate": 0.00013453650544213076, |
|
"loss": 0.6965, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 4.060913705583756, |
|
"grad_norm": 0.42578125, |
|
"learning_rate": 0.00012845275866310324, |
|
"loss": 0.7102, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 4.2639593908629445, |
|
"grad_norm": 0.46240234375, |
|
"learning_rate": 0.00012225209339563145, |
|
"loss": 0.7164, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 4.467005076142132, |
|
"grad_norm": 0.449951171875, |
|
"learning_rate": 0.00011595998950333793, |
|
"loss": 0.7068, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 4.67005076142132, |
|
"grad_norm": 0.54296875, |
|
"learning_rate": 0.00010960230259076818, |
|
"loss": 0.7092, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 4.873096446700508, |
|
"grad_norm": 0.5244140625, |
|
"learning_rate": 0.00010320515775716555, |
|
"loss": 0.6712, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 5.0761421319796955, |
|
"grad_norm": 0.47705078125, |
|
"learning_rate": 9.679484224283449e-05, |
|
"loss": 0.6593, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 5.279187817258883, |
|
"grad_norm": 0.50830078125, |
|
"learning_rate": 9.039769740923183e-05, |
|
"loss": 0.6991, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 5.482233502538071, |
|
"grad_norm": 0.51318359375, |
|
"learning_rate": 8.404001049666211e-05, |
|
"loss": 0.6612, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 5.685279187817259, |
|
"grad_norm": 0.50341796875, |
|
"learning_rate": 7.774790660436858e-05, |
|
"loss": 0.6829, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 5.888324873096447, |
|
"grad_norm": 0.513671875, |
|
"learning_rate": 7.154724133689677e-05, |
|
"loss": 0.6576, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 6.091370558375634, |
|
"grad_norm": 0.471435546875, |
|
"learning_rate": 6.546349455786926e-05, |
|
"loss": 0.6669, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 6.2944162436548226, |
|
"grad_norm": 0.55224609375, |
|
"learning_rate": 5.952166568776062e-05, |
|
"loss": 0.6569, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 6.49746192893401, |
|
"grad_norm": 0.5986328125, |
|
"learning_rate": 5.37461709759165e-05, |
|
"loss": 0.6517, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 6.700507614213198, |
|
"grad_norm": 0.5849609375, |
|
"learning_rate": 4.8160743168947496e-05, |
|
"loss": 0.6627, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 6.903553299492386, |
|
"grad_norm": 0.54345703125, |
|
"learning_rate": 4.278833398778306e-05, |
|
"loss": 0.6526, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 7.106598984771574, |
|
"grad_norm": 0.53125, |
|
"learning_rate": 3.7651019814126654e-05, |
|
"loss": 0.652, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 7.309644670050761, |
|
"grad_norm": 0.51611328125, |
|
"learning_rate": 3.276991097386831e-05, |
|
"loss": 0.6237, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 7.5126903553299496, |
|
"grad_norm": 0.5654296875, |
|
"learning_rate": 2.8165064990227252e-05, |
|
"loss": 0.6313, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 7.715736040609137, |
|
"grad_norm": 0.5810546875, |
|
"learning_rate": 2.3855404163086558e-05, |
|
"loss": 0.6452, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 7.918781725888325, |
|
"grad_norm": 0.59423828125, |
|
"learning_rate": 1.985863781320435e-05, |
|
"loss": 0.6412, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 8.121827411167512, |
|
"grad_norm": 0.55712890625, |
|
"learning_rate": 1.619118951081594e-05, |
|
"loss": 0.6532, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 8.3248730964467, |
|
"grad_norm": 0.60595703125, |
|
"learning_rate": 1.286812958766106e-05, |
|
"loss": 0.6271, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 8.527918781725889, |
|
"grad_norm": 0.580078125, |
|
"learning_rate": 9.903113209758096e-06, |
|
"loss": 0.6202, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 8.730964467005077, |
|
"grad_norm": 0.54833984375, |
|
"learning_rate": 7.308324265397836e-06, |
|
"loss": 0.6382, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 8.934010152284264, |
|
"grad_norm": 0.53759765625, |
|
"learning_rate": 5.094425298933136e-06, |
|
"loss": 0.6374, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 9.137055837563452, |
|
"grad_norm": 0.591796875, |
|
"learning_rate": 3.270513696097055e-06, |
|
"loss": 0.6319, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 9.34010152284264, |
|
"grad_norm": 0.54638671875, |
|
"learning_rate": 1.8440843008934561e-06, |
|
"loss": 0.6336, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 9.543147208121827, |
|
"grad_norm": 0.56494140625, |
|
"learning_rate": 8.209986176753948e-07, |
|
"loss": 0.6319, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 9.746192893401016, |
|
"grad_norm": 0.55029296875, |
|
"learning_rate": 2.054607249663665e-07, |
|
"loss": 0.6295, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 9.949238578680204, |
|
"grad_norm": 0.5234375, |
|
"learning_rate": 0.0, |
|
"loss": 0.6314, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 9.949238578680204, |
|
"step": 490, |
|
"total_flos": 2.391072030326784e+16, |
|
"train_loss": 0.6941771516994554, |
|
"train_runtime": 436.887, |
|
"train_samples_per_second": 4.509, |
|
"train_steps_per_second": 1.122 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 490, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 10, |
|
"save_steps": 500, |
|
"total_flos": 2.391072030326784e+16, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|