|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 9.852216748768473, |
|
"eval_steps": 500, |
|
"global_step": 500, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.19704433497536947, |
|
"grad_norm": 0.90673828125, |
|
"learning_rate": 0.00019980267284282717, |
|
"loss": 1.6839, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.39408866995073893, |
|
"grad_norm": 0.7666015625, |
|
"learning_rate": 0.0001992114701314478, |
|
"loss": 1.3775, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.5911330049261084, |
|
"grad_norm": 1.26171875, |
|
"learning_rate": 0.0001982287250728689, |
|
"loss": 1.2362, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.7881773399014779, |
|
"grad_norm": 0.51904296875, |
|
"learning_rate": 0.0001968583161128631, |
|
"loss": 1.1045, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.9852216748768473, |
|
"grad_norm": 0.457763671875, |
|
"learning_rate": 0.00019510565162951537, |
|
"loss": 1.0098, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 1.1822660098522166, |
|
"grad_norm": 0.378173828125, |
|
"learning_rate": 0.00019297764858882514, |
|
"loss": 0.9158, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 1.3793103448275863, |
|
"grad_norm": 0.385009765625, |
|
"learning_rate": 0.00019048270524660196, |
|
"loss": 0.8679, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 1.5763546798029555, |
|
"grad_norm": 0.436279296875, |
|
"learning_rate": 0.00018763066800438636, |
|
"loss": 0.8699, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 1.7733990147783252, |
|
"grad_norm": 0.358154296875, |
|
"learning_rate": 0.00018443279255020152, |
|
"loss": 0.8928, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 1.9704433497536946, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 0.00018090169943749476, |
|
"loss": 0.8451, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 2.167487684729064, |
|
"grad_norm": 0.344482421875, |
|
"learning_rate": 0.00017705132427757895, |
|
"loss": 0.8518, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 2.3645320197044333, |
|
"grad_norm": 0.3330078125, |
|
"learning_rate": 0.00017289686274214118, |
|
"loss": 0.834, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 2.561576354679803, |
|
"grad_norm": 0.33935546875, |
|
"learning_rate": 0.00016845471059286887, |
|
"loss": 0.8082, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 2.7586206896551726, |
|
"grad_norm": 0.381103515625, |
|
"learning_rate": 0.000163742398974869, |
|
"loss": 0.8103, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 2.955665024630542, |
|
"grad_norm": 0.371826171875, |
|
"learning_rate": 0.00015877852522924732, |
|
"loss": 0.8405, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 3.1527093596059115, |
|
"grad_norm": 0.4560546875, |
|
"learning_rate": 0.00015358267949789966, |
|
"loss": 0.8076, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 3.3497536945812807, |
|
"grad_norm": 0.497314453125, |
|
"learning_rate": 0.00014817536741017152, |
|
"loss": 0.8112, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 3.5467980295566504, |
|
"grad_norm": 0.3759765625, |
|
"learning_rate": 0.00014257792915650728, |
|
"loss": 0.7933, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 3.7438423645320196, |
|
"grad_norm": 0.396484375, |
|
"learning_rate": 0.00013681245526846783, |
|
"loss": 0.7877, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 3.9408866995073892, |
|
"grad_norm": 0.421630859375, |
|
"learning_rate": 0.00013090169943749476, |
|
"loss": 0.8012, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 4.137931034482759, |
|
"grad_norm": 0.4208984375, |
|
"learning_rate": 0.0001248689887164855, |
|
"loss": 0.799, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 4.334975369458128, |
|
"grad_norm": 0.377685546875, |
|
"learning_rate": 0.00011873813145857249, |
|
"loss": 0.7942, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 4.532019704433497, |
|
"grad_norm": 0.52880859375, |
|
"learning_rate": 0.00011253332335643043, |
|
"loss": 0.7658, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 4.7290640394088665, |
|
"grad_norm": 0.462890625, |
|
"learning_rate": 0.00010627905195293135, |
|
"loss": 0.7761, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 4.926108374384237, |
|
"grad_norm": 0.54638671875, |
|
"learning_rate": 0.0001, |
|
"loss": 0.789, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 5.123152709359606, |
|
"grad_norm": 0.42822265625, |
|
"learning_rate": 9.372094804706867e-05, |
|
"loss": 0.7697, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 5.320197044334975, |
|
"grad_norm": 0.4482421875, |
|
"learning_rate": 8.746667664356956e-05, |
|
"loss": 0.7713, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 5.517241379310345, |
|
"grad_norm": 0.45361328125, |
|
"learning_rate": 8.126186854142752e-05, |
|
"loss": 0.7812, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 5.714285714285714, |
|
"grad_norm": 0.4521484375, |
|
"learning_rate": 7.513101128351454e-05, |
|
"loss": 0.7549, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 5.911330049261084, |
|
"grad_norm": 0.45703125, |
|
"learning_rate": 6.909830056250527e-05, |
|
"loss": 0.7387, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 6.108374384236453, |
|
"grad_norm": 0.433837890625, |
|
"learning_rate": 6.318754473153221e-05, |
|
"loss": 0.759, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 6.305418719211823, |
|
"grad_norm": 0.492431640625, |
|
"learning_rate": 5.7422070843492734e-05, |
|
"loss": 0.7572, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 6.502463054187192, |
|
"grad_norm": 0.45068359375, |
|
"learning_rate": 5.182463258982846e-05, |
|
"loss": 0.7507, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 6.699507389162561, |
|
"grad_norm": 0.476806640625, |
|
"learning_rate": 4.6417320502100316e-05, |
|
"loss": 0.7483, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 6.896551724137931, |
|
"grad_norm": 0.464599609375, |
|
"learning_rate": 4.12214747707527e-05, |
|
"loss": 0.7618, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 7.093596059113301, |
|
"grad_norm": 0.44970703125, |
|
"learning_rate": 3.6257601025131026e-05, |
|
"loss": 0.7294, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 7.29064039408867, |
|
"grad_norm": 0.50390625, |
|
"learning_rate": 3.154528940713113e-05, |
|
"loss": 0.7575, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 7.487684729064039, |
|
"grad_norm": 0.45751953125, |
|
"learning_rate": 2.7103137257858868e-05, |
|
"loss": 0.7349, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 7.684729064039409, |
|
"grad_norm": 0.51123046875, |
|
"learning_rate": 2.2948675722421086e-05, |
|
"loss": 0.7431, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 7.8817733990147785, |
|
"grad_norm": 0.442138671875, |
|
"learning_rate": 1.9098300562505266e-05, |
|
"loss": 0.7429, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 8.078817733990148, |
|
"grad_norm": 0.4794921875, |
|
"learning_rate": 1.5567207449798515e-05, |
|
"loss": 0.7386, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 8.275862068965518, |
|
"grad_norm": 0.49609375, |
|
"learning_rate": 1.2369331995613665e-05, |
|
"loss": 0.7209, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 8.472906403940886, |
|
"grad_norm": 0.453369140625, |
|
"learning_rate": 9.517294753398064e-06, |
|
"loss": 0.7379, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 8.669950738916256, |
|
"grad_norm": 0.461181640625, |
|
"learning_rate": 7.022351411174866e-06, |
|
"loss": 0.7192, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 8.866995073891626, |
|
"grad_norm": 0.4892578125, |
|
"learning_rate": 4.8943483704846475e-06, |
|
"loss": 0.7553, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 9.064039408866995, |
|
"grad_norm": 0.52685546875, |
|
"learning_rate": 3.1416838871368924e-06, |
|
"loss": 0.7381, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 9.261083743842365, |
|
"grad_norm": 0.490234375, |
|
"learning_rate": 1.771274927131139e-06, |
|
"loss": 0.733, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 9.458128078817733, |
|
"grad_norm": 0.46240234375, |
|
"learning_rate": 7.885298685522235e-07, |
|
"loss": 0.7257, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 9.655172413793103, |
|
"grad_norm": 0.5029296875, |
|
"learning_rate": 1.973271571728441e-07, |
|
"loss": 0.7418, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 9.852216748768473, |
|
"grad_norm": 0.447021484375, |
|
"learning_rate": 0.0, |
|
"loss": 0.7347, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 9.852216748768473, |
|
"step": 500, |
|
"total_flos": 2.4398694187008e+16, |
|
"train_loss": 0.8303789939880372, |
|
"train_runtime": 437.9863, |
|
"train_samples_per_second": 4.635, |
|
"train_steps_per_second": 1.142 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 500, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 10, |
|
"save_steps": 500, |
|
"total_flos": 2.4398694187008e+16, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|