{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.9998339008388006,
  "eval_steps": 500,
  "global_step": 12040,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.08304958059961798,
      "grad_norm": 0.3119010627269745,
      "learning_rate": 4.7940199335548175e-05,
      "loss": 0.3439,
      "step": 500
    },
    {
      "epoch": 0.16609916119923596,
      "grad_norm": 0.9808832406997681,
      "learning_rate": 4.5863787375415286e-05,
      "loss": 0.2923,
      "step": 1000
    },
    {
      "epoch": 0.2491487417988539,
      "grad_norm": 0.4935096800327301,
      "learning_rate": 4.379568106312292e-05,
      "loss": 0.4114,
      "step": 1500
    },
    {
      "epoch": 0.3321983223984719,
      "grad_norm": 0.8764172196388245,
      "learning_rate": 4.171926910299003e-05,
      "loss": 0.309,
      "step": 2000
    },
    {
      "epoch": 0.41524790299808984,
      "grad_norm": 0.9836592674255371,
      "learning_rate": 3.964285714285714e-05,
      "loss": 0.323,
      "step": 2500
    },
    {
      "epoch": 0.4982974835977078,
      "grad_norm": 0.8834519386291504,
      "learning_rate": 3.7566445182724254e-05,
      "loss": 0.2963,
      "step": 3000
    },
    {
      "epoch": 0.5813470641973258,
      "grad_norm": 0.0723366066813469,
      "learning_rate": 3.5490033222591366e-05,
      "loss": 0.2738,
      "step": 3500
    },
    {
      "epoch": 0.6643966447969438,
      "grad_norm": 0.799544095993042,
      "learning_rate": 3.341362126245847e-05,
      "loss": 0.2729,
      "step": 4000
    },
    {
      "epoch": 0.7474462253965617,
      "grad_norm": 0.26322752237319946,
      "learning_rate": 3.133720930232558e-05,
      "loss": 0.2726,
      "step": 4500
    },
    {
      "epoch": 0.8304958059961797,
      "grad_norm": 1.7207447290420532,
      "learning_rate": 2.9264950166112958e-05,
      "loss": 0.2854,
      "step": 5000
    },
    {
      "epoch": 0.9135453865957976,
      "grad_norm": 0.27596575021743774,
      "learning_rate": 2.718853820598007e-05,
      "loss": 0.2919,
      "step": 5500
    },
    {
      "epoch": 0.9965949671954156,
      "grad_norm": 0.3219141364097595,
      "learning_rate": 2.5112126245847177e-05,
      "loss": 0.2948,
      "step": 6000
    },
    {
      "epoch": 1.0796445477950336,
      "grad_norm": 1.064741849899292,
      "learning_rate": 2.3035714285714285e-05,
      "loss": 0.2182,
      "step": 6500
    },
    {
      "epoch": 1.1626941283946517,
      "grad_norm": 1.5456783771514893,
      "learning_rate": 2.0959302325581396e-05,
      "loss": 0.1974,
      "step": 7000
    },
    {
      "epoch": 1.2457437089942696,
      "grad_norm": 1.0485806465148926,
      "learning_rate": 1.8882890365448504e-05,
      "loss": 0.2068,
      "step": 7500
    },
    {
      "epoch": 1.3287932895938876,
      "grad_norm": 0.9721829891204834,
      "learning_rate": 1.6806478405315615e-05,
      "loss": 0.211,
      "step": 8000
    },
    {
      "epoch": 1.4118428701935055,
      "grad_norm": 0.15295402705669403,
      "learning_rate": 1.4730066445182724e-05,
      "loss": 0.2258,
      "step": 8500
    },
    {
      "epoch": 1.4948924507931234,
      "grad_norm": 1.4515725374221802,
      "learning_rate": 1.2657807308970101e-05,
      "loss": 0.2239,
      "step": 9000
    },
    {
      "epoch": 1.5779420313927415,
      "grad_norm": 1.0659480094909668,
      "learning_rate": 1.058139534883721e-05,
      "loss": 0.2222,
      "step": 9500
    },
    {
      "epoch": 1.6609916119923596,
      "grad_norm": 2.279104232788086,
      "learning_rate": 8.50498338870432e-06,
      "loss": 0.232,
      "step": 10000
    },
    {
      "epoch": 1.7440411925919774,
      "grad_norm": 0.7458949089050293,
      "learning_rate": 6.428571428571429e-06,
      "loss": 0.2378,
      "step": 10500
    },
    {
      "epoch": 1.8270907731915953,
      "grad_norm": 1.7988941669464111,
      "learning_rate": 4.352159468438539e-06,
      "loss": 0.223,
      "step": 11000
    },
    {
      "epoch": 1.9101403537912134,
      "grad_norm": 0.9544153213500977,
      "learning_rate": 2.275747508305648e-06,
      "loss": 0.2448,
      "step": 11500
    },
    {
      "epoch": 1.9931899343908315,
      "grad_norm": 2.081665277481079,
      "learning_rate": 1.9933554817275746e-07,
      "loss": 0.2509,
      "step": 12000
    },
    {
      "epoch": 1.9998339008388006,
      "step": 12040,
      "total_flos": 1.2491757048924733e+18,
      "train_loss": 0.2651168168185161,
      "train_runtime": 32469.1467,
      "train_samples_per_second": 2.967,
      "train_steps_per_second": 0.371
    }
  ],
  "logging_steps": 500,
  "max_steps": 12040,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 500,
  "total_flos": 1.2491757048924733e+18,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}