|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 5.0, |
|
"eval_steps": 100, |
|
"global_step": 2345, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.21321961620469082, |
|
"grad_norm": 1.6159120798110962, |
|
"learning_rate": 0.0007658848614072496, |
|
"loss": 0.026, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.21321961620469082, |
|
"eval_accuracy": 0.9867, |
|
"eval_loss": 0.04582102224230766, |
|
"eval_runtime": 5.154, |
|
"eval_samples_per_second": 1940.235, |
|
"eval_steps_per_second": 15.328, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.42643923240938164, |
|
"grad_norm": 0.22664281725883484, |
|
"learning_rate": 0.000731769722814499, |
|
"loss": 0.0232, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.42643923240938164, |
|
"eval_accuracy": 0.9867, |
|
"eval_loss": 0.04527696222066879, |
|
"eval_runtime": 5.4041, |
|
"eval_samples_per_second": 1850.446, |
|
"eval_steps_per_second": 14.619, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.6396588486140725, |
|
"grad_norm": 1.3475489616394043, |
|
"learning_rate": 0.0006976545842217485, |
|
"loss": 0.0277, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.6396588486140725, |
|
"eval_accuracy": 0.9863, |
|
"eval_loss": 0.04839787632226944, |
|
"eval_runtime": 5.3353, |
|
"eval_samples_per_second": 1874.323, |
|
"eval_steps_per_second": 14.807, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.8528784648187633, |
|
"grad_norm": 0.9408140182495117, |
|
"learning_rate": 0.0006635394456289979, |
|
"loss": 0.0293, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.8528784648187633, |
|
"eval_accuracy": 0.9865, |
|
"eval_loss": 0.046898942440748215, |
|
"eval_runtime": 5.1935, |
|
"eval_samples_per_second": 1925.498, |
|
"eval_steps_per_second": 15.211, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.0660980810234542, |
|
"grad_norm": 0.6018700003623962, |
|
"learning_rate": 0.0006294243070362473, |
|
"loss": 0.0235, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.0660980810234542, |
|
"eval_accuracy": 0.9899, |
|
"eval_loss": 0.028819510713219643, |
|
"eval_runtime": 5.1407, |
|
"eval_samples_per_second": 1945.277, |
|
"eval_steps_per_second": 15.368, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.279317697228145, |
|
"grad_norm": 0.7029837369918823, |
|
"learning_rate": 0.0005953091684434968, |
|
"loss": 0.0203, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.279317697228145, |
|
"eval_accuracy": 0.9924, |
|
"eval_loss": 0.02526690438389778, |
|
"eval_runtime": 4.869, |
|
"eval_samples_per_second": 2053.81, |
|
"eval_steps_per_second": 16.225, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.4925373134328357, |
|
"grad_norm": 0.17393378913402557, |
|
"learning_rate": 0.0005611940298507463, |
|
"loss": 0.0182, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.4925373134328357, |
|
"eval_accuracy": 0.9916, |
|
"eval_loss": 0.028603948652744293, |
|
"eval_runtime": 4.6847, |
|
"eval_samples_per_second": 2134.586, |
|
"eval_steps_per_second": 16.863, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.7057569296375266, |
|
"grad_norm": 0.9470173120498657, |
|
"learning_rate": 0.0005270788912579957, |
|
"loss": 0.0205, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 1.7057569296375266, |
|
"eval_accuracy": 0.9935, |
|
"eval_loss": 0.02031560428440571, |
|
"eval_runtime": 5.0835, |
|
"eval_samples_per_second": 1967.155, |
|
"eval_steps_per_second": 15.541, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 1.9189765458422174, |
|
"grad_norm": 1.4821853637695312, |
|
"learning_rate": 0.0004929637526652453, |
|
"loss": 0.0162, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 1.9189765458422174, |
|
"eval_accuracy": 0.9913, |
|
"eval_loss": 0.023791342973709106, |
|
"eval_runtime": 5.2196, |
|
"eval_samples_per_second": 1915.863, |
|
"eval_steps_per_second": 15.135, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 2.1321961620469083, |
|
"grad_norm": 1.0076653957366943, |
|
"learning_rate": 0.0004588486140724947, |
|
"loss": 0.0118, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 2.1321961620469083, |
|
"eval_accuracy": 0.9916, |
|
"eval_loss": 0.024731909856200218, |
|
"eval_runtime": 5.8209, |
|
"eval_samples_per_second": 1717.944, |
|
"eval_steps_per_second": 13.572, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 2.345415778251599, |
|
"grad_norm": 0.39255017042160034, |
|
"learning_rate": 0.0004247334754797441, |
|
"loss": 0.0121, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 2.345415778251599, |
|
"eval_accuracy": 0.9932, |
|
"eval_loss": 0.019426949322223663, |
|
"eval_runtime": 5.5038, |
|
"eval_samples_per_second": 1816.931, |
|
"eval_steps_per_second": 14.354, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 2.55863539445629, |
|
"grad_norm": 0.4671665132045746, |
|
"learning_rate": 0.0003906183368869936, |
|
"loss": 0.0154, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 2.55863539445629, |
|
"eval_accuracy": 0.9933, |
|
"eval_loss": 0.01936325989663601, |
|
"eval_runtime": 5.4304, |
|
"eval_samples_per_second": 1841.5, |
|
"eval_steps_per_second": 14.548, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 2.771855010660981, |
|
"grad_norm": 0.17253048717975616, |
|
"learning_rate": 0.0003565031982942431, |
|
"loss": 0.015, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 2.771855010660981, |
|
"eval_accuracy": 0.9933, |
|
"eval_loss": 0.02162059210240841, |
|
"eval_runtime": 5.3729, |
|
"eval_samples_per_second": 1861.177, |
|
"eval_steps_per_second": 14.703, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 2.9850746268656714, |
|
"grad_norm": 0.8343185782432556, |
|
"learning_rate": 0.00032238805970149256, |
|
"loss": 0.0145, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 2.9850746268656714, |
|
"eval_accuracy": 0.9919, |
|
"eval_loss": 0.02381654642522335, |
|
"eval_runtime": 5.1099, |
|
"eval_samples_per_second": 1957.0, |
|
"eval_steps_per_second": 15.46, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 3.1982942430703627, |
|
"grad_norm": 0.3106481432914734, |
|
"learning_rate": 0.000288272921108742, |
|
"loss": 0.0098, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 3.1982942430703627, |
|
"eval_accuracy": 0.993, |
|
"eval_loss": 0.020756520330905914, |
|
"eval_runtime": 4.9767, |
|
"eval_samples_per_second": 2009.353, |
|
"eval_steps_per_second": 15.874, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 3.411513859275053, |
|
"grad_norm": 0.03616774454712868, |
|
"learning_rate": 0.0002541577825159915, |
|
"loss": 0.0093, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 3.411513859275053, |
|
"eval_accuracy": 0.9929, |
|
"eval_loss": 0.021822581067681313, |
|
"eval_runtime": 5.4965, |
|
"eval_samples_per_second": 1819.356, |
|
"eval_steps_per_second": 14.373, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 3.624733475479744, |
|
"grad_norm": 0.6045345067977905, |
|
"learning_rate": 0.00022004264392324095, |
|
"loss": 0.0073, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 3.624733475479744, |
|
"eval_accuracy": 0.9933, |
|
"eval_loss": 0.018862707540392876, |
|
"eval_runtime": 5.7766, |
|
"eval_samples_per_second": 1731.12, |
|
"eval_steps_per_second": 13.676, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 3.837953091684435, |
|
"grad_norm": 0.631215512752533, |
|
"learning_rate": 0.0001859275053304904, |
|
"loss": 0.008, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 3.837953091684435, |
|
"eval_accuracy": 0.9932, |
|
"eval_loss": 0.01944512128829956, |
|
"eval_runtime": 5.1522, |
|
"eval_samples_per_second": 1940.922, |
|
"eval_steps_per_second": 15.333, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 4.051172707889126, |
|
"grad_norm": 0.5996153950691223, |
|
"learning_rate": 0.0001518123667377399, |
|
"loss": 0.006, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 4.051172707889126, |
|
"eval_accuracy": 0.9938, |
|
"eval_loss": 0.018317226320505142, |
|
"eval_runtime": 4.9544, |
|
"eval_samples_per_second": 2018.428, |
|
"eval_steps_per_second": 15.946, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 4.264392324093817, |
|
"grad_norm": 0.0069321137852966785, |
|
"learning_rate": 0.00011769722814498933, |
|
"loss": 0.0063, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 4.264392324093817, |
|
"eval_accuracy": 0.9934, |
|
"eval_loss": 0.018384862691164017, |
|
"eval_runtime": 4.7636, |
|
"eval_samples_per_second": 2099.232, |
|
"eval_steps_per_second": 16.584, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 4.477611940298507, |
|
"grad_norm": 0.06509185582399368, |
|
"learning_rate": 8.392324093816631e-05, |
|
"loss": 0.0043, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 4.477611940298507, |
|
"eval_accuracy": 0.9932, |
|
"eval_loss": 0.018380142748355865, |
|
"eval_runtime": 4.6951, |
|
"eval_samples_per_second": 2129.883, |
|
"eval_steps_per_second": 16.826, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 4.690831556503198, |
|
"grad_norm": 0.17103737592697144, |
|
"learning_rate": 4.980810234541578e-05, |
|
"loss": 0.0035, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 4.690831556503198, |
|
"eval_accuracy": 0.9931, |
|
"eval_loss": 0.018344268202781677, |
|
"eval_runtime": 4.6229, |
|
"eval_samples_per_second": 2163.133, |
|
"eval_steps_per_second": 17.089, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 4.904051172707889, |
|
"grad_norm": 0.6465263962745667, |
|
"learning_rate": 1.5692963752665246e-05, |
|
"loss": 0.0061, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 4.904051172707889, |
|
"eval_accuracy": 0.9931, |
|
"eval_loss": 0.018412619829177856, |
|
"eval_runtime": 4.7593, |
|
"eval_samples_per_second": 2101.13, |
|
"eval_steps_per_second": 16.599, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"step": 2345, |
|
"total_flos": 1346208595200000.0, |
|
"train_loss": 0.014373551363121472, |
|
"train_runtime": 335.5598, |
|
"train_samples_per_second": 894.028, |
|
"train_steps_per_second": 6.988 |
|
} |
|
], |
|
"logging_steps": 100, |
|
"max_steps": 2345, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 5, |
|
"save_steps": 3000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1346208595200000.0, |
|
"train_batch_size": 128, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|