{
"best_metric": 1.8007385730743408,
"best_model_checkpoint": "miner_id_24/checkpoint-50",
"epoch": 0.8547008547008547,
"eval_steps": 25,
"global_step": 50,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.017094017094017096,
"grad_norm": 3.938539743423462,
"learning_rate": 2.9999999999999997e-05,
"loss": 3.4641,
"step": 1
},
{
"epoch": 0.017094017094017096,
"eval_loss": 4.367037296295166,
"eval_runtime": 1.8323,
"eval_samples_per_second": 27.289,
"eval_steps_per_second": 3.82,
"step": 1
},
{
"epoch": 0.03418803418803419,
"grad_norm": 4.934310436248779,
"learning_rate": 5.9999999999999995e-05,
"loss": 3.7733,
"step": 2
},
{
"epoch": 0.05128205128205128,
"grad_norm": 4.855484962463379,
"learning_rate": 8.999999999999999e-05,
"loss": 3.6143,
"step": 3
},
{
"epoch": 0.06837606837606838,
"grad_norm": 5.6892476081848145,
"learning_rate": 0.00011999999999999999,
"loss": 3.701,
"step": 4
},
{
"epoch": 0.08547008547008547,
"grad_norm": 4.726133823394775,
"learning_rate": 0.00015,
"loss": 3.2567,
"step": 5
},
{
"epoch": 0.10256410256410256,
"grad_norm": 3.7231974601745605,
"learning_rate": 0.00017999999999999998,
"loss": 2.7275,
"step": 6
},
{
"epoch": 0.11965811965811966,
"grad_norm": 2.9245429039001465,
"learning_rate": 0.00020999999999999998,
"loss": 2.7143,
"step": 7
},
{
"epoch": 0.13675213675213677,
"grad_norm": 3.1964733600616455,
"learning_rate": 0.00023999999999999998,
"loss": 2.3785,
"step": 8
},
{
"epoch": 0.15384615384615385,
"grad_norm": 3.5964674949645996,
"learning_rate": 0.00027,
"loss": 2.6142,
"step": 9
},
{
"epoch": 0.17094017094017094,
"grad_norm": 3.1771557331085205,
"learning_rate": 0.0003,
"loss": 2.5761,
"step": 10
},
{
"epoch": 0.18803418803418803,
"grad_norm": 2.4278526306152344,
"learning_rate": 0.00029993535093404976,
"loss": 2.1675,
"step": 11
},
{
"epoch": 0.20512820512820512,
"grad_norm": 3.4053242206573486,
"learning_rate": 0.00029974145946288876,
"loss": 2.1913,
"step": 12
},
{
"epoch": 0.2222222222222222,
"grad_norm": 3.4192192554473877,
"learning_rate": 0.00029941849271855034,
"loss": 2.1841,
"step": 13
},
{
"epoch": 0.23931623931623933,
"grad_norm": 3.6070780754089355,
"learning_rate": 0.00029896672909434605,
"loss": 1.6725,
"step": 14
},
{
"epoch": 0.2564102564102564,
"grad_norm": 3.5193164348602295,
"learning_rate": 0.0002983865580048935,
"loss": 2.4415,
"step": 15
},
{
"epoch": 0.27350427350427353,
"grad_norm": 2.824167490005493,
"learning_rate": 0.00029767847955044656,
"loss": 2.1966,
"step": 16
},
{
"epoch": 0.2905982905982906,
"grad_norm": 2.186579465866089,
"learning_rate": 0.0002968431040858144,
"loss": 2.1251,
"step": 17
},
{
"epoch": 0.3076923076923077,
"grad_norm": 2.0862064361572266,
"learning_rate": 0.0002958811516942438,
"loss": 2.1255,
"step": 18
},
{
"epoch": 0.3247863247863248,
"grad_norm": 2.040565013885498,
"learning_rate": 0.0002947934515667162,
"loss": 2.1153,
"step": 19
},
{
"epoch": 0.3418803418803419,
"grad_norm": 2.1491634845733643,
"learning_rate": 0.0002935809412871952,
"loss": 2.1219,
"step": 20
},
{
"epoch": 0.358974358974359,
"grad_norm": 2.158245325088501,
"learning_rate": 0.00029224466602444126,
"loss": 2.045,
"step": 21
},
{
"epoch": 0.37606837606837606,
"grad_norm": 2.090855121612549,
"learning_rate": 0.00029078577763108886,
"loss": 1.9201,
"step": 22
},
{
"epoch": 0.39316239316239315,
"grad_norm": 2.34712553024292,
"learning_rate": 0.0002892055336507641,
"loss": 2.1339,
"step": 23
},
{
"epoch": 0.41025641025641024,
"grad_norm": 2.3505215644836426,
"learning_rate": 0.00028750529623409766,
"loss": 2.167,
"step": 24
},
{
"epoch": 0.42735042735042733,
"grad_norm": 2.1497771739959717,
"learning_rate": 0.0002856865309645679,
"loss": 1.9427,
"step": 25
},
{
"epoch": 0.42735042735042733,
"eval_loss": 2.0743188858032227,
"eval_runtime": 1.8525,
"eval_samples_per_second": 26.991,
"eval_steps_per_second": 3.779,
"step": 25
},
{
"epoch": 0.4444444444444444,
"grad_norm": 2.295942544937134,
"learning_rate": 0.0002837508055951863,
"loss": 1.8171,
"step": 26
},
{
"epoch": 0.46153846153846156,
"grad_norm": 2.5262718200683594,
"learning_rate": 0.00028169978869711386,
"loss": 1.6738,
"step": 27
},
{
"epoch": 0.47863247863247865,
"grad_norm": 2.7707014083862305,
"learning_rate": 0.00027953524822137317,
"loss": 1.6301,
"step": 28
},
{
"epoch": 0.49572649572649574,
"grad_norm": 2.6095831394195557,
"learning_rate": 0.00027725904997489724,
"loss": 2.1898,
"step": 29
},
{
"epoch": 0.5128205128205128,
"grad_norm": 2.4387624263763428,
"learning_rate": 0.0002748731560122267,
"loss": 2.2275,
"step": 30
},
{
"epoch": 0.5299145299145299,
"grad_norm": 2.2945480346679688,
"learning_rate": 0.00027237962294424355,
"loss": 1.9293,
"step": 31
},
{
"epoch": 0.5470085470085471,
"grad_norm": 2.197488784790039,
"learning_rate": 0.0002697806001653979,
"loss": 2.0488,
"step": 32
},
{
"epoch": 0.5641025641025641,
"grad_norm": 2.2360706329345703,
"learning_rate": 0.0002670783280009569,
"loss": 2.0198,
"step": 33
},
{
"epoch": 0.5811965811965812,
"grad_norm": 2.0630950927734375,
"learning_rate": 0.00026427513577587217,
"loss": 1.6752,
"step": 34
},
{
"epoch": 0.5982905982905983,
"grad_norm": 2.5239932537078857,
"learning_rate": 0.0002613734398069308,
"loss": 1.9719,
"step": 35
},
{
"epoch": 0.6153846153846154,
"grad_norm": 2.581416130065918,
"learning_rate": 0.0002583757413199203,
"loss": 2.0279,
"step": 36
},
{
"epoch": 0.6324786324786325,
"grad_norm": 2.4350247383117676,
"learning_rate": 0.0002552846242936032,
"loss": 2.0634,
"step": 37
},
{
"epoch": 0.6495726495726496,
"grad_norm": 2.4420244693756104,
"learning_rate": 0.00025210275323235944,
"loss": 2.0166,
"step": 38
},
{
"epoch": 0.6666666666666666,
"grad_norm": 2.505471706390381,
"learning_rate": 0.00024883287086941667,
"loss": 1.8165,
"step": 39
},
{
"epoch": 0.6837606837606838,
"grad_norm": 2.3507983684539795,
"learning_rate": 0.00024547779580264874,
"loss": 1.6553,
"step": 40
},
{
"epoch": 0.7008547008547008,
"grad_norm": 2.3374176025390625,
"learning_rate": 0.00024204042006497906,
"loss": 1.2892,
"step": 41
},
{
"epoch": 0.717948717948718,
"grad_norm": 2.7650320529937744,
"learning_rate": 0.0002385237066314845,
"loss": 1.3018,
"step": 42
},
{
"epoch": 0.7350427350427351,
"grad_norm": 1.938658356666565,
"learning_rate": 0.00023493068686534756,
"loss": 1.8023,
"step": 43
},
{
"epoch": 0.7521367521367521,
"grad_norm": 2.1138954162597656,
"learning_rate": 0.00023126445790485917,
"loss": 2.0602,
"step": 44
},
{
"epoch": 0.7692307692307693,
"grad_norm": 2.6716885566711426,
"learning_rate": 0.0002275281799937241,
"loss": 1.9736,
"step": 45
},
{
"epoch": 0.7863247863247863,
"grad_norm": 2.3430912494659424,
"learning_rate": 0.00022372507375697015,
"loss": 1.8303,
"step": 46
},
{
"epoch": 0.8034188034188035,
"grad_norm": 2.5371756553649902,
"learning_rate": 0.00021985841742480953,
"loss": 1.6751,
"step": 47
},
{
"epoch": 0.8205128205128205,
"grad_norm": 2.3102569580078125,
"learning_rate": 0.00021593154400684523,
"loss": 1.7225,
"step": 48
},
{
"epoch": 0.8376068376068376,
"grad_norm": 2.714444637298584,
"learning_rate": 0.00021194783841905826,
"loss": 1.7524,
"step": 49
},
{
"epoch": 0.8547008547008547,
"grad_norm": 2.427034616470337,
"learning_rate": 0.00020791073456605222,
"loss": 1.5867,
"step": 50
},
{
"epoch": 0.8547008547008547,
"eval_loss": 1.8007385730743408,
"eval_runtime": 1.8575,
"eval_samples_per_second": 26.918,
"eval_steps_per_second": 3.768,
"step": 50
}
],
"logging_steps": 1,
"max_steps": 117,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 50,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 1,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2.09052260892672e+16,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}