fats-fme's picture
Training in progress, step 63, checkpoint
e3d4540 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9960474308300395,
"eval_steps": 16,
"global_step": 63,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.015810276679841896,
"grad_norm": 3.106086254119873,
"learning_rate": 2.0000000000000003e-06,
"loss": 0.2762,
"step": 1
},
{
"epoch": 0.015810276679841896,
"eval_loss": 0.34248441457748413,
"eval_runtime": 9.4978,
"eval_samples_per_second": 11.266,
"eval_steps_per_second": 2.843,
"step": 1
},
{
"epoch": 0.03162055335968379,
"grad_norm": 0.6217677593231201,
"learning_rate": 4.000000000000001e-06,
"loss": 0.2623,
"step": 2
},
{
"epoch": 0.04743083003952569,
"grad_norm": 1.1419365406036377,
"learning_rate": 6e-06,
"loss": 0.3943,
"step": 3
},
{
"epoch": 0.06324110671936758,
"grad_norm": 0.8881447315216064,
"learning_rate": 8.000000000000001e-06,
"loss": 0.1572,
"step": 4
},
{
"epoch": 0.07905138339920949,
"grad_norm": 0.7020868062973022,
"learning_rate": 1e-05,
"loss": 0.1578,
"step": 5
},
{
"epoch": 0.09486166007905138,
"grad_norm": 1.2612448930740356,
"learning_rate": 1.2e-05,
"loss": 0.2245,
"step": 6
},
{
"epoch": 0.11067193675889328,
"grad_norm": 1.2954490184783936,
"learning_rate": 1.4000000000000001e-05,
"loss": 0.2101,
"step": 7
},
{
"epoch": 0.12648221343873517,
"grad_norm": 0.8434045314788818,
"learning_rate": 1.6000000000000003e-05,
"loss": 0.1529,
"step": 8
},
{
"epoch": 0.1422924901185771,
"grad_norm": 0.9214808940887451,
"learning_rate": 1.8e-05,
"loss": 0.1522,
"step": 9
},
{
"epoch": 0.15810276679841898,
"grad_norm": 1.3638683557510376,
"learning_rate": 2e-05,
"loss": 0.3282,
"step": 10
},
{
"epoch": 0.17391304347826086,
"grad_norm": 2.2487895488739014,
"learning_rate": 2.2000000000000003e-05,
"loss": 0.472,
"step": 11
},
{
"epoch": 0.18972332015810275,
"grad_norm": 1.98398756980896,
"learning_rate": 2.4e-05,
"loss": 0.4099,
"step": 12
},
{
"epoch": 0.20553359683794467,
"grad_norm": 1.796846866607666,
"learning_rate": 2.6000000000000002e-05,
"loss": 0.3184,
"step": 13
},
{
"epoch": 0.22134387351778656,
"grad_norm": 1.6363037824630737,
"learning_rate": 2.8000000000000003e-05,
"loss": 0.3551,
"step": 14
},
{
"epoch": 0.23715415019762845,
"grad_norm": 1.927720308303833,
"learning_rate": 3e-05,
"loss": 0.405,
"step": 15
},
{
"epoch": 0.25296442687747034,
"grad_norm": 1.0266072750091553,
"learning_rate": 3.2000000000000005e-05,
"loss": 0.2991,
"step": 16
},
{
"epoch": 0.25296442687747034,
"eval_loss": 0.21815715730190277,
"eval_runtime": 8.042,
"eval_samples_per_second": 13.305,
"eval_steps_per_second": 3.357,
"step": 16
},
{
"epoch": 0.26877470355731226,
"grad_norm": 0.4768655002117157,
"learning_rate": 3.4000000000000007e-05,
"loss": 0.2191,
"step": 17
},
{
"epoch": 0.2845849802371542,
"grad_norm": 0.9923710823059082,
"learning_rate": 3.6e-05,
"loss": 0.384,
"step": 18
},
{
"epoch": 0.30039525691699603,
"grad_norm": 0.5063422322273254,
"learning_rate": 3.8e-05,
"loss": 0.1493,
"step": 19
},
{
"epoch": 0.31620553359683795,
"grad_norm": 0.5010712742805481,
"learning_rate": 4e-05,
"loss": 0.0675,
"step": 20
},
{
"epoch": 0.33201581027667987,
"grad_norm": 0.48852574825286865,
"learning_rate": 4.2e-05,
"loss": 0.0739,
"step": 21
},
{
"epoch": 0.34782608695652173,
"grad_norm": 0.4387320578098297,
"learning_rate": 4.4000000000000006e-05,
"loss": 0.0751,
"step": 22
},
{
"epoch": 0.36363636363636365,
"grad_norm": 0.3455885648727417,
"learning_rate": 4.600000000000001e-05,
"loss": 0.0516,
"step": 23
},
{
"epoch": 0.3794466403162055,
"grad_norm": 0.29778942465782166,
"learning_rate": 4.8e-05,
"loss": 0.0421,
"step": 24
},
{
"epoch": 0.3952569169960474,
"grad_norm": 0.3956562578678131,
"learning_rate": 5e-05,
"loss": 0.057,
"step": 25
},
{
"epoch": 0.41106719367588934,
"grad_norm": 0.8842503428459167,
"learning_rate": 5.2000000000000004e-05,
"loss": 0.1188,
"step": 26
},
{
"epoch": 0.4268774703557312,
"grad_norm": 0.9197725653648376,
"learning_rate": 5.4000000000000005e-05,
"loss": 0.1128,
"step": 27
},
{
"epoch": 0.4426877470355731,
"grad_norm": 0.9175456762313843,
"learning_rate": 5.6000000000000006e-05,
"loss": 0.0628,
"step": 28
},
{
"epoch": 0.45849802371541504,
"grad_norm": 0.5987579822540283,
"learning_rate": 5.8e-05,
"loss": 0.0499,
"step": 29
},
{
"epoch": 0.4743083003952569,
"grad_norm": 0.8026472330093384,
"learning_rate": 6e-05,
"loss": 0.0619,
"step": 30
},
{
"epoch": 0.4901185770750988,
"grad_norm": 0.5789671540260315,
"learning_rate": 6.2e-05,
"loss": 0.0394,
"step": 31
},
{
"epoch": 0.5059288537549407,
"grad_norm": 0.7335872054100037,
"learning_rate": 6.400000000000001e-05,
"loss": 0.2007,
"step": 32
},
{
"epoch": 0.5059288537549407,
"eval_loss": 0.050875596702098846,
"eval_runtime": 8.2485,
"eval_samples_per_second": 12.972,
"eval_steps_per_second": 3.273,
"step": 32
},
{
"epoch": 0.5217391304347826,
"grad_norm": 0.6524657011032104,
"learning_rate": 6.6e-05,
"loss": 0.1999,
"step": 33
},
{
"epoch": 0.5375494071146245,
"grad_norm": 1.1696100234985352,
"learning_rate": 6.800000000000001e-05,
"loss": 0.3284,
"step": 34
},
{
"epoch": 0.5533596837944664,
"grad_norm": 0.16617901623249054,
"learning_rate": 7e-05,
"loss": 0.0075,
"step": 35
},
{
"epoch": 0.5691699604743083,
"grad_norm": 0.34399452805519104,
"learning_rate": 7.2e-05,
"loss": 0.0108,
"step": 36
},
{
"epoch": 0.5849802371541502,
"grad_norm": 0.39466115832328796,
"learning_rate": 7.4e-05,
"loss": 0.0283,
"step": 37
},
{
"epoch": 0.6007905138339921,
"grad_norm": 0.31488093733787537,
"learning_rate": 7.6e-05,
"loss": 0.0178,
"step": 38
},
{
"epoch": 0.616600790513834,
"grad_norm": 0.32184290885925293,
"learning_rate": 7.800000000000001e-05,
"loss": 0.011,
"step": 39
},
{
"epoch": 0.6324110671936759,
"grad_norm": 0.395563006401062,
"learning_rate": 8e-05,
"loss": 0.0096,
"step": 40
},
{
"epoch": 0.6482213438735178,
"grad_norm": 0.22120489180088043,
"learning_rate": 8.2e-05,
"loss": 0.005,
"step": 41
},
{
"epoch": 0.6640316205533597,
"grad_norm": 0.3320792317390442,
"learning_rate": 8.4e-05,
"loss": 0.008,
"step": 42
},
{
"epoch": 0.6798418972332015,
"grad_norm": 0.28633660078048706,
"learning_rate": 8.6e-05,
"loss": 0.005,
"step": 43
},
{
"epoch": 0.6956521739130435,
"grad_norm": 0.4111138582229614,
"learning_rate": 8.800000000000001e-05,
"loss": 0.0084,
"step": 44
},
{
"epoch": 0.7114624505928854,
"grad_norm": 0.06865093857049942,
"learning_rate": 9e-05,
"loss": 0.0019,
"step": 45
},
{
"epoch": 0.7272727272727273,
"grad_norm": 0.05319277197122574,
"learning_rate": 9.200000000000001e-05,
"loss": 0.0016,
"step": 46
},
{
"epoch": 0.7430830039525692,
"grad_norm": 0.6792236566543579,
"learning_rate": 9.4e-05,
"loss": 0.1265,
"step": 47
},
{
"epoch": 0.758893280632411,
"grad_norm": 0.5738235712051392,
"learning_rate": 9.6e-05,
"loss": 0.0992,
"step": 48
},
{
"epoch": 0.758893280632411,
"eval_loss": 0.025439240038394928,
"eval_runtime": 8.0993,
"eval_samples_per_second": 13.211,
"eval_steps_per_second": 3.334,
"step": 48
},
{
"epoch": 0.7747035573122529,
"grad_norm": 0.8376194834709167,
"learning_rate": 9.8e-05,
"loss": 0.1525,
"step": 49
},
{
"epoch": 0.7905138339920948,
"grad_norm": 0.18561908602714539,
"learning_rate": 0.0001,
"loss": 0.0172,
"step": 50
},
{
"epoch": 0.8063241106719368,
"grad_norm": 0.31320735812187195,
"learning_rate": 9.85470908713026e-05,
"loss": 0.0038,
"step": 51
},
{
"epoch": 0.8221343873517787,
"grad_norm": 0.4068452715873718,
"learning_rate": 9.42728012826605e-05,
"loss": 0.0178,
"step": 52
},
{
"epoch": 0.8379446640316206,
"grad_norm": 0.16684125363826752,
"learning_rate": 8.742553740855506e-05,
"loss": 0.0087,
"step": 53
},
{
"epoch": 0.8537549407114624,
"grad_norm": 0.05175252631306648,
"learning_rate": 7.840323733655778e-05,
"loss": 0.0011,
"step": 54
},
{
"epoch": 0.8695652173913043,
"grad_norm": 0.018029799684882164,
"learning_rate": 6.773024435212678e-05,
"loss": 0.0006,
"step": 55
},
{
"epoch": 0.8853754940711462,
"grad_norm": 0.03412799909710884,
"learning_rate": 5.602683401276615e-05,
"loss": 0.0009,
"step": 56
},
{
"epoch": 0.9011857707509882,
"grad_norm": 0.06215568631887436,
"learning_rate": 4.397316598723385e-05,
"loss": 0.0011,
"step": 57
},
{
"epoch": 0.9169960474308301,
"grad_norm": 0.05936681851744652,
"learning_rate": 3.226975564787322e-05,
"loss": 0.0011,
"step": 58
},
{
"epoch": 0.932806324110672,
"grad_norm": 0.045637015253305435,
"learning_rate": 2.1596762663442218e-05,
"loss": 0.0011,
"step": 59
},
{
"epoch": 0.9486166007905138,
"grad_norm": 0.03515447676181793,
"learning_rate": 1.257446259144494e-05,
"loss": 0.001,
"step": 60
},
{
"epoch": 0.9644268774703557,
"grad_norm": 0.04813205078244209,
"learning_rate": 5.727198717339511e-06,
"loss": 0.0011,
"step": 61
},
{
"epoch": 0.9802371541501976,
"grad_norm": 0.05862165987491608,
"learning_rate": 1.4529091286973995e-06,
"loss": 0.0012,
"step": 62
},
{
"epoch": 0.9960474308300395,
"grad_norm": 0.07167425751686096,
"learning_rate": 0.0,
"loss": 0.0096,
"step": 63
}
],
"logging_steps": 1,
"max_steps": 63,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 16,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 9264114014617600.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}