0x1202's picture
Training in progress, step 50, checkpoint
76367f0 verified
raw
history blame
10.2 kB
{
"best_metric": 0.03938543424010277,
"best_model_checkpoint": "miner_id_24/checkpoint-50",
"epoch": 0.91324200913242,
"eval_steps": 25,
"global_step": 50,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0182648401826484,
"grad_norm": 0.8473725318908691,
"learning_rate": 2.9999999999999997e-05,
"loss": 1.7975,
"step": 1
},
{
"epoch": 0.0182648401826484,
"eval_loss": 1.8122729063034058,
"eval_runtime": 3.8568,
"eval_samples_per_second": 12.964,
"eval_steps_per_second": 1.815,
"step": 1
},
{
"epoch": 0.0365296803652968,
"grad_norm": 0.9350575804710388,
"learning_rate": 5.9999999999999995e-05,
"loss": 1.764,
"step": 2
},
{
"epoch": 0.0547945205479452,
"grad_norm": 0.995567798614502,
"learning_rate": 8.999999999999999e-05,
"loss": 2.0446,
"step": 3
},
{
"epoch": 0.0730593607305936,
"grad_norm": 1.407714605331421,
"learning_rate": 0.00011999999999999999,
"loss": 2.0893,
"step": 4
},
{
"epoch": 0.091324200913242,
"grad_norm": 1.8826767206192017,
"learning_rate": 0.00015,
"loss": 2.7079,
"step": 5
},
{
"epoch": 0.1095890410958904,
"grad_norm": 1.0161590576171875,
"learning_rate": 0.00017999999999999998,
"loss": 1.9996,
"step": 6
},
{
"epoch": 0.1278538812785388,
"grad_norm": 1.010097861289978,
"learning_rate": 0.00020999999999999998,
"loss": 1.9585,
"step": 7
},
{
"epoch": 0.1461187214611872,
"grad_norm": 1.0560206174850464,
"learning_rate": 0.00023999999999999998,
"loss": 1.7454,
"step": 8
},
{
"epoch": 0.1643835616438356,
"grad_norm": 1.162116527557373,
"learning_rate": 0.00027,
"loss": 1.6178,
"step": 9
},
{
"epoch": 0.182648401826484,
"grad_norm": 1.4233083724975586,
"learning_rate": 0.0003,
"loss": 1.5871,
"step": 10
},
{
"epoch": 0.2009132420091324,
"grad_norm": 1.7879890203475952,
"learning_rate": 0.0002999259840548597,
"loss": 1.364,
"step": 11
},
{
"epoch": 0.2191780821917808,
"grad_norm": 1.9744043350219727,
"learning_rate": 0.0002997040092642407,
"loss": 1.3302,
"step": 12
},
{
"epoch": 0.2374429223744292,
"grad_norm": 2.14997935295105,
"learning_rate": 0.000299334294690462,
"loss": 1.1676,
"step": 13
},
{
"epoch": 0.2557077625570776,
"grad_norm": 2.865056037902832,
"learning_rate": 0.0002988172051971717,
"loss": 1.179,
"step": 14
},
{
"epoch": 0.273972602739726,
"grad_norm": 2.2772552967071533,
"learning_rate": 0.00029815325108927063,
"loss": 1.0101,
"step": 15
},
{
"epoch": 0.2922374429223744,
"grad_norm": 1.7772722244262695,
"learning_rate": 0.0002973430876093033,
"loss": 0.924,
"step": 16
},
{
"epoch": 0.3105022831050228,
"grad_norm": 1.6902390718460083,
"learning_rate": 0.0002963875142908121,
"loss": 0.8714,
"step": 17
},
{
"epoch": 0.3287671232876712,
"grad_norm": 1.3245774507522583,
"learning_rate": 0.00029528747416929463,
"loss": 0.849,
"step": 18
},
{
"epoch": 0.3470319634703196,
"grad_norm": 1.3994680643081665,
"learning_rate": 0.0002940440528515414,
"loss": 0.6922,
"step": 19
},
{
"epoch": 0.365296803652968,
"grad_norm": 1.3206199407577515,
"learning_rate": 0.00029265847744427303,
"loss": 0.6246,
"step": 20
},
{
"epoch": 0.3835616438356164,
"grad_norm": 1.5602670907974243,
"learning_rate": 0.0002911321153431338,
"loss": 0.4424,
"step": 21
},
{
"epoch": 0.4018264840182648,
"grad_norm": 1.6248738765716553,
"learning_rate": 0.00028946647288323766,
"loss": 0.3231,
"step": 22
},
{
"epoch": 0.4200913242009132,
"grad_norm": 2.217292547225952,
"learning_rate": 0.00028766319385259713,
"loss": 0.4047,
"step": 23
},
{
"epoch": 0.4383561643835616,
"grad_norm": 1.8803366422653198,
"learning_rate": 0.00028572405786990294,
"loss": 0.2903,
"step": 24
},
{
"epoch": 0.45662100456621,
"grad_norm": 2.2177164554595947,
"learning_rate": 0.00028365097862825513,
"loss": 0.2043,
"step": 25
},
{
"epoch": 0.45662100456621,
"eval_loss": 0.2833889126777649,
"eval_runtime": 3.4109,
"eval_samples_per_second": 14.659,
"eval_steps_per_second": 2.052,
"step": 25
},
{
"epoch": 0.4748858447488584,
"grad_norm": 2.0055646896362305,
"learning_rate": 0.0002814460020065795,
"loss": 0.1663,
"step": 26
},
{
"epoch": 0.4931506849315068,
"grad_norm": 6.125588893890381,
"learning_rate": 0.0002791113040505915,
"loss": 0.4882,
"step": 27
},
{
"epoch": 0.5114155251141552,
"grad_norm": 2.6787264347076416,
"learning_rate": 0.00027664918882530225,
"loss": 0.267,
"step": 28
},
{
"epoch": 0.5296803652968036,
"grad_norm": 2.423906087875366,
"learning_rate": 0.00027406208614118424,
"loss": 0.2878,
"step": 29
},
{
"epoch": 0.547945205479452,
"grad_norm": 2.6322414875030518,
"learning_rate": 0.0002713525491562421,
"loss": 0.3463,
"step": 30
},
{
"epoch": 0.5662100456621004,
"grad_norm": 2.0104191303253174,
"learning_rate": 0.00026852325185635354,
"loss": 0.2524,
"step": 31
},
{
"epoch": 0.5844748858447488,
"grad_norm": 1.3756883144378662,
"learning_rate": 0.00026557698641636835,
"loss": 0.1495,
"step": 32
},
{
"epoch": 0.6027397260273972,
"grad_norm": 1.8245288133621216,
"learning_rate": 0.0002625166604445689,
"loss": 0.1895,
"step": 33
},
{
"epoch": 0.6210045662100456,
"grad_norm": 1.0265326499938965,
"learning_rate": 0.0002593452941132117,
"loss": 0.0832,
"step": 34
},
{
"epoch": 0.639269406392694,
"grad_norm": 0.7454352378845215,
"learning_rate": 0.00025606601717798207,
"loss": 0.06,
"step": 35
},
{
"epoch": 0.6575342465753424,
"grad_norm": 0.8095255494117737,
"learning_rate": 0.0002526820658893033,
"loss": 0.064,
"step": 36
},
{
"epoch": 0.6757990867579908,
"grad_norm": 0.7397829294204712,
"learning_rate": 0.00024919677979854776,
"loss": 0.0462,
"step": 37
},
{
"epoch": 0.6940639269406392,
"grad_norm": 0.965843915939331,
"learning_rate": 0.0002456135984623034,
"loss": 0.048,
"step": 38
},
{
"epoch": 0.7123287671232876,
"grad_norm": 0.4619573652744293,
"learning_rate": 0.00024193605804794646,
"loss": 0.02,
"step": 39
},
{
"epoch": 0.730593607305936,
"grad_norm": 1.6063387393951416,
"learning_rate": 0.00023816778784387094,
"loss": 0.1591,
"step": 40
},
{
"epoch": 0.7488584474885844,
"grad_norm": 1.861398696899414,
"learning_rate": 0.00023431250667781958,
"loss": 0.144,
"step": 41
},
{
"epoch": 0.7671232876712328,
"grad_norm": 2.341588258743286,
"learning_rate": 0.00023037401924684946,
"loss": 0.1565,
"step": 42
},
{
"epoch": 0.7853881278538812,
"grad_norm": 1.8141049146652222,
"learning_rate": 0.00022635621236255567,
"loss": 0.1546,
"step": 43
},
{
"epoch": 0.8036529680365296,
"grad_norm": 2.6451475620269775,
"learning_rate": 0.00022226305111525726,
"loss": 0.1163,
"step": 44
},
{
"epoch": 0.821917808219178,
"grad_norm": 0.9599330425262451,
"learning_rate": 0.00021809857496093199,
"loss": 0.0565,
"step": 45
},
{
"epoch": 0.8401826484018264,
"grad_norm": 0.49869561195373535,
"learning_rate": 0.00021386689373476087,
"loss": 0.0417,
"step": 46
},
{
"epoch": 0.8584474885844748,
"grad_norm": 0.35756292939186096,
"learning_rate": 0.00020957218359521706,
"loss": 0.0231,
"step": 47
},
{
"epoch": 0.8767123287671232,
"grad_norm": 0.629383385181427,
"learning_rate": 0.0002052186829027017,
"loss": 0.0203,
"step": 48
},
{
"epoch": 0.8949771689497716,
"grad_norm": 0.6143040060997009,
"learning_rate": 0.00020081068803679371,
"loss": 0.0258,
"step": 49
},
{
"epoch": 0.91324200913242,
"grad_norm": 0.9527487754821777,
"learning_rate": 0.0001963525491562421,
"loss": 0.0326,
"step": 50
},
{
"epoch": 0.91324200913242,
"eval_loss": 0.03938543424010277,
"eval_runtime": 3.9802,
"eval_samples_per_second": 12.562,
"eval_steps_per_second": 1.759,
"step": 50
}
],
"logging_steps": 1,
"max_steps": 110,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 50,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 1,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.310961668456448e+16,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}