|
{ |
|
"best_metric": 2.433828830718994, |
|
"best_model_checkpoint": "miner_id_24/checkpoint-50", |
|
"epoch": 3.011204481792717, |
|
"eval_steps": 50, |
|
"global_step": 67, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.04481792717086835, |
|
"grad_norm": 25.642864227294922, |
|
"learning_rate": 5e-06, |
|
"loss": 10.3377, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.04481792717086835, |
|
"eval_loss": 10.295247077941895, |
|
"eval_runtime": 1.1874, |
|
"eval_samples_per_second": 252.661, |
|
"eval_steps_per_second": 16.002, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0896358543417367, |
|
"grad_norm": 25.922531127929688, |
|
"learning_rate": 1e-05, |
|
"loss": 10.223, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.13445378151260504, |
|
"grad_norm": 25.65165138244629, |
|
"learning_rate": 1.5e-05, |
|
"loss": 10.2814, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.1792717086834734, |
|
"grad_norm": 25.051061630249023, |
|
"learning_rate": 2e-05, |
|
"loss": 10.3075, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.22408963585434175, |
|
"grad_norm": 27.376066207885742, |
|
"learning_rate": 2.5e-05, |
|
"loss": 10.079, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.2689075630252101, |
|
"grad_norm": 28.760995864868164, |
|
"learning_rate": 3e-05, |
|
"loss": 9.866, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.3137254901960784, |
|
"grad_norm": 30.537446975708008, |
|
"learning_rate": 3.5e-05, |
|
"loss": 9.6662, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.3585434173669468, |
|
"grad_norm": 31.815292358398438, |
|
"learning_rate": 4e-05, |
|
"loss": 9.3129, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.40336134453781514, |
|
"grad_norm": 30.294048309326172, |
|
"learning_rate": 4.5e-05, |
|
"loss": 9.0616, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.4481792717086835, |
|
"grad_norm": 27.685863494873047, |
|
"learning_rate": 5e-05, |
|
"loss": 8.6176, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.49299719887955185, |
|
"grad_norm": 30.450334548950195, |
|
"learning_rate": 5.500000000000001e-05, |
|
"loss": 8.2255, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.5378151260504201, |
|
"grad_norm": 26.99419403076172, |
|
"learning_rate": 6e-05, |
|
"loss": 7.785, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.5826330532212886, |
|
"grad_norm": 25.451345443725586, |
|
"learning_rate": 6.500000000000001e-05, |
|
"loss": 7.3677, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.6274509803921569, |
|
"grad_norm": 19.89314079284668, |
|
"learning_rate": 7e-05, |
|
"loss": 7.0903, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.6722689075630253, |
|
"grad_norm": 17.851165771484375, |
|
"learning_rate": 7.500000000000001e-05, |
|
"loss": 6.638, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.7170868347338936, |
|
"grad_norm": 17.867111206054688, |
|
"learning_rate": 8e-05, |
|
"loss": 6.256, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.7619047619047619, |
|
"grad_norm": 15.737651824951172, |
|
"learning_rate": 8.5e-05, |
|
"loss": 5.9234, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.8067226890756303, |
|
"grad_norm": 13.73977279663086, |
|
"learning_rate": 9e-05, |
|
"loss": 5.9018, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.8515406162464986, |
|
"grad_norm": 13.81631088256836, |
|
"learning_rate": 9.5e-05, |
|
"loss": 5.4379, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.896358543417367, |
|
"grad_norm": 13.534049034118652, |
|
"learning_rate": 0.0001, |
|
"loss": 5.0785, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.9411764705882353, |
|
"grad_norm": 13.347393989562988, |
|
"learning_rate": 9.988834393115767e-05, |
|
"loss": 4.7373, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.9859943977591037, |
|
"grad_norm": 13.346118927001953, |
|
"learning_rate": 9.9553874407739e-05, |
|
"loss": 4.534, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 1.0336134453781514, |
|
"grad_norm": 23.33193588256836, |
|
"learning_rate": 9.899808525182935e-05, |
|
"loss": 7.0157, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 1.0784313725490196, |
|
"grad_norm": 13.095778465270996, |
|
"learning_rate": 9.822345875271883e-05, |
|
"loss": 4.0933, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 1.123249299719888, |
|
"grad_norm": 11.095013618469238, |
|
"learning_rate": 9.723345458039594e-05, |
|
"loss": 3.4608, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 1.1680672268907564, |
|
"grad_norm": 9.79887580871582, |
|
"learning_rate": 9.603249433382144e-05, |
|
"loss": 3.7063, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 1.2128851540616246, |
|
"grad_norm": 7.285381317138672, |
|
"learning_rate": 9.462594179299406e-05, |
|
"loss": 4.121, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 1.257703081232493, |
|
"grad_norm": 5.901654243469238, |
|
"learning_rate": 9.302007896300698e-05, |
|
"loss": 2.883, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 1.3025210084033614, |
|
"grad_norm": 5.953999996185303, |
|
"learning_rate": 9.122207801708802e-05, |
|
"loss": 2.8394, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 1.3473389355742298, |
|
"grad_norm": 4.182607650756836, |
|
"learning_rate": 8.923996926393305e-05, |
|
"loss": 2.7042, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 1.392156862745098, |
|
"grad_norm": 3.3463335037231445, |
|
"learning_rate": 8.708260528239788e-05, |
|
"loss": 3.0285, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 1.4369747899159664, |
|
"grad_norm": 3.1249260902404785, |
|
"learning_rate": 8.475962138373213e-05, |
|
"loss": 2.7339, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 1.4817927170868348, |
|
"grad_norm": 2.7283530235290527, |
|
"learning_rate": 8.228139257794012e-05, |
|
"loss": 2.7393, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 1.526610644257703, |
|
"grad_norm": 2.110501527786255, |
|
"learning_rate": 7.965898723646776e-05, |
|
"loss": 2.7651, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 1.5714285714285714, |
|
"grad_norm": 2.313706874847412, |
|
"learning_rate": 7.690411765816864e-05, |
|
"loss": 2.718, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 1.6162464985994398, |
|
"grad_norm": 2.0307371616363525, |
|
"learning_rate": 7.402908775933419e-05, |
|
"loss": 2.8055, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 1.661064425770308, |
|
"grad_norm": 1.9101359844207764, |
|
"learning_rate": 7.104673812141675e-05, |
|
"loss": 2.8809, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 1.7058823529411766, |
|
"grad_norm": 1.9070402383804321, |
|
"learning_rate": 6.797038864187564e-05, |
|
"loss": 2.8126, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 1.7507002801120448, |
|
"grad_norm": 1.455893635749817, |
|
"learning_rate": 6.481377904428171e-05, |
|
"loss": 2.4823, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 1.795518207282913, |
|
"grad_norm": 1.853187918663025, |
|
"learning_rate": 6.159100751337642e-05, |
|
"loss": 2.6832, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 1.8403361344537816, |
|
"grad_norm": 1.4273921251296997, |
|
"learning_rate": 5.831646772915651e-05, |
|
"loss": 2.5476, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 1.8851540616246498, |
|
"grad_norm": 1.2385693788528442, |
|
"learning_rate": 5.5004784581204927e-05, |
|
"loss": 2.4887, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 1.9299719887955182, |
|
"grad_norm": 1.0763683319091797, |
|
"learning_rate": 5.167074885038373e-05, |
|
"loss": 2.5657, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 1.9747899159663866, |
|
"grad_norm": 1.6198722124099731, |
|
"learning_rate": 4.832925114961629e-05, |
|
"loss": 3.0987, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 2.022408963585434, |
|
"grad_norm": 2.0531015396118164, |
|
"learning_rate": 4.4995215418795085e-05, |
|
"loss": 4.2895, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 2.0672268907563027, |
|
"grad_norm": 1.1228376626968384, |
|
"learning_rate": 4.1683532270843504e-05, |
|
"loss": 2.7512, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 2.112044817927171, |
|
"grad_norm": 2.324376344680786, |
|
"learning_rate": 3.840899248662358e-05, |
|
"loss": 3.5319, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 2.156862745098039, |
|
"grad_norm": 1.4633479118347168, |
|
"learning_rate": 3.5186220955718306e-05, |
|
"loss": 2.5993, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 2.2016806722689077, |
|
"grad_norm": 1.424329161643982, |
|
"learning_rate": 3.202961135812437e-05, |
|
"loss": 2.6048, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 2.246498599439776, |
|
"grad_norm": 1.2687908411026, |
|
"learning_rate": 2.895326187858326e-05, |
|
"loss": 2.4032, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 2.246498599439776, |
|
"eval_loss": 2.433828830718994, |
|
"eval_runtime": 1.1804, |
|
"eval_samples_per_second": 254.153, |
|
"eval_steps_per_second": 16.096, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 2.291316526610644, |
|
"grad_norm": 1.2064857482910156, |
|
"learning_rate": 2.5970912240665813e-05, |
|
"loss": 2.4134, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 2.3361344537815127, |
|
"grad_norm": 1.254510760307312, |
|
"learning_rate": 2.3095882341831372e-05, |
|
"loss": 2.4845, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 2.380952380952381, |
|
"grad_norm": 1.4852594137191772, |
|
"learning_rate": 2.0341012763532243e-05, |
|
"loss": 2.4392, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 2.425770308123249, |
|
"grad_norm": 0.9326648116111755, |
|
"learning_rate": 1.771860742205988e-05, |
|
"loss": 2.422, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 2.4705882352941178, |
|
"grad_norm": 1.0687801837921143, |
|
"learning_rate": 1.5240378616267886e-05, |
|
"loss": 2.3847, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 2.515406162464986, |
|
"grad_norm": 0.8696810007095337, |
|
"learning_rate": 1.2917394717602121e-05, |
|
"loss": 2.3407, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 2.560224089635854, |
|
"grad_norm": 1.5607281923294067, |
|
"learning_rate": 1.0760030736066951e-05, |
|
"loss": 2.9372, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 2.6050420168067228, |
|
"grad_norm": 1.2624620199203491, |
|
"learning_rate": 8.777921982911996e-06, |
|
"loss": 2.5201, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 2.649859943977591, |
|
"grad_norm": 1.4880586862564087, |
|
"learning_rate": 6.979921036993042e-06, |
|
"loss": 2.9394, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 2.6946778711484596, |
|
"grad_norm": 0.8246588706970215, |
|
"learning_rate": 5.374058207005944e-06, |
|
"loss": 2.3346, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 2.7394957983193278, |
|
"grad_norm": 1.5645203590393066, |
|
"learning_rate": 3.967505666178556e-06, |
|
"loss": 2.7406, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 2.784313725490196, |
|
"grad_norm": 1.2056126594543457, |
|
"learning_rate": 2.7665454196040664e-06, |
|
"loss": 2.396, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 2.8291316526610646, |
|
"grad_norm": 2.0966439247131348, |
|
"learning_rate": 1.7765412472811771e-06, |
|
"loss": 3.1009, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 2.8739495798319328, |
|
"grad_norm": 0.9577468633651733, |
|
"learning_rate": 1.0019147481706625e-06, |
|
"loss": 2.4537, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 2.918767507002801, |
|
"grad_norm": 1.0243852138519287, |
|
"learning_rate": 4.461255922609986e-07, |
|
"loss": 2.4703, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 2.9635854341736696, |
|
"grad_norm": 1.1177788972854614, |
|
"learning_rate": 1.1165606884234181e-07, |
|
"loss": 2.6208, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 3.011204481792717, |
|
"grad_norm": 3.6235415935516357, |
|
"learning_rate": 0.0, |
|
"loss": 4.9354, |
|
"step": 67 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 67, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 4, |
|
"save_steps": 50, |
|
"stateful_callbacks": { |
|
"EarlyStoppingCallback": { |
|
"args": { |
|
"early_stopping_patience": 5, |
|
"early_stopping_threshold": 0.0 |
|
}, |
|
"attributes": { |
|
"early_stopping_patience_counter": 0 |
|
} |
|
}, |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 701219513303040.0, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|