{ "best_metric": 0.7010467052459717, "best_model_checkpoint": "miner_id_24/checkpoint-50", "epoch": 1.3840830449826989, "eval_steps": 25, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01384083044982699, "grad_norm": 40.9892692565918, "learning_rate": 2.9999999999999997e-05, "loss": 30.7293, "step": 1 }, { "epoch": 0.01384083044982699, "eval_loss": 7.911479949951172, "eval_runtime": 5.1322, "eval_samples_per_second": 9.742, "eval_steps_per_second": 1.364, "step": 1 }, { "epoch": 0.02768166089965398, "grad_norm": 44.54869079589844, "learning_rate": 5.9999999999999995e-05, "loss": 31.553, "step": 2 }, { "epoch": 0.04152249134948097, "grad_norm": 46.91074752807617, "learning_rate": 8.999999999999999e-05, "loss": 30.4963, "step": 3 }, { "epoch": 0.05536332179930796, "grad_norm": 49.72462844848633, "learning_rate": 0.00011999999999999999, "loss": 28.3763, "step": 4 }, { "epoch": 0.06920415224913495, "grad_norm": 50.18564224243164, "learning_rate": 0.00015, "loss": 22.6685, "step": 5 }, { "epoch": 0.08304498269896193, "grad_norm": 44.5780143737793, "learning_rate": 0.00017999999999999998, "loss": 14.5096, "step": 6 }, { "epoch": 0.09688581314878893, "grad_norm": 30.434995651245117, "learning_rate": 0.00020999999999999998, "loss": 7.348, "step": 7 }, { "epoch": 0.11072664359861592, "grad_norm": 15.010186195373535, "learning_rate": 0.00023999999999999998, "loss": 4.1836, "step": 8 }, { "epoch": 0.1245674740484429, "grad_norm": 8.79946231842041, "learning_rate": 0.00027, "loss": 3.2994, "step": 9 }, { "epoch": 0.1384083044982699, "grad_norm": 3.8603556156158447, "learning_rate": 0.0003, "loss": 2.8264, "step": 10 }, { "epoch": 0.1522491349480969, "grad_norm": 10.656445503234863, "learning_rate": 0.0002999593861769192, "loss": 3.1222, "step": 11 }, { "epoch": 0.16608996539792387, "grad_norm": 5.157639980316162, "learning_rate": 0.00029983756670077865, "loss": 2.9336, "step": 12 }, { "epoch": 0.17993079584775087, "grad_norm": 3.850473165512085, "learning_rate": 0.0002996346075389736, "loss": 2.8686, "step": 13 }, { "epoch": 0.19377162629757785, "grad_norm": 8.165860176086426, "learning_rate": 0.00029935061859747065, "loss": 2.9763, "step": 14 }, { "epoch": 0.20761245674740483, "grad_norm": 6.534956455230713, "learning_rate": 0.00029898575366129145, "loss": 3.1093, "step": 15 }, { "epoch": 0.22145328719723184, "grad_norm": 5.248986721038818, "learning_rate": 0.0002985402103112355, "loss": 2.896, "step": 16 }, { "epoch": 0.23529411764705882, "grad_norm": 3.770387887954712, "learning_rate": 0.00029801422981688686, "loss": 2.7932, "step": 17 }, { "epoch": 0.2491349480968858, "grad_norm": 9.586681365966797, "learning_rate": 0.000297408097005962, "loss": 2.9843, "step": 18 }, { "epoch": 0.2629757785467128, "grad_norm": 5.363808631896973, "learning_rate": 0.0002967221401100708, "loss": 2.9857, "step": 19 }, { "epoch": 0.2768166089965398, "grad_norm": 3.2338597774505615, "learning_rate": 0.00029595673058697357, "loss": 2.8622, "step": 20 }, { "epoch": 0.2906574394463668, "grad_norm": 3.012906312942505, "learning_rate": 0.0002951122829194295, "loss": 2.6277, "step": 21 }, { "epoch": 0.3044982698961938, "grad_norm": 6.697887420654297, "learning_rate": 0.0002941892543907478, "loss": 3.0416, "step": 22 }, { "epoch": 0.31833910034602075, "grad_norm": 2.6583356857299805, "learning_rate": 0.0002931881448371598, "loss": 2.6767, "step": 23 }, { "epoch": 0.33217993079584773, "grad_norm": 1.7108440399169922, "learning_rate": 0.0002921094963771494, "loss": 2.7423, "step": 24 }, { "epoch": 0.3460207612456747, "grad_norm": 2.377303123474121, "learning_rate": 0.0002909538931178862, "loss": 2.811, "step": 25 }, { "epoch": 0.3460207612456747, "eval_loss": 0.6890237331390381, "eval_runtime": 5.2521, "eval_samples_per_second": 9.52, "eval_steps_per_second": 1.333, "step": 25 }, { "epoch": 0.35986159169550175, "grad_norm": 2.390648603439331, "learning_rate": 0.00028972196083892134, "loss": 2.7977, "step": 26 }, { "epoch": 0.3737024221453287, "grad_norm": 2.9108152389526367, "learning_rate": 0.00028841436665331634, "loss": 2.9092, "step": 27 }, { "epoch": 0.3875432525951557, "grad_norm": 2.952723979949951, "learning_rate": 0.0002870318186463901, "loss": 2.7992, "step": 28 }, { "epoch": 0.4013840830449827, "grad_norm": 2.9674861431121826, "learning_rate": 0.00028557506549227804, "loss": 2.8412, "step": 29 }, { "epoch": 0.41522491349480967, "grad_norm": 1.8786237239837646, "learning_rate": 0.0002840448960485118, "loss": 2.7702, "step": 30 }, { "epoch": 0.4290657439446367, "grad_norm": 3.3518176078796387, "learning_rate": 0.000282442138928839, "loss": 2.8484, "step": 31 }, { "epoch": 0.4429065743944637, "grad_norm": 1.953736662864685, "learning_rate": 0.0002807676620545143, "loss": 2.5684, "step": 32 }, { "epoch": 0.45674740484429066, "grad_norm": 8.230539321899414, "learning_rate": 0.00027902237218430483, "loss": 3.2569, "step": 33 }, { "epoch": 0.47058823529411764, "grad_norm": 7.019381046295166, "learning_rate": 0.00027720721442346387, "loss": 3.1625, "step": 34 }, { "epoch": 0.4844290657439446, "grad_norm": 5.099842071533203, "learning_rate": 0.00027532317171194046, "loss": 3.0486, "step": 35 }, { "epoch": 0.4982698961937716, "grad_norm": 2.0828545093536377, "learning_rate": 0.00027337126429209934, "loss": 2.832, "step": 36 }, { "epoch": 0.5121107266435986, "grad_norm": 1.9583015441894531, "learning_rate": 0.0002713525491562421, "loss": 2.8298, "step": 37 }, { "epoch": 0.5259515570934256, "grad_norm": 2.480612277984619, "learning_rate": 0.00026926811947422714, "loss": 2.7638, "step": 38 }, { "epoch": 0.5397923875432526, "grad_norm": 2.304385185241699, "learning_rate": 0.0002671191040014989, "loss": 2.7757, "step": 39 }, { "epoch": 0.5536332179930796, "grad_norm": 3.724012851715088, "learning_rate": 0.00026490666646784665, "loss": 2.8564, "step": 40 }, { "epoch": 0.5674740484429066, "grad_norm": 6.570769309997559, "learning_rate": 0.00026263200494722487, "loss": 3.0332, "step": 41 }, { "epoch": 0.5813148788927336, "grad_norm": 3.7891201972961426, "learning_rate": 0.0002602963512089743, "loss": 2.8577, "step": 42 }, { "epoch": 0.5951557093425606, "grad_norm": 3.1248676776885986, "learning_rate": 0.00025790097005079764, "loss": 2.8053, "step": 43 }, { "epoch": 0.6089965397923875, "grad_norm": 2.767016649246216, "learning_rate": 0.0002554471586138493, "loss": 2.7561, "step": 44 }, { "epoch": 0.6228373702422145, "grad_norm": 3.91235089302063, "learning_rate": 0.00025293624568031, "loss": 2.9683, "step": 45 }, { "epoch": 0.6366782006920415, "grad_norm": 3.9107556343078613, "learning_rate": 0.0002503695909538287, "loss": 2.9184, "step": 46 }, { "epoch": 0.6505190311418685, "grad_norm": 3.6938490867614746, "learning_rate": 0.0002477485843232183, "loss": 2.7981, "step": 47 }, { "epoch": 0.6643598615916955, "grad_norm": 1.329195499420166, "learning_rate": 0.0002450746451098065, "loss": 2.7712, "step": 48 }, { "epoch": 0.6782006920415224, "grad_norm": 1.1611841917037964, "learning_rate": 0.0002423492212988487, "loss": 2.778, "step": 49 }, { "epoch": 0.6920415224913494, "grad_norm": 2.376075506210327, "learning_rate": 0.00023957378875541792, "loss": 2.6997, "step": 50 }, { "epoch": 0.6920415224913494, "eval_loss": 0.7010467052459717, "eval_runtime": 5.2581, "eval_samples_per_second": 9.509, "eval_steps_per_second": 1.331, "step": 50 }, { "epoch": 0.7058823529411765, "grad_norm": 9.266826629638672, "learning_rate": 0.00023674985042519795, "loss": 3.4074, "step": 51 }, { "epoch": 0.7197231833910035, "grad_norm": 2.7682924270629883, "learning_rate": 0.00023387893552061199, "loss": 2.6934, "step": 52 }, { "epoch": 0.7335640138408305, "grad_norm": 1.135008454322815, "learning_rate": 0.00023096259869272693, "loss": 2.7218, "step": 53 }, { "epoch": 0.7474048442906575, "grad_norm": 3.1696879863739014, "learning_rate": 0.00022800241918938228, "loss": 2.8763, "step": 54 }, { "epoch": 0.7612456747404844, "grad_norm": 3.2597336769104004, "learning_rate": 0.000225, "loss": 2.794, "step": 55 }, { "epoch": 0.7750865051903114, "grad_norm": 2.9772768020629883, "learning_rate": 0.00022195696698753693, "loss": 2.9002, "step": 56 }, { "epoch": 0.7889273356401384, "grad_norm": 2.6768057346343994, "learning_rate": 0.00021887496800805173, "loss": 2.8117, "step": 57 }, { "epoch": 0.8027681660899654, "grad_norm": 1.5049817562103271, "learning_rate": 0.0002157556720183616, "loss": 2.8173, "step": 58 }, { "epoch": 0.8166089965397924, "grad_norm": 1.5916688442230225, "learning_rate": 0.00021260076817227266, "loss": 2.7637, "step": 59 }, { "epoch": 0.8304498269896193, "grad_norm": 2.124816656112671, "learning_rate": 0.0002094119649058735, "loss": 2.7946, "step": 60 }, { "epoch": 0.8442906574394463, "grad_norm": 1.6769332885742188, "learning_rate": 0.0002061909890123868, "loss": 2.7702, "step": 61 }, { "epoch": 0.8581314878892734, "grad_norm": 3.2984721660614014, "learning_rate": 0.0002029395847070803, "loss": 2.7117, "step": 62 }, { "epoch": 0.8719723183391004, "grad_norm": 1.5877281427383423, "learning_rate": 0.0001996595126827437, "loss": 2.705, "step": 63 }, { "epoch": 0.8858131487889274, "grad_norm": 3.0824265480041504, "learning_rate": 0.0001963525491562421, "loss": 2.8567, "step": 64 }, { "epoch": 0.8996539792387543, "grad_norm": 3.194683313369751, "learning_rate": 0.00019302048490666353, "loss": 2.8681, "step": 65 }, { "epoch": 0.9134948096885813, "grad_norm": 2.6802637577056885, "learning_rate": 0.00018966512430558034, "loss": 2.8496, "step": 66 }, { "epoch": 0.9273356401384083, "grad_norm": 3.094783067703247, "learning_rate": 0.00018628828433995013, "loss": 2.8264, "step": 67 }, { "epoch": 0.9411764705882353, "grad_norm": 1.2324931621551514, "learning_rate": 0.00018289179362818546, "loss": 2.7473, "step": 68 }, { "epoch": 0.9550173010380623, "grad_norm": 2.415250778198242, "learning_rate": 0.0001794774914299245, "loss": 2.835, "step": 69 }, { "epoch": 0.9688581314878892, "grad_norm": 2.9529473781585693, "learning_rate": 0.00017604722665003956, "loss": 2.958, "step": 70 }, { "epoch": 0.9826989619377162, "grad_norm": 1.7587758302688599, "learning_rate": 0.00017260285683742246, "loss": 2.8396, "step": 71 }, { "epoch": 0.9965397923875432, "grad_norm": 1.809816598892212, "learning_rate": 0.00016914624717908922, "loss": 2.8112, "step": 72 }, { "epoch": 1.0103806228373702, "grad_norm": 1.430889368057251, "learning_rate": 0.000165679269490148, "loss": 2.7052, "step": 73 }, { "epoch": 1.0242214532871972, "grad_norm": 2.520479440689087, "learning_rate": 0.00016220380120017872, "loss": 2.7617, "step": 74 }, { "epoch": 1.0380622837370241, "grad_norm": 1.8224382400512695, "learning_rate": 0.00015872172433657134, "loss": 2.7747, "step": 75 }, { "epoch": 1.0380622837370241, "eval_loss": 0.6992730498313904, "eval_runtime": 5.2527, "eval_samples_per_second": 9.519, "eval_steps_per_second": 1.333, "step": 75 }, { "epoch": 1.0519031141868511, "grad_norm": 1.3448646068572998, "learning_rate": 0.00015523492450537517, "loss": 2.763, "step": 76 }, { "epoch": 1.065743944636678, "grad_norm": 1.085045576095581, "learning_rate": 0.00015174528987020957, "loss": 2.7388, "step": 77 }, { "epoch": 1.0795847750865053, "grad_norm": 1.328552007675171, "learning_rate": 0.00014825471012979045, "loss": 2.7668, "step": 78 }, { "epoch": 1.0934256055363323, "grad_norm": 1.4537187814712524, "learning_rate": 0.0001447650754946249, "loss": 2.7073, "step": 79 }, { "epoch": 1.1072664359861593, "grad_norm": 1.3335843086242676, "learning_rate": 0.00014127827566342863, "loss": 2.796, "step": 80 }, { "epoch": 1.1211072664359862, "grad_norm": 2.351191997528076, "learning_rate": 0.00013779619879982126, "loss": 2.8494, "step": 81 }, { "epoch": 1.1349480968858132, "grad_norm": 2.260059118270874, "learning_rate": 0.000134320730509852, "loss": 2.7993, "step": 82 }, { "epoch": 1.1487889273356402, "grad_norm": 2.7046008110046387, "learning_rate": 0.00013085375282091078, "loss": 2.837, "step": 83 }, { "epoch": 1.1626297577854672, "grad_norm": 1.4147647619247437, "learning_rate": 0.0001273971431625775, "loss": 2.7192, "step": 84 }, { "epoch": 1.1764705882352942, "grad_norm": 2.916611909866333, "learning_rate": 0.00012395277334996044, "loss": 2.8989, "step": 85 }, { "epoch": 1.1903114186851211, "grad_norm": 3.9687960147857666, "learning_rate": 0.00012052250857007545, "loss": 2.728, "step": 86 }, { "epoch": 1.2041522491349481, "grad_norm": 1.5941438674926758, "learning_rate": 0.00011710820637181447, "loss": 2.7109, "step": 87 }, { "epoch": 1.217993079584775, "grad_norm": 1.5909277200698853, "learning_rate": 0.00011371171566004985, "loss": 2.78, "step": 88 }, { "epoch": 1.231833910034602, "grad_norm": 3.141655206680298, "learning_rate": 0.0001103348756944197, "loss": 2.7588, "step": 89 }, { "epoch": 1.245674740484429, "grad_norm": 2.018080234527588, "learning_rate": 0.0001069795150933365, "loss": 2.7771, "step": 90 }, { "epoch": 1.259515570934256, "grad_norm": 1.5090923309326172, "learning_rate": 0.0001036474508437579, "loss": 2.7333, "step": 91 }, { "epoch": 1.273356401384083, "grad_norm": 1.451968789100647, "learning_rate": 0.0001003404873172563, "loss": 2.7345, "step": 92 }, { "epoch": 1.28719723183391, "grad_norm": 2.1932549476623535, "learning_rate": 9.706041529291968e-05, "loss": 2.7071, "step": 93 }, { "epoch": 1.301038062283737, "grad_norm": 2.9638872146606445, "learning_rate": 9.380901098761319e-05, "loss": 2.8978, "step": 94 }, { "epoch": 1.314878892733564, "grad_norm": 2.9430599212646484, "learning_rate": 9.058803509412646e-05, "loss": 2.8598, "step": 95 }, { "epoch": 1.328719723183391, "grad_norm": 2.1553397178649902, "learning_rate": 8.739923182772731e-05, "loss": 2.6598, "step": 96 }, { "epoch": 1.342560553633218, "grad_norm": 1.4002549648284912, "learning_rate": 8.424432798163836e-05, "loss": 2.7373, "step": 97 }, { "epoch": 1.356401384083045, "grad_norm": 2.587266206741333, "learning_rate": 8.112503199194821e-05, "loss": 2.785, "step": 98 }, { "epoch": 1.370242214532872, "grad_norm": 1.669758915901184, "learning_rate": 7.804303301246311e-05, "loss": 2.6785, "step": 99 }, { "epoch": 1.3840830449826989, "grad_norm": 3.042837381362915, "learning_rate": 7.500000000000002e-05, "loss": 2.858, "step": 100 }, { "epoch": 1.3840830449826989, "eval_loss": 0.7048866748809814, "eval_runtime": 5.2514, "eval_samples_per_second": 9.521, "eval_steps_per_second": 1.333, "step": 100 } ], "logging_steps": 1, "max_steps": 145, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 1, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 1 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.3877308048696934e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }