{ "best_metric": 1.7268242835998535, "best_model_checkpoint": "miner_id_24/checkpoint-75", "epoch": 0.628358825961141, "eval_steps": 25, "global_step": 95, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.006614303431169905, "grad_norm": 17.736679077148438, "learning_rate": 3.3333333333333335e-05, "loss": 11.468, "step": 1 }, { "epoch": 0.006614303431169905, "eval_loss": 11.592829704284668, "eval_runtime": 4.0134, "eval_samples_per_second": 12.458, "eval_steps_per_second": 3.239, "step": 1 }, { "epoch": 0.01322860686233981, "grad_norm": 20.307588577270508, "learning_rate": 6.666666666666667e-05, "loss": 11.3596, "step": 2 }, { "epoch": 0.019842910293509715, "grad_norm": 15.574076652526855, "learning_rate": 0.0001, "loss": 10.8161, "step": 3 }, { "epoch": 0.02645721372467962, "grad_norm": 12.424201011657715, "learning_rate": 9.997376600647783e-05, "loss": 9.4755, "step": 4 }, { "epoch": 0.03307151715584952, "grad_norm": 9.448874473571777, "learning_rate": 9.989509461357426e-05, "loss": 8.0971, "step": 5 }, { "epoch": 0.03968582058701943, "grad_norm": 10.42374038696289, "learning_rate": 9.976407754861426e-05, "loss": 7.2079, "step": 6 }, { "epoch": 0.04630012401818934, "grad_norm": 12.758567810058594, "learning_rate": 9.958086757163489e-05, "loss": 5.7105, "step": 7 }, { "epoch": 0.05291442744935924, "grad_norm": 13.52912712097168, "learning_rate": 9.934567829727386e-05, "loss": 3.8279, "step": 8 }, { "epoch": 0.059528730880529145, "grad_norm": 15.979461669921875, "learning_rate": 9.905878394570453e-05, "loss": 3.0876, "step": 9 }, { "epoch": 0.06614303431169904, "grad_norm": 7.501053333282471, "learning_rate": 9.872051902290737e-05, "loss": 2.8203, "step": 10 }, { "epoch": 0.07275733774286895, "grad_norm": 7.698332786560059, "learning_rate": 9.833127793065098e-05, "loss": 3.329, "step": 11 }, { "epoch": 0.07937164117403886, "grad_norm": 7.099416732788086, "learning_rate": 9.789151450663723e-05, "loss": 2.6664, "step": 12 }, { "epoch": 0.08598594460520877, "grad_norm": 15.326375961303711, "learning_rate": 9.740174149534693e-05, "loss": 2.7908, "step": 13 }, { "epoch": 0.09260024803637867, "grad_norm": 10.195456504821777, "learning_rate": 9.686252995020249e-05, "loss": 2.3301, "step": 14 }, { "epoch": 0.09921455146754857, "grad_norm": 4.361770153045654, "learning_rate": 9.627450856774539e-05, "loss": 2.0583, "step": 15 }, { "epoch": 0.10582885489871847, "grad_norm": 3.182706117630005, "learning_rate": 9.563836295460398e-05, "loss": 2.0367, "step": 16 }, { "epoch": 0.11244315832988838, "grad_norm": 5.076944828033447, "learning_rate": 9.495483482810688e-05, "loss": 1.9523, "step": 17 }, { "epoch": 0.11905746176105829, "grad_norm": 8.57449722290039, "learning_rate": 9.422472115147382e-05, "loss": 2.025, "step": 18 }, { "epoch": 0.1256717651922282, "grad_norm": 3.4628255367279053, "learning_rate": 9.3448873204592e-05, "loss": 2.0057, "step": 19 }, { "epoch": 0.1322860686233981, "grad_norm": 2.956812620162964, "learning_rate": 9.2628195591462e-05, "loss": 2.006, "step": 20 }, { "epoch": 0.138900372054568, "grad_norm": 9.580174446105957, "learning_rate": 9.176364518546989e-05, "loss": 2.0139, "step": 21 }, { "epoch": 0.1455146754857379, "grad_norm": 6.208186626434326, "learning_rate": 9.08562300137157e-05, "loss": 1.9206, "step": 22 }, { "epoch": 0.15212897891690783, "grad_norm": 4.192629337310791, "learning_rate": 8.990700808169889e-05, "loss": 2.037, "step": 23 }, { "epoch": 0.15874328234807772, "grad_norm": 7.658245086669922, "learning_rate": 8.891708613973126e-05, "loss": 2.3119, "step": 24 }, { "epoch": 0.1653575857792476, "grad_norm": 6.482032775878906, "learning_rate": 8.788761839251559e-05, "loss": 2.2091, "step": 25 }, { "epoch": 0.1653575857792476, "eval_loss": 2.0168991088867188, "eval_runtime": 4.0539, "eval_samples_per_second": 12.334, "eval_steps_per_second": 3.207, "step": 25 }, { "epoch": 0.17197188921041753, "grad_norm": 8.424822807312012, "learning_rate": 8.681980515339464e-05, "loss": 2.0132, "step": 26 }, { "epoch": 0.17858619264158743, "grad_norm": 3.7590909004211426, "learning_rate": 8.571489144483944e-05, "loss": 1.7841, "step": 27 }, { "epoch": 0.18520049607275735, "grad_norm": 1.9769529104232788, "learning_rate": 8.457416554680877e-05, "loss": 1.7366, "step": 28 }, { "epoch": 0.19181479950392724, "grad_norm": 1.1488988399505615, "learning_rate": 8.339895749467238e-05, "loss": 1.7041, "step": 29 }, { "epoch": 0.19842910293509713, "grad_norm": 3.0149729251861572, "learning_rate": 8.219063752844926e-05, "loss": 1.7511, "step": 30 }, { "epoch": 0.20504340636626706, "grad_norm": 4.112295627593994, "learning_rate": 8.095061449516903e-05, "loss": 1.9589, "step": 31 }, { "epoch": 0.21165770979743695, "grad_norm": 1.706176519393921, "learning_rate": 7.968033420621935e-05, "loss": 1.9123, "step": 32 }, { "epoch": 0.21827201322860687, "grad_norm": 1.1065466403961182, "learning_rate": 7.838127775159452e-05, "loss": 1.8454, "step": 33 }, { "epoch": 0.22488631665977676, "grad_norm": 1.1772329807281494, "learning_rate": 7.705495977301078e-05, "loss": 1.7768, "step": 34 }, { "epoch": 0.23150062009094668, "grad_norm": 1.597283124923706, "learning_rate": 7.570292669790186e-05, "loss": 1.7835, "step": 35 }, { "epoch": 0.23811492352211658, "grad_norm": 2.4821412563323975, "learning_rate": 7.43267549363537e-05, "loss": 1.944, "step": 36 }, { "epoch": 0.24472922695328647, "grad_norm": 3.8077642917633057, "learning_rate": 7.292804904308087e-05, "loss": 2.0487, "step": 37 }, { "epoch": 0.2513435303844564, "grad_norm": 3.4441919326782227, "learning_rate": 7.150843984658754e-05, "loss": 1.972, "step": 38 }, { "epoch": 0.2579578338156263, "grad_norm": 1.2412813901901245, "learning_rate": 7.006958254769438e-05, "loss": 1.6434, "step": 39 }, { "epoch": 0.2645721372467962, "grad_norm": 1.4234942197799683, "learning_rate": 6.861315478964841e-05, "loss": 1.6448, "step": 40 }, { "epoch": 0.2711864406779661, "grad_norm": 1.5209438800811768, "learning_rate": 6.714085470206609e-05, "loss": 1.656, "step": 41 }, { "epoch": 0.277800744109136, "grad_norm": 1.6743462085723877, "learning_rate": 6.56543989209901e-05, "loss": 1.6416, "step": 42 }, { "epoch": 0.2844150475403059, "grad_norm": 1.5066150426864624, "learning_rate": 6.415552058736854e-05, "loss": 1.7058, "step": 43 }, { "epoch": 0.2910293509714758, "grad_norm": 1.3054348230361938, "learning_rate": 6.264596732629e-05, "loss": 1.7489, "step": 44 }, { "epoch": 0.29764365440264573, "grad_norm": 1.1575602293014526, "learning_rate": 6.112749920933111e-05, "loss": 1.8122, "step": 45 }, { "epoch": 0.30425795783381565, "grad_norm": 0.9143672585487366, "learning_rate": 5.960188670239154e-05, "loss": 1.726, "step": 46 }, { "epoch": 0.3108722612649855, "grad_norm": 1.3133972883224487, "learning_rate": 5.80709086014102e-05, "loss": 1.7691, "step": 47 }, { "epoch": 0.31748656469615544, "grad_norm": 1.7563693523406982, "learning_rate": 5.653634995836856e-05, "loss": 1.862, "step": 48 }, { "epoch": 0.32410086812732536, "grad_norm": 3.2339706420898438, "learning_rate": 5.500000000000001e-05, "loss": 1.9687, "step": 49 }, { "epoch": 0.3307151715584952, "grad_norm": 1.415595531463623, "learning_rate": 5.346365004163145e-05, "loss": 1.9923, "step": 50 }, { "epoch": 0.3307151715584952, "eval_loss": 1.8034321069717407, "eval_runtime": 4.0537, "eval_samples_per_second": 12.334, "eval_steps_per_second": 3.207, "step": 50 }, { "epoch": 0.33732947498966515, "grad_norm": 1.7704148292541504, "learning_rate": 5.192909139858981e-05, "loss": 1.7264, "step": 51 }, { "epoch": 0.34394377842083507, "grad_norm": 0.8146671056747437, "learning_rate": 5.0398113297608465e-05, "loss": 1.593, "step": 52 }, { "epoch": 0.350558081852005, "grad_norm": 0.8015440702438354, "learning_rate": 4.887250079066892e-05, "loss": 1.6073, "step": 53 }, { "epoch": 0.35717238528317485, "grad_norm": 0.6833037734031677, "learning_rate": 4.7354032673710005e-05, "loss": 1.6055, "step": 54 }, { "epoch": 0.3637866887143448, "grad_norm": 1.2183301448822021, "learning_rate": 4.584447941263149e-05, "loss": 1.6739, "step": 55 }, { "epoch": 0.3704009921455147, "grad_norm": 2.2584455013275146, "learning_rate": 4.43456010790099e-05, "loss": 1.7502, "step": 56 }, { "epoch": 0.37701529557668456, "grad_norm": 2.1715304851531982, "learning_rate": 4.285914529793391e-05, "loss": 1.7755, "step": 57 }, { "epoch": 0.3836295990078545, "grad_norm": 2.027273654937744, "learning_rate": 4.13868452103516e-05, "loss": 1.732, "step": 58 }, { "epoch": 0.3902439024390244, "grad_norm": 1.1867992877960205, "learning_rate": 3.9930417452305626e-05, "loss": 1.6905, "step": 59 }, { "epoch": 0.39685820587019427, "grad_norm": 1.161856770515442, "learning_rate": 3.8491560153412466e-05, "loss": 1.7519, "step": 60 }, { "epoch": 0.4034725093013642, "grad_norm": 2.370361804962158, "learning_rate": 3.707195095691913e-05, "loss": 1.8929, "step": 61 }, { "epoch": 0.4100868127325341, "grad_norm": 1.8539221286773682, "learning_rate": 3.567324506364632e-05, "loss": 1.8778, "step": 62 }, { "epoch": 0.41670111616370403, "grad_norm": 1.790212631225586, "learning_rate": 3.4297073302098156e-05, "loss": 1.7908, "step": 63 }, { "epoch": 0.4233154195948739, "grad_norm": 1.4143702983856201, "learning_rate": 3.2945040226989244e-05, "loss": 1.5811, "step": 64 }, { "epoch": 0.4299297230260438, "grad_norm": 1.0017449855804443, "learning_rate": 3.16187222484055e-05, "loss": 1.56, "step": 65 }, { "epoch": 0.43654402645721374, "grad_norm": 0.7748380303382874, "learning_rate": 3.0319665793780648e-05, "loss": 1.5291, "step": 66 }, { "epoch": 0.4431583298883836, "grad_norm": 0.7852901220321655, "learning_rate": 2.9049385504830985e-05, "loss": 1.6028, "step": 67 }, { "epoch": 0.4497726333195535, "grad_norm": 1.797322392463684, "learning_rate": 2.7809362471550748e-05, "loss": 1.746, "step": 68 }, { "epoch": 0.45638693675072345, "grad_norm": 2.199897050857544, "learning_rate": 2.660104250532764e-05, "loss": 1.7829, "step": 69 }, { "epoch": 0.46300124018189337, "grad_norm": 2.4895544052124023, "learning_rate": 2.5425834453191232e-05, "loss": 1.8072, "step": 70 }, { "epoch": 0.46961554361306324, "grad_norm": 1.3517361879348755, "learning_rate": 2.4285108555160577e-05, "loss": 1.711, "step": 71 }, { "epoch": 0.47622984704423316, "grad_norm": 1.6711078882217407, "learning_rate": 2.3180194846605367e-05, "loss": 1.7425, "step": 72 }, { "epoch": 0.4828441504754031, "grad_norm": 1.413799524307251, "learning_rate": 2.2112381607484417e-05, "loss": 1.7813, "step": 73 }, { "epoch": 0.48945845390657294, "grad_norm": 1.5747374296188354, "learning_rate": 2.1082913860268765e-05, "loss": 1.8938, "step": 74 }, { "epoch": 0.49607275733774286, "grad_norm": 1.6026586294174194, "learning_rate": 2.0092991918301108e-05, "loss": 1.9236, "step": 75 }, { "epoch": 0.49607275733774286, "eval_loss": 1.7268242835998535, "eval_runtime": 4.0519, "eval_samples_per_second": 12.34, "eval_steps_per_second": 3.208, "step": 75 }, { "epoch": 0.5026870607689128, "grad_norm": 1.0837961435317993, "learning_rate": 1.91437699862843e-05, "loss": 1.5649, "step": 76 }, { "epoch": 0.5093013642000827, "grad_norm": 1.3891311883926392, "learning_rate": 1.8236354814530112e-05, "loss": 1.5706, "step": 77 }, { "epoch": 0.5159156676312526, "grad_norm": 0.9055605530738831, "learning_rate": 1.7371804408538024e-05, "loss": 1.5843, "step": 78 }, { "epoch": 0.5225299710624225, "grad_norm": 1.7795766592025757, "learning_rate": 1.6551126795408016e-05, "loss": 1.5982, "step": 79 }, { "epoch": 0.5291442744935924, "grad_norm": 1.0256783962249756, "learning_rate": 1.577527884852619e-05, "loss": 1.6283, "step": 80 }, { "epoch": 0.5357585779247623, "grad_norm": 1.4087269306182861, "learning_rate": 1.5045165171893116e-05, "loss": 1.7221, "step": 81 }, { "epoch": 0.5423728813559322, "grad_norm": 1.3582618236541748, "learning_rate": 1.4361637045396029e-05, "loss": 1.7319, "step": 82 }, { "epoch": 0.5489871847871021, "grad_norm": 1.3992999792099, "learning_rate": 1.3725491432254624e-05, "loss": 1.7597, "step": 83 }, { "epoch": 0.555601488218272, "grad_norm": 1.0967944860458374, "learning_rate": 1.313747004979751e-05, "loss": 1.636, "step": 84 }, { "epoch": 0.5622157916494419, "grad_norm": 1.2128665447235107, "learning_rate": 1.2598258504653081e-05, "loss": 1.6875, "step": 85 }, { "epoch": 0.5688300950806118, "grad_norm": 1.3232948780059814, "learning_rate": 1.2108485493362765e-05, "loss": 1.8021, "step": 86 }, { "epoch": 0.5754443985117818, "grad_norm": 1.3909993171691895, "learning_rate": 1.1668722069349041e-05, "loss": 1.8413, "step": 87 }, { "epoch": 0.5820587019429516, "grad_norm": 1.2057301998138428, "learning_rate": 1.1279480977092635e-05, "loss": 1.73, "step": 88 }, { "epoch": 0.5886730053741215, "grad_norm": 0.6822798252105713, "learning_rate": 1.094121605429547e-05, "loss": 1.5085, "step": 89 }, { "epoch": 0.5952873088052915, "grad_norm": 0.7973278760910034, "learning_rate": 1.0654321702726141e-05, "loss": 1.5389, "step": 90 }, { "epoch": 0.6019016122364613, "grad_norm": 0.8194143772125244, "learning_rate": 1.0419132428365116e-05, "loss": 1.563, "step": 91 }, { "epoch": 0.6085159156676313, "grad_norm": 0.8316376209259033, "learning_rate": 1.0235922451385733e-05, "loss": 1.5623, "step": 92 }, { "epoch": 0.6151302190988012, "grad_norm": 1.0160751342773438, "learning_rate": 1.0104905386425733e-05, "loss": 1.6195, "step": 93 }, { "epoch": 0.621744522529971, "grad_norm": 1.2698185443878174, "learning_rate": 1.002623399352217e-05, "loss": 1.6597, "step": 94 }, { "epoch": 0.628358825961141, "grad_norm": 0.7779133319854736, "learning_rate": 1e-05, "loss": 1.6596, "step": 95 } ], "logging_steps": 1, "max_steps": 95, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 25, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 1, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.1698376402502943e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }