{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.900004054802914, "eval_steps": 500, "global_step": 199764, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.010001847187994179, "grad_norm": 8.891203880310059, "learning_rate": 9.89998152812006e-06, "loss": 3.7466, "step": 2220 }, { "epoch": 0.020003694375988357, "grad_norm": 9.095343589782715, "learning_rate": 9.799963056240118e-06, "loss": 3.8874, "step": 4440 }, { "epoch": 0.030005541563982538, "grad_norm": 10.57905387878418, "learning_rate": 9.699944584360175e-06, "loss": 3.865, "step": 6660 }, { "epoch": 0.040007388751976715, "grad_norm": 4.935700416564941, "learning_rate": 9.599926112480233e-06, "loss": 3.8441, "step": 8880 }, { "epoch": 0.0500092359399709, "grad_norm": 8.008011817932129, "learning_rate": 9.499907640600292e-06, "loss": 3.8126, "step": 11100 }, { "epoch": 0.060011083127965076, "grad_norm": 6.469420909881592, "learning_rate": 9.39988916872035e-06, "loss": 3.8078, "step": 13320 }, { "epoch": 0.07001293031595926, "grad_norm": 7.097398281097412, "learning_rate": 9.29987069684041e-06, "loss": 3.8054, "step": 15540 }, { "epoch": 0.08001477750395343, "grad_norm": 5.952394485473633, "learning_rate": 9.199852224960467e-06, "loss": 3.802, "step": 17760 }, { "epoch": 0.09001662469194761, "grad_norm": 4.3405280113220215, "learning_rate": 9.099833753080524e-06, "loss": 3.7729, "step": 19980 }, { "epoch": 0.1000184718799418, "grad_norm": 7.396389961242676, "learning_rate": 8.999815281200582e-06, "loss": 3.7697, "step": 22200 }, { "epoch": 0.11002031906793597, "grad_norm": 6.3005876541137695, "learning_rate": 8.899796809320641e-06, "loss": 3.7642, "step": 24420 }, { "epoch": 0.12002216625593015, "grad_norm": 13.214675903320312, "learning_rate": 8.799778337440699e-06, "loss": 3.767, "step": 26640 }, { "epoch": 0.13002401344392434, "grad_norm": 6.847354412078857, "learning_rate": 8.699759865560757e-06, "loss": 3.755, "step": 28860 }, { "epoch": 0.14002586063191852, "grad_norm": 5.719586372375488, "learning_rate": 8.599741393680816e-06, "loss": 3.7427, "step": 31080 }, { "epoch": 0.15002770781991268, "grad_norm": 5.763194561004639, "learning_rate": 8.499722921800874e-06, "loss": 3.75, "step": 33300 }, { "epoch": 0.16002955500790686, "grad_norm": 5.9269561767578125, "learning_rate": 8.399704449920933e-06, "loss": 3.7333, "step": 35520 }, { "epoch": 0.17003140219590104, "grad_norm": 6.1445698738098145, "learning_rate": 8.29968597804099e-06, "loss": 3.7217, "step": 37740 }, { "epoch": 0.18003324938389523, "grad_norm": 5.2200188636779785, "learning_rate": 8.199667506161048e-06, "loss": 3.7317, "step": 39960 }, { "epoch": 0.1900350965718894, "grad_norm": 6.599005222320557, "learning_rate": 8.099649034281106e-06, "loss": 3.723, "step": 42180 }, { "epoch": 0.2000369437598836, "grad_norm": 5.919734477996826, "learning_rate": 7.999630562401165e-06, "loss": 3.7136, "step": 44400 }, { "epoch": 0.21003879094787775, "grad_norm": 5.932505130767822, "learning_rate": 7.899612090521223e-06, "loss": 3.7222, "step": 46620 }, { "epoch": 0.22004063813587194, "grad_norm": 5.9128499031066895, "learning_rate": 7.799593618641282e-06, "loss": 3.7119, "step": 48840 }, { "epoch": 0.23004248532386612, "grad_norm": 7.6859893798828125, "learning_rate": 7.69957514676134e-06, "loss": 3.7053, "step": 51060 }, { "epoch": 0.2400443325118603, "grad_norm": 10.485933303833008, "learning_rate": 7.599556674881397e-06, "loss": 3.7189, "step": 53280 }, { "epoch": 0.2500461796998545, "grad_norm": 4.997628211975098, "learning_rate": 7.499538203001457e-06, "loss": 3.7048, "step": 55500 }, { "epoch": 0.26004802688784867, "grad_norm": 4.950057029724121, "learning_rate": 7.399519731121514e-06, "loss": 3.707, "step": 57720 }, { "epoch": 0.27004987407584286, "grad_norm": 8.856302261352539, "learning_rate": 7.299501259241573e-06, "loss": 3.6913, "step": 59940 }, { "epoch": 0.28005172126383704, "grad_norm": 8.338162422180176, "learning_rate": 7.19948278736163e-06, "loss": 3.692, "step": 62160 }, { "epoch": 0.29005356845183117, "grad_norm": 5.352757453918457, "learning_rate": 7.099464315481689e-06, "loss": 3.6888, "step": 64380 }, { "epoch": 0.30005541563982535, "grad_norm": 10.270312309265137, "learning_rate": 6.999445843601746e-06, "loss": 3.6863, "step": 66600 }, { "epoch": 0.31005726282781954, "grad_norm": 7.968948841094971, "learning_rate": 6.899427371721805e-06, "loss": 3.6716, "step": 68820 }, { "epoch": 0.3200591100158137, "grad_norm": 5.720911502838135, "learning_rate": 6.7994088998418625e-06, "loss": 3.6875, "step": 71040 }, { "epoch": 0.3300609572038079, "grad_norm": 6.272682189941406, "learning_rate": 6.699390427961922e-06, "loss": 3.6763, "step": 73260 }, { "epoch": 0.3400628043918021, "grad_norm": 4.321349143981934, "learning_rate": 6.5993719560819794e-06, "loss": 3.6762, "step": 75480 }, { "epoch": 0.35006465157979627, "grad_norm": 4.718802452087402, "learning_rate": 6.499353484202038e-06, "loss": 3.6822, "step": 77700 }, { "epoch": 0.36006649876779045, "grad_norm": 7.162393093109131, "learning_rate": 6.399335012322096e-06, "loss": 3.6787, "step": 79920 }, { "epoch": 0.37006834595578464, "grad_norm": 4.617556571960449, "learning_rate": 6.299316540442154e-06, "loss": 3.6606, "step": 82140 }, { "epoch": 0.3800701931437788, "grad_norm": 9.247603416442871, "learning_rate": 6.1992980685622125e-06, "loss": 3.6734, "step": 84360 }, { "epoch": 0.390072040331773, "grad_norm": 9.668780326843262, "learning_rate": 6.09927959668227e-06, "loss": 3.6683, "step": 86580 }, { "epoch": 0.4000738875197672, "grad_norm": 8.191458702087402, "learning_rate": 5.999261124802329e-06, "loss": 3.6652, "step": 88800 }, { "epoch": 0.4100757347077613, "grad_norm": 6.6612396240234375, "learning_rate": 5.899242652922387e-06, "loss": 3.6631, "step": 91020 }, { "epoch": 0.4200775818957555, "grad_norm": 4.553181171417236, "learning_rate": 5.7992241810424455e-06, "loss": 3.6654, "step": 93240 }, { "epoch": 0.4300794290837497, "grad_norm": 5.245594501495361, "learning_rate": 5.699205709162503e-06, "loss": 3.651, "step": 95460 }, { "epoch": 0.44008127627174387, "grad_norm": 4.845808029174805, "learning_rate": 5.599187237282562e-06, "loss": 3.6749, "step": 97680 }, { "epoch": 0.45008312345973805, "grad_norm": 8.331583023071289, "learning_rate": 5.499168765402619e-06, "loss": 3.6657, "step": 99900 }, { "epoch": 0.46008497064773224, "grad_norm": 7.097965240478516, "learning_rate": 5.399150293522678e-06, "loss": 3.6581, "step": 102120 }, { "epoch": 0.4700868178357264, "grad_norm": 15.923288345336914, "learning_rate": 5.299131821642737e-06, "loss": 3.6563, "step": 104340 }, { "epoch": 0.4800886650237206, "grad_norm": 6.850131034851074, "learning_rate": 5.199113349762795e-06, "loss": 3.652, "step": 106560 }, { "epoch": 0.4900905122117148, "grad_norm": 3.908322811126709, "learning_rate": 5.099094877882853e-06, "loss": 3.6505, "step": 108780 }, { "epoch": 0.500092359399709, "grad_norm": 9.632164001464844, "learning_rate": 4.999076406002911e-06, "loss": 3.6461, "step": 111000 }, { "epoch": 0.5100942065877031, "grad_norm": 7.106192111968994, "learning_rate": 4.899057934122969e-06, "loss": 3.6538, "step": 113220 }, { "epoch": 0.5200960537756973, "grad_norm": 5.387803554534912, "learning_rate": 4.799039462243028e-06, "loss": 3.6489, "step": 115440 }, { "epoch": 0.5300979009636915, "grad_norm": 11.75901985168457, "learning_rate": 4.699020990363085e-06, "loss": 3.637, "step": 117660 }, { "epoch": 0.5400997481516857, "grad_norm": 6.171950817108154, "learning_rate": 4.599002518483144e-06, "loss": 3.6478, "step": 119880 }, { "epoch": 0.5501015953396798, "grad_norm": 9.803488731384277, "learning_rate": 4.498984046603202e-06, "loss": 3.6437, "step": 122100 }, { "epoch": 0.5601034425276741, "grad_norm": 4.786858558654785, "learning_rate": 4.39896557472326e-06, "loss": 3.629, "step": 124320 }, { "epoch": 0.5701052897156682, "grad_norm": 5.292118072509766, "learning_rate": 4.298947102843318e-06, "loss": 3.6311, "step": 126540 }, { "epoch": 0.5801071369036623, "grad_norm": 5.671876907348633, "learning_rate": 4.198928630963377e-06, "loss": 3.6239, "step": 128760 }, { "epoch": 0.5901089840916566, "grad_norm": 8.806153297424316, "learning_rate": 4.098910159083434e-06, "loss": 3.6379, "step": 130980 }, { "epoch": 0.6001108312796507, "grad_norm": 3.7141523361206055, "learning_rate": 3.998891687203493e-06, "loss": 3.6331, "step": 133200 }, { "epoch": 0.6101126784676449, "grad_norm": 8.392793655395508, "learning_rate": 3.8988732153235505e-06, "loss": 3.6373, "step": 135420 }, { "epoch": 0.6201145256556391, "grad_norm": 7.832735061645508, "learning_rate": 3.798854743443609e-06, "loss": 3.6116, "step": 137640 }, { "epoch": 0.6301163728436333, "grad_norm": 6.3072967529296875, "learning_rate": 3.698836271563667e-06, "loss": 3.6399, "step": 139860 }, { "epoch": 0.6401182200316274, "grad_norm": 5.968746185302734, "learning_rate": 3.598817799683726e-06, "loss": 3.6308, "step": 142080 }, { "epoch": 0.6501200672196217, "grad_norm": 8.840259552001953, "learning_rate": 3.498799327803784e-06, "loss": 3.644, "step": 144300 }, { "epoch": 0.6601219144076158, "grad_norm": 7.575094223022461, "learning_rate": 3.3987808559238424e-06, "loss": 3.6346, "step": 146520 }, { "epoch": 0.67012376159561, "grad_norm": 7.250246524810791, "learning_rate": 3.2987623840439004e-06, "loss": 3.6204, "step": 148740 }, { "epoch": 0.6801256087836042, "grad_norm": 5.222540378570557, "learning_rate": 3.1987439121639585e-06, "loss": 3.6132, "step": 150960 }, { "epoch": 0.6901274559715984, "grad_norm": 10.793761253356934, "learning_rate": 3.0987254402840165e-06, "loss": 3.6179, "step": 153180 }, { "epoch": 0.7001293031595925, "grad_norm": 4.64948844909668, "learning_rate": 2.998706968404075e-06, "loss": 3.6175, "step": 155400 }, { "epoch": 0.7101311503475867, "grad_norm": 4.545493125915527, "learning_rate": 2.898688496524133e-06, "loss": 3.6308, "step": 157620 }, { "epoch": 0.7201329975355809, "grad_norm": 5.714044094085693, "learning_rate": 2.798670024644191e-06, "loss": 3.6006, "step": 159840 }, { "epoch": 0.730134844723575, "grad_norm": 6.625074863433838, "learning_rate": 2.698651552764249e-06, "loss": 3.6158, "step": 162060 }, { "epoch": 0.7401366919115693, "grad_norm": 5.874523639678955, "learning_rate": 2.5986330808843076e-06, "loss": 3.6297, "step": 164280 }, { "epoch": 0.7501385390995634, "grad_norm": 6.403592586517334, "learning_rate": 2.4986146090043657e-06, "loss": 3.6249, "step": 166500 }, { "epoch": 0.7601403862875576, "grad_norm": 7.914163589477539, "learning_rate": 2.398596137124424e-06, "loss": 3.6185, "step": 168720 }, { "epoch": 0.7701422334755518, "grad_norm": 5.295339107513428, "learning_rate": 2.298577665244482e-06, "loss": 3.6117, "step": 170940 }, { "epoch": 0.780144080663546, "grad_norm": 7.561370372772217, "learning_rate": 2.1985591933645406e-06, "loss": 3.6129, "step": 173160 }, { "epoch": 0.7901459278515401, "grad_norm": 7.633889675140381, "learning_rate": 2.0985407214845987e-06, "loss": 3.6253, "step": 175380 }, { "epoch": 0.8001477750395344, "grad_norm": 8.387534141540527, "learning_rate": 1.9985222496046567e-06, "loss": 3.6102, "step": 177600 }, { "epoch": 0.8101496222275285, "grad_norm": 9.557201385498047, "learning_rate": 1.898503777724715e-06, "loss": 3.6088, "step": 179820 }, { "epoch": 0.8201514694155226, "grad_norm": 8.81215763092041, "learning_rate": 1.7984853058447732e-06, "loss": 3.6092, "step": 182040 }, { "epoch": 0.8301533166035169, "grad_norm": 5.788514137268066, "learning_rate": 1.6984668339648313e-06, "loss": 3.5987, "step": 184260 }, { "epoch": 0.840155163791511, "grad_norm": 3.8115339279174805, "learning_rate": 1.5984483620848898e-06, "loss": 3.6097, "step": 186480 }, { "epoch": 0.8501570109795052, "grad_norm": 9.834037780761719, "learning_rate": 1.498429890204948e-06, "loss": 3.6115, "step": 188700 }, { "epoch": 0.8601588581674994, "grad_norm": 4.8395586013793945, "learning_rate": 1.398411418325006e-06, "loss": 3.6048, "step": 190920 }, { "epoch": 0.8701607053554936, "grad_norm": 5.381741523742676, "learning_rate": 1.2983929464450643e-06, "loss": 3.6098, "step": 193140 }, { "epoch": 0.8801625525434877, "grad_norm": 9.676674842834473, "learning_rate": 1.1983744745651226e-06, "loss": 3.6137, "step": 195360 }, { "epoch": 0.890164399731482, "grad_norm": 9.529138565063477, "learning_rate": 1.0983560026851806e-06, "loss": 3.6122, "step": 197580 } ], "logging_steps": 2220, "max_steps": 221959, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 22196, "total_flos": 4.6201916348928e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }