{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.003338898163606, "eval_steps": 50, "global_step": 450, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.022259321090706732, "grad_norm": 0.6662815809249878, "learning_rate": 7.3529411764705884e-06, "loss": 1.2136, "step": 5 }, { "epoch": 0.044518642181413465, "grad_norm": 0.9986972212791443, "learning_rate": 1.4705882352941177e-05, "loss": 1.2704, "step": 10 }, { "epoch": 0.0667779632721202, "grad_norm": 1.0016194581985474, "learning_rate": 2.2058823529411766e-05, "loss": 1.1674, "step": 15 }, { "epoch": 0.08903728436282693, "grad_norm": 0.4482230246067047, "learning_rate": 2.9411764705882354e-05, "loss": 1.1745, "step": 20 }, { "epoch": 0.11129660545353366, "grad_norm": 0.32080814242362976, "learning_rate": 3.6764705882352945e-05, "loss": 1.1222, "step": 25 }, { "epoch": 0.1335559265442404, "grad_norm": 0.3496420979499817, "learning_rate": 4.411764705882353e-05, "loss": 1.1505, "step": 30 }, { "epoch": 0.15581524763494714, "grad_norm": 0.40206676721572876, "learning_rate": 5.147058823529411e-05, "loss": 1.1051, "step": 35 }, { "epoch": 0.17807456872565386, "grad_norm": 0.34886816143989563, "learning_rate": 5.882352941176471e-05, "loss": 1.0896, "step": 40 }, { "epoch": 0.2003338898163606, "grad_norm": 0.3876524269580841, "learning_rate": 6.61764705882353e-05, "loss": 1.1052, "step": 45 }, { "epoch": 0.22259321090706732, "grad_norm": 0.42603397369384766, "learning_rate": 7.352941176470589e-05, "loss": 1.0529, "step": 50 }, { "epoch": 0.22259321090706732, "eval_loss": 1.0506384372711182, "eval_runtime": 63.5224, "eval_samples_per_second": 12.578, "eval_steps_per_second": 1.574, "step": 50 }, { "epoch": 0.24485253199777407, "grad_norm": 0.42488643527030945, "learning_rate": 8.088235294117648e-05, "loss": 1.0093, "step": 55 }, { "epoch": 0.2671118530884808, "grad_norm": 0.46560072898864746, "learning_rate": 8.823529411764706e-05, "loss": 0.9961, "step": 60 }, { "epoch": 0.28937117417918756, "grad_norm": 0.4789991080760956, "learning_rate": 9.558823529411765e-05, "loss": 0.96, "step": 65 }, { "epoch": 0.3116304952698943, "grad_norm": 0.5202014446258545, "learning_rate": 9.999729465939036e-05, "loss": 1.0675, "step": 70 }, { "epoch": 0.333889816360601, "grad_norm": 0.49772489070892334, "learning_rate": 9.996686293953675e-05, "loss": 0.9783, "step": 75 }, { "epoch": 0.3561491374513077, "grad_norm": 0.4958195984363556, "learning_rate": 9.990263847374976e-05, "loss": 0.9948, "step": 80 }, { "epoch": 0.3784084585420145, "grad_norm": 0.5104294419288635, "learning_rate": 9.98046646972368e-05, "loss": 1.0261, "step": 85 }, { "epoch": 0.4006677796327212, "grad_norm": 0.41805458068847656, "learning_rate": 9.96730078699698e-05, "loss": 0.9852, "step": 90 }, { "epoch": 0.42292710072342793, "grad_norm": 0.4951586425304413, "learning_rate": 9.950775703187354e-05, "loss": 0.9717, "step": 95 }, { "epoch": 0.44518642181413465, "grad_norm": 0.44999977946281433, "learning_rate": 9.930902394260747e-05, "loss": 1.0382, "step": 100 }, { "epoch": 0.44518642181413465, "eval_loss": 0.9870203137397766, "eval_runtime": 62.5821, "eval_samples_per_second": 12.767, "eval_steps_per_second": 1.598, "step": 100 }, { "epoch": 0.4674457429048414, "grad_norm": 0.4562780559062958, "learning_rate": 9.907694300598237e-05, "loss": 0.9566, "step": 105 }, { "epoch": 0.48970506399554814, "grad_norm": 0.42032694816589355, "learning_rate": 9.881167117906275e-05, "loss": 1.0088, "step": 110 }, { "epoch": 0.5119643850862549, "grad_norm": 0.4715286195278168, "learning_rate": 9.851338786601614e-05, "loss": 0.952, "step": 115 }, { "epoch": 0.5342237061769616, "grad_norm": 0.43396106362342834, "learning_rate": 9.818229479678158e-05, "loss": 0.9474, "step": 120 }, { "epoch": 0.5564830272676683, "grad_norm": 0.4831869900226593, "learning_rate": 9.781861589063895e-05, "loss": 0.9822, "step": 125 }, { "epoch": 0.5787423483583751, "grad_norm": 0.4437423050403595, "learning_rate": 9.742259710477177e-05, "loss": 0.9948, "step": 130 }, { "epoch": 0.6010016694490818, "grad_norm": 0.42239800095558167, "learning_rate": 9.699450626792548e-05, "loss": 1.0358, "step": 135 }, { "epoch": 0.6232609905397886, "grad_norm": 0.3968772888183594, "learning_rate": 9.653463289927411e-05, "loss": 0.9488, "step": 140 }, { "epoch": 0.6455203116304953, "grad_norm": 0.4016033709049225, "learning_rate": 9.604328801261746e-05, "loss": 0.9543, "step": 145 }, { "epoch": 0.667779632721202, "grad_norm": 0.3789500892162323, "learning_rate": 9.55208039060416e-05, "loss": 0.9517, "step": 150 }, { "epoch": 0.667779632721202, "eval_loss": 0.9664375185966492, "eval_runtime": 62.5254, "eval_samples_per_second": 12.779, "eval_steps_per_second": 1.599, "step": 150 }, { "epoch": 0.6900389538119087, "grad_norm": 0.42804014682769775, "learning_rate": 9.496753393718453e-05, "loss": 1.0225, "step": 155 }, { "epoch": 0.7122982749026154, "grad_norm": 0.4701464772224426, "learning_rate": 9.438385228425938e-05, "loss": 0.9765, "step": 160 }, { "epoch": 0.7345575959933222, "grad_norm": 0.4485417604446411, "learning_rate": 9.377015369299651e-05, "loss": 1.0122, "step": 165 }, { "epoch": 0.756816917084029, "grad_norm": 0.4678383767604828, "learning_rate": 9.312685320967564e-05, "loss": 1.0016, "step": 170 }, { "epoch": 0.7790762381747357, "grad_norm": 0.46723246574401855, "learning_rate": 9.245438590042887e-05, "loss": 0.9604, "step": 175 }, { "epoch": 0.8013355592654424, "grad_norm": 0.40324416756629944, "learning_rate": 9.175320655700406e-05, "loss": 0.9903, "step": 180 }, { "epoch": 0.8235948803561491, "grad_norm": 0.4134933650493622, "learning_rate": 9.102378938918764e-05, "loss": 0.9012, "step": 185 }, { "epoch": 0.8458542014468559, "grad_norm": 0.4861450791358948, "learning_rate": 9.026662770409522e-05, "loss": 0.9666, "step": 190 }, { "epoch": 0.8681135225375626, "grad_norm": 0.46025416254997253, "learning_rate": 8.948223357254636e-05, "loss": 0.9406, "step": 195 }, { "epoch": 0.8903728436282693, "grad_norm": 0.45446667075157166, "learning_rate": 8.86711374827494e-05, "loss": 0.9871, "step": 200 }, { "epoch": 0.8903728436282693, "eval_loss": 0.9560968279838562, "eval_runtime": 62.5387, "eval_samples_per_second": 12.776, "eval_steps_per_second": 1.599, "step": 200 }, { "epoch": 0.9126321647189761, "grad_norm": 0.44481343030929565, "learning_rate": 8.783388798153074e-05, "loss": 1.0485, "step": 205 }, { "epoch": 0.9348914858096828, "grad_norm": 0.3908093273639679, "learning_rate": 8.697105130335085e-05, "loss": 0.943, "step": 210 }, { "epoch": 0.9571508069003896, "grad_norm": 0.4595666527748108, "learning_rate": 8.608321098735811e-05, "loss": 1.0317, "step": 215 }, { "epoch": 0.9794101279910963, "grad_norm": 0.4300236403942108, "learning_rate": 8.517096748273951e-05, "loss": 0.9079, "step": 220 }, { "epoch": 1.001669449081803, "grad_norm": 0.42982354760169983, "learning_rate": 8.423493774263493e-05, "loss": 0.9856, "step": 225 }, { "epoch": 1.0239287701725097, "grad_norm": 0.43644052743911743, "learning_rate": 8.327575480688985e-05, "loss": 0.9443, "step": 230 }, { "epoch": 1.0461880912632164, "grad_norm": 0.4620282053947449, "learning_rate": 8.229406737392843e-05, "loss": 0.9307, "step": 235 }, { "epoch": 1.0684474123539232, "grad_norm": 0.4973452687263489, "learning_rate": 8.129053936203687e-05, "loss": 0.9211, "step": 240 }, { "epoch": 1.0907067334446299, "grad_norm": 0.4471150040626526, "learning_rate": 8.026584946035331e-05, "loss": 0.8556, "step": 245 }, { "epoch": 1.1129660545353366, "grad_norm": 0.4626604914665222, "learning_rate": 7.92206906698682e-05, "loss": 0.8805, "step": 250 }, { "epoch": 1.1129660545353366, "eval_loss": 0.9515854120254517, "eval_runtime": 62.5564, "eval_samples_per_second": 12.772, "eval_steps_per_second": 1.599, "step": 250 }, { "epoch": 1.1352253756260433, "grad_norm": 0.4923231899738312, "learning_rate": 7.815576983474562e-05, "loss": 0.9766, "step": 255 }, { "epoch": 1.1574846967167502, "grad_norm": 0.5031096339225769, "learning_rate": 7.707180716428237e-05, "loss": 0.9154, "step": 260 }, { "epoch": 1.179744017807457, "grad_norm": 0.46800029277801514, "learning_rate": 7.596953574582814e-05, "loss": 0.9189, "step": 265 }, { "epoch": 1.2020033388981637, "grad_norm": 0.5481358170509338, "learning_rate": 7.484970104899624e-05, "loss": 0.9032, "step": 270 }, { "epoch": 1.2242626599888704, "grad_norm": 0.5768924951553345, "learning_rate": 7.371306042150012e-05, "loss": 0.9209, "step": 275 }, { "epoch": 1.2465219810795771, "grad_norm": 0.5290476083755493, "learning_rate": 7.256038257695687e-05, "loss": 0.8781, "step": 280 }, { "epoch": 1.2687813021702838, "grad_norm": 0.5345566868782043, "learning_rate": 7.139244707500363e-05, "loss": 0.9094, "step": 285 }, { "epoch": 1.2910406232609906, "grad_norm": 0.51719069480896, "learning_rate": 7.021004379407909e-05, "loss": 0.8804, "step": 290 }, { "epoch": 1.3132999443516973, "grad_norm": 0.4983583390712738, "learning_rate": 6.901397239722616e-05, "loss": 0.9081, "step": 295 }, { "epoch": 1.335559265442404, "grad_norm": 0.5766102075576782, "learning_rate": 6.780504179127734e-05, "loss": 0.8714, "step": 300 }, { "epoch": 1.335559265442404, "eval_loss": 0.9486159682273865, "eval_runtime": 62.5468, "eval_samples_per_second": 12.774, "eval_steps_per_second": 1.599, "step": 300 }, { "epoch": 1.3578185865331107, "grad_norm": 0.5398675799369812, "learning_rate": 6.658406957978862e-05, "loss": 0.921, "step": 305 }, { "epoch": 1.3800779076238174, "grad_norm": 0.5467468500137329, "learning_rate": 6.535188151009143e-05, "loss": 0.8828, "step": 310 }, { "epoch": 1.4023372287145242, "grad_norm": 0.5757557153701782, "learning_rate": 6.41093109148373e-05, "loss": 0.9417, "step": 315 }, { "epoch": 1.4245965498052309, "grad_norm": 0.5766004920005798, "learning_rate": 6.28571981484123e-05, "loss": 0.8331, "step": 320 }, { "epoch": 1.4468558708959378, "grad_norm": 0.6180554628372192, "learning_rate": 6.159639001860277e-05, "loss": 0.8924, "step": 325 }, { "epoch": 1.4691151919866443, "grad_norm": 0.5856440663337708, "learning_rate": 6.032773921389655e-05, "loss": 0.8868, "step": 330 }, { "epoch": 1.4913745130773512, "grad_norm": 0.5318928360939026, "learning_rate": 5.905210372680704e-05, "loss": 0.9377, "step": 335 }, { "epoch": 1.5136338341680577, "grad_norm": 0.6072263717651367, "learning_rate": 5.7770346273610254e-05, "loss": 0.9368, "step": 340 }, { "epoch": 1.5358931552587647, "grad_norm": 0.5962588787078857, "learning_rate": 5.648333371088706e-05, "loss": 0.9315, "step": 345 }, { "epoch": 1.5581524763494712, "grad_norm": 0.5635940432548523, "learning_rate": 5.519193644926535e-05, "loss": 0.9306, "step": 350 }, { "epoch": 1.5581524763494712, "eval_loss": 0.942983090877533, "eval_runtime": 62.5313, "eval_samples_per_second": 12.778, "eval_steps_per_second": 1.599, "step": 350 }, { "epoch": 1.5804117974401781, "grad_norm": 0.5850438475608826, "learning_rate": 5.389702786475862e-05, "loss": 0.9122, "step": 355 }, { "epoch": 1.6026711185308848, "grad_norm": 0.5826383829116821, "learning_rate": 5.2599483708099016e-05, "loss": 0.9005, "step": 360 }, { "epoch": 1.6249304396215916, "grad_norm": 0.5365060567855835, "learning_rate": 5.130018151246445e-05, "loss": 0.8918, "step": 365 }, { "epoch": 1.6471897607122983, "grad_norm": 0.5675637722015381, "learning_rate": 5e-05, "loss": 0.9157, "step": 370 }, { "epoch": 1.669449081803005, "grad_norm": 0.6204765439033508, "learning_rate": 4.869981848753556e-05, "loss": 0.8914, "step": 375 }, { "epoch": 1.6917084028937117, "grad_norm": 0.5928322672843933, "learning_rate": 4.740051629190099e-05, "loss": 0.8721, "step": 380 }, { "epoch": 1.7139677239844184, "grad_norm": 0.5092572569847107, "learning_rate": 4.61029721352414e-05, "loss": 0.8573, "step": 385 }, { "epoch": 1.7362270450751254, "grad_norm": 0.6391741633415222, "learning_rate": 4.480806355073467e-05, "loss": 0.907, "step": 390 }, { "epoch": 1.7584863661658319, "grad_norm": 0.6126671433448792, "learning_rate": 4.351666628911295e-05, "loss": 0.8023, "step": 395 }, { "epoch": 1.7807456872565388, "grad_norm": 0.6055718660354614, "learning_rate": 4.2229653726389765e-05, "loss": 0.8914, "step": 400 }, { "epoch": 1.7807456872565388, "eval_loss": 0.9382885098457336, "eval_runtime": 62.4923, "eval_samples_per_second": 12.786, "eval_steps_per_second": 1.6, "step": 400 }, { "epoch": 1.8030050083472453, "grad_norm": 0.5169083476066589, "learning_rate": 4.094789627319298e-05, "loss": 0.9202, "step": 405 }, { "epoch": 1.8252643294379522, "grad_norm": 0.5566220879554749, "learning_rate": 3.967226078610347e-05, "loss": 0.8758, "step": 410 }, { "epoch": 1.8475236505286587, "grad_norm": 0.568168580532074, "learning_rate": 3.840360998139724e-05, "loss": 0.8854, "step": 415 }, { "epoch": 1.8697829716193657, "grad_norm": 0.5981225371360779, "learning_rate": 3.714280185158771e-05, "loss": 0.9467, "step": 420 }, { "epoch": 1.8920422927100722, "grad_norm": 0.6130527257919312, "learning_rate": 3.589068908516271e-05, "loss": 0.8456, "step": 425 }, { "epoch": 1.9143016138007791, "grad_norm": 0.5936880111694336, "learning_rate": 3.464811848990859e-05, "loss": 0.9022, "step": 430 }, { "epoch": 1.9365609348914858, "grad_norm": 0.6016620397567749, "learning_rate": 3.341593042021138e-05, "loss": 0.9057, "step": 435 }, { "epoch": 1.9588202559821926, "grad_norm": 0.6164581179618835, "learning_rate": 3.219495820872265e-05, "loss": 0.8907, "step": 440 }, { "epoch": 1.9810795770728993, "grad_norm": 0.582241952419281, "learning_rate": 3.098602760277385e-05, "loss": 0.9133, "step": 445 }, { "epoch": 2.003338898163606, "grad_norm": 0.5405800938606262, "learning_rate": 2.978995620592092e-05, "loss": 0.8567, "step": 450 }, { "epoch": 2.003338898163606, "eval_loss": 0.9342555999755859, "eval_runtime": 62.5282, "eval_samples_per_second": 12.778, "eval_steps_per_second": 1.599, "step": 450 } ], "logging_steps": 5, "max_steps": 672, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 9.259026931200819e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }