{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.003338898163606, "eval_steps": 50, "global_step": 450, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.022259321090706732, "grad_norm": 0.4673316478729248, "learning_rate": 7.3529411764705884e-06, "loss": 1.2911, "step": 5 }, { "epoch": 0.044518642181413465, "grad_norm": 0.5188754200935364, "learning_rate": 1.4705882352941177e-05, "loss": 1.3411, "step": 10 }, { "epoch": 0.0667779632721202, "grad_norm": 0.584618866443634, "learning_rate": 2.2058823529411766e-05, "loss": 1.2434, "step": 15 }, { "epoch": 0.08903728436282693, "grad_norm": 0.40156278014183044, "learning_rate": 2.9411764705882354e-05, "loss": 1.2591, "step": 20 }, { "epoch": 0.11129660545353366, "grad_norm": 0.2917367219924927, "learning_rate": 3.6764705882352945e-05, "loss": 1.2279, "step": 25 }, { "epoch": 0.1335559265442404, "grad_norm": 0.2543434500694275, "learning_rate": 4.411764705882353e-05, "loss": 1.226, "step": 30 }, { "epoch": 0.15581524763494714, "grad_norm": 0.28222939372062683, "learning_rate": 5.147058823529411e-05, "loss": 1.1873, "step": 35 }, { "epoch": 0.17807456872565386, "grad_norm": 0.25760558247566223, "learning_rate": 5.882352941176471e-05, "loss": 1.1476, "step": 40 }, { "epoch": 0.2003338898163606, "grad_norm": 0.3042117953300476, "learning_rate": 6.61764705882353e-05, "loss": 1.1758, "step": 45 }, { "epoch": 0.22259321090706732, "grad_norm": 0.31747791171073914, "learning_rate": 7.352941176470589e-05, "loss": 1.1373, "step": 50 }, { "epoch": 0.22259321090706732, "eval_loss": 1.1148781776428223, "eval_runtime": 52.0257, "eval_samples_per_second": 15.358, "eval_steps_per_second": 1.922, "step": 50 }, { "epoch": 0.24485253199777407, "grad_norm": 0.2843828797340393, "learning_rate": 8.088235294117648e-05, "loss": 1.0659, "step": 55 }, { "epoch": 0.2671118530884808, "grad_norm": 0.30523741245269775, "learning_rate": 8.823529411764706e-05, "loss": 1.0535, "step": 60 }, { "epoch": 0.28937117417918756, "grad_norm": 0.3263162076473236, "learning_rate": 9.558823529411765e-05, "loss": 1.0327, "step": 65 }, { "epoch": 0.3116304952698943, "grad_norm": 0.34655460715293884, "learning_rate": 9.999729465939036e-05, "loss": 1.1408, "step": 70 }, { "epoch": 0.333889816360601, "grad_norm": 0.32706472277641296, "learning_rate": 9.996686293953675e-05, "loss": 1.0506, "step": 75 }, { "epoch": 0.3561491374513077, "grad_norm": 0.37062904238700867, "learning_rate": 9.990263847374976e-05, "loss": 1.0611, "step": 80 }, { "epoch": 0.3784084585420145, "grad_norm": 0.38553890585899353, "learning_rate": 9.98046646972368e-05, "loss": 1.0951, "step": 85 }, { "epoch": 0.4006677796327212, "grad_norm": 0.31734517216682434, "learning_rate": 9.96730078699698e-05, "loss": 1.0661, "step": 90 }, { "epoch": 0.42292710072342793, "grad_norm": 0.3253045380115509, "learning_rate": 9.950775703187354e-05, "loss": 1.0389, "step": 95 }, { "epoch": 0.44518642181413465, "grad_norm": 0.3521738648414612, "learning_rate": 9.930902394260747e-05, "loss": 1.1163, "step": 100 }, { "epoch": 0.44518642181413465, "eval_loss": 1.0541198253631592, "eval_runtime": 50.9676, "eval_samples_per_second": 15.677, "eval_steps_per_second": 1.962, "step": 100 }, { "epoch": 0.4674457429048414, "grad_norm": 0.3556961119174957, "learning_rate": 9.907694300598237e-05, "loss": 1.0173, "step": 105 }, { "epoch": 0.48970506399554814, "grad_norm": 0.355704665184021, "learning_rate": 9.881167117906275e-05, "loss": 1.0947, "step": 110 }, { 
"epoch": 0.5119643850862549, "grad_norm": 0.389106810092926, "learning_rate": 9.851338786601614e-05, "loss": 1.018, "step": 115 }, { "epoch": 0.5342237061769616, "grad_norm": 0.3406746983528137, "learning_rate": 9.818229479678158e-05, "loss": 1.0, "step": 120 }, { "epoch": 0.5564830272676683, "grad_norm": 0.38247111439704895, "learning_rate": 9.781861589063895e-05, "loss": 1.0442, "step": 125 }, { "epoch": 0.5787423483583751, "grad_norm": 0.3582042157649994, "learning_rate": 9.742259710477177e-05, "loss": 1.0627, "step": 130 }, { "epoch": 0.6010016694490818, "grad_norm": 0.3528626263141632, "learning_rate": 9.699450626792548e-05, "loss": 1.1155, "step": 135 }, { "epoch": 0.6232609905397886, "grad_norm": 0.33342617750167847, "learning_rate": 9.653463289927411e-05, "loss": 1.0184, "step": 140 }, { "epoch": 0.6455203116304953, "grad_norm": 0.32322484254837036, "learning_rate": 9.604328801261746e-05, "loss": 1.0169, "step": 145 }, { "epoch": 0.667779632721202, "grad_norm": 0.33450886607170105, "learning_rate": 9.55208039060416e-05, "loss": 1.0256, "step": 150 }, { "epoch": 0.667779632721202, "eval_loss": 1.0322891473770142, "eval_runtime": 50.8847, "eval_samples_per_second": 15.702, "eval_steps_per_second": 1.965, "step": 150 }, { "epoch": 0.6900389538119087, "grad_norm": 0.352691650390625, "learning_rate": 9.496753393718453e-05, "loss": 1.0888, "step": 155 }, { "epoch": 0.7122982749026154, "grad_norm": 0.39313045144081116, "learning_rate": 9.438385228425938e-05, "loss": 1.0477, "step": 160 }, { "epoch": 0.7345575959933222, "grad_norm": 0.3759097158908844, "learning_rate": 9.377015369299651e-05, "loss": 1.0776, "step": 165 }, { "epoch": 0.756816917084029, "grad_norm": 0.3853538930416107, "learning_rate": 9.312685320967564e-05, "loss": 1.0664, "step": 170 }, { "epoch": 0.7790762381747357, "grad_norm": 0.38343074917793274, "learning_rate": 9.245438590042887e-05, "loss": 1.0407, "step": 175 }, { "epoch": 0.8013355592654424, "grad_norm": 0.3434021770954132, "learning_rate": 9.175320655700406e-05, "loss": 1.0624, "step": 180 }, { "epoch": 0.8235948803561491, "grad_norm": 0.3616039752960205, "learning_rate": 9.102378938918764e-05, "loss": 0.9745, "step": 185 }, { "epoch": 0.8458542014468559, "grad_norm": 0.4113802909851074, "learning_rate": 9.026662770409522e-05, "loss": 1.0284, "step": 190 }, { "epoch": 0.8681135225375626, "grad_norm": 0.371598482131958, "learning_rate": 8.948223357254636e-05, "loss": 1.0205, "step": 195 }, { "epoch": 0.8903728436282693, "grad_norm": 0.3975715637207031, "learning_rate": 8.86711374827494e-05, "loss": 1.057, "step": 200 }, { "epoch": 0.8903728436282693, "eval_loss": 1.0230859518051147, "eval_runtime": 50.8798, "eval_samples_per_second": 15.704, "eval_steps_per_second": 1.965, "step": 200 }, { "epoch": 0.9126321647189761, "grad_norm": 0.49988657236099243, "learning_rate": 8.783388798153074e-05, "loss": 1.1316, "step": 205 }, { "epoch": 0.9348914858096828, "grad_norm": 0.3169264495372772, "learning_rate": 8.697105130335085e-05, "loss": 1.0057, "step": 210 }, { "epoch": 0.9571508069003896, "grad_norm": 0.3785440921783447, "learning_rate": 8.608321098735811e-05, "loss": 1.1214, "step": 215 }, { "epoch": 0.9794101279910963, "grad_norm": 0.35510075092315674, "learning_rate": 8.517096748273951e-05, "loss": 0.9736, "step": 220 }, { "epoch": 1.001669449081803, "grad_norm": 0.3593966066837311, "learning_rate": 8.423493774263493e-05, "loss": 1.0571, "step": 225 }, { "epoch": 1.0239287701725097, "grad_norm": 0.3669678270816803, "learning_rate": 8.327575480688985e-05, "loss": 
1.0336, "step": 230 }, { "epoch": 1.0461880912632164, "grad_norm": 0.3945116400718689, "learning_rate": 8.229406737392843e-05, "loss": 1.0242, "step": 235 }, { "epoch": 1.0684474123539232, "grad_norm": 0.4227205514907837, "learning_rate": 8.129053936203687e-05, "loss": 0.9969, "step": 240 }, { "epoch": 1.0907067334446299, "grad_norm": 0.39083144068717957, "learning_rate": 8.026584946035331e-05, "loss": 0.9173, "step": 245 }, { "epoch": 1.1129660545353366, "grad_norm": 0.41568800806999207, "learning_rate": 7.92206906698682e-05, "loss": 0.9623, "step": 250 }, { "epoch": 1.1129660545353366, "eval_loss": 1.0195302963256836, "eval_runtime": 50.891, "eval_samples_per_second": 15.7, "eval_steps_per_second": 1.965, "step": 250 }, { "epoch": 1.1352253756260433, "grad_norm": 0.449934720993042, "learning_rate": 7.815576983474562e-05, "loss": 1.0695, "step": 255 }, { "epoch": 1.1574846967167502, "grad_norm": 0.44527217745780945, "learning_rate": 7.707180716428237e-05, "loss": 1.0041, "step": 260 }, { "epoch": 1.179744017807457, "grad_norm": 0.42600926756858826, "learning_rate": 7.596953574582814e-05, "loss": 0.9999, "step": 265 }, { "epoch": 1.2020033388981637, "grad_norm": 0.49353349208831787, "learning_rate": 7.484970104899624e-05, "loss": 0.9788, "step": 270 }, { "epoch": 1.2242626599888704, "grad_norm": 0.5350990295410156, "learning_rate": 7.371306042150012e-05, "loss": 0.9951, "step": 275 }, { "epoch": 1.2465219810795771, "grad_norm": 0.49454548954963684, "learning_rate": 7.256038257695687e-05, "loss": 0.9426, "step": 280 }, { "epoch": 1.2687813021702838, "grad_norm": 0.49435731768608093, "learning_rate": 7.139244707500363e-05, "loss": 0.98, "step": 285 }, { "epoch": 1.2910406232609906, "grad_norm": 0.4851316809654236, "learning_rate": 7.021004379407909e-05, "loss": 0.9591, "step": 290 }, { "epoch": 1.3132999443516973, "grad_norm": 0.5039463639259338, "learning_rate": 6.901397239722616e-05, "loss": 0.9815, "step": 295 }, { "epoch": 1.335559265442404, "grad_norm": 0.5476050972938538, "learning_rate": 6.780504179127734e-05, "loss": 0.9334, "step": 300 }, { "epoch": 1.335559265442404, "eval_loss": 1.0174856185913086, "eval_runtime": 50.9073, "eval_samples_per_second": 15.695, "eval_steps_per_second": 1.964, "step": 300 }, { "epoch": 1.3578185865331107, "grad_norm": 0.5071010589599609, "learning_rate": 6.658406957978862e-05, "loss": 0.9943, "step": 305 }, { "epoch": 1.3800779076238174, "grad_norm": 0.5222419500350952, "learning_rate": 6.535188151009143e-05, "loss": 0.9503, "step": 310 }, { "epoch": 1.4023372287145242, "grad_norm": 0.5639395713806152, "learning_rate": 6.41093109148373e-05, "loss": 1.019, "step": 315 }, { "epoch": 1.4245965498052309, "grad_norm": 0.5787636637687683, "learning_rate": 6.28571981484123e-05, "loss": 0.8958, "step": 320 }, { "epoch": 1.4468558708959378, "grad_norm": 0.5925642848014832, "learning_rate": 6.159639001860277e-05, "loss": 0.9914, "step": 325 }, { "epoch": 1.4691151919866443, "grad_norm": 0.57623690366745, "learning_rate": 6.032773921389655e-05, "loss": 0.9708, "step": 330 }, { "epoch": 1.4913745130773512, "grad_norm": 0.5331757664680481, "learning_rate": 5.905210372680704e-05, "loss": 1.0122, "step": 335 }, { "epoch": 1.5136338341680577, "grad_norm": 0.5566096901893616, "learning_rate": 5.7770346273610254e-05, "loss": 1.012, "step": 340 }, { "epoch": 1.5358931552587647, "grad_norm": 0.5592398047447205, "learning_rate": 5.648333371088706e-05, "loss": 1.0056, "step": 345 }, { "epoch": 1.5581524763494712, "grad_norm": 0.5523474812507629, "learning_rate": 
5.519193644926535e-05, "loss": 1.0113, "step": 350 }, { "epoch": 1.5581524763494712, "eval_loss": 1.0144401788711548, "eval_runtime": 50.9085, "eval_samples_per_second": 15.695, "eval_steps_per_second": 1.964, "step": 350 }, { "epoch": 1.5804117974401781, "grad_norm": 0.5672771334648132, "learning_rate": 5.389702786475862e-05, "loss": 1.0022, "step": 355 }, { "epoch": 1.6026711185308848, "grad_norm": 0.5656226873397827, "learning_rate": 5.2599483708099016e-05, "loss": 0.9705, "step": 360 }, { "epoch": 1.6249304396215916, "grad_norm": 0.575705349445343, "learning_rate": 5.130018151246445e-05, "loss": 0.9638, "step": 365 }, { "epoch": 1.6471897607122983, "grad_norm": 0.5695839524269104, "learning_rate": 5e-05, "loss": 0.9957, "step": 370 }, { "epoch": 1.669449081803005, "grad_norm": 0.6090526580810547, "learning_rate": 4.869981848753556e-05, "loss": 0.9637, "step": 375 }, { "epoch": 1.6917084028937117, "grad_norm": 0.5683362483978271, "learning_rate": 4.740051629190099e-05, "loss": 0.9425, "step": 380 }, { "epoch": 1.7139677239844184, "grad_norm": 0.48340535163879395, "learning_rate": 4.61029721352414e-05, "loss": 0.9211, "step": 385 }, { "epoch": 1.7362270450751254, "grad_norm": 0.6130196452140808, "learning_rate": 4.480806355073467e-05, "loss": 0.9821, "step": 390 }, { "epoch": 1.7584863661658319, "grad_norm": 0.5705894231796265, "learning_rate": 4.351666628911295e-05, "loss": 0.8659, "step": 395 }, { "epoch": 1.7807456872565388, "grad_norm": 0.5951923131942749, "learning_rate": 4.2229653726389765e-05, "loss": 0.9537, "step": 400 }, { "epoch": 1.7807456872565388, "eval_loss": 1.010872483253479, "eval_runtime": 50.8897, "eval_samples_per_second": 15.701, "eval_steps_per_second": 1.965, "step": 400 }, { "epoch": 1.8030050083472453, "grad_norm": 0.535017728805542, "learning_rate": 4.094789627319298e-05, "loss": 1.009, "step": 405 }, { "epoch": 1.8252643294379522, "grad_norm": 0.5428237915039062, "learning_rate": 3.967226078610347e-05, "loss": 0.9533, "step": 410 }, { "epoch": 1.8475236505286587, "grad_norm": 0.5607818961143494, "learning_rate": 3.840360998139724e-05, "loss": 0.9488, "step": 415 }, { "epoch": 1.8697829716193657, "grad_norm": 0.5991472005844116, "learning_rate": 3.714280185158771e-05, "loss": 1.0467, "step": 420 }, { "epoch": 1.8920422927100722, "grad_norm": 0.6074681282043457, "learning_rate": 3.589068908516271e-05, "loss": 0.9183, "step": 425 }, { "epoch": 1.9143016138007791, "grad_norm": 0.5638883113861084, "learning_rate": 3.464811848990859e-05, "loss": 0.9581, "step": 430 }, { "epoch": 1.9365609348914858, "grad_norm": 0.5864625573158264, "learning_rate": 3.341593042021138e-05, "loss": 0.9824, "step": 435 }, { "epoch": 1.9588202559821926, "grad_norm": 0.5764511227607727, "learning_rate": 3.219495820872265e-05, "loss": 0.9699, "step": 440 }, { "epoch": 1.9810795770728993, "grad_norm": 0.5945712327957153, "learning_rate": 3.098602760277385e-05, "loss": 0.9828, "step": 445 }, { "epoch": 2.003338898163606, "grad_norm": 0.5488788485527039, "learning_rate": 2.978995620592092e-05, "loss": 0.9356, "step": 450 }, { "epoch": 2.003338898163606, "eval_loss": 1.0078463554382324, "eval_runtime": 50.8879, "eval_samples_per_second": 15.701, "eval_steps_per_second": 1.965, "step": 450 } ], "logging_steps": 5, "max_steps": 672, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} 
} }, "total_flos": 7.99240623805694e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }