{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 620, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0032258064516129032, "grad_norm": 3.971249580383301, "learning_rate": 1.6129032258064518e-07, "loss": 0.6284, "step": 1 }, { "epoch": 0.016129032258064516, "grad_norm": 5.593815326690674, "learning_rate": 8.064516129032258e-07, "loss": 0.6406, "step": 5 }, { "epoch": 0.03225806451612903, "grad_norm": 3.338022232055664, "learning_rate": 1.6129032258064516e-06, "loss": 0.6179, "step": 10 }, { "epoch": 0.04838709677419355, "grad_norm": 2.572110891342163, "learning_rate": 2.4193548387096776e-06, "loss": 0.5695, "step": 15 }, { "epoch": 0.06451612903225806, "grad_norm": 2.096151113510132, "learning_rate": 3.225806451612903e-06, "loss": 0.5072, "step": 20 }, { "epoch": 0.08064516129032258, "grad_norm": 2.135874032974243, "learning_rate": 4.032258064516129e-06, "loss": 0.4539, "step": 25 }, { "epoch": 0.0967741935483871, "grad_norm": 1.8665850162506104, "learning_rate": 4.838709677419355e-06, "loss": 0.394, "step": 30 }, { "epoch": 0.11290322580645161, "grad_norm": 1.8506455421447754, "learning_rate": 5.645161290322582e-06, "loss": 0.3099, "step": 35 }, { "epoch": 0.12903225806451613, "grad_norm": 1.882623314857483, "learning_rate": 6.451612903225806e-06, "loss": 0.2164, "step": 40 }, { "epoch": 0.14516129032258066, "grad_norm": 2.4444594383239746, "learning_rate": 7.258064516129033e-06, "loss": 0.1262, "step": 45 }, { "epoch": 0.16129032258064516, "grad_norm": 1.5979326963424683, "learning_rate": 8.064516129032258e-06, "loss": 0.068, "step": 50 }, { "epoch": 0.1774193548387097, "grad_norm": 3.323439359664917, "learning_rate": 8.870967741935484e-06, "loss": 0.0337, "step": 55 }, { "epoch": 0.1935483870967742, "grad_norm": 1.5633885860443115, "learning_rate": 9.67741935483871e-06, "loss": 0.0272, "step": 60 }, { "epoch": 0.20967741935483872, "grad_norm": 1.1317998170852661, "learning_rate": 9.99928681279855e-06, "loss": 0.0208, "step": 65 }, { "epoch": 0.22580645161290322, "grad_norm": 0.4592137932777405, "learning_rate": 9.994929183335237e-06, "loss": 0.0158, "step": 70 }, { "epoch": 0.24193548387096775, "grad_norm": 0.7832735180854797, "learning_rate": 9.986613588305435e-06, "loss": 0.0145, "step": 75 }, { "epoch": 0.25806451612903225, "grad_norm": 0.8761097192764282, "learning_rate": 9.974346616959476e-06, "loss": 0.0104, "step": 80 }, { "epoch": 0.27419354838709675, "grad_norm": 0.9911714196205139, "learning_rate": 9.95813798960538e-06, "loss": 0.0115, "step": 85 }, { "epoch": 0.2903225806451613, "grad_norm": 0.48290345072746277, "learning_rate": 9.938000549906509e-06, "loss": 0.0092, "step": 90 }, { "epoch": 0.3064516129032258, "grad_norm": 0.8782544136047363, "learning_rate": 9.913950254704291e-06, "loss": 0.0167, "step": 95 }, { "epoch": 0.3225806451612903, "grad_norm": 0.827684223651886, "learning_rate": 9.88600616137407e-06, "loss": 0.0087, "step": 100 }, { "epoch": 0.3387096774193548, "grad_norm": 0.4351416826248169, "learning_rate": 9.854190412724114e-06, "loss": 0.0092, "step": 105 }, { "epoch": 0.3548387096774194, "grad_norm": 0.617416262626648, "learning_rate": 9.818528219449705e-06, "loss": 0.0106, "step": 110 }, { "epoch": 0.3709677419354839, "grad_norm": 0.3410218060016632, "learning_rate": 9.779047840156288e-06, "loss": 0.0092, "step": 115 }, { "epoch": 0.3870967741935484, "grad_norm": 0.6016420722007751, "learning_rate": 9.735780558967434e-06, "loss": 0.0081, "step": 120 }, { "epoch": 0.4032258064516129, "grad_norm": 0.34005704522132874, "learning_rate": 9.688760660735403e-06, "loss": 0.0075, "step": 125 }, { "epoch": 0.41935483870967744, "grad_norm": 1.3894850015640259, "learning_rate": 9.638025403873939e-06, "loss": 0.0074, "step": 130 }, { "epoch": 0.43548387096774194, "grad_norm": 0.3171682059764862, "learning_rate": 9.58361499083483e-06, "loss": 0.007, "step": 135 }, { "epoch": 0.45161290322580644, "grad_norm": 0.5876194834709167, "learning_rate": 9.525572536251608e-06, "loss": 0.0085, "step": 140 }, { "epoch": 0.46774193548387094, "grad_norm": 0.47260814905166626, "learning_rate": 9.46394403277566e-06, "loss": 0.0067, "step": 145 }, { "epoch": 0.4838709677419355, "grad_norm": 0.6115548014640808, "learning_rate": 9.398778314631801e-06, "loss": 0.0084, "step": 150 }, { "epoch": 0.5, "grad_norm": 0.44270601868629456, "learning_rate": 9.330127018922195e-06, "loss": 0.0063, "step": 155 }, { "epoch": 0.5161290322580645, "grad_norm": 0.5065975785255432, "learning_rate": 9.258044544709276e-06, "loss": 0.0079, "step": 160 }, { "epoch": 0.532258064516129, "grad_norm": 0.9441617131233215, "learning_rate": 9.182588009910119e-06, "loss": 0.0075, "step": 165 }, { "epoch": 0.5483870967741935, "grad_norm": 0.7031365036964417, "learning_rate": 9.103817206036383e-06, "loss": 0.0067, "step": 170 }, { "epoch": 0.5645161290322581, "grad_norm": 0.6422185301780701, "learning_rate": 9.021794550815713e-06, "loss": 0.0088, "step": 175 }, { "epoch": 0.5806451612903226, "grad_norm": 0.547900378704071, "learning_rate": 8.936585038732143e-06, "loss": 0.006, "step": 180 }, { "epoch": 0.5967741935483871, "grad_norm": 0.6972706913948059, "learning_rate": 8.848256189524661e-06, "loss": 0.0057, "step": 185 }, { "epoch": 0.6129032258064516, "grad_norm": 0.37197422981262207, "learning_rate": 8.756877994684818e-06, "loss": 0.0049, "step": 190 }, { "epoch": 0.6290322580645161, "grad_norm": 0.2147480994462967, "learning_rate": 8.66252286199567e-06, "loss": 0.0081, "step": 195 }, { "epoch": 0.6451612903225806, "grad_norm": 0.44903531670570374, "learning_rate": 8.565265558156101e-06, "loss": 0.0055, "step": 200 }, { "epoch": 0.6612903225806451, "grad_norm": 1.0128227472305298, "learning_rate": 8.465183149535939e-06, "loss": 0.0054, "step": 205 }, { "epoch": 0.6774193548387096, "grad_norm": 0.4877210557460785, "learning_rate": 8.362354941108803e-06, "loss": 0.0057, "step": 210 }, { "epoch": 0.6935483870967742, "grad_norm": 0.2644139230251312, "learning_rate": 8.256862413611113e-06, "loss": 0.005, "step": 215 }, { "epoch": 0.7096774193548387, "grad_norm": 0.6257811784744263, "learning_rate": 8.148789158977012e-06, "loss": 0.0055, "step": 220 }, { "epoch": 0.7258064516129032, "grad_norm": 0.19504040479660034, "learning_rate": 8.038220814100403e-06, "loss": 0.005, "step": 225 }, { "epoch": 0.7419354838709677, "grad_norm": 0.5125346779823303, "learning_rate": 7.925244992976538e-06, "loss": 0.0055, "step": 230 }, { "epoch": 0.7580645161290323, "grad_norm": 0.46777665615081787, "learning_rate": 7.809951217276986e-06, "loss": 0.0052, "step": 235 }, { "epoch": 0.7741935483870968, "grad_norm": 0.5273892283439636, "learning_rate": 7.692430845412946e-06, "loss": 0.006, "step": 240 }, { "epoch": 0.7903225806451613, "grad_norm": 0.5478794574737549, "learning_rate": 7.572777000143145e-06, "loss": 0.0078, "step": 245 }, { "epoch": 0.8064516129032258, "grad_norm": 0.35614484548568726, "learning_rate": 7.451084494783668e-06, "loss": 0.0051, "step": 250 }, { "epoch": 0.8225806451612904, "grad_norm": 0.42776939272880554, "learning_rate": 7.327449758078194e-06, "loss": 0.0057, "step": 255 }, { "epoch": 0.8387096774193549, "grad_norm": 0.19282685220241547, "learning_rate": 7.201970757788172e-06, "loss": 0.0039, "step": 260 }, { "epoch": 0.8548387096774194, "grad_norm": 0.7167965173721313, "learning_rate": 7.074746923063497e-06, "loss": 0.0042, "step": 265 }, { "epoch": 0.8709677419354839, "grad_norm": 3.5436365604400635, "learning_rate": 6.945879065655164e-06, "loss": 0.0042, "step": 270 }, { "epoch": 0.8870967741935484, "grad_norm": 0.3570103347301483, "learning_rate": 6.815469300032374e-06, "loss": 0.0043, "step": 275 }, { "epoch": 0.9032258064516129, "grad_norm": 0.252421110868454, "learning_rate": 6.6836209624673575e-06, "loss": 0.0056, "step": 280 }, { "epoch": 0.9193548387096774, "grad_norm": 0.2856729030609131, "learning_rate": 6.5504385291520554e-06, "loss": 0.0052, "step": 285 }, { "epoch": 0.9354838709677419, "grad_norm": 0.25228211283683777, "learning_rate": 6.41602753341152e-06, "loss": 0.0055, "step": 290 }, { "epoch": 0.9516129032258065, "grad_norm": 0.29258614778518677, "learning_rate": 6.2804944820796596e-06, "loss": 0.0037, "step": 295 }, { "epoch": 0.967741935483871, "grad_norm": 0.1690083146095276, "learning_rate": 6.143946771103561e-06, "loss": 0.0032, "step": 300 }, { "epoch": 0.9838709677419355, "grad_norm": 0.45113542675971985, "learning_rate": 6.006492600443301e-06, "loss": 0.0036, "step": 305 }, { "epoch": 1.0, "grad_norm": 0.34520477056503296, "learning_rate": 5.8682408883346535e-06, "loss": 0.0049, "step": 310 }, { "epoch": 1.0, "eval_loss": 0.004980116616934538, "eval_runtime": 3.8044, "eval_samples_per_second": 0.789, "eval_steps_per_second": 0.789, "step": 310 }, { "epoch": 1.0161290322580645, "grad_norm": 0.19288980960845947, "learning_rate": 5.729301184982622e-06, "loss": 0.0034, "step": 315 }, { "epoch": 1.032258064516129, "grad_norm": 0.19823557138442993, "learning_rate": 5.5897835857542315e-06, "loss": 0.0026, "step": 320 }, { "epoch": 1.0483870967741935, "grad_norm": 0.09697025269269943, "learning_rate": 5.449798643939305e-06, "loss": 0.0024, "step": 325 }, { "epoch": 1.064516129032258, "grad_norm": 0.23041202127933502, "learning_rate": 5.30945728314841e-06, "loss": 0.0028, "step": 330 }, { "epoch": 1.0806451612903225, "grad_norm": 0.27025359869003296, "learning_rate": 5.168870709417342e-06, "loss": 0.0034, "step": 335 }, { "epoch": 1.096774193548387, "grad_norm": 0.3000248968601227, "learning_rate": 5.0281503230878304e-06, "loss": 0.0033, "step": 340 }, { "epoch": 1.1129032258064515, "grad_norm": 0.22583173215389252, "learning_rate": 4.887407630534271e-06, "loss": 0.0027, "step": 345 }, { "epoch": 1.129032258064516, "grad_norm": 0.21011537313461304, "learning_rate": 4.746754155806437e-06, "loss": 0.0026, "step": 350 }, { "epoch": 1.1451612903225807, "grad_norm": 0.13884232938289642, "learning_rate": 4.606301352258192e-06, "loss": 0.0028, "step": 355 }, { "epoch": 1.1612903225806452, "grad_norm": 0.25201186537742615, "learning_rate": 4.466160514232206e-06, "loss": 0.0027, "step": 360 }, { "epoch": 1.1774193548387097, "grad_norm": 0.22268442809581757, "learning_rate": 4.326442688870697e-06, "loss": 0.0028, "step": 365 }, { "epoch": 1.1935483870967742, "grad_norm": 0.21970783174037933, "learning_rate": 4.187258588122019e-06, "loss": 0.0031, "step": 370 }, { "epoch": 1.2096774193548387, "grad_norm": 0.26229333877563477, "learning_rate": 4.048718501012895e-06, "loss": 0.0026, "step": 375 }, { "epoch": 1.2258064516129032, "grad_norm": 0.25772520899772644, "learning_rate": 3.910932206255742e-06, "loss": 0.0022, "step": 380 }, { "epoch": 1.2419354838709677, "grad_norm": 0.4050711691379547, "learning_rate": 3.77400888526038e-06, "loss": 0.0023, "step": 385 }, { "epoch": 1.2580645161290323, "grad_norm": 0.2486872673034668, "learning_rate": 3.6380570356190346e-06, "loss": 0.0036, "step": 390 }, { "epoch": 1.2741935483870968, "grad_norm": 0.20115166902542114, "learning_rate": 3.5031843851332105e-06, "loss": 0.0017, "step": 395 }, { "epoch": 1.2903225806451613, "grad_norm": 0.19394518435001373, "learning_rate": 3.3694978064505258e-06, "loss": 0.0029, "step": 400 }, { "epoch": 1.3064516129032258, "grad_norm": 0.25958162546157837, "learning_rate": 3.2371032323791757e-06, "loss": 0.003, "step": 405 }, { "epoch": 1.3225806451612903, "grad_norm": 0.27680277824401855, "learning_rate": 3.10610557194712e-06, "loss": 0.002, "step": 410 }, { "epoch": 1.3387096774193548, "grad_norm": 0.31953132152557373, "learning_rate": 2.97660862727252e-06, "loss": 0.0039, "step": 415 }, { "epoch": 1.3548387096774195, "grad_norm": 0.3594481647014618, "learning_rate": 2.848715011311271e-06, "loss": 0.0031, "step": 420 }, { "epoch": 1.370967741935484, "grad_norm": 0.15407758951187134, "learning_rate": 2.72252606654683e-06, "loss": 0.0032, "step": 425 }, { "epoch": 1.3870967741935485, "grad_norm": 0.25268280506134033, "learning_rate": 2.5981417846867753e-06, "loss": 0.0029, "step": 430 }, { "epoch": 1.403225806451613, "grad_norm": 0.19317582249641418, "learning_rate": 2.4756607274296844e-06, "loss": 0.0035, "step": 435 }, { "epoch": 1.4193548387096775, "grad_norm": 0.15693552792072296, "learning_rate": 2.3551799483651894e-06, "loss": 0.0026, "step": 440 }, { "epoch": 1.435483870967742, "grad_norm": 0.19608916342258453, "learning_rate": 2.236794916069007e-06, "loss": 0.0028, "step": 445 }, { "epoch": 1.4516129032258065, "grad_norm": 0.403870552778244, "learning_rate": 2.120599438453968e-06, "loss": 0.0032, "step": 450 }, { "epoch": 1.467741935483871, "grad_norm": 0.16400307416915894, "learning_rate": 2.0066855884369246e-06, "loss": 0.0025, "step": 455 }, { "epoch": 1.4838709677419355, "grad_norm": 0.954981803894043, "learning_rate": 1.8951436309804766e-06, "loss": 0.0024, "step": 460 }, { "epoch": 1.5, "grad_norm": 0.23671939969062805, "learning_rate": 1.7860619515673034e-06, "loss": 0.0029, "step": 465 }, { "epoch": 1.5161290322580645, "grad_norm": 0.1934683918952942, "learning_rate": 1.6795269861638041e-06, "loss": 0.0033, "step": 470 }, { "epoch": 1.532258064516129, "grad_norm": 0.12341190874576569, "learning_rate": 1.5756231527285181e-06, "loss": 0.0027, "step": 475 }, { "epoch": 1.5483870967741935, "grad_norm": 0.44817498326301575, "learning_rate": 1.4744327843196043e-06, "loss": 0.0034, "step": 480 }, { "epoch": 1.564516129032258, "grad_norm": 0.29806482791900635, "learning_rate": 1.3760360638544012e-06, "loss": 0.0031, "step": 485 }, { "epoch": 1.5806451612903225, "grad_norm": 0.19129404425621033, "learning_rate": 1.280510960572745e-06, "loss": 0.0017, "step": 490 }, { "epoch": 1.596774193548387, "grad_norm": 0.21360230445861816, "learning_rate": 1.1879331682543972e-06, "loss": 0.0034, "step": 495 }, { "epoch": 1.6129032258064515, "grad_norm": 0.2776956260204315, "learning_rate": 1.0983760452395415e-06, "loss": 0.0021, "step": 500 }, { "epoch": 1.629032258064516, "grad_norm": 0.15755566954612732, "learning_rate": 1.01191055629987e-06, "loss": 0.0019, "step": 505 }, { "epoch": 1.6451612903225805, "grad_norm": 0.3595326840877533, "learning_rate": 9.286052164063369e-07, "loss": 0.0023, "step": 510 }, { "epoch": 1.661290322580645, "grad_norm": 0.19125986099243164, "learning_rate": 8.485260364381187e-07, "loss": 0.0041, "step": 515 }, { "epoch": 1.6774193548387095, "grad_norm": 0.3362486958503723, "learning_rate": 7.717364708758024e-07, "loss": 0.002, "step": 520 }, { "epoch": 1.6935483870967742, "grad_norm": 0.23303864896297455, "learning_rate": 6.982973675202676e-07, "loss": 0.0017, "step": 525 }, { "epoch": 1.7096774193548387, "grad_norm": 0.13133780658245087, "learning_rate": 6.282669192770896e-07, "loss": 0.0024, "step": 530 }, { "epoch": 1.7258064516129032, "grad_norm": 0.35981041193008423, "learning_rate": 5.617006180446688e-07, "loss": 0.0033, "step": 535 }, { "epoch": 1.7419354838709677, "grad_norm": 0.41394540667533875, "learning_rate": 4.986512107426283e-07, "loss": 0.003, "step": 540 }, { "epoch": 1.7580645161290323, "grad_norm": 0.24306242167949677, "learning_rate": 4.3916865751533313e-07, "loss": 0.003, "step": 545 }, { "epoch": 1.7741935483870968, "grad_norm": 0.1294822096824646, "learning_rate": 3.8330009214363197e-07, "loss": 0.0028, "step": 550 }, { "epoch": 1.7903225806451613, "grad_norm": 0.12696751952171326, "learning_rate": 3.310897846962041e-07, "loss": 0.0027, "step": 555 }, { "epoch": 1.8064516129032258, "grad_norm": 0.27052465081214905, "learning_rate": 2.8257910645009935e-07, "loss": 0.0034, "step": 560 }, { "epoch": 1.8225806451612905, "grad_norm": 0.21007302403450012, "learning_rate": 2.3780649710827552e-07, "loss": 0.0029, "step": 565 }, { "epoch": 1.838709677419355, "grad_norm": 0.20920442044734955, "learning_rate": 1.9680743434010385e-07, "loss": 0.0026, "step": 570 }, { "epoch": 1.8548387096774195, "grad_norm": 0.2021329551935196, "learning_rate": 1.5961440566897913e-07, "loss": 0.0026, "step": 575 }, { "epoch": 1.870967741935484, "grad_norm": 0.23637109994888306, "learning_rate": 1.2625688272930925e-07, "loss": 0.0036, "step": 580 }, { "epoch": 1.8870967741935485, "grad_norm": 0.17650996148586273, "learning_rate": 9.676129791329481e-08, "loss": 0.0024, "step": 585 }, { "epoch": 1.903225806451613, "grad_norm": 0.19747471809387207, "learning_rate": 7.115102342598101e-08, "loss": 0.0035, "step": 590 }, { "epoch": 1.9193548387096775, "grad_norm": 0.6039708852767944, "learning_rate": 4.944635276520393e-08, "loss": 0.0029, "step": 595 }, { "epoch": 1.935483870967742, "grad_norm": 0.3589779734611511, "learning_rate": 3.166448464108629e-08, "loss": 0.0049, "step": 600 }, { "epoch": 1.9516129032258065, "grad_norm": 0.12553617358207703, "learning_rate": 1.781950934783505e-08, "loss": 0.0027, "step": 605 }, { "epoch": 1.967741935483871, "grad_norm": 0.1768651306629181, "learning_rate": 7.922397598642551e-09, "loss": 0.0018, "step": 610 }, { "epoch": 1.9838709677419355, "grad_norm": 0.24496199190616608, "learning_rate": 1.980991832524759e-09, "loss": 0.0021, "step": 615 }, { "epoch": 2.0, "grad_norm": 0.2647761106491089, "learning_rate": 0.0, "loss": 0.0026, "step": 620 }, { "epoch": 2.0, "eval_loss": 0.004049910232424736, "eval_runtime": 3.8097, "eval_samples_per_second": 0.787, "eval_steps_per_second": 0.787, "step": 620 }, { "epoch": 2.0, "step": 620, "total_flos": 1.1181005894949274e+17, "train_loss": 0.03632537112270873, "train_runtime": 2624.7531, "train_samples_per_second": 0.236, "train_steps_per_second": 0.236 } ], "logging_steps": 5, "max_steps": 620, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.1181005894949274e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }