{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 630, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.039860488290981565, "grad_norm": 11.324122796709977, "learning_rate": 2.53968253968254e-06, "loss": 1.6286, "step": 5 }, { "epoch": 0.07972097658196313, "grad_norm": 1.991457650057207, "learning_rate": 5.7142857142857145e-06, "loss": 1.4713, "step": 10 }, { "epoch": 0.11958146487294469, "grad_norm": 1.0610273535036991, "learning_rate": 8.888888888888888e-06, "loss": 1.3152, "step": 15 }, { "epoch": 0.15944195316392626, "grad_norm": 0.7999727467798167, "learning_rate": 1.2063492063492064e-05, "loss": 1.2185, "step": 20 }, { "epoch": 0.19930244145490783, "grad_norm": 0.6462858961182688, "learning_rate": 1.523809523809524e-05, "loss": 1.1579, "step": 25 }, { "epoch": 0.23916292974588937, "grad_norm": 0.47561233028025485, "learning_rate": 1.8412698412698415e-05, "loss": 1.1295, "step": 30 }, { "epoch": 0.279023418036871, "grad_norm": 0.4460008015131473, "learning_rate": 2.158730158730159e-05, "loss": 1.1069, "step": 35 }, { "epoch": 0.3188839063278525, "grad_norm": 0.35932407714480935, "learning_rate": 2.4761904761904766e-05, "loss": 1.0887, "step": 40 }, { "epoch": 0.35874439461883406, "grad_norm": 0.3717286127467533, "learning_rate": 2.7936507936507936e-05, "loss": 1.0869, "step": 45 }, { "epoch": 0.39860488290981566, "grad_norm": 0.38261468570309465, "learning_rate": 3.111111111111112e-05, "loss": 1.0774, "step": 50 }, { "epoch": 0.4384653712007972, "grad_norm": 0.383323190556911, "learning_rate": 3.4285714285714284e-05, "loss": 1.0636, "step": 55 }, { "epoch": 0.47832585949177875, "grad_norm": 0.4100491937186789, "learning_rate": 3.7460317460317464e-05, "loss": 1.0535, "step": 60 }, { "epoch": 0.5181863477827603, "grad_norm": 0.3975249651869064, "learning_rate": 3.9999693004141615e-05, "loss": 1.0297, "step": 65 }, { "epoch": 0.558046836073742, "grad_norm": 0.3819095926536229, "learning_rate": 3.998894913865352e-05, "loss": 1.0337, "step": 70 }, { "epoch": 0.5979073243647235, "grad_norm": 0.36490844056440797, "learning_rate": 3.9962864903591375e-05, "loss": 1.0063, "step": 75 }, { "epoch": 0.637767812655705, "grad_norm": 0.4076298707032981, "learning_rate": 3.992146031710637e-05, "loss": 1.0237, "step": 80 }, { "epoch": 0.6776283009466866, "grad_norm": 0.37566110161128824, "learning_rate": 3.9864767154838864e-05, "loss": 1.0145, "step": 85 }, { "epoch": 0.7174887892376681, "grad_norm": 0.34530643882424805, "learning_rate": 3.9792828925532376e-05, "loss": 1.0296, "step": 90 }, { "epoch": 0.7573492775286498, "grad_norm": 0.3432558970267317, "learning_rate": 3.970570083764316e-05, "loss": 1.0059, "step": 95 }, { "epoch": 0.7972097658196313, "grad_norm": 0.3408833181353496, "learning_rate": 3.9603449756970877e-05, "loss": 1.004, "step": 100 }, { "epoch": 0.8370702541106129, "grad_norm": 0.3589480700028462, "learning_rate": 3.948615415534294e-05, "loss": 0.9936, "step": 105 }, { "epoch": 0.8769307424015944, "grad_norm": 0.33854687709123976, "learning_rate": 3.9353904050391874e-05, "loss": 1.006, "step": 110 }, { "epoch": 0.916791230692576, "grad_norm": 0.3606947276733311, "learning_rate": 3.9206800936472e-05, "loss": 1.0033, "step": 115 }, { "epoch": 0.9566517189835575, "grad_norm": 0.3524202550527317, "learning_rate": 3.904495770676831e-05, "loss": 0.9917, "step": 120 }, { "epoch": 0.9965122072745392, "grad_norm": 0.3695008401673435, "learning_rate": 3.886849856665746e-05, "loss": 1.0137, "step": 125 }, { "epoch": 1.0318883906327851, "grad_norm": 0.5103901242908977, "learning_rate": 3.8677558938387276e-05, "loss": 0.885, "step": 130 }, { "epoch": 1.0717488789237668, "grad_norm": 0.39411785206718664, "learning_rate": 3.8472285357147966e-05, "loss": 0.8679, "step": 135 }, { "epoch": 1.1116093672147485, "grad_norm": 0.3898193416463423, "learning_rate": 3.825283535861476e-05, "loss": 0.8733, "step": 140 }, { "epoch": 1.15146985550573, "grad_norm": 0.7348802698342378, "learning_rate": 3.801937735804838e-05, "loss": 0.8434, "step": 145 }, { "epoch": 1.1913303437967115, "grad_norm": 0.36945102723915507, "learning_rate": 3.777209052104598e-05, "loss": 0.8461, "step": 150 }, { "epoch": 1.2311908320876932, "grad_norm": 0.4602742420015167, "learning_rate": 3.7511164626041823e-05, "loss": 0.8606, "step": 155 }, { "epoch": 1.2710513203786746, "grad_norm": 0.35621072409911864, "learning_rate": 3.7236799918663284e-05, "loss": 0.8555, "step": 160 }, { "epoch": 1.310911808669656, "grad_norm": 0.42801215480497146, "learning_rate": 3.6949206958053825e-05, "loss": 0.8437, "step": 165 }, { "epoch": 1.3507722969606377, "grad_norm": 0.36223537090099234, "learning_rate": 3.6648606455280944e-05, "loss": 0.8566, "step": 170 }, { "epoch": 1.3906327852516194, "grad_norm": 0.702392888184491, "learning_rate": 3.633522910395314e-05, "loss": 0.8665, "step": 175 }, { "epoch": 1.4304932735426008, "grad_norm": 0.36020937464475294, "learning_rate": 3.6009315403175786e-05, "loss": 0.8363, "step": 180 }, { "epoch": 1.4703537618335825, "grad_norm": 0.3500105740239205, "learning_rate": 3.567111547298194e-05, "loss": 0.853, "step": 185 }, { "epoch": 1.5102142501245641, "grad_norm": 0.4768600668120408, "learning_rate": 3.532088886237956e-05, "loss": 0.8496, "step": 190 }, { "epoch": 1.5500747384155455, "grad_norm": 0.36399360324025654, "learning_rate": 3.495890435016258e-05, "loss": 0.8636, "step": 195 }, { "epoch": 1.5899352267065272, "grad_norm": 0.3380619981423237, "learning_rate": 3.458543973863859e-05, "loss": 0.8538, "step": 200 }, { "epoch": 1.6297957149975089, "grad_norm": 0.3389261022528899, "learning_rate": 3.420078164043161e-05, "loss": 0.8591, "step": 205 }, { "epoch": 1.6696562032884903, "grad_norm": 0.395644963130336, "learning_rate": 3.38052252585233e-05, "loss": 0.8401, "step": 210 }, { "epoch": 1.7095166915794717, "grad_norm": 0.30378900619983906, "learning_rate": 3.339907415970168e-05, "loss": 0.8476, "step": 215 }, { "epoch": 1.7493771798704534, "grad_norm": 0.3900044033629726, "learning_rate": 3.298264004159104e-05, "loss": 0.8413, "step": 220 }, { "epoch": 1.789237668161435, "grad_norm": 0.36925152268695366, "learning_rate": 3.255624249344198e-05, "loss": 0.8534, "step": 225 }, { "epoch": 1.8290981564524165, "grad_norm": 0.3334498107167973, "learning_rate": 3.212020875086495e-05, "loss": 0.8734, "step": 230 }, { "epoch": 1.8689586447433981, "grad_norm": 0.3231836736531515, "learning_rate": 3.1674873444695804e-05, "loss": 0.8619, "step": 235 }, { "epoch": 1.9088191330343798, "grad_norm": 0.3352794024716405, "learning_rate": 3.122057834418582e-05, "loss": 0.8604, "step": 240 }, { "epoch": 1.9486796213253612, "grad_norm": 0.34760623558396486, "learning_rate": 3.075767209471345e-05, "loss": 0.8712, "step": 245 }, { "epoch": 1.9885401096163426, "grad_norm": 0.3494983937948561, "learning_rate": 3.0286509950219077e-05, "loss": 0.8449, "step": 250 }, { "epoch": 2.023916292974589, "grad_norm": 0.6438619381464517, "learning_rate": 2.9807453500567937e-05, "loss": 0.7382, "step": 255 }, { "epoch": 2.0637767812655703, "grad_norm": 0.48837947056765113, "learning_rate": 2.9320870394050783e-05, "loss": 0.6794, "step": 260 }, { "epoch": 2.103637269556552, "grad_norm": 0.4249085778688401, "learning_rate": 2.8827134055234883e-05, "loss": 0.6878, "step": 265 }, { "epoch": 2.1434977578475336, "grad_norm": 0.4112164304088742, "learning_rate": 2.8326623398382174e-05, "loss": 0.6895, "step": 270 }, { "epoch": 2.183358246138515, "grad_norm": 0.4393590314899133, "learning_rate": 2.781972253665431e-05, "loss": 0.6684, "step": 275 }, { "epoch": 2.223218734429497, "grad_norm": 0.37274870892195433, "learning_rate": 2.7306820487327906e-05, "loss": 0.6719, "step": 280 }, { "epoch": 2.2630792227204783, "grad_norm": 0.38062444497520725, "learning_rate": 2.6788310873246133e-05, "loss": 0.6735, "step": 285 }, { "epoch": 2.30293971101146, "grad_norm": 0.3754103984568679, "learning_rate": 2.62645916207358e-05, "loss": 0.6757, "step": 290 }, { "epoch": 2.3428001993024417, "grad_norm": 0.38094693535399177, "learning_rate": 2.5736064654221808e-05, "loss": 0.6544, "step": 295 }, { "epoch": 2.382660687593423, "grad_norm": 0.37267334400373975, "learning_rate": 2.5203135587773196e-05, "loss": 0.6612, "step": 300 }, { "epoch": 2.4225211758844045, "grad_norm": 0.3511334496003159, "learning_rate": 2.4666213413817696e-05, "loss": 0.6763, "step": 305 }, { "epoch": 2.4623816641753864, "grad_norm": 0.368576714561806, "learning_rate": 2.4125710189263555e-05, "loss": 0.6563, "step": 310 }, { "epoch": 2.502242152466368, "grad_norm": 0.35196724604768753, "learning_rate": 2.3582040719269504e-05, "loss": 0.65, "step": 315 }, { "epoch": 2.5421026407573493, "grad_norm": 0.3540473065272315, "learning_rate": 2.3035622238905694e-05, "loss": 0.6679, "step": 320 }, { "epoch": 2.5819631290483307, "grad_norm": 0.3661596143769676, "learning_rate": 2.2486874092949708e-05, "loss": 0.6738, "step": 325 }, { "epoch": 2.621823617339312, "grad_norm": 0.35913068756682465, "learning_rate": 2.1936217414063584e-05, "loss": 0.6887, "step": 330 }, { "epoch": 2.661684105630294, "grad_norm": 0.36799867292080646, "learning_rate": 2.138407479959869e-05, "loss": 0.6709, "step": 335 }, { "epoch": 2.7015445939212754, "grad_norm": 0.35691876781155074, "learning_rate": 2.0830869987276537e-05, "loss": 0.665, "step": 340 }, { "epoch": 2.741405082212257, "grad_norm": 0.38987523468576574, "learning_rate": 2.027702752999444e-05, "loss": 0.6528, "step": 345 }, { "epoch": 2.7812655705032387, "grad_norm": 0.38505381731754873, "learning_rate": 1.9722972470005573e-05, "loss": 0.6771, "step": 350 }, { "epoch": 2.82112605879422, "grad_norm": 0.35842950010166197, "learning_rate": 1.916913001272347e-05, "loss": 0.6638, "step": 355 }, { "epoch": 2.8609865470852016, "grad_norm": 0.37422580701088903, "learning_rate": 1.8615925200401318e-05, "loss": 0.6753, "step": 360 }, { "epoch": 2.9008470353761835, "grad_norm": 0.36860495212005495, "learning_rate": 1.806378258593642e-05, "loss": 0.6681, "step": 365 }, { "epoch": 2.940707523667165, "grad_norm": 0.3647903293380729, "learning_rate": 1.7513125907050302e-05, "loss": 0.6658, "step": 370 }, { "epoch": 2.9805680119581464, "grad_norm": 0.35870478733795147, "learning_rate": 1.6964377761094313e-05, "loss": 0.667, "step": 375 }, { "epoch": 3.0159441953163926, "grad_norm": 0.6451047782511159, "learning_rate": 1.6417959280730506e-05, "loss": 0.612, "step": 380 }, { "epoch": 3.055804683607374, "grad_norm": 0.7456786595663745, "learning_rate": 1.5874289810736452e-05, "loss": 0.5282, "step": 385 }, { "epoch": 3.095665171898356, "grad_norm": 0.4611479200453808, "learning_rate": 1.5333786586182308e-05, "loss": 0.4945, "step": 390 }, { "epoch": 3.1355256601893373, "grad_norm": 0.4624423710225065, "learning_rate": 1.4796864412226812e-05, "loss": 0.5178, "step": 395 }, { "epoch": 3.1753861484803187, "grad_norm": 0.3988674416095005, "learning_rate": 1.4263935345778202e-05, "loss": 0.5015, "step": 400 }, { "epoch": 3.2152466367713006, "grad_norm": 0.4171540538418279, "learning_rate": 1.37354083792642e-05, "loss": 0.4988, "step": 405 }, { "epoch": 3.255107125062282, "grad_norm": 0.4043531142190362, "learning_rate": 1.3211689126753879e-05, "loss": 0.4966, "step": 410 }, { "epoch": 3.2949676133532635, "grad_norm": 0.406846754086433, "learning_rate": 1.26931795126721e-05, "loss": 0.5081, "step": 415 }, { "epoch": 3.334828101644245, "grad_norm": 0.42193109699205417, "learning_rate": 1.2180277463345697e-05, "loss": 0.5088, "step": 420 }, { "epoch": 3.374688589935227, "grad_norm": 0.4039970787325296, "learning_rate": 1.167337660161783e-05, "loss": 0.5023, "step": 425 }, { "epoch": 3.4145490782262082, "grad_norm": 0.3891294704118612, "learning_rate": 1.1172865944765122e-05, "loss": 0.5054, "step": 430 }, { "epoch": 3.4544095665171897, "grad_norm": 0.4123628750841747, "learning_rate": 1.067912960594923e-05, "loss": 0.5078, "step": 435 }, { "epoch": 3.4942700548081715, "grad_norm": 0.4116103758152132, "learning_rate": 1.0192546499432066e-05, "loss": 0.5008, "step": 440 }, { "epoch": 3.534130543099153, "grad_norm": 0.39334581361661547, "learning_rate": 9.713490049780931e-06, "loss": 0.5117, "step": 445 }, { "epoch": 3.5739910313901344, "grad_norm": 0.3977163931160735, "learning_rate": 9.242327905286552e-06, "loss": 0.5004, "step": 450 }, { "epoch": 3.6138515196811163, "grad_norm": 0.41291346064789947, "learning_rate": 8.779421655814189e-06, "loss": 0.4899, "step": 455 }, { "epoch": 3.6537120079720977, "grad_norm": 0.39864981854696013, "learning_rate": 8.325126555304208e-06, "loss": 0.4925, "step": 460 }, { "epoch": 3.693572496263079, "grad_norm": 0.3783439845572638, "learning_rate": 7.879791249135059e-06, "loss": 0.4936, "step": 465 }, { "epoch": 3.733432984554061, "grad_norm": 0.36028247570266225, "learning_rate": 7.443757506558033e-06, "loss": 0.5071, "step": 470 }, { "epoch": 3.7732934728450425, "grad_norm": 0.36974308586061955, "learning_rate": 7.0173599584089625e-06, "loss": 0.4902, "step": 475 }, { "epoch": 3.813153961136024, "grad_norm": 0.3709767672807715, "learning_rate": 6.600925840298331e-06, "loss": 0.505, "step": 480 }, { "epoch": 3.8530144494270053, "grad_norm": 0.3610583139391416, "learning_rate": 6.1947747414767035e-06, "loss": 0.5208, "step": 485 }, { "epoch": 3.892874937717987, "grad_norm": 0.3791524633277678, "learning_rate": 5.799218359568395e-06, "loss": 0.5301, "step": 490 }, { "epoch": 3.9327354260089686, "grad_norm": 0.363724718810471, "learning_rate": 5.414560261361415e-06, "loss": 0.4855, "step": 495 }, { "epoch": 3.97259591429995, "grad_norm": 0.3867971355298049, "learning_rate": 5.041095649837429e-06, "loss": 0.4872, "step": 500 }, { "epoch": 4.007972097658197, "grad_norm": 0.6922635714147061, "learning_rate": 4.679111137620442e-06, "loss": 0.4901, "step": 505 }, { "epoch": 4.047832585949178, "grad_norm": 0.5464640833763156, "learning_rate": 4.328884527018067e-06, "loss": 0.4086, "step": 510 }, { "epoch": 4.08769307424016, "grad_norm": 0.5514248166968356, "learning_rate": 3.990684596824219e-06, "loss": 0.4042, "step": 515 }, { "epoch": 4.127553562531141, "grad_norm": 0.46194805338366773, "learning_rate": 3.6647708960468696e-06, "loss": 0.4028, "step": 520 }, { "epoch": 4.1674140508221225, "grad_norm": 0.45065104707045, "learning_rate": 3.3513935447190595e-06, "loss": 0.3937, "step": 525 }, { "epoch": 4.207274539113104, "grad_norm": 0.3884400001553067, "learning_rate": 3.050793041946183e-06, "loss": 0.3941, "step": 530 }, { "epoch": 4.247135027404085, "grad_norm": 0.4057662200999109, "learning_rate": 2.763200081336721e-06, "loss": 0.3913, "step": 535 }, { "epoch": 4.286995515695067, "grad_norm": 0.3927959847718352, "learning_rate": 2.488835373958185e-06, "loss": 0.3993, "step": 540 }, { "epoch": 4.326856003986049, "grad_norm": 0.3915532354537384, "learning_rate": 2.2279094789540244e-06, "loss": 0.4047, "step": 545 }, { "epoch": 4.36671649227703, "grad_norm": 0.37576385131138085, "learning_rate": 1.9806226419516195e-06, "loss": 0.402, "step": 550 }, { "epoch": 4.406576980568012, "grad_norm": 0.3589030046466532, "learning_rate": 1.7471646413852439e-06, "loss": 0.4037, "step": 555 }, { "epoch": 4.446437468858994, "grad_norm": 0.38802398319508147, "learning_rate": 1.527714642852045e-06, "loss": 0.3975, "step": 560 }, { "epoch": 4.486297957149975, "grad_norm": 0.3781055143033295, "learning_rate": 1.3224410616127292e-06, "loss": 0.3902, "step": 565 }, { "epoch": 4.526158445440957, "grad_norm": 0.380556572113873, "learning_rate": 1.1315014333425455e-06, "loss": 0.3955, "step": 570 }, { "epoch": 4.566018933731939, "grad_norm": 0.37627984573745005, "learning_rate": 9.550422932316938e-07, "loss": 0.3925, "step": 575 }, { "epoch": 4.60587942202292, "grad_norm": 0.36605761395168335, "learning_rate": 7.931990635280052e-07, "loss": 0.3929, "step": 580 }, { "epoch": 4.645739910313901, "grad_norm": 0.37403610200301257, "learning_rate": 6.460959496081276e-07, "loss": 0.4068, "step": 585 }, { "epoch": 4.685600398604883, "grad_norm": 0.3673645131484265, "learning_rate": 5.13845844657066e-07, "loss": 0.3962, "step": 590 }, { "epoch": 4.725460886895864, "grad_norm": 0.39800543909133185, "learning_rate": 3.965502430291235e-07, "loss": 0.3912, "step": 595 }, { "epoch": 4.765321375186846, "grad_norm": 0.37969719371341065, "learning_rate": 2.942991623568436e-07, "loss": 0.3801, "step": 600 }, { "epoch": 4.805181863477827, "grad_norm": 0.3777219824944034, "learning_rate": 2.0717107446762696e-07, "loss": 0.369, "step": 605 }, { "epoch": 4.845042351768809, "grad_norm": 0.3913050535737085, "learning_rate": 1.3523284516113955e-07, "loss": 0.3982, "step": 610 }, { "epoch": 4.884902840059791, "grad_norm": 0.3712369916416077, "learning_rate": 7.853968289363245e-08, "loss": 0.3916, "step": 615 }, { "epoch": 4.924763328350773, "grad_norm": 0.38101325204255376, "learning_rate": 3.7135096408631443e-08, "loss": 0.3978, "step": 620 }, { "epoch": 4.964623816641754, "grad_norm": 0.37989319704275204, "learning_rate": 1.1050861346488806e-08, "loss": 0.389, "step": 625 }, { "epoch": 5.0, "grad_norm": 0.5514474703259893, "learning_rate": 3.069958583856725e-10, "loss": 0.4095, "step": 630 }, { "epoch": 5.0, "step": 630, "total_flos": 1235947560108032.0, "train_loss": 0.7053156269921197, "train_runtime": 61423.1886, "train_samples_per_second": 1.307, "train_steps_per_second": 0.01 } ], "logging_steps": 5, "max_steps": 630, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1235947560108032.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }