{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 984, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 70.84053355134651, "learning_rate": 2.0202020202020206e-06, "loss": 16.5976, "step": 1 }, { "epoch": 0.02, "grad_norm": 70.40484875837613, "learning_rate": 1.0101010101010101e-05, "loss": 16.2878, "step": 5 }, { "epoch": 0.03, "grad_norm": 64.13592776682735, "learning_rate": 2.0202020202020203e-05, "loss": 15.6234, "step": 10 }, { "epoch": 0.05, "grad_norm": 48.17657464142415, "learning_rate": 3.0303030303030306e-05, "loss": 13.1692, "step": 15 }, { "epoch": 0.06, "grad_norm": 42.09774383233372, "learning_rate": 4.0404040404040405e-05, "loss": 9.9677, "step": 20 }, { "epoch": 0.08, "grad_norm": 21.40017494750998, "learning_rate": 5.050505050505051e-05, "loss": 6.438, "step": 25 }, { "epoch": 0.09, "grad_norm": 13.47184864955966, "learning_rate": 6.060606060606061e-05, "loss": 4.5534, "step": 30 }, { "epoch": 0.11, "grad_norm": 9.0175134340465, "learning_rate": 7.07070707070707e-05, "loss": 3.3841, "step": 35 }, { "epoch": 0.12, "grad_norm": 7.9069963595872474, "learning_rate": 8.080808080808081e-05, "loss": 2.4486, "step": 40 }, { "epoch": 0.14, "grad_norm": 4.497563859166001, "learning_rate": 9.090909090909092e-05, "loss": 1.9157, "step": 45 }, { "epoch": 0.15, "grad_norm": 3.8365103057277237, "learning_rate": 0.00010101010101010102, "loss": 1.6253, "step": 50 }, { "epoch": 0.17, "grad_norm": 2.6711072114914822, "learning_rate": 0.00011111111111111112, "loss": 1.4531, "step": 55 }, { "epoch": 0.18, "grad_norm": 2.8242292867987038, "learning_rate": 0.00012121212121212122, "loss": 1.355, "step": 60 }, { "epoch": 0.2, "grad_norm": 1.5627627321505486, "learning_rate": 0.00013131313131313133, "loss": 1.3144, "step": 65 }, { "epoch": 0.21, "grad_norm": 1.9610550212854403, "learning_rate": 0.0001414141414141414, "loss": 1.2403, "step": 70 }, { "epoch": 0.23, "grad_norm": 1.4826190020479229, "learning_rate": 0.00015151515151515152, "loss": 1.2116, "step": 75 }, { "epoch": 0.24, "grad_norm": 1.4179410680444433, "learning_rate": 0.00016161616161616162, "loss": 1.1361, "step": 80 }, { "epoch": 0.26, "grad_norm": 1.6440373948698321, "learning_rate": 0.00017171717171717173, "loss": 1.0898, "step": 85 }, { "epoch": 0.27, "grad_norm": 1.4741470072787244, "learning_rate": 0.00018181818181818183, "loss": 1.1321, "step": 90 }, { "epoch": 0.29, "grad_norm": 1.4918119127052252, "learning_rate": 0.00019191919191919191, "loss": 1.1502, "step": 95 }, { "epoch": 0.3, "grad_norm": 1.780737333023736, "learning_rate": 0.0001999993699387764, "loss": 1.0674, "step": 100 }, { "epoch": 0.32, "grad_norm": 1.5713634753942138, "learning_rate": 0.0001999773186295914, "loss": 1.0976, "step": 105 }, { "epoch": 0.34, "grad_norm": 1.4792461465668252, "learning_rate": 0.0001999237721983199, "loss": 1.0793, "step": 110 }, { "epoch": 0.35, "grad_norm": 1.414080214214796, "learning_rate": 0.0001998387475133018, "loss": 1.1008, "step": 115 }, { "epoch": 0.37, "grad_norm": 1.4612237811023239, "learning_rate": 0.00019972227135924052, "loss": 1.0824, "step": 120 }, { "epoch": 0.38, "grad_norm": 1.5484440969715458, "learning_rate": 0.00019957438042876542, "loss": 1.0178, "step": 125 }, { "epoch": 0.4, "grad_norm": 2.0098511861836643, "learning_rate": 0.0001993951213108726, "loss": 1.0397, "step": 130 }, { "epoch": 0.41, "grad_norm": 1.461268889218997, "learning_rate": 0.00019918455047624847, "loss": 1.0558, "step": 135 }, { "epoch": 0.43, "grad_norm": 1.816402545663251, "learning_rate": 0.00019894273425948003, "loss": 0.9983, "step": 140 }, { "epoch": 0.44, "grad_norm": 1.5043083827737078, "learning_rate": 0.0001986697488381581, "loss": 0.9565, "step": 145 }, { "epoch": 0.46, "grad_norm": 1.5595660341738484, "learning_rate": 0.00019836568020887964, "loss": 0.9811, "step": 150 }, { "epoch": 0.47, "grad_norm": 1.3010086010553632, "learning_rate": 0.00019803062416015675, "loss": 1.05, "step": 155 }, { "epoch": 0.49, "grad_norm": 1.936074537977103, "learning_rate": 0.0001976646862422413, "loss": 1.0033, "step": 160 }, { "epoch": 0.5, "grad_norm": 2.43894566672306, "learning_rate": 0.00019726798173387415, "loss": 0.9904, "step": 165 }, { "epoch": 0.52, "grad_norm": 2.0226681751936604, "learning_rate": 0.0001968406356059696, "loss": 1.0087, "step": 170 }, { "epoch": 0.53, "grad_norm": 1.5765984883843074, "learning_rate": 0.00019638278248224688, "loss": 1.0074, "step": 175 }, { "epoch": 0.55, "grad_norm": 1.4953203184933894, "learning_rate": 0.00019589456659682057, "loss": 0.9663, "step": 180 }, { "epoch": 0.56, "grad_norm": 1.3905440186304812, "learning_rate": 0.00019537614174876356, "loss": 0.9621, "step": 185 }, { "epoch": 0.58, "grad_norm": 1.7108204646804686, "learning_rate": 0.0001948276712536569, "loss": 0.958, "step": 190 }, { "epoch": 0.59, "grad_norm": 1.146705235438316, "learning_rate": 0.00019424932789214157, "loss": 1.0311, "step": 195 }, { "epoch": 0.61, "grad_norm": 1.1584586792685783, "learning_rate": 0.00019364129385548892, "loss": 1.0171, "step": 200 }, { "epoch": 0.62, "grad_norm": 1.629476089763749, "learning_rate": 0.00019300376068820602, "loss": 0.9775, "step": 205 }, { "epoch": 0.64, "grad_norm": 1.2581783118467773, "learning_rate": 0.00019233692922769496, "loss": 1.0471, "step": 210 }, { "epoch": 0.66, "grad_norm": 1.4906993491973393, "learning_rate": 0.0001916410095409843, "loss": 0.9967, "step": 215 }, { "epoch": 0.67, "grad_norm": 1.1893776969242746, "learning_rate": 0.00019091622085855363, "loss": 0.9619, "step": 220 }, { "epoch": 0.69, "grad_norm": 1.4151710347672513, "learning_rate": 0.00019016279150527044, "loss": 1.0227, "step": 225 }, { "epoch": 0.7, "grad_norm": 1.2558733422369428, "learning_rate": 0.00018938095882846306, "loss": 0.9832, "step": 230 }, { "epoch": 0.72, "grad_norm": 1.3320637162932236, "learning_rate": 0.00018857096912315062, "loss": 0.9802, "step": 235 }, { "epoch": 0.73, "grad_norm": 1.2878623504565412, "learning_rate": 0.00018773307755445465, "loss": 0.9251, "step": 240 }, { "epoch": 0.75, "grad_norm": 1.213232935367416, "learning_rate": 0.0001868675480772163, "loss": 0.98, "step": 245 }, { "epoch": 0.76, "grad_norm": 1.3405116881469274, "learning_rate": 0.00018597465335284437, "loss": 0.9473, "step": 250 }, { "epoch": 0.78, "grad_norm": 1.543540810073074, "learning_rate": 0.0001850546746634211, "loss": 0.9902, "step": 255 }, { "epoch": 0.79, "grad_norm": 1.3710808148280944, "learning_rate": 0.00018410790182309169, "loss": 0.975, "step": 260 }, { "epoch": 0.81, "grad_norm": 1.21692171592665, "learning_rate": 0.00018313463308676636, "loss": 0.9613, "step": 265 }, { "epoch": 0.82, "grad_norm": 1.1934007014753554, "learning_rate": 0.00018213517505616338, "loss": 0.9618, "step": 270 }, { "epoch": 0.84, "grad_norm": 1.60819637926966, "learning_rate": 0.00018110984258322237, "loss": 0.9939, "step": 275 }, { "epoch": 0.85, "grad_norm": 1.2454326978067687, "learning_rate": 0.00018005895867091896, "loss": 0.9191, "step": 280 }, { "epoch": 0.87, "grad_norm": 0.9690417217411257, "learning_rate": 0.0001789828543715116, "loss": 0.9421, "step": 285 }, { "epoch": 0.88, "grad_norm": 1.3978495893847322, "learning_rate": 0.0001778818686822523, "loss": 0.9757, "step": 290 }, { "epoch": 0.9, "grad_norm": 1.2683425234668648, "learning_rate": 0.00017675634843859514, "loss": 0.9902, "step": 295 }, { "epoch": 0.91, "grad_norm": 1.2072894977789046, "learning_rate": 0.00017560664820493498, "loss": 0.9517, "step": 300 }, { "epoch": 0.93, "grad_norm": 1.179285109564365, "learning_rate": 0.00017443313016291185, "loss": 0.9657, "step": 305 }, { "epoch": 0.95, "grad_norm": 1.1314300153647592, "learning_rate": 0.00017323616399731533, "loss": 0.9438, "step": 310 }, { "epoch": 0.96, "grad_norm": 1.0638903666221389, "learning_rate": 0.0001720161267796256, "loss": 1.0097, "step": 315 }, { "epoch": 0.98, "grad_norm": 1.0687942439764777, "learning_rate": 0.00017077340284922732, "loss": 0.9453, "step": 320 }, { "epoch": 0.99, "grad_norm": 1.2396289777933702, "learning_rate": 0.000169508383692334, "loss": 0.9388, "step": 325 }, { "epoch": 1.0, "eval_loss": 1.1537501811981201, "eval_runtime": 156.3402, "eval_samples_per_second": 14.775, "eval_steps_per_second": 0.467, "step": 328 }, { "epoch": 1.01, "grad_norm": 1.178139096940401, "learning_rate": 0.00016822146781866098, "loss": 0.9355, "step": 330 }, { "epoch": 1.02, "grad_norm": 0.9864464240664622, "learning_rate": 0.00016691306063588583, "loss": 0.9135, "step": 335 }, { "epoch": 1.04, "grad_norm": 1.0449361148940848, "learning_rate": 0.00016558357432193578, "loss": 0.9088, "step": 340 }, { "epoch": 1.05, "grad_norm": 1.02719313804267, "learning_rate": 0.00016423342769514228, "loss": 0.9361, "step": 345 }, { "epoch": 1.07, "grad_norm": 1.1796531987937382, "learning_rate": 0.00016286304608230368, "loss": 0.9167, "step": 350 }, { "epoch": 1.08, "grad_norm": 1.2165440327749395, "learning_rate": 0.0001614728611846978, "loss": 0.9071, "step": 355 }, { "epoch": 1.1, "grad_norm": 1.0680390308725847, "learning_rate": 0.0001600633109420861, "loss": 0.9423, "step": 360 }, { "epoch": 1.11, "grad_norm": 1.1460931817295472, "learning_rate": 0.00015863483939475281, "loss": 0.9414, "step": 365 }, { "epoch": 1.13, "grad_norm": 1.2232830262553624, "learning_rate": 0.00015718789654362204, "loss": 0.9501, "step": 370 }, { "epoch": 1.14, "grad_norm": 1.1171762766168023, "learning_rate": 0.00015572293820849753, "loss": 0.9278, "step": 375 }, { "epoch": 1.16, "grad_norm": 1.134580557741046, "learning_rate": 0.00015424042588446882, "loss": 0.8877, "step": 380 }, { "epoch": 1.17, "grad_norm": 1.1467717113467715, "learning_rate": 0.00015274082659653, "loss": 0.8968, "step": 385 }, { "epoch": 1.19, "grad_norm": 1.0245879848355788, "learning_rate": 0.0001512246127524561, "loss": 0.933, "step": 390 }, { "epoch": 1.2, "grad_norm": 0.9149640789208816, "learning_rate": 0.0001496922619939842, "loss": 0.9562, "step": 395 }, { "epoch": 1.22, "grad_norm": 1.5241591126822251, "learning_rate": 0.00014814425704634508, "loss": 0.9906, "step": 400 }, { "epoch": 1.23, "grad_norm": 1.0478360565843916, "learning_rate": 0.00014658108556619417, "loss": 0.9131, "step": 405 }, { "epoch": 1.25, "grad_norm": 1.1580130995552973, "learning_rate": 0.00014500323998798843, "loss": 0.9385, "step": 410 }, { "epoch": 1.27, "grad_norm": 1.247960363943888, "learning_rate": 0.00014341121736885843, "loss": 0.8527, "step": 415 }, { "epoch": 1.28, "grad_norm": 1.1576786962267551, "learning_rate": 0.00014180551923202405, "loss": 0.9051, "step": 420 }, { "epoch": 1.3, "grad_norm": 1.0608499549231114, "learning_rate": 0.00014018665140880332, "loss": 0.908, "step": 425 }, { "epoch": 1.31, "grad_norm": 1.1012203287352098, "learning_rate": 0.000138555123879264, "loss": 0.9342, "step": 430 }, { "epoch": 1.33, "grad_norm": 1.0806519524832878, "learning_rate": 0.00013691145061156844, "loss": 0.9136, "step": 435 }, { "epoch": 1.34, "grad_norm": 1.0359931963939946, "learning_rate": 0.00013525614940006184, "loss": 0.8949, "step": 440 }, { "epoch": 1.36, "grad_norm": 1.3063581428657738, "learning_rate": 0.00013358974170215538, "loss": 0.9174, "step": 445 }, { "epoch": 1.37, "grad_norm": 1.0718302654560807, "learning_rate": 0.00013191275247405527, "loss": 0.8404, "step": 450 }, { "epoch": 1.39, "grad_norm": 0.9302638253667144, "learning_rate": 0.00013022571000538953, "loss": 0.9538, "step": 455 }, { "epoch": 1.4, "grad_norm": 1.002831930422535, "learning_rate": 0.00012852914575278498, "loss": 0.9497, "step": 460 }, { "epoch": 1.42, "grad_norm": 1.0473392521685814, "learning_rate": 0.0001268235941724463, "loss": 0.9065, "step": 465 }, { "epoch": 1.43, "grad_norm": 1.077577179849463, "learning_rate": 0.00012510959255179006, "loss": 0.9666, "step": 470 }, { "epoch": 1.45, "grad_norm": 1.0906580828243861, "learning_rate": 0.00012338768084018718, "loss": 0.9149, "step": 475 }, { "epoch": 1.46, "grad_norm": 1.1044210032346624, "learning_rate": 0.00012165840147886656, "loss": 0.9367, "step": 480 }, { "epoch": 1.48, "grad_norm": 1.20379457933143, "learning_rate": 0.00011992229923003377, "loss": 0.9065, "step": 485 }, { "epoch": 1.49, "grad_norm": 1.2198024955592683, "learning_rate": 0.00011817992100525872, "loss": 0.9161, "step": 490 }, { "epoch": 1.51, "grad_norm": 0.9478289622764081, "learning_rate": 0.00011643181569318595, "loss": 0.9285, "step": 495 }, { "epoch": 1.52, "grad_norm": 1.3556113362942024, "learning_rate": 0.00011467853398662236, "loss": 0.9149, "step": 500 }, { "epoch": 1.54, "grad_norm": 1.0405819610764953, "learning_rate": 0.00011292062820905651, "loss": 0.8987, "step": 505 }, { "epoch": 1.55, "grad_norm": 1.0272942715335152, "learning_rate": 0.00011115865214066414, "loss": 0.9296, "step": 510 }, { "epoch": 1.57, "grad_norm": 1.235836010680135, "learning_rate": 0.00010939316084385489, "loss": 0.907, "step": 515 }, { "epoch": 1.59, "grad_norm": 1.141603178243694, "learning_rate": 0.00010762471048841501, "loss": 0.9424, "step": 520 }, { "epoch": 1.6, "grad_norm": 0.9315641865978828, "learning_rate": 0.00010585385817630137, "loss": 0.8801, "step": 525 }, { "epoch": 1.62, "grad_norm": 1.2155191029089636, "learning_rate": 0.00010408116176614167, "loss": 0.9832, "step": 530 }, { "epoch": 1.63, "grad_norm": 1.03073958762087, "learning_rate": 0.00010230717969749616, "loss": 0.9226, "step": 535 }, { "epoch": 1.65, "grad_norm": 0.9081025102798024, "learning_rate": 0.00010053247081493685, "loss": 0.9383, "step": 540 }, { "epoch": 1.66, "grad_norm": 0.9719608179885268, "learning_rate": 9.875759419199848e-05, "loss": 0.9139, "step": 545 }, { "epoch": 1.68, "grad_norm": 0.9765085879374533, "learning_rate": 9.698310895505784e-05, "loss": 0.9546, "step": 550 }, { "epoch": 1.69, "grad_norm": 0.9934690602517904, "learning_rate": 9.520957410719632e-05, "loss": 0.9085, "step": 555 }, { "epoch": 1.71, "grad_norm": 1.0535937418956107, "learning_rate": 9.343754835210106e-05, "loss": 0.9058, "step": 560 }, { "epoch": 1.72, "grad_norm": 1.023947025769762, "learning_rate": 9.166758991806098e-05, "loss": 0.9229, "step": 565 }, { "epoch": 1.74, "grad_norm": 0.9292604570424309, "learning_rate": 8.990025638211179e-05, "loss": 0.9222, "step": 570 }, { "epoch": 1.75, "grad_norm": 1.114153037641386, "learning_rate": 8.813610449438692e-05, "loss": 0.8626, "step": 575 }, { "epoch": 1.77, "grad_norm": 1.040325901090811, "learning_rate": 8.637569000272835e-05, "loss": 0.9408, "step": 580 }, { "epoch": 1.78, "grad_norm": 0.8817859269222795, "learning_rate": 8.461956747761374e-05, "loss": 0.8877, "step": 585 }, { "epoch": 1.8, "grad_norm": 1.1284180630525456, "learning_rate": 8.286829013745382e-05, "loss": 0.9207, "step": 590 }, { "epoch": 1.81, "grad_norm": 0.9343742775094561, "learning_rate": 8.11224096743163e-05, "loss": 0.8933, "step": 595 }, { "epoch": 1.83, "grad_norm": 1.0538577448642068, "learning_rate": 7.938247608013021e-05, "loss": 0.9298, "step": 600 }, { "epoch": 1.84, "grad_norm": 1.0018046757269787, "learning_rate": 7.764903747342603e-05, "loss": 0.9026, "step": 605 }, { "epoch": 1.86, "grad_norm": 0.9082963605695086, "learning_rate": 7.592263992666604e-05, "loss": 0.8603, "step": 610 }, { "epoch": 1.88, "grad_norm": 0.9123839165404105, "learning_rate": 7.420382729421883e-05, "loss": 0.9466, "step": 615 }, { "epoch": 1.89, "grad_norm": 0.8612729466211886, "learning_rate": 7.249314104103315e-05, "loss": 0.9053, "step": 620 }, { "epoch": 1.91, "grad_norm": 0.9068865831245562, "learning_rate": 7.079112007206394e-05, "loss": 0.9099, "step": 625 }, { "epoch": 1.92, "grad_norm": 0.9654244669285843, "learning_rate": 6.909830056250527e-05, "loss": 0.922, "step": 630 }, { "epoch": 1.94, "grad_norm": 0.8439836577250905, "learning_rate": 6.741521578888272e-05, "loss": 0.9283, "step": 635 }, { "epoch": 1.95, "grad_norm": 1.0735235072492841, "learning_rate": 6.574239596105951e-05, "loss": 0.9292, "step": 640 }, { "epoch": 1.97, "grad_norm": 1.1672700848634472, "learning_rate": 6.408036805520801e-05, "loss": 0.8986, "step": 645 }, { "epoch": 1.98, "grad_norm": 0.952568318833893, "learning_rate": 6.242965564780071e-05, "loss": 0.8812, "step": 650 }, { "epoch": 2.0, "grad_norm": 1.1182556181299996, "learning_rate": 6.079077875067136e-05, "loss": 0.9232, "step": 655 }, { "epoch": 2.0, "eval_loss": 1.1174074411392212, "eval_runtime": 155.7531, "eval_samples_per_second": 14.831, "eval_steps_per_second": 0.469, "step": 656 }, { "epoch": 2.01, "grad_norm": 1.0745550951971172, "learning_rate": 5.916425364719975e-05, "loss": 0.8609, "step": 660 }, { "epoch": 2.03, "grad_norm": 0.9414624320547766, "learning_rate": 5.755059272967054e-05, "loss": 0.8921, "step": 665 }, { "epoch": 2.04, "grad_norm": 0.9382686352429901, "learning_rate": 5.5950304337858176e-05, "loss": 0.8712, "step": 670 }, { "epoch": 2.06, "grad_norm": 1.0370060367806875, "learning_rate": 5.436389259888841e-05, "loss": 0.9143, "step": 675 }, { "epoch": 2.07, "grad_norm": 1.0449428824200535, "learning_rate": 5.279185726842658e-05, "loss": 0.8609, "step": 680 }, { "epoch": 2.09, "grad_norm": 1.2140796388499904, "learning_rate": 5.1234693573243554e-05, "loss": 0.8814, "step": 685 }, { "epoch": 2.1, "grad_norm": 1.0118415907930658, "learning_rate": 4.969289205520778e-05, "loss": 0.8797, "step": 690 }, { "epoch": 2.12, "grad_norm": 1.0784940351527341, "learning_rate": 4.816693841675368e-05, "loss": 0.8898, "step": 695 }, { "epoch": 2.13, "grad_norm": 1.231043264244889, "learning_rate": 4.6657313367874256e-05, "loss": 0.8772, "step": 700 }, { "epoch": 2.15, "grad_norm": 0.93091465787962, "learning_rate": 4.516449247468666e-05, "loss": 0.9097, "step": 705 }, { "epoch": 2.16, "grad_norm": 1.1113936820092343, "learning_rate": 4.368894600961792e-05, "loss": 0.8757, "step": 710 }, { "epoch": 2.18, "grad_norm": 0.9533707648230687, "learning_rate": 4.223113880325865e-05, "loss": 0.872, "step": 715 }, { "epoch": 2.2, "grad_norm": 1.0584356613150987, "learning_rate": 4.0791530097930676e-05, "loss": 0.848, "step": 720 }, { "epoch": 2.21, "grad_norm": 1.0955157548141539, "learning_rate": 3.937057340301551e-05, "loss": 0.8831, "step": 725 }, { "epoch": 2.23, "grad_norm": 1.0974350105007893, "learning_rate": 3.7968716352088406e-05, "loss": 0.8965, "step": 730 }, { "epoch": 2.24, "grad_norm": 1.07417322379521, "learning_rate": 3.658640056190378e-05, "loss": 0.8024, "step": 735 }, { "epoch": 2.26, "grad_norm": 0.932073914233341, "learning_rate": 3.52240614932758e-05, "loss": 0.859, "step": 740 }, { "epoch": 2.27, "grad_norm": 0.960220324993126, "learning_rate": 3.388212831389854e-05, "loss": 0.8955, "step": 745 }, { "epoch": 2.29, "grad_norm": 0.9981005413964692, "learning_rate": 3.256102376314824e-05, "loss": 0.94, "step": 750 }, { "epoch": 2.3, "grad_norm": 1.0397017457559017, "learning_rate": 3.126116401891085e-05, "loss": 0.8587, "step": 755 }, { "epoch": 2.32, "grad_norm": 0.9448895278738276, "learning_rate": 2.9982958566476705e-05, "loss": 0.8707, "step": 760 }, { "epoch": 2.33, "grad_norm": 1.0610274165270486, "learning_rate": 2.872681006954315e-05, "loss": 0.8825, "step": 765 }, { "epoch": 2.35, "grad_norm": 1.0210002250645476, "learning_rate": 2.749311424336659e-05, "loss": 0.8915, "step": 770 }, { "epoch": 2.36, "grad_norm": 0.9667083953810022, "learning_rate": 2.6282259730103e-05, "loss": 0.8727, "step": 775 }, { "epoch": 2.38, "grad_norm": 0.9251306242277755, "learning_rate": 2.5094627976376927e-05, "loss": 0.8887, "step": 780 }, { "epoch": 2.39, "grad_norm": 0.9383384405043022, "learning_rate": 2.393059311311715e-05, "loss": 0.8894, "step": 785 }, { "epoch": 2.41, "grad_norm": 0.8717419970020719, "learning_rate": 2.2790521837697034e-05, "loss": 0.8279, "step": 790 }, { "epoch": 2.42, "grad_norm": 0.9486113739115168, "learning_rate": 2.167477329841633e-05, "loss": 0.8545, "step": 795 }, { "epoch": 2.44, "grad_norm": 0.9988727624604463, "learning_rate": 2.0583698981361577e-05, "loss": 0.8624, "step": 800 }, { "epoch": 2.45, "grad_norm": 0.9856023900634717, "learning_rate": 1.9517642599679807e-05, "loss": 0.8953, "step": 805 }, { "epoch": 2.47, "grad_norm": 1.146341438442503, "learning_rate": 1.8476939985301256e-05, "loss": 0.8583, "step": 810 }, { "epoch": 2.48, "grad_norm": 0.9934930089425651, "learning_rate": 1.7461918983144588e-05, "loss": 0.8236, "step": 815 }, { "epoch": 2.5, "grad_norm": 1.0570151368189196, "learning_rate": 1.6472899347838356e-05, "loss": 0.8521, "step": 820 }, { "epoch": 2.52, "grad_norm": 0.940846894356948, "learning_rate": 1.5510192642991073e-05, "loss": 0.8412, "step": 825 }, { "epoch": 2.53, "grad_norm": 0.9182797173905372, "learning_rate": 1.4574102143041512e-05, "loss": 0.8292, "step": 830 }, { "epoch": 2.55, "grad_norm": 1.0514937418330759, "learning_rate": 1.3664922737720586e-05, "loss": 0.8463, "step": 835 }, { "epoch": 2.56, "grad_norm": 0.9672420613383418, "learning_rate": 1.2782940839154111e-05, "loss": 0.8677, "step": 840 }, { "epoch": 2.58, "grad_norm": 1.0215480963434165, "learning_rate": 1.192843429163677e-05, "loss": 0.8555, "step": 845 }, { "epoch": 2.59, "grad_norm": 1.133368133421988, "learning_rate": 1.1101672284104624e-05, "loss": 0.8732, "step": 850 }, { "epoch": 2.61, "grad_norm": 0.9880449036656486, "learning_rate": 1.0302915265334723e-05, "loss": 0.8832, "step": 855 }, { "epoch": 2.62, "grad_norm": 0.934408626896597, "learning_rate": 9.532414861897632e-06, "loss": 0.8527, "step": 860 }, { "epoch": 2.64, "grad_norm": 0.90008489968748, "learning_rate": 8.790413798889452e-06, "loss": 0.8611, "step": 865 }, { "epoch": 2.65, "grad_norm": 0.9910034914102378, "learning_rate": 8.077145823467924e-06, "loss": 0.932, "step": 870 }, { "epoch": 2.67, "grad_norm": 1.1138535634921578, "learning_rate": 7.392835631216766e-06, "loss": 0.878, "step": 875 }, { "epoch": 2.68, "grad_norm": 1.1422833234291747, "learning_rate": 6.7376987953614246e-06, "loss": 0.8607, "step": 880 }, { "epoch": 2.7, "grad_norm": 1.1003844781363714, "learning_rate": 6.111941698858681e-06, "loss": 0.912, "step": 885 }, { "epoch": 2.71, "grad_norm": 1.137696707362488, "learning_rate": 5.5157614693812865e-06, "loss": 0.8416, "step": 890 }, { "epoch": 2.73, "grad_norm": 0.9640103484120158, "learning_rate": 4.9493459172183285e-06, "loss": 0.8452, "step": 895 }, { "epoch": 2.74, "grad_norm": 0.9223029933937373, "learning_rate": 4.412873476110702e-06, "loss": 0.8568, "step": 900 }, { "epoch": 2.76, "grad_norm": 0.9007733331158723, "learning_rate": 3.906513147040425e-06, "loss": 0.8713, "step": 905 }, { "epoch": 2.77, "grad_norm": 1.0288738184384674, "learning_rate": 3.43042444499152e-06, "loss": 0.9235, "step": 910 }, { "epoch": 2.79, "grad_norm": 1.0184524502121122, "learning_rate": 2.984757348699152e-06, "loss": 0.846, "step": 915 }, { "epoch": 2.8, "grad_norm": 1.1092808565987327, "learning_rate": 2.569652253402999e-06, "loss": 0.8701, "step": 920 }, { "epoch": 2.82, "grad_norm": 0.9711566457168485, "learning_rate": 2.1852399266194314e-06, "loss": 0.84, "step": 925 }, { "epoch": 2.84, "grad_norm": 1.0636030655856668, "learning_rate": 1.8316414669469539e-06, "loss": 0.8756, "step": 930 }, { "epoch": 2.85, "grad_norm": 1.062584489248108, "learning_rate": 1.5089682659172321e-06, "loss": 0.848, "step": 935 }, { "epoch": 2.87, "grad_norm": 1.109498548513231, "learning_rate": 1.2173219729043507e-06, "loss": 0.8488, "step": 940 }, { "epoch": 2.88, "grad_norm": 0.9140768109712519, "learning_rate": 9.567944631029169e-07, "loss": 0.8614, "step": 945 }, { "epoch": 2.9, "grad_norm": 0.9332248724423268, "learning_rate": 7.274678085852693e-07, "loss": 0.8839, "step": 950 }, { "epoch": 2.91, "grad_norm": 0.9421812832473311, "learning_rate": 5.294142524469359e-07, "loss": 0.8691, "step": 955 }, { "epoch": 2.93, "grad_norm": 1.0718790701802063, "learning_rate": 3.6269618604847233e-07, "loss": 0.8243, "step": 960 }, { "epoch": 2.94, "grad_norm": 0.996460942018122, "learning_rate": 2.2736612936065106e-07, "loss": 0.842, "step": 965 }, { "epoch": 2.96, "grad_norm": 1.1337948192404557, "learning_rate": 1.2346671441958447e-07, "loss": 0.8646, "step": 970 }, { "epoch": 2.97, "grad_norm": 0.9562746029081519, "learning_rate": 5.103067189662358e-08, "loss": 0.8854, "step": 975 }, { "epoch": 2.99, "grad_norm": 0.9192847747786613, "learning_rate": 1.0080820787450319e-08, "loss": 0.8492, "step": 980 }, { "epoch": 3.0, "eval_loss": 1.1201515197753906, "eval_runtime": 155.5537, "eval_samples_per_second": 14.85, "eval_steps_per_second": 0.469, "step": 984 }, { "epoch": 3.0, "step": 984, "total_flos": 2528700107063296.0, "train_loss": 1.2712977070633957, "train_runtime": 11725.5058, "train_samples_per_second": 5.365, "train_steps_per_second": 0.084 } ], "logging_steps": 5, "max_steps": 984, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "total_flos": 2528700107063296.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }