{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9994655264564404, "eval_steps": 500, "global_step": 935, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0010689470871191875, "grad_norm": 23.473513373152795, "learning_rate": 1.0638297872340426e-07, "loss": 1.353, "step": 1 }, { "epoch": 0.005344735435595938, "grad_norm": 21.711768425322266, "learning_rate": 5.319148936170213e-07, "loss": 1.3519, "step": 5 }, { "epoch": 0.010689470871191877, "grad_norm": 8.713068756520672, "learning_rate": 1.0638297872340427e-06, "loss": 1.2088, "step": 10 }, { "epoch": 0.016034206306787813, "grad_norm": 8.310866409645099, "learning_rate": 1.595744680851064e-06, "loss": 1.0635, "step": 15 }, { "epoch": 0.021378941742383754, "grad_norm": 3.066210523084854, "learning_rate": 2.1276595744680853e-06, "loss": 0.9417, "step": 20 }, { "epoch": 0.02672367717797969, "grad_norm": 2.493239812049204, "learning_rate": 2.6595744680851065e-06, "loss": 0.8884, "step": 25 }, { "epoch": 0.032068412613575625, "grad_norm": 2.403610159527061, "learning_rate": 3.191489361702128e-06, "loss": 0.8557, "step": 30 }, { "epoch": 0.03741314804917156, "grad_norm": 2.3974276447793694, "learning_rate": 3.723404255319149e-06, "loss": 0.8349, "step": 35 }, { "epoch": 0.04275788348476751, "grad_norm": 2.2921630085655025, "learning_rate": 4.255319148936171e-06, "loss": 0.8163, "step": 40 }, { "epoch": 0.048102618920363445, "grad_norm": 2.3116910060192817, "learning_rate": 4.787234042553192e-06, "loss": 0.798, "step": 45 }, { "epoch": 0.05344735435595938, "grad_norm": 2.242005189567634, "learning_rate": 5.319148936170213e-06, "loss": 0.7863, "step": 50 }, { "epoch": 0.05879208979155532, "grad_norm": 2.536220362549579, "learning_rate": 5.851063829787235e-06, "loss": 0.7663, "step": 55 }, { "epoch": 0.06413682522715125, "grad_norm": 2.492174958150876, "learning_rate": 6.382978723404256e-06, "loss": 0.7579, "step": 60 }, { "epoch": 0.06948156066274719, "grad_norm": 2.3311723974999095, "learning_rate": 6.914893617021278e-06, "loss": 0.7397, "step": 65 }, { "epoch": 0.07482629609834313, "grad_norm": 2.5040282086769317, "learning_rate": 7.446808510638298e-06, "loss": 0.7338, "step": 70 }, { "epoch": 0.08017103153393906, "grad_norm": 2.411086582278843, "learning_rate": 7.97872340425532e-06, "loss": 0.7215, "step": 75 }, { "epoch": 0.08551576696953501, "grad_norm": 2.3433794783119883, "learning_rate": 8.510638297872341e-06, "loss": 0.7124, "step": 80 }, { "epoch": 0.09086050240513095, "grad_norm": 2.354225850924038, "learning_rate": 9.042553191489362e-06, "loss": 0.7177, "step": 85 }, { "epoch": 0.09620523784072689, "grad_norm": 2.4091177426727266, "learning_rate": 9.574468085106385e-06, "loss": 0.7163, "step": 90 }, { "epoch": 0.10154997327632283, "grad_norm": 2.498651758504887, "learning_rate": 9.999965114314806e-06, "loss": 0.7031, "step": 95 }, { "epoch": 0.10689470871191876, "grad_norm": 2.423896856776365, "learning_rate": 9.998744166446685e-06, "loss": 0.7005, "step": 100 }, { "epoch": 0.1122394441475147, "grad_norm": 2.387782305227773, "learning_rate": 9.995779421092695e-06, "loss": 0.6846, "step": 105 }, { "epoch": 0.11758417958311064, "grad_norm": 2.424890050569407, "learning_rate": 9.991071912495701e-06, "loss": 0.7027, "step": 110 }, { "epoch": 0.12292891501870658, "grad_norm": 2.115700167815373, "learning_rate": 9.984623282856502e-06, "loss": 0.6923, "step": 115 }, { "epoch": 0.1282736504543025, "grad_norm": 2.0709758825284363, "learning_rate": 9.97643578176095e-06, "loss": 0.6859, "step": 120 }, { "epoch": 0.13361838588989844, "grad_norm": 2.1305031798421186, "learning_rate": 9.966512265395188e-06, "loss": 0.6846, "step": 125 }, { "epoch": 0.13896312132549438, "grad_norm": 2.0044100900913384, "learning_rate": 9.95485619554928e-06, "loss": 0.6799, "step": 130 }, { "epoch": 0.14430785676109031, "grad_norm": 2.050948697307653, "learning_rate": 9.941471638409576e-06, "loss": 0.6767, "step": 135 }, { "epoch": 0.14965259219668625, "grad_norm": 1.952049269951998, "learning_rate": 9.926363263140234e-06, "loss": 0.6669, "step": 140 }, { "epoch": 0.1549973276322822, "grad_norm": 2.099731647032975, "learning_rate": 9.90953634025439e-06, "loss": 0.6786, "step": 145 }, { "epoch": 0.16034206306787813, "grad_norm": 1.993767696614697, "learning_rate": 9.890996739775562e-06, "loss": 0.6674, "step": 150 }, { "epoch": 0.16568679850347406, "grad_norm": 2.0517367712519503, "learning_rate": 9.870750929189914e-06, "loss": 0.6748, "step": 155 }, { "epoch": 0.17103153393907003, "grad_norm": 2.0491814339157486, "learning_rate": 9.848805971190074e-06, "loss": 0.6621, "step": 160 }, { "epoch": 0.17637626937466597, "grad_norm": 2.0777487120565397, "learning_rate": 9.825169521211354e-06, "loss": 0.6543, "step": 165 }, { "epoch": 0.1817210048102619, "grad_norm": 2.351975397829705, "learning_rate": 9.799849824761159e-06, "loss": 0.6552, "step": 170 }, { "epoch": 0.18706574024585784, "grad_norm": 2.0302930695761034, "learning_rate": 9.772855714542569e-06, "loss": 0.6525, "step": 175 }, { "epoch": 0.19241047568145378, "grad_norm": 2.188255761505827, "learning_rate": 9.744196607373086e-06, "loss": 0.6512, "step": 180 }, { "epoch": 0.19775521111704972, "grad_norm": 2.1631601855326763, "learning_rate": 9.71388250089959e-06, "loss": 0.6506, "step": 185 }, { "epoch": 0.20309994655264565, "grad_norm": 2.041485612804426, "learning_rate": 9.681923970110698e-06, "loss": 0.6402, "step": 190 }, { "epoch": 0.2084446819882416, "grad_norm": 2.1331611096395973, "learning_rate": 9.648332163647705e-06, "loss": 0.6234, "step": 195 }, { "epoch": 0.21378941742383753, "grad_norm": 2.076285244795538, "learning_rate": 9.613118799915417e-06, "loss": 0.6422, "step": 200 }, { "epoch": 0.21913415285943347, "grad_norm": 1.9966049947480653, "learning_rate": 9.576296162994214e-06, "loss": 0.62, "step": 205 }, { "epoch": 0.2244788882950294, "grad_norm": 2.003867139711824, "learning_rate": 9.537877098354787e-06, "loss": 0.6431, "step": 210 }, { "epoch": 0.22982362373062534, "grad_norm": 1.9196987285290612, "learning_rate": 9.497875008377033e-06, "loss": 0.6234, "step": 215 }, { "epoch": 0.23516835916622128, "grad_norm": 2.1957665769389774, "learning_rate": 9.456303847674674e-06, "loss": 0.6256, "step": 220 }, { "epoch": 0.24051309460181722, "grad_norm": 1.884124911991183, "learning_rate": 9.41317811822723e-06, "loss": 0.6052, "step": 225 }, { "epoch": 0.24585783003741316, "grad_norm": 1.9253697770132223, "learning_rate": 9.36851286432104e-06, "loss": 0.6228, "step": 230 }, { "epoch": 0.25120256547300907, "grad_norm": 1.8020291089671914, "learning_rate": 9.322323667301113e-06, "loss": 0.6191, "step": 235 }, { "epoch": 0.256547300908605, "grad_norm": 1.9107218407751854, "learning_rate": 9.274626640135616e-06, "loss": 0.6121, "step": 240 }, { "epoch": 0.26189203634420094, "grad_norm": 1.8979104758577288, "learning_rate": 9.225438421794919e-06, "loss": 0.616, "step": 245 }, { "epoch": 0.2672367717797969, "grad_norm": 1.9569646424906317, "learning_rate": 9.174776171447126e-06, "loss": 0.6047, "step": 250 }, { "epoch": 0.2725815072153928, "grad_norm": 2.0364213519267715, "learning_rate": 9.12265756247216e-06, "loss": 0.6093, "step": 255 }, { "epoch": 0.27792624265098875, "grad_norm": 1.9725707409017998, "learning_rate": 9.06910077629645e-06, "loss": 0.599, "step": 260 }, { "epoch": 0.2832709780865847, "grad_norm": 1.9492899906874035, "learning_rate": 9.014124496050391e-06, "loss": 0.5902, "step": 265 }, { "epoch": 0.28861571352218063, "grad_norm": 2.174543196329182, "learning_rate": 8.957747900050797e-06, "loss": 0.5937, "step": 270 }, { "epoch": 0.29396044895777657, "grad_norm": 2.4160108027006615, "learning_rate": 8.899990655110586e-06, "loss": 0.6018, "step": 275 }, { "epoch": 0.2993051843933725, "grad_norm": 2.087978454933178, "learning_rate": 8.840872909678081e-06, "loss": 0.5832, "step": 280 }, { "epoch": 0.30464991982896844, "grad_norm": 1.9762659778049954, "learning_rate": 8.780415286808284e-06, "loss": 0.5922, "step": 285 }, { "epoch": 0.3099946552645644, "grad_norm": 2.269903050974625, "learning_rate": 8.718638876968564e-06, "loss": 0.587, "step": 290 }, { "epoch": 0.3153393907001603, "grad_norm": 2.0490480592368043, "learning_rate": 8.655565230681329e-06, "loss": 0.5748, "step": 295 }, { "epoch": 0.32068412613575625, "grad_norm": 2.2764533528101167, "learning_rate": 8.591216351006181e-06, "loss": 0.575, "step": 300 }, { "epoch": 0.3260288615713522, "grad_norm": 2.0737769584580072, "learning_rate": 8.525614685864209e-06, "loss": 0.5716, "step": 305 }, { "epoch": 0.33137359700694813, "grad_norm": 1.9102889564355032, "learning_rate": 8.458783120207099e-06, "loss": 0.5686, "step": 310 }, { "epoch": 0.3367183324425441, "grad_norm": 1.8800981653614064, "learning_rate": 8.390744968033785e-06, "loss": 0.5629, "step": 315 }, { "epoch": 0.34206306787814006, "grad_norm": 1.96332588340258, "learning_rate": 8.321523964257431e-06, "loss": 0.5657, "step": 320 }, { "epoch": 0.347407803313736, "grad_norm": 1.9485967093770373, "learning_rate": 8.251144256425562e-06, "loss": 0.575, "step": 325 }, { "epoch": 0.35275253874933193, "grad_norm": 1.929135934757674, "learning_rate": 8.179630396296285e-06, "loss": 0.5632, "step": 330 }, { "epoch": 0.35809727418492787, "grad_norm": 1.9559096020055298, "learning_rate": 8.107007331273449e-06, "loss": 0.5626, "step": 335 }, { "epoch": 0.3634420096205238, "grad_norm": 2.012794105136495, "learning_rate": 8.033300395703845e-06, "loss": 0.5546, "step": 340 }, { "epoch": 0.36878674505611975, "grad_norm": 1.9997480665852025, "learning_rate": 7.958535302039368e-06, "loss": 0.5458, "step": 345 }, { "epoch": 0.3741314804917157, "grad_norm": 1.8451676891383395, "learning_rate": 7.88273813186732e-06, "loss": 0.5483, "step": 350 }, { "epoch": 0.3794762159273116, "grad_norm": 1.895313478955964, "learning_rate": 7.805935326811913e-06, "loss": 0.5517, "step": 355 }, { "epoch": 0.38482095136290756, "grad_norm": 1.9301374165518745, "learning_rate": 7.728153679310186e-06, "loss": 0.5464, "step": 360 }, { "epoch": 0.3901656867985035, "grad_norm": 2.014621013972388, "learning_rate": 7.649420323265547e-06, "loss": 0.5441, "step": 365 }, { "epoch": 0.39551042223409943, "grad_norm": 1.9682365332357328, "learning_rate": 7.569762724582179e-06, "loss": 0.5247, "step": 370 }, { "epoch": 0.40085515766969537, "grad_norm": 1.9950047285262882, "learning_rate": 7.48920867158365e-06, "loss": 0.5435, "step": 375 }, { "epoch": 0.4061998931052913, "grad_norm": 1.8591367525423361, "learning_rate": 7.407786265319023e-06, "loss": 0.5363, "step": 380 }, { "epoch": 0.41154462854088725, "grad_norm": 2.038203032181739, "learning_rate": 7.325523909759902e-06, "loss": 0.5312, "step": 385 }, { "epoch": 0.4168893639764832, "grad_norm": 2.1300865254144856, "learning_rate": 7.242450301891772e-06, "loss": 0.5257, "step": 390 }, { "epoch": 0.4222340994120791, "grad_norm": 1.882363933745456, "learning_rate": 7.158594421703152e-06, "loss": 0.5096, "step": 395 }, { "epoch": 0.42757883484767506, "grad_norm": 1.8727951959035394, "learning_rate": 7.073985522076001e-06, "loss": 0.5184, "step": 400 }, { "epoch": 0.432923570283271, "grad_norm": 1.9894963218262207, "learning_rate": 6.9886531185809385e-06, "loss": 0.5259, "step": 405 }, { "epoch": 0.43826830571886694, "grad_norm": 2.0264734474019854, "learning_rate": 6.902626979180821e-06, "loss": 0.5287, "step": 410 }, { "epoch": 0.4436130411544629, "grad_norm": 1.8722413198751242, "learning_rate": 6.8159371138462745e-06, "loss": 0.5203, "step": 415 }, { "epoch": 0.4489577765900588, "grad_norm": 1.9496986261286822, "learning_rate": 6.728613764086806e-06, "loss": 0.5117, "step": 420 }, { "epoch": 0.45430251202565475, "grad_norm": 1.9774605540105135, "learning_rate": 6.640687392401132e-06, "loss": 0.5031, "step": 425 }, { "epoch": 0.4596472474612507, "grad_norm": 1.954818070964011, "learning_rate": 6.552188671650434e-06, "loss": 0.5031, "step": 430 }, { "epoch": 0.4649919828968466, "grad_norm": 1.9617332949799173, "learning_rate": 6.46314847435821e-06, "loss": 0.5066, "step": 435 }, { "epoch": 0.47033671833244256, "grad_norm": 2.004929689013094, "learning_rate": 6.373597861940488e-06, "loss": 0.4988, "step": 440 }, { "epoch": 0.4756814537680385, "grad_norm": 1.9774178895562935, "learning_rate": 6.283568073870147e-06, "loss": 0.4975, "step": 445 }, { "epoch": 0.48102618920363444, "grad_norm": 1.9264282288547034, "learning_rate": 6.1930905167791025e-06, "loss": 0.4995, "step": 450 }, { "epoch": 0.4863709246392304, "grad_norm": 1.8764137691966425, "learning_rate": 6.102196753502202e-06, "loss": 0.4808, "step": 455 }, { "epoch": 0.4917156600748263, "grad_norm": 2.050651493580463, "learning_rate": 6.010918492066628e-06, "loss": 0.482, "step": 460 }, { "epoch": 0.49706039551042225, "grad_norm": 1.9228723322188161, "learning_rate": 5.919287574630628e-06, "loss": 0.4843, "step": 465 }, { "epoch": 0.5024051309460181, "grad_norm": 1.9851687404052978, "learning_rate": 5.827335966375485e-06, "loss": 0.4894, "step": 470 }, { "epoch": 0.5077498663816141, "grad_norm": 1.9039931208460692, "learning_rate": 5.735095744354543e-06, "loss": 0.4673, "step": 475 }, { "epoch": 0.51309460181721, "grad_norm": 1.9780097627545055, "learning_rate": 5.642599086303233e-06, "loss": 0.4788, "step": 480 }, { "epoch": 0.518439337252806, "grad_norm": 2.0794416013824994, "learning_rate": 5.5498782594139476e-06, "loss": 0.4662, "step": 485 }, { "epoch": 0.5237840726884019, "grad_norm": 1.9483487482401622, "learning_rate": 5.456965609079741e-06, "loss": 0.4763, "step": 490 }, { "epoch": 0.5291288081239979, "grad_norm": 1.8945000444925455, "learning_rate": 5.363893547610715e-06, "loss": 0.4807, "step": 495 }, { "epoch": 0.5344735435595938, "grad_norm": 1.9768379335510178, "learning_rate": 5.270694542927089e-06, "loss": 0.4803, "step": 500 }, { "epoch": 0.5398182789951897, "grad_norm": 1.9758527545184905, "learning_rate": 5.1774011072328575e-06, "loss": 0.4627, "step": 505 }, { "epoch": 0.5451630144307856, "grad_norm": 1.866392771665028, "learning_rate": 5.084045785674001e-06, "loss": 0.4608, "step": 510 }, { "epoch": 0.5505077498663816, "grad_norm": 1.957078999847815, "learning_rate": 4.9906611449852035e-06, "loss": 0.4542, "step": 515 }, { "epoch": 0.5558524853019775, "grad_norm": 1.9342632236304118, "learning_rate": 4.897279762129044e-06, "loss": 0.4547, "step": 520 }, { "epoch": 0.5611972207375735, "grad_norm": 1.966938473585663, "learning_rate": 4.8039342129316175e-06, "loss": 0.4526, "step": 525 }, { "epoch": 0.5665419561731694, "grad_norm": 1.9823937668875962, "learning_rate": 4.710657060718547e-06, "loss": 0.4503, "step": 530 }, { "epoch": 0.5718866916087654, "grad_norm": 1.931604909577632, "learning_rate": 4.617480844955367e-06, "loss": 0.4543, "step": 535 }, { "epoch": 0.5772314270443613, "grad_norm": 1.9170992207978994, "learning_rate": 4.52443806989622e-06, "loss": 0.4383, "step": 540 }, { "epoch": 0.5825761624799572, "grad_norm": 1.9043676805575804, "learning_rate": 4.431561193244852e-06, "loss": 0.4546, "step": 545 }, { "epoch": 0.5879208979155531, "grad_norm": 1.9233137290689744, "learning_rate": 4.338882614831817e-06, "loss": 0.446, "step": 550 }, { "epoch": 0.5932656333511491, "grad_norm": 1.8515724013719734, "learning_rate": 4.246434665311907e-06, "loss": 0.4321, "step": 555 }, { "epoch": 0.598610368786745, "grad_norm": 1.8828506758113701, "learning_rate": 4.154249594885687e-06, "loss": 0.4337, "step": 560 }, { "epoch": 0.603955104222341, "grad_norm": 1.9687942950195187, "learning_rate": 4.062359562049109e-06, "loss": 0.431, "step": 565 }, { "epoch": 0.6092998396579369, "grad_norm": 1.8995143280590803, "learning_rate": 3.970796622375116e-06, "loss": 0.4405, "step": 570 }, { "epoch": 0.6146445750935329, "grad_norm": 1.8557131453687543, "learning_rate": 3.879592717331141e-06, "loss": 0.4283, "step": 575 }, { "epoch": 0.6199893105291288, "grad_norm": 1.9309890578975801, "learning_rate": 3.78877966313642e-06, "loss": 0.4367, "step": 580 }, { "epoch": 0.6253340459647247, "grad_norm": 1.973985583949257, "learning_rate": 3.698389139663003e-06, "loss": 0.4324, "step": 585 }, { "epoch": 0.6306787814003206, "grad_norm": 1.8679236635919254, "learning_rate": 3.608452679384311e-06, "loss": 0.4289, "step": 590 }, { "epoch": 0.6360235168359166, "grad_norm": 1.827491182434645, "learning_rate": 3.5190016563751316e-06, "loss": 0.427, "step": 595 }, { "epoch": 0.6413682522715125, "grad_norm": 2.0739567283668583, "learning_rate": 3.4300672753668635e-06, "loss": 0.4163, "step": 600 }, { "epoch": 0.6467129877071085, "grad_norm": 1.8900939036994293, "learning_rate": 3.34168056086183e-06, "loss": 0.4322, "step": 605 }, { "epoch": 0.6520577231427044, "grad_norm": 1.9421083071105274, "learning_rate": 3.2538723463104737e-06, "loss": 0.4139, "step": 610 }, { "epoch": 0.6574024585783004, "grad_norm": 1.9389760896779549, "learning_rate": 3.166673263355199e-06, "loss": 0.4238, "step": 615 }, { "epoch": 0.6627471940138963, "grad_norm": 1.874356566114327, "learning_rate": 3.0801137311446087e-06, "loss": 0.4165, "step": 620 }, { "epoch": 0.6680919294494923, "grad_norm": 1.9580525970654405, "learning_rate": 2.994223945721872e-06, "loss": 0.4082, "step": 625 }, { "epoch": 0.6734366648850882, "grad_norm": 1.9147611470583412, "learning_rate": 2.9090338694909254e-06, "loss": 0.4219, "step": 630 }, { "epoch": 0.6787814003206841, "grad_norm": 1.86847720750241, "learning_rate": 2.8245732207641705e-06, "loss": 0.4132, "step": 635 }, { "epoch": 0.6841261357562801, "grad_norm": 1.8982814741174805, "learning_rate": 2.740871463395325e-06, "loss": 0.4129, "step": 640 }, { "epoch": 0.689470871191876, "grad_norm": 1.901393825227574, "learning_rate": 2.65795779650105e-06, "loss": 0.4033, "step": 645 }, { "epoch": 0.694815606627472, "grad_norm": 1.8509358915404532, "learning_rate": 2.575861144274914e-06, "loss": 0.4013, "step": 650 }, { "epoch": 0.7001603420630679, "grad_norm": 1.8958219311827695, "learning_rate": 2.4946101458972744e-06, "loss": 0.4027, "step": 655 }, { "epoch": 0.7055050774986639, "grad_norm": 1.8426235286345023, "learning_rate": 2.414233145544585e-06, "loss": 0.3964, "step": 660 }, { "epoch": 0.7108498129342598, "grad_norm": 1.9191012877180513, "learning_rate": 2.33475818250161e-06, "loss": 0.3954, "step": 665 }, { "epoch": 0.7161945483698557, "grad_norm": 1.902716391588277, "learning_rate": 2.256212981379996e-06, "loss": 0.3957, "step": 670 }, { "epoch": 0.7215392838054516, "grad_norm": 1.947059841210012, "learning_rate": 2.178624942446626e-06, "loss": 0.401, "step": 675 }, { "epoch": 0.7268840192410476, "grad_norm": 1.917543839246915, "learning_rate": 2.1020211320651135e-06, "loss": 0.3888, "step": 680 }, { "epoch": 0.7322287546766435, "grad_norm": 1.8327154685147313, "learning_rate": 2.0264282732537827e-06, "loss": 0.4003, "step": 685 }, { "epoch": 0.7375734901122395, "grad_norm": 1.7919182630342896, "learning_rate": 1.9518727363634187e-06, "loss": 0.3873, "step": 690 }, { "epoch": 0.7429182255478354, "grad_norm": 1.9169963711238958, "learning_rate": 1.8783805298780427e-06, "loss": 0.4018, "step": 695 }, { "epoch": 0.7482629609834314, "grad_norm": 1.8380910216299624, "learning_rate": 1.8059772913419305e-06, "loss": 0.3946, "step": 700 }, { "epoch": 0.7536076964190273, "grad_norm": 1.9558297629437302, "learning_rate": 1.7346882784160346e-06, "loss": 0.3916, "step": 705 }, { "epoch": 0.7589524318546232, "grad_norm": 1.897087729189459, "learning_rate": 1.6645383600669124e-06, "loss": 0.3919, "step": 710 }, { "epoch": 0.7642971672902191, "grad_norm": 1.8755131544746726, "learning_rate": 1.5955520078912628e-06, "loss": 0.3846, "step": 715 }, { "epoch": 0.7696419027258151, "grad_norm": 1.8807950794746984, "learning_rate": 1.527753287579084e-06, "loss": 0.3801, "step": 720 }, { "epoch": 0.774986638161411, "grad_norm": 1.8519074792344092, "learning_rate": 1.461165850518424e-06, "loss": 0.3788, "step": 725 }, { "epoch": 0.780331373597007, "grad_norm": 1.894015594982712, "learning_rate": 1.3958129255446585e-06, "loss": 0.383, "step": 730 }, { "epoch": 0.7856761090326029, "grad_norm": 1.8830744382706812, "learning_rate": 1.3317173108371834e-06, "loss": 0.3803, "step": 735 }, { "epoch": 0.7910208444681989, "grad_norm": 1.723519908836632, "learning_rate": 1.268901365966337e-06, "loss": 0.3822, "step": 740 }, { "epoch": 0.7963655799037948, "grad_norm": 1.7952600980364406, "learning_rate": 1.2073870040933212e-06, "loss": 0.3715, "step": 745 }, { "epoch": 0.8017103153393907, "grad_norm": 1.9566873596603427, "learning_rate": 1.1471956843258676e-06, "loss": 0.384, "step": 750 }, { "epoch": 0.8070550507749866, "grad_norm": 1.8386259618144518, "learning_rate": 1.0883484042322796e-06, "loss": 0.3869, "step": 755 }, { "epoch": 0.8123997862105826, "grad_norm": 1.8260297894288255, "learning_rate": 1.0308656925165033e-06, "loss": 0.3812, "step": 760 }, { "epoch": 0.8177445216461785, "grad_norm": 1.8047190650634146, "learning_rate": 9.74767601856737e-07, "loss": 0.3806, "step": 765 }, { "epoch": 0.8230892570817745, "grad_norm": 1.8221471376738447, "learning_rate": 9.200737019101169e-07, "loss": 0.3777, "step": 770 }, { "epoch": 0.8284339925173704, "grad_norm": 1.963769411688169, "learning_rate": 8.668030724858984e-07, "loss": 0.3688, "step": 775 }, { "epoch": 0.8337787279529664, "grad_norm": 1.7734438619458048, "learning_rate": 8.149742968895253e-07, "loss": 0.3756, "step": 780 }, { "epoch": 0.8391234633885623, "grad_norm": 1.800103499706633, "learning_rate": 7.646054554398863e-07, "loss": 0.3773, "step": 785 }, { "epoch": 0.8444681988241582, "grad_norm": 1.8625669508711113, "learning_rate": 7.157141191620548e-07, "loss": 0.3682, "step": 790 }, { "epoch": 0.8498129342597541, "grad_norm": 1.8353500996941807, "learning_rate": 6.683173436576851e-07, "loss": 0.3646, "step": 795 }, { "epoch": 0.8551576696953501, "grad_norm": 1.8832589125337102, "learning_rate": 6.224316631552207e-07, "loss": 0.376, "step": 800 }, { "epoch": 0.860502405130946, "grad_norm": 1.7768361367435437, "learning_rate": 5.780730847419652e-07, "loss": 0.3657, "step": 805 }, { "epoch": 0.865847140566542, "grad_norm": 1.7627566159259074, "learning_rate": 5.35257082780069e-07, "loss": 0.3674, "step": 810 }, { "epoch": 0.8711918760021379, "grad_norm": 1.8420469952326894, "learning_rate": 4.939985935083314e-07, "loss": 0.3669, "step": 815 }, { "epoch": 0.8765366114377339, "grad_norm": 1.7499771334622984, "learning_rate": 4.5431200983174493e-07, "loss": 0.3671, "step": 820 }, { "epoch": 0.8818813468733298, "grad_norm": 1.7862835118779703, "learning_rate": 4.1621117630056606e-07, "loss": 0.3729, "step": 825 }, { "epoch": 0.8872260823089257, "grad_norm": 1.7739216023779494, "learning_rate": 3.7970938428068813e-07, "loss": 0.3714, "step": 830 }, { "epoch": 0.8925708177445216, "grad_norm": 1.7066183325006379, "learning_rate": 3.4481936731698415e-07, "loss": 0.3655, "step": 835 }, { "epoch": 0.8979155531801176, "grad_norm": 1.7916893607261628, "learning_rate": 3.1155329669124876e-07, "loss": 0.3702, "step": 840 }, { "epoch": 0.9032602886157135, "grad_norm": 1.803470706275281, "learning_rate": 2.7992277717627856e-07, "loss": 0.3599, "step": 845 }, { "epoch": 0.9086050240513095, "grad_norm": 1.7999593269005436, "learning_rate": 2.4993884298758097e-07, "loss": 0.3621, "step": 850 }, { "epoch": 0.9139497594869054, "grad_norm": 1.749439500356134, "learning_rate": 2.2161195393412493e-07, "loss": 0.361, "step": 855 }, { "epoch": 0.9192944949225014, "grad_norm": 1.804710229821361, "learning_rate": 1.9495199176945977e-07, "loss": 0.3705, "step": 860 }, { "epoch": 0.9246392303580973, "grad_norm": 1.7380753832775804, "learning_rate": 1.6996825674449768e-07, "loss": 0.3581, "step": 865 }, { "epoch": 0.9299839657936932, "grad_norm": 1.9009106101736641, "learning_rate": 1.4666946436314832e-07, "loss": 0.3699, "step": 870 }, { "epoch": 0.9353287012292891, "grad_norm": 1.7420463319700745, "learning_rate": 1.2506374234193985e-07, "loss": 0.36, "step": 875 }, { "epoch": 0.9406734366648851, "grad_norm": 1.7367484197909535, "learning_rate": 1.0515862777468689e-07, "loss": 0.3611, "step": 880 }, { "epoch": 0.946018172100481, "grad_norm": 1.8449072611969448, "learning_rate": 8.69610645031993e-08, "loss": 0.3585, "step": 885 }, { "epoch": 0.951362907536077, "grad_norm": 1.7050020816812541, "learning_rate": 7.047740069494102e-08, "loss": 0.3663, "step": 890 }, { "epoch": 0.9567076429716729, "grad_norm": 1.7491629746492106, "learning_rate": 5.571338662849257e-08, "loss": 0.3621, "step": 895 }, { "epoch": 0.9620523784072689, "grad_norm": 1.8299154207593904, "learning_rate": 4.267417268758123e-08, "loss": 0.3657, "step": 900 }, { "epoch": 0.9673971138428648, "grad_norm": 1.7386431635341528, "learning_rate": 3.1364307564384357e-08, "loss": 0.3604, "step": 905 }, { "epoch": 0.9727418492784607, "grad_norm": 1.851821490817842, "learning_rate": 2.178773667273204e-08, "loss": 0.3639, "step": 910 }, { "epoch": 0.9780865847140566, "grad_norm": 1.7134927436590126, "learning_rate": 1.3947800771760278e-08, "loss": 0.3623, "step": 915 }, { "epoch": 0.9834313201496526, "grad_norm": 1.7740180149347793, "learning_rate": 7.84723480049765e-09, "loss": 0.3727, "step": 920 }, { "epoch": 0.9887760555852485, "grad_norm": 1.6783259441701572, "learning_rate": 3.4881669237890603e-09, "loss": 0.36, "step": 925 }, { "epoch": 0.9941207910208445, "grad_norm": 1.6994301851701823, "learning_rate": 8.721177898912691e-10, "loss": 0.3739, "step": 930 }, { "epoch": 0.9994655264564404, "grad_norm": 1.7638563088423833, "learning_rate": 0.0, "loss": 0.3534, "step": 935 }, { "epoch": 0.9994655264564404, "eval_loss": 0.3047424554824829, "eval_runtime": 0.9585, "eval_samples_per_second": 2.087, "eval_steps_per_second": 1.043, "step": 935 }, { "epoch": 0.9994655264564404, "step": 935, "total_flos": 195717633146880.0, "train_loss": 0.5211284054791864, "train_runtime": 23277.081, "train_samples_per_second": 1.286, "train_steps_per_second": 0.04 } ], "logging_steps": 5, "max_steps": 935, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 195717633146880.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }