|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.9994655264564404, |
|
"eval_steps": 500, |
|
"global_step": 935, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0010689470871191875, |
|
"grad_norm": 23.473513373152795, |
|
"learning_rate": 1.0638297872340426e-07, |
|
"loss": 1.353, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.005344735435595938, |
|
"grad_norm": 21.711768425322266, |
|
"learning_rate": 5.319148936170213e-07, |
|
"loss": 1.3519, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.010689470871191877, |
|
"grad_norm": 8.713068756520672, |
|
"learning_rate": 1.0638297872340427e-06, |
|
"loss": 1.2088, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.016034206306787813, |
|
"grad_norm": 8.310866409645099, |
|
"learning_rate": 1.595744680851064e-06, |
|
"loss": 1.0635, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.021378941742383754, |
|
"grad_norm": 3.066210523084854, |
|
"learning_rate": 2.1276595744680853e-06, |
|
"loss": 0.9417, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.02672367717797969, |
|
"grad_norm": 2.493239812049204, |
|
"learning_rate": 2.6595744680851065e-06, |
|
"loss": 0.8884, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.032068412613575625, |
|
"grad_norm": 2.403610159527061, |
|
"learning_rate": 3.191489361702128e-06, |
|
"loss": 0.8557, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.03741314804917156, |
|
"grad_norm": 2.3974276447793694, |
|
"learning_rate": 3.723404255319149e-06, |
|
"loss": 0.8349, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.04275788348476751, |
|
"grad_norm": 2.2921630085655025, |
|
"learning_rate": 4.255319148936171e-06, |
|
"loss": 0.8163, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.048102618920363445, |
|
"grad_norm": 2.3116910060192817, |
|
"learning_rate": 4.787234042553192e-06, |
|
"loss": 0.798, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.05344735435595938, |
|
"grad_norm": 2.242005189567634, |
|
"learning_rate": 5.319148936170213e-06, |
|
"loss": 0.7863, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.05879208979155532, |
|
"grad_norm": 2.536220362549579, |
|
"learning_rate": 5.851063829787235e-06, |
|
"loss": 0.7663, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.06413682522715125, |
|
"grad_norm": 2.492174958150876, |
|
"learning_rate": 6.382978723404256e-06, |
|
"loss": 0.7579, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.06948156066274719, |
|
"grad_norm": 2.3311723974999095, |
|
"learning_rate": 6.914893617021278e-06, |
|
"loss": 0.7397, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.07482629609834313, |
|
"grad_norm": 2.5040282086769317, |
|
"learning_rate": 7.446808510638298e-06, |
|
"loss": 0.7338, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.08017103153393906, |
|
"grad_norm": 2.411086582278843, |
|
"learning_rate": 7.97872340425532e-06, |
|
"loss": 0.7215, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.08551576696953501, |
|
"grad_norm": 2.3433794783119883, |
|
"learning_rate": 8.510638297872341e-06, |
|
"loss": 0.7124, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.09086050240513095, |
|
"grad_norm": 2.354225850924038, |
|
"learning_rate": 9.042553191489362e-06, |
|
"loss": 0.7177, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.09620523784072689, |
|
"grad_norm": 2.4091177426727266, |
|
"learning_rate": 9.574468085106385e-06, |
|
"loss": 0.7163, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.10154997327632283, |
|
"grad_norm": 2.498651758504887, |
|
"learning_rate": 9.999965114314806e-06, |
|
"loss": 0.7031, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.10689470871191876, |
|
"grad_norm": 2.423896856776365, |
|
"learning_rate": 9.998744166446685e-06, |
|
"loss": 0.7005, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.1122394441475147, |
|
"grad_norm": 2.387782305227773, |
|
"learning_rate": 9.995779421092695e-06, |
|
"loss": 0.6846, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.11758417958311064, |
|
"grad_norm": 2.424890050569407, |
|
"learning_rate": 9.991071912495701e-06, |
|
"loss": 0.7027, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.12292891501870658, |
|
"grad_norm": 2.115700167815373, |
|
"learning_rate": 9.984623282856502e-06, |
|
"loss": 0.6923, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.1282736504543025, |
|
"grad_norm": 2.0709758825284363, |
|
"learning_rate": 9.97643578176095e-06, |
|
"loss": 0.6859, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.13361838588989844, |
|
"grad_norm": 2.1305031798421186, |
|
"learning_rate": 9.966512265395188e-06, |
|
"loss": 0.6846, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.13896312132549438, |
|
"grad_norm": 2.0044100900913384, |
|
"learning_rate": 9.95485619554928e-06, |
|
"loss": 0.6799, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.14430785676109031, |
|
"grad_norm": 2.050948697307653, |
|
"learning_rate": 9.941471638409576e-06, |
|
"loss": 0.6767, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.14965259219668625, |
|
"grad_norm": 1.952049269951998, |
|
"learning_rate": 9.926363263140234e-06, |
|
"loss": 0.6669, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.1549973276322822, |
|
"grad_norm": 2.099731647032975, |
|
"learning_rate": 9.90953634025439e-06, |
|
"loss": 0.6786, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.16034206306787813, |
|
"grad_norm": 1.993767696614697, |
|
"learning_rate": 9.890996739775562e-06, |
|
"loss": 0.6674, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.16568679850347406, |
|
"grad_norm": 2.0517367712519503, |
|
"learning_rate": 9.870750929189914e-06, |
|
"loss": 0.6748, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.17103153393907003, |
|
"grad_norm": 2.0491814339157486, |
|
"learning_rate": 9.848805971190074e-06, |
|
"loss": 0.6621, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.17637626937466597, |
|
"grad_norm": 2.0777487120565397, |
|
"learning_rate": 9.825169521211354e-06, |
|
"loss": 0.6543, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.1817210048102619, |
|
"grad_norm": 2.351975397829705, |
|
"learning_rate": 9.799849824761159e-06, |
|
"loss": 0.6552, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.18706574024585784, |
|
"grad_norm": 2.0302930695761034, |
|
"learning_rate": 9.772855714542569e-06, |
|
"loss": 0.6525, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.19241047568145378, |
|
"grad_norm": 2.188255761505827, |
|
"learning_rate": 9.744196607373086e-06, |
|
"loss": 0.6512, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.19775521111704972, |
|
"grad_norm": 2.1631601855326763, |
|
"learning_rate": 9.71388250089959e-06, |
|
"loss": 0.6506, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.20309994655264565, |
|
"grad_norm": 2.041485612804426, |
|
"learning_rate": 9.681923970110698e-06, |
|
"loss": 0.6402, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.2084446819882416, |
|
"grad_norm": 2.1331611096395973, |
|
"learning_rate": 9.648332163647705e-06, |
|
"loss": 0.6234, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.21378941742383753, |
|
"grad_norm": 2.076285244795538, |
|
"learning_rate": 9.613118799915417e-06, |
|
"loss": 0.6422, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.21913415285943347, |
|
"grad_norm": 1.9966049947480653, |
|
"learning_rate": 9.576296162994214e-06, |
|
"loss": 0.62, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.2244788882950294, |
|
"grad_norm": 2.003867139711824, |
|
"learning_rate": 9.537877098354787e-06, |
|
"loss": 0.6431, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.22982362373062534, |
|
"grad_norm": 1.9196987285290612, |
|
"learning_rate": 9.497875008377033e-06, |
|
"loss": 0.6234, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.23516835916622128, |
|
"grad_norm": 2.1957665769389774, |
|
"learning_rate": 9.456303847674674e-06, |
|
"loss": 0.6256, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.24051309460181722, |
|
"grad_norm": 1.884124911991183, |
|
"learning_rate": 9.41317811822723e-06, |
|
"loss": 0.6052, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.24585783003741316, |
|
"grad_norm": 1.9253697770132223, |
|
"learning_rate": 9.36851286432104e-06, |
|
"loss": 0.6228, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.25120256547300907, |
|
"grad_norm": 1.8020291089671914, |
|
"learning_rate": 9.322323667301113e-06, |
|
"loss": 0.6191, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.256547300908605, |
|
"grad_norm": 1.9107218407751854, |
|
"learning_rate": 9.274626640135616e-06, |
|
"loss": 0.6121, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.26189203634420094, |
|
"grad_norm": 1.8979104758577288, |
|
"learning_rate": 9.225438421794919e-06, |
|
"loss": 0.616, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.2672367717797969, |
|
"grad_norm": 1.9569646424906317, |
|
"learning_rate": 9.174776171447126e-06, |
|
"loss": 0.6047, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.2725815072153928, |
|
"grad_norm": 2.0364213519267715, |
|
"learning_rate": 9.12265756247216e-06, |
|
"loss": 0.6093, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.27792624265098875, |
|
"grad_norm": 1.9725707409017998, |
|
"learning_rate": 9.06910077629645e-06, |
|
"loss": 0.599, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.2832709780865847, |
|
"grad_norm": 1.9492899906874035, |
|
"learning_rate": 9.014124496050391e-06, |
|
"loss": 0.5902, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.28861571352218063, |
|
"grad_norm": 2.174543196329182, |
|
"learning_rate": 8.957747900050797e-06, |
|
"loss": 0.5937, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.29396044895777657, |
|
"grad_norm": 2.4160108027006615, |
|
"learning_rate": 8.899990655110586e-06, |
|
"loss": 0.6018, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.2993051843933725, |
|
"grad_norm": 2.087978454933178, |
|
"learning_rate": 8.840872909678081e-06, |
|
"loss": 0.5832, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.30464991982896844, |
|
"grad_norm": 1.9762659778049954, |
|
"learning_rate": 8.780415286808284e-06, |
|
"loss": 0.5922, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.3099946552645644, |
|
"grad_norm": 2.269903050974625, |
|
"learning_rate": 8.718638876968564e-06, |
|
"loss": 0.587, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.3153393907001603, |
|
"grad_norm": 2.0490480592368043, |
|
"learning_rate": 8.655565230681329e-06, |
|
"loss": 0.5748, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.32068412613575625, |
|
"grad_norm": 2.2764533528101167, |
|
"learning_rate": 8.591216351006181e-06, |
|
"loss": 0.575, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.3260288615713522, |
|
"grad_norm": 2.0737769584580072, |
|
"learning_rate": 8.525614685864209e-06, |
|
"loss": 0.5716, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.33137359700694813, |
|
"grad_norm": 1.9102889564355032, |
|
"learning_rate": 8.458783120207099e-06, |
|
"loss": 0.5686, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.3367183324425441, |
|
"grad_norm": 1.8800981653614064, |
|
"learning_rate": 8.390744968033785e-06, |
|
"loss": 0.5629, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.34206306787814006, |
|
"grad_norm": 1.96332588340258, |
|
"learning_rate": 8.321523964257431e-06, |
|
"loss": 0.5657, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.347407803313736, |
|
"grad_norm": 1.9485967093770373, |
|
"learning_rate": 8.251144256425562e-06, |
|
"loss": 0.575, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.35275253874933193, |
|
"grad_norm": 1.929135934757674, |
|
"learning_rate": 8.179630396296285e-06, |
|
"loss": 0.5632, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.35809727418492787, |
|
"grad_norm": 1.9559096020055298, |
|
"learning_rate": 8.107007331273449e-06, |
|
"loss": 0.5626, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.3634420096205238, |
|
"grad_norm": 2.012794105136495, |
|
"learning_rate": 8.033300395703845e-06, |
|
"loss": 0.5546, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.36878674505611975, |
|
"grad_norm": 1.9997480665852025, |
|
"learning_rate": 7.958535302039368e-06, |
|
"loss": 0.5458, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.3741314804917157, |
|
"grad_norm": 1.8451676891383395, |
|
"learning_rate": 7.88273813186732e-06, |
|
"loss": 0.5483, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.3794762159273116, |
|
"grad_norm": 1.895313478955964, |
|
"learning_rate": 7.805935326811913e-06, |
|
"loss": 0.5517, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.38482095136290756, |
|
"grad_norm": 1.9301374165518745, |
|
"learning_rate": 7.728153679310186e-06, |
|
"loss": 0.5464, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.3901656867985035, |
|
"grad_norm": 2.014621013972388, |
|
"learning_rate": 7.649420323265547e-06, |
|
"loss": 0.5441, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.39551042223409943, |
|
"grad_norm": 1.9682365332357328, |
|
"learning_rate": 7.569762724582179e-06, |
|
"loss": 0.5247, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.40085515766969537, |
|
"grad_norm": 1.9950047285262882, |
|
"learning_rate": 7.48920867158365e-06, |
|
"loss": 0.5435, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.4061998931052913, |
|
"grad_norm": 1.8591367525423361, |
|
"learning_rate": 7.407786265319023e-06, |
|
"loss": 0.5363, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.41154462854088725, |
|
"grad_norm": 2.038203032181739, |
|
"learning_rate": 7.325523909759902e-06, |
|
"loss": 0.5312, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.4168893639764832, |
|
"grad_norm": 2.1300865254144856, |
|
"learning_rate": 7.242450301891772e-06, |
|
"loss": 0.5257, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.4222340994120791, |
|
"grad_norm": 1.882363933745456, |
|
"learning_rate": 7.158594421703152e-06, |
|
"loss": 0.5096, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.42757883484767506, |
|
"grad_norm": 1.8727951959035394, |
|
"learning_rate": 7.073985522076001e-06, |
|
"loss": 0.5184, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.432923570283271, |
|
"grad_norm": 1.9894963218262207, |
|
"learning_rate": 6.9886531185809385e-06, |
|
"loss": 0.5259, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.43826830571886694, |
|
"grad_norm": 2.0264734474019854, |
|
"learning_rate": 6.902626979180821e-06, |
|
"loss": 0.5287, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.4436130411544629, |
|
"grad_norm": 1.8722413198751242, |
|
"learning_rate": 6.8159371138462745e-06, |
|
"loss": 0.5203, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.4489577765900588, |
|
"grad_norm": 1.9496986261286822, |
|
"learning_rate": 6.728613764086806e-06, |
|
"loss": 0.5117, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.45430251202565475, |
|
"grad_norm": 1.9774605540105135, |
|
"learning_rate": 6.640687392401132e-06, |
|
"loss": 0.5031, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.4596472474612507, |
|
"grad_norm": 1.954818070964011, |
|
"learning_rate": 6.552188671650434e-06, |
|
"loss": 0.5031, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.4649919828968466, |
|
"grad_norm": 1.9617332949799173, |
|
"learning_rate": 6.46314847435821e-06, |
|
"loss": 0.5066, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.47033671833244256, |
|
"grad_norm": 2.004929689013094, |
|
"learning_rate": 6.373597861940488e-06, |
|
"loss": 0.4988, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.4756814537680385, |
|
"grad_norm": 1.9774178895562935, |
|
"learning_rate": 6.283568073870147e-06, |
|
"loss": 0.4975, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 0.48102618920363444, |
|
"grad_norm": 1.9264282288547034, |
|
"learning_rate": 6.1930905167791025e-06, |
|
"loss": 0.4995, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.4863709246392304, |
|
"grad_norm": 1.8764137691966425, |
|
"learning_rate": 6.102196753502202e-06, |
|
"loss": 0.4808, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 0.4917156600748263, |
|
"grad_norm": 2.050651493580463, |
|
"learning_rate": 6.010918492066628e-06, |
|
"loss": 0.482, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.49706039551042225, |
|
"grad_norm": 1.9228723322188161, |
|
"learning_rate": 5.919287574630628e-06, |
|
"loss": 0.4843, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 0.5024051309460181, |
|
"grad_norm": 1.9851687404052978, |
|
"learning_rate": 5.827335966375485e-06, |
|
"loss": 0.4894, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.5077498663816141, |
|
"grad_norm": 1.9039931208460692, |
|
"learning_rate": 5.735095744354543e-06, |
|
"loss": 0.4673, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.51309460181721, |
|
"grad_norm": 1.9780097627545055, |
|
"learning_rate": 5.642599086303233e-06, |
|
"loss": 0.4788, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.518439337252806, |
|
"grad_norm": 2.0794416013824994, |
|
"learning_rate": 5.5498782594139476e-06, |
|
"loss": 0.4662, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 0.5237840726884019, |
|
"grad_norm": 1.9483487482401622, |
|
"learning_rate": 5.456965609079741e-06, |
|
"loss": 0.4763, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.5291288081239979, |
|
"grad_norm": 1.8945000444925455, |
|
"learning_rate": 5.363893547610715e-06, |
|
"loss": 0.4807, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 0.5344735435595938, |
|
"grad_norm": 1.9768379335510178, |
|
"learning_rate": 5.270694542927089e-06, |
|
"loss": 0.4803, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.5398182789951897, |
|
"grad_norm": 1.9758527545184905, |
|
"learning_rate": 5.1774011072328575e-06, |
|
"loss": 0.4627, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 0.5451630144307856, |
|
"grad_norm": 1.866392771665028, |
|
"learning_rate": 5.084045785674001e-06, |
|
"loss": 0.4608, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.5505077498663816, |
|
"grad_norm": 1.957078999847815, |
|
"learning_rate": 4.9906611449852035e-06, |
|
"loss": 0.4542, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 0.5558524853019775, |
|
"grad_norm": 1.9342632236304118, |
|
"learning_rate": 4.897279762129044e-06, |
|
"loss": 0.4547, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.5611972207375735, |
|
"grad_norm": 1.966938473585663, |
|
"learning_rate": 4.8039342129316175e-06, |
|
"loss": 0.4526, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 0.5665419561731694, |
|
"grad_norm": 1.9823937668875962, |
|
"learning_rate": 4.710657060718547e-06, |
|
"loss": 0.4503, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.5718866916087654, |
|
"grad_norm": 1.931604909577632, |
|
"learning_rate": 4.617480844955367e-06, |
|
"loss": 0.4543, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 0.5772314270443613, |
|
"grad_norm": 1.9170992207978994, |
|
"learning_rate": 4.52443806989622e-06, |
|
"loss": 0.4383, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.5825761624799572, |
|
"grad_norm": 1.9043676805575804, |
|
"learning_rate": 4.431561193244852e-06, |
|
"loss": 0.4546, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 0.5879208979155531, |
|
"grad_norm": 1.9233137290689744, |
|
"learning_rate": 4.338882614831817e-06, |
|
"loss": 0.446, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.5932656333511491, |
|
"grad_norm": 1.8515724013719734, |
|
"learning_rate": 4.246434665311907e-06, |
|
"loss": 0.4321, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 0.598610368786745, |
|
"grad_norm": 1.8828506758113701, |
|
"learning_rate": 4.154249594885687e-06, |
|
"loss": 0.4337, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.603955104222341, |
|
"grad_norm": 1.9687942950195187, |
|
"learning_rate": 4.062359562049109e-06, |
|
"loss": 0.431, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 0.6092998396579369, |
|
"grad_norm": 1.8995143280590803, |
|
"learning_rate": 3.970796622375116e-06, |
|
"loss": 0.4405, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.6146445750935329, |
|
"grad_norm": 1.8557131453687543, |
|
"learning_rate": 3.879592717331141e-06, |
|
"loss": 0.4283, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 0.6199893105291288, |
|
"grad_norm": 1.9309890578975801, |
|
"learning_rate": 3.78877966313642e-06, |
|
"loss": 0.4367, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.6253340459647247, |
|
"grad_norm": 1.973985583949257, |
|
"learning_rate": 3.698389139663003e-06, |
|
"loss": 0.4324, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 0.6306787814003206, |
|
"grad_norm": 1.8679236635919254, |
|
"learning_rate": 3.608452679384311e-06, |
|
"loss": 0.4289, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.6360235168359166, |
|
"grad_norm": 1.827491182434645, |
|
"learning_rate": 3.5190016563751316e-06, |
|
"loss": 0.427, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 0.6413682522715125, |
|
"grad_norm": 2.0739567283668583, |
|
"learning_rate": 3.4300672753668635e-06, |
|
"loss": 0.4163, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.6467129877071085, |
|
"grad_norm": 1.8900939036994293, |
|
"learning_rate": 3.34168056086183e-06, |
|
"loss": 0.4322, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 0.6520577231427044, |
|
"grad_norm": 1.9421083071105274, |
|
"learning_rate": 3.2538723463104737e-06, |
|
"loss": 0.4139, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.6574024585783004, |
|
"grad_norm": 1.9389760896779549, |
|
"learning_rate": 3.166673263355199e-06, |
|
"loss": 0.4238, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 0.6627471940138963, |
|
"grad_norm": 1.874356566114327, |
|
"learning_rate": 3.0801137311446087e-06, |
|
"loss": 0.4165, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.6680919294494923, |
|
"grad_norm": 1.9580525970654405, |
|
"learning_rate": 2.994223945721872e-06, |
|
"loss": 0.4082, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 0.6734366648850882, |
|
"grad_norm": 1.9147611470583412, |
|
"learning_rate": 2.9090338694909254e-06, |
|
"loss": 0.4219, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.6787814003206841, |
|
"grad_norm": 1.86847720750241, |
|
"learning_rate": 2.8245732207641705e-06, |
|
"loss": 0.4132, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 0.6841261357562801, |
|
"grad_norm": 1.8982814741174805, |
|
"learning_rate": 2.740871463395325e-06, |
|
"loss": 0.4129, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.689470871191876, |
|
"grad_norm": 1.901393825227574, |
|
"learning_rate": 2.65795779650105e-06, |
|
"loss": 0.4033, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 0.694815606627472, |
|
"grad_norm": 1.8509358915404532, |
|
"learning_rate": 2.575861144274914e-06, |
|
"loss": 0.4013, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.7001603420630679, |
|
"grad_norm": 1.8958219311827695, |
|
"learning_rate": 2.4946101458972744e-06, |
|
"loss": 0.4027, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 0.7055050774986639, |
|
"grad_norm": 1.8426235286345023, |
|
"learning_rate": 2.414233145544585e-06, |
|
"loss": 0.3964, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.7108498129342598, |
|
"grad_norm": 1.9191012877180513, |
|
"learning_rate": 2.33475818250161e-06, |
|
"loss": 0.3954, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 0.7161945483698557, |
|
"grad_norm": 1.902716391588277, |
|
"learning_rate": 2.256212981379996e-06, |
|
"loss": 0.3957, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.7215392838054516, |
|
"grad_norm": 1.947059841210012, |
|
"learning_rate": 2.178624942446626e-06, |
|
"loss": 0.401, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 0.7268840192410476, |
|
"grad_norm": 1.917543839246915, |
|
"learning_rate": 2.1020211320651135e-06, |
|
"loss": 0.3888, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.7322287546766435, |
|
"grad_norm": 1.8327154685147313, |
|
"learning_rate": 2.0264282732537827e-06, |
|
"loss": 0.4003, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 0.7375734901122395, |
|
"grad_norm": 1.7919182630342896, |
|
"learning_rate": 1.9518727363634187e-06, |
|
"loss": 0.3873, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.7429182255478354, |
|
"grad_norm": 1.9169963711238958, |
|
"learning_rate": 1.8783805298780427e-06, |
|
"loss": 0.4018, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 0.7482629609834314, |
|
"grad_norm": 1.8380910216299624, |
|
"learning_rate": 1.8059772913419305e-06, |
|
"loss": 0.3946, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.7536076964190273, |
|
"grad_norm": 1.9558297629437302, |
|
"learning_rate": 1.7346882784160346e-06, |
|
"loss": 0.3916, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 0.7589524318546232, |
|
"grad_norm": 1.897087729189459, |
|
"learning_rate": 1.6645383600669124e-06, |
|
"loss": 0.3919, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.7642971672902191, |
|
"grad_norm": 1.8755131544746726, |
|
"learning_rate": 1.5955520078912628e-06, |
|
"loss": 0.3846, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 0.7696419027258151, |
|
"grad_norm": 1.8807950794746984, |
|
"learning_rate": 1.527753287579084e-06, |
|
"loss": 0.3801, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.774986638161411, |
|
"grad_norm": 1.8519074792344092, |
|
"learning_rate": 1.461165850518424e-06, |
|
"loss": 0.3788, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 0.780331373597007, |
|
"grad_norm": 1.894015594982712, |
|
"learning_rate": 1.3958129255446585e-06, |
|
"loss": 0.383, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.7856761090326029, |
|
"grad_norm": 1.8830744382706812, |
|
"learning_rate": 1.3317173108371834e-06, |
|
"loss": 0.3803, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 0.7910208444681989, |
|
"grad_norm": 1.723519908836632, |
|
"learning_rate": 1.268901365966337e-06, |
|
"loss": 0.3822, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.7963655799037948, |
|
"grad_norm": 1.7952600980364406, |
|
"learning_rate": 1.2073870040933212e-06, |
|
"loss": 0.3715, |
|
"step": 745 |
|
}, |
|
{ |
|
"epoch": 0.8017103153393907, |
|
"grad_norm": 1.9566873596603427, |
|
"learning_rate": 1.1471956843258676e-06, |
|
"loss": 0.384, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.8070550507749866, |
|
"grad_norm": 1.8386259618144518, |
|
"learning_rate": 1.0883484042322796e-06, |
|
"loss": 0.3869, |
|
"step": 755 |
|
}, |
|
{ |
|
"epoch": 0.8123997862105826, |
|
"grad_norm": 1.8260297894288255, |
|
"learning_rate": 1.0308656925165033e-06, |
|
"loss": 0.3812, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.8177445216461785, |
|
"grad_norm": 1.8047190650634146, |
|
"learning_rate": 9.74767601856737e-07, |
|
"loss": 0.3806, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 0.8230892570817745, |
|
"grad_norm": 1.8221471376738447, |
|
"learning_rate": 9.200737019101169e-07, |
|
"loss": 0.3777, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.8284339925173704, |
|
"grad_norm": 1.963769411688169, |
|
"learning_rate": 8.668030724858984e-07, |
|
"loss": 0.3688, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 0.8337787279529664, |
|
"grad_norm": 1.7734438619458048, |
|
"learning_rate": 8.149742968895253e-07, |
|
"loss": 0.3756, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.8391234633885623, |
|
"grad_norm": 1.800103499706633, |
|
"learning_rate": 7.646054554398863e-07, |
|
"loss": 0.3773, |
|
"step": 785 |
|
}, |
|
{ |
|
"epoch": 0.8444681988241582, |
|
"grad_norm": 1.8625669508711113, |
|
"learning_rate": 7.157141191620548e-07, |
|
"loss": 0.3682, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.8498129342597541, |
|
"grad_norm": 1.8353500996941807, |
|
"learning_rate": 6.683173436576851e-07, |
|
"loss": 0.3646, |
|
"step": 795 |
|
}, |
|
{ |
|
"epoch": 0.8551576696953501, |
|
"grad_norm": 1.8832589125337102, |
|
"learning_rate": 6.224316631552207e-07, |
|
"loss": 0.376, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.860502405130946, |
|
"grad_norm": 1.7768361367435437, |
|
"learning_rate": 5.780730847419652e-07, |
|
"loss": 0.3657, |
|
"step": 805 |
|
}, |
|
{ |
|
"epoch": 0.865847140566542, |
|
"grad_norm": 1.7627566159259074, |
|
"learning_rate": 5.35257082780069e-07, |
|
"loss": 0.3674, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.8711918760021379, |
|
"grad_norm": 1.8420469952326894, |
|
"learning_rate": 4.939985935083314e-07, |
|
"loss": 0.3669, |
|
"step": 815 |
|
}, |
|
{ |
|
"epoch": 0.8765366114377339, |
|
"grad_norm": 1.7499771334622984, |
|
"learning_rate": 4.5431200983174493e-07, |
|
"loss": 0.3671, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.8818813468733298, |
|
"grad_norm": 1.7862835118779703, |
|
"learning_rate": 4.1621117630056606e-07, |
|
"loss": 0.3729, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 0.8872260823089257, |
|
"grad_norm": 1.7739216023779494, |
|
"learning_rate": 3.7970938428068813e-07, |
|
"loss": 0.3714, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.8925708177445216, |
|
"grad_norm": 1.7066183325006379, |
|
"learning_rate": 3.4481936731698415e-07, |
|
"loss": 0.3655, |
|
"step": 835 |
|
}, |
|
{ |
|
"epoch": 0.8979155531801176, |
|
"grad_norm": 1.7916893607261628, |
|
"learning_rate": 3.1155329669124876e-07, |
|
"loss": 0.3702, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.9032602886157135, |
|
"grad_norm": 1.803470706275281, |
|
"learning_rate": 2.7992277717627856e-07, |
|
"loss": 0.3599, |
|
"step": 845 |
|
}, |
|
{ |
|
"epoch": 0.9086050240513095, |
|
"grad_norm": 1.7999593269005436, |
|
"learning_rate": 2.4993884298758097e-07, |
|
"loss": 0.3621, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.9139497594869054, |
|
"grad_norm": 1.749439500356134, |
|
"learning_rate": 2.2161195393412493e-07, |
|
"loss": 0.361, |
|
"step": 855 |
|
}, |
|
{ |
|
"epoch": 0.9192944949225014, |
|
"grad_norm": 1.804710229821361, |
|
"learning_rate": 1.9495199176945977e-07, |
|
"loss": 0.3705, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.9246392303580973, |
|
"grad_norm": 1.7380753832775804, |
|
"learning_rate": 1.6996825674449768e-07, |
|
"loss": 0.3581, |
|
"step": 865 |
|
}, |
|
{ |
|
"epoch": 0.9299839657936932, |
|
"grad_norm": 1.9009106101736641, |
|
"learning_rate": 1.4666946436314832e-07, |
|
"loss": 0.3699, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.9353287012292891, |
|
"grad_norm": 1.7420463319700745, |
|
"learning_rate": 1.2506374234193985e-07, |
|
"loss": 0.36, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 0.9406734366648851, |
|
"grad_norm": 1.7367484197909535, |
|
"learning_rate": 1.0515862777468689e-07, |
|
"loss": 0.3611, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.946018172100481, |
|
"grad_norm": 1.8449072611969448, |
|
"learning_rate": 8.69610645031993e-08, |
|
"loss": 0.3585, |
|
"step": 885 |
|
}, |
|
{ |
|
"epoch": 0.951362907536077, |
|
"grad_norm": 1.7050020816812541, |
|
"learning_rate": 7.047740069494102e-08, |
|
"loss": 0.3663, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.9567076429716729, |
|
"grad_norm": 1.7491629746492106, |
|
"learning_rate": 5.571338662849257e-08, |
|
"loss": 0.3621, |
|
"step": 895 |
|
}, |
|
{ |
|
"epoch": 0.9620523784072689, |
|
"grad_norm": 1.8299154207593904, |
|
"learning_rate": 4.267417268758123e-08, |
|
"loss": 0.3657, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.9673971138428648, |
|
"grad_norm": 1.7386431635341528, |
|
"learning_rate": 3.1364307564384357e-08, |
|
"loss": 0.3604, |
|
"step": 905 |
|
}, |
|
{ |
|
"epoch": 0.9727418492784607, |
|
"grad_norm": 1.851821490817842, |
|
"learning_rate": 2.178773667273204e-08, |
|
"loss": 0.3639, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.9780865847140566, |
|
"grad_norm": 1.7134927436590126, |
|
"learning_rate": 1.3947800771760278e-08, |
|
"loss": 0.3623, |
|
"step": 915 |
|
}, |
|
{ |
|
"epoch": 0.9834313201496526, |
|
"grad_norm": 1.7740180149347793, |
|
"learning_rate": 7.84723480049765e-09, |
|
"loss": 0.3727, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.9887760555852485, |
|
"grad_norm": 1.6783259441701572, |
|
"learning_rate": 3.4881669237890603e-09, |
|
"loss": 0.36, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 0.9941207910208445, |
|
"grad_norm": 1.6994301851701823, |
|
"learning_rate": 8.721177898912691e-10, |
|
"loss": 0.3739, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.9994655264564404, |
|
"grad_norm": 1.7638563088423833, |
|
"learning_rate": 0.0, |
|
"loss": 0.3534, |
|
"step": 935 |
|
}, |
|
{ |
|
"epoch": 0.9994655264564404, |
|
"eval_loss": 0.3047424554824829, |
|
"eval_runtime": 0.9585, |
|
"eval_samples_per_second": 2.087, |
|
"eval_steps_per_second": 1.043, |
|
"step": 935 |
|
}, |
|
{ |
|
"epoch": 0.9994655264564404, |
|
"step": 935, |
|
"total_flos": 195717633146880.0, |
|
"train_loss": 0.5211284054791864, |
|
"train_runtime": 23277.081, |
|
"train_samples_per_second": 1.286, |
|
"train_steps_per_second": 0.04 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 935, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 195717633146880.0, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|