{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9998731447418495, "eval_steps": 500, "global_step": 3941, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002537105163009007, "grad_norm": 4.350327938684374, "learning_rate": 2.5316455696202533e-07, "loss": 1.8196, "step": 10 }, { "epoch": 0.005074210326018014, "grad_norm": 3.678530456221003, "learning_rate": 5.063291139240507e-07, "loss": 1.8108, "step": 20 }, { "epoch": 0.00761131548902702, "grad_norm": 2.945036910011768, "learning_rate": 7.59493670886076e-07, "loss": 1.8147, "step": 30 }, { "epoch": 0.010148420652036028, "grad_norm": 2.3190574498378447, "learning_rate": 1.0126582278481013e-06, "loss": 1.7548, "step": 40 }, { "epoch": 0.012685525815045033, "grad_norm": 2.074674879169714, "learning_rate": 1.2658227848101267e-06, "loss": 1.7204, "step": 50 }, { "epoch": 0.01522263097805404, "grad_norm": 1.748653753253889, "learning_rate": 1.518987341772152e-06, "loss": 1.6762, "step": 60 }, { "epoch": 0.01775973614106305, "grad_norm": 1.6751399170091914, "learning_rate": 1.7721518987341774e-06, "loss": 1.6487, "step": 70 }, { "epoch": 0.020296841304072055, "grad_norm": 1.6363195765660283, "learning_rate": 2.0253164556962026e-06, "loss": 1.6116, "step": 80 }, { "epoch": 0.022833946467081062, "grad_norm": 1.5794976909544993, "learning_rate": 2.278481012658228e-06, "loss": 1.5916, "step": 90 }, { "epoch": 0.025371051630090066, "grad_norm": 1.6233632992191482, "learning_rate": 2.5316455696202535e-06, "loss": 1.598, "step": 100 }, { "epoch": 0.027908156793099072, "grad_norm": 1.5800661094425872, "learning_rate": 2.7848101265822785e-06, "loss": 1.5626, "step": 110 }, { "epoch": 0.03044526195610808, "grad_norm": 1.6072050143283245, "learning_rate": 3.037974683544304e-06, "loss": 1.5457, "step": 120 }, { "epoch": 0.03298236711911709, "grad_norm": 1.6572306247078625, "learning_rate": 3.2911392405063294e-06, "loss": 1.5391, "step": 130 }, { "epoch": 0.0355194722821261, "grad_norm": 1.586848380490154, "learning_rate": 3.544303797468355e-06, "loss": 1.5125, "step": 140 }, { "epoch": 0.038056577445135104, "grad_norm": 1.638449311664989, "learning_rate": 3.7974683544303802e-06, "loss": 1.5243, "step": 150 }, { "epoch": 0.04059368260814411, "grad_norm": 1.6031920359772533, "learning_rate": 4.050632911392405e-06, "loss": 1.4856, "step": 160 }, { "epoch": 0.04313078777115312, "grad_norm": 1.5967439995800559, "learning_rate": 4.303797468354431e-06, "loss": 1.5129, "step": 170 }, { "epoch": 0.045667892934162124, "grad_norm": 1.6075212497819606, "learning_rate": 4.556962025316456e-06, "loss": 1.4961, "step": 180 }, { "epoch": 0.04820499809717113, "grad_norm": 1.6791238021539772, "learning_rate": 4.8101265822784815e-06, "loss": 1.4893, "step": 190 }, { "epoch": 0.05074210326018013, "grad_norm": 1.6210921331693446, "learning_rate": 5.063291139240507e-06, "loss": 1.5013, "step": 200 }, { "epoch": 0.05327920842318914, "grad_norm": 1.7652729374280518, "learning_rate": 5.3164556962025316e-06, "loss": 1.473, "step": 210 }, { "epoch": 0.055816313586198145, "grad_norm": 1.6723244529240142, "learning_rate": 5.569620253164557e-06, "loss": 1.4791, "step": 220 }, { "epoch": 0.05835341874920715, "grad_norm": 1.825326138794735, "learning_rate": 5.8227848101265824e-06, "loss": 1.4761, "step": 230 }, { "epoch": 0.06089052391221616, "grad_norm": 1.9131148271572453, "learning_rate": 6.075949367088608e-06, "loss": 1.4626, "step": 240 }, { "epoch": 0.06342762907522517, "grad_norm": 1.6613770739809675, "learning_rate": 6.329113924050634e-06, "loss": 1.4601, "step": 250 }, { "epoch": 0.06596473423823418, "grad_norm": 1.6666458237214428, "learning_rate": 6.582278481012659e-06, "loss": 1.4686, "step": 260 }, { "epoch": 0.06850183940124319, "grad_norm": 1.5745675069520453, "learning_rate": 6.835443037974684e-06, "loss": 1.461, "step": 270 }, { "epoch": 0.0710389445642522, "grad_norm": 1.6507776778175596, "learning_rate": 7.08860759493671e-06, "loss": 1.47, "step": 280 }, { "epoch": 0.0735760497272612, "grad_norm": 1.6009958375778823, "learning_rate": 7.341772151898735e-06, "loss": 1.4526, "step": 290 }, { "epoch": 0.07611315489027021, "grad_norm": 1.6786912574149853, "learning_rate": 7.5949367088607605e-06, "loss": 1.4501, "step": 300 }, { "epoch": 0.07865026005327921, "grad_norm": 1.6698693144659327, "learning_rate": 7.848101265822786e-06, "loss": 1.4483, "step": 310 }, { "epoch": 0.08118736521628822, "grad_norm": 1.7393580296857223, "learning_rate": 8.10126582278481e-06, "loss": 1.4252, "step": 320 }, { "epoch": 0.08372447037929723, "grad_norm": 1.6124831573952214, "learning_rate": 8.354430379746837e-06, "loss": 1.4274, "step": 330 }, { "epoch": 0.08626157554230623, "grad_norm": 1.6899774259466704, "learning_rate": 8.607594936708861e-06, "loss": 1.437, "step": 340 }, { "epoch": 0.08879868070531524, "grad_norm": 1.6821954539953226, "learning_rate": 8.860759493670886e-06, "loss": 1.4388, "step": 350 }, { "epoch": 0.09133578586832425, "grad_norm": 1.8121412852354848, "learning_rate": 9.113924050632912e-06, "loss": 1.4151, "step": 360 }, { "epoch": 0.09387289103133326, "grad_norm": 1.5407350947949157, "learning_rate": 9.367088607594937e-06, "loss": 1.4274, "step": 370 }, { "epoch": 0.09640999619434226, "grad_norm": 1.7381357929095853, "learning_rate": 9.620253164556963e-06, "loss": 1.4309, "step": 380 }, { "epoch": 0.09894710135735126, "grad_norm": 1.6085237968347799, "learning_rate": 9.87341772151899e-06, "loss": 1.4173, "step": 390 }, { "epoch": 0.10148420652036026, "grad_norm": 1.728407830056737, "learning_rate": 9.999950942931784e-06, "loss": 1.4312, "step": 400 }, { "epoch": 0.10402131168336927, "grad_norm": 1.6972786696047149, "learning_rate": 9.999558492161865e-06, "loss": 1.422, "step": 410 }, { "epoch": 0.10655841684637828, "grad_norm": 1.6297612720977512, "learning_rate": 9.998773621425852e-06, "loss": 1.3892, "step": 420 }, { "epoch": 0.10909552200938728, "grad_norm": 1.7368434280409393, "learning_rate": 9.997596392328971e-06, "loss": 1.4368, "step": 430 }, { "epoch": 0.11163262717239629, "grad_norm": 1.8196387241516612, "learning_rate": 9.996026897273024e-06, "loss": 1.4129, "step": 440 }, { "epoch": 0.1141697323354053, "grad_norm": 1.6083388947957715, "learning_rate": 9.994065259449128e-06, "loss": 1.4181, "step": 450 }, { "epoch": 0.1167068374984143, "grad_norm": 1.7315485017229137, "learning_rate": 9.991711632828049e-06, "loss": 1.4107, "step": 460 }, { "epoch": 0.11924394266142331, "grad_norm": 1.6192259621686464, "learning_rate": 9.988966202148115e-06, "loss": 1.3933, "step": 470 }, { "epoch": 0.12178104782443232, "grad_norm": 1.685661277294985, "learning_rate": 9.985829182900717e-06, "loss": 1.4305, "step": 480 }, { "epoch": 0.12431815298744132, "grad_norm": 1.7673237121898477, "learning_rate": 9.982300821313394e-06, "loss": 1.407, "step": 490 }, { "epoch": 0.12685525815045035, "grad_norm": 1.6729684650970384, "learning_rate": 9.978381394330509e-06, "loss": 1.3941, "step": 500 }, { "epoch": 0.12939236331345935, "grad_norm": 1.6737204711157692, "learning_rate": 9.974071209591507e-06, "loss": 1.4083, "step": 510 }, { "epoch": 0.13192946847646836, "grad_norm": 1.5846450496238496, "learning_rate": 9.96937060540677e-06, "loss": 1.3913, "step": 520 }, { "epoch": 0.13446657363947737, "grad_norm": 1.6440175318683266, "learning_rate": 9.964279950731066e-06, "loss": 1.4141, "step": 530 }, { "epoch": 0.13700367880248637, "grad_norm": 1.5435352480418292, "learning_rate": 9.958799645134585e-06, "loss": 1.3923, "step": 540 }, { "epoch": 0.13954078396549538, "grad_norm": 1.6806917695478834, "learning_rate": 9.952930118771576e-06, "loss": 1.3882, "step": 550 }, { "epoch": 0.1420778891285044, "grad_norm": 1.6991483906725386, "learning_rate": 9.946671832346588e-06, "loss": 1.3806, "step": 560 }, { "epoch": 0.1446149942915134, "grad_norm": 1.6444779930069549, "learning_rate": 9.940025277078304e-06, "loss": 1.3877, "step": 570 }, { "epoch": 0.1471520994545224, "grad_norm": 1.584958906864304, "learning_rate": 9.932990974660992e-06, "loss": 1.3758, "step": 580 }, { "epoch": 0.1496892046175314, "grad_norm": 1.6339337045637337, "learning_rate": 9.925569477223549e-06, "loss": 1.3942, "step": 590 }, { "epoch": 0.15222630978054041, "grad_norm": 1.6782688039697937, "learning_rate": 9.917761367286164e-06, "loss": 1.3997, "step": 600 }, { "epoch": 0.15476341494354942, "grad_norm": 1.722985917307532, "learning_rate": 9.909567257714605e-06, "loss": 1.3902, "step": 610 }, { "epoch": 0.15730052010655843, "grad_norm": 1.515629790408513, "learning_rate": 9.9009877916721e-06, "loss": 1.3906, "step": 620 }, { "epoch": 0.15983762526956743, "grad_norm": 1.6832684084973726, "learning_rate": 9.892023642568871e-06, "loss": 1.3644, "step": 630 }, { "epoch": 0.16237473043257644, "grad_norm": 1.7217857604177804, "learning_rate": 9.882675514009262e-06, "loss": 1.3673, "step": 640 }, { "epoch": 0.16491183559558545, "grad_norm": 1.772784930329774, "learning_rate": 9.872944139736523e-06, "loss": 1.3751, "step": 650 }, { "epoch": 0.16744894075859446, "grad_norm": 1.528731449667675, "learning_rate": 9.862830283575215e-06, "loss": 1.3678, "step": 660 }, { "epoch": 0.16998604592160346, "grad_norm": 1.6156407009731812, "learning_rate": 9.852334739371252e-06, "loss": 1.3825, "step": 670 }, { "epoch": 0.17252315108461247, "grad_norm": 1.6415774929326135, "learning_rate": 9.841458330929598e-06, "loss": 1.3884, "step": 680 }, { "epoch": 0.17506025624762148, "grad_norm": 1.6070221223746397, "learning_rate": 9.830201911949604e-06, "loss": 1.3934, "step": 690 }, { "epoch": 0.17759736141063048, "grad_norm": 1.562454926578275, "learning_rate": 9.818566365957996e-06, "loss": 1.3645, "step": 700 }, { "epoch": 0.1801344665736395, "grad_norm": 1.5996614008577792, "learning_rate": 9.80655260623953e-06, "loss": 1.3708, "step": 710 }, { "epoch": 0.1826715717366485, "grad_norm": 1.5048794279696338, "learning_rate": 9.794161575765311e-06, "loss": 1.3749, "step": 720 }, { "epoch": 0.1852086768996575, "grad_norm": 1.5935516523984996, "learning_rate": 9.78139424711877e-06, "loss": 1.3886, "step": 730 }, { "epoch": 0.1877457820626665, "grad_norm": 1.5282445861981415, "learning_rate": 9.76825162241933e-06, "loss": 1.373, "step": 740 }, { "epoch": 0.19028288722567552, "grad_norm": 1.6358363561782086, "learning_rate": 9.754734733243749e-06, "loss": 1.3742, "step": 750 }, { "epoch": 0.19281999238868452, "grad_norm": 1.587601691095452, "learning_rate": 9.740844640545151e-06, "loss": 1.3603, "step": 760 }, { "epoch": 0.19535709755169353, "grad_norm": 1.5280980736395107, "learning_rate": 9.726582434569744e-06, "loss": 1.3636, "step": 770 }, { "epoch": 0.1978942027147025, "grad_norm": 1.5821827900533842, "learning_rate": 9.711949234771258e-06, "loss": 1.3536, "step": 780 }, { "epoch": 0.20043130787771152, "grad_norm": 1.6354511380141648, "learning_rate": 9.696946189723067e-06, "loss": 1.3777, "step": 790 }, { "epoch": 0.20296841304072052, "grad_norm": 1.4961728532518945, "learning_rate": 9.681574477028039e-06, "loss": 1.3555, "step": 800 }, { "epoch": 0.20550551820372953, "grad_norm": 1.6165039622149184, "learning_rate": 9.66583530322611e-06, "loss": 1.3736, "step": 810 }, { "epoch": 0.20804262336673854, "grad_norm": 1.6085071421673924, "learning_rate": 9.649729903699575e-06, "loss": 1.3685, "step": 820 }, { "epoch": 0.21057972852974755, "grad_norm": 1.674024959469941, "learning_rate": 9.633259542576127e-06, "loss": 1.3516, "step": 830 }, { "epoch": 0.21311683369275655, "grad_norm": 1.5457350813422102, "learning_rate": 9.61642551262963e-06, "loss": 1.3433, "step": 840 }, { "epoch": 0.21565393885576556, "grad_norm": 1.5638603110160445, "learning_rate": 9.599229135178651e-06, "loss": 1.3596, "step": 850 }, { "epoch": 0.21819104401877457, "grad_norm": 1.5845605847496684, "learning_rate": 9.581671759982747e-06, "loss": 1.3821, "step": 860 }, { "epoch": 0.22072814918178357, "grad_norm": 1.56321358587459, "learning_rate": 9.563754765136522e-06, "loss": 1.3568, "step": 870 }, { "epoch": 0.22326525434479258, "grad_norm": 1.621089242536098, "learning_rate": 9.545479556961457e-06, "loss": 1.3614, "step": 880 }, { "epoch": 0.2258023595078016, "grad_norm": 1.5746598904044478, "learning_rate": 9.526847569895529e-06, "loss": 1.3536, "step": 890 }, { "epoch": 0.2283394646708106, "grad_norm": 1.632144120218129, "learning_rate": 9.507860266380625e-06, "loss": 1.3521, "step": 900 }, { "epoch": 0.2308765698338196, "grad_norm": 1.6660492103415234, "learning_rate": 9.488519136747741e-06, "loss": 1.3455, "step": 910 }, { "epoch": 0.2334136749968286, "grad_norm": 1.6307872469664786, "learning_rate": 9.468825699100013e-06, "loss": 1.3388, "step": 920 }, { "epoch": 0.23595078015983761, "grad_norm": 1.51760811186189, "learning_rate": 9.448781499193563e-06, "loss": 1.36, "step": 930 }, { "epoch": 0.23848788532284662, "grad_norm": 1.6298958079571104, "learning_rate": 9.428388110316165e-06, "loss": 1.346, "step": 940 }, { "epoch": 0.24102499048585563, "grad_norm": 1.6241168589647443, "learning_rate": 9.407647133163754e-06, "loss": 1.3565, "step": 950 }, { "epoch": 0.24356209564886463, "grad_norm": 1.6330870068463266, "learning_rate": 9.386560195714796e-06, "loss": 1.3539, "step": 960 }, { "epoch": 0.24609920081187364, "grad_norm": 1.5846187793083721, "learning_rate": 9.365128953102495e-06, "loss": 1.3443, "step": 970 }, { "epoch": 0.24863630597488265, "grad_norm": 1.5816319458789425, "learning_rate": 9.343355087484893e-06, "loss": 1.3449, "step": 980 }, { "epoch": 0.25117341113789166, "grad_norm": 1.608667389007063, "learning_rate": 9.321240307912818e-06, "loss": 1.3503, "step": 990 }, { "epoch": 0.2537105163009007, "grad_norm": 1.535736158897923, "learning_rate": 9.298786350195758e-06, "loss": 1.3504, "step": 1000 }, { "epoch": 0.25624762146390967, "grad_norm": 1.6105502703548435, "learning_rate": 9.275994976765602e-06, "loss": 1.3512, "step": 1010 }, { "epoch": 0.2587847266269187, "grad_norm": 1.457234439212148, "learning_rate": 9.252867976538312e-06, "loss": 1.3447, "step": 1020 }, { "epoch": 0.2613218317899277, "grad_norm": 1.632312084639862, "learning_rate": 9.22940716477351e-06, "loss": 1.3451, "step": 1030 }, { "epoch": 0.2638589369529367, "grad_norm": 1.5657163405769847, "learning_rate": 9.205614382931986e-06, "loss": 1.3678, "step": 1040 }, { "epoch": 0.2663960421159457, "grad_norm": 1.523325498659843, "learning_rate": 9.181491498531179e-06, "loss": 1.355, "step": 1050 }, { "epoch": 0.26893314727895473, "grad_norm": 1.5647021825494114, "learning_rate": 9.157040404998572e-06, "loss": 1.3455, "step": 1060 }, { "epoch": 0.2714702524419637, "grad_norm": 1.581907280598391, "learning_rate": 9.132263021523096e-06, "loss": 1.353, "step": 1070 }, { "epoch": 0.27400735760497275, "grad_norm": 1.4861566014453274, "learning_rate": 9.107161292904476e-06, "loss": 1.3428, "step": 1080 }, { "epoch": 0.2765444627679817, "grad_norm": 1.6256210181495103, "learning_rate": 9.081737189400583e-06, "loss": 1.3421, "step": 1090 }, { "epoch": 0.27908156793099076, "grad_norm": 1.4876360574590954, "learning_rate": 9.0559927065728e-06, "loss": 1.3377, "step": 1100 }, { "epoch": 0.28161867309399974, "grad_norm": 1.5204275847901962, "learning_rate": 9.029929865129375e-06, "loss": 1.349, "step": 1110 }, { "epoch": 0.2841557782570088, "grad_norm": 1.5624633516357405, "learning_rate": 9.003550710766813e-06, "loss": 1.3552, "step": 1120 }, { "epoch": 0.28669288342001775, "grad_norm": 1.5591567540085947, "learning_rate": 8.97685731400932e-06, "loss": 1.3209, "step": 1130 }, { "epoch": 0.2892299885830268, "grad_norm": 1.5373317845285133, "learning_rate": 8.949851770046272e-06, "loss": 1.3267, "step": 1140 }, { "epoch": 0.29176709374603577, "grad_norm": 1.5556061129094692, "learning_rate": 8.922536198567772e-06, "loss": 1.3379, "step": 1150 }, { "epoch": 0.2943041989090448, "grad_norm": 1.971486780664198, "learning_rate": 8.894912743598269e-06, "loss": 1.3272, "step": 1160 }, { "epoch": 0.2968413040720538, "grad_norm": 1.5365700226491938, "learning_rate": 8.866983573328267e-06, "loss": 1.333, "step": 1170 }, { "epoch": 0.2993784092350628, "grad_norm": 1.6217713070921793, "learning_rate": 8.83875087994415e-06, "loss": 1.3497, "step": 1180 }, { "epoch": 0.3019155143980718, "grad_norm": 1.4917017043884344, "learning_rate": 8.810216879456114e-06, "loss": 1.3355, "step": 1190 }, { "epoch": 0.30445261956108083, "grad_norm": 1.5427563058731948, "learning_rate": 8.781383811524222e-06, "loss": 1.3339, "step": 1200 }, { "epoch": 0.3069897247240898, "grad_norm": 1.5666645778409243, "learning_rate": 8.752253939282622e-06, "loss": 1.332, "step": 1210 }, { "epoch": 0.30952682988709884, "grad_norm": 1.5940427272465527, "learning_rate": 8.722829549161904e-06, "loss": 1.3411, "step": 1220 }, { "epoch": 0.3120639350501078, "grad_norm": 1.569355522659196, "learning_rate": 8.69311295070964e-06, "loss": 1.321, "step": 1230 }, { "epoch": 0.31460104021311686, "grad_norm": 1.5823744419831982, "learning_rate": 8.663106476409107e-06, "loss": 1.3511, "step": 1240 }, { "epoch": 0.31713814537612584, "grad_norm": 1.5626340370876246, "learning_rate": 8.632812481496195e-06, "loss": 1.3491, "step": 1250 }, { "epoch": 0.31967525053913487, "grad_norm": 1.6216546055767536, "learning_rate": 8.602233343774562e-06, "loss": 1.3294, "step": 1260 }, { "epoch": 0.32221235570214385, "grad_norm": 1.4885399811487754, "learning_rate": 8.571371463428986e-06, "loss": 1.3419, "step": 1270 }, { "epoch": 0.3247494608651529, "grad_norm": 1.597124872589071, "learning_rate": 8.540229262836974e-06, "loss": 1.3245, "step": 1280 }, { "epoch": 0.32728656602816186, "grad_norm": 1.5069638761813242, "learning_rate": 8.508809186378631e-06, "loss": 1.3357, "step": 1290 }, { "epoch": 0.3298236711911709, "grad_norm": 1.5496475251999724, "learning_rate": 8.477113700244788e-06, "loss": 1.3297, "step": 1300 }, { "epoch": 0.3323607763541799, "grad_norm": 1.5177410295586948, "learning_rate": 8.445145292243446e-06, "loss": 1.3361, "step": 1310 }, { "epoch": 0.3348978815171889, "grad_norm": 1.4375424317665, "learning_rate": 8.412906471604489e-06, "loss": 1.3365, "step": 1320 }, { "epoch": 0.3374349866801979, "grad_norm": 1.4733958562961815, "learning_rate": 8.380399768782742e-06, "loss": 1.3364, "step": 1330 }, { "epoch": 0.3399720918432069, "grad_norm": 1.5665888162471464, "learning_rate": 8.347627735259344e-06, "loss": 1.3572, "step": 1340 }, { "epoch": 0.3425091970062159, "grad_norm": 1.5175787042273947, "learning_rate": 8.314592943341494e-06, "loss": 1.311, "step": 1350 }, { "epoch": 0.34504630216922494, "grad_norm": 1.5210307965368668, "learning_rate": 8.281297985960538e-06, "loss": 1.3261, "step": 1360 }, { "epoch": 0.3475834073322339, "grad_norm": 1.5365431443148119, "learning_rate": 8.247745476468449e-06, "loss": 1.3433, "step": 1370 }, { "epoch": 0.35012051249524295, "grad_norm": 1.5548012069585933, "learning_rate": 8.213938048432697e-06, "loss": 1.3134, "step": 1380 }, { "epoch": 0.35265761765825193, "grad_norm": 1.4642811591908687, "learning_rate": 8.179878355429556e-06, "loss": 1.3159, "step": 1390 }, { "epoch": 0.35519472282126097, "grad_norm": 1.6713134353309254, "learning_rate": 8.145569070835799e-06, "loss": 1.3285, "step": 1400 }, { "epoch": 0.35773182798426995, "grad_norm": 1.5444628338197106, "learning_rate": 8.111012887618882e-06, "loss": 1.344, "step": 1410 }, { "epoch": 0.360268933147279, "grad_norm": 1.5042040298049457, "learning_rate": 8.076212518125556e-06, "loss": 1.3217, "step": 1420 }, { "epoch": 0.36280603831028796, "grad_norm": 1.5827643194628298, "learning_rate": 8.041170693868985e-06, "loss": 1.3284, "step": 1430 }, { "epoch": 0.365343143473297, "grad_norm": 1.4314485322723574, "learning_rate": 8.005890165314334e-06, "loss": 1.3188, "step": 1440 }, { "epoch": 0.367880248636306, "grad_norm": 1.5452457890288078, "learning_rate": 7.970373701662892e-06, "loss": 1.3123, "step": 1450 }, { "epoch": 0.370417353799315, "grad_norm": 1.5944938106930338, "learning_rate": 7.934624090634713e-06, "loss": 1.3131, "step": 1460 }, { "epoch": 0.372954458962324, "grad_norm": 1.5553727991379855, "learning_rate": 7.8986441382498e-06, "loss": 1.3318, "step": 1470 }, { "epoch": 0.375491564125333, "grad_norm": 1.5196578480754726, "learning_rate": 7.862436668607865e-06, "loss": 1.3164, "step": 1480 }, { "epoch": 0.378028669288342, "grad_norm": 1.5354385242535227, "learning_rate": 7.826004523666661e-06, "loss": 1.3292, "step": 1490 }, { "epoch": 0.38056577445135104, "grad_norm": 1.5449910825994637, "learning_rate": 7.78935056301891e-06, "loss": 1.3272, "step": 1500 }, { "epoch": 0.38310287961436, "grad_norm": 1.4946907973724173, "learning_rate": 7.752477663667854e-06, "loss": 1.3391, "step": 1510 }, { "epoch": 0.38563998477736905, "grad_norm": 1.5791940161814702, "learning_rate": 7.715388719801437e-06, "loss": 1.3392, "step": 1520 }, { "epoch": 0.38817708994037803, "grad_norm": 1.4567702862839176, "learning_rate": 7.67808664256514e-06, "loss": 1.2971, "step": 1530 }, { "epoch": 0.39071419510338706, "grad_norm": 1.4605769814867744, "learning_rate": 7.640574359833472e-06, "loss": 1.3148, "step": 1540 }, { "epoch": 0.39325130026639604, "grad_norm": 1.5566796816874888, "learning_rate": 7.6028548159801685e-06, "loss": 1.3315, "step": 1550 }, { "epoch": 0.395788405429405, "grad_norm": 1.5768032029757384, "learning_rate": 7.564930971647087e-06, "loss": 1.3238, "step": 1560 }, { "epoch": 0.39832551059241406, "grad_norm": 1.5702550171255043, "learning_rate": 7.52680580351181e-06, "loss": 1.3175, "step": 1570 }, { "epoch": 0.40086261575542304, "grad_norm": 1.5687466837527182, "learning_rate": 7.488482304054019e-06, "loss": 1.3104, "step": 1580 }, { "epoch": 0.40339972091843207, "grad_norm": 1.5897550883645912, "learning_rate": 7.449963481320599e-06, "loss": 1.316, "step": 1590 }, { "epoch": 0.40593682608144105, "grad_norm": 1.5236147067965886, "learning_rate": 7.411252358689541e-06, "loss": 1.3273, "step": 1600 }, { "epoch": 0.4084739312444501, "grad_norm": 1.5469446528938424, "learning_rate": 7.372351974632634e-06, "loss": 1.3119, "step": 1610 }, { "epoch": 0.41101103640745906, "grad_norm": 1.4722026799112722, "learning_rate": 7.333265382476971e-06, "loss": 1.3151, "step": 1620 }, { "epoch": 0.4135481415704681, "grad_norm": 1.5178886141824586, "learning_rate": 7.293995650165287e-06, "loss": 1.3245, "step": 1630 }, { "epoch": 0.4160852467334771, "grad_norm": 1.5308435376939995, "learning_rate": 7.2545458600151615e-06, "loss": 1.3317, "step": 1640 }, { "epoch": 0.4186223518964861, "grad_norm": 1.5091424984828243, "learning_rate": 7.214919108477077e-06, "loss": 1.3044, "step": 1650 }, { "epoch": 0.4211594570594951, "grad_norm": 1.457202507709852, "learning_rate": 7.175118505891385e-06, "loss": 1.3339, "step": 1660 }, { "epoch": 0.4236965622225041, "grad_norm": 1.530896247556501, "learning_rate": 7.135147176244158e-06, "loss": 1.3044, "step": 1670 }, { "epoch": 0.4262336673855131, "grad_norm": 1.5274463812149695, "learning_rate": 7.0950082569219955e-06, "loss": 1.3048, "step": 1680 }, { "epoch": 0.42877077254852214, "grad_norm": 1.507428973101804, "learning_rate": 7.054704898465772e-06, "loss": 1.3069, "step": 1690 }, { "epoch": 0.4313078777115311, "grad_norm": 1.5716469315983397, "learning_rate": 7.0142402643233346e-06, "loss": 1.3136, "step": 1700 }, { "epoch": 0.43384498287454015, "grad_norm": 1.4220881687524514, "learning_rate": 6.973617530601209e-06, "loss": 1.3165, "step": 1710 }, { "epoch": 0.43638208803754913, "grad_norm": 1.5926945403384438, "learning_rate": 6.932839885815304e-06, "loss": 1.3301, "step": 1720 }, { "epoch": 0.43891919320055817, "grad_norm": 1.4527595611730801, "learning_rate": 6.891910530640642e-06, "loss": 1.3145, "step": 1730 }, { "epoch": 0.44145629836356715, "grad_norm": 1.5069254389998272, "learning_rate": 6.850832677660134e-06, "loss": 1.3139, "step": 1740 }, { "epoch": 0.4439934035265762, "grad_norm": 1.4587280578384394, "learning_rate": 6.809609551112419e-06, "loss": 1.3085, "step": 1750 }, { "epoch": 0.44653050868958516, "grad_norm": 1.5122830472595903, "learning_rate": 6.768244386638793e-06, "loss": 1.3158, "step": 1760 }, { "epoch": 0.4490676138525942, "grad_norm": 1.4912245201929943, "learning_rate": 6.726740431029243e-06, "loss": 1.3167, "step": 1770 }, { "epoch": 0.4516047190156032, "grad_norm": 1.5574941259720791, "learning_rate": 6.685100941967596e-06, "loss": 1.3118, "step": 1780 }, { "epoch": 0.4541418241786122, "grad_norm": 1.4994130740882026, "learning_rate": 6.643329187775827e-06, "loss": 1.307, "step": 1790 }, { "epoch": 0.4566789293416212, "grad_norm": 1.5791237950971593, "learning_rate": 6.601428447157525e-06, "loss": 1.3086, "step": 1800 }, { "epoch": 0.4592160345046302, "grad_norm": 1.5319564794342408, "learning_rate": 6.559402008940539e-06, "loss": 1.3025, "step": 1810 }, { "epoch": 0.4617531396676392, "grad_norm": 1.5560620624811086, "learning_rate": 6.517253171818844e-06, "loss": 1.3146, "step": 1820 }, { "epoch": 0.46429024483064824, "grad_norm": 1.5762189341956727, "learning_rate": 6.474985244093613e-06, "loss": 1.307, "step": 1830 }, { "epoch": 0.4668273499936572, "grad_norm": 1.568824162809672, "learning_rate": 6.432601543413552e-06, "loss": 1.2996, "step": 1840 }, { "epoch": 0.46936445515666625, "grad_norm": 1.461712822890638, "learning_rate": 6.390105396514497e-06, "loss": 1.3013, "step": 1850 }, { "epoch": 0.47190156031967523, "grad_norm": 1.4727912142252813, "learning_rate": 6.347500138958285e-06, "loss": 1.3086, "step": 1860 }, { "epoch": 0.47443866548268426, "grad_norm": 1.4842630358439066, "learning_rate": 6.304789114870953e-06, "loss": 1.3121, "step": 1870 }, { "epoch": 0.47697577064569324, "grad_norm": 1.5147058669468259, "learning_rate": 6.261975676680252e-06, "loss": 1.3109, "step": 1880 }, { "epoch": 0.4795128758087023, "grad_norm": 1.5879467208142688, "learning_rate": 6.219063184852509e-06, "loss": 1.3057, "step": 1890 }, { "epoch": 0.48204998097171126, "grad_norm": 1.4622817504218393, "learning_rate": 6.176055007628859e-06, "loss": 1.2978, "step": 1900 }, { "epoch": 0.4845870861347203, "grad_norm": 1.4651555100721898, "learning_rate": 6.132954520760882e-06, "loss": 1.2936, "step": 1910 }, { "epoch": 0.48712419129772927, "grad_norm": 1.4242680820832143, "learning_rate": 6.089765107245616e-06, "loss": 1.311, "step": 1920 }, { "epoch": 0.4896612964607383, "grad_norm": 1.4510357489546541, "learning_rate": 6.046490157060041e-06, "loss": 1.2917, "step": 1930 }, { "epoch": 0.4921984016237473, "grad_norm": 1.5389362630585735, "learning_rate": 6.003133066894987e-06, "loss": 1.3173, "step": 1940 }, { "epoch": 0.4947355067867563, "grad_norm": 1.5597918071325416, "learning_rate": 5.959697239888525e-06, "loss": 1.2978, "step": 1950 }, { "epoch": 0.4972726119497653, "grad_norm": 1.481163850939429, "learning_rate": 5.916186085358858e-06, "loss": 1.3125, "step": 1960 }, { "epoch": 0.49980971711277433, "grad_norm": 1.5172196100773179, "learning_rate": 5.872603018536713e-06, "loss": 1.3035, "step": 1970 }, { "epoch": 0.5023468222757833, "grad_norm": 1.520182324070576, "learning_rate": 5.828951460297277e-06, "loss": 1.2943, "step": 1980 }, { "epoch": 0.5048839274387923, "grad_norm": 1.374020881318329, "learning_rate": 5.785234836891697e-06, "loss": 1.3019, "step": 1990 }, { "epoch": 0.5074210326018014, "grad_norm": 1.609172422257604, "learning_rate": 5.741456579678141e-06, "loss": 1.2929, "step": 2000 }, { "epoch": 0.5099581377648104, "grad_norm": 1.451921659432821, "learning_rate": 5.697620124852472e-06, "loss": 1.2868, "step": 2010 }, { "epoch": 0.5124952429278193, "grad_norm": 1.531522896512812, "learning_rate": 5.65372891317854e-06, "loss": 1.2875, "step": 2020 }, { "epoch": 0.5150323480908283, "grad_norm": 1.443649652350418, "learning_rate": 5.6097863897181075e-06, "loss": 1.2963, "step": 2030 }, { "epoch": 0.5175694532538374, "grad_norm": 1.5591743411035264, "learning_rate": 5.565796003560447e-06, "loss": 1.3121, "step": 2040 }, { "epoch": 0.5201065584168464, "grad_norm": 1.428229068798765, "learning_rate": 5.521761207551622e-06, "loss": 1.2979, "step": 2050 }, { "epoch": 0.5226436635798554, "grad_norm": 1.5164415865949983, "learning_rate": 5.47768545802346e-06, "loss": 1.3107, "step": 2060 }, { "epoch": 0.5251807687428643, "grad_norm": 1.5292361648846982, "learning_rate": 5.433572214522275e-06, "loss": 1.2952, "step": 2070 }, { "epoch": 0.5277178739058734, "grad_norm": 1.4451039662214231, "learning_rate": 5.389424939537311e-06, "loss": 1.2922, "step": 2080 }, { "epoch": 0.5302549790688824, "grad_norm": 1.558654012548035, "learning_rate": 5.345247098228977e-06, "loss": 1.2942, "step": 2090 }, { "epoch": 0.5327920842318914, "grad_norm": 1.5393309134302235, "learning_rate": 5.301042158156866e-06, "loss": 1.2898, "step": 2100 }, { "epoch": 0.5353291893949004, "grad_norm": 1.5206662969722375, "learning_rate": 5.256813589007571e-06, "loss": 1.2967, "step": 2110 }, { "epoch": 0.5378662945579095, "grad_norm": 1.5295277898061372, "learning_rate": 5.212564862322355e-06, "loss": 1.2987, "step": 2120 }, { "epoch": 0.5404033997209184, "grad_norm": 1.5121887795702076, "learning_rate": 5.168299451224665e-06, "loss": 1.2859, "step": 2130 }, { "epoch": 0.5429405048839274, "grad_norm": 1.5405224763949017, "learning_rate": 5.124020830147525e-06, "loss": 1.2942, "step": 2140 }, { "epoch": 0.5454776100469364, "grad_norm": 1.5241647102261355, "learning_rate": 5.079732474560821e-06, "loss": 1.2967, "step": 2150 }, { "epoch": 0.5480147152099455, "grad_norm": 1.5740459163455902, "learning_rate": 5.035437860698508e-06, "loss": 1.2792, "step": 2160 }, { "epoch": 0.5505518203729545, "grad_norm": 1.455514390960437, "learning_rate": 4.991140465285762e-06, "loss": 1.2722, "step": 2170 }, { "epoch": 0.5530889255359634, "grad_norm": 1.4543563727275153, "learning_rate": 4.94684376526608e-06, "loss": 1.294, "step": 2180 }, { "epoch": 0.5556260306989724, "grad_norm": 1.415880887469612, "learning_rate": 4.902551237528387e-06, "loss": 1.2898, "step": 2190 }, { "epoch": 0.5581631358619815, "grad_norm": 1.5027054686198038, "learning_rate": 4.858266358634109e-06, "loss": 1.2943, "step": 2200 }, { "epoch": 0.5607002410249905, "grad_norm": 1.495415983271707, "learning_rate": 4.813992604544319e-06, "loss": 1.309, "step": 2210 }, { "epoch": 0.5632373461879995, "grad_norm": 1.5256997169566149, "learning_rate": 4.769733450346885e-06, "loss": 1.2941, "step": 2220 }, { "epoch": 0.5657744513510085, "grad_norm": 1.4207029137255274, "learning_rate": 4.725492369983721e-06, "loss": 1.2808, "step": 2230 }, { "epoch": 0.5683115565140175, "grad_norm": 1.5127789303300487, "learning_rate": 4.6812728359781064e-06, "loss": 1.2886, "step": 2240 }, { "epoch": 0.5708486616770265, "grad_norm": 1.4480660719145084, "learning_rate": 4.637078319162127e-06, "loss": 1.2848, "step": 2250 }, { "epoch": 0.5733857668400355, "grad_norm": 1.4818074524822986, "learning_rate": 4.592912288404251e-06, "loss": 1.2747, "step": 2260 }, { "epoch": 0.5759228720030445, "grad_norm": 1.496021447098999, "learning_rate": 4.5487782103370445e-06, "loss": 1.2889, "step": 2270 }, { "epoch": 0.5784599771660536, "grad_norm": 1.4726400774082267, "learning_rate": 4.504679549085077e-06, "loss": 1.2956, "step": 2280 }, { "epoch": 0.5809970823290626, "grad_norm": 1.492109044123467, "learning_rate": 4.460619765993025e-06, "loss": 1.2974, "step": 2290 }, { "epoch": 0.5835341874920715, "grad_norm": 1.4567515467141523, "learning_rate": 4.416602319353974e-06, "loss": 1.29, "step": 2300 }, { "epoch": 0.5860712926550805, "grad_norm": 1.460535915347314, "learning_rate": 4.3726306641379915e-06, "loss": 1.2745, "step": 2310 }, { "epoch": 0.5886083978180896, "grad_norm": 1.4651576736560898, "learning_rate": 4.328708251720924e-06, "loss": 1.2739, "step": 2320 }, { "epoch": 0.5911455029810986, "grad_norm": 1.6196158147206026, "learning_rate": 4.2848385296135165e-06, "loss": 1.3101, "step": 2330 }, { "epoch": 0.5936826081441076, "grad_norm": 1.527439804056797, "learning_rate": 4.241024941190792e-06, "loss": 1.2771, "step": 2340 }, { "epoch": 0.5962197133071165, "grad_norm": 1.4872645401772542, "learning_rate": 4.197270925421796e-06, "loss": 1.2877, "step": 2350 }, { "epoch": 0.5987568184701256, "grad_norm": 1.4908027336325684, "learning_rate": 4.153579916599659e-06, "loss": 1.2969, "step": 2360 }, { "epoch": 0.6012939236331346, "grad_norm": 1.370441167203172, "learning_rate": 4.109955344072036e-06, "loss": 1.2745, "step": 2370 }, { "epoch": 0.6038310287961436, "grad_norm": 1.457801692594122, "learning_rate": 4.066400631971938e-06, "loss": 1.2714, "step": 2380 }, { "epoch": 0.6063681339591526, "grad_norm": 1.5047248748403204, "learning_rate": 4.022919198948966e-06, "loss": 1.2759, "step": 2390 }, { "epoch": 0.6089052391221617, "grad_norm": 1.5232259549425642, "learning_rate": 3.979514457900982e-06, "loss": 1.2845, "step": 2400 }, { "epoch": 0.6114423442851706, "grad_norm": 1.4170452963382303, "learning_rate": 3.936189815706219e-06, "loss": 1.2833, "step": 2410 }, { "epoch": 0.6139794494481796, "grad_norm": 1.5010818180720833, "learning_rate": 3.8929486729558775e-06, "loss": 1.2941, "step": 2420 }, { "epoch": 0.6165165546111886, "grad_norm": 1.4420347497785075, "learning_rate": 3.849794423687212e-06, "loss": 1.2775, "step": 2430 }, { "epoch": 0.6190536597741977, "grad_norm": 1.520468191298721, "learning_rate": 3.8067304551171247e-06, "loss": 1.2627, "step": 2440 }, { "epoch": 0.6215907649372067, "grad_norm": 1.4753704862458017, "learning_rate": 3.7637601473763035e-06, "loss": 1.284, "step": 2450 }, { "epoch": 0.6241278701002156, "grad_norm": 1.469877746697786, "learning_rate": 3.7208868732439145e-06, "loss": 1.2927, "step": 2460 }, { "epoch": 0.6266649752632246, "grad_norm": 1.4601548141707599, "learning_rate": 3.6781139978828606e-06, "loss": 1.2947, "step": 2470 }, { "epoch": 0.6292020804262337, "grad_norm": 1.5092438879342172, "learning_rate": 3.6354448785756558e-06, "loss": 1.2843, "step": 2480 }, { "epoch": 0.6317391855892427, "grad_norm": 1.4368007055488876, "learning_rate": 3.592882864460905e-06, "loss": 1.265, "step": 2490 }, { "epoch": 0.6342762907522517, "grad_norm": 1.4672055312297339, "learning_rate": 3.5504312962704245e-06, "loss": 1.2709, "step": 2500 }, { "epoch": 0.6368133959152606, "grad_norm": 1.4995451462382032, "learning_rate": 3.5080935060670345e-06, "loss": 1.2679, "step": 2510 }, { "epoch": 0.6393505010782697, "grad_norm": 1.458116276283539, "learning_rate": 3.465872816983008e-06, "loss": 1.2821, "step": 2520 }, { "epoch": 0.6418876062412787, "grad_norm": 1.4447640379158275, "learning_rate": 3.4237725429592507e-06, "loss": 1.2865, "step": 2530 }, { "epoch": 0.6444247114042877, "grad_norm": 1.3965736731366891, "learning_rate": 3.3817959884851735e-06, "loss": 1.2698, "step": 2540 }, { "epoch": 0.6469618165672967, "grad_norm": 1.4648194884238146, "learning_rate": 3.3399464483393272e-06, "loss": 1.291, "step": 2550 }, { "epoch": 0.6494989217303058, "grad_norm": 1.4271493727093771, "learning_rate": 3.298227207330792e-06, "loss": 1.2765, "step": 2560 }, { "epoch": 0.6520360268933147, "grad_norm": 1.5962462881292958, "learning_rate": 3.256641540041346e-06, "loss": 1.2905, "step": 2570 }, { "epoch": 0.6545731320563237, "grad_norm": 1.4501719681830862, "learning_rate": 3.2151927105684423e-06, "loss": 1.298, "step": 2580 }, { "epoch": 0.6571102372193327, "grad_norm": 1.5186349976521718, "learning_rate": 3.1738839722690085e-06, "loss": 1.2742, "step": 2590 }, { "epoch": 0.6596473423823418, "grad_norm": 1.3901740398219145, "learning_rate": 3.1327185675040907e-06, "loss": 1.2769, "step": 2600 }, { "epoch": 0.6621844475453508, "grad_norm": 1.4618375024699428, "learning_rate": 3.0916997273843454e-06, "loss": 1.2938, "step": 2610 }, { "epoch": 0.6647215527083598, "grad_norm": 1.4675982361039484, "learning_rate": 3.0508306715164416e-06, "loss": 1.2913, "step": 2620 }, { "epoch": 0.6672586578713687, "grad_norm": 1.5086185778550512, "learning_rate": 3.0101146077503386e-06, "loss": 1.2777, "step": 2630 }, { "epoch": 0.6697957630343778, "grad_norm": 1.4573487737483761, "learning_rate": 2.9695547319275093e-06, "loss": 1.2633, "step": 2640 }, { "epoch": 0.6723328681973868, "grad_norm": 1.43323809832072, "learning_rate": 2.9291542276300866e-06, "loss": 1.289, "step": 2650 }, { "epoch": 0.6748699733603958, "grad_norm": 1.420082813628849, "learning_rate": 2.8889162659309832e-06, "loss": 1.2729, "step": 2660 }, { "epoch": 0.6774070785234048, "grad_norm": 1.424990219399345, "learning_rate": 2.848844005145004e-06, "loss": 1.3024, "step": 2670 }, { "epoch": 0.6799441836864138, "grad_norm": 1.4395745448115305, "learning_rate": 2.808940590580922e-06, "loss": 1.2845, "step": 2680 }, { "epoch": 0.6824812888494228, "grad_norm": 1.4802086998925903, "learning_rate": 2.769209154294623e-06, "loss": 1.2844, "step": 2690 }, { "epoch": 0.6850183940124318, "grad_norm": 1.491623196795251, "learning_rate": 2.7296528148432565e-06, "loss": 1.2683, "step": 2700 }, { "epoch": 0.6875554991754408, "grad_norm": 1.416764375906272, "learning_rate": 2.690274677040462e-06, "loss": 1.2776, "step": 2710 }, { "epoch": 0.6900926043384499, "grad_norm": 1.519033593874162, "learning_rate": 2.6510778317126597e-06, "loss": 1.2807, "step": 2720 }, { "epoch": 0.6926297095014589, "grad_norm": 1.3894691132515595, "learning_rate": 2.6120653554564624e-06, "loss": 1.2777, "step": 2730 }, { "epoch": 0.6951668146644678, "grad_norm": 1.4049713206074572, "learning_rate": 2.573240310397187e-06, "loss": 1.2736, "step": 2740 }, { "epoch": 0.6977039198274768, "grad_norm": 1.4357642101900112, "learning_rate": 2.5346057439484923e-06, "loss": 1.2803, "step": 2750 }, { "epoch": 0.7002410249904859, "grad_norm": 1.490167340198777, "learning_rate": 2.4961646885732034e-06, "loss": 1.2744, "step": 2760 }, { "epoch": 0.7027781301534949, "grad_norm": 1.4179312953545702, "learning_rate": 2.4579201615452812e-06, "loss": 1.2842, "step": 2770 }, { "epoch": 0.7053152353165039, "grad_norm": 1.6140649717523825, "learning_rate": 2.4198751647129896e-06, "loss": 1.2963, "step": 2780 }, { "epoch": 0.7078523404795128, "grad_norm": 1.530468810042779, "learning_rate": 2.3820326842632894e-06, "loss": 1.2637, "step": 2790 }, { "epoch": 0.7103894456425219, "grad_norm": 1.412588711043796, "learning_rate": 2.344395690487441e-06, "loss": 1.2856, "step": 2800 }, { "epoch": 0.7129265508055309, "grad_norm": 1.5447254908338892, "learning_rate": 2.3069671375478645e-06, "loss": 1.2848, "step": 2810 }, { "epoch": 0.7154636559685399, "grad_norm": 1.43691808431636, "learning_rate": 2.2697499632462695e-06, "loss": 1.2536, "step": 2820 }, { "epoch": 0.7180007611315489, "grad_norm": 1.5560428170621574, "learning_rate": 2.2327470887930595e-06, "loss": 1.3015, "step": 2830 }, { "epoch": 0.720537866294558, "grad_norm": 1.450374747082515, "learning_rate": 2.195961418578041e-06, "loss": 1.2744, "step": 2840 }, { "epoch": 0.7230749714575669, "grad_norm": 1.484538648746269, "learning_rate": 2.159395839942464e-06, "loss": 1.2664, "step": 2850 }, { "epoch": 0.7256120766205759, "grad_norm": 1.3953379506558543, "learning_rate": 2.1230532229523865e-06, "loss": 1.2489, "step": 2860 }, { "epoch": 0.7281491817835849, "grad_norm": 1.4415654155573785, "learning_rate": 2.086936420173399e-06, "loss": 1.2719, "step": 2870 }, { "epoch": 0.730686286946594, "grad_norm": 1.4271516629005172, "learning_rate": 2.051048266446727e-06, "loss": 1.2652, "step": 2880 }, { "epoch": 0.733223392109603, "grad_norm": 1.4951992832082914, "learning_rate": 2.0153915786667203e-06, "loss": 1.26, "step": 2890 }, { "epoch": 0.735760497272612, "grad_norm": 1.4351479751585414, "learning_rate": 1.9799691555597555e-06, "loss": 1.2881, "step": 2900 }, { "epoch": 0.7382976024356209, "grad_norm": 1.474899241565124, "learning_rate": 1.9447837774645513e-06, "loss": 1.2702, "step": 2910 }, { "epoch": 0.74083470759863, "grad_norm": 1.4426835070499822, "learning_rate": 1.9098382061139503e-06, "loss": 1.2699, "step": 2920 }, { "epoch": 0.743371812761639, "grad_norm": 1.4876818985570295, "learning_rate": 1.8751351844181414e-06, "loss": 1.2612, "step": 2930 }, { "epoch": 0.745908917924648, "grad_norm": 1.4360645410392319, "learning_rate": 1.8406774362493662e-06, "loss": 1.2754, "step": 2940 }, { "epoch": 0.748446023087657, "grad_norm": 1.4473888665732064, "learning_rate": 1.8064676662281206e-06, "loss": 1.2902, "step": 2950 }, { "epoch": 0.750983128250666, "grad_norm": 1.4434612838312966, "learning_rate": 1.7725085595108682e-06, "loss": 1.273, "step": 2960 }, { "epoch": 0.753520233413675, "grad_norm": 1.558136105535075, "learning_rate": 1.7388027815792725e-06, "loss": 1.2787, "step": 2970 }, { "epoch": 0.756057338576684, "grad_norm": 1.4724878594646564, "learning_rate": 1.705352978030993e-06, "loss": 1.2627, "step": 2980 }, { "epoch": 0.758594443739693, "grad_norm": 1.4768497018650097, "learning_rate": 1.672161774372022e-06, "loss": 1.2911, "step": 2990 }, { "epoch": 0.7611315489027021, "grad_norm": 1.4598692131173956, "learning_rate": 1.639231775810602e-06, "loss": 1.2907, "step": 3000 }, { "epoch": 0.763668654065711, "grad_norm": 1.3971487709781405, "learning_rate": 1.6065655670527546e-06, "loss": 1.2632, "step": 3010 }, { "epoch": 0.76620575922872, "grad_norm": 1.4196228285690422, "learning_rate": 1.574165712099392e-06, "loss": 1.2542, "step": 3020 }, { "epoch": 0.768742864391729, "grad_norm": 1.4395590200787511, "learning_rate": 1.542034754045067e-06, "loss": 1.2693, "step": 3030 }, { "epoch": 0.7712799695547381, "grad_norm": 1.4538143237649903, "learning_rate": 1.5101752148783705e-06, "loss": 1.2728, "step": 3040 }, { "epoch": 0.7738170747177471, "grad_norm": 1.4483981816763403, "learning_rate": 1.4785895952839735e-06, "loss": 1.2671, "step": 3050 }, { "epoch": 0.7763541798807561, "grad_norm": 1.5335192207213328, "learning_rate": 1.447280374446346e-06, "loss": 1.2778, "step": 3060 }, { "epoch": 0.778891285043765, "grad_norm": 1.4504666284348766, "learning_rate": 1.4162500098551608e-06, "loss": 1.276, "step": 3070 }, { "epoch": 0.7814283902067741, "grad_norm": 1.454412830474016, "learning_rate": 1.385500937112415e-06, "loss": 1.2804, "step": 3080 }, { "epoch": 0.7839654953697831, "grad_norm": 1.462536001446098, "learning_rate": 1.3550355697412386e-06, "loss": 1.2586, "step": 3090 }, { "epoch": 0.7865026005327921, "grad_norm": 1.4861860594882876, "learning_rate": 1.3248562989964719e-06, "loss": 1.2843, "step": 3100 }, { "epoch": 0.7890397056958011, "grad_norm": 1.391241218546658, "learning_rate": 1.2949654936769622e-06, "loss": 1.2723, "step": 3110 }, { "epoch": 0.79157681085881, "grad_norm": 1.412674356321388, "learning_rate": 1.2653654999396436e-06, "loss": 1.2621, "step": 3120 }, { "epoch": 0.7941139160218191, "grad_norm": 1.406341007739084, "learning_rate": 1.2360586411153747e-06, "loss": 1.2897, "step": 3130 }, { "epoch": 0.7966510211848281, "grad_norm": 1.4125498017483746, "learning_rate": 1.2070472175265857e-06, "loss": 1.2657, "step": 3140 }, { "epoch": 0.7991881263478371, "grad_norm": 1.575395352386111, "learning_rate": 1.1783335063067286e-06, "loss": 1.2974, "step": 3150 }, { "epoch": 0.8017252315108461, "grad_norm": 1.4632409776646316, "learning_rate": 1.1499197612215269e-06, "loss": 1.2914, "step": 3160 }, { "epoch": 0.8042623366738552, "grad_norm": 1.420307782085356, "learning_rate": 1.1218082124920903e-06, "loss": 1.2583, "step": 3170 }, { "epoch": 0.8067994418368641, "grad_norm": 1.4073518626370982, "learning_rate": 1.0940010666198575e-06, "loss": 1.2588, "step": 3180 }, { "epoch": 0.8093365469998731, "grad_norm": 1.427712685491864, "learning_rate": 1.0665005062134015e-06, "loss": 1.2641, "step": 3190 }, { "epoch": 0.8118736521628821, "grad_norm": 1.4042233353051128, "learning_rate": 1.0393086898171234e-06, "loss": 1.2623, "step": 3200 }, { "epoch": 0.8144107573258912, "grad_norm": 1.4353551061219325, "learning_rate": 1.0124277517418196e-06, "loss": 1.2701, "step": 3210 }, { "epoch": 0.8169478624889002, "grad_norm": 1.4714738106408498, "learning_rate": 9.858598018971599e-07, "loss": 1.2665, "step": 3220 }, { "epoch": 0.8194849676519091, "grad_norm": 1.3867710691517015, "learning_rate": 9.596069256260792e-07, "loss": 1.2811, "step": 3230 }, { "epoch": 0.8220220728149181, "grad_norm": 1.5556697110120234, "learning_rate": 9.336711835410972e-07, "loss": 1.2577, "step": 3240 }, { "epoch": 0.8245591779779272, "grad_norm": 1.4677232808017586, "learning_rate": 9.080546113625738e-07, "loss": 1.2675, "step": 3250 }, { "epoch": 0.8270962831409362, "grad_norm": 1.4009076826572764, "learning_rate": 8.827592197589341e-07, "loss": 1.2573, "step": 3260 }, { "epoch": 0.8296333883039452, "grad_norm": 1.3584163410682717, "learning_rate": 8.577869941888389e-07, "loss": 1.2654, "step": 3270 }, { "epoch": 0.8321704934669542, "grad_norm": 1.4763004523041792, "learning_rate": 8.331398947453512e-07, "loss": 1.271, "step": 3280 }, { "epoch": 0.8347075986299632, "grad_norm": 1.432107775038367, "learning_rate": 8.08819856002081e-07, "loss": 1.2771, "step": 3290 }, { "epoch": 0.8372447037929722, "grad_norm": 1.4636493219573536, "learning_rate": 7.848287868613441e-07, "loss": 1.2511, "step": 3300 }, { "epoch": 0.8397818089559812, "grad_norm": 1.501456048501624, "learning_rate": 7.611685704043281e-07, "loss": 1.2724, "step": 3310 }, { "epoch": 0.8423189141189902, "grad_norm": 1.4533194309629769, "learning_rate": 7.378410637432848e-07, "loss": 1.2761, "step": 3320 }, { "epoch": 0.8448560192819993, "grad_norm": 1.4930040288043631, "learning_rate": 7.148480978757694e-07, "loss": 1.2808, "step": 3330 }, { "epoch": 0.8473931244450083, "grad_norm": 1.4327631888495864, "learning_rate": 6.921914775409211e-07, "loss": 1.2764, "step": 3340 }, { "epoch": 0.8499302296080172, "grad_norm": 1.4066505744498654, "learning_rate": 6.698729810778065e-07, "loss": 1.2724, "step": 3350 }, { "epoch": 0.8524673347710262, "grad_norm": 1.4214214237910756, "learning_rate": 6.478943602858373e-07, "loss": 1.2703, "step": 3360 }, { "epoch": 0.8550044399340353, "grad_norm": 1.4609797404161982, "learning_rate": 6.262573402872707e-07, "loss": 1.2702, "step": 3370 }, { "epoch": 0.8575415450970443, "grad_norm": 1.4426076897314533, "learning_rate": 6.04963619391799e-07, "loss": 1.2652, "step": 3380 }, { "epoch": 0.8600786502600533, "grad_norm": 1.4557782632700174, "learning_rate": 5.840148689632536e-07, "loss": 1.2628, "step": 3390 }, { "epoch": 0.8626157554230622, "grad_norm": 1.4465949605683495, "learning_rate": 5.634127332884143e-07, "loss": 1.2649, "step": 3400 }, { "epoch": 0.8651528605860713, "grad_norm": 1.4543385042228827, "learning_rate": 5.431588294479479e-07, "loss": 1.2863, "step": 3410 }, { "epoch": 0.8676899657490803, "grad_norm": 1.4211990179028964, "learning_rate": 5.232547471894839e-07, "loss": 1.2603, "step": 3420 }, { "epoch": 0.8702270709120893, "grad_norm": 1.4970252757505178, "learning_rate": 5.037020488028322e-07, "loss": 1.2659, "step": 3430 }, { "epoch": 0.8727641760750983, "grad_norm": 1.476846612856639, "learning_rate": 4.845022689973567e-07, "loss": 1.2622, "step": 3440 }, { "epoch": 0.8753012812381074, "grad_norm": 1.3975933791175674, "learning_rate": 4.656569147815171e-07, "loss": 1.2675, "step": 3450 }, { "epoch": 0.8778383864011163, "grad_norm": 1.4066115466115592, "learning_rate": 4.471674653445801e-07, "loss": 1.2657, "step": 3460 }, { "epoch": 0.8803754915641253, "grad_norm": 1.4093052515025426, "learning_rate": 4.290353719405199e-07, "loss": 1.2622, "step": 3470 }, { "epoch": 0.8829125967271343, "grad_norm": 1.517434872609148, "learning_rate": 4.1126205777410054e-07, "loss": 1.2658, "step": 3480 }, { "epoch": 0.8854497018901434, "grad_norm": 1.3671982309013966, "learning_rate": 3.938489178891769e-07, "loss": 1.26, "step": 3490 }, { "epoch": 0.8879868070531524, "grad_norm": 1.3848746120915914, "learning_rate": 3.767973190591906e-07, "loss": 1.252, "step": 3500 }, { "epoch": 0.8905239122161613, "grad_norm": 1.404335518132526, "learning_rate": 3.6010859967988975e-07, "loss": 1.2684, "step": 3510 }, { "epoch": 0.8930610173791703, "grad_norm": 1.4696908362694405, "learning_rate": 3.437840696642797e-07, "loss": 1.28, "step": 3520 }, { "epoch": 0.8955981225421794, "grad_norm": 1.4394982066957633, "learning_rate": 3.2782501033980897e-07, "loss": 1.2596, "step": 3530 }, { "epoch": 0.8981352277051884, "grad_norm": 1.441827806292722, "learning_rate": 3.1223267434778934e-07, "loss": 1.2548, "step": 3540 }, { "epoch": 0.9006723328681974, "grad_norm": 1.4029572337771223, "learning_rate": 2.9700828554508175e-07, "loss": 1.2714, "step": 3550 }, { "epoch": 0.9032094380312063, "grad_norm": 1.456563644801128, "learning_rate": 2.82153038908034e-07, "loss": 1.271, "step": 3560 }, { "epoch": 0.9057465431942154, "grad_norm": 1.4854658320433165, "learning_rate": 2.6766810043867996e-07, "loss": 1.2636, "step": 3570 }, { "epoch": 0.9082836483572244, "grad_norm": 1.3976434876269141, "learning_rate": 2.53554607073227e-07, "loss": 1.2555, "step": 3580 }, { "epoch": 0.9108207535202334, "grad_norm": 1.4520566750739115, "learning_rate": 2.3981366659281135e-07, "loss": 1.2741, "step": 3590 }, { "epoch": 0.9133578586832424, "grad_norm": 1.431635462724296, "learning_rate": 2.2644635753654832e-07, "loss": 1.2641, "step": 3600 }, { "epoch": 0.9158949638462515, "grad_norm": 1.4459695881350807, "learning_rate": 2.1345372911687868e-07, "loss": 1.2719, "step": 3610 }, { "epoch": 0.9184320690092604, "grad_norm": 1.4523937909973577, "learning_rate": 2.008368011372136e-07, "loss": 1.2574, "step": 3620 }, { "epoch": 0.9209691741722694, "grad_norm": 1.448000545062192, "learning_rate": 1.8859656391188918e-07, "loss": 1.2678, "step": 3630 }, { "epoch": 0.9235062793352784, "grad_norm": 1.4574912967654818, "learning_rate": 1.7673397818843696e-07, "loss": 1.2631, "step": 3640 }, { "epoch": 0.9260433844982875, "grad_norm": 1.469570266612804, "learning_rate": 1.65249975072172e-07, "loss": 1.2676, "step": 3650 }, { "epoch": 0.9285804896612965, "grad_norm": 1.4705072480655184, "learning_rate": 1.5414545595311193e-07, "loss": 1.2363, "step": 3660 }, { "epoch": 0.9311175948243055, "grad_norm": 1.469943074464241, "learning_rate": 1.4342129243522241e-07, "loss": 1.2716, "step": 3670 }, { "epoch": 0.9336546999873144, "grad_norm": 1.4039691988667693, "learning_rate": 1.3307832626800966e-07, "loss": 1.2674, "step": 3680 }, { "epoch": 0.9361918051503235, "grad_norm": 1.4357184274036978, "learning_rate": 1.2311736928044437e-07, "loss": 1.2662, "step": 3690 }, { "epoch": 0.9387289103133325, "grad_norm": 1.4198942808558601, "learning_rate": 1.1353920331724666e-07, "loss": 1.2743, "step": 3700 }, { "epoch": 0.9412660154763415, "grad_norm": 1.472183542066415, "learning_rate": 1.0434458017751392e-07, "loss": 1.2505, "step": 3710 }, { "epoch": 0.9438031206393505, "grad_norm": 1.4530560276575668, "learning_rate": 9.553422155571257e-08, "loss": 1.2637, "step": 3720 }, { "epoch": 0.9463402258023595, "grad_norm": 1.4342800198901315, "learning_rate": 8.710881898503276e-08, "loss": 1.2706, "step": 3730 }, { "epoch": 0.9488773309653685, "grad_norm": 1.4769111711160674, "learning_rate": 7.906903378310738e-08, "loss": 1.2717, "step": 3740 }, { "epoch": 0.9514144361283775, "grad_norm": 1.4434198544006103, "learning_rate": 7.141549700010741e-08, "loss": 1.2764, "step": 3750 }, { "epoch": 0.9539515412913865, "grad_norm": 1.3967239883734883, "learning_rate": 6.414880936920665e-08, "loss": 1.2454, "step": 3760 }, { "epoch": 0.9564886464543956, "grad_norm": 1.4745766429828837, "learning_rate": 5.726954125943318e-08, "loss": 1.2747, "step": 3770 }, { "epoch": 0.9590257516174046, "grad_norm": 1.3354472782196753, "learning_rate": 5.0778232630897536e-08, "loss": 1.2717, "step": 3780 }, { "epoch": 0.9615628567804135, "grad_norm": 1.5048063707346293, "learning_rate": 4.4675392992412634e-08, "loss": 1.2728, "step": 3790 }, { "epoch": 0.9640999619434225, "grad_norm": 1.4399837174477832, "learning_rate": 3.896150136150134e-08, "loss": 1.2826, "step": 3800 }, { "epoch": 0.9666370671064316, "grad_norm": 1.4450380885385077, "learning_rate": 3.3637006226797665e-08, "loss": 1.2534, "step": 3810 }, { "epoch": 0.9691741722694406, "grad_norm": 1.35673950609508, "learning_rate": 2.8702325512844908e-08, "loss": 1.2609, "step": 3820 }, { "epoch": 0.9717112774324496, "grad_norm": 1.43190405721669, "learning_rate": 2.4157846547292473e-08, "loss": 1.2787, "step": 3830 }, { "epoch": 0.9742483825954585, "grad_norm": 1.4245067018508633, "learning_rate": 2.000392603049517e-08, "loss": 1.2665, "step": 3840 }, { "epoch": 0.9767854877584676, "grad_norm": 1.4112885660620007, "learning_rate": 1.6240890007510612e-08, "loss": 1.2785, "step": 3850 }, { "epoch": 0.9793225929214766, "grad_norm": 1.499341122900986, "learning_rate": 1.286903384251581e-08, "loss": 1.2539, "step": 3860 }, { "epoch": 0.9818596980844856, "grad_norm": 1.4494246005853764, "learning_rate": 9.888622195615705e-09, "loss": 1.2725, "step": 3870 }, { "epoch": 0.9843968032474946, "grad_norm": 1.421431046100693, "learning_rate": 7.299889002075344e-09, "loss": 1.2726, "step": 3880 }, { "epoch": 0.9869339084105037, "grad_norm": 1.4209346862475516, "learning_rate": 5.103037453954573e-09, "loss": 1.2548, "step": 3890 }, { "epoch": 0.9894710135735126, "grad_norm": 1.3751824191752333, "learning_rate": 3.2982399841618996e-09, "loss": 1.2859, "step": 3900 }, { "epoch": 0.9920081187365216, "grad_norm": 1.4627888810880714, "learning_rate": 1.8856382529192085e-09, "loss": 1.2842, "step": 3910 }, { "epoch": 0.9945452238995306, "grad_norm": 1.3905016842842302, "learning_rate": 8.653431366406617e-10, "loss": 1.2447, "step": 3920 }, { "epoch": 0.9970823290625397, "grad_norm": 1.5470180096733397, "learning_rate": 2.374347192335424e-10, "loss": 1.2707, "step": 3930 }, { "epoch": 0.9996194342255487, "grad_norm": 1.4506027900307656, "learning_rate": 1.9622858088430564e-12, "loss": 1.2737, "step": 3940 }, { "epoch": 0.9998731447418495, "step": 3941, "total_flos": 3.7575827488610714e+18, "train_loss": 1.3289946492206504, "train_runtime": 13442.1001, "train_samples_per_second": 37.53, "train_steps_per_second": 0.293 } ], "logging_steps": 10, "max_steps": 3941, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.7575827488610714e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null }