{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9991537376586743, "eval_steps": 500, "global_step": 1329, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.022566995768688293, "grad_norm": 3.9500676023225703, "learning_rate": 3.7593984962406015e-07, "loss": 0.8582, "step": 10 }, { "epoch": 0.045133991537376586, "grad_norm": 1.395677808116374, "learning_rate": 7.518796992481203e-07, "loss": 0.8024, "step": 20 }, { "epoch": 0.06770098730606489, "grad_norm": 0.9062193282306004, "learning_rate": 1.1278195488721805e-06, "loss": 0.7357, "step": 30 }, { "epoch": 0.09026798307475317, "grad_norm": 0.8282785482347225, "learning_rate": 1.5037593984962406e-06, "loss": 0.6898, "step": 40 }, { "epoch": 0.11283497884344147, "grad_norm": 0.701981062657255, "learning_rate": 1.8796992481203007e-06, "loss": 0.6707, "step": 50 }, { "epoch": 0.13540197461212977, "grad_norm": 0.640628105313228, "learning_rate": 2.255639097744361e-06, "loss": 0.6616, "step": 60 }, { "epoch": 0.15796897038081806, "grad_norm": 0.6794950494897954, "learning_rate": 2.631578947368421e-06, "loss": 0.6518, "step": 70 }, { "epoch": 0.18053596614950634, "grad_norm": 0.7767981323221586, "learning_rate": 3.007518796992481e-06, "loss": 0.6496, "step": 80 }, { "epoch": 0.20310296191819463, "grad_norm": 0.7882061395786479, "learning_rate": 3.3834586466165413e-06, "loss": 0.6451, "step": 90 }, { "epoch": 0.22566995768688294, "grad_norm": 0.9439189657933283, "learning_rate": 3.7593984962406014e-06, "loss": 0.6417, "step": 100 }, { "epoch": 0.24823695345557123, "grad_norm": 0.7346506648254489, "learning_rate": 4.135338345864662e-06, "loss": 0.6314, "step": 110 }, { "epoch": 0.27080394922425954, "grad_norm": 0.6595327999279074, "learning_rate": 4.511278195488722e-06, "loss": 0.6293, "step": 120 }, { "epoch": 0.2933709449929478, "grad_norm": 0.8336967878084969, "learning_rate": 4.887218045112782e-06, "loss": 0.6301, "step": 130 }, { "epoch": 0.3159379407616361, "grad_norm": 1.0189555343494847, "learning_rate": 4.9995858503307465e-06, "loss": 0.6244, "step": 140 }, { "epoch": 0.3385049365303244, "grad_norm": 0.7728001819911596, "learning_rate": 4.997557699213042e-06, "loss": 0.6247, "step": 150 }, { "epoch": 0.3610719322990127, "grad_norm": 0.9777925962610354, "learning_rate": 4.993840875896834e-06, "loss": 0.6272, "step": 160 }, { "epoch": 0.383638928067701, "grad_norm": 0.7322084042258205, "learning_rate": 4.988437944773639e-06, "loss": 0.6206, "step": 170 }, { "epoch": 0.40620592383638926, "grad_norm": 0.5923379291119025, "learning_rate": 4.981352633551117e-06, "loss": 0.6168, "step": 180 }, { "epoch": 0.4287729196050776, "grad_norm": 0.5930368670137429, "learning_rate": 4.972589830681161e-06, "loss": 0.611, "step": 190 }, { "epoch": 0.4513399153737659, "grad_norm": 0.6233035871272617, "learning_rate": 4.962155581987166e-06, "loss": 0.6169, "step": 200 }, { "epoch": 0.47390691114245415, "grad_norm": 0.9181448240990033, "learning_rate": 4.9500570864927485e-06, "loss": 0.6131, "step": 210 }, { "epoch": 0.49647390691114246, "grad_norm": 0.6136110334327369, "learning_rate": 4.936302691454844e-06, "loss": 0.6046, "step": 220 }, { "epoch": 0.5190409026798307, "grad_norm": 0.6511071668467217, "learning_rate": 4.920901886604593e-06, "loss": 0.6133, "step": 230 }, { "epoch": 0.5416078984485191, "grad_norm": 0.7958047199444372, "learning_rate": 4.9038652975999665e-06, "loss": 0.6118, "step": 240 }, { "epoch": 0.5641748942172073, "grad_norm": 0.6134271249647514, "learning_rate": 4.8852046786946946e-06, "loss": 0.6057, "step": 250 }, { "epoch": 0.5867418899858956, "grad_norm": 0.7236439355039117, "learning_rate": 4.864932904628507e-06, "loss": 0.608, "step": 260 }, { "epoch": 0.609308885754584, "grad_norm": 0.6533513563016325, "learning_rate": 4.843063961744315e-06, "loss": 0.6031, "step": 270 }, { "epoch": 0.6318758815232722, "grad_norm": 0.6682862848806669, "learning_rate": 4.819612938338455e-06, "loss": 0.6096, "step": 280 }, { "epoch": 0.6544428772919605, "grad_norm": 0.6266507108859902, "learning_rate": 4.794596014250633e-06, "loss": 0.6027, "step": 290 }, { "epoch": 0.6770098730606487, "grad_norm": 0.6518141655545894, "learning_rate": 4.768030449700788e-06, "loss": 0.6086, "step": 300 }, { "epoch": 0.6995768688293371, "grad_norm": 0.5630169599812361, "learning_rate": 4.7399345733805415e-06, "loss": 0.6015, "step": 310 }, { "epoch": 0.7221438645980254, "grad_norm": 0.5368338699635572, "learning_rate": 4.710327769807469e-06, "loss": 0.6046, "step": 320 }, { "epoch": 0.7447108603667136, "grad_norm": 0.5663633274514737, "learning_rate": 4.679230465950908e-06, "loss": 0.6, "step": 330 }, { "epoch": 0.767277856135402, "grad_norm": 0.6293933108526993, "learning_rate": 4.646664117138547e-06, "loss": 0.5993, "step": 340 }, { "epoch": 0.7898448519040903, "grad_norm": 0.5856354618989478, "learning_rate": 4.612651192253495e-06, "loss": 0.6004, "step": 350 }, { "epoch": 0.8124118476727785, "grad_norm": 0.5921119713926103, "learning_rate": 4.577215158232064e-06, "loss": 0.5993, "step": 360 }, { "epoch": 0.8349788434414669, "grad_norm": 0.6330447864809035, "learning_rate": 4.540380463872945e-06, "loss": 0.6063, "step": 370 }, { "epoch": 0.8575458392101551, "grad_norm": 0.5905756830961565, "learning_rate": 4.502172522968966e-06, "loss": 0.6027, "step": 380 }, { "epoch": 0.8801128349788434, "grad_norm": 0.6831894646683164, "learning_rate": 4.462617696773059e-06, "loss": 0.5966, "step": 390 }, { "epoch": 0.9026798307475318, "grad_norm": 0.7462417386371082, "learning_rate": 4.4217432758105215e-06, "loss": 0.5988, "step": 400 }, { "epoch": 0.92524682651622, "grad_norm": 0.6005218513233451, "learning_rate": 4.379577461050153e-06, "loss": 0.5967, "step": 410 }, { "epoch": 0.9478138222849083, "grad_norm": 0.6506758426507315, "learning_rate": 4.336149344447227e-06, "loss": 0.5972, "step": 420 }, { "epoch": 0.9703808180535967, "grad_norm": 0.5379000787907976, "learning_rate": 4.291488888871734e-06, "loss": 0.591, "step": 430 }, { "epoch": 0.9929478138222849, "grad_norm": 0.5676292405964318, "learning_rate": 4.245626907435755e-06, "loss": 0.5957, "step": 440 }, { "epoch": 0.9997179125528914, "eval_loss": 0.5937371253967285, "eval_runtime": 688.298, "eval_samples_per_second": 17.349, "eval_steps_per_second": 0.543, "step": 443 }, { "epoch": 1.0155148095909732, "grad_norm": 0.6510043441238305, "learning_rate": 4.198595042234199e-06, "loss": 0.6093, "step": 450 }, { "epoch": 1.0380818053596614, "grad_norm": 0.6399485357865101, "learning_rate": 4.150425742513609e-06, "loss": 0.5388, "step": 460 }, { "epoch": 1.0606488011283497, "grad_norm": 0.6202997517009662, "learning_rate": 4.101152242284071e-06, "loss": 0.5538, "step": 470 }, { "epoch": 1.0832157968970382, "grad_norm": 0.596478907194731, "learning_rate": 4.050808537389682e-06, "loss": 0.5413, "step": 480 }, { "epoch": 1.1057827926657264, "grad_norm": 0.6688106585925544, "learning_rate": 3.999429362053397e-06, "loss": 0.5516, "step": 490 }, { "epoch": 1.1283497884344147, "grad_norm": 0.5514385320720825, "learning_rate": 3.947050164912446e-06, "loss": 0.5497, "step": 500 }, { "epoch": 1.150916784203103, "grad_norm": 0.5764779109375965, "learning_rate": 3.893707084560831e-06, "loss": 0.5506, "step": 510 }, { "epoch": 1.1734837799717912, "grad_norm": 0.5687644731921871, "learning_rate": 3.839436924615808e-06, "loss": 0.5428, "step": 520 }, { "epoch": 1.1960507757404795, "grad_norm": 0.6716035430024566, "learning_rate": 3.784277128325532e-06, "loss": 0.5485, "step": 530 }, { "epoch": 1.2186177715091677, "grad_norm": 0.7510569732848463, "learning_rate": 3.7282657527354014e-06, "loss": 0.5458, "step": 540 }, { "epoch": 1.2411847672778562, "grad_norm": 0.7205124677149223, "learning_rate": 3.671441442430908e-06, "loss": 0.5507, "step": 550 }, { "epoch": 1.2637517630465445, "grad_norm": 0.5847094902297382, "learning_rate": 3.6138434028751334e-06, "loss": 0.5521, "step": 560 }, { "epoch": 1.2863187588152327, "grad_norm": 0.569651078418994, "learning_rate": 3.555511373359255e-06, "loss": 0.549, "step": 570 }, { "epoch": 1.308885754583921, "grad_norm": 0.6219046665446807, "learning_rate": 3.496485599584756e-06, "loss": 0.5489, "step": 580 }, { "epoch": 1.3314527503526092, "grad_norm": 0.5714415325332034, "learning_rate": 3.4368068058962357e-06, "loss": 0.5487, "step": 590 }, { "epoch": 1.3540197461212977, "grad_norm": 1.1106035467973956, "learning_rate": 3.376516167183983e-06, "loss": 0.5542, "step": 600 }, { "epoch": 1.376586741889986, "grad_norm": 0.5885941973302055, "learning_rate": 3.315655280475704e-06, "loss": 0.5459, "step": 610 }, { "epoch": 1.3991537376586742, "grad_norm": 0.5908363560064332, "learning_rate": 3.2542661362369925e-06, "loss": 0.5502, "step": 620 }, { "epoch": 1.4217207334273625, "grad_norm": 0.5591849121824698, "learning_rate": 3.1923910894003598e-06, "loss": 0.5479, "step": 630 }, { "epoch": 1.4442877291960508, "grad_norm": 0.5602591930048809, "learning_rate": 3.1300728301427947e-06, "loss": 0.5471, "step": 640 }, { "epoch": 1.466854724964739, "grad_norm": 0.5931164000131571, "learning_rate": 3.0673543544320354e-06, "loss": 0.5479, "step": 650 }, { "epoch": 1.4894217207334273, "grad_norm": 0.6020148726486988, "learning_rate": 3.00427893436185e-06, "loss": 0.5515, "step": 660 }, { "epoch": 1.5119887165021155, "grad_norm": 0.5726305762841026, "learning_rate": 2.9408900882968174e-06, "loss": 0.5459, "step": 670 }, { "epoch": 1.5345557122708038, "grad_norm": 0.5629379890905293, "learning_rate": 2.8772315508471893e-06, "loss": 0.5402, "step": 680 }, { "epoch": 1.5571227080394923, "grad_norm": 0.5704880969842114, "learning_rate": 2.8133472426945584e-06, "loss": 0.5417, "step": 690 }, { "epoch": 1.5796897038081805, "grad_norm": 0.5710723412296994, "learning_rate": 2.749281240289151e-06, "loss": 0.546, "step": 700 }, { "epoch": 1.6022566995768688, "grad_norm": 0.5392466716099986, "learning_rate": 2.6850777454396342e-06, "loss": 0.5486, "step": 710 }, { "epoch": 1.6248236953455573, "grad_norm": 0.5812586258338217, "learning_rate": 2.620781054816457e-06, "loss": 0.5563, "step": 720 }, { "epoch": 1.6473906911142455, "grad_norm": 0.5716844996625347, "learning_rate": 2.5564355293897154e-06, "loss": 0.5393, "step": 730 }, { "epoch": 1.6699576868829338, "grad_norm": 0.753596906503236, "learning_rate": 2.49208556382268e-06, "loss": 0.5546, "step": 740 }, { "epoch": 1.692524682651622, "grad_norm": 0.576809698814229, "learning_rate": 2.427775555842057e-06, "loss": 0.5461, "step": 750 }, { "epoch": 1.7150916784203103, "grad_norm": 0.5622230076695471, "learning_rate": 2.3635498756061458e-06, "loss": 0.5448, "step": 760 }, { "epoch": 1.7376586741889986, "grad_norm": 0.5168416840202211, "learning_rate": 2.299452835092016e-06, "loss": 0.545, "step": 770 }, { "epoch": 1.7602256699576868, "grad_norm": 0.5622613178078913, "learning_rate": 2.2355286575228257e-06, "loss": 0.5411, "step": 780 }, { "epoch": 1.782792665726375, "grad_norm": 0.5430296403870848, "learning_rate": 2.1718214468563717e-06, "loss": 0.5441, "step": 790 }, { "epoch": 1.8053596614950633, "grad_norm": 0.5628381998512493, "learning_rate": 2.108375157355931e-06, "loss": 0.5388, "step": 800 }, { "epoch": 1.8279266572637518, "grad_norm": 0.5362604312102565, "learning_rate": 2.0452335632643795e-06, "loss": 0.5494, "step": 810 }, { "epoch": 1.85049365303244, "grad_norm": 0.5154758412468956, "learning_rate": 1.9824402286025154e-06, "loss": 0.5363, "step": 820 }, { "epoch": 1.8730606488011283, "grad_norm": 0.5518765581430269, "learning_rate": 1.92003847711242e-06, "loss": 0.5481, "step": 830 }, { "epoch": 1.8956276445698168, "grad_norm": 0.5312728956898368, "learning_rate": 1.8580713623666068e-06, "loss": 0.5483, "step": 840 }, { "epoch": 1.918194640338505, "grad_norm": 0.5739522548168999, "learning_rate": 1.7965816380635584e-06, "loss": 0.5358, "step": 850 }, { "epoch": 1.9407616361071933, "grad_norm": 0.5414270420814203, "learning_rate": 1.7356117285301693e-06, "loss": 0.5409, "step": 860 }, { "epoch": 1.9633286318758816, "grad_norm": 0.5315261469742604, "learning_rate": 1.675203699451431e-06, "loss": 0.5475, "step": 870 }, { "epoch": 1.9858956276445698, "grad_norm": 0.5423045172744205, "learning_rate": 1.6153992288475608e-06, "loss": 0.5451, "step": 880 }, { "epoch": 1.9994358251057829, "eval_loss": 0.5835120677947998, "eval_runtime": 684.5151, "eval_samples_per_second": 17.444, "eval_steps_per_second": 0.546, "step": 886 }, { "epoch": 2.008462623413258, "grad_norm": 0.7539353987683947, "learning_rate": 1.556239578318598e-06, "loss": 0.5758, "step": 890 }, { "epoch": 2.0310296191819464, "grad_norm": 0.649378671530231, "learning_rate": 1.4977655645762997e-06, "loss": 0.4993, "step": 900 }, { "epoch": 2.0535966149506346, "grad_norm": 0.5877788518727063, "learning_rate": 1.4400175312829936e-06, "loss": 0.5018, "step": 910 }, { "epoch": 2.076163610719323, "grad_norm": 0.5479736885278639, "learning_rate": 1.383035321216798e-06, "loss": 0.4941, "step": 920 }, { "epoch": 2.098730606488011, "grad_norm": 0.560572864576164, "learning_rate": 1.3268582487824252e-06, "loss": 0.5008, "step": 930 }, { "epoch": 2.1212976022566994, "grad_norm": 0.544916918302148, "learning_rate": 1.2715250728865372e-06, "loss": 0.5045, "step": 940 }, { "epoch": 2.143864598025388, "grad_norm": 0.5381551703507882, "learning_rate": 1.2170739701963579e-06, "loss": 0.5052, "step": 950 }, { "epoch": 2.1664315937940763, "grad_norm": 0.546851451490464, "learning_rate": 1.1635425087999944e-06, "loss": 0.5012, "step": 960 }, { "epoch": 2.1889985895627646, "grad_norm": 0.553761979218256, "learning_rate": 1.1109676222866447e-06, "loss": 0.4979, "step": 970 }, { "epoch": 2.211565585331453, "grad_norm": 0.5140355615402071, "learning_rate": 1.0593855842645777e-06, "loss": 0.4922, "step": 980 }, { "epoch": 2.234132581100141, "grad_norm": 0.5469356578333175, "learning_rate": 1.0088319833344533e-06, "loss": 0.4919, "step": 990 }, { "epoch": 2.2566995768688294, "grad_norm": 0.5770077265245278, "learning_rate": 9.593416985352631e-07, "loss": 0.5012, "step": 1000 }, { "epoch": 2.2792665726375176, "grad_norm": 0.5611649964055478, "learning_rate": 9.109488752798265e-07, "loss": 0.5067, "step": 1010 }, { "epoch": 2.301833568406206, "grad_norm": 0.544952831573923, "learning_rate": 8.636869017964417e-07, "loss": 0.5007, "step": 1020 }, { "epoch": 2.324400564174894, "grad_norm": 0.5789004985292946, "learning_rate": 8.175883860929587e-07, "loss": 0.5045, "step": 1030 }, { "epoch": 2.3469675599435824, "grad_norm": 0.5276179958933169, "learning_rate": 7.726851334591511e-07, "loss": 0.5017, "step": 1040 }, { "epoch": 2.3695345557122707, "grad_norm": 0.5396380700970511, "learning_rate": 7.290081245229201e-07, "loss": 0.4989, "step": 1050 }, { "epoch": 2.392101551480959, "grad_norm": 0.5079546332537075, "learning_rate": 6.865874938754659e-07, "loss": 0.498, "step": 1060 }, { "epoch": 2.414668547249647, "grad_norm": 0.5372856734992372, "learning_rate": 6.454525092801745e-07, "loss": 0.4983, "step": 1070 }, { "epoch": 2.4372355430183354, "grad_norm": 0.5433991054748281, "learning_rate": 6.056315514795699e-07, "loss": 0.5016, "step": 1080 }, { "epoch": 2.459802538787024, "grad_norm": 0.5113580868200133, "learning_rate": 5.671520946142526e-07, "loss": 0.4997, "step": 1090 }, { "epoch": 2.4823695345557124, "grad_norm": 0.5287422362758118, "learning_rate": 5.300406872673434e-07, "loss": 0.5022, "step": 1100 }, { "epoch": 2.5049365303244007, "grad_norm": 0.5243252088737198, "learning_rate": 4.943229341475108e-07, "loss": 0.5054, "step": 1110 }, { "epoch": 2.527503526093089, "grad_norm": 0.5214904323076445, "learning_rate": 4.600234784232087e-07, "loss": 0.495, "step": 1120 }, { "epoch": 2.550070521861777, "grad_norm": 0.5270933929763095, "learning_rate": 4.2716598472033066e-07, "loss": 0.4999, "step": 1130 }, { "epoch": 2.5726375176304654, "grad_norm": 0.5345621477399009, "learning_rate": 3.957731227949975e-07, "loss": 0.4982, "step": 1140 }, { "epoch": 2.5952045133991537, "grad_norm": 0.5205630847096079, "learning_rate": 3.6586655189274235e-07, "loss": 0.4986, "step": 1150 }, { "epoch": 2.617771509167842, "grad_norm": 0.5533684987412459, "learning_rate": 3.374669058048948e-07, "loss": 0.5019, "step": 1160 }, { "epoch": 2.64033850493653, "grad_norm": 0.5750216155975586, "learning_rate": 3.1059377863246947e-07, "loss": 0.4904, "step": 1170 }, { "epoch": 2.6629055007052185, "grad_norm": 0.5225421360510641, "learning_rate": 2.8526571126737587e-07, "loss": 0.5015, "step": 1180 }, { "epoch": 2.685472496473907, "grad_norm": 0.5182002723553231, "learning_rate": 2.615001786002833e-07, "loss": 0.502, "step": 1190 }, { "epoch": 2.7080394922425954, "grad_norm": 0.5335554786344534, "learning_rate": 2.3931357746396515e-07, "loss": 0.5001, "step": 1200 }, { "epoch": 2.7306064880112837, "grad_norm": 0.5298076499586926, "learning_rate": 2.1872121532043977e-07, "loss": 0.5009, "step": 1210 }, { "epoch": 2.753173483779972, "grad_norm": 0.521497332991374, "learning_rate": 1.9973729969971315e-07, "loss": 0.4973, "step": 1220 }, { "epoch": 2.77574047954866, "grad_norm": 0.5199026307611461, "learning_rate": 1.823749283974119e-07, "loss": 0.5041, "step": 1230 }, { "epoch": 2.7983074753173485, "grad_norm": 0.5141813761312602, "learning_rate": 1.6664608043806614e-07, "loss": 0.5023, "step": 1240 }, { "epoch": 2.8208744710860367, "grad_norm": 0.513473611989643, "learning_rate": 1.5256160781028118e-07, "loss": 0.5021, "step": 1250 }, { "epoch": 2.843441466854725, "grad_norm": 0.5144086219492163, "learning_rate": 1.401312279794984e-07, "loss": 0.5036, "step": 1260 }, { "epoch": 2.8660084626234132, "grad_norm": 0.507879807961254, "learning_rate": 1.2936351718350923e-07, "loss": 0.5009, "step": 1270 }, { "epoch": 2.8885754583921015, "grad_norm": 0.5056596630306435, "learning_rate": 1.202659045153513e-07, "loss": 0.4983, "step": 1280 }, { "epoch": 2.9111424541607898, "grad_norm": 0.5186469070972282, "learning_rate": 1.1284466679766762e-07, "loss": 0.5021, "step": 1290 }, { "epoch": 2.933709449929478, "grad_norm": 0.5191667761810439, "learning_rate": 1.0710492425206354e-07, "loss": 0.4973, "step": 1300 }, { "epoch": 2.9562764456981663, "grad_norm": 0.5453516596097608, "learning_rate": 1.0305063696645283e-07, "loss": 0.5012, "step": 1310 }, { "epoch": 2.9788434414668545, "grad_norm": 0.5160351097067257, "learning_rate": 1.006846021628281e-07, "loss": 0.4967, "step": 1320 }, { "epoch": 2.9991537376586743, "eval_loss": 0.5900959968566895, "eval_runtime": 688.3543, "eval_samples_per_second": 17.347, "eval_steps_per_second": 0.543, "step": 1329 }, { "epoch": 2.9991537376586743, "step": 1329, "total_flos": 5064195066298368.0, "train_loss": 0.55943138999451, "train_runtime": 121397.9839, "train_samples_per_second": 5.606, "train_steps_per_second": 0.011 } ], "logging_steps": 10, "max_steps": 1329, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5064195066298368.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }