{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 600, "global_step": 3573, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.008396305625524769, "grad_norm": 3.039360795711161, "learning_rate": 1.3966480446927373e-06, "loss": 1.7981, "step": 10 }, { "epoch": 0.016792611251049538, "grad_norm": 2.1081820956343322, "learning_rate": 2.7932960893854746e-06, "loss": 1.7816, "step": 20 }, { "epoch": 0.02518891687657431, "grad_norm": 0.563623056034138, "learning_rate": 4.189944134078212e-06, "loss": 1.6797, "step": 30 }, { "epoch": 0.033585222502099076, "grad_norm": 0.4076099849250059, "learning_rate": 5.586592178770949e-06, "loss": 1.5657, "step": 40 }, { "epoch": 0.041981528127623846, "grad_norm": 0.30151480402917696, "learning_rate": 6.983240223463687e-06, "loss": 1.4759, "step": 50 }, { "epoch": 0.05037783375314862, "grad_norm": 0.23775034251274657, "learning_rate": 8.379888268156424e-06, "loss": 1.4312, "step": 60 }, { "epoch": 0.05877413937867338, "grad_norm": 0.2421182919342462, "learning_rate": 9.776536312849161e-06, "loss": 1.4082, "step": 70 }, { "epoch": 0.06717044500419815, "grad_norm": 0.23011928549656843, "learning_rate": 1.1173184357541899e-05, "loss": 1.4263, "step": 80 }, { "epoch": 0.07556675062972293, "grad_norm": 0.21851210361234266, "learning_rate": 1.2569832402234637e-05, "loss": 1.4216, "step": 90 }, { "epoch": 0.08396305625524769, "grad_norm": 0.23106315932058966, "learning_rate": 1.3966480446927374e-05, "loss": 1.4207, "step": 100 }, { "epoch": 0.09235936188077246, "grad_norm": 0.23062228974277152, "learning_rate": 1.5363128491620113e-05, "loss": 1.399, "step": 110 }, { "epoch": 0.10075566750629723, "grad_norm": 0.23855661202824083, "learning_rate": 1.675977653631285e-05, "loss": 1.404, "step": 120 }, { "epoch": 0.109151973131822, "grad_norm": 0.2302715158226374, "learning_rate": 1.8156424581005588e-05, "loss": 1.4057, "step": 130 }, { "epoch": 0.11754827875734676, "grad_norm": 0.23018949408838, "learning_rate": 1.9553072625698323e-05, "loss": 1.3956, "step": 140 }, { "epoch": 0.12594458438287154, "grad_norm": 0.23103802546960453, "learning_rate": 2.0949720670391062e-05, "loss": 1.4179, "step": 150 }, { "epoch": 0.1343408900083963, "grad_norm": 0.2287991193827312, "learning_rate": 2.2346368715083797e-05, "loss": 1.412, "step": 160 }, { "epoch": 0.14273719563392107, "grad_norm": 0.22988402418477955, "learning_rate": 2.3743016759776536e-05, "loss": 1.39, "step": 170 }, { "epoch": 0.15113350125944586, "grad_norm": 0.21974152422479476, "learning_rate": 2.5139664804469275e-05, "loss": 1.4004, "step": 180 }, { "epoch": 0.15952980688497062, "grad_norm": 0.23284763880742212, "learning_rate": 2.6536312849162014e-05, "loss": 1.386, "step": 190 }, { "epoch": 0.16792611251049538, "grad_norm": 0.21486774869633107, "learning_rate": 2.793296089385475e-05, "loss": 1.4012, "step": 200 }, { "epoch": 0.17632241813602015, "grad_norm": 0.22783180948472934, "learning_rate": 2.9329608938547488e-05, "loss": 1.3707, "step": 210 }, { "epoch": 0.1847187237615449, "grad_norm": 0.22399716105168718, "learning_rate": 3.0726256983240227e-05, "loss": 1.4125, "step": 220 }, { "epoch": 0.19311502938706968, "grad_norm": 0.2147254679707046, "learning_rate": 3.212290502793296e-05, "loss": 1.3699, "step": 230 }, { "epoch": 0.20151133501259447, "grad_norm": 0.21426390076476165, "learning_rate": 3.35195530726257e-05, "loss": 1.374, "step": 240 }, { "epoch": 0.20990764063811923, "grad_norm": 0.22424846384922523, "learning_rate": 3.491620111731844e-05, "loss": 1.3788, "step": 250 }, { "epoch": 0.218303946263644, "grad_norm": 0.21440605091273993, "learning_rate": 3.6312849162011175e-05, "loss": 1.3565, "step": 260 }, { "epoch": 0.22670025188916876, "grad_norm": 0.21539799426915837, "learning_rate": 3.770949720670392e-05, "loss": 1.3521, "step": 270 }, { "epoch": 0.23509655751469352, "grad_norm": 0.22056667191105353, "learning_rate": 3.9106145251396646e-05, "loss": 1.3613, "step": 280 }, { "epoch": 0.2434928631402183, "grad_norm": 0.2041245512715553, "learning_rate": 4.050279329608939e-05, "loss": 1.36, "step": 290 }, { "epoch": 0.2518891687657431, "grad_norm": 0.2053261976415489, "learning_rate": 4.1899441340782123e-05, "loss": 1.3799, "step": 300 }, { "epoch": 0.26028547439126787, "grad_norm": 0.20324705222690662, "learning_rate": 4.3296089385474866e-05, "loss": 1.3797, "step": 310 }, { "epoch": 0.2686817800167926, "grad_norm": 0.20526982003231917, "learning_rate": 4.4692737430167594e-05, "loss": 1.3675, "step": 320 }, { "epoch": 0.2770780856423174, "grad_norm": 0.20273779659012153, "learning_rate": 4.6089385474860336e-05, "loss": 1.3888, "step": 330 }, { "epoch": 0.28547439126784213, "grad_norm": 0.2061146702772721, "learning_rate": 4.748603351955307e-05, "loss": 1.3626, "step": 340 }, { "epoch": 0.2938706968933669, "grad_norm": 0.18820362655678458, "learning_rate": 4.8882681564245814e-05, "loss": 1.3598, "step": 350 }, { "epoch": 0.3022670025188917, "grad_norm": 0.21352845647640323, "learning_rate": 4.9999952257225154e-05, "loss": 1.3933, "step": 360 }, { "epoch": 0.31066330814441645, "grad_norm": 0.21205929000762902, "learning_rate": 4.9998281279252004e-05, "loss": 1.3723, "step": 370 }, { "epoch": 0.31905961376994124, "grad_norm": 0.20026396935477256, "learning_rate": 4.9994223344882976e-05, "loss": 1.3889, "step": 380 }, { "epoch": 0.327455919395466, "grad_norm": 0.2027355098630582, "learning_rate": 4.9987778841589184e-05, "loss": 1.3757, "step": 390 }, { "epoch": 0.33585222502099077, "grad_norm": 0.1917389523789995, "learning_rate": 4.997894838472288e-05, "loss": 1.3602, "step": 400 }, { "epoch": 0.34424853064651556, "grad_norm": 0.19386030500153506, "learning_rate": 4.996773281745864e-05, "loss": 1.3483, "step": 410 }, { "epoch": 0.3526448362720403, "grad_norm": 0.18924586602111995, "learning_rate": 4.9954133210712904e-05, "loss": 1.3773, "step": 420 }, { "epoch": 0.3610411418975651, "grad_norm": 0.19404566415989755, "learning_rate": 4.9938150863041666e-05, "loss": 1.3817, "step": 430 }, { "epoch": 0.3694374475230898, "grad_norm": 0.1864716184637937, "learning_rate": 4.991978730051653e-05, "loss": 1.3436, "step": 440 }, { "epoch": 0.3778337531486146, "grad_norm": 0.18756275917822954, "learning_rate": 4.9899044276578956e-05, "loss": 1.376, "step": 450 }, { "epoch": 0.38623005877413935, "grad_norm": 0.19103603911697656, "learning_rate": 4.987592377187286e-05, "loss": 1.3699, "step": 460 }, { "epoch": 0.39462636439966414, "grad_norm": 0.19933360659538543, "learning_rate": 4.985042799405547e-05, "loss": 1.3831, "step": 470 }, { "epoch": 0.40302267002518893, "grad_norm": 0.19498666555973088, "learning_rate": 4.982255937758657e-05, "loss": 1.3579, "step": 480 }, { "epoch": 0.41141897565071367, "grad_norm": 0.19300141279658406, "learning_rate": 4.9792320583495975e-05, "loss": 1.348, "step": 490 }, { "epoch": 0.41981528127623846, "grad_norm": 0.19742724416821206, "learning_rate": 4.975971449912952e-05, "loss": 1.3474, "step": 500 }, { "epoch": 0.4282115869017632, "grad_norm": 0.19421992100107466, "learning_rate": 4.972474423787331e-05, "loss": 1.3609, "step": 510 }, { "epoch": 0.436607892527288, "grad_norm": 0.2078937193948646, "learning_rate": 4.9687413138856446e-05, "loss": 1.3829, "step": 520 }, { "epoch": 0.4450041981528128, "grad_norm": 0.19272346953581754, "learning_rate": 4.964772476663222e-05, "loss": 1.3554, "step": 530 }, { "epoch": 0.4534005037783375, "grad_norm": 0.19281909541060518, "learning_rate": 4.960568291083772e-05, "loss": 1.3515, "step": 540 }, { "epoch": 0.4617968094038623, "grad_norm": 0.19781631242122302, "learning_rate": 4.956129158583199e-05, "loss": 1.3547, "step": 550 }, { "epoch": 0.47019311502938704, "grad_norm": 0.21165698099082017, "learning_rate": 4.951455503031271e-05, "loss": 1.329, "step": 560 }, { "epoch": 0.47858942065491183, "grad_norm": 0.1954676247733346, "learning_rate": 4.946547770691152e-05, "loss": 1.3547, "step": 570 }, { "epoch": 0.4869857262804366, "grad_norm": 0.1934746445894875, "learning_rate": 4.9414064301767786e-05, "loss": 1.3421, "step": 580 }, { "epoch": 0.49538203190596136, "grad_norm": 0.19263774524822871, "learning_rate": 4.9360319724081285e-05, "loss": 1.3576, "step": 590 }, { "epoch": 0.5037783375314862, "grad_norm": 0.18876853207737465, "learning_rate": 4.930424910564334e-05, "loss": 1.3667, "step": 600 }, { "epoch": 0.5037783375314862, "eval_loss": 1.3807679414749146, "eval_runtime": 37.4693, "eval_samples_per_second": 10.275, "eval_steps_per_second": 2.589, "step": 600 }, { "epoch": 0.5121746431570109, "grad_norm": 0.19209309775474845, "learning_rate": 4.92458578003469e-05, "loss": 1.3504, "step": 610 }, { "epoch": 0.5205709487825357, "grad_norm": 0.19079678945155537, "learning_rate": 4.9185151383675246e-05, "loss": 1.339, "step": 620 }, { "epoch": 0.5289672544080605, "grad_norm": 0.194534067082253, "learning_rate": 4.912213565216966e-05, "loss": 1.3358, "step": 630 }, { "epoch": 0.5373635600335852, "grad_norm": 0.19307901988822676, "learning_rate": 4.905681662287595e-05, "loss": 1.3708, "step": 640 }, { "epoch": 0.5457598656591099, "grad_norm": 0.1990057756809498, "learning_rate": 4.898920053276989e-05, "loss": 1.363, "step": 650 }, { "epoch": 0.5541561712846348, "grad_norm": 0.2010697996628507, "learning_rate": 4.891929383816169e-05, "loss": 1.3993, "step": 660 }, { "epoch": 0.5625524769101595, "grad_norm": 0.20209675418432702, "learning_rate": 4.8847103214079524e-05, "loss": 1.351, "step": 670 }, { "epoch": 0.5709487825356843, "grad_norm": 0.20138078865908093, "learning_rate": 4.877263555363216e-05, "loss": 1.3494, "step": 680 }, { "epoch": 0.5793450881612091, "grad_norm": 0.20340329720637837, "learning_rate": 4.869589796735078e-05, "loss": 1.3699, "step": 690 }, { "epoch": 0.5877413937867338, "grad_norm": 0.18554034872359323, "learning_rate": 4.8616897782510006e-05, "loss": 1.3358, "step": 700 }, { "epoch": 0.5961376994122586, "grad_norm": 0.20470702291536869, "learning_rate": 4.853564254242831e-05, "loss": 1.3417, "step": 710 }, { "epoch": 0.6045340050377834, "grad_norm": 0.20233392948438, "learning_rate": 4.845214000574768e-05, "loss": 1.3633, "step": 720 }, { "epoch": 0.6129303106633082, "grad_norm": 0.19552940545300115, "learning_rate": 4.8366398145692824e-05, "loss": 1.3582, "step": 730 }, { "epoch": 0.6213266162888329, "grad_norm": 0.20658182549440357, "learning_rate": 4.827842514930985e-05, "loss": 1.3702, "step": 740 }, { "epoch": 0.6297229219143576, "grad_norm": 0.2029780054195454, "learning_rate": 4.81882294166845e-05, "loss": 1.3807, "step": 750 }, { "epoch": 0.6381192275398825, "grad_norm": 0.19667484329069757, "learning_rate": 4.8095819560140096e-05, "loss": 1.3593, "step": 760 }, { "epoch": 0.6465155331654072, "grad_norm": 0.2048531959544165, "learning_rate": 4.8001204403415204e-05, "loss": 1.3583, "step": 770 }, { "epoch": 0.654911838790932, "grad_norm": 0.2036838290154731, "learning_rate": 4.7904392980821035e-05, "loss": 1.3478, "step": 780 }, { "epoch": 0.6633081444164568, "grad_norm": 0.20036896998141893, "learning_rate": 4.78053945363789e-05, "loss": 1.361, "step": 790 }, { "epoch": 0.6717044500419815, "grad_norm": 0.19874326596454528, "learning_rate": 4.770421852293746e-05, "loss": 1.3445, "step": 800 }, { "epoch": 0.6801007556675063, "grad_norm": 0.20224592915430473, "learning_rate": 4.760087460127019e-05, "loss": 1.3328, "step": 810 }, { "epoch": 0.6884970612930311, "grad_norm": 0.20341065735706346, "learning_rate": 4.749537263915288e-05, "loss": 1.327, "step": 820 }, { "epoch": 0.6968933669185559, "grad_norm": 0.19878763475266986, "learning_rate": 4.7387722710421414e-05, "loss": 1.3508, "step": 830 }, { "epoch": 0.7052896725440806, "grad_norm": 0.1901873610814321, "learning_rate": 4.7277935094009896e-05, "loss": 1.331, "step": 840 }, { "epoch": 0.7136859781696053, "grad_norm": 0.20858107457668973, "learning_rate": 4.716602027296916e-05, "loss": 1.3398, "step": 850 }, { "epoch": 0.7220822837951302, "grad_norm": 0.2127448740445764, "learning_rate": 4.705198893346578e-05, "loss": 1.3557, "step": 860 }, { "epoch": 0.7304785894206549, "grad_norm": 0.2052323089476111, "learning_rate": 4.693585196376171e-05, "loss": 1.3422, "step": 870 }, { "epoch": 0.7388748950461796, "grad_norm": 0.2034886804011326, "learning_rate": 4.681762045317464e-05, "loss": 1.3767, "step": 880 }, { "epoch": 0.7472712006717045, "grad_norm": 0.20205914104891914, "learning_rate": 4.6697305691019105e-05, "loss": 1.3459, "step": 890 }, { "epoch": 0.7556675062972292, "grad_norm": 0.1947344074048, "learning_rate": 4.6574919165528567e-05, "loss": 1.3253, "step": 900 }, { "epoch": 0.764063811922754, "grad_norm": 0.20617146845645057, "learning_rate": 4.645047256275839e-05, "loss": 1.3419, "step": 910 }, { "epoch": 0.7724601175482787, "grad_norm": 0.19999709652925082, "learning_rate": 4.632397776547011e-05, "loss": 1.3478, "step": 920 }, { "epoch": 0.7808564231738035, "grad_norm": 0.20682503408598918, "learning_rate": 4.619544685199669e-05, "loss": 1.3391, "step": 930 }, { "epoch": 0.7892527287993283, "grad_norm": 0.2111768615345554, "learning_rate": 4.6064892095089326e-05, "loss": 1.3445, "step": 940 }, { "epoch": 0.797649034424853, "grad_norm": 0.2081975290000968, "learning_rate": 4.59323259607455e-05, "loss": 1.3167, "step": 950 }, { "epoch": 0.8060453400503779, "grad_norm": 0.20101213815546085, "learning_rate": 4.5797761107018754e-05, "loss": 1.3383, "step": 960 }, { "epoch": 0.8144416456759026, "grad_norm": 0.2174273103650515, "learning_rate": 4.566121038280995e-05, "loss": 1.3404, "step": 970 }, { "epoch": 0.8228379513014273, "grad_norm": 0.19965611041565887, "learning_rate": 4.552268682664045e-05, "loss": 1.3333, "step": 980 }, { "epoch": 0.8312342569269522, "grad_norm": 0.20691842606557614, "learning_rate": 4.538220366540713e-05, "loss": 1.3182, "step": 990 }, { "epoch": 0.8396305625524769, "grad_norm": 0.21181131160704766, "learning_rate": 4.52397743131194e-05, "loss": 1.3494, "step": 1000 }, { "epoch": 0.8480268681780017, "grad_norm": 0.20088379875239834, "learning_rate": 4.509541236961835e-05, "loss": 1.3372, "step": 1010 }, { "epoch": 0.8564231738035264, "grad_norm": 0.21111113264702316, "learning_rate": 4.4949131619278233e-05, "loss": 1.3459, "step": 1020 }, { "epoch": 0.8648194794290512, "grad_norm": 0.2099829717112053, "learning_rate": 4.4800946029690205e-05, "loss": 1.3288, "step": 1030 }, { "epoch": 0.873215785054576, "grad_norm": 0.217914005065172, "learning_rate": 4.465086975032866e-05, "loss": 1.3524, "step": 1040 }, { "epoch": 0.8816120906801007, "grad_norm": 0.2063304535624245, "learning_rate": 4.4498917111200154e-05, "loss": 1.3233, "step": 1050 }, { "epoch": 0.8900083963056256, "grad_norm": 0.21350579375508957, "learning_rate": 4.434510262147514e-05, "loss": 1.3486, "step": 1060 }, { "epoch": 0.8984047019311503, "grad_norm": 0.2091334109956843, "learning_rate": 4.418944096810253e-05, "loss": 1.3375, "step": 1070 }, { "epoch": 0.906801007556675, "grad_norm": 0.20954696844936122, "learning_rate": 4.403194701440733e-05, "loss": 1.3428, "step": 1080 }, { "epoch": 0.9151973131821999, "grad_norm": 0.21790164835706194, "learning_rate": 4.3872635798671384e-05, "loss": 1.3312, "step": 1090 }, { "epoch": 0.9235936188077246, "grad_norm": 0.21625055602980792, "learning_rate": 4.3711522532697526e-05, "loss": 1.3268, "step": 1100 }, { "epoch": 0.9319899244332494, "grad_norm": 0.21992048192275737, "learning_rate": 4.354862260035699e-05, "loss": 1.2904, "step": 1110 }, { "epoch": 0.9403862300587741, "grad_norm": 0.22805590389300254, "learning_rate": 4.338395155612055e-05, "loss": 1.3352, "step": 1120 }, { "epoch": 0.9487825356842989, "grad_norm": 0.2300433128490517, "learning_rate": 4.321752512357328e-05, "loss": 1.3327, "step": 1130 }, { "epoch": 0.9571788413098237, "grad_norm": 0.2564818367135206, "learning_rate": 4.304935919391317e-05, "loss": 1.3293, "step": 1140 }, { "epoch": 0.9655751469353484, "grad_norm": 0.21597640959608433, "learning_rate": 4.2879469824433814e-05, "loss": 1.3171, "step": 1150 }, { "epoch": 0.9739714525608733, "grad_norm": 0.21055331712423173, "learning_rate": 4.2707873236991116e-05, "loss": 1.3291, "step": 1160 }, { "epoch": 0.982367758186398, "grad_norm": 0.21558556200945594, "learning_rate": 4.2534585816454384e-05, "loss": 1.2906, "step": 1170 }, { "epoch": 0.9907640638119227, "grad_norm": 0.2134813755097652, "learning_rate": 4.235962410914185e-05, "loss": 1.3516, "step": 1180 }, { "epoch": 0.9991603694374476, "grad_norm": 0.22460106184588927, "learning_rate": 4.218300482124069e-05, "loss": 1.3249, "step": 1190 }, { "epoch": 1.0075566750629723, "grad_norm": 0.2510896281590031, "learning_rate": 4.2004744817211866e-05, "loss": 1.2766, "step": 1200 }, { "epoch": 1.0075566750629723, "eval_loss": 1.3506666421890259, "eval_runtime": 37.3255, "eval_samples_per_second": 10.315, "eval_steps_per_second": 2.599, "step": 1200 }, { "epoch": 1.015952980688497, "grad_norm": 0.2491416849165635, "learning_rate": 4.182486111817983e-05, "loss": 1.2816, "step": 1210 }, { "epoch": 1.0243492863140218, "grad_norm": 0.2711674700370134, "learning_rate": 4.164337090030727e-05, "loss": 1.2815, "step": 1220 }, { "epoch": 1.0327455919395465, "grad_norm": 0.25237777386220234, "learning_rate": 4.146029149315502e-05, "loss": 1.2692, "step": 1230 }, { "epoch": 1.0411418975650715, "grad_norm": 0.24173423889336904, "learning_rate": 4.1275640378027367e-05, "loss": 1.2572, "step": 1240 }, { "epoch": 1.0495382031905962, "grad_norm": 0.2516517653129535, "learning_rate": 4.108943518630287e-05, "loss": 1.2636, "step": 1250 }, { "epoch": 1.057934508816121, "grad_norm": 0.2663809997677264, "learning_rate": 4.09016936977508e-05, "loss": 1.246, "step": 1260 }, { "epoch": 1.0663308144416457, "grad_norm": 0.2666405948242722, "learning_rate": 4.071243383883344e-05, "loss": 1.2416, "step": 1270 }, { "epoch": 1.0747271200671704, "grad_norm": 0.27372912036143243, "learning_rate": 4.052167368099443e-05, "loss": 1.2684, "step": 1280 }, { "epoch": 1.0831234256926952, "grad_norm": 0.2822619978126531, "learning_rate": 4.032943143893315e-05, "loss": 1.2679, "step": 1290 }, { "epoch": 1.0915197313182199, "grad_norm": 0.2630554056666267, "learning_rate": 4.0135725468865545e-05, "loss": 1.2801, "step": 1300 }, { "epoch": 1.0999160369437448, "grad_norm": 0.27334249585434633, "learning_rate": 3.994057426677135e-05, "loss": 1.2552, "step": 1310 }, { "epoch": 1.1083123425692696, "grad_norm": 0.2751762295182287, "learning_rate": 3.974399646662804e-05, "loss": 1.2467, "step": 1320 }, { "epoch": 1.1167086481947943, "grad_norm": 0.27811851573000057, "learning_rate": 3.9546010838631523e-05, "loss": 1.2393, "step": 1330 }, { "epoch": 1.125104953820319, "grad_norm": 0.2786179703795066, "learning_rate": 3.9346636287403936e-05, "loss": 1.2413, "step": 1340 }, { "epoch": 1.1335012594458438, "grad_norm": 0.2757598420534043, "learning_rate": 3.9145891850188446e-05, "loss": 1.2344, "step": 1350 }, { "epoch": 1.1418975650713685, "grad_norm": 0.2826938023138817, "learning_rate": 3.894379669503159e-05, "loss": 1.2723, "step": 1360 }, { "epoch": 1.1502938706968933, "grad_norm": 0.28640980146174666, "learning_rate": 3.874037011895294e-05, "loss": 1.2441, "step": 1370 }, { "epoch": 1.1586901763224182, "grad_norm": 0.2806223555234711, "learning_rate": 3.853563154610255e-05, "loss": 1.2599, "step": 1380 }, { "epoch": 1.167086481947943, "grad_norm": 0.280705015660275, "learning_rate": 3.832960052590626e-05, "loss": 1.2596, "step": 1390 }, { "epoch": 1.1754827875734677, "grad_norm": 0.2794281887739961, "learning_rate": 3.812229673119902e-05, "loss": 1.2679, "step": 1400 }, { "epoch": 1.1838790931989924, "grad_norm": 0.29244474520092206, "learning_rate": 3.791373995634641e-05, "loss": 1.2893, "step": 1410 }, { "epoch": 1.1922753988245172, "grad_norm": 0.27087843377656023, "learning_rate": 3.770395011535459e-05, "loss": 1.2858, "step": 1420 }, { "epoch": 1.200671704450042, "grad_norm": 0.3084320815357459, "learning_rate": 3.749294723996884e-05, "loss": 1.2544, "step": 1430 }, { "epoch": 1.2090680100755669, "grad_norm": 0.2970014625754613, "learning_rate": 3.72807514777608e-05, "loss": 1.2406, "step": 1440 }, { "epoch": 1.2174643157010916, "grad_norm": 0.2858465781764202, "learning_rate": 3.7067383090204696e-05, "loss": 1.2687, "step": 1450 }, { "epoch": 1.2258606213266163, "grad_norm": 0.29654341377963134, "learning_rate": 3.6852862450742664e-05, "loss": 1.2529, "step": 1460 }, { "epoch": 1.234256926952141, "grad_norm": 0.2999209714333275, "learning_rate": 3.663721004283942e-05, "loss": 1.2607, "step": 1470 }, { "epoch": 1.2426532325776658, "grad_norm": 0.32376267785640894, "learning_rate": 3.642044645802638e-05, "loss": 1.2633, "step": 1480 }, { "epoch": 1.2510495382031905, "grad_norm": 0.2946271678215341, "learning_rate": 3.620259239393549e-05, "loss": 1.2631, "step": 1490 }, { "epoch": 1.2594458438287153, "grad_norm": 0.29934101463933066, "learning_rate": 3.5983668652322914e-05, "loss": 1.2323, "step": 1500 }, { "epoch": 1.26784214945424, "grad_norm": 0.3058737589423296, "learning_rate": 3.576369613708278e-05, "loss": 1.2499, "step": 1510 }, { "epoch": 1.276238455079765, "grad_norm": 0.294073209044752, "learning_rate": 3.554269585225117e-05, "loss": 1.2623, "step": 1520 }, { "epoch": 1.2846347607052897, "grad_norm": 0.3133601837566205, "learning_rate": 3.532068890000059e-05, "loss": 1.25, "step": 1530 }, { "epoch": 1.2930310663308144, "grad_norm": 0.3030161218843826, "learning_rate": 3.5097696478624956e-05, "loss": 1.2615, "step": 1540 }, { "epoch": 1.3014273719563392, "grad_norm": 0.2944854213261804, "learning_rate": 3.487373988051556e-05, "loss": 1.2577, "step": 1550 }, { "epoch": 1.309823677581864, "grad_norm": 0.3198932031443526, "learning_rate": 3.464884049012794e-05, "loss": 1.2252, "step": 1560 }, { "epoch": 1.3182199832073889, "grad_norm": 0.3075614601508588, "learning_rate": 3.442301978193996e-05, "loss": 1.2333, "step": 1570 }, { "epoch": 1.3266162888329136, "grad_norm": 0.3025833538356648, "learning_rate": 3.419629931840137e-05, "loss": 1.2309, "step": 1580 }, { "epoch": 1.3350125944584383, "grad_norm": 0.33238464076222285, "learning_rate": 3.396870074787489e-05, "loss": 1.2347, "step": 1590 }, { "epoch": 1.343408900083963, "grad_norm": 0.33142866431526025, "learning_rate": 3.374024580256913e-05, "loss": 1.2278, "step": 1600 }, { "epoch": 1.3518052057094878, "grad_norm": 0.3068832931071064, "learning_rate": 3.351095629646348e-05, "loss": 1.2459, "step": 1610 }, { "epoch": 1.3602015113350125, "grad_norm": 0.32787598875401036, "learning_rate": 3.3280854123225245e-05, "loss": 1.2567, "step": 1620 }, { "epoch": 1.3685978169605373, "grad_norm": 0.334986160782939, "learning_rate": 3.3049961254119077e-05, "loss": 1.2427, "step": 1630 }, { "epoch": 1.376994122586062, "grad_norm": 0.3257919179674348, "learning_rate": 3.281829973590909e-05, "loss": 1.2292, "step": 1640 }, { "epoch": 1.385390428211587, "grad_norm": 0.33488335118724716, "learning_rate": 3.258589168875373e-05, "loss": 1.2231, "step": 1650 }, { "epoch": 1.3937867338371117, "grad_norm": 0.29956276796558484, "learning_rate": 3.2352759304093624e-05, "loss": 1.2399, "step": 1660 }, { "epoch": 1.4021830394626364, "grad_norm": 0.32793658183091984, "learning_rate": 3.211892484253261e-05, "loss": 1.2786, "step": 1670 }, { "epoch": 1.4105793450881612, "grad_norm": 0.3437947332874714, "learning_rate": 3.1884410631712235e-05, "loss": 1.2423, "step": 1680 }, { "epoch": 1.418975650713686, "grad_norm": 0.32417782627379454, "learning_rate": 3.164923906417979e-05, "loss": 1.2399, "step": 1690 }, { "epoch": 1.4273719563392109, "grad_norm": 0.32303675587973024, "learning_rate": 3.1413432595250134e-05, "loss": 1.2575, "step": 1700 }, { "epoch": 1.4357682619647356, "grad_norm": 0.3107110639387478, "learning_rate": 3.117701374086158e-05, "loss": 1.2155, "step": 1710 }, { "epoch": 1.4441645675902604, "grad_norm": 0.31980715839644586, "learning_rate": 3.094000507542595e-05, "loss": 1.2561, "step": 1720 }, { "epoch": 1.452560873215785, "grad_norm": 0.3452092427446392, "learning_rate": 3.07024292296731e-05, "loss": 1.2389, "step": 1730 }, { "epoch": 1.4609571788413098, "grad_norm": 0.34168169619885197, "learning_rate": 3.0464308888489936e-05, "loss": 1.2323, "step": 1740 }, { "epoch": 1.4693534844668346, "grad_norm": 0.33064208693690544, "learning_rate": 3.022566678875446e-05, "loss": 1.228, "step": 1750 }, { "epoch": 1.4777497900923593, "grad_norm": 0.3362430973154644, "learning_rate": 2.9986525717164694e-05, "loss": 1.2025, "step": 1760 }, { "epoch": 1.486146095717884, "grad_norm": 0.34796677921939057, "learning_rate": 2.974690850806287e-05, "loss": 1.2585, "step": 1770 }, { "epoch": 1.4945424013434088, "grad_norm": 0.31369265907711963, "learning_rate": 2.9506838041255147e-05, "loss": 1.2438, "step": 1780 }, { "epoch": 1.5029387069689337, "grad_norm": 0.35233288464429297, "learning_rate": 2.926633723982692e-05, "loss": 1.23, "step": 1790 }, { "epoch": 1.5113350125944585, "grad_norm": 0.33824484157682727, "learning_rate": 2.9025429067953975e-05, "loss": 1.2089, "step": 1800 }, { "epoch": 1.5113350125944585, "eval_loss": 1.3161499500274658, "eval_runtime": 37.5007, "eval_samples_per_second": 10.266, "eval_steps_per_second": 2.587, "step": 1800 }, { "epoch": 1.5197313182199832, "grad_norm": 0.3450077958737747, "learning_rate": 2.878413652870983e-05, "loss": 1.2452, "step": 1810 }, { "epoch": 1.528127623845508, "grad_norm": 0.3401369524359448, "learning_rate": 2.8542482661869228e-05, "loss": 1.2335, "step": 1820 }, { "epoch": 1.536523929471033, "grad_norm": 0.35772471665666244, "learning_rate": 2.8300490541708226e-05, "loss": 1.2107, "step": 1830 }, { "epoch": 1.5449202350965576, "grad_norm": 0.36182719622922654, "learning_rate": 2.8058183274800933e-05, "loss": 1.2323, "step": 1840 }, { "epoch": 1.5533165407220824, "grad_norm": 0.3427498080631671, "learning_rate": 2.7815583997813176e-05, "loss": 1.2208, "step": 1850 }, { "epoch": 1.561712846347607, "grad_norm": 0.3416312895125341, "learning_rate": 2.7572715875293336e-05, "loss": 1.2519, "step": 1860 }, { "epoch": 1.5701091519731318, "grad_norm": 0.34134497126888136, "learning_rate": 2.7329602097460453e-05, "loss": 1.2072, "step": 1870 }, { "epoch": 1.5785054575986566, "grad_norm": 0.3414019695200471, "learning_rate": 2.7086265877989907e-05, "loss": 1.2171, "step": 1880 }, { "epoch": 1.5869017632241813, "grad_norm": 0.3493625784848411, "learning_rate": 2.68427304517969e-05, "loss": 1.2032, "step": 1890 }, { "epoch": 1.595298068849706, "grad_norm": 0.3432387874546048, "learning_rate": 2.6599019072817823e-05, "loss": 1.2233, "step": 1900 }, { "epoch": 1.6036943744752308, "grad_norm": 0.34931851248995316, "learning_rate": 2.6355155011789918e-05, "loss": 1.2302, "step": 1910 }, { "epoch": 1.6120906801007555, "grad_norm": 0.3423165132865183, "learning_rate": 2.6111161554029216e-05, "loss": 1.2237, "step": 1920 }, { "epoch": 1.6204869857262805, "grad_norm": 0.3286103317410491, "learning_rate": 2.5867061997207208e-05, "loss": 1.2373, "step": 1930 }, { "epoch": 1.6288832913518052, "grad_norm": 0.34439194836734, "learning_rate": 2.5622879649126204e-05, "loss": 1.2366, "step": 1940 }, { "epoch": 1.63727959697733, "grad_norm": 0.37147052522944746, "learning_rate": 2.5378637825493827e-05, "loss": 1.2175, "step": 1950 }, { "epoch": 1.645675902602855, "grad_norm": 0.3720397203000226, "learning_rate": 2.5134359847696725e-05, "loss": 1.2226, "step": 1960 }, { "epoch": 1.6540722082283796, "grad_norm": 0.372249827112124, "learning_rate": 2.48900690405737e-05, "loss": 1.2354, "step": 1970 }, { "epoch": 1.6624685138539044, "grad_norm": 0.32759893432286225, "learning_rate": 2.4645788730188595e-05, "loss": 1.2346, "step": 1980 }, { "epoch": 1.670864819479429, "grad_norm": 0.36258855898143105, "learning_rate": 2.4401542241602936e-05, "loss": 1.2266, "step": 1990 }, { "epoch": 1.6792611251049538, "grad_norm": 0.35828351901418637, "learning_rate": 2.4157352896648785e-05, "loss": 1.2489, "step": 2000 }, { "epoch": 1.6876574307304786, "grad_norm": 0.3547226628271256, "learning_rate": 2.391324401170189e-05, "loss": 1.2321, "step": 2010 }, { "epoch": 1.6960537363560033, "grad_norm": 0.37844822229404196, "learning_rate": 2.3669238895455257e-05, "loss": 1.2133, "step": 2020 }, { "epoch": 1.704450041981528, "grad_norm": 0.36535971407503814, "learning_rate": 2.3425360846693546e-05, "loss": 1.1832, "step": 2030 }, { "epoch": 1.7128463476070528, "grad_norm": 0.351966888983441, "learning_rate": 2.3181633152068444e-05, "loss": 1.1978, "step": 2040 }, { "epoch": 1.7212426532325775, "grad_norm": 0.35857960344757944, "learning_rate": 2.293807908387507e-05, "loss": 1.2274, "step": 2050 }, { "epoch": 1.7296389588581025, "grad_norm": 0.36067730619373684, "learning_rate": 2.269472189782984e-05, "loss": 1.2107, "step": 2060 }, { "epoch": 1.7380352644836272, "grad_norm": 0.3686118668283621, "learning_rate": 2.2451584830849963e-05, "loss": 1.2453, "step": 2070 }, { "epoch": 1.746431570109152, "grad_norm": 0.3574767970991608, "learning_rate": 2.2208691098834574e-05, "loss": 1.2112, "step": 2080 }, { "epoch": 1.7548278757346767, "grad_norm": 0.3649596547435675, "learning_rate": 2.196606389444802e-05, "loss": 1.2301, "step": 2090 }, { "epoch": 1.7632241813602016, "grad_norm": 0.35608820401696456, "learning_rate": 2.1723726384905332e-05, "loss": 1.2254, "step": 2100 }, { "epoch": 1.7716204869857264, "grad_norm": 0.3903682280760152, "learning_rate": 2.1481701709760054e-05, "loss": 1.2191, "step": 2110 }, { "epoch": 1.7800167926112511, "grad_norm": 0.3700744583799887, "learning_rate": 2.1240012978694786e-05, "loss": 1.1897, "step": 2120 }, { "epoch": 1.7884130982367759, "grad_norm": 0.3678524367281197, "learning_rate": 2.0998683269314613e-05, "loss": 1.1936, "step": 2130 }, { "epoch": 1.7968094038623006, "grad_norm": 0.39001676927124185, "learning_rate": 2.0757735624943448e-05, "loss": 1.2248, "step": 2140 }, { "epoch": 1.8052057094878253, "grad_norm": 0.39780376699395825, "learning_rate": 2.0517193052423823e-05, "loss": 1.2022, "step": 2150 }, { "epoch": 1.81360201511335, "grad_norm": 0.380325076212259, "learning_rate": 2.0277078519920044e-05, "loss": 1.1938, "step": 2160 }, { "epoch": 1.8219983207388748, "grad_norm": 0.3871471596296449, "learning_rate": 2.0037414954725117e-05, "loss": 1.1825, "step": 2170 }, { "epoch": 1.8303946263643995, "grad_norm": 0.37964701587107547, "learning_rate": 1.979822524107148e-05, "loss": 1.2092, "step": 2180 }, { "epoch": 1.8387909319899243, "grad_norm": 0.36807392164929836, "learning_rate": 1.9559532217945974e-05, "loss": 1.2378, "step": 2190 }, { "epoch": 1.8471872376154492, "grad_norm": 0.3558532380492764, "learning_rate": 1.932135867690901e-05, "loss": 1.2142, "step": 2200 }, { "epoch": 1.855583543240974, "grad_norm": 0.3795291868557196, "learning_rate": 1.908372735991836e-05, "loss": 1.2003, "step": 2210 }, { "epoch": 1.8639798488664987, "grad_norm": 0.3698629794511824, "learning_rate": 1.8846660957157626e-05, "loss": 1.2279, "step": 2220 }, { "epoch": 1.8723761544920237, "grad_norm": 0.37197609159817596, "learning_rate": 1.8610182104869676e-05, "loss": 1.2175, "step": 2230 }, { "epoch": 1.8807724601175484, "grad_norm": 0.39691535067925626, "learning_rate": 1.837431338319523e-05, "loss": 1.1965, "step": 2240 }, { "epoch": 1.8891687657430731, "grad_norm": 0.39830491258185996, "learning_rate": 1.813907731401677e-05, "loss": 1.2082, "step": 2250 }, { "epoch": 1.8975650713685979, "grad_norm": 0.40042636908106866, "learning_rate": 1.790449635880813e-05, "loss": 1.1969, "step": 2260 }, { "epoch": 1.9059613769941226, "grad_norm": 0.3631756871613192, "learning_rate": 1.7670592916489637e-05, "loss": 1.2094, "step": 2270 }, { "epoch": 1.9143576826196473, "grad_norm": 0.41767311488963316, "learning_rate": 1.743738932128945e-05, "loss": 1.2171, "step": 2280 }, { "epoch": 1.922753988245172, "grad_norm": 0.42201095235456104, "learning_rate": 1.7204907840610968e-05, "loss": 1.1902, "step": 2290 }, { "epoch": 1.9311502938706968, "grad_norm": 0.38604694393746347, "learning_rate": 1.6973170672906592e-05, "loss": 1.1915, "step": 2300 }, { "epoch": 1.9395465994962215, "grad_norm": 0.3699669985546787, "learning_rate": 1.6742199945558116e-05, "loss": 1.2018, "step": 2310 }, { "epoch": 1.9479429051217463, "grad_norm": 0.3862222953863642, "learning_rate": 1.651201771276397e-05, "loss": 1.1742, "step": 2320 }, { "epoch": 1.9563392107472712, "grad_norm": 0.4294759850725183, "learning_rate": 1.6282645953433278e-05, "loss": 1.2024, "step": 2330 }, { "epoch": 1.964735516372796, "grad_norm": 0.4231278271726052, "learning_rate": 1.6054106569087247e-05, "loss": 1.2072, "step": 2340 }, { "epoch": 1.9731318219983207, "grad_norm": 0.40525859849828033, "learning_rate": 1.5826421381767943e-05, "loss": 1.1996, "step": 2350 }, { "epoch": 1.9815281276238457, "grad_norm": 0.41477520137980894, "learning_rate": 1.5599612131954562e-05, "loss": 1.1963, "step": 2360 }, { "epoch": 1.9899244332493704, "grad_norm": 0.4201296814849193, "learning_rate": 1.537370047648755e-05, "loss": 1.1909, "step": 2370 }, { "epoch": 1.9983207388748951, "grad_norm": 0.39991078592707974, "learning_rate": 1.5148707986500754e-05, "loss": 1.1934, "step": 2380 }, { "epoch": 2.00671704450042, "grad_norm": 0.542630060134636, "learning_rate": 1.4924656145361642e-05, "loss": 1.1015, "step": 2390 }, { "epoch": 2.0151133501259446, "grad_norm": 0.44916796266698206, "learning_rate": 1.4701566346619994e-05, "loss": 1.0911, "step": 2400 }, { "epoch": 2.0151133501259446, "eval_loss": 1.288699746131897, "eval_runtime": 37.7556, "eval_samples_per_second": 10.197, "eval_steps_per_second": 2.569, "step": 2400 }, { "epoch": 2.0235096557514693, "grad_norm": 0.44991891228670405, "learning_rate": 1.447945989196518e-05, "loss": 1.0922, "step": 2410 }, { "epoch": 2.031905961376994, "grad_norm": 0.4773364949208615, "learning_rate": 1.4258357989192089e-05, "loss": 1.0882, "step": 2420 }, { "epoch": 2.040302267002519, "grad_norm": 0.47709647551922374, "learning_rate": 1.403828175017618e-05, "loss": 1.0976, "step": 2430 }, { "epoch": 2.0486985726280436, "grad_norm": 0.5391514428897679, "learning_rate": 1.381925218885759e-05, "loss": 1.0999, "step": 2440 }, { "epoch": 2.0570948782535683, "grad_norm": 0.482275832558956, "learning_rate": 1.3601290219234642e-05, "loss": 1.0848, "step": 2450 }, { "epoch": 2.065491183879093, "grad_norm": 0.4879837972652172, "learning_rate": 1.3384416653366796e-05, "loss": 1.1053, "step": 2460 }, { "epoch": 2.0738874895046178, "grad_norm": 0.5108185234761726, "learning_rate": 1.3168652199387565e-05, "loss": 1.0565, "step": 2470 }, { "epoch": 2.082283795130143, "grad_norm": 0.47147504315736866, "learning_rate": 1.2954017459527037e-05, "loss": 1.1135, "step": 2480 }, { "epoch": 2.0906801007556677, "grad_norm": 0.4683105843680792, "learning_rate": 1.2740532928144785e-05, "loss": 1.0978, "step": 2490 }, { "epoch": 2.0990764063811924, "grad_norm": 0.46420490471813974, "learning_rate": 1.2528218989772928e-05, "loss": 1.0924, "step": 2500 }, { "epoch": 2.107472712006717, "grad_norm": 0.5116237328178771, "learning_rate": 1.2317095917169729e-05, "loss": 1.1122, "step": 2510 }, { "epoch": 2.115869017632242, "grad_norm": 0.48364433468846557, "learning_rate": 1.2107183869383817e-05, "loss": 1.0654, "step": 2520 }, { "epoch": 2.1242653232577666, "grad_norm": 0.47577790302982037, "learning_rate": 1.1898502889829367e-05, "loss": 1.0885, "step": 2530 }, { "epoch": 2.1326616288832914, "grad_norm": 0.5105053373884131, "learning_rate": 1.1691072904372211e-05, "loss": 1.0814, "step": 2540 }, { "epoch": 2.141057934508816, "grad_norm": 0.45277641048792205, "learning_rate": 1.1484913719427245e-05, "loss": 1.1074, "step": 2550 }, { "epoch": 2.149454240134341, "grad_norm": 0.4721310040821966, "learning_rate": 1.1280045020067173e-05, "loss": 1.0624, "step": 2560 }, { "epoch": 2.1578505457598656, "grad_norm": 0.5264060503334844, "learning_rate": 1.1076486368142974e-05, "loss": 1.075, "step": 2570 }, { "epoch": 2.1662468513853903, "grad_norm": 0.49378445456124825, "learning_rate": 1.0874257200415921e-05, "loss": 1.1239, "step": 2580 }, { "epoch": 2.174643157010915, "grad_norm": 0.4696082013605863, "learning_rate": 1.0673376826701764e-05, "loss": 1.0855, "step": 2590 }, { "epoch": 2.1830394626364398, "grad_norm": 0.520016314444527, "learning_rate": 1.0473864428026903e-05, "loss": 1.0613, "step": 2600 }, { "epoch": 2.1914357682619645, "grad_norm": 0.5085845086562594, "learning_rate": 1.0275739054796849e-05, "loss": 1.0874, "step": 2610 }, { "epoch": 2.1998320738874897, "grad_norm": 0.5229182760518745, "learning_rate": 1.0079019624977277e-05, "loss": 1.093, "step": 2620 }, { "epoch": 2.2082283795130144, "grad_norm": 0.4847844823377718, "learning_rate": 9.883724922287593e-06, "loss": 1.0792, "step": 2630 }, { "epoch": 2.216624685138539, "grad_norm": 0.4731502709572998, "learning_rate": 9.689873594407398e-06, "loss": 1.0513, "step": 2640 }, { "epoch": 2.225020990764064, "grad_norm": 0.5138138380876534, "learning_rate": 9.497484151195874e-06, "loss": 1.0663, "step": 2650 }, { "epoch": 2.2334172963895886, "grad_norm": 0.5026329202465635, "learning_rate": 9.30657496292447e-06, "loss": 1.0787, "step": 2660 }, { "epoch": 2.2418136020151134, "grad_norm": 0.44925382614215525, "learning_rate": 9.117164258522695e-06, "loss": 1.0579, "step": 2670 }, { "epoch": 2.250209907640638, "grad_norm": 0.5293508173928859, "learning_rate": 8.929270123837632e-06, "loss": 1.0512, "step": 2680 }, { "epoch": 2.258606213266163, "grad_norm": 0.5247265106521677, "learning_rate": 8.742910499906973e-06, "loss": 1.0658, "step": 2690 }, { "epoch": 2.2670025188916876, "grad_norm": 0.5190390778509362, "learning_rate": 8.558103181245921e-06, "loss": 1.0696, "step": 2700 }, { "epoch": 2.2753988245172123, "grad_norm": 0.55132785975463, "learning_rate": 8.374865814148073e-06, "loss": 1.092, "step": 2710 }, { "epoch": 2.283795130142737, "grad_norm": 0.5213613994205901, "learning_rate": 8.193215895000526e-06, "loss": 1.0822, "step": 2720 }, { "epoch": 2.292191435768262, "grad_norm": 0.4796879456796719, "learning_rate": 8.013170768613146e-06, "loss": 1.0568, "step": 2730 }, { "epoch": 2.3005877413937865, "grad_norm": 0.5385546333675868, "learning_rate": 7.834747626562484e-06, "loss": 1.0731, "step": 2740 }, { "epoch": 2.3089840470193117, "grad_norm": 0.5377987038023434, "learning_rate": 7.657963505550189e-06, "loss": 1.075, "step": 2750 }, { "epoch": 2.3173803526448364, "grad_norm": 0.562428992862054, "learning_rate": 7.482835285776305e-06, "loss": 1.0854, "step": 2760 }, { "epoch": 2.325776658270361, "grad_norm": 0.4988726750554791, "learning_rate": 7.309379689327409e-06, "loss": 1.093, "step": 2770 }, { "epoch": 2.334172963895886, "grad_norm": 0.5433657502932824, "learning_rate": 7.137613278579964e-06, "loss": 1.0926, "step": 2780 }, { "epoch": 2.3425692695214106, "grad_norm": 0.5087897129635799, "learning_rate": 6.967552454618839e-06, "loss": 1.1078, "step": 2790 }, { "epoch": 2.3509655751469354, "grad_norm": 0.5170416004589418, "learning_rate": 6.799213455671255e-06, "loss": 1.0781, "step": 2800 }, { "epoch": 2.35936188077246, "grad_norm": 0.5063141748122139, "learning_rate": 6.632612355556256e-06, "loss": 1.0824, "step": 2810 }, { "epoch": 2.367758186397985, "grad_norm": 0.5169250792091412, "learning_rate": 6.467765062149977e-06, "loss": 1.0686, "step": 2820 }, { "epoch": 2.3761544920235096, "grad_norm": 0.47167776818464224, "learning_rate": 6.304687315866589e-06, "loss": 1.0902, "step": 2830 }, { "epoch": 2.3845507976490343, "grad_norm": 0.5230768936155751, "learning_rate": 6.143394688155396e-06, "loss": 1.0704, "step": 2840 }, { "epoch": 2.392947103274559, "grad_norm": 0.5113178637899721, "learning_rate": 5.98390258001397e-06, "loss": 1.0824, "step": 2850 }, { "epoch": 2.401343408900084, "grad_norm": 0.5008499019390629, "learning_rate": 5.826226220517606e-06, "loss": 1.0549, "step": 2860 }, { "epoch": 2.4097397145256085, "grad_norm": 0.5530580136554508, "learning_rate": 5.670380665365144e-06, "loss": 1.0999, "step": 2870 }, { "epoch": 2.4181360201511337, "grad_norm": 0.48265348835919253, "learning_rate": 5.516380795441417e-06, "loss": 1.0929, "step": 2880 }, { "epoch": 2.4265323257766584, "grad_norm": 0.5159406165392976, "learning_rate": 5.364241315396335e-06, "loss": 1.0857, "step": 2890 }, { "epoch": 2.434928631402183, "grad_norm": 0.5456400716277449, "learning_rate": 5.21397675224081e-06, "loss": 1.0763, "step": 2900 }, { "epoch": 2.443324937027708, "grad_norm": 0.5275724452771665, "learning_rate": 5.065601453959659e-06, "loss": 1.0837, "step": 2910 }, { "epoch": 2.4517212426532327, "grad_norm": 0.5054255164716437, "learning_rate": 4.919129588141563e-06, "loss": 1.0655, "step": 2920 }, { "epoch": 2.4601175482787574, "grad_norm": 0.5226226880297187, "learning_rate": 4.7745751406263165e-06, "loss": 1.0604, "step": 2930 }, { "epoch": 2.468513853904282, "grad_norm": 0.4917792079813032, "learning_rate": 4.631951914169363e-06, "loss": 1.0635, "step": 2940 }, { "epoch": 2.476910159529807, "grad_norm": 0.5656339286373594, "learning_rate": 4.491273527123866e-06, "loss": 1.0838, "step": 2950 }, { "epoch": 2.4853064651553316, "grad_norm": 0.5253948788481366, "learning_rate": 4.352553412140303e-06, "loss": 1.06, "step": 2960 }, { "epoch": 2.4937027707808563, "grad_norm": 0.525259011295812, "learning_rate": 4.215804814883959e-06, "loss": 1.0728, "step": 2970 }, { "epoch": 2.502099076406381, "grad_norm": 0.4945489147573741, "learning_rate": 4.08104079277005e-06, "loss": 1.0567, "step": 2980 }, { "epoch": 2.510495382031906, "grad_norm": 0.5289037096962053, "learning_rate": 3.948274213717015e-06, "loss": 1.0833, "step": 2990 }, { "epoch": 2.5188916876574305, "grad_norm": 0.520353171293509, "learning_rate": 3.817517754917802e-06, "loss": 1.0794, "step": 3000 }, { "epoch": 2.5188916876574305, "eval_loss": 1.271108627319336, "eval_runtime": 37.5109, "eval_samples_per_second": 10.264, "eval_steps_per_second": 2.586, "step": 3000 }, { "epoch": 2.5272879932829557, "grad_norm": 0.47216191409024166, "learning_rate": 3.6887839016293807e-06, "loss": 1.0797, "step": 3010 }, { "epoch": 2.53568429890848, "grad_norm": 0.5168618762259352, "learning_rate": 3.5620849459805983e-06, "loss": 1.0809, "step": 3020 }, { "epoch": 2.544080604534005, "grad_norm": 0.5702868726978964, "learning_rate": 3.4374329857984637e-06, "loss": 1.0841, "step": 3030 }, { "epoch": 2.55247691015953, "grad_norm": 0.5324695724343947, "learning_rate": 3.314839923453003e-06, "loss": 1.0783, "step": 3040 }, { "epoch": 2.5608732157850547, "grad_norm": 0.5856245582119982, "learning_rate": 3.1943174647207474e-06, "loss": 1.0657, "step": 3050 }, { "epoch": 2.5692695214105794, "grad_norm": 0.5622043278789876, "learning_rate": 3.0758771176669948e-06, "loss": 1.0811, "step": 3060 }, { "epoch": 2.577665827036104, "grad_norm": 0.5164314316915084, "learning_rate": 2.9595301915470246e-06, "loss": 1.085, "step": 3070 }, { "epoch": 2.586062132661629, "grad_norm": 0.5302956251674786, "learning_rate": 2.8452877957261537e-06, "loss": 1.0682, "step": 3080 }, { "epoch": 2.5944584382871536, "grad_norm": 0.5749370462779954, "learning_rate": 2.73316083861902e-06, "loss": 1.0705, "step": 3090 }, { "epoch": 2.6028547439126783, "grad_norm": 0.528328794476431, "learning_rate": 2.623160026647978e-06, "loss": 1.08, "step": 3100 }, { "epoch": 2.611251049538203, "grad_norm": 0.5235304102916779, "learning_rate": 2.515295863220796e-06, "loss": 1.055, "step": 3110 }, { "epoch": 2.619647355163728, "grad_norm": 0.5312138570667937, "learning_rate": 2.4095786477277253e-06, "loss": 1.0541, "step": 3120 }, { "epoch": 2.6280436607892526, "grad_norm": 0.5443569327871725, "learning_rate": 2.3060184745580972e-06, "loss": 1.0712, "step": 3130 }, { "epoch": 2.6364399664147777, "grad_norm": 0.48867610044695425, "learning_rate": 2.2046252321364368e-06, "loss": 1.0785, "step": 3140 }, { "epoch": 2.644836272040302, "grad_norm": 0.5523392458602866, "learning_rate": 2.1054086019782785e-06, "loss": 1.0623, "step": 3150 }, { "epoch": 2.653232577665827, "grad_norm": 0.5415782176601202, "learning_rate": 2.008378057765725e-06, "loss": 1.0629, "step": 3160 }, { "epoch": 2.661628883291352, "grad_norm": 0.5486934335306471, "learning_rate": 1.913542864442849e-06, "loss": 1.0682, "step": 3170 }, { "epoch": 2.6700251889168767, "grad_norm": 0.5176003080136743, "learning_rate": 1.8209120773310468e-06, "loss": 1.0352, "step": 3180 }, { "epoch": 2.6784214945424014, "grad_norm": 0.5415079933956461, "learning_rate": 1.7304945412643825e-06, "loss": 1.0617, "step": 3190 }, { "epoch": 2.686817800167926, "grad_norm": 0.5461504669544123, "learning_rate": 1.6422988897450392e-06, "loss": 1.0712, "step": 3200 }, { "epoch": 2.695214105793451, "grad_norm": 0.5356674558567608, "learning_rate": 1.5563335441189458e-06, "loss": 1.0735, "step": 3210 }, { "epoch": 2.7036104114189756, "grad_norm": 0.5177763070870024, "learning_rate": 1.4726067127716958e-06, "loss": 1.0621, "step": 3220 }, { "epoch": 2.7120067170445004, "grad_norm": 0.5696590300719524, "learning_rate": 1.3911263903447241e-06, "loss": 1.0652, "step": 3230 }, { "epoch": 2.720403022670025, "grad_norm": 0.4910661226799717, "learning_rate": 1.3119003569719789e-06, "loss": 1.0622, "step": 3240 }, { "epoch": 2.72879932829555, "grad_norm": 0.5036708642245188, "learning_rate": 1.2349361775370245e-06, "loss": 1.063, "step": 3250 }, { "epoch": 2.7371956339210746, "grad_norm": 0.5183995701558842, "learning_rate": 1.1602412009507014e-06, "loss": 1.0598, "step": 3260 }, { "epoch": 2.7455919395465997, "grad_norm": 0.4923573412537131, "learning_rate": 1.087822559449425e-06, "loss": 1.037, "step": 3270 }, { "epoch": 2.753988245172124, "grad_norm": 0.5351167051477966, "learning_rate": 1.0176871679141664e-06, "loss": 1.0721, "step": 3280 }, { "epoch": 2.762384550797649, "grad_norm": 0.5138535715173458, "learning_rate": 9.49841723210182e-07, "loss": 1.0624, "step": 3290 }, { "epoch": 2.770780856423174, "grad_norm": 0.5562194428595657, "learning_rate": 8.842927035475763e-07, "loss": 1.0868, "step": 3300 }, { "epoch": 2.7791771620486987, "grad_norm": 0.5352125908767389, "learning_rate": 8.210463678626989e-07, "loss": 1.0509, "step": 3310 }, { "epoch": 2.7875734676742234, "grad_norm": 0.5617426830025886, "learning_rate": 7.601087552205621e-07, "loss": 1.0568, "step": 3320 }, { "epoch": 2.795969773299748, "grad_norm": 0.5445075989319467, "learning_rate": 7.014856842381484e-07, "loss": 1.0745, "step": 3330 }, { "epoch": 2.804366078925273, "grad_norm": 0.5294691240524095, "learning_rate": 6.451827525288612e-07, "loss": 1.0708, "step": 3340 }, { "epoch": 2.8127623845507976, "grad_norm": 0.5469429580225932, "learning_rate": 5.912053361680159e-07, "loss": 1.0772, "step": 3350 }, { "epoch": 2.8211586901763224, "grad_norm": 0.5197486795769499, "learning_rate": 5.395585891795174e-07, "loss": 1.0437, "step": 3360 }, { "epoch": 2.829554995801847, "grad_norm": 0.5804047203332435, "learning_rate": 4.90247443043712e-07, "loss": 1.064, "step": 3370 }, { "epoch": 2.837951301427372, "grad_norm": 0.5645169239872654, "learning_rate": 4.43276606226542e-07, "loss": 1.0702, "step": 3380 }, { "epoch": 2.8463476070528966, "grad_norm": 0.5350844658781305, "learning_rate": 3.9865056372990763e-07, "loss": 1.0731, "step": 3390 }, { "epoch": 2.8547439126784218, "grad_norm": 0.5252887341555534, "learning_rate": 3.563735766634629e-07, "loss": 1.0576, "step": 3400 }, { "epoch": 2.863140218303946, "grad_norm": 0.4893728491690957, "learning_rate": 3.164496818377188e-07, "loss": 1.0526, "step": 3410 }, { "epoch": 2.8715365239294712, "grad_norm": 0.5238206701307841, "learning_rate": 2.7888269137860413e-07, "loss": 1.0631, "step": 3420 }, { "epoch": 2.879932829554996, "grad_norm": 0.5616265204939842, "learning_rate": 2.436761923634456e-07, "loss": 1.1013, "step": 3430 }, { "epoch": 2.8883291351805207, "grad_norm": 0.52918003457263, "learning_rate": 2.108335464784811e-07, "loss": 1.0737, "step": 3440 }, { "epoch": 2.8967254408060454, "grad_norm": 0.5686960611410121, "learning_rate": 1.8035788969784638e-07, "loss": 1.0823, "step": 3450 }, { "epoch": 2.90512174643157, "grad_norm": 0.4994659787416042, "learning_rate": 1.522521319841541e-07, "loss": 1.0692, "step": 3460 }, { "epoch": 2.913518052057095, "grad_norm": 0.5629206398185236, "learning_rate": 1.2651895701063233e-07, "loss": 1.0698, "step": 3470 }, { "epoch": 2.9219143576826196, "grad_norm": 0.5675179585041099, "learning_rate": 1.0316082190486298e-07, "loss": 1.0746, "step": 3480 }, { "epoch": 2.9303106633081444, "grad_norm": 0.5175715407014831, "learning_rate": 8.217995701418624e-08, "loss": 1.0454, "step": 3490 }, { "epoch": 2.938706968933669, "grad_norm": 0.5240794519085307, "learning_rate": 6.35783656927097e-08, "loss": 1.056, "step": 3500 }, { "epoch": 2.947103274559194, "grad_norm": 0.49756212695927055, "learning_rate": 4.7357824110044743e-08, "loss": 1.0689, "step": 3510 }, { "epoch": 2.9554995801847186, "grad_norm": 0.5251282063460313, "learning_rate": 3.351988108168391e-08, "loss": 1.0603, "step": 3520 }, { "epoch": 2.9638958858102438, "grad_norm": 0.5422569799063949, "learning_rate": 2.2065857921133025e-08, "loss": 1.0822, "step": 3530 }, { "epoch": 2.972292191435768, "grad_norm": 0.543354269594394, "learning_rate": 1.2996848313734378e-08, "loss": 1.075, "step": 3540 }, { "epoch": 2.9806884970612932, "grad_norm": 0.5348306207009784, "learning_rate": 6.31371821224469e-09, "loss": 1.0692, "step": 3550 }, { "epoch": 2.9890848026868175, "grad_norm": 0.5488049489103773, "learning_rate": 2.017105754134607e-09, "loss": 1.0895, "step": 3560 }, { "epoch": 2.9974811083123427, "grad_norm": 0.552482965655854, "learning_rate": 1.0742120067630535e-10, "loss": 1.0465, "step": 3570 }, { "epoch": 3.0, "step": 3573, "total_flos": 234145141751808.0, "train_loss": 1.2264163225196905, "train_runtime": 4742.9935, "train_samples_per_second": 24.104, "train_steps_per_second": 0.753 } ], "logging_steps": 10, "max_steps": 3573, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 234145141751808.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }