{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9992688276870583, "eval_steps": 500, "global_step": 1025, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0009748964172556666, "grad_norm": 22.86059565076889, "learning_rate": 9.70873786407767e-08, "loss": 1.3065, "step": 1 }, { "epoch": 0.004874482086278333, "grad_norm": 22.08339033526192, "learning_rate": 4.854368932038835e-07, "loss": 1.316, "step": 5 }, { "epoch": 0.009748964172556666, "grad_norm": 8.710899949749317, "learning_rate": 9.70873786407767e-07, "loss": 1.2194, "step": 10 }, { "epoch": 0.014623446258834999, "grad_norm": 8.431065000460007, "learning_rate": 1.4563106796116506e-06, "loss": 1.0655, "step": 15 }, { "epoch": 0.01949792834511333, "grad_norm": 3.0129778026962346, "learning_rate": 1.941747572815534e-06, "loss": 0.9273, "step": 20 }, { "epoch": 0.024372410431391666, "grad_norm": 2.4586920682855142, "learning_rate": 2.427184466019418e-06, "loss": 0.8841, "step": 25 }, { "epoch": 0.029246892517669997, "grad_norm": 2.1763561124357684, "learning_rate": 2.912621359223301e-06, "loss": 0.853, "step": 30 }, { "epoch": 0.03412137460394833, "grad_norm": 2.2029840594866466, "learning_rate": 3.398058252427185e-06, "loss": 0.8341, "step": 35 }, { "epoch": 0.03899585669022666, "grad_norm": 2.446000778730272, "learning_rate": 3.883495145631068e-06, "loss": 0.8118, "step": 40 }, { "epoch": 0.043870338776504994, "grad_norm": 2.451352004117067, "learning_rate": 4.368932038834952e-06, "loss": 0.7943, "step": 45 }, { "epoch": 0.04874482086278333, "grad_norm": 2.4129113465126957, "learning_rate": 4.854368932038836e-06, "loss": 0.7768, "step": 50 }, { "epoch": 0.05361930294906166, "grad_norm": 2.3127065430589298, "learning_rate": 5.3398058252427185e-06, "loss": 0.7778, "step": 55 }, { "epoch": 0.058493785035339994, "grad_norm": 2.405876637403882, "learning_rate": 5.825242718446602e-06, "loss": 0.7534, "step": 60 }, { "epoch": 0.06336826712161833, "grad_norm": 2.2639828354846387, "learning_rate": 6.310679611650487e-06, "loss": 0.7398, "step": 65 }, { "epoch": 0.06824274920789666, "grad_norm": 2.3243442460154657, "learning_rate": 6.79611650485437e-06, "loss": 0.7261, "step": 70 }, { "epoch": 0.073117231294175, "grad_norm": 2.4227745775819836, "learning_rate": 7.2815533980582534e-06, "loss": 0.7168, "step": 75 }, { "epoch": 0.07799171338045333, "grad_norm": 4.25071801960126, "learning_rate": 7.766990291262136e-06, "loss": 0.71, "step": 80 }, { "epoch": 0.08286619546673166, "grad_norm": 2.148825747496477, "learning_rate": 8.25242718446602e-06, "loss": 0.7074, "step": 85 }, { "epoch": 0.08774067755300999, "grad_norm": 2.380339632272055, "learning_rate": 8.737864077669904e-06, "loss": 0.7032, "step": 90 }, { "epoch": 0.09261515963928832, "grad_norm": 2.3544801176511916, "learning_rate": 9.223300970873788e-06, "loss": 0.7, "step": 95 }, { "epoch": 0.09748964172556666, "grad_norm": 2.1648774285785013, "learning_rate": 9.708737864077671e-06, "loss": 0.6812, "step": 100 }, { "epoch": 0.102364123811845, "grad_norm": 2.103759384635341, "learning_rate": 9.999883898929927e-06, "loss": 0.6822, "step": 105 }, { "epoch": 0.10723860589812333, "grad_norm": 2.1317773929603576, "learning_rate": 9.998577823812066e-06, "loss": 0.6845, "step": 110 }, { "epoch": 0.11211308798440166, "grad_norm": 2.5038388167151924, "learning_rate": 9.995820927586548e-06, "loss": 0.6799, "step": 115 }, { "epoch": 0.11698757007067999, "grad_norm": 2.2890913021002772, "learning_rate": 9.99161401043362e-06, "loss": 0.6893, "step": 120 }, { "epoch": 0.12186205215695832, "grad_norm": 2.287115199507479, "learning_rate": 9.985958293397433e-06, "loss": 0.6801, "step": 125 }, { "epoch": 0.12673653424323666, "grad_norm": 2.111605184938528, "learning_rate": 9.978855418031633e-06, "loss": 0.6761, "step": 130 }, { "epoch": 0.131611016329515, "grad_norm": 2.3968266021989573, "learning_rate": 9.970307445922905e-06, "loss": 0.6659, "step": 135 }, { "epoch": 0.13648549841579333, "grad_norm": 2.0562856783183383, "learning_rate": 9.960316858092613e-06, "loss": 0.6735, "step": 140 }, { "epoch": 0.14135998050207166, "grad_norm": 2.2596310641814634, "learning_rate": 9.948886554276689e-06, "loss": 0.6658, "step": 145 }, { "epoch": 0.14623446258835, "grad_norm": 2.3851159107252022, "learning_rate": 9.936019852083982e-06, "loss": 0.6592, "step": 150 }, { "epoch": 0.15110894467462832, "grad_norm": 2.0161901435788083, "learning_rate": 9.921720486033348e-06, "loss": 0.657, "step": 155 }, { "epoch": 0.15598342676090665, "grad_norm": 2.127638747482369, "learning_rate": 9.905992606469708e-06, "loss": 0.6595, "step": 160 }, { "epoch": 0.16085790884718498, "grad_norm": 2.0531639641862447, "learning_rate": 9.888840778359431e-06, "loss": 0.6515, "step": 165 }, { "epoch": 0.1657323909334633, "grad_norm": 1.8950315321987827, "learning_rate": 9.870269979965364e-06, "loss": 0.6492, "step": 170 }, { "epoch": 0.17060687301974164, "grad_norm": 1.9060347213321087, "learning_rate": 9.850285601401899e-06, "loss": 0.6458, "step": 175 }, { "epoch": 0.17548135510601998, "grad_norm": 2.074587641369688, "learning_rate": 9.828893443070527e-06, "loss": 0.6515, "step": 180 }, { "epoch": 0.1803558371922983, "grad_norm": 2.021646896648869, "learning_rate": 9.806099713976277e-06, "loss": 0.6306, "step": 185 }, { "epoch": 0.18523031927857664, "grad_norm": 2.183178620435626, "learning_rate": 9.781911029925573e-06, "loss": 0.6317, "step": 190 }, { "epoch": 0.190104801364855, "grad_norm": 1.9176655344163256, "learning_rate": 9.756334411606028e-06, "loss": 0.6304, "step": 195 }, { "epoch": 0.19497928345113333, "grad_norm": 2.0283053056639226, "learning_rate": 9.729377282548696e-06, "loss": 0.6273, "step": 200 }, { "epoch": 0.19985376553741166, "grad_norm": 2.2614639215711794, "learning_rate": 9.701047466973429e-06, "loss": 0.6227, "step": 205 }, { "epoch": 0.20472824762369, "grad_norm": 2.048850431237601, "learning_rate": 9.67135318751792e-06, "loss": 0.6289, "step": 210 }, { "epoch": 0.20960272970996832, "grad_norm": 2.0716723168430224, "learning_rate": 9.640303062851101e-06, "loss": 0.6235, "step": 215 }, { "epoch": 0.21447721179624665, "grad_norm": 2.165745603614713, "learning_rate": 9.607906105171613e-06, "loss": 0.6254, "step": 220 }, { "epoch": 0.21935169388252498, "grad_norm": 2.121524753608251, "learning_rate": 9.574171717592038e-06, "loss": 0.6215, "step": 225 }, { "epoch": 0.22422617596880332, "grad_norm": 2.0155505697588207, "learning_rate": 9.539109691409677e-06, "loss": 0.6125, "step": 230 }, { "epoch": 0.22910065805508165, "grad_norm": 2.5497104730670457, "learning_rate": 9.502730203264656e-06, "loss": 0.6116, "step": 235 }, { "epoch": 0.23397514014135998, "grad_norm": 2.0859151782380163, "learning_rate": 9.465043812186194e-06, "loss": 0.6026, "step": 240 }, { "epoch": 0.2388496222276383, "grad_norm": 2.0192086858289673, "learning_rate": 9.426061456527871e-06, "loss": 0.601, "step": 245 }, { "epoch": 0.24372410431391664, "grad_norm": 2.08123508728777, "learning_rate": 9.385794450792818e-06, "loss": 0.593, "step": 250 }, { "epoch": 0.24859858640019497, "grad_norm": 1.970692344326779, "learning_rate": 9.344254482349702e-06, "loss": 0.5879, "step": 255 }, { "epoch": 0.25347306848647333, "grad_norm": 2.0360574888538037, "learning_rate": 9.301453608040523e-06, "loss": 0.5884, "step": 260 }, { "epoch": 0.25834755057275166, "grad_norm": 1.9510711367676767, "learning_rate": 9.25740425068114e-06, "loss": 0.5937, "step": 265 }, { "epoch": 0.26322203265903, "grad_norm": 2.0679042358578057, "learning_rate": 9.2121191954556e-06, "loss": 0.5838, "step": 270 }, { "epoch": 0.2680965147453083, "grad_norm": 2.0443212477081008, "learning_rate": 9.165611586205268e-06, "loss": 0.5719, "step": 275 }, { "epoch": 0.27297099683158665, "grad_norm": 2.0408583983165234, "learning_rate": 9.11789492161388e-06, "loss": 0.5839, "step": 280 }, { "epoch": 0.277845478917865, "grad_norm": 1.9767895517261123, "learning_rate": 9.068983051289589e-06, "loss": 0.5865, "step": 285 }, { "epoch": 0.2827199610041433, "grad_norm": 1.881981157978116, "learning_rate": 9.018890171745156e-06, "loss": 0.5793, "step": 290 }, { "epoch": 0.28759444309042165, "grad_norm": 1.9932110390018152, "learning_rate": 8.967630822277472e-06, "loss": 0.5808, "step": 295 }, { "epoch": 0.2924689251767, "grad_norm": 1.9915669044094475, "learning_rate": 8.915219880747555e-06, "loss": 0.569, "step": 300 }, { "epoch": 0.2973434072629783, "grad_norm": 2.046268209029942, "learning_rate": 8.861672559262316e-06, "loss": 0.5759, "step": 305 }, { "epoch": 0.30221788934925664, "grad_norm": 1.9535790327085676, "learning_rate": 8.80700439975928e-06, "loss": 0.5717, "step": 310 }, { "epoch": 0.30709237143553497, "grad_norm": 2.280776169449102, "learning_rate": 8.751231269495604e-06, "loss": 0.5703, "step": 315 }, { "epoch": 0.3119668535218133, "grad_norm": 2.0042263363709996, "learning_rate": 8.694369356442638e-06, "loss": 0.5668, "step": 320 }, { "epoch": 0.31684133560809163, "grad_norm": 2.098934601526615, "learning_rate": 8.636435164587436e-06, "loss": 0.5532, "step": 325 }, { "epoch": 0.32171581769436997, "grad_norm": 2.0675738967767496, "learning_rate": 8.577445509142514e-06, "loss": 0.5585, "step": 330 }, { "epoch": 0.3265902997806483, "grad_norm": 2.3766255529170692, "learning_rate": 8.517417511665299e-06, "loss": 0.5658, "step": 335 }, { "epoch": 0.3314647818669266, "grad_norm": 2.1910659775597248, "learning_rate": 8.456368595088647e-06, "loss": 0.5507, "step": 340 }, { "epoch": 0.33633926395320496, "grad_norm": 1.8703655986029497, "learning_rate": 8.394316478663886e-06, "loss": 0.5406, "step": 345 }, { "epoch": 0.3412137460394833, "grad_norm": 1.9244217565682906, "learning_rate": 8.331279172817876e-06, "loss": 0.542, "step": 350 }, { "epoch": 0.3460882281257616, "grad_norm": 2.0014619556553335, "learning_rate": 8.26727497392553e-06, "loss": 0.5392, "step": 355 }, { "epoch": 0.35096271021203995, "grad_norm": 1.8442031091685414, "learning_rate": 8.20232245899935e-06, "loss": 0.5318, "step": 360 }, { "epoch": 0.3558371922983183, "grad_norm": 1.9799423972998886, "learning_rate": 8.136440480297514e-06, "loss": 0.5414, "step": 365 }, { "epoch": 0.3607116743845966, "grad_norm": 2.0577170932676596, "learning_rate": 8.069648159852059e-06, "loss": 0.5296, "step": 370 }, { "epoch": 0.36558615647087495, "grad_norm": 2.037446340676939, "learning_rate": 8.001964883918793e-06, "loss": 0.5348, "step": 375 }, { "epoch": 0.3704606385571533, "grad_norm": 2.0638439360397665, "learning_rate": 7.933410297350472e-06, "loss": 0.5229, "step": 380 }, { "epoch": 0.3753351206434316, "grad_norm": 1.9569671735850687, "learning_rate": 7.864004297894963e-06, "loss": 0.5275, "step": 385 }, { "epoch": 0.38020960272971, "grad_norm": 1.9870139157026745, "learning_rate": 7.793767030419975e-06, "loss": 0.533, "step": 390 }, { "epoch": 0.3850840848159883, "grad_norm": 1.9631002389899173, "learning_rate": 7.722718881066086e-06, "loss": 0.5245, "step": 395 }, { "epoch": 0.38995856690226666, "grad_norm": 1.9193362092353006, "learning_rate": 7.650880471329725e-06, "loss": 0.5203, "step": 400 }, { "epoch": 0.394833048988545, "grad_norm": 1.9397872066961752, "learning_rate": 7.578272652077849e-06, "loss": 0.5144, "step": 405 }, { "epoch": 0.3997075310748233, "grad_norm": 1.952394556800401, "learning_rate": 7.504916497496051e-06, "loss": 0.5168, "step": 410 }, { "epoch": 0.40458201316110165, "grad_norm": 1.9410715991990146, "learning_rate": 7.43083329897184e-06, "loss": 0.4964, "step": 415 }, { "epoch": 0.40945649524738, "grad_norm": 1.9371602423339187, "learning_rate": 7.3560445589148875e-06, "loss": 0.5136, "step": 420 }, { "epoch": 0.4143309773336583, "grad_norm": 1.9398436533180357, "learning_rate": 7.2805719845160195e-06, "loss": 0.5012, "step": 425 }, { "epoch": 0.41920545941993664, "grad_norm": 1.8404891949351312, "learning_rate": 7.20443748144678e-06, "loss": 0.4985, "step": 430 }, { "epoch": 0.424079941506215, "grad_norm": 1.9219013215637166, "learning_rate": 7.127663147501377e-06, "loss": 0.497, "step": 435 }, { "epoch": 0.4289544235924933, "grad_norm": 2.087372537148497, "learning_rate": 7.050271266182862e-06, "loss": 0.4954, "step": 440 }, { "epoch": 0.43382890567877164, "grad_norm": 1.9732921364375713, "learning_rate": 6.97228430023543e-06, "loss": 0.4914, "step": 445 }, { "epoch": 0.43870338776504997, "grad_norm": 1.8509876619225545, "learning_rate": 6.893724885124668e-06, "loss": 0.4816, "step": 450 }, { "epoch": 0.4435778698513283, "grad_norm": 1.876089892137146, "learning_rate": 6.814615822467691e-06, "loss": 0.4863, "step": 455 }, { "epoch": 0.44845235193760663, "grad_norm": 1.956376416630627, "learning_rate": 6.734980073415038e-06, "loss": 0.4914, "step": 460 }, { "epoch": 0.45332683402388496, "grad_norm": 1.831312024979652, "learning_rate": 6.654840751986282e-06, "loss": 0.4773, "step": 465 }, { "epoch": 0.4582013161101633, "grad_norm": 1.9745029935310336, "learning_rate": 6.574221118361254e-06, "loss": 0.4843, "step": 470 }, { "epoch": 0.4630757981964416, "grad_norm": 2.090848022257445, "learning_rate": 6.493144572128852e-06, "loss": 0.4891, "step": 475 }, { "epoch": 0.46795028028271995, "grad_norm": 2.0539193178381345, "learning_rate": 6.411634645495388e-06, "loss": 0.465, "step": 480 }, { "epoch": 0.4728247623689983, "grad_norm": 1.8380774504527437, "learning_rate": 6.329714996454436e-06, "loss": 0.4717, "step": 485 }, { "epoch": 0.4776992444552766, "grad_norm": 1.9501438562254991, "learning_rate": 6.247409401920184e-06, "loss": 0.47, "step": 490 }, { "epoch": 0.48257372654155495, "grad_norm": 1.9399214168055752, "learning_rate": 6.164741750826246e-06, "loss": 0.4696, "step": 495 }, { "epoch": 0.4874482086278333, "grad_norm": 1.9438999474698049, "learning_rate": 6.081736037191998e-06, "loss": 0.4761, "step": 500 }, { "epoch": 0.4923226907141116, "grad_norm": 2.109991763270123, "learning_rate": 5.998416353158369e-06, "loss": 0.467, "step": 505 }, { "epoch": 0.49719717280038994, "grad_norm": 1.9305942803443825, "learning_rate": 5.914806881995192e-06, "loss": 0.4519, "step": 510 }, { "epoch": 0.5020716548866683, "grad_norm": 1.8894659709352353, "learning_rate": 5.830931891082077e-06, "loss": 0.4625, "step": 515 }, { "epoch": 0.5069461369729467, "grad_norm": 1.943600690573164, "learning_rate": 5.746815724864884e-06, "loss": 0.4486, "step": 520 }, { "epoch": 0.5118206190592249, "grad_norm": 1.8604126506438579, "learning_rate": 5.662482797789833e-06, "loss": 0.4501, "step": 525 }, { "epoch": 0.5166951011455033, "grad_norm": 1.8945174085661423, "learning_rate": 5.577957587217281e-06, "loss": 0.4576, "step": 530 }, { "epoch": 0.5215695832317816, "grad_norm": 1.9454371455405457, "learning_rate": 5.493264626317252e-06, "loss": 0.4546, "step": 535 }, { "epoch": 0.52644406531806, "grad_norm": 1.9257763152448473, "learning_rate": 5.408428496948761e-06, "loss": 0.4476, "step": 540 }, { "epoch": 0.5313185474043383, "grad_norm": 1.942601355689532, "learning_rate": 5.323473822525011e-06, "loss": 0.4419, "step": 545 }, { "epoch": 0.5361930294906166, "grad_norm": 1.8380797740924733, "learning_rate": 5.238425260866524e-06, "loss": 0.4339, "step": 550 }, { "epoch": 0.5410675115768949, "grad_norm": 1.9856061522664756, "learning_rate": 5.153307497044291e-06, "loss": 0.4486, "step": 555 }, { "epoch": 0.5459419936631733, "grad_norm": 2.0152615373656446, "learning_rate": 5.068145236215007e-06, "loss": 0.4361, "step": 560 }, { "epoch": 0.5508164757494516, "grad_norm": 1.8713611793445957, "learning_rate": 4.982963196450478e-06, "loss": 0.4388, "step": 565 }, { "epoch": 0.55569095783573, "grad_norm": 1.9591094110255243, "learning_rate": 4.8977861015632865e-06, "loss": 0.4382, "step": 570 }, { "epoch": 0.5605654399220082, "grad_norm": 1.8237556125663812, "learning_rate": 4.812638673930777e-06, "loss": 0.4289, "step": 575 }, { "epoch": 0.5654399220082866, "grad_norm": 1.8090312867754712, "learning_rate": 4.72754562731947e-06, "loss": 0.4258, "step": 580 }, { "epoch": 0.5703144040945649, "grad_norm": 1.8257902996618338, "learning_rate": 4.64253165971197e-06, "loss": 0.4303, "step": 585 }, { "epoch": 0.5751888861808433, "grad_norm": 2.0254032476042223, "learning_rate": 4.557621446138455e-06, "loss": 0.4202, "step": 590 }, { "epoch": 0.5800633682671216, "grad_norm": 1.9947879579802112, "learning_rate": 4.47283963151483e-06, "loss": 0.424, "step": 595 }, { "epoch": 0.5849378503534, "grad_norm": 1.8982751664240438, "learning_rate": 4.388210823489616e-06, "loss": 0.4221, "step": 600 }, { "epoch": 0.5898123324396782, "grad_norm": 1.8999729869459319, "learning_rate": 4.3037595853016645e-06, "loss": 0.4162, "step": 605 }, { "epoch": 0.5946868145259566, "grad_norm": 1.9051454199437885, "learning_rate": 4.219510428650752e-06, "loss": 0.4154, "step": 610 }, { "epoch": 0.5995612966122349, "grad_norm": 1.8892938847030196, "learning_rate": 4.135487806583141e-06, "loss": 0.4183, "step": 615 }, { "epoch": 0.6044357786985133, "grad_norm": 1.8391212463048745, "learning_rate": 4.051716106394162e-06, "loss": 0.4169, "step": 620 }, { "epoch": 0.6093102607847917, "grad_norm": 1.9164931200596216, "learning_rate": 3.968219642549876e-06, "loss": 0.4096, "step": 625 }, { "epoch": 0.6141847428710699, "grad_norm": 1.8882587070789514, "learning_rate": 3.885022649629887e-06, "loss": 0.4089, "step": 630 }, { "epoch": 0.6190592249573483, "grad_norm": 2.0414351606801677, "learning_rate": 3.8021492752933196e-06, "loss": 0.4118, "step": 635 }, { "epoch": 0.6239337070436266, "grad_norm": 1.88948983848305, "learning_rate": 3.7196235732700546e-06, "loss": 0.4155, "step": 640 }, { "epoch": 0.628808189129905, "grad_norm": 1.9185526120999834, "learning_rate": 3.637469496379201e-06, "loss": 0.3988, "step": 645 }, { "epoch": 0.6336826712161833, "grad_norm": 1.8441572398843216, "learning_rate": 3.5557108895768723e-06, "loss": 0.4099, "step": 650 }, { "epoch": 0.6385571533024617, "grad_norm": 2.0485958888309796, "learning_rate": 3.4743714830352604e-06, "loss": 0.405, "step": 655 }, { "epoch": 0.6434316353887399, "grad_norm": 1.806992701143073, "learning_rate": 3.3934748852550353e-06, "loss": 0.4037, "step": 660 }, { "epoch": 0.6483061174750183, "grad_norm": 1.943526238424205, "learning_rate": 3.3130445762130485e-06, "loss": 0.3967, "step": 665 }, { "epoch": 0.6531805995612966, "grad_norm": 1.8848557764463123, "learning_rate": 3.2331039005473495e-06, "loss": 0.3924, "step": 670 }, { "epoch": 0.658055081647575, "grad_norm": 1.892841917353598, "learning_rate": 3.1536760607814747e-06, "loss": 0.3961, "step": 675 }, { "epoch": 0.6629295637338533, "grad_norm": 1.7833910365050067, "learning_rate": 3.0747841105899965e-06, "loss": 0.3973, "step": 680 }, { "epoch": 0.6678040458201316, "grad_norm": 1.8266463006432494, "learning_rate": 2.9964509481072627e-06, "loss": 0.3829, "step": 685 }, { "epoch": 0.6726785279064099, "grad_norm": 1.9794830429310166, "learning_rate": 2.918699309281292e-06, "loss": 0.3886, "step": 690 }, { "epoch": 0.6775530099926883, "grad_norm": 1.8605018457416864, "learning_rate": 2.84155176127473e-06, "loss": 0.3889, "step": 695 }, { "epoch": 0.6824274920789666, "grad_norm": 1.8677944161418016, "learning_rate": 2.765030695914815e-06, "loss": 0.3878, "step": 700 }, { "epoch": 0.687301974165245, "grad_norm": 1.7809933219767806, "learning_rate": 2.689158323194212e-06, "loss": 0.389, "step": 705 }, { "epoch": 0.6921764562515232, "grad_norm": 1.9534924091406372, "learning_rate": 2.6139566648246355e-06, "loss": 0.38, "step": 710 }, { "epoch": 0.6970509383378016, "grad_norm": 1.8154536663255623, "learning_rate": 2.5394475478451246e-06, "loss": 0.3819, "step": 715 }, { "epoch": 0.7019254204240799, "grad_norm": 1.910488831165307, "learning_rate": 2.4656525982868106e-06, "loss": 0.3805, "step": 720 }, { "epoch": 0.7067999025103583, "grad_norm": 1.8313054725277078, "learning_rate": 2.39259323489603e-06, "loss": 0.3742, "step": 725 }, { "epoch": 0.7116743845966366, "grad_norm": 1.8851239942706999, "learning_rate": 2.320290662917607e-06, "loss": 0.3726, "step": 730 }, { "epoch": 0.716548866682915, "grad_norm": 1.8525938618294533, "learning_rate": 2.2487658679400943e-06, "loss": 0.3812, "step": 735 }, { "epoch": 0.7214233487691932, "grad_norm": 1.952969026800498, "learning_rate": 2.178039609804777e-06, "loss": 0.3757, "step": 740 }, { "epoch": 0.7262978308554716, "grad_norm": 1.8641758450652162, "learning_rate": 2.108132416580198e-06, "loss": 0.3794, "step": 745 }, { "epoch": 0.7311723129417499, "grad_norm": 1.8518338990761232, "learning_rate": 2.0390645786039406e-06, "loss": 0.3713, "step": 750 }, { "epoch": 0.7360467950280283, "grad_norm": 1.8015066231257286, "learning_rate": 1.9708561425934393e-06, "loss": 0.3784, "step": 755 }, { "epoch": 0.7409212771143066, "grad_norm": 1.8409794783180748, "learning_rate": 1.903526905827474e-06, "loss": 0.3751, "step": 760 }, { "epoch": 0.7457957592005849, "grad_norm": 1.8641076874515972, "learning_rate": 1.8370964104000783e-06, "loss": 0.3746, "step": 765 }, { "epoch": 0.7506702412868632, "grad_norm": 1.786737876865444, "learning_rate": 1.7715839375485067e-06, "loss": 0.3628, "step": 770 }, { "epoch": 0.7555447233731416, "grad_norm": 1.8446799495571504, "learning_rate": 1.7070085020569194e-06, "loss": 0.3644, "step": 775 }, { "epoch": 0.76041920545942, "grad_norm": 1.751658260263151, "learning_rate": 1.6433888467374015e-06, "loss": 0.37, "step": 780 }, { "epoch": 0.7652936875456983, "grad_norm": 1.8477843728803192, "learning_rate": 1.5807434369899248e-06, "loss": 0.3628, "step": 785 }, { "epoch": 0.7701681696319767, "grad_norm": 1.8593690412561963, "learning_rate": 1.51909045544282e-06, "loss": 0.3708, "step": 790 }, { "epoch": 0.7750426517182549, "grad_norm": 1.747782138054366, "learning_rate": 1.4584477966753324e-06, "loss": 0.3652, "step": 795 }, { "epoch": 0.7799171338045333, "grad_norm": 1.755667670884649, "learning_rate": 1.398833062023775e-06, "loss": 0.3691, "step": 800 }, { "epoch": 0.7847916158908116, "grad_norm": 1.8425636184668681, "learning_rate": 1.3402635544727992e-06, "loss": 0.366, "step": 805 }, { "epoch": 0.78966609797709, "grad_norm": 1.9026395556203364, "learning_rate": 1.2827562736332555e-06, "loss": 0.3589, "step": 810 }, { "epoch": 0.7945405800633683, "grad_norm": 1.7133796113318556, "learning_rate": 1.226327910808116e-06, "loss": 0.3597, "step": 815 }, { "epoch": 0.7994150621496466, "grad_norm": 1.8616850952101744, "learning_rate": 1.1709948441478763e-06, "loss": 0.3583, "step": 820 }, { "epoch": 0.8042895442359249, "grad_norm": 1.7675149524570417, "learning_rate": 1.116773133896848e-06, "loss": 0.3605, "step": 825 }, { "epoch": 0.8091640263222033, "grad_norm": 1.900735872689697, "learning_rate": 1.0636785177317255e-06, "loss": 0.3547, "step": 830 }, { "epoch": 0.8140385084084816, "grad_norm": 1.7353000724393655, "learning_rate": 1.0117264061937777e-06, "loss": 0.3543, "step": 835 }, { "epoch": 0.81891299049476, "grad_norm": 1.8226701532638399, "learning_rate": 9.60931878215985e-07, "loss": 0.3523, "step": 840 }, { "epoch": 0.8237874725810382, "grad_norm": 1.7199982449094653, "learning_rate": 9.113096767464302e-07, "loss": 0.3572, "step": 845 }, { "epoch": 0.8286619546673166, "grad_norm": 1.7416045554651556, "learning_rate": 8.62874204469204e-07, "loss": 0.3546, "step": 850 }, { "epoch": 0.8335364367535949, "grad_norm": 1.7748467098472374, "learning_rate": 8.156395196240752e-07, "loss": 0.3488, "step": 855 }, { "epoch": 0.8384109188398733, "grad_norm": 1.8256676668006175, "learning_rate": 7.696193319261242e-07, "loss": 0.3467, "step": 860 }, { "epoch": 0.8432854009261516, "grad_norm": 1.8827284517179357, "learning_rate": 7.248269985865514e-07, "loss": 0.3521, "step": 865 }, { "epoch": 0.84815988301243, "grad_norm": 1.6781224427131647, "learning_rate": 6.812755204357857e-07, "loss": 0.3535, "step": 870 }, { "epoch": 0.8530343650987082, "grad_norm": 1.6995357059191782, "learning_rate": 6.389775381500351e-07, "loss": 0.3435, "step": 875 }, { "epoch": 0.8579088471849866, "grad_norm": 1.7402333449103136, "learning_rate": 5.979453285823711e-07, "loss": 0.3443, "step": 880 }, { "epoch": 0.8627833292712649, "grad_norm": 1.7615298544617708, "learning_rate": 5.58190801199413e-07, "loss": 0.3528, "step": 885 }, { "epoch": 0.8676578113575433, "grad_norm": 1.7343491942185678, "learning_rate": 5.197254946246416e-07, "loss": 0.3495, "step": 890 }, { "epoch": 0.8725322934438215, "grad_norm": 1.657555480865734, "learning_rate": 4.825605732893546e-07, "loss": 0.3468, "step": 895 }, { "epoch": 0.8774067755300999, "grad_norm": 1.6499922538828395, "learning_rate": 4.4670682419221954e-07, "loss": 0.3396, "step": 900 }, { "epoch": 0.8822812576163782, "grad_norm": 1.7518714860746227, "learning_rate": 4.121746537683907e-07, "loss": 0.3504, "step": 905 }, { "epoch": 0.8871557397026566, "grad_norm": 1.770832100980816, "learning_rate": 3.789740848690682e-07, "loss": 0.3518, "step": 910 }, { "epoch": 0.8920302217889349, "grad_norm": 1.7567597669594073, "learning_rate": 3.4711475385240057e-07, "loss": 0.3492, "step": 915 }, { "epoch": 0.8969047038752133, "grad_norm": 1.7161657861744943, "learning_rate": 3.1660590778656406e-07, "loss": 0.3428, "step": 920 }, { "epoch": 0.9017791859614915, "grad_norm": 1.728233513619595, "learning_rate": 2.8745640176582766e-07, "loss": 0.3396, "step": 925 }, { "epoch": 0.9066536680477699, "grad_norm": 1.8381612957565303, "learning_rate": 2.5967469634039177e-07, "loss": 0.345, "step": 930 }, { "epoch": 0.9115281501340483, "grad_norm": 1.7243006426458327, "learning_rate": 2.3326885506074314e-07, "loss": 0.3465, "step": 935 }, { "epoch": 0.9164026322203266, "grad_norm": 1.5970738101828363, "learning_rate": 2.0824654213723038e-07, "loss": 0.3429, "step": 940 }, { "epoch": 0.921277114306605, "grad_norm": 1.6533959425871336, "learning_rate": 1.8461502021555721e-07, "loss": 0.3389, "step": 945 }, { "epoch": 0.9261515963928832, "grad_norm": 1.7738912948147607, "learning_rate": 1.6238114826881868e-07, "loss": 0.3439, "step": 950 }, { "epoch": 0.9310260784791616, "grad_norm": 1.7429674496591296, "learning_rate": 1.4155137960670974e-07, "loss": 0.3389, "step": 955 }, { "epoch": 0.9359005605654399, "grad_norm": 1.7343774559673155, "learning_rate": 1.2213176000246852e-07, "loss": 0.345, "step": 960 }, { "epoch": 0.9407750426517183, "grad_norm": 1.7876244818652967, "learning_rate": 1.0412792593811505e-07, "loss": 0.3426, "step": 965 }, { "epoch": 0.9456495247379966, "grad_norm": 1.6810981829120015, "learning_rate": 8.754510296847651e-08, "loss": 0.3444, "step": 970 }, { "epoch": 0.950524006824275, "grad_norm": 1.7314637174387817, "learning_rate": 7.238810420448883e-08, "loss": 0.3339, "step": 975 }, { "epoch": 0.9553984889105532, "grad_norm": 1.6057497995450185, "learning_rate": 5.866132891620746e-08, "loss": 0.346, "step": 980 }, { "epoch": 0.9602729709968316, "grad_norm": 1.7119707817106466, "learning_rate": 4.6368761255930485e-08, "loss": 0.3379, "step": 985 }, { "epoch": 0.9651474530831099, "grad_norm": 1.7236174016491694, "learning_rate": 3.551396910181415e-08, "loss": 0.3412, "step": 990 }, { "epoch": 0.9700219351693883, "grad_norm": 1.6591399917697027, "learning_rate": 2.6100103022306257e-08, "loss": 0.3433, "step": 995 }, { "epoch": 0.9748964172556666, "grad_norm": 1.7276142814853213, "learning_rate": 1.812989536170484e-08, "loss": 0.3414, "step": 1000 }, { "epoch": 0.979770899341945, "grad_norm": 1.6558936839338774, "learning_rate": 1.1605659447102568e-08, "loss": 0.3366, "step": 1005 }, { "epoch": 0.9846453814282232, "grad_norm": 1.6766541339225858, "learning_rate": 6.529288916952703e-09, "loss": 0.3331, "step": 1010 }, { "epoch": 0.9895198635145016, "grad_norm": 1.7452767005894518, "learning_rate": 2.9022571714448776e-09, "loss": 0.3429, "step": 1015 }, { "epoch": 0.9943943456007799, "grad_norm": 1.8097610241635727, "learning_rate": 7.256169448560668e-10, "loss": 0.3415, "step": 1020 }, { "epoch": 0.9992688276870583, "grad_norm": 1.758337391190151, "learning_rate": 0.0, "loss": 0.3467, "step": 1025 }, { "epoch": 0.9992688276870583, "eval_loss": 0.3359443247318268, "eval_runtime": 96.9406, "eval_samples_per_second": 3.115, "eval_steps_per_second": 0.784, "step": 1025 }, { "epoch": 0.9992688276870583, "step": 1025, "total_flos": 214561802158080.0, "train_loss": 0.5001517156275307, "train_runtime": 26987.6611, "train_samples_per_second": 1.216, "train_steps_per_second": 0.038 } ], "logging_steps": 5, "max_steps": 1025, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 214561802158080.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }