{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9992646286374619, "eval_steps": 500, "global_step": 1189, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.008404244143292362, "grad_norm": 0.7383340001106262, "learning_rate": 2e-05, "loss": 2.4066, "step": 10 }, { "epoch": 0.016808488286584725, "grad_norm": 0.46394309401512146, "learning_rate": 4e-05, "loss": 2.0375, "step": 20 }, { "epoch": 0.025212732429877087, "grad_norm": 0.4739478528499603, "learning_rate": 6e-05, "loss": 1.5044, "step": 30 }, { "epoch": 0.03361697657316945, "grad_norm": 0.20930196344852448, "learning_rate": 8e-05, "loss": 0.8704, "step": 40 }, { "epoch": 0.04202122071646181, "grad_norm": 0.15288038551807404, "learning_rate": 0.0001, "loss": 0.6533, "step": 50 }, { "epoch": 0.050425464859754174, "grad_norm": 0.13073962926864624, "learning_rate": 0.00012, "loss": 0.586, "step": 60 }, { "epoch": 0.058829709003046536, "grad_norm": 0.14555367827415466, "learning_rate": 0.00014, "loss": 0.5793, "step": 70 }, { "epoch": 0.0672339531463389, "grad_norm": 0.12397414445877075, "learning_rate": 0.00016, "loss": 0.581, "step": 80 }, { "epoch": 0.07563819728963127, "grad_norm": 0.13021130859851837, "learning_rate": 0.00018, "loss": 0.5512, "step": 90 }, { "epoch": 0.08404244143292362, "grad_norm": 0.13012883067131042, "learning_rate": 0.0002, "loss": 0.5403, "step": 100 }, { "epoch": 0.09244668557621599, "grad_norm": 0.11942347884178162, "learning_rate": 0.00019942313239111625, "loss": 0.5247, "step": 110 }, { "epoch": 0.10085092971950835, "grad_norm": 0.11690942198038101, "learning_rate": 0.0001988462647822325, "loss": 0.5417, "step": 120 }, { "epoch": 0.10925517386280072, "grad_norm": 0.1355101615190506, "learning_rate": 0.00019826939717334873, "loss": 0.5273, "step": 130 }, { "epoch": 0.11765941800609307, "grad_norm": 0.1345665603876114, "learning_rate": 0.00019769252956446497, "loss": 0.5243, "step": 140 }, { "epoch": 0.12606366214938544, "grad_norm": 0.12515193223953247, "learning_rate": 0.0001971156619555812, "loss": 0.5344, "step": 150 }, { "epoch": 0.1344679062926778, "grad_norm": 0.15686553716659546, "learning_rate": 0.00019653879434669745, "loss": 0.5118, "step": 160 }, { "epoch": 0.14287215043597015, "grad_norm": 0.12068944424390793, "learning_rate": 0.0001959619267378137, "loss": 0.4979, "step": 170 }, { "epoch": 0.15127639457926254, "grad_norm": 0.13319459557533264, "learning_rate": 0.00019538505912892993, "loss": 0.503, "step": 180 }, { "epoch": 0.1596806387225549, "grad_norm": 0.11806949228048325, "learning_rate": 0.00019480819152004617, "loss": 0.49, "step": 190 }, { "epoch": 0.16808488286584725, "grad_norm": 0.12932075560092926, "learning_rate": 0.00019423132391116238, "loss": 0.514, "step": 200 }, { "epoch": 0.17648912700913963, "grad_norm": 0.11743929982185364, "learning_rate": 0.00019365445630227862, "loss": 0.4788, "step": 210 }, { "epoch": 0.18489337115243198, "grad_norm": 0.11788313835859299, "learning_rate": 0.00019307758869339486, "loss": 0.4891, "step": 220 }, { "epoch": 0.19329761529572434, "grad_norm": 0.11414741724729538, "learning_rate": 0.0001925007210845111, "loss": 0.5033, "step": 230 }, { "epoch": 0.2017018594390167, "grad_norm": 0.11419043689966202, "learning_rate": 0.00019192385347562737, "loss": 0.4844, "step": 240 }, { "epoch": 0.21010610358230908, "grad_norm": 0.12788020074367523, "learning_rate": 0.0001913469858667436, "loss": 0.4697, "step": 250 }, { "epoch": 0.21851034772560143, "grad_norm": 0.13661302626132965, "learning_rate": 0.00019077011825785982, "loss": 0.4627, "step": 260 }, { "epoch": 0.2269145918688938, "grad_norm": 0.12041325867176056, "learning_rate": 0.00019019325064897606, "loss": 0.4964, "step": 270 }, { "epoch": 0.23531883601218614, "grad_norm": 0.133742094039917, "learning_rate": 0.0001896163830400923, "loss": 0.4658, "step": 280 }, { "epoch": 0.24372308015547853, "grad_norm": 0.1261977106332779, "learning_rate": 0.00018903951543120854, "loss": 0.4781, "step": 290 }, { "epoch": 0.2521273242987709, "grad_norm": 0.130150705575943, "learning_rate": 0.00018846264782232478, "loss": 0.4922, "step": 300 }, { "epoch": 0.26053156844206327, "grad_norm": 0.13174410164356232, "learning_rate": 0.00018788578021344102, "loss": 0.4559, "step": 310 }, { "epoch": 0.2689358125853556, "grad_norm": 0.1186077669262886, "learning_rate": 0.00018730891260455726, "loss": 0.4722, "step": 320 }, { "epoch": 0.277340056728648, "grad_norm": 0.116569384932518, "learning_rate": 0.0001867320449956735, "loss": 0.4457, "step": 330 }, { "epoch": 0.2857443008719403, "grad_norm": 0.12219471484422684, "learning_rate": 0.00018615517738678974, "loss": 0.4849, "step": 340 }, { "epoch": 0.2941485450152327, "grad_norm": 0.12746909260749817, "learning_rate": 0.00018557830977790598, "loss": 0.4821, "step": 350 }, { "epoch": 0.30255278915852507, "grad_norm": 0.14125944674015045, "learning_rate": 0.00018500144216902222, "loss": 0.4605, "step": 360 }, { "epoch": 0.3109570333018174, "grad_norm": 0.19157269597053528, "learning_rate": 0.00018442457456013846, "loss": 0.4541, "step": 370 }, { "epoch": 0.3193612774451098, "grad_norm": 0.12603330612182617, "learning_rate": 0.0001838477069512547, "loss": 0.4536, "step": 380 }, { "epoch": 0.32776552158840216, "grad_norm": 0.12653909623622894, "learning_rate": 0.00018327083934237091, "loss": 0.4468, "step": 390 }, { "epoch": 0.3361697657316945, "grad_norm": 0.15930472314357758, "learning_rate": 0.00018269397173348718, "loss": 0.4542, "step": 400 }, { "epoch": 0.3445740098749869, "grad_norm": 0.13266988098621368, "learning_rate": 0.00018211710412460342, "loss": 0.4335, "step": 410 }, { "epoch": 0.35297825401827926, "grad_norm": 0.12103667855262756, "learning_rate": 0.00018154023651571966, "loss": 0.4575, "step": 420 }, { "epoch": 0.3613824981615716, "grad_norm": 0.14439740777015686, "learning_rate": 0.0001809633689068359, "loss": 0.4377, "step": 430 }, { "epoch": 0.36978674230486397, "grad_norm": 0.12652407586574554, "learning_rate": 0.00018038650129795214, "loss": 0.4363, "step": 440 }, { "epoch": 0.3781909864481563, "grad_norm": 0.14594405889511108, "learning_rate": 0.00017980963368906835, "loss": 0.4306, "step": 450 }, { "epoch": 0.3865952305914487, "grad_norm": 0.12562687695026398, "learning_rate": 0.0001792327660801846, "loss": 0.4501, "step": 460 }, { "epoch": 0.39499947473474106, "grad_norm": 0.14584492146968842, "learning_rate": 0.00017865589847130083, "loss": 0.4509, "step": 470 }, { "epoch": 0.4034037188780334, "grad_norm": 0.13192500174045563, "learning_rate": 0.00017807903086241707, "loss": 0.4505, "step": 480 }, { "epoch": 0.4118079630213258, "grad_norm": 0.14266645908355713, "learning_rate": 0.00017750216325353331, "loss": 0.4585, "step": 490 }, { "epoch": 0.42021220716461816, "grad_norm": 0.1400412619113922, "learning_rate": 0.00017692529564464958, "loss": 0.4365, "step": 500 }, { "epoch": 0.4286164513079105, "grad_norm": 0.14728468656539917, "learning_rate": 0.0001763484280357658, "loss": 0.4303, "step": 510 }, { "epoch": 0.43702069545120287, "grad_norm": 0.15791365504264832, "learning_rate": 0.00017577156042688203, "loss": 0.4407, "step": 520 }, { "epoch": 0.4454249395944952, "grad_norm": 0.15447258949279785, "learning_rate": 0.00017519469281799827, "loss": 0.4365, "step": 530 }, { "epoch": 0.4538291837377876, "grad_norm": 0.1518252044916153, "learning_rate": 0.00017461782520911451, "loss": 0.4305, "step": 540 }, { "epoch": 0.46223342788107996, "grad_norm": 0.1154065877199173, "learning_rate": 0.00017404095760023075, "loss": 0.4212, "step": 550 }, { "epoch": 0.4706376720243723, "grad_norm": 0.12900012731552124, "learning_rate": 0.000173464089991347, "loss": 0.4277, "step": 560 }, { "epoch": 0.47904191616766467, "grad_norm": 0.1349458247423172, "learning_rate": 0.00017288722238246323, "loss": 0.4051, "step": 570 }, { "epoch": 0.48744616031095706, "grad_norm": 0.16337165236473083, "learning_rate": 0.00017231035477357947, "loss": 0.407, "step": 580 }, { "epoch": 0.4958504044542494, "grad_norm": 0.13420593738555908, "learning_rate": 0.0001717334871646957, "loss": 0.4138, "step": 590 }, { "epoch": 0.5042546485975418, "grad_norm": 0.13840581476688385, "learning_rate": 0.00017115661955581195, "loss": 0.4099, "step": 600 }, { "epoch": 0.5126588927408341, "grad_norm": 0.1378021389245987, "learning_rate": 0.0001705797519469282, "loss": 0.4254, "step": 610 }, { "epoch": 0.5210631368841265, "grad_norm": 0.1607150137424469, "learning_rate": 0.00017000288433804443, "loss": 0.4353, "step": 620 }, { "epoch": 0.5294673810274189, "grad_norm": 0.13462169468402863, "learning_rate": 0.00016942601672916067, "loss": 0.4267, "step": 630 }, { "epoch": 0.5378716251707112, "grad_norm": 0.14311543107032776, "learning_rate": 0.00016884914912027689, "loss": 0.4301, "step": 640 }, { "epoch": 0.5462758693140036, "grad_norm": 0.15559442341327667, "learning_rate": 0.00016827228151139313, "loss": 0.4102, "step": 650 }, { "epoch": 0.554680113457296, "grad_norm": 0.15557149052619934, "learning_rate": 0.00016769541390250937, "loss": 0.4136, "step": 660 }, { "epoch": 0.5630843576005883, "grad_norm": 0.135511115193367, "learning_rate": 0.00016711854629362563, "loss": 0.4153, "step": 670 }, { "epoch": 0.5714886017438806, "grad_norm": 0.13760776817798615, "learning_rate": 0.00016654167868474187, "loss": 0.4145, "step": 680 }, { "epoch": 0.579892845887173, "grad_norm": 0.14971590042114258, "learning_rate": 0.0001659648110758581, "loss": 0.3875, "step": 690 }, { "epoch": 0.5882970900304654, "grad_norm": 0.16005663573741913, "learning_rate": 0.00016538794346697433, "loss": 0.3938, "step": 700 }, { "epoch": 0.5967013341737577, "grad_norm": 0.1625218689441681, "learning_rate": 0.00016481107585809057, "loss": 0.3871, "step": 710 }, { "epoch": 0.6051055783170501, "grad_norm": 0.17047689855098724, "learning_rate": 0.0001642342082492068, "loss": 0.412, "step": 720 }, { "epoch": 0.6135098224603425, "grad_norm": 0.13825903832912445, "learning_rate": 0.00016365734064032305, "loss": 0.3948, "step": 730 }, { "epoch": 0.6219140666036348, "grad_norm": 0.14830929040908813, "learning_rate": 0.00016308047303143929, "loss": 0.3927, "step": 740 }, { "epoch": 0.6303183107469272, "grad_norm": 0.13950933516025543, "learning_rate": 0.00016250360542255553, "loss": 0.4051, "step": 750 }, { "epoch": 0.6387225548902196, "grad_norm": 0.15511371195316315, "learning_rate": 0.0001619267378136718, "loss": 0.4041, "step": 760 }, { "epoch": 0.6471267990335119, "grad_norm": 0.14828190207481384, "learning_rate": 0.000161349870204788, "loss": 0.3824, "step": 770 }, { "epoch": 0.6555310431768043, "grad_norm": 0.144051194190979, "learning_rate": 0.00016077300259590425, "loss": 0.3829, "step": 780 }, { "epoch": 0.6639352873200967, "grad_norm": 0.14780694246292114, "learning_rate": 0.00016019613498702049, "loss": 0.3814, "step": 790 }, { "epoch": 0.672339531463389, "grad_norm": 0.15042325854301453, "learning_rate": 0.00015961926737813673, "loss": 0.3962, "step": 800 }, { "epoch": 0.6807437756066814, "grad_norm": 0.16325107216835022, "learning_rate": 0.00015904239976925297, "loss": 0.3801, "step": 810 }, { "epoch": 0.6891480197499738, "grad_norm": 0.14843328297138214, "learning_rate": 0.0001584655321603692, "loss": 0.4082, "step": 820 }, { "epoch": 0.6975522638932661, "grad_norm": 0.16731064021587372, "learning_rate": 0.00015788866455148545, "loss": 0.4192, "step": 830 }, { "epoch": 0.7059565080365585, "grad_norm": 0.18703435361385345, "learning_rate": 0.00015731179694260169, "loss": 0.4009, "step": 840 }, { "epoch": 0.7143607521798508, "grad_norm": 0.13935630023479462, "learning_rate": 0.00015673492933371793, "loss": 0.3618, "step": 850 }, { "epoch": 0.7227649963231432, "grad_norm": 0.13263636827468872, "learning_rate": 0.00015615806172483417, "loss": 0.3963, "step": 860 }, { "epoch": 0.7311692404664355, "grad_norm": 0.14940643310546875, "learning_rate": 0.0001555811941159504, "loss": 0.3585, "step": 870 }, { "epoch": 0.7395734846097279, "grad_norm": 0.14807912707328796, "learning_rate": 0.00015500432650706665, "loss": 0.3748, "step": 880 }, { "epoch": 0.7479777287530203, "grad_norm": 0.15254080295562744, "learning_rate": 0.00015442745889818286, "loss": 0.3718, "step": 890 }, { "epoch": 0.7563819728963126, "grad_norm": 0.16590768098831177, "learning_rate": 0.0001538505912892991, "loss": 0.386, "step": 900 }, { "epoch": 0.764786217039605, "grad_norm": 0.15733902156352997, "learning_rate": 0.00015327372368041534, "loss": 0.3756, "step": 910 }, { "epoch": 0.7731904611828974, "grad_norm": 0.13757385313510895, "learning_rate": 0.00015269685607153158, "loss": 0.3843, "step": 920 }, { "epoch": 0.7815947053261897, "grad_norm": 0.14952607452869415, "learning_rate": 0.00015211998846264784, "loss": 0.3634, "step": 930 }, { "epoch": 0.7899989494694821, "grad_norm": 0.1516282558441162, "learning_rate": 0.00015154312085376408, "loss": 0.3798, "step": 940 }, { "epoch": 0.7984031936127745, "grad_norm": 0.17785628139972687, "learning_rate": 0.00015096625324488032, "loss": 0.3681, "step": 950 }, { "epoch": 0.8068074377560668, "grad_norm": 0.171351820230484, "learning_rate": 0.00015038938563599654, "loss": 0.3686, "step": 960 }, { "epoch": 0.8152116818993592, "grad_norm": 0.1742231398820877, "learning_rate": 0.00014981251802711278, "loss": 0.3792, "step": 970 }, { "epoch": 0.8236159260426515, "grad_norm": 0.16650599241256714, "learning_rate": 0.00014923565041822902, "loss": 0.3577, "step": 980 }, { "epoch": 0.8320201701859439, "grad_norm": 0.1497887670993805, "learning_rate": 0.00014865878280934526, "loss": 0.3553, "step": 990 }, { "epoch": 0.8404244143292363, "grad_norm": 0.14781557023525238, "learning_rate": 0.0001480819152004615, "loss": 0.3538, "step": 1000 }, { "epoch": 0.8488286584725286, "grad_norm": 0.15724751353263855, "learning_rate": 0.00014750504759157774, "loss": 0.3597, "step": 1010 }, { "epoch": 0.857232902615821, "grad_norm": 0.18635571002960205, "learning_rate": 0.00014692817998269398, "loss": 0.3615, "step": 1020 }, { "epoch": 0.8656371467591134, "grad_norm": 0.17742526531219482, "learning_rate": 0.00014635131237381022, "loss": 0.348, "step": 1030 }, { "epoch": 0.8740413909024057, "grad_norm": 0.20535768568515778, "learning_rate": 0.00014577444476492646, "loss": 0.3343, "step": 1040 }, { "epoch": 0.8824456350456981, "grad_norm": 0.18968522548675537, "learning_rate": 0.0001451975771560427, "loss": 0.3615, "step": 1050 }, { "epoch": 0.8908498791889904, "grad_norm": 0.1528492122888565, "learning_rate": 0.00014462070954715894, "loss": 0.3786, "step": 1060 }, { "epoch": 0.8992541233322828, "grad_norm": 0.15841075778007507, "learning_rate": 0.00014404384193827518, "loss": 0.3761, "step": 1070 }, { "epoch": 0.9076583674755752, "grad_norm": 0.15167982876300812, "learning_rate": 0.0001434669743293914, "loss": 0.3528, "step": 1080 }, { "epoch": 0.9160626116188675, "grad_norm": 0.14096671342849731, "learning_rate": 0.00014289010672050766, "loss": 0.371, "step": 1090 }, { "epoch": 0.9244668557621599, "grad_norm": 0.1579194813966751, "learning_rate": 0.0001423132391116239, "loss": 0.3491, "step": 1100 }, { "epoch": 0.9328710999054523, "grad_norm": 0.16789057850837708, "learning_rate": 0.00014173637150274014, "loss": 0.3536, "step": 1110 }, { "epoch": 0.9412753440487446, "grad_norm": 0.13980717957019806, "learning_rate": 0.00014115950389385638, "loss": 0.3423, "step": 1120 }, { "epoch": 0.949679588192037, "grad_norm": 0.19879643619060516, "learning_rate": 0.00014058263628497262, "loss": 0.3285, "step": 1130 }, { "epoch": 0.9580838323353293, "grad_norm": 0.16574440896511078, "learning_rate": 0.00014000576867608886, "loss": 0.3568, "step": 1140 }, { "epoch": 0.9664880764786217, "grad_norm": 0.15376180410385132, "learning_rate": 0.00013942890106720507, "loss": 0.3558, "step": 1150 }, { "epoch": 0.9748923206219141, "grad_norm": 0.17232170701026917, "learning_rate": 0.0001388520334583213, "loss": 0.342, "step": 1160 }, { "epoch": 0.9832965647652064, "grad_norm": 0.1959993690252304, "learning_rate": 0.00013827516584943755, "loss": 0.3458, "step": 1170 }, { "epoch": 0.9917008089084988, "grad_norm": 0.14029347896575928, "learning_rate": 0.0001376982982405538, "loss": 0.3297, "step": 1180 } ], "logging_steps": 10, "max_steps": 3567, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 8.069523410383258e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }