{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9997559189650964, "eval_steps": 500, "global_step": 1024, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.000976324139614352, "grad_norm": 22.493754488892705, "learning_rate": 9.70873786407767e-08, "loss": 1.3084, "step": 1 }, { "epoch": 0.00488162069807176, "grad_norm": 21.407549210638884, "learning_rate": 4.854368932038835e-07, "loss": 1.3239, "step": 5 }, { "epoch": 0.00976324139614352, "grad_norm": 9.073471250768227, "learning_rate": 9.70873786407767e-07, "loss": 1.2075, "step": 10 }, { "epoch": 0.01464486209421528, "grad_norm": 8.959633641742187, "learning_rate": 1.4563106796116506e-06, "loss": 1.0555, "step": 15 }, { "epoch": 0.01952648279228704, "grad_norm": 2.9726378530361903, "learning_rate": 1.941747572815534e-06, "loss": 0.9134, "step": 20 }, { "epoch": 0.0244081034903588, "grad_norm": 2.336512387637656, "learning_rate": 2.427184466019418e-06, "loss": 0.8712, "step": 25 }, { "epoch": 0.02928972418843056, "grad_norm": 2.1980743279610633, "learning_rate": 2.912621359223301e-06, "loss": 0.8452, "step": 30 }, { "epoch": 0.034171344886502315, "grad_norm": 2.2947165951756108, "learning_rate": 3.398058252427185e-06, "loss": 0.8304, "step": 35 }, { "epoch": 0.03905296558457408, "grad_norm": 2.231374010631092, "learning_rate": 3.883495145631068e-06, "loss": 0.8005, "step": 40 }, { "epoch": 0.043934586282645835, "grad_norm": 2.323172717277238, "learning_rate": 4.368932038834952e-06, "loss": 0.7954, "step": 45 }, { "epoch": 0.0488162069807176, "grad_norm": 2.6284940732364697, "learning_rate": 4.854368932038836e-06, "loss": 0.7828, "step": 50 }, { "epoch": 0.053697827678789355, "grad_norm": 2.4706362764533383, "learning_rate": 5.3398058252427185e-06, "loss": 0.7649, "step": 55 }, { "epoch": 0.05857944837686112, "grad_norm": 2.5587525974220915, "learning_rate": 5.825242718446602e-06, "loss": 0.7448, "step": 60 }, { "epoch": 0.06346106907493287, "grad_norm": 2.367706031948518, "learning_rate": 6.310679611650487e-06, "loss": 0.7313, "step": 65 }, { "epoch": 0.06834268977300463, "grad_norm": 2.3861668688704416, "learning_rate": 6.79611650485437e-06, "loss": 0.7285, "step": 70 }, { "epoch": 0.0732243104710764, "grad_norm": 2.2781470198507665, "learning_rate": 7.2815533980582534e-06, "loss": 0.7107, "step": 75 }, { "epoch": 0.07810593116914816, "grad_norm": 2.4811146424704855, "learning_rate": 7.766990291262136e-06, "loss": 0.7127, "step": 80 }, { "epoch": 0.08298755186721991, "grad_norm": 2.3108856194291945, "learning_rate": 8.25242718446602e-06, "loss": 0.703, "step": 85 }, { "epoch": 0.08786917256529167, "grad_norm": 2.4623970274283877, "learning_rate": 8.737864077669904e-06, "loss": 0.693, "step": 90 }, { "epoch": 0.09275079326336344, "grad_norm": 2.2177593778006544, "learning_rate": 9.223300970873788e-06, "loss": 0.6974, "step": 95 }, { "epoch": 0.0976324139614352, "grad_norm": 2.3758805201515765, "learning_rate": 9.708737864077671e-06, "loss": 0.6771, "step": 100 }, { "epoch": 0.10251403465950695, "grad_norm": 2.367648483120376, "learning_rate": 9.999883646674445e-06, "loss": 0.703, "step": 105 }, { "epoch": 0.10739565535757871, "grad_norm": 2.7148702269535145, "learning_rate": 9.998574733951775e-06, "loss": 0.6806, "step": 110 }, { "epoch": 0.11227727605565048, "grad_norm": 2.3763948535285357, "learning_rate": 9.995811848851807e-06, "loss": 0.6747, "step": 115 }, { "epoch": 0.11715889675372224, "grad_norm": 2.3041091870777564, "learning_rate": 9.991595795035352e-06, "loss": 0.663, "step": 120 }, { "epoch": 0.12204051745179399, "grad_norm": 2.1733479043188644, "learning_rate": 9.985927798857143e-06, "loss": 0.677, "step": 125 }, { "epoch": 0.12692213814986575, "grad_norm": 2.2480309788297284, "learning_rate": 9.978809509009121e-06, "loss": 0.6625, "step": 130 }, { "epoch": 0.13180375884793752, "grad_norm": 2.138615471864636, "learning_rate": 9.970242996040865e-06, "loss": 0.656, "step": 135 }, { "epoch": 0.13668537954600926, "grad_norm": 2.1218250053659853, "learning_rate": 9.960230751757318e-06, "loss": 0.6676, "step": 140 }, { "epoch": 0.14156700024408103, "grad_norm": 2.067260559588222, "learning_rate": 9.948775688493974e-06, "loss": 0.6511, "step": 145 }, { "epoch": 0.1464486209421528, "grad_norm": 2.0246056917859048, "learning_rate": 9.93588113826975e-06, "loss": 0.6514, "step": 150 }, { "epoch": 0.15133024164022454, "grad_norm": 2.2415009500993635, "learning_rate": 9.921550851817774e-06, "loss": 0.6554, "step": 155 }, { "epoch": 0.15621186233829631, "grad_norm": 2.2315931042976627, "learning_rate": 9.905788997494377e-06, "loss": 0.6499, "step": 160 }, { "epoch": 0.16109348303636808, "grad_norm": 2.15809328665933, "learning_rate": 9.888600160066627e-06, "loss": 0.6473, "step": 165 }, { "epoch": 0.16597510373443983, "grad_norm": 2.0579191397149836, "learning_rate": 9.869989339378706e-06, "loss": 0.6369, "step": 170 }, { "epoch": 0.1708567244325116, "grad_norm": 2.031129747279531, "learning_rate": 9.849961948897582e-06, "loss": 0.6365, "step": 175 }, { "epoch": 0.17573834513058334, "grad_norm": 2.3571773219128067, "learning_rate": 9.828523814138344e-06, "loss": 0.6256, "step": 180 }, { "epoch": 0.1806199658286551, "grad_norm": 2.0680850663572503, "learning_rate": 9.8056811709697e-06, "loss": 0.6305, "step": 185 }, { "epoch": 0.18550158652672688, "grad_norm": 2.070854390313614, "learning_rate": 9.781440663800099e-06, "loss": 0.6289, "step": 190 }, { "epoch": 0.19038320722479862, "grad_norm": 1.9927467214888446, "learning_rate": 9.755809343645021e-06, "loss": 0.6097, "step": 195 }, { "epoch": 0.1952648279228704, "grad_norm": 2.1885730975002544, "learning_rate": 9.728794666076004e-06, "loss": 0.6202, "step": 200 }, { "epoch": 0.20014644862094216, "grad_norm": 2.0241934863146747, "learning_rate": 9.700404489051974e-06, "loss": 0.6218, "step": 205 }, { "epoch": 0.2050280693190139, "grad_norm": 2.0845209209163076, "learning_rate": 9.670647070633554e-06, "loss": 0.6195, "step": 210 }, { "epoch": 0.20990969001708568, "grad_norm": 1.9439387252135285, "learning_rate": 9.639531066580979e-06, "loss": 0.6206, "step": 215 }, { "epoch": 0.21479131071515742, "grad_norm": 2.0645626959829184, "learning_rate": 9.607065527836324e-06, "loss": 0.6121, "step": 220 }, { "epoch": 0.2196729314132292, "grad_norm": 2.0720595201040926, "learning_rate": 9.573259897890794e-06, "loss": 0.6146, "step": 225 }, { "epoch": 0.22455455211130096, "grad_norm": 1.9623515996646703, "learning_rate": 9.538124010037832e-06, "loss": 0.5982, "step": 230 }, { "epoch": 0.2294361728093727, "grad_norm": 2.010779905137125, "learning_rate": 9.501668084512827e-06, "loss": 0.6062, "step": 235 }, { "epoch": 0.23431779350744447, "grad_norm": 2.1305241902290226, "learning_rate": 9.46390272552028e-06, "loss": 0.5966, "step": 240 }, { "epoch": 0.23919941420551624, "grad_norm": 1.8945487010213031, "learning_rate": 9.424838918149285e-06, "loss": 0.6016, "step": 245 }, { "epoch": 0.24408103490358798, "grad_norm": 1.9720726992448023, "learning_rate": 9.384488025178214e-06, "loss": 0.5908, "step": 250 }, { "epoch": 0.24896265560165975, "grad_norm": 2.5727183425693663, "learning_rate": 9.342861783769535e-06, "loss": 0.5892, "step": 255 }, { "epoch": 0.2538442762997315, "grad_norm": 2.321112395905857, "learning_rate": 9.29997230205575e-06, "loss": 0.6011, "step": 260 }, { "epoch": 0.2587258969978033, "grad_norm": 2.2030364900806076, "learning_rate": 9.2558320556174e-06, "loss": 0.5821, "step": 265 }, { "epoch": 0.26360751769587504, "grad_norm": 1.9812931636150968, "learning_rate": 9.210453883854204e-06, "loss": 0.5732, "step": 270 }, { "epoch": 0.2684891383939468, "grad_norm": 2.085078299483637, "learning_rate": 9.163850986250375e-06, "loss": 0.5748, "step": 275 }, { "epoch": 0.2733707590920185, "grad_norm": 2.2601821261422415, "learning_rate": 9.11603691853518e-06, "loss": 0.5704, "step": 280 }, { "epoch": 0.2782523797900903, "grad_norm": 2.3842333315576076, "learning_rate": 9.067025588739889e-06, "loss": 0.5691, "step": 285 }, { "epoch": 0.28313400048816206, "grad_norm": 2.2224957447478038, "learning_rate": 9.016831253152244e-06, "loss": 0.5651, "step": 290 }, { "epoch": 0.2880156211862338, "grad_norm": 2.1614870694331993, "learning_rate": 8.96546851216962e-06, "loss": 0.5559, "step": 295 }, { "epoch": 0.2928972418843056, "grad_norm": 2.005310823414414, "learning_rate": 8.912952306052109e-06, "loss": 0.5657, "step": 300 }, { "epoch": 0.29777886258237735, "grad_norm": 2.38204601184677, "learning_rate": 8.859297910576732e-06, "loss": 0.5568, "step": 305 }, { "epoch": 0.3026604832804491, "grad_norm": 2.1648970631056743, "learning_rate": 8.804520932594061e-06, "loss": 0.5579, "step": 310 }, { "epoch": 0.3075421039785209, "grad_norm": 2.100265583460356, "learning_rate": 8.748637305488537e-06, "loss": 0.5551, "step": 315 }, { "epoch": 0.31242372467659263, "grad_norm": 2.0675326704721138, "learning_rate": 8.691663284543812e-06, "loss": 0.5469, "step": 320 }, { "epoch": 0.31730534537466437, "grad_norm": 2.267073420915594, "learning_rate": 8.633615442214452e-06, "loss": 0.5561, "step": 325 }, { "epoch": 0.32218696607273617, "grad_norm": 2.087971484130279, "learning_rate": 8.574510663305388e-06, "loss": 0.5546, "step": 330 }, { "epoch": 0.3270685867708079, "grad_norm": 1.8815922539302756, "learning_rate": 8.514366140060504e-06, "loss": 0.545, "step": 335 }, { "epoch": 0.33195020746887965, "grad_norm": 2.081332065948261, "learning_rate": 8.453199367161804e-06, "loss": 0.5409, "step": 340 }, { "epoch": 0.33683182816695145, "grad_norm": 2.0447565241323145, "learning_rate": 8.391028136640604e-06, "loss": 0.5345, "step": 345 }, { "epoch": 0.3417134488650232, "grad_norm": 1.8534927309067777, "learning_rate": 8.32787053270223e-06, "loss": 0.5295, "step": 350 }, { "epoch": 0.34659506956309494, "grad_norm": 2.0218669755624696, "learning_rate": 8.263744926465744e-06, "loss": 0.5339, "step": 355 }, { "epoch": 0.3514766902611667, "grad_norm": 2.015374192428264, "learning_rate": 8.198669970620177e-06, "loss": 0.5296, "step": 360 }, { "epoch": 0.3563583109592385, "grad_norm": 1.8414197150680836, "learning_rate": 8.13266459399891e-06, "loss": 0.5279, "step": 365 }, { "epoch": 0.3612399316573102, "grad_norm": 1.9511713212083668, "learning_rate": 8.065747996073681e-06, "loss": 0.5201, "step": 370 }, { "epoch": 0.36612155235538196, "grad_norm": 2.0198178510951035, "learning_rate": 7.997939641369909e-06, "loss": 0.5231, "step": 375 }, { "epoch": 0.37100317305345376, "grad_norm": 1.9974814551196278, "learning_rate": 7.929259253804903e-06, "loss": 0.5127, "step": 380 }, { "epoch": 0.3758847937515255, "grad_norm": 1.9584094585419525, "learning_rate": 7.859726810950606e-06, "loss": 0.503, "step": 385 }, { "epoch": 0.38076641444959725, "grad_norm": 1.9555704803278644, "learning_rate": 7.789362538222585e-06, "loss": 0.5031, "step": 390 }, { "epoch": 0.38564803514766904, "grad_norm": 1.9287309322500008, "learning_rate": 7.718186902996912e-06, "loss": 0.5095, "step": 395 }, { "epoch": 0.3905296558457408, "grad_norm": 2.0211386074482998, "learning_rate": 7.646220608656662e-06, "loss": 0.4967, "step": 400 }, { "epoch": 0.39541127654381253, "grad_norm": 2.039367090773858, "learning_rate": 7.573484588569775e-06, "loss": 0.5015, "step": 405 }, { "epoch": 0.4002928972418843, "grad_norm": 1.9208073920098578, "learning_rate": 7.500000000000001e-06, "loss": 0.497, "step": 410 }, { "epoch": 0.40517451793995607, "grad_norm": 1.8719019363210703, "learning_rate": 7.425788217952744e-06, "loss": 0.4955, "step": 415 }, { "epoch": 0.4100561386380278, "grad_norm": 1.9360771987365168, "learning_rate": 7.350870828957547e-06, "loss": 0.4906, "step": 420 }, { "epoch": 0.4149377593360996, "grad_norm": 1.887372218244023, "learning_rate": 7.27526962478906e-06, "loss": 0.4896, "step": 425 }, { "epoch": 0.41981938003417135, "grad_norm": 1.965135998139074, "learning_rate": 7.1990065961283075e-06, "loss": 0.494, "step": 430 }, { "epoch": 0.4247010007322431, "grad_norm": 1.936347943982164, "learning_rate": 7.122103926166096e-06, "loss": 0.474, "step": 435 }, { "epoch": 0.42958262143031484, "grad_norm": 1.8906249170878375, "learning_rate": 7.044583984150425e-06, "loss": 0.4932, "step": 440 }, { "epoch": 0.43446424212838664, "grad_norm": 1.919261654779276, "learning_rate": 6.9664693188797776e-06, "loss": 0.4812, "step": 445 }, { "epoch": 0.4393458628264584, "grad_norm": 1.8837975572074062, "learning_rate": 6.887782652144186e-06, "loss": 0.477, "step": 450 }, { "epoch": 0.4442274835245301, "grad_norm": 2.0761696776456904, "learning_rate": 6.808546872115976e-06, "loss": 0.4835, "step": 455 }, { "epoch": 0.4491091042226019, "grad_norm": 1.9034620515101144, "learning_rate": 6.728785026692113e-06, "loss": 0.4692, "step": 460 }, { "epoch": 0.45399072492067366, "grad_norm": 2.0736151869195667, "learning_rate": 6.648520316790102e-06, "loss": 0.4585, "step": 465 }, { "epoch": 0.4588723456187454, "grad_norm": 2.011123921684231, "learning_rate": 6.567776089599339e-06, "loss": 0.4803, "step": 470 }, { "epoch": 0.4637539663168172, "grad_norm": 1.96249455995112, "learning_rate": 6.486575831789974e-06, "loss": 0.4623, "step": 475 }, { "epoch": 0.46863558701488894, "grad_norm": 2.2467468199710017, "learning_rate": 6.404943162681144e-06, "loss": 0.4574, "step": 480 }, { "epoch": 0.4735172077129607, "grad_norm": 2.121288144154755, "learning_rate": 6.322901827370659e-06, "loss": 0.4681, "step": 485 }, { "epoch": 0.4783988284110325, "grad_norm": 2.131811769895187, "learning_rate": 6.240475689828087e-06, "loss": 0.4573, "step": 490 }, { "epoch": 0.4832804491091042, "grad_norm": 1.9175130135112601, "learning_rate": 6.1576887259532695e-06, "loss": 0.4465, "step": 495 }, { "epoch": 0.48816206980717597, "grad_norm": 1.8070710350230668, "learning_rate": 6.074565016602263e-06, "loss": 0.4419, "step": 500 }, { "epoch": 0.49304369050524777, "grad_norm": 1.99048575433868, "learning_rate": 5.991128740582774e-06, "loss": 0.4504, "step": 505 }, { "epoch": 0.4979253112033195, "grad_norm": 1.9210921458678536, "learning_rate": 5.907404167621087e-06, "loss": 0.4406, "step": 510 }, { "epoch": 0.5028069319013913, "grad_norm": 2.009882308013614, "learning_rate": 5.823415651302545e-06, "loss": 0.4458, "step": 515 }, { "epoch": 0.507688552599463, "grad_norm": 1.897100615945393, "learning_rate": 5.739187621987649e-06, "loss": 0.4379, "step": 520 }, { "epoch": 0.5125701732975347, "grad_norm": 1.907463353966728, "learning_rate": 5.654744579705815e-06, "loss": 0.4392, "step": 525 }, { "epoch": 0.5174517939956066, "grad_norm": 2.112475304318316, "learning_rate": 5.570111087028868e-06, "loss": 0.4309, "step": 530 }, { "epoch": 0.5223334146936783, "grad_norm": 1.9618257078116872, "learning_rate": 5.4853117619263496e-06, "loss": 0.4294, "step": 535 }, { "epoch": 0.5272150353917501, "grad_norm": 2.023435280859463, "learning_rate": 5.4003712706047055e-06, "loss": 0.4412, "step": 540 }, { "epoch": 0.5320966560898218, "grad_norm": 2.1255050556087003, "learning_rate": 5.315314320332438e-06, "loss": 0.4319, "step": 545 }, { "epoch": 0.5369782767878936, "grad_norm": 2.0850824266858967, "learning_rate": 5.230165652253329e-06, "loss": 0.4365, "step": 550 }, { "epoch": 0.5418598974859653, "grad_norm": 1.9630523459005746, "learning_rate": 5.144950034189798e-06, "loss": 0.4286, "step": 555 }, { "epoch": 0.546741518184037, "grad_norm": 1.8594385781324942, "learning_rate": 5.059692253438495e-06, "loss": 0.4216, "step": 560 }, { "epoch": 0.5516231388821089, "grad_norm": 1.9410469701406243, "learning_rate": 4.97441710956025e-06, "loss": 0.4146, "step": 565 }, { "epoch": 0.5565047595801806, "grad_norm": 1.8926730305562134, "learning_rate": 4.8891494071664315e-06, "loss": 0.4243, "step": 570 }, { "epoch": 0.5613863802782524, "grad_norm": 1.9153480931779545, "learning_rate": 4.803913948703845e-06, "loss": 0.4193, "step": 575 }, { "epoch": 0.5662680009763241, "grad_norm": 1.9039605462186175, "learning_rate": 4.71873552724027e-06, "loss": 0.4098, "step": 580 }, { "epoch": 0.5711496216743959, "grad_norm": 1.7928530873219377, "learning_rate": 4.633638919252712e-06, "loss": 0.4132, "step": 585 }, { "epoch": 0.5760312423724676, "grad_norm": 1.8310404534316238, "learning_rate": 4.548648877420481e-06, "loss": 0.4056, "step": 590 }, { "epoch": 0.5809128630705395, "grad_norm": 1.887050893521495, "learning_rate": 4.463790123425209e-06, "loss": 0.414, "step": 595 }, { "epoch": 0.5857944837686112, "grad_norm": 1.8879105533071967, "learning_rate": 4.379087340759861e-06, "loss": 0.4105, "step": 600 }, { "epoch": 0.590676104466683, "grad_norm": 1.8851096816683868, "learning_rate": 4.294565167548866e-06, "loss": 0.4097, "step": 605 }, { "epoch": 0.5955577251647547, "grad_norm": 1.8909509373180151, "learning_rate": 4.2102481893814504e-06, "loss": 0.4038, "step": 610 }, { "epoch": 0.6004393458628264, "grad_norm": 2.0267286135794564, "learning_rate": 4.1261609321602406e-06, "loss": 0.401, "step": 615 }, { "epoch": 0.6053209665608982, "grad_norm": 1.8447881878140413, "learning_rate": 4.042327854967231e-06, "loss": 0.3998, "step": 620 }, { "epoch": 0.61020258725897, "grad_norm": 1.8206856992603886, "learning_rate": 3.958773342949196e-06, "loss": 0.3957, "step": 625 }, { "epoch": 0.6150842079570418, "grad_norm": 1.989687612307625, "learning_rate": 3.875521700224598e-06, "loss": 0.3897, "step": 630 }, { "epoch": 0.6199658286551135, "grad_norm": 1.7577176528030285, "learning_rate": 3.7925971428140655e-06, "loss": 0.3974, "step": 635 }, { "epoch": 0.6248474493531853, "grad_norm": 1.7992538376367235, "learning_rate": 3.71002379159651e-06, "loss": 0.3933, "step": 640 }, { "epoch": 0.629729070051257, "grad_norm": 1.838523059460591, "learning_rate": 3.627825665292899e-06, "loss": 0.3961, "step": 645 }, { "epoch": 0.6346106907493287, "grad_norm": 1.881394659994862, "learning_rate": 3.546026673479755e-06, "loss": 0.3938, "step": 650 }, { "epoch": 0.6394923114474005, "grad_norm": 1.879760457077884, "learning_rate": 3.464650609634403e-06, "loss": 0.3929, "step": 655 }, { "epoch": 0.6443739321454723, "grad_norm": 1.9193591824291845, "learning_rate": 3.383721144213985e-06, "loss": 0.3869, "step": 660 }, { "epoch": 0.6492555528435441, "grad_norm": 2.0119373139170724, "learning_rate": 3.3032618177702546e-06, "loss": 0.3851, "step": 665 }, { "epoch": 0.6541371735416158, "grad_norm": 1.9053960768739056, "learning_rate": 3.2232960341021703e-06, "loss": 0.3874, "step": 670 }, { "epoch": 0.6590187942396876, "grad_norm": 1.8532461602692916, "learning_rate": 3.1438470534482547e-06, "loss": 0.3822, "step": 675 }, { "epoch": 0.6639004149377593, "grad_norm": 1.9222824143537547, "learning_rate": 3.064937985720717e-06, "loss": 0.3737, "step": 680 }, { "epoch": 0.668782035635831, "grad_norm": 1.7942220113298164, "learning_rate": 2.9865917837833025e-06, "loss": 0.3868, "step": 685 }, { "epoch": 0.6736636563339029, "grad_norm": 1.7376810506201046, "learning_rate": 2.9088312367748257e-06, "loss": 0.3763, "step": 690 }, { "epoch": 0.6785452770319746, "grad_norm": 1.9201828792137692, "learning_rate": 2.8316789634803223e-06, "loss": 0.3865, "step": 695 }, { "epoch": 0.6834268977300464, "grad_norm": 1.9845153126888537, "learning_rate": 2.75515740575176e-06, "loss": 0.372, "step": 700 }, { "epoch": 0.6883085184281181, "grad_norm": 1.7986907948997257, "learning_rate": 2.6792888219802017e-06, "loss": 0.3727, "step": 705 }, { "epoch": 0.6931901391261899, "grad_norm": 1.8814905894540856, "learning_rate": 2.604095280621354e-06, "loss": 0.3719, "step": 710 }, { "epoch": 0.6980717598242616, "grad_norm": 1.8333636960019017, "learning_rate": 2.529598653776349e-06, "loss": 0.3825, "step": 715 }, { "epoch": 0.7029533805223334, "grad_norm": 1.973156240281083, "learning_rate": 2.4558206108296394e-06, "loss": 0.3666, "step": 720 }, { "epoch": 0.7078350012204052, "grad_norm": 1.8104849203150244, "learning_rate": 2.3827826121458713e-06, "loss": 0.3681, "step": 725 }, { "epoch": 0.712716621918477, "grad_norm": 1.7648438350645508, "learning_rate": 2.3105059028275467e-06, "loss": 0.3604, "step": 730 }, { "epoch": 0.7175982426165487, "grad_norm": 1.7085215152593554, "learning_rate": 2.2390115065352974e-06, "loss": 0.3599, "step": 735 }, { "epoch": 0.7224798633146204, "grad_norm": 1.7691508841496486, "learning_rate": 2.16832021937259e-06, "loss": 0.361, "step": 740 }, { "epoch": 0.7273614840126922, "grad_norm": 1.7631951882565822, "learning_rate": 2.0984526038366005e-06, "loss": 0.3648, "step": 745 }, { "epoch": 0.7322431047107639, "grad_norm": 1.7905281564289197, "learning_rate": 2.0294289828370506e-06, "loss": 0.3593, "step": 750 }, { "epoch": 0.7371247254088358, "grad_norm": 1.7614811796241294, "learning_rate": 1.9612694337847334e-06, "loss": 0.3636, "step": 755 }, { "epoch": 0.7420063461069075, "grad_norm": 1.7427057794971907, "learning_rate": 1.8939937827514509e-06, "loss": 0.3514, "step": 760 }, { "epoch": 0.7468879668049793, "grad_norm": 1.8108047484780578, "learning_rate": 1.8276215987030489e-06, "loss": 0.354, "step": 765 }, { "epoch": 0.751769587503051, "grad_norm": 1.8460169409837044, "learning_rate": 1.7621721878072601e-06, "loss": 0.3536, "step": 770 }, { "epoch": 0.7566512082011227, "grad_norm": 1.7192504649916136, "learning_rate": 1.6976645878179677e-06, "loss": 0.3523, "step": 775 }, { "epoch": 0.7615328288991945, "grad_norm": 1.7659873554952885, "learning_rate": 1.6341175625375554e-06, "loss": 0.3556, "step": 780 }, { "epoch": 0.7664144495972663, "grad_norm": 1.7642790853502184, "learning_rate": 1.5715495963589434e-06, "loss": 0.3505, "step": 785 }, { "epoch": 0.7712960702953381, "grad_norm": 1.8214336145120926, "learning_rate": 1.509978888888894e-06, "loss": 0.3557, "step": 790 }, { "epoch": 0.7761776909934098, "grad_norm": 1.693821479855387, "learning_rate": 1.4494233496541548e-06, "loss": 0.3533, "step": 795 }, { "epoch": 0.7810593116914816, "grad_norm": 1.7741424308531102, "learning_rate": 1.3899005928919901e-06, "loss": 0.3436, "step": 800 }, { "epoch": 0.7859409323895533, "grad_norm": 1.702567333467084, "learning_rate": 1.3314279324265922e-06, "loss": 0.3484, "step": 805 }, { "epoch": 0.7908225530876251, "grad_norm": 1.7878487144245359, "learning_rate": 1.2740223766328813e-06, "loss": 0.3472, "step": 810 }, { "epoch": 0.7957041737856968, "grad_norm": 1.8410165852894955, "learning_rate": 1.2177006234891548e-06, "loss": 0.3462, "step": 815 }, { "epoch": 0.8005857944837687, "grad_norm": 1.5957050689906005, "learning_rate": 1.1624790557200255e-06, "loss": 0.3474, "step": 820 }, { "epoch": 0.8054674151818404, "grad_norm": 1.7840672497407395, "learning_rate": 1.1083737360310487e-06, "loss": 0.3366, "step": 825 }, { "epoch": 0.8103490358799121, "grad_norm": 1.7234306868229268, "learning_rate": 1.0554004024364573e-06, "loss": 0.3459, "step": 830 }, { "epoch": 0.8152306565779839, "grad_norm": 1.8763987301580518, "learning_rate": 1.0035744636813188e-06, "loss": 0.3399, "step": 835 }, { "epoch": 0.8201122772760556, "grad_norm": 1.8526209211475768, "learning_rate": 9.529109947594834e-07, "loss": 0.3414, "step": 840 }, { "epoch": 0.8249938979741274, "grad_norm": 1.6663345867535264, "learning_rate": 9.034247325286122e-07, "loss": 0.3443, "step": 845 }, { "epoch": 0.8298755186721992, "grad_norm": 1.7676921955192966, "learning_rate": 8.551300714235494e-07, "loss": 0.3427, "step": 850 }, { "epoch": 0.834757139370271, "grad_norm": 1.7531428909936075, "learning_rate": 8.080410592693183e-07, "loss": 0.335, "step": 855 }, { "epoch": 0.8396387600683427, "grad_norm": 1.8118502056267978, "learning_rate": 7.621713931949181e-07, "loss": 0.3484, "step": 860 }, { "epoch": 0.8445203807664144, "grad_norm": 1.6928707382034682, "learning_rate": 7.175344156491432e-07, "loss": 0.3392, "step": 865 }, { "epoch": 0.8494020014644862, "grad_norm": 1.7632689910362094, "learning_rate": 6.741431105195623e-07, "loss": 0.338, "step": 870 }, { "epoch": 0.8542836221625579, "grad_norm": 1.7172615010190266, "learning_rate": 6.32010099355806e-07, "loss": 0.3386, "step": 875 }, { "epoch": 0.8591652428606297, "grad_norm": 1.6578979212585563, "learning_rate": 5.911476376982333e-07, "loss": 0.3322, "step": 880 }, { "epoch": 0.8640468635587015, "grad_norm": 1.6474380347956434, "learning_rate": 5.515676115130819e-07, "loss": 0.342, "step": 885 }, { "epoch": 0.8689284842567733, "grad_norm": 1.6988649276270078, "learning_rate": 5.132815337351038e-07, "loss": 0.346, "step": 890 }, { "epoch": 0.873810104954845, "grad_norm": 1.8306999314950443, "learning_rate": 4.763005409187155e-07, "loss": 0.3374, "step": 895 }, { "epoch": 0.8786917256529168, "grad_norm": 1.631803186772389, "learning_rate": 4.406353899986221e-07, "loss": 0.3361, "step": 900 }, { "epoch": 0.8835733463509885, "grad_norm": 1.709090450527214, "learning_rate": 4.06296455160875e-07, "loss": 0.3293, "step": 905 }, { "epoch": 0.8884549670490602, "grad_norm": 1.583186301936809, "learning_rate": 3.732937248252472e-07, "loss": 0.3343, "step": 910 }, { "epoch": 0.8933365877471321, "grad_norm": 1.6042173876546493, "learning_rate": 3.416367987398345e-07, "loss": 0.331, "step": 915 }, { "epoch": 0.8982182084452038, "grad_norm": 1.6876494518887346, "learning_rate": 3.113348851887038e-07, "loss": 0.3292, "step": 920 }, { "epoch": 0.9030998291432756, "grad_norm": 1.6392793033144468, "learning_rate": 2.8239679831341126e-07, "loss": 0.3296, "step": 925 }, { "epoch": 0.9079814498413473, "grad_norm": 1.6466791755980095, "learning_rate": 2.548309555491674e-07, "loss": 0.3348, "step": 930 }, { "epoch": 0.9128630705394191, "grad_norm": 1.6369494718827322, "learning_rate": 2.2864537517639618e-07, "loss": 0.3329, "step": 935 }, { "epoch": 0.9177446912374908, "grad_norm": 1.7151132649607947, "learning_rate": 2.038476739883982e-07, "loss": 0.334, "step": 940 }, { "epoch": 0.9226263119355627, "grad_norm": 1.7998544094471982, "learning_rate": 1.804450650757972e-07, "loss": 0.3366, "step": 945 }, { "epoch": 0.9275079326336344, "grad_norm": 1.6113958011155078, "learning_rate": 1.5844435572841544e-07, "loss": 0.3199, "step": 950 }, { "epoch": 0.9323895533317061, "grad_norm": 1.620629896748232, "learning_rate": 1.3785194545518965e-07, "loss": 0.3331, "step": 955 }, { "epoch": 0.9372711740297779, "grad_norm": 1.570691984682264, "learning_rate": 1.1867382412269257e-07, "loss": 0.3266, "step": 960 }, { "epoch": 0.9421527947278496, "grad_norm": 1.7541787796617982, "learning_rate": 1.0091557021282283e-07, "loss": 0.3349, "step": 965 }, { "epoch": 0.9470344154259214, "grad_norm": 1.6021405868365253, "learning_rate": 8.458234920014685e-08, "loss": 0.3242, "step": 970 }, { "epoch": 0.9519160361239931, "grad_norm": 1.6818332750612033, "learning_rate": 6.967891204937737e-08, "loss": 0.3272, "step": 975 }, { "epoch": 0.956797656822065, "grad_norm": 1.6842336140953997, "learning_rate": 5.620959383343061e-08, "loss": 0.3335, "step": 980 }, { "epoch": 0.9616792775201367, "grad_norm": 1.6333432398092111, "learning_rate": 4.417831247244819e-08, "loss": 0.3269, "step": 985 }, { "epoch": 0.9665608982182085, "grad_norm": 1.5870095684725907, "learning_rate": 3.3588567594161625e-08, "loss": 0.3167, "step": 990 }, { "epoch": 0.9714425189162802, "grad_norm": 1.645998653655925, "learning_rate": 2.4443439515933754e-08, "loss": 0.3345, "step": 995 }, { "epoch": 0.9763241396143519, "grad_norm": 1.6627361867783679, "learning_rate": 1.6745588348758836e-08, "loss": 0.329, "step": 1000 }, { "epoch": 0.9812057603124237, "grad_norm": 1.6968847442052823, "learning_rate": 1.0497253223502035e-08, "loss": 0.328, "step": 1005 }, { "epoch": 0.9860873810104955, "grad_norm": 1.8141716265386114, "learning_rate": 5.700251639581544e-09, "loss": 0.3243, "step": 1010 }, { "epoch": 0.9909690017085673, "grad_norm": 1.6444141321089352, "learning_rate": 2.355978936303127e-09, "loss": 0.3277, "step": 1015 }, { "epoch": 0.995850622406639, "grad_norm": 1.5713108568101966, "learning_rate": 4.6540788698534735e-10, "loss": 0.3291, "step": 1020 }, { "epoch": 0.9997559189650964, "eval_loss": 0.3360166847705841, "eval_runtime": 96.8069, "eval_samples_per_second": 3.12, "eval_steps_per_second": 0.785, "step": 1024 }, { "epoch": 0.9997559189650964, "step": 1024, "total_flos": 214352422502400.0, "train_loss": 0.48904780531302094, "train_runtime": 23248.9436, "train_samples_per_second": 1.409, "train_steps_per_second": 0.044 } ], "logging_steps": 5, "max_steps": 1024, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 214352422502400.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }