{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 100, "global_step": 4233, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01771793054571226, "grad_norm": 0.24673239886760712, "learning_rate": 0.0001, "loss": 3.8313, "step": 25 }, { "epoch": 0.03543586109142452, "grad_norm": 0.23432838916778564, "learning_rate": 0.0001, "loss": 3.6164, "step": 50 }, { "epoch": 0.05315379163713678, "grad_norm": 0.4262351393699646, "learning_rate": 0.0001, "loss": 3.4113, "step": 75 }, { "epoch": 0.07087172218284904, "grad_norm": 0.329630047082901, "learning_rate": 0.0001, "loss": 3.1874, "step": 100 }, { "epoch": 0.07087172218284904, "eval_loss": 3.0117650032043457, "eval_runtime": 31.2497, "eval_samples_per_second": 9.536, "eval_steps_per_second": 2.4, "step": 100 }, { "epoch": 0.0885896527285613, "grad_norm": 0.39532458782196045, "learning_rate": 0.0001, "loss": 3.0928, "step": 125 }, { "epoch": 0.10630758327427356, "grad_norm": 0.38724836707115173, "learning_rate": 0.0001, "loss": 2.8781, "step": 150 }, { "epoch": 0.12402551381998582, "grad_norm": 0.2721017897129059, "learning_rate": 0.0001, "loss": 2.9166, "step": 175 }, { "epoch": 0.14174344436569808, "grad_norm": 0.40747421979904175, "learning_rate": 0.0001, "loss": 2.8756, "step": 200 }, { "epoch": 0.14174344436569808, "eval_loss": 2.822810411453247, "eval_runtime": 32.4779, "eval_samples_per_second": 9.175, "eval_steps_per_second": 2.309, "step": 200 }, { "epoch": 0.15946137491141035, "grad_norm": 0.4922279119491577, "learning_rate": 0.0001, "loss": 2.9015, "step": 225 }, { "epoch": 0.1771793054571226, "grad_norm": 0.3577287197113037, "learning_rate": 0.0001, "loss": 2.8639, "step": 250 }, { "epoch": 0.19489723600283487, "grad_norm": 0.26928216218948364, "learning_rate": 0.0001, "loss": 2.6776, "step": 275 }, { "epoch": 0.21261516654854712, "grad_norm": 0.3623720109462738, "learning_rate": 0.0001, "loss": 2.7134, "step": 300 }, { "epoch": 0.21261516654854712, "eval_loss": 2.735769271850586, "eval_runtime": 32.6152, "eval_samples_per_second": 9.137, "eval_steps_per_second": 2.3, "step": 300 }, { "epoch": 0.2303330970942594, "grad_norm": 0.48846569657325745, "learning_rate": 0.0001, "loss": 2.7118, "step": 325 }, { "epoch": 0.24805102763997164, "grad_norm": 0.555242657661438, "learning_rate": 0.0001, "loss": 2.7072, "step": 350 }, { "epoch": 0.2657689581856839, "grad_norm": 0.3421138823032379, "learning_rate": 0.0001, "loss": 2.7114, "step": 375 }, { "epoch": 0.28348688873139616, "grad_norm": 0.3518970012664795, "learning_rate": 0.0001, "loss": 2.6948, "step": 400 }, { "epoch": 0.28348688873139616, "eval_loss": 2.6832730770111084, "eval_runtime": 32.6428, "eval_samples_per_second": 9.129, "eval_steps_per_second": 2.298, "step": 400 }, { "epoch": 0.30120481927710846, "grad_norm": 0.7523891925811768, "learning_rate": 0.0001, "loss": 2.7691, "step": 425 }, { "epoch": 0.3189227498228207, "grad_norm": 0.36019715666770935, "learning_rate": 0.0001, "loss": 2.7116, "step": 450 }, { "epoch": 0.33664068036853295, "grad_norm": 0.47855111956596375, "learning_rate": 0.0001, "loss": 2.5935, "step": 475 }, { "epoch": 0.3543586109142452, "grad_norm": 0.3621235489845276, "learning_rate": 0.0001, "loss": 2.6386, "step": 500 }, { "epoch": 0.3543586109142452, "eval_loss": 2.6440742015838623, "eval_runtime": 32.6151, "eval_samples_per_second": 9.137, "eval_steps_per_second": 2.3, "step": 500 }, { "epoch": 0.3720765414599575, "grad_norm": 0.6827765703201294, "learning_rate": 0.0001, "loss": 2.6515, "step": 525 }, { "epoch": 0.38979447200566975, "grad_norm": 0.5584849119186401, "learning_rate": 0.0001, "loss": 2.7017, "step": 550 }, { "epoch": 0.407512402551382, "grad_norm": 0.42108777165412903, "learning_rate": 0.0001, "loss": 2.6605, "step": 575 }, { "epoch": 0.42523033309709424, "grad_norm": 0.4811398684978485, "learning_rate": 0.0001, "loss": 2.6525, "step": 600 }, { "epoch": 0.42523033309709424, "eval_loss": 2.6149747371673584, "eval_runtime": 32.6256, "eval_samples_per_second": 9.134, "eval_steps_per_second": 2.299, "step": 600 }, { "epoch": 0.44294826364280654, "grad_norm": 0.3859870731830597, "learning_rate": 0.0001, "loss": 2.6805, "step": 625 }, { "epoch": 0.4606661941885188, "grad_norm": 0.5074867010116577, "learning_rate": 0.0001, "loss": 2.66, "step": 650 }, { "epoch": 0.47838412473423103, "grad_norm": 0.4459812343120575, "learning_rate": 0.0001, "loss": 2.645, "step": 675 }, { "epoch": 0.4961020552799433, "grad_norm": 0.3960532248020172, "learning_rate": 0.0001, "loss": 2.6242, "step": 700 }, { "epoch": 0.4961020552799433, "eval_loss": 2.5855672359466553, "eval_runtime": 32.5667, "eval_samples_per_second": 9.15, "eval_steps_per_second": 2.303, "step": 700 }, { "epoch": 0.5138199858256556, "grad_norm": 0.28771647810935974, "learning_rate": 0.0001, "loss": 2.6985, "step": 725 }, { "epoch": 0.5315379163713678, "grad_norm": 0.3489636182785034, "learning_rate": 0.0001, "loss": 2.7126, "step": 750 }, { "epoch": 0.5492558469170801, "grad_norm": 0.5809263586997986, "learning_rate": 0.0001, "loss": 2.558, "step": 775 }, { "epoch": 0.5669737774627923, "grad_norm": 0.5202405452728271, "learning_rate": 0.0001, "loss": 2.6444, "step": 800 }, { "epoch": 0.5669737774627923, "eval_loss": 2.5701286792755127, "eval_runtime": 32.5138, "eval_samples_per_second": 9.165, "eval_steps_per_second": 2.307, "step": 800 }, { "epoch": 0.5846917080085046, "grad_norm": 0.4100205600261688, "learning_rate": 0.0001, "loss": 2.5452, "step": 825 }, { "epoch": 0.6024096385542169, "grad_norm": 0.494243323802948, "learning_rate": 0.0001, "loss": 2.566, "step": 850 }, { "epoch": 0.6201275690999292, "grad_norm": 0.41979244351387024, "learning_rate": 0.0001, "loss": 2.6271, "step": 875 }, { "epoch": 0.6378454996456414, "grad_norm": 0.4423984885215759, "learning_rate": 0.0001, "loss": 2.6007, "step": 900 }, { "epoch": 0.6378454996456414, "eval_loss": 2.554037094116211, "eval_runtime": 32.562, "eval_samples_per_second": 9.152, "eval_steps_per_second": 2.303, "step": 900 }, { "epoch": 0.6555634301913537, "grad_norm": 0.49124300479888916, "learning_rate": 0.0001, "loss": 2.6006, "step": 925 }, { "epoch": 0.6732813607370659, "grad_norm": 0.4179531931877136, "learning_rate": 0.0001, "loss": 2.634, "step": 950 }, { "epoch": 0.6909992912827781, "grad_norm": 0.35822227597236633, "learning_rate": 0.0001, "loss": 2.5601, "step": 975 }, { "epoch": 0.7087172218284904, "grad_norm": 0.44015607237815857, "learning_rate": 0.0001, "loss": 2.462, "step": 1000 }, { "epoch": 0.7087172218284904, "eval_loss": 2.5418050289154053, "eval_runtime": 32.5949, "eval_samples_per_second": 9.143, "eval_steps_per_second": 2.301, "step": 1000 }, { "epoch": 0.7264351523742026, "grad_norm": 0.3899957835674286, "learning_rate": 0.0001, "loss": 2.538, "step": 1025 }, { "epoch": 0.744153082919915, "grad_norm": 0.5775083303451538, "learning_rate": 0.0001, "loss": 2.5553, "step": 1050 }, { "epoch": 0.7618710134656272, "grad_norm": 0.6485064625740051, "learning_rate": 0.0001, "loss": 2.5878, "step": 1075 }, { "epoch": 0.7795889440113395, "grad_norm": 0.5756754875183105, "learning_rate": 0.0001, "loss": 2.5641, "step": 1100 }, { "epoch": 0.7795889440113395, "eval_loss": 2.5314512252807617, "eval_runtime": 32.673, "eval_samples_per_second": 9.121, "eval_steps_per_second": 2.295, "step": 1100 }, { "epoch": 0.7973068745570517, "grad_norm": 0.46359655261039734, "learning_rate": 0.0001, "loss": 2.5062, "step": 1125 }, { "epoch": 0.815024805102764, "grad_norm": 0.6569529175758362, "learning_rate": 0.0001, "loss": 2.5921, "step": 1150 }, { "epoch": 0.8327427356484762, "grad_norm": 0.5323280096054077, "learning_rate": 0.0001, "loss": 2.4839, "step": 1175 }, { "epoch": 0.8504606661941885, "grad_norm": 0.46269166469573975, "learning_rate": 0.0001, "loss": 2.4672, "step": 1200 }, { "epoch": 0.8504606661941885, "eval_loss": 2.5238418579101562, "eval_runtime": 32.6451, "eval_samples_per_second": 9.128, "eval_steps_per_second": 2.297, "step": 1200 }, { "epoch": 0.8681785967399008, "grad_norm": 0.4144786596298218, "learning_rate": 0.0001, "loss": 2.5174, "step": 1225 }, { "epoch": 0.8858965272856131, "grad_norm": 0.6366602182388306, "learning_rate": 0.0001, "loss": 2.512, "step": 1250 }, { "epoch": 0.9036144578313253, "grad_norm": 0.490589439868927, "learning_rate": 0.0001, "loss": 2.5243, "step": 1275 }, { "epoch": 0.9213323883770376, "grad_norm": 0.5880224108695984, "learning_rate": 0.0001, "loss": 2.5017, "step": 1300 }, { "epoch": 0.9213323883770376, "eval_loss": 2.51455020904541, "eval_runtime": 32.6422, "eval_samples_per_second": 9.129, "eval_steps_per_second": 2.298, "step": 1300 }, { "epoch": 0.9390503189227498, "grad_norm": 0.5042569637298584, "learning_rate": 0.0001, "loss": 2.5238, "step": 1325 }, { "epoch": 0.9567682494684621, "grad_norm": 0.3827306032180786, "learning_rate": 0.0001, "loss": 2.5785, "step": 1350 }, { "epoch": 0.9744861800141743, "grad_norm": 0.4069231450557709, "learning_rate": 0.0001, "loss": 2.5312, "step": 1375 }, { "epoch": 0.9922041105598866, "grad_norm": 0.5515998005867004, "learning_rate": 0.0001, "loss": 2.6389, "step": 1400 }, { "epoch": 0.9922041105598866, "eval_loss": 2.5082926750183105, "eval_runtime": 32.6146, "eval_samples_per_second": 9.137, "eval_steps_per_second": 2.3, "step": 1400 }, { "epoch": 1.009922041105599, "grad_norm": 0.34569019079208374, "learning_rate": 0.0001, "loss": 2.4996, "step": 1425 }, { "epoch": 1.0276399716513112, "grad_norm": 0.4051329493522644, "learning_rate": 0.0001, "loss": 2.4935, "step": 1450 }, { "epoch": 1.0453579021970234, "grad_norm": 0.5617458820343018, "learning_rate": 0.0001, "loss": 2.5528, "step": 1475 }, { "epoch": 1.0630758327427356, "grad_norm": 0.7300685048103333, "learning_rate": 0.0001, "loss": 2.4869, "step": 1500 }, { "epoch": 1.0630758327427356, "eval_loss": 2.502122640609741, "eval_runtime": 32.6521, "eval_samples_per_second": 9.127, "eval_steps_per_second": 2.297, "step": 1500 }, { "epoch": 1.080793763288448, "grad_norm": 0.3424386978149414, "learning_rate": 0.0001, "loss": 2.5036, "step": 1525 }, { "epoch": 1.0985116938341601, "grad_norm": 0.48358920216560364, "learning_rate": 0.0001, "loss": 2.4546, "step": 1550 }, { "epoch": 1.1162296243798724, "grad_norm": 0.620297372341156, "learning_rate": 0.0001, "loss": 2.4505, "step": 1575 }, { "epoch": 1.1339475549255846, "grad_norm": 0.41402992606163025, "learning_rate": 0.0001, "loss": 2.5302, "step": 1600 }, { "epoch": 1.1339475549255846, "eval_loss": 2.4941623210906982, "eval_runtime": 32.6717, "eval_samples_per_second": 9.121, "eval_steps_per_second": 2.296, "step": 1600 }, { "epoch": 1.1516654854712969, "grad_norm": 0.6208804249763489, "learning_rate": 0.0001, "loss": 2.5035, "step": 1625 }, { "epoch": 1.1693834160170091, "grad_norm": 0.3958500623703003, "learning_rate": 0.0001, "loss": 2.6296, "step": 1650 }, { "epoch": 1.1871013465627214, "grad_norm": 0.5418444275856018, "learning_rate": 0.0001, "loss": 2.5194, "step": 1675 }, { "epoch": 1.2048192771084336, "grad_norm": 0.7059912085533142, "learning_rate": 0.0001, "loss": 2.497, "step": 1700 }, { "epoch": 1.2048192771084336, "eval_loss": 2.4886298179626465, "eval_runtime": 32.6428, "eval_samples_per_second": 9.129, "eval_steps_per_second": 2.298, "step": 1700 }, { "epoch": 1.222537207654146, "grad_norm": 0.5447824001312256, "learning_rate": 0.0001, "loss": 2.5098, "step": 1725 }, { "epoch": 1.2402551381998583, "grad_norm": 0.508121132850647, "learning_rate": 0.0001, "loss": 2.4463, "step": 1750 }, { "epoch": 1.2579730687455706, "grad_norm": 0.5141641497612, "learning_rate": 0.0001, "loss": 2.4407, "step": 1775 }, { "epoch": 1.2756909992912828, "grad_norm": 0.7895596623420715, "learning_rate": 0.0001, "loss": 2.4965, "step": 1800 }, { "epoch": 1.2756909992912828, "eval_loss": 2.4845895767211914, "eval_runtime": 32.6353, "eval_samples_per_second": 9.131, "eval_steps_per_second": 2.298, "step": 1800 }, { "epoch": 1.293408929836995, "grad_norm": 0.33818376064300537, "learning_rate": 0.0001, "loss": 2.3678, "step": 1825 }, { "epoch": 1.3111268603827073, "grad_norm": 0.36670777201652527, "learning_rate": 0.0001, "loss": 2.4522, "step": 1850 }, { "epoch": 1.3288447909284196, "grad_norm": 0.46683812141418457, "learning_rate": 0.0001, "loss": 2.5355, "step": 1875 }, { "epoch": 1.3465627214741318, "grad_norm": 0.45993196964263916, "learning_rate": 0.0001, "loss": 2.5535, "step": 1900 }, { "epoch": 1.3465627214741318, "eval_loss": 2.4782519340515137, "eval_runtime": 32.6522, "eval_samples_per_second": 9.126, "eval_steps_per_second": 2.297, "step": 1900 }, { "epoch": 1.364280652019844, "grad_norm": 0.4003180265426636, "learning_rate": 0.0001, "loss": 2.4353, "step": 1925 }, { "epoch": 1.3819985825655563, "grad_norm": 0.4715386629104614, "learning_rate": 0.0001, "loss": 2.4086, "step": 1950 }, { "epoch": 1.3997165131112685, "grad_norm": 0.4463669955730438, "learning_rate": 0.0001, "loss": 2.5299, "step": 1975 }, { "epoch": 1.4174344436569808, "grad_norm": 0.6194645762443542, "learning_rate": 0.0001, "loss": 2.5747, "step": 2000 }, { "epoch": 1.4174344436569808, "eval_loss": 2.4732108116149902, "eval_runtime": 32.5875, "eval_samples_per_second": 9.145, "eval_steps_per_second": 2.301, "step": 2000 }, { "epoch": 1.4351523742026933, "grad_norm": 0.4712235927581787, "learning_rate": 0.0001, "loss": 2.433, "step": 2025 }, { "epoch": 1.4528703047484055, "grad_norm": 0.5619576573371887, "learning_rate": 0.0001, "loss": 2.4883, "step": 2050 }, { "epoch": 1.4705882352941178, "grad_norm": 0.44078579545021057, "learning_rate": 0.0001, "loss": 2.5575, "step": 2075 }, { "epoch": 1.48830616583983, "grad_norm": 0.44347578287124634, "learning_rate": 0.0001, "loss": 2.4534, "step": 2100 }, { "epoch": 1.48830616583983, "eval_loss": 2.467924118041992, "eval_runtime": 32.6469, "eval_samples_per_second": 9.128, "eval_steps_per_second": 2.297, "step": 2100 }, { "epoch": 1.5060240963855422, "grad_norm": 0.4806898832321167, "learning_rate": 0.0001, "loss": 2.4605, "step": 2125 }, { "epoch": 1.5237420269312545, "grad_norm": 0.4710817337036133, "learning_rate": 0.0001, "loss": 2.5124, "step": 2150 }, { "epoch": 1.5414599574769667, "grad_norm": 0.4697023630142212, "learning_rate": 0.0001, "loss": 2.5251, "step": 2175 }, { "epoch": 1.559177888022679, "grad_norm": 0.4275937080383301, "learning_rate": 0.0001, "loss": 2.4909, "step": 2200 }, { "epoch": 1.559177888022679, "eval_loss": 2.465703248977661, "eval_runtime": 32.66, "eval_samples_per_second": 9.124, "eval_steps_per_second": 2.296, "step": 2200 }, { "epoch": 1.5768958185683912, "grad_norm": 0.6271502375602722, "learning_rate": 0.0001, "loss": 2.4563, "step": 2225 }, { "epoch": 1.5946137491141035, "grad_norm": 0.5484246015548706, "learning_rate": 0.0001, "loss": 2.4781, "step": 2250 }, { "epoch": 1.6123316796598157, "grad_norm": 0.7340563535690308, "learning_rate": 0.0001, "loss": 2.5204, "step": 2275 }, { "epoch": 1.630049610205528, "grad_norm": 0.43257761001586914, "learning_rate": 0.0001, "loss": 2.5192, "step": 2300 }, { "epoch": 1.630049610205528, "eval_loss": 2.461730718612671, "eval_runtime": 32.5818, "eval_samples_per_second": 9.146, "eval_steps_per_second": 2.302, "step": 2300 }, { "epoch": 1.6477675407512402, "grad_norm": 0.7394423484802246, "learning_rate": 0.0001, "loss": 2.5862, "step": 2325 }, { "epoch": 1.6654854712969525, "grad_norm": 0.48102429509162903, "learning_rate": 0.0001, "loss": 2.519, "step": 2350 }, { "epoch": 1.6832034018426647, "grad_norm": 0.5994846820831299, "learning_rate": 0.0001, "loss": 2.4566, "step": 2375 }, { "epoch": 1.700921332388377, "grad_norm": 0.4805436134338379, "learning_rate": 0.0001, "loss": 2.4271, "step": 2400 }, { "epoch": 1.700921332388377, "eval_loss": 2.457273006439209, "eval_runtime": 32.6806, "eval_samples_per_second": 9.119, "eval_steps_per_second": 2.295, "step": 2400 }, { "epoch": 1.7186392629340892, "grad_norm": 0.6208567023277283, "learning_rate": 0.0001, "loss": 2.3581, "step": 2425 }, { "epoch": 1.7363571934798014, "grad_norm": 0.44081413745880127, "learning_rate": 0.0001, "loss": 2.4295, "step": 2450 }, { "epoch": 1.7540751240255137, "grad_norm": 0.4629543721675873, "learning_rate": 0.0001, "loss": 2.4569, "step": 2475 }, { "epoch": 1.771793054571226, "grad_norm": 0.518991231918335, "learning_rate": 0.0001, "loss": 2.4855, "step": 2500 }, { "epoch": 1.771793054571226, "eval_loss": 2.454190731048584, "eval_runtime": 32.6137, "eval_samples_per_second": 9.137, "eval_steps_per_second": 2.3, "step": 2500 }, { "epoch": 1.7895109851169382, "grad_norm": 0.6166653037071228, "learning_rate": 0.0001, "loss": 2.4761, "step": 2525 }, { "epoch": 1.8072289156626506, "grad_norm": 0.5490785241127014, "learning_rate": 0.0001, "loss": 2.5232, "step": 2550 }, { "epoch": 1.824946846208363, "grad_norm": 0.6279402375221252, "learning_rate": 0.0001, "loss": 2.4425, "step": 2575 }, { "epoch": 1.8426647767540751, "grad_norm": 0.606396496295929, "learning_rate": 0.0001, "loss": 2.4599, "step": 2600 }, { "epoch": 1.8426647767540751, "eval_loss": 2.4530327320098877, "eval_runtime": 32.6381, "eval_samples_per_second": 9.13, "eval_steps_per_second": 2.298, "step": 2600 }, { "epoch": 1.8603827072997874, "grad_norm": 0.5355327129364014, "learning_rate": 0.0001, "loss": 2.5615, "step": 2625 }, { "epoch": 1.8781006378454996, "grad_norm": 0.3971356451511383, "learning_rate": 0.0001, "loss": 2.445, "step": 2650 }, { "epoch": 1.8958185683912119, "grad_norm": 0.48701226711273193, "learning_rate": 0.0001, "loss": 2.5405, "step": 2675 }, { "epoch": 1.9135364989369241, "grad_norm": 0.5117021203041077, "learning_rate": 0.0001, "loss": 2.4482, "step": 2700 }, { "epoch": 1.9135364989369241, "eval_loss": 2.444391965866089, "eval_runtime": 32.6264, "eval_samples_per_second": 9.134, "eval_steps_per_second": 2.299, "step": 2700 }, { "epoch": 1.9312544294826366, "grad_norm": 0.40752479434013367, "learning_rate": 0.0001, "loss": 2.421, "step": 2725 }, { "epoch": 1.9489723600283488, "grad_norm": 0.4466327428817749, "learning_rate": 0.0001, "loss": 2.4286, "step": 2750 }, { "epoch": 1.966690290574061, "grad_norm": 0.35452115535736084, "learning_rate": 0.0001, "loss": 2.4265, "step": 2775 }, { "epoch": 1.9844082211197733, "grad_norm": 0.7978025674819946, "learning_rate": 0.0001, "loss": 2.493, "step": 2800 }, { "epoch": 1.9844082211197733, "eval_loss": 2.444624423980713, "eval_runtime": 32.6679, "eval_samples_per_second": 9.122, "eval_steps_per_second": 2.296, "step": 2800 }, { "epoch": 2.0021261516654856, "grad_norm": 0.46374988555908203, "learning_rate": 0.0001, "loss": 2.5357, "step": 2825 }, { "epoch": 2.019844082211198, "grad_norm": 0.4631684422492981, "learning_rate": 0.0001, "loss": 2.4015, "step": 2850 }, { "epoch": 2.03756201275691, "grad_norm": 0.4475260376930237, "learning_rate": 0.0001, "loss": 2.3658, "step": 2875 }, { "epoch": 2.0552799433026223, "grad_norm": 0.47790655493736267, "learning_rate": 0.0001, "loss": 2.3527, "step": 2900 }, { "epoch": 2.0552799433026223, "eval_loss": 2.441364049911499, "eval_runtime": 32.6599, "eval_samples_per_second": 9.124, "eval_steps_per_second": 2.296, "step": 2900 }, { "epoch": 2.0729978738483346, "grad_norm": 0.5602151155471802, "learning_rate": 0.0001, "loss": 2.4763, "step": 2925 }, { "epoch": 2.090715804394047, "grad_norm": 0.37178730964660645, "learning_rate": 0.0001, "loss": 2.4431, "step": 2950 }, { "epoch": 2.108433734939759, "grad_norm": 0.47269827127456665, "learning_rate": 0.0001, "loss": 2.5528, "step": 2975 }, { "epoch": 2.1261516654854713, "grad_norm": 0.5636725425720215, "learning_rate": 0.0001, "loss": 2.5243, "step": 3000 }, { "epoch": 2.1261516654854713, "eval_loss": 2.4375791549682617, "eval_runtime": 32.6693, "eval_samples_per_second": 9.122, "eval_steps_per_second": 2.296, "step": 3000 }, { "epoch": 2.1438695960311835, "grad_norm": 0.5602971315383911, "learning_rate": 0.0001, "loss": 2.4726, "step": 3025 }, { "epoch": 2.161587526576896, "grad_norm": 0.7102957367897034, "learning_rate": 0.0001, "loss": 2.4513, "step": 3050 }, { "epoch": 2.179305457122608, "grad_norm": 0.5028663277626038, "learning_rate": 0.0001, "loss": 2.4038, "step": 3075 }, { "epoch": 2.1970233876683203, "grad_norm": 0.5358246564865112, "learning_rate": 0.0001, "loss": 2.4644, "step": 3100 }, { "epoch": 2.1970233876683203, "eval_loss": 2.433030605316162, "eval_runtime": 32.6486, "eval_samples_per_second": 9.128, "eval_steps_per_second": 2.297, "step": 3100 }, { "epoch": 2.2147413182140325, "grad_norm": 0.5380859971046448, "learning_rate": 0.0001, "loss": 2.4342, "step": 3125 }, { "epoch": 2.2324592487597448, "grad_norm": 0.8703417181968689, "learning_rate": 0.0001, "loss": 2.4462, "step": 3150 }, { "epoch": 2.250177179305457, "grad_norm": 0.44465309381484985, "learning_rate": 0.0001, "loss": 2.4428, "step": 3175 }, { "epoch": 2.2678951098511693, "grad_norm": 0.4541110396385193, "learning_rate": 0.0001, "loss": 2.386, "step": 3200 }, { "epoch": 2.2678951098511693, "eval_loss": 2.4308454990386963, "eval_runtime": 32.6678, "eval_samples_per_second": 9.122, "eval_steps_per_second": 2.296, "step": 3200 }, { "epoch": 2.2856130403968815, "grad_norm": 0.6527560949325562, "learning_rate": 0.0001, "loss": 2.3964, "step": 3225 }, { "epoch": 2.3033309709425938, "grad_norm": 0.5541362762451172, "learning_rate": 0.0001, "loss": 2.4817, "step": 3250 }, { "epoch": 2.321048901488306, "grad_norm": 0.5997689366340637, "learning_rate": 0.0001, "loss": 2.496, "step": 3275 }, { "epoch": 2.3387668320340183, "grad_norm": 0.5446316003799438, "learning_rate": 0.0001, "loss": 2.3762, "step": 3300 }, { "epoch": 2.3387668320340183, "eval_loss": 2.428109645843506, "eval_runtime": 32.6878, "eval_samples_per_second": 9.117, "eval_steps_per_second": 2.294, "step": 3300 }, { "epoch": 2.3564847625797305, "grad_norm": 0.3934761881828308, "learning_rate": 0.0001, "loss": 2.5195, "step": 3325 }, { "epoch": 2.3742026931254427, "grad_norm": 0.47348254919052124, "learning_rate": 0.0001, "loss": 2.5055, "step": 3350 }, { "epoch": 2.3919206236711554, "grad_norm": 0.5292345881462097, "learning_rate": 0.0001, "loss": 2.3635, "step": 3375 }, { "epoch": 2.4096385542168672, "grad_norm": 0.6056796312332153, "learning_rate": 0.0001, "loss": 2.3827, "step": 3400 }, { "epoch": 2.4096385542168672, "eval_loss": 2.4244818687438965, "eval_runtime": 32.5864, "eval_samples_per_second": 9.145, "eval_steps_per_second": 2.302, "step": 3400 }, { "epoch": 2.42735648476258, "grad_norm": 0.2907162010669708, "learning_rate": 0.0001, "loss": 2.3354, "step": 3425 }, { "epoch": 2.445074415308292, "grad_norm": 0.43741077184677124, "learning_rate": 0.0001, "loss": 2.4199, "step": 3450 }, { "epoch": 2.4627923458540044, "grad_norm": 0.36141782999038696, "learning_rate": 0.0001, "loss": 2.4165, "step": 3475 }, { "epoch": 2.4805102763997167, "grad_norm": 0.5461854338645935, "learning_rate": 0.0001, "loss": 2.3487, "step": 3500 }, { "epoch": 2.4805102763997167, "eval_loss": 2.4221482276916504, "eval_runtime": 32.6681, "eval_samples_per_second": 9.122, "eval_steps_per_second": 2.296, "step": 3500 }, { "epoch": 2.498228206945429, "grad_norm": 0.61762934923172, "learning_rate": 0.0001, "loss": 2.4136, "step": 3525 }, { "epoch": 2.515946137491141, "grad_norm": 0.41114169359207153, "learning_rate": 0.0001, "loss": 2.3587, "step": 3550 }, { "epoch": 2.5336640680368534, "grad_norm": 0.5726279020309448, "learning_rate": 0.0001, "loss": 2.5009, "step": 3575 }, { "epoch": 2.5513819985825656, "grad_norm": 0.4807787239551544, "learning_rate": 0.0001, "loss": 2.4737, "step": 3600 }, { "epoch": 2.5513819985825656, "eval_loss": 2.4191701412200928, "eval_runtime": 32.6743, "eval_samples_per_second": 9.12, "eval_steps_per_second": 2.295, "step": 3600 }, { "epoch": 2.569099929128278, "grad_norm": 0.5931722521781921, "learning_rate": 0.0001, "loss": 2.4178, "step": 3625 }, { "epoch": 2.58681785967399, "grad_norm": 0.4658395051956177, "learning_rate": 0.0001, "loss": 2.5162, "step": 3650 }, { "epoch": 2.6045357902197024, "grad_norm": 0.5829235315322876, "learning_rate": 0.0001, "loss": 2.3402, "step": 3675 }, { "epoch": 2.6222537207654146, "grad_norm": 0.6382436156272888, "learning_rate": 0.0001, "loss": 2.4907, "step": 3700 }, { "epoch": 2.6222537207654146, "eval_loss": 2.417147397994995, "eval_runtime": 32.6259, "eval_samples_per_second": 9.134, "eval_steps_per_second": 2.299, "step": 3700 }, { "epoch": 2.639971651311127, "grad_norm": 0.578823983669281, "learning_rate": 0.0001, "loss": 2.4281, "step": 3725 }, { "epoch": 2.657689581856839, "grad_norm": 0.5311617255210876, "learning_rate": 0.0001, "loss": 2.4315, "step": 3750 }, { "epoch": 2.6754075124025514, "grad_norm": 0.4713846743106842, "learning_rate": 0.0001, "loss": 2.4268, "step": 3775 }, { "epoch": 2.6931254429482636, "grad_norm": 0.7472273111343384, "learning_rate": 0.0001, "loss": 2.3967, "step": 3800 }, { "epoch": 2.6931254429482636, "eval_loss": 2.415893077850342, "eval_runtime": 32.6303, "eval_samples_per_second": 9.133, "eval_steps_per_second": 2.298, "step": 3800 }, { "epoch": 2.710843373493976, "grad_norm": 0.5875506401062012, "learning_rate": 0.0001, "loss": 2.3541, "step": 3825 }, { "epoch": 2.728561304039688, "grad_norm": 0.38152602314949036, "learning_rate": 0.0001, "loss": 2.4623, "step": 3850 }, { "epoch": 2.7462792345854004, "grad_norm": 0.35034802556037903, "learning_rate": 0.0001, "loss": 2.4392, "step": 3875 }, { "epoch": 2.7639971651311126, "grad_norm": 0.3683781027793884, "learning_rate": 0.0001, "loss": 2.4772, "step": 3900 }, { "epoch": 2.7639971651311126, "eval_loss": 2.414635181427002, "eval_runtime": 32.6546, "eval_samples_per_second": 9.126, "eval_steps_per_second": 2.297, "step": 3900 }, { "epoch": 2.781715095676825, "grad_norm": 0.632203221321106, "learning_rate": 0.0001, "loss": 2.48, "step": 3925 }, { "epoch": 2.799433026222537, "grad_norm": 0.4688514173030853, "learning_rate": 0.0001, "loss": 2.4058, "step": 3950 }, { "epoch": 2.8171509567682493, "grad_norm": 0.3703823685646057, "learning_rate": 0.0001, "loss": 2.3802, "step": 3975 }, { "epoch": 2.8348688873139616, "grad_norm": 0.4395906329154968, "learning_rate": 0.0001, "loss": 2.4114, "step": 4000 }, { "epoch": 2.8348688873139616, "eval_loss": 2.4105727672576904, "eval_runtime": 32.6241, "eval_samples_per_second": 9.134, "eval_steps_per_second": 2.299, "step": 4000 }, { "epoch": 2.852586817859674, "grad_norm": 0.3975163996219635, "learning_rate": 0.0001, "loss": 2.4189, "step": 4025 }, { "epoch": 2.8703047484053865, "grad_norm": 0.37457334995269775, "learning_rate": 0.0001, "loss": 2.3946, "step": 4050 }, { "epoch": 2.8880226789510983, "grad_norm": 0.3786819279193878, "learning_rate": 0.0001, "loss": 2.4683, "step": 4075 }, { "epoch": 2.905740609496811, "grad_norm": 0.633921205997467, "learning_rate": 0.0001, "loss": 2.4017, "step": 4100 }, { "epoch": 2.905740609496811, "eval_loss": 2.406451463699341, "eval_runtime": 32.6727, "eval_samples_per_second": 9.121, "eval_steps_per_second": 2.295, "step": 4100 }, { "epoch": 2.923458540042523, "grad_norm": 0.9005404710769653, "learning_rate": 0.0001, "loss": 2.4099, "step": 4125 }, { "epoch": 2.9411764705882355, "grad_norm": 0.5802463293075562, "learning_rate": 0.0001, "loss": 2.4068, "step": 4150 }, { "epoch": 2.9588944011339473, "grad_norm": 0.3155713975429535, "learning_rate": 0.0001, "loss": 2.3682, "step": 4175 }, { "epoch": 2.97661233167966, "grad_norm": 0.4876560568809509, "learning_rate": 0.0001, "loss": 2.3477, "step": 4200 }, { "epoch": 2.97661233167966, "eval_loss": 2.405850648880005, "eval_runtime": 32.6084, "eval_samples_per_second": 9.139, "eval_steps_per_second": 2.3, "step": 4200 }, { "epoch": 2.9943302622253722, "grad_norm": 0.49624720215797424, "learning_rate": 0.0001, "loss": 2.445, "step": 4225 }, { "epoch": 3.0, "step": 4233, "total_flos": 9152314711474176.0, "train_loss": 2.5398848939386913, "train_runtime": 5263.4986, "train_samples_per_second": 3.217, "train_steps_per_second": 0.804 } ], "logging_steps": 25, "max_steps": 4233, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 9152314711474176.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }