{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 808, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0024752475247524753, "grad_norm": 38.88609991114745, "learning_rate": 2.469135802469136e-07, "loss": 1.8604, "step": 1 }, { "epoch": 0.012376237623762377, "grad_norm": 41.27939914036261, "learning_rate": 1.234567901234568e-06, "loss": 1.8561, "step": 5 }, { "epoch": 0.024752475247524754, "grad_norm": 5.095317934955837, "learning_rate": 2.469135802469136e-06, "loss": 1.6195, "step": 10 }, { "epoch": 0.03712871287128713, "grad_norm": 2.761116338631256, "learning_rate": 3.7037037037037037e-06, "loss": 1.3811, "step": 15 }, { "epoch": 0.04950495049504951, "grad_norm": 1.6927067748367093, "learning_rate": 4.938271604938272e-06, "loss": 1.1736, "step": 20 }, { "epoch": 0.06188118811881188, "grad_norm": 1.637997489475765, "learning_rate": 6.17283950617284e-06, "loss": 1.0247, "step": 25 }, { "epoch": 0.07425742574257425, "grad_norm": 0.9921453622709451, "learning_rate": 7.4074074074074075e-06, "loss": 1.0275, "step": 30 }, { "epoch": 0.08663366336633663, "grad_norm": 0.9409275181475548, "learning_rate": 8.641975308641975e-06, "loss": 0.9832, "step": 35 }, { "epoch": 0.09900990099009901, "grad_norm": 1.0325275844641013, "learning_rate": 9.876543209876543e-06, "loss": 0.9685, "step": 40 }, { "epoch": 0.11138613861386139, "grad_norm": 1.1218773632200145, "learning_rate": 1.1111111111111113e-05, "loss": 0.9528, "step": 45 }, { "epoch": 0.12376237623762376, "grad_norm": 0.9572180193224005, "learning_rate": 1.234567901234568e-05, "loss": 0.9201, "step": 50 }, { "epoch": 0.13613861386138615, "grad_norm": 1.0009650827934435, "learning_rate": 1.3580246913580248e-05, "loss": 0.9542, "step": 55 }, { "epoch": 0.1485148514851485, "grad_norm": 1.3065165374147407, "learning_rate": 1.4814814814814815e-05, "loss": 0.9437, "step": 60 }, { "epoch": 0.1608910891089109, "grad_norm": 1.0022123167697776, "learning_rate": 1.6049382716049385e-05, "loss": 0.9228, "step": 65 }, { "epoch": 0.17326732673267325, "grad_norm": 1.2184130543835685, "learning_rate": 1.728395061728395e-05, "loss": 0.9772, "step": 70 }, { "epoch": 0.18564356435643564, "grad_norm": 0.9408942641640691, "learning_rate": 1.851851851851852e-05, "loss": 0.9284, "step": 75 }, { "epoch": 0.19801980198019803, "grad_norm": 0.9712584145776969, "learning_rate": 1.9753086419753087e-05, "loss": 0.9484, "step": 80 }, { "epoch": 0.2103960396039604, "grad_norm": 0.9980192527761912, "learning_rate": 1.999850613931615e-05, "loss": 0.9536, "step": 85 }, { "epoch": 0.22277227722772278, "grad_norm": 0.9890748697192007, "learning_rate": 1.9992438095219886e-05, "loss": 0.9539, "step": 90 }, { "epoch": 0.23514851485148514, "grad_norm": 0.9155960993755162, "learning_rate": 1.9981705331953295e-05, "loss": 0.936, "step": 95 }, { "epoch": 0.24752475247524752, "grad_norm": 0.8051611539712368, "learning_rate": 1.996631285983779e-05, "loss": 0.9456, "step": 100 }, { "epoch": 0.2599009900990099, "grad_norm": 0.9230091255662991, "learning_rate": 1.9946267864463027e-05, "loss": 0.9136, "step": 105 }, { "epoch": 0.2722772277227723, "grad_norm": 0.8382334460112006, "learning_rate": 1.9921579703332475e-05, "loss": 0.93, "step": 110 }, { "epoch": 0.28465346534653463, "grad_norm": 0.8774240428827245, "learning_rate": 1.989225990149512e-05, "loss": 0.9245, "step": 115 }, { "epoch": 0.297029702970297, "grad_norm": 0.8987251464829349, "learning_rate": 1.9858322146165272e-05, "loss": 0.9341, "step": 120 }, { "epoch": 0.3094059405940594, "grad_norm": 0.9049306018652525, "learning_rate": 1.981978228033304e-05, "loss": 0.9257, "step": 125 }, { "epoch": 0.3217821782178218, "grad_norm": 0.9029524926074908, "learning_rate": 1.977665829536842e-05, "loss": 0.916, "step": 130 }, { "epoch": 0.3341584158415842, "grad_norm": 0.8591727043131795, "learning_rate": 1.9728970322622485e-05, "loss": 0.9168, "step": 135 }, { "epoch": 0.3465346534653465, "grad_norm": 0.8712548282993846, "learning_rate": 1.9676740624029566e-05, "loss": 0.9347, "step": 140 }, { "epoch": 0.3589108910891089, "grad_norm": 0.8442378198048982, "learning_rate": 1.961999358171482e-05, "loss": 0.9286, "step": 145 }, { "epoch": 0.3712871287128713, "grad_norm": 0.8448891617644472, "learning_rate": 1.955875568661206e-05, "loss": 0.9247, "step": 150 }, { "epoch": 0.38366336633663367, "grad_norm": 0.8506188708401377, "learning_rate": 1.94930555260971e-05, "loss": 0.9089, "step": 155 }, { "epoch": 0.39603960396039606, "grad_norm": 0.7615649650166222, "learning_rate": 1.9422923770642494e-05, "loss": 0.9121, "step": 160 }, { "epoch": 0.4084158415841584, "grad_norm": 0.8438076926475072, "learning_rate": 1.934839315949976e-05, "loss": 0.9133, "step": 165 }, { "epoch": 0.4207920792079208, "grad_norm": 0.9246754527016784, "learning_rate": 1.9269498485415897e-05, "loss": 0.9298, "step": 170 }, { "epoch": 0.43316831683168316, "grad_norm": 0.9627433039386685, "learning_rate": 1.9186276578391268e-05, "loss": 0.9421, "step": 175 }, { "epoch": 0.44554455445544555, "grad_norm": 0.8087976752191404, "learning_rate": 1.9098766288486426e-05, "loss": 0.9261, "step": 180 }, { "epoch": 0.45792079207920794, "grad_norm": 0.774521835464554, "learning_rate": 1.9007008467685947e-05, "loss": 0.9328, "step": 185 }, { "epoch": 0.47029702970297027, "grad_norm": 0.7725284205468596, "learning_rate": 1.8911045950827693e-05, "loss": 0.9093, "step": 190 }, { "epoch": 0.48267326732673266, "grad_norm": 0.8054030551821266, "learning_rate": 1.881092353560646e-05, "loss": 0.9013, "step": 195 }, { "epoch": 0.49504950495049505, "grad_norm": 0.7758400337225813, "learning_rate": 1.870668796166129e-05, "loss": 0.9142, "step": 200 }, { "epoch": 0.5074257425742574, "grad_norm": 0.8158556574805117, "learning_rate": 1.8598387888756224e-05, "loss": 0.9222, "step": 205 }, { "epoch": 0.5198019801980198, "grad_norm": 0.7595417385088049, "learning_rate": 1.8486073874064745e-05, "loss": 0.9061, "step": 210 }, { "epoch": 0.5321782178217822, "grad_norm": 0.7472799231884507, "learning_rate": 1.8369798348568403e-05, "loss": 0.9083, "step": 215 }, { "epoch": 0.5445544554455446, "grad_norm": 0.7606289538163604, "learning_rate": 1.8249615592580733e-05, "loss": 0.9328, "step": 220 }, { "epoch": 0.556930693069307, "grad_norm": 0.805186253739664, "learning_rate": 1.8125581710407864e-05, "loss": 0.9138, "step": 225 }, { "epoch": 0.5693069306930693, "grad_norm": 0.767139039892534, "learning_rate": 1.7997754604157607e-05, "loss": 0.9075, "step": 230 }, { "epoch": 0.5816831683168316, "grad_norm": 0.7674924368854081, "learning_rate": 1.786619394670933e-05, "loss": 0.9094, "step": 235 }, { "epoch": 0.594059405940594, "grad_norm": 0.7416157345263308, "learning_rate": 1.7730961153857155e-05, "loss": 0.9146, "step": 240 }, { "epoch": 0.6064356435643564, "grad_norm": 0.7972512834290945, "learning_rate": 1.7592119355639545e-05, "loss": 0.8986, "step": 245 }, { "epoch": 0.6188118811881188, "grad_norm": 0.7413943937305464, "learning_rate": 1.744973336686862e-05, "loss": 0.9261, "step": 250 }, { "epoch": 0.6311881188118812, "grad_norm": 0.7288799636758897, "learning_rate": 1.7303869656872994e-05, "loss": 0.9004, "step": 255 }, { "epoch": 0.6435643564356436, "grad_norm": 0.7554557666231643, "learning_rate": 1.715459631846824e-05, "loss": 0.9097, "step": 260 }, { "epoch": 0.655940594059406, "grad_norm": 0.7267320540898269, "learning_rate": 1.700198303616944e-05, "loss": 0.902, "step": 265 }, { "epoch": 0.6683168316831684, "grad_norm": 0.6823040717527771, "learning_rate": 1.684610105366076e-05, "loss": 0.8913, "step": 270 }, { "epoch": 0.6806930693069307, "grad_norm": 0.6787438356187852, "learning_rate": 1.6687023140537082e-05, "loss": 0.882, "step": 275 }, { "epoch": 0.693069306930693, "grad_norm": 0.7420049831517054, "learning_rate": 1.6524823558333362e-05, "loss": 0.8985, "step": 280 }, { "epoch": 0.7054455445544554, "grad_norm": 0.7627405635266166, "learning_rate": 1.6359578025857495e-05, "loss": 0.8836, "step": 285 }, { "epoch": 0.7178217821782178, "grad_norm": 0.8778998231303615, "learning_rate": 1.6191363683842883e-05, "loss": 0.8871, "step": 290 }, { "epoch": 0.7301980198019802, "grad_norm": 0.8167069067691852, "learning_rate": 1.6020259058937228e-05, "loss": 0.8866, "step": 295 }, { "epoch": 0.7425742574257426, "grad_norm": 0.7980390861743758, "learning_rate": 1.5846344027044307e-05, "loss": 0.9047, "step": 300 }, { "epoch": 0.754950495049505, "grad_norm": 0.7563819762216695, "learning_rate": 1.5669699776035958e-05, "loss": 0.921, "step": 305 }, { "epoch": 0.7673267326732673, "grad_norm": 0.7797762078681911, "learning_rate": 1.5490408767851506e-05, "loss": 0.8869, "step": 310 }, { "epoch": 0.7797029702970297, "grad_norm": 0.7666252485199382, "learning_rate": 1.530855470000251e-05, "loss": 0.9151, "step": 315 }, { "epoch": 0.7920792079207921, "grad_norm": 0.7051563046414802, "learning_rate": 1.5124222466500665e-05, "loss": 0.9024, "step": 320 }, { "epoch": 0.8044554455445545, "grad_norm": 0.7203691895762679, "learning_rate": 1.4937498118227156e-05, "loss": 0.9098, "step": 325 }, { "epoch": 0.8168316831683168, "grad_norm": 0.7125293698622911, "learning_rate": 1.4748468822761974e-05, "loss": 0.9076, "step": 330 }, { "epoch": 0.8292079207920792, "grad_norm": 0.7428102709759479, "learning_rate": 1.4557222823691913e-05, "loss": 0.9082, "step": 335 }, { "epoch": 0.8415841584158416, "grad_norm": 0.7230219909417678, "learning_rate": 1.4363849399416254e-05, "loss": 0.9004, "step": 340 }, { "epoch": 0.8539603960396039, "grad_norm": 0.788928114591889, "learning_rate": 1.4168438821469402e-05, "loss": 0.8845, "step": 345 }, { "epoch": 0.8663366336633663, "grad_norm": 0.7456926963995498, "learning_rate": 1.3971082312379864e-05, "loss": 0.9013, "step": 350 }, { "epoch": 0.8787128712871287, "grad_norm": 0.7757067249831519, "learning_rate": 1.3771872003085315e-05, "loss": 0.8913, "step": 355 }, { "epoch": 0.8910891089108911, "grad_norm": 0.7040237849520753, "learning_rate": 1.3570900889923566e-05, "loss": 0.9178, "step": 360 }, { "epoch": 0.9034653465346535, "grad_norm": 0.7028463213970978, "learning_rate": 1.3368262791219568e-05, "loss": 0.8864, "step": 365 }, { "epoch": 0.9158415841584159, "grad_norm": 0.6865489126472716, "learning_rate": 1.3164052303488673e-05, "loss": 0.8958, "step": 370 }, { "epoch": 0.9282178217821783, "grad_norm": 0.7124826316517546, "learning_rate": 1.2958364757276616e-05, "loss": 0.8927, "step": 375 }, { "epoch": 0.9405940594059405, "grad_norm": 0.71875227320249, "learning_rate": 1.2751296172656862e-05, "loss": 0.897, "step": 380 }, { "epoch": 0.9529702970297029, "grad_norm": 0.7298632107047348, "learning_rate": 1.2542943214406012e-05, "loss": 0.9051, "step": 385 }, { "epoch": 0.9653465346534653, "grad_norm": 0.6885898822393058, "learning_rate": 1.23334031468783e-05, "loss": 0.8546, "step": 390 }, { "epoch": 0.9777227722772277, "grad_norm": 0.7078778884772863, "learning_rate": 1.2122773788600164e-05, "loss": 0.9019, "step": 395 }, { "epoch": 0.9900990099009901, "grad_norm": 0.7533302944336393, "learning_rate": 1.1911153466606105e-05, "loss": 0.895, "step": 400 }, { "epoch": 1.0, "eval_loss": 1.06302809715271, "eval_runtime": 43.2706, "eval_samples_per_second": 75.871, "eval_steps_per_second": 1.202, "step": 404 }, { "epoch": 1.0024752475247525, "grad_norm": 1.3960472434954319, "learning_rate": 1.1698640970537195e-05, "loss": 0.8479, "step": 405 }, { "epoch": 1.0148514851485149, "grad_norm": 0.8222898474391185, "learning_rate": 1.14853355065236e-05, "loss": 0.7492, "step": 410 }, { "epoch": 1.0272277227722773, "grad_norm": 1.146533181549737, "learning_rate": 1.1271336650872687e-05, "loss": 0.7352, "step": 415 }, { "epoch": 1.0396039603960396, "grad_norm": 1.2500390362477898, "learning_rate": 1.1056744303584322e-05, "loss": 0.7107, "step": 420 }, { "epoch": 1.051980198019802, "grad_norm": 0.8882156471951722, "learning_rate": 1.0841658641715064e-05, "loss": 0.7027, "step": 425 }, { "epoch": 1.0643564356435644, "grad_norm": 0.8783518588023339, "learning_rate": 1.0626180072613011e-05, "loss": 0.7199, "step": 430 }, { "epoch": 1.0767326732673268, "grad_norm": 0.7956936034755372, "learning_rate": 1.0410409187045145e-05, "loss": 0.6972, "step": 435 }, { "epoch": 1.0891089108910892, "grad_norm": 0.7957451930016578, "learning_rate": 1.0194446712239076e-05, "loss": 0.7194, "step": 440 }, { "epoch": 1.1014851485148516, "grad_norm": 0.8085539993808099, "learning_rate": 9.978393464861036e-06, "loss": 0.7085, "step": 445 }, { "epoch": 1.113861386138614, "grad_norm": 0.7629426599829563, "learning_rate": 9.76235030395215e-06, "loss": 0.7221, "step": 450 }, { "epoch": 1.1262376237623761, "grad_norm": 0.7099439929036686, "learning_rate": 9.546418083844944e-06, "loss": 0.7228, "step": 455 }, { "epoch": 1.1386138613861387, "grad_norm": 0.768143602885412, "learning_rate": 9.330697607081995e-06, "loss": 0.7055, "step": 460 }, { "epoch": 1.150990099009901, "grad_norm": 0.7399932448725586, "learning_rate": 9.115289577358826e-06, "loss": 0.7126, "step": 465 }, { "epoch": 1.1633663366336633, "grad_norm": 0.7720020506192959, "learning_rate": 8.900294552512878e-06, "loss": 0.7095, "step": 470 }, { "epoch": 1.1757425742574257, "grad_norm": 0.7452407090275405, "learning_rate": 8.68581289758063e-06, "loss": 0.7028, "step": 475 }, { "epoch": 1.188118811881188, "grad_norm": 0.7571246453363882, "learning_rate": 8.471944737944687e-06, "loss": 0.7184, "step": 480 }, { "epoch": 1.2004950495049505, "grad_norm": 0.7494453898237526, "learning_rate": 8.25878991259276e-06, "loss": 0.6982, "step": 485 }, { "epoch": 1.2128712871287128, "grad_norm": 0.7254138409040641, "learning_rate": 8.046447927510335e-06, "loss": 0.7175, "step": 490 }, { "epoch": 1.2252475247524752, "grad_norm": 0.7709180774648441, "learning_rate": 7.835017909228801e-06, "loss": 0.7075, "step": 495 }, { "epoch": 1.2376237623762376, "grad_norm": 0.760113272357165, "learning_rate": 7.624598558550707e-06, "loss": 0.7224, "step": 500 }, { "epoch": 1.25, "grad_norm": 0.7251746178009261, "learning_rate": 7.415288104473774e-06, "loss": 0.7059, "step": 505 }, { "epoch": 1.2623762376237624, "grad_norm": 0.7604221994294873, "learning_rate": 7.207184258335163e-06, "loss": 0.7022, "step": 510 }, { "epoch": 1.2747524752475248, "grad_norm": 0.7947902441474564, "learning_rate": 7.000384168197354e-06, "loss": 0.7076, "step": 515 }, { "epoch": 1.2871287128712872, "grad_norm": 0.7054449499686205, "learning_rate": 6.7949843734970475e-06, "loss": 0.7133, "step": 520 }, { "epoch": 1.2995049504950495, "grad_norm": 0.708545411017501, "learning_rate": 6.5910807599781135e-06, "loss": 0.7106, "step": 525 }, { "epoch": 1.311881188118812, "grad_norm": 0.7262363459114348, "learning_rate": 6.388768514929768e-06, "loss": 0.7114, "step": 530 }, { "epoch": 1.3242574257425743, "grad_norm": 0.6998186137578639, "learning_rate": 6.18814208275075e-06, "loss": 0.7122, "step": 535 }, { "epoch": 1.3366336633663367, "grad_norm": 0.7095860494984699, "learning_rate": 5.989295120860334e-06, "loss": 0.7252, "step": 540 }, { "epoch": 1.349009900990099, "grad_norm": 0.6915165561447487, "learning_rate": 5.792320455976714e-06, "loss": 0.7048, "step": 545 }, { "epoch": 1.3613861386138613, "grad_norm": 0.7119909691342722, "learning_rate": 5.597310040783161e-06, "loss": 0.6962, "step": 550 }, { "epoch": 1.3737623762376239, "grad_norm": 0.7504970044336818, "learning_rate": 5.404354911002243e-06, "loss": 0.707, "step": 555 }, { "epoch": 1.386138613861386, "grad_norm": 0.7146866752363458, "learning_rate": 5.213545142898061e-06, "loss": 0.7223, "step": 560 }, { "epoch": 1.3985148514851486, "grad_norm": 0.7356980622491117, "learning_rate": 5.024969811226419e-06, "loss": 0.6998, "step": 565 }, { "epoch": 1.4108910891089108, "grad_norm": 0.7061674308512749, "learning_rate": 4.838716947652485e-06, "loss": 0.6958, "step": 570 }, { "epoch": 1.4232673267326732, "grad_norm": 0.7284088021218859, "learning_rate": 4.654873499655449e-06, "loss": 0.6964, "step": 575 }, { "epoch": 1.4356435643564356, "grad_norm": 0.6886931942419792, "learning_rate": 4.4735252899392335e-06, "loss": 0.7023, "step": 580 }, { "epoch": 1.448019801980198, "grad_norm": 0.7166576506174707, "learning_rate": 4.294756976368351e-06, "loss": 0.7069, "step": 585 }, { "epoch": 1.4603960396039604, "grad_norm": 0.7032864356512767, "learning_rate": 4.118652012447486e-06, "loss": 0.7211, "step": 590 }, { "epoch": 1.4727722772277227, "grad_norm": 0.7242483728321348, "learning_rate": 3.945292608363312e-06, "loss": 0.7119, "step": 595 }, { "epoch": 1.4851485148514851, "grad_norm": 0.6918844982337927, "learning_rate": 3.7747596926067485e-06, "loss": 0.7221, "step": 600 }, { "epoch": 1.4975247524752475, "grad_norm": 0.7132050642871574, "learning_rate": 3.6071328741934985e-06, "loss": 0.7022, "step": 605 }, { "epoch": 1.50990099009901, "grad_norm": 0.7347264550601227, "learning_rate": 3.442490405500598e-06, "loss": 0.6975, "step": 610 }, { "epoch": 1.5222772277227723, "grad_norm": 0.6602507777994507, "learning_rate": 3.2809091457362464e-06, "loss": 0.7065, "step": 615 }, { "epoch": 1.5346534653465347, "grad_norm": 0.6740627682629335, "learning_rate": 3.122464525060013e-06, "loss": 0.6978, "step": 620 }, { "epoch": 1.547029702970297, "grad_norm": 0.7142221920558823, "learning_rate": 2.96723050937015e-06, "loss": 0.709, "step": 625 }, { "epoch": 1.5594059405940595, "grad_norm": 0.7335799725765082, "learning_rate": 2.8152795657744882e-06, "loss": 0.6893, "step": 630 }, { "epoch": 1.5717821782178216, "grad_norm": 0.7032104638603995, "learning_rate": 2.666682628760958e-06, "loss": 0.6961, "step": 635 }, { "epoch": 1.5841584158415842, "grad_norm": 0.6921897923437655, "learning_rate": 2.521509067083631e-06, "loss": 0.6938, "step": 640 }, { "epoch": 1.5965346534653464, "grad_norm": 0.6863228632248336, "learning_rate": 2.379826651379632e-06, "loss": 0.7033, "step": 645 }, { "epoch": 1.608910891089109, "grad_norm": 0.6778666475373859, "learning_rate": 2.241701522532136e-06, "loss": 0.7077, "step": 650 }, { "epoch": 1.6212871287128712, "grad_norm": 0.978566748682469, "learning_rate": 2.107198160794136e-06, "loss": 0.7102, "step": 655 }, { "epoch": 1.6336633663366338, "grad_norm": 0.7103004954900359, "learning_rate": 1.9763793556874655e-06, "loss": 0.6983, "step": 660 }, { "epoch": 1.646039603960396, "grad_norm": 0.676195777058198, "learning_rate": 1.849306176691088e-06, "loss": 0.7176, "step": 665 }, { "epoch": 1.6584158415841586, "grad_norm": 0.7079808939239, "learning_rate": 1.7260379447323327e-06, "loss": 0.6998, "step": 670 }, { "epoch": 1.6707920792079207, "grad_norm": 0.6910118308885802, "learning_rate": 1.6066322044944126e-06, "loss": 0.6847, "step": 675 }, { "epoch": 1.6831683168316833, "grad_norm": 0.6904569392825494, "learning_rate": 1.4911446975531329e-06, "loss": 0.7014, "step": 680 }, { "epoch": 1.6955445544554455, "grad_norm": 0.6979836145792466, "learning_rate": 1.3796293363553259e-06, "loss": 0.7252, "step": 685 }, { "epoch": 1.7079207920792079, "grad_norm": 0.6898195721719471, "learning_rate": 1.2721381790511832e-06, "loss": 0.7096, "step": 690 }, { "epoch": 1.7202970297029703, "grad_norm": 0.7872710836498652, "learning_rate": 1.168721405192218e-06, "loss": 0.7118, "step": 695 }, { "epoch": 1.7326732673267327, "grad_norm": 0.6351928901398578, "learning_rate": 1.0694272923061933e-06, "loss": 0.7073, "step": 700 }, { "epoch": 1.745049504950495, "grad_norm": 0.6550150479810684, "learning_rate": 9.743021933599695e-07, "loss": 0.6879, "step": 705 }, { "epoch": 1.7574257425742574, "grad_norm": 0.6762595095414403, "learning_rate": 8.833905151207833e-07, "loss": 0.6972, "step": 710 }, { "epoch": 1.7698019801980198, "grad_norm": 0.7046793104440422, "learning_rate": 7.967346974260626e-07, "loss": 0.7119, "step": 715 }, { "epoch": 1.7821782178217822, "grad_norm": 0.6803563399348503, "learning_rate": 7.143751933714583e-07, "loss": 0.7064, "step": 720 }, { "epoch": 1.7945544554455446, "grad_norm": 0.6569063272572411, "learning_rate": 6.363504504263207e-07, "loss": 0.694, "step": 725 }, { "epoch": 1.806930693069307, "grad_norm": 0.672487356470194, "learning_rate": 5.626968924854714e-07, "loss": 0.7133, "step": 730 }, { "epoch": 1.8193069306930694, "grad_norm": 0.6612411849360775, "learning_rate": 4.934489028656164e-07, "loss": 0.6933, "step": 735 }, { "epoch": 1.8316831683168315, "grad_norm": 0.7156254594176831, "learning_rate": 4.2863880825435687e-07, "loss": 0.7057, "step": 740 }, { "epoch": 1.8440594059405941, "grad_norm": 0.6833411888683371, "learning_rate": 3.682968636192863e-07, "loss": 0.7112, "step": 745 }, { "epoch": 1.8564356435643563, "grad_norm": 0.6969326586876218, "learning_rate": 3.124512380842204e-07, "loss": 0.6912, "step": 750 }, { "epoch": 1.868811881188119, "grad_norm": 0.6742873155450232, "learning_rate": 2.61128001779144e-07, "loss": 0.6823, "step": 755 }, { "epoch": 1.881188118811881, "grad_norm": 0.6414949104474531, "learning_rate": 2.1435111367002826e-07, "loss": 0.7126, "step": 760 }, { "epoch": 1.8935643564356437, "grad_norm": 0.6525583867438978, "learning_rate": 1.7214241037418312e-07, "loss": 0.6969, "step": 765 }, { "epoch": 1.9059405940594059, "grad_norm": 0.65521211994519, "learning_rate": 1.345215959663837e-07, "loss": 0.6903, "step": 770 }, { "epoch": 1.9183168316831685, "grad_norm": 0.6944361037558262, "learning_rate": 1.0150623278051719e-07, "loss": 0.6912, "step": 775 }, { "epoch": 1.9306930693069306, "grad_norm": 0.648743159889204, "learning_rate": 7.311173321104648e-08, "loss": 0.683, "step": 780 }, { "epoch": 1.943069306930693, "grad_norm": 0.6925205112702544, "learning_rate": 4.935135251811995e-08, "loss": 0.6934, "step": 785 }, { "epoch": 1.9554455445544554, "grad_norm": 0.6504666964480196, "learning_rate": 3.023618263968797e-08, "loss": 0.7186, "step": 790 }, { "epoch": 1.9678217821782178, "grad_norm": 0.7265261251377658, "learning_rate": 1.577514701350591e-08, "loss": 0.7216, "step": 795 }, { "epoch": 1.9801980198019802, "grad_norm": 0.6739382260611008, "learning_rate": 5.97499641145416e-09, "loss": 0.6928, "step": 800 }, { "epoch": 1.9925742574257426, "grad_norm": 0.6660006595388417, "learning_rate": 8.403057881067877e-10, "loss": 0.6941, "step": 805 }, { "epoch": 2.0, "eval_loss": 1.07817542552948, "eval_runtime": 43.2932, "eval_samples_per_second": 75.832, "eval_steps_per_second": 1.201, "step": 808 }, { "epoch": 2.0, "step": 808, "total_flos": 169178761789440.0, "train_loss": 0.8271688128461933, "train_runtime": 2411.6836, "train_samples_per_second": 21.427, "train_steps_per_second": 0.335 } ], "logging_steps": 5, "max_steps": 808, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 169178761789440.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }