{ "best_metric": 11.920478820800781, "best_model_checkpoint": "miner_id_24/checkpoint-150", "epoch": 0.9345794392523364, "eval_steps": 25, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004672897196261682, "grad_norm": 0.015262666158378124, "learning_rate": 2.9999999999999997e-05, "loss": 11.9336, "step": 1 }, { "epoch": 0.004672897196261682, "eval_loss": 11.933741569519043, "eval_runtime": 0.2661, "eval_samples_per_second": 187.865, "eval_steps_per_second": 26.301, "step": 1 }, { "epoch": 0.009345794392523364, "grad_norm": 0.010072018951177597, "learning_rate": 5.9999999999999995e-05, "loss": 11.9349, "step": 2 }, { "epoch": 0.014018691588785047, "grad_norm": 0.014539014548063278, "learning_rate": 8.999999999999999e-05, "loss": 11.9336, "step": 3 }, { "epoch": 0.018691588785046728, "grad_norm": 0.012826521880924702, "learning_rate": 0.00011999999999999999, "loss": 11.9343, "step": 4 }, { "epoch": 0.02336448598130841, "grad_norm": 0.01322372816503048, "learning_rate": 0.00015, "loss": 11.9332, "step": 5 }, { "epoch": 0.028037383177570093, "grad_norm": 0.011686271987855434, "learning_rate": 0.00017999999999999998, "loss": 11.9324, "step": 6 }, { "epoch": 0.03271028037383177, "grad_norm": 0.01585233211517334, "learning_rate": 0.00020999999999999998, "loss": 11.9335, "step": 7 }, { "epoch": 0.037383177570093455, "grad_norm": 0.011046156287193298, "learning_rate": 0.00023999999999999998, "loss": 11.9347, "step": 8 }, { "epoch": 0.04205607476635514, "grad_norm": 0.01065156888216734, "learning_rate": 0.00027, "loss": 11.9333, "step": 9 }, { "epoch": 0.04672897196261682, "grad_norm": 0.00867808423936367, "learning_rate": 0.0003, "loss": 11.9356, "step": 10 }, { "epoch": 0.0514018691588785, "grad_norm": 0.011804298497736454, "learning_rate": 0.0002999794957488703, "loss": 11.933, "step": 11 }, { "epoch": 0.056074766355140186, "grad_norm": 0.012750547379255295, "learning_rate": 0.0002999179886011389, "loss": 11.9328, "step": 12 }, { "epoch": 0.06074766355140187, "grad_norm": 0.010166078805923462, "learning_rate": 0.0002998154953722457, "loss": 11.934, "step": 13 }, { "epoch": 0.06542056074766354, "grad_norm": 0.008991819806396961, "learning_rate": 0.00029967204408281613, "loss": 11.9359, "step": 14 }, { "epoch": 0.07009345794392523, "grad_norm": 0.012300461530685425, "learning_rate": 0.00029948767395100045, "loss": 11.9335, "step": 15 }, { "epoch": 0.07476635514018691, "grad_norm": 0.01353101059794426, "learning_rate": 0.0002992624353817517, "loss": 11.9321, "step": 16 }, { "epoch": 0.0794392523364486, "grad_norm": 0.011064659804105759, "learning_rate": 0.0002989963899530457, "loss": 11.9329, "step": 17 }, { "epoch": 0.08411214953271028, "grad_norm": 0.010729066096246243, "learning_rate": 0.00029868961039904624, "loss": 11.9339, "step": 18 }, { "epoch": 0.08878504672897196, "grad_norm": 0.012338140979409218, "learning_rate": 0.00029834218059022024, "loss": 11.933, "step": 19 }, { "epoch": 0.09345794392523364, "grad_norm": 0.011216608807444572, "learning_rate": 0.00029795419551040833, "loss": 11.9326, "step": 20 }, { "epoch": 0.09813084112149532, "grad_norm": 0.011954426765441895, "learning_rate": 0.00029752576123085736, "loss": 11.932, "step": 21 }, { "epoch": 0.102803738317757, "grad_norm": 0.012529377825558186, "learning_rate": 0.0002970569948812214, "loss": 11.9341, "step": 22 }, { "epoch": 0.10747663551401869, "grad_norm": 0.013572929427027702, "learning_rate": 0.0002965480246175399, "loss": 11.9341, "step": 23 }, { "epoch": 0.11214953271028037, "grad_norm": 0.014423901215195656, "learning_rate": 0.0002959989895872009, "loss": 11.9338, "step": 24 }, { "epoch": 0.11682242990654206, "grad_norm": 0.01651933416724205, "learning_rate": 0.0002954100398908995, "loss": 11.9333, "step": 25 }, { "epoch": 0.11682242990654206, "eval_loss": 11.932720184326172, "eval_runtime": 0.2737, "eval_samples_per_second": 182.703, "eval_steps_per_second": 25.578, "step": 25 }, { "epoch": 0.12149532710280374, "grad_norm": 0.01691250689327717, "learning_rate": 0.0002947813365416023, "loss": 11.9356, "step": 26 }, { "epoch": 0.1261682242990654, "grad_norm": 0.01488806214183569, "learning_rate": 0.0002941130514205272, "loss": 11.9327, "step": 27 }, { "epoch": 0.1308411214953271, "grad_norm": 0.011068173684179783, "learning_rate": 0.0002934053672301536, "loss": 11.9306, "step": 28 }, { "epoch": 0.13551401869158877, "grad_norm": 0.01614326983690262, "learning_rate": 0.00029265847744427303, "loss": 11.9339, "step": 29 }, { "epoch": 0.14018691588785046, "grad_norm": 0.017582055181264877, "learning_rate": 0.00029187258625509513, "loss": 11.9326, "step": 30 }, { "epoch": 0.14485981308411214, "grad_norm": 0.021076716482639313, "learning_rate": 0.00029104790851742417, "loss": 11.9312, "step": 31 }, { "epoch": 0.14953271028037382, "grad_norm": 0.031267255544662476, "learning_rate": 0.0002901846696899191, "loss": 11.9309, "step": 32 }, { "epoch": 0.1542056074766355, "grad_norm": 0.028096068650484085, "learning_rate": 0.00028928310577345606, "loss": 11.9308, "step": 33 }, { "epoch": 0.1588785046728972, "grad_norm": 0.024486729875206947, "learning_rate": 0.0002883434632466077, "loss": 11.934, "step": 34 }, { "epoch": 0.16355140186915887, "grad_norm": 0.021672122180461884, "learning_rate": 0.00028736599899825856, "loss": 11.9301, "step": 35 }, { "epoch": 0.16822429906542055, "grad_norm": 0.0294785276055336, "learning_rate": 0.00028635098025737434, "loss": 11.9303, "step": 36 }, { "epoch": 0.17289719626168223, "grad_norm": 0.029576191678643227, "learning_rate": 0.00028529868451994384, "loss": 11.9298, "step": 37 }, { "epoch": 0.17757009345794392, "grad_norm": 0.03013836406171322, "learning_rate": 0.0002842093994731145, "loss": 11.9297, "step": 38 }, { "epoch": 0.1822429906542056, "grad_norm": 0.039979491382837296, "learning_rate": 0.00028308342291654174, "loss": 11.9309, "step": 39 }, { "epoch": 0.18691588785046728, "grad_norm": 0.047189485281705856, "learning_rate": 0.00028192106268097334, "loss": 11.9321, "step": 40 }, { "epoch": 0.19158878504672897, "grad_norm": 0.04108109697699547, "learning_rate": 0.00028072263654409154, "loss": 11.9305, "step": 41 }, { "epoch": 0.19626168224299065, "grad_norm": 0.05130422115325928, "learning_rate": 0.0002794884721436361, "loss": 11.932, "step": 42 }, { "epoch": 0.20093457943925233, "grad_norm": 0.041305627673864365, "learning_rate": 0.00027821890688783083, "loss": 11.9291, "step": 43 }, { "epoch": 0.205607476635514, "grad_norm": 0.05282793566584587, "learning_rate": 0.0002769142878631403, "loss": 11.9339, "step": 44 }, { "epoch": 0.2102803738317757, "grad_norm": 0.04912222549319267, "learning_rate": 0.00027557497173937923, "loss": 11.927, "step": 45 }, { "epoch": 0.21495327102803738, "grad_norm": 0.07533613592386246, "learning_rate": 0.000274201324672203, "loss": 11.9275, "step": 46 }, { "epoch": 0.21962616822429906, "grad_norm": 0.06588118523359299, "learning_rate": 0.00027279372220300385, "loss": 11.931, "step": 47 }, { "epoch": 0.22429906542056074, "grad_norm": 0.07589995115995407, "learning_rate": 0.0002713525491562421, "loss": 11.928, "step": 48 }, { "epoch": 0.22897196261682243, "grad_norm": 0.06371214240789413, "learning_rate": 0.00026987819953423867, "loss": 11.9286, "step": 49 }, { "epoch": 0.2336448598130841, "grad_norm": 0.0883798599243164, "learning_rate": 0.00026837107640945905, "loss": 11.932, "step": 50 }, { "epoch": 0.2336448598130841, "eval_loss": 11.927355766296387, "eval_runtime": 0.2651, "eval_samples_per_second": 188.605, "eval_steps_per_second": 26.405, "step": 50 }, { "epoch": 0.2383177570093458, "grad_norm": 0.04947098344564438, "learning_rate": 0.0002668315918143169, "loss": 11.9246, "step": 51 }, { "epoch": 0.24299065420560748, "grad_norm": 0.029438398778438568, "learning_rate": 0.00026526016662852886, "loss": 11.9282, "step": 52 }, { "epoch": 0.24766355140186916, "grad_norm": 0.046046871691942215, "learning_rate": 0.00026365723046405023, "loss": 11.9266, "step": 53 }, { "epoch": 0.2523364485981308, "grad_norm": 0.04566225782036781, "learning_rate": 0.0002620232215476231, "loss": 11.9255, "step": 54 }, { "epoch": 0.2570093457943925, "grad_norm": 0.03357386589050293, "learning_rate": 0.0002603585866009697, "loss": 11.9266, "step": 55 }, { "epoch": 0.2616822429906542, "grad_norm": 0.03560081124305725, "learning_rate": 0.00025866378071866334, "loss": 11.9264, "step": 56 }, { "epoch": 0.26635514018691586, "grad_norm": 0.03819046542048454, "learning_rate": 0.00025693926724370956, "loss": 11.9281, "step": 57 }, { "epoch": 0.27102803738317754, "grad_norm": 0.04918365925550461, "learning_rate": 0.00025518551764087326, "loss": 11.9252, "step": 58 }, { "epoch": 0.2757009345794392, "grad_norm": 0.03922976925969124, "learning_rate": 0.00025340301136778483, "loss": 11.9276, "step": 59 }, { "epoch": 0.2803738317757009, "grad_norm": 0.032605502754449844, "learning_rate": 0.00025159223574386114, "loss": 11.927, "step": 60 }, { "epoch": 0.2850467289719626, "grad_norm": 0.03251456469297409, "learning_rate": 0.0002497536858170772, "loss": 11.9229, "step": 61 }, { "epoch": 0.2897196261682243, "grad_norm": 0.03914351388812065, "learning_rate": 0.00024788786422862526, "loss": 11.9239, "step": 62 }, { "epoch": 0.29439252336448596, "grad_norm": 0.030496085062623024, "learning_rate": 0.00024599528107549745, "loss": 11.9252, "step": 63 }, { "epoch": 0.29906542056074764, "grad_norm": 0.033903758972883224, "learning_rate": 0.00024407645377103054, "loss": 11.9216, "step": 64 }, { "epoch": 0.3037383177570093, "grad_norm": 0.03244215250015259, "learning_rate": 0.00024213190690345018, "loss": 11.9231, "step": 65 }, { "epoch": 0.308411214953271, "grad_norm": 0.034409601241350174, "learning_rate": 0.00024016217209245374, "loss": 11.9226, "step": 66 }, { "epoch": 0.3130841121495327, "grad_norm": 0.040117502212524414, "learning_rate": 0.00023816778784387094, "loss": 11.9235, "step": 67 }, { "epoch": 0.3177570093457944, "grad_norm": 0.03953894227743149, "learning_rate": 0.0002361492994024415, "loss": 11.9224, "step": 68 }, { "epoch": 0.32242990654205606, "grad_norm": 0.026788828894495964, "learning_rate": 0.0002341072586027509, "loss": 11.9257, "step": 69 }, { "epoch": 0.32710280373831774, "grad_norm": 0.03243542090058327, "learning_rate": 0.00023204222371836405, "loss": 11.9226, "step": 70 }, { "epoch": 0.3317757009345794, "grad_norm": 0.023114504292607307, "learning_rate": 0.00022995475930919905, "loss": 11.9268, "step": 71 }, { "epoch": 0.3364485981308411, "grad_norm": 0.02156795933842659, "learning_rate": 0.00022784543606718227, "loss": 11.9248, "step": 72 }, { "epoch": 0.3411214953271028, "grad_norm": 0.028591834008693695, "learning_rate": 0.00022571483066022657, "loss": 11.9221, "step": 73 }, { "epoch": 0.34579439252336447, "grad_norm": 0.030331918969750404, "learning_rate": 0.0002235635255745762, "loss": 11.9242, "step": 74 }, { "epoch": 0.35046728971962615, "grad_norm": 0.031945616006851196, "learning_rate": 0.00022139210895556104, "loss": 11.9249, "step": 75 }, { "epoch": 0.35046728971962615, "eval_loss": 11.922245979309082, "eval_runtime": 0.2642, "eval_samples_per_second": 189.229, "eval_steps_per_second": 26.492, "step": 75 }, { "epoch": 0.35514018691588783, "grad_norm": 0.028651485219597816, "learning_rate": 0.00021920117444680317, "loss": 11.9221, "step": 76 }, { "epoch": 0.3598130841121495, "grad_norm": 0.02382078766822815, "learning_rate": 0.00021699132102792097, "loss": 11.9222, "step": 77 }, { "epoch": 0.3644859813084112, "grad_norm": 0.02109277807176113, "learning_rate": 0.0002147631528507739, "loss": 11.9208, "step": 78 }, { "epoch": 0.3691588785046729, "grad_norm": 0.022438790649175644, "learning_rate": 0.00021251727907429355, "loss": 11.9221, "step": 79 }, { "epoch": 0.37383177570093457, "grad_norm": 0.020786315202713013, "learning_rate": 0.0002102543136979454, "loss": 11.9233, "step": 80 }, { "epoch": 0.37850467289719625, "grad_norm": 0.025819014757871628, "learning_rate": 0.0002079748753938678, "loss": 11.9232, "step": 81 }, { "epoch": 0.38317757009345793, "grad_norm": 0.021457618102431297, "learning_rate": 0.0002056795873377331, "loss": 11.9217, "step": 82 }, { "epoch": 0.3878504672897196, "grad_norm": 0.02814180590212345, "learning_rate": 0.00020336907703837748, "loss": 11.9207, "step": 83 }, { "epoch": 0.3925233644859813, "grad_norm": 0.024183204397559166, "learning_rate": 0.00020104397616624645, "loss": 11.9216, "step": 84 }, { "epoch": 0.397196261682243, "grad_norm": 0.021621285006403923, "learning_rate": 0.00019870492038070252, "loss": 11.9217, "step": 85 }, { "epoch": 0.40186915887850466, "grad_norm": 0.03631271794438362, "learning_rate": 0.0001963525491562421, "loss": 11.9196, "step": 86 }, { "epoch": 0.40654205607476634, "grad_norm": 0.02298405021429062, "learning_rate": 0.0001939875056076697, "loss": 11.9227, "step": 87 }, { "epoch": 0.411214953271028, "grad_norm": 0.021272210404276848, "learning_rate": 0.00019161043631427666, "loss": 11.9204, "step": 88 }, { "epoch": 0.4158878504672897, "grad_norm": 0.026837944984436035, "learning_rate": 0.00018922199114307294, "loss": 11.9212, "step": 89 }, { "epoch": 0.4205607476635514, "grad_norm": 0.030852586030960083, "learning_rate": 0.00018682282307111987, "loss": 11.9213, "step": 90 }, { "epoch": 0.4252336448598131, "grad_norm": 0.033108122646808624, "learning_rate": 0.00018441358800701273, "loss": 11.9219, "step": 91 }, { "epoch": 0.42990654205607476, "grad_norm": 0.035448107868433, "learning_rate": 0.00018199494461156203, "loss": 11.9252, "step": 92 }, { "epoch": 0.43457943925233644, "grad_norm": 0.04730350896716118, "learning_rate": 0.000179567554117722, "loss": 11.9211, "step": 93 }, { "epoch": 0.4392523364485981, "grad_norm": 0.04452222213149071, "learning_rate": 0.00017713208014981648, "loss": 11.9242, "step": 94 }, { "epoch": 0.4439252336448598, "grad_norm": 0.054192908108234406, "learning_rate": 0.00017468918854211007, "loss": 11.9279, "step": 95 }, { "epoch": 0.4485981308411215, "grad_norm": 0.027065036818385124, "learning_rate": 0.00017223954715677627, "loss": 11.9206, "step": 96 }, { "epoch": 0.4532710280373832, "grad_norm": 0.06823795288801193, "learning_rate": 0.00016978382570131034, "loss": 11.9215, "step": 97 }, { "epoch": 0.45794392523364486, "grad_norm": 0.056718260049819946, "learning_rate": 0.00016732269554543794, "loss": 11.9245, "step": 98 }, { "epoch": 0.46261682242990654, "grad_norm": 0.0761389508843422, "learning_rate": 0.00016485682953756942, "loss": 11.9219, "step": 99 }, { "epoch": 0.4672897196261682, "grad_norm": 0.07411511242389679, "learning_rate": 0.00016238690182084986, "loss": 11.9257, "step": 100 }, { "epoch": 0.4672897196261682, "eval_loss": 11.921294212341309, "eval_runtime": 0.2642, "eval_samples_per_second": 189.256, "eval_steps_per_second": 26.496, "step": 100 }, { "epoch": 0.4719626168224299, "grad_norm": 0.024411911144852638, "learning_rate": 0.0001599135876488549, "loss": 11.9211, "step": 101 }, { "epoch": 0.4766355140186916, "grad_norm": 0.020111719146370888, "learning_rate": 0.00015743756320098332, "loss": 11.9233, "step": 102 }, { "epoch": 0.48130841121495327, "grad_norm": 0.025503534823656082, "learning_rate": 0.0001549595053975962, "loss": 11.9217, "step": 103 }, { "epoch": 0.48598130841121495, "grad_norm": 0.020468704402446747, "learning_rate": 0.00015248009171495378, "loss": 11.9217, "step": 104 }, { "epoch": 0.49065420560747663, "grad_norm": 0.025375397875905037, "learning_rate": 0.00015, "loss": 11.923, "step": 105 }, { "epoch": 0.4953271028037383, "grad_norm": 0.02081984281539917, "learning_rate": 0.00014751990828504622, "loss": 11.9224, "step": 106 }, { "epoch": 0.5, "grad_norm": 0.020906785503029823, "learning_rate": 0.00014504049460240375, "loss": 11.9198, "step": 107 }, { "epoch": 0.5046728971962616, "grad_norm": 0.029501456767320633, "learning_rate": 0.00014256243679901663, "loss": 11.9231, "step": 108 }, { "epoch": 0.5093457943925234, "grad_norm": 0.01729564182460308, "learning_rate": 0.00014008641235114508, "loss": 11.9218, "step": 109 }, { "epoch": 0.514018691588785, "grad_norm": 0.021059928461909294, "learning_rate": 0.00013761309817915014, "loss": 11.9204, "step": 110 }, { "epoch": 0.5186915887850467, "grad_norm": 0.01901732198894024, "learning_rate": 0.00013514317046243058, "loss": 11.9221, "step": 111 }, { "epoch": 0.5233644859813084, "grad_norm": 0.018536528572440147, "learning_rate": 0.00013267730445456208, "loss": 11.922, "step": 112 }, { "epoch": 0.5280373831775701, "grad_norm": 0.028534485027194023, "learning_rate": 0.00013021617429868963, "loss": 11.921, "step": 113 }, { "epoch": 0.5327102803738317, "grad_norm": 0.025273285806179047, "learning_rate": 0.00012776045284322368, "loss": 11.9239, "step": 114 }, { "epoch": 0.5373831775700935, "grad_norm": 0.024570738896727562, "learning_rate": 0.00012531081145788987, "loss": 11.9216, "step": 115 }, { "epoch": 0.5420560747663551, "grad_norm": 0.02263668179512024, "learning_rate": 0.00012286791985018355, "loss": 11.9209, "step": 116 }, { "epoch": 0.5467289719626168, "grad_norm": 0.022012576460838318, "learning_rate": 0.00012043244588227796, "loss": 11.9225, "step": 117 }, { "epoch": 0.5514018691588785, "grad_norm": 0.02482011914253235, "learning_rate": 0.00011800505538843798, "loss": 11.9206, "step": 118 }, { "epoch": 0.5560747663551402, "grad_norm": 0.024725789204239845, "learning_rate": 0.00011558641199298727, "loss": 11.9225, "step": 119 }, { "epoch": 0.5607476635514018, "grad_norm": 0.016468815505504608, "learning_rate": 0.00011317717692888012, "loss": 11.9234, "step": 120 }, { "epoch": 0.5654205607476636, "grad_norm": 0.016353553161025047, "learning_rate": 0.00011077800885692702, "loss": 11.9206, "step": 121 }, { "epoch": 0.5700934579439252, "grad_norm": 0.02241048961877823, "learning_rate": 0.00010838956368572334, "loss": 11.9206, "step": 122 }, { "epoch": 0.5747663551401869, "grad_norm": 0.024675803259015083, "learning_rate": 0.0001060124943923303, "loss": 11.9222, "step": 123 }, { "epoch": 0.5794392523364486, "grad_norm": 0.01750737614929676, "learning_rate": 0.0001036474508437579, "loss": 11.9225, "step": 124 }, { "epoch": 0.5841121495327103, "grad_norm": 0.017665334045886993, "learning_rate": 0.00010129507961929748, "loss": 11.9222, "step": 125 }, { "epoch": 0.5841121495327103, "eval_loss": 11.92119026184082, "eval_runtime": 0.2648, "eval_samples_per_second": 188.843, "eval_steps_per_second": 26.438, "step": 125 }, { "epoch": 0.5887850467289719, "grad_norm": 0.016782743856310844, "learning_rate": 9.895602383375353e-05, "loss": 11.9226, "step": 126 }, { "epoch": 0.5934579439252337, "grad_norm": 0.018075762316584587, "learning_rate": 9.663092296162251e-05, "loss": 11.9188, "step": 127 }, { "epoch": 0.5981308411214953, "grad_norm": 0.021824495866894722, "learning_rate": 9.432041266226686e-05, "loss": 11.9217, "step": 128 }, { "epoch": 0.602803738317757, "grad_norm": 0.030567770823836327, "learning_rate": 9.202512460613219e-05, "loss": 11.9177, "step": 129 }, { "epoch": 0.6074766355140186, "grad_norm": 0.018205830827355385, "learning_rate": 8.97456863020546e-05, "loss": 11.922, "step": 130 }, { "epoch": 0.6121495327102804, "grad_norm": 0.01752409152686596, "learning_rate": 8.748272092570646e-05, "loss": 11.9196, "step": 131 }, { "epoch": 0.616822429906542, "grad_norm": 0.02085956372320652, "learning_rate": 8.523684714922608e-05, "loss": 11.9215, "step": 132 }, { "epoch": 0.6214953271028038, "grad_norm": 0.019331621006131172, "learning_rate": 8.300867897207903e-05, "loss": 11.9219, "step": 133 }, { "epoch": 0.6261682242990654, "grad_norm": 0.026845021173357964, "learning_rate": 8.079882555319684e-05, "loss": 11.9197, "step": 134 }, { "epoch": 0.6308411214953271, "grad_norm": 0.01837092451751232, "learning_rate": 7.860789104443896e-05, "loss": 11.9198, "step": 135 }, { "epoch": 0.6355140186915887, "grad_norm": 0.02682948298752308, "learning_rate": 7.643647442542382e-05, "loss": 11.9227, "step": 136 }, { "epoch": 0.6401869158878505, "grad_norm": 0.01692628674209118, "learning_rate": 7.428516933977347e-05, "loss": 11.918, "step": 137 }, { "epoch": 0.6448598130841121, "grad_norm": 0.02475605346262455, "learning_rate": 7.215456393281776e-05, "loss": 11.9186, "step": 138 }, { "epoch": 0.6495327102803738, "grad_norm": 0.016714580357074738, "learning_rate": 7.004524069080096e-05, "loss": 11.9221, "step": 139 }, { "epoch": 0.6542056074766355, "grad_norm": 0.02623322419822216, "learning_rate": 6.795777628163599e-05, "loss": 11.9204, "step": 140 }, { "epoch": 0.6588785046728972, "grad_norm": 0.03229835256934166, "learning_rate": 6.58927413972491e-05, "loss": 11.9229, "step": 141 }, { "epoch": 0.6635514018691588, "grad_norm": 0.035907916724681854, "learning_rate": 6.385070059755846e-05, "loss": 11.9198, "step": 142 }, { "epoch": 0.6682242990654206, "grad_norm": 0.028768159449100494, "learning_rate": 6.183221215612904e-05, "loss": 11.9197, "step": 143 }, { "epoch": 0.6728971962616822, "grad_norm": 0.03234109655022621, "learning_rate": 5.983782790754623e-05, "loss": 11.9205, "step": 144 }, { "epoch": 0.677570093457944, "grad_norm": 0.047680024057626724, "learning_rate": 5.786809309654982e-05, "loss": 11.9231, "step": 145 }, { "epoch": 0.6822429906542056, "grad_norm": 0.04186435043811798, "learning_rate": 5.592354622896944e-05, "loss": 11.9231, "step": 146 }, { "epoch": 0.6869158878504673, "grad_norm": 0.026183342561125755, "learning_rate": 5.40047189245025e-05, "loss": 11.9241, "step": 147 }, { "epoch": 0.6915887850467289, "grad_norm": 0.047092072665691376, "learning_rate": 5.211213577137469e-05, "loss": 11.9235, "step": 148 }, { "epoch": 0.6962616822429907, "grad_norm": 0.0479263998568058, "learning_rate": 5.024631418292274e-05, "loss": 11.9234, "step": 149 }, { "epoch": 0.7009345794392523, "grad_norm": 0.05726196989417076, "learning_rate": 4.840776425613886e-05, "loss": 11.9209, "step": 150 }, { "epoch": 0.7009345794392523, "eval_loss": 11.920478820800781, "eval_runtime": 0.2634, "eval_samples_per_second": 189.797, "eval_steps_per_second": 26.572, "step": 150 }, { "epoch": 0.705607476635514, "grad_norm": 0.023836780339479446, "learning_rate": 4.659698863221513e-05, "loss": 11.9215, "step": 151 }, { "epoch": 0.7102803738317757, "grad_norm": 0.022620679810643196, "learning_rate": 4.481448235912671e-05, "loss": 11.9212, "step": 152 }, { "epoch": 0.7149532710280374, "grad_norm": 0.021642200648784637, "learning_rate": 4.306073275629044e-05, "loss": 11.9215, "step": 153 }, { "epoch": 0.719626168224299, "grad_norm": 0.011966847814619541, "learning_rate": 4.133621928133665e-05, "loss": 11.9232, "step": 154 }, { "epoch": 0.7242990654205608, "grad_norm": 0.019636252894997597, "learning_rate": 3.964141339903026e-05, "loss": 11.9201, "step": 155 }, { "epoch": 0.7289719626168224, "grad_norm": 0.014556117355823517, "learning_rate": 3.797677845237696e-05, "loss": 11.9184, "step": 156 }, { "epoch": 0.7336448598130841, "grad_norm": 0.02636311575770378, "learning_rate": 3.634276953594982e-05, "loss": 11.9202, "step": 157 }, { "epoch": 0.7383177570093458, "grad_norm": 0.019601935520768166, "learning_rate": 3.473983337147118e-05, "loss": 11.9216, "step": 158 }, { "epoch": 0.7429906542056075, "grad_norm": 0.03268706426024437, "learning_rate": 3.316840818568315e-05, "loss": 11.92, "step": 159 }, { "epoch": 0.7476635514018691, "grad_norm": 0.024846911430358887, "learning_rate": 3.162892359054098e-05, "loss": 11.9236, "step": 160 }, { "epoch": 0.7523364485981309, "grad_norm": 0.02114190347492695, "learning_rate": 3.0121800465761293e-05, "loss": 11.9237, "step": 161 }, { "epoch": 0.7570093457943925, "grad_norm": 0.023230189457535744, "learning_rate": 2.8647450843757897e-05, "loss": 11.9205, "step": 162 }, { "epoch": 0.7616822429906542, "grad_norm": 0.027432497590780258, "learning_rate": 2.7206277796996144e-05, "loss": 11.9223, "step": 163 }, { "epoch": 0.7663551401869159, "grad_norm": 0.021228693425655365, "learning_rate": 2.5798675327796993e-05, "loss": 11.9228, "step": 164 }, { "epoch": 0.7710280373831776, "grad_norm": 0.024462547153234482, "learning_rate": 2.4425028260620715e-05, "loss": 11.9221, "step": 165 }, { "epoch": 0.7757009345794392, "grad_norm": 0.02734757773578167, "learning_rate": 2.3085712136859668e-05, "loss": 11.9202, "step": 166 }, { "epoch": 0.780373831775701, "grad_norm": 0.013866426423192024, "learning_rate": 2.178109311216913e-05, "loss": 11.9213, "step": 167 }, { "epoch": 0.7850467289719626, "grad_norm": 0.021045256406068802, "learning_rate": 2.0511527856363912e-05, "loss": 11.921, "step": 168 }, { "epoch": 0.7897196261682243, "grad_norm": 0.01423388347029686, "learning_rate": 1.927736345590839e-05, "loss": 11.9213, "step": 169 }, { "epoch": 0.794392523364486, "grad_norm": 0.014857144095003605, "learning_rate": 1.8078937319026654e-05, "loss": 11.9215, "step": 170 }, { "epoch": 0.7990654205607477, "grad_norm": 0.02455286495387554, "learning_rate": 1.6916577083458228e-05, "loss": 11.9197, "step": 171 }, { "epoch": 0.8037383177570093, "grad_norm": 0.027797911316156387, "learning_rate": 1.579060052688548e-05, "loss": 11.9203, "step": 172 }, { "epoch": 0.8084112149532711, "grad_norm": 0.023412272334098816, "learning_rate": 1.4701315480056164e-05, "loss": 11.9207, "step": 173 }, { "epoch": 0.8130841121495327, "grad_norm": 0.01104673556983471, "learning_rate": 1.3649019742625623e-05, "loss": 11.9237, "step": 174 }, { "epoch": 0.8177570093457944, "grad_norm": 0.017326245084404945, "learning_rate": 1.2634001001741373e-05, "loss": 11.9215, "step": 175 }, { "epoch": 0.8177570093457944, "eval_loss": 11.9202241897583, "eval_runtime": 0.2656, "eval_samples_per_second": 188.283, "eval_steps_per_second": 26.36, "step": 175 }, { "epoch": 0.822429906542056, "grad_norm": 0.02686849981546402, "learning_rate": 1.1656536753392287e-05, "loss": 11.9219, "step": 176 }, { "epoch": 0.8271028037383178, "grad_norm": 0.01976598985493183, "learning_rate": 1.0716894226543953e-05, "loss": 11.92, "step": 177 }, { "epoch": 0.8317757009345794, "grad_norm": 0.013946283608675003, "learning_rate": 9.815330310080887e-06, "loss": 11.9232, "step": 178 }, { "epoch": 0.8364485981308412, "grad_norm": 0.019464576616883278, "learning_rate": 8.952091482575824e-06, "loss": 11.9227, "step": 179 }, { "epoch": 0.8411214953271028, "grad_norm": 0.01335290540009737, "learning_rate": 8.127413744904804e-06, "loss": 11.9224, "step": 180 }, { "epoch": 0.8457943925233645, "grad_norm": 0.018407030031085014, "learning_rate": 7.34152255572697e-06, "loss": 11.9227, "step": 181 }, { "epoch": 0.8504672897196262, "grad_norm": 0.02281804196536541, "learning_rate": 6.594632769846353e-06, "loss": 11.9216, "step": 182 }, { "epoch": 0.8551401869158879, "grad_norm": 0.02438226528465748, "learning_rate": 5.886948579472778e-06, "loss": 11.9209, "step": 183 }, { "epoch": 0.8598130841121495, "grad_norm": 0.020450523123145103, "learning_rate": 5.218663458397715e-06, "loss": 11.922, "step": 184 }, { "epoch": 0.8644859813084113, "grad_norm": 0.018540920689702034, "learning_rate": 4.589960109100444e-06, "loss": 11.9197, "step": 185 }, { "epoch": 0.8691588785046729, "grad_norm": 0.024907134473323822, "learning_rate": 4.001010412799138e-06, "loss": 11.9209, "step": 186 }, { "epoch": 0.8738317757009346, "grad_norm": 0.03048735298216343, "learning_rate": 3.451975382460109e-06, "loss": 11.9212, "step": 187 }, { "epoch": 0.8785046728971962, "grad_norm": 0.02939632534980774, "learning_rate": 2.9430051187785962e-06, "loss": 11.9205, "step": 188 }, { "epoch": 0.883177570093458, "grad_norm": 0.038265522569417953, "learning_rate": 2.4742387691426445e-06, "loss": 11.9204, "step": 189 }, { "epoch": 0.8878504672897196, "grad_norm": 0.038394853472709656, "learning_rate": 2.0458044895916513e-06, "loss": 11.9178, "step": 190 }, { "epoch": 0.8925233644859814, "grad_norm": 0.01815103180706501, "learning_rate": 1.6578194097797258e-06, "loss": 11.9203, "step": 191 }, { "epoch": 0.897196261682243, "grad_norm": 0.030815018340945244, "learning_rate": 1.3103896009537207e-06, "loss": 11.9186, "step": 192 }, { "epoch": 0.9018691588785047, "grad_norm": 0.03687365725636482, "learning_rate": 1.0036100469542786e-06, "loss": 11.9234, "step": 193 }, { "epoch": 0.9065420560747663, "grad_norm": 0.03689073771238327, "learning_rate": 7.375646182482875e-07, "loss": 11.9231, "step": 194 }, { "epoch": 0.9112149532710281, "grad_norm": 0.037798840552568436, "learning_rate": 5.123260489995229e-07, "loss": 11.9244, "step": 195 }, { "epoch": 0.9158878504672897, "grad_norm": 0.0500546358525753, "learning_rate": 3.2795591718381975e-07, "loss": 11.9184, "step": 196 }, { "epoch": 0.9205607476635514, "grad_norm": 0.05376823619008064, "learning_rate": 1.8450462775428942e-07, "loss": 11.9244, "step": 197 }, { "epoch": 0.9252336448598131, "grad_norm": 0.050430312752723694, "learning_rate": 8.201139886109264e-08, "loss": 11.9225, "step": 198 }, { "epoch": 0.9299065420560748, "grad_norm": 0.06013676896691322, "learning_rate": 2.0504251129649374e-08, "loss": 11.9224, "step": 199 }, { "epoch": 0.9345794392523364, "grad_norm": 0.05553491413593292, "learning_rate": 0.0, "loss": 11.9231, "step": 200 }, { "epoch": 0.9345794392523364, "eval_loss": 11.920884132385254, "eval_runtime": 0.2647, "eval_samples_per_second": 188.867, "eval_steps_per_second": 26.441, "step": 200 } ], "logging_steps": 1, "max_steps": 200, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 1, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 1 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 318524620800.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }