{ "best_metric": 1.8645277672129885, "best_model_checkpoint": "./modernBERT-content-regression/run-3/checkpoint-248", "epoch": 2.0, "eval_steps": 500, "global_step": 248, "is_hyper_param_search": true, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.008064516129032258, "grad_norm": 312.8093566894531, "learning_rate": 1.1731316602713056e-06, "loss": 21.3087, "step": 1 }, { "epoch": 0.016129032258064516, "grad_norm": 97.11482238769531, "learning_rate": 1.1712364556505119e-06, "loss": 1.9024, "step": 2 }, { "epoch": 0.024193548387096774, "grad_norm": 608.1284790039062, "learning_rate": 1.1693412510297182e-06, "loss": 154.2055, "step": 3 }, { "epoch": 0.03225806451612903, "grad_norm": 205.3589630126953, "learning_rate": 1.1674460464089244e-06, "loss": 18.409, "step": 4 }, { "epoch": 0.04032258064516129, "grad_norm": 116.6544189453125, "learning_rate": 1.165550841788131e-06, "loss": 4.2643, "step": 5 }, { "epoch": 0.04838709677419355, "grad_norm": 34.426883697509766, "learning_rate": 1.1636556371673372e-06, "loss": 0.5243, "step": 6 }, { "epoch": 0.056451612903225805, "grad_norm": 161.2743682861328, "learning_rate": 1.1617604325465434e-06, "loss": 9.653, "step": 7 }, { "epoch": 0.06451612903225806, "grad_norm": 316.244384765625, "learning_rate": 1.1598652279257497e-06, "loss": 21.3918, "step": 8 }, { "epoch": 0.07258064516129033, "grad_norm": 64.28992462158203, "learning_rate": 1.157970023304956e-06, "loss": 0.6204, "step": 9 }, { "epoch": 0.08064516129032258, "grad_norm": 130.6162109375, "learning_rate": 1.1560748186841622e-06, "loss": 5.0316, "step": 10 }, { "epoch": 0.08870967741935484, "grad_norm": 340.1315612792969, "learning_rate": 1.1541796140633685e-06, "loss": 18.7971, "step": 11 }, { "epoch": 0.0967741935483871, "grad_norm": 63.929317474365234, "learning_rate": 1.1522844094425748e-06, "loss": 1.2454, "step": 12 }, { "epoch": 0.10483870967741936, "grad_norm": 5.4183030128479, "learning_rate": 1.150389204821781e-06, "loss": 0.0884, "step": 13 }, { "epoch": 0.11290322580645161, "grad_norm": 137.7165985107422, "learning_rate": 1.1484940002009875e-06, "loss": 4.2256, "step": 14 }, { "epoch": 0.12096774193548387, "grad_norm": 31.921754837036133, "learning_rate": 1.1465987955801938e-06, "loss": 0.3319, "step": 15 }, { "epoch": 0.12903225806451613, "grad_norm": 200.37228393554688, "learning_rate": 1.1447035909594e-06, "loss": 11.9662, "step": 16 }, { "epoch": 0.13709677419354838, "grad_norm": 54.43822479248047, "learning_rate": 1.1428083863386063e-06, "loss": 1.2817, "step": 17 }, { "epoch": 0.14516129032258066, "grad_norm": 122.57898712158203, "learning_rate": 1.1409131817178126e-06, "loss": 3.3717, "step": 18 }, { "epoch": 0.1532258064516129, "grad_norm": 24.417898178100586, "learning_rate": 1.1390179770970188e-06, "loss": 0.5961, "step": 19 }, { "epoch": 0.16129032258064516, "grad_norm": 280.314453125, "learning_rate": 1.1371227724762251e-06, "loss": 12.7171, "step": 20 }, { "epoch": 0.1693548387096774, "grad_norm": 74.58489227294922, "learning_rate": 1.1352275678554316e-06, "loss": 0.6804, "step": 21 }, { "epoch": 0.1774193548387097, "grad_norm": 134.18431091308594, "learning_rate": 1.1333323632346376e-06, "loss": 1.4807, "step": 22 }, { "epoch": 0.18548387096774194, "grad_norm": 127.26359558105469, "learning_rate": 1.1314371586138441e-06, "loss": 18.1665, "step": 23 }, { "epoch": 0.1935483870967742, "grad_norm": 196.29579162597656, "learning_rate": 1.1295419539930504e-06, "loss": 2.6718, "step": 24 }, { "epoch": 0.20161290322580644, "grad_norm": 59.49917984008789, "learning_rate": 1.1276467493722567e-06, "loss": 0.7888, "step": 25 }, { "epoch": 0.20967741935483872, "grad_norm": 160.57215881347656, "learning_rate": 1.125751544751463e-06, "loss": 0.7413, "step": 26 }, { "epoch": 0.21774193548387097, "grad_norm": 63.3122444152832, "learning_rate": 1.1238563401306692e-06, "loss": 3.04, "step": 27 }, { "epoch": 0.22580645161290322, "grad_norm": 132.4581756591797, "learning_rate": 1.1219611355098755e-06, "loss": 1.7609, "step": 28 }, { "epoch": 0.23387096774193547, "grad_norm": 69.83406829833984, "learning_rate": 1.1200659308890817e-06, "loss": 1.1596, "step": 29 }, { "epoch": 0.24193548387096775, "grad_norm": 34.646114349365234, "learning_rate": 1.1181707262682882e-06, "loss": 0.5233, "step": 30 }, { "epoch": 0.25, "grad_norm": 28.260757446289062, "learning_rate": 1.1162755216474943e-06, "loss": 1.0132, "step": 31 }, { "epoch": 0.25806451612903225, "grad_norm": 44.67536544799805, "learning_rate": 1.1143803170267007e-06, "loss": 0.9895, "step": 32 }, { "epoch": 0.2661290322580645, "grad_norm": 54.21400451660156, "learning_rate": 1.112485112405907e-06, "loss": 1.1101, "step": 33 }, { "epoch": 0.27419354838709675, "grad_norm": 36.84498977661133, "learning_rate": 1.1105899077851133e-06, "loss": 1.3959, "step": 34 }, { "epoch": 0.28225806451612906, "grad_norm": 369.2140808105469, "learning_rate": 1.1086947031643195e-06, "loss": 23.5792, "step": 35 }, { "epoch": 0.2903225806451613, "grad_norm": 51.993858337402344, "learning_rate": 1.1067994985435258e-06, "loss": 0.6532, "step": 36 }, { "epoch": 0.29838709677419356, "grad_norm": 87.89725494384766, "learning_rate": 1.104904293922732e-06, "loss": 1.0406, "step": 37 }, { "epoch": 0.3064516129032258, "grad_norm": 272.9493713378906, "learning_rate": 1.1030090893019383e-06, "loss": 24.8131, "step": 38 }, { "epoch": 0.31451612903225806, "grad_norm": 96.62998962402344, "learning_rate": 1.1011138846811448e-06, "loss": 0.7572, "step": 39 }, { "epoch": 0.3225806451612903, "grad_norm": 538.8275756835938, "learning_rate": 1.0992186800603509e-06, "loss": 30.8125, "step": 40 }, { "epoch": 0.33064516129032256, "grad_norm": 285.717041015625, "learning_rate": 1.0973234754395574e-06, "loss": 1.837, "step": 41 }, { "epoch": 0.3387096774193548, "grad_norm": 215.8197021484375, "learning_rate": 1.0954282708187636e-06, "loss": 18.2501, "step": 42 }, { "epoch": 0.3467741935483871, "grad_norm": 163.01434326171875, "learning_rate": 1.0935330661979699e-06, "loss": 27.9919, "step": 43 }, { "epoch": 0.3548387096774194, "grad_norm": 24.41200828552246, "learning_rate": 1.0916378615771762e-06, "loss": 0.3323, "step": 44 }, { "epoch": 0.3629032258064516, "grad_norm": 39.837669372558594, "learning_rate": 1.0897426569563824e-06, "loss": 1.1347, "step": 45 }, { "epoch": 0.3709677419354839, "grad_norm": 47.366092681884766, "learning_rate": 1.0878474523355887e-06, "loss": 2.3667, "step": 46 }, { "epoch": 0.3790322580645161, "grad_norm": 85.20360565185547, "learning_rate": 1.085952247714795e-06, "loss": 2.8234, "step": 47 }, { "epoch": 0.3870967741935484, "grad_norm": 209.84487915039062, "learning_rate": 1.0840570430940014e-06, "loss": 2.7326, "step": 48 }, { "epoch": 0.3951612903225806, "grad_norm": 61.69172286987305, "learning_rate": 1.0821618384732077e-06, "loss": 0.9325, "step": 49 }, { "epoch": 0.4032258064516129, "grad_norm": 149.46607971191406, "learning_rate": 1.080266633852414e-06, "loss": 7.6034, "step": 50 }, { "epoch": 0.4112903225806452, "grad_norm": 163.2115478515625, "learning_rate": 1.0783714292316202e-06, "loss": 1.8005, "step": 51 }, { "epoch": 0.41935483870967744, "grad_norm": 42.81549072265625, "learning_rate": 1.0764762246108265e-06, "loss": 0.3282, "step": 52 }, { "epoch": 0.4274193548387097, "grad_norm": 516.490966796875, "learning_rate": 1.0745810199900328e-06, "loss": 13.6021, "step": 53 }, { "epoch": 0.43548387096774194, "grad_norm": 83.2822265625, "learning_rate": 1.072685815369239e-06, "loss": 1.2216, "step": 54 }, { "epoch": 0.4435483870967742, "grad_norm": 86.0993881225586, "learning_rate": 1.0707906107484453e-06, "loss": 4.7606, "step": 55 }, { "epoch": 0.45161290322580644, "grad_norm": 44.0859375, "learning_rate": 1.0688954061276516e-06, "loss": 0.8722, "step": 56 }, { "epoch": 0.4596774193548387, "grad_norm": 124.6083984375, "learning_rate": 1.067000201506858e-06, "loss": 1.3964, "step": 57 }, { "epoch": 0.46774193548387094, "grad_norm": 206.24273681640625, "learning_rate": 1.0651049968860643e-06, "loss": 7.95, "step": 58 }, { "epoch": 0.47580645161290325, "grad_norm": 75.12397766113281, "learning_rate": 1.0632097922652704e-06, "loss": 0.7911, "step": 59 }, { "epoch": 0.4838709677419355, "grad_norm": 234.4656524658203, "learning_rate": 1.0613145876444769e-06, "loss": 14.9343, "step": 60 }, { "epoch": 0.49193548387096775, "grad_norm": 244.2675323486328, "learning_rate": 1.0594193830236831e-06, "loss": 0.5364, "step": 61 }, { "epoch": 0.5, "grad_norm": 134.83209228515625, "learning_rate": 1.0575241784028894e-06, "loss": 23.3423, "step": 62 }, { "epoch": 0.5080645161290323, "grad_norm": 117.73362731933594, "learning_rate": 1.0556289737820957e-06, "loss": 1.3623, "step": 63 }, { "epoch": 0.5161290322580645, "grad_norm": 67.53313446044922, "learning_rate": 1.053733769161302e-06, "loss": 2.8317, "step": 64 }, { "epoch": 0.5241935483870968, "grad_norm": 169.532470703125, "learning_rate": 1.0518385645405082e-06, "loss": 14.4335, "step": 65 }, { "epoch": 0.532258064516129, "grad_norm": 62.2873649597168, "learning_rate": 1.0499433599197147e-06, "loss": 1.3316, "step": 66 }, { "epoch": 0.5403225806451613, "grad_norm": 70.10298156738281, "learning_rate": 1.048048155298921e-06, "loss": 1.325, "step": 67 }, { "epoch": 0.5483870967741935, "grad_norm": 60.70389175415039, "learning_rate": 1.0461529506781272e-06, "loss": 0.8161, "step": 68 }, { "epoch": 0.5564516129032258, "grad_norm": 104.86827087402344, "learning_rate": 1.0442577460573335e-06, "loss": 1.3411, "step": 69 }, { "epoch": 0.5645161290322581, "grad_norm": 78.47552490234375, "learning_rate": 1.0423625414365397e-06, "loss": 0.7227, "step": 70 }, { "epoch": 0.5725806451612904, "grad_norm": 74.83175659179688, "learning_rate": 1.040467336815746e-06, "loss": 2.0944, "step": 71 }, { "epoch": 0.5806451612903226, "grad_norm": 261.87298583984375, "learning_rate": 1.0385721321949523e-06, "loss": 17.7856, "step": 72 }, { "epoch": 0.5887096774193549, "grad_norm": 55.27876663208008, "learning_rate": 1.0366769275741585e-06, "loss": 0.98, "step": 73 }, { "epoch": 0.5967741935483871, "grad_norm": 145.73300170898438, "learning_rate": 1.0347817229533648e-06, "loss": 1.7042, "step": 74 }, { "epoch": 0.6048387096774194, "grad_norm": 125.30365753173828, "learning_rate": 1.0328865183325713e-06, "loss": 16.882, "step": 75 }, { "epoch": 0.6129032258064516, "grad_norm": 26.542724609375, "learning_rate": 1.0309913137117776e-06, "loss": 0.1877, "step": 76 }, { "epoch": 0.6209677419354839, "grad_norm": 37.75946044921875, "learning_rate": 1.0290961090909838e-06, "loss": 0.2059, "step": 77 }, { "epoch": 0.6290322580645161, "grad_norm": 128.78330993652344, "learning_rate": 1.02720090447019e-06, "loss": 13.4677, "step": 78 }, { "epoch": 0.6370967741935484, "grad_norm": 210.26707458496094, "learning_rate": 1.0253056998493964e-06, "loss": 22.7431, "step": 79 }, { "epoch": 0.6451612903225806, "grad_norm": 35.724334716796875, "learning_rate": 1.0234104952286026e-06, "loss": 1.0135, "step": 80 }, { "epoch": 0.6532258064516129, "grad_norm": 66.4817886352539, "learning_rate": 1.0215152906078089e-06, "loss": 0.4585, "step": 81 }, { "epoch": 0.6612903225806451, "grad_norm": 53.807220458984375, "learning_rate": 1.0196200859870154e-06, "loss": 2.1388, "step": 82 }, { "epoch": 0.6693548387096774, "grad_norm": 33.74148178100586, "learning_rate": 1.0177248813662214e-06, "loss": 0.2365, "step": 83 }, { "epoch": 0.6774193548387096, "grad_norm": 47.4416618347168, "learning_rate": 1.015829676745428e-06, "loss": 1.1393, "step": 84 }, { "epoch": 0.6854838709677419, "grad_norm": 280.392578125, "learning_rate": 1.0139344721246342e-06, "loss": 18.5951, "step": 85 }, { "epoch": 0.6935483870967742, "grad_norm": 91.39240264892578, "learning_rate": 1.0120392675038404e-06, "loss": 14.467, "step": 86 }, { "epoch": 0.7016129032258065, "grad_norm": 97.39054107666016, "learning_rate": 1.0101440628830467e-06, "loss": 2.257, "step": 87 }, { "epoch": 0.7096774193548387, "grad_norm": 95.57569885253906, "learning_rate": 1.008248858262253e-06, "loss": 0.9064, "step": 88 }, { "epoch": 0.717741935483871, "grad_norm": 230.4451904296875, "learning_rate": 1.0063536536414592e-06, "loss": 1.5809, "step": 89 }, { "epoch": 0.7258064516129032, "grad_norm": 149.36996459960938, "learning_rate": 1.0044584490206655e-06, "loss": 1.7855, "step": 90 }, { "epoch": 0.7338709677419355, "grad_norm": 319.1463317871094, "learning_rate": 1.002563244399872e-06, "loss": 19.9728, "step": 91 }, { "epoch": 0.7419354838709677, "grad_norm": 132.4302215576172, "learning_rate": 1.000668039779078e-06, "loss": 7.4546, "step": 92 }, { "epoch": 0.75, "grad_norm": 24.757253646850586, "learning_rate": 9.987728351582845e-07, "loss": 0.1834, "step": 93 }, { "epoch": 0.7580645161290323, "grad_norm": 117.05673217773438, "learning_rate": 9.968776305374908e-07, "loss": 1.7553, "step": 94 }, { "epoch": 0.7661290322580645, "grad_norm": 178.41818237304688, "learning_rate": 9.94982425916697e-07, "loss": 9.5775, "step": 95 }, { "epoch": 0.7741935483870968, "grad_norm": 129.0936737060547, "learning_rate": 9.930872212959033e-07, "loss": 2.2522, "step": 96 }, { "epoch": 0.782258064516129, "grad_norm": 120.8997802734375, "learning_rate": 9.911920166751096e-07, "loss": 1.365, "step": 97 }, { "epoch": 0.7903225806451613, "grad_norm": 124.6778335571289, "learning_rate": 9.892968120543159e-07, "loss": 1.6905, "step": 98 }, { "epoch": 0.7983870967741935, "grad_norm": 805.3359375, "learning_rate": 9.874016074335221e-07, "loss": 2.0304, "step": 99 }, { "epoch": 0.8064516129032258, "grad_norm": 258.4070739746094, "learning_rate": 9.855064028127286e-07, "loss": 18.573, "step": 100 }, { "epoch": 0.8145161290322581, "grad_norm": 25.10341453552246, "learning_rate": 9.836111981919347e-07, "loss": 0.8904, "step": 101 }, { "epoch": 0.8225806451612904, "grad_norm": 137.26222229003906, "learning_rate": 9.817159935711411e-07, "loss": 1.3759, "step": 102 }, { "epoch": 0.8306451612903226, "grad_norm": 100.84027862548828, "learning_rate": 9.798207889503474e-07, "loss": 11.2571, "step": 103 }, { "epoch": 0.8387096774193549, "grad_norm": 99.84913635253906, "learning_rate": 9.779255843295537e-07, "loss": 0.7655, "step": 104 }, { "epoch": 0.8467741935483871, "grad_norm": 58.0277099609375, "learning_rate": 9.7603037970876e-07, "loss": 0.685, "step": 105 }, { "epoch": 0.8548387096774194, "grad_norm": 118.5246810913086, "learning_rate": 9.741351750879662e-07, "loss": 3.1796, "step": 106 }, { "epoch": 0.8629032258064516, "grad_norm": 93.28719329833984, "learning_rate": 9.722399704671725e-07, "loss": 1.5221, "step": 107 }, { "epoch": 0.8709677419354839, "grad_norm": 109.0409927368164, "learning_rate": 9.703447658463787e-07, "loss": 1.2383, "step": 108 }, { "epoch": 0.8790322580645161, "grad_norm": 167.7772216796875, "learning_rate": 9.684495612255852e-07, "loss": 2.5045, "step": 109 }, { "epoch": 0.8870967741935484, "grad_norm": 51.926692962646484, "learning_rate": 9.665543566047915e-07, "loss": 1.0717, "step": 110 }, { "epoch": 0.8951612903225806, "grad_norm": 222.6824951171875, "learning_rate": 9.646591519839975e-07, "loss": 4.1007, "step": 111 }, { "epoch": 0.9032258064516129, "grad_norm": 53.584320068359375, "learning_rate": 9.62763947363204e-07, "loss": 1.257, "step": 112 }, { "epoch": 0.9112903225806451, "grad_norm": 92.6733627319336, "learning_rate": 9.608687427424103e-07, "loss": 0.7563, "step": 113 }, { "epoch": 0.9193548387096774, "grad_norm": 23.77389144897461, "learning_rate": 9.589735381216166e-07, "loss": 0.7985, "step": 114 }, { "epoch": 0.9274193548387096, "grad_norm": 374.8204650878906, "learning_rate": 9.570783335008228e-07, "loss": 5.5011, "step": 115 }, { "epoch": 0.9354838709677419, "grad_norm": 83.84817504882812, "learning_rate": 9.55183128880029e-07, "loss": 2.123, "step": 116 }, { "epoch": 0.9435483870967742, "grad_norm": 160.6682891845703, "learning_rate": 9.532879242592355e-07, "loss": 1.6017, "step": 117 }, { "epoch": 0.9516129032258065, "grad_norm": 150.16294860839844, "learning_rate": 9.513927196384417e-07, "loss": 14.0088, "step": 118 }, { "epoch": 0.9596774193548387, "grad_norm": 94.02184295654297, "learning_rate": 9.494975150176481e-07, "loss": 1.6218, "step": 119 }, { "epoch": 0.967741935483871, "grad_norm": 40.898277282714844, "learning_rate": 9.476023103968543e-07, "loss": 1.4986, "step": 120 }, { "epoch": 0.9758064516129032, "grad_norm": 167.00746154785156, "learning_rate": 9.457071057760605e-07, "loss": 1.4057, "step": 121 }, { "epoch": 0.9838709677419355, "grad_norm": 35.20161819458008, "learning_rate": 9.438119011552669e-07, "loss": 0.5185, "step": 122 }, { "epoch": 0.9919354838709677, "grad_norm": 61.312583923339844, "learning_rate": 9.419166965344732e-07, "loss": 0.6058, "step": 123 }, { "epoch": 1.0, "grad_norm": 9.15098762512207, "learning_rate": 9.400214919136795e-07, "loss": 0.008, "step": 124 }, { "epoch": 1.0, "eval_loss": 3.5903782844543457, "eval_mae": 1.1498976945877075, "eval_mse": 3.590378522872925, "eval_r2": 0.01701676845550537, "eval_rmse": 1.8948294178824976, "eval_runtime": 1.3414, "eval_samples_per_second": 41.002, "eval_smape": 46.445611119270325, "eval_steps_per_second": 10.437, "step": 124 }, { "epoch": 1.0080645161290323, "grad_norm": 93.99532318115234, "learning_rate": 9.381262872928857e-07, "loss": 1.3843, "step": 125 }, { "epoch": 1.0161290322580645, "grad_norm": 131.81895446777344, "learning_rate": 9.362310826720921e-07, "loss": 0.7999, "step": 126 }, { "epoch": 1.0241935483870968, "grad_norm": 137.10763549804688, "learning_rate": 9.343358780512983e-07, "loss": 6.5196, "step": 127 }, { "epoch": 1.032258064516129, "grad_norm": 48.18350601196289, "learning_rate": 9.324406734305047e-07, "loss": 0.6827, "step": 128 }, { "epoch": 1.0403225806451613, "grad_norm": 177.0278778076172, "learning_rate": 9.30545468809711e-07, "loss": 0.8596, "step": 129 }, { "epoch": 1.0483870967741935, "grad_norm": 47.4871711730957, "learning_rate": 9.286502641889171e-07, "loss": 2.1029, "step": 130 }, { "epoch": 1.0564516129032258, "grad_norm": 47.476219177246094, "learning_rate": 9.267550595681235e-07, "loss": 0.2775, "step": 131 }, { "epoch": 1.064516129032258, "grad_norm": 171.87510681152344, "learning_rate": 9.248598549473298e-07, "loss": 5.2664, "step": 132 }, { "epoch": 1.0725806451612903, "grad_norm": 76.38048553466797, "learning_rate": 9.229646503265362e-07, "loss": 0.9048, "step": 133 }, { "epoch": 1.0806451612903225, "grad_norm": 65.83001708984375, "learning_rate": 9.210694457057423e-07, "loss": 0.6209, "step": 134 }, { "epoch": 1.0887096774193548, "grad_norm": 155.04318237304688, "learning_rate": 9.191742410849487e-07, "loss": 7.5308, "step": 135 }, { "epoch": 1.096774193548387, "grad_norm": 180.7427520751953, "learning_rate": 9.17279036464155e-07, "loss": 16.1446, "step": 136 }, { "epoch": 1.1048387096774193, "grad_norm": 209.10556030273438, "learning_rate": 9.153838318433613e-07, "loss": 10.6327, "step": 137 }, { "epoch": 1.1129032258064515, "grad_norm": 31.048919677734375, "learning_rate": 9.134886272225676e-07, "loss": 0.4128, "step": 138 }, { "epoch": 1.120967741935484, "grad_norm": 419.14520263671875, "learning_rate": 9.115934226017738e-07, "loss": 45.5376, "step": 139 }, { "epoch": 1.129032258064516, "grad_norm": 18.564205169677734, "learning_rate": 9.096982179809801e-07, "loss": 0.1034, "step": 140 }, { "epoch": 1.1370967741935485, "grad_norm": 38.12897872924805, "learning_rate": 9.078030133601864e-07, "loss": 1.2526, "step": 141 }, { "epoch": 1.1451612903225807, "grad_norm": 56.46409606933594, "learning_rate": 9.059078087393928e-07, "loss": 1.3236, "step": 142 }, { "epoch": 1.153225806451613, "grad_norm": 57.76015090942383, "learning_rate": 9.04012604118599e-07, "loss": 0.3464, "step": 143 }, { "epoch": 1.1612903225806452, "grad_norm": 32.81700134277344, "learning_rate": 9.021173994978053e-07, "loss": 0.2656, "step": 144 }, { "epoch": 1.1693548387096775, "grad_norm": 40.10895919799805, "learning_rate": 9.002221948770116e-07, "loss": 0.8185, "step": 145 }, { "epoch": 1.1774193548387097, "grad_norm": 46.941200256347656, "learning_rate": 8.98326990256218e-07, "loss": 2.0041, "step": 146 }, { "epoch": 1.185483870967742, "grad_norm": 47.112571716308594, "learning_rate": 8.964317856354242e-07, "loss": 0.4361, "step": 147 }, { "epoch": 1.1935483870967742, "grad_norm": 91.99717712402344, "learning_rate": 8.945365810146304e-07, "loss": 1.2795, "step": 148 }, { "epoch": 1.2016129032258065, "grad_norm": 321.48345947265625, "learning_rate": 8.926413763938367e-07, "loss": 21.7397, "step": 149 }, { "epoch": 1.2096774193548387, "grad_norm": 420.6986083984375, "learning_rate": 8.90746171773043e-07, "loss": 37.258, "step": 150 }, { "epoch": 1.217741935483871, "grad_norm": 195.8186492919922, "learning_rate": 8.888509671522494e-07, "loss": 17.6255, "step": 151 }, { "epoch": 1.2258064516129032, "grad_norm": 131.71688842773438, "learning_rate": 8.869557625314557e-07, "loss": 1.9709, "step": 152 }, { "epoch": 1.2338709677419355, "grad_norm": 275.066162109375, "learning_rate": 8.850605579106619e-07, "loss": 10.5323, "step": 153 }, { "epoch": 1.2419354838709677, "grad_norm": 121.62479400634766, "learning_rate": 8.831653532898682e-07, "loss": 13.0309, "step": 154 }, { "epoch": 1.25, "grad_norm": 44.131412506103516, "learning_rate": 8.812701486690746e-07, "loss": 0.4236, "step": 155 }, { "epoch": 1.2580645161290323, "grad_norm": 48.05416488647461, "learning_rate": 8.793749440482808e-07, "loss": 0.787, "step": 156 }, { "epoch": 1.2661290322580645, "grad_norm": 136.1278839111328, "learning_rate": 8.774797394274871e-07, "loss": 4.653, "step": 157 }, { "epoch": 1.2741935483870968, "grad_norm": 156.81910705566406, "learning_rate": 8.755845348066934e-07, "loss": 14.6158, "step": 158 }, { "epoch": 1.282258064516129, "grad_norm": 311.3755798339844, "learning_rate": 8.736893301858996e-07, "loss": 25.6107, "step": 159 }, { "epoch": 1.2903225806451613, "grad_norm": 147.80181884765625, "learning_rate": 8.71794125565106e-07, "loss": 10.7386, "step": 160 }, { "epoch": 1.2983870967741935, "grad_norm": 326.3906555175781, "learning_rate": 8.698989209443123e-07, "loss": 41.9267, "step": 161 }, { "epoch": 1.3064516129032258, "grad_norm": 35.69755172729492, "learning_rate": 8.680037163235186e-07, "loss": 0.9466, "step": 162 }, { "epoch": 1.314516129032258, "grad_norm": 82.98596954345703, "learning_rate": 8.661085117027248e-07, "loss": 0.6258, "step": 163 }, { "epoch": 1.3225806451612903, "grad_norm": 105.8860855102539, "learning_rate": 8.642133070819311e-07, "loss": 2.386, "step": 164 }, { "epoch": 1.3306451612903225, "grad_norm": 196.2127685546875, "learning_rate": 8.623181024611374e-07, "loss": 2.511, "step": 165 }, { "epoch": 1.3387096774193548, "grad_norm": 145.88681030273438, "learning_rate": 8.604228978403437e-07, "loss": 1.8581, "step": 166 }, { "epoch": 1.346774193548387, "grad_norm": 151.8137664794922, "learning_rate": 8.5852769321955e-07, "loss": 3.5077, "step": 167 }, { "epoch": 1.3548387096774195, "grad_norm": 83.18711853027344, "learning_rate": 8.566324885987562e-07, "loss": 0.5735, "step": 168 }, { "epoch": 1.3629032258064515, "grad_norm": 137.17172241210938, "learning_rate": 8.547372839779626e-07, "loss": 2.0364, "step": 169 }, { "epoch": 1.370967741935484, "grad_norm": 486.7686767578125, "learning_rate": 8.528420793571689e-07, "loss": 130.1139, "step": 170 }, { "epoch": 1.379032258064516, "grad_norm": 53.59453582763672, "learning_rate": 8.509468747363753e-07, "loss": 1.0157, "step": 171 }, { "epoch": 1.3870967741935485, "grad_norm": 361.83270263671875, "learning_rate": 8.490516701155814e-07, "loss": 9.2801, "step": 172 }, { "epoch": 1.3951612903225805, "grad_norm": 122.44477081298828, "learning_rate": 8.471564654947877e-07, "loss": 1.7689, "step": 173 }, { "epoch": 1.403225806451613, "grad_norm": 163.0518035888672, "learning_rate": 8.452612608739941e-07, "loss": 8.8246, "step": 174 }, { "epoch": 1.4112903225806452, "grad_norm": 23.73016357421875, "learning_rate": 8.433660562532003e-07, "loss": 0.3707, "step": 175 }, { "epoch": 1.4193548387096775, "grad_norm": 392.39739990234375, "learning_rate": 8.414708516324067e-07, "loss": 19.7684, "step": 176 }, { "epoch": 1.4274193548387097, "grad_norm": 101.17041015625, "learning_rate": 8.395756470116129e-07, "loss": 1.2123, "step": 177 }, { "epoch": 1.435483870967742, "grad_norm": 129.82452392578125, "learning_rate": 8.376804423908192e-07, "loss": 1.3362, "step": 178 }, { "epoch": 1.4435483870967742, "grad_norm": 93.25836944580078, "learning_rate": 8.357852377700255e-07, "loss": 2.67, "step": 179 }, { "epoch": 1.4516129032258065, "grad_norm": 144.9728240966797, "learning_rate": 8.338900331492319e-07, "loss": 1.4356, "step": 180 }, { "epoch": 1.4596774193548387, "grad_norm": 106.55712127685547, "learning_rate": 8.31994828528438e-07, "loss": 1.0411, "step": 181 }, { "epoch": 1.467741935483871, "grad_norm": 50.07807922363281, "learning_rate": 8.300996239076443e-07, "loss": 1.0384, "step": 182 }, { "epoch": 1.4758064516129032, "grad_norm": 47.749080657958984, "learning_rate": 8.282044192868507e-07, "loss": 0.8744, "step": 183 }, { "epoch": 1.4838709677419355, "grad_norm": 505.6190185546875, "learning_rate": 8.263092146660569e-07, "loss": 3.8061, "step": 184 }, { "epoch": 1.4919354838709677, "grad_norm": 126.373779296875, "learning_rate": 8.244140100452633e-07, "loss": 1.3906, "step": 185 }, { "epoch": 1.5, "grad_norm": 95.14514923095703, "learning_rate": 8.225188054244695e-07, "loss": 0.7666, "step": 186 }, { "epoch": 1.5080645161290323, "grad_norm": 171.452392578125, "learning_rate": 8.206236008036759e-07, "loss": 4.4224, "step": 187 }, { "epoch": 1.5161290322580645, "grad_norm": 58.15391159057617, "learning_rate": 8.187283961828821e-07, "loss": 2.4317, "step": 188 }, { "epoch": 1.5241935483870968, "grad_norm": 73.09158325195312, "learning_rate": 8.168331915620885e-07, "loss": 1.3731, "step": 189 }, { "epoch": 1.532258064516129, "grad_norm": 92.6122817993164, "learning_rate": 8.149379869412948e-07, "loss": 1.0538, "step": 190 }, { "epoch": 1.5403225806451613, "grad_norm": 61.34569549560547, "learning_rate": 8.130427823205009e-07, "loss": 0.8926, "step": 191 }, { "epoch": 1.5483870967741935, "grad_norm": 46.87858200073242, "learning_rate": 8.111475776997073e-07, "loss": 0.774, "step": 192 }, { "epoch": 1.5564516129032258, "grad_norm": 32.91598129272461, "learning_rate": 8.092523730789136e-07, "loss": 0.6749, "step": 193 }, { "epoch": 1.564516129032258, "grad_norm": 69.08930206298828, "learning_rate": 8.073571684581199e-07, "loss": 0.7667, "step": 194 }, { "epoch": 1.5725806451612905, "grad_norm": 61.4034538269043, "learning_rate": 8.054619638373261e-07, "loss": 0.6325, "step": 195 }, { "epoch": 1.5806451612903225, "grad_norm": 252.8091583251953, "learning_rate": 8.035667592165325e-07, "loss": 9.2466, "step": 196 }, { "epoch": 1.588709677419355, "grad_norm": 60.166046142578125, "learning_rate": 8.016715545957387e-07, "loss": 1.7571, "step": 197 }, { "epoch": 1.596774193548387, "grad_norm": 42.4896125793457, "learning_rate": 7.997763499749451e-07, "loss": 0.5738, "step": 198 }, { "epoch": 1.6048387096774195, "grad_norm": 62.25552749633789, "learning_rate": 7.978811453541514e-07, "loss": 0.356, "step": 199 }, { "epoch": 1.6129032258064515, "grad_norm": 131.12069702148438, "learning_rate": 7.959859407333575e-07, "loss": 1.0522, "step": 200 }, { "epoch": 1.620967741935484, "grad_norm": 13.091470718383789, "learning_rate": 7.940907361125639e-07, "loss": 0.2088, "step": 201 }, { "epoch": 1.629032258064516, "grad_norm": 61.03318786621094, "learning_rate": 7.921955314917702e-07, "loss": 1.6789, "step": 202 }, { "epoch": 1.6370967741935485, "grad_norm": 14.173700332641602, "learning_rate": 7.903003268709766e-07, "loss": 0.1112, "step": 203 }, { "epoch": 1.6451612903225805, "grad_norm": 70.016845703125, "learning_rate": 7.884051222501828e-07, "loss": 0.8495, "step": 204 }, { "epoch": 1.653225806451613, "grad_norm": 217.45611572265625, "learning_rate": 7.865099176293891e-07, "loss": 15.7051, "step": 205 }, { "epoch": 1.661290322580645, "grad_norm": 99.29098510742188, "learning_rate": 7.846147130085954e-07, "loss": 1.0348, "step": 206 }, { "epoch": 1.6693548387096775, "grad_norm": 33.91508102416992, "learning_rate": 7.827195083878016e-07, "loss": 0.7734, "step": 207 }, { "epoch": 1.6774193548387095, "grad_norm": 93.15892028808594, "learning_rate": 7.80824303767008e-07, "loss": 0.3326, "step": 208 }, { "epoch": 1.685483870967742, "grad_norm": 14.714130401611328, "learning_rate": 7.789290991462143e-07, "loss": 0.3946, "step": 209 }, { "epoch": 1.6935483870967742, "grad_norm": 22.98834800720215, "learning_rate": 7.770338945254205e-07, "loss": 0.2196, "step": 210 }, { "epoch": 1.7016129032258065, "grad_norm": 165.18626403808594, "learning_rate": 7.751386899046268e-07, "loss": 3.0653, "step": 211 }, { "epoch": 1.7096774193548387, "grad_norm": 55.49916458129883, "learning_rate": 7.732434852838332e-07, "loss": 0.8254, "step": 212 }, { "epoch": 1.717741935483871, "grad_norm": 25.563867568969727, "learning_rate": 7.713482806630394e-07, "loss": 0.1924, "step": 213 }, { "epoch": 1.7258064516129032, "grad_norm": 277.5033874511719, "learning_rate": 7.694530760422456e-07, "loss": 26.1251, "step": 214 }, { "epoch": 1.7338709677419355, "grad_norm": 218.53463745117188, "learning_rate": 7.67557871421452e-07, "loss": 16.69, "step": 215 }, { "epoch": 1.7419354838709677, "grad_norm": 48.32767868041992, "learning_rate": 7.656626668006582e-07, "loss": 1.267, "step": 216 }, { "epoch": 1.75, "grad_norm": 297.5074462890625, "learning_rate": 7.637674621798646e-07, "loss": 8.8952, "step": 217 }, { "epoch": 1.7580645161290323, "grad_norm": 19.5325927734375, "learning_rate": 7.618722575590709e-07, "loss": 0.1047, "step": 218 }, { "epoch": 1.7661290322580645, "grad_norm": 382.64434814453125, "learning_rate": 7.599770529382771e-07, "loss": 18.6839, "step": 219 }, { "epoch": 1.7741935483870968, "grad_norm": 18.517972946166992, "learning_rate": 7.580818483174834e-07, "loss": 0.1679, "step": 220 }, { "epoch": 1.782258064516129, "grad_norm": 36.31782913208008, "learning_rate": 7.561866436966898e-07, "loss": 0.1592, "step": 221 }, { "epoch": 1.7903225806451613, "grad_norm": 44.47116470336914, "learning_rate": 7.54291439075896e-07, "loss": 0.6091, "step": 222 }, { "epoch": 1.7983870967741935, "grad_norm": 16.11575698852539, "learning_rate": 7.523962344551024e-07, "loss": 0.1174, "step": 223 }, { "epoch": 1.8064516129032258, "grad_norm": 288.2611999511719, "learning_rate": 7.505010298343086e-07, "loss": 18.947, "step": 224 }, { "epoch": 1.814516129032258, "grad_norm": 267.47540283203125, "learning_rate": 7.486058252135149e-07, "loss": 10.6328, "step": 225 }, { "epoch": 1.8225806451612905, "grad_norm": 33.86650848388672, "learning_rate": 7.467106205927212e-07, "loss": 2.1226, "step": 226 }, { "epoch": 1.8306451612903225, "grad_norm": 121.6884765625, "learning_rate": 7.448154159719275e-07, "loss": 0.371, "step": 227 }, { "epoch": 1.838709677419355, "grad_norm": 42.18556594848633, "learning_rate": 7.429202113511338e-07, "loss": 0.7408, "step": 228 }, { "epoch": 1.846774193548387, "grad_norm": 64.0304946899414, "learning_rate": 7.4102500673034e-07, "loss": 0.7927, "step": 229 }, { "epoch": 1.8548387096774195, "grad_norm": 211.26959228515625, "learning_rate": 7.391298021095464e-07, "loss": 8.6268, "step": 230 }, { "epoch": 1.8629032258064515, "grad_norm": 97.74939727783203, "learning_rate": 7.372345974887527e-07, "loss": 1.1381, "step": 231 }, { "epoch": 1.870967741935484, "grad_norm": 325.8834228515625, "learning_rate": 7.35339392867959e-07, "loss": 1.0786, "step": 232 }, { "epoch": 1.879032258064516, "grad_norm": 49.10792541503906, "learning_rate": 7.334441882471652e-07, "loss": 0.5383, "step": 233 }, { "epoch": 1.8870967741935485, "grad_norm": 27.410547256469727, "learning_rate": 7.315489836263715e-07, "loss": 0.4044, "step": 234 }, { "epoch": 1.8951612903225805, "grad_norm": 31.08856773376465, "learning_rate": 7.296537790055778e-07, "loss": 0.2857, "step": 235 }, { "epoch": 1.903225806451613, "grad_norm": 236.7850341796875, "learning_rate": 7.277585743847841e-07, "loss": 10.882, "step": 236 }, { "epoch": 1.911290322580645, "grad_norm": 85.23385620117188, "learning_rate": 7.258633697639905e-07, "loss": 6.385, "step": 237 }, { "epoch": 1.9193548387096775, "grad_norm": 42.652278900146484, "learning_rate": 7.239681651431966e-07, "loss": 0.7745, "step": 238 }, { "epoch": 1.9274193548387095, "grad_norm": 37.44199752807617, "learning_rate": 7.22072960522403e-07, "loss": 0.6186, "step": 239 }, { "epoch": 1.935483870967742, "grad_norm": 122.3661117553711, "learning_rate": 7.201777559016093e-07, "loss": 3.3579, "step": 240 }, { "epoch": 1.9435483870967742, "grad_norm": 29.615245819091797, "learning_rate": 7.182825512808157e-07, "loss": 1.0682, "step": 241 }, { "epoch": 1.9516129032258065, "grad_norm": 38.644981384277344, "learning_rate": 7.163873466600218e-07, "loss": 1.6679, "step": 242 }, { "epoch": 1.9596774193548387, "grad_norm": 151.14231872558594, "learning_rate": 7.144921420392281e-07, "loss": 3.0028, "step": 243 }, { "epoch": 1.967741935483871, "grad_norm": 90.12117767333984, "learning_rate": 7.125969374184345e-07, "loss": 3.1295, "step": 244 }, { "epoch": 1.9758064516129032, "grad_norm": 65.80992889404297, "learning_rate": 7.107017327976407e-07, "loss": 0.8262, "step": 245 }, { "epoch": 1.9838709677419355, "grad_norm": 224.11376953125, "learning_rate": 7.088065281768471e-07, "loss": 2.9559, "step": 246 }, { "epoch": 1.9919354838709677, "grad_norm": 165.00535583496094, "learning_rate": 7.069113235560533e-07, "loss": 1.3005, "step": 247 }, { "epoch": 2.0, "grad_norm": 178.8065185546875, "learning_rate": 7.050161189352596e-07, "loss": 2.704, "step": 248 }, { "epoch": 2.0, "eval_loss": 3.476463556289673, "eval_mae": 1.1249998807907104, "eval_mse": 3.476463794708252, "eval_r2": 0.04820472002029419, "eval_rmse": 1.8645277672129885, "eval_runtime": 1.4258, "eval_samples_per_second": 38.574, "eval_smape": 47.31981158256531, "eval_steps_per_second": 9.819, "step": 248 } ], "logging_steps": 1, "max_steps": 620, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1343939627507712.0, "train_batch_size": 4, "trial_name": null, "trial_params": { "learning_rate": 1.1750268648920993e-06 } }