{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 3, "global_step": 282, "is_hyper_param_search": true, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.031914893617021274, "grad_norm": 219.1436309814453, "learning_rate": 4.687722050568071e-06, "loss": 1.6437, "step": 3 }, { "epoch": 0.031914893617021274, "eval_loss": 1.2238304615020752, "eval_runtime": 49.6348, "eval_samples_per_second": 6.044, "eval_steps_per_second": 0.201, "step": 3 }, { "epoch": 0.06382978723404255, "grad_norm": 151.3193817138672, "learning_rate": 4.637316437121103e-06, "loss": 1.5519, "step": 6 }, { "epoch": 0.06382978723404255, "eval_loss": 1.166587471961975, "eval_runtime": 43.6496, "eval_samples_per_second": 6.873, "eval_steps_per_second": 0.229, "step": 6 }, { "epoch": 0.09574468085106383, "grad_norm": 151.81333923339844, "learning_rate": 4.586910823674134e-06, "loss": 1.1065, "step": 9 }, { "epoch": 0.09574468085106383, "eval_loss": 1.167779803276062, "eval_runtime": 43.8309, "eval_samples_per_second": 6.844, "eval_steps_per_second": 0.228, "step": 9 }, { "epoch": 0.1276595744680851, "grad_norm": 216.54078674316406, "learning_rate": 4.536505210227166e-06, "loss": 1.0103, "step": 12 }, { "epoch": 0.1276595744680851, "eval_loss": 1.1300101280212402, "eval_runtime": 42.8188, "eval_samples_per_second": 7.006, "eval_steps_per_second": 0.234, "step": 12 }, { "epoch": 0.1595744680851064, "grad_norm": 96.97032928466797, "learning_rate": 4.486099596780197e-06, "loss": 1.1391, "step": 15 }, { "epoch": 0.1595744680851064, "eval_loss": 1.093359351158142, "eval_runtime": 43.7071, "eval_samples_per_second": 6.864, "eval_steps_per_second": 0.229, "step": 15 }, { "epoch": 0.19148936170212766, "grad_norm": 93.6313247680664, "learning_rate": 4.435693983333229e-06, "loss": 1.1749, "step": 18 }, { "epoch": 0.19148936170212766, "eval_loss": 1.0837420225143433, "eval_runtime": 42.6236, "eval_samples_per_second": 7.038, "eval_steps_per_second": 0.235, "step": 18 }, { "epoch": 0.22340425531914893, "grad_norm": 128.5612030029297, "learning_rate": 4.38528836988626e-06, "loss": 1.0878, "step": 21 }, { "epoch": 0.22340425531914893, "eval_loss": 1.0713236331939697, "eval_runtime": 43.1057, "eval_samples_per_second": 6.96, "eval_steps_per_second": 0.232, "step": 21 }, { "epoch": 0.2553191489361702, "grad_norm": 106.38141632080078, "learning_rate": 4.334882756439292e-06, "loss": 0.8829, "step": 24 }, { "epoch": 0.2553191489361702, "eval_loss": 1.0457205772399902, "eval_runtime": 43.4262, "eval_samples_per_second": 6.908, "eval_steps_per_second": 0.23, "step": 24 }, { "epoch": 0.2872340425531915, "grad_norm": 109.02174377441406, "learning_rate": 4.284477142992323e-06, "loss": 1.3004, "step": 27 }, { "epoch": 0.2872340425531915, "eval_loss": 1.015425205230713, "eval_runtime": 42.8448, "eval_samples_per_second": 7.002, "eval_steps_per_second": 0.233, "step": 27 }, { "epoch": 0.3191489361702128, "grad_norm": 93.8167495727539, "learning_rate": 4.234071529545355e-06, "loss": 1.0768, "step": 30 }, { "epoch": 0.3191489361702128, "eval_loss": 0.9899512529373169, "eval_runtime": 62.0193, "eval_samples_per_second": 4.837, "eval_steps_per_second": 0.161, "step": 30 }, { "epoch": 0.35106382978723405, "grad_norm": 117.0296401977539, "learning_rate": 4.183665916098386e-06, "loss": 0.8761, "step": 33 }, { "epoch": 0.35106382978723405, "eval_loss": 0.9733409881591797, "eval_runtime": 43.6195, "eval_samples_per_second": 6.878, "eval_steps_per_second": 0.229, "step": 33 }, { "epoch": 0.3829787234042553, "grad_norm": 97.06668090820312, "learning_rate": 4.1332603026514175e-06, "loss": 1.0648, "step": 36 }, { "epoch": 0.3829787234042553, "eval_loss": 0.9584157466888428, "eval_runtime": 43.1964, "eval_samples_per_second": 6.945, "eval_steps_per_second": 0.232, "step": 36 }, { "epoch": 0.4148936170212766, "grad_norm": 69.0722427368164, "learning_rate": 4.082854689204449e-06, "loss": 0.8425, "step": 39 }, { "epoch": 0.4148936170212766, "eval_loss": 0.9429272413253784, "eval_runtime": 43.4379, "eval_samples_per_second": 6.906, "eval_steps_per_second": 0.23, "step": 39 }, { "epoch": 0.44680851063829785, "grad_norm": 72.62019348144531, "learning_rate": 4.032449075757481e-06, "loss": 0.8714, "step": 42 }, { "epoch": 0.44680851063829785, "eval_loss": 0.9335389137268066, "eval_runtime": 42.713, "eval_samples_per_second": 7.024, "eval_steps_per_second": 0.234, "step": 42 }, { "epoch": 0.4787234042553192, "grad_norm": 91.91283416748047, "learning_rate": 3.982043462310513e-06, "loss": 0.9101, "step": 45 }, { "epoch": 0.4787234042553192, "eval_loss": 0.9222455024719238, "eval_runtime": 43.5273, "eval_samples_per_second": 6.892, "eval_steps_per_second": 0.23, "step": 45 }, { "epoch": 0.5106382978723404, "grad_norm": 71.0954360961914, "learning_rate": 3.931637848863544e-06, "loss": 1.0336, "step": 48 }, { "epoch": 0.5106382978723404, "eval_loss": 0.9134926199913025, "eval_runtime": 46.5111, "eval_samples_per_second": 6.45, "eval_steps_per_second": 0.215, "step": 48 }, { "epoch": 0.5425531914893617, "grad_norm": 92.62516021728516, "learning_rate": 3.881232235416575e-06, "loss": 0.941, "step": 51 }, { "epoch": 0.5425531914893617, "eval_loss": 0.9049246311187744, "eval_runtime": 43.2332, "eval_samples_per_second": 6.939, "eval_steps_per_second": 0.231, "step": 51 }, { "epoch": 0.574468085106383, "grad_norm": 87.83678436279297, "learning_rate": 3.830826621969607e-06, "loss": 1.1561, "step": 54 }, { "epoch": 0.574468085106383, "eval_loss": 0.9015933275222778, "eval_runtime": 43.95, "eval_samples_per_second": 6.826, "eval_steps_per_second": 0.228, "step": 54 }, { "epoch": 0.6063829787234043, "grad_norm": 81.34359741210938, "learning_rate": 3.780421008522638e-06, "loss": 0.9756, "step": 57 }, { "epoch": 0.6063829787234043, "eval_loss": 0.8971786499023438, "eval_runtime": 43.2014, "eval_samples_per_second": 6.944, "eval_steps_per_second": 0.231, "step": 57 }, { "epoch": 0.6382978723404256, "grad_norm": 64.89433288574219, "learning_rate": 3.73001539507567e-06, "loss": 0.749, "step": 60 }, { "epoch": 0.6382978723404256, "eval_loss": 0.8940334916114807, "eval_runtime": 50.073, "eval_samples_per_second": 5.991, "eval_steps_per_second": 0.2, "step": 60 }, { "epoch": 0.6702127659574468, "grad_norm": 92.25138854980469, "learning_rate": 3.679609781628701e-06, "loss": 0.9315, "step": 63 }, { "epoch": 0.6702127659574468, "eval_loss": 0.8884549736976624, "eval_runtime": 46.5461, "eval_samples_per_second": 6.445, "eval_steps_per_second": 0.215, "step": 63 }, { "epoch": 0.7021276595744681, "grad_norm": 86.6369400024414, "learning_rate": 3.629204168181733e-06, "loss": 1.1571, "step": 66 }, { "epoch": 0.7021276595744681, "eval_loss": 0.883185625076294, "eval_runtime": 43.0575, "eval_samples_per_second": 6.967, "eval_steps_per_second": 0.232, "step": 66 }, { "epoch": 0.7340425531914894, "grad_norm": 87.74919128417969, "learning_rate": 3.5787985547347642e-06, "loss": 1.3094, "step": 69 }, { "epoch": 0.7340425531914894, "eval_loss": 0.8721677660942078, "eval_runtime": 43.1156, "eval_samples_per_second": 6.958, "eval_steps_per_second": 0.232, "step": 69 }, { "epoch": 0.7659574468085106, "grad_norm": 81.59742736816406, "learning_rate": 3.5283929412877956e-06, "loss": 1.0689, "step": 72 }, { "epoch": 0.7659574468085106, "eval_loss": 0.8642645478248596, "eval_runtime": 43.5775, "eval_samples_per_second": 6.884, "eval_steps_per_second": 0.229, "step": 72 }, { "epoch": 0.7978723404255319, "grad_norm": 81.44770050048828, "learning_rate": 3.477987327840827e-06, "loss": 0.8599, "step": 75 }, { "epoch": 0.7978723404255319, "eval_loss": 0.8605104684829712, "eval_runtime": 43.731, "eval_samples_per_second": 6.86, "eval_steps_per_second": 0.229, "step": 75 }, { "epoch": 0.8297872340425532, "grad_norm": 74.29925537109375, "learning_rate": 3.427581714393859e-06, "loss": 1.0591, "step": 78 }, { "epoch": 0.8297872340425532, "eval_loss": 0.8572540283203125, "eval_runtime": 43.0132, "eval_samples_per_second": 6.975, "eval_steps_per_second": 0.232, "step": 78 }, { "epoch": 0.8617021276595744, "grad_norm": 80.95857238769531, "learning_rate": 3.37717610094689e-06, "loss": 0.8628, "step": 81 }, { "epoch": 0.8617021276595744, "eval_loss": 0.854705810546875, "eval_runtime": 43.5041, "eval_samples_per_second": 6.896, "eval_steps_per_second": 0.23, "step": 81 }, { "epoch": 0.8936170212765957, "grad_norm": 73.22078704833984, "learning_rate": 3.3267704874999216e-06, "loss": 0.8449, "step": 84 }, { "epoch": 0.8936170212765957, "eval_loss": 0.8551430106163025, "eval_runtime": 43.3103, "eval_samples_per_second": 6.927, "eval_steps_per_second": 0.231, "step": 84 }, { "epoch": 0.925531914893617, "grad_norm": 81.8653564453125, "learning_rate": 3.2763648740529535e-06, "loss": 0.8565, "step": 87 }, { "epoch": 0.925531914893617, "eval_loss": 0.8558651804924011, "eval_runtime": 43.4899, "eval_samples_per_second": 6.898, "eval_steps_per_second": 0.23, "step": 87 }, { "epoch": 0.9574468085106383, "grad_norm": 97.51951599121094, "learning_rate": 3.2259592606059844e-06, "loss": 0.8928, "step": 90 }, { "epoch": 0.9574468085106383, "eval_loss": 0.8567631244659424, "eval_runtime": 43.4864, "eval_samples_per_second": 6.899, "eval_steps_per_second": 0.23, "step": 90 }, { "epoch": 0.9893617021276596, "grad_norm": 60.12514114379883, "learning_rate": 3.1755536471590163e-06, "loss": 0.7314, "step": 93 }, { "epoch": 0.9893617021276596, "eval_loss": 0.8565493822097778, "eval_runtime": 22.6087, "eval_samples_per_second": 13.269, "eval_steps_per_second": 0.442, "step": 93 }, { "epoch": 1.0212765957446808, "grad_norm": 57.255775451660156, "learning_rate": 3.1251480337120472e-06, "loss": 0.7931, "step": 96 }, { "epoch": 1.0212765957446808, "eval_loss": 0.8531547784805298, "eval_runtime": 23.9019, "eval_samples_per_second": 12.551, "eval_steps_per_second": 0.418, "step": 96 }, { "epoch": 1.053191489361702, "grad_norm": 43.00119400024414, "learning_rate": 3.074742420265079e-06, "loss": 0.3847, "step": 99 }, { "epoch": 1.053191489361702, "eval_loss": 0.84489506483078, "eval_runtime": 23.0798, "eval_samples_per_second": 12.998, "eval_steps_per_second": 0.433, "step": 99 }, { "epoch": 1.0851063829787233, "grad_norm": 21.35872459411621, "learning_rate": 3.024336806818111e-06, "loss": 0.252, "step": 102 }, { "epoch": 1.0851063829787233, "eval_loss": 0.8377478122711182, "eval_runtime": 23.6666, "eval_samples_per_second": 12.676, "eval_steps_per_second": 0.423, "step": 102 }, { "epoch": 1.1170212765957448, "grad_norm": 42.683170318603516, "learning_rate": 2.973931193371142e-06, "loss": 0.2581, "step": 105 }, { "epoch": 1.1170212765957448, "eval_loss": 0.832615315914154, "eval_runtime": 22.1568, "eval_samples_per_second": 13.54, "eval_steps_per_second": 0.451, "step": 105 }, { "epoch": 1.148936170212766, "grad_norm": 36.07487487792969, "learning_rate": 2.9235255799241737e-06, "loss": 0.272, "step": 108 }, { "epoch": 1.148936170212766, "eval_loss": 0.8290462493896484, "eval_runtime": 23.4484, "eval_samples_per_second": 12.794, "eval_steps_per_second": 0.426, "step": 108 }, { "epoch": 1.1808510638297873, "grad_norm": 25.169939041137695, "learning_rate": 2.873119966477205e-06, "loss": 0.3822, "step": 111 }, { "epoch": 1.1808510638297873, "eval_loss": 0.8273450136184692, "eval_runtime": 22.2076, "eval_samples_per_second": 13.509, "eval_steps_per_second": 0.45, "step": 111 }, { "epoch": 1.2127659574468086, "grad_norm": 43.500083923339844, "learning_rate": 2.8227143530302365e-06, "loss": 0.3555, "step": 114 }, { "epoch": 1.2127659574468086, "eval_loss": 0.825173020362854, "eval_runtime": 23.9364, "eval_samples_per_second": 12.533, "eval_steps_per_second": 0.418, "step": 114 }, { "epoch": 1.2446808510638299, "grad_norm": 47.19260787963867, "learning_rate": 2.772308739583268e-06, "loss": 0.2687, "step": 117 }, { "epoch": 1.2446808510638299, "eval_loss": 0.8255329132080078, "eval_runtime": 22.128, "eval_samples_per_second": 13.558, "eval_steps_per_second": 0.452, "step": 117 }, { "epoch": 1.2765957446808511, "grad_norm": 47.747982025146484, "learning_rate": 2.7219031261362997e-06, "loss": 0.2614, "step": 120 }, { "epoch": 1.2765957446808511, "eval_loss": 0.8268000483512878, "eval_runtime": 23.832, "eval_samples_per_second": 12.588, "eval_steps_per_second": 0.42, "step": 120 }, { "epoch": 1.3085106382978724, "grad_norm": 52.21699142456055, "learning_rate": 2.671497512689331e-06, "loss": 0.3726, "step": 123 }, { "epoch": 1.3085106382978724, "eval_loss": 0.8281159400939941, "eval_runtime": 22.7023, "eval_samples_per_second": 13.215, "eval_steps_per_second": 0.44, "step": 123 }, { "epoch": 1.3404255319148937, "grad_norm": 43.84182357788086, "learning_rate": 2.6210918992423625e-06, "loss": 0.2081, "step": 126 }, { "epoch": 1.3404255319148937, "eval_loss": 0.8298842310905457, "eval_runtime": 23.5811, "eval_samples_per_second": 12.722, "eval_steps_per_second": 0.424, "step": 126 }, { "epoch": 1.372340425531915, "grad_norm": 35.47488021850586, "learning_rate": 2.570686285795394e-06, "loss": 0.2363, "step": 129 }, { "epoch": 1.372340425531915, "eval_loss": 0.8358010053634644, "eval_runtime": 22.1111, "eval_samples_per_second": 13.568, "eval_steps_per_second": 0.452, "step": 129 }, { "epoch": 1.4042553191489362, "grad_norm": 47.503395080566406, "learning_rate": 2.5202806723484253e-06, "loss": 0.3497, "step": 132 }, { "epoch": 1.4042553191489362, "eval_loss": 0.8382316827774048, "eval_runtime": 23.6168, "eval_samples_per_second": 12.703, "eval_steps_per_second": 0.423, "step": 132 }, { "epoch": 1.4361702127659575, "grad_norm": 43.59933853149414, "learning_rate": 2.469875058901457e-06, "loss": 0.2927, "step": 135 }, { "epoch": 1.4361702127659575, "eval_loss": 0.8399573564529419, "eval_runtime": 22.1313, "eval_samples_per_second": 13.555, "eval_steps_per_second": 0.452, "step": 135 }, { "epoch": 1.4680851063829787, "grad_norm": 46.75539779663086, "learning_rate": 2.419469445454488e-06, "loss": 0.3142, "step": 138 }, { "epoch": 1.4680851063829787, "eval_loss": 0.841060996055603, "eval_runtime": 24.0437, "eval_samples_per_second": 12.477, "eval_steps_per_second": 0.416, "step": 138 }, { "epoch": 1.5, "grad_norm": 47.08267593383789, "learning_rate": 2.36906383200752e-06, "loss": 0.2303, "step": 141 }, { "epoch": 1.5, "eval_loss": 0.8422343730926514, "eval_runtime": 21.8892, "eval_samples_per_second": 13.705, "eval_steps_per_second": 0.457, "step": 141 }, { "epoch": 1.5319148936170213, "grad_norm": 59.37859344482422, "learning_rate": 2.3186582185605513e-06, "loss": 0.2628, "step": 144 }, { "epoch": 1.5319148936170213, "eval_loss": 0.843423068523407, "eval_runtime": 23.5581, "eval_samples_per_second": 12.735, "eval_steps_per_second": 0.424, "step": 144 }, { "epoch": 1.5638297872340425, "grad_norm": 66.62377166748047, "learning_rate": 2.268252605113583e-06, "loss": 0.2734, "step": 147 }, { "epoch": 1.5638297872340425, "eval_loss": 0.8458875417709351, "eval_runtime": 22.7309, "eval_samples_per_second": 13.198, "eval_steps_per_second": 0.44, "step": 147 }, { "epoch": 1.5957446808510638, "grad_norm": 65.94501495361328, "learning_rate": 2.2178469916666146e-06, "loss": 0.2989, "step": 150 }, { "epoch": 1.5957446808510638, "eval_loss": 0.8469556570053101, "eval_runtime": 23.6111, "eval_samples_per_second": 12.706, "eval_steps_per_second": 0.424, "step": 150 }, { "epoch": 1.627659574468085, "grad_norm": 99.2269287109375, "learning_rate": 2.167441378219646e-06, "loss": 0.3895, "step": 153 }, { "epoch": 1.627659574468085, "eval_loss": 0.8473237752914429, "eval_runtime": 22.2962, "eval_samples_per_second": 13.455, "eval_steps_per_second": 0.449, "step": 153 }, { "epoch": 1.6595744680851063, "grad_norm": 45.91320037841797, "learning_rate": 2.1170357647726774e-06, "loss": 0.3273, "step": 156 }, { "epoch": 1.6595744680851063, "eval_loss": 0.8420724868774414, "eval_runtime": 24.0122, "eval_samples_per_second": 12.494, "eval_steps_per_second": 0.416, "step": 156 }, { "epoch": 1.6914893617021276, "grad_norm": 52.12400817871094, "learning_rate": 2.0666301513257088e-06, "loss": 0.2508, "step": 159 }, { "epoch": 1.6914893617021276, "eval_loss": 0.8369884490966797, "eval_runtime": 22.131, "eval_samples_per_second": 13.556, "eval_steps_per_second": 0.452, "step": 159 }, { "epoch": 1.7234042553191489, "grad_norm": 59.420372009277344, "learning_rate": 2.0162245378787406e-06, "loss": 0.2143, "step": 162 }, { "epoch": 1.7234042553191489, "eval_loss": 0.8358878493309021, "eval_runtime": 23.6554, "eval_samples_per_second": 12.682, "eval_steps_per_second": 0.423, "step": 162 }, { "epoch": 1.7553191489361701, "grad_norm": 53.903350830078125, "learning_rate": 1.965818924431772e-06, "loss": 0.2337, "step": 165 }, { "epoch": 1.7553191489361701, "eval_loss": 0.8342965841293335, "eval_runtime": 22.1781, "eval_samples_per_second": 13.527, "eval_steps_per_second": 0.451, "step": 165 }, { "epoch": 1.7872340425531914, "grad_norm": 37.46884536743164, "learning_rate": 1.9154133109848034e-06, "loss": 0.1867, "step": 168 }, { "epoch": 1.7872340425531914, "eval_loss": 0.8319395184516907, "eval_runtime": 23.6724, "eval_samples_per_second": 12.673, "eval_steps_per_second": 0.422, "step": 168 }, { "epoch": 1.8191489361702127, "grad_norm": 73.817138671875, "learning_rate": 1.865007697537835e-06, "loss": 0.3836, "step": 171 }, { "epoch": 1.8191489361702127, "eval_loss": 0.8294512033462524, "eval_runtime": 22.7496, "eval_samples_per_second": 13.187, "eval_steps_per_second": 0.44, "step": 171 }, { "epoch": 1.851063829787234, "grad_norm": 66.78804016113281, "learning_rate": 1.8146020840908664e-06, "loss": 0.2939, "step": 174 }, { "epoch": 1.851063829787234, "eval_loss": 0.8301381468772888, "eval_runtime": 23.7921, "eval_samples_per_second": 12.609, "eval_steps_per_second": 0.42, "step": 174 }, { "epoch": 1.8829787234042552, "grad_norm": 27.3295841217041, "learning_rate": 1.7641964706438978e-06, "loss": 0.2702, "step": 177 }, { "epoch": 1.8829787234042552, "eval_loss": 0.8287379741668701, "eval_runtime": 22.3208, "eval_samples_per_second": 13.44, "eval_steps_per_second": 0.448, "step": 177 }, { "epoch": 1.9148936170212765, "grad_norm": 34.200496673583984, "learning_rate": 1.7137908571969294e-06, "loss": 0.1949, "step": 180 }, { "epoch": 1.9148936170212765, "eval_loss": 0.827938437461853, "eval_runtime": 23.7637, "eval_samples_per_second": 12.624, "eval_steps_per_second": 0.421, "step": 180 }, { "epoch": 1.9468085106382977, "grad_norm": 57.093162536621094, "learning_rate": 1.6633852437499608e-06, "loss": 0.2186, "step": 183 }, { "epoch": 1.9468085106382977, "eval_loss": 0.8297010064125061, "eval_runtime": 22.4348, "eval_samples_per_second": 13.372, "eval_steps_per_second": 0.446, "step": 183 }, { "epoch": 1.978723404255319, "grad_norm": 54.119667053222656, "learning_rate": 1.6129796303029922e-06, "loss": 0.2609, "step": 186 }, { "epoch": 1.978723404255319, "eval_loss": 0.8327953219413757, "eval_runtime": 24.1101, "eval_samples_per_second": 12.443, "eval_steps_per_second": 0.415, "step": 186 }, { "epoch": 2.0106382978723403, "grad_norm": 34.58624267578125, "learning_rate": 1.5625740168560236e-06, "loss": 0.1952, "step": 189 }, { "epoch": 2.0106382978723403, "eval_loss": 0.8341763615608215, "eval_runtime": 45.5245, "eval_samples_per_second": 6.59, "eval_steps_per_second": 0.22, "step": 189 }, { "epoch": 2.0425531914893615, "grad_norm": 35.64149856567383, "learning_rate": 1.5121684034090554e-06, "loss": 0.0993, "step": 192 }, { "epoch": 2.0425531914893615, "eval_loss": 0.837165355682373, "eval_runtime": 13.0218, "eval_samples_per_second": 23.038, "eval_steps_per_second": 0.768, "step": 192 }, { "epoch": 2.074468085106383, "grad_norm": 50.38720703125, "learning_rate": 1.4617627899620868e-06, "loss": 0.1409, "step": 195 }, { "epoch": 2.074468085106383, "eval_loss": 0.8386108875274658, "eval_runtime": 32.929, "eval_samples_per_second": 9.11, "eval_steps_per_second": 0.304, "step": 195 }, { "epoch": 2.106382978723404, "grad_norm": 28.053754806518555, "learning_rate": 1.4113571765151182e-06, "loss": 0.0522, "step": 198 }, { "epoch": 2.106382978723404, "eval_loss": 0.8413487076759338, "eval_runtime": 13.422, "eval_samples_per_second": 22.351, "eval_steps_per_second": 0.745, "step": 198 }, { "epoch": 2.1382978723404253, "grad_norm": 48.99291229248047, "learning_rate": 1.3609515630681499e-06, "loss": 0.123, "step": 201 }, { "epoch": 2.1382978723404253, "eval_loss": 0.8437539339065552, "eval_runtime": 32.9133, "eval_samples_per_second": 9.115, "eval_steps_per_second": 0.304, "step": 201 }, { "epoch": 2.1702127659574466, "grad_norm": 30.540855407714844, "learning_rate": 1.3105459496211813e-06, "loss": 0.0946, "step": 204 }, { "epoch": 2.1702127659574466, "eval_loss": 0.8469984531402588, "eval_runtime": 12.8212, "eval_samples_per_second": 23.399, "eval_steps_per_second": 0.78, "step": 204 }, { "epoch": 2.202127659574468, "grad_norm": 65.50384521484375, "learning_rate": 1.2601403361742127e-06, "loss": 0.2346, "step": 207 }, { "epoch": 2.202127659574468, "eval_loss": 0.8491196036338806, "eval_runtime": 33.0116, "eval_samples_per_second": 9.088, "eval_steps_per_second": 0.303, "step": 207 }, { "epoch": 2.2340425531914896, "grad_norm": 37.9770622253418, "learning_rate": 1.209734722727244e-06, "loss": 0.1445, "step": 210 }, { "epoch": 2.2340425531914896, "eval_loss": 0.8509886860847473, "eval_runtime": 12.9443, "eval_samples_per_second": 23.176, "eval_steps_per_second": 0.773, "step": 210 }, { "epoch": 2.2659574468085104, "grad_norm": 34.36880111694336, "learning_rate": 1.1593291092802757e-06, "loss": 0.1112, "step": 213 }, { "epoch": 2.2659574468085104, "eval_loss": 0.8533729910850525, "eval_runtime": 33.1181, "eval_samples_per_second": 9.058, "eval_steps_per_second": 0.302, "step": 213 }, { "epoch": 2.297872340425532, "grad_norm": 45.28392028808594, "learning_rate": 1.1089234958333073e-06, "loss": 0.113, "step": 216 }, { "epoch": 2.297872340425532, "eval_loss": 0.8565312623977661, "eval_runtime": 12.9159, "eval_samples_per_second": 23.227, "eval_steps_per_second": 0.774, "step": 216 }, { "epoch": 2.329787234042553, "grad_norm": 24.961444854736328, "learning_rate": 1.0585178823863387e-06, "loss": 0.0929, "step": 219 }, { "epoch": 2.329787234042553, "eval_loss": 0.859375, "eval_runtime": 32.6853, "eval_samples_per_second": 9.178, "eval_steps_per_second": 0.306, "step": 219 }, { "epoch": 2.3617021276595747, "grad_norm": 22.85188865661621, "learning_rate": 1.0081122689393703e-06, "loss": 0.1108, "step": 222 }, { "epoch": 2.3617021276595747, "eval_loss": 0.8612162470817566, "eval_runtime": 13.8881, "eval_samples_per_second": 21.601, "eval_steps_per_second": 0.72, "step": 222 }, { "epoch": 2.393617021276596, "grad_norm": 20.546833038330078, "learning_rate": 9.577066554924017e-07, "loss": 0.092, "step": 225 }, { "epoch": 2.393617021276596, "eval_loss": 0.8626397848129272, "eval_runtime": 32.9393, "eval_samples_per_second": 9.108, "eval_steps_per_second": 0.304, "step": 225 }, { "epoch": 2.425531914893617, "grad_norm": 36.019561767578125, "learning_rate": 9.073010420454332e-07, "loss": 0.0641, "step": 228 }, { "epoch": 2.425531914893617, "eval_loss": 0.8644223809242249, "eval_runtime": 12.991, "eval_samples_per_second": 23.093, "eval_steps_per_second": 0.77, "step": 228 }, { "epoch": 2.4574468085106385, "grad_norm": 15.907193183898926, "learning_rate": 8.568954285984647e-07, "loss": 0.0437, "step": 231 }, { "epoch": 2.4574468085106385, "eval_loss": 0.8656131029129028, "eval_runtime": 33.0386, "eval_samples_per_second": 9.08, "eval_steps_per_second": 0.303, "step": 231 }, { "epoch": 2.4893617021276597, "grad_norm": 30.25502586364746, "learning_rate": 8.064898151514961e-07, "loss": 0.097, "step": 234 }, { "epoch": 2.4893617021276597, "eval_loss": 0.8662587404251099, "eval_runtime": 12.9969, "eval_samples_per_second": 23.082, "eval_steps_per_second": 0.769, "step": 234 }, { "epoch": 2.521276595744681, "grad_norm": 34.13824462890625, "learning_rate": 7.560842017045277e-07, "loss": 0.1028, "step": 237 }, { "epoch": 2.521276595744681, "eval_loss": 0.8667454123497009, "eval_runtime": 33.0579, "eval_samples_per_second": 9.075, "eval_steps_per_second": 0.302, "step": 237 }, { "epoch": 2.5531914893617023, "grad_norm": 47.30521011352539, "learning_rate": 7.056785882575591e-07, "loss": 0.1503, "step": 240 }, { "epoch": 2.5531914893617023, "eval_loss": 0.8677744269371033, "eval_runtime": 12.8958, "eval_samples_per_second": 23.263, "eval_steps_per_second": 0.775, "step": 240 }, { "epoch": 2.5851063829787235, "grad_norm": 40.989444732666016, "learning_rate": 6.552729748105906e-07, "loss": 0.1286, "step": 243 }, { "epoch": 2.5851063829787235, "eval_loss": 0.8686262965202332, "eval_runtime": 33.1611, "eval_samples_per_second": 9.047, "eval_steps_per_second": 0.302, "step": 243 }, { "epoch": 2.617021276595745, "grad_norm": 58.63330841064453, "learning_rate": 6.04867361363622e-07, "loss": 0.1938, "step": 246 }, { "epoch": 2.617021276595745, "eval_loss": 0.8695897459983826, "eval_runtime": 13.2572, "eval_samples_per_second": 22.629, "eval_steps_per_second": 0.754, "step": 246 }, { "epoch": 2.648936170212766, "grad_norm": 25.59177017211914, "learning_rate": 5.544617479166536e-07, "loss": 0.0655, "step": 249 }, { "epoch": 2.648936170212766, "eval_loss": 0.8699460029602051, "eval_runtime": 33.2452, "eval_samples_per_second": 9.024, "eval_steps_per_second": 0.301, "step": 249 }, { "epoch": 2.6808510638297873, "grad_norm": 10.76020622253418, "learning_rate": 5.040561344696851e-07, "loss": 0.0638, "step": 252 }, { "epoch": 2.6808510638297873, "eval_loss": 0.8698618412017822, "eval_runtime": 13.4624, "eval_samples_per_second": 22.284, "eval_steps_per_second": 0.743, "step": 252 }, { "epoch": 2.7127659574468086, "grad_norm": 14.23372745513916, "learning_rate": 4.536505210227166e-07, "loss": 0.04, "step": 255 }, { "epoch": 2.7127659574468086, "eval_loss": 0.8700897097587585, "eval_runtime": 33.0408, "eval_samples_per_second": 9.08, "eval_steps_per_second": 0.303, "step": 255 }, { "epoch": 2.74468085106383, "grad_norm": 54.65508270263672, "learning_rate": 4.0324490757574806e-07, "loss": 0.158, "step": 258 }, { "epoch": 2.74468085106383, "eval_loss": 0.8706519603729248, "eval_runtime": 13.0406, "eval_samples_per_second": 23.005, "eval_steps_per_second": 0.767, "step": 258 }, { "epoch": 2.776595744680851, "grad_norm": 36.574066162109375, "learning_rate": 3.5283929412877956e-07, "loss": 0.0951, "step": 261 }, { "epoch": 2.776595744680851, "eval_loss": 0.871029794216156, "eval_runtime": 32.8987, "eval_samples_per_second": 9.119, "eval_steps_per_second": 0.304, "step": 261 }, { "epoch": 2.8085106382978724, "grad_norm": 32.9112434387207, "learning_rate": 3.02433680681811e-07, "loss": 0.0788, "step": 264 }, { "epoch": 2.8085106382978724, "eval_loss": 0.871411144733429, "eval_runtime": 12.964, "eval_samples_per_second": 23.141, "eval_steps_per_second": 0.771, "step": 264 }, { "epoch": 2.8404255319148937, "grad_norm": 26.687891006469727, "learning_rate": 2.520280672348426e-07, "loss": 0.0429, "step": 267 }, { "epoch": 2.8404255319148937, "eval_loss": 0.8717812299728394, "eval_runtime": 32.9027, "eval_samples_per_second": 9.118, "eval_steps_per_second": 0.304, "step": 267 }, { "epoch": 2.872340425531915, "grad_norm": 25.479427337646484, "learning_rate": 2.0162245378787403e-07, "loss": 0.0809, "step": 270 }, { "epoch": 2.872340425531915, "eval_loss": 0.8716564178466797, "eval_runtime": 12.9971, "eval_samples_per_second": 23.082, "eval_steps_per_second": 0.769, "step": 270 }, { "epoch": 2.904255319148936, "grad_norm": 14.731446266174316, "learning_rate": 1.512168403409055e-07, "loss": 0.0942, "step": 273 }, { "epoch": 2.904255319148936, "eval_loss": 0.8718379735946655, "eval_runtime": 33.0136, "eval_samples_per_second": 9.087, "eval_steps_per_second": 0.303, "step": 273 }, { "epoch": 2.9361702127659575, "grad_norm": 23.23249053955078, "learning_rate": 1.0081122689393701e-07, "loss": 0.1419, "step": 276 }, { "epoch": 2.9361702127659575, "eval_loss": 0.8720559477806091, "eval_runtime": 13.6301, "eval_samples_per_second": 22.01, "eval_steps_per_second": 0.734, "step": 276 }, { "epoch": 2.9680851063829787, "grad_norm": 42.01313400268555, "learning_rate": 5.040561344696851e-08, "loss": 0.1036, "step": 279 }, { "epoch": 2.9680851063829787, "eval_loss": 0.8721166253089905, "eval_runtime": 33.0448, "eval_samples_per_second": 9.079, "eval_steps_per_second": 0.303, "step": 279 }, { "epoch": 3.0, "grad_norm": 37.830604553222656, "learning_rate": 0.0, "loss": 0.0854, "step": 282 }, { "epoch": 3.0, "eval_loss": 0.8720741868019104, "eval_runtime": 12.8271, "eval_samples_per_second": 23.388, "eval_steps_per_second": 0.78, "step": 282 } ], "logging_steps": 3, "max_steps": 282, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 523328480700102.0, "train_batch_size": 32, "trial_name": null, "trial_params": { "_wandb": {}, "assignments": {}, "decay": 0.01, "learning_rate": 4.73812766401504e-06, "metric": "eval/loss", "per_device_train_batch_size": 32 } }