{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0035087719298246, "eval_steps": 36, "global_step": 428, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.007017543859649123, "eval_loss": 0.6692813038825989, "eval_runtime": 46.9815, "eval_samples_per_second": 5.108, "eval_steps_per_second": 0.639, "step": 1 }, { "epoch": 0.021052631578947368, "grad_norm": 9.726601600646973, "learning_rate": 1.5e-05, "loss": 2.4103, "step": 3 }, { "epoch": 0.042105263157894736, "grad_norm": 6.305670738220215, "learning_rate": 3e-05, "loss": 2.7525, "step": 6 }, { "epoch": 0.06315789473684211, "grad_norm": 4.551560401916504, "learning_rate": 4.5e-05, "loss": 2.5999, "step": 9 }, { "epoch": 0.08421052631578947, "grad_norm": 4.885453701019287, "learning_rate": 4.999717571181742e-05, "loss": 2.2257, "step": 12 }, { "epoch": 0.10526315789473684, "grad_norm": 4.619503498077393, "learning_rate": 4.998234994371135e-05, "loss": 2.0927, "step": 15 }, { "epoch": 0.12631578947368421, "grad_norm": 4.877598285675049, "learning_rate": 4.995482415049123e-05, "loss": 2.3476, "step": 18 }, { "epoch": 0.14736842105263157, "grad_norm": 6.852722644805908, "learning_rate": 4.991461232516675e-05, "loss": 2.028, "step": 21 }, { "epoch": 0.16842105263157894, "grad_norm": 6.002420902252197, "learning_rate": 4.986173490981773e-05, "loss": 1.6801, "step": 24 }, { "epoch": 0.18947368421052632, "grad_norm": 5.471989631652832, "learning_rate": 4.979621878520216e-05, "loss": 1.4341, "step": 27 }, { "epoch": 0.21052631578947367, "grad_norm": 5.2608489990234375, "learning_rate": 4.971809725709112e-05, "loss": 1.8805, "step": 30 }, { "epoch": 0.23157894736842105, "grad_norm": 3.3612749576568604, "learning_rate": 4.962741003933742e-05, "loss": 1.6929, "step": 33 }, { "epoch": 0.25263157894736843, "grad_norm": 4.646411895751953, "learning_rate": 4.952420323368673e-05, "loss": 1.5576, "step": 36 }, { "epoch": 0.25263157894736843, "eval_loss": 0.37584158778190613, "eval_runtime": 47.717, "eval_samples_per_second": 5.03, "eval_steps_per_second": 0.629, "step": 36 }, { "epoch": 0.2736842105263158, "grad_norm": 4.316969394683838, "learning_rate": 4.9408529306341255e-05, "loss": 1.8731, "step": 39 }, { "epoch": 0.29473684210526313, "grad_norm": 4.682113170623779, "learning_rate": 4.928044706128803e-05, "loss": 1.8301, "step": 42 }, { "epoch": 0.3157894736842105, "grad_norm": 3.629147529602051, "learning_rate": 4.9140021610405326e-05, "loss": 1.2944, "step": 45 }, { "epoch": 0.3368421052631579, "grad_norm": 7.077390193939209, "learning_rate": 4.898732434036244e-05, "loss": 2.0447, "step": 48 }, { "epoch": 0.35789473684210527, "grad_norm": 4.48635196685791, "learning_rate": 4.882243287632947e-05, "loss": 1.4274, "step": 51 }, { "epoch": 0.37894736842105264, "grad_norm": 4.4893388748168945, "learning_rate": 4.864543104251587e-05, "loss": 1.7248, "step": 54 }, { "epoch": 0.4, "grad_norm": 5.431076526641846, "learning_rate": 4.8456408819557564e-05, "loss": 1.6822, "step": 57 }, { "epoch": 0.42105263157894735, "grad_norm": 3.690011501312256, "learning_rate": 4.825546229877439e-05, "loss": 1.7077, "step": 60 }, { "epoch": 0.4421052631578947, "grad_norm": 5.592578411102295, "learning_rate": 4.804269363332112e-05, "loss": 1.836, "step": 63 }, { "epoch": 0.4631578947368421, "grad_norm": 5.439390182495117, "learning_rate": 4.78182109862569e-05, "loss": 1.2283, "step": 66 }, { "epoch": 0.4842105263157895, "grad_norm": 4.344250202178955, "learning_rate": 4.758212847555953e-05, "loss": 1.6078, "step": 69 }, { "epoch": 0.5052631578947369, "grad_norm": 5.0562825202941895, "learning_rate": 4.733456611611233e-05, "loss": 1.858, "step": 72 }, { "epoch": 0.5052631578947369, "eval_loss": 0.3427739441394806, "eval_runtime": 47.7477, "eval_samples_per_second": 5.026, "eval_steps_per_second": 0.628, "step": 72 }, { "epoch": 0.5263157894736842, "grad_norm": 4.3865838050842285, "learning_rate": 4.7075649758693565e-05, "loss": 1.2519, "step": 75 }, { "epoch": 0.5473684210526316, "grad_norm": 2.8480889797210693, "learning_rate": 4.68055110259988e-05, "loss": 1.6193, "step": 78 }, { "epoch": 0.5684210526315789, "grad_norm": 3.4718546867370605, "learning_rate": 4.6524287245729295e-05, "loss": 1.4091, "step": 81 }, { "epoch": 0.5894736842105263, "grad_norm": 4.449820041656494, "learning_rate": 4.6232121380780034e-05, "loss": 1.484, "step": 84 }, { "epoch": 0.6105263157894737, "grad_norm": 3.7628002166748047, "learning_rate": 4.592916195656322e-05, "loss": 1.3605, "step": 87 }, { "epoch": 0.631578947368421, "grad_norm": 3.6203603744506836, "learning_rate": 4.561556298550379e-05, "loss": 1.4026, "step": 90 }, { "epoch": 0.6526315789473685, "grad_norm": 3.6984612941741943, "learning_rate": 4.529148388874577e-05, "loss": 1.1724, "step": 93 }, { "epoch": 0.6736842105263158, "grad_norm": 3.412766933441162, "learning_rate": 4.49570894151089e-05, "loss": 1.5515, "step": 96 }, { "epoch": 0.6947368421052632, "grad_norm": 2.684919595718384, "learning_rate": 4.4612549557336974e-05, "loss": 1.2596, "step": 99 }, { "epoch": 0.7157894736842105, "grad_norm": 4.008241176605225, "learning_rate": 4.4258039465680326e-05, "loss": 1.1391, "step": 102 }, { "epoch": 0.7368421052631579, "grad_norm": 4.187386989593506, "learning_rate": 4.389373935885646e-05, "loss": 1.1588, "step": 105 }, { "epoch": 0.7578947368421053, "grad_norm": 4.869933605194092, "learning_rate": 4.351983443243409e-05, "loss": 1.5655, "step": 108 }, { "epoch": 0.7578947368421053, "eval_loss": 0.322973370552063, "eval_runtime": 47.6997, "eval_samples_per_second": 5.031, "eval_steps_per_second": 0.629, "step": 108 }, { "epoch": 0.7789473684210526, "grad_norm": 3.7822816371917725, "learning_rate": 4.313651476468715e-05, "loss": 1.5809, "step": 111 }, { "epoch": 0.8, "grad_norm": 2.936788320541382, "learning_rate": 4.274397521996658e-05, "loss": 1.0463, "step": 114 }, { "epoch": 0.8210526315789474, "grad_norm": 4.922979831695557, "learning_rate": 4.234241534963916e-05, "loss": 1.2287, "step": 117 }, { "epoch": 0.8421052631578947, "grad_norm": 5.986371040344238, "learning_rate": 4.193203929064353e-05, "loss": 1.3477, "step": 120 }, { "epoch": 0.8631578947368421, "grad_norm": 3.4700145721435547, "learning_rate": 4.1513055661715214e-05, "loss": 0.9548, "step": 123 }, { "epoch": 0.8842105263157894, "grad_norm": 4.394268035888672, "learning_rate": 4.108567745733318e-05, "loss": 1.2286, "step": 126 }, { "epoch": 0.9052631578947369, "grad_norm": 4.035145282745361, "learning_rate": 4.065012193944201e-05, "loss": 1.1731, "step": 129 }, { "epoch": 0.9263157894736842, "grad_norm": 3.933317184448242, "learning_rate": 4.020661052700461e-05, "loss": 1.6722, "step": 132 }, { "epoch": 0.9473684210526315, "grad_norm": 3.2603344917297363, "learning_rate": 3.9755368683441735e-05, "loss": 1.3816, "step": 135 }, { "epoch": 0.968421052631579, "grad_norm": 6.198463439941406, "learning_rate": 3.9296625802015356e-05, "loss": 1.2843, "step": 138 }, { "epoch": 0.9894736842105263, "grad_norm": 4.392797470092773, "learning_rate": 3.883061508921439e-05, "loss": 1.5944, "step": 141 }, { "epoch": 1.0105263157894737, "grad_norm": 3.6408369541168213, "learning_rate": 3.8357573446201825e-05, "loss": 1.1528, "step": 144 }, { "epoch": 1.0105263157894737, "eval_loss": 0.31307944655418396, "eval_runtime": 47.7481, "eval_samples_per_second": 5.026, "eval_steps_per_second": 0.628, "step": 144 }, { "epoch": 1.0315789473684212, "grad_norm": 3.727839946746826, "learning_rate": 3.78777413483837e-05, "loss": 1.3407, "step": 147 }, { "epoch": 1.0526315789473684, "grad_norm": 4.318253517150879, "learning_rate": 3.739136272316102e-05, "loss": 1.274, "step": 150 }, { "epoch": 1.0736842105263158, "grad_norm": 3.0407471656799316, "learning_rate": 3.689868482592684e-05, "loss": 1.0978, "step": 153 }, { "epoch": 1.0947368421052632, "grad_norm": 3.2110660076141357, "learning_rate": 3.6399958114371595e-05, "loss": 0.9378, "step": 156 }, { "epoch": 1.1157894736842104, "grad_norm": 4.471799373626709, "learning_rate": 3.5895436121160386e-05, "loss": 1.334, "step": 159 }, { "epoch": 1.1368421052631579, "grad_norm": 2.7536613941192627, "learning_rate": 3.5385375325047166e-05, "loss": 1.5206, "step": 162 }, { "epoch": 1.1578947368421053, "grad_norm": 3.1631388664245605, "learning_rate": 3.487003502049122e-05, "loss": 0.9874, "step": 165 }, { "epoch": 1.1789473684210527, "grad_norm": 3.0744566917419434, "learning_rate": 3.4349677185842245e-05, "loss": 1.2542, "step": 168 }, { "epoch": 1.2, "grad_norm": 3.199769973754883, "learning_rate": 3.38245663501611e-05, "loss": 1.0781, "step": 171 }, { "epoch": 1.2210526315789474, "grad_norm": 3.3641140460968018, "learning_rate": 3.32949694587438e-05, "loss": 1.0915, "step": 174 }, { "epoch": 1.2421052631578948, "grad_norm": 2.533961057662964, "learning_rate": 3.276115573741724e-05, "loss": 1.2862, "step": 177 }, { "epoch": 1.263157894736842, "grad_norm": 4.081838130950928, "learning_rate": 3.222339655567556e-05, "loss": 1.2205, "step": 180 }, { "epoch": 1.263157894736842, "eval_loss": 0.3107610046863556, "eval_runtime": 47.7384, "eval_samples_per_second": 5.027, "eval_steps_per_second": 0.628, "step": 180 }, { "epoch": 1.2842105263157895, "grad_norm": 2.3932526111602783, "learning_rate": 3.168196528872682e-05, "loss": 1.0431, "step": 183 }, { "epoch": 1.305263157894737, "grad_norm": 2.7691686153411865, "learning_rate": 3.1137137178519985e-05, "loss": 1.314, "step": 186 }, { "epoch": 1.3263157894736843, "grad_norm": 3.8344638347625732, "learning_rate": 3.0589189193822895e-05, "loss": 0.8119, "step": 189 }, { "epoch": 1.3473684210526315, "grad_norm": 4.127139568328857, "learning_rate": 3.0038399889422553e-05, "loss": 1.1671, "step": 192 }, { "epoch": 1.368421052631579, "grad_norm": 3.597393035888672, "learning_rate": 2.948504926451896e-05, "loss": 1.4459, "step": 195 }, { "epoch": 1.3894736842105262, "grad_norm": 3.0417675971984863, "learning_rate": 2.8929418620384753e-05, "loss": 1.0606, "step": 198 }, { "epoch": 1.4105263157894736, "grad_norm": 4.269920825958252, "learning_rate": 2.8371790417362987e-05, "loss": 0.8091, "step": 201 }, { "epoch": 1.431578947368421, "grad_norm": 4.4791789054870605, "learning_rate": 2.781244813127552e-05, "loss": 1.4956, "step": 204 }, { "epoch": 1.4526315789473685, "grad_norm": 4.570736885070801, "learning_rate": 2.7251676109315338e-05, "loss": 0.791, "step": 207 }, { "epoch": 1.4736842105263157, "grad_norm": 4.790010929107666, "learning_rate": 2.668975942549583e-05, "loss": 1.2485, "step": 210 }, { "epoch": 1.4947368421052631, "grad_norm": 3.679155111312866, "learning_rate": 2.612698373573056e-05, "loss": 0.9622, "step": 213 }, { "epoch": 1.5157894736842106, "grad_norm": 3.991124153137207, "learning_rate": 2.5563635132617302e-05, "loss": 0.7821, "step": 216 }, { "epoch": 1.5157894736842106, "eval_loss": 0.30537185072898865, "eval_runtime": 47.7614, "eval_samples_per_second": 5.025, "eval_steps_per_second": 0.628, "step": 216 }, { "epoch": 1.5368421052631578, "grad_norm": 5.918197154998779, "learning_rate": 2.5e-05, "loss": 0.7552, "step": 219 }, { "epoch": 1.5578947368421052, "grad_norm": 6.4377241134643555, "learning_rate": 2.44363648673827e-05, "loss": 1.13, "step": 222 }, { "epoch": 1.5789473684210527, "grad_norm": 3.93595814704895, "learning_rate": 2.387301626426944e-05, "loss": 0.9218, "step": 225 }, { "epoch": 1.6, "grad_norm": 5.706233978271484, "learning_rate": 2.3310240574504185e-05, "loss": 1.1022, "step": 228 }, { "epoch": 1.6210526315789475, "grad_norm": 2.740601062774658, "learning_rate": 2.2748323890684665e-05, "loss": 1.2584, "step": 231 }, { "epoch": 1.6421052631578947, "grad_norm": 4.44104528427124, "learning_rate": 2.2187551868724485e-05, "loss": 1.0941, "step": 234 }, { "epoch": 1.663157894736842, "grad_norm": 4.569465160369873, "learning_rate": 2.1628209582637022e-05, "loss": 1.1554, "step": 237 }, { "epoch": 1.6842105263157894, "grad_norm": 4.33217191696167, "learning_rate": 2.1070581379615253e-05, "loss": 0.5728, "step": 240 }, { "epoch": 1.7052631578947368, "grad_norm": 4.296968936920166, "learning_rate": 2.0514950735481052e-05, "loss": 1.0808, "step": 243 }, { "epoch": 1.7263157894736842, "grad_norm": 3.474714994430542, "learning_rate": 1.9961600110577456e-05, "loss": 1.2945, "step": 246 }, { "epoch": 1.7473684210526317, "grad_norm": 3.817056655883789, "learning_rate": 1.9410810806177104e-05, "loss": 1.4233, "step": 249 }, { "epoch": 1.768421052631579, "grad_norm": 3.0018868446350098, "learning_rate": 1.8862862821480025e-05, "loss": 1.0385, "step": 252 }, { "epoch": 1.768421052631579, "eval_loss": 0.3030892610549927, "eval_runtime": 47.7428, "eval_samples_per_second": 5.027, "eval_steps_per_second": 0.628, "step": 252 }, { "epoch": 1.7894736842105263, "grad_norm": 3.522315502166748, "learning_rate": 1.831803471127318e-05, "loss": 1.1658, "step": 255 }, { "epoch": 1.8105263157894735, "grad_norm": 3.5018210411071777, "learning_rate": 1.7776603444324445e-05, "loss": 1.0903, "step": 258 }, { "epoch": 1.831578947368421, "grad_norm": 4.468841552734375, "learning_rate": 1.723884426258277e-05, "loss": 1.1171, "step": 261 }, { "epoch": 1.8526315789473684, "grad_norm": 3.999666452407837, "learning_rate": 1.670503054125621e-05, "loss": 1.2162, "step": 264 }, { "epoch": 1.8736842105263158, "grad_norm": 3.463674783706665, "learning_rate": 1.61754336498389e-05, "loss": 0.8498, "step": 267 }, { "epoch": 1.8947368421052633, "grad_norm": 3.4514553546905518, "learning_rate": 1.5650322814157764e-05, "loss": 1.2623, "step": 270 }, { "epoch": 1.9157894736842105, "grad_norm": 3.6156108379364014, "learning_rate": 1.5129964979508792e-05, "loss": 0.8503, "step": 273 }, { "epoch": 1.936842105263158, "grad_norm": 3.3259615898132324, "learning_rate": 1.4614624674952842e-05, "loss": 0.9937, "step": 276 }, { "epoch": 1.9578947368421051, "grad_norm": 5.9854230880737305, "learning_rate": 1.4104563878839621e-05, "loss": 0.9689, "step": 279 }, { "epoch": 1.9789473684210526, "grad_norm": 2.22936749458313, "learning_rate": 1.3600041885628409e-05, "loss": 0.9137, "step": 282 }, { "epoch": 2.0, "grad_norm": 3.004664897918701, "learning_rate": 1.3101315174073162e-05, "loss": 0.6448, "step": 285 }, { "epoch": 2.0210526315789474, "grad_norm": 6.208250522613525, "learning_rate": 1.2608637276838986e-05, "loss": 1.319, "step": 288 }, { "epoch": 2.0210526315789474, "eval_loss": 0.30173683166503906, "eval_runtime": 47.7589, "eval_samples_per_second": 5.025, "eval_steps_per_second": 0.628, "step": 288 }, { "epoch": 2.042105263157895, "grad_norm": 2.969910144805908, "learning_rate": 1.2122258651616306e-05, "loss": 0.8383, "step": 291 }, { "epoch": 2.0631578947368423, "grad_norm": 2.5157318115234375, "learning_rate": 1.1642426553798174e-05, "loss": 0.7519, "step": 294 }, { "epoch": 2.0842105263157893, "grad_norm": 3.564941167831421, "learning_rate": 1.1169384910785614e-05, "loss": 0.5701, "step": 297 }, { "epoch": 2.1052631578947367, "grad_norm": 3.544473886489868, "learning_rate": 1.0703374197984653e-05, "loss": 0.7366, "step": 300 }, { "epoch": 2.126315789473684, "grad_norm": 2.952383041381836, "learning_rate": 1.0244631316558267e-05, "loss": 0.6928, "step": 303 }, { "epoch": 2.1473684210526316, "grad_norm": 3.4289209842681885, "learning_rate": 9.793389472995393e-06, "loss": 0.7361, "step": 306 }, { "epoch": 2.168421052631579, "grad_norm": 3.7741119861602783, "learning_rate": 9.349878060557999e-06, "loss": 0.7777, "step": 309 }, { "epoch": 2.1894736842105265, "grad_norm": 4.074053764343262, "learning_rate": 8.914322542666822e-06, "loss": 0.9209, "step": 312 }, { "epoch": 2.2105263157894735, "grad_norm": 4.839679718017578, "learning_rate": 8.486944338284797e-06, "loss": 0.937, "step": 315 }, { "epoch": 2.231578947368421, "grad_norm": 3.5984749794006348, "learning_rate": 8.067960709356478e-06, "loss": 1.0567, "step": 318 }, { "epoch": 2.2526315789473683, "grad_norm": 4.226260185241699, "learning_rate": 7.657584650360847e-06, "loss": 0.8969, "step": 321 }, { "epoch": 2.2736842105263158, "grad_norm": 2.624924421310425, "learning_rate": 7.256024780033418e-06, "loss": 0.7665, "step": 324 }, { "epoch": 2.2736842105263158, "eval_loss": 0.3099728524684906, "eval_runtime": 47.753, "eval_samples_per_second": 5.026, "eval_steps_per_second": 0.628, "step": 324 }, { "epoch": 2.294736842105263, "grad_norm": 3.7707293033599854, "learning_rate": 6.863485235312853e-06, "loss": 0.7157, "step": 327 }, { "epoch": 2.3157894736842106, "grad_norm": 3.5063211917877197, "learning_rate": 6.480165567565913e-06, "loss": 0.7941, "step": 330 }, { "epoch": 2.336842105263158, "grad_norm": 5.289640426635742, "learning_rate": 6.106260641143546e-06, "loss": 1.022, "step": 333 }, { "epoch": 2.3578947368421055, "grad_norm": 3.4733479022979736, "learning_rate": 5.741960534319677e-06, "loss": 0.8732, "step": 336 }, { "epoch": 2.3789473684210525, "grad_norm": 2.74438214302063, "learning_rate": 5.387450442663025e-06, "loss": 0.488, "step": 339 }, { "epoch": 2.4, "grad_norm": 3.423187732696533, "learning_rate": 5.0429105848911e-06, "loss": 1.0244, "step": 342 }, { "epoch": 2.4210526315789473, "grad_norm": 3.875284194946289, "learning_rate": 4.708516111254238e-06, "loss": 0.9071, "step": 345 }, { "epoch": 2.442105263157895, "grad_norm": 4.707957744598389, "learning_rate": 4.384437014496215e-06, "loss": 0.8664, "step": 348 }, { "epoch": 2.463157894736842, "grad_norm": 4.914385795593262, "learning_rate": 4.070838043436786e-06, "loss": 0.6006, "step": 351 }, { "epoch": 2.4842105263157896, "grad_norm": 3.2543418407440186, "learning_rate": 3.7678786192199694e-06, "loss": 0.5789, "step": 354 }, { "epoch": 2.5052631578947366, "grad_norm": 2.9000864028930664, "learning_rate": 3.475712754270716e-06, "loss": 0.5431, "step": 357 }, { "epoch": 2.526315789473684, "grad_norm": 4.2075886726379395, "learning_rate": 3.194488974001203e-06, "loss": 0.6753, "step": 360 }, { "epoch": 2.526315789473684, "eval_loss": 0.3118632733821869, "eval_runtime": 47.721, "eval_samples_per_second": 5.029, "eval_steps_per_second": 0.629, "step": 360 }, { "epoch": 2.5473684210526315, "grad_norm": 5.408112049102783, "learning_rate": 2.9243502413064368e-06, "loss": 0.6439, "step": 363 }, { "epoch": 2.568421052631579, "grad_norm": 3.7381534576416016, "learning_rate": 2.6654338838876665e-06, "loss": 0.9288, "step": 366 }, { "epoch": 2.5894736842105264, "grad_norm": 4.740654468536377, "learning_rate": 2.4178715244404794e-06, "loss": 0.9505, "step": 369 }, { "epoch": 2.610526315789474, "grad_norm": 4.9893364906311035, "learning_rate": 2.1817890137430934e-06, "loss": 1.046, "step": 372 }, { "epoch": 2.6315789473684212, "grad_norm": 4.344699382781982, "learning_rate": 1.9573063666788875e-06, "loss": 0.8301, "step": 375 }, { "epoch": 2.6526315789473687, "grad_norm": 2.871662139892578, "learning_rate": 1.7445377012256126e-06, "loss": 0.6642, "step": 378 }, { "epoch": 2.6736842105263157, "grad_norm": 3.569286346435547, "learning_rate": 1.5435911804424357e-06, "loss": 0.8558, "step": 381 }, { "epoch": 2.694736842105263, "grad_norm": 4.009424209594727, "learning_rate": 1.3545689574841342e-06, "loss": 0.8686, "step": 384 }, { "epoch": 2.7157894736842105, "grad_norm": 3.5932652950286865, "learning_rate": 1.1775671236705365e-06, "loss": 1.0848, "step": 387 }, { "epoch": 2.736842105263158, "grad_norm": 4.354364395141602, "learning_rate": 1.0126756596375686e-06, "loss": 1.1122, "step": 390 }, { "epoch": 2.7578947368421054, "grad_norm": 3.184096336364746, "learning_rate": 8.599783895946761e-07, "loss": 0.8129, "step": 393 }, { "epoch": 2.7789473684210524, "grad_norm": 4.265777587890625, "learning_rate": 7.195529387119815e-07, "loss": 0.7224, "step": 396 }, { "epoch": 2.7789473684210524, "eval_loss": 0.3112446963787079, "eval_runtime": 47.7893, "eval_samples_per_second": 5.022, "eval_steps_per_second": 0.628, "step": 396 }, { "epoch": 2.8, "grad_norm": 3.4699087142944336, "learning_rate": 5.914706936587494e-07, "loss": 0.614, "step": 399 }, { "epoch": 2.8210526315789473, "grad_norm": 2.6950035095214844, "learning_rate": 4.75796766313269e-07, "loss": 0.9641, "step": 402 }, { "epoch": 2.8421052631578947, "grad_norm": 4.25594425201416, "learning_rate": 3.7258996066258103e-07, "loss": 0.736, "step": 405 }, { "epoch": 2.863157894736842, "grad_norm": 3.8812239170074463, "learning_rate": 2.819027429088822e-07, "loss": 0.7287, "step": 408 }, { "epoch": 2.8842105263157896, "grad_norm": 4.651484966278076, "learning_rate": 2.0378121479783796e-07, "loss": 0.8938, "step": 411 }, { "epoch": 2.905263157894737, "grad_norm": 4.784148216247559, "learning_rate": 1.3826509018227128e-07, "loss": 0.9602, "step": 414 }, { "epoch": 2.9263157894736844, "grad_norm": 4.499444007873535, "learning_rate": 8.538767483325383e-08, "loss": 0.985, "step": 417 }, { "epoch": 2.9473684210526314, "grad_norm": 5.214015483856201, "learning_rate": 4.517584950877452e-08, "loss": 0.9054, "step": 420 }, { "epoch": 2.968421052631579, "grad_norm": 3.8694188594818115, "learning_rate": 1.7650056288651127e-08, "loss": 0.651, "step": 423 }, { "epoch": 2.9894736842105263, "grad_norm": 3.8104214668273926, "learning_rate": 2.8242881825846223e-09, "loss": 0.8252, "step": 426 } ], "logging_steps": 3, "max_steps": 428, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 36, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.7558214228836352e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null }