{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.5001883239171375, "eval_steps": 83, "global_step": 166, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0030131826741996233, "grad_norm": 7.092586040496826, "learning_rate": 2.0000000000000003e-06, "loss": 2.9681, "step": 1 }, { "epoch": 0.0030131826741996233, "eval_loss": NaN, "eval_runtime": 92.9755, "eval_samples_per_second": 6.012, "eval_steps_per_second": 1.506, "step": 1 }, { "epoch": 0.006026365348399247, "grad_norm": 6.170314788818359, "learning_rate": 4.000000000000001e-06, "loss": 3.029, "step": 2 }, { "epoch": 0.00903954802259887, "grad_norm": 7.000847339630127, "learning_rate": 6e-06, "loss": 3.5581, "step": 3 }, { "epoch": 0.012052730696798493, "grad_norm": 7.161468029022217, "learning_rate": 8.000000000000001e-06, "loss": 3.5814, "step": 4 }, { "epoch": 0.015065913370998116, "grad_norm": 7.698644638061523, "learning_rate": 1e-05, "loss": 3.8585, "step": 5 }, { "epoch": 0.01807909604519774, "grad_norm": 8.074420928955078, "learning_rate": 1.2e-05, "loss": 4.001, "step": 6 }, { "epoch": 0.021092278719397364, "grad_norm": 7.968758583068848, "learning_rate": 1.4000000000000001e-05, "loss": 4.1068, "step": 7 }, { "epoch": 0.024105461393596987, "grad_norm": 8.725764274597168, "learning_rate": 1.6000000000000003e-05, "loss": 3.6916, "step": 8 }, { "epoch": 0.02711864406779661, "grad_norm": 9.704715728759766, "learning_rate": 1.8e-05, "loss": 4.083, "step": 9 }, { "epoch": 0.030131826741996232, "grad_norm": 8.977635383605957, "learning_rate": 2e-05, "loss": 3.548, "step": 10 }, { "epoch": 0.03314500941619586, "grad_norm": 7.7479376792907715, "learning_rate": 2.2000000000000003e-05, "loss": 3.2281, "step": 11 }, { "epoch": 0.03615819209039548, "grad_norm": 6.718513011932373, "learning_rate": 2.4e-05, "loss": 3.0443, "step": 12 }, { "epoch": 0.039171374764595104, "grad_norm": 5.1653056144714355, "learning_rate": 
2.6000000000000002e-05, "loss": 2.6208, "step": 13 }, { "epoch": 0.04218455743879473, "grad_norm": 4.840386867523193, "learning_rate": 2.8000000000000003e-05, "loss": 2.3382, "step": 14 }, { "epoch": 0.04519774011299435, "grad_norm": 4.570226669311523, "learning_rate": 3e-05, "loss": 2.0896, "step": 15 }, { "epoch": 0.04821092278719397, "grad_norm": 5.179202079772949, "learning_rate": 3.2000000000000005e-05, "loss": 2.0368, "step": 16 }, { "epoch": 0.051224105461393596, "grad_norm": 4.725672245025635, "learning_rate": 3.4000000000000007e-05, "loss": 1.8898, "step": 17 }, { "epoch": 0.05423728813559322, "grad_norm": 4.372887134552002, "learning_rate": 3.6e-05, "loss": 1.6599, "step": 18 }, { "epoch": 0.05725047080979284, "grad_norm": 4.00122594833374, "learning_rate": 3.8e-05, "loss": 1.4213, "step": 19 }, { "epoch": 0.060263653483992465, "grad_norm": 4.925197124481201, "learning_rate": 4e-05, "loss": 1.5454, "step": 20 }, { "epoch": 0.06327683615819209, "grad_norm": 9.11976146697998, "learning_rate": 4.2e-05, "loss": 2.1408, "step": 21 }, { "epoch": 0.06629001883239172, "grad_norm": 15.82564640045166, "learning_rate": 4.4000000000000006e-05, "loss": 2.5483, "step": 22 }, { "epoch": 0.06930320150659133, "grad_norm": 15.502534866333008, "learning_rate": 4.600000000000001e-05, "loss": 1.8806, "step": 23 }, { "epoch": 0.07231638418079096, "grad_norm": 18.15461540222168, "learning_rate": 4.8e-05, "loss": 1.6405, "step": 24 }, { "epoch": 0.07532956685499058, "grad_norm": 23.61888313293457, "learning_rate": 5e-05, "loss": 1.2773, "step": 25 }, { "epoch": 0.07834274952919021, "grad_norm": 11.435036659240723, "learning_rate": 5.2000000000000004e-05, "loss": 1.3372, "step": 26 }, { "epoch": 0.08135593220338982, "grad_norm": 13.24569320678711, "learning_rate": 5.4000000000000005e-05, "loss": 1.3893, "step": 27 }, { "epoch": 0.08436911487758945, "grad_norm": 9.795696258544922, "learning_rate": 5.6000000000000006e-05, "loss": 1.3095, "step": 28 }, { "epoch": 
0.08738229755178907, "grad_norm": 8.10338020324707, "learning_rate": 5.8e-05, "loss": 1.1856, "step": 29 }, { "epoch": 0.0903954802259887, "grad_norm": 5.573620796203613, "learning_rate": 6e-05, "loss": 1.0843, "step": 30 }, { "epoch": 0.09340866290018833, "grad_norm": 2.8769285678863525, "learning_rate": 6.2e-05, "loss": 0.9771, "step": 31 }, { "epoch": 0.09642184557438795, "grad_norm": 3.623781681060791, "learning_rate": 6.400000000000001e-05, "loss": 0.9225, "step": 32 }, { "epoch": 0.09943502824858758, "grad_norm": 3.1273627281188965, "learning_rate": 6.6e-05, "loss": 0.8808, "step": 33 }, { "epoch": 0.10244821092278719, "grad_norm": 2.4352469444274902, "learning_rate": 6.800000000000001e-05, "loss": 0.8043, "step": 34 }, { "epoch": 0.10546139359698682, "grad_norm": 2.5423271656036377, "learning_rate": 7e-05, "loss": 0.7235, "step": 35 }, { "epoch": 0.10847457627118644, "grad_norm": 2.7016396522521973, "learning_rate": 7.2e-05, "loss": 0.7387, "step": 36 }, { "epoch": 0.11148775894538607, "grad_norm": 3.2129478454589844, "learning_rate": 7.4e-05, "loss": 0.7625, "step": 37 }, { "epoch": 0.11450094161958568, "grad_norm": 2.897091865539551, "learning_rate": 7.6e-05, "loss": 0.7295, "step": 38 }, { "epoch": 0.11751412429378531, "grad_norm": 2.90976619720459, "learning_rate": 7.800000000000001e-05, "loss": 0.6797, "step": 39 }, { "epoch": 0.12052730696798493, "grad_norm": 2.8482582569122314, "learning_rate": 8e-05, "loss": 0.7018, "step": 40 }, { "epoch": 0.12354048964218456, "grad_norm": 2.0846457481384277, "learning_rate": 8.2e-05, "loss": 0.7266, "step": 41 }, { "epoch": 0.12655367231638417, "grad_norm": 2.0968143939971924, "learning_rate": 8.4e-05, "loss": 0.5808, "step": 42 }, { "epoch": 0.1295668549905838, "grad_norm": 2.948556423187256, "learning_rate": 8.6e-05, "loss": 0.7257, "step": 43 }, { "epoch": 0.13258003766478343, "grad_norm": 2.561649799346924, "learning_rate": 8.800000000000001e-05, "loss": 0.6828, "step": 44 }, { "epoch": 0.13559322033898305, 
"grad_norm": 2.9468488693237305, "learning_rate": 9e-05, "loss": 0.7191, "step": 45 }, { "epoch": 0.13860640301318267, "grad_norm": 4.2950825691223145, "learning_rate": 9.200000000000001e-05, "loss": 0.7896, "step": 46 }, { "epoch": 0.1416195856873823, "grad_norm": 8.009641647338867, "learning_rate": 9.4e-05, "loss": 1.2446, "step": 47 }, { "epoch": 0.14463276836158193, "grad_norm": 6.519936561584473, "learning_rate": 9.6e-05, "loss": 0.8017, "step": 48 }, { "epoch": 0.14764595103578154, "grad_norm": 5.3175530433654785, "learning_rate": 9.8e-05, "loss": 0.6389, "step": 49 }, { "epoch": 0.15065913370998116, "grad_norm": 7.807773113250732, "learning_rate": 0.0001, "loss": 0.5498, "step": 50 }, { "epoch": 0.1536723163841808, "grad_norm": 20.014755249023438, "learning_rate": 9.999687519737639e-05, "loss": 1.7708, "step": 51 }, { "epoch": 0.15668549905838042, "grad_norm": 14.703155517578125, "learning_rate": 9.998750118008115e-05, "loss": 1.7615, "step": 52 }, { "epoch": 0.15969868173258003, "grad_norm": 11.548951148986816, "learning_rate": 9.997187911979252e-05, "loss": 1.4554, "step": 53 }, { "epoch": 0.16271186440677965, "grad_norm": 8.304758071899414, "learning_rate": 9.995001096914461e-05, "loss": 1.2888, "step": 54 }, { "epoch": 0.1657250470809793, "grad_norm": 5.550910472869873, "learning_rate": 9.992189946148366e-05, "loss": 1.0796, "step": 55 }, { "epoch": 0.1687382297551789, "grad_norm": 3.1853792667388916, "learning_rate": 9.988754811052616e-05, "loss": 0.9803, "step": 56 }, { "epoch": 0.17175141242937852, "grad_norm": 2.140463352203369, "learning_rate": 9.984696120991978e-05, "loss": 0.8841, "step": 57 }, { "epoch": 0.17476459510357814, "grad_norm": 2.4900693893432617, "learning_rate": 9.980014383270668e-05, "loss": 0.8013, "step": 58 }, { "epoch": 0.17777777777777778, "grad_norm": 2.4014413356781006, "learning_rate": 9.974710183068935e-05, "loss": 0.7769, "step": 59 }, { "epoch": 0.1807909604519774, "grad_norm": 1.818248987197876, "learning_rate": 
9.968784183369929e-05, "loss": 0.6339, "step": 60 }, { "epoch": 0.18380414312617702, "grad_norm": 2.038853406906128, "learning_rate": 9.962237124876828e-05, "loss": 0.7058, "step": 61 }, { "epoch": 0.18681732580037666, "grad_norm": 1.9188238382339478, "learning_rate": 9.955069825920249e-05, "loss": 0.6063, "step": 62 }, { "epoch": 0.18983050847457628, "grad_norm": 1.960972547531128, "learning_rate": 9.947283182355982e-05, "loss": 0.6681, "step": 63 }, { "epoch": 0.1928436911487759, "grad_norm": 1.8506237268447876, "learning_rate": 9.938878167452992e-05, "loss": 0.4687, "step": 64 }, { "epoch": 0.1958568738229755, "grad_norm": 1.7827894687652588, "learning_rate": 9.929855831771786e-05, "loss": 0.5907, "step": 65 }, { "epoch": 0.19887005649717515, "grad_norm": 1.788488745689392, "learning_rate": 9.92021730303309e-05, "loss": 0.5778, "step": 66 }, { "epoch": 0.20188323917137477, "grad_norm": 1.9174665212631226, "learning_rate": 9.909963785976903e-05, "loss": 0.5587, "step": 67 }, { "epoch": 0.20489642184557438, "grad_norm": 1.898708462715149, "learning_rate": 9.899096562211902e-05, "loss": 0.4941, "step": 68 }, { "epoch": 0.207909604519774, "grad_norm": 2.086160659790039, "learning_rate": 9.887616990055262e-05, "loss": 0.5702, "step": 69 }, { "epoch": 0.21092278719397364, "grad_norm": 3.054201126098633, "learning_rate": 9.875526504362869e-05, "loss": 0.6915, "step": 70 }, { "epoch": 0.21393596986817326, "grad_norm": 4.820160388946533, "learning_rate": 9.86282661634998e-05, "loss": 1.0488, "step": 71 }, { "epoch": 0.21694915254237288, "grad_norm": 4.556093692779541, "learning_rate": 9.849518913402334e-05, "loss": 0.7673, "step": 72 }, { "epoch": 0.2199623352165725, "grad_norm": 3.98148250579834, "learning_rate": 9.835605058877729e-05, "loss": 0.7301, "step": 73 }, { "epoch": 0.22297551789077213, "grad_norm": 4.126860618591309, "learning_rate": 9.821086791898134e-05, "loss": 0.8709, "step": 74 }, { "epoch": 0.22598870056497175, "grad_norm": 4.8316216468811035, 
"learning_rate": 9.805965927132295e-05, "loss": 0.7508, "step": 75 }, { "epoch": 0.22900188323917137, "grad_norm": 7.367558479309082, "learning_rate": 9.79024435456893e-05, "loss": 1.4311, "step": 76 }, { "epoch": 0.232015065913371, "grad_norm": 6.81304407119751, "learning_rate": 9.773924039280487e-05, "loss": 1.3631, "step": 77 }, { "epoch": 0.23502824858757063, "grad_norm": 5.532495021820068, "learning_rate": 9.75700702117753e-05, "loss": 1.2462, "step": 78 }, { "epoch": 0.23804143126177024, "grad_norm": 3.750790596008301, "learning_rate": 9.739495414753753e-05, "loss": 0.9057, "step": 79 }, { "epoch": 0.24105461393596986, "grad_norm": 2.6967413425445557, "learning_rate": 9.721391408821711e-05, "loss": 0.8322, "step": 80 }, { "epoch": 0.2440677966101695, "grad_norm": 1.915755271911621, "learning_rate": 9.702697266239212e-05, "loss": 0.6605, "step": 81 }, { "epoch": 0.24708097928436912, "grad_norm": 1.681639552116394, "learning_rate": 9.683415323626485e-05, "loss": 0.6949, "step": 82 }, { "epoch": 0.25009416195856876, "grad_norm": 1.6740490198135376, "learning_rate": 9.663547991074127e-05, "loss": 0.751, "step": 83 }, { "epoch": 0.25009416195856876, "eval_loss": NaN, "eval_runtime": 92.9339, "eval_samples_per_second": 6.015, "eval_steps_per_second": 1.506, "step": 83 }, { "epoch": 0.25310734463276835, "grad_norm": 1.4990551471710205, "learning_rate": 9.643097751841854e-05, "loss": 0.5189, "step": 84 }, { "epoch": 0.256120527306968, "grad_norm": 1.4080356359481812, "learning_rate": 9.622067162048112e-05, "loss": 0.5112, "step": 85 }, { "epoch": 0.2591337099811676, "grad_norm": 1.3661057949066162, "learning_rate": 9.600458850350588e-05, "loss": 0.4688, "step": 86 }, { "epoch": 0.2621468926553672, "grad_norm": 1.570552945137024, "learning_rate": 9.578275517617645e-05, "loss": 0.5058, "step": 87 }, { "epoch": 0.26516007532956687, "grad_norm": 1.6037708520889282, "learning_rate": 9.555519936590738e-05, "loss": 0.5201, "step": 88 }, { "epoch": 0.26817325800376646, 
"grad_norm": 1.5268930196762085, "learning_rate": 9.532194951537838e-05, "loss": 0.4661, "step": 89 }, { "epoch": 0.2711864406779661, "grad_norm": 1.7837523221969604, "learning_rate": 9.508303477897924e-05, "loss": 0.5005, "step": 90 }, { "epoch": 0.27419962335216574, "grad_norm": 1.3590326309204102, "learning_rate": 9.483848501916578e-05, "loss": 0.3866, "step": 91 }, { "epoch": 0.27721280602636533, "grad_norm": 1.5031671524047852, "learning_rate": 9.458833080272722e-05, "loss": 0.3559, "step": 92 }, { "epoch": 0.280225988700565, "grad_norm": 1.2212880849838257, "learning_rate": 9.433260339696563e-05, "loss": 0.3586, "step": 93 }, { "epoch": 0.2832391713747646, "grad_norm": 1.8385019302368164, "learning_rate": 9.407133476578778e-05, "loss": 0.4775, "step": 94 }, { "epoch": 0.2862523540489642, "grad_norm": 2.6899161338806152, "learning_rate": 9.38045575657098e-05, "loss": 0.6809, "step": 95 }, { "epoch": 0.28926553672316385, "grad_norm": 3.9981398582458496, "learning_rate": 9.353230514177552e-05, "loss": 0.8967, "step": 96 }, { "epoch": 0.29227871939736344, "grad_norm": 3.7616143226623535, "learning_rate": 9.325461152338846e-05, "loss": 0.9173, "step": 97 }, { "epoch": 0.2952919020715631, "grad_norm": 3.3938989639282227, "learning_rate": 9.297151142005851e-05, "loss": 0.7849, "step": 98 }, { "epoch": 0.2983050847457627, "grad_norm": 3.3373446464538574, "learning_rate": 9.268304021706349e-05, "loss": 0.6619, "step": 99 }, { "epoch": 0.3013182674199623, "grad_norm": 4.476459503173828, "learning_rate": 9.23892339710263e-05, "loss": 0.7758, "step": 100 }, { "epoch": 0.30433145009416196, "grad_norm": 2.75358510017395, "learning_rate": 9.209012940540805e-05, "loss": 0.7565, "step": 101 }, { "epoch": 0.3073446327683616, "grad_norm": 2.192662000656128, "learning_rate": 9.178576390591802e-05, "loss": 0.6634, "step": 102 }, { "epoch": 0.3103578154425612, "grad_norm": 2.3334836959838867, "learning_rate": 9.147617551584066e-05, "loss": 0.6961, "step": 103 }, { "epoch": 
0.31337099811676083, "grad_norm": 1.9057625532150269, "learning_rate": 9.116140293128051e-05, "loss": 0.5762, "step": 104 }, { "epoch": 0.3163841807909605, "grad_norm": 1.5543274879455566, "learning_rate": 9.084148549632547e-05, "loss": 0.5249, "step": 105 }, { "epoch": 0.31939736346516007, "grad_norm": 1.3116902112960815, "learning_rate": 9.051646319812918e-05, "loss": 0.4895, "step": 106 }, { "epoch": 0.3224105461393597, "grad_norm": 1.6137094497680664, "learning_rate": 9.018637666191283e-05, "loss": 0.5036, "step": 107 }, { "epoch": 0.3254237288135593, "grad_norm": 1.4955766201019287, "learning_rate": 8.985126714588738e-05, "loss": 0.4571, "step": 108 }, { "epoch": 0.32843691148775894, "grad_norm": 1.5371748208999634, "learning_rate": 8.951117653609666e-05, "loss": 0.4958, "step": 109 }, { "epoch": 0.3314500941619586, "grad_norm": 1.2266839742660522, "learning_rate": 8.916614734118184e-05, "loss": 0.4171, "step": 110 }, { "epoch": 0.3344632768361582, "grad_norm": 1.21657133102417, "learning_rate": 8.881622268706825e-05, "loss": 0.421, "step": 111 }, { "epoch": 0.3374764595103578, "grad_norm": 1.2184901237487793, "learning_rate": 8.8461446311575e-05, "loss": 0.4307, "step": 112 }, { "epoch": 0.34048964218455746, "grad_norm": 1.5124021768569946, "learning_rate": 8.810186255894803e-05, "loss": 0.4865, "step": 113 }, { "epoch": 0.34350282485875705, "grad_norm": 1.078994870185852, "learning_rate": 8.773751637431748e-05, "loss": 0.3592, "step": 114 }, { "epoch": 0.3465160075329567, "grad_norm": 1.2173560857772827, "learning_rate": 8.736845329807993e-05, "loss": 0.3757, "step": 115 }, { "epoch": 0.3495291902071563, "grad_norm": 1.4223103523254395, "learning_rate": 8.69947194602061e-05, "loss": 0.4002, "step": 116 }, { "epoch": 0.3525423728813559, "grad_norm": 1.2369580268859863, "learning_rate": 8.66163615744751e-05, "loss": 0.3891, "step": 117 }, { "epoch": 0.35555555555555557, "grad_norm": 1.2306034564971924, "learning_rate": 8.623342693263548e-05, "loss": 0.3176, 
"step": 118 }, { "epoch": 0.35856873822975516, "grad_norm": 1.20809805393219, "learning_rate": 8.584596339849417e-05, "loss": 0.3715, "step": 119 }, { "epoch": 0.3615819209039548, "grad_norm": 1.59524405002594, "learning_rate": 8.545401940193392e-05, "loss": 0.4539, "step": 120 }, { "epoch": 0.36459510357815444, "grad_norm": 2.4288361072540283, "learning_rate": 8.505764393285984e-05, "loss": 0.7094, "step": 121 }, { "epoch": 0.36760828625235403, "grad_norm": 2.587125778198242, "learning_rate": 8.46568865350762e-05, "loss": 0.7052, "step": 122 }, { "epoch": 0.3706214689265537, "grad_norm": 3.610764980316162, "learning_rate": 8.425179730009368e-05, "loss": 0.6835, "step": 123 }, { "epoch": 0.3736346516007533, "grad_norm": 2.254451274871826, "learning_rate": 8.384242686086848e-05, "loss": 0.5733, "step": 124 }, { "epoch": 0.3766478342749529, "grad_norm": 3.2182092666625977, "learning_rate": 8.342882638547351e-05, "loss": 0.7416, "step": 125 }, { "epoch": 0.37966101694915255, "grad_norm": 2.0895962715148926, "learning_rate": 8.301104757070274e-05, "loss": 0.611, "step": 126 }, { "epoch": 0.38267419962335214, "grad_norm": 1.9307582378387451, "learning_rate": 8.258914263560971e-05, "loss": 0.6099, "step": 127 }, { "epoch": 0.3856873822975518, "grad_norm": 1.7885206937789917, "learning_rate": 8.216316431498028e-05, "loss": 0.4832, "step": 128 }, { "epoch": 0.3887005649717514, "grad_norm": 1.2265185117721558, "learning_rate": 8.173316585274145e-05, "loss": 0.4042, "step": 129 }, { "epoch": 0.391713747645951, "grad_norm": 1.369534969329834, "learning_rate": 8.129920099530607e-05, "loss": 0.4681, "step": 130 }, { "epoch": 0.39472693032015066, "grad_norm": 1.340951681137085, "learning_rate": 8.086132398485524e-05, "loss": 0.4775, "step": 131 }, { "epoch": 0.3977401129943503, "grad_norm": 1.1047234535217285, "learning_rate": 8.041958955255814e-05, "loss": 0.4508, "step": 132 }, { "epoch": 0.4007532956685499, "grad_norm": 1.0403156280517578, "learning_rate": 
7.99740529117313e-05, "loss": 0.4217, "step": 133 }, { "epoch": 0.40376647834274954, "grad_norm": 0.9500618577003479, "learning_rate": 7.952476975093729e-05, "loss": 0.34, "step": 134 }, { "epoch": 0.4067796610169492, "grad_norm": 1.1021428108215332, "learning_rate": 7.907179622702408e-05, "loss": 0.392, "step": 135 }, { "epoch": 0.40979284369114877, "grad_norm": 1.2623156309127808, "learning_rate": 7.861518895810596e-05, "loss": 0.4238, "step": 136 }, { "epoch": 0.4128060263653484, "grad_norm": 1.395652413368225, "learning_rate": 7.815500501648653e-05, "loss": 0.4211, "step": 137 }, { "epoch": 0.415819209039548, "grad_norm": 1.3175368309020996, "learning_rate": 7.769130192152538e-05, "loss": 0.415, "step": 138 }, { "epoch": 0.41883239171374764, "grad_norm": 1.3882197141647339, "learning_rate": 7.722413763244838e-05, "loss": 0.422, "step": 139 }, { "epoch": 0.4218455743879473, "grad_norm": 1.396023154258728, "learning_rate": 7.675357054110336e-05, "loss": 0.466, "step": 140 }, { "epoch": 0.4248587570621469, "grad_norm": 1.0779083967208862, "learning_rate": 7.627965946466166e-05, "loss": 0.3576, "step": 141 }, { "epoch": 0.4278719397363465, "grad_norm": 1.2511008977890015, "learning_rate": 7.580246363826621e-05, "loss": 0.301, "step": 142 }, { "epoch": 0.43088512241054616, "grad_norm": 1.13119375705719, "learning_rate": 7.532204270762786e-05, "loss": 0.3332, "step": 143 }, { "epoch": 0.43389830508474575, "grad_norm": 2.0195682048797607, "learning_rate": 7.483845672156998e-05, "loss": 0.6475, "step": 144 }, { "epoch": 0.4369114877589454, "grad_norm": 2.429945230484009, "learning_rate": 7.435176612452286e-05, "loss": 0.7177, "step": 145 }, { "epoch": 0.439924670433145, "grad_norm": 3.0756828784942627, "learning_rate": 7.386203174896872e-05, "loss": 0.741, "step": 146 }, { "epoch": 0.4429378531073446, "grad_norm": 3.7236998081207275, "learning_rate": 7.336931480783801e-05, "loss": 0.7999, "step": 147 }, { "epoch": 0.44595103578154427, "grad_norm": 2.7121517658233643, 
"learning_rate": 7.287367688685835e-05, "loss": 0.6044, "step": 148 }, { "epoch": 0.44896421845574386, "grad_norm": 3.661588668823242, "learning_rate": 7.237517993685678e-05, "loss": 0.5553, "step": 149 }, { "epoch": 0.4519774011299435, "grad_norm": 4.68520975112915, "learning_rate": 7.187388626601637e-05, "loss": 0.411, "step": 150 }, { "epoch": 0.45499058380414314, "grad_norm": 1.866217017173767, "learning_rate": 7.136985853208824e-05, "loss": 0.5442, "step": 151 }, { "epoch": 0.45800376647834273, "grad_norm": 1.6526014804840088, "learning_rate": 7.086315973455981e-05, "loss": 0.5071, "step": 152 }, { "epoch": 0.4610169491525424, "grad_norm": 1.3213937282562256, "learning_rate": 7.035385320678036e-05, "loss": 0.4598, "step": 153 }, { "epoch": 0.464030131826742, "grad_norm": 0.959452211856842, "learning_rate": 6.984200260804484e-05, "loss": 0.3485, "step": 154 }, { "epoch": 0.4670433145009416, "grad_norm": 1.0355703830718994, "learning_rate": 6.932767191563703e-05, "loss": 0.3648, "step": 155 }, { "epoch": 0.47005649717514125, "grad_norm": 0.9991386532783508, "learning_rate": 6.881092541683278e-05, "loss": 0.3535, "step": 156 }, { "epoch": 0.47306967984934084, "grad_norm": 1.0915963649749756, "learning_rate": 6.829182770086474e-05, "loss": 0.3682, "step": 157 }, { "epoch": 0.4760828625235405, "grad_norm": 0.9837580323219299, "learning_rate": 6.777044365084907e-05, "loss": 0.3703, "step": 158 }, { "epoch": 0.47909604519774013, "grad_norm": 1.258581280708313, "learning_rate": 6.724683843567568e-05, "loss": 0.4104, "step": 159 }, { "epoch": 0.4821092278719397, "grad_norm": 0.832224428653717, "learning_rate": 6.672107750186255e-05, "loss": 0.2934, "step": 160 }, { "epoch": 0.48512241054613936, "grad_norm": 0.881106436252594, "learning_rate": 6.619322656537552e-05, "loss": 0.3127, "step": 161 }, { "epoch": 0.488135593220339, "grad_norm": 1.257350206375122, "learning_rate": 6.566335160341424e-05, "loss": 0.3804, "step": 162 }, { "epoch": 0.4911487758945386, "grad_norm": 
1.6826411485671997, "learning_rate": 6.513151884616556e-05, "loss": 0.4807, "step": 163 }, { "epoch": 0.49416195856873824, "grad_norm": 1.31766676902771, "learning_rate": 6.459779476852528e-05, "loss": 0.3872, "step": 164 }, { "epoch": 0.4971751412429379, "grad_norm": 1.438594102859497, "learning_rate": 6.406224608178932e-05, "loss": 0.3868, "step": 165 }, { "epoch": 0.5001883239171375, "grad_norm": 1.198364496231079, "learning_rate": 6.352493972531534e-05, "loss": 0.3361, "step": 166 }, { "epoch": 0.5001883239171375, "eval_loss": null, "eval_runtime": 93.1419, "eval_samples_per_second": 6.002, "eval_steps_per_second": 1.503, "step": 166 } ], "logging_steps": 1, "max_steps": 331, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 83, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 5.46878716765012e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }