{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 135, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02247191011235955, "grad_norm": 0.2744160592556, "learning_rate": 1.0000000000000002e-06, "loss": 0.7619, "step": 1 }, { "epoch": 0.0449438202247191, "grad_norm": 0.32727673649787903, "learning_rate": 2.0000000000000003e-06, "loss": 0.8704, "step": 2 }, { "epoch": 0.06741573033707865, "grad_norm": 0.22079522907733917, "learning_rate": 3e-06, "loss": 0.7241, "step": 3 }, { "epoch": 0.0898876404494382, "grad_norm": 0.3042285144329071, "learning_rate": 4.000000000000001e-06, "loss": 0.9591, "step": 4 }, { "epoch": 0.11235955056179775, "grad_norm": 0.28325748443603516, "learning_rate": 5e-06, "loss": 1.061, "step": 5 }, { "epoch": 0.1348314606741573, "grad_norm": 0.29770785570144653, "learning_rate": 6e-06, "loss": 1.0407, "step": 6 }, { "epoch": 0.15730337078651685, "grad_norm": 0.2635294198989868, "learning_rate": 7e-06, "loss": 0.9524, "step": 7 }, { "epoch": 0.1797752808988764, "grad_norm": 0.2629376947879791, "learning_rate": 8.000000000000001e-06, "loss": 1.0095, "step": 8 }, { "epoch": 0.20224719101123595, "grad_norm": 0.30229848623275757, "learning_rate": 9e-06, "loss": 0.7805, "step": 9 }, { "epoch": 0.2247191011235955, "grad_norm": 0.2629299759864807, "learning_rate": 1e-05, "loss": 0.8706, "step": 10 }, { "epoch": 0.24719101123595505, "grad_norm": 0.26694297790527344, "learning_rate": 9.999146252290264e-06, "loss": 0.9653, "step": 11 }, { "epoch": 0.2696629213483146, "grad_norm": 0.2794544994831085, "learning_rate": 9.996585300715117e-06, "loss": 0.7668, "step": 12 }, { "epoch": 0.29213483146067415, "grad_norm": 0.30685344338417053, "learning_rate": 9.992318019837171e-06, "loss": 0.8074, "step": 13 }, { "epoch": 0.3146067415730337, "grad_norm": 0.3292616605758667, "learning_rate": 9.98634586692894e-06, "loss": 0.8413, "step": 14 }, { "epoch": 0.33707865168539325, "grad_norm": 0.34031111001968384, "learning_rate": 9.978670881475173e-06, "loss": 0.6885, "step": 15 }, { "epoch": 0.3595505617977528, "grad_norm": 0.31020820140838623, "learning_rate": 9.96929568447637e-06, "loss": 1.0567, "step": 16 }, { "epoch": 0.38202247191011235, "grad_norm": 0.28236857056617737, "learning_rate": 9.958223477553715e-06, "loss": 0.7969, "step": 17 }, { "epoch": 0.4044943820224719, "grad_norm": 0.3202574551105499, "learning_rate": 9.945458041855732e-06, "loss": 0.8376, "step": 18 }, { "epoch": 0.42696629213483145, "grad_norm": 0.2797839045524597, "learning_rate": 9.931003736767013e-06, "loss": 0.8911, "step": 19 }, { "epoch": 0.449438202247191, "grad_norm": 0.3114033639431, "learning_rate": 9.91486549841951e-06, "loss": 0.9042, "step": 20 }, { "epoch": 0.47191011235955055, "grad_norm": 0.288392037153244, "learning_rate": 9.89704883800683e-06, "loss": 0.9409, "step": 21 }, { "epoch": 0.4943820224719101, "grad_norm": 0.31594759225845337, "learning_rate": 9.877559839902185e-06, "loss": 0.7856, "step": 22 }, { "epoch": 0.5168539325842697, "grad_norm": 0.33034929633140564, "learning_rate": 9.85640515958057e-06, "loss": 0.8219, "step": 23 }, { "epoch": 0.5393258426966292, "grad_norm": 0.3292032778263092, "learning_rate": 9.833592021345938e-06, "loss": 0.7739, "step": 24 }, { "epoch": 0.5617977528089888, "grad_norm": 0.2713082432746887, "learning_rate": 9.809128215864096e-06, "loss": 0.6795, "step": 25 }, { "epoch": 0.5842696629213483, "grad_norm": 0.28911784291267395, "learning_rate": 9.783022097502204e-06, "loss": 0.8294, "step": 26 }, { "epoch": 0.6067415730337079, "grad_norm": 0.2532961666584015, "learning_rate": 9.755282581475769e-06, "loss": 0.8253, "step": 27 }, { "epoch": 0.6292134831460674, "grad_norm": 0.23403476178646088, "learning_rate": 9.7259191408041e-06, "loss": 0.8501, "step": 28 }, { "epoch": 0.651685393258427, "grad_norm": 0.2743186056613922, "learning_rate": 9.694941803075285e-06, "loss": 1.2514, "step": 29 }, { "epoch": 0.6741573033707865, "grad_norm": 0.22719059884548187, "learning_rate": 9.66236114702178e-06, "loss": 0.7012, "step": 30 }, { "epoch": 0.6966292134831461, "grad_norm": 0.22790080308914185, "learning_rate": 9.628188298907782e-06, "loss": 0.7681, "step": 31 }, { "epoch": 0.7191011235955056, "grad_norm": 0.18746985495090485, "learning_rate": 9.592434928729617e-06, "loss": 0.8587, "step": 32 }, { "epoch": 0.7415730337078652, "grad_norm": 0.19890236854553223, "learning_rate": 9.555113246230443e-06, "loss": 0.8129, "step": 33 }, { "epoch": 0.7640449438202247, "grad_norm": 0.16481083631515503, "learning_rate": 9.516235996730645e-06, "loss": 0.658, "step": 34 }, { "epoch": 0.7865168539325843, "grad_norm": 0.19077646732330322, "learning_rate": 9.475816456775313e-06, "loss": 0.6721, "step": 35 }, { "epoch": 0.8089887640449438, "grad_norm": 0.20115327835083008, "learning_rate": 9.43386842960031e-06, "loss": 0.7361, "step": 36 }, { "epoch": 0.8314606741573034, "grad_norm": 0.16515249013900757, "learning_rate": 9.39040624041849e-06, "loss": 0.6858, "step": 37 }, { "epoch": 0.8539325842696629, "grad_norm": 0.16454002261161804, "learning_rate": 9.345444731527642e-06, "loss": 0.902, "step": 38 }, { "epoch": 0.8764044943820225, "grad_norm": 0.12956053018569946, "learning_rate": 9.298999257241862e-06, "loss": 0.7653, "step": 39 }, { "epoch": 0.898876404494382, "grad_norm": 0.1354527771472931, "learning_rate": 9.251085678648072e-06, "loss": 0.6624, "step": 40 }, { "epoch": 0.9213483146067416, "grad_norm": 0.14311733841896057, "learning_rate": 9.201720358189464e-06, "loss": 0.6757, "step": 41 }, { "epoch": 0.9438202247191011, "grad_norm": 0.14996573328971863, "learning_rate": 9.150920154077753e-06, "loss": 0.639, "step": 42 }, { "epoch": 0.9662921348314607, "grad_norm": 0.1524282693862915, "learning_rate": 9.098702414536107e-06, "loss": 0.8575, "step": 43 }, { "epoch": 0.9887640449438202, "grad_norm": 0.1349412202835083, "learning_rate": 9.045084971874738e-06, "loss": 0.6459, "step": 44 }, { "epoch": 1.0112359550561798, "grad_norm": 0.12348250299692154, "learning_rate": 8.990086136401199e-06, "loss": 0.6682, "step": 45 }, { "epoch": 1.0167597765363128, "grad_norm": 0.12682083249092102, "learning_rate": 8.933724690167417e-06, "loss": 0.6996, "step": 46 }, { "epoch": 1.0391061452513966, "grad_norm": 0.14481687545776367, "learning_rate": 8.87601988055565e-06, "loss": 0.9028, "step": 47 }, { "epoch": 1.0614525139664805, "grad_norm": 0.20267356932163239, "learning_rate": 8.816991413705515e-06, "loss": 0.7655, "step": 48 }, { "epoch": 1.0837988826815643, "grad_norm": 0.16279681026935577, "learning_rate": 8.756659447784367e-06, "loss": 0.6583, "step": 49 }, { "epoch": 1.106145251396648, "grad_norm": 0.1354082226753235, "learning_rate": 8.695044586103297e-06, "loss": 0.8319, "step": 50 }, { "epoch": 1.1284916201117319, "grad_norm": 0.14600947499275208, "learning_rate": 8.632167870081122e-06, "loss": 0.6148, "step": 51 }, { "epoch": 1.1508379888268156, "grad_norm": 0.2995755672454834, "learning_rate": 8.568050772058763e-06, "loss": 0.8722, "step": 52 }, { "epoch": 1.1731843575418994, "grad_norm": 0.1370067149400711, "learning_rate": 8.502715187966455e-06, "loss": 0.8219, "step": 53 }, { "epoch": 1.1955307262569832, "grad_norm": 0.14704233407974243, "learning_rate": 8.436183429846314e-06, "loss": 0.8698, "step": 54 }, { "epoch": 1.217877094972067, "grad_norm": 0.13113325834274292, "learning_rate": 8.368478218232787e-06, "loss": 0.6741, "step": 55 }, { "epoch": 1.2402234636871508, "grad_norm": 0.15853165090084076, "learning_rate": 8.299622674393615e-06, "loss": 0.6672, "step": 56 }, { "epoch": 1.2625698324022347, "grad_norm": 0.12545864284038544, "learning_rate": 8.229640312433938e-06, "loss": 0.6325, "step": 57 }, { "epoch": 1.2849162011173183, "grad_norm": 0.13042129576206207, "learning_rate": 8.158555031266255e-06, "loss": 0.8421, "step": 58 }, { "epoch": 1.3072625698324023, "grad_norm": 0.12107989937067032, "learning_rate": 8.086391106448965e-06, "loss": 0.849, "step": 59 }, { "epoch": 1.329608938547486, "grad_norm": 0.12185141444206238, "learning_rate": 8.013173181896283e-06, "loss": 0.6157, "step": 60 }, { "epoch": 1.3519553072625698, "grad_norm": 0.14786343276500702, "learning_rate": 7.938926261462366e-06, "loss": 0.8534, "step": 61 }, { "epoch": 1.3743016759776536, "grad_norm": 0.1170087456703186, "learning_rate": 7.863675700402527e-06, "loss": 0.6024, "step": 62 }, { "epoch": 1.3966480446927374, "grad_norm": 0.12368728965520859, "learning_rate": 7.787447196714428e-06, "loss": 0.621, "step": 63 }, { "epoch": 1.4189944134078212, "grad_norm": 0.13665403425693512, "learning_rate": 7.710266782362248e-06, "loss": 0.8647, "step": 64 }, { "epoch": 1.441340782122905, "grad_norm": 0.14922167360782623, "learning_rate": 7.63216081438678e-06, "loss": 0.6963, "step": 65 }, { "epoch": 1.463687150837989, "grad_norm": 0.20886604487895966, "learning_rate": 7.553155965904535e-06, "loss": 0.9604, "step": 66 }, { "epoch": 1.4860335195530725, "grad_norm": 0.12474123388528824, "learning_rate": 7.473279216998896e-06, "loss": 0.7405, "step": 67 }, { "epoch": 1.5083798882681565, "grad_norm": 0.12420104444026947, "learning_rate": 7.392557845506433e-06, "loss": 0.6579, "step": 68 }, { "epoch": 1.5307262569832403, "grad_norm": 0.13506121933460236, "learning_rate": 7.311019417701567e-06, "loss": 0.8146, "step": 69 }, { "epoch": 1.553072625698324, "grad_norm": 0.13394294679164886, "learning_rate": 7.2286917788826926e-06, "loss": 0.6438, "step": 70 }, { "epoch": 1.5754189944134078, "grad_norm": 0.12186373025178909, "learning_rate": 7.145603043863045e-06, "loss": 0.6964, "step": 71 }, { "epoch": 1.5977653631284916, "grad_norm": 0.12778373062610626, "learning_rate": 7.061781587369518e-06, "loss": 0.6403, "step": 72 }, { "epoch": 1.6201117318435754, "grad_norm": 0.12650710344314575, "learning_rate": 6.977256034352713e-06, "loss": 0.7991, "step": 73 }, { "epoch": 1.6424581005586592, "grad_norm": 0.12929368019104004, "learning_rate": 6.892055250211552e-06, "loss": 0.731, "step": 74 }, { "epoch": 1.6648044692737431, "grad_norm": 0.11519794166088104, "learning_rate": 6.806208330935766e-06, "loss": 0.787, "step": 75 }, { "epoch": 1.6871508379888267, "grad_norm": 0.11956265568733215, "learning_rate": 6.719744593169642e-06, "loss": 0.7078, "step": 76 }, { "epoch": 1.7094972067039107, "grad_norm": 0.11849120259284973, "learning_rate": 6.6326935642004165e-06, "loss": 0.6098, "step": 77 }, { "epoch": 1.7318435754189943, "grad_norm": 0.1370634287595749, "learning_rate": 6.545084971874738e-06, "loss": 0.775, "step": 78 }, { "epoch": 1.7541899441340782, "grad_norm": 0.1339302361011505, "learning_rate": 6.456948734446624e-06, "loss": 0.7778, "step": 79 }, { "epoch": 1.776536312849162, "grad_norm": 0.1872129589319229, "learning_rate": 6.368314950360416e-06, "loss": 0.7051, "step": 80 }, { "epoch": 1.7988826815642458, "grad_norm": 0.13236720860004425, "learning_rate": 6.279213887972179e-06, "loss": 0.6648, "step": 81 }, { "epoch": 1.8212290502793296, "grad_norm": 0.14064921438694, "learning_rate": 6.189675975213094e-06, "loss": 0.9904, "step": 82 }, { "epoch": 1.8435754189944134, "grad_norm": 0.14369916915893555, "learning_rate": 6.099731789198344e-06, "loss": 0.7428, "step": 83 }, { "epoch": 1.8659217877094973, "grad_norm": 0.12341982126235962, "learning_rate": 6.009412045785051e-06, "loss": 0.679, "step": 84 }, { "epoch": 1.888268156424581, "grad_norm": 0.12746700644493103, "learning_rate": 5.918747589082853e-06, "loss": 0.8121, "step": 85 }, { "epoch": 1.910614525139665, "grad_norm": 0.1257738620042801, "learning_rate": 5.82776938092065e-06, "loss": 0.75, "step": 86 }, { "epoch": 1.9329608938547485, "grad_norm": 0.1163194552063942, "learning_rate": 5.736508490273189e-06, "loss": 0.6534, "step": 87 }, { "epoch": 1.9553072625698324, "grad_norm": 0.14038220047950745, "learning_rate": 5.644996082651018e-06, "loss": 0.6898, "step": 88 }, { "epoch": 1.9776536312849162, "grad_norm": 0.11753737926483154, "learning_rate": 5.553263409457504e-06, "loss": 0.5927, "step": 89 }, { "epoch": 2.0, "grad_norm": 0.1380361169576645, "learning_rate": 5.46134179731651e-06, "loss": 0.5046, "step": 90 }, { "epoch": 2.022222222222222, "grad_norm": 0.13965260982513428, "learning_rate": 5.36926263737437e-06, "loss": 0.6015, "step": 91 }, { "epoch": 2.0444444444444443, "grad_norm": 0.1232825219631195, "learning_rate": 5.27705737457985e-06, "loss": 0.6435, "step": 92 }, { "epoch": 2.066666666666667, "grad_norm": 0.1476251482963562, "learning_rate": 5.184757496945726e-06, "loss": 0.7735, "step": 93 }, { "epoch": 2.088888888888889, "grad_norm": 0.15276069939136505, "learning_rate": 5.09239452479565e-06, "loss": 0.7135, "step": 94 }, { "epoch": 2.111111111111111, "grad_norm": 0.1452779620885849, "learning_rate": 5e-06, "loss": 0.7005, "step": 95 }, { "epoch": 2.1333333333333333, "grad_norm": 0.14925116300582886, "learning_rate": 4.907605475204352e-06, "loss": 0.6766, "step": 96 }, { "epoch": 2.1555555555555554, "grad_norm": 0.1299029439687729, "learning_rate": 4.815242503054277e-06, "loss": 0.7491, "step": 97 }, { "epoch": 2.1777777777777776, "grad_norm": 0.1248641163110733, "learning_rate": 4.7229426254201504e-06, "loss": 0.5849, "step": 98 }, { "epoch": 2.2, "grad_norm": 0.15963758528232574, "learning_rate": 4.630737362625631e-06, "loss": 0.7076, "step": 99 }, { "epoch": 2.2222222222222223, "grad_norm": 0.12676207721233368, "learning_rate": 4.53865820268349e-06, "loss": 0.7432, "step": 100 }, { "epoch": 2.2444444444444445, "grad_norm": 0.13234369456768036, "learning_rate": 4.446736590542497e-06, "loss": 0.6691, "step": 101 }, { "epoch": 2.2666666666666666, "grad_norm": 0.13355791568756104, "learning_rate": 4.355003917348985e-06, "loss": 0.8496, "step": 102 }, { "epoch": 2.2888888888888888, "grad_norm": 0.1326327621936798, "learning_rate": 4.263491509726812e-06, "loss": 0.6953, "step": 103 }, { "epoch": 2.311111111111111, "grad_norm": 0.14978842437267303, "learning_rate": 4.17223061907935e-06, "loss": 0.6379, "step": 104 }, { "epoch": 2.3333333333333335, "grad_norm": 0.14492392539978027, "learning_rate": 4.081252410917148e-06, "loss": 0.647, "step": 105 }, { "epoch": 2.3555555555555556, "grad_norm": 0.12510988116264343, "learning_rate": 3.99058795421495e-06, "loss": 0.6436, "step": 106 }, { "epoch": 2.3777777777777778, "grad_norm": 0.13913969695568085, "learning_rate": 3.9002682108016585e-06, "loss": 0.8472, "step": 107 }, { "epoch": 2.4, "grad_norm": 0.1545591950416565, "learning_rate": 3.8103240247869077e-06, "loss": 0.9525, "step": 108 }, { "epoch": 2.422222222222222, "grad_norm": 0.1325288563966751, "learning_rate": 3.720786112027822e-06, "loss": 0.6432, "step": 109 }, { "epoch": 2.4444444444444446, "grad_norm": 0.13336889445781708, "learning_rate": 3.6316850496395863e-06, "loss": 0.6298, "step": 110 }, { "epoch": 2.466666666666667, "grad_norm": 0.13420487940311432, "learning_rate": 3.5430512655533774e-06, "loss": 0.6816, "step": 111 }, { "epoch": 2.488888888888889, "grad_norm": 0.12793636322021484, "learning_rate": 3.4549150281252635e-06, "loss": 0.8182, "step": 112 }, { "epoch": 2.511111111111111, "grad_norm": 0.1300155520439148, "learning_rate": 3.3673064357995844e-06, "loss": 0.5745, "step": 113 }, { "epoch": 2.533333333333333, "grad_norm": 0.130320742726326, "learning_rate": 3.2802554068303595e-06, "loss": 0.6592, "step": 114 }, { "epoch": 2.5555555555555554, "grad_norm": 0.13515885174274445, "learning_rate": 3.1937916690642356e-06, "loss": 0.6122, "step": 115 }, { "epoch": 2.5777777777777775, "grad_norm": 0.1352824568748474, "learning_rate": 3.107944749788449e-06, "loss": 0.6337, "step": 116 }, { "epoch": 2.6, "grad_norm": 0.14149518311023712, "learning_rate": 3.0227439656472878e-06, "loss": 0.8707, "step": 117 }, { "epoch": 2.6222222222222222, "grad_norm": 0.12694476544857025, "learning_rate": 2.9382184126304834e-06, "loss": 0.6052, "step": 118 }, { "epoch": 2.6444444444444444, "grad_norm": 0.1378766894340515, "learning_rate": 2.8543969561369556e-06, "loss": 0.6302, "step": 119 }, { "epoch": 2.6666666666666665, "grad_norm": 0.13251428306102753, "learning_rate": 2.771308221117309e-06, "loss": 0.7513, "step": 120 }, { "epoch": 2.688888888888889, "grad_norm": 0.14362263679504395, "learning_rate": 2.6889805822984348e-06, "loss": 0.5635, "step": 121 }, { "epoch": 2.7111111111111112, "grad_norm": 0.16358952224254608, "learning_rate": 2.607442154493568e-06, "loss": 0.6577, "step": 122 }, { "epoch": 2.7333333333333334, "grad_norm": 0.13316309452056885, "learning_rate": 2.526720783001107e-06, "loss": 0.5734, "step": 123 }, { "epoch": 2.7555555555555555, "grad_norm": 0.12918144464492798, "learning_rate": 2.4468440340954664e-06, "loss": 0.6182, "step": 124 }, { "epoch": 2.7777777777777777, "grad_norm": 0.13864773511886597, "learning_rate": 2.3678391856132203e-06, "loss": 0.79, "step": 125 }, { "epoch": 2.8, "grad_norm": 0.13537746667861938, "learning_rate": 2.289733217637753e-06, "loss": 0.8381, "step": 126 }, { "epoch": 2.822222222222222, "grad_norm": 0.2007640302181244, "learning_rate": 2.2125528032855727e-06, "loss": 0.9545, "step": 127 }, { "epoch": 2.8444444444444446, "grad_norm": 0.15388049185276031, "learning_rate": 2.136324299597474e-06, "loss": 0.6663, "step": 128 }, { "epoch": 2.8666666666666667, "grad_norm": 0.14259681105613708, "learning_rate": 2.061073738537635e-06, "loss": 0.7019, "step": 129 }, { "epoch": 2.888888888888889, "grad_norm": 0.1373143494129181, "learning_rate": 1.9868268181037186e-06, "loss": 0.613, "step": 130 }, { "epoch": 2.911111111111111, "grad_norm": 0.1363854706287384, "learning_rate": 1.913608893551036e-06, "loss": 0.7546, "step": 131 }, { "epoch": 2.9333333333333336, "grad_norm": 0.17132681608200073, "learning_rate": 1.8414449687337467e-06, "loss": 0.8887, "step": 132 }, { "epoch": 2.9555555555555557, "grad_norm": 0.17492803931236267, "learning_rate": 1.7703596875660645e-06, "loss": 1.037, "step": 133 }, { "epoch": 2.977777777777778, "grad_norm": 0.13808095455169678, "learning_rate": 1.7003773256063882e-06, "loss": 0.8112, "step": 134 }, { "epoch": 3.0, "grad_norm": 0.16262874007225037, "learning_rate": 1.6315217817672142e-06, "loss": 0.5395, "step": 135 } ], "logging_steps": 1, "max_steps": 180, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 45, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.0233514423222272e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }