{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9566929133858268, "eval_steps": 16, "global_step": 126, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.015748031496062992, "grad_norm": 0.034481361508369446, "learning_rate": 4e-05, "loss": 0.1412, "step": 1 }, { "epoch": 0.015748031496062992, "eval_loss": 0.1612786203622818, "eval_runtime": 64.5157, "eval_samples_per_second": 7.812, "eval_steps_per_second": 0.977, "step": 1 }, { "epoch": 0.031496062992125984, "grad_norm": 0.029317770153284073, "learning_rate": 8e-05, "loss": 0.1191, "step": 2 }, { "epoch": 0.047244094488188976, "grad_norm": 0.036621659994125366, "learning_rate": 0.00012, "loss": 0.1369, "step": 3 }, { "epoch": 0.06299212598425197, "grad_norm": 0.04425783455371857, "learning_rate": 0.00016, "loss": 0.1321, "step": 4 }, { "epoch": 0.07874015748031496, "grad_norm": 0.05247063934803009, "learning_rate": 0.0002, "loss": 0.1285, "step": 5 }, { "epoch": 0.09448818897637795, "grad_norm": 0.03902214765548706, "learning_rate": 0.00019996629653035126, "loss": 0.1004, "step": 6 }, { "epoch": 0.11023622047244094, "grad_norm": 0.03752463683485985, "learning_rate": 0.00019986520883988232, "loss": 0.0985, "step": 7 }, { "epoch": 0.12598425196850394, "grad_norm": 0.03061060793697834, "learning_rate": 0.00019969680506871137, "loss": 0.0912, "step": 8 }, { "epoch": 0.14173228346456693, "grad_norm": 0.034427180886268616, "learning_rate": 0.00019946119873266613, "loss": 0.0836, "step": 9 }, { "epoch": 0.15748031496062992, "grad_norm": 0.03106631338596344, "learning_rate": 0.00019915854864676664, "loss": 0.0734, "step": 10 }, { "epoch": 0.1732283464566929, "grad_norm": 0.02498232200741768, "learning_rate": 0.00019878905881817252, "loss": 0.0729, "step": 11 }, { "epoch": 0.1889763779527559, "grad_norm": 0.03798564895987511, "learning_rate": 0.00019835297830866826, "loss": 0.0694, "step": 12 }, { "epoch": 0.2047244094488189, "grad_norm": 0.046124912798404694, "learning_rate": 0.00019785060106677818, "loss": 0.0833, "step": 13 }, { "epoch": 0.2204724409448819, "grad_norm": 0.02981509082019329, "learning_rate": 0.00019728226572962473, "loss": 0.0713, "step": 14 }, { "epoch": 0.23622047244094488, "grad_norm": 0.02461801841855049, "learning_rate": 0.0001966483553946637, "loss": 0.0657, "step": 15 }, { "epoch": 0.25196850393700787, "grad_norm": 0.04344266653060913, "learning_rate": 0.00019594929736144976, "loss": 0.0635, "step": 16 }, { "epoch": 0.25196850393700787, "eval_loss": 0.06579381227493286, "eval_runtime": 64.6166, "eval_samples_per_second": 7.8, "eval_steps_per_second": 0.975, "step": 16 }, { "epoch": 0.2677165354330709, "grad_norm": 0.0320642925798893, "learning_rate": 0.00019518556284360696, "loss": 0.0656, "step": 17 }, { "epoch": 0.28346456692913385, "grad_norm": 0.028899891301989555, "learning_rate": 0.0001943576666511982, "loss": 0.0462, "step": 18 }, { "epoch": 0.2992125984251969, "grad_norm": 0.02383616380393505, "learning_rate": 0.0001934661668437073, "loss": 0.0649, "step": 19 }, { "epoch": 0.31496062992125984, "grad_norm": 0.03346535563468933, "learning_rate": 0.0001925116643538684, "loss": 0.0546, "step": 20 }, { "epoch": 0.33070866141732286, "grad_norm": 0.020454615354537964, "learning_rate": 0.00019149480258259533, "loss": 0.0538, "step": 21 }, { "epoch": 0.3464566929133858, "grad_norm": 0.02081696316599846, "learning_rate": 0.00019041626696528503, "loss": 0.0526, "step": 22 }, { "epoch": 0.36220472440944884, "grad_norm": 0.028128350153565407, "learning_rate": 0.0001892767845097864, "loss": 0.0593, "step": 23 }, { "epoch": 0.3779527559055118, "grad_norm": 0.015519126318395138, "learning_rate": 0.00018807712330634642, "loss": 0.0528, "step": 24 }, { "epoch": 0.3937007874015748, "grad_norm": 0.03593792766332626, "learning_rate": 0.0001868180920098644, "loss": 0.0481, "step": 25 }, { "epoch": 0.4094488188976378, "grad_norm": 0.015408644452691078, "learning_rate": 0.00018550053929480202, "loss": 0.0479, "step": 26 }, { "epoch": 0.4251968503937008, "grad_norm": 0.021226301789283752, "learning_rate": 0.00018412535328311814, "loss": 0.054, "step": 27 }, { "epoch": 0.4409448818897638, "grad_norm": 0.01717953570187092, "learning_rate": 0.0001826934609456129, "loss": 0.0523, "step": 28 }, { "epoch": 0.4566929133858268, "grad_norm": 0.019626960158348083, "learning_rate": 0.00018120582747708502, "loss": 0.0512, "step": 29 }, { "epoch": 0.47244094488188976, "grad_norm": 0.019186396151781082, "learning_rate": 0.0001796634556457236, "loss": 0.05, "step": 30 }, { "epoch": 0.4881889763779528, "grad_norm": 0.014989328570663929, "learning_rate": 0.0001780673851171728, "loss": 0.0441, "step": 31 }, { "epoch": 0.5039370078740157, "grad_norm": 0.012519012205302715, "learning_rate": 0.00017641869175372493, "loss": 0.0459, "step": 32 }, { "epoch": 0.5039370078740157, "eval_loss": 0.056131936609745026, "eval_runtime": 64.4985, "eval_samples_per_second": 7.814, "eval_steps_per_second": 0.977, "step": 32 }, { "epoch": 0.5196850393700787, "grad_norm": 0.01598576456308365, "learning_rate": 0.00017471848688911464, "loss": 0.0496, "step": 33 }, { "epoch": 0.5354330708661418, "grad_norm": 0.017361309379339218, "learning_rate": 0.000172967916579403, "loss": 0.0534, "step": 34 }, { "epoch": 0.5511811023622047, "grad_norm": 0.021230200305581093, "learning_rate": 0.00017116816083045602, "loss": 0.0505, "step": 35 }, { "epoch": 0.5669291338582677, "grad_norm": 0.01624094881117344, "learning_rate": 0.0001693204328025389, "loss": 0.0568, "step": 36 }, { "epoch": 0.5826771653543307, "grad_norm": 0.014916475862264633, "learning_rate": 0.00016742597799256182, "loss": 0.0542, "step": 37 }, { "epoch": 0.5984251968503937, "grad_norm": 0.013211382552981377, "learning_rate": 0.00016548607339452853, "loss": 0.0507, "step": 38 }, { "epoch": 0.6141732283464567, "grad_norm": 0.01305565144866705, "learning_rate": 0.00016350202663875386, "loss": 0.0387, "step": 39 }, { "epoch": 0.6299212598425197, "grad_norm": 0.011459614150226116, "learning_rate": 0.0001614751751104301, "loss": 0.0433, "step": 40 }, { "epoch": 0.6456692913385826, "grad_norm": 0.014712609350681305, "learning_rate": 0.00015940688504813662, "loss": 0.0571, "step": 41 }, { "epoch": 0.6614173228346457, "grad_norm": 0.015662657096982002, "learning_rate": 0.00015729855062290022, "loss": 0.0504, "step": 42 }, { "epoch": 0.6771653543307087, "grad_norm": 0.011235736310482025, "learning_rate": 0.00015515159299842707, "loss": 0.0453, "step": 43 }, { "epoch": 0.6929133858267716, "grad_norm": 0.011984420008957386, "learning_rate": 0.00015296745937313987, "loss": 0.0402, "step": 44 }, { "epoch": 0.7086614173228346, "grad_norm": 0.010523953475058079, "learning_rate": 0.00015074762200466556, "loss": 0.036, "step": 45 }, { "epoch": 0.7244094488188977, "grad_norm": 0.013540665619075298, "learning_rate": 0.00014849357721743168, "loss": 0.0346, "step": 46 }, { "epoch": 0.7401574803149606, "grad_norm": 0.012998640537261963, "learning_rate": 0.00014620684439403962, "loss": 0.0468, "step": 47 }, { "epoch": 0.7559055118110236, "grad_norm": 0.01443515345454216, "learning_rate": 0.0001438889649510956, "loss": 0.0453, "step": 48 }, { "epoch": 0.7559055118110236, "eval_loss": 0.05216333642601967, "eval_runtime": 64.5137, "eval_samples_per_second": 7.812, "eval_steps_per_second": 0.977, "step": 48 }, { "epoch": 0.7716535433070866, "grad_norm": 0.01463907677680254, "learning_rate": 0.00014154150130018866, "loss": 0.0526, "step": 49 }, { "epoch": 0.7874015748031497, "grad_norm": 0.01614455319941044, "learning_rate": 0.00013916603579471705, "loss": 0.0484, "step": 50 }, { "epoch": 0.8031496062992126, "grad_norm": 0.014042153023183346, "learning_rate": 0.000136764169663272, "loss": 0.0419, "step": 51 }, { "epoch": 0.8188976377952756, "grad_norm": 0.015309924259781837, "learning_rate": 0.00013433752193029886, "loss": 0.0425, "step": 52 }, { "epoch": 0.8346456692913385, "grad_norm": 0.018054217100143433, "learning_rate": 0.00013188772832476188, "loss": 0.0426, "step": 53 }, { "epoch": 0.8503937007874016, "grad_norm": 0.012343033216893673, "learning_rate": 0.00012941644017754964, "loss": 0.0448, "step": 54 }, { "epoch": 0.8661417322834646, "grad_norm": 0.012457596138119698, "learning_rate": 0.00012692532330836346, "loss": 0.0451, "step": 55 }, { "epoch": 0.8818897637795275, "grad_norm": 0.013512413017451763, "learning_rate": 0.00012441605690283915, "loss": 0.0413, "step": 56 }, { "epoch": 0.8976377952755905, "grad_norm": 0.013424846343696117, "learning_rate": 0.0001218903323806595, "loss": 0.0441, "step": 57 }, { "epoch": 0.9133858267716536, "grad_norm": 0.014157367870211601, "learning_rate": 0.00011934985225541998, "loss": 0.0443, "step": 58 }, { "epoch": 0.9291338582677166, "grad_norm": 0.0130110839381814, "learning_rate": 0.00011679632898701649, "loss": 0.0478, "step": 59 }, { "epoch": 0.9448818897637795, "grad_norm": 0.012677576392889023, "learning_rate": 0.00011423148382732853, "loss": 0.0399, "step": 60 }, { "epoch": 0.9606299212598425, "grad_norm": 0.01409006118774414, "learning_rate": 0.00011165704565997593, "loss": 0.0481, "step": 61 }, { "epoch": 0.9763779527559056, "grad_norm": 0.013535700738430023, "learning_rate": 0.00010907474983493144, "loss": 0.0406, "step": 62 }, { "epoch": 0.9921259842519685, "grad_norm": 0.014210895635187626, "learning_rate": 0.0001064863369987743, "loss": 0.0425, "step": 63 }, { "epoch": 1.0078740157480315, "grad_norm": 0.014430968090891838, "learning_rate": 0.00010389355192137377, "loss": 0.0483, "step": 64 }, { "epoch": 1.0078740157480315, "eval_loss": 0.049744635820388794, "eval_runtime": 64.598, "eval_samples_per_second": 7.802, "eval_steps_per_second": 0.975, "step": 64 }, { "epoch": 1.0236220472440944, "grad_norm": 0.0142066590487957, "learning_rate": 0.0001012981423197931, "loss": 0.0391, "step": 65 }, { "epoch": 1.0118110236220472, "grad_norm": 0.013278558850288391, "learning_rate": 9.870185768020693e-05, "loss": 0.045, "step": 66 }, { "epoch": 1.0275590551181102, "grad_norm": 0.01264102477580309, "learning_rate": 9.610644807862625e-05, "loss": 0.0396, "step": 67 }, { "epoch": 1.0433070866141732, "grad_norm": 0.014591066166758537, "learning_rate": 9.35136630012257e-05, "loss": 0.0443, "step": 68 }, { "epoch": 1.0590551181102361, "grad_norm": 0.013674317859113216, "learning_rate": 9.092525016506858e-05, "loss": 0.0493, "step": 69 }, { "epoch": 1.0748031496062993, "grad_norm": 0.0148893091827631, "learning_rate": 8.83429543400241e-05, "loss": 0.0412, "step": 70 }, { "epoch": 1.0905511811023623, "grad_norm": 0.01666112430393696, "learning_rate": 8.57685161726715e-05, "loss": 0.0476, "step": 71 }, { "epoch": 1.1062992125984252, "grad_norm": 0.013044373132288456, "learning_rate": 8.320367101298351e-05, "loss": 0.0391, "step": 72 }, { "epoch": 1.1220472440944882, "grad_norm": 0.014822134748101234, "learning_rate": 8.065014774458003e-05, "loss": 0.0406, "step": 73 }, { "epoch": 1.1377952755905512, "grad_norm": 0.013880250044167042, "learning_rate": 7.810966761934053e-05, "loss": 0.0405, "step": 74 }, { "epoch": 1.1535433070866141, "grad_norm": 0.014100627042353153, "learning_rate": 7.558394309716088e-05, "loss": 0.0422, "step": 75 }, { "epoch": 1.169291338582677, "grad_norm": 0.01578613370656967, "learning_rate": 7.307467669163655e-05, "loss": 0.0411, "step": 76 }, { "epoch": 1.1850393700787403, "grad_norm": 0.013604246079921722, "learning_rate": 7.058355982245037e-05, "loss": 0.0373, "step": 77 }, { "epoch": 1.2007874015748032, "grad_norm": 0.016308438032865524, "learning_rate": 6.811227167523815e-05, "loss": 0.0472, "step": 78 }, { "epoch": 1.2165354330708662, "grad_norm": 0.014247502200305462, "learning_rate": 6.566247806970119e-05, "loss": 0.0464, "step": 79 }, { "epoch": 1.2322834645669292, "grad_norm": 0.012891258113086224, "learning_rate": 6.323583033672799e-05, "loss": 0.0366, "step": 80 }, { "epoch": 1.2322834645669292, "eval_loss": 0.048343077301979065, "eval_runtime": 64.7735, "eval_samples_per_second": 7.781, "eval_steps_per_second": 0.973, "step": 80 }, { "epoch": 1.2480314960629921, "grad_norm": 0.015204845927655697, "learning_rate": 6.083396420528298e-05, "loss": 0.0438, "step": 81 }, { "epoch": 1.263779527559055, "grad_norm": 0.01763073354959488, "learning_rate": 5.845849869981137e-05, "loss": 0.0466, "step": 82 }, { "epoch": 1.279527559055118, "grad_norm": 0.013175925239920616, "learning_rate": 5.611103504890444e-05, "loss": 0.039, "step": 83 }, { "epoch": 1.295275590551181, "grad_norm": 0.016102107241749763, "learning_rate": 5.379315560596038e-05, "loss": 0.0462, "step": 84 }, { "epoch": 1.311023622047244, "grad_norm": 0.014480439946055412, "learning_rate": 5.1506422782568345e-05, "loss": 0.0402, "step": 85 }, { "epoch": 1.326771653543307, "grad_norm": 0.017164282500743866, "learning_rate": 4.9252377995334444e-05, "loss": 0.0418, "step": 86 }, { "epoch": 1.3425196850393701, "grad_norm": 0.013455789536237717, "learning_rate": 4.703254062686017e-05, "loss": 0.0402, "step": 87 }, { "epoch": 1.358267716535433, "grad_norm": 0.014540264382958412, "learning_rate": 4.484840700157295e-05, "loss": 0.038, "step": 88 }, { "epoch": 1.374015748031496, "grad_norm": 0.014430800452828407, "learning_rate": 4.270144937709981e-05, "loss": 0.0393, "step": 89 }, { "epoch": 1.389763779527559, "grad_norm": 0.013658607378602028, "learning_rate": 4.059311495186338e-05, "loss": 0.0354, "step": 90 }, { "epoch": 1.405511811023622, "grad_norm": 0.01640120893716812, "learning_rate": 3.852482488956992e-05, "loss": 0.0446, "step": 91 }, { "epoch": 1.421259842519685, "grad_norm": 0.013601432554423809, "learning_rate": 3.649797336124615e-05, "loss": 0.035, "step": 92 }, { "epoch": 1.4370078740157481, "grad_norm": 0.016174251213669777, "learning_rate": 3.45139266054715e-05, "loss": 0.0432, "step": 93 }, { "epoch": 1.452755905511811, "grad_norm": 0.01637461967766285, "learning_rate": 3.257402200743821e-05, "loss": 0.0445, "step": 94 }, { "epoch": 1.468503937007874, "grad_norm": 0.0154279675334692, "learning_rate": 3.0679567197461134e-05, "loss": 0.0433, "step": 95 }, { "epoch": 1.484251968503937, "grad_norm": 0.013604864478111267, "learning_rate": 2.8831839169543996e-05, "loss": 0.0408, "step": 96 }, { "epoch": 1.484251968503937, "eval_loss": 0.047206979244947433, "eval_runtime": 64.4089, "eval_samples_per_second": 7.825, "eval_steps_per_second": 0.978, "step": 96 }, { "epoch": 1.5, "grad_norm": 0.014560838229954243, "learning_rate": 2.7032083420597e-05, "loss": 0.0385, "step": 97 }, { "epoch": 1.515748031496063, "grad_norm": 0.01328711025416851, "learning_rate": 2.528151311088537e-05, "loss": 0.0397, "step": 98 }, { "epoch": 1.531496062992126, "grad_norm": 0.016683636233210564, "learning_rate": 2.3581308246275103e-05, "loss": 0.0398, "step": 99 }, { "epoch": 1.547244094488189, "grad_norm": 0.012160832062363625, "learning_rate": 2.1932614882827197e-05, "loss": 0.0328, "step": 100 }, { "epoch": 1.5629921259842519, "grad_norm": 0.013753566890954971, "learning_rate": 2.03365443542764e-05, "loss": 0.0392, "step": 101 }, { "epoch": 1.5787401574803148, "grad_norm": 0.013317620381712914, "learning_rate": 1.879417252291502e-05, "loss": 0.0346, "step": 102 }, { "epoch": 1.594488188976378, "grad_norm": 0.018083734437823296, "learning_rate": 1.730653905438714e-05, "loss": 0.0482, "step": 103 }, { "epoch": 1.610236220472441, "grad_norm": 0.015288034453988075, "learning_rate": 1.587464671688187e-05, "loss": 0.0416, "step": 104 }, { "epoch": 1.625984251968504, "grad_norm": 0.01392639335244894, "learning_rate": 1.4499460705197998e-05, "loss": 0.0355, "step": 105 }, { "epoch": 1.641732283464567, "grad_norm": 0.014464518055319786, "learning_rate": 1.3181907990135622e-05, "loss": 0.0378, "step": 106 }, { "epoch": 1.65748031496063, "grad_norm": 0.014780817553400993, "learning_rate": 1.1922876693653585e-05, "loss": 0.0375, "step": 107 }, { "epoch": 1.673228346456693, "grad_norm": 0.014019722118973732, "learning_rate": 1.0723215490213634e-05, "loss": 0.0371, "step": 108 }, { "epoch": 1.688976377952756, "grad_norm": 0.013653130270540714, "learning_rate": 9.583733034714981e-06, "loss": 0.0341, "step": 109 }, { "epoch": 1.704724409448819, "grad_norm": 0.014543344266712666, "learning_rate": 8.505197417404687e-06, "loss": 0.0363, "step": 110 }, { "epoch": 1.720472440944882, "grad_norm": 0.01664627157151699, "learning_rate": 7.488335646131628e-06, "loss": 0.0397, "step": 111 }, { "epoch": 1.736220472440945, "grad_norm": 0.01318218931555748, "learning_rate": 6.533833156292679e-06, "loss": 0.0363, "step": 112 }, { "epoch": 1.736220472440945, "eval_loss": 0.04670024663209915, "eval_runtime": 64.4696, "eval_samples_per_second": 7.818, "eval_steps_per_second": 0.977, "step": 112 }, { "epoch": 1.7519685039370079, "grad_norm": 0.014432979747653008, "learning_rate": 5.6423333488018095e-06, "loss": 0.0384, "step": 113 }, { "epoch": 1.7677165354330708, "grad_norm": 0.013415982015430927, "learning_rate": 4.8144371563930476e-06, "loss": 0.0383, "step": 114 }, { "epoch": 1.7834645669291338, "grad_norm": 0.01275411993265152, "learning_rate": 4.050702638550275e-06, "loss": 0.0344, "step": 115 }, { "epoch": 1.7992125984251968, "grad_norm": 0.012640634551644325, "learning_rate": 3.3516446053363015e-06, "loss": 0.0325, "step": 116 }, { "epoch": 1.8149606299212597, "grad_norm": 0.014171491377055645, "learning_rate": 2.717734270375272e-06, "loss": 0.0375, "step": 117 }, { "epoch": 1.8307086614173227, "grad_norm": 0.014255956746637821, "learning_rate": 2.1493989332218468e-06, "loss": 0.0338, "step": 118 }, { "epoch": 1.8464566929133859, "grad_norm": 0.015443972311913967, "learning_rate": 1.6470216913317626e-06, "loss": 0.0395, "step": 119 }, { "epoch": 1.8622047244094488, "grad_norm": 0.015347709879279137, "learning_rate": 1.2109411818274852e-06, "loss": 0.0412, "step": 120 }, { "epoch": 1.8779527559055118, "grad_norm": 0.011879626661539078, "learning_rate": 8.41451353233369e-07, "loss": 0.0353, "step": 121 }, { "epoch": 1.8937007874015748, "grad_norm": 0.013861949555575848, "learning_rate": 5.388012673338661e-07, "loss": 0.0385, "step": 122 }, { "epoch": 1.909448818897638, "grad_norm": 0.013466684147715569, "learning_rate": 3.0319493128866396e-07, "loss": 0.0392, "step": 123 }, { "epoch": 1.925196850393701, "grad_norm": 0.014232831075787544, "learning_rate": 1.3479116011769767e-07, "loss": 0.0387, "step": 124 }, { "epoch": 1.9409448818897639, "grad_norm": 0.013951584696769714, "learning_rate": 3.370346964876036e-08, "loss": 0.039, "step": 125 }, { "epoch": 1.9566929133858268, "grad_norm": 0.014759634621441364, "learning_rate": 0.0, "loss": 0.04, "step": 126 } ], "logging_steps": 1, "max_steps": 126, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 63, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6.624893472111329e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }