{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.2503884572697003, "eval_steps": 141, "global_step": 141, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0017758046614872365, "grad_norm": 0.40501952171325684, "learning_rate": 4.000000000000001e-06, "loss": 1.1387, "step": 1 }, { "epoch": 0.0017758046614872365, "eval_loss": 1.4082584381103516, "eval_runtime": 167.7664, "eval_samples_per_second": 5.657, "eval_steps_per_second": 1.419, "step": 1 }, { "epoch": 0.003551609322974473, "grad_norm": 0.491682767868042, "learning_rate": 8.000000000000001e-06, "loss": 1.2151, "step": 2 }, { "epoch": 0.005327413984461709, "grad_norm": 0.49752455949783325, "learning_rate": 1.2e-05, "loss": 1.1941, "step": 3 }, { "epoch": 0.007103218645948946, "grad_norm": 0.5617953538894653, "learning_rate": 1.6000000000000003e-05, "loss": 1.2472, "step": 4 }, { "epoch": 0.008879023307436182, "grad_norm": 0.646000862121582, "learning_rate": 2e-05, "loss": 1.2767, "step": 5 }, { "epoch": 0.010654827968923418, "grad_norm": 0.6190630197525024, "learning_rate": 2.4e-05, "loss": 1.2839, "step": 6 }, { "epoch": 0.012430632630410655, "grad_norm": 0.6891798973083496, "learning_rate": 2.8000000000000003e-05, "loss": 1.2914, "step": 7 }, { "epoch": 0.014206437291897892, "grad_norm": 0.6742885708808899, "learning_rate": 3.2000000000000005e-05, "loss": 1.3001, "step": 8 }, { "epoch": 0.01598224195338513, "grad_norm": 0.693493664264679, "learning_rate": 3.6e-05, "loss": 1.2673, "step": 9 }, { "epoch": 0.017758046614872364, "grad_norm": 0.7951493859291077, "learning_rate": 4e-05, "loss": 1.3314, "step": 10 }, { "epoch": 0.0195338512763596, "grad_norm": 0.7866435050964355, "learning_rate": 4.4000000000000006e-05, "loss": 1.2703, "step": 11 }, { "epoch": 0.021309655937846835, "grad_norm": 0.7218112349510193, "learning_rate": 4.8e-05, "loss": 1.2542, "step": 12 }, { "epoch": 0.023085460599334074, "grad_norm": 0.6838662028312683, "learning_rate": 5.2000000000000004e-05, "loss": 1.2432, "step": 13 }, { "epoch": 0.02486126526082131, "grad_norm": 0.6592800617218018, "learning_rate": 5.6000000000000006e-05, "loss": 1.2374, "step": 14 }, { "epoch": 0.026637069922308545, "grad_norm": 0.513134241104126, "learning_rate": 6e-05, "loss": 1.246, "step": 15 }, { "epoch": 0.028412874583795784, "grad_norm": 0.5785119533538818, "learning_rate": 6.400000000000001e-05, "loss": 1.169, "step": 16 }, { "epoch": 0.03018867924528302, "grad_norm": 0.6144536733627319, "learning_rate": 6.800000000000001e-05, "loss": 1.1532, "step": 17 }, { "epoch": 0.03196448390677026, "grad_norm": 0.674633800983429, "learning_rate": 7.2e-05, "loss": 1.1175, "step": 18 }, { "epoch": 0.03374028856825749, "grad_norm": 0.5997682809829712, "learning_rate": 7.6e-05, "loss": 1.092, "step": 19 }, { "epoch": 0.03551609322974473, "grad_norm": 0.5651845335960388, "learning_rate": 8e-05, "loss": 1.0543, "step": 20 }, { "epoch": 0.03729189789123197, "grad_norm": 0.562713623046875, "learning_rate": 8.4e-05, "loss": 1.0377, "step": 21 }, { "epoch": 0.0390677025527192, "grad_norm": 0.5826591849327087, "learning_rate": 8.800000000000001e-05, "loss": 1.0178, "step": 22 }, { "epoch": 0.04084350721420644, "grad_norm": 0.5972415208816528, "learning_rate": 9.200000000000001e-05, "loss": 0.9916, "step": 23 }, { "epoch": 0.04261931187569367, "grad_norm": 0.6266159415245056, "learning_rate": 9.6e-05, "loss": 1.0026, "step": 24 }, { "epoch": 0.04439511653718091, "grad_norm": 0.7757481932640076, "learning_rate": 0.0001, "loss": 1.0245, "step": 25 }, { "epoch": 0.04617092119866815, "grad_norm": 0.6832363605499268, "learning_rate": 0.00010400000000000001, "loss": 0.9408, "step": 26 }, { "epoch": 0.04794672586015538, "grad_norm": 1.2983894348144531, "learning_rate": 0.00010800000000000001, "loss": 0.9228, "step": 27 }, { "epoch": 0.04972253052164262, "grad_norm": 0.9382829666137695, "learning_rate": 0.00011200000000000001, "loss": 0.9605, "step": 28 }, { "epoch": 0.05149833518312986, "grad_norm": 0.5051376819610596, "learning_rate": 0.000116, "loss": 0.9769, "step": 29 }, { "epoch": 0.05327413984461709, "grad_norm": 0.40853050351142883, "learning_rate": 0.00012, "loss": 0.8912, "step": 30 }, { "epoch": 0.05504994450610433, "grad_norm": 0.4261438846588135, "learning_rate": 0.000124, "loss": 0.9438, "step": 31 }, { "epoch": 0.05682574916759157, "grad_norm": 0.44900333881378174, "learning_rate": 0.00012800000000000002, "loss": 0.9589, "step": 32 }, { "epoch": 0.0586015538290788, "grad_norm": 0.4262010157108307, "learning_rate": 0.000132, "loss": 0.8775, "step": 33 }, { "epoch": 0.06037735849056604, "grad_norm": 0.40672022104263306, "learning_rate": 0.00013600000000000003, "loss": 0.8956, "step": 34 }, { "epoch": 0.06215316315205328, "grad_norm": 0.39336153864860535, "learning_rate": 0.00014, "loss": 0.866, "step": 35 }, { "epoch": 0.06392896781354052, "grad_norm": 0.40699368715286255, "learning_rate": 0.000144, "loss": 0.8612, "step": 36 }, { "epoch": 0.06570477247502775, "grad_norm": 0.438643217086792, "learning_rate": 0.000148, "loss": 0.922, "step": 37 }, { "epoch": 0.06748057713651498, "grad_norm": 0.45053544640541077, "learning_rate": 0.000152, "loss": 0.8681, "step": 38 }, { "epoch": 0.06925638179800223, "grad_norm": 0.4289852976799011, "learning_rate": 0.00015600000000000002, "loss": 0.9071, "step": 39 }, { "epoch": 0.07103218645948946, "grad_norm": 0.4101032316684723, "learning_rate": 0.00016, "loss": 0.8692, "step": 40 }, { "epoch": 0.07280799112097669, "grad_norm": 0.418319433927536, "learning_rate": 0.000164, "loss": 0.8654, "step": 41 }, { "epoch": 0.07458379578246394, "grad_norm": 0.41637811064720154, "learning_rate": 0.000168, "loss": 0.8348, "step": 42 }, { "epoch": 0.07635960044395117, "grad_norm": 0.40830302238464355, "learning_rate": 0.000172, "loss": 0.8928, "step": 43 }, { "epoch": 0.0781354051054384, "grad_norm": 0.4163912236690521, "learning_rate": 0.00017600000000000002, "loss": 0.8793, "step": 44 }, { "epoch": 0.07991120976692564, "grad_norm": 0.4240954518318176, "learning_rate": 0.00018, "loss": 0.9029, "step": 45 }, { "epoch": 0.08168701442841288, "grad_norm": 0.48420408368110657, "learning_rate": 0.00018400000000000003, "loss": 0.8632, "step": 46 }, { "epoch": 0.08346281908990011, "grad_norm": 0.5267483592033386, "learning_rate": 0.000188, "loss": 0.8575, "step": 47 }, { "epoch": 0.08523862375138734, "grad_norm": 0.4947332441806793, "learning_rate": 0.000192, "loss": 0.9051, "step": 48 }, { "epoch": 0.08701442841287459, "grad_norm": 0.5025691986083984, "learning_rate": 0.000196, "loss": 0.9145, "step": 49 }, { "epoch": 0.08879023307436182, "grad_norm": 0.5430313944816589, "learning_rate": 0.0002, "loss": 0.8954, "step": 50 }, { "epoch": 0.09056603773584905, "grad_norm": 0.45721662044525146, "learning_rate": 0.00019999812486015523, "loss": 0.9655, "step": 51 }, { "epoch": 0.0923418423973363, "grad_norm": 0.4364672899246216, "learning_rate": 0.00019999249951094388, "loss": 0.9318, "step": 52 }, { "epoch": 0.09411764705882353, "grad_norm": 0.38933759927749634, "learning_rate": 0.00019998312416333227, "loss": 0.8963, "step": 53 }, { "epoch": 0.09589345172031076, "grad_norm": 0.35572728514671326, "learning_rate": 0.0001999699991689222, "loss": 0.9073, "step": 54 }, { "epoch": 0.097669256381798, "grad_norm": 0.3042948544025421, "learning_rate": 0.00019995312501993765, "loss": 0.8751, "step": 55 }, { "epoch": 0.09944506104328524, "grad_norm": 0.32266151905059814, "learning_rate": 0.00019993250234920636, "loss": 0.8493, "step": 56 }, { "epoch": 0.10122086570477247, "grad_norm": 0.31894031167030334, "learning_rate": 0.00019990813193013625, "loss": 0.8512, "step": 57 }, { "epoch": 0.10299667036625972, "grad_norm": 0.33073991537094116, "learning_rate": 0.0001998800146766861, "loss": 0.8424, "step": 58 }, { "epoch": 0.10477247502774695, "grad_norm": 0.32064828276634216, "learning_rate": 0.00019984815164333163, "loss": 0.8698, "step": 59 }, { "epoch": 0.10654827968923418, "grad_norm": 0.3364376425743103, "learning_rate": 0.00019981254402502566, "loss": 0.8525, "step": 60 }, { "epoch": 0.10832408435072143, "grad_norm": 0.31403639912605286, "learning_rate": 0.0001997731931571535, "loss": 0.8309, "step": 61 }, { "epoch": 0.11009988901220866, "grad_norm": 0.3375100791454315, "learning_rate": 0.00019973010051548275, "loss": 0.8573, "step": 62 }, { "epoch": 0.11187569367369589, "grad_norm": 0.3584939241409302, "learning_rate": 0.00019968326771610797, "loss": 0.8479, "step": 63 }, { "epoch": 0.11365149833518313, "grad_norm": 0.35480472445487976, "learning_rate": 0.00019963269651539017, "loss": 0.845, "step": 64 }, { "epoch": 0.11542730299667037, "grad_norm": 0.33250972628593445, "learning_rate": 0.00019957838880989078, "loss": 0.8438, "step": 65 }, { "epoch": 0.1172031076581576, "grad_norm": 0.39302438497543335, "learning_rate": 0.00019952034663630062, "loss": 0.8391, "step": 66 }, { "epoch": 0.11897891231964484, "grad_norm": 0.3517158031463623, "learning_rate": 0.00019945857217136363, "loss": 0.7966, "step": 67 }, { "epoch": 0.12075471698113208, "grad_norm": 0.38860467076301575, "learning_rate": 0.00019939306773179497, "loss": 0.8279, "step": 68 }, { "epoch": 0.12253052164261931, "grad_norm": 0.3762984573841095, "learning_rate": 0.00019932383577419432, "loss": 0.7848, "step": 69 }, { "epoch": 0.12430632630410655, "grad_norm": 0.4535103440284729, "learning_rate": 0.00019925087889495374, "loss": 0.8, "step": 70 }, { "epoch": 0.12608213096559379, "grad_norm": 0.4869844317436218, "learning_rate": 0.00019917419983016025, "loss": 0.8442, "step": 71 }, { "epoch": 0.12785793562708103, "grad_norm": 0.4379689395427704, "learning_rate": 0.00019909380145549324, "loss": 0.8353, "step": 72 }, { "epoch": 0.12963374028856825, "grad_norm": 0.39510270953178406, "learning_rate": 0.00019900968678611666, "loss": 0.8538, "step": 73 }, { "epoch": 0.1314095449500555, "grad_norm": 0.4764181971549988, "learning_rate": 0.00019892185897656578, "loss": 0.8509, "step": 74 }, { "epoch": 0.13318534961154274, "grad_norm": 0.5591267347335815, "learning_rate": 0.00019883032132062925, "loss": 0.8661, "step": 75 }, { "epoch": 0.13496115427302996, "grad_norm": 0.41077056527137756, "learning_rate": 0.00019873507725122504, "loss": 0.9418, "step": 76 }, { "epoch": 0.1367369589345172, "grad_norm": 0.393622487783432, "learning_rate": 0.00019863613034027224, "loss": 0.926, "step": 77 }, { "epoch": 0.13851276359600445, "grad_norm": 0.36414071917533875, "learning_rate": 0.00019853348429855672, "loss": 0.8649, "step": 78 }, { "epoch": 0.14028856825749167, "grad_norm": 0.3100601136684418, "learning_rate": 0.00019842714297559213, "loss": 0.9114, "step": 79 }, { "epoch": 0.14206437291897892, "grad_norm": 0.29151105880737305, "learning_rate": 0.0001983171103594755, "loss": 0.8681, "step": 80 }, { "epoch": 0.14384017758046616, "grad_norm": 0.28398221731185913, "learning_rate": 0.0001982033905767377, "loss": 0.8515, "step": 81 }, { "epoch": 0.14561598224195338, "grad_norm": 0.2883840799331665, "learning_rate": 0.00019808598789218865, "loss": 0.8569, "step": 82 }, { "epoch": 0.14739178690344062, "grad_norm": 0.29812031984329224, "learning_rate": 0.0001979649067087574, "loss": 0.8529, "step": 83 }, { "epoch": 0.14916759156492787, "grad_norm": 0.3074108958244324, "learning_rate": 0.00019784015156732693, "loss": 0.8771, "step": 84 }, { "epoch": 0.1509433962264151, "grad_norm": 0.3601110279560089, "learning_rate": 0.000197711727146564, "loss": 0.8609, "step": 85 }, { "epoch": 0.15271920088790233, "grad_norm": 0.3126521110534668, "learning_rate": 0.00019757963826274357, "loss": 0.8162, "step": 86 }, { "epoch": 0.15449500554938958, "grad_norm": 0.3152073323726654, "learning_rate": 0.00019744388986956822, "loss": 0.7661, "step": 87 }, { "epoch": 0.1562708102108768, "grad_norm": 0.33570149540901184, "learning_rate": 0.00019730448705798239, "loss": 0.8179, "step": 88 }, { "epoch": 0.15804661487236404, "grad_norm": 0.33989331126213074, "learning_rate": 0.0001971614350559814, "loss": 0.8288, "step": 89 }, { "epoch": 0.1598224195338513, "grad_norm": 0.3292713761329651, "learning_rate": 0.0001970147392284154, "loss": 0.8415, "step": 90 }, { "epoch": 0.1615982241953385, "grad_norm": 0.3394547700881958, "learning_rate": 0.00019686440507678824, "loss": 0.8232, "step": 91 }, { "epoch": 0.16337402885682575, "grad_norm": 0.3370296061038971, "learning_rate": 0.0001967104382390511, "loss": 0.7771, "step": 92 }, { "epoch": 0.16514983351831297, "grad_norm": 0.3798193633556366, "learning_rate": 0.00019655284448939094, "loss": 0.789, "step": 93 }, { "epoch": 0.16692563817980022, "grad_norm": 0.3790013790130615, "learning_rate": 0.00019639162973801426, "loss": 0.8153, "step": 94 }, { "epoch": 0.16870144284128746, "grad_norm": 0.42274704575538635, "learning_rate": 0.00019622680003092503, "loss": 0.8012, "step": 95 }, { "epoch": 0.17047724750277468, "grad_norm": 0.4776620864868164, "learning_rate": 0.0001960583615496984, "loss": 0.8132, "step": 96 }, { "epoch": 0.17225305216426193, "grad_norm": 0.4170360565185547, "learning_rate": 0.00019588632061124837, "loss": 0.8139, "step": 97 }, { "epoch": 0.17402885682574917, "grad_norm": 0.47097012400627136, "learning_rate": 0.00019571068366759143, "loss": 0.7711, "step": 98 }, { "epoch": 0.1758046614872364, "grad_norm": 0.8176291584968567, "learning_rate": 0.00019553145730560415, "loss": 0.7906, "step": 99 }, { "epoch": 0.17758046614872364, "grad_norm": 0.7204902172088623, "learning_rate": 0.0001953486482467764, "loss": 0.9088, "step": 100 }, { "epoch": 0.17935627081021088, "grad_norm": 0.3952767252922058, "learning_rate": 0.0001951622633469592, "loss": 0.9362, "step": 101 }, { "epoch": 0.1811320754716981, "grad_norm": 0.3742019534111023, "learning_rate": 0.00019497230959610756, "loss": 0.933, "step": 102 }, { "epoch": 0.18290788013318535, "grad_norm": 0.3385975658893585, "learning_rate": 0.00019477879411801844, "loss": 0.9028, "step": 103 }, { "epoch": 0.1846836847946726, "grad_norm": 0.2950561046600342, "learning_rate": 0.00019458172417006347, "loss": 0.8245, "step": 104 }, { "epoch": 0.1864594894561598, "grad_norm": 0.30859696865081787, "learning_rate": 0.00019438110714291694, "loss": 0.8771, "step": 105 }, { "epoch": 0.18823529411764706, "grad_norm": 0.3490929901599884, "learning_rate": 0.00019417695056027844, "loss": 0.8565, "step": 106 }, { "epoch": 0.1900110987791343, "grad_norm": 0.31133994460105896, "learning_rate": 0.00019396926207859084, "loss": 0.8734, "step": 107 }, { "epoch": 0.19178690344062152, "grad_norm": 0.2884789705276489, "learning_rate": 0.00019375804948675306, "loss": 0.8645, "step": 108 }, { "epoch": 0.19356270810210877, "grad_norm": 0.2969193160533905, "learning_rate": 0.0001935433207058281, "loss": 0.8751, "step": 109 }, { "epoch": 0.195338512763596, "grad_norm": 0.41810932755470276, "learning_rate": 0.0001933250837887457, "loss": 0.8037, "step": 110 }, { "epoch": 0.19711431742508323, "grad_norm": 0.3271716833114624, "learning_rate": 0.00019310334692000075, "loss": 0.7814, "step": 111 }, { "epoch": 0.19889012208657048, "grad_norm": 0.4146140515804291, "learning_rate": 0.00019287811841534595, "loss": 0.8425, "step": 112 }, { "epoch": 0.20066592674805772, "grad_norm": 0.3369704484939575, "learning_rate": 0.00019264940672148018, "loss": 0.8301, "step": 113 }, { "epoch": 0.20244173140954494, "grad_norm": 0.32731175422668457, "learning_rate": 0.00019241722041573166, "loss": 0.7964, "step": 114 }, { "epoch": 0.20421753607103219, "grad_norm": 0.3840983510017395, "learning_rate": 0.0001921815682057362, "loss": 0.7864, "step": 115 }, { "epoch": 0.20599334073251943, "grad_norm": 0.37049344182014465, "learning_rate": 0.0001919424589291108, "loss": 0.8086, "step": 116 }, { "epoch": 0.20776914539400665, "grad_norm": 0.380991131067276, "learning_rate": 0.0001916999015531221, "loss": 0.8039, "step": 117 }, { "epoch": 0.2095449500554939, "grad_norm": 0.3884637653827667, "learning_rate": 0.00019145390517435012, "loss": 0.7693, "step": 118 }, { "epoch": 0.21132075471698114, "grad_norm": 0.39195218682289124, "learning_rate": 0.00019120447901834706, "loss": 0.8139, "step": 119 }, { "epoch": 0.21309655937846836, "grad_norm": 0.41479626297950745, "learning_rate": 0.00019095163243929142, "loss": 0.7714, "step": 120 }, { "epoch": 0.2148723640399556, "grad_norm": 0.3856278657913208, "learning_rate": 0.0001906953749196371, "loss": 0.8198, "step": 121 }, { "epoch": 0.21664816870144285, "grad_norm": 0.3706349730491638, "learning_rate": 0.00019043571606975777, "loss": 0.7106, "step": 122 }, { "epoch": 0.21842397336293007, "grad_norm": 0.5981292724609375, "learning_rate": 0.00019017266562758659, "loss": 0.8005, "step": 123 }, { "epoch": 0.22019977802441731, "grad_norm": 0.4480712115764618, "learning_rate": 0.00018990623345825083, "loss": 0.8167, "step": 124 }, { "epoch": 0.22197558268590456, "grad_norm": 0.9817702770233154, "learning_rate": 0.00018963642955370201, "loss": 0.8555, "step": 125 }, { "epoch": 0.22375138734739178, "grad_norm": 0.4110267460346222, "learning_rate": 0.00018936326403234125, "loss": 0.9069, "step": 126 }, { "epoch": 0.22552719200887902, "grad_norm": 0.36051687598228455, "learning_rate": 0.00018908674713863952, "loss": 0.8783, "step": 127 }, { "epoch": 0.22730299667036627, "grad_norm": 0.34053486585617065, "learning_rate": 0.00018880688924275378, "loss": 0.8563, "step": 128 }, { "epoch": 0.2290788013318535, "grad_norm": 0.30984926223754883, "learning_rate": 0.0001885237008401378, "loss": 0.8434, "step": 129 }, { "epoch": 0.23085460599334073, "grad_norm": 0.3125753700733185, "learning_rate": 0.0001882371925511488, "loss": 0.831, "step": 130 }, { "epoch": 0.23263041065482798, "grad_norm": 0.3113706409931183, "learning_rate": 0.0001879473751206489, "loss": 0.8659, "step": 131 }, { "epoch": 0.2344062153163152, "grad_norm": 0.2837103605270386, "learning_rate": 0.00018765425941760238, "loss": 0.812, "step": 132 }, { "epoch": 0.23618201997780244, "grad_norm": 0.2814521789550781, "learning_rate": 0.00018735785643466784, "loss": 0.8116, "step": 133 }, { "epoch": 0.2379578246392897, "grad_norm": 0.2922544777393341, "learning_rate": 0.00018705817728778624, "loss": 0.8305, "step": 134 }, { "epoch": 0.2397336293007769, "grad_norm": 0.3140820860862732, "learning_rate": 0.00018675523321576371, "loss": 0.7882, "step": 135 }, { "epoch": 0.24150943396226415, "grad_norm": 0.29498058557510376, "learning_rate": 0.00018644903557985025, "loss": 0.8226, "step": 136 }, { "epoch": 0.2432852386237514, "grad_norm": 0.3298538625240326, "learning_rate": 0.00018613959586331362, "loss": 0.7867, "step": 137 }, { "epoch": 0.24506104328523862, "grad_norm": 0.3474237024784088, "learning_rate": 0.00018582692567100867, "loss": 0.7876, "step": 138 }, { "epoch": 0.24683684794672586, "grad_norm": 0.3735051155090332, "learning_rate": 0.00018551103672894206, "loss": 0.818, "step": 139 }, { "epoch": 0.2486126526082131, "grad_norm": 0.3931002914905548, "learning_rate": 0.00018519194088383273, "loss": 0.7896, "step": 140 }, { "epoch": 0.2503884572697003, "grad_norm": 0.36460694670677185, "learning_rate": 0.00018486965010266725, "loss": 0.8105, "step": 141 }, { "epoch": 0.2503884572697003, "eval_loss": 0.8086357712745667, "eval_runtime": 159.8215, "eval_samples_per_second": 5.938, "eval_steps_per_second": 1.489, "step": 141 } ], "logging_steps": 1, "max_steps": 563, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 141, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.8570783162472858e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }