{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9997780244173141, "eval_steps": 141, "global_step": 563, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0017758046614872365, "grad_norm": 0.40501952171325684, "learning_rate": 4.000000000000001e-06, "loss": 1.1387, "step": 1 }, { "epoch": 0.0017758046614872365, "eval_loss": 1.4082584381103516, "eval_runtime": 167.7664, "eval_samples_per_second": 5.657, "eval_steps_per_second": 1.419, "step": 1 }, { "epoch": 0.003551609322974473, "grad_norm": 0.491682767868042, "learning_rate": 8.000000000000001e-06, "loss": 1.2151, "step": 2 }, { "epoch": 0.005327413984461709, "grad_norm": 0.49752455949783325, "learning_rate": 1.2e-05, "loss": 1.1941, "step": 3 }, { "epoch": 0.007103218645948946, "grad_norm": 0.5617953538894653, "learning_rate": 1.6000000000000003e-05, "loss": 1.2472, "step": 4 }, { "epoch": 0.008879023307436182, "grad_norm": 0.646000862121582, "learning_rate": 2e-05, "loss": 1.2767, "step": 5 }, { "epoch": 0.010654827968923418, "grad_norm": 0.6190630197525024, "learning_rate": 2.4e-05, "loss": 1.2839, "step": 6 }, { "epoch": 0.012430632630410655, "grad_norm": 0.6891798973083496, "learning_rate": 2.8000000000000003e-05, "loss": 1.2914, "step": 7 }, { "epoch": 0.014206437291897892, "grad_norm": 0.6742885708808899, "learning_rate": 3.2000000000000005e-05, "loss": 1.3001, "step": 8 }, { "epoch": 0.01598224195338513, "grad_norm": 0.693493664264679, "learning_rate": 3.6e-05, "loss": 1.2673, "step": 9 }, { "epoch": 0.017758046614872364, "grad_norm": 0.7951493859291077, "learning_rate": 4e-05, "loss": 1.3314, "step": 10 }, { "epoch": 0.0195338512763596, "grad_norm": 0.7866435050964355, "learning_rate": 4.4000000000000006e-05, "loss": 1.2703, "step": 11 }, { "epoch": 0.021309655937846835, "grad_norm": 0.7218112349510193, "learning_rate": 4.8e-05, "loss": 1.2542, "step": 12 }, { "epoch": 0.023085460599334074, "grad_norm": 0.6838662028312683, "learning_rate": 5.2000000000000004e-05, "loss": 1.2432, "step": 13 }, { "epoch": 0.02486126526082131, "grad_norm": 0.6592800617218018, "learning_rate": 5.6000000000000006e-05, "loss": 1.2374, "step": 14 }, { "epoch": 0.026637069922308545, "grad_norm": 0.513134241104126, "learning_rate": 6e-05, "loss": 1.246, "step": 15 }, { "epoch": 0.028412874583795784, "grad_norm": 0.5785119533538818, "learning_rate": 6.400000000000001e-05, "loss": 1.169, "step": 16 }, { "epoch": 0.03018867924528302, "grad_norm": 0.6144536733627319, "learning_rate": 6.800000000000001e-05, "loss": 1.1532, "step": 17 }, { "epoch": 0.03196448390677026, "grad_norm": 0.674633800983429, "learning_rate": 7.2e-05, "loss": 1.1175, "step": 18 }, { "epoch": 0.03374028856825749, "grad_norm": 0.5997682809829712, "learning_rate": 7.6e-05, "loss": 1.092, "step": 19 }, { "epoch": 0.03551609322974473, "grad_norm": 0.5651845335960388, "learning_rate": 8e-05, "loss": 1.0543, "step": 20 }, { "epoch": 0.03729189789123197, "grad_norm": 0.562713623046875, "learning_rate": 8.4e-05, "loss": 1.0377, "step": 21 }, { "epoch": 0.0390677025527192, "grad_norm": 0.5826591849327087, "learning_rate": 8.800000000000001e-05, "loss": 1.0178, "step": 22 }, { "epoch": 0.04084350721420644, "grad_norm": 0.5972415208816528, "learning_rate": 9.200000000000001e-05, "loss": 0.9916, "step": 23 }, { "epoch": 0.04261931187569367, "grad_norm": 0.6266159415245056, "learning_rate": 9.6e-05, "loss": 1.0026, "step": 24 }, { "epoch": 0.04439511653718091, "grad_norm": 0.7757481932640076, "learning_rate": 0.0001, "loss": 1.0245, "step": 25 }, { "epoch": 0.04617092119866815, "grad_norm": 0.6832363605499268, "learning_rate": 0.00010400000000000001, "loss": 0.9408, "step": 26 }, { "epoch": 0.04794672586015538, "grad_norm": 1.2983894348144531, "learning_rate": 0.00010800000000000001, "loss": 0.9228, "step": 27 }, { "epoch": 0.04972253052164262, "grad_norm": 0.9382829666137695, "learning_rate": 0.00011200000000000001, "loss": 0.9605, "step": 28 }, { "epoch": 0.05149833518312986, "grad_norm": 0.5051376819610596, "learning_rate": 0.000116, "loss": 0.9769, "step": 29 }, { "epoch": 0.05327413984461709, "grad_norm": 0.40853050351142883, "learning_rate": 0.00012, "loss": 0.8912, "step": 30 }, { "epoch": 0.05504994450610433, "grad_norm": 0.4261438846588135, "learning_rate": 0.000124, "loss": 0.9438, "step": 31 }, { "epoch": 0.05682574916759157, "grad_norm": 0.44900333881378174, "learning_rate": 0.00012800000000000002, "loss": 0.9589, "step": 32 }, { "epoch": 0.0586015538290788, "grad_norm": 0.4262010157108307, "learning_rate": 0.000132, "loss": 0.8775, "step": 33 }, { "epoch": 0.06037735849056604, "grad_norm": 0.40672022104263306, "learning_rate": 0.00013600000000000003, "loss": 0.8956, "step": 34 }, { "epoch": 0.06215316315205328, "grad_norm": 0.39336153864860535, "learning_rate": 0.00014, "loss": 0.866, "step": 35 }, { "epoch": 0.06392896781354052, "grad_norm": 0.40699368715286255, "learning_rate": 0.000144, "loss": 0.8612, "step": 36 }, { "epoch": 0.06570477247502775, "grad_norm": 0.438643217086792, "learning_rate": 0.000148, "loss": 0.922, "step": 37 }, { "epoch": 0.06748057713651498, "grad_norm": 0.45053544640541077, "learning_rate": 0.000152, "loss": 0.8681, "step": 38 }, { "epoch": 0.06925638179800223, "grad_norm": 0.4289852976799011, "learning_rate": 0.00015600000000000002, "loss": 0.9071, "step": 39 }, { "epoch": 0.07103218645948946, "grad_norm": 0.4101032316684723, "learning_rate": 0.00016, "loss": 0.8692, "step": 40 }, { "epoch": 0.07280799112097669, "grad_norm": 0.418319433927536, "learning_rate": 0.000164, "loss": 0.8654, "step": 41 }, { "epoch": 0.07458379578246394, "grad_norm": 0.41637811064720154, "learning_rate": 0.000168, "loss": 0.8348, "step": 42 }, { "epoch": 0.07635960044395117, "grad_norm": 0.40830302238464355, "learning_rate": 0.000172, "loss": 0.8928, "step": 43 }, { "epoch": 0.0781354051054384, "grad_norm": 0.4163912236690521, "learning_rate": 0.00017600000000000002, "loss": 0.8793, "step": 44 }, { "epoch": 0.07991120976692564, "grad_norm": 0.4240954518318176, "learning_rate": 0.00018, "loss": 0.9029, "step": 45 }, { "epoch": 0.08168701442841288, "grad_norm": 0.48420408368110657, "learning_rate": 0.00018400000000000003, "loss": 0.8632, "step": 46 }, { "epoch": 0.08346281908990011, "grad_norm": 0.5267483592033386, "learning_rate": 0.000188, "loss": 0.8575, "step": 47 }, { "epoch": 0.08523862375138734, "grad_norm": 0.4947332441806793, "learning_rate": 0.000192, "loss": 0.9051, "step": 48 }, { "epoch": 0.08701442841287459, "grad_norm": 0.5025691986083984, "learning_rate": 0.000196, "loss": 0.9145, "step": 49 }, { "epoch": 0.08879023307436182, "grad_norm": 0.5430313944816589, "learning_rate": 0.0002, "loss": 0.8954, "step": 50 }, { "epoch": 0.09056603773584905, "grad_norm": 0.45721662044525146, "learning_rate": 0.00019999812486015523, "loss": 0.9655, "step": 51 }, { "epoch": 0.0923418423973363, "grad_norm": 0.4364672899246216, "learning_rate": 0.00019999249951094388, "loss": 0.9318, "step": 52 }, { "epoch": 0.09411764705882353, "grad_norm": 0.38933759927749634, "learning_rate": 0.00019998312416333227, "loss": 0.8963, "step": 53 }, { "epoch": 0.09589345172031076, "grad_norm": 0.35572728514671326, "learning_rate": 0.0001999699991689222, "loss": 0.9073, "step": 54 }, { "epoch": 0.097669256381798, "grad_norm": 0.3042948544025421, "learning_rate": 0.00019995312501993765, "loss": 0.8751, "step": 55 }, { "epoch": 0.09944506104328524, "grad_norm": 0.32266151905059814, "learning_rate": 0.00019993250234920636, "loss": 0.8493, "step": 56 }, { "epoch": 0.10122086570477247, "grad_norm": 0.31894031167030334, "learning_rate": 0.00019990813193013625, "loss": 0.8512, "step": 57 }, { "epoch": 0.10299667036625972, "grad_norm": 0.33073991537094116, "learning_rate": 0.0001998800146766861, "loss": 0.8424, "step": 58 }, { "epoch": 0.10477247502774695, "grad_norm": 0.32064828276634216, "learning_rate": 0.00019984815164333163, "loss": 0.8698, "step": 59 }, { "epoch": 0.10654827968923418, "grad_norm": 0.3364376425743103, "learning_rate": 0.00019981254402502566, "loss": 0.8525, "step": 60 }, { "epoch": 0.10832408435072143, "grad_norm": 0.31403639912605286, "learning_rate": 0.0001997731931571535, "loss": 0.8309, "step": 61 }, { "epoch": 0.11009988901220866, "grad_norm": 0.3375100791454315, "learning_rate": 0.00019973010051548275, "loss": 0.8573, "step": 62 }, { "epoch": 0.11187569367369589, "grad_norm": 0.3584939241409302, "learning_rate": 0.00019968326771610797, "loss": 0.8479, "step": 63 }, { "epoch": 0.11365149833518313, "grad_norm": 0.35480472445487976, "learning_rate": 0.00019963269651539017, "loss": 0.845, "step": 64 }, { "epoch": 0.11542730299667037, "grad_norm": 0.33250972628593445, "learning_rate": 0.00019957838880989078, "loss": 0.8438, "step": 65 }, { "epoch": 0.1172031076581576, "grad_norm": 0.39302438497543335, "learning_rate": 0.00019952034663630062, "loss": 0.8391, "step": 66 }, { "epoch": 0.11897891231964484, "grad_norm": 0.3517158031463623, "learning_rate": 0.00019945857217136363, "loss": 0.7966, "step": 67 }, { "epoch": 0.12075471698113208, "grad_norm": 0.38860467076301575, "learning_rate": 0.00019939306773179497, "loss": 0.8279, "step": 68 }, { "epoch": 0.12253052164261931, "grad_norm": 0.3762984573841095, "learning_rate": 0.00019932383577419432, "loss": 0.7848, "step": 69 }, { "epoch": 0.12430632630410655, "grad_norm": 0.4535103440284729, "learning_rate": 0.00019925087889495374, "loss": 0.8, "step": 70 }, { "epoch": 0.12608213096559379, "grad_norm": 0.4869844317436218, "learning_rate": 0.00019917419983016025, "loss": 0.8442, "step": 71 }, { "epoch": 0.12785793562708103, "grad_norm": 0.4379689395427704, "learning_rate": 0.00019909380145549324, "loss": 0.8353, "step": 72 }, { "epoch": 0.12963374028856825, "grad_norm": 0.39510270953178406, "learning_rate": 0.00019900968678611666, "loss": 0.8538, "step": 73 }, { "epoch": 0.1314095449500555, "grad_norm": 0.4764181971549988, "learning_rate": 0.00019892185897656578, "loss": 0.8509, "step": 74 }, { "epoch": 0.13318534961154274, "grad_norm": 0.5591267347335815, "learning_rate": 0.00019883032132062925, "loss": 0.8661, "step": 75 }, { "epoch": 0.13496115427302996, "grad_norm": 0.41077056527137756, "learning_rate": 0.00019873507725122504, "loss": 0.9418, "step": 76 }, { "epoch": 0.1367369589345172, "grad_norm": 0.393622487783432, "learning_rate": 0.00019863613034027224, "loss": 0.926, "step": 77 }, { "epoch": 0.13851276359600445, "grad_norm": 0.36414071917533875, "learning_rate": 0.00019853348429855672, "loss": 0.8649, "step": 78 }, { "epoch": 0.14028856825749167, "grad_norm": 0.3100601136684418, "learning_rate": 0.00019842714297559213, "loss": 0.9114, "step": 79 }, { "epoch": 0.14206437291897892, "grad_norm": 0.29151105880737305, "learning_rate": 0.0001983171103594755, "loss": 0.8681, "step": 80 }, { "epoch": 0.14384017758046616, "grad_norm": 0.28398221731185913, "learning_rate": 0.0001982033905767377, "loss": 0.8515, "step": 81 }, { "epoch": 0.14561598224195338, "grad_norm": 0.2883840799331665, "learning_rate": 0.00019808598789218865, "loss": 0.8569, "step": 82 }, { "epoch": 0.14739178690344062, "grad_norm": 0.29812031984329224, "learning_rate": 0.0001979649067087574, "loss": 0.8529, "step": 83 }, { "epoch": 0.14916759156492787, "grad_norm": 0.3074108958244324, "learning_rate": 0.00019784015156732693, "loss": 0.8771, "step": 84 }, { "epoch": 0.1509433962264151, "grad_norm": 0.3601110279560089, "learning_rate": 0.000197711727146564, "loss": 0.8609, "step": 85 }, { "epoch": 0.15271920088790233, "grad_norm": 0.3126521110534668, "learning_rate": 0.00019757963826274357, "loss": 0.8162, "step": 86 }, { "epoch": 0.15449500554938958, "grad_norm": 0.3152073323726654, "learning_rate": 0.00019744388986956822, "loss": 0.7661, "step": 87 }, { "epoch": 0.1562708102108768, "grad_norm": 0.33570149540901184, "learning_rate": 0.00019730448705798239, "loss": 0.8179, "step": 88 }, { "epoch": 0.15804661487236404, "grad_norm": 0.33989331126213074, "learning_rate": 0.0001971614350559814, "loss": 0.8288, "step": 89 }, { "epoch": 0.1598224195338513, "grad_norm": 0.3292713761329651, "learning_rate": 0.0001970147392284154, "loss": 0.8415, "step": 90 }, { "epoch": 0.1615982241953385, "grad_norm": 0.3394547700881958, "learning_rate": 0.00019686440507678824, "loss": 0.8232, "step": 91 }, { "epoch": 0.16337402885682575, "grad_norm": 0.3370296061038971, "learning_rate": 0.0001967104382390511, "loss": 0.7771, "step": 92 }, { "epoch": 0.16514983351831297, "grad_norm": 0.3798193633556366, "learning_rate": 0.00019655284448939094, "loss": 0.789, "step": 93 }, { "epoch": 0.16692563817980022, "grad_norm": 0.3790013790130615, "learning_rate": 0.00019639162973801426, "loss": 0.8153, "step": 94 }, { "epoch": 0.16870144284128746, "grad_norm": 0.42274704575538635, "learning_rate": 0.00019622680003092503, "loss": 0.8012, "step": 95 }, { "epoch": 0.17047724750277468, "grad_norm": 0.4776620864868164, "learning_rate": 0.0001960583615496984, "loss": 0.8132, "step": 96 }, { "epoch": 0.17225305216426193, "grad_norm": 0.4170360565185547, "learning_rate": 0.00019588632061124837, "loss": 0.8139, "step": 97 }, { "epoch": 0.17402885682574917, "grad_norm": 0.47097012400627136, "learning_rate": 0.00019571068366759143, "loss": 0.7711, "step": 98 }, { "epoch": 0.1758046614872364, "grad_norm": 0.8176291584968567, "learning_rate": 0.00019553145730560415, "loss": 0.7906, "step": 99 }, { "epoch": 0.17758046614872364, "grad_norm": 0.7204902172088623, "learning_rate": 0.0001953486482467764, "loss": 0.9088, "step": 100 }, { "epoch": 0.17935627081021088, "grad_norm": 0.3952767252922058, "learning_rate": 0.0001951622633469592, "loss": 0.9362, "step": 101 }, { "epoch": 0.1811320754716981, "grad_norm": 0.3742019534111023, "learning_rate": 0.00019497230959610756, "loss": 0.933, "step": 102 }, { "epoch": 0.18290788013318535, "grad_norm": 0.3385975658893585, "learning_rate": 0.00019477879411801844, "loss": 0.9028, "step": 103 }, { "epoch": 0.1846836847946726, "grad_norm": 0.2950561046600342, "learning_rate": 0.00019458172417006347, "loss": 0.8245, "step": 104 }, { "epoch": 0.1864594894561598, "grad_norm": 0.30859696865081787, "learning_rate": 0.00019438110714291694, "loss": 0.8771, "step": 105 }, { "epoch": 0.18823529411764706, "grad_norm": 0.3490929901599884, "learning_rate": 0.00019417695056027844, "loss": 0.8565, "step": 106 }, { "epoch": 0.1900110987791343, "grad_norm": 0.31133994460105896, "learning_rate": 0.00019396926207859084, "loss": 0.8734, "step": 107 }, { "epoch": 0.19178690344062152, "grad_norm": 0.2884789705276489, "learning_rate": 0.00019375804948675306, "loss": 0.8645, "step": 108 }, { "epoch": 0.19356270810210877, "grad_norm": 0.2969193160533905, "learning_rate": 0.0001935433207058281, "loss": 0.8751, "step": 109 }, { "epoch": 0.195338512763596, "grad_norm": 0.41810932755470276, "learning_rate": 0.0001933250837887457, "loss": 0.8037, "step": 110 }, { "epoch": 0.19711431742508323, "grad_norm": 0.3271716833114624, "learning_rate": 0.00019310334692000075, "loss": 0.7814, "step": 111 }, { "epoch": 0.19889012208657048, "grad_norm": 0.4146140515804291, "learning_rate": 0.00019287811841534595, "loss": 0.8425, "step": 112 }, { "epoch": 0.20066592674805772, "grad_norm": 0.3369704484939575, "learning_rate": 0.00019264940672148018, "loss": 0.8301, "step": 113 }, { "epoch": 0.20244173140954494, "grad_norm": 0.32731175422668457, "learning_rate": 0.00019241722041573166, "loss": 0.7964, "step": 114 }, { "epoch": 0.20421753607103219, "grad_norm": 0.3840983510017395, "learning_rate": 0.0001921815682057362, "loss": 0.7864, "step": 115 }, { "epoch": 0.20599334073251943, "grad_norm": 0.37049344182014465, "learning_rate": 0.0001919424589291108, "loss": 0.8086, "step": 116 }, { "epoch": 0.20776914539400665, "grad_norm": 0.380991131067276, "learning_rate": 0.0001916999015531221, "loss": 0.8039, "step": 117 }, { "epoch": 0.2095449500554939, "grad_norm": 0.3884637653827667, "learning_rate": 0.00019145390517435012, "loss": 0.7693, "step": 118 }, { "epoch": 0.21132075471698114, "grad_norm": 0.39195218682289124, "learning_rate": 0.00019120447901834706, "loss": 0.8139, "step": 119 }, { "epoch": 0.21309655937846836, "grad_norm": 0.41479626297950745, "learning_rate": 0.00019095163243929142, "loss": 0.7714, "step": 120 }, { "epoch": 0.2148723640399556, "grad_norm": 0.3856278657913208, "learning_rate": 0.0001906953749196371, "loss": 0.8198, "step": 121 }, { "epoch": 0.21664816870144285, "grad_norm": 0.3706349730491638, "learning_rate": 0.00019043571606975777, "loss": 0.7106, "step": 122 }, { "epoch": 0.21842397336293007, "grad_norm": 0.5981292724609375, "learning_rate": 0.00019017266562758659, "loss": 0.8005, "step": 123 }, { "epoch": 0.22019977802441731, "grad_norm": 0.4480712115764618, "learning_rate": 0.00018990623345825083, "loss": 0.8167, "step": 124 }, { "epoch": 0.22197558268590456, "grad_norm": 0.9817702770233154, "learning_rate": 0.00018963642955370201, "loss": 0.8555, "step": 125 }, { "epoch": 0.22375138734739178, "grad_norm": 0.4110267460346222, "learning_rate": 0.00018936326403234125, "loss": 0.9069, "step": 126 }, { "epoch": 0.22552719200887902, "grad_norm": 0.36051687598228455, "learning_rate": 0.00018908674713863952, "loss": 0.8783, "step": 127 }, { "epoch": 0.22730299667036627, "grad_norm": 0.34053486585617065, "learning_rate": 0.00018880688924275378, "loss": 0.8563, "step": 128 }, { "epoch": 0.2290788013318535, "grad_norm": 0.30984926223754883, "learning_rate": 0.0001885237008401378, "loss": 0.8434, "step": 129 }, { "epoch": 0.23085460599334073, "grad_norm": 0.3125753700733185, "learning_rate": 0.0001882371925511488, "loss": 0.831, "step": 130 }, { "epoch": 0.23263041065482798, "grad_norm": 0.3113706409931183, "learning_rate": 0.0001879473751206489, "loss": 0.8659, "step": 131 }, { "epoch": 0.2344062153163152, "grad_norm": 0.2837103605270386, "learning_rate": 0.00018765425941760238, "loss": 0.812, "step": 132 }, { "epoch": 0.23618201997780244, "grad_norm": 0.2814521789550781, "learning_rate": 0.00018735785643466784, "loss": 0.8116, "step": 133 }, { "epoch": 0.2379578246392897, "grad_norm": 0.2922544777393341, "learning_rate": 0.00018705817728778624, "loss": 0.8305, "step": 134 }, { "epoch": 0.2397336293007769, "grad_norm": 0.3140820860862732, "learning_rate": 0.00018675523321576371, "loss": 0.7882, "step": 135 }, { "epoch": 0.24150943396226415, "grad_norm": 0.29498058557510376, "learning_rate": 0.00018644903557985025, "loss": 0.8226, "step": 136 }, { "epoch": 0.2432852386237514, "grad_norm": 0.3298538625240326, "learning_rate": 0.00018613959586331362, "loss": 0.7867, "step": 137 }, { "epoch": 0.24506104328523862, "grad_norm": 0.3474237024784088, "learning_rate": 0.00018582692567100867, "loss": 0.7876, "step": 138 }, { "epoch": 0.24683684794672586, "grad_norm": 0.3735051155090332, "learning_rate": 0.00018551103672894206, "loss": 0.818, "step": 139 }, { "epoch": 0.2486126526082131, "grad_norm": 0.3931002914905548, "learning_rate": 0.00018519194088383273, "loss": 0.7896, "step": 140 }, { "epoch": 0.2503884572697003, "grad_norm": 0.36460694670677185, "learning_rate": 0.00018486965010266725, "loss": 0.8105, "step": 141 }, { "epoch": 0.2503884572697003, "eval_loss": 0.8086357712745667, "eval_runtime": 159.8215, "eval_samples_per_second": 5.938, "eval_steps_per_second": 1.489, "step": 141 }, { "epoch": 0.25216426193118757, "grad_norm": 0.3713844120502472, "learning_rate": 0.0001845441764722514, "loss": 0.7688, "step": 142 }, { "epoch": 0.2539400665926748, "grad_norm": 0.352450430393219, "learning_rate": 0.00018421553219875658, "loss": 0.7769, "step": 143 }, { "epoch": 0.25571587125416206, "grad_norm": 0.3609173893928528, "learning_rate": 0.00018388372960726228, "loss": 0.7718, "step": 144 }, { "epoch": 0.25749167591564925, "grad_norm": 0.36195874214172363, "learning_rate": 0.00018354878114129367, "loss": 0.7375, "step": 145 }, { "epoch": 0.2592674805771365, "grad_norm": 0.3802485466003418, "learning_rate": 0.00018321069936235503, "loss": 0.7778, "step": 146 }, { "epoch": 0.26104328523862375, "grad_norm": 0.38449469208717346, "learning_rate": 0.00018286949694945866, "loss": 0.7458, "step": 147 }, { "epoch": 0.262819089900111, "grad_norm": 0.3975572884082794, "learning_rate": 0.00018252518669864936, "loss": 0.7367, "step": 148 }, { "epoch": 0.26459489456159824, "grad_norm": 0.49581316113471985, "learning_rate": 0.0001821777815225245, "loss": 0.7948, "step": 149 }, { "epoch": 0.2663706992230855, "grad_norm": 0.5556712746620178, "learning_rate": 0.00018182729444974992, "loss": 0.8143, "step": 150 }, { "epoch": 0.2681465038845727, "grad_norm": 0.3207700848579407, "learning_rate": 0.00018147373862457107, "loss": 0.8578, "step": 151 }, { "epoch": 0.2699223085460599, "grad_norm": 0.3484250605106354, "learning_rate": 0.00018111712730632022, "loss": 0.8757, "step": 152 }, { "epoch": 0.27169811320754716, "grad_norm": 0.33792024850845337, "learning_rate": 0.0001807574738689193, "loss": 0.8464, "step": 153 }, { "epoch": 0.2734739178690344, "grad_norm": 0.3430371582508087, "learning_rate": 0.000180394791800378, "loss": 0.8607, "step": 154 }, { "epoch": 0.27524972253052166, "grad_norm": 0.3120534420013428, "learning_rate": 0.00018002909470228842, "loss": 0.8392, "step": 155 }, { "epoch": 0.2770255271920089, "grad_norm": 0.3126620054244995, "learning_rate": 0.00017966039628931446, "loss": 0.8191, "step": 156 }, { "epoch": 0.2788013318534961, "grad_norm": 0.32269468903541565, "learning_rate": 0.00017928871038867784, "loss": 0.8164, "step": 157 }, { "epoch": 0.28057713651498334, "grad_norm": 0.3052617907524109, "learning_rate": 0.00017891405093963938, "loss": 0.8268, "step": 158 }, { "epoch": 0.2823529411764706, "grad_norm": 0.29926028847694397, "learning_rate": 0.00017853643199297633, "loss": 0.7847, "step": 159 }, { "epoch": 0.28412874583795783, "grad_norm": 0.2997240722179413, "learning_rate": 0.00017815586771045535, "loss": 0.8143, "step": 160 }, { "epoch": 0.2859045504994451, "grad_norm": 0.29772111773490906, "learning_rate": 0.0001777723723643014, "loss": 0.7412, "step": 161 }, { "epoch": 0.2876803551609323, "grad_norm": 0.3138352632522583, "learning_rate": 0.0001773859603366626, "loss": 0.7747, "step": 162 }, { "epoch": 0.2894561598224195, "grad_norm": 0.32726818323135376, "learning_rate": 0.00017699664611907072, "loss": 0.8123, "step": 163 }, { "epoch": 0.29123196448390676, "grad_norm": 0.3244825005531311, "learning_rate": 0.0001766044443118978, "loss": 0.7705, "step": 164 }, { "epoch": 0.293007769145394, "grad_norm": 0.35875847935676575, "learning_rate": 0.00017620936962380856, "loss": 0.7881, "step": 165 }, { "epoch": 0.29478357380688125, "grad_norm": 0.36488401889801025, "learning_rate": 0.00017581143687120875, "loss": 0.7956, "step": 166 }, { "epoch": 0.2965593784683685, "grad_norm": 0.33817097544670105, "learning_rate": 0.00017541066097768963, "loss": 0.7719, "step": 167 }, { "epoch": 0.29833518312985574, "grad_norm": 0.36390411853790283, "learning_rate": 0.0001750070569734681, "loss": 0.8172, "step": 168 }, { "epoch": 0.30011098779134293, "grad_norm": 0.34076422452926636, "learning_rate": 0.00017460063999482316, "loss": 0.7419, "step": 169 }, { "epoch": 0.3018867924528302, "grad_norm": 0.39437592029571533, "learning_rate": 0.00017419142528352817, "loss": 0.7519, "step": 170 }, { "epoch": 0.3036625971143174, "grad_norm": 0.4019312560558319, "learning_rate": 0.00017377942818627942, "loss": 0.7944, "step": 171 }, { "epoch": 0.30543840177580467, "grad_norm": 0.40751898288726807, "learning_rate": 0.00017336466415412028, "loss": 0.7827, "step": 172 }, { "epoch": 0.3072142064372919, "grad_norm": 0.4780448079109192, "learning_rate": 0.0001729471487418621, "loss": 0.7872, "step": 173 }, { "epoch": 0.30899001109877916, "grad_norm": 0.40511685609817505, "learning_rate": 0.0001725268976075005, "loss": 0.7642, "step": 174 }, { "epoch": 0.31076581576026635, "grad_norm": 0.5618127584457397, "learning_rate": 0.0001721039265116285, "loss": 0.872, "step": 175 }, { "epoch": 0.3125416204217536, "grad_norm": 0.294917494058609, "learning_rate": 0.00017167825131684513, "loss": 0.8545, "step": 176 }, { "epoch": 0.31431742508324084, "grad_norm": 0.3281805217266083, "learning_rate": 0.00017124988798716083, "loss": 0.8404, "step": 177 }, { "epoch": 0.3160932297447281, "grad_norm": 0.33336278796195984, "learning_rate": 0.00017081885258739846, "loss": 0.8495, "step": 178 }, { "epoch": 0.31786903440621533, "grad_norm": 0.3366440236568451, "learning_rate": 0.00017038516128259115, "loss": 0.8659, "step": 179 }, { "epoch": 0.3196448390677026, "grad_norm": 0.32397955656051636, "learning_rate": 0.00016994883033737582, "loss": 0.8292, "step": 180 }, { "epoch": 0.32142064372918977, "grad_norm": 0.2874945402145386, "learning_rate": 0.00016950987611538324, "loss": 0.7949, "step": 181 }, { "epoch": 0.323196448390677, "grad_norm": 0.3074096143245697, "learning_rate": 0.00016906831507862443, "loss": 0.8076, "step": 182 }, { "epoch": 0.32497225305216426, "grad_norm": 0.30116966366767883, "learning_rate": 0.0001686241637868734, "loss": 0.8058, "step": 183 }, { "epoch": 0.3267480577136515, "grad_norm": 0.3052218556404114, "learning_rate": 0.00016817743889704565, "loss": 0.8067, "step": 184 }, { "epoch": 0.32852386237513875, "grad_norm": 0.3073555827140808, "learning_rate": 0.00016772815716257412, "loss": 0.8496, "step": 185 }, { "epoch": 0.33029966703662594, "grad_norm": 0.289145290851593, "learning_rate": 0.0001672763354327804, "loss": 0.7362, "step": 186 }, { "epoch": 0.3320754716981132, "grad_norm": 0.31561294198036194, "learning_rate": 0.00016682199065224307, "loss": 0.802, "step": 187 }, { "epoch": 0.33385127635960044, "grad_norm": 0.2900339365005493, "learning_rate": 0.00016636513986016213, "loss": 0.7432, "step": 188 }, { "epoch": 0.3356270810210877, "grad_norm": 0.3267146646976471, "learning_rate": 0.0001659058001897201, "loss": 0.7771, "step": 189 }, { "epoch": 0.3374028856825749, "grad_norm": 0.3258307874202728, "learning_rate": 0.00016544398886743933, "loss": 0.7345, "step": 190 }, { "epoch": 0.3391786903440622, "grad_norm": 0.32989659905433655, "learning_rate": 0.000164979723212536, "loss": 0.7383, "step": 191 }, { "epoch": 0.34095449500554936, "grad_norm": 0.3265599310398102, "learning_rate": 0.00016451302063627066, "loss": 0.6977, "step": 192 }, { "epoch": 0.3427302996670366, "grad_norm": 0.39376598596572876, "learning_rate": 0.00016404389864129533, "loss": 0.7851, "step": 193 }, { "epoch": 0.34450610432852385, "grad_norm": 0.40358301997184753, "learning_rate": 0.00016357237482099684, "loss": 0.7928, "step": 194 }, { "epoch": 0.3462819089900111, "grad_norm": 0.3747034966945648, "learning_rate": 0.00016309846685883726, "loss": 0.7751, "step": 195 }, { "epoch": 0.34805771365149835, "grad_norm": 0.4160248041152954, "learning_rate": 0.00016262219252769064, "loss": 0.8035, "step": 196 }, { "epoch": 0.3498335183129856, "grad_norm": 0.39067476987838745, "learning_rate": 0.00016214356968917648, "loss": 0.6726, "step": 197 }, { "epoch": 0.3516093229744728, "grad_norm": 0.4980023205280304, "learning_rate": 0.00016166261629298995, "loss": 0.7917, "step": 198 }, { "epoch": 0.35338512763596003, "grad_norm": 0.4774058163166046, "learning_rate": 0.0001611793503762285, "loss": 0.7599, "step": 199 }, { "epoch": 0.3551609322974473, "grad_norm": 0.5196167230606079, "learning_rate": 0.00016069379006271566, "loss": 0.7608, "step": 200 }, { "epoch": 0.3569367369589345, "grad_norm": 0.2735799551010132, "learning_rate": 0.00016020595356232135, "loss": 0.8588, "step": 201 }, { "epoch": 0.35871254162042177, "grad_norm": 0.30770814418792725, "learning_rate": 0.00015971585917027862, "loss": 0.8222, "step": 202 }, { "epoch": 0.360488346281909, "grad_norm": 0.317123144865036, "learning_rate": 0.00015922352526649803, "loss": 0.7941, "step": 203 }, { "epoch": 0.3622641509433962, "grad_norm": 0.32672154903411865, "learning_rate": 0.00015872897031487791, "loss": 0.867, "step": 204 }, { "epoch": 0.36403995560488345, "grad_norm": 0.3169744610786438, "learning_rate": 0.00015823221286261215, "loss": 0.8781, "step": 205 }, { "epoch": 0.3658157602663707, "grad_norm": 0.30588722229003906, "learning_rate": 0.00015773327153949465, "loss": 0.7827, "step": 206 }, { "epoch": 0.36759156492785794, "grad_norm": 0.3179618716239929, "learning_rate": 0.0001572321650572205, "loss": 0.8178, "step": 207 }, { "epoch": 0.3693673695893452, "grad_norm": 0.3094286322593689, "learning_rate": 0.00015672891220868432, "loss": 0.7966, "step": 208 }, { "epoch": 0.37114317425083243, "grad_norm": 0.31584280729293823, "learning_rate": 0.00015622353186727544, "loss": 0.7982, "step": 209 }, { "epoch": 0.3729189789123196, "grad_norm": 0.29120850563049316, "learning_rate": 0.0001557160429861702, "loss": 0.7789, "step": 210 }, { "epoch": 0.37469478357380687, "grad_norm": 0.29743698239326477, "learning_rate": 0.000155206464597621, "loss": 0.7799, "step": 211 }, { "epoch": 0.3764705882352941, "grad_norm": 0.31440189480781555, "learning_rate": 0.00015469481581224272, "loss": 0.7661, "step": 212 }, { "epoch": 0.37824639289678136, "grad_norm": 0.3395606279373169, "learning_rate": 0.00015418111581829574, "loss": 0.7657, "step": 213 }, { "epoch": 0.3800221975582686, "grad_norm": 0.31749066710472107, "learning_rate": 0.0001536653838809667, "loss": 0.7913, "step": 214 }, { "epoch": 0.38179800221975585, "grad_norm": 0.3586166501045227, "learning_rate": 0.0001531476393416456, "loss": 0.7774, "step": 215 }, { "epoch": 0.38357380688124304, "grad_norm": 0.32895100116729736, "learning_rate": 0.0001526279016172008, "loss": 0.7882, "step": 216 }, { "epoch": 0.3853496115427303, "grad_norm": 0.3541489839553833, "learning_rate": 0.00015210619019925066, "loss": 0.7708, "step": 217 }, { "epoch": 0.38712541620421753, "grad_norm": 0.3232908546924591, "learning_rate": 0.00015158252465343242, "loss": 0.7238, "step": 218 }, { "epoch": 0.3889012208657048, "grad_norm": 0.36565467715263367, "learning_rate": 0.00015105692461866874, "loss": 0.7685, "step": 219 }, { "epoch": 0.390677025527192, "grad_norm": 0.3799486756324768, "learning_rate": 0.000150529409806431, "loss": 0.7296, "step": 220 }, { "epoch": 0.39245283018867927, "grad_norm": 0.4193985164165497, "learning_rate": 0.00015000000000000001, "loss": 0.7731, "step": 221 }, { "epoch": 0.39422863485016646, "grad_norm": 0.4226386845111847, "learning_rate": 0.00014946871505372425, "loss": 0.8048, "step": 222 }, { "epoch": 0.3960044395116537, "grad_norm": 0.40805166959762573, "learning_rate": 0.00014893557489227517, "loss": 0.7389, "step": 223 }, { "epoch": 0.39778024417314095, "grad_norm": 0.5135468244552612, "learning_rate": 0.0001484005995098999, "loss": 0.779, "step": 224 }, { "epoch": 0.3995560488346282, "grad_norm": 0.6674650311470032, "learning_rate": 0.0001478638089696716, "loss": 0.82, "step": 225 }, { "epoch": 0.40133185349611544, "grad_norm": 0.3206911087036133, "learning_rate": 0.00014732522340273684, "loss": 0.8985, "step": 226 }, { "epoch": 0.4031076581576027, "grad_norm": 0.33583980798721313, "learning_rate": 0.0001467848630075608, "loss": 0.8171, "step": 227 }, { "epoch": 0.4048834628190899, "grad_norm": 0.3324304223060608, "learning_rate": 0.00014624274804916958, "loss": 0.8531, "step": 228 }, { "epoch": 0.4066592674805771, "grad_norm": 0.32210710644721985, "learning_rate": 0.00014569889885839037, "loss": 0.8349, "step": 229 }, { "epoch": 0.40843507214206437, "grad_norm": 0.30829885601997375, "learning_rate": 0.00014515333583108896, "loss": 0.8176, "step": 230 }, { "epoch": 0.4102108768035516, "grad_norm": 0.31730225682258606, "learning_rate": 0.00014460607942740468, "loss": 0.8109, "step": 231 }, { "epoch": 0.41198668146503886, "grad_norm": 0.32128164172172546, "learning_rate": 0.00014405715017098335, "loss": 0.8049, "step": 232 }, { "epoch": 0.4137624861265261, "grad_norm": 0.32257241010665894, "learning_rate": 0.00014350656864820733, "loss": 0.79, "step": 233 }, { "epoch": 0.4155382907880133, "grad_norm": 0.29663363099098206, "learning_rate": 0.0001429543555074237, "loss": 0.7606, "step": 234 }, { "epoch": 0.41731409544950054, "grad_norm": 0.3175968527793884, "learning_rate": 0.00014240053145816967, "loss": 0.8093, "step": 235 }, { "epoch": 0.4190899001109878, "grad_norm": 0.30839797854423523, "learning_rate": 0.00014184511727039612, "loss": 0.8033, "step": 236 }, { "epoch": 0.42086570477247504, "grad_norm": 0.32169485092163086, "learning_rate": 0.0001412881337736885, "loss": 0.7583, "step": 237 }, { "epoch": 0.4226415094339623, "grad_norm": 0.3165202736854553, "learning_rate": 0.00014072960185648577, "loss": 0.7864, "step": 238 }, { "epoch": 0.4244173140954495, "grad_norm": 0.3507262170314789, "learning_rate": 0.00014016954246529696, "loss": 0.8196, "step": 239 }, { "epoch": 0.4261931187569367, "grad_norm": 0.3330634534358978, "learning_rate": 0.0001396079766039157, "loss": 0.7356, "step": 240 }, { "epoch": 0.42796892341842396, "grad_norm": 0.3456502854824066, "learning_rate": 0.00013904492533263244, "loss": 0.7636, "step": 241 }, { "epoch": 0.4297447280799112, "grad_norm": 0.3290559649467468, "learning_rate": 0.00013848040976744457, "loss": 0.6921, "step": 242 }, { "epoch": 0.43152053274139845, "grad_norm": 0.34343284368515015, "learning_rate": 0.00013791445107926478, "loss": 0.7661, "step": 243 }, { "epoch": 0.4332963374028857, "grad_norm": 0.34806933999061584, "learning_rate": 0.00013734707049312673, "loss": 0.7266, "step": 244 }, { "epoch": 0.43507214206437295, "grad_norm": 0.3577682375907898, "learning_rate": 0.00013677828928738934, "loss": 0.7337, "step": 245 }, { "epoch": 0.43684794672586014, "grad_norm": 0.37708649039268494, "learning_rate": 0.00013620812879293863, "loss": 0.6949, "step": 246 }, { "epoch": 0.4386237513873474, "grad_norm": 0.3661216199398041, "learning_rate": 0.00013563661039238785, "loss": 0.7049, "step": 247 }, { "epoch": 0.44039955604883463, "grad_norm": 0.4453539550304413, "learning_rate": 0.00013506375551927547, "loss": 0.7957, "step": 248 }, { "epoch": 0.4421753607103219, "grad_norm": 0.46171826124191284, "learning_rate": 0.00013448958565726144, "loss": 0.7175, "step": 249 }, { "epoch": 0.4439511653718091, "grad_norm": 0.6314205527305603, "learning_rate": 0.00013391412233932149, "loss": 0.8853, "step": 250 }, { "epoch": 0.4457269700332963, "grad_norm": 0.29680782556533813, "learning_rate": 0.00013333738714693956, "loss": 0.8789, "step": 251 }, { "epoch": 0.44750277469478356, "grad_norm": 0.30771735310554504, "learning_rate": 0.00013275940170929843, "loss": 0.8126, "step": 252 }, { "epoch": 0.4492785793562708, "grad_norm": 0.3242880403995514, "learning_rate": 0.00013218018770246858, "loss": 0.7787, "step": 253 }, { "epoch": 0.45105438401775805, "grad_norm": 0.33549076318740845, "learning_rate": 0.00013159976684859527, "loss": 0.8113, "step": 254 }, { "epoch": 0.4528301886792453, "grad_norm": 0.34281155467033386, "learning_rate": 0.00013101816091508388, "loss": 0.8371, "step": 255 }, { "epoch": 0.45460599334073254, "grad_norm": 0.3422442078590393, "learning_rate": 0.0001304353917137836, "loss": 0.8362, "step": 256 }, { "epoch": 0.45638179800221973, "grad_norm": 0.3019155263900757, "learning_rate": 0.00012985148110016947, "loss": 0.7317, "step": 257 }, { "epoch": 0.458157602663707, "grad_norm": 0.32793429493904114, "learning_rate": 0.0001292664509725226, "loss": 0.7861, "step": 258 }, { "epoch": 0.4599334073251942, "grad_norm": 0.32433855533599854, "learning_rate": 0.00012868032327110904, "loss": 0.7708, "step": 259 }, { "epoch": 0.46170921198668147, "grad_norm": 0.31858816742897034, "learning_rate": 0.00012809311997735696, "loss": 0.7754, "step": 260 }, { "epoch": 0.4634850166481687, "grad_norm": 0.3172609210014343, "learning_rate": 0.00012750486311303218, "loss": 0.7839, "step": 261 }, { "epoch": 0.46526082130965596, "grad_norm": 0.2951931953430176, "learning_rate": 0.00012691557473941243, "loss": 0.7261, "step": 262 }, { "epoch": 0.46703662597114315, "grad_norm": 0.31385374069213867, "learning_rate": 0.00012632527695645993, "loss": 0.8221, "step": 263 }, { "epoch": 0.4688124306326304, "grad_norm": 0.31157392263412476, "learning_rate": 0.0001257339919019925, "loss": 0.7711, "step": 264 }, { "epoch": 0.47058823529411764, "grad_norm": 0.32580870389938354, "learning_rate": 0.00012514174175085345, "loss": 0.7592, "step": 265 }, { "epoch": 0.4723640399556049, "grad_norm": 0.33285781741142273, "learning_rate": 0.00012454854871407994, "loss": 0.7349, "step": 266 }, { "epoch": 0.47413984461709213, "grad_norm": 0.3179035186767578, "learning_rate": 0.0001239544350380699, "loss": 0.7338, "step": 267 }, { "epoch": 0.4759156492785794, "grad_norm": 0.31393003463745117, "learning_rate": 0.00012335942300374788, "loss": 0.7088, "step": 268 }, { "epoch": 0.47769145394006657, "grad_norm": 0.33285436034202576, "learning_rate": 0.00012276353492572935, "loss": 0.7069, "step": 269 }, { "epoch": 0.4794672586015538, "grad_norm": 0.38329485058784485, "learning_rate": 0.00012216679315148386, "loss": 0.7093, "step": 270 }, { "epoch": 0.48124306326304106, "grad_norm": 0.3584016263484955, "learning_rate": 0.00012156922006049702, "loss": 0.7513, "step": 271 }, { "epoch": 0.4830188679245283, "grad_norm": 0.3995126187801361, "learning_rate": 0.00012097083806343103, "loss": 0.7384, "step": 272 }, { "epoch": 0.48479467258601555, "grad_norm": 0.4097007215023041, "learning_rate": 0.00012037166960128443, "loss": 0.7794, "step": 273 }, { "epoch": 0.4865704772475028, "grad_norm": 0.4780315160751343, "learning_rate": 0.00011977173714455034, "loss": 0.7437, "step": 274 }, { "epoch": 0.48834628190899, "grad_norm": 0.5396427512168884, "learning_rate": 0.00011917106319237386, "loss": 0.7542, "step": 275 }, { "epoch": 0.49012208657047723, "grad_norm": 0.29439178109169006, "learning_rate": 0.00011856967027170818, "loss": 0.8389, "step": 276 }, { "epoch": 0.4918978912319645, "grad_norm": 0.3243663012981415, "learning_rate": 0.00011796758093646989, "loss": 0.8767, "step": 277 }, { "epoch": 0.4936736958934517, "grad_norm": 0.342454195022583, "learning_rate": 0.00011736481776669306, "loss": 0.8538, "step": 278 }, { "epoch": 0.49544950055493897, "grad_norm": 0.30882903933525085, "learning_rate": 0.00011676140336768236, "loss": 0.7766, "step": 279 }, { "epoch": 0.4972253052164262, "grad_norm": 0.3247200548648834, "learning_rate": 0.00011615736036916549, "loss": 0.8268, "step": 280 }, { "epoch": 0.4990011098779134, "grad_norm": 0.3077162504196167, "learning_rate": 0.00011555271142444433, "loss": 0.7786, "step": 281 }, { "epoch": 0.5007769145394007, "grad_norm": 0.3300260603427887, "learning_rate": 0.00011494747920954545, "loss": 0.7853, "step": 282 }, { "epoch": 0.5007769145394007, "eval_loss": 0.7658749222755432, "eval_runtime": 158.4653, "eval_samples_per_second": 5.989, "eval_steps_per_second": 1.502, "step": 282 }, { "epoch": 0.502552719200888, "grad_norm": 0.331061989068985, "learning_rate": 0.00011434168642236964, "loss": 0.8114, "step": 283 }, { "epoch": 0.5043285238623751, "grad_norm": 0.3186919689178467, "learning_rate": 0.00011373535578184082, "loss": 0.7872, "step": 284 }, { "epoch": 0.5061043285238623, "grad_norm": 0.3114188611507416, "learning_rate": 0.00011312851002705383, "loss": 0.7311, "step": 285 }, { "epoch": 0.5078801331853496, "grad_norm": 0.3148879408836365, "learning_rate": 0.00011252117191642175, "loss": 0.7311, "step": 286 }, { "epoch": 0.5096559378468368, "grad_norm": 0.3390887379646301, "learning_rate": 0.00011191336422682237, "loss": 0.7773, "step": 287 }, { "epoch": 0.5114317425083241, "grad_norm": 0.31982842087745667, "learning_rate": 0.00011130510975274409, "loss": 0.7474, "step": 288 }, { "epoch": 0.5132075471698113, "grad_norm": 0.31643104553222656, "learning_rate": 0.00011069643130543084, "loss": 0.7375, "step": 289 }, { "epoch": 0.5149833518312985, "grad_norm": 0.33758479356765747, "learning_rate": 0.00011008735171202684, "loss": 0.7411, "step": 290 }, { "epoch": 0.5167591564927858, "grad_norm": 0.324556440114975, "learning_rate": 0.00010947789381472035, "loss": 0.7235, "step": 291 }, { "epoch": 0.518534961154273, "grad_norm": 0.3768496513366699, "learning_rate": 0.00010886808046988717, "loss": 0.7618, "step": 292 }, { "epoch": 0.5203107658157603, "grad_norm": 0.34034618735313416, "learning_rate": 0.00010825793454723325, "loss": 0.7426, "step": 293 }, { "epoch": 0.5220865704772475, "grad_norm": 0.3409979045391083, "learning_rate": 0.00010764747892893723, "loss": 0.7327, "step": 294 }, { "epoch": 0.5238623751387348, "grad_norm": 0.35839787125587463, "learning_rate": 0.00010703673650879218, "loss": 0.7057, "step": 295 }, { "epoch": 0.525638179800222, "grad_norm": 0.3807874023914337, "learning_rate": 0.00010642573019134703, "loss": 0.7225, "step": 296 }, { "epoch": 0.5274139844617092, "grad_norm": 0.4682140648365021, "learning_rate": 0.00010581448289104758, "loss": 0.715, "step": 297 }, { "epoch": 0.5291897891231965, "grad_norm": 0.4261273145675659, "learning_rate": 0.00010520301753137724, "loss": 0.7239, "step": 298 }, { "epoch": 0.5309655937846837, "grad_norm": 0.4854682981967926, "learning_rate": 0.00010459135704399718, "loss": 0.7304, "step": 299 }, { "epoch": 0.532741398446171, "grad_norm": 0.6740989685058594, "learning_rate": 0.00010397952436788642, "loss": 0.8604, "step": 300 }, { "epoch": 0.5345172031076582, "grad_norm": 0.2903907299041748, "learning_rate": 0.00010336754244848157, "loss": 0.8551, "step": 301 }, { "epoch": 0.5362930077691453, "grad_norm": 0.28648582100868225, "learning_rate": 0.00010275543423681621, "loss": 0.7958, "step": 302 }, { "epoch": 0.5380688124306326, "grad_norm": 0.33123767375946045, "learning_rate": 0.00010214322268866032, "loss": 0.7853, "step": 303 }, { "epoch": 0.5398446170921198, "grad_norm": 0.31327784061431885, "learning_rate": 0.00010153093076365923, "loss": 0.7856, "step": 304 }, { "epoch": 0.5416204217536071, "grad_norm": 0.3101854622364044, "learning_rate": 0.00010091858142447265, "loss": 0.7694, "step": 305 }, { "epoch": 0.5433962264150943, "grad_norm": 0.3217926621437073, "learning_rate": 0.00010030619763591347, "loss": 0.7899, "step": 306 }, { "epoch": 0.5451720310765816, "grad_norm": 0.33827194571495056, "learning_rate": 9.969380236408656e-05, "loss": 0.8088, "step": 307 }, { "epoch": 0.5469478357380688, "grad_norm": 0.32632124423980713, "learning_rate": 9.908141857552737e-05, "loss": 0.769, "step": 308 }, { "epoch": 0.548723640399556, "grad_norm": 0.3152617812156677, "learning_rate": 9.846906923634079e-05, "loss": 0.7804, "step": 309 }, { "epoch": 0.5504994450610433, "grad_norm": 0.33337536454200745, "learning_rate": 9.78567773113397e-05, "loss": 0.7379, "step": 310 }, { "epoch": 0.5522752497225305, "grad_norm": 0.3020349144935608, "learning_rate": 9.724456576318381e-05, "loss": 0.7146, "step": 311 }, { "epoch": 0.5540510543840178, "grad_norm": 0.34656378626823425, "learning_rate": 9.663245755151846e-05, "loss": 0.7437, "step": 312 }, { "epoch": 0.555826859045505, "grad_norm": 0.3417186737060547, "learning_rate": 9.602047563211359e-05, "loss": 0.7472, "step": 313 }, { "epoch": 0.5576026637069922, "grad_norm": 0.34442222118377686, "learning_rate": 9.540864295600283e-05, "loss": 0.7426, "step": 314 }, { "epoch": 0.5593784683684795, "grad_norm": 0.3521478772163391, "learning_rate": 9.479698246862276e-05, "loss": 0.7522, "step": 315 }, { "epoch": 0.5611542730299667, "grad_norm": 0.3358227014541626, "learning_rate": 9.418551710895243e-05, "loss": 0.7454, "step": 316 }, { "epoch": 0.562930077691454, "grad_norm": 0.343226820230484, "learning_rate": 9.357426980865301e-05, "loss": 0.7341, "step": 317 }, { "epoch": 0.5647058823529412, "grad_norm": 0.3432699739933014, "learning_rate": 9.296326349120785e-05, "loss": 0.6836, "step": 318 }, { "epoch": 0.5664816870144284, "grad_norm": 0.3710852265357971, "learning_rate": 9.235252107106279e-05, "loss": 0.6961, "step": 319 }, { "epoch": 0.5682574916759157, "grad_norm": 0.351094514131546, "learning_rate": 9.174206545276677e-05, "loss": 0.6668, "step": 320 }, { "epoch": 0.5700332963374029, "grad_norm": 0.4484163224697113, "learning_rate": 9.113191953011287e-05, "loss": 0.7427, "step": 321 }, { "epoch": 0.5718091009988902, "grad_norm": 0.44636109471321106, "learning_rate": 9.052210618527966e-05, "loss": 0.8119, "step": 322 }, { "epoch": 0.5735849056603773, "grad_norm": 0.43749314546585083, "learning_rate": 8.991264828797319e-05, "loss": 0.7846, "step": 323 }, { "epoch": 0.5753607103218646, "grad_norm": 0.4471510350704193, "learning_rate": 8.930356869456919e-05, "loss": 0.7215, "step": 324 }, { "epoch": 0.5771365149833518, "grad_norm": 0.5141078233718872, "learning_rate": 8.869489024725595e-05, "loss": 0.7492, "step": 325 }, { "epoch": 0.578912319644839, "grad_norm": 0.2640296518802643, "learning_rate": 8.808663577317764e-05, "loss": 0.8625, "step": 326 }, { "epoch": 0.5806881243063263, "grad_norm": 0.28867048025131226, "learning_rate": 8.747882808357828e-05, "loss": 0.8352, "step": 327 }, { "epoch": 0.5824639289678135, "grad_norm": 0.2925030589103699, "learning_rate": 8.687148997294621e-05, "loss": 0.8091, "step": 328 }, { "epoch": 0.5842397336293008, "grad_norm": 0.28383681178092957, "learning_rate": 8.626464421815919e-05, "loss": 0.784, "step": 329 }, { "epoch": 0.586015538290788, "grad_norm": 0.3055633306503296, "learning_rate": 8.565831357763039e-05, "loss": 0.79, "step": 330 }, { "epoch": 0.5877913429522752, "grad_norm": 0.30299943685531616, "learning_rate": 8.505252079045458e-05, "loss": 0.8105, "step": 331 }, { "epoch": 0.5895671476137625, "grad_norm": 0.3154890239238739, "learning_rate": 8.444728857555572e-05, "loss": 0.7664, "step": 332 }, { "epoch": 0.5913429522752497, "grad_norm": 0.31844133138656616, "learning_rate": 8.384263963083453e-05, "loss": 0.7709, "step": 333 }, { "epoch": 0.593118756936737, "grad_norm": 0.31844353675842285, "learning_rate": 8.323859663231768e-05, "loss": 0.7426, "step": 334 }, { "epoch": 0.5948945615982242, "grad_norm": 0.31527841091156006, "learning_rate": 8.263518223330697e-05, "loss": 0.7441, "step": 335 }, { "epoch": 0.5966703662597115, "grad_norm": 0.32145699858665466, "learning_rate": 8.203241906353014e-05, "loss": 0.7333, "step": 336 }, { "epoch": 0.5984461709211987, "grad_norm": 0.3175109922885895, "learning_rate": 8.143032972829183e-05, "loss": 0.7488, "step": 337 }, { "epoch": 0.6002219755826859, "grad_norm": 0.3342651128768921, "learning_rate": 8.082893680762619e-05, "loss": 0.7265, "step": 338 }, { "epoch": 0.6019977802441732, "grad_norm": 0.339743971824646, "learning_rate": 8.022826285544968e-05, "loss": 0.7005, "step": 339 }, { "epoch": 0.6037735849056604, "grad_norm": 0.35757359862327576, "learning_rate": 7.96283303987156e-05, "loss": 0.7806, "step": 340 }, { "epoch": 0.6055493895671477, "grad_norm": 0.4024328291416168, "learning_rate": 7.902916193656898e-05, "loss": 0.6895, "step": 341 }, { "epoch": 0.6073251942286348, "grad_norm": 0.3628247380256653, "learning_rate": 7.843077993950302e-05, "loss": 0.7285, "step": 342 }, { "epoch": 0.609100998890122, "grad_norm": 0.3793889582157135, "learning_rate": 7.783320684851614e-05, "loss": 0.729, "step": 343 }, { "epoch": 0.6108768035516093, "grad_norm": 0.37614578008651733, "learning_rate": 7.72364650742707e-05, "loss": 0.6869, "step": 344 }, { "epoch": 0.6126526082130965, "grad_norm": 0.3737132251262665, "learning_rate": 7.664057699625214e-05, "loss": 0.7373, "step": 345 }, { "epoch": 0.6144284128745838, "grad_norm": 0.40523961186408997, "learning_rate": 7.604556496193015e-05, "loss": 0.729, "step": 346 }, { "epoch": 0.616204217536071, "grad_norm": 0.3903469145298004, "learning_rate": 7.54514512859201e-05, "loss": 0.7063, "step": 347 }, { "epoch": 0.6179800221975583, "grad_norm": 0.43782973289489746, "learning_rate": 7.485825824914659e-05, "loss": 0.6763, "step": 348 }, { "epoch": 0.6197558268590455, "grad_norm": 0.4907206594944, "learning_rate": 7.426600809800752e-05, "loss": 0.7405, "step": 349 }, { "epoch": 0.6215316315205327, "grad_norm": 0.5378274917602539, "learning_rate": 7.36747230435401e-05, "loss": 0.7417, "step": 350 }, { "epoch": 0.62330743618202, "grad_norm": 0.266481876373291, "learning_rate": 7.308442526058756e-05, "loss": 0.8434, "step": 351 }, { "epoch": 0.6250832408435072, "grad_norm": 0.28670433163642883, "learning_rate": 7.249513688696786e-05, "loss": 0.8049, "step": 352 }, { "epoch": 0.6268590455049945, "grad_norm": 0.29961690306663513, "learning_rate": 7.190688002264308e-05, "loss": 0.762, "step": 353 }, { "epoch": 0.6286348501664817, "grad_norm": 0.2873949706554413, "learning_rate": 7.131967672889101e-05, "loss": 0.7389, "step": 354 }, { "epoch": 0.6304106548279689, "grad_norm": 0.3315136730670929, "learning_rate": 7.073354902747741e-05, "loss": 0.7719, "step": 355 }, { "epoch": 0.6321864594894562, "grad_norm": 0.31057095527648926, "learning_rate": 7.014851889983057e-05, "loss": 0.7407, "step": 356 }, { "epoch": 0.6339622641509434, "grad_norm": 0.345838725566864, "learning_rate": 6.95646082862164e-05, "loss": 0.7838, "step": 357 }, { "epoch": 0.6357380688124307, "grad_norm": 0.31915196776390076, "learning_rate": 6.898183908491617e-05, "loss": 0.7591, "step": 358 }, { "epoch": 0.6375138734739179, "grad_norm": 0.3124110698699951, "learning_rate": 6.840023315140475e-05, "loss": 0.7222, "step": 359 }, { "epoch": 0.6392896781354052, "grad_norm": 0.3307512104511261, "learning_rate": 6.781981229753145e-05, "loss": 0.7472, "step": 360 }, { "epoch": 0.6410654827968923, "grad_norm": 0.3425205945968628, "learning_rate": 6.724059829070158e-05, "loss": 0.764, "step": 361 }, { "epoch": 0.6428412874583795, "grad_norm": 0.33861225843429565, "learning_rate": 6.666261285306047e-05, "loss": 0.7396, "step": 362 }, { "epoch": 0.6446170921198668, "grad_norm": 0.3248923420906067, "learning_rate": 6.608587766067852e-05, "loss": 0.7158, "step": 363 }, { "epoch": 0.646392896781354, "grad_norm": 0.349185049533844, "learning_rate": 6.551041434273861e-05, "loss": 0.7415, "step": 364 }, { "epoch": 0.6481687014428413, "grad_norm": 0.33934569358825684, "learning_rate": 6.493624448072457e-05, "loss": 0.744, "step": 365 }, { "epoch": 0.6499445061043285, "grad_norm": 0.3628052771091461, "learning_rate": 6.43633896076122e-05, "loss": 0.7328, "step": 366 }, { "epoch": 0.6517203107658157, "grad_norm": 0.348979115486145, "learning_rate": 6.379187120706138e-05, "loss": 0.6755, "step": 367 }, { "epoch": 0.653496115427303, "grad_norm": 0.38474076986312866, "learning_rate": 6.322171071261071e-05, "loss": 0.711, "step": 368 }, { "epoch": 0.6552719200887902, "grad_norm": 0.34556257724761963, "learning_rate": 6.26529295068733e-05, "loss": 0.6995, "step": 369 }, { "epoch": 0.6570477247502775, "grad_norm": 0.4337230622768402, "learning_rate": 6.208554892073528e-05, "loss": 0.7412, "step": 370 }, { "epoch": 0.6588235294117647, "grad_norm": 0.37804853916168213, "learning_rate": 6.151959023255545e-05, "loss": 0.6724, "step": 371 }, { "epoch": 0.6605993340732519, "grad_norm": 0.40870919823646545, "learning_rate": 6.095507466736763e-05, "loss": 0.7243, "step": 372 }, { "epoch": 0.6623751387347392, "grad_norm": 0.45504140853881836, "learning_rate": 6.039202339608432e-05, "loss": 0.7373, "step": 373 }, { "epoch": 0.6641509433962264, "grad_norm": 0.46973538398742676, "learning_rate": 5.983045753470308e-05, "loss": 0.7101, "step": 374 }, { "epoch": 0.6659267480577137, "grad_norm": 0.5572993755340576, "learning_rate": 5.927039814351426e-05, "loss": 0.7393, "step": 375 }, { "epoch": 0.6677025527192009, "grad_norm": 0.2691468596458435, "learning_rate": 5.8711866226311553e-05, "loss": 0.8102, "step": 376 }, { "epoch": 0.6694783573806882, "grad_norm": 0.2898322641849518, "learning_rate": 5.8154882729603876e-05, "loss": 0.7968, "step": 377 }, { "epoch": 0.6712541620421754, "grad_norm": 0.3048444092273712, "learning_rate": 5.7599468541830356e-05, "loss": 0.775, "step": 378 }, { "epoch": 0.6730299667036626, "grad_norm": 0.3111611604690552, "learning_rate": 5.7045644492576346e-05, "loss": 0.7742, "step": 379 }, { "epoch": 0.6748057713651499, "grad_norm": 0.31889772415161133, "learning_rate": 5.64934313517927e-05, "loss": 0.7304, "step": 380 }, { "epoch": 0.676581576026637, "grad_norm": 0.3219664692878723, "learning_rate": 5.5942849829016695e-05, "loss": 0.7679, "step": 381 }, { "epoch": 0.6783573806881243, "grad_norm": 0.30955034494400024, "learning_rate": 5.5393920572595356e-05, "loss": 0.7443, "step": 382 }, { "epoch": 0.6801331853496115, "grad_norm": 0.344043105840683, "learning_rate": 5.484666416891109e-05, "loss": 0.7272, "step": 383 }, { "epoch": 0.6819089900110987, "grad_norm": 0.33895599842071533, "learning_rate": 5.430110114160964e-05, "loss": 0.7585, "step": 384 }, { "epoch": 0.683684794672586, "grad_norm": 0.37816834449768066, "learning_rate": 5.375725195083046e-05, "loss": 0.7749, "step": 385 }, { "epoch": 0.6854605993340732, "grad_norm": 0.3477395176887512, "learning_rate": 5.321513699243924e-05, "loss": 0.7022, "step": 386 }, { "epoch": 0.6872364039955605, "grad_norm": 0.3380398154258728, "learning_rate": 5.2674776597263186e-05, "loss": 0.7266, "step": 387 }, { "epoch": 0.6890122086570477, "grad_norm": 0.35505762696266174, "learning_rate": 5.2136191030328455e-05, "loss": 0.7411, "step": 388 }, { "epoch": 0.690788013318535, "grad_norm": 0.38739171624183655, "learning_rate": 5.159940049010015e-05, "loss": 0.7666, "step": 389 }, { "epoch": 0.6925638179800222, "grad_norm": 0.38473132252693176, "learning_rate": 5.106442510772489e-05, "loss": 0.7038, "step": 390 }, { "epoch": 0.6943396226415094, "grad_norm": 0.37635302543640137, "learning_rate": 5.0531284946275784e-05, "loss": 0.7488, "step": 391 }, { "epoch": 0.6961154273029967, "grad_norm": 0.37422046065330505, "learning_rate": 5.000000000000002e-05, "loss": 0.693, "step": 392 }, { "epoch": 0.6978912319644839, "grad_norm": 0.3987278342247009, "learning_rate": 4.9470590193569044e-05, "loss": 0.6965, "step": 393 }, { "epoch": 0.6996670366259712, "grad_norm": 0.34372609853744507, "learning_rate": 4.894307538133129e-05, "loss": 0.6632, "step": 394 }, { "epoch": 0.7014428412874584, "grad_norm": 0.4215118885040283, "learning_rate": 4.841747534656763e-05, "loss": 0.7081, "step": 395 }, { "epoch": 0.7032186459489456, "grad_norm": 0.4211183488368988, "learning_rate": 4.7893809800749403e-05, "loss": 0.687, "step": 396 }, { "epoch": 0.7049944506104329, "grad_norm": 0.44248080253601074, "learning_rate": 4.737209838279922e-05, "loss": 0.7118, "step": 397 }, { "epoch": 0.7067702552719201, "grad_norm": 0.38100606203079224, "learning_rate": 4.685236065835443e-05, "loss": 0.6259, "step": 398 }, { "epoch": 0.7085460599334074, "grad_norm": 0.46482354402542114, "learning_rate": 4.6334616119033356e-05, "loss": 0.6668, "step": 399 }, { "epoch": 0.7103218645948945, "grad_norm": 0.5484885573387146, "learning_rate": 4.5818884181704294e-05, "loss": 0.7973, "step": 400 }, { "epoch": 0.7120976692563818, "grad_norm": 0.2660059928894043, "learning_rate": 4.530518418775733e-05, "loss": 0.7845, "step": 401 }, { "epoch": 0.713873473917869, "grad_norm": 0.30005505681037903, "learning_rate": 4.479353540237903e-05, "loss": 0.8141, "step": 402 }, { "epoch": 0.7156492785793562, "grad_norm": 0.3031437397003174, "learning_rate": 4.4283957013829846e-05, "loss": 0.7505, "step": 403 }, { "epoch": 0.7174250832408435, "grad_norm": 0.3152884542942047, "learning_rate": 4.3776468132724604e-05, "loss": 0.8191, "step": 404 }, { "epoch": 0.7192008879023307, "grad_norm": 0.3122805058956146, "learning_rate": 4.3271087791315734e-05, "loss": 0.7732, "step": 405 }, { "epoch": 0.720976692563818, "grad_norm": 0.3241139054298401, "learning_rate": 4.276783494277954e-05, "loss": 0.7652, "step": 406 }, { "epoch": 0.7227524972253052, "grad_norm": 0.3523857295513153, "learning_rate": 4.2266728460505375e-05, "loss": 0.7923, "step": 407 }, { "epoch": 0.7245283018867924, "grad_norm": 0.3518478274345398, "learning_rate": 4.176778713738787e-05, "loss": 0.8046, "step": 408 }, { "epoch": 0.7263041065482797, "grad_norm": 0.35740435123443604, "learning_rate": 4.127102968512214e-05, "loss": 0.741, "step": 409 }, { "epoch": 0.7280799112097669, "grad_norm": 0.3561273217201233, "learning_rate": 4.077647473350201e-05, "loss": 0.7304, "step": 410 }, { "epoch": 0.7298557158712542, "grad_norm": 0.3595544397830963, "learning_rate": 4.028414082972141e-05, "loss": 0.7601, "step": 411 }, { "epoch": 0.7316315205327414, "grad_norm": 0.38603028655052185, "learning_rate": 3.97940464376787e-05, "loss": 0.768, "step": 412 }, { "epoch": 0.7334073251942287, "grad_norm": 0.347781240940094, "learning_rate": 3.9306209937284346e-05, "loss": 0.7255, "step": 413 }, { "epoch": 0.7351831298557159, "grad_norm": 0.3760242462158203, "learning_rate": 3.882064962377154e-05, "loss": 0.7371, "step": 414 }, { "epoch": 0.7369589345172031, "grad_norm": 0.359371542930603, "learning_rate": 3.83373837070101e-05, "loss": 0.7422, "step": 415 }, { "epoch": 0.7387347391786904, "grad_norm": 0.3574449419975281, "learning_rate": 3.7856430310823545e-05, "loss": 0.6915, "step": 416 }, { "epoch": 0.7405105438401776, "grad_norm": 0.3730245530605316, "learning_rate": 3.737780747230941e-05, "loss": 0.7309, "step": 417 }, { "epoch": 0.7422863485016649, "grad_norm": 0.36496400833129883, "learning_rate": 3.69015331411628e-05, "loss": 0.7245, "step": 418 }, { "epoch": 0.744062153163152, "grad_norm": 0.3593985140323639, "learning_rate": 3.642762517900322e-05, "loss": 0.6389, "step": 419 }, { "epoch": 0.7458379578246392, "grad_norm": 0.3603939116001129, "learning_rate": 3.595610135870472e-05, "loss": 0.703, "step": 420 }, { "epoch": 0.7476137624861265, "grad_norm": 0.397124320268631, "learning_rate": 3.548697936372937e-05, "loss": 0.7265, "step": 421 }, { "epoch": 0.7493895671476137, "grad_norm": 0.4071907103061676, "learning_rate": 3.5020276787464056e-05, "loss": 0.6752, "step": 422 }, { "epoch": 0.751165371809101, "grad_norm": 0.3834024965763092, "learning_rate": 3.455601113256073e-05, "loss": 0.6297, "step": 423 }, { "epoch": 0.751165371809101, "eval_loss": 0.7374839186668396, "eval_runtime": 156.6123, "eval_samples_per_second": 6.06, "eval_steps_per_second": 1.52, "step": 423 }, { "epoch": 0.7529411764705882, "grad_norm": 0.44857800006866455, "learning_rate": 3.4094199810279924e-05, "loss": 0.6288, "step": 424 }, { "epoch": 0.7547169811320755, "grad_norm": 0.564264178276062, "learning_rate": 3.363486013983788e-05, "loss": 0.784, "step": 425 }, { "epoch": 0.7564927857935627, "grad_norm": 0.26752522587776184, "learning_rate": 3.317800934775696e-05, "loss": 0.8435, "step": 426 }, { "epoch": 0.7582685904550499, "grad_norm": 0.2690868377685547, "learning_rate": 3.2723664567219626e-05, "loss": 0.8125, "step": 427 }, { "epoch": 0.7600443951165372, "grad_norm": 0.28497472405433655, "learning_rate": 3.227184283742591e-05, "loss": 0.7349, "step": 428 }, { "epoch": 0.7618201997780244, "grad_norm": 0.3184243142604828, "learning_rate": 3.182256110295437e-05, "loss": 0.8037, "step": 429 }, { "epoch": 0.7635960044395117, "grad_norm": 0.29851233959198, "learning_rate": 3.137583621312665e-05, "loss": 0.7631, "step": 430 }, { "epoch": 0.7653718091009989, "grad_norm": 0.31429582834243774, "learning_rate": 3.093168492137557e-05, "loss": 0.7446, "step": 431 }, { "epoch": 0.7671476137624861, "grad_norm": 0.3191598653793335, "learning_rate": 3.0490123884616796e-05, "loss": 0.7301, "step": 432 }, { "epoch": 0.7689234184239734, "grad_norm": 0.32959750294685364, "learning_rate": 3.0051169662624225e-05, "loss": 0.7891, "step": 433 }, { "epoch": 0.7706992230854606, "grad_norm": 0.306445449590683, "learning_rate": 2.9614838717408867e-05, "loss": 0.7205, "step": 434 }, { "epoch": 0.7724750277469479, "grad_norm": 0.3152560889720917, "learning_rate": 2.9181147412601562e-05, "loss": 0.7436, "step": 435 }, { "epoch": 0.7742508324084351, "grad_norm": 0.3343624174594879, "learning_rate": 2.8750112012839214e-05, "loss": 0.7004, "step": 436 }, { "epoch": 0.7760266370699223, "grad_norm": 0.3389514982700348, "learning_rate": 2.8321748683154893e-05, "loss": 0.7037, "step": 437 }, { "epoch": 0.7778024417314096, "grad_norm": 0.36133134365081787, "learning_rate": 2.789607348837153e-05, "loss": 0.7648, "step": 438 }, { "epoch": 0.7795782463928967, "grad_norm": 0.3495211601257324, "learning_rate": 2.7473102392499518e-05, "loss": 0.7668, "step": 439 }, { "epoch": 0.781354051054384, "grad_norm": 0.3677636384963989, "learning_rate": 2.7052851258137935e-05, "loss": 0.7267, "step": 440 }, { "epoch": 0.7831298557158712, "grad_norm": 0.36717966198921204, "learning_rate": 2.6635335845879737e-05, "loss": 0.7577, "step": 441 }, { "epoch": 0.7849056603773585, "grad_norm": 0.38047298789024353, "learning_rate": 2.622057181372063e-05, "loss": 0.6682, "step": 442 }, { "epoch": 0.7866814650388457, "grad_norm": 0.37885257601737976, "learning_rate": 2.5808574716471856e-05, "loss": 0.7454, "step": 443 }, { "epoch": 0.7884572697003329, "grad_norm": 0.37553516030311584, "learning_rate": 2.5399360005176886e-05, "loss": 0.6721, "step": 444 }, { "epoch": 0.7902330743618202, "grad_norm": 0.42358729243278503, "learning_rate": 2.4992943026531935e-05, "loss": 0.7339, "step": 445 }, { "epoch": 0.7920088790233074, "grad_norm": 0.3923121392726898, "learning_rate": 2.4589339022310386e-05, "loss": 0.6952, "step": 446 }, { "epoch": 0.7937846836847947, "grad_norm": 0.39447784423828125, "learning_rate": 2.4188563128791254e-05, "loss": 0.6898, "step": 447 }, { "epoch": 0.7955604883462819, "grad_norm": 0.4486071467399597, "learning_rate": 2.379063037619146e-05, "loss": 0.6485, "step": 448 }, { "epoch": 0.7973362930077691, "grad_norm": 0.47466063499450684, "learning_rate": 2.339555568810221e-05, "loss": 0.6961, "step": 449 }, { "epoch": 0.7991120976692564, "grad_norm": 0.5662741661071777, "learning_rate": 2.300335388092929e-05, "loss": 0.7295, "step": 450 }, { "epoch": 0.8008879023307436, "grad_norm": 0.28682488203048706, "learning_rate": 2.2614039663337417e-05, "loss": 0.8068, "step": 451 }, { "epoch": 0.8026637069922309, "grad_norm": 0.2780812680721283, "learning_rate": 2.222762763569862e-05, "loss": 0.8236, "step": 452 }, { "epoch": 0.8044395116537181, "grad_norm": 0.29560670256614685, "learning_rate": 2.184413228954468e-05, "loss": 0.7894, "step": 453 }, { "epoch": 0.8062153163152054, "grad_norm": 0.2896682918071747, "learning_rate": 2.1463568007023704e-05, "loss": 0.7534, "step": 454 }, { "epoch": 0.8079911209766926, "grad_norm": 0.2878231108188629, "learning_rate": 2.1085949060360654e-05, "loss": 0.7245, "step": 455 }, { "epoch": 0.8097669256381798, "grad_norm": 0.3214217722415924, "learning_rate": 2.0711289611322204e-05, "loss": 0.7731, "step": 456 }, { "epoch": 0.8115427302996671, "grad_norm": 0.3201158940792084, "learning_rate": 2.033960371068557e-05, "loss": 0.7475, "step": 457 }, { "epoch": 0.8133185349611542, "grad_norm": 0.33665937185287476, "learning_rate": 1.9970905297711606e-05, "loss": 0.721, "step": 458 }, { "epoch": 0.8150943396226416, "grad_norm": 0.3305956721305847, "learning_rate": 1.9605208199621995e-05, "loss": 0.7249, "step": 459 }, { "epoch": 0.8168701442841287, "grad_norm": 0.3384665548801422, "learning_rate": 1.924252613108073e-05, "loss": 0.724, "step": 460 }, { "epoch": 0.8186459489456159, "grad_norm": 0.359944224357605, "learning_rate": 1.888287269367979e-05, "loss": 0.7516, "step": 461 }, { "epoch": 0.8204217536071032, "grad_norm": 0.3357195556163788, "learning_rate": 1.8526261375428955e-05, "loss": 0.7327, "step": 462 }, { "epoch": 0.8221975582685904, "grad_norm": 0.34272313117980957, "learning_rate": 1.8172705550250092e-05, "loss": 0.7161, "step": 463 }, { "epoch": 0.8239733629300777, "grad_norm": 0.32507383823394775, "learning_rate": 1.7822218477475494e-05, "loss": 0.6392, "step": 464 }, { "epoch": 0.8257491675915649, "grad_norm": 0.37359514832496643, "learning_rate": 1.7474813301350666e-05, "loss": 0.7298, "step": 465 }, { "epoch": 0.8275249722530522, "grad_norm": 0.3348104655742645, "learning_rate": 1.7130503050541368e-05, "loss": 0.6568, "step": 466 }, { "epoch": 0.8293007769145394, "grad_norm": 0.3460003435611725, "learning_rate": 1.6789300637645e-05, "loss": 0.6742, "step": 467 }, { "epoch": 0.8310765815760266, "grad_norm": 0.3993259370326996, "learning_rate": 1.6451218858706374e-05, "loss": 0.7365, "step": 468 }, { "epoch": 0.8328523862375139, "grad_norm": 0.3710460960865021, "learning_rate": 1.6116270392737754e-05, "loss": 0.699, "step": 469 }, { "epoch": 0.8346281908990011, "grad_norm": 0.41104644536972046, "learning_rate": 1.578446780124344e-05, "loss": 0.7185, "step": 470 }, { "epoch": 0.8364039955604884, "grad_norm": 0.39822298288345337, "learning_rate": 1.5455823527748626e-05, "loss": 0.6968, "step": 471 }, { "epoch": 0.8381798002219756, "grad_norm": 0.3909063935279846, "learning_rate": 1.5130349897332763e-05, "loss": 0.6427, "step": 472 }, { "epoch": 0.8399556048834628, "grad_norm": 0.39908355474472046, "learning_rate": 1.4808059116167305e-05, "loss": 0.6307, "step": 473 }, { "epoch": 0.8417314095449501, "grad_norm": 0.4725106954574585, "learning_rate": 1.4488963271057943e-05, "loss": 0.7274, "step": 474 }, { "epoch": 0.8435072142064373, "grad_norm": 0.58518385887146, "learning_rate": 1.4173074328991377e-05, "loss": 0.7112, "step": 475 }, { "epoch": 0.8452830188679246, "grad_norm": 0.2664174735546112, "learning_rate": 1.3860404136686411e-05, "loss": 0.8515, "step": 476 }, { "epoch": 0.8470588235294118, "grad_norm": 0.30460578203201294, "learning_rate": 1.355096442014977e-05, "loss": 0.8107, "step": 477 }, { "epoch": 0.848834628190899, "grad_norm": 0.28965044021606445, "learning_rate": 1.3244766784236307e-05, "loss": 0.7361, "step": 478 }, { "epoch": 0.8506104328523862, "grad_norm": 0.3329102396965027, "learning_rate": 1.294182271221377e-05, "loss": 0.7712, "step": 479 }, { "epoch": 0.8523862375138734, "grad_norm": 0.30333012342453003, "learning_rate": 1.2642143565332154e-05, "loss": 0.7245, "step": 480 }, { "epoch": 0.8541620421753607, "grad_norm": 0.328744500875473, "learning_rate": 1.2345740582397648e-05, "loss": 0.7557, "step": 481 }, { "epoch": 0.8559378468368479, "grad_norm": 0.33845219016075134, "learning_rate": 1.2052624879351104e-05, "loss": 0.7857, "step": 482 }, { "epoch": 0.8577136514983352, "grad_norm": 0.3305346667766571, "learning_rate": 1.176280744885121e-05, "loss": 0.7512, "step": 483 }, { "epoch": 0.8594894561598224, "grad_norm": 0.340707391500473, "learning_rate": 1.1476299159862203e-05, "loss": 0.7678, "step": 484 }, { "epoch": 0.8612652608213096, "grad_norm": 0.3646427392959595, "learning_rate": 1.119311075724625e-05, "loss": 0.7473, "step": 485 }, { "epoch": 0.8630410654827969, "grad_norm": 0.35034942626953125, "learning_rate": 1.09132528613605e-05, "loss": 0.7679, "step": 486 }, { "epoch": 0.8648168701442841, "grad_norm": 0.35471147298812866, "learning_rate": 1.0636735967658784e-05, "loss": 0.7416, "step": 487 }, { "epoch": 0.8665926748057714, "grad_norm": 0.3597537875175476, "learning_rate": 1.0363570446297999e-05, "loss": 0.7125, "step": 488 }, { "epoch": 0.8683684794672586, "grad_norm": 0.35092103481292725, "learning_rate": 1.0093766541749205e-05, "loss": 0.692, "step": 489 }, { "epoch": 0.8701442841287459, "grad_norm": 0.35275334119796753, "learning_rate": 9.827334372413444e-06, "loss": 0.6683, "step": 490 }, { "epoch": 0.8719200887902331, "grad_norm": 0.3727843463420868, "learning_rate": 9.564283930242257e-06, "loss": 0.665, "step": 491 }, { "epoch": 0.8736958934517203, "grad_norm": 0.3570787310600281, "learning_rate": 9.30462508036294e-06, "loss": 0.6736, "step": 492 }, { "epoch": 0.8754716981132076, "grad_norm": 0.39428988099098206, "learning_rate": 9.048367560708604e-06, "loss": 0.7076, "step": 493 }, { "epoch": 0.8772475027746948, "grad_norm": 0.3717636168003082, "learning_rate": 8.795520981652961e-06, "loss": 0.6807, "step": 494 }, { "epoch": 0.8790233074361821, "grad_norm": 0.4105593264102936, "learning_rate": 8.546094825649908e-06, "loss": 0.7068, "step": 495 }, { "epoch": 0.8807991120976693, "grad_norm": 0.45720747113227844, "learning_rate": 8.300098446877923e-06, "loss": 0.7189, "step": 496 }, { "epoch": 0.8825749167591564, "grad_norm": 0.44911620020866394, "learning_rate": 8.05754107088923e-06, "loss": 0.6891, "step": 497 }, { "epoch": 0.8843507214206437, "grad_norm": 0.4414433240890503, "learning_rate": 7.818431794263836e-06, "loss": 0.7167, "step": 498 }, { "epoch": 0.8861265260821309, "grad_norm": 0.49208080768585205, "learning_rate": 7.582779584268373e-06, "loss": 0.7084, "step": 499 }, { "epoch": 0.8879023307436182, "grad_norm": 0.5831857323646545, "learning_rate": 7.350593278519824e-06, "loss": 0.7877, "step": 500 }, { "epoch": 0.8896781354051054, "grad_norm": 0.25118499994277954, "learning_rate": 7.121881584654056e-06, "loss": 0.8006, "step": 501 }, { "epoch": 0.8914539400665926, "grad_norm": 0.2842087745666504, "learning_rate": 6.896653079999249e-06, "loss": 0.7796, "step": 502 }, { "epoch": 0.8932297447280799, "grad_norm": 0.2935945391654968, "learning_rate": 6.674916211254289e-06, "loss": 0.7614, "step": 503 }, { "epoch": 0.8950055493895671, "grad_norm": 0.3194078505039215, "learning_rate": 6.45667929417193e-06, "loss": 0.7537, "step": 504 }, { "epoch": 0.8967813540510544, "grad_norm": 0.32085007429122925, "learning_rate": 6.2419505132469305e-06, "loss": 0.7843, "step": 505 }, { "epoch": 0.8985571587125416, "grad_norm": 0.32116949558258057, "learning_rate": 6.030737921409169e-06, "loss": 0.736, "step": 506 }, { "epoch": 0.9003329633740289, "grad_norm": 0.32136133313179016, "learning_rate": 5.823049439721561e-06, "loss": 0.7388, "step": 507 }, { "epoch": 0.9021087680355161, "grad_norm": 0.33068105578422546, "learning_rate": 5.618892857083069e-06, "loss": 0.6994, "step": 508 }, { "epoch": 0.9038845726970033, "grad_norm": 0.3484742343425751, "learning_rate": 5.418275829936537e-06, "loss": 0.7431, "step": 509 }, { "epoch": 0.9056603773584906, "grad_norm": 0.35299912095069885, "learning_rate": 5.221205881981595e-06, "loss": 0.7568, "step": 510 }, { "epoch": 0.9074361820199778, "grad_norm": 0.34243056178092957, "learning_rate": 5.02769040389246e-06, "loss": 0.6817, "step": 511 }, { "epoch": 0.9092119866814651, "grad_norm": 0.38018926978111267, "learning_rate": 4.8377366530408254e-06, "loss": 0.759, "step": 512 }, { "epoch": 0.9109877913429523, "grad_norm": 0.36619412899017334, "learning_rate": 4.65135175322361e-06, "loss": 0.7202, "step": 513 }, { "epoch": 0.9127635960044395, "grad_norm": 0.38696765899658203, "learning_rate": 4.468542694395861e-06, "loss": 0.7202, "step": 514 }, { "epoch": 0.9145394006659268, "grad_norm": 0.39391985535621643, "learning_rate": 4.2893163324085885e-06, "loss": 0.7091, "step": 515 }, { "epoch": 0.916315205327414, "grad_norm": 0.37037429213523865, "learning_rate": 4.1136793887516345e-06, "loss": 0.6974, "step": 516 }, { "epoch": 0.9180910099889013, "grad_norm": 0.39087411761283875, "learning_rate": 3.941638450301644e-06, "loss": 0.7328, "step": 517 }, { "epoch": 0.9198668146503884, "grad_norm": 0.38174766302108765, "learning_rate": 3.7731999690749585e-06, "loss": 0.7184, "step": 518 }, { "epoch": 0.9216426193118757, "grad_norm": 0.3953598737716675, "learning_rate": 3.6083702619857605e-06, "loss": 0.7121, "step": 519 }, { "epoch": 0.9234184239733629, "grad_norm": 0.39504143595695496, "learning_rate": 3.447155510609057e-06, "loss": 0.6665, "step": 520 }, { "epoch": 0.9251942286348501, "grad_norm": 0.4065762162208557, "learning_rate": 3.2895617609489336e-06, "loss": 0.7019, "step": 521 }, { "epoch": 0.9269700332963374, "grad_norm": 0.39577242732048035, "learning_rate": 3.135594923211771e-06, "loss": 0.6444, "step": 522 }, { "epoch": 0.9287458379578246, "grad_norm": 0.4378613531589508, "learning_rate": 2.9852607715846193e-06, "loss": 0.7066, "step": 523 }, { "epoch": 0.9305216426193119, "grad_norm": 0.4423007369041443, "learning_rate": 2.838564944018618e-06, "loss": 0.6555, "step": 524 }, { "epoch": 0.9322974472807991, "grad_norm": 0.5693342089653015, "learning_rate": 2.6955129420176196e-06, "loss": 0.8152, "step": 525 }, { "epoch": 0.9340732519422863, "grad_norm": 0.2593931555747986, "learning_rate": 2.556110130431788e-06, "loss": 0.813, "step": 526 }, { "epoch": 0.9358490566037736, "grad_norm": 0.30711933970451355, "learning_rate": 2.420361737256438e-06, "loss": 0.7564, "step": 527 }, { "epoch": 0.9376248612652608, "grad_norm": 0.29575708508491516, "learning_rate": 2.288272853436013e-06, "loss": 0.7813, "step": 528 }, { "epoch": 0.9394006659267481, "grad_norm": 0.3270512521266937, "learning_rate": 2.1598484326730837e-06, "loss": 0.7658, "step": 529 }, { "epoch": 0.9411764705882353, "grad_norm": 0.3134397268295288, "learning_rate": 2.035093291242607e-06, "loss": 0.7335, "step": 530 }, { "epoch": 0.9429522752497226, "grad_norm": 0.34165453910827637, "learning_rate": 1.914012107811336e-06, "loss": 0.798, "step": 531 }, { "epoch": 0.9447280799112098, "grad_norm": 0.33172738552093506, "learning_rate": 1.7966094232622855e-06, "loss": 0.7516, "step": 532 }, { "epoch": 0.946503884572697, "grad_norm": 0.35185980796813965, "learning_rate": 1.6828896405244988e-06, "loss": 0.7745, "step": 533 }, { "epoch": 0.9482796892341843, "grad_norm": 0.3368275761604309, "learning_rate": 1.572857024407881e-06, "loss": 0.749, "step": 534 }, { "epoch": 0.9500554938956715, "grad_norm": 0.3556804656982422, "learning_rate": 1.466515701443294e-06, "loss": 0.737, "step": 535 }, { "epoch": 0.9518312985571588, "grad_norm": 0.35035377740859985, "learning_rate": 1.3638696597277679e-06, "loss": 0.716, "step": 536 }, { "epoch": 0.953607103218646, "grad_norm": 0.356507807970047, "learning_rate": 1.2649227487749548e-06, "loss": 0.7292, "step": 537 }, { "epoch": 0.9553829078801331, "grad_norm": 0.3645875155925751, "learning_rate": 1.1696786793707781e-06, "loss": 0.7325, "step": 538 }, { "epoch": 0.9571587125416204, "grad_norm": 0.35591599345207214, "learning_rate": 1.0781410234342094e-06, "loss": 0.7203, "step": 539 }, { "epoch": 0.9589345172031076, "grad_norm": 0.35885104537010193, "learning_rate": 9.90313213883376e-07, "loss": 0.665, "step": 540 }, { "epoch": 0.9607103218645949, "grad_norm": 0.38247016072273254, "learning_rate": 9.061985445067756e-07, "loss": 0.6885, "step": 541 }, { "epoch": 0.9624861265260821, "grad_norm": 0.38850679993629456, "learning_rate": 8.258001698397744e-07, "loss": 0.707, "step": 542 }, { "epoch": 0.9642619311875694, "grad_norm": 0.3912898004055023, "learning_rate": 7.491211050462798e-07, "loss": 0.6818, "step": 543 }, { "epoch": 0.9660377358490566, "grad_norm": 0.3983571529388428, "learning_rate": 6.761642258056978e-07, "loss": 0.6786, "step": 544 }, { "epoch": 0.9678135405105438, "grad_norm": 0.4406982660293579, "learning_rate": 6.069322682050516e-07, "loss": 0.6564, "step": 545 }, { "epoch": 0.9695893451720311, "grad_norm": 0.38563239574432373, "learning_rate": 5.414278286363761e-07, "loss": 0.5921, "step": 546 }, { "epoch": 0.9713651498335183, "grad_norm": 0.4474928081035614, "learning_rate": 4.796533636993727e-07, "loss": 0.681, "step": 547 }, { "epoch": 0.9731409544950056, "grad_norm": 0.43392056226730347, "learning_rate": 4.216111901092501e-07, "loss": 0.6673, "step": 548 }, { "epoch": 0.9749167591564928, "grad_norm": 0.5074283480644226, "learning_rate": 3.6730348460985996e-07, "loss": 0.7363, "step": 549 }, { "epoch": 0.97669256381798, "grad_norm": 0.5351345539093018, "learning_rate": 3.1673228389204055e-07, "loss": 0.6898, "step": 550 }, { "epoch": 0.9784683684794673, "grad_norm": 0.2640553414821625, "learning_rate": 2.6989948451726643e-07, "loss": 0.7556, "step": 551 }, { "epoch": 0.9802441731409545, "grad_norm": 0.28559839725494385, "learning_rate": 2.2680684284650533e-07, "loss": 0.7636, "step": 552 }, { "epoch": 0.9820199778024418, "grad_norm": 0.3131345510482788, "learning_rate": 1.8745597497433765e-07, "loss": 0.7366, "step": 553 }, { "epoch": 0.983795782463929, "grad_norm": 0.3156748116016388, "learning_rate": 1.518483566683826e-07, "loss": 0.7377, "step": 554 }, { "epoch": 0.9855715871254163, "grad_norm": 0.34887486696243286, "learning_rate": 1.199853233138981e-07, "loss": 0.7588, "step": 555 }, { "epoch": 0.9873473917869034, "grad_norm": 0.3328061103820801, "learning_rate": 9.186806986376529e-08, "loss": 0.744, "step": 556 }, { "epoch": 0.9891231964483906, "grad_norm": 0.36429402232170105, "learning_rate": 6.749765079363534e-08, "loss": 0.7261, "step": 557 }, { "epoch": 0.9908990011098779, "grad_norm": 0.36065101623535156, "learning_rate": 4.687498006236135e-08, "loss": 0.6815, "step": 558 }, { "epoch": 0.9926748057713651, "grad_norm": 0.3841758966445923, "learning_rate": 3.000083107780327e-08, "loss": 0.7204, "step": 559 }, { "epoch": 0.9944506104328524, "grad_norm": 0.43167850375175476, "learning_rate": 1.687583666772907e-08, "loss": 0.725, "step": 560 }, { "epoch": 0.9962264150943396, "grad_norm": 0.4131225347518921, "learning_rate": 7.500489056133652e-09, "loss": 0.6763, "step": 561 }, { "epoch": 0.9980022197558268, "grad_norm": 0.4707167446613312, "learning_rate": 1.8751398447758306e-09, "loss": 0.6854, "step": 562 }, { "epoch": 0.9997780244173141, "grad_norm": 0.4748234152793884, "learning_rate": 0.0, "loss": 0.6381, "step": 563 } ], "logging_steps": 1, "max_steps": 563, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 141, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 7.415142496788808e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }