{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.2503884572697003,
  "eval_steps": 141,
  "global_step": 141,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0017758046614872365,
      "grad_norm": 0.40501952171325684,
      "learning_rate": 4.000000000000001e-06,
      "loss": 1.1387,
      "step": 1
    },
    {
      "epoch": 0.0017758046614872365,
      "eval_loss": 1.4082584381103516,
      "eval_runtime": 167.7664,
      "eval_samples_per_second": 5.657,
      "eval_steps_per_second": 1.419,
      "step": 1
    },
    {
      "epoch": 0.003551609322974473,
      "grad_norm": 0.491682767868042,
      "learning_rate": 8.000000000000001e-06,
      "loss": 1.2151,
      "step": 2
    },
    {
      "epoch": 0.005327413984461709,
      "grad_norm": 0.49752455949783325,
      "learning_rate": 1.2e-05,
      "loss": 1.1941,
      "step": 3
    },
    {
      "epoch": 0.007103218645948946,
      "grad_norm": 0.5617953538894653,
      "learning_rate": 1.6000000000000003e-05,
      "loss": 1.2472,
      "step": 4
    },
    {
      "epoch": 0.008879023307436182,
      "grad_norm": 0.646000862121582,
      "learning_rate": 2e-05,
      "loss": 1.2767,
      "step": 5
    },
    {
      "epoch": 0.010654827968923418,
      "grad_norm": 0.6190630197525024,
      "learning_rate": 2.4e-05,
      "loss": 1.2839,
      "step": 6
    },
    {
      "epoch": 0.012430632630410655,
      "grad_norm": 0.6891798973083496,
      "learning_rate": 2.8000000000000003e-05,
      "loss": 1.2914,
      "step": 7
    },
    {
      "epoch": 0.014206437291897892,
      "grad_norm": 0.6742885708808899,
      "learning_rate": 3.2000000000000005e-05,
      "loss": 1.3001,
      "step": 8
    },
    {
      "epoch": 0.01598224195338513,
      "grad_norm": 0.693493664264679,
      "learning_rate": 3.6e-05,
      "loss": 1.2673,
      "step": 9
    },
    {
      "epoch": 0.017758046614872364,
      "grad_norm": 0.7951493859291077,
      "learning_rate": 4e-05,
      "loss": 1.3314,
      "step": 10
    },
    {
      "epoch": 0.0195338512763596,
      "grad_norm": 0.7866435050964355,
      "learning_rate": 4.4000000000000006e-05,
      "loss": 1.2703,
      "step": 11
    },
    {
      "epoch": 0.021309655937846835,
      "grad_norm": 0.7218112349510193,
      "learning_rate": 4.8e-05,
      "loss": 1.2542,
      "step": 12
    },
    {
      "epoch": 0.023085460599334074,
      "grad_norm": 0.6838662028312683,
      "learning_rate": 5.2000000000000004e-05,
      "loss": 1.2432,
      "step": 13
    },
    {
      "epoch": 0.02486126526082131,
      "grad_norm": 0.6592800617218018,
      "learning_rate": 5.6000000000000006e-05,
      "loss": 1.2374,
      "step": 14
    },
    {
      "epoch": 0.026637069922308545,
      "grad_norm": 0.513134241104126,
      "learning_rate": 6e-05,
      "loss": 1.246,
      "step": 15
    },
    {
      "epoch": 0.028412874583795784,
      "grad_norm": 0.5785119533538818,
      "learning_rate": 6.400000000000001e-05,
      "loss": 1.169,
      "step": 16
    },
    {
      "epoch": 0.03018867924528302,
      "grad_norm": 0.6144536733627319,
      "learning_rate": 6.800000000000001e-05,
      "loss": 1.1532,
      "step": 17
    },
    {
      "epoch": 0.03196448390677026,
      "grad_norm": 0.674633800983429,
      "learning_rate": 7.2e-05,
      "loss": 1.1175,
      "step": 18
    },
    {
      "epoch": 0.03374028856825749,
      "grad_norm": 0.5997682809829712,
      "learning_rate": 7.6e-05,
      "loss": 1.092,
      "step": 19
    },
    {
      "epoch": 0.03551609322974473,
      "grad_norm": 0.5651845335960388,
      "learning_rate": 8e-05,
      "loss": 1.0543,
      "step": 20
    },
    {
      "epoch": 0.03729189789123197,
      "grad_norm": 0.562713623046875,
      "learning_rate": 8.4e-05,
      "loss": 1.0377,
      "step": 21
    },
    {
      "epoch": 0.0390677025527192,
      "grad_norm": 0.5826591849327087,
      "learning_rate": 8.800000000000001e-05,
      "loss": 1.0178,
      "step": 22
    },
    {
      "epoch": 0.04084350721420644,
      "grad_norm": 0.5972415208816528,
      "learning_rate": 9.200000000000001e-05,
      "loss": 0.9916,
      "step": 23
    },
    {
      "epoch": 0.04261931187569367,
      "grad_norm": 0.6266159415245056,
      "learning_rate": 9.6e-05,
      "loss": 1.0026,
      "step": 24
    },
    {
      "epoch": 0.04439511653718091,
      "grad_norm": 0.7757481932640076,
      "learning_rate": 0.0001,
      "loss": 1.0245,
      "step": 25
    },
    {
      "epoch": 0.04617092119866815,
      "grad_norm": 0.6832363605499268,
      "learning_rate": 0.00010400000000000001,
      "loss": 0.9408,
      "step": 26
    },
    {
      "epoch": 0.04794672586015538,
      "grad_norm": 1.2983894348144531,
      "learning_rate": 0.00010800000000000001,
      "loss": 0.9228,
      "step": 27
    },
    {
      "epoch": 0.04972253052164262,
      "grad_norm": 0.9382829666137695,
      "learning_rate": 0.00011200000000000001,
      "loss": 0.9605,
      "step": 28
    },
    {
      "epoch": 0.05149833518312986,
      "grad_norm": 0.5051376819610596,
      "learning_rate": 0.000116,
      "loss": 0.9769,
      "step": 29
    },
    {
      "epoch": 0.05327413984461709,
      "grad_norm": 0.40853050351142883,
      "learning_rate": 0.00012,
      "loss": 0.8912,
      "step": 30
    },
    {
      "epoch": 0.05504994450610433,
      "grad_norm": 0.4261438846588135,
      "learning_rate": 0.000124,
      "loss": 0.9438,
      "step": 31
    },
    {
      "epoch": 0.05682574916759157,
      "grad_norm": 0.44900333881378174,
      "learning_rate": 0.00012800000000000002,
      "loss": 0.9589,
      "step": 32
    },
    {
      "epoch": 0.0586015538290788,
      "grad_norm": 0.4262010157108307,
      "learning_rate": 0.000132,
      "loss": 0.8775,
      "step": 33
    },
    {
      "epoch": 0.06037735849056604,
      "grad_norm": 0.40672022104263306,
      "learning_rate": 0.00013600000000000003,
      "loss": 0.8956,
      "step": 34
    },
    {
      "epoch": 0.06215316315205328,
      "grad_norm": 0.39336153864860535,
      "learning_rate": 0.00014,
      "loss": 0.866,
      "step": 35
    },
    {
      "epoch": 0.06392896781354052,
      "grad_norm": 0.40699368715286255,
      "learning_rate": 0.000144,
      "loss": 0.8612,
      "step": 36
    },
    {
      "epoch": 0.06570477247502775,
      "grad_norm": 0.438643217086792,
      "learning_rate": 0.000148,
      "loss": 0.922,
      "step": 37
    },
    {
      "epoch": 0.06748057713651498,
      "grad_norm": 0.45053544640541077,
      "learning_rate": 0.000152,
      "loss": 0.8681,
      "step": 38
    },
    {
      "epoch": 0.06925638179800223,
      "grad_norm": 0.4289852976799011,
      "learning_rate": 0.00015600000000000002,
      "loss": 0.9071,
      "step": 39
    },
    {
      "epoch": 0.07103218645948946,
      "grad_norm": 0.4101032316684723,
      "learning_rate": 0.00016,
      "loss": 0.8692,
      "step": 40
    },
    {
      "epoch": 0.07280799112097669,
      "grad_norm": 0.418319433927536,
      "learning_rate": 0.000164,
      "loss": 0.8654,
      "step": 41
    },
    {
      "epoch": 0.07458379578246394,
      "grad_norm": 0.41637811064720154,
      "learning_rate": 0.000168,
      "loss": 0.8348,
      "step": 42
    },
    {
      "epoch": 0.07635960044395117,
      "grad_norm": 0.40830302238464355,
      "learning_rate": 0.000172,
      "loss": 0.8928,
      "step": 43
    },
    {
      "epoch": 0.0781354051054384,
      "grad_norm": 0.4163912236690521,
      "learning_rate": 0.00017600000000000002,
      "loss": 0.8793,
      "step": 44
    },
    {
      "epoch": 0.07991120976692564,
      "grad_norm": 0.4240954518318176,
      "learning_rate": 0.00018,
      "loss": 0.9029,
      "step": 45
    },
    {
      "epoch": 0.08168701442841288,
      "grad_norm": 0.48420408368110657,
      "learning_rate": 0.00018400000000000003,
      "loss": 0.8632,
      "step": 46
    },
    {
      "epoch": 0.08346281908990011,
      "grad_norm": 0.5267483592033386,
      "learning_rate": 0.000188,
      "loss": 0.8575,
      "step": 47
    },
    {
      "epoch": 0.08523862375138734,
      "grad_norm": 0.4947332441806793,
      "learning_rate": 0.000192,
      "loss": 0.9051,
      "step": 48
    },
    {
      "epoch": 0.08701442841287459,
      "grad_norm": 0.5025691986083984,
      "learning_rate": 0.000196,
      "loss": 0.9145,
      "step": 49
    },
    {
      "epoch": 0.08879023307436182,
      "grad_norm": 0.5430313944816589,
      "learning_rate": 0.0002,
      "loss": 0.8954,
      "step": 50
    },
    {
      "epoch": 0.09056603773584905,
      "grad_norm": 0.45721662044525146,
      "learning_rate": 0.00019999812486015523,
      "loss": 0.9655,
      "step": 51
    },
    {
      "epoch": 0.0923418423973363,
      "grad_norm": 0.4364672899246216,
      "learning_rate": 0.00019999249951094388,
      "loss": 0.9318,
      "step": 52
    },
    {
      "epoch": 0.09411764705882353,
      "grad_norm": 0.38933759927749634,
      "learning_rate": 0.00019998312416333227,
      "loss": 0.8963,
      "step": 53
    },
    {
      "epoch": 0.09589345172031076,
      "grad_norm": 0.35572728514671326,
      "learning_rate": 0.0001999699991689222,
      "loss": 0.9073,
      "step": 54
    },
    {
      "epoch": 0.097669256381798,
      "grad_norm": 0.3042948544025421,
      "learning_rate": 0.00019995312501993765,
      "loss": 0.8751,
      "step": 55
    },
    {
      "epoch": 0.09944506104328524,
      "grad_norm": 0.32266151905059814,
      "learning_rate": 0.00019993250234920636,
      "loss": 0.8493,
      "step": 56
    },
    {
      "epoch": 0.10122086570477247,
      "grad_norm": 0.31894031167030334,
      "learning_rate": 0.00019990813193013625,
      "loss": 0.8512,
      "step": 57
    },
    {
      "epoch": 0.10299667036625972,
      "grad_norm": 0.33073991537094116,
      "learning_rate": 0.0001998800146766861,
      "loss": 0.8424,
      "step": 58
    },
    {
      "epoch": 0.10477247502774695,
      "grad_norm": 0.32064828276634216,
      "learning_rate": 0.00019984815164333163,
      "loss": 0.8698,
      "step": 59
    },
    {
      "epoch": 0.10654827968923418,
      "grad_norm": 0.3364376425743103,
      "learning_rate": 0.00019981254402502566,
      "loss": 0.8525,
      "step": 60
    },
    {
      "epoch": 0.10832408435072143,
      "grad_norm": 0.31403639912605286,
      "learning_rate": 0.0001997731931571535,
      "loss": 0.8309,
      "step": 61
    },
    {
      "epoch": 0.11009988901220866,
      "grad_norm": 0.3375100791454315,
      "learning_rate": 0.00019973010051548275,
      "loss": 0.8573,
      "step": 62
    },
    {
      "epoch": 0.11187569367369589,
      "grad_norm": 0.3584939241409302,
      "learning_rate": 0.00019968326771610797,
      "loss": 0.8479,
      "step": 63
    },
    {
      "epoch": 0.11365149833518313,
      "grad_norm": 0.35480472445487976,
      "learning_rate": 0.00019963269651539017,
      "loss": 0.845,
      "step": 64
    },
    {
      "epoch": 0.11542730299667037,
      "grad_norm": 0.33250972628593445,
      "learning_rate": 0.00019957838880989078,
      "loss": 0.8438,
      "step": 65
    },
    {
      "epoch": 0.1172031076581576,
      "grad_norm": 0.39302438497543335,
      "learning_rate": 0.00019952034663630062,
      "loss": 0.8391,
      "step": 66
    },
    {
      "epoch": 0.11897891231964484,
      "grad_norm": 0.3517158031463623,
      "learning_rate": 0.00019945857217136363,
      "loss": 0.7966,
      "step": 67
    },
    {
      "epoch": 0.12075471698113208,
      "grad_norm": 0.38860467076301575,
      "learning_rate": 0.00019939306773179497,
      "loss": 0.8279,
      "step": 68
    },
    {
      "epoch": 0.12253052164261931,
      "grad_norm": 0.3762984573841095,
      "learning_rate": 0.00019932383577419432,
      "loss": 0.7848,
      "step": 69
    },
    {
      "epoch": 0.12430632630410655,
      "grad_norm": 0.4535103440284729,
      "learning_rate": 0.00019925087889495374,
      "loss": 0.8,
      "step": 70
    },
    {
      "epoch": 0.12608213096559379,
      "grad_norm": 0.4869844317436218,
      "learning_rate": 0.00019917419983016025,
      "loss": 0.8442,
      "step": 71
    },
    {
      "epoch": 0.12785793562708103,
      "grad_norm": 0.4379689395427704,
      "learning_rate": 0.00019909380145549324,
      "loss": 0.8353,
      "step": 72
    },
    {
      "epoch": 0.12963374028856825,
      "grad_norm": 0.39510270953178406,
      "learning_rate": 0.00019900968678611666,
      "loss": 0.8538,
      "step": 73
    },
    {
      "epoch": 0.1314095449500555,
      "grad_norm": 0.4764181971549988,
      "learning_rate": 0.00019892185897656578,
      "loss": 0.8509,
      "step": 74
    },
    {
      "epoch": 0.13318534961154274,
      "grad_norm": 0.5591267347335815,
      "learning_rate": 0.00019883032132062925,
      "loss": 0.8661,
      "step": 75
    },
    {
      "epoch": 0.13496115427302996,
      "grad_norm": 0.41077056527137756,
      "learning_rate": 0.00019873507725122504,
      "loss": 0.9418,
      "step": 76
    },
    {
      "epoch": 0.1367369589345172,
      "grad_norm": 0.393622487783432,
      "learning_rate": 0.00019863613034027224,
      "loss": 0.926,
      "step": 77
    },
    {
      "epoch": 0.13851276359600445,
      "grad_norm": 0.36414071917533875,
      "learning_rate": 0.00019853348429855672,
      "loss": 0.8649,
      "step": 78
    },
    {
      "epoch": 0.14028856825749167,
      "grad_norm": 0.3100601136684418,
      "learning_rate": 0.00019842714297559213,
      "loss": 0.9114,
      "step": 79
    },
    {
      "epoch": 0.14206437291897892,
      "grad_norm": 0.29151105880737305,
      "learning_rate": 0.0001983171103594755,
      "loss": 0.8681,
      "step": 80
    },
    {
      "epoch": 0.14384017758046616,
      "grad_norm": 0.28398221731185913,
      "learning_rate": 0.0001982033905767377,
      "loss": 0.8515,
      "step": 81
    },
    {
      "epoch": 0.14561598224195338,
      "grad_norm": 0.2883840799331665,
      "learning_rate": 0.00019808598789218865,
      "loss": 0.8569,
      "step": 82
    },
    {
      "epoch": 0.14739178690344062,
      "grad_norm": 0.29812031984329224,
      "learning_rate": 0.0001979649067087574,
      "loss": 0.8529,
      "step": 83
    },
    {
      "epoch": 0.14916759156492787,
      "grad_norm": 0.3074108958244324,
      "learning_rate": 0.00019784015156732693,
      "loss": 0.8771,
      "step": 84
    },
    {
      "epoch": 0.1509433962264151,
      "grad_norm": 0.3601110279560089,
      "learning_rate": 0.000197711727146564,
      "loss": 0.8609,
      "step": 85
    },
    {
      "epoch": 0.15271920088790233,
      "grad_norm": 0.3126521110534668,
      "learning_rate": 0.00019757963826274357,
      "loss": 0.8162,
      "step": 86
    },
    {
      "epoch": 0.15449500554938958,
      "grad_norm": 0.3152073323726654,
      "learning_rate": 0.00019744388986956822,
      "loss": 0.7661,
      "step": 87
    },
    {
      "epoch": 0.1562708102108768,
      "grad_norm": 0.33570149540901184,
      "learning_rate": 0.00019730448705798239,
      "loss": 0.8179,
      "step": 88
    },
    {
      "epoch": 0.15804661487236404,
      "grad_norm": 0.33989331126213074,
      "learning_rate": 0.0001971614350559814,
      "loss": 0.8288,
      "step": 89
    },
    {
      "epoch": 0.1598224195338513,
      "grad_norm": 0.3292713761329651,
      "learning_rate": 0.0001970147392284154,
      "loss": 0.8415,
      "step": 90
    },
    {
      "epoch": 0.1615982241953385,
      "grad_norm": 0.3394547700881958,
      "learning_rate": 0.00019686440507678824,
      "loss": 0.8232,
      "step": 91
    },
    {
      "epoch": 0.16337402885682575,
      "grad_norm": 0.3370296061038971,
      "learning_rate": 0.0001967104382390511,
      "loss": 0.7771,
      "step": 92
    },
    {
      "epoch": 0.16514983351831297,
      "grad_norm": 0.3798193633556366,
      "learning_rate": 0.00019655284448939094,
      "loss": 0.789,
      "step": 93
    },
    {
      "epoch": 0.16692563817980022,
      "grad_norm": 0.3790013790130615,
      "learning_rate": 0.00019639162973801426,
      "loss": 0.8153,
      "step": 94
    },
    {
      "epoch": 0.16870144284128746,
      "grad_norm": 0.42274704575538635,
      "learning_rate": 0.00019622680003092503,
      "loss": 0.8012,
      "step": 95
    },
    {
      "epoch": 0.17047724750277468,
      "grad_norm": 0.4776620864868164,
      "learning_rate": 0.0001960583615496984,
      "loss": 0.8132,
      "step": 96
    },
    {
      "epoch": 0.17225305216426193,
      "grad_norm": 0.4170360565185547,
      "learning_rate": 0.00019588632061124837,
      "loss": 0.8139,
      "step": 97
    },
    {
      "epoch": 0.17402885682574917,
      "grad_norm": 0.47097012400627136,
      "learning_rate": 0.00019571068366759143,
      "loss": 0.7711,
      "step": 98
    },
    {
      "epoch": 0.1758046614872364,
      "grad_norm": 0.8176291584968567,
      "learning_rate": 0.00019553145730560415,
      "loss": 0.7906,
      "step": 99
    },
    {
      "epoch": 0.17758046614872364,
      "grad_norm": 0.7204902172088623,
      "learning_rate": 0.0001953486482467764,
      "loss": 0.9088,
      "step": 100
    },
    {
      "epoch": 0.17935627081021088,
      "grad_norm": 0.3952767252922058,
      "learning_rate": 0.0001951622633469592,
      "loss": 0.9362,
      "step": 101
    },
    {
      "epoch": 0.1811320754716981,
      "grad_norm": 0.3742019534111023,
      "learning_rate": 0.00019497230959610756,
      "loss": 0.933,
      "step": 102
    },
    {
      "epoch": 0.18290788013318535,
      "grad_norm": 0.3385975658893585,
      "learning_rate": 0.00019477879411801844,
      "loss": 0.9028,
      "step": 103
    },
    {
      "epoch": 0.1846836847946726,
      "grad_norm": 0.2950561046600342,
      "learning_rate": 0.00019458172417006347,
      "loss": 0.8245,
      "step": 104
    },
    {
      "epoch": 0.1864594894561598,
      "grad_norm": 0.30859696865081787,
      "learning_rate": 0.00019438110714291694,
      "loss": 0.8771,
      "step": 105
    },
    {
      "epoch": 0.18823529411764706,
      "grad_norm": 0.3490929901599884,
      "learning_rate": 0.00019417695056027844,
      "loss": 0.8565,
      "step": 106
    },
    {
      "epoch": 0.1900110987791343,
      "grad_norm": 0.31133994460105896,
      "learning_rate": 0.00019396926207859084,
      "loss": 0.8734,
      "step": 107
    },
    {
      "epoch": 0.19178690344062152,
      "grad_norm": 0.2884789705276489,
      "learning_rate": 0.00019375804948675306,
      "loss": 0.8645,
      "step": 108
    },
    {
      "epoch": 0.19356270810210877,
      "grad_norm": 0.2969193160533905,
      "learning_rate": 0.0001935433207058281,
      "loss": 0.8751,
      "step": 109
    },
    {
      "epoch": 0.195338512763596,
      "grad_norm": 0.41810932755470276,
      "learning_rate": 0.0001933250837887457,
      "loss": 0.8037,
      "step": 110
    },
    {
      "epoch": 0.19711431742508323,
      "grad_norm": 0.3271716833114624,
      "learning_rate": 0.00019310334692000075,
      "loss": 0.7814,
      "step": 111
    },
    {
      "epoch": 0.19889012208657048,
      "grad_norm": 0.4146140515804291,
      "learning_rate": 0.00019287811841534595,
      "loss": 0.8425,
      "step": 112
    },
    {
      "epoch": 0.20066592674805772,
      "grad_norm": 0.3369704484939575,
      "learning_rate": 0.00019264940672148018,
      "loss": 0.8301,
      "step": 113
    },
    {
      "epoch": 0.20244173140954494,
      "grad_norm": 0.32731175422668457,
      "learning_rate": 0.00019241722041573166,
      "loss": 0.7964,
      "step": 114
    },
    {
      "epoch": 0.20421753607103219,
      "grad_norm": 0.3840983510017395,
      "learning_rate": 0.0001921815682057362,
      "loss": 0.7864,
      "step": 115
    },
    {
      "epoch": 0.20599334073251943,
      "grad_norm": 0.37049344182014465,
      "learning_rate": 0.0001919424589291108,
      "loss": 0.8086,
      "step": 116
    },
    {
      "epoch": 0.20776914539400665,
      "grad_norm": 0.380991131067276,
      "learning_rate": 0.0001916999015531221,
      "loss": 0.8039,
      "step": 117
    },
    {
      "epoch": 0.2095449500554939,
      "grad_norm": 0.3884637653827667,
      "learning_rate": 0.00019145390517435012,
      "loss": 0.7693,
      "step": 118
    },
    {
      "epoch": 0.21132075471698114,
      "grad_norm": 0.39195218682289124,
      "learning_rate": 0.00019120447901834706,
      "loss": 0.8139,
      "step": 119
    },
    {
      "epoch": 0.21309655937846836,
      "grad_norm": 0.41479626297950745,
      "learning_rate": 0.00019095163243929142,
      "loss": 0.7714,
      "step": 120
    },
    {
      "epoch": 0.2148723640399556,
      "grad_norm": 0.3856278657913208,
      "learning_rate": 0.0001906953749196371,
      "loss": 0.8198,
      "step": 121
    },
    {
      "epoch": 0.21664816870144285,
      "grad_norm": 0.3706349730491638,
      "learning_rate": 0.00019043571606975777,
      "loss": 0.7106,
      "step": 122
    },
    {
      "epoch": 0.21842397336293007,
      "grad_norm": 0.5981292724609375,
      "learning_rate": 0.00019017266562758659,
      "loss": 0.8005,
      "step": 123
    },
    {
      "epoch": 0.22019977802441731,
      "grad_norm": 0.4480712115764618,
      "learning_rate": 0.00018990623345825083,
      "loss": 0.8167,
      "step": 124
    },
    {
      "epoch": 0.22197558268590456,
      "grad_norm": 0.9817702770233154,
      "learning_rate": 0.00018963642955370201,
      "loss": 0.8555,
      "step": 125
    },
    {
      "epoch": 0.22375138734739178,
      "grad_norm": 0.4110267460346222,
      "learning_rate": 0.00018936326403234125,
      "loss": 0.9069,
      "step": 126
    },
    {
      "epoch": 0.22552719200887902,
      "grad_norm": 0.36051687598228455,
      "learning_rate": 0.00018908674713863952,
      "loss": 0.8783,
      "step": 127
    },
    {
      "epoch": 0.22730299667036627,
      "grad_norm": 0.34053486585617065,
      "learning_rate": 0.00018880688924275378,
      "loss": 0.8563,
      "step": 128
    },
    {
      "epoch": 0.2290788013318535,
      "grad_norm": 0.30984926223754883,
      "learning_rate": 0.0001885237008401378,
      "loss": 0.8434,
      "step": 129
    },
    {
      "epoch": 0.23085460599334073,
      "grad_norm": 0.3125753700733185,
      "learning_rate": 0.0001882371925511488,
      "loss": 0.831,
      "step": 130
    },
    {
      "epoch": 0.23263041065482798,
      "grad_norm": 0.3113706409931183,
      "learning_rate": 0.0001879473751206489,
      "loss": 0.8659,
      "step": 131
    },
    {
      "epoch": 0.2344062153163152,
      "grad_norm": 0.2837103605270386,
      "learning_rate": 0.00018765425941760238,
      "loss": 0.812,
      "step": 132
    },
    {
      "epoch": 0.23618201997780244,
      "grad_norm": 0.2814521789550781,
      "learning_rate": 0.00018735785643466784,
      "loss": 0.8116,
      "step": 133
    },
    {
      "epoch": 0.2379578246392897,
      "grad_norm": 0.2922544777393341,
      "learning_rate": 0.00018705817728778624,
      "loss": 0.8305,
      "step": 134
    },
    {
      "epoch": 0.2397336293007769,
      "grad_norm": 0.3140820860862732,
      "learning_rate": 0.00018675523321576371,
      "loss": 0.7882,
      "step": 135
    },
    {
      "epoch": 0.24150943396226415,
      "grad_norm": 0.29498058557510376,
      "learning_rate": 0.00018644903557985025,
      "loss": 0.8226,
      "step": 136
    },
    {
      "epoch": 0.2432852386237514,
      "grad_norm": 0.3298538625240326,
      "learning_rate": 0.00018613959586331362,
      "loss": 0.7867,
      "step": 137
    },
    {
      "epoch": 0.24506104328523862,
      "grad_norm": 0.3474237024784088,
      "learning_rate": 0.00018582692567100867,
      "loss": 0.7876,
      "step": 138
    },
    {
      "epoch": 0.24683684794672586,
      "grad_norm": 0.3735051155090332,
      "learning_rate": 0.00018551103672894206,
      "loss": 0.818,
      "step": 139
    },
    {
      "epoch": 0.2486126526082131,
      "grad_norm": 0.3931002914905548,
      "learning_rate": 0.00018519194088383273,
      "loss": 0.7896,
      "step": 140
    },
    {
      "epoch": 0.2503884572697003,
      "grad_norm": 0.36460694670677185,
      "learning_rate": 0.00018486965010266725,
      "loss": 0.8105,
      "step": 141
    },
    {
      "epoch": 0.2503884572697003,
      "eval_loss": 0.8086357712745667,
      "eval_runtime": 159.8215,
      "eval_samples_per_second": 5.938,
      "eval_steps_per_second": 1.489,
      "step": 141
    }
  ],
  "logging_steps": 1,
  "max_steps": 563,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 141,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 1.8570783162472858e+17,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}