{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.751165371809101,
  "eval_steps": 141,
  "global_step": 423,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {"epoch": 0.0017758046614872365, "grad_norm": 0.40501952171325684, "learning_rate": 4.000000000000001e-06, "loss": 1.1387, "step": 1},
    {"epoch": 0.0017758046614872365, "eval_loss": 1.4082584381103516, "eval_runtime": 167.7664, "eval_samples_per_second": 5.657, "eval_steps_per_second": 1.419, "step": 1},
    {"epoch": 0.003551609322974473, "grad_norm": 0.491682767868042, "learning_rate": 8.000000000000001e-06, "loss": 1.2151, "step": 2},
    {"epoch": 0.005327413984461709, "grad_norm": 0.49752455949783325, "learning_rate": 1.2e-05, "loss": 1.1941, "step": 3},
    {"epoch": 0.007103218645948946, "grad_norm": 0.5617953538894653, "learning_rate": 1.6000000000000003e-05, "loss": 1.2472, "step": 4},
    {"epoch": 0.008879023307436182, "grad_norm": 0.646000862121582, "learning_rate": 2e-05, "loss": 1.2767, "step": 5},
    {"epoch": 0.010654827968923418, "grad_norm": 0.6190630197525024, "learning_rate": 2.4e-05, "loss": 1.2839, "step": 6},
    {"epoch": 0.012430632630410655, "grad_norm": 0.6891798973083496, "learning_rate": 2.8000000000000003e-05, "loss": 1.2914, "step": 7},
    {"epoch": 0.014206437291897892, "grad_norm": 0.6742885708808899, "learning_rate": 3.2000000000000005e-05, "loss": 1.3001, "step": 8},
    {"epoch": 0.01598224195338513, "grad_norm": 0.693493664264679, "learning_rate": 3.6e-05, "loss": 1.2673, "step": 9},
    {"epoch": 0.017758046614872364, "grad_norm": 0.7951493859291077, "learning_rate": 4e-05, "loss": 1.3314, "step": 10},
    {"epoch": 0.0195338512763596, "grad_norm": 0.7866435050964355, "learning_rate": 4.4000000000000006e-05, "loss": 1.2703, "step": 11},
    {"epoch": 0.021309655937846835, "grad_norm": 0.7218112349510193, "learning_rate": 4.8e-05, "loss": 1.2542, "step": 12},
    {"epoch": 0.023085460599334074, "grad_norm": 0.6838662028312683, "learning_rate": 5.2000000000000004e-05, "loss": 1.2432, "step": 13},
    {"epoch": 0.02486126526082131, "grad_norm": 0.6592800617218018, "learning_rate": 5.6000000000000006e-05, "loss": 1.2374, "step": 14},
    {"epoch": 0.026637069922308545, "grad_norm": 0.513134241104126, "learning_rate": 6e-05, "loss": 1.246, "step": 15},
    {"epoch": 0.028412874583795784, "grad_norm": 0.5785119533538818, "learning_rate": 6.400000000000001e-05, "loss": 1.169, "step": 16},
    {"epoch": 0.03018867924528302, "grad_norm": 0.6144536733627319, "learning_rate": 6.800000000000001e-05, "loss": 1.1532, "step": 17},
    {"epoch": 0.03196448390677026, "grad_norm": 0.674633800983429, "learning_rate": 7.2e-05, "loss": 1.1175, "step": 18},
    {"epoch": 0.03374028856825749, "grad_norm": 0.5997682809829712, "learning_rate": 7.6e-05, "loss": 1.092, "step": 19},
    {"epoch": 0.03551609322974473, "grad_norm": 0.5651845335960388, "learning_rate": 8e-05, "loss": 1.0543, "step": 20},
    {"epoch": 0.03729189789123197, "grad_norm": 0.562713623046875, "learning_rate": 8.4e-05, "loss": 1.0377, "step": 21},
    {"epoch": 0.0390677025527192, "grad_norm": 0.5826591849327087, "learning_rate": 8.800000000000001e-05, "loss": 1.0178, "step": 22},
    {"epoch": 0.04084350721420644, "grad_norm": 0.5972415208816528, "learning_rate": 9.200000000000001e-05, "loss": 0.9916, "step": 23},
    {"epoch": 0.04261931187569367, "grad_norm": 0.6266159415245056, "learning_rate": 9.6e-05, "loss": 1.0026, "step": 24},
    {"epoch": 0.04439511653718091, "grad_norm": 0.7757481932640076, "learning_rate": 0.0001, "loss": 1.0245, "step": 25},
    {"epoch": 0.04617092119866815, "grad_norm": 0.6832363605499268, "learning_rate": 0.00010400000000000001, "loss": 0.9408, "step": 26},
    {"epoch": 0.04794672586015538, "grad_norm": 1.2983894348144531, "learning_rate": 0.00010800000000000001, "loss": 0.9228, "step": 27},
    {"epoch": 0.04972253052164262, "grad_norm": 0.9382829666137695, "learning_rate": 0.00011200000000000001, "loss": 0.9605, "step": 28},
    {"epoch": 0.05149833518312986, "grad_norm": 0.5051376819610596, "learning_rate": 0.000116, "loss": 0.9769, "step": 29},
    {"epoch": 0.05327413984461709, "grad_norm": 0.40853050351142883, "learning_rate": 0.00012, "loss": 0.8912, "step": 30},
    {"epoch": 0.05504994450610433, "grad_norm": 0.4261438846588135, "learning_rate": 0.000124, "loss": 0.9438, "step": 31},
    {"epoch": 0.05682574916759157, "grad_norm": 0.44900333881378174, "learning_rate": 0.00012800000000000002, "loss": 0.9589, "step": 32},
    {"epoch": 0.0586015538290788, "grad_norm": 0.4262010157108307, "learning_rate": 0.000132, "loss": 0.8775, "step": 33},
    {"epoch": 0.06037735849056604, "grad_norm": 0.40672022104263306, "learning_rate": 0.00013600000000000003, "loss": 0.8956, "step": 34},
    {"epoch": 0.06215316315205328, "grad_norm": 0.39336153864860535, "learning_rate": 0.00014, "loss": 0.866, "step": 35},
    {"epoch": 0.06392896781354052, "grad_norm": 0.40699368715286255, "learning_rate": 0.000144, "loss": 0.8612, "step": 36},
    {"epoch": 0.06570477247502775, "grad_norm": 0.438643217086792, "learning_rate": 0.000148, "loss": 0.922, "step": 37},
    {"epoch": 0.06748057713651498, "grad_norm": 0.45053544640541077, "learning_rate": 0.000152, "loss": 0.8681, "step": 38},
    {"epoch": 0.06925638179800223, "grad_norm": 0.4289852976799011, "learning_rate": 0.00015600000000000002, "loss": 0.9071, "step": 39},
    {"epoch": 0.07103218645948946, "grad_norm": 0.4101032316684723, "learning_rate": 0.00016, "loss": 0.8692, "step": 40},
    {"epoch": 0.07280799112097669, "grad_norm": 0.418319433927536, "learning_rate": 0.000164, "loss": 0.8654, "step": 41},
    {"epoch": 0.07458379578246394, "grad_norm": 0.41637811064720154, "learning_rate": 0.000168, "loss": 0.8348, "step": 42},
    {"epoch": 0.07635960044395117, "grad_norm": 0.40830302238464355, "learning_rate": 0.000172, "loss": 0.8928, "step": 43},
    {"epoch": 0.0781354051054384, "grad_norm": 0.4163912236690521, "learning_rate": 0.00017600000000000002, "loss": 0.8793, "step": 44},
    {"epoch": 0.07991120976692564, "grad_norm": 0.4240954518318176, "learning_rate": 0.00018, "loss": 0.9029, "step": 45},
    {"epoch": 0.08168701442841288, "grad_norm": 0.48420408368110657, "learning_rate": 0.00018400000000000003, "loss": 0.8632, "step": 46},
    {"epoch": 0.08346281908990011, "grad_norm": 0.5267483592033386, "learning_rate": 0.000188, "loss": 0.8575, "step": 47},
    {"epoch": 0.08523862375138734, "grad_norm": 0.4947332441806793, "learning_rate": 0.000192, "loss": 0.9051, "step": 48},
    {"epoch": 0.08701442841287459, "grad_norm": 0.5025691986083984, "learning_rate": 0.000196, "loss": 0.9145, "step": 49},
    {"epoch": 0.08879023307436182, "grad_norm": 0.5430313944816589, "learning_rate": 0.0002, "loss": 0.8954, "step": 50},
    {"epoch": 0.09056603773584905, "grad_norm": 0.45721662044525146, "learning_rate": 0.00019999812486015523, "loss": 0.9655, "step": 51},
    {"epoch": 0.0923418423973363, "grad_norm": 0.4364672899246216, "learning_rate": 0.00019999249951094388, "loss": 0.9318, "step": 52},
    {"epoch": 0.09411764705882353, "grad_norm": 0.38933759927749634, "learning_rate": 0.00019998312416333227, "loss": 0.8963, "step": 53},
    {"epoch": 0.09589345172031076, "grad_norm": 0.35572728514671326, "learning_rate": 0.0001999699991689222, "loss": 0.9073, "step": 54},
    {"epoch": 0.097669256381798, "grad_norm": 0.3042948544025421, "learning_rate": 0.00019995312501993765, "loss": 0.8751, "step": 55},
    {"epoch": 0.09944506104328524, "grad_norm": 0.32266151905059814, "learning_rate": 0.00019993250234920636, "loss": 0.8493, "step": 56},
    {"epoch": 0.10122086570477247, "grad_norm": 0.31894031167030334, "learning_rate": 0.00019990813193013625, "loss": 0.8512, "step": 57},
    {"epoch": 0.10299667036625972, "grad_norm": 0.33073991537094116, "learning_rate": 0.0001998800146766861, "loss": 0.8424, "step": 58},
    {"epoch": 0.10477247502774695, "grad_norm": 0.32064828276634216, "learning_rate": 0.00019984815164333163, "loss": 0.8698, "step": 59},
    {"epoch": 0.10654827968923418, "grad_norm": 0.3364376425743103, "learning_rate": 0.00019981254402502566, "loss": 0.8525, "step": 60},
    {"epoch": 0.10832408435072143, "grad_norm": 0.31403639912605286, "learning_rate": 0.0001997731931571535, "loss": 0.8309, "step": 61},
    {"epoch": 0.11009988901220866, "grad_norm": 0.3375100791454315, "learning_rate": 0.00019973010051548275, "loss": 0.8573, "step": 62},
    {"epoch": 0.11187569367369589, "grad_norm": 0.3584939241409302, "learning_rate": 0.00019968326771610797, "loss": 0.8479, "step": 63},
    {"epoch": 0.11365149833518313, "grad_norm": 0.35480472445487976, "learning_rate": 0.00019963269651539017, "loss": 0.845, "step": 64},
    {"epoch": 0.11542730299667037, "grad_norm": 0.33250972628593445, "learning_rate": 0.00019957838880989078, "loss": 0.8438, "step": 65},
    {"epoch": 0.1172031076581576, "grad_norm": 0.39302438497543335, "learning_rate": 0.00019952034663630062, "loss": 0.8391, "step": 66},
    {"epoch": 0.11897891231964484, "grad_norm": 0.3517158031463623, "learning_rate": 0.00019945857217136363, "loss": 0.7966, "step": 67},
    {"epoch": 0.12075471698113208, "grad_norm": 0.38860467076301575, "learning_rate": 0.00019939306773179497, "loss": 0.8279, "step": 68},
    {"epoch": 0.12253052164261931, "grad_norm": 0.3762984573841095, "learning_rate": 0.00019932383577419432, "loss": 0.7848, "step": 69},
    {"epoch": 0.12430632630410655, "grad_norm": 0.4535103440284729, "learning_rate": 0.00019925087889495374, "loss": 0.8, "step": 70},
    {"epoch": 0.12608213096559379, "grad_norm": 0.4869844317436218, "learning_rate": 0.00019917419983016025, "loss": 0.8442, "step": 71},
    {"epoch": 0.12785793562708103, "grad_norm": 0.4379689395427704, "learning_rate": 0.00019909380145549324, "loss": 0.8353, "step": 72},
    {"epoch": 0.12963374028856825, "grad_norm": 0.39510270953178406, "learning_rate": 0.00019900968678611666, "loss": 0.8538, "step": 73},
    {"epoch": 0.1314095449500555, "grad_norm": 0.4764181971549988, "learning_rate": 0.00019892185897656578, "loss": 0.8509, "step": 74},
    {"epoch": 0.13318534961154274, "grad_norm": 0.5591267347335815, "learning_rate": 0.00019883032132062925, "loss": 0.8661, "step": 75},
    {"epoch": 0.13496115427302996, "grad_norm": 0.41077056527137756, "learning_rate": 0.00019873507725122504, "loss": 0.9418, "step": 76},
    {"epoch": 0.1367369589345172, "grad_norm": 0.393622487783432, "learning_rate": 0.00019863613034027224, "loss": 0.926, "step": 77},
    {"epoch": 0.13851276359600445, "grad_norm": 0.36414071917533875, "learning_rate": 0.00019853348429855672, "loss": 0.8649, "step": 78},
    {"epoch": 0.14028856825749167, "grad_norm": 0.3100601136684418, "learning_rate": 0.00019842714297559213, "loss": 0.9114, "step": 79},
    {"epoch": 0.14206437291897892, "grad_norm": 0.29151105880737305, "learning_rate": 0.0001983171103594755, "loss": 0.8681, "step": 80},
    {"epoch": 0.14384017758046616, "grad_norm": 0.28398221731185913, "learning_rate": 0.0001982033905767377, "loss": 0.8515, "step": 81},
    {"epoch": 0.14561598224195338, "grad_norm": 0.2883840799331665, "learning_rate": 0.00019808598789218865, "loss": 0.8569, "step": 82},
    {"epoch": 0.14739178690344062, "grad_norm": 0.29812031984329224, "learning_rate": 0.0001979649067087574, "loss": 0.8529, "step": 83},
    {"epoch": 0.14916759156492787, "grad_norm": 0.3074108958244324, "learning_rate": 0.00019784015156732693, "loss": 0.8771, "step": 84},
    {"epoch": 0.1509433962264151, "grad_norm": 0.3601110279560089, "learning_rate": 0.000197711727146564, "loss": 0.8609, "step": 85},
    {"epoch": 0.15271920088790233, "grad_norm": 0.3126521110534668, "learning_rate": 0.00019757963826274357, "loss": 0.8162, "step": 86},
    {"epoch": 0.15449500554938958, "grad_norm": 0.3152073323726654, "learning_rate": 0.00019744388986956822, "loss": 0.7661, "step": 87},
    {"epoch": 0.1562708102108768, "grad_norm": 0.33570149540901184, "learning_rate": 0.00019730448705798239, "loss": 0.8179, "step": 88},
    {"epoch": 0.15804661487236404, "grad_norm": 0.33989331126213074, "learning_rate": 0.0001971614350559814, "loss": 0.8288, "step": 89},
    {"epoch": 0.1598224195338513, "grad_norm": 0.3292713761329651, "learning_rate": 0.0001970147392284154, "loss": 0.8415, "step": 90},
    {"epoch": 0.1615982241953385, "grad_norm": 0.3394547700881958, "learning_rate": 0.00019686440507678824, "loss": 0.8232, "step": 91},
    {"epoch": 0.16337402885682575, "grad_norm": 0.3370296061038971, "learning_rate": 0.0001967104382390511, "loss": 0.7771, "step": 92},
    {"epoch": 0.16514983351831297, "grad_norm": 0.3798193633556366, "learning_rate": 0.00019655284448939094, "loss": 0.789, "step": 93},
    {"epoch": 0.16692563817980022, "grad_norm": 0.3790013790130615, "learning_rate": 0.00019639162973801426, "loss": 0.8153, "step": 94},
    {"epoch": 0.16870144284128746, "grad_norm": 0.42274704575538635, "learning_rate": 0.00019622680003092503, "loss": 0.8012, "step": 95},
    {"epoch": 0.17047724750277468, "grad_norm": 0.4776620864868164, "learning_rate": 0.0001960583615496984, "loss": 0.8132, "step": 96},
    {"epoch": 0.17225305216426193, "grad_norm": 0.4170360565185547, "learning_rate": 0.00019588632061124837, "loss": 0.8139, "step": 97},
    {"epoch": 0.17402885682574917, "grad_norm": 0.47097012400627136, "learning_rate": 0.00019571068366759143, "loss": 0.7711, "step": 98},
    {"epoch": 0.1758046614872364, "grad_norm": 0.8176291584968567, "learning_rate": 0.00019553145730560415, "loss": 0.7906, "step": 99},
    {"epoch": 0.17758046614872364, "grad_norm": 0.7204902172088623, "learning_rate": 0.0001953486482467764, "loss": 0.9088, "step": 100},
    {"epoch": 0.17935627081021088, "grad_norm": 0.3952767252922058, "learning_rate": 0.0001951622633469592, "loss": 0.9362, "step": 101},
    {"epoch": 0.1811320754716981, "grad_norm": 0.3742019534111023, "learning_rate": 0.00019497230959610756, "loss": 0.933, "step": 102},
    {"epoch": 0.18290788013318535, "grad_norm": 0.3385975658893585, "learning_rate": 0.00019477879411801844, "loss": 0.9028, "step": 103},
    {"epoch": 0.1846836847946726, "grad_norm": 0.2950561046600342, "learning_rate": 0.00019458172417006347, "loss": 0.8245, "step": 104},
    {"epoch": 0.1864594894561598, "grad_norm": 0.30859696865081787, "learning_rate": 0.00019438110714291694, "loss": 0.8771, "step": 105},
    {"epoch": 0.18823529411764706, "grad_norm": 0.3490929901599884, "learning_rate": 0.00019417695056027844, "loss": 0.8565, "step": 106},
    {"epoch": 0.1900110987791343, "grad_norm": 0.31133994460105896, "learning_rate": 0.00019396926207859084, "loss": 0.8734, "step": 107},
    {"epoch": 0.19178690344062152, "grad_norm": 0.2884789705276489, "learning_rate": 0.00019375804948675306, "loss": 0.8645, "step": 108},
    {"epoch": 0.19356270810210877, "grad_norm": 0.2969193160533905, "learning_rate": 0.0001935433207058281, "loss": 0.8751, "step": 109},
    {"epoch": 0.195338512763596, "grad_norm": 0.41810932755470276, "learning_rate": 0.0001933250837887457, "loss": 0.8037, "step": 110},
    {"epoch": 0.19711431742508323, "grad_norm": 0.3271716833114624, "learning_rate": 0.00019310334692000075, "loss": 0.7814, "step": 111},
    {"epoch": 0.19889012208657048, "grad_norm": 0.4146140515804291, "learning_rate": 0.00019287811841534595, "loss": 0.8425, "step": 112},
    {"epoch": 0.20066592674805772, "grad_norm": 0.3369704484939575, "learning_rate": 0.00019264940672148018, "loss": 0.8301, "step": 113},
    {"epoch": 0.20244173140954494, "grad_norm": 0.32731175422668457, "learning_rate": 0.00019241722041573166, "loss": 0.7964, "step": 114},
    {"epoch": 0.20421753607103219, "grad_norm": 0.3840983510017395, "learning_rate": 0.0001921815682057362, "loss": 0.7864, "step": 115},
    {"epoch": 0.20599334073251943, "grad_norm": 0.37049344182014465, "learning_rate": 0.0001919424589291108, "loss": 0.8086, "step": 116},
    {"epoch": 0.20776914539400665, "grad_norm": 0.380991131067276, "learning_rate": 0.0001916999015531221, "loss": 0.8039, "step": 117},
    {"epoch": 0.2095449500554939, "grad_norm": 0.3884637653827667, "learning_rate": 0.00019145390517435012, "loss": 0.7693, "step": 118},
    {"epoch": 0.21132075471698114, "grad_norm": 0.39195218682289124, "learning_rate": 0.00019120447901834706, "loss": 0.8139, "step": 119},
    {"epoch": 0.21309655937846836, "grad_norm": 0.41479626297950745, "learning_rate": 0.00019095163243929142, "loss": 0.7714, "step": 120},
    {"epoch": 0.2148723640399556, "grad_norm": 0.3856278657913208, "learning_rate": 0.0001906953749196371, "loss": 0.8198, "step": 121},
    {"epoch": 0.21664816870144285, "grad_norm": 0.3706349730491638, "learning_rate": 0.00019043571606975777, "loss": 0.7106, "step": 122},
    {"epoch": 0.21842397336293007, "grad_norm": 0.5981292724609375, "learning_rate": 0.00019017266562758659, "loss": 0.8005, "step": 123},
    {"epoch": 0.22019977802441731, "grad_norm": 0.4480712115764618, "learning_rate": 0.00018990623345825083, "loss": 0.8167, "step": 124},
    {"epoch": 0.22197558268590456, "grad_norm": 0.9817702770233154, "learning_rate": 0.00018963642955370201, "loss": 0.8555, "step": 125},
    {"epoch": 0.22375138734739178, "grad_norm": 0.4110267460346222, "learning_rate": 0.00018936326403234125, "loss": 0.9069, "step": 126},
    {"epoch": 0.22552719200887902, "grad_norm": 0.36051687598228455, "learning_rate": 0.00018908674713863952, "loss": 0.8783, "step": 127},
    {"epoch": 0.22730299667036627, "grad_norm": 0.34053486585617065, "learning_rate": 0.00018880688924275378, "loss": 0.8563, "step": 128},
    {"epoch": 0.2290788013318535, "grad_norm": 0.30984926223754883, "learning_rate": 0.0001885237008401378, "loss": 0.8434, "step": 129},
    {"epoch": 0.23085460599334073, "grad_norm": 0.3125753700733185, "learning_rate": 0.0001882371925511488, "loss": 0.831, "step": 130},
    {"epoch": 0.23263041065482798, "grad_norm": 0.3113706409931183, "learning_rate": 0.0001879473751206489, "loss": 0.8659, "step": 131},
    {"epoch": 0.2344062153163152, "grad_norm": 0.2837103605270386, "learning_rate": 0.00018765425941760238, "loss": 0.812, "step": 132},
    {"epoch": 0.23618201997780244, "grad_norm": 0.2814521789550781, "learning_rate": 0.00018735785643466784, "loss": 0.8116, "step": 133},
    {"epoch": 0.2379578246392897, "grad_norm": 0.2922544777393341, "learning_rate": 0.00018705817728778624, "loss": 0.8305, "step": 134},
    {"epoch": 0.2397336293007769, "grad_norm": 0.3140820860862732, "learning_rate": 0.00018675523321576371, "loss": 0.7882, "step": 135},
    {"epoch": 0.24150943396226415, "grad_norm": 0.29498058557510376, "learning_rate": 0.00018644903557985025, "loss": 0.8226, "step": 136},
    {"epoch": 0.2432852386237514, "grad_norm": 0.3298538625240326, "learning_rate": 0.00018613959586331362, "loss": 0.7867, "step": 137},
    {"epoch": 0.24506104328523862, "grad_norm": 0.3474237024784088, "learning_rate": 0.00018582692567100867, "loss": 0.7876, "step": 138},
    {"epoch": 0.24683684794672586, "grad_norm": 0.3735051155090332, "learning_rate": 0.00018551103672894206, "loss": 0.818, "step": 139},
    {"epoch": 0.2486126526082131, "grad_norm": 0.3931002914905548, "learning_rate": 0.00018519194088383273, "loss": 0.7896, "step": 140},
    {"epoch": 0.2503884572697003, "grad_norm": 0.36460694670677185, "learning_rate": 0.00018486965010266725, "loss": 0.8105, "step": 141},
    {"epoch": 0.2503884572697003, "eval_loss": 0.8086357712745667, "eval_runtime": 159.8215, "eval_samples_per_second": 5.938, "eval_steps_per_second": 1.489, "step": 141},
    {"epoch": 0.25216426193118757, "grad_norm": 0.3713844120502472, "learning_rate": 0.0001845441764722514, "loss": 0.7688, "step": 142},
    {"epoch": 0.2539400665926748, "grad_norm": 0.352450430393219, "learning_rate": 0.00018421553219875658, "loss": 0.7769, "step": 143},
    {"epoch": 0.25571587125416206, "grad_norm": 0.3609173893928528, "learning_rate": 0.00018388372960726228, "loss": 0.7718, "step": 144},
    {"epoch": 0.25749167591564925, "grad_norm": 0.36195874214172363, "learning_rate": 0.00018354878114129367, "loss": 0.7375, "step": 145},
    {"epoch": 0.2592674805771365, "grad_norm": 0.3802485466003418, "learning_rate": 0.00018321069936235503, "loss": 0.7778, "step": 146},
    {"epoch": 0.26104328523862375, "grad_norm": 0.38449469208717346, "learning_rate": 0.00018286949694945866, "loss": 0.7458, "step": 147},
    {"epoch": 0.262819089900111, "grad_norm": 0.3975572884082794, "learning_rate": 0.00018252518669864936, "loss": 0.7367, "step": 148},
    {"epoch": 0.26459489456159824, "grad_norm": 0.49581316113471985, "learning_rate": 0.0001821777815225245, "loss": 0.7948, "step": 149},
    {"epoch": 0.2663706992230855, "grad_norm": 0.5556712746620178, "learning_rate": 0.00018182729444974992, "loss": 0.8143, "step": 150},
    {"epoch": 0.2681465038845727, "grad_norm": 0.3207700848579407, "learning_rate": 0.00018147373862457107, "loss": 0.8578, "step": 151},
    {"epoch": 0.2699223085460599, "grad_norm": 0.3484250605106354, "learning_rate": 0.00018111712730632022, "loss": 0.8757, "step": 152},
    {"epoch": 0.27169811320754716, "grad_norm": 0.33792024850845337, "learning_rate": 0.0001807574738689193, "loss": 0.8464, "step": 153},
    {"epoch": 0.2734739178690344, "grad_norm": 0.3430371582508087, "learning_rate": 0.000180394791800378, "loss": 0.8607, "step": 154},
    {"epoch": 0.27524972253052166, "grad_norm": 0.3120534420013428, "learning_rate": 0.00018002909470228842, "loss": 0.8392, "step": 155},
    {"epoch": 0.2770255271920089, "grad_norm": 0.3126620054244995, "learning_rate": 0.00017966039628931446, "loss": 0.8191, "step": 156},
    {"epoch": 0.2788013318534961, "grad_norm": 0.32269468903541565, "learning_rate": 0.00017928871038867784, "loss": 0.8164, "step": 157},
    {"epoch": 0.28057713651498334, "grad_norm": 0.3052617907524109, "learning_rate": 0.00017891405093963938, "loss": 0.8268, "step": 158},
    {"epoch": 0.2823529411764706, "grad_norm": 0.29926028847694397, "learning_rate": 0.00017853643199297633, "loss": 0.7847, "step": 159},
    {"epoch": 0.28412874583795783, "grad_norm": 0.2997240722179413, "learning_rate": 0.00017815586771045535, "loss": 0.8143, "step": 160},
    {"epoch": 0.2859045504994451, "grad_norm": 0.29772111773490906, "learning_rate": 0.0001777723723643014, "loss": 0.7412, "step": 161},
    {"epoch": 0.2876803551609323, "grad_norm": 0.3138352632522583, "learning_rate": 0.0001773859603366626, "loss": 0.7747, "step": 162},
    {"epoch": 0.2894561598224195, "grad_norm": 0.32726818323135376, "learning_rate": 0.00017699664611907072, "loss": 0.8123, "step": 163},
    {"epoch": 0.29123196448390676, "grad_norm": 0.3244825005531311, "learning_rate": 0.0001766044443118978, "loss": 0.7705, "step": 164},
    {"epoch": 0.293007769145394, "grad_norm": 0.35875847935676575, "learning_rate": 0.00017620936962380856, "loss": 0.7881, "step": 165},
    {"epoch": 0.29478357380688125, "grad_norm": 0.36488401889801025, "learning_rate": 0.00017581143687120875, "loss": 0.7956, "step": 166},
    {"epoch": 0.2965593784683685, "grad_norm": 0.33817097544670105, "learning_rate": 0.00017541066097768963, "loss": 0.7719, "step": 167},
    {"epoch": 0.29833518312985574, "grad_norm": 0.36390411853790283, "learning_rate": 0.0001750070569734681, "loss": 0.8172, "step": 168},
    {"epoch": 0.30011098779134293, "grad_norm": 0.34076422452926636, "learning_rate": 0.00017460063999482316, "loss": 0.7419, "step": 169},
    {"epoch": 0.3018867924528302, "grad_norm": 0.39437592029571533, "learning_rate": 0.00017419142528352817, "loss": 0.7519, "step": 170},
    {"epoch": 0.3036625971143174, "grad_norm": 0.4019312560558319, "learning_rate": 0.00017377942818627942, "loss": 0.7944, "step": 171},
    {"epoch": 0.30543840177580467, "grad_norm": 0.40751898288726807, "learning_rate": 0.00017336466415412028, "loss": 0.7827, "step": 172},
    {"epoch": 0.3072142064372919, "grad_norm": 0.4780448079109192, "learning_rate": 0.0001729471487418621, "loss": 0.7872, "step": 173},
    {"epoch": 0.30899001109877916, "grad_norm": 0.40511685609817505, "learning_rate": 0.0001725268976075005, "loss": 0.7642, "step": 174},
    {"epoch": 0.31076581576026635, "grad_norm": 0.5618127584457397, "learning_rate": 0.0001721039265116285, "loss": 0.872, "step": 175},
    {"epoch": 0.3125416204217536, "grad_norm": 0.294917494058609, "learning_rate": 0.00017167825131684513, "loss": 0.8545, "step": 176},
    {"epoch": 0.31431742508324084, "grad_norm": 0.3281805217266083, "learning_rate": 0.00017124988798716083, "loss": 0.8404, "step": 177},
    {"epoch": 0.3160932297447281, "grad_norm": 0.33336278796195984, "learning_rate": 0.00017081885258739846, "loss": 0.8495, "step": 178},
    {"epoch": 0.31786903440621533, "grad_norm": 0.3366440236568451, "learning_rate": 0.00017038516128259115, "loss": 0.8659, "step": 179},
    {"epoch": 0.3196448390677026, "grad_norm": 0.32397955656051636, "learning_rate": 0.00016994883033737582, "loss": 0.8292, "step": 180},
    {"epoch": 0.32142064372918977, "grad_norm": 0.2874945402145386, "learning_rate": 0.00016950987611538324, "loss": 0.7949, "step": 181},
    {"epoch": 0.323196448390677, "grad_norm": 0.3074096143245697, "learning_rate": 0.00016906831507862443, "loss": 0.8076, "step": 182},
    {"epoch": 0.32497225305216426, "grad_norm": 0.30116966366767883, "learning_rate": 0.0001686241637868734, "loss": 0.8058, "step": 183},
    {"epoch": 0.3267480577136515, "grad_norm": 0.3052218556404114, "learning_rate": 0.00016817743889704565, "loss": 0.8067, "step": 184},
    {"epoch": 0.32852386237513875, "grad_norm": 0.3073555827140808, "learning_rate": 0.00016772815716257412, "loss": 0.8496, "step": 185},
    {"epoch": 0.33029966703662594, "grad_norm": 0.289145290851593, "learning_rate": 0.0001672763354327804, "loss": 0.7362, "step": 186},
    {"epoch": 0.3320754716981132, "grad_norm": 0.31561294198036194, "learning_rate": 0.00016682199065224307, "loss": 0.802, "step": 187},
    {"epoch": 0.33385127635960044, "grad_norm": 0.2900339365005493, "learning_rate": 0.00016636513986016213, "loss": 0.7432, "step": 188},
    {"epoch": 0.3356270810210877, "grad_norm": 0.3267146646976471, "learning_rate": 0.0001659058001897201, "loss": 0.7771, "step": 189},
    {"epoch": 0.3374028856825749, "grad_norm": 0.3258307874202728, "learning_rate": 0.00016544398886743933, "loss": 0.7345, "step": 190},
    {"epoch": 0.3391786903440622, "grad_norm": 0.32989659905433655, "learning_rate": 0.000164979723212536, "loss": 0.7383, "step": 191},
    {"epoch": 0.34095449500554936, "grad_norm": 0.3265599310398102, "learning_rate": 0.00016451302063627066, "loss": 0.6977, "step": 192},
    {"epoch": 0.3427302996670366, "grad_norm": 0.39376598596572876, "learning_rate": 0.00016404389864129533, "loss": 0.7851, "step": 193},
    {"epoch": 0.34450610432852385, "grad_norm": 0.40358301997184753, "learning_rate": 0.00016357237482099684, "loss": 0.7928, "step": 194},
    {"epoch": 0.3462819089900111, "grad_norm": 0.3747034966945648, "learning_rate": 0.00016309846685883726, "loss": 0.7751, "step": 195},
    {"epoch": 0.34805771365149835, "grad_norm": 0.4160248041152954, "learning_rate": 0.00016262219252769064, "loss": 0.8035, "step": 196},
    {"epoch": 0.3498335183129856, "grad_norm": 0.39067476987838745, "learning_rate": 0.00016214356968917648, "loss": 0.6726, "step": 197},
    {"epoch": 0.3516093229744728, "grad_norm": 0.4980023205280304, "learning_rate": 0.00016166261629298995, "loss": 0.7917, "step": 198},
    {"epoch": 0.35338512763596003, "grad_norm": 0.4774058163166046, "learning_rate": 0.0001611793503762285, "loss": 0.7599, "step": 199},
    {"epoch": 0.3551609322974473, "grad_norm": 0.5196167230606079, "learning_rate": 0.00016069379006271566, "loss": 0.7608, "step": 200},
    {"epoch": 0.3569367369589345, "grad_norm": 0.2735799551010132, "learning_rate": 0.00016020595356232135, "loss": 0.8588, "step": 201},
    {"epoch": 0.35871254162042177, "grad_norm": 0.30770814418792725, "learning_rate": 0.00015971585917027862, "loss": 0.8222, "step": 202},
    {"epoch": 0.360488346281909, "grad_norm": 0.317123144865036, "learning_rate": 0.00015922352526649803, "loss": 0.7941, "step": 203},
    {"epoch": 0.3622641509433962, "grad_norm": 0.32672154903411865, "learning_rate": 0.00015872897031487791, "loss": 0.867, "step": 204},
    {"epoch": 0.36403995560488345, "grad_norm": 0.3169744610786438, "learning_rate": 0.00015823221286261215, "loss": 0.8781, "step": 205},
    {"epoch": 0.3658157602663707, "grad_norm": 0.30588722229003906, "learning_rate": 0.00015773327153949465, "loss": 0.7827, "step": 206},
    {"epoch": 0.36759156492785794, "grad_norm": 0.3179618716239929, "learning_rate": 0.0001572321650572205, "loss": 0.8178, "step": 207},
    {"epoch": 0.3693673695893452, "grad_norm": 0.3094286322593689, "learning_rate": 0.00015672891220868432, "loss": 0.7966, "step": 208},
    {"epoch": 0.37114317425083243, "grad_norm": 0.31584280729293823, "learning_rate": 0.00015622353186727544, "loss": 0.7982, "step": 209},
    {"epoch": 0.3729189789123196, "grad_norm": 0.29120850563049316, "learning_rate": 0.0001557160429861702, "loss": 0.7789, "step": 210},
    {"epoch": 0.37469478357380687, "grad_norm": 0.29743698239326477, "learning_rate": 0.000155206464597621, "loss": 0.7799, "step": 211},
    {"epoch": 0.3764705882352941, "grad_norm": 0.31440189480781555, "learning_rate": 0.00015469481581224272, "loss": 0.7661, "step": 212},
    {"epoch": 0.37824639289678136, "grad_norm": 0.3395606279373169, "learning_rate": 0.00015418111581829574, "loss": 0.7657, "step": 213},
    {"epoch": 0.3800221975582686, "grad_norm": 0.31749066710472107, "learning_rate": 0.0001536653838809667, "loss": 0.7913, "step": 214},
    {"epoch": 0.38179800221975585, "grad_norm": 0.3586166501045227, "learning_rate": 0.0001531476393416456, "loss": 0.7774, "step": 215},
    {"epoch": 0.38357380688124304, "grad_norm": 0.32895100116729736, "learning_rate": 0.0001526279016172008, "loss": 0.7882, "step": 216},
    {"epoch": 0.3853496115427303, "grad_norm": 0.3541489839553833, "learning_rate": 0.00015210619019925066, "loss": 0.7708, "step": 217},
    {"epoch": 0.38712541620421753, "grad_norm": 0.3232908546924591, "learning_rate": 0.00015158252465343242, "loss": 0.7238, "step": 218},
    {"epoch": 0.3889012208657048, "grad_norm": 0.36565467715263367, "learning_rate": 0.00015105692461866874, "loss": 0.7685, "step": 219},
    {"epoch": 0.390677025527192, "grad_norm": 0.3799486756324768, "learning_rate": 0.000150529409806431, "loss": 0.7296, "step": 220},
    {"epoch": 0.39245283018867927, "grad_norm": 0.4193985164165497, "learning_rate": 0.00015000000000000001, "loss": 0.7731, "step": 221},
    {"epoch": 0.39422863485016646, "grad_norm": 0.4226386845111847, "learning_rate": 0.00014946871505372425, "loss": 0.8048, "step": 222},
    {"epoch": 0.3960044395116537, "grad_norm": 0.40805166959762573, "learning_rate": 0.00014893557489227517, "loss": 0.7389, "step": 223},
    {"epoch": 0.39778024417314095, "grad_norm": 0.5135468244552612, "learning_rate": 0.0001484005995098999, "loss": 0.779, "step": 224},
    {"epoch": 0.3995560488346282, "grad_norm": 0.6674650311470032, "learning_rate": 0.0001478638089696716, "loss": 0.82, "step": 225},
    {"epoch": 0.40133185349611544, "grad_norm": 0.3206911087036133, "learning_rate": 0.00014732522340273684, "loss": 0.8985, "step": 226},
    {"epoch": 0.4031076581576027, "grad_norm": 0.33583980798721313, "learning_rate": 0.0001467848630075608, "loss": 0.8171, "step": 227},
    {"epoch": 0.4048834628190899, "grad_norm": 0.3324304223060608, "learning_rate": 0.00014624274804916958, "loss": 0.8531, "step": 228},
    {"epoch": 0.4066592674805771, "grad_norm": 0.32210710644721985, "learning_rate": 0.00014569889885839037, "loss": 0.8349, "step": 229},
    {"epoch": 0.40843507214206437, "grad_norm": 0.30829885601997375, "learning_rate": 0.00014515333583108896, "loss": 0.8176, "step": 230},
    {"epoch": 0.4102108768035516, "grad_norm": 0.31730225682258606, "learning_rate": 0.00014460607942740468, "loss": 0.8109, "step": 231},
    {"epoch": 0.41198668146503886, "grad_norm": 0.32128164172172546, "learning_rate": 0.00014405715017098335, "loss": 0.8049, "step": 232},
    {"epoch": 0.4137624861265261, "grad_norm": 0.32257241010665894, "learning_rate": 0.00014350656864820733, "loss": 0.79, "step": 233},
    {"epoch": 0.4155382907880133, "grad_norm": 0.29663363099098206, "learning_rate": 0.0001429543555074237, "loss": 0.7606, "step": 234},
    {"epoch": 0.41731409544950054, "grad_norm": 0.3175968527793884, "learning_rate": 0.00014240053145816967, "loss": 0.8093, "step": 235},
    {"epoch": 0.4190899001109878, "grad_norm": 0.30839797854423523, "learning_rate": 0.00014184511727039612, "loss": 0.8033, "step": 236},
    {"epoch": 0.42086570477247504, "grad_norm": 0.32169485092163086, "learning_rate": 0.0001412881337736885, "loss": 0.7583, "step": 237},
    {"epoch": 0.4226415094339623, "grad_norm": 0.3165202736854553, "learning_rate": 0.00014072960185648577, "loss": 0.7864, "step": 238},
    {"epoch": 0.4244173140954495, "grad_norm": 0.3507262170314789, "learning_rate": 0.00014016954246529696, "loss": 0.8196, "step": 239},
    {"epoch": 0.4261931187569367, "grad_norm": 0.3330634534358978, "learning_rate": 0.0001396079766039157, "loss": 0.7356, "step": 240},
    {"epoch": 0.42796892341842396, "grad_norm": 0.3456502854824066, "learning_rate": 0.00013904492533263244, "loss": 0.7636, "step": 241},
    {"epoch": 0.4297447280799112, "grad_norm": 0.3290559649467468, "learning_rate": 0.00013848040976744457, "loss": 0.6921, "step": 242},
    {"epoch": 0.43152053274139845, "grad_norm": 0.34343284368515015, "learning_rate": 0.00013791445107926478, "loss": 0.7661, "step": 243},
    {"epoch": 0.4332963374028857, "grad_norm": 0.34806933999061584, "learning_rate": 0.00013734707049312673, "loss": 0.7266, "step": 244},
    {"epoch": 0.43507214206437295, "grad_norm": 0.3577682375907898, "learning_rate": 0.00013677828928738934, "loss": 0.7337, "step": 245},
    {"epoch": 0.43684794672586014, "grad_norm": 0.37708649039268494, "learning_rate": 0.00013620812879293863, "loss": 0.6949, "step": 246},
    {"epoch": 0.4386237513873474, "grad_norm": 0.3661216199398041, "learning_rate": 0.00013563661039238785, "loss": 0.7049, "step": 247},
    {"epoch": 0.44039955604883463, "grad_norm": 0.4453539550304413, "learning_rate": 0.00013506375551927547, "loss": 0.7957, "step": 248},
    {"epoch": 0.4421753607103219, "grad_norm": 0.46171826124191284, "learning_rate": 0.00013448958565726144, "loss": 0.7175, "step": 249},
    {"epoch": 0.4439511653718091, "grad_norm": 0.6314205527305603, "learning_rate": 0.00013391412233932149, "loss": 0.8853, "step": 250},
    {"epoch": 0.4457269700332963, "grad_norm": 0.29680782556533813, "learning_rate": 0.00013333738714693956, "loss": 0.8789, "step": 251},
    {"epoch": 0.44750277469478356, "grad_norm": 0.30771735310554504, "learning_rate": 0.00013275940170929843, "loss": 0.8126, "step": 252},
    {"epoch": 0.4492785793562708, "grad_norm": 0.3242880403995514, "learning_rate": 0.00013218018770246858, "loss": 0.7787, "step": 253},
    {"epoch": 0.45105438401775805, "grad_norm": 0.33549076318740845, "learning_rate": 0.00013159976684859527, "loss": 0.8113, "step": 254},
    {"epoch": 0.4528301886792453, "grad_norm": 0.34281155467033386, "learning_rate": 0.00013101816091508388, "loss": 0.8371, "step": 255},
    {"epoch": 0.45460599334073254, "grad_norm": 0.3422442078590393, "learning_rate": 0.0001304353917137836, "loss": 0.8362, "step": 256},
    {"epoch": 0.45638179800221973, "grad_norm": 0.3019155263900757, "learning_rate": 0.00012985148110016947, "loss": 0.7317, "step": 257},
    {"epoch": 0.458157602663707, "grad_norm": 0.32793429493904114, "learning_rate": 0.0001292664509725226, "loss": 0.7861, "step": 258},
    {"epoch": 0.4599334073251942, "grad_norm": 0.32433855533599854, "learning_rate": 0.00012868032327110904, "loss": 0.7708, "step": 259},
    {"epoch": 0.46170921198668147, "grad_norm": 0.31858816742897034, "learning_rate": 0.00012809311997735696, "loss": 0.7754, "step": 260},
    {"epoch": 0.4634850166481687, "grad_norm": 0.3172609210014343, "learning_rate": 0.00012750486311303218, "loss": 0.7839, "step": 261},
    {"epoch": 0.46526082130965596, "grad_norm": 0.2951931953430176, "learning_rate": 0.00012691557473941243, "loss": 0.7261, "step": 262},
    {"epoch": 0.46703662597114315, "grad_norm": 0.31385374069213867, "learning_rate": 0.00012632527695645993, "loss": 0.8221, "step": 263},
    {"epoch": 0.4688124306326304, "grad_norm": 0.31157392263412476, "learning_rate": 0.0001257339919019925, "loss": 0.7711, "step": 264},
    {"epoch": 0.47058823529411764, "grad_norm": 0.32580870389938354, "learning_rate": 0.00012514174175085345, "loss": 0.7592, "step": 265},
    {"epoch": 0.4723640399556049, "grad_norm": 0.33285781741142273, "learning_rate": 0.00012454854871407994, "loss": 0.7349, "step": 266},
    {"epoch": 0.47413984461709213, "grad_norm": 0.3179035186767578, "learning_rate": 0.0001239544350380699, "loss": 0.7338, "step": 267},
    {"epoch": 0.4759156492785794, "grad_norm": 0.31393003463745117, "learning_rate": 0.00012335942300374788, "loss": 0.7088, "step": 268},
    {"epoch": 0.47769145394006657, "grad_norm": 0.33285436034202576, "learning_rate": 0.00012276353492572935, "loss": 0.7069, "step": 269},
    {"epoch": 0.4794672586015538, "grad_norm": 0.38329485058784485, "learning_rate": 0.00012216679315148386, "loss": 0.7093, "step": 270},
    {"epoch": 0.48124306326304106, "grad_norm": 0.3584016263484955, "learning_rate": 0.00012156922006049702, "loss": 0.7513, "step": 271},
    {"epoch": 0.4830188679245283, "grad_norm": 0.3995126187801361, "learning_rate": 0.00012097083806343103, "loss": 0.7384, "step": 272},
    {"epoch": 0.48479467258601555, "grad_norm": 0.4097007215023041, "learning_rate": 0.00012037166960128443, "loss": 0.7794, "step": 273},
    {"epoch": 0.4865704772475028, "grad_norm": 0.4780315160751343, "learning_rate": 0.00011977173714455034, "loss": 0.7437, "step": 274},
    {"epoch": 0.48834628190899, "grad_norm": 0.5396427512168884, "learning_rate": 0.00011917106319237386, "loss": 0.7542, "step": 275},
    {"epoch": 0.49012208657047723, "grad_norm": 0.29439178109169006, "learning_rate": 0.00011856967027170818, "loss": 0.8389, "step": 276},
    {"epoch": 0.4918978912319645, "grad_norm": 0.3243663012981415, "learning_rate": 0.00011796758093646989, "loss": 0.8767, "step": 277},
    {"epoch": 0.4936736958934517, "grad_norm": 0.342454195022583, "learning_rate": 0.00011736481776669306, "loss": 0.8538, "step": 278},
    {"epoch": 0.49544950055493897, "grad_norm": 0.30882903933525085, "learning_rate": 0.00011676140336768236, "loss": 0.7766, "step": 279},
    {"epoch": 0.4972253052164262, "grad_norm": 0.3247200548648834, "learning_rate": 0.00011615736036916549, "loss": 0.8268, "step": 280},
    {"epoch": 0.4990011098779134, "grad_norm": 0.3077162504196167, "learning_rate": 0.00011555271142444433, "loss": 0.7786, "step": 281},
    {"epoch": 0.5007769145394007, "grad_norm": 0.3300260603427887, "learning_rate": 0.00011494747920954545, "loss": 0.7853, "step": 282},
    {"epoch": 0.5007769145394007, "eval_loss": 0.7658749222755432, "eval_runtime": 158.4653, "eval_samples_per_second": 5.989, "eval_steps_per_second": 1.502, "step": 282},
    {"epoch": 0.502552719200888, "grad_norm": 0.331061989068985, "learning_rate": 0.00011434168642236964, "loss": 0.8114, "step": 283},
    {"epoch": 0.5043285238623751, "grad_norm": 0.3186919689178467, "learning_rate": 0.00011373535578184082, "loss": 0.7872, "step": 284},
    {"epoch": 0.5061043285238623, "grad_norm": 0.3114188611507416, "learning_rate": 0.00011312851002705383, "loss": 0.7311, "step": 285},
    {"epoch": 0.5078801331853496, "grad_norm": 0.3148879408836365, "learning_rate": 0.00011252117191642175, "loss": 0.7311, "step": 286},
    {"epoch": 0.5096559378468368, "grad_norm": 0.3390887379646301, "learning_rate": 0.00011191336422682237, "loss": 0.7773, "step": 287},
    {"epoch": 0.5114317425083241, "grad_norm": 0.31982842087745667, "learning_rate": 0.00011130510975274409, "loss": 0.7474, "step": 288},
    {"epoch": 0.5132075471698113, "grad_norm": 0.31643104553222656, "learning_rate": 0.00011069643130543084, "loss": 0.7375, "step": 289},
    {"epoch": 0.5149833518312985, "grad_norm": 0.33758479356765747, "learning_rate": 0.00011008735171202684, "loss": 0.7411, "step": 290},
    {"epoch": 0.5167591564927858, "grad_norm": 0.324556440114975, "learning_rate": 0.00010947789381472035, "loss": 0.7235, "step": 291},
    {"epoch": 0.518534961154273, "grad_norm": 0.3768496513366699, "learning_rate": 0.00010886808046988717, "loss": 0.7618, "step": 292},
    {"epoch": 0.5203107658157603, "grad_norm": 0.34034618735313416, "learning_rate": 0.00010825793454723325, "loss": 0.7426, "step": 293},
    {"epoch": 0.5220865704772475, "grad_norm": 0.3409979045391083, "learning_rate": 0.00010764747892893723, "loss": 0.7327, "step": 294},
    {"epoch": 0.5238623751387348, "grad_norm": 0.35839787125587463, "learning_rate": 0.00010703673650879218, "loss": 0.7057, "step": 295},
    {"epoch": 0.525638179800222, "grad_norm": 0.3807874023914337, "learning_rate": 0.00010642573019134703, "loss": 0.7225, "step": 296},
    {"epoch": 0.5274139844617092, "grad_norm": 0.4682140648365021, "learning_rate": 0.00010581448289104758, "loss": 0.715, "step": 297},
    {"epoch": 0.5291897891231965, "grad_norm": 0.4261273145675659, "learning_rate": 0.00010520301753137724, "loss": 0.7239, "step": 298},
    {"epoch": 0.5309655937846837, "grad_norm": 0.4854682981967926, "learning_rate": 0.00010459135704399718, "loss": 0.7304, "step": 299},
    {"epoch": 0.532741398446171, "grad_norm": 0.6740989685058594, "learning_rate": 0.00010397952436788642, "loss": 0.8604, "step": 300},
    {"epoch": 0.5345172031076582, "grad_norm": 0.2903907299041748, "learning_rate": 0.00010336754244848157, "loss": 0.8551, "step": 301},
    {"epoch": 0.5362930077691453, "grad_norm": 0.28648582100868225, "learning_rate": 0.00010275543423681621, "loss": 0.7958, "step": 302},
    {"epoch": 0.5380688124306326, "grad_norm": 0.33123767375946045, "learning_rate": 0.00010214322268866032, "loss": 0.7853, "step": 303},
    {"epoch": 0.5398446170921198, "grad_norm": 0.31327784061431885, "learning_rate": 0.00010153093076365923, "loss": 0.7856, "step": 304},
    {"epoch": 0.5416204217536071, "grad_norm": 0.3101854622364044, "learning_rate": 0.00010091858142447265, "loss": 0.7694, "step": 305},
    {"epoch": 0.5433962264150943, "grad_norm": 0.3217926621437073, "learning_rate": 0.00010030619763591347, "loss": 0.7899, "step": 306},
    {"epoch": 0.5451720310765816, "grad_norm": 0.33827194571495056, "learning_rate": 9.969380236408656e-05, "loss": 0.8088, "step": 307},
    {"epoch": 0.5469478357380688, "grad_norm": 0.32632124423980713, "learning_rate": 9.908141857552737e-05, "loss": 0.769, "step": 308},
    {"epoch": 0.548723640399556, "grad_norm": 0.3152617812156677, "learning_rate": 9.846906923634079e-05, "loss": 0.7804, "step": 309},
    {"epoch": 0.5504994450610433, "grad_norm": 0.33337536454200745, "learning_rate": 9.78567773113397e-05, "loss": 0.7379, "step": 310},
    {"epoch": 0.5522752497225305, "grad_norm": 0.3020349144935608, "learning_rate": 9.724456576318381e-05, "loss": 0.7146, "step": 311},
    {"epoch": 0.5540510543840178, "grad_norm": 0.34656378626823425, "learning_rate": 9.663245755151846e-05, "loss": 0.7437, "step": 312},
    {"epoch": 0.555826859045505, "grad_norm": 0.3417186737060547, "learning_rate": 9.602047563211359e-05, "loss": 0.7472, "step": 313},
    {"epoch": 0.5576026637069922, "grad_norm": 0.34442222118377686, "learning_rate": 9.540864295600283e-05, "loss": 0.7426, "step": 314},
    {"epoch": 0.5593784683684795, "grad_norm": 0.3521478772163391, "learning_rate": 9.479698246862276e-05, "loss": 0.7522, "step": 315},
    {"epoch": 0.5611542730299667, "grad_norm": 0.3358227014541626, "learning_rate": 9.418551710895243e-05, "loss": 0.7454, "step": 316},
    {"epoch": 0.562930077691454, "grad_norm": 0.343226820230484, "learning_rate": 9.357426980865301e-05, "loss": 0.7341, "step": 317},
    {"epoch": 0.5647058823529412, "grad_norm": 0.3432699739933014, "learning_rate": 9.296326349120785e-05, "loss": 0.6836, "step": 318},
    {"epoch": 0.5664816870144284, "grad_norm": 0.3710852265357971, "learning_rate": 9.235252107106279e-05, "loss": 0.6961, "step": 319},
    {"epoch": 0.5682574916759157, "grad_norm": 0.351094514131546, "learning_rate": 9.174206545276677e-05, "loss": 0.6668, "step": 320},
    {"epoch": 0.5700332963374029, "grad_norm": 0.4484163224697113, "learning_rate": 9.113191953011287e-05, "loss": 0.7427, "step": 321},
    {"epoch": 0.5718091009988902, "grad_norm": 0.44636109471321106, "learning_rate": 9.052210618527966e-05, "loss": 0.8119, "step": 322},
    {"epoch": 0.5735849056603773, "grad_norm": 0.43749314546585083, "learning_rate": 8.991264828797319e-05, "loss": 0.7846, "step": 323},
    {"epoch": 0.5753607103218646, "grad_norm": 0.4471510350704193, "learning_rate": 8.930356869456919e-05, "loss": 0.7215, "step": 324},
    {"epoch": 0.5771365149833518, "grad_norm": 0.5141078233718872, "learning_rate": 8.869489024725595e-05, "loss": 0.7492, "step": 325},
    {"epoch": 0.578912319644839, "grad_norm": 0.2640296518802643, "learning_rate": 8.808663577317764e-05, "loss": 0.8625, "step": 326},
    {"epoch": 0.5806881243063263, "grad_norm": 0.28867048025131226, "learning_rate": 8.747882808357828e-05, "loss": 0.8352, "step": 327},
    {"epoch": 0.5824639289678135, "grad_norm": 0.2925030589103699, "learning_rate": 8.687148997294621e-05, "loss": 0.8091, "step": 328},
    {"epoch": 0.5842397336293008, "grad_norm": 0.28383681178092957, "learning_rate": 8.626464421815919e-05, "loss": 0.784, "step": 329},
    {"epoch": 0.586015538290788, "grad_norm": 0.3055633306503296, "learning_rate": 8.565831357763039e-05, "loss": 0.79, "step": 330},
    {"epoch": 0.5877913429522752, "grad_norm": 0.30299943685531616, "learning_rate": 8.505252079045458e-05, "loss": 0.8105, "step": 331},
    {"epoch": 0.5895671476137625, "grad_norm": 0.3154890239238739, "learning_rate": 8.444728857555572e-05, "loss": 0.7664, "step": 332},
    {"epoch": 0.5913429522752497, "grad_norm": 0.31844133138656616, "learning_rate": 8.384263963083453e-05, "loss": 0.7709, "step": 333},
    {"epoch": 0.593118756936737, "grad_norm": 0.31844353675842285, "learning_rate": 8.323859663231768e-05, "loss": 0.7426, "step": 334},
    {"epoch": 0.5948945615982242, "grad_norm": 0.31527841091156006, "learning_rate": 8.263518223330697e-05, "loss": 0.7441, "step": 335},
    {"epoch": 0.5966703662597115, "grad_norm": 0.32145699858665466, "learning_rate": 8.203241906353014e-05, "loss": 0.7333, "step": 336},
    {"epoch": 0.5984461709211987, "grad_norm": 0.3175109922885895, "learning_rate": 8.143032972829183e-05, "loss": 0.7488, "step": 337},
    {"epoch": 0.6002219755826859, "grad_norm": 0.3342651128768921, "learning_rate": 8.082893680762619e-05, "loss": 0.7265, "step": 338},
    {"epoch": 0.6019977802441732, "grad_norm": 0.339743971824646, "learning_rate": 8.022826285544968e-05, "loss": 0.7005, "step": 339},
    {"epoch": 0.6037735849056604, "grad_norm": 0.35757359862327576, "learning_rate": 7.96283303987156e-05, "loss": 0.7806, "step": 340},
    {"epoch": 0.6055493895671477, "grad_norm": 0.4024328291416168, "learning_rate": 7.902916193656898e-05, "loss": 0.6895, "step": 341},
    {"epoch": 0.6073251942286348, "grad_norm": 0.3628247380256653, "learning_rate": 7.843077993950302e-05, "loss": 0.7285, "step": 342},
    {"epoch": 0.609100998890122, "grad_norm": 0.3793889582157135, "learning_rate": 7.783320684851614e-05, "loss": 0.729, "step": 343},
    {"epoch": 0.6108768035516093, "grad_norm": 0.37614578008651733, "learning_rate": 7.72364650742707e-05, "loss": 0.6869, "step": 344},
    {"epoch": 0.6126526082130965, "grad_norm": 0.3737132251262665, "learning_rate": 7.664057699625214e-05, "loss": 0.7373, "step": 345},
    {"epoch": 0.6144284128745838, "grad_norm": 0.40523961186408997, "learning_rate": 7.604556496193015e-05, "loss": 0.729, "step": 346},
    {"epoch": 0.616204217536071, "grad_norm": 0.3903469145298004, "learning_rate": 7.54514512859201e-05, "loss": 0.7063, "step": 347},
    {"epoch": 0.6179800221975583, "grad_norm": 0.43782973289489746, "learning_rate": 7.485825824914659e-05, "loss": 0.6763, "step": 348},
    {"epoch": 0.6197558268590455, "grad_norm": 0.4907206594944, "learning_rate": 7.426600809800752e-05, "loss": 0.7405, "step": 349},
    {"epoch": 0.6215316315205327, "grad_norm": 0.5378274917602539, "learning_rate": 7.36747230435401e-05, "loss": 0.7417, "step": 350},
    {"epoch": 0.62330743618202, "grad_norm": 0.266481876373291, "learning_rate": 7.308442526058756e-05, "loss": 0.8434, "step": 351},
    {"epoch": 0.6250832408435072, "grad_norm": 0.28670433163642883, "learning_rate": 7.249513688696786e-05, "loss": 0.8049, "step": 352},
    {"epoch": 0.6268590455049945, "grad_norm": 0.29961690306663513, "learning_rate": 7.190688002264308e-05, "loss": 0.762, "step": 353},
    {"epoch": 0.6286348501664817, "grad_norm": 0.2873949706554413, "learning_rate": 7.131967672889101e-05, "loss": 0.7389, "step": 354},
    {"epoch": 0.6304106548279689, "grad_norm": 0.3315136730670929, "learning_rate": 7.073354902747741e-05, "loss": 0.7719, "step": 355},
    {"epoch": 0.6321864594894562, "grad_norm": 0.31057095527648926, "learning_rate": 7.014851889983057e-05, "loss": 0.7407, "step": 356},
    {"epoch": 0.6339622641509434, "grad_norm": 0.345838725566864, "learning_rate": 6.95646082862164e-05, "loss": 0.7838, "step": 357},
    {"epoch": 0.6357380688124307, "grad_norm": 0.31915196776390076, "learning_rate": 6.898183908491617e-05, "loss": 0.7591, "step": 358},
    {"epoch": 0.6375138734739179, "grad_norm": 0.3124110698699951, "learning_rate": 6.840023315140475e-05, "loss": 0.7222, "step": 359},
    {"epoch": 0.6392896781354052, "grad_norm": 0.3307512104511261, "learning_rate": 6.781981229753145e-05, "loss": 0.7472, "step": 360},
    {"epoch": 0.6410654827968923, "grad_norm": 0.3425205945968628, "learning_rate": 6.724059829070158e-05, "loss": 0.764, "step": 361},
    {"epoch": 0.6428412874583795, "grad_norm": 0.33861225843429565, "learning_rate": 6.666261285306047e-05, "loss": 0.7396, "step": 362},
    {"epoch": 0.6446170921198668, "grad_norm": 0.3248923420906067, "learning_rate": 6.608587766067852e-05, "loss": 0.7158, "step": 363},
    {"epoch": 0.646392896781354, "grad_norm": 0.349185049533844, "learning_rate": 6.551041434273861e-05, "loss": 0.7415, "step": 364},
    {"epoch": 0.6481687014428413, "grad_norm": 0.33934569358825684, "learning_rate": 6.493624448072457e-05, "loss": 0.744, "step": 365},
|
{ |
|
"epoch": 0.6499445061043285, |
|
"grad_norm": 0.3628052771091461, |
|
"learning_rate": 6.43633896076122e-05, |
|
"loss": 0.7328, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 0.6517203107658157, |
|
"grad_norm": 0.348979115486145, |
|
"learning_rate": 6.379187120706138e-05, |
|
"loss": 0.6755, |
|
"step": 367 |
|
}, |
|
{ |
|
"epoch": 0.653496115427303, |
|
"grad_norm": 0.38474076986312866, |
|
"learning_rate": 6.322171071261071e-05, |
|
"loss": 0.711, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 0.6552719200887902, |
|
"grad_norm": 0.34556257724761963, |
|
"learning_rate": 6.26529295068733e-05, |
|
"loss": 0.6995, |
|
"step": 369 |
|
}, |
|
{ |
|
"epoch": 0.6570477247502775, |
|
"grad_norm": 0.4337230622768402, |
|
"learning_rate": 6.208554892073528e-05, |
|
"loss": 0.7412, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.6588235294117647, |
|
"grad_norm": 0.37804853916168213, |
|
"learning_rate": 6.151959023255545e-05, |
|
"loss": 0.6724, |
|
"step": 371 |
|
}, |
|
{ |
|
"epoch": 0.6605993340732519, |
|
"grad_norm": 0.40870919823646545, |
|
"learning_rate": 6.095507466736763e-05, |
|
"loss": 0.7243, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 0.6623751387347392, |
|
"grad_norm": 0.45504140853881836, |
|
"learning_rate": 6.039202339608432e-05, |
|
"loss": 0.7373, |
|
"step": 373 |
|
}, |
|
{ |
|
"epoch": 0.6641509433962264, |
|
"grad_norm": 0.46973538398742676, |
|
"learning_rate": 5.983045753470308e-05, |
|
"loss": 0.7101, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 0.6659267480577137, |
|
"grad_norm": 0.5572993755340576, |
|
"learning_rate": 5.927039814351426e-05, |
|
"loss": 0.7393, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.6677025527192009, |
|
"grad_norm": 0.2691468596458435, |
|
"learning_rate": 5.8711866226311553e-05, |
|
"loss": 0.8102, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 0.6694783573806882, |
|
"grad_norm": 0.2898322641849518, |
|
"learning_rate": 5.8154882729603876e-05, |
|
"loss": 0.7968, |
|
"step": 377 |
|
}, |
|
{ |
|
"epoch": 0.6712541620421754, |
|
"grad_norm": 0.3048444092273712, |
|
"learning_rate": 5.7599468541830356e-05, |
|
"loss": 0.775, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 0.6730299667036626, |
|
"grad_norm": 0.3111611604690552, |
|
"learning_rate": 5.7045644492576346e-05, |
|
"loss": 0.7742, |
|
"step": 379 |
|
}, |
|
{ |
|
"epoch": 0.6748057713651499, |
|
"grad_norm": 0.31889772415161133, |
|
"learning_rate": 5.64934313517927e-05, |
|
"loss": 0.7304, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.676581576026637, |
|
"grad_norm": 0.3219664692878723, |
|
"learning_rate": 5.5942849829016695e-05, |
|
"loss": 0.7679, |
|
"step": 381 |
|
}, |
|
{ |
|
"epoch": 0.6783573806881243, |
|
"grad_norm": 0.30955034494400024, |
|
"learning_rate": 5.5393920572595356e-05, |
|
"loss": 0.7443, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 0.6801331853496115, |
|
"grad_norm": 0.344043105840683, |
|
"learning_rate": 5.484666416891109e-05, |
|
"loss": 0.7272, |
|
"step": 383 |
|
}, |
|
{ |
|
"epoch": 0.6819089900110987, |
|
"grad_norm": 0.33895599842071533, |
|
"learning_rate": 5.430110114160964e-05, |
|
"loss": 0.7585, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 0.683684794672586, |
|
"grad_norm": 0.37816834449768066, |
|
"learning_rate": 5.375725195083046e-05, |
|
"loss": 0.7749, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.6854605993340732, |
|
"grad_norm": 0.3477395176887512, |
|
"learning_rate": 5.321513699243924e-05, |
|
"loss": 0.7022, |
|
"step": 386 |
|
}, |
|
{ |
|
"epoch": 0.6872364039955605, |
|
"grad_norm": 0.3380398154258728, |
|
"learning_rate": 5.2674776597263186e-05, |
|
"loss": 0.7266, |
|
"step": 387 |
|
}, |
|
{ |
|
"epoch": 0.6890122086570477, |
|
"grad_norm": 0.35505762696266174, |
|
"learning_rate": 5.2136191030328455e-05, |
|
"loss": 0.7411, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 0.690788013318535, |
|
"grad_norm": 0.38739171624183655, |
|
"learning_rate": 5.159940049010015e-05, |
|
"loss": 0.7666, |
|
"step": 389 |
|
}, |
|
{ |
|
"epoch": 0.6925638179800222, |
|
"grad_norm": 0.38473132252693176, |
|
"learning_rate": 5.106442510772489e-05, |
|
"loss": 0.7038, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.6943396226415094, |
|
"grad_norm": 0.37635302543640137, |
|
"learning_rate": 5.0531284946275784e-05, |
|
"loss": 0.7488, |
|
"step": 391 |
|
}, |
|
{ |
|
"epoch": 0.6961154273029967, |
|
"grad_norm": 0.37422046065330505, |
|
"learning_rate": 5.000000000000002e-05, |
|
"loss": 0.693, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 0.6978912319644839, |
|
"grad_norm": 0.3987278342247009, |
|
"learning_rate": 4.9470590193569044e-05, |
|
"loss": 0.6965, |
|
"step": 393 |
|
}, |
|
{ |
|
"epoch": 0.6996670366259712, |
|
"grad_norm": 0.34372609853744507, |
|
"learning_rate": 4.894307538133129e-05, |
|
"loss": 0.6632, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 0.7014428412874584, |
|
"grad_norm": 0.4215118885040283, |
|
"learning_rate": 4.841747534656763e-05, |
|
"loss": 0.7081, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.7032186459489456, |
|
"grad_norm": 0.4211183488368988, |
|
"learning_rate": 4.7893809800749403e-05, |
|
"loss": 0.687, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 0.7049944506104329, |
|
"grad_norm": 0.44248080253601074, |
|
"learning_rate": 4.737209838279922e-05, |
|
"loss": 0.7118, |
|
"step": 397 |
|
}, |
|
{ |
|
"epoch": 0.7067702552719201, |
|
"grad_norm": 0.38100606203079224, |
|
"learning_rate": 4.685236065835443e-05, |
|
"loss": 0.6259, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 0.7085460599334074, |
|
"grad_norm": 0.46482354402542114, |
|
"learning_rate": 4.6334616119033356e-05, |
|
"loss": 0.6668, |
|
"step": 399 |
|
}, |
|
{ |
|
"epoch": 0.7103218645948945, |
|
"grad_norm": 0.5484885573387146, |
|
"learning_rate": 4.5818884181704294e-05, |
|
"loss": 0.7973, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.7120976692563818, |
|
"grad_norm": 0.2660059928894043, |
|
"learning_rate": 4.530518418775733e-05, |
|
"loss": 0.7845, |
|
"step": 401 |
|
}, |
|
{ |
|
"epoch": 0.713873473917869, |
|
"grad_norm": 0.30005505681037903, |
|
"learning_rate": 4.479353540237903e-05, |
|
"loss": 0.8141, |
|
"step": 402 |
|
}, |
|
{ |
|
"epoch": 0.7156492785793562, |
|
"grad_norm": 0.3031437397003174, |
|
"learning_rate": 4.4283957013829846e-05, |
|
"loss": 0.7505, |
|
"step": 403 |
|
}, |
|
{ |
|
"epoch": 0.7174250832408435, |
|
"grad_norm": 0.3152884542942047, |
|
"learning_rate": 4.3776468132724604e-05, |
|
"loss": 0.8191, |
|
"step": 404 |
|
}, |
|
{ |
|
"epoch": 0.7192008879023307, |
|
"grad_norm": 0.3122805058956146, |
|
"learning_rate": 4.3271087791315734e-05, |
|
"loss": 0.7732, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.720976692563818, |
|
"grad_norm": 0.3241139054298401, |
|
"learning_rate": 4.276783494277954e-05, |
|
"loss": 0.7652, |
|
"step": 406 |
|
}, |
|
{ |
|
"epoch": 0.7227524972253052, |
|
"grad_norm": 0.3523857295513153, |
|
"learning_rate": 4.2266728460505375e-05, |
|
"loss": 0.7923, |
|
"step": 407 |
|
}, |
|
{ |
|
"epoch": 0.7245283018867924, |
|
"grad_norm": 0.3518478274345398, |
|
"learning_rate": 4.176778713738787e-05, |
|
"loss": 0.8046, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 0.7263041065482797, |
|
"grad_norm": 0.35740435123443604, |
|
"learning_rate": 4.127102968512214e-05, |
|
"loss": 0.741, |
|
"step": 409 |
|
}, |
|
{ |
|
"epoch": 0.7280799112097669, |
|
"grad_norm": 0.3561273217201233, |
|
"learning_rate": 4.077647473350201e-05, |
|
"loss": 0.7304, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.7298557158712542, |
|
"grad_norm": 0.3595544397830963, |
|
"learning_rate": 4.028414082972141e-05, |
|
"loss": 0.7601, |
|
"step": 411 |
|
}, |
|
{ |
|
"epoch": 0.7316315205327414, |
|
"grad_norm": 0.38603028655052185, |
|
"learning_rate": 3.97940464376787e-05, |
|
"loss": 0.768, |
|
"step": 412 |
|
}, |
|
{ |
|
"epoch": 0.7334073251942287, |
|
"grad_norm": 0.347781240940094, |
|
"learning_rate": 3.9306209937284346e-05, |
|
"loss": 0.7255, |
|
"step": 413 |
|
}, |
|
{ |
|
"epoch": 0.7351831298557159, |
|
"grad_norm": 0.3760242462158203, |
|
"learning_rate": 3.882064962377154e-05, |
|
"loss": 0.7371, |
|
"step": 414 |
|
}, |
|
{ |
|
"epoch": 0.7369589345172031, |
|
"grad_norm": 0.359371542930603, |
|
"learning_rate": 3.83373837070101e-05, |
|
"loss": 0.7422, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.7387347391786904, |
|
"grad_norm": 0.3574449419975281, |
|
"learning_rate": 3.7856430310823545e-05, |
|
"loss": 0.6915, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 0.7405105438401776, |
|
"grad_norm": 0.3730245530605316, |
|
"learning_rate": 3.737780747230941e-05, |
|
"loss": 0.7309, |
|
"step": 417 |
|
}, |
|
{ |
|
"epoch": 0.7422863485016649, |
|
"grad_norm": 0.36496400833129883, |
|
"learning_rate": 3.69015331411628e-05, |
|
"loss": 0.7245, |
|
"step": 418 |
|
}, |
|
{ |
|
"epoch": 0.744062153163152, |
|
"grad_norm": 0.3593985140323639, |
|
"learning_rate": 3.642762517900322e-05, |
|
"loss": 0.6389, |
|
"step": 419 |
|
}, |
|
{ |
|
"epoch": 0.7458379578246392, |
|
"grad_norm": 0.3603939116001129, |
|
"learning_rate": 3.595610135870472e-05, |
|
"loss": 0.703, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.7476137624861265, |
|
"grad_norm": 0.397124320268631, |
|
"learning_rate": 3.548697936372937e-05, |
|
"loss": 0.7265, |
|
"step": 421 |
|
}, |
|
{ |
|
"epoch": 0.7493895671476137, |
|
"grad_norm": 0.4071907103061676, |
|
"learning_rate": 3.5020276787464056e-05, |
|
"loss": 0.6752, |
|
"step": 422 |
|
}, |
|
{ |
|
"epoch": 0.751165371809101, |
|
"grad_norm": 0.3834024965763092, |
|
"learning_rate": 3.455601113256073e-05, |
|
"loss": 0.6297, |
|
"step": 423 |
|
}, |
|
{ |
|
"epoch": 0.751165371809101, |
|
"eval_loss": 0.7374839186668396, |
|
"eval_runtime": 156.6123, |
|
"eval_samples_per_second": 6.06, |
|
"eval_steps_per_second": 1.52, |
|
"step": 423 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 563, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 141, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 5.571234948741857e+17, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|