|
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9929408114188193,
  "eval_steps": 500,
  "global_step": 3200,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0,
      "grad_norm": 20.729074478149414,
      "learning_rate": 2.5e-06,
      "loss": 8.0612,
      "step": 5
    },
    {
      "epoch": 0.0,
      "grad_norm": 16.320600509643555,
      "learning_rate": 5e-06,
      "loss": 7.3007,
      "step": 10
    },
    {
      "epoch": 0.0,
      "grad_norm": 17.508378982543945,
      "learning_rate": 7.5e-06,
      "loss": 7.7541,
      "step": 15
    },
    {
      "epoch": 0.01,
      "grad_norm": 18.7609920501709,
      "learning_rate": 1e-05,
      "loss": 7.0762,
      "step": 20
    },
    {
      "epoch": 0.01,
      "grad_norm": 10.039741516113281,
      "learning_rate": 1.25e-05,
      "loss": 6.3794,
      "step": 25
    },
    {
      "epoch": 0.01,
      "grad_norm": 10.681583404541016,
      "learning_rate": 1.5e-05,
      "loss": 5.7463,
      "step": 30
    },
    {
      "epoch": 0.01,
      "grad_norm": 8.521218299865723,
      "learning_rate": 1.75e-05,
      "loss": 5.1425,
      "step": 35
    },
    {
      "epoch": 0.01,
      "grad_norm": 8.024609565734863,
      "learning_rate": 2e-05,
      "loss": 4.8565,
      "step": 40
    },
    {
      "epoch": 0.01,
      "grad_norm": 6.419050216674805,
      "learning_rate": 2.25e-05,
      "loss": 4.4552,
      "step": 45
    },
    {
      "epoch": 0.02,
      "grad_norm": 7.1052398681640625,
      "learning_rate": 2.5e-05,
      "loss": 4.1432,
      "step": 50
    },
    {
      "epoch": 0.02,
      "grad_norm": 7.79315710067749,
      "learning_rate": 2.7500000000000004e-05,
      "loss": 3.9919,
      "step": 55
    },
    {
      "epoch": 0.02,
      "grad_norm": 5.008393287658691,
      "learning_rate": 3e-05,
      "loss": 3.3339,
      "step": 60
    },
    {
      "epoch": 0.02,
      "grad_norm": 8.750615119934082,
      "learning_rate": 3.2500000000000004e-05,
      "loss": 3.3154,
      "step": 65
    },
    {
      "epoch": 0.02,
      "grad_norm": 5.283076286315918,
      "learning_rate": 3.5e-05,
      "loss": 2.8296,
      "step": 70
    },
    {
      "epoch": 0.02,
      "grad_norm": 6.005578517913818,
      "learning_rate": 3.7500000000000003e-05,
      "loss": 2.8239,
      "step": 75
    },
    {
      "epoch": 0.02,
      "grad_norm": 7.009499549865723,
      "learning_rate": 4e-05,
      "loss": 3.0532,
      "step": 80
    },
    {
      "epoch": 0.03,
      "grad_norm": 5.712557315826416,
      "learning_rate": 4.25e-05,
      "loss": 2.8819,
      "step": 85
    },
    {
      "epoch": 0.03,
      "grad_norm": 4.914234638214111,
      "learning_rate": 4.5e-05,
      "loss": 2.8031,
      "step": 90
    },
    {
      "epoch": 0.03,
      "grad_norm": 7.396793842315674,
      "learning_rate": 4.75e-05,
      "loss": 2.6904,
      "step": 95
    },
    {
      "epoch": 0.03,
      "grad_norm": 5.087535381317139,
      "learning_rate": 5e-05,
      "loss": 2.772,
      "step": 100
    },
    {
      "epoch": 0.03,
      "grad_norm": 6.230583190917969,
      "learning_rate": 4.9999683566063894e-05,
      "loss": 2.6301,
      "step": 105
    },
    {
      "epoch": 0.03,
      "grad_norm": 4.741369724273682,
      "learning_rate": 4.9998734272266e-05,
      "loss": 2.5966,
      "step": 110
    },
    {
      "epoch": 0.04,
      "grad_norm": 4.758203506469727,
      "learning_rate": 4.9997152142637426e-05,
      "loss": 2.4406,
      "step": 115
    },
    {
      "epoch": 0.04,
      "grad_norm": 4.093080997467041,
      "learning_rate": 4.999493721722933e-05,
      "loss": 2.6457,
      "step": 120
    },
    {
      "epoch": 0.04,
      "grad_norm": 5.253550052642822,
      "learning_rate": 4.999208955211192e-05,
      "loss": 2.5449,
      "step": 125
    },
    {
      "epoch": 0.04,
      "grad_norm": 5.3556294441223145,
      "learning_rate": 4.998860921937302e-05,
      "loss": 2.5182,
      "step": 130
    },
    {
      "epoch": 0.04,
      "grad_norm": 3.888378620147705,
      "learning_rate": 4.998449630711627e-05,
      "loss": 2.6575,
      "step": 135
    },
    {
      "epoch": 0.04,
      "grad_norm": 4.9733967781066895,
      "learning_rate": 4.997975091945886e-05,
      "loss": 2.5669,
      "step": 140
    },
    {
      "epoch": 0.04,
      "grad_norm": 3.3941574096679688,
      "learning_rate": 4.997437317652894e-05,
      "loss": 2.5628,
      "step": 145
    },
    {
      "epoch": 0.05,
      "grad_norm": 3.743703842163086,
      "learning_rate": 4.996836321446253e-05,
      "loss": 2.6051,
      "step": 150
    },
    {
      "epoch": 0.05,
      "grad_norm": 3.359017848968506,
      "learning_rate": 4.99617211854001e-05,
      "loss": 2.2357,
      "step": 155
    },
    {
      "epoch": 0.05,
      "grad_norm": 4.703392028808594,
      "learning_rate": 4.995444725748274e-05,
      "loss": 2.4146,
      "step": 160
    },
    {
      "epoch": 0.05,
      "grad_norm": 4.182121753692627,
      "learning_rate": 4.994654161484784e-05,
      "loss": 2.4228,
      "step": 165
    },
    {
      "epoch": 0.05,
      "grad_norm": 4.623451232910156,
      "learning_rate": 4.993800445762451e-05,
      "loss": 2.4149,
      "step": 170
    },
    {
      "epoch": 0.05,
      "grad_norm": 3.7832231521606445,
      "learning_rate": 4.992883600192844e-05,
      "loss": 2.4566,
      "step": 175
    },
    {
      "epoch": 0.06,
      "grad_norm": 3.907249689102173,
      "learning_rate": 4.991903647985646e-05,
      "loss": 2.403,
      "step": 180
    },
    {
      "epoch": 0.06,
      "grad_norm": 3.4823191165924072,
      "learning_rate": 4.990860613948071e-05,
      "loss": 2.518,
      "step": 185
    },
    {
      "epoch": 0.06,
      "grad_norm": 4.531657695770264,
      "learning_rate": 4.989754524484225e-05,
      "loss": 2.4007,
      "step": 190
    },
    {
      "epoch": 0.06,
      "grad_norm": 4.945577621459961,
      "learning_rate": 4.988585407594449e-05,
      "loss": 2.3891,
      "step": 195
    },
    {
      "epoch": 0.06,
      "grad_norm": 3.9174554347991943,
      "learning_rate": 4.9873532928746036e-05,
      "loss": 2.2904,
      "step": 200
    },
    {
      "epoch": 0.06,
      "grad_norm": 3.8385236263275146,
      "learning_rate": 4.986058211515321e-05,
      "loss": 2.2802,
      "step": 205
    },
    {
      "epoch": 0.07,
      "grad_norm": 4.326376914978027,
      "learning_rate": 4.9847001963012176e-05,
      "loss": 2.295,
      "step": 210
    },
    {
      "epoch": 0.07,
      "grad_norm": 5.581832408905029,
      "learning_rate": 4.9832792816100605e-05,
      "loss": 2.4895,
      "step": 215
    },
    {
      "epoch": 0.07,
      "grad_norm": 3.5401458740234375,
      "learning_rate": 4.981795503411901e-05,
      "loss": 2.3254,
      "step": 220
    },
    {
      "epoch": 0.07,
      "grad_norm": 4.960626125335693,
      "learning_rate": 4.9802488992681594e-05,
      "loss": 2.2977,
      "step": 225
    },
    {
      "epoch": 0.07,
      "grad_norm": 4.908995628356934,
      "learning_rate": 4.978639508330681e-05,
      "loss": 2.3534,
      "step": 230
    },
    {
      "epoch": 0.07,
      "grad_norm": 4.865789890289307,
      "learning_rate": 4.976967371340736e-05,
      "loss": 2.3781,
      "step": 235
    },
    {
      "epoch": 0.07,
      "grad_norm": 4.27896785736084,
      "learning_rate": 4.975232530627998e-05,
      "loss": 2.3221,
      "step": 240
    },
    {
      "epoch": 0.08,
      "grad_norm": 3.9018704891204834,
      "learning_rate": 4.973435030109463e-05,
      "loss": 2.407,
      "step": 245
    },
    {
      "epoch": 0.08,
      "grad_norm": 3.4363269805908203,
      "learning_rate": 4.971574915288345e-05,
      "loss": 2.3857,
      "step": 250
    },
    {
      "epoch": 0.08,
      "grad_norm": 4.802529335021973,
      "learning_rate": 4.9696522332529205e-05,
      "loss": 2.183,
      "step": 255
    },
    {
      "epoch": 0.08,
      "grad_norm": 4.064101696014404,
      "learning_rate": 4.967667032675337e-05,
      "loss": 2.2134,
      "step": 260
    },
    {
      "epoch": 0.08,
      "grad_norm": 5.066267490386963,
      "learning_rate": 4.965619363810381e-05,
      "loss": 2.2722,
      "step": 265
    },
    {
      "epoch": 0.08,
      "grad_norm": 4.149215221405029,
      "learning_rate": 4.9635092784942064e-05,
      "loss": 2.3393,
      "step": 270
    },
    {
      "epoch": 0.09,
      "grad_norm": 3.8846592903137207,
      "learning_rate": 4.9613368301430194e-05,
      "loss": 2.2163,
      "step": 275
    },
    {
      "epoch": 0.09,
      "grad_norm": 4.181525230407715,
      "learning_rate": 4.9591020737517335e-05,
      "loss": 2.4478,
      "step": 280
    },
    {
      "epoch": 0.09,
      "grad_norm": 3.1801464557647705,
      "learning_rate": 4.956805065892568e-05,
      "loss": 2.2887,
      "step": 285
    },
    {
      "epoch": 0.09,
      "grad_norm": 5.8738250732421875,
      "learning_rate": 4.954445864713622e-05,
      "loss": 2.29,
      "step": 290
    },
    {
      "epoch": 0.09,
      "grad_norm": 4.968664646148682,
      "learning_rate": 4.9520245299374014e-05,
      "loss": 2.2801,
      "step": 295
    },
    {
      "epoch": 0.09,
      "grad_norm": 5.4960784912109375,
      "learning_rate": 4.949541122859305e-05,
      "loss": 2.3109,
      "step": 300
    },
    {
      "epoch": 0.09,
      "grad_norm": 3.6677656173706055,
      "learning_rate": 4.9469957063460747e-05,
      "loss": 2.2748,
      "step": 305
    },
    {
      "epoch": 0.1,
      "grad_norm": 2.90336275100708,
      "learning_rate": 4.944388344834205e-05,
      "loss": 2.2016,
      "step": 310
    },
    {
      "epoch": 0.1,
      "grad_norm": 3.515296459197998,
      "learning_rate": 4.9417191043283086e-05,
      "loss": 2.3607,
      "step": 315
    },
    {
      "epoch": 0.1,
      "grad_norm": 3.070936679840088,
      "learning_rate": 4.938988052399447e-05,
      "loss": 2.3314,
      "step": 320
    },
    {
      "epoch": 0.1,
      "grad_norm": 3.801671028137207,
      "learning_rate": 4.936195258183422e-05,
      "loss": 2.2395,
      "step": 325
    },
    {
      "epoch": 0.1,
      "grad_norm": 4.183629035949707,
      "learning_rate": 4.933340792379023e-05,
      "loss": 2.4527,
      "step": 330
    },
    {
      "epoch": 0.1,
      "grad_norm": 3.9023029804229736,
      "learning_rate": 4.930424727246238e-05,
      "loss": 2.2828,
      "step": 335
    },
    {
      "epoch": 0.11,
      "grad_norm": 3.6366467475891113,
      "learning_rate": 4.927447136604424e-05,
      "loss": 2.2859,
      "step": 340
    },
    {
      "epoch": 0.11,
      "grad_norm": 3.219228506088257,
      "learning_rate": 4.924408095830439e-05,
      "loss": 2.3497,
      "step": 345
    },
    {
      "epoch": 0.11,
      "grad_norm": 3.768355369567871,
      "learning_rate": 4.921307681856735e-05,
      "loss": 2.1229,
      "step": 350
    },
    {
      "epoch": 0.11,
      "grad_norm": 3.8723647594451904,
      "learning_rate": 4.9181459731694054e-05,
      "loss": 2.3544,
      "step": 355
    },
    {
      "epoch": 0.11,
      "grad_norm": 3.512420892715454,
      "learning_rate": 4.914923049806207e-05,
      "loss": 1.9489,
      "step": 360
    },
    {
      "epoch": 0.11,
      "grad_norm": 3.77095627784729,
      "learning_rate": 4.911638993354524e-05,
      "loss": 2.2499,
      "step": 365
    },
    {
      "epoch": 0.11,
      "grad_norm": 3.8103721141815186,
      "learning_rate": 4.90829388694931e-05,
      "loss": 2.1032,
      "step": 370
    },
    {
      "epoch": 0.12,
      "grad_norm": 3.6579902172088623,
      "learning_rate": 4.9048878152709785e-05,
      "loss": 2.2104,
      "step": 375
    },
    {
      "epoch": 0.12,
      "grad_norm": 5.087968826293945,
      "learning_rate": 4.901420864543265e-05,
      "loss": 2.2601,
      "step": 380
    },
    {
      "epoch": 0.12,
      "grad_norm": 3.773608684539795,
      "learning_rate": 4.8978931225310375e-05,
      "loss": 2.1831,
      "step": 385
    },
    {
      "epoch": 0.12,
      "grad_norm": 6.229213714599609,
      "learning_rate": 4.8943046785380795e-05,
      "loss": 2.2507,
      "step": 390
    },
    {
      "epoch": 0.12,
      "grad_norm": 5.113283634185791,
      "learning_rate": 4.890655623404828e-05,
      "loss": 2.2868,
      "step": 395
    },
    {
      "epoch": 0.12,
      "grad_norm": 3.9976158142089844,
      "learning_rate": 4.8869460495060726e-05,
      "loss": 2.264,
      "step": 400
    },
    {
      "epoch": 0.13,
      "grad_norm": 4.450018405914307,
      "learning_rate": 4.883176050748619e-05,
      "loss": 2.2319,
      "step": 405
    },
    {
      "epoch": 0.13,
      "grad_norm": 3.610208511352539,
      "learning_rate": 4.879345722568911e-05,
      "loss": 2.1011,
      "step": 410
    },
    {
      "epoch": 0.13,
      "grad_norm": 3.5385842323303223,
      "learning_rate": 4.875455161930614e-05,
      "loss": 2.2372,
      "step": 415
    },
    {
      "epoch": 0.13,
      "grad_norm": 3.5152907371520996,
      "learning_rate": 4.871504467322162e-05,
      "loss": 2.3424,
      "step": 420
    },
    {
      "epoch": 0.13,
      "grad_norm": 3.0804309844970703,
      "learning_rate": 4.867493738754263e-05,
      "loss": 1.9902,
      "step": 425
    },
    {
      "epoch": 0.13,
      "grad_norm": 4.568037033081055,
      "learning_rate": 4.8634230777573655e-05,
      "loss": 2.216,
      "step": 430
    },
    {
      "epoch": 0.13,
      "grad_norm": 3.0766966342926025,
      "learning_rate": 4.859292587379094e-05,
      "loss": 2.2049,
      "step": 435
    },
    {
      "epoch": 0.14,
      "grad_norm": 3.8717846870422363,
      "learning_rate": 4.855102372181634e-05,
      "loss": 2.179,
      "step": 440
    },
    {
      "epoch": 0.14,
      "grad_norm": 3.963639497756958,
      "learning_rate": 4.8508525382390876e-05,
      "loss": 2.3567,
      "step": 445
    },
    {
      "epoch": 0.14,
      "grad_norm": 3.3204896450042725,
      "learning_rate": 4.8465431931347904e-05,
      "loss": 2.1157,
      "step": 450
    },
    {
      "epoch": 0.14,
      "grad_norm": 4.884645938873291,
      "learning_rate": 4.842174445958585e-05,
      "loss": 2.192,
      "step": 455
    },
    {
      "epoch": 0.14,
      "grad_norm": 5.058561325073242,
      "learning_rate": 4.837746407304061e-05,
      "loss": 2.2785,
      "step": 460
    },
    {
      "epoch": 0.14,
      "grad_norm": 4.240612983703613,
      "learning_rate": 4.833259189265753e-05,
      "loss": 2.3115,
      "step": 465
    },
    {
      "epoch": 0.15,
      "grad_norm": 3.628058433532715,
      "learning_rate": 4.8287129054363076e-05,
      "loss": 2.3267,
      "step": 470
    },
    {
      "epoch": 0.15,
      "grad_norm": 3.4856207370758057,
      "learning_rate": 4.8241076709036036e-05,
      "loss": 2.1803,
      "step": 475
    },
    {
      "epoch": 0.15,
      "grad_norm": 4.317348480224609,
      "learning_rate": 4.8194436022478404e-05,
      "loss": 2.1224,
      "step": 480
    },
    {
      "epoch": 0.15,
      "grad_norm": 3.6160759925842285,
      "learning_rate": 4.814720817538585e-05,
      "loss": 2.1848,
      "step": 485
    },
    {
      "epoch": 0.15,
      "grad_norm": 3.2244794368743896,
      "learning_rate": 4.809939436331786e-05,
      "loss": 2.2176,
      "step": 490
    },
    {
      "epoch": 0.15,
      "grad_norm": 3.645427942276001,
      "learning_rate": 4.805099579666748e-05,
      "loss": 2.1778,
      "step": 495
    },
    {
      "epoch": 0.16,
      "grad_norm": 3.9020988941192627,
      "learning_rate": 4.800201370063059e-05,
      "loss": 2.2817,
      "step": 500
    },
    {
      "epoch": 0.16,
      "grad_norm": 4.484887599945068,
      "learning_rate": 4.7952449315174996e-05,
      "loss": 1.9207,
      "step": 505
    },
    {
      "epoch": 0.16,
      "grad_norm": 4.281662464141846,
      "learning_rate": 4.790230389500901e-05,
      "loss": 2.2251,
      "step": 510
    },
    {
      "epoch": 0.16,
      "grad_norm": 3.9683914184570312,
      "learning_rate": 4.785157870954961e-05,
      "loss": 2.22,
      "step": 515
    },
    {
      "epoch": 0.16,
      "grad_norm": 3.39128041267395,
      "learning_rate": 4.780027504289042e-05,
      "loss": 2.3237,
      "step": 520
    },
    {
      "epoch": 0.16,
      "grad_norm": 3.148158550262451,
      "learning_rate": 4.774839419376914e-05,
      "loss": 2.1838,
      "step": 525
    },
    {
      "epoch": 0.16,
      "grad_norm": 4.339906692504883,
      "learning_rate": 4.769593747553468e-05,
      "loss": 2.0075,
      "step": 530
    },
    {
      "epoch": 0.17,
      "grad_norm": 3.3067688941955566,
      "learning_rate": 4.764290621611388e-05,
      "loss": 2.1666,
      "step": 535
    },
    {
      "epoch": 0.17,
      "grad_norm": 4.491573810577393,
      "learning_rate": 4.758930175797797e-05,
      "loss": 2.3295,
      "step": 540
    },
    {
      "epoch": 0.17,
      "grad_norm": 3.894711494445801,
      "learning_rate": 4.753512545810851e-05,
      "loss": 2.1021,
      "step": 545
    },
    {
      "epoch": 0.17,
      "grad_norm": 2.7983195781707764,
      "learning_rate": 4.7480378687963114e-05,
      "loss": 2.2335,
      "step": 550
    },
    {
      "epoch": 0.17,
      "grad_norm": 3.40674090385437,
      "learning_rate": 4.7425062833440634e-05,
      "loss": 2.0456,
      "step": 555
    },
    {
      "epoch": 0.17,
      "grad_norm": 3.834815263748169,
      "learning_rate": 4.736917929484616e-05,
      "loss": 2.3161,
      "step": 560
    },
    {
      "epoch": 0.18,
      "grad_norm": 3.8907999992370605,
      "learning_rate": 4.731272948685554e-05,
      "loss": 2.1104,
      "step": 565
    },
    {
      "epoch": 0.18,
      "grad_norm": 3.7746763229370117,
      "learning_rate": 4.725571483847958e-05,
      "loss": 2.0498,
      "step": 570
    },
    {
      "epoch": 0.18,
      "grad_norm": 4.495760917663574,
      "learning_rate": 4.719813679302784e-05,
      "loss": 2.231,
      "step": 575
    },
    {
      "epoch": 0.18,
      "grad_norm": 3.9231886863708496,
      "learning_rate": 4.713999680807211e-05,
      "loss": 2.1878,
      "step": 580
    },
    {
      "epoch": 0.18,
      "grad_norm": 4.197574138641357,
      "learning_rate": 4.708129635540955e-05,
      "loss": 2.1897,
      "step": 585
    },
    {
      "epoch": 0.18,
      "grad_norm": 4.721147060394287,
      "learning_rate": 4.702203692102539e-05,
      "loss": 2.1359,
      "step": 590
    },
    {
      "epoch": 0.18,
      "grad_norm": 2.4958722591400146,
      "learning_rate": 4.696222000505529e-05,
      "loss": 2.1873,
      "step": 595
    },
    {
      "epoch": 0.19,
      "grad_norm": 3.5209269523620605,
      "learning_rate": 4.6901847121747455e-05,
      "loss": 2.0386,
      "step": 600
    },
    {
      "epoch": 0.19,
      "grad_norm": 3.6823954582214355,
      "learning_rate": 4.6840919799424186e-05,
      "loss": 2.0325,
      "step": 605
    },
    {
      "epoch": 0.19,
      "grad_norm": 4.033428192138672,
      "learning_rate": 4.677943958044329e-05,
      "loss": 2.13,
      "step": 610
    },
    {
      "epoch": 0.19,
      "grad_norm": 3.907592535018921,
      "learning_rate": 4.671740802115897e-05,
      "loss": 2.0553,
      "step": 615
    },
    {
      "epoch": 0.19,
      "grad_norm": 3.318100690841675,
      "learning_rate": 4.665482669188248e-05,
      "loss": 2.0218,
      "step": 620
    },
    {
      "epoch": 0.19,
      "grad_norm": 4.057621479034424,
      "learning_rate": 4.659169717684232e-05,
      "loss": 2.1056,
      "step": 625
    },
    {
      "epoch": 0.2,
      "grad_norm": 4.882345199584961,
      "learning_rate": 4.6528021074144165e-05,
      "loss": 2.1249,
      "step": 630
    },
    {
      "epoch": 0.2,
      "grad_norm": 4.954129219055176,
      "learning_rate": 4.646379999573039e-05,
      "loss": 2.1942,
      "step": 635
    },
    {
      "epoch": 0.2,
      "grad_norm": 4.156874656677246,
      "learning_rate": 4.639903556733931e-05,
      "loss": 2.175,
      "step": 640
    },
    {
      "epoch": 0.2,
      "grad_norm": 4.1573710441589355,
      "learning_rate": 4.633372942846393e-05,
      "loss": 2.0856,
      "step": 645
    },
    {
      "epoch": 0.2,
      "grad_norm": 5.385977745056152,
      "learning_rate": 4.6267883232310575e-05,
      "loss": 2.2399,
      "step": 650
    },
    {
      "epoch": 0.2,
      "grad_norm": 4.143659591674805,
      "learning_rate": 4.620149864575689e-05,
      "loss": 2.17,
      "step": 655
    },
    {
      "epoch": 0.2,
      "grad_norm": 3.286294460296631,
      "learning_rate": 4.613457734930978e-05,
      "loss": 2.0458,
      "step": 660
    },
    {
      "epoch": 0.21,
      "grad_norm": 4.520682334899902,
      "learning_rate": 4.606712103706278e-05,
      "loss": 2.1244,
      "step": 665
    },
    {
      "epoch": 0.21,
      "grad_norm": 3.6921236515045166,
      "learning_rate": 4.59991314166532e-05,
      "loss": 2.0801,
      "step": 670
    },
    {
      "epoch": 0.21,
      "grad_norm": 3.1880507469177246,
      "learning_rate": 4.593061020921889e-05,
      "loss": 2.3062,
      "step": 675
    },
    {
      "epoch": 0.21,
      "grad_norm": 3.380157709121704,
      "learning_rate": 4.586155914935469e-05,
      "loss": 2.0267,
      "step": 680
    },
    {
      "epoch": 0.21,
      "grad_norm": 3.0647785663604736,
      "learning_rate": 4.57919799850685e-05,
      "loss": 2.1566,
      "step": 685
    },
    {
      "epoch": 0.21,
      "grad_norm": 3.353318691253662,
      "learning_rate": 4.5721874477737006e-05,
      "loss": 2.0618,
      "step": 690
    },
    {
      "epoch": 0.22,
      "grad_norm": 3.342336654663086,
      "learning_rate": 4.5651244402061144e-05,
      "loss": 1.9534,
      "step": 695
    },
    {
      "epoch": 0.22,
      "grad_norm": 4.064236640930176,
      "learning_rate": 4.558009154602115e-05,
      "loss": 2.1573,
      "step": 700
    },
    {
      "epoch": 0.22,
      "grad_norm": 3.5223772525787354,
      "learning_rate": 4.550841771083129e-05,
      "loss": 2.0089,
      "step": 705
    },
    {
      "epoch": 0.22,
      "grad_norm": 4.3469557762146,
      "learning_rate": 4.543622471089426e-05,
      "loss": 2.1214,
      "step": 710
    },
    {
      "epoch": 0.22,
      "grad_norm": 3.922893762588501,
      "learning_rate": 4.536351437375526e-05,
      "loss": 2.0982,
      "step": 715
    },
    {
      "epoch": 0.22,
      "grad_norm": 3.053823947906494,
      "learning_rate": 4.529028854005576e-05,
      "loss": 2.0791,
      "step": 720
    },
    {
      "epoch": 0.22,
      "grad_norm": 3.636437177658081,
      "learning_rate": 4.521654906348687e-05,
      "loss": 2.1326,
      "step": 725
    },
    {
      "epoch": 0.23,
      "grad_norm": 4.3226318359375,
      "learning_rate": 4.51422978107424e-05,
      "loss": 2.2037,
      "step": 730
    },
    {
      "epoch": 0.23,
      "grad_norm": 4.59119987487793,
      "learning_rate": 4.506753666147163e-05,
      "loss": 2.1187,
      "step": 735
    },
    {
      "epoch": 0.23,
      "grad_norm": 5.592061996459961,
      "learning_rate": 4.499226750823177e-05,
      "loss": 2.3031,
      "step": 740
    },
    {
      "epoch": 0.23,
      "grad_norm": 4.18353271484375,
      "learning_rate": 4.491649225643996e-05,
      "loss": 2.0337,
      "step": 745
    },
    {
      "epoch": 0.23,
      "grad_norm": 3.2864906787872314,
      "learning_rate": 4.484021282432509e-05,
      "loss": 2.0575,
      "step": 750
    },
    {
      "epoch": 0.23,
      "grad_norm": 3.3072474002838135,
      "learning_rate": 4.476343114287924e-05,
      "loss": 2.0173,
      "step": 755
    },
    {
      "epoch": 0.24,
      "grad_norm": 4.088031768798828,
      "learning_rate": 4.468614915580879e-05,
      "loss": 2.1929,
      "step": 760
    },
    {
      "epoch": 0.24,
      "grad_norm": 4.264316082000732,
      "learning_rate": 4.4608368819485204e-05,
      "loss": 2.0457,
      "step": 765
    },
    {
      "epoch": 0.24,
      "grad_norm": 4.678459644317627,
      "learning_rate": 4.453009210289551e-05,
      "loss": 2.031,
      "step": 770
    },
    {
      "epoch": 0.24,
      "grad_norm": 3.3418045043945312,
      "learning_rate": 4.445132098759249e-05,
      "loss": 2.1464,
      "step": 775
    },
    {
      "epoch": 0.24,
      "grad_norm": 3.89583420753479,
      "learning_rate": 4.4372057467644455e-05,
      "loss": 2.1509,
      "step": 780
    },
    {
      "epoch": 0.24,
      "grad_norm": 2.6973416805267334,
      "learning_rate": 4.4292303549584816e-05,
      "loss": 2.072,
      "step": 785
    },
    {
      "epoch": 0.25,
      "grad_norm": 4.848878383636475,
      "learning_rate": 4.421206125236128e-05,
      "loss": 2.166,
      "step": 790
    },
    {
      "epoch": 0.25,
      "grad_norm": 3.48630428314209,
      "learning_rate": 4.4131332607284706e-05,
      "loss": 1.9686,
      "step": 795
    },
    {
      "epoch": 0.25,
      "grad_norm": 3.4183597564697266,
      "learning_rate": 4.405011965797775e-05,
      "loss": 2.0781,
      "step": 800
    },
    {
      "epoch": 0.25,
      "grad_norm": 3.5883586406707764,
      "learning_rate": 4.3968424460323047e-05,
      "loss": 2.0631,
      "step": 805
    },
    {
      "epoch": 0.25,
      "grad_norm": 3.683375835418701,
      "learning_rate": 4.388624908241124e-05,
      "loss": 2.0533,
      "step": 810
    },
    {
      "epoch": 0.25,
      "grad_norm": 3.0786943435668945,
      "learning_rate": 4.3803595604488595e-05,
      "loss": 1.8946,
      "step": 815
    },
    {
      "epoch": 0.25,
      "grad_norm": 3.2280662059783936,
      "learning_rate": 4.372046611890434e-05,
      "loss": 2.0221,
      "step": 820
    },
    {
      "epoch": 0.26,
      "grad_norm": 3.1918365955352783,
      "learning_rate": 4.36368627300577e-05,
      "loss": 2.0023,
      "step": 825
    },
    {
      "epoch": 0.26,
      "grad_norm": 4.814984321594238,
      "learning_rate": 4.3552787554344634e-05,
      "loss": 2.0967,
      "step": 830
    },
    {
      "epoch": 0.26,
      "grad_norm": 5.989580154418945,
      "learning_rate": 4.346824272010423e-05,
      "loss": 1.9698,
      "step": 835
    },
    {
      "epoch": 0.26,
      "grad_norm": 3.2674803733825684,
      "learning_rate": 4.338323036756488e-05,
      "loss": 2.0381,
      "step": 840
    },
    {
      "epoch": 0.26,
      "grad_norm": 3.6016860008239746,
      "learning_rate": 4.3297752648790035e-05,
      "loss": 2.0444,
      "step": 845
    },
    {
      "epoch": 0.26,
      "grad_norm": 4.092184543609619,
      "learning_rate": 4.321181172762379e-05,
      "loss": 2.1514,
      "step": 850
    },
    {
      "epoch": 0.27,
      "grad_norm": 3.5366742610931396,
      "learning_rate": 4.312540977963604e-05,
      "loss": 2.0518,
      "step": 855
    },
    {
      "epoch": 0.27,
      "grad_norm": 4.222804069519043,
      "learning_rate": 4.303854899206749e-05,
      "loss": 1.9858,
      "step": 860
    },
    {
      "epoch": 0.27,
      "grad_norm": 4.207810401916504,
      "learning_rate": 4.295123156377419e-05,
      "loss": 2.0067,
      "step": 865
    },
    {
      "epoch": 0.27,
      "grad_norm": 3.15069842338562,
      "learning_rate": 4.2863459705171945e-05,
      "loss": 1.9234,
      "step": 870
    },
    {
      "epoch": 0.27,
      "grad_norm": 3.337561845779419,
      "learning_rate": 4.2775235638180344e-05,
      "loss": 1.974,
      "step": 875
    },
    {
      "epoch": 0.27,
      "grad_norm": 5.987912178039551,
      "learning_rate": 4.2686561596166487e-05,
      "loss": 2.1928,
      "step": 880
    },
    {
      "epoch": 0.27,
      "grad_norm": 3.9456374645233154,
      "learning_rate": 4.259743982388845e-05,
      "loss": 2.023,
      "step": 885
    },
    {
      "epoch": 0.28,
      "grad_norm": 4.308691501617432,
      "learning_rate": 4.250787257743851e-05,
      "loss": 2.1075,
      "step": 890
    },
    {
      "epoch": 0.28,
      "grad_norm": 3.699410915374756,
      "learning_rate": 4.2417862124185955e-05,
      "loss": 2.0471,
      "step": 895
    },
    {
      "epoch": 0.28,
      "grad_norm": 4.254593372344971,
      "learning_rate": 4.232741074271977e-05,
      "loss": 2.0331,
      "step": 900
    },
    {
      "epoch": 0.28,
      "grad_norm": 3.2899739742279053,
      "learning_rate": 4.2236520722790855e-05,
      "loss": 2.0153,
      "step": 905
    },
    {
      "epoch": 0.28,
      "grad_norm": 5.5724616050720215,
      "learning_rate": 4.214519436525418e-05,
      "loss": 2.1466,
      "step": 910
    },
    {
      "epoch": 0.28,
      "grad_norm": 3.673755168914795,
      "learning_rate": 4.2053433982010436e-05,
      "loss": 2.1062,
      "step": 915
    },
    {
      "epoch": 0.29,
      "grad_norm": 4.009172439575195,
      "learning_rate": 4.1961241895947554e-05,
      "loss": 2.013,
      "step": 920
    },
    {
      "epoch": 0.29,
      "grad_norm": 3.0359890460968018,
      "learning_rate": 4.1868620440881925e-05,
      "loss": 2.1153,
      "step": 925
    },
    {
      "epoch": 0.29,
      "grad_norm": 4.953378200531006,
      "learning_rate": 4.177557196149927e-05,
      "loss": 2.0847,
      "step": 930
    },
    {
      "epoch": 0.29,
      "grad_norm": 3.580415964126587,
      "learning_rate": 4.168209881329531e-05,
      "loss": 1.9907,
      "step": 935
    },
    {
      "epoch": 0.29,
      "grad_norm": 3.3144888877868652,
      "learning_rate": 4.1588203362516153e-05,
      "loss": 2.0741,
      "step": 940
    },
    {
      "epoch": 0.29,
      "grad_norm": 4.115612983703613,
      "learning_rate": 4.149388798609836e-05,
      "loss": 1.9596,
      "step": 945
    },
    {
      "epoch": 0.29,
      "grad_norm": 5.178717613220215,
      "learning_rate": 4.1399155071608774e-05,
      "loss": 2.142,
      "step": 950
    },
    {
      "epoch": 0.3,
      "grad_norm": 3.350316286087036,
      "learning_rate": 4.1304007017184146e-05,
      "loss": 2.06,
      "step": 955
    },
    {
      "epoch": 0.3,
      "grad_norm": 4.030082702636719,
      "learning_rate": 4.120844623147033e-05,
      "loss": 2.0618,
      "step": 960
    },
    {
      "epoch": 0.3,
      "grad_norm": 5.1543707847595215,
      "learning_rate": 4.1112475133561376e-05,
      "loss": 2.3692,
      "step": 965
    },
    {
      "epoch": 0.3,
      "grad_norm": 3.9695091247558594,
      "learning_rate": 4.101609615293827e-05,
      "loss": 2.0065,
      "step": 970
    },
    {
      "epoch": 0.3,
      "grad_norm": 3.1106691360473633,
      "learning_rate": 4.0919311729407416e-05,
      "loss": 2.0318,
      "step": 975
    },
    {
      "epoch": 0.3,
      "grad_norm": 3.532636880874634,
      "learning_rate": 4.0822124313038904e-05,
      "loss": 2.139,
      "step": 980
    },
    {
      "epoch": 0.31,
      "grad_norm": 4.04263162612915,
      "learning_rate": 4.072453636410448e-05,
      "loss": 2.1352,
      "step": 985
    },
    {
      "epoch": 0.31,
      "grad_norm": 4.174222946166992,
      "learning_rate": 4.0626550353015236e-05,
      "loss": 2.0269,
      "step": 990
    },
    {
      "epoch": 0.31,
      "grad_norm": 4.390026569366455,
      "learning_rate": 4.052816876025912e-05,
      "loss": 2.0775,
      "step": 995
    },
    {
      "epoch": 0.31,
      "grad_norm": 4.04339075088501,
      "learning_rate": 4.042939407633808e-05,
      "loss": 2.0042,
      "step": 1000
    },
    {
      "epoch": 0.31,
      "grad_norm": 3.5550975799560547,
      "learning_rate": 4.03302288017051e-05,
      "loss": 1.9624,
      "step": 1005
    },
    {
      "epoch": 0.31,
      "grad_norm": 4.015019416809082,
      "learning_rate": 4.023067544670082e-05,
      "loss": 2.142,
      "step": 1010
    },
    {
      "epoch": 0.31,
      "grad_norm": 3.452937126159668,
      "learning_rate": 4.013073653149005e-05,
      "loss": 2.0798,
      "step": 1015
    },
    {
      "epoch": 0.32,
      "grad_norm": 4.2777509689331055,
      "learning_rate": 4.0030414585997925e-05,
      "loss": 2.0245,
      "step": 1020
    },
    {
      "epoch": 0.32,
      "grad_norm": 5.5015459060668945,
      "learning_rate": 3.99297121498459e-05,
      "loss": 2.0897,
      "step": 1025
    },
    {
      "epoch": 0.32,
      "grad_norm": 4.524988651275635,
      "learning_rate": 3.982863177228743e-05,
      "loss": 2.182,
      "step": 1030
    },
    {
      "epoch": 0.32,
      "grad_norm": 4.300734043121338,
      "learning_rate": 3.972717601214345e-05,
      "loss": 2.0477,
      "step": 1035
    },
    {
      "epoch": 0.32,
      "grad_norm": 3.456317186355591,
      "learning_rate": 3.962534743773761e-05,
      "loss": 2.1261,
      "step": 1040
    },
    {
      "epoch": 0.32,
      "grad_norm": 3.567162275314331,
      "learning_rate": 3.9523148626831234e-05,
      "loss": 2.119,
      "step": 1045
    },
    {
      "epoch": 0.33,
      "grad_norm": 3.5200531482696533,
      "learning_rate": 3.942058216655808e-05,
      "loss": 1.9731,
      "step": 1050
    },
    {
      "epoch": 0.33,
      "grad_norm": 4.380658149719238,
      "learning_rate": 3.931765065335886e-05,
      "loss": 1.9642,
      "step": 1055
    },
    {
      "epoch": 0.33,
      "grad_norm": 4.44472074508667,
      "learning_rate": 3.921435669291547e-05,
      "loss": 1.8666,
      "step": 1060
    },
    {
      "epoch": 0.33,
      "grad_norm": 5.24396276473999,
      "learning_rate": 3.9110702900085064e-05,
      "loss": 2.0983,
      "step": 1065
    },
    {
      "epoch": 0.33,
      "grad_norm": 4.166001319885254,
      "learning_rate": 3.900669189883386e-05,
      "loss": 1.9032,
      "step": 1070
    },
    {
      "epoch": 0.33,
      "grad_norm": 3.893059730529785,
      "learning_rate": 3.890232632217071e-05,
      "loss": 1.9269,
      "step": 1075
    },
    {
      "epoch": 0.34,
      "grad_norm": 3.5707895755767822,
      "learning_rate": 3.879760881208042e-05,
      "loss": 1.9055,
      "step": 1080
    },
    {
      "epoch": 0.34,
      "grad_norm": 4.270632743835449,
      "learning_rate": 3.869254201945692e-05,
      "loss": 1.9936,
      "step": 1085
    },
    {
      "epoch": 0.34,
      "grad_norm": 4.152591228485107,
      "learning_rate": 3.858712860403608e-05,
      "loss": 2.1007,
      "step": 1090
    },
    {
      "epoch": 0.34,
      "grad_norm": 3.5370168685913086,
      "learning_rate": 3.848137123432848e-05,
      "loss": 2.1225,
      "step": 1095
    },
    {
      "epoch": 0.34,
      "grad_norm": 3.657259941101074,
      "learning_rate": 3.837527258755177e-05,
      "loss": 1.9526,
      "step": 1100
    },
    {
      "epoch": 0.34,
      "grad_norm": 4.236551761627197,
      "learning_rate": 3.8268835349562946e-05,
      "loss": 1.9357,
      "step": 1105
    },
    {
      "epoch": 0.34,
      "grad_norm": 3.312053680419922,
      "learning_rate": 3.816206221479034e-05,
      "loss": 1.9833,
      "step": 1110
    },
    {
      "epoch": 0.35,
      "grad_norm": 3.346323013305664,
      "learning_rate": 3.8054955886165427e-05,
      "loss": 1.9351,
      "step": 1115
    },
    {
      "epoch": 0.35,
      "grad_norm": 3.557433843612671,
      "learning_rate": 3.7947519075054364e-05,
      "loss": 2.0037,
      "step": 1120
    },
    {
      "epoch": 0.35,
      "grad_norm": 3.824169635772705,
      "learning_rate": 3.7839754501189406e-05,
      "loss": 2.1035,
      "step": 1125
    },
    {
      "epoch": 0.35,
      "grad_norm": 4.1984968185424805,
      "learning_rate": 3.7731664892600004e-05,
      "loss": 1.9416,
      "step": 1130
    },
    {
      "epoch": 0.35,
      "grad_norm": 2.998347520828247,
      "learning_rate": 3.762325298554379e-05,
      "loss": 1.9615,
      "step": 1135
    },
    {
      "epoch": 0.35,
      "grad_norm": 4.985104560852051,
      "learning_rate": 3.751452152443728e-05,
      "loss": 1.912,
      "step": 1140
    },
    {
      "epoch": 0.36,
      "grad_norm": 3.560026168823242,
      "learning_rate": 3.74054732617864e-05,
      "loss": 1.9317,
      "step": 1145
    },
    {
      "epoch": 0.36,
      "grad_norm": 3.894937515258789,
      "learning_rate": 3.7296110958116844e-05,
      "loss": 1.9516,
      "step": 1150
    },
    {
      "epoch": 0.36,
      "grad_norm": 3.1330158710479736,
      "learning_rate": 3.718643738190414e-05,
      "loss": 1.8787,
      "step": 1155
    },
    {
      "epoch": 0.36,
      "grad_norm": 3.924584150314331,
      "learning_rate": 3.707645530950361e-05,
      "loss": 1.9294,
      "step": 1160
    },
    {
      "epoch": 0.36,
      "grad_norm": 3.2176225185394287,
      "learning_rate": 3.6966167525080056e-05,
      "loss": 2.1003,
      "step": 1165
    },
    {
      "epoch": 0.36,
      "grad_norm": 3.9685873985290527,
      "learning_rate": 3.6855576820537277e-05,
      "loss": 1.9088,
      "step": 1170
    },
    {
      "epoch": 0.36,
      "grad_norm": 4.544212818145752,
      "learning_rate": 3.674468599544746e-05,
      "loss": 2.0211,
      "step": 1175
    },
    {
      "epoch": 0.37,
      "grad_norm": 3.6609127521514893,
      "learning_rate": 3.663349785698021e-05,
      "loss": 2.0021,
      "step": 1180
    },
    {
      "epoch": 0.37,
      "grad_norm": 4.17726469039917,
      "learning_rate": 3.6522015219831546e-05,
      "loss": 2.0828,
      "step": 1185
    },
    {
      "epoch": 0.37,
      "grad_norm": 3.6899638175964355,
      "learning_rate": 3.641024090615265e-05,
      "loss": 1.9462,
      "step": 1190
    },
    {
      "epoch": 0.37,
      "grad_norm": 3.7764229774475098,
      "learning_rate": 3.62981777454784e-05,
      "loss": 2.0825,
      "step": 1195
    },
    {
      "epoch": 0.37,
      "grad_norm": 4.037018775939941,
      "learning_rate": 3.6185828574655766e-05,
      "loss": 1.8715,
      "step": 1200
    },
    {
      "epoch": 0.37,
      "grad_norm": 3.727513074874878,
      "learning_rate": 3.607319623777196e-05,
      "loss": 1.9394,
      "step": 1205
    },
    {
      "epoch": 0.38,
      "grad_norm": 4.162086009979248,
      "learning_rate": 3.59602835860825e-05,
      "loss": 1.89,
      "step": 1210
    },
    {
      "epoch": 0.38,
      "grad_norm": 3.546518564224243,
      "learning_rate": 3.5847093477938956e-05,
      "loss": 1.8102,
      "step": 1215
    },
    {
      "epoch": 0.38,
      "grad_norm": 4.054803371429443,
      "learning_rate": 3.5733628778716646e-05,
      "loss": 1.8825,
      "step": 1220
    },
    {
      "epoch": 0.38,
      "grad_norm": 3.638885498046875,
      "learning_rate": 3.5619892360742075e-05,
      "loss": 2.0755,
      "step": 1225
    },
    {
      "epoch": 0.38,
      "grad_norm": 3.433565378189087,
      "learning_rate": 3.5505887103220254e-05,
      "loss": 2.0261,
      "step": 1230
    },
    {
      "epoch": 0.38,
      "grad_norm": 3.5785629749298096,
      "learning_rate": 3.5391615892161754e-05,
      "loss": 2.1362,
      "step": 1235
    },
    {
      "epoch": 0.38,
      "grad_norm": 3.4514031410217285,
      "learning_rate": 3.527708162030971e-05,
      "loss": 1.8821,
      "step": 1240
    },
    {
      "epoch": 0.39,
      "grad_norm": 4.2519073486328125,
      "learning_rate": 3.516228718706656e-05,
      "loss": 2.112,
      "step": 1245
    },
    {
      "epoch": 0.39,
      "grad_norm": 3.0281126499176025,
      "learning_rate": 3.504723549842066e-05,
      "loss": 1.8516,
      "step": 1250
    },
    {
      "epoch": 0.39,
      "grad_norm": 3.3636157512664795,
      "learning_rate": 3.4931929466872685e-05,
      "loss": 1.9612,
      "step": 1255
    },
    {
      "epoch": 0.39,
      "grad_norm": 3.7413578033447266,
      "learning_rate": 3.481637201136197e-05,
      "loss": 1.9865,
      "step": 1260
    },
    {
      "epoch": 0.39,
      "grad_norm": 3.007408618927002,
      "learning_rate": 3.4700566057192544e-05,
      "loss": 1.9493,
      "step": 1265
    },
    {
      "epoch": 0.39,
      "grad_norm": 4.331480979919434,
      "learning_rate": 3.4584514535959114e-05,
      "loss": 2.1174,
      "step": 1270
    },
    {
      "epoch": 0.4,
      "grad_norm": 4.286431312561035,
      "learning_rate": 3.446822038547287e-05,
      "loss": 1.883,
      "step": 1275
    },
    {
      "epoch": 0.4,
      "grad_norm": 3.356170177459717,
      "learning_rate": 3.435168654968706e-05,
      "loss": 1.9707,
      "step": 1280
    },
    {
      "epoch": 0.4,
      "grad_norm": 3.436434507369995,
      "learning_rate": 3.423491597862251e-05,
      "loss": 1.8922,
      "step": 1285
    },
    {
      "epoch": 0.4,
      "grad_norm": 3.307274580001831,
      "learning_rate": 3.411791162829294e-05,
      "loss": 2.0583,
      "step": 1290
    },
    {
      "epoch": 0.4,
      "grad_norm": 4.032553195953369,
      "learning_rate": 3.4000676460630126e-05,
      "loss": 2.0121,
      "step": 1295
    },
    {
      "epoch": 0.4,
      "grad_norm": 3.4915122985839844,
      "learning_rate": 3.3883213443408903e-05,
      "loss": 1.9361,
      "step": 1300
    },
    {
      "epoch": 0.4,
      "grad_norm": 3.969005823135376,
      "learning_rate": 3.3765525550172066e-05,
      "loss": 1.8782,
      "step": 1305
    },
    {
      "epoch": 0.41,
      "grad_norm": 3.772780179977417,
      "learning_rate": 3.364761576015507e-05,
      "loss": 2.0914,
      "step": 1310
    },
    {
      "epoch": 0.41,
      "grad_norm": 2.9640040397644043,
      "learning_rate": 3.352948705821065e-05,
      "loss": 1.9143,
      "step": 1315
    },
    {
      "epoch": 0.41,
      "grad_norm": 5.698980331420898,
      "learning_rate": 3.341114243473319e-05,
      "loss": 1.9417,
      "step": 1320
    },
    {
      "epoch": 0.41,
      "grad_norm": 3.4275810718536377,
      "learning_rate": 3.3292584885583114e-05,
      "loss": 1.9053,
      "step": 1325
    },
    {
      "epoch": 0.41,
      "grad_norm": 3.2752602100372314,
      "learning_rate": 3.317381741201097e-05,
      "loss": 2.0126,
      "step": 1330
    },
    {
      "epoch": 0.41,
      "grad_norm": 4.166382312774658,
      "learning_rate": 3.305484302058148e-05,
      "loss": 1.9256,
      "step": 1335
    },
    {
      "epoch": 0.42,
      "grad_norm": 3.7549707889556885,
      "learning_rate": 3.293566472309746e-05,
      "loss": 2.0742,
      "step": 1340
    },
    {
      "epoch": 0.42,
      "grad_norm": 3.449774980545044,
      "learning_rate": 3.2816285536523515e-05,
      "loss": 1.9322,
      "step": 1345
    },
    {
      "epoch": 0.42,
      "grad_norm": 3.590756416320801,
      "learning_rate": 3.269670848290973e-05,
      "loss": 1.9619,
      "step": 1350
    },
    {
      "epoch": 0.42,
      "grad_norm": 4.403102874755859,
      "learning_rate": 3.2576936589315124e-05,
      "loss": 1.9513,
      "step": 1355
    },
    {
      "epoch": 0.42,
      "grad_norm": 4.1176676750183105,
      "learning_rate": 3.245697288773102e-05,
      "loss": 2.0274,
      "step": 1360
    },
    {
      "epoch": 0.42,
      "grad_norm": 4.0299859046936035,
      "learning_rate": 3.233682041500433e-05,
      "loss": 1.9853,
      "step": 1365
    },
    {
      "epoch": 0.43,
      "grad_norm": 4.306421279907227,
      "learning_rate": 3.2216482212760646e-05,
      "loss": 1.949,
      "step": 1370
    },
    {
      "epoch": 0.43,
      "grad_norm": 3.9233736991882324,
      "learning_rate": 3.209596132732725e-05,
      "loss": 1.9009,
      "step": 1375
    },
    {
      "epoch": 0.43,
      "grad_norm": 3.82336163520813,
      "learning_rate": 3.197526080965598e-05,
      "loss": 2.1035,
      "step": 1380
    },
    {
      "epoch": 0.43,
      "grad_norm": 3.946753740310669,
      "learning_rate": 3.185438371524605e-05,
      "loss": 1.9775,
      "step": 1385
    },
    {
      "epoch": 0.43,
      "grad_norm": 4.122159481048584,
      "learning_rate": 3.173333310406662e-05,
      "loss": 1.7694,
      "step": 1390
    },
    {
      "epoch": 0.43,
      "grad_norm": 3.5491435527801514,
      "learning_rate": 3.161211204047943e-05,
      "loss": 2.0022,
      "step": 1395
    },
    {
      "epoch": 0.43,
      "grad_norm": 4.0456438064575195,
      "learning_rate": 3.1490723593161096e-05,
      "loss": 2.1332,
      "step": 1400
    },
    {
      "epoch": 0.44,
      "grad_norm": 3.476616621017456,
      "learning_rate": 3.1369170835025594e-05,
      "loss": 1.9567,
      "step": 1405
    },
    {
      "epoch": 0.44,
      "grad_norm": 3.3506128787994385,
      "learning_rate": 3.124745684314633e-05,
      "loss": 2.1015,
      "step": 1410
    },
    {
      "epoch": 0.44,
      "grad_norm": 3.737765312194824,
      "learning_rate": 3.112558469867829e-05,
      "loss": 1.9677,
      "step": 1415
    },
    {
      "epoch": 0.44,
      "grad_norm": 3.6628215312957764,
      "learning_rate": 3.100355748678009e-05,
      "loss": 2.1167,
      "step": 1420
    },
    {
      "epoch": 0.44,
      "grad_norm": 3.3631627559661865,
      "learning_rate": 3.0881378296535784e-05,
      "loss": 1.928,
      "step": 1425
    },
    {
      "epoch": 0.44,
      "grad_norm": 4.281042575836182,
      "learning_rate": 3.075905022087675e-05,
      "loss": 1.9394,
      "step": 1430
    },
    {
      "epoch": 0.45,
      "grad_norm": 3.994631290435791,
      "learning_rate": 3.063657635650335e-05,
      "loss": 1.8533,
      "step": 1435
    },
    {
      "epoch": 0.45,
      "grad_norm": 5.131731033325195,
      "learning_rate": 3.0513959803806526e-05,
      "loss": 1.9484,
      "step": 1440
    },
    {
      "epoch": 0.45,
      "grad_norm": 3.4644176959991455,
      "learning_rate": 3.039120366678937e-05,
      "loss": 1.9492,
      "step": 1445
    },
    {
      "epoch": 0.45,
      "grad_norm": 3.832453966140747,
      "learning_rate": 3.0268311052988473e-05,
      "loss": 1.869,
      "step": 1450
    },
    {
      "epoch": 0.45,
      "grad_norm": 3.8497562408447266,
      "learning_rate": 3.0145285073395334e-05,
      "loss": 1.8965,
      "step": 1455
    },
    {
      "epoch": 0.45,
      "grad_norm": 3.4898972511291504,
      "learning_rate": 3.0022128842377534e-05,
      "loss": 2.0029,
      "step": 1460
    },
    {
      "epoch": 0.45,
      "grad_norm": 4.340991020202637,
      "learning_rate": 2.9898845477599963e-05,
      "loss": 1.9139,
      "step": 1465
    },
    {
      "epoch": 0.46,
      "grad_norm": 5.687810897827148,
      "learning_rate": 2.9775438099945836e-05,
      "loss": 2.0196,
      "step": 1470
    },
    {
      "epoch": 0.46,
      "grad_norm": 3.468388795852661,
      "learning_rate": 2.965190983343774e-05,
      "loss": 2.0382,
      "step": 1475
    },
    {
      "epoch": 0.46,
      "grad_norm": 3.2167277336120605,
      "learning_rate": 2.9528263805158524e-05,
      "loss": 2.0924,
      "step": 1480
    },
    {
      "epoch": 0.46,
      "grad_norm": 4.481842041015625,
      "learning_rate": 2.940450314517214e-05,
      "loss": 2.0535,
      "step": 1485
    },
    {
      "epoch": 0.46,
      "grad_norm": 4.334501266479492,
      "learning_rate": 2.92806309864444e-05,
      "loss": 1.9523,
      "step": 1490
    },
    {
      "epoch": 0.46,
      "grad_norm": 4.137599945068359,
      "learning_rate": 2.9156650464763713e-05,
      "loss": 2.0247,
      "step": 1495
    },
    {
      "epoch": 0.47,
      "grad_norm": 3.5023269653320312,
      "learning_rate": 2.9032564718661603e-05,
      "loss": 2.0151,
      "step": 1500
    },
    {
      "epoch": 0.47,
      "grad_norm": 4.225565433502197,
      "learning_rate": 2.8908376889333376e-05,
      "loss": 1.9438,
      "step": 1505
    },
    {
      "epoch": 0.47,
      "grad_norm": 3.86175799369812,
      "learning_rate": 2.8784090120558515e-05,
      "loss": 2.0108,
      "step": 1510
    },
    {
      "epoch": 0.47,
      "grad_norm": 2.7544214725494385,
      "learning_rate": 2.865970755862114e-05,
      "loss": 1.943,
      "step": 1515
    },
    {
      "epoch": 0.47,
      "grad_norm": 3.8477399349212646,
      "learning_rate": 2.8535232352230345e-05,
      "loss": 1.891,
      "step": 1520
    },
    {
      "epoch": 0.47,
      "grad_norm": 3.7875800132751465,
      "learning_rate": 2.8410667652440482e-05,
      "loss": 1.9343,
      "step": 1525
    },
    {
      "epoch": 0.47,
      "grad_norm": 3.8977842330932617,
      "learning_rate": 2.828601661257142e-05,
      "loss": 1.8978,
      "step": 1530
    },
    {
      "epoch": 0.48,
      "grad_norm": 3.39017915725708,
      "learning_rate": 2.8161282388128696e-05,
      "loss": 1.9368,
      "step": 1535
    },
    {
      "epoch": 0.48,
      "grad_norm": 4.3148322105407715,
      "learning_rate": 2.8036468136723627e-05,
      "loss": 1.9393,
      "step": 1540
    },
    {
      "epoch": 0.48,
      "grad_norm": 3.528031587600708,
      "learning_rate": 2.7911577017993412e-05,
      "loss": 1.831,
      "step": 1545
    },
    {
      "epoch": 0.48,
      "grad_norm": 4.506915092468262,
      "learning_rate": 2.778661219352111e-05,
      "loss": 2.1384,
      "step": 1550
    },
    {
      "epoch": 0.48,
      "grad_norm": 4.252208709716797,
      "learning_rate": 2.766157682675562e-05,
      "loss": 1.9593,
      "step": 1555
    },
    {
      "epoch": 0.48,
      "grad_norm": 3.718641996383667,
      "learning_rate": 2.753647408293161e-05,
      "loss": 1.9347,
      "step": 1560
    },
    {
      "epoch": 0.49,
      "grad_norm": 3.7793309688568115,
      "learning_rate": 2.7411307128989368e-05,
      "loss": 1.9519,
      "step": 1565
    },
    {
      "epoch": 0.49,
      "grad_norm": 3.7921085357666016,
      "learning_rate": 2.728607913349464e-05,
      "loss": 1.8966,
      "step": 1570
    },
    {
      "epoch": 0.49,
      "grad_norm": 3.735579252243042,
      "learning_rate": 2.7160793266558443e-05,
      "loss": 1.8972,
      "step": 1575
    },
    {
      "epoch": 0.49,
      "grad_norm": 4.979485511779785,
      "learning_rate": 2.7035452699756768e-05,
      "loss": 1.9879,
      "step": 1580
    },
    {
      "epoch": 0.49,
      "grad_norm": 3.672161102294922,
      "learning_rate": 2.6910060606050324e-05,
      "loss": 1.895,
      "step": 1585
    },
    {
      "epoch": 0.49,
      "grad_norm": 3.2381715774536133,
      "learning_rate": 2.6784620159704222e-05,
      "loss": 1.9259,
      "step": 1590
    },
    {
      "epoch": 0.49,
      "grad_norm": 5.407585620880127,
      "learning_rate": 2.6659134536207587e-05,
      "loss": 1.9021,
      "step": 1595
    },
    {
      "epoch": 0.5,
      "grad_norm": 3.894399642944336,
      "learning_rate": 2.6533606912193216e-05,
      "loss": 2.0666,
      "step": 1600
    },
    {
      "epoch": 0.5,
      "grad_norm": 3.4516754150390625,
      "learning_rate": 2.6408040465357097e-05,
      "loss": 1.9388,
      "step": 1605
    },
    {
      "epoch": 0.5,
      "grad_norm": 5.389581203460693,
      "learning_rate": 2.628243837437806e-05,
      "loss": 1.9731,
      "step": 1610
    },
    {
      "epoch": 0.5,
      "grad_norm": 3.623656988143921,
      "learning_rate": 2.6156803818837204e-05,
      "loss": 1.8931,
      "step": 1615
    },
    {
      "epoch": 0.5,
      "grad_norm": 3.5042312145233154,
      "learning_rate": 2.6031139979137492e-05,
      "loss": 1.8365,
      "step": 1620
    },
    {
      "epoch": 0.5,
      "grad_norm": 5.07073974609375,
      "learning_rate": 2.59054500364232e-05,
      "loss": 2.0215,
      "step": 1625
    },
    {
      "epoch": 0.51,
      "grad_norm": 4.199176788330078,
      "learning_rate": 2.5779737172499396e-05,
      "loss": 1.967,
      "step": 1630
    },
    {
      "epoch": 0.51,
      "grad_norm": 4.009402751922607,
      "learning_rate": 2.565400456975138e-05,
      "loss": 2.0154,
      "step": 1635
    },
    {
      "epoch": 0.51,
      "grad_norm": 3.114271640777588,
      "learning_rate": 2.552825541106414e-05,
      "loss": 1.9405,
      "step": 1640
    },
    {
      "epoch": 0.51,
      "grad_norm": 3.4758782386779785,
      "learning_rate": 2.540249287974178e-05,
      "loss": 1.94,
      "step": 1645
    },
    {
      "epoch": 0.51,
      "grad_norm": 6.038011074066162,
      "learning_rate": 2.527672015942693e-05,
      "loss": 2.1653,
      "step": 1650
    },
    {
      "epoch": 0.51,
      "grad_norm": 3.370410203933716,
      "learning_rate": 2.5150940434020132e-05,
      "loss": 1.9588,
      "step": 1655
    },
    {
      "epoch": 0.52,
      "grad_norm": 3.766829252243042,
      "learning_rate": 2.5025156887599288e-05,
      "loss": 1.8133,
      "step": 1660
    },
    {
      "epoch": 0.52,
      "grad_norm": 3.650520086288452,
      "learning_rate": 2.489937270433901e-05,
      "loss": 1.9111,
      "step": 1665
    },
    {
      "epoch": 0.52,
      "grad_norm": 3.1080238819122314,
      "learning_rate": 2.4773591068430018e-05,
      "loss": 1.8758,
      "step": 1670
    },
    {
      "epoch": 0.52,
      "grad_norm": 3.3637783527374268,
      "learning_rate": 2.4647815163998585e-05,
      "loss": 1.7589,
      "step": 1675
    },
    {
      "epoch": 0.52,
      "grad_norm": 4.043179988861084,
      "learning_rate": 2.452204817502587e-05,
      "loss": 1.9339,
      "step": 1680
    },
    {
      "epoch": 0.52,
      "grad_norm": 4.033404350280762,
      "learning_rate": 2.4396293285267327e-05,
      "loss": 1.9412,
      "step": 1685
    },
    {
      "epoch": 0.52,
      "grad_norm": 4.043616771697998,
      "learning_rate": 2.427055367817214e-05,
      "loss": 1.8728,
      "step": 1690
    },
    {
      "epoch": 0.53,
      "grad_norm": 4.840696811676025,
      "learning_rate": 2.4144832536802628e-05,
      "loss": 1.9966,
      "step": 1695
    },
    {
      "epoch": 0.53,
      "grad_norm": 4.977992057800293,
      "learning_rate": 2.4019133043753628e-05,
      "loss": 1.9621,
      "step": 1700
    },
    {
      "epoch": 0.53,
      "grad_norm": 3.1471240520477295,
      "learning_rate": 2.3893458381071964e-05,
      "loss": 2.0315,
      "step": 1705
    },
    {
      "epoch": 0.53,
      "grad_norm": 5.21504020690918,
      "learning_rate": 2.376781173017589e-05,
      "loss": 1.9859,
      "step": 1710
    },
    {
      "epoch": 0.53,
      "grad_norm": 3.4117472171783447,
      "learning_rate": 2.3642196271774568e-05,
      "loss": 1.905,
      "step": 1715
    },
    {
      "epoch": 0.53,
      "grad_norm": 3.8640167713165283,
      "learning_rate": 2.3516615185787494e-05,
      "loss": 2.0321,
      "step": 1720
    },
    {
      "epoch": 0.54,
      "grad_norm": 3.5830259323120117,
      "learning_rate": 2.3391071651264064e-05,
      "loss": 1.9936,
      "step": 1725
    },
    {
      "epoch": 0.54,
      "grad_norm": 5.528283596038818,
      "learning_rate": 2.3265568846303054e-05,
      "loss": 1.8955,
      "step": 1730
    },
    {
      "epoch": 0.54,
      "grad_norm": 3.968691110610962,
      "learning_rate": 2.3140109947972204e-05,
      "loss": 1.9137,
      "step": 1735
    },
    {
      "epoch": 0.54,
      "grad_norm": 3.56799054145813,
      "learning_rate": 2.3014698132227735e-05,
      "loss": 1.9854,
      "step": 1740
    },
    {
      "epoch": 0.54,
      "grad_norm": 4.353531360626221,
      "learning_rate": 2.2889336573834027e-05,
      "loss": 1.8967,
      "step": 1745
    },
    {
      "epoch": 0.54,
      "grad_norm": 3.8630661964416504,
      "learning_rate": 2.276402844628317e-05,
      "loss": 1.8833,
      "step": 1750
    },
    {
      "epoch": 0.54,
      "grad_norm": 3.5117268562316895,
      "learning_rate": 2.2638776921714696e-05,
      "loss": 1.8493,
      "step": 1755
    },
    {
      "epoch": 0.55,
      "grad_norm": 4.000200271606445,
      "learning_rate": 2.251358517083524e-05,
      "loss": 1.8717,
      "step": 1760
    },
    {
      "epoch": 0.55,
      "grad_norm": 3.0542423725128174,
      "learning_rate": 2.2388456362838283e-05,
      "loss": 1.9941,
      "step": 1765
    },
    {
      "epoch": 0.55,
      "grad_norm": 4.117686748504639,
      "learning_rate": 2.2263393665323907e-05,
      "loss": 2.0925,
      "step": 1770
    },
    {
      "epoch": 0.55,
      "grad_norm": 5.376316070556641,
      "learning_rate": 2.2138400244218665e-05,
      "loss": 2.0568,
      "step": 1775
    },
    {
      "epoch": 0.55,
      "grad_norm": 3.879211187362671,
      "learning_rate": 2.2013479263695368e-05,
      "loss": 1.9256,
      "step": 1780
    },
    {
      "epoch": 0.55,
      "grad_norm": 4.660920143127441,
      "learning_rate": 2.1888633886093017e-05,
      "loss": 2.092,
      "step": 1785
    },
    {
      "epoch": 0.56,
      "grad_norm": 3.143937587738037,
      "learning_rate": 2.176386727183676e-05,
      "loss": 1.7624,
      "step": 1790
    },
    {
      "epoch": 0.56,
      "grad_norm": 4.354220390319824,
      "learning_rate": 2.1639182579357846e-05,
      "loss": 1.8961,
      "step": 1795
    },
    {
      "epoch": 0.56,
      "grad_norm": 5.339317798614502,
      "learning_rate": 2.151458296501374e-05,
      "loss": 1.9361,
      "step": 1800
    },
    {
      "epoch": 0.56,
      "grad_norm": 3.080310344696045,
      "learning_rate": 2.139007158300814e-05,
      "loss": 1.8459,
      "step": 1805
    },
    {
      "epoch": 0.56,
      "grad_norm": 3.5018744468688965,
      "learning_rate": 2.126565158531119e-05,
      "loss": 1.9086,
      "step": 1810
    },
    {
      "epoch": 0.56,
      "grad_norm": 5.1605072021484375,
      "learning_rate": 2.1141326121579638e-05,
      "loss": 1.9395,
      "step": 1815
    },
    {
      "epoch": 0.56,
      "grad_norm": 4.0767998695373535,
      "learning_rate": 2.1017098339077176e-05,
      "loss": 2.005,
      "step": 1820
    },
    {
      "epoch": 0.57,
      "grad_norm": 4.308762073516846,
      "learning_rate": 2.0892971382594694e-05,
      "loss": 1.8772,
      "step": 1825
    },
    {
      "epoch": 0.57,
      "grad_norm": 3.049802541732788,
      "learning_rate": 2.0768948394370702e-05,
      "loss": 1.9591,
      "step": 1830
    },
    {
      "epoch": 0.57,
      "grad_norm": 3.853872060775757,
      "learning_rate": 2.0645032514011773e-05,
      "loss": 1.8408,
      "step": 1835
    },
    {
      "epoch": 0.57,
      "grad_norm": 3.8186545372009277,
      "learning_rate": 2.052122687841311e-05,
      "loss": 1.9765,
      "step": 1840
    },
    {
      "epoch": 0.57,
      "grad_norm": 3.238193988800049,
      "learning_rate": 2.0397534621679075e-05,
      "loss": 1.931,
      "step": 1845
    },
    {
      "epoch": 0.57,
      "grad_norm": 3.316253662109375,
      "learning_rate": 2.0273958875043874e-05,
      "loss": 1.9787,
      "step": 1850
    },
    {
      "epoch": 0.58,
      "grad_norm": 4.303181171417236,
      "learning_rate": 2.0150502766792298e-05,
      "loss": 1.9991,
      "step": 1855
    },
    {
      "epoch": 0.58,
      "grad_norm": 3.6812000274658203,
      "learning_rate": 2.0027169422180546e-05,
      "loss": 1.8782,
      "step": 1860
    },
    {
      "epoch": 0.58,
      "grad_norm": 5.033133506774902,
      "learning_rate": 1.990396196335706e-05,
      "loss": 1.8406,
      "step": 1865
    },
    {
      "epoch": 0.58,
      "grad_norm": 4.612210750579834,
      "learning_rate": 1.9780883509283526e-05,
      "loss": 2.0226,
      "step": 1870
    },
    {
      "epoch": 0.58,
      "grad_norm": 4.63312292098999,
      "learning_rate": 1.9657937175655922e-05,
      "loss": 1.9403,
      "step": 1875
    },
    {
      "epoch": 0.58,
      "grad_norm": 3.5263733863830566,
      "learning_rate": 1.9535126074825647e-05,
      "loss": 1.9812,
      "step": 1880
    },
    {
      "epoch": 0.58,
      "grad_norm": 3.100794792175293,
      "learning_rate": 1.941245331572068e-05,
      "loss": 1.8332,
      "step": 1885
    },
    {
      "epoch": 0.59,
      "grad_norm": 4.041380405426025,
      "learning_rate": 1.9289922003766962e-05,
      "loss": 1.9352,
      "step": 1890
    },
    {
      "epoch": 0.59,
      "grad_norm": 3.329756736755371,
      "learning_rate": 1.9167535240809703e-05,
      "loss": 1.9084,
      "step": 1895
    },
    {
      "epoch": 0.59,
      "grad_norm": 3.596053123474121,
      "learning_rate": 1.904529612503493e-05,
      "loss": 1.8971,
      "step": 1900
    },
    {
      "epoch": 0.59,
      "grad_norm": 3.9134511947631836,
      "learning_rate": 1.8923207750890992e-05,
      "loss": 2.0642,
      "step": 1905
    },
    {
      "epoch": 0.59,
      "grad_norm": 3.707994222640991,
      "learning_rate": 1.8801273209010284e-05,
      "loss": 1.8276,
      "step": 1910
    },
    {
      "epoch": 0.59,
      "grad_norm": 4.338993072509766,
      "learning_rate": 1.8679495586130952e-05,
      "loss": 1.9576,
      "step": 1915
    },
    {
      "epoch": 0.6,
      "grad_norm": 3.758429765701294,
      "learning_rate": 1.8557877965018817e-05,
      "loss": 1.9956,
      "step": 1920
    },
    {
      "epoch": 0.6,
      "grad_norm": 3.7816905975341797,
      "learning_rate": 1.843642342438928e-05,
      "loss": 1.9079,
      "step": 1925
    },
    {
      "epoch": 0.6,
      "grad_norm": 5.009194850921631,
      "learning_rate": 1.8315135038829406e-05,
      "loss": 1.9509,
      "step": 1930
    },
    {
      "epoch": 0.6,
      "grad_norm": 3.4465157985687256,
      "learning_rate": 1.8194015878720084e-05,
      "loss": 2.0019,
      "step": 1935
    },
    {
      "epoch": 0.6,
      "grad_norm": 3.6948273181915283,
      "learning_rate": 1.8073069010158334e-05,
      "loss": 2.0043,
      "step": 1940
    },
    {
      "epoch": 0.6,
      "grad_norm": 3.3850791454315186,
      "learning_rate": 1.795229749487965e-05,
      "loss": 1.9031,
      "step": 1945
    },
    {
      "epoch": 0.61,
      "grad_norm": 5.051716327667236,
      "learning_rate": 1.7831704390180498e-05,
      "loss": 1.8958,
      "step": 1950
    },
    {
      "epoch": 0.61,
      "grad_norm": 2.8910887241363525,
      "learning_rate": 1.7711292748840943e-05,
      "loss": 1.8856,
      "step": 1955
    },
    {
      "epoch": 0.61,
      "grad_norm": 3.8123810291290283,
      "learning_rate": 1.759106561904737e-05,
      "loss": 1.8229,
      "step": 1960
    },
    {
      "epoch": 0.61,
      "grad_norm": 4.154626369476318,
      "learning_rate": 1.747102604431528e-05,
      "loss": 1.9509,
      "step": 1965
    },
    {
      "epoch": 0.61,
      "grad_norm": 4.20812463760376,
      "learning_rate": 1.7351177063412276e-05,
      "loss": 1.9501,
      "step": 1970
    },
    {
      "epoch": 0.61,
      "grad_norm": 3.2041704654693604,
      "learning_rate": 1.723152171028114e-05,
      "loss": 1.9888,
      "step": 1975
    },
    {
      "epoch": 0.61,
      "grad_norm": 3.133105754852295,
      "learning_rate": 1.7112063013963044e-05,
      "loss": 2.0086,
      "step": 1980
    },
    {
      "epoch": 0.62,
      "grad_norm": 4.227274417877197,
      "learning_rate": 1.6992803998520794e-05,
      "loss": 1.9373,
      "step": 1985
    },
    {
      "epoch": 0.62,
      "grad_norm": 3.2231645584106445,
      "learning_rate": 1.6873747682962394e-05,
      "loss": 1.7439,
      "step": 1990
    },
    {
      "epoch": 0.62,
      "grad_norm": 2.90924334526062,
      "learning_rate": 1.67548970811645e-05,
      "loss": 1.8914,
      "step": 1995
    },
    {
      "epoch": 0.62,
      "grad_norm": 3.2363147735595703,
      "learning_rate": 1.6636255201796237e-05,
      "loss": 1.9674,
      "step": 2000
    },
    {
      "epoch": 0.62,
      "grad_norm": 4.925014019012451,
      "learning_rate": 1.6517825048242936e-05,
      "loss": 1.8693,
      "step": 2005
    },
    {
      "epoch": 0.62,
      "grad_norm": 3.2326242923736572,
      "learning_rate": 1.6399609618530183e-05,
      "loss": 1.8776,
      "step": 2010
    },
    {
      "epoch": 0.63,
      "grad_norm": 3.984081506729126,
      "learning_rate": 1.6281611905247855e-05,
      "loss": 1.881,
      "step": 2015
    },
    {
      "epoch": 0.63,
      "grad_norm": 3.8823959827423096,
      "learning_rate": 1.6163834895474445e-05,
      "loss": 1.9769,
      "step": 2020
    },
    {
      "epoch": 0.63,
      "grad_norm": 4.131060600280762,
      "learning_rate": 1.604628157070136e-05,
      "loss": 1.9811,
      "step": 2025
    },
    {
      "epoch": 0.63,
      "grad_norm": 4.516271591186523,
      "learning_rate": 1.5928954906757515e-05,
      "loss": 1.995,
"loss": 1.995, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 3.9269816875457764, |
|
"learning_rate": 1.5811857873733942e-05, |
|
"loss": 1.8224, |
|
"step": 2035 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 3.7068333625793457, |
|
"learning_rate": 1.5694993435908646e-05, |
|
"loss": 1.8799, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 4.0933756828308105, |
|
"learning_rate": 1.557836455167157e-05, |
|
"loss": 1.9251, |
|
"step": 2045 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 4.189598560333252, |
|
"learning_rate": 1.546197417344965e-05, |
|
"loss": 2.032, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 3.609545946121216, |
|
"learning_rate": 1.5345825247632135e-05, |
|
"loss": 1.9399, |
|
"step": 2055 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 3.9929699897766113, |
|
"learning_rate": 1.5229920714495948e-05, |
|
"loss": 1.8803, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 3.578582286834717, |
|
"learning_rate": 1.5114263508131327e-05, |
|
"loss": 1.8303, |
|
"step": 2065 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 3.167156457901001, |
|
"learning_rate": 1.499885655636746e-05, |
|
"loss": 2.0741, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 3.376950263977051, |
|
"learning_rate": 1.4883702780698433e-05, |
|
"loss": 1.8935, |
|
"step": 2075 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 7.022952556610107, |
|
"learning_rate": 1.4768805096209231e-05, |
|
"loss": 1.9285, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 4.465900897979736, |
|
"learning_rate": 1.4654166411502002e-05, |
|
"loss": 1.9464, |
|
"step": 2085 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 2.990349292755127, |
|
"learning_rate": 1.4539789628622347e-05, |
|
"loss": 1.8252, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 3.1683619022369385, |
|
"learning_rate": 1.4425677642985924e-05, |
|
"loss": 1.8346, |
|
"step": 2095 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 3.782841444015503, |
|
"learning_rate": 1.4311833343305097e-05, |
|
"loss": 1.8584, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 3.302788257598877, |
|
"learning_rate": 1.4198259611515886e-05, |
|
"loss": 1.9615, |
|
"step": 2105 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 4.179065227508545, |
|
"learning_rate": 1.4084959322704893e-05, |
|
"loss": 2.0387, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 3.3860225677490234, |
|
"learning_rate": 1.3971935345036657e-05, |
|
"loss": 1.7267, |
|
"step": 2115 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 4.326015472412109, |
|
"learning_rate": 1.3859190539680927e-05, |
|
"loss": 1.9828, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 3.4805123805999756, |
|
"learning_rate": 1.3746727760740328e-05, |
|
"loss": 1.8873, |
|
"step": 2125 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 2.8176207542419434, |
|
"learning_rate": 1.3634549855178028e-05, |
|
"loss": 2.0302, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 2.756837844848633, |
|
"learning_rate": 1.3522659662745723e-05, |
|
"loss": 1.9893, |
|
"step": 2135 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 4.258969783782959, |
|
"learning_rate": 1.3411060015911734e-05, |
|
"loss": 1.847, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 5.707541465759277, |
|
"learning_rate": 1.32997537397893e-05, |
|
"loss": 1.8802, |
|
"step": 2145 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 3.7876532077789307, |
|
"learning_rate": 1.3188743652065083e-05, |
|
"loss": 1.9015, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 3.91947340965271, |
|
"learning_rate": 1.3078032562927788e-05, |
|
"loss": 1.8293, |
|
"step": 2155 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 4.129434108734131, |
|
"learning_rate": 1.296762327499707e-05, |
|
"loss": 1.786, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 3.0605030059814453, |
|
"learning_rate": 1.2857518583252587e-05, |
|
"loss": 1.9754, |
|
"step": 2165 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 3.6712772846221924, |
|
"learning_rate": 1.2747721274963214e-05, |
|
"loss": 1.8931, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 3.6777453422546387, |
|
"learning_rate": 1.2638234129616488e-05, |
|
"loss": 1.9122, |
|
"step": 2175 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 3.1498284339904785, |
|
"learning_rate": 1.2529059918848296e-05, |
|
"loss": 1.8041, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 3.7665841579437256, |
|
"learning_rate": 1.2420201406372662e-05, |
|
"loss": 1.7802, |
|
"step": 2185 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 3.147603988647461, |
|
"learning_rate": 1.2311661347911783e-05, |
|
"loss": 1.9658, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 3.327116012573242, |
|
"learning_rate": 1.220344249112629e-05, |
|
"loss": 1.8795, |
|
"step": 2195 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 3.689382553100586, |
|
"learning_rate": 1.2095547575545686e-05, |
|
"loss": 1.942, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 3.967803955078125, |
|
"learning_rate": 1.1987979332499011e-05, |
|
"loss": 1.8653, |
|
"step": 2205 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 3.113976001739502, |
|
"learning_rate": 1.1880740485045649e-05, |
|
"loss": 1.8737, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 3.3383049964904785, |
|
"learning_rate": 1.1773833747906471e-05, |
|
"loss": 1.9163, |
|
"step": 2215 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 3.971327304840088, |
|
"learning_rate": 1.1667261827395035e-05, |
|
"loss": 2.0355, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 3.8071823120117188, |
|
"learning_rate": 1.1561027421349117e-05, |
|
"loss": 1.7467, |
|
"step": 2225 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 3.7409048080444336, |
|
"learning_rate": 1.145513321906243e-05, |
|
"loss": 1.847, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 5.195309162139893, |
|
"learning_rate": 1.1349581901216514e-05, |
|
"loss": 2.0805, |
|
"step": 2235 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 2.922433376312256, |
|
"learning_rate": 1.1244376139812867e-05, |
|
"loss": 1.7545, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 5.311805725097656, |
|
"learning_rate": 1.1139518598105358e-05, |
|
"loss": 1.9093, |
|
"step": 2245 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 3.9856057167053223, |
|
"learning_rate": 1.1035011930532771e-05, |
|
"loss": 1.8777, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 3.006605386734009, |
|
"learning_rate": 1.0930858782651585e-05, |
|
"loss": 1.9631, |
|
"step": 2255 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 3.3158912658691406, |
|
"learning_rate": 1.0827061791069045e-05, |
|
"loss": 1.8097, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 4.086146831512451, |
|
"learning_rate": 1.0723623583376392e-05, |
|
"loss": 1.9171, |
|
"step": 2265 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 4.822931289672852, |
|
"learning_rate": 1.062054677808238e-05, |
|
"loss": 2.1704, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 3.8096282482147217, |
|
"learning_rate": 1.0517833984546923e-05, |
|
"loss": 1.9599, |
|
"step": 2275 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 5.096799373626709, |
|
"learning_rate": 1.0415487802915133e-05, |
|
"loss": 1.9463, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 3.9913666248321533, |
|
"learning_rate": 1.0313510824051393e-05, |
|
"loss": 1.9045, |
|
"step": 2285 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 3.0718228816986084, |
|
"learning_rate": 1.0211905629473866e-05, |
|
"loss": 1.7678, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 5.186037540435791, |
|
"learning_rate": 1.0110674791289079e-05, |
|
"loss": 1.9355, |
|
"step": 2295 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 3.739786386489868, |
|
"learning_rate": 1.0009820872126835e-05, |
|
"loss": 2.015, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 3.730051040649414, |
|
"learning_rate": 9.909346425075335e-06, |
|
"loss": 1.9639, |
|
"step": 2305 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 4.366475582122803, |
|
"learning_rate": 9.809253993616569e-06, |
|
"loss": 2.1142, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 2.9198176860809326, |
|
"learning_rate": 9.709546111561913e-06, |
|
"loss": 1.8616, |
|
"step": 2315 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 3.5179014205932617, |
|
"learning_rate": 9.610225302987961e-06, |
|
"loss": 1.8651, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 3.9303548336029053, |
|
"learning_rate": 9.511294082172653e-06, |
|
"loss": 2.0002, |
|
"step": 2325 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 3.435821771621704, |
|
"learning_rate": 9.412754953531663e-06, |
|
"loss": 1.8817, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 4.4535932540893555, |
|
"learning_rate": 9.314610411554925e-06, |
|
"loss": 1.8213, |
|
"step": 2335 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 3.345769166946411, |
|
"learning_rate": 9.216862940743529e-06, |
|
"loss": 1.8374, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 4.314777851104736, |
|
"learning_rate": 9.119515015546836e-06, |
|
"loss": 2.0438, |
|
"step": 2345 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 4.599632263183594, |
|
"learning_rate": 9.02256910029983e-06, |
|
"loss": 1.8459, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 3.590637683868408, |
|
"learning_rate": 8.926027649160704e-06, |
|
"loss": 1.8009, |
|
"step": 2355 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 3.119189500808716, |
|
"learning_rate": 8.82989310604877e-06, |
|
"loss": 1.9651, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 3.1386303901672363, |
|
"learning_rate": 8.734167904582566e-06, |
|
"loss": 1.7791, |
|
"step": 2365 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 3.6528995037078857, |
|
"learning_rate": 8.638854468018296e-06, |
|
"loss": 1.9259, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 4.182424545288086, |
|
"learning_rate": 8.543955209188412e-06, |
|
"loss": 1.8853, |
|
"step": 2375 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 5.662861347198486, |
|
"learning_rate": 8.449472530440612e-06, |
|
"loss": 1.9349, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 4.169982433319092, |
|
"learning_rate": 8.355408823576951e-06, |
|
"loss": 1.9554, |
|
"step": 2385 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 3.808478832244873, |
|
"learning_rate": 8.261766469793373e-06, |
|
"loss": 1.8309, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 3.801201343536377, |
|
"learning_rate": 8.168547839619352e-06, |
|
"loss": 1.8714, |
|
"step": 2395 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 3.8212218284606934, |
|
"learning_rate": 8.075755292857933e-06, |
|
"loss": 1.844, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 4.7147650718688965, |
|
"learning_rate": 7.983391178525979e-06, |
|
"loss": 1.9004, |
|
"step": 2405 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 3.4768807888031006, |
|
"learning_rate": 7.89145783479471e-06, |
|
"loss": 1.947, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 3.307199478149414, |
|
"learning_rate": 7.799957588930523e-06, |
|
"loss": 1.9069, |
|
"step": 2415 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 4.613658905029297, |
|
"learning_rate": 7.708892757236047e-06, |
|
"loss": 1.917, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 2.8293955326080322, |
|
"learning_rate": 7.618265644991535e-06, |
|
"loss": 1.8854, |
|
"step": 2425 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 3.302823066711426, |
|
"learning_rate": 7.528078546396481e-06, |
|
"loss": 2.0073, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 2.862478494644165, |
|
"learning_rate": 7.438333744511591e-06, |
|
"loss": 1.9243, |
|
"step": 2435 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 4.1902899742126465, |
|
"learning_rate": 7.3490335112009225e-06, |
|
"loss": 1.8696, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 3.4848709106445312, |
|
"learning_rate": 7.260180107074438e-06, |
|
"loss": 2.0236, |
|
"step": 2445 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 2.9219446182250977, |
|
"learning_rate": 7.171775781430712e-06, |
|
"loss": 1.9218, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 3.458622694015503, |
|
"learning_rate": 7.083822772200058e-06, |
|
"loss": 1.9155, |
|
"step": 2455 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 3.5859556198120117, |
|
"learning_rate": 6.996323305887822e-06, |
|
"loss": 1.9701, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 3.7645373344421387, |
|
"learning_rate": 6.909279597518048e-06, |
|
"loss": 1.9555, |
|
"step": 2465 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 5.934003829956055, |
|
"learning_rate": 6.822693850577385e-06, |
|
"loss": 1.9963, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 4.152750015258789, |
|
"learning_rate": 6.7365682569593496e-06, |
|
"loss": 1.8777, |
|
"step": 2475 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 3.7498714923858643, |
|
"learning_rate": 6.6509049969087715e-06, |
|
"loss": 1.9313, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 2.86311411857605, |
|
"learning_rate": 6.565706238966671e-06, |
|
"loss": 1.7692, |
|
"step": 2485 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 4.296627521514893, |
|
"learning_rate": 6.480974139915297e-06, |
|
"loss": 1.942, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 3.102341890335083, |
|
"learning_rate": 6.396710844723597e-06, |
|
"loss": 1.9011, |
|
"step": 2495 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 4.467423439025879, |
|
"learning_rate": 6.312918486492855e-06, |
|
"loss": 1.8276, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 4.662038803100586, |
|
"learning_rate": 6.229599186402729e-06, |
|
"loss": 1.8927, |
|
"step": 2505 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 6.194324493408203, |
|
"learning_rate": 6.146755053657541e-06, |
|
"loss": 1.8046, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 3.2271151542663574, |
|
"learning_rate": 6.064388185432898e-06, |
|
"loss": 1.7897, |
|
"step": 2515 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 3.0152978897094727, |
|
"learning_rate": 5.9825006668225905e-06, |
|
"loss": 1.8203, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 3.5677027702331543, |
|
"learning_rate": 5.901094570785798e-06, |
|
"loss": 1.9312, |
|
"step": 2525 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 3.464501142501831, |
|
"learning_rate": 5.820171958094628e-06, |
|
"loss": 1.9227, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 4.184050559997559, |
|
"learning_rate": 5.73973487728196e-06, |
|
"loss": 1.8542, |
|
"step": 2535 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 3.7280945777893066, |
|
"learning_rate": 5.659785364589556e-06, |
|
"loss": 2.0387, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 3.863532543182373, |
|
"learning_rate": 5.580325443916526e-06, |
|
"loss": 1.8824, |
|
"step": 2545 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 3.403118133544922, |
|
"learning_rate": 5.501357126768117e-06, |
|
"loss": 1.8999, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 3.203178644180298, |
|
"learning_rate": 5.422882412204766e-06, |
|
"loss": 2.0521, |
|
"step": 2555 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 3.8374898433685303, |
|
"learning_rate": 5.344903286791494e-06, |
|
"loss": 1.8838, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 3.570945978164673, |
|
"learning_rate": 5.267421724547627e-06, |
|
"loss": 1.9615, |
|
"step": 2565 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 6.397089004516602, |
|
"learning_rate": 5.1904396868968195e-06, |
|
"loss": 1.9624, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 3.234090805053711, |
|
"learning_rate": 5.113959122617412e-06, |
|
"loss": 1.9239, |
|
"step": 2575 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 3.1682183742523193, |
|
"learning_rate": 5.037981967793076e-06, |
|
"loss": 1.8498, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 4.0839152336120605, |
|
"learning_rate": 4.9625101457638376e-06, |
|
"loss": 1.9856, |
|
"step": 2585 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 3.629542589187622, |
|
"learning_rate": 4.887545567077337e-06, |
|
"loss": 1.8867, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 4.0674638748168945, |
|
"learning_rate": 4.8130901294405255e-06, |
|
"loss": 2.0402, |
|
"step": 2595 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 3.093059539794922, |
|
"learning_rate": 4.739145717671572e-06, |
|
"loss": 1.9107, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 6.425740718841553, |
|
"learning_rate": 4.665714203652177e-06, |
|
"loss": 1.8893, |
|
"step": 2605 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 3.764960765838623, |
|
"learning_rate": 4.592797446280178e-06, |
|
"loss": 1.8649, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 3.2027156352996826, |
|
"learning_rate": 4.520397291422501e-06, |
|
"loss": 1.991, |
|
"step": 2615 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 4.535457134246826, |
|
"learning_rate": 4.448515571868434e-06, |
|
"loss": 1.8798, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 3.6848881244659424, |
|
"learning_rate": 4.3771541072832045e-06, |
|
"loss": 1.9349, |
|
"step": 2625 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 3.817534923553467, |
|
"learning_rate": 4.306314704161937e-06, |
|
"loss": 1.8637, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 3.4655098915100098, |
|
"learning_rate": 4.23599915578394e-06, |
|
"loss": 1.8615, |
|
"step": 2635 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 2.829066276550293, |
|
"learning_rate": 4.16620924216726e-06, |
|
"loss": 1.7928, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 4.525213241577148, |
|
"learning_rate": 4.096946730023662e-06, |
|
"loss": 1.903, |
|
"step": 2645 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 3.8306119441986084, |
|
"learning_rate": 4.028213372713904e-06, |
|
"loss": 1.9473, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 4.448178768157959, |
|
"learning_rate": 3.960010910203319e-06, |
|
"loss": 1.959, |
|
"step": 2655 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 3.6487441062927246, |
|
"learning_rate": 3.892341069017808e-06, |
|
"loss": 1.9932, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 3.487689256668091, |
|
"learning_rate": 3.825205562200101e-06, |
|
"loss": 1.9578, |
|
"step": 2665 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 3.0234782695770264, |
|
"learning_rate": 3.75860608926642e-06, |
|
"loss": 1.9083, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 3.328275203704834, |
|
"learning_rate": 3.69254433616342e-06, |
|
"loss": 2.0128, |
|
"step": 2675 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 2.9996497631073, |
|
"learning_rate": 3.627021975225553e-06, |
|
"loss": 1.633, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 3.9526045322418213, |
|
"learning_rate": 3.562040665132715e-06, |
|
"loss": 1.8948, |
|
"step": 2685 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 4.027220249176025, |
|
"learning_rate": 3.4976020508682344e-06, |
|
"loss": 1.8918, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 4.6429829597473145, |
|
"learning_rate": 3.4337077636772547e-06, |
|
"loss": 1.8865, |
|
"step": 2695 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 4.5367865562438965, |
|
"learning_rate": 3.3703594210254487e-06, |
|
"loss": 1.895, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 3.4687774181365967, |
|
"learning_rate": 3.3075586265580494e-06, |
|
"loss": 1.8908, |
|
"step": 2705 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 4.654914855957031, |
|
"learning_rate": 3.24530697005925e-06, |
|
"loss": 1.7785, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 4.516482353210449, |
|
"learning_rate": 3.183606027411998e-06, |
|
"loss": 1.7936, |
|
"step": 2715 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 4.209545135498047, |
|
"learning_rate": 3.1224573605580648e-06, |
|
"loss": 1.9851, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 4.1666178703308105, |
|
"learning_rate": 3.061862517458519e-06, |
|
"loss": 1.858, |
|
"step": 2725 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 5.190033912658691, |
|
"learning_rate": 3.001823032054532e-06, |
|
"loss": 1.9802, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 4.3511528968811035, |
|
"learning_rate": 2.942340424228554e-06, |
|
"loss": 1.9403, |
|
"step": 2735 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 4.630067348480225, |
|
"learning_rate": 2.8834161997658565e-06, |
|
"loss": 1.7726, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 3.705087184906006, |
|
"learning_rate": 2.825051850316371e-06, |
|
"loss": 1.8286, |
|
"step": 2745 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 3.315842628479004, |
|
"learning_rate": 2.767248853356971e-06, |
|
"loss": 1.8397, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 5.60033655166626, |
|
"learning_rate": 2.710008672154035e-06, |
|
"loss": 1.994, |
|
"step": 2755 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 4.465238571166992, |
|
"learning_rate": 2.65333275572644e-06, |
|
"loss": 1.9824, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 3.8040528297424316, |
|
"learning_rate": 2.5972225388088497e-06, |
|
"loss": 1.8507, |
|
"step": 2765 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 3.2600059509277344, |
|
"learning_rate": 2.5416794418154035e-06, |
|
"loss": 1.992, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 4.9075703620910645, |
|
"learning_rate": 2.486704870803763e-06, |
|
"loss": 1.8189, |
|
"step": 2775 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 4.047214508056641, |
|
"learning_rate": 2.432300217439526e-06, |
|
"loss": 1.9156, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 4.082090854644775, |
|
"learning_rate": 2.3784668589609814e-06, |
|
"loss": 1.8582, |
|
"step": 2785 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 3.8980605602264404, |
|
"learning_rate": 2.3252061581442496e-06, |
|
"loss": 1.8418, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 4.5113372802734375, |
|
"learning_rate": 2.2725194632687795e-06, |
|
"loss": 1.8942, |
|
"step": 2795 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 4.78348445892334, |
|
"learning_rate": 2.220408108083244e-06, |
|
"loss": 1.868, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 3.327033281326294, |
|
"learning_rate": 2.1688734117717295e-06, |
|
"loss": 1.9177, |
|
"step": 2805 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 3.6453311443328857, |
|
"learning_rate": 2.117916678920384e-06, |
|
"loss": 1.8282, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 3.0697853565216064, |
|
"learning_rate": 2.0675391994843695e-06, |
|
"loss": 1.8374, |
|
"step": 2815 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 3.6173019409179688, |
|
"learning_rate": 2.017742248755225e-06, |
|
"loss": 1.9797, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 3.858684539794922, |
|
"learning_rate": 1.9685270873285505e-06, |
|
"loss": 1.9083, |
|
"step": 2825 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 3.6615593433380127, |
|
"learning_rate": 1.9198949610721273e-06, |
|
"loss": 2.0119, |
|
"step": 2830 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 4.125614643096924, |
|
"learning_rate": 1.8718471010943623e-06, |
|
"loss": 1.8927, |
|
"step": 2835 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 3.79669451713562, |
|
"learning_rate": 1.8243847237131406e-06, |
|
"loss": 1.8407, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 3.5093576908111572, |
|
"learning_rate": 1.7775090304250065e-06, |
|
"loss": 1.9293, |
|
"step": 2845 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 3.6266543865203857, |
|
"learning_rate": 1.7312212078747781e-06, |
|
"loss": 1.6496, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 4.086301326751709, |
|
"learning_rate": 1.6855224278254812e-06, |
|
"loss": 1.9496, |
|
"step": 2855 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 3.14742374420166, |
|
"learning_rate": 1.6404138471286966e-06, |
|
"loss": 1.8646, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 2.868939161300659, |
|
"learning_rate": 1.5958966076952992e-06, |
|
"loss": 1.9593, |
|
"step": 2865 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 3.424562931060791, |
|
"learning_rate": 1.5519718364665009e-06, |
|
"loss": 1.7344, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 3.9741764068603516, |
|
"learning_rate": 1.5086406453853646e-06, |
|
"loss": 1.7876, |
|
"step": 2875 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 4.209314346313477, |
|
"learning_rate": 1.4659041313686366e-06, |
|
"loss": 2.1263, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 4.095180034637451, |
|
"learning_rate": 1.4237633762789942e-06, |
|
"loss": 1.7563, |
|
"step": 2885 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 4.4438066482543945, |
|
"learning_rate": 1.3822194468976284e-06, |
|
"loss": 1.8099, |
|
"step": 2890 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 4.844168663024902, |
|
"learning_rate": 1.3412733948972688e-06, |
|
"loss": 1.8867, |
|
"step": 2895 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 3.2806739807128906, |
|
"learning_rate": 1.300926256815546e-06, |
|
"loss": 1.9385, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 3.7914087772369385, |
|
"learning_rate": 1.2611790540287633e-06, |
|
"loss": 1.7425, |
|
"step": 2905 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 4.138453960418701, |
|
"learning_rate": 1.2220327927260161e-06, |
|
"loss": 1.9172, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 3.3346848487854004, |
|
"learning_rate": 1.1834884638837613e-06, |
|
"loss": 1.9754, |
|
"step": 2915 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 3.6204893589019775, |
|
"learning_rate": 1.1455470432406829e-06, |
|
"loss": 1.7101, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 4.972575664520264, |
|
"learning_rate": 1.108209491273035e-06, |
|
"loss": 1.8861, |
|
"step": 2925 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 3.620809316635132, |
|
"learning_rate": 1.0714767531702973e-06, |
|
"loss": 1.8525, |
|
"step": 2930 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 3.33205509185791, |
|
"learning_rate": 1.035349758811263e-06, |
|
"loss": 1.8453, |
|
"step": 2935 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 3.7018685340881348, |
|
"learning_rate": 9.998294227404863e-07, |
|
"loss": 2.0806, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 4.9941864013671875, |
|
"learning_rate": 9.649166441451557e-07, |
|
"loss": 1.94, |
|
"step": 2945 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 4.217085361480713, |
|
"learning_rate": 9.306123068323097e-07, |
|
"loss": 1.9168, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 3.2208547592163086, |
|
"learning_rate": 8.969172792064634e-07, |
|
"loss": 1.8819, |
|
"step": 2955 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 3.9018375873565674, |
|
"learning_rate": 8.638324142476284e-07, |
|
"loss": 1.9311, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 3.776543140411377, |
|
"learning_rate": 8.313585494897385e-07, |
|
"loss": 1.762, |
|
"step": 2965 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 6.1161603927612305, |
|
"learning_rate": 7.994965069994142e-07, |
|
"loss": 1.8604, |
|
"step": 2970 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 3.6044158935546875, |
|
"learning_rate": 7.682470933551761e-07, |
|
"loss": 1.7736, |
|
"step": 2975 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 4.38954496383667, |
|
"learning_rate": 7.376110996270281e-07, |
|
"loss": 1.9429, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 4.361955165863037, |
|
"learning_rate": 7.075893013564123e-07, |
|
"loss": 1.8157, |
|
"step": 2985 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 3.799809217453003, |
|
"learning_rate": 6.781824585365915e-07, |
|
"loss": 1.9094, |
|
"step": 2990 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 4.269566059112549, |
|
"learning_rate": 6.493913155934117e-07, |
|
"loss": 1.9207, |
|
"step": 2995 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 4.451285362243652, |
|
"learning_rate": 6.212166013664422e-07, |
|
"loss": 1.6652, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 3.91097092628479, |
|
"learning_rate": 5.93659029090543e-07, |
|
"loss": 1.9185, |
|
"step": 3005 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 3.952296257019043, |
|
"learning_rate": 5.667192963778017e-07, |
|
"loss": 1.7982, |
|
"step": 3010 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 3.8603575229644775, |
|
"learning_rate": 5.403980851998669e-07, |
|
"loss": 1.8665, |
|
"step": 3015 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 4.040564060211182, |
|
"learning_rate": 5.146960618706981e-07, |
|
"loss": 1.8744, |
|
"step": 3020 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 3.266788959503174, |
|
"learning_rate": 4.896138770296876e-07, |
|
"loss": 1.8463, |
|
"step": 3025 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 3.374309539794922, |
|
"learning_rate": 4.6515216562519615e-07, |
|
"loss": 1.8195, |
|
"step": 3030 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 3.7271621227264404, |
|
"learning_rate": 4.41311546898468e-07, |
|
"loss": 1.788, |
|
"step": 3035 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 3.1484320163726807, |
|
"learning_rate": 4.180926243679689e-07, |
|
"loss": 1.8316, |
|
"step": 3040 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 3.443974256515503, |
|
"learning_rate": 3.954959858141066e-07, |
|
"loss": 1.9071, |
|
"step": 3045 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 3.8171606063842773, |
|
"learning_rate": 3.735222032643426e-07, |
|
"loss": 2.1321, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 3.141526699066162, |
|
"learning_rate": 3.521718329787177e-07, |
|
"loss": 1.8597, |
|
"step": 3055 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 3.848994255065918, |
|
"learning_rate": 3.314454154357688e-07, |
|
"loss": 1.9906, |
|
"step": 3060 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 3.9238314628601074, |
|
"learning_rate": 3.1134347531884267e-07, |
|
"loss": 1.9433, |
|
"step": 3065 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 4.169834136962891, |
|
"learning_rate": 2.9186652150282603e-07, |
|
"loss": 1.7679, |
|
"step": 3070 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 6.12147331237793, |
|
"learning_rate": 2.7301504704125016e-07, |
|
"loss": 1.6556, |
|
"step": 3075 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 3.5053157806396484, |
|
"learning_rate": 2.547895291538177e-07, |
|
"loss": 1.9142, |
|
"step": 3080 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 4.274362087249756, |
|
"learning_rate": 2.371904292143151e-07, |
|
"loss": 1.8754, |
|
"step": 3085 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 3.843151569366455, |
|
"learning_rate": 2.2021819273894127e-07, |
|
"loss": 1.7239, |
|
"step": 3090 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 3.5693886280059814, |
|
"learning_rate": 2.0387324937502505e-07, |
|
"loss": 1.8063, |
|
"step": 3095 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 4.155526161193848, |
|
"learning_rate": 1.8815601289014496e-07, |
|
"loss": 1.8008, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 4.957355499267578, |
|
"learning_rate": 1.730668811616598e-07, |
|
"loss": 1.9108, |
|
"step": 3105 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 5.035935878753662, |
|
"learning_rate": 1.5860623616664184e-07, |
|
"loss": 2.0325, |
|
"step": 3110 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 4.176791667938232, |
|
"learning_rate": 1.4477444397219542e-07, |
|
"loss": 1.8947, |
|
"step": 3115 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 3.648829460144043, |
|
"learning_rate": 1.3157185472619516e-07, |
|
"loss": 1.8535, |
|
"step": 3120 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 3.8320178985595703, |
|
"learning_rate": 1.1899880264842068e-07, |
|
"loss": 1.8678, |
|
"step": 3125 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 3.046886682510376, |
|
"learning_rate": 1.0705560602210784e-07, |
|
"loss": 1.8263, |
|
"step": 3130 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 5.341119766235352, |
|
"learning_rate": 9.574256718586639e-08, |
|
"loss": 1.9319, |
|
"step": 3135 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 3.0084095001220703, |
|
"learning_rate": 8.505997252605258e-08, |
|
"loss": 1.7669, |
|
"step": 3140 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 3.5134646892547607, |
|
"learning_rate": 7.500809246950569e-08, |
|
"loss": 1.824, |
|
"step": 3145 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 3.576869249343872, |
|
"learning_rate": 6.558718147670339e-08, |
|
"loss": 1.8971, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 3.1408050060272217, |
|
"learning_rate": 5.679747803531699e-08, |
|
"loss": 1.9365, |
|
"step": 3155 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 4.063467979431152, |
|
"learning_rate": 4.863920465418836e-08, |
|
"loss": 1.8272, |
|
"step": 3160 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 3.66452693939209, |
|
"learning_rate": 4.111256785767903e-08, |
|
"loss": 1.7885, |
|
"step": 3165 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 3.7975409030914307, |
|
"learning_rate": 3.421775818045481e-08, |
|
"loss": 1.879, |
|
"step": 3170 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 4.497860908508301, |
|
"learning_rate": 2.7954950162656367e-08, |
|
"loss": 1.828, |
|
"step": 3175 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 3.815382242202759, |
|
"learning_rate": 2.2324302345483327e-08, |
|
"loss": 1.9715, |
|
"step": 3180 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 5.165794849395752, |
|
"learning_rate": 1.7325957267180782e-08, |
|
"loss": 1.8856, |
|
"step": 3185 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 4.661296367645264, |
|
"learning_rate": 1.2960041459425532e-08, |
|
"loss": 1.9542, |
|
"step": 3190 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 4.152047157287598, |
|
"learning_rate": 9.226665444136973e-09, |
|
"loss": 1.9453, |
|
"step": 3195 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 3.161618232727051, |
|
"learning_rate": 6.1259237306599e-09, |
|
"loss": 1.7805, |
|
"step": 3200 |
|
} |
  ],
  "logging_steps": 5,
  "max_steps": 3222,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 200,
  "total_flos": 4.797270917531566e+17,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}
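
The object above appears to follow the Hugging Face Trainer trainer_state.json layout (a log_history array of per-step records plus run-level metadata). A minimal sketch for inspecting the run follows; the filename "trainer_state.json" and the use of matplotlib are assumptions for illustration, not part of the original log.

# Sketch only: assumes this log is saved as "trainer_state.json" (hypothetical
# name) and that matplotlib is installed; neither comes from the log itself.
import json

import matplotlib.pyplot as plt

with open("trainer_state.json") as f:
    state = json.load(f)

history = state["log_history"]
steps = [entry["step"] for entry in history]
losses = [entry["loss"] for entry in history]
lrs = [entry["learning_rate"] for entry in history]

fig, loss_ax = plt.subplots()
loss_ax.plot(steps, losses, color="tab:blue")
loss_ax.set_xlabel("step")
loss_ax.set_ylabel("training loss", color="tab:blue")

# Second y-axis so the decaying learning-rate schedule stays readable
# against the roughly flat late-training loss.
lr_ax = loss_ax.twinx()
lr_ax.plot(steps, lrs, color="tab:orange")
lr_ax.set_ylabel("learning rate", color="tab:orange")

fig.tight_layout()
plt.savefig("training_curves.png")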