{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.9984,
  "eval_steps": 500,
  "global_step": 2811,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    { "epoch": 0.01, "grad_norm": 8.1875, "learning_rate": 1.25e-05, "loss": 1.5548, "step": 5 },
    { "epoch": 0.01, "grad_norm": 7.59375, "learning_rate": 2.5e-05, "loss": 1.5637, "step": 10 },
    { "epoch": 0.02, "grad_norm": 7.90625, "learning_rate": 3.7500000000000003e-05, "loss": 1.5441, "step": 15 },
    { "epoch": 0.02, "grad_norm": 7.5, "learning_rate": 5e-05, "loss": 1.5212, "step": 20 },
    { "epoch": 0.03, "grad_norm": 6.90625, "learning_rate": 4.999960406039296e-05, "loss": 1.3853, "step": 25 },
    { "epoch": 0.03, "grad_norm": 6.59375, "learning_rate": 4.9998416254113275e-05, "loss": 1.3669, "step": 30 },
    { "epoch": 0.04, "grad_norm": 4.6875, "learning_rate": 4.9996436618784923e-05, "loss": 1.339, "step": 35 },
    { "epoch": 0.04, "grad_norm": 5.125, "learning_rate": 4.9993665217113186e-05, "loss": 1.4381, "step": 40 },
    { "epoch": 0.05, "grad_norm": 6.4375, "learning_rate": 4.9990102136882665e-05, "loss": 1.3781, "step": 45 },
    { "epoch": 0.05, "grad_norm": 4.875, "learning_rate": 4.998574749095454e-05, "loss": 1.3353, "step": 50 },
    { "epoch": 0.06, "grad_norm": 5.90625, "learning_rate": 4.998060141726295e-05, "loss": 1.326, "step": 55 },
    { "epoch": 0.06, "grad_norm": 5.125, "learning_rate": 4.997466407881065e-05, "loss": 1.314, "step": 60 },
    { "epoch": 0.07, "grad_norm": 5.21875, "learning_rate": 4.996793566366384e-05, "loss": 1.2821, "step": 65 },
    { "epoch": 0.07, "grad_norm": 5.46875, "learning_rate": 4.99604163849462e-05, "loss": 1.3627, "step": 70 },
    { "epoch": 0.08, "grad_norm": 6.25, "learning_rate": 4.995210648083214e-05, "loss": 1.3492, "step": 75 },
    { "epoch": 0.09, "grad_norm": 4.59375, "learning_rate": 4.994300621453928e-05, "loss": 1.3351, "step": 80 },
    { "epoch": 0.09, "grad_norm": 5.53125, "learning_rate": 4.99331158743201e-05, "loss": 1.3716, "step": 85 },
    { "epoch": 0.1, "grad_norm": 4.78125, "learning_rate": 4.992243577345278e-05, "loss": 1.3064, "step": 90 },
    { "epoch": 0.1, "grad_norm": 6.0625, "learning_rate": 4.9910966250231324e-05, "loss": 1.3729, "step": 95 },
    { "epoch": 0.11, "grad_norm": 5.59375, "learning_rate": 4.989870766795481e-05, "loss": 1.3587, "step": 100 },
    { "epoch": 0.11, "grad_norm": 6.09375, "learning_rate": 4.988566041491589e-05, "loss": 1.3231, "step": 105 },
    { "epoch": 0.12, "grad_norm": 6.09375, "learning_rate": 4.987182490438852e-05, "loss": 1.2753, "step": 110 },
    { "epoch": 0.12, "grad_norm": 6.09375, "learning_rate": 4.985720157461482e-05, "loss": 1.2592, "step": 115 },
    { "epoch": 0.13, "grad_norm": 5.1875, "learning_rate": 4.984179088879122e-05, "loss": 1.3175, "step": 120 },
    { "epoch": 0.13, "grad_norm": 6.59375, "learning_rate": 4.9825593335053796e-05, "loss": 1.4169, "step": 125 },
    { "epoch": 0.14, "grad_norm": 5.46875, "learning_rate": 4.9808609426462794e-05, "loss": 1.3497, "step": 130 },
    { "epoch": 0.14, "grad_norm": 6.6875, "learning_rate": 4.979083970098638e-05, "loss": 1.4249, "step": 135 },
    { "epoch": 0.15, "grad_norm": 5.40625, "learning_rate": 4.977228472148361e-05, "loss": 1.3558, "step": 140 },
    { "epoch": 0.15, "grad_norm": 6.53125, "learning_rate": 4.9752945075686574e-05, "loss": 1.371, "step": 145 },
    { "epoch": 0.16, "grad_norm": 5.90625, "learning_rate": 4.973282137618182e-05, "loss": 1.2899, "step": 150 },
    { "epoch": 0.17, "grad_norm": 7.96875, "learning_rate": 4.9711914260390924e-05, "loss": 1.3947, "step": 155 },
    { "epoch": 0.17, "grad_norm": 5.625, "learning_rate": 4.96902243905503e-05, "loss": 1.2762, "step": 160 },
    { "epoch": 0.18, "grad_norm": 9.125, "learning_rate": 4.966775245369024e-05, "loss": 1.392, "step": 165 },
    { "epoch": 0.18, "grad_norm": 6.1875, "learning_rate": 4.964449916161312e-05, "loss": 1.3061, "step": 170 },
    { "epoch": 0.19, "grad_norm": 5.53125, "learning_rate": 4.9620465250870886e-05, "loss": 1.2861, "step": 175 },
    { "epoch": 0.19, "grad_norm": 5.71875, "learning_rate": 4.959565148274172e-05, "loss": 1.3093, "step": 180 },
    { "epoch": 0.2, "grad_norm": 6.5, "learning_rate": 4.95700586432059e-05, "loss": 1.2894, "step": 185 },
    { "epoch": 0.2, "grad_norm": 6.3125, "learning_rate": 4.954368754292095e-05, "loss": 1.2801, "step": 190 },
    { "epoch": 0.21, "grad_norm": 4.75, "learning_rate": 4.951653901719589e-05, "loss": 1.3643, "step": 195 },
    { "epoch": 0.21, "grad_norm": 5.125, "learning_rate": 4.9488613925964886e-05, "loss": 1.2965, "step": 200 },
    { "epoch": 0.22, "grad_norm": 5.125, "learning_rate": 4.945991315375987e-05, "loss": 1.3055, "step": 205 },
    { "epoch": 0.22, "grad_norm": 4.53125, "learning_rate": 4.943043760968267e-05, "loss": 1.2909, "step": 210 },
    { "epoch": 0.23, "grad_norm": 6.8125, "learning_rate": 4.940018822737609e-05, "loss": 1.3486, "step": 215 },
    { "epoch": 0.23, "grad_norm": 6.625, "learning_rate": 4.9369165964994434e-05, "loss": 1.3847, "step": 220 },
    { "epoch": 0.24, "grad_norm": 5.125, "learning_rate": 4.9337371805173074e-05, "loss": 1.3065, "step": 225 },
    { "epoch": 0.25, "grad_norm": 5.75, "learning_rate": 4.93048067549974e-05, "loss": 1.309, "step": 230 },
    { "epoch": 0.25, "grad_norm": 5.21875, "learning_rate": 4.9271471845970854e-05, "loss": 1.2953, "step": 235 },
    { "epoch": 0.26, "grad_norm": 5.59375, "learning_rate": 4.9237368133982286e-05, "loss": 1.3291, "step": 240 },
    { "epoch": 0.26, "grad_norm": 6.25, "learning_rate": 4.920249669927255e-05, "loss": 1.2906, "step": 245 },
    { "epoch": 0.27, "grad_norm": 5.46875, "learning_rate": 4.916685864640021e-05, "loss": 1.3583, "step": 250 },
    { "epoch": 0.27, "grad_norm": 5.5625, "learning_rate": 4.913045510420658e-05, "loss": 1.2911, "step": 255 },
    { "epoch": 0.28, "grad_norm": 4.46875, "learning_rate": 4.9093287225780004e-05, "loss": 1.2865, "step": 260 },
    { "epoch": 0.28, "grad_norm": 4.9375, "learning_rate": 4.905535618841931e-05, "loss": 1.3438, "step": 265 },
    { "epoch": 0.29, "grad_norm": 6.53125, "learning_rate": 4.901666319359649e-05, "loss": 1.3012, "step": 270 },
    { "epoch": 0.29, "grad_norm": 5.84375, "learning_rate": 4.897720946691867e-05, "loss": 1.313, "step": 275 },
    { "epoch": 0.3, "grad_norm": 4.71875, "learning_rate": 4.893699625808931e-05, "loss": 1.3882, "step": 280 },
    { "epoch": 0.3, "grad_norm": 6.40625, "learning_rate": 4.889602484086857e-05, "loss": 1.2409, "step": 285 },
    { "epoch": 0.31, "grad_norm": 5.46875, "learning_rate": 4.8854296513032996e-05, "loss": 1.3014, "step": 290 },
    { "epoch": 0.31, "grad_norm": 5.125, "learning_rate": 4.8811812596334395e-05, "loss": 1.3349, "step": 295 },
    { "epoch": 0.32, "grad_norm": 6.34375, "learning_rate": 4.8768574436458e-05, "loss": 1.2216, "step": 300 },
    { "epoch": 0.33, "grad_norm": 5.5, "learning_rate": 4.8724583402979826e-05, "loss": 1.27, "step": 305 },
    { "epoch": 0.33, "grad_norm": 5.40625, "learning_rate": 4.867984088932326e-05, "loss": 1.2596, "step": 310 },
    { "epoch": 0.34, "grad_norm": 6.21875, "learning_rate": 4.8634348312714956e-05, "loss": 1.3084, "step": 315 },
    { "epoch": 0.34, "grad_norm": 5.625, "learning_rate": 4.8588107114139955e-05, "loss": 1.3364, "step": 320 },
    { "epoch": 0.35, "grad_norm": 5.40625, "learning_rate": 4.8541118758296025e-05, "loss": 1.1912, "step": 325 },
    { "epoch": 0.35, "grad_norm": 5.03125, "learning_rate": 4.849338473354725e-05, "loss": 1.3021, "step": 330 },
    { "epoch": 0.36, "grad_norm": 6.0, "learning_rate": 4.8444906551876914e-05, "loss": 1.2595, "step": 335 },
    { "epoch": 0.36, "grad_norm": 5.25, "learning_rate": 4.839568574883958e-05, "loss": 1.2704, "step": 340 },
    { "epoch": 0.37, "grad_norm": 5.25, "learning_rate": 4.83457238835125e-05, "loss": 1.4043, "step": 345 },
    { "epoch": 0.37, "grad_norm": 6.5625, "learning_rate": 4.8295022538446164e-05, "loss": 1.3419, "step": 350 },
    { "epoch": 0.38, "grad_norm": 5.75, "learning_rate": 4.824358331961423e-05, "loss": 1.2798, "step": 355 },
    { "epoch": 0.38, "grad_norm": 5.40625, "learning_rate": 4.819140785636262e-05, "loss": 1.3005, "step": 360 },
    { "epoch": 0.39, "grad_norm": 4.9375, "learning_rate": 4.813849780135795e-05, "loss": 1.3454, "step": 365 },
    { "epoch": 0.39, "grad_norm": 5.40625, "learning_rate": 4.8084854830535095e-05, "loss": 1.3375, "step": 370 },
    { "epoch": 0.4, "grad_norm": 6.09375, "learning_rate": 4.803048064304423e-05, "loss": 1.3316, "step": 375 },
    { "epoch": 0.41, "grad_norm": 4.875, "learning_rate": 4.797537696119689e-05, "loss": 1.2503, "step": 380 },
    { "epoch": 0.41, "grad_norm": 6.375, "learning_rate": 4.7919545530411494e-05, "loss": 1.2164, "step": 385 },
    { "epoch": 0.42, "grad_norm": 7.28125, "learning_rate": 4.786298811915801e-05, "loss": 1.2928, "step": 390 },
    { "epoch": 0.42, "grad_norm": 5.8125, "learning_rate": 4.7805706518902e-05, "loss": 1.2657, "step": 395 },
    { "epoch": 0.43, "grad_norm": 5.375, "learning_rate": 4.774770254404779e-05, "loss": 1.2557, "step": 400 },
    { "epoch": 0.43, "grad_norm": 7.90625, "learning_rate": 4.768897803188105e-05, "loss": 1.3147, "step": 405 },
    { "epoch": 0.44, "grad_norm": 5.8125, "learning_rate": 4.7629534842510626e-05, "loss": 1.289, "step": 410 },
    { "epoch": 0.44, "grad_norm": 5.96875, "learning_rate": 4.7569374858809546e-05, "loss": 1.2597, "step": 415 },
    { "epoch": 0.45, "grad_norm": 6.53125, "learning_rate": 4.750849998635544e-05, "loss": 1.247, "step": 420 },
    { "epoch": 0.45, "grad_norm": 8.0625, "learning_rate": 4.744691215337015e-05, "loss": 1.3946, "step": 425 },
    { "epoch": 0.46, "grad_norm": 8.8125, "learning_rate": 4.738461331065867e-05, "loss": 1.3241, "step": 430 },
    { "epoch": 0.46, "grad_norm": 5.9375, "learning_rate": 4.732160543154735e-05, "loss": 1.2764, "step": 435 },
    { "epoch": 0.47, "grad_norm": 10.125, "learning_rate": 4.725789051182136e-05, "loss": 1.2941, "step": 440 },
    { "epoch": 0.47, "grad_norm": 6.03125, "learning_rate": 4.719347056966155e-05, "loss": 1.3391, "step": 445 },
    { "epoch": 0.48, "grad_norm": 7.40625, "learning_rate": 4.7128347645580434e-05, "loss": 1.3739, "step": 450 },
    { "epoch": 0.49, "grad_norm": 6.34375, "learning_rate": 4.7062523802357606e-05, "loss": 1.2787, "step": 455 },
    { "epoch": 0.49, "grad_norm": 6.4375, "learning_rate": 4.699600112497441e-05, "loss": 1.2488, "step": 460 },
    { "epoch": 0.5, "grad_norm": 5.3125, "learning_rate": 4.692878172054784e-05, "loss": 1.217, "step": 465 },
    { "epoch": 0.5, "grad_norm": 5.65625, "learning_rate": 4.686086771826389e-05, "loss": 1.2612, "step": 470 },
    { "epoch": 0.51, "grad_norm": 5.59375, "learning_rate": 4.679226126931001e-05, "loss": 1.2482, "step": 475 },
    { "epoch": 0.51, "grad_norm": 4.625, "learning_rate": 4.672296454680705e-05, "loss": 1.2179, "step": 480 },
    { "epoch": 0.52, "grad_norm": 7.28125, "learning_rate": 4.665297974574037e-05, "loss": 1.2427, "step": 485 },
    { "epoch": 0.52, "grad_norm": 5.34375, "learning_rate": 4.6582309082890327e-05, "loss": 1.3199, "step": 490 },
    { "epoch": 0.53, "grad_norm": 5.46875, "learning_rate": 4.6510954796762094e-05, "loss": 1.2884, "step": 495 },
    { "epoch": 0.53, "grad_norm": 5.8125, "learning_rate": 4.643891914751472e-05, "loss": 1.3176, "step": 500 },
    { "epoch": 0.54, "grad_norm": 5.40625, "learning_rate": 4.6366204416889526e-05, "loss": 1.361, "step": 505 },
    { "epoch": 0.54, "grad_norm": 6.6875, "learning_rate": 4.629281290813785e-05, "loss": 1.3099, "step": 510 },
    { "epoch": 0.55, "grad_norm": 4.96875, "learning_rate": 4.6218746945948124e-05, "loss": 1.2626, "step": 515 },
    { "epoch": 0.55, "grad_norm": 5.71875, "learning_rate": 4.614400887637218e-05, "loss": 1.2902, "step": 520 },
    { "epoch": 0.56, "grad_norm": 4.6875, "learning_rate": 4.6068601066750964e-05, "loss": 1.242, "step": 525 },
    { "epoch": 0.57, "grad_norm": 5.53125, "learning_rate": 4.599252590563956e-05, "loss": 1.2424, "step": 530 },
    { "epoch": 0.57, "grad_norm": 4.9375, "learning_rate": 4.5915785802731515e-05, "loss": 1.263, "step": 535 },
    { "epoch": 0.58, "grad_norm": 5.625, "learning_rate": 4.583838318878253e-05, "loss": 1.237, "step": 540 },
    { "epoch": 0.58, "grad_norm": 5.8125, "learning_rate": 4.576032051553345e-05, "loss": 1.3625, "step": 545 },
    { "epoch": 0.59, "grad_norm": 6.34375, "learning_rate": 4.5681600255632605e-05, "loss": 1.2604, "step": 550 },
    { "epoch": 0.59, "grad_norm": 4.875, "learning_rate": 4.56022249025575e-05, "loss": 1.4195, "step": 555 },
    { "epoch": 0.6, "grad_norm": 6.03125, "learning_rate": 4.5522196970535825e-05, "loss": 1.2599, "step": 560 },
    { "epoch": 0.6, "grad_norm": 5.21875, "learning_rate": 4.5441518994465806e-05, "loss": 1.3454, "step": 565 },
    { "epoch": 0.61, "grad_norm": 6.46875, "learning_rate": 4.5360193529835935e-05, "loss": 1.3825, "step": 570 },
    { "epoch": 0.61, "grad_norm": 5.8125, "learning_rate": 4.527822315264403e-05, "loss": 1.2646, "step": 575 },
    { "epoch": 0.62, "grad_norm": 5.78125, "learning_rate": 4.519561045931559e-05, "loss": 1.262, "step": 580 },
    { "epoch": 0.62, "grad_norm": 5.34375, "learning_rate": 4.5112358066621605e-05, "loss": 1.2725, "step": 585 },
    { "epoch": 0.63, "grad_norm": 8.1875, "learning_rate": 4.502846861159565e-05, "loss": 1.282, "step": 590 },
    { "epoch": 0.63, "grad_norm": 5.09375, "learning_rate": 4.494394475145036e-05, "loss": 1.292, "step": 595 },
    { "epoch": 0.64, "grad_norm": 5.84375, "learning_rate": 4.485878916349324e-05, "loss": 1.2546, "step": 600 },
    { "epoch": 0.65, "grad_norm": 6.0, "learning_rate": 4.4773004545041894e-05, "loss": 1.2673, "step": 605 },
    { "epoch": 0.65, "grad_norm": 5.09375, "learning_rate": 4.468659361333857e-05, "loss": 1.2391, "step": 610 },
    { "epoch": 0.66, "grad_norm": 8.4375, "learning_rate": 4.4599559105464106e-05, "loss": 1.3886, "step": 615 },
    { "epoch": 0.66, "grad_norm": 5.1875, "learning_rate": 4.451190377825121e-05, "loss": 1.1826, "step": 620 },
    { "epoch": 0.67, "grad_norm": 5.375, "learning_rate": 4.442363040819714e-05, "loss": 1.215, "step": 625 },
    { "epoch": 0.67, "grad_norm": 5.53125, "learning_rate": 4.433474179137577e-05, "loss": 1.18, "step": 630 },
    { "epoch": 0.68, "grad_norm": 5.875, "learning_rate": 4.424524074334902e-05, "loss": 1.2138, "step": 635 },
    { "epoch": 0.68, "grad_norm": 5.40625, "learning_rate": 4.4155130099077676e-05, "loss": 1.2433, "step": 640 },
    { "epoch": 0.69, "grad_norm": 5.0, "learning_rate": 4.406441271283159e-05, "loss": 1.2532, "step": 645 },
    { "epoch": 0.69, "grad_norm": 4.40625, "learning_rate": 4.3973091458099266e-05, "loss": 1.263, "step": 650 },
    { "epoch": 0.7, "grad_norm": 5.9375, "learning_rate": 4.388116922749683e-05, "loss": 1.2687, "step": 655 },
    { "epoch": 0.7, "grad_norm": 6.46875, "learning_rate": 4.3788648932676436e-05, "loss": 1.2414, "step": 660 },
    { "epoch": 0.71, "grad_norm": 5.5, "learning_rate": 4.369553350423401e-05, "loss": 1.2109, "step": 665 },
    { "epoch": 0.71, "grad_norm": 5.34375, "learning_rate": 4.3601825891616453e-05, "loss": 1.3227, "step": 670 },
    { "epoch": 0.72, "grad_norm": 5.0625, "learning_rate": 4.35075290630282e-05, "loss": 1.2769, "step": 675 },
    { "epoch": 0.73, "grad_norm": 5.59375, "learning_rate": 4.3412646005337166e-05, "loss": 1.2736, "step": 680 },
    { "epoch": 0.73, "grad_norm": 5.875, "learning_rate": 4.331717972398022e-05, "loss": 1.1944, "step": 685 },
    { "epoch": 0.74, "grad_norm": 5.875, "learning_rate": 4.32211332428679e-05, "loss": 1.2209, "step": 690 },
    { "epoch": 0.74, "grad_norm": 8.0625, "learning_rate": 4.31245096042887e-05, "loss": 1.2117, "step": 695 },
    { "epoch": 0.75, "grad_norm": 6.03125, "learning_rate": 4.302731186881265e-05, "loss": 1.2603, "step": 700 },
    { "epoch": 0.75, "grad_norm": 5.5625, "learning_rate": 4.29295431151944e-05, "loss": 1.3085, "step": 705 },
    { "epoch": 0.76, "grad_norm": 5.5, "learning_rate": 4.283120644027571e-05, "loss": 1.1815, "step": 710 },
    { "epoch": 0.76, "grad_norm": 5.1875, "learning_rate": 4.2732304958887336e-05, "loss": 1.2594, "step": 715 },
    { "epoch": 0.77, "grad_norm": 5.65625, "learning_rate": 4.2632841803750365e-05, "loss": 1.2502, "step": 720 },
    { "epoch": 0.77, "grad_norm": 7.09375, "learning_rate": 4.253282012537701e-05, "loss": 1.3242, "step": 725 },
    { "epoch": 0.78, "grad_norm": 6.65625, "learning_rate": 4.2432243091970784e-05, "loss": 1.286, "step": 730 },
    { "epoch": 0.78, "grad_norm": 5.625, "learning_rate": 4.233111388932618e-05, "loss": 1.2395, "step": 735 },
    { "epoch": 0.79, "grad_norm": 5.1875, "learning_rate": 4.222943572072773e-05, "loss": 1.2041, "step": 740 },
    { "epoch": 0.79, "grad_norm": 4.375, "learning_rate": 4.2127211806848575e-05, "loss": 1.2588, "step": 745 },
    { "epoch": 0.8, "grad_norm": 4.8125, "learning_rate": 4.202444538564842e-05, "loss": 1.2544, "step": 750 },
    { "epoch": 0.81, "grad_norm": 5.3125, "learning_rate": 4.192113971227097e-05, "loss": 1.2406, "step": 755 },
    { "epoch": 0.81, "grad_norm": 5.6875, "learning_rate": 4.181729805894083e-05, "loss": 1.3559, "step": 760 },
    { "epoch": 0.82, "grad_norm": 6.0625, "learning_rate": 4.171292371485991e-05, "loss": 1.2854, "step": 765 },
    { "epoch": 0.82, "grad_norm": 5.78125, "learning_rate": 4.160801998610312e-05, "loss": 1.2643, "step": 770 },
    { "epoch": 0.83, "grad_norm": 4.625, "learning_rate": 4.150259019551377e-05, "loss": 1.1756, "step": 775 },
    { "epoch": 0.83, "grad_norm": 5.78125, "learning_rate": 4.1396637682598235e-05, "loss": 1.2207, "step": 780 },
    { "epoch": 0.84, "grad_norm": 7.96875, "learning_rate": 4.1290165803420235e-05, "loss": 1.2095, "step": 785 },
    { "epoch": 0.84, "grad_norm": 5.9375, "learning_rate": 4.118317793049448e-05, "loss": 1.2021, "step": 790 },
    { "epoch": 0.85, "grad_norm": 5.25, "learning_rate": 4.107567745267987e-05, "loss": 1.2172, "step": 795 },
    { "epoch": 0.85, "grad_norm": 7.28125, "learning_rate": 4.096766777507218e-05, "loss": 1.1723, "step": 800 },
    { "epoch": 0.86, "grad_norm": 5.3125, "learning_rate": 4.085915231889615e-05, "loss": 1.2254, "step": 805 },
    { "epoch": 0.86, "grad_norm": 6.6875, "learning_rate": 4.075013452139714e-05, "loss": 1.2538, "step": 810 },
    { "epoch": 0.87, "grad_norm": 5.09375, "learning_rate": 4.064061783573227e-05, "loss": 1.1936, "step": 815 },
    { "epoch": 0.87, "grad_norm": 5.1875, "learning_rate": 4.053060573086101e-05, "loss": 1.19, "step": 820 },
    { "epoch": 0.88, "grad_norm": 6.125, "learning_rate": 4.042010169143533e-05, "loss": 1.2572, "step": 825 },
    { "epoch": 0.89, "grad_norm": 6.6875, "learning_rate": 4.030910921768931e-05, "loss": 1.2529, "step": 830 },
    { "epoch": 0.89, "grad_norm": 7.34375, "learning_rate": 4.0197631825328264e-05, "loss": 1.2959, "step": 835 },
    { "epoch": 0.9, "grad_norm": 5.84375, "learning_rate": 4.008567304541737e-05, "loss": 1.3033, "step": 840 },
    { "epoch": 0.9, "grad_norm": 5.4375, "learning_rate": 3.9973236424269876e-05, "loss": 1.3391, "step": 845 },
    { "epoch": 0.91, "grad_norm": 4.9375, "learning_rate": 3.9860325523334695e-05, "loss": 1.2128, "step": 850 },
    { "epoch": 0.91, "grad_norm": 5.15625, "learning_rate": 3.9746943919083665e-05, "loss": 1.2255, "step": 855 },
    { "epoch": 0.92, "grad_norm": 7.46875, "learning_rate": 3.963309520289819e-05, "loss": 1.2787, "step": 860 },
    { "epoch": 0.92, "grad_norm": 7.625, "learning_rate": 3.951878298095556e-05, "loss": 1.4147, "step": 865 },
    { "epoch": 0.93, "grad_norm": 5.65625, "learning_rate": 3.940401087411467e-05, "loss": 1.2919, "step": 870 },
    { "epoch": 0.93, "grad_norm": 5.96875, "learning_rate": 3.928878251780135e-05, "loss": 1.291, "step": 875 },
    { "epoch": 0.94, "grad_norm": 5.78125, "learning_rate": 3.91731015618932e-05, "loss": 1.2669, "step": 880 },
    { "epoch": 0.94, "grad_norm": 6.28125, "learning_rate": 3.905697167060402e-05, "loss": 1.2935, "step": 885 },
    { "epoch": 0.95, "grad_norm": 5.03125, "learning_rate": 3.894039652236767e-05, "loss": 1.3249, "step": 890 },
    { "epoch": 0.95, "grad_norm": 6.59375, "learning_rate": 3.882337980972164e-05, "loss": 1.2558, "step": 895 },
    { "epoch": 0.96, "grad_norm": 4.84375, "learning_rate": 3.870592523919002e-05, "loss": 1.1875, "step": 900 },
    { "epoch": 0.97, "grad_norm": 6.375, "learning_rate": 3.8588036531166124e-05, "loss": 1.2176, "step": 905 },
    { "epoch": 0.97, "grad_norm": 7.21875, "learning_rate": 3.846971741979465e-05, "loss": 1.2816, "step": 910 },
    { "epoch": 0.98, "grad_norm": 7.34375, "learning_rate": 3.835097165285341e-05, "loss": 1.2478, "step": 915 },
    { "epoch": 0.98, "grad_norm": 5.75, "learning_rate": 3.823180299163458e-05, "loss": 1.2156, "step": 920 },
    { "epoch": 0.99, "grad_norm": 5.9375, "learning_rate": 3.811221521082558e-05, "loss": 1.3427, "step": 925 },
    { "epoch": 0.99, "grad_norm": 5.3125, "learning_rate": 3.7992212098389536e-05, "loss": 1.2286, "step": 930 },
    { "epoch": 1.0, "grad_norm": 5.4375, "learning_rate": 3.7871797455445256e-05, "loss": 1.2868, "step": 935 },
    { "epoch": 1.0, "grad_norm": 5.0625, "learning_rate": 3.775097509614687e-05, "loss": 1.2123, "step": 940 },
    { "epoch": 1.01, "grad_norm": 5.5625, "learning_rate": 3.762974884756296e-05, "loss": 1.114, "step": 945 },
    { "epoch": 1.01, "grad_norm": 4.5, "learning_rate": 3.7508122549555384e-05, "loss": 1.1734, "step": 950 },
    { "epoch": 1.02, "grad_norm": 5.21875, "learning_rate": 3.738610005465764e-05, "loss": 1.1267, "step": 955 },
    { "epoch": 1.02, "grad_norm": 4.9375, "learning_rate": 3.726368522795282e-05, "loss": 1.2459, "step": 960 },
    { "epoch": 1.03, "grad_norm": 5.28125, "learning_rate": 3.714088194695119e-05, "loss": 1.1431, "step": 965 },
    { "epoch": 1.03, "grad_norm": 9.1875, "learning_rate": 3.7017694101467384e-05, "loss": 1.2636, "step": 970 },
    { "epoch": 1.04, "grad_norm": 5.96875, "learning_rate": 3.6894125593497167e-05, "loss": 1.1851, "step": 975 },
    { "epoch": 1.05, "grad_norm": 5.75, "learning_rate": 3.677018033709386e-05, "loss": 1.134, "step": 980 },
    { "epoch": 1.05, "grad_norm": 5.1875, "learning_rate": 3.6645862258244344e-05, "loss": 1.1106, "step": 985 },
    { "epoch": 1.06, "grad_norm": 5.71875, "learning_rate": 3.6521175294744735e-05, "loss": 1.1718, "step": 990 },
    { "epoch": 1.06, "grad_norm": 4.8125, "learning_rate": 3.639612339607561e-05, "loss": 1.1642, "step": 995 },
    { "epoch": 1.07, "grad_norm": 5.6875, "learning_rate": 3.627071052327695e-05, "loss": 1.1399, "step": 1000 },
    { "epoch": 1.07, "grad_norm": 6.4375, "learning_rate": 3.614494064882263e-05, "loss": 1.1741, "step": 1005 },
    { "epoch": 1.08, "grad_norm": 5.03125, "learning_rate": 3.6018817756494634e-05, "loss": 1.1551, "step": 1010 },
    { "epoch": 1.08, "grad_norm": 5.5, "learning_rate": 3.5892345841256825e-05, "loss": 1.1106, "step": 1015 },
    { "epoch": 1.09, "grad_norm": 7.75, "learning_rate": 3.576552890912844e-05, "loss": 1.1462, "step": 1020 },
    { "epoch": 1.09, "grad_norm": 4.90625, "learning_rate": 3.5638370977057186e-05, "loss": 1.1489, "step": 1025 },
    { "epoch": 1.1, "grad_norm": 5.125, "learning_rate": 3.5510876072791986e-05, "loss": 1.1427, "step": 1030 },
    { "epoch": 1.1, "grad_norm": 5.125, "learning_rate": 3.538304823475543e-05, "loss": 1.1172, "step": 1035 },
    { "epoch": 1.11, "grad_norm": 6.28125, "learning_rate": 3.5254891511915836e-05, "loss": 1.2579, "step": 1040 },
    { "epoch": 1.11, "grad_norm": 4.84375, "learning_rate": 3.5126409963659004e-05, "loss": 1.1606, "step": 1045 },
    { "epoch": 1.12, "grad_norm": 4.6875, "learning_rate": 3.4997607659659614e-05, "loss": 1.2115, "step": 1050 },
    { "epoch": 1.13, "grad_norm": 5.15625, "learning_rate": 3.4868488679752384e-05, "loss": 1.1618, "step": 1055 },
    { "epoch": 1.13, "grad_norm": 4.875, "learning_rate": 3.4739057113802756e-05, "loss": 1.1754, "step": 1060 },
    { "epoch": 1.14, "grad_norm": 6.125, "learning_rate": 3.46093170615774e-05, "loss": 1.2046, "step": 1065 },
    { "epoch": 1.14, "grad_norm": 6.03125, "learning_rate": 3.4479272632614326e-05, "loss": 1.2066, "step": 1070 },
    { "epoch": 1.15, "grad_norm": 6.90625, "learning_rate": 3.4348927946092766e-05, "loss": 1.1538, "step": 1075 },
    { "epoch": 1.15, "grad_norm": 5.5625, "learning_rate": 3.421828713070261e-05, "loss": 1.1924, "step": 1080 },
    { "epoch": 1.16, "grad_norm": 5.78125, "learning_rate": 3.4087354324513734e-05, "loss": 1.1463, "step": 1085 },
    { "epoch": 1.16, "grad_norm": 8.75, "learning_rate": 3.3956133674844826e-05, "loss": 1.2498, "step": 1090 },
    { "epoch": 1.17, "grad_norm": 5.90625, "learning_rate": 3.38246293381321e-05, "loss": 1.1543, "step": 1095 },
    { "epoch": 1.17, "grad_norm": 5.3125, "learning_rate": 3.3692845479797564e-05, "loss": 1.1401, "step": 1100 },
    { "epoch": 1.18, "grad_norm": 5.53125, "learning_rate": 3.356078627411716e-05, "loss": 1.1909, "step": 1105 },
    { "epoch": 1.18, "grad_norm": 5.9375, "learning_rate": 3.342845590408849e-05, "loss": 1.1075, "step": 1110 },
    { "epoch": 1.19, "grad_norm": 5.15625, "learning_rate": 3.329585856129833e-05, "loss": 1.2045, "step": 1115 },
    { "epoch": 1.19, "grad_norm": 5.21875, "learning_rate": 3.316299844578985e-05, "loss": 1.1752, "step": 1120 },
    { "epoch": 1.2, "grad_norm": 4.5625, "learning_rate": 3.302987976592963e-05, "loss": 1.1077, "step": 1125 },
    { "epoch": 1.21, "grad_norm": 5.15625, "learning_rate": 3.289650673827427e-05, "loss": 1.1641, "step": 1130 },
    { "epoch": 1.21, "grad_norm": 4.96875, "learning_rate": 3.276288358743691e-05, "loss": 1.1683, "step": 1135 },
    { "epoch": 1.22, "grad_norm": 6.6875, "learning_rate": 3.262901454595338e-05, "loss": 1.1507, "step": 1140 },
    { "epoch": 1.22, "grad_norm": 6.1875, "learning_rate": 3.2494903854148134e-05, "loss": 1.1706, "step": 1145 },
    { "epoch": 1.23, "grad_norm": 5.5, "learning_rate": 3.236055575999994e-05, "loss": 1.1484, "step": 1150 },
    { "epoch": 1.23, "grad_norm": 5.03125, "learning_rate": 3.222597451900733e-05, "loss": 1.2247, "step": 1155 },
    { "epoch": 1.24, "grad_norm": 5.84375, "learning_rate": 3.2091164394053785e-05, "loss": 1.1492, "step": 1160 },
    { "epoch": 1.24, "grad_norm": 5.375, "learning_rate": 3.1956129655272755e-05, "loss": 1.1822, "step": 1165 },
    { "epoch": 1.25, "grad_norm": 6.59375, "learning_rate": 3.182087457991233e-05, "loss": 1.2857, "step": 1170 },
    { "epoch": 1.25, "grad_norm": 7.75, "learning_rate": 3.168540345219985e-05, "loss": 1.1828, "step": 1175 },
    { "epoch": 1.26, "grad_norm": 7.75, "learning_rate": 3.15497205632061e-05, "loss": 1.2212, "step": 1180 },
    { "epoch": 1.26, "grad_norm": 6.03125, "learning_rate": 3.141383021070947e-05, "loss": 1.1151, "step": 1185 },
    { "epoch": 1.27, "grad_norm": 4.71875, "learning_rate": 3.127773669905978e-05, "loss": 1.226, "step": 1190 },
    { "epoch": 1.27, "grad_norm": 6.09375, "learning_rate": 3.114144433904194e-05, "loss": 1.2121, "step": 1195 },
    { "epoch": 1.28, "grad_norm": 5.8125, "learning_rate": 3.100495744773944e-05, "loss": 1.1956, "step": 1200 },
    { "epoch": 1.29, "grad_norm": 6.90625, "learning_rate": 3.086828034839756e-05, "loss": 1.1705, "step": 1205 },
    { "epoch": 1.29, "grad_norm": 5.8125, "learning_rate": 3.073141737028648e-05, "loss": 1.1738, "step": 1210 },
    { "epoch": 1.3, "grad_norm": 6.28125, "learning_rate": 3.059437284856409e-05, "loss": 1.2063, "step": 1215 },
    { "epoch": 1.3, "grad_norm": 5.4375, "learning_rate": 3.04571511241387e-05, "loss": 1.1492, "step": 1220 },
    { "epoch": 1.31, "grad_norm": 5.78125, "learning_rate": 3.031975654353158e-05, "loss": 1.2052, "step": 1225 },
    { "epoch": 1.31, "grad_norm": 5.0, "learning_rate": 3.018219345873923e-05, "loss": 1.1688, "step": 1230 },
    { "epoch": 1.32, "grad_norm": 6.21875, "learning_rate": 3.004446622709554e-05, "loss": 1.0857, "step": 1235 },
    { "epoch": 1.32, "grad_norm": 6.40625, "learning_rate": 2.99065792111338e-05, "loss": 1.2369, "step": 1240 },
    { "epoch": 1.33, "grad_norm": 6.46875, "learning_rate": 2.976853677844847e-05, "loss": 1.1783, "step": 1245 },
    { "epoch": 1.33, "grad_norm": 5.65625, "learning_rate": 2.9630343301556883e-05, "loss": 1.1596, "step": 1250 },
    { "epoch": 1.34, "grad_norm": 5.90625, "learning_rate": 2.9492003157760713e-05, "loss": 1.2145, "step": 1255 },
    { "epoch": 1.34, "grad_norm": 5.625, "learning_rate": 2.9353520729007326e-05, "loss": 1.157, "step": 1260 },
    { "epoch": 1.35, "grad_norm": 5.28125, "learning_rate": 2.921490040175101e-05, "loss": 1.2069, "step": 1265 },
    { "epoch": 1.35, "grad_norm": 5.90625, "learning_rate": 2.9076146566813994e-05, "loss": 1.1791, "step": 1270 },
    { "epoch": 1.36, "grad_norm": 5.21875, "learning_rate": 2.893726361924738e-05, "loss": 1.1243, "step": 1275 },
    { "epoch": 1.37, "grad_norm": 6.53125, "learning_rate": 2.8798255958191946e-05, "loss": 1.191, "step": 1280 },
    { "epoch": 1.37, "grad_norm": 5.625, "learning_rate": 2.8659127986738782e-05, "loss": 1.1712, "step": 1285 },
    { "epoch": 1.38, "grad_norm": 4.75, "learning_rate": 2.851988411178985e-05, "loss": 1.1619, "step": 1290 },
    { "epoch": 1.38, "grad_norm": 5.71875, "learning_rate": 2.8380528743918348e-05, "loss": 1.1699, "step": 1295 },
    { "epoch": 1.39, "grad_norm": 6.375, "learning_rate": 2.824106629722906e-05, "loss": 1.2141, "step": 1300 },
    { "epoch": 1.39, "grad_norm": 6.1875, "learning_rate": 2.810150118921847e-05, "loss": 1.1444, "step": 1305 },
    { "epoch": 1.4, "grad_norm": 5.875, "learning_rate": 2.7961837840634913e-05, "loss": 1.1494, "step": 1310 },
    { "epoch": 1.4, "grad_norm": 6.3125, "learning_rate": 2.7822080675338492e-05, "loss": 1.1783, "step": 1315 },
    { "epoch": 1.41, "grad_norm": 5.25, "learning_rate": 2.768223412016099e-05, "loss": 1.1855, "step": 1320 },
    { "epoch": 1.41, "grad_norm": 5.96875, "learning_rate": 2.7542302604765598e-05, "loss": 1.1509, "step": 1325 },
    { "epoch": 1.42, "grad_norm": 5.03125, "learning_rate": 2.740229056150666e-05, "loss": 1.1274, "step": 1330 },
    { "epoch": 1.42, "grad_norm": 4.53125, "learning_rate": 2.7262202425289253e-05, "loss": 1.1581, "step": 1335 },
    { "epoch": 1.43, "grad_norm": 5.28125, "learning_rate": 2.7122042633428697e-05, "loss": 1.0967, "step": 1340 },
    { "epoch": 1.43, "grad_norm": 5.71875, "learning_rate": 2.698181562551002e-05, "loss": 1.181, "step": 1345 },
    { "epoch": 1.44, "grad_norm": 5.625, "learning_rate": 2.684152584324735e-05, "loss": 1.151, "step": 1350 },
    { "epoch": 1.45, "grad_norm": 7.09375, "learning_rate": 2.6701177730343186e-05, "loss": 1.1978, "step": 1355 },
    { "epoch": 1.45, "grad_norm": 5.53125, "learning_rate": 2.6560775732347654e-05, "loss": 1.1315, "step": 1360 },
    { "epoch": 1.46, "grad_norm": 5.25, "learning_rate": 2.6420324296517708e-05, "loss": 1.1845, "step": 1365 },
    { "epoch": 1.46, "grad_norm": 5.53125, "learning_rate": 2.627982787167625e-05, "loss": 1.1386, "step": 1370 },
    { "epoch": 1.47, "grad_norm": 4.96875, "learning_rate": 2.613929090807123e-05, "loss": 1.159, "step": 1375 },
    { "epoch": 1.47, "grad_norm": 5.3125, "learning_rate": 2.599871785723465e-05, "loss": 1.1653, "step": 1380 },
    { "epoch": 1.48, "grad_norm": 5.53125, "learning_rate": 2.5858113171841604e-05, "loss": 1.2584, "step": 1385 },
    { "epoch": 1.48, "grad_norm": 5.75, "learning_rate": 2.5717481305569186e-05, "loss": 1.2103, "step": 1390 },
    { "epoch": 1.49, "grad_norm": 5.875, "learning_rate": 2.5576826712955472e-05, "loss": 1.2361, "step": 1395 },
    { "epoch": 1.49, "grad_norm": 5.15625, "learning_rate": 2.5436153849258398e-05, "loss": 1.1491, "step": 1400 },
    { "epoch": 1.5, "grad_norm": 5.5625, "learning_rate": 2.5295467170314624e-05, "loss": 1.1003, "step": 1405 },
    { "epoch": 1.5, "grad_norm": 5.6875, "learning_rate": 2.515477113239843e-05, "loss": 1.2162, "step": 1410 },
    { "epoch": 1.51, "grad_norm": 6.1875, "learning_rate": 2.5014070192080526e-05, "loss": 1.1498, "step": 1415 },
    { "epoch": 1.51, "grad_norm": 6.28125, "learning_rate": 2.4873368806086918e-05, "loss": 1.2132, "step": 1420 },
    { "epoch": 1.52, "grad_norm": 9.4375, "learning_rate": 2.4732671431157716e-05, "loss": 1.1903, "step": 1425 },
    { "epoch": 1.53, "grad_norm": 7.625, "learning_rate": 2.4591982523906e-05, "loss": 1.1472, "step": 1430 },
    { "epoch": 1.53, "grad_norm": 5.25, "learning_rate": 2.4451306540676615e-05, "loss": 1.1085, "step": 1435 },
    { "epoch": 1.54, "grad_norm": 4.84375, "learning_rate": 2.431064793740505e-05, "loss": 1.0636, "step": 1440 },
    { "epoch": 1.54, "grad_norm": 6.1875, "learning_rate": 2.4170011169476254e-05, "loss": 1.1439, "step": 1445 },
    { "epoch": 1.55, "grad_norm": 4.96875, "learning_rate": 2.4029400691583574e-05, "loss": 1.2272, "step": 1450 },
    { "epoch": 1.55, "grad_norm": 5.71875, "learning_rate": 2.3888820957587605e-05, "loss": 1.158, "step": 1455 },
    { "epoch": 1.56, "grad_norm": 5.90625, "learning_rate": 2.3748276420375095e-05, "loss": 1.1893, "step": 1460 },
    { "epoch": 1.56, "grad_norm": 5.6875, "learning_rate": 2.3607771531717982e-05, "loss": 1.1827, "step": 1465 },
    { "epoch": 1.57, "grad_norm": 5.09375, "learning_rate": 2.3467310742132266e-05, "loss": 1.2413, "step": 1470 },
    { "epoch": 1.57, "grad_norm": 5.125, "learning_rate": 2.332689850073715e-05, "loss": 1.1588, "step": 1475 },
    { "epoch": 1.58, "grad_norm": 7.46875, "learning_rate": 2.3186539255114058e-05, "loss": 1.2112, "step": 1480 },
    { "epoch": 1.58, "grad_norm": 5.53125, "learning_rate": 2.304623745116573e-05, "loss": 1.1712, "step": 1485 },
    { "epoch": 1.59, "grad_norm": 5.625, "learning_rate": 2.2905997532975484e-05, "loss": 1.1619, "step": 1490 },
    { "epoch": 1.59, "grad_norm": 5.15625, "learning_rate": 2.2765823942666343e-05, "loss": 1.1737, "step": 1495 },
    { "epoch": 1.6, "grad_norm": 6.03125, "learning_rate": 2.2625721120260416e-05, "loss": 1.1432, "step": 1500 },
    { "epoch": 1.61, "grad_norm": 6.0, "learning_rate": 2.2485693503538232e-05, "loss": 1.1612, "step": 1505 },
    { "epoch": 1.61, "grad_norm": 4.78125, "learning_rate": 2.2345745527898138e-05, "loss": 1.1659, "step": 1510 },
    { "epoch": 1.62, "grad_norm": 5.78125, "learning_rate": 2.2205881626215868e-05, "loss": 1.1377, "step": 1515 },
    { "epoch": 1.62, "grad_norm": 5.28125, "learning_rate": 2.2066106228704058e-05, "loss": 1.1139, "step": 1520 },
    { "epoch": 1.63, "grad_norm": 6.03125, "learning_rate": 2.1926423762772017e-05, "loss": 1.1575, "step": 1525 },
    { "epoch": 1.63, "grad_norm": 6.1875, "learning_rate": 2.1786838652885373e-05, "loss": 1.1132, "step": 1530 },
    { "epoch": 1.64, "grad_norm": 4.84375, "learning_rate": 2.1647355320426017e-05, "loss": 1.1038, "step": 1535 },
    { "epoch": 1.64, "grad_norm": 5.625, "learning_rate": 2.1507978183552034e-05, "loss": 1.064, "step": 1540 },
    { "epoch": 1.65, "grad_norm": 5.1875, "learning_rate": 2.1368711657057705e-05, "loss": 1.1183, "step": 1545 },
    { "epoch": 1.65, "grad_norm": 4.71875, "learning_rate": 2.122956015223375e-05, "loss": 1.1619, "step": 1550 },
    { "epoch": 1.66, "grad_norm": 5.0, "learning_rate": 2.109052807672752e-05, "loss": 1.2064, "step": 1555 },
    { "epoch": 1.66, "grad_norm": 5.9375, "learning_rate": 2.0951619834403462e-05, "loss": 1.1686, "step": 1560 },
    { "epoch": 1.67, "grad_norm": 5.375, "learning_rate": 2.081283982520356e-05, "loss": 1.1372, "step": 1565 },
    { "epoch": 1.67, "grad_norm": 5.375, "learning_rate": 2.0674192445007997e-05, "loss": 1.1645, "step": 1570 },
    { "epoch": 1.68, "grad_norm": 5.75, "learning_rate": 2.0535682085495912e-05, "loss": 1.1098, "step": 1575 },
    { "epoch": 1.69, "grad_norm": 6.40625, "learning_rate": 2.039731313400629e-05, "loss": 1.1762, "step": 1580 },
    { "epoch": 1.69, "grad_norm": 6.1875, "learning_rate": 2.025908997339899e-05, "loss": 1.1623, "step": 1585 },
    { "epoch": 1.7, "grad_norm": 4.65625, "learning_rate": 2.0121016981915934e-05, "loss": 1.169, "step": 1590 },
    { "epoch": 1.7, "grad_norm": 5.6875, "learning_rate": 1.9983098533042384e-05, "loss": 1.1352, "step": 1595 },
    { "epoch": 1.71, "grad_norm": 5.125, "learning_rate": 1.9845338995368483e-05, "loss": 1.1707, "step": 1600 },
    { "epoch": 1.71, "grad_norm": 5.78125, "learning_rate": 1.9707742732450775e-05, "loss": 1.1958, "step": 1605 },
    { "epoch": 1.72, "grad_norm": 5.28125, "learning_rate": 1.9570314102674108e-05, "loss": 1.1699, "step": 1610 },
    { "epoch": 1.72, "grad_norm": 5.8125, "learning_rate": 1.9433057459113485e-05, "loss": 1.1558, "step": 1615 },
    { "epoch": 1.73, "grad_norm": 5.78125, "learning_rate": 1.9295977149396225e-05, "loss": 1.1885, "step": 1620 },
    { "epoch": 1.73, "grad_norm": 5.875, "learning_rate": 1.9159077515564257e-05, "loss": 1.1141, "step": 1625 },
    { "epoch": 1.74, "grad_norm": 4.375, "learning_rate": 1.902236289393654e-05, "loss": 1.1866, "step": 1630 },
    { "epoch": 1.74, "grad_norm": 4.90625, "learning_rate": 1.888583761497178e-05, "loss": 1.1655, "step": 1635 },
    { "epoch": 1.75, "grad_norm": 5.25, "learning_rate": 1.874950600313118e-05, "loss": 1.167, "step": 1640 },
    { "epoch": 1.75, "grad_norm": 6.625, "learning_rate": 1.861337237674154e-05, "loss": 1.1927, "step": 1645 },
    { "epoch": 1.76, "grad_norm": 5.375, "learning_rate": 1.8477441047858425e-05, "loss": 1.1902, "step": 1650 },
    { "epoch": 1.77, "grad_norm": 5.65625, "learning_rate": 1.8341716322129575e-05, "loss": 1.1921, "step": 1655 },
    { "epoch": 1.77, "grad_norm": 5.25, "learning_rate": 1.820620249865858e-05, "loss": 1.1864, "step": 1660 },
    { "epoch": 1.78, "grad_norm": 6.875, "learning_rate": 1.8070903869868615e-05, "loss": 1.1866, "step": 1665 },
    { "epoch": 1.78, "grad_norm": 5.65625, "learning_rate": 1.7935824721366574e-05, "loss": 1.1595, "step": 1670 },
    { "epoch": 1.79, "grad_norm": 7.1875, "learning_rate": 1.7800969331807237e-05, "loss": 1.1422, "step": 1675 },
    { "epoch": 1.79, "grad_norm": 5.8125, "learning_rate": 1.766634197275781e-05, "loss": 1.1428, "step": 1680 },
    { "epoch": 1.8, "grad_norm": 5.9375, "learning_rate": 1.7531946908562596e-05, "loss": 1.1056, "step": 1685 },
    { "epoch": 1.8, "grad_norm": 7.1875, "learning_rate": 1.7397788396207883e-05, "loss": 1.2066, "step": 1690 },
    { "epoch": 1.81, "grad_norm": 5.1875, "learning_rate": 1.7263870685187188e-05, "loss": 1.1753, "step": 1695 },
    { "epoch": 1.81, "grad_norm": 6.15625, "learning_rate": 1.7130198017366556e-05, "loss": 1.1243, "step": 1700 },
    { "epoch": 1.82, "grad_norm": 5.09375, "learning_rate": 1.699677462685029e-05, "loss": 1.1959, "step": 1705 },
    { "epoch": 1.82, "grad_norm": 5.25, "learning_rate": 1.686360473984678e-05, "loss": 1.1313, "step": 1710 },
    { "epoch": 1.83, "grad_norm": 5.21875, "learning_rate": 1.6730692574534622e-05, "loss": 1.1578, "step": 1715 },
    { "epoch": 1.83, "grad_norm": 6.5625, "learning_rate": 1.659804234092908e-05, "loss": 1.1415, "step": 1720 },
    { "epoch": 1.84, "grad_norm": 6.1875, "learning_rate": 1.646565824074865e-05, "loss": 1.1565, "step": 1725 },
    { "epoch": 1.85, "grad_norm": 5.3125, "learning_rate": 1.6333544467282028e-05, "loss": 1.1677, "step": 1730 },
    { "epoch": 1.85, "grad_norm": 9.375, "learning_rate": 1.6201705205255244e-05, "loss": 1.1842, "step": 1735 },
    { "epoch": 1.86, "grad_norm": 7.8125, "learning_rate": 1.6070144630699165e-05, "loss": 1.2085, "step": 1740 },
    { "epoch": 1.86, "grad_norm": 6.0625, "learning_rate": 1.5938866910817152e-05, "loss": 1.156, "step": 1745 },
    { "epoch": 1.87, "grad_norm": 5.28125, "learning_rate": 1.5807876203853118e-05, "loss": 1.1278, "step": 1750 },
    { "epoch": 1.87, "grad_norm": 5.59375, "learning_rate": 1.5677176658959782e-05, "loss": 1.1558, "step": 1755 },
    { "epoch": 1.88, "grad_norm": 5.75, "learning_rate": 1.554677241606726e-05, "loss": 1.1674, "step": 1760 },
    { "epoch": 1.88, "grad_norm": 5.0625, "learning_rate": 1.5416667605751933e-05, "loss": 1.1816, "step": 1765 },
    { "epoch": 1.89, "grad_norm": 5.65625, "learning_rate": 1.5286866349105593e-05, "loss": 1.1665, "step": 1770 },
    { "epoch": 1.89, "grad_norm": 5.96875, "learning_rate": 1.5157372757604932e-05, "loss": 1.1067, "step": 1775 },
    { "epoch": 1.9, "grad_norm": 6.75, "learning_rate": 1.502819093298128e-05, "loss": 1.2465, "step": 1780 },
    { "epoch": 1.9, "grad_norm": 5.65625, "learning_rate": 1.4899324967090705e-05, "loss": 1.1388, "step": 1785 },
    { "epoch": 1.91, "grad_norm": 6.125, "learning_rate": 1.4770778941784401e-05, "loss": 1.1855, "step": 1790 },
    { "epoch": 1.91, "grad_norm": 5.03125, "learning_rate": 1.4642556928779403e-05, "loss": 1.1639, "step": 1795 },
    { "epoch": 1.92, "grad_norm": 5.625, "learning_rate": 1.4514662989529562e-05, "loss": 1.1454, "step": 1800 },
    { "epoch": 1.93, "grad_norm": 5.46875, "learning_rate": 1.4387101175096985e-05, "loss": 1.1658, "step": 1805 },
    { "epoch": 1.93, "grad_norm": 5.3125, "learning_rate": 1.4259875526023622e-05, "loss": 1.1015, "step": 1810 },
    { "epoch": 1.94, "grad_norm": 4.84375, "learning_rate": 1.4132990072203378e-05, "loss": 1.1869, "step": 1815 },
    { "epoch": 1.94, "grad_norm": 5.90625, "learning_rate": 1.400644883275437e-05, "loss": 1.1581, "step": 1820 },
    { "epoch": 1.95, "grad_norm": 5.4375, "learning_rate": 1.3880255815891696e-05, "loss": 1.2049, "step": 1825 },
    { "epoch": 1.95, "grad_norm": 5.3125, "learning_rate": 1.3754415018800448e-05, "loss": 1.1779, "step": 1830 },
    { "epoch": 1.96, "grad_norm": 5.03125, "learning_rate": 1.3628930427509068e-05, "loss": 1.1909, "step": 1835 },
    { "epoch": 1.96, "grad_norm": 4.625, "learning_rate": 1.3503806016763152e-05, "loss": 1.1462, "step": 1840 },
    { "epoch": 1.97, "grad_norm": 6.03125, "learning_rate": 1.3379045749899485e-05, "loss": 1.1609, "step": 1845 },
    { "epoch": 1.97, "grad_norm": 6.0625, "learning_rate": 1.325465357872056e-05, "loss": 1.1463, "step": 1850 },
    { "epoch": 1.98, "grad_norm": 5.6875, "learning_rate": 1.3130633443369377e-05, "loss": 1.1447, "step": 1855 },
    { "epoch": 1.98, "grad_norm": 5.78125, "learning_rate": 1.3006989272204611e-05, "loss": 1.1105, "step": 1860 },
    { "epoch": 1.99, "grad_norm": 8.3125, "learning_rate": 1.2883724981676243e-05, "loss": 1.2189, "step": 1865 },
    { "epoch": 1.99, "grad_norm": 5.25, "learning_rate": 1.2760844476201429e-05, "loss": 1.1292, "step": 1870 },
    { "epoch": 2.0, "grad_norm": 5.21875, "learning_rate": 1.2638351648040918e-05, "loss": 1.1375, "step": 1875 },
    { "epoch": 2.01, "grad_norm": 4.65625, "learning_rate": 1.2516250377175673e-05, "loss": 1.1576, "step": 1880 },
    { "epoch": 2.01, "grad_norm": 4.84375, "learning_rate": 1.2394544531184036e-05, "loss": 1.168, "step": 1885 },
    { "epoch": 2.02, "grad_norm": 6.3125, "learning_rate": 1.2273237965119202e-05, "loss": 1.1362, "step": 1890 },
    { "epoch": 2.02, "grad_norm": 6.0, "learning_rate": 1.2152334521387082e-05, "loss": 1.147, "step": 1895 },
    { "epoch": 2.03, "grad_norm": 7.28125, "learning_rate": 1.2031838029624657e-05, "loss": 1.0798, "step": 1900 },
    { "epoch": 2.03, "grad_norm": 4.375, "learning_rate": 1.1911752306578593e-05, "loss": 1.0446, "step": 1905 },
    { "epoch": 2.04, "grad_norm": 5.21875, "learning_rate": 1.1792081155984422e-05, "loss": 1.0947, "step": 1910 },
    { "epoch": 2.04, "grad_norm": 5.25, "learning_rate": 1.1672828368446018e-05, "loss": 1.1792, "step": 1915 },
    { "epoch": 2.05, "grad_norm": 5.96875, "learning_rate": 1.1553997721315509e-05, "loss": 1.1014, "step": 1920 },
    { "epoch": 2.05, "grad_norm": 5.53125, "learning_rate": 1.1435592978573695e-05, "loss": 1.1151, "step": 1925 },
    { "epoch": 2.06, "grad_norm": 5.71875, "learning_rate": 1.1317617890710738e-05, "loss": 1.1495, "step": 1930 },
    { "epoch": 2.06, "grad_norm": 5.125, "learning_rate": 1.1200076194607448e-05, "loss": 1.203, "step": 1935 },
    { "epoch": 2.07, "grad_norm": 6.84375, "learning_rate": 1.1082971613416865e-05, "loss": 1.1496, "step": 1940 },
    { "epoch": 2.07, "grad_norm": 5.5625, "learning_rate": 1.0966307856446322e-05, "loss": 1.098, "step": 1945 },
    { "epoch": 2.08, "grad_norm": 5.3125, "learning_rate": 1.0850088619040005e-05, "loss": 1.1397, "step": 1950 },
    { "epoch": 2.09, "grad_norm": 5.0, "learning_rate": 1.0734317582461833e-05, "loss": 1.2423, "step": 1955 },
    { "epoch": 2.09, "grad_norm": 4.875, "learning_rate": 1.061899841377892e-05, "loss": 1.1154, "step": 1960 },
    { "epoch": 2.1, "grad_norm": 4.78125, "learning_rate": 1.0504134765745351e-05, "loss": 1.2164, "step": 1965 },
    { "epoch": 2.1, "grad_norm": 5.0, "learning_rate": 1.0389730276686557e-05, "loss": 1.0513, "step": 1970 },
    { "epoch": 2.11, "grad_norm": 4.9375, "learning_rate": 1.0275788570384016e-05, "loss": 1.0955, "step": 1975 },
    { "epoch": 2.11, "grad_norm": 5.59375, "learning_rate": 1.0162313255960465e-05, "loss": 1.075, "step": 1980 },
    { "epoch": 2.12, "grad_norm": 5.03125, "learning_rate": 1.0049307927765634e-05, "loss": 1.1331, "step": 1985 },
    { "epoch": 2.12, "grad_norm": 6.0, "learning_rate": 9.936776165262324e-06, "loss": 1.1564, "step": 1990 },
    { "epoch": 2.13, "grad_norm": 4.75, "learning_rate": 9.824721532913091e-06, "loss": 1.0781, "step": 1995 },
    { "epoch": 2.13, "grad_norm": 5.46875, "learning_rate": 9.71314758006731e-06, "loss": 1.1172, "step": 2000 },
    { "epoch": 2.14, "grad_norm": 6.09375, "learning_rate": 9.602057840848732e-06, "loss": 1.1993, "step": 2005 },
    { "epoch": 2.14, "grad_norm": 4.75, "learning_rate": 9.491455834043583e-06, "loss": 1.0923, "step": 2010 },
    { "epoch": 2.15, "grad_norm": 5.75, "learning_rate": 9.38134506298906e-06, "loss": 1.1308, "step": 2015 },
    { "epoch": 2.15, "grad_norm": 4.84375, "learning_rate": 9.271729015462411e-06, "loss": 1.1036, "step": 2020 },
    { "epoch": 2.16, "grad_norm": 4.65625, "learning_rate": 9.162611163570403e-06, "loss": 1.1123, "step": 2025 },
    { "epoch": 2.17, "grad_norm": 6.21875, "learning_rate": 9.053994963639406e-06, "loss": 1.1922, "step": 2030 },
    { "epoch": 2.17, "grad_norm": 6.3125, "learning_rate": 8.945883856105841e-06, "loss": 1.2439, "step": 2035 },
    { "epoch": 2.18, "grad_norm": 5.9375, "learning_rate": 8.838281265407284e-06, "loss": 1.1455, "step": 2040 },
    { "epoch": 2.18, "grad_norm": 5.09375, "learning_rate": 8.731190599873912e-06, "loss": 1.1547, "step": 2045 },
    { "epoch": 2.19, "grad_norm": 5.40625, "learning_rate": 8.624615251620627e-06, "loss": 1.1167, "step": 2050 },
    { "epoch": 2.19, "grad_norm": 6.53125, "learning_rate": 8.518558596439532e-06, "loss": 1.1272, "step": 2055 },
    { "epoch": 2.2, "grad_norm": 6.4375, "learning_rate": 8.413023993693075e-06, "loss": 1.1031, "step": 2060 },
    { "epoch": 2.2, "grad_norm": 5.71875, "learning_rate": 8.308014786207571e-06, "loss": 1.1688, "step": 2065 },
    { "epoch": 2.21, "grad_norm": 4.84375, "learning_rate": 8.203534300167382e-06, "loss": 1.1888, "step": 2070 },
    { "epoch": 2.21, "grad_norm": 5.1875, "learning_rate": 8.0995858450095e-06, "loss": 1.1494, "step": 2075 },
    { "epoch": 2.22, "grad_norm": 4.8125, "learning_rate": 7.99617271331877e-06, "loss": 1.1709, "step": 2080 },
    { "epoch": 2.22, "grad_norm": 5.46875, "learning_rate": 7.893298180723582e-06, "loss": 1.1323, "step": 2085 },
    { "epoch": 2.23, "grad_norm": 5.96875, "learning_rate": 7.790965505792078e-06, "loss": 1.1689, "step": 2090 },
    { "epoch": 2.23, "grad_norm": 6.15625, "learning_rate": 7.689177929929004e-06, "loss": 1.1852, "step": 2095 },
    { "epoch": 2.24, "grad_norm": 5.34375, "learning_rate": 7.5879386772729625e-06, "loss": 1.1371, "step": 2100 },
    { "epoch": 2.25, "grad_norm": 4.5625, "learning_rate": 7.4872509545943715e-06, "loss": 1.0184, "step": 2105 },
    { "epoch": 2.25, "grad_norm": 4.78125, "learning_rate": 7.387117951193792e-06, "loss": 1.1552, "step": 2110 },
    { "epoch": 2.26, "grad_norm": 6.125, "learning_rate": 7.287542838801003e-06, "loss": 1.1553, "step": 2115 },
    { "epoch": 2.26, "grad_norm": 5.28125, "learning_rate": 7.188528771474478e-06, "loss": 1.1062, "step": 2120 },
    { "epoch": 2.27, "grad_norm": 5.125, "learning_rate": 7.090078885501475e-06, "loss": 1.0785, "step": 2125 },
    { "epoch": 2.27, "grad_norm": 4.90625, "learning_rate": 6.992196299298739e-06, "loss": 1.0447, "step": 2130 },
    { "epoch": 2.28, "grad_norm": 6.375, "learning_rate": 6.894884113313679e-06, "loss": 1.1757, "step": 2135 },
    { "epoch": 2.28, "grad_norm": 5.6875, "learning_rate": 6.798145409926191e-06, "loss": 1.1345, "step": 2140 },
    { "epoch": 2.29, "grad_norm": 5.46875, "learning_rate": 6.701983253351021e-06, "loss": 1.1477, "step": 2145 },
    { "epoch": 2.29, "grad_norm": 5.78125, "learning_rate": 6.606400689540673e-06, "loss": 1.1445, "step": 2150 },
    { "epoch": 2.3, "grad_norm": 5.0625, "learning_rate": 6.511400746088986e-06, "loss": 1.1672, "step": 2155 },
    { "epoch": 2.3, "grad_norm": 5.75, "learning_rate": 6.416986432135161e-06, "loss": 1.0833, "step": 2160 },
    { "epoch": 2.31, "grad_norm": 5.03125, "learning_rate": 6.323160738268524e-06, "loss": 1.0998, "step": 2165 },
    { "epoch": 2.31, "grad_norm": 4.53125, "learning_rate": 6.229926636433733e-06, "loss": 1.0866, "step": 2170 },
    { "epoch": 2.32, "grad_norm": 5.96875, "learning_rate": 6.137287079836681e-06, "loss": 1.142, "step": 2175 },
    { "epoch": 2.33, "grad_norm": 5.15625, "learning_rate": 6.045245002850952e-06, "loss": 1.0761, "step": 2180 },
    { "epoch": 2.33, "grad_norm": 6.15625, "learning_rate": 5.95380332092483e-06, "loss": 1.1538, "step": 2185 },
    { "epoch": 2.34, "grad_norm": 4.9375, "learning_rate": 5.8629649304890235e-06,
|
"loss": 1.1283, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 2.34, |
|
"grad_norm": 6.3125, |
|
"learning_rate": 5.772732708864842e-06, |
|
"loss": 1.2129, |
|
"step": 2195 |
|
}, |
|
{ |
|
"epoch": 2.35, |
|
"grad_norm": 5.625, |
|
"learning_rate": 5.683109514173124e-06, |
|
"loss": 1.1755, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 2.35, |
|
"grad_norm": 5.125, |
|
"learning_rate": 5.594098185243676e-06, |
|
"loss": 1.072, |
|
"step": 2205 |
|
}, |
|
{ |
|
"epoch": 2.36, |
|
"grad_norm": 5.5625, |
|
"learning_rate": 5.505701541525335e-06, |
|
"loss": 1.1341, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 2.36, |
|
"grad_norm": 5.09375, |
|
"learning_rate": 5.417922382996699e-06, |
|
"loss": 1.1623, |
|
"step": 2215 |
|
}, |
|
{ |
|
"epoch": 2.37, |
|
"grad_norm": 5.875, |
|
"learning_rate": 5.330763490077404e-06, |
|
"loss": 1.09, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 2.37, |
|
"grad_norm": 7.875, |
|
"learning_rate": 5.244227623540074e-06, |
|
"loss": 1.1628, |
|
"step": 2225 |
|
}, |
|
{ |
|
"epoch": 2.38, |
|
"grad_norm": 5.59375, |
|
"learning_rate": 5.1583175244228785e-06, |
|
"loss": 1.1654, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 2.38, |
|
"grad_norm": 5.5, |
|
"learning_rate": 5.073035913942678e-06, |
|
"loss": 1.117, |
|
"step": 2235 |
|
}, |
|
{ |
|
"epoch": 2.39, |
|
"grad_norm": 6.28125, |
|
"learning_rate": 4.988385493408871e-06, |
|
"loss": 1.1308, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 2.39, |
|
"grad_norm": 5.0, |
|
"learning_rate": 4.904368944137783e-06, |
|
"loss": 1.1502, |
|
"step": 2245 |
|
}, |
|
{ |
|
"epoch": 2.4, |
|
"grad_norm": 5.84375, |
|
"learning_rate": 4.82098892736779e-06, |
|
"loss": 1.137, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 2.41, |
|
"grad_norm": 4.96875, |
|
"learning_rate": 4.738248084174959e-06, |
|
"loss": 1.1274, |
|
"step": 2255 |
|
}, |
|
{ |
|
"epoch": 2.41, |
|
"grad_norm": 6.125, |
|
"learning_rate": 4.656149035389457e-06, |
|
"loss": 1.0886, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 2.42, |
|
"grad_norm": 4.90625, |
|
"learning_rate": 4.574694381512498e-06, |
|
"loss": 1.2045, |
|
"step": 2265 |
|
}, |
|
{ |
|
"epoch": 2.42, |
|
"grad_norm": 5.875, |
|
"learning_rate": 4.493886702633962e-06, |
|
"loss": 1.1118, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 2.43, |
|
"grad_norm": 5.21875, |
|
"learning_rate": 4.41372855835071e-06, |
|
"loss": 1.1354, |
|
"step": 2275 |
|
}, |
|
{ |
|
"epoch": 2.43, |
|
"grad_norm": 5.5, |
|
"learning_rate": 4.3342224876854604e-06, |
|
"loss": 1.1385, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 2.44, |
|
"grad_norm": 5.3125, |
|
"learning_rate": 4.255371009006415e-06, |
|
"loss": 1.2304, |
|
"step": 2285 |
|
}, |
|
{ |
|
"epoch": 2.44, |
|
"grad_norm": 5.8125, |
|
"learning_rate": 4.177176619947451e-06, |
|
"loss": 1.1728, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 2.45, |
|
"grad_norm": 5.625, |
|
"learning_rate": 4.099641797329015e-06, |
|
"loss": 1.14, |
|
"step": 2295 |
|
}, |
|
{ |
|
"epoch": 2.45, |
|
"grad_norm": 6.15625, |
|
"learning_rate": 4.0227689970796956e-06, |
|
"loss": 1.1576, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 2.46, |
|
"grad_norm": 5.90625, |
|
"learning_rate": 3.946560654158385e-06, |
|
"loss": 1.1663, |
|
"step": 2305 |
|
}, |
|
{ |
|
"epoch": 2.46, |
|
"grad_norm": 7.5, |
|
"learning_rate": 3.871019182477203e-06, |
|
"loss": 1.2139, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 2.47, |
|
"grad_norm": 5.09375, |
|
"learning_rate": 3.796146974824988e-06, |
|
"loss": 1.142, |
|
"step": 2315 |
|
}, |
|
{ |
|
"epoch": 2.47, |
|
"grad_norm": 4.8125, |
|
"learning_rate": 3.7219464027915403e-06, |
|
"loss": 1.1319, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 2.48, |
|
"grad_norm": 5.03125, |
|
"learning_rate": 3.6484198166924973e-06, |
|
"loss": 1.1338, |
|
"step": 2325 |
|
}, |
|
{ |
|
"epoch": 2.49, |
|
"grad_norm": 5.375, |
|
"learning_rate": 3.5755695454948558e-06, |
|
"loss": 1.1259, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 2.49, |
|
"grad_norm": 6.40625, |
|
"learning_rate": 3.5033978967432437e-06, |
|
"loss": 1.226, |
|
"step": 2335 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"grad_norm": 4.96875, |
|
"learning_rate": 3.431907156486791e-06, |
|
"loss": 1.1347, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"grad_norm": 5.25, |
|
"learning_rate": 3.361099589206751e-06, |
|
"loss": 1.133, |
|
"step": 2345 |
|
}, |
|
{ |
|
"epoch": 2.51, |
|
"grad_norm": 4.5, |
|
"learning_rate": 3.2909774377447606e-06, |
|
"loss": 1.0546, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 2.51, |
|
"grad_norm": 6.125, |
|
"learning_rate": 3.221542923231774e-06, |
|
"loss": 1.1597, |
|
"step": 2355 |
|
}, |
|
{ |
|
"epoch": 2.52, |
|
"grad_norm": 5.75, |
|
"learning_rate": 3.1527982450177547e-06, |
|
"loss": 1.1044, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 2.52, |
|
"grad_norm": 6.1875, |
|
"learning_rate": 3.084745580601964e-06, |
|
"loss": 1.1435, |
|
"step": 2365 |
|
}, |
|
{ |
|
"epoch": 2.53, |
|
"grad_norm": 5.5, |
|
"learning_rate": 3.0173870855640246e-06, |
|
"loss": 1.2131, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 2.53, |
|
"grad_norm": 6.25, |
|
"learning_rate": 2.9507248934956155e-06, |
|
"loss": 1.1718, |
|
"step": 2375 |
|
}, |
|
{ |
|
"epoch": 2.54, |
|
"grad_norm": 4.40625, |
|
"learning_rate": 2.8847611159329072e-06, |
|
"loss": 1.1415, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 2.54, |
|
"grad_norm": 5.5625, |
|
"learning_rate": 2.819497842289684e-06, |
|
"loss": 1.1762, |
|
"step": 2385 |
|
}, |
|
{ |
|
"epoch": 2.55, |
|
"grad_norm": 5.375, |
|
"learning_rate": 2.754937139791125e-06, |
|
"loss": 1.1139, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 2.55, |
|
"grad_norm": 5.78125, |
|
"learning_rate": 2.691081053408376e-06, |
|
"loss": 1.1348, |
|
"step": 2395 |
|
}, |
|
{ |
|
"epoch": 2.56, |
|
"grad_norm": 5.59375, |
|
"learning_rate": 2.6279316057937263e-06, |
|
"loss": 1.2093, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 2.57, |
|
"grad_norm": 6.9375, |
|
"learning_rate": 2.5654907972165853e-06, |
|
"loss": 1.1454, |
|
"step": 2405 |
|
}, |
|
{ |
|
"epoch": 2.57, |
|
"grad_norm": 5.46875, |
|
"learning_rate": 2.50376060550008e-06, |
|
"loss": 1.1167, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 2.58, |
|
"grad_norm": 5.59375, |
|
"learning_rate": 2.442742985958446e-06, |
|
"loss": 1.1659, |
|
"step": 2415 |
|
}, |
|
{ |
|
"epoch": 2.58, |
|
"grad_norm": 5.125, |
|
"learning_rate": 2.3824398713350597e-06, |
|
"loss": 1.1642, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 2.59, |
|
"grad_norm": 6.0625, |
|
"learning_rate": 2.3228531717412515e-06, |
|
"loss": 1.1461, |
|
"step": 2425 |
|
}, |
|
{ |
|
"epoch": 2.59, |
|
"grad_norm": 5.15625, |
|
"learning_rate": 2.2639847745957647e-06, |
|
"loss": 1.1391, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 2.6, |
|
"grad_norm": 5.6875, |
|
"learning_rate": 2.2058365445650126e-06, |
|
"loss": 1.1331, |
|
"step": 2435 |
|
}, |
|
{ |
|
"epoch": 2.6, |
|
"grad_norm": 5.8125, |
|
"learning_rate": 2.1484103235039714e-06, |
|
"loss": 1.108, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 2.61, |
|
"grad_norm": 5.65625, |
|
"learning_rate": 2.091707930397885e-06, |
|
"loss": 1.142, |
|
"step": 2445 |
|
}, |
|
{ |
|
"epoch": 2.61, |
|
"grad_norm": 6.1875, |
|
"learning_rate": 2.035731161304602e-06, |
|
"loss": 1.0998, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 2.62, |
|
"grad_norm": 4.40625, |
|
"learning_rate": 1.9804817892977277e-06, |
|
"loss": 1.1734, |
|
"step": 2455 |
|
}, |
|
{ |
|
"epoch": 2.62, |
|
"grad_norm": 6.90625, |
|
"learning_rate": 1.925961564410428e-06, |
|
"loss": 1.2022, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 2.63, |
|
"grad_norm": 5.46875, |
|
"learning_rate": 1.8721722135800174e-06, |
|
"loss": 1.1813, |
|
"step": 2465 |
|
}, |
|
{ |
|
"epoch": 2.63, |
|
"grad_norm": 5.46875, |
|
"learning_rate": 1.819115440593258e-06, |
|
"loss": 1.1642, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 2.64, |
|
"grad_norm": 5.71875, |
|
"learning_rate": 1.766792926032365e-06, |
|
"loss": 1.1271, |
|
"step": 2475 |
|
}, |
|
{ |
|
"epoch": 2.65, |
|
"grad_norm": 7.1875, |
|
"learning_rate": 1.715206327221816e-06, |
|
"loss": 1.1829, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 2.65, |
|
"grad_norm": 5.53125, |
|
"learning_rate": 1.6643572781758181e-06, |
|
"loss": 1.1764, |
|
"step": 2485 |
|
}, |
|
{ |
|
"epoch": 2.66, |
|
"grad_norm": 7.03125, |
|
"learning_rate": 1.614247389546575e-06, |
|
"loss": 1.0975, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 2.66, |
|
"grad_norm": 5.09375, |
|
"learning_rate": 1.5648782485732566e-06, |
|
"loss": 1.1742, |
|
"step": 2495 |
|
}, |
|
{ |
|
"epoch": 2.67, |
|
"grad_norm": 6.46875, |
|
"learning_rate": 1.5162514190317195e-06, |
|
"loss": 1.1686, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 2.67, |
|
"grad_norm": 6.0, |
|
"learning_rate": 1.4683684411849947e-06, |
|
"loss": 1.1135, |
|
"step": 2505 |
|
}, |
|
{ |
|
"epoch": 2.68, |
|
"grad_norm": 4.96875, |
|
"learning_rate": 1.4212308317344686e-06, |
|
"loss": 1.1648, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 2.68, |
|
"grad_norm": 5.96875, |
|
"learning_rate": 1.3748400837718696e-06, |
|
"loss": 1.1331, |
|
"step": 2515 |
|
}, |
|
{ |
|
"epoch": 2.69, |
|
"grad_norm": 5.0, |
|
"learning_rate": 1.32919766673196e-06, |
|
"loss": 1.1018, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 2.69, |
|
"grad_norm": 6.125, |
|
"learning_rate": 1.2843050263459888e-06, |
|
"loss": 1.1353, |
|
"step": 2525 |
|
}, |
|
{ |
|
"epoch": 2.7, |
|
"grad_norm": 5.8125, |
|
"learning_rate": 1.2401635845959159e-06, |
|
"loss": 1.1662, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 2.7, |
|
"grad_norm": 6.4375, |
|
"learning_rate": 1.1967747396693401e-06, |
|
"loss": 1.2042, |
|
"step": 2535 |
|
}, |
|
{ |
|
"epoch": 2.71, |
|
"grad_norm": 5.59375, |
|
"learning_rate": 1.1541398659152464e-06, |
|
"loss": 1.132, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 2.71, |
|
"grad_norm": 6.5625, |
|
"learning_rate": 1.1122603138004422e-06, |
|
"loss": 1.1555, |
|
"step": 2545 |
|
}, |
|
{ |
|
"epoch": 2.72, |
|
"grad_norm": 5.8125, |
|
"learning_rate": 1.0711374098667987e-06, |
|
"loss": 1.1753, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 2.73, |
|
"grad_norm": 5.46875, |
|
"learning_rate": 1.0307724566892385e-06, |
|
"loss": 1.1657, |
|
"step": 2555 |
|
}, |
|
{ |
|
"epoch": 2.73, |
|
"grad_norm": 5.0, |
|
"learning_rate": 9.911667328344477e-07, |
|
"loss": 1.1786, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 2.74, |
|
"grad_norm": 5.03125, |
|
"learning_rate": 9.523214928204127e-07, |
|
"loss": 1.1759, |
|
"step": 2565 |
|
}, |
|
{ |
|
"epoch": 2.74, |
|
"grad_norm": 6.25, |
|
"learning_rate": 9.142379670766532e-07, |
|
"loss": 1.19, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 2.75, |
|
"grad_norm": 6.0, |
|
"learning_rate": 8.76917361905269e-07, |
|
"loss": 1.1569, |
|
"step": 2575 |
|
}, |
|
{ |
|
"epoch": 2.75, |
|
"grad_norm": 7.28125, |
|
"learning_rate": 8.403608594427237e-07, |
|
"loss": 1.1427, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 2.76, |
|
"grad_norm": 5.1875, |
|
"learning_rate": 8.045696176223888e-07, |
|
"loss": 1.1103, |
|
"step": 2585 |
|
}, |
|
{ |
|
"epoch": 2.76, |
|
"grad_norm": 5.75, |
|
"learning_rate": 7.695447701378866e-07, |
|
"loss": 1.1479, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 2.77, |
|
"grad_norm": 5.0, |
|
"learning_rate": 7.352874264071579e-07, |
|
"loss": 1.1034, |
|
"step": 2595 |
|
}, |
|
{ |
|
"epoch": 2.77, |
|
"grad_norm": 5.84375, |
|
"learning_rate": 7.017986715373515e-07, |
|
"loss": 1.0887, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 2.78, |
|
"grad_norm": 6.21875, |
|
"learning_rate": 6.69079566290412e-07, |
|
"loss": 1.2236, |
|
"step": 2605 |
|
}, |
|
{ |
|
"epoch": 2.78, |
|
"grad_norm": 6.46875, |
|
"learning_rate": 6.371311470495162e-07, |
|
"loss": 1.1793, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 2.79, |
|
"grad_norm": 5.15625, |
|
"learning_rate": 6.059544257862343e-07, |
|
"loss": 1.0842, |
|
"step": 2615 |
|
}, |
|
{ |
|
"epoch": 2.79, |
|
"grad_norm": 4.71875, |
|
"learning_rate": 5.75550390028462e-07, |
|
"loss": 1.077, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 2.8, |
|
"grad_norm": 6.03125, |
|
"learning_rate": 5.459200028291617e-07, |
|
"loss": 1.1295, |
|
"step": 2625 |
|
}, |
|
{ |
|
"epoch": 2.81, |
|
"grad_norm": 5.90625, |
|
"learning_rate": 5.170642027358341e-07, |
|
"loss": 1.1019, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 2.81, |
|
"grad_norm": 5.96875, |
|
"learning_rate": 4.889839037608202e-07, |
|
"loss": 1.1931, |
|
"step": 2635 |
|
}, |
|
{ |
|
"epoch": 2.82, |
|
"grad_norm": 5.75, |
|
"learning_rate": 4.616799953523182e-07, |
|
"loss": 1.1147, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 2.82, |
|
"grad_norm": 5.8125, |
|
"learning_rate": 4.351533423662318e-07, |
|
"loss": 1.1505, |
|
"step": 2645 |
|
}, |
|
{ |
|
"epoch": 2.83, |
|
"grad_norm": 5.375, |
|
"learning_rate": 4.094047850387667e-07, |
|
"loss": 1.1301, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 2.83, |
|
"grad_norm": 5.6875, |
|
"learning_rate": 3.8443513895981e-07, |
|
"loss": 1.1204, |
|
"step": 2655 |
|
}, |
|
{ |
|
"epoch": 2.84, |
|
"grad_norm": 5.6875, |
|
"learning_rate": 3.6024519504711806e-07, |
|
"loss": 1.0925, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 2.84, |
|
"grad_norm": 5.84375, |
|
"learning_rate": 3.3683571952123906e-07, |
|
"loss": 1.1177, |
|
"step": 2665 |
|
}, |
|
{ |
|
"epoch": 2.85, |
|
"grad_norm": 6.09375, |
|
"learning_rate": 3.142074538812545e-07, |
|
"loss": 1.1465, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 2.85, |
|
"grad_norm": 5.25, |
|
"learning_rate": 2.9236111488129545e-07, |
|
"loss": 1.1202, |
|
"step": 2675 |
|
}, |
|
{ |
|
"epoch": 2.86, |
|
"grad_norm": 6.84375, |
|
"learning_rate": 2.712973945078301e-07, |
|
"loss": 1.1668, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 2.86, |
|
"grad_norm": 5.09375, |
|
"learning_rate": 2.510169599577533e-07, |
|
"loss": 1.1126, |
|
"step": 2685 |
|
}, |
|
{ |
|
"epoch": 2.87, |
|
"grad_norm": 6.28125, |
|
"learning_rate": 2.315204536172455e-07, |
|
"loss": 1.1607, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 2.87, |
|
"grad_norm": 5.84375, |
|
"learning_rate": 2.128084930414359e-07, |
|
"loss": 1.1588, |
|
"step": 2695 |
|
}, |
|
{ |
|
"epoch": 2.88, |
|
"grad_norm": 6.90625, |
|
"learning_rate": 1.9488167093482667e-07, |
|
"loss": 1.1142, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 2.89, |
|
"grad_norm": 5.90625, |
|
"learning_rate": 1.7774055513253006e-07, |
|
"loss": 1.1313, |
|
"step": 2705 |
|
}, |
|
{ |
|
"epoch": 2.89, |
|
"grad_norm": 5.46875, |
|
"learning_rate": 1.613856885822801e-07, |
|
"loss": 1.0984, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 2.9, |
|
"grad_norm": 5.34375, |
|
"learning_rate": 1.4581758932722956e-07, |
|
"loss": 1.0768, |
|
"step": 2715 |
|
}, |
|
{ |
|
"epoch": 2.9, |
|
"grad_norm": 5.375, |
|
"learning_rate": 1.3103675048954934e-07, |
|
"loss": 1.1069, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 2.91, |
|
"grad_norm": 5.25, |
|
"learning_rate": 1.1704364025479642e-07, |
|
"loss": 1.1131, |
|
"step": 2725 |
|
}, |
|
{ |
|
"epoch": 2.91, |
|
"grad_norm": 6.46875, |
|
"learning_rate": 1.0383870185710076e-07, |
|
"loss": 1.1476, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 2.92, |
|
"grad_norm": 6.71875, |
|
"learning_rate": 9.142235356510987e-08, |
|
"loss": 1.1705, |
|
"step": 2735 |
|
}, |
|
{ |
|
"epoch": 2.92, |
|
"grad_norm": 5.0, |
|
"learning_rate": 7.979498866874935e-08, |
|
"loss": 1.069, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 2.93, |
|
"grad_norm": 5.4375, |
|
"learning_rate": 6.895697546676349e-08, |
|
"loss": 1.1401, |
|
"step": 2745 |
|
}, |
|
{ |
|
"epoch": 2.93, |
|
"grad_norm": 6.65625, |
|
"learning_rate": 5.890865725504402e-08, |
|
"loss": 1.119, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 2.94, |
|
"grad_norm": 6.09375, |
|
"learning_rate": 4.9650352315766535e-08, |
|
"loss": 1.1177, |
|
"step": 2755 |
|
}, |
|
{ |
|
"epoch": 2.94, |
|
"grad_norm": 5.6875, |
|
"learning_rate": 4.118235390730141e-08, |
|
"loss": 1.0892, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 2.95, |
|
"grad_norm": 8.9375, |
|
"learning_rate": 3.350493025492396e-08, |
|
"loss": 1.1853, |
|
"step": 2765 |
|
}, |
|
{ |
|
"epoch": 2.95, |
|
"grad_norm": 5.21875, |
|
"learning_rate": 2.661832454232127e-08, |
|
"loss": 1.1004, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 2.96, |
|
"grad_norm": 5.71875, |
|
"learning_rate": 2.0522754903892772e-08, |
|
"loss": 1.113, |
|
"step": 2775 |
|
}, |
|
{ |
|
"epoch": 2.97, |
|
"grad_norm": 5.5, |
|
"learning_rate": 1.5218414417833558e-08, |
|
"loss": 1.1525, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 2.97, |
|
"grad_norm": 5.34375, |
|
"learning_rate": 1.0705471100022624e-08, |
|
"loss": 1.1145, |
|
"step": 2785 |
|
}, |
|
{ |
|
"epoch": 2.98, |
|
"grad_norm": 4.46875, |
|
"learning_rate": 6.9840678986993205e-09, |
|
"loss": 1.0893, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 2.98, |
|
"grad_norm": 6.3125, |
|
"learning_rate": 4.054322689936441e-09, |
|
"loss": 1.1528, |
|
"step": 2795 |
|
}, |
|
{ |
|
"epoch": 2.99, |
|
"grad_norm": 5.6875, |
|
"learning_rate": 1.9163282739126376e-09, |
|
"loss": 1.1335, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 2.99, |
|
"grad_norm": 7.53125, |
|
"learning_rate": 5.701523719536805e-10, |
|
"loss": 1.1066, |
|
"step": 2805 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 5.875, |
|
"learning_rate": 1.5837624414705154e-11, |
|
"loss": 1.2276, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"step": 2811, |
|
"total_flos": 3.2918248209933926e+17, |
|
"train_loss": 1.199945468658317, |
|
"train_runtime": 5291.6066, |
|
"train_samples_per_second": 17.008, |
|
"train_steps_per_second": 0.531 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 2811, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 30000, |
|
"total_flos": 3.2918248209933926e+17, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |