{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.996101364522417,
  "eval_steps": 500,
  "global_step": 384,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {"epoch": 0.005198180636777128, "grad_norm": 46.50929641723633, "learning_rate": 8.333333333333333e-08, "loss": 1.4958, "step": 1},
    {"epoch": 0.010396361273554255, "grad_norm": 36.618709564208984, "learning_rate": 1.6666666666666665e-07, "loss": 1.2222, "step": 2},
    {"epoch": 0.015594541910331383, "grad_norm": 42.09670639038086, "learning_rate": 2.5e-07, "loss": 1.2627, "step": 3},
    {"epoch": 0.02079272254710851, "grad_norm": 38.13116455078125, "learning_rate": 3.333333333333333e-07, "loss": 1.2579, "step": 4},
    {"epoch": 0.02599090318388564, "grad_norm": 40.15380859375, "learning_rate": 4.1666666666666667e-07, "loss": 1.2421, "step": 5},
    {"epoch": 0.031189083820662766, "grad_norm": 43.696563720703125, "learning_rate": 5e-07, "loss": 1.264, "step": 6},
    {"epoch": 0.036387264457439894, "grad_norm": 44.71561813354492, "learning_rate": 5.833333333333334e-07, "loss": 1.3881, "step": 7},
    {"epoch": 0.04158544509421702, "grad_norm": 41.07535171508789, "learning_rate": 6.666666666666666e-07, "loss": 1.1705, "step": 8},
    {"epoch": 0.04678362573099415, "grad_norm": 37.13037109375, "learning_rate": 7.5e-07, "loss": 1.1108, "step": 9},
    {"epoch": 0.05198180636777128, "grad_norm": 39.47488021850586, "learning_rate": 8.333333333333333e-07, "loss": 1.1024, "step": 10},
    {"epoch": 0.057179987004548405, "grad_norm": 35.2398681640625, "learning_rate": 9.166666666666665e-07, "loss": 1.0848, "step": 11},
    {"epoch": 0.06237816764132553, "grad_norm": 26.39617347717285, "learning_rate": 1e-06, "loss": 0.7678, "step": 12},
    {"epoch": 0.06757634827810266, "grad_norm": 23.569713592529297, "learning_rate": 9.999821700020548e-07, "loss": 0.7753, "step": 13},
    {"epoch": 0.07277452891487979, "grad_norm": 20.85965919494629, "learning_rate": 9.99928681279855e-07, "loss": 0.6663, "step": 14},
    {"epoch": 0.07797270955165692, "grad_norm": 19.964326858520508, "learning_rate": 9.998395376482152e-07, "loss": 0.5468, "step": 15},
    {"epoch": 0.08317089018843404, "grad_norm": 10.887548446655273, "learning_rate": 9.997147454648588e-07, "loss": 0.4754, "step": 16},
    {"epoch": 0.08836907082521117, "grad_norm": 10.726633071899414, "learning_rate": 9.995543136299635e-07, "loss": 0.4547, "step": 17},
    {"epoch": 0.0935672514619883, "grad_norm": 9.673012733459473, "learning_rate": 9.993582535855263e-07, "loss": 0.4634, "step": 18},
    {"epoch": 0.09876543209876543, "grad_norm": 7.258286476135254, "learning_rate": 9.991265793145479e-07, "loss": 0.3635, "step": 19},
    {"epoch": 0.10396361273554255, "grad_norm": 5.402667999267578, "learning_rate": 9.988593073400354e-07, "loss": 0.356, "step": 20},
    {"epoch": 0.10916179337231968, "grad_norm": 4.859364032745361, "learning_rate": 9.985564567238236e-07, "loss": 0.3692, "step": 21},
    {"epoch": 0.11435997400909681, "grad_norm": 3.8276686668395996, "learning_rate": 9.982180490652164e-07, "loss": 0.2976, "step": 22},
    {"epoch": 0.11955815464587394, "grad_norm": 3.0185964107513428, "learning_rate": 9.97844108499445e-07, "loss": 0.2635, "step": 23},
    {"epoch": 0.12475633528265107, "grad_norm": 2.6632726192474365, "learning_rate": 9.974346616959475e-07, "loss": 0.3086, "step": 24},
    {"epoch": 0.1299545159194282, "grad_norm": 2.5440852642059326, "learning_rate": 9.969897378564667e-07, "loss": 0.2746, "step": 25},
    {"epoch": 0.13515269655620532, "grad_norm": 2.642413377761841, "learning_rate": 9.965093687129667e-07, "loss": 0.2889, "step": 26},
    {"epoch": 0.14035087719298245, "grad_norm": 2.513338565826416, "learning_rate": 9.959935885253715e-07, "loss": 0.2778, "step": 27},
    {"epoch": 0.14554905782975958, "grad_norm": 2.2585127353668213, "learning_rate": 9.954424340791195e-07, "loss": 0.2311, "step": 28},
    {"epoch": 0.1507472384665367, "grad_norm": 2.021958351135254, "learning_rate": 9.948559446825411e-07, "loss": 0.2403, "step": 29},
    {"epoch": 0.15594541910331383, "grad_norm": 2.938659429550171, "learning_rate": 9.942341621640557e-07, "loss": 0.2984, "step": 30},
    {"epoch": 0.16114359974009096, "grad_norm": 1.9811211824417114, "learning_rate": 9.93577130869187e-07, "loss": 0.2607, "step": 31},
    {"epoch": 0.1663417803768681, "grad_norm": 1.8804433345794678, "learning_rate": 9.928848976574018e-07, "loss": 0.2236, "step": 32},
    {"epoch": 0.17153996101364521, "grad_norm": 2.2095425128936768, "learning_rate": 9.921575118987671e-07, "loss": 0.247, "step": 33},
    {"epoch": 0.17673814165042234, "grad_norm": 2.0361135005950928, "learning_rate": 9.91395025470429e-07, "loss": 0.2519, "step": 34},
    {"epoch": 0.18193632228719947, "grad_norm": 1.9882704019546509, "learning_rate": 9.905974927529133e-07, "loss": 0.2387, "step": 35},
    {"epoch": 0.1871345029239766, "grad_norm": 2.1970348358154297, "learning_rate": 9.897649706262473e-07, "loss": 0.2506, "step": 36},
    {"epoch": 0.19233268356075373, "grad_norm": 1.9535129070281982, "learning_rate": 9.888975184659016e-07, "loss": 0.2491, "step": 37},
    {"epoch": 0.19753086419753085, "grad_norm": 1.7368297576904297, "learning_rate": 9.879951981385577e-07, "loss": 0.2002, "step": 38},
    {"epoch": 0.20272904483430798, "grad_norm": 1.8394443988800049, "learning_rate": 9.870580739976935e-07, "loss": 0.2107, "step": 39},
    {"epoch": 0.2079272254710851, "grad_norm": 2.0104002952575684, "learning_rate": 9.860862128789952e-07, "loss": 0.2373, "step": 40},
    {"epoch": 0.21312540610786224, "grad_norm": 1.7731590270996094, "learning_rate": 9.850796840955899e-07, "loss": 0.1881, "step": 41},
    {"epoch": 0.21832358674463936, "grad_norm": 1.937873363494873, "learning_rate": 9.840385594331022e-07, "loss": 0.2238, "step": 42},
    {"epoch": 0.2235217673814165, "grad_norm": 1.9435638189315796, "learning_rate": 9.82962913144534e-07, "loss": 0.2237, "step": 43},
    {"epoch": 0.22871994801819362, "grad_norm": 1.786537766456604, "learning_rate": 9.818528219449704e-07, "loss": 0.1951, "step": 44},
    {"epoch": 0.23391812865497075, "grad_norm": 1.7915631532669067, "learning_rate": 9.807083650061062e-07, "loss": 0.2257, "step": 45},
    {"epoch": 0.23911630929174787, "grad_norm": 1.798353910446167, "learning_rate": 9.79529623950601e-07, "loss": 0.236, "step": 46},
    {"epoch": 0.244314489928525, "grad_norm": 1.872049331665039, "learning_rate": 9.783166828462572e-07, "loss": 0.2354, "step": 47},
    {"epoch": 0.24951267056530213, "grad_norm": 1.879210114479065, "learning_rate": 9.770696282000244e-07, "loss": 0.2229, "step": 48},
    {"epoch": 0.25471085120207926, "grad_norm": 1.9663130044937134, "learning_rate": 9.757885489518296e-07, "loss": 0.2461, "step": 49},
    {"epoch": 0.2599090318388564, "grad_norm": 1.6957286596298218, "learning_rate": 9.744735364682344e-07, "loss": 0.2065, "step": 50},
    {"epoch": 0.2651072124756335, "grad_norm": 1.7848544120788574, "learning_rate": 9.731246845359184e-07, "loss": 0.1949, "step": 51},
    {"epoch": 0.27030539311241064, "grad_norm": 1.8262349367141724, "learning_rate": 9.7174208935499e-07, "loss": 0.2135, "step": 52},
    {"epoch": 0.27550357374918777, "grad_norm": 1.6100451946258545, "learning_rate": 9.703258495321265e-07, "loss": 0.1643, "step": 53},
    {"epoch": 0.2807017543859649, "grad_norm": 1.6476277112960815, "learning_rate": 9.688760660735402e-07, "loss": 0.1796, "step": 54},
    {"epoch": 0.285899935022742, "grad_norm": 1.6926974058151245, "learning_rate": 9.673928423777756e-07, "loss": 0.2048, "step": 55},
    {"epoch": 0.29109811565951915, "grad_norm": 1.7797563076019287, "learning_rate": 9.658762842283341e-07, "loss": 0.1953, "step": 56},
    {"epoch": 0.2962962962962963, "grad_norm": 1.7844983339309692, "learning_rate": 9.643264997861312e-07, "loss": 0.2103, "step": 57},
    {"epoch": 0.3014944769330734, "grad_norm": 1.8552502393722534, "learning_rate": 9.627435995817797e-07, "loss": 0.1854, "step": 58},
    {"epoch": 0.30669265756985054, "grad_norm": 1.8308689594268799, "learning_rate": 9.611276965077097e-07, "loss": 0.1892, "step": 59},
    {"epoch": 0.31189083820662766, "grad_norm": 1.8636093139648438, "learning_rate": 9.594789058101153e-07, "loss": 0.216, "step": 60},
    {"epoch": 0.3170890188434048, "grad_norm": 1.5857099294662476, "learning_rate": 9.577973450807351e-07, "loss": 0.1924, "step": 61},
    {"epoch": 0.3222871994801819, "grad_norm": 1.670000433921814, "learning_rate": 9.560831342484666e-07, "loss": 0.2088, "step": 62},
    {"epoch": 0.32748538011695905, "grad_norm": 1.8758388757705688, "learning_rate": 9.543363955708124e-07, "loss": 0.1697, "step": 63},
    {"epoch": 0.3326835607537362, "grad_norm": 2.020310401916504, "learning_rate": 9.525572536251605e-07, "loss": 0.2249, "step": 64},
    {"epoch": 0.3378817413905133, "grad_norm": 1.8294882774353027, "learning_rate": 9.507458352999001e-07, "loss": 0.1884, "step": 65},
    {"epoch": 0.34307992202729043, "grad_norm": 1.606002926826477, "learning_rate": 9.489022697853708e-07, "loss": 0.1761, "step": 66},
    {"epoch": 0.34827810266406756, "grad_norm": 1.6073530912399292, "learning_rate": 9.470266885646503e-07, "loss": 0.1871, "step": 67},
    {"epoch": 0.3534762833008447, "grad_norm": 1.7087726593017578, "learning_rate": 9.451192254041758e-07, "loss": 0.173, "step": 68},
    {"epoch": 0.3586744639376218, "grad_norm": 1.7764538526535034, "learning_rate": 9.431800163442041e-07, "loss": 0.1957, "step": 69},
    {"epoch": 0.36387264457439894, "grad_norm": 1.8759775161743164, "learning_rate": 9.412091996891095e-07, "loss": 0.2154, "step": 70},
    {"epoch": 0.36907082521117607, "grad_norm": 1.8281443119049072, "learning_rate": 9.392069159975198e-07, "loss": 0.1679, "step": 71},
    {"epoch": 0.3742690058479532, "grad_norm": 1.7894129753112793, "learning_rate": 9.37173308072291e-07, "loss": 0.1679, "step": 72},
    {"epoch": 0.3794671864847303, "grad_norm": 1.6492183208465576, "learning_rate": 9.35108520950324e-07, "loss": 0.1833, "step": 73},
    {"epoch": 0.38466536712150745, "grad_norm": 1.6076239347457886, "learning_rate": 9.330127018922193e-07, "loss": 0.1507, "step": 74},
    {"epoch": 0.3898635477582846, "grad_norm": 1.8182544708251953, "learning_rate": 9.308860003717748e-07, "loss": 0.1759, "step": 75},
    {"epoch": 0.3950617283950617, "grad_norm": 2.183497667312622, "learning_rate": 9.287285680653254e-07, "loss": 0.2069, "step": 76},
    {"epoch": 0.40025990903183883, "grad_norm": 1.9281930923461914, "learning_rate": 9.265405588409256e-07, "loss": 0.1813, "step": 77},
    {"epoch": 0.40545808966861596, "grad_norm": 1.7534650564193726, "learning_rate": 9.243221287473755e-07, "loss": 0.1764, "step": 78},
    {"epoch": 0.4106562703053931, "grad_norm": 1.7174078226089478, "learning_rate": 9.220734360030906e-07, "loss": 0.1863, "step": 79},
    {"epoch": 0.4158544509421702, "grad_norm": 1.7550305128097534, "learning_rate": 9.197946409848194e-07, "loss": 0.1718, "step": 80},
    {"epoch": 0.42105263157894735, "grad_norm": 1.4776816368103027, "learning_rate": 9.174859062162037e-07, "loss": 0.156, "step": 81},
    {"epoch": 0.4262508122157245, "grad_norm": 1.7932229042053223, "learning_rate": 9.151473963561882e-07, "loss": 0.1821, "step": 82},
    {"epoch": 0.4314489928525016, "grad_norm": 1.6103583574295044, "learning_rate": 9.127792781872768e-07, "loss": 0.1749, "step": 83},
    {"epoch": 0.43664717348927873, "grad_norm": 1.8216729164123535, "learning_rate": 9.103817206036382e-07, "loss": 0.177, "step": 84},
    {"epoch": 0.44184535412605586, "grad_norm": 1.7169886827468872, "learning_rate": 9.079548945990592e-07, "loss": 0.1845, "step": 85},
    {"epoch": 0.447043534762833, "grad_norm": 1.4935150146484375, "learning_rate": 9.054989732547506e-07, "loss": 0.1518, "step": 86},
    {"epoch": 0.4522417153996101, "grad_norm": 1.7215607166290283, "learning_rate": 9.030141317270025e-07, "loss": 0.1651, "step": 87},
    {"epoch": 0.45743989603638724, "grad_norm": 1.885299801826477, "learning_rate": 9.005005472346923e-07, "loss": 0.1862, "step": 88},
    {"epoch": 0.46263807667316437, "grad_norm": 1.6924781799316406, "learning_rate": 8.979583990466452e-07, "loss": 0.1834, "step": 89},
    {"epoch": 0.4678362573099415, "grad_norm": 1.6620601415634155, "learning_rate": 8.953878684688492e-07, "loss": 0.1736, "step": 90},
    {"epoch": 0.4730344379467186, "grad_norm": 1.7256325483322144, "learning_rate": 8.92789138831524e-07, "loss": 0.1792, "step": 91},
    {"epoch": 0.47823261858349575, "grad_norm": 1.6039340496063232, "learning_rate": 8.901623954760459e-07, "loss": 0.1704, "step": 92},
    {"epoch": 0.4834307992202729, "grad_norm": 1.6422524452209473, "learning_rate": 8.875078257417294e-07, "loss": 0.1621, "step": 93},
    {"epoch": 0.48862897985705, "grad_norm": 1.6837060451507568, "learning_rate": 8.84825618952466e-07, "loss": 0.183, "step": 94},
    {"epoch": 0.49382716049382713, "grad_norm": 1.750653862953186, "learning_rate": 8.821159664032223e-07, "loss": 0.1689, "step": 95},
    {"epoch": 0.49902534113060426, "grad_norm": 1.6462229490280151, "learning_rate": 8.793790613463954e-07, "loss": 0.1394, "step": 96},
    {"epoch": 0.5042235217673814, "grad_norm": 1.7336857318878174, "learning_rate": 8.766150989780317e-07, "loss": 0.1581, "step": 97},
    {"epoch": 0.5094217024041585, "grad_norm": 1.8384933471679688, "learning_rate": 8.738242764239046e-07, "loss": 0.1918, "step": 98},
    {"epoch": 0.5146198830409356, "grad_norm": 1.723486065864563, "learning_rate": 8.710067927254554e-07, "loss": 0.1737, "step": 99},
    {"epoch": 0.5198180636777128, "grad_norm": 1.9092669486999512, "learning_rate": 8.681628488255986e-07, "loss": 0.1728, "step": 100},
    {"epoch": 0.5250162443144899, "grad_norm": 1.729762315750122, "learning_rate": 8.652926475543898e-07, "loss": 0.162, "step": 101},
    {"epoch": 0.530214424951267, "grad_norm": 1.7867392301559448, "learning_rate": 8.623963936145599e-07, "loss": 0.1658, "step": 102},
    {"epoch": 0.5354126055880442, "grad_norm": 2.0217678546905518, "learning_rate": 8.594742935669164e-07, "loss": 0.1865, "step": 103},
    {"epoch": 0.5406107862248213, "grad_norm": 1.7473349571228027, "learning_rate": 8.565265558156101e-07, "loss": 0.1535, "step": 104},
    {"epoch": 0.5458089668615984, "grad_norm": 1.5292036533355713, "learning_rate": 8.535533905932737e-07, "loss": 0.1559, "step": 105},
    {"epoch": 0.5510071474983755, "grad_norm": 1.5472049713134766, "learning_rate": 8.505550099460263e-07, "loss": 0.1423, "step": 106},
    {"epoch": 0.5562053281351527, "grad_norm": 1.636443853378296, "learning_rate": 8.475316277183508e-07, "loss": 0.1747, "step": 107},
    {"epoch": 0.5614035087719298, "grad_norm": 1.5992189645767212, "learning_rate": 8.444834595378433e-07, "loss": 0.1766, "step": 108},
    {"epoch": 0.5666016894087069, "grad_norm": 1.6766347885131836, "learning_rate": 8.414107227998328e-07, "loss": 0.1421, "step": 109},
    {"epoch": 0.571799870045484, "grad_norm": 1.7345399856567383, "learning_rate": 8.383136366518787e-07, "loss": 0.1752, "step": 110},
    {"epoch": 0.5769980506822612, "grad_norm": 1.669264793395996, "learning_rate": 8.351924219781392e-07, "loss": 0.1661, "step": 111},
    {"epoch": 0.5821962313190383, "grad_norm": 1.7636350393295288, "learning_rate": 8.320473013836195e-07, "loss": 0.1892, "step": 112},
    {"epoch": 0.5873944119558154, "grad_norm": 1.8429635763168335, "learning_rate": 8.288784991782945e-07, "loss": 0.1883, "step": 113},
    {"epoch": 0.5925925925925926, "grad_norm": 1.5329152345657349, "learning_rate": 8.256862413611112e-07, "loss": 0.1472, "step": 114},
    {"epoch": 0.5977907732293697, "grad_norm": 1.9208284616470337, "learning_rate": 8.22470755603871e-07, "loss": 0.1714, "step": 115},
    {"epoch": 0.6029889538661468, "grad_norm": 1.6381752490997314, "learning_rate": 8.192322712349917e-07, "loss": 0.1806, "step": 116},
    {"epoch": 0.6081871345029239, "grad_norm": 1.5502922534942627, "learning_rate": 8.159710192231519e-07, "loss": 0.1653, "step": 117},
    {"epoch": 0.6133853151397011, "grad_norm": 1.604650616645813, "learning_rate": 8.126872321608183e-07, "loss": 0.1478, "step": 118},
    {"epoch": 0.6185834957764782, "grad_norm": 1.6860443353652954, "learning_rate": 8.093811442476572e-07, "loss": 0.1639, "step": 119},
    {"epoch": 0.6237816764132553, "grad_norm": 1.5915076732635498, "learning_rate": 8.060529912738314e-07, "loss": 0.1511, "step": 120},
    {"epoch": 0.6289798570500325, "grad_norm": 1.7241225242614746, "learning_rate": 8.027030106031835e-07, "loss": 0.1848, "step": 121},
    {"epoch": 0.6341780376868096, "grad_norm": 1.7747095823287964, "learning_rate": 7.993314411563075e-07, "loss": 0.1816, "step": 122},
    {"epoch": 0.6393762183235867, "grad_norm": 1.6497771739959717, "learning_rate": 7.959385233935085e-07, "loss": 0.1696, "step": 123},
    {"epoch": 0.6445743989603638, "grad_norm": 1.4712307453155518, "learning_rate": 7.925244992976537e-07, "loss": 0.1297, "step": 124},
    {"epoch": 0.649772579597141, "grad_norm": 1.618713140487671, "learning_rate": 7.890896123569135e-07, "loss": 0.1708, "step": 125},
    {"epoch": 0.6549707602339181, "grad_norm": 1.8550593852996826, "learning_rate": 7.856341075473961e-07, "loss": 0.1646, "step": 126},
    {"epoch": 0.6601689408706952, "grad_norm": 1.7929205894470215, "learning_rate": 7.821582313156763e-07, "loss": 0.1555, "step": 127},
    {"epoch": 0.6653671215074723, "grad_norm": 1.8011633157730103, "learning_rate": 7.786622315612181e-07, "loss": 0.1882, "step": 128},
    {"epoch": 0.6705653021442495, "grad_norm": 1.642986536026001, "learning_rate": 7.751463576186957e-07, "loss": 0.1659, "step": 129},
    {"epoch": 0.6757634827810266, "grad_norm": 1.547602653503418, "learning_rate": 7.716108602402094e-07, "loss": 0.1479, "step": 130},
    {"epoch": 0.6809616634178037, "grad_norm": 1.6602659225463867, "learning_rate": 7.680559915774033e-07, "loss": 0.1627, "step": 131},
    {"epoch": 0.6861598440545809, "grad_norm": 1.8091386556625366, "learning_rate": 7.644820051634812e-07, "loss": 0.1637, "step": 132},
    {"epoch": 0.691358024691358, "grad_norm": 1.669487476348877, "learning_rate": 7.608891558951248e-07, "loss": 0.1599, "step": 133},
    {"epoch": 0.6965562053281351, "grad_norm": 1.9016000032424927, "learning_rate": 7.572777000143145e-07, "loss": 0.1654, "step": 134},
    {"epoch": 0.7017543859649122, "grad_norm": 1.4672502279281616, "learning_rate": 7.536478950900536e-07, "loss": 0.1482, "step": 135},
    {"epoch": 0.7069525666016894, "grad_norm": 1.4602234363555908, "learning_rate": 7.5e-07, "loss": 0.1214, "step": 136},
    {"epoch": 0.7121507472384665, "grad_norm": 1.725661277770996, "learning_rate": 7.463342749120013e-07, "loss": 0.1406, "step": 137},
    {"epoch": 0.7173489278752436, "grad_norm": 1.6164398193359375, "learning_rate": 7.426509812655405e-07, "loss": 0.1492, "step": 138},
    {"epoch": 0.7225471085120208, "grad_norm": 1.609312891960144, "learning_rate": 7.389503817530905e-07, "loss": 0.1669, "step": 139},
    {"epoch": 0.7277452891487979, "grad_norm": 1.512629508972168, "learning_rate": 7.352327403013779e-07, "loss": 0.1318, "step": 140},
    {"epoch": 0.732943469785575, "grad_norm": 1.7129087448120117, "learning_rate": 7.314983220525604e-07, "loss": 0.1762, "step": 141},
    {"epoch": 0.7381416504223521, "grad_norm": 1.6480506658554077, "learning_rate": 7.277473933453169e-07, "loss": 0.1738, "step": 142},
    {"epoch": 0.7433398310591293, "grad_norm": 1.5904552936553955, "learning_rate": 7.239802216958522e-07, "loss": 0.1558, "step": 143},
    {"epoch": 0.7485380116959064, "grad_norm": 1.6988767385482788, "learning_rate": 7.201970757788171e-07, "loss": 0.1661, "step": 144},
    {"epoch": 0.7537361923326835, "grad_norm": 1.5458639860153198, "learning_rate": 7.163982254081474e-07, "loss": 0.1338, "step": 145},
    {"epoch": 0.7589343729694606, "grad_norm": 1.5240118503570557, "learning_rate": 7.125839415178203e-07, "loss": 0.1405, "step": 146},
    {"epoch": 0.7641325536062378, "grad_norm": 1.7464412450790405, "learning_rate": 7.087544961425316e-07, "loss": 0.1682, "step": 147},
    {"epoch": 0.7693307342430149, "grad_norm": 1.7425211668014526, "learning_rate": 7.049101623982937e-07, "loss": 0.1839, "step": 148},
    {"epoch": 0.774528914879792, "grad_norm": 1.4918522834777832, "learning_rate": 7.010512144629579e-07, "loss": 0.1124, "step": 149},
    {"epoch": 0.7797270955165692, "grad_norm": 1.6756539344787598, "learning_rate": 6.971779275566593e-07, "loss": 0.1546, "step": 150},
    {"epoch": 0.7849252761533463, "grad_norm": 1.5222876071929932, "learning_rate": 6.93290577922188e-07, "loss": 0.1313, "step": 151},
    {"epoch": 0.7901234567901234, "grad_norm": 1.548453688621521, "learning_rate": 6.89389442805288e-07, "loss": 0.1349, "step": 152},
    {"epoch": 0.7953216374269005, "grad_norm": 1.6898419857025146, "learning_rate": 6.85474800434884e-07, "loss": 0.1568, "step": 153},
    {"epoch": 0.8005198180636777, "grad_norm": 1.8794304132461548, "learning_rate": 6.815469300032373e-07, "loss": 0.161, "step": 154},
    {"epoch": 0.8057179987004548, "grad_norm": 1.6816418170928955, "learning_rate": 6.776061116460352e-07, "loss": 0.1615, "step": 155},
    {"epoch": 0.8109161793372319, "grad_norm": 1.960444688796997, "learning_rate": 6.7365262642241e-07, "loss": 0.1948, "step": 156},
    {"epoch": 0.816114359974009, "grad_norm": 1.6450730562210083, "learning_rate": 6.696867562948962e-07, "loss": 0.161, "step": 157},
    {"epoch": 0.8213125406107862, "grad_norm": 1.4993230104446411, "learning_rate": 6.657087841093179e-07, "loss": 0.1476, "step": 158},
    {"epoch": 0.8265107212475633, "grad_norm": 1.856066346168518, "learning_rate": 6.61718993574619e-07, "loss": 0.1599, "step": 159},
    {"epoch": 0.8317089018843404, "grad_norm": 1.6243445873260498, "learning_rate": 6.577176692426278e-07, "loss": 0.1548, "step": 160},
    {"epoch": 0.8369070825211176, "grad_norm": 1.538219928741455, "learning_rate": 6.537050964877625e-07, "loss": 0.1428, "step": 161},
    {"epoch": 0.8421052631578947, "grad_norm": 1.429417371749878, "learning_rate": 6.496815614866791e-07, "loss": 0.1205, "step": 162},
    {"epoch": 0.8473034437946718, "grad_norm": 1.7732073068618774, "learning_rate": 6.456473511978606e-07, "loss": 0.1903, "step": 163},
    {"epoch": 0.852501624431449, "grad_norm": 1.575061321258545, "learning_rate": 6.416027533411519e-07, "loss": 0.1571, "step": 164},
    {"epoch": 0.8576998050682261, "grad_norm": 1.6352499723434448, "learning_rate": 6.375480563772389e-07, "loss": 0.1484, "step": 165},
    {"epoch": 0.8628979857050032, "grad_norm": 1.7170888185501099, "learning_rate": 6.334835494870758e-07, "loss": 0.1735, "step": 166},
    {"epoch": 0.8680961663417803, "grad_norm": 1.5450496673583984, "learning_rate": 6.294095225512604e-07, "loss": 0.1339, "step": 167},
    {"epoch": 0.8732943469785575, "grad_norm": 1.5989458560943604, "learning_rate": 6.253262661293602e-07, "loss": 0.1393, "step": 168},
    {"epoch": 0.8784925276153346, "grad_norm": 1.464534878730774, "learning_rate": 6.2123407143919e-07, "loss": 0.1421, "step": 169},
    {"epoch": 0.8836907082521117, "grad_norm": 1.6165345907211304, "learning_rate": 6.17133230336041e-07, "loss": 0.154, "step": 170},
    {"epoch": 0.8888888888888888, "grad_norm": 1.5105384588241577, "learning_rate": 6.130240352918674e-07, "loss": 0.1614, "step": 171},
    {"epoch": 0.894087069525666, "grad_norm": 1.6538264751434326, "learning_rate": 6.089067793744257e-07, "loss": 0.1213, "step": 172},
    {"epoch": 0.8992852501624431, "grad_norm": 1.5659717321395874, "learning_rate": 6.047817562263743e-07, "loss": 0.1349, "step": 173},
    {"epoch": 0.9044834307992202, "grad_norm": 1.6108099222183228, "learning_rate": 6.0064926004433e-07, "loss": 0.1572, "step": 174},
    {"epoch": 0.9096816114359974, "grad_norm": 1.7230148315429688, "learning_rate": 5.965095855578868e-07, "loss": 0.1376, "step": 175},
    {"epoch": 0.9148797920727745, "grad_norm": 1.7344483137130737, "learning_rate": 5.923630280085947e-07, "loss": 0.1572, "step": 176},
    {"epoch": 0.9200779727095516, "grad_norm": 1.6481879949569702, "learning_rate": 5.882098831289043e-07, "loss": 0.1626, "step": 177},
    {"epoch": 0.9252761533463287, "grad_norm": 1.7318065166473389, "learning_rate": 5.840504471210741e-07, "loss": 0.1756, "step": 178},
    {"epoch": 0.9304743339831059, "grad_norm": 1.676165223121643, "learning_rate": 5.79885016636046e-07, "loss": 0.147, "step": 179},
    {"epoch": 0.935672514619883, "grad_norm": 1.4620646238327026, "learning_rate": 5.757138887522883e-07, "loss": 0.1249, "step": 180},
    {"epoch": 0.9408706952566601, "grad_norm": 1.65927255153656, "learning_rate": 5.71537360954607e-07, "loss": 0.163, "step": 181},
    {"epoch": 0.9460688758934372, "grad_norm": 1.5536587238311768, "learning_rate": 5.673557311129306e-07, "loss": 0.1351, "step": 182},
    {"epoch": 0.9512670565302144, "grad_norm": 1.7076836824417114, "learning_rate": 5.631692974610647e-07, "loss": 0.1771, "step": 183},
    {"epoch": 0.9564652371669915, "grad_norm": 1.4979828596115112, "learning_rate": 5.589783585754231e-07, "loss": 0.121, "step": 184},
    {"epoch": 0.9616634178037686, "grad_norm": 1.5839756727218628, "learning_rate": 5.547832133537327e-07, "loss": 0.1458, "step": 185},
    {"epoch": 0.9668615984405458, "grad_norm": 1.7546137571334839, "learning_rate": 5.505841609937161e-07, "loss": 0.1671, "step": 186},
    {"epoch": 0.9720597790773229, "grad_norm": 1.7105190753936768, "learning_rate": 5.463815009717532e-07, "loss": 0.1314, "step": 187},
    {"epoch": 0.9772579597141, "grad_norm": 1.8557852506637573, "learning_rate": 5.421755330215223e-07, "loss": 0.1794, "step": 188},
    {"epoch": 0.9824561403508771, "grad_norm": 1.569214105606079, "learning_rate": 5.379665571126231e-07, "loss": 0.1307, "step": 189},
    {"epoch": 0.9876543209876543, "grad_norm": 1.6137492656707764, "learning_rate": 5.337548734291826e-07, "loss": 0.1412, "step": 190},
    {"epoch": 0.9928525016244314, "grad_norm": 1.6707996129989624, "learning_rate": 5.295407823484467e-07, "loss": 0.1627, "step": 191},
    {"epoch": 0.9980506822612085, "grad_norm": 2.3229496479034424, "learning_rate": 5.253245844193564e-07, "loss": 0.1872, "step": 192},
    {"epoch": 1.0032488628979856, "grad_norm": 1.4374881982803345, "learning_rate": 5.211065803411134e-07, "loss": 0.1118, "step": 193},
    {"epoch": 1.0084470435347628, "grad_norm": 1.7097264528274536, "learning_rate": 5.168870709417341e-07, "loss": 0.1603, "step": 194},
    {"epoch": 1.01364522417154, "grad_norm": 1.5904935598373413, "learning_rate": 5.126663571565939e-07, "loss": 0.128, "step": 195},
    {"epoch": 1.018843404808317, "grad_norm": 1.5433835983276367, "learning_rate": 5.084447400069654e-07, "loss": 0.1192, "step": 196},
    {"epoch": 1.0240415854450942, "grad_norm": 1.40073561668396, "learning_rate": 5.042225205785492e-07, "loss": 0.1188, "step": 197},
    {"epoch": 1.0292397660818713, "grad_norm": 1.6374619007110596, "learning_rate": 5e-07, "loss": 0.1486, "step": 198},
    {"epoch": 1.0344379467186484, "grad_norm": 1.4800790548324585, "learning_rate": 4.957774794214508e-07, "loss": 0.1297, "step": 199},
    {"epoch": 1.0396361273554255, "grad_norm": 1.5941686630249023, "learning_rate": 4.915552599930345e-07, "loss": 0.1466, "step": 200},
    {"epoch": 1.0448343079922027, "grad_norm": 1.5258111953735352, "learning_rate": 4.873336428434061e-07, "loss": 0.1264, "step": 201},
    {"epoch": 1.0500324886289798, "grad_norm": 1.725998878479004, "learning_rate": 4.831129290582659e-07, "loss": 0.1334, "step": 202},
    {"epoch": 1.055230669265757, "grad_norm": 1.8626656532287598, "learning_rate": 4.788934196588865e-07, "loss": 0.1503, "step": 203},
    {"epoch": 1.060428849902534, "grad_norm": 1.4593480825424194, "learning_rate": 4.746754155806437e-07, "loss": 0.1225, "step": 204},
    {"epoch": 1.0656270305393112, "grad_norm": 1.6284171342849731, "learning_rate": 4.7045921765155337e-07, "loss": 0.1397, "step": 205},
    {"epoch": 1.0708252111760883, "grad_norm": 1.4469428062438965, "learning_rate": 4.662451265708174e-07, "loss": 0.1082, "step": 206},
    {"epoch": 1.0760233918128654, "grad_norm": 1.3986436128616333, "learning_rate": 4.620334428873769e-07, "loss": 0.1025, "step": 207},
    {"epoch": 1.0812215724496426, "grad_norm": 1.6322413682937622, "learning_rate": 4.5782446697847764e-07, "loss": 0.126, "step": 208},
    {"epoch": 1.0864197530864197, "grad_norm": 1.4022878408432007, "learning_rate": 4.536184990282467e-07, "loss": 0.0932, "step": 209},
    {"epoch": 1.0916179337231968, "grad_norm": 1.8021215200424194, "learning_rate": 4.4941583900628393e-07, "loss": 0.1662, "step": 210},
    {"epoch": 1.096816114359974, "grad_norm": 1.806060552597046, "learning_rate": 4.4521678664626745e-07, "loss": 0.1574, "step": 211},
    {"epoch": 1.102014294996751, "grad_norm": 1.7470866441726685, "learning_rate": 4.4102164142457705e-07, "loss": 0.1467, "step": 212},
    {"epoch": 1.1072124756335282, "grad_norm": 1.5723119974136353, "learning_rate": 4.368307025389355e-07, "loss": 0.1084, "step": 213},
    {"epoch": 1.1124106562703053, "grad_norm": 1.5377353429794312, "learning_rate": 4.326442688870696e-07, "loss": 0.116, "step": 214},
    {"epoch": 1.1176088369070825, "grad_norm": 1.7077683210372925, "learning_rate": 4.2846263904539303e-07, "loss": 0.1483, "step": 215},
    {"epoch": 1.1228070175438596, "grad_norm": 1.8333083391189575, "learning_rate": 4.242861112477118e-07, "loss": 0.1527, "step": 216},
    {"epoch": 1.1280051981806367, "grad_norm": 1.5769239664077759, "learning_rate": 4.201149833639539e-07, "loss": 0.121, "step": 217},
    {"epoch": 1.1332033788174138, "grad_norm": 1.919921636581421, "learning_rate": 4.15949552878926e-07, "loss": 0.173, "step": 218},
    {"epoch": 1.138401559454191, "grad_norm": 1.5571967363357544, "learning_rate": 4.117901168710959e-07, "loss": 0.1227, "step": 219},
    {"epoch": 1.143599740090968, "grad_norm": 1.6683669090270996, "learning_rate": 4.0763697199140546e-07, "loss": 0.1422, "step": 220},
    {"epoch": 1.1487979207277452, "grad_norm": 1.6401057243347168, "learning_rate": 4.034904144421134e-07, "loss": 0.1256, "step": 221},
    {"epoch": 1.1539961013645224, "grad_norm": 1.5419056415557861, "learning_rate": 3.9935073995566987e-07, "loss": 0.1302, "step": 222},
    {"epoch": 1.1591942820012995, "grad_norm": 1.650795817375183, "learning_rate": 3.952182437736256e-07, "loss": 0.1471, "step": 223},
    {"epoch": 1.1643924626380766, "grad_norm": 1.672743797302246, "learning_rate": 3.910932206255742e-07, "loss": 0.1298, "step": 224},
    {"epoch": 1.1695906432748537, "grad_norm": 1.6902425289154053, "learning_rate": 3.869759647081325e-07, "loss": 0.1414, "step": 225},
    {"epoch": 1.1747888239116309, "grad_norm": 1.607485055923462, "learning_rate": 3.828667696639589e-07, "loss": 0.1032, "step": 226},
    {"epoch": 1.179987004548408, "grad_norm": 1.5336533784866333, "learning_rate": 3.7876592856081e-07, "loss": 0.1116, "step": 227},
    {"epoch": 1.1851851851851851, "grad_norm": 1.5528396368026733, "learning_rate": 3.7467373387063964e-07, "loss": 0.1243, "step": 228},
    {"epoch": 1.1903833658219622, "grad_norm": 1.743318796157837, "learning_rate": 3.7059047744873955e-07, "loss": 0.1437, "step": 229},
    {"epoch": 1.1955815464587394, "grad_norm": 1.6484525203704834, "learning_rate": 3.665164505129241e-07, "loss": 0.131, "step": 230},
    {"epoch": 1.2007797270955165, "grad_norm": 1.6531956195831299, "learning_rate": 3.6245194362276094e-07, "loss": 0.1268, "step": 231},
    {"epoch": 1.2059779077322936, "grad_norm": 1.491297960281372, "learning_rate": 3.5839724665884795e-07, "loss": 0.1261, "step": 232},
    {"epoch": 1.2111760883690708, "grad_norm": 1.5535728931427002, "learning_rate": 3.5435264880213937e-07, "loss": 0.1233, "step": 233},
    {"epoch": 1.2163742690058479, "grad_norm": 1.6621695756912231, "learning_rate": 3.50318438513321e-07, "loss": 0.1331, "step": 234},
    {"epoch": 1.221572449642625, "grad_norm": 1.5829371213912964, "learning_rate": 3.462949035122376e-07, "loss": 0.1229, "step": 235},
    {"epoch": 1.2267706302794021, "grad_norm": 1.7693301439285278, "learning_rate": 3.4228233075737223e-07, "loss": 0.1434, "step": 236},
    {"epoch": 1.2319688109161793, "grad_norm": 1.6113789081573486, "learning_rate": 3.3828100642538093e-07, "loss": 0.1213, "step": 237},
    {"epoch": 1.2371669915529564, "grad_norm": 1.5799354314804077, "learning_rate": 3.342912158906821e-07, "loss": 0.1191, "step": 238},
    {"epoch": 1.2423651721897335, "grad_norm": 1.5467253923416138, "learning_rate": 3.3031324370510396e-07, "loss": 0.1133, "step": 239},
    {"epoch": 1.2475633528265107, "grad_norm": 1.8147982358932495, "learning_rate": 3.263473735775899e-07, "loss": 0.1391, "step": 240},
    {"epoch": 1.2527615334632878, "grad_norm": 1.702359676361084, "learning_rate": 3.2239388835396484e-07, "loss": 0.1339, "step": 241},
    {"epoch": 1.257959714100065, "grad_norm": 1.7504138946533203, "learning_rate": 3.184530699967627e-07, "loss": 0.1565, "step": 242},
    {"epoch": 1.263157894736842, "grad_norm": 1.7226463556289673, "learning_rate": 3.1452519956511614e-07, "loss": 0.1266, "step": 243},
    {"epoch": 1.2683560753736192, "grad_norm": 1.8186461925506592, "learning_rate": 3.1061055719471197e-07, "loss": 0.1347, "step": 244},
    {"epoch": 1.2735542560103963, "grad_norm": 1.6384614706039429, "learning_rate": 3.0670942207781204e-07, "loss": 0.1115, "step": 245},
    {"epoch": 1.2787524366471734, "grad_norm": 1.7369823455810547, "learning_rate": 3.028220724433408e-07, "loss": 0.129, "step": 246},
    {"epoch": 1.2839506172839505, "grad_norm": 1.6780822277069092, "learning_rate": 2.989487855370421e-07, "loss": 0.1385, "step": 247},
    {"epoch": 1.2891487979207277, "grad_norm": 1.87428879737854, "learning_rate": 2.9508983760170634e-07, "loss": 0.1435, "step": 248},
    {"epoch": 1.2943469785575048, "grad_norm": 1.7940468788146973, "learning_rate": 2.9124550385746856e-07, "loss": 0.1491, "step": 249},
    {"epoch": 1.299545159194282, "grad_norm": 1.712099552154541, "learning_rate": 2.8741605848217976e-07, "loss": 0.131, "step": 250},
    {"epoch": 1.304743339831059, "grad_norm": 1.6154824495315552, "learning_rate": 2.8360177459185263e-07, "loss": 0.1145, "step": 251},
    {"epoch": 1.3099415204678362, "grad_norm": 1.6371185779571533, "learning_rate": 2.7980292422118277e-07, "loss": 0.1232, "step": 252},
    {"epoch": 1.3151397011046133, "grad_norm": 1.8156847953796387, "learning_rate": 2.7601977830414766e-07, "loss": 0.1274, "step": 253},
    {"epoch": 1.3203378817413904, "grad_norm": 1.6596229076385498, "learning_rate": 2.72252606654683e-07, "loss": 0.1168, "step": 254},
    {"epoch": 1.3255360623781676, "grad_norm": 1.6106423139572144, "learning_rate": 2.685016779474396e-07, "loss": 0.1139, "step": 255},
    {"epoch": 1.3307342430149447, "grad_norm": 1.6363728046417236, "learning_rate": 2.6476725969862226e-07, "loss": 0.1297, "step": 256},
    {"epoch": 1.3359324236517218, "grad_norm": 1.4978957176208496, "learning_rate": 2.6104961824690964e-07, "loss": 0.1191, "step": 257},
    {"epoch": 1.341130604288499, "grad_norm": 1.5889379978179932, "learning_rate": 2.5734901873445956e-07, "loss": 0.1236, "step": 258},
    {"epoch": 1.346328784925276, "grad_norm": 1.534178376197815, "learning_rate": 2.536657250879988e-07, "loss": 0.1053, "step": 259},
    {"epoch": 1.3515269655620532, "grad_norm": 1.8409833908081055, "learning_rate": 2.500000000000001e-07, "loss": 0.1427, "step": 260},
    {"epoch": 1.3567251461988303, "grad_norm": 1.7446589469909668, "learning_rate": 2.4635210490994647e-07, "loss": 0.1194, "step": 261},
    {"epoch": 1.3619233268356075, "grad_norm": 1.7886688709259033, "learning_rate": 2.427222999856857e-07, "loss": 0.1351, "step": 262},
    {"epoch": 1.3671215074723846, "grad_norm": 1.6462031602859497, "learning_rate": 2.391108441048753e-07, "loss": 0.1249, "step": 263},
    {"epoch": 1.3723196881091617, "grad_norm": 1.8700019121170044, "learning_rate": 2.355179948365189e-07, "loss": 0.1482, "step": 264},
    {"epoch": 1.3775178687459388, "grad_norm": 1.8244132995605469, "learning_rate": 2.3194400842259687e-07, "loss": 0.134, "step": 265},
    {"epoch": 1.382716049382716, "grad_norm": 1.8189918994903564, "learning_rate": 2.283891397597908e-07, "loss": 0.1258, "step": 266},
    {"epoch": 1.387914230019493, "grad_norm": 1.5552480220794678, "learning_rate": 2.2485364238130433e-07, "loss": 0.1131, "step": 267},
    {"epoch": 1.3931124106562702, "grad_norm": 1.659328579902649, "learning_rate": 2.2133776843878183e-07, "loss": 0.1119, "step": 268},
    {"epoch": 1.3983105912930474, "grad_norm": 1.3867595195770264, "learning_rate": 2.1784176868432375e-07, "loss": 0.0851, "step": 269},
    {"epoch": 1.4035087719298245, "grad_norm": 1.7647099494934082, "learning_rate": 2.1436589245260372e-07, "loss": 0.1158, "step": 270},
    {"epoch": 1.4087069525666016, "grad_norm": 1.8926544189453125, "learning_rate": 2.109103876430864e-07, "loss": 0.1527, "step": 271},
    {"epoch": 1.4139051332033787, "grad_norm": 1.839390516281128, "learning_rate": 2.074755007023461e-07, "loss": 0.1108, "step": 272},
    {"epoch": 1.4191033138401559, "grad_norm": 1.797500729560852, "learning_rate": 2.040614766064913e-07, "loss": 0.1508, "step": 273},
    {"epoch": 1.424301494476933, "grad_norm": 1.7864497900009155, "learning_rate": 2.0066855884369243e-07, "loss": 0.1242, "step": 274},
    {"epoch": 1.4294996751137101, "grad_norm": 1.853615641593933, "learning_rate": 1.9729698939681644e-07, "loss": 0.122, "step": 275},
    {"epoch": 1.4346978557504872, "grad_norm": 1.6054631471633911, "learning_rate": 1.9394700872616853e-07, "loss": 0.1212, "step": 276},
    {"epoch": 1.4398960363872644, "grad_norm": 1.632055640220642, "learning_rate": 1.906188557523427e-07, "loss": 0.1101, "step": 277},
    {"epoch": 1.4450942170240415, "grad_norm": 1.6560664176940918, "learning_rate": 1.873127678391816e-07, "loss": 0.1217, "step": 278},
    {"epoch": 1.4502923976608186, "grad_norm": 1.4159197807312012, "learning_rate": 1.8402898077684803e-07, "loss": 0.0988, "step": 279},
    {"epoch": 1.4554905782975958, "grad_norm": 1.6314151287078857, "learning_rate": 1.8076772876500828e-07, "loss": 0.1293, "step": 280},
    {"epoch": 1.4606887589343729, "grad_norm": 1.7430942058563232, "learning_rate": 1.775292443961291e-07, "loss": 0.1401, "step": 281},
    {"epoch": 1.46588693957115, "grad_norm": 1.7857812643051147, "learning_rate": 1.7431375863888898e-07, "loss": 0.1275, "step": 282},
    {"epoch": 1.4710851202079271, "grad_norm": 1.5308443307876587, "learning_rate": 1.7112150082170568e-07, "loss": 0.1061, "step": 283},
    {"epoch": 1.4762833008447043, "grad_norm": 1.6288737058639526, "learning_rate": 1.679526986163804e-07, "loss": 0.1119, "step": 284},
    {"epoch": 1.4814814814814814, "grad_norm": 1.704690933227539, "learning_rate": 1.6480757802186068e-07, "loss": 0.1166, "step": 285},
    {"epoch": 1.4866796621182585, "grad_norm": 1.5033763647079468, "learning_rate": 1.6168636334812125e-07, "loss": 0.1045, "step": 286},
    {"epoch": 1.4918778427550357, "grad_norm": 1.4401872158050537, "learning_rate": 1.5858927720016706e-07, "loss": 0.0959, "step": 287},
    {"epoch": 1.4970760233918128, "grad_norm": 1.6706205606460571, "learning_rate": 1.555165404621567e-07, "loss": 0.1124, "step": 288},
    {"epoch": 1.50227420402859, "grad_norm": 1.8483508825302124, "learning_rate": 1.5246837228164905e-07, "loss": 0.1146, "step": 289},
    {"epoch": 1.507472384665367, "grad_norm": 1.8255398273468018, "learning_rate": 1.494449900539737e-07, "loss": 0.1413, "step": 290},
    {"epoch": 1.5126705653021442, "grad_norm": 1.8132373094558716, "learning_rate": 1.4644660940672627e-07, "loss": 0.1316, "step": 291},
    {"epoch": 1.5178687459389213, "grad_norm": 1.7580801248550415, "learning_rate": 1.434734441843899e-07, "loss": 0.1252, "step": 292},
    {"epoch": 1.5230669265756984, "grad_norm": 1.4802451133728027, "learning_rate": 1.4052570643308375e-07, "loss": 0.1087, "step": 293},
    {"epoch": 1.5282651072124755, "grad_norm": 1.595434308052063, "learning_rate": 1.376036063854401e-07, "loss": 0.1063, "step": 294},
    {"epoch": 1.5334632878492527, "grad_norm": 1.5052251815795898, "learning_rate": 1.3470735244561027e-07, "loss": 0.1071, "step": 295},
    {"epoch": 1.5386614684860298, "grad_norm": 1.4966932535171509, "learning_rate": 1.3183715117440142e-07, "loss": 0.1003, "step": 296},
    {"epoch": 1.543859649122807, "grad_norm": 1.5577093362808228, "learning_rate": 1.2899320727454472e-07, "loss": 0.1147, "step": 297},
    {"epoch": 1.549057829759584, "grad_norm": 2.081566572189331, "learning_rate": 1.2617572357609562e-07, "loss": 0.1479, "step": 298},
    {"epoch": 1.5542560103963612, "grad_norm": 1.5348504781723022, "learning_rate": 1.2338490102196825e-07, "loss": 0.1061, "step": 299},
    {"epoch": 1.5594541910331383, "grad_norm": 1.7641793489456177, "learning_rate": 1.2062093865360457e-07, "loss": 0.1359, "step": 300},
    {"epoch": 1.5646523716699154, "grad_norm": 1.5747112035751343, "learning_rate": 1.1788403359677767e-07, "loss": 0.1069, "step": 301},
    {"epoch": 1.5698505523066926, "grad_norm": 1.6333017349243164, "learning_rate": 1.1517438104753385e-07, "loss": 0.1077, "step": 302},
    {"epoch": 1.5750487329434697, "grad_norm": 1.5666186809539795, "learning_rate": 1.1249217425827062e-07, "loss": 0.1118, "step": 303},
    {"epoch": 1.5802469135802468, "grad_norm": 1.5051664113998413, "learning_rate": 1.0983760452395413e-07, "loss": 0.1043, "step": 304},
    {"epoch": 1.585445094217024, "grad_norm": 1.7494423389434814, "learning_rate": 1.07210861168476e-07, "loss": 0.1327, "step": 305},
    {"epoch": 1.590643274853801, "grad_norm": 1.5114164352416992, "learning_rate": 1.0461213153115079e-07, "loss": 0.0938, "step": 306},
    {"epoch": 1.5958414554905782, "grad_norm": 1.562703251838684, "learning_rate": 1.0204160095335479e-07, "loss": 0.1056, "step": 307},
    {"epoch": 1.6010396361273553, "grad_norm": 1.6739459037780762, "learning_rate": 9.94994527653078e-08, "loss": 0.1223, "step": 308},
    {"epoch": 1.6062378167641325, "grad_norm": 1.6659190654754639, "learning_rate": 9.69858682729976e-08, "loss": 0.1201, "step": 309},
    {"epoch": 1.6114359974009096, "grad_norm": 1.6198205947875977, "learning_rate": 9.45010267452495e-08, "loss": 0.1041, "step": 310},
    {"epoch": 1.6166341780376867, "grad_norm": 1.6569786071777344, "learning_rate": 9.204510540094095e-08, "loss": 0.1153, "step": 311},
    {"epoch": 1.6218323586744638, "grad_norm": 1.6825376749038696, "learning_rate": 8.961827939636196e-08, "loss": 0.1195, "step": 312},
    {"epoch": 1.627030539311241, "grad_norm": 1.6692954301834106, "learning_rate": 8.722072181272311e-08, "loss": 0.14, "step": 313},
    {"epoch": 1.632228719948018, "grad_norm": 1.86336350440979, "learning_rate": 8.485260364381186e-08, "loss": 0.1154, "step": 314},
    {"epoch": 1.6374269005847952, "grad_norm": 1.6375104188919067, "learning_rate": 8.251409378379637e-08, "loss": 0.1087, "step": 315},
    {"epoch": 1.6426250812215724, "grad_norm": 1.6260446310043335, "learning_rate": 8.02053590151805e-08, "loss": 0.1099, "step": 316},
    {"epoch": 1.6478232618583495, "grad_norm": 1.7146011590957642, "learning_rate": 7.792656399690922e-08, "loss": 0.1167, "step": 317},
    {"epoch": 1.6530214424951266, "grad_norm": 1.6423556804656982, "learning_rate": 7.567787125262449e-08, "loss": 0.1171, "step": 318},
    {"epoch": 1.6582196231319037, "grad_norm": 1.5510940551757812, "learning_rate": 7.345944115907421e-08, "loss": 0.1013, "step": 319},
    {"epoch": 1.6634178037686809, "grad_norm": 1.863503336906433, "learning_rate": 7.127143193467445e-08, "loss": 0.1423, "step": 320},
    {"epoch": 1.668615984405458, "grad_norm": 1.7284936904907227, "learning_rate": 6.911399962822518e-08, "loss": 0.1112, "step": 321},
    {"epoch": 1.6738141650422351, "grad_norm": 1.7205549478530884, "learning_rate": 6.698729810778064e-08, "loss": 0.136, "step": 322},
    {"epoch": 1.6790123456790123, "grad_norm": 1.4762628078460693, "learning_rate": 6.48914790496759e-08, "loss": 0.1144, "step": 323},
    {"epoch": 1.6842105263157894, "grad_norm": 1.8362925052642822, "learning_rate": 6.282669192770895e-08, "loss": 0.1328, "step": 324},
    {"epoch": 1.6894087069525665, "grad_norm": 1.4527249336242676, "learning_rate": 6.079308400248029e-08, "loss": 0.1055, "step": 325},
    {"epoch": 1.6946068875893436, "grad_norm": 1.7803164720535278, "learning_rate": 5.8790800310890456e-08, "loss": 0.1451, "step": 326},
    {"epoch": 1.6998050682261208, "grad_norm": 2.059589147567749, "learning_rate": 5.6819983655795936e-08, "loss": 0.1458, "step": 327},
    {"epoch": 1.705003248862898, "grad_norm": 1.7053784132003784, "learning_rate": 5.4880774595824245e-08, "loss": 0.1257, "step": 328},
    {"epoch": 1.710201429499675, "grad_norm": 1.97287118434906, "learning_rate": 5.297331143534972e-08, "loss": 0.1381, "step": 329},
    {"epoch": 1.7153996101364521, "grad_norm": 1.6465667486190796, "learning_rate": 5.109773021462921e-08, "loss": 0.1155, "step": 330},
    {"epoch": 1.7205977907732293, "grad_norm": 1.6381739377975464, "learning_rate": 4.925416470009991e-08, "loss": 0.1224, "step": 331},
    {"epoch": 1.7257959714100064, "grad_norm": 1.5278061628341675, "learning_rate": 4.744274637483936e-08, "loss": 0.1203, "step": 332},
    {"epoch": 1.7309941520467835, "grad_norm": 1.7712794542312622, "learning_rate": 4.566360442918754e-08, "loss": 0.1334, "step": 333},
    {"epoch": 1.7361923326835607, "grad_norm": 1.912316083908081, "learning_rate": 4.3916865751533306e-08, "loss": 0.1475, "step": 334},
    {"epoch": 1.7413905133203378, "grad_norm": 1.7500931024551392, "learning_rate": 4.220265491926489e-08, "loss": 0.1165, "step": 335},
    {"epoch": 1.746588693957115, "grad_norm": 1.7582532167434692, "learning_rate": 4.0521094189884696e-08, "loss": 0.119, "step": 336},
    {"epoch": 1.751786874593892, "grad_norm": 1.6794097423553467, "learning_rate": 3.887230349229015e-08, "loss": 0.1093, "step": 337},
    {"epoch": 1.7569850552306692, "grad_norm": 1.797913908958435, "learning_rate": 3.7256400418220256e-08, "loss": 0.1235, "step": 338},
    {"epoch": 1.7621832358674463, "grad_norm": 1.6042097806930542, "learning_rate": 3.567350021386895e-08, "loss": 0.096, "step": 339},
    {"epoch": 1.7673814165042234, "grad_norm": 1.7623355388641357, "learning_rate": 3.412371577166578e-08, "loss": 0.1153, "step": 340},
    {"epoch": 1.7725795971410006, "grad_norm": 1.737052321434021, "learning_rate": 3.260715762222449e-08, "loss": 0.1327, "step": 341},
    {"epoch": 1.7777777777777777, "grad_norm": 1.61103355884552, "learning_rate": 3.1123933926459845e-08, "loss": 0.1213, "step": 342},
    {"epoch": 1.7829759584145548, "grad_norm": 1.78020441532135, "learning_rate": 2.9674150467873527e-08, "loss": 0.1262, "step": 343},
    {"epoch": 1.788174139051332, "grad_norm": 1.7625352144241333, "learning_rate": 2.825791064500993e-08, "loss": 0.1465, "step": 344},
    {"epoch": 1.793372319688109, "grad_norm": 1.7474111318588257, "learning_rate": 2.6875315464081562e-08, "loss": 0.1289, "step": 345},
    {"epoch": 1.7985705003248862, "grad_norm": 1.7776579856872559, "learning_rate": 2.5526463531765463e-08, "loss": 0.1219, "step": 346},
    {"epoch": 1.8037686809616633, "grad_norm": 1.7461159229278564, "learning_rate": 2.4211451048170296e-08, "loss": 0.1361, "step": 347},
    {"epoch": 1.8089668615984404, "grad_norm": 1.701060175895691, "learning_rate": 2.293037179997559e-08, "loss": 0.1463, "step": 348},
    {"epoch": 1.8141650422352176, "grad_norm": 1.7974237203598022, "learning_rate": 2.1683317153742775e-08, "loss": 0.14, "step": 349},
    {"epoch": 1.8193632228719947, "grad_norm": 1.5575218200683594, "learning_rate": 2.047037604939894e-08, "loss": 0.1234, "step": 350},
    {"epoch": 1.8245614035087718, "grad_norm": 1.6900442838668823, "learning_rate": 1.92916349938938e-08, "loss": 0.1119, "step": 351},
    {"epoch": 1.829759584145549, "grad_norm": 1.7725763320922852, "learning_rate": 1.8147178055029577e-08, "loss": 0.1456, "step": 352},
    {"epoch": 1.834957764782326, "grad_norm": 1.677262544631958, "learning_rate": 1.7037086855465898e-08, "loss": 0.1041, "step": 353},
    {"epoch": 1.8401559454191032, "grad_norm": 1.5990703105926514, "learning_rate": 1.596144056689791e-08, "loss": 0.1149, "step": 354},
    {"epoch": 1.8453541260558803, "grad_norm": 2.083341598510742, "learning_rate": 1.4920315904410064e-08, "loss": 0.1375, "step": 355},
    {"epoch": 1.8505523066926575, "grad_norm": 1.7379205226898193, "learning_rate": 1.3913787121004716e-08, "loss": 0.1365, "step": 356},
    {"epoch": 1.8557504873294346, "grad_norm": 1.6175756454467773, "learning_rate": 1.2941926002306536e-08, "loss": 0.1348, "step": 357},
    {"epoch": 1.8609486679662117, "grad_norm": 1.7680426836013794, "learning_rate": 1.200480186144237e-08, "loss": 0.1232, "step": 358},
    {"epoch": 1.8661468486029889, "grad_norm": 1.3798372745513916, "learning_rate": 1.1102481534098374e-08, "loss": 0.0966, "step": 359},
    {"epoch": 1.871345029239766, "grad_norm": 1.584373950958252, "learning_rate": 1.0235029373752757e-08, "loss": 0.1082, "step": 360},
    {"epoch": 1.876543209876543, "grad_norm": 1.7235833406448364, "learning_rate": 9.402507247086578e-09, "loss": 0.1348, "step": 361},
    {"epoch": 1.8817413905133202, "grad_norm": 1.6077182292938232, "learning_rate": 8.60497452957104e-09, "loss": 0.1234, "step": 362},
    {"epoch": 1.8869395711500974, "grad_norm": 1.5776941776275635, "learning_rate": 7.842488101232891e-09, "loss": 0.1177, "step": 363},
    {"epoch": 1.8921377517868745, "grad_norm": 1.5909714698791504, "learning_rate": 7.115102342598101e-09, "loss": 0.1133, "step": 364},
    {"epoch": 1.8973359324236516, "grad_norm": 1.7370619773864746, "learning_rate": 6.422869130812913e-09, "loss": 0.1269, "step": 365},
    {"epoch": 1.9025341130604287, "grad_norm": 1.7405683994293213, "learning_rate": 5.765837835944309e-09, "loss": 0.1307, "step": 366},
    {"epoch": 1.9077322936972059, "grad_norm": 1.9631272554397583, "learning_rate": 5.144055317458817e-09, "loss": 0.1546, "step": 367},
    {"epoch": 1.912930474333983, "grad_norm": 1.6291249990463257, "learning_rate": 4.55756592088058e-09, "loss": 0.1249, "step": 368},
    {"epoch": 1.9181286549707601, "grad_norm": 1.528748869895935, "learning_rate": 4.0064114746284905e-09, "loss": 0.1027, "step": 369},
    {"epoch": 1.9233268356075373, "grad_norm": 1.8311703205108643, "learning_rate": 3.4906312870331965e-09, "loss": 0.1128, "step": 370},
    {"epoch": 1.9285250162443144, "grad_norm": 1.8863190412521362, "learning_rate": 3.010262143533393e-09, "loss": 0.1441, "step": 371},
    {"epoch": 1.9337231968810915, "grad_norm": 1.5785506963729858, "learning_rate": 2.5653383040524224e-09, "loss": 0.1198, "step": 372},
    {"epoch": 1.9389213775178686, "grad_norm": 1.6531542539596558, "learning_rate": 2.155891500554896e-09, "loss": 0.1224, "step": 373},
    {"epoch": 1.9441195581546458, "grad_norm": 1.5542500019073486, "learning_rate": 1.7819509347835049e-09, "loss": 0.1149, "step": 374},
    {"epoch": 1.949317738791423, "grad_norm": 1.5082885026931763, "learning_rate": 1.4435432761762955e-09, "loss": 0.1061, "step": 375},
    {"epoch": 1.9545159194282, "grad_norm": 1.5856765508651733, "learning_rate": 1.1406926599646372e-09, "loss": 0.1327, "step": 376},
    {"epoch": 1.9597141000649771, "grad_norm": 1.6566141843795776, "learning_rate": 8.73420685452042e-10, "loss": 0.1105, "step": 377},
    {"epoch": 1.9649122807017543, "grad_norm": 1.7047754526138306, "learning_rate": 6.417464144736207e-10, "loss": 0.1301, "step": 378},
    {"epoch": 1.9701104613385314, "grad_norm": 1.7699244022369385, "learning_rate": 4.4568637003633556e-10, "loss": 0.1412, "step": 379},
    {"epoch": 1.9753086419753085, "grad_norm": 1.8177380561828613, "learning_rate": 2.852545351409996e-10, "loss": 0.1571, "step": 380},
    {"epoch": 1.9805068226120857, "grad_norm": 1.5334748029708862, "learning_rate": 1.6046235178474033e-10, "loss": 0.104, "step": 381},
    {"epoch": 1.9857050032488628, "grad_norm": 1.7260781526565552, "learning_rate": 7.13187201450971e-11, "loss": 0.1263, "step": 382},
    {"epoch": 1.99090318388564, "grad_norm": 1.6607623100280762, "learning_rate": 1.7829997945084662e-11, "loss": 0.1135, "step": 383},
    {"epoch": 1.996101364522417, "grad_norm": 1.7004939317703247, "learning_rate": 0.0, "loss": 0.1315, "step": 384},
    {"epoch": 1.996101364522417, "step": 384, "total_flos": 3.0550374682743276e+18, "train_loss": 0.19105235013800362, "train_runtime": 6828.3197, "train_samples_per_second": 7.211, "train_steps_per_second": 0.056}
  ],
  "logging_steps": 1.0,
  "max_steps": 384,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 3.0550374682743276e+18,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}