|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 9.99000999000999, |
|
"eval_steps": 500, |
|
"global_step": 5000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.01998001998001998, |
|
"grad_norm": 1.9170171022415161, |
|
"learning_rate": 0.0001999980260856137, |
|
"loss": 1.4846, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.03996003996003996, |
|
"grad_norm": 0.5531741976737976, |
|
"learning_rate": 0.00019999210442038162, |
|
"loss": 1.0709, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.059940059940059943, |
|
"grad_norm": 0.39242061972618103, |
|
"learning_rate": 0.0001999822352380809, |
|
"loss": 0.9892, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.07992007992007992, |
|
"grad_norm": 0.46874135732650757, |
|
"learning_rate": 0.00019996841892833, |
|
"loss": 0.9705, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.0999000999000999, |
|
"grad_norm": 0.4039924740791321, |
|
"learning_rate": 0.00019995065603657316, |
|
"loss": 0.953, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.11988011988011989, |
|
"grad_norm": 0.37654027342796326, |
|
"learning_rate": 0.00019992894726405893, |
|
"loss": 0.9138, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.13986013986013987, |
|
"grad_norm": 0.41351592540740967, |
|
"learning_rate": 0.0001999032934678125, |
|
"loss": 0.9121, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.15984015984015984, |
|
"grad_norm": 0.42355260252952576, |
|
"learning_rate": 0.00019987369566060176, |
|
"loss": 0.8971, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.1798201798201798, |
|
"grad_norm": 0.40265560150146484, |
|
"learning_rate": 0.00019984015501089752, |
|
"loss": 0.892, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.1998001998001998, |
|
"grad_norm": 0.36668843030929565, |
|
"learning_rate": 0.00019980267284282717, |
|
"loss": 0.8907, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.21978021978021978, |
|
"grad_norm": 0.3516446352005005, |
|
"learning_rate": 0.00019976125063612252, |
|
"loss": 0.888, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.23976023976023977, |
|
"grad_norm": 0.3761754631996155, |
|
"learning_rate": 0.0001997158900260614, |
|
"loss": 0.8883, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.2597402597402597, |
|
"grad_norm": 0.3486793041229248, |
|
"learning_rate": 0.00019966659280340297, |
|
"loss": 0.8709, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.27972027972027974, |
|
"grad_norm": 0.39413630962371826, |
|
"learning_rate": 0.00019961336091431727, |
|
"loss": 0.8544, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.2997002997002997, |
|
"grad_norm": 0.3653990924358368, |
|
"learning_rate": 0.00019955619646030802, |
|
"loss": 0.8647, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.3196803196803197, |
|
"grad_norm": 0.4523209035396576, |
|
"learning_rate": 0.00019949510169813003, |
|
"loss": 0.8698, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.33966033966033965, |
|
"grad_norm": 0.3841874897480011, |
|
"learning_rate": 0.0001994300790396999, |
|
"loss": 0.8513, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.3596403596403596, |
|
"grad_norm": 0.3849908709526062, |
|
"learning_rate": 0.00019936113105200085, |
|
"loss": 0.8553, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.37962037962037964, |
|
"grad_norm": 0.3563358783721924, |
|
"learning_rate": 0.00019928826045698136, |
|
"loss": 0.8615, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.3996003996003996, |
|
"grad_norm": 0.3968392610549927, |
|
"learning_rate": 0.0001992114701314478, |
|
"loss": 0.8502, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.4195804195804196, |
|
"grad_norm": 0.366230845451355, |
|
"learning_rate": 0.00019913076310695068, |
|
"loss": 0.8368, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.43956043956043955, |
|
"grad_norm": 0.38813525438308716, |
|
"learning_rate": 0.00019904614256966512, |
|
"loss": 0.862, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.4595404595404595, |
|
"grad_norm": 0.35268592834472656, |
|
"learning_rate": 0.0001989576118602651, |
|
"loss": 0.8468, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.47952047952047955, |
|
"grad_norm": 0.342580109834671, |
|
"learning_rate": 0.0001988651744737914, |
|
"loss": 0.8575, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.4995004995004995, |
|
"grad_norm": 0.37153083086013794, |
|
"learning_rate": 0.00019876883405951377, |
|
"loss": 0.8374, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.5194805194805194, |
|
"grad_norm": 0.3486216366291046, |
|
"learning_rate": 0.0001986685944207868, |
|
"loss": 0.8333, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.5394605394605395, |
|
"grad_norm": 0.3562557101249695, |
|
"learning_rate": 0.00019856445951489982, |
|
"loss": 0.8238, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.5594405594405595, |
|
"grad_norm": 0.3600502610206604, |
|
"learning_rate": 0.00019845643345292054, |
|
"loss": 0.8331, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.5794205794205795, |
|
"grad_norm": 0.3475654423236847, |
|
"learning_rate": 0.00019834452049953297, |
|
"loss": 0.8093, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.5994005994005994, |
|
"grad_norm": 0.358980655670166, |
|
"learning_rate": 0.0001982287250728689, |
|
"loss": 0.8302, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.6193806193806194, |
|
"grad_norm": 0.3721815347671509, |
|
"learning_rate": 0.0001981090517443334, |
|
"loss": 0.8175, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.6393606393606394, |
|
"grad_norm": 0.35128098726272583, |
|
"learning_rate": 0.0001979855052384247, |
|
"loss": 0.8193, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.6593406593406593, |
|
"grad_norm": 0.3471618592739105, |
|
"learning_rate": 0.00019785809043254722, |
|
"loss": 0.8232, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.6793206793206793, |
|
"grad_norm": 0.35060420632362366, |
|
"learning_rate": 0.00019772681235681936, |
|
"loss": 0.8194, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.6993006993006993, |
|
"grad_norm": 0.3695327341556549, |
|
"learning_rate": 0.00019759167619387476, |
|
"loss": 0.806, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.7192807192807192, |
|
"grad_norm": 0.35857513546943665, |
|
"learning_rate": 0.00019745268727865774, |
|
"loss": 0.8019, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.7392607392607392, |
|
"grad_norm": 0.3612421154975891, |
|
"learning_rate": 0.00019730985109821266, |
|
"loss": 0.8061, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.7592407592407593, |
|
"grad_norm": 0.34007078409194946, |
|
"learning_rate": 0.0001971631732914674, |
|
"loss": 0.7919, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.7792207792207793, |
|
"grad_norm": 0.3594492971897125, |
|
"learning_rate": 0.0001970126596490106, |
|
"loss": 0.7821, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.7992007992007992, |
|
"grad_norm": 0.37426885962486267, |
|
"learning_rate": 0.0001968583161128631, |
|
"loss": 0.8054, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.8191808191808192, |
|
"grad_norm": 0.3551250398159027, |
|
"learning_rate": 0.00019670014877624353, |
|
"loss": 0.7954, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.8391608391608392, |
|
"grad_norm": 0.35951119661331177, |
|
"learning_rate": 0.0001965381638833274, |
|
"loss": 0.7966, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.8591408591408591, |
|
"grad_norm": 0.36964887380599976, |
|
"learning_rate": 0.000196372367829001, |
|
"loss": 0.7888, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.8791208791208791, |
|
"grad_norm": 0.36829873919487, |
|
"learning_rate": 0.0001962027671586086, |
|
"loss": 0.7902, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.8991008991008991, |
|
"grad_norm": 0.34358304738998413, |
|
"learning_rate": 0.0001960293685676943, |
|
"loss": 0.7733, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.919080919080919, |
|
"grad_norm": 0.37369629740715027, |
|
"learning_rate": 0.0001958521789017376, |
|
"loss": 0.796, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.939060939060939, |
|
"grad_norm": 0.40985429286956787, |
|
"learning_rate": 0.00019567120515588308, |
|
"loss": 0.7931, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.9590409590409591, |
|
"grad_norm": 0.34838569164276123, |
|
"learning_rate": 0.00019548645447466431, |
|
"loss": 0.7682, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.9790209790209791, |
|
"grad_norm": 0.36467525362968445, |
|
"learning_rate": 0.00019529793415172192, |
|
"loss": 0.7781, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.999000999000999, |
|
"grad_norm": 0.37112316489219666, |
|
"learning_rate": 0.00019510565162951537, |
|
"loss": 0.7773, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.018981018981019, |
|
"grad_norm": 0.3998737931251526, |
|
"learning_rate": 0.00019490961449902946, |
|
"loss": 0.7324, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 1.0389610389610389, |
|
"grad_norm": 0.3966336250305176, |
|
"learning_rate": 0.00019470983049947444, |
|
"loss": 0.7395, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 1.058941058941059, |
|
"grad_norm": 0.39721325039863586, |
|
"learning_rate": 0.00019450630751798048, |
|
"loss": 0.7302, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 1.078921078921079, |
|
"grad_norm": 0.38532692193984985, |
|
"learning_rate": 0.00019429905358928646, |
|
"loss": 0.7177, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 1.098901098901099, |
|
"grad_norm": 0.3948540985584259, |
|
"learning_rate": 0.00019408807689542257, |
|
"loss": 0.7382, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 1.118881118881119, |
|
"grad_norm": 0.399676650762558, |
|
"learning_rate": 0.00019387338576538744, |
|
"loss": 0.7286, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 1.138861138861139, |
|
"grad_norm": 0.4208274781703949, |
|
"learning_rate": 0.00019365498867481923, |
|
"loss": 0.7251, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 1.158841158841159, |
|
"grad_norm": 0.4160782992839813, |
|
"learning_rate": 0.00019343289424566122, |
|
"loss": 0.7138, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 1.1788211788211789, |
|
"grad_norm": 0.4297160804271698, |
|
"learning_rate": 0.0001932071112458211, |
|
"loss": 0.7296, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 1.1988011988011988, |
|
"grad_norm": 0.4196039140224457, |
|
"learning_rate": 0.00019297764858882514, |
|
"loss": 0.7091, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.2187812187812188, |
|
"grad_norm": 0.406012624502182, |
|
"learning_rate": 0.00019274451533346615, |
|
"loss": 0.7021, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 1.2387612387612388, |
|
"grad_norm": 0.41200658679008484, |
|
"learning_rate": 0.0001925077206834458, |
|
"loss": 0.7238, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 1.2587412587412588, |
|
"grad_norm": 0.4819345772266388, |
|
"learning_rate": 0.0001922672739870115, |
|
"loss": 0.7275, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 1.2787212787212787, |
|
"grad_norm": 0.40825748443603516, |
|
"learning_rate": 0.00019202318473658705, |
|
"loss": 0.7183, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 1.2987012987012987, |
|
"grad_norm": 0.41940203309059143, |
|
"learning_rate": 0.00019177546256839812, |
|
"loss": 0.7149, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 1.3186813186813187, |
|
"grad_norm": 0.40075168013572693, |
|
"learning_rate": 0.00019152411726209176, |
|
"loss": 0.722, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 1.3386613386613386, |
|
"grad_norm": 0.4254063665866852, |
|
"learning_rate": 0.0001912691587403503, |
|
"loss": 0.7254, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 1.3586413586413586, |
|
"grad_norm": 0.39732539653778076, |
|
"learning_rate": 0.00019101059706849957, |
|
"loss": 0.7115, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 1.3786213786213786, |
|
"grad_norm": 0.3889389932155609, |
|
"learning_rate": 0.0001907484424541117, |
|
"loss": 0.7031, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 1.3986013986013985, |
|
"grad_norm": 0.3994196355342865, |
|
"learning_rate": 0.00019048270524660196, |
|
"loss": 0.7095, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.4185814185814185, |
|
"grad_norm": 0.4238826036453247, |
|
"learning_rate": 0.00019021339593682028, |
|
"loss": 0.7156, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 1.4385614385614387, |
|
"grad_norm": 0.4787987172603607, |
|
"learning_rate": 0.0001899405251566371, |
|
"loss": 0.7142, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 1.4585414585414584, |
|
"grad_norm": 0.4219954013824463, |
|
"learning_rate": 0.00018966410367852362, |
|
"loss": 0.7267, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 1.4785214785214786, |
|
"grad_norm": 0.4154765009880066, |
|
"learning_rate": 0.0001893841424151264, |
|
"loss": 0.721, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 1.4985014985014984, |
|
"grad_norm": 0.44605547189712524, |
|
"learning_rate": 0.0001891006524188368, |
|
"loss": 0.7266, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 1.5184815184815186, |
|
"grad_norm": 0.4613310992717743, |
|
"learning_rate": 0.00018881364488135448, |
|
"loss": 0.7253, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 1.5384615384615383, |
|
"grad_norm": 0.41615426540374756, |
|
"learning_rate": 0.00018852313113324552, |
|
"loss": 0.69, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 1.5584415584415585, |
|
"grad_norm": 0.4512516260147095, |
|
"learning_rate": 0.00018822912264349534, |
|
"loss": 0.7124, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 1.5784215784215783, |
|
"grad_norm": 0.464336633682251, |
|
"learning_rate": 0.00018793163101905563, |
|
"loss": 0.7067, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 1.5984015984015985, |
|
"grad_norm": 0.4427087604999542, |
|
"learning_rate": 0.00018763066800438636, |
|
"loss": 0.7097, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 1.6183816183816184, |
|
"grad_norm": 0.43341028690338135, |
|
"learning_rate": 0.00018732624548099204, |
|
"loss": 0.7068, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 1.6383616383616384, |
|
"grad_norm": 0.4100460112094879, |
|
"learning_rate": 0.0001870183754669526, |
|
"loss": 0.705, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 1.6583416583416584, |
|
"grad_norm": 0.43942147493362427, |
|
"learning_rate": 0.000186707070116449, |
|
"loss": 0.7043, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 1.6783216783216783, |
|
"grad_norm": 0.430095911026001, |
|
"learning_rate": 0.00018639234171928353, |
|
"loss": 0.6989, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 1.6983016983016983, |
|
"grad_norm": 0.40418198704719543, |
|
"learning_rate": 0.0001860742027003944, |
|
"loss": 0.6933, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 1.7182817182817183, |
|
"grad_norm": 0.40910184383392334, |
|
"learning_rate": 0.00018575266561936523, |
|
"loss": 0.6848, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 1.7382617382617382, |
|
"grad_norm": 0.4620640277862549, |
|
"learning_rate": 0.0001854277431699295, |
|
"loss": 0.6943, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 1.7582417582417582, |
|
"grad_norm": 0.4648028314113617, |
|
"learning_rate": 0.00018509944817946922, |
|
"loss": 0.6993, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 1.7782217782217782, |
|
"grad_norm": 0.43752139806747437, |
|
"learning_rate": 0.00018476779360850832, |
|
"loss": 0.6827, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 1.7982017982017982, |
|
"grad_norm": 0.4481639862060547, |
|
"learning_rate": 0.00018443279255020152, |
|
"loss": 0.6978, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 1.8181818181818183, |
|
"grad_norm": 0.4678110182285309, |
|
"learning_rate": 0.00018409445822981693, |
|
"loss": 0.6848, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 1.838161838161838, |
|
"grad_norm": 0.433933824300766, |
|
"learning_rate": 0.0001837528040042142, |
|
"loss": 0.658, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 1.8581418581418583, |
|
"grad_norm": 0.4601323902606964, |
|
"learning_rate": 0.00018340784336131713, |
|
"loss": 0.6912, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 1.878121878121878, |
|
"grad_norm": 0.4591493308544159, |
|
"learning_rate": 0.00018305958991958127, |
|
"loss": 0.697, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 1.8981018981018982, |
|
"grad_norm": 0.445711225271225, |
|
"learning_rate": 0.00018270805742745617, |
|
"loss": 0.6922, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 1.918081918081918, |
|
"grad_norm": 0.43125954270362854, |
|
"learning_rate": 0.00018235325976284275, |
|
"loss": 0.6742, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 1.9380619380619382, |
|
"grad_norm": 0.4716484248638153, |
|
"learning_rate": 0.00018199521093254523, |
|
"loss": 0.6796, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 1.958041958041958, |
|
"grad_norm": 0.4613405764102936, |
|
"learning_rate": 0.00018163392507171842, |
|
"loss": 0.6832, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 1.978021978021978, |
|
"grad_norm": 0.48080363869667053, |
|
"learning_rate": 0.0001812694164433094, |
|
"loss": 0.6807, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 1.9980019980019978, |
|
"grad_norm": 0.47017648816108704, |
|
"learning_rate": 0.00018090169943749476, |
|
"loss": 0.6785, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 2.017982017982018, |
|
"grad_norm": 0.516197919845581, |
|
"learning_rate": 0.0001805307885711122, |
|
"loss": 0.6019, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 2.037962037962038, |
|
"grad_norm": 0.5556052923202515, |
|
"learning_rate": 0.00018015669848708767, |
|
"loss": 0.5906, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 2.057942057942058, |
|
"grad_norm": 0.5169907808303833, |
|
"learning_rate": 0.0001797794439538571, |
|
"loss": 0.6076, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 2.0779220779220777, |
|
"grad_norm": 0.5560281276702881, |
|
"learning_rate": 0.00017939903986478355, |
|
"loss": 0.582, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 2.097902097902098, |
|
"grad_norm": 0.521091878414154, |
|
"learning_rate": 0.00017901550123756906, |
|
"loss": 0.5929, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 2.117882117882118, |
|
"grad_norm": 0.5990195870399475, |
|
"learning_rate": 0.00017862884321366188, |
|
"loss": 0.5863, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 2.137862137862138, |
|
"grad_norm": 0.5285313725471497, |
|
"learning_rate": 0.0001782390810576588, |
|
"loss": 0.5845, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 2.157842157842158, |
|
"grad_norm": 0.5402159690856934, |
|
"learning_rate": 0.00017784623015670238, |
|
"loss": 0.5926, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 2.177822177822178, |
|
"grad_norm": 0.5576025247573853, |
|
"learning_rate": 0.00017745030601987337, |
|
"loss": 0.5964, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 2.197802197802198, |
|
"grad_norm": 0.5605506896972656, |
|
"learning_rate": 0.00017705132427757895, |
|
"loss": 0.5877, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 2.2177822177822177, |
|
"grad_norm": 0.5754747986793518, |
|
"learning_rate": 0.00017664930068093498, |
|
"loss": 0.6002, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 2.237762237762238, |
|
"grad_norm": 0.5654470324516296, |
|
"learning_rate": 0.0001762442511011448, |
|
"loss": 0.5922, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 2.2577422577422577, |
|
"grad_norm": 0.5414491891860962, |
|
"learning_rate": 0.0001758361915288722, |
|
"loss": 0.5917, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 2.277722277722278, |
|
"grad_norm": 0.5563125014305115, |
|
"learning_rate": 0.00017542513807361037, |
|
"loss": 0.5867, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 2.2977022977022976, |
|
"grad_norm": 0.5236257314682007, |
|
"learning_rate": 0.00017501110696304596, |
|
"loss": 0.5888, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 2.317682317682318, |
|
"grad_norm": 0.614734411239624, |
|
"learning_rate": 0.00017459411454241822, |
|
"loss": 0.6001, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 2.3376623376623376, |
|
"grad_norm": 0.605421781539917, |
|
"learning_rate": 0.00017417417727387394, |
|
"loss": 0.5968, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 2.3576423576423577, |
|
"grad_norm": 0.5595569014549255, |
|
"learning_rate": 0.0001737513117358174, |
|
"loss": 0.5924, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 2.3776223776223775, |
|
"grad_norm": 0.5283003449440002, |
|
"learning_rate": 0.00017332553462225602, |
|
"loss": 0.5952, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 2.3976023976023977, |
|
"grad_norm": 0.5287072658538818, |
|
"learning_rate": 0.00017289686274214118, |
|
"loss": 0.5763, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 2.4175824175824174, |
|
"grad_norm": 0.5907203555107117, |
|
"learning_rate": 0.0001724653130187047, |
|
"loss": 0.5993, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 2.4375624375624376, |
|
"grad_norm": 0.5622738003730774, |
|
"learning_rate": 0.0001720309024887907, |
|
"loss": 0.6001, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 2.4575424575424574, |
|
"grad_norm": 0.5795326232910156, |
|
"learning_rate": 0.00017159364830218312, |
|
"loss": 0.5857, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 2.4775224775224776, |
|
"grad_norm": 0.5654671788215637, |
|
"learning_rate": 0.00017115356772092857, |
|
"loss": 0.5809, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 2.4975024975024973, |
|
"grad_norm": 0.5641043186187744, |
|
"learning_rate": 0.00017071067811865476, |
|
"loss": 0.5824, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 2.5174825174825175, |
|
"grad_norm": 0.5851653218269348, |
|
"learning_rate": 0.00017026499697988493, |
|
"loss": 0.59, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 2.5374625374625372, |
|
"grad_norm": 0.570210337638855, |
|
"learning_rate": 0.00016981654189934727, |
|
"loss": 0.5761, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 2.5574425574425574, |
|
"grad_norm": 0.5725647807121277, |
|
"learning_rate": 0.0001693653305812805, |
|
"loss": 0.589, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 2.5774225774225776, |
|
"grad_norm": 0.5896579623222351, |
|
"learning_rate": 0.00016891138083873487, |
|
"loss": 0.5852, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 2.5974025974025974, |
|
"grad_norm": 0.5988901853561401, |
|
"learning_rate": 0.00016845471059286887, |
|
"loss": 0.5723, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 2.617382617382617, |
|
"grad_norm": 0.5854650735855103, |
|
"learning_rate": 0.00016799533787224192, |
|
"loss": 0.5845, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 2.6373626373626373, |
|
"grad_norm": 0.5547802448272705, |
|
"learning_rate": 0.00016753328081210245, |
|
"loss": 0.5909, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 2.6573426573426575, |
|
"grad_norm": 0.5562127232551575, |
|
"learning_rate": 0.000167068557653672, |
|
"loss": 0.5799, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 2.6773226773226773, |
|
"grad_norm": 0.5999246835708618, |
|
"learning_rate": 0.00016660118674342517, |
|
"loss": 0.5757, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 2.6973026973026974, |
|
"grad_norm": 0.5909945368766785, |
|
"learning_rate": 0.00016613118653236518, |
|
"loss": 0.5674, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 2.717282717282717, |
|
"grad_norm": 0.6357455849647522, |
|
"learning_rate": 0.00016565857557529566, |
|
"loss": 0.5821, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 2.7372627372627374, |
|
"grad_norm": 0.6019343733787537, |
|
"learning_rate": 0.0001651833725300879, |
|
"loss": 0.5783, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 2.757242757242757, |
|
"grad_norm": 0.6180288791656494, |
|
"learning_rate": 0.00016470559615694446, |
|
"loss": 0.6056, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 2.7772227772227773, |
|
"grad_norm": 0.6171667575836182, |
|
"learning_rate": 0.00016422526531765846, |
|
"loss": 0.5799, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 2.797202797202797, |
|
"grad_norm": 0.5991246700286865, |
|
"learning_rate": 0.000163742398974869, |
|
"loss": 0.5668, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 2.8171828171828173, |
|
"grad_norm": 0.6568031907081604, |
|
"learning_rate": 0.00016325701619131246, |
|
"loss": 0.5662, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 2.837162837162837, |
|
"grad_norm": 0.6639891266822815, |
|
"learning_rate": 0.00016276913612907007, |
|
"loss": 0.5797, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 2.857142857142857, |
|
"grad_norm": 0.5978193879127502, |
|
"learning_rate": 0.00016227877804881127, |
|
"loss": 0.5613, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 2.8771228771228774, |
|
"grad_norm": 0.576871395111084, |
|
"learning_rate": 0.00016178596130903344, |
|
"loss": 0.5796, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 2.897102897102897, |
|
"grad_norm": 0.5936170220375061, |
|
"learning_rate": 0.00016129070536529766, |
|
"loss": 0.5791, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 2.917082917082917, |
|
"grad_norm": 0.6093722581863403, |
|
"learning_rate": 0.00016079302976946055, |
|
"loss": 0.5836, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 2.937062937062937, |
|
"grad_norm": 0.5815151929855347, |
|
"learning_rate": 0.00016029295416890248, |
|
"loss": 0.5644, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 2.9570429570429573, |
|
"grad_norm": 0.621591329574585, |
|
"learning_rate": 0.0001597904983057519, |
|
"loss": 0.5779, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 2.977022977022977, |
|
"grad_norm": 0.5824622511863708, |
|
"learning_rate": 0.00015928568201610595, |
|
"loss": 0.5659, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 2.9970029970029968, |
|
"grad_norm": 0.6264435052871704, |
|
"learning_rate": 0.00015877852522924732, |
|
"loss": 0.5823, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 3.016983016983017, |
|
"grad_norm": 0.7021110653877258, |
|
"learning_rate": 0.00015826904796685762, |
|
"loss": 0.4732, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 3.036963036963037, |
|
"grad_norm": 0.7195537686347961, |
|
"learning_rate": 0.00015775727034222675, |
|
"loss": 0.4484, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 3.056943056943057, |
|
"grad_norm": 0.7159614562988281, |
|
"learning_rate": 0.0001572432125594591, |
|
"loss": 0.4533, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 3.076923076923077, |
|
"grad_norm": 0.686655580997467, |
|
"learning_rate": 0.00015672689491267567, |
|
"loss": 0.4588, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 3.096903096903097, |
|
"grad_norm": 0.6840978264808655, |
|
"learning_rate": 0.00015620833778521307, |
|
"loss": 0.4632, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 3.116883116883117, |
|
"grad_norm": 0.6888960003852844, |
|
"learning_rate": 0.00015568756164881882, |
|
"loss": 0.463, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 3.136863136863137, |
|
"grad_norm": 0.6887105107307434, |
|
"learning_rate": 0.00015516458706284303, |
|
"loss": 0.4683, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 3.156843156843157, |
|
"grad_norm": 0.6880657076835632, |
|
"learning_rate": 0.00015463943467342693, |
|
"loss": 0.4703, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 3.1768231768231767, |
|
"grad_norm": 0.667488157749176, |
|
"learning_rate": 0.00015411212521268758, |
|
"loss": 0.4681, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 3.196803196803197, |
|
"grad_norm": 0.7201547026634216, |
|
"learning_rate": 0.00015358267949789966, |
|
"loss": 0.4708, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 3.2167832167832167, |
|
"grad_norm": 0.7887006998062134, |
|
"learning_rate": 0.0001530511184306734, |
|
"loss": 0.4692, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 3.236763236763237, |
|
"grad_norm": 0.6850538849830627, |
|
"learning_rate": 0.0001525174629961296, |
|
"loss": 0.4652, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 3.2567432567432566, |
|
"grad_norm": 0.7573882937431335, |
|
"learning_rate": 0.00015198173426207094, |
|
"loss": 0.4618, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 3.276723276723277, |
|
"grad_norm": 0.7027117609977722, |
|
"learning_rate": 0.00015144395337815064, |
|
"loss": 0.4665, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 3.2967032967032965, |
|
"grad_norm": 0.6847530007362366, |
|
"learning_rate": 0.00015090414157503714, |
|
"loss": 0.4669, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 3.3166833166833167, |
|
"grad_norm": 0.7099263072013855, |
|
"learning_rate": 0.0001503623201635761, |
|
"loss": 0.4666, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 3.3366633366633365, |
|
"grad_norm": 0.6803727149963379, |
|
"learning_rate": 0.0001498185105339491, |
|
"loss": 0.4674, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 3.3566433566433567, |
|
"grad_norm": 0.7080752849578857, |
|
"learning_rate": 0.00014927273415482915, |
|
"loss": 0.4694, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 3.3766233766233764, |
|
"grad_norm": 0.7016042470932007, |
|
"learning_rate": 0.00014872501257253323, |
|
"loss": 0.4716, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 3.3966033966033966, |
|
"grad_norm": 0.6896219849586487, |
|
"learning_rate": 0.00014817536741017152, |
|
"loss": 0.4706, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 3.416583416583417, |
|
"grad_norm": 0.7319151163101196, |
|
"learning_rate": 0.0001476238203667939, |
|
"loss": 0.4657, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 3.4365634365634365, |
|
"grad_norm": 0.7796220779418945, |
|
"learning_rate": 0.0001470703932165333, |
|
"loss": 0.4762, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 3.4565434565434563, |
|
"grad_norm": 0.6749796271324158, |
|
"learning_rate": 0.00014651510780774583, |
|
"loss": 0.4602, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 3.4765234765234765, |
|
"grad_norm": 0.6736605167388916, |
|
"learning_rate": 0.00014595798606214882, |
|
"loss": 0.4751, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 3.4965034965034967, |
|
"grad_norm": 0.7386316657066345, |
|
"learning_rate": 0.00014539904997395468, |
|
"loss": 0.4658, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 3.5164835164835164, |
|
"grad_norm": 0.7023107409477234, |
|
"learning_rate": 0.00014483832160900326, |
|
"loss": 0.4678, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 3.5364635364635366, |
|
"grad_norm": 0.6938359141349792, |
|
"learning_rate": 0.0001442758231038902, |
|
"loss": 0.4619, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 3.5564435564435564, |
|
"grad_norm": 0.7815272212028503, |
|
"learning_rate": 0.0001437115766650933, |
|
"loss": 0.4744, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 3.5764235764235766, |
|
"grad_norm": 0.7307267189025879, |
|
"learning_rate": 0.0001431456045680959, |
|
"loss": 0.4767, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 3.5964035964035963, |
|
"grad_norm": 0.6948580741882324, |
|
"learning_rate": 0.00014257792915650728, |
|
"loss": 0.4644, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 3.6163836163836165, |
|
"grad_norm": 0.691348671913147, |
|
"learning_rate": 0.00014200857284118066, |
|
"loss": 0.4609, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 3.6363636363636362, |
|
"grad_norm": 0.7828198671340942, |
|
"learning_rate": 0.00014143755809932845, |
|
"loss": 0.4506, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 3.6563436563436564, |
|
"grad_norm": 0.73238205909729, |
|
"learning_rate": 0.00014086490747363493, |
|
"loss": 0.4599, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 3.676323676323676, |
|
"grad_norm": 0.7216520309448242, |
|
"learning_rate": 0.00014029064357136628, |
|
"loss": 0.4582, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 3.6963036963036964, |
|
"grad_norm": 0.7676394581794739, |
|
"learning_rate": 0.00013971478906347806, |
|
"loss": 0.4494, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 3.716283716283716, |
|
"grad_norm": 0.7596750259399414, |
|
"learning_rate": 0.00013913736668372026, |
|
"loss": 0.4704, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 3.7362637362637363, |
|
"grad_norm": 0.7686085104942322, |
|
"learning_rate": 0.00013855839922773968, |
|
"loss": 0.4603, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 3.756243756243756, |
|
"grad_norm": 0.6850613951683044, |
|
"learning_rate": 0.00013797790955218014, |
|
"loss": 0.4503, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 3.7762237762237763, |
|
"grad_norm": 0.721778392791748, |
|
"learning_rate": 0.00013739592057378003, |
|
"loss": 0.4713, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 3.7962037962037964, |
|
"grad_norm": 0.7122541069984436, |
|
"learning_rate": 0.00013681245526846783, |
|
"loss": 0.4664, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 3.816183816183816, |
|
"grad_norm": 0.7361748218536377, |
|
"learning_rate": 0.00013622753667045457, |
|
"loss": 0.4571, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 3.836163836163836, |
|
"grad_norm": 0.8220844864845276, |
|
"learning_rate": 0.00013564118787132506, |
|
"loss": 0.4521, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 3.856143856143856, |
|
"grad_norm": 0.7139246463775635, |
|
"learning_rate": 0.0001350534320191259, |
|
"loss": 0.4491, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 3.8761238761238763, |
|
"grad_norm": 0.7244653701782227, |
|
"learning_rate": 0.0001344642923174517, |
|
"loss": 0.4552, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 3.896103896103896, |
|
"grad_norm": 0.7056713700294495, |
|
"learning_rate": 0.00013387379202452917, |
|
"loss": 0.4548, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 3.916083916083916, |
|
"grad_norm": 0.7653645277023315, |
|
"learning_rate": 0.00013328195445229868, |
|
"loss": 0.4492, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 3.936063936063936, |
|
"grad_norm": 0.6818165183067322, |
|
"learning_rate": 0.00013268880296549425, |
|
"loss": 0.4463, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 3.956043956043956, |
|
"grad_norm": 0.687439501285553, |
|
"learning_rate": 0.00013209436098072095, |
|
"loss": 0.457, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 3.976023976023976, |
|
"grad_norm": 0.7704656720161438, |
|
"learning_rate": 0.0001314986519655305, |
|
"loss": 0.4522, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 3.996003996003996, |
|
"grad_norm": 0.7227702736854553, |
|
"learning_rate": 0.00013090169943749476, |
|
"loss": 0.4454, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 4.015984015984016, |
|
"grad_norm": 0.8689281344413757, |
|
"learning_rate": 0.00013030352696327742, |
|
"loss": 0.3645, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 4.035964035964036, |
|
"grad_norm": 0.7620906829833984, |
|
"learning_rate": 0.0001297041581577035, |
|
"loss": 0.3478, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 4.055944055944056, |
|
"grad_norm": 0.768671989440918, |
|
"learning_rate": 0.00012910361668282719, |
|
"loss": 0.3595, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 4.075924075924076, |
|
"grad_norm": 0.7327402234077454, |
|
"learning_rate": 0.0001285019262469976, |
|
"loss": 0.3471, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 4.095904095904096, |
|
"grad_norm": 0.6913720965385437, |
|
"learning_rate": 0.00012789911060392294, |
|
"loss": 0.3501, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 4.115884115884116, |
|
"grad_norm": 0.7310584783554077, |
|
"learning_rate": 0.00012729519355173254, |
|
"loss": 0.3509, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 4.135864135864136, |
|
"grad_norm": 0.7578213214874268, |
|
"learning_rate": 0.00012669019893203759, |
|
"loss": 0.3506, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 4.1558441558441555, |
|
"grad_norm": 0.7301665544509888, |
|
"learning_rate": 0.00012608415062898972, |
|
"loss": 0.3536, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 4.175824175824176, |
|
"grad_norm": 0.8198577165603638, |
|
"learning_rate": 0.00012547707256833823, |
|
"loss": 0.3578, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 4.195804195804196, |
|
"grad_norm": 0.7331268787384033, |
|
"learning_rate": 0.0001248689887164855, |
|
"loss": 0.3508, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 4.215784215784216, |
|
"grad_norm": 0.7666186094284058, |
|
"learning_rate": 0.00012425992307954075, |
|
"loss": 0.3468, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 4.235764235764236, |
|
"grad_norm": 0.7020666599273682, |
|
"learning_rate": 0.00012364989970237248, |
|
"loss": 0.3586, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 4.255744255744256, |
|
"grad_norm": 0.7276338338851929, |
|
"learning_rate": 0.00012303894266765908, |
|
"loss": 0.3672, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 4.275724275724276, |
|
"grad_norm": 0.6978778839111328, |
|
"learning_rate": 0.00012242707609493814, |
|
"loss": 0.3576, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 4.2957042957042955, |
|
"grad_norm": 0.822030246257782, |
|
"learning_rate": 0.00012181432413965428, |
|
"loss": 0.3618, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 4.315684315684316, |
|
"grad_norm": 0.744611918926239, |
|
"learning_rate": 0.00012120071099220549, |
|
"loss": 0.3578, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 4.335664335664336, |
|
"grad_norm": 0.7712835669517517, |
|
"learning_rate": 0.00012058626087698814, |
|
"loss": 0.3632, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 4.355644355644356, |
|
"grad_norm": 0.7824398279190063, |
|
"learning_rate": 0.00011997099805144069, |
|
"loss": 0.36, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 4.375624375624375, |
|
"grad_norm": 0.8473492860794067, |
|
"learning_rate": 0.00011935494680508606, |
|
"loss": 0.3645, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 4.395604395604396, |
|
"grad_norm": 0.7394326329231262, |
|
"learning_rate": 0.00011873813145857249, |
|
"loss": 0.3604, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 4.415584415584416, |
|
"grad_norm": 0.763633131980896, |
|
"learning_rate": 0.00011812057636271374, |
|
"loss": 0.3634, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 4.4355644355644355, |
|
"grad_norm": 0.7612594962120056, |
|
"learning_rate": 0.00011750230589752762, |
|
"loss": 0.355, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 4.455544455544455, |
|
"grad_norm": 0.7789061665534973, |
|
"learning_rate": 0.00011688334447127338, |
|
"loss": 0.3629, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 4.475524475524476, |
|
"grad_norm": 0.7422770261764526, |
|
"learning_rate": 0.00011626371651948838, |
|
"loss": 0.361, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 4.495504495504496, |
|
"grad_norm": 0.7636354565620422, |
|
"learning_rate": 0.0001156434465040231, |
|
"loss": 0.3593, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 4.515484515484515, |
|
"grad_norm": 0.7884863615036011, |
|
"learning_rate": 0.00011502255891207572, |
|
"loss": 0.3587, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 4.535464535464535, |
|
"grad_norm": 0.7233232855796814, |
|
"learning_rate": 0.00011440107825522521, |
|
"loss": 0.3577, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 4.555444555444556, |
|
"grad_norm": 0.8420186638832092, |
|
"learning_rate": 0.0001137790290684638, |
|
"loss": 0.3686, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 4.5754245754245755, |
|
"grad_norm": 0.7679941654205322, |
|
"learning_rate": 0.00011315643590922827, |
|
"loss": 0.3539, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 4.595404595404595, |
|
"grad_norm": 0.826885461807251, |
|
"learning_rate": 0.00011253332335643043, |
|
"loss": 0.3627, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 4.615384615384615, |
|
"grad_norm": 0.7590234875679016, |
|
"learning_rate": 0.00011190971600948699, |
|
"loss": 0.3613, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 4.635364635364636, |
|
"grad_norm": 0.7376580238342285, |
|
"learning_rate": 0.00011128563848734816, |
|
"loss": 0.3694, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 4.655344655344655, |
|
"grad_norm": 0.7795658111572266, |
|
"learning_rate": 0.000110661115427526, |
|
"loss": 0.3598, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 4.675324675324675, |
|
"grad_norm": 0.7736489176750183, |
|
"learning_rate": 0.00011003617148512149, |
|
"loss": 0.3598, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 4.695304695304696, |
|
"grad_norm": 0.757072925567627, |
|
"learning_rate": 0.00010941083133185146, |
|
"loss": 0.366, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 4.7152847152847155, |
|
"grad_norm": 0.8167831301689148, |
|
"learning_rate": 0.00010878511965507434, |
|
"loss": 0.3633, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 4.735264735264735, |
|
"grad_norm": 0.8083499670028687, |
|
"learning_rate": 0.00010815906115681578, |
|
"loss": 0.3562, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 4.755244755244755, |
|
"grad_norm": 0.7758758068084717, |
|
"learning_rate": 0.00010753268055279329, |
|
"loss": 0.3614, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 4.775224775224775, |
|
"grad_norm": 0.8572462797164917, |
|
"learning_rate": 0.00010690600257144061, |
|
"loss": 0.3652, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 4.795204795204795, |
|
"grad_norm": 0.8319938778877258, |
|
"learning_rate": 0.00010627905195293135, |
|
"loss": 0.3622, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 4.815184815184815, |
|
"grad_norm": 0.8004459142684937, |
|
"learning_rate": 0.00010565185344820247, |
|
"loss": 0.3604, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 4.835164835164835, |
|
"grad_norm": 0.790908694267273, |
|
"learning_rate": 0.00010502443181797697, |
|
"loss": 0.3587, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 4.8551448551448555, |
|
"grad_norm": 0.7726609110832214, |
|
"learning_rate": 0.0001043968118317865, |
|
"loss": 0.364, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 4.875124875124875, |
|
"grad_norm": 0.7808167338371277, |
|
"learning_rate": 0.00010376901826699348, |
|
"loss": 0.3637, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 4.895104895104895, |
|
"grad_norm": 0.8596636652946472, |
|
"learning_rate": 0.00010314107590781284, |
|
"loss": 0.3536, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 4.915084915084915, |
|
"grad_norm": 0.8091081380844116, |
|
"learning_rate": 0.00010251300954433376, |
|
"loss": 0.3522, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 4.935064935064935, |
|
"grad_norm": 0.8672420978546143, |
|
"learning_rate": 0.00010188484397154084, |
|
"loss": 0.3643, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 4.955044955044955, |
|
"grad_norm": 0.7860444188117981, |
|
"learning_rate": 0.00010125660398833528, |
|
"loss": 0.3493, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 4.975024975024975, |
|
"grad_norm": 0.7510725259780884, |
|
"learning_rate": 0.00010062831439655591, |
|
"loss": 0.3497, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 4.995004995004995, |
|
"grad_norm": 0.7850112915039062, |
|
"learning_rate": 0.0001, |
|
"loss": 0.361, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 5.014985014985015, |
|
"grad_norm": 0.9001740217208862, |
|
"learning_rate": 9.937168560344412e-05, |
|
"loss": 0.2983, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 5.034965034965035, |
|
"grad_norm": 0.683803141117096, |
|
"learning_rate": 9.874339601166473e-05, |
|
"loss": 0.2805, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 5.054945054945055, |
|
"grad_norm": 0.7267177700996399, |
|
"learning_rate": 9.81151560284592e-05, |
|
"loss": 0.2751, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 5.0749250749250745, |
|
"grad_norm": 0.7268999814987183, |
|
"learning_rate": 9.748699045566626e-05, |
|
"loss": 0.2805, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 5.094905094905095, |
|
"grad_norm": 0.6958262324333191, |
|
"learning_rate": 9.685892409218717e-05, |
|
"loss": 0.2808, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 5.114885114885115, |
|
"grad_norm": 0.7481863498687744, |
|
"learning_rate": 9.623098173300654e-05, |
|
"loss": 0.2808, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 5.134865134865135, |
|
"grad_norm": 0.6923096179962158, |
|
"learning_rate": 9.560318816821353e-05, |
|
"loss": 0.2802, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 5.154845154845155, |
|
"grad_norm": 0.8236074447631836, |
|
"learning_rate": 9.497556818202306e-05, |
|
"loss": 0.2845, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 5.174825174825175, |
|
"grad_norm": 0.7225534915924072, |
|
"learning_rate": 9.434814655179755e-05, |
|
"loss": 0.2802, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 5.194805194805195, |
|
"grad_norm": 0.7639855146408081, |
|
"learning_rate": 9.372094804706867e-05, |
|
"loss": 0.2846, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 5.2147852147852145, |
|
"grad_norm": 0.7572929859161377, |
|
"learning_rate": 9.309399742855942e-05, |
|
"loss": 0.2826, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 5.234765234765235, |
|
"grad_norm": 0.8045923709869385, |
|
"learning_rate": 9.246731944720675e-05, |
|
"loss": 0.2862, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 5.254745254745255, |
|
"grad_norm": 0.7385067939758301, |
|
"learning_rate": 9.184093884318425e-05, |
|
"loss": 0.2886, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 5.274725274725275, |
|
"grad_norm": 0.7742624282836914, |
|
"learning_rate": 9.121488034492569e-05, |
|
"loss": 0.2857, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 5.294705294705294, |
|
"grad_norm": 0.73873370885849, |
|
"learning_rate": 9.058916866814858e-05, |
|
"loss": 0.2874, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 5.314685314685315, |
|
"grad_norm": 0.8087053298950195, |
|
"learning_rate": 8.99638285148785e-05, |
|
"loss": 0.2814, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 5.334665334665335, |
|
"grad_norm": 0.7873129844665527, |
|
"learning_rate": 8.933888457247402e-05, |
|
"loss": 0.2827, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 5.3546453546453545, |
|
"grad_norm": 0.776678204536438, |
|
"learning_rate": 8.871436151265184e-05, |
|
"loss": 0.2861, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 5.374625374625374, |
|
"grad_norm": 0.7478957772254944, |
|
"learning_rate": 8.809028399051302e-05, |
|
"loss": 0.2841, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 5.394605394605395, |
|
"grad_norm": 0.7491159439086914, |
|
"learning_rate": 8.746667664356956e-05, |
|
"loss": 0.2781, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 5.414585414585415, |
|
"grad_norm": 0.7022270560264587, |
|
"learning_rate": 8.684356409077176e-05, |
|
"loss": 0.2831, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 5.434565434565434, |
|
"grad_norm": 0.714643120765686, |
|
"learning_rate": 8.62209709315362e-05, |
|
"loss": 0.2816, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 5.454545454545454, |
|
"grad_norm": 0.7695267796516418, |
|
"learning_rate": 8.559892174477479e-05, |
|
"loss": 0.2845, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 5.474525474525475, |
|
"grad_norm": 0.7670512795448303, |
|
"learning_rate": 8.497744108792429e-05, |
|
"loss": 0.284, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 5.4945054945054945, |
|
"grad_norm": 0.7777095437049866, |
|
"learning_rate": 8.435655349597689e-05, |
|
"loss": 0.2849, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 5.514485514485514, |
|
"grad_norm": 0.7117462158203125, |
|
"learning_rate": 8.373628348051165e-05, |
|
"loss": 0.2892, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 5.534465534465534, |
|
"grad_norm": 0.7786485552787781, |
|
"learning_rate": 8.311665552872662e-05, |
|
"loss": 0.2867, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 5.554445554445555, |
|
"grad_norm": 0.7926625609397888, |
|
"learning_rate": 8.249769410247239e-05, |
|
"loss": 0.2862, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 5.574425574425574, |
|
"grad_norm": 0.7426894307136536, |
|
"learning_rate": 8.187942363728625e-05, |
|
"loss": 0.288, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 5.594405594405594, |
|
"grad_norm": 0.7075335383415222, |
|
"learning_rate": 8.126186854142752e-05, |
|
"loss": 0.2847, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 5.614385614385615, |
|
"grad_norm": 0.7743814587593079, |
|
"learning_rate": 8.064505319491398e-05, |
|
"loss": 0.2912, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 5.6343656343656345, |
|
"grad_norm": 0.7679479122161865, |
|
"learning_rate": 8.002900194855932e-05, |
|
"loss": 0.2944, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 5.654345654345654, |
|
"grad_norm": 0.8007961511611938, |
|
"learning_rate": 7.941373912301189e-05, |
|
"loss": 0.2934, |
|
"step": 2830 |
|
}, |
|
{ |
|
"epoch": 5.674325674325674, |
|
"grad_norm": 0.8405194878578186, |
|
"learning_rate": 7.879928900779456e-05, |
|
"loss": 0.2848, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 5.694305694305695, |
|
"grad_norm": 0.7828160524368286, |
|
"learning_rate": 7.818567586034577e-05, |
|
"loss": 0.2932, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 5.714285714285714, |
|
"grad_norm": 0.7869848608970642, |
|
"learning_rate": 7.75729239050619e-05, |
|
"loss": 0.2851, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 5.734265734265734, |
|
"grad_norm": 0.7781445980072021, |
|
"learning_rate": 7.696105733234098e-05, |
|
"loss": 0.2849, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 5.754245754245754, |
|
"grad_norm": 0.8406656980514526, |
|
"learning_rate": 7.635010029762756e-05, |
|
"loss": 0.2854, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 5.7742257742257745, |
|
"grad_norm": 0.7491788864135742, |
|
"learning_rate": 7.574007692045928e-05, |
|
"loss": 0.288, |
|
"step": 2890 |
|
}, |
|
{ |
|
"epoch": 5.794205794205794, |
|
"grad_norm": 0.7962749004364014, |
|
"learning_rate": 7.513101128351454e-05, |
|
"loss": 0.2888, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 5.814185814185814, |
|
"grad_norm": 0.7898345589637756, |
|
"learning_rate": 7.45229274316618e-05, |
|
"loss": 0.2875, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 5.834165834165834, |
|
"grad_norm": 0.7886426448822021, |
|
"learning_rate": 7.391584937101033e-05, |
|
"loss": 0.2947, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 5.854145854145854, |
|
"grad_norm": 0.7488512396812439, |
|
"learning_rate": 7.330980106796246e-05, |
|
"loss": 0.2846, |
|
"step": 2930 |
|
}, |
|
{ |
|
"epoch": 5.874125874125874, |
|
"grad_norm": 0.7348522543907166, |
|
"learning_rate": 7.270480644826749e-05, |
|
"loss": 0.2883, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 5.894105894105894, |
|
"grad_norm": 0.7618998885154724, |
|
"learning_rate": 7.210088939607708e-05, |
|
"loss": 0.2899, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 5.9140859140859146, |
|
"grad_norm": 0.78291255235672, |
|
"learning_rate": 7.149807375300239e-05, |
|
"loss": 0.2865, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 5.934065934065934, |
|
"grad_norm": 0.7446394562721252, |
|
"learning_rate": 7.089638331717284e-05, |
|
"loss": 0.2846, |
|
"step": 2970 |
|
}, |
|
{ |
|
"epoch": 5.954045954045954, |
|
"grad_norm": 0.767301619052887, |
|
"learning_rate": 7.029584184229653e-05, |
|
"loss": 0.2887, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 5.974025974025974, |
|
"grad_norm": 0.7523135542869568, |
|
"learning_rate": 6.969647303672262e-05, |
|
"loss": 0.2873, |
|
"step": 2990 |
|
}, |
|
{ |
|
"epoch": 5.9940059940059935, |
|
"grad_norm": 0.7532919049263, |
|
"learning_rate": 6.909830056250527e-05, |
|
"loss": 0.2882, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 6.013986013986014, |
|
"grad_norm": 0.6552711129188538, |
|
"learning_rate": 6.850134803446954e-05, |
|
"loss": 0.2488, |
|
"step": 3010 |
|
}, |
|
{ |
|
"epoch": 6.033966033966034, |
|
"grad_norm": 0.6565443873405457, |
|
"learning_rate": 6.790563901927907e-05, |
|
"loss": 0.2345, |
|
"step": 3020 |
|
}, |
|
{ |
|
"epoch": 6.053946053946054, |
|
"grad_norm": 0.6884881854057312, |
|
"learning_rate": 6.731119703450577e-05, |
|
"loss": 0.233, |
|
"step": 3030 |
|
}, |
|
{ |
|
"epoch": 6.073926073926074, |
|
"grad_norm": 0.6287186741828918, |
|
"learning_rate": 6.671804554770135e-05, |
|
"loss": 0.2356, |
|
"step": 3040 |
|
}, |
|
{ |
|
"epoch": 6.093906093906094, |
|
"grad_norm": 0.754036545753479, |
|
"learning_rate": 6.612620797547087e-05, |
|
"loss": 0.2352, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 6.113886113886114, |
|
"grad_norm": 0.6492979526519775, |
|
"learning_rate": 6.55357076825483e-05, |
|
"loss": 0.2329, |
|
"step": 3060 |
|
}, |
|
{ |
|
"epoch": 6.1338661338661336, |
|
"grad_norm": 0.6303039789199829, |
|
"learning_rate": 6.494656798087412e-05, |
|
"loss": 0.2339, |
|
"step": 3070 |
|
}, |
|
{ |
|
"epoch": 6.153846153846154, |
|
"grad_norm": 0.6423007845878601, |
|
"learning_rate": 6.435881212867493e-05, |
|
"loss": 0.2377, |
|
"step": 3080 |
|
}, |
|
{ |
|
"epoch": 6.173826173826174, |
|
"grad_norm": 0.6716975569725037, |
|
"learning_rate": 6.377246332954544e-05, |
|
"loss": 0.2365, |
|
"step": 3090 |
|
}, |
|
{ |
|
"epoch": 6.193806193806194, |
|
"grad_norm": 0.6927747130393982, |
|
"learning_rate": 6.318754473153221e-05, |
|
"loss": 0.2346, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 6.213786213786213, |
|
"grad_norm": 0.6551555395126343, |
|
"learning_rate": 6.260407942621998e-05, |
|
"loss": 0.235, |
|
"step": 3110 |
|
}, |
|
{ |
|
"epoch": 6.233766233766234, |
|
"grad_norm": 0.7131916284561157, |
|
"learning_rate": 6.20220904478199e-05, |
|
"loss": 0.2401, |
|
"step": 3120 |
|
}, |
|
{ |
|
"epoch": 6.253746253746254, |
|
"grad_norm": 0.7002174258232117, |
|
"learning_rate": 6.144160077226036e-05, |
|
"loss": 0.2398, |
|
"step": 3130 |
|
}, |
|
{ |
|
"epoch": 6.273726273726274, |
|
"grad_norm": 0.7129354476928711, |
|
"learning_rate": 6.086263331627976e-05, |
|
"loss": 0.2401, |
|
"step": 3140 |
|
}, |
|
{ |
|
"epoch": 6.293706293706293, |
|
"grad_norm": 0.6942778825759888, |
|
"learning_rate": 6.0285210936521955e-05, |
|
"loss": 0.2391, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 6.313686313686314, |
|
"grad_norm": 0.7181575298309326, |
|
"learning_rate": 5.9709356428633746e-05, |
|
"loss": 0.2434, |
|
"step": 3160 |
|
}, |
|
{ |
|
"epoch": 6.333666333666334, |
|
"grad_norm": 0.720330536365509, |
|
"learning_rate": 5.913509252636511e-05, |
|
"loss": 0.2352, |
|
"step": 3170 |
|
}, |
|
{ |
|
"epoch": 6.353646353646353, |
|
"grad_norm": 0.6518005728721619, |
|
"learning_rate": 5.856244190067159e-05, |
|
"loss": 0.2377, |
|
"step": 3180 |
|
}, |
|
{ |
|
"epoch": 6.373626373626374, |
|
"grad_norm": 0.6705808639526367, |
|
"learning_rate": 5.799142715881938e-05, |
|
"loss": 0.2416, |
|
"step": 3190 |
|
}, |
|
{ |
|
"epoch": 6.393606393606394, |
|
"grad_norm": 0.7210578322410583, |
|
"learning_rate": 5.7422070843492734e-05, |
|
"loss": 0.2406, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 6.413586413586414, |
|
"grad_norm": 0.6428204774856567, |
|
"learning_rate": 5.6854395431904094e-05, |
|
"loss": 0.2397, |
|
"step": 3210 |
|
}, |
|
{ |
|
"epoch": 6.433566433566433, |
|
"grad_norm": 0.697733461856842, |
|
"learning_rate": 5.6288423334906735e-05, |
|
"loss": 0.2425, |
|
"step": 3220 |
|
}, |
|
{ |
|
"epoch": 6.453546453546454, |
|
"grad_norm": 0.7867773175239563, |
|
"learning_rate": 5.572417689610987e-05, |
|
"loss": 0.2401, |
|
"step": 3230 |
|
}, |
|
{ |
|
"epoch": 6.473526473526474, |
|
"grad_norm": 0.6750375032424927, |
|
"learning_rate": 5.5161678390996796e-05, |
|
"loss": 0.2396, |
|
"step": 3240 |
|
}, |
|
{ |
|
"epoch": 6.4935064935064934, |
|
"grad_norm": 0.677237868309021, |
|
"learning_rate": 5.4600950026045326e-05, |
|
"loss": 0.2434, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 6.513486513486513, |
|
"grad_norm": 0.6781632304191589, |
|
"learning_rate": 5.404201393785122e-05, |
|
"loss": 0.2454, |
|
"step": 3260 |
|
}, |
|
{ |
|
"epoch": 6.533466533466534, |
|
"grad_norm": 0.7506418824195862, |
|
"learning_rate": 5.348489219225416e-05, |
|
"loss": 0.2397, |
|
"step": 3270 |
|
}, |
|
{ |
|
"epoch": 6.553446553446554, |
|
"grad_norm": 0.7256707549095154, |
|
"learning_rate": 5.292960678346675e-05, |
|
"loss": 0.2403, |
|
"step": 3280 |
|
}, |
|
{ |
|
"epoch": 6.573426573426573, |
|
"grad_norm": 0.664169430732727, |
|
"learning_rate": 5.237617963320608e-05, |
|
"loss": 0.2392, |
|
"step": 3290 |
|
}, |
|
{ |
|
"epoch": 6.593406593406593, |
|
"grad_norm": 0.7900999188423157, |
|
"learning_rate": 5.182463258982846e-05, |
|
"loss": 0.2426, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 6.613386613386614, |
|
"grad_norm": 0.7012047171592712, |
|
"learning_rate": 5.127498742746675e-05, |
|
"loss": 0.2429, |
|
"step": 3310 |
|
}, |
|
{ |
|
"epoch": 6.6333666333666335, |
|
"grad_norm": 0.752498984336853, |
|
"learning_rate": 5.072726584517086e-05, |
|
"loss": 0.2425, |
|
"step": 3320 |
|
}, |
|
{ |
|
"epoch": 6.653346653346653, |
|
"grad_norm": 0.7256404161453247, |
|
"learning_rate": 5.018148946605092e-05, |
|
"loss": 0.2381, |
|
"step": 3330 |
|
}, |
|
{ |
|
"epoch": 6.673326673326673, |
|
"grad_norm": 0.6938993334770203, |
|
"learning_rate": 4.9637679836423924e-05, |
|
"loss": 0.2428, |
|
"step": 3340 |
|
}, |
|
{ |
|
"epoch": 6.693306693306694, |
|
"grad_norm": 0.7288166284561157, |
|
"learning_rate": 4.909585842496287e-05, |
|
"loss": 0.2409, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 6.713286713286713, |
|
"grad_norm": 0.7148503661155701, |
|
"learning_rate": 4.8556046621849346e-05, |
|
"loss": 0.2402, |
|
"step": 3360 |
|
}, |
|
{ |
|
"epoch": 6.733266733266733, |
|
"grad_norm": 0.7477458715438843, |
|
"learning_rate": 4.8018265737929044e-05, |
|
"loss": 0.2394, |
|
"step": 3370 |
|
}, |
|
{ |
|
"epoch": 6.753246753246753, |
|
"grad_norm": 0.7404049634933472, |
|
"learning_rate": 4.748253700387042e-05, |
|
"loss": 0.2422, |
|
"step": 3380 |
|
}, |
|
{ |
|
"epoch": 6.7732267732267735, |
|
"grad_norm": 0.6715726852416992, |
|
"learning_rate": 4.694888156932658e-05, |
|
"loss": 0.2405, |
|
"step": 3390 |
|
}, |
|
{ |
|
"epoch": 6.793206793206793, |
|
"grad_norm": 0.6998412609100342, |
|
"learning_rate": 4.6417320502100316e-05, |
|
"loss": 0.2405, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 6.813186813186813, |
|
"grad_norm": 0.7061425447463989, |
|
"learning_rate": 4.588787478731242e-05, |
|
"loss": 0.2368, |
|
"step": 3410 |
|
}, |
|
{ |
|
"epoch": 6.833166833166834, |
|
"grad_norm": 0.7432896494865417, |
|
"learning_rate": 4.5360565326573104e-05, |
|
"loss": 0.2399, |
|
"step": 3420 |
|
}, |
|
{ |
|
"epoch": 6.853146853146853, |
|
"grad_norm": 0.7876798510551453, |
|
"learning_rate": 4.483541293715698e-05, |
|
"loss": 0.2395, |
|
"step": 3430 |
|
}, |
|
{ |
|
"epoch": 6.873126873126873, |
|
"grad_norm": 0.7446125149726868, |
|
"learning_rate": 4.431243835118124e-05, |
|
"loss": 0.241, |
|
"step": 3440 |
|
}, |
|
{ |
|
"epoch": 6.893106893106893, |
|
"grad_norm": 0.6832261085510254, |
|
"learning_rate": 4.379166221478697e-05, |
|
"loss": 0.2396, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 6.913086913086913, |
|
"grad_norm": 0.7039461135864258, |
|
"learning_rate": 4.327310508732437e-05, |
|
"loss": 0.2408, |
|
"step": 3460 |
|
}, |
|
{ |
|
"epoch": 6.933066933066933, |
|
"grad_norm": 0.7428474426269531, |
|
"learning_rate": 4.2756787440540936e-05, |
|
"loss": 0.2407, |
|
"step": 3470 |
|
}, |
|
{ |
|
"epoch": 6.953046953046953, |
|
"grad_norm": 0.7313565015792847, |
|
"learning_rate": 4.224272965777326e-05, |
|
"loss": 0.2406, |
|
"step": 3480 |
|
}, |
|
{ |
|
"epoch": 6.973026973026973, |
|
"grad_norm": 0.7175894975662231, |
|
"learning_rate": 4.173095203314241e-05, |
|
"loss": 0.2409, |
|
"step": 3490 |
|
}, |
|
{ |
|
"epoch": 6.993006993006993, |
|
"grad_norm": 0.6897133588790894, |
|
"learning_rate": 4.12214747707527e-05, |
|
"loss": 0.2389, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 7.012987012987013, |
|
"grad_norm": 0.5959777235984802, |
|
"learning_rate": 4.071431798389408e-05, |
|
"loss": 0.2184, |
|
"step": 3510 |
|
}, |
|
{ |
|
"epoch": 7.032967032967033, |
|
"grad_norm": 0.7147582173347473, |
|
"learning_rate": 4.020950169424815e-05, |
|
"loss": 0.2087, |
|
"step": 3520 |
|
}, |
|
{ |
|
"epoch": 7.052947052947053, |
|
"grad_norm": 0.6122413873672485, |
|
"learning_rate": 3.9707045831097555e-05, |
|
"loss": 0.2106, |
|
"step": 3530 |
|
}, |
|
{ |
|
"epoch": 7.072927072927073, |
|
"grad_norm": 0.633969783782959, |
|
"learning_rate": 3.920697023053949e-05, |
|
"loss": 0.2099, |
|
"step": 3540 |
|
}, |
|
{ |
|
"epoch": 7.092907092907093, |
|
"grad_norm": 0.6842843890190125, |
|
"learning_rate": 3.8709294634702376e-05, |
|
"loss": 0.2104, |
|
"step": 3550 |
|
}, |
|
{ |
|
"epoch": 7.112887112887113, |
|
"grad_norm": 0.5708280205726624, |
|
"learning_rate": 3.821403869096658e-05, |
|
"loss": 0.2125, |
|
"step": 3560 |
|
}, |
|
{ |
|
"epoch": 7.1328671328671325, |
|
"grad_norm": 0.6579930782318115, |
|
"learning_rate": 3.7721221951188765e-05, |
|
"loss": 0.2107, |
|
"step": 3570 |
|
}, |
|
{ |
|
"epoch": 7.152847152847153, |
|
"grad_norm": 0.5980693101882935, |
|
"learning_rate": 3.7230863870929964e-05, |
|
"loss": 0.2085, |
|
"step": 3580 |
|
}, |
|
{ |
|
"epoch": 7.172827172827173, |
|
"grad_norm": 0.5968551635742188, |
|
"learning_rate": 3.674298380868756e-05, |
|
"loss": 0.209, |
|
"step": 3590 |
|
}, |
|
{ |
|
"epoch": 7.192807192807193, |
|
"grad_norm": 0.6218951940536499, |
|
"learning_rate": 3.6257601025131026e-05, |
|
"loss": 0.2095, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 7.212787212787212, |
|
"grad_norm": 0.6248393058776855, |
|
"learning_rate": 3.577473468234156e-05, |
|
"loss": 0.2155, |
|
"step": 3610 |
|
}, |
|
{ |
|
"epoch": 7.232767232767233, |
|
"grad_norm": 0.6496105194091797, |
|
"learning_rate": 3.52944038430556e-05, |
|
"loss": 0.2139, |
|
"step": 3620 |
|
}, |
|
{ |
|
"epoch": 7.252747252747253, |
|
"grad_norm": 0.6064103841781616, |
|
"learning_rate": 3.481662746991214e-05, |
|
"loss": 0.2081, |
|
"step": 3630 |
|
}, |
|
{ |
|
"epoch": 7.2727272727272725, |
|
"grad_norm": 0.6504641771316528, |
|
"learning_rate": 3.4341424424704375e-05, |
|
"loss": 0.2111, |
|
"step": 3640 |
|
}, |
|
{ |
|
"epoch": 7.292707292707293, |
|
"grad_norm": 0.6580168604850769, |
|
"learning_rate": 3.386881346763483e-05, |
|
"loss": 0.2123, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 7.312687312687313, |
|
"grad_norm": 0.5861549973487854, |
|
"learning_rate": 3.339881325657484e-05, |
|
"loss": 0.2084, |
|
"step": 3660 |
|
}, |
|
{ |
|
"epoch": 7.332667332667333, |
|
"grad_norm": 0.6313382387161255, |
|
"learning_rate": 3.2931442346328004e-05, |
|
"loss": 0.2078, |
|
"step": 3670 |
|
}, |
|
{ |
|
"epoch": 7.352647352647352, |
|
"grad_norm": 0.646842896938324, |
|
"learning_rate": 3.246671918789755e-05, |
|
"loss": 0.2135, |
|
"step": 3680 |
|
}, |
|
{ |
|
"epoch": 7.372627372627373, |
|
"grad_norm": 0.6964268088340759, |
|
"learning_rate": 3.200466212775808e-05, |
|
"loss": 0.2126, |
|
"step": 3690 |
|
}, |
|
{ |
|
"epoch": 7.392607392607393, |
|
"grad_norm": 0.6139673590660095, |
|
"learning_rate": 3.154528940713113e-05, |
|
"loss": 0.215, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 7.4125874125874125, |
|
"grad_norm": 0.6455628871917725, |
|
"learning_rate": 3.108861916126518e-05, |
|
"loss": 0.2114, |
|
"step": 3710 |
|
}, |
|
{ |
|
"epoch": 7.432567432567432, |
|
"grad_norm": 0.6227108240127563, |
|
"learning_rate": 3.063466941871952e-05, |
|
"loss": 0.2114, |
|
"step": 3720 |
|
}, |
|
{ |
|
"epoch": 7.452547452547453, |
|
"grad_norm": 0.5858675837516785, |
|
"learning_rate": 3.018345810065275e-05, |
|
"loss": 0.2107, |
|
"step": 3730 |
|
}, |
|
{ |
|
"epoch": 7.472527472527473, |
|
"grad_norm": 0.6218124628067017, |
|
"learning_rate": 2.9735003020115092e-05, |
|
"loss": 0.2115, |
|
"step": 3740 |
|
}, |
|
{ |
|
"epoch": 7.492507492507492, |
|
"grad_norm": 0.6510396003723145, |
|
"learning_rate": 2.9289321881345254e-05, |
|
"loss": 0.2124, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 7.512487512487512, |
|
"grad_norm": 0.6465820074081421, |
|
"learning_rate": 2.8846432279071467e-05, |
|
"loss": 0.2132, |
|
"step": 3760 |
|
}, |
|
{ |
|
"epoch": 7.532467532467533, |
|
"grad_norm": 0.7002317905426025, |
|
"learning_rate": 2.840635169781688e-05, |
|
"loss": 0.2129, |
|
"step": 3770 |
|
}, |
|
{ |
|
"epoch": 7.5524475524475525, |
|
"grad_norm": 0.647723913192749, |
|
"learning_rate": 2.7969097511209308e-05, |
|
"loss": 0.2136, |
|
"step": 3780 |
|
}, |
|
{ |
|
"epoch": 7.572427572427572, |
|
"grad_norm": 0.5907153487205505, |
|
"learning_rate": 2.753468698129533e-05, |
|
"loss": 0.2115, |
|
"step": 3790 |
|
}, |
|
{ |
|
"epoch": 7.592407592407593, |
|
"grad_norm": 0.6074231863021851, |
|
"learning_rate": 2.7103137257858868e-05, |
|
"loss": 0.2128, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 7.612387612387613, |
|
"grad_norm": 0.6356890797615051, |
|
"learning_rate": 2.6674465377744017e-05, |
|
"loss": 0.2108, |
|
"step": 3810 |
|
}, |
|
{ |
|
"epoch": 7.632367632367632, |
|
"grad_norm": 0.6739248633384705, |
|
"learning_rate": 2.624868826418262e-05, |
|
"loss": 0.2129, |
|
"step": 3820 |
|
}, |
|
{ |
|
"epoch": 7.652347652347652, |
|
"grad_norm": 0.6241906881332397, |
|
"learning_rate": 2.582582272612609e-05, |
|
"loss": 0.211, |
|
"step": 3830 |
|
}, |
|
{ |
|
"epoch": 7.672327672327672, |
|
"grad_norm": 0.6532058715820312, |
|
"learning_rate": 2.540588545758179e-05, |
|
"loss": 0.2137, |
|
"step": 3840 |
|
}, |
|
{ |
|
"epoch": 7.6923076923076925, |
|
"grad_norm": 0.7098828554153442, |
|
"learning_rate": 2.4988893036954043e-05, |
|
"loss": 0.2105, |
|
"step": 3850 |
|
}, |
|
{ |
|
"epoch": 7.712287712287712, |
|
"grad_norm": 0.6868453025817871, |
|
"learning_rate": 2.4574861926389615e-05, |
|
"loss": 0.214, |
|
"step": 3860 |
|
}, |
|
{ |
|
"epoch": 7.732267732267732, |
|
"grad_norm": 0.6777834296226501, |
|
"learning_rate": 2.4163808471127812e-05, |
|
"loss": 0.2125, |
|
"step": 3870 |
|
}, |
|
{ |
|
"epoch": 7.752247752247753, |
|
"grad_norm": 0.6967138648033142, |
|
"learning_rate": 2.37557488988552e-05, |
|
"loss": 0.2118, |
|
"step": 3880 |
|
}, |
|
{ |
|
"epoch": 7.772227772227772, |
|
"grad_norm": 0.6641217470169067, |
|
"learning_rate": 2.3350699319065026e-05, |
|
"loss": 0.2134, |
|
"step": 3890 |
|
}, |
|
{ |
|
"epoch": 7.792207792207792, |
|
"grad_norm": 0.6727011799812317, |
|
"learning_rate": 2.2948675722421086e-05, |
|
"loss": 0.217, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 7.812187812187812, |
|
"grad_norm": 0.6331846117973328, |
|
"learning_rate": 2.254969398012663e-05, |
|
"loss": 0.2127, |
|
"step": 3910 |
|
}, |
|
{ |
|
"epoch": 7.8321678321678325, |
|
"grad_norm": 0.6486308574676514, |
|
"learning_rate": 2.2153769843297667e-05, |
|
"loss": 0.2096, |
|
"step": 3920 |
|
}, |
|
{ |
|
"epoch": 7.852147852147852, |
|
"grad_norm": 0.6658995151519775, |
|
"learning_rate": 2.1760918942341192e-05, |
|
"loss": 0.211, |
|
"step": 3930 |
|
}, |
|
{ |
|
"epoch": 7.872127872127872, |
|
"grad_norm": 0.687493085861206, |
|
"learning_rate": 2.137115678633811e-05, |
|
"loss": 0.2163, |
|
"step": 3940 |
|
}, |
|
{ |
|
"epoch": 7.892107892107892, |
|
"grad_norm": 0.6267641186714172, |
|
"learning_rate": 2.098449876243096e-05, |
|
"loss": 0.2142, |
|
"step": 3950 |
|
}, |
|
{ |
|
"epoch": 7.912087912087912, |
|
"grad_norm": 0.6141098141670227, |
|
"learning_rate": 2.0600960135216462e-05, |
|
"loss": 0.2134, |
|
"step": 3960 |
|
}, |
|
{ |
|
"epoch": 7.932067932067932, |
|
"grad_norm": 0.6436827182769775, |
|
"learning_rate": 2.0220556046142893e-05, |
|
"loss": 0.214, |
|
"step": 3970 |
|
}, |
|
{ |
|
"epoch": 7.952047952047952, |
|
"grad_norm": 0.6543010473251343, |
|
"learning_rate": 1.9843301512912327e-05, |
|
"loss": 0.2126, |
|
"step": 3980 |
|
}, |
|
{ |
|
"epoch": 7.972027972027972, |
|
"grad_norm": 0.6083731651306152, |
|
"learning_rate": 1.946921142888781e-05, |
|
"loss": 0.2135, |
|
"step": 3990 |
|
}, |
|
{ |
|
"epoch": 7.992007992007992, |
|
"grad_norm": 0.6408571600914001, |
|
"learning_rate": 1.9098300562505266e-05, |
|
"loss": 0.2123, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 8.011988011988011, |
|
"grad_norm": 0.556982159614563, |
|
"learning_rate": 1.8730583556690605e-05, |
|
"loss": 0.2042, |
|
"step": 4010 |
|
}, |
|
{ |
|
"epoch": 8.031968031968033, |
|
"grad_norm": 0.5726343393325806, |
|
"learning_rate": 1.8366074928281607e-05, |
|
"loss": 0.1941, |
|
"step": 4020 |
|
}, |
|
{ |
|
"epoch": 8.051948051948052, |
|
"grad_norm": 0.5825814604759216, |
|
"learning_rate": 1.8004789067454764e-05, |
|
"loss": 0.1976, |
|
"step": 4030 |
|
}, |
|
{ |
|
"epoch": 8.071928071928072, |
|
"grad_norm": 0.569325864315033, |
|
"learning_rate": 1.7646740237157256e-05, |
|
"loss": 0.196, |
|
"step": 4040 |
|
}, |
|
{ |
|
"epoch": 8.091908091908092, |
|
"grad_norm": 0.5917354226112366, |
|
"learning_rate": 1.7291942572543807e-05, |
|
"loss": 0.195, |
|
"step": 4050 |
|
}, |
|
{ |
|
"epoch": 8.111888111888112, |
|
"grad_norm": 0.5817933678627014, |
|
"learning_rate": 1.6940410080418723e-05, |
|
"loss": 0.1971, |
|
"step": 4060 |
|
}, |
|
{ |
|
"epoch": 8.131868131868131, |
|
"grad_norm": 0.6475218534469604, |
|
"learning_rate": 1.6592156638682886e-05, |
|
"loss": 0.197, |
|
"step": 4070 |
|
}, |
|
{ |
|
"epoch": 8.151848151848151, |
|
"grad_norm": 0.6248770356178284, |
|
"learning_rate": 1.6247195995785837e-05, |
|
"loss": 0.1971, |
|
"step": 4080 |
|
}, |
|
{ |
|
"epoch": 8.171828171828173, |
|
"grad_norm": 0.5749895572662354, |
|
"learning_rate": 1.5905541770183096e-05, |
|
"loss": 0.1964, |
|
"step": 4090 |
|
}, |
|
{ |
|
"epoch": 8.191808191808192, |
|
"grad_norm": 0.6148300766944885, |
|
"learning_rate": 1.5567207449798515e-05, |
|
"loss": 0.1966, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 8.211788211788212, |
|
"grad_norm": 0.6778724789619446, |
|
"learning_rate": 1.5232206391491699e-05, |
|
"loss": 0.1955, |
|
"step": 4110 |
|
}, |
|
{ |
|
"epoch": 8.231768231768232, |
|
"grad_norm": 0.5883269906044006, |
|
"learning_rate": 1.4900551820530828e-05, |
|
"loss": 0.1919, |
|
"step": 4120 |
|
}, |
|
{ |
|
"epoch": 8.251748251748252, |
|
"grad_norm": 0.567950963973999, |
|
"learning_rate": 1.4572256830070497e-05, |
|
"loss": 0.1966, |
|
"step": 4130 |
|
}, |
|
{ |
|
"epoch": 8.271728271728271, |
|
"grad_norm": 0.5733300447463989, |
|
"learning_rate": 1.4247334380634792e-05, |
|
"loss": 0.1964, |
|
"step": 4140 |
|
}, |
|
{ |
|
"epoch": 8.291708291708291, |
|
"grad_norm": 0.638990044593811, |
|
"learning_rate": 1.3925797299605647e-05, |
|
"loss": 0.1944, |
|
"step": 4150 |
|
}, |
|
{ |
|
"epoch": 8.311688311688311, |
|
"grad_norm": 0.6272343397140503, |
|
"learning_rate": 1.3607658280716473e-05, |
|
"loss": 0.1951, |
|
"step": 4160 |
|
}, |
|
{ |
|
"epoch": 8.331668331668332, |
|
"grad_norm": 0.5631300210952759, |
|
"learning_rate": 1.3292929883550998e-05, |
|
"loss": 0.1983, |
|
"step": 4170 |
|
}, |
|
{ |
|
"epoch": 8.351648351648352, |
|
"grad_norm": 0.6056917309761047, |
|
"learning_rate": 1.2981624533047432e-05, |
|
"loss": 0.1976, |
|
"step": 4180 |
|
}, |
|
{ |
|
"epoch": 8.371628371628372, |
|
"grad_norm": 0.6021771430969238, |
|
"learning_rate": 1.2673754519008008e-05, |
|
"loss": 0.1968, |
|
"step": 4190 |
|
}, |
|
{ |
|
"epoch": 8.391608391608392, |
|
"grad_norm": 0.5835386514663696, |
|
"learning_rate": 1.2369331995613665e-05, |
|
"loss": 0.1977, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 8.411588411588411, |
|
"grad_norm": 0.5700567960739136, |
|
"learning_rate": 1.206836898094439e-05, |
|
"loss": 0.1992, |
|
"step": 4210 |
|
}, |
|
{ |
|
"epoch": 8.431568431568431, |
|
"grad_norm": 0.6391722559928894, |
|
"learning_rate": 1.1770877356504683e-05, |
|
"loss": 0.1977, |
|
"step": 4220 |
|
}, |
|
{ |
|
"epoch": 8.451548451548451, |
|
"grad_norm": 0.5633198022842407, |
|
"learning_rate": 1.1476868866754486e-05, |
|
"loss": 0.1975, |
|
"step": 4230 |
|
}, |
|
{ |
|
"epoch": 8.471528471528472, |
|
"grad_norm": 0.6308007836341858, |
|
"learning_rate": 1.1186355118645554e-05, |
|
"loss": 0.2002, |
|
"step": 4240 |
|
}, |
|
{ |
|
"epoch": 8.491508491508492, |
|
"grad_norm": 0.6147842407226562, |
|
"learning_rate": 1.0899347581163221e-05, |
|
"loss": 0.199, |
|
"step": 4250 |
|
}, |
|
{ |
|
"epoch": 8.511488511488512, |
|
"grad_norm": 0.6099655628204346, |
|
"learning_rate": 1.0615857584873623e-05, |
|
"loss": 0.1971, |
|
"step": 4260 |
|
}, |
|
{ |
|
"epoch": 8.531468531468532, |
|
"grad_norm": 0.6306450366973877, |
|
"learning_rate": 1.0335896321476413e-05, |
|
"loss": 0.1971, |
|
"step": 4270 |
|
}, |
|
{ |
|
"epoch": 8.551448551448551, |
|
"grad_norm": 0.5740554928779602, |
|
"learning_rate": 1.0059474843362892e-05, |
|
"loss": 0.1964, |
|
"step": 4280 |
|
}, |
|
{ |
|
"epoch": 8.571428571428571, |
|
"grad_norm": 0.566005289554596, |
|
"learning_rate": 9.786604063179728e-06, |
|
"loss": 0.197, |
|
"step": 4290 |
|
}, |
|
{ |
|
"epoch": 8.591408591408591, |
|
"grad_norm": 0.6008467674255371, |
|
"learning_rate": 9.517294753398064e-06, |
|
"loss": 0.1969, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 8.61138861138861, |
|
"grad_norm": 0.5880402326583862, |
|
"learning_rate": 9.251557545888312e-06, |
|
"loss": 0.1944, |
|
"step": 4310 |
|
}, |
|
{ |
|
"epoch": 8.631368631368632, |
|
"grad_norm": 0.6250616908073425, |
|
"learning_rate": 8.989402931500434e-06, |
|
"loss": 0.1978, |
|
"step": 4320 |
|
}, |
|
{ |
|
"epoch": 8.651348651348652, |
|
"grad_norm": 0.554460883140564, |
|
"learning_rate": 8.730841259649725e-06, |
|
"loss": 0.1998, |
|
"step": 4330 |
|
}, |
|
{ |
|
"epoch": 8.671328671328672, |
|
"grad_norm": 0.5680242776870728, |
|
"learning_rate": 8.475882737908248e-06, |
|
"loss": 0.2, |
|
"step": 4340 |
|
}, |
|
{ |
|
"epoch": 8.691308691308691, |
|
"grad_norm": 0.5889159440994263, |
|
"learning_rate": 8.224537431601886e-06, |
|
"loss": 0.1985, |
|
"step": 4350 |
|
}, |
|
{ |
|
"epoch": 8.711288711288711, |
|
"grad_norm": 0.6051207780838013, |
|
"learning_rate": 7.976815263412963e-06, |
|
"loss": 0.1944, |
|
"step": 4360 |
|
}, |
|
{ |
|
"epoch": 8.731268731268731, |
|
"grad_norm": 0.6148102283477783, |
|
"learning_rate": 7.73272601298851e-06, |
|
"loss": 0.1952, |
|
"step": 4370 |
|
}, |
|
{ |
|
"epoch": 8.75124875124875, |
|
"grad_norm": 0.6123753786087036, |
|
"learning_rate": 7.492279316554207e-06, |
|
"loss": 0.1955, |
|
"step": 4380 |
|
}, |
|
{ |
|
"epoch": 8.77122877122877, |
|
"grad_norm": 0.5911871790885925, |
|
"learning_rate": 7.255484666533874e-06, |
|
"loss": 0.1987, |
|
"step": 4390 |
|
}, |
|
{ |
|
"epoch": 8.791208791208792, |
|
"grad_norm": 0.5861064195632935, |
|
"learning_rate": 7.022351411174866e-06, |
|
"loss": 0.1972, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 8.811188811188812, |
|
"grad_norm": 0.6565813422203064, |
|
"learning_rate": 6.7928887541789055e-06, |
|
"loss": 0.1966, |
|
"step": 4410 |
|
}, |
|
{ |
|
"epoch": 8.831168831168831, |
|
"grad_norm": 0.6338573694229126, |
|
"learning_rate": 6.5671057543387985e-06, |
|
"loss": 0.1987, |
|
"step": 4420 |
|
}, |
|
{ |
|
"epoch": 8.851148851148851, |
|
"grad_norm": 0.5672295093536377, |
|
"learning_rate": 6.345011325180772e-06, |
|
"loss": 0.198, |
|
"step": 4430 |
|
}, |
|
{ |
|
"epoch": 8.871128871128871, |
|
"grad_norm": 0.6036155223846436, |
|
"learning_rate": 6.126614234612593e-06, |
|
"loss": 0.199, |
|
"step": 4440 |
|
}, |
|
{ |
|
"epoch": 8.89110889110889, |
|
"grad_norm": 0.5816395878791809, |
|
"learning_rate": 5.911923104577455e-06, |
|
"loss": 0.1985, |
|
"step": 4450 |
|
}, |
|
{ |
|
"epoch": 8.91108891108891, |
|
"grad_norm": 0.5562584400177002, |
|
"learning_rate": 5.700946410713548e-06, |
|
"loss": 0.1964, |
|
"step": 4460 |
|
}, |
|
{ |
|
"epoch": 8.931068931068932, |
|
"grad_norm": 0.6179762482643127, |
|
"learning_rate": 5.49369248201953e-06, |
|
"loss": 0.1948, |
|
"step": 4470 |
|
}, |
|
{ |
|
"epoch": 8.951048951048952, |
|
"grad_norm": 0.5566456317901611, |
|
"learning_rate": 5.290169500525577e-06, |
|
"loss": 0.1958, |
|
"step": 4480 |
|
}, |
|
{ |
|
"epoch": 8.971028971028971, |
|
"grad_norm": 0.6196462512016296, |
|
"learning_rate": 5.0903855009705514e-06, |
|
"loss": 0.1978, |
|
"step": 4490 |
|
}, |
|
{ |
|
"epoch": 8.991008991008991, |
|
"grad_norm": 0.5933112502098083, |
|
"learning_rate": 4.8943483704846475e-06, |
|
"loss": 0.1962, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 9.010989010989011, |
|
"grad_norm": 0.5680419206619263, |
|
"learning_rate": 4.702065848278126e-06, |
|
"loss": 0.1948, |
|
"step": 4510 |
|
}, |
|
{ |
|
"epoch": 9.03096903096903, |
|
"grad_norm": 0.5447672605514526, |
|
"learning_rate": 4.513545525335705e-06, |
|
"loss": 0.1894, |
|
"step": 4520 |
|
}, |
|
{ |
|
"epoch": 9.05094905094905, |
|
"grad_norm": 0.5605758428573608, |
|
"learning_rate": 4.328794844116946e-06, |
|
"loss": 0.1903, |
|
"step": 4530 |
|
}, |
|
{ |
|
"epoch": 9.07092907092907, |
|
"grad_norm": 0.5727641582489014, |
|
"learning_rate": 4.147821098262405e-06, |
|
"loss": 0.1899, |
|
"step": 4540 |
|
}, |
|
{ |
|
"epoch": 9.090909090909092, |
|
"grad_norm": 0.5076532363891602, |
|
"learning_rate": 3.970631432305694e-06, |
|
"loss": 0.1872, |
|
"step": 4550 |
|
}, |
|
{ |
|
"epoch": 9.110889110889111, |
|
"grad_norm": 0.5827686190605164, |
|
"learning_rate": 3.797232841391407e-06, |
|
"loss": 0.1871, |
|
"step": 4560 |
|
}, |
|
{ |
|
"epoch": 9.130869130869131, |
|
"grad_norm": 0.5457426905632019, |
|
"learning_rate": 3.627632170999029e-06, |
|
"loss": 0.1903, |
|
"step": 4570 |
|
}, |
|
{ |
|
"epoch": 9.150849150849151, |
|
"grad_norm": 0.5931391716003418, |
|
"learning_rate": 3.461836116672612e-06, |
|
"loss": 0.1935, |
|
"step": 4580 |
|
}, |
|
{ |
|
"epoch": 9.17082917082917, |
|
"grad_norm": 0.5335982441902161, |
|
"learning_rate": 3.2998512237565005e-06, |
|
"loss": 0.188, |
|
"step": 4590 |
|
}, |
|
{ |
|
"epoch": 9.19080919080919, |
|
"grad_norm": 0.5809586048126221, |
|
"learning_rate": 3.1416838871368924e-06, |
|
"loss": 0.1882, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 9.21078921078921, |
|
"grad_norm": 0.5997488498687744, |
|
"learning_rate": 2.9873403509894203e-06, |
|
"loss": 0.189, |
|
"step": 4610 |
|
}, |
|
{ |
|
"epoch": 9.23076923076923, |
|
"grad_norm": 0.5423487424850464, |
|
"learning_rate": 2.836826708532603e-06, |
|
"loss": 0.1916, |
|
"step": 4620 |
|
}, |
|
{ |
|
"epoch": 9.250749250749251, |
|
"grad_norm": 0.5920736193656921, |
|
"learning_rate": 2.690148901787337e-06, |
|
"loss": 0.1914, |
|
"step": 4630 |
|
}, |
|
{ |
|
"epoch": 9.270729270729271, |
|
"grad_norm": 0.5774621367454529, |
|
"learning_rate": 2.5473127213422763e-06, |
|
"loss": 0.1901, |
|
"step": 4640 |
|
}, |
|
{ |
|
"epoch": 9.290709290709291, |
|
"grad_norm": 0.6183256506919861, |
|
"learning_rate": 2.4083238061252567e-06, |
|
"loss": 0.1918, |
|
"step": 4650 |
|
}, |
|
{ |
|
"epoch": 9.31068931068931, |
|
"grad_norm": 0.5502414107322693, |
|
"learning_rate": 2.273187643180652e-06, |
|
"loss": 0.1888, |
|
"step": 4660 |
|
}, |
|
{ |
|
"epoch": 9.33066933066933, |
|
"grad_norm": 0.5888564586639404, |
|
"learning_rate": 2.141909567452793e-06, |
|
"loss": 0.189, |
|
"step": 4670 |
|
}, |
|
{ |
|
"epoch": 9.35064935064935, |
|
"grad_norm": 0.582281231880188, |
|
"learning_rate": 2.014494761575314e-06, |
|
"loss": 0.188, |
|
"step": 4680 |
|
}, |
|
{ |
|
"epoch": 9.37062937062937, |
|
"grad_norm": 0.549766480922699, |
|
"learning_rate": 1.8909482556666024e-06, |
|
"loss": 0.1911, |
|
"step": 4690 |
|
}, |
|
{ |
|
"epoch": 9.390609390609391, |
|
"grad_norm": 0.6442523002624512, |
|
"learning_rate": 1.771274927131139e-06, |
|
"loss": 0.1913, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 9.410589410589411, |
|
"grad_norm": 0.5612021684646606, |
|
"learning_rate": 1.6554795004670388e-06, |
|
"loss": 0.1926, |
|
"step": 4710 |
|
}, |
|
{ |
|
"epoch": 9.430569430569431, |
|
"grad_norm": 0.6060473918914795, |
|
"learning_rate": 1.543566547079467e-06, |
|
"loss": 0.19, |
|
"step": 4720 |
|
}, |
|
{ |
|
"epoch": 9.45054945054945, |
|
"grad_norm": 0.5958064794540405, |
|
"learning_rate": 1.4355404851001952e-06, |
|
"loss": 0.1885, |
|
"step": 4730 |
|
}, |
|
{ |
|
"epoch": 9.47052947052947, |
|
"grad_norm": 0.536431610584259, |
|
"learning_rate": 1.3314055792131964e-06, |
|
"loss": 0.1891, |
|
"step": 4740 |
|
}, |
|
{ |
|
"epoch": 9.49050949050949, |
|
"grad_norm": 0.5971366763114929, |
|
"learning_rate": 1.231165940486234e-06, |
|
"loss": 0.1889, |
|
"step": 4750 |
|
}, |
|
{ |
|
"epoch": 9.51048951048951, |
|
"grad_norm": 0.5461220145225525, |
|
"learning_rate": 1.134825526208605e-06, |
|
"loss": 0.1874, |
|
"step": 4760 |
|
}, |
|
{ |
|
"epoch": 9.53046953046953, |
|
"grad_norm": 0.570928156375885, |
|
"learning_rate": 1.0423881397349068e-06, |
|
"loss": 0.1884, |
|
"step": 4770 |
|
}, |
|
{ |
|
"epoch": 9.550449550449551, |
|
"grad_norm": 0.5855159759521484, |
|
"learning_rate": 9.538574303348813e-07, |
|
"loss": 0.1895, |
|
"step": 4780 |
|
}, |
|
{ |
|
"epoch": 9.570429570429571, |
|
"grad_norm": 0.5505802631378174, |
|
"learning_rate": 8.692368930493521e-07, |
|
"loss": 0.1904, |
|
"step": 4790 |
|
}, |
|
{ |
|
"epoch": 9.59040959040959, |
|
"grad_norm": 0.5663396716117859, |
|
"learning_rate": 7.885298685522235e-07, |
|
"loss": 0.1909, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 9.61038961038961, |
|
"grad_norm": 0.6069871783256531, |
|
"learning_rate": 7.117395430186414e-07, |
|
"loss": 0.1895, |
|
"step": 4810 |
|
}, |
|
{ |
|
"epoch": 9.63036963036963, |
|
"grad_norm": 0.5576395988464355, |
|
"learning_rate": 6.388689479991605e-07, |
|
"loss": 0.1906, |
|
"step": 4820 |
|
}, |
|
{ |
|
"epoch": 9.65034965034965, |
|
"grad_norm": 0.5069971084594727, |
|
"learning_rate": 5.699209603001076e-07, |
|
"loss": 0.1889, |
|
"step": 4830 |
|
}, |
|
{ |
|
"epoch": 9.67032967032967, |
|
"grad_norm": 0.5770872235298157, |
|
"learning_rate": 5.048983018699827e-07, |
|
"loss": 0.1907, |
|
"step": 4840 |
|
}, |
|
{ |
|
"epoch": 9.69030969030969, |
|
"grad_norm": 0.6914857029914856, |
|
"learning_rate": 4.438035396920004e-07, |
|
"loss": 0.1939, |
|
"step": 4850 |
|
}, |
|
{ |
|
"epoch": 9.710289710289711, |
|
"grad_norm": 0.5999007821083069, |
|
"learning_rate": 3.866390856827495e-07, |
|
"loss": 0.1924, |
|
"step": 4860 |
|
}, |
|
{ |
|
"epoch": 9.73026973026973, |
|
"grad_norm": 0.569180965423584, |
|
"learning_rate": 3.3340719659701313e-07, |
|
"loss": 0.1887, |
|
"step": 4870 |
|
}, |
|
{ |
|
"epoch": 9.75024975024975, |
|
"grad_norm": 0.5442143082618713, |
|
"learning_rate": 2.841099739386066e-07, |
|
"loss": 0.1897, |
|
"step": 4880 |
|
}, |
|
{ |
|
"epoch": 9.77022977022977, |
|
"grad_norm": 0.5622804164886475, |
|
"learning_rate": 2.387493638774774e-07, |
|
"loss": 0.1898, |
|
"step": 4890 |
|
}, |
|
{ |
|
"epoch": 9.79020979020979, |
|
"grad_norm": 0.6558981537818909, |
|
"learning_rate": 1.973271571728441e-07, |
|
"loss": 0.1917, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 9.81018981018981, |
|
"grad_norm": 0.5756235122680664, |
|
"learning_rate": 1.598449891024978e-07, |
|
"loss": 0.192, |
|
"step": 4910 |
|
}, |
|
{ |
|
"epoch": 9.83016983016983, |
|
"grad_norm": 0.5818027257919312, |
|
"learning_rate": 1.2630433939825327e-07, |
|
"loss": 0.1899, |
|
"step": 4920 |
|
}, |
|
{ |
|
"epoch": 9.850149850149851, |
|
"grad_norm": 0.5986452698707581, |
|
"learning_rate": 9.670653218752934e-08, |
|
"loss": 0.1918, |
|
"step": 4930 |
|
}, |
|
{ |
|
"epoch": 9.87012987012987, |
|
"grad_norm": 0.5438185334205627, |
|
"learning_rate": 7.105273594107953e-08, |
|
"loss": 0.1905, |
|
"step": 4940 |
|
}, |
|
{ |
|
"epoch": 9.89010989010989, |
|
"grad_norm": 0.5430960059165955, |
|
"learning_rate": 4.934396342684e-08, |
|
"loss": 0.1913, |
|
"step": 4950 |
|
}, |
|
{ |
|
"epoch": 9.91008991008991, |
|
"grad_norm": 0.5492510199546814, |
|
"learning_rate": 3.1581071670006015e-08, |
|
"loss": 0.1904, |
|
"step": 4960 |
|
}, |
|
{ |
|
"epoch": 9.93006993006993, |
|
"grad_norm": 0.5370259881019592, |
|
"learning_rate": 1.7764761919103477e-08, |
|
"loss": 0.1901, |
|
"step": 4970 |
|
}, |
|
{ |
|
"epoch": 9.95004995004995, |
|
"grad_norm": 0.5463282465934753, |
|
"learning_rate": 7.895579618388827e-09, |
|
"loss": 0.191, |
|
"step": 4980 |
|
}, |
|
{ |
|
"epoch": 9.97002997002997, |
|
"grad_norm": 0.5733128190040588, |
|
"learning_rate": 1.973914386288467e-09, |
|
"loss": 0.1885, |
|
"step": 4990 |
|
}, |
|
{ |
|
"epoch": 9.99000999000999, |
|
"grad_norm": 0.5241893529891968, |
|
"learning_rate": 0.0, |
|
"loss": 0.1916, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 9.99000999000999, |
|
"step": 5000, |
|
"total_flos": 7.37720834306605e+17, |
|
"train_loss": 0.4096551623106003, |
|
"train_runtime": 80947.5605, |
|
"train_samples_per_second": 0.742, |
|
"train_steps_per_second": 0.062 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 5000, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 10, |
|
"save_steps": 500, |
|
"total_flos": 7.37720834306605e+17, |
|
"train_batch_size": 3, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |