{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 8.807339449541285,
  "eval_steps": 500,
  "global_step": 1080,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.08154943934760449,
      "grad_norm": 12.287543296813965,
      "learning_rate": 9.259259259259259e-07,
      "loss": 1.1833,
      "step": 10
    },
    {
      "epoch": 0.16309887869520898,
      "grad_norm": 11.78052043914795,
      "learning_rate": 1.8518518518518519e-06,
      "loss": 1.0856,
      "step": 20
    },
    {
      "epoch": 0.24464831804281345,
      "grad_norm": 15.526741981506348,
      "learning_rate": 2.7777777777777783e-06,
      "loss": 1.0896,
      "step": 30
    },
    {
      "epoch": 0.32619775739041795,
      "grad_norm": 8.213525772094727,
      "learning_rate": 3.7037037037037037e-06,
      "loss": 0.9945,
      "step": 40
    },
    {
      "epoch": 0.4077471967380224,
      "grad_norm": 12.257412910461426,
      "learning_rate": 4.62962962962963e-06,
      "loss": 1.0562,
      "step": 50
    },
    {
      "epoch": 0.4892966360856269,
      "grad_norm": 8.546279907226562,
      "learning_rate": 5.555555555555557e-06,
      "loss": 1.0866,
      "step": 60
    },
    {
      "epoch": 0.5708460754332314,
      "grad_norm": 13.490253448486328,
      "learning_rate": 6.481481481481482e-06,
      "loss": 1.151,
      "step": 70
    },
    {
      "epoch": 0.6523955147808359,
      "grad_norm": 19.235937118530273,
      "learning_rate": 7.4074074074074075e-06,
      "loss": 0.9591,
      "step": 80
    },
    {
      "epoch": 0.7339449541284404,
      "grad_norm": 6.614363193511963,
      "learning_rate": 8.333333333333334e-06,
      "loss": 1.1466,
      "step": 90
    },
    {
      "epoch": 0.8154943934760448,
      "grad_norm": 9.026179313659668,
      "learning_rate": 9.25925925925926e-06,
      "loss": 1.0284,
      "step": 100
    },
    {
      "epoch": 0.8970438328236493,
      "grad_norm": 10.798940658569336,
      "learning_rate": 9.999895536228031e-06,
      "loss": 1.2601,
      "step": 110
    },
    {
      "epoch": 0.9785932721712538,
      "grad_norm": 9.451533317565918,
      "learning_rate": 9.996239762521152e-06,
      "loss": 1.1091,
      "step": 120
    },
    {
      "epoch": 1.0601427115188584,
      "grad_norm": 6.230576992034912,
      "learning_rate": 9.987365164467767e-06,
      "loss": 0.9699,
      "step": 130
    },
    {
      "epoch": 1.1416921508664628,
      "grad_norm": 7.345333099365234,
      "learning_rate": 9.973281012033009e-06,
      "loss": 0.8974,
      "step": 140
    },
    {
      "epoch": 1.2232415902140672,
      "grad_norm": 8.226116180419922,
      "learning_rate": 9.954002016824226e-06,
      "loss": 0.9152,
      "step": 150
    },
    {
      "epoch": 1.3047910295616718,
      "grad_norm": 5.226821422576904,
      "learning_rate": 9.929548316723983e-06,
      "loss": 0.8205,
      "step": 160
    },
    {
      "epoch": 1.3863404689092762,
      "grad_norm": 14.963723182678223,
      "learning_rate": 9.899945454855007e-06,
      "loss": 0.852,
      "step": 170
    },
    {
      "epoch": 1.4678899082568808,
      "grad_norm": 6.257207870483398,
      "learning_rate": 9.86522435289912e-06,
      "loss": 0.7172,
      "step": 180
    },
    {
      "epoch": 1.5494393476044852,
      "grad_norm": 5.380673885345459,
      "learning_rate": 9.825421278797984e-06,
      "loss": 0.7586,
      "step": 190
    },
    {
      "epoch": 1.6309887869520896,
      "grad_norm": 6.626143455505371,
      "learning_rate": 9.7805778088694e-06,
      "loss": 0.802,
      "step": 200
    },
    {
      "epoch": 1.7125382262996942,
      "grad_norm": 5.537201881408691,
      "learning_rate": 9.730740784378755e-06,
      "loss": 0.8788,
      "step": 210
    },
    {
      "epoch": 1.7940876656472988,
      "grad_norm": 14.645645141601562,
      "learning_rate": 9.67596226261095e-06,
      "loss": 0.8143,
      "step": 220
    },
    {
      "epoch": 1.8756371049949032,
      "grad_norm": 5.771991729736328,
      "learning_rate": 9.616299462493952e-06,
      "loss": 0.8228,
      "step": 230
    },
    {
      "epoch": 1.9571865443425076,
      "grad_norm": 7.7364091873168945,
      "learning_rate": 9.551814704830734e-06,
      "loss": 0.7194,
      "step": 240
    },
    {
      "epoch": 2.038735983690112,
      "grad_norm": 7.201632499694824,
      "learning_rate": 9.482575347202047e-06,
      "loss": 0.6792,
      "step": 250
    },
    {
      "epoch": 2.120285423037717,
      "grad_norm": 10.6597318649292,
      "learning_rate": 9.40865371360804e-06,
      "loss": 0.5799,
      "step": 260
    },
    {
      "epoch": 2.2018348623853212,
      "grad_norm": 5.66484260559082,
      "learning_rate": 9.330127018922195e-06,
      "loss": 0.5307,
      "step": 270
    },
    {
      "epoch": 2.2833843017329256,
      "grad_norm": 11.373878479003906,
      "learning_rate": 9.247077288236488e-06,
      "loss": 0.5052,
      "step": 280
    },
    {
      "epoch": 2.36493374108053,
      "grad_norm": 7.9306864738464355,
      "learning_rate": 9.159591271182058e-06,
      "loss": 0.4944,
      "step": 290
    },
    {
      "epoch": 2.4464831804281344,
      "grad_norm": 4.888540267944336,
      "learning_rate": 9.067760351314838e-06,
      "loss": 0.5803,
      "step": 300
    },
    {
      "epoch": 2.528032619775739,
      "grad_norm": 5.445197582244873,
      "learning_rate": 8.97168045066082e-06,
      "loss": 0.4859,
      "step": 310
    },
    {
      "epoch": 2.6095820591233436,
      "grad_norm": 6.764598369598389,
      "learning_rate": 8.871451929520662e-06,
      "loss": 0.5261,
      "step": 320
    },
    {
      "epoch": 2.691131498470948,
      "grad_norm": 4.372009754180908,
      "learning_rate": 8.767179481638303e-06,
      "loss": 0.5705,
      "step": 330
    },
    {
      "epoch": 2.7726809378185524,
      "grad_norm": 5.150655746459961,
      "learning_rate": 8.658972024843063e-06,
      "loss": 0.5248,
      "step": 340
    },
    {
      "epoch": 2.8542303771661572,
      "grad_norm": 7.280877590179443,
      "learning_rate": 8.546942587279465e-06,
      "loss": 0.4764,
      "step": 350
    },
    {
      "epoch": 2.9357798165137616,
      "grad_norm": 6.407410621643066,
      "learning_rate": 8.43120818934367e-06,
      "loss": 0.5481,
      "step": 360
    },
    {
      "epoch": 3.017329255861366,
      "grad_norm": 3.3497812747955322,
      "learning_rate": 8.31188972144974e-06,
      "loss": 0.4215,
      "step": 370
    },
    {
      "epoch": 3.0988786952089704,
      "grad_norm": 8.869242668151855,
      "learning_rate": 8.18911181775353e-06,
      "loss": 0.3038,
      "step": 380
    },
    {
      "epoch": 3.180428134556575,
      "grad_norm": 7.462818622589111,
      "learning_rate": 8.063002725966014e-06,
      "loss": 0.3173,
      "step": 390
    },
    {
      "epoch": 3.261977573904179,
      "grad_norm": 4.446223258972168,
      "learning_rate": 7.93369417339209e-06,
      "loss": 0.3345,
      "step": 400
    },
    {
      "epoch": 3.343527013251784,
      "grad_norm": 4.583369255065918,
      "learning_rate": 7.801321229334764e-06,
      "loss": 0.3165,
      "step": 410
    },
    {
      "epoch": 3.4250764525993884,
      "grad_norm": 3.839564323425293,
      "learning_rate": 7.666022164008458e-06,
      "loss": 0.2921,
      "step": 420
    },
    {
      "epoch": 3.506625891946993,
      "grad_norm": 3.111229181289673,
      "learning_rate": 7.527938304108795e-06,
      "loss": 0.3062,
      "step": 430
    },
    {
      "epoch": 3.588175331294597,
      "grad_norm": 4.569026470184326,
      "learning_rate": 7.387213885189746e-06,
      "loss": 0.2924,
      "step": 440
    },
    {
      "epoch": 3.669724770642202,
      "grad_norm": 6.843806266784668,
      "learning_rate": 7.243995901002312e-06,
      "loss": 0.2929,
      "step": 450
    },
    {
      "epoch": 3.7512742099898064,
      "grad_norm": 3.574108839035034,
      "learning_rate": 7.098433949952146e-06,
      "loss": 0.3544,
      "step": 460
    },
    {
      "epoch": 3.832823649337411,
      "grad_norm": 3.691032886505127,
      "learning_rate": 6.950680078836475e-06,
      "loss": 0.3103,
      "step": 470
    },
    {
      "epoch": 3.914373088685015,
      "grad_norm": 3.9971020221710205,
      "learning_rate": 6.800888624023552e-06,
      "loss": 0.2742,
      "step": 480
    },
    {
      "epoch": 3.9959225280326196,
      "grad_norm": 2.556887149810791,
      "learning_rate": 6.649216050240539e-06,
      "loss": 0.3339,
      "step": 490
    },
    {
      "epoch": 4.077471967380224,
      "grad_norm": 2.1896212100982666,
      "learning_rate": 6.495820787138209e-06,
      "loss": 0.193,
      "step": 500
    },
    {
      "epoch": 4.077471967380224,
      "eval_loss": 1.6843377351760864,
      "eval_runtime": 4.0486,
      "eval_samples_per_second": 27.17,
      "eval_steps_per_second": 27.17,
      "step": 500
    },
    {
      "epoch": 4.159021406727828,
      "grad_norm": 4.995784282684326,
      "learning_rate": 6.340863063803187e-06,
      "loss": 0.2102,
      "step": 510
    },
    {
      "epoch": 4.240570846075434,
      "grad_norm": 3.022165536880493,
      "learning_rate": 6.184504741390596e-06,
      "loss": 0.1799,
      "step": 520
    },
    {
      "epoch": 4.322120285423038,
      "grad_norm": 3.73397159576416,
      "learning_rate": 6.02690914405191e-06,
      "loss": 0.1945,
      "step": 530
    },
    {
      "epoch": 4.4036697247706424,
      "grad_norm": 3.1206557750701904,
      "learning_rate": 5.8682408883346535e-06,
      "loss": 0.1186,
      "step": 540
    },
    {
      "epoch": 4.485219164118247,
      "grad_norm": 6.9254326820373535,
      "learning_rate": 5.708665711232103e-06,
      "loss": 0.1761,
      "step": 550
    },
    {
      "epoch": 4.566768603465851,
      "grad_norm": 1.7940961122512817,
      "learning_rate": 5.548350297062659e-06,
      "loss": 0.1747,
      "step": 560
    },
    {
      "epoch": 4.648318042813456,
      "grad_norm": 3.6487746238708496,
      "learning_rate": 5.387462103359655e-06,
      "loss": 0.205,
      "step": 570
    },
    {
      "epoch": 4.72986748216106,
      "grad_norm": 2.9038984775543213,
      "learning_rate": 5.2261691859535325e-06,
      "loss": 0.1384,
      "step": 580
    },
    {
      "epoch": 4.811416921508664,
      "grad_norm": 7.9458842277526855,
      "learning_rate": 5.064640023429042e-06,
      "loss": 0.1475,
      "step": 590
    },
    {
      "epoch": 4.892966360856269,
      "grad_norm": 3.393327236175537,
      "learning_rate": 4.903043341140879e-06,
      "loss": 0.1647,
      "step": 600
    },
    {
      "epoch": 4.974515800203873,
      "grad_norm": 5.027682781219482,
      "learning_rate": 4.741547934971528e-06,
      "loss": 0.1691,
      "step": 610
    },
    {
      "epoch": 5.0560652395514785,
      "grad_norm": 1.468493938446045,
      "learning_rate": 4.580322495015466e-06,
      "loss": 0.1022,
      "step": 620
    },
    {
      "epoch": 5.137614678899083,
      "grad_norm": 1.9386202096939087,
      "learning_rate": 4.4195354293738484e-06,
      "loss": 0.0677,
      "step": 630
    },
    {
      "epoch": 5.219164118246687,
      "grad_norm": 2.2345311641693115,
      "learning_rate": 4.259354688243758e-06,
      "loss": 0.0761,
      "step": 640
    },
    {
      "epoch": 5.300713557594292,
      "grad_norm": 2.380667209625244,
      "learning_rate": 4.099947588485744e-06,
      "loss": 0.0733,
      "step": 650
    },
    {
      "epoch": 5.382262996941896,
      "grad_norm": 1.2158432006835938,
      "learning_rate": 3.941480638852948e-06,
      "loss": 0.0841,
      "step": 660
    },
    {
      "epoch": 5.4638124362895,
      "grad_norm": 2.382153034210205,
      "learning_rate": 3.784119366064293e-06,
      "loss": 0.0929,
      "step": 670
    },
    {
      "epoch": 5.545361875637105,
      "grad_norm": 2.7093493938446045,
      "learning_rate": 3.6280281419034934e-06,
      "loss": 0.0634,
      "step": 680
    },
    {
      "epoch": 5.626911314984709,
      "grad_norm": 1.597835659980774,
      "learning_rate": 3.473370011524435e-06,
      "loss": 0.1201,
      "step": 690
    },
    {
      "epoch": 5.708460754332314,
      "grad_norm": 2.351442575454712,
      "learning_rate": 3.3203065231422904e-06,
      "loss": 0.0811,
      "step": 700
    },
    {
      "epoch": 5.790010193679919,
      "grad_norm": 2.5688395500183105,
      "learning_rate": 3.1689975592882603e-06,
      "loss": 0.0708,
      "step": 710
    },
    {
      "epoch": 5.871559633027523,
      "grad_norm": 3.0511674880981445,
      "learning_rate": 3.019601169804216e-06,
      "loss": 0.0846,
      "step": 720
    },
    {
      "epoch": 5.953109072375128,
      "grad_norm": 5.8331298828125,
      "learning_rate": 2.8722734067516637e-06,
      "loss": 0.0782,
      "step": 730
    },
    {
      "epoch": 6.034658511722732,
      "grad_norm": 1.1434911489486694,
      "learning_rate": 2.7271681614074973e-06,
      "loss": 0.0624,
      "step": 740
    },
    {
      "epoch": 6.116207951070336,
      "grad_norm": 2.7258777618408203,
      "learning_rate": 2.5844370035168077e-06,
      "loss": 0.0373,
      "step": 750
    },
    {
      "epoch": 6.197757390417941,
      "grad_norm": 0.5002675652503967,
      "learning_rate": 2.4442290229706344e-06,
      "loss": 0.0331,
      "step": 760
    },
    {
      "epoch": 6.279306829765545,
      "grad_norm": 0.7639961242675781,
      "learning_rate": 2.3066906740740626e-06,
      "loss": 0.0232,
      "step": 770
    },
    {
      "epoch": 6.36085626911315,
      "grad_norm": 2.4113121032714844,
      "learning_rate": 2.171965622567308e-06,
      "loss": 0.0356,
      "step": 780
    },
    {
      "epoch": 6.442405708460754,
      "grad_norm": 1.4164212942123413,
      "learning_rate": 2.0401945955596206e-06,
      "loss": 0.0599,
      "step": 790
    },
    {
      "epoch": 6.523955147808358,
      "grad_norm": 1.6531766653060913,
      "learning_rate": 1.9115152345327154e-06,
      "loss": 0.0298,
      "step": 800
    },
    {
      "epoch": 6.605504587155964,
      "grad_norm": 1.0268425941467285,
      "learning_rate": 1.7860619515673034e-06,
      "loss": 0.029,
      "step": 810
    },
    {
      "epoch": 6.687054026503568,
      "grad_norm": 1.7151083946228027,
      "learning_rate": 1.6639657889429017e-06,
      "loss": 0.0322,
      "step": 820
    },
    {
      "epoch": 6.7686034658511725,
      "grad_norm": 1.3095351457595825,
      "learning_rate": 1.5453542822575624e-06,
      "loss": 0.0276,
      "step": 830
    },
    {
      "epoch": 6.850152905198777,
      "grad_norm": 2.322570323944092,
      "learning_rate": 1.4303513272105057e-06,
      "loss": 0.0307,
      "step": 840
    },
    {
      "epoch": 6.931702344546381,
      "grad_norm": 5.283768653869629,
      "learning_rate": 1.3190770501868243e-06,
      "loss": 0.0439,
      "step": 850
    },
    {
      "epoch": 7.013251783893986,
      "grad_norm": 2.021864414215088,
      "learning_rate": 1.2116476827794104e-06,
      "loss": 0.0349,
      "step": 860
    },
    {
      "epoch": 7.09480122324159,
      "grad_norm": 1.7383687496185303,
      "learning_rate": 1.1081754403792e-06,
      "loss": 0.0141,
      "step": 870
    },
    {
      "epoch": 7.176350662589194,
      "grad_norm": 0.9524487853050232,
      "learning_rate": 1.008768404960535e-06,
      "loss": 0.0131,
      "step": 880
    },
    {
      "epoch": 7.257900101936799,
      "grad_norm": 0.6132211089134216,
      "learning_rate": 9.135304121840976e-07,
      "loss": 0.0156,
      "step": 890
    },
    {
      "epoch": 7.339449541284404,
      "grad_norm": 0.16791853308677673,
      "learning_rate": 8.225609429353187e-07,
      "loss": 0.0129,
      "step": 900
    },
    {
      "epoch": 7.4209989806320085,
      "grad_norm": 0.24683955311775208,
      "learning_rate": 7.35955019411585e-07,
      "loss": 0.0142,
      "step": 910
    },
    {
      "epoch": 7.502548419979613,
      "grad_norm": 0.177597314119339,
      "learning_rate": 6.53803105866761e-07,
      "loss": 0.0103,
      "step": 920
    },
    {
      "epoch": 7.584097859327217,
      "grad_norm": 1.1235181093215942,
      "learning_rate": 5.76191014116711e-07,
      "loss": 0.0166,
      "step": 930
    },
    {
      "epoch": 7.665647298674822,
      "grad_norm": 0.7732178568840027,
      "learning_rate": 5.031998139045352e-07,
      "loss": 0.0144,
      "step": 940
    },
    {
      "epoch": 7.747196738022426,
      "grad_norm": 1.0395855903625488,
      "learning_rate": 4.349057482191299e-07,
      "loss": 0.0165,
      "step": 950
    },
    {
      "epoch": 7.82874617737003,
      "grad_norm": 0.6988322734832764,
      "learning_rate": 3.7138015365554834e-07,
      "loss": 0.0118,
      "step": 960
    },
    {
      "epoch": 7.910295616717635,
      "grad_norm": 0.8800845742225647,
      "learning_rate": 3.1268938590032495e-07,
      "loss": 0.0141,
      "step": 970
    },
    {
      "epoch": 7.991845056065239,
      "grad_norm": 0.32935312390327454,
      "learning_rate": 2.5889475041961767e-07,
      "loss": 0.0096,
      "step": 980
    },
    {
      "epoch": 8.073394495412844,
      "grad_norm": 0.8593177199363708,
      "learning_rate": 2.1005243842255552e-07,
      "loss": 0.0207,
      "step": 990
    },
    {
      "epoch": 8.154943934760448,
      "grad_norm": 0.561518132686615,
      "learning_rate": 1.6621346816668993e-07,
      "loss": 0.0083,
      "step": 1000
    },
    {
      "epoch": 8.154943934760448,
      "eval_loss": 2.4749834537506104,
      "eval_runtime": 3.9418,
      "eval_samples_per_second": 27.906,
      "eval_steps_per_second": 27.906,
      "step": 1000
    },
    {
      "epoch": 8.236493374108052,
      "grad_norm": 0.1318514049053192,
      "learning_rate": 1.2742363166685035e-07,
      "loss": 0.0084,
      "step": 1010
    },
    {
      "epoch": 8.318042813455657,
      "grad_norm": 0.09888464212417603,
      "learning_rate": 9.372344686307655e-08,
      "loss": 0.0065,
      "step": 1020
    },
    {
      "epoch": 8.399592252803261,
      "grad_norm": 0.2378782033920288,
      "learning_rate": 6.514811529758747e-08,
      "loss": 0.0116,
      "step": 1030
    },
    {
      "epoch": 8.481141692150867,
      "grad_norm": 0.2287997156381607,
      "learning_rate": 4.172748534499449e-08,
      "loss": 0.0108,
      "step": 1040
    },
    {
      "epoch": 8.562691131498472,
      "grad_norm": 0.5172834992408752,
      "learning_rate": 2.3486021034170857e-08,
      "loss": 0.0088,
      "step": 1050
    },
    {
      "epoch": 8.644240570846076,
      "grad_norm": 0.272684246301651,
      "learning_rate": 1.044277649433989e-08,
      "loss": 0.0086,
      "step": 1060
    },
    {
      "epoch": 8.72579001019368,
      "grad_norm": 0.2911706864833832,
      "learning_rate": 2.611376052073511e-09,
      "loss": 0.0086,
      "step": 1070
    },
    {
      "epoch": 8.807339449541285,
      "grad_norm": 0.5013861060142517,
      "learning_rate": 0.0,
      "loss": 0.0063,
      "step": 1080
    },
    {
      "epoch": 8.807339449541285,
      "step": 1080,
      "total_flos": 1.2056852760477696e+16,
      "train_loss": 0.3479360826589443,
      "train_runtime": 1723.5404,
      "train_samples_per_second": 5.013,
      "train_steps_per_second": 0.627
    }
  ],
  "logging_steps": 10,
  "max_steps": 1080,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 9,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.2056852760477696e+16,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}