|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 10.0, |
|
"eval_steps": 500, |
|
"global_step": 141420, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.07071135624381275, |
|
"grad_norm": 0.950070858001709, |
|
"learning_rate": 3.125e-05, |
|
"loss": 5.0021, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.1414227124876255, |
|
"grad_norm": 0.9159772992134094, |
|
"learning_rate": 6.25e-05, |
|
"loss": 3.5952, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.21213406873143828, |
|
"grad_norm": 0.8742101192474365, |
|
"learning_rate": 9.375e-05, |
|
"loss": 3.3236, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.282845424975251, |
|
"grad_norm": 0.7689530253410339, |
|
"learning_rate": 0.000125, |
|
"loss": 3.1344, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.3535567812190638, |
|
"grad_norm": 0.7027168273925781, |
|
"learning_rate": 0.00015625, |
|
"loss": 3.0058, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.42426813746287656, |
|
"grad_norm": 0.6757282018661499, |
|
"learning_rate": 0.0001875, |
|
"loss": 2.8925, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.4949794937066893, |
|
"grad_norm": 0.6485090851783752, |
|
"learning_rate": 0.00021875, |
|
"loss": 2.8129, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.565690849950502, |
|
"grad_norm": 0.6332668662071228, |
|
"learning_rate": 0.00025, |
|
"loss": 2.7427, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.6364022061943148, |
|
"grad_norm": 0.5585498213768005, |
|
"learning_rate": 0.00028125000000000003, |
|
"loss": 2.6942, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.7071135624381276, |
|
"grad_norm": 0.5789941549301147, |
|
"learning_rate": 0.0003125, |
|
"loss": 2.6592, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.7778249186819404, |
|
"grad_norm": 0.49083390831947327, |
|
"learning_rate": 0.00034371875, |
|
"loss": 2.6102, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 0.8485362749257531, |
|
"grad_norm": 0.5267419815063477, |
|
"learning_rate": 0.00037496875000000003, |
|
"loss": 2.5811, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 0.9192476311695659, |
|
"grad_norm": 0.46979865431785583, |
|
"learning_rate": 0.0004061875, |
|
"loss": 2.5689, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 0.9899589874133786, |
|
"grad_norm": 0.4309781491756439, |
|
"learning_rate": 0.00043740625, |
|
"loss": 2.5477, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_accuracy": 0.48551466175887975, |
|
"eval_loss": 2.7489795684814453, |
|
"eval_runtime": 122.3768, |
|
"eval_samples_per_second": 382.981, |
|
"eval_steps_per_second": 5.99, |
|
"step": 14142 |
|
}, |
|
{ |
|
"epoch": 1.0606703436571914, |
|
"grad_norm": 0.3975456655025482, |
|
"learning_rate": 0.00046865625, |
|
"loss": 2.5021, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 1.131381699901004, |
|
"grad_norm": 0.40377160906791687, |
|
"learning_rate": 0.00049990625, |
|
"loss": 2.4895, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 1.2020930561448169, |
|
"grad_norm": 0.36566758155822754, |
|
"learning_rate": 0.000531125, |
|
"loss": 2.4727, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 1.2728044123886297, |
|
"grad_norm": 0.37737491726875305, |
|
"learning_rate": 0.0005623749999999999, |
|
"loss": 2.4652, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 1.3435157686324424, |
|
"grad_norm": 0.3293197751045227, |
|
"learning_rate": 0.000593625, |
|
"loss": 2.4574, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 1.414227124876255, |
|
"grad_norm": 0.290075421333313, |
|
"learning_rate": 0.000624875, |
|
"loss": 2.4275, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 1.4849384811200679, |
|
"grad_norm": 0.2959868311882019, |
|
"learning_rate": 0.00065609375, |
|
"loss": 2.4386, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 1.5556498373638807, |
|
"grad_norm": 0.27695414423942566, |
|
"learning_rate": 0.00068734375, |
|
"loss": 2.4161, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 1.6263611936076934, |
|
"grad_norm": 0.2632512152194977, |
|
"learning_rate": 0.00071859375, |
|
"loss": 2.4201, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 1.697072549851506, |
|
"grad_norm": 0.2265060991048813, |
|
"learning_rate": 0.0007498125, |
|
"loss": 2.4058, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 1.7677839060953189, |
|
"grad_norm": 0.26139551401138306, |
|
"learning_rate": 0.0007810625, |
|
"loss": 2.3958, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 1.8384952623391317, |
|
"grad_norm": 0.2395378053188324, |
|
"learning_rate": 0.0008123125, |
|
"loss": 2.3868, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 1.9092066185829444, |
|
"grad_norm": 0.23772157728672028, |
|
"learning_rate": 0.00084353125, |
|
"loss": 2.3766, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 1.979917974826757, |
|
"grad_norm": 0.23179960250854492, |
|
"learning_rate": 0.00087478125, |
|
"loss": 2.377, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_accuracy": 0.5044756048031578, |
|
"eval_loss": 2.580270290374756, |
|
"eval_runtime": 126.9742, |
|
"eval_samples_per_second": 369.114, |
|
"eval_steps_per_second": 5.773, |
|
"step": 28284 |
|
}, |
|
{ |
|
"epoch": 2.05062933107057, |
|
"grad_norm": 0.25655558705329895, |
|
"learning_rate": 0.0009060312499999999, |
|
"loss": 2.3364, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 2.1213406873143827, |
|
"grad_norm": 0.22631210088729858, |
|
"learning_rate": 0.00093725, |
|
"loss": 2.3297, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 2.1920520435581956, |
|
"grad_norm": 0.2411614954471588, |
|
"learning_rate": 0.0009685000000000001, |
|
"loss": 2.3249, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 2.262763399802008, |
|
"grad_norm": 0.25610190629959106, |
|
"learning_rate": 0.00099975, |
|
"loss": 2.3166, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 2.333474756045821, |
|
"grad_norm": 0.2797723412513733, |
|
"learning_rate": 0.000990943154816304, |
|
"loss": 2.3086, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 2.4041861122896337, |
|
"grad_norm": 0.2100619226694107, |
|
"learning_rate": 0.0009818040577590935, |
|
"loss": 2.3117, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 2.4748974685334466, |
|
"grad_norm": 0.2163330614566803, |
|
"learning_rate": 0.0009726740997989398, |
|
"loss": 2.302, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 2.5456088247772595, |
|
"grad_norm": 0.22995002567768097, |
|
"learning_rate": 0.0009635350027417291, |
|
"loss": 2.2872, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 2.616320181021072, |
|
"grad_norm": 0.2004874050617218, |
|
"learning_rate": 0.0009544141838786328, |
|
"loss": 2.2843, |
|
"step": 37000 |
|
}, |
|
{ |
|
"epoch": 2.6870315372648848, |
|
"grad_norm": 0.2153329849243164, |
|
"learning_rate": 0.0009452750868214221, |
|
"loss": 2.2788, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 2.7577428935086976, |
|
"grad_norm": 0.22890245914459229, |
|
"learning_rate": 0.0009361359897642113, |
|
"loss": 2.2711, |
|
"step": 39000 |
|
}, |
|
{ |
|
"epoch": 2.82845424975251, |
|
"grad_norm": 0.2471633404493332, |
|
"learning_rate": 0.0009269968927070006, |
|
"loss": 2.2673, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 2.899165605996323, |
|
"grad_norm": 0.20824675261974335, |
|
"learning_rate": 0.0009178577956497898, |
|
"loss": 2.2589, |
|
"step": 41000 |
|
}, |
|
{ |
|
"epoch": 2.9698769622401358, |
|
"grad_norm": 0.27108854055404663, |
|
"learning_rate": 0.0009087278376896363, |
|
"loss": 2.2625, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_accuracy": 0.5164308635329491, |
|
"eval_loss": 2.4760255813598633, |
|
"eval_runtime": 124.5514, |
|
"eval_samples_per_second": 376.295, |
|
"eval_steps_per_second": 5.885, |
|
"step": 42426 |
|
}, |
|
{ |
|
"epoch": 3.0405883184839486, |
|
"grad_norm": 0.2760034501552582, |
|
"learning_rate": 0.0008995887406324256, |
|
"loss": 2.22, |
|
"step": 43000 |
|
}, |
|
{ |
|
"epoch": 3.1112996747277615, |
|
"grad_norm": 0.24826580286026, |
|
"learning_rate": 0.0008904587826722719, |
|
"loss": 2.1969, |
|
"step": 44000 |
|
}, |
|
{ |
|
"epoch": 3.182011030971574, |
|
"grad_norm": 0.1961706429719925, |
|
"learning_rate": 0.0008813196856150612, |
|
"loss": 2.1929, |
|
"step": 45000 |
|
}, |
|
{ |
|
"epoch": 3.2527223872153868, |
|
"grad_norm": 0.2030291110277176, |
|
"learning_rate": 0.0008721805885578505, |
|
"loss": 2.1974, |
|
"step": 46000 |
|
}, |
|
{ |
|
"epoch": 3.3234337434591996, |
|
"grad_norm": 0.24897335469722748, |
|
"learning_rate": 0.0008630414915006397, |
|
"loss": 2.1993, |
|
"step": 47000 |
|
}, |
|
{ |
|
"epoch": 3.3941450997030125, |
|
"grad_norm": 0.2421874701976776, |
|
"learning_rate": 0.0008539115335404863, |
|
"loss": 2.186, |
|
"step": 48000 |
|
}, |
|
{ |
|
"epoch": 3.464856455946825, |
|
"grad_norm": 0.2960880398750305, |
|
"learning_rate": 0.0008447724364832756, |
|
"loss": 2.1877, |
|
"step": 49000 |
|
}, |
|
{ |
|
"epoch": 3.5355678121906378, |
|
"grad_norm": 0.20504000782966614, |
|
"learning_rate": 0.0008356424785231219, |
|
"loss": 2.1816, |
|
"step": 50000 |
|
}, |
|
{ |
|
"epoch": 3.6062791684344506, |
|
"grad_norm": 0.23933938145637512, |
|
"learning_rate": 0.0008265033814659112, |
|
"loss": 2.1807, |
|
"step": 51000 |
|
}, |
|
{ |
|
"epoch": 3.6769905246782635, |
|
"grad_norm": 0.23281992971897125, |
|
"learning_rate": 0.0008173734235057576, |
|
"loss": 2.1804, |
|
"step": 52000 |
|
}, |
|
{ |
|
"epoch": 3.747701880922076, |
|
"grad_norm": 0.20451070368289948, |
|
"learning_rate": 0.0008082343264485469, |
|
"loss": 2.1807, |
|
"step": 53000 |
|
}, |
|
{ |
|
"epoch": 3.8184132371658888, |
|
"grad_norm": 0.23853066563606262, |
|
"learning_rate": 0.0007990952293913362, |
|
"loss": 2.1767, |
|
"step": 54000 |
|
}, |
|
{ |
|
"epoch": 3.8891245934097016, |
|
"grad_norm": 0.24050584435462952, |
|
"learning_rate": 0.0007899652714311825, |
|
"loss": 2.1667, |
|
"step": 55000 |
|
}, |
|
{ |
|
"epoch": 3.9598359496535145, |
|
"grad_norm": 0.21722733974456787, |
|
"learning_rate": 0.0007808261743739718, |
|
"loss": 2.1715, |
|
"step": 56000 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_accuracy": 0.523546219218542, |
|
"eval_loss": 2.4206314086914062, |
|
"eval_runtime": 124.5767, |
|
"eval_samples_per_second": 376.218, |
|
"eval_steps_per_second": 5.884, |
|
"step": 56568 |
|
}, |
|
{ |
|
"epoch": 4.030547305897327, |
|
"grad_norm": 0.2417266070842743, |
|
"learning_rate": 0.0007716962164138184, |
|
"loss": 2.1342, |
|
"step": 57000 |
|
}, |
|
{ |
|
"epoch": 4.10125866214114, |
|
"grad_norm": 0.22849377989768982, |
|
"learning_rate": 0.0007625571193566076, |
|
"loss": 2.1082, |
|
"step": 58000 |
|
}, |
|
{ |
|
"epoch": 4.171970018384952, |
|
"grad_norm": 0.226350799202919, |
|
"learning_rate": 0.000753427161396454, |
|
"loss": 2.1144, |
|
"step": 59000 |
|
}, |
|
{ |
|
"epoch": 4.2426813746287655, |
|
"grad_norm": 0.207255020737648, |
|
"learning_rate": 0.0007442880643392433, |
|
"loss": 2.1116, |
|
"step": 60000 |
|
}, |
|
{ |
|
"epoch": 4.313392730872578, |
|
"grad_norm": 0.21048206090927124, |
|
"learning_rate": 0.0007351581063790898, |
|
"loss": 2.1092, |
|
"step": 61000 |
|
}, |
|
{ |
|
"epoch": 4.384104087116391, |
|
"grad_norm": 0.26391103863716125, |
|
"learning_rate": 0.000726019009321879, |
|
"loss": 2.1105, |
|
"step": 62000 |
|
}, |
|
{ |
|
"epoch": 4.454815443360204, |
|
"grad_norm": 0.22511842846870422, |
|
"learning_rate": 0.0007168799122646683, |
|
"loss": 2.1111, |
|
"step": 63000 |
|
}, |
|
{ |
|
"epoch": 4.525526799604016, |
|
"grad_norm": 0.264876127243042, |
|
"learning_rate": 0.0007077499543045147, |
|
"loss": 2.1069, |
|
"step": 64000 |
|
}, |
|
{ |
|
"epoch": 4.596238155847829, |
|
"grad_norm": 0.20152664184570312, |
|
"learning_rate": 0.0006986199963443612, |
|
"loss": 2.1107, |
|
"step": 65000 |
|
}, |
|
{ |
|
"epoch": 4.666949512091642, |
|
"grad_norm": 0.2202775925397873, |
|
"learning_rate": 0.0006894808992871504, |
|
"loss": 2.1192, |
|
"step": 66000 |
|
}, |
|
{ |
|
"epoch": 4.737660868335455, |
|
"grad_norm": 0.2462519109249115, |
|
"learning_rate": 0.0006803418022299397, |
|
"loss": 2.1094, |
|
"step": 67000 |
|
}, |
|
{ |
|
"epoch": 4.8083722245792675, |
|
"grad_norm": 0.24858810007572174, |
|
"learning_rate": 0.000671202705172729, |
|
"loss": 2.116, |
|
"step": 68000 |
|
}, |
|
{ |
|
"epoch": 4.87908358082308, |
|
"grad_norm": 0.25709378719329834, |
|
"learning_rate": 0.0006620727472125753, |
|
"loss": 2.1125, |
|
"step": 69000 |
|
}, |
|
{ |
|
"epoch": 4.949794937066893, |
|
"grad_norm": 0.22404509782791138, |
|
"learning_rate": 0.0006529336501553646, |
|
"loss": 2.0996, |
|
"step": 70000 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_accuracy": 0.5277767446216196, |
|
"eval_loss": 2.388011932373047, |
|
"eval_runtime": 124.6122, |
|
"eval_samples_per_second": 376.111, |
|
"eval_steps_per_second": 5.882, |
|
"step": 70710 |
|
}, |
|
{ |
|
"epoch": 5.020506293310706, |
|
"grad_norm": 0.21308551728725433, |
|
"learning_rate": 0.0006438036921952112, |
|
"loss": 2.0836, |
|
"step": 71000 |
|
}, |
|
{ |
|
"epoch": 5.091217649554518, |
|
"grad_norm": 0.26624125242233276, |
|
"learning_rate": 0.0006346645951380004, |
|
"loss": 2.0401, |
|
"step": 72000 |
|
}, |
|
{ |
|
"epoch": 5.161929005798331, |
|
"grad_norm": 0.24497570097446442, |
|
"learning_rate": 0.0006255254980807897, |
|
"loss": 2.0422, |
|
"step": 73000 |
|
}, |
|
{ |
|
"epoch": 5.232640362042144, |
|
"grad_norm": 0.2567467987537384, |
|
"learning_rate": 0.0006163955401206361, |
|
"loss": 2.044, |
|
"step": 74000 |
|
}, |
|
{ |
|
"epoch": 5.303351718285957, |
|
"grad_norm": 0.23071114718914032, |
|
"learning_rate": 0.0006072564430634253, |
|
"loss": 2.0465, |
|
"step": 75000 |
|
}, |
|
{ |
|
"epoch": 5.3740630745297695, |
|
"grad_norm": 0.25491389632225037, |
|
"learning_rate": 0.0005981264851032718, |
|
"loss": 2.0482, |
|
"step": 76000 |
|
}, |
|
{ |
|
"epoch": 5.444774430773582, |
|
"grad_norm": 0.2559095621109009, |
|
"learning_rate": 0.0005889873880460611, |
|
"loss": 2.0496, |
|
"step": 77000 |
|
}, |
|
{ |
|
"epoch": 5.515485787017395, |
|
"grad_norm": 0.22284868359565735, |
|
"learning_rate": 0.0005798574300859076, |
|
"loss": 2.0525, |
|
"step": 78000 |
|
}, |
|
{ |
|
"epoch": 5.586197143261208, |
|
"grad_norm": 0.23331965506076813, |
|
"learning_rate": 0.0005707183330286967, |
|
"loss": 2.0649, |
|
"step": 79000 |
|
}, |
|
{ |
|
"epoch": 5.65690849950502, |
|
"grad_norm": 0.2457083910703659, |
|
"learning_rate": 0.0005615883750685432, |
|
"loss": 2.0515, |
|
"step": 80000 |
|
}, |
|
{ |
|
"epoch": 5.727619855748833, |
|
"grad_norm": 0.2747284173965454, |
|
"learning_rate": 0.0005524492780113325, |
|
"loss": 2.0636, |
|
"step": 81000 |
|
}, |
|
{ |
|
"epoch": 5.798331211992646, |
|
"grad_norm": 0.2696532607078552, |
|
"learning_rate": 0.0005433101809541218, |
|
"loss": 2.0548, |
|
"step": 82000 |
|
}, |
|
{ |
|
"epoch": 5.869042568236459, |
|
"grad_norm": 0.2052222490310669, |
|
"learning_rate": 0.0005341802229939682, |
|
"loss": 2.0486, |
|
"step": 83000 |
|
}, |
|
{ |
|
"epoch": 5.9397539244802715, |
|
"grad_norm": 0.2313115894794464, |
|
"learning_rate": 0.0005250411259367574, |
|
"loss": 2.0456, |
|
"step": 84000 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_accuracy": 0.5305995578608221, |
|
"eval_loss": 2.372159957885742, |
|
"eval_runtime": 124.7467, |
|
"eval_samples_per_second": 375.705, |
|
"eval_steps_per_second": 5.876, |
|
"step": 84852 |
|
}, |
|
{ |
|
"epoch": 6.010465280724084, |
|
"grad_norm": 0.2167889028787613, |
|
"learning_rate": 0.0005159020288795467, |
|
"loss": 2.0537, |
|
"step": 85000 |
|
}, |
|
{ |
|
"epoch": 6.081176636967897, |
|
"grad_norm": 0.2274264246225357, |
|
"learning_rate": 0.0005067720709193931, |
|
"loss": 1.9868, |
|
"step": 86000 |
|
}, |
|
{ |
|
"epoch": 6.15188799321171, |
|
"grad_norm": 0.21367891132831573, |
|
"learning_rate": 0.0004976329738621824, |
|
"loss": 1.9966, |
|
"step": 87000 |
|
}, |
|
{ |
|
"epoch": 6.222599349455523, |
|
"grad_norm": 0.23541270196437836, |
|
"learning_rate": 0.0004885030159020289, |
|
"loss": 2.0013, |
|
"step": 88000 |
|
}, |
|
{ |
|
"epoch": 6.293310705699335, |
|
"grad_norm": 0.24054056406021118, |
|
"learning_rate": 0.0004793639188448181, |
|
"loss": 1.9875, |
|
"step": 89000 |
|
}, |
|
{ |
|
"epoch": 6.364022061943148, |
|
"grad_norm": 0.23533108830451965, |
|
"learning_rate": 0.00047023396088466456, |
|
"loss": 1.9967, |
|
"step": 90000 |
|
}, |
|
{ |
|
"epoch": 6.434733418186961, |
|
"grad_norm": 0.2795858383178711, |
|
"learning_rate": 0.00046110400292451105, |
|
"loss": 2.0019, |
|
"step": 91000 |
|
}, |
|
{ |
|
"epoch": 6.5054447744307735, |
|
"grad_norm": 0.2619366943836212, |
|
"learning_rate": 0.00045196490586730035, |
|
"loss": 2.0028, |
|
"step": 92000 |
|
}, |
|
{ |
|
"epoch": 6.576156130674587, |
|
"grad_norm": 0.24517033994197845, |
|
"learning_rate": 0.00044282580881008953, |
|
"loss": 2.0044, |
|
"step": 93000 |
|
}, |
|
{ |
|
"epoch": 6.646867486918399, |
|
"grad_norm": 0.2473345845937729, |
|
"learning_rate": 0.0004336867117528788, |
|
"loss": 1.9935, |
|
"step": 94000 |
|
}, |
|
{ |
|
"epoch": 6.717578843162212, |
|
"grad_norm": 0.229476198554039, |
|
"learning_rate": 0.00042454761469566806, |
|
"loss": 1.9943, |
|
"step": 95000 |
|
}, |
|
{ |
|
"epoch": 6.788290199406025, |
|
"grad_norm": 0.2603880763053894, |
|
"learning_rate": 0.0004154176567355145, |
|
"loss": 2.0073, |
|
"step": 96000 |
|
}, |
|
{ |
|
"epoch": 6.859001555649837, |
|
"grad_norm": 0.2473394274711609, |
|
"learning_rate": 0.0004062785596783038, |
|
"loss": 2.012, |
|
"step": 97000 |
|
}, |
|
{ |
|
"epoch": 6.92971291189365, |
|
"grad_norm": 0.24815410375595093, |
|
"learning_rate": 0.00039714860171815024, |
|
"loss": 1.9983, |
|
"step": 98000 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"eval_accuracy": 0.5327383571510881, |
|
"eval_loss": 2.359189748764038, |
|
"eval_runtime": 124.2745, |
|
"eval_samples_per_second": 377.133, |
|
"eval_steps_per_second": 5.898, |
|
"step": 98994 |
|
}, |
|
{ |
|
"epoch": 7.000424268137463, |
|
"grad_norm": 0.2351510226726532, |
|
"learning_rate": 0.0003880095046609395, |
|
"loss": 2.0039, |
|
"step": 99000 |
|
}, |
|
{ |
|
"epoch": 7.0711356243812755, |
|
"grad_norm": 0.27717456221580505, |
|
"learning_rate": 0.0003788704076037288, |
|
"loss": 1.9358, |
|
"step": 100000 |
|
}, |
|
{ |
|
"epoch": 7.141846980625088, |
|
"grad_norm": 0.2275402843952179, |
|
"learning_rate": 0.0003697495887406324, |
|
"loss": 1.9391, |
|
"step": 101000 |
|
}, |
|
{ |
|
"epoch": 7.212558336868901, |
|
"grad_norm": 0.3047560453414917, |
|
"learning_rate": 0.0003606104916834217, |
|
"loss": 1.9472, |
|
"step": 102000 |
|
}, |
|
{ |
|
"epoch": 7.283269693112714, |
|
"grad_norm": 0.2828271687030792, |
|
"learning_rate": 0.0003514713946262109, |
|
"loss": 1.9415, |
|
"step": 103000 |
|
}, |
|
{ |
|
"epoch": 7.353981049356527, |
|
"grad_norm": 0.27506691217422485, |
|
"learning_rate": 0.0003423322975690002, |
|
"loss": 1.9467, |
|
"step": 104000 |
|
}, |
|
{ |
|
"epoch": 7.424692405600339, |
|
"grad_norm": 0.23766624927520752, |
|
"learning_rate": 0.0003332023396088467, |
|
"loss": 1.9559, |
|
"step": 105000 |
|
}, |
|
{ |
|
"epoch": 7.495403761844152, |
|
"grad_norm": 0.25875958800315857, |
|
"learning_rate": 0.00032406324255163587, |
|
"loss": 1.9569, |
|
"step": 106000 |
|
}, |
|
{ |
|
"epoch": 7.566115118087965, |
|
"grad_norm": 0.24315999448299408, |
|
"learning_rate": 0.00031493328459148237, |
|
"loss": 1.9567, |
|
"step": 107000 |
|
}, |
|
{ |
|
"epoch": 7.6368264743317775, |
|
"grad_norm": 0.29540637135505676, |
|
"learning_rate": 0.0003057941875342716, |
|
"loss": 1.9567, |
|
"step": 108000 |
|
}, |
|
{ |
|
"epoch": 7.707537830575591, |
|
"grad_norm": 0.2695215940475464, |
|
"learning_rate": 0.00029665509047706085, |
|
"loss": 1.9605, |
|
"step": 109000 |
|
}, |
|
{ |
|
"epoch": 7.778249186819403, |
|
"grad_norm": 0.2626267075538635, |
|
"learning_rate": 0.00028751599341985014, |
|
"loss": 1.9527, |
|
"step": 110000 |
|
}, |
|
{ |
|
"epoch": 7.848960543063216, |
|
"grad_norm": 0.25401443243026733, |
|
"learning_rate": 0.0002783768963626394, |
|
"loss": 1.966, |
|
"step": 111000 |
|
}, |
|
{ |
|
"epoch": 7.919671899307029, |
|
"grad_norm": 0.22462651133537292, |
|
"learning_rate": 0.0002692377993054286, |
|
"loss": 1.9584, |
|
"step": 112000 |
|
}, |
|
{ |
|
"epoch": 7.990383255550841, |
|
"grad_norm": 0.28454989194869995, |
|
"learning_rate": 0.0002601078413452751, |
|
"loss": 1.9482, |
|
"step": 113000 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_accuracy": 0.5338266026714983, |
|
"eval_loss": 2.357896327972412, |
|
"eval_runtime": 124.302, |
|
"eval_samples_per_second": 377.05, |
|
"eval_steps_per_second": 5.897, |
|
"step": 113136 |
|
}, |
|
{ |
|
"epoch": 8.061094611794655, |
|
"grad_norm": 0.26728782057762146, |
|
"learning_rate": 0.0002509687442880643, |
|
"loss": 1.8937, |
|
"step": 114000 |
|
}, |
|
{ |
|
"epoch": 8.131805968038467, |
|
"grad_norm": 0.24568845331668854, |
|
"learning_rate": 0.00024183878632791082, |
|
"loss": 1.8983, |
|
"step": 115000 |
|
}, |
|
{ |
|
"epoch": 8.20251732428228, |
|
"grad_norm": 0.2506534457206726, |
|
"learning_rate": 0.0002327088283677573, |
|
"loss": 1.9067, |
|
"step": 116000 |
|
}, |
|
{ |
|
"epoch": 8.273228680526092, |
|
"grad_norm": 0.31150582432746887, |
|
"learning_rate": 0.00022356973131054653, |
|
"loss": 1.9085, |
|
"step": 117000 |
|
}, |
|
{ |
|
"epoch": 8.343940036769904, |
|
"grad_norm": 0.2992372512817383, |
|
"learning_rate": 0.00021443063425333577, |
|
"loss": 1.9085, |
|
"step": 118000 |
|
}, |
|
{ |
|
"epoch": 8.414651393013719, |
|
"grad_norm": 0.23084251582622528, |
|
"learning_rate": 0.00020529153719612504, |
|
"loss": 1.91, |
|
"step": 119000 |
|
}, |
|
{ |
|
"epoch": 8.485362749257531, |
|
"grad_norm": 0.2754100561141968, |
|
"learning_rate": 0.0001961615792359715, |
|
"loss": 1.9076, |
|
"step": 120000 |
|
}, |
|
{ |
|
"epoch": 8.556074105501343, |
|
"grad_norm": 0.25915420055389404, |
|
"learning_rate": 0.00018702248217876074, |
|
"loss": 1.915, |
|
"step": 121000 |
|
}, |
|
{ |
|
"epoch": 8.626785461745156, |
|
"grad_norm": 0.26031365990638733, |
|
"learning_rate": 0.00017788338512155, |
|
"loss": 1.9159, |
|
"step": 122000 |
|
}, |
|
{ |
|
"epoch": 8.697496817988968, |
|
"grad_norm": 0.2626364231109619, |
|
"learning_rate": 0.00016875342716139648, |
|
"loss": 1.905, |
|
"step": 123000 |
|
}, |
|
{ |
|
"epoch": 8.768208174232782, |
|
"grad_norm": 0.30089327692985535, |
|
"learning_rate": 0.00015961433010418572, |
|
"loss": 1.9046, |
|
"step": 124000 |
|
}, |
|
{ |
|
"epoch": 8.838919530476595, |
|
"grad_norm": 0.2982795536518097, |
|
"learning_rate": 0.00015047523304697496, |
|
"loss": 1.9052, |
|
"step": 125000 |
|
}, |
|
{ |
|
"epoch": 8.909630886720407, |
|
"grad_norm": 0.2462824285030365, |
|
"learning_rate": 0.00014134527508682143, |
|
"loss": 1.907, |
|
"step": 126000 |
|
}, |
|
{ |
|
"epoch": 8.98034224296422, |
|
"grad_norm": 0.2993851900100708, |
|
"learning_rate": 0.0001322061780296107, |
|
"loss": 1.9061, |
|
"step": 127000 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"eval_accuracy": 0.5346066724693145, |
|
"eval_loss": 2.3604698181152344, |
|
"eval_runtime": 124.2295, |
|
"eval_samples_per_second": 377.27, |
|
"eval_steps_per_second": 5.9, |
|
"step": 127278 |
|
}, |
|
{ |
|
"epoch": 9.051053599208032, |
|
"grad_norm": 0.29052260518074036, |
|
"learning_rate": 0.00012307622006945714, |
|
"loss": 1.8738, |
|
"step": 128000 |
|
}, |
|
{ |
|
"epoch": 9.121764955451846, |
|
"grad_norm": 0.2522347569465637, |
|
"learning_rate": 0.0001139462621093036, |
|
"loss": 1.8601, |
|
"step": 129000 |
|
}, |
|
{ |
|
"epoch": 9.192476311695659, |
|
"grad_norm": 0.30291104316711426, |
|
"learning_rate": 0.00010480716505209286, |
|
"loss": 1.8793, |
|
"step": 130000 |
|
}, |
|
{ |
|
"epoch": 9.263187667939471, |
|
"grad_norm": 0.26850393414497375, |
|
"learning_rate": 9.566806799488211e-05, |
|
"loss": 1.8676, |
|
"step": 131000 |
|
}, |
|
{ |
|
"epoch": 9.333899024183284, |
|
"grad_norm": 0.270274817943573, |
|
"learning_rate": 8.652897093767135e-05, |
|
"loss": 1.8669, |
|
"step": 132000 |
|
}, |
|
{ |
|
"epoch": 9.404610380427096, |
|
"grad_norm": 0.2623472809791565, |
|
"learning_rate": 7.739901297751782e-05, |
|
"loss": 1.8651, |
|
"step": 133000 |
|
}, |
|
{ |
|
"epoch": 9.47532173667091, |
|
"grad_norm": 0.2898847758769989, |
|
"learning_rate": 6.825991592030707e-05, |
|
"loss": 1.8697, |
|
"step": 134000 |
|
}, |
|
{ |
|
"epoch": 9.546033092914723, |
|
"grad_norm": 0.30749163031578064, |
|
"learning_rate": 5.912081886309633e-05, |
|
"loss": 1.8635, |
|
"step": 135000 |
|
}, |
|
{ |
|
"epoch": 9.616744449158535, |
|
"grad_norm": 0.255743145942688, |
|
"learning_rate": 4.9981721805885585e-05, |
|
"loss": 1.8662, |
|
"step": 136000 |
|
}, |
|
{ |
|
"epoch": 9.687455805402347, |
|
"grad_norm": 0.30720534920692444, |
|
"learning_rate": 4.085176384573204e-05, |
|
"loss": 1.8582, |
|
"step": 137000 |
|
}, |
|
{ |
|
"epoch": 9.75816716164616, |
|
"grad_norm": 0.2820815145969391, |
|
"learning_rate": 3.17126667885213e-05, |
|
"loss": 1.8694, |
|
"step": 138000 |
|
}, |
|
{ |
|
"epoch": 9.828878517889972, |
|
"grad_norm": 0.30231404304504395, |
|
"learning_rate": 2.2582708828367758e-05, |
|
"loss": 1.8654, |
|
"step": 139000 |
|
}, |
|
{ |
|
"epoch": 9.899589874133786, |
|
"grad_norm": 0.292241632938385, |
|
"learning_rate": 1.344361177115701e-05, |
|
"loss": 1.8634, |
|
"step": 140000 |
|
}, |
|
{ |
|
"epoch": 9.970301230377599, |
|
"grad_norm": 0.25611528754234314, |
|
"learning_rate": 4.313653811003472e-06, |
|
"loss": 1.8692, |
|
"step": 141000 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"eval_accuracy": 0.534805757971141, |
|
"eval_loss": 2.368875503540039, |
|
"eval_runtime": 124.6443, |
|
"eval_samples_per_second": 376.014, |
|
"eval_steps_per_second": 5.881, |
|
"step": 141420 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"step": 141420, |
|
"total_flos": 5.9582336320512e+17, |
|
"train_loss": 2.1796733167279037, |
|
"train_runtime": 31005.8847, |
|
"train_samples_per_second": 145.948, |
|
"train_steps_per_second": 4.561 |
|
} |
|
], |
|
"logging_steps": 1000, |
|
"max_steps": 141420, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 10, |
|
"save_steps": 5000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 5.9582336320512e+17, |
|
"train_batch_size": 32, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|