opt-babylm2-subset-default-1e-3 / trainer_state.json
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 10.0,
"eval_steps": 500,
"global_step": 141420,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.07071135624381275,
"grad_norm": 0.950070858001709,
"learning_rate": 3.125e-05,
"loss": 5.0021,
"step": 1000
},
{
"epoch": 0.1414227124876255,
"grad_norm": 0.9159772992134094,
"learning_rate": 6.25e-05,
"loss": 3.5952,
"step": 2000
},
{
"epoch": 0.21213406873143828,
"grad_norm": 0.8742101192474365,
"learning_rate": 9.375e-05,
"loss": 3.3236,
"step": 3000
},
{
"epoch": 0.282845424975251,
"grad_norm": 0.7689530253410339,
"learning_rate": 0.000125,
"loss": 3.1344,
"step": 4000
},
{
"epoch": 0.3535567812190638,
"grad_norm": 0.7027168273925781,
"learning_rate": 0.00015625,
"loss": 3.0058,
"step": 5000
},
{
"epoch": 0.42426813746287656,
"grad_norm": 0.6757282018661499,
"learning_rate": 0.0001875,
"loss": 2.8925,
"step": 6000
},
{
"epoch": 0.4949794937066893,
"grad_norm": 0.6485090851783752,
"learning_rate": 0.00021875,
"loss": 2.8129,
"step": 7000
},
{
"epoch": 0.565690849950502,
"grad_norm": 0.6332668662071228,
"learning_rate": 0.00025,
"loss": 2.7427,
"step": 8000
},
{
"epoch": 0.6364022061943148,
"grad_norm": 0.5585498213768005,
"learning_rate": 0.00028125000000000003,
"loss": 2.6942,
"step": 9000
},
{
"epoch": 0.7071135624381276,
"grad_norm": 0.5789941549301147,
"learning_rate": 0.0003125,
"loss": 2.6592,
"step": 10000
},
{
"epoch": 0.7778249186819404,
"grad_norm": 0.49083390831947327,
"learning_rate": 0.00034371875,
"loss": 2.6102,
"step": 11000
},
{
"epoch": 0.8485362749257531,
"grad_norm": 0.5267419815063477,
"learning_rate": 0.00037496875000000003,
"loss": 2.5811,
"step": 12000
},
{
"epoch": 0.9192476311695659,
"grad_norm": 0.46979865431785583,
"learning_rate": 0.0004061875,
"loss": 2.5689,
"step": 13000
},
{
"epoch": 0.9899589874133786,
"grad_norm": 0.4309781491756439,
"learning_rate": 0.00043740625,
"loss": 2.5477,
"step": 14000
},
{
"epoch": 1.0,
"eval_accuracy": 0.48551466175887975,
"eval_loss": 2.7489795684814453,
"eval_runtime": 122.3768,
"eval_samples_per_second": 382.981,
"eval_steps_per_second": 5.99,
"step": 14142
},
{
"epoch": 1.0606703436571914,
"grad_norm": 0.3975456655025482,
"learning_rate": 0.00046865625,
"loss": 2.5021,
"step": 15000
},
{
"epoch": 1.131381699901004,
"grad_norm": 0.40377160906791687,
"learning_rate": 0.00049990625,
"loss": 2.4895,
"step": 16000
},
{
"epoch": 1.2020930561448169,
"grad_norm": 0.36566758155822754,
"learning_rate": 0.000531125,
"loss": 2.4727,
"step": 17000
},
{
"epoch": 1.2728044123886297,
"grad_norm": 0.37737491726875305,
"learning_rate": 0.0005623749999999999,
"loss": 2.4652,
"step": 18000
},
{
"epoch": 1.3435157686324424,
"grad_norm": 0.3293197751045227,
"learning_rate": 0.000593625,
"loss": 2.4574,
"step": 19000
},
{
"epoch": 1.414227124876255,
"grad_norm": 0.290075421333313,
"learning_rate": 0.000624875,
"loss": 2.4275,
"step": 20000
},
{
"epoch": 1.4849384811200679,
"grad_norm": 0.2959868311882019,
"learning_rate": 0.00065609375,
"loss": 2.4386,
"step": 21000
},
{
"epoch": 1.5556498373638807,
"grad_norm": 0.27695414423942566,
"learning_rate": 0.00068734375,
"loss": 2.4161,
"step": 22000
},
{
"epoch": 1.6263611936076934,
"grad_norm": 0.2632512152194977,
"learning_rate": 0.00071859375,
"loss": 2.4201,
"step": 23000
},
{
"epoch": 1.697072549851506,
"grad_norm": 0.2265060991048813,
"learning_rate": 0.0007498125,
"loss": 2.4058,
"step": 24000
},
{
"epoch": 1.7677839060953189,
"grad_norm": 0.26139551401138306,
"learning_rate": 0.0007810625,
"loss": 2.3958,
"step": 25000
},
{
"epoch": 1.8384952623391317,
"grad_norm": 0.2395378053188324,
"learning_rate": 0.0008123125,
"loss": 2.3868,
"step": 26000
},
{
"epoch": 1.9092066185829444,
"grad_norm": 0.23772157728672028,
"learning_rate": 0.00084353125,
"loss": 2.3766,
"step": 27000
},
{
"epoch": 1.979917974826757,
"grad_norm": 0.23179960250854492,
"learning_rate": 0.00087478125,
"loss": 2.377,
"step": 28000
},
{
"epoch": 2.0,
"eval_accuracy": 0.5044756048031578,
"eval_loss": 2.580270290374756,
"eval_runtime": 126.9742,
"eval_samples_per_second": 369.114,
"eval_steps_per_second": 5.773,
"step": 28284
},
{
"epoch": 2.05062933107057,
"grad_norm": 0.25655558705329895,
"learning_rate": 0.0009060312499999999,
"loss": 2.3364,
"step": 29000
},
{
"epoch": 2.1213406873143827,
"grad_norm": 0.22631210088729858,
"learning_rate": 0.00093725,
"loss": 2.3297,
"step": 30000
},
{
"epoch": 2.1920520435581956,
"grad_norm": 0.2411614954471588,
"learning_rate": 0.0009685000000000001,
"loss": 2.3249,
"step": 31000
},
{
"epoch": 2.262763399802008,
"grad_norm": 0.25610190629959106,
"learning_rate": 0.00099975,
"loss": 2.3166,
"step": 32000
},
{
"epoch": 2.333474756045821,
"grad_norm": 0.2797723412513733,
"learning_rate": 0.000990943154816304,
"loss": 2.3086,
"step": 33000
},
{
"epoch": 2.4041861122896337,
"grad_norm": 0.2100619226694107,
"learning_rate": 0.0009818040577590935,
"loss": 2.3117,
"step": 34000
},
{
"epoch": 2.4748974685334466,
"grad_norm": 0.2163330614566803,
"learning_rate": 0.0009726740997989398,
"loss": 2.302,
"step": 35000
},
{
"epoch": 2.5456088247772595,
"grad_norm": 0.22995002567768097,
"learning_rate": 0.0009635350027417291,
"loss": 2.2872,
"step": 36000
},
{
"epoch": 2.616320181021072,
"grad_norm": 0.2004874050617218,
"learning_rate": 0.0009544141838786328,
"loss": 2.2843,
"step": 37000
},
{
"epoch": 2.6870315372648848,
"grad_norm": 0.2153329849243164,
"learning_rate": 0.0009452750868214221,
"loss": 2.2788,
"step": 38000
},
{
"epoch": 2.7577428935086976,
"grad_norm": 0.22890245914459229,
"learning_rate": 0.0009361359897642113,
"loss": 2.2711,
"step": 39000
},
{
"epoch": 2.82845424975251,
"grad_norm": 0.2471633404493332,
"learning_rate": 0.0009269968927070006,
"loss": 2.2673,
"step": 40000
},
{
"epoch": 2.899165605996323,
"grad_norm": 0.20824675261974335,
"learning_rate": 0.0009178577956497898,
"loss": 2.2589,
"step": 41000
},
{
"epoch": 2.9698769622401358,
"grad_norm": 0.27108854055404663,
"learning_rate": 0.0009087278376896363,
"loss": 2.2625,
"step": 42000
},
{
"epoch": 3.0,
"eval_accuracy": 0.5164308635329491,
"eval_loss": 2.4760255813598633,
"eval_runtime": 124.5514,
"eval_samples_per_second": 376.295,
"eval_steps_per_second": 5.885,
"step": 42426
},
{
"epoch": 3.0405883184839486,
"grad_norm": 0.2760034501552582,
"learning_rate": 0.0008995887406324256,
"loss": 2.22,
"step": 43000
},
{
"epoch": 3.1112996747277615,
"grad_norm": 0.24826580286026,
"learning_rate": 0.0008904587826722719,
"loss": 2.1969,
"step": 44000
},
{
"epoch": 3.182011030971574,
"grad_norm": 0.1961706429719925,
"learning_rate": 0.0008813196856150612,
"loss": 2.1929,
"step": 45000
},
{
"epoch": 3.2527223872153868,
"grad_norm": 0.2030291110277176,
"learning_rate": 0.0008721805885578505,
"loss": 2.1974,
"step": 46000
},
{
"epoch": 3.3234337434591996,
"grad_norm": 0.24897335469722748,
"learning_rate": 0.0008630414915006397,
"loss": 2.1993,
"step": 47000
},
{
"epoch": 3.3941450997030125,
"grad_norm": 0.2421874701976776,
"learning_rate": 0.0008539115335404863,
"loss": 2.186,
"step": 48000
},
{
"epoch": 3.464856455946825,
"grad_norm": 0.2960880398750305,
"learning_rate": 0.0008447724364832756,
"loss": 2.1877,
"step": 49000
},
{
"epoch": 3.5355678121906378,
"grad_norm": 0.20504000782966614,
"learning_rate": 0.0008356424785231219,
"loss": 2.1816,
"step": 50000
},
{
"epoch": 3.6062791684344506,
"grad_norm": 0.23933938145637512,
"learning_rate": 0.0008265033814659112,
"loss": 2.1807,
"step": 51000
},
{
"epoch": 3.6769905246782635,
"grad_norm": 0.23281992971897125,
"learning_rate": 0.0008173734235057576,
"loss": 2.1804,
"step": 52000
},
{
"epoch": 3.747701880922076,
"grad_norm": 0.20451070368289948,
"learning_rate": 0.0008082343264485469,
"loss": 2.1807,
"step": 53000
},
{
"epoch": 3.8184132371658888,
"grad_norm": 0.23853066563606262,
"learning_rate": 0.0007990952293913362,
"loss": 2.1767,
"step": 54000
},
{
"epoch": 3.8891245934097016,
"grad_norm": 0.24050584435462952,
"learning_rate": 0.0007899652714311825,
"loss": 2.1667,
"step": 55000
},
{
"epoch": 3.9598359496535145,
"grad_norm": 0.21722733974456787,
"learning_rate": 0.0007808261743739718,
"loss": 2.1715,
"step": 56000
},
{
"epoch": 4.0,
"eval_accuracy": 0.523546219218542,
"eval_loss": 2.4206314086914062,
"eval_runtime": 124.5767,
"eval_samples_per_second": 376.218,
"eval_steps_per_second": 5.884,
"step": 56568
},
{
"epoch": 4.030547305897327,
"grad_norm": 0.2417266070842743,
"learning_rate": 0.0007716962164138184,
"loss": 2.1342,
"step": 57000
},
{
"epoch": 4.10125866214114,
"grad_norm": 0.22849377989768982,
"learning_rate": 0.0007625571193566076,
"loss": 2.1082,
"step": 58000
},
{
"epoch": 4.171970018384952,
"grad_norm": 0.226350799202919,
"learning_rate": 0.000753427161396454,
"loss": 2.1144,
"step": 59000
},
{
"epoch": 4.2426813746287655,
"grad_norm": 0.207255020737648,
"learning_rate": 0.0007442880643392433,
"loss": 2.1116,
"step": 60000
},
{
"epoch": 4.313392730872578,
"grad_norm": 0.21048206090927124,
"learning_rate": 0.0007351581063790898,
"loss": 2.1092,
"step": 61000
},
{
"epoch": 4.384104087116391,
"grad_norm": 0.26391103863716125,
"learning_rate": 0.000726019009321879,
"loss": 2.1105,
"step": 62000
},
{
"epoch": 4.454815443360204,
"grad_norm": 0.22511842846870422,
"learning_rate": 0.0007168799122646683,
"loss": 2.1111,
"step": 63000
},
{
"epoch": 4.525526799604016,
"grad_norm": 0.264876127243042,
"learning_rate": 0.0007077499543045147,
"loss": 2.1069,
"step": 64000
},
{
"epoch": 4.596238155847829,
"grad_norm": 0.20152664184570312,
"learning_rate": 0.0006986199963443612,
"loss": 2.1107,
"step": 65000
},
{
"epoch": 4.666949512091642,
"grad_norm": 0.2202775925397873,
"learning_rate": 0.0006894808992871504,
"loss": 2.1192,
"step": 66000
},
{
"epoch": 4.737660868335455,
"grad_norm": 0.2462519109249115,
"learning_rate": 0.0006803418022299397,
"loss": 2.1094,
"step": 67000
},
{
"epoch": 4.8083722245792675,
"grad_norm": 0.24858810007572174,
"learning_rate": 0.000671202705172729,
"loss": 2.116,
"step": 68000
},
{
"epoch": 4.87908358082308,
"grad_norm": 0.25709378719329834,
"learning_rate": 0.0006620727472125753,
"loss": 2.1125,
"step": 69000
},
{
"epoch": 4.949794937066893,
"grad_norm": 0.22404509782791138,
"learning_rate": 0.0006529336501553646,
"loss": 2.0996,
"step": 70000
},
{
"epoch": 5.0,
"eval_accuracy": 0.5277767446216196,
"eval_loss": 2.388011932373047,
"eval_runtime": 124.6122,
"eval_samples_per_second": 376.111,
"eval_steps_per_second": 5.882,
"step": 70710
},
{
"epoch": 5.020506293310706,
"grad_norm": 0.21308551728725433,
"learning_rate": 0.0006438036921952112,
"loss": 2.0836,
"step": 71000
},
{
"epoch": 5.091217649554518,
"grad_norm": 0.26624125242233276,
"learning_rate": 0.0006346645951380004,
"loss": 2.0401,
"step": 72000
},
{
"epoch": 5.161929005798331,
"grad_norm": 0.24497570097446442,
"learning_rate": 0.0006255254980807897,
"loss": 2.0422,
"step": 73000
},
{
"epoch": 5.232640362042144,
"grad_norm": 0.2567467987537384,
"learning_rate": 0.0006163955401206361,
"loss": 2.044,
"step": 74000
},
{
"epoch": 5.303351718285957,
"grad_norm": 0.23071114718914032,
"learning_rate": 0.0006072564430634253,
"loss": 2.0465,
"step": 75000
},
{
"epoch": 5.3740630745297695,
"grad_norm": 0.25491389632225037,
"learning_rate": 0.0005981264851032718,
"loss": 2.0482,
"step": 76000
},
{
"epoch": 5.444774430773582,
"grad_norm": 0.2559095621109009,
"learning_rate": 0.0005889873880460611,
"loss": 2.0496,
"step": 77000
},
{
"epoch": 5.515485787017395,
"grad_norm": 0.22284868359565735,
"learning_rate": 0.0005798574300859076,
"loss": 2.0525,
"step": 78000
},
{
"epoch": 5.586197143261208,
"grad_norm": 0.23331965506076813,
"learning_rate": 0.0005707183330286967,
"loss": 2.0649,
"step": 79000
},
{
"epoch": 5.65690849950502,
"grad_norm": 0.2457083910703659,
"learning_rate": 0.0005615883750685432,
"loss": 2.0515,
"step": 80000
},
{
"epoch": 5.727619855748833,
"grad_norm": 0.2747284173965454,
"learning_rate": 0.0005524492780113325,
"loss": 2.0636,
"step": 81000
},
{
"epoch": 5.798331211992646,
"grad_norm": 0.2696532607078552,
"learning_rate": 0.0005433101809541218,
"loss": 2.0548,
"step": 82000
},
{
"epoch": 5.869042568236459,
"grad_norm": 0.2052222490310669,
"learning_rate": 0.0005341802229939682,
"loss": 2.0486,
"step": 83000
},
{
"epoch": 5.9397539244802715,
"grad_norm": 0.2313115894794464,
"learning_rate": 0.0005250411259367574,
"loss": 2.0456,
"step": 84000
},
{
"epoch": 6.0,
"eval_accuracy": 0.5305995578608221,
"eval_loss": 2.372159957885742,
"eval_runtime": 124.7467,
"eval_samples_per_second": 375.705,
"eval_steps_per_second": 5.876,
"step": 84852
},
{
"epoch": 6.010465280724084,
"grad_norm": 0.2167889028787613,
"learning_rate": 0.0005159020288795467,
"loss": 2.0537,
"step": 85000
},
{
"epoch": 6.081176636967897,
"grad_norm": 0.2274264246225357,
"learning_rate": 0.0005067720709193931,
"loss": 1.9868,
"step": 86000
},
{
"epoch": 6.15188799321171,
"grad_norm": 0.21367891132831573,
"learning_rate": 0.0004976329738621824,
"loss": 1.9966,
"step": 87000
},
{
"epoch": 6.222599349455523,
"grad_norm": 0.23541270196437836,
"learning_rate": 0.0004885030159020289,
"loss": 2.0013,
"step": 88000
},
{
"epoch": 6.293310705699335,
"grad_norm": 0.24054056406021118,
"learning_rate": 0.0004793639188448181,
"loss": 1.9875,
"step": 89000
},
{
"epoch": 6.364022061943148,
"grad_norm": 0.23533108830451965,
"learning_rate": 0.00047023396088466456,
"loss": 1.9967,
"step": 90000
},
{
"epoch": 6.434733418186961,
"grad_norm": 0.2795858383178711,
"learning_rate": 0.00046110400292451105,
"loss": 2.0019,
"step": 91000
},
{
"epoch": 6.5054447744307735,
"grad_norm": 0.2619366943836212,
"learning_rate": 0.00045196490586730035,
"loss": 2.0028,
"step": 92000
},
{
"epoch": 6.576156130674587,
"grad_norm": 0.24517033994197845,
"learning_rate": 0.00044282580881008953,
"loss": 2.0044,
"step": 93000
},
{
"epoch": 6.646867486918399,
"grad_norm": 0.2473345845937729,
"learning_rate": 0.0004336867117528788,
"loss": 1.9935,
"step": 94000
},
{
"epoch": 6.717578843162212,
"grad_norm": 0.229476198554039,
"learning_rate": 0.00042454761469566806,
"loss": 1.9943,
"step": 95000
},
{
"epoch": 6.788290199406025,
"grad_norm": 0.2603880763053894,
"learning_rate": 0.0004154176567355145,
"loss": 2.0073,
"step": 96000
},
{
"epoch": 6.859001555649837,
"grad_norm": 0.2473394274711609,
"learning_rate": 0.0004062785596783038,
"loss": 2.012,
"step": 97000
},
{
"epoch": 6.92971291189365,
"grad_norm": 0.24815410375595093,
"learning_rate": 0.00039714860171815024,
"loss": 1.9983,
"step": 98000
},
{
"epoch": 7.0,
"eval_accuracy": 0.5327383571510881,
"eval_loss": 2.359189748764038,
"eval_runtime": 124.2745,
"eval_samples_per_second": 377.133,
"eval_steps_per_second": 5.898,
"step": 98994
},
{
"epoch": 7.000424268137463,
"grad_norm": 0.2351510226726532,
"learning_rate": 0.0003880095046609395,
"loss": 2.0039,
"step": 99000
},
{
"epoch": 7.0711356243812755,
"grad_norm": 0.27717456221580505,
"learning_rate": 0.0003788704076037288,
"loss": 1.9358,
"step": 100000
},
{
"epoch": 7.141846980625088,
"grad_norm": 0.2275402843952179,
"learning_rate": 0.0003697495887406324,
"loss": 1.9391,
"step": 101000
},
{
"epoch": 7.212558336868901,
"grad_norm": 0.3047560453414917,
"learning_rate": 0.0003606104916834217,
"loss": 1.9472,
"step": 102000
},
{
"epoch": 7.283269693112714,
"grad_norm": 0.2828271687030792,
"learning_rate": 0.0003514713946262109,
"loss": 1.9415,
"step": 103000
},
{
"epoch": 7.353981049356527,
"grad_norm": 0.27506691217422485,
"learning_rate": 0.0003423322975690002,
"loss": 1.9467,
"step": 104000
},
{
"epoch": 7.424692405600339,
"grad_norm": 0.23766624927520752,
"learning_rate": 0.0003332023396088467,
"loss": 1.9559,
"step": 105000
},
{
"epoch": 7.495403761844152,
"grad_norm": 0.25875958800315857,
"learning_rate": 0.00032406324255163587,
"loss": 1.9569,
"step": 106000
},
{
"epoch": 7.566115118087965,
"grad_norm": 0.24315999448299408,
"learning_rate": 0.00031493328459148237,
"loss": 1.9567,
"step": 107000
},
{
"epoch": 7.6368264743317775,
"grad_norm": 0.29540637135505676,
"learning_rate": 0.0003057941875342716,
"loss": 1.9567,
"step": 108000
},
{
"epoch": 7.707537830575591,
"grad_norm": 0.2695215940475464,
"learning_rate": 0.00029665509047706085,
"loss": 1.9605,
"step": 109000
},
{
"epoch": 7.778249186819403,
"grad_norm": 0.2626267075538635,
"learning_rate": 0.00028751599341985014,
"loss": 1.9527,
"step": 110000
},
{
"epoch": 7.848960543063216,
"grad_norm": 0.25401443243026733,
"learning_rate": 0.0002783768963626394,
"loss": 1.966,
"step": 111000
},
{
"epoch": 7.919671899307029,
"grad_norm": 0.22462651133537292,
"learning_rate": 0.0002692377993054286,
"loss": 1.9584,
"step": 112000
},
{
"epoch": 7.990383255550841,
"grad_norm": 0.28454989194869995,
"learning_rate": 0.0002601078413452751,
"loss": 1.9482,
"step": 113000
},
{
"epoch": 8.0,
"eval_accuracy": 0.5338266026714983,
"eval_loss": 2.357896327972412,
"eval_runtime": 124.302,
"eval_samples_per_second": 377.05,
"eval_steps_per_second": 5.897,
"step": 113136
},
{
"epoch": 8.061094611794655,
"grad_norm": 0.26728782057762146,
"learning_rate": 0.0002509687442880643,
"loss": 1.8937,
"step": 114000
},
{
"epoch": 8.131805968038467,
"grad_norm": 0.24568845331668854,
"learning_rate": 0.00024183878632791082,
"loss": 1.8983,
"step": 115000
},
{
"epoch": 8.20251732428228,
"grad_norm": 0.2506534457206726,
"learning_rate": 0.0002327088283677573,
"loss": 1.9067,
"step": 116000
},
{
"epoch": 8.273228680526092,
"grad_norm": 0.31150582432746887,
"learning_rate": 0.00022356973131054653,
"loss": 1.9085,
"step": 117000
},
{
"epoch": 8.343940036769904,
"grad_norm": 0.2992372512817383,
"learning_rate": 0.00021443063425333577,
"loss": 1.9085,
"step": 118000
},
{
"epoch": 8.414651393013719,
"grad_norm": 0.23084251582622528,
"learning_rate": 0.00020529153719612504,
"loss": 1.91,
"step": 119000
},
{
"epoch": 8.485362749257531,
"grad_norm": 0.2754100561141968,
"learning_rate": 0.0001961615792359715,
"loss": 1.9076,
"step": 120000
},
{
"epoch": 8.556074105501343,
"grad_norm": 0.25915420055389404,
"learning_rate": 0.00018702248217876074,
"loss": 1.915,
"step": 121000
},
{
"epoch": 8.626785461745156,
"grad_norm": 0.26031365990638733,
"learning_rate": 0.00017788338512155,
"loss": 1.9159,
"step": 122000
},
{
"epoch": 8.697496817988968,
"grad_norm": 0.2626364231109619,
"learning_rate": 0.00016875342716139648,
"loss": 1.905,
"step": 123000
},
{
"epoch": 8.768208174232782,
"grad_norm": 0.30089327692985535,
"learning_rate": 0.00015961433010418572,
"loss": 1.9046,
"step": 124000
},
{
"epoch": 8.838919530476595,
"grad_norm": 0.2982795536518097,
"learning_rate": 0.00015047523304697496,
"loss": 1.9052,
"step": 125000
},
{
"epoch": 8.909630886720407,
"grad_norm": 0.2462824285030365,
"learning_rate": 0.00014134527508682143,
"loss": 1.907,
"step": 126000
},
{
"epoch": 8.98034224296422,
"grad_norm": 0.2993851900100708,
"learning_rate": 0.0001322061780296107,
"loss": 1.9061,
"step": 127000
},
{
"epoch": 9.0,
"eval_accuracy": 0.5346066724693145,
"eval_loss": 2.3604698181152344,
"eval_runtime": 124.2295,
"eval_samples_per_second": 377.27,
"eval_steps_per_second": 5.9,
"step": 127278
},
{
"epoch": 9.051053599208032,
"grad_norm": 0.29052260518074036,
"learning_rate": 0.00012307622006945714,
"loss": 1.8738,
"step": 128000
},
{
"epoch": 9.121764955451846,
"grad_norm": 0.2522347569465637,
"learning_rate": 0.0001139462621093036,
"loss": 1.8601,
"step": 129000
},
{
"epoch": 9.192476311695659,
"grad_norm": 0.30291104316711426,
"learning_rate": 0.00010480716505209286,
"loss": 1.8793,
"step": 130000
},
{
"epoch": 9.263187667939471,
"grad_norm": 0.26850393414497375,
"learning_rate": 9.566806799488211e-05,
"loss": 1.8676,
"step": 131000
},
{
"epoch": 9.333899024183284,
"grad_norm": 0.270274817943573,
"learning_rate": 8.652897093767135e-05,
"loss": 1.8669,
"step": 132000
},
{
"epoch": 9.404610380427096,
"grad_norm": 0.2623472809791565,
"learning_rate": 7.739901297751782e-05,
"loss": 1.8651,
"step": 133000
},
{
"epoch": 9.47532173667091,
"grad_norm": 0.2898847758769989,
"learning_rate": 6.825991592030707e-05,
"loss": 1.8697,
"step": 134000
},
{
"epoch": 9.546033092914723,
"grad_norm": 0.30749163031578064,
"learning_rate": 5.912081886309633e-05,
"loss": 1.8635,
"step": 135000
},
{
"epoch": 9.616744449158535,
"grad_norm": 0.255743145942688,
"learning_rate": 4.9981721805885585e-05,
"loss": 1.8662,
"step": 136000
},
{
"epoch": 9.687455805402347,
"grad_norm": 0.30720534920692444,
"learning_rate": 4.085176384573204e-05,
"loss": 1.8582,
"step": 137000
},
{
"epoch": 9.75816716164616,
"grad_norm": 0.2820815145969391,
"learning_rate": 3.17126667885213e-05,
"loss": 1.8694,
"step": 138000
},
{
"epoch": 9.828878517889972,
"grad_norm": 0.30231404304504395,
"learning_rate": 2.2582708828367758e-05,
"loss": 1.8654,
"step": 139000
},
{
"epoch": 9.899589874133786,
"grad_norm": 0.292241632938385,
"learning_rate": 1.344361177115701e-05,
"loss": 1.8634,
"step": 140000
},
{
"epoch": 9.970301230377599,
"grad_norm": 0.25611528754234314,
"learning_rate": 4.313653811003472e-06,
"loss": 1.8692,
"step": 141000
},
{
"epoch": 10.0,
"eval_accuracy": 0.534805757971141,
"eval_loss": 2.368875503540039,
"eval_runtime": 124.6443,
"eval_samples_per_second": 376.014,
"eval_steps_per_second": 5.881,
"step": 141420
},
{
"epoch": 10.0,
"step": 141420,
"total_flos": 5.9582336320512e+17,
"train_loss": 2.1796733167279037,
"train_runtime": 31005.8847,
"train_samples_per_second": 145.948,
"train_steps_per_second": 4.561
}
],
"logging_steps": 1000,
"max_steps": 141420,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 5000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 5.9582336320512e+17,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}
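
The file above is a standard Hugging Face Trainer `trainer_state.json`: `log_history` interleaves training-loss records (logged every `logging_steps` = 1000 steps) with per-epoch evaluation records. As a minimal sketch, not part of the original file, the Python snippet below shows one way to read that array and plot the logged training loss against `eval_loss`; the local path `trainer_state.json` is an assumption.

```python
import json
import matplotlib.pyplot as plt

# Load the trainer state shown above (local path is an assumption; adjust as needed).
with open("trainer_state.json") as f:
    state = json.load(f)

# Training-loss records carry a "loss" key; per-epoch eval records carry "eval_loss".
train_logs = [e for e in state["log_history"] if "loss" in e and "eval_loss" not in e]
eval_logs = [e for e in state["log_history"] if "eval_loss" in e]

fig, ax = plt.subplots()
ax.plot([e["step"] for e in train_logs],
        [e["loss"] for e in train_logs], label="train loss")
ax.plot([e["step"] for e in eval_logs],
        [e["eval_loss"] for e in eval_logs], marker="o", label="eval loss")
ax.set_xlabel("global step")
ax.set_ylabel("cross-entropy loss")
ax.legend()
plt.show()
```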