{ "best_metric": null, "best_model_checkpoint": null, "epoch": 10.0, "eval_steps": 500, "global_step": 141420, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.07071135624381275, "grad_norm": 0.950070858001709, "learning_rate": 3.125e-05, "loss": 5.0021, "step": 1000 }, { "epoch": 0.1414227124876255, "grad_norm": 0.9159772992134094, "learning_rate": 6.25e-05, "loss": 3.5952, "step": 2000 }, { "epoch": 0.21213406873143828, "grad_norm": 0.8742101192474365, "learning_rate": 9.375e-05, "loss": 3.3236, "step": 3000 }, { "epoch": 0.282845424975251, "grad_norm": 0.7689530253410339, "learning_rate": 0.000125, "loss": 3.1344, "step": 4000 }, { "epoch": 0.3535567812190638, "grad_norm": 0.7027168273925781, "learning_rate": 0.00015625, "loss": 3.0058, "step": 5000 }, { "epoch": 0.42426813746287656, "grad_norm": 0.6757282018661499, "learning_rate": 0.0001875, "loss": 2.8925, "step": 6000 }, { "epoch": 0.4949794937066893, "grad_norm": 0.6485090851783752, "learning_rate": 0.00021875, "loss": 2.8129, "step": 7000 }, { "epoch": 0.565690849950502, "grad_norm": 0.6332668662071228, "learning_rate": 0.00025, "loss": 2.7427, "step": 8000 }, { "epoch": 0.6364022061943148, "grad_norm": 0.5585498213768005, "learning_rate": 0.00028125000000000003, "loss": 2.6942, "step": 9000 }, { "epoch": 0.7071135624381276, "grad_norm": 0.5789941549301147, "learning_rate": 0.0003125, "loss": 2.6592, "step": 10000 }, { "epoch": 0.7778249186819404, "grad_norm": 0.49083390831947327, "learning_rate": 0.00034371875, "loss": 2.6102, "step": 11000 }, { "epoch": 0.8485362749257531, "grad_norm": 0.5267419815063477, "learning_rate": 0.00037496875000000003, "loss": 2.5811, "step": 12000 }, { "epoch": 0.9192476311695659, "grad_norm": 0.46979865431785583, "learning_rate": 0.0004061875, "loss": 2.5689, "step": 13000 }, { "epoch": 0.9899589874133786, "grad_norm": 0.4309781491756439, "learning_rate": 0.00043740625, "loss": 2.5477, "step": 14000 }, { "epoch": 1.0, "eval_accuracy": 0.48551466175887975, "eval_loss": 2.7489795684814453, "eval_runtime": 122.3768, "eval_samples_per_second": 382.981, "eval_steps_per_second": 5.99, "step": 14142 }, { "epoch": 1.0606703436571914, "grad_norm": 0.3975456655025482, "learning_rate": 0.00046865625, "loss": 2.5021, "step": 15000 }, { "epoch": 1.131381699901004, "grad_norm": 0.40377160906791687, "learning_rate": 0.00049990625, "loss": 2.4895, "step": 16000 }, { "epoch": 1.2020930561448169, "grad_norm": 0.36566758155822754, "learning_rate": 0.000531125, "loss": 2.4727, "step": 17000 }, { "epoch": 1.2728044123886297, "grad_norm": 0.37737491726875305, "learning_rate": 0.0005623749999999999, "loss": 2.4652, "step": 18000 }, { "epoch": 1.3435157686324424, "grad_norm": 0.3293197751045227, "learning_rate": 0.000593625, "loss": 2.4574, "step": 19000 }, { "epoch": 1.414227124876255, "grad_norm": 0.290075421333313, "learning_rate": 0.000624875, "loss": 2.4275, "step": 20000 }, { "epoch": 1.4849384811200679, "grad_norm": 0.2959868311882019, "learning_rate": 0.00065609375, "loss": 2.4386, "step": 21000 }, { "epoch": 1.5556498373638807, "grad_norm": 0.27695414423942566, "learning_rate": 0.00068734375, "loss": 2.4161, "step": 22000 }, { "epoch": 1.6263611936076934, "grad_norm": 0.2632512152194977, "learning_rate": 0.00071859375, "loss": 2.4201, "step": 23000 }, { "epoch": 1.697072549851506, "grad_norm": 0.2265060991048813, "learning_rate": 0.0007498125, "loss": 2.4058, "step": 24000 }, { "epoch": 1.7677839060953189, "grad_norm": 0.26139551401138306, "learning_rate": 0.0007810625, "loss": 2.3958, "step": 25000 }, { "epoch": 1.8384952623391317, "grad_norm": 0.2395378053188324, "learning_rate": 0.0008123125, "loss": 2.3868, "step": 26000 }, { "epoch": 1.9092066185829444, "grad_norm": 0.23772157728672028, "learning_rate": 0.00084353125, "loss": 2.3766, "step": 27000 }, { "epoch": 1.979917974826757, "grad_norm": 0.23179960250854492, "learning_rate": 0.00087478125, "loss": 2.377, "step": 28000 }, { "epoch": 2.0, "eval_accuracy": 0.5044756048031578, "eval_loss": 2.580270290374756, "eval_runtime": 126.9742, "eval_samples_per_second": 369.114, "eval_steps_per_second": 5.773, "step": 28284 }, { "epoch": 2.05062933107057, "grad_norm": 0.25655558705329895, "learning_rate": 0.0009060312499999999, "loss": 2.3364, "step": 29000 }, { "epoch": 2.1213406873143827, "grad_norm": 0.22631210088729858, "learning_rate": 0.00093725, "loss": 2.3297, "step": 30000 }, { "epoch": 2.1920520435581956, "grad_norm": 0.2411614954471588, "learning_rate": 0.0009685000000000001, "loss": 2.3249, "step": 31000 }, { "epoch": 2.262763399802008, "grad_norm": 0.25610190629959106, "learning_rate": 0.00099975, "loss": 2.3166, "step": 32000 }, { "epoch": 2.333474756045821, "grad_norm": 0.2797723412513733, "learning_rate": 0.000990943154816304, "loss": 2.3086, "step": 33000 }, { "epoch": 2.4041861122896337, "grad_norm": 0.2100619226694107, "learning_rate": 0.0009818040577590935, "loss": 2.3117, "step": 34000 }, { "epoch": 2.4748974685334466, "grad_norm": 0.2163330614566803, "learning_rate": 0.0009726740997989398, "loss": 2.302, "step": 35000 }, { "epoch": 2.5456088247772595, "grad_norm": 0.22995002567768097, "learning_rate": 0.0009635350027417291, "loss": 2.2872, "step": 36000 }, { "epoch": 2.616320181021072, "grad_norm": 0.2004874050617218, "learning_rate": 0.0009544141838786328, "loss": 2.2843, "step": 37000 }, { "epoch": 2.6870315372648848, "grad_norm": 0.2153329849243164, "learning_rate": 0.0009452750868214221, "loss": 2.2788, "step": 38000 }, { "epoch": 2.7577428935086976, "grad_norm": 0.22890245914459229, "learning_rate": 0.0009361359897642113, "loss": 2.2711, "step": 39000 }, { "epoch": 2.82845424975251, "grad_norm": 0.2471633404493332, "learning_rate": 0.0009269968927070006, "loss": 2.2673, "step": 40000 }, { "epoch": 2.899165605996323, "grad_norm": 0.20824675261974335, "learning_rate": 0.0009178577956497898, "loss": 2.2589, "step": 41000 }, { "epoch": 2.9698769622401358, "grad_norm": 0.27108854055404663, "learning_rate": 0.0009087278376896363, "loss": 2.2625, "step": 42000 }, { "epoch": 3.0, "eval_accuracy": 0.5164308635329491, "eval_loss": 2.4760255813598633, "eval_runtime": 124.5514, "eval_samples_per_second": 376.295, "eval_steps_per_second": 5.885, "step": 42426 }, { "epoch": 3.0405883184839486, "grad_norm": 0.2760034501552582, "learning_rate": 0.0008995887406324256, "loss": 2.22, "step": 43000 }, { "epoch": 3.1112996747277615, "grad_norm": 0.24826580286026, "learning_rate": 0.0008904587826722719, "loss": 2.1969, "step": 44000 }, { "epoch": 3.182011030971574, "grad_norm": 0.1961706429719925, "learning_rate": 0.0008813196856150612, "loss": 2.1929, "step": 45000 }, { "epoch": 3.2527223872153868, "grad_norm": 0.2030291110277176, "learning_rate": 0.0008721805885578505, "loss": 2.1974, "step": 46000 }, { "epoch": 3.3234337434591996, "grad_norm": 0.24897335469722748, "learning_rate": 0.0008630414915006397, "loss": 2.1993, "step": 47000 }, { "epoch": 3.3941450997030125, "grad_norm": 0.2421874701976776, "learning_rate": 0.0008539115335404863, "loss": 2.186, "step": 48000 }, { "epoch": 3.464856455946825, "grad_norm": 0.2960880398750305, "learning_rate": 0.0008447724364832756, "loss": 2.1877, "step": 49000 }, { "epoch": 3.5355678121906378, "grad_norm": 0.20504000782966614, "learning_rate": 0.0008356424785231219, "loss": 2.1816, "step": 50000 }, { "epoch": 3.6062791684344506, "grad_norm": 0.23933938145637512, "learning_rate": 0.0008265033814659112, "loss": 2.1807, "step": 51000 }, { "epoch": 3.6769905246782635, "grad_norm": 0.23281992971897125, "learning_rate": 0.0008173734235057576, "loss": 2.1804, "step": 52000 }, { "epoch": 3.747701880922076, "grad_norm": 0.20451070368289948, "learning_rate": 0.0008082343264485469, "loss": 2.1807, "step": 53000 }, { "epoch": 3.8184132371658888, "grad_norm": 0.23853066563606262, "learning_rate": 0.0007990952293913362, "loss": 2.1767, "step": 54000 }, { "epoch": 3.8891245934097016, "grad_norm": 0.24050584435462952, "learning_rate": 0.0007899652714311825, "loss": 2.1667, "step": 55000 }, { "epoch": 3.9598359496535145, "grad_norm": 0.21722733974456787, "learning_rate": 0.0007808261743739718, "loss": 2.1715, "step": 56000 }, { "epoch": 4.0, "eval_accuracy": 0.523546219218542, "eval_loss": 2.4206314086914062, "eval_runtime": 124.5767, "eval_samples_per_second": 376.218, "eval_steps_per_second": 5.884, "step": 56568 }, { "epoch": 4.030547305897327, "grad_norm": 0.2417266070842743, "learning_rate": 0.0007716962164138184, "loss": 2.1342, "step": 57000 }, { "epoch": 4.10125866214114, "grad_norm": 0.22849377989768982, "learning_rate": 0.0007625571193566076, "loss": 2.1082, "step": 58000 }, { "epoch": 4.171970018384952, "grad_norm": 0.226350799202919, "learning_rate": 0.000753427161396454, "loss": 2.1144, "step": 59000 }, { "epoch": 4.2426813746287655, "grad_norm": 0.207255020737648, "learning_rate": 0.0007442880643392433, "loss": 2.1116, "step": 60000 }, { "epoch": 4.313392730872578, "grad_norm": 0.21048206090927124, "learning_rate": 0.0007351581063790898, "loss": 2.1092, "step": 61000 }, { "epoch": 4.384104087116391, "grad_norm": 0.26391103863716125, "learning_rate": 0.000726019009321879, "loss": 2.1105, "step": 62000 }, { "epoch": 4.454815443360204, "grad_norm": 0.22511842846870422, "learning_rate": 0.0007168799122646683, "loss": 2.1111, "step": 63000 }, { "epoch": 4.525526799604016, "grad_norm": 0.264876127243042, "learning_rate": 0.0007077499543045147, "loss": 2.1069, "step": 64000 }, { "epoch": 4.596238155847829, "grad_norm": 0.20152664184570312, "learning_rate": 0.0006986199963443612, "loss": 2.1107, "step": 65000 }, { "epoch": 4.666949512091642, "grad_norm": 0.2202775925397873, "learning_rate": 0.0006894808992871504, "loss": 2.1192, "step": 66000 }, { "epoch": 4.737660868335455, "grad_norm": 0.2462519109249115, "learning_rate": 0.0006803418022299397, "loss": 2.1094, "step": 67000 }, { "epoch": 4.8083722245792675, "grad_norm": 0.24858810007572174, "learning_rate": 0.000671202705172729, "loss": 2.116, "step": 68000 }, { "epoch": 4.87908358082308, "grad_norm": 0.25709378719329834, "learning_rate": 0.0006620727472125753, "loss": 2.1125, "step": 69000 }, { "epoch": 4.949794937066893, "grad_norm": 0.22404509782791138, "learning_rate": 0.0006529336501553646, "loss": 2.0996, "step": 70000 }, { "epoch": 5.0, "eval_accuracy": 0.5277767446216196, "eval_loss": 2.388011932373047, "eval_runtime": 124.6122, "eval_samples_per_second": 376.111, "eval_steps_per_second": 5.882, "step": 70710 }, { "epoch": 5.020506293310706, "grad_norm": 0.21308551728725433, "learning_rate": 0.0006438036921952112, "loss": 2.0836, "step": 71000 }, { "epoch": 5.091217649554518, "grad_norm": 0.26624125242233276, "learning_rate": 0.0006346645951380004, "loss": 2.0401, "step": 72000 }, { "epoch": 5.161929005798331, "grad_norm": 0.24497570097446442, "learning_rate": 0.0006255254980807897, "loss": 2.0422, "step": 73000 }, { "epoch": 5.232640362042144, "grad_norm": 0.2567467987537384, "learning_rate": 0.0006163955401206361, "loss": 2.044, "step": 74000 }, { "epoch": 5.303351718285957, "grad_norm": 0.23071114718914032, "learning_rate": 0.0006072564430634253, "loss": 2.0465, "step": 75000 }, { "epoch": 5.3740630745297695, "grad_norm": 0.25491389632225037, "learning_rate": 0.0005981264851032718, "loss": 2.0482, "step": 76000 }, { "epoch": 5.444774430773582, "grad_norm": 0.2559095621109009, "learning_rate": 0.0005889873880460611, "loss": 2.0496, "step": 77000 }, { "epoch": 5.515485787017395, "grad_norm": 0.22284868359565735, "learning_rate": 0.0005798574300859076, "loss": 2.0525, "step": 78000 }, { "epoch": 5.586197143261208, "grad_norm": 0.23331965506076813, "learning_rate": 0.0005707183330286967, "loss": 2.0649, "step": 79000 }, { "epoch": 5.65690849950502, "grad_norm": 0.2457083910703659, "learning_rate": 0.0005615883750685432, "loss": 2.0515, "step": 80000 }, { "epoch": 5.727619855748833, "grad_norm": 0.2747284173965454, "learning_rate": 0.0005524492780113325, "loss": 2.0636, "step": 81000 }, { "epoch": 5.798331211992646, "grad_norm": 0.2696532607078552, "learning_rate": 0.0005433101809541218, "loss": 2.0548, "step": 82000 }, { "epoch": 5.869042568236459, "grad_norm": 0.2052222490310669, "learning_rate": 0.0005341802229939682, "loss": 2.0486, "step": 83000 }, { "epoch": 5.9397539244802715, "grad_norm": 0.2313115894794464, "learning_rate": 0.0005250411259367574, "loss": 2.0456, "step": 84000 }, { "epoch": 6.0, "eval_accuracy": 0.5305995578608221, "eval_loss": 2.372159957885742, "eval_runtime": 124.7467, "eval_samples_per_second": 375.705, "eval_steps_per_second": 5.876, "step": 84852 }, { "epoch": 6.010465280724084, "grad_norm": 0.2167889028787613, "learning_rate": 0.0005159020288795467, "loss": 2.0537, "step": 85000 }, { "epoch": 6.081176636967897, "grad_norm": 0.2274264246225357, "learning_rate": 0.0005067720709193931, "loss": 1.9868, "step": 86000 }, { "epoch": 6.15188799321171, "grad_norm": 0.21367891132831573, "learning_rate": 0.0004976329738621824, "loss": 1.9966, "step": 87000 }, { "epoch": 6.222599349455523, "grad_norm": 0.23541270196437836, "learning_rate": 0.0004885030159020289, "loss": 2.0013, "step": 88000 }, { "epoch": 6.293310705699335, "grad_norm": 0.24054056406021118, "learning_rate": 0.0004793639188448181, "loss": 1.9875, "step": 89000 }, { "epoch": 6.364022061943148, "grad_norm": 0.23533108830451965, "learning_rate": 0.00047023396088466456, "loss": 1.9967, "step": 90000 }, { "epoch": 6.434733418186961, "grad_norm": 0.2795858383178711, "learning_rate": 0.00046110400292451105, "loss": 2.0019, "step": 91000 }, { "epoch": 6.5054447744307735, "grad_norm": 0.2619366943836212, "learning_rate": 0.00045196490586730035, "loss": 2.0028, "step": 92000 }, { "epoch": 6.576156130674587, "grad_norm": 0.24517033994197845, "learning_rate": 0.00044282580881008953, "loss": 2.0044, "step": 93000 }, { "epoch": 6.646867486918399, "grad_norm": 0.2473345845937729, "learning_rate": 0.0004336867117528788, "loss": 1.9935, "step": 94000 }, { "epoch": 6.717578843162212, "grad_norm": 0.229476198554039, "learning_rate": 0.00042454761469566806, "loss": 1.9943, "step": 95000 }, { "epoch": 6.788290199406025, "grad_norm": 0.2603880763053894, "learning_rate": 0.0004154176567355145, "loss": 2.0073, "step": 96000 }, { "epoch": 6.859001555649837, "grad_norm": 0.2473394274711609, "learning_rate": 0.0004062785596783038, "loss": 2.012, "step": 97000 }, { "epoch": 6.92971291189365, "grad_norm": 0.24815410375595093, "learning_rate": 0.00039714860171815024, "loss": 1.9983, "step": 98000 }, { "epoch": 7.0, "eval_accuracy": 0.5327383571510881, "eval_loss": 2.359189748764038, "eval_runtime": 124.2745, "eval_samples_per_second": 377.133, "eval_steps_per_second": 5.898, "step": 98994 }, { "epoch": 7.000424268137463, "grad_norm": 0.2351510226726532, "learning_rate": 0.0003880095046609395, "loss": 2.0039, "step": 99000 }, { "epoch": 7.0711356243812755, "grad_norm": 0.27717456221580505, "learning_rate": 0.0003788704076037288, "loss": 1.9358, "step": 100000 }, { "epoch": 7.141846980625088, "grad_norm": 0.2275402843952179, "learning_rate": 0.0003697495887406324, "loss": 1.9391, "step": 101000 }, { "epoch": 7.212558336868901, "grad_norm": 0.3047560453414917, "learning_rate": 0.0003606104916834217, "loss": 1.9472, "step": 102000 }, { "epoch": 7.283269693112714, "grad_norm": 0.2828271687030792, "learning_rate": 0.0003514713946262109, "loss": 1.9415, "step": 103000 }, { "epoch": 7.353981049356527, "grad_norm": 0.27506691217422485, "learning_rate": 0.0003423322975690002, "loss": 1.9467, "step": 104000 }, { "epoch": 7.424692405600339, "grad_norm": 0.23766624927520752, "learning_rate": 0.0003332023396088467, "loss": 1.9559, "step": 105000 }, { "epoch": 7.495403761844152, "grad_norm": 0.25875958800315857, "learning_rate": 0.00032406324255163587, "loss": 1.9569, "step": 106000 }, { "epoch": 7.566115118087965, "grad_norm": 0.24315999448299408, "learning_rate": 0.00031493328459148237, "loss": 1.9567, "step": 107000 }, { "epoch": 7.6368264743317775, "grad_norm": 0.29540637135505676, "learning_rate": 0.0003057941875342716, "loss": 1.9567, "step": 108000 }, { "epoch": 7.707537830575591, "grad_norm": 0.2695215940475464, "learning_rate": 0.00029665509047706085, "loss": 1.9605, "step": 109000 }, { "epoch": 7.778249186819403, "grad_norm": 0.2626267075538635, "learning_rate": 0.00028751599341985014, "loss": 1.9527, "step": 110000 }, { "epoch": 7.848960543063216, "grad_norm": 0.25401443243026733, "learning_rate": 0.0002783768963626394, "loss": 1.966, "step": 111000 }, { "epoch": 7.919671899307029, "grad_norm": 0.22462651133537292, "learning_rate": 0.0002692377993054286, "loss": 1.9584, "step": 112000 }, { "epoch": 7.990383255550841, "grad_norm": 0.28454989194869995, "learning_rate": 0.0002601078413452751, "loss": 1.9482, "step": 113000 }, { "epoch": 8.0, "eval_accuracy": 0.5338266026714983, "eval_loss": 2.357896327972412, "eval_runtime": 124.302, "eval_samples_per_second": 377.05, "eval_steps_per_second": 5.897, "step": 113136 }, { "epoch": 8.061094611794655, "grad_norm": 0.26728782057762146, "learning_rate": 0.0002509687442880643, "loss": 1.8937, "step": 114000 }, { "epoch": 8.131805968038467, "grad_norm": 0.24568845331668854, "learning_rate": 0.00024183878632791082, "loss": 1.8983, "step": 115000 }, { "epoch": 8.20251732428228, "grad_norm": 0.2506534457206726, "learning_rate": 0.0002327088283677573, "loss": 1.9067, "step": 116000 }, { "epoch": 8.273228680526092, "grad_norm": 0.31150582432746887, "learning_rate": 0.00022356973131054653, "loss": 1.9085, "step": 117000 }, { "epoch": 8.343940036769904, "grad_norm": 0.2992372512817383, "learning_rate": 0.00021443063425333577, "loss": 1.9085, "step": 118000 }, { "epoch": 8.414651393013719, "grad_norm": 0.23084251582622528, "learning_rate": 0.00020529153719612504, "loss": 1.91, "step": 119000 }, { "epoch": 8.485362749257531, "grad_norm": 0.2754100561141968, "learning_rate": 0.0001961615792359715, "loss": 1.9076, "step": 120000 }, { "epoch": 8.556074105501343, "grad_norm": 0.25915420055389404, "learning_rate": 0.00018702248217876074, "loss": 1.915, "step": 121000 }, { "epoch": 8.626785461745156, "grad_norm": 0.26031365990638733, "learning_rate": 0.00017788338512155, "loss": 1.9159, "step": 122000 }, { "epoch": 8.697496817988968, "grad_norm": 0.2626364231109619, "learning_rate": 0.00016875342716139648, "loss": 1.905, "step": 123000 }, { "epoch": 8.768208174232782, "grad_norm": 0.30089327692985535, "learning_rate": 0.00015961433010418572, "loss": 1.9046, "step": 124000 }, { "epoch": 8.838919530476595, "grad_norm": 0.2982795536518097, "learning_rate": 0.00015047523304697496, "loss": 1.9052, "step": 125000 }, { "epoch": 8.909630886720407, "grad_norm": 0.2462824285030365, "learning_rate": 0.00014134527508682143, "loss": 1.907, "step": 126000 }, { "epoch": 8.98034224296422, "grad_norm": 0.2993851900100708, "learning_rate": 0.0001322061780296107, "loss": 1.9061, "step": 127000 }, { "epoch": 9.0, "eval_accuracy": 0.5346066724693145, "eval_loss": 2.3604698181152344, "eval_runtime": 124.2295, "eval_samples_per_second": 377.27, "eval_steps_per_second": 5.9, "step": 127278 }, { "epoch": 9.051053599208032, "grad_norm": 0.29052260518074036, "learning_rate": 0.00012307622006945714, "loss": 1.8738, "step": 128000 }, { "epoch": 9.121764955451846, "grad_norm": 0.2522347569465637, "learning_rate": 0.0001139462621093036, "loss": 1.8601, "step": 129000 }, { "epoch": 9.192476311695659, "grad_norm": 0.30291104316711426, "learning_rate": 0.00010480716505209286, "loss": 1.8793, "step": 130000 }, { "epoch": 9.263187667939471, "grad_norm": 0.26850393414497375, "learning_rate": 9.566806799488211e-05, "loss": 1.8676, "step": 131000 }, { "epoch": 9.333899024183284, "grad_norm": 0.270274817943573, "learning_rate": 8.652897093767135e-05, "loss": 1.8669, "step": 132000 }, { "epoch": 9.404610380427096, "grad_norm": 0.2623472809791565, "learning_rate": 7.739901297751782e-05, "loss": 1.8651, "step": 133000 }, { "epoch": 9.47532173667091, "grad_norm": 0.2898847758769989, "learning_rate": 6.825991592030707e-05, "loss": 1.8697, "step": 134000 }, { "epoch": 9.546033092914723, "grad_norm": 0.30749163031578064, "learning_rate": 5.912081886309633e-05, "loss": 1.8635, "step": 135000 }, { "epoch": 9.616744449158535, "grad_norm": 0.255743145942688, "learning_rate": 4.9981721805885585e-05, "loss": 1.8662, "step": 136000 }, { "epoch": 9.687455805402347, "grad_norm": 0.30720534920692444, "learning_rate": 4.085176384573204e-05, "loss": 1.8582, "step": 137000 }, { "epoch": 9.75816716164616, "grad_norm": 0.2820815145969391, "learning_rate": 3.17126667885213e-05, "loss": 1.8694, "step": 138000 }, { "epoch": 9.828878517889972, "grad_norm": 0.30231404304504395, "learning_rate": 2.2582708828367758e-05, "loss": 1.8654, "step": 139000 }, { "epoch": 9.899589874133786, "grad_norm": 0.292241632938385, "learning_rate": 1.344361177115701e-05, "loss": 1.8634, "step": 140000 }, { "epoch": 9.970301230377599, "grad_norm": 0.25611528754234314, "learning_rate": 4.313653811003472e-06, "loss": 1.8692, "step": 141000 }, { "epoch": 10.0, "eval_accuracy": 0.534805757971141, "eval_loss": 2.368875503540039, "eval_runtime": 124.6443, "eval_samples_per_second": 376.014, "eval_steps_per_second": 5.881, "step": 141420 }, { "epoch": 10.0, "step": 141420, "total_flos": 5.9582336320512e+17, "train_loss": 2.1796733167279037, "train_runtime": 31005.8847, "train_samples_per_second": 145.948, "train_steps_per_second": 4.561 } ], "logging_steps": 1000, "max_steps": 141420, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 5000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.9582336320512e+17, "train_batch_size": 32, "trial_name": null, "trial_params": null }