{ "best_metric": null, "best_model_checkpoint": null, "epoch": 16.0, "eval_steps": 500, "global_step": 4000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.1, "grad_norm": 1.1046091318130493, "learning_rate": 5.319148936170213e-05, "loss": 0.6562, "step": 25 }, { "epoch": 0.2, "grad_norm": 0.4066573977470398, "learning_rate": 0.00010638297872340425, "loss": 0.5373, "step": 50 }, { "epoch": 0.3, "grad_norm": 0.7340325713157654, "learning_rate": 0.00015957446808510637, "loss": 0.4995, "step": 75 }, { "epoch": 0.4, "grad_norm": 2.134040117263794, "learning_rate": 0.0002127659574468085, "loss": 0.4871, "step": 100 }, { "epoch": 0.5, "grad_norm": 2.911975860595703, "learning_rate": 0.00026595744680851064, "loss": 0.4507, "step": 125 }, { "epoch": 0.6, "grad_norm": 2.040365219116211, "learning_rate": 0.00031914893617021275, "loss": 0.3944, "step": 150 }, { "epoch": 0.7, "grad_norm": 5.643661022186279, "learning_rate": 0.0003723404255319149, "loss": 0.3863, "step": 175 }, { "epoch": 0.8, "grad_norm": 3.882492780685425, "learning_rate": 0.0003992081821181128, "loss": 0.3875, "step": 200 }, { "epoch": 0.9, "grad_norm": 5.87437105178833, "learning_rate": 0.0003975585615308479, "loss": 0.3856, "step": 225 }, { "epoch": 1.0, "grad_norm": 4.611206531524658, "learning_rate": 0.00039590894094358297, "loss": 0.3866, "step": 250 }, { "epoch": 1.1, "grad_norm": 3.6404929161071777, "learning_rate": 0.0003942593203563181, "loss": 0.3437, "step": 275 }, { "epoch": 1.2, "grad_norm": 6.469444751739502, "learning_rate": 0.0003926096997690532, "loss": 0.3192, "step": 300 }, { "epoch": 1.3, "grad_norm": 3.6251447200775146, "learning_rate": 0.00039096007918178817, "loss": 0.349, "step": 325 }, { "epoch": 1.4, "grad_norm": 2.854794979095459, "learning_rate": 0.0003893104585945233, "loss": 0.3274, "step": 350 }, { "epoch": 1.5, "grad_norm": 4.805097579956055, "learning_rate": 0.0003876608380072583, "loss": 0.3279, "step": 375 }, { "epoch": 1.6, "grad_norm": 2.9891581535339355, "learning_rate": 0.00038601121741999343, "loss": 0.3166, "step": 400 }, { "epoch": 1.7, "grad_norm": 5.333752155303955, "learning_rate": 0.0003843615968327285, "loss": 0.2856, "step": 425 }, { "epoch": 1.8, "grad_norm": 2.615621566772461, "learning_rate": 0.0003827119762454636, "loss": 0.3286, "step": 450 }, { "epoch": 1.9, "grad_norm": 2.4228415489196777, "learning_rate": 0.00038106235565819863, "loss": 0.2812, "step": 475 }, { "epoch": 2.0, "grad_norm": 3.6015517711639404, "learning_rate": 0.0003794127350709337, "loss": 0.2884, "step": 500 }, { "epoch": 2.1, "grad_norm": 2.7867860794067383, "learning_rate": 0.0003777631144836688, "loss": 0.2582, "step": 525 }, { "epoch": 2.2, "grad_norm": 10.079645156860352, "learning_rate": 0.00037611349389640383, "loss": 0.245, "step": 550 }, { "epoch": 2.3, "grad_norm": 2.475917100906372, "learning_rate": 0.00037446387330913894, "loss": 0.2615, "step": 575 }, { "epoch": 2.4, "grad_norm": 8.098592758178711, "learning_rate": 0.000372814252721874, "loss": 0.2484, "step": 600 }, { "epoch": 2.5, "grad_norm": 2.318723201751709, "learning_rate": 0.00037116463213460903, "loss": 0.2056, "step": 625 }, { "epoch": 2.6, "grad_norm": 3.616283893585205, "learning_rate": 0.00036951501154734414, "loss": 0.2358, "step": 650 }, { "epoch": 2.7, "grad_norm": 6.419492721557617, "learning_rate": 0.0003678653909600792, "loss": 0.2204, "step": 675 }, { "epoch": 2.8, "grad_norm": 2.9333388805389404, "learning_rate": 0.0003662157703728143, "loss": 0.1857, 
"step": 700 }, { "epoch": 2.9, "grad_norm": 7.691112995147705, "learning_rate": 0.00036456614978554934, "loss": 0.2054, "step": 725 }, { "epoch": 3.0, "grad_norm": 4.908492565155029, "learning_rate": 0.00036291652919828444, "loss": 0.1956, "step": 750 }, { "epoch": 3.1, "grad_norm": 1.8923287391662598, "learning_rate": 0.00036126690861101944, "loss": 0.1708, "step": 775 }, { "epoch": 3.2, "grad_norm": 4.403504848480225, "learning_rate": 0.00035961728802375454, "loss": 0.1441, "step": 800 }, { "epoch": 3.3, "grad_norm": 4.117386817932129, "learning_rate": 0.0003579676674364896, "loss": 0.1586, "step": 825 }, { "epoch": 3.4, "grad_norm": 7.119816303253174, "learning_rate": 0.0003563180468492247, "loss": 0.1472, "step": 850 }, { "epoch": 3.5, "grad_norm": 3.5796430110931396, "learning_rate": 0.0003546684262619598, "loss": 0.1802, "step": 875 }, { "epoch": 3.6, "grad_norm": 2.97688889503479, "learning_rate": 0.00035301880567469485, "loss": 0.1578, "step": 900 }, { "epoch": 3.7, "grad_norm": 3.716148614883423, "learning_rate": 0.0003513691850874299, "loss": 0.1662, "step": 925 }, { "epoch": 3.8, "grad_norm": 6.1249566078186035, "learning_rate": 0.00034971956450016495, "loss": 0.159, "step": 950 }, { "epoch": 3.9, "grad_norm": 3.0592427253723145, "learning_rate": 0.00034806994391290005, "loss": 0.148, "step": 975 }, { "epoch": 4.0, "grad_norm": 4.467265605926514, "learning_rate": 0.0003464203233256351, "loss": 0.1442, "step": 1000 }, { "epoch": 4.1, "grad_norm": 2.7223546504974365, "learning_rate": 0.0003447707027383702, "loss": 0.1339, "step": 1025 }, { "epoch": 4.2, "grad_norm": 3.698854923248291, "learning_rate": 0.00034312108215110525, "loss": 0.1381, "step": 1050 }, { "epoch": 4.3, "grad_norm": 7.418384552001953, "learning_rate": 0.0003414714615638403, "loss": 0.1158, "step": 1075 }, { "epoch": 4.4, "grad_norm": 1.2887814044952393, "learning_rate": 0.0003398218409765754, "loss": 0.0855, "step": 1100 }, { "epoch": 4.5, "grad_norm": 0.838731586933136, "learning_rate": 0.00033817222038931045, "loss": 0.1272, "step": 1125 }, { "epoch": 4.6, "grad_norm": 5.912592887878418, "learning_rate": 0.00033652259980204556, "loss": 0.1156, "step": 1150 }, { "epoch": 4.7, "grad_norm": 8.49521255493164, "learning_rate": 0.0003348729792147806, "loss": 0.1182, "step": 1175 }, { "epoch": 4.8, "grad_norm": 10.278315544128418, "learning_rate": 0.0003332233586275157, "loss": 0.1071, "step": 1200 }, { "epoch": 4.9, "grad_norm": 0.724703311920166, "learning_rate": 0.00033157373804025076, "loss": 0.11, "step": 1225 }, { "epoch": 5.0, "grad_norm": 1.6808199882507324, "learning_rate": 0.0003299241174529858, "loss": 0.1101, "step": 1250 }, { "epoch": 5.1, "grad_norm": 0.1038585901260376, "learning_rate": 0.0003282744968657209, "loss": 0.0791, "step": 1275 }, { "epoch": 5.2, "grad_norm": 0.15866787731647491, "learning_rate": 0.00032662487627845596, "loss": 0.0638, "step": 1300 }, { "epoch": 5.3, "grad_norm": 6.748287200927734, "learning_rate": 0.00032497525569119106, "loss": 0.0961, "step": 1325 }, { "epoch": 5.4, "grad_norm": 1.5960336923599243, "learning_rate": 0.0003233256351039261, "loss": 0.1057, "step": 1350 }, { "epoch": 5.5, "grad_norm": 3.3678526878356934, "learning_rate": 0.00032167601451666116, "loss": 0.1092, "step": 1375 }, { "epoch": 5.6, "grad_norm": 4.743562698364258, "learning_rate": 0.00032002639392939627, "loss": 0.1054, "step": 1400 }, { "epoch": 5.7, "grad_norm": 12.860156059265137, "learning_rate": 0.0003183767733421313, "loss": 0.1009, "step": 1425 }, { "epoch": 5.8, "grad_norm": 
0.8820792436599731, "learning_rate": 0.0003167271527548664, "loss": 0.1219, "step": 1450 }, { "epoch": 5.9, "grad_norm": 2.732572317123413, "learning_rate": 0.00031507753216760147, "loss": 0.0682, "step": 1475 }, { "epoch": 6.0, "grad_norm": 9.810711860656738, "learning_rate": 0.00031342791158033657, "loss": 0.1134, "step": 1500 }, { "epoch": 6.1, "grad_norm": 6.699489593505859, "learning_rate": 0.00031177829099307157, "loss": 0.0557, "step": 1525 }, { "epoch": 6.2, "grad_norm": 14.691123008728027, "learning_rate": 0.00031012867040580667, "loss": 0.0599, "step": 1550 }, { "epoch": 6.3, "grad_norm": 24.44135093688965, "learning_rate": 0.0003084790498185417, "loss": 0.0818, "step": 1575 }, { "epoch": 6.4, "grad_norm": 0.033995576202869415, "learning_rate": 0.0003068294292312768, "loss": 0.0692, "step": 1600 }, { "epoch": 6.5, "grad_norm": 6.722524642944336, "learning_rate": 0.0003051798086440119, "loss": 0.0702, "step": 1625 }, { "epoch": 6.6, "grad_norm": 2.9519436359405518, "learning_rate": 0.000303530188056747, "loss": 0.073, "step": 1650 }, { "epoch": 6.7, "grad_norm": 1.5118328332901, "learning_rate": 0.000301880567469482, "loss": 0.0926, "step": 1675 }, { "epoch": 6.8, "grad_norm": 2.1923201084136963, "learning_rate": 0.0003002309468822171, "loss": 0.0821, "step": 1700 }, { "epoch": 6.9, "grad_norm": 3.34309458732605, "learning_rate": 0.0002985813262949522, "loss": 0.0827, "step": 1725 }, { "epoch": 7.0, "grad_norm": 10.289947509765625, "learning_rate": 0.00029693170570768723, "loss": 0.1162, "step": 1750 }, { "epoch": 7.1, "grad_norm": 0.051936667412519455, "learning_rate": 0.00029528208512042233, "loss": 0.0626, "step": 1775 }, { "epoch": 7.2, "grad_norm": 1.8913339376449585, "learning_rate": 0.0002936324645331574, "loss": 0.0657, "step": 1800 }, { "epoch": 7.3, "grad_norm": 15.073695182800293, "learning_rate": 0.00029198284394589243, "loss": 0.0539, "step": 1825 }, { "epoch": 7.4, "grad_norm": 4.514949798583984, "learning_rate": 0.00029033322335862753, "loss": 0.056, "step": 1850 }, { "epoch": 7.5, "grad_norm": 0.46094000339508057, "learning_rate": 0.0002886836027713626, "loss": 0.0667, "step": 1875 }, { "epoch": 7.6, "grad_norm": 4.682873725891113, "learning_rate": 0.0002870339821840977, "loss": 0.0673, "step": 1900 }, { "epoch": 7.7, "grad_norm": 3.581599235534668, "learning_rate": 0.00028538436159683273, "loss": 0.0699, "step": 1925 }, { "epoch": 7.8, "grad_norm": 3.2518703937530518, "learning_rate": 0.00028373474100956784, "loss": 0.0628, "step": 1950 }, { "epoch": 7.9, "grad_norm": 0.24071219563484192, "learning_rate": 0.0002820851204223029, "loss": 0.0747, "step": 1975 }, { "epoch": 8.0, "grad_norm": 2.8545596599578857, "learning_rate": 0.00028043549983503794, "loss": 0.0688, "step": 2000 }, { "epoch": 8.1, "grad_norm": 0.049499694257974625, "learning_rate": 0.00027878587924777304, "loss": 0.0563, "step": 2025 }, { "epoch": 8.2, "grad_norm": 22.112478256225586, "learning_rate": 0.0002771362586605081, "loss": 0.0545, "step": 2050 }, { "epoch": 8.3, "grad_norm": 20.87473487854004, "learning_rate": 0.0002754866380732432, "loss": 0.0256, "step": 2075 }, { "epoch": 8.4, "grad_norm": 2.317768096923828, "learning_rate": 0.00027383701748597824, "loss": 0.0753, "step": 2100 }, { "epoch": 8.5, "grad_norm": 20.241336822509766, "learning_rate": 0.0002721873968987133, "loss": 0.0257, "step": 2125 }, { "epoch": 8.6, "grad_norm": 14.994575500488281, "learning_rate": 0.00027053777631144834, "loss": 0.0748, "step": 2150 }, { "epoch": 8.7, "grad_norm": 0.8268639445304871, "learning_rate": 
0.00026888815572418344, "loss": 0.0805, "step": 2175 }, { "epoch": 8.8, "grad_norm": 0.031071001663804054, "learning_rate": 0.00026723853513691855, "loss": 0.0603, "step": 2200 }, { "epoch": 8.9, "grad_norm": 2.4572932720184326, "learning_rate": 0.0002655889145496536, "loss": 0.071, "step": 2225 }, { "epoch": 9.0, "grad_norm": 0.21708321571350098, "learning_rate": 0.0002639392939623887, "loss": 0.0606, "step": 2250 }, { "epoch": 9.1, "grad_norm": 14.928559303283691, "learning_rate": 0.0002622896733751237, "loss": 0.0443, "step": 2275 }, { "epoch": 9.2, "grad_norm": 1.108826994895935, "learning_rate": 0.0002606400527878588, "loss": 0.0509, "step": 2300 }, { "epoch": 9.3, "grad_norm": 0.05721910670399666, "learning_rate": 0.00025899043220059385, "loss": 0.0459, "step": 2325 }, { "epoch": 9.4, "grad_norm": 7.81903600692749, "learning_rate": 0.00025734081161332895, "loss": 0.0535, "step": 2350 }, { "epoch": 9.5, "grad_norm": 0.01475981343537569, "learning_rate": 0.000255691191026064, "loss": 0.0399, "step": 2375 }, { "epoch": 9.6, "grad_norm": 5.496493339538574, "learning_rate": 0.0002540415704387991, "loss": 0.0744, "step": 2400 }, { "epoch": 9.7, "grad_norm": 0.05929339677095413, "learning_rate": 0.00025239194985153415, "loss": 0.0416, "step": 2425 }, { "epoch": 9.8, "grad_norm": 0.06961135566234589, "learning_rate": 0.0002507423292642692, "loss": 0.04, "step": 2450 }, { "epoch": 9.9, "grad_norm": 0.02165246568620205, "learning_rate": 0.0002490927086770043, "loss": 0.0984, "step": 2475 }, { "epoch": 10.0, "grad_norm": 25.155675888061523, "learning_rate": 0.00024744308808973936, "loss": 0.042, "step": 2500 }, { "epoch": 10.1, "grad_norm": 8.566617965698242, "learning_rate": 0.00024579346750247446, "loss": 0.0385, "step": 2525 }, { "epoch": 10.2, "grad_norm": 0.024992404505610466, "learning_rate": 0.0002441438469152095, "loss": 0.0327, "step": 2550 }, { "epoch": 10.3, "grad_norm": 0.22233247756958008, "learning_rate": 0.00024249422632794456, "loss": 0.0211, "step": 2575 }, { "epoch": 10.4, "grad_norm": 13.307366371154785, "learning_rate": 0.00024084460574067963, "loss": 0.0472, "step": 2600 }, { "epoch": 10.5, "grad_norm": 0.17787696421146393, "learning_rate": 0.0002391949851534147, "loss": 0.0346, "step": 2625 }, { "epoch": 10.6, "grad_norm": 1.2845549583435059, "learning_rate": 0.0002375453645661498, "loss": 0.0587, "step": 2650 }, { "epoch": 10.7, "grad_norm": 0.0603482760488987, "learning_rate": 0.00023589574397888486, "loss": 0.0394, "step": 2675 }, { "epoch": 10.8, "grad_norm": 0.031819943338632584, "learning_rate": 0.00023424612339161997, "loss": 0.0434, "step": 2700 }, { "epoch": 10.9, "grad_norm": 1.5738537311553955, "learning_rate": 0.000232596502804355, "loss": 0.0393, "step": 2725 }, { "epoch": 11.0, "grad_norm": 0.6862583756446838, "learning_rate": 0.00023094688221709007, "loss": 0.0183, "step": 2750 }, { "epoch": 11.1, "grad_norm": 0.0053489054553210735, "learning_rate": 0.00022929726162982514, "loss": 0.0277, "step": 2775 }, { "epoch": 11.2, "grad_norm": 0.04365606606006622, "learning_rate": 0.00022764764104256022, "loss": 0.0337, "step": 2800 }, { "epoch": 11.3, "grad_norm": 1.3348641395568848, "learning_rate": 0.0002259980204552953, "loss": 0.0219, "step": 2825 }, { "epoch": 11.4, "grad_norm": 3.3695547580718994, "learning_rate": 0.00022434839986803037, "loss": 0.0227, "step": 2850 }, { "epoch": 11.5, "grad_norm": 2.507455587387085, "learning_rate": 0.00022269877928076542, "loss": 0.0443, "step": 2875 }, { "epoch": 11.6, "grad_norm": 0.013451912440359592, "learning_rate": 
0.0002210491586935005, "loss": 0.0486, "step": 2900 }, { "epoch": 11.7, "grad_norm": 0.031177405267953873, "learning_rate": 0.00021939953810623557, "loss": 0.0511, "step": 2925 }, { "epoch": 11.8, "grad_norm": 0.02148960903286934, "learning_rate": 0.00021774991751897065, "loss": 0.0269, "step": 2950 }, { "epoch": 11.9, "grad_norm": 0.01219563465565443, "learning_rate": 0.00021610029693170573, "loss": 0.042, "step": 2975 }, { "epoch": 12.0, "grad_norm": 0.2062515765428543, "learning_rate": 0.0002144506763444408, "loss": 0.031, "step": 3000 }, { "epoch": 12.1, "grad_norm": 3.235403060913086, "learning_rate": 0.00021280105575717582, "loss": 0.0414, "step": 3025 }, { "epoch": 12.2, "grad_norm": 1.089821696281433, "learning_rate": 0.00021115143516991093, "loss": 0.0692, "step": 3050 }, { "epoch": 12.3, "grad_norm": 0.013664484024047852, "learning_rate": 0.000209501814582646, "loss": 0.0367, "step": 3075 }, { "epoch": 12.4, "grad_norm": 1.3101387023925781, "learning_rate": 0.00020785219399538108, "loss": 0.0277, "step": 3100 }, { "epoch": 12.5, "grad_norm": 3.244853973388672, "learning_rate": 0.00020620257340811616, "loss": 0.0373, "step": 3125 }, { "epoch": 12.6, "grad_norm": 0.013548053801059723, "learning_rate": 0.00020455295282085123, "loss": 0.027, "step": 3150 }, { "epoch": 12.7, "grad_norm": 0.004670978989452124, "learning_rate": 0.00020290333223358626, "loss": 0.0252, "step": 3175 }, { "epoch": 12.8, "grad_norm": 0.007080752402544022, "learning_rate": 0.00020125371164632133, "loss": 0.0187, "step": 3200 }, { "epoch": 12.9, "grad_norm": 0.005918944254517555, "learning_rate": 0.0001996040910590564, "loss": 0.0423, "step": 3225 }, { "epoch": 13.0, "grad_norm": 4.530269145965576, "learning_rate": 0.00019795447047179148, "loss": 0.0381, "step": 3250 }, { "epoch": 13.1, "grad_norm": 0.00533739197999239, "learning_rate": 0.0001963048498845266, "loss": 0.0097, "step": 3275 }, { "epoch": 13.2, "grad_norm": 0.7641226649284363, "learning_rate": 0.00019465522929726164, "loss": 0.0205, "step": 3300 }, { "epoch": 13.3, "grad_norm": 2.16174054145813, "learning_rate": 0.00019300560870999671, "loss": 0.0174, "step": 3325 }, { "epoch": 13.4, "grad_norm": 0.00814723875373602, "learning_rate": 0.0001913559881227318, "loss": 0.0215, "step": 3350 }, { "epoch": 13.5, "grad_norm": 10.500170707702637, "learning_rate": 0.00018970636753546684, "loss": 0.0226, "step": 3375 }, { "epoch": 13.6, "grad_norm": 2.612359046936035, "learning_rate": 0.00018805674694820192, "loss": 0.0314, "step": 3400 }, { "epoch": 13.7, "grad_norm": 0.11265736818313599, "learning_rate": 0.000186407126360937, "loss": 0.0234, "step": 3425 }, { "epoch": 13.8, "grad_norm": 0.06453213095664978, "learning_rate": 0.00018475750577367207, "loss": 0.0314, "step": 3450 }, { "epoch": 13.9, "grad_norm": 0.01650763861835003, "learning_rate": 0.00018310788518640715, "loss": 0.0403, "step": 3475 }, { "epoch": 14.0, "grad_norm": 0.002764373552054167, "learning_rate": 0.00018145826459914222, "loss": 0.0137, "step": 3500 }, { "epoch": 14.1, "grad_norm": 0.0022430066019296646, "learning_rate": 0.00017980864401187727, "loss": 0.0084, "step": 3525 }, { "epoch": 14.2, "grad_norm": 0.007267679553478956, "learning_rate": 0.00017815902342461235, "loss": 0.0161, "step": 3550 }, { "epoch": 14.3, "grad_norm": 0.004844362381845713, "learning_rate": 0.00017650940283734742, "loss": 0.0399, "step": 3575 }, { "epoch": 14.4, "grad_norm": 0.05529040843248367, "learning_rate": 0.00017485978225008247, "loss": 0.0192, "step": 3600 }, { "epoch": 14.5, "grad_norm": 
0.008248478174209595, "learning_rate": 0.00017321016166281755, "loss": 0.0079, "step": 3625 }, { "epoch": 14.6, "grad_norm": 0.02617962658405304, "learning_rate": 0.00017156054107555263, "loss": 0.0244, "step": 3650 }, { "epoch": 14.7, "grad_norm": 0.004497932270169258, "learning_rate": 0.0001699109204882877, "loss": 0.0464, "step": 3675 }, { "epoch": 14.8, "grad_norm": 0.008586671203374863, "learning_rate": 0.00016826129990102278, "loss": 0.0354, "step": 3700 }, { "epoch": 14.9, "grad_norm": 0.30177807807922363, "learning_rate": 0.00016661167931375785, "loss": 0.0147, "step": 3725 }, { "epoch": 15.0, "grad_norm": 0.10390808433294296, "learning_rate": 0.0001649620587264929, "loss": 0.0078, "step": 3750 }, { "epoch": 15.1, "grad_norm": 1.584868311882019, "learning_rate": 0.00016331243813922798, "loss": 0.0059, "step": 3775 }, { "epoch": 15.2, "grad_norm": 0.0038078685756772757, "learning_rate": 0.00016166281755196306, "loss": 0.0143, "step": 3800 }, { "epoch": 15.3, "grad_norm": 0.017795555293560028, "learning_rate": 0.00016001319696469813, "loss": 0.0003, "step": 3825 }, { "epoch": 15.4, "grad_norm": 7.246474742889404, "learning_rate": 0.0001583635763774332, "loss": 0.0288, "step": 3850 }, { "epoch": 15.5, "grad_norm": 0.0014716258738189936, "learning_rate": 0.00015671395579016829, "loss": 0.0113, "step": 3875 }, { "epoch": 15.6, "grad_norm": 0.002159764291718602, "learning_rate": 0.00015506433520290334, "loss": 0.0138, "step": 3900 }, { "epoch": 15.7, "grad_norm": 27.16730308532715, "learning_rate": 0.0001534147146156384, "loss": 0.0048, "step": 3925 }, { "epoch": 15.8, "grad_norm": 0.004570333287119865, "learning_rate": 0.0001517650940283735, "loss": 0.0144, "step": 3950 }, { "epoch": 15.9, "grad_norm": 0.008330491371452808, "learning_rate": 0.00015011547344110854, "loss": 0.0371, "step": 3975 }, { "epoch": 16.0, "grad_norm": 15.75979232788086, "learning_rate": 0.00014846585285384361, "loss": 0.0207, "step": 4000 } ], "logging_steps": 25, "max_steps": 6250, "num_input_tokens_seen": 0, "num_train_epochs": 25, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 203277926400000.0, "train_batch_size": 20, "trial_name": null, "trial_params": null }
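
The object above is a Hugging Face `Trainer` checkpoint state (`trainer_state.json`): `log_history` holds one entry per `logging_steps` (every 25 steps here), plus run-level settings such as `max_steps`, `save_steps`, and `train_batch_size`. As a rough illustration only, the sketch below shows one way to read that history and plot the logged loss and learning-rate schedule; the file name and output path are assumptions, not part of the original.

```python
# Minimal sketch: plot the loss and learning-rate curves recorded in
# log_history. Assumes the JSON above is saved as "trainer_state.json"
# next to this script (hypothetical path) and that matplotlib is installed.
import json

import matplotlib.pyplot as plt

with open("trainer_state.json") as f:   # assumed file name
    state = json.load(f)

# Each training entry carries step, epoch, loss, learning_rate, grad_norm.
logs = [e for e in state["log_history"] if "loss" in e]
steps = [e["step"] for e in logs]
loss = [e["loss"] for e in logs]
lr = [e["learning_rate"] for e in logs]

fig, (ax_loss, ax_lr) = plt.subplots(1, 2, figsize=(10, 4))
ax_loss.plot(steps, loss)
ax_loss.set_xlabel("step")
ax_loss.set_ylabel("training loss")
ax_lr.plot(steps, lr)
ax_lr.set_xlabel("step")
ax_lr.set_ylabel("learning rate")
fig.tight_layout()
fig.savefig("training_curves.png")      # assumed output path
```

With the values logged here, such a plot would show the learning rate warming up over the first ~200 steps, then decaying linearly toward `max_steps` (6250), while the training loss falls from about 0.66 at step 25 to roughly 0.02 by step 4000.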