TinyDNABERT/finetuning_outputs/dnasplice/TinyDNABERT_base_model/checkpoint-4000/trainer_state.json
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 16.0,
  "eval_steps": 500,
  "global_step": 4000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.1,
      "grad_norm": 1.1046091318130493,
      "learning_rate": 5.319148936170213e-05,
      "loss": 0.6562,
      "step": 25
    },
    {
      "epoch": 0.2,
      "grad_norm": 0.4066573977470398,
      "learning_rate": 0.00010638297872340425,
      "loss": 0.5373,
      "step": 50
    },
    {
      "epoch": 0.3,
      "grad_norm": 0.7340325713157654,
      "learning_rate": 0.00015957446808510637,
      "loss": 0.4995,
      "step": 75
    },
    {
      "epoch": 0.4,
      "grad_norm": 2.134040117263794,
      "learning_rate": 0.0002127659574468085,
      "loss": 0.4871,
      "step": 100
    },
    {
      "epoch": 0.5,
      "grad_norm": 2.911975860595703,
      "learning_rate": 0.00026595744680851064,
      "loss": 0.4507,
      "step": 125
    },
    {
      "epoch": 0.6,
      "grad_norm": 2.040365219116211,
      "learning_rate": 0.00031914893617021275,
      "loss": 0.3944,
      "step": 150
    },
    {
      "epoch": 0.7,
      "grad_norm": 5.643661022186279,
      "learning_rate": 0.0003723404255319149,
      "loss": 0.3863,
      "step": 175
    },
    {
      "epoch": 0.8,
      "grad_norm": 3.882492780685425,
      "learning_rate": 0.0003992081821181128,
      "loss": 0.3875,
      "step": 200
    },
    {
      "epoch": 0.9,
      "grad_norm": 5.87437105178833,
      "learning_rate": 0.0003975585615308479,
      "loss": 0.3856,
      "step": 225
    },
    {
      "epoch": 1.0,
      "grad_norm": 4.611206531524658,
      "learning_rate": 0.00039590894094358297,
      "loss": 0.3866,
      "step": 250
    },
    {
      "epoch": 1.1,
      "grad_norm": 3.6404929161071777,
      "learning_rate": 0.0003942593203563181,
      "loss": 0.3437,
      "step": 275
    },
    {
      "epoch": 1.2,
      "grad_norm": 6.469444751739502,
      "learning_rate": 0.0003926096997690532,
      "loss": 0.3192,
      "step": 300
    },
    {
      "epoch": 1.3,
      "grad_norm": 3.6251447200775146,
      "learning_rate": 0.00039096007918178817,
      "loss": 0.349,
      "step": 325
    },
    {
      "epoch": 1.4,
      "grad_norm": 2.854794979095459,
      "learning_rate": 0.0003893104585945233,
      "loss": 0.3274,
      "step": 350
    },
    {
      "epoch": 1.5,
      "grad_norm": 4.805097579956055,
      "learning_rate": 0.0003876608380072583,
      "loss": 0.3279,
      "step": 375
    },
    {
      "epoch": 1.6,
      "grad_norm": 2.9891581535339355,
      "learning_rate": 0.00038601121741999343,
      "loss": 0.3166,
      "step": 400
    },
    {
      "epoch": 1.7,
      "grad_norm": 5.333752155303955,
      "learning_rate": 0.0003843615968327285,
      "loss": 0.2856,
      "step": 425
    },
    {
      "epoch": 1.8,
      "grad_norm": 2.615621566772461,
      "learning_rate": 0.0003827119762454636,
      "loss": 0.3286,
      "step": 450
    },
    {
      "epoch": 1.9,
      "grad_norm": 2.4228415489196777,
      "learning_rate": 0.00038106235565819863,
      "loss": 0.2812,
      "step": 475
    },
    {
      "epoch": 2.0,
      "grad_norm": 3.6015517711639404,
      "learning_rate": 0.0003794127350709337,
      "loss": 0.2884,
      "step": 500
    },
    {
      "epoch": 2.1,
      "grad_norm": 2.7867860794067383,
      "learning_rate": 0.0003777631144836688,
      "loss": 0.2582,
      "step": 525
    },
    {
      "epoch": 2.2,
      "grad_norm": 10.079645156860352,
      "learning_rate": 0.00037611349389640383,
      "loss": 0.245,
      "step": 550
    },
    {
      "epoch": 2.3,
      "grad_norm": 2.475917100906372,
      "learning_rate": 0.00037446387330913894,
      "loss": 0.2615,
      "step": 575
    },
    {
      "epoch": 2.4,
      "grad_norm": 8.098592758178711,
      "learning_rate": 0.000372814252721874,
      "loss": 0.2484,
      "step": 600
    },
    {
      "epoch": 2.5,
      "grad_norm": 2.318723201751709,
      "learning_rate": 0.00037116463213460903,
      "loss": 0.2056,
      "step": 625
    },
    {
      "epoch": 2.6,
      "grad_norm": 3.616283893585205,
      "learning_rate": 0.00036951501154734414,
      "loss": 0.2358,
      "step": 650
    },
    {
      "epoch": 2.7,
      "grad_norm": 6.419492721557617,
      "learning_rate": 0.0003678653909600792,
      "loss": 0.2204,
      "step": 675
    },
    {
      "epoch": 2.8,
      "grad_norm": 2.9333388805389404,
      "learning_rate": 0.0003662157703728143,
      "loss": 0.1857,
      "step": 700
    },
    {
      "epoch": 2.9,
      "grad_norm": 7.691112995147705,
      "learning_rate": 0.00036456614978554934,
      "loss": 0.2054,
      "step": 725
    },
    {
      "epoch": 3.0,
      "grad_norm": 4.908492565155029,
      "learning_rate": 0.00036291652919828444,
      "loss": 0.1956,
      "step": 750
    },
    {
      "epoch": 3.1,
      "grad_norm": 1.8923287391662598,
      "learning_rate": 0.00036126690861101944,
      "loss": 0.1708,
      "step": 775
    },
    {
      "epoch": 3.2,
      "grad_norm": 4.403504848480225,
      "learning_rate": 0.00035961728802375454,
      "loss": 0.1441,
      "step": 800
    },
    {
      "epoch": 3.3,
      "grad_norm": 4.117386817932129,
      "learning_rate": 0.0003579676674364896,
      "loss": 0.1586,
      "step": 825
    },
    {
      "epoch": 3.4,
      "grad_norm": 7.119816303253174,
      "learning_rate": 0.0003563180468492247,
      "loss": 0.1472,
      "step": 850
    },
    {
      "epoch": 3.5,
      "grad_norm": 3.5796430110931396,
      "learning_rate": 0.0003546684262619598,
      "loss": 0.1802,
      "step": 875
    },
    {
      "epoch": 3.6,
      "grad_norm": 2.97688889503479,
      "learning_rate": 0.00035301880567469485,
      "loss": 0.1578,
      "step": 900
    },
    {
      "epoch": 3.7,
      "grad_norm": 3.716148614883423,
      "learning_rate": 0.0003513691850874299,
      "loss": 0.1662,
      "step": 925
    },
    {
      "epoch": 3.8,
      "grad_norm": 6.1249566078186035,
      "learning_rate": 0.00034971956450016495,
      "loss": 0.159,
      "step": 950
    },
    {
      "epoch": 3.9,
      "grad_norm": 3.0592427253723145,
      "learning_rate": 0.00034806994391290005,
      "loss": 0.148,
      "step": 975
    },
    {
      "epoch": 4.0,
      "grad_norm": 4.467265605926514,
      "learning_rate": 0.0003464203233256351,
      "loss": 0.1442,
      "step": 1000
    },
    {
      "epoch": 4.1,
      "grad_norm": 2.7223546504974365,
      "learning_rate": 0.0003447707027383702,
      "loss": 0.1339,
      "step": 1025
    },
    {
      "epoch": 4.2,
      "grad_norm": 3.698854923248291,
      "learning_rate": 0.00034312108215110525,
      "loss": 0.1381,
      "step": 1050
    },
    {
      "epoch": 4.3,
      "grad_norm": 7.418384552001953,
      "learning_rate": 0.0003414714615638403,
      "loss": 0.1158,
      "step": 1075
    },
    {
      "epoch": 4.4,
      "grad_norm": 1.2887814044952393,
      "learning_rate": 0.0003398218409765754,
      "loss": 0.0855,
      "step": 1100
    },
    {
      "epoch": 4.5,
      "grad_norm": 0.838731586933136,
      "learning_rate": 0.00033817222038931045,
      "loss": 0.1272,
      "step": 1125
    },
    {
      "epoch": 4.6,
      "grad_norm": 5.912592887878418,
      "learning_rate": 0.00033652259980204556,
      "loss": 0.1156,
      "step": 1150
    },
    {
      "epoch": 4.7,
      "grad_norm": 8.49521255493164,
      "learning_rate": 0.0003348729792147806,
      "loss": 0.1182,
      "step": 1175
    },
    {
      "epoch": 4.8,
      "grad_norm": 10.278315544128418,
      "learning_rate": 0.0003332233586275157,
      "loss": 0.1071,
      "step": 1200
    },
    {
      "epoch": 4.9,
      "grad_norm": 0.724703311920166,
      "learning_rate": 0.00033157373804025076,
      "loss": 0.11,
      "step": 1225
    },
    {
      "epoch": 5.0,
      "grad_norm": 1.6808199882507324,
      "learning_rate": 0.0003299241174529858,
      "loss": 0.1101,
      "step": 1250
    },
    {
      "epoch": 5.1,
      "grad_norm": 0.1038585901260376,
      "learning_rate": 0.0003282744968657209,
      "loss": 0.0791,
      "step": 1275
    },
    {
      "epoch": 5.2,
      "grad_norm": 0.15866787731647491,
      "learning_rate": 0.00032662487627845596,
      "loss": 0.0638,
      "step": 1300
    },
    {
      "epoch": 5.3,
      "grad_norm": 6.748287200927734,
      "learning_rate": 0.00032497525569119106,
      "loss": 0.0961,
      "step": 1325
    },
    {
      "epoch": 5.4,
      "grad_norm": 1.5960336923599243,
      "learning_rate": 0.0003233256351039261,
      "loss": 0.1057,
      "step": 1350
    },
    {
      "epoch": 5.5,
      "grad_norm": 3.3678526878356934,
      "learning_rate": 0.00032167601451666116,
      "loss": 0.1092,
      "step": 1375
    },
    {
      "epoch": 5.6,
      "grad_norm": 4.743562698364258,
      "learning_rate": 0.00032002639392939627,
      "loss": 0.1054,
      "step": 1400
    },
    {
      "epoch": 5.7,
      "grad_norm": 12.860156059265137,
      "learning_rate": 0.0003183767733421313,
      "loss": 0.1009,
      "step": 1425
    },
    {
      "epoch": 5.8,
      "grad_norm": 0.8820792436599731,
      "learning_rate": 0.0003167271527548664,
      "loss": 0.1219,
      "step": 1450
    },
    {
      "epoch": 5.9,
      "grad_norm": 2.732572317123413,
      "learning_rate": 0.00031507753216760147,
      "loss": 0.0682,
      "step": 1475
    },
    {
      "epoch": 6.0,
      "grad_norm": 9.810711860656738,
      "learning_rate": 0.00031342791158033657,
      "loss": 0.1134,
      "step": 1500
    },
    {
      "epoch": 6.1,
      "grad_norm": 6.699489593505859,
      "learning_rate": 0.00031177829099307157,
      "loss": 0.0557,
      "step": 1525
    },
    {
      "epoch": 6.2,
      "grad_norm": 14.691123008728027,
      "learning_rate": 0.00031012867040580667,
      "loss": 0.0599,
      "step": 1550
    },
    {
      "epoch": 6.3,
      "grad_norm": 24.44135093688965,
      "learning_rate": 0.0003084790498185417,
      "loss": 0.0818,
      "step": 1575
    },
    {
      "epoch": 6.4,
      "grad_norm": 0.033995576202869415,
      "learning_rate": 0.0003068294292312768,
      "loss": 0.0692,
      "step": 1600
    },
    {
      "epoch": 6.5,
      "grad_norm": 6.722524642944336,
      "learning_rate": 0.0003051798086440119,
      "loss": 0.0702,
      "step": 1625
    },
    {
      "epoch": 6.6,
      "grad_norm": 2.9519436359405518,
      "learning_rate": 0.000303530188056747,
      "loss": 0.073,
      "step": 1650
    },
    {
      "epoch": 6.7,
      "grad_norm": 1.5118328332901,
      "learning_rate": 0.000301880567469482,
      "loss": 0.0926,
      "step": 1675
    },
    {
      "epoch": 6.8,
      "grad_norm": 2.1923201084136963,
      "learning_rate": 0.0003002309468822171,
      "loss": 0.0821,
      "step": 1700
    },
    {
      "epoch": 6.9,
      "grad_norm": 3.34309458732605,
      "learning_rate": 0.0002985813262949522,
      "loss": 0.0827,
      "step": 1725
    },
    {
      "epoch": 7.0,
      "grad_norm": 10.289947509765625,
      "learning_rate": 0.00029693170570768723,
      "loss": 0.1162,
      "step": 1750
    },
    {
      "epoch": 7.1,
      "grad_norm": 0.051936667412519455,
      "learning_rate": 0.00029528208512042233,
      "loss": 0.0626,
      "step": 1775
    },
    {
      "epoch": 7.2,
      "grad_norm": 1.8913339376449585,
      "learning_rate": 0.0002936324645331574,
      "loss": 0.0657,
      "step": 1800
    },
    {
      "epoch": 7.3,
      "grad_norm": 15.073695182800293,
      "learning_rate": 0.00029198284394589243,
      "loss": 0.0539,
      "step": 1825
    },
    {
      "epoch": 7.4,
      "grad_norm": 4.514949798583984,
      "learning_rate": 0.00029033322335862753,
      "loss": 0.056,
      "step": 1850
    },
    {
      "epoch": 7.5,
      "grad_norm": 0.46094000339508057,
      "learning_rate": 0.0002886836027713626,
      "loss": 0.0667,
      "step": 1875
    },
    {
      "epoch": 7.6,
      "grad_norm": 4.682873725891113,
      "learning_rate": 0.0002870339821840977,
      "loss": 0.0673,
      "step": 1900
    },
    {
      "epoch": 7.7,
      "grad_norm": 3.581599235534668,
      "learning_rate": 0.00028538436159683273,
      "loss": 0.0699,
      "step": 1925
    },
    {
      "epoch": 7.8,
      "grad_norm": 3.2518703937530518,
      "learning_rate": 0.00028373474100956784,
      "loss": 0.0628,
      "step": 1950
    },
    {
      "epoch": 7.9,
      "grad_norm": 0.24071219563484192,
      "learning_rate": 0.0002820851204223029,
      "loss": 0.0747,
      "step": 1975
    },
    {
      "epoch": 8.0,
      "grad_norm": 2.8545596599578857,
      "learning_rate": 0.00028043549983503794,
      "loss": 0.0688,
      "step": 2000
    },
    {
      "epoch": 8.1,
      "grad_norm": 0.049499694257974625,
      "learning_rate": 0.00027878587924777304,
      "loss": 0.0563,
      "step": 2025
    },
    {
      "epoch": 8.2,
      "grad_norm": 22.112478256225586,
      "learning_rate": 0.0002771362586605081,
      "loss": 0.0545,
      "step": 2050
    },
    {
      "epoch": 8.3,
      "grad_norm": 20.87473487854004,
      "learning_rate": 0.0002754866380732432,
      "loss": 0.0256,
      "step": 2075
    },
    {
      "epoch": 8.4,
      "grad_norm": 2.317768096923828,
      "learning_rate": 0.00027383701748597824,
      "loss": 0.0753,
      "step": 2100
    },
    {
      "epoch": 8.5,
      "grad_norm": 20.241336822509766,
      "learning_rate": 0.0002721873968987133,
      "loss": 0.0257,
      "step": 2125
    },
    {
      "epoch": 8.6,
      "grad_norm": 14.994575500488281,
      "learning_rate": 0.00027053777631144834,
      "loss": 0.0748,
      "step": 2150
    },
    {
      "epoch": 8.7,
      "grad_norm": 0.8268639445304871,
      "learning_rate": 0.00026888815572418344,
      "loss": 0.0805,
      "step": 2175
    },
    {
      "epoch": 8.8,
      "grad_norm": 0.031071001663804054,
      "learning_rate": 0.00026723853513691855,
      "loss": 0.0603,
      "step": 2200
    },
    {
      "epoch": 8.9,
      "grad_norm": 2.4572932720184326,
      "learning_rate": 0.0002655889145496536,
      "loss": 0.071,
      "step": 2225
    },
    {
      "epoch": 9.0,
      "grad_norm": 0.21708321571350098,
      "learning_rate": 0.0002639392939623887,
      "loss": 0.0606,
      "step": 2250
    },
    {
      "epoch": 9.1,
      "grad_norm": 14.928559303283691,
      "learning_rate": 0.0002622896733751237,
      "loss": 0.0443,
      "step": 2275
    },
    {
      "epoch": 9.2,
      "grad_norm": 1.108826994895935,
      "learning_rate": 0.0002606400527878588,
      "loss": 0.0509,
      "step": 2300
    },
    {
      "epoch": 9.3,
      "grad_norm": 0.05721910670399666,
      "learning_rate": 0.00025899043220059385,
      "loss": 0.0459,
      "step": 2325
    },
    {
      "epoch": 9.4,
      "grad_norm": 7.81903600692749,
      "learning_rate": 0.00025734081161332895,
      "loss": 0.0535,
      "step": 2350
    },
    {
      "epoch": 9.5,
      "grad_norm": 0.01475981343537569,
      "learning_rate": 0.000255691191026064,
      "loss": 0.0399,
      "step": 2375
    },
    {
      "epoch": 9.6,
      "grad_norm": 5.496493339538574,
      "learning_rate": 0.0002540415704387991,
      "loss": 0.0744,
      "step": 2400
    },
    {
      "epoch": 9.7,
      "grad_norm": 0.05929339677095413,
      "learning_rate": 0.00025239194985153415,
      "loss": 0.0416,
      "step": 2425
    },
    {
      "epoch": 9.8,
      "grad_norm": 0.06961135566234589,
      "learning_rate": 0.0002507423292642692,
      "loss": 0.04,
      "step": 2450
    },
    {
      "epoch": 9.9,
      "grad_norm": 0.02165246568620205,
      "learning_rate": 0.0002490927086770043,
      "loss": 0.0984,
      "step": 2475
    },
    {
      "epoch": 10.0,
      "grad_norm": 25.155675888061523,
      "learning_rate": 0.00024744308808973936,
      "loss": 0.042,
      "step": 2500
    },
    {
      "epoch": 10.1,
      "grad_norm": 8.566617965698242,
      "learning_rate": 0.00024579346750247446,
      "loss": 0.0385,
      "step": 2525
    },
    {
      "epoch": 10.2,
      "grad_norm": 0.024992404505610466,
      "learning_rate": 0.0002441438469152095,
      "loss": 0.0327,
      "step": 2550
    },
    {
      "epoch": 10.3,
      "grad_norm": 0.22233247756958008,
      "learning_rate": 0.00024249422632794456,
      "loss": 0.0211,
      "step": 2575
    },
    {
      "epoch": 10.4,
      "grad_norm": 13.307366371154785,
      "learning_rate": 0.00024084460574067963,
      "loss": 0.0472,
      "step": 2600
    },
    {
      "epoch": 10.5,
      "grad_norm": 0.17787696421146393,
      "learning_rate": 0.0002391949851534147,
      "loss": 0.0346,
      "step": 2625
    },
    {
      "epoch": 10.6,
      "grad_norm": 1.2845549583435059,
      "learning_rate": 0.0002375453645661498,
      "loss": 0.0587,
      "step": 2650
    },
    {
      "epoch": 10.7,
      "grad_norm": 0.0603482760488987,
      "learning_rate": 0.00023589574397888486,
      "loss": 0.0394,
      "step": 2675
    },
    {
      "epoch": 10.8,
      "grad_norm": 0.031819943338632584,
      "learning_rate": 0.00023424612339161997,
      "loss": 0.0434,
      "step": 2700
    },
    {
      "epoch": 10.9,
      "grad_norm": 1.5738537311553955,
      "learning_rate": 0.000232596502804355,
      "loss": 0.0393,
      "step": 2725
    },
    {
      "epoch": 11.0,
      "grad_norm": 0.6862583756446838,
      "learning_rate": 0.00023094688221709007,
      "loss": 0.0183,
      "step": 2750
    },
    {
      "epoch": 11.1,
      "grad_norm": 0.0053489054553210735,
      "learning_rate": 0.00022929726162982514,
      "loss": 0.0277,
      "step": 2775
    },
    {
      "epoch": 11.2,
      "grad_norm": 0.04365606606006622,
      "learning_rate": 0.00022764764104256022,
      "loss": 0.0337,
      "step": 2800
    },
    {
      "epoch": 11.3,
      "grad_norm": 1.3348641395568848,
      "learning_rate": 0.0002259980204552953,
      "loss": 0.0219,
      "step": 2825
    },
    {
      "epoch": 11.4,
      "grad_norm": 3.3695547580718994,
      "learning_rate": 0.00022434839986803037,
      "loss": 0.0227,
      "step": 2850
    },
    {
      "epoch": 11.5,
      "grad_norm": 2.507455587387085,
      "learning_rate": 0.00022269877928076542,
      "loss": 0.0443,
      "step": 2875
    },
    {
      "epoch": 11.6,
      "grad_norm": 0.013451912440359592,
      "learning_rate": 0.0002210491586935005,
      "loss": 0.0486,
      "step": 2900
    },
    {
      "epoch": 11.7,
      "grad_norm": 0.031177405267953873,
      "learning_rate": 0.00021939953810623557,
      "loss": 0.0511,
      "step": 2925
    },
    {
      "epoch": 11.8,
      "grad_norm": 0.02148960903286934,
      "learning_rate": 0.00021774991751897065,
      "loss": 0.0269,
      "step": 2950
    },
    {
      "epoch": 11.9,
      "grad_norm": 0.01219563465565443,
      "learning_rate": 0.00021610029693170573,
      "loss": 0.042,
      "step": 2975
    },
    {
      "epoch": 12.0,
      "grad_norm": 0.2062515765428543,
      "learning_rate": 0.0002144506763444408,
      "loss": 0.031,
      "step": 3000
    },
    {
      "epoch": 12.1,
      "grad_norm": 3.235403060913086,
      "learning_rate": 0.00021280105575717582,
      "loss": 0.0414,
      "step": 3025
    },
    {
      "epoch": 12.2,
      "grad_norm": 1.089821696281433,
      "learning_rate": 0.00021115143516991093,
      "loss": 0.0692,
      "step": 3050
    },
    {
      "epoch": 12.3,
      "grad_norm": 0.013664484024047852,
      "learning_rate": 0.000209501814582646,
      "loss": 0.0367,
      "step": 3075
    },
    {
      "epoch": 12.4,
      "grad_norm": 1.3101387023925781,
      "learning_rate": 0.00020785219399538108,
      "loss": 0.0277,
      "step": 3100
    },
    {
      "epoch": 12.5,
      "grad_norm": 3.244853973388672,
      "learning_rate": 0.00020620257340811616,
      "loss": 0.0373,
      "step": 3125
    },
    {
      "epoch": 12.6,
      "grad_norm": 0.013548053801059723,
      "learning_rate": 0.00020455295282085123,
      "loss": 0.027,
      "step": 3150
    },
    {
      "epoch": 12.7,
      "grad_norm": 0.004670978989452124,
      "learning_rate": 0.00020290333223358626,
      "loss": 0.0252,
      "step": 3175
    },
    {
      "epoch": 12.8,
      "grad_norm": 0.007080752402544022,
      "learning_rate": 0.00020125371164632133,
      "loss": 0.0187,
      "step": 3200
    },
    {
      "epoch": 12.9,
      "grad_norm": 0.005918944254517555,
      "learning_rate": 0.0001996040910590564,
      "loss": 0.0423,
      "step": 3225
    },
    {
      "epoch": 13.0,
      "grad_norm": 4.530269145965576,
      "learning_rate": 0.00019795447047179148,
      "loss": 0.0381,
      "step": 3250
    },
    {
      "epoch": 13.1,
      "grad_norm": 0.00533739197999239,
      "learning_rate": 0.0001963048498845266,
      "loss": 0.0097,
      "step": 3275
    },
    {
      "epoch": 13.2,
      "grad_norm": 0.7641226649284363,
      "learning_rate": 0.00019465522929726164,
      "loss": 0.0205,
      "step": 3300
    },
    {
      "epoch": 13.3,
      "grad_norm": 2.16174054145813,
      "learning_rate": 0.00019300560870999671,
      "loss": 0.0174,
      "step": 3325
    },
    {
      "epoch": 13.4,
      "grad_norm": 0.00814723875373602,
      "learning_rate": 0.0001913559881227318,
      "loss": 0.0215,
      "step": 3350
    },
    {
      "epoch": 13.5,
      "grad_norm": 10.500170707702637,
      "learning_rate": 0.00018970636753546684,
      "loss": 0.0226,
      "step": 3375
    },
    {
      "epoch": 13.6,
      "grad_norm": 2.612359046936035,
      "learning_rate": 0.00018805674694820192,
      "loss": 0.0314,
      "step": 3400
    },
    {
      "epoch": 13.7,
      "grad_norm": 0.11265736818313599,
      "learning_rate": 0.000186407126360937,
      "loss": 0.0234,
      "step": 3425
    },
    {
      "epoch": 13.8,
      "grad_norm": 0.06453213095664978,
      "learning_rate": 0.00018475750577367207,
      "loss": 0.0314,
      "step": 3450
    },
    {
      "epoch": 13.9,
      "grad_norm": 0.01650763861835003,
      "learning_rate": 0.00018310788518640715,
      "loss": 0.0403,
      "step": 3475
    },
    {
      "epoch": 14.0,
      "grad_norm": 0.002764373552054167,
      "learning_rate": 0.00018145826459914222,
      "loss": 0.0137,
      "step": 3500
    },
    {
      "epoch": 14.1,
      "grad_norm": 0.0022430066019296646,
      "learning_rate": 0.00017980864401187727,
      "loss": 0.0084,
      "step": 3525
    },
    {
      "epoch": 14.2,
      "grad_norm": 0.007267679553478956,
      "learning_rate": 0.00017815902342461235,
      "loss": 0.0161,
      "step": 3550
    },
    {
      "epoch": 14.3,
      "grad_norm": 0.004844362381845713,
      "learning_rate": 0.00017650940283734742,
      "loss": 0.0399,
      "step": 3575
    },
    {
      "epoch": 14.4,
      "grad_norm": 0.05529040843248367,
      "learning_rate": 0.00017485978225008247,
      "loss": 0.0192,
      "step": 3600
    },
    {
      "epoch": 14.5,
      "grad_norm": 0.008248478174209595,
      "learning_rate": 0.00017321016166281755,
      "loss": 0.0079,
      "step": 3625
    },
    {
      "epoch": 14.6,
      "grad_norm": 0.02617962658405304,
      "learning_rate": 0.00017156054107555263,
      "loss": 0.0244,
      "step": 3650
    },
    {
      "epoch": 14.7,
      "grad_norm": 0.004497932270169258,
      "learning_rate": 0.0001699109204882877,
      "loss": 0.0464,
      "step": 3675
    },
    {
      "epoch": 14.8,
      "grad_norm": 0.008586671203374863,
      "learning_rate": 0.00016826129990102278,
      "loss": 0.0354,
      "step": 3700
    },
    {
      "epoch": 14.9,
      "grad_norm": 0.30177807807922363,
      "learning_rate": 0.00016661167931375785,
      "loss": 0.0147,
      "step": 3725
    },
    {
      "epoch": 15.0,
      "grad_norm": 0.10390808433294296,
      "learning_rate": 0.0001649620587264929,
      "loss": 0.0078,
      "step": 3750
    },
    {
      "epoch": 15.1,
      "grad_norm": 1.584868311882019,
      "learning_rate": 0.00016331243813922798,
      "loss": 0.0059,
      "step": 3775
    },
    {
      "epoch": 15.2,
      "grad_norm": 0.0038078685756772757,
      "learning_rate": 0.00016166281755196306,
      "loss": 0.0143,
      "step": 3800
    },
    {
      "epoch": 15.3,
      "grad_norm": 0.017795555293560028,
      "learning_rate": 0.00016001319696469813,
      "loss": 0.0003,
      "step": 3825
    },
    {
      "epoch": 15.4,
      "grad_norm": 7.246474742889404,
      "learning_rate": 0.0001583635763774332,
      "loss": 0.0288,
      "step": 3850
    },
    {
      "epoch": 15.5,
      "grad_norm": 0.0014716258738189936,
      "learning_rate": 0.00015671395579016829,
      "loss": 0.0113,
      "step": 3875
    },
    {
      "epoch": 15.6,
      "grad_norm": 0.002159764291718602,
      "learning_rate": 0.00015506433520290334,
      "loss": 0.0138,
      "step": 3900
    },
    {
      "epoch": 15.7,
      "grad_norm": 27.16730308532715,
      "learning_rate": 0.0001534147146156384,
      "loss": 0.0048,
      "step": 3925
    },
    {
      "epoch": 15.8,
      "grad_norm": 0.004570333287119865,
      "learning_rate": 0.0001517650940283735,
      "loss": 0.0144,
      "step": 3950
    },
    {
      "epoch": 15.9,
      "grad_norm": 0.008330491371452808,
      "learning_rate": 0.00015011547344110854,
      "loss": 0.0371,
      "step": 3975
    },
    {
      "epoch": 16.0,
      "grad_norm": 15.75979232788086,
      "learning_rate": 0.00014846585285384361,
      "loss": 0.0207,
      "step": 4000
    }
  ],
  "logging_steps": 25,
  "max_steps": 6250,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 25,
  "save_steps": 1000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 203277926400000.0,
  "train_batch_size": 20,
  "trial_name": null,
  "trial_params": null
}