TinyDNABERT / Finetuned Models / finetuning_outputs/tfbs/TinyDNABERT_base_model/checkpoint-6000/trainer_state.json
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 24.0,
  "eval_steps": 500,
  "global_step": 6000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.1,
      "grad_norm": 0.6611918807029724,
      "learning_rate": 5.319148936170213e-05,
      "loss": 0.6932,
      "step": 25
    },
    {
      "epoch": 0.2,
      "grad_norm": 0.9467485547065735,
      "learning_rate": 0.00010638297872340425,
      "loss": 0.6912,
      "step": 50
    },
    {
      "epoch": 0.3,
      "grad_norm": 0.9465051889419556,
      "learning_rate": 0.00015957446808510637,
      "loss": 0.6893,
      "step": 75
    },
    {
      "epoch": 0.4,
      "grad_norm": 1.5430934429168701,
      "learning_rate": 0.0002127659574468085,
      "loss": 0.6804,
      "step": 100
    },
    {
      "epoch": 0.5,
      "grad_norm": 1.6103107929229736,
      "learning_rate": 0.00026595744680851064,
      "loss": 0.6806,
      "step": 125
    },
    {
      "epoch": 0.6,
      "grad_norm": 2.3773200511932373,
      "learning_rate": 0.00031914893617021275,
      "loss": 0.6801,
      "step": 150
    },
    {
      "epoch": 0.7,
      "grad_norm": 1.8832203149795532,
      "learning_rate": 0.0003723404255319149,
      "loss": 0.6791,
      "step": 175
    },
    {
      "epoch": 0.8,
      "grad_norm": 1.3350876569747925,
      "learning_rate": 0.0003992081821181128,
      "loss": 0.6746,
      "step": 200
    },
    {
      "epoch": 0.9,
      "grad_norm": 1.5796219110488892,
      "learning_rate": 0.0003975585615308479,
      "loss": 0.6771,
      "step": 225
    },
    {
      "epoch": 1.0,
      "grad_norm": 1.884006381034851,
      "learning_rate": 0.00039590894094358297,
      "loss": 0.6649,
      "step": 250
    },
    {
      "epoch": 1.1,
      "grad_norm": 3.286440372467041,
      "learning_rate": 0.0003942593203563181,
      "loss": 0.6388,
      "step": 275
    },
    {
      "epoch": 1.2,
      "grad_norm": 8.244946479797363,
      "learning_rate": 0.0003926096997690532,
      "loss": 0.6238,
      "step": 300
    },
    {
      "epoch": 1.3,
      "grad_norm": 4.265683650970459,
      "learning_rate": 0.00039096007918178817,
      "loss": 0.6383,
      "step": 325
    },
    {
      "epoch": 1.4,
      "grad_norm": 4.115826606750488,
      "learning_rate": 0.0003893104585945233,
      "loss": 0.6194,
      "step": 350
    },
    {
      "epoch": 1.5,
      "grad_norm": 5.694250583648682,
      "learning_rate": 0.0003876608380072583,
      "loss": 0.6324,
      "step": 375
    },
    {
      "epoch": 1.6,
      "grad_norm": 3.463121175765991,
      "learning_rate": 0.00038601121741999343,
      "loss": 0.621,
      "step": 400
    },
    {
      "epoch": 1.7,
      "grad_norm": 4.582865238189697,
      "learning_rate": 0.0003843615968327285,
      "loss": 0.6116,
      "step": 425
    },
    {
      "epoch": 1.8,
      "grad_norm": 11.996281623840332,
      "learning_rate": 0.0003827119762454636,
      "loss": 0.6393,
      "step": 450
    },
    {
      "epoch": 1.9,
      "grad_norm": 3.0407373905181885,
      "learning_rate": 0.00038106235565819863,
      "loss": 0.628,
      "step": 475
    },
    {
      "epoch": 2.0,
      "grad_norm": 2.917588233947754,
      "learning_rate": 0.0003794127350709337,
      "loss": 0.6078,
      "step": 500
    },
    {
      "epoch": 2.1,
      "grad_norm": 4.748379707336426,
      "learning_rate": 0.0003777631144836688,
      "loss": 0.4899,
      "step": 525
    },
    {
      "epoch": 2.2,
      "grad_norm": 3.8076977729797363,
      "learning_rate": 0.00037611349389640383,
      "loss": 0.5086,
      "step": 550
    },
    {
      "epoch": 2.3,
      "grad_norm": 5.2440714836120605,
      "learning_rate": 0.00037446387330913894,
      "loss": 0.5327,
      "step": 575
    },
    {
      "epoch": 2.4,
      "grad_norm": 7.110438346862793,
      "learning_rate": 0.000372814252721874,
      "loss": 0.5436,
      "step": 600
    },
    {
      "epoch": 2.5,
      "grad_norm": 5.46150541305542,
      "learning_rate": 0.00037116463213460903,
      "loss": 0.5294,
      "step": 625
    },
    {
      "epoch": 2.6,
      "grad_norm": 5.136163234710693,
      "learning_rate": 0.00036951501154734414,
      "loss": 0.5245,
      "step": 650
    },
    {
      "epoch": 2.7,
      "grad_norm": 8.735346794128418,
      "learning_rate": 0.0003678653909600792,
      "loss": 0.5449,
      "step": 675
    },
    {
      "epoch": 2.8,
      "grad_norm": 2.922825574874878,
      "learning_rate": 0.0003662157703728143,
      "loss": 0.5406,
      "step": 700
    },
    {
      "epoch": 2.9,
      "grad_norm": 7.744819641113281,
      "learning_rate": 0.00036456614978554934,
      "loss": 0.5447,
      "step": 725
    },
    {
      "epoch": 3.0,
      "grad_norm": 11.6185884475708,
      "learning_rate": 0.00036291652919828444,
      "loss": 0.5195,
      "step": 750
    },
    {
      "epoch": 3.1,
      "grad_norm": 3.73836088180542,
      "learning_rate": 0.00036126690861101944,
      "loss": 0.3824,
      "step": 775
    },
    {
      "epoch": 3.2,
      "grad_norm": 14.850343704223633,
      "learning_rate": 0.00035961728802375454,
      "loss": 0.4071,
      "step": 800
    },
    {
      "epoch": 3.3,
      "grad_norm": 5.7157440185546875,
      "learning_rate": 0.0003579676674364896,
      "loss": 0.3986,
      "step": 825
    },
    {
      "epoch": 3.4,
      "grad_norm": 12.418399810791016,
      "learning_rate": 0.0003563180468492247,
      "loss": 0.4282,
      "step": 850
    },
    {
      "epoch": 3.5,
      "grad_norm": 12.793001174926758,
      "learning_rate": 0.0003546684262619598,
      "loss": 0.4822,
      "step": 875
    },
    {
      "epoch": 3.6,
      "grad_norm": 6.489450931549072,
      "learning_rate": 0.00035301880567469485,
      "loss": 0.4239,
      "step": 900
    },
    {
      "epoch": 3.7,
      "grad_norm": 5.365822792053223,
      "learning_rate": 0.0003513691850874299,
      "loss": 0.421,
      "step": 925
    },
    {
      "epoch": 3.8,
      "grad_norm": 12.643745422363281,
      "learning_rate": 0.00034971956450016495,
      "loss": 0.3964,
      "step": 950
    },
    {
      "epoch": 3.9,
      "grad_norm": 14.334024429321289,
      "learning_rate": 0.00034806994391290005,
      "loss": 0.4634,
      "step": 975
    },
    {
      "epoch": 4.0,
      "grad_norm": 6.819091320037842,
      "learning_rate": 0.0003464203233256351,
      "loss": 0.4139,
      "step": 1000
    },
    {
      "epoch": 4.1,
      "grad_norm": 8.29238224029541,
      "learning_rate": 0.0003447707027383702,
      "loss": 0.2695,
      "step": 1025
    },
    {
      "epoch": 4.2,
      "grad_norm": 5.984206676483154,
      "learning_rate": 0.00034312108215110525,
      "loss": 0.2653,
      "step": 1050
    },
    {
      "epoch": 4.3,
      "grad_norm": 5.9425435066223145,
      "learning_rate": 0.0003414714615638403,
      "loss": 0.2982,
      "step": 1075
    },
    {
      "epoch": 4.4,
      "grad_norm": 7.1877593994140625,
      "learning_rate": 0.0003398218409765754,
      "loss": 0.3309,
      "step": 1100
    },
    {
      "epoch": 4.5,
      "grad_norm": 17.83046531677246,
      "learning_rate": 0.00033817222038931045,
      "loss": 0.3467,
      "step": 1125
    },
    {
      "epoch": 4.6,
      "grad_norm": 4.865128517150879,
      "learning_rate": 0.00033652259980204556,
      "loss": 0.3117,
      "step": 1150
    },
    {
      "epoch": 4.7,
      "grad_norm": 31.427154541015625,
      "learning_rate": 0.0003348729792147806,
      "loss": 0.3273,
      "step": 1175
    },
    {
      "epoch": 4.8,
      "grad_norm": 26.77984619140625,
      "learning_rate": 0.0003332233586275157,
      "loss": 0.3504,
      "step": 1200
    },
    {
      "epoch": 4.9,
      "grad_norm": 9.389993667602539,
      "learning_rate": 0.00033157373804025076,
      "loss": 0.3339,
      "step": 1225
    },
    {
      "epoch": 5.0,
      "grad_norm": 25.70441246032715,
      "learning_rate": 0.0003299241174529858,
      "loss": 0.3336,
      "step": 1250
    },
    {
      "epoch": 5.1,
      "grad_norm": 13.482085227966309,
      "learning_rate": 0.0003282744968657209,
      "loss": 0.2156,
      "step": 1275
    },
    {
      "epoch": 5.2,
      "grad_norm": 8.725810050964355,
      "learning_rate": 0.00032662487627845596,
      "loss": 0.1811,
      "step": 1300
    },
    {
      "epoch": 5.3,
      "grad_norm": 9.313215255737305,
      "learning_rate": 0.00032497525569119106,
      "loss": 0.2807,
      "step": 1325
    },
    {
      "epoch": 5.4,
      "grad_norm": 11.026411056518555,
      "learning_rate": 0.0003233256351039261,
      "loss": 0.2757,
      "step": 1350
    },
    {
      "epoch": 5.5,
      "grad_norm": 11.038985252380371,
      "learning_rate": 0.00032167601451666116,
      "loss": 0.2177,
      "step": 1375
    },
    {
      "epoch": 5.6,
      "grad_norm": 4.008651256561279,
      "learning_rate": 0.00032002639392939627,
      "loss": 0.2163,
      "step": 1400
    },
    {
      "epoch": 5.7,
      "grad_norm": 12.480770111083984,
      "learning_rate": 0.0003183767733421313,
      "loss": 0.2173,
      "step": 1425
    },
    {
      "epoch": 5.8,
      "grad_norm": 8.751969337463379,
      "learning_rate": 0.0003167271527548664,
      "loss": 0.2299,
      "step": 1450
    },
    {
      "epoch": 5.9,
      "grad_norm": 7.701971530914307,
      "learning_rate": 0.00031507753216760147,
      "loss": 0.1949,
      "step": 1475
    },
    {
      "epoch": 6.0,
      "grad_norm": 8.48027515411377,
      "learning_rate": 0.00031342791158033657,
      "loss": 0.2599,
      "step": 1500
    },
    {
      "epoch": 6.1,
      "grad_norm": 9.29404067993164,
      "learning_rate": 0.00031177829099307157,
      "loss": 0.1376,
      "step": 1525
    },
    {
      "epoch": 6.2,
      "grad_norm": 20.137714385986328,
      "learning_rate": 0.00031012867040580667,
      "loss": 0.1647,
      "step": 1550
    },
    {
      "epoch": 6.3,
      "grad_norm": 11.394575119018555,
      "learning_rate": 0.0003084790498185417,
      "loss": 0.1565,
      "step": 1575
    },
    {
      "epoch": 6.4,
      "grad_norm": 8.214287757873535,
      "learning_rate": 0.0003068294292312768,
      "loss": 0.1739,
      "step": 1600
    },
    {
      "epoch": 6.5,
      "grad_norm": 7.779988765716553,
      "learning_rate": 0.0003051798086440119,
      "loss": 0.1403,
      "step": 1625
    },
    {
      "epoch": 6.6,
      "grad_norm": 9.421648025512695,
      "learning_rate": 0.000303530188056747,
      "loss": 0.1545,
      "step": 1650
    },
    {
      "epoch": 6.7,
      "grad_norm": 5.751734256744385,
      "learning_rate": 0.000301880567469482,
      "loss": 0.1971,
      "step": 1675
    },
    {
      "epoch": 6.8,
      "grad_norm": 23.861705780029297,
      "learning_rate": 0.0003002309468822171,
      "loss": 0.1681,
      "step": 1700
    },
    {
      "epoch": 6.9,
      "grad_norm": 18.944721221923828,
      "learning_rate": 0.0002985813262949522,
      "loss": 0.1703,
      "step": 1725
    },
    {
      "epoch": 7.0,
      "grad_norm": 14.045795440673828,
      "learning_rate": 0.00029693170570768723,
      "loss": 0.1801,
      "step": 1750
    },
    {
      "epoch": 7.1,
      "grad_norm": 6.4620137214660645,
      "learning_rate": 0.00029528208512042233,
      "loss": 0.1253,
      "step": 1775
    },
    {
      "epoch": 7.2,
      "grad_norm": 4.318169593811035,
      "learning_rate": 0.0002936324645331574,
      "loss": 0.1397,
      "step": 1800
    },
    {
      "epoch": 7.3,
      "grad_norm": 24.91462516784668,
      "learning_rate": 0.00029198284394589243,
      "loss": 0.1259,
      "step": 1825
    },
    {
      "epoch": 7.4,
      "grad_norm": 23.614572525024414,
      "learning_rate": 0.00029033322335862753,
      "loss": 0.1293,
      "step": 1850
    },
    {
      "epoch": 7.5,
      "grad_norm": 3.60048508644104,
      "learning_rate": 0.0002886836027713626,
      "loss": 0.138,
      "step": 1875
    },
    {
      "epoch": 7.6,
      "grad_norm": 16.62705421447754,
      "learning_rate": 0.0002870339821840977,
      "loss": 0.1733,
      "step": 1900
    },
    {
      "epoch": 7.7,
      "grad_norm": 33.79671859741211,
      "learning_rate": 0.00028538436159683273,
      "loss": 0.217,
      "step": 1925
    },
    {
      "epoch": 7.8,
      "grad_norm": 9.69206428527832,
      "learning_rate": 0.00028373474100956784,
      "loss": 0.1289,
      "step": 1950
    },
    {
      "epoch": 7.9,
      "grad_norm": 18.655046463012695,
      "learning_rate": 0.0002820851204223029,
      "loss": 0.1168,
      "step": 1975
    },
    {
      "epoch": 8.0,
      "grad_norm": 8.4110746383667,
      "learning_rate": 0.00028043549983503794,
      "loss": 0.1471,
      "step": 2000
    },
    {
      "epoch": 8.1,
      "grad_norm": 14.473428726196289,
      "learning_rate": 0.00027878587924777304,
      "loss": 0.0774,
      "step": 2025
    },
    {
      "epoch": 8.2,
      "grad_norm": 1.2464979887008667,
      "learning_rate": 0.0002771362586605081,
      "loss": 0.0715,
      "step": 2050
    },
    {
      "epoch": 8.3,
      "grad_norm": 26.74981689453125,
      "learning_rate": 0.0002754866380732432,
      "loss": 0.1289,
      "step": 2075
    },
    {
      "epoch": 8.4,
      "grad_norm": 13.958703994750977,
      "learning_rate": 0.00027383701748597824,
      "loss": 0.1294,
      "step": 2100
    },
    {
      "epoch": 8.5,
      "grad_norm": 10.394835472106934,
      "learning_rate": 0.0002721873968987133,
      "loss": 0.0987,
      "step": 2125
    },
    {
      "epoch": 8.6,
      "grad_norm": 9.387774467468262,
      "learning_rate": 0.00027053777631144834,
      "loss": 0.1269,
      "step": 2150
    },
    {
      "epoch": 8.7,
      "grad_norm": 4.988718509674072,
      "learning_rate": 0.00026888815572418344,
      "loss": 0.1374,
      "step": 2175
    },
    {
      "epoch": 8.8,
      "grad_norm": 12.66832447052002,
      "learning_rate": 0.00026723853513691855,
      "loss": 0.0997,
      "step": 2200
    },
    {
      "epoch": 8.9,
      "grad_norm": 19.10486602783203,
      "learning_rate": 0.0002655889145496536,
      "loss": 0.1325,
      "step": 2225
    },
    {
      "epoch": 9.0,
      "grad_norm": 3.2943971157073975,
      "learning_rate": 0.0002639392939623887,
      "loss": 0.1385,
      "step": 2250
    },
    {
      "epoch": 9.1,
      "grad_norm": 11.490874290466309,
      "learning_rate": 0.0002622896733751237,
      "loss": 0.0573,
      "step": 2275
    },
    {
      "epoch": 9.2,
      "grad_norm": 11.961288452148438,
      "learning_rate": 0.0002606400527878588,
      "loss": 0.1128,
      "step": 2300
    },
    {
      "epoch": 9.3,
      "grad_norm": 0.7256277203559875,
      "learning_rate": 0.00025899043220059385,
      "loss": 0.0673,
      "step": 2325
    },
    {
      "epoch": 9.4,
      "grad_norm": 9.857927322387695,
      "learning_rate": 0.00025734081161332895,
      "loss": 0.2072,
      "step": 2350
    },
    {
      "epoch": 9.5,
      "grad_norm": 34.927734375,
      "learning_rate": 0.000255691191026064,
      "loss": 0.221,
      "step": 2375
    },
    {
      "epoch": 9.6,
      "grad_norm": 28.461849212646484,
      "learning_rate": 0.0002540415704387991,
      "loss": 0.1263,
      "step": 2400
    },
    {
      "epoch": 9.7,
      "grad_norm": 9.591217041015625,
      "learning_rate": 0.00025239194985153415,
      "loss": 0.099,
      "step": 2425
    },
    {
      "epoch": 9.8,
      "grad_norm": 9.780253410339355,
      "learning_rate": 0.0002507423292642692,
      "loss": 0.0754,
      "step": 2450
    },
    {
      "epoch": 9.9,
      "grad_norm": 14.394485473632812,
      "learning_rate": 0.0002490927086770043,
      "loss": 0.0909,
      "step": 2475
    },
    {
      "epoch": 10.0,
      "grad_norm": 4.217277526855469,
      "learning_rate": 0.00024744308808973936,
      "loss": 0.1495,
      "step": 2500
    },
    {
      "epoch": 10.1,
      "grad_norm": 0.08438724279403687,
      "learning_rate": 0.00024579346750247446,
      "loss": 0.0662,
      "step": 2525
    },
    {
      "epoch": 10.2,
      "grad_norm": 0.9595862030982971,
      "learning_rate": 0.0002441438469152095,
      "loss": 0.084,
      "step": 2550
    },
    {
      "epoch": 10.3,
      "grad_norm": 42.32374954223633,
      "learning_rate": 0.00024249422632794456,
      "loss": 0.0541,
      "step": 2575
    },
    {
      "epoch": 10.4,
      "grad_norm": 24.792821884155273,
      "learning_rate": 0.00024084460574067963,
      "loss": 0.121,
      "step": 2600
    },
    {
      "epoch": 10.5,
      "grad_norm": 2.7049331665039062,
      "learning_rate": 0.0002391949851534147,
      "loss": 0.1551,
      "step": 2625
    },
    {
      "epoch": 10.6,
      "grad_norm": 2.788362741470337,
      "learning_rate": 0.0002375453645661498,
      "loss": 0.0877,
      "step": 2650
    },
    {
      "epoch": 10.7,
      "grad_norm": 0.4999215006828308,
      "learning_rate": 0.00023589574397888486,
      "loss": 0.1263,
      "step": 2675
    },
    {
      "epoch": 10.8,
      "grad_norm": 16.476675033569336,
      "learning_rate": 0.00023424612339161997,
      "loss": 0.0883,
      "step": 2700
    },
    {
      "epoch": 10.9,
      "grad_norm": 38.192726135253906,
      "learning_rate": 0.000232596502804355,
      "loss": 0.0843,
      "step": 2725
    },
    {
      "epoch": 11.0,
      "grad_norm": 18.09952735900879,
      "learning_rate": 0.00023094688221709007,
      "loss": 0.0957,
      "step": 2750
    },
    {
      "epoch": 11.1,
      "grad_norm": 34.254417419433594,
      "learning_rate": 0.00022929726162982514,
      "loss": 0.0856,
      "step": 2775
    },
    {
      "epoch": 11.2,
      "grad_norm": 11.744216918945312,
      "learning_rate": 0.00022764764104256022,
      "loss": 0.0654,
      "step": 2800
    },
    {
      "epoch": 11.3,
      "grad_norm": 0.1285558044910431,
      "learning_rate": 0.0002259980204552953,
      "loss": 0.0638,
      "step": 2825
    },
    {
      "epoch": 11.4,
      "grad_norm": 20.57583999633789,
      "learning_rate": 0.00022434839986803037,
      "loss": 0.079,
      "step": 2850
    },
    {
      "epoch": 11.5,
      "grad_norm": 0.7192332148551941,
      "learning_rate": 0.00022269877928076542,
      "loss": 0.0854,
      "step": 2875
    },
    {
      "epoch": 11.6,
      "grad_norm": 31.341829299926758,
      "learning_rate": 0.0002210491586935005,
      "loss": 0.063,
      "step": 2900
    },
    {
      "epoch": 11.7,
      "grad_norm": 13.460247039794922,
      "learning_rate": 0.00021939953810623557,
      "loss": 0.1358,
      "step": 2925
    },
    {
      "epoch": 11.8,
      "grad_norm": 1.671036720275879,
      "learning_rate": 0.00021774991751897065,
      "loss": 0.0806,
      "step": 2950
    },
    {
      "epoch": 11.9,
      "grad_norm": 22.026491165161133,
      "learning_rate": 0.00021610029693170573,
      "loss": 0.0912,
      "step": 2975
    },
    {
      "epoch": 12.0,
      "grad_norm": 46.78192138671875,
      "learning_rate": 0.0002144506763444408,
      "loss": 0.0763,
      "step": 3000
    },
    {
      "epoch": 12.1,
      "grad_norm": 6.397649765014648,
      "learning_rate": 0.00021280105575717582,
      "loss": 0.0613,
      "step": 3025
    },
    {
      "epoch": 12.2,
      "grad_norm": 7.179838180541992,
      "learning_rate": 0.00021115143516991093,
      "loss": 0.0496,
      "step": 3050
    },
    {
      "epoch": 12.3,
      "grad_norm": 0.021825680509209633,
      "learning_rate": 0.000209501814582646,
      "loss": 0.0633,
      "step": 3075
    },
    {
      "epoch": 12.4,
      "grad_norm": 6.6774187088012695,
      "learning_rate": 0.00020785219399538108,
      "loss": 0.0765,
      "step": 3100
    },
    {
      "epoch": 12.5,
      "grad_norm": 0.13217756152153015,
      "learning_rate": 0.00020620257340811616,
      "loss": 0.0696,
      "step": 3125
    },
    {
      "epoch": 12.6,
      "grad_norm": 2.5293831825256348,
      "learning_rate": 0.00020455295282085123,
      "loss": 0.0419,
      "step": 3150
    },
    {
      "epoch": 12.7,
      "grad_norm": 1.1612874269485474,
      "learning_rate": 0.00020290333223358626,
      "loss": 0.1046,
      "step": 3175
    },
    {
      "epoch": 12.8,
      "grad_norm": 40.93427658081055,
      "learning_rate": 0.00020125371164632133,
      "loss": 0.0488,
      "step": 3200
    },
    {
      "epoch": 12.9,
      "grad_norm": 1.446478009223938,
      "learning_rate": 0.0001996040910590564,
      "loss": 0.062,
      "step": 3225
    },
    {
      "epoch": 13.0,
      "grad_norm": 28.893821716308594,
      "learning_rate": 0.00019795447047179148,
      "loss": 0.0913,
      "step": 3250
    },
    {
      "epoch": 13.1,
      "grad_norm": 23.31914520263672,
      "learning_rate": 0.0001963048498845266,
      "loss": 0.061,
      "step": 3275
    },
    {
      "epoch": 13.2,
      "grad_norm": 0.26242795586586,
      "learning_rate": 0.00019465522929726164,
      "loss": 0.0563,
      "step": 3300
    },
    {
      "epoch": 13.3,
      "grad_norm": 0.08909507840871811,
      "learning_rate": 0.00019300560870999671,
      "loss": 0.0387,
      "step": 3325
    },
    {
      "epoch": 13.4,
      "grad_norm": 24.825326919555664,
      "learning_rate": 0.0001913559881227318,
      "loss": 0.0776,
      "step": 3350
    },
    {
      "epoch": 13.5,
      "grad_norm": 39.43446731567383,
      "learning_rate": 0.00018970636753546684,
      "loss": 0.0846,
      "step": 3375
    },
    {
      "epoch": 13.6,
      "grad_norm": 32.98988723754883,
      "learning_rate": 0.00018805674694820192,
      "loss": 0.0512,
      "step": 3400
    },
    {
      "epoch": 13.7,
      "grad_norm": 4.053821563720703,
      "learning_rate": 0.000186407126360937,
      "loss": 0.1576,
      "step": 3425
    },
    {
      "epoch": 13.8,
      "grad_norm": 0.2785554528236389,
      "learning_rate": 0.00018475750577367207,
      "loss": 0.0728,
      "step": 3450
    },
    {
      "epoch": 13.9,
      "grad_norm": 11.39714527130127,
      "learning_rate": 0.00018310788518640715,
      "loss": 0.0776,
      "step": 3475
    },
    {
      "epoch": 14.0,
      "grad_norm": 3.7018072605133057,
      "learning_rate": 0.00018145826459914222,
      "loss": 0.0736,
      "step": 3500
    },
    {
      "epoch": 14.1,
      "grad_norm": 0.26053619384765625,
      "learning_rate": 0.00017980864401187727,
      "loss": 0.0598,
      "step": 3525
    },
    {
      "epoch": 14.2,
      "grad_norm": 6.454179763793945,
      "learning_rate": 0.00017815902342461235,
      "loss": 0.0435,
      "step": 3550
    },
    {
      "epoch": 14.3,
      "grad_norm": 34.47985076904297,
      "learning_rate": 0.00017650940283734742,
      "loss": 0.0467,
      "step": 3575
    },
    {
      "epoch": 14.4,
      "grad_norm": 11.76491928100586,
      "learning_rate": 0.00017485978225008247,
      "loss": 0.1298,
      "step": 3600
    },
    {
      "epoch": 14.5,
      "grad_norm": 26.64131736755371,
      "learning_rate": 0.00017321016166281755,
      "loss": 0.1313,
      "step": 3625
    },
    {
      "epoch": 14.6,
      "grad_norm": 5.510906219482422,
      "learning_rate": 0.00017156054107555263,
      "loss": 0.0421,
      "step": 3650
    },
    {
      "epoch": 14.7,
      "grad_norm": 0.04721131548285484,
      "learning_rate": 0.0001699109204882877,
      "loss": 0.0569,
      "step": 3675
    },
    {
      "epoch": 14.8,
      "grad_norm": 0.6616227030754089,
      "learning_rate": 0.00016826129990102278,
      "loss": 0.0618,
      "step": 3700
    },
    {
      "epoch": 14.9,
      "grad_norm": 0.05287986248731613,
      "learning_rate": 0.00016661167931375785,
      "loss": 0.0574,
      "step": 3725
    },
    {
      "epoch": 15.0,
      "grad_norm": 0.49707508087158203,
      "learning_rate": 0.0001649620587264929,
      "loss": 0.094,
      "step": 3750
    },
    {
      "epoch": 15.1,
      "grad_norm": 23.649728775024414,
      "learning_rate": 0.00016331243813922798,
      "loss": 0.0512,
      "step": 3775
    },
    {
      "epoch": 15.2,
      "grad_norm": 0.01772051490843296,
      "learning_rate": 0.00016166281755196306,
      "loss": 0.0719,
      "step": 3800
    },
    {
      "epoch": 15.3,
      "grad_norm": 1.5259039402008057,
      "learning_rate": 0.00016001319696469813,
      "loss": 0.0615,
      "step": 3825
    },
    {
      "epoch": 15.4,
      "grad_norm": 0.026406478136777878,
      "learning_rate": 0.0001583635763774332,
      "loss": 0.0651,
      "step": 3850
    },
    {
      "epoch": 15.5,
      "grad_norm": 27.12177085876465,
      "learning_rate": 0.00015671395579016829,
      "loss": 0.0458,
      "step": 3875
    },
    {
      "epoch": 15.6,
      "grad_norm": 9.838811874389648,
      "learning_rate": 0.00015506433520290334,
      "loss": 0.0662,
      "step": 3900
    },
    {
      "epoch": 15.7,
      "grad_norm": 0.08516795933246613,
      "learning_rate": 0.0001534147146156384,
      "loss": 0.0302,
      "step": 3925
    },
    {
      "epoch": 15.8,
      "grad_norm": 1.1224850416183472,
      "learning_rate": 0.0001517650940283735,
      "loss": 0.0637,
      "step": 3950
    },
    {
      "epoch": 15.9,
      "grad_norm": 0.14810702204704285,
      "learning_rate": 0.00015011547344110854,
      "loss": 0.0296,
      "step": 3975
    },
    {
      "epoch": 16.0,
      "grad_norm": 0.046288371086120605,
      "learning_rate": 0.00014846585285384361,
      "loss": 0.0368,
      "step": 4000
    },
    {
      "epoch": 16.1,
      "grad_norm": 0.008764918893575668,
      "learning_rate": 0.0001468162322665787,
      "loss": 0.0272,
      "step": 4025
    },
    {
      "epoch": 16.2,
      "grad_norm": 0.39770299196243286,
      "learning_rate": 0.00014516661167931377,
      "loss": 0.0344,
      "step": 4050
    },
    {
      "epoch": 16.3,
      "grad_norm": 23.780405044555664,
      "learning_rate": 0.00014351699109204884,
      "loss": 0.0617,
      "step": 4075
    },
    {
      "epoch": 16.4,
      "grad_norm": 0.023103665560483932,
      "learning_rate": 0.00014186737050478392,
      "loss": 0.0182,
      "step": 4100
    },
    {
      "epoch": 16.5,
      "grad_norm": 0.00998806394636631,
      "learning_rate": 0.00014021774991751897,
      "loss": 0.042,
      "step": 4125
    },
    {
      "epoch": 16.6,
      "grad_norm": 43.775169372558594,
      "learning_rate": 0.00013856812933025404,
      "loss": 0.0182,
      "step": 4150
    },
    {
      "epoch": 16.7,
      "grad_norm": 0.008157053031027317,
      "learning_rate": 0.00013691850874298912,
      "loss": 0.0377,
      "step": 4175
    },
    {
      "epoch": 16.8,
      "grad_norm": 0.011455570347607136,
      "learning_rate": 0.00013526888815572417,
      "loss": 0.0255,
      "step": 4200
    },
    {
      "epoch": 16.9,
      "grad_norm": 34.8545036315918,
      "learning_rate": 0.00013361926756845927,
      "loss": 0.0486,
      "step": 4225
    },
    {
      "epoch": 17.0,
      "grad_norm": 0.006066238507628441,
      "learning_rate": 0.00013196964698119435,
      "loss": 0.0821,
      "step": 4250
    },
    {
      "epoch": 17.1,
      "grad_norm": 43.46882247924805,
      "learning_rate": 0.0001303200263939294,
      "loss": 0.0333,
      "step": 4275
    },
    {
      "epoch": 17.2,
      "grad_norm": 2.3177573680877686,
      "learning_rate": 0.00012867040580666448,
      "loss": 0.0477,
      "step": 4300
    },
    {
      "epoch": 17.3,
      "grad_norm": 0.2264157235622406,
      "learning_rate": 0.00012702078521939955,
      "loss": 0.0373,
      "step": 4325
    },
    {
      "epoch": 17.4,
      "grad_norm": 68.17675018310547,
      "learning_rate": 0.0001253711646321346,
      "loss": 0.036,
      "step": 4350
    },
    {
      "epoch": 17.5,
      "grad_norm": 1.5359147787094116,
      "learning_rate": 0.00012372154404486968,
      "loss": 0.0437,
      "step": 4375
    },
    {
      "epoch": 17.6,
      "grad_norm": 0.008654219098389149,
      "learning_rate": 0.00012207192345760475,
      "loss": 0.0295,
      "step": 4400
    },
    {
      "epoch": 17.7,
      "grad_norm": 0.0074989828281104565,
      "learning_rate": 0.00012042230287033982,
      "loss": 0.0229,
      "step": 4425
    },
    {
      "epoch": 17.8,
      "grad_norm": 0.03568067401647568,
      "learning_rate": 0.0001187726822830749,
      "loss": 0.0636,
      "step": 4450
    },
    {
      "epoch": 17.9,
      "grad_norm": 0.0151940006762743,
      "learning_rate": 0.00011712306169580998,
      "loss": 0.0669,
      "step": 4475
    },
    {
      "epoch": 18.0,
      "grad_norm": 0.04830991476774216,
      "learning_rate": 0.00011547344110854503,
      "loss": 0.0388,
      "step": 4500
    },
    {
      "epoch": 18.1,
      "grad_norm": 1.1794861555099487,
      "learning_rate": 0.00011382382052128011,
      "loss": 0.0069,
      "step": 4525
    },
    {
      "epoch": 18.2,
      "grad_norm": 13.07977294921875,
      "learning_rate": 0.00011217419993401519,
      "loss": 0.0315,
      "step": 4550
    },
    {
      "epoch": 18.3,
      "grad_norm": 0.011115381494164467,
      "learning_rate": 0.00011052457934675025,
      "loss": 0.0144,
      "step": 4575
    },
    {
      "epoch": 18.4,
      "grad_norm": 25.42391014099121,
      "learning_rate": 0.00010887495875948532,
      "loss": 0.0198,
      "step": 4600
    },
    {
      "epoch": 18.5,
      "grad_norm": 44.36895751953125,
      "learning_rate": 0.0001072253381722204,
      "loss": 0.0297,
      "step": 4625
    },
    {
      "epoch": 18.6,
      "grad_norm": 6.828146457672119,
      "learning_rate": 0.00010557571758495546,
      "loss": 0.0348,
      "step": 4650
    },
    {
      "epoch": 18.7,
      "grad_norm": 38.9465446472168,
      "learning_rate": 0.00010392609699769054,
      "loss": 0.0336,
      "step": 4675
    },
    {
      "epoch": 18.8,
      "grad_norm": 0.015516690909862518,
      "learning_rate": 0.00010227647641042562,
      "loss": 0.032,
      "step": 4700
    },
    {
      "epoch": 18.9,
      "grad_norm": 5.419480323791504,
      "learning_rate": 0.00010062685582316067,
      "loss": 0.0169,
      "step": 4725
    },
    {
      "epoch": 19.0,
      "grad_norm": 0.7849720120429993,
      "learning_rate": 9.897723523589574e-05,
      "loss": 0.0321,
      "step": 4750
    },
    {
      "epoch": 19.1,
      "grad_norm": 25.438196182250977,
      "learning_rate": 9.732761464863082e-05,
      "loss": 0.046,
      "step": 4775
    },
    {
      "epoch": 19.2,
      "grad_norm": 0.010676453821361065,
      "learning_rate": 9.56779940613659e-05,
      "loss": 0.0111,
      "step": 4800
    },
    {
      "epoch": 19.3,
      "grad_norm": 0.011585243977606297,
      "learning_rate": 9.402837347410096e-05,
      "loss": 0.0154,
      "step": 4825
    },
    {
      "epoch": 19.4,
      "grad_norm": 0.012879762798547745,
      "learning_rate": 9.237875288683603e-05,
      "loss": 0.02,
      "step": 4850
    },
    {
      "epoch": 19.5,
      "grad_norm": 0.4159695506095886,
      "learning_rate": 9.072913229957111e-05,
      "loss": 0.0277,
      "step": 4875
    },
    {
      "epoch": 19.6,
      "grad_norm": 0.013872765935957432,
      "learning_rate": 8.907951171230617e-05,
      "loss": 0.0069,
      "step": 4900
    },
    {
      "epoch": 19.7,
      "grad_norm": 3.4260928630828857,
      "learning_rate": 8.742989112504124e-05,
      "loss": 0.0319,
      "step": 4925
    },
    {
      "epoch": 19.8,
      "grad_norm": 0.043105900287628174,
      "learning_rate": 8.578027053777631e-05,
      "loss": 0.0445,
      "step": 4950
    },
    {
      "epoch": 19.9,
      "grad_norm": 10.97398853302002,
      "learning_rate": 8.413064995051139e-05,
      "loss": 0.0185,
      "step": 4975
    },
    {
      "epoch": 20.0,
      "grad_norm": 12.879217147827148,
      "learning_rate": 8.248102936324645e-05,
      "loss": 0.0197,
      "step": 5000
    },
    {
      "epoch": 20.1,
      "grad_norm": 64.58011627197266,
      "learning_rate": 8.083140877598153e-05,
      "loss": 0.0228,
      "step": 5025
    },
    {
      "epoch": 20.2,
      "grad_norm": 43.28678512573242,
      "learning_rate": 7.91817881887166e-05,
      "loss": 0.0311,
      "step": 5050
    },
    {
      "epoch": 20.3,
      "grad_norm": 0.005558234639465809,
      "learning_rate": 7.753216760145167e-05,
      "loss": 0.0283,
      "step": 5075
    },
    {
      "epoch": 20.4,
      "grad_norm": 0.19123849272727966,
      "learning_rate": 7.588254701418674e-05,
      "loss": 0.0445,
      "step": 5100
    },
    {
      "epoch": 20.5,
      "grad_norm": 5.734357833862305,
      "learning_rate": 7.423292642692181e-05,
      "loss": 0.0145,
      "step": 5125
    },
    {
      "epoch": 20.6,
      "grad_norm": 0.005894747097045183,
      "learning_rate": 7.258330583965688e-05,
      "loss": 0.0233,
      "step": 5150
    },
    {
      "epoch": 20.7,
      "grad_norm": 0.018981292843818665,
      "learning_rate": 7.093368525239196e-05,
      "loss": 0.0273,
      "step": 5175
    },
    {
      "epoch": 20.8,
      "grad_norm": 0.09156472235918045,
      "learning_rate": 6.928406466512702e-05,
      "loss": 0.0233,
      "step": 5200
    },
    {
      "epoch": 20.9,
      "grad_norm": 0.8908875584602356,
      "learning_rate": 6.763444407786209e-05,
      "loss": 0.0221,
      "step": 5225
    },
    {
      "epoch": 21.0,
      "grad_norm": 6.745031833648682,
      "learning_rate": 6.598482349059718e-05,
      "loss": 0.0278,
      "step": 5250
    },
    {
      "epoch": 21.1,
      "grad_norm": 0.009193326346576214,
      "learning_rate": 6.433520290333224e-05,
      "loss": 0.0226,
      "step": 5275
    },
    {
      "epoch": 21.2,
      "grad_norm": 2.408499002456665,
      "learning_rate": 6.26855823160673e-05,
      "loss": 0.0202,
      "step": 5300
    },
    {
      "epoch": 21.3,
      "grad_norm": 0.006645245011895895,
      "learning_rate": 6.103596172880238e-05,
      "loss": 0.0234,
      "step": 5325
    },
    {
      "epoch": 21.4,
      "grad_norm": 0.006036018021404743,
      "learning_rate": 5.938634114153745e-05,
      "loss": 0.0333,
      "step": 5350
    },
    {
      "epoch": 21.5,
      "grad_norm": 0.25791531801223755,
      "learning_rate": 5.7736720554272516e-05,
      "loss": 0.012,
      "step": 5375
    },
    {
      "epoch": 21.6,
      "grad_norm": 0.9368523359298706,
      "learning_rate": 5.608709996700759e-05,
      "loss": 0.0078,
      "step": 5400
    },
    {
      "epoch": 21.7,
      "grad_norm": 0.0046651544980704784,
      "learning_rate": 5.443747937974266e-05,
      "loss": 0.0096,
      "step": 5425
    },
    {
      "epoch": 21.8,
      "grad_norm": 0.004647469613701105,
      "learning_rate": 5.278785879247773e-05,
      "loss": 0.0299,
      "step": 5450
    },
    {
      "epoch": 21.9,
      "grad_norm": 0.004732856526970863,
      "learning_rate": 5.113823820521281e-05,
      "loss": 0.0185,
      "step": 5475
    },
    {
      "epoch": 22.0,
      "grad_norm": 0.007009522989392281,
      "learning_rate": 4.948861761794787e-05,
      "loss": 0.0228,
      "step": 5500
    },
    {
      "epoch": 22.1,
      "grad_norm": 25.634002685546875,
      "learning_rate": 4.783899703068295e-05,
      "loss": 0.011,
      "step": 5525
    },
    {
      "epoch": 22.2,
      "grad_norm": 0.009100685827434063,
      "learning_rate": 4.618937644341802e-05,
      "loss": 0.0056,
      "step": 5550
    },
    {
      "epoch": 22.3,
      "grad_norm": 0.004046417307108641,
      "learning_rate": 4.453975585615309e-05,
      "loss": 0.0084,
      "step": 5575
    },
    {
      "epoch": 22.4,
      "grad_norm": 0.025503572076559067,
      "learning_rate": 4.2890135268888156e-05,
      "loss": 0.0066,
      "step": 5600
    },
    {
      "epoch": 22.5,
      "grad_norm": 0.005815221928060055,
      "learning_rate": 4.1240514681623226e-05,
      "loss": 0.0407,
      "step": 5625
    },
    {
      "epoch": 22.6,
      "grad_norm": 0.06112132593989372,
      "learning_rate": 3.95908940943583e-05,
      "loss": 0.0112,
      "step": 5650
    },
    {
      "epoch": 22.7,
      "grad_norm": 0.01705116033554077,
      "learning_rate": 3.794127350709337e-05,
      "loss": 0.0008,
      "step": 5675
    },
    {
      "epoch": 22.8,
      "grad_norm": 0.005267620086669922,
      "learning_rate": 3.629165291982844e-05,
      "loss": 0.0131,
      "step": 5700
    },
    {
      "epoch": 22.9,
      "grad_norm": 0.0046350546181201935,
      "learning_rate": 3.464203233256351e-05,
      "loss": 0.0205,
      "step": 5725
    },
    {
      "epoch": 23.0,
      "grad_norm": 0.0063110594637691975,
      "learning_rate": 3.299241174529859e-05,
      "loss": 0.0072,
      "step": 5750
    },
    {
      "epoch": 23.1,
      "grad_norm": 0.004843282513320446,
      "learning_rate": 3.134279115803365e-05,
      "loss": 0.033,
      "step": 5775
    },
    {
      "epoch": 23.2,
      "grad_norm": 0.004899237770587206,
      "learning_rate": 2.9693170570768723e-05,
      "loss": 0.002,
      "step": 5800
    },
    {
      "epoch": 23.3,
      "grad_norm": 0.0040695276111364365,
      "learning_rate": 2.8043549983503796e-05,
      "loss": 0.0185,
      "step": 5825
    },
    {
      "epoch": 23.4,
      "grad_norm": 46.083438873291016,
      "learning_rate": 2.6393929396238866e-05,
      "loss": 0.0149,
      "step": 5850
    },
    {
      "epoch": 23.5,
      "grad_norm": 0.009059540927410126,
      "learning_rate": 2.4744308808973936e-05,
      "loss": 0.0264,
      "step": 5875
    },
    {
      "epoch": 23.6,
      "grad_norm": 11.57484245300293,
      "learning_rate": 2.309468822170901e-05,
      "loss": 0.0145,
      "step": 5900
    },
    {
      "epoch": 23.7,
      "grad_norm": 0.00815950334072113,
      "learning_rate": 2.1445067634444078e-05,
      "loss": 0.015,
      "step": 5925
    },
    {
      "epoch": 23.8,
      "grad_norm": 0.005930441431701183,
      "learning_rate": 1.979544704717915e-05,
      "loss": 0.0187,
      "step": 5950
    },
    {
      "epoch": 23.9,
      "grad_norm": 0.006135927978903055,
      "learning_rate": 1.814582645991422e-05,
      "loss": 0.0143,
      "step": 5975
    },
    {
      "epoch": 24.0,
      "grad_norm": 0.004939633421599865,
      "learning_rate": 1.6496205872649294e-05,
      "loss": 0.0088,
      "step": 6000
    }
  ],
  "logging_steps": 25,
  "max_steps": 6250,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 25,
  "save_steps": 1000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 304916889600000.0,
  "train_batch_size": 20,
  "trial_name": null,
  "trial_params": null
}
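
Note: this is the standard Hugging Face TrainerState JSON that the Trainer writes into each checkpoint directory. log_history holds one entry per logging_steps (25) optimizer steps; from the values above, the learning rate warms up to a peak near 4e-4 within the first epoch and then decays linearly toward zero at max_steps (6250), and this checkpoint sits at epoch 24 of 25. A minimal sketch for inspecting the file, assuming a local copy named trainer_state.json (this script is illustrative and is not part of the checkpoint):

import json

# Load the Trainer checkpoint state and print a per-epoch training summary.
with open("trainer_state.json") as f:
    state = json.load(f)

print(f"step {state['global_step']}/{state['max_steps']}, epoch {state['epoch']}")
for entry in state["log_history"]:
    if float(entry["epoch"]).is_integer():  # one summary line per completed epoch
        print(f"epoch {entry['epoch']:>4.0f}: loss={entry['loss']:.4f}  "
              f"lr={entry['learning_rate']:.2e}  grad_norm={entry['grad_norm']:.2f}")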