{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 5.0,
  "eval_steps": 500,
  "global_step": 630,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.039860488290981565,
      "grad_norm": 11.324122796709977,
      "learning_rate": 2.53968253968254e-06,
      "loss": 1.6286,
      "step": 5
    },
    {
      "epoch": 0.07972097658196313,
      "grad_norm": 1.991457650057207,
      "learning_rate": 5.7142857142857145e-06,
      "loss": 1.4713,
      "step": 10
    },
    {
      "epoch": 0.11958146487294469,
      "grad_norm": 1.0610273535036991,
      "learning_rate": 8.888888888888888e-06,
      "loss": 1.3152,
      "step": 15
    },
    {
      "epoch": 0.15944195316392626,
      "grad_norm": 0.7999727467798167,
      "learning_rate": 1.2063492063492064e-05,
      "loss": 1.2185,
      "step": 20
    },
    {
      "epoch": 0.19930244145490783,
      "grad_norm": 0.6462858961182688,
      "learning_rate": 1.523809523809524e-05,
      "loss": 1.1579,
      "step": 25
    },
    {
      "epoch": 0.23916292974588937,
      "grad_norm": 0.47561233028025485,
      "learning_rate": 1.8412698412698415e-05,
      "loss": 1.1295,
      "step": 30
    },
    {
      "epoch": 0.279023418036871,
      "grad_norm": 0.4460008015131473,
      "learning_rate": 2.158730158730159e-05,
      "loss": 1.1069,
      "step": 35
    },
    {
      "epoch": 0.3188839063278525,
      "grad_norm": 0.35932407714480935,
      "learning_rate": 2.4761904761904766e-05,
      "loss": 1.0887,
      "step": 40
    },
    {
      "epoch": 0.35874439461883406,
      "grad_norm": 0.3717286127467533,
      "learning_rate": 2.7936507936507936e-05,
      "loss": 1.0869,
      "step": 45
    },
    {
      "epoch": 0.39860488290981566,
      "grad_norm": 0.38261468570309465,
      "learning_rate": 3.111111111111112e-05,
      "loss": 1.0774,
      "step": 50
    },
    {
      "epoch": 0.4384653712007972,
      "grad_norm": 0.383323190556911,
      "learning_rate": 3.4285714285714284e-05,
      "loss": 1.0636,
      "step": 55
    },
    {
      "epoch": 0.47832585949177875,
      "grad_norm": 0.4100491937186789,
      "learning_rate": 3.7460317460317464e-05,
      "loss": 1.0535,
      "step": 60
    },
    {
      "epoch": 0.5181863477827603,
      "grad_norm": 0.3975249651869064,
      "learning_rate": 3.9999693004141615e-05,
      "loss": 1.0297,
      "step": 65
    },
    {
      "epoch": 0.558046836073742,
      "grad_norm": 0.3819095926536229,
      "learning_rate": 3.998894913865352e-05,
      "loss": 1.0337,
      "step": 70
    },
    {
      "epoch": 0.5979073243647235,
      "grad_norm": 0.36490844056440797,
      "learning_rate": 3.9962864903591375e-05,
      "loss": 1.0063,
      "step": 75
    },
    {
      "epoch": 0.637767812655705,
      "grad_norm": 0.4076298707032981,
      "learning_rate": 3.992146031710637e-05,
      "loss": 1.0237,
      "step": 80
    },
    {
      "epoch": 0.6776283009466866,
      "grad_norm": 0.37566110161128824,
      "learning_rate": 3.9864767154838864e-05,
      "loss": 1.0145,
      "step": 85
    },
    {
      "epoch": 0.7174887892376681,
      "grad_norm": 0.34530643882424805,
      "learning_rate": 3.9792828925532376e-05,
      "loss": 1.0296,
      "step": 90
    },
    {
      "epoch": 0.7573492775286498,
      "grad_norm": 0.3432558970267317,
      "learning_rate": 3.970570083764316e-05,
      "loss": 1.0059,
      "step": 95
    },
    {
      "epoch": 0.7972097658196313,
      "grad_norm": 0.3408833181353496,
      "learning_rate": 3.9603449756970877e-05,
      "loss": 1.004,
      "step": 100
    },
    {
      "epoch": 0.8370702541106129,
      "grad_norm": 0.3589480700028462,
      "learning_rate": 3.948615415534294e-05,
      "loss": 0.9936,
      "step": 105
    },
    {
      "epoch": 0.8769307424015944,
      "grad_norm": 0.33854687709123976,
      "learning_rate": 3.9353904050391874e-05,
      "loss": 1.006,
      "step": 110
    },
    {
      "epoch": 0.916791230692576,
      "grad_norm": 0.3606947276733311,
      "learning_rate": 3.9206800936472e-05,
      "loss": 1.0033,
      "step": 115
    },
    {
      "epoch": 0.9566517189835575,
      "grad_norm": 0.3524202550527317,
      "learning_rate": 3.904495770676831e-05,
      "loss": 0.9917,
      "step": 120
    },
    {
      "epoch": 0.9965122072745392,
      "grad_norm": 0.3695008401673435,
      "learning_rate": 3.886849856665746e-05,
      "loss": 1.0137,
      "step": 125
    },
    {
      "epoch": 1.0318883906327851,
      "grad_norm": 0.5103901242908977,
      "learning_rate": 3.8677558938387276e-05,
      "loss": 0.885,
      "step": 130
    },
    {
      "epoch": 1.0717488789237668,
      "grad_norm": 0.39411785206718664,
      "learning_rate": 3.8472285357147966e-05,
      "loss": 0.8679,
      "step": 135
    },
    {
      "epoch": 1.1116093672147485,
      "grad_norm": 0.3898193416463423,
      "learning_rate": 3.825283535861476e-05,
      "loss": 0.8733,
      "step": 140
    },
    {
      "epoch": 1.15146985550573,
      "grad_norm": 0.7348802698342378,
      "learning_rate": 3.801937735804838e-05,
      "loss": 0.8434,
      "step": 145
    },
    {
      "epoch": 1.1913303437967115,
      "grad_norm": 0.36945102723915507,
      "learning_rate": 3.777209052104598e-05,
      "loss": 0.8461,
      "step": 150
    },
    {
      "epoch": 1.2311908320876932,
      "grad_norm": 0.4602742420015167,
      "learning_rate": 3.7511164626041823e-05,
      "loss": 0.8606,
      "step": 155
    },
    {
      "epoch": 1.2710513203786746,
      "grad_norm": 0.35621072409911864,
      "learning_rate": 3.7236799918663284e-05,
      "loss": 0.8555,
      "step": 160
    },
    {
      "epoch": 1.310911808669656,
      "grad_norm": 0.42801215480497146,
      "learning_rate": 3.6949206958053825e-05,
      "loss": 0.8437,
      "step": 165
    },
    {
      "epoch": 1.3507722969606377,
      "grad_norm": 0.36223537090099234,
      "learning_rate": 3.6648606455280944e-05,
      "loss": 0.8566,
      "step": 170
    },
    {
      "epoch": 1.3906327852516194,
      "grad_norm": 0.702392888184491,
      "learning_rate": 3.633522910395314e-05,
      "loss": 0.8665,
      "step": 175
    },
    {
      "epoch": 1.4304932735426008,
      "grad_norm": 0.36020937464475294,
      "learning_rate": 3.6009315403175786e-05,
      "loss": 0.8363,
      "step": 180
    },
    {
      "epoch": 1.4703537618335825,
      "grad_norm": 0.3500105740239205,
      "learning_rate": 3.567111547298194e-05,
      "loss": 0.853,
      "step": 185
    },
    {
      "epoch": 1.5102142501245641,
      "grad_norm": 0.4768600668120408,
      "learning_rate": 3.532088886237956e-05,
      "loss": 0.8496,
      "step": 190
    },
    {
      "epoch": 1.5500747384155455,
      "grad_norm": 0.36399360324025654,
      "learning_rate": 3.495890435016258e-05,
      "loss": 0.8636,
      "step": 195
    },
    {
      "epoch": 1.5899352267065272,
      "grad_norm": 0.3380619981423237,
      "learning_rate": 3.458543973863859e-05,
      "loss": 0.8538,
      "step": 200
    },
    {
      "epoch": 1.6297957149975089,
      "grad_norm": 0.3389261022528899,
      "learning_rate": 3.420078164043161e-05,
      "loss": 0.8591,
      "step": 205
    },
    {
      "epoch": 1.6696562032884903,
      "grad_norm": 0.395644963130336,
      "learning_rate": 3.38052252585233e-05,
      "loss": 0.8401,
      "step": 210
    },
    {
      "epoch": 1.7095166915794717,
      "grad_norm": 0.30378900619983906,
      "learning_rate": 3.339907415970168e-05,
      "loss": 0.8476,
      "step": 215
    },
    {
      "epoch": 1.7493771798704534,
      "grad_norm": 0.3900044033629726,
      "learning_rate": 3.298264004159104e-05,
      "loss": 0.8413,
      "step": 220
    },
    {
      "epoch": 1.789237668161435,
      "grad_norm": 0.36925152268695366,
      "learning_rate": 3.255624249344198e-05,
      "loss": 0.8534,
      "step": 225
    },
    {
      "epoch": 1.8290981564524165,
      "grad_norm": 0.3334498107167973,
      "learning_rate": 3.212020875086495e-05,
      "loss": 0.8734,
      "step": 230
    },
    {
      "epoch": 1.8689586447433981,
      "grad_norm": 0.3231836736531515,
      "learning_rate": 3.1674873444695804e-05,
      "loss": 0.8619,
      "step": 235
    },
    {
      "epoch": 1.9088191330343798,
      "grad_norm": 0.3352794024716405,
      "learning_rate": 3.122057834418582e-05,
      "loss": 0.8604,
      "step": 240
    },
    {
      "epoch": 1.9486796213253612,
      "grad_norm": 0.34760623558396486,
      "learning_rate": 3.075767209471345e-05,
      "loss": 0.8712,
      "step": 245
    },
    {
      "epoch": 1.9885401096163426,
      "grad_norm": 0.3494983937948561,
      "learning_rate": 3.0286509950219077e-05,
      "loss": 0.8449,
      "step": 250
    },
    {
      "epoch": 2.023916292974589,
      "grad_norm": 0.6438619381464517,
      "learning_rate": 2.9807453500567937e-05,
      "loss": 0.7382,
      "step": 255
    },
    {
      "epoch": 2.0637767812655703,
      "grad_norm": 0.48837947056765113,
      "learning_rate": 2.9320870394050783e-05,
      "loss": 0.6794,
      "step": 260
    },
    {
      "epoch": 2.103637269556552,
      "grad_norm": 0.4249085778688401,
      "learning_rate": 2.8827134055234883e-05,
      "loss": 0.6878,
      "step": 265
    },
    {
      "epoch": 2.1434977578475336,
      "grad_norm": 0.4112164304088742,
      "learning_rate": 2.8326623398382174e-05,
      "loss": 0.6895,
      "step": 270
    },
    {
      "epoch": 2.183358246138515,
      "grad_norm": 0.4393590314899133,
      "learning_rate": 2.781972253665431e-05,
      "loss": 0.6684,
      "step": 275
    },
    {
      "epoch": 2.223218734429497,
      "grad_norm": 0.37274870892195433,
      "learning_rate": 2.7306820487327906e-05,
      "loss": 0.6719,
      "step": 280
    },
    {
      "epoch": 2.2630792227204783,
      "grad_norm": 0.38062444497520725,
      "learning_rate": 2.6788310873246133e-05,
      "loss": 0.6735,
      "step": 285
    },
    {
      "epoch": 2.30293971101146,
      "grad_norm": 0.3754103984568679,
      "learning_rate": 2.62645916207358e-05,
      "loss": 0.6757,
      "step": 290
    },
    {
      "epoch": 2.3428001993024417,
      "grad_norm": 0.38094693535399177,
      "learning_rate": 2.5736064654221808e-05,
      "loss": 0.6544,
      "step": 295
    },
    {
      "epoch": 2.382660687593423,
      "grad_norm": 0.37267334400373975,
      "learning_rate": 2.5203135587773196e-05,
      "loss": 0.6612,
      "step": 300
    },
    {
      "epoch": 2.4225211758844045,
      "grad_norm": 0.3511334496003159,
      "learning_rate": 2.4666213413817696e-05,
      "loss": 0.6763,
      "step": 305
    },
    {
      "epoch": 2.4623816641753864,
      "grad_norm": 0.368576714561806,
      "learning_rate": 2.4125710189263555e-05,
      "loss": 0.6563,
      "step": 310
    },
    {
      "epoch": 2.502242152466368,
      "grad_norm": 0.35196724604768753,
      "learning_rate": 2.3582040719269504e-05,
      "loss": 0.65,
      "step": 315
    },
    {
      "epoch": 2.5421026407573493,
      "grad_norm": 0.3540473065272315,
      "learning_rate": 2.3035622238905694e-05,
      "loss": 0.6679,
      "step": 320
    },
    {
      "epoch": 2.5819631290483307,
      "grad_norm": 0.3661596143769676,
      "learning_rate": 2.2486874092949708e-05,
      "loss": 0.6738,
      "step": 325
    },
    {
      "epoch": 2.621823617339312,
      "grad_norm": 0.35913068756682465,
      "learning_rate": 2.1936217414063584e-05,
      "loss": 0.6887,
      "step": 330
    },
    {
      "epoch": 2.661684105630294,
      "grad_norm": 0.36799867292080646,
      "learning_rate": 2.138407479959869e-05,
      "loss": 0.6709,
      "step": 335
    },
    {
      "epoch": 2.7015445939212754,
      "grad_norm": 0.35691876781155074,
      "learning_rate": 2.0830869987276537e-05,
      "loss": 0.665,
      "step": 340
    },
    {
      "epoch": 2.741405082212257,
      "grad_norm": 0.38987523468576574,
      "learning_rate": 2.027702752999444e-05,
      "loss": 0.6528,
      "step": 345
    },
    {
      "epoch": 2.7812655705032387,
      "grad_norm": 0.38505381731754873,
      "learning_rate": 1.9722972470005573e-05,
      "loss": 0.6771,
      "step": 350
    },
    {
      "epoch": 2.82112605879422,
      "grad_norm": 0.35842950010166197,
      "learning_rate": 1.916913001272347e-05,
      "loss": 0.6638,
      "step": 355
    },
    {
      "epoch": 2.8609865470852016,
      "grad_norm": 0.37422580701088903,
      "learning_rate": 1.8615925200401318e-05,
      "loss": 0.6753,
      "step": 360
    },
    {
      "epoch": 2.9008470353761835,
      "grad_norm": 0.36860495212005495,
      "learning_rate": 1.806378258593642e-05,
      "loss": 0.6681,
      "step": 365
    },
    {
      "epoch": 2.940707523667165,
      "grad_norm": 0.3647903293380729,
      "learning_rate": 1.7513125907050302e-05,
      "loss": 0.6658,
      "step": 370
    },
    {
      "epoch": 2.9805680119581464,
      "grad_norm": 0.35870478733795147,
      "learning_rate": 1.6964377761094313e-05,
      "loss": 0.667,
      "step": 375
    },
    {
      "epoch": 3.0159441953163926,
      "grad_norm": 0.6451047782511159,
      "learning_rate": 1.6417959280730506e-05,
      "loss": 0.612,
      "step": 380
    },
    {
      "epoch": 3.055804683607374,
      "grad_norm": 0.7456786595663745,
      "learning_rate": 1.5874289810736452e-05,
      "loss": 0.5282,
      "step": 385
    },
    {
      "epoch": 3.095665171898356,
      "grad_norm": 0.4611479200453808,
      "learning_rate": 1.5333786586182308e-05,
      "loss": 0.4945,
      "step": 390
    },
    {
      "epoch": 3.1355256601893373,
      "grad_norm": 0.4624423710225065,
      "learning_rate": 1.4796864412226812e-05,
      "loss": 0.5178,
      "step": 395
    },
    {
      "epoch": 3.1753861484803187,
      "grad_norm": 0.3988674416095005,
      "learning_rate": 1.4263935345778202e-05,
      "loss": 0.5015,
      "step": 400
    },
    {
      "epoch": 3.2152466367713006,
      "grad_norm": 0.4171540538418279,
      "learning_rate": 1.37354083792642e-05,
      "loss": 0.4988,
      "step": 405
    },
    {
      "epoch": 3.255107125062282,
      "grad_norm": 0.4043531142190362,
      "learning_rate": 1.3211689126753879e-05,
      "loss": 0.4966,
      "step": 410
    },
    {
      "epoch": 3.2949676133532635,
      "grad_norm": 0.406846754086433,
      "learning_rate": 1.26931795126721e-05,
      "loss": 0.5081,
      "step": 415
    },
    {
      "epoch": 3.334828101644245,
      "grad_norm": 0.42193109699205417,
      "learning_rate": 1.2180277463345697e-05,
      "loss": 0.5088,
      "step": 420
    },
    {
      "epoch": 3.374688589935227,
      "grad_norm": 0.4039970787325296,
      "learning_rate": 1.167337660161783e-05,
      "loss": 0.5023,
      "step": 425
    },
    {
      "epoch": 3.4145490782262082,
      "grad_norm": 0.3891294704118612,
      "learning_rate": 1.1172865944765122e-05,
      "loss": 0.5054,
      "step": 430
    },
    {
      "epoch": 3.4544095665171897,
      "grad_norm": 0.4123628750841747,
      "learning_rate": 1.067912960594923e-05,
      "loss": 0.5078,
      "step": 435
    },
    {
      "epoch": 3.4942700548081715,
      "grad_norm": 0.4116103758152132,
      "learning_rate": 1.0192546499432066e-05,
      "loss": 0.5008,
      "step": 440
    },
    {
      "epoch": 3.534130543099153,
      "grad_norm": 0.39334581361661547,
      "learning_rate": 9.713490049780931e-06,
      "loss": 0.5117,
      "step": 445
    },
    {
      "epoch": 3.5739910313901344,
      "grad_norm": 0.3977163931160735,
      "learning_rate": 9.242327905286552e-06,
      "loss": 0.5004,
      "step": 450
    },
    {
      "epoch": 3.6138515196811163,
      "grad_norm": 0.41291346064789947,
      "learning_rate": 8.779421655814189e-06,
      "loss": 0.4899,
      "step": 455
    },
    {
      "epoch": 3.6537120079720977,
      "grad_norm": 0.39864981854696013,
      "learning_rate": 8.325126555304208e-06,
      "loss": 0.4925,
      "step": 460
    },
    {
      "epoch": 3.693572496263079,
      "grad_norm": 0.3783439845572638,
      "learning_rate": 7.879791249135059e-06,
      "loss": 0.4936,
      "step": 465
    },
    {
      "epoch": 3.733432984554061,
      "grad_norm": 0.36028247570266225,
      "learning_rate": 7.443757506558033e-06,
      "loss": 0.5071,
      "step": 470
    },
    {
      "epoch": 3.7732934728450425,
      "grad_norm": 0.36974308586061955,
      "learning_rate": 7.0173599584089625e-06,
      "loss": 0.4902,
      "step": 475
    },
    {
      "epoch": 3.813153961136024,
      "grad_norm": 0.3709767672807715,
      "learning_rate": 6.600925840298331e-06,
      "loss": 0.505,
      "step": 480
    },
    {
      "epoch": 3.8530144494270053,
      "grad_norm": 0.3610583139391416,
      "learning_rate": 6.1947747414767035e-06,
      "loss": 0.5208,
      "step": 485
    },
    {
      "epoch": 3.892874937717987,
      "grad_norm": 0.3791524633277678,
      "learning_rate": 5.799218359568395e-06,
      "loss": 0.5301,
      "step": 490
    },
    {
      "epoch": 3.9327354260089686,
      "grad_norm": 0.363724718810471,
      "learning_rate": 5.414560261361415e-06,
      "loss": 0.4855,
      "step": 495
    },
    {
      "epoch": 3.97259591429995,
      "grad_norm": 0.3867971355298049,
      "learning_rate": 5.041095649837429e-06,
      "loss": 0.4872,
      "step": 500
    },
    {
      "epoch": 4.007972097658197,
      "grad_norm": 0.6922635714147061,
      "learning_rate": 4.679111137620442e-06,
      "loss": 0.4901,
      "step": 505
    },
    {
      "epoch": 4.047832585949178,
      "grad_norm": 0.5464640833763156,
      "learning_rate": 4.328884527018067e-06,
      "loss": 0.4086,
      "step": 510
    },
    {
      "epoch": 4.08769307424016,
      "grad_norm": 0.5514248166968356,
      "learning_rate": 3.990684596824219e-06,
      "loss": 0.4042,
      "step": 515
    },
    {
      "epoch": 4.127553562531141,
      "grad_norm": 0.46194805338366773,
      "learning_rate": 3.6647708960468696e-06,
      "loss": 0.4028,
      "step": 520
    },
    {
      "epoch": 4.1674140508221225,
      "grad_norm": 0.45065104707045,
      "learning_rate": 3.3513935447190595e-06,
      "loss": 0.3937,
      "step": 525
    },
    {
      "epoch": 4.207274539113104,
      "grad_norm": 0.3884400001553067,
      "learning_rate": 3.050793041946183e-06,
      "loss": 0.3941,
      "step": 530
    },
    {
      "epoch": 4.247135027404085,
      "grad_norm": 0.4057662200999109,
      "learning_rate": 2.763200081336721e-06,
      "loss": 0.3913,
      "step": 535
    },
    {
      "epoch": 4.286995515695067,
      "grad_norm": 0.3927959847718352,
      "learning_rate": 2.488835373958185e-06,
      "loss": 0.3993,
      "step": 540
    },
    {
      "epoch": 4.326856003986049,
      "grad_norm": 0.3915532354537384,
      "learning_rate": 2.2279094789540244e-06,
      "loss": 0.4047,
      "step": 545
    },
    {
      "epoch": 4.36671649227703,
      "grad_norm": 0.37576385131138085,
      "learning_rate": 1.9806226419516195e-06,
      "loss": 0.402,
      "step": 550
    },
    {
      "epoch": 4.406576980568012,
      "grad_norm": 0.3589030046466532,
      "learning_rate": 1.7471646413852439e-06,
      "loss": 0.4037,
      "step": 555
    },
    {
      "epoch": 4.446437468858994,
      "grad_norm": 0.38802398319508147,
      "learning_rate": 1.527714642852045e-06,
      "loss": 0.3975,
      "step": 560
    },
    {
      "epoch": 4.486297957149975,
      "grad_norm": 0.3781055143033295,
      "learning_rate": 1.3224410616127292e-06,
      "loss": 0.3902,
      "step": 565
    },
    {
      "epoch": 4.526158445440957,
      "grad_norm": 0.380556572113873,
      "learning_rate": 1.1315014333425455e-06,
      "loss": 0.3955,
      "step": 570
    },
    {
      "epoch": 4.566018933731939,
      "grad_norm": 0.37627984573745005,
      "learning_rate": 9.550422932316938e-07,
      "loss": 0.3925,
      "step": 575
    },
    {
      "epoch": 4.60587942202292,
      "grad_norm": 0.36605761395168335,
      "learning_rate": 7.931990635280052e-07,
      "loss": 0.3929,
      "step": 580
    },
    {
      "epoch": 4.645739910313901,
      "grad_norm": 0.37403610200301257,
      "learning_rate": 6.460959496081276e-07,
      "loss": 0.4068,
      "step": 585
    },
    {
      "epoch": 4.685600398604883,
      "grad_norm": 0.3673645131484265,
      "learning_rate": 5.13845844657066e-07,
      "loss": 0.3962,
      "step": 590
    },
    {
      "epoch": 4.725460886895864,
      "grad_norm": 0.39800543909133185,
      "learning_rate": 3.965502430291235e-07,
      "loss": 0.3912,
      "step": 595
    },
    {
      "epoch": 4.765321375186846,
      "grad_norm": 0.37969719371341065,
      "learning_rate": 2.942991623568436e-07,
      "loss": 0.3801,
      "step": 600
    },
    {
      "epoch": 4.805181863477827,
      "grad_norm": 0.3777219824944034,
      "learning_rate": 2.0717107446762696e-07,
      "loss": 0.369,
      "step": 605
    },
    {
      "epoch": 4.845042351768809,
      "grad_norm": 0.3913050535737085,
      "learning_rate": 1.3523284516113955e-07,
      "loss": 0.3982,
      "step": 610
    },
    {
      "epoch": 4.884902840059791,
      "grad_norm": 0.3712369916416077,
      "learning_rate": 7.853968289363245e-08,
      "loss": 0.3916,
      "step": 615
    },
    {
      "epoch": 4.924763328350773,
      "grad_norm": 0.38101325204255376,
      "learning_rate": 3.7135096408631443e-08,
      "loss": 0.3978,
      "step": 620
    },
    {
      "epoch": 4.964623816641754,
      "grad_norm": 0.37989319704275204,
      "learning_rate": 1.1050861346488806e-08,
      "loss": 0.389,
      "step": 625
    },
    {
      "epoch": 5.0,
      "grad_norm": 0.5514474703259893,
      "learning_rate": 3.069958583856725e-10,
      "loss": 0.4095,
      "step": 630
    },
    {
      "epoch": 5.0,
      "step": 630,
      "total_flos": 1235947560108032.0,
      "train_loss": 0.7053156269921197,
      "train_runtime": 61423.1886,
      "train_samples_per_second": 1.307,
      "train_steps_per_second": 0.01
    }
  ],
  "logging_steps": 5,
  "max_steps": 630,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": false,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 1235947560108032.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}