|
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.003832078327681018,
  "eval_steps": 9,
  "global_step": 100,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 3.832078327681018e-05,
      "grad_norm": 3.426521062850952,
      "learning_rate": 1e-05,
      "loss": 10.2803,
      "step": 1
    },
    {
      "epoch": 3.832078327681018e-05,
      "eval_loss": 10.226699829101562,
      "eval_runtime": 198.1021,
      "eval_samples_per_second": 110.933,
      "eval_steps_per_second": 13.867,
      "step": 1
    },
    {
      "epoch": 7.664156655362036e-05,
      "grad_norm": 3.3724396228790283,
      "learning_rate": 2e-05,
      "loss": 10.1469,
      "step": 2
    },
    {
      "epoch": 0.00011496234983043054,
      "grad_norm": 3.1231155395507812,
      "learning_rate": 3e-05,
      "loss": 10.3281,
      "step": 3
    },
    {
      "epoch": 0.0001532831331072407,
      "grad_norm": 3.4462990760803223,
      "learning_rate": 4e-05,
      "loss": 10.0777,
      "step": 4
    },
    {
      "epoch": 0.00019160391638405088,
      "grad_norm": 3.0611374378204346,
      "learning_rate": 5e-05,
      "loss": 10.1656,
      "step": 5
    },
    {
      "epoch": 0.00022992469966086107,
      "grad_norm": 3.089136838912964,
      "learning_rate": 6e-05,
      "loss": 10.3253,
      "step": 6
    },
    {
      "epoch": 0.00026824548293767126,
      "grad_norm": 3.1562752723693848,
      "learning_rate": 7e-05,
      "loss": 10.1898,
      "step": 7
    },
    {
      "epoch": 0.0003065662662144814,
      "grad_norm": 3.3566901683807373,
      "learning_rate": 8e-05,
      "loss": 10.2334,
      "step": 8
    },
    {
      "epoch": 0.0003448870494912916,
      "grad_norm": 3.380046844482422,
      "learning_rate": 9e-05,
      "loss": 10.1106,
      "step": 9
    },
    {
      "epoch": 0.0003448870494912916,
      "eval_loss": 10.10090446472168,
      "eval_runtime": 198.2556,
      "eval_samples_per_second": 110.847,
      "eval_steps_per_second": 13.856,
      "step": 9
    },
    {
      "epoch": 0.00038320783276810176,
      "grad_norm": 3.2303075790405273,
      "learning_rate": 0.0001,
      "loss": 10.0097,
      "step": 10
    },
    {
      "epoch": 0.000421528616044912,
      "grad_norm": 2.8799538612365723,
      "learning_rate": 9.99695413509548e-05,
      "loss": 9.9966,
      "step": 11
    },
    {
      "epoch": 0.00045984939932172214,
      "grad_norm": 3.717454433441162,
      "learning_rate": 9.987820251299122e-05,
      "loss": 10.1469,
      "step": 12
    },
    {
      "epoch": 0.0004981701825985324,
      "grad_norm": 3.4059760570526123,
      "learning_rate": 9.972609476841367e-05,
      "loss": 10.1607,
      "step": 13
    },
    {
      "epoch": 0.0005364909658753425,
      "grad_norm": 3.6317834854125977,
      "learning_rate": 9.951340343707852e-05,
      "loss": 10.0034,
      "step": 14
    },
    {
      "epoch": 0.0005748117491521527,
      "grad_norm": 3.2543015480041504,
      "learning_rate": 9.924038765061042e-05,
      "loss": 9.8262,
      "step": 15
    },
    {
      "epoch": 0.0006131325324289629,
      "grad_norm": 3.1305909156799316,
      "learning_rate": 9.890738003669029e-05,
      "loss": 10.0142,
      "step": 16
    },
    {
      "epoch": 0.000651453315705773,
      "grad_norm": 2.911153554916382,
      "learning_rate": 9.851478631379982e-05,
      "loss": 9.6862,
      "step": 17
    },
    {
      "epoch": 0.0006897740989825832,
      "grad_norm": 2.923849105834961,
      "learning_rate": 9.806308479691595e-05,
      "loss": 9.6763,
      "step": 18
    },
    {
      "epoch": 0.0006897740989825832,
      "eval_loss": 9.730630874633789,
      "eval_runtime": 197.3292,
      "eval_samples_per_second": 111.367,
      "eval_steps_per_second": 13.921,
      "step": 18
    },
    {
      "epoch": 0.0007280948822593933,
      "grad_norm": 3.134058713912964,
      "learning_rate": 9.755282581475769e-05,
      "loss": 9.7739,
      "step": 19
    },
    {
      "epoch": 0.0007664156655362035,
      "grad_norm": 3.1884331703186035,
      "learning_rate": 9.698463103929542e-05,
      "loss": 9.5725,
      "step": 20
    },
    {
      "epoch": 0.0008047364488130138,
      "grad_norm": 3.215076446533203,
      "learning_rate": 9.635919272833938e-05,
      "loss": 9.8354,
      "step": 21
    },
    {
      "epoch": 0.000843057232089824,
      "grad_norm": 3.2122676372528076,
      "learning_rate": 9.567727288213005e-05,
      "loss": 9.6639,
      "step": 22
    },
    {
      "epoch": 0.0008813780153666341,
      "grad_norm": 3.1786868572235107,
      "learning_rate": 9.493970231495835e-05,
      "loss": 9.5412,
      "step": 23
    },
    {
      "epoch": 0.0009196987986434443,
      "grad_norm": 3.19095516204834,
      "learning_rate": 9.414737964294636e-05,
      "loss": 9.5994,
      "step": 24
    },
    {
      "epoch": 0.0009580195819202544,
      "grad_norm": 3.1054611206054688,
      "learning_rate": 9.330127018922194e-05,
      "loss": 9.5802,
      "step": 25
    },
    {
      "epoch": 0.0009963403651970647,
      "grad_norm": 3.075852155685425,
      "learning_rate": 9.24024048078213e-05,
      "loss": 9.5869,
      "step": 26
    },
    {
      "epoch": 0.0010346611484738748,
      "grad_norm": 3.107454776763916,
      "learning_rate": 9.145187862775209e-05,
      "loss": 9.2758,
      "step": 27
    },
    {
      "epoch": 0.0010346611484738748,
      "eval_loss": 9.356222152709961,
      "eval_runtime": 196.8567,
      "eval_samples_per_second": 111.635,
      "eval_steps_per_second": 13.954,
      "step": 27
    },
    {
      "epoch": 0.001072981931750685,
      "grad_norm": 3.239929437637329,
      "learning_rate": 9.045084971874738e-05,
      "loss": 9.3093,
      "step": 28
    },
    {
      "epoch": 0.001111302715027495,
      "grad_norm": 3.1697399616241455,
      "learning_rate": 8.940053768033609e-05,
      "loss": 9.4413,
      "step": 29
    },
    {
      "epoch": 0.0011496234983043054,
      "grad_norm": 2.983443260192871,
      "learning_rate": 8.83022221559489e-05,
      "loss": 9.2819,
      "step": 30
    },
    {
      "epoch": 0.0011879442815811154,
      "grad_norm": 2.9809553623199463,
      "learning_rate": 8.715724127386972e-05,
      "loss": 9.2256,
      "step": 31
    },
    {
      "epoch": 0.0012262650648579257,
      "grad_norm": 2.92080020904541,
      "learning_rate": 8.596699001693255e-05,
      "loss": 9.0488,
      "step": 32
    },
    {
      "epoch": 0.001264585848134736,
      "grad_norm": 2.7908945083618164,
      "learning_rate": 8.473291852294987e-05,
      "loss": 9.1336,
      "step": 33
    },
    {
      "epoch": 0.001302906631411546,
      "grad_norm": 2.688901662826538,
      "learning_rate": 8.345653031794292e-05,
      "loss": 9.0057,
      "step": 34
    },
    {
      "epoch": 0.0013412274146883563,
      "grad_norm": 2.6229913234710693,
      "learning_rate": 8.213938048432697e-05,
      "loss": 8.8516,
      "step": 35
    },
    {
      "epoch": 0.0013795481979651664,
      "grad_norm": 2.903886556625366,
      "learning_rate": 8.07830737662829e-05,
      "loss": 9.1772,
      "step": 36
    },
    {
      "epoch": 0.0013795481979651664,
      "eval_loss": 9.0332670211792,
      "eval_runtime": 200.6695,
      "eval_samples_per_second": 109.513,
      "eval_steps_per_second": 13.689,
      "step": 36
    },
    {
      "epoch": 0.0014178689812419766,
      "grad_norm": 2.304446220397949,
      "learning_rate": 7.938926261462366e-05,
      "loss": 9.0422,
      "step": 37
    },
    {
      "epoch": 0.0014561897645187867,
      "grad_norm": 2.6148622035980225,
      "learning_rate": 7.795964517353735e-05,
      "loss": 8.9424,
      "step": 38
    },
    {
      "epoch": 0.001494510547795597,
      "grad_norm": 2.5895235538482666,
      "learning_rate": 7.649596321166024e-05,
      "loss": 8.9902,
      "step": 39
    },
    {
      "epoch": 0.001532831331072407,
      "grad_norm": 2.4067177772521973,
      "learning_rate": 7.500000000000001e-05,
      "loss": 8.9559,
      "step": 40
    },
    {
      "epoch": 0.0015711521143492173,
      "grad_norm": 2.6639695167541504,
      "learning_rate": 7.347357813929454e-05,
      "loss": 8.8575,
      "step": 41
    },
    {
      "epoch": 0.0016094728976260276,
      "grad_norm": 2.2785677909851074,
      "learning_rate": 7.191855733945387e-05,
      "loss": 9.0544,
      "step": 42
    },
    {
      "epoch": 0.0016477936809028376,
      "grad_norm": 2.155168294906616,
      "learning_rate": 7.033683215379002e-05,
      "loss": 8.7763,
      "step": 43
    },
    {
      "epoch": 0.001686114464179648,
      "grad_norm": 2.451356887817383,
      "learning_rate": 6.873032967079561e-05,
      "loss": 8.9635,
      "step": 44
    },
    {
      "epoch": 0.001724435247456458,
      "grad_norm": 2.1244118213653564,
      "learning_rate": 6.710100716628344e-05,
      "loss": 8.7786,
      "step": 45
    },
    {
      "epoch": 0.001724435247456458,
      "eval_loss": 8.795065879821777,
      "eval_runtime": 198.6273,
      "eval_samples_per_second": 110.639,
      "eval_steps_per_second": 13.83,
      "step": 45
    },
    {
      "epoch": 0.0017627560307332682,
      "grad_norm": 2.1409757137298584,
      "learning_rate": 6.545084971874738e-05,
      "loss": 8.843,
      "step": 46
    },
    {
      "epoch": 0.0018010768140100783,
      "grad_norm": 2.1220507621765137,
      "learning_rate": 6.378186779084995e-05,
      "loss": 8.8037,
      "step": 47
    },
    {
      "epoch": 0.0018393975972868886,
      "grad_norm": 1.9951223134994507,
      "learning_rate": 6.209609477998338e-05,
      "loss": 8.7091,
      "step": 48
    },
    {
      "epoch": 0.0018777183805636986,
      "grad_norm": 2.0571706295013428,
      "learning_rate": 6.0395584540887963e-05,
      "loss": 8.624,
      "step": 49
    },
    {
      "epoch": 0.001916039163840509,
      "grad_norm": 2.1683812141418457,
      "learning_rate": 5.868240888334653e-05,
      "loss": 8.7798,
      "step": 50
    },
    {
      "epoch": 0.001954359947117319,
      "grad_norm": 1.9748177528381348,
      "learning_rate": 5.695865504800327e-05,
      "loss": 8.669,
      "step": 51
    },
    {
      "epoch": 0.0019926807303941294,
      "grad_norm": 2.2210891246795654,
      "learning_rate": 5.522642316338268e-05,
      "loss": 8.7657,
      "step": 52
    },
    {
      "epoch": 0.0020310015136709393,
      "grad_norm": 1.8661513328552246,
      "learning_rate": 5.348782368720626e-05,
      "loss": 8.6206,
      "step": 53
    },
    {
      "epoch": 0.0020693222969477496,
      "grad_norm": 1.8140807151794434,
      "learning_rate": 5.174497483512506e-05,
      "loss": 8.7015,
      "step": 54
    },
    {
      "epoch": 0.0020693222969477496,
      "eval_loss": 8.63232421875,
      "eval_runtime": 197.1296,
      "eval_samples_per_second": 111.48,
      "eval_steps_per_second": 13.935,
      "step": 54
    },
    {
      "epoch": 0.00210764308022456,
      "grad_norm": 1.8564491271972656,
      "learning_rate": 5e-05,
      "loss": 8.6189,
      "step": 55
    },
    {
      "epoch": 0.00214596386350137,
      "grad_norm": 1.8724466562271118,
      "learning_rate": 4.825502516487497e-05,
      "loss": 8.48,
      "step": 56
    },
    {
      "epoch": 0.00218428464677818,
      "grad_norm": 1.7884173393249512,
      "learning_rate": 4.6512176312793736e-05,
      "loss": 8.7433,
      "step": 57
    },
    {
      "epoch": 0.00222260543005499,
      "grad_norm": 1.7040969133377075,
      "learning_rate": 4.477357683661734e-05,
      "loss": 8.6235,
      "step": 58
    },
    {
      "epoch": 0.0022609262133318005,
      "grad_norm": 1.9406371116638184,
      "learning_rate": 4.3041344951996746e-05,
      "loss": 8.6172,
      "step": 59
    },
    {
      "epoch": 0.0022992469966086108,
      "grad_norm": 1.7244473695755005,
      "learning_rate": 4.131759111665349e-05,
      "loss": 8.6254,
      "step": 60
    },
    {
      "epoch": 0.002337567779885421,
      "grad_norm": 1.6151647567749023,
      "learning_rate": 3.960441545911204e-05,
      "loss": 8.5338,
      "step": 61
    },
    {
      "epoch": 0.002375888563162231,
      "grad_norm": 1.602726697921753,
      "learning_rate": 3.790390522001662e-05,
      "loss": 8.6495,
      "step": 62
    },
    {
      "epoch": 0.002414209346439041,
      "grad_norm": 1.556414246559143,
      "learning_rate": 3.6218132209150045e-05,
      "loss": 8.66,
      "step": 63
    },
    {
      "epoch": 0.002414209346439041,
      "eval_loss": 8.53507137298584,
      "eval_runtime": 197.1109,
      "eval_samples_per_second": 111.491,
      "eval_steps_per_second": 13.936,
      "step": 63
    },
    {
      "epoch": 0.0024525301297158514,
      "grad_norm": 1.8471159934997559,
      "learning_rate": 3.4549150281252636e-05,
      "loss": 8.6437,
      "step": 64
    },
    {
      "epoch": 0.0024908509129926617,
      "grad_norm": 1.5061924457550049,
      "learning_rate": 3.289899283371657e-05,
      "loss": 8.4696,
      "step": 65
    },
    {
      "epoch": 0.002529171696269472,
      "grad_norm": 1.6884492635726929,
      "learning_rate": 3.12696703292044e-05,
      "loss": 8.5363,
      "step": 66
    },
    {
      "epoch": 0.002567492479546282,
      "grad_norm": 1.7472187280654907,
      "learning_rate": 2.9663167846209998e-05,
      "loss": 8.3917,
      "step": 67
    },
    {
      "epoch": 0.002605813262823092,
      "grad_norm": 1.601895809173584,
      "learning_rate": 2.8081442660546125e-05,
      "loss": 8.4395,
      "step": 68
    },
    {
      "epoch": 0.0026441340460999023,
      "grad_norm": 1.3957659006118774,
      "learning_rate": 2.6526421860705473e-05,
      "loss": 8.6504,
      "step": 69
    },
    {
      "epoch": 0.0026824548293767126,
      "grad_norm": 1.4126591682434082,
      "learning_rate": 2.500000000000001e-05,
      "loss": 8.4654,
      "step": 70
    },
    {
      "epoch": 0.0027207756126535225,
      "grad_norm": 1.6027814149856567,
      "learning_rate": 2.350403678833976e-05,
      "loss": 8.4307,
      "step": 71
    },
    {
      "epoch": 0.0027590963959303327,
      "grad_norm": 1.5236140489578247,
      "learning_rate": 2.2040354826462668e-05,
      "loss": 8.479,
      "step": 72
    },
    {
      "epoch": 0.0027590963959303327,
      "eval_loss": 8.478059768676758,
      "eval_runtime": 197.0774,
      "eval_samples_per_second": 111.509,
      "eval_steps_per_second": 13.939,
      "step": 72
    },
    {
      "epoch": 0.002797417179207143,
      "grad_norm": 1.574795126914978,
      "learning_rate": 2.061073738537635e-05,
      "loss": 8.3117,
      "step": 73
    },
    {
      "epoch": 0.0028357379624839533,
      "grad_norm": 1.6588678359985352,
      "learning_rate": 1.9216926233717085e-05,
      "loss": 8.3155,
      "step": 74
    },
    {
      "epoch": 0.0028740587457607636,
      "grad_norm": 1.4572874307632446,
      "learning_rate": 1.7860619515673033e-05,
      "loss": 8.4948,
      "step": 75
    },
    {
      "epoch": 0.0029123795290375734,
      "grad_norm": 1.3906676769256592,
      "learning_rate": 1.6543469682057106e-05,
      "loss": 8.5319,
      "step": 76
    },
    {
      "epoch": 0.0029507003123143837,
      "grad_norm": 1.549890160560608,
      "learning_rate": 1.526708147705013e-05,
      "loss": 8.3946,
      "step": 77
    },
    {
      "epoch": 0.002989021095591194,
      "grad_norm": 1.4431867599487305,
      "learning_rate": 1.4033009983067452e-05,
      "loss": 8.3487,
      "step": 78
    },
    {
      "epoch": 0.003027341878868004,
      "grad_norm": 1.5942152738571167,
      "learning_rate": 1.2842758726130283e-05,
      "loss": 8.3657,
      "step": 79
    },
    {
      "epoch": 0.003065662662144814,
      "grad_norm": 1.3526906967163086,
      "learning_rate": 1.1697777844051105e-05,
      "loss": 8.5344,
      "step": 80
    },
    {
      "epoch": 0.0031039834454216243,
      "grad_norm": 1.568245530128479,
      "learning_rate": 1.0599462319663905e-05,
      "loss": 8.4987,
      "step": 81
    },
    {
      "epoch": 0.0031039834454216243,
      "eval_loss": 8.4481782913208,
      "eval_runtime": 197.166,
      "eval_samples_per_second": 111.459,
      "eval_steps_per_second": 13.932,
      "step": 81
    },
    {
      "epoch": 0.0031423042286984346,
      "grad_norm": 1.4625965356826782,
      "learning_rate": 9.549150281252633e-06,
      "loss": 8.3553,
      "step": 82
    },
    {
      "epoch": 0.003180625011975245,
      "grad_norm": 1.6190887689590454,
      "learning_rate": 8.548121372247918e-06,
      "loss": 8.4586,
      "step": 83
    },
    {
      "epoch": 0.003218945795252055,
      "grad_norm": 1.4923970699310303,
      "learning_rate": 7.597595192178702e-06,
      "loss": 8.3606,
      "step": 84
    },
    {
      "epoch": 0.003257266578528865,
      "grad_norm": 1.470651626586914,
      "learning_rate": 6.698729810778065e-06,
      "loss": 8.3707,
      "step": 85
    },
    {
      "epoch": 0.0032955873618056753,
      "grad_norm": 1.3880239725112915,
      "learning_rate": 5.852620357053651e-06,
      "loss": 8.5658,
      "step": 86
    },
    {
      "epoch": 0.0033339081450824855,
      "grad_norm": 1.5443851947784424,
      "learning_rate": 5.060297685041659e-06,
      "loss": 8.381,
      "step": 87
    },
    {
      "epoch": 0.003372228928359296,
      "grad_norm": 1.3600224256515503,
      "learning_rate": 4.322727117869951e-06,
      "loss": 8.4857,
      "step": 88
    },
    {
      "epoch": 0.0034105497116361056,
      "grad_norm": 1.3307414054870605,
      "learning_rate": 3.6408072716606346e-06,
      "loss": 8.3026,
      "step": 89
    },
    {
      "epoch": 0.003448870494912916,
      "grad_norm": 1.3035300970077515,
      "learning_rate": 3.0153689607045845e-06,
      "loss": 8.3625,
      "step": 90
    },
    {
      "epoch": 0.003448870494912916,
      "eval_loss": 8.436622619628906,
      "eval_runtime": 197.0498,
      "eval_samples_per_second": 111.525,
      "eval_steps_per_second": 13.941,
      "step": 90
    },
    {
      "epoch": 0.003487191278189726,
      "grad_norm": 1.4547570943832397,
      "learning_rate": 2.4471741852423237e-06,
      "loss": 8.4409,
      "step": 91
    },
    {
      "epoch": 0.0035255120614665365,
      "grad_norm": 1.3213475942611694,
      "learning_rate": 1.9369152030840556e-06,
      "loss": 8.4133,
      "step": 92
    },
    {
      "epoch": 0.0035638328447433467,
      "grad_norm": 1.3285075426101685,
      "learning_rate": 1.4852136862001764e-06,
      "loss": 8.4509,
      "step": 93
    },
    {
      "epoch": 0.0036021536280201566,
      "grad_norm": 1.4316322803497314,
      "learning_rate": 1.0926199633097157e-06,
      "loss": 8.4731,
      "step": 94
    },
    {
      "epoch": 0.003640474411296967,
      "grad_norm": 1.4853651523590088,
      "learning_rate": 7.596123493895991e-07,
      "loss": 8.4,
      "step": 95
    },
    {
      "epoch": 0.003678795194573777,
      "grad_norm": 1.407329797744751,
      "learning_rate": 4.865965629214819e-07,
      "loss": 8.4689,
      "step": 96
    },
    {
      "epoch": 0.0037171159778505874,
      "grad_norm": 1.4351682662963867,
      "learning_rate": 2.7390523158633554e-07,
      "loss": 8.3587,
      "step": 97
    },
    {
      "epoch": 0.0037554367611273972,
      "grad_norm": 1.4511561393737793,
      "learning_rate": 1.2179748700879012e-07,
      "loss": 8.4413,
      "step": 98
    },
    {
      "epoch": 0.0037937575444042075,
      "grad_norm": 1.4957984685897827,
      "learning_rate": 3.04586490452119e-08,
      "loss": 8.3962,
      "step": 99
    },
    {
      "epoch": 0.0037937575444042075,
      "eval_loss": 8.434378623962402,
      "eval_runtime": 197.6671,
      "eval_samples_per_second": 111.177,
      "eval_steps_per_second": 13.897,
      "step": 99
    },
    {
      "epoch": 0.003832078327681018,
      "grad_norm": 1.5082837343215942,
      "learning_rate": 0.0,
      "loss": 8.2469,
      "step": 100
    }
  ],
  "logging_steps": 1,
  "max_steps": 100,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 25,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 27149520076800.0,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}