|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0, |
|
"eval_steps": 200, |
|
"global_step": 6341, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.004731114966093676, |
|
"grad_norm": 0.9585382342338562, |
|
"learning_rate": 7.235790156711095e-05, |
|
"loss": 1.19, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.009462229932187352, |
|
"grad_norm": 0.9438452124595642, |
|
"learning_rate": 8.817139967814685e-05, |
|
"loss": 1.0589, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.014193344898281028, |
|
"grad_norm": 0.9442492723464966, |
|
"learning_rate": 9.722413360750843e-05, |
|
"loss": 1.0764, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.018924459864374705, |
|
"grad_norm": 0.8840267658233643, |
|
"learning_rate": 9.994621104255655e-05, |
|
"loss": 1.0847, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.02365557483046838, |
|
"grad_norm": 0.8207218050956726, |
|
"learning_rate": 9.985445340927068e-05, |
|
"loss": 1.0912, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.028386689796562056, |
|
"grad_norm": 0.8883314728736877, |
|
"learning_rate": 9.975953171966461e-05, |
|
"loss": 1.0608, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.03154076644062451, |
|
"eval_loss": 1.2097724676132202, |
|
"eval_runtime": 3.756, |
|
"eval_samples_per_second": 26.89, |
|
"eval_steps_per_second": 3.461, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.03311780476265573, |
|
"grad_norm": 0.7577874064445496, |
|
"learning_rate": 9.966461003005853e-05, |
|
"loss": 1.0802, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.03784891972874941, |
|
"grad_norm": 1.4911932945251465, |
|
"learning_rate": 9.956968834045246e-05, |
|
"loss": 1.0397, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.04258003469484308, |
|
"grad_norm": 0.8236317038536072, |
|
"learning_rate": 9.947476665084638e-05, |
|
"loss": 1.0575, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.04731114966093676, |
|
"grad_norm": 0.7883521318435669, |
|
"learning_rate": 9.937984496124031e-05, |
|
"loss": 1.0369, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.05204226462703044, |
|
"grad_norm": 0.7798565626144409, |
|
"learning_rate": 9.928492327163424e-05, |
|
"loss": 1.0354, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.05677337959312411, |
|
"grad_norm": 0.7784315943717957, |
|
"learning_rate": 9.919000158202817e-05, |
|
"loss": 1.0341, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.06150449455921779, |
|
"grad_norm": 0.836300790309906, |
|
"learning_rate": 9.909507989242209e-05, |
|
"loss": 1.0272, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.06308153288124901, |
|
"eval_loss": 1.1889104843139648, |
|
"eval_runtime": 3.7553, |
|
"eval_samples_per_second": 26.895, |
|
"eval_steps_per_second": 3.462, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.06623560952531146, |
|
"grad_norm": 0.7245925664901733, |
|
"learning_rate": 9.900015820281602e-05, |
|
"loss": 1.0256, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.07096672449140515, |
|
"grad_norm": 0.8321049213409424, |
|
"learning_rate": 9.890523651320994e-05, |
|
"loss": 1.0332, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.07569783945749882, |
|
"grad_norm": 0.7657173275947571, |
|
"learning_rate": 9.881031482360387e-05, |
|
"loss": 1.0221, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.08042895442359249, |
|
"grad_norm": 0.7464463114738464, |
|
"learning_rate": 9.871539313399779e-05, |
|
"loss": 0.9911, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.08516006938968616, |
|
"grad_norm": 0.7290617227554321, |
|
"learning_rate": 9.862047144439172e-05, |
|
"loss": 1.0258, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.08989118435577985, |
|
"grad_norm": 0.7311350703239441, |
|
"learning_rate": 9.852554975478564e-05, |
|
"loss": 1.0165, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.09462229932187352, |
|
"grad_norm": 0.8087915182113647, |
|
"learning_rate": 9.843062806517957e-05, |
|
"loss": 0.9716, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.09462229932187352, |
|
"eval_loss": 1.1471492052078247, |
|
"eval_runtime": 3.7536, |
|
"eval_samples_per_second": 26.907, |
|
"eval_steps_per_second": 3.463, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.09935341428796719, |
|
"grad_norm": 0.7442970275878906, |
|
"learning_rate": 9.833570637557348e-05, |
|
"loss": 0.9747, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.10408452925406088, |
|
"grad_norm": 0.9510965347290039, |
|
"learning_rate": 9.824078468596742e-05, |
|
"loss": 0.9582, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.10881564422015455, |
|
"grad_norm": 0.6995567083358765, |
|
"learning_rate": 9.814586299636133e-05, |
|
"loss": 1.0118, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.11354675918624822, |
|
"grad_norm": 0.9319436550140381, |
|
"learning_rate": 9.805094130675526e-05, |
|
"loss": 0.9815, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.11827787415234191, |
|
"grad_norm": 0.7033783793449402, |
|
"learning_rate": 9.795601961714918e-05, |
|
"loss": 0.9738, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.12300898911843558, |
|
"grad_norm": 0.6606217622756958, |
|
"learning_rate": 9.786109792754311e-05, |
|
"loss": 0.961, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.12616306576249803, |
|
"eval_loss": 1.125948190689087, |
|
"eval_runtime": 3.7557, |
|
"eval_samples_per_second": 26.892, |
|
"eval_steps_per_second": 3.461, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.12774010408452927, |
|
"grad_norm": 0.9087960124015808, |
|
"learning_rate": 9.776617623793703e-05, |
|
"loss": 0.9734, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.13247121905062292, |
|
"grad_norm": 0.7387025952339172, |
|
"learning_rate": 9.767125454833097e-05, |
|
"loss": 0.9605, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.1372023340167166, |
|
"grad_norm": 0.7939543724060059, |
|
"learning_rate": 9.757633285872489e-05, |
|
"loss": 0.952, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.1419334489828103, |
|
"grad_norm": 1.1417864561080933, |
|
"learning_rate": 9.748141116911882e-05, |
|
"loss": 0.9113, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.14666456394890395, |
|
"grad_norm": 0.7591778635978699, |
|
"learning_rate": 9.738648947951274e-05, |
|
"loss": 0.9565, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.15139567891499764, |
|
"grad_norm": 0.759545087814331, |
|
"learning_rate": 9.729156778990667e-05, |
|
"loss": 0.9401, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.1561267938810913, |
|
"grad_norm": 0.700552761554718, |
|
"learning_rate": 9.719664610030059e-05, |
|
"loss": 0.9447, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.15770383220312253, |
|
"eval_loss": 1.0677810907363892, |
|
"eval_runtime": 3.7551, |
|
"eval_samples_per_second": 26.897, |
|
"eval_steps_per_second": 3.462, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.16085790884718498, |
|
"grad_norm": 0.6673519015312195, |
|
"learning_rate": 9.710172441069452e-05, |
|
"loss": 0.8919, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.16558902381327867, |
|
"grad_norm": 0.8046931028366089, |
|
"learning_rate": 9.700680272108844e-05, |
|
"loss": 0.9136, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.17032013877937233, |
|
"grad_norm": 0.7277413606643677, |
|
"learning_rate": 9.691188103148237e-05, |
|
"loss": 0.9001, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.175051253745466, |
|
"grad_norm": 0.661359429359436, |
|
"learning_rate": 9.681695934187629e-05, |
|
"loss": 0.9119, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 0.1797823687115597, |
|
"grad_norm": 0.7349006533622742, |
|
"learning_rate": 9.672203765227022e-05, |
|
"loss": 0.8825, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.18451348367765336, |
|
"grad_norm": 0.7114729285240173, |
|
"learning_rate": 9.662711596266414e-05, |
|
"loss": 0.8872, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 0.18924459864374704, |
|
"grad_norm": 0.6496574282646179, |
|
"learning_rate": 9.653219427305807e-05, |
|
"loss": 0.8809, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.18924459864374704, |
|
"eval_loss": 1.0253973007202148, |
|
"eval_runtime": 3.7532, |
|
"eval_samples_per_second": 26.91, |
|
"eval_steps_per_second": 3.464, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.19397571360984073, |
|
"grad_norm": 0.6576619744300842, |
|
"learning_rate": 9.643727258345198e-05, |
|
"loss": 0.876, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 0.19870682857593439, |
|
"grad_norm": 0.666749119758606, |
|
"learning_rate": 9.634235089384591e-05, |
|
"loss": 0.8877, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 0.20343794354202807, |
|
"grad_norm": 0.7769750952720642, |
|
"learning_rate": 9.624742920423983e-05, |
|
"loss": 0.8894, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 0.20816905850812176, |
|
"grad_norm": 0.6562801599502563, |
|
"learning_rate": 9.615250751463376e-05, |
|
"loss": 0.8912, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 0.21290017347421542, |
|
"grad_norm": 0.6531364917755127, |
|
"learning_rate": 9.605758582502768e-05, |
|
"loss": 0.875, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.2176312884403091, |
|
"grad_norm": 0.6414660811424255, |
|
"learning_rate": 9.596266413542163e-05, |
|
"loss": 0.8721, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 0.22078536508437155, |
|
"eval_loss": 1.0128834247589111, |
|
"eval_runtime": 3.7539, |
|
"eval_samples_per_second": 26.906, |
|
"eval_steps_per_second": 3.463, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.2223624034064028, |
|
"grad_norm": 0.8413099646568298, |
|
"learning_rate": 9.586774244581554e-05, |
|
"loss": 0.8807, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 0.22709351837249644, |
|
"grad_norm": 0.6748294830322266, |
|
"learning_rate": 9.577282075620947e-05, |
|
"loss": 0.8245, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 0.23182463333859013, |
|
"grad_norm": 0.7067525386810303, |
|
"learning_rate": 9.567789906660339e-05, |
|
"loss": 0.8767, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 0.23655574830468382, |
|
"grad_norm": 1.074791431427002, |
|
"learning_rate": 9.558297737699732e-05, |
|
"loss": 0.8856, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.24128686327077747, |
|
"grad_norm": 0.7461240887641907, |
|
"learning_rate": 9.548805568739124e-05, |
|
"loss": 0.8759, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 0.24601797823687116, |
|
"grad_norm": 0.6231616139411926, |
|
"learning_rate": 9.539313399778517e-05, |
|
"loss": 0.837, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 0.25074909320296485, |
|
"grad_norm": 0.7053641080856323, |
|
"learning_rate": 9.529821230817909e-05, |
|
"loss": 0.8763, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 0.25232613152499606, |
|
"eval_loss": 0.9505324959754944, |
|
"eval_runtime": 3.7563, |
|
"eval_samples_per_second": 26.888, |
|
"eval_steps_per_second": 3.461, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.25548020816905853, |
|
"grad_norm": 0.6484207510948181, |
|
"learning_rate": 9.520329061857302e-05, |
|
"loss": 0.8787, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 0.26021132313515216, |
|
"grad_norm": 0.5929827094078064, |
|
"learning_rate": 9.510836892896694e-05, |
|
"loss": 0.844, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.26494243810124585, |
|
"grad_norm": 0.6840829849243164, |
|
"learning_rate": 9.501344723936087e-05, |
|
"loss": 0.8492, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 0.26967355306733953, |
|
"grad_norm": 0.7365448474884033, |
|
"learning_rate": 9.491852554975479e-05, |
|
"loss": 0.8584, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 0.2744046680334332, |
|
"grad_norm": 0.6528182029724121, |
|
"learning_rate": 9.482360386014872e-05, |
|
"loss": 0.8346, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 0.2791357829995269, |
|
"grad_norm": 0.6200223565101624, |
|
"learning_rate": 9.472868217054263e-05, |
|
"loss": 0.8008, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 0.2838668979656206, |
|
"grad_norm": 0.7503982186317444, |
|
"learning_rate": 9.463376048093657e-05, |
|
"loss": 0.8197, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.2838668979656206, |
|
"eval_loss": 0.9286572933197021, |
|
"eval_runtime": 3.7535, |
|
"eval_samples_per_second": 26.908, |
|
"eval_steps_per_second": 3.463, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.2885980129317142, |
|
"grad_norm": 0.6671140193939209, |
|
"learning_rate": 9.453883879133048e-05, |
|
"loss": 0.8405, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 0.2933291278978079, |
|
"grad_norm": 0.7057023048400879, |
|
"learning_rate": 9.444391710172441e-05, |
|
"loss": 0.7822, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 0.2980602428639016, |
|
"grad_norm": 0.8120527267456055, |
|
"learning_rate": 9.434899541211833e-05, |
|
"loss": 0.8416, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 0.3027913578299953, |
|
"grad_norm": 0.622718334197998, |
|
"learning_rate": 9.425407372251228e-05, |
|
"loss": 0.8174, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 0.30752247279608896, |
|
"grad_norm": 0.6605896353721619, |
|
"learning_rate": 9.41591520329062e-05, |
|
"loss": 0.8003, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 0.3122535877621826, |
|
"grad_norm": 0.7473495006561279, |
|
"learning_rate": 9.406423034330012e-05, |
|
"loss": 0.798, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 0.31540766440624507, |
|
"eval_loss": 0.8976284861564636, |
|
"eval_runtime": 3.7537, |
|
"eval_samples_per_second": 26.907, |
|
"eval_steps_per_second": 3.463, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.3169847027282763, |
|
"grad_norm": 0.7177520394325256, |
|
"learning_rate": 9.396930865369404e-05, |
|
"loss": 0.8168, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 0.32171581769436997, |
|
"grad_norm": 0.7600869536399841, |
|
"learning_rate": 9.387438696408797e-05, |
|
"loss": 0.7918, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 0.32644693266046365, |
|
"grad_norm": 0.7001503109931946, |
|
"learning_rate": 9.377946527448189e-05, |
|
"loss": 0.7906, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 0.33117804762655734, |
|
"grad_norm": 0.6279382705688477, |
|
"learning_rate": 9.368454358487582e-05, |
|
"loss": 0.7624, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.335909162592651, |
|
"grad_norm": 0.7481889128684998, |
|
"learning_rate": 9.358962189526974e-05, |
|
"loss": 0.7849, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 0.34064027755874465, |
|
"grad_norm": 0.6797828078269958, |
|
"learning_rate": 9.349470020566367e-05, |
|
"loss": 0.7899, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 0.34537139252483834, |
|
"grad_norm": 0.6929941177368164, |
|
"learning_rate": 9.339977851605759e-05, |
|
"loss": 0.7703, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 0.3469484308468696, |
|
"eval_loss": 0.8858568072319031, |
|
"eval_runtime": 3.7538, |
|
"eval_samples_per_second": 26.906, |
|
"eval_steps_per_second": 3.463, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.350102507490932, |
|
"grad_norm": 0.698906660079956, |
|
"learning_rate": 9.330485682645152e-05, |
|
"loss": 0.7724, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 0.3548336224570257, |
|
"grad_norm": 0.779211163520813, |
|
"learning_rate": 9.320993513684544e-05, |
|
"loss": 0.7875, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 0.3595647374231194, |
|
"grad_norm": 0.7313475608825684, |
|
"learning_rate": 9.311817750355957e-05, |
|
"loss": 0.794, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 0.3642958523892131, |
|
"grad_norm": 0.6143506169319153, |
|
"learning_rate": 9.30232558139535e-05, |
|
"loss": 0.7742, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 0.3690269673553067, |
|
"grad_norm": 0.6775010824203491, |
|
"learning_rate": 9.292833412434741e-05, |
|
"loss": 0.7822, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 0.3737580823214004, |
|
"grad_norm": 0.7151722311973572, |
|
"learning_rate": 9.283341243474134e-05, |
|
"loss": 0.7617, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 0.3784891972874941, |
|
"grad_norm": 0.6855128407478333, |
|
"learning_rate": 9.273849074513526e-05, |
|
"loss": 0.7668, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.3784891972874941, |
|
"eval_loss": 0.8862702250480652, |
|
"eval_runtime": 3.7541, |
|
"eval_samples_per_second": 26.904, |
|
"eval_steps_per_second": 3.463, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.38322031225358777, |
|
"grad_norm": 0.743325412273407, |
|
"learning_rate": 9.26435690555292e-05, |
|
"loss": 0.7885, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 0.38795142721968146, |
|
"grad_norm": 0.6186659932136536, |
|
"learning_rate": 9.254864736592311e-05, |
|
"loss": 0.7619, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 0.39268254218577514, |
|
"grad_norm": 0.6791619062423706, |
|
"learning_rate": 9.245372567631704e-05, |
|
"loss": 0.8084, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 0.39741365715186877, |
|
"grad_norm": 0.6537867784500122, |
|
"learning_rate": 9.235880398671097e-05, |
|
"loss": 0.7641, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 0.40214477211796246, |
|
"grad_norm": 0.6688680052757263, |
|
"learning_rate": 9.22638822971049e-05, |
|
"loss": 0.7634, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 0.40687588708405614, |
|
"grad_norm": 0.6369423866271973, |
|
"learning_rate": 9.216896060749882e-05, |
|
"loss": 0.7407, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 0.4100299637281186, |
|
"eval_loss": 0.8817442059516907, |
|
"eval_runtime": 3.7541, |
|
"eval_samples_per_second": 26.904, |
|
"eval_steps_per_second": 3.463, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.41160700205014983, |
|
"grad_norm": 0.6841573119163513, |
|
"learning_rate": 9.207403891789275e-05, |
|
"loss": 0.7572, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 0.4163381170162435, |
|
"grad_norm": 0.625957727432251, |
|
"learning_rate": 9.197911722828667e-05, |
|
"loss": 0.7493, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 0.42106923198233714, |
|
"grad_norm": 0.7467941641807556, |
|
"learning_rate": 9.18841955386806e-05, |
|
"loss": 0.7468, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 0.42580034694843083, |
|
"grad_norm": 0.6891815662384033, |
|
"learning_rate": 9.178927384907452e-05, |
|
"loss": 0.7698, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.4305314619145245, |
|
"grad_norm": 0.6197889447212219, |
|
"learning_rate": 9.169435215946845e-05, |
|
"loss": 0.7588, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 0.4352625768806182, |
|
"grad_norm": 0.7140328884124756, |
|
"learning_rate": 9.159943046986237e-05, |
|
"loss": 0.7569, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 0.4399936918467119, |
|
"grad_norm": 0.7718496322631836, |
|
"learning_rate": 9.15045087802563e-05, |
|
"loss": 0.7448, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 0.4415707301687431, |
|
"eval_loss": 0.8855557441711426, |
|
"eval_runtime": 3.7544, |
|
"eval_samples_per_second": 26.902, |
|
"eval_steps_per_second": 3.463, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.4447248068128056, |
|
"grad_norm": 0.6447039246559143, |
|
"learning_rate": 9.140958709065022e-05, |
|
"loss": 0.7623, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 0.4494559217788992, |
|
"grad_norm": 0.6694769859313965, |
|
"learning_rate": 9.131466540104415e-05, |
|
"loss": 0.7081, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 0.4541870367449929, |
|
"grad_norm": 0.6863081455230713, |
|
"learning_rate": 9.121974371143806e-05, |
|
"loss": 0.7228, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 0.4589181517110866, |
|
"grad_norm": 0.7198454737663269, |
|
"learning_rate": 9.1124822021832e-05, |
|
"loss": 0.7356, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 0.46364926667718026, |
|
"grad_norm": 0.6542885303497314, |
|
"learning_rate": 9.102990033222591e-05, |
|
"loss": 0.7606, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 0.46838038164327395, |
|
"grad_norm": 0.657539963722229, |
|
"learning_rate": 9.093497864261984e-05, |
|
"loss": 0.7255, |
|
"step": 2970 |
|
}, |
|
{ |
|
"epoch": 0.47311149660936763, |
|
"grad_norm": 0.819503664970398, |
|
"learning_rate": 9.084005695301376e-05, |
|
"loss": 0.7184, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.47311149660936763, |
|
"eval_loss": 0.8140414357185364, |
|
"eval_runtime": 3.7531, |
|
"eval_samples_per_second": 26.911, |
|
"eval_steps_per_second": 3.464, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.47784261157546126, |
|
"grad_norm": 0.7199704647064209, |
|
"learning_rate": 9.074513526340769e-05, |
|
"loss": 0.7227, |
|
"step": 3030 |
|
}, |
|
{ |
|
"epoch": 0.48257372654155495, |
|
"grad_norm": 0.7655025720596313, |
|
"learning_rate": 9.065021357380162e-05, |
|
"loss": 0.7217, |
|
"step": 3060 |
|
}, |
|
{ |
|
"epoch": 0.48730484150764863, |
|
"grad_norm": 0.7312873601913452, |
|
"learning_rate": 9.055845594051574e-05, |
|
"loss": 0.7059, |
|
"step": 3090 |
|
}, |
|
{ |
|
"epoch": 0.4920359564737423, |
|
"grad_norm": 0.5961809158325195, |
|
"learning_rate": 9.046353425090967e-05, |
|
"loss": 0.7033, |
|
"step": 3120 |
|
}, |
|
{ |
|
"epoch": 0.496767071439836, |
|
"grad_norm": 0.6955564022064209, |
|
"learning_rate": 9.03686125613036e-05, |
|
"loss": 0.7289, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 0.5014981864059297, |
|
"grad_norm": 0.6622660160064697, |
|
"learning_rate": 9.027369087169752e-05, |
|
"loss": 0.6935, |
|
"step": 3180 |
|
}, |
|
{ |
|
"epoch": 0.5046522630499921, |
|
"eval_loss": 0.7775673270225525, |
|
"eval_runtime": 3.754, |
|
"eval_samples_per_second": 26.904, |
|
"eval_steps_per_second": 3.463, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.5062293013720234, |
|
"grad_norm": 0.7262014746665955, |
|
"learning_rate": 9.017876918209145e-05, |
|
"loss": 0.6906, |
|
"step": 3210 |
|
}, |
|
{ |
|
"epoch": 0.5109604163381171, |
|
"grad_norm": 0.7221697568893433, |
|
"learning_rate": 9.008384749248537e-05, |
|
"loss": 0.7079, |
|
"step": 3240 |
|
}, |
|
{ |
|
"epoch": 0.5156915313042106, |
|
"grad_norm": 0.7115603089332581, |
|
"learning_rate": 8.99889258028793e-05, |
|
"loss": 0.7191, |
|
"step": 3270 |
|
}, |
|
{ |
|
"epoch": 0.5204226462703043, |
|
"grad_norm": 0.7292232513427734, |
|
"learning_rate": 8.989400411327322e-05, |
|
"loss": 0.6702, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.525153761236398, |
|
"grad_norm": 0.741580605506897, |
|
"learning_rate": 8.979908242366715e-05, |
|
"loss": 0.6762, |
|
"step": 3330 |
|
}, |
|
{ |
|
"epoch": 0.5298848762024917, |
|
"grad_norm": 0.7870708107948303, |
|
"learning_rate": 8.970416073406108e-05, |
|
"loss": 0.6838, |
|
"step": 3360 |
|
}, |
|
{ |
|
"epoch": 0.5346159911685854, |
|
"grad_norm": 0.71812903881073, |
|
"learning_rate": 8.9609239044455e-05, |
|
"loss": 0.7174, |
|
"step": 3390 |
|
}, |
|
{ |
|
"epoch": 0.5361930294906166, |
|
"eval_loss": 0.7375061511993408, |
|
"eval_runtime": 3.7548, |
|
"eval_samples_per_second": 26.899, |
|
"eval_steps_per_second": 3.462, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.5393471061346791, |
|
"grad_norm": 0.7266995906829834, |
|
"learning_rate": 8.951431735484893e-05, |
|
"loss": 0.6763, |
|
"step": 3420 |
|
}, |
|
{ |
|
"epoch": 0.5440782211007728, |
|
"grad_norm": 0.7786857485771179, |
|
"learning_rate": 8.941939566524284e-05, |
|
"loss": 0.7149, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 0.5488093360668664, |
|
"grad_norm": 0.7807109355926514, |
|
"learning_rate": 8.932447397563677e-05, |
|
"loss": 0.6534, |
|
"step": 3480 |
|
}, |
|
{ |
|
"epoch": 0.5535404510329601, |
|
"grad_norm": 0.6960239410400391, |
|
"learning_rate": 8.922955228603069e-05, |
|
"loss": 0.7313, |
|
"step": 3510 |
|
}, |
|
{ |
|
"epoch": 0.5582715659990538, |
|
"grad_norm": 0.586615264415741, |
|
"learning_rate": 8.913463059642462e-05, |
|
"loss": 0.6579, |
|
"step": 3540 |
|
}, |
|
{ |
|
"epoch": 0.5630026809651475, |
|
"grad_norm": 0.9740248918533325, |
|
"learning_rate": 8.903970890681854e-05, |
|
"loss": 0.7013, |
|
"step": 3570 |
|
}, |
|
{ |
|
"epoch": 0.5677337959312412, |
|
"grad_norm": 0.6628558039665222, |
|
"learning_rate": 8.894478721721247e-05, |
|
"loss": 0.6546, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.5677337959312412, |
|
"eval_loss": 0.7031014561653137, |
|
"eval_runtime": 3.7542, |
|
"eval_samples_per_second": 26.903, |
|
"eval_steps_per_second": 3.463, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.5724649108973348, |
|
"grad_norm": 0.6030669808387756, |
|
"learning_rate": 8.884986552760639e-05, |
|
"loss": 0.7146, |
|
"step": 3630 |
|
}, |
|
{ |
|
"epoch": 0.5771960258634284, |
|
"grad_norm": 0.6010313034057617, |
|
"learning_rate": 8.875494383800032e-05, |
|
"loss": 0.6816, |
|
"step": 3660 |
|
}, |
|
{ |
|
"epoch": 0.5819271408295221, |
|
"grad_norm": 0.6319311857223511, |
|
"learning_rate": 8.866002214839425e-05, |
|
"loss": 0.6642, |
|
"step": 3690 |
|
}, |
|
{ |
|
"epoch": 0.5866582557956158, |
|
"grad_norm": 0.6059941053390503, |
|
"learning_rate": 8.856510045878817e-05, |
|
"loss": 0.6998, |
|
"step": 3720 |
|
}, |
|
{ |
|
"epoch": 0.5913893707617095, |
|
"grad_norm": 0.5976997017860413, |
|
"learning_rate": 8.84701787691821e-05, |
|
"loss": 0.6694, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 0.5961204857278032, |
|
"grad_norm": 0.6985177993774414, |
|
"learning_rate": 8.837525707957602e-05, |
|
"loss": 0.6402, |
|
"step": 3780 |
|
}, |
|
{ |
|
"epoch": 0.5992745623718656, |
|
"eval_loss": 0.6977850198745728, |
|
"eval_runtime": 3.7545, |
|
"eval_samples_per_second": 26.901, |
|
"eval_steps_per_second": 3.462, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 0.6008516006938969, |
|
"grad_norm": 0.7076742053031921, |
|
"learning_rate": 8.828033538996995e-05, |
|
"loss": 0.6749, |
|
"step": 3810 |
|
}, |
|
{ |
|
"epoch": 0.6055827156599906, |
|
"grad_norm": 0.9254401326179504, |
|
"learning_rate": 8.818541370036387e-05, |
|
"loss": 0.6481, |
|
"step": 3840 |
|
}, |
|
{ |
|
"epoch": 0.6103138306260842, |
|
"grad_norm": 0.7403334379196167, |
|
"learning_rate": 8.80904920107578e-05, |
|
"loss": 0.6704, |
|
"step": 3870 |
|
}, |
|
{ |
|
"epoch": 0.6150449455921779, |
|
"grad_norm": 0.6302973628044128, |
|
"learning_rate": 8.799557032115171e-05, |
|
"loss": 0.6717, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 0.6197760605582716, |
|
"grad_norm": 0.7587308287620544, |
|
"learning_rate": 8.790064863154565e-05, |
|
"loss": 0.6526, |
|
"step": 3930 |
|
}, |
|
{ |
|
"epoch": 0.6245071755243652, |
|
"grad_norm": 0.768151581287384, |
|
"learning_rate": 8.780572694193956e-05, |
|
"loss": 0.6614, |
|
"step": 3960 |
|
}, |
|
{ |
|
"epoch": 0.6292382904904589, |
|
"grad_norm": 0.662624716758728, |
|
"learning_rate": 8.77108052523335e-05, |
|
"loss": 0.6471, |
|
"step": 3990 |
|
}, |
|
{ |
|
"epoch": 0.6308153288124901, |
|
"eval_loss": 0.6685364246368408, |
|
"eval_runtime": 3.7533, |
|
"eval_samples_per_second": 26.909, |
|
"eval_steps_per_second": 3.464, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.6339694054565526, |
|
"grad_norm": 0.614434540271759, |
|
"learning_rate": 8.761588356272743e-05, |
|
"loss": 0.6305, |
|
"step": 4020 |
|
}, |
|
{ |
|
"epoch": 0.6387005204226462, |
|
"grad_norm": 0.7292618751525879, |
|
"learning_rate": 8.752096187312134e-05, |
|
"loss": 0.632, |
|
"step": 4050 |
|
}, |
|
{ |
|
"epoch": 0.6434316353887399, |
|
"grad_norm": 0.5890663862228394, |
|
"learning_rate": 8.742604018351527e-05, |
|
"loss": 0.6594, |
|
"step": 4080 |
|
}, |
|
{ |
|
"epoch": 0.6481627503548336, |
|
"grad_norm": 0.6511669158935547, |
|
"learning_rate": 8.733111849390919e-05, |
|
"loss": 0.6417, |
|
"step": 4110 |
|
}, |
|
{ |
|
"epoch": 0.6528938653209273, |
|
"grad_norm": 0.6794877648353577, |
|
"learning_rate": 8.723619680430312e-05, |
|
"loss": 0.6472, |
|
"step": 4140 |
|
}, |
|
{ |
|
"epoch": 0.657624980287021, |
|
"grad_norm": 0.5826547145843506, |
|
"learning_rate": 8.714127511469704e-05, |
|
"loss": 0.6255, |
|
"step": 4170 |
|
}, |
|
{ |
|
"epoch": 0.6623560952531147, |
|
"grad_norm": 0.8411812782287598, |
|
"learning_rate": 8.704635342509097e-05, |
|
"loss": 0.6368, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 0.6623560952531147, |
|
"eval_loss": 0.6538847088813782, |
|
"eval_runtime": 3.7543, |
|
"eval_samples_per_second": 26.903, |
|
"eval_steps_per_second": 3.463, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 0.6670872102192084, |
|
"grad_norm": 0.5682166218757629, |
|
"learning_rate": 8.69514317354849e-05, |
|
"loss": 0.6269, |
|
"step": 4230 |
|
}, |
|
{ |
|
"epoch": 0.671818325185302, |
|
"grad_norm": 0.6340855360031128, |
|
"learning_rate": 8.685651004587882e-05, |
|
"loss": 0.6423, |
|
"step": 4260 |
|
}, |
|
{ |
|
"epoch": 0.6765494401513957, |
|
"grad_norm": 0.6693681478500366, |
|
"learning_rate": 8.676158835627275e-05, |
|
"loss": 0.6471, |
|
"step": 4290 |
|
}, |
|
{ |
|
"epoch": 0.6812805551174893, |
|
"grad_norm": 0.6101056337356567, |
|
"learning_rate": 8.666666666666667e-05, |
|
"loss": 0.6168, |
|
"step": 4320 |
|
}, |
|
{ |
|
"epoch": 0.686011670083583, |
|
"grad_norm": 0.6096228361129761, |
|
"learning_rate": 8.65717449770606e-05, |
|
"loss": 0.6494, |
|
"step": 4350 |
|
}, |
|
{ |
|
"epoch": 0.6907427850496767, |
|
"grad_norm": 0.6632306575775146, |
|
"learning_rate": 8.647682328745452e-05, |
|
"loss": 0.664, |
|
"step": 4380 |
|
}, |
|
{ |
|
"epoch": 0.6938968616937392, |
|
"eval_loss": 0.6377571225166321, |
|
"eval_runtime": 3.756, |
|
"eval_samples_per_second": 26.89, |
|
"eval_steps_per_second": 3.461, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 0.6954739000157704, |
|
"grad_norm": 0.6547721028327942, |
|
"learning_rate": 8.638190159784845e-05, |
|
"loss": 0.6091, |
|
"step": 4410 |
|
}, |
|
{ |
|
"epoch": 0.700205014981864, |
|
"grad_norm": 0.6063847541809082, |
|
"learning_rate": 8.628697990824237e-05, |
|
"loss": 0.6055, |
|
"step": 4440 |
|
}, |
|
{ |
|
"epoch": 0.7049361299479577, |
|
"grad_norm": 0.6687933802604675, |
|
"learning_rate": 8.61920582186363e-05, |
|
"loss": 0.601, |
|
"step": 4470 |
|
}, |
|
{ |
|
"epoch": 0.7096672449140514, |
|
"grad_norm": 0.701770007610321, |
|
"learning_rate": 8.609713652903021e-05, |
|
"loss": 0.6064, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.7143983598801451, |
|
"grad_norm": 0.6652805209159851, |
|
"learning_rate": 8.600221483942414e-05, |
|
"loss": 0.653, |
|
"step": 4530 |
|
}, |
|
{ |
|
"epoch": 0.7191294748462388, |
|
"grad_norm": 0.6469018459320068, |
|
"learning_rate": 8.590729314981806e-05, |
|
"loss": 0.6019, |
|
"step": 4560 |
|
}, |
|
{ |
|
"epoch": 0.7238605898123325, |
|
"grad_norm": 0.6343564391136169, |
|
"learning_rate": 8.5812371460212e-05, |
|
"loss": 0.6083, |
|
"step": 4590 |
|
}, |
|
{ |
|
"epoch": 0.7254376281343636, |
|
"eval_loss": 0.6411118507385254, |
|
"eval_runtime": 3.754, |
|
"eval_samples_per_second": 26.905, |
|
"eval_steps_per_second": 3.463, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 0.7285917047784262, |
|
"grad_norm": 0.5817134976387024, |
|
"learning_rate": 8.571744977060592e-05, |
|
"loss": 0.602, |
|
"step": 4620 |
|
}, |
|
{ |
|
"epoch": 0.7333228197445197, |
|
"grad_norm": 0.5552039742469788, |
|
"learning_rate": 8.562252808099984e-05, |
|
"loss": 0.6223, |
|
"step": 4650 |
|
}, |
|
{ |
|
"epoch": 0.7380539347106134, |
|
"grad_norm": 0.6455065011978149, |
|
"learning_rate": 8.552760639139377e-05, |
|
"loss": 0.5865, |
|
"step": 4680 |
|
}, |
|
{ |
|
"epoch": 0.7427850496767071, |
|
"grad_norm": 0.6448588371276855, |
|
"learning_rate": 8.543268470178769e-05, |
|
"loss": 0.6126, |
|
"step": 4710 |
|
}, |
|
{ |
|
"epoch": 0.7475161646428008, |
|
"grad_norm": 0.6447100639343262, |
|
"learning_rate": 8.533776301218162e-05, |
|
"loss": 0.6167, |
|
"step": 4740 |
|
}, |
|
{ |
|
"epoch": 0.7522472796088945, |
|
"grad_norm": 0.6894412636756897, |
|
"learning_rate": 8.524284132257555e-05, |
|
"loss": 0.5851, |
|
"step": 4770 |
|
}, |
|
{ |
|
"epoch": 0.7569783945749882, |
|
"grad_norm": 0.6036236882209778, |
|
"learning_rate": 8.514791963296947e-05, |
|
"loss": 0.6025, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 0.7569783945749882, |
|
"eval_loss": 0.6117845177650452, |
|
"eval_runtime": 3.7554, |
|
"eval_samples_per_second": 26.894, |
|
"eval_steps_per_second": 3.462, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 0.7617095095410819, |
|
"grad_norm": 0.6214340925216675, |
|
"learning_rate": 8.50529979433634e-05, |
|
"loss": 0.6145, |
|
"step": 4830 |
|
}, |
|
{ |
|
"epoch": 0.7664406245071755, |
|
"grad_norm": 0.6933445334434509, |
|
"learning_rate": 8.495807625375732e-05, |
|
"loss": 0.6184, |
|
"step": 4860 |
|
}, |
|
{ |
|
"epoch": 0.7711717394732692, |
|
"grad_norm": 0.5649739503860474, |
|
"learning_rate": 8.486315456415125e-05, |
|
"loss": 0.5996, |
|
"step": 4890 |
|
}, |
|
{ |
|
"epoch": 0.7759028544393629, |
|
"grad_norm": 0.6250168085098267, |
|
"learning_rate": 8.476823287454517e-05, |
|
"loss": 0.5762, |
|
"step": 4920 |
|
}, |
|
{ |
|
"epoch": 0.7806339694054566, |
|
"grad_norm": 1.7125053405761719, |
|
"learning_rate": 8.46733111849391e-05, |
|
"loss": 0.5716, |
|
"step": 4950 |
|
}, |
|
{ |
|
"epoch": 0.7853650843715503, |
|
"grad_norm": 0.5721966028213501, |
|
"learning_rate": 8.457838949533302e-05, |
|
"loss": 0.5612, |
|
"step": 4980 |
|
}, |
|
{ |
|
"epoch": 0.7885191610156127, |
|
"eval_loss": 0.5980841517448425, |
|
"eval_runtime": 3.7547, |
|
"eval_samples_per_second": 26.9, |
|
"eval_steps_per_second": 3.462, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.7900961993376439, |
|
"grad_norm": 0.6716078519821167, |
|
"learning_rate": 8.448346780572695e-05, |
|
"loss": 0.5765, |
|
"step": 5010 |
|
}, |
|
{ |
|
"epoch": 0.7948273143037375, |
|
"grad_norm": 0.6005885601043701, |
|
"learning_rate": 8.438854611612086e-05, |
|
"loss": 0.5941, |
|
"step": 5040 |
|
}, |
|
{ |
|
"epoch": 0.7995584292698312, |
|
"grad_norm": 0.6507188081741333, |
|
"learning_rate": 8.42936244265148e-05, |
|
"loss": 0.5827, |
|
"step": 5070 |
|
}, |
|
{ |
|
"epoch": 0.8042895442359249, |
|
"grad_norm": 0.7276827096939087, |
|
"learning_rate": 8.419870273690871e-05, |
|
"loss": 0.5555, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 0.8090206592020186, |
|
"grad_norm": 0.6792399287223816, |
|
"learning_rate": 8.410378104730264e-05, |
|
"loss": 0.5724, |
|
"step": 5130 |
|
}, |
|
{ |
|
"epoch": 0.8137517741681123, |
|
"grad_norm": 0.7074045538902283, |
|
"learning_rate": 8.400885935769656e-05, |
|
"loss": 0.5724, |
|
"step": 5160 |
|
}, |
|
{ |
|
"epoch": 0.818482889134206, |
|
"grad_norm": 0.6056311130523682, |
|
"learning_rate": 8.391393766809049e-05, |
|
"loss": 0.5546, |
|
"step": 5190 |
|
}, |
|
{ |
|
"epoch": 0.8200599274562372, |
|
"eval_loss": 0.5805890560150146, |
|
"eval_runtime": 3.7561, |
|
"eval_samples_per_second": 26.889, |
|
"eval_steps_per_second": 3.461, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 0.8232140041002997, |
|
"grad_norm": 0.6667674779891968, |
|
"learning_rate": 8.381901597848441e-05, |
|
"loss": 0.6173, |
|
"step": 5220 |
|
}, |
|
{ |
|
"epoch": 0.8279451190663933, |
|
"grad_norm": 0.607284426689148, |
|
"learning_rate": 8.372409428887834e-05, |
|
"loss": 0.5781, |
|
"step": 5250 |
|
}, |
|
{ |
|
"epoch": 0.832676234032487, |
|
"grad_norm": 0.6476745009422302, |
|
"learning_rate": 8.362917259927227e-05, |
|
"loss": 0.5667, |
|
"step": 5280 |
|
}, |
|
{ |
|
"epoch": 0.8374073489985807, |
|
"grad_norm": 0.6668260097503662, |
|
"learning_rate": 8.35342509096662e-05, |
|
"loss": 0.5456, |
|
"step": 5310 |
|
}, |
|
{ |
|
"epoch": 0.8421384639646743, |
|
"grad_norm": 0.585110068321228, |
|
"learning_rate": 8.343932922006012e-05, |
|
"loss": 0.5648, |
|
"step": 5340 |
|
}, |
|
{ |
|
"epoch": 0.846869578930768, |
|
"grad_norm": 0.6268571019172668, |
|
"learning_rate": 8.334757158677425e-05, |
|
"loss": 0.555, |
|
"step": 5370 |
|
}, |
|
{ |
|
"epoch": 0.8516006938968617, |
|
"grad_norm": 0.6197232604026794, |
|
"learning_rate": 8.325264989716818e-05, |
|
"loss": 0.5333, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 0.8516006938968617, |
|
"eval_loss": 0.5601951479911804, |
|
"eval_runtime": 3.7534, |
|
"eval_samples_per_second": 26.909, |
|
"eval_steps_per_second": 3.464, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 0.8563318088629553, |
|
"grad_norm": 0.63880455493927, |
|
"learning_rate": 8.31577282075621e-05, |
|
"loss": 0.5602, |
|
"step": 5430 |
|
}, |
|
{ |
|
"epoch": 0.861062923829049, |
|
"grad_norm": 0.6235695481300354, |
|
"learning_rate": 8.306280651795603e-05, |
|
"loss": 0.5604, |
|
"step": 5460 |
|
}, |
|
{ |
|
"epoch": 0.8657940387951427, |
|
"grad_norm": 0.9000911712646484, |
|
"learning_rate": 8.296788482834995e-05, |
|
"loss": 0.5654, |
|
"step": 5490 |
|
}, |
|
{ |
|
"epoch": 0.8705251537612364, |
|
"grad_norm": 0.6557802557945251, |
|
"learning_rate": 8.287612719506408e-05, |
|
"loss": 0.5962, |
|
"step": 5520 |
|
}, |
|
{ |
|
"epoch": 0.8752562687273301, |
|
"grad_norm": 0.6231096982955933, |
|
"learning_rate": 8.278120550545801e-05, |
|
"loss": 0.5636, |
|
"step": 5550 |
|
}, |
|
{ |
|
"epoch": 0.8799873836934238, |
|
"grad_norm": 0.5984258651733398, |
|
"learning_rate": 8.268628381585192e-05, |
|
"loss": 0.5616, |
|
"step": 5580 |
|
}, |
|
{ |
|
"epoch": 0.8831414603374862, |
|
"eval_loss": 0.5611711740493774, |
|
"eval_runtime": 3.7542, |
|
"eval_samples_per_second": 26.903, |
|
"eval_steps_per_second": 3.463, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 0.8847184986595175, |
|
"grad_norm": 0.5818042159080505, |
|
"learning_rate": 8.259452618256605e-05, |
|
"loss": 0.5316, |
|
"step": 5610 |
|
}, |
|
{ |
|
"epoch": 0.8894496136256111, |
|
"grad_norm": 0.7120912671089172, |
|
"learning_rate": 8.249960449295998e-05, |
|
"loss": 0.5556, |
|
"step": 5640 |
|
}, |
|
{ |
|
"epoch": 0.8941807285917048, |
|
"grad_norm": 0.6223446130752563, |
|
"learning_rate": 8.24046828033539e-05, |
|
"loss": 0.5452, |
|
"step": 5670 |
|
}, |
|
{ |
|
"epoch": 0.8989118435577984, |
|
"grad_norm": 0.6196858286857605, |
|
"learning_rate": 8.230976111374783e-05, |
|
"loss": 0.5601, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 0.9036429585238921, |
|
"grad_norm": 0.6353973150253296, |
|
"learning_rate": 8.221483942414175e-05, |
|
"loss": 0.5402, |
|
"step": 5730 |
|
}, |
|
{ |
|
"epoch": 0.9083740734899858, |
|
"grad_norm": 0.6631510257720947, |
|
"learning_rate": 8.211991773453568e-05, |
|
"loss": 0.5382, |
|
"step": 5760 |
|
}, |
|
{ |
|
"epoch": 0.9131051884560795, |
|
"grad_norm": 0.6404465436935425, |
|
"learning_rate": 8.20249960449296e-05, |
|
"loss": 0.5298, |
|
"step": 5790 |
|
}, |
|
{ |
|
"epoch": 0.9146822267781107, |
|
"eval_loss": 0.560188353061676, |
|
"eval_runtime": 3.7541, |
|
"eval_samples_per_second": 26.904, |
|
"eval_steps_per_second": 3.463, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 0.9178363034221731, |
|
"grad_norm": 0.6810153126716614, |
|
"learning_rate": 8.193007435532353e-05, |
|
"loss": 0.5159, |
|
"step": 5820 |
|
}, |
|
{ |
|
"epoch": 0.9225674183882668, |
|
"grad_norm": 0.5828801989555359, |
|
"learning_rate": 8.183515266571745e-05, |
|
"loss": 0.5155, |
|
"step": 5850 |
|
}, |
|
{ |
|
"epoch": 0.9272985333543605, |
|
"grad_norm": 0.538987934589386, |
|
"learning_rate": 8.174023097611138e-05, |
|
"loss": 0.5273, |
|
"step": 5880 |
|
}, |
|
{ |
|
"epoch": 0.9320296483204542, |
|
"grad_norm": 0.6222363114356995, |
|
"learning_rate": 8.16453092865053e-05, |
|
"loss": 0.526, |
|
"step": 5910 |
|
}, |
|
{ |
|
"epoch": 0.9367607632865479, |
|
"grad_norm": 0.542966902256012, |
|
"learning_rate": 8.155038759689923e-05, |
|
"loss": 0.5653, |
|
"step": 5940 |
|
}, |
|
{ |
|
"epoch": 0.9414918782526416, |
|
"grad_norm": 0.7064533829689026, |
|
"learning_rate": 8.145546590729315e-05, |
|
"loss": 0.5207, |
|
"step": 5970 |
|
}, |
|
{ |
|
"epoch": 0.9462229932187353, |
|
"grad_norm": 0.6652514934539795, |
|
"learning_rate": 8.136054421768708e-05, |
|
"loss": 0.5342, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.9462229932187353, |
|
"eval_loss": 0.5476773977279663, |
|
"eval_runtime": 3.7543, |
|
"eval_samples_per_second": 26.902, |
|
"eval_steps_per_second": 3.463, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.9509541081848288, |
|
"grad_norm": 0.6436010003089905, |
|
"learning_rate": 8.126562252808101e-05, |
|
"loss": 0.536, |
|
"step": 6030 |
|
}, |
|
{ |
|
"epoch": 0.9556852231509225, |
|
"grad_norm": 0.5532657504081726, |
|
"learning_rate": 8.117070083847494e-05, |
|
"loss": 0.5261, |
|
"step": 6060 |
|
}, |
|
{ |
|
"epoch": 0.9604163381170162, |
|
"grad_norm": 0.6539950370788574, |
|
"learning_rate": 8.107577914886886e-05, |
|
"loss": 0.5226, |
|
"step": 6090 |
|
}, |
|
{ |
|
"epoch": 0.9651474530831099, |
|
"grad_norm": 0.5767289996147156, |
|
"learning_rate": 8.098085745926279e-05, |
|
"loss": 0.534, |
|
"step": 6120 |
|
}, |
|
{ |
|
"epoch": 0.9698785680492036, |
|
"grad_norm": 0.6355389356613159, |
|
"learning_rate": 8.08859357696567e-05, |
|
"loss": 0.5282, |
|
"step": 6150 |
|
}, |
|
{ |
|
"epoch": 0.9746096830152973, |
|
"grad_norm": 0.6711322665214539, |
|
"learning_rate": 8.079101408005064e-05, |
|
"loss": 0.5384, |
|
"step": 6180 |
|
}, |
|
{ |
|
"epoch": 0.9777637596593597, |
|
"eval_loss": 0.5372142195701599, |
|
"eval_runtime": 3.7547, |
|
"eval_samples_per_second": 26.899, |
|
"eval_steps_per_second": 3.462, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 0.979340797981391, |
|
"grad_norm": 0.5990795493125916, |
|
"learning_rate": 8.069609239044455e-05, |
|
"loss": 0.4624, |
|
"step": 6210 |
|
}, |
|
{ |
|
"epoch": 0.9840719129474846, |
|
"grad_norm": 0.6971167325973511, |
|
"learning_rate": 8.060117070083848e-05, |
|
"loss": 0.5015, |
|
"step": 6240 |
|
}, |
|
{ |
|
"epoch": 0.9888030279135783, |
|
"grad_norm": 0.6699081659317017, |
|
"learning_rate": 8.05062490112324e-05, |
|
"loss": 0.5325, |
|
"step": 6270 |
|
}, |
|
{ |
|
"epoch": 0.993534142879672, |
|
"grad_norm": 0.6347541213035583, |
|
"learning_rate": 8.041132732162633e-05, |
|
"loss": 0.5255, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 0.9982652578457657, |
|
"grad_norm": 0.7587487101554871, |
|
"learning_rate": 8.031640563202025e-05, |
|
"loss": 0.5154, |
|
"step": 6330 |
|
} |
|
], |
|
"logging_steps": 30, |
|
"max_steps": 31705, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 5, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2.9843838888449147e+18, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|