{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 200, "global_step": 6341, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004731114966093676, "grad_norm": 0.9585382342338562, "learning_rate": 7.235790156711095e-05, "loss": 1.19, "step": 30 }, { "epoch": 0.009462229932187352, "grad_norm": 0.9438452124595642, "learning_rate": 8.817139967814685e-05, "loss": 1.0589, "step": 60 }, { "epoch": 0.014193344898281028, "grad_norm": 0.9442492723464966, "learning_rate": 9.722413360750843e-05, "loss": 1.0764, "step": 90 }, { "epoch": 0.018924459864374705, "grad_norm": 0.8840267658233643, "learning_rate": 9.994621104255655e-05, "loss": 1.0847, "step": 120 }, { "epoch": 0.02365557483046838, "grad_norm": 0.8207218050956726, "learning_rate": 9.985445340927068e-05, "loss": 1.0912, "step": 150 }, { "epoch": 0.028386689796562056, "grad_norm": 0.8883314728736877, "learning_rate": 9.975953171966461e-05, "loss": 1.0608, "step": 180 }, { "epoch": 0.03154076644062451, "eval_loss": 1.2097724676132202, "eval_runtime": 3.756, "eval_samples_per_second": 26.89, "eval_steps_per_second": 3.461, "step": 200 }, { "epoch": 0.03311780476265573, "grad_norm": 0.7577874064445496, "learning_rate": 9.966461003005853e-05, "loss": 1.0802, "step": 210 }, { "epoch": 0.03784891972874941, "grad_norm": 1.4911932945251465, "learning_rate": 9.956968834045246e-05, "loss": 1.0397, "step": 240 }, { "epoch": 0.04258003469484308, "grad_norm": 0.8236317038536072, "learning_rate": 9.947476665084638e-05, "loss": 1.0575, "step": 270 }, { "epoch": 0.04731114966093676, "grad_norm": 0.7883521318435669, "learning_rate": 9.937984496124031e-05, "loss": 1.0369, "step": 300 }, { "epoch": 0.05204226462703044, "grad_norm": 0.7798565626144409, "learning_rate": 9.928492327163424e-05, "loss": 1.0354, "step": 330 }, { "epoch": 0.05677337959312411, "grad_norm": 0.7784315943717957, "learning_rate": 9.919000158202817e-05, "loss": 1.0341, "step": 360 }, { "epoch": 0.06150449455921779, "grad_norm": 0.836300790309906, "learning_rate": 9.909507989242209e-05, "loss": 1.0272, "step": 390 }, { "epoch": 0.06308153288124901, "eval_loss": 1.1889104843139648, "eval_runtime": 3.7553, "eval_samples_per_second": 26.895, "eval_steps_per_second": 3.462, "step": 400 }, { "epoch": 0.06623560952531146, "grad_norm": 0.7245925664901733, "learning_rate": 9.900015820281602e-05, "loss": 1.0256, "step": 420 }, { "epoch": 0.07096672449140515, "grad_norm": 0.8321049213409424, "learning_rate": 9.890523651320994e-05, "loss": 1.0332, "step": 450 }, { "epoch": 0.07569783945749882, "grad_norm": 0.7657173275947571, "learning_rate": 9.881031482360387e-05, "loss": 1.0221, "step": 480 }, { "epoch": 0.08042895442359249, "grad_norm": 0.7464463114738464, "learning_rate": 9.871539313399779e-05, "loss": 0.9911, "step": 510 }, { "epoch": 0.08516006938968616, "grad_norm": 0.7290617227554321, "learning_rate": 9.862047144439172e-05, "loss": 1.0258, "step": 540 }, { "epoch": 0.08989118435577985, "grad_norm": 0.7311350703239441, "learning_rate": 9.852554975478564e-05, "loss": 1.0165, "step": 570 }, { "epoch": 0.09462229932187352, "grad_norm": 0.8087915182113647, "learning_rate": 9.843062806517957e-05, "loss": 0.9716, "step": 600 }, { "epoch": 0.09462229932187352, "eval_loss": 1.1471492052078247, "eval_runtime": 3.7536, "eval_samples_per_second": 26.907, "eval_steps_per_second": 3.463, "step": 600 }, { "epoch": 0.09935341428796719, "grad_norm": 0.7442970275878906, "learning_rate": 9.833570637557348e-05, "loss": 0.9747, "step": 630 }, { "epoch": 0.10408452925406088, "grad_norm": 0.9510965347290039, "learning_rate": 9.824078468596742e-05, "loss": 0.9582, "step": 660 }, { "epoch": 0.10881564422015455, "grad_norm": 0.6995567083358765, "learning_rate": 9.814586299636133e-05, "loss": 1.0118, "step": 690 }, { "epoch": 0.11354675918624822, "grad_norm": 0.9319436550140381, "learning_rate": 9.805094130675526e-05, "loss": 0.9815, "step": 720 }, { "epoch": 0.11827787415234191, "grad_norm": 0.7033783793449402, "learning_rate": 9.795601961714918e-05, "loss": 0.9738, "step": 750 }, { "epoch": 0.12300898911843558, "grad_norm": 0.6606217622756958, "learning_rate": 9.786109792754311e-05, "loss": 0.961, "step": 780 }, { "epoch": 0.12616306576249803, "eval_loss": 1.125948190689087, "eval_runtime": 3.7557, "eval_samples_per_second": 26.892, "eval_steps_per_second": 3.461, "step": 800 }, { "epoch": 0.12774010408452927, "grad_norm": 0.9087960124015808, "learning_rate": 9.776617623793703e-05, "loss": 0.9734, "step": 810 }, { "epoch": 0.13247121905062292, "grad_norm": 0.7387025952339172, "learning_rate": 9.767125454833097e-05, "loss": 0.9605, "step": 840 }, { "epoch": 0.1372023340167166, "grad_norm": 0.7939543724060059, "learning_rate": 9.757633285872489e-05, "loss": 0.952, "step": 870 }, { "epoch": 0.1419334489828103, "grad_norm": 1.1417864561080933, "learning_rate": 9.748141116911882e-05, "loss": 0.9113, "step": 900 }, { "epoch": 0.14666456394890395, "grad_norm": 0.7591778635978699, "learning_rate": 9.738648947951274e-05, "loss": 0.9565, "step": 930 }, { "epoch": 0.15139567891499764, "grad_norm": 0.759545087814331, "learning_rate": 9.729156778990667e-05, "loss": 0.9401, "step": 960 }, { "epoch": 0.1561267938810913, "grad_norm": 0.700552761554718, "learning_rate": 9.719664610030059e-05, "loss": 0.9447, "step": 990 }, { "epoch": 0.15770383220312253, "eval_loss": 1.0677810907363892, "eval_runtime": 3.7551, "eval_samples_per_second": 26.897, "eval_steps_per_second": 3.462, "step": 1000 }, { "epoch": 0.16085790884718498, "grad_norm": 0.6673519015312195, "learning_rate": 9.710172441069452e-05, "loss": 0.8919, "step": 1020 }, { "epoch": 0.16558902381327867, "grad_norm": 0.8046931028366089, "learning_rate": 9.700680272108844e-05, "loss": 0.9136, "step": 1050 }, { "epoch": 0.17032013877937233, "grad_norm": 0.7277413606643677, "learning_rate": 9.691188103148237e-05, "loss": 0.9001, "step": 1080 }, { "epoch": 0.175051253745466, "grad_norm": 0.661359429359436, "learning_rate": 9.681695934187629e-05, "loss": 0.9119, "step": 1110 }, { "epoch": 0.1797823687115597, "grad_norm": 0.7349006533622742, "learning_rate": 9.672203765227022e-05, "loss": 0.8825, "step": 1140 }, { "epoch": 0.18451348367765336, "grad_norm": 0.7114729285240173, "learning_rate": 9.662711596266414e-05, "loss": 0.8872, "step": 1170 }, { "epoch": 0.18924459864374704, "grad_norm": 0.6496574282646179, "learning_rate": 9.653219427305807e-05, "loss": 0.8809, "step": 1200 }, { "epoch": 0.18924459864374704, "eval_loss": 1.0253973007202148, "eval_runtime": 3.7532, "eval_samples_per_second": 26.91, "eval_steps_per_second": 3.464, "step": 1200 }, { "epoch": 0.19397571360984073, "grad_norm": 0.6576619744300842, "learning_rate": 9.643727258345198e-05, "loss": 0.876, "step": 1230 }, { "epoch": 0.19870682857593439, "grad_norm": 0.666749119758606, "learning_rate": 9.634235089384591e-05, "loss": 0.8877, "step": 1260 }, { "epoch": 0.20343794354202807, "grad_norm": 0.7769750952720642, "learning_rate": 9.624742920423983e-05, "loss": 0.8894, "step": 1290 }, { "epoch": 0.20816905850812176, "grad_norm": 0.6562801599502563, "learning_rate": 9.615250751463376e-05, "loss": 0.8912, "step": 1320 }, { "epoch": 0.21290017347421542, "grad_norm": 0.6531364917755127, "learning_rate": 9.605758582502768e-05, "loss": 0.875, "step": 1350 }, { "epoch": 0.2176312884403091, "grad_norm": 0.6414660811424255, "learning_rate": 9.596266413542163e-05, "loss": 0.8721, "step": 1380 }, { "epoch": 0.22078536508437155, "eval_loss": 1.0128834247589111, "eval_runtime": 3.7539, "eval_samples_per_second": 26.906, "eval_steps_per_second": 3.463, "step": 1400 }, { "epoch": 0.2223624034064028, "grad_norm": 0.8413099646568298, "learning_rate": 9.586774244581554e-05, "loss": 0.8807, "step": 1410 }, { "epoch": 0.22709351837249644, "grad_norm": 0.6748294830322266, "learning_rate": 9.577282075620947e-05, "loss": 0.8245, "step": 1440 }, { "epoch": 0.23182463333859013, "grad_norm": 0.7067525386810303, "learning_rate": 9.567789906660339e-05, "loss": 0.8767, "step": 1470 }, { "epoch": 0.23655574830468382, "grad_norm": 1.074791431427002, "learning_rate": 9.558297737699732e-05, "loss": 0.8856, "step": 1500 }, { "epoch": 0.24128686327077747, "grad_norm": 0.7461240887641907, "learning_rate": 9.548805568739124e-05, "loss": 0.8759, "step": 1530 }, { "epoch": 0.24601797823687116, "grad_norm": 0.6231616139411926, "learning_rate": 9.539313399778517e-05, "loss": 0.837, "step": 1560 }, { "epoch": 0.25074909320296485, "grad_norm": 0.7053641080856323, "learning_rate": 9.529821230817909e-05, "loss": 0.8763, "step": 1590 }, { "epoch": 0.25232613152499606, "eval_loss": 0.9505324959754944, "eval_runtime": 3.7563, "eval_samples_per_second": 26.888, "eval_steps_per_second": 3.461, "step": 1600 }, { "epoch": 0.25548020816905853, "grad_norm": 0.6484207510948181, "learning_rate": 9.520329061857302e-05, "loss": 0.8787, "step": 1620 }, { "epoch": 0.26021132313515216, "grad_norm": 0.5929827094078064, "learning_rate": 9.510836892896694e-05, "loss": 0.844, "step": 1650 }, { "epoch": 0.26494243810124585, "grad_norm": 0.6840829849243164, "learning_rate": 9.501344723936087e-05, "loss": 0.8492, "step": 1680 }, { "epoch": 0.26967355306733953, "grad_norm": 0.7365448474884033, "learning_rate": 9.491852554975479e-05, "loss": 0.8584, "step": 1710 }, { "epoch": 0.2744046680334332, "grad_norm": 0.6528182029724121, "learning_rate": 9.482360386014872e-05, "loss": 0.8346, "step": 1740 }, { "epoch": 0.2791357829995269, "grad_norm": 0.6200223565101624, "learning_rate": 9.472868217054263e-05, "loss": 0.8008, "step": 1770 }, { "epoch": 0.2838668979656206, "grad_norm": 0.7503982186317444, "learning_rate": 9.463376048093657e-05, "loss": 0.8197, "step": 1800 }, { "epoch": 0.2838668979656206, "eval_loss": 0.9286572933197021, "eval_runtime": 3.7535, "eval_samples_per_second": 26.908, "eval_steps_per_second": 3.463, "step": 1800 }, { "epoch": 0.2885980129317142, "grad_norm": 0.6671140193939209, "learning_rate": 9.453883879133048e-05, "loss": 0.8405, "step": 1830 }, { "epoch": 0.2933291278978079, "grad_norm": 0.7057023048400879, "learning_rate": 9.444391710172441e-05, "loss": 0.7822, "step": 1860 }, { "epoch": 0.2980602428639016, "grad_norm": 0.8120527267456055, "learning_rate": 9.434899541211833e-05, "loss": 0.8416, "step": 1890 }, { "epoch": 0.3027913578299953, "grad_norm": 0.622718334197998, "learning_rate": 9.425407372251228e-05, "loss": 0.8174, "step": 1920 }, { "epoch": 0.30752247279608896, "grad_norm": 0.6605896353721619, "learning_rate": 9.41591520329062e-05, "loss": 0.8003, "step": 1950 }, { "epoch": 0.3122535877621826, "grad_norm": 0.7473495006561279, "learning_rate": 9.406423034330012e-05, "loss": 0.798, "step": 1980 }, { "epoch": 0.31540766440624507, "eval_loss": 0.8976284861564636, "eval_runtime": 3.7537, "eval_samples_per_second": 26.907, "eval_steps_per_second": 3.463, "step": 2000 }, { "epoch": 0.3169847027282763, "grad_norm": 0.7177520394325256, "learning_rate": 9.396930865369404e-05, "loss": 0.8168, "step": 2010 }, { "epoch": 0.32171581769436997, "grad_norm": 0.7600869536399841, "learning_rate": 9.387438696408797e-05, "loss": 0.7918, "step": 2040 }, { "epoch": 0.32644693266046365, "grad_norm": 0.7001503109931946, "learning_rate": 9.377946527448189e-05, "loss": 0.7906, "step": 2070 }, { "epoch": 0.33117804762655734, "grad_norm": 0.6279382705688477, "learning_rate": 9.368454358487582e-05, "loss": 0.7624, "step": 2100 }, { "epoch": 0.335909162592651, "grad_norm": 0.7481889128684998, "learning_rate": 9.358962189526974e-05, "loss": 0.7849, "step": 2130 }, { "epoch": 0.34064027755874465, "grad_norm": 0.6797828078269958, "learning_rate": 9.349470020566367e-05, "loss": 0.7899, "step": 2160 }, { "epoch": 0.34537139252483834, "grad_norm": 0.6929941177368164, "learning_rate": 9.339977851605759e-05, "loss": 0.7703, "step": 2190 }, { "epoch": 0.3469484308468696, "eval_loss": 0.8858568072319031, "eval_runtime": 3.7538, "eval_samples_per_second": 26.906, "eval_steps_per_second": 3.463, "step": 2200 }, { "epoch": 0.350102507490932, "grad_norm": 0.698906660079956, "learning_rate": 9.330485682645152e-05, "loss": 0.7724, "step": 2220 }, { "epoch": 0.3548336224570257, "grad_norm": 0.779211163520813, "learning_rate": 9.320993513684544e-05, "loss": 0.7875, "step": 2250 }, { "epoch": 0.3595647374231194, "grad_norm": 0.7313475608825684, "learning_rate": 9.311817750355957e-05, "loss": 0.794, "step": 2280 }, { "epoch": 0.3642958523892131, "grad_norm": 0.6143506169319153, "learning_rate": 9.30232558139535e-05, "loss": 0.7742, "step": 2310 }, { "epoch": 0.3690269673553067, "grad_norm": 0.6775010824203491, "learning_rate": 9.292833412434741e-05, "loss": 0.7822, "step": 2340 }, { "epoch": 0.3737580823214004, "grad_norm": 0.7151722311973572, "learning_rate": 9.283341243474134e-05, "loss": 0.7617, "step": 2370 }, { "epoch": 0.3784891972874941, "grad_norm": 0.6855128407478333, "learning_rate": 9.273849074513526e-05, "loss": 0.7668, "step": 2400 }, { "epoch": 0.3784891972874941, "eval_loss": 0.8862702250480652, "eval_runtime": 3.7541, "eval_samples_per_second": 26.904, "eval_steps_per_second": 3.463, "step": 2400 }, { "epoch": 0.38322031225358777, "grad_norm": 0.743325412273407, "learning_rate": 9.26435690555292e-05, "loss": 0.7885, "step": 2430 }, { "epoch": 0.38795142721968146, "grad_norm": 0.6186659932136536, "learning_rate": 9.254864736592311e-05, "loss": 0.7619, "step": 2460 }, { "epoch": 0.39268254218577514, "grad_norm": 0.6791619062423706, "learning_rate": 9.245372567631704e-05, "loss": 0.8084, "step": 2490 }, { "epoch": 0.39741365715186877, "grad_norm": 0.6537867784500122, "learning_rate": 9.235880398671097e-05, "loss": 0.7641, "step": 2520 }, { "epoch": 0.40214477211796246, "grad_norm": 0.6688680052757263, "learning_rate": 9.22638822971049e-05, "loss": 0.7634, "step": 2550 }, { "epoch": 0.40687588708405614, "grad_norm": 0.6369423866271973, "learning_rate": 9.216896060749882e-05, "loss": 0.7407, "step": 2580 }, { "epoch": 0.4100299637281186, "eval_loss": 0.8817442059516907, "eval_runtime": 3.7541, "eval_samples_per_second": 26.904, "eval_steps_per_second": 3.463, "step": 2600 }, { "epoch": 0.41160700205014983, "grad_norm": 0.6841573119163513, "learning_rate": 9.207403891789275e-05, "loss": 0.7572, "step": 2610 }, { "epoch": 0.4163381170162435, "grad_norm": 0.625957727432251, "learning_rate": 9.197911722828667e-05, "loss": 0.7493, "step": 2640 }, { "epoch": 0.42106923198233714, "grad_norm": 0.7467941641807556, "learning_rate": 9.18841955386806e-05, "loss": 0.7468, "step": 2670 }, { "epoch": 0.42580034694843083, "grad_norm": 0.6891815662384033, "learning_rate": 9.178927384907452e-05, "loss": 0.7698, "step": 2700 }, { "epoch": 0.4305314619145245, "grad_norm": 0.6197889447212219, "learning_rate": 9.169435215946845e-05, "loss": 0.7588, "step": 2730 }, { "epoch": 0.4352625768806182, "grad_norm": 0.7140328884124756, "learning_rate": 9.159943046986237e-05, "loss": 0.7569, "step": 2760 }, { "epoch": 0.4399936918467119, "grad_norm": 0.7718496322631836, "learning_rate": 9.15045087802563e-05, "loss": 0.7448, "step": 2790 }, { "epoch": 0.4415707301687431, "eval_loss": 0.8855557441711426, "eval_runtime": 3.7544, "eval_samples_per_second": 26.902, "eval_steps_per_second": 3.463, "step": 2800 }, { "epoch": 0.4447248068128056, "grad_norm": 0.6447039246559143, "learning_rate": 9.140958709065022e-05, "loss": 0.7623, "step": 2820 }, { "epoch": 0.4494559217788992, "grad_norm": 0.6694769859313965, "learning_rate": 9.131466540104415e-05, "loss": 0.7081, "step": 2850 }, { "epoch": 0.4541870367449929, "grad_norm": 0.6863081455230713, "learning_rate": 9.121974371143806e-05, "loss": 0.7228, "step": 2880 }, { "epoch": 0.4589181517110866, "grad_norm": 0.7198454737663269, "learning_rate": 9.1124822021832e-05, "loss": 0.7356, "step": 2910 }, { "epoch": 0.46364926667718026, "grad_norm": 0.6542885303497314, "learning_rate": 9.102990033222591e-05, "loss": 0.7606, "step": 2940 }, { "epoch": 0.46838038164327395, "grad_norm": 0.657539963722229, "learning_rate": 9.093497864261984e-05, "loss": 0.7255, "step": 2970 }, { "epoch": 0.47311149660936763, "grad_norm": 0.819503664970398, "learning_rate": 9.084005695301376e-05, "loss": 0.7184, "step": 3000 }, { "epoch": 0.47311149660936763, "eval_loss": 0.8140414357185364, "eval_runtime": 3.7531, "eval_samples_per_second": 26.911, "eval_steps_per_second": 3.464, "step": 3000 }, { "epoch": 0.47784261157546126, "grad_norm": 0.7199704647064209, "learning_rate": 9.074513526340769e-05, "loss": 0.7227, "step": 3030 }, { "epoch": 0.48257372654155495, "grad_norm": 0.7655025720596313, "learning_rate": 9.065021357380162e-05, "loss": 0.7217, "step": 3060 }, { "epoch": 0.48730484150764863, "grad_norm": 0.7312873601913452, "learning_rate": 9.055845594051574e-05, "loss": 0.7059, "step": 3090 }, { "epoch": 0.4920359564737423, "grad_norm": 0.5961809158325195, "learning_rate": 9.046353425090967e-05, "loss": 0.7033, "step": 3120 }, { "epoch": 0.496767071439836, "grad_norm": 0.6955564022064209, "learning_rate": 9.03686125613036e-05, "loss": 0.7289, "step": 3150 }, { "epoch": 0.5014981864059297, "grad_norm": 0.6622660160064697, "learning_rate": 9.027369087169752e-05, "loss": 0.6935, "step": 3180 }, { "epoch": 0.5046522630499921, "eval_loss": 0.7775673270225525, "eval_runtime": 3.754, "eval_samples_per_second": 26.904, "eval_steps_per_second": 3.463, "step": 3200 }, { "epoch": 0.5062293013720234, "grad_norm": 0.7262014746665955, "learning_rate": 9.017876918209145e-05, "loss": 0.6906, "step": 3210 }, { "epoch": 0.5109604163381171, "grad_norm": 0.7221697568893433, "learning_rate": 9.008384749248537e-05, "loss": 0.7079, "step": 3240 }, { "epoch": 0.5156915313042106, "grad_norm": 0.7115603089332581, "learning_rate": 8.99889258028793e-05, "loss": 0.7191, "step": 3270 }, { "epoch": 0.5204226462703043, "grad_norm": 0.7292232513427734, "learning_rate": 8.989400411327322e-05, "loss": 0.6702, "step": 3300 }, { "epoch": 0.525153761236398, "grad_norm": 0.741580605506897, "learning_rate": 8.979908242366715e-05, "loss": 0.6762, "step": 3330 }, { "epoch": 0.5298848762024917, "grad_norm": 0.7870708107948303, "learning_rate": 8.970416073406108e-05, "loss": 0.6838, "step": 3360 }, { "epoch": 0.5346159911685854, "grad_norm": 0.71812903881073, "learning_rate": 8.9609239044455e-05, "loss": 0.7174, "step": 3390 }, { "epoch": 0.5361930294906166, "eval_loss": 0.7375061511993408, "eval_runtime": 3.7548, "eval_samples_per_second": 26.899, "eval_steps_per_second": 3.462, "step": 3400 }, { "epoch": 0.5393471061346791, "grad_norm": 0.7266995906829834, "learning_rate": 8.951431735484893e-05, "loss": 0.6763, "step": 3420 }, { "epoch": 0.5440782211007728, "grad_norm": 0.7786857485771179, "learning_rate": 8.941939566524284e-05, "loss": 0.7149, "step": 3450 }, { "epoch": 0.5488093360668664, "grad_norm": 0.7807109355926514, "learning_rate": 8.932447397563677e-05, "loss": 0.6534, "step": 3480 }, { "epoch": 0.5535404510329601, "grad_norm": 0.6960239410400391, "learning_rate": 8.922955228603069e-05, "loss": 0.7313, "step": 3510 }, { "epoch": 0.5582715659990538, "grad_norm": 0.586615264415741, "learning_rate": 8.913463059642462e-05, "loss": 0.6579, "step": 3540 }, { "epoch": 0.5630026809651475, "grad_norm": 0.9740248918533325, "learning_rate": 8.903970890681854e-05, "loss": 0.7013, "step": 3570 }, { "epoch": 0.5677337959312412, "grad_norm": 0.6628558039665222, "learning_rate": 8.894478721721247e-05, "loss": 0.6546, "step": 3600 }, { "epoch": 0.5677337959312412, "eval_loss": 0.7031014561653137, "eval_runtime": 3.7542, "eval_samples_per_second": 26.903, "eval_steps_per_second": 3.463, "step": 3600 }, { "epoch": 0.5724649108973348, "grad_norm": 0.6030669808387756, "learning_rate": 8.884986552760639e-05, "loss": 0.7146, "step": 3630 }, { "epoch": 0.5771960258634284, "grad_norm": 0.6010313034057617, "learning_rate": 8.875494383800032e-05, "loss": 0.6816, "step": 3660 }, { "epoch": 0.5819271408295221, "grad_norm": 0.6319311857223511, "learning_rate": 8.866002214839425e-05, "loss": 0.6642, "step": 3690 }, { "epoch": 0.5866582557956158, "grad_norm": 0.6059941053390503, "learning_rate": 8.856510045878817e-05, "loss": 0.6998, "step": 3720 }, { "epoch": 0.5913893707617095, "grad_norm": 0.5976997017860413, "learning_rate": 8.84701787691821e-05, "loss": 0.6694, "step": 3750 }, { "epoch": 0.5961204857278032, "grad_norm": 0.6985177993774414, "learning_rate": 8.837525707957602e-05, "loss": 0.6402, "step": 3780 }, { "epoch": 0.5992745623718656, "eval_loss": 0.6977850198745728, "eval_runtime": 3.7545, "eval_samples_per_second": 26.901, "eval_steps_per_second": 3.462, "step": 3800 }, { "epoch": 0.6008516006938969, "grad_norm": 0.7076742053031921, "learning_rate": 8.828033538996995e-05, "loss": 0.6749, "step": 3810 }, { "epoch": 0.6055827156599906, "grad_norm": 0.9254401326179504, "learning_rate": 8.818541370036387e-05, "loss": 0.6481, "step": 3840 }, { "epoch": 0.6103138306260842, "grad_norm": 0.7403334379196167, "learning_rate": 8.80904920107578e-05, "loss": 0.6704, "step": 3870 }, { "epoch": 0.6150449455921779, "grad_norm": 0.6302973628044128, "learning_rate": 8.799557032115171e-05, "loss": 0.6717, "step": 3900 }, { "epoch": 0.6197760605582716, "grad_norm": 0.7587308287620544, "learning_rate": 8.790064863154565e-05, "loss": 0.6526, "step": 3930 }, { "epoch": 0.6245071755243652, "grad_norm": 0.768151581287384, "learning_rate": 8.780572694193956e-05, "loss": 0.6614, "step": 3960 }, { "epoch": 0.6292382904904589, "grad_norm": 0.662624716758728, "learning_rate": 8.77108052523335e-05, "loss": 0.6471, "step": 3990 }, { "epoch": 0.6308153288124901, "eval_loss": 0.6685364246368408, "eval_runtime": 3.7533, "eval_samples_per_second": 26.909, "eval_steps_per_second": 3.464, "step": 4000 }, { "epoch": 0.6339694054565526, "grad_norm": 0.614434540271759, "learning_rate": 8.761588356272743e-05, "loss": 0.6305, "step": 4020 }, { "epoch": 0.6387005204226462, "grad_norm": 0.7292618751525879, "learning_rate": 8.752096187312134e-05, "loss": 0.632, "step": 4050 }, { "epoch": 0.6434316353887399, "grad_norm": 0.5890663862228394, "learning_rate": 8.742604018351527e-05, "loss": 0.6594, "step": 4080 }, { "epoch": 0.6481627503548336, "grad_norm": 0.6511669158935547, "learning_rate": 8.733111849390919e-05, "loss": 0.6417, "step": 4110 }, { "epoch": 0.6528938653209273, "grad_norm": 0.6794877648353577, "learning_rate": 8.723619680430312e-05, "loss": 0.6472, "step": 4140 }, { "epoch": 0.657624980287021, "grad_norm": 0.5826547145843506, "learning_rate": 8.714127511469704e-05, "loss": 0.6255, "step": 4170 }, { "epoch": 0.6623560952531147, "grad_norm": 0.8411812782287598, "learning_rate": 8.704635342509097e-05, "loss": 0.6368, "step": 4200 }, { "epoch": 0.6623560952531147, "eval_loss": 0.6538847088813782, "eval_runtime": 3.7543, "eval_samples_per_second": 26.903, "eval_steps_per_second": 3.463, "step": 4200 }, { "epoch": 0.6670872102192084, "grad_norm": 0.5682166218757629, "learning_rate": 8.69514317354849e-05, "loss": 0.6269, "step": 4230 }, { "epoch": 0.671818325185302, "grad_norm": 0.6340855360031128, "learning_rate": 8.685651004587882e-05, "loss": 0.6423, "step": 4260 }, { "epoch": 0.6765494401513957, "grad_norm": 0.6693681478500366, "learning_rate": 8.676158835627275e-05, "loss": 0.6471, "step": 4290 }, { "epoch": 0.6812805551174893, "grad_norm": 0.6101056337356567, "learning_rate": 8.666666666666667e-05, "loss": 0.6168, "step": 4320 }, { "epoch": 0.686011670083583, "grad_norm": 0.6096228361129761, "learning_rate": 8.65717449770606e-05, "loss": 0.6494, "step": 4350 }, { "epoch": 0.6907427850496767, "grad_norm": 0.6632306575775146, "learning_rate": 8.647682328745452e-05, "loss": 0.664, "step": 4380 }, { "epoch": 0.6938968616937392, "eval_loss": 0.6377571225166321, "eval_runtime": 3.756, "eval_samples_per_second": 26.89, "eval_steps_per_second": 3.461, "step": 4400 }, { "epoch": 0.6954739000157704, "grad_norm": 0.6547721028327942, "learning_rate": 8.638190159784845e-05, "loss": 0.6091, "step": 4410 }, { "epoch": 0.700205014981864, "grad_norm": 0.6063847541809082, "learning_rate": 8.628697990824237e-05, "loss": 0.6055, "step": 4440 }, { "epoch": 0.7049361299479577, "grad_norm": 0.6687933802604675, "learning_rate": 8.61920582186363e-05, "loss": 0.601, "step": 4470 }, { "epoch": 0.7096672449140514, "grad_norm": 0.701770007610321, "learning_rate": 8.609713652903021e-05, "loss": 0.6064, "step": 4500 }, { "epoch": 0.7143983598801451, "grad_norm": 0.6652805209159851, "learning_rate": 8.600221483942414e-05, "loss": 0.653, "step": 4530 }, { "epoch": 0.7191294748462388, "grad_norm": 0.6469018459320068, "learning_rate": 8.590729314981806e-05, "loss": 0.6019, "step": 4560 }, { "epoch": 0.7238605898123325, "grad_norm": 0.6343564391136169, "learning_rate": 8.5812371460212e-05, "loss": 0.6083, "step": 4590 }, { "epoch": 0.7254376281343636, "eval_loss": 0.6411118507385254, "eval_runtime": 3.754, "eval_samples_per_second": 26.905, "eval_steps_per_second": 3.463, "step": 4600 }, { "epoch": 0.7285917047784262, "grad_norm": 0.5817134976387024, "learning_rate": 8.571744977060592e-05, "loss": 0.602, "step": 4620 }, { "epoch": 0.7333228197445197, "grad_norm": 0.5552039742469788, "learning_rate": 8.562252808099984e-05, "loss": 0.6223, "step": 4650 }, { "epoch": 0.7380539347106134, "grad_norm": 0.6455065011978149, "learning_rate": 8.552760639139377e-05, "loss": 0.5865, "step": 4680 }, { "epoch": 0.7427850496767071, "grad_norm": 0.6448588371276855, "learning_rate": 8.543268470178769e-05, "loss": 0.6126, "step": 4710 }, { "epoch": 0.7475161646428008, "grad_norm": 0.6447100639343262, "learning_rate": 8.533776301218162e-05, "loss": 0.6167, "step": 4740 }, { "epoch": 0.7522472796088945, "grad_norm": 0.6894412636756897, "learning_rate": 8.524284132257555e-05, "loss": 0.5851, "step": 4770 }, { "epoch": 0.7569783945749882, "grad_norm": 0.6036236882209778, "learning_rate": 8.514791963296947e-05, "loss": 0.6025, "step": 4800 }, { "epoch": 0.7569783945749882, "eval_loss": 0.6117845177650452, "eval_runtime": 3.7554, "eval_samples_per_second": 26.894, "eval_steps_per_second": 3.462, "step": 4800 }, { "epoch": 0.7617095095410819, "grad_norm": 0.6214340925216675, "learning_rate": 8.50529979433634e-05, "loss": 0.6145, "step": 4830 }, { "epoch": 0.7664406245071755, "grad_norm": 0.6933445334434509, "learning_rate": 8.495807625375732e-05, "loss": 0.6184, "step": 4860 }, { "epoch": 0.7711717394732692, "grad_norm": 0.5649739503860474, "learning_rate": 8.486315456415125e-05, "loss": 0.5996, "step": 4890 }, { "epoch": 0.7759028544393629, "grad_norm": 0.6250168085098267, "learning_rate": 8.476823287454517e-05, "loss": 0.5762, "step": 4920 }, { "epoch": 0.7806339694054566, "grad_norm": 1.7125053405761719, "learning_rate": 8.46733111849391e-05, "loss": 0.5716, "step": 4950 }, { "epoch": 0.7853650843715503, "grad_norm": 0.5721966028213501, "learning_rate": 8.457838949533302e-05, "loss": 0.5612, "step": 4980 }, { "epoch": 0.7885191610156127, "eval_loss": 0.5980841517448425, "eval_runtime": 3.7547, "eval_samples_per_second": 26.9, "eval_steps_per_second": 3.462, "step": 5000 }, { "epoch": 0.7900961993376439, "grad_norm": 0.6716078519821167, "learning_rate": 8.448346780572695e-05, "loss": 0.5765, "step": 5010 }, { "epoch": 0.7948273143037375, "grad_norm": 0.6005885601043701, "learning_rate": 8.438854611612086e-05, "loss": 0.5941, "step": 5040 }, { "epoch": 0.7995584292698312, "grad_norm": 0.6507188081741333, "learning_rate": 8.42936244265148e-05, "loss": 0.5827, "step": 5070 }, { "epoch": 0.8042895442359249, "grad_norm": 0.7276827096939087, "learning_rate": 8.419870273690871e-05, "loss": 0.5555, "step": 5100 }, { "epoch": 0.8090206592020186, "grad_norm": 0.6792399287223816, "learning_rate": 8.410378104730264e-05, "loss": 0.5724, "step": 5130 }, { "epoch": 0.8137517741681123, "grad_norm": 0.7074045538902283, "learning_rate": 8.400885935769656e-05, "loss": 0.5724, "step": 5160 }, { "epoch": 0.818482889134206, "grad_norm": 0.6056311130523682, "learning_rate": 8.391393766809049e-05, "loss": 0.5546, "step": 5190 }, { "epoch": 0.8200599274562372, "eval_loss": 0.5805890560150146, "eval_runtime": 3.7561, "eval_samples_per_second": 26.889, "eval_steps_per_second": 3.461, "step": 5200 }, { "epoch": 0.8232140041002997, "grad_norm": 0.6667674779891968, "learning_rate": 8.381901597848441e-05, "loss": 0.6173, "step": 5220 }, { "epoch": 0.8279451190663933, "grad_norm": 0.607284426689148, "learning_rate": 8.372409428887834e-05, "loss": 0.5781, "step": 5250 }, { "epoch": 0.832676234032487, "grad_norm": 0.6476745009422302, "learning_rate": 8.362917259927227e-05, "loss": 0.5667, "step": 5280 }, { "epoch": 0.8374073489985807, "grad_norm": 0.6668260097503662, "learning_rate": 8.35342509096662e-05, "loss": 0.5456, "step": 5310 }, { "epoch": 0.8421384639646743, "grad_norm": 0.585110068321228, "learning_rate": 8.343932922006012e-05, "loss": 0.5648, "step": 5340 }, { "epoch": 0.846869578930768, "grad_norm": 0.6268571019172668, "learning_rate": 8.334757158677425e-05, "loss": 0.555, "step": 5370 }, { "epoch": 0.8516006938968617, "grad_norm": 0.6197232604026794, "learning_rate": 8.325264989716818e-05, "loss": 0.5333, "step": 5400 }, { "epoch": 0.8516006938968617, "eval_loss": 0.5601951479911804, "eval_runtime": 3.7534, "eval_samples_per_second": 26.909, "eval_steps_per_second": 3.464, "step": 5400 }, { "epoch": 0.8563318088629553, "grad_norm": 0.63880455493927, "learning_rate": 8.31577282075621e-05, "loss": 0.5602, "step": 5430 }, { "epoch": 0.861062923829049, "grad_norm": 0.6235695481300354, "learning_rate": 8.306280651795603e-05, "loss": 0.5604, "step": 5460 }, { "epoch": 0.8657940387951427, "grad_norm": 0.9000911712646484, "learning_rate": 8.296788482834995e-05, "loss": 0.5654, "step": 5490 }, { "epoch": 0.8705251537612364, "grad_norm": 0.6557802557945251, "learning_rate": 8.287612719506408e-05, "loss": 0.5962, "step": 5520 }, { "epoch": 0.8752562687273301, "grad_norm": 0.6231096982955933, "learning_rate": 8.278120550545801e-05, "loss": 0.5636, "step": 5550 }, { "epoch": 0.8799873836934238, "grad_norm": 0.5984258651733398, "learning_rate": 8.268628381585192e-05, "loss": 0.5616, "step": 5580 }, { "epoch": 0.8831414603374862, "eval_loss": 0.5611711740493774, "eval_runtime": 3.7542, "eval_samples_per_second": 26.903, "eval_steps_per_second": 3.463, "step": 5600 }, { "epoch": 0.8847184986595175, "grad_norm": 0.5818042159080505, "learning_rate": 8.259452618256605e-05, "loss": 0.5316, "step": 5610 }, { "epoch": 0.8894496136256111, "grad_norm": 0.7120912671089172, "learning_rate": 8.249960449295998e-05, "loss": 0.5556, "step": 5640 }, { "epoch": 0.8941807285917048, "grad_norm": 0.6223446130752563, "learning_rate": 8.24046828033539e-05, "loss": 0.5452, "step": 5670 }, { "epoch": 0.8989118435577984, "grad_norm": 0.6196858286857605, "learning_rate": 8.230976111374783e-05, "loss": 0.5601, "step": 5700 }, { "epoch": 0.9036429585238921, "grad_norm": 0.6353973150253296, "learning_rate": 8.221483942414175e-05, "loss": 0.5402, "step": 5730 }, { "epoch": 0.9083740734899858, "grad_norm": 0.6631510257720947, "learning_rate": 8.211991773453568e-05, "loss": 0.5382, "step": 5760 }, { "epoch": 0.9131051884560795, "grad_norm": 0.6404465436935425, "learning_rate": 8.20249960449296e-05, "loss": 0.5298, "step": 5790 }, { "epoch": 0.9146822267781107, "eval_loss": 0.560188353061676, "eval_runtime": 3.7541, "eval_samples_per_second": 26.904, "eval_steps_per_second": 3.463, "step": 5800 }, { "epoch": 0.9178363034221731, "grad_norm": 0.6810153126716614, "learning_rate": 8.193007435532353e-05, "loss": 0.5159, "step": 5820 }, { "epoch": 0.9225674183882668, "grad_norm": 0.5828801989555359, "learning_rate": 8.183515266571745e-05, "loss": 0.5155, "step": 5850 }, { "epoch": 0.9272985333543605, "grad_norm": 0.538987934589386, "learning_rate": 8.174023097611138e-05, "loss": 0.5273, "step": 5880 }, { "epoch": 0.9320296483204542, "grad_norm": 0.6222363114356995, "learning_rate": 8.16453092865053e-05, "loss": 0.526, "step": 5910 }, { "epoch": 0.9367607632865479, "grad_norm": 0.542966902256012, "learning_rate": 8.155038759689923e-05, "loss": 0.5653, "step": 5940 }, { "epoch": 0.9414918782526416, "grad_norm": 0.7064533829689026, "learning_rate": 8.145546590729315e-05, "loss": 0.5207, "step": 5970 }, { "epoch": 0.9462229932187353, "grad_norm": 0.6652514934539795, "learning_rate": 8.136054421768708e-05, "loss": 0.5342, "step": 6000 }, { "epoch": 0.9462229932187353, "eval_loss": 0.5476773977279663, "eval_runtime": 3.7543, "eval_samples_per_second": 26.902, "eval_steps_per_second": 3.463, "step": 6000 }, { "epoch": 0.9509541081848288, "grad_norm": 0.6436010003089905, "learning_rate": 8.126562252808101e-05, "loss": 0.536, "step": 6030 }, { "epoch": 0.9556852231509225, "grad_norm": 0.5532657504081726, "learning_rate": 8.117070083847494e-05, "loss": 0.5261, "step": 6060 }, { "epoch": 0.9604163381170162, "grad_norm": 0.6539950370788574, "learning_rate": 8.107577914886886e-05, "loss": 0.5226, "step": 6090 }, { "epoch": 0.9651474530831099, "grad_norm": 0.5767289996147156, "learning_rate": 8.098085745926279e-05, "loss": 0.534, "step": 6120 }, { "epoch": 0.9698785680492036, "grad_norm": 0.6355389356613159, "learning_rate": 8.08859357696567e-05, "loss": 0.5282, "step": 6150 }, { "epoch": 0.9746096830152973, "grad_norm": 0.6711322665214539, "learning_rate": 8.079101408005064e-05, "loss": 0.5384, "step": 6180 }, { "epoch": 0.9777637596593597, "eval_loss": 0.5372142195701599, "eval_runtime": 3.7547, "eval_samples_per_second": 26.899, "eval_steps_per_second": 3.462, "step": 6200 }, { "epoch": 0.979340797981391, "grad_norm": 0.5990795493125916, "learning_rate": 8.069609239044455e-05, "loss": 0.4624, "step": 6210 }, { "epoch": 0.9840719129474846, "grad_norm": 0.6971167325973511, "learning_rate": 8.060117070083848e-05, "loss": 0.5015, "step": 6240 }, { "epoch": 0.9888030279135783, "grad_norm": 0.6699081659317017, "learning_rate": 8.05062490112324e-05, "loss": 0.5325, "step": 6270 }, { "epoch": 0.993534142879672, "grad_norm": 0.6347541213035583, "learning_rate": 8.041132732162633e-05, "loss": 0.5255, "step": 6300 }, { "epoch": 0.9982652578457657, "grad_norm": 0.7587487101554871, "learning_rate": 8.031640563202025e-05, "loss": 0.5154, "step": 6330 } ], "logging_steps": 30, "max_steps": 31705, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.9843838888449147e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }