{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.07284240787863483, "eval_steps": 75, "global_step": 1875, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0003884928420193858, "grad_norm": 0.49890583753585815, "learning_rate": 7.759200756309999e-05, "loss": 1.8971, "step": 10 }, { "epoch": 0.0007769856840387716, "grad_norm": 1.6535608768463135, "learning_rate": 0.0001, "loss": 1.6661, "step": 20 }, { "epoch": 0.0011654785260581573, "grad_norm": 1.0714327096939087, "learning_rate": 0.0001, "loss": 1.4684, "step": 30 }, { "epoch": 0.0015539713680775432, "grad_norm": 0.6442272067070007, "learning_rate": 0.0001, "loss": 1.5553, "step": 40 }, { "epoch": 0.0019424642100969289, "grad_norm": 0.818639874458313, "learning_rate": 0.0001, "loss": 1.5007, "step": 50 }, { "epoch": 0.0023309570521163146, "grad_norm": 1.3463096618652344, "learning_rate": 0.0001, "loss": 1.2678, "step": 60 }, { "epoch": 0.0027194498941357005, "grad_norm": 0.8409688472747803, "learning_rate": 0.0001, "loss": 1.4836, "step": 70 }, { "epoch": 0.0029136963151453936, "eval_loss": 1.4955415725708008, "eval_runtime": 328.1939, "eval_samples_per_second": 1.587, "eval_steps_per_second": 1.587, "step": 75 }, { "epoch": 0.0031079427361550864, "grad_norm": 0.49311453104019165, "learning_rate": 0.0001, "loss": 1.5626, "step": 80 }, { "epoch": 0.0034964355781744723, "grad_norm": 0.5505372881889343, "learning_rate": 0.0001, "loss": 1.4427, "step": 90 }, { "epoch": 0.0038849284201938577, "grad_norm": 1.1296964883804321, "learning_rate": 0.0001, "loss": 1.4702, "step": 100 }, { "epoch": 0.004273421262213244, "grad_norm": 1.38261878490448, "learning_rate": 0.0001, "loss": 1.5806, "step": 110 }, { "epoch": 0.004661914104232629, "grad_norm": 0.5213516354560852, "learning_rate": 0.0001, "loss": 1.4675, "step": 120 }, { "epoch": 0.0050504069462520155, "grad_norm": 2.93784761428833, "learning_rate": 0.0001, "loss": 1.6413, "step": 130 }, { "epoch": 0.005438899788271401, "grad_norm": 0.6772735118865967, "learning_rate": 0.0001, "loss": 1.2942, "step": 140 }, { "epoch": 0.005827392630290787, "grad_norm": 1.1066700220108032, "learning_rate": 0.0001, "loss": 1.4082, "step": 150 }, { "epoch": 0.005827392630290787, "eval_loss": 1.4717903137207031, "eval_runtime": 419.3037, "eval_samples_per_second": 1.243, "eval_steps_per_second": 1.243, "step": 150 }, { "epoch": 0.006215885472310173, "grad_norm": 0.5077598690986633, "learning_rate": 0.0001, "loss": 1.6572, "step": 160 }, { "epoch": 0.006604378314329558, "grad_norm": 0.5666481256484985, "learning_rate": 0.0001, "loss": 1.7344, "step": 170 }, { "epoch": 0.0069928711563489445, "grad_norm": 0.7042965888977051, "learning_rate": 0.0001, "loss": 1.3634, "step": 180 }, { "epoch": 0.00738136399836833, "grad_norm": 0.6379776000976562, "learning_rate": 0.0001, "loss": 1.6191, "step": 190 }, { "epoch": 0.0077698568403877155, "grad_norm": 0.7309342622756958, "learning_rate": 0.0001, "loss": 1.2186, "step": 200 }, { "epoch": 0.008158349682407102, "grad_norm": 1.4138643741607666, "learning_rate": 0.0001, "loss": 1.5201, "step": 210 }, { "epoch": 0.008546842524426487, "grad_norm": 1.3856728076934814, "learning_rate": 0.0001, "loss": 1.4056, "step": 220 }, { "epoch": 0.00874108894543618, "eval_loss": 1.473658800125122, "eval_runtime": 413.7612, "eval_samples_per_second": 1.259, "eval_steps_per_second": 1.259, "step": 225 }, { "epoch": 0.008935335366445873, "grad_norm": 1.1668083667755127, "learning_rate": 0.0001, "loss": 1.5261, "step": 230 }, { "epoch": 0.009323828208465258, "grad_norm": 0.719367265701294, "learning_rate": 0.0001, "loss": 1.4151, "step": 240 }, { "epoch": 0.009712321050484645, "grad_norm": 0.7443403005599976, "learning_rate": 0.0001, "loss": 1.3723, "step": 250 }, { "epoch": 0.010100813892504031, "grad_norm": 0.8915978670120239, "learning_rate": 0.0001, "loss": 1.5165, "step": 260 }, { "epoch": 0.010489306734523416, "grad_norm": 0.7369945049285889, "learning_rate": 0.0001, "loss": 1.4225, "step": 270 }, { "epoch": 0.010877799576542802, "grad_norm": 1.2632057666778564, "learning_rate": 0.0001, "loss": 1.3462, "step": 280 }, { "epoch": 0.011266292418562187, "grad_norm": 1.6178512573242188, "learning_rate": 0.0001, "loss": 1.4283, "step": 290 }, { "epoch": 0.011654785260581575, "grad_norm": 2.717789649963379, "learning_rate": 0.0001, "loss": 1.4773, "step": 300 }, { "epoch": 0.011654785260581575, "eval_loss": 1.4709599018096924, "eval_runtime": 410.5517, "eval_samples_per_second": 1.269, "eval_steps_per_second": 1.269, "step": 300 }, { "epoch": 0.01204327810260096, "grad_norm": 3.8710834980010986, "learning_rate": 0.0001, "loss": 1.4183, "step": 310 }, { "epoch": 0.012431770944620345, "grad_norm": 1.1690031290054321, "learning_rate": 0.0001, "loss": 1.6159, "step": 320 }, { "epoch": 0.012820263786639731, "grad_norm": 1.422135829925537, "learning_rate": 0.0001, "loss": 1.5327, "step": 330 }, { "epoch": 0.013208756628659116, "grad_norm": 1.353925347328186, "learning_rate": 0.0001, "loss": 1.4782, "step": 340 }, { "epoch": 0.013597249470678504, "grad_norm": 0.8083727359771729, "learning_rate": 0.0001, "loss": 1.3199, "step": 350 }, { "epoch": 0.013985742312697889, "grad_norm": 0.6409865021705627, "learning_rate": 0.0001, "loss": 1.3829, "step": 360 }, { "epoch": 0.014374235154717275, "grad_norm": 1.9057331085205078, "learning_rate": 0.0001, "loss": 1.3386, "step": 370 }, { "epoch": 0.014568481575726967, "eval_loss": 1.4621069431304932, "eval_runtime": 410.8963, "eval_samples_per_second": 1.268, "eval_steps_per_second": 1.268, "step": 375 }, { "epoch": 0.01476272799673666, "grad_norm": 1.4260625839233398, "learning_rate": 0.0001, "loss": 1.4447, "step": 380 }, { "epoch": 0.015151220838756045, "grad_norm": 2.0252511501312256, "learning_rate": 0.0001, "loss": 1.4396, "step": 390 }, { "epoch": 0.015539713680775431, "grad_norm": 1.5493030548095703, "learning_rate": 0.0001, "loss": 1.5377, "step": 400 }, { "epoch": 0.015928206522794818, "grad_norm": 1.5620871782302856, "learning_rate": 0.0001, "loss": 1.5368, "step": 410 }, { "epoch": 0.016316699364814204, "grad_norm": 1.8342182636260986, "learning_rate": 0.0001, "loss": 1.4932, "step": 420 }, { "epoch": 0.01670519220683359, "grad_norm": 0.8918685913085938, "learning_rate": 0.0001, "loss": 1.4847, "step": 430 }, { "epoch": 0.017093685048852975, "grad_norm": 1.4548940658569336, "learning_rate": 0.0001, "loss": 1.5184, "step": 440 }, { "epoch": 0.01748217789087236, "grad_norm": 1.4839730262756348, "learning_rate": 0.0001, "loss": 1.4276, "step": 450 }, { "epoch": 0.01748217789087236, "eval_loss": 1.4603033065795898, "eval_runtime": 408.9003, "eval_samples_per_second": 1.274, "eval_steps_per_second": 1.274, "step": 450 }, { "epoch": 0.017870670732891746, "grad_norm": 0.6719891428947449, "learning_rate": 0.0001, "loss": 1.3042, "step": 460 }, { "epoch": 0.01825916357491113, "grad_norm": 0.8530905246734619, "learning_rate": 0.0001, "loss": 1.454, "step": 470 }, { "epoch": 0.018647656416930516, "grad_norm": 0.8087925910949707, "learning_rate": 0.0001, "loss": 1.4225, "step": 480 }, { "epoch": 0.019036149258949905, "grad_norm": 2.091627359390259, "learning_rate": 0.0001, "loss": 1.4617, "step": 490 }, { "epoch": 0.01942464210096929, "grad_norm": 2.1747212409973145, "learning_rate": 0.0001, "loss": 1.5124, "step": 500 }, { "epoch": 0.019813134942988676, "grad_norm": 1.7147002220153809, "learning_rate": 0.0001, "loss": 1.4442, "step": 510 }, { "epoch": 0.020201627785008062, "grad_norm": 0.7326516509056091, "learning_rate": 0.0001, "loss": 1.4376, "step": 520 }, { "epoch": 0.020395874206017753, "eval_loss": 1.4605984687805176, "eval_runtime": 418.9148, "eval_samples_per_second": 1.244, "eval_steps_per_second": 1.244, "step": 525 }, { "epoch": 0.020590120627027447, "grad_norm": 1.7703779935836792, "learning_rate": 0.0001, "loss": 1.4911, "step": 530 }, { "epoch": 0.020978613469046833, "grad_norm": 0.8552814722061157, "learning_rate": 0.0001, "loss": 1.4079, "step": 540 }, { "epoch": 0.021367106311066218, "grad_norm": 1.0003011226654053, "learning_rate": 0.0001, "loss": 1.5769, "step": 550 }, { "epoch": 0.021755599153085604, "grad_norm": 1.0176352262496948, "learning_rate": 0.0001, "loss": 1.2431, "step": 560 }, { "epoch": 0.02214409199510499, "grad_norm": 1.8341186046600342, "learning_rate": 0.0001, "loss": 1.5053, "step": 570 }, { "epoch": 0.022532584837124375, "grad_norm": 0.7317614555358887, "learning_rate": 0.0001, "loss": 1.5135, "step": 580 }, { "epoch": 0.02292107767914376, "grad_norm": 1.3072996139526367, "learning_rate": 0.0001, "loss": 1.2935, "step": 590 }, { "epoch": 0.02330957052116315, "grad_norm": 0.5384438633918762, "learning_rate": 0.0001, "loss": 1.4028, "step": 600 }, { "epoch": 0.02330957052116315, "eval_loss": 1.457323670387268, "eval_runtime": 419.1828, "eval_samples_per_second": 1.243, "eval_steps_per_second": 1.243, "step": 600 }, { "epoch": 0.023698063363182535, "grad_norm": 0.6213059425354004, "learning_rate": 0.0001, "loss": 1.3461, "step": 610 }, { "epoch": 0.02408655620520192, "grad_norm": 0.9022939801216125, "learning_rate": 0.0001, "loss": 1.458, "step": 620 }, { "epoch": 0.024475049047221305, "grad_norm": 1.511841893196106, "learning_rate": 0.0001, "loss": 1.3387, "step": 630 }, { "epoch": 0.02486354188924069, "grad_norm": 1.193332552909851, "learning_rate": 0.0001, "loss": 1.25, "step": 640 }, { "epoch": 0.025252034731260076, "grad_norm": 0.664730429649353, "learning_rate": 0.0001, "loss": 1.4608, "step": 650 }, { "epoch": 0.025640527573279462, "grad_norm": 0.9817675352096558, "learning_rate": 0.0001, "loss": 1.4694, "step": 660 }, { "epoch": 0.026029020415298847, "grad_norm": 0.8713122606277466, "learning_rate": 0.0001, "loss": 1.6154, "step": 670 }, { "epoch": 0.026223266836308542, "eval_loss": 1.4552098512649536, "eval_runtime": 418.233, "eval_samples_per_second": 1.246, "eval_steps_per_second": 1.246, "step": 675 }, { "epoch": 0.026417513257318233, "grad_norm": 0.8656709790229797, "learning_rate": 0.0001, "loss": 1.6028, "step": 680 }, { "epoch": 0.02680600609933762, "grad_norm": 0.7827064990997314, "learning_rate": 0.0001, "loss": 1.454, "step": 690 }, { "epoch": 0.027194498941357007, "grad_norm": 0.8780921101570129, "learning_rate": 0.0001, "loss": 1.377, "step": 700 }, { "epoch": 0.027582991783376393, "grad_norm": 0.664682149887085, "learning_rate": 0.0001, "loss": 1.3761, "step": 710 }, { "epoch": 0.027971484625395778, "grad_norm": 1.6883013248443604, "learning_rate": 0.0001, "loss": 1.293, "step": 720 }, { "epoch": 0.028359977467415164, "grad_norm": 0.6659910082817078, "learning_rate": 0.0001, "loss": 1.3595, "step": 730 }, { "epoch": 0.02874847030943455, "grad_norm": 1.0495606660842896, "learning_rate": 0.0001, "loss": 1.3881, "step": 740 }, { "epoch": 0.029136963151453935, "grad_norm": 2.0675432682037354, "learning_rate": 0.0001, "loss": 1.3353, "step": 750 }, { "epoch": 0.029136963151453935, "eval_loss": 1.4496526718139648, "eval_runtime": 412.105, "eval_samples_per_second": 1.264, "eval_steps_per_second": 1.264, "step": 750 }, { "epoch": 0.02952545599347332, "grad_norm": 2.147975444793701, "learning_rate": 0.0001, "loss": 1.4715, "step": 760 }, { "epoch": 0.029913948835492706, "grad_norm": 1.4400185346603394, "learning_rate": 0.0001, "loss": 1.704, "step": 770 }, { "epoch": 0.03030244167751209, "grad_norm": 0.5840633511543274, "learning_rate": 0.0001, "loss": 1.2531, "step": 780 }, { "epoch": 0.030690934519531476, "grad_norm": 1.9958975315093994, "learning_rate": 0.0001, "loss": 1.6409, "step": 790 }, { "epoch": 0.031079427361550862, "grad_norm": 0.4322706460952759, "learning_rate": 0.0001, "loss": 1.3866, "step": 800 }, { "epoch": 0.03146792020357025, "grad_norm": 0.9608808755874634, "learning_rate": 0.0001, "loss": 1.2862, "step": 810 }, { "epoch": 0.031856413045589636, "grad_norm": 1.0402257442474365, "learning_rate": 0.0001, "loss": 1.4305, "step": 820 }, { "epoch": 0.03205065946659933, "eval_loss": 1.4473251104354858, "eval_runtime": 410.7169, "eval_samples_per_second": 1.269, "eval_steps_per_second": 1.269, "step": 825 }, { "epoch": 0.03224490588760902, "grad_norm": 0.6171532273292542, "learning_rate": 0.0001, "loss": 1.2107, "step": 830 }, { "epoch": 0.03263339872962841, "grad_norm": 1.6381995677947998, "learning_rate": 0.0001, "loss": 1.5369, "step": 840 }, { "epoch": 0.03302189157164779, "grad_norm": 0.5398985743522644, "learning_rate": 0.0001, "loss": 1.4939, "step": 850 }, { "epoch": 0.03341038441366718, "grad_norm": 1.1927576065063477, "learning_rate": 0.0001, "loss": 1.459, "step": 860 }, { "epoch": 0.03379887725568657, "grad_norm": 0.5355756878852844, "learning_rate": 0.0001, "loss": 1.4591, "step": 870 }, { "epoch": 0.03418737009770595, "grad_norm": 1.0324468612670898, "learning_rate": 0.0001, "loss": 1.4079, "step": 880 }, { "epoch": 0.03457586293972534, "grad_norm": 0.9082580804824829, "learning_rate": 0.0001, "loss": 1.4703, "step": 890 }, { "epoch": 0.03496435578174472, "grad_norm": 1.0036635398864746, "learning_rate": 0.0001, "loss": 1.2948, "step": 900 }, { "epoch": 0.03496435578174472, "eval_loss": 1.4607137441635132, "eval_runtime": 418.5246, "eval_samples_per_second": 1.245, "eval_steps_per_second": 1.245, "step": 900 }, { "epoch": 0.03535284862376411, "grad_norm": 0.7732622027397156, "learning_rate": 0.0001, "loss": 1.5017, "step": 910 }, { "epoch": 0.03574134146578349, "grad_norm": 0.7425190806388855, "learning_rate": 0.0001, "loss": 1.4275, "step": 920 }, { "epoch": 0.03612983430780288, "grad_norm": 0.6782093644142151, "learning_rate": 0.0001, "loss": 1.4424, "step": 930 }, { "epoch": 0.03651832714982226, "grad_norm": 0.6914064288139343, "learning_rate": 0.0001, "loss": 1.49, "step": 940 }, { "epoch": 0.03690681999184165, "grad_norm": 1.2722946405410767, "learning_rate": 0.0001, "loss": 1.4737, "step": 950 }, { "epoch": 0.03729531283386103, "grad_norm": 0.9967614412307739, "learning_rate": 0.0001, "loss": 1.3731, "step": 960 }, { "epoch": 0.03768380567588042, "grad_norm": 0.5614752173423767, "learning_rate": 0.0001, "loss": 1.3554, "step": 970 }, { "epoch": 0.037878052096890116, "eval_loss": 1.4594156742095947, "eval_runtime": 414.1638, "eval_samples_per_second": 1.258, "eval_steps_per_second": 1.258, "step": 975 }, { "epoch": 0.03807229851789981, "grad_norm": 1.496825933456421, "learning_rate": 0.0001, "loss": 1.3043, "step": 980 }, { "epoch": 0.03846079135991919, "grad_norm": 0.5324123501777649, "learning_rate": 0.0001, "loss": 1.5467, "step": 990 }, { "epoch": 0.03884928420193858, "grad_norm": 2.828305959701538, "learning_rate": 0.0001, "loss": 1.4766, "step": 1000 }, { "epoch": 0.039237777043957964, "grad_norm": 1.0788389444351196, "learning_rate": 0.0001, "loss": 1.6391, "step": 1010 }, { "epoch": 0.03962626988597735, "grad_norm": 1.3913893699645996, "learning_rate": 0.0001, "loss": 1.36, "step": 1020 }, { "epoch": 0.040014762727996735, "grad_norm": 1.0683279037475586, "learning_rate": 0.0001, "loss": 1.3993, "step": 1030 }, { "epoch": 0.040403255570016124, "grad_norm": 0.14315283298492432, "learning_rate": 0.0001, "loss": 1.3594, "step": 1040 }, { "epoch": 0.040791748412035506, "grad_norm": 1.996098518371582, "learning_rate": 0.0001, "loss": 1.3501, "step": 1050 }, { "epoch": 0.040791748412035506, "eval_loss": 1.4564799070358276, "eval_runtime": 407.4512, "eval_samples_per_second": 1.279, "eval_steps_per_second": 1.279, "step": 1050 }, { "epoch": 0.041180241254054895, "grad_norm": 1.9238131046295166, "learning_rate": 1e-05, "loss": 1.3409, "step": 1060 }, { "epoch": 0.04156873409607428, "grad_norm": 0.561337947845459, "learning_rate": 1e-05, "loss": 1.3931, "step": 1070 }, { "epoch": 0.041957226938093665, "grad_norm": 0.6750600934028625, "learning_rate": 1e-05, "loss": 1.3055, "step": 1080 }, { "epoch": 0.042345719780113054, "grad_norm": 1.6704535484313965, "learning_rate": 1e-05, "loss": 1.4371, "step": 1090 }, { "epoch": 0.042734212622132436, "grad_norm": 0.6073994636535645, "learning_rate": 1e-05, "loss": 1.5461, "step": 1100 }, { "epoch": 0.043122705464151825, "grad_norm": 1.1396293640136719, "learning_rate": 1e-05, "loss": 1.4749, "step": 1110 }, { "epoch": 0.04351119830617121, "grad_norm": 0.6748817563056946, "learning_rate": 1e-05, "loss": 1.4803, "step": 1120 }, { "epoch": 0.0437054447271809, "eval_loss": 1.4066863059997559, "eval_runtime": 198.1965, "eval_samples_per_second": 2.629, "eval_steps_per_second": 2.629, "step": 1125 }, { "epoch": 0.043899691148190596, "grad_norm": 0.8130941987037659, "learning_rate": 1e-05, "loss": 1.4579, "step": 1130 }, { "epoch": 0.04428818399020998, "grad_norm": 0.5348241329193115, "learning_rate": 1e-05, "loss": 1.3988, "step": 1140 }, { "epoch": 0.04467667683222937, "grad_norm": 0.6961309313774109, "learning_rate": 1e-05, "loss": 1.4051, "step": 1150 }, { "epoch": 0.04506516967424875, "grad_norm": 0.8562794923782349, "learning_rate": 1e-05, "loss": 1.3631, "step": 1160 }, { "epoch": 0.04545366251626814, "grad_norm": 0.6999790668487549, "learning_rate": 1e-05, "loss": 1.4532, "step": 1170 }, { "epoch": 0.04584215535828752, "grad_norm": 0.5127655267715454, "learning_rate": 1e-05, "loss": 1.4715, "step": 1180 }, { "epoch": 0.04623064820030691, "grad_norm": 1.5171382427215576, "learning_rate": 1e-05, "loss": 1.2901, "step": 1190 }, { "epoch": 0.0466191410423263, "grad_norm": 0.7225420475006104, "learning_rate": 1e-05, "loss": 1.2778, "step": 1200 }, { "epoch": 0.0466191410423263, "eval_loss": 1.4001317024230957, "eval_runtime": 199.2632, "eval_samples_per_second": 2.615, "eval_steps_per_second": 2.615, "step": 1200 }, { "epoch": 0.04700763388434568, "grad_norm": 1.5108428001403809, "learning_rate": 1e-05, "loss": 1.6175, "step": 1210 }, { "epoch": 0.04739612672636507, "grad_norm": 1.1392805576324463, "learning_rate": 1e-05, "loss": 1.3466, "step": 1220 }, { "epoch": 0.04778461956838445, "grad_norm": 0.94669109582901, "learning_rate": 1e-05, "loss": 1.3415, "step": 1230 }, { "epoch": 0.04817311241040384, "grad_norm": 0.8593105673789978, "learning_rate": 1e-05, "loss": 1.3334, "step": 1240 }, { "epoch": 0.04856160525242322, "grad_norm": 0.8188263773918152, "learning_rate": 1e-05, "loss": 1.4602, "step": 1250 }, { "epoch": 0.04895009809444261, "grad_norm": 0.6875782608985901, "learning_rate": 1e-05, "loss": 1.3261, "step": 1260 }, { "epoch": 0.04933859093646199, "grad_norm": 1.8237006664276123, "learning_rate": 1e-05, "loss": 1.5862, "step": 1270 }, { "epoch": 0.04953283735747169, "eval_loss": 1.3970199823379517, "eval_runtime": 205.8088, "eval_samples_per_second": 2.531, "eval_steps_per_second": 2.531, "step": 1275 }, { "epoch": 0.04972708377848138, "grad_norm": 1.319785237312317, "learning_rate": 1e-05, "loss": 1.3576, "step": 1280 }, { "epoch": 0.050115576620500764, "grad_norm": 1.727789282798767, "learning_rate": 1e-05, "loss": 1.5409, "step": 1290 }, { "epoch": 0.05050406946252015, "grad_norm": 0.9914244413375854, "learning_rate": 1e-05, "loss": 1.3503, "step": 1300 }, { "epoch": 0.05089256230453954, "grad_norm": 1.8328955173492432, "learning_rate": 1e-05, "loss": 1.5384, "step": 1310 }, { "epoch": 0.051281055146558924, "grad_norm": 1.7998759746551514, "learning_rate": 1e-05, "loss": 1.4807, "step": 1320 }, { "epoch": 0.05166954798857831, "grad_norm": 1.53579843044281, "learning_rate": 1e-05, "loss": 1.4255, "step": 1330 }, { "epoch": 0.052058040830597695, "grad_norm": 0.9572857022285461, "learning_rate": 1e-05, "loss": 1.4547, "step": 1340 }, { "epoch": 0.052446533672617084, "grad_norm": 0.6299539804458618, "learning_rate": 1e-05, "loss": 1.2758, "step": 1350 }, { "epoch": 0.052446533672617084, "eval_loss": 1.394976258277893, "eval_runtime": 206.0718, "eval_samples_per_second": 2.528, "eval_steps_per_second": 2.528, "step": 1350 }, { "epoch": 0.052835026514636466, "grad_norm": 1.1869505643844604, "learning_rate": 1e-05, "loss": 1.2709, "step": 1360 }, { "epoch": 0.053223519356655855, "grad_norm": 0.5684358477592468, "learning_rate": 1e-05, "loss": 1.3306, "step": 1370 }, { "epoch": 0.05361201219867524, "grad_norm": 0.5880847573280334, "learning_rate": 1e-05, "loss": 1.3093, "step": 1380 }, { "epoch": 0.054000505040694625, "grad_norm": 0.6990231275558472, "learning_rate": 1e-05, "loss": 1.4534, "step": 1390 }, { "epoch": 0.054388997882714014, "grad_norm": 1.0700093507766724, "learning_rate": 1e-05, "loss": 1.3294, "step": 1400 }, { "epoch": 0.054777490724733396, "grad_norm": 1.044433832168579, "learning_rate": 1e-05, "loss": 1.4177, "step": 1410 }, { "epoch": 0.055165983566752785, "grad_norm": 2.6891329288482666, "learning_rate": 1e-05, "loss": 1.4451, "step": 1420 }, { "epoch": 0.05536022998776247, "eval_loss": 1.3934379816055298, "eval_runtime": 204.0926, "eval_samples_per_second": 2.553, "eval_steps_per_second": 2.553, "step": 1425 }, { "epoch": 0.05555447640877217, "grad_norm": 0.4769861698150635, "learning_rate": 1e-05, "loss": 1.3179, "step": 1430 }, { "epoch": 0.055942969250791556, "grad_norm": 1.0731093883514404, "learning_rate": 1e-05, "loss": 1.47, "step": 1440 }, { "epoch": 0.05633146209281094, "grad_norm": 1.016760230064392, "learning_rate": 1e-05, "loss": 1.5151, "step": 1450 }, { "epoch": 0.05671995493483033, "grad_norm": 1.5259450674057007, "learning_rate": 1e-05, "loss": 1.4038, "step": 1460 }, { "epoch": 0.05710844777684971, "grad_norm": 0.654501736164093, "learning_rate": 1e-05, "loss": 1.3135, "step": 1470 }, { "epoch": 0.0574969406188691, "grad_norm": 0.6827269196510315, "learning_rate": 1e-05, "loss": 1.2978, "step": 1480 }, { "epoch": 0.05788543346088848, "grad_norm": 0.5111151933670044, "learning_rate": 1e-05, "loss": 1.4352, "step": 1490 }, { "epoch": 0.05827392630290787, "grad_norm": 1.9571446180343628, "learning_rate": 1e-05, "loss": 1.4764, "step": 1500 }, { "epoch": 0.05827392630290787, "eval_loss": 1.3912627696990967, "eval_runtime": 204.5081, "eval_samples_per_second": 2.548, "eval_steps_per_second": 2.548, "step": 1500 }, { "epoch": 0.05866241914492726, "grad_norm": 0.8712412714958191, "learning_rate": 1e-05, "loss": 1.3778, "step": 1510 }, { "epoch": 0.05905091198694664, "grad_norm": 0.7130087018013, "learning_rate": 1e-05, "loss": 1.3266, "step": 1520 }, { "epoch": 0.05943940482896603, "grad_norm": 1.6288388967514038, "learning_rate": 1e-05, "loss": 1.4783, "step": 1530 }, { "epoch": 0.05982789767098541, "grad_norm": 2.629760503768921, "learning_rate": 1e-05, "loss": 1.6038, "step": 1540 }, { "epoch": 0.0602163905130048, "grad_norm": 1.0394636392593384, "learning_rate": 1e-05, "loss": 1.4683, "step": 1550 }, { "epoch": 0.06060488335502418, "grad_norm": 1.128451943397522, "learning_rate": 1e-05, "loss": 1.4578, "step": 1560 }, { "epoch": 0.06099337619704357, "grad_norm": 2.473900079727173, "learning_rate": 1e-05, "loss": 1.4326, "step": 1570 }, { "epoch": 0.061187622618053265, "eval_loss": 1.3889408111572266, "eval_runtime": 205.5592, "eval_samples_per_second": 2.535, "eval_steps_per_second": 2.535, "step": 1575 }, { "epoch": 0.06138186903906295, "grad_norm": 1.940373182296753, "learning_rate": 1e-05, "loss": 1.5117, "step": 1580 }, { "epoch": 0.06177036188108234, "grad_norm": 0.7575955986976624, "learning_rate": 1e-05, "loss": 1.3894, "step": 1590 }, { "epoch": 0.062158854723101724, "grad_norm": 1.4801169633865356, "learning_rate": 1e-05, "loss": 1.3132, "step": 1600 }, { "epoch": 0.0625473475651211, "grad_norm": 1.291632890701294, "learning_rate": 1e-05, "loss": 1.3286, "step": 1610 }, { "epoch": 0.0629358404071405, "grad_norm": 1.9607435464859009, "learning_rate": 1e-05, "loss": 1.3005, "step": 1620 }, { "epoch": 0.06332433324915988, "grad_norm": 0.8362483382225037, "learning_rate": 1e-05, "loss": 1.4172, "step": 1630 }, { "epoch": 0.06371282609117927, "grad_norm": 1.3649120330810547, "learning_rate": 1e-05, "loss": 1.6757, "step": 1640 }, { "epoch": 0.06410131893319866, "grad_norm": 1.0758274793624878, "learning_rate": 1e-05, "loss": 1.3867, "step": 1650 }, { "epoch": 0.06410131893319866, "eval_loss": 1.3888965845108032, "eval_runtime": 205.6882, "eval_samples_per_second": 2.533, "eval_steps_per_second": 2.533, "step": 1650 }, { "epoch": 0.06448981177521804, "grad_norm": 0.8754805326461792, "learning_rate": 1e-05, "loss": 1.3389, "step": 1660 }, { "epoch": 0.06487830461723743, "grad_norm": 0.7831467986106873, "learning_rate": 1e-05, "loss": 1.4257, "step": 1670 }, { "epoch": 0.06526679745925681, "grad_norm": 0.4581933915615082, "learning_rate": 1e-05, "loss": 1.3556, "step": 1680 }, { "epoch": 0.0656552903012762, "grad_norm": 0.9837825894355774, "learning_rate": 1e-05, "loss": 1.3184, "step": 1690 }, { "epoch": 0.06604378314329558, "grad_norm": 1.005288004875183, "learning_rate": 1e-05, "loss": 1.2944, "step": 1700 }, { "epoch": 0.06643227598531497, "grad_norm": 0.9397820234298706, "learning_rate": 1e-05, "loss": 1.4305, "step": 1710 }, { "epoch": 0.06682076882733436, "grad_norm": 2.7833900451660156, "learning_rate": 1e-05, "loss": 1.3273, "step": 1720 }, { "epoch": 0.06701501524834405, "eval_loss": 1.3884316682815552, "eval_runtime": 206.1573, "eval_samples_per_second": 2.527, "eval_steps_per_second": 2.527, "step": 1725 }, { "epoch": 0.06720926166935375, "grad_norm": 1.1208202838897705, "learning_rate": 1e-05, "loss": 1.2229, "step": 1730 }, { "epoch": 0.06759775451137313, "grad_norm": 0.5742992758750916, "learning_rate": 1e-05, "loss": 1.3349, "step": 1740 }, { "epoch": 0.06798624735339251, "grad_norm": 0.7946904897689819, "learning_rate": 1e-05, "loss": 1.3682, "step": 1750 }, { "epoch": 0.0683747401954119, "grad_norm": 0.7263549566268921, "learning_rate": 1e-05, "loss": 1.5025, "step": 1760 }, { "epoch": 0.06876323303743129, "grad_norm": 0.8954797387123108, "learning_rate": 1e-05, "loss": 1.4383, "step": 1770 }, { "epoch": 0.06915172587945068, "grad_norm": 0.6124446392059326, "learning_rate": 1e-05, "loss": 1.3322, "step": 1780 }, { "epoch": 0.06954021872147005, "grad_norm": 1.140678882598877, "learning_rate": 1e-05, "loss": 1.5233, "step": 1790 }, { "epoch": 0.06992871156348944, "grad_norm": 7.1586689949035645, "learning_rate": 1e-05, "loss": 1.3691, "step": 1800 }, { "epoch": 0.06992871156348944, "eval_loss": 1.387437105178833, "eval_runtime": 204.4312, "eval_samples_per_second": 2.549, "eval_steps_per_second": 2.549, "step": 1800 }, { "epoch": 0.07031720440550883, "grad_norm": 0.634140133857727, "learning_rate": 1e-05, "loss": 1.3735, "step": 1810 }, { "epoch": 0.07070569724752822, "grad_norm": 0.7632227540016174, "learning_rate": 1e-05, "loss": 1.3542, "step": 1820 }, { "epoch": 0.07109419008954761, "grad_norm": 0.7211370468139648, "learning_rate": 1e-05, "loss": 1.2832, "step": 1830 }, { "epoch": 0.07148268293156698, "grad_norm": 0.7608075737953186, "learning_rate": 1e-05, "loss": 1.5292, "step": 1840 }, { "epoch": 0.07187117577358637, "grad_norm": 0.8131744265556335, "learning_rate": 1e-05, "loss": 1.4005, "step": 1850 }, { "epoch": 0.07225966861560576, "grad_norm": 0.6415278911590576, "learning_rate": 1e-05, "loss": 1.4455, "step": 1860 }, { "epoch": 0.07264816145762515, "grad_norm": 2.333056688308716, "learning_rate": 1e-05, "loss": 1.4367, "step": 1870 }, { "epoch": 0.07284240787863483, "eval_loss": 1.3867840766906738, "eval_runtime": 205.2163, "eval_samples_per_second": 2.539, "eval_steps_per_second": 2.539, "step": 1875 } ], "logging_steps": 10, "max_steps": 3600, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 75, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 8.331378819072e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }