{ "best_metric": 2.5235376358032227, "best_model_checkpoint": "./results/checkpoint-50000", "epoch": 0.3637362257638006, "eval_steps": 1000, "global_step": 50000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 3.156810998916626, "learning_rate": 5e-06, "loss": 5.0881, "step": 100 }, { "epoch": 0.0, "grad_norm": 1.8936021327972412, "learning_rate": 1e-05, "loss": 3.4609, "step": 200 }, { "epoch": 0.0, "grad_norm": 0.8277048468589783, "learning_rate": 1.5e-05, "loss": 2.9956, "step": 300 }, { "epoch": 0.0, "grad_norm": 0.5409151315689087, "learning_rate": 2e-05, "loss": 2.7702, "step": 400 }, { "epoch": 0.0, "grad_norm": 0.37298905849456787, "learning_rate": 2.5e-05, "loss": 2.7116, "step": 500 }, { "epoch": 0.0, "grad_norm": 1.4856789112091064, "learning_rate": 3e-05, "loss": 2.676, "step": 600 }, { "epoch": 0.01, "grad_norm": 0.9477183222770691, "learning_rate": 3.5e-05, "loss": 2.6509, "step": 700 }, { "epoch": 0.01, "grad_norm": 0.42317140102386475, "learning_rate": 4e-05, "loss": 2.6364, "step": 800 }, { "epoch": 0.01, "grad_norm": 0.9827988743782043, "learning_rate": 4.5e-05, "loss": 2.6289, "step": 900 }, { "epoch": 0.01, "grad_norm": 0.647011935710907, "learning_rate": 5e-05, "loss": 2.6236, "step": 1000 }, { "epoch": 0.01, "eval_loss": 2.6208438873291016, "eval_runtime": 5241.4331, "eval_samples_per_second": 1118.979, "eval_steps_per_second": 69.936, "step": 1000 }, { "epoch": 0.01, "grad_norm": 0.3872681260108948, "learning_rate": 5.500000000000001e-05, "loss": 2.6201, "step": 1100 }, { "epoch": 0.01, "grad_norm": 0.6852623224258423, "learning_rate": 6e-05, "loss": 2.6152, "step": 1200 }, { "epoch": 0.01, "grad_norm": 0.5895536541938782, "learning_rate": 6.500000000000001e-05, "loss": 2.6152, "step": 1300 }, { "epoch": 0.01, "grad_norm": 0.5872493386268616, "learning_rate": 7e-05, "loss": 2.6124, "step": 1400 }, { "epoch": 0.01, "grad_norm": 0.6140819787979126, "learning_rate": 7.500000000000001e-05, "loss": 2.6106, "step": 1500 }, { "epoch": 0.01, "grad_norm": 0.6721034646034241, "learning_rate": 8e-05, "loss": 2.6097, "step": 1600 }, { "epoch": 0.01, "grad_norm": 0.3682301640510559, "learning_rate": 8.5e-05, "loss": 2.6075, "step": 1700 }, { "epoch": 0.01, "grad_norm": 0.4152977764606476, "learning_rate": 9e-05, "loss": 2.6065, "step": 1800 }, { "epoch": 0.01, "grad_norm": 0.30114710330963135, "learning_rate": 9.5e-05, "loss": 2.6067, "step": 1900 }, { "epoch": 0.01, "grad_norm": 0.29172569513320923, "learning_rate": 0.0001, "loss": 2.6056, "step": 2000 }, { "epoch": 0.01, "eval_loss": 2.604510545730591, "eval_runtime": 5009.0383, "eval_samples_per_second": 1170.894, "eval_steps_per_second": 73.181, "step": 2000 }, { "epoch": 0.02, "grad_norm": 0.26403701305389404, "learning_rate": 9.999999909099618e-05, "loss": 2.6061, "step": 2100 }, { "epoch": 0.02, "grad_norm": 0.42730650305747986, "learning_rate": 9.999999636398476e-05, "loss": 2.6038, "step": 2200 }, { "epoch": 0.02, "grad_norm": 0.2776353657245636, "learning_rate": 9.999999181896581e-05, "loss": 2.6048, "step": 2300 }, { "epoch": 0.02, "grad_norm": 0.2872585952281952, "learning_rate": 9.999998545593951e-05, "loss": 2.6031, "step": 2400 }, { "epoch": 0.02, "grad_norm": 0.24884726107120514, "learning_rate": 9.999997727490612e-05, "loss": 2.6007, "step": 2500 }, { "epoch": 0.02, "grad_norm": 0.2905128002166748, "learning_rate": 9.99999672758659e-05, "loss": 2.6001, "step": 2600 }, { "epoch": 0.02, "grad_norm": 0.22506840527057648, "learning_rate": 9.999995545881924e-05, "loss": 2.6002, "step": 2700 }, { "epoch": 0.02, "grad_norm": 0.235252246260643, "learning_rate": 9.999994182376653e-05, "loss": 2.5991, "step": 2800 }, { "epoch": 0.02, "grad_norm": 0.3328251838684082, "learning_rate": 9.999992637070832e-05, "loss": 2.5997, "step": 2900 }, { "epoch": 0.02, "grad_norm": 0.23325322568416595, "learning_rate": 9.999990909964513e-05, "loss": 2.5995, "step": 3000 }, { "epoch": 0.02, "eval_loss": 2.598254919052124, "eval_runtime": 5203.4796, "eval_samples_per_second": 1127.141, "eval_steps_per_second": 70.446, "step": 3000 }, { "epoch": 0.02, "grad_norm": 0.2796269655227661, "learning_rate": 9.99998900105776e-05, "loss": 2.5984, "step": 3100 }, { "epoch": 0.02, "grad_norm": 0.21093028783798218, "learning_rate": 9.999986910350642e-05, "loss": 2.5985, "step": 3200 }, { "epoch": 0.02, "grad_norm": 0.22558662295341492, "learning_rate": 9.999984637843238e-05, "loss": 2.5978, "step": 3300 }, { "epoch": 0.02, "grad_norm": 0.3363204598426819, "learning_rate": 9.999982183535627e-05, "loss": 2.5965, "step": 3400 }, { "epoch": 0.03, "grad_norm": 0.27081742882728577, "learning_rate": 9.9999795474279e-05, "loss": 2.5961, "step": 3500 }, { "epoch": 0.03, "grad_norm": 0.23502503335475922, "learning_rate": 9.999976729520151e-05, "loss": 2.5956, "step": 3600 }, { "epoch": 0.03, "grad_norm": 0.17260690033435822, "learning_rate": 9.999973729812485e-05, "loss": 2.5955, "step": 3700 }, { "epoch": 0.03, "grad_norm": 0.16165785491466522, "learning_rate": 9.999970548305009e-05, "loss": 2.5955, "step": 3800 }, { "epoch": 0.03, "grad_norm": 0.24890035390853882, "learning_rate": 9.99996718499784e-05, "loss": 2.5946, "step": 3900 }, { "epoch": 0.03, "grad_norm": 0.30852416157722473, "learning_rate": 9.999963639891102e-05, "loss": 2.5956, "step": 4000 }, { "epoch": 0.03, "eval_loss": 2.594769239425659, "eval_runtime": 5243.3493, "eval_samples_per_second": 1118.57, "eval_steps_per_second": 69.911, "step": 4000 }, { "epoch": 0.03, "grad_norm": 0.37085938453674316, "learning_rate": 9.999959912984918e-05, "loss": 2.5947, "step": 4100 }, { "epoch": 0.03, "grad_norm": 0.3395659029483795, "learning_rate": 9.999956004279429e-05, "loss": 2.5955, "step": 4200 }, { "epoch": 0.03, "grad_norm": 0.16178911924362183, "learning_rate": 9.999951913774777e-05, "loss": 2.594, "step": 4300 }, { "epoch": 0.03, "grad_norm": 0.32948923110961914, "learning_rate": 9.999947641471107e-05, "loss": 2.5941, "step": 4400 }, { "epoch": 0.03, "grad_norm": 0.20488527417182922, "learning_rate": 9.999943187368577e-05, "loss": 2.5934, "step": 4500 }, { "epoch": 0.03, "grad_norm": 0.2330751270055771, "learning_rate": 9.999938551467348e-05, "loss": 2.593, "step": 4600 }, { "epoch": 0.03, "grad_norm": 0.2759188711643219, "learning_rate": 9.99993373376759e-05, "loss": 2.5934, "step": 4700 }, { "epoch": 0.03, "grad_norm": 0.20582658052444458, "learning_rate": 9.999928734269477e-05, "loss": 2.5921, "step": 4800 }, { "epoch": 0.04, "grad_norm": 0.27048927545547485, "learning_rate": 9.99992355297319e-05, "loss": 2.5908, "step": 4900 }, { "epoch": 0.04, "grad_norm": 0.23385432362556458, "learning_rate": 9.999918189878918e-05, "loss": 2.5921, "step": 5000 }, { "epoch": 0.04, "eval_loss": 2.5919315814971924, "eval_runtime": 5496.6841, "eval_samples_per_second": 1067.017, "eval_steps_per_second": 66.689, "step": 5000 }, { "epoch": 0.04, "grad_norm": 0.22557920217514038, "learning_rate": 9.999912644986859e-05, "loss": 2.5931, "step": 5100 }, { "epoch": 0.04, "grad_norm": 0.23983053863048553, "learning_rate": 9.99990691829721e-05, "loss": 2.5915, "step": 5200 }, { "epoch": 0.04, "grad_norm": 0.18743467330932617, "learning_rate": 9.999901009810181e-05, "loss": 2.5913, "step": 5300 }, { "epoch": 0.04, "grad_norm": 0.20421144366264343, "learning_rate": 9.999894919525987e-05, "loss": 2.59, "step": 5400 }, { "epoch": 0.04, "grad_norm": 0.19018307328224182, "learning_rate": 9.999888647444851e-05, "loss": 2.5908, "step": 5500 }, { "epoch": 0.04, "grad_norm": 0.23669780790805817, "learning_rate": 9.999882193566997e-05, "loss": 2.5908, "step": 5600 }, { "epoch": 0.04, "grad_norm": 0.20617301762104034, "learning_rate": 9.999875557892664e-05, "loss": 2.5902, "step": 5700 }, { "epoch": 0.04, "grad_norm": 0.1931978017091751, "learning_rate": 9.999868740422092e-05, "loss": 2.5907, "step": 5800 }, { "epoch": 0.04, "grad_norm": 0.19971776008605957, "learning_rate": 9.999861741155526e-05, "loss": 2.5885, "step": 5900 }, { "epoch": 0.04, "grad_norm": 0.21058019995689392, "learning_rate": 9.999854560093225e-05, "loss": 2.5897, "step": 6000 }, { "epoch": 0.04, "eval_loss": 2.5891263484954834, "eval_runtime": 5309.84, "eval_samples_per_second": 1104.563, "eval_steps_per_second": 69.035, "step": 6000 }, { "epoch": 0.04, "grad_norm": 0.2691722512245178, "learning_rate": 9.999847197235446e-05, "loss": 2.5882, "step": 6100 }, { "epoch": 0.05, "grad_norm": 0.2411184161901474, "learning_rate": 9.99983965258246e-05, "loss": 2.589, "step": 6200 }, { "epoch": 0.05, "grad_norm": 0.15842053294181824, "learning_rate": 9.99983192613454e-05, "loss": 2.5892, "step": 6300 }, { "epoch": 0.05, "grad_norm": 0.17729991674423218, "learning_rate": 9.999824017891965e-05, "loss": 2.588, "step": 6400 }, { "epoch": 0.05, "grad_norm": 0.22472384572029114, "learning_rate": 9.999815927855027e-05, "loss": 2.5873, "step": 6500 }, { "epoch": 0.05, "grad_norm": 0.2445155680179596, "learning_rate": 9.999807656024016e-05, "loss": 2.5871, "step": 6600 }, { "epoch": 0.05, "grad_norm": 0.2309512048959732, "learning_rate": 9.999799202399236e-05, "loss": 2.5872, "step": 6700 }, { "epoch": 0.05, "grad_norm": 0.24274510145187378, "learning_rate": 9.999790566980991e-05, "loss": 2.5863, "step": 6800 }, { "epoch": 0.05, "grad_norm": 0.2553771436214447, "learning_rate": 9.999781749769597e-05, "loss": 2.5866, "step": 6900 }, { "epoch": 0.05, "grad_norm": 0.2770930230617523, "learning_rate": 9.999772750765375e-05, "loss": 2.5867, "step": 7000 }, { "epoch": 0.05, "eval_loss": 2.5862584114074707, "eval_runtime": 5428.0291, "eval_samples_per_second": 1080.512, "eval_steps_per_second": 67.532, "step": 7000 }, { "epoch": 0.05, "grad_norm": 0.20760175585746765, "learning_rate": 9.999763569968652e-05, "loss": 2.5864, "step": 7100 }, { "epoch": 0.05, "grad_norm": 0.195896178483963, "learning_rate": 9.999754207379762e-05, "loss": 2.5859, "step": 7200 }, { "epoch": 0.05, "grad_norm": 0.17725147306919098, "learning_rate": 9.999744662999042e-05, "loss": 2.5862, "step": 7300 }, { "epoch": 0.05, "grad_norm": 0.19556549191474915, "learning_rate": 9.999734936826843e-05, "loss": 2.5849, "step": 7400 }, { "epoch": 0.05, "grad_norm": 0.1898818463087082, "learning_rate": 9.999725028863518e-05, "loss": 2.5838, "step": 7500 }, { "epoch": 0.06, "grad_norm": 0.1693861335515976, "learning_rate": 9.999714939109426e-05, "loss": 2.5842, "step": 7600 }, { "epoch": 0.06, "grad_norm": 0.2052980363368988, "learning_rate": 9.999704667564935e-05, "loss": 2.5846, "step": 7700 }, { "epoch": 0.06, "grad_norm": 0.24053046107292175, "learning_rate": 9.999694214230418e-05, "loss": 2.5853, "step": 7800 }, { "epoch": 0.06, "grad_norm": 0.18848931789398193, "learning_rate": 9.999683579106255e-05, "loss": 2.5847, "step": 7900 }, { "epoch": 0.06, "grad_norm": 0.21658702194690704, "learning_rate": 9.999672762192834e-05, "loss": 2.5834, "step": 8000 }, { "epoch": 0.06, "eval_loss": 2.5834951400756836, "eval_runtime": 5812.5431, "eval_samples_per_second": 1009.034, "eval_steps_per_second": 63.065, "step": 8000 }, { "epoch": 0.06, "grad_norm": 0.2054041624069214, "learning_rate": 9.999661763490544e-05, "loss": 2.5846, "step": 8100 }, { "epoch": 0.06, "grad_norm": 0.19781599938869476, "learning_rate": 9.99965058299979e-05, "loss": 2.5828, "step": 8200 }, { "epoch": 0.06, "grad_norm": 0.25480276346206665, "learning_rate": 9.999639220720978e-05, "loss": 2.5824, "step": 8300 }, { "epoch": 0.06, "grad_norm": 0.24584674835205078, "learning_rate": 9.999627676654517e-05, "loss": 2.5815, "step": 8400 }, { "epoch": 0.06, "grad_norm": 0.21407808363437653, "learning_rate": 9.999615950800832e-05, "loss": 2.5816, "step": 8500 }, { "epoch": 0.06, "grad_norm": 0.19315016269683838, "learning_rate": 9.999604043160346e-05, "loss": 2.582, "step": 8600 }, { "epoch": 0.06, "grad_norm": 0.18959668278694153, "learning_rate": 9.999591953733491e-05, "loss": 2.5823, "step": 8700 }, { "epoch": 0.06, "grad_norm": 0.2016109973192215, "learning_rate": 9.99957968252071e-05, "loss": 2.5817, "step": 8800 }, { "epoch": 0.06, "grad_norm": 0.1687597781419754, "learning_rate": 9.999567229522448e-05, "loss": 2.5808, "step": 8900 }, { "epoch": 0.07, "grad_norm": 0.17627790570259094, "learning_rate": 9.999554594739155e-05, "loss": 2.5809, "step": 9000 }, { "epoch": 0.07, "eval_loss": 2.5812571048736572, "eval_runtime": 5682.2226, "eval_samples_per_second": 1032.176, "eval_steps_per_second": 64.511, "step": 9000 }, { "epoch": 0.07, "grad_norm": 0.12426480650901794, "learning_rate": 9.999541778171295e-05, "loss": 2.5808, "step": 9100 }, { "epoch": 0.07, "grad_norm": 0.20493678748607635, "learning_rate": 9.999528779819331e-05, "loss": 2.582, "step": 9200 }, { "epoch": 0.07, "grad_norm": 0.2577107548713684, "learning_rate": 9.999515599683736e-05, "loss": 2.5821, "step": 9300 }, { "epoch": 0.07, "grad_norm": 0.17233897745609283, "learning_rate": 9.999502237764991e-05, "loss": 2.5805, "step": 9400 }, { "epoch": 0.07, "grad_norm": 0.17032714188098907, "learning_rate": 9.99948869406358e-05, "loss": 2.5797, "step": 9500 }, { "epoch": 0.07, "grad_norm": 0.21331337094306946, "learning_rate": 9.999474968579994e-05, "loss": 2.5807, "step": 9600 }, { "epoch": 0.07, "grad_norm": 0.1757407933473587, "learning_rate": 9.999461061314734e-05, "loss": 2.5796, "step": 9700 }, { "epoch": 0.07, "grad_norm": 0.20256169140338898, "learning_rate": 9.999446972268308e-05, "loss": 2.5805, "step": 9800 }, { "epoch": 0.07, "grad_norm": 0.16938987374305725, "learning_rate": 9.999432701441223e-05, "loss": 2.5793, "step": 9900 }, { "epoch": 0.07, "grad_norm": 0.15477782487869263, "learning_rate": 9.999418248834002e-05, "loss": 2.5797, "step": 10000 }, { "epoch": 0.07, "eval_loss": 2.579577684402466, "eval_runtime": 5716.7677, "eval_samples_per_second": 1025.939, "eval_steps_per_second": 64.121, "step": 10000 }, { "epoch": 0.07, "grad_norm": 0.1892194151878357, "learning_rate": 9.99940361444717e-05, "loss": 2.5807, "step": 10100 }, { "epoch": 0.07, "grad_norm": 0.1950315237045288, "learning_rate": 9.999388798281258e-05, "loss": 2.5803, "step": 10200 }, { "epoch": 0.07, "grad_norm": 0.182656928896904, "learning_rate": 9.999373800336806e-05, "loss": 2.579, "step": 10300 }, { "epoch": 0.08, "grad_norm": 0.2610529959201813, "learning_rate": 9.999358620614357e-05, "loss": 2.5798, "step": 10400 }, { "epoch": 0.08, "grad_norm": 0.1903856247663498, "learning_rate": 9.999343259114464e-05, "loss": 2.5782, "step": 10500 }, { "epoch": 0.08, "grad_norm": 0.2288079857826233, "learning_rate": 9.999327715837687e-05, "loss": 2.5798, "step": 10600 }, { "epoch": 0.08, "grad_norm": 0.22872629761695862, "learning_rate": 9.99931199078459e-05, "loss": 2.5777, "step": 10700 }, { "epoch": 0.08, "grad_norm": 0.22784186899662018, "learning_rate": 9.999296083955744e-05, "loss": 2.5783, "step": 10800 }, { "epoch": 0.08, "grad_norm": 0.1781127005815506, "learning_rate": 9.999279995351729e-05, "loss": 2.5784, "step": 10900 }, { "epoch": 0.08, "grad_norm": 0.19801098108291626, "learning_rate": 9.99926372497313e-05, "loss": 2.5786, "step": 11000 }, { "epoch": 0.08, "eval_loss": 2.5776655673980713, "eval_runtime": 5841.1118, "eval_samples_per_second": 1004.099, "eval_steps_per_second": 62.756, "step": 11000 }, { "epoch": 0.08, "grad_norm": 0.24839451909065247, "learning_rate": 9.999247272820536e-05, "loss": 2.5777, "step": 11100 }, { "epoch": 0.08, "grad_norm": 0.18496295809745789, "learning_rate": 9.999230638894548e-05, "loss": 2.5784, "step": 11200 }, { "epoch": 0.08, "grad_norm": 0.1987435519695282, "learning_rate": 9.99921382319577e-05, "loss": 2.5779, "step": 11300 }, { "epoch": 0.08, "grad_norm": 0.23174594342708588, "learning_rate": 9.999196825724813e-05, "loss": 2.5781, "step": 11400 }, { "epoch": 0.08, "grad_norm": 0.18109597265720367, "learning_rate": 9.999179646482295e-05, "loss": 2.5785, "step": 11500 }, { "epoch": 0.08, "grad_norm": 0.18909253180027008, "learning_rate": 9.999162285468841e-05, "loss": 2.5779, "step": 11600 }, { "epoch": 0.09, "grad_norm": 0.1274472177028656, "learning_rate": 9.999144742685083e-05, "loss": 2.5774, "step": 11700 }, { "epoch": 0.09, "grad_norm": 0.14917853474617004, "learning_rate": 9.999127018131655e-05, "loss": 2.5759, "step": 11800 }, { "epoch": 0.09, "grad_norm": 0.13930270075798035, "learning_rate": 9.999109111809207e-05, "loss": 2.5769, "step": 11900 }, { "epoch": 0.09, "grad_norm": 0.18673798441886902, "learning_rate": 9.999091023718388e-05, "loss": 2.5765, "step": 12000 }, { "epoch": 0.09, "eval_loss": 2.576554775238037, "eval_runtime": 5806.3953, "eval_samples_per_second": 1010.102, "eval_steps_per_second": 63.131, "step": 12000 }, { "epoch": 0.09, "grad_norm": 0.23524725437164307, "learning_rate": 9.999072753859854e-05, "loss": 2.577, "step": 12100 }, { "epoch": 0.09, "grad_norm": 0.23039047420024872, "learning_rate": 9.99905430223427e-05, "loss": 2.5777, "step": 12200 }, { "epoch": 0.09, "grad_norm": 0.17183104157447815, "learning_rate": 9.99903566884231e-05, "loss": 2.5757, "step": 12300 }, { "epoch": 0.09, "grad_norm": 0.22224171459674835, "learning_rate": 9.999016853684646e-05, "loss": 2.5775, "step": 12400 }, { "epoch": 0.09, "grad_norm": 0.2552145719528198, "learning_rate": 9.998997856761967e-05, "loss": 2.5753, "step": 12500 }, { "epoch": 0.09, "grad_norm": 0.14996840059757233, "learning_rate": 9.998978678074961e-05, "loss": 2.577, "step": 12600 }, { "epoch": 0.09, "grad_norm": 0.22422178089618683, "learning_rate": 9.998959317624325e-05, "loss": 2.576, "step": 12700 }, { "epoch": 0.09, "grad_norm": 0.15495595335960388, "learning_rate": 9.998939775410767e-05, "loss": 2.5773, "step": 12800 }, { "epoch": 0.09, "grad_norm": 0.21027402579784393, "learning_rate": 9.998920051434992e-05, "loss": 2.575, "step": 12900 }, { "epoch": 0.09, "grad_norm": 0.17667286098003387, "learning_rate": 9.99890014569772e-05, "loss": 2.5765, "step": 13000 }, { "epoch": 0.09, "eval_loss": 2.574887275695801, "eval_runtime": 5869.3511, "eval_samples_per_second": 999.268, "eval_steps_per_second": 62.454, "step": 13000 }, { "epoch": 0.1, "grad_norm": 0.21658362448215485, "learning_rate": 9.998880058199675e-05, "loss": 2.5741, "step": 13100 }, { "epoch": 0.1, "grad_norm": 0.20807647705078125, "learning_rate": 9.998859788941588e-05, "loss": 2.5756, "step": 13200 }, { "epoch": 0.1, "grad_norm": 0.1646522879600525, "learning_rate": 9.998839337924195e-05, "loss": 2.5756, "step": 13300 }, { "epoch": 0.1, "grad_norm": 0.166889950633049, "learning_rate": 9.998818705148238e-05, "loss": 2.5745, "step": 13400 }, { "epoch": 0.1, "grad_norm": 0.25859805941581726, "learning_rate": 9.998797890614469e-05, "loss": 2.5747, "step": 13500 }, { "epoch": 0.1, "grad_norm": 0.20016120374202728, "learning_rate": 9.998776894323645e-05, "loss": 2.5746, "step": 13600 }, { "epoch": 0.1, "grad_norm": 0.16718171536922455, "learning_rate": 9.998755716276528e-05, "loss": 2.574, "step": 13700 }, { "epoch": 0.1, "grad_norm": 0.20205742120742798, "learning_rate": 9.998734356473892e-05, "loss": 2.5754, "step": 13800 }, { "epoch": 0.1, "grad_norm": 0.14121076464653015, "learning_rate": 9.998712814916508e-05, "loss": 2.5746, "step": 13900 }, { "epoch": 0.1, "grad_norm": 0.21463043987751007, "learning_rate": 9.998691091605163e-05, "loss": 2.5748, "step": 14000 }, { "epoch": 0.1, "eval_loss": 2.573892116546631, "eval_runtime": 5790.9885, "eval_samples_per_second": 1012.79, "eval_steps_per_second": 63.299, "step": 14000 }, { "epoch": 0.1, "grad_norm": 0.21442490816116333, "learning_rate": 9.998669406490967e-05, "loss": 2.5752, "step": 14100 }, { "epoch": 0.1, "grad_norm": 0.1974353939294815, "learning_rate": 9.998647321491594e-05, "loss": 2.5733, "step": 14200 }, { "epoch": 0.1, "grad_norm": 0.17868639528751373, "learning_rate": 9.99862505474064e-05, "loss": 2.5733, "step": 14300 }, { "epoch": 0.1, "grad_norm": 0.18465746939182281, "learning_rate": 9.998602606238913e-05, "loss": 2.5727, "step": 14400 }, { "epoch": 0.11, "grad_norm": 0.22418785095214844, "learning_rate": 9.998579975987234e-05, "loss": 2.5739, "step": 14500 }, { "epoch": 0.11, "grad_norm": 0.20102180540561676, "learning_rate": 9.998557163986423e-05, "loss": 2.5726, "step": 14600 }, { "epoch": 0.11, "grad_norm": 0.1889944076538086, "learning_rate": 9.998534170237307e-05, "loss": 2.5719, "step": 14700 }, { "epoch": 0.11, "grad_norm": 0.1904602348804474, "learning_rate": 9.998510994740727e-05, "loss": 2.5719, "step": 14800 }, { "epoch": 0.11, "grad_norm": 0.20775534212589264, "learning_rate": 9.998487637497522e-05, "loss": 2.5725, "step": 14900 }, { "epoch": 0.11, "grad_norm": 0.203061044216156, "learning_rate": 9.998464334798072e-05, "loss": 2.5733, "step": 15000 }, { "epoch": 0.11, "eval_loss": 2.5725255012512207, "eval_runtime": 5867.3015, "eval_samples_per_second": 999.617, "eval_steps_per_second": 62.476, "step": 15000 }, { "epoch": 0.11, "grad_norm": 0.1729467511177063, "learning_rate": 9.998440615881618e-05, "loss": 2.5729, "step": 15100 }, { "epoch": 0.11, "grad_norm": 0.1721724420785904, "learning_rate": 9.998416715221101e-05, "loss": 2.5719, "step": 15200 }, { "epoch": 0.11, "grad_norm": 0.170950248837471, "learning_rate": 9.998392632817387e-05, "loss": 2.5725, "step": 15300 }, { "epoch": 0.11, "grad_norm": 0.1992483288049698, "learning_rate": 9.998368368671353e-05, "loss": 2.573, "step": 15400 }, { "epoch": 0.11, "grad_norm": 0.19748467206954956, "learning_rate": 9.998343922783881e-05, "loss": 2.5719, "step": 15500 }, { "epoch": 0.11, "grad_norm": 0.22308781743049622, "learning_rate": 9.99831929515586e-05, "loss": 2.5712, "step": 15600 }, { "epoch": 0.11, "grad_norm": 0.17296306788921356, "learning_rate": 9.998294485788187e-05, "loss": 2.5709, "step": 15700 }, { "epoch": 0.11, "grad_norm": 0.2266322374343872, "learning_rate": 9.99826949468176e-05, "loss": 2.5716, "step": 15800 }, { "epoch": 0.12, "grad_norm": 0.1890050321817398, "learning_rate": 9.998244321837492e-05, "loss": 2.5714, "step": 15900 }, { "epoch": 0.12, "grad_norm": 0.14952068030834198, "learning_rate": 9.998218967256294e-05, "loss": 2.5724, "step": 16000 }, { "epoch": 0.12, "eval_loss": 2.570789098739624, "eval_runtime": 5730.232, "eval_samples_per_second": 1023.528, "eval_steps_per_second": 63.971, "step": 16000 }, { "epoch": 0.12, "grad_norm": 0.20020237565040588, "learning_rate": 9.998193430939093e-05, "loss": 2.5712, "step": 16100 }, { "epoch": 0.12, "grad_norm": 0.2040109634399414, "learning_rate": 9.998167712886813e-05, "loss": 2.5722, "step": 16200 }, { "epoch": 0.12, "grad_norm": 0.14661578834056854, "learning_rate": 9.998141813100392e-05, "loss": 2.5713, "step": 16300 }, { "epoch": 0.12, "grad_norm": 0.17079943418502808, "learning_rate": 9.998115731580771e-05, "loss": 2.5718, "step": 16400 }, { "epoch": 0.12, "grad_norm": 0.17369835078716278, "learning_rate": 9.998089468328898e-05, "loss": 2.5721, "step": 16500 }, { "epoch": 0.12, "grad_norm": 0.16338428854942322, "learning_rate": 9.998063023345725e-05, "loss": 2.5718, "step": 16600 }, { "epoch": 0.12, "grad_norm": 0.16794905066490173, "learning_rate": 9.99803639663222e-05, "loss": 2.5708, "step": 16700 }, { "epoch": 0.12, "grad_norm": 0.20138411223888397, "learning_rate": 9.998009588189345e-05, "loss": 2.5696, "step": 16800 }, { "epoch": 0.12, "grad_norm": 0.23276585340499878, "learning_rate": 9.99798259801808e-05, "loss": 2.5691, "step": 16900 }, { "epoch": 0.12, "grad_norm": 0.1607973575592041, "learning_rate": 9.997955426119402e-05, "loss": 2.5714, "step": 17000 }, { "epoch": 0.12, "eval_loss": 2.569423198699951, "eval_runtime": 5840.5846, "eval_samples_per_second": 1004.189, "eval_steps_per_second": 62.762, "step": 17000 }, { "epoch": 0.12, "grad_norm": 0.1984170377254486, "learning_rate": 9.997928072494302e-05, "loss": 2.5703, "step": 17100 }, { "epoch": 0.13, "grad_norm": 0.22356334328651428, "learning_rate": 9.997900537143772e-05, "loss": 2.5711, "step": 17200 }, { "epoch": 0.13, "grad_norm": 0.1815560907125473, "learning_rate": 9.997873098139096e-05, "loss": 2.5697, "step": 17300 }, { "epoch": 0.13, "grad_norm": 0.27280667424201965, "learning_rate": 9.997845201157949e-05, "loss": 2.5697, "step": 17400 }, { "epoch": 0.13, "grad_norm": 0.15276475250720978, "learning_rate": 9.997817122454387e-05, "loss": 2.5675, "step": 17500 }, { "epoch": 0.13, "grad_norm": 0.19267494976520538, "learning_rate": 9.997788862029429e-05, "loss": 2.569, "step": 17600 }, { "epoch": 0.13, "grad_norm": 0.1580013483762741, "learning_rate": 9.997760419884105e-05, "loss": 2.5704, "step": 17700 }, { "epoch": 0.13, "grad_norm": 0.21980580687522888, "learning_rate": 9.997731796019448e-05, "loss": 2.5702, "step": 17800 }, { "epoch": 0.13, "grad_norm": 0.2392682433128357, "learning_rate": 9.997702990436498e-05, "loss": 2.5696, "step": 17900 }, { "epoch": 0.13, "grad_norm": 0.2011999785900116, "learning_rate": 9.997674003136303e-05, "loss": 2.5682, "step": 18000 }, { "epoch": 0.13, "eval_loss": 2.5681583881378174, "eval_runtime": 5608.808, "eval_samples_per_second": 1045.686, "eval_steps_per_second": 65.355, "step": 18000 }, { "epoch": 0.13, "grad_norm": 0.23138105869293213, "learning_rate": 9.997644834119919e-05, "loss": 2.5685, "step": 18100 }, { "epoch": 0.13, "grad_norm": 0.19521869719028473, "learning_rate": 9.997615483388406e-05, "loss": 2.5685, "step": 18200 }, { "epoch": 0.13, "grad_norm": 0.22132566571235657, "learning_rate": 9.997585950942827e-05, "loss": 2.5687, "step": 18300 }, { "epoch": 0.13, "grad_norm": 0.19450217485427856, "learning_rate": 9.99755623678426e-05, "loss": 2.5666, "step": 18400 }, { "epoch": 0.13, "grad_norm": 0.20810718834400177, "learning_rate": 9.997526340913785e-05, "loss": 2.5681, "step": 18500 }, { "epoch": 0.14, "grad_norm": 0.1711287647485733, "learning_rate": 9.997496263332487e-05, "loss": 2.5666, "step": 18600 }, { "epoch": 0.14, "grad_norm": 0.15737326443195343, "learning_rate": 9.997466004041462e-05, "loss": 2.5663, "step": 18700 }, { "epoch": 0.14, "grad_norm": 0.19085867702960968, "learning_rate": 9.997435563041809e-05, "loss": 2.569, "step": 18800 }, { "epoch": 0.14, "grad_norm": 0.20687063038349152, "learning_rate": 9.997404940334637e-05, "loss": 2.5654, "step": 18900 }, { "epoch": 0.14, "grad_norm": 0.21126438677310944, "learning_rate": 9.997374135921054e-05, "loss": 2.5677, "step": 19000 }, { "epoch": 0.14, "eval_loss": 2.5664379596710205, "eval_runtime": 5437.5739, "eval_samples_per_second": 1078.616, "eval_steps_per_second": 67.414, "step": 19000 }, { "epoch": 0.14, "grad_norm": 0.1988440304994583, "learning_rate": 9.997343149802186e-05, "loss": 2.5667, "step": 19100 }, { "epoch": 0.14, "grad_norm": 0.2025243192911148, "learning_rate": 9.997311981979155e-05, "loss": 2.5674, "step": 19200 }, { "epoch": 0.14, "grad_norm": 0.15592768788337708, "learning_rate": 9.997280632453097e-05, "loss": 2.5679, "step": 19300 }, { "epoch": 0.14, "grad_norm": 0.18881608545780182, "learning_rate": 9.997249101225153e-05, "loss": 2.5664, "step": 19400 }, { "epoch": 0.14, "grad_norm": 0.2082224190235138, "learning_rate": 9.997217706325169e-05, "loss": 2.5653, "step": 19500 }, { "epoch": 0.14, "grad_norm": 0.20503577589988708, "learning_rate": 9.997185813513884e-05, "loss": 2.5675, "step": 19600 }, { "epoch": 0.14, "grad_norm": 0.157631978392601, "learning_rate": 9.997153739004159e-05, "loss": 2.5655, "step": 19700 }, { "epoch": 0.14, "grad_norm": 0.1750982105731964, "learning_rate": 9.997121482797162e-05, "loss": 2.566, "step": 19800 }, { "epoch": 0.14, "grad_norm": 0.2016943246126175, "learning_rate": 9.997089044894064e-05, "loss": 2.564, "step": 19900 }, { "epoch": 0.15, "grad_norm": 0.23606260120868683, "learning_rate": 9.997056425296043e-05, "loss": 2.5643, "step": 20000 }, { "epoch": 0.15, "eval_loss": 2.564971446990967, "eval_runtime": 5311.9496, "eval_samples_per_second": 1104.124, "eval_steps_per_second": 69.008, "step": 20000 }, { "epoch": 0.15, "grad_norm": 0.21572504937648773, "learning_rate": 9.997023624004287e-05, "loss": 2.5655, "step": 20100 }, { "epoch": 0.15, "grad_norm": 0.1901276409626007, "learning_rate": 9.996990641019987e-05, "loss": 2.5646, "step": 20200 }, { "epoch": 0.15, "grad_norm": 0.1661011129617691, "learning_rate": 9.996957476344345e-05, "loss": 2.5656, "step": 20300 }, { "epoch": 0.15, "grad_norm": 0.1857927441596985, "learning_rate": 9.996924129978566e-05, "loss": 2.5641, "step": 20400 }, { "epoch": 0.15, "grad_norm": 0.17554591596126556, "learning_rate": 9.996890601923861e-05, "loss": 2.565, "step": 20500 }, { "epoch": 0.15, "grad_norm": 0.19168546795845032, "learning_rate": 9.99685689218145e-05, "loss": 2.5634, "step": 20600 }, { "epoch": 0.15, "grad_norm": 0.24985399842262268, "learning_rate": 9.996823000752557e-05, "loss": 2.5645, "step": 20700 }, { "epoch": 0.15, "grad_norm": 0.17692847549915314, "learning_rate": 9.996788927638418e-05, "loss": 2.5623, "step": 20800 }, { "epoch": 0.15, "grad_norm": 0.19862769544124603, "learning_rate": 9.996754672840269e-05, "loss": 2.5637, "step": 20900 }, { "epoch": 0.15, "grad_norm": 0.15231578052043915, "learning_rate": 9.996720236359356e-05, "loss": 2.5629, "step": 21000 }, { "epoch": 0.15, "eval_loss": 2.5632758140563965, "eval_runtime": 5223.2614, "eval_samples_per_second": 1122.872, "eval_steps_per_second": 70.18, "step": 21000 }, { "epoch": 0.15, "grad_norm": 0.15703310072422028, "learning_rate": 9.996685618196933e-05, "loss": 2.5647, "step": 21100 }, { "epoch": 0.15, "grad_norm": 0.21033476293087006, "learning_rate": 9.996650818354254e-05, "loss": 2.5616, "step": 21200 }, { "epoch": 0.15, "grad_norm": 0.17880433797836304, "learning_rate": 9.99661583683259e-05, "loss": 2.5632, "step": 21300 }, { "epoch": 0.16, "grad_norm": 0.17895223200321198, "learning_rate": 9.996580673633208e-05, "loss": 2.5612, "step": 21400 }, { "epoch": 0.16, "grad_norm": 0.1816408783197403, "learning_rate": 9.996545683105445e-05, "loss": 2.5627, "step": 21500 }, { "epoch": 0.16, "grad_norm": 0.17461837828159332, "learning_rate": 9.996510158371221e-05, "loss": 2.5622, "step": 21600 }, { "epoch": 0.16, "grad_norm": 0.22741451859474182, "learning_rate": 9.996474451963123e-05, "loss": 2.5626, "step": 21700 }, { "epoch": 0.16, "grad_norm": 0.2556922733783722, "learning_rate": 9.996438563882451e-05, "loss": 2.5627, "step": 21800 }, { "epoch": 0.16, "grad_norm": 0.18055401742458344, "learning_rate": 9.99640249413051e-05, "loss": 2.5623, "step": 21900 }, { "epoch": 0.16, "grad_norm": 0.1370435506105423, "learning_rate": 9.996366242708609e-05, "loss": 2.5629, "step": 22000 }, { "epoch": 0.16, "eval_loss": 2.5613725185394287, "eval_runtime": 5233.9822, "eval_samples_per_second": 1120.572, "eval_steps_per_second": 70.036, "step": 22000 }, { "epoch": 0.16, "grad_norm": 0.150621235370636, "learning_rate": 9.996329809618068e-05, "loss": 2.5595, "step": 22100 }, { "epoch": 0.16, "grad_norm": 0.23477642238140106, "learning_rate": 9.996293194860211e-05, "loss": 2.561, "step": 22200 }, { "epoch": 0.16, "grad_norm": 0.20428013801574707, "learning_rate": 9.996256398436372e-05, "loss": 2.5589, "step": 22300 }, { "epoch": 0.16, "grad_norm": 0.18161137402057648, "learning_rate": 9.996219420347885e-05, "loss": 2.5618, "step": 22400 }, { "epoch": 0.16, "grad_norm": 0.18925230205059052, "learning_rate": 9.996182260596096e-05, "loss": 2.5579, "step": 22500 }, { "epoch": 0.16, "grad_norm": 0.20608845353126526, "learning_rate": 9.996144919182355e-05, "loss": 2.5592, "step": 22600 }, { "epoch": 0.17, "grad_norm": 0.19250337779521942, "learning_rate": 9.996107396108024e-05, "loss": 2.5596, "step": 22700 }, { "epoch": 0.17, "grad_norm": 0.2164393663406372, "learning_rate": 9.996069691374462e-05, "loss": 2.5596, "step": 22800 }, { "epoch": 0.17, "grad_norm": 0.16476331651210785, "learning_rate": 9.996031804983043e-05, "loss": 2.5606, "step": 22900 }, { "epoch": 0.17, "grad_norm": 0.15099835395812988, "learning_rate": 9.995993736935145e-05, "loss": 2.5612, "step": 23000 }, { "epoch": 0.17, "eval_loss": 2.5598702430725098, "eval_runtime": 6082.4008, "eval_samples_per_second": 964.266, "eval_steps_per_second": 60.267, "step": 23000 }, { "epoch": 0.17, "grad_norm": 0.1863376796245575, "learning_rate": 9.99595548723215e-05, "loss": 2.5593, "step": 23100 }, { "epoch": 0.17, "grad_norm": 0.19632737338542938, "learning_rate": 9.99591705587545e-05, "loss": 2.5594, "step": 23200 }, { "epoch": 0.17, "grad_norm": 0.18330197036266327, "learning_rate": 9.995878442866442e-05, "loss": 2.5598, "step": 23300 }, { "epoch": 0.17, "grad_norm": 0.23508641123771667, "learning_rate": 9.995839648206531e-05, "loss": 2.5579, "step": 23400 }, { "epoch": 0.17, "grad_norm": 0.2019587904214859, "learning_rate": 9.99580106255938e-05, "loss": 2.5584, "step": 23500 }, { "epoch": 0.17, "grad_norm": 0.20707851648330688, "learning_rate": 9.995761906418372e-05, "loss": 2.5589, "step": 23600 }, { "epoch": 0.17, "grad_norm": 0.16306054592132568, "learning_rate": 9.995722568630698e-05, "loss": 2.5576, "step": 23700 }, { "epoch": 0.17, "grad_norm": 0.21417422592639923, "learning_rate": 9.995683049197788e-05, "loss": 2.5574, "step": 23800 }, { "epoch": 0.17, "grad_norm": 0.21078960597515106, "learning_rate": 9.99564334812108e-05, "loss": 2.5588, "step": 23900 }, { "epoch": 0.17, "grad_norm": 0.17299005389213562, "learning_rate": 9.995603465402014e-05, "loss": 2.5567, "step": 24000 }, { "epoch": 0.17, "eval_loss": 2.5584349632263184, "eval_runtime": 5391.8597, "eval_samples_per_second": 1087.761, "eval_steps_per_second": 67.985, "step": 24000 }, { "epoch": 0.18, "grad_norm": 0.1550675630569458, "learning_rate": 9.995563401042043e-05, "loss": 2.5576, "step": 24100 }, { "epoch": 0.18, "grad_norm": 0.21063561737537384, "learning_rate": 9.995523155042623e-05, "loss": 2.5585, "step": 24200 }, { "epoch": 0.18, "grad_norm": 0.19711926579475403, "learning_rate": 9.995482727405219e-05, "loss": 2.5589, "step": 24300 }, { "epoch": 0.18, "grad_norm": 0.19466377794742584, "learning_rate": 9.995442118131297e-05, "loss": 2.5579, "step": 24400 }, { "epoch": 0.18, "grad_norm": 0.1623954474925995, "learning_rate": 9.995401327222338e-05, "loss": 2.5577, "step": 24500 }, { "epoch": 0.18, "grad_norm": 0.18352490663528442, "learning_rate": 9.995360354679822e-05, "loss": 2.5583, "step": 24600 }, { "epoch": 0.18, "grad_norm": 0.17624753713607788, "learning_rate": 9.99531920050524e-05, "loss": 2.5577, "step": 24700 }, { "epoch": 0.18, "grad_norm": 0.15336942672729492, "learning_rate": 9.995277864700089e-05, "loss": 2.5578, "step": 24800 }, { "epoch": 0.18, "grad_norm": 0.17434526979923248, "learning_rate": 9.99523634726587e-05, "loss": 2.5569, "step": 24900 }, { "epoch": 0.18, "grad_norm": 0.21592725813388824, "learning_rate": 9.995194648204095e-05, "loss": 2.5543, "step": 25000 }, { "epoch": 0.18, "eval_loss": 2.5564043521881104, "eval_runtime": 5516.1995, "eval_samples_per_second": 1063.242, "eval_steps_per_second": 66.453, "step": 25000 }, { "epoch": 0.18, "grad_norm": 0.15827420353889465, "learning_rate": 9.99515276751628e-05, "loss": 2.5554, "step": 25100 }, { "epoch": 0.18, "grad_norm": 0.17489619553089142, "learning_rate": 9.995110705203945e-05, "loss": 2.5566, "step": 25200 }, { "epoch": 0.18, "grad_norm": 0.18893638253211975, "learning_rate": 9.995068461268622e-05, "loss": 2.5561, "step": 25300 }, { "epoch": 0.18, "grad_norm": 0.19673244655132294, "learning_rate": 9.995026035711845e-05, "loss": 2.5569, "step": 25400 }, { "epoch": 0.19, "grad_norm": 0.18465718626976013, "learning_rate": 9.994983855505939e-05, "loss": 2.5554, "step": 25500 }, { "epoch": 0.19, "grad_norm": 0.1976306438446045, "learning_rate": 9.994941068527068e-05, "loss": 2.5561, "step": 25600 }, { "epoch": 0.19, "grad_norm": 0.1467328816652298, "learning_rate": 9.994898099931376e-05, "loss": 2.5542, "step": 25700 }, { "epoch": 0.19, "grad_norm": 0.20342576503753662, "learning_rate": 9.994854949720426e-05, "loss": 2.555, "step": 25800 }, { "epoch": 0.19, "grad_norm": 0.20438461005687714, "learning_rate": 9.994811617895786e-05, "loss": 2.5553, "step": 25900 }, { "epoch": 0.19, "grad_norm": 0.16016176342964172, "learning_rate": 9.994768104459032e-05, "loss": 2.5568, "step": 26000 }, { "epoch": 0.19, "eval_loss": 2.554506778717041, "eval_runtime": 5432.0955, "eval_samples_per_second": 1079.704, "eval_steps_per_second": 67.482, "step": 26000 }, { "epoch": 0.19, "grad_norm": 0.14238539338111877, "learning_rate": 9.994724409411746e-05, "loss": 2.5552, "step": 26100 }, { "epoch": 0.19, "grad_norm": 0.23573274910449982, "learning_rate": 9.994680532755518e-05, "loss": 2.5523, "step": 26200 }, { "epoch": 0.19, "grad_norm": 0.19863677024841309, "learning_rate": 9.994636474491942e-05, "loss": 2.5529, "step": 26300 }, { "epoch": 0.19, "grad_norm": 0.21995621919631958, "learning_rate": 9.994592234622619e-05, "loss": 2.5538, "step": 26400 }, { "epoch": 0.19, "grad_norm": 0.22063715755939484, "learning_rate": 9.99454781314916e-05, "loss": 2.5529, "step": 26500 }, { "epoch": 0.19, "grad_norm": 0.1817554086446762, "learning_rate": 9.99450321007318e-05, "loss": 2.5542, "step": 26600 }, { "epoch": 0.19, "grad_norm": 0.16820061206817627, "learning_rate": 9.994458425396298e-05, "loss": 2.5532, "step": 26700 }, { "epoch": 0.19, "grad_norm": 0.22539284825325012, "learning_rate": 9.994413459120146e-05, "loss": 2.5546, "step": 26800 }, { "epoch": 0.2, "grad_norm": 0.17431499063968658, "learning_rate": 9.994368311246356e-05, "loss": 2.5535, "step": 26900 }, { "epoch": 0.2, "grad_norm": 0.21042422950267792, "learning_rate": 9.99432298177657e-05, "loss": 2.554, "step": 27000 }, { "epoch": 0.2, "eval_loss": 2.5529098510742188, "eval_runtime": 7939.2552, "eval_samples_per_second": 738.741, "eval_steps_per_second": 46.171, "step": 27000 }, { "epoch": 0.2, "grad_norm": 0.18547575175762177, "learning_rate": 9.994277470712439e-05, "loss": 2.5533, "step": 27100 }, { "epoch": 0.2, "grad_norm": 0.17700788378715515, "learning_rate": 9.994231778055614e-05, "loss": 2.553, "step": 27200 }, { "epoch": 0.2, "grad_norm": 0.2162560373544693, "learning_rate": 9.99418590380776e-05, "loss": 2.5531, "step": 27300 }, { "epoch": 0.2, "grad_norm": 0.17806923389434814, "learning_rate": 9.99413984797054e-05, "loss": 2.5508, "step": 27400 }, { "epoch": 0.2, "grad_norm": 0.1959153264760971, "learning_rate": 9.994094073818738e-05, "loss": 2.5521, "step": 27500 }, { "epoch": 0.2, "grad_norm": 0.2098911553621292, "learning_rate": 9.994047656623675e-05, "loss": 2.5518, "step": 27600 }, { "epoch": 0.2, "grad_norm": 0.17420220375061035, "learning_rate": 9.994001057844278e-05, "loss": 2.5508, "step": 27700 }, { "epoch": 0.2, "grad_norm": 0.1538473218679428, "learning_rate": 9.993954277482238e-05, "loss": 2.5506, "step": 27800 }, { "epoch": 0.2, "grad_norm": 0.1710011512041092, "learning_rate": 9.993907315539257e-05, "loss": 2.5509, "step": 27900 }, { "epoch": 0.2, "grad_norm": 0.14763277769088745, "learning_rate": 9.993860172017044e-05, "loss": 2.5514, "step": 28000 }, { "epoch": 0.2, "eval_loss": 2.5512099266052246, "eval_runtime": 9244.95, "eval_samples_per_second": 634.406, "eval_steps_per_second": 39.65, "step": 28000 }, { "epoch": 0.2, "grad_norm": 0.24788980185985565, "learning_rate": 9.99381284691731e-05, "loss": 2.5507, "step": 28100 }, { "epoch": 0.21, "grad_norm": 0.16047827899456024, "learning_rate": 9.993765340241779e-05, "loss": 2.5504, "step": 28200 }, { "epoch": 0.21, "grad_norm": 0.2097485363483429, "learning_rate": 9.993717651992176e-05, "loss": 2.5496, "step": 28300 }, { "epoch": 0.21, "grad_norm": 0.15989455580711365, "learning_rate": 9.993669782170236e-05, "loss": 2.5511, "step": 28400 }, { "epoch": 0.21, "grad_norm": 0.1990625262260437, "learning_rate": 9.9936217307777e-05, "loss": 2.5491, "step": 28500 }, { "epoch": 0.21, "grad_norm": 0.17394055426120758, "learning_rate": 9.993573497816314e-05, "loss": 2.5503, "step": 28600 }, { "epoch": 0.21, "grad_norm": 0.17688289284706116, "learning_rate": 9.993525083287832e-05, "loss": 2.5487, "step": 28700 }, { "epoch": 0.21, "grad_norm": 0.20647858083248138, "learning_rate": 9.993476487194015e-05, "loss": 2.5502, "step": 28800 }, { "epoch": 0.21, "grad_norm": 0.17351941764354706, "learning_rate": 9.99342770953663e-05, "loss": 2.5503, "step": 28900 }, { "epoch": 0.21, "grad_norm": 0.16259269416332245, "learning_rate": 9.993379240808367e-05, "loss": 2.5505, "step": 29000 }, { "epoch": 0.21, "eval_loss": 2.5498733520507812, "eval_runtime": 8094.52, "eval_samples_per_second": 724.571, "eval_steps_per_second": 45.286, "step": 29000 }, { "epoch": 0.21, "grad_norm": 0.152243971824646, "learning_rate": 9.993330101844764e-05, "loss": 2.5494, "step": 29100 }, { "epoch": 0.21, "grad_norm": 0.21322308480739594, "learning_rate": 9.993280781322914e-05, "loss": 2.5495, "step": 29200 }, { "epoch": 0.21, "grad_norm": 0.17743776738643646, "learning_rate": 9.993231279244612e-05, "loss": 2.55, "step": 29300 }, { "epoch": 0.21, "grad_norm": 0.19075080752372742, "learning_rate": 9.993181595611659e-05, "loss": 2.5477, "step": 29400 }, { "epoch": 0.21, "grad_norm": 0.14956378936767578, "learning_rate": 9.993131730425858e-05, "loss": 2.5499, "step": 29500 }, { "epoch": 0.22, "grad_norm": 0.1913817673921585, "learning_rate": 9.993081683689026e-05, "loss": 2.5499, "step": 29600 }, { "epoch": 0.22, "grad_norm": 0.1898573786020279, "learning_rate": 9.99303145540298e-05, "loss": 2.5478, "step": 29700 }, { "epoch": 0.22, "grad_norm": 0.18223215639591217, "learning_rate": 9.992981045569545e-05, "loss": 2.5497, "step": 29800 }, { "epoch": 0.22, "grad_norm": 0.17807921767234802, "learning_rate": 9.992930454190558e-05, "loss": 2.5482, "step": 29900 }, { "epoch": 0.22, "grad_norm": 0.19245785474777222, "learning_rate": 9.992879681267855e-05, "loss": 2.5477, "step": 30000 }, { "epoch": 0.22, "eval_loss": 2.5480847358703613, "eval_runtime": 5418.7874, "eval_samples_per_second": 1082.355, "eval_steps_per_second": 67.647, "step": 30000 }, { "epoch": 0.22, "grad_norm": 0.16644108295440674, "learning_rate": 9.992828726803284e-05, "loss": 2.5462, "step": 30100 }, { "epoch": 0.22, "grad_norm": 0.21199798583984375, "learning_rate": 9.992777590798698e-05, "loss": 2.5472, "step": 30200 }, { "epoch": 0.22, "grad_norm": 0.16835100948810577, "learning_rate": 9.992726273255957e-05, "loss": 2.5492, "step": 30300 }, { "epoch": 0.22, "grad_norm": 0.24301642179489136, "learning_rate": 9.992674774176924e-05, "loss": 2.547, "step": 30400 }, { "epoch": 0.22, "grad_norm": 0.1855507344007492, "learning_rate": 9.992623093563473e-05, "loss": 2.5482, "step": 30500 }, { "epoch": 0.22, "grad_norm": 0.20825399458408356, "learning_rate": 9.992571231417482e-05, "loss": 2.5472, "step": 30600 }, { "epoch": 0.22, "grad_norm": 0.18983405828475952, "learning_rate": 9.99251918774084e-05, "loss": 2.5456, "step": 30700 }, { "epoch": 0.22, "grad_norm": 0.19868837296962738, "learning_rate": 9.992466962535437e-05, "loss": 2.5472, "step": 30800 }, { "epoch": 0.22, "grad_norm": 0.217344731092453, "learning_rate": 9.99241455580317e-05, "loss": 2.5468, "step": 30900 }, { "epoch": 0.23, "grad_norm": 0.21630564332008362, "learning_rate": 9.992361967545949e-05, "loss": 2.5461, "step": 31000 }, { "epoch": 0.23, "eval_loss": 2.5460619926452637, "eval_runtime": 5334.7258, "eval_samples_per_second": 1099.41, "eval_steps_per_second": 68.713, "step": 31000 }, { "epoch": 0.23, "grad_norm": 0.20726899802684784, "learning_rate": 9.992309197765681e-05, "loss": 2.5479, "step": 31100 }, { "epoch": 0.23, "grad_norm": 0.20710667967796326, "learning_rate": 9.99225624646429e-05, "loss": 2.5472, "step": 31200 }, { "epoch": 0.23, "grad_norm": 0.23984268307685852, "learning_rate": 9.992203113643699e-05, "loss": 2.5449, "step": 31300 }, { "epoch": 0.23, "grad_norm": 0.1859433650970459, "learning_rate": 9.992149799305838e-05, "loss": 2.5456, "step": 31400 }, { "epoch": 0.23, "grad_norm": 0.19020648300647736, "learning_rate": 9.992096303452647e-05, "loss": 2.5472, "step": 31500 }, { "epoch": 0.23, "grad_norm": 0.18619538843631744, "learning_rate": 9.992042626086073e-05, "loss": 2.5446, "step": 31600 }, { "epoch": 0.23, "grad_norm": 0.15984103083610535, "learning_rate": 9.991988767208065e-05, "loss": 2.5429, "step": 31700 }, { "epoch": 0.23, "grad_norm": 0.1874154657125473, "learning_rate": 9.991934726820583e-05, "loss": 2.5457, "step": 31800 }, { "epoch": 0.23, "grad_norm": 0.21573562920093536, "learning_rate": 9.99188050492559e-05, "loss": 2.5451, "step": 31900 }, { "epoch": 0.23, "grad_norm": 0.20766520500183105, "learning_rate": 9.991826101525059e-05, "loss": 2.5457, "step": 32000 }, { "epoch": 0.23, "eval_loss": 2.5444440841674805, "eval_runtime": 5469.9575, "eval_samples_per_second": 1072.23, "eval_steps_per_second": 67.014, "step": 32000 }, { "epoch": 0.23, "grad_norm": 0.1947011798620224, "learning_rate": 9.991771516620969e-05, "loss": 2.5439, "step": 32100 }, { "epoch": 0.23, "grad_norm": 0.20974138379096985, "learning_rate": 9.991716750215303e-05, "loss": 2.5455, "step": 32200 }, { "epoch": 0.23, "grad_norm": 0.20095574855804443, "learning_rate": 9.991661802310053e-05, "loss": 2.544, "step": 32300 }, { "epoch": 0.24, "grad_norm": 0.15232567489147186, "learning_rate": 9.991606672907218e-05, "loss": 2.5447, "step": 32400 }, { "epoch": 0.24, "grad_norm": 0.1890016496181488, "learning_rate": 9.99155191601618e-05, "loss": 2.5438, "step": 32500 }, { "epoch": 0.24, "grad_norm": 0.15099911391735077, "learning_rate": 9.991496425439117e-05, "loss": 2.5426, "step": 32600 }, { "epoch": 0.24, "grad_norm": 0.19564999639987946, "learning_rate": 9.991440753370483e-05, "loss": 2.5433, "step": 32700 }, { "epoch": 0.24, "grad_norm": 0.23213982582092285, "learning_rate": 9.991384899812299e-05, "loss": 2.5451, "step": 32800 }, { "epoch": 0.24, "grad_norm": 0.21088984608650208, "learning_rate": 9.991328864766595e-05, "loss": 2.5423, "step": 32900 }, { "epoch": 0.24, "grad_norm": 0.2080707848072052, "learning_rate": 9.991272648235412e-05, "loss": 2.542, "step": 33000 }, { "epoch": 0.24, "eval_loss": 2.543639898300171, "eval_runtime": 5500.5758, "eval_samples_per_second": 1066.262, "eval_steps_per_second": 66.641, "step": 33000 }, { "epoch": 0.24, "grad_norm": 0.20043937861919403, "learning_rate": 9.991216250220794e-05, "loss": 2.5434, "step": 33100 }, { "epoch": 0.24, "grad_norm": 0.26814088225364685, "learning_rate": 9.991159670724789e-05, "loss": 2.5439, "step": 33200 }, { "epoch": 0.24, "grad_norm": 0.1790783852338791, "learning_rate": 9.991102909749455e-05, "loss": 2.5422, "step": 33300 }, { "epoch": 0.24, "grad_norm": 0.17718929052352905, "learning_rate": 9.991045967296856e-05, "loss": 2.5414, "step": 33400 }, { "epoch": 0.24, "grad_norm": 0.19124653935432434, "learning_rate": 9.990988843369065e-05, "loss": 2.5409, "step": 33500 }, { "epoch": 0.24, "grad_norm": 0.19628183543682098, "learning_rate": 9.990931537968155e-05, "loss": 2.5423, "step": 33600 }, { "epoch": 0.25, "grad_norm": 0.2127145528793335, "learning_rate": 9.990874051096211e-05, "loss": 2.5451, "step": 33700 }, { "epoch": 0.25, "grad_norm": 0.19408264756202698, "learning_rate": 9.990816382755324e-05, "loss": 2.5405, "step": 33800 }, { "epoch": 0.25, "grad_norm": 0.1892174333333969, "learning_rate": 9.99075853294759e-05, "loss": 2.5428, "step": 33900 }, { "epoch": 0.25, "grad_norm": 0.20231932401657104, "learning_rate": 9.990700501675114e-05, "loss": 2.5424, "step": 34000 }, { "epoch": 0.25, "eval_loss": 2.5417420864105225, "eval_runtime": 5451.1377, "eval_samples_per_second": 1075.932, "eval_steps_per_second": 67.246, "step": 34000 }, { "epoch": 0.25, "grad_norm": 0.17818154394626617, "learning_rate": 9.990642288940005e-05, "loss": 2.5406, "step": 34100 }, { "epoch": 0.25, "grad_norm": 0.20383848249912262, "learning_rate": 9.990583894744378e-05, "loss": 2.5414, "step": 34200 }, { "epoch": 0.25, "grad_norm": 0.19464430212974548, "learning_rate": 9.99052531909036e-05, "loss": 2.5413, "step": 34300 }, { "epoch": 0.25, "grad_norm": 0.17793488502502441, "learning_rate": 9.990466561980076e-05, "loss": 2.5421, "step": 34400 }, { "epoch": 0.25, "grad_norm": 0.2119537591934204, "learning_rate": 9.990407623415668e-05, "loss": 2.54, "step": 34500 }, { "epoch": 0.25, "grad_norm": 0.20085354149341583, "learning_rate": 9.990348503399273e-05, "loss": 2.5395, "step": 34600 }, { "epoch": 0.25, "grad_norm": 0.18408524990081787, "learning_rate": 9.990289201933045e-05, "loss": 2.5395, "step": 34700 }, { "epoch": 0.25, "grad_norm": 0.18326416611671448, "learning_rate": 9.990229719019137e-05, "loss": 2.5404, "step": 34800 }, { "epoch": 0.25, "grad_norm": 0.2630787193775177, "learning_rate": 9.990170054659715e-05, "loss": 2.5404, "step": 34900 }, { "epoch": 0.25, "grad_norm": 0.23082153499126434, "learning_rate": 9.990110208856948e-05, "loss": 2.5402, "step": 35000 }, { "epoch": 0.25, "eval_loss": 2.540555477142334, "eval_runtime": 5286.1621, "eval_samples_per_second": 1109.511, "eval_steps_per_second": 69.344, "step": 35000 }, { "epoch": 0.26, "grad_norm": 0.2101873904466629, "learning_rate": 9.990050181613012e-05, "loss": 2.5409, "step": 35100 }, { "epoch": 0.26, "grad_norm": 0.1748313456773758, "learning_rate": 9.989989972930086e-05, "loss": 2.5408, "step": 35200 }, { "epoch": 0.26, "grad_norm": 0.21641358733177185, "learning_rate": 9.989929582810362e-05, "loss": 2.5407, "step": 35300 }, { "epoch": 0.26, "grad_norm": 0.20202140510082245, "learning_rate": 9.989869011256037e-05, "loss": 2.5419, "step": 35400 }, { "epoch": 0.26, "grad_norm": 0.1987978219985962, "learning_rate": 9.989808258269311e-05, "loss": 2.54, "step": 35500 }, { "epoch": 0.26, "grad_norm": 0.17482729256153107, "learning_rate": 9.989747323852394e-05, "loss": 2.5398, "step": 35600 }, { "epoch": 0.26, "grad_norm": 0.15725675225257874, "learning_rate": 9.989686208007502e-05, "loss": 2.5384, "step": 35700 }, { "epoch": 0.26, "grad_norm": Infinity, "learning_rate": 9.989625524607613e-05, "loss": 2.54, "step": 35800 }, { "epoch": 0.26, "grad_norm": 0.19513057172298431, "learning_rate": 9.989564047727667e-05, "loss": 2.541, "step": 35900 }, { "epoch": 0.26, "grad_norm": 0.24705880880355835, "learning_rate": 9.989502389426411e-05, "loss": 2.5398, "step": 36000 }, { "epoch": 0.26, "eval_loss": 2.5393338203430176, "eval_runtime": 5370.7521, "eval_samples_per_second": 1092.036, "eval_steps_per_second": 68.252, "step": 36000 }, { "epoch": 0.26, "grad_norm": 0.17639793455600739, "learning_rate": 9.989440549706085e-05, "loss": 2.5398, "step": 36100 }, { "epoch": 0.26, "grad_norm": 0.23090311884880066, "learning_rate": 9.989378528568935e-05, "loss": 2.5399, "step": 36200 }, { "epoch": 0.26, "grad_norm": 0.21377325057983398, "learning_rate": 9.989316326017221e-05, "loss": 2.5351, "step": 36300 }, { "epoch": 0.26, "grad_norm": 0.21422795951366425, "learning_rate": 9.989253942053204e-05, "loss": 2.5381, "step": 36400 }, { "epoch": 0.27, "grad_norm": 0.20974934101104736, "learning_rate": 9.98919137667915e-05, "loss": 2.5378, "step": 36500 }, { "epoch": 0.27, "grad_norm": 0.19352389872074127, "learning_rate": 9.989128629897335e-05, "loss": 2.5378, "step": 36600 }, { "epoch": 0.27, "grad_norm": 0.21016819775104523, "learning_rate": 9.989065701710041e-05, "loss": 2.5366, "step": 36700 }, { "epoch": 0.27, "grad_norm": 0.1750701367855072, "learning_rate": 9.989002592119554e-05, "loss": 2.5399, "step": 36800 }, { "epoch": 0.27, "grad_norm": 0.18955004215240479, "learning_rate": 9.988939301128171e-05, "loss": 2.5411, "step": 36900 }, { "epoch": 0.27, "grad_norm": 0.22183337807655334, "learning_rate": 9.988875828738192e-05, "loss": 2.5385, "step": 37000 }, { "epoch": 0.27, "eval_loss": 2.537937879562378, "eval_runtime": 5267.1428, "eval_samples_per_second": 1113.517, "eval_steps_per_second": 69.595, "step": 37000 }, { "epoch": 0.27, "grad_norm": 0.17703290283679962, "learning_rate": 9.988812174951926e-05, "loss": 2.5386, "step": 37100 }, { "epoch": 0.27, "grad_norm": 0.17456910014152527, "learning_rate": 9.988748339771686e-05, "loss": 2.536, "step": 37200 }, { "epoch": 0.27, "grad_norm": 0.1729470044374466, "learning_rate": 9.988684323199795e-05, "loss": 2.5367, "step": 37300 }, { "epoch": 0.27, "grad_norm": 0.14801037311553955, "learning_rate": 9.988620125238578e-05, "loss": 2.5365, "step": 37400 }, { "epoch": 0.27, "grad_norm": 0.19737888872623444, "learning_rate": 9.988555745890371e-05, "loss": 2.5373, "step": 37500 }, { "epoch": 0.27, "grad_norm": 0.2556416392326355, "learning_rate": 9.988491185157514e-05, "loss": 2.5389, "step": 37600 }, { "epoch": 0.27, "grad_norm": 0.22999688982963562, "learning_rate": 9.988426443042357e-05, "loss": 2.5366, "step": 37700 }, { "epoch": 0.27, "grad_norm": 0.20399746298789978, "learning_rate": 9.988361519547252e-05, "loss": 2.5375, "step": 37800 }, { "epoch": 0.28, "grad_norm": 0.1886683702468872, "learning_rate": 9.988296414674556e-05, "loss": 2.5378, "step": 37900 }, { "epoch": 0.28, "grad_norm": 0.15870767831802368, "learning_rate": 9.988231128426643e-05, "loss": 2.5396, "step": 38000 }, { "epoch": 0.28, "eval_loss": 2.536235809326172, "eval_runtime": 5469.5004, "eval_samples_per_second": 1072.32, "eval_steps_per_second": 67.02, "step": 38000 }, { "epoch": 0.28, "grad_norm": 0.19297446310520172, "learning_rate": 9.988165660805883e-05, "loss": 2.5364, "step": 38100 }, { "epoch": 0.28, "grad_norm": 0.1729346513748169, "learning_rate": 9.988100011814657e-05, "loss": 2.5374, "step": 38200 }, { "epoch": 0.28, "grad_norm": 0.2072724550962448, "learning_rate": 9.988034181455352e-05, "loss": 2.5381, "step": 38300 }, { "epoch": 0.28, "grad_norm": 0.1756744384765625, "learning_rate": 9.987968169730362e-05, "loss": 2.5379, "step": 38400 }, { "epoch": 0.28, "grad_norm": 0.27329882979393005, "learning_rate": 9.987901976642086e-05, "loss": 2.5349, "step": 38500 }, { "epoch": 0.28, "grad_norm": 0.23222842812538147, "learning_rate": 9.987835602192934e-05, "loss": 2.5355, "step": 38600 }, { "epoch": 0.28, "grad_norm": 0.2461015284061432, "learning_rate": 9.987769046385316e-05, "loss": 2.5364, "step": 38700 }, { "epoch": 0.28, "grad_norm": 0.1758180409669876, "learning_rate": 9.987702309221651e-05, "loss": 2.5377, "step": 38800 }, { "epoch": 0.28, "grad_norm": 0.15205176174640656, "learning_rate": 9.987635390704369e-05, "loss": 2.5376, "step": 38900 }, { "epoch": 0.28, "grad_norm": 0.1985970139503479, "learning_rate": 9.987568290835903e-05, "loss": 2.5372, "step": 39000 }, { "epoch": 0.28, "eval_loss": 2.53525710105896, "eval_runtime": 5498.4571, "eval_samples_per_second": 1066.673, "eval_steps_per_second": 66.667, "step": 39000 }, { "epoch": 0.28, "grad_norm": 0.1800653338432312, "learning_rate": 9.987501009618691e-05, "loss": 2.5349, "step": 39100 }, { "epoch": 0.29, "grad_norm": 0.22484809160232544, "learning_rate": 9.987433547055178e-05, "loss": 2.5364, "step": 39200 }, { "epoch": 0.29, "grad_norm": 0.1974021941423416, "learning_rate": 9.98736590314782e-05, "loss": 2.5333, "step": 39300 }, { "epoch": 0.29, "grad_norm": 0.23238864541053772, "learning_rate": 9.987298077899076e-05, "loss": 2.5371, "step": 39400 }, { "epoch": 0.29, "grad_norm": 0.17493529617786407, "learning_rate": 9.987230071311411e-05, "loss": 2.5356, "step": 39500 }, { "epoch": 0.29, "grad_norm": 0.23257118463516235, "learning_rate": 9.987161883387299e-05, "loss": 2.5354, "step": 39600 }, { "epoch": 0.29, "grad_norm": 0.18204239010810852, "learning_rate": 9.987094198719394e-05, "loss": 2.5348, "step": 39700 }, { "epoch": 0.29, "grad_norm": 0.20585016906261444, "learning_rate": 9.987025649943133e-05, "loss": 2.5347, "step": 39800 }, { "epoch": 0.29, "grad_norm": 0.19951903820037842, "learning_rate": 9.986956919837858e-05, "loss": 2.5355, "step": 39900 }, { "epoch": 0.29, "grad_norm": 0.1689595878124237, "learning_rate": 9.986888008406065e-05, "loss": 2.535, "step": 40000 }, { "epoch": 0.29, "eval_loss": 2.534259796142578, "eval_runtime": 5327.044, "eval_samples_per_second": 1100.996, "eval_steps_per_second": 68.812, "step": 40000 }, { "epoch": 0.29, "grad_norm": 0.16131243109703064, "learning_rate": 9.986818915650265e-05, "loss": 2.5343, "step": 40100 }, { "epoch": 0.29, "grad_norm": 0.16442734003067017, "learning_rate": 9.986749641572963e-05, "loss": 2.5336, "step": 40200 }, { "epoch": 0.29, "grad_norm": 0.17911262810230255, "learning_rate": 9.986680186176684e-05, "loss": 2.534, "step": 40300 }, { "epoch": 0.29, "grad_norm": 0.19391265511512756, "learning_rate": 9.986610549463952e-05, "loss": 2.5344, "step": 40400 }, { "epoch": 0.29, "grad_norm": 0.21224987506866455, "learning_rate": 9.986540731437298e-05, "loss": 2.5362, "step": 40500 }, { "epoch": 0.3, "grad_norm": 0.21114754676818848, "learning_rate": 9.986470732099258e-05, "loss": 2.5344, "step": 40600 }, { "epoch": 0.3, "grad_norm": 0.17994599044322968, "learning_rate": 9.986400551452382e-05, "loss": 2.5338, "step": 40700 }, { "epoch": 0.3, "grad_norm": 0.20839715003967285, "learning_rate": 9.98633018949922e-05, "loss": 2.5327, "step": 40800 }, { "epoch": 0.3, "grad_norm": 0.1808551549911499, "learning_rate": 9.986259646242329e-05, "loss": 2.5323, "step": 40900 }, { "epoch": 0.3, "grad_norm": 0.20471493899822235, "learning_rate": 9.986188921684276e-05, "loss": 2.5318, "step": 41000 }, { "epoch": 0.3, "eval_loss": 2.5325772762298584, "eval_runtime": 5462.6174, "eval_samples_per_second": 1073.671, "eval_steps_per_second": 67.104, "step": 41000 }, { "epoch": 0.3, "grad_norm": 0.18955881893634796, "learning_rate": 9.986118015827632e-05, "loss": 2.5328, "step": 41100 }, { "epoch": 0.3, "grad_norm": 0.20864000916481018, "learning_rate": 9.986046928674974e-05, "loss": 2.5303, "step": 41200 }, { "epoch": 0.3, "grad_norm": 0.17366698384284973, "learning_rate": 9.985975660228888e-05, "loss": 2.5314, "step": 41300 }, { "epoch": 0.3, "grad_norm": 0.2348269373178482, "learning_rate": 9.985904210491963e-05, "loss": 2.5336, "step": 41400 }, { "epoch": 0.3, "grad_norm": 0.2203817069530487, "learning_rate": 9.985832579466801e-05, "loss": 2.5307, "step": 41500 }, { "epoch": 0.3, "grad_norm": 0.20355217158794403, "learning_rate": 9.985760767156003e-05, "loss": 2.5329, "step": 41600 }, { "epoch": 0.3, "grad_norm": 0.21894210577011108, "learning_rate": 9.985688773562183e-05, "loss": 2.5325, "step": 41700 }, { "epoch": 0.3, "grad_norm": 0.18119728565216064, "learning_rate": 9.985616598687954e-05, "loss": 2.5316, "step": 41800 }, { "epoch": 0.3, "grad_norm": 0.22831158339977264, "learning_rate": 9.985544242535946e-05, "loss": 2.5316, "step": 41900 }, { "epoch": 0.31, "grad_norm": 0.18714164197444916, "learning_rate": 9.985471705108787e-05, "loss": 2.5315, "step": 42000 }, { "epoch": 0.31, "eval_loss": 2.531712770462036, "eval_runtime": 5461.1677, "eval_samples_per_second": 1073.956, "eval_steps_per_second": 67.122, "step": 42000 }, { "epoch": 0.31, "grad_norm": 0.19056876003742218, "learning_rate": 9.985398986409115e-05, "loss": 2.5305, "step": 42100 }, { "epoch": 0.31, "grad_norm": 0.19371190667152405, "learning_rate": 9.985326086439573e-05, "loss": 2.5333, "step": 42200 }, { "epoch": 0.31, "grad_norm": 0.19493506848812103, "learning_rate": 9.985253005202813e-05, "loss": 2.5279, "step": 42300 }, { "epoch": 0.31, "grad_norm": 0.17453902959823608, "learning_rate": 9.985179742701491e-05, "loss": 2.5334, "step": 42400 }, { "epoch": 0.31, "grad_norm": 0.19747483730316162, "learning_rate": 9.985106298938272e-05, "loss": 2.5321, "step": 42500 }, { "epoch": 0.31, "grad_norm": 0.18142738938331604, "learning_rate": 9.985032673915826e-05, "loss": 2.5312, "step": 42600 }, { "epoch": 0.31, "grad_norm": 0.21015243232250214, "learning_rate": 9.984958867636828e-05, "loss": 2.5328, "step": 42700 }, { "epoch": 0.31, "grad_norm": 0.20541459321975708, "learning_rate": 9.984884880103966e-05, "loss": 2.5318, "step": 42800 }, { "epoch": 0.31, "grad_norm": 0.24034422636032104, "learning_rate": 9.984810711319927e-05, "loss": 2.5305, "step": 42900 }, { "epoch": 0.31, "grad_norm": 0.22423624992370605, "learning_rate": 9.98473636128741e-05, "loss": 2.5298, "step": 43000 }, { "epoch": 0.31, "eval_loss": 2.530484437942505, "eval_runtime": 5481.4821, "eval_samples_per_second": 1069.976, "eval_steps_per_second": 66.874, "step": 43000 }, { "epoch": 0.31, "grad_norm": 0.1877780258655548, "learning_rate": 9.984661830009115e-05, "loss": 2.5308, "step": 43100 }, { "epoch": 0.31, "grad_norm": 0.2150457501411438, "learning_rate": 9.984587117487755e-05, "loss": 2.5318, "step": 43200 }, { "epoch": 0.31, "grad_norm": 0.17595185339450836, "learning_rate": 9.984512223726045e-05, "loss": 2.5291, "step": 43300 }, { "epoch": 0.32, "grad_norm": 0.1872701346874237, "learning_rate": 9.984437148726708e-05, "loss": 2.5298, "step": 43400 }, { "epoch": 0.32, "grad_norm": 0.15969082713127136, "learning_rate": 9.984361892492475e-05, "loss": 2.5322, "step": 43500 }, { "epoch": 0.32, "grad_norm": 0.22320838272571564, "learning_rate": 9.984286455026082e-05, "loss": 2.5284, "step": 43600 }, { "epoch": 0.32, "grad_norm": 0.22105048596858978, "learning_rate": 9.984211593414306e-05, "loss": 2.529, "step": 43700 }, { "epoch": 0.32, "grad_norm": 0.18613219261169434, "learning_rate": 9.98413579530408e-05, "loss": 2.5306, "step": 43800 }, { "epoch": 0.32, "grad_norm": 0.21678748726844788, "learning_rate": 9.984059815969915e-05, "loss": 2.5296, "step": 43900 }, { "epoch": 0.32, "grad_norm": 0.19831928610801697, "learning_rate": 9.983983655414574e-05, "loss": 2.5296, "step": 44000 }, { "epoch": 0.32, "eval_loss": 2.5292155742645264, "eval_runtime": 5508.261, "eval_samples_per_second": 1064.774, "eval_steps_per_second": 66.548, "step": 44000 }, { "epoch": 0.32, "grad_norm": NaN, "learning_rate": 9.983908077955583e-05, "loss": 2.5294, "step": 44100 }, { "epoch": 0.32, "grad_norm": 0.18271589279174805, "learning_rate": 9.983831556778345e-05, "loss": 2.5312, "step": 44200 }, { "epoch": 0.32, "grad_norm": 0.24201270937919617, "learning_rate": 9.98375485438823e-05, "loss": 2.5303, "step": 44300 }, { "epoch": 0.32, "grad_norm": 0.18819460272789001, "learning_rate": 9.983677970788026e-05, "loss": 2.5285, "step": 44400 }, { "epoch": 0.32, "grad_norm": 0.16757667064666748, "learning_rate": 9.983600905980529e-05, "loss": 2.5289, "step": 44500 }, { "epoch": 0.32, "grad_norm": 0.16967040300369263, "learning_rate": 9.983523659968541e-05, "loss": 2.5306, "step": 44600 }, { "epoch": 0.33, "grad_norm": 0.19566090404987335, "learning_rate": 9.983446232754872e-05, "loss": 2.5276, "step": 44700 }, { "epoch": 0.33, "grad_norm": 0.19131731986999512, "learning_rate": 9.983368624342335e-05, "loss": 2.5304, "step": 44800 }, { "epoch": 0.33, "grad_norm": 0.1763111650943756, "learning_rate": 9.983290834733753e-05, "loss": 2.5281, "step": 44900 }, { "epoch": 0.33, "grad_norm": 0.18837742507457733, "learning_rate": 9.983212863931956e-05, "loss": 2.5301, "step": 45000 }, { "epoch": 0.33, "eval_loss": 2.5283169746398926, "eval_runtime": 5445.3147, "eval_samples_per_second": 1077.082, "eval_steps_per_second": 67.318, "step": 45000 }, { "epoch": 0.33, "grad_norm": 0.1776355654001236, "learning_rate": 9.983134711939777e-05, "loss": 2.5282, "step": 45100 }, { "epoch": 0.33, "grad_norm": 0.22749735414981842, "learning_rate": 9.983056378760059e-05, "loss": 2.5284, "step": 45200 }, { "epoch": 0.33, "grad_norm": 0.19231122732162476, "learning_rate": 9.982977864395649e-05, "loss": 2.5288, "step": 45300 }, { "epoch": 0.33, "grad_norm": 0.18644002079963684, "learning_rate": 9.9828991688494e-05, "loss": 2.5291, "step": 45400 }, { "epoch": 0.33, "grad_norm": 0.1956445276737213, "learning_rate": 9.982820292124177e-05, "loss": 2.5274, "step": 45500 }, { "epoch": 0.33, "grad_norm": 0.23224614560604095, "learning_rate": 9.982741234222848e-05, "loss": 2.5267, "step": 45600 }, { "epoch": 0.33, "grad_norm": 0.18513694405555725, "learning_rate": 9.982661995148284e-05, "loss": 2.5286, "step": 45700 }, { "epoch": 0.33, "grad_norm": 0.21743735671043396, "learning_rate": 9.982582574903369e-05, "loss": 2.5255, "step": 45800 }, { "epoch": 0.33, "grad_norm": 0.1804727017879486, "learning_rate": 9.982502973490989e-05, "loss": 2.5251, "step": 45900 }, { "epoch": 0.33, "grad_norm": 0.20864586532115936, "learning_rate": 9.982423190914041e-05, "loss": 2.5264, "step": 46000 }, { "epoch": 0.33, "eval_loss": 2.5271406173706055, "eval_runtime": 5458.9299, "eval_samples_per_second": 1074.396, "eval_steps_per_second": 67.15, "step": 46000 }, { "epoch": 0.34, "grad_norm": 0.2307577133178711, "learning_rate": 9.982343227175422e-05, "loss": 2.5278, "step": 46100 }, { "epoch": 0.34, "grad_norm": 0.20710083842277527, "learning_rate": 9.982263082278043e-05, "loss": 2.5277, "step": 46200 }, { "epoch": 0.34, "grad_norm": 0.1700790673494339, "learning_rate": 9.982182756224816e-05, "loss": 2.5279, "step": 46300 }, { "epoch": 0.34, "grad_norm": 0.2546616494655609, "learning_rate": 9.982102249018664e-05, "loss": 2.5257, "step": 46400 }, { "epoch": 0.34, "grad_norm": 0.2152370661497116, "learning_rate": 9.982021560662511e-05, "loss": 2.5245, "step": 46500 }, { "epoch": 0.34, "grad_norm": 0.177117258310318, "learning_rate": 9.981940691159294e-05, "loss": 2.5282, "step": 46600 }, { "epoch": 0.34, "grad_norm": 0.1751713901758194, "learning_rate": 9.98185964051195e-05, "loss": 2.5263, "step": 46700 }, { "epoch": 0.34, "grad_norm": 0.20219507813453674, "learning_rate": 9.981779221937954e-05, "loss": 2.5265, "step": 46800 }, { "epoch": 0.34, "grad_norm": 0.22525231540203094, "learning_rate": 9.981697810822577e-05, "loss": 2.5273, "step": 46900 }, { "epoch": 0.34, "grad_norm": 0.184763565659523, "learning_rate": 9.981616218571906e-05, "loss": 2.5277, "step": 47000 }, { "epoch": 0.34, "eval_loss": 2.526160955429077, "eval_runtime": 5512.6944, "eval_samples_per_second": 1063.918, "eval_steps_per_second": 66.495, "step": 47000 }, { "epoch": 0.34, "grad_norm": 0.21627645194530487, "learning_rate": 9.981534445188906e-05, "loss": 2.5277, "step": 47100 }, { "epoch": 0.34, "grad_norm": 0.19909755885601044, "learning_rate": 9.981452490676553e-05, "loss": 2.5279, "step": 47200 }, { "epoch": 0.34, "grad_norm": 0.17634861171245575, "learning_rate": 9.981370355037826e-05, "loss": 2.5262, "step": 47300 }, { "epoch": 0.34, "grad_norm": 0.19381621479988098, "learning_rate": 9.981288038275712e-05, "loss": 2.5266, "step": 47400 }, { "epoch": 0.35, "grad_norm": 0.20312373340129852, "learning_rate": 9.981205540393204e-05, "loss": 2.525, "step": 47500 }, { "epoch": 0.35, "grad_norm": 0.2033509761095047, "learning_rate": 9.9811228613933e-05, "loss": 2.5277, "step": 47600 }, { "epoch": 0.35, "grad_norm": 0.1845845878124237, "learning_rate": 9.981040001279007e-05, "loss": 2.5281, "step": 47700 }, { "epoch": 0.35, "grad_norm": 0.22188545763492584, "learning_rate": 9.98095696005334e-05, "loss": 2.5264, "step": 47800 }, { "epoch": 0.35, "grad_norm": 0.18713383376598358, "learning_rate": 9.980873737719315e-05, "loss": 2.5265, "step": 47900 }, { "epoch": 0.35, "grad_norm": 0.1767318844795227, "learning_rate": 9.980790334279959e-05, "loss": 2.5271, "step": 48000 }, { "epoch": 0.35, "eval_loss": 2.525097131729126, "eval_runtime": 5444.1367, "eval_samples_per_second": 1077.316, "eval_steps_per_second": 67.332, "step": 48000 }, { "epoch": 0.35, "grad_norm": 0.21088416874408722, "learning_rate": 9.980706749738306e-05, "loss": 2.5274, "step": 48100 }, { "epoch": 0.35, "grad_norm": 0.19877323508262634, "learning_rate": 9.980622984097396e-05, "loss": 2.5265, "step": 48200 }, { "epoch": 0.35, "grad_norm": 0.19006992876529694, "learning_rate": 9.98053903736027e-05, "loss": 2.5249, "step": 48300 }, { "epoch": 0.35, "grad_norm": 0.21013252437114716, "learning_rate": 9.980454909529985e-05, "loss": 2.5258, "step": 48400 }, { "epoch": 0.35, "grad_norm": 0.20572130382061005, "learning_rate": 9.980370600609597e-05, "loss": 2.525, "step": 48500 }, { "epoch": 0.35, "grad_norm": 0.1854819506406784, "learning_rate": 9.980286110602174e-05, "loss": 2.5267, "step": 48600 }, { "epoch": 0.35, "grad_norm": 0.18405982851982117, "learning_rate": 9.980201439510786e-05, "loss": 2.5266, "step": 48700 }, { "epoch": 0.36, "grad_norm": 0.1952805072069168, "learning_rate": 9.980116587338512e-05, "loss": 2.5257, "step": 48800 }, { "epoch": 0.36, "grad_norm": 0.19929055869579315, "learning_rate": 9.980031554088438e-05, "loss": 2.5236, "step": 48900 }, { "epoch": 0.36, "grad_norm": 0.19686761498451233, "learning_rate": 9.979946339763656e-05, "loss": 2.5265, "step": 49000 }, { "epoch": 0.36, "eval_loss": 2.5248045921325684, "eval_runtime": 5428.5128, "eval_samples_per_second": 1080.416, "eval_steps_per_second": 67.526, "step": 49000 }, { "epoch": 0.36, "grad_norm": 0.19359131157398224, "learning_rate": 9.979860944367263e-05, "loss": 2.525, "step": 49100 }, { "epoch": 0.36, "grad_norm": 0.1961933672428131, "learning_rate": 9.979776224563294e-05, "loss": 2.5269, "step": 49200 }, { "epoch": 0.36, "grad_norm": 0.21087850630283356, "learning_rate": 9.97969046884364e-05, "loss": 2.5256, "step": 49300 }, { "epoch": 0.36, "grad_norm": 0.23223061859607697, "learning_rate": 9.979604532061681e-05, "loss": 2.5252, "step": 49400 }, { "epoch": 0.36, "grad_norm": 0.19952614605426788, "learning_rate": 9.97951841422054e-05, "loss": 2.524, "step": 49500 }, { "epoch": 0.36, "grad_norm": 0.20062975585460663, "learning_rate": 9.979432115323348e-05, "loss": 2.5219, "step": 49600 }, { "epoch": 0.36, "grad_norm": 0.20036938786506653, "learning_rate": 9.979345635373243e-05, "loss": 2.5237, "step": 49700 }, { "epoch": 0.36, "grad_norm": 0.1781741976737976, "learning_rate": 9.97925897437337e-05, "loss": 2.5226, "step": 49800 }, { "epoch": 0.36, "grad_norm": 0.2209465205669403, "learning_rate": 9.97917213232688e-05, "loss": 2.5246, "step": 49900 }, { "epoch": 0.36, "grad_norm": 0.20452982187271118, "learning_rate": 9.979085109236929e-05, "loss": 2.526, "step": 50000 }, { "epoch": 0.36, "eval_loss": 2.5235376358032227, "eval_runtime": 5481.7843, "eval_samples_per_second": 1069.917, "eval_steps_per_second": 66.87, "step": 50000 } ], "logging_steps": 100, "max_steps": 412386, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 5000, "total_flos": 2.399847579648e+17, "train_batch_size": 96, "trial_name": null, "trial_params": null }