{ "best_metric": 0.9323536157608032, "best_model_checkpoint": "/scratch/czm5kz/llama2-13b_32_1_0.0003_alternate/checkpoint-1400", "epoch": 0.9975062344139651, "eval_steps": 20, "global_step": 1400, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 0.9500238299369812, "learning_rate": 0.0002989308624376336, "loss": 2.9196, "step": 5 }, { "epoch": 0.01, "grad_norm": 1.3018202781677246, "learning_rate": 0.00029786172487526725, "loss": 2.1494, "step": 10 }, { "epoch": 0.01, "grad_norm": 0.9286252856254578, "learning_rate": 0.0002967925873129009, "loss": 1.7255, "step": 15 }, { "epoch": 0.01, "grad_norm": 1.2644277811050415, "learning_rate": 0.00029572344975053457, "loss": 1.5678, "step": 20 }, { "epoch": 0.01, "eval_loss": 1.4308089017868042, "eval_runtime": 429.3923, "eval_samples_per_second": 26.151, "eval_steps_per_second": 3.27, "step": 20 }, { "epoch": 0.02, "grad_norm": 1.486525535583496, "learning_rate": 0.00029465431218816815, "loss": 1.3281, "step": 25 }, { "epoch": 0.02, "grad_norm": 1.1946622133255005, "learning_rate": 0.00029358517462580184, "loss": 1.2098, "step": 30 }, { "epoch": 0.02, "grad_norm": 0.7685691118240356, "learning_rate": 0.0002925160370634355, "loss": 1.2045, "step": 35 }, { "epoch": 0.03, "grad_norm": 1.084057092666626, "learning_rate": 0.0002914468995010691, "loss": 1.2187, "step": 40 }, { "epoch": 0.03, "eval_loss": 1.1283082962036133, "eval_runtime": 429.1606, "eval_samples_per_second": 26.165, "eval_steps_per_second": 3.272, "step": 40 }, { "epoch": 0.03, "grad_norm": 0.6618586778640747, "learning_rate": 0.00029037776193870275, "loss": 1.1625, "step": 45 }, { "epoch": 0.04, "grad_norm": 0.5745049118995667, "learning_rate": 0.0002893086243763364, "loss": 1.1622, "step": 50 }, { "epoch": 0.04, "grad_norm": 0.6866307854652405, "learning_rate": 0.00028823948681397, "loss": 1.1375, "step": 55 }, { "epoch": 0.04, "grad_norm": 0.7473587393760681, "learning_rate": 0.0002871703492516037, "loss": 1.1048, "step": 60 }, { "epoch": 0.04, "eval_loss": 1.0678260326385498, "eval_runtime": 430.0891, "eval_samples_per_second": 26.109, "eval_steps_per_second": 3.264, "step": 60 }, { "epoch": 0.05, "grad_norm": 0.5050413012504578, "learning_rate": 0.0002861012116892373, "loss": 1.082, "step": 65 }, { "epoch": 0.05, "grad_norm": 0.546146810054779, "learning_rate": 0.000285032074126871, "loss": 1.0399, "step": 70 }, { "epoch": 0.05, "grad_norm": 0.6042059659957886, "learning_rate": 0.0002839629365645046, "loss": 0.9998, "step": 75 }, { "epoch": 0.06, "grad_norm": 0.5676208734512329, "learning_rate": 0.00028289379900213826, "loss": 1.0594, "step": 80 }, { "epoch": 0.06, "eval_loss": 1.0362563133239746, "eval_runtime": 429.9585, "eval_samples_per_second": 26.116, "eval_steps_per_second": 3.265, "step": 80 }, { "epoch": 0.06, "grad_norm": 0.6432541012763977, "learning_rate": 0.0002818246614397719, "loss": 1.052, "step": 85 }, { "epoch": 0.06, "grad_norm": 0.5946438312530518, "learning_rate": 0.00028075552387740553, "loss": 1.038, "step": 90 }, { "epoch": 0.07, "grad_norm": 0.6687917113304138, "learning_rate": 0.00027968638631503917, "loss": 1.0949, "step": 95 }, { "epoch": 0.07, "grad_norm": 1.521995186805725, "learning_rate": 0.0002786172487526728, "loss": 1.0393, "step": 100 }, { "epoch": 0.07, "eval_loss": 1.0224562883377075, "eval_runtime": 430.7099, "eval_samples_per_second": 26.071, "eval_steps_per_second": 3.26, "step": 100 }, { "epoch": 0.07, "grad_norm": 0.5327974557876587, 
"learning_rate": 0.00027754811119030644, "loss": 1.0709, "step": 105 }, { "epoch": 0.08, "grad_norm": 0.47005364298820496, "learning_rate": 0.0002764789736279401, "loss": 1.0119, "step": 110 }, { "epoch": 0.08, "grad_norm": 0.6974779963493347, "learning_rate": 0.00027540983606557377, "loss": 1.0285, "step": 115 }, { "epoch": 0.09, "grad_norm": 0.568474531173706, "learning_rate": 0.0002743406985032074, "loss": 1.085, "step": 120 }, { "epoch": 0.09, "eval_loss": 1.012302041053772, "eval_runtime": 430.9145, "eval_samples_per_second": 26.059, "eval_steps_per_second": 3.258, "step": 120 }, { "epoch": 0.09, "grad_norm": 0.4923990070819855, "learning_rate": 0.00027327156094084104, "loss": 1.0179, "step": 125 }, { "epoch": 0.09, "grad_norm": 0.4918578565120697, "learning_rate": 0.0002722024233784747, "loss": 0.982, "step": 130 }, { "epoch": 0.1, "grad_norm": 0.6147764921188354, "learning_rate": 0.0002711332858161083, "loss": 1.0864, "step": 135 }, { "epoch": 0.1, "grad_norm": 0.5552188158035278, "learning_rate": 0.00027006414825374195, "loss": 1.0125, "step": 140 }, { "epoch": 0.1, "eval_loss": 1.004356861114502, "eval_runtime": 428.4385, "eval_samples_per_second": 26.209, "eval_steps_per_second": 3.277, "step": 140 }, { "epoch": 0.1, "grad_norm": 0.5992436408996582, "learning_rate": 0.00026899501069137564, "loss": 1.0427, "step": 145 }, { "epoch": 0.11, "grad_norm": 0.44927549362182617, "learning_rate": 0.0002679258731290092, "loss": 0.9857, "step": 150 }, { "epoch": 0.11, "grad_norm": 0.7258988618850708, "learning_rate": 0.0002668567355666429, "loss": 1.0241, "step": 155 }, { "epoch": 0.11, "grad_norm": 0.4783088266849518, "learning_rate": 0.00026578759800427654, "loss": 0.9954, "step": 160 }, { "epoch": 0.11, "eval_loss": 0.9951605796813965, "eval_runtime": 429.3957, "eval_samples_per_second": 26.151, "eval_steps_per_second": 3.27, "step": 160 }, { "epoch": 0.12, "grad_norm": 0.547055184841156, "learning_rate": 0.0002647184604419102, "loss": 1.008, "step": 165 }, { "epoch": 0.12, "grad_norm": 0.5691074132919312, "learning_rate": 0.0002636493228795438, "loss": 0.9818, "step": 170 }, { "epoch": 0.12, "grad_norm": 0.5176872611045837, "learning_rate": 0.00026258018531717745, "loss": 1.009, "step": 175 }, { "epoch": 0.13, "grad_norm": 0.469517320394516, "learning_rate": 0.0002615110477548111, "loss": 1.0347, "step": 180 }, { "epoch": 0.13, "eval_loss": 0.9926482439041138, "eval_runtime": 428.9611, "eval_samples_per_second": 26.177, "eval_steps_per_second": 3.273, "step": 180 }, { "epoch": 0.13, "grad_norm": 0.4130084812641144, "learning_rate": 0.0002604419101924447, "loss": 0.9957, "step": 185 }, { "epoch": 0.14, "grad_norm": 0.43157169222831726, "learning_rate": 0.00025937277263007836, "loss": 1.0193, "step": 190 }, { "epoch": 0.14, "grad_norm": 0.4014897644519806, "learning_rate": 0.000258303635067712, "loss": 1.0301, "step": 195 }, { "epoch": 0.14, "grad_norm": 0.4352978765964508, "learning_rate": 0.0002572344975053457, "loss": 0.9812, "step": 200 }, { "epoch": 0.14, "eval_loss": 0.9875041246414185, "eval_runtime": 429.7836, "eval_samples_per_second": 26.127, "eval_steps_per_second": 3.267, "step": 200 }, { "epoch": 0.15, "grad_norm": 0.350909948348999, "learning_rate": 0.0002561653599429793, "loss": 0.9866, "step": 205 }, { "epoch": 0.15, "grad_norm": 0.48860958218574524, "learning_rate": 0.00025509622238061296, "loss": 0.9854, "step": 210 }, { "epoch": 0.15, "grad_norm": 0.5793005228042603, "learning_rate": 0.0002540270848182466, "loss": 1.0233, "step": 215 }, { "epoch": 0.16, "grad_norm": 
0.4061422646045685, "learning_rate": 0.00025295794725588023, "loss": 0.9912, "step": 220 }, { "epoch": 0.16, "eval_loss": 0.9817266464233398, "eval_runtime": 428.9163, "eval_samples_per_second": 26.18, "eval_steps_per_second": 3.273, "step": 220 }, { "epoch": 0.16, "grad_norm": 0.41655540466308594, "learning_rate": 0.00025188880969351387, "loss": 0.9682, "step": 225 }, { "epoch": 0.16, "grad_norm": 0.40418338775634766, "learning_rate": 0.00025081967213114756, "loss": 0.9841, "step": 230 }, { "epoch": 0.17, "grad_norm": 0.4219729006290436, "learning_rate": 0.00024975053456878114, "loss": 0.9733, "step": 235 }, { "epoch": 0.17, "grad_norm": 0.4193793833255768, "learning_rate": 0.00024868139700641483, "loss": 1.0167, "step": 240 }, { "epoch": 0.17, "eval_loss": 0.9779065847396851, "eval_runtime": 429.6736, "eval_samples_per_second": 26.134, "eval_steps_per_second": 3.268, "step": 240 }, { "epoch": 0.17, "grad_norm": 0.38435232639312744, "learning_rate": 0.00024761225944404847, "loss": 0.9686, "step": 245 }, { "epoch": 0.18, "grad_norm": 0.36323386430740356, "learning_rate": 0.0002465431218816821, "loss": 0.9785, "step": 250 }, { "epoch": 0.18, "grad_norm": 0.4036902189254761, "learning_rate": 0.00024547398431931574, "loss": 0.9708, "step": 255 }, { "epoch": 0.19, "grad_norm": 0.46558523178100586, "learning_rate": 0.0002444048467569494, "loss": 0.9634, "step": 260 }, { "epoch": 0.19, "eval_loss": 0.9745126962661743, "eval_runtime": 429.4898, "eval_samples_per_second": 26.145, "eval_steps_per_second": 3.269, "step": 260 }, { "epoch": 0.19, "grad_norm": 0.5028152465820312, "learning_rate": 0.000243335709194583, "loss": 0.9661, "step": 265 }, { "epoch": 0.19, "grad_norm": 0.49994298815727234, "learning_rate": 0.00024226657163221665, "loss": 1.0082, "step": 270 }, { "epoch": 0.2, "grad_norm": 0.41407310962677, "learning_rate": 0.0002411974340698503, "loss": 0.9414, "step": 275 }, { "epoch": 0.2, "grad_norm": 0.39765608310699463, "learning_rate": 0.00024012829650748392, "loss": 0.9785, "step": 280 }, { "epoch": 0.2, "eval_loss": 0.9723387956619263, "eval_runtime": 429.7271, "eval_samples_per_second": 26.131, "eval_steps_per_second": 3.267, "step": 280 }, { "epoch": 0.2, "grad_norm": 0.36988940834999084, "learning_rate": 0.00023905915894511758, "loss": 0.9391, "step": 285 }, { "epoch": 0.21, "grad_norm": 0.4206317365169525, "learning_rate": 0.00023799002138275122, "loss": 0.9463, "step": 290 }, { "epoch": 0.21, "grad_norm": 0.45045849680900574, "learning_rate": 0.00023692088382038488, "loss": 0.9946, "step": 295 }, { "epoch": 0.21, "grad_norm": 0.39236941933631897, "learning_rate": 0.0002358517462580185, "loss": 0.976, "step": 300 }, { "epoch": 0.21, "eval_loss": 0.9704678654670715, "eval_runtime": 429.6884, "eval_samples_per_second": 26.133, "eval_steps_per_second": 3.267, "step": 300 }, { "epoch": 0.22, "grad_norm": 0.5147037506103516, "learning_rate": 0.00023478260869565215, "loss": 0.9661, "step": 305 }, { "epoch": 0.22, "grad_norm": 0.4205746352672577, "learning_rate": 0.0002337134711332858, "loss": 1.0082, "step": 310 }, { "epoch": 0.22, "grad_norm": 0.5038707852363586, "learning_rate": 0.00023264433357091945, "loss": 1.0248, "step": 315 }, { "epoch": 0.23, "grad_norm": 0.38720741868019104, "learning_rate": 0.0002315751960085531, "loss": 0.9593, "step": 320 }, { "epoch": 0.23, "eval_loss": 0.9690603017807007, "eval_runtime": 429.2915, "eval_samples_per_second": 26.157, "eval_steps_per_second": 3.271, "step": 320 }, { "epoch": 0.23, "grad_norm": 0.37954407930374146, "learning_rate": 
0.00023050605844618672, "loss": 0.9919, "step": 325 }, { "epoch": 0.24, "grad_norm": 0.9392601251602173, "learning_rate": 0.00022943692088382036, "loss": 1.003, "step": 330 }, { "epoch": 0.24, "grad_norm": 0.5109981894493103, "learning_rate": 0.00022836778332145402, "loss": 0.9723, "step": 335 }, { "epoch": 0.24, "grad_norm": 0.56329345703125, "learning_rate": 0.00022729864575908766, "loss": 1.0046, "step": 340 }, { "epoch": 0.24, "eval_loss": 0.9681074619293213, "eval_runtime": 429.1046, "eval_samples_per_second": 26.168, "eval_steps_per_second": 3.272, "step": 340 }, { "epoch": 0.25, "grad_norm": 0.3970394730567932, "learning_rate": 0.00022622950819672127, "loss": 1.0459, "step": 345 }, { "epoch": 0.25, "grad_norm": 0.33091840147972107, "learning_rate": 0.00022516037063435493, "loss": 0.959, "step": 350 }, { "epoch": 0.25, "grad_norm": 0.39255568385124207, "learning_rate": 0.00022409123307198857, "loss": 0.9297, "step": 355 }, { "epoch": 0.26, "grad_norm": 0.40491119027137756, "learning_rate": 0.00022302209550962223, "loss": 0.9573, "step": 360 }, { "epoch": 0.26, "eval_loss": 0.964959442615509, "eval_runtime": 429.8883, "eval_samples_per_second": 26.121, "eval_steps_per_second": 3.266, "step": 360 }, { "epoch": 0.26, "grad_norm": 0.3292306065559387, "learning_rate": 0.00022195295794725584, "loss": 0.9592, "step": 365 }, { "epoch": 0.26, "grad_norm": 0.38449323177337646, "learning_rate": 0.0002208838203848895, "loss": 0.9375, "step": 370 }, { "epoch": 0.27, "grad_norm": 0.4422217309474945, "learning_rate": 0.00021981468282252314, "loss": 0.9794, "step": 375 }, { "epoch": 0.27, "grad_norm": 0.5002702474594116, "learning_rate": 0.0002187455452601568, "loss": 0.9956, "step": 380 }, { "epoch": 0.27, "eval_loss": 0.9628071188926697, "eval_runtime": 429.2302, "eval_samples_per_second": 26.161, "eval_steps_per_second": 3.271, "step": 380 }, { "epoch": 0.27, "grad_norm": 0.32773861289024353, "learning_rate": 0.0002176764076977904, "loss": 0.9852, "step": 385 }, { "epoch": 0.28, "grad_norm": 0.5032110214233398, "learning_rate": 0.00021660727013542407, "loss": 0.9525, "step": 390 }, { "epoch": 0.28, "grad_norm": 0.3590690791606903, "learning_rate": 0.0002155381325730577, "loss": 0.9591, "step": 395 }, { "epoch": 0.29, "grad_norm": 0.7039294242858887, "learning_rate": 0.00021446899501069137, "loss": 0.963, "step": 400 }, { "epoch": 0.29, "eval_loss": 0.9601727724075317, "eval_runtime": 429.5145, "eval_samples_per_second": 26.143, "eval_steps_per_second": 3.269, "step": 400 }, { "epoch": 0.29, "grad_norm": 0.36725956201553345, "learning_rate": 0.00021339985744832498, "loss": 0.9859, "step": 405 }, { "epoch": 0.29, "grad_norm": 0.323743611574173, "learning_rate": 0.00021233071988595865, "loss": 0.9349, "step": 410 }, { "epoch": 0.3, "grad_norm": 0.46232929825782776, "learning_rate": 0.00021126158232359228, "loss": 0.9344, "step": 415 }, { "epoch": 0.3, "grad_norm": 0.408657044172287, "learning_rate": 0.00021019244476122595, "loss": 0.9849, "step": 420 }, { "epoch": 0.3, "eval_loss": 0.9595761299133301, "eval_runtime": 428.7767, "eval_samples_per_second": 26.188, "eval_steps_per_second": 3.274, "step": 420 }, { "epoch": 0.3, "grad_norm": 0.3261207640171051, "learning_rate": 0.00020912330719885958, "loss": 0.9458, "step": 425 }, { "epoch": 0.31, "grad_norm": 0.38089486956596375, "learning_rate": 0.0002080541696364932, "loss": 0.9789, "step": 430 }, { "epoch": 0.31, "grad_norm": 0.33345696330070496, "learning_rate": 0.00020698503207412685, "loss": 0.9492, "step": 435 }, { "epoch": 0.31, "grad_norm": 
0.4079163074493408, "learning_rate": 0.0002059158945117605, "loss": 0.9628, "step": 440 }, { "epoch": 0.31, "eval_loss": 0.9577447175979614, "eval_runtime": 428.7333, "eval_samples_per_second": 26.191, "eval_steps_per_second": 3.275, "step": 440 }, { "epoch": 0.32, "grad_norm": 0.3896418511867523, "learning_rate": 0.00020484675694939415, "loss": 0.9968, "step": 445 }, { "epoch": 0.32, "grad_norm": 0.48451879620552063, "learning_rate": 0.00020377761938702776, "loss": 0.9276, "step": 450 }, { "epoch": 0.32, "grad_norm": 0.3475559651851654, "learning_rate": 0.00020270848182466143, "loss": 0.9545, "step": 455 }, { "epoch": 0.33, "grad_norm": 0.39289623498916626, "learning_rate": 0.00020163934426229506, "loss": 0.942, "step": 460 }, { "epoch": 0.33, "eval_loss": 0.9580677151679993, "eval_runtime": 429.8733, "eval_samples_per_second": 26.122, "eval_steps_per_second": 3.266, "step": 460 }, { "epoch": 0.33, "grad_norm": 0.4262286424636841, "learning_rate": 0.00020057020669992872, "loss": 0.9626, "step": 465 }, { "epoch": 0.33, "grad_norm": 0.32213443517684937, "learning_rate": 0.00019950106913756233, "loss": 0.9654, "step": 470 }, { "epoch": 0.34, "grad_norm": 0.5463011264801025, "learning_rate": 0.000198431931575196, "loss": 0.9739, "step": 475 }, { "epoch": 0.34, "grad_norm": 0.43018388748168945, "learning_rate": 0.00019736279401282963, "loss": 0.9712, "step": 480 }, { "epoch": 0.34, "eval_loss": 0.955605149269104, "eval_runtime": 430.0606, "eval_samples_per_second": 26.11, "eval_steps_per_second": 3.265, "step": 480 }, { "epoch": 0.35, "grad_norm": 0.3303755819797516, "learning_rate": 0.0001962936564504633, "loss": 0.9799, "step": 485 }, { "epoch": 0.35, "grad_norm": 0.35282695293426514, "learning_rate": 0.0001952245188880969, "loss": 0.9797, "step": 490 }, { "epoch": 0.35, "grad_norm": 0.7973325848579407, "learning_rate": 0.00019415538132573057, "loss": 0.9733, "step": 495 }, { "epoch": 0.36, "grad_norm": 0.46459755301475525, "learning_rate": 0.0001930862437633642, "loss": 0.9368, "step": 500 }, { "epoch": 0.36, "eval_loss": 0.9580536484718323, "eval_runtime": 429.9689, "eval_samples_per_second": 26.116, "eval_steps_per_second": 3.265, "step": 500 }, { "epoch": 0.36, "grad_norm": 0.4230199158191681, "learning_rate": 0.00019201710620099787, "loss": 0.9822, "step": 505 }, { "epoch": 0.36, "grad_norm": 0.3397184908390045, "learning_rate": 0.00019094796863863148, "loss": 0.9522, "step": 510 }, { "epoch": 0.37, "grad_norm": 0.6563906073570251, "learning_rate": 0.0001898788310762651, "loss": 1.0053, "step": 515 }, { "epoch": 0.37, "grad_norm": 0.29375872015953064, "learning_rate": 0.00018880969351389878, "loss": 0.9623, "step": 520 }, { "epoch": 0.37, "eval_loss": 0.9542292356491089, "eval_runtime": 429.8107, "eval_samples_per_second": 26.125, "eval_steps_per_second": 3.267, "step": 520 }, { "epoch": 0.37, "grad_norm": 0.37316498160362244, "learning_rate": 0.0001877405559515324, "loss": 0.9615, "step": 525 }, { "epoch": 0.38, "grad_norm": 0.447251558303833, "learning_rate": 0.00018667141838916605, "loss": 0.9654, "step": 530 }, { "epoch": 0.38, "grad_norm": 0.4220346510410309, "learning_rate": 0.00018560228082679968, "loss": 0.9912, "step": 535 }, { "epoch": 0.38, "grad_norm": 0.4119402766227722, "learning_rate": 0.00018453314326443335, "loss": 0.9884, "step": 540 }, { "epoch": 0.38, "eval_loss": 0.9527921080589294, "eval_runtime": 429.6973, "eval_samples_per_second": 26.132, "eval_steps_per_second": 3.267, "step": 540 }, { "epoch": 0.39, "grad_norm": 0.370868057012558, "learning_rate": 
0.00018346400570206698, "loss": 0.9405, "step": 545 }, { "epoch": 0.39, "grad_norm": 0.3117481768131256, "learning_rate": 0.00018239486813970065, "loss": 0.951, "step": 550 }, { "epoch": 0.4, "grad_norm": 0.3839544355869293, "learning_rate": 0.00018132573057733425, "loss": 0.9802, "step": 555 }, { "epoch": 0.4, "grad_norm": 0.28429457545280457, "learning_rate": 0.00018025659301496792, "loss": 0.9313, "step": 560 }, { "epoch": 0.4, "eval_loss": 0.9514893293380737, "eval_runtime": 429.9336, "eval_samples_per_second": 26.118, "eval_steps_per_second": 3.266, "step": 560 }, { "epoch": 0.4, "grad_norm": 0.36404356360435486, "learning_rate": 0.00017918745545260155, "loss": 0.9582, "step": 565 }, { "epoch": 0.41, "grad_norm": 0.33048421144485474, "learning_rate": 0.00017811831789023522, "loss": 0.95, "step": 570 }, { "epoch": 0.41, "grad_norm": 0.3449236750602722, "learning_rate": 0.00017704918032786883, "loss": 0.9765, "step": 575 }, { "epoch": 0.41, "grad_norm": 0.3965567350387573, "learning_rate": 0.0001759800427655025, "loss": 0.9743, "step": 580 }, { "epoch": 0.41, "eval_loss": 0.9508348107337952, "eval_runtime": 429.4982, "eval_samples_per_second": 26.144, "eval_steps_per_second": 3.269, "step": 580 }, { "epoch": 0.42, "grad_norm": 0.491098552942276, "learning_rate": 0.00017491090520313613, "loss": 0.9843, "step": 585 }, { "epoch": 0.42, "grad_norm": 0.3687509298324585, "learning_rate": 0.0001738417676407698, "loss": 0.9936, "step": 590 }, { "epoch": 0.42, "grad_norm": 0.33909574151039124, "learning_rate": 0.0001727726300784034, "loss": 0.9334, "step": 595 }, { "epoch": 0.43, "grad_norm": 0.36044976115226746, "learning_rate": 0.00017170349251603703, "loss": 0.9805, "step": 600 }, { "epoch": 0.43, "eval_loss": 0.9498609900474548, "eval_runtime": 429.1598, "eval_samples_per_second": 26.165, "eval_steps_per_second": 3.272, "step": 600 }, { "epoch": 0.43, "grad_norm": 0.28610169887542725, "learning_rate": 0.0001706343549536707, "loss": 0.9513, "step": 605 }, { "epoch": 0.43, "grad_norm": 0.29482224583625793, "learning_rate": 0.00016956521739130433, "loss": 0.9313, "step": 610 }, { "epoch": 0.44, "grad_norm": 0.46972745656967163, "learning_rate": 0.00016849607982893797, "loss": 0.9753, "step": 615 }, { "epoch": 0.44, "grad_norm": 0.38324683904647827, "learning_rate": 0.0001674269422665716, "loss": 0.9759, "step": 620 }, { "epoch": 0.44, "eval_loss": 0.949146568775177, "eval_runtime": 430.2097, "eval_samples_per_second": 26.101, "eval_steps_per_second": 3.264, "step": 620 }, { "epoch": 0.45, "grad_norm": 0.4306158125400543, "learning_rate": 0.00016635780470420527, "loss": 0.9788, "step": 625 }, { "epoch": 0.45, "grad_norm": 0.49315810203552246, "learning_rate": 0.0001652886671418389, "loss": 0.9636, "step": 630 }, { "epoch": 0.45, "grad_norm": 0.4102541506290436, "learning_rate": 0.00016421952957947254, "loss": 0.9957, "step": 635 }, { "epoch": 0.46, "grad_norm": 0.40014615654945374, "learning_rate": 0.00016315039201710618, "loss": 0.9468, "step": 640 }, { "epoch": 0.46, "eval_loss": 0.9478526711463928, "eval_runtime": 429.5532, "eval_samples_per_second": 26.141, "eval_steps_per_second": 3.269, "step": 640 }, { "epoch": 0.46, "grad_norm": 0.3694494962692261, "learning_rate": 0.00016208125445473984, "loss": 0.9824, "step": 645 }, { "epoch": 0.46, "grad_norm": 0.37701812386512756, "learning_rate": 0.00016101211689237348, "loss": 0.9274, "step": 650 }, { "epoch": 0.47, "grad_norm": 0.3787167966365814, "learning_rate": 0.00015994297933000714, "loss": 0.9719, "step": 655 }, { "epoch": 0.47, 
"grad_norm": 0.48614123463630676, "learning_rate": 0.00015887384176764075, "loss": 0.9593, "step": 660 }, { "epoch": 0.47, "eval_loss": 0.9467563629150391, "eval_runtime": 430.6789, "eval_samples_per_second": 26.073, "eval_steps_per_second": 3.26, "step": 660 }, { "epoch": 0.47, "grad_norm": 0.35638296604156494, "learning_rate": 0.0001578047042052744, "loss": 0.9862, "step": 665 }, { "epoch": 0.48, "grad_norm": 0.27917659282684326, "learning_rate": 0.00015673556664290805, "loss": 0.9504, "step": 670 }, { "epoch": 0.48, "grad_norm": 0.34654778242111206, "learning_rate": 0.0001556664290805417, "loss": 0.9428, "step": 675 }, { "epoch": 0.48, "grad_norm": 0.2783154845237732, "learning_rate": 0.00015459729151817532, "loss": 0.9242, "step": 680 }, { "epoch": 0.48, "eval_loss": 0.9472882151603699, "eval_runtime": 428.9437, "eval_samples_per_second": 26.178, "eval_steps_per_second": 3.273, "step": 680 }, { "epoch": 0.49, "grad_norm": 0.3296685814857483, "learning_rate": 0.00015352815395580896, "loss": 0.9817, "step": 685 }, { "epoch": 0.49, "grad_norm": 0.35922589898109436, "learning_rate": 0.00015245901639344262, "loss": 0.904, "step": 690 }, { "epoch": 0.5, "grad_norm": 0.33108726143836975, "learning_rate": 0.00015138987883107623, "loss": 0.9537, "step": 695 }, { "epoch": 0.5, "grad_norm": 0.3146369457244873, "learning_rate": 0.0001503207412687099, "loss": 0.9594, "step": 700 }, { "epoch": 0.5, "eval_loss": 0.9462445974349976, "eval_runtime": 430.3498, "eval_samples_per_second": 26.093, "eval_steps_per_second": 3.262, "step": 700 }, { "epoch": 0.5, "grad_norm": 0.3020135164260864, "learning_rate": 0.00014925160370634355, "loss": 0.9947, "step": 705 }, { "epoch": 0.51, "grad_norm": 0.33066317439079285, "learning_rate": 0.0001481824661439772, "loss": 0.926, "step": 710 }, { "epoch": 0.51, "grad_norm": 0.3479137420654297, "learning_rate": 0.00014711332858161083, "loss": 0.9388, "step": 715 }, { "epoch": 0.51, "grad_norm": 0.4258211851119995, "learning_rate": 0.00014604419101924446, "loss": 0.9587, "step": 720 }, { "epoch": 0.51, "eval_loss": 0.9459365606307983, "eval_runtime": 429.6711, "eval_samples_per_second": 26.134, "eval_steps_per_second": 3.268, "step": 720 }, { "epoch": 0.52, "grad_norm": 0.33177533745765686, "learning_rate": 0.0001449750534568781, "loss": 0.9466, "step": 725 }, { "epoch": 0.52, "grad_norm": 0.36505717039108276, "learning_rate": 0.00014390591589451173, "loss": 0.9591, "step": 730 }, { "epoch": 0.52, "grad_norm": 0.2933894693851471, "learning_rate": 0.0001428367783321454, "loss": 0.9857, "step": 735 }, { "epoch": 0.53, "grad_norm": 0.30695998668670654, "learning_rate": 0.00014176764076977903, "loss": 0.9311, "step": 740 }, { "epoch": 0.53, "eval_loss": 0.9447991251945496, "eval_runtime": 429.4504, "eval_samples_per_second": 26.147, "eval_steps_per_second": 3.269, "step": 740 }, { "epoch": 0.53, "grad_norm": 0.37066712975502014, "learning_rate": 0.00014069850320741267, "loss": 0.9075, "step": 745 }, { "epoch": 0.53, "grad_norm": 0.3553217053413391, "learning_rate": 0.0001396293656450463, "loss": 0.9355, "step": 750 }, { "epoch": 0.54, "grad_norm": 0.3907565176486969, "learning_rate": 0.00013856022808267997, "loss": 0.9593, "step": 755 }, { "epoch": 0.54, "grad_norm": 0.35674384236335754, "learning_rate": 0.0001374910905203136, "loss": 0.9485, "step": 760 }, { "epoch": 0.54, "eval_loss": 0.9437533617019653, "eval_runtime": 430.5708, "eval_samples_per_second": 26.079, "eval_steps_per_second": 3.261, "step": 760 }, { "epoch": 0.55, "grad_norm": 0.39057737588882446, 
"learning_rate": 0.00013642195295794724, "loss": 0.9201, "step": 765 }, { "epoch": 0.55, "grad_norm": 0.2841975688934326, "learning_rate": 0.00013535281539558088, "loss": 0.9466, "step": 770 }, { "epoch": 0.55, "grad_norm": 0.38508614897727966, "learning_rate": 0.00013428367783321454, "loss": 0.9772, "step": 775 }, { "epoch": 0.56, "grad_norm": 0.3465386629104614, "learning_rate": 0.00013321454027084818, "loss": 0.9974, "step": 780 }, { "epoch": 0.56, "eval_loss": 0.9442404508590698, "eval_runtime": 429.5011, "eval_samples_per_second": 26.144, "eval_steps_per_second": 3.269, "step": 780 }, { "epoch": 0.56, "grad_norm": 0.360097199678421, "learning_rate": 0.0001321454027084818, "loss": 0.9725, "step": 785 }, { "epoch": 0.56, "grad_norm": 0.34066542983055115, "learning_rate": 0.00013107626514611545, "loss": 1.0158, "step": 790 }, { "epoch": 0.57, "grad_norm": 0.3403652608394623, "learning_rate": 0.0001300071275837491, "loss": 0.927, "step": 795 }, { "epoch": 0.57, "grad_norm": 0.33237767219543457, "learning_rate": 0.00012893799002138275, "loss": 0.9516, "step": 800 }, { "epoch": 0.57, "eval_loss": 0.9429513216018677, "eval_runtime": 429.3919, "eval_samples_per_second": 26.151, "eval_steps_per_second": 3.27, "step": 800 }, { "epoch": 0.57, "grad_norm": 0.33179551362991333, "learning_rate": 0.00012786885245901638, "loss": 0.922, "step": 805 }, { "epoch": 0.58, "grad_norm": 0.3568633794784546, "learning_rate": 0.00012679971489665002, "loss": 0.9434, "step": 810 }, { "epoch": 0.58, "grad_norm": 0.4287993907928467, "learning_rate": 0.00012573057733428366, "loss": 0.9496, "step": 815 }, { "epoch": 0.58, "grad_norm": 0.3873540461063385, "learning_rate": 0.0001246614397719173, "loss": 0.9476, "step": 820 }, { "epoch": 0.58, "eval_loss": 0.942284107208252, "eval_runtime": 429.6801, "eval_samples_per_second": 26.133, "eval_steps_per_second": 3.268, "step": 820 }, { "epoch": 0.59, "grad_norm": 0.5366536974906921, "learning_rate": 0.00012359230220955095, "loss": 0.9609, "step": 825 }, { "epoch": 0.59, "grad_norm": 0.38116729259490967, "learning_rate": 0.0001225231646471846, "loss": 0.946, "step": 830 }, { "epoch": 0.59, "grad_norm": 0.3352102041244507, "learning_rate": 0.00012145402708481824, "loss": 0.9487, "step": 835 }, { "epoch": 0.6, "grad_norm": 0.314010888338089, "learning_rate": 0.00012038488952245188, "loss": 0.951, "step": 840 }, { "epoch": 0.6, "eval_loss": 0.9439210295677185, "eval_runtime": 429.6236, "eval_samples_per_second": 26.137, "eval_steps_per_second": 3.268, "step": 840 }, { "epoch": 0.6, "grad_norm": 0.359325647354126, "learning_rate": 0.00011931575196008553, "loss": 0.9628, "step": 845 }, { "epoch": 0.61, "grad_norm": 0.5471125841140747, "learning_rate": 0.00011824661439771916, "loss": 0.9581, "step": 850 }, { "epoch": 0.61, "grad_norm": 0.5925132632255554, "learning_rate": 0.00011717747683535281, "loss": 0.9777, "step": 855 }, { "epoch": 0.61, "grad_norm": 0.3140357732772827, "learning_rate": 0.00011610833927298645, "loss": 0.9269, "step": 860 }, { "epoch": 0.61, "eval_loss": 0.9412524700164795, "eval_runtime": 429.1443, "eval_samples_per_second": 26.166, "eval_steps_per_second": 3.272, "step": 860 }, { "epoch": 0.62, "grad_norm": 0.3626731038093567, "learning_rate": 0.0001150392017106201, "loss": 0.9668, "step": 865 }, { "epoch": 0.62, "grad_norm": 0.31055352091789246, "learning_rate": 0.00011397006414825373, "loss": 0.9272, "step": 870 }, { "epoch": 0.62, "grad_norm": 0.3563440144062042, "learning_rate": 0.00011290092658588738, "loss": 0.9574, "step": 875 }, { "epoch": 0.63, 
"grad_norm": 0.44283735752105713, "learning_rate": 0.00011183178902352102, "loss": 0.9294, "step": 880 }, { "epoch": 0.63, "eval_loss": 0.9413205981254578, "eval_runtime": 430.6763, "eval_samples_per_second": 26.073, "eval_steps_per_second": 3.26, "step": 880 }, { "epoch": 0.63, "grad_norm": 0.41935306787490845, "learning_rate": 0.00011076265146115467, "loss": 0.9551, "step": 885 }, { "epoch": 0.63, "grad_norm": 0.3747510612010956, "learning_rate": 0.00010969351389878829, "loss": 0.9838, "step": 890 }, { "epoch": 0.64, "grad_norm": 0.343944251537323, "learning_rate": 0.00010862437633642194, "loss": 0.9927, "step": 895 }, { "epoch": 0.64, "grad_norm": 0.4465276598930359, "learning_rate": 0.00010755523877405558, "loss": 1.0054, "step": 900 }, { "epoch": 0.64, "eval_loss": 0.9400754570960999, "eval_runtime": 428.8696, "eval_samples_per_second": 26.183, "eval_steps_per_second": 3.274, "step": 900 }, { "epoch": 0.64, "grad_norm": 0.3230556845664978, "learning_rate": 0.00010648610121168923, "loss": 0.9675, "step": 905 }, { "epoch": 0.65, "grad_norm": 0.384182333946228, "learning_rate": 0.00010541696364932286, "loss": 0.9806, "step": 910 }, { "epoch": 0.65, "grad_norm": 0.3371874690055847, "learning_rate": 0.00010434782608695651, "loss": 0.9508, "step": 915 }, { "epoch": 0.66, "grad_norm": 0.3343549370765686, "learning_rate": 0.00010327868852459015, "loss": 0.964, "step": 920 }, { "epoch": 0.66, "eval_loss": 0.9393041729927063, "eval_runtime": 429.8253, "eval_samples_per_second": 26.125, "eval_steps_per_second": 3.266, "step": 920 }, { "epoch": 0.66, "grad_norm": 0.35873135924339294, "learning_rate": 0.0001022095509622238, "loss": 0.9702, "step": 925 }, { "epoch": 0.66, "grad_norm": 0.34200361371040344, "learning_rate": 0.00010114041339985743, "loss": 0.9559, "step": 930 }, { "epoch": 0.67, "grad_norm": 0.37706607580184937, "learning_rate": 0.00010007127583749108, "loss": 0.9392, "step": 935 }, { "epoch": 0.67, "grad_norm": 0.37498244643211365, "learning_rate": 9.900213827512472e-05, "loss": 0.8863, "step": 940 }, { "epoch": 0.67, "eval_loss": 0.9388808608055115, "eval_runtime": 428.8283, "eval_samples_per_second": 26.185, "eval_steps_per_second": 3.274, "step": 940 }, { "epoch": 0.67, "grad_norm": 0.29014042019844055, "learning_rate": 9.793300071275837e-05, "loss": 0.9793, "step": 945 }, { "epoch": 0.68, "grad_norm": 0.5693466663360596, "learning_rate": 9.686386315039202e-05, "loss": 1.0071, "step": 950 }, { "epoch": 0.68, "grad_norm": 0.3804801106452942, "learning_rate": 9.579472558802566e-05, "loss": 1.0006, "step": 955 }, { "epoch": 0.68, "grad_norm": 0.3488084673881531, "learning_rate": 9.47255880256593e-05, "loss": 0.9766, "step": 960 }, { "epoch": 0.68, "eval_loss": 0.9387741684913635, "eval_runtime": 430.4724, "eval_samples_per_second": 26.085, "eval_steps_per_second": 3.262, "step": 960 }, { "epoch": 0.69, "grad_norm": 0.3514138460159302, "learning_rate": 9.365645046329294e-05, "loss": 0.9362, "step": 965 }, { "epoch": 0.69, "grad_norm": 0.2945019602775574, "learning_rate": 9.258731290092659e-05, "loss": 0.9354, "step": 970 }, { "epoch": 0.69, "grad_norm": 0.41364601254463196, "learning_rate": 9.151817533856021e-05, "loss": 0.9893, "step": 975 }, { "epoch": 0.7, "grad_norm": 0.3566943109035492, "learning_rate": 9.044903777619385e-05, "loss": 0.9803, "step": 980 }, { "epoch": 0.7, "eval_loss": 0.9385978579521179, "eval_runtime": 429.3426, "eval_samples_per_second": 26.154, "eval_steps_per_second": 3.27, "step": 980 }, { "epoch": 0.7, "grad_norm": 0.3117084503173828, "learning_rate": 
8.93799002138275e-05, "loss": 0.9584, "step": 985 }, { "epoch": 0.71, "grad_norm": 0.3447965085506439, "learning_rate": 8.831076265146115e-05, "loss": 0.9283, "step": 990 }, { "epoch": 0.71, "grad_norm": 0.43709927797317505, "learning_rate": 8.724162508909478e-05, "loss": 0.9653, "step": 995 }, { "epoch": 0.71, "grad_norm": 0.30693626403808594, "learning_rate": 8.617248752672843e-05, "loss": 0.9367, "step": 1000 }, { "epoch": 0.71, "eval_loss": 0.938434898853302, "eval_runtime": 429.8587, "eval_samples_per_second": 26.123, "eval_steps_per_second": 3.266, "step": 1000 }, { "epoch": 0.72, "grad_norm": 0.3513086140155792, "learning_rate": 8.510334996436207e-05, "loss": 0.9486, "step": 1005 }, { "epoch": 0.72, "grad_norm": 0.30035388469696045, "learning_rate": 8.403421240199572e-05, "loss": 0.9253, "step": 1010 }, { "epoch": 0.72, "grad_norm": 0.34532248973846436, "learning_rate": 8.296507483962936e-05, "loss": 0.9817, "step": 1015 }, { "epoch": 0.73, "grad_norm": 0.2752978205680847, "learning_rate": 8.1895937277263e-05, "loss": 0.9092, "step": 1020 }, { "epoch": 0.73, "eval_loss": 0.9372485876083374, "eval_runtime": 429.979, "eval_samples_per_second": 26.115, "eval_steps_per_second": 3.265, "step": 1020 }, { "epoch": 0.73, "grad_norm": 0.37310290336608887, "learning_rate": 8.082679971489664e-05, "loss": 0.9679, "step": 1025 }, { "epoch": 0.73, "grad_norm": 0.26144835352897644, "learning_rate": 7.975766215253029e-05, "loss": 0.9374, "step": 1030 }, { "epoch": 0.74, "grad_norm": 0.442670613527298, "learning_rate": 7.868852459016393e-05, "loss": 0.9386, "step": 1035 }, { "epoch": 0.74, "grad_norm": 0.3123500347137451, "learning_rate": 7.761938702779758e-05, "loss": 0.9527, "step": 1040 }, { "epoch": 0.74, "eval_loss": 0.9374279975891113, "eval_runtime": 428.7356, "eval_samples_per_second": 26.191, "eval_steps_per_second": 3.275, "step": 1040 }, { "epoch": 0.74, "grad_norm": 0.34689196944236755, "learning_rate": 7.655024946543121e-05, "loss": 0.918, "step": 1045 }, { "epoch": 0.75, "grad_norm": 0.25890424847602844, "learning_rate": 7.548111190306486e-05, "loss": 0.9131, "step": 1050 }, { "epoch": 0.75, "grad_norm": 0.4108467102050781, "learning_rate": 7.44119743406985e-05, "loss": 0.9481, "step": 1055 }, { "epoch": 0.76, "grad_norm": 0.3142194151878357, "learning_rate": 7.334283677833213e-05, "loss": 0.9503, "step": 1060 }, { "epoch": 0.76, "eval_loss": 0.9363455772399902, "eval_runtime": 429.8697, "eval_samples_per_second": 26.122, "eval_steps_per_second": 3.266, "step": 1060 }, { "epoch": 0.76, "grad_norm": 0.3752140402793884, "learning_rate": 7.227369921596578e-05, "loss": 0.9614, "step": 1065 }, { "epoch": 0.76, "grad_norm": 0.3649095892906189, "learning_rate": 7.120456165359942e-05, "loss": 0.9428, "step": 1070 }, { "epoch": 0.77, "grad_norm": 0.28237637877464294, "learning_rate": 7.013542409123307e-05, "loss": 0.9698, "step": 1075 }, { "epoch": 0.77, "grad_norm": 0.32881519198417664, "learning_rate": 6.90662865288667e-05, "loss": 0.9494, "step": 1080 }, { "epoch": 0.77, "eval_loss": 0.9357610940933228, "eval_runtime": 430.0153, "eval_samples_per_second": 26.113, "eval_steps_per_second": 3.265, "step": 1080 }, { "epoch": 0.77, "grad_norm": 0.3415566086769104, "learning_rate": 6.799714896650034e-05, "loss": 0.9501, "step": 1085 }, { "epoch": 0.78, "grad_norm": 0.3997558057308197, "learning_rate": 6.692801140413399e-05, "loss": 0.9423, "step": 1090 }, { "epoch": 0.78, "grad_norm": 0.3917606472969055, "learning_rate": 6.585887384176763e-05, "loss": 0.9696, "step": 1095 }, { "epoch": 0.78, 
"grad_norm": 0.6566518545150757, "learning_rate": 6.478973627940128e-05, "loss": 0.961, "step": 1100 }, { "epoch": 0.78, "eval_loss": 0.936005711555481, "eval_runtime": 430.641, "eval_samples_per_second": 26.075, "eval_steps_per_second": 3.26, "step": 1100 }, { "epoch": 0.79, "grad_norm": 0.2704831063747406, "learning_rate": 6.372059871703493e-05, "loss": 0.9316, "step": 1105 }, { "epoch": 0.79, "grad_norm": 0.3626934885978699, "learning_rate": 6.265146115466856e-05, "loss": 0.942, "step": 1110 }, { "epoch": 0.79, "grad_norm": 0.3050808310508728, "learning_rate": 6.158232359230221e-05, "loss": 0.9681, "step": 1115 }, { "epoch": 0.8, "grad_norm": 0.33575522899627686, "learning_rate": 6.051318602993584e-05, "loss": 0.9675, "step": 1120 }, { "epoch": 0.8, "eval_loss": 0.9356703758239746, "eval_runtime": 430.1349, "eval_samples_per_second": 26.106, "eval_steps_per_second": 3.264, "step": 1120 }, { "epoch": 0.8, "grad_norm": 0.47601962089538574, "learning_rate": 5.9444048467569485e-05, "loss": 0.9837, "step": 1125 }, { "epoch": 0.81, "grad_norm": 0.3533668518066406, "learning_rate": 5.837491090520313e-05, "loss": 0.9234, "step": 1130 }, { "epoch": 0.81, "grad_norm": 0.32863783836364746, "learning_rate": 5.730577334283677e-05, "loss": 0.9773, "step": 1135 }, { "epoch": 0.81, "grad_norm": 0.3788212835788727, "learning_rate": 5.6236635780470413e-05, "loss": 0.9704, "step": 1140 }, { "epoch": 0.81, "eval_loss": 0.9352059364318848, "eval_runtime": 430.5021, "eval_samples_per_second": 26.083, "eval_steps_per_second": 3.261, "step": 1140 }, { "epoch": 0.82, "grad_norm": 0.5751201510429382, "learning_rate": 5.5167498218104056e-05, "loss": 0.9367, "step": 1145 }, { "epoch": 0.82, "grad_norm": 0.4319859445095062, "learning_rate": 5.40983606557377e-05, "loss": 0.9558, "step": 1150 }, { "epoch": 0.82, "grad_norm": 0.35112935304641724, "learning_rate": 5.302922309337134e-05, "loss": 0.9123, "step": 1155 }, { "epoch": 0.83, "grad_norm": 0.31719475984573364, "learning_rate": 5.196008553100499e-05, "loss": 0.9511, "step": 1160 }, { "epoch": 0.83, "eval_loss": 0.9346606135368347, "eval_runtime": 429.4307, "eval_samples_per_second": 26.149, "eval_steps_per_second": 3.269, "step": 1160 }, { "epoch": 0.83, "grad_norm": 0.3528451919555664, "learning_rate": 5.089094796863862e-05, "loss": 0.9789, "step": 1165 }, { "epoch": 0.83, "grad_norm": 0.29251405596733093, "learning_rate": 4.9821810406272264e-05, "loss": 0.9271, "step": 1170 }, { "epoch": 0.84, "grad_norm": 0.3118594288825989, "learning_rate": 4.875267284390591e-05, "loss": 0.9356, "step": 1175 }, { "epoch": 0.84, "grad_norm": 0.2919027805328369, "learning_rate": 4.7683535281539556e-05, "loss": 0.9042, "step": 1180 }, { "epoch": 0.84, "eval_loss": 0.9343792200088501, "eval_runtime": 429.6874, "eval_samples_per_second": 26.133, "eval_steps_per_second": 3.267, "step": 1180 }, { "epoch": 0.84, "grad_norm": 0.3041454553604126, "learning_rate": 4.66143977191732e-05, "loss": 0.9366, "step": 1185 }, { "epoch": 0.85, "grad_norm": 0.42262208461761475, "learning_rate": 4.554526015680684e-05, "loss": 0.954, "step": 1190 }, { "epoch": 0.85, "grad_norm": 0.4994548559188843, "learning_rate": 4.4476122594440485e-05, "loss": 0.9125, "step": 1195 }, { "epoch": 0.86, "grad_norm": 0.506973385810852, "learning_rate": 4.340698503207413e-05, "loss": 0.9359, "step": 1200 }, { "epoch": 0.86, "eval_loss": 0.9343673586845398, "eval_runtime": 429.7828, "eval_samples_per_second": 26.127, "eval_steps_per_second": 3.267, "step": 1200 }, { "epoch": 0.86, "grad_norm": 0.3126988112926483, 
"learning_rate": 4.2337847469707764e-05, "loss": 0.9643, "step": 1205 }, { "epoch": 0.86, "grad_norm": 0.33700090646743774, "learning_rate": 4.1268709907341407e-05, "loss": 0.9809, "step": 1210 }, { "epoch": 0.87, "grad_norm": 0.27363497018814087, "learning_rate": 4.019957234497505e-05, "loss": 0.9519, "step": 1215 }, { "epoch": 0.87, "grad_norm": 0.2861486077308655, "learning_rate": 3.913043478260869e-05, "loss": 0.9131, "step": 1220 }, { "epoch": 0.87, "eval_loss": 0.9337981343269348, "eval_runtime": 429.8829, "eval_samples_per_second": 26.121, "eval_steps_per_second": 3.266, "step": 1220 }, { "epoch": 0.87, "grad_norm": 0.6844860911369324, "learning_rate": 3.8061297220242335e-05, "loss": 0.996, "step": 1225 }, { "epoch": 0.88, "grad_norm": 0.4049902558326721, "learning_rate": 3.699215965787598e-05, "loss": 0.9593, "step": 1230 }, { "epoch": 0.88, "grad_norm": 0.3516334295272827, "learning_rate": 3.592302209550962e-05, "loss": 0.9, "step": 1235 }, { "epoch": 0.88, "grad_norm": 0.3603789508342743, "learning_rate": 3.485388453314326e-05, "loss": 0.9674, "step": 1240 }, { "epoch": 0.88, "eval_loss": 0.9333547353744507, "eval_runtime": 428.501, "eval_samples_per_second": 26.205, "eval_steps_per_second": 3.277, "step": 1240 }, { "epoch": 0.89, "grad_norm": 0.3410665690898895, "learning_rate": 3.3784746970776906e-05, "loss": 0.9335, "step": 1245 }, { "epoch": 0.89, "grad_norm": 0.34668412804603577, "learning_rate": 3.271560940841055e-05, "loss": 0.9618, "step": 1250 }, { "epoch": 0.89, "grad_norm": 0.361831396818161, "learning_rate": 3.164647184604419e-05, "loss": 0.9524, "step": 1255 }, { "epoch": 0.9, "grad_norm": 0.4450233280658722, "learning_rate": 3.057733428367783e-05, "loss": 0.9855, "step": 1260 }, { "epoch": 0.9, "eval_loss": 0.9331775307655334, "eval_runtime": 430.3503, "eval_samples_per_second": 26.093, "eval_steps_per_second": 3.262, "step": 1260 }, { "epoch": 0.9, "grad_norm": 0.38541775941848755, "learning_rate": 2.950819672131147e-05, "loss": 0.9691, "step": 1265 }, { "epoch": 0.9, "grad_norm": 0.43870338797569275, "learning_rate": 2.8439059158945114e-05, "loss": 0.9514, "step": 1270 }, { "epoch": 0.91, "grad_norm": 0.3042528033256531, "learning_rate": 2.736992159657876e-05, "loss": 0.9099, "step": 1275 }, { "epoch": 0.91, "grad_norm": 0.28177541494369507, "learning_rate": 2.63007840342124e-05, "loss": 0.9201, "step": 1280 }, { "epoch": 0.91, "eval_loss": 0.9329990744590759, "eval_runtime": 430.5216, "eval_samples_per_second": 26.082, "eval_steps_per_second": 3.261, "step": 1280 }, { "epoch": 0.92, "grad_norm": 0.31207114458084106, "learning_rate": 2.5231646471846042e-05, "loss": 0.9388, "step": 1285 }, { "epoch": 0.92, "grad_norm": 0.39428967237472534, "learning_rate": 2.4162508909479685e-05, "loss": 0.9827, "step": 1290 }, { "epoch": 0.92, "grad_norm": 0.35494768619537354, "learning_rate": 2.3093371347113328e-05, "loss": 0.9094, "step": 1295 }, { "epoch": 0.93, "grad_norm": 0.5314223766326904, "learning_rate": 2.2024233784746968e-05, "loss": 0.9453, "step": 1300 }, { "epoch": 0.93, "eval_loss": 0.9329606294631958, "eval_runtime": 429.558, "eval_samples_per_second": 26.141, "eval_steps_per_second": 3.268, "step": 1300 }, { "epoch": 0.93, "grad_norm": 0.3589262366294861, "learning_rate": 2.095509622238061e-05, "loss": 0.9202, "step": 1305 }, { "epoch": 0.93, "grad_norm": 0.3077951967716217, "learning_rate": 1.9885958660014253e-05, "loss": 0.9741, "step": 1310 }, { "epoch": 0.94, "grad_norm": 0.3348515033721924, "learning_rate": 1.8816821097647896e-05, "loss": 0.933, "step": 1315 
}, { "epoch": 0.94, "grad_norm": 0.34439706802368164, "learning_rate": 1.774768353528154e-05, "loss": 0.9476, "step": 1320 }, { "epoch": 0.94, "eval_loss": 0.932703971862793, "eval_runtime": 428.8979, "eval_samples_per_second": 26.181, "eval_steps_per_second": 3.274, "step": 1320 }, { "epoch": 0.94, "grad_norm": 0.28818029165267944, "learning_rate": 1.6678545972915182e-05, "loss": 0.9288, "step": 1325 }, { "epoch": 0.95, "grad_norm": 0.2997274696826935, "learning_rate": 1.560940841054882e-05, "loss": 0.9418, "step": 1330 }, { "epoch": 0.95, "grad_norm": 0.3139705955982208, "learning_rate": 1.4540270848182466e-05, "loss": 0.9732, "step": 1335 }, { "epoch": 0.95, "grad_norm": 0.32881295680999756, "learning_rate": 1.3471133285816107e-05, "loss": 0.9166, "step": 1340 }, { "epoch": 0.95, "eval_loss": 0.9325681328773499, "eval_runtime": 429.5609, "eval_samples_per_second": 26.141, "eval_steps_per_second": 3.268, "step": 1340 }, { "epoch": 0.96, "grad_norm": 0.6226952075958252, "learning_rate": 1.240199572344975e-05, "loss": 1.0029, "step": 1345 }, { "epoch": 0.96, "grad_norm": 0.3391931354999542, "learning_rate": 1.1332858161083391e-05, "loss": 0.9522, "step": 1350 }, { "epoch": 0.97, "grad_norm": 0.37440812587738037, "learning_rate": 1.0263720598717034e-05, "loss": 0.9362, "step": 1355 }, { "epoch": 0.97, "grad_norm": 0.3198889493942261, "learning_rate": 9.194583036350677e-06, "loss": 0.9523, "step": 1360 }, { "epoch": 0.97, "eval_loss": 0.9325061440467834, "eval_runtime": 429.5538, "eval_samples_per_second": 26.141, "eval_steps_per_second": 3.269, "step": 1360 }, { "epoch": 0.97, "grad_norm": 0.4025633931159973, "learning_rate": 8.12544547398432e-06, "loss": 0.9422, "step": 1365 }, { "epoch": 0.98, "grad_norm": 0.38235458731651306, "learning_rate": 7.0563079116179615e-06, "loss": 1.0052, "step": 1370 }, { "epoch": 0.98, "grad_norm": 0.3941135108470917, "learning_rate": 5.9871703492516035e-06, "loss": 0.9714, "step": 1375 }, { "epoch": 0.98, "grad_norm": 0.31835320591926575, "learning_rate": 4.9180327868852455e-06, "loss": 0.9615, "step": 1380 }, { "epoch": 0.98, "eval_loss": 0.9324172735214233, "eval_runtime": 431.2711, "eval_samples_per_second": 26.037, "eval_steps_per_second": 3.255, "step": 1380 }, { "epoch": 0.99, "grad_norm": 0.3852035105228424, "learning_rate": 3.848895224518888e-06, "loss": 0.9238, "step": 1385 }, { "epoch": 0.99, "grad_norm": 0.33982229232788086, "learning_rate": 2.7797576621525303e-06, "loss": 0.9401, "step": 1390 }, { "epoch": 0.99, "grad_norm": 0.31724292039871216, "learning_rate": 1.7106200997861725e-06, "loss": 0.9274, "step": 1395 }, { "epoch": 1.0, "grad_norm": 0.35556310415267944, "learning_rate": 6.414825374198146e-07, "loss": 0.9633, "step": 1400 }, { "epoch": 1.0, "eval_loss": 0.9323536157608032, "eval_runtime": 430.5102, "eval_samples_per_second": 26.083, "eval_steps_per_second": 3.261, "step": 1400 } ], "logging_steps": 5, "max_steps": 1403, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 20, "total_flos": 5.2878011087683584e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }