{ "best_metric": 1.3063520193099976, "best_model_checkpoint": "/mnt/users/n3thakur/vectara/huggingface-dpo/trained_models/v3/Meta-Llama-3-8B-Instruct-miracl-mix-raft-sft-25th-apr-v1.0/checkpoint-2000", "epoch": 0.9996544972935621, "eval_steps": 200, "global_step": 2170, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 1.0262953847325709, "learning_rate": 4.608294930875576e-08, "loss": 1.7621, "step": 1 }, { "epoch": 0.0, "grad_norm": 0.954880939596983, "learning_rate": 2.3041474654377884e-07, "loss": 1.7602, "step": 5 }, { "epoch": 0.0, "grad_norm": 1.0004644202041775, "learning_rate": 4.608294930875577e-07, "loss": 1.8162, "step": 10 }, { "epoch": 0.01, "grad_norm": 1.0280115397490373, "learning_rate": 6.912442396313365e-07, "loss": 1.7724, "step": 15 }, { "epoch": 0.01, "grad_norm": 0.8732665912953524, "learning_rate": 9.216589861751154e-07, "loss": 1.7378, "step": 20 }, { "epoch": 0.01, "grad_norm": 0.9441885280056963, "learning_rate": 1.1520737327188942e-06, "loss": 1.7507, "step": 25 }, { "epoch": 0.01, "grad_norm": 0.9845213443275476, "learning_rate": 1.382488479262673e-06, "loss": 1.7561, "step": 30 }, { "epoch": 0.02, "grad_norm": 0.9521516823054887, "learning_rate": 1.6129032258064516e-06, "loss": 1.7479, "step": 35 }, { "epoch": 0.02, "grad_norm": 1.0047303931298321, "learning_rate": 1.8433179723502307e-06, "loss": 1.7773, "step": 40 }, { "epoch": 0.02, "grad_norm": 0.9230639224993269, "learning_rate": 2.0737327188940094e-06, "loss": 1.7752, "step": 45 }, { "epoch": 0.02, "grad_norm": 0.9112137084493539, "learning_rate": 2.3041474654377884e-06, "loss": 1.7213, "step": 50 }, { "epoch": 0.03, "grad_norm": 0.8909029043612109, "learning_rate": 2.5345622119815673e-06, "loss": 1.7881, "step": 55 }, { "epoch": 0.03, "grad_norm": 0.7189360071641709, "learning_rate": 2.764976958525346e-06, "loss": 1.6636, "step": 60 }, { "epoch": 0.03, "grad_norm": 0.6692633878715751, "learning_rate": 2.9953917050691243e-06, "loss": 1.6508, "step": 65 }, { "epoch": 0.03, "grad_norm": 0.6188511577538577, "learning_rate": 3.225806451612903e-06, "loss": 1.7011, "step": 70 }, { "epoch": 0.03, "grad_norm": 0.6279410712564498, "learning_rate": 3.4562211981566825e-06, "loss": 1.6403, "step": 75 }, { "epoch": 0.04, "grad_norm": 0.5342697084988228, "learning_rate": 3.6866359447004615e-06, "loss": 1.656, "step": 80 }, { "epoch": 0.04, "grad_norm": 0.44402512139023, "learning_rate": 3.91705069124424e-06, "loss": 1.6617, "step": 85 }, { "epoch": 0.04, "grad_norm": 0.4644619191475398, "learning_rate": 4.147465437788019e-06, "loss": 1.6303, "step": 90 }, { "epoch": 0.04, "grad_norm": 0.371289417426217, "learning_rate": 4.377880184331797e-06, "loss": 1.6453, "step": 95 }, { "epoch": 0.05, "grad_norm": 0.3309787275557066, "learning_rate": 4.608294930875577e-06, "loss": 1.5851, "step": 100 }, { "epoch": 0.05, "grad_norm": 0.31477252449769144, "learning_rate": 4.838709677419355e-06, "loss": 1.604, "step": 105 }, { "epoch": 0.05, "grad_norm": 0.2612544823481869, "learning_rate": 5.0691244239631346e-06, "loss": 1.5725, "step": 110 }, { "epoch": 0.05, "grad_norm": 0.2605484332821398, "learning_rate": 5.299539170506913e-06, "loss": 1.5644, "step": 115 }, { "epoch": 0.06, "grad_norm": 0.25148232016385397, "learning_rate": 5.529953917050692e-06, "loss": 1.5467, "step": 120 }, { "epoch": 0.06, "grad_norm": 0.2601998287824473, "learning_rate": 5.76036866359447e-06, "loss": 1.589, "step": 125 }, { "epoch": 0.06, "grad_norm": 0.2385121609938763, "learning_rate": 5.9907834101382485e-06, "loss": 1.6055, "step": 130 }, { "epoch": 0.06, "grad_norm": 0.24629649793790628, "learning_rate": 6.221198156682028e-06, "loss": 1.5373, "step": 135 }, { "epoch": 0.06, "grad_norm": 0.2468655185850127, "learning_rate": 6.451612903225806e-06, "loss": 1.5614, "step": 140 }, { "epoch": 0.07, "grad_norm": 0.22223391970377232, "learning_rate": 6.682027649769586e-06, "loss": 1.5624, "step": 145 }, { "epoch": 0.07, "grad_norm": 0.2062049955510628, "learning_rate": 6.912442396313365e-06, "loss": 1.5013, "step": 150 }, { "epoch": 0.07, "grad_norm": 0.19876212655323225, "learning_rate": 7.1428571428571436e-06, "loss": 1.575, "step": 155 }, { "epoch": 0.07, "grad_norm": 0.19765238479942285, "learning_rate": 7.373271889400923e-06, "loss": 1.5167, "step": 160 }, { "epoch": 0.08, "grad_norm": 0.19240215611308686, "learning_rate": 7.603686635944701e-06, "loss": 1.5071, "step": 165 }, { "epoch": 0.08, "grad_norm": 0.2001053295395004, "learning_rate": 7.83410138248848e-06, "loss": 1.4932, "step": 170 }, { "epoch": 0.08, "grad_norm": 0.20863957442778325, "learning_rate": 8.064516129032258e-06, "loss": 1.5371, "step": 175 }, { "epoch": 0.08, "grad_norm": 0.1880073967302754, "learning_rate": 8.294930875576038e-06, "loss": 1.492, "step": 180 }, { "epoch": 0.09, "grad_norm": 0.18052519991071567, "learning_rate": 8.525345622119815e-06, "loss": 1.5039, "step": 185 }, { "epoch": 0.09, "grad_norm": 0.17252733686762506, "learning_rate": 8.755760368663595e-06, "loss": 1.492, "step": 190 }, { "epoch": 0.09, "grad_norm": 0.1754856007551659, "learning_rate": 8.986175115207374e-06, "loss": 1.4926, "step": 195 }, { "epoch": 0.09, "grad_norm": 0.16521189205999245, "learning_rate": 9.216589861751153e-06, "loss": 1.4903, "step": 200 }, { "epoch": 0.09, "eval_loss": 1.3960996866226196, "eval_runtime": 1753.0374, "eval_samples_per_second": 2.162, "eval_steps_per_second": 0.27, "step": 200 }, { "epoch": 0.09, "grad_norm": 0.1618648599795676, "learning_rate": 9.447004608294931e-06, "loss": 1.4499, "step": 205 }, { "epoch": 0.1, "grad_norm": 0.1609921643942943, "learning_rate": 9.67741935483871e-06, "loss": 1.4775, "step": 210 }, { "epoch": 0.1, "grad_norm": 0.161807092383862, "learning_rate": 9.90783410138249e-06, "loss": 1.4779, "step": 215 }, { "epoch": 0.1, "grad_norm": 0.15789342984068674, "learning_rate": 9.999941779365509e-06, "loss": 1.4064, "step": 220 }, { "epoch": 0.1, "grad_norm": 0.17953421611953463, "learning_rate": 9.99958599150926e-06, "loss": 1.4216, "step": 225 }, { "epoch": 0.11, "grad_norm": 0.17033046165043145, "learning_rate": 9.998906783581494e-06, "loss": 1.4872, "step": 230 }, { "epoch": 0.11, "grad_norm": 0.15935058975437466, "learning_rate": 9.997904199519748e-06, "loss": 1.4473, "step": 235 }, { "epoch": 0.11, "grad_norm": 0.16188090301066688, "learning_rate": 9.996578304180551e-06, "loss": 1.4484, "step": 240 }, { "epoch": 0.11, "grad_norm": 0.1692826467439708, "learning_rate": 9.994929183335237e-06, "loss": 1.4576, "step": 245 }, { "epoch": 0.12, "grad_norm": 0.16233371930510018, "learning_rate": 9.992956943664401e-06, "loss": 1.4674, "step": 250 }, { "epoch": 0.12, "grad_norm": 0.1601701723529579, "learning_rate": 9.99066171275098e-06, "loss": 1.434, "step": 255 }, { "epoch": 0.12, "grad_norm": 0.1515934744211598, "learning_rate": 9.988043639072021e-06, "loss": 1.469, "step": 260 }, { "epoch": 0.12, "grad_norm": 0.1558360755989006, "learning_rate": 9.985102891989063e-06, "loss": 1.4688, "step": 265 }, { "epoch": 0.12, "grad_norm": 0.20565866787405954, "learning_rate": 9.98183966173718e-06, "loss": 1.4794, "step": 270 }, { "epoch": 0.13, "grad_norm": 0.16589082756750548, "learning_rate": 9.97825415941269e-06, "loss": 1.4514, "step": 275 }, { "epoch": 0.13, "grad_norm": 0.16179243903344054, "learning_rate": 9.974346616959476e-06, "loss": 1.4802, "step": 280 }, { "epoch": 0.13, "grad_norm": 0.15782689090886812, "learning_rate": 9.970117287154004e-06, "loss": 1.4356, "step": 285 }, { "epoch": 0.13, "grad_norm": 0.15954652235182576, "learning_rate": 9.965566443588956e-06, "loss": 1.3886, "step": 290 }, { "epoch": 0.14, "grad_norm": 0.16253218390477409, "learning_rate": 9.960694380655539e-06, "loss": 1.456, "step": 295 }, { "epoch": 0.14, "grad_norm": 0.15571215758010262, "learning_rate": 9.955501413524438e-06, "loss": 1.4038, "step": 300 }, { "epoch": 0.14, "grad_norm": 0.15764668416666167, "learning_rate": 9.949987878125427e-06, "loss": 1.4292, "step": 305 }, { "epoch": 0.14, "grad_norm": 0.15565461047093965, "learning_rate": 9.944154131125643e-06, "loss": 1.3845, "step": 310 }, { "epoch": 0.15, "grad_norm": 0.16274505869667716, "learning_rate": 9.938000549906509e-06, "loss": 1.4143, "step": 315 }, { "epoch": 0.15, "grad_norm": 0.15771377018169355, "learning_rate": 9.93152753253932e-06, "loss": 1.414, "step": 320 }, { "epoch": 0.15, "grad_norm": 0.15819595549860918, "learning_rate": 9.924735497759497e-06, "loss": 1.398, "step": 325 }, { "epoch": 0.15, "grad_norm": 0.16836842547624103, "learning_rate": 9.917624884939495e-06, "loss": 1.415, "step": 330 }, { "epoch": 0.15, "grad_norm": 0.16454346246919038, "learning_rate": 9.910196154060381e-06, "loss": 1.5025, "step": 335 }, { "epoch": 0.16, "grad_norm": 0.15952712351687925, "learning_rate": 9.902449785682084e-06, "loss": 1.4602, "step": 340 }, { "epoch": 0.16, "grad_norm": 0.15913354058980336, "learning_rate": 9.894386280912298e-06, "loss": 1.4437, "step": 345 }, { "epoch": 0.16, "grad_norm": 0.16526633029775475, "learning_rate": 9.88600616137407e-06, "loss": 1.443, "step": 350 }, { "epoch": 0.16, "grad_norm": 0.1570973278965336, "learning_rate": 9.877309969172065e-06, "loss": 1.4001, "step": 355 }, { "epoch": 0.17, "grad_norm": 0.16323401596231638, "learning_rate": 9.868298266857477e-06, "loss": 1.4115, "step": 360 }, { "epoch": 0.17, "grad_norm": 0.16999252145703372, "learning_rate": 9.858971637391662e-06, "loss": 1.431, "step": 365 }, { "epoch": 0.17, "grad_norm": 0.1598876979960763, "learning_rate": 9.849330684108409e-06, "loss": 1.3925, "step": 370 }, { "epoch": 0.17, "grad_norm": 0.16716694688842804, "learning_rate": 9.83937603067492e-06, "loss": 1.4369, "step": 375 }, { "epoch": 0.18, "grad_norm": 0.17160997265475317, "learning_rate": 9.829108321051461e-06, "loss": 1.4236, "step": 380 }, { "epoch": 0.18, "grad_norm": 0.1709419213466886, "learning_rate": 9.818528219449705e-06, "loss": 1.4156, "step": 385 }, { "epoch": 0.18, "grad_norm": 0.1706660498176678, "learning_rate": 9.807636410289767e-06, "loss": 1.3531, "step": 390 }, { "epoch": 0.18, "grad_norm": 0.1654207117166432, "learning_rate": 9.796433598155928e-06, "loss": 1.4282, "step": 395 }, { "epoch": 0.18, "grad_norm": 0.17005569486504588, "learning_rate": 9.784920507751052e-06, "loss": 1.465, "step": 400 }, { "epoch": 0.18, "eval_loss": 1.3499208688735962, "eval_runtime": 1761.5948, "eval_samples_per_second": 2.151, "eval_steps_per_second": 0.269, "step": 400 }, { "epoch": 0.19, "grad_norm": 0.16465910018813473, "learning_rate": 9.773097883849715e-06, "loss": 1.4856, "step": 405 }, { "epoch": 0.19, "grad_norm": 0.1687241699251977, "learning_rate": 9.760966491250018e-06, "loss": 1.4448, "step": 410 }, { "epoch": 0.19, "grad_norm": 0.173826161389088, "learning_rate": 9.748527114724111e-06, "loss": 1.4588, "step": 415 }, { "epoch": 0.19, "grad_norm": 0.1664583424134419, "learning_rate": 9.735780558967434e-06, "loss": 1.3651, "step": 420 }, { "epoch": 0.2, "grad_norm": 0.16305809613976227, "learning_rate": 9.72272764854666e-06, "loss": 1.386, "step": 425 }, { "epoch": 0.2, "grad_norm": 0.16712240916605367, "learning_rate": 9.709369227846346e-06, "loss": 1.4249, "step": 430 }, { "epoch": 0.2, "grad_norm": 0.1685889272801495, "learning_rate": 9.695706161014322e-06, "loss": 1.4629, "step": 435 }, { "epoch": 0.2, "grad_norm": 0.16213828032523545, "learning_rate": 9.681739331905784e-06, "loss": 1.4633, "step": 440 }, { "epoch": 0.2, "grad_norm": 0.1722162760562697, "learning_rate": 9.667469644026118e-06, "loss": 1.4147, "step": 445 }, { "epoch": 0.21, "grad_norm": 0.16634245789106444, "learning_rate": 9.652898020472449e-06, "loss": 1.4254, "step": 450 }, { "epoch": 0.21, "grad_norm": 0.17414333030226264, "learning_rate": 9.638025403873939e-06, "loss": 1.3734, "step": 455 }, { "epoch": 0.21, "grad_norm": 0.17078620575685172, "learning_rate": 9.622852756330797e-06, "loss": 1.4313, "step": 460 }, { "epoch": 0.21, "grad_norm": 0.17201073443780537, "learning_rate": 9.60738105935204e-06, "loss": 1.4412, "step": 465 }, { "epoch": 0.22, "grad_norm": 0.1781118635582674, "learning_rate": 9.59161131379201e-06, "loss": 1.4102, "step": 470 }, { "epoch": 0.22, "grad_norm": 0.17417835939253798, "learning_rate": 9.575544539785626e-06, "loss": 1.4311, "step": 475 }, { "epoch": 0.22, "grad_norm": 0.17050610923384396, "learning_rate": 9.559181776682387e-06, "loss": 1.4627, "step": 480 }, { "epoch": 0.22, "grad_norm": 0.176092363473546, "learning_rate": 9.542524082979138e-06, "loss": 1.4517, "step": 485 }, { "epoch": 0.23, "grad_norm": 0.17498193119749225, "learning_rate": 9.525572536251608e-06, "loss": 1.3956, "step": 490 }, { "epoch": 0.23, "grad_norm": 0.17003085925682157, "learning_rate": 9.50832823308468e-06, "loss": 1.4012, "step": 495 }, { "epoch": 0.23, "grad_norm": 0.17064790853058265, "learning_rate": 9.490792289001476e-06, "loss": 1.3523, "step": 500 }, { "epoch": 0.23, "grad_norm": 0.1799693810618531, "learning_rate": 9.472965838391187e-06, "loss": 1.4446, "step": 505 }, { "epoch": 0.23, "grad_norm": 0.16868170325096435, "learning_rate": 9.454850034435679e-06, "loss": 1.3912, "step": 510 }, { "epoch": 0.24, "grad_norm": 0.16981991686336434, "learning_rate": 9.436446049034913e-06, "loss": 1.3986, "step": 515 }, { "epoch": 0.24, "grad_norm": 0.1755120644725739, "learning_rate": 9.417755072731121e-06, "loss": 1.4117, "step": 520 }, { "epoch": 0.24, "grad_norm": 0.1757934575464805, "learning_rate": 9.398778314631801e-06, "loss": 1.3587, "step": 525 }, { "epoch": 0.24, "grad_norm": 0.173876963970309, "learning_rate": 9.379517002331489e-06, "loss": 1.3862, "step": 530 }, { "epoch": 0.25, "grad_norm": 0.17746114421749437, "learning_rate": 9.359972381832358e-06, "loss": 1.4309, "step": 535 }, { "epoch": 0.25, "grad_norm": 0.16869333613216586, "learning_rate": 9.340145717463609e-06, "loss": 1.4118, "step": 540 }, { "epoch": 0.25, "grad_norm": 0.17746743334088458, "learning_rate": 9.320038291799679e-06, "loss": 1.4433, "step": 545 }, { "epoch": 0.25, "grad_norm": 0.17587914404814756, "learning_rate": 9.299651405577286e-06, "loss": 1.4421, "step": 550 }, { "epoch": 0.26, "grad_norm": 0.18048857244310448, "learning_rate": 9.278986377611266e-06, "loss": 1.4221, "step": 555 }, { "epoch": 0.26, "grad_norm": 0.17371693231140117, "learning_rate": 9.258044544709276e-06, "loss": 1.4131, "step": 560 }, { "epoch": 0.26, "grad_norm": 0.17693122935797964, "learning_rate": 9.236827261585306e-06, "loss": 1.4205, "step": 565 }, { "epoch": 0.26, "grad_norm": 0.18630601640696606, "learning_rate": 9.215335900772048e-06, "loss": 1.4067, "step": 570 }, { "epoch": 0.26, "grad_norm": 0.17220695844147543, "learning_rate": 9.193571852532112e-06, "loss": 1.3834, "step": 575 }, { "epoch": 0.27, "grad_norm": 0.18435801453629633, "learning_rate": 9.17153652476808e-06, "loss": 1.3485, "step": 580 }, { "epoch": 0.27, "grad_norm": 0.1719982815834864, "learning_rate": 9.14923134293144e-06, "loss": 1.4265, "step": 585 }, { "epoch": 0.27, "grad_norm": 0.17622887749418029, "learning_rate": 9.126657749930365e-06, "loss": 1.4242, "step": 590 }, { "epoch": 0.27, "grad_norm": 0.1772303960409497, "learning_rate": 9.103817206036383e-06, "loss": 1.3901, "step": 595 }, { "epoch": 0.28, "grad_norm": 0.18039291853040396, "learning_rate": 9.080711188789903e-06, "loss": 1.4193, "step": 600 }, { "epoch": 0.28, "eval_loss": 1.333003282546997, "eval_runtime": 1759.7499, "eval_samples_per_second": 2.154, "eval_steps_per_second": 0.269, "step": 600 }, { "epoch": 0.28, "grad_norm": 0.18468507528075692, "learning_rate": 9.057341192904641e-06, "loss": 1.4663, "step": 605 }, { "epoch": 0.28, "grad_norm": 0.17990669625911423, "learning_rate": 9.033708730170925e-06, "loss": 1.4289, "step": 610 }, { "epoch": 0.28, "grad_norm": 0.17925115015014306, "learning_rate": 9.009815329357893e-06, "loss": 1.4337, "step": 615 }, { "epoch": 0.29, "grad_norm": 0.1742038025383068, "learning_rate": 8.985662536114614e-06, "loss": 1.4156, "step": 620 }, { "epoch": 0.29, "grad_norm": 0.18266353155608991, "learning_rate": 8.961251912870077e-06, "loss": 1.3896, "step": 625 }, { "epoch": 0.29, "grad_norm": 0.18057511349721395, "learning_rate": 8.936585038732143e-06, "loss": 1.3764, "step": 630 }, { "epoch": 0.29, "grad_norm": 0.18596859871805246, "learning_rate": 8.91166350938537e-06, "loss": 1.4193, "step": 635 }, { "epoch": 0.29, "grad_norm": 0.18986971270095016, "learning_rate": 8.886488936987817e-06, "loss": 1.3955, "step": 640 }, { "epoch": 0.3, "grad_norm": 0.17418454140586195, "learning_rate": 8.861062950066723e-06, "loss": 1.427, "step": 645 }, { "epoch": 0.3, "grad_norm": 0.18149732001872015, "learning_rate": 8.835387193413185e-06, "loss": 1.4046, "step": 650 }, { "epoch": 0.3, "grad_norm": 0.18147198520857166, "learning_rate": 8.809463327975741e-06, "loss": 1.4058, "step": 655 }, { "epoch": 0.3, "grad_norm": 0.18099108296498378, "learning_rate": 8.783293030752932e-06, "loss": 1.4066, "step": 660 }, { "epoch": 0.31, "grad_norm": 0.1794240687483909, "learning_rate": 8.756877994684818e-06, "loss": 1.3921, "step": 665 }, { "epoch": 0.31, "grad_norm": 0.1839307276635119, "learning_rate": 8.730219928543458e-06, "loss": 1.4054, "step": 670 }, { "epoch": 0.31, "grad_norm": 0.18145863325096048, "learning_rate": 8.703320556822375e-06, "loss": 1.4053, "step": 675 }, { "epoch": 0.31, "grad_norm": 0.1808487553528171, "learning_rate": 8.676181619624996e-06, "loss": 1.4055, "step": 680 }, { "epoch": 0.32, "grad_norm": 0.1862476804387168, "learning_rate": 8.648804872552092e-06, "loss": 1.3841, "step": 685 }, { "epoch": 0.32, "grad_norm": 0.19563150015616995, "learning_rate": 8.6211920865882e-06, "loss": 1.371, "step": 690 }, { "epoch": 0.32, "grad_norm": 0.18923848385153544, "learning_rate": 8.593345047987069e-06, "loss": 1.3988, "step": 695 }, { "epoch": 0.32, "grad_norm": 0.18078263758464272, "learning_rate": 8.565265558156101e-06, "loss": 1.4024, "step": 700 }, { "epoch": 0.32, "grad_norm": 0.18258637947544226, "learning_rate": 8.536955433539824e-06, "loss": 1.371, "step": 705 }, { "epoch": 0.33, "grad_norm": 0.19116331272141834, "learning_rate": 8.508416505502383e-06, "loss": 1.4456, "step": 710 }, { "epoch": 0.33, "grad_norm": 0.17469959227839357, "learning_rate": 8.479650620209072e-06, "loss": 1.385, "step": 715 }, { "epoch": 0.33, "grad_norm": 0.18410741548679613, "learning_rate": 8.450659638506908e-06, "loss": 1.4095, "step": 720 }, { "epoch": 0.33, "grad_norm": 0.19634261946282605, "learning_rate": 8.421445435804255e-06, "loss": 1.3513, "step": 725 }, { "epoch": 0.34, "grad_norm": 0.17826721350047323, "learning_rate": 8.3920099019495e-06, "loss": 1.3792, "step": 730 }, { "epoch": 0.34, "grad_norm": 0.1826653979119606, "learning_rate": 8.362354941108803e-06, "loss": 1.4448, "step": 735 }, { "epoch": 0.34, "grad_norm": 0.18664731594802075, "learning_rate": 8.33248247164292e-06, "loss": 1.3751, "step": 740 }, { "epoch": 0.34, "grad_norm": 0.18231556377003602, "learning_rate": 8.3023944259831e-06, "loss": 1.3773, "step": 745 }, { "epoch": 0.35, "grad_norm": 0.18711137034484868, "learning_rate": 8.272092750506084e-06, "loss": 1.4096, "step": 750 }, { "epoch": 0.35, "grad_norm": 0.1877955269203901, "learning_rate": 8.241579405408192e-06, "loss": 1.3902, "step": 755 }, { "epoch": 0.35, "grad_norm": 0.18482019451091206, "learning_rate": 8.21085636457851e-06, "loss": 1.3734, "step": 760 }, { "epoch": 0.35, "grad_norm": 0.19891330231660218, "learning_rate": 8.179925615471218e-06, "loss": 1.4061, "step": 765 }, { "epoch": 0.35, "grad_norm": 0.18663983192529415, "learning_rate": 8.148789158977012e-06, "loss": 1.3326, "step": 770 }, { "epoch": 0.36, "grad_norm": 0.1874487096476331, "learning_rate": 8.117449009293668e-06, "loss": 1.3384, "step": 775 }, { "epoch": 0.36, "grad_norm": 0.18710305973456598, "learning_rate": 8.085907193795745e-06, "loss": 1.3828, "step": 780 }, { "epoch": 0.36, "grad_norm": 0.18416014945175566, "learning_rate": 8.05416575290344e-06, "loss": 1.3737, "step": 785 }, { "epoch": 0.36, "grad_norm": 0.18615555988464447, "learning_rate": 8.022226739950587e-06, "loss": 1.4359, "step": 790 }, { "epoch": 0.37, "grad_norm": 0.18594902983475312, "learning_rate": 7.990092221051835e-06, "loss": 1.389, "step": 795 }, { "epoch": 0.37, "grad_norm": 0.18537070852284854, "learning_rate": 7.95776427496899e-06, "loss": 1.3593, "step": 800 }, { "epoch": 0.37, "eval_loss": 1.323183298110962, "eval_runtime": 1742.9319, "eval_samples_per_second": 2.174, "eval_steps_per_second": 0.272, "step": 800 }, { "epoch": 0.37, "grad_norm": 0.1908169492182471, "learning_rate": 7.925244992976538e-06, "loss": 1.3406, "step": 805 }, { "epoch": 0.37, "grad_norm": 0.18784810075497232, "learning_rate": 7.89253647872637e-06, "loss": 1.3842, "step": 810 }, { "epoch": 0.38, "grad_norm": 0.19406647113841424, "learning_rate": 7.859640848111686e-06, "loss": 1.4286, "step": 815 }, { "epoch": 0.38, "grad_norm": 0.19197603494160256, "learning_rate": 7.826560229130132e-06, "loss": 1.3928, "step": 820 }, { "epoch": 0.38, "grad_norm": 0.19099716433921685, "learning_rate": 7.793296761746126e-06, "loss": 1.362, "step": 825 }, { "epoch": 0.38, "grad_norm": 0.18788356013356616, "learning_rate": 7.759852597752447e-06, "loss": 1.4034, "step": 830 }, { "epoch": 0.38, "grad_norm": 0.1921444320557867, "learning_rate": 7.726229900631015e-06, "loss": 1.3793, "step": 835 }, { "epoch": 0.39, "grad_norm": 0.20734046130350145, "learning_rate": 7.692430845412946e-06, "loss": 1.4203, "step": 840 }, { "epoch": 0.39, "grad_norm": 0.19179256662995678, "learning_rate": 7.658457618537853e-06, "loss": 1.4021, "step": 845 }, { "epoch": 0.39, "grad_norm": 0.18555040743415147, "learning_rate": 7.624312417712403e-06, "loss": 1.423, "step": 850 }, { "epoch": 0.39, "grad_norm": 0.19398963612254347, "learning_rate": 7.58999745176815e-06, "loss": 1.4367, "step": 855 }, { "epoch": 0.4, "grad_norm": 0.19074855950766817, "learning_rate": 7.555514940518647e-06, "loss": 1.3695, "step": 860 }, { "epoch": 0.4, "grad_norm": 0.1893550054117395, "learning_rate": 7.520867114615844e-06, "loss": 1.3939, "step": 865 }, { "epoch": 0.4, "grad_norm": 0.19196726666071628, "learning_rate": 7.486056215405797e-06, "loss": 1.3964, "step": 870 }, { "epoch": 0.4, "grad_norm": 0.2100904294893222, "learning_rate": 7.451084494783668e-06, "loss": 1.3753, "step": 875 }, { "epoch": 0.41, "grad_norm": 0.1870023533707271, "learning_rate": 7.415954215048057e-06, "loss": 1.379, "step": 880 }, { "epoch": 0.41, "grad_norm": 0.19635898208120364, "learning_rate": 7.38066764875465e-06, "loss": 1.4329, "step": 885 }, { "epoch": 0.41, "grad_norm": 0.1896615635850299, "learning_rate": 7.345227078569218e-06, "loss": 1.357, "step": 890 }, { "epoch": 0.41, "grad_norm": 0.19424130426015207, "learning_rate": 7.309634797119941e-06, "loss": 1.3774, "step": 895 }, { "epoch": 0.41, "grad_norm": 0.22888693201104138, "learning_rate": 7.273893106849108e-06, "loss": 1.3976, "step": 900 }, { "epoch": 0.42, "grad_norm": 0.1919456484613934, "learning_rate": 7.23800431986417e-06, "loss": 1.378, "step": 905 }, { "epoch": 0.42, "grad_norm": 0.19241540158105003, "learning_rate": 7.201970757788172e-06, "loss": 1.4094, "step": 910 }, { "epoch": 0.42, "grad_norm": 0.19333437065467562, "learning_rate": 7.165794751609569e-06, "loss": 1.3971, "step": 915 }, { "epoch": 0.42, "grad_norm": 0.19460601771644864, "learning_rate": 7.1294786415314336e-06, "loss": 1.3879, "step": 920 }, { "epoch": 0.43, "grad_norm": 0.18754355788787921, "learning_rate": 7.093024776820076e-06, "loss": 1.3534, "step": 925 }, { "epoch": 0.43, "grad_norm": 0.18648196033472134, "learning_rate": 7.056435515653059e-06, "loss": 1.3969, "step": 930 }, { "epoch": 0.43, "grad_norm": 0.18580702737411495, "learning_rate": 7.019713224966664e-06, "loss": 1.4416, "step": 935 }, { "epoch": 0.43, "grad_norm": 0.24918096880464727, "learning_rate": 6.9828602803027664e-06, "loss": 1.3814, "step": 940 }, { "epoch": 0.44, "grad_norm": 0.19003799001704857, "learning_rate": 6.945879065655164e-06, "loss": 1.3581, "step": 945 }, { "epoch": 0.44, "grad_norm": 0.19777328354162663, "learning_rate": 6.90877197331536e-06, "loss": 1.3883, "step": 950 }, { "epoch": 0.44, "grad_norm": 0.1978053982444075, "learning_rate": 6.871541403717808e-06, "loss": 1.4298, "step": 955 }, { "epoch": 0.44, "grad_norm": 0.19360877663534273, "learning_rate": 6.83418976528462e-06, "loss": 1.3623, "step": 960 }, { "epoch": 0.44, "grad_norm": 0.19124472077513613, "learning_rate": 6.7967194742697866e-06, "loss": 1.3965, "step": 965 }, { "epoch": 0.45, "grad_norm": 0.2026314484251062, "learning_rate": 6.759132954602852e-06, "loss": 1.3889, "step": 970 }, { "epoch": 0.45, "grad_norm": 0.20177996042623167, "learning_rate": 6.721432637732117e-06, "loss": 1.3987, "step": 975 }, { "epoch": 0.45, "grad_norm": 0.19199095023199664, "learning_rate": 6.6836209624673575e-06, "loss": 1.3658, "step": 980 }, { "epoch": 0.45, "grad_norm": 0.19092380360827313, "learning_rate": 6.64570037482205e-06, "loss": 1.3601, "step": 985 }, { "epoch": 0.46, "grad_norm": 0.21156775391897173, "learning_rate": 6.607673327855149e-06, "loss": 1.4427, "step": 990 }, { "epoch": 0.46, "grad_norm": 0.19213500215723073, "learning_rate": 6.569542281512388e-06, "loss": 1.3934, "step": 995 }, { "epoch": 0.46, "grad_norm": 0.19675677797230362, "learning_rate": 6.531309702467159e-06, "loss": 1.3552, "step": 1000 }, { "epoch": 0.46, "eval_loss": 1.3166489601135254, "eval_runtime": 1748.0429, "eval_samples_per_second": 2.168, "eval_steps_per_second": 0.271, "step": 1000 }, { "epoch": 0.46, "grad_norm": 0.18781337537819495, "learning_rate": 6.492978063960942e-06, "loss": 1.3937, "step": 1005 }, { "epoch": 0.47, "grad_norm": 0.1935060574506341, "learning_rate": 6.45454984564331e-06, "loss": 1.4284, "step": 1010 }, { "epoch": 0.47, "grad_norm": 0.1936429054806515, "learning_rate": 6.41602753341152e-06, "loss": 1.3618, "step": 1015 }, { "epoch": 0.47, "grad_norm": 0.19582370428755932, "learning_rate": 6.377413619249713e-06, "loss": 1.3822, "step": 1020 }, { "epoch": 0.47, "grad_norm": 0.18931877193304708, "learning_rate": 6.338710601067691e-06, "loss": 1.3473, "step": 1025 }, { "epoch": 0.47, "grad_norm": 0.1952618362908433, "learning_rate": 6.2999209825393445e-06, "loss": 1.369, "step": 1030 }, { "epoch": 0.48, "grad_norm": 0.196265797323174, "learning_rate": 6.2610472729406905e-06, "loss": 1.3679, "step": 1035 }, { "epoch": 0.48, "grad_norm": 0.1909001830769802, "learning_rate": 6.222091986987534e-06, "loss": 1.3939, "step": 1040 }, { "epoch": 0.48, "grad_norm": 0.1974314278029084, "learning_rate": 6.18305764467281e-06, "loss": 1.4111, "step": 1045 }, { "epoch": 0.48, "grad_norm": 0.19874201405315123, "learning_rate": 6.143946771103561e-06, "loss": 1.383, "step": 1050 }, { "epoch": 0.49, "grad_norm": 0.20109125948229767, "learning_rate": 6.104761896337581e-06, "loss": 1.3548, "step": 1055 }, { "epoch": 0.49, "grad_norm": 0.18937099603698346, "learning_rate": 6.0655055552197616e-06, "loss": 1.4427, "step": 1060 }, { "epoch": 0.49, "grad_norm": 0.20257668978871882, "learning_rate": 6.026180287218106e-06, "loss": 1.3773, "step": 1065 }, { "epoch": 0.49, "grad_norm": 0.19836764097355777, "learning_rate": 5.986788636259453e-06, "loss": 1.3945, "step": 1070 }, { "epoch": 0.5, "grad_norm": 0.19345413549116036, "learning_rate": 5.9473331505649125e-06, "loss": 1.4439, "step": 1075 }, { "epoch": 0.5, "grad_norm": 0.19576415671480885, "learning_rate": 5.907816382485026e-06, "loss": 1.3432, "step": 1080 }, { "epoch": 0.5, "grad_norm": 0.19643295445396305, "learning_rate": 5.8682408883346535e-06, "loss": 1.3459, "step": 1085 }, { "epoch": 0.5, "grad_norm": 0.19760654138918068, "learning_rate": 5.828609228227603e-06, "loss": 1.4334, "step": 1090 }, { "epoch": 0.5, "grad_norm": 0.1907622356589435, "learning_rate": 5.788923965911028e-06, "loss": 1.3195, "step": 1095 }, { "epoch": 0.51, "grad_norm": 0.19974010922162466, "learning_rate": 5.749187668599574e-06, "loss": 1.3973, "step": 1100 }, { "epoch": 0.51, "grad_norm": 0.1899811113208983, "learning_rate": 5.709402906809307e-06, "loss": 1.3788, "step": 1105 }, { "epoch": 0.51, "grad_norm": 0.19125136139691074, "learning_rate": 5.669572254191431e-06, "loss": 1.3749, "step": 1110 }, { "epoch": 0.51, "grad_norm": 0.19773999696793723, "learning_rate": 5.6296982873658e-06, "loss": 1.3812, "step": 1115 }, { "epoch": 0.52, "grad_norm": 0.19522230435252472, "learning_rate": 5.5897835857542315e-06, "loss": 1.3639, "step": 1120 }, { "epoch": 0.52, "grad_norm": 0.19203158758627778, "learning_rate": 5.549830731413655e-06, "loss": 1.3988, "step": 1125 }, { "epoch": 0.52, "grad_norm": 0.19785365288187984, "learning_rate": 5.509842308869075e-06, "loss": 1.4031, "step": 1130 }, { "epoch": 0.52, "grad_norm": 0.20110593532241236, "learning_rate": 5.469820904946383e-06, "loss": 1.3447, "step": 1135 }, { "epoch": 0.53, "grad_norm": 0.20730230846403253, "learning_rate": 5.429769108605013e-06, "loss": 1.433, "step": 1140 }, { "epoch": 0.53, "grad_norm": 0.19538915157122386, "learning_rate": 5.389689510770462e-06, "loss": 1.3751, "step": 1145 }, { "epoch": 0.53, "grad_norm": 0.19657731321379315, "learning_rate": 5.3495847041666935e-06, "loss": 1.4427, "step": 1150 }, { "epoch": 0.53, "grad_norm": 0.19885598934336826, "learning_rate": 5.30945728314841e-06, "loss": 1.3526, "step": 1155 }, { "epoch": 0.53, "grad_norm": 0.19763223830130308, "learning_rate": 5.269309843533222e-06, "loss": 1.3792, "step": 1160 }, { "epoch": 0.54, "grad_norm": 0.1934600019166271, "learning_rate": 5.229144982433736e-06, "loss": 1.3827, "step": 1165 }, { "epoch": 0.54, "grad_norm": 0.19215817298202406, "learning_rate": 5.188965298089538e-06, "loss": 1.3609, "step": 1170 }, { "epoch": 0.54, "grad_norm": 0.1950665854099098, "learning_rate": 5.148773389699123e-06, "loss": 1.3728, "step": 1175 }, { "epoch": 0.54, "grad_norm": 0.20163181813335146, "learning_rate": 5.108571857251754e-06, "loss": 1.3937, "step": 1180 }, { "epoch": 0.55, "grad_norm": 0.19492701026725848, "learning_rate": 5.068363301359263e-06, "loss": 1.3976, "step": 1185 }, { "epoch": 0.55, "grad_norm": 0.1915934216230785, "learning_rate": 5.0281503230878304e-06, "loss": 1.3778, "step": 1190 }, { "epoch": 0.55, "grad_norm": 0.19100537033272963, "learning_rate": 4.98793552378971e-06, "loss": 1.4221, "step": 1195 }, { "epoch": 0.55, "grad_norm": 0.19909784027087774, "learning_rate": 4.947721504934966e-06, "loss": 1.3685, "step": 1200 }, { "epoch": 0.55, "eval_loss": 1.3122756481170654, "eval_runtime": 1747.9099, "eval_samples_per_second": 2.168, "eval_steps_per_second": 0.271, "step": 1200 }, { "epoch": 0.56, "grad_norm": 0.19577442673853007, "learning_rate": 4.907510867943167e-06, "loss": 1.3595, "step": 1205 }, { "epoch": 0.56, "grad_norm": 0.2063936713267001, "learning_rate": 4.867306214015117e-06, "loss": 1.4202, "step": 1210 }, { "epoch": 0.56, "grad_norm": 0.19879562103497236, "learning_rate": 4.8271101439645765e-06, "loss": 1.3934, "step": 1215 }, { "epoch": 0.56, "grad_norm": 0.19945548173818886, "learning_rate": 4.786925258050024e-06, "loss": 1.3395, "step": 1220 }, { "epoch": 0.56, "grad_norm": 0.209259697791073, "learning_rate": 4.746754155806437e-06, "loss": 1.4072, "step": 1225 }, { "epoch": 0.57, "grad_norm": 0.19280116611929857, "learning_rate": 4.706599435877143e-06, "loss": 1.3976, "step": 1230 }, { "epoch": 0.57, "grad_norm": 0.1990941406891665, "learning_rate": 4.666463695845701e-06, "loss": 1.3912, "step": 1235 }, { "epoch": 0.57, "grad_norm": 0.19455600992105357, "learning_rate": 4.626349532067879e-06, "loss": 1.4003, "step": 1240 }, { "epoch": 0.57, "grad_norm": 0.20315382252326522, "learning_rate": 4.586259539503687e-06, "loss": 1.3876, "step": 1245 }, { "epoch": 0.58, "grad_norm": 0.19085048019709233, "learning_rate": 4.546196311549515e-06, "loss": 1.415, "step": 1250 }, { "epoch": 0.58, "grad_norm": 0.19155602915410036, "learning_rate": 4.506162439870366e-06, "loss": 1.388, "step": 1255 }, { "epoch": 0.58, "grad_norm": 0.1962993457750995, "learning_rate": 4.466160514232206e-06, "loss": 1.4069, "step": 1260 }, { "epoch": 0.58, "grad_norm": 0.19440064625069065, "learning_rate": 4.426193122334433e-06, "loss": 1.3625, "step": 1265 }, { "epoch": 0.59, "grad_norm": 0.19907155435638502, "learning_rate": 4.386262849642474e-06, "loss": 1.3621, "step": 1270 }, { "epoch": 0.59, "grad_norm": 0.19846401001306227, "learning_rate": 4.346372279220543e-06, "loss": 1.3438, "step": 1275 }, { "epoch": 0.59, "grad_norm": 0.19990637435356196, "learning_rate": 4.306523991564536e-06, "loss": 1.3857, "step": 1280 }, { "epoch": 0.59, "grad_norm": 0.1983045043565906, "learning_rate": 4.266720564435105e-06, "loss": 1.3477, "step": 1285 }, { "epoch": 0.59, "grad_norm": 0.2008086079053878, "learning_rate": 4.226964572690905e-06, "loss": 1.4032, "step": 1290 }, { "epoch": 0.6, "grad_norm": 0.2394515243143434, "learning_rate": 4.187258588122019e-06, "loss": 1.3757, "step": 1295 }, { "epoch": 0.6, "grad_norm": 0.1971799914885616, "learning_rate": 4.147605179283604e-06, "loss": 1.4156, "step": 1300 }, { "epoch": 0.6, "grad_norm": 0.2073477392149783, "learning_rate": 4.108006911329722e-06, "loss": 1.3881, "step": 1305 }, { "epoch": 0.6, "grad_norm": 0.2073153166108959, "learning_rate": 4.068466345847409e-06, "loss": 1.3687, "step": 1310 }, { "epoch": 0.61, "grad_norm": 0.20054177344121227, "learning_rate": 4.028986040690963e-06, "loss": 1.3785, "step": 1315 }, { "epoch": 0.61, "grad_norm": 0.20604849012426923, "learning_rate": 3.989568549816479e-06, "loss": 1.4169, "step": 1320 }, { "epoch": 0.61, "grad_norm": 0.19467062948633831, "learning_rate": 3.9502164231166354e-06, "loss": 1.4168, "step": 1325 }, { "epoch": 0.61, "grad_norm": 0.21219636801396732, "learning_rate": 3.910932206255742e-06, "loss": 1.3772, "step": 1330 }, { "epoch": 0.61, "grad_norm": 0.20051941796299297, "learning_rate": 3.87171844050507e-06, "loss": 1.3864, "step": 1335 }, { "epoch": 0.62, "grad_norm": 0.20033592892185176, "learning_rate": 3.8325776625784464e-06, "loss": 1.3984, "step": 1340 }, { "epoch": 0.62, "grad_norm": 0.19559628194598214, "learning_rate": 3.793512404468162e-06, "loss": 1.3954, "step": 1345 }, { "epoch": 0.62, "grad_norm": 0.21009377333677687, "learning_rate": 3.7545251932811824e-06, "loss": 1.3799, "step": 1350 }, { "epoch": 0.62, "grad_norm": 0.2058943301550432, "learning_rate": 3.7156185510756613e-06, "loss": 1.3763, "step": 1355 }, { "epoch": 0.63, "grad_norm": 0.20090178955938068, "learning_rate": 3.6767949946978026e-06, "loss": 1.4162, "step": 1360 }, { "epoch": 0.63, "grad_norm": 0.2034374577879598, "learning_rate": 3.6380570356190346e-06, "loss": 1.402, "step": 1365 }, { "epoch": 0.63, "grad_norm": 0.19652020002950302, "learning_rate": 3.5994071797735513e-06, "loss": 1.3667, "step": 1370 }, { "epoch": 0.63, "grad_norm": 0.20049219114904884, "learning_rate": 3.560847927396206e-06, "loss": 1.419, "step": 1375 }, { "epoch": 0.64, "grad_norm": 0.20857502981558484, "learning_rate": 3.5223817728607675e-06, "loss": 1.4082, "step": 1380 }, { "epoch": 0.64, "grad_norm": 0.19767809519926405, "learning_rate": 3.484011204518568e-06, "loss": 1.3947, "step": 1385 }, { "epoch": 0.64, "grad_norm": 0.1948689279113259, "learning_rate": 3.4457387045375255e-06, "loss": 1.3625, "step": 1390 }, { "epoch": 0.64, "grad_norm": 0.20429495021474384, "learning_rate": 3.4075667487415785e-06, "loss": 1.3978, "step": 1395 }, { "epoch": 0.64, "grad_norm": 0.2022436664727946, "learning_rate": 3.3694978064505258e-06, "loss": 1.3487, "step": 1400 }, { "epoch": 0.64, "eval_loss": 1.3093819618225098, "eval_runtime": 1769.0297, "eval_samples_per_second": 2.142, "eval_steps_per_second": 0.268, "step": 1400 }, { "epoch": 0.65, "grad_norm": 0.2010813579540422, "learning_rate": 3.331534340320287e-06, "loss": 1.3582, "step": 1405 }, { "epoch": 0.65, "grad_norm": 0.19489960246084403, "learning_rate": 3.293678806183596e-06, "loss": 1.42, "step": 1410 }, { "epoch": 0.65, "grad_norm": 0.20100567658267351, "learning_rate": 3.255933652891133e-06, "loss": 1.3887, "step": 1415 }, { "epoch": 0.65, "grad_norm": 0.198604846014806, "learning_rate": 3.218301322153111e-06, "loss": 1.3543, "step": 1420 }, { "epoch": 0.66, "grad_norm": 0.20073641523853392, "learning_rate": 3.180784248381322e-06, "loss": 1.3513, "step": 1425 }, { "epoch": 0.66, "grad_norm": 0.19424293274594104, "learning_rate": 3.1433848585316607e-06, "loss": 1.3885, "step": 1430 }, { "epoch": 0.66, "grad_norm": 0.2028571716247717, "learning_rate": 3.10610557194712e-06, "loss": 1.3824, "step": 1435 }, { "epoch": 0.66, "grad_norm": 0.1973272586923241, "learning_rate": 3.068948800201289e-06, "loss": 1.3332, "step": 1440 }, { "epoch": 0.67, "grad_norm": 0.20459650028827572, "learning_rate": 3.0319169469423487e-06, "loss": 1.3715, "step": 1445 }, { "epoch": 0.67, "grad_norm": 0.1966290880736998, "learning_rate": 2.995012407737581e-06, "loss": 1.3985, "step": 1450 }, { "epoch": 0.67, "grad_norm": 0.20394208158347749, "learning_rate": 2.958237569918404e-06, "loss": 1.3867, "step": 1455 }, { "epoch": 0.67, "grad_norm": 0.1971397504636877, "learning_rate": 2.9215948124259343e-06, "loss": 1.3739, "step": 1460 }, { "epoch": 0.67, "grad_norm": 0.1984132646647921, "learning_rate": 2.885086505657094e-06, "loss": 1.4459, "step": 1465 }, { "epoch": 0.68, "grad_norm": 0.20375542452519896, "learning_rate": 2.848715011311271e-06, "loss": 1.3606, "step": 1470 }, { "epoch": 0.68, "grad_norm": 0.19875885515941272, "learning_rate": 2.8124826822375473e-06, "loss": 1.4034, "step": 1475 }, { "epoch": 0.68, "grad_norm": 0.20571233168262154, "learning_rate": 2.7763918622824903e-06, "loss": 1.4358, "step": 1480 }, { "epoch": 0.68, "grad_norm": 0.20325983672817444, "learning_rate": 2.7404448861385293e-06, "loss": 1.3271, "step": 1485 }, { "epoch": 0.69, "grad_norm": 0.204674596724046, "learning_rate": 2.7046440791929306e-06, "loss": 1.3656, "step": 1490 }, { "epoch": 0.69, "grad_norm": 0.19864331126229412, "learning_rate": 2.6689917573773615e-06, "loss": 1.3712, "step": 1495 }, { "epoch": 0.69, "grad_norm": 0.19759957217116436, "learning_rate": 2.633490227018092e-06, "loss": 1.4061, "step": 1500 }, { "epoch": 0.69, "grad_norm": 0.1963001810018751, "learning_rate": 2.5981417846867753e-06, "loss": 1.3753, "step": 1505 }, { "epoch": 0.7, "grad_norm": 0.2037326390652298, "learning_rate": 2.5629487170518974e-06, "loss": 1.3468, "step": 1510 }, { "epoch": 0.7, "grad_norm": 0.20092378895163732, "learning_rate": 2.527913300730863e-06, "loss": 1.3831, "step": 1515 }, { "epoch": 0.7, "grad_norm": 0.2075661167474541, "learning_rate": 2.4930378021426977e-06, "loss": 1.3786, "step": 1520 }, { "epoch": 0.7, "grad_norm": 0.2002833079588797, "learning_rate": 2.4583244773614675e-06, "loss": 1.4058, "step": 1525 }, { "epoch": 0.7, "grad_norm": 0.20306778592495606, "learning_rate": 2.423775571970301e-06, "loss": 1.3704, "step": 1530 }, { "epoch": 0.71, "grad_norm": 0.22057110661612167, "learning_rate": 2.3893933209161465e-06, "loss": 1.3965, "step": 1535 }, { "epoch": 0.71, "grad_norm": 0.20413023154970353, "learning_rate": 2.3551799483651894e-06, "loss": 1.3935, "step": 1540 }, { "epoch": 0.71, "grad_norm": 0.2013070052195424, "learning_rate": 2.321137667558965e-06, "loss": 1.3757, "step": 1545 }, { "epoch": 0.71, "grad_norm": 0.20133662802149876, "learning_rate": 2.2872686806712037e-06, "loss": 1.3533, "step": 1550 }, { "epoch": 0.72, "grad_norm": 0.19816355686283976, "learning_rate": 2.2535751786653476e-06, "loss": 1.4014, "step": 1555 }, { "epoch": 0.72, "grad_norm": 0.20020195119951492, "learning_rate": 2.220059341152837e-06, "loss": 1.3721, "step": 1560 }, { "epoch": 0.72, "grad_norm": 0.20298393289470038, "learning_rate": 2.1867233362521127e-06, "loss": 1.3255, "step": 1565 }, { "epoch": 0.72, "grad_norm": 0.20399564011279728, "learning_rate": 2.153569320448348e-06, "loss": 1.3928, "step": 1570 }, { "epoch": 0.73, "grad_norm": 0.20016486527706043, "learning_rate": 2.120599438453968e-06, "loss": 1.3769, "step": 1575 }, { "epoch": 0.73, "grad_norm": 0.20169522873545517, "learning_rate": 2.087815823069886e-06, "loss": 1.3745, "step": 1580 }, { "epoch": 0.73, "grad_norm": 0.1991770376256046, "learning_rate": 2.055220595047551e-06, "loss": 1.3542, "step": 1585 }, { "epoch": 0.73, "grad_norm": 0.2083925008192503, "learning_rate": 2.022815862951751e-06, "loss": 1.4182, "step": 1590 }, { "epoch": 0.73, "grad_norm": 0.20086735172689546, "learning_rate": 1.990603723024213e-06, "loss": 1.3524, "step": 1595 }, { "epoch": 0.74, "grad_norm": 0.20621921540546848, "learning_rate": 1.9585862590480005e-06, "loss": 1.3891, "step": 1600 }, { "epoch": 0.74, "eval_loss": 1.3076461553573608, "eval_runtime": 1777.0113, "eval_samples_per_second": 2.133, "eval_steps_per_second": 0.267, "step": 1600 }, { "epoch": 0.74, "grad_norm": 0.20007769280547363, "learning_rate": 1.926765542212707e-06, "loss": 1.3856, "step": 1605 }, { "epoch": 0.74, "grad_norm": 0.1980821730251063, "learning_rate": 1.8951436309804766e-06, "loss": 1.383, "step": 1610 }, { "epoch": 0.74, "grad_norm": 0.20314121257047116, "learning_rate": 1.8637225709528506e-06, "loss": 1.3752, "step": 1615 }, { "epoch": 0.75, "grad_norm": 0.2000738503413009, "learning_rate": 1.832504394738428e-06, "loss": 1.3501, "step": 1620 }, { "epoch": 0.75, "grad_norm": 0.19626992677384789, "learning_rate": 1.8014911218213832e-06, "loss": 1.3776, "step": 1625 }, { "epoch": 0.75, "grad_norm": 0.2132143728962325, "learning_rate": 1.770684758430824e-06, "loss": 1.3641, "step": 1630 }, { "epoch": 0.75, "grad_norm": 0.20230145941791186, "learning_rate": 1.7400872974110088e-06, "loss": 1.3714, "step": 1635 }, { "epoch": 0.76, "grad_norm": 0.20322390315184813, "learning_rate": 1.7097007180924375e-06, "loss": 1.3559, "step": 1640 }, { "epoch": 0.76, "grad_norm": 0.2097270533204938, "learning_rate": 1.6795269861638041e-06, "loss": 1.3555, "step": 1645 }, { "epoch": 0.76, "grad_norm": 0.20409576963271045, "learning_rate": 1.6495680535448405e-06, "loss": 1.3376, "step": 1650 }, { "epoch": 0.76, "grad_norm": 0.209992582333707, "learning_rate": 1.6198258582600418e-06, "loss": 1.3393, "step": 1655 }, { "epoch": 0.76, "grad_norm": 0.20186485511811675, "learning_rate": 1.590302324313303e-06, "loss": 1.3476, "step": 1660 }, { "epoch": 0.77, "grad_norm": 0.2039959334961091, "learning_rate": 1.5609993615634578e-06, "loss": 1.4172, "step": 1665 }, { "epoch": 0.77, "grad_norm": 0.2032115658244104, "learning_rate": 1.531918865600725e-06, "loss": 1.3866, "step": 1670 }, { "epoch": 0.77, "grad_norm": 0.20663125873556282, "learning_rate": 1.5030627176240903e-06, "loss": 1.3413, "step": 1675 }, { "epoch": 0.77, "grad_norm": 0.21419285282068773, "learning_rate": 1.4744327843196043e-06, "loss": 1.3685, "step": 1680 }, { "epoch": 0.78, "grad_norm": 0.20427116148089472, "learning_rate": 1.446030917739633e-06, "loss": 1.3864, "step": 1685 }, { "epoch": 0.78, "grad_norm": 0.21267201464638189, "learning_rate": 1.4178589551830585e-06, "loss": 1.3578, "step": 1690 }, { "epoch": 0.78, "grad_norm": 0.2021417320485247, "learning_rate": 1.3899187190764062e-06, "loss": 1.4034, "step": 1695 }, { "epoch": 0.78, "grad_norm": 0.19953428976865786, "learning_rate": 1.3622120168559656e-06, "loss": 1.3378, "step": 1700 }, { "epoch": 0.79, "grad_norm": 0.20322168013048264, "learning_rate": 1.3347406408508695e-06, "loss": 1.4032, "step": 1705 }, { "epoch": 0.79, "grad_norm": 0.19617963419360818, "learning_rate": 1.3075063681671408e-06, "loss": 1.3815, "step": 1710 }, { "epoch": 0.79, "grad_norm": 0.20338294719666272, "learning_rate": 1.280510960572745e-06, "loss": 1.376, "step": 1715 }, { "epoch": 0.79, "grad_norm": 0.20462364560864363, "learning_rate": 1.2537561643836087e-06, "loss": 1.3866, "step": 1720 }, { "epoch": 0.79, "grad_norm": 0.1963149280229846, "learning_rate": 1.2272437103506596e-06, "loss": 1.372, "step": 1725 }, { "epoch": 0.8, "grad_norm": 0.19716113021249304, "learning_rate": 1.200975313547867e-06, "loss": 1.3599, "step": 1730 }, { "epoch": 0.8, "grad_norm": 0.20573469916350295, "learning_rate": 1.1749526732612842e-06, "loss": 1.3562, "step": 1735 }, { "epoch": 0.8, "grad_norm": 0.20684676012413714, "learning_rate": 1.1491774728791416e-06, "loss": 1.3296, "step": 1740 }, { "epoch": 0.8, "grad_norm": 0.2090768120512491, "learning_rate": 1.1236513797829285e-06, "loss": 1.4248, "step": 1745 }, { "epoch": 0.81, "grad_norm": 0.20571837930886522, "learning_rate": 1.0983760452395415e-06, "loss": 1.3609, "step": 1750 }, { "epoch": 0.81, "grad_norm": 0.20867562597707268, "learning_rate": 1.07335310429447e-06, "loss": 1.3848, "step": 1755 }, { "epoch": 0.81, "grad_norm": 0.1988423228103918, "learning_rate": 1.048584175666012e-06, "loss": 1.3712, "step": 1760 }, { "epoch": 0.81, "grad_norm": 0.21063039041467455, "learning_rate": 1.0240708616405788e-06, "loss": 1.3611, "step": 1765 }, { "epoch": 0.82, "grad_norm": 0.20306439311302277, "learning_rate": 9.998147479690251e-07, "loss": 1.3478, "step": 1770 }, { "epoch": 0.82, "grad_norm": 0.2020964557722793, "learning_rate": 9.75817403764079e-07, "loss": 1.3433, "step": 1775 }, { "epoch": 0.82, "grad_norm": 0.20597190432737983, "learning_rate": 9.520803813988366e-07, "loss": 1.4058, "step": 1780 }, { "epoch": 0.82, "grad_norm": 0.20380209329114748, "learning_rate": 9.286052164063369e-07, "loss": 1.4028, "step": 1785 }, { "epoch": 0.82, "grad_norm": 0.2041857004062742, "learning_rate": 9.053934273802312e-07, "loss": 1.383, "step": 1790 }, { "epoch": 0.83, "grad_norm": 0.20684812528414637, "learning_rate": 8.824465158765433e-07, "loss": 1.3512, "step": 1795 }, { "epoch": 0.83, "grad_norm": 0.20386591712113425, "learning_rate": 8.597659663165364e-07, "loss": 1.3858, "step": 1800 }, { "epoch": 0.83, "eval_loss": 1.306676983833313, "eval_runtime": 1760.6924, "eval_samples_per_second": 2.153, "eval_steps_per_second": 0.269, "step": 1800 }, { "epoch": 0.83, "grad_norm": 0.19511280510666812, "learning_rate": 8.373532458906897e-07, "loss": 1.3261, "step": 1805 }, { "epoch": 0.83, "grad_norm": 0.20356348959673148, "learning_rate": 8.15209804463783e-07, "loss": 1.3288, "step": 1810 }, { "epoch": 0.84, "grad_norm": 0.2103018153717413, "learning_rate": 7.93337074481108e-07, "loss": 1.4425, "step": 1815 }, { "epoch": 0.84, "grad_norm": 0.20468220080909677, "learning_rate": 7.717364708758024e-07, "loss": 1.406, "step": 1820 }, { "epoch": 0.84, "grad_norm": 0.2040164153992187, "learning_rate": 7.504093909773174e-07, "loss": 1.3601, "step": 1825 }, { "epoch": 0.84, "grad_norm": 0.19961500414001193, "learning_rate": 7.293572144210332e-07, "loss": 1.3777, "step": 1830 }, { "epoch": 0.85, "grad_norm": 0.20035042813237278, "learning_rate": 7.085813030590022e-07, "loss": 1.3944, "step": 1835 }, { "epoch": 0.85, "grad_norm": 0.20685428154054034, "learning_rate": 6.880830008718564e-07, "loss": 1.3778, "step": 1840 }, { "epoch": 0.85, "grad_norm": 0.19642667708352796, "learning_rate": 6.678636338818645e-07, "loss": 1.3458, "step": 1845 }, { "epoch": 0.85, "grad_norm": 0.20222774080494071, "learning_rate": 6.47924510067151e-07, "loss": 1.3655, "step": 1850 }, { "epoch": 0.85, "grad_norm": 0.20419539962584285, "learning_rate": 6.282669192770896e-07, "loss": 1.424, "step": 1855 }, { "epoch": 0.86, "grad_norm": 0.1975487856167091, "learning_rate": 6.088921331488568e-07, "loss": 1.3424, "step": 1860 }, { "epoch": 0.86, "grad_norm": 0.2094178118159778, "learning_rate": 5.898014050251765e-07, "loss": 1.3611, "step": 1865 }, { "epoch": 0.86, "grad_norm": 0.20636955576818874, "learning_rate": 5.709959698732359e-07, "loss": 1.3779, "step": 1870 }, { "epoch": 0.86, "grad_norm": 0.21747927560303068, "learning_rate": 5.524770442047978e-07, "loss": 1.3308, "step": 1875 }, { "epoch": 0.87, "grad_norm": 0.20050302654738292, "learning_rate": 5.342458259975147e-07, "loss": 1.3865, "step": 1880 }, { "epoch": 0.87, "grad_norm": 0.2392412132808601, "learning_rate": 5.163034946174161e-07, "loss": 1.3792, "step": 1885 }, { "epoch": 0.87, "grad_norm": 0.20914070749895322, "learning_rate": 4.986512107426283e-07, "loss": 1.3812, "step": 1890 }, { "epoch": 0.87, "grad_norm": 0.21320420478322508, "learning_rate": 4.812901162882871e-07, "loss": 1.443, "step": 1895 }, { "epoch": 0.88, "grad_norm": 0.20606583697965636, "learning_rate": 4.6422133433266513e-07, "loss": 1.3546, "step": 1900 }, { "epoch": 0.88, "grad_norm": 0.20597370559054526, "learning_rate": 4.474459690445293e-07, "loss": 1.3803, "step": 1905 }, { "epoch": 0.88, "grad_norm": 0.20609624124174958, "learning_rate": 4.309651056117009e-07, "loss": 1.3806, "step": 1910 }, { "epoch": 0.88, "grad_norm": 0.21011536963816732, "learning_rate": 4.1477981017086387e-07, "loss": 1.3857, "step": 1915 }, { "epoch": 0.88, "grad_norm": 0.20861991251552262, "learning_rate": 3.9889112973859554e-07, "loss": 1.4178, "step": 1920 }, { "epoch": 0.89, "grad_norm": 0.20565429912809985, "learning_rate": 3.8330009214363197e-07, "loss": 1.3485, "step": 1925 }, { "epoch": 0.89, "grad_norm": 0.213262370271027, "learning_rate": 3.680077059603876e-07, "loss": 1.3857, "step": 1930 }, { "epoch": 0.89, "grad_norm": 0.21006993775062202, "learning_rate": 3.530149604436983e-07, "loss": 1.3718, "step": 1935 }, { "epoch": 0.89, "grad_norm": 0.20971631358735854, "learning_rate": 3.3832282546483686e-07, "loss": 1.3401, "step": 1940 }, { "epoch": 0.9, "grad_norm": 0.20675053040706537, "learning_rate": 3.239322514487686e-07, "loss": 1.3976, "step": 1945 }, { "epoch": 0.9, "grad_norm": 0.2070605402743848, "learning_rate": 3.098441693126719e-07, "loss": 1.3801, "step": 1950 }, { "epoch": 0.9, "grad_norm": 0.2036473853339382, "learning_rate": 2.9605949040571456e-07, "loss": 1.3975, "step": 1955 }, { "epoch": 0.9, "grad_norm": 0.20446499651024982, "learning_rate": 2.8257910645009935e-07, "loss": 1.3932, "step": 1960 }, { "epoch": 0.91, "grad_norm": 0.20745728206982056, "learning_rate": 2.6940388948338057e-07, "loss": 1.4214, "step": 1965 }, { "epoch": 0.91, "grad_norm": 0.20628378675298625, "learning_rate": 2.565346918020534e-07, "loss": 1.3234, "step": 1970 }, { "epoch": 0.91, "grad_norm": 0.19884308459081187, "learning_rate": 2.4397234590641696e-07, "loss": 1.4086, "step": 1975 }, { "epoch": 0.91, "grad_norm": 0.196985413218799, "learning_rate": 2.3171766444672227e-07, "loss": 1.4203, "step": 1980 }, { "epoch": 0.91, "grad_norm": 0.20225024562843938, "learning_rate": 2.1977144017060027e-07, "loss": 1.3859, "step": 1985 }, { "epoch": 0.92, "grad_norm": 0.20030763530819898, "learning_rate": 2.0813444587178156e-07, "loss": 1.3889, "step": 1990 }, { "epoch": 0.92, "grad_norm": 0.20327002619311596, "learning_rate": 1.9680743434010385e-07, "loss": 1.3745, "step": 1995 }, { "epoch": 0.92, "grad_norm": 0.20095518840823687, "learning_rate": 1.8579113831281525e-07, "loss": 1.3635, "step": 2000 }, { "epoch": 0.92, "eval_loss": 1.3063520193099976, "eval_runtime": 2013.9504, "eval_samples_per_second": 1.882, "eval_steps_per_second": 0.235, "step": 2000 }, { "epoch": 0.92, "grad_norm": 0.20459938594623875, "learning_rate": 1.7508627042717387e-07, "loss": 1.4269, "step": 2005 }, { "epoch": 0.93, "grad_norm": 0.2004705031433909, "learning_rate": 1.6469352317434627e-07, "loss": 1.3789, "step": 2010 }, { "epoch": 0.93, "grad_norm": 0.19922047944368607, "learning_rate": 1.5461356885461077e-07, "loss": 1.3811, "step": 2015 }, { "epoch": 0.93, "grad_norm": 0.20506581374899654, "learning_rate": 1.4484705953386968e-07, "loss": 1.3677, "step": 2020 }, { "epoch": 0.93, "grad_norm": 0.20196871274072786, "learning_rate": 1.35394627001465e-07, "loss": 1.3871, "step": 2025 }, { "epoch": 0.94, "grad_norm": 0.21302376799020897, "learning_rate": 1.2625688272930925e-07, "loss": 1.3673, "step": 2030 }, { "epoch": 0.94, "grad_norm": 0.20437076323448575, "learning_rate": 1.174344178323289e-07, "loss": 1.3701, "step": 2035 }, { "epoch": 0.94, "grad_norm": 0.20259273082335125, "learning_rate": 1.0892780303022377e-07, "loss": 1.4004, "step": 2040 }, { "epoch": 0.94, "grad_norm": 0.19927961571470354, "learning_rate": 1.007375886105555e-07, "loss": 1.3781, "step": 2045 }, { "epoch": 0.94, "grad_norm": 0.19648191268958623, "learning_rate": 9.286430439313876e-08, "loss": 1.3719, "step": 2050 }, { "epoch": 0.95, "grad_norm": 0.2081842753213948, "learning_rate": 8.530845969577594e-08, "loss": 1.3347, "step": 2055 }, { "epoch": 0.95, "grad_norm": 0.19777453793305202, "learning_rate": 7.80705433013046e-08, "loss": 1.3645, "step": 2060 }, { "epoch": 0.95, "grad_norm": 0.2001901456526902, "learning_rate": 7.115102342598101e-08, "loss": 1.3549, "step": 2065 }, { "epoch": 0.95, "grad_norm": 0.2085779076020987, "learning_rate": 6.455034768919288e-08, "loss": 1.395, "step": 2070 }, { "epoch": 0.96, "grad_norm": 0.2007001569199437, "learning_rate": 5.826894308449904e-08, "loss": 1.3418, "step": 2075 }, { "epoch": 0.96, "grad_norm": 0.20019336918988379, "learning_rate": 5.230721595201049e-08, "loss": 1.3808, "step": 2080 }, { "epoch": 0.96, "grad_norm": 0.20532075618369505, "learning_rate": 4.666555195210365e-08, "loss": 1.3624, "step": 2085 }, { "epoch": 0.96, "grad_norm": 0.19955856360806487, "learning_rate": 4.134431604047195e-08, "loss": 1.3851, "step": 2090 }, { "epoch": 0.97, "grad_norm": 0.19995218707394016, "learning_rate": 3.63438524445181e-08, "loss": 1.404, "step": 2095 }, { "epoch": 0.97, "grad_norm": 0.2027473195079258, "learning_rate": 3.166448464108629e-08, "loss": 1.3654, "step": 2100 }, { "epoch": 0.97, "grad_norm": 0.21316411475600813, "learning_rate": 2.7306515335532857e-08, "loss": 1.4004, "step": 2105 }, { "epoch": 0.97, "grad_norm": 0.19985677477921807, "learning_rate": 2.327022644215193e-08, "loss": 1.3813, "step": 2110 }, { "epoch": 0.97, "grad_norm": 0.20946530787894166, "learning_rate": 1.9555879065930038e-08, "loss": 1.4226, "step": 2115 }, { "epoch": 0.98, "grad_norm": 0.1975351975591122, "learning_rate": 1.6163713485662923e-08, "loss": 1.3792, "step": 2120 }, { "epoch": 0.98, "grad_norm": 0.20571768755825784, "learning_rate": 1.3093949138406892e-08, "loss": 1.3918, "step": 2125 }, { "epoch": 0.98, "grad_norm": 0.44915439425778153, "learning_rate": 1.03467846052846e-08, "loss": 1.3642, "step": 2130 }, { "epoch": 0.98, "grad_norm": 0.20214902180431968, "learning_rate": 7.922397598642551e-09, "loss": 1.3599, "step": 2135 }, { "epoch": 0.99, "grad_norm": 0.19767842515824202, "learning_rate": 5.820944950549745e-09, "loss": 1.3599, "step": 2140 }, { "epoch": 0.99, "grad_norm": 0.1991438801798082, "learning_rate": 4.042562602655231e-09, "loss": 1.3446, "step": 2145 }, { "epoch": 0.99, "grad_norm": 0.20173015884228035, "learning_rate": 2.5873655973945864e-09, "loss": 1.3461, "step": 2150 }, { "epoch": 0.99, "grad_norm": 0.20119946240562925, "learning_rate": 1.4554480705458729e-09, "loss": 1.3474, "step": 2155 }, { "epoch": 1.0, "grad_norm": 0.20468648298029762, "learning_rate": 6.468832451417273e-10, "loss": 1.3649, "step": 2160 }, { "epoch": 1.0, "grad_norm": 0.19998165788056904, "learning_rate": 1.617234267320411e-10, "loss": 1.3677, "step": 2165 }, { "epoch": 1.0, "grad_norm": 0.21026892718522863, "learning_rate": 0.0, "loss": 1.3748, "step": 2170 }, { "epoch": 1.0, "step": 2170, "total_flos": 7055767844683776.0, "train_loss": 1.4133363889659056, "train_runtime": 112943.9974, "train_samples_per_second": 0.615, "train_steps_per_second": 0.019 } ], "logging_steps": 5, "max_steps": 2170, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 200, "total_flos": 7055767844683776.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }