{
"best_metric": 1.3063520193099976,
"best_model_checkpoint": "/mnt/users/n3thakur/vectara/huggingface-dpo/trained_models/v3/Meta-Llama-3-8B-Instruct-miracl-mix-raft-sft-25th-apr-v1.0/checkpoint-2000",
"epoch": 0.9996544972935621,
"eval_steps": 200,
"global_step": 2170,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"grad_norm": 1.0262953847325709,
"learning_rate": 4.608294930875576e-08,
"loss": 1.7621,
"step": 1
},
{
"epoch": 0.0,
"grad_norm": 0.954880939596983,
"learning_rate": 2.3041474654377884e-07,
"loss": 1.7602,
"step": 5
},
{
"epoch": 0.0,
"grad_norm": 1.0004644202041775,
"learning_rate": 4.608294930875577e-07,
"loss": 1.8162,
"step": 10
},
{
"epoch": 0.01,
"grad_norm": 1.0280115397490373,
"learning_rate": 6.912442396313365e-07,
"loss": 1.7724,
"step": 15
},
{
"epoch": 0.01,
"grad_norm": 0.8732665912953524,
"learning_rate": 9.216589861751154e-07,
"loss": 1.7378,
"step": 20
},
{
"epoch": 0.01,
"grad_norm": 0.9441885280056963,
"learning_rate": 1.1520737327188942e-06,
"loss": 1.7507,
"step": 25
},
{
"epoch": 0.01,
"grad_norm": 0.9845213443275476,
"learning_rate": 1.382488479262673e-06,
"loss": 1.7561,
"step": 30
},
{
"epoch": 0.02,
"grad_norm": 0.9521516823054887,
"learning_rate": 1.6129032258064516e-06,
"loss": 1.7479,
"step": 35
},
{
"epoch": 0.02,
"grad_norm": 1.0047303931298321,
"learning_rate": 1.8433179723502307e-06,
"loss": 1.7773,
"step": 40
},
{
"epoch": 0.02,
"grad_norm": 0.9230639224993269,
"learning_rate": 2.0737327188940094e-06,
"loss": 1.7752,
"step": 45
},
{
"epoch": 0.02,
"grad_norm": 0.9112137084493539,
"learning_rate": 2.3041474654377884e-06,
"loss": 1.7213,
"step": 50
},
{
"epoch": 0.03,
"grad_norm": 0.8909029043612109,
"learning_rate": 2.5345622119815673e-06,
"loss": 1.7881,
"step": 55
},
{
"epoch": 0.03,
"grad_norm": 0.7189360071641709,
"learning_rate": 2.764976958525346e-06,
"loss": 1.6636,
"step": 60
},
{
"epoch": 0.03,
"grad_norm": 0.6692633878715751,
"learning_rate": 2.9953917050691243e-06,
"loss": 1.6508,
"step": 65
},
{
"epoch": 0.03,
"grad_norm": 0.6188511577538577,
"learning_rate": 3.225806451612903e-06,
"loss": 1.7011,
"step": 70
},
{
"epoch": 0.03,
"grad_norm": 0.6279410712564498,
"learning_rate": 3.4562211981566825e-06,
"loss": 1.6403,
"step": 75
},
{
"epoch": 0.04,
"grad_norm": 0.5342697084988228,
"learning_rate": 3.6866359447004615e-06,
"loss": 1.656,
"step": 80
},
{
"epoch": 0.04,
"grad_norm": 0.44402512139023,
"learning_rate": 3.91705069124424e-06,
"loss": 1.6617,
"step": 85
},
{
"epoch": 0.04,
"grad_norm": 0.4644619191475398,
"learning_rate": 4.147465437788019e-06,
"loss": 1.6303,
"step": 90
},
{
"epoch": 0.04,
"grad_norm": 0.371289417426217,
"learning_rate": 4.377880184331797e-06,
"loss": 1.6453,
"step": 95
},
{
"epoch": 0.05,
"grad_norm": 0.3309787275557066,
"learning_rate": 4.608294930875577e-06,
"loss": 1.5851,
"step": 100
},
{
"epoch": 0.05,
"grad_norm": 0.31477252449769144,
"learning_rate": 4.838709677419355e-06,
"loss": 1.604,
"step": 105
},
{
"epoch": 0.05,
"grad_norm": 0.2612544823481869,
"learning_rate": 5.0691244239631346e-06,
"loss": 1.5725,
"step": 110
},
{
"epoch": 0.05,
"grad_norm": 0.2605484332821398,
"learning_rate": 5.299539170506913e-06,
"loss": 1.5644,
"step": 115
},
{
"epoch": 0.06,
"grad_norm": 0.25148232016385397,
"learning_rate": 5.529953917050692e-06,
"loss": 1.5467,
"step": 120
},
{
"epoch": 0.06,
"grad_norm": 0.2601998287824473,
"learning_rate": 5.76036866359447e-06,
"loss": 1.589,
"step": 125
},
{
"epoch": 0.06,
"grad_norm": 0.2385121609938763,
"learning_rate": 5.9907834101382485e-06,
"loss": 1.6055,
"step": 130
},
{
"epoch": 0.06,
"grad_norm": 0.24629649793790628,
"learning_rate": 6.221198156682028e-06,
"loss": 1.5373,
"step": 135
},
{
"epoch": 0.06,
"grad_norm": 0.2468655185850127,
"learning_rate": 6.451612903225806e-06,
"loss": 1.5614,
"step": 140
},
{
"epoch": 0.07,
"grad_norm": 0.22223391970377232,
"learning_rate": 6.682027649769586e-06,
"loss": 1.5624,
"step": 145
},
{
"epoch": 0.07,
"grad_norm": 0.2062049955510628,
"learning_rate": 6.912442396313365e-06,
"loss": 1.5013,
"step": 150
},
{
"epoch": 0.07,
"grad_norm": 0.19876212655323225,
"learning_rate": 7.1428571428571436e-06,
"loss": 1.575,
"step": 155
},
{
"epoch": 0.07,
"grad_norm": 0.19765238479942285,
"learning_rate": 7.373271889400923e-06,
"loss": 1.5167,
"step": 160
},
{
"epoch": 0.08,
"grad_norm": 0.19240215611308686,
"learning_rate": 7.603686635944701e-06,
"loss": 1.5071,
"step": 165
},
{
"epoch": 0.08,
"grad_norm": 0.2001053295395004,
"learning_rate": 7.83410138248848e-06,
"loss": 1.4932,
"step": 170
},
{
"epoch": 0.08,
"grad_norm": 0.20863957442778325,
"learning_rate": 8.064516129032258e-06,
"loss": 1.5371,
"step": 175
},
{
"epoch": 0.08,
"grad_norm": 0.1880073967302754,
"learning_rate": 8.294930875576038e-06,
"loss": 1.492,
"step": 180
},
{
"epoch": 0.09,
"grad_norm": 0.18052519991071567,
"learning_rate": 8.525345622119815e-06,
"loss": 1.5039,
"step": 185
},
{
"epoch": 0.09,
"grad_norm": 0.17252733686762506,
"learning_rate": 8.755760368663595e-06,
"loss": 1.492,
"step": 190
},
{
"epoch": 0.09,
"grad_norm": 0.1754856007551659,
"learning_rate": 8.986175115207374e-06,
"loss": 1.4926,
"step": 195
},
{
"epoch": 0.09,
"grad_norm": 0.16521189205999245,
"learning_rate": 9.216589861751153e-06,
"loss": 1.4903,
"step": 200
},
{
"epoch": 0.09,
"eval_loss": 1.3960996866226196,
"eval_runtime": 1753.0374,
"eval_samples_per_second": 2.162,
"eval_steps_per_second": 0.27,
"step": 200
},
{
"epoch": 0.09,
"grad_norm": 0.1618648599795676,
"learning_rate": 9.447004608294931e-06,
"loss": 1.4499,
"step": 205
},
{
"epoch": 0.1,
"grad_norm": 0.1609921643942943,
"learning_rate": 9.67741935483871e-06,
"loss": 1.4775,
"step": 210
},
{
"epoch": 0.1,
"grad_norm": 0.161807092383862,
"learning_rate": 9.90783410138249e-06,
"loss": 1.4779,
"step": 215
},
{
"epoch": 0.1,
"grad_norm": 0.15789342984068674,
"learning_rate": 9.999941779365509e-06,
"loss": 1.4064,
"step": 220
},
{
"epoch": 0.1,
"grad_norm": 0.17953421611953463,
"learning_rate": 9.99958599150926e-06,
"loss": 1.4216,
"step": 225
},
{
"epoch": 0.11,
"grad_norm": 0.17033046165043145,
"learning_rate": 9.998906783581494e-06,
"loss": 1.4872,
"step": 230
},
{
"epoch": 0.11,
"grad_norm": 0.15935058975437466,
"learning_rate": 9.997904199519748e-06,
"loss": 1.4473,
"step": 235
},
{
"epoch": 0.11,
"grad_norm": 0.16188090301066688,
"learning_rate": 9.996578304180551e-06,
"loss": 1.4484,
"step": 240
},
{
"epoch": 0.11,
"grad_norm": 0.1692826467439708,
"learning_rate": 9.994929183335237e-06,
"loss": 1.4576,
"step": 245
},
{
"epoch": 0.12,
"grad_norm": 0.16233371930510018,
"learning_rate": 9.992956943664401e-06,
"loss": 1.4674,
"step": 250
},
{
"epoch": 0.12,
"grad_norm": 0.1601701723529579,
"learning_rate": 9.99066171275098e-06,
"loss": 1.434,
"step": 255
},
{
"epoch": 0.12,
"grad_norm": 0.1515934744211598,
"learning_rate": 9.988043639072021e-06,
"loss": 1.469,
"step": 260
},
{
"epoch": 0.12,
"grad_norm": 0.1558360755989006,
"learning_rate": 9.985102891989063e-06,
"loss": 1.4688,
"step": 265
},
{
"epoch": 0.12,
"grad_norm": 0.20565866787405954,
"learning_rate": 9.98183966173718e-06,
"loss": 1.4794,
"step": 270
},
{
"epoch": 0.13,
"grad_norm": 0.16589082756750548,
"learning_rate": 9.97825415941269e-06,
"loss": 1.4514,
"step": 275
},
{
"epoch": 0.13,
"grad_norm": 0.16179243903344054,
"learning_rate": 9.974346616959476e-06,
"loss": 1.4802,
"step": 280
},
{
"epoch": 0.13,
"grad_norm": 0.15782689090886812,
"learning_rate": 9.970117287154004e-06,
"loss": 1.4356,
"step": 285
},
{
"epoch": 0.13,
"grad_norm": 0.15954652235182576,
"learning_rate": 9.965566443588956e-06,
"loss": 1.3886,
"step": 290
},
{
"epoch": 0.14,
"grad_norm": 0.16253218390477409,
"learning_rate": 9.960694380655539e-06,
"loss": 1.456,
"step": 295
},
{
"epoch": 0.14,
"grad_norm": 0.15571215758010262,
"learning_rate": 9.955501413524438e-06,
"loss": 1.4038,
"step": 300
},
{
"epoch": 0.14,
"grad_norm": 0.15764668416666167,
"learning_rate": 9.949987878125427e-06,
"loss": 1.4292,
"step": 305
},
{
"epoch": 0.14,
"grad_norm": 0.15565461047093965,
"learning_rate": 9.944154131125643e-06,
"loss": 1.3845,
"step": 310
},
{
"epoch": 0.15,
"grad_norm": 0.16274505869667716,
"learning_rate": 9.938000549906509e-06,
"loss": 1.4143,
"step": 315
},
{
"epoch": 0.15,
"grad_norm": 0.15771377018169355,
"learning_rate": 9.93152753253932e-06,
"loss": 1.414,
"step": 320
},
{
"epoch": 0.15,
"grad_norm": 0.15819595549860918,
"learning_rate": 9.924735497759497e-06,
"loss": 1.398,
"step": 325
},
{
"epoch": 0.15,
"grad_norm": 0.16836842547624103,
"learning_rate": 9.917624884939495e-06,
"loss": 1.415,
"step": 330
},
{
"epoch": 0.15,
"grad_norm": 0.16454346246919038,
"learning_rate": 9.910196154060381e-06,
"loss": 1.5025,
"step": 335
},
{
"epoch": 0.16,
"grad_norm": 0.15952712351687925,
"learning_rate": 9.902449785682084e-06,
"loss": 1.4602,
"step": 340
},
{
"epoch": 0.16,
"grad_norm": 0.15913354058980336,
"learning_rate": 9.894386280912298e-06,
"loss": 1.4437,
"step": 345
},
{
"epoch": 0.16,
"grad_norm": 0.16526633029775475,
"learning_rate": 9.88600616137407e-06,
"loss": 1.443,
"step": 350
},
{
"epoch": 0.16,
"grad_norm": 0.1570973278965336,
"learning_rate": 9.877309969172065e-06,
"loss": 1.4001,
"step": 355
},
{
"epoch": 0.17,
"grad_norm": 0.16323401596231638,
"learning_rate": 9.868298266857477e-06,
"loss": 1.4115,
"step": 360
},
{
"epoch": 0.17,
"grad_norm": 0.16999252145703372,
"learning_rate": 9.858971637391662e-06,
"loss": 1.431,
"step": 365
},
{
"epoch": 0.17,
"grad_norm": 0.1598876979960763,
"learning_rate": 9.849330684108409e-06,
"loss": 1.3925,
"step": 370
},
{
"epoch": 0.17,
"grad_norm": 0.16716694688842804,
"learning_rate": 9.83937603067492e-06,
"loss": 1.4369,
"step": 375
},
{
"epoch": 0.18,
"grad_norm": 0.17160997265475317,
"learning_rate": 9.829108321051461e-06,
"loss": 1.4236,
"step": 380
},
{
"epoch": 0.18,
"grad_norm": 0.1709419213466886,
"learning_rate": 9.818528219449705e-06,
"loss": 1.4156,
"step": 385
},
{
"epoch": 0.18,
"grad_norm": 0.1706660498176678,
"learning_rate": 9.807636410289767e-06,
"loss": 1.3531,
"step": 390
},
{
"epoch": 0.18,
"grad_norm": 0.1654207117166432,
"learning_rate": 9.796433598155928e-06,
"loss": 1.4282,
"step": 395
},
{
"epoch": 0.18,
"grad_norm": 0.17005569486504588,
"learning_rate": 9.784920507751052e-06,
"loss": 1.465,
"step": 400
},
{
"epoch": 0.18,
"eval_loss": 1.3499208688735962,
"eval_runtime": 1761.5948,
"eval_samples_per_second": 2.151,
"eval_steps_per_second": 0.269,
"step": 400
},
{
"epoch": 0.19,
"grad_norm": 0.16465910018813473,
"learning_rate": 9.773097883849715e-06,
"loss": 1.4856,
"step": 405
},
{
"epoch": 0.19,
"grad_norm": 0.1687241699251977,
"learning_rate": 9.760966491250018e-06,
"loss": 1.4448,
"step": 410
},
{
"epoch": 0.19,
"grad_norm": 0.173826161389088,
"learning_rate": 9.748527114724111e-06,
"loss": 1.4588,
"step": 415
},
{
"epoch": 0.19,
"grad_norm": 0.1664583424134419,
"learning_rate": 9.735780558967434e-06,
"loss": 1.3651,
"step": 420
},
{
"epoch": 0.2,
"grad_norm": 0.16305809613976227,
"learning_rate": 9.72272764854666e-06,
"loss": 1.386,
"step": 425
},
{
"epoch": 0.2,
"grad_norm": 0.16712240916605367,
"learning_rate": 9.709369227846346e-06,
"loss": 1.4249,
"step": 430
},
{
"epoch": 0.2,
"grad_norm": 0.1685889272801495,
"learning_rate": 9.695706161014322e-06,
"loss": 1.4629,
"step": 435
},
{
"epoch": 0.2,
"grad_norm": 0.16213828032523545,
"learning_rate": 9.681739331905784e-06,
"loss": 1.4633,
"step": 440
},
{
"epoch": 0.2,
"grad_norm": 0.1722162760562697,
"learning_rate": 9.667469644026118e-06,
"loss": 1.4147,
"step": 445
},
{
"epoch": 0.21,
"grad_norm": 0.16634245789106444,
"learning_rate": 9.652898020472449e-06,
"loss": 1.4254,
"step": 450
},
{
"epoch": 0.21,
"grad_norm": 0.17414333030226264,
"learning_rate": 9.638025403873939e-06,
"loss": 1.3734,
"step": 455
},
{
"epoch": 0.21,
"grad_norm": 0.17078620575685172,
"learning_rate": 9.622852756330797e-06,
"loss": 1.4313,
"step": 460
},
{
"epoch": 0.21,
"grad_norm": 0.17201073443780537,
"learning_rate": 9.60738105935204e-06,
"loss": 1.4412,
"step": 465
},
{
"epoch": 0.22,
"grad_norm": 0.1781118635582674,
"learning_rate": 9.59161131379201e-06,
"loss": 1.4102,
"step": 470
},
{
"epoch": 0.22,
"grad_norm": 0.17417835939253798,
"learning_rate": 9.575544539785626e-06,
"loss": 1.4311,
"step": 475
},
{
"epoch": 0.22,
"grad_norm": 0.17050610923384396,
"learning_rate": 9.559181776682387e-06,
"loss": 1.4627,
"step": 480
},
{
"epoch": 0.22,
"grad_norm": 0.176092363473546,
"learning_rate": 9.542524082979138e-06,
"loss": 1.4517,
"step": 485
},
{
"epoch": 0.23,
"grad_norm": 0.17498193119749225,
"learning_rate": 9.525572536251608e-06,
"loss": 1.3956,
"step": 490
},
{
"epoch": 0.23,
"grad_norm": 0.17003085925682157,
"learning_rate": 9.50832823308468e-06,
"loss": 1.4012,
"step": 495
},
{
"epoch": 0.23,
"grad_norm": 0.17064790853058265,
"learning_rate": 9.490792289001476e-06,
"loss": 1.3523,
"step": 500
},
{
"epoch": 0.23,
"grad_norm": 0.1799693810618531,
"learning_rate": 9.472965838391187e-06,
"loss": 1.4446,
"step": 505
},
{
"epoch": 0.23,
"grad_norm": 0.16868170325096435,
"learning_rate": 9.454850034435679e-06,
"loss": 1.3912,
"step": 510
},
{
"epoch": 0.24,
"grad_norm": 0.16981991686336434,
"learning_rate": 9.436446049034913e-06,
"loss": 1.3986,
"step": 515
},
{
"epoch": 0.24,
"grad_norm": 0.1755120644725739,
"learning_rate": 9.417755072731121e-06,
"loss": 1.4117,
"step": 520
},
{
"epoch": 0.24,
"grad_norm": 0.1757934575464805,
"learning_rate": 9.398778314631801e-06,
"loss": 1.3587,
"step": 525
},
{
"epoch": 0.24,
"grad_norm": 0.173876963970309,
"learning_rate": 9.379517002331489e-06,
"loss": 1.3862,
"step": 530
},
{
"epoch": 0.25,
"grad_norm": 0.17746114421749437,
"learning_rate": 9.359972381832358e-06,
"loss": 1.4309,
"step": 535
},
{
"epoch": 0.25,
"grad_norm": 0.16869333613216586,
"learning_rate": 9.340145717463609e-06,
"loss": 1.4118,
"step": 540
},
{
"epoch": 0.25,
"grad_norm": 0.17746743334088458,
"learning_rate": 9.320038291799679e-06,
"loss": 1.4433,
"step": 545
},
{
"epoch": 0.25,
"grad_norm": 0.17587914404814756,
"learning_rate": 9.299651405577286e-06,
"loss": 1.4421,
"step": 550
},
{
"epoch": 0.26,
"grad_norm": 0.18048857244310448,
"learning_rate": 9.278986377611266e-06,
"loss": 1.4221,
"step": 555
},
{
"epoch": 0.26,
"grad_norm": 0.17371693231140117,
"learning_rate": 9.258044544709276e-06,
"loss": 1.4131,
"step": 560
},
{
"epoch": 0.26,
"grad_norm": 0.17693122935797964,
"learning_rate": 9.236827261585306e-06,
"loss": 1.4205,
"step": 565
},
{
"epoch": 0.26,
"grad_norm": 0.18630601640696606,
"learning_rate": 9.215335900772048e-06,
"loss": 1.4067,
"step": 570
},
{
"epoch": 0.26,
"grad_norm": 0.17220695844147543,
"learning_rate": 9.193571852532112e-06,
"loss": 1.3834,
"step": 575
},
{
"epoch": 0.27,
"grad_norm": 0.18435801453629633,
"learning_rate": 9.17153652476808e-06,
"loss": 1.3485,
"step": 580
},
{
"epoch": 0.27,
"grad_norm": 0.1719982815834864,
"learning_rate": 9.14923134293144e-06,
"loss": 1.4265,
"step": 585
},
{
"epoch": 0.27,
"grad_norm": 0.17622887749418029,
"learning_rate": 9.126657749930365e-06,
"loss": 1.4242,
"step": 590
},
{
"epoch": 0.27,
"grad_norm": 0.1772303960409497,
"learning_rate": 9.103817206036383e-06,
"loss": 1.3901,
"step": 595
},
{
"epoch": 0.28,
"grad_norm": 0.18039291853040396,
"learning_rate": 9.080711188789903e-06,
"loss": 1.4193,
"step": 600
},
{
"epoch": 0.28,
"eval_loss": 1.333003282546997,
"eval_runtime": 1759.7499,
"eval_samples_per_second": 2.154,
"eval_steps_per_second": 0.269,
"step": 600
},
{
"epoch": 0.28,
"grad_norm": 0.18468507528075692,
"learning_rate": 9.057341192904641e-06,
"loss": 1.4663,
"step": 605
},
{
"epoch": 0.28,
"grad_norm": 0.17990669625911423,
"learning_rate": 9.033708730170925e-06,
"loss": 1.4289,
"step": 610
},
{
"epoch": 0.28,
"grad_norm": 0.17925115015014306,
"learning_rate": 9.009815329357893e-06,
"loss": 1.4337,
"step": 615
},
{
"epoch": 0.29,
"grad_norm": 0.1742038025383068,
"learning_rate": 8.985662536114614e-06,
"loss": 1.4156,
"step": 620
},
{
"epoch": 0.29,
"grad_norm": 0.18266353155608991,
"learning_rate": 8.961251912870077e-06,
"loss": 1.3896,
"step": 625
},
{
"epoch": 0.29,
"grad_norm": 0.18057511349721395,
"learning_rate": 8.936585038732143e-06,
"loss": 1.3764,
"step": 630
},
{
"epoch": 0.29,
"grad_norm": 0.18596859871805246,
"learning_rate": 8.91166350938537e-06,
"loss": 1.4193,
"step": 635
},
{
"epoch": 0.29,
"grad_norm": 0.18986971270095016,
"learning_rate": 8.886488936987817e-06,
"loss": 1.3955,
"step": 640
},
{
"epoch": 0.3,
"grad_norm": 0.17418454140586195,
"learning_rate": 8.861062950066723e-06,
"loss": 1.427,
"step": 645
},
{
"epoch": 0.3,
"grad_norm": 0.18149732001872015,
"learning_rate": 8.835387193413185e-06,
"loss": 1.4046,
"step": 650
},
{
"epoch": 0.3,
"grad_norm": 0.18147198520857166,
"learning_rate": 8.809463327975741e-06,
"loss": 1.4058,
"step": 655
},
{
"epoch": 0.3,
"grad_norm": 0.18099108296498378,
"learning_rate": 8.783293030752932e-06,
"loss": 1.4066,
"step": 660
},
{
"epoch": 0.31,
"grad_norm": 0.1794240687483909,
"learning_rate": 8.756877994684818e-06,
"loss": 1.3921,
"step": 665
},
{
"epoch": 0.31,
"grad_norm": 0.1839307276635119,
"learning_rate": 8.730219928543458e-06,
"loss": 1.4054,
"step": 670
},
{
"epoch": 0.31,
"grad_norm": 0.18145863325096048,
"learning_rate": 8.703320556822375e-06,
"loss": 1.4053,
"step": 675
},
{
"epoch": 0.31,
"grad_norm": 0.1808487553528171,
"learning_rate": 8.676181619624996e-06,
"loss": 1.4055,
"step": 680
},
{
"epoch": 0.32,
"grad_norm": 0.1862476804387168,
"learning_rate": 8.648804872552092e-06,
"loss": 1.3841,
"step": 685
},
{
"epoch": 0.32,
"grad_norm": 0.19563150015616995,
"learning_rate": 8.6211920865882e-06,
"loss": 1.371,
"step": 690
},
{
"epoch": 0.32,
"grad_norm": 0.18923848385153544,
"learning_rate": 8.593345047987069e-06,
"loss": 1.3988,
"step": 695
},
{
"epoch": 0.32,
"grad_norm": 0.18078263758464272,
"learning_rate": 8.565265558156101e-06,
"loss": 1.4024,
"step": 700
},
{
"epoch": 0.32,
"grad_norm": 0.18258637947544226,
"learning_rate": 8.536955433539824e-06,
"loss": 1.371,
"step": 705
},
{
"epoch": 0.33,
"grad_norm": 0.19116331272141834,
"learning_rate": 8.508416505502383e-06,
"loss": 1.4456,
"step": 710
},
{
"epoch": 0.33,
"grad_norm": 0.17469959227839357,
"learning_rate": 8.479650620209072e-06,
"loss": 1.385,
"step": 715
},
{
"epoch": 0.33,
"grad_norm": 0.18410741548679613,
"learning_rate": 8.450659638506908e-06,
"loss": 1.4095,
"step": 720
},
{
"epoch": 0.33,
"grad_norm": 0.19634261946282605,
"learning_rate": 8.421445435804255e-06,
"loss": 1.3513,
"step": 725
},
{
"epoch": 0.34,
"grad_norm": 0.17826721350047323,
"learning_rate": 8.3920099019495e-06,
"loss": 1.3792,
"step": 730
},
{
"epoch": 0.34,
"grad_norm": 0.1826653979119606,
"learning_rate": 8.362354941108803e-06,
"loss": 1.4448,
"step": 735
},
{
"epoch": 0.34,
"grad_norm": 0.18664731594802075,
"learning_rate": 8.33248247164292e-06,
"loss": 1.3751,
"step": 740
},
{
"epoch": 0.34,
"grad_norm": 0.18231556377003602,
"learning_rate": 8.3023944259831e-06,
"loss": 1.3773,
"step": 745
},
{
"epoch": 0.35,
"grad_norm": 0.18711137034484868,
"learning_rate": 8.272092750506084e-06,
"loss": 1.4096,
"step": 750
},
{
"epoch": 0.35,
"grad_norm": 0.1877955269203901,
"learning_rate": 8.241579405408192e-06,
"loss": 1.3902,
"step": 755
},
{
"epoch": 0.35,
"grad_norm": 0.18482019451091206,
"learning_rate": 8.21085636457851e-06,
"loss": 1.3734,
"step": 760
},
{
"epoch": 0.35,
"grad_norm": 0.19891330231660218,
"learning_rate": 8.179925615471218e-06,
"loss": 1.4061,
"step": 765
},
{
"epoch": 0.35,
"grad_norm": 0.18663983192529415,
"learning_rate": 8.148789158977012e-06,
"loss": 1.3326,
"step": 770
},
{
"epoch": 0.36,
"grad_norm": 0.1874487096476331,
"learning_rate": 8.117449009293668e-06,
"loss": 1.3384,
"step": 775
},
{
"epoch": 0.36,
"grad_norm": 0.18710305973456598,
"learning_rate": 8.085907193795745e-06,
"loss": 1.3828,
"step": 780
},
{
"epoch": 0.36,
"grad_norm": 0.18416014945175566,
"learning_rate": 8.05416575290344e-06,
"loss": 1.3737,
"step": 785
},
{
"epoch": 0.36,
"grad_norm": 0.18615555988464447,
"learning_rate": 8.022226739950587e-06,
"loss": 1.4359,
"step": 790
},
{
"epoch": 0.37,
"grad_norm": 0.18594902983475312,
"learning_rate": 7.990092221051835e-06,
"loss": 1.389,
"step": 795
},
{
"epoch": 0.37,
"grad_norm": 0.18537070852284854,
"learning_rate": 7.95776427496899e-06,
"loss": 1.3593,
"step": 800
},
{
"epoch": 0.37,
"eval_loss": 1.323183298110962,
"eval_runtime": 1742.9319,
"eval_samples_per_second": 2.174,
"eval_steps_per_second": 0.272,
"step": 800
},
{
"epoch": 0.37,
"grad_norm": 0.1908169492182471,
"learning_rate": 7.925244992976538e-06,
"loss": 1.3406,
"step": 805
},
{
"epoch": 0.37,
"grad_norm": 0.18784810075497232,
"learning_rate": 7.89253647872637e-06,
"loss": 1.3842,
"step": 810
},
{
"epoch": 0.38,
"grad_norm": 0.19406647113841424,
"learning_rate": 7.859640848111686e-06,
"loss": 1.4286,
"step": 815
},
{
"epoch": 0.38,
"grad_norm": 0.19197603494160256,
"learning_rate": 7.826560229130132e-06,
"loss": 1.3928,
"step": 820
},
{
"epoch": 0.38,
"grad_norm": 0.19099716433921685,
"learning_rate": 7.793296761746126e-06,
"loss": 1.362,
"step": 825
},
{
"epoch": 0.38,
"grad_norm": 0.18788356013356616,
"learning_rate": 7.759852597752447e-06,
"loss": 1.4034,
"step": 830
},
{
"epoch": 0.38,
"grad_norm": 0.1921444320557867,
"learning_rate": 7.726229900631015e-06,
"loss": 1.3793,
"step": 835
},
{
"epoch": 0.39,
"grad_norm": 0.20734046130350145,
"learning_rate": 7.692430845412946e-06,
"loss": 1.4203,
"step": 840
},
{
"epoch": 0.39,
"grad_norm": 0.19179256662995678,
"learning_rate": 7.658457618537853e-06,
"loss": 1.4021,
"step": 845
},
{
"epoch": 0.39,
"grad_norm": 0.18555040743415147,
"learning_rate": 7.624312417712403e-06,
"loss": 1.423,
"step": 850
},
{
"epoch": 0.39,
"grad_norm": 0.19398963612254347,
"learning_rate": 7.58999745176815e-06,
"loss": 1.4367,
"step": 855
},
{
"epoch": 0.4,
"grad_norm": 0.19074855950766817,
"learning_rate": 7.555514940518647e-06,
"loss": 1.3695,
"step": 860
},
{
"epoch": 0.4,
"grad_norm": 0.1893550054117395,
"learning_rate": 7.520867114615844e-06,
"loss": 1.3939,
"step": 865
},
{
"epoch": 0.4,
"grad_norm": 0.19196726666071628,
"learning_rate": 7.486056215405797e-06,
"loss": 1.3964,
"step": 870
},
{
"epoch": 0.4,
"grad_norm": 0.2100904294893222,
"learning_rate": 7.451084494783668e-06,
"loss": 1.3753,
"step": 875
},
{
"epoch": 0.41,
"grad_norm": 0.1870023533707271,
"learning_rate": 7.415954215048057e-06,
"loss": 1.379,
"step": 880
},
{
"epoch": 0.41,
"grad_norm": 0.19635898208120364,
"learning_rate": 7.38066764875465e-06,
"loss": 1.4329,
"step": 885
},
{
"epoch": 0.41,
"grad_norm": 0.1896615635850299,
"learning_rate": 7.345227078569218e-06,
"loss": 1.357,
"step": 890
},
{
"epoch": 0.41,
"grad_norm": 0.19424130426015207,
"learning_rate": 7.309634797119941e-06,
"loss": 1.3774,
"step": 895
},
{
"epoch": 0.41,
"grad_norm": 0.22888693201104138,
"learning_rate": 7.273893106849108e-06,
"loss": 1.3976,
"step": 900
},
{
"epoch": 0.42,
"grad_norm": 0.1919456484613934,
"learning_rate": 7.23800431986417e-06,
"loss": 1.378,
"step": 905
},
{
"epoch": 0.42,
"grad_norm": 0.19241540158105003,
"learning_rate": 7.201970757788172e-06,
"loss": 1.4094,
"step": 910
},
{
"epoch": 0.42,
"grad_norm": 0.19333437065467562,
"learning_rate": 7.165794751609569e-06,
"loss": 1.3971,
"step": 915
},
{
"epoch": 0.42,
"grad_norm": 0.19460601771644864,
"learning_rate": 7.1294786415314336e-06,
"loss": 1.3879,
"step": 920
},
{
"epoch": 0.43,
"grad_norm": 0.18754355788787921,
"learning_rate": 7.093024776820076e-06,
"loss": 1.3534,
"step": 925
},
{
"epoch": 0.43,
"grad_norm": 0.18648196033472134,
"learning_rate": 7.056435515653059e-06,
"loss": 1.3969,
"step": 930
},
{
"epoch": 0.43,
"grad_norm": 0.18580702737411495,
"learning_rate": 7.019713224966664e-06,
"loss": 1.4416,
"step": 935
},
{
"epoch": 0.43,
"grad_norm": 0.24918096880464727,
"learning_rate": 6.9828602803027664e-06,
"loss": 1.3814,
"step": 940
},
{
"epoch": 0.44,
"grad_norm": 0.19003799001704857,
"learning_rate": 6.945879065655164e-06,
"loss": 1.3581,
"step": 945
},
{
"epoch": 0.44,
"grad_norm": 0.19777328354162663,
"learning_rate": 6.90877197331536e-06,
"loss": 1.3883,
"step": 950
},
{
"epoch": 0.44,
"grad_norm": 0.1978053982444075,
"learning_rate": 6.871541403717808e-06,
"loss": 1.4298,
"step": 955
},
{
"epoch": 0.44,
"grad_norm": 0.19360877663534273,
"learning_rate": 6.83418976528462e-06,
"loss": 1.3623,
"step": 960
},
{
"epoch": 0.44,
"grad_norm": 0.19124472077513613,
"learning_rate": 6.7967194742697866e-06,
"loss": 1.3965,
"step": 965
},
{
"epoch": 0.45,
"grad_norm": 0.2026314484251062,
"learning_rate": 6.759132954602852e-06,
"loss": 1.3889,
"step": 970
},
{
"epoch": 0.45,
"grad_norm": 0.20177996042623167,
"learning_rate": 6.721432637732117e-06,
"loss": 1.3987,
"step": 975
},
{
"epoch": 0.45,
"grad_norm": 0.19199095023199664,
"learning_rate": 6.6836209624673575e-06,
"loss": 1.3658,
"step": 980
},
{
"epoch": 0.45,
"grad_norm": 0.19092380360827313,
"learning_rate": 6.64570037482205e-06,
"loss": 1.3601,
"step": 985
},
{
"epoch": 0.46,
"grad_norm": 0.21156775391897173,
"learning_rate": 6.607673327855149e-06,
"loss": 1.4427,
"step": 990
},
{
"epoch": 0.46,
"grad_norm": 0.19213500215723073,
"learning_rate": 6.569542281512388e-06,
"loss": 1.3934,
"step": 995
},
{
"epoch": 0.46,
"grad_norm": 0.19675677797230362,
"learning_rate": 6.531309702467159e-06,
"loss": 1.3552,
"step": 1000
},
{
"epoch": 0.46,
"eval_loss": 1.3166489601135254,
"eval_runtime": 1748.0429,
"eval_samples_per_second": 2.168,
"eval_steps_per_second": 0.271,
"step": 1000
},
{
"epoch": 0.46,
"grad_norm": 0.18781337537819495,
"learning_rate": 6.492978063960942e-06,
"loss": 1.3937,
"step": 1005
},
{
"epoch": 0.47,
"grad_norm": 0.1935060574506341,
"learning_rate": 6.45454984564331e-06,
"loss": 1.4284,
"step": 1010
},
{
"epoch": 0.47,
"grad_norm": 0.1936429054806515,
"learning_rate": 6.41602753341152e-06,
"loss": 1.3618,
"step": 1015
},
{
"epoch": 0.47,
"grad_norm": 0.19582370428755932,
"learning_rate": 6.377413619249713e-06,
"loss": 1.3822,
"step": 1020
},
{
"epoch": 0.47,
"grad_norm": 0.18931877193304708,
"learning_rate": 6.338710601067691e-06,
"loss": 1.3473,
"step": 1025
},
{
"epoch": 0.47,
"grad_norm": 0.1952618362908433,
"learning_rate": 6.2999209825393445e-06,
"loss": 1.369,
"step": 1030
},
{
"epoch": 0.48,
"grad_norm": 0.196265797323174,
"learning_rate": 6.2610472729406905e-06,
"loss": 1.3679,
"step": 1035
},
{
"epoch": 0.48,
"grad_norm": 0.1909001830769802,
"learning_rate": 6.222091986987534e-06,
"loss": 1.3939,
"step": 1040
},
{
"epoch": 0.48,
"grad_norm": 0.1974314278029084,
"learning_rate": 6.18305764467281e-06,
"loss": 1.4111,
"step": 1045
},
{
"epoch": 0.48,
"grad_norm": 0.19874201405315123,
"learning_rate": 6.143946771103561e-06,
"loss": 1.383,
"step": 1050
},
{
"epoch": 0.49,
"grad_norm": 0.20109125948229767,
"learning_rate": 6.104761896337581e-06,
"loss": 1.3548,
"step": 1055
},
{
"epoch": 0.49,
"grad_norm": 0.18937099603698346,
"learning_rate": 6.0655055552197616e-06,
"loss": 1.4427,
"step": 1060
},
{
"epoch": 0.49,
"grad_norm": 0.20257668978871882,
"learning_rate": 6.026180287218106e-06,
"loss": 1.3773,
"step": 1065
},
{
"epoch": 0.49,
"grad_norm": 0.19836764097355777,
"learning_rate": 5.986788636259453e-06,
"loss": 1.3945,
"step": 1070
},
{
"epoch": 0.5,
"grad_norm": 0.19345413549116036,
"learning_rate": 5.9473331505649125e-06,
"loss": 1.4439,
"step": 1075
},
{
"epoch": 0.5,
"grad_norm": 0.19576415671480885,
"learning_rate": 5.907816382485026e-06,
"loss": 1.3432,
"step": 1080
},
{
"epoch": 0.5,
"grad_norm": 0.19643295445396305,
"learning_rate": 5.8682408883346535e-06,
"loss": 1.3459,
"step": 1085
},
{
"epoch": 0.5,
"grad_norm": 0.19760654138918068,
"learning_rate": 5.828609228227603e-06,
"loss": 1.4334,
"step": 1090
},
{
"epoch": 0.5,
"grad_norm": 0.1907622356589435,
"learning_rate": 5.788923965911028e-06,
"loss": 1.3195,
"step": 1095
},
{
"epoch": 0.51,
"grad_norm": 0.19974010922162466,
"learning_rate": 5.749187668599574e-06,
"loss": 1.3973,
"step": 1100
},
{
"epoch": 0.51,
"grad_norm": 0.1899811113208983,
"learning_rate": 5.709402906809307e-06,
"loss": 1.3788,
"step": 1105
},
{
"epoch": 0.51,
"grad_norm": 0.19125136139691074,
"learning_rate": 5.669572254191431e-06,
"loss": 1.3749,
"step": 1110
},
{
"epoch": 0.51,
"grad_norm": 0.19773999696793723,
"learning_rate": 5.6296982873658e-06,
"loss": 1.3812,
"step": 1115
},
{
"epoch": 0.52,
"grad_norm": 0.19522230435252472,
"learning_rate": 5.5897835857542315e-06,
"loss": 1.3639,
"step": 1120
},
{
"epoch": 0.52,
"grad_norm": 0.19203158758627778,
"learning_rate": 5.549830731413655e-06,
"loss": 1.3988,
"step": 1125
},
{
"epoch": 0.52,
"grad_norm": 0.19785365288187984,
"learning_rate": 5.509842308869075e-06,
"loss": 1.4031,
"step": 1130
},
{
"epoch": 0.52,
"grad_norm": 0.20110593532241236,
"learning_rate": 5.469820904946383e-06,
"loss": 1.3447,
"step": 1135
},
{
"epoch": 0.53,
"grad_norm": 0.20730230846403253,
"learning_rate": 5.429769108605013e-06,
"loss": 1.433,
"step": 1140
},
{
"epoch": 0.53,
"grad_norm": 0.19538915157122386,
"learning_rate": 5.389689510770462e-06,
"loss": 1.3751,
"step": 1145
},
{
"epoch": 0.53,
"grad_norm": 0.19657731321379315,
"learning_rate": 5.3495847041666935e-06,
"loss": 1.4427,
"step": 1150
},
{
"epoch": 0.53,
"grad_norm": 0.19885598934336826,
"learning_rate": 5.30945728314841e-06,
"loss": 1.3526,
"step": 1155
},
{
"epoch": 0.53,
"grad_norm": 0.19763223830130308,
"learning_rate": 5.269309843533222e-06,
"loss": 1.3792,
"step": 1160
},
{
"epoch": 0.54,
"grad_norm": 0.1934600019166271,
"learning_rate": 5.229144982433736e-06,
"loss": 1.3827,
"step": 1165
},
{
"epoch": 0.54,
"grad_norm": 0.19215817298202406,
"learning_rate": 5.188965298089538e-06,
"loss": 1.3609,
"step": 1170
},
{
"epoch": 0.54,
"grad_norm": 0.1950665854099098,
"learning_rate": 5.148773389699123e-06,
"loss": 1.3728,
"step": 1175
},
{
"epoch": 0.54,
"grad_norm": 0.20163181813335146,
"learning_rate": 5.108571857251754e-06,
"loss": 1.3937,
"step": 1180
},
{
"epoch": 0.55,
"grad_norm": 0.19492701026725848,
"learning_rate": 5.068363301359263e-06,
"loss": 1.3976,
"step": 1185
},
{
"epoch": 0.55,
"grad_norm": 0.1915934216230785,
"learning_rate": 5.0281503230878304e-06,
"loss": 1.3778,
"step": 1190
},
{
"epoch": 0.55,
"grad_norm": 0.19100537033272963,
"learning_rate": 4.98793552378971e-06,
"loss": 1.4221,
"step": 1195
},
{
"epoch": 0.55,
"grad_norm": 0.19909784027087774,
"learning_rate": 4.947721504934966e-06,
"loss": 1.3685,
"step": 1200
},
{
"epoch": 0.55,
"eval_loss": 1.3122756481170654,
"eval_runtime": 1747.9099,
"eval_samples_per_second": 2.168,
"eval_steps_per_second": 0.271,
"step": 1200
},
{
"epoch": 0.56,
"grad_norm": 0.19577442673853007,
"learning_rate": 4.907510867943167e-06,
"loss": 1.3595,
"step": 1205
},
{
"epoch": 0.56,
"grad_norm": 0.2063936713267001,
"learning_rate": 4.867306214015117e-06,
"loss": 1.4202,
"step": 1210
},
{
"epoch": 0.56,
"grad_norm": 0.19879562103497236,
"learning_rate": 4.8271101439645765e-06,
"loss": 1.3934,
"step": 1215
},
{
"epoch": 0.56,
"grad_norm": 0.19945548173818886,
"learning_rate": 4.786925258050024e-06,
"loss": 1.3395,
"step": 1220
},
{
"epoch": 0.56,
"grad_norm": 0.209259697791073,
"learning_rate": 4.746754155806437e-06,
"loss": 1.4072,
"step": 1225
},
{
"epoch": 0.57,
"grad_norm": 0.19280116611929857,
"learning_rate": 4.706599435877143e-06,
"loss": 1.3976,
"step": 1230
},
{
"epoch": 0.57,
"grad_norm": 0.1990941406891665,
"learning_rate": 4.666463695845701e-06,
"loss": 1.3912,
"step": 1235
},
{
"epoch": 0.57,
"grad_norm": 0.19455600992105357,
"learning_rate": 4.626349532067879e-06,
"loss": 1.4003,
"step": 1240
},
{
"epoch": 0.57,
"grad_norm": 0.20315382252326522,
"learning_rate": 4.586259539503687e-06,
"loss": 1.3876,
"step": 1245
},
{
"epoch": 0.58,
"grad_norm": 0.19085048019709233,
"learning_rate": 4.546196311549515e-06,
"loss": 1.415,
"step": 1250
},
{
"epoch": 0.58,
"grad_norm": 0.19155602915410036,
"learning_rate": 4.506162439870366e-06,
"loss": 1.388,
"step": 1255
},
{
"epoch": 0.58,
"grad_norm": 0.1962993457750995,
"learning_rate": 4.466160514232206e-06,
"loss": 1.4069,
"step": 1260
},
{
"epoch": 0.58,
"grad_norm": 0.19440064625069065,
"learning_rate": 4.426193122334433e-06,
"loss": 1.3625,
"step": 1265
},
{
"epoch": 0.59,
"grad_norm": 0.19907155435638502,
"learning_rate": 4.386262849642474e-06,
"loss": 1.3621,
"step": 1270
},
{
"epoch": 0.59,
"grad_norm": 0.19846401001306227,
"learning_rate": 4.346372279220543e-06,
"loss": 1.3438,
"step": 1275
},
{
"epoch": 0.59,
"grad_norm": 0.19990637435356196,
"learning_rate": 4.306523991564536e-06,
"loss": 1.3857,
"step": 1280
},
{
"epoch": 0.59,
"grad_norm": 0.1983045043565906,
"learning_rate": 4.266720564435105e-06,
"loss": 1.3477,
"step": 1285
},
{
"epoch": 0.59,
"grad_norm": 0.2008086079053878,
"learning_rate": 4.226964572690905e-06,
"loss": 1.4032,
"step": 1290
},
{
"epoch": 0.6,
"grad_norm": 0.2394515243143434,
"learning_rate": 4.187258588122019e-06,
"loss": 1.3757,
"step": 1295
},
{
"epoch": 0.6,
"grad_norm": 0.1971799914885616,
"learning_rate": 4.147605179283604e-06,
"loss": 1.4156,
"step": 1300
},
{
"epoch": 0.6,
"grad_norm": 0.2073477392149783,
"learning_rate": 4.108006911329722e-06,
"loss": 1.3881,
"step": 1305
},
{
"epoch": 0.6,
"grad_norm": 0.2073153166108959,
"learning_rate": 4.068466345847409e-06,
"loss": 1.3687,
"step": 1310
},
{
"epoch": 0.61,
"grad_norm": 0.20054177344121227,
"learning_rate": 4.028986040690963e-06,
"loss": 1.3785,
"step": 1315
},
{
"epoch": 0.61,
"grad_norm": 0.20604849012426923,
"learning_rate": 3.989568549816479e-06,
"loss": 1.4169,
"step": 1320
},
{
"epoch": 0.61,
"grad_norm": 0.19467062948633831,
"learning_rate": 3.9502164231166354e-06,
"loss": 1.4168,
"step": 1325
},
{
"epoch": 0.61,
"grad_norm": 0.21219636801396732,
"learning_rate": 3.910932206255742e-06,
"loss": 1.3772,
"step": 1330
},
{
"epoch": 0.61,
"grad_norm": 0.20051941796299297,
"learning_rate": 3.87171844050507e-06,
"loss": 1.3864,
"step": 1335
},
{
"epoch": 0.62,
"grad_norm": 0.20033592892185176,
"learning_rate": 3.8325776625784464e-06,
"loss": 1.3984,
"step": 1340
},
{
"epoch": 0.62,
"grad_norm": 0.19559628194598214,
"learning_rate": 3.793512404468162e-06,
"loss": 1.3954,
"step": 1345
},
{
"epoch": 0.62,
"grad_norm": 0.21009377333677687,
"learning_rate": 3.7545251932811824e-06,
"loss": 1.3799,
"step": 1350
},
{
"epoch": 0.62,
"grad_norm": 0.2058943301550432,
"learning_rate": 3.7156185510756613e-06,
"loss": 1.3763,
"step": 1355
},
{
"epoch": 0.63,
"grad_norm": 0.20090178955938068,
"learning_rate": 3.6767949946978026e-06,
"loss": 1.4162,
"step": 1360
},
{
"epoch": 0.63,
"grad_norm": 0.2034374577879598,
"learning_rate": 3.6380570356190346e-06,
"loss": 1.402,
"step": 1365
},
{
"epoch": 0.63,
"grad_norm": 0.19652020002950302,
"learning_rate": 3.5994071797735513e-06,
"loss": 1.3667,
"step": 1370
},
{
"epoch": 0.63,
"grad_norm": 0.20049219114904884,
"learning_rate": 3.560847927396206e-06,
"loss": 1.419,
"step": 1375
},
{
"epoch": 0.64,
"grad_norm": 0.20857502981558484,
"learning_rate": 3.5223817728607675e-06,
"loss": 1.4082,
"step": 1380
},
{
"epoch": 0.64,
"grad_norm": 0.19767809519926405,
"learning_rate": 3.484011204518568e-06,
"loss": 1.3947,
"step": 1385
},
{
"epoch": 0.64,
"grad_norm": 0.1948689279113259,
"learning_rate": 3.4457387045375255e-06,
"loss": 1.3625,
"step": 1390
},
{
"epoch": 0.64,
"grad_norm": 0.20429495021474384,
"learning_rate": 3.4075667487415785e-06,
"loss": 1.3978,
"step": 1395
},
{
"epoch": 0.64,
"grad_norm": 0.2022436664727946,
"learning_rate": 3.3694978064505258e-06,
"loss": 1.3487,
"step": 1400
},
{
"epoch": 0.64,
"eval_loss": 1.3093819618225098,
"eval_runtime": 1769.0297,
"eval_samples_per_second": 2.142,
"eval_steps_per_second": 0.268,
"step": 1400
},
{
"epoch": 0.65,
"grad_norm": 0.2010813579540422,
"learning_rate": 3.331534340320287e-06,
"loss": 1.3582,
"step": 1405
},
{
"epoch": 0.65,
"grad_norm": 0.19489960246084403,
"learning_rate": 3.293678806183596e-06,
"loss": 1.42,
"step": 1410
},
{
"epoch": 0.65,
"grad_norm": 0.20100567658267351,
"learning_rate": 3.255933652891133e-06,
"loss": 1.3887,
"step": 1415
},
{
"epoch": 0.65,
"grad_norm": 0.198604846014806,
"learning_rate": 3.218301322153111e-06,
"loss": 1.3543,
"step": 1420
},
{
"epoch": 0.66,
"grad_norm": 0.20073641523853392,
"learning_rate": 3.180784248381322e-06,
"loss": 1.3513,
"step": 1425
},
{
"epoch": 0.66,
"grad_norm": 0.19424293274594104,
"learning_rate": 3.1433848585316607e-06,
"loss": 1.3885,
"step": 1430
},
{
"epoch": 0.66,
"grad_norm": 0.2028571716247717,
"learning_rate": 3.10610557194712e-06,
"loss": 1.3824,
"step": 1435
},
{
"epoch": 0.66,
"grad_norm": 0.1973272586923241,
"learning_rate": 3.068948800201289e-06,
"loss": 1.3332,
"step": 1440
},
{
"epoch": 0.67,
"grad_norm": 0.20459650028827572,
"learning_rate": 3.0319169469423487e-06,
"loss": 1.3715,
"step": 1445
},
{
"epoch": 0.67,
"grad_norm": 0.1966290880736998,
"learning_rate": 2.995012407737581e-06,
"loss": 1.3985,
"step": 1450
},
{
"epoch": 0.67,
"grad_norm": 0.20394208158347749,
"learning_rate": 2.958237569918404e-06,
"loss": 1.3867,
"step": 1455
},
{
"epoch": 0.67,
"grad_norm": 0.1971397504636877,
"learning_rate": 2.9215948124259343e-06,
"loss": 1.3739,
"step": 1460
},
{
"epoch": 0.67,
"grad_norm": 0.1984132646647921,
"learning_rate": 2.885086505657094e-06,
"loss": 1.4459,
"step": 1465
},
{
"epoch": 0.68,
"grad_norm": 0.20375542452519896,
"learning_rate": 2.848715011311271e-06,
"loss": 1.3606,
"step": 1470
},
{
"epoch": 0.68,
"grad_norm": 0.19875885515941272,
"learning_rate": 2.8124826822375473e-06,
"loss": 1.4034,
"step": 1475
},
{
"epoch": 0.68,
"grad_norm": 0.20571233168262154,
"learning_rate": 2.7763918622824903e-06,
"loss": 1.4358,
"step": 1480
},
{
"epoch": 0.68,
"grad_norm": 0.20325983672817444,
"learning_rate": 2.7404448861385293e-06,
"loss": 1.3271,
"step": 1485
},
{
"epoch": 0.69,
"grad_norm": 0.204674596724046,
"learning_rate": 2.7046440791929306e-06,
"loss": 1.3656,
"step": 1490
},
{
"epoch": 0.69,
"grad_norm": 0.19864331126229412,
"learning_rate": 2.6689917573773615e-06,
"loss": 1.3712,
"step": 1495
},
{
"epoch": 0.69,
"grad_norm": 0.19759957217116436,
"learning_rate": 2.633490227018092e-06,
"loss": 1.4061,
"step": 1500
},
{
"epoch": 0.69,
"grad_norm": 0.1963001810018751,
"learning_rate": 2.5981417846867753e-06,
"loss": 1.3753,
"step": 1505
},
{
"epoch": 0.7,
"grad_norm": 0.2037326390652298,
"learning_rate": 2.5629487170518974e-06,
"loss": 1.3468,
"step": 1510
},
{
"epoch": 0.7,
"grad_norm": 0.20092378895163732,
"learning_rate": 2.527913300730863e-06,
"loss": 1.3831,
"step": 1515
},
{
"epoch": 0.7,
"grad_norm": 0.2075661167474541,
"learning_rate": 2.4930378021426977e-06,
"loss": 1.3786,
"step": 1520
},
{
"epoch": 0.7,
"grad_norm": 0.2002833079588797,
"learning_rate": 2.4583244773614675e-06,
"loss": 1.4058,
"step": 1525
},
{
"epoch": 0.7,
"grad_norm": 0.20306778592495606,
"learning_rate": 2.423775571970301e-06,
"loss": 1.3704,
"step": 1530
},
{
"epoch": 0.71,
"grad_norm": 0.22057110661612167,
"learning_rate": 2.3893933209161465e-06,
"loss": 1.3965,
"step": 1535
},
{
"epoch": 0.71,
"grad_norm": 0.20413023154970353,
"learning_rate": 2.3551799483651894e-06,
"loss": 1.3935,
"step": 1540
},
{
"epoch": 0.71,
"grad_norm": 0.2013070052195424,
"learning_rate": 2.321137667558965e-06,
"loss": 1.3757,
"step": 1545
},
{
"epoch": 0.71,
"grad_norm": 0.20133662802149876,
"learning_rate": 2.2872686806712037e-06,
"loss": 1.3533,
"step": 1550
},
{
"epoch": 0.72,
"grad_norm": 0.19816355686283976,
"learning_rate": 2.2535751786653476e-06,
"loss": 1.4014,
"step": 1555
},
{
"epoch": 0.72,
"grad_norm": 0.20020195119951492,
"learning_rate": 2.220059341152837e-06,
"loss": 1.3721,
"step": 1560
},
{
"epoch": 0.72,
"grad_norm": 0.20298393289470038,
"learning_rate": 2.1867233362521127e-06,
"loss": 1.3255,
"step": 1565
},
{
"epoch": 0.72,
"grad_norm": 0.20399564011279728,
"learning_rate": 2.153569320448348e-06,
"loss": 1.3928,
"step": 1570
},
{
"epoch": 0.73,
"grad_norm": 0.20016486527706043,
"learning_rate": 2.120599438453968e-06,
"loss": 1.3769,
"step": 1575
},
{
"epoch": 0.73,
"grad_norm": 0.20169522873545517,
"learning_rate": 2.087815823069886e-06,
"loss": 1.3745,
"step": 1580
},
{
"epoch": 0.73,
"grad_norm": 0.1991770376256046,
"learning_rate": 2.055220595047551e-06,
"loss": 1.3542,
"step": 1585
},
{
"epoch": 0.73,
"grad_norm": 0.2083925008192503,
"learning_rate": 2.022815862951751e-06,
"loss": 1.4182,
"step": 1590
},
{
"epoch": 0.73,
"grad_norm": 0.20086735172689546,
"learning_rate": 1.990603723024213e-06,
"loss": 1.3524,
"step": 1595
},
{
"epoch": 0.74,
"grad_norm": 0.20621921540546848,
"learning_rate": 1.9585862590480005e-06,
"loss": 1.3891,
"step": 1600
},
{
"epoch": 0.74,
"eval_loss": 1.3076461553573608,
"eval_runtime": 1777.0113,
"eval_samples_per_second": 2.133,
"eval_steps_per_second": 0.267,
"step": 1600
},
{
"epoch": 0.74,
"grad_norm": 0.20007769280547363,
"learning_rate": 1.926765542212707e-06,
"loss": 1.3856,
"step": 1605
},
{
"epoch": 0.74,
"grad_norm": 0.1980821730251063,
"learning_rate": 1.8951436309804766e-06,
"loss": 1.383,
"step": 1610
},
{
"epoch": 0.74,
"grad_norm": 0.20314121257047116,
"learning_rate": 1.8637225709528506e-06,
"loss": 1.3752,
"step": 1615
},
{
"epoch": 0.75,
"grad_norm": 0.2000738503413009,
"learning_rate": 1.832504394738428e-06,
"loss": 1.3501,
"step": 1620
},
{
"epoch": 0.75,
"grad_norm": 0.19626992677384789,
"learning_rate": 1.8014911218213832e-06,
"loss": 1.3776,
"step": 1625
},
{
"epoch": 0.75,
"grad_norm": 0.2132143728962325,
"learning_rate": 1.770684758430824e-06,
"loss": 1.3641,
"step": 1630
},
{
"epoch": 0.75,
"grad_norm": 0.20230145941791186,
"learning_rate": 1.7400872974110088e-06,
"loss": 1.3714,
"step": 1635
},
{
"epoch": 0.76,
"grad_norm": 0.20322390315184813,
"learning_rate": 1.7097007180924375e-06,
"loss": 1.3559,
"step": 1640
},
{
"epoch": 0.76,
"grad_norm": 0.2097270533204938,
"learning_rate": 1.6795269861638041e-06,
"loss": 1.3555,
"step": 1645
},
{
"epoch": 0.76,
"grad_norm": 0.20409576963271045,
"learning_rate": 1.6495680535448405e-06,
"loss": 1.3376,
"step": 1650
},
{
"epoch": 0.76,
"grad_norm": 0.209992582333707,
"learning_rate": 1.6198258582600418e-06,
"loss": 1.3393,
"step": 1655
},
{
"epoch": 0.76,
"grad_norm": 0.20186485511811675,
"learning_rate": 1.590302324313303e-06,
"loss": 1.3476,
"step": 1660
},
{
"epoch": 0.77,
"grad_norm": 0.2039959334961091,
"learning_rate": 1.5609993615634578e-06,
"loss": 1.4172,
"step": 1665
},
{
"epoch": 0.77,
"grad_norm": 0.2032115658244104,
"learning_rate": 1.531918865600725e-06,
"loss": 1.3866,
"step": 1670
},
{
"epoch": 0.77,
"grad_norm": 0.20663125873556282,
"learning_rate": 1.5030627176240903e-06,
"loss": 1.3413,
"step": 1675
},
{
"epoch": 0.77,
"grad_norm": 0.21419285282068773,
"learning_rate": 1.4744327843196043e-06,
"loss": 1.3685,
"step": 1680
},
{
"epoch": 0.78,
"grad_norm": 0.20427116148089472,
"learning_rate": 1.446030917739633e-06,
"loss": 1.3864,
"step": 1685
},
{
"epoch": 0.78,
"grad_norm": 0.21267201464638189,
"learning_rate": 1.4178589551830585e-06,
"loss": 1.3578,
"step": 1690
},
{
"epoch": 0.78,
"grad_norm": 0.2021417320485247,
"learning_rate": 1.3899187190764062e-06,
"loss": 1.4034,
"step": 1695
},
{
"epoch": 0.78,
"grad_norm": 0.19953428976865786,
"learning_rate": 1.3622120168559656e-06,
"loss": 1.3378,
"step": 1700
},
{
"epoch": 0.79,
"grad_norm": 0.20322168013048264,
"learning_rate": 1.3347406408508695e-06,
"loss": 1.4032,
"step": 1705
},
{
"epoch": 0.79,
"grad_norm": 0.19617963419360818,
"learning_rate": 1.3075063681671408e-06,
"loss": 1.3815,
"step": 1710
},
{
"epoch": 0.79,
"grad_norm": 0.20338294719666272,
"learning_rate": 1.280510960572745e-06,
"loss": 1.376,
"step": 1715
},
{
"epoch": 0.79,
"grad_norm": 0.20462364560864363,
"learning_rate": 1.2537561643836087e-06,
"loss": 1.3866,
"step": 1720
},
{
"epoch": 0.79,
"grad_norm": 0.1963149280229846,
"learning_rate": 1.2272437103506596e-06,
"loss": 1.372,
"step": 1725
},
{
"epoch": 0.8,
"grad_norm": 0.19716113021249304,
"learning_rate": 1.200975313547867e-06,
"loss": 1.3599,
"step": 1730
},
{
"epoch": 0.8,
"grad_norm": 0.20573469916350295,
"learning_rate": 1.1749526732612842e-06,
"loss": 1.3562,
"step": 1735
},
{
"epoch": 0.8,
"grad_norm": 0.20684676012413714,
"learning_rate": 1.1491774728791416e-06,
"loss": 1.3296,
"step": 1740
},
{
"epoch": 0.8,
"grad_norm": 0.2090768120512491,
"learning_rate": 1.1236513797829285e-06,
"loss": 1.4248,
"step": 1745
},
{
"epoch": 0.81,
"grad_norm": 0.20571837930886522,
"learning_rate": 1.0983760452395415e-06,
"loss": 1.3609,
"step": 1750
},
{
"epoch": 0.81,
"grad_norm": 0.20867562597707268,
"learning_rate": 1.07335310429447e-06,
"loss": 1.3848,
"step": 1755
},
{
"epoch": 0.81,
"grad_norm": 0.1988423228103918,
"learning_rate": 1.048584175666012e-06,
"loss": 1.3712,
"step": 1760
},
{
"epoch": 0.81,
"grad_norm": 0.21063039041467455,
"learning_rate": 1.0240708616405788e-06,
"loss": 1.3611,
"step": 1765
},
{
"epoch": 0.82,
"grad_norm": 0.20306439311302277,
"learning_rate": 9.998147479690251e-07,
"loss": 1.3478,
"step": 1770
},
{
"epoch": 0.82,
"grad_norm": 0.2020964557722793,
"learning_rate": 9.75817403764079e-07,
"loss": 1.3433,
"step": 1775
},
{
"epoch": 0.82,
"grad_norm": 0.20597190432737983,
"learning_rate": 9.520803813988366e-07,
"loss": 1.4058,
"step": 1780
},
{
"epoch": 0.82,
"grad_norm": 0.20380209329114748,
"learning_rate": 9.286052164063369e-07,
"loss": 1.4028,
"step": 1785
},
{
"epoch": 0.82,
"grad_norm": 0.2041857004062742,
"learning_rate": 9.053934273802312e-07,
"loss": 1.383,
"step": 1790
},
{
"epoch": 0.83,
"grad_norm": 0.20684812528414637,
"learning_rate": 8.824465158765433e-07,
"loss": 1.3512,
"step": 1795
},
{
"epoch": 0.83,
"grad_norm": 0.20386591712113425,
"learning_rate": 8.597659663165364e-07,
"loss": 1.3858,
"step": 1800
},
{
"epoch": 0.83,
"eval_loss": 1.306676983833313,
"eval_runtime": 1760.6924,
"eval_samples_per_second": 2.153,
"eval_steps_per_second": 0.269,
"step": 1800
},
{
"epoch": 0.83,
"grad_norm": 0.19511280510666812,
"learning_rate": 8.373532458906897e-07,
"loss": 1.3261,
"step": 1805
},
{
"epoch": 0.83,
"grad_norm": 0.20356348959673148,
"learning_rate": 8.15209804463783e-07,
"loss": 1.3288,
"step": 1810
},
{
"epoch": 0.84,
"grad_norm": 0.2103018153717413,
"learning_rate": 7.93337074481108e-07,
"loss": 1.4425,
"step": 1815
},
{
"epoch": 0.84,
"grad_norm": 0.20468220080909677,
"learning_rate": 7.717364708758024e-07,
"loss": 1.406,
"step": 1820
},
{
"epoch": 0.84,
"grad_norm": 0.2040164153992187,
"learning_rate": 7.504093909773174e-07,
"loss": 1.3601,
"step": 1825
},
{
"epoch": 0.84,
"grad_norm": 0.19961500414001193,
"learning_rate": 7.293572144210332e-07,
"loss": 1.3777,
"step": 1830
},
{
"epoch": 0.85,
"grad_norm": 0.20035042813237278,
"learning_rate": 7.085813030590022e-07,
"loss": 1.3944,
"step": 1835
},
{
"epoch": 0.85,
"grad_norm": 0.20685428154054034,
"learning_rate": 6.880830008718564e-07,
"loss": 1.3778,
"step": 1840
},
{
"epoch": 0.85,
"grad_norm": 0.19642667708352796,
"learning_rate": 6.678636338818645e-07,
"loss": 1.3458,
"step": 1845
},
{
"epoch": 0.85,
"grad_norm": 0.20222774080494071,
"learning_rate": 6.47924510067151e-07,
"loss": 1.3655,
"step": 1850
},
{
"epoch": 0.85,
"grad_norm": 0.20419539962584285,
"learning_rate": 6.282669192770896e-07,
"loss": 1.424,
"step": 1855
},
{
"epoch": 0.86,
"grad_norm": 0.1975487856167091,
"learning_rate": 6.088921331488568e-07,
"loss": 1.3424,
"step": 1860
},
{
"epoch": 0.86,
"grad_norm": 0.2094178118159778,
"learning_rate": 5.898014050251765e-07,
"loss": 1.3611,
"step": 1865
},
{
"epoch": 0.86,
"grad_norm": 0.20636955576818874,
"learning_rate": 5.709959698732359e-07,
"loss": 1.3779,
"step": 1870
},
{
"epoch": 0.86,
"grad_norm": 0.21747927560303068,
"learning_rate": 5.524770442047978e-07,
"loss": 1.3308,
"step": 1875
},
{
"epoch": 0.87,
"grad_norm": 0.20050302654738292,
"learning_rate": 5.342458259975147e-07,
"loss": 1.3865,
"step": 1880
},
{
"epoch": 0.87,
"grad_norm": 0.2392412132808601,
"learning_rate": 5.163034946174161e-07,
"loss": 1.3792,
"step": 1885
},
{
"epoch": 0.87,
"grad_norm": 0.20914070749895322,
"learning_rate": 4.986512107426283e-07,
"loss": 1.3812,
"step": 1890
},
{
"epoch": 0.87,
"grad_norm": 0.21320420478322508,
"learning_rate": 4.812901162882871e-07,
"loss": 1.443,
"step": 1895
},
{
"epoch": 0.88,
"grad_norm": 0.20606583697965636,
"learning_rate": 4.6422133433266513e-07,
"loss": 1.3546,
"step": 1900
},
{
"epoch": 0.88,
"grad_norm": 0.20597370559054526,
"learning_rate": 4.474459690445293e-07,
"loss": 1.3803,
"step": 1905
},
{
"epoch": 0.88,
"grad_norm": 0.20609624124174958,
"learning_rate": 4.309651056117009e-07,
"loss": 1.3806,
"step": 1910
},
{
"epoch": 0.88,
"grad_norm": 0.21011536963816732,
"learning_rate": 4.1477981017086387e-07,
"loss": 1.3857,
"step": 1915
},
{
"epoch": 0.88,
"grad_norm": 0.20861991251552262,
"learning_rate": 3.9889112973859554e-07,
"loss": 1.4178,
"step": 1920
},
{
"epoch": 0.89,
"grad_norm": 0.20565429912809985,
"learning_rate": 3.8330009214363197e-07,
"loss": 1.3485,
"step": 1925
},
{
"epoch": 0.89,
"grad_norm": 0.213262370271027,
"learning_rate": 3.680077059603876e-07,
"loss": 1.3857,
"step": 1930
},
{
"epoch": 0.89,
"grad_norm": 0.21006993775062202,
"learning_rate": 3.530149604436983e-07,
"loss": 1.3718,
"step": 1935
},
{
"epoch": 0.89,
"grad_norm": 0.20971631358735854,
"learning_rate": 3.3832282546483686e-07,
"loss": 1.3401,
"step": 1940
},
{
"epoch": 0.9,
"grad_norm": 0.20675053040706537,
"learning_rate": 3.239322514487686e-07,
"loss": 1.3976,
"step": 1945
},
{
"epoch": 0.9,
"grad_norm": 0.2070605402743848,
"learning_rate": 3.098441693126719e-07,
"loss": 1.3801,
"step": 1950
},
{
"epoch": 0.9,
"grad_norm": 0.2036473853339382,
"learning_rate": 2.9605949040571456e-07,
"loss": 1.3975,
"step": 1955
},
{
"epoch": 0.9,
"grad_norm": 0.20446499651024982,
"learning_rate": 2.8257910645009935e-07,
"loss": 1.3932,
"step": 1960
},
{
"epoch": 0.91,
"grad_norm": 0.20745728206982056,
"learning_rate": 2.6940388948338057e-07,
"loss": 1.4214,
"step": 1965
},
{
"epoch": 0.91,
"grad_norm": 0.20628378675298625,
"learning_rate": 2.565346918020534e-07,
"loss": 1.3234,
"step": 1970
},
{
"epoch": 0.91,
"grad_norm": 0.19884308459081187,
"learning_rate": 2.4397234590641696e-07,
"loss": 1.4086,
"step": 1975
},
{
"epoch": 0.91,
"grad_norm": 0.196985413218799,
"learning_rate": 2.3171766444672227e-07,
"loss": 1.4203,
"step": 1980
},
{
"epoch": 0.91,
"grad_norm": 0.20225024562843938,
"learning_rate": 2.1977144017060027e-07,
"loss": 1.3859,
"step": 1985
},
{
"epoch": 0.92,
"grad_norm": 0.20030763530819898,
"learning_rate": 2.0813444587178156e-07,
"loss": 1.3889,
"step": 1990
},
{
"epoch": 0.92,
"grad_norm": 0.20327002619311596,
"learning_rate": 1.9680743434010385e-07,
"loss": 1.3745,
"step": 1995
},
{
"epoch": 0.92,
"grad_norm": 0.20095518840823687,
"learning_rate": 1.8579113831281525e-07,
"loss": 1.3635,
"step": 2000
},
{
"epoch": 0.92,
"eval_loss": 1.3063520193099976,
"eval_runtime": 2013.9504,
"eval_samples_per_second": 1.882,
"eval_steps_per_second": 0.235,
"step": 2000
},
{
"epoch": 0.92,
"grad_norm": 0.20459938594623875,
"learning_rate": 1.7508627042717387e-07,
"loss": 1.4269,
"step": 2005
},
{
"epoch": 0.93,
"grad_norm": 0.2004705031433909,
"learning_rate": 1.6469352317434627e-07,
"loss": 1.3789,
"step": 2010
},
{
"epoch": 0.93,
"grad_norm": 0.19922047944368607,
"learning_rate": 1.5461356885461077e-07,
"loss": 1.3811,
"step": 2015
},
{
"epoch": 0.93,
"grad_norm": 0.20506581374899654,
"learning_rate": 1.4484705953386968e-07,
"loss": 1.3677,
"step": 2020
},
{
"epoch": 0.93,
"grad_norm": 0.20196871274072786,
"learning_rate": 1.35394627001465e-07,
"loss": 1.3871,
"step": 2025
},
{
"epoch": 0.94,
"grad_norm": 0.21302376799020897,
"learning_rate": 1.2625688272930925e-07,
"loss": 1.3673,
"step": 2030
},
{
"epoch": 0.94,
"grad_norm": 0.20437076323448575,
"learning_rate": 1.174344178323289e-07,
"loss": 1.3701,
"step": 2035
},
{
"epoch": 0.94,
"grad_norm": 0.20259273082335125,
"learning_rate": 1.0892780303022377e-07,
"loss": 1.4004,
"step": 2040
},
{
"epoch": 0.94,
"grad_norm": 0.19927961571470354,
"learning_rate": 1.007375886105555e-07,
"loss": 1.3781,
"step": 2045
},
{
"epoch": 0.94,
"grad_norm": 0.19648191268958623,
"learning_rate": 9.286430439313876e-08,
"loss": 1.3719,
"step": 2050
},
{
"epoch": 0.95,
"grad_norm": 0.2081842753213948,
"learning_rate": 8.530845969577594e-08,
"loss": 1.3347,
"step": 2055
},
{
"epoch": 0.95,
"grad_norm": 0.19777453793305202,
"learning_rate": 7.80705433013046e-08,
"loss": 1.3645,
"step": 2060
},
{
"epoch": 0.95,
"grad_norm": 0.2001901456526902,
"learning_rate": 7.115102342598101e-08,
"loss": 1.3549,
"step": 2065
},
{
"epoch": 0.95,
"grad_norm": 0.2085779076020987,
"learning_rate": 6.455034768919288e-08,
"loss": 1.395,
"step": 2070
},
{
"epoch": 0.96,
"grad_norm": 0.2007001569199437,
"learning_rate": 5.826894308449904e-08,
"loss": 1.3418,
"step": 2075
},
{
"epoch": 0.96,
"grad_norm": 0.20019336918988379,
"learning_rate": 5.230721595201049e-08,
"loss": 1.3808,
"step": 2080
},
{
"epoch": 0.96,
"grad_norm": 0.20532075618369505,
"learning_rate": 4.666555195210365e-08,
"loss": 1.3624,
"step": 2085
},
{
"epoch": 0.96,
"grad_norm": 0.19955856360806487,
"learning_rate": 4.134431604047195e-08,
"loss": 1.3851,
"step": 2090
},
{
"epoch": 0.97,
"grad_norm": 0.19995218707394016,
"learning_rate": 3.63438524445181e-08,
"loss": 1.404,
"step": 2095
},
{
"epoch": 0.97,
"grad_norm": 0.2027473195079258,
"learning_rate": 3.166448464108629e-08,
"loss": 1.3654,
"step": 2100
},
{
"epoch": 0.97,
"grad_norm": 0.21316411475600813,
"learning_rate": 2.7306515335532857e-08,
"loss": 1.4004,
"step": 2105
},
{
"epoch": 0.97,
"grad_norm": 0.19985677477921807,
"learning_rate": 2.327022644215193e-08,
"loss": 1.3813,
"step": 2110
},
{
"epoch": 0.97,
"grad_norm": 0.20946530787894166,
"learning_rate": 1.9555879065930038e-08,
"loss": 1.4226,
"step": 2115
},
{
"epoch": 0.98,
"grad_norm": 0.1975351975591122,
"learning_rate": 1.6163713485662923e-08,
"loss": 1.3792,
"step": 2120
},
{
"epoch": 0.98,
"grad_norm": 0.20571768755825784,
"learning_rate": 1.3093949138406892e-08,
"loss": 1.3918,
"step": 2125
},
{
"epoch": 0.98,
"grad_norm": 0.44915439425778153,
"learning_rate": 1.03467846052846e-08,
"loss": 1.3642,
"step": 2130
},
{
"epoch": 0.98,
"grad_norm": 0.20214902180431968,
"learning_rate": 7.922397598642551e-09,
"loss": 1.3599,
"step": 2135
},
{
"epoch": 0.99,
"grad_norm": 0.19767842515824202,
"learning_rate": 5.820944950549745e-09,
"loss": 1.3599,
"step": 2140
},
{
"epoch": 0.99,
"grad_norm": 0.1991438801798082,
"learning_rate": 4.042562602655231e-09,
"loss": 1.3446,
"step": 2145
},
{
"epoch": 0.99,
"grad_norm": 0.20173015884228035,
"learning_rate": 2.5873655973945864e-09,
"loss": 1.3461,
"step": 2150
},
{
"epoch": 0.99,
"grad_norm": 0.20119946240562925,
"learning_rate": 1.4554480705458729e-09,
"loss": 1.3474,
"step": 2155
},
{
"epoch": 1.0,
"grad_norm": 0.20468648298029762,
"learning_rate": 6.468832451417273e-10,
"loss": 1.3649,
"step": 2160
},
{
"epoch": 1.0,
"grad_norm": 0.19998165788056904,
"learning_rate": 1.617234267320411e-10,
"loss": 1.3677,
"step": 2165
},
{
"epoch": 1.0,
"grad_norm": 0.21026892718522863,
"learning_rate": 0.0,
"loss": 1.3748,
"step": 2170
},
{
"epoch": 1.0,
"step": 2170,
"total_flos": 7055767844683776.0,
"train_loss": 1.4133363889659056,
"train_runtime": 112943.9974,
"train_samples_per_second": 0.615,
"train_steps_per_second": 0.019
}
],
"logging_steps": 5,
"max_steps": 2170,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 200,
"total_flos": 7055767844683776.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}