c2s_sep_2024 / trainer_state.json
obulikrish's picture
c2s_sep_2024
c64e8ce
{
"best_metric": 1.2792317867279053,
"best_model_checkpoint": "saved_model/c2s_sep_2024/checkpoint-4606",
"epoch": 2.9998371777476254,
"eval_steps": 500,
"global_step": 13818,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"grad_norm": NaN,
"learning_rate": 0.0,
"loss": 77.1448,
"step": 1
},
{
"epoch": 0.0,
"grad_norm": 17.278156280517578,
"learning_rate": 2.5e-06,
"loss": 76.5629,
"step": 10
},
{
"epoch": 0.0,
"grad_norm": 15.856775283813477,
"learning_rate": 7.000000000000001e-06,
"loss": 75.6974,
"step": 20
},
{
"epoch": 0.01,
"grad_norm": 15.606675148010254,
"learning_rate": 1.2e-05,
"loss": 74.9514,
"step": 30
},
{
"epoch": 0.01,
"grad_norm": 12.968363761901855,
"learning_rate": 1.7000000000000003e-05,
"loss": 72.4643,
"step": 40
},
{
"epoch": 0.01,
"grad_norm": 13.329130172729492,
"learning_rate": 2.2000000000000003e-05,
"loss": 69.0552,
"step": 50
},
{
"epoch": 0.01,
"grad_norm": 18.156723022460938,
"learning_rate": 2.7000000000000002e-05,
"loss": 64.2775,
"step": 60
},
{
"epoch": 0.02,
"grad_norm": 29.901222229003906,
"learning_rate": 3.2000000000000005e-05,
"loss": 52.1897,
"step": 70
},
{
"epoch": 0.02,
"grad_norm": 27.163593292236328,
"learning_rate": 3.65e-05,
"loss": 30.5964,
"step": 80
},
{
"epoch": 0.02,
"grad_norm": 13.53585433959961,
"learning_rate": 4.15e-05,
"loss": 12.5007,
"step": 90
},
{
"epoch": 0.02,
"grad_norm": 7.8353095054626465,
"learning_rate": 4.6500000000000005e-05,
"loss": 6.4802,
"step": 100
},
{
"epoch": 0.02,
"grad_norm": 6.838261127471924,
"learning_rate": 5.1500000000000005e-05,
"loss": 4.7819,
"step": 110
},
{
"epoch": 0.03,
"grad_norm": 8.852176666259766,
"learning_rate": 5.65e-05,
"loss": 4.1049,
"step": 120
},
{
"epoch": 0.03,
"grad_norm": 7.614436149597168,
"learning_rate": 6.15e-05,
"loss": 3.7732,
"step": 130
},
{
"epoch": 0.03,
"grad_norm": 7.756160259246826,
"learning_rate": 6.65e-05,
"loss": 3.6324,
"step": 140
},
{
"epoch": 0.03,
"grad_norm": 6.736324310302734,
"learning_rate": 7.15e-05,
"loss": 3.4327,
"step": 150
},
{
"epoch": 0.03,
"grad_norm": 8.393209457397461,
"learning_rate": 7.65e-05,
"loss": 3.4096,
"step": 160
},
{
"epoch": 0.04,
"grad_norm": 5.403553485870361,
"learning_rate": 8.15e-05,
"loss": 3.2845,
"step": 170
},
{
"epoch": 0.04,
"grad_norm": 5.367032051086426,
"learning_rate": 8.65e-05,
"loss": 3.2462,
"step": 180
},
{
"epoch": 0.04,
"grad_norm": 7.965042591094971,
"learning_rate": 9.15e-05,
"loss": 3.1463,
"step": 190
},
{
"epoch": 0.04,
"grad_norm": 7.074673175811768,
"learning_rate": 9.65e-05,
"loss": 3.1758,
"step": 200
},
{
"epoch": 0.05,
"grad_norm": 6.894763946533203,
"learning_rate": 9.999345835150458e-05,
"loss": 3.0311,
"step": 210
},
{
"epoch": 0.05,
"grad_norm": 6.925544738769531,
"learning_rate": 9.997165285651984e-05,
"loss": 3.0684,
"step": 220
},
{
"epoch": 0.05,
"grad_norm": 5.285668849945068,
"learning_rate": 9.994984736153511e-05,
"loss": 2.9234,
"step": 230
},
{
"epoch": 0.05,
"grad_norm": 6.81157922744751,
"learning_rate": 9.992804186655037e-05,
"loss": 2.8664,
"step": 240
},
{
"epoch": 0.05,
"grad_norm": 6.883147239685059,
"learning_rate": 9.990623637156565e-05,
"loss": 2.9204,
"step": 250
},
{
"epoch": 0.06,
"grad_norm": 5.505452632904053,
"learning_rate": 9.988443087658091e-05,
"loss": 2.8818,
"step": 260
},
{
"epoch": 0.06,
"grad_norm": 7.352786064147949,
"learning_rate": 9.986262538159616e-05,
"loss": 2.8999,
"step": 270
},
{
"epoch": 0.06,
"grad_norm": 6.875962734222412,
"learning_rate": 9.984081988661144e-05,
"loss": 2.8523,
"step": 280
},
{
"epoch": 0.06,
"grad_norm": 5.861810684204102,
"learning_rate": 9.98190143916267e-05,
"loss": 2.8062,
"step": 290
},
{
"epoch": 0.07,
"grad_norm": 5.396953582763672,
"learning_rate": 9.979720889664196e-05,
"loss": 2.7625,
"step": 300
},
{
"epoch": 0.07,
"grad_norm": 6.168801307678223,
"learning_rate": 9.977540340165722e-05,
"loss": 2.7063,
"step": 310
},
{
"epoch": 0.07,
"grad_norm": 4.478597640991211,
"learning_rate": 9.975359790667249e-05,
"loss": 2.6539,
"step": 320
},
{
"epoch": 0.07,
"grad_norm": 5.2905731201171875,
"learning_rate": 9.973179241168775e-05,
"loss": 2.7406,
"step": 330
},
{
"epoch": 0.07,
"grad_norm": 5.451777935028076,
"learning_rate": 9.970998691670301e-05,
"loss": 2.6599,
"step": 340
},
{
"epoch": 0.08,
"grad_norm": 5.45026969909668,
"learning_rate": 9.968818142171828e-05,
"loss": 2.6406,
"step": 350
},
{
"epoch": 0.08,
"grad_norm": 4.353079795837402,
"learning_rate": 9.966637592673354e-05,
"loss": 2.5285,
"step": 360
},
{
"epoch": 0.08,
"grad_norm": 4.052408218383789,
"learning_rate": 9.96445704317488e-05,
"loss": 2.4025,
"step": 370
},
{
"epoch": 0.08,
"grad_norm": 4.303618431091309,
"learning_rate": 9.962276493676407e-05,
"loss": 2.2459,
"step": 380
},
{
"epoch": 0.08,
"grad_norm": 3.2505452632904053,
"learning_rate": 9.960095944177933e-05,
"loss": 2.166,
"step": 390
},
{
"epoch": 0.09,
"grad_norm": 2.806292772293091,
"learning_rate": 9.95791539467946e-05,
"loss": 2.0462,
"step": 400
},
{
"epoch": 0.09,
"grad_norm": 2.9824328422546387,
"learning_rate": 9.955734845180987e-05,
"loss": 1.9315,
"step": 410
},
{
"epoch": 0.09,
"grad_norm": 2.7355027198791504,
"learning_rate": 9.953554295682512e-05,
"loss": 1.9072,
"step": 420
},
{
"epoch": 0.09,
"grad_norm": 2.385045051574707,
"learning_rate": 9.951373746184038e-05,
"loss": 1.8667,
"step": 430
},
{
"epoch": 0.1,
"grad_norm": 2.4067020416259766,
"learning_rate": 9.949193196685566e-05,
"loss": 1.8179,
"step": 440
},
{
"epoch": 0.1,
"grad_norm": 2.6805872917175293,
"learning_rate": 9.947012647187092e-05,
"loss": 1.8208,
"step": 450
},
{
"epoch": 0.1,
"grad_norm": 1.9335626363754272,
"learning_rate": 9.944832097688618e-05,
"loss": 1.8092,
"step": 460
},
{
"epoch": 0.1,
"grad_norm": 1.7954732179641724,
"learning_rate": 9.942651548190143e-05,
"loss": 1.7698,
"step": 470
},
{
"epoch": 0.1,
"grad_norm": 2.2542481422424316,
"learning_rate": 9.940470998691671e-05,
"loss": 1.7359,
"step": 480
},
{
"epoch": 0.11,
"grad_norm": 1.8089336156845093,
"learning_rate": 9.938290449193197e-05,
"loss": 1.7195,
"step": 490
},
{
"epoch": 0.11,
"grad_norm": 2.3044662475585938,
"learning_rate": 9.936109899694724e-05,
"loss": 1.6901,
"step": 500
},
{
"epoch": 0.11,
"grad_norm": 1.8811343908309937,
"learning_rate": 9.93392935019625e-05,
"loss": 1.6757,
"step": 510
},
{
"epoch": 0.11,
"grad_norm": 2.8750667572021484,
"learning_rate": 9.931748800697776e-05,
"loss": 1.6902,
"step": 520
},
{
"epoch": 0.12,
"grad_norm": 1.8759925365447998,
"learning_rate": 9.929568251199303e-05,
"loss": 1.6519,
"step": 530
},
{
"epoch": 0.12,
"grad_norm": 1.7360563278198242,
"learning_rate": 9.927387701700829e-05,
"loss": 1.6381,
"step": 540
},
{
"epoch": 0.12,
"grad_norm": 1.9994693994522095,
"learning_rate": 9.925207152202356e-05,
"loss": 1.6527,
"step": 550
},
{
"epoch": 0.12,
"grad_norm": 1.803330659866333,
"learning_rate": 9.923026602703881e-05,
"loss": 1.6453,
"step": 560
},
{
"epoch": 0.12,
"grad_norm": 1.569846272468567,
"learning_rate": 9.920846053205408e-05,
"loss": 1.6689,
"step": 570
},
{
"epoch": 0.13,
"grad_norm": 1.5712964534759521,
"learning_rate": 9.918665503706934e-05,
"loss": 1.6512,
"step": 580
},
{
"epoch": 0.13,
"grad_norm": 1.643431544303894,
"learning_rate": 9.916484954208462e-05,
"loss": 1.5994,
"step": 590
},
{
"epoch": 0.13,
"grad_norm": 1.619866132736206,
"learning_rate": 9.914304404709988e-05,
"loss": 1.6212,
"step": 600
},
{
"epoch": 0.13,
"grad_norm": 1.8739800453186035,
"learning_rate": 9.912123855211514e-05,
"loss": 1.5664,
"step": 610
},
{
"epoch": 0.13,
"grad_norm": 1.9525455236434937,
"learning_rate": 9.909943305713039e-05,
"loss": 1.6108,
"step": 620
},
{
"epoch": 0.14,
"grad_norm": 1.5381406545639038,
"learning_rate": 9.907762756214567e-05,
"loss": 1.6004,
"step": 630
},
{
"epoch": 0.14,
"grad_norm": 1.5303971767425537,
"learning_rate": 9.905582206716093e-05,
"loss": 1.581,
"step": 640
},
{
"epoch": 0.14,
"grad_norm": 1.6467609405517578,
"learning_rate": 9.90340165721762e-05,
"loss": 1.5812,
"step": 650
},
{
"epoch": 0.14,
"grad_norm": 1.6094383001327515,
"learning_rate": 9.901221107719146e-05,
"loss": 1.6027,
"step": 660
},
{
"epoch": 0.15,
"grad_norm": 1.5612354278564453,
"learning_rate": 9.899040558220672e-05,
"loss": 1.5477,
"step": 670
},
{
"epoch": 0.15,
"grad_norm": 1.5925028324127197,
"learning_rate": 9.896860008722198e-05,
"loss": 1.5747,
"step": 680
},
{
"epoch": 0.15,
"grad_norm": 1.434138298034668,
"learning_rate": 9.894679459223725e-05,
"loss": 1.5528,
"step": 690
},
{
"epoch": 0.15,
"grad_norm": 1.6473920345306396,
"learning_rate": 9.892498909725251e-05,
"loss": 1.622,
"step": 700
},
{
"epoch": 0.15,
"grad_norm": 1.599965214729309,
"learning_rate": 9.890318360226777e-05,
"loss": 1.5691,
"step": 710
},
{
"epoch": 0.16,
"grad_norm": 1.6525471210479736,
"learning_rate": 9.888137810728304e-05,
"loss": 1.6131,
"step": 720
},
{
"epoch": 0.16,
"grad_norm": 1.5170183181762695,
"learning_rate": 9.88595726122983e-05,
"loss": 1.5221,
"step": 730
},
{
"epoch": 0.16,
"grad_norm": 1.596643328666687,
"learning_rate": 9.883776711731358e-05,
"loss": 1.545,
"step": 740
},
{
"epoch": 0.16,
"grad_norm": 1.5849794149398804,
"learning_rate": 9.881596162232884e-05,
"loss": 1.5654,
"step": 750
},
{
"epoch": 0.16,
"grad_norm": 1.4768157005310059,
"learning_rate": 9.879415612734409e-05,
"loss": 1.5345,
"step": 760
},
{
"epoch": 0.17,
"grad_norm": 1.5123172998428345,
"learning_rate": 9.877235063235935e-05,
"loss": 1.5236,
"step": 770
},
{
"epoch": 0.17,
"grad_norm": 1.5827418565750122,
"learning_rate": 9.875054513737463e-05,
"loss": 1.5174,
"step": 780
},
{
"epoch": 0.17,
"grad_norm": 1.8722275495529175,
"learning_rate": 9.872873964238989e-05,
"loss": 1.5256,
"step": 790
},
{
"epoch": 0.17,
"grad_norm": 1.6323179006576538,
"learning_rate": 9.870693414740515e-05,
"loss": 1.4835,
"step": 800
},
{
"epoch": 0.18,
"grad_norm": 1.618322491645813,
"learning_rate": 9.868512865242042e-05,
"loss": 1.5214,
"step": 810
},
{
"epoch": 0.18,
"grad_norm": 1.6474233865737915,
"learning_rate": 9.866332315743568e-05,
"loss": 1.4811,
"step": 820
},
{
"epoch": 0.18,
"grad_norm": 1.4305635690689087,
"learning_rate": 9.864151766245094e-05,
"loss": 1.4727,
"step": 830
},
{
"epoch": 0.18,
"grad_norm": 1.6656005382537842,
"learning_rate": 9.86197121674662e-05,
"loss": 1.5373,
"step": 840
},
{
"epoch": 0.18,
"grad_norm": 1.640834927558899,
"learning_rate": 9.859790667248147e-05,
"loss": 1.4811,
"step": 850
},
{
"epoch": 0.19,
"grad_norm": 1.474351167678833,
"learning_rate": 9.857610117749673e-05,
"loss": 1.4819,
"step": 860
},
{
"epoch": 0.19,
"grad_norm": 1.28626549243927,
"learning_rate": 9.8554295682512e-05,
"loss": 1.5221,
"step": 870
},
{
"epoch": 0.19,
"grad_norm": 1.313599944114685,
"learning_rate": 9.853249018752726e-05,
"loss": 1.5221,
"step": 880
},
{
"epoch": 0.19,
"grad_norm": 1.609924554824829,
"learning_rate": 9.851068469254252e-05,
"loss": 1.519,
"step": 890
},
{
"epoch": 0.2,
"grad_norm": 1.2516050338745117,
"learning_rate": 9.84888791975578e-05,
"loss": 1.4906,
"step": 900
},
{
"epoch": 0.2,
"grad_norm": 1.3122848272323608,
"learning_rate": 9.846707370257305e-05,
"loss": 1.5051,
"step": 910
},
{
"epoch": 0.2,
"grad_norm": 1.4828795194625854,
"learning_rate": 9.844526820758831e-05,
"loss": 1.5206,
"step": 920
},
{
"epoch": 0.2,
"grad_norm": 1.3761475086212158,
"learning_rate": 9.842346271260357e-05,
"loss": 1.503,
"step": 930
},
{
"epoch": 0.2,
"grad_norm": 1.4912587404251099,
"learning_rate": 9.840165721761885e-05,
"loss": 1.4932,
"step": 940
},
{
"epoch": 0.21,
"grad_norm": 1.2759939432144165,
"learning_rate": 9.837985172263411e-05,
"loss": 1.4843,
"step": 950
},
{
"epoch": 0.21,
"grad_norm": 1.6568008661270142,
"learning_rate": 9.835804622764938e-05,
"loss": 1.5046,
"step": 960
},
{
"epoch": 0.21,
"grad_norm": 1.4292601346969604,
"learning_rate": 9.833624073266463e-05,
"loss": 1.5249,
"step": 970
},
{
"epoch": 0.21,
"grad_norm": 1.4866324663162231,
"learning_rate": 9.83144352376799e-05,
"loss": 1.4959,
"step": 980
},
{
"epoch": 0.21,
"grad_norm": 1.429203748703003,
"learning_rate": 9.829262974269517e-05,
"loss": 1.4725,
"step": 990
},
{
"epoch": 0.22,
"grad_norm": 1.3150511980056763,
"learning_rate": 9.827082424771043e-05,
"loss": 1.4644,
"step": 1000
},
{
"epoch": 0.22,
"grad_norm": 1.2386242151260376,
"learning_rate": 9.824901875272569e-05,
"loss": 1.4956,
"step": 1010
},
{
"epoch": 0.22,
"grad_norm": 1.74444580078125,
"learning_rate": 9.822721325774095e-05,
"loss": 1.4477,
"step": 1020
},
{
"epoch": 0.22,
"grad_norm": 1.21920907497406,
"learning_rate": 9.820540776275622e-05,
"loss": 1.5053,
"step": 1030
},
{
"epoch": 0.23,
"grad_norm": 1.172884464263916,
"learning_rate": 9.818360226777148e-05,
"loss": 1.4478,
"step": 1040
},
{
"epoch": 0.23,
"grad_norm": 1.3462252616882324,
"learning_rate": 9.816179677278676e-05,
"loss": 1.4749,
"step": 1050
},
{
"epoch": 0.23,
"grad_norm": 1.230682373046875,
"learning_rate": 9.8139991277802e-05,
"loss": 1.4608,
"step": 1060
},
{
"epoch": 0.23,
"grad_norm": 1.4852972030639648,
"learning_rate": 9.811818578281727e-05,
"loss": 1.5006,
"step": 1070
},
{
"epoch": 0.23,
"grad_norm": 1.2698734998703003,
"learning_rate": 9.809638028783253e-05,
"loss": 1.4521,
"step": 1080
},
{
"epoch": 0.24,
"grad_norm": 1.3210391998291016,
"learning_rate": 9.807457479284781e-05,
"loss": 1.4506,
"step": 1090
},
{
"epoch": 0.24,
"grad_norm": 1.329473853111267,
"learning_rate": 9.805276929786307e-05,
"loss": 1.4587,
"step": 1100
},
{
"epoch": 0.24,
"grad_norm": 1.185905933380127,
"learning_rate": 9.803096380287832e-05,
"loss": 1.439,
"step": 1110
},
{
"epoch": 0.24,
"grad_norm": 1.1401315927505493,
"learning_rate": 9.800915830789358e-05,
"loss": 1.4934,
"step": 1120
},
{
"epoch": 0.25,
"grad_norm": 1.2437337636947632,
"learning_rate": 9.798735281290886e-05,
"loss": 1.4771,
"step": 1130
},
{
"epoch": 0.25,
"grad_norm": 1.231963872909546,
"learning_rate": 9.796554731792412e-05,
"loss": 1.4428,
"step": 1140
},
{
"epoch": 0.25,
"grad_norm": 1.274877905845642,
"learning_rate": 9.794374182293939e-05,
"loss": 1.4414,
"step": 1150
},
{
"epoch": 0.25,
"grad_norm": 1.376755952835083,
"learning_rate": 9.792193632795465e-05,
"loss": 1.4366,
"step": 1160
},
{
"epoch": 0.25,
"grad_norm": 1.0724767446517944,
"learning_rate": 9.790013083296991e-05,
"loss": 1.4817,
"step": 1170
},
{
"epoch": 0.26,
"grad_norm": 1.3843764066696167,
"learning_rate": 9.787832533798518e-05,
"loss": 1.4986,
"step": 1180
},
{
"epoch": 0.26,
"grad_norm": 1.327138900756836,
"learning_rate": 9.785651984300044e-05,
"loss": 1.4484,
"step": 1190
},
{
"epoch": 0.26,
"grad_norm": 1.3678048849105835,
"learning_rate": 9.78347143480157e-05,
"loss": 1.454,
"step": 1200
},
{
"epoch": 0.26,
"grad_norm": 1.4238979816436768,
"learning_rate": 9.781290885303097e-05,
"loss": 1.4491,
"step": 1210
},
{
"epoch": 0.26,
"grad_norm": 1.1681418418884277,
"learning_rate": 9.779110335804623e-05,
"loss": 1.4524,
"step": 1220
},
{
"epoch": 0.27,
"grad_norm": 1.2097047567367554,
"learning_rate": 9.776929786306149e-05,
"loss": 1.4562,
"step": 1230
},
{
"epoch": 0.27,
"grad_norm": 1.3048409223556519,
"learning_rate": 9.774749236807677e-05,
"loss": 1.4508,
"step": 1240
},
{
"epoch": 0.27,
"grad_norm": 1.3852041959762573,
"learning_rate": 9.772568687309203e-05,
"loss": 1.4277,
"step": 1250
},
{
"epoch": 0.27,
"grad_norm": 1.179715871810913,
"learning_rate": 9.770388137810728e-05,
"loss": 1.415,
"step": 1260
},
{
"epoch": 0.28,
"grad_norm": 1.1659610271453857,
"learning_rate": 9.768207588312254e-05,
"loss": 1.4528,
"step": 1270
},
{
"epoch": 0.28,
"grad_norm": 1.334057331085205,
"learning_rate": 9.766027038813782e-05,
"loss": 1.4525,
"step": 1280
},
{
"epoch": 0.28,
"grad_norm": 1.5751981735229492,
"learning_rate": 9.763846489315308e-05,
"loss": 1.4427,
"step": 1290
},
{
"epoch": 0.28,
"grad_norm": 1.1843003034591675,
"learning_rate": 9.761665939816835e-05,
"loss": 1.4427,
"step": 1300
},
{
"epoch": 0.28,
"grad_norm": 1.3135390281677246,
"learning_rate": 9.759485390318361e-05,
"loss": 1.4245,
"step": 1310
},
{
"epoch": 0.29,
"grad_norm": 1.1618658304214478,
"learning_rate": 9.757304840819887e-05,
"loss": 1.4622,
"step": 1320
},
{
"epoch": 0.29,
"grad_norm": 1.159295678138733,
"learning_rate": 9.755124291321414e-05,
"loss": 1.4557,
"step": 1330
},
{
"epoch": 0.29,
"grad_norm": 1.209723949432373,
"learning_rate": 9.75294374182294e-05,
"loss": 1.41,
"step": 1340
},
{
"epoch": 0.29,
"grad_norm": 1.2520672082901,
"learning_rate": 9.750763192324466e-05,
"loss": 1.4362,
"step": 1350
},
{
"epoch": 0.3,
"grad_norm": 1.2639249563217163,
"learning_rate": 9.748582642825992e-05,
"loss": 1.4526,
"step": 1360
},
{
"epoch": 0.3,
"grad_norm": 1.2657458782196045,
"learning_rate": 9.746402093327519e-05,
"loss": 1.4479,
"step": 1370
},
{
"epoch": 0.3,
"grad_norm": 1.4267339706420898,
"learning_rate": 9.744221543829045e-05,
"loss": 1.4219,
"step": 1380
},
{
"epoch": 0.3,
"grad_norm": 1.1722772121429443,
"learning_rate": 9.742040994330571e-05,
"loss": 1.448,
"step": 1390
},
{
"epoch": 0.3,
"grad_norm": 1.1443181037902832,
"learning_rate": 9.739860444832099e-05,
"loss": 1.4193,
"step": 1400
},
{
"epoch": 0.31,
"grad_norm": 1.2879366874694824,
"learning_rate": 9.737679895333624e-05,
"loss": 1.4196,
"step": 1410
},
{
"epoch": 0.31,
"grad_norm": 1.2243574857711792,
"learning_rate": 9.73549934583515e-05,
"loss": 1.4296,
"step": 1420
},
{
"epoch": 0.31,
"grad_norm": 1.2071127891540527,
"learning_rate": 9.733318796336677e-05,
"loss": 1.4194,
"step": 1430
},
{
"epoch": 0.31,
"grad_norm": 1.1925525665283203,
"learning_rate": 9.731138246838204e-05,
"loss": 1.4243,
"step": 1440
},
{
"epoch": 0.31,
"grad_norm": 1.2962863445281982,
"learning_rate": 9.72895769733973e-05,
"loss": 1.4371,
"step": 1450
},
{
"epoch": 0.32,
"grad_norm": 1.0177215337753296,
"learning_rate": 9.726777147841255e-05,
"loss": 1.4237,
"step": 1460
},
{
"epoch": 0.32,
"grad_norm": 1.4175331592559814,
"learning_rate": 9.724596598342783e-05,
"loss": 1.4107,
"step": 1470
},
{
"epoch": 0.32,
"grad_norm": 1.0958452224731445,
"learning_rate": 9.72241604884431e-05,
"loss": 1.4176,
"step": 1480
},
{
"epoch": 0.32,
"grad_norm": 1.1612709760665894,
"learning_rate": 9.720235499345836e-05,
"loss": 1.4051,
"step": 1490
},
{
"epoch": 0.33,
"grad_norm": 1.0781750679016113,
"learning_rate": 9.718054949847362e-05,
"loss": 1.4179,
"step": 1500
},
{
"epoch": 0.33,
"grad_norm": 1.1481519937515259,
"learning_rate": 9.715874400348888e-05,
"loss": 1.4247,
"step": 1510
},
{
"epoch": 0.33,
"grad_norm": 1.155716896057129,
"learning_rate": 9.713693850850415e-05,
"loss": 1.4268,
"step": 1520
},
{
"epoch": 0.33,
"grad_norm": 1.0442588329315186,
"learning_rate": 9.711513301351941e-05,
"loss": 1.445,
"step": 1530
},
{
"epoch": 0.33,
"grad_norm": 1.0979626178741455,
"learning_rate": 9.709332751853467e-05,
"loss": 1.4149,
"step": 1540
},
{
"epoch": 0.34,
"grad_norm": 1.119378685951233,
"learning_rate": 9.707152202354995e-05,
"loss": 1.44,
"step": 1550
},
{
"epoch": 0.34,
"grad_norm": 1.2214171886444092,
"learning_rate": 9.70497165285652e-05,
"loss": 1.44,
"step": 1560
},
{
"epoch": 0.34,
"grad_norm": 1.1184163093566895,
"learning_rate": 9.702791103358046e-05,
"loss": 1.3981,
"step": 1570
},
{
"epoch": 0.34,
"grad_norm": 1.130410075187683,
"learning_rate": 9.700610553859572e-05,
"loss": 1.4296,
"step": 1580
},
{
"epoch": 0.35,
"grad_norm": 1.1225483417510986,
"learning_rate": 9.6984300043611e-05,
"loss": 1.4153,
"step": 1590
},
{
"epoch": 0.35,
"grad_norm": 1.0556180477142334,
"learning_rate": 9.696249454862626e-05,
"loss": 1.4219,
"step": 1600
},
{
"epoch": 0.35,
"grad_norm": 1.2000679969787598,
"learning_rate": 9.694068905364151e-05,
"loss": 1.3892,
"step": 1610
},
{
"epoch": 0.35,
"grad_norm": 1.0137077569961548,
"learning_rate": 9.691888355865678e-05,
"loss": 1.3976,
"step": 1620
},
{
"epoch": 0.35,
"grad_norm": 1.0124636888504028,
"learning_rate": 9.689707806367205e-05,
"loss": 1.4129,
"step": 1630
},
{
"epoch": 0.36,
"grad_norm": 1.0647350549697876,
"learning_rate": 9.687527256868732e-05,
"loss": 1.357,
"step": 1640
},
{
"epoch": 0.36,
"grad_norm": 1.0684030055999756,
"learning_rate": 9.685346707370258e-05,
"loss": 1.4082,
"step": 1650
},
{
"epoch": 0.36,
"grad_norm": 1.0580588579177856,
"learning_rate": 9.683166157871784e-05,
"loss": 1.3959,
"step": 1660
},
{
"epoch": 0.36,
"grad_norm": 1.1602911949157715,
"learning_rate": 9.68098560837331e-05,
"loss": 1.3857,
"step": 1670
},
{
"epoch": 0.36,
"grad_norm": 1.1642051935195923,
"learning_rate": 9.678805058874837e-05,
"loss": 1.4055,
"step": 1680
},
{
"epoch": 0.37,
"grad_norm": 1.0410170555114746,
"learning_rate": 9.676624509376363e-05,
"loss": 1.4071,
"step": 1690
},
{
"epoch": 0.37,
"grad_norm": 1.067542314529419,
"learning_rate": 9.674443959877891e-05,
"loss": 1.4093,
"step": 1700
},
{
"epoch": 0.37,
"grad_norm": 1.2621368169784546,
"learning_rate": 9.672263410379416e-05,
"loss": 1.3814,
"step": 1710
},
{
"epoch": 0.37,
"grad_norm": 1.0956709384918213,
"learning_rate": 9.670082860880942e-05,
"loss": 1.4024,
"step": 1720
},
{
"epoch": 0.38,
"grad_norm": 1.1027687788009644,
"learning_rate": 9.667902311382468e-05,
"loss": 1.3544,
"step": 1730
},
{
"epoch": 0.38,
"grad_norm": 1.1282079219818115,
"learning_rate": 9.665721761883996e-05,
"loss": 1.3818,
"step": 1740
},
{
"epoch": 0.38,
"grad_norm": 1.244485855102539,
"learning_rate": 9.663541212385522e-05,
"loss": 1.4024,
"step": 1750
},
{
"epoch": 0.38,
"grad_norm": 1.2329769134521484,
"learning_rate": 9.661360662887047e-05,
"loss": 1.413,
"step": 1760
},
{
"epoch": 0.38,
"grad_norm": 1.2671635150909424,
"learning_rate": 9.659180113388574e-05,
"loss": 1.4002,
"step": 1770
},
{
"epoch": 0.39,
"grad_norm": 1.2992949485778809,
"learning_rate": 9.656999563890101e-05,
"loss": 1.3972,
"step": 1780
},
{
"epoch": 0.39,
"grad_norm": 1.15711510181427,
"learning_rate": 9.654819014391628e-05,
"loss": 1.3882,
"step": 1790
},
{
"epoch": 0.39,
"grad_norm": 1.122938632965088,
"learning_rate": 9.652638464893154e-05,
"loss": 1.4222,
"step": 1800
},
{
"epoch": 0.39,
"grad_norm": 1.151628851890564,
"learning_rate": 9.650457915394679e-05,
"loss": 1.3898,
"step": 1810
},
{
"epoch": 0.4,
"grad_norm": 1.0860607624053955,
"learning_rate": 9.648277365896206e-05,
"loss": 1.3745,
"step": 1820
},
{
"epoch": 0.4,
"grad_norm": 0.9899650812149048,
"learning_rate": 9.646096816397733e-05,
"loss": 1.3985,
"step": 1830
},
{
"epoch": 0.4,
"grad_norm": 1.019313097000122,
"learning_rate": 9.643916266899259e-05,
"loss": 1.4031,
"step": 1840
},
{
"epoch": 0.4,
"grad_norm": 1.1719962358474731,
"learning_rate": 9.641735717400785e-05,
"loss": 1.3781,
"step": 1850
},
{
"epoch": 0.4,
"grad_norm": 1.117961049079895,
"learning_rate": 9.639555167902312e-05,
"loss": 1.3885,
"step": 1860
},
{
"epoch": 0.41,
"grad_norm": 1.3950169086456299,
"learning_rate": 9.637374618403838e-05,
"loss": 1.3746,
"step": 1870
},
{
"epoch": 0.41,
"grad_norm": 1.1064496040344238,
"learning_rate": 9.635194068905364e-05,
"loss": 1.3764,
"step": 1880
},
{
"epoch": 0.41,
"grad_norm": 1.174922227859497,
"learning_rate": 9.63301351940689e-05,
"loss": 1.42,
"step": 1890
},
{
"epoch": 0.41,
"grad_norm": 1.3221770524978638,
"learning_rate": 9.630832969908418e-05,
"loss": 1.3712,
"step": 1900
},
{
"epoch": 0.41,
"grad_norm": 1.0039620399475098,
"learning_rate": 9.628652420409943e-05,
"loss": 1.3976,
"step": 1910
},
{
"epoch": 0.42,
"grad_norm": 0.9963878393173218,
"learning_rate": 9.62647187091147e-05,
"loss": 1.3977,
"step": 1920
},
{
"epoch": 0.42,
"grad_norm": 1.2195067405700684,
"learning_rate": 9.624291321412997e-05,
"loss": 1.3847,
"step": 1930
},
{
"epoch": 0.42,
"grad_norm": 1.0968499183654785,
"learning_rate": 9.622110771914523e-05,
"loss": 1.3937,
"step": 1940
},
{
"epoch": 0.42,
"grad_norm": 0.992825448513031,
"learning_rate": 9.61993022241605e-05,
"loss": 1.4082,
"step": 1950
},
{
"epoch": 0.43,
"grad_norm": 1.0395129919052124,
"learning_rate": 9.617749672917575e-05,
"loss": 1.3696,
"step": 1960
},
{
"epoch": 0.43,
"grad_norm": 1.030629277229309,
"learning_rate": 9.615569123419102e-05,
"loss": 1.4,
"step": 1970
},
{
"epoch": 0.43,
"grad_norm": 1.0580593347549438,
"learning_rate": 9.613388573920629e-05,
"loss": 1.3461,
"step": 1980
},
{
"epoch": 0.43,
"grad_norm": 1.2588000297546387,
"learning_rate": 9.611208024422155e-05,
"loss": 1.3687,
"step": 1990
},
{
"epoch": 0.43,
"grad_norm": 1.1057671308517456,
"learning_rate": 9.609027474923681e-05,
"loss": 1.3876,
"step": 2000
},
{
"epoch": 0.44,
"grad_norm": 1.1952061653137207,
"learning_rate": 9.606846925425208e-05,
"loss": 1.3821,
"step": 2010
},
{
"epoch": 0.44,
"grad_norm": 1.105406641960144,
"learning_rate": 9.604666375926734e-05,
"loss": 1.375,
"step": 2020
},
{
"epoch": 0.44,
"grad_norm": 1.0594791173934937,
"learning_rate": 9.60248582642826e-05,
"loss": 1.3644,
"step": 2030
},
{
"epoch": 0.44,
"grad_norm": 1.055421233177185,
"learning_rate": 9.600305276929787e-05,
"loss": 1.3938,
"step": 2040
},
{
"epoch": 0.45,
"grad_norm": 1.2545115947723389,
"learning_rate": 9.598124727431314e-05,
"loss": 1.3709,
"step": 2050
},
{
"epoch": 0.45,
"grad_norm": 0.9864488244056702,
"learning_rate": 9.595944177932839e-05,
"loss": 1.3802,
"step": 2060
},
{
"epoch": 0.45,
"grad_norm": 1.0537374019622803,
"learning_rate": 9.593763628434365e-05,
"loss": 1.3847,
"step": 2070
},
{
"epoch": 0.45,
"grad_norm": 1.0474879741668701,
"learning_rate": 9.591583078935892e-05,
"loss": 1.3616,
"step": 2080
},
{
"epoch": 0.45,
"grad_norm": 1.1384907960891724,
"learning_rate": 9.58940252943742e-05,
"loss": 1.3548,
"step": 2090
},
{
"epoch": 0.46,
"grad_norm": 1.1582238674163818,
"learning_rate": 9.587221979938946e-05,
"loss": 1.374,
"step": 2100
},
{
"epoch": 0.46,
"grad_norm": 1.1610651016235352,
"learning_rate": 9.58504143044047e-05,
"loss": 1.3726,
"step": 2110
},
{
"epoch": 0.46,
"grad_norm": 1.0401073694229126,
"learning_rate": 9.582860880941997e-05,
"loss": 1.3617,
"step": 2120
},
{
"epoch": 0.46,
"grad_norm": 1.1059417724609375,
"learning_rate": 9.580680331443525e-05,
"loss": 1.3765,
"step": 2130
},
{
"epoch": 0.46,
"grad_norm": 1.055931806564331,
"learning_rate": 9.578499781945051e-05,
"loss": 1.377,
"step": 2140
},
{
"epoch": 0.47,
"grad_norm": 1.1078617572784424,
"learning_rate": 9.576319232446577e-05,
"loss": 1.3714,
"step": 2150
},
{
"epoch": 0.47,
"grad_norm": 1.0788148641586304,
"learning_rate": 9.574138682948104e-05,
"loss": 1.3769,
"step": 2160
},
{
"epoch": 0.47,
"grad_norm": 1.1252089738845825,
"learning_rate": 9.57195813344963e-05,
"loss": 1.3583,
"step": 2170
},
{
"epoch": 0.47,
"grad_norm": 1.0174541473388672,
"learning_rate": 9.569777583951156e-05,
"loss": 1.3665,
"step": 2180
},
{
"epoch": 0.48,
"grad_norm": 1.0689630508422852,
"learning_rate": 9.567597034452682e-05,
"loss": 1.3571,
"step": 2190
},
{
"epoch": 0.48,
"grad_norm": 1.1311278343200684,
"learning_rate": 9.565416484954209e-05,
"loss": 1.3475,
"step": 2200
},
{
"epoch": 0.48,
"grad_norm": 1.082227349281311,
"learning_rate": 9.563235935455735e-05,
"loss": 1.3952,
"step": 2210
},
{
"epoch": 0.48,
"grad_norm": 1.116151213645935,
"learning_rate": 9.561055385957261e-05,
"loss": 1.3644,
"step": 2220
},
{
"epoch": 0.48,
"grad_norm": 1.2500598430633545,
"learning_rate": 9.558874836458788e-05,
"loss": 1.3197,
"step": 2230
},
{
"epoch": 0.49,
"grad_norm": 1.1783186197280884,
"learning_rate": 9.556694286960315e-05,
"loss": 1.3599,
"step": 2240
},
{
"epoch": 0.49,
"grad_norm": 0.964650571346283,
"learning_rate": 9.554513737461842e-05,
"loss": 1.3765,
"step": 2250
},
{
"epoch": 0.49,
"grad_norm": 1.1065633296966553,
"learning_rate": 9.552333187963367e-05,
"loss": 1.3605,
"step": 2260
},
{
"epoch": 0.49,
"grad_norm": 1.4492055177688599,
"learning_rate": 9.550152638464893e-05,
"loss": 1.3766,
"step": 2270
},
{
"epoch": 0.49,
"grad_norm": 0.9989602565765381,
"learning_rate": 9.54797208896642e-05,
"loss": 1.3821,
"step": 2280
},
{
"epoch": 0.5,
"grad_norm": 1.2991678714752197,
"learning_rate": 9.545791539467947e-05,
"loss": 1.3418,
"step": 2290
},
{
"epoch": 0.5,
"grad_norm": 1.1501140594482422,
"learning_rate": 9.543610989969473e-05,
"loss": 1.3627,
"step": 2300
},
{
"epoch": 0.5,
"grad_norm": 0.9911489486694336,
"learning_rate": 9.541430440470998e-05,
"loss": 1.3413,
"step": 2310
},
{
"epoch": 0.5,
"grad_norm": 1.1046435832977295,
"learning_rate": 9.539249890972526e-05,
"loss": 1.3494,
"step": 2320
},
{
"epoch": 0.51,
"grad_norm": 1.0511558055877686,
"learning_rate": 9.537069341474052e-05,
"loss": 1.3347,
"step": 2330
},
{
"epoch": 0.51,
"grad_norm": 1.1485401391983032,
"learning_rate": 9.534888791975578e-05,
"loss": 1.3833,
"step": 2340
},
{
"epoch": 0.51,
"grad_norm": 1.2908611297607422,
"learning_rate": 9.532708242477105e-05,
"loss": 1.3958,
"step": 2350
},
{
"epoch": 0.51,
"grad_norm": 1.0557186603546143,
"learning_rate": 9.530527692978631e-05,
"loss": 1.3455,
"step": 2360
},
{
"epoch": 0.51,
"grad_norm": 1.0551774501800537,
"learning_rate": 9.528347143480157e-05,
"loss": 1.3366,
"step": 2370
},
{
"epoch": 0.52,
"grad_norm": 1.0171273946762085,
"learning_rate": 9.526166593981684e-05,
"loss": 1.3488,
"step": 2380
},
{
"epoch": 0.52,
"grad_norm": 1.3464566469192505,
"learning_rate": 9.523986044483211e-05,
"loss": 1.3274,
"step": 2390
},
{
"epoch": 0.52,
"grad_norm": 1.1853042840957642,
"learning_rate": 9.521805494984737e-05,
"loss": 1.3553,
"step": 2400
},
{
"epoch": 0.52,
"grad_norm": 1.2067043781280518,
"learning_rate": 9.519624945486262e-05,
"loss": 1.358,
"step": 2410
},
{
"epoch": 0.53,
"grad_norm": 1.0003714561462402,
"learning_rate": 9.517444395987789e-05,
"loss": 1.3768,
"step": 2420
},
{
"epoch": 0.53,
"grad_norm": 1.036536455154419,
"learning_rate": 9.515263846489316e-05,
"loss": 1.325,
"step": 2430
},
{
"epoch": 0.53,
"grad_norm": 1.2333424091339111,
"learning_rate": 9.513083296990843e-05,
"loss": 1.3179,
"step": 2440
},
{
"epoch": 0.53,
"grad_norm": 1.5285654067993164,
"learning_rate": 9.510902747492369e-05,
"loss": 1.3847,
"step": 2450
},
{
"epoch": 0.53,
"grad_norm": 0.9648860096931458,
"learning_rate": 9.508722197993894e-05,
"loss": 1.3624,
"step": 2460
},
{
"epoch": 0.54,
"grad_norm": 1.0200995206832886,
"learning_rate": 9.506541648495422e-05,
"loss": 1.3604,
"step": 2470
},
{
"epoch": 0.54,
"grad_norm": 1.0368491411209106,
"learning_rate": 9.504361098996948e-05,
"loss": 1.3778,
"step": 2480
},
{
"epoch": 0.54,
"grad_norm": 0.9241245985031128,
"learning_rate": 9.502180549498474e-05,
"loss": 1.3751,
"step": 2490
},
{
"epoch": 0.54,
"grad_norm": 1.0286930799484253,
"learning_rate": 9.5e-05,
"loss": 1.3429,
"step": 2500
},
{
"epoch": 0.54,
"grad_norm": 1.262276530265808,
"learning_rate": 9.497819450501527e-05,
"loss": 1.3533,
"step": 2510
},
{
"epoch": 0.55,
"grad_norm": 1.1345752477645874,
"learning_rate": 9.495638901003053e-05,
"loss": 1.3502,
"step": 2520
},
{
"epoch": 0.55,
"grad_norm": 1.025653600692749,
"learning_rate": 9.49345835150458e-05,
"loss": 1.3674,
"step": 2530
},
{
"epoch": 0.55,
"grad_norm": 1.0177459716796875,
"learning_rate": 9.491277802006106e-05,
"loss": 1.356,
"step": 2540
},
{
"epoch": 0.55,
"grad_norm": 1.1438894271850586,
"learning_rate": 9.489097252507632e-05,
"loss": 1.3488,
"step": 2550
},
{
"epoch": 0.56,
"grad_norm": 1.133844017982483,
"learning_rate": 9.486916703009158e-05,
"loss": 1.3649,
"step": 2560
},
{
"epoch": 0.56,
"grad_norm": 1.0228559970855713,
"learning_rate": 9.484736153510685e-05,
"loss": 1.3207,
"step": 2570
},
{
"epoch": 0.56,
"grad_norm": 1.037307858467102,
"learning_rate": 9.482555604012211e-05,
"loss": 1.3517,
"step": 2580
},
{
"epoch": 0.56,
"grad_norm": 1.123706340789795,
"learning_rate": 9.480375054513739e-05,
"loss": 1.371,
"step": 2590
},
{
"epoch": 0.56,
"grad_norm": 1.0684685707092285,
"learning_rate": 9.478194505015265e-05,
"loss": 1.335,
"step": 2600
},
{
"epoch": 0.57,
"grad_norm": 0.9726172089576721,
"learning_rate": 9.47601395551679e-05,
"loss": 1.3588,
"step": 2610
},
{
"epoch": 0.57,
"grad_norm": 0.8923851251602173,
"learning_rate": 9.473833406018318e-05,
"loss": 1.3269,
"step": 2620
},
{
"epoch": 0.57,
"grad_norm": 1.1655867099761963,
"learning_rate": 9.471652856519844e-05,
"loss": 1.3267,
"step": 2630
},
{
"epoch": 0.57,
"grad_norm": 0.9636451005935669,
"learning_rate": 9.46947230702137e-05,
"loss": 1.3545,
"step": 2640
},
{
"epoch": 0.58,
"grad_norm": 1.1559605598449707,
"learning_rate": 9.467291757522896e-05,
"loss": 1.3276,
"step": 2650
},
{
"epoch": 0.58,
"grad_norm": 1.1488990783691406,
"learning_rate": 9.465111208024423e-05,
"loss": 1.3312,
"step": 2660
},
{
"epoch": 0.58,
"grad_norm": 1.0026187896728516,
"learning_rate": 9.462930658525949e-05,
"loss": 1.3574,
"step": 2670
},
{
"epoch": 0.58,
"grad_norm": 1.0129337310791016,
"learning_rate": 9.460750109027475e-05,
"loss": 1.3524,
"step": 2680
},
{
"epoch": 0.58,
"grad_norm": 1.1561243534088135,
"learning_rate": 9.458569559529002e-05,
"loss": 1.3467,
"step": 2690
},
{
"epoch": 0.59,
"grad_norm": 1.0476332902908325,
"learning_rate": 9.456389010030528e-05,
"loss": 1.3552,
"step": 2700
},
{
"epoch": 0.59,
"grad_norm": 1.0199921131134033,
"learning_rate": 9.454208460532054e-05,
"loss": 1.3313,
"step": 2710
},
{
"epoch": 0.59,
"grad_norm": 1.2194985151290894,
"learning_rate": 9.45202791103358e-05,
"loss": 1.3134,
"step": 2720
},
{
"epoch": 0.59,
"grad_norm": 0.9112060070037842,
"learning_rate": 9.449847361535107e-05,
"loss": 1.3581,
"step": 2730
},
{
"epoch": 0.59,
"grad_norm": 1.085046648979187,
"learning_rate": 9.447666812036635e-05,
"loss": 1.3344,
"step": 2740
},
{
"epoch": 0.6,
"grad_norm": 1.0680015087127686,
"learning_rate": 9.445486262538161e-05,
"loss": 1.3227,
"step": 2750
},
{
"epoch": 0.6,
"grad_norm": 0.9969652891159058,
"learning_rate": 9.443305713039686e-05,
"loss": 1.3324,
"step": 2760
},
{
"epoch": 0.6,
"grad_norm": 1.0868465900421143,
"learning_rate": 9.441125163541212e-05,
"loss": 1.3261,
"step": 2770
},
{
"epoch": 0.6,
"grad_norm": 1.0380125045776367,
"learning_rate": 9.43894461404274e-05,
"loss": 1.3378,
"step": 2780
},
{
"epoch": 0.61,
"grad_norm": 0.9851745367050171,
"learning_rate": 9.436764064544266e-05,
"loss": 1.3171,
"step": 2790
},
{
"epoch": 0.61,
"grad_norm": 0.9909139275550842,
"learning_rate": 9.434583515045792e-05,
"loss": 1.3073,
"step": 2800
},
{
"epoch": 0.61,
"grad_norm": 1.0225688219070435,
"learning_rate": 9.432402965547317e-05,
"loss": 1.3119,
"step": 2810
},
{
"epoch": 0.61,
"grad_norm": 1.018894910812378,
"learning_rate": 9.430222416048845e-05,
"loss": 1.3337,
"step": 2820
},
{
"epoch": 0.61,
"grad_norm": 1.0594004392623901,
"learning_rate": 9.428041866550371e-05,
"loss": 1.309,
"step": 2830
},
{
"epoch": 0.62,
"grad_norm": 1.0812976360321045,
"learning_rate": 9.425861317051898e-05,
"loss": 1.3403,
"step": 2840
},
{
"epoch": 0.62,
"grad_norm": 0.9586821794509888,
"learning_rate": 9.423680767553424e-05,
"loss": 1.3413,
"step": 2850
},
{
"epoch": 0.62,
"grad_norm": 0.9033297896385193,
"learning_rate": 9.42150021805495e-05,
"loss": 1.3361,
"step": 2860
},
{
"epoch": 0.62,
"grad_norm": 0.976488471031189,
"learning_rate": 9.419319668556476e-05,
"loss": 1.3467,
"step": 2870
},
{
"epoch": 0.63,
"grad_norm": 0.9687233567237854,
"learning_rate": 9.417139119058003e-05,
"loss": 1.3089,
"step": 2880
},
{
"epoch": 0.63,
"grad_norm": 0.9967139959335327,
"learning_rate": 9.41495856955953e-05,
"loss": 1.3241,
"step": 2890
},
{
"epoch": 0.63,
"grad_norm": 0.9404115676879883,
"learning_rate": 9.412778020061055e-05,
"loss": 1.3489,
"step": 2900
},
{
"epoch": 0.63,
"grad_norm": 1.038221001625061,
"learning_rate": 9.410597470562582e-05,
"loss": 1.3405,
"step": 2910
},
{
"epoch": 0.63,
"grad_norm": 0.9442505240440369,
"learning_rate": 9.408416921064108e-05,
"loss": 1.3733,
"step": 2920
},
{
"epoch": 0.64,
"grad_norm": 0.8614059090614319,
"learning_rate": 9.406236371565636e-05,
"loss": 1.3369,
"step": 2930
},
{
"epoch": 0.64,
"grad_norm": 1.0159504413604736,
"learning_rate": 9.404055822067162e-05,
"loss": 1.3473,
"step": 2940
},
{
"epoch": 0.64,
"grad_norm": 0.9344844222068787,
"learning_rate": 9.401875272568688e-05,
"loss": 1.3191,
"step": 2950
},
{
"epoch": 0.64,
"grad_norm": 0.9241899251937866,
"learning_rate": 9.399694723070213e-05,
"loss": 1.3074,
"step": 2960
},
{
"epoch": 0.64,
"grad_norm": 1.0132297277450562,
"learning_rate": 9.397514173571741e-05,
"loss": 1.3345,
"step": 2970
},
{
"epoch": 0.65,
"grad_norm": 1.035719633102417,
"learning_rate": 9.395333624073267e-05,
"loss": 1.3241,
"step": 2980
},
{
"epoch": 0.65,
"grad_norm": 1.0716739892959595,
"learning_rate": 9.393153074574793e-05,
"loss": 1.3342,
"step": 2990
},
{
"epoch": 0.65,
"grad_norm": 1.05617094039917,
"learning_rate": 9.39097252507632e-05,
"loss": 1.3174,
"step": 3000
},
{
"epoch": 0.65,
"grad_norm": 1.0201910734176636,
"learning_rate": 9.388791975577846e-05,
"loss": 1.3427,
"step": 3010
},
{
"epoch": 0.66,
"grad_norm": 0.9820442199707031,
"learning_rate": 9.386611426079372e-05,
"loss": 1.3187,
"step": 3020
},
{
"epoch": 0.66,
"grad_norm": 0.9873951077461243,
"learning_rate": 9.384430876580899e-05,
"loss": 1.311,
"step": 3030
},
{
"epoch": 0.66,
"grad_norm": 1.0694694519042969,
"learning_rate": 9.382250327082425e-05,
"loss": 1.3409,
"step": 3040
},
{
"epoch": 0.66,
"grad_norm": 0.9933134317398071,
"learning_rate": 9.380069777583951e-05,
"loss": 1.3202,
"step": 3050
},
{
"epoch": 0.66,
"grad_norm": 1.0120593309402466,
"learning_rate": 9.377889228085478e-05,
"loss": 1.3243,
"step": 3060
},
{
"epoch": 0.67,
"grad_norm": 1.0012543201446533,
"learning_rate": 9.375708678587004e-05,
"loss": 1.3205,
"step": 3070
},
{
"epoch": 0.67,
"grad_norm": 0.9940156936645508,
"learning_rate": 9.373528129088532e-05,
"loss": 1.3319,
"step": 3080
},
{
"epoch": 0.67,
"grad_norm": 0.9410566687583923,
"learning_rate": 9.371347579590058e-05,
"loss": 1.3377,
"step": 3090
},
{
"epoch": 0.67,
"grad_norm": 1.0209511518478394,
"learning_rate": 9.369167030091584e-05,
"loss": 1.3226,
"step": 3100
},
{
"epoch": 0.68,
"grad_norm": 1.0901682376861572,
"learning_rate": 9.366986480593109e-05,
"loss": 1.3054,
"step": 3110
},
{
"epoch": 0.68,
"grad_norm": 1.1590335369110107,
"learning_rate": 9.364805931094637e-05,
"loss": 1.333,
"step": 3120
},
{
"epoch": 0.68,
"grad_norm": 0.9248669147491455,
"learning_rate": 9.362625381596163e-05,
"loss": 1.3195,
"step": 3130
},
{
"epoch": 0.68,
"grad_norm": 0.9178153276443481,
"learning_rate": 9.36044483209769e-05,
"loss": 1.3411,
"step": 3140
},
{
"epoch": 0.68,
"grad_norm": 0.8997146487236023,
"learning_rate": 9.358264282599216e-05,
"loss": 1.3238,
"step": 3150
},
{
"epoch": 0.69,
"grad_norm": 0.872699499130249,
"learning_rate": 9.356083733100742e-05,
"loss": 1.311,
"step": 3160
},
{
"epoch": 0.69,
"grad_norm": 1.0057190656661987,
"learning_rate": 9.353903183602268e-05,
"loss": 1.3419,
"step": 3170
},
{
"epoch": 0.69,
"grad_norm": 0.9421138763427734,
"learning_rate": 9.351722634103795e-05,
"loss": 1.3326,
"step": 3180
},
{
"epoch": 0.69,
"grad_norm": 1.072662353515625,
"learning_rate": 9.349542084605321e-05,
"loss": 1.3101,
"step": 3190
},
{
"epoch": 0.69,
"grad_norm": 0.9273852109909058,
"learning_rate": 9.347361535106847e-05,
"loss": 1.2917,
"step": 3200
},
{
"epoch": 0.7,
"grad_norm": 1.056483507156372,
"learning_rate": 9.345180985608373e-05,
"loss": 1.3145,
"step": 3210
},
{
"epoch": 0.7,
"grad_norm": 1.0562832355499268,
"learning_rate": 9.3430004361099e-05,
"loss": 1.3236,
"step": 3220
},
{
"epoch": 0.7,
"grad_norm": 0.9665394425392151,
"learning_rate": 9.340819886611426e-05,
"loss": 1.3311,
"step": 3230
},
{
"epoch": 0.7,
"grad_norm": 1.1284903287887573,
"learning_rate": 9.338639337112954e-05,
"loss": 1.2955,
"step": 3240
},
{
"epoch": 0.71,
"grad_norm": 0.8982547521591187,
"learning_rate": 9.336458787614479e-05,
"loss": 1.3064,
"step": 3250
},
{
"epoch": 0.71,
"grad_norm": 0.9506440162658691,
"learning_rate": 9.334278238116005e-05,
"loss": 1.2924,
"step": 3260
},
{
"epoch": 0.71,
"grad_norm": 0.990853488445282,
"learning_rate": 9.332097688617531e-05,
"loss": 1.3153,
"step": 3270
},
{
"epoch": 0.71,
"grad_norm": 1.048412561416626,
"learning_rate": 9.329917139119059e-05,
"loss": 1.3151,
"step": 3280
},
{
"epoch": 0.71,
"grad_norm": 0.9810274243354797,
"learning_rate": 9.327736589620585e-05,
"loss": 1.3106,
"step": 3290
},
{
"epoch": 0.72,
"grad_norm": 1.2232158184051514,
"learning_rate": 9.325556040122112e-05,
"loss": 1.3269,
"step": 3300
},
{
"epoch": 0.72,
"grad_norm": 0.9797046780586243,
"learning_rate": 9.323375490623638e-05,
"loss": 1.3237,
"step": 3310
},
{
"epoch": 0.72,
"grad_norm": 0.9088875651359558,
"learning_rate": 9.321194941125164e-05,
"loss": 1.328,
"step": 3320
},
{
"epoch": 0.72,
"grad_norm": 0.9865596294403076,
"learning_rate": 9.31901439162669e-05,
"loss": 1.3245,
"step": 3330
},
{
"epoch": 0.73,
"grad_norm": 0.890883207321167,
"learning_rate": 9.316833842128217e-05,
"loss": 1.3078,
"step": 3340
},
{
"epoch": 0.73,
"grad_norm": 1.2496368885040283,
"learning_rate": 9.314653292629743e-05,
"loss": 1.2926,
"step": 3350
},
{
"epoch": 0.73,
"grad_norm": 0.9493234753608704,
"learning_rate": 9.31247274313127e-05,
"loss": 1.3267,
"step": 3360
},
{
"epoch": 0.73,
"grad_norm": 0.9854113459587097,
"learning_rate": 9.310292193632796e-05,
"loss": 1.315,
"step": 3370
},
{
"epoch": 0.73,
"grad_norm": 0.9487243294715881,
"learning_rate": 9.308111644134322e-05,
"loss": 1.3089,
"step": 3380
},
{
"epoch": 0.74,
"grad_norm": 1.0045417547225952,
"learning_rate": 9.30593109463585e-05,
"loss": 1.3007,
"step": 3390
},
{
"epoch": 0.74,
"grad_norm": 0.9876412749290466,
"learning_rate": 9.303750545137375e-05,
"loss": 1.3276,
"step": 3400
},
{
"epoch": 0.74,
"grad_norm": 0.9821478724479675,
"learning_rate": 9.301569995638901e-05,
"loss": 1.3276,
"step": 3410
},
{
"epoch": 0.74,
"grad_norm": 1.0079724788665771,
"learning_rate": 9.299389446140427e-05,
"loss": 1.3379,
"step": 3420
},
{
"epoch": 0.74,
"grad_norm": 1.0058810710906982,
"learning_rate": 9.297208896641955e-05,
"loss": 1.309,
"step": 3430
},
{
"epoch": 0.75,
"grad_norm": 0.9457936882972717,
"learning_rate": 9.295028347143481e-05,
"loss": 1.3301,
"step": 3440
},
{
"epoch": 0.75,
"grad_norm": 1.0582879781723022,
"learning_rate": 9.292847797645007e-05,
"loss": 1.3075,
"step": 3450
},
{
"epoch": 0.75,
"grad_norm": 1.0312747955322266,
"learning_rate": 9.290667248146532e-05,
"loss": 1.3102,
"step": 3460
},
{
"epoch": 0.75,
"grad_norm": 1.3287076950073242,
"learning_rate": 9.28848669864806e-05,
"loss": 1.2828,
"step": 3470
},
{
"epoch": 0.76,
"grad_norm": 1.0003306865692139,
"learning_rate": 9.286306149149586e-05,
"loss": 1.3158,
"step": 3480
},
{
"epoch": 0.76,
"grad_norm": 0.9804103970527649,
"learning_rate": 9.284125599651113e-05,
"loss": 1.3429,
"step": 3490
},
{
"epoch": 0.76,
"grad_norm": 0.9052048325538635,
"learning_rate": 9.281945050152639e-05,
"loss": 1.3248,
"step": 3500
},
{
"epoch": 0.76,
"grad_norm": 0.9492114782333374,
"learning_rate": 9.279764500654165e-05,
"loss": 1.3173,
"step": 3510
},
{
"epoch": 0.76,
"grad_norm": 0.9319648742675781,
"learning_rate": 9.277583951155692e-05,
"loss": 1.3188,
"step": 3520
},
{
"epoch": 0.77,
"grad_norm": 0.9741306900978088,
"learning_rate": 9.275403401657218e-05,
"loss": 1.3263,
"step": 3530
},
{
"epoch": 0.77,
"grad_norm": 0.9644444584846497,
"learning_rate": 9.273222852158746e-05,
"loss": 1.3089,
"step": 3540
},
{
"epoch": 0.77,
"grad_norm": 0.972549319267273,
"learning_rate": 9.27104230266027e-05,
"loss": 1.3047,
"step": 3550
},
{
"epoch": 0.77,
"grad_norm": 1.1472231149673462,
"learning_rate": 9.268861753161797e-05,
"loss": 1.3414,
"step": 3560
},
{
"epoch": 0.78,
"grad_norm": 1.212759256362915,
"learning_rate": 9.266681203663323e-05,
"loss": 1.2955,
"step": 3570
},
{
"epoch": 0.78,
"grad_norm": 0.9833585023880005,
"learning_rate": 9.264500654164851e-05,
"loss": 1.3101,
"step": 3580
},
{
"epoch": 0.78,
"grad_norm": 1.0089327096939087,
"learning_rate": 9.262320104666377e-05,
"loss": 1.3078,
"step": 3590
},
{
"epoch": 0.78,
"grad_norm": 1.026849627494812,
"learning_rate": 9.260139555167902e-05,
"loss": 1.3062,
"step": 3600
},
{
"epoch": 0.78,
"grad_norm": 0.8988268375396729,
"learning_rate": 9.257959005669428e-05,
"loss": 1.2961,
"step": 3610
},
{
"epoch": 0.79,
"grad_norm": 1.0766083002090454,
"learning_rate": 9.255778456170956e-05,
"loss": 1.302,
"step": 3620
},
{
"epoch": 0.79,
"grad_norm": 1.111632227897644,
"learning_rate": 9.253597906672482e-05,
"loss": 1.3179,
"step": 3630
},
{
"epoch": 0.79,
"grad_norm": 0.9569946527481079,
"learning_rate": 9.251417357174009e-05,
"loss": 1.3392,
"step": 3640
},
{
"epoch": 0.79,
"grad_norm": 0.9719332456588745,
"learning_rate": 9.249236807675535e-05,
"loss": 1.3019,
"step": 3650
},
{
"epoch": 0.79,
"grad_norm": 0.9521161317825317,
"learning_rate": 9.247056258177061e-05,
"loss": 1.3226,
"step": 3660
},
{
"epoch": 0.8,
"grad_norm": 1.1349732875823975,
"learning_rate": 9.244875708678587e-05,
"loss": 1.3184,
"step": 3670
},
{
"epoch": 0.8,
"grad_norm": 1.0802345275878906,
"learning_rate": 9.242695159180114e-05,
"loss": 1.3236,
"step": 3680
},
{
"epoch": 0.8,
"grad_norm": 1.0327568054199219,
"learning_rate": 9.24051460968164e-05,
"loss": 1.3285,
"step": 3690
},
{
"epoch": 0.8,
"grad_norm": 1.064948320388794,
"learning_rate": 9.238334060183166e-05,
"loss": 1.3158,
"step": 3700
},
{
"epoch": 0.81,
"grad_norm": 0.88676518201828,
"learning_rate": 9.236153510684693e-05,
"loss": 1.3066,
"step": 3710
},
{
"epoch": 0.81,
"grad_norm": 0.942152202129364,
"learning_rate": 9.233972961186219e-05,
"loss": 1.332,
"step": 3720
},
{
"epoch": 0.81,
"grad_norm": 0.9341984987258911,
"learning_rate": 9.231792411687745e-05,
"loss": 1.3147,
"step": 3730
},
{
"epoch": 0.81,
"grad_norm": 0.8915871381759644,
"learning_rate": 9.229611862189273e-05,
"loss": 1.3071,
"step": 3740
},
{
"epoch": 0.81,
"grad_norm": 0.9265626668930054,
"learning_rate": 9.227431312690798e-05,
"loss": 1.3083,
"step": 3750
},
{
"epoch": 0.82,
"grad_norm": 0.9003929495811462,
"learning_rate": 9.225250763192324e-05,
"loss": 1.3101,
"step": 3760
},
{
"epoch": 0.82,
"grad_norm": 1.004757285118103,
"learning_rate": 9.223070213693852e-05,
"loss": 1.3324,
"step": 3770
},
{
"epoch": 0.82,
"grad_norm": 0.9720560908317566,
"learning_rate": 9.220889664195378e-05,
"loss": 1.3074,
"step": 3780
},
{
"epoch": 0.82,
"grad_norm": 1.0125725269317627,
"learning_rate": 9.218709114696904e-05,
"loss": 1.295,
"step": 3790
},
{
"epoch": 0.82,
"grad_norm": 0.9948697686195374,
"learning_rate": 9.21652856519843e-05,
"loss": 1.3072,
"step": 3800
},
{
"epoch": 0.83,
"grad_norm": 0.8904112577438354,
"learning_rate": 9.214348015699957e-05,
"loss": 1.2879,
"step": 3810
},
{
"epoch": 0.83,
"grad_norm": 0.9827283620834351,
"learning_rate": 9.212167466201483e-05,
"loss": 1.2859,
"step": 3820
},
{
"epoch": 0.83,
"grad_norm": 0.9134978652000427,
"learning_rate": 9.20998691670301e-05,
"loss": 1.2996,
"step": 3830
},
{
"epoch": 0.83,
"grad_norm": 0.9517325162887573,
"learning_rate": 9.207806367204536e-05,
"loss": 1.2764,
"step": 3840
},
{
"epoch": 0.84,
"grad_norm": 0.9537093043327332,
"learning_rate": 9.205625817706062e-05,
"loss": 1.3112,
"step": 3850
},
{
"epoch": 0.84,
"grad_norm": 1.011399269104004,
"learning_rate": 9.203445268207589e-05,
"loss": 1.3008,
"step": 3860
},
{
"epoch": 0.84,
"grad_norm": 1.0325734615325928,
"learning_rate": 9.201264718709115e-05,
"loss": 1.3032,
"step": 3870
},
{
"epoch": 0.84,
"grad_norm": 0.9590222239494324,
"learning_rate": 9.199084169210641e-05,
"loss": 1.3002,
"step": 3880
},
{
"epoch": 0.84,
"grad_norm": 0.984958827495575,
"learning_rate": 9.196903619712169e-05,
"loss": 1.3011,
"step": 3890
},
{
"epoch": 0.85,
"grad_norm": 1.1154364347457886,
"learning_rate": 9.194723070213694e-05,
"loss": 1.3065,
"step": 3900
},
{
"epoch": 0.85,
"grad_norm": 1.0203578472137451,
"learning_rate": 9.19254252071522e-05,
"loss": 1.3193,
"step": 3910
},
{
"epoch": 0.85,
"grad_norm": 1.0204946994781494,
"learning_rate": 9.190361971216746e-05,
"loss": 1.3048,
"step": 3920
},
{
"epoch": 0.85,
"grad_norm": 0.9758703708648682,
"learning_rate": 9.188181421718274e-05,
"loss": 1.2933,
"step": 3930
},
{
"epoch": 0.86,
"grad_norm": 1.0854405164718628,
"learning_rate": 9.1860008722198e-05,
"loss": 1.2947,
"step": 3940
},
{
"epoch": 0.86,
"grad_norm": 1.0030591487884521,
"learning_rate": 9.183820322721325e-05,
"loss": 1.2882,
"step": 3950
},
{
"epoch": 0.86,
"grad_norm": 0.9652947187423706,
"learning_rate": 9.181639773222852e-05,
"loss": 1.2779,
"step": 3960
},
{
"epoch": 0.86,
"grad_norm": 1.0450283288955688,
"learning_rate": 9.179459223724379e-05,
"loss": 1.2807,
"step": 3970
},
{
"epoch": 0.86,
"grad_norm": 1.0894801616668701,
"learning_rate": 9.177278674225906e-05,
"loss": 1.3072,
"step": 3980
},
{
"epoch": 0.87,
"grad_norm": 1.0392231941223145,
"learning_rate": 9.175098124727432e-05,
"loss": 1.3119,
"step": 3990
},
{
"epoch": 0.87,
"grad_norm": 0.9792558550834656,
"learning_rate": 9.172917575228958e-05,
"loss": 1.3062,
"step": 4000
},
{
"epoch": 0.87,
"grad_norm": 1.015689492225647,
"learning_rate": 9.170737025730485e-05,
"loss": 1.3075,
"step": 4010
},
{
"epoch": 0.87,
"grad_norm": 1.0359702110290527,
"learning_rate": 9.168556476232011e-05,
"loss": 1.3022,
"step": 4020
},
{
"epoch": 0.87,
"grad_norm": 0.9113004803657532,
"learning_rate": 9.166375926733537e-05,
"loss": 1.3298,
"step": 4030
},
{
"epoch": 0.88,
"grad_norm": 1.0571136474609375,
"learning_rate": 9.164195377235065e-05,
"loss": 1.2898,
"step": 4040
},
{
"epoch": 0.88,
"grad_norm": 0.9297426342964172,
"learning_rate": 9.16201482773659e-05,
"loss": 1.2895,
"step": 4050
},
{
"epoch": 0.88,
"grad_norm": 1.0925400257110596,
"learning_rate": 9.159834278238116e-05,
"loss": 1.2998,
"step": 4060
},
{
"epoch": 0.88,
"grad_norm": 0.9070808291435242,
"learning_rate": 9.157653728739642e-05,
"loss": 1.2998,
"step": 4070
},
{
"epoch": 0.89,
"grad_norm": 1.1315734386444092,
"learning_rate": 9.15547317924117e-05,
"loss": 1.2867,
"step": 4080
},
{
"epoch": 0.89,
"grad_norm": 1.0597316026687622,
"learning_rate": 9.153292629742696e-05,
"loss": 1.2931,
"step": 4090
},
{
"epoch": 0.89,
"grad_norm": 0.9442005157470703,
"learning_rate": 9.151112080244221e-05,
"loss": 1.2805,
"step": 4100
},
{
"epoch": 0.89,
"grad_norm": 1.3041001558303833,
"learning_rate": 9.148931530745748e-05,
"loss": 1.2934,
"step": 4110
},
{
"epoch": 0.89,
"grad_norm": 0.9306684136390686,
"learning_rate": 9.146750981247275e-05,
"loss": 1.2933,
"step": 4120
},
{
"epoch": 0.9,
"grad_norm": 0.9480651021003723,
"learning_rate": 9.144570431748802e-05,
"loss": 1.3147,
"step": 4130
},
{
"epoch": 0.9,
"grad_norm": 0.98679119348526,
"learning_rate": 9.142389882250328e-05,
"loss": 1.3063,
"step": 4140
},
{
"epoch": 0.9,
"grad_norm": 0.9486891627311707,
"learning_rate": 9.140209332751853e-05,
"loss": 1.2644,
"step": 4150
},
{
"epoch": 0.9,
"grad_norm": 0.9325621724128723,
"learning_rate": 9.13802878325338e-05,
"loss": 1.2718,
"step": 4160
},
{
"epoch": 0.91,
"grad_norm": 0.9871125221252441,
"learning_rate": 9.135848233754907e-05,
"loss": 1.2943,
"step": 4170
},
{
"epoch": 0.91,
"grad_norm": 0.9043755531311035,
"learning_rate": 9.133667684256433e-05,
"loss": 1.3015,
"step": 4180
},
{
"epoch": 0.91,
"grad_norm": 0.9878096580505371,
"learning_rate": 9.13148713475796e-05,
"loss": 1.2524,
"step": 4190
},
{
"epoch": 0.91,
"grad_norm": 0.925841748714447,
"learning_rate": 9.129306585259486e-05,
"loss": 1.2881,
"step": 4200
},
{
"epoch": 0.91,
"grad_norm": 0.8888818025588989,
"learning_rate": 9.127126035761012e-05,
"loss": 1.3057,
"step": 4210
},
{
"epoch": 0.92,
"grad_norm": 1.1273852586746216,
"learning_rate": 9.124945486262538e-05,
"loss": 1.3068,
"step": 4220
},
{
"epoch": 0.92,
"grad_norm": 1.078979253768921,
"learning_rate": 9.122764936764066e-05,
"loss": 1.311,
"step": 4230
},
{
"epoch": 0.92,
"grad_norm": 1.139224648475647,
"learning_rate": 9.120584387265592e-05,
"loss": 1.2961,
"step": 4240
},
{
"epoch": 0.92,
"grad_norm": 0.9568941593170166,
"learning_rate": 9.118403837767117e-05,
"loss": 1.3335,
"step": 4250
},
{
"epoch": 0.92,
"grad_norm": 0.8990288972854614,
"learning_rate": 9.116223288268643e-05,
"loss": 1.2983,
"step": 4260
},
{
"epoch": 0.93,
"grad_norm": 1.0404481887817383,
"learning_rate": 9.114042738770171e-05,
"loss": 1.2867,
"step": 4270
},
{
"epoch": 0.93,
"grad_norm": 0.970191240310669,
"learning_rate": 9.111862189271697e-05,
"loss": 1.2923,
"step": 4280
},
{
"epoch": 0.93,
"grad_norm": 0.9285945296287537,
"learning_rate": 9.109681639773224e-05,
"loss": 1.296,
"step": 4290
},
{
"epoch": 0.93,
"grad_norm": 1.0113970041275024,
"learning_rate": 9.107501090274749e-05,
"loss": 1.2861,
"step": 4300
},
{
"epoch": 0.94,
"grad_norm": 1.0101959705352783,
"learning_rate": 9.105320540776276e-05,
"loss": 1.2958,
"step": 4310
},
{
"epoch": 0.94,
"grad_norm": 0.9014917612075806,
"learning_rate": 9.103139991277803e-05,
"loss": 1.2735,
"step": 4320
},
{
"epoch": 0.94,
"grad_norm": 1.4451045989990234,
"learning_rate": 9.100959441779329e-05,
"loss": 1.3111,
"step": 4330
},
{
"epoch": 0.94,
"grad_norm": 0.9970597624778748,
"learning_rate": 9.098778892280855e-05,
"loss": 1.2725,
"step": 4340
},
{
"epoch": 0.94,
"grad_norm": 0.9795159101486206,
"learning_rate": 9.096598342782382e-05,
"loss": 1.286,
"step": 4350
},
{
"epoch": 0.95,
"grad_norm": 1.1754708290100098,
"learning_rate": 9.094417793283908e-05,
"loss": 1.2903,
"step": 4360
},
{
"epoch": 0.95,
"grad_norm": 1.02108895778656,
"learning_rate": 9.092237243785434e-05,
"loss": 1.2865,
"step": 4370
},
{
"epoch": 0.95,
"grad_norm": 0.9269696474075317,
"learning_rate": 9.09005669428696e-05,
"loss": 1.3163,
"step": 4380
},
{
"epoch": 0.95,
"grad_norm": 0.9824286103248596,
"learning_rate": 9.087876144788488e-05,
"loss": 1.2713,
"step": 4390
},
{
"epoch": 0.96,
"grad_norm": 1.2137070894241333,
"learning_rate": 9.085695595290013e-05,
"loss": 1.313,
"step": 4400
},
{
"epoch": 0.96,
"grad_norm": 1.0218490362167358,
"learning_rate": 9.08351504579154e-05,
"loss": 1.2864,
"step": 4410
},
{
"epoch": 0.96,
"grad_norm": 1.0295207500457764,
"learning_rate": 9.081334496293066e-05,
"loss": 1.2974,
"step": 4420
},
{
"epoch": 0.96,
"grad_norm": 1.0075607299804688,
"learning_rate": 9.079153946794593e-05,
"loss": 1.3011,
"step": 4430
},
{
"epoch": 0.96,
"grad_norm": 0.889430820941925,
"learning_rate": 9.07697339729612e-05,
"loss": 1.3112,
"step": 4440
},
{
"epoch": 0.97,
"grad_norm": 0.9565015435218811,
"learning_rate": 9.074792847797645e-05,
"loss": 1.3019,
"step": 4450
},
{
"epoch": 0.97,
"grad_norm": 1.0241695642471313,
"learning_rate": 9.072612298299172e-05,
"loss": 1.2878,
"step": 4460
},
{
"epoch": 0.97,
"grad_norm": 0.9693965315818787,
"learning_rate": 9.070431748800699e-05,
"loss": 1.3009,
"step": 4470
},
{
"epoch": 0.97,
"grad_norm": 0.8897150754928589,
"learning_rate": 9.068251199302225e-05,
"loss": 1.2757,
"step": 4480
},
{
"epoch": 0.97,
"grad_norm": 1.1614912748336792,
"learning_rate": 9.066070649803751e-05,
"loss": 1.2923,
"step": 4490
},
{
"epoch": 0.98,
"grad_norm": 0.8832863569259644,
"learning_rate": 9.063890100305277e-05,
"loss": 1.3098,
"step": 4500
},
{
"epoch": 0.98,
"grad_norm": 0.9805281162261963,
"learning_rate": 9.061709550806804e-05,
"loss": 1.2958,
"step": 4510
},
{
"epoch": 0.98,
"grad_norm": 1.0199958086013794,
"learning_rate": 9.05952900130833e-05,
"loss": 1.2824,
"step": 4520
},
{
"epoch": 0.98,
"grad_norm": 0.8528922200202942,
"learning_rate": 9.057348451809856e-05,
"loss": 1.2993,
"step": 4530
},
{
"epoch": 0.99,
"grad_norm": 0.9288610816001892,
"learning_rate": 9.055167902311384e-05,
"loss": 1.2758,
"step": 4540
},
{
"epoch": 0.99,
"grad_norm": 0.8977848887443542,
"learning_rate": 9.052987352812909e-05,
"loss": 1.2789,
"step": 4550
},
{
"epoch": 0.99,
"grad_norm": 0.8637726902961731,
"learning_rate": 9.050806803314435e-05,
"loss": 1.2734,
"step": 4560
},
{
"epoch": 0.99,
"grad_norm": 0.9056828022003174,
"learning_rate": 9.048626253815962e-05,
"loss": 1.272,
"step": 4570
},
{
"epoch": 0.99,
"grad_norm": 0.9080044627189636,
"learning_rate": 9.046445704317489e-05,
"loss": 1.264,
"step": 4580
},
{
"epoch": 1.0,
"grad_norm": 0.886441707611084,
"learning_rate": 9.044265154819016e-05,
"loss": 1.2752,
"step": 4590
},
{
"epoch": 1.0,
"grad_norm": 1.02278470993042,
"learning_rate": 9.04208460532054e-05,
"loss": 1.2819,
"step": 4600
},
{
"epoch": 1.0,
"eval_loss": 1.2792317867279053,
"eval_runtime": 1502.3325,
"eval_samples_per_second": 257.499,
"eval_steps_per_second": 4.024,
"step": 4606
},
{
"epoch": 1.0,
"grad_norm": 1.08243727684021,
"learning_rate": 9.039904055822067e-05,
"loss": 1.3113,
"step": 4610
},
{
"epoch": 1.0,
"grad_norm": 1.073258399963379,
"learning_rate": 9.037723506323594e-05,
"loss": 1.3031,
"step": 4620
},
{
"epoch": 1.01,
"grad_norm": 0.9962953329086304,
"learning_rate": 9.035542956825121e-05,
"loss": 1.2904,
"step": 4630
},
{
"epoch": 1.01,
"grad_norm": 0.9397081136703491,
"learning_rate": 9.033362407326647e-05,
"loss": 1.2672,
"step": 4640
},
{
"epoch": 1.01,
"grad_norm": 0.9223260879516602,
"learning_rate": 9.031181857828172e-05,
"loss": 1.2898,
"step": 4650
},
{
"epoch": 1.01,
"grad_norm": 1.0643510818481445,
"learning_rate": 9.0290013083297e-05,
"loss": 1.2831,
"step": 4660
},
{
"epoch": 1.01,
"grad_norm": 0.9219188094139099,
"learning_rate": 9.026820758831226e-05,
"loss": 1.2651,
"step": 4670
},
{
"epoch": 1.02,
"grad_norm": 0.9872779250144958,
"learning_rate": 9.024640209332752e-05,
"loss": 1.2695,
"step": 4680
},
{
"epoch": 1.02,
"grad_norm": 0.9516711235046387,
"learning_rate": 9.022459659834279e-05,
"loss": 1.2662,
"step": 4690
},
{
"epoch": 1.02,
"grad_norm": 0.9385516047477722,
"learning_rate": 9.020279110335805e-05,
"loss": 1.2744,
"step": 4700
},
{
"epoch": 1.02,
"grad_norm": 1.0308866500854492,
"learning_rate": 9.018098560837331e-05,
"loss": 1.2718,
"step": 4710
},
{
"epoch": 1.02,
"grad_norm": 0.9456400871276855,
"learning_rate": 9.015918011338857e-05,
"loss": 1.2494,
"step": 4720
},
{
"epoch": 1.03,
"grad_norm": 1.1350531578063965,
"learning_rate": 9.013737461840385e-05,
"loss": 1.2607,
"step": 4730
},
{
"epoch": 1.03,
"grad_norm": 0.9552891254425049,
"learning_rate": 9.011556912341911e-05,
"loss": 1.2563,
"step": 4740
},
{
"epoch": 1.03,
"grad_norm": 0.9082231521606445,
"learning_rate": 9.009376362843436e-05,
"loss": 1.268,
"step": 4750
},
{
"epoch": 1.03,
"grad_norm": 1.0419315099716187,
"learning_rate": 9.007195813344963e-05,
"loss": 1.3033,
"step": 4760
},
{
"epoch": 1.04,
"grad_norm": 0.827100396156311,
"learning_rate": 9.00501526384649e-05,
"loss": 1.2636,
"step": 4770
},
{
"epoch": 1.04,
"grad_norm": 1.0661678314208984,
"learning_rate": 9.002834714348017e-05,
"loss": 1.2487,
"step": 4780
},
{
"epoch": 1.04,
"grad_norm": 0.9938476085662842,
"learning_rate": 9.000654164849543e-05,
"loss": 1.2729,
"step": 4790
},
{
"epoch": 1.04,
"grad_norm": 1.1281195878982544,
"learning_rate": 8.998473615351068e-05,
"loss": 1.2391,
"step": 4800
},
{
"epoch": 1.04,
"grad_norm": 1.1780451536178589,
"learning_rate": 8.996293065852596e-05,
"loss": 1.2985,
"step": 4810
},
{
"epoch": 1.05,
"grad_norm": 1.0872817039489746,
"learning_rate": 8.994112516354122e-05,
"loss": 1.2615,
"step": 4820
},
{
"epoch": 1.05,
"grad_norm": 0.9712433815002441,
"learning_rate": 8.991931966855648e-05,
"loss": 1.2694,
"step": 4830
},
{
"epoch": 1.05,
"grad_norm": 1.2177668809890747,
"learning_rate": 8.989751417357174e-05,
"loss": 1.2726,
"step": 4840
},
{
"epoch": 1.05,
"grad_norm": 0.9332715272903442,
"learning_rate": 8.987570867858701e-05,
"loss": 1.2703,
"step": 4850
},
{
"epoch": 1.06,
"grad_norm": 0.9567763209342957,
"learning_rate": 8.985390318360227e-05,
"loss": 1.2711,
"step": 4860
},
{
"epoch": 1.06,
"grad_norm": 0.9975143074989319,
"learning_rate": 8.983209768861753e-05,
"loss": 1.2947,
"step": 4870
},
{
"epoch": 1.06,
"grad_norm": 1.0711029767990112,
"learning_rate": 8.98102921936328e-05,
"loss": 1.2723,
"step": 4880
},
{
"epoch": 1.06,
"grad_norm": 0.9394287467002869,
"learning_rate": 8.978848669864807e-05,
"loss": 1.2709,
"step": 4890
},
{
"epoch": 1.06,
"grad_norm": 1.0839319229125977,
"learning_rate": 8.976668120366332e-05,
"loss": 1.2892,
"step": 4900
},
{
"epoch": 1.07,
"grad_norm": 1.024117112159729,
"learning_rate": 8.974487570867859e-05,
"loss": 1.2627,
"step": 4910
},
{
"epoch": 1.07,
"grad_norm": 0.9055659174919128,
"learning_rate": 8.972307021369386e-05,
"loss": 1.2754,
"step": 4920
},
{
"epoch": 1.07,
"grad_norm": 0.9383713603019714,
"learning_rate": 8.970126471870913e-05,
"loss": 1.2713,
"step": 4930
},
{
"epoch": 1.07,
"grad_norm": 1.087470293045044,
"learning_rate": 8.967945922372439e-05,
"loss": 1.27,
"step": 4940
},
{
"epoch": 1.07,
"grad_norm": 0.9602554440498352,
"learning_rate": 8.965765372873964e-05,
"loss": 1.2829,
"step": 4950
},
{
"epoch": 1.08,
"grad_norm": 0.9457790851593018,
"learning_rate": 8.963584823375491e-05,
"loss": 1.2757,
"step": 4960
},
{
"epoch": 1.08,
"grad_norm": 0.8682853579521179,
"learning_rate": 8.961404273877018e-05,
"loss": 1.2662,
"step": 4970
},
{
"epoch": 1.08,
"grad_norm": 1.0000272989273071,
"learning_rate": 8.959223724378544e-05,
"loss": 1.2616,
"step": 4980
},
{
"epoch": 1.08,
"grad_norm": 1.0122287273406982,
"learning_rate": 8.95704317488007e-05,
"loss": 1.287,
"step": 4990
},
{
"epoch": 1.09,
"grad_norm": 0.9552735090255737,
"learning_rate": 8.954862625381597e-05,
"loss": 1.2782,
"step": 5000
},
{
"epoch": 1.09,
"grad_norm": 0.9103166460990906,
"learning_rate": 8.952682075883123e-05,
"loss": 1.2388,
"step": 5010
},
{
"epoch": 1.09,
"grad_norm": 1.0033226013183594,
"learning_rate": 8.950501526384649e-05,
"loss": 1.2762,
"step": 5020
},
{
"epoch": 1.09,
"grad_norm": 0.9572534561157227,
"learning_rate": 8.948320976886176e-05,
"loss": 1.2801,
"step": 5030
},
{
"epoch": 1.09,
"grad_norm": 0.9460912942886353,
"learning_rate": 8.946140427387702e-05,
"loss": 1.2651,
"step": 5040
},
{
"epoch": 1.1,
"grad_norm": 1.0236018896102905,
"learning_rate": 8.943959877889228e-05,
"loss": 1.2602,
"step": 5050
},
{
"epoch": 1.1,
"grad_norm": 1.0384821891784668,
"learning_rate": 8.941779328390754e-05,
"loss": 1.3027,
"step": 5060
},
{
"epoch": 1.1,
"grad_norm": 0.9547539949417114,
"learning_rate": 8.939598778892281e-05,
"loss": 1.2969,
"step": 5070
},
{
"epoch": 1.1,
"grad_norm": 0.9478334784507751,
"learning_rate": 8.937418229393808e-05,
"loss": 1.2829,
"step": 5080
},
{
"epoch": 1.11,
"grad_norm": 1.0621150732040405,
"learning_rate": 8.935237679895335e-05,
"loss": 1.2601,
"step": 5090
},
{
"epoch": 1.11,
"grad_norm": 0.9307476282119751,
"learning_rate": 8.93305713039686e-05,
"loss": 1.2656,
"step": 5100
},
{
"epoch": 1.11,
"grad_norm": 1.0189131498336792,
"learning_rate": 8.930876580898386e-05,
"loss": 1.2646,
"step": 5110
},
{
"epoch": 1.11,
"grad_norm": 1.1185131072998047,
"learning_rate": 8.928696031399914e-05,
"loss": 1.2785,
"step": 5120
},
{
"epoch": 1.11,
"grad_norm": 0.9753584265708923,
"learning_rate": 8.92651548190144e-05,
"loss": 1.2511,
"step": 5130
},
{
"epoch": 1.12,
"grad_norm": 1.0418280363082886,
"learning_rate": 8.924334932402966e-05,
"loss": 1.2537,
"step": 5140
},
{
"epoch": 1.12,
"grad_norm": 0.9717410802841187,
"learning_rate": 8.922154382904493e-05,
"loss": 1.2687,
"step": 5150
},
{
"epoch": 1.12,
"grad_norm": 0.988318681716919,
"learning_rate": 8.919973833406019e-05,
"loss": 1.2599,
"step": 5160
},
{
"epoch": 1.12,
"grad_norm": 0.9211105108261108,
"learning_rate": 8.917793283907545e-05,
"loss": 1.2646,
"step": 5170
},
{
"epoch": 1.12,
"grad_norm": 0.9481471180915833,
"learning_rate": 8.915612734409071e-05,
"loss": 1.271,
"step": 5180
},
{
"epoch": 1.13,
"grad_norm": 0.8939971923828125,
"learning_rate": 8.913432184910598e-05,
"loss": 1.2865,
"step": 5190
},
{
"epoch": 1.13,
"grad_norm": 0.9412124156951904,
"learning_rate": 8.911251635412124e-05,
"loss": 1.279,
"step": 5200
},
{
"epoch": 1.13,
"grad_norm": 0.9381204843521118,
"learning_rate": 8.90907108591365e-05,
"loss": 1.2813,
"step": 5210
},
{
"epoch": 1.13,
"grad_norm": 0.9502457976341248,
"learning_rate": 8.906890536415177e-05,
"loss": 1.2829,
"step": 5220
},
{
"epoch": 1.14,
"grad_norm": 1.0576632022857666,
"learning_rate": 8.904709986916704e-05,
"loss": 1.2708,
"step": 5230
},
{
"epoch": 1.14,
"grad_norm": 1.0302668809890747,
"learning_rate": 8.902529437418229e-05,
"loss": 1.2893,
"step": 5240
},
{
"epoch": 1.14,
"grad_norm": 0.9892765283584595,
"learning_rate": 8.900348887919756e-05,
"loss": 1.2691,
"step": 5250
},
{
"epoch": 1.14,
"grad_norm": 1.0383532047271729,
"learning_rate": 8.898168338421282e-05,
"loss": 1.2539,
"step": 5260
},
{
"epoch": 1.14,
"grad_norm": 0.9894425868988037,
"learning_rate": 8.89598778892281e-05,
"loss": 1.2838,
"step": 5270
},
{
"epoch": 1.15,
"grad_norm": 1.0066653490066528,
"learning_rate": 8.893807239424336e-05,
"loss": 1.2606,
"step": 5280
},
{
"epoch": 1.15,
"grad_norm": 1.0619821548461914,
"learning_rate": 8.891626689925862e-05,
"loss": 1.2724,
"step": 5290
},
{
"epoch": 1.15,
"grad_norm": 0.9619722962379456,
"learning_rate": 8.889446140427387e-05,
"loss": 1.2783,
"step": 5300
},
{
"epoch": 1.15,
"grad_norm": 0.8887227177619934,
"learning_rate": 8.887265590928915e-05,
"loss": 1.264,
"step": 5310
},
{
"epoch": 1.15,
"grad_norm": 1.0262665748596191,
"learning_rate": 8.885085041430441e-05,
"loss": 1.2482,
"step": 5320
},
{
"epoch": 1.16,
"grad_norm": 1.016381859779358,
"learning_rate": 8.882904491931967e-05,
"loss": 1.2523,
"step": 5330
},
{
"epoch": 1.16,
"grad_norm": 0.9932143092155457,
"learning_rate": 8.880723942433494e-05,
"loss": 1.2516,
"step": 5340
},
{
"epoch": 1.16,
"grad_norm": 0.9815816283226013,
"learning_rate": 8.87854339293502e-05,
"loss": 1.2574,
"step": 5350
},
{
"epoch": 1.16,
"grad_norm": 1.0072325468063354,
"learning_rate": 8.876362843436546e-05,
"loss": 1.2688,
"step": 5360
},
{
"epoch": 1.17,
"grad_norm": 0.9834664463996887,
"learning_rate": 8.874182293938073e-05,
"loss": 1.2632,
"step": 5370
},
{
"epoch": 1.17,
"grad_norm": 1.0800156593322754,
"learning_rate": 8.8720017444396e-05,
"loss": 1.2767,
"step": 5380
},
{
"epoch": 1.17,
"grad_norm": 0.9449285268783569,
"learning_rate": 8.869821194941125e-05,
"loss": 1.2667,
"step": 5390
},
{
"epoch": 1.17,
"grad_norm": 1.1136956214904785,
"learning_rate": 8.867640645442652e-05,
"loss": 1.2506,
"step": 5400
},
{
"epoch": 1.17,
"grad_norm": 0.9061567783355713,
"learning_rate": 8.865460095944178e-05,
"loss": 1.2658,
"step": 5410
},
{
"epoch": 1.18,
"grad_norm": 1.00759756565094,
"learning_rate": 8.863279546445705e-05,
"loss": 1.285,
"step": 5420
},
{
"epoch": 1.18,
"grad_norm": 1.0507421493530273,
"learning_rate": 8.861098996947232e-05,
"loss": 1.277,
"step": 5430
},
{
"epoch": 1.18,
"grad_norm": 1.0796302556991577,
"learning_rate": 8.858918447448758e-05,
"loss": 1.2604,
"step": 5440
},
{
"epoch": 1.18,
"grad_norm": 1.0264052152633667,
"learning_rate": 8.856737897950283e-05,
"loss": 1.2747,
"step": 5450
},
{
"epoch": 1.19,
"grad_norm": 0.9274656176567078,
"learning_rate": 8.854557348451811e-05,
"loss": 1.2617,
"step": 5460
},
{
"epoch": 1.19,
"grad_norm": 1.0233980417251587,
"learning_rate": 8.852376798953337e-05,
"loss": 1.2787,
"step": 5470
},
{
"epoch": 1.19,
"grad_norm": 0.9718747138977051,
"learning_rate": 8.850196249454863e-05,
"loss": 1.2511,
"step": 5480
},
{
"epoch": 1.19,
"grad_norm": 1.0765981674194336,
"learning_rate": 8.84801569995639e-05,
"loss": 1.2794,
"step": 5490
},
{
"epoch": 1.19,
"grad_norm": 1.048608660697937,
"learning_rate": 8.845835150457916e-05,
"loss": 1.2597,
"step": 5500
},
{
"epoch": 1.2,
"grad_norm": 0.9524050354957581,
"learning_rate": 8.843654600959442e-05,
"loss": 1.246,
"step": 5510
},
{
"epoch": 1.2,
"grad_norm": 0.9819397926330566,
"learning_rate": 8.841474051460969e-05,
"loss": 1.2732,
"step": 5520
},
{
"epoch": 1.2,
"grad_norm": 0.914893388748169,
"learning_rate": 8.839293501962495e-05,
"loss": 1.2694,
"step": 5530
},
{
"epoch": 1.2,
"grad_norm": 0.9561071395874023,
"learning_rate": 8.837112952464021e-05,
"loss": 1.2642,
"step": 5540
},
{
"epoch": 1.2,
"grad_norm": 0.9841814637184143,
"learning_rate": 8.834932402965547e-05,
"loss": 1.2684,
"step": 5550
},
{
"epoch": 1.21,
"grad_norm": 0.931611955165863,
"learning_rate": 8.832751853467074e-05,
"loss": 1.2751,
"step": 5560
},
{
"epoch": 1.21,
"grad_norm": 1.0068223476409912,
"learning_rate": 8.8305713039686e-05,
"loss": 1.2589,
"step": 5570
},
{
"epoch": 1.21,
"grad_norm": 1.088884711265564,
"learning_rate": 8.828390754470128e-05,
"loss": 1.2606,
"step": 5580
},
{
"epoch": 1.21,
"grad_norm": 0.9682032465934753,
"learning_rate": 8.826210204971653e-05,
"loss": 1.2467,
"step": 5590
},
{
"epoch": 1.22,
"grad_norm": 1.0218122005462646,
"learning_rate": 8.824029655473179e-05,
"loss": 1.2684,
"step": 5600
},
{
"epoch": 1.22,
"grad_norm": 0.9690065979957581,
"learning_rate": 8.821849105974707e-05,
"loss": 1.2906,
"step": 5610
},
{
"epoch": 1.22,
"grad_norm": 0.9736804366111755,
"learning_rate": 8.819668556476233e-05,
"loss": 1.2682,
"step": 5620
},
{
"epoch": 1.22,
"grad_norm": 1.0571842193603516,
"learning_rate": 8.817488006977759e-05,
"loss": 1.247,
"step": 5630
},
{
"epoch": 1.22,
"grad_norm": 1.1925692558288574,
"learning_rate": 8.815307457479286e-05,
"loss": 1.28,
"step": 5640
},
{
"epoch": 1.23,
"grad_norm": 0.8674301505088806,
"learning_rate": 8.813126907980812e-05,
"loss": 1.2699,
"step": 5650
},
{
"epoch": 1.23,
"grad_norm": 1.030501127243042,
"learning_rate": 8.810946358482338e-05,
"loss": 1.2455,
"step": 5660
},
{
"epoch": 1.23,
"grad_norm": 1.0425055027008057,
"learning_rate": 8.808765808983864e-05,
"loss": 1.2802,
"step": 5670
},
{
"epoch": 1.23,
"grad_norm": 0.9576709866523743,
"learning_rate": 8.806585259485391e-05,
"loss": 1.2584,
"step": 5680
},
{
"epoch": 1.24,
"grad_norm": 0.9852989912033081,
"learning_rate": 8.804404709986917e-05,
"loss": 1.2707,
"step": 5690
},
{
"epoch": 1.24,
"grad_norm": 1.0519157648086548,
"learning_rate": 8.802224160488443e-05,
"loss": 1.2647,
"step": 5700
},
{
"epoch": 1.24,
"grad_norm": 1.1391375064849854,
"learning_rate": 8.80004361098997e-05,
"loss": 1.2459,
"step": 5710
},
{
"epoch": 1.24,
"grad_norm": 1.295246958732605,
"learning_rate": 8.797863061491496e-05,
"loss": 1.2708,
"step": 5720
},
{
"epoch": 1.24,
"grad_norm": 0.9388042688369751,
"learning_rate": 8.795682511993024e-05,
"loss": 1.2761,
"step": 5730
},
{
"epoch": 1.25,
"grad_norm": 0.8345937728881836,
"learning_rate": 8.793501962494549e-05,
"loss": 1.2641,
"step": 5740
},
{
"epoch": 1.25,
"grad_norm": 0.9559466242790222,
"learning_rate": 8.791321412996075e-05,
"loss": 1.2608,
"step": 5750
},
{
"epoch": 1.25,
"grad_norm": 0.9135338068008423,
"learning_rate": 8.789140863497601e-05,
"loss": 1.245,
"step": 5760
},
{
"epoch": 1.25,
"grad_norm": 1.0820287466049194,
"learning_rate": 8.786960313999129e-05,
"loss": 1.2549,
"step": 5770
},
{
"epoch": 1.25,
"grad_norm": 1.05925714969635,
"learning_rate": 8.784779764500655e-05,
"loss": 1.2493,
"step": 5780
},
{
"epoch": 1.26,
"grad_norm": 1.0629942417144775,
"learning_rate": 8.782599215002181e-05,
"loss": 1.2803,
"step": 5790
},
{
"epoch": 1.26,
"grad_norm": 1.021894097328186,
"learning_rate": 8.780418665503706e-05,
"loss": 1.264,
"step": 5800
},
{
"epoch": 1.26,
"grad_norm": 0.9319231510162354,
"learning_rate": 8.778238116005234e-05,
"loss": 1.2757,
"step": 5810
},
{
"epoch": 1.26,
"grad_norm": 0.9403659701347351,
"learning_rate": 8.77605756650676e-05,
"loss": 1.2601,
"step": 5820
},
{
"epoch": 1.27,
"grad_norm": 1.0411070585250854,
"learning_rate": 8.773877017008287e-05,
"loss": 1.2747,
"step": 5830
},
{
"epoch": 1.27,
"grad_norm": 0.9437740445137024,
"learning_rate": 8.771696467509813e-05,
"loss": 1.2771,
"step": 5840
},
{
"epoch": 1.27,
"grad_norm": 1.0971676111221313,
"learning_rate": 8.769515918011339e-05,
"loss": 1.2631,
"step": 5850
},
{
"epoch": 1.27,
"grad_norm": 1.0248700380325317,
"learning_rate": 8.767335368512866e-05,
"loss": 1.255,
"step": 5860
},
{
"epoch": 1.27,
"grad_norm": 1.1890584230422974,
"learning_rate": 8.765154819014392e-05,
"loss": 1.265,
"step": 5870
},
{
"epoch": 1.28,
"grad_norm": 1.1310992240905762,
"learning_rate": 8.76297426951592e-05,
"loss": 1.2786,
"step": 5880
},
{
"epoch": 1.28,
"grad_norm": 0.95496666431427,
"learning_rate": 8.760793720017444e-05,
"loss": 1.2534,
"step": 5890
},
{
"epoch": 1.28,
"grad_norm": 1.0427186489105225,
"learning_rate": 8.758613170518971e-05,
"loss": 1.2767,
"step": 5900
},
{
"epoch": 1.28,
"grad_norm": 0.879298985004425,
"learning_rate": 8.756432621020497e-05,
"loss": 1.2453,
"step": 5910
},
{
"epoch": 1.29,
"grad_norm": 0.9911447167396545,
"learning_rate": 8.754252071522025e-05,
"loss": 1.248,
"step": 5920
},
{
"epoch": 1.29,
"grad_norm": 0.9124498963356018,
"learning_rate": 8.752071522023551e-05,
"loss": 1.2588,
"step": 5930
},
{
"epoch": 1.29,
"grad_norm": 0.9397348761558533,
"learning_rate": 8.749890972525076e-05,
"loss": 1.2822,
"step": 5940
},
{
"epoch": 1.29,
"grad_norm": 1.0716569423675537,
"learning_rate": 8.747710423026602e-05,
"loss": 1.2483,
"step": 5950
},
{
"epoch": 1.29,
"grad_norm": 0.8869634866714478,
"learning_rate": 8.74552987352813e-05,
"loss": 1.2752,
"step": 5960
},
{
"epoch": 1.3,
"grad_norm": 0.9538241028785706,
"learning_rate": 8.743349324029656e-05,
"loss": 1.2627,
"step": 5970
},
{
"epoch": 1.3,
"grad_norm": 0.9991753697395325,
"learning_rate": 8.741168774531183e-05,
"loss": 1.2718,
"step": 5980
},
{
"epoch": 1.3,
"grad_norm": 1.0785272121429443,
"learning_rate": 8.738988225032709e-05,
"loss": 1.2826,
"step": 5990
},
{
"epoch": 1.3,
"grad_norm": 1.002681851387024,
"learning_rate": 8.736807675534235e-05,
"loss": 1.2659,
"step": 6000
},
{
"epoch": 1.3,
"grad_norm": 0.9270432591438293,
"learning_rate": 8.734627126035761e-05,
"loss": 1.2493,
"step": 6010
},
{
"epoch": 1.31,
"grad_norm": 1.143751621246338,
"learning_rate": 8.732446576537288e-05,
"loss": 1.2965,
"step": 6020
},
{
"epoch": 1.31,
"grad_norm": 0.9666625261306763,
"learning_rate": 8.730266027038814e-05,
"loss": 1.2553,
"step": 6030
},
{
"epoch": 1.31,
"grad_norm": 0.9400457739830017,
"learning_rate": 8.72808547754034e-05,
"loss": 1.2657,
"step": 6040
},
{
"epoch": 1.31,
"grad_norm": 0.9232240319252014,
"learning_rate": 8.725904928041867e-05,
"loss": 1.2494,
"step": 6050
},
{
"epoch": 1.32,
"grad_norm": 0.9295173287391663,
"learning_rate": 8.723724378543393e-05,
"loss": 1.2496,
"step": 6060
},
{
"epoch": 1.32,
"grad_norm": 1.293441653251648,
"learning_rate": 8.72154382904492e-05,
"loss": 1.2578,
"step": 6070
},
{
"epoch": 1.32,
"grad_norm": 0.9575563669204712,
"learning_rate": 8.719363279546447e-05,
"loss": 1.2323,
"step": 6080
},
{
"epoch": 1.32,
"grad_norm": 1.0204386711120605,
"learning_rate": 8.717182730047972e-05,
"loss": 1.2652,
"step": 6090
},
{
"epoch": 1.32,
"grad_norm": 0.9446994066238403,
"learning_rate": 8.715002180549498e-05,
"loss": 1.2568,
"step": 6100
},
{
"epoch": 1.33,
"grad_norm": 1.0751984119415283,
"learning_rate": 8.712821631051026e-05,
"loss": 1.2806,
"step": 6110
},
{
"epoch": 1.33,
"grad_norm": 0.9466795921325684,
"learning_rate": 8.710641081552552e-05,
"loss": 1.2416,
"step": 6120
},
{
"epoch": 1.33,
"grad_norm": 1.1114068031311035,
"learning_rate": 8.708460532054078e-05,
"loss": 1.2405,
"step": 6130
},
{
"epoch": 1.33,
"grad_norm": 0.9612728953361511,
"learning_rate": 8.706279982555605e-05,
"loss": 1.2655,
"step": 6140
},
{
"epoch": 1.34,
"grad_norm": 0.9728400707244873,
"learning_rate": 8.704099433057131e-05,
"loss": 1.2654,
"step": 6150
},
{
"epoch": 1.34,
"grad_norm": 1.0217069387435913,
"learning_rate": 8.701918883558657e-05,
"loss": 1.2804,
"step": 6160
},
{
"epoch": 1.34,
"grad_norm": 0.9358672499656677,
"learning_rate": 8.699738334060184e-05,
"loss": 1.282,
"step": 6170
},
{
"epoch": 1.34,
"grad_norm": 0.875811755657196,
"learning_rate": 8.69755778456171e-05,
"loss": 1.2974,
"step": 6180
},
{
"epoch": 1.34,
"grad_norm": 0.9315816760063171,
"learning_rate": 8.695377235063236e-05,
"loss": 1.2515,
"step": 6190
},
{
"epoch": 1.35,
"grad_norm": 0.9914236664772034,
"learning_rate": 8.693196685564763e-05,
"loss": 1.2438,
"step": 6200
},
{
"epoch": 1.35,
"grad_norm": 0.9291836023330688,
"learning_rate": 8.691016136066289e-05,
"loss": 1.2794,
"step": 6210
},
{
"epoch": 1.35,
"grad_norm": 1.036189317703247,
"learning_rate": 8.688835586567815e-05,
"loss": 1.2497,
"step": 6220
},
{
"epoch": 1.35,
"grad_norm": 1.1179789304733276,
"learning_rate": 8.686655037069343e-05,
"loss": 1.2627,
"step": 6230
},
{
"epoch": 1.35,
"grad_norm": 1.0586695671081543,
"learning_rate": 8.684474487570868e-05,
"loss": 1.2611,
"step": 6240
},
{
"epoch": 1.36,
"grad_norm": 0.9113835692405701,
"learning_rate": 8.682293938072394e-05,
"loss": 1.2671,
"step": 6250
},
{
"epoch": 1.36,
"grad_norm": 0.911665141582489,
"learning_rate": 8.68011338857392e-05,
"loss": 1.2425,
"step": 6260
},
{
"epoch": 1.36,
"grad_norm": 1.016471266746521,
"learning_rate": 8.677932839075448e-05,
"loss": 1.2672,
"step": 6270
},
{
"epoch": 1.36,
"grad_norm": 1.0666197538375854,
"learning_rate": 8.675752289576974e-05,
"loss": 1.2647,
"step": 6280
},
{
"epoch": 1.37,
"grad_norm": 1.042350172996521,
"learning_rate": 8.673571740078499e-05,
"loss": 1.2211,
"step": 6290
},
{
"epoch": 1.37,
"grad_norm": 0.9714857339859009,
"learning_rate": 8.671391190580027e-05,
"loss": 1.2698,
"step": 6300
},
{
"epoch": 1.37,
"grad_norm": 0.9044662714004517,
"learning_rate": 8.669210641081553e-05,
"loss": 1.2753,
"step": 6310
},
{
"epoch": 1.37,
"grad_norm": 0.8921557664871216,
"learning_rate": 8.66703009158308e-05,
"loss": 1.2528,
"step": 6320
},
{
"epoch": 1.37,
"grad_norm": 0.9644028544425964,
"learning_rate": 8.664849542084606e-05,
"loss": 1.2642,
"step": 6330
},
{
"epoch": 1.38,
"grad_norm": 1.0202399492263794,
"learning_rate": 8.662668992586132e-05,
"loss": 1.2473,
"step": 6340
},
{
"epoch": 1.38,
"grad_norm": 1.0238714218139648,
"learning_rate": 8.660488443087658e-05,
"loss": 1.256,
"step": 6350
},
{
"epoch": 1.38,
"grad_norm": 1.1190308332443237,
"learning_rate": 8.658307893589185e-05,
"loss": 1.2579,
"step": 6360
},
{
"epoch": 1.38,
"grad_norm": 0.9763012528419495,
"learning_rate": 8.656127344090711e-05,
"loss": 1.2607,
"step": 6370
},
{
"epoch": 1.39,
"grad_norm": 0.9133914709091187,
"learning_rate": 8.653946794592239e-05,
"loss": 1.2685,
"step": 6380
},
{
"epoch": 1.39,
"grad_norm": 0.9674580693244934,
"learning_rate": 8.651766245093764e-05,
"loss": 1.2533,
"step": 6390
},
{
"epoch": 1.39,
"grad_norm": 1.1029064655303955,
"learning_rate": 8.64958569559529e-05,
"loss": 1.2487,
"step": 6400
},
{
"epoch": 1.39,
"grad_norm": 0.9458103775978088,
"learning_rate": 8.647405146096816e-05,
"loss": 1.2677,
"step": 6410
},
{
"epoch": 1.39,
"grad_norm": 1.1092442274093628,
"learning_rate": 8.645224596598344e-05,
"loss": 1.2624,
"step": 6420
},
{
"epoch": 1.4,
"grad_norm": 1.1490038633346558,
"learning_rate": 8.64304404709987e-05,
"loss": 1.2566,
"step": 6430
},
{
"epoch": 1.4,
"grad_norm": 0.9747464060783386,
"learning_rate": 8.640863497601395e-05,
"loss": 1.2571,
"step": 6440
},
{
"epoch": 1.4,
"grad_norm": 1.1297920942306519,
"learning_rate": 8.638682948102921e-05,
"loss": 1.2327,
"step": 6450
},
{
"epoch": 1.4,
"grad_norm": 0.9675096869468689,
"learning_rate": 8.636502398604449e-05,
"loss": 1.2327,
"step": 6460
},
{
"epoch": 1.4,
"grad_norm": 0.9282464385032654,
"learning_rate": 8.634321849105975e-05,
"loss": 1.2323,
"step": 6470
},
{
"epoch": 1.41,
"grad_norm": 1.011017918586731,
"learning_rate": 8.632141299607502e-05,
"loss": 1.2429,
"step": 6480
},
{
"epoch": 1.41,
"grad_norm": 1.02436363697052,
"learning_rate": 8.629960750109028e-05,
"loss": 1.2382,
"step": 6490
},
{
"epoch": 1.41,
"grad_norm": 1.0600727796554565,
"learning_rate": 8.627780200610554e-05,
"loss": 1.2689,
"step": 6500
},
{
"epoch": 1.41,
"grad_norm": 0.9400041103363037,
"learning_rate": 8.62559965111208e-05,
"loss": 1.2804,
"step": 6510
},
{
"epoch": 1.42,
"grad_norm": 1.156300663948059,
"learning_rate": 8.623419101613607e-05,
"loss": 1.2596,
"step": 6520
},
{
"epoch": 1.42,
"grad_norm": 0.9240378141403198,
"learning_rate": 8.621238552115133e-05,
"loss": 1.24,
"step": 6530
},
{
"epoch": 1.42,
"grad_norm": 0.8798494338989258,
"learning_rate": 8.61905800261666e-05,
"loss": 1.2526,
"step": 6540
},
{
"epoch": 1.42,
"grad_norm": 0.9512797594070435,
"learning_rate": 8.616877453118186e-05,
"loss": 1.2602,
"step": 6550
},
{
"epoch": 1.42,
"grad_norm": 0.9985531568527222,
"learning_rate": 8.614696903619712e-05,
"loss": 1.2616,
"step": 6560
},
{
"epoch": 1.43,
"grad_norm": 1.134756088256836,
"learning_rate": 8.61251635412124e-05,
"loss": 1.2688,
"step": 6570
},
{
"epoch": 1.43,
"grad_norm": 0.9372296333312988,
"learning_rate": 8.610335804622766e-05,
"loss": 1.2538,
"step": 6580
},
{
"epoch": 1.43,
"grad_norm": 1.011887788772583,
"learning_rate": 8.608155255124291e-05,
"loss": 1.246,
"step": 6590
},
{
"epoch": 1.43,
"grad_norm": 0.9553661346435547,
"learning_rate": 8.605974705625817e-05,
"loss": 1.2502,
"step": 6600
},
{
"epoch": 1.44,
"grad_norm": 0.9924313426017761,
"learning_rate": 8.603794156127345e-05,
"loss": 1.2362,
"step": 6610
},
{
"epoch": 1.44,
"grad_norm": 1.05217707157135,
"learning_rate": 8.601613606628871e-05,
"loss": 1.2655,
"step": 6620
},
{
"epoch": 1.44,
"grad_norm": 1.0302504301071167,
"learning_rate": 8.599433057130398e-05,
"loss": 1.2699,
"step": 6630
},
{
"epoch": 1.44,
"grad_norm": 1.043373942375183,
"learning_rate": 8.597252507631923e-05,
"loss": 1.2532,
"step": 6640
},
{
"epoch": 1.44,
"grad_norm": 0.9535781145095825,
"learning_rate": 8.59507195813345e-05,
"loss": 1.2586,
"step": 6650
},
{
"epoch": 1.45,
"grad_norm": 1.1055347919464111,
"learning_rate": 8.592891408634977e-05,
"loss": 1.2632,
"step": 6660
},
{
"epoch": 1.45,
"grad_norm": 1.0888850688934326,
"learning_rate": 8.590710859136503e-05,
"loss": 1.2497,
"step": 6670
},
{
"epoch": 1.45,
"grad_norm": 0.9970211386680603,
"learning_rate": 8.588530309638029e-05,
"loss": 1.2869,
"step": 6680
},
{
"epoch": 1.45,
"grad_norm": 1.0836609601974487,
"learning_rate": 8.586349760139555e-05,
"loss": 1.2321,
"step": 6690
},
{
"epoch": 1.45,
"grad_norm": 0.9511786103248596,
"learning_rate": 8.584169210641082e-05,
"loss": 1.2562,
"step": 6700
},
{
"epoch": 1.46,
"grad_norm": 1.088644027709961,
"learning_rate": 8.581988661142608e-05,
"loss": 1.2418,
"step": 6710
},
{
"epoch": 1.46,
"grad_norm": 1.0465929508209229,
"learning_rate": 8.579808111644134e-05,
"loss": 1.2608,
"step": 6720
},
{
"epoch": 1.46,
"grad_norm": 1.12638521194458,
"learning_rate": 8.577627562145662e-05,
"loss": 1.2725,
"step": 6730
},
{
"epoch": 1.46,
"grad_norm": 1.171322226524353,
"learning_rate": 8.575447012647187e-05,
"loss": 1.265,
"step": 6740
},
{
"epoch": 1.47,
"grad_norm": 0.926113486289978,
"learning_rate": 8.573266463148713e-05,
"loss": 1.2559,
"step": 6750
},
{
"epoch": 1.47,
"grad_norm": 0.9716551899909973,
"learning_rate": 8.57108591365024e-05,
"loss": 1.2568,
"step": 6760
},
{
"epoch": 1.47,
"grad_norm": 1.0213953256607056,
"learning_rate": 8.568905364151767e-05,
"loss": 1.2649,
"step": 6770
},
{
"epoch": 1.47,
"grad_norm": 0.9643402099609375,
"learning_rate": 8.566724814653294e-05,
"loss": 1.2433,
"step": 6780
},
{
"epoch": 1.47,
"grad_norm": 1.0367106199264526,
"learning_rate": 8.564544265154819e-05,
"loss": 1.2356,
"step": 6790
},
{
"epoch": 1.48,
"grad_norm": 0.9655973315238953,
"learning_rate": 8.562363715656346e-05,
"loss": 1.2439,
"step": 6800
},
{
"epoch": 1.48,
"grad_norm": 1.0422053337097168,
"learning_rate": 8.560183166157872e-05,
"loss": 1.2528,
"step": 6810
},
{
"epoch": 1.48,
"grad_norm": 0.9676966071128845,
"learning_rate": 8.558002616659399e-05,
"loss": 1.2577,
"step": 6820
},
{
"epoch": 1.48,
"grad_norm": 0.9732950329780579,
"learning_rate": 8.555822067160925e-05,
"loss": 1.2513,
"step": 6830
},
{
"epoch": 1.48,
"grad_norm": 1.0636634826660156,
"learning_rate": 8.553641517662451e-05,
"loss": 1.2694,
"step": 6840
},
{
"epoch": 1.49,
"grad_norm": 0.9392173290252686,
"learning_rate": 8.551460968163978e-05,
"loss": 1.2478,
"step": 6850
},
{
"epoch": 1.49,
"grad_norm": 0.9402878880500793,
"learning_rate": 8.549280418665504e-05,
"loss": 1.2528,
"step": 6860
},
{
"epoch": 1.49,
"grad_norm": 1.0256085395812988,
"learning_rate": 8.54709986916703e-05,
"loss": 1.2704,
"step": 6870
},
{
"epoch": 1.49,
"grad_norm": 1.0600332021713257,
"learning_rate": 8.544919319668558e-05,
"loss": 1.2338,
"step": 6880
},
{
"epoch": 1.5,
"grad_norm": 1.0218205451965332,
"learning_rate": 8.542738770170083e-05,
"loss": 1.2839,
"step": 6890
},
{
"epoch": 1.5,
"grad_norm": 0.8786155581474304,
"learning_rate": 8.540558220671609e-05,
"loss": 1.248,
"step": 6900
},
{
"epoch": 1.5,
"grad_norm": 0.9721015095710754,
"learning_rate": 8.538377671173136e-05,
"loss": 1.2734,
"step": 6910
},
{
"epoch": 1.5,
"grad_norm": 0.9734498858451843,
"learning_rate": 8.536197121674663e-05,
"loss": 1.2454,
"step": 6920
},
{
"epoch": 1.5,
"grad_norm": 0.9616742730140686,
"learning_rate": 8.53401657217619e-05,
"loss": 1.2565,
"step": 6930
},
{
"epoch": 1.51,
"grad_norm": 1.153671383857727,
"learning_rate": 8.531836022677714e-05,
"loss": 1.2549,
"step": 6940
},
{
"epoch": 1.51,
"grad_norm": 0.9344118237495422,
"learning_rate": 8.529655473179241e-05,
"loss": 1.2431,
"step": 6950
},
{
"epoch": 1.51,
"grad_norm": 1.0228878259658813,
"learning_rate": 8.527474923680768e-05,
"loss": 1.2276,
"step": 6960
},
{
"epoch": 1.51,
"grad_norm": 1.088304042816162,
"learning_rate": 8.525294374182295e-05,
"loss": 1.2423,
"step": 6970
},
{
"epoch": 1.52,
"grad_norm": 0.9886937737464905,
"learning_rate": 8.523113824683821e-05,
"loss": 1.2693,
"step": 6980
},
{
"epoch": 1.52,
"grad_norm": 0.8818524479866028,
"learning_rate": 8.520933275185346e-05,
"loss": 1.2424,
"step": 6990
},
{
"epoch": 1.52,
"grad_norm": 0.9912683963775635,
"learning_rate": 8.518752725686874e-05,
"loss": 1.2522,
"step": 7000
},
{
"epoch": 1.52,
"grad_norm": 0.9952061176300049,
"learning_rate": 8.5165721761884e-05,
"loss": 1.2519,
"step": 7010
},
{
"epoch": 1.52,
"grad_norm": 1.035301923751831,
"learning_rate": 8.514391626689926e-05,
"loss": 1.2501,
"step": 7020
},
{
"epoch": 1.53,
"grad_norm": 1.0349431037902832,
"learning_rate": 8.512211077191452e-05,
"loss": 1.2451,
"step": 7030
},
{
"epoch": 1.53,
"grad_norm": 0.9751808643341064,
"learning_rate": 8.510030527692979e-05,
"loss": 1.2381,
"step": 7040
},
{
"epoch": 1.53,
"grad_norm": 0.896840512752533,
"learning_rate": 8.507849978194505e-05,
"loss": 1.2509,
"step": 7050
},
{
"epoch": 1.53,
"grad_norm": 1.074179768562317,
"learning_rate": 8.505669428696031e-05,
"loss": 1.2439,
"step": 7060
},
{
"epoch": 1.53,
"grad_norm": 1.0536302328109741,
"learning_rate": 8.503488879197559e-05,
"loss": 1.2795,
"step": 7070
},
{
"epoch": 1.54,
"grad_norm": 0.9011424779891968,
"learning_rate": 8.501308329699085e-05,
"loss": 1.2418,
"step": 7080
},
{
"epoch": 1.54,
"grad_norm": 0.9322314262390137,
"learning_rate": 8.49912778020061e-05,
"loss": 1.2576,
"step": 7090
},
{
"epoch": 1.54,
"grad_norm": 0.9793155193328857,
"learning_rate": 8.496947230702137e-05,
"loss": 1.2492,
"step": 7100
},
{
"epoch": 1.54,
"grad_norm": 0.9420814514160156,
"learning_rate": 8.494766681203664e-05,
"loss": 1.2373,
"step": 7110
},
{
"epoch": 1.55,
"grad_norm": 0.8934997320175171,
"learning_rate": 8.49258613170519e-05,
"loss": 1.2433,
"step": 7120
},
{
"epoch": 1.55,
"grad_norm": 1.0100373029708862,
"learning_rate": 8.490405582206717e-05,
"loss": 1.2397,
"step": 7130
},
{
"epoch": 1.55,
"grad_norm": 0.9812464118003845,
"learning_rate": 8.488225032708242e-05,
"loss": 1.2536,
"step": 7140
},
{
"epoch": 1.55,
"grad_norm": 1.0419830083847046,
"learning_rate": 8.48604448320977e-05,
"loss": 1.2531,
"step": 7150
},
{
"epoch": 1.55,
"grad_norm": 1.0287178754806519,
"learning_rate": 8.483863933711296e-05,
"loss": 1.2853,
"step": 7160
},
{
"epoch": 1.56,
"grad_norm": 0.9258010983467102,
"learning_rate": 8.481683384212822e-05,
"loss": 1.2384,
"step": 7170
},
{
"epoch": 1.56,
"grad_norm": 1.0923179388046265,
"learning_rate": 8.479502834714348e-05,
"loss": 1.2388,
"step": 7180
},
{
"epoch": 1.56,
"grad_norm": 1.026920199394226,
"learning_rate": 8.477322285215875e-05,
"loss": 1.2403,
"step": 7190
},
{
"epoch": 1.56,
"grad_norm": 1.071996808052063,
"learning_rate": 8.475141735717401e-05,
"loss": 1.257,
"step": 7200
},
{
"epoch": 1.57,
"grad_norm": 1.0824863910675049,
"learning_rate": 8.472961186218927e-05,
"loss": 1.2358,
"step": 7210
},
{
"epoch": 1.57,
"grad_norm": 1.006395697593689,
"learning_rate": 8.470780636720454e-05,
"loss": 1.2675,
"step": 7220
},
{
"epoch": 1.57,
"grad_norm": 0.9629374146461487,
"learning_rate": 8.468600087221981e-05,
"loss": 1.2377,
"step": 7230
},
{
"epoch": 1.57,
"grad_norm": 0.9439448714256287,
"learning_rate": 8.466419537723506e-05,
"loss": 1.2269,
"step": 7240
},
{
"epoch": 1.57,
"grad_norm": 0.9413838386535645,
"learning_rate": 8.464238988225033e-05,
"loss": 1.248,
"step": 7250
},
{
"epoch": 1.58,
"grad_norm": 0.9353733658790588,
"learning_rate": 8.46205843872656e-05,
"loss": 1.2535,
"step": 7260
},
{
"epoch": 1.58,
"grad_norm": 1.0403653383255005,
"learning_rate": 8.459877889228086e-05,
"loss": 1.2323,
"step": 7270
},
{
"epoch": 1.58,
"grad_norm": 0.8675696849822998,
"learning_rate": 8.457697339729613e-05,
"loss": 1.2712,
"step": 7280
},
{
"epoch": 1.58,
"grad_norm": 0.9282375574111938,
"learning_rate": 8.455516790231138e-05,
"loss": 1.2259,
"step": 7290
},
{
"epoch": 1.58,
"grad_norm": 0.9778069853782654,
"learning_rate": 8.453336240732665e-05,
"loss": 1.2499,
"step": 7300
},
{
"epoch": 1.59,
"grad_norm": 1.0154436826705933,
"learning_rate": 8.451155691234192e-05,
"loss": 1.2253,
"step": 7310
},
{
"epoch": 1.59,
"grad_norm": 0.9822314381599426,
"learning_rate": 8.448975141735718e-05,
"loss": 1.2583,
"step": 7320
},
{
"epoch": 1.59,
"grad_norm": 1.0584256649017334,
"learning_rate": 8.446794592237244e-05,
"loss": 1.2682,
"step": 7330
},
{
"epoch": 1.59,
"grad_norm": 1.035949945449829,
"learning_rate": 8.44461404273877e-05,
"loss": 1.2604,
"step": 7340
},
{
"epoch": 1.6,
"grad_norm": 0.9688887596130371,
"learning_rate": 8.442433493240297e-05,
"loss": 1.2308,
"step": 7350
},
{
"epoch": 1.6,
"grad_norm": 1.0668280124664307,
"learning_rate": 8.440252943741823e-05,
"loss": 1.2523,
"step": 7360
},
{
"epoch": 1.6,
"grad_norm": 1.0507837533950806,
"learning_rate": 8.43807239424335e-05,
"loss": 1.2458,
"step": 7370
},
{
"epoch": 1.6,
"grad_norm": 0.9705730676651001,
"learning_rate": 8.435891844744876e-05,
"loss": 1.2623,
"step": 7380
},
{
"epoch": 1.6,
"grad_norm": 1.1198492050170898,
"learning_rate": 8.433711295246402e-05,
"loss": 1.2263,
"step": 7390
},
{
"epoch": 1.61,
"grad_norm": 1.090376853942871,
"learning_rate": 8.431530745747928e-05,
"loss": 1.2549,
"step": 7400
},
{
"epoch": 1.61,
"grad_norm": 0.9599369764328003,
"learning_rate": 8.429350196249455e-05,
"loss": 1.2453,
"step": 7410
},
{
"epoch": 1.61,
"grad_norm": 0.9473201036453247,
"learning_rate": 8.427169646750982e-05,
"loss": 1.2449,
"step": 7420
},
{
"epoch": 1.61,
"grad_norm": 1.0158095359802246,
"learning_rate": 8.424989097252509e-05,
"loss": 1.2395,
"step": 7430
},
{
"epoch": 1.62,
"grad_norm": 1.1401153802871704,
"learning_rate": 8.422808547754034e-05,
"loss": 1.2426,
"step": 7440
},
{
"epoch": 1.62,
"grad_norm": 0.9833976030349731,
"learning_rate": 8.42062799825556e-05,
"loss": 1.2238,
"step": 7450
},
{
"epoch": 1.62,
"grad_norm": 1.0531307458877563,
"learning_rate": 8.418447448757088e-05,
"loss": 1.2286,
"step": 7460
},
{
"epoch": 1.62,
"grad_norm": 0.9833014607429504,
"learning_rate": 8.416266899258614e-05,
"loss": 1.2483,
"step": 7470
},
{
"epoch": 1.62,
"grad_norm": 1.0215846300125122,
"learning_rate": 8.41408634976014e-05,
"loss": 1.2434,
"step": 7480
},
{
"epoch": 1.63,
"grad_norm": 0.9338911175727844,
"learning_rate": 8.411905800261667e-05,
"loss": 1.2263,
"step": 7490
},
{
"epoch": 1.63,
"grad_norm": 0.9091663360595703,
"learning_rate": 8.409725250763193e-05,
"loss": 1.2359,
"step": 7500
},
{
"epoch": 1.63,
"grad_norm": 0.9303663969039917,
"learning_rate": 8.407544701264719e-05,
"loss": 1.243,
"step": 7510
},
{
"epoch": 1.63,
"grad_norm": 0.9787565469741821,
"learning_rate": 8.405364151766245e-05,
"loss": 1.2444,
"step": 7520
},
{
"epoch": 1.63,
"grad_norm": 1.1064313650131226,
"learning_rate": 8.403183602267772e-05,
"loss": 1.2438,
"step": 7530
},
{
"epoch": 1.64,
"grad_norm": 0.9433283805847168,
"learning_rate": 8.401003052769298e-05,
"loss": 1.2442,
"step": 7540
},
{
"epoch": 1.64,
"grad_norm": 0.9914006590843201,
"learning_rate": 8.398822503270824e-05,
"loss": 1.2595,
"step": 7550
},
{
"epoch": 1.64,
"grad_norm": 1.1178406476974487,
"learning_rate": 8.39664195377235e-05,
"loss": 1.2223,
"step": 7560
},
{
"epoch": 1.64,
"grad_norm": 1.1177582740783691,
"learning_rate": 8.394461404273878e-05,
"loss": 1.2284,
"step": 7570
},
{
"epoch": 1.65,
"grad_norm": 1.0288305282592773,
"learning_rate": 8.392280854775405e-05,
"loss": 1.2329,
"step": 7580
},
{
"epoch": 1.65,
"grad_norm": 1.078165054321289,
"learning_rate": 8.39010030527693e-05,
"loss": 1.2149,
"step": 7590
},
{
"epoch": 1.65,
"grad_norm": 1.0270469188690186,
"learning_rate": 8.387919755778456e-05,
"loss": 1.2453,
"step": 7600
},
{
"epoch": 1.65,
"grad_norm": 1.142359733581543,
"learning_rate": 8.385739206279984e-05,
"loss": 1.2115,
"step": 7610
},
{
"epoch": 1.65,
"grad_norm": 1.066074252128601,
"learning_rate": 8.38355865678151e-05,
"loss": 1.2282,
"step": 7620
},
{
"epoch": 1.66,
"grad_norm": 0.9854233860969543,
"learning_rate": 8.381378107283036e-05,
"loss": 1.25,
"step": 7630
},
{
"epoch": 1.66,
"grad_norm": 1.0901075601577759,
"learning_rate": 8.379197557784561e-05,
"loss": 1.2237,
"step": 7640
},
{
"epoch": 1.66,
"grad_norm": 1.1587127447128296,
"learning_rate": 8.377017008286089e-05,
"loss": 1.219,
"step": 7650
},
{
"epoch": 1.66,
"grad_norm": 0.9623563289642334,
"learning_rate": 8.374836458787615e-05,
"loss": 1.2311,
"step": 7660
},
{
"epoch": 1.67,
"grad_norm": 0.9470689296722412,
"learning_rate": 8.372655909289141e-05,
"loss": 1.2515,
"step": 7670
},
{
"epoch": 1.67,
"grad_norm": 0.9638876914978027,
"learning_rate": 8.370475359790668e-05,
"loss": 1.2532,
"step": 7680
},
{
"epoch": 1.67,
"grad_norm": 1.163567304611206,
"learning_rate": 8.368294810292194e-05,
"loss": 1.2615,
"step": 7690
},
{
"epoch": 1.67,
"grad_norm": 1.001160979270935,
"learning_rate": 8.36611426079372e-05,
"loss": 1.2472,
"step": 7700
},
{
"epoch": 1.67,
"grad_norm": 1.0169782638549805,
"learning_rate": 8.363933711295247e-05,
"loss": 1.2473,
"step": 7710
},
{
"epoch": 1.68,
"grad_norm": 0.9867805242538452,
"learning_rate": 8.361753161796774e-05,
"loss": 1.2452,
"step": 7720
},
{
"epoch": 1.68,
"grad_norm": 1.0535905361175537,
"learning_rate": 8.359572612298299e-05,
"loss": 1.2405,
"step": 7730
},
{
"epoch": 1.68,
"grad_norm": 0.9246835708618164,
"learning_rate": 8.357392062799825e-05,
"loss": 1.2522,
"step": 7740
},
{
"epoch": 1.68,
"grad_norm": 1.0927287340164185,
"learning_rate": 8.355211513301352e-05,
"loss": 1.2493,
"step": 7750
},
{
"epoch": 1.68,
"grad_norm": 1.054208755493164,
"learning_rate": 8.35303096380288e-05,
"loss": 1.263,
"step": 7760
},
{
"epoch": 1.69,
"grad_norm": 0.9636792540550232,
"learning_rate": 8.350850414304406e-05,
"loss": 1.2426,
"step": 7770
},
{
"epoch": 1.69,
"grad_norm": 1.0837719440460205,
"learning_rate": 8.348669864805932e-05,
"loss": 1.2265,
"step": 7780
},
{
"epoch": 1.69,
"grad_norm": 0.9462710022926331,
"learning_rate": 8.346489315307457e-05,
"loss": 1.2242,
"step": 7790
},
{
"epoch": 1.69,
"grad_norm": 0.987519383430481,
"learning_rate": 8.344308765808985e-05,
"loss": 1.2261,
"step": 7800
},
{
"epoch": 1.7,
"grad_norm": 1.0755093097686768,
"learning_rate": 8.342128216310511e-05,
"loss": 1.2486,
"step": 7810
},
{
"epoch": 1.7,
"grad_norm": 0.9885231852531433,
"learning_rate": 8.339947666812037e-05,
"loss": 1.2325,
"step": 7820
},
{
"epoch": 1.7,
"grad_norm": 1.0870469808578491,
"learning_rate": 8.337767117313564e-05,
"loss": 1.2175,
"step": 7830
},
{
"epoch": 1.7,
"grad_norm": 1.0006695985794067,
"learning_rate": 8.33558656781509e-05,
"loss": 1.2521,
"step": 7840
},
{
"epoch": 1.7,
"grad_norm": 1.0880390405654907,
"learning_rate": 8.333406018316616e-05,
"loss": 1.2353,
"step": 7850
},
{
"epoch": 1.71,
"grad_norm": 0.9993226528167725,
"learning_rate": 8.331225468818142e-05,
"loss": 1.2365,
"step": 7860
},
{
"epoch": 1.71,
"grad_norm": 0.964745819568634,
"learning_rate": 8.329044919319669e-05,
"loss": 1.2566,
"step": 7870
},
{
"epoch": 1.71,
"grad_norm": 0.9665801525115967,
"learning_rate": 8.326864369821195e-05,
"loss": 1.2266,
"step": 7880
},
{
"epoch": 1.71,
"grad_norm": 1.0917197465896606,
"learning_rate": 8.324683820322721e-05,
"loss": 1.2457,
"step": 7890
},
{
"epoch": 1.72,
"grad_norm": 1.1263692378997803,
"learning_rate": 8.322503270824248e-05,
"loss": 1.2312,
"step": 7900
},
{
"epoch": 1.72,
"grad_norm": 0.9168413877487183,
"learning_rate": 8.320322721325774e-05,
"loss": 1.223,
"step": 7910
},
{
"epoch": 1.72,
"grad_norm": 0.9771096706390381,
"learning_rate": 8.318142171827302e-05,
"loss": 1.2219,
"step": 7920
},
{
"epoch": 1.72,
"grad_norm": 0.9901739358901978,
"learning_rate": 8.315961622328828e-05,
"loss": 1.2405,
"step": 7930
},
{
"epoch": 1.72,
"grad_norm": 1.004320502281189,
"learning_rate": 8.313781072830353e-05,
"loss": 1.2584,
"step": 7940
},
{
"epoch": 1.73,
"grad_norm": 0.897678554058075,
"learning_rate": 8.31160052333188e-05,
"loss": 1.2359,
"step": 7950
},
{
"epoch": 1.73,
"grad_norm": 0.9914141893386841,
"learning_rate": 8.309419973833407e-05,
"loss": 1.2269,
"step": 7960
},
{
"epoch": 1.73,
"grad_norm": 1.1783164739608765,
"learning_rate": 8.307239424334933e-05,
"loss": 1.2208,
"step": 7970
},
{
"epoch": 1.73,
"grad_norm": 1.0260601043701172,
"learning_rate": 8.30505887483646e-05,
"loss": 1.2206,
"step": 7980
},
{
"epoch": 1.73,
"grad_norm": 0.9606086015701294,
"learning_rate": 8.302878325337986e-05,
"loss": 1.246,
"step": 7990
},
{
"epoch": 1.74,
"grad_norm": 1.0758907794952393,
"learning_rate": 8.300697775839512e-05,
"loss": 1.2386,
"step": 8000
},
{
"epoch": 1.74,
"grad_norm": 0.9541261792182922,
"learning_rate": 8.298517226341038e-05,
"loss": 1.2554,
"step": 8010
},
{
"epoch": 1.74,
"grad_norm": 1.130035161972046,
"learning_rate": 8.296336676842565e-05,
"loss": 1.2292,
"step": 8020
},
{
"epoch": 1.74,
"grad_norm": 0.9219099879264832,
"learning_rate": 8.294156127344091e-05,
"loss": 1.2486,
"step": 8030
},
{
"epoch": 1.75,
"grad_norm": 0.9194048643112183,
"learning_rate": 8.291975577845617e-05,
"loss": 1.2065,
"step": 8040
},
{
"epoch": 1.75,
"grad_norm": 1.0724278688430786,
"learning_rate": 8.289795028347144e-05,
"loss": 1.232,
"step": 8050
},
{
"epoch": 1.75,
"grad_norm": 1.0829250812530518,
"learning_rate": 8.28761447884867e-05,
"loss": 1.2374,
"step": 8060
},
{
"epoch": 1.75,
"grad_norm": 0.9441924691200256,
"learning_rate": 8.285433929350198e-05,
"loss": 1.2248,
"step": 8070
},
{
"epoch": 1.75,
"grad_norm": 1.0257307291030884,
"learning_rate": 8.283253379851722e-05,
"loss": 1.2356,
"step": 8080
},
{
"epoch": 1.76,
"grad_norm": 0.8646122813224792,
"learning_rate": 8.281072830353249e-05,
"loss": 1.2497,
"step": 8090
},
{
"epoch": 1.76,
"grad_norm": 1.100232481956482,
"learning_rate": 8.278892280854775e-05,
"loss": 1.2365,
"step": 8100
},
{
"epoch": 1.76,
"grad_norm": 1.0597792863845825,
"learning_rate": 8.276711731356303e-05,
"loss": 1.2403,
"step": 8110
},
{
"epoch": 1.76,
"grad_norm": 1.0088367462158203,
"learning_rate": 8.274531181857829e-05,
"loss": 1.2281,
"step": 8120
},
{
"epoch": 1.76,
"grad_norm": 1.0818982124328613,
"learning_rate": 8.272350632359355e-05,
"loss": 1.2427,
"step": 8130
},
{
"epoch": 1.77,
"grad_norm": 0.9281474947929382,
"learning_rate": 8.27017008286088e-05,
"loss": 1.2595,
"step": 8140
},
{
"epoch": 1.77,
"grad_norm": 0.9748603105545044,
"learning_rate": 8.267989533362408e-05,
"loss": 1.248,
"step": 8150
},
{
"epoch": 1.77,
"grad_norm": 1.027099370956421,
"learning_rate": 8.265808983863934e-05,
"loss": 1.2313,
"step": 8160
},
{
"epoch": 1.77,
"grad_norm": 1.0615408420562744,
"learning_rate": 8.26362843436546e-05,
"loss": 1.2549,
"step": 8170
},
{
"epoch": 1.78,
"grad_norm": 0.9190282225608826,
"learning_rate": 8.261447884866987e-05,
"loss": 1.2169,
"step": 8180
},
{
"epoch": 1.78,
"grad_norm": 0.9824718236923218,
"learning_rate": 8.259267335368513e-05,
"loss": 1.2505,
"step": 8190
},
{
"epoch": 1.78,
"grad_norm": 0.9848600029945374,
"learning_rate": 8.25708678587004e-05,
"loss": 1.2414,
"step": 8200
},
{
"epoch": 1.78,
"grad_norm": 0.9373934268951416,
"learning_rate": 8.254906236371566e-05,
"loss": 1.2294,
"step": 8210
},
{
"epoch": 1.78,
"grad_norm": 1.0315806865692139,
"learning_rate": 8.252725686873093e-05,
"loss": 1.2259,
"step": 8220
},
{
"epoch": 1.79,
"grad_norm": 1.0654377937316895,
"learning_rate": 8.250545137374618e-05,
"loss": 1.249,
"step": 8230
},
{
"epoch": 1.79,
"grad_norm": 1.0188405513763428,
"learning_rate": 8.248364587876145e-05,
"loss": 1.2361,
"step": 8240
},
{
"epoch": 1.79,
"grad_norm": 0.9202408790588379,
"learning_rate": 8.246184038377671e-05,
"loss": 1.2344,
"step": 8250
},
{
"epoch": 1.79,
"grad_norm": 0.953535795211792,
"learning_rate": 8.244003488879199e-05,
"loss": 1.2439,
"step": 8260
},
{
"epoch": 1.8,
"grad_norm": 0.8910773992538452,
"learning_rate": 8.241822939380725e-05,
"loss": 1.2417,
"step": 8270
},
{
"epoch": 1.8,
"grad_norm": 1.0123344659805298,
"learning_rate": 8.23964238988225e-05,
"loss": 1.2437,
"step": 8280
},
{
"epoch": 1.8,
"grad_norm": 0.9692454934120178,
"learning_rate": 8.237461840383776e-05,
"loss": 1.2414,
"step": 8290
},
{
"epoch": 1.8,
"grad_norm": 1.2110908031463623,
"learning_rate": 8.235281290885304e-05,
"loss": 1.2273,
"step": 8300
},
{
"epoch": 1.8,
"grad_norm": 0.9399771690368652,
"learning_rate": 8.23310074138683e-05,
"loss": 1.2305,
"step": 8310
},
{
"epoch": 1.81,
"grad_norm": 1.0485948324203491,
"learning_rate": 8.230920191888356e-05,
"loss": 1.2243,
"step": 8320
},
{
"epoch": 1.81,
"grad_norm": 1.1290273666381836,
"learning_rate": 8.228739642389883e-05,
"loss": 1.2647,
"step": 8330
},
{
"epoch": 1.81,
"grad_norm": 1.113707184791565,
"learning_rate": 8.226559092891409e-05,
"loss": 1.2396,
"step": 8340
},
{
"epoch": 1.81,
"grad_norm": 1.161978006362915,
"learning_rate": 8.224378543392935e-05,
"loss": 1.2371,
"step": 8350
},
{
"epoch": 1.81,
"grad_norm": 1.075077772140503,
"learning_rate": 8.222197993894462e-05,
"loss": 1.2326,
"step": 8360
},
{
"epoch": 1.82,
"grad_norm": 0.9579611420631409,
"learning_rate": 8.220017444395988e-05,
"loss": 1.2212,
"step": 8370
},
{
"epoch": 1.82,
"grad_norm": 1.0509251356124878,
"learning_rate": 8.217836894897514e-05,
"loss": 1.2234,
"step": 8380
},
{
"epoch": 1.82,
"grad_norm": 1.02772057056427,
"learning_rate": 8.21565634539904e-05,
"loss": 1.212,
"step": 8390
},
{
"epoch": 1.82,
"grad_norm": 1.0468199253082275,
"learning_rate": 8.213475795900567e-05,
"loss": 1.2328,
"step": 8400
},
{
"epoch": 1.83,
"grad_norm": 0.9836091995239258,
"learning_rate": 8.211295246402095e-05,
"loss": 1.2368,
"step": 8410
},
{
"epoch": 1.83,
"grad_norm": 1.0582927465438843,
"learning_rate": 8.209114696903621e-05,
"loss": 1.2466,
"step": 8420
},
{
"epoch": 1.83,
"grad_norm": 1.039549708366394,
"learning_rate": 8.206934147405146e-05,
"loss": 1.2334,
"step": 8430
},
{
"epoch": 1.83,
"grad_norm": 0.9211510419845581,
"learning_rate": 8.204753597906672e-05,
"loss": 1.2205,
"step": 8440
},
{
"epoch": 1.83,
"grad_norm": 1.019851565361023,
"learning_rate": 8.2025730484082e-05,
"loss": 1.2416,
"step": 8450
},
{
"epoch": 1.84,
"grad_norm": 1.0609748363494873,
"learning_rate": 8.200392498909726e-05,
"loss": 1.2315,
"step": 8460
},
{
"epoch": 1.84,
"grad_norm": 1.1158742904663086,
"learning_rate": 8.198211949411252e-05,
"loss": 1.2485,
"step": 8470
},
{
"epoch": 1.84,
"grad_norm": 0.8996789455413818,
"learning_rate": 8.196031399912779e-05,
"loss": 1.2309,
"step": 8480
},
{
"epoch": 1.84,
"grad_norm": 0.9898722171783447,
"learning_rate": 8.193850850414305e-05,
"loss": 1.236,
"step": 8490
},
{
"epoch": 1.85,
"grad_norm": 1.1336474418640137,
"learning_rate": 8.191670300915831e-05,
"loss": 1.2375,
"step": 8500
},
{
"epoch": 1.85,
"grad_norm": 0.9630258679389954,
"learning_rate": 8.189489751417358e-05,
"loss": 1.2462,
"step": 8510
},
{
"epoch": 1.85,
"grad_norm": 0.9450762271881104,
"learning_rate": 8.187309201918884e-05,
"loss": 1.221,
"step": 8520
},
{
"epoch": 1.85,
"grad_norm": 0.9798605442047119,
"learning_rate": 8.18512865242041e-05,
"loss": 1.2222,
"step": 8530
},
{
"epoch": 1.85,
"grad_norm": 0.9023801684379578,
"learning_rate": 8.182948102921936e-05,
"loss": 1.2193,
"step": 8540
},
{
"epoch": 1.86,
"grad_norm": 0.9918519258499146,
"learning_rate": 8.180767553423463e-05,
"loss": 1.2538,
"step": 8550
},
{
"epoch": 1.86,
"grad_norm": 1.078640341758728,
"learning_rate": 8.178587003924989e-05,
"loss": 1.2239,
"step": 8560
},
{
"epoch": 1.86,
"grad_norm": 1.1001946926116943,
"learning_rate": 8.176406454426517e-05,
"loss": 1.2542,
"step": 8570
},
{
"epoch": 1.86,
"grad_norm": 0.9115540385246277,
"learning_rate": 8.174225904928042e-05,
"loss": 1.2231,
"step": 8580
},
{
"epoch": 1.86,
"grad_norm": 1.0351630449295044,
"learning_rate": 8.172045355429568e-05,
"loss": 1.2328,
"step": 8590
},
{
"epoch": 1.87,
"grad_norm": 1.1193772554397583,
"learning_rate": 8.169864805931094e-05,
"loss": 1.2344,
"step": 8600
},
{
"epoch": 1.87,
"grad_norm": 0.926569402217865,
"learning_rate": 8.167684256432622e-05,
"loss": 1.2318,
"step": 8610
},
{
"epoch": 1.87,
"grad_norm": 1.1995497941970825,
"learning_rate": 8.165503706934148e-05,
"loss": 1.2645,
"step": 8620
},
{
"epoch": 1.87,
"grad_norm": 1.0718098878860474,
"learning_rate": 8.163323157435673e-05,
"loss": 1.2372,
"step": 8630
},
{
"epoch": 1.88,
"grad_norm": 1.0319968461990356,
"learning_rate": 8.161142607937201e-05,
"loss": 1.222,
"step": 8640
},
{
"epoch": 1.88,
"grad_norm": 1.0868433713912964,
"learning_rate": 8.158962058438727e-05,
"loss": 1.2381,
"step": 8650
},
{
"epoch": 1.88,
"grad_norm": 1.0332001447677612,
"learning_rate": 8.156781508940253e-05,
"loss": 1.208,
"step": 8660
},
{
"epoch": 1.88,
"grad_norm": 1.050507664680481,
"learning_rate": 8.15460095944178e-05,
"loss": 1.2276,
"step": 8670
},
{
"epoch": 1.88,
"grad_norm": 0.9764347672462463,
"learning_rate": 8.152420409943306e-05,
"loss": 1.2289,
"step": 8680
},
{
"epoch": 1.89,
"grad_norm": 0.9142500758171082,
"learning_rate": 8.150239860444832e-05,
"loss": 1.2109,
"step": 8690
},
{
"epoch": 1.89,
"grad_norm": 1.028554916381836,
"learning_rate": 8.148059310946359e-05,
"loss": 1.2245,
"step": 8700
},
{
"epoch": 1.89,
"grad_norm": 1.09976327419281,
"learning_rate": 8.145878761447885e-05,
"loss": 1.2387,
"step": 8710
},
{
"epoch": 1.89,
"grad_norm": 1.0482656955718994,
"learning_rate": 8.143698211949413e-05,
"loss": 1.2225,
"step": 8720
},
{
"epoch": 1.9,
"grad_norm": 0.953663170337677,
"learning_rate": 8.141517662450938e-05,
"loss": 1.2605,
"step": 8730
},
{
"epoch": 1.9,
"grad_norm": 1.0766589641571045,
"learning_rate": 8.139337112952464e-05,
"loss": 1.2348,
"step": 8740
},
{
"epoch": 1.9,
"grad_norm": 1.1204911470413208,
"learning_rate": 8.13715656345399e-05,
"loss": 1.2248,
"step": 8750
},
{
"epoch": 1.9,
"grad_norm": 1.0836663246154785,
"learning_rate": 8.134976013955518e-05,
"loss": 1.2463,
"step": 8760
},
{
"epoch": 1.9,
"grad_norm": 1.0038310289382935,
"learning_rate": 8.132795464457044e-05,
"loss": 1.2415,
"step": 8770
},
{
"epoch": 1.91,
"grad_norm": 0.9727823138237,
"learning_rate": 8.130614914958569e-05,
"loss": 1.2291,
"step": 8780
},
{
"epoch": 1.91,
"grad_norm": 0.9913771748542786,
"learning_rate": 8.128434365460095e-05,
"loss": 1.2374,
"step": 8790
},
{
"epoch": 1.91,
"grad_norm": 1.0077624320983887,
"learning_rate": 8.126253815961623e-05,
"loss": 1.2126,
"step": 8800
},
{
"epoch": 1.91,
"grad_norm": 0.9802316427230835,
"learning_rate": 8.12407326646315e-05,
"loss": 1.2084,
"step": 8810
},
{
"epoch": 1.91,
"grad_norm": 1.1375538110733032,
"learning_rate": 8.121892716964676e-05,
"loss": 1.231,
"step": 8820
},
{
"epoch": 1.92,
"grad_norm": 1.0553092956542969,
"learning_rate": 8.119712167466202e-05,
"loss": 1.2132,
"step": 8830
},
{
"epoch": 1.92,
"grad_norm": 0.9583929777145386,
"learning_rate": 8.117531617967728e-05,
"loss": 1.2492,
"step": 8840
},
{
"epoch": 1.92,
"grad_norm": 1.1101999282836914,
"learning_rate": 8.115351068469255e-05,
"loss": 1.2381,
"step": 8850
},
{
"epoch": 1.92,
"grad_norm": 0.9837037920951843,
"learning_rate": 8.113170518970781e-05,
"loss": 1.2122,
"step": 8860
},
{
"epoch": 1.93,
"grad_norm": 0.9561728835105896,
"learning_rate": 8.110989969472309e-05,
"loss": 1.2371,
"step": 8870
},
{
"epoch": 1.93,
"grad_norm": 1.0024539232254028,
"learning_rate": 8.108809419973834e-05,
"loss": 1.2421,
"step": 8880
},
{
"epoch": 1.93,
"grad_norm": 0.8823496103286743,
"learning_rate": 8.10662887047536e-05,
"loss": 1.2221,
"step": 8890
},
{
"epoch": 1.93,
"grad_norm": 0.9598950743675232,
"learning_rate": 8.104448320976886e-05,
"loss": 1.2043,
"step": 8900
},
{
"epoch": 1.93,
"grad_norm": 1.165281057357788,
"learning_rate": 8.102267771478414e-05,
"loss": 1.2261,
"step": 8910
},
{
"epoch": 1.94,
"grad_norm": 0.9209827184677124,
"learning_rate": 8.10008722197994e-05,
"loss": 1.2196,
"step": 8920
},
{
"epoch": 1.94,
"grad_norm": 1.023848056793213,
"learning_rate": 8.097906672481465e-05,
"loss": 1.2393,
"step": 8930
},
{
"epoch": 1.94,
"grad_norm": 1.0043749809265137,
"learning_rate": 8.095726122982991e-05,
"loss": 1.2362,
"step": 8940
},
{
"epoch": 1.94,
"grad_norm": 0.9257699251174927,
"learning_rate": 8.093545573484519e-05,
"loss": 1.2258,
"step": 8950
},
{
"epoch": 1.95,
"grad_norm": 1.1696765422821045,
"learning_rate": 8.091365023986045e-05,
"loss": 1.2459,
"step": 8960
},
{
"epoch": 1.95,
"grad_norm": 0.9257934093475342,
"learning_rate": 8.089184474487572e-05,
"loss": 1.2492,
"step": 8970
},
{
"epoch": 1.95,
"grad_norm": 1.1503798961639404,
"learning_rate": 8.087003924989097e-05,
"loss": 1.2311,
"step": 8980
},
{
"epoch": 1.95,
"grad_norm": 1.1405220031738281,
"learning_rate": 8.084823375490624e-05,
"loss": 1.2409,
"step": 8990
},
{
"epoch": 1.95,
"grad_norm": 0.976625382900238,
"learning_rate": 8.08264282599215e-05,
"loss": 1.2266,
"step": 9000
},
{
"epoch": 1.96,
"grad_norm": 0.9233745336532593,
"learning_rate": 8.080462276493677e-05,
"loss": 1.2261,
"step": 9010
},
{
"epoch": 1.96,
"grad_norm": 1.0994141101837158,
"learning_rate": 8.078281726995203e-05,
"loss": 1.2352,
"step": 9020
},
{
"epoch": 1.96,
"grad_norm": 0.9999457001686096,
"learning_rate": 8.07610117749673e-05,
"loss": 1.2238,
"step": 9030
},
{
"epoch": 1.96,
"grad_norm": 1.0037119388580322,
"learning_rate": 8.073920627998256e-05,
"loss": 1.2439,
"step": 9040
},
{
"epoch": 1.96,
"grad_norm": 0.9493910670280457,
"learning_rate": 8.071740078499782e-05,
"loss": 1.2253,
"step": 9050
},
{
"epoch": 1.97,
"grad_norm": 1.099271535873413,
"learning_rate": 8.069559529001308e-05,
"loss": 1.211,
"step": 9060
},
{
"epoch": 1.97,
"grad_norm": 0.9729533791542053,
"learning_rate": 8.067378979502836e-05,
"loss": 1.2257,
"step": 9070
},
{
"epoch": 1.97,
"grad_norm": 1.112057089805603,
"learning_rate": 8.065198430004361e-05,
"loss": 1.2092,
"step": 9080
},
{
"epoch": 1.97,
"grad_norm": 0.9645751714706421,
"learning_rate": 8.063017880505887e-05,
"loss": 1.2123,
"step": 9090
},
{
"epoch": 1.98,
"grad_norm": 1.0263340473175049,
"learning_rate": 8.060837331007415e-05,
"loss": 1.2033,
"step": 9100
},
{
"epoch": 1.98,
"grad_norm": 1.1131114959716797,
"learning_rate": 8.058656781508941e-05,
"loss": 1.2303,
"step": 9110
},
{
"epoch": 1.98,
"grad_norm": 1.1425633430480957,
"learning_rate": 8.056476232010468e-05,
"loss": 1.2166,
"step": 9120
},
{
"epoch": 1.98,
"grad_norm": 0.9223284721374512,
"learning_rate": 8.054295682511992e-05,
"loss": 1.2588,
"step": 9130
},
{
"epoch": 1.98,
"grad_norm": 0.9477842450141907,
"learning_rate": 8.05211513301352e-05,
"loss": 1.2028,
"step": 9140
},
{
"epoch": 1.99,
"grad_norm": 1.0649006366729736,
"learning_rate": 8.049934583515046e-05,
"loss": 1.2238,
"step": 9150
},
{
"epoch": 1.99,
"grad_norm": 1.0043710470199585,
"learning_rate": 8.047754034016573e-05,
"loss": 1.2301,
"step": 9160
},
{
"epoch": 1.99,
"grad_norm": 1.0217610597610474,
"learning_rate": 8.045573484518099e-05,
"loss": 1.2406,
"step": 9170
},
{
"epoch": 1.99,
"grad_norm": 0.9688403606414795,
"learning_rate": 8.043392935019625e-05,
"loss": 1.2364,
"step": 9180
},
{
"epoch": 2.0,
"grad_norm": 1.095987319946289,
"learning_rate": 8.041212385521152e-05,
"loss": 1.241,
"step": 9190
},
{
"epoch": 2.0,
"grad_norm": 0.9398607611656189,
"learning_rate": 8.039031836022678e-05,
"loss": 1.226,
"step": 9200
},
{
"epoch": 2.0,
"grad_norm": 0.9815939664840698,
"learning_rate": 8.036851286524204e-05,
"loss": 1.2181,
"step": 9210
},
{
"epoch": 2.0,
"eval_loss": 1.2817823886871338,
"eval_runtime": 1495.0675,
"eval_samples_per_second": 258.75,
"eval_steps_per_second": 4.043,
"step": 9212
},
{
"epoch": 2.0,
"grad_norm": 1.0157184600830078,
"learning_rate": 8.034670737025732e-05,
"loss": 1.2142,
"step": 9220
},
{
"epoch": 2.0,
"grad_norm": 0.9625092148780823,
"learning_rate": 8.032490187527257e-05,
"loss": 1.2089,
"step": 9230
},
{
"epoch": 2.01,
"grad_norm": 0.9196017384529114,
"learning_rate": 8.030309638028783e-05,
"loss": 1.2335,
"step": 9240
},
{
"epoch": 2.01,
"grad_norm": 0.9308544397354126,
"learning_rate": 8.02812908853031e-05,
"loss": 1.2163,
"step": 9250
},
{
"epoch": 2.01,
"grad_norm": 1.2144242525100708,
"learning_rate": 8.025948539031837e-05,
"loss": 1.2008,
"step": 9260
},
{
"epoch": 2.01,
"grad_norm": 0.9780566692352295,
"learning_rate": 8.023767989533363e-05,
"loss": 1.1919,
"step": 9270
},
{
"epoch": 2.01,
"grad_norm": 0.9934610724449158,
"learning_rate": 8.021587440034888e-05,
"loss": 1.1813,
"step": 9280
},
{
"epoch": 2.02,
"grad_norm": 1.1047219038009644,
"learning_rate": 8.019406890536415e-05,
"loss": 1.1887,
"step": 9290
},
{
"epoch": 2.02,
"grad_norm": 1.0617597103118896,
"learning_rate": 8.017226341037942e-05,
"loss": 1.2142,
"step": 9300
},
{
"epoch": 2.02,
"grad_norm": 0.9656373858451843,
"learning_rate": 8.015045791539469e-05,
"loss": 1.1962,
"step": 9310
},
{
"epoch": 2.02,
"grad_norm": 0.9934256076812744,
"learning_rate": 8.012865242040995e-05,
"loss": 1.2093,
"step": 9320
},
{
"epoch": 2.03,
"grad_norm": 1.0616453886032104,
"learning_rate": 8.010684692542521e-05,
"loss": 1.227,
"step": 9330
},
{
"epoch": 2.03,
"grad_norm": 1.0761624574661255,
"learning_rate": 8.008504143044048e-05,
"loss": 1.2126,
"step": 9340
},
{
"epoch": 2.03,
"grad_norm": 1.06252920627594,
"learning_rate": 8.006323593545574e-05,
"loss": 1.1966,
"step": 9350
},
{
"epoch": 2.03,
"grad_norm": 0.9828883409500122,
"learning_rate": 8.0041430440471e-05,
"loss": 1.2032,
"step": 9360
},
{
"epoch": 2.03,
"grad_norm": 1.0415362119674683,
"learning_rate": 8.001962494548628e-05,
"loss": 1.2069,
"step": 9370
},
{
"epoch": 2.04,
"grad_norm": 0.9932116866111755,
"learning_rate": 7.999781945050153e-05,
"loss": 1.2099,
"step": 9380
},
{
"epoch": 2.04,
"grad_norm": 1.0453740358352661,
"learning_rate": 7.997601395551679e-05,
"loss": 1.1908,
"step": 9390
},
{
"epoch": 2.04,
"grad_norm": 0.9478277564048767,
"learning_rate": 7.995420846053205e-05,
"loss": 1.2016,
"step": 9400
},
{
"epoch": 2.04,
"grad_norm": 0.9447776079177856,
"learning_rate": 7.993240296554733e-05,
"loss": 1.2163,
"step": 9410
},
{
"epoch": 2.05,
"grad_norm": 0.9693462252616882,
"learning_rate": 7.991059747056259e-05,
"loss": 1.1871,
"step": 9420
},
{
"epoch": 2.05,
"grad_norm": 1.2381738424301147,
"learning_rate": 7.988879197557784e-05,
"loss": 1.214,
"step": 9430
},
{
"epoch": 2.05,
"grad_norm": 0.9551769495010376,
"learning_rate": 7.98669864805931e-05,
"loss": 1.2026,
"step": 9440
},
{
"epoch": 2.05,
"grad_norm": 1.009376883506775,
"learning_rate": 7.984518098560838e-05,
"loss": 1.1991,
"step": 9450
},
{
"epoch": 2.05,
"grad_norm": 0.9546257257461548,
"learning_rate": 7.982337549062365e-05,
"loss": 1.2164,
"step": 9460
},
{
"epoch": 2.06,
"grad_norm": 0.9941860437393188,
"learning_rate": 7.980156999563891e-05,
"loss": 1.2111,
"step": 9470
},
{
"epoch": 2.06,
"grad_norm": 1.211512565612793,
"learning_rate": 7.977976450065416e-05,
"loss": 1.1795,
"step": 9480
},
{
"epoch": 2.06,
"grad_norm": 1.004779577255249,
"learning_rate": 7.975795900566943e-05,
"loss": 1.2049,
"step": 9490
},
{
"epoch": 2.06,
"grad_norm": 1.0823005437850952,
"learning_rate": 7.97361535106847e-05,
"loss": 1.1886,
"step": 9500
},
{
"epoch": 2.06,
"grad_norm": 1.0418225526809692,
"learning_rate": 7.971434801569996e-05,
"loss": 1.2105,
"step": 9510
},
{
"epoch": 2.07,
"grad_norm": 1.1182845830917358,
"learning_rate": 7.969254252071522e-05,
"loss": 1.1897,
"step": 9520
},
{
"epoch": 2.07,
"grad_norm": 0.946642279624939,
"learning_rate": 7.967073702573049e-05,
"loss": 1.199,
"step": 9530
},
{
"epoch": 2.07,
"grad_norm": 1.1157629489898682,
"learning_rate": 7.964893153074575e-05,
"loss": 1.2294,
"step": 9540
},
{
"epoch": 2.07,
"grad_norm": 1.053207516670227,
"learning_rate": 7.962712603576101e-05,
"loss": 1.2412,
"step": 9550
},
{
"epoch": 2.08,
"grad_norm": 0.9756922721862793,
"learning_rate": 7.960532054077629e-05,
"loss": 1.1976,
"step": 9560
},
{
"epoch": 2.08,
"grad_norm": 1.049428105354309,
"learning_rate": 7.958351504579155e-05,
"loss": 1.2254,
"step": 9570
},
{
"epoch": 2.08,
"grad_norm": 0.9671922922134399,
"learning_rate": 7.95617095508068e-05,
"loss": 1.1905,
"step": 9580
},
{
"epoch": 2.08,
"grad_norm": 1.0883835554122925,
"learning_rate": 7.953990405582206e-05,
"loss": 1.2032,
"step": 9590
},
{
"epoch": 2.08,
"grad_norm": 1.080729365348816,
"learning_rate": 7.951809856083734e-05,
"loss": 1.216,
"step": 9600
},
{
"epoch": 2.09,
"grad_norm": 0.9762791395187378,
"learning_rate": 7.94962930658526e-05,
"loss": 1.2167,
"step": 9610
},
{
"epoch": 2.09,
"grad_norm": 1.1527519226074219,
"learning_rate": 7.947448757086787e-05,
"loss": 1.1682,
"step": 9620
},
{
"epoch": 2.09,
"grad_norm": 1.0505051612854004,
"learning_rate": 7.945268207588312e-05,
"loss": 1.211,
"step": 9630
},
{
"epoch": 2.09,
"grad_norm": 1.1166177988052368,
"learning_rate": 7.94308765808984e-05,
"loss": 1.1763,
"step": 9640
},
{
"epoch": 2.09,
"grad_norm": 1.038783073425293,
"learning_rate": 7.940907108591366e-05,
"loss": 1.2113,
"step": 9650
},
{
"epoch": 2.1,
"grad_norm": 1.0138919353485107,
"learning_rate": 7.938726559092892e-05,
"loss": 1.214,
"step": 9660
},
{
"epoch": 2.1,
"grad_norm": 0.8989730477333069,
"learning_rate": 7.936546009594418e-05,
"loss": 1.1975,
"step": 9670
},
{
"epoch": 2.1,
"grad_norm": 0.9866936206817627,
"learning_rate": 7.934365460095945e-05,
"loss": 1.2163,
"step": 9680
},
{
"epoch": 2.1,
"grad_norm": 0.9352193474769592,
"learning_rate": 7.932184910597471e-05,
"loss": 1.1936,
"step": 9690
},
{
"epoch": 2.11,
"grad_norm": 0.9865077137947083,
"learning_rate": 7.930004361098997e-05,
"loss": 1.2279,
"step": 9700
},
{
"epoch": 2.11,
"grad_norm": 0.9269611835479736,
"learning_rate": 7.927823811600523e-05,
"loss": 1.2089,
"step": 9710
},
{
"epoch": 2.11,
"grad_norm": 1.0865782499313354,
"learning_rate": 7.92564326210205e-05,
"loss": 1.2073,
"step": 9720
},
{
"epoch": 2.11,
"grad_norm": 1.077241063117981,
"learning_rate": 7.923462712603576e-05,
"loss": 1.1952,
"step": 9730
},
{
"epoch": 2.11,
"grad_norm": 1.1019902229309082,
"learning_rate": 7.921282163105102e-05,
"loss": 1.1845,
"step": 9740
},
{
"epoch": 2.12,
"grad_norm": 1.1047565937042236,
"learning_rate": 7.919101613606629e-05,
"loss": 1.2115,
"step": 9750
},
{
"epoch": 2.12,
"grad_norm": 1.038865327835083,
"learning_rate": 7.916921064108156e-05,
"loss": 1.1764,
"step": 9760
},
{
"epoch": 2.12,
"grad_norm": 1.039838194847107,
"learning_rate": 7.914740514609683e-05,
"loss": 1.2061,
"step": 9770
},
{
"epoch": 2.12,
"grad_norm": 1.1482833623886108,
"learning_rate": 7.912559965111208e-05,
"loss": 1.1819,
"step": 9780
},
{
"epoch": 2.13,
"grad_norm": 1.2092708349227905,
"learning_rate": 7.910379415612735e-05,
"loss": 1.2204,
"step": 9790
},
{
"epoch": 2.13,
"grad_norm": 0.9620797634124756,
"learning_rate": 7.908198866114262e-05,
"loss": 1.2282,
"step": 9800
},
{
"epoch": 2.13,
"grad_norm": 0.9821200966835022,
"learning_rate": 7.906018316615788e-05,
"loss": 1.1928,
"step": 9810
},
{
"epoch": 2.13,
"grad_norm": 0.9970041513442993,
"learning_rate": 7.903837767117314e-05,
"loss": 1.2293,
"step": 9820
},
{
"epoch": 2.13,
"grad_norm": 1.0370044708251953,
"learning_rate": 7.90165721761884e-05,
"loss": 1.2015,
"step": 9830
},
{
"epoch": 2.14,
"grad_norm": 0.9988645911216736,
"learning_rate": 7.899476668120367e-05,
"loss": 1.1827,
"step": 9840
},
{
"epoch": 2.14,
"grad_norm": 1.0234349966049194,
"learning_rate": 7.897296118621893e-05,
"loss": 1.2185,
"step": 9850
},
{
"epoch": 2.14,
"grad_norm": 1.1477036476135254,
"learning_rate": 7.89511556912342e-05,
"loss": 1.2108,
"step": 9860
},
{
"epoch": 2.14,
"grad_norm": 1.1326051950454712,
"learning_rate": 7.892935019624946e-05,
"loss": 1.1785,
"step": 9870
},
{
"epoch": 2.14,
"grad_norm": 1.003237009048462,
"learning_rate": 7.890754470126472e-05,
"loss": 1.2082,
"step": 9880
},
{
"epoch": 2.15,
"grad_norm": 1.0607051849365234,
"learning_rate": 7.888573920627998e-05,
"loss": 1.2112,
"step": 9890
},
{
"epoch": 2.15,
"grad_norm": 1.0867217779159546,
"learning_rate": 7.886393371129525e-05,
"loss": 1.1845,
"step": 9900
},
{
"epoch": 2.15,
"grad_norm": 0.945563018321991,
"learning_rate": 7.884212821631052e-05,
"loss": 1.1925,
"step": 9910
},
{
"epoch": 2.15,
"grad_norm": 1.0693022012710571,
"learning_rate": 7.882032272132579e-05,
"loss": 1.1956,
"step": 9920
},
{
"epoch": 2.16,
"grad_norm": 0.9993180632591248,
"learning_rate": 7.879851722634103e-05,
"loss": 1.1965,
"step": 9930
},
{
"epoch": 2.16,
"grad_norm": 1.010133147239685,
"learning_rate": 7.87767117313563e-05,
"loss": 1.2168,
"step": 9940
},
{
"epoch": 2.16,
"grad_norm": 1.0953561067581177,
"learning_rate": 7.875490623637157e-05,
"loss": 1.2114,
"step": 9950
},
{
"epoch": 2.16,
"grad_norm": 0.9444001317024231,
"learning_rate": 7.873310074138684e-05,
"loss": 1.1988,
"step": 9960
},
{
"epoch": 2.16,
"grad_norm": 0.9980970621109009,
"learning_rate": 7.87112952464021e-05,
"loss": 1.2275,
"step": 9970
},
{
"epoch": 2.17,
"grad_norm": 1.0584611892700195,
"learning_rate": 7.868948975141735e-05,
"loss": 1.2105,
"step": 9980
},
{
"epoch": 2.17,
"grad_norm": 1.1327629089355469,
"learning_rate": 7.866768425643263e-05,
"loss": 1.2022,
"step": 9990
},
{
"epoch": 2.17,
"grad_norm": 0.981350302696228,
"learning_rate": 7.864587876144789e-05,
"loss": 1.2151,
"step": 10000
},
{
"epoch": 2.17,
"grad_norm": 1.1142750978469849,
"learning_rate": 7.862407326646315e-05,
"loss": 1.1931,
"step": 10010
},
{
"epoch": 2.18,
"grad_norm": 1.0601882934570312,
"learning_rate": 7.860226777147842e-05,
"loss": 1.2141,
"step": 10020
},
{
"epoch": 2.18,
"grad_norm": 0.9991333484649658,
"learning_rate": 7.858046227649368e-05,
"loss": 1.1921,
"step": 10030
},
{
"epoch": 2.18,
"grad_norm": 1.1021018028259277,
"learning_rate": 7.855865678150894e-05,
"loss": 1.2225,
"step": 10040
},
{
"epoch": 2.18,
"grad_norm": 1.0568020343780518,
"learning_rate": 7.85368512865242e-05,
"loss": 1.2427,
"step": 10050
},
{
"epoch": 2.18,
"grad_norm": 0.9811879992485046,
"learning_rate": 7.851504579153948e-05,
"loss": 1.1997,
"step": 10060
},
{
"epoch": 2.19,
"grad_norm": 1.0988446474075317,
"learning_rate": 7.849324029655473e-05,
"loss": 1.2156,
"step": 10070
},
{
"epoch": 2.19,
"grad_norm": 1.0393906831741333,
"learning_rate": 7.847143480157e-05,
"loss": 1.2258,
"step": 10080
},
{
"epoch": 2.19,
"grad_norm": 1.1017202138900757,
"learning_rate": 7.844962930658526e-05,
"loss": 1.2069,
"step": 10090
},
{
"epoch": 2.19,
"grad_norm": 1.1102749109268188,
"learning_rate": 7.842782381160053e-05,
"loss": 1.2256,
"step": 10100
},
{
"epoch": 2.19,
"grad_norm": 1.0270189046859741,
"learning_rate": 7.84060183166158e-05,
"loss": 1.2174,
"step": 10110
},
{
"epoch": 2.2,
"grad_norm": 1.0221537351608276,
"learning_rate": 7.838421282163106e-05,
"loss": 1.1968,
"step": 10120
},
{
"epoch": 2.2,
"grad_norm": 0.95604407787323,
"learning_rate": 7.836240732664631e-05,
"loss": 1.213,
"step": 10130
},
{
"epoch": 2.2,
"grad_norm": 0.9393739700317383,
"learning_rate": 7.834060183166159e-05,
"loss": 1.2182,
"step": 10140
},
{
"epoch": 2.2,
"grad_norm": 1.014799952507019,
"learning_rate": 7.831879633667685e-05,
"loss": 1.2021,
"step": 10150
},
{
"epoch": 2.21,
"grad_norm": 1.0287479162216187,
"learning_rate": 7.829699084169211e-05,
"loss": 1.2114,
"step": 10160
},
{
"epoch": 2.21,
"grad_norm": 1.0790306329727173,
"learning_rate": 7.827736589620584e-05,
"loss": 1.1874,
"step": 10170
},
{
"epoch": 2.21,
"grad_norm": 0.9588958621025085,
"learning_rate": 7.82555604012211e-05,
"loss": 1.2191,
"step": 10180
},
{
"epoch": 2.21,
"grad_norm": 0.9004745483398438,
"learning_rate": 7.823375490623638e-05,
"loss": 1.1933,
"step": 10190
},
{
"epoch": 2.21,
"grad_norm": 1.0742331743240356,
"learning_rate": 7.821194941125164e-05,
"loss": 1.2128,
"step": 10200
},
{
"epoch": 2.22,
"grad_norm": 1.072489857673645,
"learning_rate": 7.81901439162669e-05,
"loss": 1.2143,
"step": 10210
},
{
"epoch": 2.22,
"grad_norm": 0.9534905552864075,
"learning_rate": 7.816833842128217e-05,
"loss": 1.2206,
"step": 10220
},
{
"epoch": 2.22,
"grad_norm": 1.0694421529769897,
"learning_rate": 7.814653292629743e-05,
"loss": 1.2051,
"step": 10230
},
{
"epoch": 2.22,
"grad_norm": 0.9729447364807129,
"learning_rate": 7.81247274313127e-05,
"loss": 1.2234,
"step": 10240
},
{
"epoch": 2.23,
"grad_norm": 1.0395437479019165,
"learning_rate": 7.810292193632796e-05,
"loss": 1.1977,
"step": 10250
},
{
"epoch": 2.23,
"grad_norm": 0.999451756477356,
"learning_rate": 7.808111644134322e-05,
"loss": 1.2053,
"step": 10260
},
{
"epoch": 2.23,
"grad_norm": 1.1238023042678833,
"learning_rate": 7.805931094635848e-05,
"loss": 1.2295,
"step": 10270
},
{
"epoch": 2.23,
"grad_norm": 1.0689754486083984,
"learning_rate": 7.803750545137375e-05,
"loss": 1.2059,
"step": 10280
},
{
"epoch": 2.23,
"grad_norm": 0.9754849672317505,
"learning_rate": 7.801569995638901e-05,
"loss": 1.206,
"step": 10290
},
{
"epoch": 2.24,
"grad_norm": 1.02662193775177,
"learning_rate": 7.799389446140429e-05,
"loss": 1.1967,
"step": 10300
},
{
"epoch": 2.24,
"grad_norm": 1.1547129154205322,
"learning_rate": 7.797208896641954e-05,
"loss": 1.211,
"step": 10310
},
{
"epoch": 2.24,
"grad_norm": 0.9812795519828796,
"learning_rate": 7.79502834714348e-05,
"loss": 1.1928,
"step": 10320
},
{
"epoch": 2.24,
"grad_norm": 1.0706185102462769,
"learning_rate": 7.792847797645006e-05,
"loss": 1.1914,
"step": 10330
},
{
"epoch": 2.24,
"grad_norm": 1.0410836935043335,
"learning_rate": 7.790667248146534e-05,
"loss": 1.2002,
"step": 10340
},
{
"epoch": 2.25,
"grad_norm": 0.9746688008308411,
"learning_rate": 7.78848669864806e-05,
"loss": 1.1863,
"step": 10350
},
{
"epoch": 2.25,
"grad_norm": 0.8778429627418518,
"learning_rate": 7.786524204099433e-05,
"loss": 1.2383,
"step": 10360
},
{
"epoch": 2.25,
"grad_norm": 0.969650149345398,
"learning_rate": 7.78434365460096e-05,
"loss": 1.177,
"step": 10370
},
{
"epoch": 2.25,
"grad_norm": 1.015781283378601,
"learning_rate": 7.782163105102486e-05,
"loss": 1.1838,
"step": 10380
},
{
"epoch": 2.26,
"grad_norm": 0.8965770602226257,
"learning_rate": 7.779982555604013e-05,
"loss": 1.2175,
"step": 10390
},
{
"epoch": 2.26,
"grad_norm": 1.007692575454712,
"learning_rate": 7.77780200610554e-05,
"loss": 1.1978,
"step": 10400
},
{
"epoch": 2.26,
"grad_norm": 0.9334578514099121,
"learning_rate": 7.775621456607065e-05,
"loss": 1.1887,
"step": 10410
},
{
"epoch": 2.26,
"grad_norm": 0.9570727348327637,
"learning_rate": 7.773440907108591e-05,
"loss": 1.211,
"step": 10420
},
{
"epoch": 2.26,
"grad_norm": 1.0146620273590088,
"learning_rate": 7.771260357610119e-05,
"loss": 1.2188,
"step": 10430
},
{
"epoch": 2.27,
"grad_norm": 1.0868462324142456,
"learning_rate": 7.769079808111645e-05,
"loss": 1.2147,
"step": 10440
},
{
"epoch": 2.27,
"grad_norm": 1.062110185623169,
"learning_rate": 7.766899258613171e-05,
"loss": 1.2172,
"step": 10450
},
{
"epoch": 2.27,
"grad_norm": 0.950108528137207,
"learning_rate": 7.764718709114697e-05,
"loss": 1.2077,
"step": 10460
},
{
"epoch": 2.27,
"grad_norm": 1.029308795928955,
"learning_rate": 7.762538159616224e-05,
"loss": 1.2112,
"step": 10470
},
{
"epoch": 2.28,
"grad_norm": 0.9809032678604126,
"learning_rate": 7.76035761011775e-05,
"loss": 1.2115,
"step": 10480
},
{
"epoch": 2.28,
"grad_norm": 1.0070390701293945,
"learning_rate": 7.758177060619276e-05,
"loss": 1.2032,
"step": 10490
},
{
"epoch": 2.28,
"grad_norm": 1.1221727132797241,
"learning_rate": 7.755996511120803e-05,
"loss": 1.2164,
"step": 10500
},
{
"epoch": 2.28,
"grad_norm": 1.013219952583313,
"learning_rate": 7.753815961622329e-05,
"loss": 1.1912,
"step": 10510
},
{
"epoch": 2.28,
"grad_norm": 1.0602985620498657,
"learning_rate": 7.751635412123855e-05,
"loss": 1.1607,
"step": 10520
},
{
"epoch": 2.29,
"grad_norm": 1.009325385093689,
"learning_rate": 7.749454862625382e-05,
"loss": 1.1943,
"step": 10530
},
{
"epoch": 2.29,
"grad_norm": 1.01610267162323,
"learning_rate": 7.747274313126909e-05,
"loss": 1.2036,
"step": 10540
},
{
"epoch": 2.29,
"grad_norm": 0.9865471720695496,
"learning_rate": 7.745093763628436e-05,
"loss": 1.1951,
"step": 10550
},
{
"epoch": 2.29,
"grad_norm": 1.1565035581588745,
"learning_rate": 7.74291321412996e-05,
"loss": 1.2132,
"step": 10560
},
{
"epoch": 2.29,
"grad_norm": 0.9530940651893616,
"learning_rate": 7.740732664631487e-05,
"loss": 1.191,
"step": 10570
},
{
"epoch": 2.3,
"grad_norm": 1.1055086851119995,
"learning_rate": 7.738552115133014e-05,
"loss": 1.2292,
"step": 10580
},
{
"epoch": 2.3,
"grad_norm": 1.0695475339889526,
"learning_rate": 7.736371565634541e-05,
"loss": 1.1937,
"step": 10590
},
{
"epoch": 2.3,
"grad_norm": 0.991439163684845,
"learning_rate": 7.734191016136067e-05,
"loss": 1.2117,
"step": 10600
},
{
"epoch": 2.3,
"grad_norm": 0.9743112921714783,
"learning_rate": 7.732010466637592e-05,
"loss": 1.2275,
"step": 10610
},
{
"epoch": 2.31,
"grad_norm": 1.030121922492981,
"learning_rate": 7.72982991713912e-05,
"loss": 1.1893,
"step": 10620
},
{
"epoch": 2.31,
"grad_norm": 1.0691959857940674,
"learning_rate": 7.727649367640646e-05,
"loss": 1.2044,
"step": 10630
},
{
"epoch": 2.31,
"grad_norm": 1.141326904296875,
"learning_rate": 7.725468818142172e-05,
"loss": 1.2208,
"step": 10640
},
{
"epoch": 2.31,
"grad_norm": 1.0179444551467896,
"learning_rate": 7.723288268643699e-05,
"loss": 1.1901,
"step": 10650
},
{
"epoch": 2.31,
"grad_norm": 1.1256074905395508,
"learning_rate": 7.721107719145225e-05,
"loss": 1.2,
"step": 10660
},
{
"epoch": 2.32,
"grad_norm": 1.0997061729431152,
"learning_rate": 7.718927169646751e-05,
"loss": 1.194,
"step": 10670
},
{
"epoch": 2.32,
"grad_norm": 1.0382623672485352,
"learning_rate": 7.716746620148277e-05,
"loss": 1.2277,
"step": 10680
},
{
"epoch": 2.32,
"grad_norm": 1.0295804738998413,
"learning_rate": 7.714566070649805e-05,
"loss": 1.1857,
"step": 10690
},
{
"epoch": 2.32,
"grad_norm": 1.0594016313552856,
"learning_rate": 7.71238552115133e-05,
"loss": 1.1955,
"step": 10700
},
{
"epoch": 2.33,
"grad_norm": 1.0921293497085571,
"learning_rate": 7.710204971652856e-05,
"loss": 1.1836,
"step": 10710
},
{
"epoch": 2.33,
"grad_norm": 1.0477246046066284,
"learning_rate": 7.708024422154383e-05,
"loss": 1.2023,
"step": 10720
},
{
"epoch": 2.33,
"grad_norm": 1.0246959924697876,
"learning_rate": 7.70584387265591e-05,
"loss": 1.222,
"step": 10730
},
{
"epoch": 2.33,
"grad_norm": 1.0640301704406738,
"learning_rate": 7.703663323157437e-05,
"loss": 1.1974,
"step": 10740
},
{
"epoch": 2.33,
"grad_norm": 1.0652765035629272,
"learning_rate": 7.701482773658963e-05,
"loss": 1.1997,
"step": 10750
},
{
"epoch": 2.34,
"grad_norm": 0.9220369458198547,
"learning_rate": 7.699302224160488e-05,
"loss": 1.212,
"step": 10760
},
{
"epoch": 2.34,
"grad_norm": 0.9531814455986023,
"learning_rate": 7.697121674662016e-05,
"loss": 1.1686,
"step": 10770
},
{
"epoch": 2.34,
"grad_norm": 1.1248044967651367,
"learning_rate": 7.694941125163542e-05,
"loss": 1.1971,
"step": 10780
},
{
"epoch": 2.34,
"grad_norm": 1.0232545137405396,
"learning_rate": 7.692760575665068e-05,
"loss": 1.194,
"step": 10790
},
{
"epoch": 2.34,
"grad_norm": 1.0724860429763794,
"learning_rate": 7.690580026166594e-05,
"loss": 1.1936,
"step": 10800
},
{
"epoch": 2.35,
"grad_norm": 1.036474347114563,
"learning_rate": 7.688399476668121e-05,
"loss": 1.2078,
"step": 10810
},
{
"epoch": 2.35,
"grad_norm": 1.0231555700302124,
"learning_rate": 7.686218927169647e-05,
"loss": 1.2056,
"step": 10820
},
{
"epoch": 2.35,
"grad_norm": 0.9879153370857239,
"learning_rate": 7.684038377671173e-05,
"loss": 1.2191,
"step": 10830
},
{
"epoch": 2.35,
"grad_norm": 1.0709577798843384,
"learning_rate": 7.6818578281727e-05,
"loss": 1.198,
"step": 10840
},
{
"epoch": 2.36,
"grad_norm": 1.0138386487960815,
"learning_rate": 7.679677278674226e-05,
"loss": 1.2284,
"step": 10850
},
{
"epoch": 2.36,
"grad_norm": 1.0676188468933105,
"learning_rate": 7.677496729175752e-05,
"loss": 1.2004,
"step": 10860
},
{
"epoch": 2.36,
"grad_norm": 1.0372511148452759,
"learning_rate": 7.675316179677279e-05,
"loss": 1.167,
"step": 10870
},
{
"epoch": 2.36,
"grad_norm": 1.0466020107269287,
"learning_rate": 7.673135630178805e-05,
"loss": 1.1958,
"step": 10880
},
{
"epoch": 2.36,
"grad_norm": 1.0521596670150757,
"learning_rate": 7.670955080680333e-05,
"loss": 1.2025,
"step": 10890
},
{
"epoch": 2.37,
"grad_norm": 0.9906710982322693,
"learning_rate": 7.668774531181858e-05,
"loss": 1.188,
"step": 10900
},
{
"epoch": 2.37,
"grad_norm": 1.1713993549346924,
"learning_rate": 7.666593981683384e-05,
"loss": 1.1992,
"step": 10910
},
{
"epoch": 2.37,
"grad_norm": 1.009819507598877,
"learning_rate": 7.664413432184911e-05,
"loss": 1.191,
"step": 10920
},
{
"epoch": 2.37,
"grad_norm": 1.0150312185287476,
"learning_rate": 7.662232882686438e-05,
"loss": 1.1951,
"step": 10930
},
{
"epoch": 2.38,
"grad_norm": 0.9645649790763855,
"learning_rate": 7.660052333187964e-05,
"loss": 1.1941,
"step": 10940
},
{
"epoch": 2.38,
"grad_norm": 1.0158168077468872,
"learning_rate": 7.65787178368949e-05,
"loss": 1.1911,
"step": 10950
},
{
"epoch": 2.38,
"grad_norm": 1.0730938911437988,
"learning_rate": 7.655691234191017e-05,
"loss": 1.1885,
"step": 10960
},
{
"epoch": 2.38,
"grad_norm": 1.09099543094635,
"learning_rate": 7.653510684692543e-05,
"loss": 1.195,
"step": 10970
},
{
"epoch": 2.38,
"grad_norm": 0.982562243938446,
"learning_rate": 7.651330135194069e-05,
"loss": 1.213,
"step": 10980
},
{
"epoch": 2.39,
"grad_norm": 1.0173815488815308,
"learning_rate": 7.649149585695596e-05,
"loss": 1.1931,
"step": 10990
},
{
"epoch": 2.39,
"grad_norm": 1.0644387006759644,
"learning_rate": 7.646969036197122e-05,
"loss": 1.2,
"step": 11000
},
{
"epoch": 2.39,
"grad_norm": 1.0456851720809937,
"learning_rate": 7.644788486698648e-05,
"loss": 1.2267,
"step": 11010
},
{
"epoch": 2.39,
"grad_norm": 1.0387489795684814,
"learning_rate": 7.642607937200175e-05,
"loss": 1.1818,
"step": 11020
},
{
"epoch": 2.39,
"grad_norm": 1.034599781036377,
"learning_rate": 7.640427387701701e-05,
"loss": 1.1972,
"step": 11030
},
{
"epoch": 2.4,
"grad_norm": 1.005964994430542,
"learning_rate": 7.638246838203228e-05,
"loss": 1.1882,
"step": 11040
},
{
"epoch": 2.4,
"grad_norm": 1.0190836191177368,
"learning_rate": 7.636066288704753e-05,
"loss": 1.1819,
"step": 11050
},
{
"epoch": 2.4,
"grad_norm": 1.010334849357605,
"learning_rate": 7.63388573920628e-05,
"loss": 1.2054,
"step": 11060
},
{
"epoch": 2.4,
"grad_norm": 0.986047089099884,
"learning_rate": 7.631705189707806e-05,
"loss": 1.1831,
"step": 11070
},
{
"epoch": 2.41,
"grad_norm": 1.0715646743774414,
"learning_rate": 7.629524640209334e-05,
"loss": 1.2143,
"step": 11080
},
{
"epoch": 2.41,
"grad_norm": 1.0573137998580933,
"learning_rate": 7.62734409071086e-05,
"loss": 1.1765,
"step": 11090
},
{
"epoch": 2.41,
"grad_norm": 0.9830726385116577,
"learning_rate": 7.625163541212386e-05,
"loss": 1.2195,
"step": 11100
},
{
"epoch": 2.41,
"grad_norm": 0.9928615689277649,
"learning_rate": 7.622982991713911e-05,
"loss": 1.2052,
"step": 11110
},
{
"epoch": 2.41,
"grad_norm": 0.916532039642334,
"learning_rate": 7.620802442215439e-05,
"loss": 1.2161,
"step": 11120
},
{
"epoch": 2.42,
"grad_norm": 1.024786353111267,
"learning_rate": 7.618621892716965e-05,
"loss": 1.1841,
"step": 11130
},
{
"epoch": 2.42,
"grad_norm": 0.9942538142204285,
"learning_rate": 7.616441343218491e-05,
"loss": 1.1969,
"step": 11140
},
{
"epoch": 2.42,
"grad_norm": 0.9637119770050049,
"learning_rate": 7.614260793720018e-05,
"loss": 1.1839,
"step": 11150
},
{
"epoch": 2.42,
"grad_norm": 1.0759954452514648,
"learning_rate": 7.612080244221544e-05,
"loss": 1.2087,
"step": 11160
},
{
"epoch": 2.42,
"grad_norm": 1.1083338260650635,
"learning_rate": 7.60989969472307e-05,
"loss": 1.1637,
"step": 11170
},
{
"epoch": 2.43,
"grad_norm": 0.9280533790588379,
"learning_rate": 7.607719145224597e-05,
"loss": 1.186,
"step": 11180
},
{
"epoch": 2.43,
"grad_norm": 1.005856990814209,
"learning_rate": 7.605538595726124e-05,
"loss": 1.2096,
"step": 11190
},
{
"epoch": 2.43,
"grad_norm": 1.0294781923294067,
"learning_rate": 7.603358046227649e-05,
"loss": 1.1933,
"step": 11200
},
{
"epoch": 2.43,
"grad_norm": 1.129011631011963,
"learning_rate": 7.601177496729176e-05,
"loss": 1.1975,
"step": 11210
},
{
"epoch": 2.44,
"grad_norm": 0.9473848938941956,
"learning_rate": 7.598996947230702e-05,
"loss": 1.191,
"step": 11220
},
{
"epoch": 2.44,
"grad_norm": 1.0725443363189697,
"learning_rate": 7.59681639773223e-05,
"loss": 1.2069,
"step": 11230
},
{
"epoch": 2.44,
"grad_norm": 1.0083664655685425,
"learning_rate": 7.594635848233756e-05,
"loss": 1.2012,
"step": 11240
},
{
"epoch": 2.44,
"grad_norm": 1.0504008531570435,
"learning_rate": 7.592455298735281e-05,
"loss": 1.1897,
"step": 11250
},
{
"epoch": 2.44,
"grad_norm": 1.02128267288208,
"learning_rate": 7.590274749236807e-05,
"loss": 1.193,
"step": 11260
},
{
"epoch": 2.45,
"grad_norm": 1.043655276298523,
"learning_rate": 7.588094199738335e-05,
"loss": 1.1984,
"step": 11270
},
{
"epoch": 2.45,
"grad_norm": 1.0775086879730225,
"learning_rate": 7.585913650239861e-05,
"loss": 1.1826,
"step": 11280
},
{
"epoch": 2.45,
"grad_norm": 1.0672656297683716,
"learning_rate": 7.583733100741387e-05,
"loss": 1.221,
"step": 11290
},
{
"epoch": 2.45,
"grad_norm": 1.1105164289474487,
"learning_rate": 7.581552551242914e-05,
"loss": 1.2124,
"step": 11300
},
{
"epoch": 2.46,
"grad_norm": 0.978393018245697,
"learning_rate": 7.57937200174444e-05,
"loss": 1.1749,
"step": 11310
},
{
"epoch": 2.46,
"grad_norm": 1.0011403560638428,
"learning_rate": 7.577191452245966e-05,
"loss": 1.1987,
"step": 11320
},
{
"epoch": 2.46,
"grad_norm": 0.9928615093231201,
"learning_rate": 7.575010902747493e-05,
"loss": 1.1916,
"step": 11330
},
{
"epoch": 2.46,
"grad_norm": 0.9368339776992798,
"learning_rate": 7.572830353249019e-05,
"loss": 1.2155,
"step": 11340
},
{
"epoch": 2.46,
"grad_norm": 1.0176599025726318,
"learning_rate": 7.570649803750545e-05,
"loss": 1.2108,
"step": 11350
},
{
"epoch": 2.47,
"grad_norm": 0.956798255443573,
"learning_rate": 7.568469254252072e-05,
"loss": 1.1951,
"step": 11360
},
{
"epoch": 2.47,
"grad_norm": 0.9456045627593994,
"learning_rate": 7.566288704753598e-05,
"loss": 1.1939,
"step": 11370
},
{
"epoch": 2.47,
"grad_norm": 1.1099495887756348,
"learning_rate": 7.564108155255125e-05,
"loss": 1.2113,
"step": 11380
},
{
"epoch": 2.47,
"grad_norm": 1.0258333683013916,
"learning_rate": 7.561927605756652e-05,
"loss": 1.1723,
"step": 11390
},
{
"epoch": 2.47,
"grad_norm": 1.0410195589065552,
"learning_rate": 7.559747056258177e-05,
"loss": 1.182,
"step": 11400
},
{
"epoch": 2.48,
"grad_norm": 0.9671265482902527,
"learning_rate": 7.557566506759703e-05,
"loss": 1.2038,
"step": 11410
},
{
"epoch": 2.48,
"grad_norm": 0.9647257328033447,
"learning_rate": 7.555385957261231e-05,
"loss": 1.2078,
"step": 11420
},
{
"epoch": 2.48,
"grad_norm": 1.0497002601623535,
"learning_rate": 7.553205407762757e-05,
"loss": 1.2053,
"step": 11430
},
{
"epoch": 2.48,
"grad_norm": 1.080557107925415,
"learning_rate": 7.551024858264283e-05,
"loss": 1.1925,
"step": 11440
},
{
"epoch": 2.49,
"grad_norm": 0.967833936214447,
"learning_rate": 7.54884430876581e-05,
"loss": 1.2106,
"step": 11450
},
{
"epoch": 2.49,
"grad_norm": 1.1252259016036987,
"learning_rate": 7.546663759267336e-05,
"loss": 1.2035,
"step": 11460
},
{
"epoch": 2.49,
"grad_norm": 1.021498203277588,
"learning_rate": 7.544483209768862e-05,
"loss": 1.1748,
"step": 11470
},
{
"epoch": 2.49,
"grad_norm": 1.1426560878753662,
"learning_rate": 7.542302660270389e-05,
"loss": 1.1916,
"step": 11480
},
{
"epoch": 2.49,
"grad_norm": 0.9883751273155212,
"learning_rate": 7.540122110771915e-05,
"loss": 1.1808,
"step": 11490
},
{
"epoch": 2.5,
"grad_norm": 0.9893055558204651,
"learning_rate": 7.537941561273441e-05,
"loss": 1.1961,
"step": 11500
},
{
"epoch": 2.5,
"grad_norm": 1.038801908493042,
"learning_rate": 7.535761011774967e-05,
"loss": 1.1813,
"step": 11510
},
{
"epoch": 2.5,
"grad_norm": 0.9812270998954773,
"learning_rate": 7.533580462276494e-05,
"loss": 1.1873,
"step": 11520
},
{
"epoch": 2.5,
"grad_norm": 1.0793439149856567,
"learning_rate": 7.53139991277802e-05,
"loss": 1.1858,
"step": 11530
},
{
"epoch": 2.51,
"grad_norm": 1.0743041038513184,
"learning_rate": 7.529219363279548e-05,
"loss": 1.1788,
"step": 11540
},
{
"epoch": 2.51,
"grad_norm": 1.1196831464767456,
"learning_rate": 7.527038813781073e-05,
"loss": 1.2059,
"step": 11550
},
{
"epoch": 2.51,
"grad_norm": 1.0126169919967651,
"learning_rate": 7.524858264282599e-05,
"loss": 1.2101,
"step": 11560
},
{
"epoch": 2.51,
"grad_norm": 1.2069376707077026,
"learning_rate": 7.522677714784125e-05,
"loss": 1.1964,
"step": 11570
},
{
"epoch": 2.51,
"grad_norm": 0.9865954518318176,
"learning_rate": 7.520497165285653e-05,
"loss": 1.1966,
"step": 11580
},
{
"epoch": 2.52,
"grad_norm": 0.9862752556800842,
"learning_rate": 7.518316615787179e-05,
"loss": 1.1954,
"step": 11590
},
{
"epoch": 2.52,
"grad_norm": 1.093674659729004,
"learning_rate": 7.516136066288704e-05,
"loss": 1.1931,
"step": 11600
},
{
"epoch": 2.52,
"grad_norm": 1.0402370691299438,
"learning_rate": 7.513955516790232e-05,
"loss": 1.1834,
"step": 11610
},
{
"epoch": 2.52,
"grad_norm": 0.9660056233406067,
"learning_rate": 7.511774967291758e-05,
"loss": 1.1978,
"step": 11620
},
{
"epoch": 2.52,
"grad_norm": 1.1045291423797607,
"learning_rate": 7.509594417793284e-05,
"loss": 1.1789,
"step": 11630
},
{
"epoch": 2.53,
"grad_norm": 1.1806862354278564,
"learning_rate": 7.507413868294811e-05,
"loss": 1.1849,
"step": 11640
},
{
"epoch": 2.53,
"grad_norm": 1.0600950717926025,
"learning_rate": 7.505233318796337e-05,
"loss": 1.1863,
"step": 11650
},
{
"epoch": 2.53,
"grad_norm": 1.2518783807754517,
"learning_rate": 7.503052769297863e-05,
"loss": 1.1911,
"step": 11660
},
{
"epoch": 2.53,
"grad_norm": 1.0559264421463013,
"learning_rate": 7.50087221979939e-05,
"loss": 1.2106,
"step": 11670
},
{
"epoch": 2.54,
"grad_norm": 0.9558138847351074,
"learning_rate": 7.498691670300916e-05,
"loss": 1.1719,
"step": 11680
},
{
"epoch": 2.54,
"grad_norm": 1.0867066383361816,
"learning_rate": 7.496511120802444e-05,
"loss": 1.2209,
"step": 11690
},
{
"epoch": 2.54,
"grad_norm": 0.9424611926078796,
"learning_rate": 7.494330571303969e-05,
"loss": 1.1812,
"step": 11700
},
{
"epoch": 2.54,
"grad_norm": 1.04227614402771,
"learning_rate": 7.492150021805495e-05,
"loss": 1.204,
"step": 11710
},
{
"epoch": 2.54,
"grad_norm": 0.9230485558509827,
"learning_rate": 7.489969472307021e-05,
"loss": 1.1923,
"step": 11720
},
{
"epoch": 2.55,
"grad_norm": 1.079827070236206,
"learning_rate": 7.487788922808549e-05,
"loss": 1.1633,
"step": 11730
},
{
"epoch": 2.55,
"grad_norm": 1.0158615112304688,
"learning_rate": 7.485608373310075e-05,
"loss": 1.1828,
"step": 11740
},
{
"epoch": 2.55,
"grad_norm": 1.0298587083816528,
"learning_rate": 7.4834278238116e-05,
"loss": 1.2046,
"step": 11750
},
{
"epoch": 2.55,
"grad_norm": 1.1021103858947754,
"learning_rate": 7.481247274313126e-05,
"loss": 1.2369,
"step": 11760
},
{
"epoch": 2.56,
"grad_norm": 1.0776439905166626,
"learning_rate": 7.479066724814654e-05,
"loss": 1.1884,
"step": 11770
},
{
"epoch": 2.56,
"grad_norm": 1.0745654106140137,
"learning_rate": 7.47688617531618e-05,
"loss": 1.1915,
"step": 11780
},
{
"epoch": 2.56,
"grad_norm": 0.9988030791282654,
"learning_rate": 7.474705625817707e-05,
"loss": 1.1783,
"step": 11790
},
{
"epoch": 2.56,
"grad_norm": 0.9837521910667419,
"learning_rate": 7.472525076319233e-05,
"loss": 1.1859,
"step": 11800
},
{
"epoch": 2.56,
"grad_norm": 1.076101541519165,
"learning_rate": 7.470344526820759e-05,
"loss": 1.194,
"step": 11810
},
{
"epoch": 2.57,
"grad_norm": 1.0141769647598267,
"learning_rate": 7.468163977322286e-05,
"loss": 1.1893,
"step": 11820
},
{
"epoch": 2.57,
"grad_norm": 0.9962597489356995,
"learning_rate": 7.465983427823812e-05,
"loss": 1.2143,
"step": 11830
},
{
"epoch": 2.57,
"grad_norm": 1.0923272371292114,
"learning_rate": 7.46380287832534e-05,
"loss": 1.184,
"step": 11840
},
{
"epoch": 2.57,
"grad_norm": 1.1431857347488403,
"learning_rate": 7.461622328826864e-05,
"loss": 1.1926,
"step": 11850
},
{
"epoch": 2.57,
"grad_norm": 1.0489574670791626,
"learning_rate": 7.459441779328391e-05,
"loss": 1.1584,
"step": 11860
},
{
"epoch": 2.58,
"grad_norm": 1.049176812171936,
"learning_rate": 7.457261229829917e-05,
"loss": 1.2145,
"step": 11870
},
{
"epoch": 2.58,
"grad_norm": 1.0617070198059082,
"learning_rate": 7.455080680331445e-05,
"loss": 1.1821,
"step": 11880
},
{
"epoch": 2.58,
"grad_norm": 1.1978720426559448,
"learning_rate": 7.452900130832971e-05,
"loss": 1.1832,
"step": 11890
},
{
"epoch": 2.58,
"grad_norm": 1.0322489738464355,
"learning_rate": 7.450719581334496e-05,
"loss": 1.1978,
"step": 11900
},
{
"epoch": 2.59,
"grad_norm": 1.0497206449508667,
"learning_rate": 7.448539031836022e-05,
"loss": 1.1771,
"step": 11910
},
{
"epoch": 2.59,
"grad_norm": 1.0136041641235352,
"learning_rate": 7.44635848233755e-05,
"loss": 1.198,
"step": 11920
},
{
"epoch": 2.59,
"grad_norm": 1.0500036478042603,
"learning_rate": 7.444177932839076e-05,
"loss": 1.2019,
"step": 11930
},
{
"epoch": 2.59,
"grad_norm": 1.0009404420852661,
"learning_rate": 7.441997383340603e-05,
"loss": 1.197,
"step": 11940
},
{
"epoch": 2.59,
"grad_norm": 1.1604543924331665,
"learning_rate": 7.439816833842127e-05,
"loss": 1.1921,
"step": 11950
},
{
"epoch": 2.6,
"grad_norm": 1.0473634004592896,
"learning_rate": 7.437636284343655e-05,
"loss": 1.1718,
"step": 11960
},
{
"epoch": 2.6,
"grad_norm": 1.0517455339431763,
"learning_rate": 7.435455734845181e-05,
"loss": 1.1721,
"step": 11970
},
{
"epoch": 2.6,
"grad_norm": 1.0030772686004639,
"learning_rate": 7.433275185346708e-05,
"loss": 1.1942,
"step": 11980
},
{
"epoch": 2.6,
"grad_norm": 1.067175269126892,
"learning_rate": 7.431094635848234e-05,
"loss": 1.2015,
"step": 11990
},
{
"epoch": 2.61,
"grad_norm": 1.0570900440216064,
"learning_rate": 7.42891408634976e-05,
"loss": 1.1715,
"step": 12000
},
{
"epoch": 2.61,
"grad_norm": 1.0768860578536987,
"learning_rate": 7.426733536851287e-05,
"loss": 1.2118,
"step": 12010
},
{
"epoch": 2.61,
"grad_norm": 0.9864534139633179,
"learning_rate": 7.424552987352813e-05,
"loss": 1.211,
"step": 12020
},
{
"epoch": 2.61,
"grad_norm": 0.9961116909980774,
"learning_rate": 7.422372437854339e-05,
"loss": 1.1726,
"step": 12030
},
{
"epoch": 2.61,
"grad_norm": 1.149584174156189,
"learning_rate": 7.420191888355867e-05,
"loss": 1.2015,
"step": 12040
},
{
"epoch": 2.62,
"grad_norm": 0.9385210275650024,
"learning_rate": 7.418011338857392e-05,
"loss": 1.1853,
"step": 12050
},
{
"epoch": 2.62,
"grad_norm": 0.9972238540649414,
"learning_rate": 7.415830789358918e-05,
"loss": 1.1862,
"step": 12060
},
{
"epoch": 2.62,
"grad_norm": 1.1037793159484863,
"learning_rate": 7.413650239860446e-05,
"loss": 1.2191,
"step": 12070
},
{
"epoch": 2.62,
"grad_norm": 1.082542896270752,
"learning_rate": 7.411469690361972e-05,
"loss": 1.2079,
"step": 12080
},
{
"epoch": 2.62,
"grad_norm": 1.103800892829895,
"learning_rate": 7.409289140863498e-05,
"loss": 1.2069,
"step": 12090
},
{
"epoch": 2.63,
"grad_norm": 1.1348109245300293,
"learning_rate": 7.407108591365023e-05,
"loss": 1.1853,
"step": 12100
},
{
"epoch": 2.63,
"grad_norm": 1.0272557735443115,
"learning_rate": 7.404928041866551e-05,
"loss": 1.206,
"step": 12110
},
{
"epoch": 2.63,
"grad_norm": 1.06856369972229,
"learning_rate": 7.402747492368077e-05,
"loss": 1.2077,
"step": 12120
},
{
"epoch": 2.63,
"grad_norm": 0.9664187431335449,
"learning_rate": 7.400566942869604e-05,
"loss": 1.2252,
"step": 12130
},
{
"epoch": 2.64,
"grad_norm": 1.0753014087677002,
"learning_rate": 7.39838639337113e-05,
"loss": 1.2033,
"step": 12140
},
{
"epoch": 2.64,
"grad_norm": 1.1803292036056519,
"learning_rate": 7.396205843872656e-05,
"loss": 1.1944,
"step": 12150
},
{
"epoch": 2.64,
"grad_norm": 0.9899237155914307,
"learning_rate": 7.394025294374183e-05,
"loss": 1.1768,
"step": 12160
},
{
"epoch": 2.64,
"grad_norm": 1.0693211555480957,
"learning_rate": 7.391844744875709e-05,
"loss": 1.1957,
"step": 12170
},
{
"epoch": 2.64,
"grad_norm": 1.0212500095367432,
"learning_rate": 7.389664195377235e-05,
"loss": 1.1807,
"step": 12180
},
{
"epoch": 2.65,
"grad_norm": 0.9626917839050293,
"learning_rate": 7.387483645878763e-05,
"loss": 1.2019,
"step": 12190
},
{
"epoch": 2.65,
"grad_norm": 1.0324492454528809,
"learning_rate": 7.385303096380288e-05,
"loss": 1.1787,
"step": 12200
},
{
"epoch": 2.65,
"grad_norm": 1.0183689594268799,
"learning_rate": 7.383122546881814e-05,
"loss": 1.1718,
"step": 12210
},
{
"epoch": 2.65,
"grad_norm": 1.03179132938385,
"learning_rate": 7.38094199738334e-05,
"loss": 1.1684,
"step": 12220
},
{
"epoch": 2.66,
"grad_norm": 1.0151221752166748,
"learning_rate": 7.378761447884868e-05,
"loss": 1.1754,
"step": 12230
},
{
"epoch": 2.66,
"grad_norm": 1.0675002336502075,
"learning_rate": 7.376580898386394e-05,
"loss": 1.1964,
"step": 12240
},
{
"epoch": 2.66,
"grad_norm": 0.9424752593040466,
"learning_rate": 7.374400348887919e-05,
"loss": 1.1994,
"step": 12250
},
{
"epoch": 2.66,
"grad_norm": 1.0181151628494263,
"learning_rate": 7.372219799389446e-05,
"loss": 1.1943,
"step": 12260
},
{
"epoch": 2.66,
"grad_norm": 1.0865308046340942,
"learning_rate": 7.370039249890973e-05,
"loss": 1.1703,
"step": 12270
},
{
"epoch": 2.67,
"grad_norm": 1.043016791343689,
"learning_rate": 7.3678587003925e-05,
"loss": 1.1813,
"step": 12280
},
{
"epoch": 2.67,
"grad_norm": 1.060164213180542,
"learning_rate": 7.365678150894026e-05,
"loss": 1.1769,
"step": 12290
},
{
"epoch": 2.67,
"grad_norm": 1.0264476537704468,
"learning_rate": 7.363497601395552e-05,
"loss": 1.1895,
"step": 12300
},
{
"epoch": 2.67,
"grad_norm": 1.0359675884246826,
"learning_rate": 7.361317051897078e-05,
"loss": 1.1773,
"step": 12310
},
{
"epoch": 2.67,
"grad_norm": 1.0558348894119263,
"learning_rate": 7.359136502398605e-05,
"loss": 1.2011,
"step": 12320
},
{
"epoch": 2.68,
"grad_norm": 1.0487242937088013,
"learning_rate": 7.356955952900131e-05,
"loss": 1.2145,
"step": 12330
},
{
"epoch": 2.68,
"grad_norm": 1.0390251874923706,
"learning_rate": 7.354775403401657e-05,
"loss": 1.1771,
"step": 12340
},
{
"epoch": 2.68,
"grad_norm": 0.9608905911445618,
"learning_rate": 7.352594853903184e-05,
"loss": 1.1988,
"step": 12350
},
{
"epoch": 2.68,
"grad_norm": 0.9924561977386475,
"learning_rate": 7.35041430440471e-05,
"loss": 1.2049,
"step": 12360
},
{
"epoch": 2.69,
"grad_norm": 0.9115813970565796,
"learning_rate": 7.348233754906236e-05,
"loss": 1.185,
"step": 12370
},
{
"epoch": 2.69,
"grad_norm": 0.9227597713470459,
"learning_rate": 7.346053205407764e-05,
"loss": 1.1964,
"step": 12380
},
{
"epoch": 2.69,
"grad_norm": 1.1192283630371094,
"learning_rate": 7.34387265590929e-05,
"loss": 1.1927,
"step": 12390
},
{
"epoch": 2.69,
"grad_norm": 0.9770265817642212,
"learning_rate": 7.341692106410815e-05,
"loss": 1.197,
"step": 12400
},
{
"epoch": 2.69,
"grad_norm": 1.0701338052749634,
"learning_rate": 7.339511556912341e-05,
"loss": 1.1834,
"step": 12410
},
{
"epoch": 2.7,
"grad_norm": 1.0348602533340454,
"learning_rate": 7.337331007413869e-05,
"loss": 1.2115,
"step": 12420
},
{
"epoch": 2.7,
"grad_norm": 1.0927150249481201,
"learning_rate": 7.335150457915395e-05,
"loss": 1.2032,
"step": 12430
},
{
"epoch": 2.7,
"grad_norm": 1.0548428297042847,
"learning_rate": 7.332969908416922e-05,
"loss": 1.1962,
"step": 12440
},
{
"epoch": 2.7,
"grad_norm": 0.9672625064849854,
"learning_rate": 7.330789358918447e-05,
"loss": 1.1761,
"step": 12450
},
{
"epoch": 2.71,
"grad_norm": 0.9257100820541382,
"learning_rate": 7.328608809419974e-05,
"loss": 1.2007,
"step": 12460
},
{
"epoch": 2.71,
"grad_norm": 1.0286579132080078,
"learning_rate": 7.3264282599215e-05,
"loss": 1.1988,
"step": 12470
},
{
"epoch": 2.71,
"grad_norm": 1.153806447982788,
"learning_rate": 7.324247710423027e-05,
"loss": 1.207,
"step": 12480
},
{
"epoch": 2.71,
"grad_norm": 0.9337689876556396,
"learning_rate": 7.322067160924553e-05,
"loss": 1.2006,
"step": 12490
},
{
"epoch": 2.71,
"grad_norm": 0.9721220135688782,
"learning_rate": 7.31988661142608e-05,
"loss": 1.2014,
"step": 12500
},
{
"epoch": 2.72,
"grad_norm": 1.158456802368164,
"learning_rate": 7.317706061927606e-05,
"loss": 1.2074,
"step": 12510
},
{
"epoch": 2.72,
"grad_norm": 1.0969914197921753,
"learning_rate": 7.315525512429132e-05,
"loss": 1.207,
"step": 12520
},
{
"epoch": 2.72,
"grad_norm": 0.9585858583450317,
"learning_rate": 7.31334496293066e-05,
"loss": 1.1783,
"step": 12530
},
{
"epoch": 2.72,
"grad_norm": 1.0447596311569214,
"learning_rate": 7.311164413432186e-05,
"loss": 1.1662,
"step": 12540
},
{
"epoch": 2.72,
"grad_norm": 1.0252220630645752,
"learning_rate": 7.308983863933711e-05,
"loss": 1.1891,
"step": 12550
},
{
"epoch": 2.73,
"grad_norm": 1.075294017791748,
"learning_rate": 7.306803314435237e-05,
"loss": 1.1917,
"step": 12560
},
{
"epoch": 2.73,
"grad_norm": 1.0980489253997803,
"learning_rate": 7.304622764936765e-05,
"loss": 1.1829,
"step": 12570
},
{
"epoch": 2.73,
"grad_norm": 1.0682340860366821,
"learning_rate": 7.302442215438291e-05,
"loss": 1.1859,
"step": 12580
},
{
"epoch": 2.73,
"grad_norm": 1.0863393545150757,
"learning_rate": 7.300261665939818e-05,
"loss": 1.188,
"step": 12590
},
{
"epoch": 2.74,
"grad_norm": 1.0569467544555664,
"learning_rate": 7.298081116441343e-05,
"loss": 1.1962,
"step": 12600
},
{
"epoch": 2.74,
"grad_norm": 1.0733450651168823,
"learning_rate": 7.29590056694287e-05,
"loss": 1.1934,
"step": 12610
},
{
"epoch": 2.74,
"grad_norm": 1.0762420892715454,
"learning_rate": 7.293720017444397e-05,
"loss": 1.181,
"step": 12620
},
{
"epoch": 2.74,
"grad_norm": 1.0010732412338257,
"learning_rate": 7.291539467945923e-05,
"loss": 1.1936,
"step": 12630
},
{
"epoch": 2.74,
"grad_norm": 1.039819598197937,
"learning_rate": 7.289358918447449e-05,
"loss": 1.2001,
"step": 12640
},
{
"epoch": 2.75,
"grad_norm": 1.1060088872909546,
"learning_rate": 7.287178368948975e-05,
"loss": 1.2056,
"step": 12650
},
{
"epoch": 2.75,
"grad_norm": 0.9314666986465454,
"learning_rate": 7.284997819450502e-05,
"loss": 1.1748,
"step": 12660
},
{
"epoch": 2.75,
"grad_norm": 1.2504175901412964,
"learning_rate": 7.282817269952028e-05,
"loss": 1.1737,
"step": 12670
},
{
"epoch": 2.75,
"grad_norm": 1.1391412019729614,
"learning_rate": 7.280636720453554e-05,
"loss": 1.1909,
"step": 12680
},
{
"epoch": 2.75,
"grad_norm": 1.0052971839904785,
"learning_rate": 7.278456170955081e-05,
"loss": 1.1902,
"step": 12690
},
{
"epoch": 2.76,
"grad_norm": 1.1059855222702026,
"learning_rate": 7.276275621456607e-05,
"loss": 1.2021,
"step": 12700
},
{
"epoch": 2.76,
"grad_norm": 1.0115567445755005,
"learning_rate": 7.274095071958133e-05,
"loss": 1.1512,
"step": 12710
},
{
"epoch": 2.76,
"grad_norm": 1.0905554294586182,
"learning_rate": 7.27191452245966e-05,
"loss": 1.1884,
"step": 12720
},
{
"epoch": 2.76,
"grad_norm": 1.023762583732605,
"learning_rate": 7.269733972961187e-05,
"loss": 1.1841,
"step": 12730
},
{
"epoch": 2.77,
"grad_norm": 1.0214531421661377,
"learning_rate": 7.267553423462714e-05,
"loss": 1.185,
"step": 12740
},
{
"epoch": 2.77,
"grad_norm": 1.043494701385498,
"learning_rate": 7.265372873964239e-05,
"loss": 1.1822,
"step": 12750
},
{
"epoch": 2.77,
"grad_norm": 1.0787135362625122,
"learning_rate": 7.263192324465766e-05,
"loss": 1.1827,
"step": 12760
},
{
"epoch": 2.77,
"grad_norm": 1.1063132286071777,
"learning_rate": 7.261011774967292e-05,
"loss": 1.1847,
"step": 12770
},
{
"epoch": 2.77,
"grad_norm": 1.0400912761688232,
"learning_rate": 7.258831225468819e-05,
"loss": 1.1603,
"step": 12780
},
{
"epoch": 2.78,
"grad_norm": 1.057569146156311,
"learning_rate": 7.256650675970345e-05,
"loss": 1.1713,
"step": 12790
},
{
"epoch": 2.78,
"grad_norm": 1.0713859796524048,
"learning_rate": 7.254470126471871e-05,
"loss": 1.2167,
"step": 12800
},
{
"epoch": 2.78,
"grad_norm": 1.0643656253814697,
"learning_rate": 7.252289576973398e-05,
"loss": 1.1744,
"step": 12810
},
{
"epoch": 2.78,
"grad_norm": 1.1218703985214233,
"learning_rate": 7.250109027474924e-05,
"loss": 1.2183,
"step": 12820
},
{
"epoch": 2.79,
"grad_norm": 0.9932084083557129,
"learning_rate": 7.24792847797645e-05,
"loss": 1.1774,
"step": 12830
},
{
"epoch": 2.79,
"grad_norm": 1.063856840133667,
"learning_rate": 7.245747928477977e-05,
"loss": 1.1519,
"step": 12840
},
{
"epoch": 2.79,
"grad_norm": 1.0655205249786377,
"learning_rate": 7.243567378979503e-05,
"loss": 1.1883,
"step": 12850
},
{
"epoch": 2.79,
"grad_norm": 0.9149487018585205,
"learning_rate": 7.241386829481029e-05,
"loss": 1.1636,
"step": 12860
},
{
"epoch": 2.79,
"grad_norm": 1.061606764793396,
"learning_rate": 7.239206279982556e-05,
"loss": 1.1933,
"step": 12870
},
{
"epoch": 2.8,
"grad_norm": 1.026875376701355,
"learning_rate": 7.237025730484083e-05,
"loss": 1.1697,
"step": 12880
},
{
"epoch": 2.8,
"grad_norm": 0.9857021570205688,
"learning_rate": 7.23484518098561e-05,
"loss": 1.1593,
"step": 12890
},
{
"epoch": 2.8,
"grad_norm": 1.0682117938995361,
"learning_rate": 7.232664631487134e-05,
"loss": 1.1846,
"step": 12900
},
{
"epoch": 2.8,
"grad_norm": 0.9390698671340942,
"learning_rate": 7.230484081988661e-05,
"loss": 1.1625,
"step": 12910
},
{
"epoch": 2.8,
"grad_norm": 1.0105453729629517,
"learning_rate": 7.228303532490188e-05,
"loss": 1.1929,
"step": 12920
},
{
"epoch": 2.81,
"grad_norm": 0.986284077167511,
"learning_rate": 7.226122982991715e-05,
"loss": 1.1973,
"step": 12930
},
{
"epoch": 2.81,
"grad_norm": 1.0369880199432373,
"learning_rate": 7.223942433493241e-05,
"loss": 1.1996,
"step": 12940
},
{
"epoch": 2.81,
"grad_norm": 1.1171998977661133,
"learning_rate": 7.221761883994766e-05,
"loss": 1.2022,
"step": 12950
},
{
"epoch": 2.81,
"grad_norm": 1.0862730741500854,
"learning_rate": 7.219581334496294e-05,
"loss": 1.19,
"step": 12960
},
{
"epoch": 2.82,
"grad_norm": 1.0609533786773682,
"learning_rate": 7.21740078499782e-05,
"loss": 1.1825,
"step": 12970
},
{
"epoch": 2.82,
"grad_norm": 0.98408043384552,
"learning_rate": 7.215220235499346e-05,
"loss": 1.1766,
"step": 12980
},
{
"epoch": 2.82,
"grad_norm": 1.0378422737121582,
"learning_rate": 7.213039686000873e-05,
"loss": 1.1843,
"step": 12990
},
{
"epoch": 2.82,
"grad_norm": 0.9478686451911926,
"learning_rate": 7.210859136502399e-05,
"loss": 1.1728,
"step": 13000
},
{
"epoch": 2.82,
"grad_norm": 1.0276613235473633,
"learning_rate": 7.208678587003925e-05,
"loss": 1.1796,
"step": 13010
},
{
"epoch": 2.83,
"grad_norm": 0.9244964122772217,
"learning_rate": 7.206498037505451e-05,
"loss": 1.1812,
"step": 13020
},
{
"epoch": 2.83,
"grad_norm": 1.0720821619033813,
"learning_rate": 7.204317488006979e-05,
"loss": 1.1597,
"step": 13030
},
{
"epoch": 2.83,
"grad_norm": 1.0820330381393433,
"learning_rate": 7.202136938508504e-05,
"loss": 1.1981,
"step": 13040
},
{
"epoch": 2.83,
"grad_norm": 0.9590197205543518,
"learning_rate": 7.19995638901003e-05,
"loss": 1.1898,
"step": 13050
},
{
"epoch": 2.84,
"grad_norm": 1.0559465885162354,
"learning_rate": 7.197775839511557e-05,
"loss": 1.1985,
"step": 13060
},
{
"epoch": 2.84,
"grad_norm": 0.9392025470733643,
"learning_rate": 7.195595290013084e-05,
"loss": 1.1933,
"step": 13070
},
{
"epoch": 2.84,
"grad_norm": 1.1029566526412964,
"learning_rate": 7.19341474051461e-05,
"loss": 1.1733,
"step": 13080
},
{
"epoch": 2.84,
"grad_norm": 1.0255013704299927,
"learning_rate": 7.191234191016137e-05,
"loss": 1.1762,
"step": 13090
},
{
"epoch": 2.84,
"grad_norm": 1.0394928455352783,
"learning_rate": 7.189053641517662e-05,
"loss": 1.151,
"step": 13100
},
{
"epoch": 2.85,
"grad_norm": 1.057391881942749,
"learning_rate": 7.18687309201919e-05,
"loss": 1.1761,
"step": 13110
},
{
"epoch": 2.85,
"grad_norm": 1.0358378887176514,
"learning_rate": 7.184692542520716e-05,
"loss": 1.1911,
"step": 13120
},
{
"epoch": 2.85,
"grad_norm": 1.0503947734832764,
"learning_rate": 7.182511993022242e-05,
"loss": 1.2198,
"step": 13130
},
{
"epoch": 2.85,
"grad_norm": 1.0237114429473877,
"learning_rate": 7.180331443523768e-05,
"loss": 1.2043,
"step": 13140
},
{
"epoch": 2.85,
"grad_norm": 0.9386830925941467,
"learning_rate": 7.178150894025295e-05,
"loss": 1.192,
"step": 13150
},
{
"epoch": 2.86,
"grad_norm": 0.9386530518531799,
"learning_rate": 7.175970344526821e-05,
"loss": 1.1864,
"step": 13160
},
{
"epoch": 2.86,
"grad_norm": 0.9574694633483887,
"learning_rate": 7.173789795028347e-05,
"loss": 1.1828,
"step": 13170
},
{
"epoch": 2.86,
"grad_norm": 1.0528520345687866,
"learning_rate": 7.171609245529874e-05,
"loss": 1.1861,
"step": 13180
},
{
"epoch": 2.86,
"grad_norm": 1.0283684730529785,
"learning_rate": 7.1694286960314e-05,
"loss": 1.1749,
"step": 13190
},
{
"epoch": 2.87,
"grad_norm": 0.9847733974456787,
"learning_rate": 7.167248146532926e-05,
"loss": 1.1903,
"step": 13200
},
{
"epoch": 2.87,
"grad_norm": 1.0302000045776367,
"learning_rate": 7.165067597034453e-05,
"loss": 1.1852,
"step": 13210
},
{
"epoch": 2.87,
"grad_norm": 1.0097705125808716,
"learning_rate": 7.16288704753598e-05,
"loss": 1.1874,
"step": 13220
},
{
"epoch": 2.87,
"grad_norm": 1.1593202352523804,
"learning_rate": 7.160706498037506e-05,
"loss": 1.1827,
"step": 13230
},
{
"epoch": 2.87,
"grad_norm": 0.9892207384109497,
"learning_rate": 7.158525948539033e-05,
"loss": 1.1694,
"step": 13240
},
{
"epoch": 2.88,
"grad_norm": 1.0846501588821411,
"learning_rate": 7.156345399040558e-05,
"loss": 1.1892,
"step": 13250
},
{
"epoch": 2.88,
"grad_norm": 1.014400601387024,
"learning_rate": 7.154164849542085e-05,
"loss": 1.1806,
"step": 13260
},
{
"epoch": 2.88,
"grad_norm": 1.0073882341384888,
"learning_rate": 7.151984300043612e-05,
"loss": 1.1781,
"step": 13270
},
{
"epoch": 2.88,
"grad_norm": 1.2205009460449219,
"learning_rate": 7.149803750545138e-05,
"loss": 1.1757,
"step": 13280
},
{
"epoch": 2.89,
"grad_norm": 1.058864951133728,
"learning_rate": 7.147623201046664e-05,
"loss": 1.1968,
"step": 13290
},
{
"epoch": 2.89,
"grad_norm": 1.0327656269073486,
"learning_rate": 7.14544265154819e-05,
"loss": 1.2232,
"step": 13300
},
{
"epoch": 2.89,
"grad_norm": 1.042557954788208,
"learning_rate": 7.143262102049717e-05,
"loss": 1.2254,
"step": 13310
},
{
"epoch": 2.89,
"grad_norm": 0.9692584276199341,
"learning_rate": 7.141081552551243e-05,
"loss": 1.1757,
"step": 13320
},
{
"epoch": 2.89,
"grad_norm": 1.0381295680999756,
"learning_rate": 7.13890100305277e-05,
"loss": 1.1961,
"step": 13330
},
{
"epoch": 2.9,
"grad_norm": 1.0239328145980835,
"learning_rate": 7.136720453554296e-05,
"loss": 1.155,
"step": 13340
},
{
"epoch": 2.9,
"grad_norm": 1.0357582569122314,
"learning_rate": 7.134539904055822e-05,
"loss": 1.1719,
"step": 13350
},
{
"epoch": 2.9,
"grad_norm": 1.0303056240081787,
"learning_rate": 7.132359354557348e-05,
"loss": 1.1934,
"step": 13360
},
{
"epoch": 2.9,
"grad_norm": 0.9931465983390808,
"learning_rate": 7.130178805058875e-05,
"loss": 1.1791,
"step": 13370
},
{
"epoch": 2.9,
"grad_norm": 1.0507264137268066,
"learning_rate": 7.127998255560402e-05,
"loss": 1.184,
"step": 13380
},
{
"epoch": 2.91,
"grad_norm": 1.0703891515731812,
"learning_rate": 7.125817706061927e-05,
"loss": 1.1853,
"step": 13390
},
{
"epoch": 2.91,
"grad_norm": 0.9957337975502014,
"learning_rate": 7.123637156563454e-05,
"loss": 1.1702,
"step": 13400
},
{
"epoch": 2.91,
"grad_norm": 1.1027911901474,
"learning_rate": 7.12145660706498e-05,
"loss": 1.1968,
"step": 13410
},
{
"epoch": 2.91,
"grad_norm": 0.9877254366874695,
"learning_rate": 7.119276057566508e-05,
"loss": 1.1752,
"step": 13420
},
{
"epoch": 2.92,
"grad_norm": 1.0115269422531128,
"learning_rate": 7.117095508068034e-05,
"loss": 1.1546,
"step": 13430
},
{
"epoch": 2.92,
"grad_norm": 0.9738414287567139,
"learning_rate": 7.11491495856956e-05,
"loss": 1.1576,
"step": 13440
},
{
"epoch": 2.92,
"grad_norm": 1.0419977903366089,
"learning_rate": 7.112734409071087e-05,
"loss": 1.1927,
"step": 13450
},
{
"epoch": 2.92,
"grad_norm": 1.0933623313903809,
"learning_rate": 7.110553859572613e-05,
"loss": 1.1747,
"step": 13460
},
{
"epoch": 2.92,
"grad_norm": 1.0882395505905151,
"learning_rate": 7.108373310074139e-05,
"loss": 1.189,
"step": 13470
},
{
"epoch": 2.93,
"grad_norm": 0.9442359209060669,
"learning_rate": 7.106192760575665e-05,
"loss": 1.1826,
"step": 13480
},
{
"epoch": 2.93,
"grad_norm": 1.0601658821105957,
"learning_rate": 7.104012211077192e-05,
"loss": 1.1854,
"step": 13490
},
{
"epoch": 2.93,
"grad_norm": 1.0670174360275269,
"learning_rate": 7.101831661578718e-05,
"loss": 1.1893,
"step": 13500
},
{
"epoch": 2.93,
"grad_norm": 1.0757992267608643,
"learning_rate": 7.099651112080244e-05,
"loss": 1.1984,
"step": 13510
},
{
"epoch": 2.94,
"grad_norm": 1.0340900421142578,
"learning_rate": 7.09747056258177e-05,
"loss": 1.2068,
"step": 13520
},
{
"epoch": 2.94,
"grad_norm": 1.0402545928955078,
"learning_rate": 7.095290013083298e-05,
"loss": 1.208,
"step": 13530
},
{
"epoch": 2.94,
"grad_norm": 1.1371444463729858,
"learning_rate": 7.093109463584823e-05,
"loss": 1.1883,
"step": 13540
},
{
"epoch": 2.94,
"grad_norm": 1.0464153289794922,
"learning_rate": 7.09092891408635e-05,
"loss": 1.1896,
"step": 13550
},
{
"epoch": 2.94,
"grad_norm": 0.9860671758651733,
"learning_rate": 7.088748364587876e-05,
"loss": 1.1782,
"step": 13560
},
{
"epoch": 2.95,
"grad_norm": 0.927305281162262,
"learning_rate": 7.086567815089404e-05,
"loss": 1.1759,
"step": 13570
},
{
"epoch": 2.95,
"grad_norm": 1.0116522312164307,
"learning_rate": 7.08438726559093e-05,
"loss": 1.1845,
"step": 13580
},
{
"epoch": 2.95,
"grad_norm": 1.0394808053970337,
"learning_rate": 7.082206716092456e-05,
"loss": 1.1949,
"step": 13590
},
{
"epoch": 2.95,
"grad_norm": 1.1558239459991455,
"learning_rate": 7.080026166593981e-05,
"loss": 1.1758,
"step": 13600
},
{
"epoch": 2.95,
"grad_norm": 0.9348282217979431,
"learning_rate": 7.077845617095509e-05,
"loss": 1.1976,
"step": 13610
},
{
"epoch": 2.96,
"grad_norm": 0.9124108552932739,
"learning_rate": 7.075665067597035e-05,
"loss": 1.172,
"step": 13620
},
{
"epoch": 2.96,
"grad_norm": 1.077690839767456,
"learning_rate": 7.073484518098561e-05,
"loss": 1.1835,
"step": 13630
},
{
"epoch": 2.96,
"grad_norm": 0.9495044350624084,
"learning_rate": 7.071303968600088e-05,
"loss": 1.1682,
"step": 13640
},
{
"epoch": 2.96,
"grad_norm": 0.9947417378425598,
"learning_rate": 7.069123419101614e-05,
"loss": 1.2216,
"step": 13650
},
{
"epoch": 2.97,
"grad_norm": 1.072772741317749,
"learning_rate": 7.06694286960314e-05,
"loss": 1.2006,
"step": 13660
},
{
"epoch": 2.97,
"grad_norm": 1.0669934749603271,
"learning_rate": 7.064762320104667e-05,
"loss": 1.1992,
"step": 13670
},
{
"epoch": 2.97,
"grad_norm": 1.0894432067871094,
"learning_rate": 7.062581770606194e-05,
"loss": 1.1745,
"step": 13680
},
{
"epoch": 2.97,
"grad_norm": 0.9627017378807068,
"learning_rate": 7.060401221107719e-05,
"loss": 1.1818,
"step": 13690
},
{
"epoch": 2.97,
"grad_norm": 0.9909853935241699,
"learning_rate": 7.058220671609245e-05,
"loss": 1.1705,
"step": 13700
},
{
"epoch": 2.98,
"grad_norm": 1.0125415325164795,
"learning_rate": 7.056040122110772e-05,
"loss": 1.211,
"step": 13710
},
{
"epoch": 2.98,
"grad_norm": 0.9729527235031128,
"learning_rate": 7.0538595726123e-05,
"loss": 1.1658,
"step": 13720
},
{
"epoch": 2.98,
"grad_norm": 1.0256701707839966,
"learning_rate": 7.051679023113826e-05,
"loss": 1.1657,
"step": 13730
},
{
"epoch": 2.98,
"grad_norm": 1.0687954425811768,
"learning_rate": 7.04949847361535e-05,
"loss": 1.1648,
"step": 13740
},
{
"epoch": 2.99,
"grad_norm": 0.9713466167449951,
"learning_rate": 7.047317924116877e-05,
"loss": 1.1774,
"step": 13750
},
{
"epoch": 2.99,
"grad_norm": 1.0809965133666992,
"learning_rate": 7.045137374618405e-05,
"loss": 1.1658,
"step": 13760
},
{
"epoch": 2.99,
"grad_norm": 1.0827128887176514,
"learning_rate": 7.042956825119931e-05,
"loss": 1.1639,
"step": 13770
},
{
"epoch": 2.99,
"grad_norm": 1.112669825553894,
"learning_rate": 7.040776275621457e-05,
"loss": 1.1743,
"step": 13780
},
{
"epoch": 2.99,
"grad_norm": 0.9779360890388489,
"learning_rate": 7.038595726122984e-05,
"loss": 1.1823,
"step": 13790
},
{
"epoch": 3.0,
"grad_norm": 1.0385786294937134,
"learning_rate": 7.03641517662451e-05,
"loss": 1.1804,
"step": 13800
},
{
"epoch": 3.0,
"grad_norm": 1.05619215965271,
"learning_rate": 7.034234627126036e-05,
"loss": 1.1936,
"step": 13810
},
{
"epoch": 3.0,
"eval_loss": 1.28429114818573,
"eval_runtime": 1501.2758,
"eval_samples_per_second": 257.68,
"eval_steps_per_second": 4.027,
"step": 13818
}
],
"logging_steps": 10,
"max_steps": 46060,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 500,
"total_flos": 5.893571450073252e+18,
"train_batch_size": 6,
"trial_name": null,
"trial_params": null
}