{ "best_metric": 1.2792317867279053, "best_model_checkpoint": "saved_model/c2s_sep_2024/checkpoint-4606", "epoch": 2.9998371777476254, "eval_steps": 500, "global_step": 13818, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": NaN, "learning_rate": 0.0, "loss": 77.1448, "step": 1 }, { "epoch": 0.0, "grad_norm": 17.278156280517578, "learning_rate": 2.5e-06, "loss": 76.5629, "step": 10 }, { "epoch": 0.0, "grad_norm": 15.856775283813477, "learning_rate": 7.000000000000001e-06, "loss": 75.6974, "step": 20 }, { "epoch": 0.01, "grad_norm": 15.606675148010254, "learning_rate": 1.2e-05, "loss": 74.9514, "step": 30 }, { "epoch": 0.01, "grad_norm": 12.968363761901855, "learning_rate": 1.7000000000000003e-05, "loss": 72.4643, "step": 40 }, { "epoch": 0.01, "grad_norm": 13.329130172729492, "learning_rate": 2.2000000000000003e-05, "loss": 69.0552, "step": 50 }, { "epoch": 0.01, "grad_norm": 18.156723022460938, "learning_rate": 2.7000000000000002e-05, "loss": 64.2775, "step": 60 }, { "epoch": 0.02, "grad_norm": 29.901222229003906, "learning_rate": 3.2000000000000005e-05, "loss": 52.1897, "step": 70 }, { "epoch": 0.02, "grad_norm": 27.163593292236328, "learning_rate": 3.65e-05, "loss": 30.5964, "step": 80 }, { "epoch": 0.02, "grad_norm": 13.53585433959961, "learning_rate": 4.15e-05, "loss": 12.5007, "step": 90 }, { "epoch": 0.02, "grad_norm": 7.8353095054626465, "learning_rate": 4.6500000000000005e-05, "loss": 6.4802, "step": 100 }, { "epoch": 0.02, "grad_norm": 6.838261127471924, "learning_rate": 5.1500000000000005e-05, "loss": 4.7819, "step": 110 }, { "epoch": 0.03, "grad_norm": 8.852176666259766, "learning_rate": 5.65e-05, "loss": 4.1049, "step": 120 }, { "epoch": 0.03, "grad_norm": 7.614436149597168, "learning_rate": 6.15e-05, "loss": 3.7732, "step": 130 }, { "epoch": 0.03, "grad_norm": 7.756160259246826, "learning_rate": 6.65e-05, "loss": 3.6324, "step": 140 }, { "epoch": 0.03, "grad_norm": 6.736324310302734, "learning_rate": 7.15e-05, "loss": 3.4327, "step": 150 }, { "epoch": 0.03, "grad_norm": 8.393209457397461, "learning_rate": 7.65e-05, "loss": 3.4096, "step": 160 }, { "epoch": 0.04, "grad_norm": 5.403553485870361, "learning_rate": 8.15e-05, "loss": 3.2845, "step": 170 }, { "epoch": 0.04, "grad_norm": 5.367032051086426, "learning_rate": 8.65e-05, "loss": 3.2462, "step": 180 }, { "epoch": 0.04, "grad_norm": 7.965042591094971, "learning_rate": 9.15e-05, "loss": 3.1463, "step": 190 }, { "epoch": 0.04, "grad_norm": 7.074673175811768, "learning_rate": 9.65e-05, "loss": 3.1758, "step": 200 }, { "epoch": 0.05, "grad_norm": 6.894763946533203, "learning_rate": 9.999345835150458e-05, "loss": 3.0311, "step": 210 }, { "epoch": 0.05, "grad_norm": 6.925544738769531, "learning_rate": 9.997165285651984e-05, "loss": 3.0684, "step": 220 }, { "epoch": 0.05, "grad_norm": 5.285668849945068, "learning_rate": 9.994984736153511e-05, "loss": 2.9234, "step": 230 }, { "epoch": 0.05, "grad_norm": 6.81157922744751, "learning_rate": 9.992804186655037e-05, "loss": 2.8664, "step": 240 }, { "epoch": 0.05, "grad_norm": 6.883147239685059, "learning_rate": 9.990623637156565e-05, "loss": 2.9204, "step": 250 }, { "epoch": 0.06, "grad_norm": 5.505452632904053, "learning_rate": 9.988443087658091e-05, "loss": 2.8818, "step": 260 }, { "epoch": 0.06, "grad_norm": 7.352786064147949, "learning_rate": 9.986262538159616e-05, "loss": 2.8999, "step": 270 }, { "epoch": 0.06, "grad_norm": 6.875962734222412, "learning_rate": 9.984081988661144e-05, "loss": 2.8523, "step": 280 }, { "epoch": 0.06, "grad_norm": 5.861810684204102, "learning_rate": 9.98190143916267e-05, "loss": 2.8062, "step": 290 }, { "epoch": 0.07, "grad_norm": 5.396953582763672, "learning_rate": 9.979720889664196e-05, "loss": 2.7625, "step": 300 }, { "epoch": 0.07, "grad_norm": 6.168801307678223, "learning_rate": 9.977540340165722e-05, "loss": 2.7063, "step": 310 }, { "epoch": 0.07, "grad_norm": 4.478597640991211, "learning_rate": 9.975359790667249e-05, "loss": 2.6539, "step": 320 }, { "epoch": 0.07, "grad_norm": 5.2905731201171875, "learning_rate": 9.973179241168775e-05, "loss": 2.7406, "step": 330 }, { "epoch": 0.07, "grad_norm": 5.451777935028076, "learning_rate": 9.970998691670301e-05, "loss": 2.6599, "step": 340 }, { "epoch": 0.08, "grad_norm": 5.45026969909668, "learning_rate": 9.968818142171828e-05, "loss": 2.6406, "step": 350 }, { "epoch": 0.08, "grad_norm": 4.353079795837402, "learning_rate": 9.966637592673354e-05, "loss": 2.5285, "step": 360 }, { "epoch": 0.08, "grad_norm": 4.052408218383789, "learning_rate": 9.96445704317488e-05, "loss": 2.4025, "step": 370 }, { "epoch": 0.08, "grad_norm": 4.303618431091309, "learning_rate": 9.962276493676407e-05, "loss": 2.2459, "step": 380 }, { "epoch": 0.08, "grad_norm": 3.2505452632904053, "learning_rate": 9.960095944177933e-05, "loss": 2.166, "step": 390 }, { "epoch": 0.09, "grad_norm": 2.806292772293091, "learning_rate": 9.95791539467946e-05, "loss": 2.0462, "step": 400 }, { "epoch": 0.09, "grad_norm": 2.9824328422546387, "learning_rate": 9.955734845180987e-05, "loss": 1.9315, "step": 410 }, { "epoch": 0.09, "grad_norm": 2.7355027198791504, "learning_rate": 9.953554295682512e-05, "loss": 1.9072, "step": 420 }, { "epoch": 0.09, "grad_norm": 2.385045051574707, "learning_rate": 9.951373746184038e-05, "loss": 1.8667, "step": 430 }, { "epoch": 0.1, "grad_norm": 2.4067020416259766, "learning_rate": 9.949193196685566e-05, "loss": 1.8179, "step": 440 }, { "epoch": 0.1, "grad_norm": 2.6805872917175293, "learning_rate": 9.947012647187092e-05, "loss": 1.8208, "step": 450 }, { "epoch": 0.1, "grad_norm": 1.9335626363754272, "learning_rate": 9.944832097688618e-05, "loss": 1.8092, "step": 460 }, { "epoch": 0.1, "grad_norm": 1.7954732179641724, "learning_rate": 9.942651548190143e-05, "loss": 1.7698, "step": 470 }, { "epoch": 0.1, "grad_norm": 2.2542481422424316, "learning_rate": 9.940470998691671e-05, "loss": 1.7359, "step": 480 }, { "epoch": 0.11, "grad_norm": 1.8089336156845093, "learning_rate": 9.938290449193197e-05, "loss": 1.7195, "step": 490 }, { "epoch": 0.11, "grad_norm": 2.3044662475585938, "learning_rate": 9.936109899694724e-05, "loss": 1.6901, "step": 500 }, { "epoch": 0.11, "grad_norm": 1.8811343908309937, "learning_rate": 9.93392935019625e-05, "loss": 1.6757, "step": 510 }, { "epoch": 0.11, "grad_norm": 2.8750667572021484, "learning_rate": 9.931748800697776e-05, "loss": 1.6902, "step": 520 }, { "epoch": 0.12, "grad_norm": 1.8759925365447998, "learning_rate": 9.929568251199303e-05, "loss": 1.6519, "step": 530 }, { "epoch": 0.12, "grad_norm": 1.7360563278198242, "learning_rate": 9.927387701700829e-05, "loss": 1.6381, "step": 540 }, { "epoch": 0.12, "grad_norm": 1.9994693994522095, "learning_rate": 9.925207152202356e-05, "loss": 1.6527, "step": 550 }, { "epoch": 0.12, "grad_norm": 1.803330659866333, "learning_rate": 9.923026602703881e-05, "loss": 1.6453, "step": 560 }, { "epoch": 0.12, "grad_norm": 1.569846272468567, "learning_rate": 9.920846053205408e-05, "loss": 1.6689, "step": 570 }, { "epoch": 0.13, "grad_norm": 1.5712964534759521, "learning_rate": 9.918665503706934e-05, "loss": 1.6512, "step": 580 }, { "epoch": 0.13, "grad_norm": 1.643431544303894, "learning_rate": 9.916484954208462e-05, "loss": 1.5994, "step": 590 }, { "epoch": 0.13, "grad_norm": 1.619866132736206, "learning_rate": 9.914304404709988e-05, "loss": 1.6212, "step": 600 }, { "epoch": 0.13, "grad_norm": 1.8739800453186035, "learning_rate": 9.912123855211514e-05, "loss": 1.5664, "step": 610 }, { "epoch": 0.13, "grad_norm": 1.9525455236434937, "learning_rate": 9.909943305713039e-05, "loss": 1.6108, "step": 620 }, { "epoch": 0.14, "grad_norm": 1.5381406545639038, "learning_rate": 9.907762756214567e-05, "loss": 1.6004, "step": 630 }, { "epoch": 0.14, "grad_norm": 1.5303971767425537, "learning_rate": 9.905582206716093e-05, "loss": 1.581, "step": 640 }, { "epoch": 0.14, "grad_norm": 1.6467609405517578, "learning_rate": 9.90340165721762e-05, "loss": 1.5812, "step": 650 }, { "epoch": 0.14, "grad_norm": 1.6094383001327515, "learning_rate": 9.901221107719146e-05, "loss": 1.6027, "step": 660 }, { "epoch": 0.15, "grad_norm": 1.5612354278564453, "learning_rate": 9.899040558220672e-05, "loss": 1.5477, "step": 670 }, { "epoch": 0.15, "grad_norm": 1.5925028324127197, "learning_rate": 9.896860008722198e-05, "loss": 1.5747, "step": 680 }, { "epoch": 0.15, "grad_norm": 1.434138298034668, "learning_rate": 9.894679459223725e-05, "loss": 1.5528, "step": 690 }, { "epoch": 0.15, "grad_norm": 1.6473920345306396, "learning_rate": 9.892498909725251e-05, "loss": 1.622, "step": 700 }, { "epoch": 0.15, "grad_norm": 1.599965214729309, "learning_rate": 9.890318360226777e-05, "loss": 1.5691, "step": 710 }, { "epoch": 0.16, "grad_norm": 1.6525471210479736, "learning_rate": 9.888137810728304e-05, "loss": 1.6131, "step": 720 }, { "epoch": 0.16, "grad_norm": 1.5170183181762695, "learning_rate": 9.88595726122983e-05, "loss": 1.5221, "step": 730 }, { "epoch": 0.16, "grad_norm": 1.596643328666687, "learning_rate": 9.883776711731358e-05, "loss": 1.545, "step": 740 }, { "epoch": 0.16, "grad_norm": 1.5849794149398804, "learning_rate": 9.881596162232884e-05, "loss": 1.5654, "step": 750 }, { "epoch": 0.16, "grad_norm": 1.4768157005310059, "learning_rate": 9.879415612734409e-05, "loss": 1.5345, "step": 760 }, { "epoch": 0.17, "grad_norm": 1.5123172998428345, "learning_rate": 9.877235063235935e-05, "loss": 1.5236, "step": 770 }, { "epoch": 0.17, "grad_norm": 1.5827418565750122, "learning_rate": 9.875054513737463e-05, "loss": 1.5174, "step": 780 }, { "epoch": 0.17, "grad_norm": 1.8722275495529175, "learning_rate": 9.872873964238989e-05, "loss": 1.5256, "step": 790 }, { "epoch": 0.17, "grad_norm": 1.6323179006576538, "learning_rate": 9.870693414740515e-05, "loss": 1.4835, "step": 800 }, { "epoch": 0.18, "grad_norm": 1.618322491645813, "learning_rate": 9.868512865242042e-05, "loss": 1.5214, "step": 810 }, { "epoch": 0.18, "grad_norm": 1.6474233865737915, "learning_rate": 9.866332315743568e-05, "loss": 1.4811, "step": 820 }, { "epoch": 0.18, "grad_norm": 1.4305635690689087, "learning_rate": 9.864151766245094e-05, "loss": 1.4727, "step": 830 }, { "epoch": 0.18, "grad_norm": 1.6656005382537842, "learning_rate": 9.86197121674662e-05, "loss": 1.5373, "step": 840 }, { "epoch": 0.18, "grad_norm": 1.640834927558899, "learning_rate": 9.859790667248147e-05, "loss": 1.4811, "step": 850 }, { "epoch": 0.19, "grad_norm": 1.474351167678833, "learning_rate": 9.857610117749673e-05, "loss": 1.4819, "step": 860 }, { "epoch": 0.19, "grad_norm": 1.28626549243927, "learning_rate": 9.8554295682512e-05, "loss": 1.5221, "step": 870 }, { "epoch": 0.19, "grad_norm": 1.313599944114685, "learning_rate": 9.853249018752726e-05, "loss": 1.5221, "step": 880 }, { "epoch": 0.19, "grad_norm": 1.609924554824829, "learning_rate": 9.851068469254252e-05, "loss": 1.519, "step": 890 }, { "epoch": 0.2, "grad_norm": 1.2516050338745117, "learning_rate": 9.84888791975578e-05, "loss": 1.4906, "step": 900 }, { "epoch": 0.2, "grad_norm": 1.3122848272323608, "learning_rate": 9.846707370257305e-05, "loss": 1.5051, "step": 910 }, { "epoch": 0.2, "grad_norm": 1.4828795194625854, "learning_rate": 9.844526820758831e-05, "loss": 1.5206, "step": 920 }, { "epoch": 0.2, "grad_norm": 1.3761475086212158, "learning_rate": 9.842346271260357e-05, "loss": 1.503, "step": 930 }, { "epoch": 0.2, "grad_norm": 1.4912587404251099, "learning_rate": 9.840165721761885e-05, "loss": 1.4932, "step": 940 }, { "epoch": 0.21, "grad_norm": 1.2759939432144165, "learning_rate": 9.837985172263411e-05, "loss": 1.4843, "step": 950 }, { "epoch": 0.21, "grad_norm": 1.6568008661270142, "learning_rate": 9.835804622764938e-05, "loss": 1.5046, "step": 960 }, { "epoch": 0.21, "grad_norm": 1.4292601346969604, "learning_rate": 9.833624073266463e-05, "loss": 1.5249, "step": 970 }, { "epoch": 0.21, "grad_norm": 1.4866324663162231, "learning_rate": 9.83144352376799e-05, "loss": 1.4959, "step": 980 }, { "epoch": 0.21, "grad_norm": 1.429203748703003, "learning_rate": 9.829262974269517e-05, "loss": 1.4725, "step": 990 }, { "epoch": 0.22, "grad_norm": 1.3150511980056763, "learning_rate": 9.827082424771043e-05, "loss": 1.4644, "step": 1000 }, { "epoch": 0.22, "grad_norm": 1.2386242151260376, "learning_rate": 9.824901875272569e-05, "loss": 1.4956, "step": 1010 }, { "epoch": 0.22, "grad_norm": 1.74444580078125, "learning_rate": 9.822721325774095e-05, "loss": 1.4477, "step": 1020 }, { "epoch": 0.22, "grad_norm": 1.21920907497406, "learning_rate": 9.820540776275622e-05, "loss": 1.5053, "step": 1030 }, { "epoch": 0.23, "grad_norm": 1.172884464263916, "learning_rate": 9.818360226777148e-05, "loss": 1.4478, "step": 1040 }, { "epoch": 0.23, "grad_norm": 1.3462252616882324, "learning_rate": 9.816179677278676e-05, "loss": 1.4749, "step": 1050 }, { "epoch": 0.23, "grad_norm": 1.230682373046875, "learning_rate": 9.8139991277802e-05, "loss": 1.4608, "step": 1060 }, { "epoch": 0.23, "grad_norm": 1.4852972030639648, "learning_rate": 9.811818578281727e-05, "loss": 1.5006, "step": 1070 }, { "epoch": 0.23, "grad_norm": 1.2698734998703003, "learning_rate": 9.809638028783253e-05, "loss": 1.4521, "step": 1080 }, { "epoch": 0.24, "grad_norm": 1.3210391998291016, "learning_rate": 9.807457479284781e-05, "loss": 1.4506, "step": 1090 }, { "epoch": 0.24, "grad_norm": 1.329473853111267, "learning_rate": 9.805276929786307e-05, "loss": 1.4587, "step": 1100 }, { "epoch": 0.24, "grad_norm": 1.185905933380127, "learning_rate": 9.803096380287832e-05, "loss": 1.439, "step": 1110 }, { "epoch": 0.24, "grad_norm": 1.1401315927505493, "learning_rate": 9.800915830789358e-05, "loss": 1.4934, "step": 1120 }, { "epoch": 0.25, "grad_norm": 1.2437337636947632, "learning_rate": 9.798735281290886e-05, "loss": 1.4771, "step": 1130 }, { "epoch": 0.25, "grad_norm": 1.231963872909546, "learning_rate": 9.796554731792412e-05, "loss": 1.4428, "step": 1140 }, { "epoch": 0.25, "grad_norm": 1.274877905845642, "learning_rate": 9.794374182293939e-05, "loss": 1.4414, "step": 1150 }, { "epoch": 0.25, "grad_norm": 1.376755952835083, "learning_rate": 9.792193632795465e-05, "loss": 1.4366, "step": 1160 }, { "epoch": 0.25, "grad_norm": 1.0724767446517944, "learning_rate": 9.790013083296991e-05, "loss": 1.4817, "step": 1170 }, { "epoch": 0.26, "grad_norm": 1.3843764066696167, "learning_rate": 9.787832533798518e-05, "loss": 1.4986, "step": 1180 }, { "epoch": 0.26, "grad_norm": 1.327138900756836, "learning_rate": 9.785651984300044e-05, "loss": 1.4484, "step": 1190 }, { "epoch": 0.26, "grad_norm": 1.3678048849105835, "learning_rate": 9.78347143480157e-05, "loss": 1.454, "step": 1200 }, { "epoch": 0.26, "grad_norm": 1.4238979816436768, "learning_rate": 9.781290885303097e-05, "loss": 1.4491, "step": 1210 }, { "epoch": 0.26, "grad_norm": 1.1681418418884277, "learning_rate": 9.779110335804623e-05, "loss": 1.4524, "step": 1220 }, { "epoch": 0.27, "grad_norm": 1.2097047567367554, "learning_rate": 9.776929786306149e-05, "loss": 1.4562, "step": 1230 }, { "epoch": 0.27, "grad_norm": 1.3048409223556519, "learning_rate": 9.774749236807677e-05, "loss": 1.4508, "step": 1240 }, { "epoch": 0.27, "grad_norm": 1.3852041959762573, "learning_rate": 9.772568687309203e-05, "loss": 1.4277, "step": 1250 }, { "epoch": 0.27, "grad_norm": 1.179715871810913, "learning_rate": 9.770388137810728e-05, "loss": 1.415, "step": 1260 }, { "epoch": 0.28, "grad_norm": 1.1659610271453857, "learning_rate": 9.768207588312254e-05, "loss": 1.4528, "step": 1270 }, { "epoch": 0.28, "grad_norm": 1.334057331085205, "learning_rate": 9.766027038813782e-05, "loss": 1.4525, "step": 1280 }, { "epoch": 0.28, "grad_norm": 1.5751981735229492, "learning_rate": 9.763846489315308e-05, "loss": 1.4427, "step": 1290 }, { "epoch": 0.28, "grad_norm": 1.1843003034591675, "learning_rate": 9.761665939816835e-05, "loss": 1.4427, "step": 1300 }, { "epoch": 0.28, "grad_norm": 1.3135390281677246, "learning_rate": 9.759485390318361e-05, "loss": 1.4245, "step": 1310 }, { "epoch": 0.29, "grad_norm": 1.1618658304214478, "learning_rate": 9.757304840819887e-05, "loss": 1.4622, "step": 1320 }, { "epoch": 0.29, "grad_norm": 1.159295678138733, "learning_rate": 9.755124291321414e-05, "loss": 1.4557, "step": 1330 }, { "epoch": 0.29, "grad_norm": 1.209723949432373, "learning_rate": 9.75294374182294e-05, "loss": 1.41, "step": 1340 }, { "epoch": 0.29, "grad_norm": 1.2520672082901, "learning_rate": 9.750763192324466e-05, "loss": 1.4362, "step": 1350 }, { "epoch": 0.3, "grad_norm": 1.2639249563217163, "learning_rate": 9.748582642825992e-05, "loss": 1.4526, "step": 1360 }, { "epoch": 0.3, "grad_norm": 1.2657458782196045, "learning_rate": 9.746402093327519e-05, "loss": 1.4479, "step": 1370 }, { "epoch": 0.3, "grad_norm": 1.4267339706420898, "learning_rate": 9.744221543829045e-05, "loss": 1.4219, "step": 1380 }, { "epoch": 0.3, "grad_norm": 1.1722772121429443, "learning_rate": 9.742040994330571e-05, "loss": 1.448, "step": 1390 }, { "epoch": 0.3, "grad_norm": 1.1443181037902832, "learning_rate": 9.739860444832099e-05, "loss": 1.4193, "step": 1400 }, { "epoch": 0.31, "grad_norm": 1.2879366874694824, "learning_rate": 9.737679895333624e-05, "loss": 1.4196, "step": 1410 }, { "epoch": 0.31, "grad_norm": 1.2243574857711792, "learning_rate": 9.73549934583515e-05, "loss": 1.4296, "step": 1420 }, { "epoch": 0.31, "grad_norm": 1.2071127891540527, "learning_rate": 9.733318796336677e-05, "loss": 1.4194, "step": 1430 }, { "epoch": 0.31, "grad_norm": 1.1925525665283203, "learning_rate": 9.731138246838204e-05, "loss": 1.4243, "step": 1440 }, { "epoch": 0.31, "grad_norm": 1.2962863445281982, "learning_rate": 9.72895769733973e-05, "loss": 1.4371, "step": 1450 }, { "epoch": 0.32, "grad_norm": 1.0177215337753296, "learning_rate": 9.726777147841255e-05, "loss": 1.4237, "step": 1460 }, { "epoch": 0.32, "grad_norm": 1.4175331592559814, "learning_rate": 9.724596598342783e-05, "loss": 1.4107, "step": 1470 }, { "epoch": 0.32, "grad_norm": 1.0958452224731445, "learning_rate": 9.72241604884431e-05, "loss": 1.4176, "step": 1480 }, { "epoch": 0.32, "grad_norm": 1.1612709760665894, "learning_rate": 9.720235499345836e-05, "loss": 1.4051, "step": 1490 }, { "epoch": 0.33, "grad_norm": 1.0781750679016113, "learning_rate": 9.718054949847362e-05, "loss": 1.4179, "step": 1500 }, { "epoch": 0.33, "grad_norm": 1.1481519937515259, "learning_rate": 9.715874400348888e-05, "loss": 1.4247, "step": 1510 }, { "epoch": 0.33, "grad_norm": 1.155716896057129, "learning_rate": 9.713693850850415e-05, "loss": 1.4268, "step": 1520 }, { "epoch": 0.33, "grad_norm": 1.0442588329315186, "learning_rate": 9.711513301351941e-05, "loss": 1.445, "step": 1530 }, { "epoch": 0.33, "grad_norm": 1.0979626178741455, "learning_rate": 9.709332751853467e-05, "loss": 1.4149, "step": 1540 }, { "epoch": 0.34, "grad_norm": 1.119378685951233, "learning_rate": 9.707152202354995e-05, "loss": 1.44, "step": 1550 }, { "epoch": 0.34, "grad_norm": 1.2214171886444092, "learning_rate": 9.70497165285652e-05, "loss": 1.44, "step": 1560 }, { "epoch": 0.34, "grad_norm": 1.1184163093566895, "learning_rate": 9.702791103358046e-05, "loss": 1.3981, "step": 1570 }, { "epoch": 0.34, "grad_norm": 1.130410075187683, "learning_rate": 9.700610553859572e-05, "loss": 1.4296, "step": 1580 }, { "epoch": 0.35, "grad_norm": 1.1225483417510986, "learning_rate": 9.6984300043611e-05, "loss": 1.4153, "step": 1590 }, { "epoch": 0.35, "grad_norm": 1.0556180477142334, "learning_rate": 9.696249454862626e-05, "loss": 1.4219, "step": 1600 }, { "epoch": 0.35, "grad_norm": 1.2000679969787598, "learning_rate": 9.694068905364151e-05, "loss": 1.3892, "step": 1610 }, { "epoch": 0.35, "grad_norm": 1.0137077569961548, "learning_rate": 9.691888355865678e-05, "loss": 1.3976, "step": 1620 }, { "epoch": 0.35, "grad_norm": 1.0124636888504028, "learning_rate": 9.689707806367205e-05, "loss": 1.4129, "step": 1630 }, { "epoch": 0.36, "grad_norm": 1.0647350549697876, "learning_rate": 9.687527256868732e-05, "loss": 1.357, "step": 1640 }, { "epoch": 0.36, "grad_norm": 1.0684030055999756, "learning_rate": 9.685346707370258e-05, "loss": 1.4082, "step": 1650 }, { "epoch": 0.36, "grad_norm": 1.0580588579177856, "learning_rate": 9.683166157871784e-05, "loss": 1.3959, "step": 1660 }, { "epoch": 0.36, "grad_norm": 1.1602911949157715, "learning_rate": 9.68098560837331e-05, "loss": 1.3857, "step": 1670 }, { "epoch": 0.36, "grad_norm": 1.1642051935195923, "learning_rate": 9.678805058874837e-05, "loss": 1.4055, "step": 1680 }, { "epoch": 0.37, "grad_norm": 1.0410170555114746, "learning_rate": 9.676624509376363e-05, "loss": 1.4071, "step": 1690 }, { "epoch": 0.37, "grad_norm": 1.067542314529419, "learning_rate": 9.674443959877891e-05, "loss": 1.4093, "step": 1700 }, { "epoch": 0.37, "grad_norm": 1.2621368169784546, "learning_rate": 9.672263410379416e-05, "loss": 1.3814, "step": 1710 }, { "epoch": 0.37, "grad_norm": 1.0956709384918213, "learning_rate": 9.670082860880942e-05, "loss": 1.4024, "step": 1720 }, { "epoch": 0.38, "grad_norm": 1.1027687788009644, "learning_rate": 9.667902311382468e-05, "loss": 1.3544, "step": 1730 }, { "epoch": 0.38, "grad_norm": 1.1282079219818115, "learning_rate": 9.665721761883996e-05, "loss": 1.3818, "step": 1740 }, { "epoch": 0.38, "grad_norm": 1.244485855102539, "learning_rate": 9.663541212385522e-05, "loss": 1.4024, "step": 1750 }, { "epoch": 0.38, "grad_norm": 1.2329769134521484, "learning_rate": 9.661360662887047e-05, "loss": 1.413, "step": 1760 }, { "epoch": 0.38, "grad_norm": 1.2671635150909424, "learning_rate": 9.659180113388574e-05, "loss": 1.4002, "step": 1770 }, { "epoch": 0.39, "grad_norm": 1.2992949485778809, "learning_rate": 9.656999563890101e-05, "loss": 1.3972, "step": 1780 }, { "epoch": 0.39, "grad_norm": 1.15711510181427, "learning_rate": 9.654819014391628e-05, "loss": 1.3882, "step": 1790 }, { "epoch": 0.39, "grad_norm": 1.122938632965088, "learning_rate": 9.652638464893154e-05, "loss": 1.4222, "step": 1800 }, { "epoch": 0.39, "grad_norm": 1.151628851890564, "learning_rate": 9.650457915394679e-05, "loss": 1.3898, "step": 1810 }, { "epoch": 0.4, "grad_norm": 1.0860607624053955, "learning_rate": 9.648277365896206e-05, "loss": 1.3745, "step": 1820 }, { "epoch": 0.4, "grad_norm": 0.9899650812149048, "learning_rate": 9.646096816397733e-05, "loss": 1.3985, "step": 1830 }, { "epoch": 0.4, "grad_norm": 1.019313097000122, "learning_rate": 9.643916266899259e-05, "loss": 1.4031, "step": 1840 }, { "epoch": 0.4, "grad_norm": 1.1719962358474731, "learning_rate": 9.641735717400785e-05, "loss": 1.3781, "step": 1850 }, { "epoch": 0.4, "grad_norm": 1.117961049079895, "learning_rate": 9.639555167902312e-05, "loss": 1.3885, "step": 1860 }, { "epoch": 0.41, "grad_norm": 1.3950169086456299, "learning_rate": 9.637374618403838e-05, "loss": 1.3746, "step": 1870 }, { "epoch": 0.41, "grad_norm": 1.1064496040344238, "learning_rate": 9.635194068905364e-05, "loss": 1.3764, "step": 1880 }, { "epoch": 0.41, "grad_norm": 1.174922227859497, "learning_rate": 9.63301351940689e-05, "loss": 1.42, "step": 1890 }, { "epoch": 0.41, "grad_norm": 1.3221770524978638, "learning_rate": 9.630832969908418e-05, "loss": 1.3712, "step": 1900 }, { "epoch": 0.41, "grad_norm": 1.0039620399475098, "learning_rate": 9.628652420409943e-05, "loss": 1.3976, "step": 1910 }, { "epoch": 0.42, "grad_norm": 0.9963878393173218, "learning_rate": 9.62647187091147e-05, "loss": 1.3977, "step": 1920 }, { "epoch": 0.42, "grad_norm": 1.2195067405700684, "learning_rate": 9.624291321412997e-05, "loss": 1.3847, "step": 1930 }, { "epoch": 0.42, "grad_norm": 1.0968499183654785, "learning_rate": 9.622110771914523e-05, "loss": 1.3937, "step": 1940 }, { "epoch": 0.42, "grad_norm": 0.992825448513031, "learning_rate": 9.61993022241605e-05, "loss": 1.4082, "step": 1950 }, { "epoch": 0.43, "grad_norm": 1.0395129919052124, "learning_rate": 9.617749672917575e-05, "loss": 1.3696, "step": 1960 }, { "epoch": 0.43, "grad_norm": 1.030629277229309, "learning_rate": 9.615569123419102e-05, "loss": 1.4, "step": 1970 }, { "epoch": 0.43, "grad_norm": 1.0580593347549438, "learning_rate": 9.613388573920629e-05, "loss": 1.3461, "step": 1980 }, { "epoch": 0.43, "grad_norm": 1.2588000297546387, "learning_rate": 9.611208024422155e-05, "loss": 1.3687, "step": 1990 }, { "epoch": 0.43, "grad_norm": 1.1057671308517456, "learning_rate": 9.609027474923681e-05, "loss": 1.3876, "step": 2000 }, { "epoch": 0.44, "grad_norm": 1.1952061653137207, "learning_rate": 9.606846925425208e-05, "loss": 1.3821, "step": 2010 }, { "epoch": 0.44, "grad_norm": 1.105406641960144, "learning_rate": 9.604666375926734e-05, "loss": 1.375, "step": 2020 }, { "epoch": 0.44, "grad_norm": 1.0594791173934937, "learning_rate": 9.60248582642826e-05, "loss": 1.3644, "step": 2030 }, { "epoch": 0.44, "grad_norm": 1.055421233177185, "learning_rate": 9.600305276929787e-05, "loss": 1.3938, "step": 2040 }, { "epoch": 0.45, "grad_norm": 1.2545115947723389, "learning_rate": 9.598124727431314e-05, "loss": 1.3709, "step": 2050 }, { "epoch": 0.45, "grad_norm": 0.9864488244056702, "learning_rate": 9.595944177932839e-05, "loss": 1.3802, "step": 2060 }, { "epoch": 0.45, "grad_norm": 1.0537374019622803, "learning_rate": 9.593763628434365e-05, "loss": 1.3847, "step": 2070 }, { "epoch": 0.45, "grad_norm": 1.0474879741668701, "learning_rate": 9.591583078935892e-05, "loss": 1.3616, "step": 2080 }, { "epoch": 0.45, "grad_norm": 1.1384907960891724, "learning_rate": 9.58940252943742e-05, "loss": 1.3548, "step": 2090 }, { "epoch": 0.46, "grad_norm": 1.1582238674163818, "learning_rate": 9.587221979938946e-05, "loss": 1.374, "step": 2100 }, { "epoch": 0.46, "grad_norm": 1.1610651016235352, "learning_rate": 9.58504143044047e-05, "loss": 1.3726, "step": 2110 }, { "epoch": 0.46, "grad_norm": 1.0401073694229126, "learning_rate": 9.582860880941997e-05, "loss": 1.3617, "step": 2120 }, { "epoch": 0.46, "grad_norm": 1.1059417724609375, "learning_rate": 9.580680331443525e-05, "loss": 1.3765, "step": 2130 }, { "epoch": 0.46, "grad_norm": 1.055931806564331, "learning_rate": 9.578499781945051e-05, "loss": 1.377, "step": 2140 }, { "epoch": 0.47, "grad_norm": 1.1078617572784424, "learning_rate": 9.576319232446577e-05, "loss": 1.3714, "step": 2150 }, { "epoch": 0.47, "grad_norm": 1.0788148641586304, "learning_rate": 9.574138682948104e-05, "loss": 1.3769, "step": 2160 }, { "epoch": 0.47, "grad_norm": 1.1252089738845825, "learning_rate": 9.57195813344963e-05, "loss": 1.3583, "step": 2170 }, { "epoch": 0.47, "grad_norm": 1.0174541473388672, "learning_rate": 9.569777583951156e-05, "loss": 1.3665, "step": 2180 }, { "epoch": 0.48, "grad_norm": 1.0689630508422852, "learning_rate": 9.567597034452682e-05, "loss": 1.3571, "step": 2190 }, { "epoch": 0.48, "grad_norm": 1.1311278343200684, "learning_rate": 9.565416484954209e-05, "loss": 1.3475, "step": 2200 }, { "epoch": 0.48, "grad_norm": 1.082227349281311, "learning_rate": 9.563235935455735e-05, "loss": 1.3952, "step": 2210 }, { "epoch": 0.48, "grad_norm": 1.116151213645935, "learning_rate": 9.561055385957261e-05, "loss": 1.3644, "step": 2220 }, { "epoch": 0.48, "grad_norm": 1.2500598430633545, "learning_rate": 9.558874836458788e-05, "loss": 1.3197, "step": 2230 }, { "epoch": 0.49, "grad_norm": 1.1783186197280884, "learning_rate": 9.556694286960315e-05, "loss": 1.3599, "step": 2240 }, { "epoch": 0.49, "grad_norm": 0.964650571346283, "learning_rate": 9.554513737461842e-05, "loss": 1.3765, "step": 2250 }, { "epoch": 0.49, "grad_norm": 1.1065633296966553, "learning_rate": 9.552333187963367e-05, "loss": 1.3605, "step": 2260 }, { "epoch": 0.49, "grad_norm": 1.4492055177688599, "learning_rate": 9.550152638464893e-05, "loss": 1.3766, "step": 2270 }, { "epoch": 0.49, "grad_norm": 0.9989602565765381, "learning_rate": 9.54797208896642e-05, "loss": 1.3821, "step": 2280 }, { "epoch": 0.5, "grad_norm": 1.2991678714752197, "learning_rate": 9.545791539467947e-05, "loss": 1.3418, "step": 2290 }, { "epoch": 0.5, "grad_norm": 1.1501140594482422, "learning_rate": 9.543610989969473e-05, "loss": 1.3627, "step": 2300 }, { "epoch": 0.5, "grad_norm": 0.9911489486694336, "learning_rate": 9.541430440470998e-05, "loss": 1.3413, "step": 2310 }, { "epoch": 0.5, "grad_norm": 1.1046435832977295, "learning_rate": 9.539249890972526e-05, "loss": 1.3494, "step": 2320 }, { "epoch": 0.51, "grad_norm": 1.0511558055877686, "learning_rate": 9.537069341474052e-05, "loss": 1.3347, "step": 2330 }, { "epoch": 0.51, "grad_norm": 1.1485401391983032, "learning_rate": 9.534888791975578e-05, "loss": 1.3833, "step": 2340 }, { "epoch": 0.51, "grad_norm": 1.2908611297607422, "learning_rate": 9.532708242477105e-05, "loss": 1.3958, "step": 2350 }, { "epoch": 0.51, "grad_norm": 1.0557186603546143, "learning_rate": 9.530527692978631e-05, "loss": 1.3455, "step": 2360 }, { "epoch": 0.51, "grad_norm": 1.0551774501800537, "learning_rate": 9.528347143480157e-05, "loss": 1.3366, "step": 2370 }, { "epoch": 0.52, "grad_norm": 1.0171273946762085, "learning_rate": 9.526166593981684e-05, "loss": 1.3488, "step": 2380 }, { "epoch": 0.52, "grad_norm": 1.3464566469192505, "learning_rate": 9.523986044483211e-05, "loss": 1.3274, "step": 2390 }, { "epoch": 0.52, "grad_norm": 1.1853042840957642, "learning_rate": 9.521805494984737e-05, "loss": 1.3553, "step": 2400 }, { "epoch": 0.52, "grad_norm": 1.2067043781280518, "learning_rate": 9.519624945486262e-05, "loss": 1.358, "step": 2410 }, { "epoch": 0.53, "grad_norm": 1.0003714561462402, "learning_rate": 9.517444395987789e-05, "loss": 1.3768, "step": 2420 }, { "epoch": 0.53, "grad_norm": 1.036536455154419, "learning_rate": 9.515263846489316e-05, "loss": 1.325, "step": 2430 }, { "epoch": 0.53, "grad_norm": 1.2333424091339111, "learning_rate": 9.513083296990843e-05, "loss": 1.3179, "step": 2440 }, { "epoch": 0.53, "grad_norm": 1.5285654067993164, "learning_rate": 9.510902747492369e-05, "loss": 1.3847, "step": 2450 }, { "epoch": 0.53, "grad_norm": 0.9648860096931458, "learning_rate": 9.508722197993894e-05, "loss": 1.3624, "step": 2460 }, { "epoch": 0.54, "grad_norm": 1.0200995206832886, "learning_rate": 9.506541648495422e-05, "loss": 1.3604, "step": 2470 }, { "epoch": 0.54, "grad_norm": 1.0368491411209106, "learning_rate": 9.504361098996948e-05, "loss": 1.3778, "step": 2480 }, { "epoch": 0.54, "grad_norm": 0.9241245985031128, "learning_rate": 9.502180549498474e-05, "loss": 1.3751, "step": 2490 }, { "epoch": 0.54, "grad_norm": 1.0286930799484253, "learning_rate": 9.5e-05, "loss": 1.3429, "step": 2500 }, { "epoch": 0.54, "grad_norm": 1.262276530265808, "learning_rate": 9.497819450501527e-05, "loss": 1.3533, "step": 2510 }, { "epoch": 0.55, "grad_norm": 1.1345752477645874, "learning_rate": 9.495638901003053e-05, "loss": 1.3502, "step": 2520 }, { "epoch": 0.55, "grad_norm": 1.025653600692749, "learning_rate": 9.49345835150458e-05, "loss": 1.3674, "step": 2530 }, { "epoch": 0.55, "grad_norm": 1.0177459716796875, "learning_rate": 9.491277802006106e-05, "loss": 1.356, "step": 2540 }, { "epoch": 0.55, "grad_norm": 1.1438894271850586, "learning_rate": 9.489097252507632e-05, "loss": 1.3488, "step": 2550 }, { "epoch": 0.56, "grad_norm": 1.133844017982483, "learning_rate": 9.486916703009158e-05, "loss": 1.3649, "step": 2560 }, { "epoch": 0.56, "grad_norm": 1.0228559970855713, "learning_rate": 9.484736153510685e-05, "loss": 1.3207, "step": 2570 }, { "epoch": 0.56, "grad_norm": 1.037307858467102, "learning_rate": 9.482555604012211e-05, "loss": 1.3517, "step": 2580 }, { "epoch": 0.56, "grad_norm": 1.123706340789795, "learning_rate": 9.480375054513739e-05, "loss": 1.371, "step": 2590 }, { "epoch": 0.56, "grad_norm": 1.0684685707092285, "learning_rate": 9.478194505015265e-05, "loss": 1.335, "step": 2600 }, { "epoch": 0.57, "grad_norm": 0.9726172089576721, "learning_rate": 9.47601395551679e-05, "loss": 1.3588, "step": 2610 }, { "epoch": 0.57, "grad_norm": 0.8923851251602173, "learning_rate": 9.473833406018318e-05, "loss": 1.3269, "step": 2620 }, { "epoch": 0.57, "grad_norm": 1.1655867099761963, "learning_rate": 9.471652856519844e-05, "loss": 1.3267, "step": 2630 }, { "epoch": 0.57, "grad_norm": 0.9636451005935669, "learning_rate": 9.46947230702137e-05, "loss": 1.3545, "step": 2640 }, { "epoch": 0.58, "grad_norm": 1.1559605598449707, "learning_rate": 9.467291757522896e-05, "loss": 1.3276, "step": 2650 }, { "epoch": 0.58, "grad_norm": 1.1488990783691406, "learning_rate": 9.465111208024423e-05, "loss": 1.3312, "step": 2660 }, { "epoch": 0.58, "grad_norm": 1.0026187896728516, "learning_rate": 9.462930658525949e-05, "loss": 1.3574, "step": 2670 }, { "epoch": 0.58, "grad_norm": 1.0129337310791016, "learning_rate": 9.460750109027475e-05, "loss": 1.3524, "step": 2680 }, { "epoch": 0.58, "grad_norm": 1.1561243534088135, "learning_rate": 9.458569559529002e-05, "loss": 1.3467, "step": 2690 }, { "epoch": 0.59, "grad_norm": 1.0476332902908325, "learning_rate": 9.456389010030528e-05, "loss": 1.3552, "step": 2700 }, { "epoch": 0.59, "grad_norm": 1.0199921131134033, "learning_rate": 9.454208460532054e-05, "loss": 1.3313, "step": 2710 }, { "epoch": 0.59, "grad_norm": 1.2194985151290894, "learning_rate": 9.45202791103358e-05, "loss": 1.3134, "step": 2720 }, { "epoch": 0.59, "grad_norm": 0.9112060070037842, "learning_rate": 9.449847361535107e-05, "loss": 1.3581, "step": 2730 }, { "epoch": 0.59, "grad_norm": 1.085046648979187, "learning_rate": 9.447666812036635e-05, "loss": 1.3344, "step": 2740 }, { "epoch": 0.6, "grad_norm": 1.0680015087127686, "learning_rate": 9.445486262538161e-05, "loss": 1.3227, "step": 2750 }, { "epoch": 0.6, "grad_norm": 0.9969652891159058, "learning_rate": 9.443305713039686e-05, "loss": 1.3324, "step": 2760 }, { "epoch": 0.6, "grad_norm": 1.0868465900421143, "learning_rate": 9.441125163541212e-05, "loss": 1.3261, "step": 2770 }, { "epoch": 0.6, "grad_norm": 1.0380125045776367, "learning_rate": 9.43894461404274e-05, "loss": 1.3378, "step": 2780 }, { "epoch": 0.61, "grad_norm": 0.9851745367050171, "learning_rate": 9.436764064544266e-05, "loss": 1.3171, "step": 2790 }, { "epoch": 0.61, "grad_norm": 0.9909139275550842, "learning_rate": 9.434583515045792e-05, "loss": 1.3073, "step": 2800 }, { "epoch": 0.61, "grad_norm": 1.0225688219070435, "learning_rate": 9.432402965547317e-05, "loss": 1.3119, "step": 2810 }, { "epoch": 0.61, "grad_norm": 1.018894910812378, "learning_rate": 9.430222416048845e-05, "loss": 1.3337, "step": 2820 }, { "epoch": 0.61, "grad_norm": 1.0594004392623901, "learning_rate": 9.428041866550371e-05, "loss": 1.309, "step": 2830 }, { "epoch": 0.62, "grad_norm": 1.0812976360321045, "learning_rate": 9.425861317051898e-05, "loss": 1.3403, "step": 2840 }, { "epoch": 0.62, "grad_norm": 0.9586821794509888, "learning_rate": 9.423680767553424e-05, "loss": 1.3413, "step": 2850 }, { "epoch": 0.62, "grad_norm": 0.9033297896385193, "learning_rate": 9.42150021805495e-05, "loss": 1.3361, "step": 2860 }, { "epoch": 0.62, "grad_norm": 0.976488471031189, "learning_rate": 9.419319668556476e-05, "loss": 1.3467, "step": 2870 }, { "epoch": 0.63, "grad_norm": 0.9687233567237854, "learning_rate": 9.417139119058003e-05, "loss": 1.3089, "step": 2880 }, { "epoch": 0.63, "grad_norm": 0.9967139959335327, "learning_rate": 9.41495856955953e-05, "loss": 1.3241, "step": 2890 }, { "epoch": 0.63, "grad_norm": 0.9404115676879883, "learning_rate": 9.412778020061055e-05, "loss": 1.3489, "step": 2900 }, { "epoch": 0.63, "grad_norm": 1.038221001625061, "learning_rate": 9.410597470562582e-05, "loss": 1.3405, "step": 2910 }, { "epoch": 0.63, "grad_norm": 0.9442505240440369, "learning_rate": 9.408416921064108e-05, "loss": 1.3733, "step": 2920 }, { "epoch": 0.64, "grad_norm": 0.8614059090614319, "learning_rate": 9.406236371565636e-05, "loss": 1.3369, "step": 2930 }, { "epoch": 0.64, "grad_norm": 1.0159504413604736, "learning_rate": 9.404055822067162e-05, "loss": 1.3473, "step": 2940 }, { "epoch": 0.64, "grad_norm": 0.9344844222068787, "learning_rate": 9.401875272568688e-05, "loss": 1.3191, "step": 2950 }, { "epoch": 0.64, "grad_norm": 0.9241899251937866, "learning_rate": 9.399694723070213e-05, "loss": 1.3074, "step": 2960 }, { "epoch": 0.64, "grad_norm": 1.0132297277450562, "learning_rate": 9.397514173571741e-05, "loss": 1.3345, "step": 2970 }, { "epoch": 0.65, "grad_norm": 1.035719633102417, "learning_rate": 9.395333624073267e-05, "loss": 1.3241, "step": 2980 }, { "epoch": 0.65, "grad_norm": 1.0716739892959595, "learning_rate": 9.393153074574793e-05, "loss": 1.3342, "step": 2990 }, { "epoch": 0.65, "grad_norm": 1.05617094039917, "learning_rate": 9.39097252507632e-05, "loss": 1.3174, "step": 3000 }, { "epoch": 0.65, "grad_norm": 1.0201910734176636, "learning_rate": 9.388791975577846e-05, "loss": 1.3427, "step": 3010 }, { "epoch": 0.66, "grad_norm": 0.9820442199707031, "learning_rate": 9.386611426079372e-05, "loss": 1.3187, "step": 3020 }, { "epoch": 0.66, "grad_norm": 0.9873951077461243, "learning_rate": 9.384430876580899e-05, "loss": 1.311, "step": 3030 }, { "epoch": 0.66, "grad_norm": 1.0694694519042969, "learning_rate": 9.382250327082425e-05, "loss": 1.3409, "step": 3040 }, { "epoch": 0.66, "grad_norm": 0.9933134317398071, "learning_rate": 9.380069777583951e-05, "loss": 1.3202, "step": 3050 }, { "epoch": 0.66, "grad_norm": 1.0120593309402466, "learning_rate": 9.377889228085478e-05, "loss": 1.3243, "step": 3060 }, { "epoch": 0.67, "grad_norm": 1.0012543201446533, "learning_rate": 9.375708678587004e-05, "loss": 1.3205, "step": 3070 }, { "epoch": 0.67, "grad_norm": 0.9940156936645508, "learning_rate": 9.373528129088532e-05, "loss": 1.3319, "step": 3080 }, { "epoch": 0.67, "grad_norm": 0.9410566687583923, "learning_rate": 9.371347579590058e-05, "loss": 1.3377, "step": 3090 }, { "epoch": 0.67, "grad_norm": 1.0209511518478394, "learning_rate": 9.369167030091584e-05, "loss": 1.3226, "step": 3100 }, { "epoch": 0.68, "grad_norm": 1.0901682376861572, "learning_rate": 9.366986480593109e-05, "loss": 1.3054, "step": 3110 }, { "epoch": 0.68, "grad_norm": 1.1590335369110107, "learning_rate": 9.364805931094637e-05, "loss": 1.333, "step": 3120 }, { "epoch": 0.68, "grad_norm": 0.9248669147491455, "learning_rate": 9.362625381596163e-05, "loss": 1.3195, "step": 3130 }, { "epoch": 0.68, "grad_norm": 0.9178153276443481, "learning_rate": 9.36044483209769e-05, "loss": 1.3411, "step": 3140 }, { "epoch": 0.68, "grad_norm": 0.8997146487236023, "learning_rate": 9.358264282599216e-05, "loss": 1.3238, "step": 3150 }, { "epoch": 0.69, "grad_norm": 0.872699499130249, "learning_rate": 9.356083733100742e-05, "loss": 1.311, "step": 3160 }, { "epoch": 0.69, "grad_norm": 1.0057190656661987, "learning_rate": 9.353903183602268e-05, "loss": 1.3419, "step": 3170 }, { "epoch": 0.69, "grad_norm": 0.9421138763427734, "learning_rate": 9.351722634103795e-05, "loss": 1.3326, "step": 3180 }, { "epoch": 0.69, "grad_norm": 1.072662353515625, "learning_rate": 9.349542084605321e-05, "loss": 1.3101, "step": 3190 }, { "epoch": 0.69, "grad_norm": 0.9273852109909058, "learning_rate": 9.347361535106847e-05, "loss": 1.2917, "step": 3200 }, { "epoch": 0.7, "grad_norm": 1.056483507156372, "learning_rate": 9.345180985608373e-05, "loss": 1.3145, "step": 3210 }, { "epoch": 0.7, "grad_norm": 1.0562832355499268, "learning_rate": 9.3430004361099e-05, "loss": 1.3236, "step": 3220 }, { "epoch": 0.7, "grad_norm": 0.9665394425392151, "learning_rate": 9.340819886611426e-05, "loss": 1.3311, "step": 3230 }, { "epoch": 0.7, "grad_norm": 1.1284903287887573, "learning_rate": 9.338639337112954e-05, "loss": 1.2955, "step": 3240 }, { "epoch": 0.71, "grad_norm": 0.8982547521591187, "learning_rate": 9.336458787614479e-05, "loss": 1.3064, "step": 3250 }, { "epoch": 0.71, "grad_norm": 0.9506440162658691, "learning_rate": 9.334278238116005e-05, "loss": 1.2924, "step": 3260 }, { "epoch": 0.71, "grad_norm": 0.990853488445282, "learning_rate": 9.332097688617531e-05, "loss": 1.3153, "step": 3270 }, { "epoch": 0.71, "grad_norm": 1.048412561416626, "learning_rate": 9.329917139119059e-05, "loss": 1.3151, "step": 3280 }, { "epoch": 0.71, "grad_norm": 0.9810274243354797, "learning_rate": 9.327736589620585e-05, "loss": 1.3106, "step": 3290 }, { "epoch": 0.72, "grad_norm": 1.2232158184051514, "learning_rate": 9.325556040122112e-05, "loss": 1.3269, "step": 3300 }, { "epoch": 0.72, "grad_norm": 0.9797046780586243, "learning_rate": 9.323375490623638e-05, "loss": 1.3237, "step": 3310 }, { "epoch": 0.72, "grad_norm": 0.9088875651359558, "learning_rate": 9.321194941125164e-05, "loss": 1.328, "step": 3320 }, { "epoch": 0.72, "grad_norm": 0.9865596294403076, "learning_rate": 9.31901439162669e-05, "loss": 1.3245, "step": 3330 }, { "epoch": 0.73, "grad_norm": 0.890883207321167, "learning_rate": 9.316833842128217e-05, "loss": 1.3078, "step": 3340 }, { "epoch": 0.73, "grad_norm": 1.2496368885040283, "learning_rate": 9.314653292629743e-05, "loss": 1.2926, "step": 3350 }, { "epoch": 0.73, "grad_norm": 0.9493234753608704, "learning_rate": 9.31247274313127e-05, "loss": 1.3267, "step": 3360 }, { "epoch": 0.73, "grad_norm": 0.9854113459587097, "learning_rate": 9.310292193632796e-05, "loss": 1.315, "step": 3370 }, { "epoch": 0.73, "grad_norm": 0.9487243294715881, "learning_rate": 9.308111644134322e-05, "loss": 1.3089, "step": 3380 }, { "epoch": 0.74, "grad_norm": 1.0045417547225952, "learning_rate": 9.30593109463585e-05, "loss": 1.3007, "step": 3390 }, { "epoch": 0.74, "grad_norm": 0.9876412749290466, "learning_rate": 9.303750545137375e-05, "loss": 1.3276, "step": 3400 }, { "epoch": 0.74, "grad_norm": 0.9821478724479675, "learning_rate": 9.301569995638901e-05, "loss": 1.3276, "step": 3410 }, { "epoch": 0.74, "grad_norm": 1.0079724788665771, "learning_rate": 9.299389446140427e-05, "loss": 1.3379, "step": 3420 }, { "epoch": 0.74, "grad_norm": 1.0058810710906982, "learning_rate": 9.297208896641955e-05, "loss": 1.309, "step": 3430 }, { "epoch": 0.75, "grad_norm": 0.9457936882972717, "learning_rate": 9.295028347143481e-05, "loss": 1.3301, "step": 3440 }, { "epoch": 0.75, "grad_norm": 1.0582879781723022, "learning_rate": 9.292847797645007e-05, "loss": 1.3075, "step": 3450 }, { "epoch": 0.75, "grad_norm": 1.0312747955322266, "learning_rate": 9.290667248146532e-05, "loss": 1.3102, "step": 3460 }, { "epoch": 0.75, "grad_norm": 1.3287076950073242, "learning_rate": 9.28848669864806e-05, "loss": 1.2828, "step": 3470 }, { "epoch": 0.76, "grad_norm": 1.0003306865692139, "learning_rate": 9.286306149149586e-05, "loss": 1.3158, "step": 3480 }, { "epoch": 0.76, "grad_norm": 0.9804103970527649, "learning_rate": 9.284125599651113e-05, "loss": 1.3429, "step": 3490 }, { "epoch": 0.76, "grad_norm": 0.9052048325538635, "learning_rate": 9.281945050152639e-05, "loss": 1.3248, "step": 3500 }, { "epoch": 0.76, "grad_norm": 0.9492114782333374, "learning_rate": 9.279764500654165e-05, "loss": 1.3173, "step": 3510 }, { "epoch": 0.76, "grad_norm": 0.9319648742675781, "learning_rate": 9.277583951155692e-05, "loss": 1.3188, "step": 3520 }, { "epoch": 0.77, "grad_norm": 0.9741306900978088, "learning_rate": 9.275403401657218e-05, "loss": 1.3263, "step": 3530 }, { "epoch": 0.77, "grad_norm": 0.9644444584846497, "learning_rate": 9.273222852158746e-05, "loss": 1.3089, "step": 3540 }, { "epoch": 0.77, "grad_norm": 0.972549319267273, "learning_rate": 9.27104230266027e-05, "loss": 1.3047, "step": 3550 }, { "epoch": 0.77, "grad_norm": 1.1472231149673462, "learning_rate": 9.268861753161797e-05, "loss": 1.3414, "step": 3560 }, { "epoch": 0.78, "grad_norm": 1.212759256362915, "learning_rate": 9.266681203663323e-05, "loss": 1.2955, "step": 3570 }, { "epoch": 0.78, "grad_norm": 0.9833585023880005, "learning_rate": 9.264500654164851e-05, "loss": 1.3101, "step": 3580 }, { "epoch": 0.78, "grad_norm": 1.0089327096939087, "learning_rate": 9.262320104666377e-05, "loss": 1.3078, "step": 3590 }, { "epoch": 0.78, "grad_norm": 1.026849627494812, "learning_rate": 9.260139555167902e-05, "loss": 1.3062, "step": 3600 }, { "epoch": 0.78, "grad_norm": 0.8988268375396729, "learning_rate": 9.257959005669428e-05, "loss": 1.2961, "step": 3610 }, { "epoch": 0.79, "grad_norm": 1.0766083002090454, "learning_rate": 9.255778456170956e-05, "loss": 1.302, "step": 3620 }, { "epoch": 0.79, "grad_norm": 1.111632227897644, "learning_rate": 9.253597906672482e-05, "loss": 1.3179, "step": 3630 }, { "epoch": 0.79, "grad_norm": 0.9569946527481079, "learning_rate": 9.251417357174009e-05, "loss": 1.3392, "step": 3640 }, { "epoch": 0.79, "grad_norm": 0.9719332456588745, "learning_rate": 9.249236807675535e-05, "loss": 1.3019, "step": 3650 }, { "epoch": 0.79, "grad_norm": 0.9521161317825317, "learning_rate": 9.247056258177061e-05, "loss": 1.3226, "step": 3660 }, { "epoch": 0.8, "grad_norm": 1.1349732875823975, "learning_rate": 9.244875708678587e-05, "loss": 1.3184, "step": 3670 }, { "epoch": 0.8, "grad_norm": 1.0802345275878906, "learning_rate": 9.242695159180114e-05, "loss": 1.3236, "step": 3680 }, { "epoch": 0.8, "grad_norm": 1.0327568054199219, "learning_rate": 9.24051460968164e-05, "loss": 1.3285, "step": 3690 }, { "epoch": 0.8, "grad_norm": 1.064948320388794, "learning_rate": 9.238334060183166e-05, "loss": 1.3158, "step": 3700 }, { "epoch": 0.81, "grad_norm": 0.88676518201828, "learning_rate": 9.236153510684693e-05, "loss": 1.3066, "step": 3710 }, { "epoch": 0.81, "grad_norm": 0.942152202129364, "learning_rate": 9.233972961186219e-05, "loss": 1.332, "step": 3720 }, { "epoch": 0.81, "grad_norm": 0.9341984987258911, "learning_rate": 9.231792411687745e-05, "loss": 1.3147, "step": 3730 }, { "epoch": 0.81, "grad_norm": 0.8915871381759644, "learning_rate": 9.229611862189273e-05, "loss": 1.3071, "step": 3740 }, { "epoch": 0.81, "grad_norm": 0.9265626668930054, "learning_rate": 9.227431312690798e-05, "loss": 1.3083, "step": 3750 }, { "epoch": 0.82, "grad_norm": 0.9003929495811462, "learning_rate": 9.225250763192324e-05, "loss": 1.3101, "step": 3760 }, { "epoch": 0.82, "grad_norm": 1.004757285118103, "learning_rate": 9.223070213693852e-05, "loss": 1.3324, "step": 3770 }, { "epoch": 0.82, "grad_norm": 0.9720560908317566, "learning_rate": 9.220889664195378e-05, "loss": 1.3074, "step": 3780 }, { "epoch": 0.82, "grad_norm": 1.0125725269317627, "learning_rate": 9.218709114696904e-05, "loss": 1.295, "step": 3790 }, { "epoch": 0.82, "grad_norm": 0.9948697686195374, "learning_rate": 9.21652856519843e-05, "loss": 1.3072, "step": 3800 }, { "epoch": 0.83, "grad_norm": 0.8904112577438354, "learning_rate": 9.214348015699957e-05, "loss": 1.2879, "step": 3810 }, { "epoch": 0.83, "grad_norm": 0.9827283620834351, "learning_rate": 9.212167466201483e-05, "loss": 1.2859, "step": 3820 }, { "epoch": 0.83, "grad_norm": 0.9134978652000427, "learning_rate": 9.20998691670301e-05, "loss": 1.2996, "step": 3830 }, { "epoch": 0.83, "grad_norm": 0.9517325162887573, "learning_rate": 9.207806367204536e-05, "loss": 1.2764, "step": 3840 }, { "epoch": 0.84, "grad_norm": 0.9537093043327332, "learning_rate": 9.205625817706062e-05, "loss": 1.3112, "step": 3850 }, { "epoch": 0.84, "grad_norm": 1.011399269104004, "learning_rate": 9.203445268207589e-05, "loss": 1.3008, "step": 3860 }, { "epoch": 0.84, "grad_norm": 1.0325734615325928, "learning_rate": 9.201264718709115e-05, "loss": 1.3032, "step": 3870 }, { "epoch": 0.84, "grad_norm": 0.9590222239494324, "learning_rate": 9.199084169210641e-05, "loss": 1.3002, "step": 3880 }, { "epoch": 0.84, "grad_norm": 0.984958827495575, "learning_rate": 9.196903619712169e-05, "loss": 1.3011, "step": 3890 }, { "epoch": 0.85, "grad_norm": 1.1154364347457886, "learning_rate": 9.194723070213694e-05, "loss": 1.3065, "step": 3900 }, { "epoch": 0.85, "grad_norm": 1.0203578472137451, "learning_rate": 9.19254252071522e-05, "loss": 1.3193, "step": 3910 }, { "epoch": 0.85, "grad_norm": 1.0204946994781494, "learning_rate": 9.190361971216746e-05, "loss": 1.3048, "step": 3920 }, { "epoch": 0.85, "grad_norm": 0.9758703708648682, "learning_rate": 9.188181421718274e-05, "loss": 1.2933, "step": 3930 }, { "epoch": 0.86, "grad_norm": 1.0854405164718628, "learning_rate": 9.1860008722198e-05, "loss": 1.2947, "step": 3940 }, { "epoch": 0.86, "grad_norm": 1.0030591487884521, "learning_rate": 9.183820322721325e-05, "loss": 1.2882, "step": 3950 }, { "epoch": 0.86, "grad_norm": 0.9652947187423706, "learning_rate": 9.181639773222852e-05, "loss": 1.2779, "step": 3960 }, { "epoch": 0.86, "grad_norm": 1.0450283288955688, "learning_rate": 9.179459223724379e-05, "loss": 1.2807, "step": 3970 }, { "epoch": 0.86, "grad_norm": 1.0894801616668701, "learning_rate": 9.177278674225906e-05, "loss": 1.3072, "step": 3980 }, { "epoch": 0.87, "grad_norm": 1.0392231941223145, "learning_rate": 9.175098124727432e-05, "loss": 1.3119, "step": 3990 }, { "epoch": 0.87, "grad_norm": 0.9792558550834656, "learning_rate": 9.172917575228958e-05, "loss": 1.3062, "step": 4000 }, { "epoch": 0.87, "grad_norm": 1.015689492225647, "learning_rate": 9.170737025730485e-05, "loss": 1.3075, "step": 4010 }, { "epoch": 0.87, "grad_norm": 1.0359702110290527, "learning_rate": 9.168556476232011e-05, "loss": 1.3022, "step": 4020 }, { "epoch": 0.87, "grad_norm": 0.9113004803657532, "learning_rate": 9.166375926733537e-05, "loss": 1.3298, "step": 4030 }, { "epoch": 0.88, "grad_norm": 1.0571136474609375, "learning_rate": 9.164195377235065e-05, "loss": 1.2898, "step": 4040 }, { "epoch": 0.88, "grad_norm": 0.9297426342964172, "learning_rate": 9.16201482773659e-05, "loss": 1.2895, "step": 4050 }, { "epoch": 0.88, "grad_norm": 1.0925400257110596, "learning_rate": 9.159834278238116e-05, "loss": 1.2998, "step": 4060 }, { "epoch": 0.88, "grad_norm": 0.9070808291435242, "learning_rate": 9.157653728739642e-05, "loss": 1.2998, "step": 4070 }, { "epoch": 0.89, "grad_norm": 1.1315734386444092, "learning_rate": 9.15547317924117e-05, "loss": 1.2867, "step": 4080 }, { "epoch": 0.89, "grad_norm": 1.0597316026687622, "learning_rate": 9.153292629742696e-05, "loss": 1.2931, "step": 4090 }, { "epoch": 0.89, "grad_norm": 0.9442005157470703, "learning_rate": 9.151112080244221e-05, "loss": 1.2805, "step": 4100 }, { "epoch": 0.89, "grad_norm": 1.3041001558303833, "learning_rate": 9.148931530745748e-05, "loss": 1.2934, "step": 4110 }, { "epoch": 0.89, "grad_norm": 0.9306684136390686, "learning_rate": 9.146750981247275e-05, "loss": 1.2933, "step": 4120 }, { "epoch": 0.9, "grad_norm": 0.9480651021003723, "learning_rate": 9.144570431748802e-05, "loss": 1.3147, "step": 4130 }, { "epoch": 0.9, "grad_norm": 0.98679119348526, "learning_rate": 9.142389882250328e-05, "loss": 1.3063, "step": 4140 }, { "epoch": 0.9, "grad_norm": 0.9486891627311707, "learning_rate": 9.140209332751853e-05, "loss": 1.2644, "step": 4150 }, { "epoch": 0.9, "grad_norm": 0.9325621724128723, "learning_rate": 9.13802878325338e-05, "loss": 1.2718, "step": 4160 }, { "epoch": 0.91, "grad_norm": 0.9871125221252441, "learning_rate": 9.135848233754907e-05, "loss": 1.2943, "step": 4170 }, { "epoch": 0.91, "grad_norm": 0.9043755531311035, "learning_rate": 9.133667684256433e-05, "loss": 1.3015, "step": 4180 }, { "epoch": 0.91, "grad_norm": 0.9878096580505371, "learning_rate": 9.13148713475796e-05, "loss": 1.2524, "step": 4190 }, { "epoch": 0.91, "grad_norm": 0.925841748714447, "learning_rate": 9.129306585259486e-05, "loss": 1.2881, "step": 4200 }, { "epoch": 0.91, "grad_norm": 0.8888818025588989, "learning_rate": 9.127126035761012e-05, "loss": 1.3057, "step": 4210 }, { "epoch": 0.92, "grad_norm": 1.1273852586746216, "learning_rate": 9.124945486262538e-05, "loss": 1.3068, "step": 4220 }, { "epoch": 0.92, "grad_norm": 1.078979253768921, "learning_rate": 9.122764936764066e-05, "loss": 1.311, "step": 4230 }, { "epoch": 0.92, "grad_norm": 1.139224648475647, "learning_rate": 9.120584387265592e-05, "loss": 1.2961, "step": 4240 }, { "epoch": 0.92, "grad_norm": 0.9568941593170166, "learning_rate": 9.118403837767117e-05, "loss": 1.3335, "step": 4250 }, { "epoch": 0.92, "grad_norm": 0.8990288972854614, "learning_rate": 9.116223288268643e-05, "loss": 1.2983, "step": 4260 }, { "epoch": 0.93, "grad_norm": 1.0404481887817383, "learning_rate": 9.114042738770171e-05, "loss": 1.2867, "step": 4270 }, { "epoch": 0.93, "grad_norm": 0.970191240310669, "learning_rate": 9.111862189271697e-05, "loss": 1.2923, "step": 4280 }, { "epoch": 0.93, "grad_norm": 0.9285945296287537, "learning_rate": 9.109681639773224e-05, "loss": 1.296, "step": 4290 }, { "epoch": 0.93, "grad_norm": 1.0113970041275024, "learning_rate": 9.107501090274749e-05, "loss": 1.2861, "step": 4300 }, { "epoch": 0.94, "grad_norm": 1.0101959705352783, "learning_rate": 9.105320540776276e-05, "loss": 1.2958, "step": 4310 }, { "epoch": 0.94, "grad_norm": 0.9014917612075806, "learning_rate": 9.103139991277803e-05, "loss": 1.2735, "step": 4320 }, { "epoch": 0.94, "grad_norm": 1.4451045989990234, "learning_rate": 9.100959441779329e-05, "loss": 1.3111, "step": 4330 }, { "epoch": 0.94, "grad_norm": 0.9970597624778748, "learning_rate": 9.098778892280855e-05, "loss": 1.2725, "step": 4340 }, { "epoch": 0.94, "grad_norm": 0.9795159101486206, "learning_rate": 9.096598342782382e-05, "loss": 1.286, "step": 4350 }, { "epoch": 0.95, "grad_norm": 1.1754708290100098, "learning_rate": 9.094417793283908e-05, "loss": 1.2903, "step": 4360 }, { "epoch": 0.95, "grad_norm": 1.02108895778656, "learning_rate": 9.092237243785434e-05, "loss": 1.2865, "step": 4370 }, { "epoch": 0.95, "grad_norm": 0.9269696474075317, "learning_rate": 9.09005669428696e-05, "loss": 1.3163, "step": 4380 }, { "epoch": 0.95, "grad_norm": 0.9824286103248596, "learning_rate": 9.087876144788488e-05, "loss": 1.2713, "step": 4390 }, { "epoch": 0.96, "grad_norm": 1.2137070894241333, "learning_rate": 9.085695595290013e-05, "loss": 1.313, "step": 4400 }, { "epoch": 0.96, "grad_norm": 1.0218490362167358, "learning_rate": 9.08351504579154e-05, "loss": 1.2864, "step": 4410 }, { "epoch": 0.96, "grad_norm": 1.0295207500457764, "learning_rate": 9.081334496293066e-05, "loss": 1.2974, "step": 4420 }, { "epoch": 0.96, "grad_norm": 1.0075607299804688, "learning_rate": 9.079153946794593e-05, "loss": 1.3011, "step": 4430 }, { "epoch": 0.96, "grad_norm": 0.889430820941925, "learning_rate": 9.07697339729612e-05, "loss": 1.3112, "step": 4440 }, { "epoch": 0.97, "grad_norm": 0.9565015435218811, "learning_rate": 9.074792847797645e-05, "loss": 1.3019, "step": 4450 }, { "epoch": 0.97, "grad_norm": 1.0241695642471313, "learning_rate": 9.072612298299172e-05, "loss": 1.2878, "step": 4460 }, { "epoch": 0.97, "grad_norm": 0.9693965315818787, "learning_rate": 9.070431748800699e-05, "loss": 1.3009, "step": 4470 }, { "epoch": 0.97, "grad_norm": 0.8897150754928589, "learning_rate": 9.068251199302225e-05, "loss": 1.2757, "step": 4480 }, { "epoch": 0.97, "grad_norm": 1.1614912748336792, "learning_rate": 9.066070649803751e-05, "loss": 1.2923, "step": 4490 }, { "epoch": 0.98, "grad_norm": 0.8832863569259644, "learning_rate": 9.063890100305277e-05, "loss": 1.3098, "step": 4500 }, { "epoch": 0.98, "grad_norm": 0.9805281162261963, "learning_rate": 9.061709550806804e-05, "loss": 1.2958, "step": 4510 }, { "epoch": 0.98, "grad_norm": 1.0199958086013794, "learning_rate": 9.05952900130833e-05, "loss": 1.2824, "step": 4520 }, { "epoch": 0.98, "grad_norm": 0.8528922200202942, "learning_rate": 9.057348451809856e-05, "loss": 1.2993, "step": 4530 }, { "epoch": 0.99, "grad_norm": 0.9288610816001892, "learning_rate": 9.055167902311384e-05, "loss": 1.2758, "step": 4540 }, { "epoch": 0.99, "grad_norm": 0.8977848887443542, "learning_rate": 9.052987352812909e-05, "loss": 1.2789, "step": 4550 }, { "epoch": 0.99, "grad_norm": 0.8637726902961731, "learning_rate": 9.050806803314435e-05, "loss": 1.2734, "step": 4560 }, { "epoch": 0.99, "grad_norm": 0.9056828022003174, "learning_rate": 9.048626253815962e-05, "loss": 1.272, "step": 4570 }, { "epoch": 0.99, "grad_norm": 0.9080044627189636, "learning_rate": 9.046445704317489e-05, "loss": 1.264, "step": 4580 }, { "epoch": 1.0, "grad_norm": 0.886441707611084, "learning_rate": 9.044265154819016e-05, "loss": 1.2752, "step": 4590 }, { "epoch": 1.0, "grad_norm": 1.02278470993042, "learning_rate": 9.04208460532054e-05, "loss": 1.2819, "step": 4600 }, { "epoch": 1.0, "eval_loss": 1.2792317867279053, "eval_runtime": 1502.3325, "eval_samples_per_second": 257.499, "eval_steps_per_second": 4.024, "step": 4606 }, { "epoch": 1.0, "grad_norm": 1.08243727684021, "learning_rate": 9.039904055822067e-05, "loss": 1.3113, "step": 4610 }, { "epoch": 1.0, "grad_norm": 1.073258399963379, "learning_rate": 9.037723506323594e-05, "loss": 1.3031, "step": 4620 }, { "epoch": 1.01, "grad_norm": 0.9962953329086304, "learning_rate": 9.035542956825121e-05, "loss": 1.2904, "step": 4630 }, { "epoch": 1.01, "grad_norm": 0.9397081136703491, "learning_rate": 9.033362407326647e-05, "loss": 1.2672, "step": 4640 }, { "epoch": 1.01, "grad_norm": 0.9223260879516602, "learning_rate": 9.031181857828172e-05, "loss": 1.2898, "step": 4650 }, { "epoch": 1.01, "grad_norm": 1.0643510818481445, "learning_rate": 9.0290013083297e-05, "loss": 1.2831, "step": 4660 }, { "epoch": 1.01, "grad_norm": 0.9219188094139099, "learning_rate": 9.026820758831226e-05, "loss": 1.2651, "step": 4670 }, { "epoch": 1.02, "grad_norm": 0.9872779250144958, "learning_rate": 9.024640209332752e-05, "loss": 1.2695, "step": 4680 }, { "epoch": 1.02, "grad_norm": 0.9516711235046387, "learning_rate": 9.022459659834279e-05, "loss": 1.2662, "step": 4690 }, { "epoch": 1.02, "grad_norm": 0.9385516047477722, "learning_rate": 9.020279110335805e-05, "loss": 1.2744, "step": 4700 }, { "epoch": 1.02, "grad_norm": 1.0308866500854492, "learning_rate": 9.018098560837331e-05, "loss": 1.2718, "step": 4710 }, { "epoch": 1.02, "grad_norm": 0.9456400871276855, "learning_rate": 9.015918011338857e-05, "loss": 1.2494, "step": 4720 }, { "epoch": 1.03, "grad_norm": 1.1350531578063965, "learning_rate": 9.013737461840385e-05, "loss": 1.2607, "step": 4730 }, { "epoch": 1.03, "grad_norm": 0.9552891254425049, "learning_rate": 9.011556912341911e-05, "loss": 1.2563, "step": 4740 }, { "epoch": 1.03, "grad_norm": 0.9082231521606445, "learning_rate": 9.009376362843436e-05, "loss": 1.268, "step": 4750 }, { "epoch": 1.03, "grad_norm": 1.0419315099716187, "learning_rate": 9.007195813344963e-05, "loss": 1.3033, "step": 4760 }, { "epoch": 1.04, "grad_norm": 0.827100396156311, "learning_rate": 9.00501526384649e-05, "loss": 1.2636, "step": 4770 }, { "epoch": 1.04, "grad_norm": 1.0661678314208984, "learning_rate": 9.002834714348017e-05, "loss": 1.2487, "step": 4780 }, { "epoch": 1.04, "grad_norm": 0.9938476085662842, "learning_rate": 9.000654164849543e-05, "loss": 1.2729, "step": 4790 }, { "epoch": 1.04, "grad_norm": 1.1281195878982544, "learning_rate": 8.998473615351068e-05, "loss": 1.2391, "step": 4800 }, { "epoch": 1.04, "grad_norm": 1.1780451536178589, "learning_rate": 8.996293065852596e-05, "loss": 1.2985, "step": 4810 }, { "epoch": 1.05, "grad_norm": 1.0872817039489746, "learning_rate": 8.994112516354122e-05, "loss": 1.2615, "step": 4820 }, { "epoch": 1.05, "grad_norm": 0.9712433815002441, "learning_rate": 8.991931966855648e-05, "loss": 1.2694, "step": 4830 }, { "epoch": 1.05, "grad_norm": 1.2177668809890747, "learning_rate": 8.989751417357174e-05, "loss": 1.2726, "step": 4840 }, { "epoch": 1.05, "grad_norm": 0.9332715272903442, "learning_rate": 8.987570867858701e-05, "loss": 1.2703, "step": 4850 }, { "epoch": 1.06, "grad_norm": 0.9567763209342957, "learning_rate": 8.985390318360227e-05, "loss": 1.2711, "step": 4860 }, { "epoch": 1.06, "grad_norm": 0.9975143074989319, "learning_rate": 8.983209768861753e-05, "loss": 1.2947, "step": 4870 }, { "epoch": 1.06, "grad_norm": 1.0711029767990112, "learning_rate": 8.98102921936328e-05, "loss": 1.2723, "step": 4880 }, { "epoch": 1.06, "grad_norm": 0.9394287467002869, "learning_rate": 8.978848669864807e-05, "loss": 1.2709, "step": 4890 }, { "epoch": 1.06, "grad_norm": 1.0839319229125977, "learning_rate": 8.976668120366332e-05, "loss": 1.2892, "step": 4900 }, { "epoch": 1.07, "grad_norm": 1.024117112159729, "learning_rate": 8.974487570867859e-05, "loss": 1.2627, "step": 4910 }, { "epoch": 1.07, "grad_norm": 0.9055659174919128, "learning_rate": 8.972307021369386e-05, "loss": 1.2754, "step": 4920 }, { "epoch": 1.07, "grad_norm": 0.9383713603019714, "learning_rate": 8.970126471870913e-05, "loss": 1.2713, "step": 4930 }, { "epoch": 1.07, "grad_norm": 1.087470293045044, "learning_rate": 8.967945922372439e-05, "loss": 1.27, "step": 4940 }, { "epoch": 1.07, "grad_norm": 0.9602554440498352, "learning_rate": 8.965765372873964e-05, "loss": 1.2829, "step": 4950 }, { "epoch": 1.08, "grad_norm": 0.9457790851593018, "learning_rate": 8.963584823375491e-05, "loss": 1.2757, "step": 4960 }, { "epoch": 1.08, "grad_norm": 0.8682853579521179, "learning_rate": 8.961404273877018e-05, "loss": 1.2662, "step": 4970 }, { "epoch": 1.08, "grad_norm": 1.0000272989273071, "learning_rate": 8.959223724378544e-05, "loss": 1.2616, "step": 4980 }, { "epoch": 1.08, "grad_norm": 1.0122287273406982, "learning_rate": 8.95704317488007e-05, "loss": 1.287, "step": 4990 }, { "epoch": 1.09, "grad_norm": 0.9552735090255737, "learning_rate": 8.954862625381597e-05, "loss": 1.2782, "step": 5000 }, { "epoch": 1.09, "grad_norm": 0.9103166460990906, "learning_rate": 8.952682075883123e-05, "loss": 1.2388, "step": 5010 }, { "epoch": 1.09, "grad_norm": 1.0033226013183594, "learning_rate": 8.950501526384649e-05, "loss": 1.2762, "step": 5020 }, { "epoch": 1.09, "grad_norm": 0.9572534561157227, "learning_rate": 8.948320976886176e-05, "loss": 1.2801, "step": 5030 }, { "epoch": 1.09, "grad_norm": 0.9460912942886353, "learning_rate": 8.946140427387702e-05, "loss": 1.2651, "step": 5040 }, { "epoch": 1.1, "grad_norm": 1.0236018896102905, "learning_rate": 8.943959877889228e-05, "loss": 1.2602, "step": 5050 }, { "epoch": 1.1, "grad_norm": 1.0384821891784668, "learning_rate": 8.941779328390754e-05, "loss": 1.3027, "step": 5060 }, { "epoch": 1.1, "grad_norm": 0.9547539949417114, "learning_rate": 8.939598778892281e-05, "loss": 1.2969, "step": 5070 }, { "epoch": 1.1, "grad_norm": 0.9478334784507751, "learning_rate": 8.937418229393808e-05, "loss": 1.2829, "step": 5080 }, { "epoch": 1.11, "grad_norm": 1.0621150732040405, "learning_rate": 8.935237679895335e-05, "loss": 1.2601, "step": 5090 }, { "epoch": 1.11, "grad_norm": 0.9307476282119751, "learning_rate": 8.93305713039686e-05, "loss": 1.2656, "step": 5100 }, { "epoch": 1.11, "grad_norm": 1.0189131498336792, "learning_rate": 8.930876580898386e-05, "loss": 1.2646, "step": 5110 }, { "epoch": 1.11, "grad_norm": 1.1185131072998047, "learning_rate": 8.928696031399914e-05, "loss": 1.2785, "step": 5120 }, { "epoch": 1.11, "grad_norm": 0.9753584265708923, "learning_rate": 8.92651548190144e-05, "loss": 1.2511, "step": 5130 }, { "epoch": 1.12, "grad_norm": 1.0418280363082886, "learning_rate": 8.924334932402966e-05, "loss": 1.2537, "step": 5140 }, { "epoch": 1.12, "grad_norm": 0.9717410802841187, "learning_rate": 8.922154382904493e-05, "loss": 1.2687, "step": 5150 }, { "epoch": 1.12, "grad_norm": 0.988318681716919, "learning_rate": 8.919973833406019e-05, "loss": 1.2599, "step": 5160 }, { "epoch": 1.12, "grad_norm": 0.9211105108261108, "learning_rate": 8.917793283907545e-05, "loss": 1.2646, "step": 5170 }, { "epoch": 1.12, "grad_norm": 0.9481471180915833, "learning_rate": 8.915612734409071e-05, "loss": 1.271, "step": 5180 }, { "epoch": 1.13, "grad_norm": 0.8939971923828125, "learning_rate": 8.913432184910598e-05, "loss": 1.2865, "step": 5190 }, { "epoch": 1.13, "grad_norm": 0.9412124156951904, "learning_rate": 8.911251635412124e-05, "loss": 1.279, "step": 5200 }, { "epoch": 1.13, "grad_norm": 0.9381204843521118, "learning_rate": 8.90907108591365e-05, "loss": 1.2813, "step": 5210 }, { "epoch": 1.13, "grad_norm": 0.9502457976341248, "learning_rate": 8.906890536415177e-05, "loss": 1.2829, "step": 5220 }, { "epoch": 1.14, "grad_norm": 1.0576632022857666, "learning_rate": 8.904709986916704e-05, "loss": 1.2708, "step": 5230 }, { "epoch": 1.14, "grad_norm": 1.0302668809890747, "learning_rate": 8.902529437418229e-05, "loss": 1.2893, "step": 5240 }, { "epoch": 1.14, "grad_norm": 0.9892765283584595, "learning_rate": 8.900348887919756e-05, "loss": 1.2691, "step": 5250 }, { "epoch": 1.14, "grad_norm": 1.0383532047271729, "learning_rate": 8.898168338421282e-05, "loss": 1.2539, "step": 5260 }, { "epoch": 1.14, "grad_norm": 0.9894425868988037, "learning_rate": 8.89598778892281e-05, "loss": 1.2838, "step": 5270 }, { "epoch": 1.15, "grad_norm": 1.0066653490066528, "learning_rate": 8.893807239424336e-05, "loss": 1.2606, "step": 5280 }, { "epoch": 1.15, "grad_norm": 1.0619821548461914, "learning_rate": 8.891626689925862e-05, "loss": 1.2724, "step": 5290 }, { "epoch": 1.15, "grad_norm": 0.9619722962379456, "learning_rate": 8.889446140427387e-05, "loss": 1.2783, "step": 5300 }, { "epoch": 1.15, "grad_norm": 0.8887227177619934, "learning_rate": 8.887265590928915e-05, "loss": 1.264, "step": 5310 }, { "epoch": 1.15, "grad_norm": 1.0262665748596191, "learning_rate": 8.885085041430441e-05, "loss": 1.2482, "step": 5320 }, { "epoch": 1.16, "grad_norm": 1.016381859779358, "learning_rate": 8.882904491931967e-05, "loss": 1.2523, "step": 5330 }, { "epoch": 1.16, "grad_norm": 0.9932143092155457, "learning_rate": 8.880723942433494e-05, "loss": 1.2516, "step": 5340 }, { "epoch": 1.16, "grad_norm": 0.9815816283226013, "learning_rate": 8.87854339293502e-05, "loss": 1.2574, "step": 5350 }, { "epoch": 1.16, "grad_norm": 1.0072325468063354, "learning_rate": 8.876362843436546e-05, "loss": 1.2688, "step": 5360 }, { "epoch": 1.17, "grad_norm": 0.9834664463996887, "learning_rate": 8.874182293938073e-05, "loss": 1.2632, "step": 5370 }, { "epoch": 1.17, "grad_norm": 1.0800156593322754, "learning_rate": 8.8720017444396e-05, "loss": 1.2767, "step": 5380 }, { "epoch": 1.17, "grad_norm": 0.9449285268783569, "learning_rate": 8.869821194941125e-05, "loss": 1.2667, "step": 5390 }, { "epoch": 1.17, "grad_norm": 1.1136956214904785, "learning_rate": 8.867640645442652e-05, "loss": 1.2506, "step": 5400 }, { "epoch": 1.17, "grad_norm": 0.9061567783355713, "learning_rate": 8.865460095944178e-05, "loss": 1.2658, "step": 5410 }, { "epoch": 1.18, "grad_norm": 1.00759756565094, "learning_rate": 8.863279546445705e-05, "loss": 1.285, "step": 5420 }, { "epoch": 1.18, "grad_norm": 1.0507421493530273, "learning_rate": 8.861098996947232e-05, "loss": 1.277, "step": 5430 }, { "epoch": 1.18, "grad_norm": 1.0796302556991577, "learning_rate": 8.858918447448758e-05, "loss": 1.2604, "step": 5440 }, { "epoch": 1.18, "grad_norm": 1.0264052152633667, "learning_rate": 8.856737897950283e-05, "loss": 1.2747, "step": 5450 }, { "epoch": 1.19, "grad_norm": 0.9274656176567078, "learning_rate": 8.854557348451811e-05, "loss": 1.2617, "step": 5460 }, { "epoch": 1.19, "grad_norm": 1.0233980417251587, "learning_rate": 8.852376798953337e-05, "loss": 1.2787, "step": 5470 }, { "epoch": 1.19, "grad_norm": 0.9718747138977051, "learning_rate": 8.850196249454863e-05, "loss": 1.2511, "step": 5480 }, { "epoch": 1.19, "grad_norm": 1.0765981674194336, "learning_rate": 8.84801569995639e-05, "loss": 1.2794, "step": 5490 }, { "epoch": 1.19, "grad_norm": 1.048608660697937, "learning_rate": 8.845835150457916e-05, "loss": 1.2597, "step": 5500 }, { "epoch": 1.2, "grad_norm": 0.9524050354957581, "learning_rate": 8.843654600959442e-05, "loss": 1.246, "step": 5510 }, { "epoch": 1.2, "grad_norm": 0.9819397926330566, "learning_rate": 8.841474051460969e-05, "loss": 1.2732, "step": 5520 }, { "epoch": 1.2, "grad_norm": 0.914893388748169, "learning_rate": 8.839293501962495e-05, "loss": 1.2694, "step": 5530 }, { "epoch": 1.2, "grad_norm": 0.9561071395874023, "learning_rate": 8.837112952464021e-05, "loss": 1.2642, "step": 5540 }, { "epoch": 1.2, "grad_norm": 0.9841814637184143, "learning_rate": 8.834932402965547e-05, "loss": 1.2684, "step": 5550 }, { "epoch": 1.21, "grad_norm": 0.931611955165863, "learning_rate": 8.832751853467074e-05, "loss": 1.2751, "step": 5560 }, { "epoch": 1.21, "grad_norm": 1.0068223476409912, "learning_rate": 8.8305713039686e-05, "loss": 1.2589, "step": 5570 }, { "epoch": 1.21, "grad_norm": 1.088884711265564, "learning_rate": 8.828390754470128e-05, "loss": 1.2606, "step": 5580 }, { "epoch": 1.21, "grad_norm": 0.9682032465934753, "learning_rate": 8.826210204971653e-05, "loss": 1.2467, "step": 5590 }, { "epoch": 1.22, "grad_norm": 1.0218122005462646, "learning_rate": 8.824029655473179e-05, "loss": 1.2684, "step": 5600 }, { "epoch": 1.22, "grad_norm": 0.9690065979957581, "learning_rate": 8.821849105974707e-05, "loss": 1.2906, "step": 5610 }, { "epoch": 1.22, "grad_norm": 0.9736804366111755, "learning_rate": 8.819668556476233e-05, "loss": 1.2682, "step": 5620 }, { "epoch": 1.22, "grad_norm": 1.0571842193603516, "learning_rate": 8.817488006977759e-05, "loss": 1.247, "step": 5630 }, { "epoch": 1.22, "grad_norm": 1.1925692558288574, "learning_rate": 8.815307457479286e-05, "loss": 1.28, "step": 5640 }, { "epoch": 1.23, "grad_norm": 0.8674301505088806, "learning_rate": 8.813126907980812e-05, "loss": 1.2699, "step": 5650 }, { "epoch": 1.23, "grad_norm": 1.030501127243042, "learning_rate": 8.810946358482338e-05, "loss": 1.2455, "step": 5660 }, { "epoch": 1.23, "grad_norm": 1.0425055027008057, "learning_rate": 8.808765808983864e-05, "loss": 1.2802, "step": 5670 }, { "epoch": 1.23, "grad_norm": 0.9576709866523743, "learning_rate": 8.806585259485391e-05, "loss": 1.2584, "step": 5680 }, { "epoch": 1.24, "grad_norm": 0.9852989912033081, "learning_rate": 8.804404709986917e-05, "loss": 1.2707, "step": 5690 }, { "epoch": 1.24, "grad_norm": 1.0519157648086548, "learning_rate": 8.802224160488443e-05, "loss": 1.2647, "step": 5700 }, { "epoch": 1.24, "grad_norm": 1.1391375064849854, "learning_rate": 8.80004361098997e-05, "loss": 1.2459, "step": 5710 }, { "epoch": 1.24, "grad_norm": 1.295246958732605, "learning_rate": 8.797863061491496e-05, "loss": 1.2708, "step": 5720 }, { "epoch": 1.24, "grad_norm": 0.9388042688369751, "learning_rate": 8.795682511993024e-05, "loss": 1.2761, "step": 5730 }, { "epoch": 1.25, "grad_norm": 0.8345937728881836, "learning_rate": 8.793501962494549e-05, "loss": 1.2641, "step": 5740 }, { "epoch": 1.25, "grad_norm": 0.9559466242790222, "learning_rate": 8.791321412996075e-05, "loss": 1.2608, "step": 5750 }, { "epoch": 1.25, "grad_norm": 0.9135338068008423, "learning_rate": 8.789140863497601e-05, "loss": 1.245, "step": 5760 }, { "epoch": 1.25, "grad_norm": 1.0820287466049194, "learning_rate": 8.786960313999129e-05, "loss": 1.2549, "step": 5770 }, { "epoch": 1.25, "grad_norm": 1.05925714969635, "learning_rate": 8.784779764500655e-05, "loss": 1.2493, "step": 5780 }, { "epoch": 1.26, "grad_norm": 1.0629942417144775, "learning_rate": 8.782599215002181e-05, "loss": 1.2803, "step": 5790 }, { "epoch": 1.26, "grad_norm": 1.021894097328186, "learning_rate": 8.780418665503706e-05, "loss": 1.264, "step": 5800 }, { "epoch": 1.26, "grad_norm": 0.9319231510162354, "learning_rate": 8.778238116005234e-05, "loss": 1.2757, "step": 5810 }, { "epoch": 1.26, "grad_norm": 0.9403659701347351, "learning_rate": 8.77605756650676e-05, "loss": 1.2601, "step": 5820 }, { "epoch": 1.27, "grad_norm": 1.0411070585250854, "learning_rate": 8.773877017008287e-05, "loss": 1.2747, "step": 5830 }, { "epoch": 1.27, "grad_norm": 0.9437740445137024, "learning_rate": 8.771696467509813e-05, "loss": 1.2771, "step": 5840 }, { "epoch": 1.27, "grad_norm": 1.0971676111221313, "learning_rate": 8.769515918011339e-05, "loss": 1.2631, "step": 5850 }, { "epoch": 1.27, "grad_norm": 1.0248700380325317, "learning_rate": 8.767335368512866e-05, "loss": 1.255, "step": 5860 }, { "epoch": 1.27, "grad_norm": 1.1890584230422974, "learning_rate": 8.765154819014392e-05, "loss": 1.265, "step": 5870 }, { "epoch": 1.28, "grad_norm": 1.1310992240905762, "learning_rate": 8.76297426951592e-05, "loss": 1.2786, "step": 5880 }, { "epoch": 1.28, "grad_norm": 0.95496666431427, "learning_rate": 8.760793720017444e-05, "loss": 1.2534, "step": 5890 }, { "epoch": 1.28, "grad_norm": 1.0427186489105225, "learning_rate": 8.758613170518971e-05, "loss": 1.2767, "step": 5900 }, { "epoch": 1.28, "grad_norm": 0.879298985004425, "learning_rate": 8.756432621020497e-05, "loss": 1.2453, "step": 5910 }, { "epoch": 1.29, "grad_norm": 0.9911447167396545, "learning_rate": 8.754252071522025e-05, "loss": 1.248, "step": 5920 }, { "epoch": 1.29, "grad_norm": 0.9124498963356018, "learning_rate": 8.752071522023551e-05, "loss": 1.2588, "step": 5930 }, { "epoch": 1.29, "grad_norm": 0.9397348761558533, "learning_rate": 8.749890972525076e-05, "loss": 1.2822, "step": 5940 }, { "epoch": 1.29, "grad_norm": 1.0716569423675537, "learning_rate": 8.747710423026602e-05, "loss": 1.2483, "step": 5950 }, { "epoch": 1.29, "grad_norm": 0.8869634866714478, "learning_rate": 8.74552987352813e-05, "loss": 1.2752, "step": 5960 }, { "epoch": 1.3, "grad_norm": 0.9538241028785706, "learning_rate": 8.743349324029656e-05, "loss": 1.2627, "step": 5970 }, { "epoch": 1.3, "grad_norm": 0.9991753697395325, "learning_rate": 8.741168774531183e-05, "loss": 1.2718, "step": 5980 }, { "epoch": 1.3, "grad_norm": 1.0785272121429443, "learning_rate": 8.738988225032709e-05, "loss": 1.2826, "step": 5990 }, { "epoch": 1.3, "grad_norm": 1.002681851387024, "learning_rate": 8.736807675534235e-05, "loss": 1.2659, "step": 6000 }, { "epoch": 1.3, "grad_norm": 0.9270432591438293, "learning_rate": 8.734627126035761e-05, "loss": 1.2493, "step": 6010 }, { "epoch": 1.31, "grad_norm": 1.143751621246338, "learning_rate": 8.732446576537288e-05, "loss": 1.2965, "step": 6020 }, { "epoch": 1.31, "grad_norm": 0.9666625261306763, "learning_rate": 8.730266027038814e-05, "loss": 1.2553, "step": 6030 }, { "epoch": 1.31, "grad_norm": 0.9400457739830017, "learning_rate": 8.72808547754034e-05, "loss": 1.2657, "step": 6040 }, { "epoch": 1.31, "grad_norm": 0.9232240319252014, "learning_rate": 8.725904928041867e-05, "loss": 1.2494, "step": 6050 }, { "epoch": 1.32, "grad_norm": 0.9295173287391663, "learning_rate": 8.723724378543393e-05, "loss": 1.2496, "step": 6060 }, { "epoch": 1.32, "grad_norm": 1.293441653251648, "learning_rate": 8.72154382904492e-05, "loss": 1.2578, "step": 6070 }, { "epoch": 1.32, "grad_norm": 0.9575563669204712, "learning_rate": 8.719363279546447e-05, "loss": 1.2323, "step": 6080 }, { "epoch": 1.32, "grad_norm": 1.0204386711120605, "learning_rate": 8.717182730047972e-05, "loss": 1.2652, "step": 6090 }, { "epoch": 1.32, "grad_norm": 0.9446994066238403, "learning_rate": 8.715002180549498e-05, "loss": 1.2568, "step": 6100 }, { "epoch": 1.33, "grad_norm": 1.0751984119415283, "learning_rate": 8.712821631051026e-05, "loss": 1.2806, "step": 6110 }, { "epoch": 1.33, "grad_norm": 0.9466795921325684, "learning_rate": 8.710641081552552e-05, "loss": 1.2416, "step": 6120 }, { "epoch": 1.33, "grad_norm": 1.1114068031311035, "learning_rate": 8.708460532054078e-05, "loss": 1.2405, "step": 6130 }, { "epoch": 1.33, "grad_norm": 0.9612728953361511, "learning_rate": 8.706279982555605e-05, "loss": 1.2655, "step": 6140 }, { "epoch": 1.34, "grad_norm": 0.9728400707244873, "learning_rate": 8.704099433057131e-05, "loss": 1.2654, "step": 6150 }, { "epoch": 1.34, "grad_norm": 1.0217069387435913, "learning_rate": 8.701918883558657e-05, "loss": 1.2804, "step": 6160 }, { "epoch": 1.34, "grad_norm": 0.9358672499656677, "learning_rate": 8.699738334060184e-05, "loss": 1.282, "step": 6170 }, { "epoch": 1.34, "grad_norm": 0.875811755657196, "learning_rate": 8.69755778456171e-05, "loss": 1.2974, "step": 6180 }, { "epoch": 1.34, "grad_norm": 0.9315816760063171, "learning_rate": 8.695377235063236e-05, "loss": 1.2515, "step": 6190 }, { "epoch": 1.35, "grad_norm": 0.9914236664772034, "learning_rate": 8.693196685564763e-05, "loss": 1.2438, "step": 6200 }, { "epoch": 1.35, "grad_norm": 0.9291836023330688, "learning_rate": 8.691016136066289e-05, "loss": 1.2794, "step": 6210 }, { "epoch": 1.35, "grad_norm": 1.036189317703247, "learning_rate": 8.688835586567815e-05, "loss": 1.2497, "step": 6220 }, { "epoch": 1.35, "grad_norm": 1.1179789304733276, "learning_rate": 8.686655037069343e-05, "loss": 1.2627, "step": 6230 }, { "epoch": 1.35, "grad_norm": 1.0586695671081543, "learning_rate": 8.684474487570868e-05, "loss": 1.2611, "step": 6240 }, { "epoch": 1.36, "grad_norm": 0.9113835692405701, "learning_rate": 8.682293938072394e-05, "loss": 1.2671, "step": 6250 }, { "epoch": 1.36, "grad_norm": 0.911665141582489, "learning_rate": 8.68011338857392e-05, "loss": 1.2425, "step": 6260 }, { "epoch": 1.36, "grad_norm": 1.016471266746521, "learning_rate": 8.677932839075448e-05, "loss": 1.2672, "step": 6270 }, { "epoch": 1.36, "grad_norm": 1.0666197538375854, "learning_rate": 8.675752289576974e-05, "loss": 1.2647, "step": 6280 }, { "epoch": 1.37, "grad_norm": 1.042350172996521, "learning_rate": 8.673571740078499e-05, "loss": 1.2211, "step": 6290 }, { "epoch": 1.37, "grad_norm": 0.9714857339859009, "learning_rate": 8.671391190580027e-05, "loss": 1.2698, "step": 6300 }, { "epoch": 1.37, "grad_norm": 0.9044662714004517, "learning_rate": 8.669210641081553e-05, "loss": 1.2753, "step": 6310 }, { "epoch": 1.37, "grad_norm": 0.8921557664871216, "learning_rate": 8.66703009158308e-05, "loss": 1.2528, "step": 6320 }, { "epoch": 1.37, "grad_norm": 0.9644028544425964, "learning_rate": 8.664849542084606e-05, "loss": 1.2642, "step": 6330 }, { "epoch": 1.38, "grad_norm": 1.0202399492263794, "learning_rate": 8.662668992586132e-05, "loss": 1.2473, "step": 6340 }, { "epoch": 1.38, "grad_norm": 1.0238714218139648, "learning_rate": 8.660488443087658e-05, "loss": 1.256, "step": 6350 }, { "epoch": 1.38, "grad_norm": 1.1190308332443237, "learning_rate": 8.658307893589185e-05, "loss": 1.2579, "step": 6360 }, { "epoch": 1.38, "grad_norm": 0.9763012528419495, "learning_rate": 8.656127344090711e-05, "loss": 1.2607, "step": 6370 }, { "epoch": 1.39, "grad_norm": 0.9133914709091187, "learning_rate": 8.653946794592239e-05, "loss": 1.2685, "step": 6380 }, { "epoch": 1.39, "grad_norm": 0.9674580693244934, "learning_rate": 8.651766245093764e-05, "loss": 1.2533, "step": 6390 }, { "epoch": 1.39, "grad_norm": 1.1029064655303955, "learning_rate": 8.64958569559529e-05, "loss": 1.2487, "step": 6400 }, { "epoch": 1.39, "grad_norm": 0.9458103775978088, "learning_rate": 8.647405146096816e-05, "loss": 1.2677, "step": 6410 }, { "epoch": 1.39, "grad_norm": 1.1092442274093628, "learning_rate": 8.645224596598344e-05, "loss": 1.2624, "step": 6420 }, { "epoch": 1.4, "grad_norm": 1.1490038633346558, "learning_rate": 8.64304404709987e-05, "loss": 1.2566, "step": 6430 }, { "epoch": 1.4, "grad_norm": 0.9747464060783386, "learning_rate": 8.640863497601395e-05, "loss": 1.2571, "step": 6440 }, { "epoch": 1.4, "grad_norm": 1.1297920942306519, "learning_rate": 8.638682948102921e-05, "loss": 1.2327, "step": 6450 }, { "epoch": 1.4, "grad_norm": 0.9675096869468689, "learning_rate": 8.636502398604449e-05, "loss": 1.2327, "step": 6460 }, { "epoch": 1.4, "grad_norm": 0.9282464385032654, "learning_rate": 8.634321849105975e-05, "loss": 1.2323, "step": 6470 }, { "epoch": 1.41, "grad_norm": 1.011017918586731, "learning_rate": 8.632141299607502e-05, "loss": 1.2429, "step": 6480 }, { "epoch": 1.41, "grad_norm": 1.02436363697052, "learning_rate": 8.629960750109028e-05, "loss": 1.2382, "step": 6490 }, { "epoch": 1.41, "grad_norm": 1.0600727796554565, "learning_rate": 8.627780200610554e-05, "loss": 1.2689, "step": 6500 }, { "epoch": 1.41, "grad_norm": 0.9400041103363037, "learning_rate": 8.62559965111208e-05, "loss": 1.2804, "step": 6510 }, { "epoch": 1.42, "grad_norm": 1.156300663948059, "learning_rate": 8.623419101613607e-05, "loss": 1.2596, "step": 6520 }, { "epoch": 1.42, "grad_norm": 0.9240378141403198, "learning_rate": 8.621238552115133e-05, "loss": 1.24, "step": 6530 }, { "epoch": 1.42, "grad_norm": 0.8798494338989258, "learning_rate": 8.61905800261666e-05, "loss": 1.2526, "step": 6540 }, { "epoch": 1.42, "grad_norm": 0.9512797594070435, "learning_rate": 8.616877453118186e-05, "loss": 1.2602, "step": 6550 }, { "epoch": 1.42, "grad_norm": 0.9985531568527222, "learning_rate": 8.614696903619712e-05, "loss": 1.2616, "step": 6560 }, { "epoch": 1.43, "grad_norm": 1.134756088256836, "learning_rate": 8.61251635412124e-05, "loss": 1.2688, "step": 6570 }, { "epoch": 1.43, "grad_norm": 0.9372296333312988, "learning_rate": 8.610335804622766e-05, "loss": 1.2538, "step": 6580 }, { "epoch": 1.43, "grad_norm": 1.011887788772583, "learning_rate": 8.608155255124291e-05, "loss": 1.246, "step": 6590 }, { "epoch": 1.43, "grad_norm": 0.9553661346435547, "learning_rate": 8.605974705625817e-05, "loss": 1.2502, "step": 6600 }, { "epoch": 1.44, "grad_norm": 0.9924313426017761, "learning_rate": 8.603794156127345e-05, "loss": 1.2362, "step": 6610 }, { "epoch": 1.44, "grad_norm": 1.05217707157135, "learning_rate": 8.601613606628871e-05, "loss": 1.2655, "step": 6620 }, { "epoch": 1.44, "grad_norm": 1.0302504301071167, "learning_rate": 8.599433057130398e-05, "loss": 1.2699, "step": 6630 }, { "epoch": 1.44, "grad_norm": 1.043373942375183, "learning_rate": 8.597252507631923e-05, "loss": 1.2532, "step": 6640 }, { "epoch": 1.44, "grad_norm": 0.9535781145095825, "learning_rate": 8.59507195813345e-05, "loss": 1.2586, "step": 6650 }, { "epoch": 1.45, "grad_norm": 1.1055347919464111, "learning_rate": 8.592891408634977e-05, "loss": 1.2632, "step": 6660 }, { "epoch": 1.45, "grad_norm": 1.0888850688934326, "learning_rate": 8.590710859136503e-05, "loss": 1.2497, "step": 6670 }, { "epoch": 1.45, "grad_norm": 0.9970211386680603, "learning_rate": 8.588530309638029e-05, "loss": 1.2869, "step": 6680 }, { "epoch": 1.45, "grad_norm": 1.0836609601974487, "learning_rate": 8.586349760139555e-05, "loss": 1.2321, "step": 6690 }, { "epoch": 1.45, "grad_norm": 0.9511786103248596, "learning_rate": 8.584169210641082e-05, "loss": 1.2562, "step": 6700 }, { "epoch": 1.46, "grad_norm": 1.088644027709961, "learning_rate": 8.581988661142608e-05, "loss": 1.2418, "step": 6710 }, { "epoch": 1.46, "grad_norm": 1.0465929508209229, "learning_rate": 8.579808111644134e-05, "loss": 1.2608, "step": 6720 }, { "epoch": 1.46, "grad_norm": 1.12638521194458, "learning_rate": 8.577627562145662e-05, "loss": 1.2725, "step": 6730 }, { "epoch": 1.46, "grad_norm": 1.171322226524353, "learning_rate": 8.575447012647187e-05, "loss": 1.265, "step": 6740 }, { "epoch": 1.47, "grad_norm": 0.926113486289978, "learning_rate": 8.573266463148713e-05, "loss": 1.2559, "step": 6750 }, { "epoch": 1.47, "grad_norm": 0.9716551899909973, "learning_rate": 8.57108591365024e-05, "loss": 1.2568, "step": 6760 }, { "epoch": 1.47, "grad_norm": 1.0213953256607056, "learning_rate": 8.568905364151767e-05, "loss": 1.2649, "step": 6770 }, { "epoch": 1.47, "grad_norm": 0.9643402099609375, "learning_rate": 8.566724814653294e-05, "loss": 1.2433, "step": 6780 }, { "epoch": 1.47, "grad_norm": 1.0367106199264526, "learning_rate": 8.564544265154819e-05, "loss": 1.2356, "step": 6790 }, { "epoch": 1.48, "grad_norm": 0.9655973315238953, "learning_rate": 8.562363715656346e-05, "loss": 1.2439, "step": 6800 }, { "epoch": 1.48, "grad_norm": 1.0422053337097168, "learning_rate": 8.560183166157872e-05, "loss": 1.2528, "step": 6810 }, { "epoch": 1.48, "grad_norm": 0.9676966071128845, "learning_rate": 8.558002616659399e-05, "loss": 1.2577, "step": 6820 }, { "epoch": 1.48, "grad_norm": 0.9732950329780579, "learning_rate": 8.555822067160925e-05, "loss": 1.2513, "step": 6830 }, { "epoch": 1.48, "grad_norm": 1.0636634826660156, "learning_rate": 8.553641517662451e-05, "loss": 1.2694, "step": 6840 }, { "epoch": 1.49, "grad_norm": 0.9392173290252686, "learning_rate": 8.551460968163978e-05, "loss": 1.2478, "step": 6850 }, { "epoch": 1.49, "grad_norm": 0.9402878880500793, "learning_rate": 8.549280418665504e-05, "loss": 1.2528, "step": 6860 }, { "epoch": 1.49, "grad_norm": 1.0256085395812988, "learning_rate": 8.54709986916703e-05, "loss": 1.2704, "step": 6870 }, { "epoch": 1.49, "grad_norm": 1.0600332021713257, "learning_rate": 8.544919319668558e-05, "loss": 1.2338, "step": 6880 }, { "epoch": 1.5, "grad_norm": 1.0218205451965332, "learning_rate": 8.542738770170083e-05, "loss": 1.2839, "step": 6890 }, { "epoch": 1.5, "grad_norm": 0.8786155581474304, "learning_rate": 8.540558220671609e-05, "loss": 1.248, "step": 6900 }, { "epoch": 1.5, "grad_norm": 0.9721015095710754, "learning_rate": 8.538377671173136e-05, "loss": 1.2734, "step": 6910 }, { "epoch": 1.5, "grad_norm": 0.9734498858451843, "learning_rate": 8.536197121674663e-05, "loss": 1.2454, "step": 6920 }, { "epoch": 1.5, "grad_norm": 0.9616742730140686, "learning_rate": 8.53401657217619e-05, "loss": 1.2565, "step": 6930 }, { "epoch": 1.51, "grad_norm": 1.153671383857727, "learning_rate": 8.531836022677714e-05, "loss": 1.2549, "step": 6940 }, { "epoch": 1.51, "grad_norm": 0.9344118237495422, "learning_rate": 8.529655473179241e-05, "loss": 1.2431, "step": 6950 }, { "epoch": 1.51, "grad_norm": 1.0228878259658813, "learning_rate": 8.527474923680768e-05, "loss": 1.2276, "step": 6960 }, { "epoch": 1.51, "grad_norm": 1.088304042816162, "learning_rate": 8.525294374182295e-05, "loss": 1.2423, "step": 6970 }, { "epoch": 1.52, "grad_norm": 0.9886937737464905, "learning_rate": 8.523113824683821e-05, "loss": 1.2693, "step": 6980 }, { "epoch": 1.52, "grad_norm": 0.8818524479866028, "learning_rate": 8.520933275185346e-05, "loss": 1.2424, "step": 6990 }, { "epoch": 1.52, "grad_norm": 0.9912683963775635, "learning_rate": 8.518752725686874e-05, "loss": 1.2522, "step": 7000 }, { "epoch": 1.52, "grad_norm": 0.9952061176300049, "learning_rate": 8.5165721761884e-05, "loss": 1.2519, "step": 7010 }, { "epoch": 1.52, "grad_norm": 1.035301923751831, "learning_rate": 8.514391626689926e-05, "loss": 1.2501, "step": 7020 }, { "epoch": 1.53, "grad_norm": 1.0349431037902832, "learning_rate": 8.512211077191452e-05, "loss": 1.2451, "step": 7030 }, { "epoch": 1.53, "grad_norm": 0.9751808643341064, "learning_rate": 8.510030527692979e-05, "loss": 1.2381, "step": 7040 }, { "epoch": 1.53, "grad_norm": 0.896840512752533, "learning_rate": 8.507849978194505e-05, "loss": 1.2509, "step": 7050 }, { "epoch": 1.53, "grad_norm": 1.074179768562317, "learning_rate": 8.505669428696031e-05, "loss": 1.2439, "step": 7060 }, { "epoch": 1.53, "grad_norm": 1.0536302328109741, "learning_rate": 8.503488879197559e-05, "loss": 1.2795, "step": 7070 }, { "epoch": 1.54, "grad_norm": 0.9011424779891968, "learning_rate": 8.501308329699085e-05, "loss": 1.2418, "step": 7080 }, { "epoch": 1.54, "grad_norm": 0.9322314262390137, "learning_rate": 8.49912778020061e-05, "loss": 1.2576, "step": 7090 }, { "epoch": 1.54, "grad_norm": 0.9793155193328857, "learning_rate": 8.496947230702137e-05, "loss": 1.2492, "step": 7100 }, { "epoch": 1.54, "grad_norm": 0.9420814514160156, "learning_rate": 8.494766681203664e-05, "loss": 1.2373, "step": 7110 }, { "epoch": 1.55, "grad_norm": 0.8934997320175171, "learning_rate": 8.49258613170519e-05, "loss": 1.2433, "step": 7120 }, { "epoch": 1.55, "grad_norm": 1.0100373029708862, "learning_rate": 8.490405582206717e-05, "loss": 1.2397, "step": 7130 }, { "epoch": 1.55, "grad_norm": 0.9812464118003845, "learning_rate": 8.488225032708242e-05, "loss": 1.2536, "step": 7140 }, { "epoch": 1.55, "grad_norm": 1.0419830083847046, "learning_rate": 8.48604448320977e-05, "loss": 1.2531, "step": 7150 }, { "epoch": 1.55, "grad_norm": 1.0287178754806519, "learning_rate": 8.483863933711296e-05, "loss": 1.2853, "step": 7160 }, { "epoch": 1.56, "grad_norm": 0.9258010983467102, "learning_rate": 8.481683384212822e-05, "loss": 1.2384, "step": 7170 }, { "epoch": 1.56, "grad_norm": 1.0923179388046265, "learning_rate": 8.479502834714348e-05, "loss": 1.2388, "step": 7180 }, { "epoch": 1.56, "grad_norm": 1.026920199394226, "learning_rate": 8.477322285215875e-05, "loss": 1.2403, "step": 7190 }, { "epoch": 1.56, "grad_norm": 1.071996808052063, "learning_rate": 8.475141735717401e-05, "loss": 1.257, "step": 7200 }, { "epoch": 1.57, "grad_norm": 1.0824863910675049, "learning_rate": 8.472961186218927e-05, "loss": 1.2358, "step": 7210 }, { "epoch": 1.57, "grad_norm": 1.006395697593689, "learning_rate": 8.470780636720454e-05, "loss": 1.2675, "step": 7220 }, { "epoch": 1.57, "grad_norm": 0.9629374146461487, "learning_rate": 8.468600087221981e-05, "loss": 1.2377, "step": 7230 }, { "epoch": 1.57, "grad_norm": 0.9439448714256287, "learning_rate": 8.466419537723506e-05, "loss": 1.2269, "step": 7240 }, { "epoch": 1.57, "grad_norm": 0.9413838386535645, "learning_rate": 8.464238988225033e-05, "loss": 1.248, "step": 7250 }, { "epoch": 1.58, "grad_norm": 0.9353733658790588, "learning_rate": 8.46205843872656e-05, "loss": 1.2535, "step": 7260 }, { "epoch": 1.58, "grad_norm": 1.0403653383255005, "learning_rate": 8.459877889228086e-05, "loss": 1.2323, "step": 7270 }, { "epoch": 1.58, "grad_norm": 0.8675696849822998, "learning_rate": 8.457697339729613e-05, "loss": 1.2712, "step": 7280 }, { "epoch": 1.58, "grad_norm": 0.9282375574111938, "learning_rate": 8.455516790231138e-05, "loss": 1.2259, "step": 7290 }, { "epoch": 1.58, "grad_norm": 0.9778069853782654, "learning_rate": 8.453336240732665e-05, "loss": 1.2499, "step": 7300 }, { "epoch": 1.59, "grad_norm": 1.0154436826705933, "learning_rate": 8.451155691234192e-05, "loss": 1.2253, "step": 7310 }, { "epoch": 1.59, "grad_norm": 0.9822314381599426, "learning_rate": 8.448975141735718e-05, "loss": 1.2583, "step": 7320 }, { "epoch": 1.59, "grad_norm": 1.0584256649017334, "learning_rate": 8.446794592237244e-05, "loss": 1.2682, "step": 7330 }, { "epoch": 1.59, "grad_norm": 1.035949945449829, "learning_rate": 8.44461404273877e-05, "loss": 1.2604, "step": 7340 }, { "epoch": 1.6, "grad_norm": 0.9688887596130371, "learning_rate": 8.442433493240297e-05, "loss": 1.2308, "step": 7350 }, { "epoch": 1.6, "grad_norm": 1.0668280124664307, "learning_rate": 8.440252943741823e-05, "loss": 1.2523, "step": 7360 }, { "epoch": 1.6, "grad_norm": 1.0507837533950806, "learning_rate": 8.43807239424335e-05, "loss": 1.2458, "step": 7370 }, { "epoch": 1.6, "grad_norm": 0.9705730676651001, "learning_rate": 8.435891844744876e-05, "loss": 1.2623, "step": 7380 }, { "epoch": 1.6, "grad_norm": 1.1198492050170898, "learning_rate": 8.433711295246402e-05, "loss": 1.2263, "step": 7390 }, { "epoch": 1.61, "grad_norm": 1.090376853942871, "learning_rate": 8.431530745747928e-05, "loss": 1.2549, "step": 7400 }, { "epoch": 1.61, "grad_norm": 0.9599369764328003, "learning_rate": 8.429350196249455e-05, "loss": 1.2453, "step": 7410 }, { "epoch": 1.61, "grad_norm": 0.9473201036453247, "learning_rate": 8.427169646750982e-05, "loss": 1.2449, "step": 7420 }, { "epoch": 1.61, "grad_norm": 1.0158095359802246, "learning_rate": 8.424989097252509e-05, "loss": 1.2395, "step": 7430 }, { "epoch": 1.62, "grad_norm": 1.1401153802871704, "learning_rate": 8.422808547754034e-05, "loss": 1.2426, "step": 7440 }, { "epoch": 1.62, "grad_norm": 0.9833976030349731, "learning_rate": 8.42062799825556e-05, "loss": 1.2238, "step": 7450 }, { "epoch": 1.62, "grad_norm": 1.0531307458877563, "learning_rate": 8.418447448757088e-05, "loss": 1.2286, "step": 7460 }, { "epoch": 1.62, "grad_norm": 0.9833014607429504, "learning_rate": 8.416266899258614e-05, "loss": 1.2483, "step": 7470 }, { "epoch": 1.62, "grad_norm": 1.0215846300125122, "learning_rate": 8.41408634976014e-05, "loss": 1.2434, "step": 7480 }, { "epoch": 1.63, "grad_norm": 0.9338911175727844, "learning_rate": 8.411905800261667e-05, "loss": 1.2263, "step": 7490 }, { "epoch": 1.63, "grad_norm": 0.9091663360595703, "learning_rate": 8.409725250763193e-05, "loss": 1.2359, "step": 7500 }, { "epoch": 1.63, "grad_norm": 0.9303663969039917, "learning_rate": 8.407544701264719e-05, "loss": 1.243, "step": 7510 }, { "epoch": 1.63, "grad_norm": 0.9787565469741821, "learning_rate": 8.405364151766245e-05, "loss": 1.2444, "step": 7520 }, { "epoch": 1.63, "grad_norm": 1.1064313650131226, "learning_rate": 8.403183602267772e-05, "loss": 1.2438, "step": 7530 }, { "epoch": 1.64, "grad_norm": 0.9433283805847168, "learning_rate": 8.401003052769298e-05, "loss": 1.2442, "step": 7540 }, { "epoch": 1.64, "grad_norm": 0.9914006590843201, "learning_rate": 8.398822503270824e-05, "loss": 1.2595, "step": 7550 }, { "epoch": 1.64, "grad_norm": 1.1178406476974487, "learning_rate": 8.39664195377235e-05, "loss": 1.2223, "step": 7560 }, { "epoch": 1.64, "grad_norm": 1.1177582740783691, "learning_rate": 8.394461404273878e-05, "loss": 1.2284, "step": 7570 }, { "epoch": 1.65, "grad_norm": 1.0288305282592773, "learning_rate": 8.392280854775405e-05, "loss": 1.2329, "step": 7580 }, { "epoch": 1.65, "grad_norm": 1.078165054321289, "learning_rate": 8.39010030527693e-05, "loss": 1.2149, "step": 7590 }, { "epoch": 1.65, "grad_norm": 1.0270469188690186, "learning_rate": 8.387919755778456e-05, "loss": 1.2453, "step": 7600 }, { "epoch": 1.65, "grad_norm": 1.142359733581543, "learning_rate": 8.385739206279984e-05, "loss": 1.2115, "step": 7610 }, { "epoch": 1.65, "grad_norm": 1.066074252128601, "learning_rate": 8.38355865678151e-05, "loss": 1.2282, "step": 7620 }, { "epoch": 1.66, "grad_norm": 0.9854233860969543, "learning_rate": 8.381378107283036e-05, "loss": 1.25, "step": 7630 }, { "epoch": 1.66, "grad_norm": 1.0901075601577759, "learning_rate": 8.379197557784561e-05, "loss": 1.2237, "step": 7640 }, { "epoch": 1.66, "grad_norm": 1.1587127447128296, "learning_rate": 8.377017008286089e-05, "loss": 1.219, "step": 7650 }, { "epoch": 1.66, "grad_norm": 0.9623563289642334, "learning_rate": 8.374836458787615e-05, "loss": 1.2311, "step": 7660 }, { "epoch": 1.67, "grad_norm": 0.9470689296722412, "learning_rate": 8.372655909289141e-05, "loss": 1.2515, "step": 7670 }, { "epoch": 1.67, "grad_norm": 0.9638876914978027, "learning_rate": 8.370475359790668e-05, "loss": 1.2532, "step": 7680 }, { "epoch": 1.67, "grad_norm": 1.163567304611206, "learning_rate": 8.368294810292194e-05, "loss": 1.2615, "step": 7690 }, { "epoch": 1.67, "grad_norm": 1.001160979270935, "learning_rate": 8.36611426079372e-05, "loss": 1.2472, "step": 7700 }, { "epoch": 1.67, "grad_norm": 1.0169782638549805, "learning_rate": 8.363933711295247e-05, "loss": 1.2473, "step": 7710 }, { "epoch": 1.68, "grad_norm": 0.9867805242538452, "learning_rate": 8.361753161796774e-05, "loss": 1.2452, "step": 7720 }, { "epoch": 1.68, "grad_norm": 1.0535905361175537, "learning_rate": 8.359572612298299e-05, "loss": 1.2405, "step": 7730 }, { "epoch": 1.68, "grad_norm": 0.9246835708618164, "learning_rate": 8.357392062799825e-05, "loss": 1.2522, "step": 7740 }, { "epoch": 1.68, "grad_norm": 1.0927287340164185, "learning_rate": 8.355211513301352e-05, "loss": 1.2493, "step": 7750 }, { "epoch": 1.68, "grad_norm": 1.054208755493164, "learning_rate": 8.35303096380288e-05, "loss": 1.263, "step": 7760 }, { "epoch": 1.69, "grad_norm": 0.9636792540550232, "learning_rate": 8.350850414304406e-05, "loss": 1.2426, "step": 7770 }, { "epoch": 1.69, "grad_norm": 1.0837719440460205, "learning_rate": 8.348669864805932e-05, "loss": 1.2265, "step": 7780 }, { "epoch": 1.69, "grad_norm": 0.9462710022926331, "learning_rate": 8.346489315307457e-05, "loss": 1.2242, "step": 7790 }, { "epoch": 1.69, "grad_norm": 0.987519383430481, "learning_rate": 8.344308765808985e-05, "loss": 1.2261, "step": 7800 }, { "epoch": 1.7, "grad_norm": 1.0755093097686768, "learning_rate": 8.342128216310511e-05, "loss": 1.2486, "step": 7810 }, { "epoch": 1.7, "grad_norm": 0.9885231852531433, "learning_rate": 8.339947666812037e-05, "loss": 1.2325, "step": 7820 }, { "epoch": 1.7, "grad_norm": 1.0870469808578491, "learning_rate": 8.337767117313564e-05, "loss": 1.2175, "step": 7830 }, { "epoch": 1.7, "grad_norm": 1.0006695985794067, "learning_rate": 8.33558656781509e-05, "loss": 1.2521, "step": 7840 }, { "epoch": 1.7, "grad_norm": 1.0880390405654907, "learning_rate": 8.333406018316616e-05, "loss": 1.2353, "step": 7850 }, { "epoch": 1.71, "grad_norm": 0.9993226528167725, "learning_rate": 8.331225468818142e-05, "loss": 1.2365, "step": 7860 }, { "epoch": 1.71, "grad_norm": 0.964745819568634, "learning_rate": 8.329044919319669e-05, "loss": 1.2566, "step": 7870 }, { "epoch": 1.71, "grad_norm": 0.9665801525115967, "learning_rate": 8.326864369821195e-05, "loss": 1.2266, "step": 7880 }, { "epoch": 1.71, "grad_norm": 1.0917197465896606, "learning_rate": 8.324683820322721e-05, "loss": 1.2457, "step": 7890 }, { "epoch": 1.72, "grad_norm": 1.1263692378997803, "learning_rate": 8.322503270824248e-05, "loss": 1.2312, "step": 7900 }, { "epoch": 1.72, "grad_norm": 0.9168413877487183, "learning_rate": 8.320322721325774e-05, "loss": 1.223, "step": 7910 }, { "epoch": 1.72, "grad_norm": 0.9771096706390381, "learning_rate": 8.318142171827302e-05, "loss": 1.2219, "step": 7920 }, { "epoch": 1.72, "grad_norm": 0.9901739358901978, "learning_rate": 8.315961622328828e-05, "loss": 1.2405, "step": 7930 }, { "epoch": 1.72, "grad_norm": 1.004320502281189, "learning_rate": 8.313781072830353e-05, "loss": 1.2584, "step": 7940 }, { "epoch": 1.73, "grad_norm": 0.897678554058075, "learning_rate": 8.31160052333188e-05, "loss": 1.2359, "step": 7950 }, { "epoch": 1.73, "grad_norm": 0.9914141893386841, "learning_rate": 8.309419973833407e-05, "loss": 1.2269, "step": 7960 }, { "epoch": 1.73, "grad_norm": 1.1783164739608765, "learning_rate": 8.307239424334933e-05, "loss": 1.2208, "step": 7970 }, { "epoch": 1.73, "grad_norm": 1.0260601043701172, "learning_rate": 8.30505887483646e-05, "loss": 1.2206, "step": 7980 }, { "epoch": 1.73, "grad_norm": 0.9606086015701294, "learning_rate": 8.302878325337986e-05, "loss": 1.246, "step": 7990 }, { "epoch": 1.74, "grad_norm": 1.0758907794952393, "learning_rate": 8.300697775839512e-05, "loss": 1.2386, "step": 8000 }, { "epoch": 1.74, "grad_norm": 0.9541261792182922, "learning_rate": 8.298517226341038e-05, "loss": 1.2554, "step": 8010 }, { "epoch": 1.74, "grad_norm": 1.130035161972046, "learning_rate": 8.296336676842565e-05, "loss": 1.2292, "step": 8020 }, { "epoch": 1.74, "grad_norm": 0.9219099879264832, "learning_rate": 8.294156127344091e-05, "loss": 1.2486, "step": 8030 }, { "epoch": 1.75, "grad_norm": 0.9194048643112183, "learning_rate": 8.291975577845617e-05, "loss": 1.2065, "step": 8040 }, { "epoch": 1.75, "grad_norm": 1.0724278688430786, "learning_rate": 8.289795028347144e-05, "loss": 1.232, "step": 8050 }, { "epoch": 1.75, "grad_norm": 1.0829250812530518, "learning_rate": 8.28761447884867e-05, "loss": 1.2374, "step": 8060 }, { "epoch": 1.75, "grad_norm": 0.9441924691200256, "learning_rate": 8.285433929350198e-05, "loss": 1.2248, "step": 8070 }, { "epoch": 1.75, "grad_norm": 1.0257307291030884, "learning_rate": 8.283253379851722e-05, "loss": 1.2356, "step": 8080 }, { "epoch": 1.76, "grad_norm": 0.8646122813224792, "learning_rate": 8.281072830353249e-05, "loss": 1.2497, "step": 8090 }, { "epoch": 1.76, "grad_norm": 1.100232481956482, "learning_rate": 8.278892280854775e-05, "loss": 1.2365, "step": 8100 }, { "epoch": 1.76, "grad_norm": 1.0597792863845825, "learning_rate": 8.276711731356303e-05, "loss": 1.2403, "step": 8110 }, { "epoch": 1.76, "grad_norm": 1.0088367462158203, "learning_rate": 8.274531181857829e-05, "loss": 1.2281, "step": 8120 }, { "epoch": 1.76, "grad_norm": 1.0818982124328613, "learning_rate": 8.272350632359355e-05, "loss": 1.2427, "step": 8130 }, { "epoch": 1.77, "grad_norm": 0.9281474947929382, "learning_rate": 8.27017008286088e-05, "loss": 1.2595, "step": 8140 }, { "epoch": 1.77, "grad_norm": 0.9748603105545044, "learning_rate": 8.267989533362408e-05, "loss": 1.248, "step": 8150 }, { "epoch": 1.77, "grad_norm": 1.027099370956421, "learning_rate": 8.265808983863934e-05, "loss": 1.2313, "step": 8160 }, { "epoch": 1.77, "grad_norm": 1.0615408420562744, "learning_rate": 8.26362843436546e-05, "loss": 1.2549, "step": 8170 }, { "epoch": 1.78, "grad_norm": 0.9190282225608826, "learning_rate": 8.261447884866987e-05, "loss": 1.2169, "step": 8180 }, { "epoch": 1.78, "grad_norm": 0.9824718236923218, "learning_rate": 8.259267335368513e-05, "loss": 1.2505, "step": 8190 }, { "epoch": 1.78, "grad_norm": 0.9848600029945374, "learning_rate": 8.25708678587004e-05, "loss": 1.2414, "step": 8200 }, { "epoch": 1.78, "grad_norm": 0.9373934268951416, "learning_rate": 8.254906236371566e-05, "loss": 1.2294, "step": 8210 }, { "epoch": 1.78, "grad_norm": 1.0315806865692139, "learning_rate": 8.252725686873093e-05, "loss": 1.2259, "step": 8220 }, { "epoch": 1.79, "grad_norm": 1.0654377937316895, "learning_rate": 8.250545137374618e-05, "loss": 1.249, "step": 8230 }, { "epoch": 1.79, "grad_norm": 1.0188405513763428, "learning_rate": 8.248364587876145e-05, "loss": 1.2361, "step": 8240 }, { "epoch": 1.79, "grad_norm": 0.9202408790588379, "learning_rate": 8.246184038377671e-05, "loss": 1.2344, "step": 8250 }, { "epoch": 1.79, "grad_norm": 0.953535795211792, "learning_rate": 8.244003488879199e-05, "loss": 1.2439, "step": 8260 }, { "epoch": 1.8, "grad_norm": 0.8910773992538452, "learning_rate": 8.241822939380725e-05, "loss": 1.2417, "step": 8270 }, { "epoch": 1.8, "grad_norm": 1.0123344659805298, "learning_rate": 8.23964238988225e-05, "loss": 1.2437, "step": 8280 }, { "epoch": 1.8, "grad_norm": 0.9692454934120178, "learning_rate": 8.237461840383776e-05, "loss": 1.2414, "step": 8290 }, { "epoch": 1.8, "grad_norm": 1.2110908031463623, "learning_rate": 8.235281290885304e-05, "loss": 1.2273, "step": 8300 }, { "epoch": 1.8, "grad_norm": 0.9399771690368652, "learning_rate": 8.23310074138683e-05, "loss": 1.2305, "step": 8310 }, { "epoch": 1.81, "grad_norm": 1.0485948324203491, "learning_rate": 8.230920191888356e-05, "loss": 1.2243, "step": 8320 }, { "epoch": 1.81, "grad_norm": 1.1290273666381836, "learning_rate": 8.228739642389883e-05, "loss": 1.2647, "step": 8330 }, { "epoch": 1.81, "grad_norm": 1.113707184791565, "learning_rate": 8.226559092891409e-05, "loss": 1.2396, "step": 8340 }, { "epoch": 1.81, "grad_norm": 1.161978006362915, "learning_rate": 8.224378543392935e-05, "loss": 1.2371, "step": 8350 }, { "epoch": 1.81, "grad_norm": 1.075077772140503, "learning_rate": 8.222197993894462e-05, "loss": 1.2326, "step": 8360 }, { "epoch": 1.82, "grad_norm": 0.9579611420631409, "learning_rate": 8.220017444395988e-05, "loss": 1.2212, "step": 8370 }, { "epoch": 1.82, "grad_norm": 1.0509251356124878, "learning_rate": 8.217836894897514e-05, "loss": 1.2234, "step": 8380 }, { "epoch": 1.82, "grad_norm": 1.02772057056427, "learning_rate": 8.21565634539904e-05, "loss": 1.212, "step": 8390 }, { "epoch": 1.82, "grad_norm": 1.0468199253082275, "learning_rate": 8.213475795900567e-05, "loss": 1.2328, "step": 8400 }, { "epoch": 1.83, "grad_norm": 0.9836091995239258, "learning_rate": 8.211295246402095e-05, "loss": 1.2368, "step": 8410 }, { "epoch": 1.83, "grad_norm": 1.0582927465438843, "learning_rate": 8.209114696903621e-05, "loss": 1.2466, "step": 8420 }, { "epoch": 1.83, "grad_norm": 1.039549708366394, "learning_rate": 8.206934147405146e-05, "loss": 1.2334, "step": 8430 }, { "epoch": 1.83, "grad_norm": 0.9211510419845581, "learning_rate": 8.204753597906672e-05, "loss": 1.2205, "step": 8440 }, { "epoch": 1.83, "grad_norm": 1.019851565361023, "learning_rate": 8.2025730484082e-05, "loss": 1.2416, "step": 8450 }, { "epoch": 1.84, "grad_norm": 1.0609748363494873, "learning_rate": 8.200392498909726e-05, "loss": 1.2315, "step": 8460 }, { "epoch": 1.84, "grad_norm": 1.1158742904663086, "learning_rate": 8.198211949411252e-05, "loss": 1.2485, "step": 8470 }, { "epoch": 1.84, "grad_norm": 0.8996789455413818, "learning_rate": 8.196031399912779e-05, "loss": 1.2309, "step": 8480 }, { "epoch": 1.84, "grad_norm": 0.9898722171783447, "learning_rate": 8.193850850414305e-05, "loss": 1.236, "step": 8490 }, { "epoch": 1.85, "grad_norm": 1.1336474418640137, "learning_rate": 8.191670300915831e-05, "loss": 1.2375, "step": 8500 }, { "epoch": 1.85, "grad_norm": 0.9630258679389954, "learning_rate": 8.189489751417358e-05, "loss": 1.2462, "step": 8510 }, { "epoch": 1.85, "grad_norm": 0.9450762271881104, "learning_rate": 8.187309201918884e-05, "loss": 1.221, "step": 8520 }, { "epoch": 1.85, "grad_norm": 0.9798605442047119, "learning_rate": 8.18512865242041e-05, "loss": 1.2222, "step": 8530 }, { "epoch": 1.85, "grad_norm": 0.9023801684379578, "learning_rate": 8.182948102921936e-05, "loss": 1.2193, "step": 8540 }, { "epoch": 1.86, "grad_norm": 0.9918519258499146, "learning_rate": 8.180767553423463e-05, "loss": 1.2538, "step": 8550 }, { "epoch": 1.86, "grad_norm": 1.078640341758728, "learning_rate": 8.178587003924989e-05, "loss": 1.2239, "step": 8560 }, { "epoch": 1.86, "grad_norm": 1.1001946926116943, "learning_rate": 8.176406454426517e-05, "loss": 1.2542, "step": 8570 }, { "epoch": 1.86, "grad_norm": 0.9115540385246277, "learning_rate": 8.174225904928042e-05, "loss": 1.2231, "step": 8580 }, { "epoch": 1.86, "grad_norm": 1.0351630449295044, "learning_rate": 8.172045355429568e-05, "loss": 1.2328, "step": 8590 }, { "epoch": 1.87, "grad_norm": 1.1193772554397583, "learning_rate": 8.169864805931094e-05, "loss": 1.2344, "step": 8600 }, { "epoch": 1.87, "grad_norm": 0.926569402217865, "learning_rate": 8.167684256432622e-05, "loss": 1.2318, "step": 8610 }, { "epoch": 1.87, "grad_norm": 1.1995497941970825, "learning_rate": 8.165503706934148e-05, "loss": 1.2645, "step": 8620 }, { "epoch": 1.87, "grad_norm": 1.0718098878860474, "learning_rate": 8.163323157435673e-05, "loss": 1.2372, "step": 8630 }, { "epoch": 1.88, "grad_norm": 1.0319968461990356, "learning_rate": 8.161142607937201e-05, "loss": 1.222, "step": 8640 }, { "epoch": 1.88, "grad_norm": 1.0868433713912964, "learning_rate": 8.158962058438727e-05, "loss": 1.2381, "step": 8650 }, { "epoch": 1.88, "grad_norm": 1.0332001447677612, "learning_rate": 8.156781508940253e-05, "loss": 1.208, "step": 8660 }, { "epoch": 1.88, "grad_norm": 1.050507664680481, "learning_rate": 8.15460095944178e-05, "loss": 1.2276, "step": 8670 }, { "epoch": 1.88, "grad_norm": 0.9764347672462463, "learning_rate": 8.152420409943306e-05, "loss": 1.2289, "step": 8680 }, { "epoch": 1.89, "grad_norm": 0.9142500758171082, "learning_rate": 8.150239860444832e-05, "loss": 1.2109, "step": 8690 }, { "epoch": 1.89, "grad_norm": 1.028554916381836, "learning_rate": 8.148059310946359e-05, "loss": 1.2245, "step": 8700 }, { "epoch": 1.89, "grad_norm": 1.09976327419281, "learning_rate": 8.145878761447885e-05, "loss": 1.2387, "step": 8710 }, { "epoch": 1.89, "grad_norm": 1.0482656955718994, "learning_rate": 8.143698211949413e-05, "loss": 1.2225, "step": 8720 }, { "epoch": 1.9, "grad_norm": 0.953663170337677, "learning_rate": 8.141517662450938e-05, "loss": 1.2605, "step": 8730 }, { "epoch": 1.9, "grad_norm": 1.0766589641571045, "learning_rate": 8.139337112952464e-05, "loss": 1.2348, "step": 8740 }, { "epoch": 1.9, "grad_norm": 1.1204911470413208, "learning_rate": 8.13715656345399e-05, "loss": 1.2248, "step": 8750 }, { "epoch": 1.9, "grad_norm": 1.0836663246154785, "learning_rate": 8.134976013955518e-05, "loss": 1.2463, "step": 8760 }, { "epoch": 1.9, "grad_norm": 1.0038310289382935, "learning_rate": 8.132795464457044e-05, "loss": 1.2415, "step": 8770 }, { "epoch": 1.91, "grad_norm": 0.9727823138237, "learning_rate": 8.130614914958569e-05, "loss": 1.2291, "step": 8780 }, { "epoch": 1.91, "grad_norm": 0.9913771748542786, "learning_rate": 8.128434365460095e-05, "loss": 1.2374, "step": 8790 }, { "epoch": 1.91, "grad_norm": 1.0077624320983887, "learning_rate": 8.126253815961623e-05, "loss": 1.2126, "step": 8800 }, { "epoch": 1.91, "grad_norm": 0.9802316427230835, "learning_rate": 8.12407326646315e-05, "loss": 1.2084, "step": 8810 }, { "epoch": 1.91, "grad_norm": 1.1375538110733032, "learning_rate": 8.121892716964676e-05, "loss": 1.231, "step": 8820 }, { "epoch": 1.92, "grad_norm": 1.0553092956542969, "learning_rate": 8.119712167466202e-05, "loss": 1.2132, "step": 8830 }, { "epoch": 1.92, "grad_norm": 0.9583929777145386, "learning_rate": 8.117531617967728e-05, "loss": 1.2492, "step": 8840 }, { "epoch": 1.92, "grad_norm": 1.1101999282836914, "learning_rate": 8.115351068469255e-05, "loss": 1.2381, "step": 8850 }, { "epoch": 1.92, "grad_norm": 0.9837037920951843, "learning_rate": 8.113170518970781e-05, "loss": 1.2122, "step": 8860 }, { "epoch": 1.93, "grad_norm": 0.9561728835105896, "learning_rate": 8.110989969472309e-05, "loss": 1.2371, "step": 8870 }, { "epoch": 1.93, "grad_norm": 1.0024539232254028, "learning_rate": 8.108809419973834e-05, "loss": 1.2421, "step": 8880 }, { "epoch": 1.93, "grad_norm": 0.8823496103286743, "learning_rate": 8.10662887047536e-05, "loss": 1.2221, "step": 8890 }, { "epoch": 1.93, "grad_norm": 0.9598950743675232, "learning_rate": 8.104448320976886e-05, "loss": 1.2043, "step": 8900 }, { "epoch": 1.93, "grad_norm": 1.165281057357788, "learning_rate": 8.102267771478414e-05, "loss": 1.2261, "step": 8910 }, { "epoch": 1.94, "grad_norm": 0.9209827184677124, "learning_rate": 8.10008722197994e-05, "loss": 1.2196, "step": 8920 }, { "epoch": 1.94, "grad_norm": 1.023848056793213, "learning_rate": 8.097906672481465e-05, "loss": 1.2393, "step": 8930 }, { "epoch": 1.94, "grad_norm": 1.0043749809265137, "learning_rate": 8.095726122982991e-05, "loss": 1.2362, "step": 8940 }, { "epoch": 1.94, "grad_norm": 0.9257699251174927, "learning_rate": 8.093545573484519e-05, "loss": 1.2258, "step": 8950 }, { "epoch": 1.95, "grad_norm": 1.1696765422821045, "learning_rate": 8.091365023986045e-05, "loss": 1.2459, "step": 8960 }, { "epoch": 1.95, "grad_norm": 0.9257934093475342, "learning_rate": 8.089184474487572e-05, "loss": 1.2492, "step": 8970 }, { "epoch": 1.95, "grad_norm": 1.1503798961639404, "learning_rate": 8.087003924989097e-05, "loss": 1.2311, "step": 8980 }, { "epoch": 1.95, "grad_norm": 1.1405220031738281, "learning_rate": 8.084823375490624e-05, "loss": 1.2409, "step": 8990 }, { "epoch": 1.95, "grad_norm": 0.976625382900238, "learning_rate": 8.08264282599215e-05, "loss": 1.2266, "step": 9000 }, { "epoch": 1.96, "grad_norm": 0.9233745336532593, "learning_rate": 8.080462276493677e-05, "loss": 1.2261, "step": 9010 }, { "epoch": 1.96, "grad_norm": 1.0994141101837158, "learning_rate": 8.078281726995203e-05, "loss": 1.2352, "step": 9020 }, { "epoch": 1.96, "grad_norm": 0.9999457001686096, "learning_rate": 8.07610117749673e-05, "loss": 1.2238, "step": 9030 }, { "epoch": 1.96, "grad_norm": 1.0037119388580322, "learning_rate": 8.073920627998256e-05, "loss": 1.2439, "step": 9040 }, { "epoch": 1.96, "grad_norm": 0.9493910670280457, "learning_rate": 8.071740078499782e-05, "loss": 1.2253, "step": 9050 }, { "epoch": 1.97, "grad_norm": 1.099271535873413, "learning_rate": 8.069559529001308e-05, "loss": 1.211, "step": 9060 }, { "epoch": 1.97, "grad_norm": 0.9729533791542053, "learning_rate": 8.067378979502836e-05, "loss": 1.2257, "step": 9070 }, { "epoch": 1.97, "grad_norm": 1.112057089805603, "learning_rate": 8.065198430004361e-05, "loss": 1.2092, "step": 9080 }, { "epoch": 1.97, "grad_norm": 0.9645751714706421, "learning_rate": 8.063017880505887e-05, "loss": 1.2123, "step": 9090 }, { "epoch": 1.98, "grad_norm": 1.0263340473175049, "learning_rate": 8.060837331007415e-05, "loss": 1.2033, "step": 9100 }, { "epoch": 1.98, "grad_norm": 1.1131114959716797, "learning_rate": 8.058656781508941e-05, "loss": 1.2303, "step": 9110 }, { "epoch": 1.98, "grad_norm": 1.1425633430480957, "learning_rate": 8.056476232010468e-05, "loss": 1.2166, "step": 9120 }, { "epoch": 1.98, "grad_norm": 0.9223284721374512, "learning_rate": 8.054295682511992e-05, "loss": 1.2588, "step": 9130 }, { "epoch": 1.98, "grad_norm": 0.9477842450141907, "learning_rate": 8.05211513301352e-05, "loss": 1.2028, "step": 9140 }, { "epoch": 1.99, "grad_norm": 1.0649006366729736, "learning_rate": 8.049934583515046e-05, "loss": 1.2238, "step": 9150 }, { "epoch": 1.99, "grad_norm": 1.0043710470199585, "learning_rate": 8.047754034016573e-05, "loss": 1.2301, "step": 9160 }, { "epoch": 1.99, "grad_norm": 1.0217610597610474, "learning_rate": 8.045573484518099e-05, "loss": 1.2406, "step": 9170 }, { "epoch": 1.99, "grad_norm": 0.9688403606414795, "learning_rate": 8.043392935019625e-05, "loss": 1.2364, "step": 9180 }, { "epoch": 2.0, "grad_norm": 1.095987319946289, "learning_rate": 8.041212385521152e-05, "loss": 1.241, "step": 9190 }, { "epoch": 2.0, "grad_norm": 0.9398607611656189, "learning_rate": 8.039031836022678e-05, "loss": 1.226, "step": 9200 }, { "epoch": 2.0, "grad_norm": 0.9815939664840698, "learning_rate": 8.036851286524204e-05, "loss": 1.2181, "step": 9210 }, { "epoch": 2.0, "eval_loss": 1.2817823886871338, "eval_runtime": 1495.0675, "eval_samples_per_second": 258.75, "eval_steps_per_second": 4.043, "step": 9212 }, { "epoch": 2.0, "grad_norm": 1.0157184600830078, "learning_rate": 8.034670737025732e-05, "loss": 1.2142, "step": 9220 }, { "epoch": 2.0, "grad_norm": 0.9625092148780823, "learning_rate": 8.032490187527257e-05, "loss": 1.2089, "step": 9230 }, { "epoch": 2.01, "grad_norm": 0.9196017384529114, "learning_rate": 8.030309638028783e-05, "loss": 1.2335, "step": 9240 }, { "epoch": 2.01, "grad_norm": 0.9308544397354126, "learning_rate": 8.02812908853031e-05, "loss": 1.2163, "step": 9250 }, { "epoch": 2.01, "grad_norm": 1.2144242525100708, "learning_rate": 8.025948539031837e-05, "loss": 1.2008, "step": 9260 }, { "epoch": 2.01, "grad_norm": 0.9780566692352295, "learning_rate": 8.023767989533363e-05, "loss": 1.1919, "step": 9270 }, { "epoch": 2.01, "grad_norm": 0.9934610724449158, "learning_rate": 8.021587440034888e-05, "loss": 1.1813, "step": 9280 }, { "epoch": 2.02, "grad_norm": 1.1047219038009644, "learning_rate": 8.019406890536415e-05, "loss": 1.1887, "step": 9290 }, { "epoch": 2.02, "grad_norm": 1.0617597103118896, "learning_rate": 8.017226341037942e-05, "loss": 1.2142, "step": 9300 }, { "epoch": 2.02, "grad_norm": 0.9656373858451843, "learning_rate": 8.015045791539469e-05, "loss": 1.1962, "step": 9310 }, { "epoch": 2.02, "grad_norm": 0.9934256076812744, "learning_rate": 8.012865242040995e-05, "loss": 1.2093, "step": 9320 }, { "epoch": 2.03, "grad_norm": 1.0616453886032104, "learning_rate": 8.010684692542521e-05, "loss": 1.227, "step": 9330 }, { "epoch": 2.03, "grad_norm": 1.0761624574661255, "learning_rate": 8.008504143044048e-05, "loss": 1.2126, "step": 9340 }, { "epoch": 2.03, "grad_norm": 1.06252920627594, "learning_rate": 8.006323593545574e-05, "loss": 1.1966, "step": 9350 }, { "epoch": 2.03, "grad_norm": 0.9828883409500122, "learning_rate": 8.0041430440471e-05, "loss": 1.2032, "step": 9360 }, { "epoch": 2.03, "grad_norm": 1.0415362119674683, "learning_rate": 8.001962494548628e-05, "loss": 1.2069, "step": 9370 }, { "epoch": 2.04, "grad_norm": 0.9932116866111755, "learning_rate": 7.999781945050153e-05, "loss": 1.2099, "step": 9380 }, { "epoch": 2.04, "grad_norm": 1.0453740358352661, "learning_rate": 7.997601395551679e-05, "loss": 1.1908, "step": 9390 }, { "epoch": 2.04, "grad_norm": 0.9478277564048767, "learning_rate": 7.995420846053205e-05, "loss": 1.2016, "step": 9400 }, { "epoch": 2.04, "grad_norm": 0.9447776079177856, "learning_rate": 7.993240296554733e-05, "loss": 1.2163, "step": 9410 }, { "epoch": 2.05, "grad_norm": 0.9693462252616882, "learning_rate": 7.991059747056259e-05, "loss": 1.1871, "step": 9420 }, { "epoch": 2.05, "grad_norm": 1.2381738424301147, "learning_rate": 7.988879197557784e-05, "loss": 1.214, "step": 9430 }, { "epoch": 2.05, "grad_norm": 0.9551769495010376, "learning_rate": 7.98669864805931e-05, "loss": 1.2026, "step": 9440 }, { "epoch": 2.05, "grad_norm": 1.009376883506775, "learning_rate": 7.984518098560838e-05, "loss": 1.1991, "step": 9450 }, { "epoch": 2.05, "grad_norm": 0.9546257257461548, "learning_rate": 7.982337549062365e-05, "loss": 1.2164, "step": 9460 }, { "epoch": 2.06, "grad_norm": 0.9941860437393188, "learning_rate": 7.980156999563891e-05, "loss": 1.2111, "step": 9470 }, { "epoch": 2.06, "grad_norm": 1.211512565612793, "learning_rate": 7.977976450065416e-05, "loss": 1.1795, "step": 9480 }, { "epoch": 2.06, "grad_norm": 1.004779577255249, "learning_rate": 7.975795900566943e-05, "loss": 1.2049, "step": 9490 }, { "epoch": 2.06, "grad_norm": 1.0823005437850952, "learning_rate": 7.97361535106847e-05, "loss": 1.1886, "step": 9500 }, { "epoch": 2.06, "grad_norm": 1.0418225526809692, "learning_rate": 7.971434801569996e-05, "loss": 1.2105, "step": 9510 }, { "epoch": 2.07, "grad_norm": 1.1182845830917358, "learning_rate": 7.969254252071522e-05, "loss": 1.1897, "step": 9520 }, { "epoch": 2.07, "grad_norm": 0.946642279624939, "learning_rate": 7.967073702573049e-05, "loss": 1.199, "step": 9530 }, { "epoch": 2.07, "grad_norm": 1.1157629489898682, "learning_rate": 7.964893153074575e-05, "loss": 1.2294, "step": 9540 }, { "epoch": 2.07, "grad_norm": 1.053207516670227, "learning_rate": 7.962712603576101e-05, "loss": 1.2412, "step": 9550 }, { "epoch": 2.08, "grad_norm": 0.9756922721862793, "learning_rate": 7.960532054077629e-05, "loss": 1.1976, "step": 9560 }, { "epoch": 2.08, "grad_norm": 1.049428105354309, "learning_rate": 7.958351504579155e-05, "loss": 1.2254, "step": 9570 }, { "epoch": 2.08, "grad_norm": 0.9671922922134399, "learning_rate": 7.95617095508068e-05, "loss": 1.1905, "step": 9580 }, { "epoch": 2.08, "grad_norm": 1.0883835554122925, "learning_rate": 7.953990405582206e-05, "loss": 1.2032, "step": 9590 }, { "epoch": 2.08, "grad_norm": 1.080729365348816, "learning_rate": 7.951809856083734e-05, "loss": 1.216, "step": 9600 }, { "epoch": 2.09, "grad_norm": 0.9762791395187378, "learning_rate": 7.94962930658526e-05, "loss": 1.2167, "step": 9610 }, { "epoch": 2.09, "grad_norm": 1.1527519226074219, "learning_rate": 7.947448757086787e-05, "loss": 1.1682, "step": 9620 }, { "epoch": 2.09, "grad_norm": 1.0505051612854004, "learning_rate": 7.945268207588312e-05, "loss": 1.211, "step": 9630 }, { "epoch": 2.09, "grad_norm": 1.1166177988052368, "learning_rate": 7.94308765808984e-05, "loss": 1.1763, "step": 9640 }, { "epoch": 2.09, "grad_norm": 1.038783073425293, "learning_rate": 7.940907108591366e-05, "loss": 1.2113, "step": 9650 }, { "epoch": 2.1, "grad_norm": 1.0138919353485107, "learning_rate": 7.938726559092892e-05, "loss": 1.214, "step": 9660 }, { "epoch": 2.1, "grad_norm": 0.8989730477333069, "learning_rate": 7.936546009594418e-05, "loss": 1.1975, "step": 9670 }, { "epoch": 2.1, "grad_norm": 0.9866936206817627, "learning_rate": 7.934365460095945e-05, "loss": 1.2163, "step": 9680 }, { "epoch": 2.1, "grad_norm": 0.9352193474769592, "learning_rate": 7.932184910597471e-05, "loss": 1.1936, "step": 9690 }, { "epoch": 2.11, "grad_norm": 0.9865077137947083, "learning_rate": 7.930004361098997e-05, "loss": 1.2279, "step": 9700 }, { "epoch": 2.11, "grad_norm": 0.9269611835479736, "learning_rate": 7.927823811600523e-05, "loss": 1.2089, "step": 9710 }, { "epoch": 2.11, "grad_norm": 1.0865782499313354, "learning_rate": 7.92564326210205e-05, "loss": 1.2073, "step": 9720 }, { "epoch": 2.11, "grad_norm": 1.077241063117981, "learning_rate": 7.923462712603576e-05, "loss": 1.1952, "step": 9730 }, { "epoch": 2.11, "grad_norm": 1.1019902229309082, "learning_rate": 7.921282163105102e-05, "loss": 1.1845, "step": 9740 }, { "epoch": 2.12, "grad_norm": 1.1047565937042236, "learning_rate": 7.919101613606629e-05, "loss": 1.2115, "step": 9750 }, { "epoch": 2.12, "grad_norm": 1.038865327835083, "learning_rate": 7.916921064108156e-05, "loss": 1.1764, "step": 9760 }, { "epoch": 2.12, "grad_norm": 1.039838194847107, "learning_rate": 7.914740514609683e-05, "loss": 1.2061, "step": 9770 }, { "epoch": 2.12, "grad_norm": 1.1482833623886108, "learning_rate": 7.912559965111208e-05, "loss": 1.1819, "step": 9780 }, { "epoch": 2.13, "grad_norm": 1.2092708349227905, "learning_rate": 7.910379415612735e-05, "loss": 1.2204, "step": 9790 }, { "epoch": 2.13, "grad_norm": 0.9620797634124756, "learning_rate": 7.908198866114262e-05, "loss": 1.2282, "step": 9800 }, { "epoch": 2.13, "grad_norm": 0.9821200966835022, "learning_rate": 7.906018316615788e-05, "loss": 1.1928, "step": 9810 }, { "epoch": 2.13, "grad_norm": 0.9970041513442993, "learning_rate": 7.903837767117314e-05, "loss": 1.2293, "step": 9820 }, { "epoch": 2.13, "grad_norm": 1.0370044708251953, "learning_rate": 7.90165721761884e-05, "loss": 1.2015, "step": 9830 }, { "epoch": 2.14, "grad_norm": 0.9988645911216736, "learning_rate": 7.899476668120367e-05, "loss": 1.1827, "step": 9840 }, { "epoch": 2.14, "grad_norm": 1.0234349966049194, "learning_rate": 7.897296118621893e-05, "loss": 1.2185, "step": 9850 }, { "epoch": 2.14, "grad_norm": 1.1477036476135254, "learning_rate": 7.89511556912342e-05, "loss": 1.2108, "step": 9860 }, { "epoch": 2.14, "grad_norm": 1.1326051950454712, "learning_rate": 7.892935019624946e-05, "loss": 1.1785, "step": 9870 }, { "epoch": 2.14, "grad_norm": 1.003237009048462, "learning_rate": 7.890754470126472e-05, "loss": 1.2082, "step": 9880 }, { "epoch": 2.15, "grad_norm": 1.0607051849365234, "learning_rate": 7.888573920627998e-05, "loss": 1.2112, "step": 9890 }, { "epoch": 2.15, "grad_norm": 1.0867217779159546, "learning_rate": 7.886393371129525e-05, "loss": 1.1845, "step": 9900 }, { "epoch": 2.15, "grad_norm": 0.945563018321991, "learning_rate": 7.884212821631052e-05, "loss": 1.1925, "step": 9910 }, { "epoch": 2.15, "grad_norm": 1.0693022012710571, "learning_rate": 7.882032272132579e-05, "loss": 1.1956, "step": 9920 }, { "epoch": 2.16, "grad_norm": 0.9993180632591248, "learning_rate": 7.879851722634103e-05, "loss": 1.1965, "step": 9930 }, { "epoch": 2.16, "grad_norm": 1.010133147239685, "learning_rate": 7.87767117313563e-05, "loss": 1.2168, "step": 9940 }, { "epoch": 2.16, "grad_norm": 1.0953561067581177, "learning_rate": 7.875490623637157e-05, "loss": 1.2114, "step": 9950 }, { "epoch": 2.16, "grad_norm": 0.9444001317024231, "learning_rate": 7.873310074138684e-05, "loss": 1.1988, "step": 9960 }, { "epoch": 2.16, "grad_norm": 0.9980970621109009, "learning_rate": 7.87112952464021e-05, "loss": 1.2275, "step": 9970 }, { "epoch": 2.17, "grad_norm": 1.0584611892700195, "learning_rate": 7.868948975141735e-05, "loss": 1.2105, "step": 9980 }, { "epoch": 2.17, "grad_norm": 1.1327629089355469, "learning_rate": 7.866768425643263e-05, "loss": 1.2022, "step": 9990 }, { "epoch": 2.17, "grad_norm": 0.981350302696228, "learning_rate": 7.864587876144789e-05, "loss": 1.2151, "step": 10000 }, { "epoch": 2.17, "grad_norm": 1.1142750978469849, "learning_rate": 7.862407326646315e-05, "loss": 1.1931, "step": 10010 }, { "epoch": 2.18, "grad_norm": 1.0601882934570312, "learning_rate": 7.860226777147842e-05, "loss": 1.2141, "step": 10020 }, { "epoch": 2.18, "grad_norm": 0.9991333484649658, "learning_rate": 7.858046227649368e-05, "loss": 1.1921, "step": 10030 }, { "epoch": 2.18, "grad_norm": 1.1021018028259277, "learning_rate": 7.855865678150894e-05, "loss": 1.2225, "step": 10040 }, { "epoch": 2.18, "grad_norm": 1.0568020343780518, "learning_rate": 7.85368512865242e-05, "loss": 1.2427, "step": 10050 }, { "epoch": 2.18, "grad_norm": 0.9811879992485046, "learning_rate": 7.851504579153948e-05, "loss": 1.1997, "step": 10060 }, { "epoch": 2.19, "grad_norm": 1.0988446474075317, "learning_rate": 7.849324029655473e-05, "loss": 1.2156, "step": 10070 }, { "epoch": 2.19, "grad_norm": 1.0393906831741333, "learning_rate": 7.847143480157e-05, "loss": 1.2258, "step": 10080 }, { "epoch": 2.19, "grad_norm": 1.1017202138900757, "learning_rate": 7.844962930658526e-05, "loss": 1.2069, "step": 10090 }, { "epoch": 2.19, "grad_norm": 1.1102749109268188, "learning_rate": 7.842782381160053e-05, "loss": 1.2256, "step": 10100 }, { "epoch": 2.19, "grad_norm": 1.0270189046859741, "learning_rate": 7.84060183166158e-05, "loss": 1.2174, "step": 10110 }, { "epoch": 2.2, "grad_norm": 1.0221537351608276, "learning_rate": 7.838421282163106e-05, "loss": 1.1968, "step": 10120 }, { "epoch": 2.2, "grad_norm": 0.95604407787323, "learning_rate": 7.836240732664631e-05, "loss": 1.213, "step": 10130 }, { "epoch": 2.2, "grad_norm": 0.9393739700317383, "learning_rate": 7.834060183166159e-05, "loss": 1.2182, "step": 10140 }, { "epoch": 2.2, "grad_norm": 1.014799952507019, "learning_rate": 7.831879633667685e-05, "loss": 1.2021, "step": 10150 }, { "epoch": 2.21, "grad_norm": 1.0287479162216187, "learning_rate": 7.829699084169211e-05, "loss": 1.2114, "step": 10160 }, { "epoch": 2.21, "grad_norm": 1.0790306329727173, "learning_rate": 7.827736589620584e-05, "loss": 1.1874, "step": 10170 }, { "epoch": 2.21, "grad_norm": 0.9588958621025085, "learning_rate": 7.82555604012211e-05, "loss": 1.2191, "step": 10180 }, { "epoch": 2.21, "grad_norm": 0.9004745483398438, "learning_rate": 7.823375490623638e-05, "loss": 1.1933, "step": 10190 }, { "epoch": 2.21, "grad_norm": 1.0742331743240356, "learning_rate": 7.821194941125164e-05, "loss": 1.2128, "step": 10200 }, { "epoch": 2.22, "grad_norm": 1.072489857673645, "learning_rate": 7.81901439162669e-05, "loss": 1.2143, "step": 10210 }, { "epoch": 2.22, "grad_norm": 0.9534905552864075, "learning_rate": 7.816833842128217e-05, "loss": 1.2206, "step": 10220 }, { "epoch": 2.22, "grad_norm": 1.0694421529769897, "learning_rate": 7.814653292629743e-05, "loss": 1.2051, "step": 10230 }, { "epoch": 2.22, "grad_norm": 0.9729447364807129, "learning_rate": 7.81247274313127e-05, "loss": 1.2234, "step": 10240 }, { "epoch": 2.23, "grad_norm": 1.0395437479019165, "learning_rate": 7.810292193632796e-05, "loss": 1.1977, "step": 10250 }, { "epoch": 2.23, "grad_norm": 0.999451756477356, "learning_rate": 7.808111644134322e-05, "loss": 1.2053, "step": 10260 }, { "epoch": 2.23, "grad_norm": 1.1238023042678833, "learning_rate": 7.805931094635848e-05, "loss": 1.2295, "step": 10270 }, { "epoch": 2.23, "grad_norm": 1.0689754486083984, "learning_rate": 7.803750545137375e-05, "loss": 1.2059, "step": 10280 }, { "epoch": 2.23, "grad_norm": 0.9754849672317505, "learning_rate": 7.801569995638901e-05, "loss": 1.206, "step": 10290 }, { "epoch": 2.24, "grad_norm": 1.02662193775177, "learning_rate": 7.799389446140429e-05, "loss": 1.1967, "step": 10300 }, { "epoch": 2.24, "grad_norm": 1.1547129154205322, "learning_rate": 7.797208896641954e-05, "loss": 1.211, "step": 10310 }, { "epoch": 2.24, "grad_norm": 0.9812795519828796, "learning_rate": 7.79502834714348e-05, "loss": 1.1928, "step": 10320 }, { "epoch": 2.24, "grad_norm": 1.0706185102462769, "learning_rate": 7.792847797645006e-05, "loss": 1.1914, "step": 10330 }, { "epoch": 2.24, "grad_norm": 1.0410836935043335, "learning_rate": 7.790667248146534e-05, "loss": 1.2002, "step": 10340 }, { "epoch": 2.25, "grad_norm": 0.9746688008308411, "learning_rate": 7.78848669864806e-05, "loss": 1.1863, "step": 10350 }, { "epoch": 2.25, "grad_norm": 0.8778429627418518, "learning_rate": 7.786524204099433e-05, "loss": 1.2383, "step": 10360 }, { "epoch": 2.25, "grad_norm": 0.969650149345398, "learning_rate": 7.78434365460096e-05, "loss": 1.177, "step": 10370 }, { "epoch": 2.25, "grad_norm": 1.015781283378601, "learning_rate": 7.782163105102486e-05, "loss": 1.1838, "step": 10380 }, { "epoch": 2.26, "grad_norm": 0.8965770602226257, "learning_rate": 7.779982555604013e-05, "loss": 1.2175, "step": 10390 }, { "epoch": 2.26, "grad_norm": 1.007692575454712, "learning_rate": 7.77780200610554e-05, "loss": 1.1978, "step": 10400 }, { "epoch": 2.26, "grad_norm": 0.9334578514099121, "learning_rate": 7.775621456607065e-05, "loss": 1.1887, "step": 10410 }, { "epoch": 2.26, "grad_norm": 0.9570727348327637, "learning_rate": 7.773440907108591e-05, "loss": 1.211, "step": 10420 }, { "epoch": 2.26, "grad_norm": 1.0146620273590088, "learning_rate": 7.771260357610119e-05, "loss": 1.2188, "step": 10430 }, { "epoch": 2.27, "grad_norm": 1.0868462324142456, "learning_rate": 7.769079808111645e-05, "loss": 1.2147, "step": 10440 }, { "epoch": 2.27, "grad_norm": 1.062110185623169, "learning_rate": 7.766899258613171e-05, "loss": 1.2172, "step": 10450 }, { "epoch": 2.27, "grad_norm": 0.950108528137207, "learning_rate": 7.764718709114697e-05, "loss": 1.2077, "step": 10460 }, { "epoch": 2.27, "grad_norm": 1.029308795928955, "learning_rate": 7.762538159616224e-05, "loss": 1.2112, "step": 10470 }, { "epoch": 2.28, "grad_norm": 0.9809032678604126, "learning_rate": 7.76035761011775e-05, "loss": 1.2115, "step": 10480 }, { "epoch": 2.28, "grad_norm": 1.0070390701293945, "learning_rate": 7.758177060619276e-05, "loss": 1.2032, "step": 10490 }, { "epoch": 2.28, "grad_norm": 1.1221727132797241, "learning_rate": 7.755996511120803e-05, "loss": 1.2164, "step": 10500 }, { "epoch": 2.28, "grad_norm": 1.013219952583313, "learning_rate": 7.753815961622329e-05, "loss": 1.1912, "step": 10510 }, { "epoch": 2.28, "grad_norm": 1.0602985620498657, "learning_rate": 7.751635412123855e-05, "loss": 1.1607, "step": 10520 }, { "epoch": 2.29, "grad_norm": 1.009325385093689, "learning_rate": 7.749454862625382e-05, "loss": 1.1943, "step": 10530 }, { "epoch": 2.29, "grad_norm": 1.01610267162323, "learning_rate": 7.747274313126909e-05, "loss": 1.2036, "step": 10540 }, { "epoch": 2.29, "grad_norm": 0.9865471720695496, "learning_rate": 7.745093763628436e-05, "loss": 1.1951, "step": 10550 }, { "epoch": 2.29, "grad_norm": 1.1565035581588745, "learning_rate": 7.74291321412996e-05, "loss": 1.2132, "step": 10560 }, { "epoch": 2.29, "grad_norm": 0.9530940651893616, "learning_rate": 7.740732664631487e-05, "loss": 1.191, "step": 10570 }, { "epoch": 2.3, "grad_norm": 1.1055086851119995, "learning_rate": 7.738552115133014e-05, "loss": 1.2292, "step": 10580 }, { "epoch": 2.3, "grad_norm": 1.0695475339889526, "learning_rate": 7.736371565634541e-05, "loss": 1.1937, "step": 10590 }, { "epoch": 2.3, "grad_norm": 0.991439163684845, "learning_rate": 7.734191016136067e-05, "loss": 1.2117, "step": 10600 }, { "epoch": 2.3, "grad_norm": 0.9743112921714783, "learning_rate": 7.732010466637592e-05, "loss": 1.2275, "step": 10610 }, { "epoch": 2.31, "grad_norm": 1.030121922492981, "learning_rate": 7.72982991713912e-05, "loss": 1.1893, "step": 10620 }, { "epoch": 2.31, "grad_norm": 1.0691959857940674, "learning_rate": 7.727649367640646e-05, "loss": 1.2044, "step": 10630 }, { "epoch": 2.31, "grad_norm": 1.141326904296875, "learning_rate": 7.725468818142172e-05, "loss": 1.2208, "step": 10640 }, { "epoch": 2.31, "grad_norm": 1.0179444551467896, "learning_rate": 7.723288268643699e-05, "loss": 1.1901, "step": 10650 }, { "epoch": 2.31, "grad_norm": 1.1256074905395508, "learning_rate": 7.721107719145225e-05, "loss": 1.2, "step": 10660 }, { "epoch": 2.32, "grad_norm": 1.0997061729431152, "learning_rate": 7.718927169646751e-05, "loss": 1.194, "step": 10670 }, { "epoch": 2.32, "grad_norm": 1.0382623672485352, "learning_rate": 7.716746620148277e-05, "loss": 1.2277, "step": 10680 }, { "epoch": 2.32, "grad_norm": 1.0295804738998413, "learning_rate": 7.714566070649805e-05, "loss": 1.1857, "step": 10690 }, { "epoch": 2.32, "grad_norm": 1.0594016313552856, "learning_rate": 7.71238552115133e-05, "loss": 1.1955, "step": 10700 }, { "epoch": 2.33, "grad_norm": 1.0921293497085571, "learning_rate": 7.710204971652856e-05, "loss": 1.1836, "step": 10710 }, { "epoch": 2.33, "grad_norm": 1.0477246046066284, "learning_rate": 7.708024422154383e-05, "loss": 1.2023, "step": 10720 }, { "epoch": 2.33, "grad_norm": 1.0246959924697876, "learning_rate": 7.70584387265591e-05, "loss": 1.222, "step": 10730 }, { "epoch": 2.33, "grad_norm": 1.0640301704406738, "learning_rate": 7.703663323157437e-05, "loss": 1.1974, "step": 10740 }, { "epoch": 2.33, "grad_norm": 1.0652765035629272, "learning_rate": 7.701482773658963e-05, "loss": 1.1997, "step": 10750 }, { "epoch": 2.34, "grad_norm": 0.9220369458198547, "learning_rate": 7.699302224160488e-05, "loss": 1.212, "step": 10760 }, { "epoch": 2.34, "grad_norm": 0.9531814455986023, "learning_rate": 7.697121674662016e-05, "loss": 1.1686, "step": 10770 }, { "epoch": 2.34, "grad_norm": 1.1248044967651367, "learning_rate": 7.694941125163542e-05, "loss": 1.1971, "step": 10780 }, { "epoch": 2.34, "grad_norm": 1.0232545137405396, "learning_rate": 7.692760575665068e-05, "loss": 1.194, "step": 10790 }, { "epoch": 2.34, "grad_norm": 1.0724860429763794, "learning_rate": 7.690580026166594e-05, "loss": 1.1936, "step": 10800 }, { "epoch": 2.35, "grad_norm": 1.036474347114563, "learning_rate": 7.688399476668121e-05, "loss": 1.2078, "step": 10810 }, { "epoch": 2.35, "grad_norm": 1.0231555700302124, "learning_rate": 7.686218927169647e-05, "loss": 1.2056, "step": 10820 }, { "epoch": 2.35, "grad_norm": 0.9879153370857239, "learning_rate": 7.684038377671173e-05, "loss": 1.2191, "step": 10830 }, { "epoch": 2.35, "grad_norm": 1.0709577798843384, "learning_rate": 7.6818578281727e-05, "loss": 1.198, "step": 10840 }, { "epoch": 2.36, "grad_norm": 1.0138386487960815, "learning_rate": 7.679677278674226e-05, "loss": 1.2284, "step": 10850 }, { "epoch": 2.36, "grad_norm": 1.0676188468933105, "learning_rate": 7.677496729175752e-05, "loss": 1.2004, "step": 10860 }, { "epoch": 2.36, "grad_norm": 1.0372511148452759, "learning_rate": 7.675316179677279e-05, "loss": 1.167, "step": 10870 }, { "epoch": 2.36, "grad_norm": 1.0466020107269287, "learning_rate": 7.673135630178805e-05, "loss": 1.1958, "step": 10880 }, { "epoch": 2.36, "grad_norm": 1.0521596670150757, "learning_rate": 7.670955080680333e-05, "loss": 1.2025, "step": 10890 }, { "epoch": 2.37, "grad_norm": 0.9906710982322693, "learning_rate": 7.668774531181858e-05, "loss": 1.188, "step": 10900 }, { "epoch": 2.37, "grad_norm": 1.1713993549346924, "learning_rate": 7.666593981683384e-05, "loss": 1.1992, "step": 10910 }, { "epoch": 2.37, "grad_norm": 1.009819507598877, "learning_rate": 7.664413432184911e-05, "loss": 1.191, "step": 10920 }, { "epoch": 2.37, "grad_norm": 1.0150312185287476, "learning_rate": 7.662232882686438e-05, "loss": 1.1951, "step": 10930 }, { "epoch": 2.38, "grad_norm": 0.9645649790763855, "learning_rate": 7.660052333187964e-05, "loss": 1.1941, "step": 10940 }, { "epoch": 2.38, "grad_norm": 1.0158168077468872, "learning_rate": 7.65787178368949e-05, "loss": 1.1911, "step": 10950 }, { "epoch": 2.38, "grad_norm": 1.0730938911437988, "learning_rate": 7.655691234191017e-05, "loss": 1.1885, "step": 10960 }, { "epoch": 2.38, "grad_norm": 1.09099543094635, "learning_rate": 7.653510684692543e-05, "loss": 1.195, "step": 10970 }, { "epoch": 2.38, "grad_norm": 0.982562243938446, "learning_rate": 7.651330135194069e-05, "loss": 1.213, "step": 10980 }, { "epoch": 2.39, "grad_norm": 1.0173815488815308, "learning_rate": 7.649149585695596e-05, "loss": 1.1931, "step": 10990 }, { "epoch": 2.39, "grad_norm": 1.0644387006759644, "learning_rate": 7.646969036197122e-05, "loss": 1.2, "step": 11000 }, { "epoch": 2.39, "grad_norm": 1.0456851720809937, "learning_rate": 7.644788486698648e-05, "loss": 1.2267, "step": 11010 }, { "epoch": 2.39, "grad_norm": 1.0387489795684814, "learning_rate": 7.642607937200175e-05, "loss": 1.1818, "step": 11020 }, { "epoch": 2.39, "grad_norm": 1.034599781036377, "learning_rate": 7.640427387701701e-05, "loss": 1.1972, "step": 11030 }, { "epoch": 2.4, "grad_norm": 1.005964994430542, "learning_rate": 7.638246838203228e-05, "loss": 1.1882, "step": 11040 }, { "epoch": 2.4, "grad_norm": 1.0190836191177368, "learning_rate": 7.636066288704753e-05, "loss": 1.1819, "step": 11050 }, { "epoch": 2.4, "grad_norm": 1.010334849357605, "learning_rate": 7.63388573920628e-05, "loss": 1.2054, "step": 11060 }, { "epoch": 2.4, "grad_norm": 0.986047089099884, "learning_rate": 7.631705189707806e-05, "loss": 1.1831, "step": 11070 }, { "epoch": 2.41, "grad_norm": 1.0715646743774414, "learning_rate": 7.629524640209334e-05, "loss": 1.2143, "step": 11080 }, { "epoch": 2.41, "grad_norm": 1.0573137998580933, "learning_rate": 7.62734409071086e-05, "loss": 1.1765, "step": 11090 }, { "epoch": 2.41, "grad_norm": 0.9830726385116577, "learning_rate": 7.625163541212386e-05, "loss": 1.2195, "step": 11100 }, { "epoch": 2.41, "grad_norm": 0.9928615689277649, "learning_rate": 7.622982991713911e-05, "loss": 1.2052, "step": 11110 }, { "epoch": 2.41, "grad_norm": 0.916532039642334, "learning_rate": 7.620802442215439e-05, "loss": 1.2161, "step": 11120 }, { "epoch": 2.42, "grad_norm": 1.024786353111267, "learning_rate": 7.618621892716965e-05, "loss": 1.1841, "step": 11130 }, { "epoch": 2.42, "grad_norm": 0.9942538142204285, "learning_rate": 7.616441343218491e-05, "loss": 1.1969, "step": 11140 }, { "epoch": 2.42, "grad_norm": 0.9637119770050049, "learning_rate": 7.614260793720018e-05, "loss": 1.1839, "step": 11150 }, { "epoch": 2.42, "grad_norm": 1.0759954452514648, "learning_rate": 7.612080244221544e-05, "loss": 1.2087, "step": 11160 }, { "epoch": 2.42, "grad_norm": 1.1083338260650635, "learning_rate": 7.60989969472307e-05, "loss": 1.1637, "step": 11170 }, { "epoch": 2.43, "grad_norm": 0.9280533790588379, "learning_rate": 7.607719145224597e-05, "loss": 1.186, "step": 11180 }, { "epoch": 2.43, "grad_norm": 1.005856990814209, "learning_rate": 7.605538595726124e-05, "loss": 1.2096, "step": 11190 }, { "epoch": 2.43, "grad_norm": 1.0294781923294067, "learning_rate": 7.603358046227649e-05, "loss": 1.1933, "step": 11200 }, { "epoch": 2.43, "grad_norm": 1.129011631011963, "learning_rate": 7.601177496729176e-05, "loss": 1.1975, "step": 11210 }, { "epoch": 2.44, "grad_norm": 0.9473848938941956, "learning_rate": 7.598996947230702e-05, "loss": 1.191, "step": 11220 }, { "epoch": 2.44, "grad_norm": 1.0725443363189697, "learning_rate": 7.59681639773223e-05, "loss": 1.2069, "step": 11230 }, { "epoch": 2.44, "grad_norm": 1.0083664655685425, "learning_rate": 7.594635848233756e-05, "loss": 1.2012, "step": 11240 }, { "epoch": 2.44, "grad_norm": 1.0504008531570435, "learning_rate": 7.592455298735281e-05, "loss": 1.1897, "step": 11250 }, { "epoch": 2.44, "grad_norm": 1.02128267288208, "learning_rate": 7.590274749236807e-05, "loss": 1.193, "step": 11260 }, { "epoch": 2.45, "grad_norm": 1.043655276298523, "learning_rate": 7.588094199738335e-05, "loss": 1.1984, "step": 11270 }, { "epoch": 2.45, "grad_norm": 1.0775086879730225, "learning_rate": 7.585913650239861e-05, "loss": 1.1826, "step": 11280 }, { "epoch": 2.45, "grad_norm": 1.0672656297683716, "learning_rate": 7.583733100741387e-05, "loss": 1.221, "step": 11290 }, { "epoch": 2.45, "grad_norm": 1.1105164289474487, "learning_rate": 7.581552551242914e-05, "loss": 1.2124, "step": 11300 }, { "epoch": 2.46, "grad_norm": 0.978393018245697, "learning_rate": 7.57937200174444e-05, "loss": 1.1749, "step": 11310 }, { "epoch": 2.46, "grad_norm": 1.0011403560638428, "learning_rate": 7.577191452245966e-05, "loss": 1.1987, "step": 11320 }, { "epoch": 2.46, "grad_norm": 0.9928615093231201, "learning_rate": 7.575010902747493e-05, "loss": 1.1916, "step": 11330 }, { "epoch": 2.46, "grad_norm": 0.9368339776992798, "learning_rate": 7.572830353249019e-05, "loss": 1.2155, "step": 11340 }, { "epoch": 2.46, "grad_norm": 1.0176599025726318, "learning_rate": 7.570649803750545e-05, "loss": 1.2108, "step": 11350 }, { "epoch": 2.47, "grad_norm": 0.956798255443573, "learning_rate": 7.568469254252072e-05, "loss": 1.1951, "step": 11360 }, { "epoch": 2.47, "grad_norm": 0.9456045627593994, "learning_rate": 7.566288704753598e-05, "loss": 1.1939, "step": 11370 }, { "epoch": 2.47, "grad_norm": 1.1099495887756348, "learning_rate": 7.564108155255125e-05, "loss": 1.2113, "step": 11380 }, { "epoch": 2.47, "grad_norm": 1.0258333683013916, "learning_rate": 7.561927605756652e-05, "loss": 1.1723, "step": 11390 }, { "epoch": 2.47, "grad_norm": 1.0410195589065552, "learning_rate": 7.559747056258177e-05, "loss": 1.182, "step": 11400 }, { "epoch": 2.48, "grad_norm": 0.9671265482902527, "learning_rate": 7.557566506759703e-05, "loss": 1.2038, "step": 11410 }, { "epoch": 2.48, "grad_norm": 0.9647257328033447, "learning_rate": 7.555385957261231e-05, "loss": 1.2078, "step": 11420 }, { "epoch": 2.48, "grad_norm": 1.0497002601623535, "learning_rate": 7.553205407762757e-05, "loss": 1.2053, "step": 11430 }, { "epoch": 2.48, "grad_norm": 1.080557107925415, "learning_rate": 7.551024858264283e-05, "loss": 1.1925, "step": 11440 }, { "epoch": 2.49, "grad_norm": 0.967833936214447, "learning_rate": 7.54884430876581e-05, "loss": 1.2106, "step": 11450 }, { "epoch": 2.49, "grad_norm": 1.1252259016036987, "learning_rate": 7.546663759267336e-05, "loss": 1.2035, "step": 11460 }, { "epoch": 2.49, "grad_norm": 1.021498203277588, "learning_rate": 7.544483209768862e-05, "loss": 1.1748, "step": 11470 }, { "epoch": 2.49, "grad_norm": 1.1426560878753662, "learning_rate": 7.542302660270389e-05, "loss": 1.1916, "step": 11480 }, { "epoch": 2.49, "grad_norm": 0.9883751273155212, "learning_rate": 7.540122110771915e-05, "loss": 1.1808, "step": 11490 }, { "epoch": 2.5, "grad_norm": 0.9893055558204651, "learning_rate": 7.537941561273441e-05, "loss": 1.1961, "step": 11500 }, { "epoch": 2.5, "grad_norm": 1.038801908493042, "learning_rate": 7.535761011774967e-05, "loss": 1.1813, "step": 11510 }, { "epoch": 2.5, "grad_norm": 0.9812270998954773, "learning_rate": 7.533580462276494e-05, "loss": 1.1873, "step": 11520 }, { "epoch": 2.5, "grad_norm": 1.0793439149856567, "learning_rate": 7.53139991277802e-05, "loss": 1.1858, "step": 11530 }, { "epoch": 2.51, "grad_norm": 1.0743041038513184, "learning_rate": 7.529219363279548e-05, "loss": 1.1788, "step": 11540 }, { "epoch": 2.51, "grad_norm": 1.1196831464767456, "learning_rate": 7.527038813781073e-05, "loss": 1.2059, "step": 11550 }, { "epoch": 2.51, "grad_norm": 1.0126169919967651, "learning_rate": 7.524858264282599e-05, "loss": 1.2101, "step": 11560 }, { "epoch": 2.51, "grad_norm": 1.2069376707077026, "learning_rate": 7.522677714784125e-05, "loss": 1.1964, "step": 11570 }, { "epoch": 2.51, "grad_norm": 0.9865954518318176, "learning_rate": 7.520497165285653e-05, "loss": 1.1966, "step": 11580 }, { "epoch": 2.52, "grad_norm": 0.9862752556800842, "learning_rate": 7.518316615787179e-05, "loss": 1.1954, "step": 11590 }, { "epoch": 2.52, "grad_norm": 1.093674659729004, "learning_rate": 7.516136066288704e-05, "loss": 1.1931, "step": 11600 }, { "epoch": 2.52, "grad_norm": 1.0402370691299438, "learning_rate": 7.513955516790232e-05, "loss": 1.1834, "step": 11610 }, { "epoch": 2.52, "grad_norm": 0.9660056233406067, "learning_rate": 7.511774967291758e-05, "loss": 1.1978, "step": 11620 }, { "epoch": 2.52, "grad_norm": 1.1045291423797607, "learning_rate": 7.509594417793284e-05, "loss": 1.1789, "step": 11630 }, { "epoch": 2.53, "grad_norm": 1.1806862354278564, "learning_rate": 7.507413868294811e-05, "loss": 1.1849, "step": 11640 }, { "epoch": 2.53, "grad_norm": 1.0600950717926025, "learning_rate": 7.505233318796337e-05, "loss": 1.1863, "step": 11650 }, { "epoch": 2.53, "grad_norm": 1.2518783807754517, "learning_rate": 7.503052769297863e-05, "loss": 1.1911, "step": 11660 }, { "epoch": 2.53, "grad_norm": 1.0559264421463013, "learning_rate": 7.50087221979939e-05, "loss": 1.2106, "step": 11670 }, { "epoch": 2.54, "grad_norm": 0.9558138847351074, "learning_rate": 7.498691670300916e-05, "loss": 1.1719, "step": 11680 }, { "epoch": 2.54, "grad_norm": 1.0867066383361816, "learning_rate": 7.496511120802444e-05, "loss": 1.2209, "step": 11690 }, { "epoch": 2.54, "grad_norm": 0.9424611926078796, "learning_rate": 7.494330571303969e-05, "loss": 1.1812, "step": 11700 }, { "epoch": 2.54, "grad_norm": 1.04227614402771, "learning_rate": 7.492150021805495e-05, "loss": 1.204, "step": 11710 }, { "epoch": 2.54, "grad_norm": 0.9230485558509827, "learning_rate": 7.489969472307021e-05, "loss": 1.1923, "step": 11720 }, { "epoch": 2.55, "grad_norm": 1.079827070236206, "learning_rate": 7.487788922808549e-05, "loss": 1.1633, "step": 11730 }, { "epoch": 2.55, "grad_norm": 1.0158615112304688, "learning_rate": 7.485608373310075e-05, "loss": 1.1828, "step": 11740 }, { "epoch": 2.55, "grad_norm": 1.0298587083816528, "learning_rate": 7.4834278238116e-05, "loss": 1.2046, "step": 11750 }, { "epoch": 2.55, "grad_norm": 1.1021103858947754, "learning_rate": 7.481247274313126e-05, "loss": 1.2369, "step": 11760 }, { "epoch": 2.56, "grad_norm": 1.0776439905166626, "learning_rate": 7.479066724814654e-05, "loss": 1.1884, "step": 11770 }, { "epoch": 2.56, "grad_norm": 1.0745654106140137, "learning_rate": 7.47688617531618e-05, "loss": 1.1915, "step": 11780 }, { "epoch": 2.56, "grad_norm": 0.9988030791282654, "learning_rate": 7.474705625817707e-05, "loss": 1.1783, "step": 11790 }, { "epoch": 2.56, "grad_norm": 0.9837521910667419, "learning_rate": 7.472525076319233e-05, "loss": 1.1859, "step": 11800 }, { "epoch": 2.56, "grad_norm": 1.076101541519165, "learning_rate": 7.470344526820759e-05, "loss": 1.194, "step": 11810 }, { "epoch": 2.57, "grad_norm": 1.0141769647598267, "learning_rate": 7.468163977322286e-05, "loss": 1.1893, "step": 11820 }, { "epoch": 2.57, "grad_norm": 0.9962597489356995, "learning_rate": 7.465983427823812e-05, "loss": 1.2143, "step": 11830 }, { "epoch": 2.57, "grad_norm": 1.0923272371292114, "learning_rate": 7.46380287832534e-05, "loss": 1.184, "step": 11840 }, { "epoch": 2.57, "grad_norm": 1.1431857347488403, "learning_rate": 7.461622328826864e-05, "loss": 1.1926, "step": 11850 }, { "epoch": 2.57, "grad_norm": 1.0489574670791626, "learning_rate": 7.459441779328391e-05, "loss": 1.1584, "step": 11860 }, { "epoch": 2.58, "grad_norm": 1.049176812171936, "learning_rate": 7.457261229829917e-05, "loss": 1.2145, "step": 11870 }, { "epoch": 2.58, "grad_norm": 1.0617070198059082, "learning_rate": 7.455080680331445e-05, "loss": 1.1821, "step": 11880 }, { "epoch": 2.58, "grad_norm": 1.1978720426559448, "learning_rate": 7.452900130832971e-05, "loss": 1.1832, "step": 11890 }, { "epoch": 2.58, "grad_norm": 1.0322489738464355, "learning_rate": 7.450719581334496e-05, "loss": 1.1978, "step": 11900 }, { "epoch": 2.59, "grad_norm": 1.0497206449508667, "learning_rate": 7.448539031836022e-05, "loss": 1.1771, "step": 11910 }, { "epoch": 2.59, "grad_norm": 1.0136041641235352, "learning_rate": 7.44635848233755e-05, "loss": 1.198, "step": 11920 }, { "epoch": 2.59, "grad_norm": 1.0500036478042603, "learning_rate": 7.444177932839076e-05, "loss": 1.2019, "step": 11930 }, { "epoch": 2.59, "grad_norm": 1.0009404420852661, "learning_rate": 7.441997383340603e-05, "loss": 1.197, "step": 11940 }, { "epoch": 2.59, "grad_norm": 1.1604543924331665, "learning_rate": 7.439816833842127e-05, "loss": 1.1921, "step": 11950 }, { "epoch": 2.6, "grad_norm": 1.0473634004592896, "learning_rate": 7.437636284343655e-05, "loss": 1.1718, "step": 11960 }, { "epoch": 2.6, "grad_norm": 1.0517455339431763, "learning_rate": 7.435455734845181e-05, "loss": 1.1721, "step": 11970 }, { "epoch": 2.6, "grad_norm": 1.0030772686004639, "learning_rate": 7.433275185346708e-05, "loss": 1.1942, "step": 11980 }, { "epoch": 2.6, "grad_norm": 1.067175269126892, "learning_rate": 7.431094635848234e-05, "loss": 1.2015, "step": 11990 }, { "epoch": 2.61, "grad_norm": 1.0570900440216064, "learning_rate": 7.42891408634976e-05, "loss": 1.1715, "step": 12000 }, { "epoch": 2.61, "grad_norm": 1.0768860578536987, "learning_rate": 7.426733536851287e-05, "loss": 1.2118, "step": 12010 }, { "epoch": 2.61, "grad_norm": 0.9864534139633179, "learning_rate": 7.424552987352813e-05, "loss": 1.211, "step": 12020 }, { "epoch": 2.61, "grad_norm": 0.9961116909980774, "learning_rate": 7.422372437854339e-05, "loss": 1.1726, "step": 12030 }, { "epoch": 2.61, "grad_norm": 1.149584174156189, "learning_rate": 7.420191888355867e-05, "loss": 1.2015, "step": 12040 }, { "epoch": 2.62, "grad_norm": 0.9385210275650024, "learning_rate": 7.418011338857392e-05, "loss": 1.1853, "step": 12050 }, { "epoch": 2.62, "grad_norm": 0.9972238540649414, "learning_rate": 7.415830789358918e-05, "loss": 1.1862, "step": 12060 }, { "epoch": 2.62, "grad_norm": 1.1037793159484863, "learning_rate": 7.413650239860446e-05, "loss": 1.2191, "step": 12070 }, { "epoch": 2.62, "grad_norm": 1.082542896270752, "learning_rate": 7.411469690361972e-05, "loss": 1.2079, "step": 12080 }, { "epoch": 2.62, "grad_norm": 1.103800892829895, "learning_rate": 7.409289140863498e-05, "loss": 1.2069, "step": 12090 }, { "epoch": 2.63, "grad_norm": 1.1348109245300293, "learning_rate": 7.407108591365023e-05, "loss": 1.1853, "step": 12100 }, { "epoch": 2.63, "grad_norm": 1.0272557735443115, "learning_rate": 7.404928041866551e-05, "loss": 1.206, "step": 12110 }, { "epoch": 2.63, "grad_norm": 1.06856369972229, "learning_rate": 7.402747492368077e-05, "loss": 1.2077, "step": 12120 }, { "epoch": 2.63, "grad_norm": 0.9664187431335449, "learning_rate": 7.400566942869604e-05, "loss": 1.2252, "step": 12130 }, { "epoch": 2.64, "grad_norm": 1.0753014087677002, "learning_rate": 7.39838639337113e-05, "loss": 1.2033, "step": 12140 }, { "epoch": 2.64, "grad_norm": 1.1803292036056519, "learning_rate": 7.396205843872656e-05, "loss": 1.1944, "step": 12150 }, { "epoch": 2.64, "grad_norm": 0.9899237155914307, "learning_rate": 7.394025294374183e-05, "loss": 1.1768, "step": 12160 }, { "epoch": 2.64, "grad_norm": 1.0693211555480957, "learning_rate": 7.391844744875709e-05, "loss": 1.1957, "step": 12170 }, { "epoch": 2.64, "grad_norm": 1.0212500095367432, "learning_rate": 7.389664195377235e-05, "loss": 1.1807, "step": 12180 }, { "epoch": 2.65, "grad_norm": 0.9626917839050293, "learning_rate": 7.387483645878763e-05, "loss": 1.2019, "step": 12190 }, { "epoch": 2.65, "grad_norm": 1.0324492454528809, "learning_rate": 7.385303096380288e-05, "loss": 1.1787, "step": 12200 }, { "epoch": 2.65, "grad_norm": 1.0183689594268799, "learning_rate": 7.383122546881814e-05, "loss": 1.1718, "step": 12210 }, { "epoch": 2.65, "grad_norm": 1.03179132938385, "learning_rate": 7.38094199738334e-05, "loss": 1.1684, "step": 12220 }, { "epoch": 2.66, "grad_norm": 1.0151221752166748, "learning_rate": 7.378761447884868e-05, "loss": 1.1754, "step": 12230 }, { "epoch": 2.66, "grad_norm": 1.0675002336502075, "learning_rate": 7.376580898386394e-05, "loss": 1.1964, "step": 12240 }, { "epoch": 2.66, "grad_norm": 0.9424752593040466, "learning_rate": 7.374400348887919e-05, "loss": 1.1994, "step": 12250 }, { "epoch": 2.66, "grad_norm": 1.0181151628494263, "learning_rate": 7.372219799389446e-05, "loss": 1.1943, "step": 12260 }, { "epoch": 2.66, "grad_norm": 1.0865308046340942, "learning_rate": 7.370039249890973e-05, "loss": 1.1703, "step": 12270 }, { "epoch": 2.67, "grad_norm": 1.043016791343689, "learning_rate": 7.3678587003925e-05, "loss": 1.1813, "step": 12280 }, { "epoch": 2.67, "grad_norm": 1.060164213180542, "learning_rate": 7.365678150894026e-05, "loss": 1.1769, "step": 12290 }, { "epoch": 2.67, "grad_norm": 1.0264476537704468, "learning_rate": 7.363497601395552e-05, "loss": 1.1895, "step": 12300 }, { "epoch": 2.67, "grad_norm": 1.0359675884246826, "learning_rate": 7.361317051897078e-05, "loss": 1.1773, "step": 12310 }, { "epoch": 2.67, "grad_norm": 1.0558348894119263, "learning_rate": 7.359136502398605e-05, "loss": 1.2011, "step": 12320 }, { "epoch": 2.68, "grad_norm": 1.0487242937088013, "learning_rate": 7.356955952900131e-05, "loss": 1.2145, "step": 12330 }, { "epoch": 2.68, "grad_norm": 1.0390251874923706, "learning_rate": 7.354775403401657e-05, "loss": 1.1771, "step": 12340 }, { "epoch": 2.68, "grad_norm": 0.9608905911445618, "learning_rate": 7.352594853903184e-05, "loss": 1.1988, "step": 12350 }, { "epoch": 2.68, "grad_norm": 0.9924561977386475, "learning_rate": 7.35041430440471e-05, "loss": 1.2049, "step": 12360 }, { "epoch": 2.69, "grad_norm": 0.9115813970565796, "learning_rate": 7.348233754906236e-05, "loss": 1.185, "step": 12370 }, { "epoch": 2.69, "grad_norm": 0.9227597713470459, "learning_rate": 7.346053205407764e-05, "loss": 1.1964, "step": 12380 }, { "epoch": 2.69, "grad_norm": 1.1192283630371094, "learning_rate": 7.34387265590929e-05, "loss": 1.1927, "step": 12390 }, { "epoch": 2.69, "grad_norm": 0.9770265817642212, "learning_rate": 7.341692106410815e-05, "loss": 1.197, "step": 12400 }, { "epoch": 2.69, "grad_norm": 1.0701338052749634, "learning_rate": 7.339511556912341e-05, "loss": 1.1834, "step": 12410 }, { "epoch": 2.7, "grad_norm": 1.0348602533340454, "learning_rate": 7.337331007413869e-05, "loss": 1.2115, "step": 12420 }, { "epoch": 2.7, "grad_norm": 1.0927150249481201, "learning_rate": 7.335150457915395e-05, "loss": 1.2032, "step": 12430 }, { "epoch": 2.7, "grad_norm": 1.0548428297042847, "learning_rate": 7.332969908416922e-05, "loss": 1.1962, "step": 12440 }, { "epoch": 2.7, "grad_norm": 0.9672625064849854, "learning_rate": 7.330789358918447e-05, "loss": 1.1761, "step": 12450 }, { "epoch": 2.71, "grad_norm": 0.9257100820541382, "learning_rate": 7.328608809419974e-05, "loss": 1.2007, "step": 12460 }, { "epoch": 2.71, "grad_norm": 1.0286579132080078, "learning_rate": 7.3264282599215e-05, "loss": 1.1988, "step": 12470 }, { "epoch": 2.71, "grad_norm": 1.153806447982788, "learning_rate": 7.324247710423027e-05, "loss": 1.207, "step": 12480 }, { "epoch": 2.71, "grad_norm": 0.9337689876556396, "learning_rate": 7.322067160924553e-05, "loss": 1.2006, "step": 12490 }, { "epoch": 2.71, "grad_norm": 0.9721220135688782, "learning_rate": 7.31988661142608e-05, "loss": 1.2014, "step": 12500 }, { "epoch": 2.72, "grad_norm": 1.158456802368164, "learning_rate": 7.317706061927606e-05, "loss": 1.2074, "step": 12510 }, { "epoch": 2.72, "grad_norm": 1.0969914197921753, "learning_rate": 7.315525512429132e-05, "loss": 1.207, "step": 12520 }, { "epoch": 2.72, "grad_norm": 0.9585858583450317, "learning_rate": 7.31334496293066e-05, "loss": 1.1783, "step": 12530 }, { "epoch": 2.72, "grad_norm": 1.0447596311569214, "learning_rate": 7.311164413432186e-05, "loss": 1.1662, "step": 12540 }, { "epoch": 2.72, "grad_norm": 1.0252220630645752, "learning_rate": 7.308983863933711e-05, "loss": 1.1891, "step": 12550 }, { "epoch": 2.73, "grad_norm": 1.075294017791748, "learning_rate": 7.306803314435237e-05, "loss": 1.1917, "step": 12560 }, { "epoch": 2.73, "grad_norm": 1.0980489253997803, "learning_rate": 7.304622764936765e-05, "loss": 1.1829, "step": 12570 }, { "epoch": 2.73, "grad_norm": 1.0682340860366821, "learning_rate": 7.302442215438291e-05, "loss": 1.1859, "step": 12580 }, { "epoch": 2.73, "grad_norm": 1.0863393545150757, "learning_rate": 7.300261665939818e-05, "loss": 1.188, "step": 12590 }, { "epoch": 2.74, "grad_norm": 1.0569467544555664, "learning_rate": 7.298081116441343e-05, "loss": 1.1962, "step": 12600 }, { "epoch": 2.74, "grad_norm": 1.0733450651168823, "learning_rate": 7.29590056694287e-05, "loss": 1.1934, "step": 12610 }, { "epoch": 2.74, "grad_norm": 1.0762420892715454, "learning_rate": 7.293720017444397e-05, "loss": 1.181, "step": 12620 }, { "epoch": 2.74, "grad_norm": 1.0010732412338257, "learning_rate": 7.291539467945923e-05, "loss": 1.1936, "step": 12630 }, { "epoch": 2.74, "grad_norm": 1.039819598197937, "learning_rate": 7.289358918447449e-05, "loss": 1.2001, "step": 12640 }, { "epoch": 2.75, "grad_norm": 1.1060088872909546, "learning_rate": 7.287178368948975e-05, "loss": 1.2056, "step": 12650 }, { "epoch": 2.75, "grad_norm": 0.9314666986465454, "learning_rate": 7.284997819450502e-05, "loss": 1.1748, "step": 12660 }, { "epoch": 2.75, "grad_norm": 1.2504175901412964, "learning_rate": 7.282817269952028e-05, "loss": 1.1737, "step": 12670 }, { "epoch": 2.75, "grad_norm": 1.1391412019729614, "learning_rate": 7.280636720453554e-05, "loss": 1.1909, "step": 12680 }, { "epoch": 2.75, "grad_norm": 1.0052971839904785, "learning_rate": 7.278456170955081e-05, "loss": 1.1902, "step": 12690 }, { "epoch": 2.76, "grad_norm": 1.1059855222702026, "learning_rate": 7.276275621456607e-05, "loss": 1.2021, "step": 12700 }, { "epoch": 2.76, "grad_norm": 1.0115567445755005, "learning_rate": 7.274095071958133e-05, "loss": 1.1512, "step": 12710 }, { "epoch": 2.76, "grad_norm": 1.0905554294586182, "learning_rate": 7.27191452245966e-05, "loss": 1.1884, "step": 12720 }, { "epoch": 2.76, "grad_norm": 1.023762583732605, "learning_rate": 7.269733972961187e-05, "loss": 1.1841, "step": 12730 }, { "epoch": 2.77, "grad_norm": 1.0214531421661377, "learning_rate": 7.267553423462714e-05, "loss": 1.185, "step": 12740 }, { "epoch": 2.77, "grad_norm": 1.043494701385498, "learning_rate": 7.265372873964239e-05, "loss": 1.1822, "step": 12750 }, { "epoch": 2.77, "grad_norm": 1.0787135362625122, "learning_rate": 7.263192324465766e-05, "loss": 1.1827, "step": 12760 }, { "epoch": 2.77, "grad_norm": 1.1063132286071777, "learning_rate": 7.261011774967292e-05, "loss": 1.1847, "step": 12770 }, { "epoch": 2.77, "grad_norm": 1.0400912761688232, "learning_rate": 7.258831225468819e-05, "loss": 1.1603, "step": 12780 }, { "epoch": 2.78, "grad_norm": 1.057569146156311, "learning_rate": 7.256650675970345e-05, "loss": 1.1713, "step": 12790 }, { "epoch": 2.78, "grad_norm": 1.0713859796524048, "learning_rate": 7.254470126471871e-05, "loss": 1.2167, "step": 12800 }, { "epoch": 2.78, "grad_norm": 1.0643656253814697, "learning_rate": 7.252289576973398e-05, "loss": 1.1744, "step": 12810 }, { "epoch": 2.78, "grad_norm": 1.1218703985214233, "learning_rate": 7.250109027474924e-05, "loss": 1.2183, "step": 12820 }, { "epoch": 2.79, "grad_norm": 0.9932084083557129, "learning_rate": 7.24792847797645e-05, "loss": 1.1774, "step": 12830 }, { "epoch": 2.79, "grad_norm": 1.063856840133667, "learning_rate": 7.245747928477977e-05, "loss": 1.1519, "step": 12840 }, { "epoch": 2.79, "grad_norm": 1.0655205249786377, "learning_rate": 7.243567378979503e-05, "loss": 1.1883, "step": 12850 }, { "epoch": 2.79, "grad_norm": 0.9149487018585205, "learning_rate": 7.241386829481029e-05, "loss": 1.1636, "step": 12860 }, { "epoch": 2.79, "grad_norm": 1.061606764793396, "learning_rate": 7.239206279982556e-05, "loss": 1.1933, "step": 12870 }, { "epoch": 2.8, "grad_norm": 1.026875376701355, "learning_rate": 7.237025730484083e-05, "loss": 1.1697, "step": 12880 }, { "epoch": 2.8, "grad_norm": 0.9857021570205688, "learning_rate": 7.23484518098561e-05, "loss": 1.1593, "step": 12890 }, { "epoch": 2.8, "grad_norm": 1.0682117938995361, "learning_rate": 7.232664631487134e-05, "loss": 1.1846, "step": 12900 }, { "epoch": 2.8, "grad_norm": 0.9390698671340942, "learning_rate": 7.230484081988661e-05, "loss": 1.1625, "step": 12910 }, { "epoch": 2.8, "grad_norm": 1.0105453729629517, "learning_rate": 7.228303532490188e-05, "loss": 1.1929, "step": 12920 }, { "epoch": 2.81, "grad_norm": 0.986284077167511, "learning_rate": 7.226122982991715e-05, "loss": 1.1973, "step": 12930 }, { "epoch": 2.81, "grad_norm": 1.0369880199432373, "learning_rate": 7.223942433493241e-05, "loss": 1.1996, "step": 12940 }, { "epoch": 2.81, "grad_norm": 1.1171998977661133, "learning_rate": 7.221761883994766e-05, "loss": 1.2022, "step": 12950 }, { "epoch": 2.81, "grad_norm": 1.0862730741500854, "learning_rate": 7.219581334496294e-05, "loss": 1.19, "step": 12960 }, { "epoch": 2.82, "grad_norm": 1.0609533786773682, "learning_rate": 7.21740078499782e-05, "loss": 1.1825, "step": 12970 }, { "epoch": 2.82, "grad_norm": 0.98408043384552, "learning_rate": 7.215220235499346e-05, "loss": 1.1766, "step": 12980 }, { "epoch": 2.82, "grad_norm": 1.0378422737121582, "learning_rate": 7.213039686000873e-05, "loss": 1.1843, "step": 12990 }, { "epoch": 2.82, "grad_norm": 0.9478686451911926, "learning_rate": 7.210859136502399e-05, "loss": 1.1728, "step": 13000 }, { "epoch": 2.82, "grad_norm": 1.0276613235473633, "learning_rate": 7.208678587003925e-05, "loss": 1.1796, "step": 13010 }, { "epoch": 2.83, "grad_norm": 0.9244964122772217, "learning_rate": 7.206498037505451e-05, "loss": 1.1812, "step": 13020 }, { "epoch": 2.83, "grad_norm": 1.0720821619033813, "learning_rate": 7.204317488006979e-05, "loss": 1.1597, "step": 13030 }, { "epoch": 2.83, "grad_norm": 1.0820330381393433, "learning_rate": 7.202136938508504e-05, "loss": 1.1981, "step": 13040 }, { "epoch": 2.83, "grad_norm": 0.9590197205543518, "learning_rate": 7.19995638901003e-05, "loss": 1.1898, "step": 13050 }, { "epoch": 2.84, "grad_norm": 1.0559465885162354, "learning_rate": 7.197775839511557e-05, "loss": 1.1985, "step": 13060 }, { "epoch": 2.84, "grad_norm": 0.9392025470733643, "learning_rate": 7.195595290013084e-05, "loss": 1.1933, "step": 13070 }, { "epoch": 2.84, "grad_norm": 1.1029566526412964, "learning_rate": 7.19341474051461e-05, "loss": 1.1733, "step": 13080 }, { "epoch": 2.84, "grad_norm": 1.0255013704299927, "learning_rate": 7.191234191016137e-05, "loss": 1.1762, "step": 13090 }, { "epoch": 2.84, "grad_norm": 1.0394928455352783, "learning_rate": 7.189053641517662e-05, "loss": 1.151, "step": 13100 }, { "epoch": 2.85, "grad_norm": 1.057391881942749, "learning_rate": 7.18687309201919e-05, "loss": 1.1761, "step": 13110 }, { "epoch": 2.85, "grad_norm": 1.0358378887176514, "learning_rate": 7.184692542520716e-05, "loss": 1.1911, "step": 13120 }, { "epoch": 2.85, "grad_norm": 1.0503947734832764, "learning_rate": 7.182511993022242e-05, "loss": 1.2198, "step": 13130 }, { "epoch": 2.85, "grad_norm": 1.0237114429473877, "learning_rate": 7.180331443523768e-05, "loss": 1.2043, "step": 13140 }, { "epoch": 2.85, "grad_norm": 0.9386830925941467, "learning_rate": 7.178150894025295e-05, "loss": 1.192, "step": 13150 }, { "epoch": 2.86, "grad_norm": 0.9386530518531799, "learning_rate": 7.175970344526821e-05, "loss": 1.1864, "step": 13160 }, { "epoch": 2.86, "grad_norm": 0.9574694633483887, "learning_rate": 7.173789795028347e-05, "loss": 1.1828, "step": 13170 }, { "epoch": 2.86, "grad_norm": 1.0528520345687866, "learning_rate": 7.171609245529874e-05, "loss": 1.1861, "step": 13180 }, { "epoch": 2.86, "grad_norm": 1.0283684730529785, "learning_rate": 7.1694286960314e-05, "loss": 1.1749, "step": 13190 }, { "epoch": 2.87, "grad_norm": 0.9847733974456787, "learning_rate": 7.167248146532926e-05, "loss": 1.1903, "step": 13200 }, { "epoch": 2.87, "grad_norm": 1.0302000045776367, "learning_rate": 7.165067597034453e-05, "loss": 1.1852, "step": 13210 }, { "epoch": 2.87, "grad_norm": 1.0097705125808716, "learning_rate": 7.16288704753598e-05, "loss": 1.1874, "step": 13220 }, { "epoch": 2.87, "grad_norm": 1.1593202352523804, "learning_rate": 7.160706498037506e-05, "loss": 1.1827, "step": 13230 }, { "epoch": 2.87, "grad_norm": 0.9892207384109497, "learning_rate": 7.158525948539033e-05, "loss": 1.1694, "step": 13240 }, { "epoch": 2.88, "grad_norm": 1.0846501588821411, "learning_rate": 7.156345399040558e-05, "loss": 1.1892, "step": 13250 }, { "epoch": 2.88, "grad_norm": 1.014400601387024, "learning_rate": 7.154164849542085e-05, "loss": 1.1806, "step": 13260 }, { "epoch": 2.88, "grad_norm": 1.0073882341384888, "learning_rate": 7.151984300043612e-05, "loss": 1.1781, "step": 13270 }, { "epoch": 2.88, "grad_norm": 1.2205009460449219, "learning_rate": 7.149803750545138e-05, "loss": 1.1757, "step": 13280 }, { "epoch": 2.89, "grad_norm": 1.058864951133728, "learning_rate": 7.147623201046664e-05, "loss": 1.1968, "step": 13290 }, { "epoch": 2.89, "grad_norm": 1.0327656269073486, "learning_rate": 7.14544265154819e-05, "loss": 1.2232, "step": 13300 }, { "epoch": 2.89, "grad_norm": 1.042557954788208, "learning_rate": 7.143262102049717e-05, "loss": 1.2254, "step": 13310 }, { "epoch": 2.89, "grad_norm": 0.9692584276199341, "learning_rate": 7.141081552551243e-05, "loss": 1.1757, "step": 13320 }, { "epoch": 2.89, "grad_norm": 1.0381295680999756, "learning_rate": 7.13890100305277e-05, "loss": 1.1961, "step": 13330 }, { "epoch": 2.9, "grad_norm": 1.0239328145980835, "learning_rate": 7.136720453554296e-05, "loss": 1.155, "step": 13340 }, { "epoch": 2.9, "grad_norm": 1.0357582569122314, "learning_rate": 7.134539904055822e-05, "loss": 1.1719, "step": 13350 }, { "epoch": 2.9, "grad_norm": 1.0303056240081787, "learning_rate": 7.132359354557348e-05, "loss": 1.1934, "step": 13360 }, { "epoch": 2.9, "grad_norm": 0.9931465983390808, "learning_rate": 7.130178805058875e-05, "loss": 1.1791, "step": 13370 }, { "epoch": 2.9, "grad_norm": 1.0507264137268066, "learning_rate": 7.127998255560402e-05, "loss": 1.184, "step": 13380 }, { "epoch": 2.91, "grad_norm": 1.0703891515731812, "learning_rate": 7.125817706061927e-05, "loss": 1.1853, "step": 13390 }, { "epoch": 2.91, "grad_norm": 0.9957337975502014, "learning_rate": 7.123637156563454e-05, "loss": 1.1702, "step": 13400 }, { "epoch": 2.91, "grad_norm": 1.1027911901474, "learning_rate": 7.12145660706498e-05, "loss": 1.1968, "step": 13410 }, { "epoch": 2.91, "grad_norm": 0.9877254366874695, "learning_rate": 7.119276057566508e-05, "loss": 1.1752, "step": 13420 }, { "epoch": 2.92, "grad_norm": 1.0115269422531128, "learning_rate": 7.117095508068034e-05, "loss": 1.1546, "step": 13430 }, { "epoch": 2.92, "grad_norm": 0.9738414287567139, "learning_rate": 7.11491495856956e-05, "loss": 1.1576, "step": 13440 }, { "epoch": 2.92, "grad_norm": 1.0419977903366089, "learning_rate": 7.112734409071087e-05, "loss": 1.1927, "step": 13450 }, { "epoch": 2.92, "grad_norm": 1.0933623313903809, "learning_rate": 7.110553859572613e-05, "loss": 1.1747, "step": 13460 }, { "epoch": 2.92, "grad_norm": 1.0882395505905151, "learning_rate": 7.108373310074139e-05, "loss": 1.189, "step": 13470 }, { "epoch": 2.93, "grad_norm": 0.9442359209060669, "learning_rate": 7.106192760575665e-05, "loss": 1.1826, "step": 13480 }, { "epoch": 2.93, "grad_norm": 1.0601658821105957, "learning_rate": 7.104012211077192e-05, "loss": 1.1854, "step": 13490 }, { "epoch": 2.93, "grad_norm": 1.0670174360275269, "learning_rate": 7.101831661578718e-05, "loss": 1.1893, "step": 13500 }, { "epoch": 2.93, "grad_norm": 1.0757992267608643, "learning_rate": 7.099651112080244e-05, "loss": 1.1984, "step": 13510 }, { "epoch": 2.94, "grad_norm": 1.0340900421142578, "learning_rate": 7.09747056258177e-05, "loss": 1.2068, "step": 13520 }, { "epoch": 2.94, "grad_norm": 1.0402545928955078, "learning_rate": 7.095290013083298e-05, "loss": 1.208, "step": 13530 }, { "epoch": 2.94, "grad_norm": 1.1371444463729858, "learning_rate": 7.093109463584823e-05, "loss": 1.1883, "step": 13540 }, { "epoch": 2.94, "grad_norm": 1.0464153289794922, "learning_rate": 7.09092891408635e-05, "loss": 1.1896, "step": 13550 }, { "epoch": 2.94, "grad_norm": 0.9860671758651733, "learning_rate": 7.088748364587876e-05, "loss": 1.1782, "step": 13560 }, { "epoch": 2.95, "grad_norm": 0.927305281162262, "learning_rate": 7.086567815089404e-05, "loss": 1.1759, "step": 13570 }, { "epoch": 2.95, "grad_norm": 1.0116522312164307, "learning_rate": 7.08438726559093e-05, "loss": 1.1845, "step": 13580 }, { "epoch": 2.95, "grad_norm": 1.0394808053970337, "learning_rate": 7.082206716092456e-05, "loss": 1.1949, "step": 13590 }, { "epoch": 2.95, "grad_norm": 1.1558239459991455, "learning_rate": 7.080026166593981e-05, "loss": 1.1758, "step": 13600 }, { "epoch": 2.95, "grad_norm": 0.9348282217979431, "learning_rate": 7.077845617095509e-05, "loss": 1.1976, "step": 13610 }, { "epoch": 2.96, "grad_norm": 0.9124108552932739, "learning_rate": 7.075665067597035e-05, "loss": 1.172, "step": 13620 }, { "epoch": 2.96, "grad_norm": 1.077690839767456, "learning_rate": 7.073484518098561e-05, "loss": 1.1835, "step": 13630 }, { "epoch": 2.96, "grad_norm": 0.9495044350624084, "learning_rate": 7.071303968600088e-05, "loss": 1.1682, "step": 13640 }, { "epoch": 2.96, "grad_norm": 0.9947417378425598, "learning_rate": 7.069123419101614e-05, "loss": 1.2216, "step": 13650 }, { "epoch": 2.97, "grad_norm": 1.072772741317749, "learning_rate": 7.06694286960314e-05, "loss": 1.2006, "step": 13660 }, { "epoch": 2.97, "grad_norm": 1.0669934749603271, "learning_rate": 7.064762320104667e-05, "loss": 1.1992, "step": 13670 }, { "epoch": 2.97, "grad_norm": 1.0894432067871094, "learning_rate": 7.062581770606194e-05, "loss": 1.1745, "step": 13680 }, { "epoch": 2.97, "grad_norm": 0.9627017378807068, "learning_rate": 7.060401221107719e-05, "loss": 1.1818, "step": 13690 }, { "epoch": 2.97, "grad_norm": 0.9909853935241699, "learning_rate": 7.058220671609245e-05, "loss": 1.1705, "step": 13700 }, { "epoch": 2.98, "grad_norm": 1.0125415325164795, "learning_rate": 7.056040122110772e-05, "loss": 1.211, "step": 13710 }, { "epoch": 2.98, "grad_norm": 0.9729527235031128, "learning_rate": 7.0538595726123e-05, "loss": 1.1658, "step": 13720 }, { "epoch": 2.98, "grad_norm": 1.0256701707839966, "learning_rate": 7.051679023113826e-05, "loss": 1.1657, "step": 13730 }, { "epoch": 2.98, "grad_norm": 1.0687954425811768, "learning_rate": 7.04949847361535e-05, "loss": 1.1648, "step": 13740 }, { "epoch": 2.99, "grad_norm": 0.9713466167449951, "learning_rate": 7.047317924116877e-05, "loss": 1.1774, "step": 13750 }, { "epoch": 2.99, "grad_norm": 1.0809965133666992, "learning_rate": 7.045137374618405e-05, "loss": 1.1658, "step": 13760 }, { "epoch": 2.99, "grad_norm": 1.0827128887176514, "learning_rate": 7.042956825119931e-05, "loss": 1.1639, "step": 13770 }, { "epoch": 2.99, "grad_norm": 1.112669825553894, "learning_rate": 7.040776275621457e-05, "loss": 1.1743, "step": 13780 }, { "epoch": 2.99, "grad_norm": 0.9779360890388489, "learning_rate": 7.038595726122984e-05, "loss": 1.1823, "step": 13790 }, { "epoch": 3.0, "grad_norm": 1.0385786294937134, "learning_rate": 7.03641517662451e-05, "loss": 1.1804, "step": 13800 }, { "epoch": 3.0, "grad_norm": 1.05619215965271, "learning_rate": 7.034234627126036e-05, "loss": 1.1936, "step": 13810 }, { "epoch": 3.0, "eval_loss": 1.28429114818573, "eval_runtime": 1501.2758, "eval_samples_per_second": 257.68, "eval_steps_per_second": 4.027, "step": 13818 } ], "logging_steps": 10, "max_steps": 46060, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "total_flos": 5.893571450073252e+18, "train_batch_size": 6, "trial_name": null, "trial_params": null }